sengi 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +7 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +71 -0
- data/Makefile +23 -0
- data/Makefile.common +58 -0
- data/README.md +59 -0
- data/Rakefile +25 -0
- data/bin/config +148 -0
- data/bin/crawler +64 -0
- data/bin/list +129 -0
- data/bin/redis_start +11 -0
- data/bin/redis_stats +13 -0
- data/bin/redis_stop +10 -0
- data/bin/resque_crawler_restart +14 -0
- data/bin/resque_crawler_start +21 -0
- data/bin/resque_crawler_stop +20 -0
- data/bin/resque_scheduler_start +15 -0
- data/bin/resque_scheduler_stop +16 -0
- data/bin/resque_server_start +13 -0
- data/bin/resque_server_stop +13 -0
- data/config/redis.conf +120 -0
- data/config/resque_server_config.rb +6 -0
- data/lib/sengi.rb +5 -0
- data/lib/sengi/crawler.rb +589 -0
- data/lib/sengi/crawler_worker.rb +16 -0
- data/lib/sengi/uri.rb +288 -0
- data/lib/sengi/version.rb +17 -0
- data/sengi.gemspec +37 -0
- data/sengi.sublime-project +10 -0
- data/tests/tc_crawler.rb +14 -0
- data/tests/tc_uri.rb +140 -0
- data/tests/ts_all.rb +4 -0
- metadata +202 -0
data/bin/list
ADDED
@@ -0,0 +1,129 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# coding: UTF-8
|
3
|
+
|
4
|
+
require 'optparse'
|
5
|
+
require 'active_support'
|
6
|
+
require 'active_support/core_ext/numeric/conversions'
|
7
|
+
require 'sengi'
|
8
|
+
|
9
|
+
|
10
|
+
@options = {
|
11
|
+
}
|
12
|
+
opts = OptionParser.new do |o|
|
13
|
+
o.banner = 'Usage: list'
|
14
|
+
o.separator('')
|
15
|
+
|
16
|
+
#o.on('--no-scheme', "Don't print the scheme.") do
|
17
|
+
#end
|
18
|
+
|
19
|
+
o.on_tail('-h', '--help', 'Show this message.') do
|
20
|
+
puts o
|
21
|
+
puts
|
22
|
+
exit 3
|
23
|
+
end
|
24
|
+
end
|
25
|
+
ARGV << '-h' if ARGV.count == 0
|
26
|
+
commands = opts.parse(ARGV)
|
27
|
+
|
28
|
+
@redis = Hiredis::Connection.new
|
29
|
+
@redis.connect('127.0.0.1', 7000)
|
30
|
+
@redis.write(['SELECT', 1])
|
31
|
+
@redis.read
|
32
|
+
|
33
|
+
command = commands.shift
|
34
|
+
|
35
|
+
if command == 'urls'
|
36
|
+
@redis.write(['GET', 'urls:id'])
|
37
|
+
urls_id = @redis.read.to_i
|
38
|
+
|
39
|
+
(1..urls_id).each do |url_id|
|
40
|
+
@redis.write(['HGETALL', "urls:#{url_id}"])
|
41
|
+
raw_url = @redis.read
|
42
|
+
if raw_url.length > 0
|
43
|
+
url = Hash[*raw_url]
|
44
|
+
uri = URI(url['url'])
|
45
|
+
puts '%s' % [uri.to_s]
|
46
|
+
end
|
47
|
+
end
|
48
|
+
elsif command == 'domains'
|
49
|
+
subcommand = commands.shift
|
50
|
+
if subcommand == 'ignore'
|
51
|
+
@redis.write(['SMEMBERS', 'domains:ignore'])
|
52
|
+
puts @redis.read
|
53
|
+
else
|
54
|
+
@redis.write(['GET', 'domains:id'])
|
55
|
+
domains_id = @redis.read.to_i
|
56
|
+
|
57
|
+
(1..domains_id).each do |domain_id|
|
58
|
+
@redis.write(['HGETALL', "domains:#{domain_id}"])
|
59
|
+
raw_domain = @redis.read
|
60
|
+
if raw_domain.length > 0
|
61
|
+
domain = Hash[*raw_domain]
|
62
|
+
puts domain['domain_nowww']
|
63
|
+
end
|
64
|
+
end
|
65
|
+
end
|
66
|
+
elsif command == 'generators'
|
67
|
+
@redis.write(['GET', 'generators:id'])
|
68
|
+
generators_id = @redis.read.to_i
|
69
|
+
|
70
|
+
(1..generators_id).each do |generator_id|
|
71
|
+
@redis.write(['HGETALL', "generators:#{generator_id}"])
|
72
|
+
raw_generator = @redis.read
|
73
|
+
if raw_generator.length > 0
|
74
|
+
generator = Hash[*raw_generator]
|
75
|
+
puts generator['name']
|
76
|
+
end
|
77
|
+
end
|
78
|
+
elsif command == 'stats'
|
79
|
+
traffic_out = 0
|
80
|
+
traffic_in = 0
|
81
|
+
|
82
|
+
@redis.write(['GET', "urls:id"])
|
83
|
+
urls_id = @redis.read.to_i
|
84
|
+
|
85
|
+
@redis.write(['GET', "requests:id"])
|
86
|
+
requests_id = @redis.read.to_i
|
87
|
+
|
88
|
+
@redis.write(['GET', "responses:id"])
|
89
|
+
responses_id = @redis.read.to_i
|
90
|
+
|
91
|
+
(1..urls_id).each do |url_id|
|
92
|
+
@redis.write(['HGETALL', "urls:#{url_id}"])
|
93
|
+
raw_url = @redis.read
|
94
|
+
if raw_url.length > 0
|
95
|
+
url = Hash[*raw_url]
|
96
|
+
|
97
|
+
@redis.write(['SMEMBERS', "urls:#{url_id}:requests"])
|
98
|
+
request_ids = @redis.read.map{ |rid| rid.to_i }
|
99
|
+
request_ids.each do |request_id|
|
100
|
+
@redis.write(['HGETALL', "requests:#{request_id}"])
|
101
|
+
raw_request = @redis.read
|
102
|
+
if raw_request.length > 0
|
103
|
+
request = Hash[*raw_request]
|
104
|
+
traffic_out += request['size'].to_i
|
105
|
+
end
|
106
|
+
end
|
107
|
+
|
108
|
+
@redis.write(['SMEMBERS', "urls:#{url_id}:responses"])
|
109
|
+
response_ids = @redis.read.map{ |rid| rid.to_i }
|
110
|
+
response_ids.each do |response_id|
|
111
|
+
@redis.write(['HGETALL', "responses:#{response_id}"])
|
112
|
+
raw_response = @redis.read
|
113
|
+
if raw_response.length > 0
|
114
|
+
response = Hash[*raw_response]
|
115
|
+
traffic_in += response['size'].to_i
|
116
|
+
end
|
117
|
+
end
|
118
|
+
end
|
119
|
+
end
|
120
|
+
|
121
|
+
puts "urls id: %6d" % [urls_id]
|
122
|
+
puts "requests id: %6d" % [requests_id]
|
123
|
+
puts "responses id: %6d" % [responses_id]
|
124
|
+
puts
|
125
|
+
|
126
|
+
puts 'traffic'
|
127
|
+
puts 'bytes out: %9d (%s)' % [traffic_out, traffic_out.to_s(:human_size, precision: 2)]
|
128
|
+
puts 'bytes in: %9d (%s)' % [traffic_in, traffic_in.to_s(:human_size, precision: 2)]
|
129
|
+
end
|
data/bin/redis_start
ADDED
data/bin/redis_stats
ADDED
data/bin/redis_stop
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
#!/usr/bin/env bash
|
2
|
+
|
3
|
+
set -e
|
4
|
+
DATE=$(date +"%Y-%m-%d %H:%M:%S %z")
|
5
|
+
SCRIPT_BASEDIR=$(dirname $0)
|
6
|
+
SCRIPT_BASENAME=$(basename $0)
|
7
|
+
|
8
|
+
id=${1:-0}
|
9
|
+
|
10
|
+
|
11
|
+
cd "${SCRIPT_BASEDIR}/.."
|
12
|
+
|
13
|
+
export QUEUE=crawler
|
14
|
+
export PIDFILE=./run/resque_${QUEUE}_${id}.pid
|
15
|
+
export INTERVAL=1
|
16
|
+
#export COUNT=1
|
17
|
+
#export BACKGROUND=yes
|
18
|
+
LOG=tmp/resque_${QUEUE}_${id}.log
|
19
|
+
|
20
|
+
echo "${DATE} start" >> ${LOG}
|
21
|
+
rake resque:work --trace 1>> ${LOG} 2>> ${LOG} < /dev/null &
|
@@ -0,0 +1,20 @@
|
|
1
|
+
#!/usr/bin/env bash
|
2
|
+
|
3
|
+
set -e
|
4
|
+
DATE=$(date +"%Y-%m-%d %H:%M:%S %z")
|
5
|
+
SCRIPT_BASEDIR=$(dirname $0)
|
6
|
+
SCRIPT_BASENAME=$(basename $0)
|
7
|
+
|
8
|
+
id=${1:-0}
|
9
|
+
|
10
|
+
|
11
|
+
cd "${SCRIPT_BASEDIR}/.."
|
12
|
+
|
13
|
+
export QUEUE=crawler
|
14
|
+
export PIDFILE=./run/resque_${QUEUE}_${id}.pid
|
15
|
+
LOG=tmp/resque_${QUEUE}_${id}.log
|
16
|
+
|
17
|
+
pid=$(cat ${PIDFILE})
|
18
|
+
kill -QUIT ${pid}
|
19
|
+
rm ${PIDFILE}
|
20
|
+
echo "${DATE} process ended: ${pid}" >> ${LOG}
|
@@ -0,0 +1,15 @@
|
|
1
|
+
#!/usr/bin/env bash
|
2
|
+
|
3
|
+
set -e
|
4
|
+
DATE=$(date +"%Y-%m-%d %H:%M:%S %z")
|
5
|
+
SCRIPT_BASEDIR=$(dirname $0)
|
6
|
+
SCRIPT_BASENAME=$(basename $0)
|
7
|
+
export PIDFILE=./run/queue_scheduler.pid
|
8
|
+
export RESQUE_SCHEDULER_INTERVAL=1
|
9
|
+
LOG=tmp/resque_scheduler.log
|
10
|
+
|
11
|
+
|
12
|
+
cd "${SCRIPT_BASEDIR}/.."
|
13
|
+
|
14
|
+
echo "${DATE} start" >> ${LOG}
|
15
|
+
rake resque:scheduler --trace &> ${LOG} < /dev/null &
|
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/usr/bin/env bash
|
2
|
+
|
3
|
+
set -e
|
4
|
+
DATE=$(date +"%Y-%m-%d %H:%M:%S %z")
|
5
|
+
SCRIPT_BASEDIR=$(dirname $0)
|
6
|
+
SCRIPT_BASENAME=$(basename $0)
|
7
|
+
export PIDFILE=./run/queue_scheduler.pid
|
8
|
+
LOG=tmp/resque_scheduler.log
|
9
|
+
|
10
|
+
|
11
|
+
cd "${SCRIPT_BASEDIR}/.."
|
12
|
+
|
13
|
+
pid=$(cat ${PIDFILE})
|
14
|
+
kill -QUIT ${pid}
|
15
|
+
sleep 1
|
16
|
+
echo "${DATE} process exit: ${pid}" >> ${LOG}
|
@@ -0,0 +1,13 @@
|
|
1
|
+
#!/usr/bin/env bash
|
2
|
+
|
3
|
+
set -e
|
4
|
+
DATE=$(date +"%Y-%m-%d %H:%M:%S %z")
|
5
|
+
SCRIPT_BASEDIR=$(dirname $0)
|
6
|
+
SCRIPT_BASENAME=$(basename $0)
|
7
|
+
PIDFILE=./run/resque_server.pid
|
8
|
+
LOG=tmp/resque_server.log
|
9
|
+
|
10
|
+
|
11
|
+
cd "${SCRIPT_BASEDIR}/.."
|
12
|
+
|
13
|
+
resque-web -F -L --host 127.0.0.1 --port 8282 --pid-file ${PIDFILE} config/resque_server_config.rb &> ${LOG} < /dev/null &
|
@@ -0,0 +1,13 @@
|
|
1
|
+
#!/usr/bin/env bash
|
2
|
+
|
3
|
+
set -x
|
4
|
+
set -e
|
5
|
+
DATE=$(date +"%Y-%m-%d %H:%M:%S %z")
|
6
|
+
SCRIPT_BASEDIR=$(dirname $0)
|
7
|
+
SCRIPT_BASENAME=$(basename $0)
|
8
|
+
PIDFILE=./run/resque_server.pid
|
9
|
+
|
10
|
+
|
11
|
+
cd "${SCRIPT_BASEDIR}/.."
|
12
|
+
|
13
|
+
kill $(cat ${PIDFILE}) && rm ${PIDFILE}
|
data/config/redis.conf
ADDED
@@ -0,0 +1,120 @@
|
|
1
|
+
|
2
|
+
daemonize yes
|
3
|
+
|
4
|
+
pidfile run/redis.pid
|
5
|
+
|
6
|
+
bind 127.0.0.1
|
7
|
+
port 7000
|
8
|
+
|
9
|
+
tcp-backlog 511
|
10
|
+
|
11
|
+
timeout 0
|
12
|
+
|
13
|
+
tcp-keepalive 0
|
14
|
+
|
15
|
+
#debug
|
16
|
+
#verbose
|
17
|
+
#notice
|
18
|
+
loglevel verbose
|
19
|
+
|
20
|
+
#logfile ""
|
21
|
+
logfile tmp/redis.log
|
22
|
+
|
23
|
+
syslog-enabled no
|
24
|
+
# syslog-ident redis
|
25
|
+
# syslog-facility local0
|
26
|
+
|
27
|
+
databases 2
|
28
|
+
|
29
|
+
save 900 1
|
30
|
+
save 300 10
|
31
|
+
save 60 10000
|
32
|
+
|
33
|
+
stop-writes-on-bgsave-error no
|
34
|
+
|
35
|
+
rdbcompression yes
|
36
|
+
|
37
|
+
rdbchecksum no
|
38
|
+
|
39
|
+
dbfilename redis_dump.rdb
|
40
|
+
#dir /var/lib/redis
|
41
|
+
|
42
|
+
#slave-read-only yes
|
43
|
+
|
44
|
+
repl-diskless-sync no
|
45
|
+
repl-diskless-sync-delay 5
|
46
|
+
repl-ping-slave-period 10
|
47
|
+
repl-timeout 60
|
48
|
+
|
49
|
+
repl-disable-tcp-nodelay no
|
50
|
+
# repl-backlog-size 1mb
|
51
|
+
# repl-backlog-ttl 3600
|
52
|
+
|
53
|
+
#slave-priority 100
|
54
|
+
|
55
|
+
# min-slaves-to-write 3
|
56
|
+
# min-slaves-max-lag 10
|
57
|
+
|
58
|
+
# requirepass foobared
|
59
|
+
|
60
|
+
# maxclients 10000
|
61
|
+
|
62
|
+
maxmemory 1gb
|
63
|
+
|
64
|
+
# maxmemory-policy noeviction
|
65
|
+
|
66
|
+
# maxmemory-samples 5
|
67
|
+
|
68
|
+
appendonly no
|
69
|
+
appendfilename "appendonly.aof"
|
70
|
+
|
71
|
+
# appendfsync always
|
72
|
+
appendfsync everysec
|
73
|
+
# appendfsync no
|
74
|
+
|
75
|
+
#no-appendfsync-on-rewrite no
|
76
|
+
|
77
|
+
auto-aof-rewrite-percentage 100
|
78
|
+
auto-aof-rewrite-min-size 64mb
|
79
|
+
|
80
|
+
aof-load-truncated yes
|
81
|
+
|
82
|
+
#lua-time-limit 5000
|
83
|
+
|
84
|
+
#cluster-enabled no
|
85
|
+
|
86
|
+
#cluster-config-file nodes_10100.conf
|
87
|
+
#cluster-node-timeout 15000
|
88
|
+
# cluster-slave-validity-factor 10
|
89
|
+
# cluster-migration-barrier 1
|
90
|
+
#cluster-require-full-coverage yes
|
91
|
+
|
92
|
+
#slowlog-log-slower-than 10000
|
93
|
+
#slowlog-max-len 128
|
94
|
+
|
95
|
+
#latency-monitor-threshold 0
|
96
|
+
|
97
|
+
notify-keyspace-events ""
|
98
|
+
|
99
|
+
hash-max-ziplist-entries 512
|
100
|
+
hash-max-ziplist-value 64
|
101
|
+
|
102
|
+
list-max-ziplist-entries 512
|
103
|
+
list-max-ziplist-value 64
|
104
|
+
|
105
|
+
set-max-intset-entries 512
|
106
|
+
|
107
|
+
zset-max-ziplist-entries 128
|
108
|
+
zset-max-ziplist-value 64
|
109
|
+
|
110
|
+
hll-sparse-max-bytes 3000
|
111
|
+
|
112
|
+
#activerehashing yes
|
113
|
+
|
114
|
+
#client-output-buffer-limit normal 0 0 0
|
115
|
+
#client-output-buffer-limit slave 256mb 64mb 60
|
116
|
+
#client-output-buffer-limit pubsub 32mb 8mb 60
|
117
|
+
|
118
|
+
hz 10
|
119
|
+
|
120
|
+
aof-rewrite-incremental-fsync yes
|
data/lib/sengi.rb
ADDED
@@ -0,0 +1,589 @@
require 'uri'
require 'net/http'
require 'hiredis'
require 'resque'
require 'resque-scheduler'
require 'nokogiri'
require 'time'
require 'digest'
require 'openssl'
require 'zlib'
require 'active_support/time'

require 'thefox-ext'

module TheFox
  module Sengi

    # Crawls a single URL: records the URL, its domain, the HTTP request
    # and response in Redis, then enqueues discovered links as delayed
    # CrawlerWorker jobs via resque-scheduler.
    class Crawler

      # url     - URL String to crawl.
      # options - job options Hash (string keys); missing keys default to:
      #           'serial'/'relative'/'force'/'debug' => false,
      #           'parent_id'/'level' => 0.
      def initialize(url, options)
        @url = url
        @options = options

        @options['serial'] = false if !@options.has_key?('serial')
        @options['relative'] = false if !@options.has_key?('relative')
        @options['force'] = false if !@options.has_key?('force')
        @options['debug'] = false if !@options.has_key?('debug')

        @options['parent_id'] = 0 if !@options.has_key?('parent_id')
        @options['level'] = 0 if !@options.has_key?('level')
        #pp @options

        @redis = nil
        @uri = nil
        @request = nil
        @response = nil
        @html_doc = nil
        @url_delay = nil
        @url_separate_delay = nil
        @url_reschedule = nil
      end

      # Main entry point. Runs the whole pipeline and returns early when
      # the URL is blacklisted, ignored, failed, or not parseable HTML.
      def go
        redis_setup

        uri_setup
        puts "#{Time.now.strftime('%F %T')} perform: #{@options['parent_id']} #{@options['level']} #{@options['relative'] ? 'y' : 'n'} #{@uri}"

        check_blacklist
        puts "\t" + "blacklisted: #{@uri.is_blacklisted ? 'YES' : 'no'}"
        return if @uri.is_blacklisted

        insert_url
        puts "\t" + "url: #{@uri.id}"
        if @uri.is_ignored && !@options['debug'] && !@options['force']
          puts "\t" + "ignored reason: #{@uri.is_ignored_reason}"
          return
        end

        insert_domain
        puts "\t" + "domain id: #{@uri.domain_id}"

        insert_request
        puts "\t" + "request id: #{@uri.request_id}"

        make_http_request
        puts "\t" + "http response: #{@response.nil? ? 'FAILED' : 'ok'}"
        return if @response.nil?

        insert_response
        puts "\t" + "response: #{@uri.response_id} #{@uri.response_size}"

        puts "\t" + 'process http response'
        process_http_response
        puts "\t" + "http response"
        if @uri.is_ignored && !@options['force']
          puts "\t" + "ignored reason: #{@uri.is_ignored_reason}"
          return
        end
        if @html_doc.nil?
          puts "\t" + 'HTML INVALID'
          return
        end

        puts "\t" + 'process html links'
        process_html_links

        puts "\t" + 'process html meta'
        process_html_meta

        puts "\t" + 'url done'
      end

      private

      # Connect to Redis (db 1 on port 7000, once per instance) and load
      # the tunable delay settings, falling back to the module constants.
      def redis_setup
        if @redis.nil?
          @redis = Hiredis::Connection.new
          @redis.connect('127.0.0.1', 7000)
          @redis.write(['SELECT', 1])
          @redis.read
        end

        # BUGFIX: the reply was previously converted with to_i *before*
        # the nil check (nil.to_i == 0), so the constant fallbacks were
        # dead code. Check nil on the raw reply first.
        @redis.write(['GET', 'urls:delay'])
        raw = @redis.read
        @url_delay = raw.nil? ? URL_DELAY : raw.to_i

        @redis.write(['GET', 'urls:separatedelay'])
        raw = @redis.read
        @url_separate_delay = raw.nil? ? URL_SEPARATE_DELAY : raw.to_i

        @redis.write(['GET', 'urls:reschedule'])
        raw = @redis.read
        @url_reschedule = raw.nil? ? URL_RESCHEDULE : raw.to_i
      end

      # Build the internal Uri object and normalize @url through it.
      def uri_setup
        @uri = Uri.new(@url)
        @url = @uri.to_s
      end

      # Check if the current URL domain (second- + top-level) is in the
      # 'domains:ignore' blacklist and set @uri.is_blacklisted accordingly.
      def check_blacklist
        if !@uri.ruri.host.nil?
          # 'www.facebook.com' -> 'facebook.com'.
          # BUGFIX: last(2) also copes with single-label hosts such as
          # 'localhost', where split('.')[-2..-1] is nil and join raised.
          domain_topparts = @uri.ruri.host.split('.').last(2).join('.')

          @redis.write(['SMEMBERS', 'domains:ignore'])
          domains_ignore = @redis.read

          if domains_ignore.include?(domain_topparts)
            @uri.is_blacklisted = true
          else
            # Blacklist entries may be partial (e.g. 'google' should match
            # 'google.com'), so fall back to a regex search.
            @uri.is_blacklisted = domains_ignore.grep(Regexp.new(domain_topparts)).count > 0
          end
        end
      end

      # Insert the URL hash into Redis, or — when it already exists —
      # bump its attempt counter and decide whether it must be ignored.
      def insert_url
        @redis.write(['EXISTS', @uri.hash_id_key_name])
        if @redis.read.to_b
          # A URL already exists.
          @redis.write(['GET', @uri.hash_id_key_name])
          @uri.id = @redis.read

          @redis.write(['HGETALL', @uri.key_name])
          redis_uri = Hash[*@redis.read]

          @uri.is_ignored = redis_uri['is_ignored'].to_i.to_b
          request_attempts = redis_uri['request_attempts'].to_i

          puts "\t" + "request attempts: #{request_attempts}"

          if @uri.is_ignored
            @uri.is_ignored_reason = 'already ignored'
          else
            if request_attempts >= 3
              # Ignore the URL if it has already had 3 attempts.
              @uri.is_ignored = true
              @uri.is_ignored_reason = 'attempts >= 3'

              @redis.write(['HMSET', @uri.key_name,
                'is_ignored', 1,
                'ignored_at', Time.now.strftime('%F %T %z'),
              ])
              @redis.read
            end
          end

          # Increase the URL attempts, even if the URL will be ignored.
          @redis.write(['HMSET', @uri.key_name,
            'request_attempts', request_attempts + 1,
            'request_attempt_last_at', Time.now.strftime('%F %T %z'),
          ])
          @redis.read
        else
          # New URL. Increase the URLs ID.
          @redis.write(['INCR', 'urls:id'])
          @uri.id = @redis.read

          now_s = Time.now.strftime('%F %T %z')

          # Insert the new URL.
          @redis.write(['HMSET', @uri.key_name,
            'url', @uri.to_s,
            'hash', @uri.to_hash,
            'request_attempts', 1,
            'request_attempt_last_at', now_s,
            'parent_id', @options['parent_id'],
            'level', @options['level'],
            'is_blacklisted', @uri.is_blacklisted.to_i,
            'is_ignored', 0,
            'is_redirect', 0,
            'created_at', now_s,
          ])
          @redis.read

          # Set the URL Hash to URL ID reference.
          @redis.write(['SET', @uri.hash_id_key_name, @uri.id])
          @redis.read
        end
      end

      # Insert (or look up) the domain of the current URL and attach the
      # URL id to the domain's url set.
      def insert_domain
        # Add Domain to the indexed list.
        @redis.write(['SADD', 'domains:indexed', @uri.domain_nowww])
        @redis.read.to_b

        @redis.write(['EXISTS', @uri.domain_hash_id_key_name])
        if @redis.read.to_b
          # A Domain already exists.
          @redis.write(['GET', @uri.domain_hash_id_key_name])
          @uri.domain_id = @redis.read
        else
          # New Domain. Increase the Domains ID.
          @redis.write(['INCR', 'domains:id'])
          @uri.domain_id = @redis.read

          @redis.write(['HMSET', @uri.domain_key_name,
            'domain_nowww', @uri.domain_nowww,
            'domain_original', @uri.ruri.host,
            'hash_nowww', @uri.domain_nowww_hash,
            'hash_original', @uri.domain_original_hash,
            'created_at', Time.now.strftime('%F %T %z'),
          ])
          @redis.read

          # Set the Domain Hash to Domain ID reference.
          @redis.write(['SET', @uri.domain_hash_id_key_name, @uri.domain_id])
          @redis.read
        end

        # Save the URLs per Domain.
        @redis.write(['SADD', "domains:#{@uri.domain_id}:urls", @uri.id])
        @redis.read
      end

      # Create a new request record for this attempt and link it to the URL.
      def insert_request
        @redis.write(['INCR', 'requests:id'])
        @uri.request_id = @redis.read

        @redis.write(['HMSET', @uri.request_key_name,
          'url_id', @uri.id,
          'user_agent', HTTP_USER_AGENT,
          'error', 0,
          'size', 0,
          'created_at', Time.now.strftime('%F %T %z'),
        ])
        @redis.read

        # Save the Requests per URL.
        @redis.write(['SADD', "urls:#{@uri.id}:requests", @uri.request_id])
        @redis.read
      end

      # Perform the HTTP GET. On failure @response stays nil, the error is
      # recorded on the request and the URL is re-enqueued; on success the
      # URL is marked ignored so it is not fetched again.
      def make_http_request
        http = Net::HTTP.new(@uri.ruri.host, @uri.ruri.port)
        http.keep_alive_timeout = 0
        http.open_timeout = 5
        http.read_timeout = 5
        http.ssl_timeout = 5
        if @uri.ruri.scheme.to_s.downcase == 'https'
          http.use_ssl = true
          # Certificates are deliberately not verified for crawling.
          http.verify_mode = OpenSSL::SSL::VERIFY_NONE
        end

        @request = Net::HTTP::Get.new(@uri.ruri.request_uri)
        @request['User-Agent'] = HTTP_USER_AGENT
        @request['Referer'] = HTTP_REFERER
        @request['Connection'] = 'close'
        @request['Accept'] = 'text/html'
        @request['Accept-Encoding'] = 'gzip;q=1.0,identity;q=0.6'
        @request['Accept-Language'] = 'en,en-US;q=0.8'

        # Serialize the request once only to record its wire size.
        string_io = StringIO.new
        @request.exec(string_io, Net::HTTP::HTTPVersion, @request.path)
        @redis.write(['HSET', @uri.request_key_name, 'size', string_io.string.length])
        @redis.read

        begin
          puts "\t" + 'http request'
          @response = http.request(@request)
          puts "\t" + 'http request ok'
        # BUGFIX: was 'rescue Exception', which also swallowed
        # SignalException/SystemExit and made workers unkillable.
        rescue StandardError => e
          puts "\t" + "ERROR: #{e.class} #{e}"

          @response = nil

          # Save the error and error message to the URL Request.
          @redis.write(['HMSET', @uri.request_key_name,
            'error', 1,
            'error_msg', e.to_s,
          ])
          @redis.read

          reenqueue
          return
        end

        # Ignore the URL for further requests because it was successful.
        @redis.write(['HMSET', @uri.key_name,
          'is_ignored', 1,
          'ignored_at', Time.now.strftime('%F %T %z'),
        ])
        @redis.read
      end

      # Store the HTTP response (approximate wire size, code, content type)
      # and index it by status code.
      def insert_response
        @redis.write(['INCR', 'responses:id'])
        @uri.response_id = @redis.read

        # Add the Response ID to the URL.
        @redis.write(['SADD', "urls:#{@uri.id}:responses", @uri.response_id])
        @redis.read

        # Approximate header size: "k: v" lines joined by CRLF plus the
        # final blank line. This is still too inaccurate.
        response_size = @response.header.to_hash.map{ |k, v|
          vs = ''
          if v.is_a?(Array)
            vs = v.join(' ')
          else
            vs = v
          end
          "#{k}: #{vs}"
        }.join("\r\n").length + 4

        response_size += @response.body.length

        @uri.response_size = response_size
        @uri.response_content_type = @response['Content-Type']

        @redis.write(['HMSET', @uri.response_key_name,
          'code', @response.code.to_i,
          'content_type', @uri.response_content_type,
          'request_id', @uri.request_id,
          'size', @uri.response_size,
          'created_at', Time.now.strftime('%F %T %z'),
        ])
        @redis.read

        # Add the Response to the Response Code.
        @redis.write(['SADD', "responses:code:#{@response.code}", @uri.response_id])
        @redis.read
      end

      # Decompress and classify the response: parse HTML on 200, follow
      # redirects on 3xx, otherwise ignore the URL.
      def process_http_response
        body = ''
        if !@response['Content-Encoding'].nil? && @response['Content-Encoding'].downcase == 'gzip'
          body = Zlib::GzipReader.new(StringIO.new(@response.body)).read
        else
          body = @response.body
        end

        code = @response.code.to_i
        puts "\t" + "http response code: #{code}"

        if code == 200
          # 'text/html' is 9 characters, hence [0..8].
          if @uri.response_content_type[0..8] == 'text/html'
            @html_doc = Nokogiri::HTML(body)
            @html_doc.remove_namespaces!
          else
            # Ignore the URL if the response content type isn't HTML.
            @uri.is_ignored = true
            @uri.is_ignored_reason = "wrong content type: #{@uri.response_content_type}"
          end
        elsif code >= 301 && code <= 399
          @redis.write(['HSET', @uri.key_name, 'is_redirect', 1])
          @redis.read

          if !@response['Location'].nil?
            # Follow the URL.
            new_uri = Uri.new(@response['Location'])

            enqueue(new_uri)
          end
        else
          @uri.is_ignored = true
          @uri.is_ignored_reason = "wrong code: #{code}"
        end

        if @uri.is_ignored
          @redis.write(['HSET', @uri.key_name, 'is_ignored', 1])
          @redis.read
        end
      end

      # Collect all valid <a href> targets, order them by weight relative
      # to the current URI and enqueue each with a growing delay.
      def process_html_links
        @html_doc
          .xpath('//a')
          .map{ |link|
            href = link['href']

            if !href.nil?
              Uri.new(href)
            end
          }
          .select{ |link|
            !link.nil? && link.is_valid?
          }
          .sort{ |uri_a, uri_b|
            uri_a.weight(@uri) <=> uri_b.weight(@uri)
          }
          .each_with_index{ |new_uri, index|
            enqueue(new_uri, index)
          }
      end

      # Dispatch on <meta name="..."> tags; currently only 'generator' is
      # of interest.
      def process_html_meta
        @html_doc.xpath('//meta').each do |meta|
          meta_name = meta['name']
          if !meta_name.nil?
            # Downcase once (the original downcased a second time in the
            # comparison below, which was redundant).
            meta_name = meta_name.downcase

            if meta_name == 'generator'
              process_html_meta_generator(meta)
            end
          end
        end
      end

      # Record the page generator (content attribute of the generator meta
      # tag), deduplicated by SHA256, and cross-link it with the URL.
      def process_html_meta_generator(meta)
        generator = meta['content']
        generator_hash = Digest::SHA256.hexdigest(generator)

        generator_id = nil
        generator_hash_id_key_name = "generators:id:#{generator_hash}"
        generator_key_name = nil

        @redis.write(['EXISTS', generator_hash_id_key_name])
        if @redis.read.to_b
          # Found existing generator.
          @redis.write(['GET', generator_hash_id_key_name])
          generator_id = @redis.read

          generator_key_name = "generators:#{generator_id}"
        else
          # New generator. Increase the Generators ID.
          @redis.write(['INCR', 'generators:id'])
          generator_id = @redis.read

          generator_key_name = "generators:#{generator_id}"
          @redis.write(['HMSET', generator_key_name,
            'name', generator,
            'hash', generator_hash,
            'first_url_id', @uri.id,
            'created_at', Time.now.strftime('%F %T %z'),
          ])
          @redis.read

          # Set the Generator Hash to Generator ID reference.
          @redis.write(['SET', generator_hash_id_key_name, generator_id])
          @redis.read
        end

        # Always overwrite the last used timestamp.
        @redis.write(['HSET', generator_key_name, 'last_used_at', Time.now.strftime('%F %T %z')])
        @redis.read

        # Add the URL to the Generator.
        @redis.write(['SADD', "generators:#{generator_id}:urls", @uri.id])
        @redis.read

        # Add the Generator to the URL.
        @redis.write(['SADD', "urls:#{@uri.id}:generators", generator_id])
        @redis.read
      end

      # Schedule new_uri as a child crawl job. index spreads sibling links
      # over time; debug=true only prints without enqueueing.
      def enqueue(new_uri, index = 0, debug = false)
        if !@options['relative'] || new_uri.is_relative?(@uri)
          new_uri = @uri.join(new_uri)

          if new_uri.is_valid?
            new_uri_s = new_uri.to_s

            queued_time = (@url_delay + (@url_separate_delay * index)).seconds.from_now

            if @options['serial']
              # Check if another process is currently using
              # 'urls:schedule:last'.
              # NOTE(review): GET-then-INCR is not atomic, so two workers
              # can still enter this section together — a SETNX/SET NX EX
              # lock would be safer; confirm before changing behavior.
              @redis.write(['GET', 'urls:schedule:lock'])
              lock = @redis.read.to_i.to_b
              while lock
                @redis.write(['GET', 'urls:schedule:lock'])
                lock = @redis.read.to_i.to_b
                sleep 0.1
              end

              # Lock 'urls:schedule:last' for other processes.
              @redis.write(['INCR', 'urls:schedule:lock'])
              @redis.read

              @redis.write(['GET', 'urls:schedule:last'])
              queued_time = @redis.read

              if queued_time.nil?
                queued_time = Time.now
              else
                queued_time = Time.parse(queued_time)
                if queued_time < Time.now
                  queued_time = Time.now
                end
              end
              queued_time += @url_delay

              @redis.write(['SET', 'urls:schedule:last', queued_time.strftime('%F %T %z')])
              @redis.read

              # Unlock 'urls:schedule:last' for other processes.
              @redis.write(['DECR', 'urls:schedule:lock'])
              @redis.read
            end

            puts "\t" + "enqueue #{@options['level']} #{index} #{queued_time} #{new_uri_s}"

            if !debug
              options = {
                'serial' => @options['serial'],
                'relative' => @options['relative'],
                'parent_id' => @uri.id,
                'level' => @options['level'] + 1,
              }
              Resque.enqueue_at(queued_time, TheFox::Sengi::CrawlerWorker, new_uri_s, options)
            end
          end
        end
      end

      # Re-schedule the current URL after a failed request.
      def reenqueue
        queued_time = @url_reschedule.seconds.from_now

        puts "\t" + "re-enqueue #{queued_time}"

        options = {
          'serial' => @options['serial'],
          'relative' => @options['relative'],
        }
        Resque.enqueue_at(queued_time, TheFox::Sengi::CrawlerWorker, @uri.to_s, options)
      end

    end

  end
end
|