sengi 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +7 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +71 -0
- data/Makefile +23 -0
- data/Makefile.common +58 -0
- data/README.md +59 -0
- data/Rakefile +25 -0
- data/bin/config +148 -0
- data/bin/crawler +64 -0
- data/bin/list +129 -0
- data/bin/redis_start +11 -0
- data/bin/redis_stats +13 -0
- data/bin/redis_stop +10 -0
- data/bin/resque_crawler_restart +14 -0
- data/bin/resque_crawler_start +21 -0
- data/bin/resque_crawler_stop +20 -0
- data/bin/resque_scheduler_start +15 -0
- data/bin/resque_scheduler_stop +16 -0
- data/bin/resque_server_start +13 -0
- data/bin/resque_server_stop +13 -0
- data/config/redis.conf +120 -0
- data/config/resque_server_config.rb +6 -0
- data/lib/sengi.rb +5 -0
- data/lib/sengi/crawler.rb +589 -0
- data/lib/sengi/crawler_worker.rb +16 -0
- data/lib/sengi/uri.rb +288 -0
- data/lib/sengi/version.rb +17 -0
- data/sengi.gemspec +37 -0
- data/sengi.sublime-project +10 -0
- data/tests/tc_crawler.rb +14 -0
- data/tests/tc_uri.rb +140 -0
- data/tests/ts_all.rb +4 -0
- metadata +202 -0
data/bin/list
ADDED
@@ -0,0 +1,129 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# coding: UTF-8
|
3
|
+
|
4
|
+
require 'optparse'
|
5
|
+
require 'active_support'
|
6
|
+
require 'active_support/core_ext/numeric/conversions'
|
7
|
+
require 'sengi'
|
8
|
+
|
9
|
+
|
10
|
+
# Command line front-end that prints data stored by the crawler in Redis.
#
# Supported commands (first positional argument):
#   urls            - print every stored URL
#   domains         - print the 'domain_nowww' field of every stored domain
#   domains ignore  - print the members of the 'domains:ignore' set
#   generators      - print the name of every stored generator
#   stats           - print the id counters and the summed request/response sizes

@options = {}

opts = OptionParser.new do |parser|
  parser.banner = 'Usage: list'
  parser.separator('')

  #parser.on('--no-scheme', "Don't print the scheme.") do
  #end

  parser.on_tail('-h', '--help', 'Show this message.') do
    puts parser
    puts
    exit 3
  end
end

# Without any argument behave as if help was requested.
ARGV << '-h' if ARGV.empty?
commands = opts.parse(ARGV)

# Send one command to Redis and return its reply.
def redis_call(*command)
  @redis.write(command)
  @redis.read
end

# Fetch the hash stored at +key+; nil when the key holds no fields.
def redis_hash(key)
  fields = redis_call('HGETALL', key)
  fields.length > 0 ? Hash[*fields] : nil
end

# Read the "<prefix>:id" counter and yield every existing "<prefix>:<n>" hash.
def each_record(prefix)
  last_id = redis_call('GET', "#{prefix}:id").to_i
  (1..last_id).each do |record_id|
    record = redis_hash("#{prefix}:#{record_id}")
    yield record unless record.nil?
  end
end

# Sum the 'size' field of all "<prefix>:<id>" hashes whose ids are in the set +set_key+.
def sum_sizes(set_key, prefix)
  record_ids = redis_call('SMEMBERS', set_key).map { |raw_id| raw_id.to_i }
  record_ids.inject(0) do |total, record_id|
    record = redis_hash("#{prefix}:#{record_id}")
    record.nil? ? total : total + record['size'].to_i
  end
end

@redis = Hiredis::Connection.new
@redis.connect('127.0.0.1', 7000)
redis_call('SELECT', 1)

command = commands.shift

case command
when 'urls'
  each_record('urls') do |url|
    uri = URI(url['url'])
    puts '%s' % [uri.to_s]
  end
when 'domains'
  subcommand = commands.shift
  if subcommand == 'ignore'
    puts redis_call('SMEMBERS', 'domains:ignore')
  else
    each_record('domains') { |domain| puts domain['domain_nowww'] }
  end
when 'generators'
  each_record('generators') { |generator| puts generator['name'] }
when 'stats'
  traffic_out = 0
  traffic_in = 0

  urls_id = redis_call('GET', "urls:id").to_i
  requests_id = redis_call('GET', "requests:id").to_i
  responses_id = redis_call('GET', "responses:id").to_i

  (1..urls_id).each do |url_id|
    # Only URLs that still exist contribute to the traffic totals.
    next if redis_hash("urls:#{url_id}").nil?

    traffic_out += sum_sizes("urls:#{url_id}:requests", 'requests')
    traffic_in += sum_sizes("urls:#{url_id}:responses", 'responses')
  end

  puts "urls id: %6d" % [urls_id]
  puts "requests id: %6d" % [requests_id]
  puts "responses id: %6d" % [responses_id]
  puts

  puts 'traffic'
  puts 'bytes out: %9d (%s)' % [traffic_out, traffic_out.to_s(:human_size, precision: 2)]
  puts 'bytes in: %9d (%s)' % [traffic_in, traffic_in.to_s(:human_size, precision: 2)]
end
|
data/bin/redis_start
ADDED
data/bin/redis_stats
ADDED
data/bin/redis_stop
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
#!/usr/bin/env bash
# Start a background `rake resque:work` process for the "crawler" queue.
# Optional first argument: a worker id (default 0) used in the pid and log file names.

set -e
DATE=$(date +"%Y-%m-%d %H:%M:%S %z")
# "$0" is quoted so the script still works when its path contains whitespace.
SCRIPT_BASEDIR=$(dirname "$0")
SCRIPT_BASENAME=$(basename "$0")

id=${1:-0}

# The relative run/ and tmp/ paths below are resolved from the parent of the script directory.
cd "${SCRIPT_BASEDIR}/.."

# Exported so the rake task started below can read them from its environment.
export QUEUE=crawler
export PIDFILE="./run/resque_${QUEUE}_${id}.pid"
export INTERVAL=1
#export COUNT=1
#export BACKGROUND=yes
LOG="tmp/resque_${QUEUE}_${id}.log"

echo "${DATE} start" >> "${LOG}"
# Append stdout and stderr to the log and detach from the terminal's stdin.
rake resque:work --trace 1>> "${LOG}" 2>> "${LOG}" < /dev/null &
|
@@ -0,0 +1,20 @@
|
|
1
|
+
#!/usr/bin/env bash
# Stop the "crawler" queue worker whose pid is recorded in run/resque_crawler_<id>.pid.
# Optional first argument: the worker id (default 0).

set -e
DATE=$(date +"%Y-%m-%d %H:%M:%S %z")
# "$0" is quoted so the script still works when its path contains whitespace.
SCRIPT_BASEDIR=$(dirname "$0")
SCRIPT_BASENAME=$(basename "$0")

id=${1:-0}

# The relative run/ and tmp/ paths below are resolved from the parent of the script directory.
cd "${SCRIPT_BASEDIR}/.."

export QUEUE=crawler
export PIDFILE="./run/resque_${QUEUE}_${id}.pid"
LOG="tmp/resque_${QUEUE}_${id}.log"

# With `set -e` a missing pid file or a failing kill aborts before the pid file is removed.
pid=$(cat "${PIDFILE}")
kill -QUIT ${pid}
rm "${PIDFILE}"
echo "${DATE} process ended: ${pid}" >> "${LOG}"
|
@@ -0,0 +1,15 @@
|
|
1
|
+
#!/usr/bin/env bash
# Start a background `rake resque:scheduler` process and log its output to tmp/resque_scheduler.log.

set -e
DATE=$(date +"%Y-%m-%d %H:%M:%S %z")
# "$0" is quoted so the script still works when its path contains whitespace.
SCRIPT_BASEDIR=$(dirname "$0")
SCRIPT_BASENAME=$(basename "$0")
# Exported so the rake task started below can read them from its environment.
export PIDFILE=./run/queue_scheduler.pid
export RESQUE_SCHEDULER_INTERVAL=1
LOG=tmp/resque_scheduler.log


# The relative run/ and tmp/ paths are resolved from the parent of the script directory.
cd "${SCRIPT_BASEDIR}/.."

echo "${DATE} start" >> "${LOG}"
# Append instead of truncating: `&> file` would erase the "start" line written just above.
rake resque:scheduler --trace >> "${LOG}" 2>&1 < /dev/null &
|
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/usr/bin/env bash
# Send SIGQUIT to the process whose pid is recorded in run/queue_scheduler.pid.

set -e
DATE=$(date +"%Y-%m-%d %H:%M:%S %z")
# "$0" is quoted so the script still works when its path contains whitespace.
SCRIPT_BASEDIR=$(dirname "$0")
SCRIPT_BASENAME=$(basename "$0")
export PIDFILE=./run/queue_scheduler.pid
LOG=tmp/resque_scheduler.log


# The relative run/ and tmp/ paths are resolved from the parent of the script directory.
cd "${SCRIPT_BASEDIR}/.."

pid=$(cat "${PIDFILE}")
kill -QUIT ${pid}
# NOTE(review): the pid file is not removed here; presumably the process cleans it up itself - confirm.
sleep 1
echo "${DATE} process exit: ${pid}" >> "${LOG}"
|
@@ -0,0 +1,13 @@
|
|
1
|
+
#!/usr/bin/env bash
# Start resque-web in the background on 127.0.0.1:8282 using config/resque_server_config.rb.

set -e
DATE=$(date +"%Y-%m-%d %H:%M:%S %z")
# "$0" is quoted so the script still works when its path contains whitespace.
SCRIPT_BASEDIR=$(dirname "$0")
SCRIPT_BASENAME=$(basename "$0")
PIDFILE=./run/resque_server.pid
LOG=tmp/resque_server.log


# The relative run/, tmp/ and config/ paths are resolved from the parent of the script directory.
cd "${SCRIPT_BASEDIR}/.."

# `&>` starts a fresh log file on every launch (stdout and stderr combined).
resque-web -F -L --host 127.0.0.1 --port 8282 --pid-file "${PIDFILE}" config/resque_server_config.rb &> "${LOG}" < /dev/null &
|
@@ -0,0 +1,13 @@
|
|
1
|
+
#!/usr/bin/env bash
# Kill the process whose pid is recorded in run/resque_server.pid and remove the pid file.

# Trace every command as it runs.
set -x
set -e
DATE=$(date +"%Y-%m-%d %H:%M:%S %z")
# "$0" is quoted so the script still works when its path contains whitespace.
SCRIPT_BASEDIR=$(dirname "$0")
SCRIPT_BASENAME=$(basename "$0")
PIDFILE=./run/resque_server.pid


# The relative run/ path is resolved from the parent of the script directory.
cd "${SCRIPT_BASEDIR}/.."

# The pid file is only removed when the kill succeeded.
kill $(cat "${PIDFILE}") && rm "${PIDFILE}"
|
data/config/redis.conf
ADDED
@@ -0,0 +1,120 @@
|
|
1
|
+
|
2
|
+
daemonize yes
|
3
|
+
|
4
|
+
pidfile run/redis.pid
|
5
|
+
|
6
|
+
bind 127.0.0.1
|
7
|
+
port 7000
|
8
|
+
|
9
|
+
tcp-backlog 511
|
10
|
+
|
11
|
+
timeout 0
|
12
|
+
|
13
|
+
tcp-keepalive 0
|
14
|
+
|
15
|
+
#debug
|
16
|
+
#verbose
|
17
|
+
#notice
|
18
|
+
loglevel verbose
|
19
|
+
|
20
|
+
#logfile ""
|
21
|
+
logfile tmp/redis.log
|
22
|
+
|
23
|
+
syslog-enabled no
|
24
|
+
# syslog-ident redis
|
25
|
+
# syslog-facility local0
|
26
|
+
|
27
|
+
databases 2
|
28
|
+
|
29
|
+
save 900 1
|
30
|
+
save 300 10
|
31
|
+
save 60 10000
|
32
|
+
|
33
|
+
stop-writes-on-bgsave-error no
|
34
|
+
|
35
|
+
rdbcompression yes
|
36
|
+
|
37
|
+
rdbchecksum no
|
38
|
+
|
39
|
+
dbfilename redis_dump.rdb
|
40
|
+
#dir /var/lib/redis
|
41
|
+
|
42
|
+
#slave-read-only yes
|
43
|
+
|
44
|
+
repl-diskless-sync no
|
45
|
+
repl-diskless-sync-delay 5
|
46
|
+
repl-ping-slave-period 10
|
47
|
+
repl-timeout 60
|
48
|
+
|
49
|
+
repl-disable-tcp-nodelay no
|
50
|
+
# repl-backlog-size 1mb
|
51
|
+
# repl-backlog-ttl 3600
|
52
|
+
|
53
|
+
#slave-priority 100
|
54
|
+
|
55
|
+
# min-slaves-to-write 3
|
56
|
+
# min-slaves-max-lag 10
|
57
|
+
|
58
|
+
# requirepass foobared
|
59
|
+
|
60
|
+
# maxclients 10000
|
61
|
+
|
62
|
+
maxmemory 1gb
|
63
|
+
|
64
|
+
# maxmemory-policy noeviction
|
65
|
+
|
66
|
+
# maxmemory-samples 5
|
67
|
+
|
68
|
+
appendonly no
|
69
|
+
appendfilename "appendonly.aof"
|
70
|
+
|
71
|
+
# appendfsync always
|
72
|
+
appendfsync everysec
|
73
|
+
# appendfsync no
|
74
|
+
|
75
|
+
#no-appendfsync-on-rewrite no
|
76
|
+
|
77
|
+
auto-aof-rewrite-percentage 100
|
78
|
+
auto-aof-rewrite-min-size 64mb
|
79
|
+
|
80
|
+
aof-load-truncated yes
|
81
|
+
|
82
|
+
#lua-time-limit 5000
|
83
|
+
|
84
|
+
#cluster-enabled no
|
85
|
+
|
86
|
+
#cluster-config-file nodes_10100.conf
|
87
|
+
#cluster-node-timeout 15000
|
88
|
+
# cluster-slave-validity-factor 10
|
89
|
+
# cluster-migration-barrier 1
|
90
|
+
#cluster-require-full-coverage yes
|
91
|
+
|
92
|
+
#slowlog-log-slower-than 10000
|
93
|
+
#slowlog-max-len 128
|
94
|
+
|
95
|
+
#latency-monitor-threshold 0
|
96
|
+
|
97
|
+
notify-keyspace-events ""
|
98
|
+
|
99
|
+
hash-max-ziplist-entries 512
|
100
|
+
hash-max-ziplist-value 64
|
101
|
+
|
102
|
+
list-max-ziplist-entries 512
|
103
|
+
list-max-ziplist-value 64
|
104
|
+
|
105
|
+
set-max-intset-entries 512
|
106
|
+
|
107
|
+
zset-max-ziplist-entries 128
|
108
|
+
zset-max-ziplist-value 64
|
109
|
+
|
110
|
+
hll-sparse-max-bytes 3000
|
111
|
+
|
112
|
+
#activerehashing yes
|
113
|
+
|
114
|
+
#client-output-buffer-limit normal 0 0 0
|
115
|
+
#client-output-buffer-limit slave 256mb 64mb 60
|
116
|
+
#client-output-buffer-limit pubsub 32mb 8mb 60
|
117
|
+
|
118
|
+
hz 10
|
119
|
+
|
120
|
+
aof-rewrite-incremental-fsync yes
|
data/lib/sengi.rb
ADDED
@@ -0,0 +1,589 @@
|
|
1
|
+
|
2
|
+
require 'uri'
|
3
|
+
require 'net/http'
|
4
|
+
require 'hiredis'
|
5
|
+
require 'resque'
|
6
|
+
require 'resque-scheduler'
|
7
|
+
require 'nokogiri'
|
8
|
+
require 'time'
|
9
|
+
require 'digest'
|
10
|
+
require 'openssl'
|
11
|
+
require 'zlib'
|
12
|
+
require 'active_support/time'
|
13
|
+
|
14
|
+
require 'thefox-ext'
|
15
|
+
|
16
|
+
module TheFox
|
17
|
+
module Sengi
|
18
|
+
|
19
|
+
class Crawler
|
20
|
+
|
21
|
+
# Prepare a crawler for a single URL.
#
# @param url [String] the URL to crawl.
# @param options [Hash, nil] string-keyed job options. 'serial', 'relative',
#   'force' and 'debug' default to false; 'parent_id' and 'level' default
#   to 0. A key that is present keeps its given value, even when it is nil.
def initialize(url, options)
  @url = url

  defaults = {
    'serial' => false,
    'relative' => false,
    'force' => false,
    'debug' => false,
    'parent_id' => 0,
    'level' => 0,
  }
  # Merge into a new hash: the caller's hash is no longer modified and a
  # nil options argument falls back to the defaults instead of raising.
  @options = defaults.merge(options || {})
  #pp @options

  # State filled in step by step while #go runs.
  @redis = nil
  @uri = nil
  @request = nil
  @response = nil
  @html_doc = nil
  @url_delay = nil
  @url_separate_delay = nil
  @url_reschedule = nil
end
|
43
|
+
|
44
|
+
# Run one complete crawl of the URL, printing a progress line per step.
#
# Steps, in order: Redis and URI setup, blacklist check, URL / domain /
# request bookkeeping, the HTTP request, response bookkeeping, and finally
# link and meta tag processing. The run stops early when the URL is
# blacklisted, ignored (unless 'debug' or 'force' is set before the request,
# or 'force' after it), when the request yielded no response, or when no
# HTML document was produced.
def go
  redis_setup
  uri_setup

  relative_flag = @options['relative'] ? 'y' : 'n'
  puts "#{Time.now.strftime('%F %T')} perform: #{@options['parent_id']} #{@options['level']} #{relative_flag} #{@uri}"

  check_blacklist
  puts "\tblacklisted: #{@uri.is_blacklisted ? 'YES' : 'no'}"
  return if @uri.is_blacklisted

  insert_url
  puts "\turl: #{@uri.id}"
  bypass_ignore = @options['debug'] || @options['force']
  if @uri.is_ignored && !bypass_ignore
    puts "\tignored reason: #{@uri.is_ignored_reason}"
    return
  end

  insert_domain
  puts "\tdomain id: #{@uri.domain_id}"

  insert_request
  puts "\trequest id: #{@uri.request_id}"

  make_http_request
  puts "\thttp response: #{@response.nil? ? 'FAILED' : 'ok'}"
  return if @response.nil?

  insert_response
  puts "\tresponse: #{@uri.response_id} #{@uri.response_size}"

  puts "\tprocess http response"
  process_http_response
  puts "\thttp response"
  if @uri.is_ignored && !@options['force']
    puts "\tignored reason: #{@uri.is_ignored_reason}"
    return
  end
  if @html_doc.nil?
    puts "\tHTML INVALID"
    return
  end

  puts "\tprocess html links"
  process_html_links

  puts "\tprocess html meta"
  process_html_meta

  puts "\turl done"
end
|
94
|
+
|
95
|
+
private
|
96
|
+
|
97
|
+
# Connect to Redis (once) and load the scheduling settings.
#
# Opens a Hiredis connection to 127.0.0.1:7000 and selects database 1 when
# no connection exists yet, then reads 'urls:delay', 'urls:separatedelay'
# and 'urls:reschedule' into @url_delay, @url_separate_delay and
# @url_reschedule. A key that is missing in Redis falls back to URL_DELAY,
# URL_SEPARATE_DELAY or URL_RESCHEDULE respectively.
def redis_setup
  if @redis.nil?
    @redis = Hiredis::Connection.new
    @redis.connect('127.0.0.1', 7000)
    @redis.write(['SELECT', 1])
    @redis.read
  end

  # The fallbacks are passed as blocks so a constant is only looked up when
  # its key is actually missing.
  @url_delay = read_integer_setting('urls:delay') { URL_DELAY }
  @url_separate_delay = read_integer_setting('urls:separatedelay') { URL_SEPARATE_DELAY }
  @url_reschedule = read_integer_setting('urls:reschedule') { URL_RESCHEDULE }
end

# Read +key+ from Redis as an Integer; yield for the fallback when the key
# does not exist. The nil check must happen before #to_i, because nil.to_i
# is 0 and would hide the missing key.
def read_integer_setting(key)
  @redis.write(['GET', key])
  raw = @redis.read
  raw.nil? ? yield : raw.to_i
end
|
124
|
+
|
125
|
+
# Wrap the raw URL in a Uri object and replace @url with that object's
# string form.
def uri_setup
  parsed = Uri.new(@url)
  @uri = parsed
  @url = parsed.to_s
end
|
130
|
+
|
131
|
+
# Set @uri.is_blacklisted from the 'domains:ignore' set in Redis.
#
# Only the last two labels of the host are compared, e.g. 'www.facebook.com'
# is checked as 'facebook.com'. Nothing is changed (and Redis is not
# queried) when the URL has no usable host.
def check_blacklist
  host = @uri.ruri.host
  return if host.nil?

  # last(2) also copes with single-label hosts such as 'localhost', where
  # the former [-2..-1] slice returned nil and raised NoMethodError.
  domain_topparts = host.split('.').last(2).join('.')
  return if domain_topparts.empty?

  # Read Domains Blacklist
  @redis.write(['SMEMBERS', 'domains:ignore'])
  domains_ignore = @redis.read

  # Exact match first, then any blacklist entry that contains the domain.
  # The domain is escaped so '.' and other regex metacharacters in a host
  # are matched literally and can no longer raise RegexpError.
  # NOTE(review): the original comment expected an entry like 'google' to
  # match the domain 'google.com'; this lookup works in the other direction
  # (entries containing the domain) - confirm the intended semantics.
  @uri.is_blacklisted =
    domains_ignore.include?(domain_topparts) ||
    domains_ignore.grep(Regexp.new(Regexp.escape(domain_topparts))).count > 0
end
|
154
|
+
|
155
|
+
# Register the URL in Redis, or update its bookkeeping when it is already
# known.
#
# Known URL: loads its id and stored fields, marks it ignored when it was
# already ignored or has reached 3 request attempts, and always bumps
# 'request_attempts' / 'request_attempt_last_at'.
# New URL: takes the next 'urls:id', stores the URL hash and the
# hash -> id reference.
def insert_url
  timestamp = lambda { Time.now.strftime('%F %T %z') }

  @redis.write(['EXISTS', @uri.hash_id_key_name])
  url_known = @redis.read.to_b

  if url_known
    @redis.write(['GET', @uri.hash_id_key_name])
    @uri.id = @redis.read

    @redis.write(['HGETALL', @uri.key_name])
    stored = Hash[*@redis.read]
    #pp stored

    @uri.is_ignored = stored['is_ignored'].to_i.to_b
    attempts = stored['request_attempts'].to_i

    puts "\trequest attempts: #{attempts}"

    if @uri.is_ignored
      @uri.is_ignored_reason = 'already ignored'
    elsif attempts >= 3
      # Too many attempts: flag the URL as ignored, locally and in Redis.
      @uri.is_ignored = true
      @uri.is_ignored_reason = 'attempts >= 3'

      ignore_cmd = ['HMSET', @uri.key_name, 'is_ignored', 1, 'ignored_at', timestamp.call]
      @redis.write(ignore_cmd)
      @redis.read
    end

    # The attempt counter is raised even when the URL ends up ignored.
    # @redis.write(['HINCRBY', @uri.key_name, 'request_attempts', 1])
    # @redis.read
    attempt_cmd = [
      'HMSET', @uri.key_name,
      'request_attempts', attempts + 1,
      'request_attempt_last_at', timestamp.call,
    ]
    @redis.write(attempt_cmd)
    @redis.read
  else
    # Unknown URL: allocate the next id.
    @redis.write(['INCR', 'urls:id'])
    @uri.id = @redis.read

    created = timestamp.call

    insert_cmd = [
      'HMSET', @uri.key_name,
      'url', @uri.to_s,
      'hash', @uri.to_hash,
      'request_attempts', 1,
      'request_attempt_last_at', created,
      'parent_id', @options['parent_id'],
      'level', @options['level'],
      'is_blacklisted', @uri.is_blacklisted.to_i,
      'is_ignored', 0,
      #'ignored_at', nil,
      'is_redirect', 0,
      'created_at', created,
    ]
    @redis.write(insert_cmd)
    @redis.read

    # Remember which id belongs to this URL hash.
    @redis.write(['SET', @uri.hash_id_key_name, @uri.id])
    @redis.read
  end
end
|
226
|
+
|
227
|
+
# Register the URL's domain in Redis and link the URL to it.
#
# Adds the domain to 'domains:indexed', resolves (or creates) the domain id
# via the domain hash reference, and adds the URL id to the domain's URL set.
def insert_domain
  @redis.write(['SADD', 'domains:indexed', @uri.domain_nowww])
  @redis.read.to_b

  @redis.write(['EXISTS', @uri.domain_hash_id_key_name])
  domain_known = @redis.read.to_b

  if domain_known
    # Reuse the id stored for this domain hash.
    @redis.write(['GET', @uri.domain_hash_id_key_name])
    @uri.domain_id = @redis.read
  else
    # Unknown domain: allocate the next id and store the domain record.
    @redis.write(['INCR', 'domains:id'])
    @uri.domain_id = @redis.read

    domain_cmd = [
      'HMSET', @uri.domain_key_name,
      'domain_nowww', @uri.domain_nowww,
      'domain_original', @uri.ruri.host,
      'hash_nowww', @uri.domain_nowww_hash,
      'hash_original', @uri.domain_original_hash,
      'created_at', Time.now.strftime('%F %T %z'),
    ]
    @redis.write(domain_cmd)
    @redis.read

    # Remember which id belongs to this domain hash.
    @redis.write(['SET', @uri.domain_hash_id_key_name, @uri.domain_id])
    @redis.read
  end

  # Track the URL under its domain.
  urls_key = "domains:#{@uri.domain_id}:urls"
  @redis.write(['SADD', urls_key, @uri.id])
  @redis.read
end
|
262
|
+
|
263
|
+
# Create a request record for the URL in Redis.
#
# Takes the next 'requests:id', stores the request hash (size 0, no error
# yet) and adds the request id to the URL's request set.
def insert_request
  @redis.write(['INCR', 'requests:id'])
  @uri.request_id = @redis.read

  request_cmd = [
    'HMSET', @uri.request_key_name,
    'url_id', @uri.id,
    'user_agent', HTTP_USER_AGENT,
    'error', 0,
    #'error_msg', nil,
    'size', 0,
    'created_at', Time.now.strftime('%F %T %z'),
  ]
  @redis.write(request_cmd)
  @redis.read

  # Track the request under its URL.
  requests_key = "urls:#{@uri.id}:requests"
  @redis.write(['SADD', requests_key, @uri.request_id])
  @redis.read
end
|
283
|
+
|
284
|
+
# Send the HTTP GET request for the URL and keep the response in @response.
#
# The serialized request header size (in bytes) is stored in the request
# record. When the request fails with a StandardError, @response is set to
# nil, the error is stored in the request record and the URL is
# re-enqueued. After a completed request the URL is flagged as ignored so
# it is not requested again.
#
# Interrupts, signals and SystemExit are deliberately NOT rescued, so a
# worker that is asked to stop during a request can actually stop.
def make_http_request
  http = Net::HTTP.new(@uri.ruri.host, @uri.ruri.port)
  http.keep_alive_timeout = 0
  http.open_timeout = 5
  http.read_timeout = 5
  http.ssl_timeout = 5
  if @uri.ruri.scheme.to_s.downcase == 'https'
    http.use_ssl = true
    # Certificates are not verified.
    http.verify_mode = OpenSSL::SSL::VERIFY_NONE
  end

  @request = Net::HTTP::Get.new(@uri.ruri.request_uri)
  @request['User-Agent'] = HTTP_USER_AGENT
  @request['Referer'] = HTTP_REFERER
  @request['Connection'] = 'close'
  @request['Accept'] = 'text/html'
  @request['Accept-Encoding'] = 'gzip;q=1.0,identity;q=0.6'
  @request['Accept-Language'] = 'en,en-US;q=0.8'

  # Serialize the request into a buffer only to measure it. bytesize is
  # used because the value is later summed up as traffic in bytes.
  string_io = StringIO.new
  @request.exec(string_io, Net::HTTP::HTTPVersion, @request.path)
  @redis.write(['HSET', @uri.request_key_name, 'size', string_io.string.bytesize])
  @redis.read

  begin
    puts "\t" + 'http request'
    @response = http.request(@request)
    puts "\t" + 'http request ok'
  rescue StandardError => e
    puts "\t" + "ERROR: #{e.class} #{e}"

    @response = nil

    # Save the error and error message to the URL Request.
    @redis.write(['HMSET', @uri.request_key_name,
      'error', 1,
      'error_msg', e.to_s,
      ])
    @redis.read

    reenqueue
    return
  end

  # Ignore the URL for further requests because it was successful.
  @redis.write(['HMSET', @uri.key_name,
    'is_ignored', 1,
    'ignored_at', Time.now.strftime('%F %T %z'),
    ])
  @redis.read
end
|
337
|
+
|
338
|
+
# Create a response record for the current @response in Redis.
#
# Takes the next 'responses:id', links it to the URL, stores an approximate
# response size in bytes (header lines plus body), the content type, status
# code and request id, and adds the response to the per-status-code set.
def insert_response
  # Increase the Responses ID.
  @redis.write(['INCR', 'responses:id'])
  @uri.response_id = @redis.read

  # Add the Response ID to the URL.
  @redis.write(['SADD', "urls:#{@uri.id}:responses", @uri.response_id])
  @redis.read

  # This is still too inaccurate: the header block is rebuilt from the
  # parsed header hash rather than taken from the wire.
  header_lines = @response.to_hash.map do |name, values|
    joined = values.is_a?(Array) ? values.join(' ') : values
    "#{name}: #{joined}"
  end
  response_size = header_lines.join("\r\n").bytesize + 4

  # The body can be nil (a response without a body); count it as empty
  # instead of raising. bytesize keeps the total in bytes.
  response_size += @response.body.to_s.bytesize

  @uri.response_size = response_size
  @uri.response_content_type = @response['Content-Type']

  # Insert the new Response.
  @redis.write(['HMSET', @uri.response_key_name,
    'code', @response.code.to_i,
    'content_type', @uri.response_content_type,
    'request_id', @uri.request_id,
    'size', @uri.response_size,
    'created_at', Time.now.strftime('%F %T %z'),
    ])
  @redis.read

  # Add the Response to the Response Code.
  @redis.write(['SADD', "responses:code:#{@response.code}", @uri.response_id])
  @redis.read
end
|
377
|
+
|
378
|
+
# Evaluate @response: parse HTML, follow redirects, or mark the URL ignored.
#
# - 200 with a 'text/html' content type: the (gunzipped) body is parsed
#   into @html_doc.
# - 200 with any other or a missing content type: the URL is ignored.
# - 301..399: the URL is flagged as redirect and the 'Location' target, if
#   present, is enqueued.
# - any other code: the URL is ignored.
# An ignored URL is also flagged as ignored in Redis.
def process_http_response
  content_encoding = @response['Content-Encoding']
  body = if !content_encoding.nil? && content_encoding.downcase == 'gzip'
    Zlib::GzipReader.new(StringIO.new(@response.body)).read
  else
    @response.body
  end

  code = @response.code.to_i
  puts "\t" + "http response code: #{code}"

  if code == 200
    # to_s guards against responses without a Content-Type header, which
    # used to raise NoMethodError on nil.
    if @uri.response_content_type.to_s[0..8] == 'text/html'
      @html_doc = Nokogiri::HTML(body)
      @html_doc.remove_namespaces!
    else
      # Ignore the URL if the response content type isn't HTML.
      @uri.is_ignored = true
      @uri.is_ignored_reason = "wrong content type: #{@uri.response_content_type}"
    end
  elsif code >= 301 && code <= 399
    @redis.write(['HSET', @uri.key_name, 'is_redirect', 1])
    @redis.read

    if !@response['Location'].nil?
      # Follow the URL.
      new_uri = Uri.new(@response['Location'])

      enqueue(new_uri)
    end
  else
    @uri.is_ignored = true
    @uri.is_ignored_reason = "wrong code: #{code}"
  end

  if @uri.is_ignored
    @redis.write(['HSET', @uri.key_name, 'is_ignored', 1])
    @redis.read
  end
end
|
418
|
+
|
419
|
+
# Enqueue every valid link found in the <a> tags of the parsed document.
#
# Links are ordered by their weight relative to the current URL; the
# position in that order is passed to #enqueue as the index.
def process_html_links
  candidates = @html_doc.xpath('//a').map do |anchor|
    target = anchor['href']
    #puts "link #{target}"

    # Anchors without href yield nil and are dropped below.
    target.nil? ? nil : Uri.new(target)
  end

  usable = candidates.reject { |candidate| candidate.nil? || !candidate.is_valid? }

  ordered = usable.sort do |left, right|
    left.weight(@uri) <=> right.weight(@uri)
  end

  ordered.each_with_index do |link_uri, position|
    #puts "index #{position} #{link_uri} #{link_uri.is_relative?(@uri)}"
    enqueue(link_uri, position)
  end
end
|
447
|
+
|
448
|
+
# Walk over all <meta> tags of the parsed document and hand those named
# 'generator' (case-insensitive) to #process_html_meta_generator.
def process_html_meta
  @html_doc.xpath('//meta').each do |tag|
    tag_name = tag['name']
    next if tag_name.nil?

    process_html_meta_generator(tag) if tag_name.downcase == 'generator'
  end
end
|
462
|
+
|
463
|
+
# Record the generator named by a <meta name="generator"> tag.
#
# Looks the generator up by the SHA256 of its content (creating a new
# 'generators:<id>' record when unknown), refreshes its 'last_used_at'
# timestamp and links generator and URL to each other.
#
# @param meta [#[]] the meta tag; its 'content' attribute is the generator name.
def process_html_meta_generator(meta)
  generator = meta['content']

  # A generator tag without (or with blank) content carries no name; the
  # nil case used to raise TypeError in hexdigest.
  return if generator.nil? || generator.strip.empty?

  generator_hash = Digest::SHA256.hexdigest(generator)

  generator_id = nil
  generator_hash_id_key_name = "generators:id:#{generator_hash}"

  @redis.write(['EXISTS', generator_hash_id_key_name])
  if @redis.read.to_b
    # Found existing generator.
    @redis.write(['GET', generator_hash_id_key_name])
    generator_id = @redis.read

    generator_key_name = "generators:#{generator_id}"
  else
    # New generator. Increase the Generators ID.
    @redis.write(['INCR', 'generators:id'])
    generator_id = @redis.read

    generator_key_name = "generators:#{generator_id}"
    @redis.write(['HMSET', generator_key_name,
      'name', generator,
      'hash', generator_hash,
      'first_url_id', @uri.id,
      #'last_used_at', Time.now.strftime('%F %T %z'),
      'created_at', Time.now.strftime('%F %T %z'),
      ])
    @redis.read

    # Set the Generator Hash to Generator ID reference.
    @redis.write(['SET', generator_hash_id_key_name, generator_id])
    @redis.read
  end

  # Always overwrite the last used timestamp.
  @redis.write(['HSET', generator_key_name, 'last_used_at', Time.now.strftime('%F %T %z')])
  @redis.read

  # Add the URL to the Generator.
  @redis.write(['SADD', "generators:#{generator_id}:urls", @uri.id])
  @redis.read

  # Add the Generator to the URL.
  @redis.write(['SADD', "urls:#{@uri.id}:generators", generator_id])
  @redis.read
end
|
513
|
+
|
514
|
+
# Schedule a crawl job for a link found on the current page.
#
# The link is skipped when the 'relative' option is set and the link is not
# relative to the current URL, or when the joined URL is not valid.
# In 'serial' mode the job time is the next free slot after
# 'urls:schedule:last' (never in the past) plus @url_delay; otherwise it is
# @url_delay plus @url_separate_delay * index seconds from now.
#
# @param new_uri [Uri] the link to schedule.
# @param index [Integer] position of the link on the page, spreads the jobs.
# @param debug [Boolean] when true, only print the line and enqueue nothing.
# @return the result of Resque.enqueue_at, or nil when nothing was enqueued.
def enqueue(new_uri, index = 0, debug = false)
  return unless !@options['relative'] || new_uri.is_relative?(@uri)

  new_uri = @uri.join(new_uri)
  return unless new_uri.is_valid?

  new_uri_s = new_uri.to_s

  queued_time = if @options['serial']
    next_serial_slot
  else
    (@url_delay + (@url_separate_delay * index)).seconds.from_now
  end

  puts "\t" + "enqueue #{@options['level']} #{index} #{queued_time} #{new_uri_s}"

  return if debug

  options = {
    'serial' => @options['serial'],
    'relative' => @options['relative'],
    'parent_id' => @uri.id,
    'level' => @options['level'] + 1,
  }
  Resque.enqueue_at(queued_time, TheFox::Sengi::CrawlerWorker, new_uri_s, options)
end

# Reserve the next slot of the serial schedule and return it as a Time.
# The read-modify-write of 'urls:schedule:last' runs under the schedule
# lock, which is released even when the section raises.
def next_serial_slot
  acquire_schedule_lock
  begin
    @redis.write(['GET', 'urls:schedule:last'])
    last_slot = @redis.read

    now = Time.now
    slot = last_slot.nil? ? now : Time.parse(last_slot)
    slot = now if slot < now
    slot += @url_delay

    @redis.write(['SET', 'urls:schedule:last', slot.strftime('%F %T %z')])
    @redis.read

    slot
  ensure
    release_schedule_lock
  end
end

# Take the 'urls:schedule:lock' counter. INCR is a single Redis command, so
# exactly one client sees the value 1 and owns the lock; everyone else
# undoes its increment and retries after a short pause. (The former
# GET-then-INCR sequence let two processes enter together.)
def acquire_schedule_lock
  loop do
    @redis.write(['INCR', 'urls:schedule:lock'])
    return if @redis.read.to_i == 1

    @redis.write(['DECR', 'urls:schedule:lock'])
    @redis.read
    sleep 0.1
  end
end

# Give the schedule lock back to other processes.
def release_schedule_lock
  @redis.write(['DECR', 'urls:schedule:lock'])
  @redis.read
end
|
573
|
+
|
574
|
+
# Schedule another attempt for the current URL, @url_reschedule seconds
# from now.
#
# 'parent_id' and 'level' are handed on to the retry job: without them the
# retry started again at level 0, so links found on the retried page were
# enqueued with a wrong level. 'force' and 'debug' are intentionally not
# handed on, so the attempt limit still applies to retries.
def reenqueue
  queued_time = @url_reschedule.seconds.from_now

  puts "\t" + "re-enqueue #{queued_time}"

  options = {
    'serial' => @options['serial'],
    'relative' => @options['relative'],
    'parent_id' => @options['parent_id'],
    'level' => @options['level'],
  }
  Resque.enqueue_at(queued_time, TheFox::Sengi::CrawlerWorker, @uri.to_s, options)
end
|
585
|
+
|
586
|
+
end
|
587
|
+
|
588
|
+
end
|
589
|
+
end
|