sengi 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,129 @@
1
#!/usr/bin/env ruby
# coding: UTF-8
#
# CLI tool to inspect the crawler's Redis database.
#
# Subcommands:
#   urls              print all crawled URLs
#   domains           print all crawled domains
#   domains ignore    print the domain blacklist
#   generators        print all detected HTML generators
#   stats             print id counters and traffic totals

require 'optparse'
# Required explicitly instead of relying on transitive loads via 'sengi'.
require 'uri'
require 'hiredis'
require 'active_support'
require 'active_support/core_ext/numeric/conversions'
require 'sengi'


@options = {
}
opts = OptionParser.new do |o|
	o.banner = 'Usage: list'
	o.separator('')

	o.on_tail('-h', '--help', 'Show this message.') do
		puts o
		puts
		exit 3
	end
end
ARGV << '-h' if ARGV.count == 0
commands = opts.parse(ARGV)

@redis = Hiredis::Connection.new
@redis.connect('127.0.0.1', 7000)
@redis.write(['SELECT', 1])
@redis.read

# Send a single Redis command and return its reply.
def redis_call(*args)
	@redis.write(args)
	@redis.read
end

# Iterate the hashes stored under "<prefix>:1" .. "<prefix>:<id counter>".
# Yields (id, hash); ids whose hash no longer exists are skipped.
def each_hash(prefix)
	max_id = redis_call('GET', "#{prefix}:id").to_i
	(1..max_id).each do |id|
		raw = redis_call('HGETALL', "#{prefix}:#{id}")
		yield id, Hash[*raw] if raw.length > 0
	end
end

command = commands.shift

if command == 'urls'
	each_hash('urls') do |_url_id, url|
		puts URI(url['url']).to_s
	end
elsif command == 'domains'
	subcommand = commands.shift
	if subcommand == 'ignore'
		puts redis_call('SMEMBERS', 'domains:ignore')
	else
		each_hash('domains') do |_domain_id, domain|
			puts domain['domain_nowww']
		end
	end
elsif command == 'generators'
	each_hash('generators') do |_generator_id, generator|
		puts generator['name']
	end
elsif command == 'stats'
	traffic_out = 0
	traffic_in = 0

	urls_id = redis_call('GET', 'urls:id').to_i
	requests_id = redis_call('GET', 'requests:id').to_i
	responses_id = redis_call('GET', 'responses:id').to_i

	# Sum the stored request/response sizes over every known URL.
	each_hash('urls') do |url_id, _url|
		request_ids = redis_call('SMEMBERS', "urls:#{url_id}:requests").map{ |rid| rid.to_i }
		request_ids.each do |request_id|
			raw_request = redis_call('HGETALL', "requests:#{request_id}")
			if raw_request.length > 0
				request = Hash[*raw_request]
				traffic_out += request['size'].to_i
			end
		end

		response_ids = redis_call('SMEMBERS', "urls:#{url_id}:responses").map{ |rid| rid.to_i }
		response_ids.each do |response_id|
			raw_response = redis_call('HGETALL', "responses:#{response_id}")
			if raw_response.length > 0
				response = Hash[*raw_response]
				traffic_in += response['size'].to_i
			end
		end
	end

	puts "urls id: %6d" % [urls_id]
	puts "requests id: %6d" % [requests_id]
	puts "responses id: %6d" % [responses_id]
	puts

	puts 'traffic'
	puts 'bytes out: %9d (%s)' % [traffic_out, traffic_out.to_s(:human_size, precision: 2)]
	puts 'bytes in:  %9d (%s)' % [traffic_in, traffic_in.to_s(:human_size, precision: 2)]
end
@@ -0,0 +1,11 @@
1
#!/usr/bin/env bash

# Start the project-local Redis server.

set -e
SCRIPT_BASEDIR=$(dirname "$0")
SCRIPT_BASENAME=$(basename "$0")


cd "${SCRIPT_BASEDIR}/.."

# Abort (via set -e) when redis-server is not installed.
# 'command -v' is the POSIX-recommended replacement for 'which'.
command -v redis-server > /dev/null
redis-server config/redis.conf
@@ -0,0 +1,13 @@
1
#!/usr/bin/env bash

# Periodically print the Redis server's memory usage.

set -e
SCRIPT_BASEDIR=$(dirname "$0")
SCRIPT_BASENAME=$(basename "$0")


cd "${SCRIPT_BASEDIR}/../tmp"

# Report used memory every 10 seconds until interrupted.
while true; do
	redis-cli -p 7000 info | grep used_memory_human
	sleep 10
done
@@ -0,0 +1,10 @@
1
#!/usr/bin/env bash

# Stop the project-local Redis server using its pid file.

set -e
SCRIPT_BASEDIR=$(dirname "$0")
SCRIPT_BASENAME=$(basename "$0")


cd "${SCRIPT_BASEDIR}/.."

# 'cat' fails (and set -e aborts) when the pid file is missing.
kill "$(cat run/redis.pid)"
@@ -0,0 +1,14 @@
1
#!/usr/bin/env bash

# Restart a Resque crawler worker: stop it, then start it again.
# Optional first argument is the numeric worker id (default 0).

set -e
DATE=$(date +"%Y-%m-%d %H:%M:%S %z")
SCRIPT_BASEDIR=$(dirname "$0")
SCRIPT_BASENAME=$(basename "$0")

id=${1:-0}


cd "${SCRIPT_BASEDIR}"

./resque_crawler_stop "${id}"
./resque_crawler_start "${id}"
@@ -0,0 +1,21 @@
1
#!/usr/bin/env bash

# Start a background Resque crawler worker. Optional first argument is a
# numeric worker id (default 0) so several workers can run in parallel.

set -e
DATE=$(date +"%Y-%m-%d %H:%M:%S %z")
SCRIPT_BASEDIR=$(dirname "$0")
SCRIPT_BASENAME=$(basename "$0")

id=${1:-0}


cd "${SCRIPT_BASEDIR}/.."

export QUEUE=crawler
export PIDFILE=./run/resque_${QUEUE}_${id}.pid
export INTERVAL=1
#export COUNT=1
#export BACKGROUND=yes
LOG=tmp/resque_${QUEUE}_${id}.log

echo "${DATE} start" >> "${LOG}"
# Merge stderr into stdout via 2>&1 instead of opening the same log file
# twice in append mode (two independent descriptors can interleave badly).
rake resque:work --trace >> "${LOG}" 2>&1 < /dev/null &
@@ -0,0 +1,20 @@
1
#!/usr/bin/env bash

# Stop a running Resque crawler worker by id (default 0).

set -e
DATE=$(date +"%Y-%m-%d %H:%M:%S %z")
SCRIPT_BASEDIR=$(dirname "$0")
SCRIPT_BASENAME=$(basename "$0")

id=${1:-0}


cd "${SCRIPT_BASEDIR}/.."

export QUEUE=crawler
export PIDFILE=./run/resque_${QUEUE}_${id}.pid
LOG=tmp/resque_${QUEUE}_${id}.log

# SIGQUIT lets the worker finish its current job before exiting.
pid=$(cat "${PIDFILE}")
kill -QUIT "${pid}"
rm "${PIDFILE}"
echo "${DATE} process ended: ${pid}" >> "${LOG}"
@@ -0,0 +1,15 @@
1
#!/usr/bin/env bash

# Start the resque-scheduler in the background.

set -e
DATE=$(date +"%Y-%m-%d %H:%M:%S %z")
SCRIPT_BASEDIR=$(dirname "$0")
SCRIPT_BASENAME=$(basename "$0")
export PIDFILE=./run/queue_scheduler.pid
export RESQUE_SCHEDULER_INTERVAL=1
LOG=tmp/resque_scheduler.log


cd "${SCRIPT_BASEDIR}/.."

echo "${DATE} start" >> "${LOG}"
# Append instead of '&>': the old truncating redirect wiped out the
# start line that was just appended above.
rake resque:scheduler --trace >> "${LOG}" 2>&1 < /dev/null &
@@ -0,0 +1,16 @@
1
#!/usr/bin/env bash

# Stop the resque-scheduler via its pid file.

set -e
DATE=$(date +"%Y-%m-%d %H:%M:%S %z")
SCRIPT_BASEDIR=$(dirname "$0")
SCRIPT_BASENAME=$(basename "$0")
export PIDFILE=./run/queue_scheduler.pid
LOG=tmp/resque_scheduler.log


cd "${SCRIPT_BASEDIR}/.."

pid=$(cat "${PIDFILE}")
kill -QUIT "${pid}"
sleep 1
echo "${DATE} process exit: ${pid}" >> "${LOG}"
@@ -0,0 +1,13 @@
1
#!/usr/bin/env bash

# Start the resque-web frontend on 127.0.0.1:8282.

set -e
DATE=$(date +"%Y-%m-%d %H:%M:%S %z")
SCRIPT_BASEDIR=$(dirname "$0")
SCRIPT_BASENAME=$(basename "$0")
PIDFILE=./run/resque_server.pid
LOG=tmp/resque_server.log


cd "${SCRIPT_BASEDIR}/.."

resque-web -F -L --host 127.0.0.1 --port 8282 --pid-file "${PIDFILE}" config/resque_server_config.rb &> "${LOG}" < /dev/null &
@@ -0,0 +1,13 @@
1
#!/usr/bin/env bash

# Stop the resque-web frontend and remove its pid file.
# (Removed a stray 'set -x' debug leftover that traced every command.)

set -e
DATE=$(date +"%Y-%m-%d %H:%M:%S %z")
SCRIPT_BASEDIR=$(dirname "$0")
SCRIPT_BASENAME=$(basename "$0")
PIDFILE=./run/resque_server.pid


cd "${SCRIPT_BASEDIR}/.."

kill "$(cat "${PIDFILE}")" && rm "${PIDFILE}"
@@ -0,0 +1,120 @@
1
+
2
+ daemonize yes
3
+
4
+ pidfile run/redis.pid
5
+
6
+ bind 127.0.0.1
7
+ port 7000
8
+
9
+ tcp-backlog 511
10
+
11
+ timeout 0
12
+
13
+ tcp-keepalive 0
14
+
15
+ #debug
16
+ #verbose
17
+ #notice
18
+ loglevel verbose
19
+
20
+ #logfile ""
21
+ logfile tmp/redis.log
22
+
23
+ syslog-enabled no
24
+ # syslog-ident redis
25
+ # syslog-facility local0
26
+
27
+ databases 2
28
+
29
+ save 900 1
30
+ save 300 10
31
+ save 60 10000
32
+
33
+ stop-writes-on-bgsave-error no
34
+
35
+ rdbcompression yes
36
+
37
+ rdbchecksum no
38
+
39
+ dbfilename redis_dump.rdb
40
+ #dir /var/lib/redis
41
+
42
+ #slave-read-only yes
43
+
44
+ repl-diskless-sync no
45
+ repl-diskless-sync-delay 5
46
+ repl-ping-slave-period 10
47
+ repl-timeout 60
48
+
49
+ repl-disable-tcp-nodelay no
50
+ # repl-backlog-size 1mb
51
+ # repl-backlog-ttl 3600
52
+
53
+ #slave-priority 100
54
+
55
+ # min-slaves-to-write 3
56
+ # min-slaves-max-lag 10
57
+
58
+ # requirepass foobared
59
+
60
+ # maxclients 10000
61
+
62
+ maxmemory 1gb
63
+
64
+ # maxmemory-policy noeviction
65
+
66
+ # maxmemory-samples 5
67
+
68
+ appendonly no
69
+ appendfilename "appendonly.aof"
70
+
71
+ # appendfsync always
72
+ appendfsync everysec
73
+ # appendfsync no
74
+
75
+ #no-appendfsync-on-rewrite no
76
+
77
+ auto-aof-rewrite-percentage 100
78
+ auto-aof-rewrite-min-size 64mb
79
+
80
+ aof-load-truncated yes
81
+
82
+ #lua-time-limit 5000
83
+
84
+ #cluster-enabled no
85
+
86
+ #cluster-config-file nodes_10100.conf
87
+ #cluster-node-timeout 15000
88
+ # cluster-slave-validity-factor 10
89
+ # cluster-migration-barrier 1
90
+ #cluster-require-full-coverage yes
91
+
92
+ #slowlog-log-slower-than 10000
93
+ #slowlog-max-len 128
94
+
95
+ #latency-monitor-threshold 0
96
+
97
+ notify-keyspace-events ""
98
+
99
+ hash-max-ziplist-entries 512
100
+ hash-max-ziplist-value 64
101
+
102
+ list-max-ziplist-entries 512
103
+ list-max-ziplist-value 64
104
+
105
+ set-max-intset-entries 512
106
+
107
+ zset-max-ziplist-entries 128
108
+ zset-max-ziplist-value 64
109
+
110
+ hll-sparse-max-bytes 3000
111
+
112
+ #activerehashing yes
113
+
114
+ #client-output-buffer-limit normal 0 0 0
115
+ #client-output-buffer-limit slave 256mb 64mb 60
116
+ #client-output-buffer-limit pubsub 32mb 8mb 60
117
+
118
+ hz 10
119
+
120
+ aof-rewrite-incremental-fsync yes
@@ -0,0 +1,6 @@
1
+
2
+ require 'resque'
3
+ require 'resque-scheduler'
4
+ require 'resque/scheduler/server'
5
+
6
+ Resque.redis = '127.0.0.1:7000:0'
@@ -0,0 +1,5 @@
1
+
2
+ require 'sengi/version'
3
+ require 'sengi/crawler_worker'
4
+ require 'sengi/crawler'
5
+ require 'sengi/uri'
@@ -0,0 +1,589 @@
1
+
2
+ require 'uri'
3
+ require 'net/http'
4
+ require 'hiredis'
5
+ require 'resque'
6
+ require 'resque-scheduler'
7
+ require 'nokogiri'
8
+ require 'time'
9
+ require 'digest'
10
+ require 'openssl'
11
+ require 'zlib'
12
+ require 'active_support/time'
13
+
14
+ require 'thefox-ext'
15
+
16
+ module TheFox
17
+ module Sengi
18
+
19
+ class Crawler
20
+
21
+ def initialize(url, options)
22
+ @url = url
23
+ @options = options
24
+
25
+ @options['serial'] = false if !@options.has_key?('serial')
26
+ @options['relative'] = false if !@options.has_key?('relative')
27
+ @options['force'] = false if !@options.has_key?('force')
28
+ @options['debug'] = false if !@options.has_key?('debug')
29
+
30
+ @options['parent_id'] = 0 if !@options.has_key?('parent_id')
31
+ @options['level'] = 0 if !@options.has_key?('level')
32
+ #pp @options
33
+
34
+ @redis = nil
35
+ @uri = nil
36
+ @request = nil
37
+ @response = nil
38
+ @html_doc = nil
39
+ @url_delay = nil
40
+ @url_separate_delay = nil
41
+ @url_reschedule = nil
42
+ end
43
+
44
+ def go
45
+ redis_setup
46
+
47
+ uri_setup
48
+ puts "#{Time.now.strftime('%F %T')} perform: #{@options['parent_id']} #{@options['level']} #{@options['relative'] ? 'y' : 'n'} #{@uri}"
49
+
50
+ check_blacklist
51
+ puts "\t" + "blacklisted: #{@uri.is_blacklisted ? 'YES' : 'no'}"
52
+ return if @uri.is_blacklisted
53
+
54
+ insert_url
55
+ puts "\t" + "url: #{@uri.id}"
56
+ if @uri.is_ignored && !@options['debug'] && !@options['force']
57
+ puts "\t" + "ignored reason: #{@uri.is_ignored_reason}"
58
+ return
59
+ end
60
+
61
+ insert_domain
62
+ puts "\t" + "domain id: #{@uri.domain_id}"
63
+
64
+ insert_request
65
+ puts "\t" + "request id: #{@uri.request_id}"
66
+
67
+ make_http_request
68
+ puts "\t" + "http response: #{@response.nil? ? 'FAILED' : 'ok'}"
69
+ return if @response.nil?
70
+
71
+ insert_response
72
+ puts "\t" + "response: #{@uri.response_id} #{@uri.response_size}"
73
+
74
+ puts "\t" + 'process http response'
75
+ process_http_response
76
+ puts "\t" + "http response"
77
+ if @uri.is_ignored && !@options['force']
78
+ puts "\t" + "ignored reason: #{@uri.is_ignored_reason}"
79
+ return
80
+ end
81
+ if @html_doc.nil?
82
+ puts "\t" + 'HTML INVALID'
83
+ return
84
+ end
85
+
86
+ puts "\t" + 'process html links'
87
+ process_html_links
88
+
89
+ puts "\t" + 'process html meta'
90
+ process_html_meta
91
+
92
+ puts "\t" + 'url done'
93
+ end
94
+
95
+ private
96
+
97
+ def redis_setup
98
+ # Redis Setup
99
+ if @redis.nil?
100
+ @redis = Hiredis::Connection.new
101
+ @redis.connect('127.0.0.1', 7000)
102
+ @redis.write(['SELECT', 1])
103
+ @redis.read
104
+ end
105
+
106
+ @redis.write(['GET', 'urls:delay'])
107
+ @url_delay = @redis.read.to_i
108
+ if @url_delay.nil?
109
+ @url_delay = URL_DELAY
110
+ end
111
+
112
+ @redis.write(['GET', 'urls:separatedelay'])
113
+ @url_separate_delay = @redis.read.to_i
114
+ if @url_separate_delay.nil?
115
+ @url_separate_delay = URL_SEPARATE_DELAY
116
+ end
117
+
118
+ @redis.write(['GET', 'urls:reschedule'])
119
+ @url_reschedule = @redis.read.to_i
120
+ if @url_reschedule.nil?
121
+ @url_reschedule = URL_RESCHEDULE
122
+ end
123
+ end
124
+
125
+ def uri_setup
126
+ # URL object
127
+ @uri = Uri.new(@url)
128
+ @url = @uri.to_s
129
+ end
130
+
131
+ def check_blacklist
132
+ # Check if the current URL domain (second- + top-level) is in the blacklist.
133
+
134
+ if !@uri.ruri.host.nil?
135
+ # This splits for example the domain 'www.facebook.com' to
136
+ # ['www', 'facebook', 'com'] and then uses the last two parts
137
+ # ['facebook', 'com'] to make the check.
138
+ domain_topparts = @uri.ruri.host.split('.')[-2..-1].join('.')
139
+
140
+ # Read Domains Blacklist
141
+ @redis.write(['SMEMBERS', 'domains:ignore'])
142
+ domains_ignore = @redis.read
143
+
144
+ if domains_ignore.include?(domain_topparts)
145
+ @uri.is_blacklisted = true
146
+ else
147
+ # If the domain wasn't found in the blacklist search with regex.
148
+ # For example: if you blacklist 'google' the domain 'google.com'
149
+ # will not be found by the parent if condition. So search also with regex.
150
+ @uri.is_blacklisted = domains_ignore.grep(Regexp.new(domain_topparts)).count > 0
151
+ end
152
+ end
153
+ end
154
+
155
+ def insert_url
156
+ # Check if a URL already exists.
157
+ @redis.write(['EXISTS', @uri.hash_id_key_name])
158
+ if @redis.read.to_b
159
+ # A URL already exists.
160
+ @redis.write(['GET', @uri.hash_id_key_name])
161
+ @uri.id = @redis.read
162
+
163
+ @redis.write(['HGETALL', @uri.key_name])
164
+ redis_uri = Hash[*@redis.read]
165
+ #pp redis_uri
166
+
167
+ @uri.is_ignored = redis_uri['is_ignored'].to_i.to_b
168
+ request_attempts = redis_uri['request_attempts'].to_i
169
+
170
+ puts "\t" + "request attempts: #{request_attempts}"
171
+
172
+ if @uri.is_ignored
173
+ @uri.is_ignored_reason = 'already ignored'
174
+ else
175
+ if request_attempts >= 3
176
+ # Ignore the URL if it has already X attempts.
177
+
178
+ @uri.is_ignored = true
179
+ @uri.is_ignored_reason = 'attempts >= 3'
180
+
181
+ # Ignore the URL.
182
+ @redis.write(['HMSET', @uri.key_name,
183
+ 'is_ignored', 1,
184
+ 'ignored_at', Time.now.strftime('%F %T %z'),
185
+ ])
186
+ @redis.read
187
+ end
188
+ end
189
+
190
+ # Increase the URL attempts, even if the URL will be ignored.
191
+ # @redis.write(['HINCRBY', @uri.key_name, 'request_attempts', 1])
192
+ # @redis.read
193
+ @redis.write(['HMSET', @uri.key_name,
194
+ 'request_attempts', request_attempts + 1,
195
+ 'request_attempt_last_at', Time.now.strftime('%F %T %z'),
196
+ ])
197
+ @redis.read
198
+ else
199
+ # New URL. Increase the URLs ID.
200
+ @redis.write(['INCR', 'urls:id'])
201
+ @uri.id = @redis.read
202
+
203
+ now_s = Time.now.strftime('%F %T %z')
204
+
205
+ # Insert the new URL.
206
+ @redis.write(['HMSET', @uri.key_name,
207
+ 'url', @uri.to_s,
208
+ 'hash', @uri.to_hash,
209
+ 'request_attempts', 1,
210
+ 'request_attempt_last_at', now_s,
211
+ 'parent_id', @options['parent_id'],
212
+ 'level', @options['level'],
213
+ 'is_blacklisted', @uri.is_blacklisted.to_i,
214
+ 'is_ignored', 0,
215
+ #'ignored_at', nil,
216
+ 'is_redirect', 0,
217
+ 'created_at', now_s,
218
+ ])
219
+ @redis.read
220
+
221
+ # Set the URL Hash to URL ID reference.
222
+ @redis.write(['SET', @uri.hash_id_key_name, @uri.id])
223
+ @redis.read
224
+ end
225
+ end
226
+
227
+ def insert_domain
228
+ # Add Domain to the indexed list.
229
+ @redis.write(['SADD', 'domains:indexed', @uri.domain_nowww])
230
+ @redis.read.to_b
231
+
232
+ # Check if a Domain already exists.
233
+ @redis.write(['EXISTS', @uri.domain_hash_id_key_name])
234
+ if @redis.read.to_b
235
+ # A Domain already exists.
236
+ @redis.write(['GET', @uri.domain_hash_id_key_name])
237
+ @uri.domain_id = @redis.read
238
+ else
239
+ # New Domain. Increase the Domains ID.
240
+ @redis.write(['INCR', 'domains:id'])
241
+ @uri.domain_id = @redis.read
242
+
243
+ # Insert the new Domain.
244
+ @redis.write(['HMSET', @uri.domain_key_name,
245
+ 'domain_nowww', @uri.domain_nowww,
246
+ 'domain_original', @uri.ruri.host,
247
+ 'hash_nowww', @uri.domain_nowww_hash,
248
+ 'hash_original', @uri.domain_original_hash,
249
+ 'created_at', Time.now.strftime('%F %T %z'),
250
+ ])
251
+ @redis.read
252
+
253
+ # Set the Domain Hash to Domain ID reference.
254
+ @redis.write(['SET', @uri.domain_hash_id_key_name, @uri.domain_id])
255
+ @redis.read
256
+ end
257
+
258
+ # Save the URLs per Domain.
259
+ @redis.write(['SADD', "domains:#{@uri.domain_id}:urls", @uri.id])
260
+ @redis.read
261
+ end
262
+
263
+ def insert_request
264
+ # Increase the Requests ID.
265
+ @redis.write(['INCR', 'requests:id'])
266
+ @uri.request_id = @redis.read
267
+
268
+ # Create a new Request.
269
+ @redis.write(['HMSET', @uri.request_key_name,
270
+ 'url_id', @uri.id,
271
+ 'user_agent', HTTP_USER_AGENT,
272
+ 'error', 0,
273
+ #'error_msg', nil,
274
+ 'size', 0,
275
+ 'created_at', Time.now.strftime('%F %T %z'),
276
+ ])
277
+ @redis.read
278
+
279
+ # Save the Requests per URL.
280
+ @redis.write(['SADD', "urls:#{@uri.id}:requests", @uri.request_id])
281
+ @redis.read
282
+ end
283
+
284
+ def make_http_request
285
+ # HTTP Request
286
+ http = Net::HTTP.new(@uri.ruri.host, @uri.ruri.port)
287
+ http.keep_alive_timeout = 0
288
+ http.open_timeout = 5
289
+ http.read_timeout = 5
290
+ http.ssl_timeout = 5
291
+ if @uri.ruri.scheme.to_s.downcase == 'https'
292
+ http.use_ssl = true
293
+ http.verify_mode = OpenSSL::SSL::VERIFY_NONE
294
+ end
295
+
296
+ # Send HTTP Request
297
+ @request = Net::HTTP::Get.new(@uri.ruri.request_uri)
298
+ @request['User-Agent'] = HTTP_USER_AGENT
299
+ @request['Referer'] = HTTP_REFERER
300
+ @request['Connection'] = 'close'
301
+ @request['Accept'] = 'text/html'
302
+ @request['Accept-Encoding'] = 'gzip;q=1.0,identity;q=0.6'
303
+ @request['Accept-Language'] = 'en,en-US;q=0.8'
304
+
305
+ string_io = StringIO.new
306
+ @request.exec(string_io, Net::HTTP::HTTPVersion, @request.path)
307
+ @redis.write(['HSET', @uri.request_key_name, 'size', string_io.string.length])
308
+ @redis.read
309
+
310
+ begin
311
+ puts "\t" + 'http request'
312
+ @response = http.request(@request)
313
+ puts "\t" + 'http request ok'
314
+ rescue Exception => e
315
+ puts "\t" + "ERROR: #{e.class} #{e}"
316
+
317
+ @response = nil
318
+
319
+ # Save the error and error message to the URL Request.
320
+ @redis.write(['HMSET', @uri.request_key_name,
321
+ 'error', 1,
322
+ 'error_msg', e.to_s,
323
+ ])
324
+ @redis.read
325
+
326
+ reenqueue
327
+ return
328
+ end
329
+
330
+ # Ignore the URL for further requests because it was successful.
331
+ @redis.write(['HMSET', @uri.key_name,
332
+ 'is_ignored', 1,
333
+ 'ignored_at', Time.now.strftime('%F %T %z'),
334
+ ])
335
+ @redis.read
336
+ end
337
+
338
+ def insert_response
339
+ # Increase the Responses ID.
340
+ @redis.write(['INCR', 'responses:id'])
341
+ @uri.response_id = @redis.read
342
+
343
+ # Add the Response ID to the URL.
344
+ @redis.write(['SADD', "urls:#{@uri.id}:responses", @uri.response_id])
345
+ @redis.read
346
+
347
+ # This is still too inaccurate.
348
+ response_size = @response.header.to_hash.map{ |k, v|
349
+ vs = ''
350
+ if v.is_a?(Array)
351
+ vs = v.join(' ')
352
+ else
353
+ vs = v
354
+ end
355
+ "#{k}: #{vs}"
356
+ }.join("\r\n").length + 4
357
+
358
+ response_size += @response.body.length
359
+
360
+ @uri.response_size = response_size
361
+ @uri.response_content_type = @response['Content-Type']
362
+
363
+ # Insert the new Response.
364
+ @redis.write(['HMSET', @uri.response_key_name,
365
+ 'code', @response.code.to_i,
366
+ 'content_type', @uri.response_content_type,
367
+ 'request_id', @uri.request_id,
368
+ 'size', @uri.response_size,
369
+ 'created_at', Time.now.strftime('%F %T %z'),
370
+ ])
371
+ @redis.read
372
+
373
+ # Add the Response to the Response Code.
374
+ @redis.write(['SADD', "responses:code:#{@response.code}", @uri.response_id])
375
+ @redis.read
376
+ end
377
+
378
+ def process_http_response
379
+ body = ''
380
+ if !@response['Content-Encoding'].nil? && @response['Content-Encoding'].downcase == 'gzip'
381
+ body = Zlib::GzipReader.new(StringIO.new(@response.body)).read
382
+ else
383
+ body = @response.body
384
+ end
385
+
386
+ code = @response.code.to_i
387
+ puts "\t" + "http response code: #{code}"
388
+
389
+ if code == 200
390
+ if @uri.response_content_type[0..8] == 'text/html'
391
+ @html_doc = Nokogiri::HTML(body)
392
+ @html_doc.remove_namespaces!
393
+ else
394
+ # Ignore the URL if the response content type isn't HTML.
395
+ @uri.is_ignored = true
396
+ @uri.is_ignored_reason = "wrong content type: #{@uri.response_content_type}"
397
+ end
398
+ elsif code >= 301 && code <= 399
399
+ @redis.write(['HSET', @uri.key_name, 'is_redirect', 1])
400
+ @redis.read
401
+
402
+ if !@response['Location'].nil?
403
+ # Follow the URL.
404
+ new_uri = Uri.new(@response['Location'])
405
+
406
+ enqueue(new_uri)
407
+ end
408
+ else
409
+ @uri.is_ignored = true
410
+ @uri.is_ignored_reason = "wrong code: #{code}"
411
+ end
412
+
413
+ if @uri.is_ignored
414
+ @redis.write(['HSET', @uri.key_name, 'is_ignored', 1])
415
+ @redis.read
416
+ end
417
+ end
418
+
419
+ def process_html_links
420
+ # Process all <a> tags found on the response page.
421
+ @html_doc
422
+ .xpath('//a')
423
+ .map{ |link|
424
+
425
+ href = link['href']
426
+ #puts "link #{href}"
427
+
428
+ if !href.nil?
429
+ #begin
430
+ Uri.new(href)
431
+ # rescue Exception => e
432
+ # nil
433
+ # end
434
+ end
435
+ }
436
+ .select{ |link|
437
+ !link.nil? && link.is_valid?
438
+ }
439
+ .sort{ |uri_a, uri_b|
440
+ uri_a.weight(@uri) <=> uri_b.weight(@uri)
441
+ }
442
+ .each_with_index{ |new_uri, index|
443
+ #puts "index #{index} #{new_uri} #{new_uri.is_relative?(@uri)}"
444
+ enqueue(new_uri, index)
445
+ }
446
+ end
447
+
448
+ def process_html_meta
449
+ # Process all <meta> tags found on the response page.
450
+
451
+ @html_doc.xpath('//meta').each do |meta|
452
+ meta_name = meta['name']
453
+ if !meta_name.nil?
454
+ meta_name = meta_name.downcase
455
+
456
+ if meta_name.downcase == 'generator'
457
+ process_html_meta_generator(meta)
458
+ end
459
+ end
460
+ end
461
+ end
462
+
463
+ def process_html_meta_generator(meta)
464
+ # Process all generator <meta> tags.
465
+
466
+ generator = meta['content']
467
+ generator_hash = Digest::SHA256.hexdigest(generator)
468
+
469
+ generator_id = nil
470
+ generator_hash_id_key_name = "generators:id:#{generator_hash}"
471
+ generator_key_name = nil
472
+
473
+ @redis.write(['EXISTS', generator_hash_id_key_name])
474
+ if @redis.read.to_b
475
+ # Found existing generator.
476
+
477
+ @redis.write(['GET', generator_hash_id_key_name])
478
+ generator_id = @redis.read
479
+
480
+ generator_key_name = "generators:#{generator_id}"
481
+ else
482
+ # New generator. Increase the Generators ID.
483
+ @redis.write(['INCR', 'generators:id'])
484
+ generator_id = @redis.read
485
+
486
+ generator_key_name = "generators:#{generator_id}"
487
+ @redis.write(['HMSET', generator_key_name,
488
+ 'name', generator,
489
+ 'hash', generator_hash,
490
+ 'first_url_id', @uri.id,
491
+ #'last_used_at', Time.now.strftime('%F %T %z'),
492
+ 'created_at', Time.now.strftime('%F %T %z'),
493
+ ])
494
+ @redis.read
495
+
496
+ # Set the Generator Hash to Generator ID reference.
497
+ @redis.write(['SET', generator_hash_id_key_name, generator_id])
498
+ @redis.read
499
+ end
500
+
501
+ # Always overwrite the last used timestamp.
502
+ @redis.write(['HSET', generator_key_name, 'last_used_at', Time.now.strftime('%F %T %z')])
503
+ @redis.read
504
+
505
+ # Add the URL to the Generator.
506
+ @redis.write(['SADD', "generators:#{generator_id}:urls", @uri.id])
507
+ @redis.read
508
+
509
+ # Add the Generator to the URL.
510
+ @redis.write(['SADD', "urls:#{@uri.id}:generators", generator_id])
511
+ @redis.read
512
+ end
513
+
514
+ def enqueue(new_uri, index = 0, debug = false)
515
+ if !@options['relative'] || new_uri.is_relative?(@uri)
516
+ new_uri = @uri.join(new_uri)
517
+
518
+ if new_uri.is_valid?
519
+ new_uri_s = new_uri.to_s
520
+
521
+ queued_time = (@url_delay + (@url_separate_delay * index)).seconds.from_now
522
+
523
+ if @options['serial']
524
+
525
+ # Check it another process is currently using 'urls:schedule:last'.
526
+ @redis.write(['GET', 'urls:schedule:lock'])
527
+ lock = @redis.read.to_i.to_b
528
+ while lock
529
+ @redis.write(['GET', 'urls:schedule:lock'])
530
+ lock = @redis.read.to_i.to_b
531
+ sleep 0.1
532
+ end
533
+
534
+ # Lock 'urls:schedule:last' for other processes.
535
+ @redis.write(['INCR', 'urls:schedule:lock'])
536
+ @redis.read
537
+
538
+ @redis.write(['GET', 'urls:schedule:last'])
539
+ queued_time = @redis.read
540
+
541
+ if queued_time.nil?
542
+ queued_time = Time.now
543
+ else
544
+ queued_time = Time.parse(queued_time)
545
+ if queued_time < Time.now
546
+ queued_time = Time.now
547
+ end
548
+ end
549
+ queued_time += @url_delay
550
+
551
+ @redis.write(['SET', 'urls:schedule:last', queued_time.strftime('%F %T %z')])
552
+ @redis.read
553
+
554
+ # Unlock 'urls:schedule:last' for other processes.
555
+ @redis.write(['DECR', 'urls:schedule:lock'])
556
+ @redis.read
557
+ end
558
+
559
+ puts "\t" + "enqueue #{@options['level']} #{index} #{queued_time} #{new_uri_s}"
560
+
561
+ if !debug
562
+ options = {
563
+ 'serial' => @options['serial'],
564
+ 'relative' => @options['relative'],
565
+ 'parent_id' => @uri.id,
566
+ 'level' => @options['level'] + 1,
567
+ }
568
+ Resque.enqueue_at(queued_time, TheFox::Sengi::CrawlerWorker, new_uri_s, options)
569
+ end
570
+ end
571
+ end
572
+ end
573
+
574
+ def reenqueue
575
+ queued_time = @url_reschedule.seconds.from_now
576
+
577
+ puts "\t" + "re-enqueue #{queued_time}"
578
+
579
+ options = {
580
+ 'serial' => @options['serial'],
581
+ 'relative' => @options['relative'],
582
+ }
583
+ Resque.enqueue_at(queued_time, TheFox::Sengi::CrawlerWorker, @uri.to_s, options)
584
+ end
585
+
586
+ end
587
+
588
+ end
589
+ end