sengi 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,129 @@
1
#!/usr/bin/env ruby
# coding: UTF-8
#
# Inspect the Sengi Redis database (127.0.0.1:7000, db 1).
#
# Sub-commands:
#   urls             Print all known URLs.
#   domains          Print all known domains (without 'www.').
#   domains ignore   Print the domain blacklist.
#   generators       Print all detected page generators.
#   stats            Print ID counters and traffic totals.

require 'optparse'
# Require uri and hiredis explicitly: this script uses URI() and
# Hiredis::Connection directly instead of relying on 'sengi' to load them.
require 'uri'
require 'hiredis'
require 'active_support'
require 'active_support/core_ext/numeric/conversions'
require 'sengi'


@options = {
}
opts = OptionParser.new do |o|
  o.banner = 'Usage: list'
  o.separator('')

  o.on_tail('-h', '--help', 'Show this message.') do
    puts o
    puts
    exit 3
  end
end
ARGV << '-h' if ARGV.count == 0
commands = opts.parse(ARGV)

# Connect to the local Redis instance used by Sengi.
@redis = Hiredis::Connection.new
@redis.connect('127.0.0.1', 7000)
@redis.write(['SELECT', 1])
@redis.read

command = commands.shift

if command == 'urls'
  # IDs are allocated with INCR, so iterate 1..urls:id and skip gaps
  # (HGETALL on a missing key returns an empty array).
  @redis.write(['GET', 'urls:id'])
  urls_id = @redis.read.to_i

  (1..urls_id).each do |url_id|
    @redis.write(['HGETALL', "urls:#{url_id}"])
    raw_url = @redis.read
    if raw_url.length > 0
      url = Hash[*raw_url]
      uri = URI(url['url'])
      puts '%s' % [uri.to_s]
    end
  end
elsif command == 'domains'
  subcommand = commands.shift
  if subcommand == 'ignore'
    # Print the domain blacklist set.
    @redis.write(['SMEMBERS', 'domains:ignore'])
    puts @redis.read
  else
    @redis.write(['GET', 'domains:id'])
    domains_id = @redis.read.to_i

    (1..domains_id).each do |domain_id|
      @redis.write(['HGETALL', "domains:#{domain_id}"])
      raw_domain = @redis.read
      if raw_domain.length > 0
        domain = Hash[*raw_domain]
        puts domain['domain_nowww']
      end
    end
  end
elsif command == 'generators'
  @redis.write(['GET', 'generators:id'])
  generators_id = @redis.read.to_i

  (1..generators_id).each do |generator_id|
    @redis.write(['HGETALL', "generators:#{generator_id}"])
    raw_generator = @redis.read
    if raw_generator.length > 0
      generator = Hash[*raw_generator]
      puts generator['name']
    end
  end
elsif command == 'stats'
  traffic_out = 0
  traffic_in = 0

  @redis.write(['GET', 'urls:id'])
  urls_id = @redis.read.to_i

  @redis.write(['GET', 'requests:id'])
  requests_id = @redis.read.to_i

  @redis.write(['GET', 'responses:id'])
  responses_id = @redis.read.to_i

  # Sum the stored request/response sizes over all URLs.
  (1..urls_id).each do |url_id|
    @redis.write(['HGETALL', "urls:#{url_id}"])
    raw_url = @redis.read
    if raw_url.length > 0
      url = Hash[*raw_url]

      @redis.write(['SMEMBERS', "urls:#{url_id}:requests"])
      request_ids = @redis.read.map{ |rid| rid.to_i }
      request_ids.each do |request_id|
        @redis.write(['HGETALL', "requests:#{request_id}"])
        raw_request = @redis.read
        if raw_request.length > 0
          request = Hash[*raw_request]
          traffic_out += request['size'].to_i
        end
      end

      @redis.write(['SMEMBERS', "urls:#{url_id}:responses"])
      response_ids = @redis.read.map{ |rid| rid.to_i }
      response_ids.each do |response_id|
        @redis.write(['HGETALL', "responses:#{response_id}"])
        raw_response = @redis.read
        if raw_response.length > 0
          response = Hash[*raw_response]
          traffic_in += response['size'].to_i
        end
      end
    end
  end

  puts "urls id: %6d" % [urls_id]
  puts "requests id: %6d" % [requests_id]
  puts "responses id: %6d" % [responses_id]
  puts

  puts 'traffic'
  puts 'bytes out: %9d (%s)' % [traffic_out, traffic_out.to_s(:human_size, precision: 2)]
  puts 'bytes in: %9d (%s)' % [traffic_in, traffic_in.to_s(:human_size, precision: 2)]
end
@@ -0,0 +1,11 @@
1
#!/usr/bin/env bash
# Start the Redis server used by Sengi with the project configuration.

set -e
# Quote "$0" so paths containing spaces do not word-split.
SCRIPT_BASEDIR=$(dirname "$0")
SCRIPT_BASENAME=$(basename "$0")


cd "${SCRIPT_BASEDIR}/.."

# Abort early (via set -e) if redis-server is not installed.
which -a redis-server &> /dev/null
redis-server config/redis.conf
@@ -0,0 +1,13 @@
1
#!/usr/bin/env bash
# Poll the Sengi Redis instance every 10 seconds and print its memory usage.

set -e
# Quote "$0" so paths containing spaces do not word-split.
SCRIPT_BASEDIR=$(dirname "$0")
SCRIPT_BASENAME=$(basename "$0")


cd "${SCRIPT_BASEDIR}/../tmp"

while true; do
	redis-cli -p 7000 info | grep used_memory_human
	sleep 10
done
@@ -0,0 +1,10 @@
1
#!/usr/bin/env bash
# Stop the Redis server started by the start script (PID file in run/).

set -e
# Quote "$0" so paths containing spaces do not word-split.
SCRIPT_BASEDIR=$(dirname "$0")
SCRIPT_BASENAME=$(basename "$0")


cd "${SCRIPT_BASEDIR}/.."

kill "$(cat run/redis.pid)"
@@ -0,0 +1,14 @@
1
#!/usr/bin/env bash
# Restart one Resque crawler worker.
# First argument: worker id (default 0).

set -e
DATE=$(date +"%Y-%m-%d %H:%M:%S %z")
# Quote "$0" so paths containing spaces do not word-split.
SCRIPT_BASEDIR=$(dirname "$0")
SCRIPT_BASENAME=$(basename "$0")

id=${1:-0}


cd "${SCRIPT_BASEDIR}"

./resque_crawler_stop "${id}"
./resque_crawler_start "${id}"
@@ -0,0 +1,21 @@
1
#!/usr/bin/env bash
# Start one Resque crawler worker in the background.
# First argument: worker id (default 0), used in the PID and log file names.

set -e
DATE=$(date +"%Y-%m-%d %H:%M:%S %z")
# Quote "$0" so paths containing spaces do not word-split.
SCRIPT_BASEDIR=$(dirname "$0")
SCRIPT_BASENAME=$(basename "$0")

id=${1:-0}


cd "${SCRIPT_BASEDIR}/.."

# QUEUE/PIDFILE/INTERVAL are read by the resque:work rake task.
export QUEUE=crawler
export PIDFILE=./run/resque_${QUEUE}_${id}.pid
export INTERVAL=1
#export COUNT=1
#export BACKGROUND=yes
LOG=tmp/resque_${QUEUE}_${id}.log

echo "${DATE} start" >> "${LOG}"
# Append stdout and stderr to the same stream instead of opening the
# log file twice (1>> f 2>> f).
rake resque:work --trace >> "${LOG}" 2>&1 < /dev/null &
@@ -0,0 +1,20 @@
1
#!/usr/bin/env bash
# Stop one Resque crawler worker gracefully (QUIT) and remove its PID file.
# First argument: worker id (default 0).

set -e
DATE=$(date +"%Y-%m-%d %H:%M:%S %z")
# Quote "$0" so paths containing spaces do not word-split.
SCRIPT_BASEDIR=$(dirname "$0")
SCRIPT_BASENAME=$(basename "$0")

id=${1:-0}


cd "${SCRIPT_BASEDIR}/.."

export QUEUE=crawler
export PIDFILE=./run/resque_${QUEUE}_${id}.pid
LOG=tmp/resque_${QUEUE}_${id}.log

pid=$(cat "${PIDFILE}")
kill -QUIT "${pid}"
rm "${PIDFILE}"
echo "${DATE} process ended: ${pid}" >> "${LOG}"
@@ -0,0 +1,15 @@
1
#!/usr/bin/env bash
# Start the resque-scheduler in the background.

set -e
DATE=$(date +"%Y-%m-%d %H:%M:%S %z")
# Quote "$0" so paths containing spaces do not word-split.
SCRIPT_BASEDIR=$(dirname "$0")
SCRIPT_BASENAME=$(basename "$0")
export PIDFILE=./run/queue_scheduler.pid
export RESQUE_SCHEDULER_INTERVAL=1
LOG=tmp/resque_scheduler.log


cd "${SCRIPT_BASEDIR}/.."

echo "${DATE} start" >> "${LOG}"
# Append instead of '&>', which truncated the log and clobbered the
# "start" line written just above.
rake resque:scheduler --trace >> "${LOG}" 2>&1 < /dev/null &
@@ -0,0 +1,16 @@
1
#!/usr/bin/env bash
# Stop the resque-scheduler gracefully (QUIT).
# NOTE(review): unlike resque_crawler_stop this does not rm the PID file;
# presumably the scheduler removes it itself on shutdown — verify.

set -e
DATE=$(date +"%Y-%m-%d %H:%M:%S %z")
# Quote "$0" so paths containing spaces do not word-split.
SCRIPT_BASEDIR=$(dirname "$0")
SCRIPT_BASENAME=$(basename "$0")
export PIDFILE=./run/queue_scheduler.pid
LOG=tmp/resque_scheduler.log


cd "${SCRIPT_BASEDIR}/.."

pid=$(cat "${PIDFILE}")
kill -QUIT "${pid}"
sleep 1
echo "${DATE} process exit: ${pid}" >> "${LOG}"
@@ -0,0 +1,13 @@
1
#!/usr/bin/env bash
# Start the resque-web UI on 127.0.0.1:8282 in the background.

set -e
DATE=$(date +"%Y-%m-%d %H:%M:%S %z")
# Quote "$0" so paths containing spaces do not word-split.
SCRIPT_BASEDIR=$(dirname "$0")
SCRIPT_BASENAME=$(basename "$0")
PIDFILE=./run/resque_server.pid
LOG=tmp/resque_server.log


cd "${SCRIPT_BASEDIR}/.."

resque-web -F -L --host 127.0.0.1 --port 8282 --pid-file "${PIDFILE}" config/resque_server_config.rb &> "${LOG}" < /dev/null &
@@ -0,0 +1,13 @@
1
#!/usr/bin/env bash
# Stop the resque-web UI and remove its PID file.

set -x
set -e
DATE=$(date +"%Y-%m-%d %H:%M:%S %z")
# Quote "$0" so paths containing spaces do not word-split.
SCRIPT_BASEDIR=$(dirname "$0")
SCRIPT_BASENAME=$(basename "$0")
PIDFILE=./run/resque_server.pid


cd "${SCRIPT_BASEDIR}/.."

kill "$(cat "${PIDFILE}")" && rm "${PIDFILE}"
@@ -0,0 +1,120 @@
1
+
2
+ daemonize yes
3
+
4
+ pidfile run/redis.pid
5
+
6
+ bind 127.0.0.1
7
+ port 7000
8
+
9
+ tcp-backlog 511
10
+
11
+ timeout 0
12
+
13
+ tcp-keepalive 0
14
+
15
+ #debug
16
+ #verbose
17
+ #notice
18
+ loglevel verbose
19
+
20
+ #logfile ""
21
+ logfile tmp/redis.log
22
+
23
+ syslog-enabled no
24
+ # syslog-ident redis
25
+ # syslog-facility local0
26
+
27
+ databases 2
28
+
29
+ save 900 1
30
+ save 300 10
31
+ save 60 10000
32
+
33
+ stop-writes-on-bgsave-error no
34
+
35
+ rdbcompression yes
36
+
37
+ rdbchecksum no
38
+
39
+ dbfilename redis_dump.rdb
40
+ #dir /var/lib/redis
41
+
42
+ #slave-read-only yes
43
+
44
+ repl-diskless-sync no
45
+ repl-diskless-sync-delay 5
46
+ repl-ping-slave-period 10
47
+ repl-timeout 60
48
+
49
+ repl-disable-tcp-nodelay no
50
+ # repl-backlog-size 1mb
51
+ # repl-backlog-ttl 3600
52
+
53
+ #slave-priority 100
54
+
55
+ # min-slaves-to-write 3
56
+ # min-slaves-max-lag 10
57
+
58
+ # requirepass foobared
59
+
60
+ # maxclients 10000
61
+
62
+ maxmemory 1gb
63
+
64
+ # maxmemory-policy noeviction
65
+
66
+ # maxmemory-samples 5
67
+
68
+ appendonly no
69
+ appendfilename "appendonly.aof"
70
+
71
+ # appendfsync always
72
+ appendfsync everysec
73
+ # appendfsync no
74
+
75
+ #no-appendfsync-on-rewrite no
76
+
77
+ auto-aof-rewrite-percentage 100
78
+ auto-aof-rewrite-min-size 64mb
79
+
80
+ aof-load-truncated yes
81
+
82
+ #lua-time-limit 5000
83
+
84
+ #cluster-enabled no
85
+
86
+ #cluster-config-file nodes_10100.conf
87
+ #cluster-node-timeout 15000
88
+ # cluster-slave-validity-factor 10
89
+ # cluster-migration-barrier 1
90
+ #cluster-require-full-coverage yes
91
+
92
+ #slowlog-log-slower-than 10000
93
+ #slowlog-max-len 128
94
+
95
+ #latency-monitor-threshold 0
96
+
97
+ notify-keyspace-events ""
98
+
99
+ hash-max-ziplist-entries 512
100
+ hash-max-ziplist-value 64
101
+
102
+ list-max-ziplist-entries 512
103
+ list-max-ziplist-value 64
104
+
105
+ set-max-intset-entries 512
106
+
107
+ zset-max-ziplist-entries 128
108
+ zset-max-ziplist-value 64
109
+
110
+ hll-sparse-max-bytes 3000
111
+
112
+ #activerehashing yes
113
+
114
+ #client-output-buffer-limit normal 0 0 0
115
+ #client-output-buffer-limit slave 256mb 64mb 60
116
+ #client-output-buffer-limit pubsub 32mb 8mb 60
117
+
118
+ hz 10
119
+
120
+ aof-rewrite-incremental-fsync yes
@@ -0,0 +1,6 @@
1
+
2
# Configuration file passed to resque-web: loads the scheduler UI
# extension and points Resque at the local Redis (port 7000, db 0).
require 'resque'
require 'resque-scheduler'
require 'resque/scheduler/server'

Resque.redis = '127.0.0.1:7000:0'
@@ -0,0 +1,5 @@
1
+
2
+ require 'sengi/version'
3
+ require 'sengi/crawler_worker'
4
+ require 'sengi/crawler'
5
+ require 'sengi/uri'
@@ -0,0 +1,589 @@
1
+
2
+ require 'uri'
3
+ require 'net/http'
4
+ require 'hiredis'
5
+ require 'resque'
6
+ require 'resque-scheduler'
7
+ require 'nokogiri'
8
+ require 'time'
9
+ require 'digest'
10
+ require 'openssl'
11
+ require 'zlib'
12
+ require 'active_support/time'
13
+
14
+ require 'thefox-ext'
15
+
16
module TheFox
  module Sengi

    # Crawls a single URL and records everything about it in Redis (db 1
    # on port 7000): the URL itself, its domain, the HTTP request and
    # response, any <meta name="generator"> tags, and all outgoing <a>
    # links, which are scheduled for crawling via Resque.
    class Crawler

      # url     - String, the URL to crawl.
      # options - Hash with String keys. Recognized keys (all optional):
      #           'serial', 'relative', 'force', 'debug' (booleans),
      #           'parent_id', 'level' (integers).
      def initialize(url, options)
        @url = url
        @options = options

        # Default any missing option keys.
        @options['serial']   = false if !@options.has_key?('serial')
        @options['relative'] = false if !@options.has_key?('relative')
        @options['force']    = false if !@options.has_key?('force')
        @options['debug']    = false if !@options.has_key?('debug')

        @options['parent_id'] = 0 if !@options.has_key?('parent_id')
        @options['level']     = 0 if !@options.has_key?('level')

        @redis = nil
        @uri = nil
        @request = nil
        @response = nil
        @html_doc = nil
        @url_delay = nil
        @url_separate_delay = nil
        @url_reschedule = nil
      end

      # Entry point: runs one full crawl cycle for the URL. Each stage
      # may end the cycle early (blacklisted, ignored, request failed,
      # non-HTML response).
      def go
        redis_setup

        uri_setup
        puts "#{Time.now.strftime('%F %T')} perform: #{@options['parent_id']} #{@options['level']} #{@options['relative'] ? 'y' : 'n'} #{@uri}"

        check_blacklist
        puts "\t" + "blacklisted: #{@uri.is_blacklisted ? 'YES' : 'no'}"
        return if @uri.is_blacklisted

        insert_url
        puts "\t" + "url: #{@uri.id}"
        if @uri.is_ignored && !@options['debug'] && !@options['force']
          puts "\t" + "ignored reason: #{@uri.is_ignored_reason}"
          return
        end

        insert_domain
        puts "\t" + "domain id: #{@uri.domain_id}"

        insert_request
        puts "\t" + "request id: #{@uri.request_id}"

        make_http_request
        puts "\t" + "http response: #{@response.nil? ? 'FAILED' : 'ok'}"
        return if @response.nil?

        insert_response
        puts "\t" + "response: #{@uri.response_id} #{@uri.response_size}"

        puts "\t" + 'process http response'
        process_http_response
        puts "\t" + "http response"
        if @uri.is_ignored && !@options['force']
          puts "\t" + "ignored reason: #{@uri.is_ignored_reason}"
          return
        end
        if @html_doc.nil?
          puts "\t" + 'HTML INVALID'
          return
        end

        puts "\t" + 'process html links'
        process_html_links

        puts "\t" + 'process html meta'
        process_html_meta

        puts "\t" + 'url done'
      end

      private

      # Connects to Redis (lazily) and loads the scheduling delays.
      def redis_setup
        if @redis.nil?
          @redis = Hiredis::Connection.new
          @redis.connect('127.0.0.1', 7000)
          @redis.write(['SELECT', 1])
          @redis.read
        end

        # GET returns nil for a missing key, so check for nil BEFORE
        # converting to Integer: nil.to_i is 0, which made the original
        # nil-checks dead code and the URL_* default constants unused.
        @redis.write(['GET', 'urls:delay'])
        raw_delay = @redis.read
        @url_delay = raw_delay.nil? ? URL_DELAY : raw_delay.to_i

        @redis.write(['GET', 'urls:separatedelay'])
        raw_separate_delay = @redis.read
        @url_separate_delay = raw_separate_delay.nil? ? URL_SEPARATE_DELAY : raw_separate_delay.to_i

        @redis.write(['GET', 'urls:reschedule'])
        raw_reschedule = @redis.read
        @url_reschedule = raw_reschedule.nil? ? URL_RESCHEDULE : raw_reschedule.to_i
      end

      # Wraps the raw URL in the project's Uri class and normalizes @url.
      def uri_setup
        @uri = Uri.new(@url)
        @url = @uri.to_s
      end

      # Checks whether the URL's domain (second- + top-level part) is in
      # the 'domains:ignore' blacklist and sets @uri.is_blacklisted.
      def check_blacklist
        host = @uri.ruri.host
        return if host.nil?

        # 'www.facebook.com' -> ['www', 'facebook', 'com'] -> 'facebook.com'.
        # Array#last(2) copes with single-label hosts such as 'localhost':
        # the original [-2..-1] slice returned nil there and crashed on
        # the following join.
        domain_topparts = host.split('.').last(2).join('.')

        # Read the domains blacklist.
        @redis.write(['SMEMBERS', 'domains:ignore'])
        domains_ignore = @redis.read

        if domains_ignore.include?(domain_topparts)
          @uri.is_blacklisted = true
        else
          # Fall back to a regex search so partial blacklist entries are
          # found as well.
          # NOTE(review): the dot in domain_topparts is an unescaped
          # regex wildcard here, and grep matches the domain pattern
          # against each list entry (not the entry against the domain);
          # confirm this direction is intended before tightening it.
          @uri.is_blacklisted = domains_ignore.grep(Regexp.new(domain_topparts)).count > 0
        end
      end

      # Creates or updates the Redis hash for the current URL, loads its
      # ignore state, and ignores it once it reaches 3 request attempts.
      def insert_url
        # Check if the URL already exists.
        @redis.write(['EXISTS', @uri.hash_id_key_name])
        if @redis.read.to_b
          # The URL already exists: load its id and current state.
          @redis.write(['GET', @uri.hash_id_key_name])
          @uri.id = @redis.read

          @redis.write(['HGETALL', @uri.key_name])
          redis_uri = Hash[*@redis.read]

          @uri.is_ignored = redis_uri['is_ignored'].to_i.to_b
          request_attempts = redis_uri['request_attempts'].to_i

          puts "\t" + "request attempts: #{request_attempts}"

          if @uri.is_ignored
            @uri.is_ignored_reason = 'already ignored'
          elsif request_attempts >= 3
            # Ignore the URL once it has accumulated 3 attempts.
            @uri.is_ignored = true
            @uri.is_ignored_reason = 'attempts >= 3'

            @redis.write(['HMSET', @uri.key_name,
              'is_ignored', 1,
              'ignored_at', Time.now.strftime('%F %T %z'),
            ])
            @redis.read
          end

          # Increase the URL attempts, even if the URL will be ignored.
          @redis.write(['HMSET', @uri.key_name,
            'request_attempts', request_attempts + 1,
            'request_attempt_last_at', Time.now.strftime('%F %T %z'),
          ])
          @redis.read
        else
          # New URL: allocate the next URL id.
          @redis.write(['INCR', 'urls:id'])
          @uri.id = @redis.read

          now_s = Time.now.strftime('%F %T %z')

          # Insert the new URL.
          @redis.write(['HMSET', @uri.key_name,
            'url', @uri.to_s,
            'hash', @uri.to_hash,
            'request_attempts', 1,
            'request_attempt_last_at', now_s,
            'parent_id', @options['parent_id'],
            'level', @options['level'],
            'is_blacklisted', @uri.is_blacklisted.to_i,
            'is_ignored', 0,
            'is_redirect', 0,
            'created_at', now_s,
          ])
          @redis.read

          # URL hash -> URL id reference.
          @redis.write(['SET', @uri.hash_id_key_name, @uri.id])
          @redis.read
        end
      end

      # Creates the domain entry if it is new and links the URL to it.
      def insert_domain
        # Add the domain to the indexed set.
        @redis.write(['SADD', 'domains:indexed', @uri.domain_nowww])
        @redis.read.to_b

        # Check if the domain already exists.
        @redis.write(['EXISTS', @uri.domain_hash_id_key_name])
        if @redis.read.to_b
          # The domain already exists: load its id.
          @redis.write(['GET', @uri.domain_hash_id_key_name])
          @uri.domain_id = @redis.read
        else
          # New domain: allocate the next domain id.
          @redis.write(['INCR', 'domains:id'])
          @uri.domain_id = @redis.read

          # Insert the new domain.
          @redis.write(['HMSET', @uri.domain_key_name,
            'domain_nowww', @uri.domain_nowww,
            'domain_original', @uri.ruri.host,
            'hash_nowww', @uri.domain_nowww_hash,
            'hash_original', @uri.domain_original_hash,
            'created_at', Time.now.strftime('%F %T %z'),
          ])
          @redis.read

          # Domain hash -> domain id reference.
          @redis.write(['SET', @uri.domain_hash_id_key_name, @uri.domain_id])
          @redis.read
        end

        # Track the URLs per domain.
        @redis.write(['SADD', "domains:#{@uri.domain_id}:urls", @uri.id])
        @redis.read
      end

      # Allocates and stores a new Request entry for this crawl attempt.
      def insert_request
        @redis.write(['INCR', 'requests:id'])
        @uri.request_id = @redis.read

        @redis.write(['HMSET', @uri.request_key_name,
          'url_id', @uri.id,
          'user_agent', HTTP_USER_AGENT,
          'error', 0,
          'size', 0,
          'created_at', Time.now.strftime('%F %T %z'),
        ])
        @redis.read

        # Track the Requests per URL.
        @redis.write(['SADD', "urls:#{@uri.id}:requests", @uri.request_id])
        @redis.read
      end

      # Sends the HTTP GET request. On failure the error is recorded on
      # the Request hash and the URL is re-enqueued; on success the URL
      # is marked as ignored so it is not fetched again.
      def make_http_request
        http = Net::HTTP.new(@uri.ruri.host, @uri.ruri.port)
        http.keep_alive_timeout = 0
        http.open_timeout = 5
        http.read_timeout = 5
        http.ssl_timeout = 5
        if @uri.ruri.scheme.to_s.downcase == 'https'
          http.use_ssl = true
          # Certificate verification is deliberately disabled for crawling.
          http.verify_mode = OpenSSL::SSL::VERIFY_NONE
        end

        @request = Net::HTTP::Get.new(@uri.ruri.request_uri)
        @request['User-Agent'] = HTTP_USER_AGENT
        @request['Referer'] = HTTP_REFERER
        @request['Connection'] = 'close'
        @request['Accept'] = 'text/html'
        @request['Accept-Encoding'] = 'gzip;q=1.0,identity;q=0.6'
        @request['Accept-Language'] = 'en,en-US;q=0.8'

        # Serialize the request once to record its approximate wire size.
        string_io = StringIO.new
        @request.exec(string_io, Net::HTTP::HTTPVersion, @request.path)
        @redis.write(['HSET', @uri.request_key_name, 'size', string_io.string.length])
        @redis.read

        begin
          puts "\t" + 'http request'
          @response = http.request(@request)
          puts "\t" + 'http request ok'
        rescue StandardError => e
          # StandardError (not Exception) so SignalException/SystemExit
          # are not swallowed; all Net/Timeout/Socket errors are still
          # StandardError subclasses.
          puts "\t" + "ERROR: #{e.class} #{e}"

          @response = nil

          # Save the error and error message to the URL Request.
          @redis.write(['HMSET', @uri.request_key_name,
            'error', 1,
            'error_msg', e.to_s,
          ])
          @redis.read

          reenqueue
          return
        end

        # Ignore the URL for further requests because it was successful.
        @redis.write(['HMSET', @uri.key_name,
          'is_ignored', 1,
          'ignored_at', Time.now.strftime('%F %T %z'),
        ])
        @redis.read
      end

      # Allocates and stores a new Response entry with an approximate
      # on-wire size (headers + body).
      def insert_response
        @redis.write(['INCR', 'responses:id'])
        @uri.response_id = @redis.read

        # Add the Response id to the URL.
        @redis.write(['SADD', "urls:#{@uri.id}:responses", @uri.response_id])
        @redis.read

        # Approximate the header size. This is still too inaccurate
        # (e.g. the status line is not counted).
        response_size = @response.header.to_hash.map{ |k, v|
          vs = v.is_a?(Array) ? v.join(' ') : v
          "#{k}: #{vs}"
        }.join("\r\n").length + 4

        response_size += @response.body.length

        @uri.response_size = response_size
        @uri.response_content_type = @response['Content-Type']

        @redis.write(['HMSET', @uri.response_key_name,
          'code', @response.code.to_i,
          'content_type', @uri.response_content_type,
          'request_id', @uri.request_id,
          'size', @uri.response_size,
          'created_at', Time.now.strftime('%F %T %z'),
        ])
        @redis.read

        # Group the Response under its HTTP status code.
        @redis.write(['SADD', "responses:code:#{@response.code}", @uri.response_id])
        @redis.read
      end

      # Decides what to do with the response: parse HTML on 200, follow
      # redirects on 3xx, otherwise ignore the URL.
      def process_http_response
        # Transparently decompress gzip bodies.
        body = ''
        if !@response['Content-Encoding'].nil? && @response['Content-Encoding'].downcase == 'gzip'
          body = Zlib::GzipReader.new(StringIO.new(@response.body)).read
        else
          body = @response.body
        end

        code = @response.code.to_i
        puts "\t" + "http response code: #{code}"

        if code == 200
          # .to_s guards against a missing Content-Type header; the
          # original raised NoMethodError on nil[0..8] here.
          if @uri.response_content_type.to_s[0..8] == 'text/html'
            @html_doc = Nokogiri::HTML(body)
            @html_doc.remove_namespaces!
          else
            # Ignore the URL if the response content type isn't HTML.
            @uri.is_ignored = true
            @uri.is_ignored_reason = "wrong content type: #{@uri.response_content_type}"
          end
        elsif code >= 301 && code <= 399
          @redis.write(['HSET', @uri.key_name, 'is_redirect', 1])
          @redis.read

          if !@response['Location'].nil?
            # Follow the redirect target.
            new_uri = Uri.new(@response['Location'])
            enqueue(new_uri)
          end
        else
          @uri.is_ignored = true
          @uri.is_ignored_reason = "wrong code: #{code}"
        end

        if @uri.is_ignored
          @redis.write(['HSET', @uri.key_name, 'is_ignored', 1])
          @redis.read
        end
      end

      # Extracts all <a href> targets, keeps the valid ones, sorts them
      # by weight relative to the current URL, and enqueues each one.
      def process_html_links
        @html_doc
          .xpath('//a')
          .map{ |link|
            href = link['href']
            Uri.new(href) if !href.nil?
          }
          .select{ |link|
            !link.nil? && link.is_valid?
          }
          .sort{ |uri_a, uri_b|
            uri_a.weight(@uri) <=> uri_b.weight(@uri)
          }
          .each_with_index{ |new_uri, index|
            enqueue(new_uri, index)
          }
      end

      # Scans all <meta> tags and dispatches generator tags.
      def process_html_meta
        @html_doc.xpath('//meta').each do |meta|
          meta_name = meta['name']
          next if meta_name.nil?

          if meta_name.downcase == 'generator'
            process_html_meta_generator(meta)
          end
        end
      end

      # Records a <meta name="generator"> tag: creates a Generator entry
      # keyed by the SHA256 of its content and cross-links it with the URL.
      def process_html_meta_generator(meta)
        generator = meta['content']
        # A generator tag without a content attribute carries no data;
        # the original crashed in Digest::SHA256.hexdigest(nil) here.
        return if generator.nil?

        generator_hash = Digest::SHA256.hexdigest(generator)

        generator_id = nil
        generator_hash_id_key_name = "generators:id:#{generator_hash}"
        generator_key_name = nil

        @redis.write(['EXISTS', generator_hash_id_key_name])
        if @redis.read.to_b
          # Found an existing generator.
          @redis.write(['GET', generator_hash_id_key_name])
          generator_id = @redis.read

          generator_key_name = "generators:#{generator_id}"
        else
          # New generator: allocate the next generator id and insert it.
          @redis.write(['INCR', 'generators:id'])
          generator_id = @redis.read

          generator_key_name = "generators:#{generator_id}"
          @redis.write(['HMSET', generator_key_name,
            'name', generator,
            'hash', generator_hash,
            'first_url_id', @uri.id,
            'created_at', Time.now.strftime('%F %T %z'),
          ])
          @redis.read

          # Generator hash -> generator id reference.
          @redis.write(['SET', generator_hash_id_key_name, generator_id])
          @redis.read
        end

        # Always overwrite the last-used timestamp.
        @redis.write(['HSET', generator_key_name, 'last_used_at', Time.now.strftime('%F %T %z')])
        @redis.read

        # Cross-link URL and generator.
        @redis.write(['SADD', "generators:#{generator_id}:urls", @uri.id])
        @redis.read

        @redis.write(['SADD', "urls:#{@uri.id}:generators", generator_id])
        @redis.read
      end

      # Schedules new_uri for crawling via Resque. index staggers the
      # start times of links found on the same page. In 'serial' mode a
      # global 'urls:schedule:last' timestamp (guarded by a crude counter
      # lock) serializes enqueues to one URL per @url_delay.
      def enqueue(new_uri, index = 0, debug = false)
        if !@options['relative'] || new_uri.is_relative?(@uri)
          new_uri = @uri.join(new_uri)

          if new_uri.is_valid?
            new_uri_s = new_uri.to_s

            queued_time = (@url_delay + (@url_separate_delay * index)).seconds.from_now

            if @options['serial']
              # Wait while another process holds 'urls:schedule:last'.
              # NOTE(review): this GET-then-INCR sequence is not atomic,
              # so two workers can still pass the gate together.
              @redis.write(['GET', 'urls:schedule:lock'])
              lock = @redis.read.to_i.to_b
              while lock
                @redis.write(['GET', 'urls:schedule:lock'])
                lock = @redis.read.to_i.to_b
                sleep 0.1
              end

              # Lock 'urls:schedule:last' for other processes.
              @redis.write(['INCR', 'urls:schedule:lock'])
              @redis.read

              @redis.write(['GET', 'urls:schedule:last'])
              queued_time = @redis.read

              if queued_time.nil?
                queued_time = Time.now
              else
                queued_time = Time.parse(queued_time)
                if queued_time < Time.now
                  queued_time = Time.now
                end
              end
              queued_time += @url_delay

              @redis.write(['SET', 'urls:schedule:last', queued_time.strftime('%F %T %z')])
              @redis.read

              # Unlock 'urls:schedule:last' for other processes.
              @redis.write(['DECR', 'urls:schedule:lock'])
              @redis.read
            end

            puts "\t" + "enqueue #{@options['level']} #{index} #{queued_time} #{new_uri_s}"

            if !debug
              options = {
                'serial' => @options['serial'],
                'relative' => @options['relative'],
                'parent_id' => @uri.id,
                'level' => @options['level'] + 1,
              }
              Resque.enqueue_at(queued_time, TheFox::Sengi::CrawlerWorker, new_uri_s, options)
            end
          end
        end
      end

      # Re-schedules the current URL after a failed request.
      def reenqueue
        queued_time = @url_reschedule.seconds.from_now

        puts "\t" + "re-enqueue #{queued_time}"

        options = {
          'serial' => @options['serial'],
          'relative' => @options['relative'],
        }
        Resque.enqueue_at(queued_time, TheFox::Sengi::CrawlerWorker, @uri.to_s, options)
      end

    end

  end
end