news_crawler 0.0.4 → 1.0.0.pre.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/news_crawler +5 -7
- data/lib/news_crawler/downloader.rb +31 -27
- data/lib/news_crawler/link_selector/same_domain_selector.rb +4 -10
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2ec4bfc6924849f911dfcc86f41fc9e3bb8c68a8
|
4
|
+
data.tar.gz: 292da2b3e3ae3836d2511b2f86687433039b7f87
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6eb68c98a026a0abb8fe3d1dd4a535ea5a3b4c17bc4ed52543c262ae440604b3515b80688ba06df53eb1e1cf9ff9228ad5fd92c3a55ba478291e7d7fa0fbdb16
|
7
|
+
data.tar.gz: 9ee6ad58b747ab1d4d9eee4b100cd506c8d0a3b751163c657203da8e4773320da915d0eca9583a5f29b91df0ef26f9696132e35cccd7701db28302e5900b0e0d
|
data/bin/news_crawler
CHANGED
@@ -57,7 +57,7 @@ OptionParser.new do | opts |
|
|
57
57
|
end
|
58
58
|
|
59
59
|
opts.on('-t', "--time-out TIME", OptionParser::DecimalInteger,
|
60
|
-
"Wait time (in sec) before stop crawl
|
60
|
+
"Wait time (in sec) before stop crawl",
|
61
61
|
"If time out isn't specified you can stop crawler by press Ctrl-C") do | t |
|
62
62
|
options[:time_out] = t
|
63
63
|
end
|
@@ -97,18 +97,16 @@ if options[:time_out]
|
|
97
97
|
sleep(options[:time_out])
|
98
98
|
else
|
99
99
|
stop = false
|
100
|
-
while(
|
100
|
+
while(not stop)
|
101
101
|
Signal.trap("INT") do | signo |
|
102
102
|
stop = true
|
103
103
|
end
|
104
104
|
end
|
105
105
|
end
|
106
106
|
|
107
|
-
puts "Stoping SDS"
|
108
|
-
se.graceful_terminate
|
109
|
-
se.terminate
|
110
|
-
puts "SDS stopped"
|
111
107
|
puts "Stoping Downloader"
|
112
|
-
dwl.graceful_terminate
|
113
108
|
dwl.terminate
|
114
109
|
puts "Downloader stopped"
|
110
|
+
puts "Stoping SDS"
|
111
|
+
se.terminate
|
112
|
+
puts "SDS stopped"
|
data/lib/news_crawler/downloader.rb
CHANGED
@@ -42,18 +42,22 @@ module NewsCrawler
|
|
42
42
|
# @param [ NewsCrawler::URLQueue ] queue url queue
|
43
43
|
def initialize(start_on_create = true, queue = NewsCrawler::Storage::URLQueue, **opts)
|
44
44
|
@queue = queue
|
45
|
-
@urls = queue.find_unvisited
|
46
45
|
@concurrent_download = opts[:concurrent] || CONCURRENT_DOWNLOAD
|
47
46
|
@wait_time = 1
|
48
|
-
@status = :running
|
49
47
|
@stoping = false
|
48
|
+
get_new_url
|
50
49
|
wait_for_url if start_on_create
|
51
50
|
end
|
52
51
|
|
53
52
|
# Start downloader with current queue
|
54
53
|
# URL successed fetch is marked and result's stored in DB
|
55
54
|
def run
|
56
|
-
|
55
|
+
wait_for_url
|
56
|
+
end
|
57
|
+
|
58
|
+
private
|
59
|
+
# Download urls are given (in @urls)
|
60
|
+
def download
|
57
61
|
hydra = Typhoeus::Hydra.new(max_concurrency: @concurrent_download)
|
58
62
|
# TODO Log here
|
59
63
|
@urls = @urls.keep_if do | url |
|
@@ -73,40 +77,40 @@ module NewsCrawler
|
|
73
77
|
re
|
74
78
|
end
|
75
79
|
hydra.run
|
76
|
-
@urls = []
|
77
|
-
wait_for_url
|
78
|
-
end
|
79
|
-
|
80
|
-
# Graceful terminate this downloader
|
81
|
-
def graceful_terminate
|
82
|
-
@stoping = true
|
83
|
-
while @status == :running
|
84
|
-
sleep(1)
|
85
|
-
end
|
86
80
|
end
|
87
81
|
|
88
|
-
private
|
89
82
|
# Waiting for new urls're added to queue, using backoff algorithms
|
83
|
+
# Invoke download when suitable
|
90
84
|
def wait_for_url
|
91
|
-
@
|
92
|
-
|
93
|
-
|
85
|
+
while not @stoping do
|
86
|
+
if @queuing_urls.size == 0
|
87
|
+
get_new_url
|
88
|
+
end
|
89
|
+
if @queuing_urls.size == 0
|
90
|
+
backoff_sleep
|
91
|
+
else
|
92
|
+
if @stoping
|
93
|
+
return
|
94
|
+
end
|
95
|
+
@wait_time = 1
|
96
|
+
@urls = @queuing_urls.shift(@concurrent_download * 2)
|
97
|
+
download
|
98
|
+
sleep 0.01 # delay to receive terminate signal
|
99
|
+
end
|
94
100
|
end
|
101
|
+
end
|
102
|
+
|
103
|
+
# Sleep using backoff algorithm
|
104
|
+
# @params [ Fixnum ] seconds
|
105
|
+
def backoff_sleep
|
95
106
|
sleep @wait_time
|
96
|
-
|
97
|
-
|
98
|
-
if @wait_time < 30
|
99
|
-
@wait_time = @wait_time * 2
|
100
|
-
end
|
101
|
-
wait_for_url
|
102
|
-
else
|
103
|
-
@wait_time = 1
|
104
|
-
run
|
107
|
+
if @wait_time * 2 <= 4
|
108
|
+
@wait_time = @wait_time * 2
|
105
109
|
end
|
106
110
|
end
|
107
111
|
|
108
112
|
def get_new_url
|
109
|
-
@
|
113
|
+
@queuing_urls = @queue.find_unvisited
|
110
114
|
end
|
111
115
|
end
|
112
116
|
end
|
data/lib/news_crawler/link_selector/same_domain_selector.rb
CHANGED
@@ -87,6 +87,7 @@ module NewsCrawler
|
|
87
87
|
}
|
88
88
|
end
|
89
89
|
|
90
|
+
# run selector
|
90
91
|
def run
|
91
92
|
@status = :running
|
92
93
|
return if @stoping
|
@@ -94,7 +95,7 @@ module NewsCrawler
|
|
94
95
|
@status = :stopped
|
95
96
|
return
|
96
97
|
end
|
97
|
-
while
|
98
|
+
while true do
|
98
99
|
url = next_unprocessed(@max_depth - 1)
|
99
100
|
while (url.nil?)
|
100
101
|
wait_for_url
|
@@ -103,6 +104,7 @@ module NewsCrawler
|
|
103
104
|
NCLogger.get_logger.info "Processing #{url}"
|
104
105
|
extract_url(url)
|
105
106
|
mark_processed(url)
|
107
|
+
sleep 0.01 # delay to receive terminate signal
|
106
108
|
end
|
107
109
|
end
|
108
110
|
|
@@ -154,20 +156,12 @@ module NewsCrawler
|
|
154
156
|
return false
|
155
157
|
end
|
156
158
|
|
157
|
-
# Graceful terminate this selector
|
158
|
-
def graceful_terminate
|
159
|
-
@stoping = true
|
160
|
-
while @status == :running
|
161
|
-
sleep(1)
|
162
|
-
end
|
163
|
-
end
|
164
|
-
|
165
159
|
private
|
166
160
|
# Waiting for new urls're added to queue, using backoff algorithms
|
167
161
|
def wait_for_url
|
168
162
|
@status = :waiting
|
169
163
|
sleep @wait_time
|
170
|
-
if @wait_time <
|
164
|
+
if @wait_time < 4
|
171
165
|
@wait_times = @wait_time * 2
|
172
166
|
end
|
173
167
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: news_crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.4
|
4
|
+
version: 1.0.0.pre.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Hà Quang Dương
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-08-
|
11
|
+
date: 2013-08-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: mongo
|
@@ -195,9 +195,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
195
195
|
version: 2.0.0
|
196
196
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
197
197
|
requirements:
|
198
|
-
- - '
|
198
|
+
- - '>'
|
199
199
|
- !ruby/object:Gem::Version
|
200
|
-
version:
|
200
|
+
version: 1.3.1
|
201
201
|
requirements: []
|
202
202
|
rubyforge_project:
|
203
203
|
rubygems_version: 2.0.3
|