news_crawler 0.0.4 → 1.0.0.pre.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/news_crawler +5 -7
- data/lib/news_crawler/downloader.rb +31 -27
- data/lib/news_crawler/link_selector/same_domain_selector.rb +4 -10
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2ec4bfc6924849f911dfcc86f41fc9e3bb8c68a8
|
4
|
+
data.tar.gz: 292da2b3e3ae3836d2511b2f86687433039b7f87
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6eb68c98a026a0abb8fe3d1dd4a535ea5a3b4c17bc4ed52543c262ae440604b3515b80688ba06df53eb1e1cf9ff9228ad5fd92c3a55ba478291e7d7fa0fbdb16
|
7
|
+
data.tar.gz: 9ee6ad58b747ab1d4d9eee4b100cd506c8d0a3b751163c657203da8e4773320da915d0eca9583a5f29b91df0ef26f9696132e35cccd7701db28302e5900b0e0d
|
data/bin/news_crawler
CHANGED
@@ -57,7 +57,7 @@ OptionParser.new do | opts |
|
|
57
57
|
end
|
58
58
|
|
59
59
|
opts.on('-t', "--time-out TIME", OptionParser::DecimalInteger,
|
60
|
-
"Wait time (in sec) before stop crawl
|
60
|
+
"Wait time (in sec) before stop crawl",
|
61
61
|
"If time out isn't specified you can stop crawler by press Ctrl-C") do | t |
|
62
62
|
options[:time_out] = t
|
63
63
|
end
|
@@ -97,18 +97,16 @@ if options[:time_out]
|
|
97
97
|
sleep(options[:time_out])
|
98
98
|
else
|
99
99
|
stop = false
|
100
|
-
while(
|
100
|
+
while(not stop)
|
101
101
|
Signal.trap("INT") do | signo |
|
102
102
|
stop = true
|
103
103
|
end
|
104
104
|
end
|
105
105
|
end
|
106
106
|
|
107
|
-
puts "Stoping SDS"
|
108
|
-
se.graceful_terminate
|
109
|
-
se.terminate
|
110
|
-
puts "SDS stopped"
|
111
107
|
puts "Stoping Downloader"
|
112
|
-
dwl.graceful_terminate
|
113
108
|
dwl.terminate
|
114
109
|
puts "Downloader stopped"
|
110
|
+
puts "Stoping SDS"
|
111
|
+
se.terminate
|
112
|
+
puts "SDS stopped"
|
data/lib/news_crawler/downloader.rb
CHANGED
@@ -42,18 +42,22 @@ module NewsCrawler
|
|
42
42
|
# @param [ NewsCrawler::URLQueue ] queue url queue
|
43
43
|
def initialize(start_on_create = true, queue = NewsCrawler::Storage::URLQueue, **opts)
|
44
44
|
@queue = queue
|
45
|
-
@urls = queue.find_unvisited
|
46
45
|
@concurrent_download = opts[:concurrent] || CONCURRENT_DOWNLOAD
|
47
46
|
@wait_time = 1
|
48
|
-
@status = :running
|
49
47
|
@stoping = false
|
48
|
+
get_new_url
|
50
49
|
wait_for_url if start_on_create
|
51
50
|
end
|
52
51
|
|
53
52
|
# Start downloader with current queue
|
54
53
|
# URL successed fetch is marked and result's stored in DB
|
55
54
|
def run
|
56
|
-
|
55
|
+
wait_for_url
|
56
|
+
end
|
57
|
+
|
58
|
+
private
|
59
|
+
# Download urls are given (in @urls)
|
60
|
+
def download
|
57
61
|
hydra = Typhoeus::Hydra.new(max_concurrency: @concurrent_download)
|
58
62
|
# TODO Log here
|
59
63
|
@urls = @urls.keep_if do | url |
|
@@ -73,40 +77,40 @@ module NewsCrawler
|
|
73
77
|
re
|
74
78
|
end
|
75
79
|
hydra.run
|
76
|
-
@urls = []
|
77
|
-
wait_for_url
|
78
|
-
end
|
79
|
-
|
80
|
-
# Graceful terminate this downloader
|
81
|
-
def graceful_terminate
|
82
|
-
@stoping = true
|
83
|
-
while @status == :running
|
84
|
-
sleep(1)
|
85
|
-
end
|
86
80
|
end
|
87
81
|
|
88
|
-
private
|
89
82
|
# Waiting for new urls're added to queue, using backoff algorithms
|
83
|
+
# Invoke download when suitable
|
90
84
|
def wait_for_url
|
91
|
-
@
|
92
|
-
|
93
|
-
|
85
|
+
while not @stoping do
|
86
|
+
if @queuing_urls.size == 0
|
87
|
+
get_new_url
|
88
|
+
end
|
89
|
+
if @queuing_urls.size == 0
|
90
|
+
backoff_sleep
|
91
|
+
else
|
92
|
+
if @stoping
|
93
|
+
return
|
94
|
+
end
|
95
|
+
@wait_time = 1
|
96
|
+
@urls = @queuing_urls.shift(@concurrent_download * 2)
|
97
|
+
download
|
98
|
+
sleep 0.01 # delay to receive terminate signal
|
99
|
+
end
|
94
100
|
end
|
101
|
+
end
|
102
|
+
|
103
|
+
# Sleep using backoff algorithm
|
104
|
+
# @params [ Fixnum ] seconds
|
105
|
+
def backoff_sleep
|
95
106
|
sleep @wait_time
|
96
|
-
|
97
|
-
|
98
|
-
if @wait_time < 30
|
99
|
-
@wait_time = @wait_time * 2
|
100
|
-
end
|
101
|
-
wait_for_url
|
102
|
-
else
|
103
|
-
@wait_time = 1
|
104
|
-
run
|
107
|
+
if @wait_time * 2 <= 4
|
108
|
+
@wait_time = @wait_time * 2
|
105
109
|
end
|
106
110
|
end
|
107
111
|
|
108
112
|
def get_new_url
|
109
|
-
@
|
113
|
+
@queuing_urls = @queue.find_unvisited
|
110
114
|
end
|
111
115
|
end
|
112
116
|
end
|
data/lib/news_crawler/link_selector/same_domain_selector.rb
CHANGED
@@ -87,6 +87,7 @@ module NewsCrawler
|
|
87
87
|
}
|
88
88
|
end
|
89
89
|
|
90
|
+
# run selector
|
90
91
|
def run
|
91
92
|
@status = :running
|
92
93
|
return if @stoping
|
@@ -94,7 +95,7 @@ module NewsCrawler
|
|
94
95
|
@status = :stopped
|
95
96
|
return
|
96
97
|
end
|
97
|
-
while
|
98
|
+
while true do
|
98
99
|
url = next_unprocessed(@max_depth - 1)
|
99
100
|
while (url.nil?)
|
100
101
|
wait_for_url
|
@@ -103,6 +104,7 @@ module NewsCrawler
|
|
103
104
|
NCLogger.get_logger.info "Processing #{url}"
|
104
105
|
extract_url(url)
|
105
106
|
mark_processed(url)
|
107
|
+
sleep 0.01 # delay to receive terminate signal
|
106
108
|
end
|
107
109
|
end
|
108
110
|
|
@@ -154,20 +156,12 @@ module NewsCrawler
|
|
154
156
|
return false
|
155
157
|
end
|
156
158
|
|
157
|
-
# Graceful terminate this selector
|
158
|
-
def graceful_terminate
|
159
|
-
@stoping = true
|
160
|
-
while @status == :running
|
161
|
-
sleep(1)
|
162
|
-
end
|
163
|
-
end
|
164
|
-
|
165
159
|
private
|
166
160
|
# Waiting for new urls're added to queue, using backoff algorithms
|
167
161
|
def wait_for_url
|
168
162
|
@status = :waiting
|
169
163
|
sleep @wait_time
|
170
|
-
if @wait_time <
|
164
|
+
if @wait_time < 4
|
171
165
|
@wait_times = @wait_time * 2
|
172
166
|
end
|
173
167
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: news_crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 1.0.0.pre.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Hà Quang Dương
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-08-
|
11
|
+
date: 2013-08-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: mongo
|
@@ -195,9 +195,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
195
195
|
version: 2.0.0
|
196
196
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
197
197
|
requirements:
|
198
|
-
- - '
|
198
|
+
- - '>'
|
199
199
|
- !ruby/object:Gem::Version
|
200
|
-
version:
|
200
|
+
version: 1.3.1
|
201
201
|
requirements: []
|
202
202
|
rubyforge_project:
|
203
203
|
rubygems_version: 2.0.3
|