news_crawler 0.0.0 → 0.0.2
checksums.yaml
CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 50e84e9674b22d98be7b72371513219da5a23d38
+  data.tar.gz: bd6cd50fe658c960134fdfff53631e32e63e4b76
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 405419795794f78bf0608d2b66707d842c69c7c53d1b312a2569e9c82e482563ea28d34c5183accd3bf4b213fc6291305d8295f9310918854cd24e19c1cf6a83
+  data.tar.gz: 53e089fb98b6a9e583c54119215ead3444fcc285d71f3bf5ca6f188b55db13a45fa6a87d414631a81dc2f092cedc726d77907d2d542671a721d92581a4876df8
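Version 0.0.0 shipped with empty checksum fields; 0.0.2 is the first release for which real SHA1 and SHA512 digests were recorded for both archives. A downloaded copy can be checked against these values with Ruby's stdlib (a minimal sketch, assuming metadata.gz and data.tar.gz from the unpacked .gem sit alongside checksums.yaml in the current directory):

require 'digest'
require 'yaml'

# checksums.yaml maps digest name -> { file name -> hex digest }
expected = YAML.load_file('checksums.yaml')

%w[metadata.gz data.tar.gz].each do |file|
  sha1_ok   = Digest::SHA1.file(file).hexdigest   == expected['SHA1'][file]
  sha512_ok = Digest::SHA512.file(file).hexdigest == expected['SHA512'][file]
  puts "#{file}: SHA1 #{sha1_ok ? 'ok' : 'mismatch'}, SHA512 #{sha512_ok ? 'ok' : 'mismatch'}"
end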
data/bin/news_crawler
CHANGED

@@ -28,6 +28,7 @@ require 'news_crawler/nc_logger'
 
 require 'news_crawler/downloader'
 require 'news_crawler/link_selector/same_domain_selector'
+require 'news_crawler/storage/url_queue'
 
 include NewsCrawler::Storage
 
@@ -54,6 +55,12 @@ OptionParser.new do | opts |
           'Maximum depth of url to crawl') do | d |
     options[:max_depth] = d
   end
+
+  opts.on('-t', "--time-out TIME", OptionParser::DecimalInteger,
+          "Wait time (in seconds) before stopping the crawl (the crawler isn't stopped immediately but is terminated gracefully)",
+          "If no time-out is specified you can stop the crawler by pressing Ctrl-C") do | t |
+    options[:time_out] = t
+  end
 end.parse!
 
 
@@ -71,7 +78,11 @@ end
 
 if ARGV.size > 0
   url = ARGV[0]
-
+  begin
+    URLQueue.add(url)
+  rescue NewsCrawler::Storage::URLQueue::DuplicateURLError
+    NewsCrawler::NCLogger.get_logger.info("URL already exists")
+  end
 end
 
 puts "Starting Downloader"
@@ -81,13 +92,22 @@ dwl.async.run
 puts "Starting SDS"
 se = NewsCrawler::LinkSelector::SameDomainSelector.new(options[:max_depth] || 1, false)
 se.async.run
+
+if options[:time_out]
+  sleep(options[:time_out])
+else
+  stop = false
+  while (!stop)
+    Signal.trap("INT") do | signo |
+      stop = true
+    end
+  end
+end
+
 puts "Stopping SDS"
 se.graceful_terminate
 se.terminate
 puts "SDS stopped"
-
-sleep(5)
-
 puts "Stopping Downloader"
 dwl.graceful_terminate
 dwl.terminate
data/lib/news_crawler/nc_logger.rb
CHANGED

@@ -34,13 +34,13 @@ module NewsCrawler
   end
 
   # Set logger level
-  # param [ Logger::Severity ] l level
+  # @param [ Logger::Severity ] l level
   def self.set_level(l)
     get_logger.level = l
   end
 
   # Set logger, should have the same API as Ruby's Logger
-  # param [ Object ] l logger
+  # @param [ Object ] ld logger
   def self.set_logdev(ld)
     @logger = Logger.new(ld)
     @logger.progname = 'news_crawler'
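The param-to-@param change matters for documentation output: YARD only recognizes tags that start with @, so the original comments rendered as plain text rather than as parameter docs. Usage of the two setters, based only on what the diffs show (a sketch; the Logger constant comes from Ruby's stdlib):

require 'news_crawler/nc_logger'
require 'logger'

NewsCrawler::NCLogger.set_logdev(STDERR)       # route log output to stderr
NewsCrawler::NCLogger.set_level(Logger::WARN)  # suppress info/debug messages
NewsCrawler::NCLogger.get_logger.info("hidden: below the WARN threshold")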
data/lib/news_crawler/storage/raw_data/mongo_storage.rb
CHANGED

@@ -43,8 +43,8 @@ module NewsCrawler
   end
 
   # Add entry to raw data collection, overwrite old data
-  # param [ String ] url
-  # param [ String ] body
+  # @param [ String ] url
+  # @param [ String ] body
   def add(url, body)
     @coll.update({:url => url},
                  {:$set => {:body => body}},
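The hunk ends mid-call, so the update's options are not visible here; given the "overwrite old data" comment, an upsert is the natural reading. A sketch of that pattern with the mongo 1.x driver API of the era (database and collection names are assumptions, as is the :upsert option):

require 'mongo'

coll = Mongo::MongoClient.new.db('news_crawler').collection('raw_data')

url  = 'http://example.com/'
body = '<html>...</html>'

coll.update({ :url  => url },
            { :$set => { :body => body } },
            { :upsert => true })  # insert the document if no :url match exists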
data/lib/news_crawler/storage/raw_data.rb
CHANGED

@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 #--
 # NewsCrawler - a website crawler
 #
@@ -41,8 +42,8 @@ module NewsCrawler
   end
 
   # Add entry to raw data collection
-  # param [ String ] url
-  # param [ String ] body
+  # @param [ String ] url
+  # @param [ String ] body
   def add(url, body)
     raise NotImplementedError
   end
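The raise NotImplementedError body marks #add as an abstract hook: concrete storage engines, such as the MongoDB-backed one above, are expected to override it. A toy engine showing the contract (purely illustrative; only the add(url, body) signature and its @param types come from the diff):

class MemoryRawData
  def initialize
    @data = {}
  end

  # Add entry to raw data collection
  # @param [ String ] url
  # @param [ String ] body
  def add(url, body)
    @data[url] = body   # overwrite any previous body for this url
  end
end

store = MemoryRawData.new
store.add('http://example.com/', '<html></html>')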