news_crawler 0.0.0 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 50e84e9674b22d98be7b72371513219da5a23d38
+  data.tar.gz: bd6cd50fe658c960134fdfff53631e32e63e4b76
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 405419795794f78bf0608d2b66707d842c69c7c53d1b312a2569e9c82e482563ea28d34c5183accd3bf4b213fc6291305d8295f9310918854cd24e19c1cf6a83
+  data.tar.gz: 53e089fb98b6a9e583c54119215ead3444fcc285d71f3bf5ca6f188b55db13a45fa6a87d414631a81dc2f092cedc726d77907d2d542671a721d92581a4876df8
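
Version 0.0.2 fills in the SHA1 and SHA512 archive checksums that were left empty in the 0.0.0 release. For anyone who wants to double-check the published values, here is a minimal sketch (not part of the gem) that assumes metadata.gz and data.tar.gz have already been extracted from the downloaded news_crawler-0.0.2.gem archive into the current directory:

    require 'digest'

    # Assumed layout: metadata.gz and data.tar.gz extracted from the .gem
    # archive (a .gem file is a tar archive, e.g. `tar -xf news_crawler-0.0.2.gem`).
    %w[metadata.gz data.tar.gz].each do |member|
      puts "#{member} SHA1:   #{Digest::SHA1.file(member).hexdigest}"
      puts "#{member} SHA512: #{Digest::SHA512.file(member).hexdigest}"
    end

The printed digests should match the values added above.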
data/bin/news_crawler CHANGED

@@ -28,6 +28,7 @@ require 'news_crawler/nc_logger'
 
 require 'news_crawler/downloader'
 require 'news_crawler/link_selector/same_domain_selector'
+require 'news_crawler/storage/url_queue'
 
 include NewsCrawler::Storage
 
@@ -54,6 +55,12 @@ OptionParser.new do | opts |
           'Maximum depth of url to crawl') do | d |
     options[:max_depth] = d
   end
+
+  opts.on('-t', "--time-out TIME", OptionParser::DecimalInteger,
+          "Wait time (in sec) before stop crawl (crawler is\'nt stopped immediately but terminated gracefully)",
+          "If time out isn't specified you can stop crawler by press Ctrl-C") do | t |
+    options[:time_out] = t
+  end
 end.parse!
 
 
@@ -71,7 +78,11 @@ end
 
 if ARGV.size > 0
   url = ARGV[0]
-
+  begin
+    URLQueue.add(url)
+  rescue NewsCrawler::Storage::URLQueue::DuplicateURLError
+    NewsCrawler::NCLogger.get_logger.info("URL existed")
+  end
 end
 
 puts "Starting Downloader"
@@ -81,13 +92,22 @@ dwl.async.run
 puts "Starting SDS"
 se = NewsCrawler::LinkSelector::SameDomainSelector.new(options[:max_depth] || 1, false)
 se.async.run
+
+if options[:time_out]
+  sleep(options[:time_out])
+else
+  stop = false
+  while(!stop)
+    Signal.trap("INT") do | signo |
+      stop = true
+    end
+  end
+end
+
 puts "Stoping SDS"
 se.graceful_terminate
 se.terminate
 puts "SDS stopped"
-
-sleep(5)
-
 puts "Stoping Downloader"
 dwl.graceful_terminate
 dwl.terminate
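
The main behavioural change in the executable is the shutdown logic: instead of always sleeping five seconds, 0.0.2 either waits for the number of seconds given via the new -t/--time-out option or keeps running until the user presses Ctrl-C, and only then terminates the selector and downloader. A rough standalone sketch of that stop pattern (variable names and the polling interval are illustrative, not taken from the gem):

    # Wait either for a fixed timeout or for Ctrl-C (SIGINT), then shut down.
    timeout = ARGV[0] && Integer(ARGV[0])   # hypothetical: optional seconds argument

    if timeout
      sleep(timeout)
    else
      stop = false
      Signal.trap("INT") { stop = true }    # install the handler once
      sleep(0.5) until stop                 # poll the flag instead of busy-spinning
    end

    puts "shutting down gracefully"

The sketch installs the signal handler a single time before waiting, whereas the released script re-registers the trap inside its while loop; the observable behaviour is the same, Ctrl-C flips the flag and the loop exits.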
@@ -34,13 +34,13 @@ module NewsCrawler
     end
 
     # Set logger level
-    # param [ Logger::Severity ] l level
+    # @param [ Logger::Severity ] l level
     def self.set_level(l)
       get_logger.level = l
     end
 
     # Set logger, should same API as Ruby Logger
-    # param [ Object ] l logger
+    # @param [ Object ] l logger
     def self.set_logdev(ld)
       @logger = Logger.new(ld)
       @logger.progname = 'news_crawler'
@@ -43,8 +43,8 @@ module NewsCrawler
       end
 
       # Add entry to raw data collection, overwrite old data
-      # param [ String ] url
-      # param [ String ] body
+      # @param [ String ] url
+      # @param [ String ] body
       def add(url, body)
         @coll.update({:url => url},
                      {:$set => {:body => body}},
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 #--
 # NewsCrawler - a website crawler
 #
@@ -41,8 +42,8 @@ module NewsCrawler
     end
 
     # Add entry to raw data collection
-    # param [ String ] url
-    # param [ String ] body
+    # @param [ String ] url
+    # @param [ String ] body
     def add(url, body)
       raise NotImplementedError
     end