news_crawler 0.0.0 → 0.0.2

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: c77a9447146f179d2b1d3013874a8ea4377be1c5
-  data.tar.gz: f66b44459c9afc5f962e91bf135827b9014ce196
+  metadata.gz: 50e84e9674b22d98be7b72371513219da5a23d38
+  data.tar.gz: bd6cd50fe658c960134fdfff53631e32e63e4b76
 SHA512:
-  metadata.gz: 84feee86effa11929b40797b93e42a86491a710af20f3801e7ff8becebfa8f2d4f8a465bbe9fff1f4cb9683f49af21aac3df7dd27372cdd1f8d5567147014dd9
-  data.tar.gz: fc6a53bc7be2a01f26e78a9bcd8bd58a55b7cb34509be75ce5a1878a1311c794c7dcc4dd25ebc2697dbef2dc080dd1c57ae39f2ad03e5c7e0c23a5bdff227259
+  metadata.gz: 405419795794f78bf0608d2b66707d842c69c7c53d1b312a2569e9c82e482563ea28d34c5183accd3bf4b213fc6291305d8295f9310918854cd24e19c1cf6a83
+  data.tar.gz: 53e089fb98b6a9e583c54119215ead3444fcc285d71f3bf5ca6f188b55db13a45fa6a87d414631a81dc2f092cedc726d77907d2d542671a721d92581a4876df8
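These digests cover the two archives packed inside the .gem file. As a minimal verification sketch, assuming the package has been unpacked (e.g. with tar -xf news_crawler-0.0.2.gem; the file names below are standard RubyGems package members, not part of this diff):

# Recompute the SHA1/SHA512 digests recorded in checksums.yaml
# for the two archives of an unpacked .gem package.
require 'digest'

%w[metadata.gz data.tar.gz].each do |name|
  puts "#{name}:"
  puts "  SHA1:   #{Digest::SHA1.file(name).hexdigest}"
  puts "  SHA512: #{Digest::SHA512.file(name).hexdigest}"
end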
data/bin/news_crawler CHANGED
@@ -28,6 +28,7 @@ require 'news_crawler/nc_logger'
 
 require 'news_crawler/downloader'
 require 'news_crawler/link_selector/same_domain_selector'
+require 'news_crawler/storage/url_queue'
 
 include NewsCrawler::Storage
 
@@ -54,6 +55,12 @@ OptionParser.new do | opts |
           'Maximum depth of url to crawl') do | d |
     options[:max_depth] = d
   end
+
+  opts.on('-t', '--time-out TIME', OptionParser::DecimalInteger,
+          "Wait time (in seconds) before stopping the crawl (the crawler isn't stopped immediately but terminated gracefully)",
+          "If no time-out is given, stop the crawler by pressing Ctrl-C") do | t |
+    options[:time_out] = t
+  end
 end.parse!
 
 
@@ -71,7 +78,11 @@ end
 
 if ARGV.size > 0
   url = ARGV[0]
-  URLQueue.add(url)
+  begin
+    URLQueue.add(url)
+  rescue NewsCrawler::Storage::URLQueue::DuplicateURLError
+    NewsCrawler::NCLogger.get_logger.info("URL already exists")
+  end
 end
 
 puts "Starting Downloader"
@@ -81,13 +92,22 @@ dwl.async.run
 puts "Starting SDS"
 se = NewsCrawler::LinkSelector::SameDomainSelector.new(options[:max_depth] || 1, false)
 se.async.run
+
+if options[:time_out]
+  sleep(options[:time_out])
+else
+  # Wait for Ctrl-C: register the INT handler once, then sleep
+  stop = false
+  Signal.trap("INT") do | signo |
+    stop = true
+  end
+  sleep(0.1) until stop
+end
+
 puts "Stopping SDS"
 se.graceful_terminate
 se.terminate
 puts "SDS stopped"
-
-sleep(5)
-
 puts "Stopping Downloader"
 dwl.graceful_terminate
 dwl.terminate
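The new -t/--time-out switch uses OptionParser::DecimalInteger, a stock acceptor that coerces the argument to an Integer and rejects non-numeric input. A self-contained sketch of just that option (the value 60 is illustrative):

require 'optparse'

options = {}
OptionParser.new do |opts|
  # DecimalInteger turns '60' into 60 and raises
  # OptionParser::InvalidArgument for anything non-numeric.
  opts.on('-t', '--time-out TIME', OptionParser::DecimalInteger,
          'Seconds to crawl before graceful shutdown') do |t|
    options[:time_out] = t
  end
end.parse!(%w[--time-out 60])

p options  #=> {:time_out=>60}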
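When no time-out is given, the script now blocks until SIGINT instead of exiting after a fixed sleep(5). The trap-and-sleep wait, shown as a self-contained sketch (the 0.1 s poll interval is an arbitrary choice):

# Register the INT handler once; the main thread then naps until
# Ctrl-C flips the flag, so the shutdown code after the loop runs.
stop = false
Signal.trap('INT') { stop = true }

puts 'Crawling... press Ctrl-C to stop'
sleep(0.1) until stop
puts 'Shutting down gracefully'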
@@ -34,13 +34,13 @@ module NewsCrawler
   end
 
   # Set logger level
-  # param [ Logger::Severity ] l level
+  # @param [ Logger::Severity ] l level
   def self.set_level(l)
     get_logger.level = l
   end
 
   # Set logger, should same API as Ruby Logger
-  # param [ Object ] l logger
+  # @param [ Object ] ld logger
   def self.set_logdev(ld)
     @logger = Logger.new(ld)
     @logger.progname = 'news_crawler'
@@ -48,8 +48,8 @@ module NewsCrawler
   end
 
   # Add entry to raw data collection
-  # param [ String ] url
-  # param [ String ] body
+  # @param [ String ] url
+  # @param [ String ] body
   def add(url, body)
     @engine.add(url, body)
   end
@@ -43,8 +43,8 @@ module NewsCrawler
   end
 
   # Add entry to raw data collection, overwrite old data
-  # param [ String ] url
-  # param [ String ] body
+  # @param [ String ] url
+  # @param [ String ] body
   def add(url, body)
     @coll.update({:url => url},
                  {:$set => {:body => body}},
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 #--
 # NewsCrawler - a website crawler
 #
@@ -41,8 +42,8 @@ module NewsCrawler
   end
 
   # Add entry to raw data collection
-  # param [ String ] url
-  # param [ String ] body
+  # @param [ String ] url
+  # @param [ String ] body
   def add(url, body)
     raise NotImplementedError
   end
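The param → @param fixes matter because YARD only recognizes tags that start with @; an untagged "param" line renders as plain comment text. A sketch of a fully tagged method in the same style (the @return tag and parameter descriptions are additions for illustration):

# Add entry to raw data collection
# @param [ String ] url  address the body was fetched from
# @param [ String ] body raw page content
# @return [ void ]
def add(url, body)
  @engine.add(url, body)
end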
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: news_crawler
 version: !ruby/object:Gem::Version
-  version: 0.0.0
+  version: 0.0.2
 platform: ruby
 authors:
 - Hà Quang Dương