news_crawler 0.0.0 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: c77a9447146f179d2b1d3013874a8ea4377be1c5
-  data.tar.gz: f66b44459c9afc5f962e91bf135827b9014ce196
+  metadata.gz: 50e84e9674b22d98be7b72371513219da5a23d38
+  data.tar.gz: bd6cd50fe658c960134fdfff53631e32e63e4b76
 SHA512:
-  metadata.gz: 84feee86effa11929b40797b93e42a86491a710af20f3801e7ff8becebfa8f2d4f8a465bbe9fff1f4cb9683f49af21aac3df7dd27372cdd1f8d5567147014dd9
-  data.tar.gz: fc6a53bc7be2a01f26e78a9bcd8bd58a55b7cb34509be75ce5a1878a1311c794c7dcc4dd25ebc2697dbef2dc080dd1c57ae39f2ad03e5c7e0c23a5bdff227259
+  metadata.gz: 405419795794f78bf0608d2b66707d842c69c7c53d1b312a2569e9c82e482563ea28d34c5183accd3bf4b213fc6291305d8295f9310918854cd24e19c1cf6a83
+  data.tar.gz: 53e089fb98b6a9e583c54119215ead3444fcc285d71f3bf5ca6f188b55db13a45fa6a87d414631a81dc2f092cedc726d77907d2d542671a721d92581a4876df8
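
Each digest above covers one member of the .gem archive (a plain tar containing metadata.gz, data.tar.gz, and checksums.yaml.gz). A minimal Ruby sketch of verifying the digests locally; the gem filename is an assumed local path, and only the SHA512 entries are checked:

require 'digest'
require 'rubygems/package'
require 'yaml'
require 'zlib'

checksums = nil
members   = {}

# 'news_crawler-0.0.2.gem' is a hypothetical path to the fetched gem file.
File.open('news_crawler-0.0.2.gem', 'rb') do |io|
  Gem::Package::TarReader.new(io) do |tar|
    tar.each do |entry|
      case entry.full_name
      when 'checksums.yaml.gz'
        checksums = YAML.safe_load(Zlib.gunzip(entry.read))
      when 'metadata.gz', 'data.tar.gz'
        members[entry.full_name] = entry.read
      end
    end
  end
end

checksums['SHA512'].each do |name, expected|
  actual = Digest::SHA512.hexdigest(members.fetch(name))
  puts "#{name}: #{actual == expected ? 'OK' : 'MISMATCH'}"
end
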
data/bin/news_crawler CHANGED
@@ -28,6 +28,7 @@ require 'news_crawler/nc_logger'
 
 require 'news_crawler/downloader'
 require 'news_crawler/link_selector/same_domain_selector'
+require 'news_crawler/storage/url_queue'
 
 include NewsCrawler::Storage
 
@@ -54,6 +55,12 @@ OptionParser.new do | opts |
           'Maximum depth of url to crawl') do | d |
     options[:max_depth] = d
   end
+
+  opts.on('-t', "--time-out TIME", OptionParser::DecimalInteger,
+          "Wait time (in sec) before stop crawl (crawler is\'nt stopped immediately but terminated gracefully)",
+          "If time out isn't specified you can stop crawler by press Ctrl-C") do | t |
+    options[:time_out] = t
+  end
 end.parse!
 
 
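The new -t/--time-out flag leans on OptionParser::DecimalInteger, a stock acceptance pattern that validates the argument and yields it as an Integer (non-numeric input raises OptionParser::InvalidArgument). A standalone sketch of the same pattern, with illustrative names:

require 'optparse'

options = {}
OptionParser.new do |opts|
  # DecimalInteger coerces the argument to Integer before the block runs.
  opts.on('-t', '--time-out TIME', OptionParser::DecimalInteger,
          'Seconds to run before shutting down') do |t|
    options[:time_out] = t
  end
end.parse!(['--time-out', '30'])

p options  # => {:time_out=>30}
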
@@ -71,7 +78,11 @@ end
 
 if ARGV.size > 0
   url = ARGV[0]
-  URLQueue.add(url)
+  begin
+    URLQueue.add(url)
+  rescue NewsCrawler::Storage::URLQueue::DuplicateURLError
+    NewsCrawler::NCLogger.get_logger.info("URL existed")
+  end
 end
 
 puts "Starting Downloader"
@@ -81,13 +92,22 @@ dwl.async.run
 puts "Starting SDS"
 se = NewsCrawler::LinkSelector::SameDomainSelector.new(options[:max_depth] || 1, false)
 se.async.run
+
+if options[:time_out]
+  sleep(options[:time_out])
+else
+  stop = false
+  while(!stop)
+    Signal.trap("INT") do | signo |
+      stop = true
+    end
+  end
+end
+
 puts "Stoping SDS"
 se.graceful_terminate
 se.terminate
 puts "SDS stopped"
-
-sleep(5)
-
 puts "Stoping Downloader"
 dwl.graceful_terminate
 dwl.terminate
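
Note that the added wait loop re-installs the INT handler on every iteration and busy-spins a CPU core until Ctrl-C arrives. Signal.trap only needs to run once; a sketch of an equivalent wait that parks the main thread instead (not the gem's code):

if options[:time_out]
  sleep(options[:time_out])
else
  # Install the handler once; a bare `sleep` parks the main thread until
  # Thread#wakeup is called from the trap context.
  main = Thread.current
  Signal.trap('INT') { main.wakeup }
  sleep
end
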
@@ -34,13 +34,13 @@ module NewsCrawler
   end
 
   # Set logger level
-  # param [ Logger::Severity ] l level
+  # @param [ Logger::Severity ] l level
   def self.set_level(l)
     get_logger.level = l
   end
 
   # Set logger, should same API as Ruby Logger
-  # param [ Object ] l logger
+  # @param [ Object ] l logger
   def self.set_logdev(ld)
     @logger = Logger.new(ld)
     @logger.progname = 'news_crawler'
@@ -48,8 +48,8 @@ module NewsCrawler
   end
 
   # Add entry to raw data collection
-  # param [ String ] url
-  # param [ String ] body
+  # @param [ String ] url
+  # @param [ String ] body
   def add(url, body)
     @engine.add(url, body)
   end
@@ -43,8 +43,8 @@ module NewsCrawler
   end
 
   # Add entry to raw data collection, overwrite old data
-  # param [ String ] url
-  # param [ String ] body
+  # @param [ String ] url
+  # @param [ String ] body
   def add(url, body)
     @coll.update({:url => url},
                  {:$set => {:body => body}},
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 #--
 # NewsCrawler - a website crawler
 #
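
The added magic comment declares the file's source encoding. Ruby 1.9, current when 0.0.2 shipped, treats source files as US-ASCII by default, so any non-ASCII byte in a literal or comment is a SyntaxError without it; Ruby 2.0 and later default to UTF-8, which is why the comment has since fallen out of use. An illustrative two-liner:

# -*- coding: utf-8 -*-
GREETING = 'Xin chào'  # non-ASCII literal; invalid on Ruby 1.9 without the comment
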
@@ -41,8 +42,8 @@ module NewsCrawler
   end
 
   # Add entry to raw data collection
-  # param [ String ] url
-  # param [ String ] body
+  # @param [ String ] url
+  # @param [ String ] body
   def add(url, body)
     raise NotImplementedError
   end
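
The recurring comment fix across these hunks, param to @param, matters because YARD only parses comment lines that begin with an @-prefixed tag; the originals rendered as plain description text rather than parameter documentation. A correctly tagged method for reference (illustrative, not from the gem):

# Fetch a page body for later parsing.
# @param [String] url address of the page to fetch
# @return [String] the raw response body
def fetch(url)
  # ...
end
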
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: news_crawler
 version: !ruby/object:Gem::Version
-  version: 0.0.0
+  version: 0.0.2
 platform: ruby
 authors:
 - Hà Quang Dương