news_crawler 0.0.4 → 1.0.0.pre.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: be6f510fc6e9737a50f42f5a6f11f2c21b5cfcb4
-  data.tar.gz: add012faa2d72f7ec93b28df4b55faeea52dd86d
+  metadata.gz: 2ec4bfc6924849f911dfcc86f41fc9e3bb8c68a8
+  data.tar.gz: 292da2b3e3ae3836d2511b2f86687433039b7f87
 SHA512:
-  metadata.gz: 5498cf91cf3c169062edfdc9cdeff42adf4347cdb5addd5a36146e324bcfa96fb40806436034b59decc9730bd9b6ff7f7fbed8055980061f1b637bcb1d20ad82
-  data.tar.gz: 6d20e4c7121fa06898112862b467838f8008837b01e795241cbb744de43eb3e1f147c70ccdde94bbe4745363652041de91ed4341b55b36bcdc7f60b9c01cfca1
+  metadata.gz: 6eb68c98a026a0abb8fe3d1dd4a535ea5a3b4c17bc4ed52543c262ae440604b3515b80688ba06df53eb1e1cf9ff9228ad5fd92c3a55ba478291e7d7fa0fbdb16
+  data.tar.gz: 9ee6ad58b747ab1d4d9eee4b100cd506c8d0a3b751163c657203da8e4773320da915d0eca9583a5f29b91df0ef26f9696132e35cccd7701db28302e5900b0e0d
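Each value above is a digest of one member of the gem package (metadata.gz and data.tar.gz), recomputed for the new release. A minimal sketch of reproducing these digests with Ruby's standard library, assuming the two archives have already been extracted from the .gem file into the current directory:

```ruby
require 'digest'

# checksums.yaml stores one digest per packaged archive; recompute both
# and compare against the published values (the paths are an assumption).
%w[metadata.gz data.tar.gz].each do |member|
  puts "#{member} SHA1:   #{Digest::SHA1.file(member).hexdigest}"
  puts "#{member} SHA512: #{Digest::SHA512.file(member).hexdigest}"
end
```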
data/bin/news_crawler CHANGED
@@ -57,7 +57,7 @@ OptionParser.new do | opts |
   end
 
   opts.on('-t', "--time-out TIME", OptionParser::DecimalInteger,
-          "Wait time (in sec) before stop crawl (crawler is\'nt stopped immediately but terminated gracefully)",
+          "Wait time (in sec) before stop crawl",
           "If time out isn't specified you can stop crawler by press Ctrl-C") do | t |
     options[:time_out] = t
   end
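The shortened description belongs to a standard OptionParser flag; `OptionParser::DecimalInteger` is a built-in acceptor that coerces the argument to an Integer and rejects non-numeric input. A self-contained sketch of the same flag (the sample argv is invented for illustration):

```ruby
require 'optparse'

# Parse a --time-out flag; DecimalInteger converts "30" to 30 for us.
options = {}
OptionParser.new do |opts|
  opts.on('-t', '--time-out TIME', OptionParser::DecimalInteger,
          'Wait time (in sec) before stop crawl') do |t|
    options[:time_out] = t
  end
end.parse!(%w[--time-out 30])

p options # => {:time_out=>30}
```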
@@ -97,18 +97,16 @@ if options[:time_out]
   sleep(options[:time_out])
 else
   stop = false
-  while(!stop)
+  while(not stop)
     Signal.trap("INT") do | signo |
       stop = true
     end
   end
 end
 
-puts "Stoping SDS"
-se.graceful_terminate
-se.terminate
-puts "SDS stopped"
 puts "Stoping Downloader"
-dwl.graceful_terminate
 dwl.terminate
 puts "Downloader stopped"
+puts "Stoping SDS"
+se.terminate
+puts "SDS stopped"
@@ -42,18 +42,22 @@ module NewsCrawler
     # @param [ NewsCrawler::URLQueue ] queue url queue
     def initialize(start_on_create = true, queue = NewsCrawler::Storage::URLQueue, **opts)
       @queue = queue
-      @urls = queue.find_unvisited
       @concurrent_download = opts[:concurrent] || CONCURRENT_DOWNLOAD
       @wait_time = 1
-      @status = :running
       @stoping = false
+      get_new_url
       wait_for_url if start_on_create
     end
 
     # Start downloader with current queue
     # URL successed fetch is marked and result's stored in DB
     def run
-      @status = :running
+      wait_for_url
+    end
+
+    private
+    # Download urls are given (in @urls)
+    def download
       hydra = Typhoeus::Hydra.new(max_concurrency: @concurrent_download)
       # TODO Log here
       @urls = @urls.keep_if do | url |
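The new `download` method (continued in the next hunk) hands each batch of URLs to Typhoeus::Hydra, which runs the requests with bounded concurrency. A self-contained sketch of that pattern, with the URLs and the concurrency limit invented for illustration:

```ruby
require 'typhoeus'

# Queue a batch of requests, then run them at most four at a time;
# hydra.run blocks until every queued request has completed.
hydra = Typhoeus::Hydra.new(max_concurrency: 4)
%w[http://example.com/ http://example.org/].each do |url|
  request = Typhoeus::Request.new(url, followlocation: true)
  request.on_complete { |response| puts "#{url}: #{response.code}" }
  hydra.queue(request)
end
hydra.run
```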
@@ -73,40 +77,40 @@ module NewsCrawler
         re
       end
       hydra.run
-      @urls = []
-      wait_for_url
-    end
-
-    # Graceful terminate this downloader
-    def graceful_terminate
-      @stoping = true
-      while @status == :running
-        sleep(1)
-      end
     end
 
-    private
     # Waiting for new urls're added to queue, using backoff algorithms
+    # Invoke download when suitable
     def wait_for_url
-      @status = :waiting
-      if @stoping # check for stop flag
-        return
+      while not @stoping do
+        if @queuing_urls.size == 0
+          get_new_url
+        end
+        if @queuing_urls.size == 0
+          backoff_sleep
+        else
+          if @stoping
+            return
+          end
+          @wait_time = 1
+          @urls = @queuing_urls.shift(@concurrent_download * 2)
+          download
+          sleep 0.01 # delay to receive terminate signal
+        end
       end
+    end
+
+    # Sleep using backoff algorithm
+    # @params [ Fixnum ] seconds
+    def backoff_sleep
       sleep @wait_time
-      get_new_url
-      if @urls.size == 0
-        if @wait_time < 30
-          @wait_time = @wait_time * 2
-        end
-        wait_for_url
-      else
-        @wait_time = 1
-        run
+      if @wait_time * 2 <= 4
+        @wait_time = @wait_time * 2
       end
     end
 
     def get_new_url
-      @urls = @queue.find_unvisited
+      @queuing_urls = @queue.find_unvisited
     end
   end
 end
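The reworked loop separates polling from downloading: when the queue is empty it sleeps with an exponential backoff now capped at 4 seconds (down from 30), and it resets the interval to 1 second as soon as URLs arrive. A condensed sketch of that strategy, where `queue` and `process` are hypothetical stand-ins for the URL queue and the download step:

```ruby
# Exponential-backoff polling: double the sleep after each empty poll
# (capped at 4 seconds), reset to 1 second when work shows up.
def poll(queue)
  wait_time = 1
  loop do
    urls = queue.find_unvisited
    if urls.empty?
      sleep wait_time
      wait_time *= 2 if wait_time * 2 <= 4
    else
      wait_time = 1
      process(urls)
    end
  end
end
```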
@@ -87,6 +87,7 @@ module NewsCrawler
       }
     end
 
+    # run selector
     def run
       @status = :running
       return if @stoping
@@ -94,7 +95,7 @@ module NewsCrawler
         @status = :stopped
         return
       end
-      while !@stoping
+      while true do
         url = next_unprocessed(@max_depth - 1)
         while (url.nil?)
           wait_for_url
@@ -103,6 +104,7 @@ module NewsCrawler
         NCLogger.get_logger.info "Processing #{url}"
         extract_url(url)
         mark_processed(url)
+        sleep 0.01 # delay to receive terminate signal
       end
     end
 
@@ -154,20 +156,12 @@ module NewsCrawler
       return false
     end
 
-    # Graceful terminate this selector
-    def graceful_terminate
-      @stoping = true
-      while @status == :running
-        sleep(1)
-      end
-    end
-
     private
     # Waiting for new urls're added to queue, using backoff algorithms
     def wait_for_url
       @status = :waiting
       sleep @wait_time
-      if @wait_time < 30
+      if @wait_time < 4
         @wait_times = @wait_time * 2
       end
     end
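One detail worth noticing in the unchanged context: the doubled value is assigned to `@wait_times` (plural), a variable that appears nowhere else in these hunks, so the selector's sleep interval seems to stay fixed at `@wait_time`. A sketch of what the method presumably intends, under that assumption:

```ruby
# Presumed intent: write the doubled interval back to @wait_time
# (singular) so the backoff actually grows on successive waits.
def wait_for_url
  @status = :waiting
  sleep @wait_time
  @wait_time *= 2 if @wait_time < 4
end
```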
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: news_crawler
 version: !ruby/object:Gem::Version
-  version: 0.0.4
+  version: 1.0.0.pre.1
 platform: ruby
 authors:
 - Hà Quang Dương
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-08-12 00:00:00.000000000 Z
+date: 2013-08-16 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mongo
@@ -195,9 +195,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
     version: 2.0.0
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - - '>='
+  - - '>'
   - !ruby/object:Gem::Version
-    version: '0'
+    version: 1.3.1
 requirements: []
 rubyforge_project:
 rubygems_version: 2.0.3
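The `required_rubygems_version` change is not hand-written: RubyGems switches the requirement from `>= 0` to `> 1.3.1` automatically whenever a gem declares a prerelease version, because older RubyGems releases cannot install prerelease gems. A sketch of the gemspec side, trimmed to the relevant fields:

```ruby
# Declaring a prerelease version; the ".pre.1" segment is what makes
# RubyGems emit the "> 1.3.1" requirement seen in the metadata above.
Gem::Specification.new do |s|
  s.name    = 'news_crawler'
  s.version = '1.0.0.pre.1'
end

Gem::Version.new('1.0.0.pre.1').prerelease? # => true
```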