news_crawler 0.0.4 → 1.0.0.pre.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: be6f510fc6e9737a50f42f5a6f11f2c21b5cfcb4
4
- data.tar.gz: add012faa2d72f7ec93b28df4b55faeea52dd86d
3
+ metadata.gz: 2ec4bfc6924849f911dfcc86f41fc9e3bb8c68a8
4
+ data.tar.gz: 292da2b3e3ae3836d2511b2f86687433039b7f87
5
5
  SHA512:
6
- metadata.gz: 5498cf91cf3c169062edfdc9cdeff42adf4347cdb5addd5a36146e324bcfa96fb40806436034b59decc9730bd9b6ff7f7fbed8055980061f1b637bcb1d20ad82
7
- data.tar.gz: 6d20e4c7121fa06898112862b467838f8008837b01e795241cbb744de43eb3e1f147c70ccdde94bbe4745363652041de91ed4341b55b36bcdc7f60b9c01cfca1
6
+ metadata.gz: 6eb68c98a026a0abb8fe3d1dd4a535ea5a3b4c17bc4ed52543c262ae440604b3515b80688ba06df53eb1e1cf9ff9228ad5fd92c3a55ba478291e7d7fa0fbdb16
7
+ data.tar.gz: 9ee6ad58b747ab1d4d9eee4b100cd506c8d0a3b751163c657203da8e4773320da915d0eca9583a5f29b91df0ef26f9696132e35cccd7701db28302e5900b0e0d
data/bin/news_crawler CHANGED
@@ -57,7 +57,7 @@ OptionParser.new do | opts |
57
57
  end
58
58
 
59
59
  opts.on('-t', "--time-out TIME", OptionParser::DecimalInteger,
60
- "Wait time (in sec) before stop crawl (crawler is\'nt stopped immediately but terminated gracefully)",
60
+ "Wait time (in sec) before stop crawl",
61
61
  "If time out isn't specified you can stop crawler by press Ctrl-C") do | t |
62
62
  options[:time_out] = t
63
63
  end
@@ -97,18 +97,16 @@ if options[:time_out]
97
97
  sleep(options[:time_out])
98
98
  else
99
99
  stop = false
100
- while(!stop)
100
+ while(not stop)
101
101
  Signal.trap("INT") do | signo |
102
102
  stop = true
103
103
  end
104
104
  end
105
105
  end
106
106
 
107
- puts "Stoping SDS"
108
- se.graceful_terminate
109
- se.terminate
110
- puts "SDS stopped"
111
107
  puts "Stoping Downloader"
112
- dwl.graceful_terminate
113
108
  dwl.terminate
114
109
  puts "Downloader stopped"
110
+ puts "Stoping SDS"
111
+ se.terminate
112
+ puts "SDS stopped"
@@ -42,18 +42,22 @@ module NewsCrawler
42
42
  # @param [ NewsCrawler::URLQueue ] queue url queue
43
43
  def initialize(start_on_create = true, queue = NewsCrawler::Storage::URLQueue, **opts)
44
44
  @queue = queue
45
- @urls = queue.find_unvisited
46
45
  @concurrent_download = opts[:concurrent] || CONCURRENT_DOWNLOAD
47
46
  @wait_time = 1
48
- @status = :running
49
47
  @stoping = false
48
+ get_new_url
50
49
  wait_for_url if start_on_create
51
50
  end
52
51
 
53
52
  # Start downloader with current queue
54
53
  # URL successed fetch is marked and result's stored in DB
55
54
  def run
56
- @status = :running
55
+ wait_for_url
56
+ end
57
+
58
+ private
59
+ # Download urls are given (in @urls)
60
+ def download
57
61
  hydra = Typhoeus::Hydra.new(max_concurrency: @concurrent_download)
58
62
  # TODO Log here
59
63
  @urls = @urls.keep_if do | url |
@@ -73,40 +77,40 @@ module NewsCrawler
73
77
  re
74
78
  end
75
79
  hydra.run
76
- @urls = []
77
- wait_for_url
78
- end
79
-
80
- # Graceful terminate this downloader
81
- def graceful_terminate
82
- @stoping = true
83
- while @status == :running
84
- sleep(1)
85
- end
86
80
  end
87
81
 
88
- private
89
82
  # Waiting for new urls're added to queue, using backoff algorithms
83
+ # Invoke download when suitable
90
84
  def wait_for_url
91
- @status = :waiting
92
- if @stoping # check for stop flag
93
- return
85
+ while not @stoping do
86
+ if @queuing_urls.size == 0
87
+ get_new_url
88
+ end
89
+ if @queuing_urls.size == 0
90
+ backoff_sleep
91
+ else
92
+ if @stoping
93
+ return
94
+ end
95
+ @wait_time = 1
96
+ @urls = @queuing_urls.shift(@concurrent_download * 2)
97
+ download
98
+ sleep 0.01 # delay to receive terminate signal
99
+ end
94
100
  end
101
+ end
102
+
103
+ # Sleep using backoff algorithm
104
+ # @params [ Fixnum ] seconds
105
+ def backoff_sleep
95
106
  sleep @wait_time
96
- get_new_url
97
- if @urls.size == 0
98
- if @wait_time < 30
99
- @wait_time = @wait_time * 2
100
- end
101
- wait_for_url
102
- else
103
- @wait_time = 1
104
- run
107
+ if @wait_time * 2 <= 4
108
+ @wait_time = @wait_time * 2
105
109
  end
106
110
  end
107
111
 
108
112
  def get_new_url
109
- @urls = @queue.find_unvisited
113
+ @queuing_urls = @queue.find_unvisited
110
114
  end
111
115
  end
112
116
  end
@@ -87,6 +87,7 @@ module NewsCrawler
87
87
  }
88
88
  end
89
89
 
90
+ # run selector
90
91
  def run
91
92
  @status = :running
92
93
  return if @stoping
@@ -94,7 +95,7 @@ module NewsCrawler
94
95
  @status = :stopped
95
96
  return
96
97
  end
97
- while !@stoping
98
+ while true do
98
99
  url = next_unprocessed(@max_depth - 1)
99
100
  while (url.nil?)
100
101
  wait_for_url
@@ -103,6 +104,7 @@ module NewsCrawler
103
104
  NCLogger.get_logger.info "Processing #{url}"
104
105
  extract_url(url)
105
106
  mark_processed(url)
107
+ sleep 0.01 # delay to receive terminate signal
106
108
  end
107
109
  end
108
110
 
@@ -154,20 +156,12 @@ module NewsCrawler
154
156
  return false
155
157
  end
156
158
 
157
- # Graceful terminate this selector
158
- def graceful_terminate
159
- @stoping = true
160
- while @status == :running
161
- sleep(1)
162
- end
163
- end
164
-
165
159
  private
166
160
  # Waiting for new urls're added to queue, using backoff algorithms
167
161
  def wait_for_url
168
162
  @status = :waiting
169
163
  sleep @wait_time
170
- if @wait_time < 30
164
+ if @wait_time < 4
171
165
  @wait_times = @wait_time * 2
172
166
  end
173
167
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: news_crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 1.0.0.pre.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Hà Quang Dương
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-08-12 00:00:00.000000000 Z
11
+ date: 2013-08-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mongo
@@ -195,9 +195,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
195
195
  version: 2.0.0
196
196
  required_rubygems_version: !ruby/object:Gem::Requirement
197
197
  requirements:
198
- - - '>='
198
+ - - '>'
199
199
  - !ruby/object:Gem::Version
200
- version: '0'
200
+ version: 1.3.1
201
201
  requirements: []
202
202
  rubyforge_project:
203
203
  rubygems_version: 2.0.3