apollo-crawler 0.1.25 → 0.1.26

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 79e9ecdfed577a1ce13b74b24d6d5bc26bf75843
4
- data.tar.gz: 6d93c6da6316d4666ddc5e434bab1caadc213ba3
3
+ metadata.gz: 5e6fa7213e0e7f81364c5dbda6a8f53def1fda6e
4
+ data.tar.gz: ecd79cd04f4a4331124b3910ef5e85da5f590692
5
5
  SHA512:
6
- metadata.gz: 863d10a255722bd53c9ee998e2886fd86d04cf7808284323d33f7da1fe77fc99ac0874f1e2a61f20c8656e039bc3bac2d2a9cef911e9b6e6d12266e91636b3bc
7
- data.tar.gz: db77a2d4606dcecbec1ae2e0d872cc41f6fea64012280e179b1147d8bbabfe7eef5446bf364978ee930f338cb367624e30a86e11fe0613e4499e03dcbc670e4b
6
+ metadata.gz: ae28d3adaca4125abdeee5ffbf73615a1efc6388e97882b67359846e81b6b58acca74fc08f59034c13f132892cca92b324876c20ab6680222aefc08b5e0d9ea1
7
+ data.tar.gz: f452e71e138696125affbaa3ae646775857bf7b99a06a0656a85e08881e66fdbe357cd31eac4bc5f3030aadbf0075b4f602f280c669d65b139b14bc20ab974ec
@@ -52,7 +52,7 @@ module Apollo
52
52
 
53
53
  doc = Apollo::Model::QueuedUrl.find(request["_id"])
54
54
  doc.update_attributes(msg['request'])
55
- doc.state = "fetched"
55
+ doc.state = :fetched
56
56
  doc.save
57
57
 
58
58
  doc = Apollo::Model::RawDocument.where(:url => request['url']).first
@@ -112,11 +112,20 @@ module Apollo
112
112
  declarations[:exchanges]["fetcher"].publish(url.to_json, :reply_to => "planner.fetched")
113
113
  end
114
114
 
115
+ def get_next_url(opts={})
116
+ Apollo::Model::QueuedUrl.where({:state => :queued}).find_and_modify({ "$set" => { state: :fetching }}, new: true)
117
+ end
118
+
115
119
  def fetch_queued_urls(opts={})
116
- while url = Apollo::Model::QueuedUrl.where({:state => :queued}).find_and_modify({ "$set" => { state: :fetching }}, new: true)
120
+ url = get_next_url(opts)
121
+
122
+ while url
123
+ puts url.inspect
117
124
  # puts "Count of URLs in Queue: #{url.count}" if opts[:verbose]
118
125
 
119
126
  fetch_url(url, opts)
127
+
128
+ url = get_next_url()
120
129
  end
121
130
  end
122
131
 
@@ -173,6 +173,16 @@ module Apollo
173
173
  return nil
174
174
  end
175
175
 
176
+ def requeue_fetching_urls(opts={})
177
+ urls = Apollo::Model::QueuedUrl.where(:state => :fetching)
178
+ urls.each do |url|
179
+ puts "Requeing '#{url.inspect}'" if opts[:verbose]
180
+
181
+ url.state = :queued
182
+ url.save
183
+ end
184
+ end
185
+
176
186
  # Run Program
177
187
  def run(args = ARGV)
178
188
  res = super(args)
@@ -180,6 +190,8 @@ module Apollo
180
190
 
181
191
  init_domains()
182
192
 
193
+ requeue_fetching_urls(self.options)
194
+
183
195
  # Here we start
184
196
  # if(ARGV.length < 1)
185
197
  # puts optparser
@@ -19,5 +19,5 @@
19
19
  # THE SOFTWARE.
20
20
 
21
21
  module Apollo
22
- VERSION = '0.1.25'
22
+ VERSION = '0.1.26'
23
23
  end # Apollo
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: apollo-crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.25
4
+ version: 0.1.26
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tomas Korcak