apollo-crawler 0.1.25 → 0.1.26
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5e6fa7213e0e7f81364c5dbda6a8f53def1fda6e
|
4
|
+
data.tar.gz: ecd79cd04f4a4331124b3910ef5e85da5f590692
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ae28d3adaca4125abdeee5ffbf73615a1efc6388e97882b67359846e81b6b58acca74fc08f59034c13f132892cca92b324876c20ab6680222aefc08b5e0d9ea1
|
7
|
+
data.tar.gz: f452e71e138696125affbaa3ae646775857bf7b99a06a0656a85e08881e66fdbe357cd31eac4bc5f3030aadbf0075b4f602f280c669d65b139b14bc20ab974ec
|
@@ -52,7 +52,7 @@ module Apollo
|
|
52
52
|
|
53
53
|
doc = Apollo::Model::QueuedUrl.find(request["_id"])
|
54
54
|
doc.update_attributes(msg['request'])
|
55
|
-
doc.state =
|
55
|
+
doc.state = :fetched
|
56
56
|
doc.save
|
57
57
|
|
58
58
|
doc = Apollo::Model::RawDocument.where(:url => request['url']).first
|
@@ -112,11 +112,20 @@ module Apollo
|
|
112
112
|
declarations[:exchanges]["fetcher"].publish(url.to_json, :reply_to => "planner.fetched")
|
113
113
|
end
|
114
114
|
|
115
|
+
def get_next_url(opts={})
|
116
|
+
Apollo::Model::QueuedUrl.where({:state => :queued}).find_and_modify({ "$set" => { state: :fetching }}, new: true)
|
117
|
+
end
|
118
|
+
|
115
119
|
def fetch_queued_urls(opts={})
|
116
|
-
|
120
|
+
url = get_next_url(opts)
|
121
|
+
|
122
|
+
while url
|
123
|
+
puts url.inspect
|
117
124
|
# puts "Count of URLs in Queue: #{url.count}" if opts[:verbose]
|
118
125
|
|
119
126
|
fetch_url(url, opts)
|
127
|
+
|
128
|
+
url = get_next_url()
|
120
129
|
end
|
121
130
|
end
|
122
131
|
|
@@ -173,6 +173,16 @@ module Apollo
|
|
173
173
|
return nil
|
174
174
|
end
|
175
175
|
|
176
|
+
def requeue_fetching_urls(opts={})
|
177
|
+
urls = Apollo::Model::QueuedUrl.where(:state => :fetching)
|
178
|
+
urls.each do |url|
|
179
|
+
puts "Requeing '#{url.inspect}'" if opts[:verbose]
|
180
|
+
|
181
|
+
url.state = :queued
|
182
|
+
url.save
|
183
|
+
end
|
184
|
+
end
|
185
|
+
|
176
186
|
# Run Program
|
177
187
|
def run(args = ARGV)
|
178
188
|
res = super(args)
|
@@ -180,6 +190,8 @@ module Apollo
|
|
180
190
|
|
181
191
|
init_domains()
|
182
192
|
|
193
|
+
requeue_fetching_urls(self.options)
|
194
|
+
|
183
195
|
# Here we start
|
184
196
|
# if(ARGV.length < 1)
|
185
197
|
# puts optparser
|