apollo-crawler 0.1.25 → 0.1.26
Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5e6fa7213e0e7f81364c5dbda6a8f53def1fda6e
|
4
|
+
data.tar.gz: ecd79cd04f4a4331124b3910ef5e85da5f590692
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ae28d3adaca4125abdeee5ffbf73615a1efc6388e97882b67359846e81b6b58acca74fc08f59034c13f132892cca92b324876c20ab6680222aefc08b5e0d9ea1
|
7
|
+
data.tar.gz: f452e71e138696125affbaa3ae646775857bf7b99a06a0656a85e08881e66fdbe357cd31eac4bc5f3030aadbf0075b4f602f280c669d65b139b14bc20ab974ec
|
@@ -52,7 +52,7 @@ module Apollo
|
|
52
52
|
|
53
53
|
doc = Apollo::Model::QueuedUrl.find(request["_id"])
|
54
54
|
doc.update_attributes(msg['request'])
|
55
|
-
doc.state =
|
55
|
+
doc.state = :fetched
|
56
56
|
doc.save
|
57
57
|
|
58
58
|
doc = Apollo::Model::RawDocument.where(:url => request['url']).first
|
@@ -112,11 +112,20 @@ module Apollo
|
|
112
112
|
declarations[:exchanges]["fetcher"].publish(url.to_json, :reply_to => "planner.fetched")
|
113
113
|
end
|
114
114
|
|
115
|
+
def get_next_url(opts={})
|
116
|
+
Apollo::Model::QueuedUrl.where({:state => :queued}).find_and_modify({ "$set" => { state: :fetching }}, new: true)
|
117
|
+
end
|
118
|
+
|
115
119
|
def fetch_queued_urls(opts={})
|
116
|
-
|
120
|
+
url = get_next_url(opts)
|
121
|
+
|
122
|
+
while url
|
123
|
+
puts url.inspect
|
117
124
|
# puts "Count of URLs in Queue: #{url.count}" if opts[:verbose]
|
118
125
|
|
119
126
|
fetch_url(url, opts)
|
127
|
+
|
128
|
+
url = get_next_url()
|
120
129
|
end
|
121
130
|
end
|
122
131
|
|
@@ -173,6 +173,16 @@ module Apollo
|
|
173
173
|
return nil
|
174
174
|
end
|
175
175
|
|
176
|
+
def requeue_fetching_urls(opts={})
|
177
|
+
urls = Apollo::Model::QueuedUrl.where(:state => :fetching)
|
178
|
+
urls.each do |url|
|
179
|
+
puts "Requeing '#{url.inspect}'" if opts[:verbose]
|
180
|
+
|
181
|
+
url.state = :queued
|
182
|
+
url.save
|
183
|
+
end
|
184
|
+
end
|
185
|
+
|
176
186
|
# Run Program
|
177
187
|
def run(args = ARGV)
|
178
188
|
res = super(args)
|
@@ -180,6 +190,8 @@ module Apollo
|
|
180
190
|
|
181
191
|
init_domains()
|
182
192
|
|
193
|
+
requeue_fetching_urls(self.options)
|
194
|
+
|
183
195
|
# Here we start
|
184
196
|
# if(ARGV.length < 1)
|
185
197
|
# puts optparser
|