cobweb 1.0.27 → 1.0.28
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.textile +1 -1
- data/lib/cobweb_crawl_helper.rb +32 -22
- data/lib/cobweb_crawler.rb +0 -1
- data/lib/cobweb_version.rb +1 -1
- data/lib/crawl.rb +33 -24
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4ee0943ca3fabf5cb097b9d6edb324783cc155d6
|
4
|
+
data.tar.gz: d79eca877414244d94d5e66062a368bee4104368
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 420d1fe0daff99694de78846491f97357d42632a78cdc9a29953f95afe93223c40ecbf2fc09b34bf378321322ff9a7958f28a45b352e629d4750af56cb192a0f
|
7
|
+
data.tar.gz: e9e38c6a2c33ffff09e76c04e71f673512523a417b37b001a87142ffd8a01148b40140d1c98fb263b562e2278c0ebdb08e830a65467c41ad3fdb2ef654938012
|
data/README.textile
CHANGED
data/lib/cobweb_crawl_helper.rb
CHANGED
@@ -30,25 +30,6 @@ class CobwebCrawlHelper
|
|
30
30
|
# set status as cancelled now so that we don't enqueue any further pages
|
31
31
|
self.statistics.end_crawl(@data, true)
|
32
32
|
|
33
|
-
|
34
|
-
if options[:crawl_finished_queue] && options[:queue_system] == :resque && RESQUE_INSTALLED
|
35
|
-
|
36
|
-
additional_stats = {:crawl_id => id, :crawled_base_url => @stats.redis.get("crawled_base_url")}
|
37
|
-
additional_stats[:redis_options] = @data[:redis_options] unless @data[:redis_options] == {}
|
38
|
-
additional_stats[:source_id] = options[:source_id] unless options[:source_id].nil?
|
39
|
-
|
40
|
-
Resque.enqueue(options[:crawl_finished_queue], @stats.get_statistics.merge(additional_stats))
|
41
|
-
end
|
42
|
-
|
43
|
-
if options[:crawl_finished_queue] && options[:queue_system] == :sidekiq && SIDEKIQ_INSTALLED
|
44
|
-
|
45
|
-
additional_stats = {:crawl_id => id, :crawled_base_url => @stats.redis.get("crawled_base_url")}
|
46
|
-
additional_stats[:redis_options] = @data[:redis_options] unless @data[:redis_options] == {}
|
47
|
-
additional_stats[:source_id] = options[:source_id] unless options[:source_id].nil?
|
48
|
-
|
49
|
-
Kernel.const_get(options[:crawl_finished_queue]).perform_async(@stats.get_statistics.merge(additional_stats))
|
50
|
-
end
|
51
|
-
|
52
33
|
counter = 0
|
53
34
|
while(counter < 200) do
|
54
35
|
break if self.statistics.get_status == CANCELLED
|
@@ -70,14 +51,43 @@ class CobwebCrawlHelper
|
|
70
51
|
end
|
71
52
|
end
|
72
53
|
if options[:queue_system] == :sidekiq && SIDEKIQ_INSTALLED
|
73
|
-
|
74
|
-
|
54
|
+
|
55
|
+
puts "deleteing from crawl_worker"
|
56
|
+
queue = Sidekiq::Queue.new("crawl_worker")
|
57
|
+
queue.each do |job|
|
58
|
+
ap job.args # => [1, 2, 3]
|
59
|
+
job.delete if job.args[0]["crawl_id"] == id
|
60
|
+
end
|
61
|
+
|
62
|
+
|
63
|
+
process_queue_name = Kernel.const_get(options[:processing_queue]).sidekiq_options_hash["queue"]
|
64
|
+
puts "deleting from #{process_queue_name}"
|
65
|
+
queue = Sidekiq::Queue.new(process_queue_name)
|
75
66
|
queue.each do |job|
|
76
|
-
job.args # => [1, 2, 3]
|
67
|
+
ap job.args # => [1, 2, 3]
|
77
68
|
job.delete if job.args[0]["crawl_id"] == id
|
78
69
|
end
|
79
70
|
end
|
80
71
|
|
72
|
+
if options[:crawl_finished_queue] && options[:queue_system] == :resque && RESQUE_INSTALLED
|
73
|
+
|
74
|
+
additional_stats = {:crawl_id => id, :crawled_base_url => @stats.redis.get("crawled_base_url")}
|
75
|
+
additional_stats[:redis_options] = @data[:redis_options] unless @data[:redis_options] == {}
|
76
|
+
additional_stats[:source_id] = options[:source_id] unless options[:source_id].nil?
|
77
|
+
|
78
|
+
Resque.enqueue(options[:crawl_finished_queue], @stats.get_statistics.merge(additional_stats))
|
79
|
+
end
|
80
|
+
|
81
|
+
if options[:crawl_finished_queue] && options[:queue_system] == :sidekiq && SIDEKIQ_INSTALLED
|
82
|
+
|
83
|
+
additional_stats = {:crawl_id => id, :crawled_base_url => @stats.redis.get("crawled_base_url")}
|
84
|
+
additional_stats[:redis_options] = @data[:redis_options] unless @data[:redis_options] == {}
|
85
|
+
additional_stats[:source_id] = options[:source_id] unless options[:source_id].nil?
|
86
|
+
|
87
|
+
Kernel.const_get(options[:crawl_finished_queue]).perform_async(@stats.get_statistics.merge(additional_stats))
|
88
|
+
end
|
89
|
+
|
90
|
+
|
81
91
|
end
|
82
92
|
|
83
93
|
def statistics
|
data/lib/cobweb_crawler.rb
CHANGED
data/lib/cobweb_version.rb
CHANGED
data/lib/crawl.rb
CHANGED
@@ -30,6 +30,10 @@ module CobwebModule
|
|
30
30
|
already_crawled?(link) || already_queued?(link) || already_running?(link)
|
31
31
|
end
|
32
32
|
|
33
|
+
def cancelled?
|
34
|
+
@stats.get_statistics[:current_status] == "Cancelled"
|
35
|
+
end
|
36
|
+
|
33
37
|
# Returns true if the crawl count is within limits
|
34
38
|
def within_crawl_limits?
|
35
39
|
@options[:crawl_limit].nil? || crawl_counter < @options[:crawl_limit].to_i
|
@@ -58,42 +62,47 @@ module CobwebModule
|
|
58
62
|
end
|
59
63
|
|
60
64
|
def retrieve
|
65
|
+
unless cancelled?
|
66
|
+
unless already_running? @options[:url]
|
67
|
+
unless already_crawled? @options[:url]
|
68
|
+
update_queues
|
69
|
+
if within_crawl_limits?
|
70
|
+
@redis.sadd("currently_running", @options[:url])
|
71
|
+
@stats.update_status("Retrieving #{@options[:url]}...")
|
72
|
+
@content = Cobweb.new(@options).get(@options[:url], @options)
|
73
|
+
update_counters
|
74
|
+
|
75
|
+
if @options[:url] == @redis.get("original_base_url")
|
76
|
+
@redis.set("crawled_base_url", @content[:base_url])
|
77
|
+
end
|
61
78
|
|
62
|
-
|
63
|
-
|
64
|
-
update_queues
|
65
|
-
if within_crawl_limits?
|
66
|
-
@redis.sadd("currently_running", @options[:url])
|
67
|
-
@stats.update_status("Retrieving #{@options[:url]}...")
|
68
|
-
@content = Cobweb.new(@options).get(@options[:url], @options)
|
69
|
-
update_counters
|
70
|
-
|
71
|
-
if @options[:url] == @redis.get("original_base_url")
|
72
|
-
@redis.set("crawled_base_url", @content[:base_url])
|
73
|
-
end
|
74
|
-
|
75
|
-
if content.permitted_type?
|
76
|
-
## update statistics
|
79
|
+
if content.permitted_type?
|
80
|
+
## update statistics
|
77
81
|
|
78
|
-
|
79
|
-
|
82
|
+
@stats.update_statistics(@content)
|
83
|
+
return true
|
84
|
+
end
|
85
|
+
else
|
86
|
+
puts "======================================="
|
87
|
+
puts "OUTWITH CRAWL LIMITS"
|
88
|
+
puts "======================================="
|
89
|
+
decrement_queue_counter
|
80
90
|
end
|
81
91
|
else
|
82
92
|
puts "======================================="
|
83
|
-
puts "
|
93
|
+
puts "ALREADY CRAWLED"
|
84
94
|
puts "======================================="
|
85
95
|
decrement_queue_counter
|
86
96
|
end
|
87
97
|
else
|
88
|
-
|
89
|
-
|
90
|
-
puts "======================================="
|
98
|
+
debug_puts "\n\nDETECTED DUPLICATE JOB for #{@options[:url]}\n"
|
99
|
+
debug_ap @redis.smembers("currently_running")
|
91
100
|
decrement_queue_counter
|
92
101
|
end
|
93
102
|
else
|
94
|
-
|
95
|
-
|
96
|
-
|
103
|
+
puts "======================================="
|
104
|
+
puts "CRAWL CANCELLED"
|
105
|
+
puts "======================================="
|
97
106
|
end
|
98
107
|
false
|
99
108
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cobweb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.27
|
4
|
+
version: 1.0.28
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Stewart McKee
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-03-
|
11
|
+
date: 2015-03-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: redis
|