cobweb 1.0.27 → 1.0.28
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.textile +1 -1
- data/lib/cobweb_crawl_helper.rb +32 -22
- data/lib/cobweb_crawler.rb +0 -1
- data/lib/cobweb_version.rb +1 -1
- data/lib/crawl.rb +33 -24
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4ee0943ca3fabf5cb097b9d6edb324783cc155d6
|
4
|
+
data.tar.gz: d79eca877414244d94d5e66062a368bee4104368
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 420d1fe0daff99694de78846491f97357d42632a78cdc9a29953f95afe93223c40ecbf2fc09b34bf378321322ff9a7958f28a45b352e629d4750af56cb192a0f
|
7
|
+
data.tar.gz: e9e38c6a2c33ffff09e76c04e71f673512523a417b37b001a87142ffd8a01148b40140d1c98fb263b562e2278c0ebdb08e830a65467c41ad3fdb2ef654938012
|
data/README.textile
CHANGED
data/lib/cobweb_crawl_helper.rb
CHANGED
@@ -30,25 +30,6 @@ class CobwebCrawlHelper
|
|
30
30
|
# set status as cancelled now so that we don't enqueue any further pages
|
31
31
|
self.statistics.end_crawl(@data, true)
|
32
32
|
|
33
|
-
|
34
|
-
if options[:crawl_finished_queue] && options[:queue_system] == :resque && RESQUE_INSTALLED
|
35
|
-
|
36
|
-
additional_stats = {:crawl_id => id, :crawled_base_url => @stats.redis.get("crawled_base_url")}
|
37
|
-
additional_stats[:redis_options] = @data[:redis_options] unless @data[:redis_options] == {}
|
38
|
-
additional_stats[:source_id] = options[:source_id] unless options[:source_id].nil?
|
39
|
-
|
40
|
-
Resque.enqueue(options[:crawl_finished_queue], @stats.get_statistics.merge(additional_stats))
|
41
|
-
end
|
42
|
-
|
43
|
-
if options[:crawl_finished_queue] && options[:queue_system] == :sidekiq && SIDEKIQ_INSTALLED
|
44
|
-
|
45
|
-
additional_stats = {:crawl_id => id, :crawled_base_url => @stats.redis.get("crawled_base_url")}
|
46
|
-
additional_stats[:redis_options] = @data[:redis_options] unless @data[:redis_options] == {}
|
47
|
-
additional_stats[:source_id] = options[:source_id] unless options[:source_id].nil?
|
48
|
-
|
49
|
-
Kernel.const_get(options[:crawl_finished_queue]).perform_async(@stats.get_statistics.merge(additional_stats))
|
50
|
-
end
|
51
|
-
|
52
33
|
counter = 0
|
53
34
|
while(counter < 200) do
|
54
35
|
break if self.statistics.get_status == CANCELLED
|
@@ -70,14 +51,43 @@ class CobwebCrawlHelper
|
|
70
51
|
end
|
71
52
|
end
|
72
53
|
if options[:queue_system] == :sidekiq && SIDEKIQ_INSTALLED
|
73
|
-
|
74
|
-
|
54
|
+
|
55
|
+
puts "deleteing from crawl_worker"
|
56
|
+
queue = Sidekiq::Queue.new("crawl_worker")
|
57
|
+
queue.each do |job|
|
58
|
+
ap job.args # => [1, 2, 3]
|
59
|
+
job.delete if job.args[0]["crawl_id"] == id
|
60
|
+
end
|
61
|
+
|
62
|
+
|
63
|
+
process_queue_name = Kernel.const_get(options[:processing_queue]).sidekiq_options_hash["queue"]
|
64
|
+
puts "deleting from #{process_queue_name}"
|
65
|
+
queue = Sidekiq::Queue.new(process_queue_name)
|
75
66
|
queue.each do |job|
|
76
|
-
job.args # => [1, 2, 3]
|
67
|
+
ap job.args # => [1, 2, 3]
|
77
68
|
job.delete if job.args[0]["crawl_id"] == id
|
78
69
|
end
|
79
70
|
end
|
80
71
|
|
72
|
+
if options[:crawl_finished_queue] && options[:queue_system] == :resque && RESQUE_INSTALLED
|
73
|
+
|
74
|
+
additional_stats = {:crawl_id => id, :crawled_base_url => @stats.redis.get("crawled_base_url")}
|
75
|
+
additional_stats[:redis_options] = @data[:redis_options] unless @data[:redis_options] == {}
|
76
|
+
additional_stats[:source_id] = options[:source_id] unless options[:source_id].nil?
|
77
|
+
|
78
|
+
Resque.enqueue(options[:crawl_finished_queue], @stats.get_statistics.merge(additional_stats))
|
79
|
+
end
|
80
|
+
|
81
|
+
if options[:crawl_finished_queue] && options[:queue_system] == :sidekiq && SIDEKIQ_INSTALLED
|
82
|
+
|
83
|
+
additional_stats = {:crawl_id => id, :crawled_base_url => @stats.redis.get("crawled_base_url")}
|
84
|
+
additional_stats[:redis_options] = @data[:redis_options] unless @data[:redis_options] == {}
|
85
|
+
additional_stats[:source_id] = options[:source_id] unless options[:source_id].nil?
|
86
|
+
|
87
|
+
Kernel.const_get(options[:crawl_finished_queue]).perform_async(@stats.get_statistics.merge(additional_stats))
|
88
|
+
end
|
89
|
+
|
90
|
+
|
81
91
|
end
|
82
92
|
|
83
93
|
def statistics
|
data/lib/cobweb_crawler.rb
CHANGED
data/lib/cobweb_version.rb
CHANGED
data/lib/crawl.rb
CHANGED
@@ -30,6 +30,10 @@ module CobwebModule
|
|
30
30
|
already_crawled?(link) || already_queued?(link) || already_running?(link)
|
31
31
|
end
|
32
32
|
|
33
|
+
def cancelled?
|
34
|
+
@stats.get_statistics[:current_status] == "Cancelled"
|
35
|
+
end
|
36
|
+
|
33
37
|
# Returns true if the crawl count is within limits
|
34
38
|
def within_crawl_limits?
|
35
39
|
@options[:crawl_limit].nil? || crawl_counter < @options[:crawl_limit].to_i
|
@@ -58,42 +62,47 @@ module CobwebModule
|
|
58
62
|
end
|
59
63
|
|
60
64
|
def retrieve
|
65
|
+
unless cancelled?
|
66
|
+
unless already_running? @options[:url]
|
67
|
+
unless already_crawled? @options[:url]
|
68
|
+
update_queues
|
69
|
+
if within_crawl_limits?
|
70
|
+
@redis.sadd("currently_running", @options[:url])
|
71
|
+
@stats.update_status("Retrieving #{@options[:url]}...")
|
72
|
+
@content = Cobweb.new(@options).get(@options[:url], @options)
|
73
|
+
update_counters
|
74
|
+
|
75
|
+
if @options[:url] == @redis.get("original_base_url")
|
76
|
+
@redis.set("crawled_base_url", @content[:base_url])
|
77
|
+
end
|
61
78
|
|
62
|
-
|
63
|
-
|
64
|
-
update_queues
|
65
|
-
if within_crawl_limits?
|
66
|
-
@redis.sadd("currently_running", @options[:url])
|
67
|
-
@stats.update_status("Retrieving #{@options[:url]}...")
|
68
|
-
@content = Cobweb.new(@options).get(@options[:url], @options)
|
69
|
-
update_counters
|
70
|
-
|
71
|
-
if @options[:url] == @redis.get("original_base_url")
|
72
|
-
@redis.set("crawled_base_url", @content[:base_url])
|
73
|
-
end
|
74
|
-
|
75
|
-
if content.permitted_type?
|
76
|
-
## update statistics
|
79
|
+
if content.permitted_type?
|
80
|
+
## update statistics
|
77
81
|
|
78
|
-
|
79
|
-
|
82
|
+
@stats.update_statistics(@content)
|
83
|
+
return true
|
84
|
+
end
|
85
|
+
else
|
86
|
+
puts "======================================="
|
87
|
+
puts "OUTWITH CRAWL LIMITS"
|
88
|
+
puts "======================================="
|
89
|
+
decrement_queue_counter
|
80
90
|
end
|
81
91
|
else
|
82
92
|
puts "======================================="
|
83
|
-
puts "
|
93
|
+
puts "ALREADY CRAWLED"
|
84
94
|
puts "======================================="
|
85
95
|
decrement_queue_counter
|
86
96
|
end
|
87
97
|
else
|
88
|
-
|
89
|
-
|
90
|
-
puts "======================================="
|
98
|
+
debug_puts "\n\nDETECTED DUPLICATE JOB for #{@options[:url]}\n"
|
99
|
+
debug_ap @redis.smembers("currently_running")
|
91
100
|
decrement_queue_counter
|
92
101
|
end
|
93
102
|
else
|
94
|
-
|
95
|
-
|
96
|
-
|
103
|
+
puts "======================================="
|
104
|
+
puts "CRAWL CANCELLED"
|
105
|
+
puts "======================================="
|
97
106
|
end
|
98
107
|
false
|
99
108
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cobweb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.27
|
4
|
+
version: 1.0.28
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Stewart McKee
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-03-
|
11
|
+
date: 2015-03-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: redis
|