cobweb 1.0.27 → 1.0.28

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 5cee480edd847679803dcc5e7b47f03523bd5775
4
- data.tar.gz: 37ebf10e098d2b46274a80a969962eadea9bd307
3
+ metadata.gz: 4ee0943ca3fabf5cb097b9d6edb324783cc155d6
4
+ data.tar.gz: d79eca877414244d94d5e66062a368bee4104368
5
5
  SHA512:
6
- metadata.gz: ea3ae531762c16268aac13babd9dbdb482bed8930819de4cdb013f268378eb729483512f181f85ad421a40b19ae2086011d46e6f459286cb314889450be7329c
7
- data.tar.gz: ca1bf5d58f6a242af1030fd2d5f83c28444f8eae47e12cb1c6938032687559824924ef313b52e276186c2c480137035258db5154fcf0205445a7b4c811644a14
6
+ metadata.gz: 420d1fe0daff99694de78846491f97357d42632a78cdc9a29953f95afe93223c40ecbf2fc09b34bf378321322ff9a7958f28a45b352e629d4750af56cb192a0f
7
+ data.tar.gz: e9e38c6a2c33ffff09e76c04e71f673512523a417b37b001a87142ffd8a01148b40140d1c98fb263b562e2278c0ebdb08e830a65467c41ad3fdb2ef654938012
@@ -1,4 +1,4 @@
1
- h1. Cobweb v1.0.27
1
+ h1. Cobweb v1.0.28
2
2
 
3
3
  "@cobweb_gem":https://twitter.com/cobweb_gem
4
4
  !https://badge.fury.io/rb/cobweb.png!:http://badge.fury.io/rb/cobweb
@@ -30,25 +30,6 @@ class CobwebCrawlHelper
30
30
  # set status as cancelled now so that we don't enqueue any further pages
31
31
  self.statistics.end_crawl(@data, true)
32
32
 
33
-
34
- if options[:crawl_finished_queue] && options[:queue_system] == :resque && RESQUE_INSTALLED
35
-
36
- additional_stats = {:crawl_id => id, :crawled_base_url => @stats.redis.get("crawled_base_url")}
37
- additional_stats[:redis_options] = @data[:redis_options] unless @data[:redis_options] == {}
38
- additional_stats[:source_id] = options[:source_id] unless options[:source_id].nil?
39
-
40
- Resque.enqueue(options[:crawl_finished_queue], @stats.get_statistics.merge(additional_stats))
41
- end
42
-
43
- if options[:crawl_finished_queue] && options[:queue_system] == :sidekiq && SIDEKIQ_INSTALLED
44
-
45
- additional_stats = {:crawl_id => id, :crawled_base_url => @stats.redis.get("crawled_base_url")}
46
- additional_stats[:redis_options] = @data[:redis_options] unless @data[:redis_options] == {}
47
- additional_stats[:source_id] = options[:source_id] unless options[:source_id].nil?
48
-
49
- Kernel.const_get(options[:crawl_finished_queue]).perform_async(@stats.get_statistics.merge(additional_stats))
50
- end
51
-
52
33
  counter = 0
53
34
  while(counter < 200) do
54
35
  break if self.statistics.get_status == CANCELLED
@@ -70,14 +51,43 @@ class CobwebCrawlHelper
70
51
  end
71
52
  end
72
53
  if options[:queue_system] == :sidekiq && SIDEKIQ_INSTALLED
73
- queue_name = Kernel.const_get(options[:processing_queue]).sidekiq_options_hash["queue"]
74
- queue = Sidekiq::Queue.new(queue_name)
54
+
55
+ puts "deleteing from crawl_worker"
56
+ queue = Sidekiq::Queue.new("crawl_worker")
57
+ queue.each do |job|
58
+ ap job.args # => [1, 2, 3]
59
+ job.delete if job.args[0]["crawl_id"] == id
60
+ end
61
+
62
+
63
+ process_queue_name = Kernel.const_get(options[:processing_queue]).sidekiq_options_hash["queue"]
64
+ puts "deleting from #{process_queue_name}"
65
+ queue = Sidekiq::Queue.new(process_queue_name)
75
66
  queue.each do |job|
76
- job.args # => [1, 2, 3]
67
+ ap job.args # => [1, 2, 3]
77
68
  job.delete if job.args[0]["crawl_id"] == id
78
69
  end
79
70
  end
80
71
 
72
+ if options[:crawl_finished_queue] && options[:queue_system] == :resque && RESQUE_INSTALLED
73
+
74
+ additional_stats = {:crawl_id => id, :crawled_base_url => @stats.redis.get("crawled_base_url")}
75
+ additional_stats[:redis_options] = @data[:redis_options] unless @data[:redis_options] == {}
76
+ additional_stats[:source_id] = options[:source_id] unless options[:source_id].nil?
77
+
78
+ Resque.enqueue(options[:crawl_finished_queue], @stats.get_statistics.merge(additional_stats))
79
+ end
80
+
81
+ if options[:crawl_finished_queue] && options[:queue_system] == :sidekiq && SIDEKIQ_INSTALLED
82
+
83
+ additional_stats = {:crawl_id => id, :crawled_base_url => @stats.redis.get("crawled_base_url")}
84
+ additional_stats[:redis_options] = @data[:redis_options] unless @data[:redis_options] == {}
85
+ additional_stats[:source_id] = options[:source_id] unless options[:source_id].nil?
86
+
87
+ Kernel.const_get(options[:crawl_finished_queue]).perform_async(@stats.get_statistics.merge(additional_stats))
88
+ end
89
+
90
+
81
91
  end
82
92
 
83
93
  def statistics
@@ -1,6 +1,5 @@
1
1
  require 'digest/md5'
2
2
  require 'date'
3
- require 'ap'
4
3
  require 'redis-namespace'
5
4
 
6
5
  # CobwebCrawler is a standalone crawler, it includes a built in statistics monitor using Sinatra.
@@ -3,7 +3,7 @@ class CobwebVersion
3
3
 
4
4
  # Returns a string of the current version
5
5
  def self.version
6
- "1.0.27"
6
+ "1.0.28"
7
7
  end
8
8
 
9
9
  end
@@ -30,6 +30,10 @@ module CobwebModule
30
30
  already_crawled?(link) || already_queued?(link) || already_running?(link)
31
31
  end
32
32
 
33
+ def cancelled?
34
+ @stats.get_statistics[:current_status] == "Cancelled"
35
+ end
36
+
33
37
  # Returns true if the crawl count is within limits
34
38
  def within_crawl_limits?
35
39
  @options[:crawl_limit].nil? || crawl_counter < @options[:crawl_limit].to_i
@@ -58,42 +62,47 @@ module CobwebModule
58
62
  end
59
63
 
60
64
  def retrieve
65
+ unless cancelled?
66
+ unless already_running? @options[:url]
67
+ unless already_crawled? @options[:url]
68
+ update_queues
69
+ if within_crawl_limits?
70
+ @redis.sadd("currently_running", @options[:url])
71
+ @stats.update_status("Retrieving #{@options[:url]}...")
72
+ @content = Cobweb.new(@options).get(@options[:url], @options)
73
+ update_counters
74
+
75
+ if @options[:url] == @redis.get("original_base_url")
76
+ @redis.set("crawled_base_url", @content[:base_url])
77
+ end
61
78
 
62
- unless already_running? @options[:url]
63
- unless already_crawled? @options[:url]
64
- update_queues
65
- if within_crawl_limits?
66
- @redis.sadd("currently_running", @options[:url])
67
- @stats.update_status("Retrieving #{@options[:url]}...")
68
- @content = Cobweb.new(@options).get(@options[:url], @options)
69
- update_counters
70
-
71
- if @options[:url] == @redis.get("original_base_url")
72
- @redis.set("crawled_base_url", @content[:base_url])
73
- end
74
-
75
- if content.permitted_type?
76
- ## update statistics
79
+ if content.permitted_type?
80
+ ## update statistics
77
81
 
78
- @stats.update_statistics(@content)
79
- return true
82
+ @stats.update_statistics(@content)
83
+ return true
84
+ end
85
+ else
86
+ puts "======================================="
87
+ puts "OUTWITH CRAWL LIMITS"
88
+ puts "======================================="
89
+ decrement_queue_counter
80
90
  end
81
91
  else
82
92
  puts "======================================="
83
- puts "OUTWITH CRAWL LIMITS"
93
+ puts "ALREADY CRAWLED"
84
94
  puts "======================================="
85
95
  decrement_queue_counter
86
96
  end
87
97
  else
88
- puts "======================================="
89
- puts "ALREADY CRAWLED"
90
- puts "======================================="
98
+ debug_puts "\n\nDETECTED DUPLICATE JOB for #{@options[:url]}\n"
99
+ debug_ap @redis.smembers("currently_running")
91
100
  decrement_queue_counter
92
101
  end
93
102
  else
94
- debug_puts "\n\nDETECTED DUPLICATE JOB for #{@options[:url]}\n"
95
- debug_ap @redis.smembers("currently_running")
96
- decrement_queue_counter
103
+ puts "======================================="
104
+ puts "CRAWL CANCELLED"
105
+ puts "======================================="
97
106
  end
98
107
  false
99
108
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.27
4
+ version: 1.0.28
5
5
  platform: ruby
6
6
  authors:
7
7
  - Stewart McKee
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-03-18 00:00:00.000000000 Z
11
+ date: 2015-03-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: redis