cobweb 1.0.27 → 1.0.28

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 5cee480edd847679803dcc5e7b47f03523bd5775
4
- data.tar.gz: 37ebf10e098d2b46274a80a969962eadea9bd307
3
+ metadata.gz: 4ee0943ca3fabf5cb097b9d6edb324783cc155d6
4
+ data.tar.gz: d79eca877414244d94d5e66062a368bee4104368
5
5
  SHA512:
6
- metadata.gz: ea3ae531762c16268aac13babd9dbdb482bed8930819de4cdb013f268378eb729483512f181f85ad421a40b19ae2086011d46e6f459286cb314889450be7329c
7
- data.tar.gz: ca1bf5d58f6a242af1030fd2d5f83c28444f8eae47e12cb1c6938032687559824924ef313b52e276186c2c480137035258db5154fcf0205445a7b4c811644a14
6
+ metadata.gz: 420d1fe0daff99694de78846491f97357d42632a78cdc9a29953f95afe93223c40ecbf2fc09b34bf378321322ff9a7958f28a45b352e629d4750af56cb192a0f
7
+ data.tar.gz: e9e38c6a2c33ffff09e76c04e71f673512523a417b37b001a87142ffd8a01148b40140d1c98fb263b562e2278c0ebdb08e830a65467c41ad3fdb2ef654938012
@@ -1,4 +1,4 @@
1
- h1. Cobweb v1.0.27
1
+ h1. Cobweb v1.0.28
2
2
 
3
3
  "@cobweb_gem":https://twitter.com/cobweb_gem
4
4
  !https://badge.fury.io/rb/cobweb.png!:http://badge.fury.io/rb/cobweb
@@ -30,25 +30,6 @@ class CobwebCrawlHelper
30
30
  # set status as cancelled now so that we don't enqueue any further pages
31
31
  self.statistics.end_crawl(@data, true)
32
32
 
33
-
34
- if options[:crawl_finished_queue] && options[:queue_system] == :resque && RESQUE_INSTALLED
35
-
36
- additional_stats = {:crawl_id => id, :crawled_base_url => @stats.redis.get("crawled_base_url")}
37
- additional_stats[:redis_options] = @data[:redis_options] unless @data[:redis_options] == {}
38
- additional_stats[:source_id] = options[:source_id] unless options[:source_id].nil?
39
-
40
- Resque.enqueue(options[:crawl_finished_queue], @stats.get_statistics.merge(additional_stats))
41
- end
42
-
43
- if options[:crawl_finished_queue] && options[:queue_system] == :sidekiq && SIDEKIQ_INSTALLED
44
-
45
- additional_stats = {:crawl_id => id, :crawled_base_url => @stats.redis.get("crawled_base_url")}
46
- additional_stats[:redis_options] = @data[:redis_options] unless @data[:redis_options] == {}
47
- additional_stats[:source_id] = options[:source_id] unless options[:source_id].nil?
48
-
49
- Kernel.const_get(options[:crawl_finished_queue]).perform_async(@stats.get_statistics.merge(additional_stats))
50
- end
51
-
52
33
  counter = 0
53
34
  while(counter < 200) do
54
35
  break if self.statistics.get_status == CANCELLED
@@ -70,14 +51,43 @@ class CobwebCrawlHelper
70
51
  end
71
52
  end
72
53
  if options[:queue_system] == :sidekiq && SIDEKIQ_INSTALLED
73
- queue_name = Kernel.const_get(options[:processing_queue]).sidekiq_options_hash["queue"]
74
- queue = Sidekiq::Queue.new(queue_name)
54
+
55
+ puts "deleteing from crawl_worker"
56
+ queue = Sidekiq::Queue.new("crawl_worker")
57
+ queue.each do |job|
58
+ ap job.args # => [1, 2, 3]
59
+ job.delete if job.args[0]["crawl_id"] == id
60
+ end
61
+
62
+
63
+ process_queue_name = Kernel.const_get(options[:processing_queue]).sidekiq_options_hash["queue"]
64
+ puts "deleting from #{process_queue_name}"
65
+ queue = Sidekiq::Queue.new(process_queue_name)
75
66
  queue.each do |job|
76
- job.args # => [1, 2, 3]
67
+ ap job.args # => [1, 2, 3]
77
68
  job.delete if job.args[0]["crawl_id"] == id
78
69
  end
79
70
  end
80
71
 
72
+ if options[:crawl_finished_queue] && options[:queue_system] == :resque && RESQUE_INSTALLED
73
+
74
+ additional_stats = {:crawl_id => id, :crawled_base_url => @stats.redis.get("crawled_base_url")}
75
+ additional_stats[:redis_options] = @data[:redis_options] unless @data[:redis_options] == {}
76
+ additional_stats[:source_id] = options[:source_id] unless options[:source_id].nil?
77
+
78
+ Resque.enqueue(options[:crawl_finished_queue], @stats.get_statistics.merge(additional_stats))
79
+ end
80
+
81
+ if options[:crawl_finished_queue] && options[:queue_system] == :sidekiq && SIDEKIQ_INSTALLED
82
+
83
+ additional_stats = {:crawl_id => id, :crawled_base_url => @stats.redis.get("crawled_base_url")}
84
+ additional_stats[:redis_options] = @data[:redis_options] unless @data[:redis_options] == {}
85
+ additional_stats[:source_id] = options[:source_id] unless options[:source_id].nil?
86
+
87
+ Kernel.const_get(options[:crawl_finished_queue]).perform_async(@stats.get_statistics.merge(additional_stats))
88
+ end
89
+
90
+
81
91
  end
82
92
 
83
93
  def statistics
@@ -1,6 +1,5 @@
1
1
  require 'digest/md5'
2
2
  require 'date'
3
- require 'ap'
4
3
  require 'redis-namespace'
5
4
 
6
5
  # CobwebCrawler is a standalone crawler, it includes a built in statistics monitor using Sinatra.
@@ -3,7 +3,7 @@ class CobwebVersion
3
3
 
4
4
  # Returns a string of the current version
5
5
  def self.version
6
- "1.0.27"
6
+ "1.0.28"
7
7
  end
8
8
 
9
9
  end
@@ -30,6 +30,10 @@ module CobwebModule
30
30
  already_crawled?(link) || already_queued?(link) || already_running?(link)
31
31
  end
32
32
 
33
+ def cancelled?
34
+ @stats.get_statistics[:current_status] == "Cancelled"
35
+ end
36
+
33
37
  # Returns true if the crawl count is within limits
34
38
  def within_crawl_limits?
35
39
  @options[:crawl_limit].nil? || crawl_counter < @options[:crawl_limit].to_i
@@ -58,42 +62,47 @@ module CobwebModule
58
62
  end
59
63
 
60
64
  def retrieve
65
+ unless cancelled?
66
+ unless already_running? @options[:url]
67
+ unless already_crawled? @options[:url]
68
+ update_queues
69
+ if within_crawl_limits?
70
+ @redis.sadd("currently_running", @options[:url])
71
+ @stats.update_status("Retrieving #{@options[:url]}...")
72
+ @content = Cobweb.new(@options).get(@options[:url], @options)
73
+ update_counters
74
+
75
+ if @options[:url] == @redis.get("original_base_url")
76
+ @redis.set("crawled_base_url", @content[:base_url])
77
+ end
61
78
 
62
- unless already_running? @options[:url]
63
- unless already_crawled? @options[:url]
64
- update_queues
65
- if within_crawl_limits?
66
- @redis.sadd("currently_running", @options[:url])
67
- @stats.update_status("Retrieving #{@options[:url]}...")
68
- @content = Cobweb.new(@options).get(@options[:url], @options)
69
- update_counters
70
-
71
- if @options[:url] == @redis.get("original_base_url")
72
- @redis.set("crawled_base_url", @content[:base_url])
73
- end
74
-
75
- if content.permitted_type?
76
- ## update statistics
79
+ if content.permitted_type?
80
+ ## update statistics
77
81
 
78
- @stats.update_statistics(@content)
79
- return true
82
+ @stats.update_statistics(@content)
83
+ return true
84
+ end
85
+ else
86
+ puts "======================================="
87
+ puts "OUTWITH CRAWL LIMITS"
88
+ puts "======================================="
89
+ decrement_queue_counter
80
90
  end
81
91
  else
82
92
  puts "======================================="
83
- puts "OUTWITH CRAWL LIMITS"
93
+ puts "ALREADY CRAWLED"
84
94
  puts "======================================="
85
95
  decrement_queue_counter
86
96
  end
87
97
  else
88
- puts "======================================="
89
- puts "ALREADY CRAWLED"
90
- puts "======================================="
98
+ debug_puts "\n\nDETECTED DUPLICATE JOB for #{@options[:url]}\n"
99
+ debug_ap @redis.smembers("currently_running")
91
100
  decrement_queue_counter
92
101
  end
93
102
  else
94
- debug_puts "\n\nDETECTED DUPLICATE JOB for #{@options[:url]}\n"
95
- debug_ap @redis.smembers("currently_running")
96
- decrement_queue_counter
103
+ puts "======================================="
104
+ puts "CRAWL CANCELLED"
105
+ puts "======================================="
97
106
  end
98
107
  false
99
108
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.27
4
+ version: 1.0.28
5
5
  platform: ruby
6
6
  authors:
7
7
  - Stewart McKee
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-03-18 00:00:00.000000000 Z
11
+ date: 2015-03-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: redis