cobweb 1.0.28 → 1.0.29

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -52,19 +52,15 @@ class CobwebCrawlHelper
  end
  if options[:queue_system] == :sidekiq && SIDEKIQ_INSTALLED
 
- puts "deleteing from crawl_worker"
  queue = Sidekiq::Queue.new("crawl_worker")
  queue.each do |job|
- ap job.args # => [1, 2, 3]
  job.delete if job.args[0]["crawl_id"] == id
  end
 
 
  process_queue_name = Kernel.const_get(options[:processing_queue]).sidekiq_options_hash["queue"]
- puts "deleting from #{process_queue_name}"
  queue = Sidekiq::Queue.new(process_queue_name)
  queue.each do |job|
- ap job.args # => [1, 2, 3]
  job.delete if job.args[0]["crawl_id"] == id
  end
  end
@@ -102,4 +98,4 @@ class CobwebCrawlHelper
  @data[:crawl_id]
  end
 
- end
+ end
@@ -3,7 +3,7 @@ class CobwebVersion
 
  # Returns a string of the current version
  def self.version
- "1.0.28"
+ "1.0.29"
  end
 
- end
+ end
@@ -14,31 +14,31 @@ require File.expand_path(File.dirname(__FILE__) + '/sidekiq/cobweb_helper')
  class CrawlWorker
  include Sidekiq::Worker
  sidekiq_options :queue => "crawl_worker", :retry => false if SIDEKIQ_INSTALLED
-
+
  def perform(content_request)
  puts "Performing for #{content_request["url"]}"
  # setup the crawl class to manage the crawl of this object
  @crawl = CobwebModule::Crawl.new(content_request)
-
+
  # update the counters and then perform the get, returns false if we are outwith limits
  if @crawl.retrieve
-
+
  # if the crawled object is an object type we are interested
  if @crawl.content.permitted_type?
-
+
  # extract links from content and process them if we are still within queue limits (block will not run if we are outwith limits)
  @crawl.process_links do |link|
  @crawl.lock("queue_links") do
  if @crawl.within_crawl_limits? && !@crawl.already_handled?(link)
  # enqueue the links to sidekiq
- @crawl.debug_puts "QUEUED LINK: #{link}"
+ @crawl.debug_puts "QUEUED LINK: #{link}"
  enqueue_content(content_request, link)
  end
  end
  end
-
+
  if @crawl.to_be_processed?
-
+
  @crawl.process do
 
  # enqueue to processing queue
@@ -51,17 +51,17 @@ class CrawlWorker
  current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
  enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
  end
-
+
  end
  else
  @crawl.debug_puts "@crawl.finished? #{@crawl.finished?}"
  @crawl.debug_puts "@crawl.within_crawl_limits? #{@crawl.within_crawl_limits?}"
  @crawl.debug_puts "@crawl.first_to_finish? #{@crawl.first_to_finish?}"
  end
-
+
  end
  end
-
+
  #@crawl.lock("finished") do
  # let the crawl know we're finished with this object
  @crawl.finished_processing
@@ -79,7 +79,7 @@ class CrawlWorker
  conn.smembers(get_sidekiq_options[:queue]).count
  end
  end
-
+
 
  # Sets the crawl status to CobwebCrawlHelper::FINISHED and enqueues the crawl finished job
  def finished(content_request)
@@ -93,7 +93,7 @@ class CrawlWorker
  @crawl.redis.incr("crawl_finished_enqueued_count")
  content_request[:crawl_finished_queue].constantize.perform_async(@crawl.statistics.merge(additional_stats))
  end
-
+
  # Enqueues the content to the processing queue setup in options
  def send_to_processing_queue(content, content_request)
  content_to_send = content.merge({:internal_urls => content_request[:internal_urls], :redis_options => content_request[:redis_options], :source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]})
@@ -110,7 +110,7 @@ class CrawlWorker
  end
 
  private
-
+
  # Enqueues content to the crawl_job queue
  def enqueue_content(content_request, link)
  new_request = content_request.clone
@@ -119,4 +119,4 @@ class CrawlWorker
  CrawlWorker.perform_async(new_request)
  end
 
- end
+ end
@@ -35,7 +35,7 @@ class ExportCommand
  Dir.mkdir(options[:root_path]) unless File.exist?(options[:root_path])
 
  uri.path.split("/")[0..-2].each do |dir|
- path+="/" unless path.ends_with?("/")
+ path+="/" unless path.cobweb_ends_with?("/")
  path+=dir
  if File.exist?(options[:root_path] + path) && !File.directory?(options[:root_path] + path)
  FileUtils.mv(options[:root_path] + path, options[:root_path] + path + ".tmp")
@@ -45,7 +45,7 @@ class ExportCommand
  Dir.mkdir(options[:root_path] + path) unless Dir.exist?(options[:root_path] + path)
  end
  end
- path += "/" unless path.ends_with?("/")
+ path += "/" unless path.cobweb_ends_with?("/")
  filename = uri.path.split("/")[-1]
  if filename.nil? || filename.empty?
  filename = "index.html"
@@ -82,4 +82,4 @@ class ExportCommand
  end
 
  end
- end
+ end
@@ -22,7 +22,7 @@ class ReportCommand
  CSV.open(options[:output], "wb", :force_quotes => true) do |csv|
 
  statistics = @crawler.crawl(options[:url]) do |page|
- puts "Reporting on #{page[:url]}"
+ puts "Reporting on #{page[:url]} [#{page[:status_code]}]"
  @doc = page[:body]
  page["link_rel"] = scope.link_tag_with_rel("canonical")["href"]
  page["title"] = scope.head_tag.title_tag.contents
@@ -1,12 +1,7 @@
  class String
 
- # add ends_with? support if method is missing
- def method_missing(m, *args, &block)
- if m == :ends_with?
- suffix = args[0]
- suffix.respond_to?(:to_str) && self[-suffix.length, suffix.length] == suffix
- else
- super
- end
+ def cobweb_ends_with?(val)
+ suffix = val
+ suffix.respond_to?(:to_str) && self[-suffix.length, suffix.length] == suffix
  end
- end
+ end
@@ -4,9 +4,9 @@ describe CobwebCrawler do
 
  before(:each) do
  pending("thin not installed") unless THIN_INSTALLED
-
+
  @base_url = "http://localhost:3532/"
-
+
  @default_headers = {"Cache-Control" => "private, max-age=0",
  "Date" => "Wed, 10 Nov 2010 09:06:17 GMT",
  "Expires" => "-1",
@@ -16,13 +16,13 @@ describe CobwebCrawler do
  "Server" => "gws",
  "X-XSS-Protection" => "1; mode=block"}
 
- end
+ end
+
 
-
  it "should generate a cobweb_crawler object" do
  CobwebCrawler.new.should be_an_instance_of CobwebCrawler
  end
-
+
  describe "crawl" do
 
  it "should crawl a site" do
@@ -36,21 +36,21 @@ describe CobwebCrawler do
  @statistics.get_statistics[:mime_counts]["text/html"].should == 8
  @statistics.get_statistics[:mime_counts]["text/css"].should == 18
  @statistics.get_statistics[:mime_counts]["image/jpeg"].should == 25
-
+
  end
-
+
  it "should take a block" do
  crawler = CobwebCrawler.new({:cache => false, :quiet => true, :debug => false, :crawl_limit => 1})
  statistics = crawler.crawl(@base_url) do |content, statistics|
  content[:url].should_not be_nil
  statistics[:average_length].should_not be_nil
  end
-
+
  statistics.should_not be_nil
  statistics.get_statistics.should be_an_instance_of Hash
-
+
  statistics.get_statistics[:mime_counts]["text/html"].should == 1
-
+
  end
 
  context "internal_links" do
@@ -91,19 +91,19 @@ describe CobwebCrawler do
  before(:each) do
  pending("thin not installed") unless THIN_INSTALLED
  @crawler = CobwebCrawler.new({:cache => false, :quiet => true, :debug => false, :store_inbound_links => true})
- @statistics = @crawler.crawl(@base_url)
+ @statistics = @crawler.crawl(@base_url)
  end
 
  it "should store inbound links" do
  @statistics.inbound_links_for("http://localhost:3532/typography.html").should_not be_empty
- @statistics.inbound_links_for("http://localhost:3532/typography.html").sort.should == ["http://localhost:3532/gallery.html", "http://localhost:3532/boxgrid%3Ewithsillyname.html", "http://localhost:3532/more.html", "http://localhost:3532/", "http://localhost:3532/tables.html", "http://localhost:3532/typography.html", "http://localhost:3532/forms.html", "http://localhost:3532/dashboard.html"].sort
+ @statistics.inbound_links_for("http://localhost:3532/typography.html").sort.should == ["http://localhost:3532/gallery.html", "http://localhost:3532/boxgridwithsillyname.html", "http://localhost:3532/more.html", "http://localhost:3532/", "http://localhost:3532/tables.html", "http://localhost:3532/typography.html", "http://localhost:3532/forms.html", "http://localhost:3532/dashboard.html"].sort
  end
 
  it "should handle url encoding" do
- @statistics.inbound_links_for("http://localhost:3532/boxgrid%3Ewithsillyname.html").sort.should == ["http://localhost:3532/boxgrid%3Ewithsillyname.html", "http://localhost:3532/gallery.html", "http://localhost:3532/more.html", "http://localhost:3532/tables.html", "http://localhost:3532/typography.html", "http://localhost:3532/forms.html", "http://localhost:3532/dashboard.html"].sort
+ @statistics.inbound_links_for("http://localhost:3532/boxgridwithsillyname.html").sort.should == ["http://localhost:3532/boxgridwithsillyname.html", "http://localhost:3532/gallery.html", "http://localhost:3532/more.html", "http://localhost:3532/tables.html", "http://localhost:3532/typography.html", "http://localhost:3532/forms.html", "http://localhost:3532/dashboard.html"].sort
  end
 
  end
- end
+ end
 
- end
+ end
@@ -6,7 +6,7 @@ describe CrawlJob, :local_only => true, :disabled => true do
 
  before(:all) do
  #store all existing resque process ids so we don't kill them afterwards
- if RESQUE_INSTALLED && THIN_INSTALLED
+ if RESQUE_INSTALLED && THIN_INSTALLED
 
  @existing_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
  if Resque.workers.count > 0 && @existing_processes.empty?
@@ -168,7 +168,7 @@ describe CrawlJob, :local_only => true, :disabled => true do
  # wait_for_crawl_finished crawl[:crawl_id]
  # @redis.get("crawl_job_enqueued_count").to_i.should == 20
  # end
- #
+ #
  # end
  describe "limit to 1" do
  before(:each) do
@@ -271,11 +271,13 @@ describe CrawlJob, :local_only => true, :disabled => true do
 
  after(:all) do
 
- @all_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
- command = "kill -s QUIT #{(@all_processes - @existing_processes).join(" ")}"
- IO.popen(command)
+ if RESQUE_INSTALLED
+ @all_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
+ command = "kill -s QUIT #{(@all_processes - @existing_processes).join(" ")}"
+ IO.popen(command)
 
- clear_queues
+ clear_queues
+ end
  end
 
  end
@@ -9,14 +9,15 @@ describe CrawlWorker, :local_only => true do
  @existing_processes = `ps aux | grep sidekiq | grep -v grep | awk '{print $2}'`.split("\n")
 
  raise "Sidekiq is already running, please stop before running specs." if @existing_processes.count > 0
-
+
  # START WORKERS ONLY FOR CRAWL QUEUE SO WE CAN COUNT ENQUEUED PROCESS AND FINISH QUEUES
  puts "Starting Workers... Please Wait..."
  `mkdir log`
  `rm -rf output.log`
- io = IO.popen("nohup sidekiq -r ./lib/crawl_worker.rb -q crawl_worker > ./log/output.log &")
+ puts "calling: nohup sidekiq -v -r ./lib/crawl_worker.rb -q crawl_worker > ./log/output.log 2>&1"
+ io = IO.popen("nohup sidekiq -r ./lib/crawl_worker.rb -q crawl_worker > ./log/output.log 2>&1")
  puts "Workers Started."
- end
+ end
  end
 
  before(:each) do
@@ -24,7 +25,7 @@ describe CrawlWorker, :local_only => true do
  pending("thin not installed") unless THIN_INSTALLED
  @base_url = "http://localhost:3532/"
  @base_page_count = 77
-
+
  clear_sidekiq_queues
  end
 
@@ -40,7 +41,7 @@ describe CrawlWorker, :local_only => true do
  }
  @cobweb = Cobweb.new @request
  end
-
+
  it "should crawl entire site" do
  crawl = @cobweb.start(@base_url)
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
@@ -66,7 +67,7 @@ describe CrawlWorker, :local_only => true do
  }
  @cobweb = Cobweb.new @request
  end
-
+
  it "should only crawl html pages" do
  crawl = @cobweb.start(@base_url)
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
@@ -78,10 +79,10 @@ describe CrawlWorker, :local_only => true do
  mime_types.count.should == 8
  mime_types.map{|m| m.should == "text/html"}
  mime_types.select{|m| m=="text/html"}.count.should == 8
-
-
+
+
  end
-
+
  end
  describe "with a crawl limit" do
  before(:each) do
@@ -93,34 +94,34 @@ describe CrawlWorker, :local_only => true do
  :cache => nil
  }
  end
-
+
  describe "of 1" do
  before(:each) do
  @request[:crawl_limit] = 1
  @cobweb = Cobweb.new @request
  end
-
+
  it "should not crawl the entire site" do
  crawl = @cobweb.start(@base_url)
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
  wait_for_crawl_finished crawl[:crawl_id]
  CrawlProcessWorker.queue_size.should_not == @base_page_count
- end
+ end
  it "should only crawl 1 page" do
  crawl = @cobweb.start(@base_url)
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
  wait_for_crawl_finished crawl[:crawl_id]
  CrawlProcessWorker.queue_size.should == 1
- end
+ end
  it "should notify of crawl finished" do
  crawl = @cobweb.start(@base_url)
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
  wait_for_crawl_finished crawl[:crawl_id]
  CrawlFinishedWorker.queue_size.should == 1
- end
-
+ end
+
  end
-
+
  describe "of 5" do
  before(:each) do
  @request[:crawl_limit] = 5
@@ -131,36 +132,36 @@ describe CrawlWorker, :local_only => true do
  @request[:crawl_limit_by_page] = true
  @cobweb = Cobweb.new @request
  end
-
+
  it "should only use html pages towards the crawl limit" do
  crawl = @cobweb.start(@base_url)
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
  wait_for_crawl_finished crawl[:crawl_id]
-
+
  mime_types = CrawlProcessWorker.queue_items(0, 200).map{|job| JSON.parse(job)["args"][0]["mime_type"]}
  mime_types.select{|m| m=="text/html"}.count.should == 5
  end
  end
  end
-
+
  describe "of 10" do
  before(:each) do
  @request[:crawl_limit] = 10
  @cobweb = Cobweb.new @request
  end
-
+
  it "should not crawl the entire site" do
  crawl = @cobweb.start(@base_url)
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
  wait_for_crawl_finished crawl[:crawl_id]
  CrawlProcessWorker.queue_size.should_not == @base_page_count
- end
+ end
  it "should notify of crawl finished" do
  crawl = @cobweb.start(@base_url)
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
  wait_for_crawl_finished crawl[:crawl_id]
  CrawlFinishedWorker.queue_size.should == 1
- end
+ end
  it "should only crawl 10 objects" do
  crawl = @cobweb.start(@base_url)
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
@@ -168,38 +169,38 @@ describe CrawlWorker, :local_only => true do
  CrawlProcessWorker.queue_size.should == 10
  end
  end
-
+
  describe "of 100" do
  before(:each) do
  @request[:crawl_limit] = 100
  @cobweb = Cobweb.new @request
  end
-
+
  it "should crawl the entire sample site" do
  crawl = @cobweb.start(@base_url)
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
  wait_for_crawl_finished crawl[:crawl_id]
  CrawlProcessWorker.queue_size.should == @base_page_count
- end
+ end
  it "should notify of crawl finished" do
  crawl = @cobweb.start(@base_url)
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
  wait_for_crawl_finished crawl[:crawl_id]
  CrawlFinishedWorker.queue_size.should == 1
- end
+ end
  it "should not crawl more than 100 pages" do
  crawl = @cobweb.start(@base_url)
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
  wait_for_crawl_finished crawl[:crawl_id]
  CrawlProcessWorker.queue_size.should_not > 100
- end
- end
+ end
+ end
  end
 
  after(:all) do
  @all_processes = `ps aux | grep sidekiq | grep -v grep | grep -v sidekiq-web | awk '{print $2}'`.split("\n")
  unless (@all_processes - @existing_processes).empty?
- command = "kill #{(@all_processes - @existing_processes).join(" ")}"
+ command = "kill -9 #{(@all_processes - @existing_processes).join(" ")}"
  IO.popen(command)
  end
  clear_sidekiq_queues
@@ -211,6 +212,7 @@ def wait_for_crawl_finished(crawl_id, timeout=20)
  @counter = 0
  start_time = Time.now
  while(running?(crawl_id) && Time.now < start_time + timeout) do
+ puts Sidekiq::Stats.new.queues
  sleep 1
  end
  if Time.now > start_time + timeout
@@ -246,9 +248,7 @@ def clear_sidekiq_queues
  end
  end
  sleep 5
-
+
  CrawlProcessWorker.queue_size.should == 0
  CrawlFinishedWorker.queue_size.should == 0
  end
-
-