cobweb 1.0.28 → 1.0.29

@@ -52,19 +52,15 @@ class CobwebCrawlHelper
  end
  if options[:queue_system] == :sidekiq && SIDEKIQ_INSTALLED

- puts "deleteing from crawl_worker"
  queue = Sidekiq::Queue.new("crawl_worker")
  queue.each do |job|
- ap job.args # => [1, 2, 3]
  job.delete if job.args[0]["crawl_id"] == id
  end


  process_queue_name = Kernel.const_get(options[:processing_queue]).sidekiq_options_hash["queue"]
- puts "deleting from #{process_queue_name}"
  queue = Sidekiq::Queue.new(process_queue_name)
  queue.each do |job|
- ap job.args # => [1, 2, 3]
  job.delete if job.args[0]["crawl_id"] == id
  end
  end
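
For context, a minimal standalone sketch of the queue-cleanup pattern this hunk keeps (the removed lines were only the debug puts/ap calls). Sidekiq::Queue, #each, #args and #delete are real Sidekiq API; the method name and default queue list below are illustrative only.

  # Sketch: prune queued Sidekiq jobs that belong to a single crawl.
  # Assumes the sidekiq gem is loaded and Redis is reachable.
  require 'sidekiq/api'

  def delete_jobs_for_crawl(crawl_id, queue_names = ["crawl_worker"])
    queue_names.each do |name|
      Sidekiq::Queue.new(name).each do |job|
        # each enqueued job carries its content_request hash as the first argument
        job.delete if job.args[0]["crawl_id"] == crawl_id
      end
    end
  end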
@@ -102,4 +98,4 @@ class CobwebCrawlHelper
  @data[:crawl_id]
  end

- end
+ end
@@ -3,7 +3,7 @@ class CobwebVersion

  # Returns a string of the current version
  def self.version
- "1.0.28"
+ "1.0.29"
  end

- end
+ end
@@ -14,31 +14,31 @@ require File.expand_path(File.dirname(__FILE__) + '/sidekiq/cobweb_helper')
  class CrawlWorker
  include Sidekiq::Worker
  sidekiq_options :queue => "crawl_worker", :retry => false if SIDEKIQ_INSTALLED
-
+
  def perform(content_request)
  puts "Performing for #{content_request["url"]}"
  # setup the crawl class to manage the crawl of this object
  @crawl = CobwebModule::Crawl.new(content_request)
-
+
  # update the counters and then perform the get, returns false if we are outwith limits
  if @crawl.retrieve
-
+
  # if the crawled object is an object type we are interested
  if @crawl.content.permitted_type?
-
+
  # extract links from content and process them if we are still within queue limits (block will not run if we are outwith limits)
  @crawl.process_links do |link|
  @crawl.lock("queue_links") do
  if @crawl.within_crawl_limits? && !@crawl.already_handled?(link)
  # enqueue the links to sidekiq
- @crawl.debug_puts "QUEUED LINK: #{link}"
+ @crawl.debug_puts "QUEUED LINK: #{link}"
  enqueue_content(content_request, link)
  end
  end
  end
-
+
  if @crawl.to_be_processed?
-
+
  @crawl.process do

  # enqueue to processing queue
@@ -51,17 +51,17 @@ class CrawlWorker
  current_count = enqueue_redis.hget(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field]).to_i
  enqueue_redis.hset(content_request[:enqueue_counter_key], content_request[:enqueue_counter_field], current_count+1)
  end
-
+
  end
  else
  @crawl.debug_puts "@crawl.finished? #{@crawl.finished?}"
  @crawl.debug_puts "@crawl.within_crawl_limits? #{@crawl.within_crawl_limits?}"
  @crawl.debug_puts "@crawl.first_to_finish? #{@crawl.first_to_finish?}"
  end
-
+
  end
  end
-
+
  #@crawl.lock("finished") do
  # let the crawl know we're finished with this object
  @crawl.finished_processing
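
A side note on the enqueue counter touched by the context lines above: the hget/hset pair is a non-atomic read-modify-write, whereas Redis's HINCRBY (exposed by redis-rb as hincrby) performs the same increment in one step. The key and field names below are placeholders, not the gem's values.

  require 'redis'

  redis = Redis.new
  counter_key   = "crawl:example:enqueued"   # stands in for content_request[:enqueue_counter_key]
  counter_field = "count"                    # stands in for content_request[:enqueue_counter_field]

  # atomic equivalent of hget(...).to_i followed by hset(..., current + 1)
  redis.hincrby(counter_key, counter_field, 1)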
@@ -79,7 +79,7 @@ class CrawlWorker
  conn.smembers(get_sidekiq_options[:queue]).count
  end
  end
-
+

  # Sets the crawl status to CobwebCrawlHelper::FINISHED and enqueues the crawl finished job
  def finished(content_request)
@@ -93,7 +93,7 @@ class CrawlWorker
  @crawl.redis.incr("crawl_finished_enqueued_count")
  content_request[:crawl_finished_queue].constantize.perform_async(@crawl.statistics.merge(additional_stats))
  end
-
+
  # Enqueues the content to the processing queue setup in options
  def send_to_processing_queue(content, content_request)
  content_to_send = content.merge({:internal_urls => content_request[:internal_urls], :redis_options => content_request[:redis_options], :source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]})
@@ -110,7 +110,7 @@ class CrawlWorker
  end

  private
-
+
  # Enqueues content to the crawl_job queue
  def enqueue_content(content_request, link)
  new_request = content_request.clone
@@ -119,4 +119,4 @@ class CrawlWorker
  CrawlWorker.perform_async(new_request)
  end

- end
+ end
@@ -35,7 +35,7 @@ class ExportCommand
  Dir.mkdir(options[:root_path]) unless File.exist?(options[:root_path])

  uri.path.split("/")[0..-2].each do |dir|
- path+="/" unless path.ends_with?("/")
+ path+="/" unless path.cobweb_ends_with?("/")
  path+=dir
  if File.exist?(options[:root_path] + path) && !File.directory?(options[:root_path] + path)
  FileUtils.mv(options[:root_path] + path, options[:root_path] + path + ".tmp")
@@ -45,7 +45,7 @@ class ExportCommand
  Dir.mkdir(options[:root_path] + path) unless Dir.exist?(options[:root_path] + path)
  end
  end
- path += "/" unless path.ends_with?("/")
+ path += "/" unless path.cobweb_ends_with?("/")
  filename = uri.path.split("/")[-1]
  if filename.nil? || filename.empty?
  filename = "index.html"
@@ -82,4 +82,4 @@ class ExportCommand
  end

  end
- end
+ end
@@ -22,7 +22,7 @@ class ReportCommand
  CSV.open(options[:output], "wb", :force_quotes => true) do |csv|

  statistics = @crawler.crawl(options[:url]) do |page|
- puts "Reporting on #{page[:url]}"
+ puts "Reporting on #{page[:url]} [#{page[:status_code]}]"
  @doc = page[:body]
  page["link_rel"] = scope.link_tag_with_rel("canonical")["href"]
  page["title"] = scope.head_tag.title_tag.contents
@@ -1,12 +1,7 @@
  class String

- # add ends_with? support if method is missing
- def method_missing(m, *args, &block)
- if m == :ends_with?
- suffix = args[0]
- suffix.respond_to?(:to_str) && self[-suffix.length, suffix.length] == suffix
- else
- super
- end
+ def cobweb_ends_with?(val)
+ suffix = val
+ suffix.respond_to?(:to_str) && self[-suffix.length, suffix.length] == suffix
  end
- end
+ end
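
This hunk swaps a method_missing-based ends_with? shim for an explicit cobweb_ends_with? helper, presumably to avoid clashing with Ruby's built-in String#end_with? and ActiveSupport's ends_with? alias. A short usage sketch of the new helper; the return values follow the implementation shown above.

  # behaves like String#end_with?, but tolerates non-string suffixes
  "path/to/dir/".cobweb_ends_with?("/")   # => true
  "path/to/file".cobweb_ends_with?("/")   # => false
  "abc".cobweb_ends_with?(nil)            # => false (nil does not respond to :to_str)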
@@ -4,9 +4,9 @@ describe CobwebCrawler do

  before(:each) do
  pending("thin not installed") unless THIN_INSTALLED
-
+
  @base_url = "http://localhost:3532/"
-
+
  @default_headers = {"Cache-Control" => "private, max-age=0",
  "Date" => "Wed, 10 Nov 2010 09:06:17 GMT",
  "Expires" => "-1",
@@ -16,13 +16,13 @@ describe CobwebCrawler do
  "Server" => "gws",
  "X-XSS-Protection" => "1; mode=block"}

- end
+ end
+

-
  it "should generate a cobweb_crawler object" do
  CobwebCrawler.new.should be_an_instance_of CobwebCrawler
  end
-
+
  describe "crawl" do

  it "should crawl a site" do
@@ -36,21 +36,21 @@ describe CobwebCrawler do
  @statistics.get_statistics[:mime_counts]["text/html"].should == 8
  @statistics.get_statistics[:mime_counts]["text/css"].should == 18
  @statistics.get_statistics[:mime_counts]["image/jpeg"].should == 25
-
+
  end
-
+
  it "should take a block" do
  crawler = CobwebCrawler.new({:cache => false, :quiet => true, :debug => false, :crawl_limit => 1})
  statistics = crawler.crawl(@base_url) do |content, statistics|
  content[:url].should_not be_nil
  statistics[:average_length].should_not be_nil
  end
-
+
  statistics.should_not be_nil
  statistics.get_statistics.should be_an_instance_of Hash
-
+
  statistics.get_statistics[:mime_counts]["text/html"].should == 1
-
+
  end

  context "internal_links" do
@@ -91,19 +91,19 @@ describe CobwebCrawler do
  before(:each) do
  pending("thin not installed") unless THIN_INSTALLED
  @crawler = CobwebCrawler.new({:cache => false, :quiet => true, :debug => false, :store_inbound_links => true})
- @statistics = @crawler.crawl(@base_url)
+ @statistics = @crawler.crawl(@base_url)
  end

  it "should store inbound links" do
  @statistics.inbound_links_for("http://localhost:3532/typography.html").should_not be_empty
- @statistics.inbound_links_for("http://localhost:3532/typography.html").sort.should == ["http://localhost:3532/gallery.html", "http://localhost:3532/boxgrid%3Ewithsillyname.html", "http://localhost:3532/more.html", "http://localhost:3532/", "http://localhost:3532/tables.html", "http://localhost:3532/typography.html", "http://localhost:3532/forms.html", "http://localhost:3532/dashboard.html"].sort
+ @statistics.inbound_links_for("http://localhost:3532/typography.html").sort.should == ["http://localhost:3532/gallery.html", "http://localhost:3532/boxgridwithsillyname.html", "http://localhost:3532/more.html", "http://localhost:3532/", "http://localhost:3532/tables.html", "http://localhost:3532/typography.html", "http://localhost:3532/forms.html", "http://localhost:3532/dashboard.html"].sort
  end

  it "should handle url encoding" do
- @statistics.inbound_links_for("http://localhost:3532/boxgrid%3Ewithsillyname.html").sort.should == ["http://localhost:3532/boxgrid%3Ewithsillyname.html", "http://localhost:3532/gallery.html", "http://localhost:3532/more.html", "http://localhost:3532/tables.html", "http://localhost:3532/typography.html", "http://localhost:3532/forms.html", "http://localhost:3532/dashboard.html"].sort
+ @statistics.inbound_links_for("http://localhost:3532/boxgridwithsillyname.html").sort.should == ["http://localhost:3532/boxgridwithsillyname.html", "http://localhost:3532/gallery.html", "http://localhost:3532/more.html", "http://localhost:3532/tables.html", "http://localhost:3532/typography.html", "http://localhost:3532/forms.html", "http://localhost:3532/dashboard.html"].sort
  end

  end
- end
+ end

- end
+ end
@@ -6,7 +6,7 @@ describe CrawlJob, :local_only => true, :disabled => true do

  before(:all) do
  #store all existing resque process ids so we don't kill them afterwards
- if RESQUE_INSTALLED && THIN_INSTALLED
+ if RESQUE_INSTALLED && THIN_INSTALLED

  @existing_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
  if Resque.workers.count > 0 && @existing_processes.empty?
@@ -168,7 +168,7 @@ describe CrawlJob, :local_only => true, :disabled => true do
  # wait_for_crawl_finished crawl[:crawl_id]
  # @redis.get("crawl_job_enqueued_count").to_i.should == 20
  # end
- #
+ #
  # end
  describe "limit to 1" do
  before(:each) do
@@ -271,11 +271,13 @@ describe CrawlJob, :local_only => true, :disabled => true do

  after(:all) do

- @all_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
- command = "kill -s QUIT #{(@all_processes - @existing_processes).join(" ")}"
- IO.popen(command)
+ if RESQUE_INSTALLED
+ @all_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
+ command = "kill -s QUIT #{(@all_processes - @existing_processes).join(" ")}"
+ IO.popen(command)

- clear_queues
+ clear_queues
+ end
  end

  end
@@ -9,14 +9,15 @@ describe CrawlWorker, :local_only => true do
  @existing_processes = `ps aux | grep sidekiq | grep -v grep | awk '{print $2}'`.split("\n")

  raise "Sidekiq is already running, please stop before running specs." if @existing_processes.count > 0
-
+
  # START WORKERS ONLY FOR CRAWL QUEUE SO WE CAN COUNT ENQUEUED PROCESS AND FINISH QUEUES
  puts "Starting Workers... Please Wait..."
  `mkdir log`
  `rm -rf output.log`
- io = IO.popen("nohup sidekiq -r ./lib/crawl_worker.rb -q crawl_worker > ./log/output.log &")
+ puts "calling: nohup sidekiq -v -r ./lib/crawl_worker.rb -q crawl_worker > ./log/output.log 2>&1"
+ io = IO.popen("nohup sidekiq -r ./lib/crawl_worker.rb -q crawl_worker > ./log/output.log 2>&1")
  puts "Workers Started."
- end
+ end
  end

  before(:each) do
@@ -24,7 +25,7 @@ describe CrawlWorker, :local_only => true do
  pending("thin not installed") unless THIN_INSTALLED
  @base_url = "http://localhost:3532/"
  @base_page_count = 77
-
+
  clear_sidekiq_queues
  end

@@ -40,7 +41,7 @@ describe CrawlWorker, :local_only => true do
  }
  @cobweb = Cobweb.new @request
  end
-
+
  it "should crawl entire site" do
  crawl = @cobweb.start(@base_url)
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
@@ -66,7 +67,7 @@ describe CrawlWorker, :local_only => true do
  }
  @cobweb = Cobweb.new @request
  end
-
+
  it "should only crawl html pages" do
  crawl = @cobweb.start(@base_url)
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
@@ -78,10 +79,10 @@ describe CrawlWorker, :local_only => true do
  mime_types.count.should == 8
  mime_types.map{|m| m.should == "text/html"}
  mime_types.select{|m| m=="text/html"}.count.should == 8
-
-
+
+
  end
-
+
  end
  describe "with a crawl limit" do
  before(:each) do
@@ -93,34 +94,34 @@ describe CrawlWorker, :local_only => true do
  :cache => nil
  }
  end
-
+
  describe "of 1" do
  before(:each) do
  @request[:crawl_limit] = 1
  @cobweb = Cobweb.new @request
  end
-
+
  it "should not crawl the entire site" do
  crawl = @cobweb.start(@base_url)
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
  wait_for_crawl_finished crawl[:crawl_id]
  CrawlProcessWorker.queue_size.should_not == @base_page_count
- end
+ end
  it "should only crawl 1 page" do
  crawl = @cobweb.start(@base_url)
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
  wait_for_crawl_finished crawl[:crawl_id]
  CrawlProcessWorker.queue_size.should == 1
- end
+ end
  it "should notify of crawl finished" do
  crawl = @cobweb.start(@base_url)
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
  wait_for_crawl_finished crawl[:crawl_id]
  CrawlFinishedWorker.queue_size.should == 1
- end
-
+ end
+
  end
-
+
  describe "of 5" do
  before(:each) do
  @request[:crawl_limit] = 5
@@ -131,36 +132,36 @@ describe CrawlWorker, :local_only => true do
  @request[:crawl_limit_by_page] = true
  @cobweb = Cobweb.new @request
  end
-
+
  it "should only use html pages towards the crawl limit" do
  crawl = @cobweb.start(@base_url)
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
  wait_for_crawl_finished crawl[:crawl_id]
-
+
  mime_types = CrawlProcessWorker.queue_items(0, 200).map{|job| JSON.parse(job)["args"][0]["mime_type"]}
  mime_types.select{|m| m=="text/html"}.count.should == 5
  end
  end
  end
-
+
  describe "of 10" do
  before(:each) do
  @request[:crawl_limit] = 10
  @cobweb = Cobweb.new @request
  end
-
+
  it "should not crawl the entire site" do
  crawl = @cobweb.start(@base_url)
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
  wait_for_crawl_finished crawl[:crawl_id]
  CrawlProcessWorker.queue_size.should_not == @base_page_count
- end
+ end
  it "should notify of crawl finished" do
  crawl = @cobweb.start(@base_url)
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
  wait_for_crawl_finished crawl[:crawl_id]
  CrawlFinishedWorker.queue_size.should == 1
- end
+ end
  it "should only crawl 10 objects" do
  crawl = @cobweb.start(@base_url)
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
@@ -168,38 +169,38 @@ describe CrawlWorker, :local_only => true do
  CrawlProcessWorker.queue_size.should == 10
  end
  end
-
+
  describe "of 100" do
  before(:each) do
  @request[:crawl_limit] = 100
  @cobweb = Cobweb.new @request
  end
-
+
  it "should crawl the entire sample site" do
  crawl = @cobweb.start(@base_url)
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
  wait_for_crawl_finished crawl[:crawl_id]
  CrawlProcessWorker.queue_size.should == @base_page_count
- end
+ end
  it "should notify of crawl finished" do
  crawl = @cobweb.start(@base_url)
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
  wait_for_crawl_finished crawl[:crawl_id]
  CrawlFinishedWorker.queue_size.should == 1
- end
+ end
  it "should not crawl more than 100 pages" do
  crawl = @cobweb.start(@base_url)
  @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
  wait_for_crawl_finished crawl[:crawl_id]
  CrawlProcessWorker.queue_size.should_not > 100
- end
- end
+ end
+ end
  end

  after(:all) do
  @all_processes = `ps aux | grep sidekiq | grep -v grep | grep -v sidekiq-web | awk '{print $2}'`.split("\n")
  unless (@all_processes - @existing_processes).empty?
- command = "kill #{(@all_processes - @existing_processes).join(" ")}"
+ command = "kill -9 #{(@all_processes - @existing_processes).join(" ")}"
  IO.popen(command)
  end
  clear_sidekiq_queues
@@ -211,6 +212,7 @@ def wait_for_crawl_finished(crawl_id, timeout=20)
  @counter = 0
  start_time = Time.now
  while(running?(crawl_id) && Time.now < start_time + timeout) do
+ puts Sidekiq::Stats.new.queues
  sleep 1
  end
  if Time.now > start_time + timeout
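
The added line prints Sidekiq::Stats.new.queues, a hash of queue names to their current sizes, on every iteration of the wait loop, which makes stuck crawls easier to diagnose. A minimal sketch of the same polling idea, with the spec's crawl-specific running? check replaced by a queue-drain condition (an assumption for illustration only):

  require 'sidekiq/api'

  # Sketch: poll Sidekiq queue sizes until they drain or the timeout passes.
  def wait_for_queues_to_drain(timeout = 20)
    deadline = Time.now + timeout
    while Time.now < deadline
      sizes = Sidekiq::Stats.new.queues   # e.g. {"crawl_worker" => 3, "crawl_process_worker" => 12}
      puts sizes
      break if sizes.values.all?(&:zero?)
      sleep 1
    end
  end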
@@ -246,9 +248,7 @@ def clear_sidekiq_queues
  end
  end
  sleep 5
-
+
  CrawlProcessWorker.queue_size.should == 0
  CrawlFinishedWorker.queue_size.should == 0
  end
-
-