cobweb 1.0.11 → 1.0.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.textile +4 -3
- data/lib/cobweb.rb +31 -8
- data/lib/cobweb_crawler.rb +7 -8
- data/lib/cobweb_process_job.rb +1 -1
- data/lib/cobweb_version.rb +1 -1
- data/lib/crawl.rb +11 -4
- data/lib/crawl_finished_worker.rb +27 -0
- data/lib/crawl_helper.rb +250 -0
- data/lib/crawl_job.rb +2 -2
- data/lib/crawl_process_worker.rb +31 -0
- data/lib/crawl_worker.rb +118 -0
- data/lib/sidekiq/cobweb_helper.rb +16 -0
- data/lib/stats.rb +12 -11
- data/lib/uri_helper.rb +8 -0
- data/spec/cobweb/cobweb_crawl_helper_spec.rb +4 -1
- data/spec/cobweb/cobweb_crawl_spec.rb +29 -13
- data/spec/cobweb/cobweb_crawler_spec.rb +33 -14
- data/spec/cobweb/cobweb_links_spec.rb +2 -1
- data/spec/cobweb/cobweb_spec.rb +3 -0
- data/spec/cobweb/content_link_parser_spec.rb +4 -0
- data/spec/cobweb/{cobweb_job_spec.rb → crawl_job_spec.rb} +52 -9
- data/spec/cobweb/crawl_worker_spec.rb +250 -0
- data/spec/cobweb/robots_spec.rb +2 -1
- data/spec/http_stubs.rb +95 -0
- data/spec/samples/sample_site/{boxgrid.html → boxgrid>withsillyname.html} +1 -1
- data/spec/samples/sample_site/dashboard.html +1 -1
- data/spec/samples/sample_site/forms.html +1 -1
- data/spec/samples/sample_site/gallery.html +1 -1
- data/spec/samples/sample_site/more.html +1 -1
- data/spec/samples/sample_site/tables.html +1 -1
- data/spec/samples/sample_site/typography.html +1 -1
- data/spec/spec_helper.rb +6 -88
- metadata +85 -35
- data/spec/cobweb/site_test_spec.rb.tmp +0 -101
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cobweb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.
|
4
|
+
version: 1.0.12
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-
|
12
|
+
date: 2013-08-15 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
|
-
name:
|
16
|
-
requirement:
|
15
|
+
name: redis
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,21 +21,15 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements:
|
25
|
-
- !ruby/object:Gem::Dependency
|
26
|
-
name: redis
|
27
|
-
requirement: &70274619906680 !ruby/object:Gem::Requirement
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
28
25
|
none: false
|
29
26
|
requirements:
|
30
27
|
- - ! '>='
|
31
28
|
- !ruby/object:Gem::Version
|
32
29
|
version: '0'
|
33
|
-
type: :runtime
|
34
|
-
prerelease: false
|
35
|
-
version_requirements: *70274619906680
|
36
30
|
- !ruby/object:Gem::Dependency
|
37
31
|
name: nokogiri
|
38
|
-
requirement:
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
39
33
|
none: false
|
40
34
|
requirements:
|
41
35
|
- - ! '>='
|
@@ -43,10 +37,15 @@ dependencies:
|
|
43
37
|
version: '0'
|
44
38
|
type: :runtime
|
45
39
|
prerelease: false
|
46
|
-
version_requirements:
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
47
46
|
- !ruby/object:Gem::Dependency
|
48
47
|
name: addressable
|
49
|
-
requirement:
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
50
49
|
none: false
|
51
50
|
requirements:
|
52
51
|
- - ! '>='
|
@@ -54,10 +53,15 @@ dependencies:
|
|
54
53
|
version: '0'
|
55
54
|
type: :runtime
|
56
55
|
prerelease: false
|
57
|
-
version_requirements:
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
58
62
|
- !ruby/object:Gem::Dependency
|
59
63
|
name: rspec
|
60
|
-
requirement:
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
61
65
|
none: false
|
62
66
|
requirements:
|
63
67
|
- - ! '>='
|
@@ -65,10 +69,15 @@ dependencies:
|
|
65
69
|
version: '0'
|
66
70
|
type: :runtime
|
67
71
|
prerelease: false
|
68
|
-
version_requirements:
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '0'
|
69
78
|
- !ruby/object:Gem::Dependency
|
70
79
|
name: awesome_print
|
71
|
-
requirement:
|
80
|
+
requirement: !ruby/object:Gem::Requirement
|
72
81
|
none: false
|
73
82
|
requirements:
|
74
83
|
- - ! '>='
|
@@ -76,10 +85,15 @@ dependencies:
|
|
76
85
|
version: '0'
|
77
86
|
type: :runtime
|
78
87
|
prerelease: false
|
79
|
-
version_requirements:
|
88
|
+
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ! '>='
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: '0'
|
80
94
|
- !ruby/object:Gem::Dependency
|
81
95
|
name: sinatra
|
82
|
-
requirement:
|
96
|
+
requirement: !ruby/object:Gem::Requirement
|
83
97
|
none: false
|
84
98
|
requirements:
|
85
99
|
- - ! '>='
|
@@ -87,10 +101,15 @@ dependencies:
|
|
87
101
|
version: '0'
|
88
102
|
type: :runtime
|
89
103
|
prerelease: false
|
90
|
-
version_requirements:
|
104
|
+
version_requirements: !ruby/object:Gem::Requirement
|
105
|
+
none: false
|
106
|
+
requirements:
|
107
|
+
- - ! '>='
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: '0'
|
91
110
|
- !ruby/object:Gem::Dependency
|
92
111
|
name: thin
|
93
|
-
requirement:
|
112
|
+
requirement: !ruby/object:Gem::Requirement
|
94
113
|
none: false
|
95
114
|
requirements:
|
96
115
|
- - ! '>='
|
@@ -98,10 +117,15 @@ dependencies:
|
|
98
117
|
version: '0'
|
99
118
|
type: :runtime
|
100
119
|
prerelease: false
|
101
|
-
version_requirements:
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
none: false
|
122
|
+
requirements:
|
123
|
+
- - ! '>='
|
124
|
+
- !ruby/object:Gem::Version
|
125
|
+
version: '0'
|
102
126
|
- !ruby/object:Gem::Dependency
|
103
127
|
name: haml
|
104
|
-
requirement:
|
128
|
+
requirement: !ruby/object:Gem::Requirement
|
105
129
|
none: false
|
106
130
|
requirements:
|
107
131
|
- - ! '>='
|
@@ -109,10 +133,15 @@ dependencies:
|
|
109
133
|
version: '0'
|
110
134
|
type: :runtime
|
111
135
|
prerelease: false
|
112
|
-
version_requirements:
|
136
|
+
version_requirements: !ruby/object:Gem::Requirement
|
137
|
+
none: false
|
138
|
+
requirements:
|
139
|
+
- - ! '>='
|
140
|
+
- !ruby/object:Gem::Version
|
141
|
+
version: '0'
|
113
142
|
- !ruby/object:Gem::Dependency
|
114
143
|
name: namespaced_redis
|
115
|
-
requirement:
|
144
|
+
requirement: !ruby/object:Gem::Requirement
|
116
145
|
none: false
|
117
146
|
requirements:
|
118
147
|
- - ! '>='
|
@@ -120,10 +149,15 @@ dependencies:
|
|
120
149
|
version: '0'
|
121
150
|
type: :runtime
|
122
151
|
prerelease: false
|
123
|
-
version_requirements:
|
152
|
+
version_requirements: !ruby/object:Gem::Requirement
|
153
|
+
none: false
|
154
|
+
requirements:
|
155
|
+
- - ! '>='
|
156
|
+
- !ruby/object:Gem::Version
|
157
|
+
version: '0'
|
124
158
|
- !ruby/object:Gem::Dependency
|
125
159
|
name: json
|
126
|
-
requirement:
|
160
|
+
requirement: !ruby/object:Gem::Requirement
|
127
161
|
none: false
|
128
162
|
requirements:
|
129
163
|
- - ! '>='
|
@@ -131,10 +165,15 @@ dependencies:
|
|
131
165
|
version: '0'
|
132
166
|
type: :runtime
|
133
167
|
prerelease: false
|
134
|
-
version_requirements:
|
168
|
+
version_requirements: !ruby/object:Gem::Requirement
|
169
|
+
none: false
|
170
|
+
requirements:
|
171
|
+
- - ! '>='
|
172
|
+
- !ruby/object:Gem::Version
|
173
|
+
version: '0'
|
135
174
|
- !ruby/object:Gem::Dependency
|
136
175
|
name: slop
|
137
|
-
requirement:
|
176
|
+
requirement: !ruby/object:Gem::Requirement
|
138
177
|
none: false
|
139
178
|
requirements:
|
140
179
|
- - ! '>='
|
@@ -142,7 +181,12 @@ dependencies:
|
|
142
181
|
version: '0'
|
143
182
|
type: :runtime
|
144
183
|
prerelease: false
|
145
|
-
version_requirements:
|
184
|
+
version_requirements: !ruby/object:Gem::Requirement
|
185
|
+
none: false
|
186
|
+
requirements:
|
187
|
+
- - ! '>='
|
188
|
+
- !ruby/object:Gem::Version
|
189
|
+
version: '0'
|
146
190
|
description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
|
147
191
|
crawl extremely large sites which is much more performant than multi-threaded crawlers. It
|
148
192
|
is also a standalone crawler that has a sophisticated statistics monitoring interface
|
@@ -156,16 +200,17 @@ files:
|
|
156
200
|
- spec/cobweb/cobweb_crawl_helper_spec.rb
|
157
201
|
- spec/cobweb/cobweb_crawl_spec.rb
|
158
202
|
- spec/cobweb/cobweb_crawler_spec.rb
|
159
|
-
- spec/cobweb/cobweb_job_spec.rb
|
160
203
|
- spec/cobweb/cobweb_links_spec.rb
|
161
204
|
- spec/cobweb/cobweb_spec.rb
|
162
205
|
- spec/cobweb/content_link_parser_spec.rb
|
206
|
+
- spec/cobweb/crawl_job_spec.rb
|
207
|
+
- spec/cobweb/crawl_worker_spec.rb
|
163
208
|
- spec/cobweb/robots_spec.rb
|
164
|
-
- spec/
|
209
|
+
- spec/http_stubs.rb
|
165
210
|
- spec/samples/robots.txt
|
166
211
|
- spec/samples/sample_html_links.html
|
167
212
|
- spec/samples/sample_server.rb
|
168
|
-
- spec/samples/sample_site/boxgrid.html
|
213
|
+
- spec/samples/sample_site/boxgrid>withsillyname.html
|
169
214
|
- spec/samples/sample_site/css/accordion.css
|
170
215
|
- spec/samples/sample_site/css/datatable.css
|
171
216
|
- spec/samples/sample_site/css/datepicker.css
|
@@ -343,8 +388,12 @@ files:
|
|
343
388
|
- lib/cobweb_version.rb
|
344
389
|
- lib/content_link_parser.rb
|
345
390
|
- lib/crawl.rb
|
391
|
+
- lib/crawl_finished_worker.rb
|
392
|
+
- lib/crawl_helper.rb
|
346
393
|
- lib/crawl_job.rb
|
347
394
|
- lib/crawl_object.rb
|
395
|
+
- lib/crawl_process_worker.rb
|
396
|
+
- lib/crawl_worker.rb
|
348
397
|
- lib/document.rb
|
349
398
|
- lib/encoding_safe_process_job.rb
|
350
399
|
- lib/export_command.rb
|
@@ -353,6 +402,7 @@ files:
|
|
353
402
|
- lib/report_command.rb
|
354
403
|
- lib/robots.rb
|
355
404
|
- lib/server.rb
|
405
|
+
- lib/sidekiq/cobweb_helper.rb
|
356
406
|
- lib/stats.rb
|
357
407
|
- lib/string.rb
|
358
408
|
- lib/uri_helper.rb
|
@@ -530,7 +580,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
530
580
|
version: '0'
|
531
581
|
requirements: []
|
532
582
|
rubyforge_project:
|
533
|
-
rubygems_version: 1.8.
|
583
|
+
rubygems_version: 1.8.25
|
534
584
|
signing_key:
|
535
585
|
specification_version: 3
|
536
586
|
summary: Cobweb is a web crawler that can use resque to cluster crawls to quickly
|
@@ -1,101 +0,0 @@
|
|
1
|
-
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
|
-
|
3
|
-
describe Cobweb, :local_only => true do
|
4
|
-
|
5
|
-
before(:all) do
|
6
|
-
#store all existing resque process ids so we don't kill them afterwards
|
7
|
-
@existing_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
|
8
|
-
|
9
|
-
# START WORKERS ONLY FOR CRAWL QUEUE SO WE CAN COUNT ENQUEUED PROCESS AND FINISH QUEUES
|
10
|
-
puts "Starting Workers... Please Wait..."
|
11
|
-
`mkdir log`
|
12
|
-
io = IO.popen("nohup rake resque:workers PIDFILE=./tmp/pids/resque.pid COUNT=3 QUEUE=cobweb_crawl_job > log/output.log &")
|
13
|
-
puts "Workers Started."
|
14
|
-
|
15
|
-
end
|
16
|
-
|
17
|
-
before(:each) do
|
18
|
-
@base_url = "http://localhost:3532/"
|
19
|
-
@base_page_count = 77
|
20
|
-
clear_queues
|
21
|
-
end
|
22
|
-
|
23
|
-
describe "with a crawl limit" do
|
24
|
-
before(:each) do
|
25
|
-
@request = {
|
26
|
-
:crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
|
27
|
-
:quiet => true,
|
28
|
-
:cache => nil,
|
29
|
-
:use_encoding_safe_process_job => true,
|
30
|
-
:crawl_limit_by_page => true
|
31
|
-
}
|
32
|
-
end
|
33
|
-
|
34
|
-
describe "on ancestry.com.au" do
|
35
|
-
describe "limited to 100" do
|
36
|
-
before(:each) do
|
37
|
-
@request[:crawl_limit] = 100
|
38
|
-
@request[:valid_mime_types] = ["text/html"]
|
39
|
-
@cobweb = Cobweb.new @request
|
40
|
-
end
|
41
|
-
|
42
|
-
it "should crawl 100 pages" do
|
43
|
-
crawl = @cobweb.start("http://www.ancestry.com.au/")
|
44
|
-
@stat = Stats.new({:crawl_id => crawl[:crawl_id]})
|
45
|
-
wait_for_crawl_finished crawl[:crawl_id], 180
|
46
|
-
puts "Crawled #{Resque.size("encoding_safe_process_job")} pages."
|
47
|
-
end
|
48
|
-
end
|
49
|
-
|
50
|
-
describe "limited to 999" do
|
51
|
-
before(:each) do
|
52
|
-
@request[:crawl_limit] = 999
|
53
|
-
@cobweb = Cobweb.new @request
|
54
|
-
end
|
55
|
-
|
56
|
-
it "should crawl 999 pages" do
|
57
|
-
crawl = @cobweb.start("http://www.ancestry.com.au/")
|
58
|
-
@stat = Stats.new({:crawl_id => crawl[:crawl_id]})
|
59
|
-
wait_for_crawl_finished crawl[:crawl_id], 720
|
60
|
-
puts "Crawled #{Resque.size("encoding_safe_process_job")} pages."
|
61
|
-
end
|
62
|
-
end
|
63
|
-
__END__
|
64
|
-
|
65
|
-
end
|
66
|
-
|
67
|
-
after(:all) do
|
68
|
-
|
69
|
-
@all_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
|
70
|
-
command = "kill -9 #{(@all_processes - @existing_processes).join(" ")}"
|
71
|
-
IO.popen(command)
|
72
|
-
|
73
|
-
clear_queues
|
74
|
-
end
|
75
|
-
|
76
|
-
end
|
77
|
-
|
78
|
-
def wait_for_crawl_finished(crawl_id, timeout=20)
|
79
|
-
counter = 0
|
80
|
-
start_time = Time.now
|
81
|
-
while(running?(crawl_id) && Time.now < start_time + timeout) do
|
82
|
-
sleep 0.5
|
83
|
-
end
|
84
|
-
if Time.now > start_time + timeout
|
85
|
-
raise "End of crawl not detected"
|
86
|
-
end
|
87
|
-
end
|
88
|
-
|
89
|
-
def running?(crawl_id)
|
90
|
-
@stat.get_status != CobwebCrawlHelper::FINISHED and @stat.get_status != CobwebCrawlHelper::CANCELLED
|
91
|
-
end
|
92
|
-
|
93
|
-
def clear_queues
|
94
|
-
Resque.queues.each do |queue|
|
95
|
-
Resque.remove_queue(queue)
|
96
|
-
end
|
97
|
-
|
98
|
-
Resque.size("cobweb_process_job").should == 0
|
99
|
-
Resque.size("cobweb_finished_job").should == 0
|
100
|
-
Resque.peek("cobweb_process_job", 0, 200).should be_empty
|
101
|
-
end
|