cobweb 1.0.11 → 1.0.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.11
4
+ version: 1.0.12
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-02-18 00:00:00.000000000 Z
12
+ date: 2013-08-15 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
- name: resque
16
- requirement: &70274619912400 !ruby/object:Gem::Requirement
15
+ name: redis
16
+ requirement: !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,21 +21,15 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70274619912400
25
- - !ruby/object:Gem::Dependency
26
- name: redis
27
- requirement: &70274619906680 !ruby/object:Gem::Requirement
24
+ version_requirements: !ruby/object:Gem::Requirement
28
25
  none: false
29
26
  requirements:
30
27
  - - ! '>='
31
28
  - !ruby/object:Gem::Version
32
29
  version: '0'
33
- type: :runtime
34
- prerelease: false
35
- version_requirements: *70274619906680
36
30
  - !ruby/object:Gem::Dependency
37
31
  name: nokogiri
38
- requirement: &70274619897540 !ruby/object:Gem::Requirement
32
+ requirement: !ruby/object:Gem::Requirement
39
33
  none: false
40
34
  requirements:
41
35
  - - ! '>='
@@ -43,10 +37,15 @@ dependencies:
43
37
  version: '0'
44
38
  type: :runtime
45
39
  prerelease: false
46
- version_requirements: *70274619897540
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
47
46
  - !ruby/object:Gem::Dependency
48
47
  name: addressable
49
- requirement: &70274619888180 !ruby/object:Gem::Requirement
48
+ requirement: !ruby/object:Gem::Requirement
50
49
  none: false
51
50
  requirements:
52
51
  - - ! '>='
@@ -54,10 +53,15 @@ dependencies:
54
53
  version: '0'
55
54
  type: :runtime
56
55
  prerelease: false
57
- version_requirements: *70274619888180
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
58
62
  - !ruby/object:Gem::Dependency
59
63
  name: rspec
60
- requirement: &70274619880820 !ruby/object:Gem::Requirement
64
+ requirement: !ruby/object:Gem::Requirement
61
65
  none: false
62
66
  requirements:
63
67
  - - ! '>='
@@ -65,10 +69,15 @@ dependencies:
65
69
  version: '0'
66
70
  type: :runtime
67
71
  prerelease: false
68
- version_requirements: *70274619880820
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
69
78
  - !ruby/object:Gem::Dependency
70
79
  name: awesome_print
71
- requirement: &70274619877520 !ruby/object:Gem::Requirement
80
+ requirement: !ruby/object:Gem::Requirement
72
81
  none: false
73
82
  requirements:
74
83
  - - ! '>='
@@ -76,10 +85,15 @@ dependencies:
76
85
  version: '0'
77
86
  type: :runtime
78
87
  prerelease: false
79
- version_requirements: *70274619877520
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ! '>='
92
+ - !ruby/object:Gem::Version
93
+ version: '0'
80
94
  - !ruby/object:Gem::Dependency
81
95
  name: sinatra
82
- requirement: &70274619876520 !ruby/object:Gem::Requirement
96
+ requirement: !ruby/object:Gem::Requirement
83
97
  none: false
84
98
  requirements:
85
99
  - - ! '>='
@@ -87,10 +101,15 @@ dependencies:
87
101
  version: '0'
88
102
  type: :runtime
89
103
  prerelease: false
90
- version_requirements: *70274619876520
104
+ version_requirements: !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ! '>='
108
+ - !ruby/object:Gem::Version
109
+ version: '0'
91
110
  - !ruby/object:Gem::Dependency
92
111
  name: thin
93
- requirement: &70274619875960 !ruby/object:Gem::Requirement
112
+ requirement: !ruby/object:Gem::Requirement
94
113
  none: false
95
114
  requirements:
96
115
  - - ! '>='
@@ -98,10 +117,15 @@ dependencies:
98
117
  version: '0'
99
118
  type: :runtime
100
119
  prerelease: false
101
- version_requirements: *70274619875960
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ none: false
122
+ requirements:
123
+ - - ! '>='
124
+ - !ruby/object:Gem::Version
125
+ version: '0'
102
126
  - !ruby/object:Gem::Dependency
103
127
  name: haml
104
- requirement: &70274619875340 !ruby/object:Gem::Requirement
128
+ requirement: !ruby/object:Gem::Requirement
105
129
  none: false
106
130
  requirements:
107
131
  - - ! '>='
@@ -109,10 +133,15 @@ dependencies:
109
133
  version: '0'
110
134
  type: :runtime
111
135
  prerelease: false
112
- version_requirements: *70274619875340
136
+ version_requirements: !ruby/object:Gem::Requirement
137
+ none: false
138
+ requirements:
139
+ - - ! '>='
140
+ - !ruby/object:Gem::Version
141
+ version: '0'
113
142
  - !ruby/object:Gem::Dependency
114
143
  name: namespaced_redis
115
- requirement: &70274619874800 !ruby/object:Gem::Requirement
144
+ requirement: !ruby/object:Gem::Requirement
116
145
  none: false
117
146
  requirements:
118
147
  - - ! '>='
@@ -120,10 +149,15 @@ dependencies:
120
149
  version: '0'
121
150
  type: :runtime
122
151
  prerelease: false
123
- version_requirements: *70274619874800
152
+ version_requirements: !ruby/object:Gem::Requirement
153
+ none: false
154
+ requirements:
155
+ - - ! '>='
156
+ - !ruby/object:Gem::Version
157
+ version: '0'
124
158
  - !ruby/object:Gem::Dependency
125
159
  name: json
126
- requirement: &70274619874180 !ruby/object:Gem::Requirement
160
+ requirement: !ruby/object:Gem::Requirement
127
161
  none: false
128
162
  requirements:
129
163
  - - ! '>='
@@ -131,10 +165,15 @@ dependencies:
131
165
  version: '0'
132
166
  type: :runtime
133
167
  prerelease: false
134
- version_requirements: *70274619874180
168
+ version_requirements: !ruby/object:Gem::Requirement
169
+ none: false
170
+ requirements:
171
+ - - ! '>='
172
+ - !ruby/object:Gem::Version
173
+ version: '0'
135
174
  - !ruby/object:Gem::Dependency
136
175
  name: slop
137
- requirement: &70274619873340 !ruby/object:Gem::Requirement
176
+ requirement: !ruby/object:Gem::Requirement
138
177
  none: false
139
178
  requirements:
140
179
  - - ! '>='
@@ -142,7 +181,12 @@ dependencies:
142
181
  version: '0'
143
182
  type: :runtime
144
183
  prerelease: false
145
- version_requirements: *70274619873340
184
+ version_requirements: !ruby/object:Gem::Requirement
185
+ none: false
186
+ requirements:
187
+ - - ! '>='
188
+ - !ruby/object:Gem::Version
189
+ version: '0'
146
190
  description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
147
191
  crawl extremely large sites which is much more performant than multi-threaded crawlers. It
148
192
  is also a standalone crawler that has a sophisticated statistics monitoring interface
@@ -156,16 +200,17 @@ files:
156
200
  - spec/cobweb/cobweb_crawl_helper_spec.rb
157
201
  - spec/cobweb/cobweb_crawl_spec.rb
158
202
  - spec/cobweb/cobweb_crawler_spec.rb
159
- - spec/cobweb/cobweb_job_spec.rb
160
203
  - spec/cobweb/cobweb_links_spec.rb
161
204
  - spec/cobweb/cobweb_spec.rb
162
205
  - spec/cobweb/content_link_parser_spec.rb
206
+ - spec/cobweb/crawl_job_spec.rb
207
+ - spec/cobweb/crawl_worker_spec.rb
163
208
  - spec/cobweb/robots_spec.rb
164
- - spec/cobweb/site_test_spec.rb.tmp
209
+ - spec/http_stubs.rb
165
210
  - spec/samples/robots.txt
166
211
  - spec/samples/sample_html_links.html
167
212
  - spec/samples/sample_server.rb
168
- - spec/samples/sample_site/boxgrid.html
213
+ - spec/samples/sample_site/boxgrid>withsillyname.html
169
214
  - spec/samples/sample_site/css/accordion.css
170
215
  - spec/samples/sample_site/css/datatable.css
171
216
  - spec/samples/sample_site/css/datepicker.css
@@ -343,8 +388,12 @@ files:
343
388
  - lib/cobweb_version.rb
344
389
  - lib/content_link_parser.rb
345
390
  - lib/crawl.rb
391
+ - lib/crawl_finished_worker.rb
392
+ - lib/crawl_helper.rb
346
393
  - lib/crawl_job.rb
347
394
  - lib/crawl_object.rb
395
+ - lib/crawl_process_worker.rb
396
+ - lib/crawl_worker.rb
348
397
  - lib/document.rb
349
398
  - lib/encoding_safe_process_job.rb
350
399
  - lib/export_command.rb
@@ -353,6 +402,7 @@ files:
353
402
  - lib/report_command.rb
354
403
  - lib/robots.rb
355
404
  - lib/server.rb
405
+ - lib/sidekiq/cobweb_helper.rb
356
406
  - lib/stats.rb
357
407
  - lib/string.rb
358
408
  - lib/uri_helper.rb
@@ -530,7 +580,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
530
580
  version: '0'
531
581
  requirements: []
532
582
  rubyforge_project:
533
- rubygems_version: 1.8.10
583
+ rubygems_version: 1.8.25
534
584
  signing_key:
535
585
  specification_version: 3
536
586
  summary: Cobweb is a web crawler that can use resque to cluster crawls to quickly
@@ -1,101 +0,0 @@
1
- require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
2
-
3
- describe Cobweb, :local_only => true do
4
-
5
- before(:all) do
6
- #store all existing resque process ids so we don't kill them afterwards
7
- @existing_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
8
-
9
- # START WORKERS ONLY FOR CRAWL QUEUE SO WE CAN COUNT ENQUEUED PROCESS AND FINISH QUEUES
10
- puts "Starting Workers... Please Wait..."
11
- `mkdir log`
12
- io = IO.popen("nohup rake resque:workers PIDFILE=./tmp/pids/resque.pid COUNT=3 QUEUE=cobweb_crawl_job > log/output.log &")
13
- puts "Workers Started."
14
-
15
- end
16
-
17
- before(:each) do
18
- @base_url = "http://localhost:3532/"
19
- @base_page_count = 77
20
- clear_queues
21
- end
22
-
23
- describe "with a crawl limit" do
24
- before(:each) do
25
- @request = {
26
- :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
27
- :quiet => true,
28
- :cache => nil,
29
- :use_encoding_safe_process_job => true,
30
- :crawl_limit_by_page => true
31
- }
32
- end
33
-
34
- describe "on ancestry.com.au" do
35
- describe "limited to 100" do
36
- before(:each) do
37
- @request[:crawl_limit] = 100
38
- @request[:valid_mime_types] = ["text/html"]
39
- @cobweb = Cobweb.new @request
40
- end
41
-
42
- it "should crawl 100 pages" do
43
- crawl = @cobweb.start("http://www.ancestry.com.au/")
44
- @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
45
- wait_for_crawl_finished crawl[:crawl_id], 180
46
- puts "Crawled #{Resque.size("encoding_safe_process_job")} pages."
47
- end
48
- end
49
-
50
- describe "limited to 999" do
51
- before(:each) do
52
- @request[:crawl_limit] = 999
53
- @cobweb = Cobweb.new @request
54
- end
55
-
56
- it "should crawl 999 pages" do
57
- crawl = @cobweb.start("http://www.ancestry.com.au/")
58
- @stat = Stats.new({:crawl_id => crawl[:crawl_id]})
59
- wait_for_crawl_finished crawl[:crawl_id], 720
60
- puts "Crawled #{Resque.size("encoding_safe_process_job")} pages."
61
- end
62
- end
63
- __END__
64
-
65
- end
66
-
67
- after(:all) do
68
-
69
- @all_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
70
- command = "kill -9 #{(@all_processes - @existing_processes).join(" ")}"
71
- IO.popen(command)
72
-
73
- clear_queues
74
- end
75
-
76
- end
77
-
78
- def wait_for_crawl_finished(crawl_id, timeout=20)
79
- counter = 0
80
- start_time = Time.now
81
- while(running?(crawl_id) && Time.now < start_time + timeout) do
82
- sleep 0.5
83
- end
84
- if Time.now > start_time + timeout
85
- raise "End of crawl not detected"
86
- end
87
- end
88
-
89
- def running?(crawl_id)
90
- @stat.get_status != CobwebCrawlHelper::FINISHED and @stat.get_status != CobwebCrawlHelper::CANCELLED
91
- end
92
-
93
- def clear_queues
94
- Resque.queues.each do |queue|
95
- Resque.remove_queue(queue)
96
- end
97
-
98
- Resque.size("cobweb_process_job").should == 0
99
- Resque.size("cobweb_finished_job").should == 0
100
- Resque.peek("cobweb_process_job", 0, 200).should be_empty
101
- end