cobweb 1.0.29 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.textile +2 -1
- data/lib/cobweb_version.rb +1 -1
- data/lib/content_link_parser.rb +5 -4
- data/lib/sidekiq/cobweb_helper.rb +2 -1
- data/spec/cobweb/cobweb_crawl_helper_spec.rb +1 -1
- data/spec/cobweb/crawl_worker_spec.rb +1 -1
- data/spec/spec_helper.rb +3 -3
- metadata +133 -35
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 854165929dc7a5e3e16d138515723cfd2d3f95b5
|
4
|
+
data.tar.gz: 8444c545e80547e41dcaeae817a81183ed4a8d54
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 181edda4c8fc822f52729a645046c6e3a8d11ad0ff3d5540cfcc055528563605c82eec5a5615074fe86f97c8655694dc75c29d61b8e324fed02291372e5f107a
|
7
|
+
data.tar.gz: 830a4aceeb58d6d5b43d284622a9054aea66a5a46aadabec319bf9eebeb3e20444e476a3a0db42afcc2344a6a5f7dae210aaed024055d6d7d25042f3cbd38f79
|
data/README.textile
CHANGED
@@ -1,7 +1,8 @@
|
|
1
|
-
h1. Cobweb v1.0
|
1
|
+
h1. Cobweb v1.1.0
|
2
2
|
|
3
3
|
"@cobweb_gem":https://twitter.com/cobweb_gem
|
4
4
|
!https://badge.fury.io/rb/cobweb.png!:http://badge.fury.io/rb/cobweb
|
5
|
+
!https://circleci.com/gh/stewartmckee/cobweb.svg?style=shield&circle-token=07357f0bd17ac67e21ea161fb9abdb35ecac4c2e!
|
5
6
|
!https://gemnasium.com/stewartmckee/cobweb.png!
|
6
7
|
!https://coveralls.io/repos/stewartmckee/cobweb/badge.png?branch=master(Coverage Status)!:https://coveralls.io/r/stewartmckee/cobweb
|
7
8
|
|
data/lib/cobweb_version.rb
CHANGED
data/lib/content_link_parser.rb
CHANGED
@@ -8,12 +8,11 @@ class ContentLinkParser
|
|
8
8
|
def initialize(url, content, options = {})
|
9
9
|
@options = {}.merge(options)
|
10
10
|
@url = url
|
11
|
+
@base_url = ''
|
11
12
|
@doc = Nokogiri::HTML(content)
|
12
13
|
|
13
|
-
base_url = @url.to_s
|
14
14
|
if @doc.at("base[href]")
|
15
|
-
base_url = @doc.at("base[href]").attr("href").to_s
|
16
|
-
@url = base_url if base_url
|
15
|
+
@base_url = @doc.at("base[href]").attr("href").to_s if @doc.at("base[href]").attr("href").to_s.present?
|
17
16
|
end
|
18
17
|
|
19
18
|
@options[:tags] = {}
|
@@ -46,7 +45,9 @@ class ContentLinkParser
|
|
46
45
|
options[:valid_schemes] = [:http, :https] unless options.has_key? :valid_schemes
|
47
46
|
data = link_data
|
48
47
|
links = data.keys.map{|key| data[key]}.flatten.uniq
|
49
|
-
links = links.map{|link| UriHelper.join_no_fragment(@url, link).to_s }
|
48
|
+
links = links.map{|link| UriHelper.join_no_fragment(@url, UriHelper.join_no_fragment(@base_url, link))}
|
49
|
+
.reject(&:nil?)
|
50
|
+
.map(&:to_s)
|
50
51
|
links = links.reject{|link| link =~ /\/([^\/]+?)\/\1\// }
|
51
52
|
links = links.reject{|link| link =~ /([^\/]+?)\/([^\/]+?)\/.*?\1\/\2/ }
|
52
53
|
links = links.select{|link| options[:valid_schemes].include? link.split(':')[0].to_sym}
|
data/lib/sidekiq/cobweb_helper.rb
CHANGED
@@ -2,6 +2,7 @@
|
|
2
2
|
if Gem::Specification.find_all_by_name("sidekiq", ">=1.0.0").count >= 1
|
3
3
|
SIDEKIQ_INSTALLED = true
|
4
4
|
require 'sidekiq'
|
5
|
+
require 'sidekiq/api'
|
5
6
|
else
|
6
7
|
SIDEKIQ_INSTALLED = false
|
7
8
|
puts "sidekiq gem not installed, skipping crawl_worker specs" if defined?(ENVIRONMENT) && ENVIRONMENT=="test"
|
@@ -29,4 +30,4 @@ module Sidekiq
|
|
29
30
|
end
|
30
31
|
end
|
31
32
|
end
|
32
|
-
end
|
33
|
+
end
|
@@ -212,7 +212,7 @@ def wait_for_crawl_finished(crawl_id, timeout=20)
|
|
212
212
|
@counter = 0
|
213
213
|
start_time = Time.now
|
214
214
|
while(running?(crawl_id) && Time.now < start_time + timeout) do
|
215
|
-
puts Sidekiq::Stats.new.queues
|
215
|
+
# puts Sidekiq::Stats.new.queues
|
216
216
|
sleep 1
|
217
217
|
end
|
218
218
|
if Time.now > start_time + timeout
|
data/spec/spec_helper.rb
CHANGED
@@ -35,13 +35,13 @@ RSpec.configure do |config|
|
|
35
35
|
}
|
36
36
|
|
37
37
|
config.before(:each) {
|
38
|
-
|
38
|
+
|
39
39
|
@redis_mock_object = MockRedis.new
|
40
40
|
Redis.stub(:new).and_return(@redis_mock_object)
|
41
41
|
Redis::Namespace.stub(:new).and_return(@redis_mock_object)
|
42
|
-
|
42
|
+
|
43
43
|
@redis_mock_object.flushdb
|
44
|
-
|
44
|
+
|
45
45
|
}
|
46
46
|
|
47
47
|
end
|
metadata
CHANGED
@@ -1,127 +1,225 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cobweb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.29
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Stewart McKee
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-11-
|
11
|
+
date: 2015-11-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rake
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
13
27
|
- !ruby/object:Gem::Dependency
|
14
28
|
name: redis
|
15
29
|
requirement: !ruby/object:Gem::Requirement
|
16
30
|
requirements:
|
17
|
-
- - "
|
31
|
+
- - ">="
|
18
32
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
33
|
+
version: 3.2.1
|
20
34
|
type: :runtime
|
21
35
|
prerelease: false
|
22
36
|
version_requirements: !ruby/object:Gem::Requirement
|
23
37
|
requirements:
|
24
|
-
- - "
|
38
|
+
- - ">="
|
25
39
|
- !ruby/object:Gem::Version
|
26
|
-
version:
|
40
|
+
version: 3.2.1
|
27
41
|
- !ruby/object:Gem::Dependency
|
28
42
|
name: nokogiri
|
29
43
|
requirement: !ruby/object:Gem::Requirement
|
30
44
|
requirements:
|
31
|
-
- - "
|
45
|
+
- - ">="
|
32
46
|
- !ruby/object:Gem::Version
|
33
|
-
version:
|
47
|
+
version: 1.6.6.2
|
34
48
|
type: :runtime
|
35
49
|
prerelease: false
|
36
50
|
version_requirements: !ruby/object:Gem::Requirement
|
37
51
|
requirements:
|
38
|
-
- - "
|
52
|
+
- - ">="
|
39
53
|
- !ruby/object:Gem::Version
|
40
|
-
version:
|
54
|
+
version: 1.6.6.2
|
41
55
|
- !ruby/object:Gem::Dependency
|
42
56
|
name: addressable
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|
44
58
|
requirements:
|
45
|
-
- - "
|
59
|
+
- - ">="
|
46
60
|
- !ruby/object:Gem::Version
|
47
|
-
version:
|
61
|
+
version: 2.3.8
|
48
62
|
type: :runtime
|
49
63
|
prerelease: false
|
50
64
|
version_requirements: !ruby/object:Gem::Requirement
|
51
65
|
requirements:
|
52
|
-
- - "
|
66
|
+
- - ">="
|
53
67
|
- !ruby/object:Gem::Version
|
54
|
-
version:
|
68
|
+
version: 2.3.8
|
55
69
|
- !ruby/object:Gem::Dependency
|
56
70
|
name: sinatra
|
57
71
|
requirement: !ruby/object:Gem::Requirement
|
58
72
|
requirements:
|
59
|
-
- - "
|
73
|
+
- - ">="
|
60
74
|
- !ruby/object:Gem::Version
|
61
|
-
version:
|
75
|
+
version: 1.4.6
|
62
76
|
type: :runtime
|
63
77
|
prerelease: false
|
64
78
|
version_requirements: !ruby/object:Gem::Requirement
|
65
79
|
requirements:
|
66
|
-
- - "
|
80
|
+
- - ">="
|
67
81
|
- !ruby/object:Gem::Version
|
68
|
-
version:
|
82
|
+
version: 1.4.6
|
69
83
|
- !ruby/object:Gem::Dependency
|
70
84
|
name: haml
|
71
85
|
requirement: !ruby/object:Gem::Requirement
|
72
86
|
requirements:
|
73
|
-
- - "
|
87
|
+
- - ">="
|
74
88
|
- !ruby/object:Gem::Version
|
75
|
-
version:
|
89
|
+
version: 4.0.7
|
76
90
|
type: :runtime
|
77
91
|
prerelease: false
|
78
92
|
version_requirements: !ruby/object:Gem::Requirement
|
79
93
|
requirements:
|
80
|
-
- - "
|
94
|
+
- - ">="
|
81
95
|
- !ruby/object:Gem::Version
|
82
|
-
version:
|
96
|
+
version: 4.0.7
|
83
97
|
- !ruby/object:Gem::Dependency
|
84
98
|
name: redis-namespace
|
85
99
|
requirement: !ruby/object:Gem::Requirement
|
86
100
|
requirements:
|
87
|
-
- - "
|
101
|
+
- - ">="
|
88
102
|
- !ruby/object:Gem::Version
|
89
|
-
version:
|
103
|
+
version: 1.5.2
|
90
104
|
type: :runtime
|
91
105
|
prerelease: false
|
92
106
|
version_requirements: !ruby/object:Gem::Requirement
|
93
107
|
requirements:
|
94
|
-
- - "
|
108
|
+
- - ">="
|
95
109
|
- !ruby/object:Gem::Version
|
96
|
-
version:
|
110
|
+
version: 1.5.2
|
97
111
|
- !ruby/object:Gem::Dependency
|
98
112
|
name: json
|
99
113
|
requirement: !ruby/object:Gem::Requirement
|
100
114
|
requirements:
|
101
|
-
- - "
|
115
|
+
- - ">="
|
102
116
|
- !ruby/object:Gem::Version
|
103
|
-
version:
|
117
|
+
version: 1.8.3
|
104
118
|
type: :runtime
|
105
119
|
prerelease: false
|
106
120
|
version_requirements: !ruby/object:Gem::Requirement
|
107
121
|
requirements:
|
108
|
-
- - "
|
122
|
+
- - ">="
|
109
123
|
- !ruby/object:Gem::Version
|
110
|
-
version:
|
124
|
+
version: 1.8.3
|
111
125
|
- !ruby/object:Gem::Dependency
|
112
126
|
name: slop
|
113
127
|
requirement: !ruby/object:Gem::Requirement
|
114
128
|
requirements:
|
115
|
-
- - "
|
129
|
+
- - ">="
|
116
130
|
- !ruby/object:Gem::Version
|
117
|
-
version:
|
131
|
+
version: 4.2.0
|
118
132
|
type: :runtime
|
119
133
|
prerelease: false
|
120
134
|
version_requirements: !ruby/object:Gem::Requirement
|
121
135
|
requirements:
|
122
|
-
- - "
|
136
|
+
- - ">="
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
version: 4.2.0
|
139
|
+
- !ruby/object:Gem::Dependency
|
140
|
+
name: rspec
|
141
|
+
requirement: !ruby/object:Gem::Requirement
|
142
|
+
requirements:
|
143
|
+
- - ">="
|
144
|
+
- !ruby/object:Gem::Version
|
145
|
+
version: '0'
|
146
|
+
type: :development
|
147
|
+
prerelease: false
|
148
|
+
version_requirements: !ruby/object:Gem::Requirement
|
149
|
+
requirements:
|
150
|
+
- - ">="
|
151
|
+
- !ruby/object:Gem::Version
|
152
|
+
version: '0'
|
153
|
+
- !ruby/object:Gem::Dependency
|
154
|
+
name: rspec-core
|
155
|
+
requirement: !ruby/object:Gem::Requirement
|
156
|
+
requirements:
|
157
|
+
- - ">="
|
158
|
+
- !ruby/object:Gem::Version
|
159
|
+
version: '0'
|
160
|
+
type: :development
|
161
|
+
prerelease: false
|
162
|
+
version_requirements: !ruby/object:Gem::Requirement
|
163
|
+
requirements:
|
164
|
+
- - ">="
|
165
|
+
- !ruby/object:Gem::Version
|
166
|
+
version: '0'
|
167
|
+
- !ruby/object:Gem::Dependency
|
168
|
+
name: mock_redis
|
169
|
+
requirement: !ruby/object:Gem::Requirement
|
170
|
+
requirements:
|
171
|
+
- - ">="
|
172
|
+
- !ruby/object:Gem::Version
|
173
|
+
version: '0'
|
174
|
+
type: :development
|
175
|
+
prerelease: false
|
176
|
+
version_requirements: !ruby/object:Gem::Requirement
|
177
|
+
requirements:
|
178
|
+
- - ">="
|
179
|
+
- !ruby/object:Gem::Version
|
180
|
+
version: '0'
|
181
|
+
- !ruby/object:Gem::Dependency
|
182
|
+
name: thin
|
183
|
+
requirement: !ruby/object:Gem::Requirement
|
184
|
+
requirements:
|
185
|
+
- - ">="
|
186
|
+
- !ruby/object:Gem::Version
|
187
|
+
version: '0'
|
188
|
+
type: :development
|
189
|
+
prerelease: false
|
190
|
+
version_requirements: !ruby/object:Gem::Requirement
|
191
|
+
requirements:
|
192
|
+
- - ">="
|
193
|
+
- !ruby/object:Gem::Version
|
194
|
+
version: '0'
|
195
|
+
- !ruby/object:Gem::Dependency
|
196
|
+
name: coveralls
|
197
|
+
requirement: !ruby/object:Gem::Requirement
|
198
|
+
requirements:
|
199
|
+
- - ">="
|
200
|
+
- !ruby/object:Gem::Version
|
201
|
+
version: '0'
|
202
|
+
type: :development
|
203
|
+
prerelease: false
|
204
|
+
version_requirements: !ruby/object:Gem::Requirement
|
205
|
+
requirements:
|
206
|
+
- - ">="
|
207
|
+
- !ruby/object:Gem::Version
|
208
|
+
version: '0'
|
209
|
+
- !ruby/object:Gem::Dependency
|
210
|
+
name: sidekiq
|
211
|
+
requirement: !ruby/object:Gem::Requirement
|
212
|
+
requirements:
|
213
|
+
- - ">="
|
214
|
+
- !ruby/object:Gem::Version
|
215
|
+
version: '0'
|
216
|
+
type: :development
|
217
|
+
prerelease: false
|
218
|
+
version_requirements: !ruby/object:Gem::Requirement
|
219
|
+
requirements:
|
220
|
+
- - ">="
|
123
221
|
- !ruby/object:Gem::Version
|
124
|
-
version: '
|
222
|
+
version: '0'
|
125
223
|
description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
|
126
224
|
crawl extremely large sites which is much more performant than multi-threaded crawlers. It
|
127
225
|
is also a standalone crawler that has a sophisticated statistics monitoring interface
|
@@ -518,7 +616,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
518
616
|
version: '0'
|
519
617
|
requirements: []
|
520
618
|
rubyforge_project:
|
521
|
-
rubygems_version: 2.4.
|
619
|
+
rubygems_version: 2.4.5.1
|
522
620
|
signing_key:
|
523
621
|
specification_version: 4
|
524
622
|
summary: Cobweb is a web crawler that can use resque to cluster crawls to quickly
|