cobweb 1.0.5 → 1.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.textile +1 -1
- data/lib/cobweb.rb +5 -4
- data/lib/cobweb_crawler.rb +10 -1
- data/lib/cobweb_version.rb +1 -1
- data/lib/stats.rb +5 -1
- data/spec/cobweb/cobweb_crawler_spec.rb +2 -2
- metadata +25 -80
data/README.textile
CHANGED
data/lib/cobweb.rb
CHANGED
@@ -48,6 +48,7 @@ class Cobweb
|
|
48
48
|
default_user_agent_to "cobweb/#{Cobweb.version} (ruby/#{RUBY_VERSION} nokogiri/#{Nokogiri::VERSION})"
|
49
49
|
default_valid_mime_types_to ["*/*"]
|
50
50
|
default_raise_exceptions_to false
|
51
|
+
default_store_refered_url_to false
|
51
52
|
|
52
53
|
end
|
53
54
|
|
@@ -139,19 +140,19 @@ class Cobweb
|
|
139
140
|
@http.read_timeout = @options[:timeout].to_i
|
140
141
|
@http.open_timeout = @options[:timeout].to_i
|
141
142
|
begin
|
142
|
-
puts "Retrieving #{
|
143
|
+
puts "Retrieving #{uri}... " unless @options[:quiet]
|
143
144
|
request_options={}
|
144
|
-
request_options['Cookie']= options[:cookies] if options
|
145
|
+
request_options['Cookie']= options[:cookies] if options[:cookies]
|
145
146
|
request_options['User-Agent']= options[:user_agent] if options.has_key?(:user_agent)
|
146
147
|
|
147
148
|
request = Net::HTTP::Get.new uri.request_uri, request_options
|
148
149
|
response = @http.request request
|
149
150
|
|
150
151
|
if @options[:follow_redirects] and response.code.to_i >= 300 and response.code.to_i < 400
|
151
|
-
puts "redirected... " unless @options[:quiet]
|
152
152
|
|
153
153
|
# get location to redirect to
|
154
154
|
uri = UriHelper.join_no_fragment(uri, response['location'])
|
155
|
+
puts "Following Redirect to #{uri}... " unless @options[:quiet]
|
155
156
|
|
156
157
|
# decrement redirect limit
|
157
158
|
redirect_limit = redirect_limit - 1
|
@@ -398,7 +399,7 @@ class Cobweb
|
|
398
399
|
end
|
399
400
|
|
400
401
|
end
|
401
|
-
|
402
|
+
|
402
403
|
# escapes characters with meaning in regular expressions and adds wildcard expression
|
403
404
|
def self.escape_pattern_for_regex(pattern)
|
404
405
|
pattern = pattern.gsub(".", "\\.")
|
data/lib/cobweb_crawler.rb
CHANGED
@@ -54,6 +54,7 @@ class CobwebCrawler
|
|
54
54
|
while queue_counter>0 && (@options[:crawl_limit].to_i == 0 || @options[:crawl_limit].to_i > crawl_counter)
|
55
55
|
thread = Thread.new do
|
56
56
|
|
57
|
+
|
57
58
|
url = @redis.spop "queued"
|
58
59
|
queue_counter = 0 if url.nil?
|
59
60
|
|
@@ -76,6 +77,8 @@ class CobwebCrawler
|
|
76
77
|
cobweb_links = CobwebLinks.new(@options)
|
77
78
|
|
78
79
|
internal_links = internal_links.select{|link| cobweb_links.internal?(link) || (@options[:crawl_linked_external] && cobweb_links.internal?(url.to_s))}
|
80
|
+
|
81
|
+
all_internal_links = internal_links
|
79
82
|
|
80
83
|
# reject the link if we've crawled it or queued it
|
81
84
|
internal_links.reject!{|link| @redis.sismember("crawled", link)}
|
@@ -91,6 +94,12 @@ class CobwebCrawler
|
|
91
94
|
@redis.hset "navigation", url, children
|
92
95
|
queue_counter += 1
|
93
96
|
end
|
97
|
+
|
98
|
+
if @options[:store_refered_url]
|
99
|
+
all_internal_links.each do |link|
|
100
|
+
@redis.sadd("inbound_links_#{Digest::MD5.hexdigest(link)}", url)
|
101
|
+
end
|
102
|
+
end
|
94
103
|
|
95
104
|
crawl_counter = @redis.scard("crawled").to_i
|
96
105
|
queue_counter = @redis.scard("queued").to_i
|
@@ -117,7 +126,7 @@ class CobwebCrawler
|
|
117
126
|
ensure
|
118
127
|
@stats.end_crawl(@options)
|
119
128
|
end
|
120
|
-
@stats
|
129
|
+
@stats
|
121
130
|
end
|
122
131
|
|
123
132
|
end
|
data/lib/cobweb_version.rb
CHANGED
data/lib/stats.rb
CHANGED
@@ -36,7 +36,11 @@ class Stats
|
|
36
36
|
def get_crawled
|
37
37
|
@redis.smembers "crawled"
|
38
38
|
end
|
39
|
-
|
39
|
+
|
40
|
+
def inbound_links_for(url, redis=@redis)
|
41
|
+
@redis.smembers("inbound_links_#{Digest::MD5.hexdigest(url)}")
|
42
|
+
end
|
43
|
+
|
40
44
|
# Returns statistics hash. update_statistics takes the content hash, extracts statistics from it and updates redis with the data.
|
41
45
|
def update_statistics(content, crawl_counter=@redis.scard("crawled").to_i, queue_counter=@redis.scard("queued").to_i)
|
42
46
|
|
data/spec/cobweb/cobweb_crawler_spec.rb
CHANGED
@@ -32,7 +32,7 @@ describe CobwebCrawler do
|
|
32
32
|
statistics = crawler.crawl(@base_url)
|
33
33
|
|
34
34
|
statistics.should_not be_nil
|
35
|
-
statistics.should be_an_instance_of Hash
|
35
|
+
statistics.get_statistics.should be_an_instance_of Hash
|
36
36
|
|
37
37
|
end
|
38
38
|
|
@@ -48,7 +48,7 @@ describe CobwebCrawler do
|
|
48
48
|
end
|
49
49
|
|
50
50
|
statistics.should_not be_nil
|
51
|
-
statistics.should be_an_instance_of Hash
|
51
|
+
statistics.get_statistics.should be_an_instance_of Hash
|
52
52
|
|
53
53
|
end
|
54
54
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cobweb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.5
|
4
|
+
version: 1.0.6
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-02-
|
12
|
+
date: 2013-02-11 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: resque
|
16
|
-
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirement: &70331050084220 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,15 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements:
|
25
|
-
none: false
|
26
|
-
requirements:
|
27
|
-
- - ! '>='
|
28
|
-
- !ruby/object:Gem::Version
|
29
|
-
version: '0'
|
24
|
+
version_requirements: *70331050084220
|
30
25
|
- !ruby/object:Gem::Dependency
|
31
26
|
name: redis
|
32
|
-
requirement: !ruby/object:Gem::Requirement
|
27
|
+
requirement: &70331050081420 !ruby/object:Gem::Requirement
|
33
28
|
none: false
|
34
29
|
requirements:
|
35
30
|
- - ! '>='
|
@@ -37,15 +32,10 @@ dependencies:
|
|
37
32
|
version: '0'
|
38
33
|
type: :runtime
|
39
34
|
prerelease: false
|
40
|
-
version_requirements:
|
41
|
-
none: false
|
42
|
-
requirements:
|
43
|
-
- - ! '>='
|
44
|
-
- !ruby/object:Gem::Version
|
45
|
-
version: '0'
|
35
|
+
version_requirements: *70331050081420
|
46
36
|
- !ruby/object:Gem::Dependency
|
47
37
|
name: nokogiri
|
48
|
-
requirement: !ruby/object:Gem::Requirement
|
38
|
+
requirement: &70331050075780 !ruby/object:Gem::Requirement
|
49
39
|
none: false
|
50
40
|
requirements:
|
51
41
|
- - ! '>='
|
@@ -53,15 +43,10 @@ dependencies:
|
|
53
43
|
version: '0'
|
54
44
|
type: :runtime
|
55
45
|
prerelease: false
|
56
|
-
version_requirements:
|
57
|
-
none: false
|
58
|
-
requirements:
|
59
|
-
- - ! '>='
|
60
|
-
- !ruby/object:Gem::Version
|
61
|
-
version: '0'
|
46
|
+
version_requirements: *70331050075780
|
62
47
|
- !ruby/object:Gem::Dependency
|
63
48
|
name: addressable
|
64
|
-
requirement: !ruby/object:Gem::Requirement
|
49
|
+
requirement: &70331050066140 !ruby/object:Gem::Requirement
|
65
50
|
none: false
|
66
51
|
requirements:
|
67
52
|
- - ! '>='
|
@@ -69,15 +54,10 @@ dependencies:
|
|
69
54
|
version: '0'
|
70
55
|
type: :runtime
|
71
56
|
prerelease: false
|
72
|
-
version_requirements:
|
73
|
-
none: false
|
74
|
-
requirements:
|
75
|
-
- - ! '>='
|
76
|
-
- !ruby/object:Gem::Version
|
77
|
-
version: '0'
|
57
|
+
version_requirements: *70331050066140
|
78
58
|
- !ruby/object:Gem::Dependency
|
79
59
|
name: rspec
|
80
|
-
requirement: !ruby/object:Gem::Requirement
|
60
|
+
requirement: &70331050054340 !ruby/object:Gem::Requirement
|
81
61
|
none: false
|
82
62
|
requirements:
|
83
63
|
- - ! '>='
|
@@ -85,15 +65,10 @@ dependencies:
|
|
85
65
|
version: '0'
|
86
66
|
type: :runtime
|
87
67
|
prerelease: false
|
88
|
-
version_requirements:
|
89
|
-
none: false
|
90
|
-
requirements:
|
91
|
-
- - ! '>='
|
92
|
-
- !ruby/object:Gem::Version
|
93
|
-
version: '0'
|
68
|
+
version_requirements: *70331050054340
|
94
69
|
- !ruby/object:Gem::Dependency
|
95
70
|
name: awesome_print
|
96
|
-
requirement: !ruby/object:Gem::Requirement
|
71
|
+
requirement: &70331050049640 !ruby/object:Gem::Requirement
|
97
72
|
none: false
|
98
73
|
requirements:
|
99
74
|
- - ! '>='
|
@@ -101,15 +76,10 @@ dependencies:
|
|
101
76
|
version: '0'
|
102
77
|
type: :runtime
|
103
78
|
prerelease: false
|
104
|
-
version_requirements:
|
105
|
-
none: false
|
106
|
-
requirements:
|
107
|
-
- - ! '>='
|
108
|
-
- !ruby/object:Gem::Version
|
109
|
-
version: '0'
|
79
|
+
version_requirements: *70331050049640
|
110
80
|
- !ruby/object:Gem::Dependency
|
111
81
|
name: sinatra
|
112
|
-
requirement: !ruby/object:Gem::Requirement
|
82
|
+
requirement: &70331050048160 !ruby/object:Gem::Requirement
|
113
83
|
none: false
|
114
84
|
requirements:
|
115
85
|
- - ! '>='
|
@@ -117,15 +87,10 @@ dependencies:
|
|
117
87
|
version: '0'
|
118
88
|
type: :runtime
|
119
89
|
prerelease: false
|
120
|
-
version_requirements:
|
121
|
-
none: false
|
122
|
-
requirements:
|
123
|
-
- - ! '>='
|
124
|
-
- !ruby/object:Gem::Version
|
125
|
-
version: '0'
|
90
|
+
version_requirements: *70331050048160
|
126
91
|
- !ruby/object:Gem::Dependency
|
127
92
|
name: thin
|
128
|
-
requirement: !ruby/object:Gem::Requirement
|
93
|
+
requirement: &70331050047400 !ruby/object:Gem::Requirement
|
129
94
|
none: false
|
130
95
|
requirements:
|
131
96
|
- - ! '>='
|
@@ -133,15 +98,10 @@ dependencies:
|
|
133
98
|
version: '0'
|
134
99
|
type: :runtime
|
135
100
|
prerelease: false
|
136
|
-
version_requirements:
|
137
|
-
none: false
|
138
|
-
requirements:
|
139
|
-
- - ! '>='
|
140
|
-
- !ruby/object:Gem::Version
|
141
|
-
version: '0'
|
101
|
+
version_requirements: *70331050047400
|
142
102
|
- !ruby/object:Gem::Dependency
|
143
103
|
name: haml
|
144
|
-
requirement: !ruby/object:Gem::Requirement
|
104
|
+
requirement: &70331050046440 !ruby/object:Gem::Requirement
|
145
105
|
none: false
|
146
106
|
requirements:
|
147
107
|
- - ! '>='
|
@@ -149,15 +109,10 @@ dependencies:
|
|
149
109
|
version: '0'
|
150
110
|
type: :runtime
|
151
111
|
prerelease: false
|
152
|
-
version_requirements:
|
153
|
-
none: false
|
154
|
-
requirements:
|
155
|
-
- - ! '>='
|
156
|
-
- !ruby/object:Gem::Version
|
157
|
-
version: '0'
|
112
|
+
version_requirements: *70331050046440
|
158
113
|
- !ruby/object:Gem::Dependency
|
159
114
|
name: namespaced_redis
|
160
|
-
requirement: !ruby/object:Gem::Requirement
|
115
|
+
requirement: &70331050045060 !ruby/object:Gem::Requirement
|
161
116
|
none: false
|
162
117
|
requirements:
|
163
118
|
- - ! '>='
|
@@ -165,15 +120,10 @@ dependencies:
|
|
165
120
|
version: '0'
|
166
121
|
type: :runtime
|
167
122
|
prerelease: false
|
168
|
-
version_requirements:
|
169
|
-
none: false
|
170
|
-
requirements:
|
171
|
-
- - ! '>='
|
172
|
-
- !ruby/object:Gem::Version
|
173
|
-
version: '0'
|
123
|
+
version_requirements: *70331050045060
|
174
124
|
- !ruby/object:Gem::Dependency
|
175
125
|
name: json
|
176
|
-
requirement: !ruby/object:Gem::Requirement
|
126
|
+
requirement: &70331050044240 !ruby/object:Gem::Requirement
|
177
127
|
none: false
|
178
128
|
requirements:
|
179
129
|
- - ! '>='
|
@@ -181,12 +131,7 @@ dependencies:
|
|
181
131
|
version: '0'
|
182
132
|
type: :runtime
|
183
133
|
prerelease: false
|
184
|
-
version_requirements:
|
185
|
-
none: false
|
186
|
-
requirements:
|
187
|
-
- - ! '>='
|
188
|
-
- !ruby/object:Gem::Version
|
189
|
-
version: '0'
|
134
|
+
version_requirements: *70331050044240
|
190
135
|
description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
|
191
136
|
crawl extremely large sites which is much more performant than multi-threaded crawlers. It
|
192
137
|
is also a standalone crawler that has a sophisticated statistics monitoring interface
|
@@ -569,7 +514,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
569
514
|
version: '0'
|
570
515
|
requirements: []
|
571
516
|
rubyforge_project:
|
572
|
-
rubygems_version: 1.8.
|
517
|
+
rubygems_version: 1.8.10
|
573
518
|
signing_key:
|
574
519
|
specification_version: 3
|
575
520
|
summary: Cobweb is a web crawler that can use resque to cluster crawls to quickly
|