cobweb 1.0.5 → 1.0.6
Sign up to get free protection for your applications and to get access to all the features.
- data/README.textile +1 -1
- data/lib/cobweb.rb +5 -4
- data/lib/cobweb_crawler.rb +10 -1
- data/lib/cobweb_version.rb +1 -1
- data/lib/stats.rb +5 -1
- data/spec/cobweb/cobweb_crawler_spec.rb +2 -2
- metadata +25 -80
data/README.textile
CHANGED
data/lib/cobweb.rb
CHANGED
@@ -48,6 +48,7 @@ class Cobweb
|
|
48
48
|
default_user_agent_to "cobweb/#{Cobweb.version} (ruby/#{RUBY_VERSION} nokogiri/#{Nokogiri::VERSION})"
|
49
49
|
default_valid_mime_types_to ["*/*"]
|
50
50
|
default_raise_exceptions_to false
|
51
|
+
default_store_refered_url_to false
|
51
52
|
|
52
53
|
end
|
53
54
|
|
@@ -139,19 +140,19 @@ class Cobweb
|
|
139
140
|
@http.read_timeout = @options[:timeout].to_i
|
140
141
|
@http.open_timeout = @options[:timeout].to_i
|
141
142
|
begin
|
142
|
-
puts "Retrieving #{
|
143
|
+
puts "Retrieving #{uri}... " unless @options[:quiet]
|
143
144
|
request_options={}
|
144
|
-
request_options['Cookie']= options[:cookies] if options
|
145
|
+
request_options['Cookie']= options[:cookies] if options[:cookies]
|
145
146
|
request_options['User-Agent']= options[:user_agent] if options.has_key?(:user_agent)
|
146
147
|
|
147
148
|
request = Net::HTTP::Get.new uri.request_uri, request_options
|
148
149
|
response = @http.request request
|
149
150
|
|
150
151
|
if @options[:follow_redirects] and response.code.to_i >= 300 and response.code.to_i < 400
|
151
|
-
puts "redirected... " unless @options[:quiet]
|
152
152
|
|
153
153
|
# get location to redirect to
|
154
154
|
uri = UriHelper.join_no_fragment(uri, response['location'])
|
155
|
+
puts "Following Redirect to #{uri}... " unless @options[:quiet]
|
155
156
|
|
156
157
|
# decrement redirect limit
|
157
158
|
redirect_limit = redirect_limit - 1
|
@@ -398,7 +399,7 @@ class Cobweb
|
|
398
399
|
end
|
399
400
|
|
400
401
|
end
|
401
|
-
|
402
|
+
|
402
403
|
# escapes characters with meaning in regular expressions and adds wildcard expression
|
403
404
|
def self.escape_pattern_for_regex(pattern)
|
404
405
|
pattern = pattern.gsub(".", "\\.")
|
data/lib/cobweb_crawler.rb
CHANGED
@@ -54,6 +54,7 @@ class CobwebCrawler
|
|
54
54
|
while queue_counter>0 && (@options[:crawl_limit].to_i == 0 || @options[:crawl_limit].to_i > crawl_counter)
|
55
55
|
thread = Thread.new do
|
56
56
|
|
57
|
+
|
57
58
|
url = @redis.spop "queued"
|
58
59
|
queue_counter = 0 if url.nil?
|
59
60
|
|
@@ -76,6 +77,8 @@ class CobwebCrawler
|
|
76
77
|
cobweb_links = CobwebLinks.new(@options)
|
77
78
|
|
78
79
|
internal_links = internal_links.select{|link| cobweb_links.internal?(link) || (@options[:crawl_linked_external] && cobweb_links.internal?(url.to_s))}
|
80
|
+
|
81
|
+
all_internal_links = internal_links
|
79
82
|
|
80
83
|
# reject the link if we've crawled it or queued it
|
81
84
|
internal_links.reject!{|link| @redis.sismember("crawled", link)}
|
@@ -91,6 +94,12 @@ class CobwebCrawler
|
|
91
94
|
@redis.hset "navigation", url, children
|
92
95
|
queue_counter += 1
|
93
96
|
end
|
97
|
+
|
98
|
+
if @options[:store_refered_url]
|
99
|
+
all_internal_links.each do |link|
|
100
|
+
@redis.sadd("inbound_links_#{Digest::MD5.hexdigest(link)}", url)
|
101
|
+
end
|
102
|
+
end
|
94
103
|
|
95
104
|
crawl_counter = @redis.scard("crawled").to_i
|
96
105
|
queue_counter = @redis.scard("queued").to_i
|
@@ -117,7 +126,7 @@ class CobwebCrawler
|
|
117
126
|
ensure
|
118
127
|
@stats.end_crawl(@options)
|
119
128
|
end
|
120
|
-
@stats
|
129
|
+
@stats
|
121
130
|
end
|
122
131
|
|
123
132
|
end
|
data/lib/cobweb_version.rb
CHANGED
data/lib/stats.rb
CHANGED
@@ -36,7 +36,11 @@ class Stats
|
|
36
36
|
def get_crawled
|
37
37
|
@redis.smembers "crawled"
|
38
38
|
end
|
39
|
-
|
39
|
+
|
40
|
+
def inbound_links_for(url, redis=@redis)
|
41
|
+
@redis.smembers("inbound_links_#{Digest::MD5.hexdigest(url)}")
|
42
|
+
end
|
43
|
+
|
40
44
|
# Returns statistics hash. update_statistics takes the content hash, extracts statistics from it and updates redis with the data.
|
41
45
|
def update_statistics(content, crawl_counter=@redis.scard("crawled").to_i, queue_counter=@redis.scard("queued").to_i)
|
42
46
|
|
data/spec/cobweb/cobweb_crawler_spec.rb
CHANGED
@@ -32,7 +32,7 @@ describe CobwebCrawler do
|
|
32
32
|
statistics = crawler.crawl(@base_url)
|
33
33
|
|
34
34
|
statistics.should_not be_nil
|
35
|
-
statistics.should be_an_instance_of Hash
|
35
|
+
statistics.get_statistics.should be_an_instance_of Hash
|
36
36
|
|
37
37
|
end
|
38
38
|
|
@@ -48,7 +48,7 @@ describe CobwebCrawler do
|
|
48
48
|
end
|
49
49
|
|
50
50
|
statistics.should_not be_nil
|
51
|
-
statistics.should be_an_instance_of Hash
|
51
|
+
statistics.get_statistics.should be_an_instance_of Hash
|
52
52
|
|
53
53
|
end
|
54
54
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cobweb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.5
|
4
|
+
version: 1.0.6
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-02-
|
12
|
+
date: 2013-02-11 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: resque
|
16
|
-
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirement: &70331050084220 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,15 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements:
|
25
|
-
none: false
|
26
|
-
requirements:
|
27
|
-
- - ! '>='
|
28
|
-
- !ruby/object:Gem::Version
|
29
|
-
version: '0'
|
24
|
+
version_requirements: *70331050084220
|
30
25
|
- !ruby/object:Gem::Dependency
|
31
26
|
name: redis
|
32
|
-
requirement: !ruby/object:Gem::Requirement
|
27
|
+
requirement: &70331050081420 !ruby/object:Gem::Requirement
|
33
28
|
none: false
|
34
29
|
requirements:
|
35
30
|
- - ! '>='
|
@@ -37,15 +32,10 @@ dependencies:
|
|
37
32
|
version: '0'
|
38
33
|
type: :runtime
|
39
34
|
prerelease: false
|
40
|
-
version_requirements:
|
41
|
-
none: false
|
42
|
-
requirements:
|
43
|
-
- - ! '>='
|
44
|
-
- !ruby/object:Gem::Version
|
45
|
-
version: '0'
|
35
|
+
version_requirements: *70331050081420
|
46
36
|
- !ruby/object:Gem::Dependency
|
47
37
|
name: nokogiri
|
48
|
-
requirement: !ruby/object:Gem::Requirement
|
38
|
+
requirement: &70331050075780 !ruby/object:Gem::Requirement
|
49
39
|
none: false
|
50
40
|
requirements:
|
51
41
|
- - ! '>='
|
@@ -53,15 +43,10 @@ dependencies:
|
|
53
43
|
version: '0'
|
54
44
|
type: :runtime
|
55
45
|
prerelease: false
|
56
|
-
version_requirements:
|
57
|
-
none: false
|
58
|
-
requirements:
|
59
|
-
- - ! '>='
|
60
|
-
- !ruby/object:Gem::Version
|
61
|
-
version: '0'
|
46
|
+
version_requirements: *70331050075780
|
62
47
|
- !ruby/object:Gem::Dependency
|
63
48
|
name: addressable
|
64
|
-
requirement: !ruby/object:Gem::Requirement
|
49
|
+
requirement: &70331050066140 !ruby/object:Gem::Requirement
|
65
50
|
none: false
|
66
51
|
requirements:
|
67
52
|
- - ! '>='
|
@@ -69,15 +54,10 @@ dependencies:
|
|
69
54
|
version: '0'
|
70
55
|
type: :runtime
|
71
56
|
prerelease: false
|
72
|
-
version_requirements:
|
73
|
-
none: false
|
74
|
-
requirements:
|
75
|
-
- - ! '>='
|
76
|
-
- !ruby/object:Gem::Version
|
77
|
-
version: '0'
|
57
|
+
version_requirements: *70331050066140
|
78
58
|
- !ruby/object:Gem::Dependency
|
79
59
|
name: rspec
|
80
|
-
requirement: !ruby/object:Gem::Requirement
|
60
|
+
requirement: &70331050054340 !ruby/object:Gem::Requirement
|
81
61
|
none: false
|
82
62
|
requirements:
|
83
63
|
- - ! '>='
|
@@ -85,15 +65,10 @@ dependencies:
|
|
85
65
|
version: '0'
|
86
66
|
type: :runtime
|
87
67
|
prerelease: false
|
88
|
-
version_requirements:
|
89
|
-
none: false
|
90
|
-
requirements:
|
91
|
-
- - ! '>='
|
92
|
-
- !ruby/object:Gem::Version
|
93
|
-
version: '0'
|
68
|
+
version_requirements: *70331050054340
|
94
69
|
- !ruby/object:Gem::Dependency
|
95
70
|
name: awesome_print
|
96
|
-
requirement: !ruby/object:Gem::Requirement
|
71
|
+
requirement: &70331050049640 !ruby/object:Gem::Requirement
|
97
72
|
none: false
|
98
73
|
requirements:
|
99
74
|
- - ! '>='
|
@@ -101,15 +76,10 @@ dependencies:
|
|
101
76
|
version: '0'
|
102
77
|
type: :runtime
|
103
78
|
prerelease: false
|
104
|
-
version_requirements:
|
105
|
-
none: false
|
106
|
-
requirements:
|
107
|
-
- - ! '>='
|
108
|
-
- !ruby/object:Gem::Version
|
109
|
-
version: '0'
|
79
|
+
version_requirements: *70331050049640
|
110
80
|
- !ruby/object:Gem::Dependency
|
111
81
|
name: sinatra
|
112
|
-
requirement: !ruby/object:Gem::Requirement
|
82
|
+
requirement: &70331050048160 !ruby/object:Gem::Requirement
|
113
83
|
none: false
|
114
84
|
requirements:
|
115
85
|
- - ! '>='
|
@@ -117,15 +87,10 @@ dependencies:
|
|
117
87
|
version: '0'
|
118
88
|
type: :runtime
|
119
89
|
prerelease: false
|
120
|
-
version_requirements:
|
121
|
-
none: false
|
122
|
-
requirements:
|
123
|
-
- - ! '>='
|
124
|
-
- !ruby/object:Gem::Version
|
125
|
-
version: '0'
|
90
|
+
version_requirements: *70331050048160
|
126
91
|
- !ruby/object:Gem::Dependency
|
127
92
|
name: thin
|
128
|
-
requirement: !ruby/object:Gem::Requirement
|
93
|
+
requirement: &70331050047400 !ruby/object:Gem::Requirement
|
129
94
|
none: false
|
130
95
|
requirements:
|
131
96
|
- - ! '>='
|
@@ -133,15 +98,10 @@ dependencies:
|
|
133
98
|
version: '0'
|
134
99
|
type: :runtime
|
135
100
|
prerelease: false
|
136
|
-
version_requirements:
|
137
|
-
none: false
|
138
|
-
requirements:
|
139
|
-
- - ! '>='
|
140
|
-
- !ruby/object:Gem::Version
|
141
|
-
version: '0'
|
101
|
+
version_requirements: *70331050047400
|
142
102
|
- !ruby/object:Gem::Dependency
|
143
103
|
name: haml
|
144
|
-
requirement: !ruby/object:Gem::Requirement
|
104
|
+
requirement: &70331050046440 !ruby/object:Gem::Requirement
|
145
105
|
none: false
|
146
106
|
requirements:
|
147
107
|
- - ! '>='
|
@@ -149,15 +109,10 @@ dependencies:
|
|
149
109
|
version: '0'
|
150
110
|
type: :runtime
|
151
111
|
prerelease: false
|
152
|
-
version_requirements:
|
153
|
-
none: false
|
154
|
-
requirements:
|
155
|
-
- - ! '>='
|
156
|
-
- !ruby/object:Gem::Version
|
157
|
-
version: '0'
|
112
|
+
version_requirements: *70331050046440
|
158
113
|
- !ruby/object:Gem::Dependency
|
159
114
|
name: namespaced_redis
|
160
|
-
requirement: !ruby/object:Gem::Requirement
|
115
|
+
requirement: &70331050045060 !ruby/object:Gem::Requirement
|
161
116
|
none: false
|
162
117
|
requirements:
|
163
118
|
- - ! '>='
|
@@ -165,15 +120,10 @@ dependencies:
|
|
165
120
|
version: '0'
|
166
121
|
type: :runtime
|
167
122
|
prerelease: false
|
168
|
-
version_requirements:
|
169
|
-
none: false
|
170
|
-
requirements:
|
171
|
-
- - ! '>='
|
172
|
-
- !ruby/object:Gem::Version
|
173
|
-
version: '0'
|
123
|
+
version_requirements: *70331050045060
|
174
124
|
- !ruby/object:Gem::Dependency
|
175
125
|
name: json
|
176
|
-
requirement: !ruby/object:Gem::Requirement
|
126
|
+
requirement: &70331050044240 !ruby/object:Gem::Requirement
|
177
127
|
none: false
|
178
128
|
requirements:
|
179
129
|
- - ! '>='
|
@@ -181,12 +131,7 @@ dependencies:
|
|
181
131
|
version: '0'
|
182
132
|
type: :runtime
|
183
133
|
prerelease: false
|
184
|
-
version_requirements:
|
185
|
-
none: false
|
186
|
-
requirements:
|
187
|
-
- - ! '>='
|
188
|
-
- !ruby/object:Gem::Version
|
189
|
-
version: '0'
|
134
|
+
version_requirements: *70331050044240
|
190
135
|
description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
|
191
136
|
crawl extremely large sites which is much more performant than multi-threaded crawlers. It
|
192
137
|
is also a standalone crawler that has a sophisticated statistics monitoring interface
|
@@ -569,7 +514,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
569
514
|
version: '0'
|
570
515
|
requirements: []
|
571
516
|
rubyforge_project:
|
572
|
-
rubygems_version: 1.8.
|
517
|
+
rubygems_version: 1.8.10
|
573
518
|
signing_key:
|
574
519
|
specification_version: 3
|
575
520
|
summary: Cobweb is a web crawler that can use resque to cluster crawls to quickly
|