cobweb 1.0.5 → 1.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
1
1
 
2
- h1. Cobweb v1.0.5
2
+ h1. Cobweb v1.0.6
3
3
 
4
4
  "@cobweb_gem":https://twitter.com/cobweb_gem
5
5
 
@@ -48,6 +48,7 @@ class Cobweb
48
48
  default_user_agent_to "cobweb/#{Cobweb.version} (ruby/#{RUBY_VERSION} nokogiri/#{Nokogiri::VERSION})"
49
49
  default_valid_mime_types_to ["*/*"]
50
50
  default_raise_exceptions_to false
51
+ default_store_refered_url_to false
51
52
 
52
53
  end
53
54
 
@@ -139,19 +140,19 @@ class Cobweb
139
140
  @http.read_timeout = @options[:timeout].to_i
140
141
  @http.open_timeout = @options[:timeout].to_i
141
142
  begin
142
- puts "Retrieving #{url }... " unless @options[:quiet]
143
+ puts "Retrieving #{uri}... " unless @options[:quiet]
143
144
  request_options={}
144
- request_options['Cookie']= options[:cookies] if options.has_key?(:cookies)
145
+ request_options['Cookie']= options[:cookies] if options[:cookies]
145
146
  request_options['User-Agent']= options[:user_agent] if options.has_key?(:user_agent)
146
147
 
147
148
  request = Net::HTTP::Get.new uri.request_uri, request_options
148
149
  response = @http.request request
149
150
 
150
151
  if @options[:follow_redirects] and response.code.to_i >= 300 and response.code.to_i < 400
151
- puts "redirected... " unless @options[:quiet]
152
152
 
153
153
  # get location to redirect to
154
154
  uri = UriHelper.join_no_fragment(uri, response['location'])
155
+ puts "Following Redirect to #{uri}... " unless @options[:quiet]
155
156
 
156
157
  # decrement redirect limit
157
158
  redirect_limit = redirect_limit - 1
@@ -398,7 +399,7 @@ class Cobweb
398
399
  end
399
400
 
400
401
  end
401
-
402
+
402
403
  # escapes characters with meaning in regular expressions and adds wildcard expression
403
404
  def self.escape_pattern_for_regex(pattern)
404
405
  pattern = pattern.gsub(".", "\\.")
@@ -54,6 +54,7 @@ class CobwebCrawler
54
54
  while queue_counter>0 && (@options[:crawl_limit].to_i == 0 || @options[:crawl_limit].to_i > crawl_counter)
55
55
  thread = Thread.new do
56
56
 
57
+
57
58
  url = @redis.spop "queued"
58
59
  queue_counter = 0 if url.nil?
59
60
 
@@ -76,6 +77,8 @@ class CobwebCrawler
76
77
  cobweb_links = CobwebLinks.new(@options)
77
78
 
78
79
  internal_links = internal_links.select{|link| cobweb_links.internal?(link) || (@options[:crawl_linked_external] && cobweb_links.internal?(url.to_s))}
80
+
81
+ all_internal_links = internal_links
79
82
 
80
83
  # reject the link if we've crawled it or queued it
81
84
  internal_links.reject!{|link| @redis.sismember("crawled", link)}
@@ -91,6 +94,12 @@ class CobwebCrawler
91
94
  @redis.hset "navigation", url, children
92
95
  queue_counter += 1
93
96
  end
97
+
98
+ if @options[:store_refered_url]
99
+ all_internal_links.each do |link|
100
+ @redis.sadd("inbound_links_#{Digest::MD5.hexdigest(link)}", url)
101
+ end
102
+ end
94
103
 
95
104
  crawl_counter = @redis.scard("crawled").to_i
96
105
  queue_counter = @redis.scard("queued").to_i
@@ -117,7 +126,7 @@ class CobwebCrawler
117
126
  ensure
118
127
  @stats.end_crawl(@options)
119
128
  end
120
- @stats.get_statistics
129
+ @stats
121
130
  end
122
131
 
123
132
  end
@@ -3,7 +3,7 @@ class CobwebVersion
3
3
 
4
4
  # Returns a string of the current version
5
5
  def self.version
6
- "1.0.5"
6
+ "1.0.6"
7
7
  end
8
8
 
9
9
  end
@@ -36,7 +36,11 @@ class Stats
36
36
  def get_crawled
37
37
  @redis.smembers "crawled"
38
38
  end
39
-
39
+
40
+ def inbound_links_for(url, redis=@redis)
41
+ @redis.smembers("inbound_links_#{Digest::MD5.hexdigest(url)}")
42
+ end
43
+
40
44
  # Returns statistics hash. update_statistics takes the content hash, extracts statistics from it and updates redis with the data.
41
45
  def update_statistics(content, crawl_counter=@redis.scard("crawled").to_i, queue_counter=@redis.scard("queued").to_i)
42
46
 
@@ -32,7 +32,7 @@ describe CobwebCrawler do
32
32
  statistics = crawler.crawl(@base_url)
33
33
 
34
34
  statistics.should_not be_nil
35
- statistics.should be_an_instance_of Hash
35
+ statistics.get_statistics.should be_an_instance_of Hash
36
36
 
37
37
  end
38
38
 
@@ -48,7 +48,7 @@ describe CobwebCrawler do
48
48
  end
49
49
 
50
50
  statistics.should_not be_nil
51
- statistics.should be_an_instance_of Hash
51
+ statistics.get_statistics.should be_an_instance_of Hash
52
52
 
53
53
  end
54
54
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.5
4
+ version: 1.0.6
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-02-08 00:00:00.000000000 Z
12
+ date: 2013-02-11 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: resque
16
- requirement: !ruby/object:Gem::Requirement
16
+ requirement: &70331050084220 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,15 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: !ruby/object:Gem::Requirement
25
- none: false
26
- requirements:
27
- - - ! '>='
28
- - !ruby/object:Gem::Version
29
- version: '0'
24
+ version_requirements: *70331050084220
30
25
  - !ruby/object:Gem::Dependency
31
26
  name: redis
32
- requirement: !ruby/object:Gem::Requirement
27
+ requirement: &70331050081420 !ruby/object:Gem::Requirement
33
28
  none: false
34
29
  requirements:
35
30
  - - ! '>='
@@ -37,15 +32,10 @@ dependencies:
37
32
  version: '0'
38
33
  type: :runtime
39
34
  prerelease: false
40
- version_requirements: !ruby/object:Gem::Requirement
41
- none: false
42
- requirements:
43
- - - ! '>='
44
- - !ruby/object:Gem::Version
45
- version: '0'
35
+ version_requirements: *70331050081420
46
36
  - !ruby/object:Gem::Dependency
47
37
  name: nokogiri
48
- requirement: !ruby/object:Gem::Requirement
38
+ requirement: &70331050075780 !ruby/object:Gem::Requirement
49
39
  none: false
50
40
  requirements:
51
41
  - - ! '>='
@@ -53,15 +43,10 @@ dependencies:
53
43
  version: '0'
54
44
  type: :runtime
55
45
  prerelease: false
56
- version_requirements: !ruby/object:Gem::Requirement
57
- none: false
58
- requirements:
59
- - - ! '>='
60
- - !ruby/object:Gem::Version
61
- version: '0'
46
+ version_requirements: *70331050075780
62
47
  - !ruby/object:Gem::Dependency
63
48
  name: addressable
64
- requirement: !ruby/object:Gem::Requirement
49
+ requirement: &70331050066140 !ruby/object:Gem::Requirement
65
50
  none: false
66
51
  requirements:
67
52
  - - ! '>='
@@ -69,15 +54,10 @@ dependencies:
69
54
  version: '0'
70
55
  type: :runtime
71
56
  prerelease: false
72
- version_requirements: !ruby/object:Gem::Requirement
73
- none: false
74
- requirements:
75
- - - ! '>='
76
- - !ruby/object:Gem::Version
77
- version: '0'
57
+ version_requirements: *70331050066140
78
58
  - !ruby/object:Gem::Dependency
79
59
  name: rspec
80
- requirement: !ruby/object:Gem::Requirement
60
+ requirement: &70331050054340 !ruby/object:Gem::Requirement
81
61
  none: false
82
62
  requirements:
83
63
  - - ! '>='
@@ -85,15 +65,10 @@ dependencies:
85
65
  version: '0'
86
66
  type: :runtime
87
67
  prerelease: false
88
- version_requirements: !ruby/object:Gem::Requirement
89
- none: false
90
- requirements:
91
- - - ! '>='
92
- - !ruby/object:Gem::Version
93
- version: '0'
68
+ version_requirements: *70331050054340
94
69
  - !ruby/object:Gem::Dependency
95
70
  name: awesome_print
96
- requirement: !ruby/object:Gem::Requirement
71
+ requirement: &70331050049640 !ruby/object:Gem::Requirement
97
72
  none: false
98
73
  requirements:
99
74
  - - ! '>='
@@ -101,15 +76,10 @@ dependencies:
101
76
  version: '0'
102
77
  type: :runtime
103
78
  prerelease: false
104
- version_requirements: !ruby/object:Gem::Requirement
105
- none: false
106
- requirements:
107
- - - ! '>='
108
- - !ruby/object:Gem::Version
109
- version: '0'
79
+ version_requirements: *70331050049640
110
80
  - !ruby/object:Gem::Dependency
111
81
  name: sinatra
112
- requirement: !ruby/object:Gem::Requirement
82
+ requirement: &70331050048160 !ruby/object:Gem::Requirement
113
83
  none: false
114
84
  requirements:
115
85
  - - ! '>='
@@ -117,15 +87,10 @@ dependencies:
117
87
  version: '0'
118
88
  type: :runtime
119
89
  prerelease: false
120
- version_requirements: !ruby/object:Gem::Requirement
121
- none: false
122
- requirements:
123
- - - ! '>='
124
- - !ruby/object:Gem::Version
125
- version: '0'
90
+ version_requirements: *70331050048160
126
91
  - !ruby/object:Gem::Dependency
127
92
  name: thin
128
- requirement: !ruby/object:Gem::Requirement
93
+ requirement: &70331050047400 !ruby/object:Gem::Requirement
129
94
  none: false
130
95
  requirements:
131
96
  - - ! '>='
@@ -133,15 +98,10 @@ dependencies:
133
98
  version: '0'
134
99
  type: :runtime
135
100
  prerelease: false
136
- version_requirements: !ruby/object:Gem::Requirement
137
- none: false
138
- requirements:
139
- - - ! '>='
140
- - !ruby/object:Gem::Version
141
- version: '0'
101
+ version_requirements: *70331050047400
142
102
  - !ruby/object:Gem::Dependency
143
103
  name: haml
144
- requirement: !ruby/object:Gem::Requirement
104
+ requirement: &70331050046440 !ruby/object:Gem::Requirement
145
105
  none: false
146
106
  requirements:
147
107
  - - ! '>='
@@ -149,15 +109,10 @@ dependencies:
149
109
  version: '0'
150
110
  type: :runtime
151
111
  prerelease: false
152
- version_requirements: !ruby/object:Gem::Requirement
153
- none: false
154
- requirements:
155
- - - ! '>='
156
- - !ruby/object:Gem::Version
157
- version: '0'
112
+ version_requirements: *70331050046440
158
113
  - !ruby/object:Gem::Dependency
159
114
  name: namespaced_redis
160
- requirement: !ruby/object:Gem::Requirement
115
+ requirement: &70331050045060 !ruby/object:Gem::Requirement
161
116
  none: false
162
117
  requirements:
163
118
  - - ! '>='
@@ -165,15 +120,10 @@ dependencies:
165
120
  version: '0'
166
121
  type: :runtime
167
122
  prerelease: false
168
- version_requirements: !ruby/object:Gem::Requirement
169
- none: false
170
- requirements:
171
- - - ! '>='
172
- - !ruby/object:Gem::Version
173
- version: '0'
123
+ version_requirements: *70331050045060
174
124
  - !ruby/object:Gem::Dependency
175
125
  name: json
176
- requirement: !ruby/object:Gem::Requirement
126
+ requirement: &70331050044240 !ruby/object:Gem::Requirement
177
127
  none: false
178
128
  requirements:
179
129
  - - ! '>='
@@ -181,12 +131,7 @@ dependencies:
181
131
  version: '0'
182
132
  type: :runtime
183
133
  prerelease: false
184
- version_requirements: !ruby/object:Gem::Requirement
185
- none: false
186
- requirements:
187
- - - ! '>='
188
- - !ruby/object:Gem::Version
189
- version: '0'
134
+ version_requirements: *70331050044240
190
135
  description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
191
136
  crawl extremely large sites which is much more performant than multi-threaded crawlers. It
192
137
  is also a standalone crawler that has a sophisticated statistics monitoring interface
@@ -569,7 +514,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
569
514
  version: '0'
570
515
  requirements: []
571
516
  rubyforge_project:
572
- rubygems_version: 1.8.24
517
+ rubygems_version: 1.8.10
573
518
  signing_key:
574
519
  specification_version: 3
575
520
  summary: Cobweb is a web crawler that can use resque to cluster crawls to quickly