cobweb 1.0.5 → 1.0.6

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,5 +1,5 @@
1
1
 
2
- h1. Cobweb v1.0.5
2
+ h1. Cobweb v1.0.6
3
3
 
4
4
  "@cobweb_gem":https://twitter.com/cobweb_gem
5
5
 
@@ -48,6 +48,7 @@ class Cobweb
48
48
  default_user_agent_to "cobweb/#{Cobweb.version} (ruby/#{RUBY_VERSION} nokogiri/#{Nokogiri::VERSION})"
49
49
  default_valid_mime_types_to ["*/*"]
50
50
  default_raise_exceptions_to false
51
+ default_store_refered_url_to false
51
52
 
52
53
  end
53
54
 
@@ -139,19 +140,19 @@ class Cobweb
139
140
  @http.read_timeout = @options[:timeout].to_i
140
141
  @http.open_timeout = @options[:timeout].to_i
141
142
  begin
142
- puts "Retrieving #{url }... " unless @options[:quiet]
143
+ puts "Retrieving #{uri}... " unless @options[:quiet]
143
144
  request_options={}
144
- request_options['Cookie']= options[:cookies] if options.has_key?(:cookies)
145
+ request_options['Cookie']= options[:cookies] if options[:cookies]
145
146
  request_options['User-Agent']= options[:user_agent] if options.has_key?(:user_agent)
146
147
 
147
148
  request = Net::HTTP::Get.new uri.request_uri, request_options
148
149
  response = @http.request request
149
150
 
150
151
  if @options[:follow_redirects] and response.code.to_i >= 300 and response.code.to_i < 400
151
- puts "redirected... " unless @options[:quiet]
152
152
 
153
153
  # get location to redirect to
154
154
  uri = UriHelper.join_no_fragment(uri, response['location'])
155
+ puts "Following Redirect to #{uri}... " unless @options[:quiet]
155
156
 
156
157
  # decrement redirect limit
157
158
  redirect_limit = redirect_limit - 1
@@ -398,7 +399,7 @@ class Cobweb
398
399
  end
399
400
 
400
401
  end
401
-
402
+
402
403
  # escapes characters with meaning in regular expressions and adds wildcard expression
403
404
  def self.escape_pattern_for_regex(pattern)
404
405
  pattern = pattern.gsub(".", "\\.")
@@ -54,6 +54,7 @@ class CobwebCrawler
54
54
  while queue_counter>0 && (@options[:crawl_limit].to_i == 0 || @options[:crawl_limit].to_i > crawl_counter)
55
55
  thread = Thread.new do
56
56
 
57
+
57
58
  url = @redis.spop "queued"
58
59
  queue_counter = 0 if url.nil?
59
60
 
@@ -76,6 +77,8 @@ class CobwebCrawler
76
77
  cobweb_links = CobwebLinks.new(@options)
77
78
 
78
79
  internal_links = internal_links.select{|link| cobweb_links.internal?(link) || (@options[:crawl_linked_external] && cobweb_links.internal?(url.to_s))}
80
+
81
+ all_internal_links = internal_links
79
82
 
80
83
  # reject the link if we've crawled it or queued it
81
84
  internal_links.reject!{|link| @redis.sismember("crawled", link)}
@@ -91,6 +94,12 @@ class CobwebCrawler
91
94
  @redis.hset "navigation", url, children
92
95
  queue_counter += 1
93
96
  end
97
+
98
+ if @options[:store_refered_url]
99
+ all_internal_links.each do |link|
100
+ @redis.sadd("inbound_links_#{Digest::MD5.hexdigest(link)}", url)
101
+ end
102
+ end
94
103
 
95
104
  crawl_counter = @redis.scard("crawled").to_i
96
105
  queue_counter = @redis.scard("queued").to_i
@@ -117,7 +126,7 @@ class CobwebCrawler
117
126
  ensure
118
127
  @stats.end_crawl(@options)
119
128
  end
120
- @stats.get_statistics
129
+ @stats
121
130
  end
122
131
 
123
132
  end
@@ -3,7 +3,7 @@ class CobwebVersion
3
3
 
4
4
  # Returns a string of the current version
5
5
  def self.version
6
- "1.0.5"
6
+ "1.0.6"
7
7
  end
8
8
 
9
9
  end
@@ -36,7 +36,11 @@ class Stats
36
36
  def get_crawled
37
37
  @redis.smembers "crawled"
38
38
  end
39
-
39
+
40
+ def inbound_links_for(url, redis=@redis)
41
+ @redis.smembers("inbound_links_#{Digest::MD5.hexdigest(url)}")
42
+ end
43
+
40
44
  # Returns statistics hash. update_statistics takes the content hash, extracts statistics from it and updates redis with the data.
41
45
  def update_statistics(content, crawl_counter=@redis.scard("crawled").to_i, queue_counter=@redis.scard("queued").to_i)
42
46
 
@@ -32,7 +32,7 @@ describe CobwebCrawler do
32
32
  statistics = crawler.crawl(@base_url)
33
33
 
34
34
  statistics.should_not be_nil
35
- statistics.should be_an_instance_of Hash
35
+ statistics.get_statistics.should be_an_instance_of Hash
36
36
 
37
37
  end
38
38
 
@@ -48,7 +48,7 @@ describe CobwebCrawler do
48
48
  end
49
49
 
50
50
  statistics.should_not be_nil
51
- statistics.should be_an_instance_of Hash
51
+ statistics.get_statistics.should be_an_instance_of Hash
52
52
 
53
53
  end
54
54
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.5
4
+ version: 1.0.6
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-02-08 00:00:00.000000000 Z
12
+ date: 2013-02-11 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: resque
16
- requirement: !ruby/object:Gem::Requirement
16
+ requirement: &70331050084220 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,15 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: !ruby/object:Gem::Requirement
25
- none: false
26
- requirements:
27
- - - ! '>='
28
- - !ruby/object:Gem::Version
29
- version: '0'
24
+ version_requirements: *70331050084220
30
25
  - !ruby/object:Gem::Dependency
31
26
  name: redis
32
- requirement: !ruby/object:Gem::Requirement
27
+ requirement: &70331050081420 !ruby/object:Gem::Requirement
33
28
  none: false
34
29
  requirements:
35
30
  - - ! '>='
@@ -37,15 +32,10 @@ dependencies:
37
32
  version: '0'
38
33
  type: :runtime
39
34
  prerelease: false
40
- version_requirements: !ruby/object:Gem::Requirement
41
- none: false
42
- requirements:
43
- - - ! '>='
44
- - !ruby/object:Gem::Version
45
- version: '0'
35
+ version_requirements: *70331050081420
46
36
  - !ruby/object:Gem::Dependency
47
37
  name: nokogiri
48
- requirement: !ruby/object:Gem::Requirement
38
+ requirement: &70331050075780 !ruby/object:Gem::Requirement
49
39
  none: false
50
40
  requirements:
51
41
  - - ! '>='
@@ -53,15 +43,10 @@ dependencies:
53
43
  version: '0'
54
44
  type: :runtime
55
45
  prerelease: false
56
- version_requirements: !ruby/object:Gem::Requirement
57
- none: false
58
- requirements:
59
- - - ! '>='
60
- - !ruby/object:Gem::Version
61
- version: '0'
46
+ version_requirements: *70331050075780
62
47
  - !ruby/object:Gem::Dependency
63
48
  name: addressable
64
- requirement: !ruby/object:Gem::Requirement
49
+ requirement: &70331050066140 !ruby/object:Gem::Requirement
65
50
  none: false
66
51
  requirements:
67
52
  - - ! '>='
@@ -69,15 +54,10 @@ dependencies:
69
54
  version: '0'
70
55
  type: :runtime
71
56
  prerelease: false
72
- version_requirements: !ruby/object:Gem::Requirement
73
- none: false
74
- requirements:
75
- - - ! '>='
76
- - !ruby/object:Gem::Version
77
- version: '0'
57
+ version_requirements: *70331050066140
78
58
  - !ruby/object:Gem::Dependency
79
59
  name: rspec
80
- requirement: !ruby/object:Gem::Requirement
60
+ requirement: &70331050054340 !ruby/object:Gem::Requirement
81
61
  none: false
82
62
  requirements:
83
63
  - - ! '>='
@@ -85,15 +65,10 @@ dependencies:
85
65
  version: '0'
86
66
  type: :runtime
87
67
  prerelease: false
88
- version_requirements: !ruby/object:Gem::Requirement
89
- none: false
90
- requirements:
91
- - - ! '>='
92
- - !ruby/object:Gem::Version
93
- version: '0'
68
+ version_requirements: *70331050054340
94
69
  - !ruby/object:Gem::Dependency
95
70
  name: awesome_print
96
- requirement: !ruby/object:Gem::Requirement
71
+ requirement: &70331050049640 !ruby/object:Gem::Requirement
97
72
  none: false
98
73
  requirements:
99
74
  - - ! '>='
@@ -101,15 +76,10 @@ dependencies:
101
76
  version: '0'
102
77
  type: :runtime
103
78
  prerelease: false
104
- version_requirements: !ruby/object:Gem::Requirement
105
- none: false
106
- requirements:
107
- - - ! '>='
108
- - !ruby/object:Gem::Version
109
- version: '0'
79
+ version_requirements: *70331050049640
110
80
  - !ruby/object:Gem::Dependency
111
81
  name: sinatra
112
- requirement: !ruby/object:Gem::Requirement
82
+ requirement: &70331050048160 !ruby/object:Gem::Requirement
113
83
  none: false
114
84
  requirements:
115
85
  - - ! '>='
@@ -117,15 +87,10 @@ dependencies:
117
87
  version: '0'
118
88
  type: :runtime
119
89
  prerelease: false
120
- version_requirements: !ruby/object:Gem::Requirement
121
- none: false
122
- requirements:
123
- - - ! '>='
124
- - !ruby/object:Gem::Version
125
- version: '0'
90
+ version_requirements: *70331050048160
126
91
  - !ruby/object:Gem::Dependency
127
92
  name: thin
128
- requirement: !ruby/object:Gem::Requirement
93
+ requirement: &70331050047400 !ruby/object:Gem::Requirement
129
94
  none: false
130
95
  requirements:
131
96
  - - ! '>='
@@ -133,15 +98,10 @@ dependencies:
133
98
  version: '0'
134
99
  type: :runtime
135
100
  prerelease: false
136
- version_requirements: !ruby/object:Gem::Requirement
137
- none: false
138
- requirements:
139
- - - ! '>='
140
- - !ruby/object:Gem::Version
141
- version: '0'
101
+ version_requirements: *70331050047400
142
102
  - !ruby/object:Gem::Dependency
143
103
  name: haml
144
- requirement: !ruby/object:Gem::Requirement
104
+ requirement: &70331050046440 !ruby/object:Gem::Requirement
145
105
  none: false
146
106
  requirements:
147
107
  - - ! '>='
@@ -149,15 +109,10 @@ dependencies:
149
109
  version: '0'
150
110
  type: :runtime
151
111
  prerelease: false
152
- version_requirements: !ruby/object:Gem::Requirement
153
- none: false
154
- requirements:
155
- - - ! '>='
156
- - !ruby/object:Gem::Version
157
- version: '0'
112
+ version_requirements: *70331050046440
158
113
  - !ruby/object:Gem::Dependency
159
114
  name: namespaced_redis
160
- requirement: !ruby/object:Gem::Requirement
115
+ requirement: &70331050045060 !ruby/object:Gem::Requirement
161
116
  none: false
162
117
  requirements:
163
118
  - - ! '>='
@@ -165,15 +120,10 @@ dependencies:
165
120
  version: '0'
166
121
  type: :runtime
167
122
  prerelease: false
168
- version_requirements: !ruby/object:Gem::Requirement
169
- none: false
170
- requirements:
171
- - - ! '>='
172
- - !ruby/object:Gem::Version
173
- version: '0'
123
+ version_requirements: *70331050045060
174
124
  - !ruby/object:Gem::Dependency
175
125
  name: json
176
- requirement: !ruby/object:Gem::Requirement
126
+ requirement: &70331050044240 !ruby/object:Gem::Requirement
177
127
  none: false
178
128
  requirements:
179
129
  - - ! '>='
@@ -181,12 +131,7 @@ dependencies:
181
131
  version: '0'
182
132
  type: :runtime
183
133
  prerelease: false
184
- version_requirements: !ruby/object:Gem::Requirement
185
- none: false
186
- requirements:
187
- - - ! '>='
188
- - !ruby/object:Gem::Version
189
- version: '0'
134
+ version_requirements: *70331050044240
190
135
  description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
191
136
  crawl extremely large sites which is much more performant than multi-threaded crawlers. It
192
137
  is also a standalone crawler that has a sophisticated statistics monitoring interface
@@ -569,7 +514,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
569
514
  version: '0'
570
515
  requirements: []
571
516
  rubyforge_project:
572
- rubygems_version: 1.8.24
517
+ rubygems_version: 1.8.10
573
518
  signing_key:
574
519
  specification_version: 3
575
520
  summary: Cobweb is a web crawler that can use resque to cluster crawls to quickly