cobweb 0.0.25 → 0.0.26

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
1
1
 
2
- h1. Cobweb v0.0.25
2
+ h1. Cobweb v0.0.26
3
3
 
4
4
  h2. Intro
5
5
 
@@ -19,7 +19,7 @@ class Cobweb
19
19
  # investigate using event machine for single threaded crawling
20
20
 
21
21
  def self.version
22
- "0.0.25"
22
+ "0.0.26"
23
23
  end
24
24
 
25
25
  def method_missing(method_sym, *arguments, &block)
@@ -77,8 +77,6 @@ class Cobweb
77
77
 
78
78
  raise "url cannot be nil" if url.nil?
79
79
 
80
- absolutize = Absolutize.new(url, :output_debug => false, :raise_exceptions => true, :force_escaping => false, :remove_anchors => true)
81
-
82
80
  # get the unique id for this request
83
81
  unique_id = Digest::SHA1.hexdigest(url.to_s)
84
82
  if options.has_key?(:redirect_limit) and !options[:redirect_limit].nil?
@@ -127,7 +125,7 @@ class Cobweb
127
125
  puts "redirected... " unless @options[:quiet]
128
126
 
129
127
  # get location to redirect to
130
- url = absolutize.url(response['location']).to_s
128
+ url = Addressable::URI.parse(response['location']).to_s
131
129
 
132
130
  # decrement redirect limit
133
131
  redirect_limit = redirect_limit - 1
@@ -234,8 +232,6 @@ class Cobweb
234
232
  def head(url, options = @options)
235
233
  raise "url cannot be nil" if url.nil?
236
234
 
237
- absolutize = Absolutize.new(url, :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
238
-
239
235
  # get the unique id for this request
240
236
  unique_id = Digest::SHA1.hexdigest(url)
241
237
  if options.has_key?(:redirect_limit) and !options[:redirect_limit].nil?
@@ -277,7 +273,7 @@ class Cobweb
277
273
 
278
274
  if @options[:follow_redirects] and response.code.to_i >= 300 and response.code.to_i < 400
279
275
  puts "redirected... " unless @options[:quiet]
280
- url = absolutize.url(response['location']).to_s
276
+ url = Addressable::URI.parse(response['location']).to_s
281
277
  redirect_limit = redirect_limit - 1
282
278
  options = options.clone
283
279
  options[:redirect_limit]=redirect_limit
@@ -22,8 +22,6 @@ class CobwebCrawler
22
22
 
23
23
  @crawl_options = crawl_options
24
24
 
25
- @absolutize = Absolutize.new(@options[:base_url], :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
26
-
27
25
  @redis.sadd "queued", base_url
28
26
  crawl_counter = @redis.scard("crawled").to_i
29
27
  queue_counter = @redis.scard("queued").to_i
@@ -12,7 +12,6 @@ class ContentLinkParser
12
12
  if @doc.at("base[href]")
13
13
  base_url = @doc.at("base[href]").attr("href").to_s
14
14
  end
15
- @absolutize = Absolutize.new(base_url, :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
16
15
 
17
16
  @options[:tags] = {}
18
17
  @options[:tags][:links] = [["a[href]", "href"], ["frame[src]", "src"], ["meta[@http-equiv=\"refresh\"]", "content"], ["link[href]:not([rel])", "href"], ["area[href]", "href"]]
@@ -57,15 +56,14 @@ class ContentLinkParser
57
56
  if attribute.kind_of? String or attribute.kind_of? Symbol
58
57
  @doc.css(selector).each do |tag|
59
58
  begin
60
- uri = @absolutize.url(tag[attribute])
61
- array << uri.to_s
59
+ array << Addressable::URI.parse(tag[attribute]).to_s
62
60
  rescue
63
61
  end
64
62
  end
65
63
  elsif attribute.instance_of? Regexp
66
64
  @doc.css(selector).each do |tag|
67
65
  begin
68
- tag.content.scan(attribute) {|match| array << @absolutize.url(match[0])}
66
+ tag.content.scan(attribute) {|match| array << Addressable::URI.parse(match[0]).to_s}
69
67
  rescue
70
68
  end
71
69
  end
@@ -13,7 +13,6 @@ class CrawlJob
13
13
 
14
14
  @redis = NamespacedRedis.new(Redis.new(content_request[:redis_options]), "cobweb-#{Cobweb.version}-#{content_request[:crawl_id]}")
15
15
 
16
- @absolutize = Absolutize.new(content_request[:url], :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
17
16
  @debug = content_request[:debug]
18
17
 
19
18
  refresh_counters
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.25
4
+ version: 0.0.26
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ date: 2012-03-13 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: resque
16
- requirement: &70349719636940 !ruby/object:Gem::Requirement
16
+ requirement: &70294818287920 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70349719636940
24
+ version_requirements: *70294818287920
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: redis
27
- requirement: &70349719636520 !ruby/object:Gem::Requirement
27
+ requirement: &70294818287500 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70349719636520
35
+ version_requirements: *70294818287500
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: nokogiri
38
- requirement: &70349719636100 !ruby/object:Gem::Requirement
38
+ requirement: &70294818287080 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,21 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70349719636100
47
- - !ruby/object:Gem::Dependency
48
- name: absolutize
49
- requirement: &70349719635660 !ruby/object:Gem::Requirement
50
- none: false
51
- requirements:
52
- - - ! '>='
53
- - !ruby/object:Gem::Version
54
- version: '0'
55
- type: :runtime
56
- prerelease: false
57
- version_requirements: *70349719635660
46
+ version_requirements: *70294818287080
58
47
  - !ruby/object:Gem::Dependency
59
48
  name: addressable
60
- requirement: &70349719635240 !ruby/object:Gem::Requirement
49
+ requirement: &70294818286640 !ruby/object:Gem::Requirement
61
50
  none: false
62
51
  requirements:
63
52
  - - ! '>='
@@ -65,10 +54,10 @@ dependencies:
65
54
  version: '0'
66
55
  type: :runtime
67
56
  prerelease: false
68
- version_requirements: *70349719635240
57
+ version_requirements: *70294818286640
69
58
  - !ruby/object:Gem::Dependency
70
59
  name: rspec
71
- requirement: &70349719634820 !ruby/object:Gem::Requirement
60
+ requirement: &70294818286220 !ruby/object:Gem::Requirement
72
61
  none: false
73
62
  requirements:
74
63
  - - ! '>='
@@ -76,10 +65,10 @@ dependencies:
76
65
  version: '0'
77
66
  type: :runtime
78
67
  prerelease: false
79
- version_requirements: *70349719634820
68
+ version_requirements: *70294818286220
80
69
  - !ruby/object:Gem::Dependency
81
70
  name: awesome_print
82
- requirement: &70349719634400 !ruby/object:Gem::Requirement
71
+ requirement: &70294818285800 !ruby/object:Gem::Requirement
83
72
  none: false
84
73
  requirements:
85
74
  - - ! '>='
@@ -87,10 +76,10 @@ dependencies:
87
76
  version: '0'
88
77
  type: :runtime
89
78
  prerelease: false
90
- version_requirements: *70349719634400
79
+ version_requirements: *70294818285800
91
80
  - !ruby/object:Gem::Dependency
92
81
  name: sinatra
93
- requirement: &70349719633980 !ruby/object:Gem::Requirement
82
+ requirement: &70294818285380 !ruby/object:Gem::Requirement
94
83
  none: false
95
84
  requirements:
96
85
  - - ! '>='
@@ -98,10 +87,10 @@ dependencies:
98
87
  version: '0'
99
88
  type: :runtime
100
89
  prerelease: false
101
- version_requirements: *70349719633980
90
+ version_requirements: *70294818285380
102
91
  - !ruby/object:Gem::Dependency
103
92
  name: thin
104
- requirement: &70349719633560 !ruby/object:Gem::Requirement
93
+ requirement: &70294818284960 !ruby/object:Gem::Requirement
105
94
  none: false
106
95
  requirements:
107
96
  - - ! '>='
@@ -109,10 +98,10 @@ dependencies:
109
98
  version: '0'
110
99
  type: :runtime
111
100
  prerelease: false
112
- version_requirements: *70349719633560
101
+ version_requirements: *70294818284960
113
102
  - !ruby/object:Gem::Dependency
114
103
  name: haml
115
- requirement: &70349719633140 !ruby/object:Gem::Requirement
104
+ requirement: &70294818284540 !ruby/object:Gem::Requirement
116
105
  none: false
117
106
  requirements:
118
107
  - - ! '>='
@@ -120,7 +109,7 @@ dependencies:
120
109
  version: '0'
121
110
  type: :runtime
122
111
  prerelease: false
123
- version_requirements: *70349719633140
112
+ version_requirements: *70294818284540
124
113
  description: Web Crawler that uses resque background job engine to allow you to cluster
125
114
  your crawl.
126
115
  email: stewart@rockwellcottage.com