cobweb 0.0.25 → 0.0.26

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,5 +1,5 @@
1
1
 
2
- h1. Cobweb v0.0.25
2
+ h1. Cobweb v0.0.26
3
3
 
4
4
  h2. Intro
5
5
 
@@ -19,7 +19,7 @@ class Cobweb
19
19
  # investigate using event machine for single threaded crawling
20
20
 
21
21
  def self.version
22
- "0.0.25"
22
+ "0.0.26"
23
23
  end
24
24
 
25
25
  def method_missing(method_sym, *arguments, &block)
@@ -77,8 +77,6 @@ class Cobweb
77
77
 
78
78
  raise "url cannot be nil" if url.nil?
79
79
 
80
- absolutize = Absolutize.new(url, :output_debug => false, :raise_exceptions => true, :force_escaping => false, :remove_anchors => true)
81
-
82
80
  # get the unique id for this request
83
81
  unique_id = Digest::SHA1.hexdigest(url.to_s)
84
82
  if options.has_key?(:redirect_limit) and !options[:redirect_limit].nil?
@@ -127,7 +125,7 @@ class Cobweb
127
125
  puts "redirected... " unless @options[:quiet]
128
126
 
129
127
  # get location to redirect to
130
- url = absolutize.url(response['location']).to_s
128
+ url = Addressable::URI.parse(response['location']).to_s
131
129
 
132
130
  # decrement redirect limit
133
131
  redirect_limit = redirect_limit - 1
@@ -234,8 +232,6 @@ class Cobweb
234
232
  def head(url, options = @options)
235
233
  raise "url cannot be nil" if url.nil?
236
234
 
237
- absolutize = Absolutize.new(url, :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
238
-
239
235
  # get the unique id for this request
240
236
  unique_id = Digest::SHA1.hexdigest(url)
241
237
  if options.has_key?(:redirect_limit) and !options[:redirect_limit].nil?
@@ -277,7 +273,7 @@ class Cobweb
277
273
 
278
274
  if @options[:follow_redirects] and response.code.to_i >= 300 and response.code.to_i < 400
279
275
  puts "redirected... " unless @options[:quiet]
280
- url = absolutize.url(response['location']).to_s
276
+ url = Addressable::URI.parse(response['location']).to_s
281
277
  redirect_limit = redirect_limit - 1
282
278
  options = options.clone
283
279
  options[:redirect_limit]=redirect_limit
@@ -22,8 +22,6 @@ class CobwebCrawler
22
22
 
23
23
  @crawl_options = crawl_options
24
24
 
25
- @absolutize = Absolutize.new(@options[:base_url], :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
26
-
27
25
  @redis.sadd "queued", base_url
28
26
  crawl_counter = @redis.scard("crawled").to_i
29
27
  queue_counter = @redis.scard("queued").to_i
@@ -12,7 +12,6 @@ class ContentLinkParser
12
12
  if @doc.at("base[href]")
13
13
  base_url = @doc.at("base[href]").attr("href").to_s
14
14
  end
15
- @absolutize = Absolutize.new(base_url, :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
16
15
 
17
16
  @options[:tags] = {}
18
17
  @options[:tags][:links] = [["a[href]", "href"], ["frame[src]", "src"], ["meta[@http-equiv=\"refresh\"]", "content"], ["link[href]:not([rel])", "href"], ["area[href]", "href"]]
@@ -57,15 +56,14 @@ class ContentLinkParser
57
56
  if attribute.kind_of? String or attribute.kind_of? Symbol
58
57
  @doc.css(selector).each do |tag|
59
58
  begin
60
- uri = @absolutize.url(tag[attribute])
61
- array << uri.to_s
59
+ array << Addressable::URI.parse(tag[attribute]).to_s
62
60
  rescue
63
61
  end
64
62
  end
65
63
  elsif attribute.instance_of? Regexp
66
64
  @doc.css(selector).each do |tag|
67
65
  begin
68
- tag.content.scan(attribute) {|match| array << @absolutize.url(match[0])}
66
+ tag.content.scan(attribute) {|match| array << Addressable::URI.parse(match[0]).to_s}
69
67
  rescue
70
68
  end
71
69
  end
@@ -13,7 +13,6 @@ class CrawlJob
13
13
 
14
14
  @redis = NamespacedRedis.new(Redis.new(content_request[:redis_options]), "cobweb-#{Cobweb.version}-#{content_request[:crawl_id]}")
15
15
 
16
- @absolutize = Absolutize.new(content_request[:url], :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
17
16
  @debug = content_request[:debug]
18
17
 
19
18
  refresh_counters
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.25
4
+ version: 0.0.26
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ date: 2012-03-13 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: resque
16
- requirement: &70349719636940 !ruby/object:Gem::Requirement
16
+ requirement: &70294818287920 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70349719636940
24
+ version_requirements: *70294818287920
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: redis
27
- requirement: &70349719636520 !ruby/object:Gem::Requirement
27
+ requirement: &70294818287500 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70349719636520
35
+ version_requirements: *70294818287500
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: nokogiri
38
- requirement: &70349719636100 !ruby/object:Gem::Requirement
38
+ requirement: &70294818287080 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,21 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70349719636100
47
- - !ruby/object:Gem::Dependency
48
- name: absolutize
49
- requirement: &70349719635660 !ruby/object:Gem::Requirement
50
- none: false
51
- requirements:
52
- - - ! '>='
53
- - !ruby/object:Gem::Version
54
- version: '0'
55
- type: :runtime
56
- prerelease: false
57
- version_requirements: *70349719635660
46
+ version_requirements: *70294818287080
58
47
  - !ruby/object:Gem::Dependency
59
48
  name: addressable
60
- requirement: &70349719635240 !ruby/object:Gem::Requirement
49
+ requirement: &70294818286640 !ruby/object:Gem::Requirement
61
50
  none: false
62
51
  requirements:
63
52
  - - ! '>='
@@ -65,10 +54,10 @@ dependencies:
65
54
  version: '0'
66
55
  type: :runtime
67
56
  prerelease: false
68
- version_requirements: *70349719635240
57
+ version_requirements: *70294818286640
69
58
  - !ruby/object:Gem::Dependency
70
59
  name: rspec
71
- requirement: &70349719634820 !ruby/object:Gem::Requirement
60
+ requirement: &70294818286220 !ruby/object:Gem::Requirement
72
61
  none: false
73
62
  requirements:
74
63
  - - ! '>='
@@ -76,10 +65,10 @@ dependencies:
76
65
  version: '0'
77
66
  type: :runtime
78
67
  prerelease: false
79
- version_requirements: *70349719634820
68
+ version_requirements: *70294818286220
80
69
  - !ruby/object:Gem::Dependency
81
70
  name: awesome_print
82
- requirement: &70349719634400 !ruby/object:Gem::Requirement
71
+ requirement: &70294818285800 !ruby/object:Gem::Requirement
83
72
  none: false
84
73
  requirements:
85
74
  - - ! '>='
@@ -87,10 +76,10 @@ dependencies:
87
76
  version: '0'
88
77
  type: :runtime
89
78
  prerelease: false
90
- version_requirements: *70349719634400
79
+ version_requirements: *70294818285800
91
80
  - !ruby/object:Gem::Dependency
92
81
  name: sinatra
93
- requirement: &70349719633980 !ruby/object:Gem::Requirement
82
+ requirement: &70294818285380 !ruby/object:Gem::Requirement
94
83
  none: false
95
84
  requirements:
96
85
  - - ! '>='
@@ -98,10 +87,10 @@ dependencies:
98
87
  version: '0'
99
88
  type: :runtime
100
89
  prerelease: false
101
- version_requirements: *70349719633980
90
+ version_requirements: *70294818285380
102
91
  - !ruby/object:Gem::Dependency
103
92
  name: thin
104
- requirement: &70349719633560 !ruby/object:Gem::Requirement
93
+ requirement: &70294818284960 !ruby/object:Gem::Requirement
105
94
  none: false
106
95
  requirements:
107
96
  - - ! '>='
@@ -109,10 +98,10 @@ dependencies:
109
98
  version: '0'
110
99
  type: :runtime
111
100
  prerelease: false
112
- version_requirements: *70349719633560
101
+ version_requirements: *70294818284960
113
102
  - !ruby/object:Gem::Dependency
114
103
  name: haml
115
- requirement: &70349719633140 !ruby/object:Gem::Requirement
104
+ requirement: &70294818284540 !ruby/object:Gem::Requirement
116
105
  none: false
117
106
  requirements:
118
107
  - - ! '>='
@@ -120,7 +109,7 @@ dependencies:
120
109
  version: '0'
121
110
  type: :runtime
122
111
  prerelease: false
123
- version_requirements: *70349719633140
112
+ version_requirements: *70294818284540
124
113
  description: Web Crawler that uses resque background job engine to allow you to cluster
125
114
  your crawl.
126
115
  email: stewart@rockwellcottage.com