cobweb 0.0.25 → 0.0.26
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.textile +1 -1
- data/lib/cobweb.rb +3 -7
- data/lib/cobweb_crawler.rb +0 -2
- data/lib/content_link_parser.rb +2 -4
- data/lib/crawl_job.rb +0 -1
- metadata +19 -30
data/README.textile
CHANGED
data/lib/cobweb.rb
CHANGED
@@ -19,7 +19,7 @@ class Cobweb
|
|
19
19
|
# investigate using event machine for single threaded crawling
|
20
20
|
|
21
21
|
def self.version
|
22
|
-
"0.0.25"
|
22
|
+
"0.0.26"
|
23
23
|
end
|
24
24
|
|
25
25
|
def method_missing(method_sym, *arguments, &block)
|
@@ -77,8 +77,6 @@ class Cobweb
|
|
77
77
|
|
78
78
|
raise "url cannot be nil" if url.nil?
|
79
79
|
|
80
|
-
absolutize = Absolutize.new(url, :output_debug => false, :raise_exceptions => true, :force_escaping => false, :remove_anchors => true)
|
81
|
-
|
82
80
|
# get the unique id for this request
|
83
81
|
unique_id = Digest::SHA1.hexdigest(url.to_s)
|
84
82
|
if options.has_key?(:redirect_limit) and !options[:redirect_limit].nil?
|
@@ -127,7 +125,7 @@ class Cobweb
|
|
127
125
|
puts "redirected... " unless @options[:quiet]
|
128
126
|
|
129
127
|
# get location to redirect to
|
130
|
-
url =
|
128
|
+
url = Addressable::URI.parse(response['location']).to_s
|
131
129
|
|
132
130
|
# decrement redirect limit
|
133
131
|
redirect_limit = redirect_limit - 1
|
@@ -234,8 +232,6 @@ class Cobweb
|
|
234
232
|
def head(url, options = @options)
|
235
233
|
raise "url cannot be nil" if url.nil?
|
236
234
|
|
237
|
-
absolutize = Absolutize.new(url, :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
|
238
|
-
|
239
235
|
# get the unique id for this request
|
240
236
|
unique_id = Digest::SHA1.hexdigest(url)
|
241
237
|
if options.has_key?(:redirect_limit) and !options[:redirect_limit].nil?
|
@@ -277,7 +273,7 @@ class Cobweb
|
|
277
273
|
|
278
274
|
if @options[:follow_redirects] and response.code.to_i >= 300 and response.code.to_i < 400
|
279
275
|
puts "redirected... " unless @options[:quiet]
|
280
|
-
url =
|
276
|
+
url = Addressable::URI.parse(response['location']).to_s
|
281
277
|
redirect_limit = redirect_limit - 1
|
282
278
|
options = options.clone
|
283
279
|
options[:redirect_limit]=redirect_limit
|
data/lib/cobweb_crawler.rb
CHANGED
@@ -22,8 +22,6 @@ class CobwebCrawler
|
|
22
22
|
|
23
23
|
@crawl_options = crawl_options
|
24
24
|
|
25
|
-
@absolutize = Absolutize.new(@options[:base_url], :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
|
26
|
-
|
27
25
|
@redis.sadd "queued", base_url
|
28
26
|
crawl_counter = @redis.scard("crawled").to_i
|
29
27
|
queue_counter = @redis.scard("queued").to_i
|
data/lib/content_link_parser.rb
CHANGED
@@ -12,7 +12,6 @@ class ContentLinkParser
|
|
12
12
|
if @doc.at("base[href]")
|
13
13
|
base_url = @doc.at("base[href]").attr("href").to_s
|
14
14
|
end
|
15
|
-
@absolutize = Absolutize.new(base_url, :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
|
16
15
|
|
17
16
|
@options[:tags] = {}
|
18
17
|
@options[:tags][:links] = [["a[href]", "href"], ["frame[src]", "src"], ["meta[@http-equiv=\"refresh\"]", "content"], ["link[href]:not([rel])", "href"], ["area[href]", "href"]]
|
@@ -57,15 +56,14 @@ class ContentLinkParser
|
|
57
56
|
if attribute.kind_of? String or attribute.kind_of? Symbol
|
58
57
|
@doc.css(selector).each do |tag|
|
59
58
|
begin
|
60
|
-
|
61
|
-
array << uri.to_s
|
59
|
+
array << Addressable::URI.parse(tag[attribute]).to_s
|
62
60
|
rescue
|
63
61
|
end
|
64
62
|
end
|
65
63
|
elsif attribute.instance_of? Regexp
|
66
64
|
@doc.css(selector).each do |tag|
|
67
65
|
begin
|
68
|
-
tag.content.scan(attribute) {|match| array <<
|
66
|
+
tag.content.scan(attribute) {|match| array << Addressable::URI.parse(match[0]).to_s}
|
69
67
|
rescue
|
70
68
|
end
|
71
69
|
end
|
data/lib/crawl_job.rb
CHANGED
@@ -13,7 +13,6 @@ class CrawlJob
|
|
13
13
|
|
14
14
|
@redis = NamespacedRedis.new(Redis.new(content_request[:redis_options]), "cobweb-#{Cobweb.version}-#{content_request[:crawl_id]}")
|
15
15
|
|
16
|
-
@absolutize = Absolutize.new(content_request[:url], :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
|
17
16
|
@debug = content_request[:debug]
|
18
17
|
|
19
18
|
refresh_counters
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cobweb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.25
|
4
|
+
version: 0.0.26
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -13,7 +13,7 @@ date: 2012-03-13 00:00:00.000000000 Z
|
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: resque
|
16
|
-
requirement: &
|
16
|
+
requirement: &70294818287920 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70294818287920
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: redis
|
27
|
-
requirement: &
|
27
|
+
requirement: &70294818287500 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70294818287500
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: nokogiri
|
38
|
-
requirement: &
|
38
|
+
requirement: &70294818287080 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,21 +43,10 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
47
|
-
- !ruby/object:Gem::Dependency
|
48
|
-
name: absolutize
|
49
|
-
requirement: &70349719635660 !ruby/object:Gem::Requirement
|
50
|
-
none: false
|
51
|
-
requirements:
|
52
|
-
- - ! '>='
|
53
|
-
- !ruby/object:Gem::Version
|
54
|
-
version: '0'
|
55
|
-
type: :runtime
|
56
|
-
prerelease: false
|
57
|
-
version_requirements: *70349719635660
|
46
|
+
version_requirements: *70294818287080
|
58
47
|
- !ruby/object:Gem::Dependency
|
59
48
|
name: addressable
|
60
|
-
requirement: &
|
49
|
+
requirement: &70294818286640 !ruby/object:Gem::Requirement
|
61
50
|
none: false
|
62
51
|
requirements:
|
63
52
|
- - ! '>='
|
@@ -65,10 +54,10 @@ dependencies:
|
|
65
54
|
version: '0'
|
66
55
|
type: :runtime
|
67
56
|
prerelease: false
|
68
|
-
version_requirements: *
|
57
|
+
version_requirements: *70294818286640
|
69
58
|
- !ruby/object:Gem::Dependency
|
70
59
|
name: rspec
|
71
|
-
requirement: &
|
60
|
+
requirement: &70294818286220 !ruby/object:Gem::Requirement
|
72
61
|
none: false
|
73
62
|
requirements:
|
74
63
|
- - ! '>='
|
@@ -76,10 +65,10 @@ dependencies:
|
|
76
65
|
version: '0'
|
77
66
|
type: :runtime
|
78
67
|
prerelease: false
|
79
|
-
version_requirements: *
|
68
|
+
version_requirements: *70294818286220
|
80
69
|
- !ruby/object:Gem::Dependency
|
81
70
|
name: awesome_print
|
82
|
-
requirement: &
|
71
|
+
requirement: &70294818285800 !ruby/object:Gem::Requirement
|
83
72
|
none: false
|
84
73
|
requirements:
|
85
74
|
- - ! '>='
|
@@ -87,10 +76,10 @@ dependencies:
|
|
87
76
|
version: '0'
|
88
77
|
type: :runtime
|
89
78
|
prerelease: false
|
90
|
-
version_requirements: *
|
79
|
+
version_requirements: *70294818285800
|
91
80
|
- !ruby/object:Gem::Dependency
|
92
81
|
name: sinatra
|
93
|
-
requirement: &
|
82
|
+
requirement: &70294818285380 !ruby/object:Gem::Requirement
|
94
83
|
none: false
|
95
84
|
requirements:
|
96
85
|
- - ! '>='
|
@@ -98,10 +87,10 @@ dependencies:
|
|
98
87
|
version: '0'
|
99
88
|
type: :runtime
|
100
89
|
prerelease: false
|
101
|
-
version_requirements: *
|
90
|
+
version_requirements: *70294818285380
|
102
91
|
- !ruby/object:Gem::Dependency
|
103
92
|
name: thin
|
104
|
-
requirement: &
|
93
|
+
requirement: &70294818284960 !ruby/object:Gem::Requirement
|
105
94
|
none: false
|
106
95
|
requirements:
|
107
96
|
- - ! '>='
|
@@ -109,10 +98,10 @@ dependencies:
|
|
109
98
|
version: '0'
|
110
99
|
type: :runtime
|
111
100
|
prerelease: false
|
112
|
-
version_requirements: *
|
101
|
+
version_requirements: *70294818284960
|
113
102
|
- !ruby/object:Gem::Dependency
|
114
103
|
name: haml
|
115
|
-
requirement: &
|
104
|
+
requirement: &70294818284540 !ruby/object:Gem::Requirement
|
116
105
|
none: false
|
117
106
|
requirements:
|
118
107
|
- - ! '>='
|
@@ -120,7 +109,7 @@ dependencies:
|
|
120
109
|
version: '0'
|
121
110
|
type: :runtime
|
122
111
|
prerelease: false
|
123
|
-
version_requirements: *
|
112
|
+
version_requirements: *70294818284540
|
124
113
|
description: Web Crawler that uses resque background job engine to allow you to cluster
|
125
114
|
your crawl.
|
126
115
|
email: stewart@rockwellcottage.com
|