cobweb 0.0.28 → 0.0.29
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.textile +3 -2
- data/lib/cobweb.rb +1 -1
- data/lib/crawl_job.rb +12 -5
- metadata +19 -19
data/README.textile
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
|
2
|
-
h1. Cobweb v0.0.28
|
2
|
+
h1. Cobweb v0.0.29
|
3
3
|
|
4
4
|
h2. Intro
|
5
5
|
|
@@ -37,7 +37,8 @@ h3. Data Returned
|
|
37
37
|
** :scripts - url's from script tags
|
38
38
|
** :styles - url's from within link tags with rel of stylesheet and from url() directives with stylesheets
|
39
39
|
* :crawl_id - the id used internally for identifying the crawl. Can be used by the processing job to separate crawls
|
40
|
-
|
40
|
+
* :internal_urls - an array of urls with * wildcards that represent urls internal to the site (ie pages within the same domain)
|
41
|
+
|
41
42
|
The source for the links can be overridden, contact me for the syntax (don't have time to put it into this documentation, will as soon as i have time!)
|
42
43
|
|
43
44
|
h2. Installation
|
data/lib/cobweb.rb
CHANGED
data/lib/crawl_job.rb
CHANGED
@@ -48,7 +48,7 @@ class CrawlJob
|
|
48
48
|
end
|
49
49
|
|
50
50
|
# enqueue to processing queue
|
51
|
-
Resque.enqueue(const_get(content_request[:processing_queue]), content.merge({:redis_options => content_request[:redis_options], :source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]}))
|
51
|
+
Resque.enqueue(const_get(content_request[:processing_queue]), content.merge({:internal_urls => internal_patterns, :redis_options => content_request[:redis_options], :source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]}))
|
52
52
|
puts "#{content_request[:url]} has been sent for processing." if content_request[:debug]
|
53
53
|
puts "Crawled: #{@crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{@queue_counter}" if content_request[:debug]
|
54
54
|
|
@@ -89,10 +89,9 @@ class CrawlJob
|
|
89
89
|
end
|
90
90
|
|
91
91
|
def self.internal_link?(link)
|
92
|
-
puts "Checking
|
93
|
-
@internal_patterns ||= @redis.smembers("internal_urls").map{|pattern| Regexp.new("^#{pattern.gsub("*", ".*?")}")}
|
92
|
+
puts "Checking internal link for: #{link}" if @debug
|
94
93
|
valid_link = true
|
95
|
-
|
94
|
+
internal_patterns.map{|pattern| Regexp.new("^#{pattern.gsub("*", ".*?")}")}.each do |pattern|
|
96
95
|
puts "Matching against #{pattern.source}" if @debug
|
97
96
|
if link.match(pattern)
|
98
97
|
puts "Matched as internal" if @debug
|
@@ -102,9 +101,17 @@ class CrawlJob
|
|
102
101
|
puts "Didn't match any pattern so marked as not internal" if @debug
|
103
102
|
false
|
104
103
|
end
|
104
|
+
|
105
|
+
def self.internal_patterns
|
106
|
+
@internal_patterns ||= @redis.smembers("internal_urls")
|
107
|
+
end
|
105
108
|
|
106
109
|
def self.all_links_from_content(content)
|
107
|
-
content[:links].keys.map{|key| content[:links][key]}.flatten
|
110
|
+
links = content[:links].keys.map{|key| content[:links][key]}.flatten
|
111
|
+
links.reject!{|link| link.starts_with?("javascript:")}
|
112
|
+
links = links.map{|link| Addressable::URI.join(content[:url], link)}
|
113
|
+
links.select!{|link| link.scheme == "http" || link.scheme == "https"}
|
114
|
+
links
|
108
115
|
end
|
109
116
|
|
110
117
|
def self.enqueue_content(content_request, link)
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cobweb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.28
|
4
|
+
version: 0.0.29
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -13,7 +13,7 @@ date: 2012-03-14 00:00:00.000000000 Z
|
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: resque
|
16
|
-
requirement: &
|
16
|
+
requirement: &70189333520340 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70189333520340
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: redis
|
27
|
-
requirement: &
|
27
|
+
requirement: &70189333519900 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70189333519900
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: nokogiri
|
38
|
-
requirement: &
|
38
|
+
requirement: &70189333519480 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70189333519480
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: addressable
|
49
|
-
requirement: &
|
49
|
+
requirement: &70189333519060 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :runtime
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *70189333519060
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: rspec
|
60
|
-
requirement: &
|
60
|
+
requirement: &70189333518640 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ! '>='
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: '0'
|
66
66
|
type: :runtime
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *70189333518640
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: awesome_print
|
71
|
-
requirement: &
|
71
|
+
requirement: &70189333518220 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ! '>='
|
@@ -76,10 +76,10 @@ dependencies:
|
|
76
76
|
version: '0'
|
77
77
|
type: :runtime
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *70189333518220
|
80
80
|
- !ruby/object:Gem::Dependency
|
81
81
|
name: sinatra
|
82
|
-
requirement: &
|
82
|
+
requirement: &70189333517800 !ruby/object:Gem::Requirement
|
83
83
|
none: false
|
84
84
|
requirements:
|
85
85
|
- - ! '>='
|
@@ -87,10 +87,10 @@ dependencies:
|
|
87
87
|
version: '0'
|
88
88
|
type: :runtime
|
89
89
|
prerelease: false
|
90
|
-
version_requirements: *
|
90
|
+
version_requirements: *70189333517800
|
91
91
|
- !ruby/object:Gem::Dependency
|
92
92
|
name: thin
|
93
|
-
requirement: &
|
93
|
+
requirement: &70189333517380 !ruby/object:Gem::Requirement
|
94
94
|
none: false
|
95
95
|
requirements:
|
96
96
|
- - ! '>='
|
@@ -98,10 +98,10 @@ dependencies:
|
|
98
98
|
version: '0'
|
99
99
|
type: :runtime
|
100
100
|
prerelease: false
|
101
|
-
version_requirements: *
|
101
|
+
version_requirements: *70189333517380
|
102
102
|
- !ruby/object:Gem::Dependency
|
103
103
|
name: haml
|
104
|
-
requirement: &
|
104
|
+
requirement: &70189333533320 !ruby/object:Gem::Requirement
|
105
105
|
none: false
|
106
106
|
requirements:
|
107
107
|
- - ! '>='
|
@@ -109,7 +109,7 @@ dependencies:
|
|
109
109
|
version: '0'
|
110
110
|
type: :runtime
|
111
111
|
prerelease: false
|
112
|
-
version_requirements: *
|
112
|
+
version_requirements: *70189333533320
|
113
113
|
description: Web Crawler that uses resque background job engine to allow you to cluster
|
114
114
|
your crawl.
|
115
115
|
email: stewart@rockwellcottage.com
|