cobweb 0.0.28 → 0.0.29

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4) hide show
  1. data/README.textile +3 -2
  2. data/lib/cobweb.rb +1 -1
  3. data/lib/crawl_job.rb +12 -5
  4. metadata +19 -19
data/README.textile CHANGED
@@ -1,5 +1,5 @@
1
1
 
2
- h1. Cobweb v0.0.28
2
+ h1. Cobweb v0.0.29
3
3
 
4
4
  h2. Intro
5
5
 
@@ -37,7 +37,8 @@ h3. Data Returned
37
37
  ** :scripts - url's from script tags
38
38
  ** :styles - url's from within link tags with rel of stylesheet and from url() directives with stylesheets
39
39
  * :crawl_id - the id used internally for identifying the crawl. Can be used by the processing job to separate crawls
40
-
40
+ * :internal_urls - an array of URLs with * wildcards that represent URLs internal to the site (i.e. pages within the same domain)
41
+
41
42
  The source for the links can be overridden; contact me for the syntax (I don't have time to put it into this documentation yet, but will as soon as I have time!)
42
43
 
43
44
  h2. Installation
data/lib/cobweb.rb CHANGED
@@ -19,7 +19,7 @@ class Cobweb
19
19
  # investigate using event machine for single threaded crawling
20
20
 
21
21
  def self.version
22
- "0.0.28"
22
+ "0.0.29"
23
23
  end
24
24
 
25
25
  def method_missing(method_sym, *arguments, &block)
data/lib/crawl_job.rb CHANGED
@@ -48,7 +48,7 @@ class CrawlJob
48
48
  end
49
49
 
50
50
  # enqueue to processing queue
51
- Resque.enqueue(const_get(content_request[:processing_queue]), content.merge({:redis_options => content_request[:redis_options], :source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]}))
51
+ Resque.enqueue(const_get(content_request[:processing_queue]), content.merge({:internal_urls => internal_patterns, :redis_options => content_request[:redis_options], :source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]}))
52
52
  puts "#{content_request[:url]} has been sent for processing." if content_request[:debug]
53
53
  puts "Crawled: #{@crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{@queue_counter}" if content_request[:debug]
54
54
 
@@ -89,10 +89,9 @@ class CrawlJob
89
89
  end
90
90
 
91
91
  def self.internal_link?(link)
92
- puts "Checking for internal link for: #{link}" if @debug
93
- @internal_patterns ||= @redis.smembers("internal_urls").map{|pattern| Regexp.new("^#{pattern.gsub("*", ".*?")}")}
92
+ puts "Checking internal link for: #{link}" if @debug
94
93
  valid_link = true
95
- @internal_patterns.each do |pattern|
94
+ internal_patterns.map{|pattern| Regexp.new("^#{pattern.gsub("*", ".*?")}")}.each do |pattern|
96
95
  puts "Matching against #{pattern.source}" if @debug
97
96
  if link.match(pattern)
98
97
  puts "Matched as internal" if @debug
@@ -102,9 +101,17 @@ class CrawlJob
102
101
  puts "Didn't match any pattern so marked as not internal" if @debug
103
102
  false
104
103
  end
104
+
105
+ def self.internal_patterns
106
+ @internal_patterns ||= @redis.smembers("internal_urls")
107
+ end
105
108
 
106
109
  def self.all_links_from_content(content)
107
- content[:links].keys.map{|key| content[:links][key]}.flatten
110
+ links = content[:links].keys.map{|key| content[:links][key]}.flatten
111
+ links.reject!{|link| link.starts_with?("javascript:")}
112
+ links = links.map{|link| Addressable::URI.join(content[:url], link)}
113
+ links.select!{|link| link.scheme == "http" || link.scheme == "https"}
114
+ links
108
115
  end
109
116
 
110
117
  def self.enqueue_content(content_request, link)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.28
4
+ version: 0.0.29
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ date: 2012-03-14 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: resque
16
- requirement: &70349176950540 !ruby/object:Gem::Requirement
16
+ requirement: &70189333520340 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70349176950540
24
+ version_requirements: *70189333520340
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: redis
27
- requirement: &70349176950120 !ruby/object:Gem::Requirement
27
+ requirement: &70189333519900 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70349176950120
35
+ version_requirements: *70189333519900
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: nokogiri
38
- requirement: &70349176949680 !ruby/object:Gem::Requirement
38
+ requirement: &70189333519480 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70349176949680
46
+ version_requirements: *70189333519480
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: addressable
49
- requirement: &70349176949260 !ruby/object:Gem::Requirement
49
+ requirement: &70189333519060 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70349176949260
57
+ version_requirements: *70189333519060
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: rspec
60
- requirement: &70349176948840 !ruby/object:Gem::Requirement
60
+ requirement: &70189333518640 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *70349176948840
68
+ version_requirements: *70189333518640
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: awesome_print
71
- requirement: &70349176948420 !ruby/object:Gem::Requirement
71
+ requirement: &70189333518220 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: '0'
77
77
  type: :runtime
78
78
  prerelease: false
79
- version_requirements: *70349176948420
79
+ version_requirements: *70189333518220
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: sinatra
82
- requirement: &70349176948000 !ruby/object:Gem::Requirement
82
+ requirement: &70189333517800 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: '0'
88
88
  type: :runtime
89
89
  prerelease: false
90
- version_requirements: *70349176948000
90
+ version_requirements: *70189333517800
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: thin
93
- requirement: &70349176947580 !ruby/object:Gem::Requirement
93
+ requirement: &70189333517380 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
98
98
  version: '0'
99
99
  type: :runtime
100
100
  prerelease: false
101
- version_requirements: *70349176947580
101
+ version_requirements: *70189333517380
102
102
  - !ruby/object:Gem::Dependency
103
103
  name: haml
104
- requirement: &70349176947160 !ruby/object:Gem::Requirement
104
+ requirement: &70189333533320 !ruby/object:Gem::Requirement
105
105
  none: false
106
106
  requirements:
107
107
  - - ! '>='
@@ -109,7 +109,7 @@ dependencies:
109
109
  version: '0'
110
110
  type: :runtime
111
111
  prerelease: false
112
- version_requirements: *70349176947160
112
+ version_requirements: *70189333533320
113
113
  description: Web Crawler that uses resque background job engine to allow you to cluster
114
114
  your crawl.
115
115
  email: stewart@rockwellcottage.com