cobweb 0.0.28 → 0.0.29

Sign up to get free protection for your applications and to get access to all the features.
Files changed (4) hide show
  1. data/README.textile +3 -2
  2. data/lib/cobweb.rb +1 -1
  3. data/lib/crawl_job.rb +12 -5
  4. metadata +19 -19
data/README.textile CHANGED
@@ -1,5 +1,5 @@
1
1
 
2
- h1. Cobweb v0.0.28
2
+ h1. Cobweb v0.0.29
3
3
 
4
4
  h2. Intro
5
5
 
@@ -37,7 +37,8 @@ h3. Data Returned
37
37
  ** :scripts - URLs from script tags
38
38
  ** :styles - URLs from within link tags with rel of stylesheet and from url() directives with stylesheets
39
39
  * :crawl_id - the id used internally for identifying the crawl. Can be used by the processing job to separate crawls
40
-
40
+ * :internal_urls - an array of URLs with * wildcards that represent URLs internal to the site (i.e. pages within the same domain)
41
+
41
42
  The source for the links can be overridden, contact me for the syntax (don't have time to put it into this documentation, will as soon as i have time!)
42
43
 
43
44
  h2. Installation
data/lib/cobweb.rb CHANGED
@@ -19,7 +19,7 @@ class Cobweb
19
19
  # investigate using event machine for single threaded crawling
20
20
 
21
21
  def self.version
22
- "0.0.28"
22
+ "0.0.29"
23
23
  end
24
24
 
25
25
  def method_missing(method_sym, *arguments, &block)
data/lib/crawl_job.rb CHANGED
@@ -48,7 +48,7 @@ class CrawlJob
48
48
  end
49
49
 
50
50
  # enqueue to processing queue
51
- Resque.enqueue(const_get(content_request[:processing_queue]), content.merge({:redis_options => content_request[:redis_options], :source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]}))
51
+ Resque.enqueue(const_get(content_request[:processing_queue]), content.merge({:internal_urls => internal_patterns, :redis_options => content_request[:redis_options], :source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]}))
52
52
  puts "#{content_request[:url]} has been sent for processing." if content_request[:debug]
53
53
  puts "Crawled: #{@crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{@queue_counter}" if content_request[:debug]
54
54
 
@@ -89,10 +89,9 @@ class CrawlJob
89
89
  end
90
90
 
91
91
  def self.internal_link?(link)
92
- puts "Checking for internal link for: #{link}" if @debug
93
- @internal_patterns ||= @redis.smembers("internal_urls").map{|pattern| Regexp.new("^#{pattern.gsub("*", ".*?")}")}
92
+ puts "Checking internal link for: #{link}" if @debug
94
93
  valid_link = true
95
- @internal_patterns.each do |pattern|
94
+ internal_patterns.map{|pattern| Regexp.new("^#{pattern.gsub("*", ".*?")}")}.each do |pattern|
96
95
  puts "Matching against #{pattern.source}" if @debug
97
96
  if link.match(pattern)
98
97
  puts "Matched as internal" if @debug
@@ -102,9 +101,17 @@ class CrawlJob
102
101
  puts "Didn't match any pattern so marked as not internal" if @debug
103
102
  false
104
103
  end
104
+
105
+ def self.internal_patterns
106
+ @internal_patterns ||= @redis.smembers("internal_urls")
107
+ end
105
108
 
106
109
  def self.all_links_from_content(content)
107
- content[:links].keys.map{|key| content[:links][key]}.flatten
110
+ links = content[:links].keys.map{|key| content[:links][key]}.flatten
111
+ links.reject!{|link| link.starts_with?("javascript:")}
112
+ links = links.map{|link| Addressable::URI.join(content[:url], link)}
113
+ links.select!{|link| link.scheme == "http" || link.scheme == "https"}
114
+ links
108
115
  end
109
116
 
110
117
  def self.enqueue_content(content_request, link)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.28
4
+ version: 0.0.29
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ date: 2012-03-14 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: resque
16
- requirement: &70349176950540 !ruby/object:Gem::Requirement
16
+ requirement: &70189333520340 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70349176950540
24
+ version_requirements: *70189333520340
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: redis
27
- requirement: &70349176950120 !ruby/object:Gem::Requirement
27
+ requirement: &70189333519900 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70349176950120
35
+ version_requirements: *70189333519900
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: nokogiri
38
- requirement: &70349176949680 !ruby/object:Gem::Requirement
38
+ requirement: &70189333519480 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70349176949680
46
+ version_requirements: *70189333519480
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: addressable
49
- requirement: &70349176949260 !ruby/object:Gem::Requirement
49
+ requirement: &70189333519060 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70349176949260
57
+ version_requirements: *70189333519060
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: rspec
60
- requirement: &70349176948840 !ruby/object:Gem::Requirement
60
+ requirement: &70189333518640 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *70349176948840
68
+ version_requirements: *70189333518640
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: awesome_print
71
- requirement: &70349176948420 !ruby/object:Gem::Requirement
71
+ requirement: &70189333518220 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: '0'
77
77
  type: :runtime
78
78
  prerelease: false
79
- version_requirements: *70349176948420
79
+ version_requirements: *70189333518220
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: sinatra
82
- requirement: &70349176948000 !ruby/object:Gem::Requirement
82
+ requirement: &70189333517800 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: '0'
88
88
  type: :runtime
89
89
  prerelease: false
90
- version_requirements: *70349176948000
90
+ version_requirements: *70189333517800
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: thin
93
- requirement: &70349176947580 !ruby/object:Gem::Requirement
93
+ requirement: &70189333517380 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
98
98
  version: '0'
99
99
  type: :runtime
100
100
  prerelease: false
101
- version_requirements: *70349176947580
101
+ version_requirements: *70189333517380
102
102
  - !ruby/object:Gem::Dependency
103
103
  name: haml
104
- requirement: &70349176947160 !ruby/object:Gem::Requirement
104
+ requirement: &70189333533320 !ruby/object:Gem::Requirement
105
105
  none: false
106
106
  requirements:
107
107
  - - ! '>='
@@ -109,7 +109,7 @@ dependencies:
109
109
  version: '0'
110
110
  type: :runtime
111
111
  prerelease: false
112
- version_requirements: *70349176947160
112
+ version_requirements: *70189333533320
113
113
  description: Web Crawler that uses resque background job engine to allow you to cluster
114
114
  your crawl.
115
115
  email: stewart@rockwellcottage.com