cobweb 0.0.49 → 0.0.50

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.textile CHANGED
@@ -1,5 +1,5 @@
1
1
 
2
- h1. Cobweb v0.0.49
2
+ h1. Cobweb v0.0.50
3
3
 
4
4
  h2. Intro
5
5
 
@@ -38,6 +38,7 @@ h3. Data Returned
38
38
  ** :styles - URLs from within link tags with rel of stylesheet and from url() directives within stylesheets
39
39
  * :crawl_id - the id used internally for identifying the crawl. Can be used by the processing job to separate crawls
40
40
  * :internal_urls - an array of urls with * wildcards that represent urls internal to the site (ie pages within the same domain)
41
+ * :external_urls - an array of urls with * wildcards that represent urls external to the site (overrides internal_urls)
41
42
 
42
43
  The source for the links can be overridden; contact me for the syntax (I don't have time to put it into this documentation yet, but will as soon as I have time!)
43
44
 
data/lib/cobweb_links.rb CHANGED
@@ -10,8 +10,8 @@ class CobwebLinks
10
10
  @options[:external_urls] = [] unless @options.has_key? :external_urls
11
11
  @options[:debug] = false unless @options.has_key? :debug
12
12
 
13
- @internal_patterns = @options[:internal_urls].map{|pattern| Regexp.new("^#{pattern.gsub(".", "\\.").gsub("?", "\\?").gsub("*", ".*?")}")}
14
- @external_patterns = @options[:external_urls].map{|pattern| Regexp.new("^#{pattern.gsub(".", "\\.").gsub("?", "\\?").gsub("*", ".*?")}")}
13
+ @internal_patterns = @options[:internal_urls].map{|pattern| Regexp.new("^#{pattern.cobweb_encode_for_regex([".", "?"]).gsub("*", ".*?")}")}
14
+ @external_patterns = @options[:external_urls].map{|pattern| Regexp.new("^#{pattern.cobweb_encode_for_regex([".", "?"]).gsub("*", ".*?")}")}
15
15
 
16
16
  end
17
17
 
@@ -46,3 +46,11 @@ end
46
46
  class InvalidUrlsError < Exception
47
47
  end
48
48
 
49
+ class String
50
+ def cobweb_encode_for_regex(characters)
51
+ characters.map{|character| self.gsub!(character, "\\#{character}") }
52
+ ap self
53
+ self
54
+ end
55
+ end
56
+
@@ -1,6 +1,6 @@
1
1
  class CobwebVersion
2
2
  def self.version
3
- "0.0.49"
3
+ "0.0.50"
4
4
  end
5
5
 
6
6
  end
@@ -81,7 +81,7 @@ describe CobwebLinks do
81
81
  cobweb_links.external?("http://blog.domain_one.com/pageone.html").should be_true
82
82
  end
83
83
  it "should match external links with querystring parameters" do
84
- cobweb_links = CobwebLinks.new(:internal_urls => ["http://www.ford.com/"], :external_urls => ["http://*.ford.com/*?*view=print"], :debug => true)
84
+ cobweb_links = CobwebLinks.new(:internal_urls => ["http://www.ford.com/"], :external_urls => ["http://*.ford.com/*?*view=print"])
85
85
  cobweb_links.external?("http://corporate.ford.com/news-center/press-releases-detail/pr-doug-scott2658-marketing-manager-31039?view=print").should be_true
86
86
  cobweb_links.internal?("http://corporate.ford.com/news-center/press-releases-detail/pr-doug-scott2658-marketing-manager-31039?view=print").should be_false
87
87
  cobweb_links.external?("http://corporate.ford.com/news-center/view=print/pr-doug-scott2658-marketing-manager-31039").should be_true
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.49
4
+ version: 0.0.50
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ date: 2012-05-08 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: resque
16
- requirement: &70099734277020 !ruby/object:Gem::Requirement
16
+ requirement: &70235629945140 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70099734277020
24
+ version_requirements: *70235629945140
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: redis
27
- requirement: &70099734276180 !ruby/object:Gem::Requirement
27
+ requirement: &70235629943920 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70099734276180
35
+ version_requirements: *70235629943920
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: nokogiri
38
- requirement: &70099734289880 !ruby/object:Gem::Requirement
38
+ requirement: &70235629942920 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70099734289880
46
+ version_requirements: *70235629942920
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: addressable
49
- requirement: &70099734287280 !ruby/object:Gem::Requirement
49
+ requirement: &70235629941120 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70099734287280
57
+ version_requirements: *70235629941120
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: rspec
60
- requirement: &70099734286420 !ruby/object:Gem::Requirement
60
+ requirement: &70235629938800 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *70099734286420
68
+ version_requirements: *70235629938800
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: awesome_print
71
- requirement: &70099734285000 !ruby/object:Gem::Requirement
71
+ requirement: &70235629961180 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: '0'
77
77
  type: :runtime
78
78
  prerelease: false
79
- version_requirements: *70099734285000
79
+ version_requirements: *70235629961180
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: sinatra
82
- requirement: &70099734283680 !ruby/object:Gem::Requirement
82
+ requirement: &70235629958300 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: '0'
88
88
  type: :runtime
89
89
  prerelease: false
90
- version_requirements: *70099734283680
90
+ version_requirements: *70235629958300
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: thin
93
- requirement: &70099734295820 !ruby/object:Gem::Requirement
93
+ requirement: &70235629970420 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
98
98
  version: '0'
99
99
  type: :runtime
100
100
  prerelease: false
101
- version_requirements: *70099734295820
101
+ version_requirements: *70235629970420
102
102
  - !ruby/object:Gem::Dependency
103
103
  name: haml
104
- requirement: &70099734292820 !ruby/object:Gem::Requirement
104
+ requirement: &70235629969420 !ruby/object:Gem::Requirement
105
105
  none: false
106
106
  requirements:
107
107
  - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
109
109
  version: '0'
110
110
  type: :runtime
111
111
  prerelease: false
112
- version_requirements: *70099734292820
112
+ version_requirements: *70235629969420
113
113
  - !ruby/object:Gem::Dependency
114
114
  name: namespaced_redis
115
- requirement: &70099734300600 !ruby/object:Gem::Requirement
115
+ requirement: &70235629968260 !ruby/object:Gem::Requirement
116
116
  none: false
117
117
  requirements:
118
118
  - - ! '>='
@@ -120,7 +120,7 @@ dependencies:
120
120
  version: 1.0.2
121
121
  type: :runtime
122
122
  prerelease: false
123
- version_requirements: *70099734300600
123
+ version_requirements: *70235629968260
124
124
  description: Web Crawler that uses resque background job engine to allow you to cluster
125
125
  your crawl.
126
126
  email: stewart@rockwellcottage.com