cobweb 0.0.52 → 0.0.53

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
1
1
 
2
- h1. Cobweb v0.0.52
2
+ h1. Cobweb v0.0.53
3
3
 
4
4
  h2. Intro
5
5
 
@@ -1,6 +1,6 @@
1
1
  class CobwebVersion
2
2
  def self.version
3
- "0.0.52"
3
+ "0.0.53"
4
4
  end
5
5
 
6
6
  end
@@ -42,6 +42,7 @@ class ContentLinkParser
42
42
  data = link_data
43
43
  links = data.keys.map{|key| data[key]}.flatten.uniq
44
44
  links = links.map{|link| UriHelper.join_no_fragment(@url, link).to_s }
45
+ links = links.reject{|link| link =~ /([^\/]+?)\/([^\/]+?)\/.*?\1\/\2/ }
45
46
  links = links.select{|link| options[:valid_schemes].include? link.split(':')[0].to_sym}
46
47
  links
47
48
  end
@@ -29,7 +29,7 @@ describe ContentLinkParser do
29
29
  end
30
30
  it "should return the correct links" do
31
31
  links = @content_parser.links
32
- links.length.should == 7
32
+ links.length.should == 11
33
33
  end
34
34
  end
35
35
  describe "returning image links" do
@@ -92,12 +92,12 @@ describe ContentLinkParser do
92
92
  link_data.should be_an_instance_of Hash
93
93
 
94
94
  link_data.keys.length.should == 5
95
- link_data[:links].length.should == 7
95
+ link_data[:links].length.should == 11
96
96
  end
97
97
 
98
98
  it "should return all http and https links by default" do
99
99
  links = @content_parser.all_links
100
- links.count.should == 11
100
+ links.count.should == 13
101
101
  end
102
102
 
103
103
  it "should return all http and https links by default" do
@@ -110,6 +110,13 @@ describe ContentLinkParser do
110
110
  links = @content_parser.all_links(:valid_schemes => [:https])
111
111
  links.count.should == 1
112
112
  end
113
+
114
+ it "should detect and not return link loops" do
115
+ links = @content_parser.all_links
116
+ links.should include("http://www.ge.com/repeated1/repeated2/nothing/repeated1/")
117
+ links.should_not include("http://www.ge.com/repeated1/repeated2/nothing/repeated1/repeated2")
118
+ links.should include("http://www.ge.com/repeated1/repeated2/nothing/repeated1/asdf/repeated2")
119
+ end
113
120
  end
114
121
 
115
122
  describe "ignoring default tags" do
@@ -27,6 +27,11 @@
27
27
  <a href="mailto:stewart@theizone.co.uk">Click Here to email </a>
28
28
  <a href="javascript:alert('javascript clicked');">click here for javscript</a>
29
29
  <a href="https://sampleurl-a.com/">Click Here for SSL link to URL 1</a>
30
+ <a href="http://www.ge.com/repeated1/repeated2/nothing/repeated1/"></a>
31
+ <a href="http://www.ge.com/repeated1/repeated2/nothing/repeated1/repeated2"></a>
32
+ <a href="http://www.ge.com/repeated1/repeated2/nothing/repeated1/asdf/repeated2"></a>
33
+
34
+ <a href="http://www.ge.com/citizenship/our-priorities/our-products-services/news/citizenship-in-the-news/about-citizenship/our-impact/about-citizenship/performance-against-commitments/metrics/environment-health-safety/about-citizenship/our-impact/metrics/suppliers/about-citizenship/leadership-messages/about-citizenship/awards-recognition/our-commitment-areas/our-people/our-commitment-areas/our-customers/metrics/suppliers/about-citizenship/leadership-messages/our-commitment-areas/our-suppliers/metrics/ecomagination-data/about-citizenship/awards-recognition/news/videos/news/news/videos/about-citizenship/awards-recognition/">looped link</a>
30
35
  <frameset><frame src="http://sampleurl-frame.com/"></frame></frameset>
31
36
 
32
37
  <map id="testmap"><area href="http://sampleurl-area"></area>></map>
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.52
4
+ version: 0.0.53
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ date: 2012-05-08 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: resque
16
- requirement: &70175437227100 !ruby/object:Gem::Requirement
16
+ requirement: &70165685677520 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70175437227100
24
+ version_requirements: *70165685677520
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: redis
27
- requirement: &70175437225800 !ruby/object:Gem::Requirement
27
+ requirement: &70165685676500 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70175437225800
35
+ version_requirements: *70165685676500
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: nokogiri
38
- requirement: &70175437224640 !ruby/object:Gem::Requirement
38
+ requirement: &70165685675420 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70175437224640
46
+ version_requirements: *70165685675420
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: addressable
49
- requirement: &70175437223160 !ruby/object:Gem::Requirement
49
+ requirement: &70165685673680 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70175437223160
57
+ version_requirements: *70165685673680
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: rspec
60
- requirement: &70175437220680 !ruby/object:Gem::Requirement
60
+ requirement: &70165685671300 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *70175437220680
68
+ version_requirements: *70165685671300
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: awesome_print
71
- requirement: &70175437243360 !ruby/object:Gem::Requirement
71
+ requirement: &70165685694460 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: '0'
77
77
  type: :runtime
78
78
  prerelease: false
79
- version_requirements: *70175437243360
79
+ version_requirements: *70165685694460
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: sinatra
82
- requirement: &70175437241140 !ruby/object:Gem::Requirement
82
+ requirement: &70165685692120 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: '0'
88
88
  type: :runtime
89
89
  prerelease: false
90
- version_requirements: *70175437241140
90
+ version_requirements: *70165685692120
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: thin
93
- requirement: &70175437252920 !ruby/object:Gem::Requirement
93
+ requirement: &70165685703520 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
98
98
  version: '0'
99
99
  type: :runtime
100
100
  prerelease: false
101
- version_requirements: *70175437252920
101
+ version_requirements: *70165685703520
102
102
  - !ruby/object:Gem::Dependency
103
103
  name: haml
104
- requirement: &70175437251920 !ruby/object:Gem::Requirement
104
+ requirement: &70165685702520 !ruby/object:Gem::Requirement
105
105
  none: false
106
106
  requirements:
107
107
  - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
109
109
  version: '0'
110
110
  type: :runtime
111
111
  prerelease: false
112
- version_requirements: *70175437251920
112
+ version_requirements: *70165685702520
113
113
  - !ruby/object:Gem::Dependency
114
114
  name: namespaced_redis
115
- requirement: &70175437250760 !ruby/object:Gem::Requirement
115
+ requirement: &70165685701400 !ruby/object:Gem::Requirement
116
116
  none: false
117
117
  requirements:
118
118
  - - ! '>='
@@ -120,7 +120,7 @@ dependencies:
120
120
  version: 1.0.2
121
121
  type: :runtime
122
122
  prerelease: false
123
- version_requirements: *70175437250760
123
+ version_requirements: *70165685701400
124
124
  description: Web Crawler that uses resque background job engine to allow you to cluster
125
125
  your crawl.
126
126
  email: stewart@rockwellcottage.com