cobweb 0.0.52 → 0.0.53

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,5 +1,5 @@
1
1
 
2
- h1. Cobweb v0.0.52
2
+ h1. Cobweb v0.0.53
3
3
 
4
4
  h2. Intro
5
5
 
@@ -1,6 +1,6 @@
1
1
  class CobwebVersion
2
2
  def self.version
3
- "0.0.52"
3
+ "0.0.53"
4
4
  end
5
5
 
6
6
  end
@@ -42,6 +42,7 @@ class ContentLinkParser
42
42
  data = link_data
43
43
  links = data.keys.map{|key| data[key]}.flatten.uniq
44
44
  links = links.map{|link| UriHelper.join_no_fragment(@url, link).to_s }
45
+ links = links.reject{|link| link =~ /([^\/]+?)\/([^\/]+?)\/.*?\1\/\2/ }
45
46
  links = links.select{|link| options[:valid_schemes].include? link.split(':')[0].to_sym}
46
47
  links
47
48
  end
@@ -29,7 +29,7 @@ describe ContentLinkParser do
29
29
  end
30
30
  it "should return the correct links" do
31
31
  links = @content_parser.links
32
- links.length.should == 7
32
+ links.length.should == 11
33
33
  end
34
34
  end
35
35
  describe "returning image links" do
@@ -92,12 +92,12 @@ describe ContentLinkParser do
92
92
  link_data.should be_an_instance_of Hash
93
93
 
94
94
  link_data.keys.length.should == 5
95
- link_data[:links].length.should == 7
95
+ link_data[:links].length.should == 11
96
96
  end
97
97
 
98
98
  it "should return all http and https links by default" do
99
99
  links = @content_parser.all_links
100
- links.count.should == 11
100
+ links.count.should == 13
101
101
  end
102
102
 
103
103
  it "should return all http and https links by default" do
@@ -110,6 +110,13 @@ describe ContentLinkParser do
110
110
  links = @content_parser.all_links(:valid_schemes => [:https])
111
111
  links.count.should == 1
112
112
  end
113
+
114
+ it "should detect and not return link loops" do
115
+ links = @content_parser.all_links
116
+ links.should include("http://www.ge.com/repeated1/repeated2/nothing/repeated1/")
117
+ links.should_not include("http://www.ge.com/repeated1/repeated2/nothing/repeated1/repeated2")
118
+ links.should include("http://www.ge.com/repeated1/repeated2/nothing/repeated1/asdf/repeated2")
119
+ end
113
120
  end
114
121
 
115
122
  describe "ignoring default tags" do
@@ -27,6 +27,11 @@
27
27
  <a href="mailto:stewart@theizone.co.uk">Click Here to email </a>
28
28
  <a href="javascript:alert('javascript clicked');">click here for javscript</a>
29
29
  <a href="https://sampleurl-a.com/">Click Here for SSL link to URL 1</a>
30
+ <a href="http://www.ge.com/repeated1/repeated2/nothing/repeated1/"></a>
31
+ <a href="http://www.ge.com/repeated1/repeated2/nothing/repeated1/repeated2"></a>
32
+ <a href="http://www.ge.com/repeated1/repeated2/nothing/repeated1/asdf/repeated2"></a>
33
+
34
+ <a href="http://www.ge.com/citizenship/our-priorities/our-products-services/news/citizenship-in-the-news/about-citizenship/our-impact/about-citizenship/performance-against-commitments/metrics/environment-health-safety/about-citizenship/our-impact/metrics/suppliers/about-citizenship/leadership-messages/about-citizenship/awards-recognition/our-commitment-areas/our-people/our-commitment-areas/our-customers/metrics/suppliers/about-citizenship/leadership-messages/our-commitment-areas/our-suppliers/metrics/ecomagination-data/about-citizenship/awards-recognition/news/videos/news/news/videos/about-citizenship/awards-recognition/">looped link</a>
30
35
  <frameset><frame src="http://sampleurl-frame.com/"></frame></frameset>
31
36
 
32
37
  <map id="testmap"><area href="http://sampleurl-area"></area>></map>
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.52
4
+ version: 0.0.53
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ date: 2012-05-08 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: resque
16
- requirement: &70175437227100 !ruby/object:Gem::Requirement
16
+ requirement: &70165685677520 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70175437227100
24
+ version_requirements: *70165685677520
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: redis
27
- requirement: &70175437225800 !ruby/object:Gem::Requirement
27
+ requirement: &70165685676500 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70175437225800
35
+ version_requirements: *70165685676500
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: nokogiri
38
- requirement: &70175437224640 !ruby/object:Gem::Requirement
38
+ requirement: &70165685675420 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70175437224640
46
+ version_requirements: *70165685675420
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: addressable
49
- requirement: &70175437223160 !ruby/object:Gem::Requirement
49
+ requirement: &70165685673680 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70175437223160
57
+ version_requirements: *70165685673680
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: rspec
60
- requirement: &70175437220680 !ruby/object:Gem::Requirement
60
+ requirement: &70165685671300 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *70175437220680
68
+ version_requirements: *70165685671300
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: awesome_print
71
- requirement: &70175437243360 !ruby/object:Gem::Requirement
71
+ requirement: &70165685694460 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: '0'
77
77
  type: :runtime
78
78
  prerelease: false
79
- version_requirements: *70175437243360
79
+ version_requirements: *70165685694460
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: sinatra
82
- requirement: &70175437241140 !ruby/object:Gem::Requirement
82
+ requirement: &70165685692120 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: '0'
88
88
  type: :runtime
89
89
  prerelease: false
90
- version_requirements: *70175437241140
90
+ version_requirements: *70165685692120
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: thin
93
- requirement: &70175437252920 !ruby/object:Gem::Requirement
93
+ requirement: &70165685703520 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
98
98
  version: '0'
99
99
  type: :runtime
100
100
  prerelease: false
101
- version_requirements: *70175437252920
101
+ version_requirements: *70165685703520
102
102
  - !ruby/object:Gem::Dependency
103
103
  name: haml
104
- requirement: &70175437251920 !ruby/object:Gem::Requirement
104
+ requirement: &70165685702520 !ruby/object:Gem::Requirement
105
105
  none: false
106
106
  requirements:
107
107
  - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
109
109
  version: '0'
110
110
  type: :runtime
111
111
  prerelease: false
112
- version_requirements: *70175437251920
112
+ version_requirements: *70165685702520
113
113
  - !ruby/object:Gem::Dependency
114
114
  name: namespaced_redis
115
- requirement: &70175437250760 !ruby/object:Gem::Requirement
115
+ requirement: &70165685701400 !ruby/object:Gem::Requirement
116
116
  none: false
117
117
  requirements:
118
118
  - - ! '>='
@@ -120,7 +120,7 @@ dependencies:
120
120
  version: 1.0.2
121
121
  type: :runtime
122
122
  prerelease: false
123
- version_requirements: *70175437250760
123
+ version_requirements: *70165685701400
124
124
  description: Web Crawler that uses resque background job engine to allow you to cluster
125
125
  your crawl.
126
126
  email: stewart@rockwellcottage.com