spieker 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG +2 -0
- data/Rakefile +9 -0
- data/circle.yml +3 -0
- data/lib/spieker.rb +1 -0
- data/lib/spieker/link_scraper.rb +1 -15
- data/lib/spieker/link_validator.rb +51 -0
- data/lib/spieker/version.rb +1 -1
- data/spieker.gemspec +1 -0
- data/test/link_scraper_test.rb +4 -3
- data/test/link_validator_test.rb +33 -0
- data/test/spieker_test.rb +1 -7
- data/test/test_helper.rb +3 -0
- metadata +24 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c787dd82625950fc0ce4eca39e86d29c021b5414
|
4
|
+
data.tar.gz: 9825847bfbe9cb7a09961a26f191b88ff8b0ec0a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4f50c13bdac9438682de85294c0cad57848d0e7205be4d76d0d4809894b6338cb989ff833ec4e7304bfa9bb092a6ee52d7cfdac9aeb120b258c4dcc11622a59c
|
7
|
+
data.tar.gz: 9dc328ea3b99435490d8620cc65f42604415592c826c8ff56ce897a107ca4f0d63881991ab411a7bacab58940acc893f082362380813e52e593700d85f5bfbb8
|
data/CHANGELOG
ADDED
data/Rakefile
CHANGED
data/circle.yml
ADDED
data/lib/spieker.rb
CHANGED
data/lib/spieker/link_scraper.rb
CHANGED
@@ -3,7 +3,6 @@ require 'capybara/poltergeist'
|
|
3
3
|
|
4
4
|
module Spieker
|
5
5
|
class LinkScraper
|
6
|
-
LOCAL_LINK_REGEX = /^(?!(http(s)?\:|\/\/)|data\:).*/
|
7
6
|
|
8
7
|
include Capybara::DSL
|
9
8
|
attr_writer :links
|
@@ -45,19 +44,10 @@ module Spieker
|
|
45
44
|
|
46
45
|
def cleaned_up_links(links)
|
47
46
|
links.select { |link|
|
48
|
-
|
47
|
+
LinkValidator.new(link, @url.to_s).valid?
|
49
48
|
}.map(&method(:filter_hash)).compact.uniq
|
50
49
|
end
|
51
50
|
|
52
|
-
def is_local?(link)
|
53
|
-
link =~ LOCAL_LINK_REGEX ||
|
54
|
-
begin
|
55
|
-
URI.parse(link).hostname == @url.hostname
|
56
|
-
rescue
|
57
|
-
false
|
58
|
-
end
|
59
|
-
end
|
60
|
-
|
61
51
|
def filter_hash(link)
|
62
52
|
if match = link.match(/(.*)#(.*)$/)
|
63
53
|
match[1]
|
@@ -65,10 +55,6 @@ module Spieker
|
|
65
55
|
link
|
66
56
|
end
|
67
57
|
end
|
68
|
-
|
69
|
-
def is_email? link
|
70
|
-
link =~ /mailto/
|
71
|
-
end
|
72
58
|
end
|
73
59
|
end
|
74
60
|
|
@@ -0,0 +1,51 @@
|
|
1
|
+
class Spieker::LinkValidator
|
2
|
+
LOCAL_LINK_REGEX = /^(?!(http(s)?\:|\/\/)).*/
|
3
|
+
|
4
|
+
def initialize(link, origin)
|
5
|
+
@link = link
|
6
|
+
@origin = origin
|
7
|
+
end
|
8
|
+
|
9
|
+
def valid?
|
10
|
+
assertions.all? { |assert| send(assert) }
|
11
|
+
end
|
12
|
+
|
13
|
+
private
|
14
|
+
|
15
|
+
def assertions
|
16
|
+
private_methods.select { |m| m.to_s.start_with?('assert_') }
|
17
|
+
end
|
18
|
+
|
19
|
+
def assert_is_local
|
20
|
+
link =~ LOCAL_LINK_REGEX ||
|
21
|
+
begin
|
22
|
+
URI.parse(link).hostname == origin.hostname
|
23
|
+
rescue
|
24
|
+
false
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
def assert_is_not_email
|
29
|
+
link !~ /mailto/
|
30
|
+
end
|
31
|
+
|
32
|
+
def assert_is_not_javascript
|
33
|
+
link !~ /javascript/
|
34
|
+
end
|
35
|
+
|
36
|
+
def assert_is_not_pdf
|
37
|
+
link !~ /\bpdf\b/
|
38
|
+
end
|
39
|
+
|
40
|
+
def assert_is_not_data
|
41
|
+
link !~ /^data/
|
42
|
+
end
|
43
|
+
|
44
|
+
def link
|
45
|
+
@link
|
46
|
+
end
|
47
|
+
|
48
|
+
def origin
|
49
|
+
URI.parse @origin.start_with?('http') ? @origin : 'http://' + @origin
|
50
|
+
end
|
51
|
+
end
|
data/lib/spieker/version.rb
CHANGED
data/spieker.gemspec
CHANGED
data/test/link_scraper_test.rb
CHANGED
@@ -1,5 +1,4 @@
|
|
1
|
-
require '
|
2
|
-
require 'spieker.rb'
|
1
|
+
require 'test_helper'
|
3
2
|
|
4
3
|
class TestLinkScraper < Test::Unit::TestCase
|
5
4
|
def setup
|
@@ -22,7 +21,9 @@ class TestLinkScraper < Test::Unit::TestCase
|
|
22
21
|
'/local#justahash',
|
23
22
|
'#justahash',
|
24
23
|
'http://www.remote.com',
|
25
|
-
'mailto:timonv@gmail.com'
|
24
|
+
'mailto:timonv@gmail.com',
|
25
|
+
'javascript',
|
26
|
+
'pdf'
|
26
27
|
]
|
27
28
|
expected_links = [
|
28
29
|
'http://www.google.com/local',
|
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
|
3
|
+
class TestLinkValidator < Test::Unit::TestCase
|
4
|
+
def is_valid?(link)
|
5
|
+
@validator = Spieker::LinkValidator.new(link, 'http://www.google.com').valid?
|
6
|
+
end
|
7
|
+
|
8
|
+
def test_normal_local_link
|
9
|
+
[
|
10
|
+
'http://www.google.com/local',
|
11
|
+
'/local',
|
12
|
+
''
|
13
|
+
].each do |link|
|
14
|
+
assert is_valid?(link), "Expected #{link} to be valid"
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
def test_email
|
19
|
+
assert !is_valid?('mailto:timonv@gmail.com')
|
20
|
+
end
|
21
|
+
|
22
|
+
def test_javascript
|
23
|
+
assert !is_valid?('javascript')
|
24
|
+
end
|
25
|
+
|
26
|
+
def test_pdf
|
27
|
+
assert !is_valid?('pdf')
|
28
|
+
end
|
29
|
+
|
30
|
+
def test_data
|
31
|
+
assert !is_valid?('data')
|
32
|
+
end
|
33
|
+
end
|
data/test/spieker_test.rb
CHANGED
@@ -1,6 +1,4 @@
|
|
1
|
-
require '
|
2
|
-
|
3
|
-
require 'spieker.rb'
|
1
|
+
require 'test_helper'
|
4
2
|
|
5
3
|
class TestSpieker < Test::Unit::TestCase
|
6
4
|
def setup
|
@@ -10,8 +8,4 @@ class TestSpieker < Test::Unit::TestCase
|
|
10
8
|
def test_current_path
|
11
9
|
assert_equal '/path', @crawler.current_path
|
12
10
|
end
|
13
|
-
|
14
|
-
def test_all_links
|
15
|
-
assert_equal [], @crawler.current_links
|
16
|
-
end
|
17
11
|
end
|
data/test/test_helper.rb
ADDED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spieker
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 0.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Timon Vonk
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-
|
11
|
+
date: 2013-08-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -38,6 +38,20 @@ dependencies:
|
|
38
38
|
- - '>='
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: pry
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - '>='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
41
55
|
- !ruby/object:Gem::Dependency
|
42
56
|
name: capybara
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -89,18 +103,23 @@ extensions: []
|
|
89
103
|
extra_rdoc_files: []
|
90
104
|
files:
|
91
105
|
- .gitignore
|
106
|
+
- CHANGELOG
|
92
107
|
- Gemfile
|
93
108
|
- LICENSE.txt
|
94
109
|
- README.md
|
95
110
|
- Rakefile
|
96
111
|
- bin/spieker
|
112
|
+
- circle.yml
|
97
113
|
- lib/spieker.rb
|
98
114
|
- lib/spieker/crawler.rb
|
99
115
|
- lib/spieker/link_scraper.rb
|
116
|
+
- lib/spieker/link_validator.rb
|
100
117
|
- lib/spieker/version.rb
|
101
118
|
- spieker.gemspec
|
102
119
|
- test/link_scraper_test.rb
|
120
|
+
- test/link_validator_test.rb
|
103
121
|
- test/spieker_test.rb
|
122
|
+
- test/test_helper.rb
|
104
123
|
homepage: ''
|
105
124
|
licenses:
|
106
125
|
- MIT
|
@@ -127,4 +146,7 @@ specification_version: 4
|
|
127
146
|
summary: Easilly crawl a website
|
128
147
|
test_files:
|
129
148
|
- test/link_scraper_test.rb
|
149
|
+
- test/link_validator_test.rb
|
130
150
|
- test/spieker_test.rb
|
151
|
+
- test/test_helper.rb
|
152
|
+
has_rdoc:
|