spieker 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG +2 -0
- data/Rakefile +9 -0
- data/circle.yml +3 -0
- data/lib/spieker.rb +1 -0
- data/lib/spieker/link_scraper.rb +1 -15
- data/lib/spieker/link_validator.rb +51 -0
- data/lib/spieker/version.rb +1 -1
- data/spieker.gemspec +1 -0
- data/test/link_scraper_test.rb +4 -3
- data/test/link_validator_test.rb +33 -0
- data/test/spieker_test.rb +1 -7
- data/test/test_helper.rb +3 -0
- metadata +24 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c787dd82625950fc0ce4eca39e86d29c021b5414
|
4
|
+
data.tar.gz: 9825847bfbe9cb7a09961a26f191b88ff8b0ec0a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4f50c13bdac9438682de85294c0cad57848d0e7205be4d76d0d4809894b6338cb989ff833ec4e7304bfa9bb092a6ee52d7cfdac9aeb120b258c4dcc11622a59c
|
7
|
+
data.tar.gz: 9dc328ea3b99435490d8620cc65f42604415592c826c8ff56ce897a107ca4f0d63881991ab411a7bacab58940acc893f082362380813e52e593700d85f5bfbb8
|
data/CHANGELOG
ADDED
data/Rakefile
CHANGED
data/circle.yml
ADDED
data/lib/spieker.rb
CHANGED
data/lib/spieker/link_scraper.rb
CHANGED
@@ -3,7 +3,6 @@ require 'capybara/poltergeist'
|
|
3
3
|
|
4
4
|
module Spieker
|
5
5
|
class LinkScraper
|
6
|
-
LOCAL_LINK_REGEX = /^(?!(http(s)?\:|\/\/)|data\:).*/
|
7
6
|
|
8
7
|
include Capybara::DSL
|
9
8
|
attr_writer :links
|
@@ -45,19 +44,10 @@ module Spieker
|
|
45
44
|
|
46
45
|
def cleaned_up_links(links)
|
47
46
|
links.select { |link|
|
48
|
-
|
47
|
+
LinkValidator.new(link, @url.to_s).valid?
|
49
48
|
}.map(&method(:filter_hash)).compact.uniq
|
50
49
|
end
|
51
50
|
|
52
|
-
def is_local?(link)
|
53
|
-
link =~ LOCAL_LINK_REGEX ||
|
54
|
-
begin
|
55
|
-
URI.parse(link).hostname == @url.hostname
|
56
|
-
rescue
|
57
|
-
false
|
58
|
-
end
|
59
|
-
end
|
60
|
-
|
61
51
|
def filter_hash(link)
|
62
52
|
if match = link.match(/(.*)#(.*)$/)
|
63
53
|
match[1]
|
@@ -65,10 +55,6 @@ module Spieker
|
|
65
55
|
link
|
66
56
|
end
|
67
57
|
end
|
68
|
-
|
69
|
-
def is_email? link
|
70
|
-
link =~ /mailto/
|
71
|
-
end
|
72
58
|
end
|
73
59
|
end
|
74
60
|
|
@@ -0,0 +1,51 @@
|
|
1
|
+
require 'uri'

module Spieker
  # Decides whether a scraped link should be kept by the crawler.
  #
  # A link is valid when it is a String and every private predicate whose
  # name starts with `assert_` returns a truthy value. The predicates reject
  # links that point at another host, as well as mailto, javascript, pdf and
  # data links.
  class LinkValidator
    # Matches links that do NOT start with an http(s) scheme or with "//",
    # i.e. links relative to the current site. Anchored with \A (not ^) so
    # that a later line inside the string cannot make a remote link look
    # local, and case-insensitive because URL schemes are case-insensitive.
    LOCAL_LINK_REGEX = /\A(?!(http(s)?\:|\/\/)).*/i

    # Matches an origin that already carries an explicit http(s) scheme.
    ORIGIN_SCHEME_REGEX = /\Ahttps?:\/\//i

    # link   - the href value to check.
    # origin - the site being crawled, with or without an http(s) scheme.
    def initialize(link, origin)
      @link = link
      @origin = origin
    end

    # Returns true when the link passes every `assert_*` check, false
    # otherwise. Anything that is not a String (for example a nil href) is
    # never valid.
    def valid?
      return false unless link.is_a?(String)

      assertions.all? { |assertion| send(assertion) }
    end

    private

    # Names of all private predicates that take part in validation.
    def assertions
      private_methods.select { |name| name.to_s.start_with?('assert_') }
    end

    # Truthy when the link is relative, or absolute / protocol-relative with
    # the same hostname as the origin. A link or origin that cannot be
    # parsed as a URI counts as not local.
    def assert_is_local
      return true if link =~ LOCAL_LINK_REGEX

      same_host?(URI.parse(link).hostname, origin.hostname)
    rescue URI::InvalidURIError
      false
    end

    # NOTE(review): the four patterns below are deliberately broad substring
    # checks; the test-suite feeds them bare tokens such as 'javascript',
    # 'pdf' and 'data', so they are kept exactly as they were.

    def assert_is_not_email
      link !~ /mailto/
    end

    def assert_is_not_javascript
      link !~ /javascript/
    end

    def assert_is_not_pdf
      link !~ /\bpdf\b/
    end

    def assert_is_not_data
      link !~ /^data/
    end

    # Hostnames are case-insensitive, so compare them downcased.
    def same_host?(link_host, origin_host)
      link_host.to_s.downcase == origin_host.to_s.downcase
    end

    def link
      @link
    end

    # The origin parsed as a URI (parsed once, then reused). "http://" is
    # prepended only when the origin has no http(s) scheme of its own, so a
    # scheme-less host that merely begins with "http" (e.g. "httpbin.org")
    # is still handled.
    def origin
      @parsed_origin ||= begin
        raw = @origin.to_s
        URI.parse(raw =~ ORIGIN_SCHEME_REGEX ? raw : "http://#{raw}")
      end
    end
  end
end
|
data/lib/spieker/version.rb
CHANGED
data/spieker.gemspec
CHANGED
data/test/link_scraper_test.rb
CHANGED
@@ -1,5 +1,4 @@
|
|
1
|
-
require '
|
2
|
-
require 'spieker.rb'
|
1
|
+
require 'test_helper'
|
3
2
|
|
4
3
|
class TestLinkScraper < Test::Unit::TestCase
|
5
4
|
def setup
|
@@ -22,7 +21,9 @@ class TestLinkScraper < Test::Unit::TestCase
|
|
22
21
|
'/local#justahash',
|
23
22
|
'#justahash',
|
24
23
|
'http://www.remote.com',
|
25
|
-
'mailto:timonv@gmail.com'
|
24
|
+
'mailto:timonv@gmail.com',
|
25
|
+
'javascript',
|
26
|
+
'pdf'
|
26
27
|
]
|
27
28
|
expected_links = [
|
28
29
|
'http://www.google.com/local',
|
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
|
3
|
+
# Unit tests for Spieker::LinkValidator, always validating against the
# origin 'http://www.google.com'.
class TestLinkValidator < Test::Unit::TestCase
  # Runs the validator for +link+ and returns its verdict.
  def is_valid?(link)
    verdict = Spieker::LinkValidator.new(link, 'http://www.google.com').valid?
    @validator = verdict
  end

  # Same-host absolute links, relative links and the empty link are accepted.
  def test_normal_local_link
    accepted = ['http://www.google.com/local', '/local', '']
    accepted.each { |link| assert(is_valid?(link), "Expected #{link} to be valid") }
  end

  # mailto links are rejected.
  def test_email
    assert(!is_valid?('mailto:timonv@gmail.com'))
  end

  # javascript links are rejected.
  def test_javascript
    assert(!is_valid?('javascript'))
  end

  # pdf links are rejected.
  def test_pdf
    assert(!is_valid?('pdf'))
  end

  # data links are rejected.
  def test_data
    assert(!is_valid?('data'))
  end
end
|
data/test/spieker_test.rb
CHANGED
@@ -1,6 +1,4 @@
|
|
1
|
-
require '
|
2
|
-
|
3
|
-
require 'spieker.rb'
|
1
|
+
require 'test_helper'
|
4
2
|
|
5
3
|
class TestSpieker < Test::Unit::TestCase
|
6
4
|
def setup
|
@@ -10,8 +8,4 @@ class TestSpieker < Test::Unit::TestCase
|
|
10
8
|
def test_current_path
|
11
9
|
assert_equal '/path', @crawler.current_path
|
12
10
|
end
|
13
|
-
|
14
|
-
def test_all_links
|
15
|
-
assert_equal [], @crawler.current_links
|
16
|
-
end
|
17
11
|
end
|
data/test/test_helper.rb
ADDED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spieker
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Timon Vonk
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-
|
11
|
+
date: 2013-08-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -38,6 +38,20 @@ dependencies:
|
|
38
38
|
- - '>='
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: pry
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - '>='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
41
55
|
- !ruby/object:Gem::Dependency
|
42
56
|
name: capybara
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -89,18 +103,23 @@ extensions: []
|
|
89
103
|
extra_rdoc_files: []
|
90
104
|
files:
|
91
105
|
- .gitignore
|
106
|
+
- CHANGELOG
|
92
107
|
- Gemfile
|
93
108
|
- LICENSE.txt
|
94
109
|
- README.md
|
95
110
|
- Rakefile
|
96
111
|
- bin/spieker
|
112
|
+
- circle.yml
|
97
113
|
- lib/spieker.rb
|
98
114
|
- lib/spieker/crawler.rb
|
99
115
|
- lib/spieker/link_scraper.rb
|
116
|
+
- lib/spieker/link_validator.rb
|
100
117
|
- lib/spieker/version.rb
|
101
118
|
- spieker.gemspec
|
102
119
|
- test/link_scraper_test.rb
|
120
|
+
- test/link_validator_test.rb
|
103
121
|
- test/spieker_test.rb
|
122
|
+
- test/test_helper.rb
|
104
123
|
homepage: ''
|
105
124
|
licenses:
|
106
125
|
- MIT
|
@@ -127,4 +146,7 @@ specification_version: 4
|
|
127
146
|
summary: Easilly crawl a website
|
128
147
|
test_files:
|
129
148
|
- test/link_scraper_test.rb
|
149
|
+
- test/link_validator_test.rb
|
130
150
|
- test/spieker_test.rb
|
151
|
+
- test/test_helper.rb
|
152
|
+
has_rdoc:
|