spieker 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6fadff203628d5b933856d22c19b3b828c61a31b
4
- data.tar.gz: 1bb4b613e70657c5824e1685999f85449280ad68
3
+ metadata.gz: c787dd82625950fc0ce4eca39e86d29c021b5414
4
+ data.tar.gz: 9825847bfbe9cb7a09961a26f191b88ff8b0ec0a
5
5
  SHA512:
6
- metadata.gz: 6cf3c371aa3e12dd6753f0b44ee054689af1e596c518c576edfc25655945c3df37ad868e47e38ac5044d1e0313670f0b0f7a7bff7f9a797ef9f45da200b1f8f4
7
- data.tar.gz: 163dd14767485e49dc15a299bdc8ad5927a0a0e6fc8b73870a3ca4d2f5fcd7ca68b992a882e996b20b76375af16956c2faa2519d135e79952b40050592b7c06e
6
+ metadata.gz: 4f50c13bdac9438682de85294c0cad57848d0e7205be4d76d0d4809894b6338cb989ff833ec4e7304bfa9bb092a6ee52d7cfdac9aeb120b258c4dcc11622a59c
7
+ data.tar.gz: 9dc328ea3b99435490d8620cc65f42604415592c826c8ff56ce897a107ca4f0d63881991ab411a7bacab58940acc893f082362380813e52e593700d85f5bfbb8
@@ -0,0 +1,2 @@
1
+ 0.0.3
2
+ * Ignore pdf and javascript links
data/Rakefile CHANGED
@@ -1 +1,10 @@
1
1
  require "bundler/gem_tasks"
2
+
3
+ require 'rake/testtask'
4
+
5
+ Rake::TestTask.new do |t|
6
+ t.libs << "test"
7
+ t.test_files = FileList['test/**/*_test.rb']
8
+ end
9
+
10
+ task default: :test
@@ -0,0 +1,3 @@
1
+ machine:
2
+ ruby:
3
+ version: 2.0.0
@@ -1,3 +1,4 @@
1
1
  require "spieker/version"
2
2
  require "spieker/crawler"
3
3
  require "spieker/link_scraper"
4
+ require "spieker/link_validator"
@@ -3,7 +3,6 @@ require 'capybara/poltergeist'
3
3
 
4
4
  module Spieker
5
5
  class LinkScraper
6
- LOCAL_LINK_REGEX = /^(?!(http(s)?\:|\/\/)|data\:).*/
7
6
 
8
7
  include Capybara::DSL
9
8
  attr_writer :links
@@ -45,19 +44,10 @@ module Spieker
45
44
 
46
45
  def cleaned_up_links(links)
47
46
  links.select { |link|
48
- is_local?(link) && !is_email?(link)
47
+ LinkValidator.new(link, @url.to_s).valid?
49
48
  }.map(&method(:filter_hash)).compact.uniq
50
49
  end
51
50
 
52
- def is_local?(link)
53
- link =~ LOCAL_LINK_REGEX ||
54
- begin
55
- URI.parse(link).hostname == @url.hostname
56
- rescue
57
- false
58
- end
59
- end
60
-
61
51
  def filter_hash(link)
62
52
  if match = link.match(/(.*)#(.*)$/)
63
53
  match[1]
@@ -65,10 +55,6 @@ module Spieker
65
55
  link
66
56
  end
67
57
  end
68
-
69
- def is_email? link
70
- link =~ /mailto/
71
- end
72
58
  end
73
59
  end
74
60
 
@@ -0,0 +1,51 @@
1
+ class Spieker::LinkValidator
2
+ LOCAL_LINK_REGEX = /^(?!(http(s)?\:|\/\/)).*/
3
+
4
+ def initialize(link, origin)
5
+ @link = link
6
+ @origin = origin
7
+ end
8
+
9
+ def valid?
10
+ assertions.all? { |assert| send(assert) }
11
+ end
12
+
13
+ private
14
+
15
+ def assertions
16
+ private_methods.select { |m| m.to_s.start_with?('assert_') }
17
+ end
18
+
19
+ def assert_is_local
20
+ link =~ LOCAL_LINK_REGEX ||
21
+ begin
22
+ URI.parse(link).hostname == origin.hostname
23
+ rescue
24
+ false
25
+ end
26
+ end
27
+
28
+ def assert_is_not_email
29
+ link !~ /mailto/
30
+ end
31
+
32
+ def assert_is_not_javascript
33
+ link !~ /javascript/
34
+ end
35
+
36
+ def assert_is_not_pdf
37
+ link !~ /\bpdf\b/
38
+ end
39
+
40
+ def assert_is_not_data
41
+ link !~ /^data/
42
+ end
43
+
44
+ def link
45
+ @link
46
+ end
47
+
48
+ def origin
49
+ URI.parse @origin.start_with?('http') ? @origin : 'http://' + @origin
50
+ end
51
+ end
@@ -1,3 +1,3 @@
1
1
  module Spieker
2
- VERSION = "0.0.2"
2
+ VERSION = "0.0.3"
3
3
  end
@@ -20,6 +20,7 @@ Gem::Specification.new do |spec|
20
20
 
21
21
  spec.add_development_dependency "bundler", "~> 1.3"
22
22
  spec.add_development_dependency "rake"
23
+ spec.add_development_dependency "pry"
23
24
 
24
25
  spec.add_dependency "capybara"
25
26
  spec.add_dependency "poltergeist"
@@ -1,5 +1,4 @@
1
- require 'test/unit'
2
- require 'spieker.rb'
1
+ require 'test_helper'
3
2
 
4
3
  class TestLinkScraper < Test::Unit::TestCase
5
4
  def setup
@@ -22,7 +21,9 @@ class TestLinkScraper < Test::Unit::TestCase
22
21
  '/local#justahash',
23
22
  '#justahash',
24
23
  'http://www.remote.com',
25
- 'mailto:timonv@gmail.com'
24
+ 'mailto:timonv@gmail.com',
25
+ 'javascript',
26
+ 'pdf'
26
27
  ]
27
28
  expected_links = [
28
29
  'http://www.google.com/local',
@@ -0,0 +1,33 @@
1
+ require 'test_helper'
2
+
3
+ class TestLinkValidator < Test::Unit::TestCase
4
+ def is_valid?(link)
5
+ @validator = Spieker::LinkValidator.new(link, 'http://www.google.com').valid?
6
+ end
7
+
8
+ def test_normal_local_link
9
+ [
10
+ 'http://www.google.com/local',
11
+ '/local',
12
+ ''
13
+ ].each do |link|
14
+ assert is_valid?(link), "Expected #{link} to be valid"
15
+ end
16
+ end
17
+
18
+ def test_email
19
+ assert !is_valid?('mailto:timonv@gmail.com')
20
+ end
21
+
22
+ def test_javascript
23
+ assert !is_valid?('javascript')
24
+ end
25
+
26
+ def test_pdf
27
+ assert !is_valid?('pdf')
28
+ end
29
+
30
+ def test_data
31
+ assert !is_valid?('data')
32
+ end
33
+ end
@@ -1,6 +1,4 @@
1
- require 'test/unit'
2
-
3
- require 'spieker.rb'
1
+ require 'test_helper'
4
2
 
5
3
  class TestSpieker < Test::Unit::TestCase
6
4
  def setup
@@ -10,8 +8,4 @@ class TestSpieker < Test::Unit::TestCase
10
8
  def test_current_path
11
9
  assert_equal '/path', @crawler.current_path
12
10
  end
13
-
14
- def test_all_links
15
- assert_equal [], @crawler.current_links
16
- end
17
11
  end
@@ -0,0 +1,3 @@
1
+ require 'test/unit'
2
+ require 'spieker'
3
+ require 'pry'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spieker
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Timon Vonk
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-07-25 00:00:00.000000000 Z
11
+ date: 2013-08-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -38,6 +38,20 @@ dependencies:
38
38
  - - '>='
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: pry
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
41
55
  - !ruby/object:Gem::Dependency
42
56
  name: capybara
43
57
  requirement: !ruby/object:Gem::Requirement
@@ -89,18 +103,23 @@ extensions: []
89
103
  extra_rdoc_files: []
90
104
  files:
91
105
  - .gitignore
106
+ - CHANGELOG
92
107
  - Gemfile
93
108
  - LICENSE.txt
94
109
  - README.md
95
110
  - Rakefile
96
111
  - bin/spieker
112
+ - circle.yml
97
113
  - lib/spieker.rb
98
114
  - lib/spieker/crawler.rb
99
115
  - lib/spieker/link_scraper.rb
116
+ - lib/spieker/link_validator.rb
100
117
  - lib/spieker/version.rb
101
118
  - spieker.gemspec
102
119
  - test/link_scraper_test.rb
120
+ - test/link_validator_test.rb
103
121
  - test/spieker_test.rb
122
+ - test/test_helper.rb
104
123
  homepage: ''
105
124
  licenses:
106
125
  - MIT
@@ -127,4 +146,7 @@ specification_version: 4
127
146
  summary: Easilly crawl a website
128
147
  test_files:
129
148
  - test/link_scraper_test.rb
149
+ - test/link_validator_test.rb
130
150
  - test/spieker_test.rb
151
+ - test/test_helper.rb
152
+ has_rdoc: