spieker 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6fadff203628d5b933856d22c19b3b828c61a31b
4
- data.tar.gz: 1bb4b613e70657c5824e1685999f85449280ad68
3
+ metadata.gz: c787dd82625950fc0ce4eca39e86d29c021b5414
4
+ data.tar.gz: 9825847bfbe9cb7a09961a26f191b88ff8b0ec0a
5
5
  SHA512:
6
- metadata.gz: 6cf3c371aa3e12dd6753f0b44ee054689af1e596c518c576edfc25655945c3df37ad868e47e38ac5044d1e0313670f0b0f7a7bff7f9a797ef9f45da200b1f8f4
7
- data.tar.gz: 163dd14767485e49dc15a299bdc8ad5927a0a0e6fc8b73870a3ca4d2f5fcd7ca68b992a882e996b20b76375af16956c2faa2519d135e79952b40050592b7c06e
6
+ metadata.gz: 4f50c13bdac9438682de85294c0cad57848d0e7205be4d76d0d4809894b6338cb989ff833ec4e7304bfa9bb092a6ee52d7cfdac9aeb120b258c4dcc11622a59c
7
+ data.tar.gz: 9dc328ea3b99435490d8620cc65f42604415592c826c8ff56ce897a107ca4f0d63881991ab411a7bacab58940acc893f082362380813e52e593700d85f5bfbb8
@@ -0,0 +1,2 @@
1
+ 0.0.3
2
+ * Ignore pdf and javascript links
data/Rakefile CHANGED
@@ -1 +1,10 @@
1
1
  require "bundler/gem_tasks"
2
+
3
+ require 'rake/testtask'
4
+
5
+ Rake::TestTask.new do |t|
6
+ t.libs << "test"
7
+ t.test_files = FileList['test/**/*_test.rb']
8
+ end
9
+
10
+ task default: :test
@@ -0,0 +1,3 @@
1
+ machine:
2
+ ruby:
3
+ version: 2.0.0
@@ -1,3 +1,4 @@
1
1
  require "spieker/version"
2
2
  require "spieker/crawler"
3
3
  require "spieker/link_scraper"
4
+ require "spieker/link_validator"
@@ -3,7 +3,6 @@ require 'capybara/poltergeist'
3
3
 
4
4
  module Spieker
5
5
  class LinkScraper
6
- LOCAL_LINK_REGEX = /^(?!(http(s)?\:|\/\/)|data\:).*/
7
6
 
8
7
  include Capybara::DSL
9
8
  attr_writer :links
@@ -45,19 +44,10 @@ module Spieker
45
44
 
46
45
  def cleaned_up_links(links)
47
46
  links.select { |link|
48
- is_local?(link) && !is_email?(link)
47
+ LinkValidator.new(link, @url.to_s).valid?
49
48
  }.map(&method(:filter_hash)).compact.uniq
50
49
  end
51
50
 
52
- def is_local?(link)
53
- link =~ LOCAL_LINK_REGEX ||
54
- begin
55
- URI.parse(link).hostname == @url.hostname
56
- rescue
57
- false
58
- end
59
- end
60
-
61
51
  def filter_hash(link)
62
52
  if match = link.match(/(.*)#(.*)$/)
63
53
  match[1]
@@ -65,10 +55,6 @@ module Spieker
65
55
  link
66
56
  end
67
57
  end
68
-
69
- def is_email? link
70
- link =~ /mailto/
71
- end
72
58
  end
73
59
  end
74
60
 
@@ -0,0 +1,51 @@
1
require 'uri'

module Spieker
  # Decides whether a scraped link should be kept.
  #
  # A link is valid only when every private +assert_*+ predicate below
  # returns a truthy value: the link has to be local to the origin and must
  # not look like a mailto, javascript, pdf or data link.
  class LinkValidator
    # Matches any link that does NOT begin with an absolute http(s) scheme or
    # a protocol-relative "//" prefix. Anchored with \A (not ^) so a newline
    # inside the link cannot smuggle a remote URL past the check, and
    # case-insensitive because URI schemes are case-insensitive.
    LOCAL_LINK_REGEX = %r{\A(?!https?:|//)}i

    # Detects an origin string that already carries an http(s) scheme.
    # A plain "starts with http" test would misfire on hosts like
    # "httpbin.org", which still need the scheme prepended.
    ORIGIN_SCHEME_REGEX = %r{\Ahttps?://}i

    # @param link [String] the href value to validate
    # @param origin [String] the site being crawled, with or without an
    #   http(s) scheme (e.g. "http://www.google.com" or "www.google.com")
    def initialize(link, origin)
      @link = link
      @origin = origin
    end

    # @return [Boolean] true when every assert_* predicate passes
    def valid?
      assertions.all? { |predicate| send(predicate) }
    end

    private

    # Names of all private predicates whose name starts with "assert_".
    # Adding a new private assert_* method automatically adds a new rule.
    def assertions
      private_methods.select { |name| name.to_s.start_with?('assert_') }
    end

    # Truthy when the link is relative, or absolute with the origin's host.
    # Any failure while parsing the link or the origin is treated as
    # "not local" rather than raised, keeping validation best-effort.
    def assert_is_local
      return true if link =~ LOCAL_LINK_REGEX

      same_host?(URI.parse(link).hostname, origin.hostname)
    rescue StandardError
      false
    end

    # NOTE(review): the four filters below match the keyword anywhere in the
    # link (only the data filter is anchored to the start), so a local path
    # such as "/docs/javascript" is rejected too. The shipped tests expect
    # the bare words "javascript", "pdf" and "data" to be invalid, so that
    # breadth is kept; confirm whether it is intended.

    def assert_is_not_email
      link !~ /mailto/i
    end

    def assert_is_not_javascript
      link !~ /javascript/i
    end

    def assert_is_not_pdf
      link !~ /\bpdf\b/i
    end

    def assert_is_not_data
      link !~ /\Adata/i
    end

    # Host names are case-insensitive; a missing host never counts as a match.
    def same_host?(link_host, origin_host)
      return false if link_host.nil? || origin_host.nil?

      link_host.downcase == origin_host.downcase
    end

    def link
      @link
    end

    # The origin parsed as a URI, memoized so it is parsed at most once per
    # validator. "http://" is prepended when the origin has no http(s) scheme.
    def origin
      @parsed_origin ||= URI.parse(
        @origin =~ ORIGIN_SCHEME_REGEX ? @origin : "http://#{@origin}"
      )
    end
  end
end
@@ -1,3 +1,3 @@
1
1
  module Spieker
2
- VERSION = "0.0.2"
2
+ VERSION = "0.0.3"
3
3
  end
@@ -20,6 +20,7 @@ Gem::Specification.new do |spec|
20
20
 
21
21
  spec.add_development_dependency "bundler", "~> 1.3"
22
22
  spec.add_development_dependency "rake"
23
+ spec.add_development_dependency "pry"
23
24
 
24
25
  spec.add_dependency "capybara"
25
26
  spec.add_dependency "poltergeist"
@@ -1,5 +1,4 @@
1
- require 'test/unit'
2
- require 'spieker.rb'
1
+ require 'test_helper'
3
2
 
4
3
  class TestLinkScraper < Test::Unit::TestCase
5
4
  def setup
@@ -22,7 +21,9 @@ class TestLinkScraper < Test::Unit::TestCase
22
21
  '/local#justahash',
23
22
  '#justahash',
24
23
  'http://www.remote.com',
25
- 'mailto:timonv@gmail.com'
24
+ 'mailto:timonv@gmail.com',
25
+ 'javascript',
26
+ 'pdf'
26
27
  ]
27
28
  expected_links = [
28
29
  'http://www.google.com/local',
@@ -0,0 +1,33 @@
1
+ require 'test_helper'
2
+
3
# Unit tests for Spieker::LinkValidator, validating every link against a
# fixed origin.
class TestLinkValidator < Test::Unit::TestCase
  ORIGIN = 'http://www.google.com'

  # Returns the validator's verdict for +link+ against ORIGIN.
  def is_valid?(link)
    Spieker::LinkValidator.new(link, ORIGIN).valid?
  end

  def test_normal_local_link
    ['http://www.google.com/local', '/local', ''].each do |link|
      # inspect keeps the empty-string case readable in the failure message
      assert is_valid?(link), "Expected #{link.inspect} to be valid"
    end
  end

  def test_email
    assert !is_valid?('mailto:timonv@gmail.com'), 'Expected mailto link to be invalid'
  end

  def test_javascript
    assert !is_valid?('javascript'), 'Expected javascript link to be invalid'
  end

  def test_pdf
    assert !is_valid?('pdf'), 'Expected pdf link to be invalid'
  end

  def test_data
    assert !is_valid?('data'), 'Expected data link to be invalid'
  end
end
@@ -1,6 +1,4 @@
1
- require 'test/unit'
2
-
3
- require 'spieker.rb'
1
+ require 'test_helper'
4
2
 
5
3
  class TestSpieker < Test::Unit::TestCase
6
4
  def setup
@@ -10,8 +8,4 @@ class TestSpieker < Test::Unit::TestCase
10
8
  def test_current_path
11
9
  assert_equal '/path', @crawler.current_path
12
10
  end
13
-
14
- def test_all_links
15
- assert_equal [], @crawler.current_links
16
- end
17
11
  end
@@ -0,0 +1,3 @@
1
+ require 'test/unit'
2
+ require 'spieker'
3
+ require 'pry'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spieker
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Timon Vonk
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-07-25 00:00:00.000000000 Z
11
+ date: 2013-08-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -38,6 +38,20 @@ dependencies:
38
38
  - - '>='
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: pry
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
41
55
  - !ruby/object:Gem::Dependency
42
56
  name: capybara
43
57
  requirement: !ruby/object:Gem::Requirement
@@ -89,18 +103,23 @@ extensions: []
89
103
  extra_rdoc_files: []
90
104
  files:
91
105
  - .gitignore
106
+ - CHANGELOG
92
107
  - Gemfile
93
108
  - LICENSE.txt
94
109
  - README.md
95
110
  - Rakefile
96
111
  - bin/spieker
112
+ - circle.yml
97
113
  - lib/spieker.rb
98
114
  - lib/spieker/crawler.rb
99
115
  - lib/spieker/link_scraper.rb
116
+ - lib/spieker/link_validator.rb
100
117
  - lib/spieker/version.rb
101
118
  - spieker.gemspec
102
119
  - test/link_scraper_test.rb
120
+ - test/link_validator_test.rb
103
121
  - test/spieker_test.rb
122
+ - test/test_helper.rb
104
123
  homepage: ''
105
124
  licenses:
106
125
  - MIT
@@ -127,4 +146,7 @@ specification_version: 4
127
146
  summary: Easilly crawl a website
128
147
  test_files:
129
148
  - test/link_scraper_test.rb
149
+ - test/link_validator_test.rb
130
150
  - test/spieker_test.rb
151
+ - test/test_helper.rb
152
+ has_rdoc: