spieker 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 5b4522f86a2664e3028ed3193ba87f782dbdcb95
4
+ data.tar.gz: d110801ec5b674af967fe84a398176ce467b3a2b
5
+ SHA512:
6
+ metadata.gz: 87176bd62d604ac693772fb7dc761b0807c8776142470049e9ea729a8f1230cbee03010a40661d9e887f53f6ce8e2a3c21810c12e47e3aeb96298dc71334f324
7
+ data.tar.gz: c4f41f4346cba9e79ec7a6511f1dd3d7f6c7991668392d911a40a8b5f3570f6432bdb5c793f7123315c94b324a6511e61d0f7fd94a08b720fc71162a8a7fc333
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in spieker.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Timon Vonk
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
File without changes
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
data/bin/spieker ADDED
@@ -0,0 +1,5 @@
1
#!/usr/bin/env ruby

# Command-line entry point: crawls the URL given as the first argument and
# prints progress and results to stdout.

# The gem's load path exposes lib/spieker.rb, which in turn loads the crawler
# and the link scraper; there is no top-level 'crawler' file to require.
require 'spieker'

# Without a URL the crawler would fail deep inside URI parsing; stop early
# with a usage hint instead.
abort 'Usage: spieker URL' if ARGV.empty?

Spieker::Crawler.new(ARGV[0], verbose: true).crawl!
@@ -0,0 +1,64 @@
1
require 'uri'

module Spieker
  # Crawls a site level by level, starting from a single URL.
  #
  # Each page is handed to a LinkScraper, the links it reports are resolved
  # against the page they were found on, and every link not visited yet is
  # crawled in the next round. Visited URLs are collected in @tracked_links.
  class Crawler
    # @param url [String] URL of the page to start crawling from
    # @param verbose [Boolean] when true, progress and results are printed to stdout
    def initialize(url, verbose: false)
      @url = url
      @tracked_links = []
      @verbose = verbose
    end

    # Crawls the start page and then every untracked link reachable from it.
    # Prints a summary afterwards when verbose.
    def crawl!
      report "Starting to crawl on #{@url}"

      scraper = LinkScraper.new(@url)
      track_link(@url)
      # Filter against the tracked list so a page linking to itself is not
      # crawled a second time.
      links = select_untracked_links(resolve_links(@url, scraper.result))
      recursively_crawl(links)

      print_results if @verbose
    end

    # @return [String] path component of the start URL
    def current_path
      URI.parse(@url).path
    end

    private

    # Scrapes every given link, then recurses into the links that were
    # discovered and have not been tracked yet.
    def recursively_crawl(links)
      links_by_page = {}
      links.each do |link|
        report "Crawling page #{link}"

        scraper = LinkScraper.new(link)
        links_by_page[link] = resolve_links(link, scraper.result)
        track_link(link)

        report "Finished page #{link}, #{links_by_page[link].length} links found"
      end
      new_links = select_untracked_links(links_by_page.values.flatten.uniq)
      report "Recursively crawling #{new_links.length} links ..."
      report "NEW LINKS FOUND: \n#{new_links.join("\n")}"
      report "TRACKED LINKS: \n#{@tracked_links.join("\n")}"

      recursively_crawl(new_links) unless new_links.empty?
    end

    # Resolves each scraped link against the page it was found on and drops
    # duplicates. The scraper keeps relative links such as "/about", but it
    # derives its host from the URL it is constructed with, so only absolute
    # URLs can be crawled further.
    def resolve_links(base, links)
      links.map { |link| absolutize(base, link) }.uniq
    end

    # Returns +link+ as an absolute URL relative to +base+. Links that cannot
    # be parsed or joined are returned unchanged so they still show up in the
    # results.
    def absolutize(base, link)
      URI.join(base.to_s, link.to_s).to_s
    rescue URI::Error
      link
    end

    def select_untracked_links(links)
      links.reject { |link| @tracked_links.include?(link) }
    end

    def track_link(link)
      @tracked_links << link
    end

    # Prints +text+ only when the crawler was created with verbose: true.
    def report(text)
      puts text if @verbose
    end

    def print_results
      report "\n\n:::RESULTS:::\n\n"
      report "Pages found #{@tracked_links.uniq.compact.length}\n\n"
      report "All links found:\n\n #{@tracked_links.compact.join("\n")}"
    end
  end
end
@@ -0,0 +1,77 @@
1
require 'uri'
require 'capybara'
require 'capybara/poltergeist'

module Spieker
  # Loads one page through Capybara and reports the links on it that point
  # to the same site.
  class LinkScraper
    LOCAL_LINK_REGEX = /^(?!(http(s)?\:|\/\/)|data\:).*/

    include Capybara::DSL
    # Allows the raw link list to be injected, bypassing the browser.
    attr_writer :links

    # @param url [String] URL of the page to scrape
    #
    # Note that this reconfigures Capybara globally (app_host, driver).
    def initialize(url)
      @url = URI.parse(url)
      Capybara.app_host = app_host

      Capybara.register_driver :poltergeist do |app|
        Capybara::Poltergeist::Driver.new(app, phantomjs_logger: NullStream.new)
      end

      # NOTE(review): the poltergeist driver is registered above, yet selenium
      # is the driver actually selected here - confirm which one is intended.
      Capybara.current_driver = :selenium
    end

    # @return [Array<String>] unique same-site links with fragments removed
    def result
      cleaned_up_links(found_links)
    end

    # @return [String] scheme, host and (when not the scheme's default) port
    #   of the scraped URL
    def app_host
      host = "#{@url.scheme}://#{@url.hostname}"
      port = @url.port
      # Keep an explicit port such as :3000; dropping it would send the
      # browser to a different server.
      host += ":#{port}" if port && port != @url.default_port
      host
    end

    private

    # Raw hrefs of the page, fetched once and then memoized.
    def found_links
      @links ||= drive_page_for_links
    end

    # Visits the page and collects the href of every anchor. Any failure is
    # reported on stdout and yields an empty list so a crawl can continue.
    def drive_page_for_links
      visit request_path
      page.all('a').map { |el| el[:href] }
    rescue
      puts "Error parsing #{@url.to_s}"
      []
    end

    # Path of the URL including its query string, so that pages which differ
    # only by query are not all loaded as the bare path.
    def request_path
      query = @url.query
      query ? "#{@url.path}?#{query}" : @url.path
    end

    def cleaned_up_links(links)
      links.select { |link|
        is_local?(link) && !is_email?(link)
      }.map(&method(:filter_hash)).compact.uniq
    end

    # True for links matching LOCAL_LINK_REGEX and for absolute links whose
    # host equals the scraped URL's host. Unparseable links count as remote.
    def is_local?(link)
      link =~ LOCAL_LINK_REGEX ||
        begin
          URI.parse(link).hostname == @url.hostname
        rescue
          false
        end
    end

    # Strips the fragment, which starts at the first '#'. A link consisting
    # only of a fragment becomes the empty string.
    def filter_hash(link)
      link[/\A[^#]*/]
    end

    # Only links that use the mailto: scheme are e-mail links; a path that
    # merely contains the word "mailto" is not.
    def is_email?(link)
      link =~ /\A\s*mailto:/i
    end
  end
end
74
+
75
+ class NullStream
76
+ def puts; end
77
+ end
@@ -0,0 +1,3 @@
1
module Spieker
  # Version of the gem, read by the gemspec. Frozen so it cannot be mutated
  # in place at runtime.
  VERSION = "0.0.1".freeze
end
data/lib/spieker.rb ADDED
@@ -0,0 +1,3 @@
1
# Entry point of the gem: loads the version constant, the crawler and the
# link scraper it depends on.
require "spieker/version"
# The crawler lives in lib/spieker/crawler.rb; the package contains no
# spieker/instance file.
require "spieker/crawler"
require "spieker/link_scraper"
data/spieker.gemspec ADDED
@@ -0,0 +1,27 @@
1
# coding: utf-8
# Gem specification for spieker. The version is read from
# lib/spieker/version.rb, so lib/ is put on the load path first.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'spieker/version'

Gem::Specification.new do |spec|
  spec.name          = "spieker"
  spec.version       = Spieker::VERSION
  spec.authors       = ["Timon Vonk"]
  spec.email         = ["timonv@gmail.com"]
  spec.description   = %q{Easily crawl a website}
  spec.summary       = %q{Easily crawl a website}
  spec.homepage      = ""
  spec.license       = "MIT"

  # Package every file tracked by git; executables and test files are
  # derived from that list by directory.
  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.3"
  spec.add_development_dependency "rake"

  spec.add_dependency "capybara"
  spec.add_dependency "poltergeist"
  spec.add_dependency "selenium-webdriver"
end
@@ -0,0 +1,36 @@
1
require 'test/unit'
require 'spieker.rb'

# Tests for Spieker::LinkScraper: host derivation and link filtering.
class TestLinkScraper < Test::Unit::TestCase
  def setup
    @scraper = Spieker::LinkScraper.new('http://www.google.com')
  end

  def test_apphost
    assert_equal('http://www.google.com', @scraper.app_host)
  end

  # Currently disabled: the bare return exits before the assertion runs, so
  # the page is never actually loaded by this test.
  def test_result
    return
    assert(@scraper.result.any?)
  end

  # Injects a fixed link list and checks that remote and mailto links are
  # dropped, fragments are stripped and duplicates are collapsed.
  def test_filter_local
    @scraper.links = %w[
      http://www.google.com/local
      /local
      /local#justahash
      #justahash
      http://www.remote.com
      mailto:timonv@gmail.com
    ]

    kept = ['http://www.google.com/local', '/local', '']
    assert_equal(kept, @scraper.result)
  end
end
@@ -0,0 +1,17 @@
1
require 'test/unit'

require 'spieker.rb'

# Tests for Spieker::Crawler that need neither a browser nor network access.
class TestSpieker < Test::Unit::TestCase
  def setup
    @crawler = Spieker::Crawler.new('http://www.google.com/path')
  end

  def test_current_path
    assert_equal '/path', @crawler.current_path
  end

  # A crawler that has not crawled yet has tracked no links. Crawler defines
  # no public current_links reader, so the tracked list is read from the
  # instance variable that the constructor initializes.
  def test_all_links
    assert_equal [], @crawler.instance_variable_get(:@tracked_links)
  end
end
metadata ADDED
@@ -0,0 +1,130 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: spieker
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Timon Vonk
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-07-23 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: '1.3'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: '1.3'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: capybara
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: poltergeist
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - '>='
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: selenium-webdriver
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - '>='
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ description: Easilly crawl a website
84
+ email:
85
+ - timonv@gmail.com
86
+ executables:
87
+ - spieker
88
+ extensions: []
89
+ extra_rdoc_files: []
90
+ files:
91
+ - .gitignore
92
+ - Gemfile
93
+ - LICENSE.txt
94
+ - README.md
95
+ - Rakefile
96
+ - bin/spieker
97
+ - lib/spieker.rb
98
+ - lib/spieker/crawler.rb
99
+ - lib/spieker/link_scraper.rb
100
+ - lib/spieker/version.rb
101
+ - spieker.gemspec
102
+ - test/link_scraper_test.rb
103
+ - test/spieker_test.rb
104
+ homepage: ''
105
+ licenses:
106
+ - MIT
107
+ metadata: {}
108
+ post_install_message:
109
+ rdoc_options: []
110
+ require_paths:
111
+ - lib
112
+ required_ruby_version: !ruby/object:Gem::Requirement
113
+ requirements:
114
+ - - '>='
115
+ - !ruby/object:Gem::Version
116
+ version: '0'
117
+ required_rubygems_version: !ruby/object:Gem::Requirement
118
+ requirements:
119
+ - - '>='
120
+ - !ruby/object:Gem::Version
121
+ version: '0'
122
+ requirements: []
123
+ rubyforge_project:
124
+ rubygems_version: 2.0.3
125
+ signing_key:
126
+ specification_version: 4
127
+ summary: Easilly crawl a website
128
+ test_files:
129
+ - test/link_scraper_test.rb
130
+ - test/spieker_test.rb