crabbs 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 9fc5a904a6d962205d1a075c95ef79a98ccba4e7
4
+ data.tar.gz: e30f07bf9305c2e4a6aad06de1c335d2f2976d0d
5
+ SHA512:
6
+ metadata.gz: 82c05e3734be530a5f8e9059fdd69959598f76ad66cbd3ce2cb9af7120e3d24cc9839eb79a7168417273404e7dd20a74a1cad01d05563f5ce4743f59e6efaaba
7
+ data.tar.gz: db18939b31a5cb899cdc8ca5872ba355e21ef3f597106c9996debc44697fa08ab8c7d1834bbbdff0c3373939c034334e5897a514ab0a45a2b9c91c8410d584fa
data/.gitignore ADDED
@@ -0,0 +1,25 @@
1
+ *.swa
2
+ *.swo
3
+ *.swp
4
+ *.gem
5
+ *.rbc
6
+ .bundle
7
+ .config
8
+ .yardoc
9
+ Gemfile.lock
10
+ InstalledFiles
11
+ _yardoc
12
+ coverage
13
+ doc/
14
+ lib/bundler/man
15
+ pkg
16
+ rdoc
17
+ spec/reports
18
+ test/tmp
19
+ test/version_tmp
20
+ tmp
21
+ *.bundle
22
+ *.so
23
+ *.o
24
+ *.a
25
+ mkmf.log
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --color
2
+ --format documentation
3
+ --warnings
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 Bruno Trecenti
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,25 @@
1
+ # Crabbs
2
+
3
+ A simple web crawler
4
+
5
+ ## Description
6
+
7
+ A simple web crawler for single domains that outputs a site map as JSON
8
+
9
+ ## Installation
10
+
11
+ `gem install crabbs`
12
+
13
+ ## Usage
14
+
15
+ `crabbs -u http://example.com`
16
+
17
+ Please refer to `crabbs --help` for further options.
18
+
19
+ ## Contributing
20
+
21
+ 1. Fork it ( https://github.com/[my-github-username]/crabbs/fork )
22
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
23
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
24
+ 4. Push to the branch (`git push origin my-new-feature`)
25
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require 'rspec/core/rake_task'
3
+
4
# Define the `spec` task, which runs the RSpec suite.
RSpec::Core::RakeTask.new(:spec)

# A bare `rake` runs the specs.
task default: :spec
data/bin/crabbs ADDED
@@ -0,0 +1,6 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'crabbs'
4
+
5
# Build the command-line front end and run it.
Crabbs::CLI.new.start
data/crabbs.gemspec ADDED
@@ -0,0 +1,29 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'crabbs/version'
5
+
6
# Gem packaging metadata for crabbs.
Gem::Specification.new do |gem|
  # Identity and descriptive metadata.
  gem.name        = 'crabbs'
  gem.version     = Crabbs::VERSION
  gem.authors     = ['Bruno Trecenti']
  gem.email       = ['btrecent@thoughtworks.com']
  gem.summary     = 'A crawler for the web'
  gem.description = 'A crawler for a single domain web application'
  gem.homepage    = 'http://github.com/Trecenti/crabbs'
  gem.license     = 'MIT'

  # Package every git-tracked file; executables and test files are
  # picked out of that list by directory prefix.
  gem.files         = `git ls-files -z`.split("\x0")
  gem.executables   = gem.files.grep(%r{^bin/}) { |path| File.basename(path) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ['lib']

  # Needed at runtime.
  gem.add_runtime_dependency 'nokogiri', '~> 1.6'
  gem.add_runtime_dependency 'slop', '~> 3.5', '>= 3.5.0'

  # Needed only for development and testing.
  gem.add_development_dependency 'bundler', '~> 1.6'
  gem.add_development_dependency 'rake', '~> 10.3', '>= 10.3.2'
  gem.add_development_dependency 'rspec', '~> 2.14', '>= 2.14.1'
  gem.add_development_dependency 'webmock', '~> 1.18', '>= 1.18.0'
  gem.add_development_dependency 'pry', '~> 0.9', '>= 0.9.12'
end
data/lib/crabbs/cli.rb ADDED
@@ -0,0 +1,27 @@
1
+ require 'slop'
2
+ require 'json'
3
+
4
module Crabbs
  # Command-line front end: reads the options from the command line,
  # crawls the requested URL and prints the resulting site map as JSON.
  class CLI
    # Runs the command.
    #
    # Prints the JSON-encoded result of Crabbs.start to STDOUT. When a
    # required option is missing, the message of the Slop error is
    # printed to STDOUT instead.
    def start
      options  = parse_options
      site_map = Crabbs.start(options[:url])

      STDOUT.puts site_map.to_json
    rescue Slop::MissingOptionError => error
      STDOUT.puts error.message
    end

    private

    # Declares the accepted options and parses them with Slop.
    #
    # @return [Object] the parsed options, indexable by option name
    def parse_options
      Slop.parse(help: true) do
        banner 'Usage: crabbs [options]'

        on('u', 'url=', 'URL to start crawling', required: true)
      end
    end
  end
end
@@ -0,0 +1,42 @@
1
+ require 'net/http'
2
+
3
module Crabbs
  # Recursively follows the links reported by Crabbs::Page#links and
  # records them as a nested Hash (the "site map").
  #
  # Crabbs::Page must already be loaded when #crawl is called; this
  # class only references it.
  class Crawler
    # Failures raised while fetching a page that should only end that
    # branch of the crawl, not the crawl as a whole.
    FETCH_ERRORS = [SocketError, SystemCallError, Timeout::Error, IOError].freeze

    # @return [Array<String>] URLs processed so far, in visiting order
    # @return [Hash] nested Hash: each URL maps to a Hash of the URLs found on it
    attr_reader :visited, :site_map

    def initialize
      @visited = []
      @site_map = {}
    end

    # Crawls +uri_string+ and everything reachable from it, filling in
    # #site_map and #visited.
    #
    # @param uri_string [String] URL to start from
    def crawl(uri_string)
      recurse uri_string, @site_map
    end

    private

    # Records +uri_string+ under +hash+ and, the first time the URL is
    # seen, descends into the links found on it.
    def recurse(uri_string, hash)
      # Every occurrence is recorded, even for URLs already visited, so
      # the map shows each place a link appears.
      hash[uri_string] = {}

      return if @visited.include?(uri_string)

      links = extract_links(uri_string)
      # Marked as visited before descending, so cyclic links terminate.
      @visited << uri_string

      links.each { |link| recurse(link, hash[uri_string]) }
    end

    # Fetches +uri_string+ and returns the links found on the page.
    #
    # Returns an empty list when the string is not a valid URI, when it
    # is not an absolute HTTP(S) URL, or when the page cannot be fetched.
    #
    # @param uri_string [String]
    # @return [Array<String>]
    def extract_links(uri_string)
      uri = URI.parse(uri_string)

      # URI.parse accepts scheme-less strings such as "example.com/page";
      # those carry no host to connect to, so there is nothing to fetch.
      return [] unless uri.is_a?(URI::HTTP) && uri.host

      # Fetch first, then build the page, so a failed fetch is handled
      # below before anything else is evaluated.
      body = Net::HTTP.get(uri)
      Crabbs::Page.new(body, uri.to_s).links
    rescue URI::InvalidURIError
      []
    rescue *FETCH_ERRORS => e
      # One unreachable link must not abort the whole crawl.
      warn "crabbs: skipping #{uri_string}: #{e.message}"
      []
    end
  end
end
@@ -0,0 +1,45 @@
1
+ require 'nokogiri'
2
+
3
module Crabbs
  # One fetched HTML document together with the URL it was loaded from.
  class Page
    # File extensions that count as crawlable documents; the empty
    # string stands for "no extension".
    CRAWLABLE_EXTENSIONS = ['', '.html'].freeze

    # @param html [String] document source, parsed with Nokogiri
    # @param url [String] absolute URL of the document, used as the base
    #   for resolving relative links
    def initialize(html, url)
      @html = Nokogiri::HTML(html)
      @url = url
    end

    # Lists the unique same-host links of the page as absolute URLs, in
    # document order.
    #
    # A link is kept when, once resolved against the page URL, it
    #   * is an http/https URL on the same host as the page,
    #   * does not end in an empty fragment ("#"), and
    #   * has a path with no extension or with the ".html" extension.
    # Hrefs that cannot be parsed are skipped.
    #
    # @return [Array<String>]
    # @raise [URI::InvalidURIError] if the page URL itself cannot be parsed
    def links
      base = URI.parse(@url)

      @html.css('a[href]')
           .map { |anchor| resolve(anchor['href']) }
           .compact
           .select { |uri| crawlable?(uri, base) }
           .map(&:to_s)
           .uniq
    end

    private

    # Resolves +href+ against the page URL.
    #
    # @return [URI, nil] nil when the href cannot be parsed or joined, so
    #   one broken href does not lose the other links of the page
    def resolve(href)
      URI.join(@url, href.to_s.strip)
    rescue URI::Error
      nil
    end

    # Whether the resolved +uri+ is a page of the same site worth visiting.
    def crawlable?(uri, base)
      # Drops other hosts as well as mailto:, javascript: and similar
      # non-HTTP schemes.
      return false unless uri.is_a?(URI::HTTP) && uri.host == base.host
      # A bare "#" only points back at the page itself.
      return false if uri.fragment && uri.fragment.empty?

      # Only the path is inspected, so dots in the host, the query or
      # the fragment are not mistaken for a file extension.
      CRAWLABLE_EXTENSIONS.include?(File.extname(uri.path.to_s))
    end
  end
end
@@ -0,0 +1,3 @@
1
module Crabbs
  # Version of the gem; read by the gemspec.
  VERSION = '0.0.2'
end
data/lib/crabbs.rb ADDED
@@ -0,0 +1,17 @@
1
+ require 'crabbs/version'
2
+ require 'crabbs/page'
3
+ require 'crabbs/crawler'
4
+ require 'crabbs/cli'
5
+
6
module Crabbs
  class << self
    # The crawler created by the most recent call to .start.
    attr_reader :crawler

    # Crawls +url+ with a fresh Crabbs::Crawler.
    #
    # @param url [String] URL to start crawling from
    # @return [Hash] the site map collected by the crawler
    def start(url)
      @crawler = Crabbs::Crawler.new
      crawler.crawl(url)
      crawler.site_map
    end
  end
end
@@ -0,0 +1,33 @@
1
+ require 'crabbs/cli'
2
+
3
describe Crabbs::CLI do
  subject { described_class.new }

  # The examples overwrite the process-wide ARGV; put the original
  # arguments back afterwards so they do not leak into other specs.
  around do |example|
    original_argv = ARGV.dup
    begin
      example.run
    ensure
      ARGV.replace(original_argv)
    end
  end

  describe '#start' do
    context 'with no arguments' do
      it 'prints out help' do
        ARGV.replace []

        allow(STDOUT).to receive(:puts)

        subject.start

        expect(STDOUT).to have_received(:puts).with('Missing required option(s): url')
      end
    end

    context 'with required arguments' do
      it 'outputs the result' do
        ARGV.replace ['--url=https://example.com']

        # Stub the crawl itself; only the CLI wiring is under test.
        allow(Crabbs).to receive(:start).and_return('result')
        allow(STDOUT).to receive(:puts)

        subject.start

        expect(Crabbs).to have_received(:start).with('https://example.com')
        expect(STDOUT).to have_received(:puts).with('result'.to_json)
      end
    end
  end
end
@@ -0,0 +1,17 @@
1
+ require 'webmock/rspec'
2
+ require 'crabbs'
3
+
4
describe Crabbs do
  subject { Crabbs }

  describe '#start' do
    # The start page is served with an empty body, so it has no links.
    before do
      stub_request(:get, 'http://example.com/').to_return(body: '')
    end

    it 'starts crawling' do
      site_map = subject.start('http://example.com')

      expect(site_map).to eq('http://example.com' => {})
    end
  end
end
@@ -0,0 +1,88 @@
1
+ require 'webmock/rspec'
2
+ require 'json'
3
+ require 'crabbs/crawler'
4
+
5
describe Crabbs::Crawler do
  describe '#crawl' do
    let(:crawler) { described_class.new }

    context 'an invalid URI' do
      it 'stores single entry site map' do
        crawler.crawl('invalid\uri')

        expect(crawler.site_map).to eq('invalid\uri' => {})
      end
    end

    context 'a valid URI' do
      context 'non html response' do
        let(:uri_string) { 'http://example.com/test.json' }

        # A JSON body that merely looks like it contains a link.
        before do
          stub_request(:get, uri_string).to_return(
            body: { a: { href: '/test' } }.to_json,
            headers: { 'Content-Type' => 'application/json' }
          )
        end

        it 'stores empty hash map' do
          crawler.crawl(uri_string)

          expect(crawler.site_map).to eq('http://example.com/test.json' => {})
        end
      end

      context 'without recursive links' do
        let(:uri_string) { 'http://example.com/' }

        # Start page with two local links and one external link.
        before do
          stub_request(:get, uri_string)
            .to_return(body: '<a href="/path"></a><a href="/local"></a><a href="http://fb.com/"></a>')
          stub_request(:get, 'http://example.com/path').to_return(body: '')
          stub_request(:get, 'http://example.com/local').to_return(body: '')
        end

        it 'stores the visted links' do
          crawler.crawl(uri_string)

          expect(crawler.visited).to include('http://example.com/')
        end

        it 'stores a hash with the site map' do
          crawler.crawl(uri_string)

          expect(crawler.site_map).to eq(
            'http://example.com/' => {
              'http://example.com/path' => {},
              'http://example.com/local' => {}
            }
          )
        end
      end

      context 'with recursive links' do
        let(:uri_string) { 'http://example.com/' }

        # Two pages that link to each other.
        before do
          stub_request(:get, uri_string).to_return(body: '<a href="/path"></a>')
          stub_request(:get, 'http://example.com/path')
            .to_return(body: '<a href="http://example.com/"></a>')
        end

        it 'visits both pages' do
          crawler.crawl(uri_string)

          expect(crawler.visited).to eq(['http://example.com/', 'http://example.com/path'])
        end

        it 'stores a hash with the site map' do
          crawler.crawl(uri_string)

          expect(crawler.site_map).to eq(
            'http://example.com/' => {
              'http://example.com/path' => {
                'http://example.com/' => {}
              }
            }
          )
        end
      end
    end
  end
end
@@ -0,0 +1,79 @@
1
+ require 'crabbs/page'
2
+
3
describe Crabbs::Page do
  describe '#links' do
    # URL the page is considered to be loaded from; overridden below
    # where a different base is needed.
    let(:host) { 'http://localhost.com/' }
    let(:page_links) { described_class.new(html, host).links }

    context 'when html has external links' do
      let(:html) { '<a href="http://facebook.com/"></a>' }

      it 'ignores external links' do
        expect(page_links).to be_empty
      end
    end

    context 'when html has links for non html resources' do
      let(:html) { '<a href="http://localhost.com/path.zip"></a>' }

      it 'ignores non html links' do
        expect(page_links).to be_empty
      end
    end

    context 'when html has links for subdomains' do
      let(:html) { '<a href="http://subdomain.localhost.com/path"></a>' }

      it 'ignores subdomain links' do
        expect(page_links).to be_empty
      end
    end

    context 'when html has query parameter links' do
      let(:html) { '<a href="?path=1"></a><a href="?path=1"></a>' }

      it 'includes the full uri link' do
        expect(page_links).to eq(['http://localhost.com/?path=1'])
      end
    end

    context 'when html has hash links' do
      let(:html) { '<a href="#path"></a><a href="#path"></a><a href="#"></a>' }

      it 'includes the full uri link' do
        expect(page_links).to eq(['http://localhost.com/#path'])
      end

      it 'ignore empty hashes' do
        expect(page_links).not_to include('http://localhost.com/#')
      end
    end

    context 'when html has direct path links' do
      let(:host) { 'http://localhost.com/first/' }
      let(:html) { '<a href="path"></a><a href="path"></a>' }

      it 'includes the full uri link' do
        expect(page_links).to eq(['http://localhost.com/first/path'])
      end
    end

    context 'when html has path links' do
      let(:html) { '<a href="/path"></a><a href="/path"></a>' }

      it 'includes the full uri link' do
        expect(page_links).to eq(['http://localhost.com/path'])
      end
    end

    context 'when html has full URI links for the same host' do
      let(:html) { '<a href="http://localhost.com/path"></a><a href="/path"></a>' }

      it 'includes the full uri link' do
        expect(page_links).to eq(['http://localhost.com/path'])
      end
    end
  end
end
metadata ADDED
@@ -0,0 +1,194 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: crabbs
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.2
5
+ platform: ruby
6
+ authors:
7
+ - Bruno Trecenti
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-05-18 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.6'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.6'
27
+ - !ruby/object:Gem::Dependency
28
+ name: slop
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '3.5'
34
+ - - ">="
35
+ - !ruby/object:Gem::Version
36
+ version: 3.5.0
37
+ type: :runtime
38
+ prerelease: false
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - "~>"
42
+ - !ruby/object:Gem::Version
43
+ version: '3.5'
44
+ - - ">="
45
+ - !ruby/object:Gem::Version
46
+ version: 3.5.0
47
+ - !ruby/object:Gem::Dependency
48
+ name: bundler
49
+ requirement: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - "~>"
52
+ - !ruby/object:Gem::Version
53
+ version: '1.6'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - "~>"
59
+ - !ruby/object:Gem::Version
60
+ version: '1.6'
61
+ - !ruby/object:Gem::Dependency
62
+ name: rake
63
+ requirement: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - "~>"
66
+ - !ruby/object:Gem::Version
67
+ version: '10.3'
68
+ - - ">="
69
+ - !ruby/object:Gem::Version
70
+ version: 10.3.2
71
+ type: :development
72
+ prerelease: false
73
+ version_requirements: !ruby/object:Gem::Requirement
74
+ requirements:
75
+ - - "~>"
76
+ - !ruby/object:Gem::Version
77
+ version: '10.3'
78
+ - - ">="
79
+ - !ruby/object:Gem::Version
80
+ version: 10.3.2
81
+ - !ruby/object:Gem::Dependency
82
+ name: rspec
83
+ requirement: !ruby/object:Gem::Requirement
84
+ requirements:
85
+ - - "~>"
86
+ - !ruby/object:Gem::Version
87
+ version: '2.14'
88
+ - - ">="
89
+ - !ruby/object:Gem::Version
90
+ version: 2.14.1
91
+ type: :development
92
+ prerelease: false
93
+ version_requirements: !ruby/object:Gem::Requirement
94
+ requirements:
95
+ - - "~>"
96
+ - !ruby/object:Gem::Version
97
+ version: '2.14'
98
+ - - ">="
99
+ - !ruby/object:Gem::Version
100
+ version: 2.14.1
101
+ - !ruby/object:Gem::Dependency
102
+ name: webmock
103
+ requirement: !ruby/object:Gem::Requirement
104
+ requirements:
105
+ - - "~>"
106
+ - !ruby/object:Gem::Version
107
+ version: '1.18'
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: 1.18.0
111
+ type: :development
112
+ prerelease: false
113
+ version_requirements: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - "~>"
116
+ - !ruby/object:Gem::Version
117
+ version: '1.18'
118
+ - - ">="
119
+ - !ruby/object:Gem::Version
120
+ version: 1.18.0
121
+ - !ruby/object:Gem::Dependency
122
+ name: pry
123
+ requirement: !ruby/object:Gem::Requirement
124
+ requirements:
125
+ - - "~>"
126
+ - !ruby/object:Gem::Version
127
+ version: '0.9'
128
+ - - ">="
129
+ - !ruby/object:Gem::Version
130
+ version: 0.9.12
131
+ type: :development
132
+ prerelease: false
133
+ version_requirements: !ruby/object:Gem::Requirement
134
+ requirements:
135
+ - - "~>"
136
+ - !ruby/object:Gem::Version
137
+ version: '0.9'
138
+ - - ">="
139
+ - !ruby/object:Gem::Version
140
+ version: 0.9.12
141
+ description: A crawler for a single domain web application
142
+ email:
143
+ - btrecent@thoughtworks.com
144
+ executables:
145
+ - crabbs
146
+ extensions: []
147
+ extra_rdoc_files: []
148
+ files:
149
+ - ".gitignore"
150
+ - ".rspec"
151
+ - Gemfile
152
+ - LICENSE.txt
153
+ - README.md
154
+ - Rakefile
155
+ - bin/crabbs
156
+ - crabbs.gemspec
157
+ - lib/crabbs.rb
158
+ - lib/crabbs/cli.rb
159
+ - lib/crabbs/crawler.rb
160
+ - lib/crabbs/page.rb
161
+ - lib/crabbs/version.rb
162
+ - spec/crabbs/cli_spec.rb
163
+ - spec/crabbs/crabbs_spec.rb
164
+ - spec/crabbs/crawler_spec.rb
165
+ - spec/crabbs/page_spec.rb
166
+ homepage: http://github.com/Trecenti/crabbs
167
+ licenses:
168
+ - MIT
169
+ metadata: {}
170
+ post_install_message:
171
+ rdoc_options: []
172
+ require_paths:
173
+ - lib
174
+ required_ruby_version: !ruby/object:Gem::Requirement
175
+ requirements:
176
+ - - ">="
177
+ - !ruby/object:Gem::Version
178
+ version: '0'
179
+ required_rubygems_version: !ruby/object:Gem::Requirement
180
+ requirements:
181
+ - - ">="
182
+ - !ruby/object:Gem::Version
183
+ version: '0'
184
+ requirements: []
185
+ rubyforge_project:
186
+ rubygems_version: 2.2.2
187
+ signing_key:
188
+ specification_version: 4
189
+ summary: A crawler for the web
190
+ test_files:
191
+ - spec/crabbs/cli_spec.rb
192
+ - spec/crabbs/crabbs_spec.rb
193
+ - spec/crabbs/crawler_spec.rb
194
+ - spec/crabbs/page_spec.rb