ruby-crawler 0.0.1
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.DS_Store +0 -0
- data/.gitignore +19 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +39 -0
- data/Rakefile +11 -0
- data/crawler.gemspec +30 -0
- data/lib/crawler.rb +59 -0
- data/lib/crawler/document.rb +33 -0
- data/lib/crawler/document_parser.rb +45 -0
- data/lib/crawler/error.rb +10 -0
- data/lib/crawler/formatting.rb +16 -0
- data/lib/crawler/http.rb +17 -0
- data/lib/crawler/index.rb +63 -0
- data/lib/crawler/storage.rb +166 -0
- data/lib/crawler/validations.rb +12 -0
- data/lib/crawler/version.rb +3 -0
- data/test/crawler/crawler_test.rb +46 -0
- data/test/crawler/document_test.rb +23 -0
- data/test/crawler/formatting_test.rb +23 -0
- data/test/crawler/http_test.rb +23 -0
- data/test/crawler/index_test.rb +78 -0
- data/test/crawler/storage_test.rb +147 -0
- data/test/crawler/validations_test.rb +10 -0
- data/test/support/domain_html.rb +256 -0
- data/test/test_helper.rb +50 -0
- metadata +191 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 939a77945341f3b298f7ee95d448e49d423ace18
+  data.tar.gz: 19b95592ff7960a39fee570adfd1983c89d12da4
+SHA512:
+  metadata.gz: 9d411c506bced66661da541bfe1a4e6ec10903a9800789481b8788f7cad98f2de8e9572327862aa381fa6d02ef902533df5ac6c4bcf421a002344a11eae5d305
+  data.tar.gz: e09ecb14da9a0c246cd825139366dad39f8497b23fd9aeee264c28e05560a4743ec8d3f1cf351686542a350f8a8cfd942747982006cf01ae21a658dd09d0d357
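These digests cover the two archives packed inside the .gem file. A minimal sketch of checking them locally, assuming metadata.gz and data.tar.gz have already been extracted from the downloaded ruby-crawler-0.0.1.gem into the current directory:

```ruby
# Sketch only: compare the local digests with the values in checksums.yaml above.
require 'digest'

puts Digest::SHA1.file('metadata.gz').hexdigest    # expect 939a77945341f3b298f7ee95d448e49d423ace18
puts Digest::SHA512.file('data.tar.gz').hexdigest  # compare with the SHA512 data.tar.gz value above
```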
data/.DS_Store
ADDED
Binary file
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
+Copyright (c) 2014 Adam Ryan
+
+MIT License
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,39 @@
+# Crawler
+
+Crawler is a straightforward gem that will index a given domain, and store the results in redis.
+
+## Installation
+
+Add this line to your application's Gemfile:
+
+    gem 'ruby-crawler'
+
+And then execute:
+
+    $ bundle
+
+Or install it yourself as:
+
+    $ gem install ruby-crawler
+
+## Usage
+
+Crawl a given domain
+```ruby
+crawler = Crawler.new('http://domain.com')
+crawler.crawl
+```
+
+
+View Results
+```ruby
+crawler.results
+```
+
+## Contributing
+
+1. Fork it ( http://github.com/<my-github-username>/crawler/fork )
+2. Create your feature branch (`git checkout -b my-new-feature`)
+3. Commit your changes (`git commit -am 'Add some feature'`)
+4. Push to the branch (`git push origin my-new-feature`)
+5. Create new Pull Request
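For context on the Usage example above: judging from the Crawler::Storage code later in this diff, `crawler.results` returns a plain nested hash keyed by path. A rough sketch of its shape (the domain and paths are illustrative, not from the gem):

```ruby
# Approximate shape of Crawler.results, inferred from Storage#get_domain_data
# and Storage#get_path_data shown further down.
{
  'domain' => 'domain.com',
  'paths'  => {
    '/' => {
      'asset_dependencies' => ['/css/app.css', '/js/app.js'],
      'links_to'           => ['/about/'],
      'linked_to_from'     => ['/about/']
    }
  }
}
```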
data/Rakefile
ADDED
data/crawler.gemspec
ADDED
@@ -0,0 +1,30 @@
+# coding: utf-8
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'crawler/version'
+
+Gem::Specification.new do |spec|
+  spec.name          = "ruby-crawler"
+  spec.version       = Crawler::VERSION
+  spec.authors       = ["Adam Ryan"]
+  spec.email         = ["adam.g.ryan@gmail.com"]
+  spec.summary       = %q{Simple ruby web crawler}
+  spec.description   = %q{Simple ruby web crawler}
+  spec.homepage      = ""
+  spec.license       = "MIT"
+
+  spec.files         = `git ls-files`.split($/)
+  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+  spec.require_paths = ["lib"]
+
+  spec.add_dependency "nokogiri"
+  spec.add_dependency "addressable"
+  spec.add_dependency "redis"
+
+  spec.add_development_dependency "bundler", "~> 1.5"
+  spec.add_development_dependency "rake"
+  spec.add_development_dependency "minitest"
+  spec.add_development_dependency "webmock"
+  spec.add_development_dependency "simplecov"
+end
data/lib/crawler.rb
ADDED
@@ -0,0 +1,59 @@
+require 'addressable/uri'
+require 'crawler/document'
+require 'crawler/error'
+require 'crawler/formatting'
+require 'crawler/index'
+require 'crawler/validations'
+require 'crawler/version'
+
+module Crawler
+  class << self
+    include Formatting
+    include Validations
+
+    attr_accessor :base_uri, :base_url, :index
+
+    # Initialize a new Crawler
+    #
+    # base_uri => base_uri to crawl
+    # base_url => base_url to crawl
+    #
+    def new(domain)
+      @base_uri = Addressable::URI.parse(domain.strip)
+      validate_protocol
+
+      @base_url = construct_url base_uri
+      self
+    end
+
+    # Performs crawl of domain, indexes results
+    #
+    def crawl(url = base_url)
+      document = Crawler::Document.new(url)
+      index.consume_document url.sub(base_url, ''), document
+
+      paths_queue = index.get_paths_to_visit
+      next_path = paths_queue[0]
+
+      print " Pages remaing - #{paths_queue.count} \r"
+      crawl "#{base_url}#{next_path}" if next_path
+    end
+
+    # Returns the indexed results of a crawl
+    #
+    def results
+      index.results
+    end
+
+    private
+
+    # Crawler::Index
+    # New Index used to record results of domain crawl
+    #
+    def index
+      @index ||= Crawler::Index.new(base_uri)
+    end
+
+  end
+
+end
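Note that `new` is defined on the module's singleton and returns `self`, so the `crawler` object in the README is the Crawler module itself, and `crawl` recurses until the queue of unvisited paths is empty. A minimal sketch of the flow, assuming a reachable domain and a local Redis server (the domain is illustrative):

```ruby
require 'crawler'

crawler = Crawler.new('http://domain.com')  # parses and validates the URI, returns the Crawler module
crawler.crawl                               # indexes '/' first, then recurses over the queued paths
crawler.results                             # nested hash built by Crawler::Index / Crawler::Storage
```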
data/lib/crawler/document.rb
ADDED
@@ -0,0 +1,33 @@
+require 'crawler/document_parser'
+require 'crawler/formatting'
+
+module Crawler
+  class Document
+    include DocumentParser
+    include Formatting
+
+    attr_accessor :uri, :url, :content, :links, :domain_specific_paths, :static_assets
+
+    # Intialize a new Document
+    #
+    # uri => uri of the document
+    # url => url of the document
+    # links => links found in the document
+    # domain_specific_paths => paths in the document related to the crawler's base domain
+    # static_assets => static_assets found in the document
+    #
+    def initialize(url)
+      @uri = Addressable::URI.parse(url.strip)
+      @url = construct_url uri
+      @links = extract_links
+      @domain_specific_paths = extract_domain_specific_paths
+      @static_assets = extract_assets
+    end
+
+    # Nokogiri::HTML::Document
+    #
+    def content
+      @content ||= parse_content uri
+    end
+  end
+end
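Although `content` is memoized, the `extract_*` calls in `initialize` read it immediately, so constructing a Document performs the HTTP request and the Nokogiri parse up front. A short usage sketch (the URL is illustrative):

```ruby
doc = Crawler::Document.new('http://domain.com/about')  # fetches and parses the page here

doc.links                  # unique hrefs on the page, minus bare '#'
doc.domain_specific_paths  # normalized same-host (or relative) paths
doc.static_assets          # img/script srcs, video posters, link hrefs
```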
data/lib/crawler/document_parser.rb
ADDED
@@ -0,0 +1,45 @@
+require 'nokogiri'
+require 'crawler/http'
+require 'crawler/formatting'
+
+module Crawler
+  module DocumentParser
+    include Formatting
+    include Http
+
+    private
+
+    # Parses the HTML from an http response
+    #
+    def parse_content(uri)
+      Nokogiri::HTML request(uri)
+    end
+
+    # Returns the links from the html document
+    #
+    def extract_links
+      content.css('a').map { |a| a['href'] unless a['href'] == '#' }.compact.uniq
+    end
+
+    # Returns the static assets from the html document
+    #
+    def extract_assets
+      assets = content.css('img', 'script').map { |i| i['src'] }
+      assets |= content.css('video').map { |v| v['poster'] }
+      assets |= content.css('link').map { |l| l['href'] }
+
+      assets.compact.uniq
+    end
+
+    # Returns the paths that are related to the given domain
+    #
+    def extract_domain_specific_paths
+      links.map do |link|
+        uri = Addressable::URI.parse(link.strip)
+        if uri.hostname.nil? || uri.hostname == @uri.hostname
+          normalize_path uri.path
+        end
+      end.compact
+    end
+  end
+end
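Whether a link survives `extract_domain_specific_paths` depends only on its hostname; a rough illustration for a page fetched from http://domain.com (the hrefs are made up):

```ruby
# href on the page               result
# '/about'                    -> '/about/'   (no hostname, kept and normalized)
# 'http://domain.com/team'    -> '/team/'    (same hostname, kept)
# 'http://other.com/pricing'  -> dropped     (different hostname)
# '#'                         -> dropped     (already filtered out by extract_links)
```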
data/lib/crawler/formatting.rb
ADDED
@@ -0,0 +1,16 @@
+module Crawler
+  module Formatting
+
+    # normalize paths => '/path/to/'
+    #
+    def normalize_path(path)
+      "#{path}/".gsub(/\/+/, '/')
+    end
+
+    # constructs a full url from a given uri object
+    #
+    def construct_url(uri)
+      "#{uri.scheme}://#{uri.host}#{uri.path}"
+    end
+  end
+end
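Both helpers are pure string formatting; a quick behavior sketch with illustrative inputs:

```ruby
require 'addressable/uri'
require 'crawler/formatting'
include Crawler::Formatting

normalize_path '/about'         # => "/about/"
normalize_path '//assets//img'  # => "/assets/img/"
construct_url Addressable::URI.parse('http://domain.com/about?page=2')
# => "http://domain.com/about"   (query string and fragment are dropped)
```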
data/lib/crawler/http.rb
ADDED
data/lib/crawler/index.rb
ADDED
@@ -0,0 +1,63 @@
+require 'crawler/formatting'
+require 'crawler/storage'
+
+module Crawler
+  class Index
+    include Formatting
+    include Storage
+
+    attr_accessor :base_uri
+
+    # New Index to record paths for a given domain
+    #
+    def initialize(base_uri)
+      @base_uri = base_uri
+      clear_stored_results
+    end
+
+    # Ingests a Crawler::Document, stores all relevant data in redis
+    # Updates pages that need to be visited as well as pages that have been visited already
+    #
+    def consume_document(path, document)
+      path = normalize_path path
+      new_links = document.domain_specific_paths.map { |path| normalize_path path }
+
+      store_path path
+
+      store_path_visited path
+
+      store_path_assets path, document.static_assets
+
+      store_path_links_to path, new_links
+
+      store_paths_to_visit(new_links - get_paths_visited)
+
+      remove_path_from_queue path
+
+      update_paths_linked_to_from_path(document)
+    end
+
+    # Returns the data associated with an indexed domain
+    #
+    def results
+      get_domain_data
+    end
+
+    private
+
+    # Records incoming links for pages
+    # Uses the current path as the incoming link
+    # Records the current_path as incoming on all links found in the current document
+    #
+    def update_paths_linked_to_from_path(document)
+      document.domain_specific_paths.each do |url|
+        link_uri_path = normalize_path Addressable::URI.parse(url.strip).path
+        document_uri_path = normalize_path document.uri.path
+        next if link_uri_path == document_uri_path
+
+        store_path link_uri_path
+        store_path_linked_to_from(link_uri_path, [document_uri_path])
+      end
+    end
+  end
+end
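Putting the pieces together, one call to `consume_document` does all of the bookkeeping for a single page. A sketch of one crawl step, assuming a local Redis server and an illustrative domain:

```ruby
require 'crawler'

index = Crawler::Index.new(Addressable::URI.parse('http://domain.com'))
doc   = Crawler::Document.new('http://domain.com/')

index.consume_document '/', doc
# '/' is stored and marked visited, its assets and outgoing links are recorded,
# unvisited links are queued, and '/' itself is removed from the queue.

index.get_paths_to_visit  # e.g. ["/about/", "/team/"] -- whatever is left to crawl
```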
data/lib/crawler/storage.rb
ADDED
@@ -0,0 +1,166 @@
+require 'redis'
+
+module Crawler
+  module Storage
+
+    # Stores a path for the domain
+    #
+    def store_path(path)
+      redis.sadd stored_paths_key, path
+    end
+
+    # Stores the static assets for a path
+    #
+    def store_path_assets(path, assets)
+      return if assets.empty?
+      redis.sadd path_assets_key(path), assets
+    end
+
+    # Stores paths that the current path links to
+    #
+    def store_path_links_to(path, links)
+      return if links.empty?
+      redis.sadd path_links_to_key(path), links
+    end
+
+    # Stores paths that link to the current path
+    #
+    def store_path_linked_to_from(path, links)
+      return if links.empty?
+      redis.sadd path_linked_to_from_key(path), links
+    end
+
+    # Stores paths that have been indexed for a domain
+    #
+    def store_path_visited(path)
+      redis.sadd paths_visited_key, path
+    end
+
+    # Stores paths that need to be visited for a domain
+    #
+    def store_paths_to_visit(paths)
+      return if paths.empty?
+      redis.sadd paths_to_visit_key, paths
+    end
+
+    # Removes a path from paths that need to be visited
+    #
+    def remove_path_from_queue(path)
+      redis.srem paths_to_visit_key, path
+    end
+
+    # Returns known paths for domain
+    #
+    def get_stored_paths
+      redis.smembers stored_paths_key
+    end
+
+    # Returns paths that haven't been indexed
+    #
+    def get_paths_to_visit
+      redis.smembers paths_to_visit_key
+    end
+
+    # Returns assets for a path
+    #
+    def get_path_assets(path)
+      redis.smembers path_assets_key(path)
+    end
+
+    # Returns links that a path links to
+    #
+    def get_path_links_to(path)
+      redis.smembers path_links_to_key(path)
+    end
+
+    # Returns links that link to a path
+    #
+    def get_path_linked_to_from(path)
+      redis.smembers path_linked_to_from_key(path)
+    end
+
+    # Returns paths that have been indexed for a domain
+    #
+    def get_paths_visited
+      redis.smembers paths_visited_key
+    end
+
+    # Returns paths and associated data for a domain
+    #
+    def get_domain_data
+      get_stored_paths.inject({ 'domain' => base_uri.hostname, 'paths' => {}}) do |hsh, path|
+        hsh['paths'][path] = get_path_data(path)
+        hsh
+      end
+    end
+
+    # Returns assets, links to, and links for a given path
+    #
+    def get_path_data(path)
+      {
+        'asset_dependencies' => get_path_assets(path),
+        'links_to' => get_path_links_to(path),
+        'linked_to_from' => get_path_linked_to_from(path)
+      }
+    end
+
+    # Deletes all data for a domain
+    #
+    def clear_stored_results
+      paths = get_stored_paths
+      redis.pipelined do
+        paths.each do |path|
+          [stored_paths_key, paths_visited_key, paths_to_visit_key].each do |key|
+            redis.del key
+          end
+
+          clear_path_results(path)
+        end
+      end
+    end
+
+    # Deletes all path data for a path
+    #
+    def clear_path_results(path)
+      [path_assets_key(path), path_links_to_key(path), path_linked_to_from_key(path)].each do |key|
+        redis.del key
+      end
+    end
+
+    # Redis
+    #
+    def redis
+      @redis ||= Redis.new
+    end
+
+    private
+
+    def path_assets_key(path)
+      "#{prefix}-path-assets-#{base_uri.hostname}-#{path}"
+    end
+
+    def path_links_to_key(path)
+      "#{prefix}-path-links-to-#{base_uri.hostname}-#{path}"
+    end
+
+    def path_linked_to_from_key(path)
+      "#{prefix}-path-linked-to-from-#{base_uri.hostname}-#{path}"
+    end
+
+    def stored_paths_key
+      "#{prefix}-paths-#{base_uri.hostname}"
+    end
+
+    def paths_visited_key
+      "#{prefix}-paths-visited-#{base_uri.hostname}"
+    end
+
+    def paths_to_visit_key
+      "#{prefix}-queued-paths-#{base_uri.hostname}"
+    end
+
+    def prefix
+      'crawler'
+    end
+  end
+end
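All of this state lives in Redis sets whose names come from the private key helpers at the bottom of the module. For a crawl of an illustrative domain.com with a path /about/, the keys look like this:

```ruby
# crawler-paths-domain.com                          all paths seen so far
# crawler-paths-visited-domain.com                  paths already crawled
# crawler-queued-paths-domain.com                   paths still to visit
# crawler-path-assets-domain.com-/about/            assets referenced by /about/
# crawler-path-links-to-domain.com-/about/          paths /about/ links to
# crawler-path-linked-to-from-domain.com-/about/    paths that link to /about/
```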