ruby-crawler 0.0.1
- checksums.yaml +7 -0
- data/.DS_Store +0 -0
- data/.gitignore +19 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +39 -0
- data/Rakefile +11 -0
- data/crawler.gemspec +30 -0
- data/lib/crawler.rb +59 -0
- data/lib/crawler/document.rb +33 -0
- data/lib/crawler/document_parser.rb +45 -0
- data/lib/crawler/error.rb +10 -0
- data/lib/crawler/formatting.rb +16 -0
- data/lib/crawler/http.rb +17 -0
- data/lib/crawler/index.rb +63 -0
- data/lib/crawler/storage.rb +166 -0
- data/lib/crawler/validations.rb +12 -0
- data/lib/crawler/version.rb +3 -0
- data/test/crawler/crawler_test.rb +46 -0
- data/test/crawler/document_test.rb +23 -0
- data/test/crawler/formatting_test.rb +23 -0
- data/test/crawler/http_test.rb +23 -0
- data/test/crawler/index_test.rb +78 -0
- data/test/crawler/storage_test.rb +147 -0
- data/test/crawler/validations_test.rb +10 -0
- data/test/support/domain_html.rb +256 -0
- data/test/test_helper.rb +50 -0
- metadata +191 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 939a77945341f3b298f7ee95d448e49d423ace18
+  data.tar.gz: 19b95592ff7960a39fee570adfd1983c89d12da4
+SHA512:
+  metadata.gz: 9d411c506bced66661da541bfe1a4e6ec10903a9800789481b8788f7cad98f2de8e9572327862aa381fa6d02ef902533df5ac6c4bcf421a002344a11eae5d305
+  data.tar.gz: e09ecb14da9a0c246cd825139366dad39f8497b23fd9aeee264c28e05560a4743ec8d3f1cf351686542a350f8a8cfd942747982006cf01ae21a658dd09d0d357
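These are the standard RubyGems release checksums. A minimal sketch of verifying them locally, assuming you have extracted `metadata.gz` and `data.tar.gz` from the downloaded `.gem` archive (file names are assumptions):

```ruby
require 'digest'

# Compare the published SHA512 digests against freshly computed ones.
{
  'metadata.gz' => '9d411c506bced66661da541bfe1a4e6ec10903a9800789481b8788f7cad98f2de8e9572327862aa381fa6d02ef902533df5ac6c4bcf421a002344a11eae5d305',
  'data.tar.gz' => 'e09ecb14da9a0c246cd825139366dad39f8497b23fd9aeee264c28e05560a4743ec8d3f1cf351686542a350f8a8cfd942747982006cf01ae21a658dd09d0d357'
}.each do |file, expected|
  actual = Digest::SHA512.file(file).hexdigest
  puts "#{file}: #{actual == expected ? 'OK' : 'MISMATCH'}"
end
```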
data/.DS_Store
ADDED
Binary file
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
+Copyright (c) 2014 Adam Ryan
+
+MIT License
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,39 @@
+# Crawler
+
+Crawler is a straightforward gem that indexes a given domain and stores the results in redis.
+
+## Installation
+
+Add this line to your application's Gemfile:
+
+    gem 'ruby-crawler'
+
+And then execute:
+
+    $ bundle
+
+Or install it yourself as:
+
+    $ gem install ruby-crawler
+
+## Usage
+
+Crawl a given domain:
+```ruby
+crawler = Crawler.new('http://domain.com')
+crawler.crawl
+```
+
+
+View results:
+```ruby
+crawler.results
+```
+
+## Contributing
+
+1. Fork it ( http://github.com/<my-github-username>/crawler/fork )
+2. Create your feature branch (`git checkout -b my-new-feature`)
+3. Commit your changes (`git commit -am 'Add some feature'`)
+4. Push to the branch (`git push origin my-new-feature`)
+5. Create new Pull Request
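Based on `Crawler::Storage#get_domain_data` later in this diff, `results` returns a plain hash keyed by path. An illustrative shape (the paths and assets here are invented for the example):

```ruby
crawler.results
# => {
#   'domain' => 'domain.com',
#   'paths'  => {
#     '/' => {
#       'asset_dependencies' => ['/css/site.css', '/js/app.js'],
#       'links_to'           => ['/about/'],
#       'linked_to_from'     => ['/about/']
#     }
#   }
# }
```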
data/Rakefile
ADDED
data/crawler.gemspec
ADDED
@@ -0,0 +1,30 @@
+# coding: utf-8
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'crawler/version'
+
+Gem::Specification.new do |spec|
+  spec.name          = "ruby-crawler"
+  spec.version       = Crawler::VERSION
+  spec.authors       = ["Adam Ryan"]
+  spec.email         = ["adam.g.ryan@gmail.com"]
+  spec.summary       = %q{Simple ruby web crawler}
+  spec.description   = %q{Simple ruby web crawler}
+  spec.homepage      = ""
+  spec.license       = "MIT"
+
+  spec.files         = `git ls-files`.split($/)
+  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+  spec.require_paths = ["lib"]
+
+  spec.add_dependency "nokogiri"
+  spec.add_dependency "addressable"
+  spec.add_dependency "redis"
+
+  spec.add_development_dependency "bundler", "~> 1.5"
+  spec.add_development_dependency "rake"
+  spec.add_development_dependency "minitest"
+  spec.add_development_dependency "webmock"
+  spec.add_development_dependency "simplecov"
+end
data/lib/crawler.rb
ADDED
@@ -0,0 +1,59 @@
+require 'addressable/uri'
+require 'crawler/document'
+require 'crawler/error'
+require 'crawler/formatting'
+require 'crawler/index'
+require 'crawler/validations'
+require 'crawler/version'
+
+module Crawler
+  class << self
+    include Formatting
+    include Validations
+
+    attr_accessor :base_uri, :base_url, :index
+
+    # Initialize a new Crawler
+    #
+    # base_uri => base_uri to crawl
+    # base_url => base_url to crawl
+    #
+    def new(domain)
+      @base_uri = Addressable::URI.parse(domain.strip)
+      validate_protocol
+
+      @base_url = construct_url base_uri
+      self
+    end
+
+    # Performs crawl of domain, indexes results
+    #
+    def crawl(url = base_url)
+      document = Crawler::Document.new(url)
+      index.consume_document url.sub(base_url, ''), document
+
+      paths_queue = index.get_paths_to_visit
+      next_path = paths_queue[0]
+
+      print " Pages remaining - #{paths_queue.count} \r"
+      crawl "#{base_url}#{next_path}" if next_path
+    end
+
+    # Returns the indexed results of a crawl
+    #
+    def results
+      index.results
+    end
+
+    private
+
+    # Crawler::Index
+    # New Index used to record results of domain crawl
+    #
+    def index
+      @index ||= Crawler::Index.new(base_uri)
+    end
+
+  end
+
+end
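Note that `Crawler` is a module whose singleton `new` returns `self`, so the "instance" is the module itself and crawl state is effectively global. A minimal usage sketch against a hypothetical domain, assuming a redis server is running locally (`Storage` defaults to `Redis.new`):

```ruby
require 'crawler'

crawler = Crawler.new('http://example.com') # parses the URL, validates the protocol, returns the Crawler module
crawler.crawl                               # recursively works through the queue, one page per recursive call
crawler.results                             # reads the indexed data back out of redis
```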
data/lib/crawler/document.rb
ADDED
@@ -0,0 +1,33 @@
+require 'crawler/document_parser'
+require 'crawler/formatting'
+
+module Crawler
+  class Document
+    include DocumentParser
+    include Formatting
+
+    attr_accessor :uri, :url, :content, :links, :domain_specific_paths, :static_assets
+
+    # Initialize a new Document
+    #
+    # uri => uri of the document
+    # url => url of the document
+    # links => links found in the document
+    # domain_specific_paths => paths in the document related to the crawler's base domain
+    # static_assets => static_assets found in the document
+    #
+    def initialize(url)
+      @uri = Addressable::URI.parse(url.strip)
+      @url = construct_url uri
+      @links = extract_links
+      @domain_specific_paths = extract_domain_specific_paths
+      @static_assets = extract_assets
+    end
+
+    # Nokogiri::HTML::Document
+    #
+    def content
+      @content ||= parse_content uri
+    end
+  end
+end
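Although `content` is memoized, `initialize` calls `extract_links`, so the HTTP request happens at construction time (via the `Http` module, whose body is not shown in this extract). A hypothetical example, assuming the URL is reachable:

```ruby
doc = Crawler::Document.new('http://example.com/about')

doc.links                  # every non-'#' href on the page
doc.domain_specific_paths  # normalized paths that stay on example.com
doc.static_assets          # img/script srcs, video posters, link hrefs
```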
data/lib/crawler/document_parser.rb
ADDED
@@ -0,0 +1,45 @@
+require 'nokogiri'
+require 'crawler/http'
+require 'crawler/formatting'
+
+module Crawler
+  module DocumentParser
+    include Formatting
+    include Http
+
+    private
+
+    # Parses the HTML from an http response
+    #
+    def parse_content(uri)
+      Nokogiri::HTML request(uri)
+    end
+
+    # Returns the links from the html document
+    #
+    def extract_links
+      content.css('a').map { |a| a['href'] unless a['href'] == '#' }.compact.uniq
+    end
+
+    # Returns the static assets from the html document
+    #
+    def extract_assets
+      assets = content.css('img', 'script').map { |i| i['src'] }
+      assets |= content.css('video').map { |v| v['poster'] }
+      assets |= content.css('link').map { |l| l['href'] }
+
+      assets.compact.uniq
+    end
+
+    # Returns the paths that are related to the given domain
+    #
+    def extract_domain_specific_paths
+      links.map do |link|
+        uri = Addressable::URI.parse(link.strip)
+        if uri.hostname.nil? || uri.hostname == @uri.hostname
+          normalize_path uri.path
+        end
+      end.compact
+    end
+  end
+end
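A standalone sketch of the same Nokogiri extraction logic, runnable without the crawler (the HTML is invented for the example):

```ruby
require 'nokogiri'

html = <<~HTML
  <a href="/about">About</a>
  <a href="#">noop</a>
  <img src="/logo.png">
  <script src="/app.js"></script>
  <link rel="stylesheet" href="/site.css">
HTML

doc = Nokogiri::HTML(html)

# Links, skipping bare '#' anchors, de-duplicated
doc.css('a').map { |a| a['href'] unless a['href'] == '#' }.compact.uniq
# => ["/about"]

# Static assets from img/script srcs plus stylesheet link hrefs
assets = doc.css('img', 'script').map { |i| i['src'] }
assets |= doc.css('link').map { |l| l['href'] }
assets.compact.uniq
# => ["/logo.png", "/app.js", "/site.css"]
```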
data/lib/crawler/formatting.rb
ADDED
@@ -0,0 +1,16 @@
+module Crawler
+  module Formatting
+
+    # normalize paths => '/path/to/'
+    #
+    def normalize_path(path)
+      "#{path}/".gsub(/\/+/, '/')
+    end
+
+    # constructs a full url from a given uri object
+    #
+    def construct_url(uri)
+      "#{uri.scheme}://#{uri.host}#{uri.path}"
+    end
+  end
+end
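These helpers keep every path in one canonical, trailing-slash form so redis set lookups stay consistent. A few illustrative calls:

```ruby
require 'addressable/uri'

include Crawler::Formatting

normalize_path('/blog')      # => "/blog/"
normalize_path('//blog//a')  # => "/blog/a/"  (repeated slashes collapse)
normalize_path('')           # => "/"

construct_url(Addressable::URI.parse('http://example.com/x?page=2'))
# => "http://example.com/x"  (query string and fragment are dropped)
```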
data/lib/crawler/http.rb
ADDED
data/lib/crawler/index.rb
ADDED
@@ -0,0 +1,63 @@
+require 'crawler/formatting'
+require 'crawler/storage'
+
+module Crawler
+  class Index
+    include Formatting
+    include Storage
+
+    attr_accessor :base_uri
+
+    # New Index to record paths for a given domain
+    #
+    def initialize(base_uri)
+      @base_uri = base_uri
+      clear_stored_results
+    end
+
+    # Ingests a Crawler::Document, stores all relevant data in redis
+    # Updates pages that need to be visited as well as pages that have been visited already
+    #
+    def consume_document(path, document)
+      path = normalize_path path
+      new_links = document.domain_specific_paths.map { |path| normalize_path path }
+
+      store_path path
+
+      store_path_visited path
+
+      store_path_assets path, document.static_assets
+
+      store_path_links_to path, new_links
+
+      store_paths_to_visit(new_links - get_paths_visited)
+
+      remove_path_from_queue path
+
+      update_paths_linked_to_from_path(document)
+    end
+
+    # Returns the data associated with an indexed domain
+    #
+    def results
+      get_domain_data
+    end
+
+    private
+
+    # Records incoming links for pages
+    # Uses the current path as the incoming link
+    # Records the current_path as incoming on all links found in the current document
+    #
+    def update_paths_linked_to_from_path(document)
+      document.domain_specific_paths.each do |url|
+        link_uri_path = normalize_path Addressable::URI.parse(url.strip).path
+        document_uri_path = normalize_path document.uri.path
+        next if link_uri_path == document_uri_path
+
+        store_path link_uri_path
+        store_path_linked_to_from(link_uri_path, [document_uri_path])
+      end
+    end
+  end
+end
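A hedged sketch of one indexing step, assuming a reachable page and a running redis server (the URLs are hypothetical):

```ruby
require 'crawler'

base_uri = Addressable::URI.parse('http://example.com')
index    = Crawler::Index.new(base_uri)    # clears any previous results for the domain

document = Crawler::Document.new('http://example.com/about')
index.consume_document('/about', document) # records the visit, queues newly found paths

index.get_paths_to_visit  # discovered-but-unvisited paths
index.results             # everything indexed so far
```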
data/lib/crawler/storage.rb
ADDED
@@ -0,0 +1,166 @@
+require 'redis'
+
+module Crawler
+  module Storage
+
+    # Stores a path for the domain
+    #
+    def store_path(path)
+      redis.sadd stored_paths_key, path
+    end
+
+    # Stores the static assets for a path
+    #
+    def store_path_assets(path, assets)
+      return if assets.empty?
+      redis.sadd path_assets_key(path), assets
+    end
+
+    # Stores paths that the current path links to
+    #
+    def store_path_links_to(path, links)
+      return if links.empty?
+      redis.sadd path_links_to_key(path), links
+    end
+
+    # Stores paths that link to the current path
+    #
+    def store_path_linked_to_from(path, links)
+      return if links.empty?
+      redis.sadd path_linked_to_from_key(path), links
+    end
+
+    # Stores paths that have been indexed for a domain
+    #
+    def store_path_visited(path)
+      redis.sadd paths_visited_key, path
+    end
+
+    # Stores paths that need to be visited for a domain
+    #
+    def store_paths_to_visit(paths)
+      return if paths.empty?
+      redis.sadd paths_to_visit_key, paths
+    end
+
+    # Removes a path from paths that need to be visited
+    #
+    def remove_path_from_queue(path)
+      redis.srem paths_to_visit_key, path
+    end
+
+    # Returns known paths for domain
+    #
+    def get_stored_paths
+      redis.smembers stored_paths_key
+    end
+
+    # Returns paths that haven't been indexed
+    #
+    def get_paths_to_visit
+      redis.smembers paths_to_visit_key
+    end
+
+    # Returns assets for a path
+    #
+    def get_path_assets(path)
+      redis.smembers path_assets_key(path)
+    end
+
+    # Returns links that a path links to
+    #
+    def get_path_links_to(path)
+      redis.smembers path_links_to_key(path)
+    end
+
+    # Returns links that link to a path
+    #
+    def get_path_linked_to_from(path)
+      redis.smembers path_linked_to_from_key(path)
+    end
+
+    # Returns paths that have been indexed for a domain
+    #
+    def get_paths_visited
+      redis.smembers paths_visited_key
+    end
+
+    # Returns paths and associated data for a domain
+    #
+    def get_domain_data
+      get_stored_paths.inject({ 'domain' => base_uri.hostname, 'paths' => {}}) do |hsh, path|
+        hsh['paths'][path] = get_path_data(path)
+        hsh
+      end
+    end
+
+    # Returns assets, links to, and links for a given path
+    #
+    def get_path_data(path)
+      {
+        'asset_dependencies' => get_path_assets(path),
+        'links_to' => get_path_links_to(path),
+        'linked_to_from' => get_path_linked_to_from(path)
+      }
+    end
+
+    # Deletes all data for a domain
+    #
+    def clear_stored_results
+      paths = get_stored_paths
+      redis.pipelined do
+        paths.each do |path|
+          [stored_paths_key, paths_visited_key, paths_to_visit_key].each do |key|
+            redis.del key
+          end
+
+          clear_path_results(path)
+        end
+      end
+    end
+
+    # Deletes all path data for a path
+    #
+    def clear_path_results(path)
+      [path_assets_key(path), path_links_to_key(path), path_linked_to_from_key(path)].each do |key|
+        redis.del key
+      end
+    end
+
+    # Redis
+    #
+    def redis
+      @redis ||= Redis.new
+    end
+
+    private
+
+    def path_assets_key(path)
+      "#{prefix}-path-assets-#{base_uri.hostname}-#{path}"
+    end
+
+    def path_links_to_key(path)
+      "#{prefix}-path-links-to-#{base_uri.hostname}-#{path}"
+    end
+
+    def path_linked_to_from_key(path)
+      "#{prefix}-path-linked-to-from-#{base_uri.hostname}-#{path}"
+    end
+
+    def stored_paths_key
+      "#{prefix}-paths-#{base_uri.hostname}"
+    end
+
+    def paths_visited_key
+      "#{prefix}-paths-visited-#{base_uri.hostname}"
+    end
+
+    def paths_to_visit_key
+      "#{prefix}-queued-paths-#{base_uri.hostname}"
+    end
+
+    def prefix
+      'crawler'
+    end
+  end
+end
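All state lives in redis sets under hostname-scoped keys, so the results of a crawl can be inspected directly. A sketch using the key scheme produced by the private helpers above, for a hypothetical crawl of example.com:

```ruby
require 'redis'

redis = Redis.new

redis.smembers 'crawler-paths-example.com'               # every known path
redis.smembers 'crawler-paths-visited-example.com'       # paths already crawled
redis.smembers 'crawler-queued-paths-example.com'        # paths still to crawl
redis.smembers 'crawler-path-assets-example.com-/about/' # assets referenced by /about/
```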