ruby-crawler 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 939a77945341f3b298f7ee95d448e49d423ace18
+   data.tar.gz: 19b95592ff7960a39fee570adfd1983c89d12da4
+ SHA512:
+   metadata.gz: 9d411c506bced66661da541bfe1a4e6ec10903a9800789481b8788f7cad98f2de8e9572327862aa381fa6d02ef902533df5ac6c4bcf421a002344a11eae5d305
+   data.tar.gz: e09ecb14da9a0c246cd825139366dad39f8497b23fd9aeee264c28e05560a4743ec8d3f1cf351686542a350f8a8cfd942747982006cf01ae21a658dd09d0d357
Binary file
@@ -0,0 +1,19 @@
+ *.gem
+ *.rbc
+ .bundle
+ .config
+ .yardoc
+ Gemfile.lock
+ InstalledFiles
+ _yardoc
+ coverage
+ doc/
+ lib/bundler/man
+ pkg
+ rdoc
+ spec/reports
+ test/tmp
+ test/version_tmp
+ tmp
+ coverage
+ dump.rdb
data/Gemfile ADDED
@@ -0,0 +1,4 @@
+ source 'https://rubygems.org'
+
+ # Specify your gem's dependencies in crawler.gemspec
+ gemspec
@@ -0,0 +1,22 @@
+ Copyright (c) 2014 Adam Ryan
+
+ MIT License
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,39 @@
+ # Crawler
+
+ Crawler is a straightforward gem that indexes a given domain and stores the results in Redis.
+
+ ## Installation
+
+ Add this line to your application's Gemfile:
+
+     gem 'ruby-crawler'
+
+ And then execute:
+
+     $ bundle
+
+ Or install it yourself as:
+
+     $ gem install ruby-crawler
+
+ ## Usage
+
+ Crawl a given domain
+ ```ruby
+ crawler = Crawler.new('http://domain.com')
+ crawler.crawl
+ ```
+
+
+ View Results
+ ```ruby
+ crawler.results
+ ```
+
+ ## Contributing
+
+ 1. Fork it ( http://github.com/<my-github-username>/crawler/fork )
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
+ 4. Push to the branch (`git push origin my-new-feature`)
+ 5. Create new Pull Request
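For reference, the hash returned by `crawler.results` takes its shape from `get_domain_data` and `get_path_data` in the Storage module further down in this diff; the domain, paths, and values below are hypothetical:

```ruby
# Hypothetical results hash for a small site at http://domain.com,
# mirroring Storage#get_domain_data / Storage#get_path_data.
{
  'domain' => 'domain.com',
  'paths' => {
    '/' => {
      'asset_dependencies' => ['/css/app.css', '/js/app.js'],
      'links_to'           => ['/about/'],
      'linked_to_from'     => ['/about/']
    },
    '/about/' => {
      'asset_dependencies' => [],
      'links_to'           => ['/'],
      'linked_to_from'     => ['/']
    }
  }
}
```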
@@ -0,0 +1,11 @@
+ require 'bundler'
+ require 'bundler/gem_tasks'
+ Bundler::GemHelper.install_tasks
+
+ require 'rake/testtask'
+ Rake::TestTask.new(:test) do |t|
+   t.libs << 'test'
+   t.pattern = 'test/**/*_test.rb'
+ end
+
+ task :default => :test
@@ -0,0 +1,30 @@
+ # coding: utf-8
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require 'crawler/version'
+
+ Gem::Specification.new do |spec|
+   spec.name = "ruby-crawler"
+   spec.version = Crawler::VERSION
+   spec.authors = ["Adam Ryan"]
+   spec.email = ["adam.g.ryan@gmail.com"]
+   spec.summary = %q{Simple ruby web crawler}
+   spec.description = %q{Simple ruby web crawler}
+   spec.homepage = ""
+   spec.license = "MIT"
+
+   spec.files = `git ls-files`.split($/)
+   spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+   spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
+   spec.require_paths = ["lib"]
+
+   spec.add_dependency "nokogiri"
+   spec.add_dependency "addressable"
+   spec.add_dependency "redis"
+
+   spec.add_development_dependency "bundler", "~> 1.5"
+   spec.add_development_dependency "rake"
+   spec.add_development_dependency "minitest"
+   spec.add_development_dependency "webmock"
+   spec.add_development_dependency "simplecov"
+ end
@@ -0,0 +1,59 @@
+ require 'addressable/uri'
+ require 'crawler/document'
+ require 'crawler/error'
+ require 'crawler/formatting'
+ require 'crawler/index'
+ require 'crawler/validations'
+ require 'crawler/version'
+
+ module Crawler
+   class << self
+     include Formatting
+     include Validations
+
+     attr_accessor :base_uri, :base_url, :index
+
+     # Initialize a new Crawler
+     #
+     # base_uri => base_uri to crawl
+     # base_url => base_url to crawl
+     #
+     def new(domain)
+       @base_uri = Addressable::URI.parse(domain.strip)
+       validate_protocol
+
+       @base_url = construct_url base_uri
+       self
+     end
+
+     # Performs crawl of domain, indexes results
+     #
+     def crawl(url = base_url)
+       document = Crawler::Document.new(url)
+       index.consume_document url.sub(base_url, ''), document
+
+       paths_queue = index.get_paths_to_visit
+       next_path = paths_queue[0]
+
+       print " Pages remaining - #{paths_queue.count} \r"
+       crawl "#{base_url}#{next_path}" if next_path
+     end
+
+     # Returns the indexed results of a crawl
+     #
+     def results
+       index.results
+     end
+
+     private
+
+     # Crawler::Index
+     # New Index used to record results of domain crawl
+     #
+     def index
+       @index ||= Crawler::Index.new(base_uri)
+     end
+
+   end
+
+ end
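One detail worth calling out in the file above: `new` is defined on the module's singleton class and returns `self`, so the README's `crawler` variable is the `Crawler` module itself. A minimal sketch of what that implies (the domain is hypothetical):

```ruby
crawler = Crawler.new('http://domain.com') # parses the URI, validates the protocol, returns the Crawler module
crawler.equal?(Crawler)                    # => true; crawler.crawl and Crawler.crawl are the same call
crawler.crawl                              # recursively visits queued paths until the queue is empty
crawler.results                            # delegates to the Crawler::Index built for base_uri
```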
@@ -0,0 +1,33 @@
+ require 'crawler/document_parser'
+ require 'crawler/formatting'
+
+ module Crawler
+   class Document
+     include DocumentParser
+     include Formatting
+
+     attr_accessor :uri, :url, :content, :links, :domain_specific_paths, :static_assets
+
+     # Initialize a new Document
+     #
+     # uri => uri of the document
+     # url => url of the document
+     # links => links found in the document
+     # domain_specific_paths => paths in the document related to the crawler's base domain
+     # static_assets => static_assets found in the document
+     #
+     def initialize(url)
+       @uri = Addressable::URI.parse(url.strip)
+       @url = construct_url uri
+       @links = extract_links
+       @domain_specific_paths = extract_domain_specific_paths
+       @static_assets = extract_assets
+     end
+
+     # Nokogiri::HTML::Document
+     #
+     def content
+       @content ||= parse_content uri
+     end
+   end
+ end
@@ -0,0 +1,45 @@
+ require 'nokogiri'
+ require 'crawler/http'
+ require 'crawler/formatting'
+
+ module Crawler
+   module DocumentParser
+     include Formatting
+     include Http
+
+     private
+
+     # Parses the HTML from an http response
+     #
+     def parse_content(uri)
+       Nokogiri::HTML request(uri)
+     end
+
+     # Returns the links from the html document
+     #
+     def extract_links
+       content.css('a').map { |a| a['href'] unless a['href'] == '#' }.compact.uniq
+     end
+
+     # Returns the static assets from the html document
+     #
+     def extract_assets
+       assets = content.css('img', 'script').map { |i| i['src'] }
+       assets |= content.css('video').map { |v| v['poster'] }
+       assets |= content.css('link').map { |l| l['href'] }
+
+       assets.compact.uniq
+     end
+
+     # Returns the paths that are related to the given domain
+     #
+     def extract_domain_specific_paths
+       links.map do |link|
+         uri = Addressable::URI.parse(link.strip)
+         if uri.hostname.nil? || uri.hostname == @uri.hostname
+           normalize_path uri.path
+         end
+       end.compact
+     end
+   end
+ end
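To make the parser's filtering concrete, a hypothetical page at `http://domain.com/` whose anchors point at `#`, `/about`, `http://domain.com/blog`, and `http://other.com/x` would reduce as follows (method names from the module above):

```ruby
# Hypothetical outcome for the anchors listed in the lead-in:
extract_links                  # => ["/about", "http://domain.com/blog", "http://other.com/x"]
                               #    '#' anchors are skipped and duplicates removed
extract_domain_specific_paths  # => ["/about/", "/blog/"]
                               #    links on other hosts are dropped, paths are normalized
```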
@@ -0,0 +1,10 @@
+ module Crawler
+
+   # Base Crawler Error
+   #
+   class Error < StandardError; end
+
+   # Raised when invalid protocol is used
+   #
+   class InvalidProtocolError < Error; end
+ end
@@ -0,0 +1,16 @@
+ module Crawler
+   module Formatting
+
+     # normalize paths => '/path/to/'
+     #
+     def normalize_path(path)
+       "#{path}/".gsub(/\/+/, '/')
+     end
+
+     # constructs a full url from a given uri object
+     #
+     def construct_url(uri)
+       "#{uri.scheme}://#{uri.host}#{uri.path}"
+     end
+   end
+ end
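A few illustrative calls for the two helpers above; the inputs are hypothetical and assume the module has been mixed in:

```ruby
require 'addressable/uri'
include Crawler::Formatting

normalize_path('/about')        # => "/about/"
normalize_path('//blog//post')  # => "/blog/post/"
normalize_path('')              # => "/"

construct_url(Addressable::URI.parse('http://domain.com/about?page=2'))
# => "http://domain.com/about"  (query string and fragment are dropped)
```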
@@ -0,0 +1,17 @@
+ require 'net/http'
+ require 'open-uri'
+
+ module Crawler
+   module Http
+
+     # Open URI HTTP request
+     #
+     def request(uri)
+       begin
+         open(uri)
+       rescue
+         ''
+       end
+     end
+   end
+ end
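`request` goes through open-uri's `Kernel#open` and a bare `rescue` that swallows every error. On newer Rubies, open-uri is driven through `URI.open` instead; the sketch below is only an assumption about how the method could be adapted, not part of the gem:

```ruby
require 'net/http'
require 'open-uri'
require 'timeout'

module Crawler
  module Http
    # Fetch a URI with open-uri; return an empty body on common network failures.
    def request(uri)
      URI.open(uri.to_s).read
    rescue OpenURI::HTTPError, SocketError, SystemCallError, Timeout::Error
      ''
    end
  end
end
```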
@@ -0,0 +1,63 @@
+ require 'crawler/formatting'
+ require 'crawler/storage'
+
+ module Crawler
+   class Index
+     include Formatting
+     include Storage
+
+     attr_accessor :base_uri
+
+     # New Index to record paths for a given domain
+     #
+     def initialize(base_uri)
+       @base_uri = base_uri
+       clear_stored_results
+     end
+
+     # Ingests a Crawler::Document, stores all relevant data in redis
+     # Updates pages that need to be visited as well as pages that have been visited already
+     #
+     def consume_document(path, document)
+       path = normalize_path path
+       new_links = document.domain_specific_paths.map { |path| normalize_path path }
+
+       store_path path
+
+       store_path_visited path
+
+       store_path_assets path, document.static_assets
+
+       store_path_links_to path, new_links
+
+       store_paths_to_visit(new_links - get_paths_visited)
+
+       remove_path_from_queue path
+
+       update_paths_linked_to_from_path(document)
+     end
+
+     # Returns the data associated with an indexed domain
+     #
+     def results
+       get_domain_data
+     end
+
+     private
+
+     # Records incoming links for pages
+     # Uses the current path as the incoming link
+     # Records the current_path as incoming on all links found in the current document
+     #
+     def update_paths_linked_to_from_path(document)
+       document.domain_specific_paths.each do |url|
+         link_uri_path = normalize_path Addressable::URI.parse(url.strip).path
+         document_uri_path = normalize_path document.uri.path
+         next if link_uri_path == document_uri_path
+
+         store_path link_uri_path
+         store_path_linked_to_from(link_uri_path, [document_uri_path])
+       end
+     end
+   end
+ end
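Read alongside `consume_document` above, the bookkeeping for a single page can be traced step by step; the path, links, and asset below are hypothetical:

```ruby
# Hypothetical trace of Index#consume_document('/', document), where the page at '/'
# links to '/about/' and '/contact/' and references '/js/app.js':
#
#   store_path '/'                                     # '/' joins the known-paths set
#   store_path_visited '/'                             # '/' is marked as indexed
#   store_path_assets '/', ['/js/app.js']              # assets recorded for '/'
#   store_path_links_to '/', ['/about/', '/contact/']  # outbound links recorded for '/'
#   store_paths_to_visit ['/about/', '/contact/']      # queued, minus paths already visited
#   remove_path_from_queue '/'                         # '/' leaves the to-visit queue
#   update_paths_linked_to_from_path(document)         # '/' recorded as an inbound link on both paths
```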
@@ -0,0 +1,166 @@
+ require 'redis'
+
+ module Crawler
+   module Storage
+
+     # Stores a path for the domain
+     #
+     def store_path(path)
+       redis.sadd stored_paths_key, path
+     end
+
+     # Stores the static assets for a path
+     #
+     def store_path_assets(path, assets)
+       return if assets.empty?
+       redis.sadd path_assets_key(path), assets
+     end
+
+     # Stores paths that the current path links to
+     #
+     def store_path_links_to(path, links)
+       return if links.empty?
+       redis.sadd path_links_to_key(path), links
+     end
+
+     # Stores paths that link to the current path
+     #
+     def store_path_linked_to_from(path, links)
+       return if links.empty?
+       redis.sadd path_linked_to_from_key(path), links
+     end
+
+     # Stores paths that have been indexed for a domain
+     #
+     def store_path_visited(path)
+       redis.sadd paths_visited_key, path
+     end
+
+     # Stores paths that need to be visited for a domain
+     #
+     def store_paths_to_visit(paths)
+       return if paths.empty?
+       redis.sadd paths_to_visit_key, paths
+     end
+
+     # Removes a path from paths that need to be visited
+     #
+     def remove_path_from_queue(path)
+       redis.srem paths_to_visit_key, path
+     end
+
+     # Returns known paths for domain
+     #
+     def get_stored_paths
+       redis.smembers stored_paths_key
+     end
+
+     # Returns paths that haven't been indexed
+     #
+     def get_paths_to_visit
+       redis.smembers paths_to_visit_key
+     end
+
+     # Returns assets for a path
+     #
+     def get_path_assets(path)
+       redis.smembers path_assets_key(path)
+     end
+
+     # Returns links that a path links to
+     #
+     def get_path_links_to(path)
+       redis.smembers path_links_to_key(path)
+     end
+
+     # Returns links that link to a path
+     #
+     def get_path_linked_to_from(path)
+       redis.smembers path_linked_to_from_key(path)
+     end
+
+     # Returns paths that have been indexed for a domain
+     #
+     def get_paths_visited
+       redis.smembers paths_visited_key
+     end
+
+     # Returns paths and associated data for a domain
+     #
+     def get_domain_data
+       get_stored_paths.inject({ 'domain' => base_uri.hostname, 'paths' => {}}) do |hsh, path|
+         hsh['paths'][path] = get_path_data(path)
+         hsh
+       end
+     end
+
+     # Returns assets, links to, and links for a given path
+     #
+     def get_path_data(path)
+       {
+         'asset_dependencies' => get_path_assets(path),
+         'links_to' => get_path_links_to(path),
+         'linked_to_from' => get_path_linked_to_from(path)
+       }
+     end
+
+     # Deletes all data for a domain
+     #
+     def clear_stored_results
+       paths = get_stored_paths
+       redis.pipelined do
+         paths.each do |path|
+           [stored_paths_key, paths_visited_key, paths_to_visit_key].each do |key|
+             redis.del key
+           end
+
+           clear_path_results(path)
+         end
+       end
+     end
+
+     # Deletes all path data for a path
+     #
+     def clear_path_results(path)
+       [path_assets_key(path), path_links_to_key(path), path_linked_to_from_key(path)].each do |key|
+         redis.del key
+       end
+     end
+
+     # Redis
+     #
+     def redis
+       @redis ||= Redis.new
+     end
+
+     private
+
+     def path_assets_key(path)
+       "#{prefix}-path-assets-#{base_uri.hostname}-#{path}"
+     end
+
+     def path_links_to_key(path)
+       "#{prefix}-path-links-to-#{base_uri.hostname}-#{path}"
+     end
+
+     def path_linked_to_from_key(path)
+       "#{prefix}-path-linked-to-from-#{base_uri.hostname}-#{path}"
+     end
+
+     def stored_paths_key
+       "#{prefix}-paths-#{base_uri.hostname}"
+     end
+
+     def paths_visited_key
+       "#{prefix}-paths-visited-#{base_uri.hostname}"
+     end
+
+     def paths_to_visit_key
+       "#{prefix}-queued-paths-#{base_uri.hostname}"
+     end
+
+     def prefix
+       'crawler'
+     end
+   end
+ end
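All of the keys above share the `crawler` prefix and the crawled hostname, so one domain's data never collides with another's. A hypothetical key layout for a crawl of `http://domain.com` that has indexed `/about/`:

```ruby
# Hypothetical Redis keys produced by the private key helpers above:
#
#   crawler-paths-domain.com                         # every path seen for the domain
#   crawler-paths-visited-domain.com                 # paths already indexed
#   crawler-queued-paths-domain.com                  # paths still waiting to be crawled
#   crawler-path-assets-domain.com-/about/           # static assets found on /about/
#   crawler-path-links-to-domain.com-/about/         # paths /about/ links to
#   crawler-path-linked-to-from-domain.com-/about/   # paths that link to /about/
```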