ruby-crawler 0.0.1

@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 939a77945341f3b298f7ee95d448e49d423ace18
+   data.tar.gz: 19b95592ff7960a39fee570adfd1983c89d12da4
+ SHA512:
+   metadata.gz: 9d411c506bced66661da541bfe1a4e6ec10903a9800789481b8788f7cad98f2de8e9572327862aa381fa6d02ef902533df5ac6c4bcf421a002344a11eae5d305
+   data.tar.gz: e09ecb14da9a0c246cd825139366dad39f8497b23fd9aeee264c28e05560a4743ec8d3f1cf351686542a350f8a8cfd942747982006cf01ae21a658dd09d0d357
Binary file
@@ -0,0 +1,19 @@
+ *.gem
+ *.rbc
+ .bundle
+ .config
+ .yardoc
+ Gemfile.lock
+ InstalledFiles
+ _yardoc
+ coverage
+ doc/
+ lib/bundler/man
+ pkg
+ rdoc
+ spec/reports
+ test/tmp
+ test/version_tmp
+ tmp
+ coverage
+ dump.rdb
data/Gemfile ADDED
@@ -0,0 +1,4 @@
+ source 'https://rubygems.org'
+
+ # Specify your gem's dependencies in crawler.gemspec
+ gemspec
@@ -0,0 +1,22 @@
+ Copyright (c) 2014 Adam Ryan
+
+ MIT License
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,39 @@
+ # Crawler
+
+ Crawler is a straightforward gem that indexes a given domain and stores the results in Redis.
+
+ ## Installation
+
+ Add this line to your application's Gemfile:
+
+     gem 'ruby-crawler'
+
+ And then execute:
+
+     $ bundle
+
+ Or install it yourself as:
+
+     $ gem install ruby-crawler
+
+ ## Usage
+
+ Crawl a given domain:
+ ```ruby
+ crawler = Crawler.new('http://domain.com')
+ crawler.crawl
+ ```
+
+
+ View the results:
+ ```ruby
+ crawler.results
+ ```
+
+ ## Contributing
+
+ 1. Fork it ( http://github.com/<my-github-username>/crawler/fork )
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
+ 4. Push to the branch (`git push origin my-new-feature`)
+ 5. Create a new Pull Request
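
For context, `crawler.results` returns a plain Hash built by the storage layer shown later in this diff. A minimal sketch of its shape, assuming a local Redis server with default settings; `domain.com` and the paths shown are hypothetical:

```ruby
crawler = Crawler.new('http://domain.com')   # hypothetical domain
crawler.crawl

crawler.results
# => {
#      'domain' => 'domain.com',
#      'paths'  => {
#        '/' => {
#          'asset_dependencies' => ['/assets/app.css', '/assets/app.js'],
#          'links_to'           => ['/about/'],
#          'linked_to_from'     => ['/about/']
#        },
#        '/about/' => {
#          'asset_dependencies' => ['/assets/app.css'],
#          'links_to'           => ['/'],
#          'linked_to_from'     => ['/']
#        }
#      }
#    }
```
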
@@ -0,0 +1,11 @@
+ require 'bundler'
+ require 'bundler/gem_tasks'
+ Bundler::GemHelper.install_tasks
+
+ require 'rake/testtask'
+ Rake::TestTask.new(:test) do |t|
+   t.libs << 'test'
+   t.pattern = 'test/**/*_test.rb'
+ end
+
+ task :default => :test
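
The test task runs anything matching `test/**/*_test.rb`. No tests appear in this part of the diff, but a hypothetical example that the task would pick up (using the minitest development dependency) might look like:

```ruby
# test/formatting_test.rb -- hypothetical example; the gem's actual tests are not shown here.
require 'minitest/autorun'
require 'crawler/formatting'

class FormattingTest < Minitest::Test
  include Crawler::Formatting

  def test_normalize_path_adds_trailing_slash_and_collapses_duplicate_slashes
    assert_equal '/about/team/', normalize_path('/about//team')
  end
end
```
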
@@ -0,0 +1,30 @@
+ # coding: utf-8
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require 'crawler/version'
+
+ Gem::Specification.new do |spec|
+   spec.name          = "ruby-crawler"
+   spec.version       = Crawler::VERSION
+   spec.authors       = ["Adam Ryan"]
+   spec.email         = ["adam.g.ryan@gmail.com"]
+   spec.summary       = %q{Simple ruby web crawler}
+   spec.description   = %q{Simple ruby web crawler}
+   spec.homepage      = ""
+   spec.license       = "MIT"
+
+   spec.files         = `git ls-files`.split($/)
+   spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+   spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+   spec.require_paths = ["lib"]
+
+   spec.add_dependency "nokogiri"
+   spec.add_dependency "addressable"
+   spec.add_dependency "redis"
+
+   spec.add_development_dependency "bundler", "~> 1.5"
+   spec.add_development_dependency "rake"
+   spec.add_development_dependency "minitest"
+   spec.add_development_dependency "webmock"
+   spec.add_development_dependency "simplecov"
+ end
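
The gemspec reads `Crawler::VERSION` from `crawler/version`, a file that is not included in this part of the diff; for the 0.0.1 release its contents would presumably be just:

```ruby
# lib/crawler/version.rb -- assumed contents, inferred from the gemspec and the 0.0.1 release number.
module Crawler
  VERSION = '0.0.1'
end
```
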
@@ -0,0 +1,59 @@
+ require 'addressable/uri'
+ require 'crawler/document'
+ require 'crawler/error'
+ require 'crawler/formatting'
+ require 'crawler/index'
+ require 'crawler/validations'
+ require 'crawler/version'
+
+ module Crawler
+   class << self
+     include Formatting
+     include Validations
+
+     attr_accessor :base_uri, :base_url, :index
+
+     # Initialize a new Crawler
+     #
+     # base_uri => Addressable::URI parsed from the given domain
+     # base_url => normalized "scheme://host/path" string for that URI
+     #
+     def new(domain)
+       @base_uri = Addressable::URI.parse(domain.strip)
+       validate_protocol
+
+       @base_url = construct_url base_uri
+       self
+     end
+
+     # Performs a crawl of the domain and indexes the results
+     #
+     def crawl(url = base_url)
+       document = Crawler::Document.new(url)
+       index.consume_document url.sub(base_url, ''), document
+
+       paths_queue = index.get_paths_to_visit
+       next_path = paths_queue[0]
+
+       print " Pages remaining - #{paths_queue.count} \r"
+       crawl "#{base_url}#{next_path}" if next_path
+     end
+
+     # Returns the indexed results of a crawl
+     #
+     def results
+       index.results
+     end
+
+     private
+
+     # Crawler::Index
+     # New Index used to record the results of a domain crawl
+     #
+     def index
+       @index ||= Crawler::Index.new(base_uri)
+     end
+
+   end
+
+ end
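
`crawl` recurses once per page, pulling the next path from the Redis-backed queue on each call, so a very large site means a correspondingly deep call stack. A sketch (not the gem's code) of the same loop written iteratively inside the same `class << self` block:

```ruby
# Iterative variant of Crawler.crawl -- a sketch only, not part of the gem.
def crawl(url = base_url)
  while url
    document = Crawler::Document.new(url)
    index.consume_document url.sub(base_url, ''), document

    paths_queue = index.get_paths_to_visit
    print " Pages remaining - #{paths_queue.count} \r"

    next_path = paths_queue[0]
    url = next_path && "#{base_url}#{next_path}"
  end
end
```
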
@@ -0,0 +1,33 @@
+ require 'crawler/document_parser'
+ require 'crawler/formatting'
+
+ module Crawler
+   class Document
+     include DocumentParser
+     include Formatting
+
+     attr_accessor :uri, :url, :content, :links, :domain_specific_paths, :static_assets
+
+     # Initialize a new Document
+     #
+     # uri                   => uri of the document
+     # url                   => url of the document
+     # links                 => links found in the document
+     # domain_specific_paths => paths in the document related to the crawler's base domain
+     # static_assets         => static assets found in the document
+     #
+     def initialize(url)
+       @uri = Addressable::URI.parse(url.strip)
+       @url = construct_url uri
+       @links = extract_links
+       @domain_specific_paths = extract_domain_specific_paths
+       @static_assets = extract_assets
+     end
+
+     # Nokogiri::HTML::Document
+     #
+     def content
+       @content ||= parse_content uri
+     end
+   end
+ end
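
A quick sketch of working with a parsed document; the URL is hypothetical, and constructing the document performs a live HTTP request (or one stubbed with WebMock in tests):

```ruby
doc = Crawler::Document.new('http://domain.com/about')   # hypothetical URL

doc.url                    # => "http://domain.com/about"
doc.links                  # unique hrefs, minus bare "#" anchors
doc.domain_specific_paths  # same-host paths, normalized, e.g. ["/", "/team/"]
doc.static_assets          # img/script src, video poster, and link href values
```
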
@@ -0,0 +1,45 @@
+ require 'nokogiri'
+ require 'crawler/http'
+ require 'crawler/formatting'
+
+ module Crawler
+   module DocumentParser
+     include Formatting
+     include Http
+
+     private
+
+     # Parses the HTML from an http response
+     #
+     def parse_content(uri)
+       Nokogiri::HTML request(uri)
+     end
+
+     # Returns the links from the html document
+     #
+     def extract_links
+       content.css('a').map { |a| a['href'] unless a['href'] == '#' }.compact.uniq
+     end
+
+     # Returns the static assets from the html document
+     #
+     def extract_assets
+       assets = content.css('img', 'script').map { |i| i['src'] }
+       assets |= content.css('video').map { |v| v['poster'] }
+       assets |= content.css('link').map { |l| l['href'] }
+
+       assets.compact.uniq
+     end
+
+     # Returns the paths that are related to the given domain
+     #
+     def extract_domain_specific_paths
+       links.map do |link|
+         uri = Addressable::URI.parse(link.strip)
+         if uri.hostname.nil? || uri.hostname == @uri.hostname
+           normalize_path uri.path
+         end
+       end.compact
+     end
+   end
+ end
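
To make the selectors above concrete, here is a small standalone sketch of the same Nokogiri queries run against an inline HTML snippet (the snippet is invented for illustration):

```ruby
require 'nokogiri'

html = <<-HTML
  <a href="/team">Team</a> <a href="#">Top</a>
  <img src="/assets/logo.png">
  <script src="/assets/app.js"></script>
  <link href="/assets/app.css" rel="stylesheet">
HTML

content = Nokogiri::HTML(html)

# Mirrors extract_links: hrefs, skipping bare "#" anchors
content.css('a').map { |a| a['href'] unless a['href'] == '#' }.compact.uniq
# => ["/team"]

# Mirrors extract_assets: img/script src values plus link hrefs
assets = content.css('img', 'script').map { |i| i['src'] }
assets |= content.css('link').map { |l| l['href'] }
assets.compact.uniq
# => ["/assets/logo.png", "/assets/app.js", "/assets/app.css"]
```
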
@@ -0,0 +1,10 @@
+ module Crawler
+
+   # Base Crawler Error
+   #
+   class Error < StandardError; end
+
+   # Raised when an invalid protocol is used
+   #
+   class InvalidProtocolError < Error; end
+ end
@@ -0,0 +1,16 @@
+ module Crawler
+   module Formatting
+
+     # Normalizes a path: appends a trailing slash and collapses duplicate slashes
+     #
+     def normalize_path(path)
+       "#{path}/".gsub(/\/+/, '/')
+     end
+
+     # Constructs a full url from a given uri object
+     #
+     def construct_url(uri)
+       "#{uri.scheme}://#{uri.host}#{uri.path}"
+     end
+   end
+ end
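
A few concrete calls (inputs invented for illustration) showing what these helpers return:

```ruby
require 'addressable/uri'
require 'crawler/formatting'

include Crawler::Formatting

normalize_path('/about')         # => "/about/"
normalize_path('/about//team')   # => "/about/team/"
normalize_path('')               # => "/"

uri = Addressable::URI.parse('http://domain.com/about?utm=x')
construct_url(uri)               # => "http://domain.com/about" (query string dropped)
```
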
@@ -0,0 +1,17 @@
+ require 'net/http'
+ require 'open-uri'
+
+ module Crawler
+   module Http
+
+     # Open URI HTTP request; returns '' when the request fails
+     #
+     def request(uri)
+       begin
+         open(uri)
+       rescue StandardError
+         ''
+       end
+     end
+   end
+ end
@@ -0,0 +1,63 @@
+ require 'crawler/formatting'
+ require 'crawler/storage'
+
+ module Crawler
+   class Index
+     include Formatting
+     include Storage
+
+     attr_accessor :base_uri
+
+     # New Index to record paths for a given domain
+     #
+     def initialize(base_uri)
+       @base_uri = base_uri
+       clear_stored_results
+     end
+
+     # Ingests a Crawler::Document and stores all relevant data in redis
+     # Updates pages that need to be visited as well as pages that have already been visited
+     #
+     def consume_document(path, document)
+       path = normalize_path path
+       new_links = document.domain_specific_paths.map { |link_path| normalize_path link_path }
+
+       store_path path
+
+       store_path_visited path
+
+       store_path_assets path, document.static_assets
+
+       store_path_links_to path, new_links
+
+       store_paths_to_visit(new_links - get_paths_visited)
+
+       remove_path_from_queue path
+
+       update_paths_linked_to_from_path(document)
+     end
+
+     # Returns the data associated with an indexed domain
+     #
+     def results
+       get_domain_data
+     end
+
+     private
+
+     # Records incoming links for pages
+     # Uses the current path as the incoming link
+     # Records the current path as incoming on all links found in the current document
+     #
+     def update_paths_linked_to_from_path(document)
+       document.domain_specific_paths.each do |url|
+         link_uri_path = normalize_path Addressable::URI.parse(url.strip).path
+         document_uri_path = normalize_path document.uri.path
+         next if link_uri_path == document_uri_path
+
+         store_path link_uri_path
+         store_path_linked_to_from(link_uri_path, [document_uri_path])
+       end
+     end
+   end
+ end
@@ -0,0 +1,166 @@
+ require 'redis'
+
+ module Crawler
+   module Storage
+
+     # Stores a path for the domain
+     #
+     def store_path(path)
+       redis.sadd stored_paths_key, path
+     end
+
+     # Stores the static assets for a path
+     #
+     def store_path_assets(path, assets)
+       return if assets.empty?
+       redis.sadd path_assets_key(path), assets
+     end
+
+     # Stores paths that the current path links to
+     #
+     def store_path_links_to(path, links)
+       return if links.empty?
+       redis.sadd path_links_to_key(path), links
+     end
+
+     # Stores paths that link to the current path
+     #
+     def store_path_linked_to_from(path, links)
+       return if links.empty?
+       redis.sadd path_linked_to_from_key(path), links
+     end
+
+     # Stores paths that have been indexed for a domain
+     #
+     def store_path_visited(path)
+       redis.sadd paths_visited_key, path
+     end
+
+     # Stores paths that need to be visited for a domain
+     #
+     def store_paths_to_visit(paths)
+       return if paths.empty?
+       redis.sadd paths_to_visit_key, paths
+     end
+
+     # Removes a path from the paths that need to be visited
+     #
+     def remove_path_from_queue(path)
+       redis.srem paths_to_visit_key, path
+     end
+
+     # Returns known paths for the domain
+     #
+     def get_stored_paths
+       redis.smembers stored_paths_key
+     end
+
+     # Returns paths that haven't been indexed
+     #
+     def get_paths_to_visit
+       redis.smembers paths_to_visit_key
+     end
+
+     # Returns assets for a path
+     #
+     def get_path_assets(path)
+       redis.smembers path_assets_key(path)
+     end
+
+     # Returns paths that a path links to
+     #
+     def get_path_links_to(path)
+       redis.smembers path_links_to_key(path)
+     end
+
+     # Returns paths that link to a path
+     #
+     def get_path_linked_to_from(path)
+       redis.smembers path_linked_to_from_key(path)
+     end
+
+     # Returns paths that have been indexed for a domain
+     #
+     def get_paths_visited
+       redis.smembers paths_visited_key
+     end
+
+     # Returns paths and associated data for a domain
+     #
+     def get_domain_data
+       get_stored_paths.inject({ 'domain' => base_uri.hostname, 'paths' => {} }) do |hsh, path|
+         hsh['paths'][path] = get_path_data(path)
+         hsh
+       end
+     end
+
+     # Returns assets, outgoing links, and incoming links for a given path
+     #
+     def get_path_data(path)
+       {
+         'asset_dependencies' => get_path_assets(path),
+         'links_to' => get_path_links_to(path),
+         'linked_to_from' => get_path_linked_to_from(path)
+       }
+     end
+
+     # Deletes all data for a domain
+     #
+     def clear_stored_results
+       paths = get_stored_paths
+       redis.pipelined do
+         paths.each do |path|
+           [stored_paths_key, paths_visited_key, paths_to_visit_key].each do |key|
+             redis.del key
+           end
+
+           clear_path_results(path)
+         end
+       end
+     end
+
+     # Deletes all path data for a path
+     #
+     def clear_path_results(path)
+       [path_assets_key(path), path_links_to_key(path), path_linked_to_from_key(path)].each do |key|
+         redis.del key
+       end
+     end
+
+     # Redis
+     #
+     def redis
+       @redis ||= Redis.new
+     end
+
+     private
+
+     def path_assets_key(path)
+       "#{prefix}-path-assets-#{base_uri.hostname}-#{path}"
+     end
+
+     def path_links_to_key(path)
+       "#{prefix}-path-links-to-#{base_uri.hostname}-#{path}"
+     end
+
+     def path_linked_to_from_key(path)
+       "#{prefix}-path-linked-to-from-#{base_uri.hostname}-#{path}"
+     end
+
+     def stored_paths_key
+       "#{prefix}-paths-#{base_uri.hostname}"
+     end
+
+     def paths_visited_key
+       "#{prefix}-paths-visited-#{base_uri.hostname}"
+     end
+
+     def paths_to_visit_key
+       "#{prefix}-queued-paths-#{base_uri.hostname}"
+     end
+
+     def prefix
+       'crawler'
+     end
+   end
+ end
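
For reference, the key scheme above applied to a hypothetical crawl of `domain.com` produces keys like the following; everything is stored as Redis sets and can be inspected directly:

```ruby
# Hypothetical keys written to Redis for a crawl of domain.com:
#
#   crawler-paths-domain.com                        (set of all known paths)
#   crawler-paths-visited-domain.com                (set of indexed paths)
#   crawler-queued-paths-domain.com                 (set of paths still to visit)
#   crawler-path-assets-domain.com-/about/          (assets referenced by /about/)
#   crawler-path-links-to-domain.com-/about/        (paths /about/ links to)
#   crawler-path-linked-to-from-domain.com-/about/  (paths linking to /about/)
#
# Inspecting one of them from Ruby (assumes a local Redis with default settings):
require 'redis'
Redis.new.smembers 'crawler-paths-domain.com'
```
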