site_mapper 0.0.9 → 0.0.10
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/site_mapper +1 -1
- data/lib/site_mapper.rb +21 -1
- data/lib/site_mapper/crawler.rb +25 -25
- data/lib/site_mapper/logger.rb +57 -0
- data/lib/site_mapper/request.rb +5 -4
- data/lib/site_mapper/robots.rb +4 -2
- data/lib/site_mapper/version.rb +1 -1
- metadata +16 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 49d4c0ab215ac8872234e3c552275f89688f997d
|
4
|
+
data.tar.gz: 4b75bae288524a38fe3a8f7d6df9ab42e715211a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e0cad8aedfb27fc5a4d56b297098b76a9abd43df86b6c20b9d6f228256091191bb020e620217a9bb40bdc68c981bf8d2ca4fdf2930c55158bb321fa11dfc960b
|
7
|
+
data.tar.gz: c3b1c1863e3de70793d127772b7a314b1e788483fe9151e441c1c209b7fed59c8b210cdae5d453017b0d49eff179f7b51eee47544f68a15c6a62c94783b7b24e
|
data/bin/site_mapper
CHANGED
data/lib/site_mapper.rb
CHANGED
@@ -2,6 +2,7 @@ require 'uri'
|
|
2
2
|
require 'net/http'
|
3
3
|
|
4
4
|
require 'site_mapper/version'
|
5
|
+
require 'site_mapper/logger'
|
5
6
|
require 'site_mapper/request'
|
6
7
|
require 'site_mapper/robots'
|
7
8
|
require 'site_mapper/crawler'
|
@@ -17,9 +18,28 @@ module SiteMapper
|
|
17
18
|
# Map all links on a given site.
|
18
19
|
# @return [Array] with links.
|
19
20
|
# @param [String] link to domain
|
21
|
+
# @param [Hash] options hash
|
20
22
|
# @example Collect all URLs from example.com
|
21
23
|
# SiteMapper.map('example.com')
|
22
|
-
def self.map(link)
|
24
|
+
def self.map(link, options = {})
|
25
|
+
set_logger(options[:logger])
|
23
26
|
Crawler.collect_urls(link) { |url| yield(url) if block_given? }
|
24
27
|
end
|
28
|
+
|
29
|
+
# Set logger.
|
30
|
+
# @param [Object] logger
|
31
|
+
# @example set system logger
|
32
|
+
# SiteMapper.set_logger(:system)
|
33
|
+
# @example set nil logger
|
34
|
+
# SiteMapper.set_logger(:nil)
|
35
|
+
# @example set your own logger
|
36
|
+
# SiteMapper.set_logger(YourLogger)
|
37
|
+
def self.set_logger(logger)
|
38
|
+
return if logger.nil?
|
39
|
+
if logger.is_a?(Symbol)
|
40
|
+
Logger.use_logger_type(logger)
|
41
|
+
else
|
42
|
+
Logger.use_logger(logger)
|
43
|
+
end
|
44
|
+
end
|
25
45
|
end
|
data/lib/site_mapper/crawler.rb
CHANGED
@@ -5,10 +5,10 @@ module SiteMapper
|
|
5
5
|
# Crawls a given site.
|
6
6
|
class Crawler
|
7
7
|
# @param [String] url base url for crawler
|
8
|
-
# @param [Hash] resolve (optional false by default)
|
9
|
-
def initialize(url,
|
10
|
-
@base_url
|
11
|
-
@options = { resolve:
|
8
|
+
# @param [Hash] options hash, resolve key (optional false by default)
|
9
|
+
def initialize(url, options = {})
|
10
|
+
@base_url = Request.resolve_url(url)
|
11
|
+
@options = { resolve: false }.merge(options)
|
12
12
|
@crawl_url = CrawlUrl.new(@base_url)
|
13
13
|
@fetch_queue = CrawlQueue.new
|
14
14
|
@processed = Set.new
|
@@ -37,17 +37,17 @@ module SiteMapper
|
|
37
37
|
yield(url)
|
38
38
|
page_links(url)
|
39
39
|
end
|
40
|
-
|
40
|
+
Logger.log "Crawling finished, #{@processed.length} links found"
|
41
41
|
@processed.to_a
|
42
42
|
rescue Interrupt, IRB::Abort
|
43
|
-
|
43
|
+
Logger.err_log 'Crawl interrupted.'
|
44
44
|
@fetch_queue.to_a
|
45
45
|
end
|
46
46
|
|
47
47
|
private
|
48
48
|
|
49
49
|
def page_links(get_url)
|
50
|
-
|
50
|
+
Logger.log "Queue length: #{@fetch_queue.length}, Parsing: #{get_url}"
|
51
51
|
link_elements = Request.get_page(get_url).css('a') rescue []
|
52
52
|
@processed << get_url
|
53
53
|
link_elements.each do |page_link|
|
@@ -70,25 +70,25 @@ module SiteMapper
|
|
70
70
|
def resolve(url)
|
71
71
|
@options[:resolve] ? Request.resolve_url(url) : url
|
72
72
|
end
|
73
|
-
end
|
74
73
|
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
74
|
+
# Queue of urls to be crawled.
|
75
|
+
class CrawlQueue
|
76
|
+
# @return [Set] that extends EnumerablePop module
|
77
|
+
def self.new
|
78
|
+
Set.new.extend(EnumerablePop)
|
79
|
+
end
|
80
|
+
|
81
|
+
# Add pop method when added to class.
|
82
|
+
# The class that extends this module needs to implement #first and #delete.
|
83
|
+
module EnumerablePop
|
84
|
+
# Pop first element from list.
|
85
|
+
# @return [Object] the first object in the list or nil
|
86
|
+
def pop
|
87
|
+
first_element = first
|
88
|
+
delete(first_element)
|
89
|
+
first_element
|
90
|
+
end
|
91
91
|
end
|
92
92
|
end
|
93
|
-
end
|
93
|
+
end
|
94
94
|
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
module SiteMapper
|
2
|
+
# Handles logging
|
3
|
+
class Logger
|
4
|
+
# @param [Symbol] type of logger class to be used
|
5
|
+
def self.use_logger_type(type)
|
6
|
+
fail 'Logger already set' if defined?(@@log)
|
7
|
+
@@log = case type.to_s
|
8
|
+
when 'nil', 'default'
|
9
|
+
NilLogger
|
10
|
+
when 'system'
|
11
|
+
SystemOutLogger
|
12
|
+
else
|
13
|
+
fail ArgumentError, "Unknown logger type: '#{type}'"
|
14
|
+
end
|
15
|
+
@@log
|
16
|
+
end
|
17
|
+
|
18
|
+
# @param [Class, #log, #err_log] logger a logger class
|
19
|
+
def self.use_logger(logger)
|
20
|
+
fail 'Logger already set' if defined?(@@log)
|
21
|
+
@@log = logger
|
22
|
+
end
|
23
|
+
|
24
|
+
# @param [String] msg to be logged
|
25
|
+
def self.log(msg)
|
26
|
+
@@log ||= use_logger_type(:default)
|
27
|
+
@@log.log(msg)
|
28
|
+
end
|
29
|
+
|
30
|
+
# @param [String] err_msg to be logged
|
31
|
+
def self.err_log(err_msg)
|
32
|
+
@@log ||= use_logger_type(:default)
|
33
|
+
@@log.err_log(err_msg)
|
34
|
+
end
|
35
|
+
|
36
|
+
# Log to terminal.
|
37
|
+
module SystemOutLogger
|
38
|
+
# @param [String] msg to be logged to STDOUT
|
39
|
+
def self.log(msg)
|
40
|
+
STDOUT.puts(msg)
|
41
|
+
end
|
42
|
+
|
43
|
+
# @param [String] msg to be logged to STDERR
|
44
|
+
def self.err_log(msg)
|
45
|
+
STDERR.puts("[ERROR] #{msg}")
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
49
|
+
# Don't log
|
50
|
+
module NilLogger
|
51
|
+
# @param [String] msg to be ignored
|
52
|
+
def self.log(msg);end
|
53
|
+
# @param [String] msg to be ignored
|
54
|
+
def self.err_log(msg);end
|
55
|
+
end
|
56
|
+
end
|
57
|
+
end
|
data/lib/site_mapper/request.rb
CHANGED
@@ -37,7 +37,7 @@ module SiteMapper
|
|
37
37
|
end
|
38
38
|
|
39
39
|
# Get response body, rescues with nil if an exception is raised.
|
40
|
-
# @see #get_response
|
40
|
+
# @see Request#get_response
|
41
41
|
def get_response_body(*args)
|
42
42
|
get_response(*args).body rescue nil
|
43
43
|
end
|
@@ -45,14 +45,15 @@ module SiteMapper
|
|
45
45
|
# Resolves a URL string and follows redirects.
|
46
46
|
# if the URL can't be resolved the original URL is returned.
|
47
47
|
# @param [String] url
|
48
|
-
# @param [
|
48
|
+
# @param [Hash] options hash, with_query key (optional and true by default)
|
49
49
|
# @return [String] a URL string that potentially is a redirected URL
|
50
50
|
# @example Resolve google.com
|
51
51
|
# resolve_url('google.com')
|
52
52
|
# # => 'https://www.google.com'
|
53
|
-
def resolve_url(url,
|
53
|
+
def resolve_url(url, options = {})
|
54
|
+
options = { with_query: true }.merge(options)
|
54
55
|
resolved = UrlResolver.resolve(url)
|
55
|
-
resolved = remove_query(resolved) unless with_query
|
56
|
+
resolved = remove_query(resolved) unless options[:with_query]
|
56
57
|
resolved
|
57
58
|
end
|
58
59
|
|
data/lib/site_mapper/robots.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
|
-
# Based on: https://rubygems.org/gems/robots, v0.10.1
|
2
1
|
module SiteMapper
|
2
|
+
# Based on: https://rubygems.org/gems/robots, v0.10.1
|
3
3
|
# Provided a base URL it checks whether a given URL is
|
4
4
|
# allowed to be crawled according to /robots.txt.
|
5
5
|
# @see https://rubygems.org/gems/robots
|
@@ -114,7 +114,9 @@ module SiteMapper
|
|
114
114
|
end
|
115
115
|
end
|
116
116
|
|
117
|
-
# @param [String]
|
117
|
+
# @param [String] robots_txt contents of /robots.txt
|
118
|
+
# @param [String] hostname for the passed robots_txt
|
119
|
+
# @param [String] user_agent to check
|
118
120
|
def initialize(robots_txt, hostname, user_agent)
|
119
121
|
@robots_txt = robots_txt
|
120
122
|
@hostname = hostname
|
data/lib/site_mapper/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: site_mapper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.10
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jacob Burenstam
|
@@ -94,6 +94,20 @@ dependencies:
|
|
94
94
|
- - ~>
|
95
95
|
- !ruby/object:Gem::Version
|
96
96
|
version: '0.8'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: redcarpet
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ~>
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '3.2'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ~>
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '3.2'
|
97
111
|
- !ruby/object:Gem::Dependency
|
98
112
|
name: coveralls
|
99
113
|
requirement: !ruby/object:Gem::Requirement
|
@@ -119,6 +133,7 @@ files:
|
|
119
133
|
- bin/site_mapper
|
120
134
|
- lib/site_mapper/crawl_url.rb
|
121
135
|
- lib/site_mapper/crawler.rb
|
136
|
+
- lib/site_mapper/logger.rb
|
122
137
|
- lib/site_mapper/request.rb
|
123
138
|
- lib/site_mapper/robots.rb
|
124
139
|
- lib/site_mapper/version.rb
|