site_mapper 0.0.9 → 0.0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/site_mapper +1 -1
- data/lib/site_mapper.rb +21 -1
- data/lib/site_mapper/crawler.rb +25 -25
- data/lib/site_mapper/logger.rb +57 -0
- data/lib/site_mapper/request.rb +5 -4
- data/lib/site_mapper/robots.rb +4 -2
- data/lib/site_mapper/version.rb +1 -1
- metadata +16 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 49d4c0ab215ac8872234e3c552275f89688f997d
|
4
|
+
data.tar.gz: 4b75bae288524a38fe3a8f7d6df9ab42e715211a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e0cad8aedfb27fc5a4d56b297098b76a9abd43df86b6c20b9d6f228256091191bb020e620217a9bb40bdc68c981bf8d2ca4fdf2930c55158bb321fa11dfc960b
|
7
|
+
data.tar.gz: c3b1c1863e3de70793d127772b7a314b1e788483fe9151e441c1c209b7fed59c8b210cdae5d453017b0d49eff179f7b51eee47544f68a15c6a62c94783b7b24e
|
data/bin/site_mapper
CHANGED
data/lib/site_mapper.rb
CHANGED
@@ -2,6 +2,7 @@ require 'uri'
|
|
2
2
|
require 'net/http'
|
3
3
|
|
4
4
|
require 'site_mapper/version'
|
5
|
+
require 'site_mapper/logger'
|
5
6
|
require 'site_mapper/request'
|
6
7
|
require 'site_mapper/robots'
|
7
8
|
require 'site_mapper/crawler'
|
@@ -17,9 +18,28 @@ module SiteMapper
|
|
17
18
|
# Map all links on a given site.
|
18
19
|
# @return [Array] with links.
|
19
20
|
# @param [String] link to domain
|
21
|
+
# @param [Hash] options hash
|
20
22
|
# @example Collect all URLs from example.com
|
21
23
|
# SiteMapper.map('example.com')
|
22
|
-
def self.map(link)
|
24
|
+
def self.map(link, options = {})
|
25
|
+
set_logger(options[:logger])
|
23
26
|
Crawler.collect_urls(link) { |url| yield(url) if block_given? }
|
24
27
|
end
|
28
|
+
|
29
|
+
# Set logger.
|
30
|
+
# @param [Object] logger
|
31
|
+
# @example set system logger
|
32
|
+
# SiteMapper.set_logger(:system)
|
33
|
+
# @example set nil logger
|
34
|
+
# SiteMapper.set_logger(:nil)
|
35
|
+
# @example set your own logger
|
36
|
+
# SiteMapper.set_logger(YourLogger)
|
37
|
+
def self.set_logger(logger)
|
38
|
+
return if logger.nil?
|
39
|
+
if logger.is_a?(Symbol)
|
40
|
+
Logger.use_logger_type(logger)
|
41
|
+
else
|
42
|
+
Logger.use_logger(logger)
|
43
|
+
end
|
44
|
+
end
|
25
45
|
end
|
data/lib/site_mapper/crawler.rb
CHANGED
@@ -5,10 +5,10 @@ module SiteMapper
|
|
5
5
|
# Crawls a given site.
|
6
6
|
class Crawler
|
7
7
|
# @param [String] url base url for crawler
|
8
|
-
# @param [Hash] resolve (optional false by default)
|
9
|
-
def initialize(url,
|
10
|
-
@base_url
|
11
|
-
@options = { resolve:
|
8
|
+
# @param [Hash] options hash, resolve key (optional false by default)
|
9
|
+
def initialize(url, options = {})
|
10
|
+
@base_url = Request.resolve_url(url)
|
11
|
+
@options = { resolve: false }.merge(options)
|
12
12
|
@crawl_url = CrawlUrl.new(@base_url)
|
13
13
|
@fetch_queue = CrawlQueue.new
|
14
14
|
@processed = Set.new
|
@@ -37,17 +37,17 @@ module SiteMapper
|
|
37
37
|
yield(url)
|
38
38
|
page_links(url)
|
39
39
|
end
|
40
|
-
|
40
|
+
Logger.log "Crawling finished, #{@processed.length} links found"
|
41
41
|
@processed.to_a
|
42
42
|
rescue Interrupt, IRB::Abort
|
43
|
-
|
43
|
+
Logger.err_log 'Crawl interrupted.'
|
44
44
|
@fetch_queue.to_a
|
45
45
|
end
|
46
46
|
|
47
47
|
private
|
48
48
|
|
49
49
|
def page_links(get_url)
|
50
|
-
|
50
|
+
Logger.log "Queue length: #{@fetch_queue.length}, Parsing: #{get_url}"
|
51
51
|
link_elements = Request.get_page(get_url).css('a') rescue []
|
52
52
|
@processed << get_url
|
53
53
|
link_elements.each do |page_link|
|
@@ -70,25 +70,25 @@ module SiteMapper
|
|
70
70
|
def resolve(url)
|
71
71
|
@options[:resolve] ? Request.resolve_url(url) : url
|
72
72
|
end
|
73
|
-
end
|
74
73
|
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
74
|
+
# Queue of urls to be crawled.
|
75
|
+
class CrawlQueue
|
76
|
+
# @return [Set] that extends EnumerablePop module
|
77
|
+
def self.new
|
78
|
+
Set.new.extend(EnumerablePop)
|
79
|
+
end
|
80
|
+
|
81
|
+
# Add pop method when added to class.
|
82
|
+
# The class that extends this module needs to implement #first and #delete.
|
83
|
+
module EnumerablePop
|
84
|
+
# Pop first element from list.
|
85
|
+
# @return [Object] the first object in the list or nil
|
86
|
+
def pop
|
87
|
+
first_element = first
|
88
|
+
delete(first_element)
|
89
|
+
first_element
|
90
|
+
end
|
91
91
|
end
|
92
92
|
end
|
93
|
-
end
|
93
|
+
end
|
94
94
|
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
module SiteMapper
|
2
|
+
# Handles logging
|
3
|
+
class Logger
|
4
|
+
# @param [Symbol] type of logger class to be used
|
5
|
+
def self.use_logger_type(type)
|
6
|
+
fail 'Logger already set' if defined?(@@log)
|
7
|
+
@@log = case type.to_s
|
8
|
+
when 'nil', 'default'
|
9
|
+
NilLogger
|
10
|
+
when 'system'
|
11
|
+
SystemOutLogger
|
12
|
+
else
|
13
|
+
fail ArgumentError, "Unknown logger type: '#{type}'"
|
14
|
+
end
|
15
|
+
@@log
|
16
|
+
end
|
17
|
+
|
18
|
+
# @param [Class, #log, #err_log] logger a logger class
|
19
|
+
def self.use_logger(logger)
|
20
|
+
fail 'Logger already set' if defined?(@@log)
|
21
|
+
@@log = logger
|
22
|
+
end
|
23
|
+
|
24
|
+
# @param [String] msg to be logged
|
25
|
+
def self.log(msg)
|
26
|
+
@@log ||= use_logger_type(:default)
|
27
|
+
@@log.log(msg)
|
28
|
+
end
|
29
|
+
|
30
|
+
# @param [String] err_msg to be logged
|
31
|
+
def self.err_log(err_msg)
|
32
|
+
@@log ||= use_logger_type(:default)
|
33
|
+
@@log.err_log(err_msg)
|
34
|
+
end
|
35
|
+
|
36
|
+
# Log to terminal.
|
37
|
+
module SystemOutLogger
|
38
|
+
# @param [String] msg to be logged to STDOUT
|
39
|
+
def self.log(msg)
|
40
|
+
STDOUT.puts(msg)
|
41
|
+
end
|
42
|
+
|
43
|
+
# @param [String] msg to be logged to STDERR
|
44
|
+
def self.err_log(msg)
|
45
|
+
STDERR.puts("[ERROR] #{msg}")
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
49
|
+
# Don't log
|
50
|
+
module NilLogger
|
51
|
+
# @param [String] msg to be ignored
|
52
|
+
def self.log(msg);end
|
53
|
+
# @param [String] msg to be ignored
|
54
|
+
def self.err_log(msg);end
|
55
|
+
end
|
56
|
+
end
|
57
|
+
end
|
data/lib/site_mapper/request.rb
CHANGED
@@ -37,7 +37,7 @@ module SiteMapper
|
|
37
37
|
end
|
38
38
|
|
39
39
|
# Get response body, rescues with nil if an exception is raised.
|
40
|
-
# @see #get_response
|
40
|
+
# @see Request#get_response
|
41
41
|
def get_response_body(*args)
|
42
42
|
get_response(*args).body rescue nil
|
43
43
|
end
|
@@ -45,14 +45,15 @@ module SiteMapper
|
|
45
45
|
# Resolve an URL string and follows redirects.
|
46
46
|
# if the URL can't be resolved the original URL is returned.
|
47
47
|
# @param [String] url
|
48
|
-
# @param [
|
48
|
+
# @param [Hash] options hash, with_query key (optional and true by default)
|
49
49
|
# @return [String] a URL string that potentially is a redirected URL
|
50
50
|
# @example Resolve google.com
|
51
51
|
# resolve_url('google.com')
|
52
52
|
# # => 'https://www.google.com'
|
53
|
-
def resolve_url(url,
|
53
|
+
def resolve_url(url, options = {})
|
54
|
+
options = { with_query: true }.merge(options)
|
54
55
|
resolved = UrlResolver.resolve(url)
|
55
|
-
resolved = remove_query(resolved) unless with_query
|
56
|
+
resolved = remove_query(resolved) unless options[:with_query]
|
56
57
|
resolved
|
57
58
|
end
|
58
59
|
|
data/lib/site_mapper/robots.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
|
-
# Based on: https://rubygems.org/gems/robots, v0.10.1
|
2
1
|
module SiteMapper
|
2
|
+
# Based on: https://rubygems.org/gems/robots, v0.10.1
|
3
3
|
# Provided a base URL it checks whether a given URL is
|
4
4
|
# allowed to be crawled according to /robots.txt.
|
5
5
|
# @see https://rubygems.org/gems/robots
|
@@ -114,7 +114,9 @@ module SiteMapper
|
|
114
114
|
end
|
115
115
|
end
|
116
116
|
|
117
|
-
# @param [String]
|
117
|
+
# @param [String] robots_txt contents of /robots.txt
|
118
|
+
# @param [String] hostname for the passed robots_txt
|
119
|
+
# @param [String] user_agent to check
|
118
120
|
def initialize(robots_txt, hostname, user_agent)
|
119
121
|
@robots_txt = robots_txt
|
120
122
|
@hostname = hostname
|
data/lib/site_mapper/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: site_mapper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.10
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jacob Burenstam
|
@@ -94,6 +94,20 @@ dependencies:
|
|
94
94
|
- - ~>
|
95
95
|
- !ruby/object:Gem::Version
|
96
96
|
version: '0.8'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: redcarpet
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ~>
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '3.2'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ~>
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '3.2'
|
97
111
|
- !ruby/object:Gem::Dependency
|
98
112
|
name: coveralls
|
99
113
|
requirement: !ruby/object:Gem::Requirement
|
@@ -119,6 +133,7 @@ files:
|
|
119
133
|
- bin/site_mapper
|
120
134
|
- lib/site_mapper/crawl_url.rb
|
121
135
|
- lib/site_mapper/crawler.rb
|
136
|
+
- lib/site_mapper/logger.rb
|
122
137
|
- lib/site_mapper/request.rb
|
123
138
|
- lib/site_mapper/robots.rb
|
124
139
|
- lib/site_mapper/version.rb
|