site_mapper 0.0.9 → 0.0.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d2f7c3a2410e0cb508297f6fb229c1e5ec5e8f98
4
- data.tar.gz: 6495918729c0e0d4c8788c4eb126c829b72dc17b
3
+ metadata.gz: 49d4c0ab215ac8872234e3c552275f89688f997d
4
+ data.tar.gz: 4b75bae288524a38fe3a8f7d6df9ab42e715211a
5
5
  SHA512:
6
- metadata.gz: 80fceb2fd976b3c0e6ffe2f7237ffe9645257a271b99ca7f919e147a132fd09275862ba8c0563924e533e6b5049cd192726a06c48a8c4a4efa1b61b0af2c1e55
7
- data.tar.gz: 9ff4ef13486b564bc9f8b3d60c42c248c26ed4245584af3e05a0a907b41ac7bf6bfc44ab5ece3cee91216c240201990c65e0adfa20960b28a0b37f8f6eb6ad4e
6
+ metadata.gz: e0cad8aedfb27fc5a4d56b297098b76a9abd43df86b6c20b9d6f228256091191bb020e620217a9bb40bdc68c981bf8d2ca4fdf2930c55158bb321fa11dfc960b
7
+ data.tar.gz: c3b1c1863e3de70793d127772b7a314b1e788483fe9151e441c1c209b7fed59c8b210cdae5d453017b0d49eff179f7b51eee47544f68a15c6a62c94783b7b24e
@@ -3,4 +3,4 @@
3
3
  require 'site_mapper'
4
4
 
5
5
  url = ARGV[0]
6
- SiteMapper.map(url)
6
+ SiteMapper.map(url, logger: :system)
@@ -2,6 +2,7 @@ require 'uri'
2
2
  require 'net/http'
3
3
 
4
4
  require 'site_mapper/version'
5
+ require 'site_mapper/logger'
5
6
  require 'site_mapper/request'
6
7
  require 'site_mapper/robots'
7
8
  require 'site_mapper/crawler'
@@ -17,9 +18,28 @@ module SiteMapper
17
18
  # Map all links on a given site.
18
19
  # @return [Array] with links.
19
20
  # @param [String] link to domain
21
+ # @param [Hash] options hash
20
22
  # @example Collect all URLs from example.com
21
23
  # SiteMapper.map('example.com')
22
- def self.map(link)
24
+ def self.map(link, options = {})
25
+ set_logger(options[:logger])
23
26
  Crawler.collect_urls(link) { |url| yield(url) if block_given? }
24
27
  end
28
+
29
+ # Set logger.
30
+ # @param [Object] logger
31
+ # @example set system logger
32
+ # SiteMapper.set_logger(:system)
33
+ # @example set nil logger
34
+ # SiteMapper.set_logger(:nil)
35
+ # @example set your own logger
36
+ # SiteMapper.set_logger(YourLogger)
37
+ def self.set_logger(logger)
38
+ return if logger.nil?
39
+ if logger.is_a?(Symbol)
40
+ Logger.use_logger_type(logger)
41
+ else
42
+ Logger.use_logger(logger)
43
+ end
44
+ end
25
45
  end
@@ -5,10 +5,10 @@ module SiteMapper
5
5
  # Crawls a given site.
6
6
  class Crawler
7
7
  # @param [String] url base url for crawler
8
- # @param [Hash] resolve (optional false by default)
9
- def initialize(url, resolve: false)
10
- @base_url = Request.resolve_url(url)
11
- @options = { resolve: resolve }
8
+ # @param [Hash] options hash, resolve key (optional false by default)
9
+ def initialize(url, options = {})
10
+ @base_url = Request.resolve_url(url)
11
+ @options = { resolve: false }.merge(options)
12
12
  @crawl_url = CrawlUrl.new(@base_url)
13
13
  @fetch_queue = CrawlQueue.new
14
14
  @processed = Set.new
@@ -37,17 +37,17 @@ module SiteMapper
37
37
  yield(url)
38
38
  page_links(url)
39
39
  end
40
- puts "Crawling finished, #{@processed.length} links found"
40
+ Logger.log "Crawling finished, #{@processed.length} links found"
41
41
  @processed.to_a
42
42
  rescue Interrupt, IRB::Abort
43
- puts 'Crawl interrupted.'
43
+ Logger.err_log 'Crawl interrupted.'
44
44
  @fetch_queue.to_a
45
45
  end
46
46
 
47
47
  private
48
48
 
49
49
  def page_links(get_url)
50
- puts "Queue length: #{@fetch_queue.length}, Parsing: #{get_url}"
50
+ Logger.log "Queue length: #{@fetch_queue.length}, Parsing: #{get_url}"
51
51
  link_elements = Request.get_page(get_url).css('a') rescue []
52
52
  @processed << get_url
53
53
  link_elements.each do |page_link|
@@ -70,25 +70,25 @@ module SiteMapper
70
70
  def resolve(url)
71
71
  @options[:resolve] ? Request.resolve_url(url) : url
72
72
  end
73
- end
74
73
 
75
- # Queue of urls to be crawled.
76
- class CrawlQueue
77
- # @return [Set] that extends EnumerablePop module
78
- def self.new
79
- Set.new.extend(EnumerablePop)
80
- end
81
-
82
- # Add pop method when added to class.
83
- # The class that extends this module need to implement #first and #delete.
84
- module EnumerablePop
85
- # Pop first element from list.
86
- # @return [Object] the first object in the list or nil
87
- def pop
88
- first_element = first
89
- delete(first_element)
90
- first_element
74
+ # Queue of urls to be crawled.
75
+ class CrawlQueue
76
+ # @return [Set] that extends EnumerablePop module
77
+ def self.new
78
+ Set.new.extend(EnumerablePop)
79
+ end
80
+
81
+ # Add pop method when added to class.
82
+ # The class that extends this module need to implement #first and #delete.
83
+ module EnumerablePop
84
+ # Pop first element from list.
85
+ # @return [Object] the first object in the list or nil
86
+ def pop
87
+ first_element = first
88
+ delete(first_element)
89
+ first_element
90
+ end
91
91
  end
92
92
  end
93
- end
93
+ end
94
94
  end
@@ -0,0 +1,57 @@
1
+ module SiteMapper
2
+ # Handles logging
3
+ class Logger
4
+ # @param [Symbol] type of logger class to be used
5
+ def self.use_logger_type(type)
6
+ fail 'Logger already set' if defined?(@@log)
7
+ @@log = case type.to_s
8
+ when 'nil', 'default'
9
+ NilLogger
10
+ when 'system'
11
+ SystemOutLogger
12
+ else
13
+ fail ArgumentError, "Unknown logger type: '#{type}'"
14
+ end
15
+ @@log
16
+ end
17
+
18
+ # @param [Class, #log, #err_log] logger a logger class
19
+ def self.use_logger(logger)
20
+ fail 'Logger already set' if defined?(@@log)
21
+ @@log = logger
22
+ end
23
+
24
+ # @param [String] msg to be logged
25
+ def self.log(msg)
26
+ @@log ||= use_logger_type(:default)
27
+ @@log.log(msg)
28
+ end
29
+
30
+ # @param [String] err_msg to be logged
31
+ def self.err_log(err_msg)
32
+ @@log ||= use_logger_type(:default)
33
+ @@log.err_log(err_msg)
34
+ end
35
+
36
+ # Log to terminal.
37
+ module SystemOutLogger
38
+ # @param [String] msg to be logged to STDOUT
39
+ def self.log(msg)
40
+ STDOUT.puts(msg)
41
+ end
42
+
43
+ # @param [String] msg to be logged to STDERR
44
+ def self.err_log(msg)
45
+ STDERR.puts("[ERROR] #{msg}")
46
+ end
47
+ end
48
+
49
+ # Don't log
50
+ module NilLogger
51
+ # @param [String] msg to be ignored
52
+ def self.log(msg);end
53
+ # @param [String] msg to be ignored
54
+ def self.err_log(msg);end
55
+ end
56
+ end
57
+ end
@@ -37,7 +37,7 @@ module SiteMapper
37
37
  end
38
38
 
39
39
  # Get response body, rescues with nil if an exception is raised.
40
- # @see #get_response
40
+ # @see Request#get_response
41
41
  def get_response_body(*args)
42
42
  get_response(*args).body rescue nil
43
43
  end
@@ -45,14 +45,15 @@ module SiteMapper
45
45
  # Resolve an URL string and follows redirects.
46
46
  # if the URL can't be resolved the original URL is returned.
47
47
  # @param [String] url
48
- # @param [Boolean] with_query (optional and true by default)
48
+ # @param [Hash] options hash, with_query key (optional and true by default)
49
49
  # @return [String] a URL string that potentially is a redirected URL
50
50
  # @example Resolve google.com
51
51
  # resolve_url('google.com')
52
52
  # # => 'https://www.google.com'
53
- def resolve_url(url, with_query: true)
53
+ def resolve_url(url, options = {})
54
+ options = { with_query: true }.merge(options)
54
55
  resolved = UrlResolver.resolve(url)
55
- resolved = remove_query(resolved) unless with_query
56
+ resolved = remove_query(resolved) unless options[:with_query]
56
57
  resolved
57
58
  end
58
59
 
@@ -1,5 +1,5 @@
1
- # Based on: https://rubygems.org/gems/robots, v0.10.1
2
1
  module SiteMapper
2
+ # Based on: https://rubygems.org/gems/robots, v0.10.1
3
3
  # Provided a base URL, it checks whether a given URL is
4
4
  # allowed to be crawled according to /robots.txt.
5
5
  # @see https://rubygems.org/gems/robots
@@ -114,7 +114,9 @@ module SiteMapper
114
114
  end
115
115
  end
116
116
 
117
- # @param [String] url to fetch /robots.txt from
117
+ # @param [String] robots_txt contents of /robots.txt
118
+ # @param [String] hostname for the passed robots_txt
119
+ # @param [String] user_agent to check
118
120
  def initialize(robots_txt, hostname, user_agent)
119
121
  @robots_txt = robots_txt
120
122
  @hostname = hostname
@@ -1,4 +1,4 @@
1
1
  module SiteMapper
2
2
  # Gem version
3
- VERSION = '0.0.9'
3
+ VERSION = '0.0.10'
4
4
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: site_mapper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.9
4
+ version: 0.0.10
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jacob Burenstam
@@ -94,6 +94,20 @@ dependencies:
94
94
  - - ~>
95
95
  - !ruby/object:Gem::Version
96
96
  version: '0.8'
97
+ - !ruby/object:Gem::Dependency
98
+ name: redcarpet
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ~>
102
+ - !ruby/object:Gem::Version
103
+ version: '3.2'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ~>
109
+ - !ruby/object:Gem::Version
110
+ version: '3.2'
97
111
  - !ruby/object:Gem::Dependency
98
112
  name: coveralls
99
113
  requirement: !ruby/object:Gem::Requirement
@@ -119,6 +133,7 @@ files:
119
133
  - bin/site_mapper
120
134
  - lib/site_mapper/crawl_url.rb
121
135
  - lib/site_mapper/crawler.rb
136
+ - lib/site_mapper/logger.rb
122
137
  - lib/site_mapper/request.rb
123
138
  - lib/site_mapper/robots.rb
124
139
  - lib/site_mapper/version.rb