site_mapper 0.0.9 → 0.0.10

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d2f7c3a2410e0cb508297f6fb229c1e5ec5e8f98
4
- data.tar.gz: 6495918729c0e0d4c8788c4eb126c829b72dc17b
3
+ metadata.gz: 49d4c0ab215ac8872234e3c552275f89688f997d
4
+ data.tar.gz: 4b75bae288524a38fe3a8f7d6df9ab42e715211a
5
5
  SHA512:
6
- metadata.gz: 80fceb2fd976b3c0e6ffe2f7237ffe9645257a271b99ca7f919e147a132fd09275862ba8c0563924e533e6b5049cd192726a06c48a8c4a4efa1b61b0af2c1e55
7
- data.tar.gz: 9ff4ef13486b564bc9f8b3d60c42c248c26ed4245584af3e05a0a907b41ac7bf6bfc44ab5ece3cee91216c240201990c65e0adfa20960b28a0b37f8f6eb6ad4e
6
+ metadata.gz: e0cad8aedfb27fc5a4d56b297098b76a9abd43df86b6c20b9d6f228256091191bb020e620217a9bb40bdc68c981bf8d2ca4fdf2930c55158bb321fa11dfc960b
7
+ data.tar.gz: c3b1c1863e3de70793d127772b7a314b1e788483fe9151e441c1c209b7fed59c8b210cdae5d453017b0d49eff179f7b51eee47544f68a15c6a62c94783b7b24e
@@ -3,4 +3,4 @@
3
3
  require 'site_mapper'
4
4
 
5
5
  url = ARGV[0]
6
- SiteMapper.map(url)
6
+ SiteMapper.map(url, logger: :system)
@@ -2,6 +2,7 @@ require 'uri'
2
2
  require 'net/http'
3
3
 
4
4
  require 'site_mapper/version'
5
+ require 'site_mapper/logger'
5
6
  require 'site_mapper/request'
6
7
  require 'site_mapper/robots'
7
8
  require 'site_mapper/crawler'
@@ -17,9 +18,28 @@ module SiteMapper
17
18
  # Map all links on a given site.
18
19
  # @return [Array] with links.
19
20
  # @param [String] link to domain
21
+ # @param [Hash] options hash
20
22
  # @example Collect all URLs from example.com
21
23
  # SiteMapper.map('example.com')
22
- def self.map(link)
24
+ def self.map(link, options = {})
25
+ set_logger(options[:logger])
23
26
  Crawler.collect_urls(link) { |url| yield(url) if block_given? }
24
27
  end
28
+
29
+ # Set logger.
30
+ # @param [Object] logger
31
+ # @example set system logger
32
+ # SiteMapper.set_logger(:system)
33
+ # @example set nil logger
34
+ # SiteMapper.set_logger(:nil)
35
+ # @example set your own logger
36
+ # SiteMapper.set_logger(YourLogger)
37
+ def self.set_logger(logger)
38
+ return if logger.nil?
39
+ if logger.is_a?(Symbol)
40
+ Logger.use_logger_type(logger)
41
+ else
42
+ Logger.use_logger(logger)
43
+ end
44
+ end
25
45
  end
@@ -5,10 +5,10 @@ module SiteMapper
5
5
  # Crawls a given site.
6
6
  class Crawler
7
7
  # @param [String] url base url for crawler
8
- # @param [Hash] resolve (optional false by default)
9
- def initialize(url, resolve: false)
10
- @base_url = Request.resolve_url(url)
11
- @options = { resolve: resolve }
8
+ # @param [Hash] options hash, resolve key (optional false by default)
9
+ def initialize(url, options = {})
10
+ @base_url = Request.resolve_url(url)
11
+ @options = { resolve: false }.merge(options)
12
12
  @crawl_url = CrawlUrl.new(@base_url)
13
13
  @fetch_queue = CrawlQueue.new
14
14
  @processed = Set.new
@@ -37,17 +37,17 @@ module SiteMapper
37
37
  yield(url)
38
38
  page_links(url)
39
39
  end
40
- puts "Crawling finished, #{@processed.length} links found"
40
+ Logger.log "Crawling finished, #{@processed.length} links found"
41
41
  @processed.to_a
42
42
  rescue Interrupt, IRB::Abort
43
- puts 'Crawl interrupted.'
43
+ Logger.err_log 'Crawl interrupted.'
44
44
  @fetch_queue.to_a
45
45
  end
46
46
 
47
47
  private
48
48
 
49
49
  def page_links(get_url)
50
- puts "Queue length: #{@fetch_queue.length}, Parsing: #{get_url}"
50
+ Logger.log "Queue length: #{@fetch_queue.length}, Parsing: #{get_url}"
51
51
  link_elements = Request.get_page(get_url).css('a') rescue []
52
52
  @processed << get_url
53
53
  link_elements.each do |page_link|
@@ -70,25 +70,25 @@ module SiteMapper
70
70
  def resolve(url)
71
71
  @options[:resolve] ? Request.resolve_url(url) : url
72
72
  end
73
- end
74
73
 
75
- # Queue of urls to be crawled.
76
- class CrawlQueue
77
- # @return [Set] that exends EnumerablePop module
78
- def self.new
79
- Set.new.extend(EnumerablePop)
80
- end
81
-
82
- # Add pop method when added to class.
83
- # The class that extends this module need to implement #first and #delete.
84
- module EnumerablePop
85
- # Pop first element from list.
86
- # @return [Object] the first object in the list or nil
87
- def pop
88
- first_element = first
89
- delete(first_element)
90
- first_element
74
+ # Queue of urls to be crawled.
75
+ class CrawlQueue
76
+ # @return [Set] that extends EnumerablePop module
77
+ def self.new
78
+ Set.new.extend(EnumerablePop)
79
+ end
80
+
81
+ # Add pop method when added to class.
82
+ # The class that extends this module needs to implement #first and #delete.
83
+ module EnumerablePop
84
+ # Pop first element from list.
85
+ # @return [Object] the first object in the list or nil
86
+ def pop
87
+ first_element = first
88
+ delete(first_element)
89
+ first_element
90
+ end
91
91
  end
92
92
  end
93
- end
93
+ end
94
94
  end
@@ -0,0 +1,57 @@
1
+ module SiteMapper
2
+ # Handles logging
3
+ class Logger
4
+ # @param [Symbol] type of logger class to be used
5
+ def self.use_logger_type(type)
6
+ fail 'Logger already set' if defined?(@@log)
7
+ @@log = case type.to_s
8
+ when 'nil', 'default'
9
+ NilLogger
10
+ when 'system'
11
+ SystemOutLogger
12
+ else
13
+ fail ArgumentError, "Unknown logger type: '#{type}'"
14
+ end
15
+ @@log
16
+ end
17
+
18
+ # @param [Class, #log, #err_log] logger a logger class
19
+ def self.use_logger(logger)
20
+ fail 'Logger already set' if defined?(@@log)
21
+ @@log = logger
22
+ end
23
+
24
+ # @param [String] msg to be logged
25
+ def self.log(msg)
26
+ @@log ||= use_logger_type(:default)
27
+ @@log.log(msg)
28
+ end
29
+
30
+ # @param [String] err_msg to be logged
31
+ def self.err_log(err_msg)
32
+ @@log ||= use_logger_type(:default)
33
+ @@log.err_log(err_msg)
34
+ end
35
+
36
+ # Log to terminal.
37
+ module SystemOutLogger
38
+ # @param [String] msg to be logged to STDOUT
39
+ def self.log(msg)
40
+ STDOUT.puts(msg)
41
+ end
42
+
43
+ # @param [String] msg to be logged to STDERR
44
+ def self.err_log(msg)
45
+ STDERR.puts("[ERROR] #{msg}")
46
+ end
47
+ end
48
+
49
+ # Don't log
50
+ module NilLogger
51
+ # @param [String] msg to be ignored
52
+ def self.log(msg);end
53
+ # @param [String] msg to be ignored
54
+ def self.err_log(msg);end
55
+ end
56
+ end
57
+ end
@@ -37,7 +37,7 @@ module SiteMapper
37
37
  end
38
38
 
39
39
  # Get response body, rescues with nil if an exception is raised.
40
- # @see #get_response
40
+ # @see Request#get_response
41
41
  def get_response_body(*args)
42
42
  get_response(*args).body rescue nil
43
43
  end
@@ -45,14 +45,15 @@ module SiteMapper
45
45
  # Resolves a URL string and follows redirects.
46
46
  # If the URL can't be resolved, the original URL is returned.
47
47
  # @param [String] url
48
- # @param [Boolean] with_query (optional and true by default)
48
+ # @param [Hash] options hash, with_query key (optional and true by default)
49
49
  # @return [String] a URL string that potentially is a redirected URL
50
50
  # @example Resolve google.com
51
51
  # resolve_url('google.com')
52
52
  # # => 'https://www.google.com'
53
- def resolve_url(url, with_query: true)
53
+ def resolve_url(url, options = {})
54
+ options = { with_query: true }.merge(options)
54
55
  resolved = UrlResolver.resolve(url)
55
- resolved = remove_query(resolved) unless with_query
56
+ resolved = remove_query(resolved) unless options[:with_query]
56
57
  resolved
57
58
  end
58
59
 
@@ -1,5 +1,5 @@
1
- # Based on: https://rubygems.org/gems/robots, v0.10.1
2
1
  module SiteMapper
2
+ # Based on: https://rubygems.org/gems/robots, v0.10.1
3
3
  # Provided a base URL it checks whether a given URL is
4
4
  # allowed to be crawled according to /robots.txt.
5
5
  # @see https://rubygems.org/gems/robots
@@ -114,7 +114,9 @@ module SiteMapper
114
114
  end
115
115
  end
116
116
 
117
- # @param [String] url to fetch /robots.txt from
117
+ # @param [String] robots_txt contents of /robots.txt
118
+ # @param [String] hostname for the passed robots_txt
119
+ # @param [String] user_agent to check
118
120
  def initialize(robots_txt, hostname, user_agent)
119
121
  @robots_txt = robots_txt
120
122
  @hostname = hostname
@@ -1,4 +1,4 @@
1
1
  module SiteMapper
2
2
  # Gem version
3
- VERSION = '0.0.9'
3
+ VERSION = '0.0.10'
4
4
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: site_mapper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.9
4
+ version: 0.0.10
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jacob Burenstam
@@ -94,6 +94,20 @@ dependencies:
94
94
  - - ~>
95
95
  - !ruby/object:Gem::Version
96
96
  version: '0.8'
97
+ - !ruby/object:Gem::Dependency
98
+ name: redcarpet
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ~>
102
+ - !ruby/object:Gem::Version
103
+ version: '3.2'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ~>
109
+ - !ruby/object:Gem::Version
110
+ version: '3.2'
97
111
  - !ruby/object:Gem::Dependency
98
112
  name: coveralls
99
113
  requirement: !ruby/object:Gem::Requirement
@@ -119,6 +133,7 @@ files:
119
133
  - bin/site_mapper
120
134
  - lib/site_mapper/crawl_url.rb
121
135
  - lib/site_mapper/crawler.rb
136
+ - lib/site_mapper/logger.rb
122
137
  - lib/site_mapper/request.rb
123
138
  - lib/site_mapper/robots.rb
124
139
  - lib/site_mapper/version.rb