site_mapper 0.0.12 → 0.0.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: dc4b21c14dc15f1cc6df4b6406d12acf5cb821d9
4
- data.tar.gz: 34ef0ab2fcd0a74bbcdd53d9e47681d6440f951d
3
+ metadata.gz: f4c94ba470563b948533b4f9f39023ea0bef89bd
4
+ data.tar.gz: ce8e91a137ee13825fdd1b5ef0fe1220d48d85a1
5
5
  SHA512:
6
- metadata.gz: 1d4da1f2753dfb5f06ea577c02183efbf4cb919b783ce128f07b46f29b6af7a330cc01d839895ea4d4fb53cf68db2a6b5adccba47530a0278aaca9bfe1fa4c02
7
- data.tar.gz: c32dde9478240d63b63d6e521e04f3c914322b544c79eee7ee25e1b2ea46b5ad2529ddc7d778d30fe2c10cde074f88a72c53458a814fc5d2ab74bc87288e63ed
6
+ metadata.gz: 36171ef9626a3f905d2441582a95f81b36c2ac5cd068ea3015c32b6c540d8cebc50ba6f4ecec2ac9ea889b8d164c383d2f5cea86cc53401ea33e6183c0bdd1e2
7
+ data.tar.gz: cd9e6bd664c97b0f31dce8e7b56a2b99e92261f20e5c4876b37c89afbe8b616149284a2d579b8c9b15849e5fc0393ef97cf07d7c02e2120fa8b7a01456891f9c
@@ -11,7 +11,7 @@ module SiteMapper
11
11
  # @example Initialize CrawlUrl with example.com as base_url
12
12
  # CrawlUrl.new('example.com')
13
13
  def initialize(base_url)
14
- @resolved_base_url = Request.resolve_url(base_url) # "#{protocol}#{host}"
14
+ @resolved_base_url = Request.resolve_url(base_url)
15
15
  @base_hostname = URI.parse(@resolved_base_url).hostname
16
16
  end
17
17
 
@@ -6,18 +6,14 @@ module SiteMapper
6
6
  class Crawler
7
7
  # Default options
8
8
  OPTIONS = {
9
- resolve: false,
10
9
  sleep_length: 0.5,
11
10
  max_requests: Float::INFINITY
12
11
  }
13
12
 
14
13
  # @param [String] url base url for crawler
15
- # @param [Hash] options hash, resolve key (optional false by default)
16
- # add user_agent key to specify custom User-agent
17
- # @example Create crawler with custom User-agent
14
+ # @param [Hash] options hash
15
+ # @example Create crawler with custom User-Agent
18
16
  # Crawler.new('example.com', user_agent: 'MyUserAgent')
19
- # @example Create crawler and resolve all urls
20
- # Crawler.new('example.com', resolve: true)
21
17
  # @example Create crawler and sleep 1 second between each request
22
18
  # Crawler.new('example.com', sleep_length: 1)
23
19
  # @example Create crawler and perform max 3 requests
@@ -75,7 +71,7 @@ module SiteMapper
75
71
  @processed << current_url
76
72
  link_elements.each do |page_link|
77
73
  url = @crawl_url.absolute_url_from(page_link.attr('href'), current_url)
78
- @fetch_queue << url if url && eligible_for_queue?(resolve(url))
74
+ @fetch_queue << url if url && eligible_for_queue?(url)
79
75
  end
80
76
  end
81
77
 
@@ -85,22 +81,19 @@ module SiteMapper
85
81
 
86
82
  def robots
87
83
  return @robots unless @robots.nil?
88
- robots_body = Request.response_body("#{@base_url}/robots.txt", user_agent: @options[:user_agent])
89
- @robots = Robots.new(robots_body, URI.parse(@base_url).host, SiteMapper::USER_AGENT)
84
+ robots_url = URI.join(@base_url, '/robots.txt').to_s
85
+ robots_body = Request.response_body(robots_url, user_agent: @options[:user_agent])
86
+ @robots = Robots.new(robots_body, URI.parse(@base_url).host, @options[:user_agent])
90
87
  @robots
91
88
  end
92
89
 
93
- def resolve(url)
94
- @options[:resolve] ? Request.resolve_url(url) : url
95
- end
96
-
97
90
  def wait
98
91
  sleep @options[:sleep_length]
99
92
  end
100
93
 
101
94
  # Queue of urls to be crawled.
102
95
  class CrawlQueue
103
- # @return [Set] that exends EnumerablePop module
96
+ # @return [Set] that extends EnumerablePop module
104
97
  def self.new
105
98
  Set.new.extend(EnumerablePop)
106
99
  end
@@ -45,13 +45,13 @@ module SiteMapper
45
45
  # Log to STDOUT
46
46
  # @param [String] msg to be logged to STDOUT
47
47
  def self.log(msg)
48
- STDOUT.puts(msg)
48
+ $stdout.puts(msg)
49
49
  end
50
50
 
51
51
  # Log to STDERR
52
52
  # @param [String] msg to be logged to STDERR
53
53
  def self.err_log(msg)
54
- STDERR.puts("[ERROR] #{msg}")
54
+ $stderr.puts("[ERROR] #{msg}")
55
55
  end
56
56
  end
57
57
 
@@ -1,4 +1,4 @@
1
1
  module SiteMapper
2
2
  # Gem version
3
- VERSION = '0.0.12'
3
+ VERSION = '0.0.13'
4
4
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: site_mapper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.12
4
+ version: 0.0.13
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jacob Burenstam