site_mapper 0.0.12 → 0.0.13

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: dc4b21c14dc15f1cc6df4b6406d12acf5cb821d9
4
- data.tar.gz: 34ef0ab2fcd0a74bbcdd53d9e47681d6440f951d
3
+ metadata.gz: f4c94ba470563b948533b4f9f39023ea0bef89bd
4
+ data.tar.gz: ce8e91a137ee13825fdd1b5ef0fe1220d48d85a1
5
5
  SHA512:
6
- metadata.gz: 1d4da1f2753dfb5f06ea577c02183efbf4cb919b783ce128f07b46f29b6af7a330cc01d839895ea4d4fb53cf68db2a6b5adccba47530a0278aaca9bfe1fa4c02
7
- data.tar.gz: c32dde9478240d63b63d6e521e04f3c914322b544c79eee7ee25e1b2ea46b5ad2529ddc7d778d30fe2c10cde074f88a72c53458a814fc5d2ab74bc87288e63ed
6
+ metadata.gz: 36171ef9626a3f905d2441582a95f81b36c2ac5cd068ea3015c32b6c540d8cebc50ba6f4ecec2ac9ea889b8d164c383d2f5cea86cc53401ea33e6183c0bdd1e2
7
+ data.tar.gz: cd9e6bd664c97b0f31dce8e7b56a2b99e92261f20e5c4876b37c89afbe8b616149284a2d579b8c9b15849e5fc0393ef97cf07d7c02e2120fa8b7a01456891f9c
@@ -11,7 +11,7 @@ module SiteMapper
11
11
  # @example Initialize CrawlUrl with example.com as base_url
12
12
  # CrawlUrl.new('example.com')
13
13
  def initialize(base_url)
14
- @resolved_base_url = Request.resolve_url(base_url) # "#{protocol}#{host}"
14
+ @resolved_base_url = Request.resolve_url(base_url)
15
15
  @base_hostname = URI.parse(@resolved_base_url).hostname
16
16
  end
17
17
 
@@ -6,18 +6,14 @@ module SiteMapper
6
6
  class Crawler
7
7
  # Default options
8
8
  OPTIONS = {
9
- resolve: false,
10
9
  sleep_length: 0.5,
11
10
  max_requests: Float::INFINITY
12
11
  }
13
12
 
14
13
  # @param [String] url base url for crawler
15
- # @param [Hash] options hash, resolve key (optional false by default)
16
- # add user_agent key to specify custom User-agent
17
- # @example Create crawler with custom User-agent
14
+ # @param [Hash] options hash
15
+ # @example Create crawler with custom User-Agent
18
16
  # Crawler.new('example.com', user_agent: 'MyUserAgent')
19
- # @example Create crawler and resolve all urls
20
- # Crawler.new('example.com', resolve: true)
21
17
  # @example Create crawler and sleep 1 second between each request
22
18
  # Crawler.new('example.com', sleep_length: 1)
23
19
  # @example Create crawler and perform max 3 requests
@@ -75,7 +71,7 @@ module SiteMapper
75
71
  @processed << current_url
76
72
  link_elements.each do |page_link|
77
73
  url = @crawl_url.absolute_url_from(page_link.attr('href'), current_url)
78
- @fetch_queue << url if url && eligible_for_queue?(resolve(url))
74
+ @fetch_queue << url if url && eligible_for_queue?(url)
79
75
  end
80
76
  end
81
77
 
@@ -85,22 +81,19 @@ module SiteMapper
85
81
 
86
82
  def robots
87
83
  return @robots unless @robots.nil?
88
- robots_body = Request.response_body("#{@base_url}/robots.txt", user_agent: @options[:user_agent])
89
- @robots = Robots.new(robots_body, URI.parse(@base_url).host, SiteMapper::USER_AGENT)
84
+ robots_url = URI.join(@base_url, '/robots.txt').to_s
85
+ robots_body = Request.response_body(robots_url, user_agent: @options[:user_agent])
86
+ @robots = Robots.new(robots_body, URI.parse(@base_url).host, @options[:user_agent])
90
87
  @robots
91
88
  end
92
89
 
93
- def resolve(url)
94
- @options[:resolve] ? Request.resolve_url(url) : url
95
- end
96
-
97
90
  def wait
98
91
  sleep @options[:sleep_length]
99
92
  end
100
93
 
101
94
  # Queue of urls to be crawled.
102
95
  class CrawlQueue
103
- # @return [Set] that exends EnumerablePop module
96
+ # @return [Set] that extends EnumerablePop module
104
97
  def self.new
105
98
  Set.new.extend(EnumerablePop)
106
99
  end
@@ -45,13 +45,13 @@ module SiteMapper
45
45
  # Log to STDOUT
46
46
  # @param [String] msg to be logged to STDOUT
47
47
  def self.log(msg)
48
- STDOUT.puts(msg)
48
+ $stdout.puts(msg)
49
49
  end
50
50
 
51
51
  # Log to STDERR
52
52
  # @param [String] msg to be logged to STDERR
53
53
  def self.err_log(msg)
54
- STDERR.puts("[ERROR] #{msg}")
54
+ $stderr.puts("[ERROR] #{msg}")
55
55
  end
56
56
  end
57
57
 
@@ -1,4 +1,4 @@
1
1
  module SiteMapper
2
2
  # Gem version
3
- VERSION = '0.0.12'
3
+ VERSION = '0.0.13'
4
4
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: site_mapper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.12
4
+ version: 0.0.13
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jacob Burenstam