sitemaps_parser 0.1.0 → 0.1.1

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 84077b93f96854c50072c350407fb253b9e0a952
4
- data.tar.gz: 6b1437162a1e98e3e882815cabea1ab5e4b1dc90
3
+ metadata.gz: 2566aba4212963304e040f2d4867b40b0d263707
4
+ data.tar.gz: 8f9bc5c86db26122c47e68b83ec14ff34bb76e5a
5
5
  SHA512:
6
- metadata.gz: 1e705e71f275f333a98dee425c0c11eddcab08b502233e0500328e926f3570227549ef7867206ad03dcd292a932b477b71592123b7ab456594c981dac5e624fb
7
- data.tar.gz: 0decafa6d29c0785d01f2099c64691678df91123986e5f075decf3c5b09d55738195aa6d53c787dfbd757b39462c4df245711c5b1ce90640a88afdc89a28854c
6
+ metadata.gz: aabccdfa4ac6719e0347cc29f100125b0e1fbd625021853b2cd81b54aa7a6daff29ab84761d5826db5ff14fb412fa1fa6a12befe4daeb8bf142a06c3ab329100
7
+ data.tar.gz: ed64e0d67a466a6f4081d7cbe5e3f00b558188ceb9bde1139fa8917859f5ac0d3f1675403829ae309149d3ea988a2825ebeef7e66572106ca10c552171074c17
data/README.md CHANGED
@@ -1,7 +1,15 @@
1
1
  # Sitemaps
2
2
 
3
+ [![Gem](https://img.shields.io/gem/v/sitemaps_parser.svg?style=flat-square)](https://rubygems.org/gems/sitemaps_parser)
4
+
3
5
  Discover, retrieve and parse XML sitemaps, according to the spec at [sitemaps.org](http://sitemaps.org).
4
6
 
7
+ ## TODO
8
+
9
+ * discovery
10
+ * sorting by last modified, or priority
11
+ * filtering by last modified
12
+
5
13
  ## Installation
6
14
 
7
15
  Add this line to your application's Gemfile:
@@ -27,19 +35,15 @@ require 'sitemaps'
27
35
  Sitemaps.parse("<xml ns=\"...")
28
36
 
29
37
  # fetch and parse a sitemap from a known url
30
- Sitemaps.fetch("http://google.com/sitemap.xml", recurse: true, fetch: :builtin)
38
+ sitemap = Sitemaps.fetch("http://google.com/sitemap.xml", recurse: true)
31
39
 
32
- # attempt to discover a sitemap url for a domain
33
- # parallel fetches a small set of 'normal' urls, returns whichever comes back first, most likely only one.
34
- xml = Sitemaps.discover("http://example.com")
35
- if url.present?
36
- sitemap = Sitemaps.parse(xml)
40
+ # fetch and parse sitemaps, excluding paths matching a filter, and limiting to the top 200
41
+ sitemap = Sitemaps.fetch("https://www.digitalocean.com/sitemaps.xml.gz", max_entries: 200) do |entry|
42
+ entry.loc.path !~ /blog/i
37
43
  end
38
44
 
39
45
  # sitemap usage
40
-
41
46
  sitemap.entries.first #> Struct(loc: 'http://example.com/page', lastmod: DateTime.utc, changefreq: :monthly, priority: 0.5)
42
-
43
47
  urls = sitemap.entries.map(&:loc)
44
48
  ```
45
49
 
@@ -20,7 +20,7 @@ module Sitemaps
20
20
  return resp.body
21
21
  end
22
22
 
23
- # on a 3xx response, handle the redirect
23
+ # on a 3xx response, handle the redirect
24
24
  elsif resp.code.to_s =~ /3\d\d/
25
25
  location = URI.parse(resp.header['location'])
26
26
  location = uri + resp.header['location'] if location.relative?
@@ -29,14 +29,14 @@ module Sitemaps
29
29
  attempts += 1
30
30
  next
31
31
 
32
- # otherwise (4xx, 5xx) throw an exception
32
+ # otherwise (4xx, 5xx) throw an exception
33
33
  else
34
- fail FetchError, "Failed to fetch URI, #{uri}, failed with response code: #{resp.code}"
34
+ raise FetchError, "Failed to fetch URI, #{uri}, failed with response code: #{resp.code}"
35
35
  end
36
36
  end
37
37
 
38
38
  # if we got here, we ran out of attempts
39
- fail MaxRedirectError, "Failed to fetch URI #{uri}, redirected too many times" if attempts >= @max_attempts
39
+ raise MaxRedirectError, "Failed to fetch URI #{uri}, redirected too many times" if attempts >= @max_attempts
40
40
  end
41
41
  end
42
42
  end
@@ -1,12 +1,7 @@
1
1
  module Sitemaps
2
2
  # Parse XML Sitemaps
3
3
  module Parser
4
- require "time"
5
- require "rexml/document"
6
- require "active_support"
7
- require "active_support/core_ext/object/try"
8
-
9
- def self.parse(source)
4
+ def self.parse(source, max_entries: nil, filter: nil)
10
5
  document = REXML::Document.new(source)
11
6
  entries = document.elements.to_a("/urlset/url").map do |root|
12
7
  loc = parse_loc(root) || next
@@ -14,8 +9,10 @@ module Sitemaps
14
9
  freq = parse_changefreq(root)
15
10
  pri = parse_priority(root)
16
11
 
17
- Sitemaps::Entry.new(loc, mod, freq, pri)
12
+ entry = Sitemaps::Entry.new(loc, mod, freq, pri)
13
+ (!filter || filter.call(entry)) ? entry : nil
18
14
  end.reject(&:nil?)
15
+ entries = entries.take(max_entries) unless max_entries.nil?
19
16
 
20
17
  sitemaps = document.elements.to_a("/sitemapindex/sitemap").map do |root|
21
18
  loc = parse_loc(root) || next
@@ -1,3 +1,3 @@
1
1
  module Sitemaps
2
- VERSION = "0.1.0".freeze
2
+ VERSION = "0.1.1".freeze
3
3
  end
data/lib/sitemaps.rb CHANGED
@@ -1,5 +1,14 @@
1
+ require "active_support"
2
+ require "active_support/core_ext/object/try"
1
3
  require "active_support/core_ext/object/blank"
2
4
 
5
+ require "time"
6
+ require "rexml/document"
7
+
8
+ require "sitemaps/version"
9
+ require "sitemaps/parser"
10
+ require "sitemaps/fetcher"
11
+
3
12
  # Discover, fetch and parse XML sitemaps as defined by the `http://sitemaps.org` spec.
4
13
  module Sitemaps
5
14
  Entry = Struct.new(:loc, :lastmod, :changefreq, :priority)
@@ -10,18 +19,22 @@ module Sitemaps
10
19
  Sitemaps::Parser.parse(source)
11
20
  end
12
21
 
13
- def self.fetch(url, fetch: nil, recurse: true)
14
- fetch ||= -> (url) { Sitemaps::Fetcher.fetch(url) }
15
- recurse ? fetch_recursive(url, fetch) : fetch_single(url, fetch)
22
+ def self.fetch(url, fetch: nil, recurse: true, max_entries: nil, &block)
23
+ fetch ||= -> (u) { Sitemaps::Fetcher.fetch(u) }
24
+ url = parse_url(url)
25
+
26
+ recurse ? fetch_recursive(url, fetch, max_entries, &block) : fetch_single(url, fetch, max_entries, &block)
16
27
  end
17
28
 
18
- def self.fetch_single(url, fetch)
19
- source = fetch.call(parse_url(url))
20
- Sitemaps::Parser.parse(source)
29
+ def self.fetch_single(url, fetch, max_entries, &block)
30
+ url = parse_url(url)
31
+ source = fetch.call(url)
32
+
33
+ Sitemaps::Parser.parse(source, max_entries: max_entries, filter: block)
21
34
  end
22
35
 
23
- def self.fetch_recursive(url, fetch)
24
- queue = [url]
36
+ def self.fetch_recursive(url, fetch, max_entries, &block)
37
+ queue = [parse_url(url)]
25
38
  maps = {}
26
39
 
27
40
  # walk the queue, fetching the sitemap requested and adding
@@ -32,8 +45,15 @@ module Sitemaps
32
45
  break if url.nil?
33
46
  next unless maps[url].nil?
34
47
 
35
- maps[url] = fetch_single(url, fetch)
48
+ # fetch this item in the queue, and queue up any sub maps it found
49
+ maps[url] = fetch_single(url, fetch, max_entries, &block)
36
50
  queue.push(*maps[url].sitemaps.map(&:loc))
51
+
52
+ # decrement max_entries (since it's max_entries total, not per map)
53
+ unless max_entries.nil?
54
+ max_entries -= maps[url].entries.length
55
+ break if max_entries <= 0
56
+ end
37
57
  rescue => ex
38
58
  $stderr.puts "ERROR FETCHING: #{url}, #{ex.message}, ignoring..."
39
59
  next
@@ -50,11 +70,7 @@ module Sitemaps
50
70
  def self.parse_url(url)
51
71
  return url if url.is_a? URI
52
72
 
53
- url = "http://#{url}" unless url =~ /^https?/
73
+ url = "http://#{url}" unless url =~ %r{^https?://}
54
74
  URI.parse(url)
55
75
  end
56
76
  end
57
-
58
- require "sitemaps/version"
59
- require "sitemaps/parser"
60
- require "sitemaps/fetcher"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sitemaps_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jonathan Raphaelson
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-03-21 00:00:00.000000000 Z
11
+ date: 2016-03-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler