sitemaps_parser 0.1.0 → 0.1.1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 84077b93f96854c50072c350407fb253b9e0a952
-  data.tar.gz: 6b1437162a1e98e3e882815cabea1ab5e4b1dc90
+  metadata.gz: 2566aba4212963304e040f2d4867b40b0d263707
+  data.tar.gz: 8f9bc5c86db26122c47e68b83ec14ff34bb76e5a
 SHA512:
-  metadata.gz: 1e705e71f275f333a98dee425c0c11eddcab08b502233e0500328e926f3570227549ef7867206ad03dcd292a932b477b71592123b7ab456594c981dac5e624fb
-  data.tar.gz: 0decafa6d29c0785d01f2099c64691678df91123986e5f075decf3c5b09d55738195aa6d53c787dfbd757b39462c4df245711c5b1ce90640a88afdc89a28854c
+  metadata.gz: aabccdfa4ac6719e0347cc29f100125b0e1fbd625021853b2cd81b54aa7a6daff29ab84761d5826db5ff14fb412fa1fa6a12befe4daeb8bf142a06c3ab329100
+  data.tar.gz: ed64e0d67a466a6f4081d7cbe5e3f00b558188ceb9bde1139fa8917859f5ac0d3f1675403829ae309149d3ea988a2825ebeef7e66572106ca10c552171074c17
data/README.md CHANGED
@@ -1,7 +1,15 @@
 # Sitemaps
 
+[![Gem](https://img.shields.io/gem/v/sitemaps_parser.svg?style=flat-square)](https://rubygems.org/gems/sitemaps_parser)
+
 Discover, retrieve and parse XML sitemaps, according to the spec at [sitemaps.org](http://sitemaps.org).
 
+## TODO
+
+* discovery
+* sorting by last modified, or priority
+* filtering by last modified
+
 ## Installation
 
 Add this line to your application's Gemfile:
@@ -27,19 +35,15 @@ require 'sitemaps'
 Sitemaps.parse("<xml ns=\"...")
 
 # fetch and parse a sitemap from a known url
-Sitemaps.fetch("http://google.com/sitemap.xml", recurse: true, fetch: :builtin)
+sitemap = Sitemaps.fetch("http://google.com/sitemap.xml", recurse: true)
 
-# attempt to discover a sitemap url for a domain
-# parallel fetches a small set of 'normal' urls, returns whichever comes back first, most likely only one.
-xml = Sitemaps.discover("http://example.com")
-if url.present?
-  sitemap = Sitemaps.parse(xml)
+# fetch and parse sitemaps, excluding paths matching a filter, and limiting to the top 200
+sitemap = Sitemaps.fetch("https://www.digitalocean.com/sitemaps.xml.gz", max_entries: 200) do |entry|
+  entry.loc.path !~ /blog/i
 end
 
 # sitemap usage
-
 sitemap.entries.first #> Struct(loc: 'http://example.com/page', lastmod: DateTime.utc, changefreq: :monthly, priority: 0.5)
-
 urls = sitemap.entries.map(&:loc)
 ```
 
data/lib/sitemaps/fetcher.rb CHANGED
@@ -20,7 +20,7 @@ module Sitemaps
       return resp.body
     end
 
-    # on a 3xx response, handle the redirect
+    # on a 3xx response, handle the redirect
     elsif resp.code.to_s =~ /3\d\d/
       location = URI.parse(resp.header['location'])
       location = uri + resp.header['location'] if location.relative?
@@ -29,14 +29,14 @@ module Sitemaps
       attempts += 1
       next
 
-    # otherwise (4xx, 5xx) throw an exception
+    # otherwise (4xx, 5xx) throw an exception
     else
-      fail FetchError, "Failed to fetch URI, #{uri}, failed with response code: #{resp.code}"
+      raise FetchError, "Failed to fetch URI, #{uri}, failed with response code: #{resp.code}"
     end
   end
 
   # if we got here, we ran out of attempts
-  fail MaxRedirectError, "Failed to fetch URI #{uri}, redirected too many times" if attempts >= @max_attempts
+  raise MaxRedirectError, "Failed to fetch URI #{uri}, redirected too many times" if attempts >= @max_attempts
   end
   end
 end
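
For context, the fetcher changed above is what the default `fetch` lambda in `lib/sitemaps.rb` delegates to. Below is a minimal sketch of calling it directly; the URL is illustrative, and the rescue is deliberately broad because only the `FetchError` and `MaxRedirectError` constants are visible in this hunk, not their fully qualified names:

```ruby
require "uri"
require "sitemaps"

begin
  # Sitemaps::Fetcher.fetch takes a URI, follows 3xx redirects up to the
  # fetcher's attempt limit, and returns the response body on a 2xx.
  body    = Sitemaps::Fetcher.fetch(URI.parse("http://example.com/sitemap.xml"))
  sitemap = Sitemaps.parse(body)
  puts sitemap.entries.length
rescue StandardError => e
  # 4xx/5xx responses and exhausted redirects surface here as
  # FetchError / MaxRedirectError respectively.
  warn "fetch failed: #{e.message}"
end
```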
data/lib/sitemaps/parser.rb CHANGED
@@ -1,12 +1,7 @@
 module Sitemaps
   # Parse XML Sitemaps
   module Parser
-    require "time"
-    require "rexml/document"
-    require "active_support"
-    require "active_support/core_ext/object/try"
-
-    def self.parse(source)
+    def self.parse(source, max_entries: nil, filter: nil)
       document = REXML::Document.new(source)
       entries = document.elements.to_a("/urlset/url").map do |root|
         loc = parse_loc(root) || next
@@ -14,8 +9,10 @@ module Sitemaps
         freq = parse_changefreq(root)
         pri = parse_priority(root)
 
-        Sitemaps::Entry.new(loc, mod, freq, pri)
+        entry = Sitemaps::Entry.new(loc, mod, freq, pri)
+        (!filter || filter.call(entry)) ? entry : nil
       end.reject(&:nil?)
+      entries = entries.take(max_entries) unless max_entries.nil?
 
       sitemaps = document.elements.to_a("/sitemapindex/sitemap").map do |root|
         loc = parse_loc(root) || next
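
The new `filter:` and `max_entries:` keywords let the parser drop entries while it maps over the `/urlset/url` nodes and then truncate whatever survives. A rough sketch of exercising `Sitemaps::Parser.parse` directly; the inline XML (namespace omitted for brevity) and the assumption that `entry.loc` is a `URI`, as the README's `entry.loc.path` usage suggests, are illustrative:

```ruby
require "sitemaps"

xml = <<~XML
  <urlset>
    <url><loc>http://example.com/products/1</loc><priority>0.8</priority></url>
    <url><loc>http://example.com/blog/post</loc></url>
    <url><loc>http://example.com/products/2</loc></url>
  </urlset>
XML

# keep only non-blog entries, then cap the result at two entries total
result = Sitemaps::Parser.parse(xml, max_entries: 2,
                                filter: ->(entry) { entry.loc.path !~ /blog/i })

result.entries.map { |e| e.loc.to_s }
# expected: the two /products URLs, with the /blog entry filtered out
```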
data/lib/sitemaps/version.rb CHANGED
@@ -1,3 +1,3 @@
 module Sitemaps
-  VERSION = "0.1.0".freeze
+  VERSION = "0.1.1".freeze
 end
data/lib/sitemaps.rb CHANGED
@@ -1,5 +1,14 @@
+require "active_support"
+require "active_support/core_ext/object/try"
 require "active_support/core_ext/object/blank"
 
+require "time"
+require "rexml/document"
+
+require "sitemaps/version"
+require "sitemaps/parser"
+require "sitemaps/fetcher"
+
 # Discover, fetch and parse XML sitemaps as defined by the `http://sitemaps.org` spec.
 module Sitemaps
   Entry = Struct.new(:loc, :lastmod, :changefreq, :priority)
@@ -10,18 +19,22 @@ module Sitemaps
     Sitemaps::Parser.parse(source)
   end
 
-  def self.fetch(url, fetch: nil, recurse: true)
-    fetch ||= -> (url) { Sitemaps::Fetcher.fetch(url) }
-    recurse ? fetch_recursive(url, fetch) : fetch_single(url, fetch)
+  def self.fetch(url, fetch: nil, recurse: true, max_entries: nil, &block)
+    fetch ||= -> (u) { Sitemaps::Fetcher.fetch(u) }
+    url = parse_url(url)
+
+    recurse ? fetch_recursive(url, fetch, max_entries, &block) : fetch_single(url, fetch, max_entries, &block)
   end
 
-  def self.fetch_single(url, fetch)
-    source = fetch.call(parse_url(url))
-    Sitemaps::Parser.parse(source)
+  def self.fetch_single(url, fetch, max_entries, &block)
+    url = parse_url(url)
+    source = fetch.call(url)
+
+    Sitemaps::Parser.parse(source, max_entries: max_entries, filter: block)
   end
 
-  def self.fetch_recursive(url, fetch)
-    queue = [url]
+  def self.fetch_recursive(url, fetch, max_entries, &block)
+    queue = [parse_url(url)]
     maps = {}
 
     # walk the queue, fetching the sitemap requested and adding
@@ -32,8 +45,15 @@ module Sitemaps
       break if url.nil?
       next unless maps[url].nil?
 
-      maps[url] = fetch_single(url, fetch)
+      # fetch this item in the queue, and queue up any sub maps it found
+      maps[url] = fetch_single(url, fetch, max_entries, &block)
       queue.push(*maps[url].sitemaps.map(&:loc))
+
+      # decrement max_entries (since it's max_entries total, not per map)
+      unless max_entries.nil?
+        max_entries -= maps[url].entries.length
+        break if max_entries <= 0
+      end
     rescue => ex
       $stderr.puts "ERROR FETCHING: #{url}, #{ex.message}, ignoring..."
       next
@@ -50,11 +70,7 @@ module Sitemaps
   def self.parse_url(url)
     return url if url.is_a? URI
 
-    url = "http://#{url}" unless url =~ /^https?/
+    url = "http://#{url}" unless url =~ %r{^https?://}
     URI.parse(url)
   end
 end
-
-require "sitemaps/version"
-require "sitemaps/parser"
-require "sitemaps/fetcher"
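
Put together, the reworked `Sitemaps.fetch` threads `max_entries` and the filter block through both `fetch_single` and `fetch_recursive`, and the recursive path decrements `max_entries` after each sub-sitemap so the cap applies to the whole crawl rather than per map. A usage sketch; the URLs, the filter predicate, and the custom `Net::HTTP` lambda are illustrative (a bare lambda like this skips the redirect handling the built-in fetcher provides):

```ruby
require "net/http"
require "sitemaps"

# follow a sitemap index recursively, keep only /docs/ pages, and stop
# once roughly 500 entries have been collected across all sub-sitemaps
sitemap = Sitemaps.fetch("http://example.com/sitemap_index.xml",
                         recurse: true, max_entries: 500) do |entry|
  entry.loc.path.start_with?("/docs/")
end
puts sitemap.entries.length

# swap in a custom fetch lambda; it receives the already-parsed URI
simple_fetch = ->(uri) { Net::HTTP.get(uri) }
sitemap = Sitemaps.fetch("example.com/sitemap.xml", fetch: simple_fetch, recurse: false)
```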
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: sitemaps_parser
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.1.1
 platform: ruby
 authors:
 - Jonathan Raphaelson
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-03-21 00:00:00.000000000 Z
+date: 2016-03-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler