sitemaps_parser 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +12 -8
- data/lib/sitemaps/fetcher.rb +4 -4
- data/lib/sitemaps/parser.rb +4 -7
- data/lib/sitemaps/version.rb +1 -1
- data/lib/sitemaps.rb +30 -14
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2566aba4212963304e040f2d4867b40b0d263707
|
4
|
+
data.tar.gz: 8f9bc5c86db26122c47e68b83ec14ff34bb76e5a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: aabccdfa4ac6719e0347cc29f100125b0e1fbd625021853b2cd81b54aa7a6daff29ab84761d5826db5ff14fb412fa1fa6a12befe4daeb8bf142a06c3ab329100
|
7
|
+
data.tar.gz: ed64e0d67a466a6f4081d7cbe5e3f00b558188ceb9bde1139fa8917859f5ac0d3f1675403829ae309149d3ea988a2825ebeef7e66572106ca10c552171074c17
|
data/README.md
CHANGED
@@ -1,7 +1,15 @@
|
|
1
1
|
# Sitemaps
|
2
2
|
|
3
|
+
[![Gem](https://img.shields.io/gem/v/sitemaps_parser.svg?style=flat-square)](https://rubygems.org/gems/sitemaps_parser)
|
4
|
+
|
3
5
|
Discover, retrieve and parse XML sitemaps, according to the spec at [sitemaps.org](http://sitemaps.org).
|
4
6
|
|
7
|
+
## TODO
|
8
|
+
|
9
|
+
* discovery
|
10
|
+
* sorting by last modified, or priority
|
11
|
+
* filtering by last modified
|
12
|
+
|
5
13
|
## Installation
|
6
14
|
|
7
15
|
Add this line to your application's Gemfile:
|
@@ -27,19 +35,15 @@ require 'sitemaps'
|
|
27
35
|
Sitemaps.parse("<xml ns=\"...")
|
28
36
|
|
29
37
|
# fetch and parse a sitemap from a known url
|
30
|
-
Sitemaps.fetch("http://google.com/sitemap.xml", recurse: true
|
38
|
+
sitemap = Sitemaps.fetch("http://google.com/sitemap.xml", recurse: true)
|
31
39
|
|
32
|
-
#
|
33
|
-
|
34
|
-
|
35
|
-
if url.present?
|
36
|
-
sitemap = Sitemaps.parse(xml)
|
40
|
+
# fetch and parse sitemaps, excluding paths matching a filter, and limiting to the top 200
|
41
|
+
sitemap = Sitemaps.fetch("https://www.digitalocean.com/sitemaps.xml.gz", max_entries: 200) do |entry|
|
42
|
+
entry.loc.path !~ /blog/i
|
37
43
|
end
|
38
44
|
|
39
45
|
# sitemap usage
|
40
|
-
|
41
46
|
sitemap.entries.first #> Struct(loc: 'http://example.com/page', lastmod: DateTime.utc, changefreq: :monthly, priority: 0.5)
|
42
|
-
|
43
47
|
urls = sitemap.entries.map(&:loc)
|
44
48
|
```
|
45
49
|
|
data/lib/sitemaps/fetcher.rb
CHANGED
@@ -20,7 +20,7 @@ module Sitemaps
|
|
20
20
|
return resp.body
|
21
21
|
end
|
22
22
|
|
23
|
-
|
23
|
+
# on a 3xx response, handle the redirect
|
24
24
|
elsif resp.code.to_s =~ /3\d\d/
|
25
25
|
location = URI.parse(resp.header['location'])
|
26
26
|
location = uri + resp.header['location'] if location.relative?
|
@@ -29,14 +29,14 @@ module Sitemaps
|
|
29
29
|
attempts += 1
|
30
30
|
next
|
31
31
|
|
32
|
-
|
32
|
+
# otherwise (4xx, 5xx) throw an exception
|
33
33
|
else
|
34
|
-
|
34
|
+
raise FetchError, "Failed to fetch URI, #{uri}, failed with response code: #{resp.code}"
|
35
35
|
end
|
36
36
|
end
|
37
37
|
|
38
38
|
# if we got here, we ran out of attempts
|
39
|
-
|
39
|
+
raise MaxRedirectError, "Failed to fetch URI #{uri}, redirected too many times" if attempts >= @max_attempts
|
40
40
|
end
|
41
41
|
end
|
42
42
|
end
|
data/lib/sitemaps/parser.rb
CHANGED
@@ -1,12 +1,7 @@
|
|
1
1
|
module Sitemaps
|
2
2
|
# Parse XML Sitemaps
|
3
3
|
module Parser
|
4
|
-
|
5
|
-
require "rexml/document"
|
6
|
-
require "active_support"
|
7
|
-
require "active_support/core_ext/object/try"
|
8
|
-
|
9
|
-
def self.parse(source)
|
4
|
+
def self.parse(source, max_entries: nil, filter: nil)
|
10
5
|
document = REXML::Document.new(source)
|
11
6
|
entries = document.elements.to_a("/urlset/url").map do |root|
|
12
7
|
loc = parse_loc(root) || next
|
@@ -14,8 +9,10 @@ module Sitemaps
|
|
14
9
|
freq = parse_changefreq(root)
|
15
10
|
pri = parse_priority(root)
|
16
11
|
|
17
|
-
Sitemaps::Entry.new(loc, mod, freq, pri)
|
12
|
+
entry = Sitemaps::Entry.new(loc, mod, freq, pri)
|
13
|
+
(!filter || filter.call(entry)) ? entry : nil
|
18
14
|
end.reject(&:nil?)
|
15
|
+
entries = entries.take(max_entries) unless max_entries.nil?
|
19
16
|
|
20
17
|
sitemaps = document.elements.to_a("/sitemapindex/sitemap").map do |root|
|
21
18
|
loc = parse_loc(root) || next
|
data/lib/sitemaps/version.rb
CHANGED
data/lib/sitemaps.rb
CHANGED
@@ -1,5 +1,14 @@
|
|
1
|
+
require "active_support"
|
2
|
+
require "active_support/core_ext/object/try"
|
1
3
|
require "active_support/core_ext/object/blank"
|
2
4
|
|
5
|
+
require "time"
|
6
|
+
require "rexml/document"
|
7
|
+
|
8
|
+
require "sitemaps/version"
|
9
|
+
require "sitemaps/parser"
|
10
|
+
require "sitemaps/fetcher"
|
11
|
+
|
3
12
|
# Discover, fetch and parse XML sitemaps as defined by the `http://sitemaps.org` spec.
|
4
13
|
module Sitemaps
|
5
14
|
Entry = Struct.new(:loc, :lastmod, :changefreq, :priority)
|
@@ -10,18 +19,22 @@ module Sitemaps
|
|
10
19
|
Sitemaps::Parser.parse(source)
|
11
20
|
end
|
12
21
|
|
13
|
-
def self.fetch(url, fetch: nil, recurse: true)
|
14
|
-
fetch ||= -> (
|
15
|
-
|
22
|
+
def self.fetch(url, fetch: nil, recurse: true, max_entries: nil, &block)
|
23
|
+
fetch ||= -> (u) { Sitemaps::Fetcher.fetch(u) }
|
24
|
+
url = parse_url(url)
|
25
|
+
|
26
|
+
recurse ? fetch_recursive(url, fetch, max_entries, &block) : fetch_single(url, fetch, max_entries, &block)
|
16
27
|
end
|
17
28
|
|
18
|
-
def self.fetch_single(url, fetch)
|
19
|
-
|
20
|
-
|
29
|
+
def self.fetch_single(url, fetch, max_entries, &block)
|
30
|
+
url = parse_url(url)
|
31
|
+
source = fetch.call(url)
|
32
|
+
|
33
|
+
Sitemaps::Parser.parse(source, max_entries: max_entries, filter: block)
|
21
34
|
end
|
22
35
|
|
23
|
-
def self.fetch_recursive(url, fetch)
|
24
|
-
queue = [url]
|
36
|
+
def self.fetch_recursive(url, fetch, max_entries, &block)
|
37
|
+
queue = [parse_url(url)]
|
25
38
|
maps = {}
|
26
39
|
|
27
40
|
# walk the queue, fetching the sitemap requested and adding
|
@@ -32,8 +45,15 @@ module Sitemaps
|
|
32
45
|
break if url.nil?
|
33
46
|
next unless maps[url].nil?
|
34
47
|
|
35
|
-
|
48
|
+
# fetch this item in the queue, and queue up any sub maps it found
|
49
|
+
maps[url] = fetch_single(url, fetch, max_entries, &block)
|
36
50
|
queue.push(*maps[url].sitemaps.map(&:loc))
|
51
|
+
|
52
|
+
# decrement max_entries (since it's max_entries total, not per map)
|
53
|
+
unless max_entries.nil?
|
54
|
+
max_entries -= maps[url].entries.length
|
55
|
+
break if max_entries <= 0
|
56
|
+
end
|
37
57
|
rescue => ex
|
38
58
|
$stderr.puts "ERROR FETCHING: #{url}, #{ex.message}, ignoring..."
|
39
59
|
next
|
@@ -50,11 +70,7 @@ module Sitemaps
|
|
50
70
|
def self.parse_url(url)
|
51
71
|
return url if url.is_a? URI
|
52
72
|
|
53
|
-
url = "http://#{url}" unless url =~
|
73
|
+
url = "http://#{url}" unless url =~ %r{^https?://}
|
54
74
|
URI.parse(url)
|
55
75
|
end
|
56
76
|
end
|
57
|
-
|
58
|
-
require "sitemaps/version"
|
59
|
-
require "sitemaps/parser"
|
60
|
-
require "sitemaps/fetcher"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sitemaps_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jonathan Raphaelson
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-03-
|
11
|
+
date: 2016-03-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|