sitemaps_parser 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +12 -8
- data/lib/sitemaps/fetcher.rb +4 -4
- data/lib/sitemaps/parser.rb +4 -7
- data/lib/sitemaps/version.rb +1 -1
- data/lib/sitemaps.rb +30 -14
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2566aba4212963304e040f2d4867b40b0d263707
|
4
|
+
data.tar.gz: 8f9bc5c86db26122c47e68b83ec14ff34bb76e5a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: aabccdfa4ac6719e0347cc29f100125b0e1fbd625021853b2cd81b54aa7a6daff29ab84761d5826db5ff14fb412fa1fa6a12befe4daeb8bf142a06c3ab329100
|
7
|
+
data.tar.gz: ed64e0d67a466a6f4081d7cbe5e3f00b558188ceb9bde1139fa8917859f5ac0d3f1675403829ae309149d3ea988a2825ebeef7e66572106ca10c552171074c17
|
data/README.md
CHANGED
@@ -1,7 +1,15 @@
|
|
1
1
|
# Sitemaps
|
2
2
|
|
3
|
+
[Gem Version](https://rubygems.org/gems/sitemaps_parser)
|
4
|
+
|
3
5
|
Discover, retrieve and parse XML sitemaps, according to the spec at [sitemaps.org](http://sitemaps.org).
|
4
6
|
|
7
|
+
## TODO
|
8
|
+
|
9
|
+
* discovery
|
10
|
+
* sorting by last modified, or priority
|
11
|
+
* filtering by last modified
|
12
|
+
|
5
13
|
## Installation
|
6
14
|
|
7
15
|
Add this line to your application's Gemfile:
|
@@ -27,19 +35,15 @@ require 'sitemaps'
|
|
27
35
|
Sitemaps.parse("<xml ns=\"...")
|
28
36
|
|
29
37
|
# fetch and parse a sitemap from a known url
|
30
|
-
Sitemaps.fetch("http://google.com/sitemap.xml", recurse: true
|
38
|
+
sitemap = Sitemaps.fetch("http://google.com/sitemap.xml", recurse: true)
|
31
39
|
|
32
|
-
#
|
33
|
-
|
34
|
-
|
35
|
-
if url.present?
|
36
|
-
sitemap = Sitemaps.parse(xml)
|
40
|
+
# fetch and parse sitemaps, excluding paths matching a filter, and limiting to the top 200
|
41
|
+
sitemap = Sitemaps.fetch("https://www.digitalocean.com/sitemaps.xml.gz", max_entries: 200) do |entry|
|
42
|
+
entry.loc.path !~ /blog/i
|
37
43
|
end
|
38
44
|
|
39
45
|
# sitemap usage
|
40
|
-
|
41
46
|
sitemap.entries.first #> Struct(loc: 'http://example.com/page', lastmod: DateTime.utc, changefreq: :monthly, priority: 0.5)
|
42
|
-
|
43
47
|
urls = sitemap.entries.map(&:loc)
|
44
48
|
```
|
45
49
|
|
data/lib/sitemaps/fetcher.rb
CHANGED
@@ -20,7 +20,7 @@ module Sitemaps
|
|
20
20
|
return resp.body
|
21
21
|
end
|
22
22
|
|
23
|
-
|
23
|
+
# on a 3xx response, handle the redirect
|
24
24
|
elsif resp.code.to_s =~ /3\d\d/
|
25
25
|
location = URI.parse(resp.header['location'])
|
26
26
|
location = uri + resp.header['location'] if location.relative?
|
@@ -29,14 +29,14 @@ module Sitemaps
|
|
29
29
|
attempts += 1
|
30
30
|
next
|
31
31
|
|
32
|
-
|
32
|
+
# otherwise (4xx, 5xx) throw an exception
|
33
33
|
else
|
34
|
-
|
34
|
+
raise FetchError, "Failed to fetch URI, #{uri}, failed with response code: #{resp.code}"
|
35
35
|
end
|
36
36
|
end
|
37
37
|
|
38
38
|
# if we got here, we ran out of attempts
|
39
|
-
|
39
|
+
raise MaxRedirectError, "Failed to fetch URI #{uri}, redirected too many times" if attempts >= @max_attempts
|
40
40
|
end
|
41
41
|
end
|
42
42
|
end
|
data/lib/sitemaps/parser.rb
CHANGED
@@ -1,12 +1,7 @@
|
|
1
1
|
module Sitemaps
|
2
2
|
# Parse XML Sitemaps
|
3
3
|
module Parser
|
4
|
-
|
5
|
-
require "rexml/document"
|
6
|
-
require "active_support"
|
7
|
-
require "active_support/core_ext/object/try"
|
8
|
-
|
9
|
-
def self.parse(source)
|
4
|
+
def self.parse(source, max_entries: nil, filter: nil)
|
10
5
|
document = REXML::Document.new(source)
|
11
6
|
entries = document.elements.to_a("/urlset/url").map do |root|
|
12
7
|
loc = parse_loc(root) || next
|
@@ -14,8 +9,10 @@ module Sitemaps
|
|
14
9
|
freq = parse_changefreq(root)
|
15
10
|
pri = parse_priority(root)
|
16
11
|
|
17
|
-
Sitemaps::Entry.new(loc, mod, freq, pri)
|
12
|
+
entry = Sitemaps::Entry.new(loc, mod, freq, pri)
|
13
|
+
(!filter || filter.call(entry)) ? entry : nil
|
18
14
|
end.reject(&:nil?)
|
15
|
+
entries = entries.take(max_entries) unless max_entries.nil?
|
19
16
|
|
20
17
|
sitemaps = document.elements.to_a("/sitemapindex/sitemap").map do |root|
|
21
18
|
loc = parse_loc(root) || next
|
data/lib/sitemaps/version.rb
CHANGED
data/lib/sitemaps.rb
CHANGED
@@ -1,5 +1,14 @@
|
|
1
|
+
require "active_support"
|
2
|
+
require "active_support/core_ext/object/try"
|
1
3
|
require "active_support/core_ext/object/blank"
|
2
4
|
|
5
|
+
require "time"
|
6
|
+
require "rexml/document"
|
7
|
+
|
8
|
+
require "sitemaps/version"
|
9
|
+
require "sitemaps/parser"
|
10
|
+
require "sitemaps/fetcher"
|
11
|
+
|
3
12
|
# Discover, fetch and parse XML sitemaps as defined by the `http://sitemaps.org` spec.
|
4
13
|
module Sitemaps
|
5
14
|
Entry = Struct.new(:loc, :lastmod, :changefreq, :priority)
|
@@ -10,18 +19,22 @@ module Sitemaps
|
|
10
19
|
Sitemaps::Parser.parse(source)
|
11
20
|
end
|
12
21
|
|
13
|
-
def self.fetch(url, fetch: nil, recurse: true)
|
14
|
-
fetch ||= -> (
|
15
|
-
|
22
|
+
def self.fetch(url, fetch: nil, recurse: true, max_entries: nil, &block)
|
23
|
+
fetch ||= -> (u) { Sitemaps::Fetcher.fetch(u) }
|
24
|
+
url = parse_url(url)
|
25
|
+
|
26
|
+
recurse ? fetch_recursive(url, fetch, max_entries, &block) : fetch_single(url, fetch, max_entries, &block)
|
16
27
|
end
|
17
28
|
|
18
|
-
def self.fetch_single(url, fetch)
|
19
|
-
|
20
|
-
|
29
|
+
def self.fetch_single(url, fetch, max_entries, &block)
|
30
|
+
url = parse_url(url)
|
31
|
+
source = fetch.call(url)
|
32
|
+
|
33
|
+
Sitemaps::Parser.parse(source, max_entries: max_entries, filter: block)
|
21
34
|
end
|
22
35
|
|
23
|
-
def self.fetch_recursive(url, fetch)
|
24
|
-
queue = [url]
|
36
|
+
def self.fetch_recursive(url, fetch, max_entries, &block)
|
37
|
+
queue = [parse_url(url)]
|
25
38
|
maps = {}
|
26
39
|
|
27
40
|
# walk the queue, fetching the sitemap requested and adding
|
@@ -32,8 +45,15 @@ module Sitemaps
|
|
32
45
|
break if url.nil?
|
33
46
|
next unless maps[url].nil?
|
34
47
|
|
35
|
-
|
48
|
+
# fetch this item in the queue, and queue up any sub maps it found
|
49
|
+
maps[url] = fetch_single(url, fetch, max_entries, &block)
|
36
50
|
queue.push(*maps[url].sitemaps.map(&:loc))
|
51
|
+
|
52
|
+
# decrement max_entries (since it's max_entries total, not per map)
|
53
|
+
unless max_entries.nil?
|
54
|
+
max_entries -= maps[url].entries.length
|
55
|
+
break if max_entries <= 0
|
56
|
+
end
|
37
57
|
rescue => ex
|
38
58
|
$stderr.puts "ERROR FETCHING: #{url}, #{ex.message}, ignoring..."
|
39
59
|
next
|
@@ -50,11 +70,7 @@ module Sitemaps
|
|
50
70
|
def self.parse_url(url)
|
51
71
|
return url if url.is_a? URI
|
52
72
|
|
53
|
-
url = "http://#{url}" unless url =~
|
73
|
+
url = "http://#{url}" unless url =~ %r{^https?://}
|
54
74
|
URI.parse(url)
|
55
75
|
end
|
56
76
|
end
|
57
|
-
|
58
|
-
require "sitemaps/version"
|
59
|
-
require "sitemaps/parser"
|
60
|
-
require "sitemaps/fetcher"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sitemaps_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jonathan Raphaelson
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-03-
|
11
|
+
date: 2016-03-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|