sitemaps_parser 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/sitemaps/parser.rb +4 -2
- data/lib/sitemaps/version.rb +1 -1
- data/lib/sitemaps.rb +11 -7
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 48e74e772cd689a18f735341f9cc1272494e26b8
|
4
|
+
data.tar.gz: 12737b00e5a24ae20ccb17c079b3fc3621e555c8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 98d19dfb44b860c00401bcf2318e6a077101cd27a11fa7665d8d4b2d1a5bcbdfe11d0bd43e9b1da1e5e378dbe9eb92898b8132f2e6fe883e4df5fc0a0ae94138
|
7
|
+
data.tar.gz: e966d408e8edb14f66fce1ad6ce4f3053a53667a9e56ef26d3d3375311b3b4b83a794da859b174c67532dddaa4d01e6fb22cb0d5ec97bd0c856f2190bd0e4fb9
|
data/lib/sitemaps/parser.rb
CHANGED
@@ -9,9 +9,10 @@ module Sitemaps
|
|
9
9
|
# @param source [String] an XML string to parse.
|
10
10
|
# @param max_entries [Integer, nil] the maximum number of entries to add to the sitemap.
|
11
11
|
# @param filter [#call, nil] if provided, called per entry to filter the entry out of the sitemap.
|
12
|
+
# @param filter_indexes [Boolean, nil] if truthy, filter is called per submap as well as entries.
|
12
13
|
# @return [Sitemap] the sitemap parsed from the XML string. If the XML string given is invalid,
|
13
14
|
# a sitemap will still be returned, but the entries and sitemaps keys will be empty.
|
14
|
-
def self.parse(source, max_entries: nil, filter: nil)
|
15
|
+
def self.parse(source, max_entries: nil, filter: nil, filter_indexes: nil)
|
15
16
|
document = REXML::Document.new(source)
|
16
17
|
entries = document.elements.to_a("/urlset/url").map do |root|
|
17
18
|
loc = parse_loc(root) || next
|
@@ -29,7 +30,8 @@ module Sitemaps
|
|
29
30
|
loc = parse_loc(root) || next
|
30
31
|
mod = parse_lastmod(root)
|
31
32
|
|
32
|
-
Sitemaps::Submap.new(loc, mod)
|
33
|
+
submap = Sitemaps::Submap.new(loc, mod)
|
34
|
+
(!filter || !filter_indexes || filter.call(submap)) ? submap : nil
|
33
35
|
end.reject(&:nil?)
|
34
36
|
|
35
37
|
Sitemaps::Sitemap.new(entries, sitemaps)
|
data/lib/sitemaps/version.rb
CHANGED
data/lib/sitemaps.rb
CHANGED
@@ -50,7 +50,7 @@ module Sitemaps
|
|
50
50
|
# a good idea to include, as many sites have _very_ large sitemaps.
|
51
51
|
# @return [Sitemap]
|
52
52
|
#
|
53
|
-
# @overload fetch(url, fetcher: nil, max_entries: nil)
|
53
|
+
# @overload fetch(url, fetcher: nil, filter_indexes: nil, max_entries: nil)
|
54
54
|
# If a block is given, it's used as a filter for entries before they're added to the sitemap.
|
55
55
|
#
|
56
56
|
# @param url [String, URI] the url of the sitemap in question.
|
@@ -58,17 +58,19 @@ module Sitemaps
|
|
58
58
|
# @param max_entries [Integer] the maximum number of entries to include in the sitemap. Once the
|
59
59
|
# sitemap has this many entries, further fetches and parsing will not occur. This is always
|
60
60
|
# a good idea to include, as many sites have _very_ large sitemaps.
|
61
|
+
# @param filter_indexes [Boolean] if true, Submap instances will be run through the filter block
|
62
|
+
# as well as Entry instances.
|
61
63
|
# @return [Sitemap]
|
62
64
|
# @yield [Entry] Filters the entry from the sitemap if the block returns falsey.
|
63
65
|
# @yieldreturn [Boolean] whether or not to include the entry in the sitemap.
|
64
|
-
def self.fetch(url, fetcher: nil, max_entries: nil, &block)
|
66
|
+
def self.fetch(url, fetcher: nil, max_entries: nil, filter_indexes: nil, &block)
|
65
67
|
fetcher ||= @default_fetcher
|
66
68
|
unless url.is_a? URI
|
67
69
|
url = "http://#{url}" unless url =~ %r{^https?://}
|
68
70
|
url = URI.parse(url)
|
69
71
|
end
|
70
72
|
|
71
|
-
_instance.fetch_recursive(url, fetcher, max_entries, &block)
|
73
|
+
_instance.fetch_recursive(url, fetcher, max_entries, filter_indexes, &block)
|
72
74
|
end
|
73
75
|
|
74
76
|
# Discover, fetch and parse sitemaps from the given host.
|
@@ -93,10 +95,12 @@ module Sitemaps
|
|
93
95
|
# @param max_entries [Integer] the maximum number of entries to include in the sitemap. Once the
|
94
96
|
# sitemap has this many entries, further fetches and parsing will not occur. This is always
|
95
97
|
# a good idea to include, as many sites have _very_ large sitemaps.
|
98
|
+
# @param filter_indexes [Boolean] if true, Submap instances will be run through the filter block
|
99
|
+
# as well as Entry instances.
|
96
100
|
# @return [Sitemap]
|
97
101
|
# @yield [Entry] Filters the entry from the sitemap if the block returns falsey.
|
98
102
|
# @yieldreturn [Boolean] whether or not to include the entry in the sitemap.
|
99
|
-
def self.discover(url, fetcher: nil, max_entries: nil, &block)
|
103
|
+
def self.discover(url, fetcher: nil, max_entries: nil, filter_indexes: nil, &block)
|
100
104
|
fetcher ||= @default_fetcher
|
101
105
|
unless url.is_a? URI
|
102
106
|
url = "http://#{url}" unless url =~ %r{^https?://}
|
@@ -104,7 +108,7 @@ module Sitemaps
|
|
104
108
|
end
|
105
109
|
|
106
110
|
roots = _instance.discover_roots(url, fetcher)
|
107
|
-
_instance.fetch_recursive(roots, fetcher, max_entries, &block)
|
111
|
+
_instance.fetch_recursive(roots, fetcher, max_entries, filter_indexes, &block)
|
108
112
|
end
|
109
113
|
|
110
114
|
# @return [Instance]
|
@@ -120,7 +124,7 @@ module Sitemaps
|
|
120
124
|
class Instance
|
121
125
|
# recursively fetch sitemaps and sitemap indexes from the given urls.
|
122
126
|
# @return [Sitemap]
|
123
|
-
def fetch_recursive(urls, fetcher, max_entries, &block)
|
127
|
+
def fetch_recursive(urls, fetcher, max_entries, filter_indexes, &block)
|
124
128
|
queue = urls.is_a?(Array) ? urls : [urls]
|
125
129
|
maps = {}
|
126
130
|
|
@@ -134,7 +138,7 @@ module Sitemaps
|
|
134
138
|
|
135
139
|
# fetch this item in the queue, and queue up any sub maps it found
|
136
140
|
source = fetcher.call(url)
|
137
|
-
sitemap = Sitemaps::Parser.parse(source, max_entries: max_entries, filter: block)
|
141
|
+
sitemap = Sitemaps::Parser.parse(source, max_entries: max_entries, filter: block, filter_indexes: filter_indexes)
|
138
142
|
|
139
143
|
# save the results and queue up any submaps it found
|
140
144
|
maps[url] = sitemap
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sitemaps_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jonathan Raphaelson
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-03-
|
11
|
+
date: 2016-03-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|