sitemaps_parser 0.2.0 → 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/sitemaps/parser.rb +4 -2
- data/lib/sitemaps/version.rb +1 -1
- data/lib/sitemaps.rb +11 -7
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 48e74e772cd689a18f735341f9cc1272494e26b8
|
4
|
+
data.tar.gz: 12737b00e5a24ae20ccb17c079b3fc3621e555c8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 98d19dfb44b860c00401bcf2318e6a077101cd27a11fa7665d8d4b2d1a5bcbdfe11d0bd43e9b1da1e5e378dbe9eb92898b8132f2e6fe883e4df5fc0a0ae94138
|
7
|
+
data.tar.gz: e966d408e8edb14f66fce1ad6ce4f3053a53667a9e56ef26d3d3375311b3b4b83a794da859b174c67532dddaa4d01e6fb22cb0d5ec97bd0c856f2190bd0e4fb9
|
data/lib/sitemaps/parser.rb
CHANGED
@@ -9,9 +9,10 @@ module Sitemaps
|
|
9
9
|
# @param source [String] an XML string to parse.
|
10
10
|
# @param max_entries [Integer, nil] the maximum number of entries to add to the sitemap.
|
11
11
|
# @param filter [#call, nil] if provided, called per entry to filter the entry out of the sitemap.
|
12
|
+
# @param filter_indexes [Boolean, nil] if truthy, filter is called per submap as well as entries.
|
12
13
|
# @return [Sitemap] the sitemap parsed from the XML string. If the XML string given is invalid,
|
13
14
|
# a sitemap will still be returned, but the entries and sitemaps keys will be empty.
|
14
|
-
def self.parse(source, max_entries: nil, filter: nil)
|
15
|
+
def self.parse(source, max_entries: nil, filter: nil, filter_indexes: nil)
|
15
16
|
document = REXML::Document.new(source)
|
16
17
|
entries = document.elements.to_a("/urlset/url").map do |root|
|
17
18
|
loc = parse_loc(root) || next
|
@@ -29,7 +30,8 @@ module Sitemaps
|
|
29
30
|
loc = parse_loc(root) || next
|
30
31
|
mod = parse_lastmod(root)
|
31
32
|
|
32
|
-
Sitemaps::Submap.new(loc, mod)
|
33
|
+
submap = Sitemaps::Submap.new(loc, mod)
|
34
|
+
(!filter || !filter_indexes || filter.call(submap)) ? submap : nil
|
33
35
|
end.reject(&:nil?)
|
34
36
|
|
35
37
|
Sitemaps::Sitemap.new(entries, sitemaps)
|
data/lib/sitemaps/version.rb
CHANGED
data/lib/sitemaps.rb
CHANGED
@@ -50,7 +50,7 @@ module Sitemaps
|
|
50
50
|
# a good idea to include, as many sites have _very_ large sitemaps.
|
51
51
|
# @return [Sitemap]
|
52
52
|
#
|
53
|
-
# @overload fetch(url, fetcher: nil, max_entries: nil)
|
53
|
+
# @overload fetch(url, fetcher: nil, filter_indexes: nil, max_entries: nil)
|
54
54
|
# If a block is given, it's used as a filter for entries before they're added to the sitemap.
|
55
55
|
#
|
56
56
|
# @param url [String, URI] the url of the sitemap in question.
|
@@ -58,17 +58,19 @@ module Sitemaps
|
|
58
58
|
# @param max_entries [Integer] the maximum number of entries to include in the sitemap. Once the
|
59
59
|
# sitemap has this many entries, further fetches and parsing will not occur. This is always
|
60
60
|
# a good idea to include, as many sites have _very_ large sitemaps.
|
61
|
+
# @param filter_indexes [Boolean] if true, Submap instances will be run through the filter block
|
62
|
+
# as well as Entry instances.
|
61
63
|
# @return [Sitemap]
|
62
64
|
# @yield [Entry] Filters the entry from the sitemap if the block returns falsey.
|
63
65
|
# @yieldreturn [Boolean] whether or not to include the entry in the sitemap.
|
64
|
-
def self.fetch(url, fetcher: nil, max_entries: nil, &block)
|
66
|
+
def self.fetch(url, fetcher: nil, max_entries: nil, filter_indexes: nil, &block)
|
65
67
|
fetcher ||= @default_fetcher
|
66
68
|
unless url.is_a? URI
|
67
69
|
url = "http://#{url}" unless url =~ %r{^https?://}
|
68
70
|
url = URI.parse(url)
|
69
71
|
end
|
70
72
|
|
71
|
-
_instance.fetch_recursive(url, fetcher, max_entries, &block)
|
73
|
+
_instance.fetch_recursive(url, fetcher, max_entries, filter_indexes, &block)
|
72
74
|
end
|
73
75
|
|
74
76
|
# Discover, fetch and parse sitemaps from the given host.
|
@@ -93,10 +95,12 @@ module Sitemaps
|
|
93
95
|
# @param max_entries [Integer] the maximum number of entries to include in the sitemap. Once the
|
94
96
|
# sitemap has this many entries, further fetches and parsing will not occur. This is always
|
95
97
|
# a good idea to include, as many sites have _very_ large sitemaps.
|
98
|
+
# @param filter_indexes [Boolean] if true, Submap instances will be run through the filter block
|
99
|
+
# as well as Entry instances.
|
96
100
|
# @return [Sitemap]
|
97
101
|
# @yield [Entry] Filters the entry from the sitemap if the block returns falsey.
|
98
102
|
# @yieldreturn [Boolean] whether or not to include the entry in the sitemap.
|
99
|
-
def self.discover(url, fetcher: nil, max_entries: nil, &block)
|
103
|
+
def self.discover(url, fetcher: nil, max_entries: nil, filter_indexes: nil, &block)
|
100
104
|
fetcher ||= @default_fetcher
|
101
105
|
unless url.is_a? URI
|
102
106
|
url = "http://#{url}" unless url =~ %r{^https?://}
|
@@ -104,7 +108,7 @@ module Sitemaps
|
|
104
108
|
end
|
105
109
|
|
106
110
|
roots = _instance.discover_roots(url, fetcher)
|
107
|
-
_instance.fetch_recursive(roots, fetcher, max_entries, &block)
|
111
|
+
_instance.fetch_recursive(roots, fetcher, max_entries, filter_indexes, &block)
|
108
112
|
end
|
109
113
|
|
110
114
|
# @return [Instance]
|
@@ -120,7 +124,7 @@ module Sitemaps
|
|
120
124
|
class Instance
|
121
125
|
# recursively fetch sitemaps and sitemap indexes from the given urls.
|
122
126
|
# @return [Sitemap]
|
123
|
-
def fetch_recursive(urls, fetcher, max_entries, &block)
|
127
|
+
def fetch_recursive(urls, fetcher, max_entries, filter_indexes, &block)
|
124
128
|
queue = urls.is_a?(Array) ? urls : [urls]
|
125
129
|
maps = {}
|
126
130
|
|
@@ -134,7 +138,7 @@ module Sitemaps
|
|
134
138
|
|
135
139
|
# fetch this item in the queue, and queue up any sub maps it found
|
136
140
|
source = fetcher.call(url)
|
137
|
-
sitemap = Sitemaps::Parser.parse(source, max_entries: max_entries, filter: block)
|
141
|
+
sitemap = Sitemaps::Parser.parse(source, max_entries: max_entries, filter: block, filter_indexes: filter_indexes)
|
138
142
|
|
139
143
|
# save the results and queue up any submaps it found
|
140
144
|
maps[url] = sitemap
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sitemaps_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jonathan Raphaelson
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-03-
|
11
|
+
date: 2016-03-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|