sitemaps_parser 0.2.0 → 0.2.1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: f6a5cf10a4446ce42e918767ee0279a81178951c
- data.tar.gz: 878301d4d7ffd4429a0d4763d9ecb4ccca719039
+ metadata.gz: 48e74e772cd689a18f735341f9cc1272494e26b8
+ data.tar.gz: 12737b00e5a24ae20ccb17c079b3fc3621e555c8
  SHA512:
- metadata.gz: 1a209890b6494fb17fed6c234656854a657dec8dc5d6012d752d6962339b8b5c06f8cd0facb15c96884f18bcabf4bb2a56aea53b84ad8326695ae43c4e57fe2b
- data.tar.gz: 108c423e4969c262bf00959fa6ae4d5855c0d35faf10d21c282d3a05413e2f4e9178767dfce86da7872231dc48ac7a81338c72a6b1f809d73ff2d7e3e11a7c9d
+ metadata.gz: 98d19dfb44b860c00401bcf2318e6a077101cd27a11fa7665d8d4b2d1a5bcbdfe11d0bd43e9b1da1e5e378dbe9eb92898b8132f2e6fe883e4df5fc0a0ae94138
+ data.tar.gz: e966d408e8edb14f66fce1ad6ce4f3053a53667a9e56ef26d3d3375311b3b4b83a794da859b174c67532dddaa4d01e6fb22cb0d5ec97bd0c856f2190bd0e4fb9
@@ -9,9 +9,10 @@ module Sitemaps
  # @param source [String] an XML string to parse.
  # @param max_entries [Integer, nil] the maximum number of entries to add to the sitemap.
  # @param filter [#call, nil] if provided, called per entry to filter the entry out of the sitemap.
+ # @param filter_indexes [Boolean, nil] if truthy, filter is called per submap as well as entries.
  # @return [Sitemap] the sitemap parsed from the XML string. If the XML string given is invalid,
  # a sitemap will still be returned, but the entries and sitemaps keys will be empty.
- def self.parse(source, max_entries: nil, filter: nil)
+ def self.parse(source, max_entries: nil, filter: nil, filter_indexes: nil)
  document = REXML::Document.new(source)
  entries = document.elements.to_a("/urlset/url").map do |root|
  loc = parse_loc(root) || next
@@ -29,7 +30,8 @@ module Sitemaps
  loc = parse_loc(root) || next
  mod = parse_lastmod(root)

- Sitemaps::Submap.new(loc, mod)
+ submap = Sitemaps::Submap.new(loc, mod)
+ (!filter || !filter_indexes || filter.call(submap)) ? submap : nil
  end.reject(&:nil?)

  Sitemaps::Sitemap.new(entries, sitemaps)
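The two hunks above add a filter_indexes option to Sitemaps::Parser.parse: when truthy, the filter is applied to Submap items from a sitemap index as well as to Entry items. A minimal usage sketch, assuming Submap exposes #loc the way Entry does; the XML, the URLs, and the "/blog/" check are illustrative and not part of the gem:

    require "sitemaps"

    xml = <<~XML
      <sitemapindex>
        <sitemap><loc>https://example.com/blog/sitemap.xml</loc></sitemap>
        <sitemap><loc>https://example.com/shop/sitemap.xml</loc></sitemap>
      </sitemapindex>
    XML

    # Keep only items under /blog/; with filter_indexes: true this applies to submaps too.
    filter  = ->(item) { item.loc.to_s.include?("/blog/") }
    sitemap = Sitemaps::Parser.parse(xml, filter: filter, filter_indexes: true)

    sitemap.sitemaps.map(&:loc)
    # => only the /blog/ submap; without filter_indexes both submaps would be kept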
@@ -1,3 +1,3 @@
  module Sitemaps
- VERSION = "0.2.0".freeze
+ VERSION = "0.2.1".freeze
  end
data/lib/sitemaps.rb CHANGED
@@ -50,7 +50,7 @@ module Sitemaps
  # a good idea to include, as many sites have _very_ large sitemaps.
  # @return [Sitemap]
  #
- # @overload fetch(url, fetcher: nil, max_entries: nil)
+ # @overload fetch(url, fetcher: nil, filter_indexes: nil, max_entries: nil)
  # If a block is given, it's used as a filter for entries before they're added to the sitemap.
  #
  # @param url [String, URI] the url of the sitemap in question.
@@ -58,17 +58,19 @@ module Sitemaps
  # @param max_entries [Integer] the maximum number of entries to include in the sitemap. Once the
  # sitemap has this many entries, further fetches and parsing will not occur. This is always
  # a good idea to include, as many sites have _very_ large sitemaps.
+ # @param filter_indexes [Boolean] if true, Submap instances will be run through the filter block
+ # as well as Entry instances.
  # @return [Sitemap]
  # @yield [Entry] Filters the entry from the sitemap if the block returns falsey.
  # @yieldreturn [Boolean] whether or not to include the entry in the sitemap.
- def self.fetch(url, fetcher: nil, max_entries: nil, &block)
+ def self.fetch(url, fetcher: nil, max_entries: nil, filter_indexes: nil, &block)
  fetcher ||= @default_fetcher
  unless url.is_a? URI
  url = "http://#{url}" unless url =~ %r{^https?://}
  url = URI.parse(url)
  end

- _instance.fetch_recursive(url, fetcher, max_entries, &block)
+ _instance.fetch_recursive(url, fetcher, max_entries, filter_indexes, &block)
  end

  # Discover, fetch and parse sitemaps from the given host.
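Sitemaps.fetch threads the same flag through to the parser, so a sitemap index can be pruned before its submaps are ever fetched. A hedged sketch; the URL, the entry limit, and the path prefix are placeholders:

    require "sitemaps"

    sitemap = Sitemaps.fetch("https://example.com/sitemap_index.xml",
                             max_entries: 500,
                             filter_indexes: true) do |item|
      # With filter_indexes: true the block also receives Submap instances from
      # sitemap indexes; returning falsey skips that submap before it is fetched.
      item.loc.to_s.start_with?("https://example.com/articles")
    end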
@@ -93,10 +95,12 @@ module Sitemaps
  # @param max_entries [Integer] the maximum number of entries to include in the sitemap. Once the
  # sitemap has this many entries, further fetches and parsing will not occur. This is always
  # a good idea to include, as many sites have _very_ large sitemaps.
+ # @param filter_indexes [Boolean] if true, Submap instances will be run through the filter block
+ # as well as Entry instances.
  # @return [Sitemap]
  # @yield [Entry] Filters the entry from the sitemap if the block returns falsey.
  # @yieldreturn [Boolean] whether or not to include the entry in the sitemap.
- def self.discover(url, fetcher: nil, max_entries: nil, &block)
+ def self.discover(url, fetcher: nil, max_entries: nil, filter_indexes: nil, &block)
  fetcher ||= @default_fetcher
  unless url.is_a? URI
  url = "http://#{url}" unless url =~ %r{^https?://}
@@ -104,7 +108,7 @@ module Sitemaps
  end

  roots = _instance.discover_roots(url, fetcher)
- _instance.fetch_recursive(roots, fetcher, max_entries, &block)
+ _instance.fetch_recursive(roots, fetcher, max_entries, filter_indexes, &block)
  end

  # @return [Instance]
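Sitemaps.discover takes the option in the same way, for the case where no sitemap URL is known up front; the host and the "archive" check are placeholders:

    require "sitemaps"

    sitemap = Sitemaps.discover("example.com", max_entries: 1000, filter_indexes: true) do |item|
      # Drop archive submaps and archive entries alike.
      !item.loc.to_s.include?("archive")
    end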
@@ -120,7 +124,7 @@ module Sitemaps
  class Instance
  # recursively fetch sitemaps and sitemap indexes from the given urls.
  # @return [Sitemap]
- def fetch_recursive(urls, fetcher, max_entries, &block)
+ def fetch_recursive(urls, fetcher, max_entries, filter_indexes, &block)
  queue = urls.is_a?(Array) ? urls : [urls]
  maps = {}

@@ -134,7 +138,7 @@ module Sitemaps

  # fetch this item in the queue, and queue up any sub maps it found
  source = fetcher.call(url)
- sitemap = Sitemaps::Parser.parse(source, max_entries: max_entries, filter: block)
+ sitemap = Sitemaps::Parser.parse(source, max_entries: max_entries, filter: block, filter_indexes: filter_indexes)

  # save the results and queue up any submaps it found
  maps[url] = sitemap
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: sitemaps_parser
  version: !ruby/object:Gem::Version
- version: 0.2.0
+ version: 0.2.1
  platform: ruby
  authors:
  - Jonathan Raphaelson
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2016-03-24 00:00:00.000000000 Z
+ date: 2016-03-25 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: bundler