arwen 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 28a4731079ae915acfb7f6b50e7210ef4e9cb46eaf2cca7072734c900e12c30f
4
+ data.tar.gz: 857982f366ff50a6d60d0ae2aa87a108b913c1ac9f67ae15294e7b9cb14c6597
5
+ SHA512:
6
+ metadata.gz: 8abe5eec16f455c42e3a0d799f9f1d758c376e7340b981a09c5b9a08e938f84afd52769d0af0a0703e810a2fb00114a7e8c06755ba9e3ebc7a7911bac87d84bd
7
+ data.tar.gz: e5b7e9157ae6ed2dd5cf09ec9850205dec2d2d03d7341fc14e72e0cfe71431a7a2b9358cb7a1345272bbcfb8ed4e46663ca13693d6d9058eede1f298ca55efce
@@ -0,0 +1,18 @@
1
+ name: Ruby
2
+
3
+ on: [push,pull_request]
4
+
5
+ jobs:
6
+ build:
7
+ runs-on: ubuntu-latest
8
+ steps:
9
+ - uses: actions/checkout@v2
10
+ - name: Set up Ruby
11
+ uses: ruby/setup-ruby@v1
12
+ with:
13
+ ruby-version: 2.7.2
14
+ - name: Run the default task
15
+ run: |
16
+ gem install bundler -v 2.2.11
17
+ bundle install
18
+ bundle exec rake
data/.gitignore ADDED
@@ -0,0 +1,9 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/reports/
8
+ /tmp/
9
+ benchmarks/
data/.rubocop.yml ADDED
@@ -0,0 +1,21 @@
1
+ require:
2
+ - rubocop-rake
3
+ - rubocop-minitest
4
+
5
+ AllCops:
6
+ TargetRubyVersion: 2.5
7
+ NewCops: enable
8
+
9
+ Style/StringLiterals:
10
+ Enabled: true
11
+ EnforcedStyle: double_quotes
12
+
13
+ Style/StringLiteralsInInterpolation:
14
+ Enabled: true
15
+ EnforcedStyle: double_quotes
16
+
17
+ Layout/LineLength:
18
+ Max: 120
19
+
20
+ Style/Documentation:
21
+ Enabled: false
data/CHANGELOG.md ADDED
@@ -0,0 +1,5 @@
1
+ ## [Unreleased]
2
+
3
+ ## [0.1.0] - 2021-02-27
4
+
5
+ - Initial release
data/Gemfile ADDED
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
+ source "https://rubygems.org"
4
+
5
+ # Specify your gem's dependencies in arwen.gemspec
6
+ gemspec
7
+
8
+ gem "rake", "~> 13.0"
9
+
10
+ gem "minitest", "~> 5.0"
11
+
12
+ gem "rubocop", "~> 1.7"
13
+ gem "rubocop-minitest"
14
+ gem "rubocop-rake"
data/Gemfile.lock ADDED
@@ -0,0 +1,56 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ arwen (0.1.0)
5
+ ox (~> 2.14)
6
+ typhoeus (~> 1.4)
7
+
8
+ GEM
9
+ remote: https://rubygems.org/
10
+ specs:
11
+ ast (2.4.2)
12
+ ethon (0.12.0)
13
+ ffi (>= 1.3.0)
14
+ ffi (1.14.2)
15
+ minitest (5.14.4)
16
+ ox (2.14.1)
17
+ parallel (1.20.1)
18
+ parser (3.0.0.0)
19
+ ast (~> 2.4.1)
20
+ rainbow (3.0.0)
21
+ rake (13.0.3)
22
+ regexp_parser (2.1.1)
23
+ rexml (3.2.4)
24
+ rubocop (1.11.0)
25
+ parallel (~> 1.10)
26
+ parser (>= 3.0.0.0)
27
+ rainbow (>= 2.2.2, < 4.0)
28
+ regexp_parser (>= 1.8, < 3.0)
29
+ rexml
30
+ rubocop-ast (>= 1.2.0, < 2.0)
31
+ ruby-progressbar (~> 1.7)
32
+ unicode-display_width (>= 1.4.0, < 3.0)
33
+ rubocop-ast (1.4.1)
34
+ parser (>= 2.7.1.5)
35
+ rubocop-minitest (0.10.3)
36
+ rubocop (>= 0.87, < 2.0)
37
+ rubocop-rake (0.5.1)
38
+ rubocop
39
+ ruby-progressbar (1.11.0)
40
+ typhoeus (1.4.0)
41
+ ethon (>= 0.9.0)
42
+ unicode-display_width (2.0.0)
43
+
44
+ PLATFORMS
45
+ x86_64-darwin-20
46
+
47
+ DEPENDENCIES
48
+ arwen!
49
+ minitest (~> 5.0)
50
+ rake (~> 13.0)
51
+ rubocop (~> 1.7)
52
+ rubocop-minitest
53
+ rubocop-rake
54
+
55
+ BUNDLED WITH
56
+ 2.2.11
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2021 Adam Hake
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,62 @@
1
+ # Arwen
2
+
3
+ Arwen parses sitemaps that adhere to the sitemaps.org protocol. Inspired by benbalter's [sitemap-parser gem](https://github.com/benbalter/sitemap-parser), arwen automatically detects if recursion is needed by analyzing the presence of `<sitemapindex>`. It also leverages Typhoeus' parallel request functionality via `Typhoeus::Hydra` when needed to drastically speed up
4
+ fetching large sitemaps.
5
+
6
+ ## Installation
7
+
8
+ Add this line to your application's Gemfile:
9
+
10
+ ```ruby
11
+ gem 'arwen'
12
+ ```
13
+
14
+ And then execute:
15
+
16
+ $ bundle install
17
+
18
+ Or install it yourself as:
19
+
20
+ $ gem install arwen
21
+
22
+ ## Usage
23
+
24
+ To parse a sitemap, create a new `Arwen` instance, passing it the URL to the sitemap and, optionally, any options that should be passed to `Typhoeus::Request`.
25
+
26
+ ```ruby
27
+ arwen = Arwen.new("https://www.example.org/sitemap.xml")
28
+ ```
29
+ As long as the sitemap implements the sitemaps.org protocol, `arwen` will check if the sitemap should be parsed recursively (i.e. if the sitemap implements `<sitemapindex>`). The sitemap is not fetched until a method is called that accesses it.
30
+
31
+ `arwen` uses `Ox` as its XML parser. To access the raw `Ox::Document` sitemap, use the `sitemap` instance method. See the `Ox::Document` [documentation](http://www.ohler.com/ox/Ox/Document.html) for full details.
32
+
33
+ ```ruby
34
+ arwen.sitemap # Ox::Document
35
+ ```
36
+ The `urls` instance method returns an array of `Arwen::Url` objects. `Arwen::Url` is a simple object that models the `<url>` schema in the sitemaps.org protocol.
37
+
38
+ ```ruby
39
+ arwen.urls # Array<Arwen::Url>
40
+ ```
41
+
42
+ To get an array of just the url strings for the whole sitemap, use the `to_a` instance method:
43
+
44
+ ```ruby
45
+ arwen.to_a # Array<String>
46
+ ```
47
+
48
+ ## Development
49
+
50
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
51
+
52
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org).
53
+
54
+ ## Contributing
55
+
56
+ Bug reports and pull requests are welcome on GitHub at https://github.com/adamhake/arwen.
57
+
58
+ ## License
59
+
60
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
61
+
62
+ *Yes, the name is a reference to the Half-Elven daughter of Elrond.*
data/Rakefile ADDED
@@ -0,0 +1,16 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "bundler/gem_tasks"
4
+ require "rake/testtask"
5
+
6
+ Rake::TestTask.new(:test) do |t|
7
+ t.libs << "test"
8
+ t.libs << "lib"
9
+ t.test_files = FileList["test/**/*_test.rb"]
10
+ end
11
+
12
+ require "rubocop/rake_task"
13
+
14
+ RuboCop::RakeTask.new
15
+
16
+ task default: %i[test rubocop]
data/arwen.gemspec ADDED
@@ -0,0 +1,41 @@
1
+ # frozen_string_literal: true
2
+
3
+ lib = File.expand_path("lib", __dir__)
4
+ $LOAD_PATH.unshift lib unless $LOAD_PATH.include?(lib)
5
+ require "arwen/version"
6
+
7
+ Gem::Specification.new do |spec|
8
+ spec.name = "arwen"
9
+ spec.version = Arwen::VERSION
10
+ spec.authors = ["Adam Hake"]
11
+ spec.email = ["adamhake@gmail.com"]
12
+
13
+ spec.summary = "Parses a sitemap recursively using typheous"
14
+ spec.description = "Sitemap Parser recursively parses a sitemap leveraging typheous concurrent https request"
15
+ spec.homepage = "https://github.com/adamhake/arwen"
16
+ spec.license = "MIT"
17
+ spec.required_ruby_version = Gem::Requirement.new(">= 2.5.0")
18
+
19
+ # spec.metadata["allowed_push_host"] = "TODO: Set to 'http://mygemserver.com'"
20
+
21
+ spec.metadata["homepage_uri"] = spec.homepage
22
+ # spec.metadata["source_code_uri"] = "TODO: Put your gem's public repo URL here."
23
+ # spec.metadata["changelog_uri"] = "TODO: Put your gem's CHANGELOG.md URL here."
24
+
25
+ # Specify which files should be added to the gem when it is released.
26
+ # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
27
+ spec.files = Dir.chdir(File.expand_path(__dir__)) do
28
+ `git ls-files -z`.split("\x0").reject { |f| f.match(%r{\A(?:test|spec|features)/}) }
29
+ end
30
+ spec.bindir = "exe"
31
+ spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
32
+ spec.require_paths = ["lib"]
33
+
34
+ # Uncomment to register a new dependency of your gem
35
+ # spec.add_dependency "example-gem", "~> 1.0"
36
+ spec.add_dependency "ox", "~> 2.14"
37
+ spec.add_dependency "typhoeus", "~> 1.4"
38
+
39
+ # For more information and examples about making a new gem, checkout our
40
+ # guide at: https://bundler.io/guides/creating_gem.html
41
+ end
data/bin/console ADDED
@@ -0,0 +1,15 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require "bundler/setup"
5
+ require "arwen"
6
+
7
+ # You can add fixtures and/or initialization code here to make experimenting
8
+ # with your gem easier. You can also use a different console, if you like.
9
+
10
+ # (If you use this, don't forget to add pry to your Gemfile!)
11
+ # require "pry"
12
+ # Pry.start
13
+
14
+ require "irb"
15
+ IRB.start(__FILE__)
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
data/lib/arwen.rb ADDED
@@ -0,0 +1,97 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "arwen/version"
4
+ require "arwen/url"
5
+ require "ox"
6
+ require "typhoeus"
7
+
8
+ # Parses a sitemap url and provides all links provided by the sitemap or sitemap_index.
9
+ # It uses Typhoeus for network requests and making concurrent requests when parsing a sitemap_index.
10
+ # Ox is the XML parser used to parse the sitemap. Sitemaps are assumed to follow the sitemaps.org protocol.
11
+ #
12
+ # @see https://github.com/typhoeus/typhoeus
13
+ # @see https://github.com/ohler55/ox
14
+ # @see https://www.sitemaps.org/protocol.html
15
+ class Arwen
16
+ # Create a new Arwen instance
17
+ #
18
+ # @param [string] url the full URL to the sitemap or sitemap_index XML file
19
+ # @param [hash] opts options passed to Typheous::Request instances.
20
+ # @option opts [integer] :max_concurrency maximum concurrent requests passed to Typheous::Hydra
21
+ # @see https://rubydoc.info/github/typhoeus/typhoeus/Typhoeus/Request
22
+ def initialize(url, opts = {})
23
+ @url = url
24
+ max_concurrency = opts.delete(:max_concurrency) { 200 }
25
+ @opts = { followlocation: true }.merge(opts)
26
+ @hydra = Typhoeus::Hydra.new(max_concurrency: max_concurrency)
27
+ end
28
+
29
+ # fetches and memoizes all urls for the sitemap with corresponding <url> sitemap schema metadata
30
+ #
31
+ # @return [Array<Arwen::Url>]
32
+ def urls
33
+ @urls ||= all_urls(sitemap)
34
+ end
35
+
36
+ # returns an array of url strings for all URLs in the sitemap
37
+ #
38
+ # @return [Array<String>]
39
+ def to_a
40
+ urls.map(&:url)
41
+ end
42
+
43
+ # fetches the sitemap url and parses it to an Ox::Document instance (memoized after the first call)
44
+ #
45
+ # @return [Ox::Document]
46
+ # @see http://www.ohler.com/ox/Ox/Document.html
47
+ def sitemap
48
+ @sitemap ||= raw_sitemap
49
+ end
50
+
51
+ private
52
+
53
+ def all_urls(sitemap)
54
+ return parse_multiple_sitemaps(sitemap) if sitemap.root.respond_to?(:sitemap)
55
+
56
+ parse_single_sitemap(sitemap)
57
+ end
58
+
59
+ def parse_multiple_sitemaps(sitemap)
60
+ raise "invalid sitemap format" unless sitemap&.root&.value == "sitemapindex"
61
+
62
+ urls = sitemap.root.locate("sitemap/loc/*")
63
+ site_urls = []
64
+
65
+ requests = fetch_sitemaps(urls)
66
+ requests.each do |req|
67
+ sitemap = Ox.load(req.response.body)
68
+ site_urls += parse_single_sitemap(sitemap)
69
+ end
70
+
71
+ site_urls
72
+ end
73
+
74
+ def fetch_sitemaps(urls)
75
+ requests = urls.map do |url|
76
+ req = Typhoeus::Request.new(url, @opts)
77
+ @hydra.queue(req)
78
+ req
79
+ end
80
+ @hydra.run
81
+
82
+ requests
83
+ end
84
+
85
+ def parse_single_sitemap(sitemap)
86
+ raise "invalid sitemap format" unless sitemap&.root&.value == "urlset"
87
+
88
+ sitemap.root.nodes.map { |node| Url.new(node) }
89
+ end
90
+
91
+ def raw_sitemap
92
+ response = Typhoeus.get(@url, @opts) # one-off request made outside the hydra, with the stored options
93
+ raise "invalid sitemap url for #{@url}" unless response.success?
94
+
95
+ Ox.load(response.body) # parsed document; #sitemap memoizes this result
96
+ end
97
+ end
data/lib/arwen/url.rb ADDED
@@ -0,0 +1,45 @@
1
+ # frozen_string_literal: true
2
+
3
+ class Arwen
4
+ # Models the sitemap <url> schema definition according to the sitemaps.org protocol
5
+ #
6
+ # @see https://www.sitemaps.org/protocol.html#urldef
7
+ class Url
8
+ # @return [string] <loc> schema value
9
+ attr_accessor :url
10
+
11
+ # @return [string] <lastmod> schema value
12
+ attr_accessor :lastmod
13
+
14
+ # @return [float] <priority> schema value
15
+ attr_accessor :priority
16
+
17
+ # @return [string] <changefreq> schema value
18
+ attr_accessor :changefreq
19
+
20
+ # The Ox::Element object used to initialize the Url instance
21
+ # @return [Ox::Element]
22
+ attr_reader :raw
23
+
24
+ # Create a new Arwen::Url
25
+ #
26
+ # @param [Ox::Element] ox_element element in the sitemap tree
27
+ # @see http://www.ohler.com/ox/Ox/Element.html
28
+ def initialize(ox_element)
29
+ @url = ox_element.locate("loc/*").first
30
+ @lastmod = ox_element.locate("lastmod/*").first
31
+ @priority = ox_element.locate("priority/*").first&.to_f # stays nil when nothing is located
32
+ @changefreq = ox_element.locate("changefreq/*").first
33
+ @raw = ox_element
34
+ end
35
+
36
+ # converts the string lastmod value to a `Date` object
37
+ #
38
+ # @return [Date]
39
+ def to_date
40
+ return nil if lastmod.nil?
41
+
42
+ Date.parse(lastmod)
43
+ end
44
+ end
45
+ end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ class Arwen
4
+ VERSION = "0.1.0"
5
+ end
metadata ADDED
@@ -0,0 +1,88 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: arwen
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Adam Hake
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2021-03-02 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: ox
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '2.14'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '2.14'
27
+ - !ruby/object:Gem::Dependency
28
+ name: typhoeus
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.4'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1.4'
41
+ description: Sitemap Parser recursively parses a sitemap leveraging typheous concurrent
42
+ https request
43
+ email:
44
+ - adamhake@gmail.com
45
+ executables: []
46
+ extensions: []
47
+ extra_rdoc_files: []
48
+ files:
49
+ - ".github/workflows/main.yml"
50
+ - ".gitignore"
51
+ - ".rubocop.yml"
52
+ - CHANGELOG.md
53
+ - Gemfile
54
+ - Gemfile.lock
55
+ - LICENSE.txt
56
+ - README.md
57
+ - Rakefile
58
+ - arwen.gemspec
59
+ - bin/console
60
+ - bin/setup
61
+ - lib/arwen.rb
62
+ - lib/arwen/url.rb
63
+ - lib/arwen/version.rb
64
+ homepage: https://github.com/adamhake/arwen
65
+ licenses:
66
+ - MIT
67
+ metadata:
68
+ homepage_uri: https://github.com/adamhake/arwen
69
+ post_install_message:
70
+ rdoc_options: []
71
+ require_paths:
72
+ - lib
73
+ required_ruby_version: !ruby/object:Gem::Requirement
74
+ requirements:
75
+ - - ">="
76
+ - !ruby/object:Gem::Version
77
+ version: 2.5.0
78
+ required_rubygems_version: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ requirements: []
84
+ rubygems_version: 3.1.4
85
+ signing_key:
86
+ specification_version: 4
87
+ summary: Parses a sitemap recursively using typheous
88
+ test_files: []