arwen 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 28a4731079ae915acfb7f6b50e7210ef4e9cb46eaf2cca7072734c900e12c30f
4
+ data.tar.gz: 857982f366ff50a6d60d0ae2aa87a108b913c1ac9f67ae15294e7b9cb14c6597
5
+ SHA512:
6
+ metadata.gz: 8abe5eec16f455c42e3a0d799f9f1d758c376e7340b981a09c5b9a08e938f84afd52769d0af0a0703e810a2fb00114a7e8c06755ba9e3ebc7a7911bac87d84bd
7
+ data.tar.gz: e5b7e9157ae6ed2dd5cf09ec9850205dec2d2d03d7341fc14e72e0cfe71431a7a2b9358cb7a1345272bbcfb8ed4e46663ca13693d6d9058eede1f298ca55efce
@@ -0,0 +1,18 @@
1
# Continuous integration: runs the default rake task on every push and pull request.
name: Ruby

on:
  - push
  - pull_request

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - name: Set up Ruby
        uses: ruby/setup-ruby@v1
        with:
          # Quoted so the version is always read as a string, never as a number.
          ruby-version: "2.7.2"
      - name: Run the default task
        run: |
          gem install bundler -v 2.2.11
          bundle install
          bundle exec rake
data/.gitignore ADDED
@@ -0,0 +1,9 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/reports/
8
+ /tmp/
9
+ benchmarks/
data/.rubocop.yml ADDED
@@ -0,0 +1,21 @@
1
+ require:
2
+ - rubocop-rake
3
+ - rubocop-minitest
4
+
5
+ AllCops:
6
+ TargetRubyVersion: 2.5
7
+ NewCops: enable
8
+
9
+ Style/StringLiterals:
10
+ Enabled: true
11
+ EnforcedStyle: double_quotes
12
+
13
+ Style/StringLiteralsInInterpolation:
14
+ Enabled: true
15
+ EnforcedStyle: double_quotes
16
+
17
+ Layout/LineLength:
18
+ Max: 120
19
+
20
+ Style/Documentation:
21
+ Enabled: false
data/CHANGELOG.md ADDED
@@ -0,0 +1,5 @@
1
+ ## [Unreleased]
2
+
3
+ ## [0.1.0] - 2021-02-27
4
+
5
+ - Initial release
data/Gemfile ADDED
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
+ source "https://rubygems.org"
4
+
5
+ # Specify your gem's dependencies in arwen.gemspec
6
+ gemspec
7
+
8
+ gem "rake", "~> 13.0"
9
+
10
+ gem "minitest", "~> 5.0"
11
+
12
+ gem "rubocop", "~> 1.7"
13
+ gem "rubocop-minitest"
14
+ gem "rubocop-rake"
data/Gemfile.lock ADDED
@@ -0,0 +1,56 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ arwen (0.1.0)
5
+ ox (~> 2.14)
6
+ typhoeus (~> 1.4)
7
+
8
+ GEM
9
+ remote: https://rubygems.org/
10
+ specs:
11
+ ast (2.4.2)
12
+ ethon (0.12.0)
13
+ ffi (>= 1.3.0)
14
+ ffi (1.14.2)
15
+ minitest (5.14.4)
16
+ ox (2.14.1)
17
+ parallel (1.20.1)
18
+ parser (3.0.0.0)
19
+ ast (~> 2.4.1)
20
+ rainbow (3.0.0)
21
+ rake (13.0.3)
22
+ regexp_parser (2.1.1)
23
+ rexml (3.2.4)
24
+ rubocop (1.11.0)
25
+ parallel (~> 1.10)
26
+ parser (>= 3.0.0.0)
27
+ rainbow (>= 2.2.2, < 4.0)
28
+ regexp_parser (>= 1.8, < 3.0)
29
+ rexml
30
+ rubocop-ast (>= 1.2.0, < 2.0)
31
+ ruby-progressbar (~> 1.7)
32
+ unicode-display_width (>= 1.4.0, < 3.0)
33
+ rubocop-ast (1.4.1)
34
+ parser (>= 2.7.1.5)
35
+ rubocop-minitest (0.10.3)
36
+ rubocop (>= 0.87, < 2.0)
37
+ rubocop-rake (0.5.1)
38
+ rubocop
39
+ ruby-progressbar (1.11.0)
40
+ typhoeus (1.4.0)
41
+ ethon (>= 0.9.0)
42
+ unicode-display_width (2.0.0)
43
+
44
+ PLATFORMS
45
+ x86_64-darwin-20
46
+
47
+ DEPENDENCIES
48
+ arwen!
49
+ minitest (~> 5.0)
50
+ rake (~> 13.0)
51
+ rubocop (~> 1.7)
52
+ rubocop-minitest
53
+ rubocop-rake
54
+
55
+ BUNDLED WITH
56
+ 2.2.11
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2021 Adam Hake
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,62 @@
1
+ # Arwen
2
+
3
+ Arwen parses sitemaps that adhere to the sitemaps.org protocol. Inspired by benbalter's [sitemap-parser gem](https://github.com/benbalter/sitemap-parser), arwen automatically detects if recursion is needed by analyzing the presence of `<sitemapindex>`. It also leverages Typhoeus' parallel request functionality via `Typhoeus::Hydra` when needed to drastically speed up
4
+ fetching large sitemaps.
5
+
6
+ ## Installation
7
+
8
+ Add this line to your application's Gemfile:
9
+
10
+ ```ruby
11
+ gem 'arwen'
12
+ ```
13
+
14
+ And then execute:
15
+
16
+ $ bundle install
17
+
18
+ Or install it yourself as:
19
+
20
+ $ gem install arwen
21
+
22
+ ## Usage
23
+
24
+ To parse a sitemap, create a new `Arwen` instance, passing it the URL to the sitemap and, optionally, any options that should be passed to `Typhoeus::Request`.
25
+
26
+ ```ruby
27
+ arwen = Arwen.new("https://www.example.org/sitemap.xml")
28
+ ```
29
+ As long as the sitemap implements the sitemaps.org protocol, `arwen` will check if the sitemap should be parsed recursively (i.e. if the sitemap implements `<sitemapindex>`). The sitemap is not fetched until a method is called that accesses it.
30
+
31
+ `arwen` uses `Ox` as its XML parser. To access the raw `Ox::Document` sitemap, use the `sitemap` instance method. See the `Ox::Document` [documentation](http://www.ohler.com/ox/Ox/Document.html) for full details.
32
+
33
+ ```ruby
34
+ arwen.sitemap # Ox::Document
35
+ ```
36
+ The `urls` instance method returns an array of `Arwen::Url` objects. `Arwen::Url` is a simple object that models the `<url>` schema in the sitemaps.org protocol.
37
+
38
+ ```ruby
39
+ arwen.urls # Array<Arwen::Url>
40
+ ```
41
+
42
+ To get an array of just the url strings for the whole sitemap, use the `to_a` instance method:
43
+
44
+ ```ruby
45
+ arwen.to_a # Array<String>
46
+ ```
47
+
48
+ ## Development
49
+
50
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
51
+
52
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org).
53
+
54
+ ## Contributing
55
+
56
+ Bug reports and pull requests are welcome on GitHub at https://github.com/adamhake/arwen.
57
+
58
+ ## License
59
+
60
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
61
+
62
+ *Yes, the name is a reference to the Half-Elven daughter of Elrond.*
data/Rakefile ADDED
@@ -0,0 +1,16 @@
1
# frozen_string_literal: true

# Build tasks for the gem: `test` runs the files matching test/**/*_test.rb,
# `rubocop` lints the code, and the default task runs both in that order.
require "bundler/gem_tasks"
require "rake/testtask"
require "rubocop/rake_task"

Rake::TestTask.new(:test) do |test_task|
  test_task.libs.push("test", "lib")
  test_task.test_files = FileList["test/**/*_test.rb"]
end

RuboCop::RakeTask.new

task default: %i[test rubocop]
data/arwen.gemspec ADDED
@@ -0,0 +1,41 @@
1
# frozen_string_literal: true

# Put lib/ on the load path so the version constant can be read from the working tree.
lib = File.expand_path("lib", __dir__)
$LOAD_PATH.unshift lib unless $LOAD_PATH.include?(lib)
require "arwen/version"

Gem::Specification.new do |spec|
  spec.name = "arwen"
  spec.version = Arwen::VERSION
  spec.authors = ["Adam Hake"]
  spec.email = ["adamhake@gmail.com"]

  spec.summary = "Parses a sitemap recursively using Typhoeus"
  spec.description = "Sitemap parser that recursively parses a sitemap, leveraging Typhoeus' concurrent HTTP requests"
  spec.homepage = "https://github.com/adamhake/arwen"
  spec.license = "MIT"
  spec.required_ruby_version = Gem::Requirement.new(">= 2.5.0")

  # The homepage is the public GitHub repository, so it doubles as the source code URI.
  spec.metadata["homepage_uri"] = spec.homepage
  spec.metadata["source_code_uri"] = spec.homepage

  # Specify which files should be added to the gem when it is released.
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
  spec.files = Dir.chdir(File.expand_path(__dir__)) do
    `git ls-files -z`.split("\x0").reject { |f| f.match(%r{\A(?:test|spec|features)/}) }
  end
  spec.bindir = "exe"
  spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  # Runtime dependencies: Ox parses the sitemap XML, Typhoeus performs the HTTP requests.
  spec.add_dependency "ox", "~> 2.14"
  spec.add_dependency "typhoeus", "~> 1.4"

  # For more information and examples about making a new gem, check out the
  # guide at: https://bundler.io/guides/creating_gem.html
end
data/bin/console ADDED
@@ -0,0 +1,15 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require "bundler/setup"
5
+ require "arwen"
6
+
7
+ # You can add fixtures and/or initialization code here to make experimenting
8
+ # with your gem easier. You can also use a different console, if you like.
9
+
10
+ # (If you use this, don't forget to add pry to your Gemfile!)
11
+ # require "pry"
12
+ # Pry.start
13
+
14
+ require "irb"
15
+ IRB.start(__FILE__)
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
data/lib/arwen.rb ADDED
@@ -0,0 +1,97 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "arwen/version"
4
+ require "arwen/url"
5
+ require "ox"
6
+ require "typhoeus"
7
+
8
+ # Parses a sitemap url and provides all links provided by the sitemap or sitemap_index.
9
+ # It uses Typhoeus for network requests and making concurrent requests when parsing a sitemap_index.
10
+ # Ox is the XML parser used to parse the sitemap. Sitemaps are assumed to follow the sitemaps.org protocol.
11
+ #
12
+ # @see https://github.com/typhoeus/typhoeus
13
+ # @see https://github.com/ohler55/ox
14
+ # @see https://www.sitemaps.org/protocol.html
15
class Arwen
  # Create a new Arwen instance. Nothing is fetched until the sitemap is first accessed.
  #
  # @param [String] url the full URL to the sitemap or sitemap_index XML file
  # @param [Hash] opts options passed to Typhoeus::Request instances; the hash itself is left unmodified
  # @option opts [Integer] :max_concurrency (200) maximum concurrent requests passed to Typhoeus::Hydra
  # @see https://rubydoc.info/github/typhoeus/typhoeus/Typhoeus/Request
  def initialize(url, opts = {})
    @url = url
    # Work on a copy so removing :max_concurrency does not alter the caller's hash.
    request_opts = opts.dup
    max_concurrency = request_opts.delete(:max_concurrency) { 200 }
    @opts = { followlocation: true }.merge(request_opts)
    @hydra = Typhoeus::Hydra.new(max_concurrency: max_concurrency)
  end

  # Fetches and returns all urls for the sitemap with corresponding <url> sitemap schema metadata.
  # The result is memoized after the first call.
  #
  # @return [Array<Arwen::Url>]
  # @raise [RuntimeError] if a sitemap cannot be fetched or is not a urlset/sitemapindex document
  def urls
    @urls ||= all_urls(sitemap)
  end

  # Returns an array of url strings for all URLs in the sitemap
  #
  # @return [Array<String>]
  def to_a
    urls.map(&:url)
  end

  # Fetches the sitemap url and parses it with Ox. The result is memoized after the first call.
  #
  # @return [Ox::Document]
  # @raise [RuntimeError] if the request for the sitemap url is not successful
  # @see http://www.ohler.com/ox/Ox/Document.html
  def sitemap
    @sitemap ||= raw_sitemap
  end

  private

  # Dispatches to index or single-sitemap parsing depending on whether the root has <sitemap> children.
  def all_urls(sitemap)
    return parse_multiple_sitemaps(sitemap) if root_of(sitemap).respond_to?(:sitemap)

    parse_single_sitemap(sitemap)
  end

  # Fetches every child sitemap listed in a <sitemapindex> and collects the urls of all of them.
  def parse_multiple_sitemaps(sitemap)
    index = root_of(sitemap)
    raise "invalid sitemap format" unless index&.value == "sitemapindex"

    child_urls = index.locate("sitemap/loc/*")
    requests = fetch_sitemaps(child_urls)

    # Pair each request with the url it was built from so a failure can name the offending sitemap.
    child_urls.zip(requests).flat_map do |child_url, request|
      response = request.response
      # Same success check as raw_sitemap, so an error page is never handed to the XML parser.
      raise "invalid sitemap url for #{child_url}" unless response.success?

      parse_single_sitemap(Ox.load(response.body))
    end
  end

  # Queues one request per url on the shared hydra, runs them, and returns the completed requests
  # in the same order as the given urls.
  def fetch_sitemaps(urls)
    requests = urls.map do |url|
      req = Typhoeus::Request.new(url, @opts)
      @hydra.queue(req)
      req
    end
    @hydra.run

    requests
  end

  # Builds one Url per <url> element of a <urlset> document.
  def parse_single_sitemap(sitemap)
    urlset = root_of(sitemap)
    raise "invalid sitemap format" unless urlset&.value == "urlset"

    # Only <url> children are wrapped; other node kinds (e.g. comments) cannot be queried by Url.
    urlset.locate("url").map { |node| Url.new(node) }
  end

  # Returns the root element of a parsed sitemap.
  # NOTE(review): Ox.load appears to return a bare Ox::Element instead of an Ox::Document when the
  # XML has no prolog; in that case the element itself is treated as the root.
  def root_of(parsed)
    parsed.is_a?(Ox::Document) ? parsed.root : parsed
  end

  # Downloads the sitemap at @url and parses the response body with Ox.
  def raw_sitemap
    response = Typhoeus.get(@url, @opts)
    raise "invalid sitemap url for #{@url}" unless response.success?

    Ox.load(response.body)
  end
end
data/lib/arwen/url.rb ADDED
@@ -0,0 +1,45 @@
1
# frozen_string_literal: true

# Date is not loaded by default; #to_date needs it.
require "date"

class Arwen
  # Models the sitemap <url> schema definition according to the sitemaps.org protocol
  #
  # @see https://www.sitemaps.org/protocol.html#urldef
  class Url
    # @return [String, nil] <loc> schema value
    attr_accessor :url

    # @return [String, nil] <lastmod> schema value
    attr_accessor :lastmod

    # @return [Float, nil] <priority> schema value
    attr_accessor :priority

    # @return [String, nil] <changefreq> schema value
    attr_accessor :changefreq

    # The Ox::Element object used to initialize the Url instance
    # @return [Ox::Element]
    attr_reader :raw

    # Create a new Arwen::Url
    #
    # Each attribute is the first match of the corresponding child path and is
    # nil when `locate` finds nothing for it.
    #
    # @param [Ox::Element] ox_element element in the sitemap tree
    # @see http://www.ohler.com/ox/Ox/Element.html
    def initialize(ox_element)
      @url = ox_element.locate("loc/*").first
      @lastmod = ox_element.locate("lastmod/*").first
      # Safe navigation keeps a missing <priority> as nil instead of coercing it to 0.0.
      @priority = ox_element.locate("priority/*").first&.to_f
      @changefreq = ox_element.locate("changefreq/*").first
      @raw = ox_element
    end

    # Converts the string lastmod value to a `Date` object
    #
    # @return [Date, nil] the parsed date, or nil when lastmod is nil
    def to_date
      return nil if lastmod.nil?

      Date.parse(lastmod)
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
class Arwen
  # Version string of the gem.
  VERSION = "0.1.0"
end
metadata ADDED
@@ -0,0 +1,88 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: arwen
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Adam Hake
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2021-03-02 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: ox
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '2.14'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '2.14'
27
+ - !ruby/object:Gem::Dependency
28
+ name: typhoeus
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.4'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1.4'
41
+ description: Sitemap Parser recursively parses a sitemap leveraging typheous concurrent
42
+ https request
43
+ email:
44
+ - adamhake@gmail.com
45
+ executables: []
46
+ extensions: []
47
+ extra_rdoc_files: []
48
+ files:
49
+ - ".github/workflows/main.yml"
50
+ - ".gitignore"
51
+ - ".rubocop.yml"
52
+ - CHANGELOG.md
53
+ - Gemfile
54
+ - Gemfile.lock
55
+ - LICENSE.txt
56
+ - README.md
57
+ - Rakefile
58
+ - arwen.gemspec
59
+ - bin/console
60
+ - bin/setup
61
+ - lib/arwen.rb
62
+ - lib/arwen/url.rb
63
+ - lib/arwen/version.rb
64
+ homepage: https://github.com/adamhake/arwen
65
+ licenses:
66
+ - MIT
67
+ metadata:
68
+ homepage_uri: https://github.com/adamhake/arwen
69
+ post_install_message:
70
+ rdoc_options: []
71
+ require_paths:
72
+ - lib
73
+ required_ruby_version: !ruby/object:Gem::Requirement
74
+ requirements:
75
+ - - ">="
76
+ - !ruby/object:Gem::Version
77
+ version: 2.5.0
78
+ required_rubygems_version: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ requirements: []
84
+ rubygems_version: 3.1.4
85
+ signing_key:
86
+ specification_version: 4
87
+ summary: Parses a sitemap recursively using typheous
88
+ test_files: []