earl 0.3.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. checksums.yaml +7 -0
  2. data/.github/workflows/ruby-tests.yml +32 -0
  3. data/.gitignore +5 -0
  4. data/.rubocop.yml +35 -0
  5. data/.rubocop_todo.yml +22 -0
  6. data/.ruby-gemset +1 -0
  7. data/.ruby-version +1 -0
  8. data/Gemfile +13 -1
  9. data/Guardfile +15 -0
  10. data/LICENSE +2 -2
  11. data/README.md +127 -25
  12. data/Rakefile +10 -2
  13. data/earl.gemspec +19 -14
  14. data/lib/earl/earl.rb +172 -0
  15. data/lib/earl/scraper.rb +92 -0
  16. data/lib/earl/version.rb +4 -2
  17. data/lib/earl.rb +11 -20
  18. data/spec/fixtures/bicycles.html +490 -0
  19. data/spec/fixtures/bicycles_without_description.html +489 -0
  20. data/spec/fixtures/bicycles_without_images.html +457 -0
  21. data/spec/fixtures/cassettes/feed/is_atom_feed.yml +2298 -0
  22. data/spec/fixtures/cassettes/feed/is_rss_feed.yml +48 -0
  23. data/spec/fixtures/cassettes/feed/no_feed.yml +69 -0
  24. data/spec/fixtures/cassettes/feed/with_atom_and_rss_feed.yml +1471 -0
  25. data/spec/fixtures/cassettes/feed/with_rss_feed.yml +47 -0
  26. data/spec/fixtures/cassettes/oembed/no_oembed.yml +101 -0
  27. data/spec/fixtures/cassettes/oembed/youtube_oembed.yml +129 -0
  28. data/spec/fixtures/page_as_atom.html +161 -0
  29. data/spec/fixtures/page_as_rss.html +151 -0
  30. data/spec/fixtures/page_with_atom_feed.html +39 -0
  31. data/spec/fixtures/page_with_rss_and_atom_feeds.html +40 -0
  32. data/spec/fixtures/page_with_rss_feed.html +39 -0
  33. data/spec/fixtures/page_without_feeds.html +36 -0
  34. data/spec/fixtures/youtube.html +1839 -0
  35. data/spec/integration/feed_spec.rb +78 -0
  36. data/spec/integration/oembed_spec.rb +36 -0
  37. data/spec/spec_helper.rb +21 -29
  38. data/spec/support/fixtures.rb +15 -0
  39. data/spec/support/vcr.rb +9 -0
  40. data/spec/unit/earl/earl_spec.rb +15 -0
  41. data/spec/unit/earl/feed_spec.rb +62 -0
  42. data/spec/unit/earl/oembed_spec.rb +50 -0
  43. data/spec/unit/earl/scraper_spec.rb +49 -0
  44. data/spec/unit/earl_spec.rb +74 -0
  45. metadata +90 -62
  46. data/.rvmrc +0 -48
  47. data/lib/earl/email_assembler.rb +0 -11
  48. data/lib/earl/email_entity.rb +0 -27
  49. data/lib/earl/email_parser.tt +0 -58
  50. data/lib/earl/entity_base.rb +0 -37
  51. data/lib/earl/hash_inquirer.rb +0 -16
  52. data/lib/earl/string_inquirer.rb +0 -11
  53. data/lib/earl/url_assembler.rb +0 -15
  54. data/lib/earl/url_entity.rb +0 -23
  55. data/lib/earl/url_parser.tt +0 -163
  56. data/spec/earl/earl_spec.rb +0 -17
  57. data/spec/earl/email_entity_spec.rb +0 -31
  58. data/spec/earl/email_parser_spec.rb +0 -29
  59. data/spec/earl/entity_base_spec.rb +0 -39
  60. data/spec/earl/hash_inquirer_spec.rb +0 -24
  61. data/spec/earl/string_inquirer_spec.rb +0 -9
  62. data/spec/earl/url_entity_spec.rb +0 -45
  63. data/spec/earl/url_parser_spec.rb +0 -189
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: bfb59215c79763794ebf311cff3e939a0be46e4557abf8bbd8c358693b12e1ee
4
+ data.tar.gz: 550ea900108a04f4d12935bcd7fa398b95c886f59847c2492a7d90b96495f042
5
+ SHA512:
6
+ metadata.gz: 16102c9eee3ff031411da1aacd80371a7474ea3ff5b70dad9a77ad83efe63a943db7b845e8b461a4c44de7685e8933c903a2c2511c41df00f69759cd06ece54c
7
+ data.tar.gz: 3b724772932a18d2de4f589959c28968fa9cc93cf158bca8ac4360166ae3b04877e8bb6dc09338329da94fbbccee4891819598f1e0a4a642d858d13362f9219f
@@ -0,0 +1,32 @@
1
+ name: Ruby Tests
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+ pull_request:
8
+ branches:
9
+ - main
10
+
11
+ jobs:
12
+ test:
13
+ runs-on: ubuntu-latest
14
+
15
+ steps:
16
+ - name: Checkout code
17
+ uses: actions/checkout@v3
18
+
19
+ - name: Set up Ruby
20
+ uses: ruby/setup-ruby@v1
21
+ with:
22
+ ruby-version: 3.3
23
+ bundler-cache: true
24
+
25
+ - name: Install dependencies
26
+ run: bundle install
27
+
28
+ - name: Run tests
29
+ run: bundle exec rspec
30
+
31
+ - name: Run rubocop
32
+ run: bundle exec rubocop
data/.gitignore CHANGED
@@ -15,3 +15,8 @@ spec/reports
15
15
  test/tmp
16
16
  test/version_tmp
17
17
  tmp
18
+ *.bundle
19
+ *.so
20
+ *.o
21
+ *.a
22
+ mkmf.log
data/.rubocop.yml ADDED
@@ -0,0 +1,35 @@
1
+ inherit_from: .rubocop_todo.yml
2
+
3
+ AllCops:
4
+ NewCops: disable
5
+ SuggestExtensions: false
6
+ TargetRubyVersion: 3.0
7
+
8
+ Layout/LineLength:
9
+ Max: 199
10
+
11
+ Metrics/BlockLength:
12
+ Max: 63
13
+
14
+ Metrics/ClassLength:
15
+ Max: 200
16
+
17
+ Metrics/CyclomaticComplexity:
18
+ Max: 8
19
+
20
+ Metrics/MethodLength:
21
+ Max: 20
22
+
23
+ Style/ClassVars:
24
+ Exclude:
25
+ - 'lib/earl/scraper.rb'
26
+
27
+ Style/Documentation:
28
+ Exclude:
29
+ - 'spec/**/*'
30
+
31
+ Style/ClassAndModuleChildren:
32
+ EnforcedStyleForClasses: compact
33
+
34
+ Layout/FirstHashElementIndentation:
35
+ EnforcedStyle: consistent
data/.rubocop_todo.yml ADDED
@@ -0,0 +1,22 @@
1
+ # This configuration was generated by
2
+ # `rubocop --auto-gen-config`
3
+ # on 2025-07-29 10:15:32 UTC using RuboCop version 1.79.0.
4
+ # The point is for the user to remove these configuration records
5
+ # one by one as the offenses are removed from the code base.
6
+ # Note that changes in the inspected code, or installation of new
7
+ # versions of RuboCop, may require this file to be generated again.
8
+
9
+ # Offense count: 7
10
+ # This cop supports unsafe autocorrection (--autocorrect-all).
11
+ # Configuration parameters: AllowSafeAssignment.
12
+ Lint/AssignmentInCondition:
13
+ Exclude:
14
+ - 'lib/earl/earl.rb'
15
+ - 'lib/earl/scraper.rb'
16
+
17
+ # Offense count: 2
18
+ # Configuration parameters: AllowedMethods.
19
+ # AllowedMethods: enums
20
+ Lint/ConstantDefinitionInBlock:
21
+ Exclude:
22
+ - 'spec/unit/earl/scraper_spec.rb'
data/.ruby-gemset ADDED
@@ -0,0 +1 @@
1
+ earl
data/.ruby-version ADDED
@@ -0,0 +1 @@
1
+ 3.3.3
data/Gemfile CHANGED
@@ -1,4 +1,16 @@
1
+ # frozen_string_literal: true
2
+
1
3
  source 'https://rubygems.org'
2
4
 
3
- # Specify your gem's dependencies in earl.gemspec
5
+ # Specify gem dependencies in earl.gemspec
4
6
  gemspec
7
+
8
+ # development dependencies
9
+ gem 'bundler', '>= 2.2.33'
10
+ gem 'guard-rspec'
11
+ gem 'guard-rubocop'
12
+ gem 'rake'
13
+ gem 'rspec'
14
+ gem 'rubocop', require: false
15
+ gem 'vcr'
16
+ gem 'webmock'
data/Guardfile ADDED
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
+ # A sample Guardfile
4
+ # More info at https://github.com/guard/guard#readme
5
+
6
+ guard :rspec, cmd: 'bundle exec rspec' do
7
+ watch(%r{^spec/.+_spec\.rb$})
8
+ watch('spec/spec_helper.rb') { 'spec' }
9
+ watch(%r{^lib/(.+)\.rb$}) { |m| "spec/unit/#{m[1]}_spec.rb" }
10
+ end
11
+
12
+ guard :rubocop, cli: ['--display-cop-names'] do
13
+ watch(/.+\.rb$/)
14
+ watch(%r{(?:.+/)?\.rubocop(?:_todo)?\.yml$}) { |m| File.dirname(m[0]) }
15
+ end
data/LICENSE CHANGED
@@ -1,4 +1,4 @@
1
- Copyright (c) 2012 Jeremy Ruppel
1
+ Copyright (c) 2009 T.J. VanSlyke, 2012-2025 Paul Gallagher
2
2
 
3
3
  MIT License
4
4
 
@@ -19,4 +19,4 @@ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
19
  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
20
  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
21
  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md CHANGED
@@ -1,41 +1,143 @@
1
1
  # Earl
2
2
 
3
- What URI wishes it could look like.
3
+ Earl wants to help you scrape all the relevant metadata for your favorite web pages so you can be as cool as
4
+ Facebook when displaying user-submitted link content. Earl returns details like titles, descriptions, content type,
5
+ associated feeds, and OEmbed definitions if available.
4
6
 
5
- ## Installation
7
+ Earl is based on an original source project called _earl_ by [teejayvanslyke](https://github.com/teejayvanslyke/earl) (but never released as a gem).
8
+ The revamp was done by [Paul Gallagher](https://github.com/tardate), and master source is currently
9
+ available at <https://github.com/evendis/earl>.
6
10
 
7
- Add this line to your application's Gemfile:
11
+ The Earl gem is officially named _earl_. Big thanks go to [jeremyruppel](https://github.com/jeremyruppel) who
12
+ contributed the ownership of the _earl_ gem name. The original _earl_ gem had a somewhat similar purpose - it is now defunct, but still available up to version 0.3.0 via RubyGems. Any _earl_ gem with version 1.0.0 or higher is the new gem release (and is in no way backward compatible with
13
+ earlier versions).
8
14
 
9
- gem 'earl'
15
+ ## The Earl Cookbook
10
16
 
11
- And then execute:
17
+ ### How do I instantiate Earl?
12
18
 
13
- $ bundle
19
+ Pass any url-like string to Earl:
14
20
 
15
- Or install it yourself as:
21
+ my_earl_instance = Earl.new('https://github.com/evendis/earl')
22
+ #
23
+ # or using the [] convenience method:
24
+ my_earl_instance = Earl['https://github.com/evendis/earl']
16
25
 
17
- $ gem install earl
26
+ ### How do I inspect details of the page?
18
27
 
19
- ## Usage
28
+ earl = Earl['https://github.com/evendis/earl']
29
+ earl.title
30
+ => "evendis/earl · GitHub"
31
+ earl.description
32
+ => "earl - URL metadata API for scraping titles, descriptions, images, and videos from URL's."
33
+ earl.image
34
+ => "https://a248.e.akamai.net/assets.github.com/images/modules/header/logov7@4x.png?1340935010"
20
35
 
21
- ``` rb
22
- url = Earl::URL.new 'http://www.foo.com'
36
+ ### How do I get oembed details for a link?
23
37
 
24
- url.scheme # => 'http'
25
- url.scheme? # => true
38
+ Earl will get oembed details for a link if they are available.
26
39
 
27
- url.subdomain # => 'www'
28
- url.subdomain.www? # => true
29
- url.subdomain.baz? # => false
40
+ earl = Earl['https://www.youtube.com/watch?v=hNSkCqMUMQA']
41
+ earl.oembed
42
+ => {:title=>"[JA][Keynote] Ruby Taught Me About Encoding Under the Hood / Mari Imaizumi @ima1zumi",
43
+ :author_name=>"RubyKaigi",
44
+ :author_url=>"https://www.youtube.com/@rubykaigi4884",
45
+ :type=>"video",
46
+ :height=>113,
47
+ :width=>200,
48
+ :version=>"1.0",
49
+ :provider_name=>"YouTube",
50
+ :provider_url=>"https://www.youtube.com/",
51
+ :thumbnail_height=>360,
52
+ :thumbnail_width=>480,
53
+ :thumbnail_url=>"https://i.ytimg.com/vi/hNSkCqMUMQA/hqdefault.jpg",
54
+ :html=>
55
+ "<iframe width=\"200\" height=\"113\" src=\"https://www.youtube.com/embed/hNSkCqMUMQA?feature=oembed\" frameborder=\"0\" allow=\"accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share\" referrerpolicy=\"strict-origin-when-cross-origin\" allowfullscreen title=\"[JA][Keynote] Ruby Taught Me About Encoding Under the Hood / Mari Imaizumi @ima1zumi\"></iframe>"}
56
+ # to get the embed code:
57
+ earl.oembed_html
58
+ => "<iframe width=\"200\" height=\"113\" src=\"https://www.youtube.com/embed/hNSkCqMUMQA?feature=oembed\" frameborder=\"0\" allow=\"accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share\" referrerpolicy=\"strict-origin-when-cross-origin\" allowfullscreen title=\"[JA][Keynote] Ruby Taught Me About Encoding Under the Hood / Mari Imaizumi @ima1zumi\"></iframe>"
30
59
 
31
- url.host = 'foo.edu'
32
- url.to_s # => 'http://www.foo.edu'
33
- ```
60
+ ### How do I customise the oembed link?
34
61
 
35
- ## Contributing
62
+ Supported oembed parameters may be provided to `Earl.new` or to the `oembed` call:
36
63
 
37
- 1. Fork it
38
- 2. Create your feature branch (`git checkout -b my-new-feature`)
39
- 3. Commit your changes (`git commit -am 'Added some feature'`)
40
- 4. Push to the branch (`git push origin my-new-feature`)
41
- 5. Create new Pull Request
64
+ earl = Earl.new('https://www.youtube.com/watch?v=hNSkCqMUMQA', { oembed: { maxwidth: '200', maxheight: '320' }})
65
+ earl.oembed_html
66
+ => "<iframe width=\"200\" height=\"113\" src=\"https://www.youtube.com/embed/hNSkCqMUMQA?feature=oembed\" frameborder=\"0\" allow=\"accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share\" referrerpolicy=\"strict-origin-when-cross-origin\" allowfullscreen title=\"[JA][Keynote] Ruby Taught Me About Encoding Under the Hood / Mari Imaizumi @ima1zumi\"></iframe>"
67
+
68
+ ### How do I inspect what attributes are available for a page?
69
+
70
+ To see all of the attributes a URL provides, simply ask:
71
+
72
+ earl = Earl['https://github.com/evendis/earl']
73
+ earl.attributes
74
+ => [:title, :image, :description, :rss_feed, :atom_feed, :content_type, :base_url, :charset, :content_encoding, :headers, :feed]
75
+
76
+ ### How can I extend Earl to scrape additional page details?
77
+
78
+ Need to scrape additional page details not currently supported by Earl? Implement your own scraper:
79
+
80
+ class QotdScraper < Earl::Scraper
81
+ match /^http\:\/\/www\.quotationspage\.com\/qotd\.html$/
82
+
83
+ define_attribute :qotd do |doc|
84
+ doc.at('dt.quote a').text
85
+ end
86
+ end
87
+
88
+ The define_attribute method will supply you with a Nokogiri document which you can traverse to your heart's content.
89
+ Use the match method to limit the scope of URLs that your scraper will apply to.
90
+
91
+ Your new attribute is now available for use:
92
+
93
+ Earl['http://www.quotationspage.com/qotd.html'].qotd
94
+ => "Love is a snowmobile racing across the tundra and then suddenly it flips over, pinning you underneath. At night, the ice weasels come."
95
+
96
+ ### How do I install it for normal use?
97
+
98
+ If using bundler, add `gem 'earl'` to your application's Gemfile and run `bundle`.
99
+
100
+ Or install it from the command-line:
101
+
102
+ gem install earl
103
+
104
+ ### How do I install it for gem development?
105
+
106
+ To work on enhancements or fix bugs in Earl, fork and clone the github repository.
107
+ If you are using bundler (recommended), run `bundle` to install development dependencies:
108
+
109
+ gem install bundler
110
+ bundle
111
+
112
+ ### How do I run the tests?
113
+
114
+ Once development dependencies are installed, all unit tests are run with just:
115
+
116
+ $ rake
117
+ # or..
118
+ $ rake spec
119
+
120
+ VCR is used to record integration tests. To re-record sessions, delete the corresponding cassette in
121
+ [spec/fixtures/cassettes](./spec/fixtures/cassettes/).
122
+
123
+ ### How do I automatically run tests when I modify files?
124
+
125
+ Guard is installed as part of the development dependencies. Start a guard process in a terminal window:
126
+
127
+ bundle exec guard
128
+
129
+ It will run all the tests to start with by default. Then whenever you change a file, the associated tests will execute in this terminal window.
130
+
131
+ ## Contributing to Earl
132
+
133
+ * Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
134
+ * Check out the issue tracker to make sure someone hasn't already requested it and/or contributed it
135
+ * Fork the project
136
+ * Start a feature/bugfix branch
137
+ * Commit and push until you are happy with your contribution
138
+ * Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
139
+ * Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or it is otherwise necessary, that is fine, but please isolate it to its own commit so I can cherry-pick around it.
140
+
141
+ ## Copyright
142
+
143
+ See [LICENSE](./LICENSE) for details.
data/Rakefile CHANGED
@@ -1,5 +1,13 @@
1
- #!/usr/bin/env rake
1
+ # frozen_string_literal: true
2
+
2
3
  require 'bundler/gem_tasks'
3
4
  require 'rspec/core/rake_task'
4
5
 
5
- RSpec::Core::RakeTask.new :spec
6
+ RSpec::Core::RakeTask.new(:spec)
7
+
8
+ task default: :spec
9
+
10
+ desc 'Open an irb session preloaded with this library'
11
+ task :console do
12
+ sh 'irb -I lib -r earl.rb'
13
+ end
data/earl.gemspec CHANGED
@@ -1,21 +1,26 @@
1
- # -*- encoding: utf-8 -*-
2
- require File.expand_path('../lib/earl/version', __FILE__)
1
+ # frozen_string_literal: true
2
+
3
+ require 'English'
4
+ lib = File.expand_path('lib', __dir__)
5
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
6
+ require 'earl/version'
3
7
 
4
8
  Gem::Specification.new do |gem|
5
- gem.authors = ["Jeremy Ruppel"]
6
- gem.email = ["jeremy.ruppel@gmail.com"]
7
- gem.description = %q{What URI wishes it could look like}
8
- gem.summary = %q{What URI wishes it could look like}
9
- gem.homepage = "https://github.com/remind101/earl"
9
+ gem.authors = ['teejayvanslyke', 'Paul Gallagher']
10
+ gem.email = ['tj@elctech.com', 'gallagher.paul@gmail.com']
11
+ gem.description = 'URL metadata API'
12
+ gem.summary = 'URL metadata API for scraping titles, descriptions, images, and videos from URLs'
13
+ gem.homepage = 'https://github.com/evendis/earl'
10
14
 
11
- gem.files = `git ls-files`.split($\)
12
- gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
15
+ gem.files = `git ls-files`.split($OUTPUT_RECORD_SEPARATOR)
16
+ gem.executables = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
13
17
  gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
14
- gem.name = "earl"
15
- gem.require_paths = ["lib"]
18
+ gem.name = 'earl'
19
+ gem.require_paths = ['lib']
16
20
  gem.version = Earl::VERSION
21
+ gem.license = 'MIT'
17
22
 
18
- gem.add_dependency 'treetop', '>= 1.4.10'
19
-
20
- gem.add_development_dependency 'rspec', '>= 2.9.0'
23
+ gem.required_ruby_version = '>= 3.0'
24
+ gem.add_runtime_dependency 'nokogiri', '~> 1.18'
25
+ gem.add_runtime_dependency 'ruby-oembed', '~> 0.18.1'
21
26
  end
data/lib/earl/earl.rb ADDED
@@ -0,0 +1,172 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Earl is a class that represents a URL and provides methods to fetch metadata about the page
4
+ class Earl
5
+ attr_accessor :url, :options
6
+ attr_writer :oembed
7
+
8
+ def initialize(url, options = {})
9
+ @url = url
10
+ @options = options
11
+ end
12
+
13
+ def to_s
14
+ url
15
+ end
16
+
17
+ def uri
18
+ @uri ||= URI.parse(url)
19
+ end
20
+
21
+ def uri_response
22
+ @uri_response ||= uri.open
23
+ end
24
+
25
+ # Returns
26
+ # :base_url - the actual url (resolved after possible redirect) as a String
27
+ # :content_type - mime type
28
+ # :charset - returns a charset parameter in Content-Type field. It is downcased for canonicalization.
29
+ # :content_encoding - returns a list of encodings in Content-Encoding field as an Array of String. The encodings are downcased for canonicalization.
30
+ # :headers - raw response header metadata
31
+ # (:last_modified - a Time representing the Last-Modified field - is excluded, since it generally returns a date that is not RFC 2616 compliant)
32
+ def uri_response_attribute(name)
33
+ case name
34
+ when :base_url
35
+ (uri_response_attribute(:base_uri) || url).to_s
36
+ when :headers
37
+ uri_response_attribute(:meta)
38
+ else
39
+ uri_response.respond_to?(name) && uri_response.send(name)
40
+ end
41
+ end
42
+ protected :uri_response_attribute
43
+
44
+ def uri_response_attributes
45
+ %i[content_type base_url charset content_encoding headers]
46
+ end
47
+ protected :uri_response_attributes
48
+
49
+ def scraper
50
+ @scraper ||= Scraper.for(url, self)
51
+ end
52
+
53
+ def response
54
+ scraper&.response
55
+ end
56
+
57
+ # Returns a hash of link meta data, including:
58
+ # :title, :description, :image (all attributes)
59
+ # :base_url
60
+ def metadata
61
+ data = oembed || {}
62
+ attributes.each do |attribute|
63
+ if attribute_value = send(attribute)
64
+ data[attribute] ||= attribute_value
65
+ end
66
+ end
67
+ data
68
+ end
69
+
70
+ def respond_to_missing?(name, include_private)
71
+ uri_response_attributes.include?(name) || scraper&.attribute?(name) || super
72
+ end
73
+
74
+ # Dispatch missing methods if a match for:
75
+ # - uri_response_attributes
76
+ # - scraper attributes
77
+ def method_missing(method, *args)
78
+ if uri_response_attributes.include?(method)
79
+ uri_response_attribute(method)
80
+ elsif scraper&.attribute?(method)
81
+ scraper.attribute(method)
82
+ else
83
+ super
84
+ end
85
+ end
86
+
87
+ # Returns a full array of attributes available for the link
88
+ def attributes
89
+ scraper.attributes.keys + uri_response_attributes + [:feed]
90
+ end
91
+
92
+ # Returns the options to be used for oembed
93
+ def oembed_options
94
+ { maxwidth: '560', maxheight: '315' }.merge(options[:oembed] || {})
95
+ end
96
+
97
+ # Returns the oembed meta data hash for the URL (or nil if not defined/available)
98
+ # e.g. for https://www.youtube.com/watch?v=hNSkCqMUMQA:
99
+ # {
100
+ # :title=>"[JA][Keynote] Ruby Taught Me About Encoding Under the Hood / Mari Imaizumi @ima1zumi",
101
+ # :author_name=>"RubyKaigi",
102
+ # :author_url=>"https://www.youtube.com/@rubykaigi4884",
103
+ # :type=>"video",
104
+ # :height=>113,
105
+ # :width=>200,
106
+ # :version=>"1.0",
107
+ # :provider_name=>"YouTube",
108
+ # :provider_url=>"https://www.youtube.com/",
109
+ # :thumbnail_height=>360,
110
+ # :thumbnail_width=>480,
111
+ # :thumbnail_url=>"https://i.ytimg.com/vi/hNSkCqMUMQA/hqdefault.jpg",
112
+ # :html=> "<iframe width=\"200\" height=\"113\" src=\"https://www.youtube.com/embed/hNSkCqMUMQA?feature=oembed\" \
113
+ # frameborder=\"0\" allow=\"accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share\" \
114
+ # referrerpolicy=\"strict-origin-when-cross-origin\" allowfullscreen \
115
+ # title=\"[JA][Keynote] Ruby Taught Me About Encoding Under the Hood / Mari Imaizumi @ima1zumi\"></iframe>"
116
+ # }
117
+ #
118
+ # +options+ defines a custom oembed options hash and will cause a re-fetch of the oembed metadata
119
+ def oembed(options = nil)
120
+ if options # use custom options, refetch oembed metadata
121
+ @options[:oembed] = options
122
+ @oembed = nil
123
+ end
124
+ @oembed ||= begin
125
+ h = fetch_oembed(base_url).fields
126
+ if h
127
+ h.keys.each do |key| # symbolize_keys!
128
+ new_key = begin
129
+ key.to_sym
130
+ rescue StandardError
131
+ key
132
+ end
133
+ h[new_key] = h.delete(key)
134
+ end
135
+ h
136
+ end
137
+ rescue StandardError
138
+ nil
139
+ end
140
+ end
141
+
142
+ def fetch_oembed(base_url)
143
+ OEmbed::Providers.get(base_url)
144
+ end
145
+ protected :fetch_oembed
146
+
147
+ # Returns the oembed code for the url (or nil if not defined/available)
148
+ def oembed_html
149
+ oembed && oembed[:html]
150
+ end
151
+
152
+ # Returns true if there is an ATOM or RSS feed associated with this URL.
153
+ def feed?
154
+ !feed.nil?
155
+ end
156
+
157
+ # Returns the feed URL associated with this URL.
158
+ # Returns RSS by default, or ATOM if +prefer+ is not :rss.
159
+ def feed(prefer = :rss)
160
+ rss = rss_feed
161
+ atom = atom_feed
162
+ if rss && atom
163
+ prefer == :rss ? rss : atom
164
+ else
165
+ rss || atom
166
+ end
167
+ end
168
+
169
+ def self.[](url)
170
+ new(url)
171
+ end
172
+ end
@@ -0,0 +1,92 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Base class for nokogiri page scraping
4
+ class Earl::Scraper
5
+ class << self
6
+ @@registry = []
7
+ attr_reader :regexp, :attributes
8
+
9
+ def match(regexp)
10
+ @regexp = regexp
11
+ register self
12
+ end
13
+
14
+ def define_attribute(name, &block)
15
+ @attributes ||= {}
16
+ @attributes[name] = block
17
+ end
18
+
19
+ def for(url, earl_source)
20
+ @@registry.each do |klass|
21
+ return klass.new(url, earl_source) if klass.regexp.match(url)
22
+ end
23
+ Earl::Scraper.new(url, earl_source)
24
+ end
25
+
26
+ def register(scraper_klass)
27
+ @@registry << scraper_klass
28
+ end
29
+ private :register
30
+ end
31
+
32
+ attr_reader :earl_source
33
+
34
+ def initialize(url, earl_source = nil)
35
+ @url = url
36
+ @earl_source = earl_source
37
+ end
38
+
39
+ def response
40
+ @response ||= earl_source && Nokogiri::HTML(earl_source.uri_response)
41
+ end
42
+
43
+ def attribute(name)
44
+ return unless attribute?(name)
45
+
46
+ attributes[name].call(response)
47
+ end
48
+
49
+ def attributes
50
+ if self.class.superclass == Earl::Scraper
51
+ self.class.superclass.attributes.merge(self.class.attributes)
52
+ else
53
+ self.class.attributes
54
+ end
55
+ end
56
+
57
+ def attribute?(name)
58
+ return false unless self.class.attributes
59
+
60
+ attributes.key?(name)
61
+ end
62
+
63
+ define_attribute :title do |doc|
64
+ if title = doc.at('title')
65
+ title.content
66
+ end
67
+ end
68
+
69
+ define_attribute :image do |doc|
70
+ if first_image = doc.at('img')
71
+ first_image['src']
72
+ end
73
+ end
74
+
75
+ define_attribute :description do |doc|
76
+ if element = doc.at("meta[name='description']")
77
+ element['content']
78
+ end
79
+ end
80
+
81
+ define_attribute :rss_feed do |doc|
82
+ if element = doc.at("link[type='application/rss+xml']")
83
+ element['href']
84
+ end
85
+ end
86
+
87
+ define_attribute :atom_feed do |doc|
88
+ if element = doc.at("link[type='application/atom+xml']")
89
+ element['href']
90
+ end
91
+ end
92
+ end
data/lib/earl/version.rb CHANGED
@@ -1,3 +1,5 @@
1
- module Earl
2
- VERSION = '0.3.0'
1
+ # frozen_string_literal: true
2
+
3
+ class Earl
4
+ VERSION = '2.0.0'
3
5
  end