earl 0.3.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.github/workflows/ruby-tests.yml +32 -0
- data/.gitignore +5 -0
- data/.rubocop.yml +35 -0
- data/.rubocop_todo.yml +22 -0
- data/.ruby-gemset +1 -0
- data/.ruby-version +1 -0
- data/Gemfile +13 -1
- data/Guardfile +15 -0
- data/LICENSE +2 -2
- data/README.md +127 -25
- data/Rakefile +10 -2
- data/earl.gemspec +19 -14
- data/lib/earl/earl.rb +172 -0
- data/lib/earl/scraper.rb +92 -0
- data/lib/earl/version.rb +4 -2
- data/lib/earl.rb +11 -20
- data/spec/fixtures/bicycles.html +490 -0
- data/spec/fixtures/bicycles_without_description.html +489 -0
- data/spec/fixtures/bicycles_without_images.html +457 -0
- data/spec/fixtures/cassettes/feed/is_atom_feed.yml +2298 -0
- data/spec/fixtures/cassettes/feed/is_rss_feed.yml +48 -0
- data/spec/fixtures/cassettes/feed/no_feed.yml +69 -0
- data/spec/fixtures/cassettes/feed/with_atom_and_rss_feed.yml +1471 -0
- data/spec/fixtures/cassettes/feed/with_rss_feed.yml +47 -0
- data/spec/fixtures/cassettes/oembed/no_oembed.yml +101 -0
- data/spec/fixtures/cassettes/oembed/youtube_oembed.yml +129 -0
- data/spec/fixtures/page_as_atom.html +161 -0
- data/spec/fixtures/page_as_rss.html +151 -0
- data/spec/fixtures/page_with_atom_feed.html +39 -0
- data/spec/fixtures/page_with_rss_and_atom_feeds.html +40 -0
- data/spec/fixtures/page_with_rss_feed.html +39 -0
- data/spec/fixtures/page_without_feeds.html +36 -0
- data/spec/fixtures/youtube.html +1839 -0
- data/spec/integration/feed_spec.rb +78 -0
- data/spec/integration/oembed_spec.rb +36 -0
- data/spec/spec_helper.rb +21 -29
- data/spec/support/fixtures.rb +15 -0
- data/spec/support/vcr.rb +9 -0
- data/spec/unit/earl/earl_spec.rb +15 -0
- data/spec/unit/earl/feed_spec.rb +62 -0
- data/spec/unit/earl/oembed_spec.rb +50 -0
- data/spec/unit/earl/scraper_spec.rb +49 -0
- data/spec/unit/earl_spec.rb +74 -0
- metadata +90 -62
- data/.rvmrc +0 -48
- data/lib/earl/email_assembler.rb +0 -11
- data/lib/earl/email_entity.rb +0 -27
- data/lib/earl/email_parser.tt +0 -58
- data/lib/earl/entity_base.rb +0 -37
- data/lib/earl/hash_inquirer.rb +0 -16
- data/lib/earl/string_inquirer.rb +0 -11
- data/lib/earl/url_assembler.rb +0 -15
- data/lib/earl/url_entity.rb +0 -23
- data/lib/earl/url_parser.tt +0 -163
- data/spec/earl/earl_spec.rb +0 -17
- data/spec/earl/email_entity_spec.rb +0 -31
- data/spec/earl/email_parser_spec.rb +0 -29
- data/spec/earl/entity_base_spec.rb +0 -39
- data/spec/earl/hash_inquirer_spec.rb +0 -24
- data/spec/earl/string_inquirer_spec.rb +0 -9
- data/spec/earl/url_entity_spec.rb +0 -45
- data/spec/earl/url_parser_spec.rb +0 -189
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: bfb59215c79763794ebf311cff3e939a0be46e4557abf8bbd8c358693b12e1ee
|
4
|
+
data.tar.gz: 550ea900108a04f4d12935bcd7fa398b95c886f59847c2492a7d90b96495f042
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 16102c9eee3ff031411da1aacd80371a7474ea3ff5b70dad9a77ad83efe63a943db7b845e8b461a4c44de7685e8933c903a2c2511c41df00f69759cd06ece54c
|
7
|
+
data.tar.gz: 3b724772932a18d2de4f589959c28968fa9cc93cf158bca8ac4360166ae3b04877e8bb6dc09338329da94fbbccee4891819598f1e0a4a642d858d13362f9219f
|
@@ -0,0 +1,32 @@
|
|
1
|
+
name: Ruby Tests
|
2
|
+
|
3
|
+
on:
|
4
|
+
push:
|
5
|
+
branches:
|
6
|
+
- main
|
7
|
+
pull_request:
|
8
|
+
branches:
|
9
|
+
- main
|
10
|
+
|
11
|
+
jobs:
|
12
|
+
test:
|
13
|
+
runs-on: ubuntu-latest
|
14
|
+
|
15
|
+
steps:
|
16
|
+
- name: Checkout code
|
17
|
+
uses: actions/checkout@v3
|
18
|
+
|
19
|
+
- name: Set up Ruby
|
20
|
+
uses: ruby/setup-ruby@v1
|
21
|
+
with:
|
22
|
+
ruby-version: 3.3
|
23
|
+
bundler-cache: true
|
24
|
+
|
25
|
+
- name: Install dependencies
|
26
|
+
run: bundle install
|
27
|
+
|
28
|
+
- name: Run tests
|
29
|
+
run: bundle exec rspec
|
30
|
+
|
31
|
+
- name: Run rubocop
|
32
|
+
run: bundle exec rubocop
|
data/.gitignore
CHANGED
data/.rubocop.yml
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
inherit_from: .rubocop_todo.yml
|
2
|
+
|
3
|
+
AllCops:
|
4
|
+
NewCops: disable
|
5
|
+
SuggestExtensions: false
|
6
|
+
TargetRubyVersion: 3.0
|
7
|
+
|
8
|
+
Layout/LineLength:
|
9
|
+
Max: 199
|
10
|
+
|
11
|
+
Metrics/BlockLength:
|
12
|
+
Max: 63
|
13
|
+
|
14
|
+
Metrics/ClassLength:
|
15
|
+
Max: 200
|
16
|
+
|
17
|
+
Metrics/CyclomaticComplexity:
|
18
|
+
Max: 8
|
19
|
+
|
20
|
+
Metrics/MethodLength:
|
21
|
+
Max: 20
|
22
|
+
|
23
|
+
Style/ClassVars:
|
24
|
+
Exclude:
|
25
|
+
- 'lib/earl/scraper.rb'
|
26
|
+
|
27
|
+
Style/Documentation:
|
28
|
+
Exclude:
|
29
|
+
- 'spec/**/*'
|
30
|
+
|
31
|
+
Style/ClassAndModuleChildren:
|
32
|
+
EnforcedStyleForClasses: compact
|
33
|
+
|
34
|
+
Layout/FirstHashElementIndentation:
|
35
|
+
EnforcedStyle: consistent
|
data/.rubocop_todo.yml
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
# This configuration was generated by
|
2
|
+
# `rubocop --auto-gen-config`
|
3
|
+
# on 2025-07-29 10:15:32 UTC using RuboCop version 1.79.0.
|
4
|
+
# The point is for the user to remove these configuration records
|
5
|
+
# one by one as the offenses are removed from the code base.
|
6
|
+
# Note that changes in the inspected code, or installation of new
|
7
|
+
# versions of RuboCop, may require this file to be generated again.
|
8
|
+
|
9
|
+
# Offense count: 7
|
10
|
+
# This cop supports unsafe autocorrection (--autocorrect-all).
|
11
|
+
# Configuration parameters: AllowSafeAssignment.
|
12
|
+
Lint/AssignmentInCondition:
|
13
|
+
Exclude:
|
14
|
+
- 'lib/earl/earl.rb'
|
15
|
+
- 'lib/earl/scraper.rb'
|
16
|
+
|
17
|
+
# Offense count: 2
|
18
|
+
# Configuration parameters: AllowedMethods.
|
19
|
+
# AllowedMethods: enums
|
20
|
+
Lint/ConstantDefinitionInBlock:
|
21
|
+
Exclude:
|
22
|
+
- 'spec/unit/earl/scraper_spec.rb'
|
data/.ruby-gemset
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
earl
|
data/.ruby-version
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
3.3.3
|
data/Gemfile
CHANGED
@@ -1,4 +1,16 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
source 'https://rubygems.org'
|
2
4
|
|
3
|
-
# Specify
|
5
|
+
# Specify gem dependencies in earl.gemspec
|
4
6
|
gemspec
|
7
|
+
|
8
|
+
# development dependencies
|
9
|
+
gem 'bundler', '>= 2.2.33'
|
10
|
+
gem 'guard-rspec'
|
11
|
+
gem 'guard-rubocop'
|
12
|
+
gem 'rake'
|
13
|
+
gem 'rspec'
|
14
|
+
gem 'rubocop', require: false
|
15
|
+
gem 'vcr'
|
16
|
+
gem 'webmock'
|
data/Guardfile
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# A sample Guardfile
|
4
|
+
# More info at https://github.com/guard/guard#readme
|
5
|
+
|
6
|
+
guard :rspec, cmd: 'bundle exec rspec' do
|
7
|
+
watch(%r{^spec/.+_spec\.rb$})
|
8
|
+
watch('spec/spec_helper.rb') { 'spec' }
|
9
|
+
watch(%r{^lib/(.+)\.rb$}) { |m| "spec/unit/#{m[1]}_spec.rb" }
|
10
|
+
end
|
11
|
+
|
12
|
+
guard :rubocop, cli: ['--display-cop-names'] do
|
13
|
+
watch(/.+\.rb$/)
|
14
|
+
watch(%r{(?:.+/)?\.rubocop(?:_todo)?\.yml$}) { |m| File.dirname(m[0]) }
|
15
|
+
end
|
data/LICENSE
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
Copyright (c) 2012
|
1
|
+
Copyright (c) 2009 T.J. VanSlyke, 2012-2025 Paul Gallagher
|
2
2
|
|
3
3
|
MIT License
|
4
4
|
|
@@ -19,4 +19,4 @@ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
19
19
|
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
20
|
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
21
|
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
-
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
CHANGED
@@ -1,41 +1,143 @@
|
|
1
1
|
# Earl
|
2
2
|
|
3
|
-
|
3
|
+
Earl wants to help you scrape all the relevant metadata for your favorite web pages so you can be as cool as
|
4
|
+
Facebook when displaying user-submitted link content. Earl returns details like titles, descriptions, content type,
|
5
|
+
associated feeds, and OEmbed definitions if available.
|
4
6
|
|
5
|
-
|
7
|
+
Earl is based on an original source project called _earl_ by [teejayvanslyke](https://github.com/teejayvanslyke/earl) (but never released as a gem).
|
8
|
+
The revamp was done by [Paul Gallagher](https://github.com/tardate), and master source is currently
|
9
|
+
available at <https://github.com/evendis/earl>.
|
6
10
|
|
7
|
-
|
11
|
+
The Earl gem is officially named _earl_. Big thanks go to [jeremyruppel](https://github.com/jeremyruppel) who
|
12
|
+
contributed the ownership of the _earl_ gem name. The original _earl_ gem had a somewhat similar purpose - it is now defunct, but still available up to version 0.3.0 via rubygems. Any _earl_ gem with version 1.0.0 or higher is the new gem release (and is in no way backwardly compatible with
|
13
|
+
earlier versions).
|
8
14
|
|
9
|
-
|
15
|
+
## The Earl Cookbook
|
10
16
|
|
11
|
-
|
17
|
+
### How do I instantiate Earl?
|
12
18
|
|
13
|
-
|
19
|
+
Pass any url-like string to Earl:
|
14
20
|
|
15
|
-
|
21
|
+
my_earl_instance = Earl.new('https://github.com/evendis/earl')
|
22
|
+
#
|
23
|
+
# or using the [] convenience method:
|
24
|
+
my_earl_instance = Earl['https://github.com/evendis/earl']
|
16
25
|
|
17
|
-
|
26
|
+
### How do I inspect details of the page?
|
18
27
|
|
19
|
-
|
28
|
+
earl = Earl['https://github.com/evendis/earl']
|
29
|
+
earl.title
|
30
|
+
=> "evendis/earl · GitHub"
|
31
|
+
earl.description
|
32
|
+
=> "earl - URL metadata API for scraping titles, descriptions, images, and videos from URL's."
|
33
|
+
earl.image
|
34
|
+
=> "https://a248.e.akamai.net/assets.github.com/images/modules/header/logov7@4x.png?1340935010"
|
20
35
|
|
21
|
-
|
22
|
-
url = Earl::URL.new 'http://www.foo.com'
|
36
|
+
### How do I get oembed details for a link?
|
23
37
|
|
24
|
-
|
25
|
-
url.scheme? # => true
|
38
|
+
Earl will get oembed details for a link if they are available.
|
26
39
|
|
27
|
-
|
28
|
-
|
29
|
-
|
40
|
+
earl = Earl['https://www.youtube.com/watch?v=hNSkCqMUMQA']
|
41
|
+
earl.oembed
|
42
|
+
=> {:title=>"[JA][Keynote] Ruby Taught Me About Encoding Under the Hood / Mari Imaizumi @ima1zumi",
|
43
|
+
:author_name=>"RubyKaigi",
|
44
|
+
:author_url=>"https://www.youtube.com/@rubykaigi4884",
|
45
|
+
:type=>"video",
|
46
|
+
:height=>113,
|
47
|
+
:width=>200,
|
48
|
+
:version=>"1.0",
|
49
|
+
:provider_name=>"YouTube",
|
50
|
+
:provider_url=>"https://www.youtube.com/",
|
51
|
+
:thumbnail_height=>360,
|
52
|
+
:thumbnail_width=>480,
|
53
|
+
:thumbnail_url=>"https://i.ytimg.com/vi/hNSkCqMUMQA/hqdefault.jpg",
|
54
|
+
:html=>
|
55
|
+
"<iframe width=\"200\" height=\"113\" src=\"https://www.youtube.com/embed/hNSkCqMUMQA?feature=oembed\" frameborder=\"0\" allow=\"accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share\" referrerpolicy=\"strict-origin-when-cross-origin\" allowfullscreen title=\"[JA][Keynote] Ruby Taught Me About Encoding Under the Hood / Mari Imaizumi @ima1zumi\"></iframe>"}
|
56
|
+
# to get the embed code:
|
57
|
+
earl.oembed_html
|
58
|
+
=> "<iframe width=\"200\" height=\"113\" src=\"https://www.youtube.com/embed/hNSkCqMUMQA?feature=oembed\" frameborder=\"0\" allow=\"accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share\" referrerpolicy=\"strict-origin-when-cross-origin\" allowfullscreen title=\"[JA][Keynote] Ruby Taught Me About Encoding Under the Hood / Mari Imaizumi @ima1zumi\"></iframe>"
|
30
59
|
|
31
|
-
|
32
|
-
url.to_s # => 'http://www.foo.edu'
|
33
|
-
```
|
60
|
+
### How do I customise the oembed link?
|
34
61
|
|
35
|
-
|
62
|
+
Supported oembed parameters may be provided to `Earl.new` or to the `oembed` call:
|
36
63
|
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
64
|
+
earl = Earl.new('https://www.youtube.com/watch?v=hNSkCqMUMQA', { oembed: { maxwidth: '200', maxheight: '320' }})
|
65
|
+
earl.oembed_html
|
66
|
+
=> "<iframe width=\"200\" height=\"113\" src=\"https://www.youtube.com/embed/hNSkCqMUMQA?feature=oembed\" frameborder=\"0\" allow=\"accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share\" referrerpolicy=\"strict-origin-when-cross-origin\" allowfullscreen title=\"[JA][Keynote] Ruby Taught Me About Encoding Under the Hood / Mari Imaizumi @ima1zumi\"></iframe>"
|
67
|
+
|
68
|
+
### How do I inspect what attributes are available for a page?
|
69
|
+
|
70
|
+
To see all of the attributes a URL provides, simply ask:
|
71
|
+
|
72
|
+
earl = Earl['https://github.com/evendis/earl']
|
73
|
+
earl.attributes
|
74
|
+
=> [:title, :image, :description, :rss_feed, :atom_feed, :content_type, :base_url, :charset, :content_encoding, :headers, :feed]
|
75
|
+
|
76
|
+
### How can I extend Earl to scrape additional page details?
|
77
|
+
|
78
|
+
Need to scrape additional page details not currently supported by Earl? Implement your own scraper:
|
79
|
+
|
80
|
+
class QotdScraper < Earl::Scraper
|
81
|
+
match /^http\:\/\/www\.quotationspage\.com\/qotd\.html$/
|
82
|
+
|
83
|
+
define_attribute :qotd do |doc|
|
84
|
+
doc.at('dt.quote a').text
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
The define_attribute method will supply you with a Nokogiri document which you can traverse to your heart's content.
|
89
|
+
Use the match method to limit the scope of URLs that your scraper will apply to.
|
90
|
+
|
91
|
+
Your new attribute is now available for use:
|
92
|
+
|
93
|
+
Earl['http://www.quotationspage.com/qotd.html'].qotd
|
94
|
+
=> "Love is a snowmobile racing across the tundra and then suddenly it flips over, pinning you underneath. At night, the ice weasels come."
|
95
|
+
|
96
|
+
### How do I install it for normal use?
|
97
|
+
|
98
|
+
If using bundler, add `gem 'earl'` to your application's Gemfile and run `bundle`.
|
99
|
+
|
100
|
+
Or install it from the command-line:
|
101
|
+
|
102
|
+
gem install earl
|
103
|
+
|
104
|
+
### How do I install it for gem development?
|
105
|
+
|
106
|
+
To work on enhancements or fix bugs in Earl, fork and clone the github repository.
|
107
|
+
If you are using bundler (recommended), run `bundle` to install development dependencies:
|
108
|
+
|
109
|
+
gem install bundler
|
110
|
+
bundle
|
111
|
+
|
112
|
+
### How do I run the tests?
|
113
|
+
|
114
|
+
Once development dependencies are installed, all unit tests are run with just:
|
115
|
+
|
116
|
+
$ rake
|
117
|
+
# or..
|
118
|
+
$ rake spec
|
119
|
+
|
120
|
+
VCR is used to record integration tests. To re-record sessions, delete the corresponding cassette in
|
121
|
+
[spec/fixtures/cassettes](./spec/fixtures/cassettes/).
|
122
|
+
|
123
|
+
### How do I automatically run tests when I modify files?
|
124
|
+
|
125
|
+
Guard is installed as part of the development dependencies. Start a guard process in a terminal window:
|
126
|
+
|
127
|
+
bundle exec guard
|
128
|
+
|
129
|
+
It will run all the tests to start with by default. Then whenever you change a file, the associated tests will execute in this terminal window.
|
130
|
+
|
131
|
+
## Contributing to Earl
|
132
|
+
|
133
|
+
* Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
|
134
|
+
* Check out the issue tracker to make sure someone already hasn't requested it and/or contributed it
|
135
|
+
* Fork the project
|
136
|
+
* Start a feature/bugfix branch
|
137
|
+
* Commit and push until you are happy with your contribution
|
138
|
+
* Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
|
139
|
+
* Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or it is otherwise necessary, that is fine, but please isolate to its own commit so I can cherry-pick around it.
|
140
|
+
|
141
|
+
## Copyright
|
142
|
+
|
143
|
+
See [LICENSE](./LICENSE) for details.
|
data/Rakefile
CHANGED
@@ -1,5 +1,13 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
2
3
|
require 'bundler/gem_tasks'
|
3
4
|
require 'rspec/core/rake_task'
|
4
5
|
|
5
|
-
RSpec::Core::RakeTask.new
|
6
|
+
RSpec::Core::RakeTask.new(:spec)
|
7
|
+
|
8
|
+
task default: :spec
|
9
|
+
|
10
|
+
desc 'Open an irb session preloaded with this library'
|
11
|
+
task :console do
|
12
|
+
sh 'irb -I lib -r earl.rb'
|
13
|
+
end
|
data/earl.gemspec
CHANGED
@@ -1,21 +1,26 @@
|
|
1
|
-
#
|
2
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'English'
|
4
|
+
lib = File.expand_path('lib', __dir__)
|
5
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
6
|
+
require 'earl/version'
|
3
7
|
|
4
8
|
Gem::Specification.new do |gem|
|
5
|
-
gem.authors = [
|
6
|
-
gem.email = [
|
7
|
-
gem.description =
|
8
|
-
gem.summary =
|
9
|
-
gem.homepage =
|
9
|
+
gem.authors = ['teejayvanslyke', 'Paul Gallagher']
|
10
|
+
gem.email = ['tj@elctech.com', 'gallagher.paul@gmail.com']
|
11
|
+
gem.description = 'URL metadata API'
|
12
|
+
gem.summary = 'URL metadata API for scraping titles, descriptions, images, and videos from URLs'
|
13
|
+
gem.homepage = 'https://github.com/evendis/earl'
|
10
14
|
|
11
|
-
gem.files = `git ls-files`.split(
|
12
|
-
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
15
|
+
gem.files = `git ls-files`.split($OUTPUT_RECORD_SEPARATOR)
|
16
|
+
gem.executables = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
|
13
17
|
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
14
|
-
gem.name =
|
15
|
-
gem.require_paths = [
|
18
|
+
gem.name = 'earl'
|
19
|
+
gem.require_paths = ['lib']
|
16
20
|
gem.version = Earl::VERSION
|
21
|
+
gem.license = 'MIT'
|
17
22
|
|
18
|
-
gem.
|
19
|
-
|
20
|
-
gem.
|
23
|
+
gem.required_ruby_version = '>= 3.0'
|
24
|
+
gem.add_runtime_dependency 'nokogiri', '~> 1.18'
|
25
|
+
gem.add_runtime_dependency 'ruby-oembed', '~> 0.18.1'
|
21
26
|
end
|
data/lib/earl/earl.rb
ADDED
@@ -0,0 +1,172 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Earl is a class that represents a URL and provides methods to fetch metadata about the page
|
4
|
+
class Earl
|
5
|
+
attr_accessor :url, :options
|
6
|
+
attr_writer :oembed
|
7
|
+
|
8
|
+
def initialize(url, options = {})
|
9
|
+
@url = url
|
10
|
+
@options = options
|
11
|
+
end
|
12
|
+
|
13
|
+
def to_s
|
14
|
+
url
|
15
|
+
end
|
16
|
+
|
17
|
+
def uri
|
18
|
+
@uri ||= URI.parse(url)
|
19
|
+
end
|
20
|
+
|
21
|
+
def uri_response
|
22
|
+
@uri_response ||= uri.open
|
23
|
+
end
|
24
|
+
|
25
|
+
# Returns
|
26
|
+
# :base_url - the actual url (resolved after possible redirect) as a String
|
27
|
+
# :content_type - mime type
|
28
|
+
# :charset - returns a charset parameter in Content-Type field. It is downcased for canonicalization.
|
29
|
+
# :content_encoding - returns a list of encodings in Content-Encoding field as an Array of String. The encodings are downcased for canonicalization.
|
30
|
+
# :headers - raw response header metadata
|
31
|
+
# (excluded since this generally returns a non-RFC 2616 compliant date) :last_modified - returns a Time which represents Last-Modified field.
|
32
|
+
def uri_response_attribute(name)
|
33
|
+
case name
|
34
|
+
when :base_url
|
35
|
+
(uri_response_attribute(:base_uri) || url).to_s
|
36
|
+
when :headers
|
37
|
+
uri_response_attribute(:meta)
|
38
|
+
else
|
39
|
+
uri_response.respond_to?(name) && uri_response.send(name)
|
40
|
+
end
|
41
|
+
end
|
42
|
+
protected :uri_response_attribute
|
43
|
+
|
44
|
+
def uri_response_attributes
|
45
|
+
%i[content_type base_url charset content_encoding headers]
|
46
|
+
end
|
47
|
+
protected :uri_response_attributes
|
48
|
+
|
49
|
+
def scraper
|
50
|
+
@scraper ||= Scraper.for(url, self)
|
51
|
+
end
|
52
|
+
|
53
|
+
def response
|
54
|
+
scraper&.response
|
55
|
+
end
|
56
|
+
|
57
|
+
# Returns a hash of link meta data, including:
|
58
|
+
# :title, :description, :image (all attributes)
|
59
|
+
# :base_url
|
60
|
+
def metadata
|
61
|
+
data = oembed || {}
|
62
|
+
attributes.each do |attribute|
|
63
|
+
if attribute_value = send(attribute)
|
64
|
+
data[attribute] ||= attribute_value
|
65
|
+
end
|
66
|
+
end
|
67
|
+
data
|
68
|
+
end
|
69
|
+
|
70
|
+
def respond_to_missing?(name, include_private)
|
71
|
+
uri_response_attributes.include?(name) || scraper&.attribute?(name) || super
|
72
|
+
end
|
73
|
+
|
74
|
+
# Dispatch missing methods if a match for:
|
75
|
+
# - uri_response_attributes
|
76
|
+
# - scraper attributes
|
77
|
+
def method_missing(method, *args)
|
78
|
+
if uri_response_attributes.include?(method)
|
79
|
+
uri_response_attribute(method)
|
80
|
+
elsif scraper&.attribute?(method)
|
81
|
+
scraper.attribute(method)
|
82
|
+
else
|
83
|
+
super
|
84
|
+
end
|
85
|
+
end
|
86
|
+
|
87
|
+
# Returns a full array of attributes available for the link
|
88
|
+
def attributes
|
89
|
+
scraper.attributes.keys + uri_response_attributes + [:feed]
|
90
|
+
end
|
91
|
+
|
92
|
+
# Returns the options to be used for oembed
|
93
|
+
def oembed_options
|
94
|
+
{ maxwidth: '560', maxheight: '315' }.merge(options[:oembed] || {})
|
95
|
+
end
|
96
|
+
|
97
|
+
# Returns the oembed meta data hash for the URL (or nil if not defined/available)
|
98
|
+
# e.g. for https://www.youtube.com/watch?v=hNSkCqMUMQA:
|
99
|
+
# {
|
100
|
+
# :title=>"[JA][Keynote] Ruby Taught Me About Encoding Under the Hood / Mari Imaizumi @ima1zumi",
|
101
|
+
# :author_name=>"RubyKaigi",
|
102
|
+
# :author_url=>"https://www.youtube.com/@rubykaigi4884",
|
103
|
+
# :type=>"video",
|
104
|
+
# :height=>113,
|
105
|
+
# :width=>200,
|
106
|
+
# :version=>"1.0",
|
107
|
+
# :provider_name=>"YouTube",
|
108
|
+
# :provider_url=>"https://www.youtube.com/",
|
109
|
+
# :thumbnail_height=>360,
|
110
|
+
# :thumbnail_width=>480,
|
111
|
+
# :thumbnail_url=>"https://i.ytimg.com/vi/hNSkCqMUMQA/hqdefault.jpg",
|
112
|
+
# :html=> "<iframe width=\"200\" height=\"113\" src=\"https://www.youtube.com/embed/hNSkCqMUMQA?feature=oembed\" \
|
113
|
+
# frameborder=\"0\" allow=\"accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share\" \
|
114
|
+
# referrerpolicy=\"strict-origin-when-cross-origin\" allowfullscreen \
|
115
|
+
# title=\"[JA][Keynote] Ruby Taught Me About Encoding Under the Hood / Mari Imaizumi @ima1zumi\"></iframe>"
|
116
|
+
# }
|
117
|
+
#
|
118
|
+
# +options+ defines a custom oembed options hash and will cause a re-fetch of the oembed metadata
|
119
|
+
def oembed(options = nil)
|
120
|
+
if options # use custom options, refetch oembed metadata
|
121
|
+
@options[:oembed] = options
|
122
|
+
@oembed = nil
|
123
|
+
end
|
124
|
+
@oembed ||= begin
|
125
|
+
h = fetch_oembed(base_url).fields
|
126
|
+
if h
|
127
|
+
h.keys.each do |key| # symbolize_keys!
|
128
|
+
new_key = begin
|
129
|
+
key.to_sym
|
130
|
+
rescue StandardError
|
131
|
+
key
|
132
|
+
end
|
133
|
+
h[new_key] = h.delete(key)
|
134
|
+
end
|
135
|
+
h
|
136
|
+
end
|
137
|
+
rescue StandardError
|
138
|
+
nil
|
139
|
+
end
|
140
|
+
end
|
141
|
+
|
142
|
+
def fetch_oembed(base_url)
|
143
|
+
OEmbed::Providers.get(base_url)
|
144
|
+
end
|
145
|
+
protected :fetch_oembed
|
146
|
+
|
147
|
+
# Returns the oembed code for the url (or nil if not defined/available)
|
148
|
+
def oembed_html
|
149
|
+
oembed && oembed[:html]
|
150
|
+
end
|
151
|
+
|
152
|
+
# Returns true if there is an ATOM or RSS feed associated with this URL.
|
153
|
+
def feed?
|
154
|
+
!feed.nil?
|
155
|
+
end
|
156
|
+
|
157
|
+
# Returns the feed URL associated with this URL.
|
158
|
+
# Returns RSS by default, or ATOM if +prefer+ is not :rss.
|
159
|
+
def feed(prefer = :rss)
|
160
|
+
rss = rss_feed
|
161
|
+
atom = atom_feed
|
162
|
+
if rss && atom
|
163
|
+
prefer == :rss ? rss : atom
|
164
|
+
else
|
165
|
+
rss || atom
|
166
|
+
end
|
167
|
+
end
|
168
|
+
|
169
|
+
def self.[](url)
|
170
|
+
new(url)
|
171
|
+
end
|
172
|
+
end
|
data/lib/earl/scraper.rb
ADDED
@@ -0,0 +1,92 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Base class for nokogiri page scraping
|
4
|
+
class Earl::Scraper
|
5
|
+
class << self
|
6
|
+
@@registry = []
|
7
|
+
attr_reader :regexp, :attributes
|
8
|
+
|
9
|
+
def match(regexp)
|
10
|
+
@regexp = regexp
|
11
|
+
register self
|
12
|
+
end
|
13
|
+
|
14
|
+
def define_attribute(name, &block)
|
15
|
+
@attributes ||= {}
|
16
|
+
@attributes[name] = block
|
17
|
+
end
|
18
|
+
|
19
|
+
def for(url, earl_source)
|
20
|
+
@@registry.each do |klass|
|
21
|
+
return klass.new(url, earl_source) if klass.regexp.match(url)
|
22
|
+
end
|
23
|
+
Earl::Scraper.new(url, earl_source)
|
24
|
+
end
|
25
|
+
|
26
|
+
def register(scraper_klass)
|
27
|
+
@@registry << scraper_klass
|
28
|
+
end
|
29
|
+
private :register
|
30
|
+
end
|
31
|
+
|
32
|
+
attr_reader :earl_source
|
33
|
+
|
34
|
+
def initialize(url, earl_source = nil)
|
35
|
+
@url = url
|
36
|
+
@earl_source = earl_source
|
37
|
+
end
|
38
|
+
|
39
|
+
def response
|
40
|
+
@response ||= earl_source && Nokogiri::HTML(earl_source.uri_response)
|
41
|
+
end
|
42
|
+
|
43
|
+
def attribute(name)
|
44
|
+
return unless attribute?(name)
|
45
|
+
|
46
|
+
attributes[name].call(response)
|
47
|
+
end
|
48
|
+
|
49
|
+
def attributes
|
50
|
+
if self.class.superclass == Earl::Scraper
|
51
|
+
self.class.superclass.attributes.merge(self.class.attributes)
|
52
|
+
else
|
53
|
+
self.class.attributes
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
def attribute?(name)
|
58
|
+
return false unless self.class.attributes
|
59
|
+
|
60
|
+
attributes.key?(name)
|
61
|
+
end
|
62
|
+
|
63
|
+
define_attribute :title do |doc|
|
64
|
+
if title = doc.at('title')
|
65
|
+
title.content
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
define_attribute :image do |doc|
|
70
|
+
if first_image = doc.at('img')
|
71
|
+
first_image['src']
|
72
|
+
end
|
73
|
+
end
|
74
|
+
|
75
|
+
define_attribute :description do |doc|
|
76
|
+
if element = doc.at("meta[name='description']")
|
77
|
+
element['content']
|
78
|
+
end
|
79
|
+
end
|
80
|
+
|
81
|
+
define_attribute :rss_feed do |doc|
|
82
|
+
if element = doc.at("link[type='application/rss+xml']")
|
83
|
+
element['href']
|
84
|
+
end
|
85
|
+
end
|
86
|
+
|
87
|
+
define_attribute :atom_feed do |doc|
|
88
|
+
if element = doc.at("link[type='application/atom+xml']")
|
89
|
+
element['href']
|
90
|
+
end
|
91
|
+
end
|
92
|
+
end
|
data/lib/earl/version.rb
CHANGED