richurls 0.3.0 → 0.4.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7e1fcf6a779cb196b1b72fe63156af98b340fc8be86b2b78a4e233a479c47b9d
4
- data.tar.gz: 4dd73e4c5c52b1bdcbba13d612158f2827d8d770a67b0ac7229d3a331032b1a5
3
+ metadata.gz: 42cff335ffa60073310cdec6c1b16f2a1f1a4cfce0bde60835a76538a0f138aa
4
+ data.tar.gz: cdded0800498b21f24b5c89910da73fd5150492c4f9c7ccedcb5499d538b1423
5
5
  SHA512:
6
- metadata.gz: bf8186d4e4dc41447c9bc3a0adad8575f8893eebd64702703371f0ad612e9eab2ffd5eb06161d7c8248a0c21e25cddc2b48af3d13527f5a381bf08c5ba427f29
7
- data.tar.gz: 8cf6e808d2df2f97b9912d67b79396c9f11e541d94cb6f48e015028c630e2991003757c1aacf9036656befd2a1d31509fdc55fbce494d2991a3bef8211dc18cf
6
+ metadata.gz: a5633b6852996d4938f002e2e1e4a90e1b25037ace20ba5da9303abc4dc073efa9ba5819134080d6c467dbe905733226cd66554e99edcd9ab57fbfd49bdc4a57
7
+ data.tar.gz: '08bce91b2a1c2a301b3eb288883621f7f7d607e56e36765ee00d5d09a84348f281cff0e48f61e074c9c394d5478b9c7140ca9a4f02eb37778e129c3ec8ed6277'
data/.rubocop.yml CHANGED
@@ -27,6 +27,12 @@ Style/FrozenStringLiteralComment:
27
27
  Enabled: false
28
28
  Style/GuardClause:
29
29
  Enabled: false
30
+ Style/HashEachMethods:
31
+ Enabled: true
32
+ Style/HashTransformKeys:
33
+ Enabled: true
34
+ Style/HashTransformValues:
35
+ Enabled: true
30
36
  Naming/MemoizedInstanceVariableName:
31
37
  Enabled: false
32
38
  Style/RegexpLiteral:
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- richurls (0.3.0)
4
+ richurls (0.4.0)
5
5
  oj (~> 3)
6
6
  ox (~> 2)
7
7
  patron (~> 0.13)
data/lib/body_decorator.rb CHANGED
@@ -2,26 +2,30 @@ require 'ox'
2
2
 
3
3
  require_relative 'xml_handler'
4
4
  require_relative 'url_helper'
5
- require_relative 'parsers/title_parser'
6
- require_relative 'parsers/description_parser'
7
- require_relative 'parsers/image_parser'
5
+ require_relative 'parsers/property'
6
+ require_relative 'parsers/url'
8
7
  require_relative 'parsers/embed_parser'
9
8
  require_relative 'parsers/provider_display_parser'
10
- require_relative 'parsers/favicon_parser'
11
9
 
12
10
  module RichUrls
13
11
  class BodyDecorator
14
12
  NoXMLError = Class.new(StandardError)
15
13
 
16
14
  PARSERS = {
17
- 'title' => Parsers::TitleParser,
18
- 'description' => Parsers::DescriptionParser,
19
- 'image' => Parsers::ImageParser,
15
+ 'title' => Parsers::Property,
16
+ 'description' => Parsers::Property,
17
+ 'image' => Parsers::Url,
18
+ 'favicon' => Parsers::Url,
20
19
  'provider_display' => Parsers::ProviderDisplayParser,
21
- 'favicon' => Parsers::FaviconParser,
22
20
  'embed' => Parsers::EmbedParser
23
21
  }.freeze
24
22
 
23
+ def self.decorate(url, body)
24
+ new(url, body).decorate
25
+ end
26
+
27
+ private_class_method :new
28
+
25
29
  def initialize(url, body)
26
30
  @url = url
27
31
  @xml = XMLHandler.new
@@ -37,7 +41,7 @@ module RichUrls
37
41
 
38
42
  def decorate
39
43
  PARSERS.each_with_object({}) do |(key, parser), object|
40
- object[key] = parser.call(@xml, @url)&.force_encoding('UTF-8')
44
+ object[key] = parser.call(@xml.properties[key], @url)
41
45
  end
42
46
  end
43
47
  end
data/lib/el.rb ADDED
@@ -0,0 +1,41 @@
1
+ module RichUrls
2
+ class El
3
+ MAX_TEXT_LENGTH = 1000
4
+
5
+ attr_reader :tag, :open, :attributes
6
+
7
+ def initialize(tag)
8
+ @tag = tag
9
+ @open = true
10
+ @attributes = {}
11
+ end
12
+
13
+ def add(key, value)
14
+ return if @attributes[key]
15
+
16
+ @attributes[key] = value
17
+ end
18
+
19
+ def append_text(str)
20
+ @attributes[:text] ||= ''
21
+
22
+ str = str.strip
23
+ length = @attributes[:text].length
24
+
25
+ if length <= MAX_TEXT_LENGTH
26
+ end_slice = MAX_TEXT_LENGTH - length
27
+ sliced = str[0...end_slice]
28
+
29
+ @attributes[:text] << sliced + ' '
30
+ end
31
+ end
32
+
33
+ def text
34
+ @attributes[:text].strip
35
+ end
36
+
37
+ def close!
38
+ @open = false
39
+ end
40
+ end
41
+ end
data/lib/finders/favicon.rb ADDED
@@ -0,0 +1,14 @@
1
+ module RichUrls
2
+ module Finders
3
+ module Favicon
4
+ KEYWORDS = ['shortcut icon', 'icon shortcut', 'icon'].freeze
5
+
6
+ def self.call(elem)
7
+ return unless elem.tag == :link &&
8
+ KEYWORDS.include?(elem.attributes[:rel])
9
+
10
+ elem.attributes[:href]
11
+ end
12
+ end
13
+ end
14
+ end
data/lib/finders/meta_description.rb ADDED
@@ -0,0 +1,12 @@
1
+ module RichUrls
2
+ module Finders
3
+ module MetaDescription
4
+ def self.call(elem)
5
+ return unless elem.tag == :meta &&
6
+ elem.attributes[:property] == 'og:description'
7
+
8
+ elem.attributes[:content]
9
+ end
10
+ end
11
+ end
12
+ end
data/lib/finders/meta_image.rb ADDED
@@ -0,0 +1,12 @@
1
+ module RichUrls
2
+ module Finders
3
+ module MetaImage
4
+ def self.call(elem)
5
+ return unless elem.tag == :meta &&
6
+ elem.attributes[:property] == 'og:image'
7
+
8
+ elem.attributes[:content]
9
+ end
10
+ end
11
+ end
12
+ end
data/lib/finders/meta_title.rb ADDED
@@ -0,0 +1,12 @@
1
+ module RichUrls
2
+ module Finders
3
+ module MetaTitle
4
+ def self.call(elem)
5
+ return unless elem.tag == :meta &&
6
+ elem.attributes[:property] == 'og:title'
7
+
8
+ elem.attributes[:content]
9
+ end
10
+ end
11
+ end
12
+ end
data/lib/finders.rb ADDED
@@ -0,0 +1,7 @@
1
+ module RichUrls
2
+ module Finders
3
+ Description = ->(elem) { elem.tag == :p && elem.text }
4
+ Image = ->(elem) { elem.tag == :img && elem.attributes[:src] }
5
+ Title = ->(elem) { elem.tag == :title && elem.text }
6
+ end
7
+ end
data/lib/parsers/embed_parser.rb CHANGED
@@ -1,28 +1,30 @@
1
- module Parsers
2
- class EmbedParser
3
- require_relative 'embed_parsers/base'
4
- require_relative 'embed_parsers/youtube'
5
- require_relative 'embed_parsers/youtube_short'
6
- require_relative 'embed_parsers/paste'
7
- require_relative 'embed_parsers/spotify'
1
+ module RichUrls
2
+ module Parsers
3
+ class EmbedParser
4
+ require_relative 'embed_parsers/base'
5
+ require_relative 'embed_parsers/youtube'
6
+ require_relative 'embed_parsers/youtube_short'
7
+ require_relative 'embed_parsers/paste'
8
+ require_relative 'embed_parsers/spotify'
8
9
 
9
- PARSERS = [
10
- Youtube,
11
- YoutubeShort,
12
- Paste,
13
- Spotify
14
- ].freeze
10
+ PARSERS = [
11
+ Youtube,
12
+ YoutubeShort,
13
+ Paste,
14
+ Spotify
15
+ ].freeze
15
16
 
16
- def self.call(_, url)
17
- uri = URI(url)
17
+ def self.call(_, url)
18
+ uri = URI(url)
18
19
 
19
- PARSERS.each do |parser|
20
- embed_parser = parser.new(uri)
20
+ PARSERS.each do |parser|
21
+ embed_parser = parser.new(uri)
21
22
 
22
- return embed_parser.parse if embed_parser.match?
23
- end
23
+ return embed_parser.parse if embed_parser.match?
24
+ end
24
25
 
25
- nil
26
+ nil
27
+ end
26
28
  end
27
29
  end
28
30
  end
data/lib/parsers/embed_parsers/base.rb CHANGED
@@ -1,14 +1,16 @@
1
- module Parsers
2
- class EmbedParser
3
- class Base
4
- def initialize(uri)
5
- @uri = uri
6
- end
1
+ module RichUrls
2
+ module Parsers
3
+ class EmbedParser
4
+ class Base
5
+ def initialize(uri)
6
+ @uri = uri
7
+ end
7
8
 
8
- private
9
+ private
9
10
 
10
- def query
11
- @query ||= Hash[URI.decode_www_form(@uri.query)]
11
+ def query
12
+ @query ||= Hash[URI.decode_www_form(@uri.query)]
13
+ end
12
14
  end
13
15
  end
14
16
  end
data/lib/parsers/embed_parsers/paste.rb CHANGED
@@ -1,18 +1,20 @@
1
- module Parsers
2
- class EmbedParser
3
- class Paste < Base
4
- IFRAME = '<iframe src="https://pasteapp.com/p/%s/embed?view=%s" '\
5
- 'width="480" height="480" scrolling="no" '\
6
- 'frameborder="0" allowfullscreen></iframe>'.freeze
1
+ module RichUrls
2
+ module Parsers
3
+ class EmbedParser
4
+ class Paste < Base
5
+ IFRAME = '<iframe src="https://pasteapp.com/p/%s/embed?view=%s" '\
6
+ 'width="480" height="480" scrolling="no" '\
7
+ 'frameborder="0" allowfullscreen></iframe>'.freeze
7
8
 
8
- def match?
9
- @uri.host == 'pasteapp.com' && @uri.path =~ /\/p\/[a-zA-Z0-9]+/
10
- end
9
+ def match?
10
+ @uri.host == 'pasteapp.com' && @uri.path =~ /\/p\/[a-zA-Z0-9]+/
11
+ end
11
12
 
12
- def parse
13
- path_id = @uri.path.sub(/\/p\//, '')
13
+ def parse
14
+ path_id = @uri.path.sub(/\/p\//, '')
14
15
 
15
- IFRAME % [path_id, query.fetch('view')]
16
+ IFRAME % [path_id, query.fetch('view')]
17
+ end
16
18
  end
17
19
  end
18
20
  end
data/lib/parsers/embed_parsers/spotify.rb CHANGED
@@ -1,30 +1,32 @@
1
- module Parsers
2
- class EmbedParser
3
- class Spotify < Base
4
- IFRAME = '<iframe src="https://open.spotify.com/embed/%s/%s" '\
5
- 'width="300" height="380" frameborder="0" '\
6
- 'allowtransparency="true" allow="encrypted-media">'\
7
- '</iframe>'.freeze
1
+ module RichUrls
2
+ module Parsers
3
+ class EmbedParser
4
+ class Spotify < Base
5
+ IFRAME = '<iframe src="https://open.spotify.com/embed/%s/%s" '\
6
+ 'width="300" height="380" frameborder="0" '\
7
+ 'allowtransparency="true" allow="encrypted-media">'\
8
+ '</iframe>'.freeze
8
9
 
9
- SCOPES = %w[
10
- album
11
- track
12
- playlist
13
- ].freeze
10
+ SCOPES = %w[
11
+ album
12
+ track
13
+ playlist
14
+ ].freeze
14
15
 
15
- def match?
16
- valid_path = SCOPES.any? do |path|
17
- @uri.path.start_with?("/#{path}")
18
- end
16
+ def match?
17
+ valid_path = SCOPES.any? do |path|
18
+ @uri.path.start_with?("/#{path}")
19
+ end
19
20
 
20
- @uri.host == 'open.spotify.com' && valid_path
21
- end
21
+ @uri.host == 'open.spotify.com' && valid_path
22
+ end
22
23
 
23
- def parse
24
- path = @uri.path
25
- path[0] = ''
24
+ def parse
25
+ path = @uri.path
26
+ path[0] = ''
26
27
 
27
- IFRAME % path.split('/')
28
+ IFRAME % path.split('/')
29
+ end
28
30
  end
29
31
  end
30
32
  end
data/lib/parsers/embed_parsers/youtube.rb CHANGED
@@ -1,20 +1,22 @@
1
- module Parsers
2
- class EmbedParser
3
- class Youtube < Base
4
- IFRAME = '<iframe width="560" height="315" '\
5
- 'src="https://www.youtube.com/embed/%s" frameborder="0" '\
6
- 'allow="accelerometer; autoplay; encrypted-media; '\
7
- 'gyroscope; picture-in-picture" allowfullscreen>'\
8
- '</iframe>'.freeze
1
+ module RichUrls
2
+ module Parsers
3
+ class EmbedParser
4
+ class Youtube < Base
5
+ IFRAME = '<iframe width="560" height="315" '\
6
+ 'src="https://www.youtube.com/embed/%s" frameborder="0" '\
7
+ 'allow="accelerometer; autoplay; encrypted-media; '\
8
+ 'gyroscope; picture-in-picture" allowfullscreen>'\
9
+ '</iframe>'.freeze
9
10
 
10
- def match?
11
- @uri.host == 'www.youtube.com' &&
12
- @uri.path == '/watch' &&
13
- query.key?('v')
14
- end
11
+ def match?
12
+ @uri.host == 'www.youtube.com' &&
13
+ @uri.path == '/watch' &&
14
+ query.key?('v')
15
+ end
15
16
 
16
- def parse
17
- IFRAME % query.fetch('v')
17
+ def parse
18
+ IFRAME % query.fetch('v')
19
+ end
18
20
  end
19
21
  end
20
22
  end
data/lib/parsers/embed_parsers/youtube_short.rb CHANGED
@@ -1,15 +1,17 @@
1
- module Parsers
2
- class EmbedParser
3
- class YoutubeShort < Base
4
- def match?
5
- @uri.host == 'youtu.be'
6
- end
1
+ module RichUrls
2
+ module Parsers
3
+ class EmbedParser
4
+ class YoutubeShort < Base
5
+ def match?
6
+ @uri.host == 'youtu.be'
7
+ end
7
8
 
8
- def parse
9
- path = @uri.path
10
- path[0] = ''
9
+ def parse
10
+ path = @uri.path
11
+ path[0] = ''
11
12
 
12
- Youtube::IFRAME % path
13
+ Youtube::IFRAME % path
14
+ end
13
15
  end
14
16
  end
15
17
  end
data/lib/parsers/property.rb ADDED
@@ -0,0 +1,5 @@
1
+ module RichUrls
2
+ module Parsers
3
+ Property = ->(property, _) { property&.force_encoding('UTF-8') }
4
+ end
5
+ end
data/lib/parsers/provider_display_parser.rb CHANGED
@@ -1,3 +1,5 @@
1
- module Parsers
2
- ProviderDisplayParser = ->(_, url) { URI(url).host }
1
+ module RichUrls
2
+ module Parsers
3
+ ProviderDisplayParser = ->(_, url) { URI(url).host }
4
+ end
3
5
  end
data/lib/parsers/url.rb ADDED
@@ -0,0 +1,5 @@
1
+ module RichUrls
2
+ module Parsers
3
+ Url = ->(content, url) { UrlHelper.url_for(url, content) }
4
+ end
5
+ end
data/lib/url_fetcher.rb CHANGED
@@ -38,7 +38,7 @@ module RichUrls
38
38
  response = session.get(@url)
39
39
 
40
40
  if response.status < 400
41
- decorated = BodyDecorator.new(response.url, response.body).decorate
41
+ decorated = BodyDecorator.decorate(response.url, response.body)
42
42
  RichUrls.cache.set(digest, Oj.dump(decorated))
43
43
  decorated
44
44
  else
data/lib/xml_handler.rb CHANGED
@@ -1,12 +1,18 @@
1
+ require_relative 'el'
2
+ require_relative 'finders'
3
+ require_relative 'finders/meta_title'
4
+ require_relative 'finders/meta_description'
5
+ require_relative 'finders/meta_image'
6
+ require_relative 'finders/favicon'
7
+
1
8
  module RichUrls
2
9
  class XMLHandler < ::Ox::Sax
3
10
  WHITELISTED_EL_NAMES = %i[
4
- html
5
- head
6
11
  title
7
12
  meta
8
13
  link
9
14
  img
15
+ p
10
16
  ].freeze
11
17
 
12
18
  WHITELISTED_ATTRS = %i[
@@ -17,48 +23,97 @@ module RichUrls
17
23
  src
18
24
  ].freeze
19
25
 
26
+ FALLBACK_ELEMENTS = {
27
+ img: 'og:image',
28
+ p: 'og:description',
29
+ title: 'og:title'
30
+ }.freeze
31
+
32
+ FINDERS = {
33
+ Finders::MetaTitle => 'title',
34
+ Finders::MetaDescription => 'description',
35
+ Finders::MetaImage => 'image',
36
+ Finders::Favicon => 'favicon',
37
+ Finders::Title => 'title',
38
+ Finders::Description => 'description',
39
+ Finders::Image => 'image'
40
+ }.freeze
41
+
20
42
  StopParsingError = Class.new(StandardError)
21
- El = Struct.new(:name, :attributes)
22
43
 
23
- attr_accessor :elements
44
+ attr_reader :elements, :properties
24
45
 
25
46
  def initialize
26
47
  @elements = []
48
+ @counts = Set.new
49
+ @properties = {
50
+ 'title' => nil,
51
+ 'description' => nil,
52
+ 'image' => nil,
53
+ 'favicon' => nil
54
+ }
27
55
  end
28
56
 
29
- def find(name, attrs = {})
57
+ def find(tag, attrs = {})
30
58
  @elements.detect do |el|
31
59
  matching_attributes = attrs.all? { |k, v| el.attributes[k] == v }
32
60
 
33
- el.name == name && matching_attributes
61
+ el.tag == tag && matching_attributes
34
62
  end
35
63
  end
36
64
 
37
- def start_element(element_name)
38
- return unless WHITELISTED_EL_NAMES.include?(element_name)
65
+ def start_element(tag)
66
+ return unless WHITELISTED_EL_NAMES.include?(tag)
39
67
 
40
- @elements << El.new(element_name, {})
68
+ @elements << El.new(tag) if add_element?(tag)
41
69
  end
42
70
 
43
- def attr(name, str)
44
- el = @elements.last
71
+ def end_element(tag)
72
+ return unless WHITELISTED_EL_NAMES.include?(tag)
45
73
 
46
- return unless el && WHITELISTED_ATTRS.include?(name)
74
+ el = @elements.reverse_each.detect { |e| e.open && e.tag == tag }
75
+ return unless el
47
76
 
48
- el.attributes[name] = str
77
+ el.close!
78
+ find_element(el)
49
79
 
50
- raise StopParsingError if stop?(name, el)
80
+ raise StopParsingError if @properties.values.all?
51
81
  end
52
82
 
53
- def text(str)
83
+ def attr(key, value)
84
+ return unless WHITELISTED_ATTRS.include?(key)
85
+
54
86
  el = @elements.last
55
- el && el.attributes[:text].nil? && el.attributes[:text] = str
87
+ el&.add(key, value)
88
+ end
89
+
90
+ def text(str)
91
+ el = @elements.detect(&:open)
92
+ el&.append_text(str)
56
93
  end
57
94
 
58
95
  private
59
96
 
60
- def stop?(name, elem)
61
- name == WHITELISTED_ATTRS.last && elem.name == :img
97
+ def find_element(elem)
98
+ FINDERS.each_pair do |finder, attribute|
99
+ next if @properties[attribute]
100
+
101
+ content = finder.call(elem)
102
+
103
+ if content
104
+ @properties[attribute] = content
105
+ break
106
+ end
107
+ end
108
+ end
109
+
110
+ def add_element?(tag)
111
+ return true unless FALLBACK_ELEMENTS.keys.include?(tag)
112
+ return false if @counts.include?(tag)
113
+
114
+ @counts.add(tag)
115
+
116
+ !find(:meta, property: FALLBACK_ELEMENTS.fetch(tag))
62
117
  end
63
118
  end
64
119
  end
data/richurls.gemspec CHANGED
@@ -3,7 +3,7 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
3
3
 
4
4
  Gem::Specification.new do |spec|
5
5
  spec.name = 'richurls'
6
- spec.version = '0.3.0'
6
+ spec.version = '0.4.0'
7
7
  spec.authors = ['grdw']
8
8
  spec.email = ['gerard@wetransfer.com']
9
9
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: richurls
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - grdw
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-03-17 00:00:00.000000000 Z
11
+ date: 2020-03-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: oj
@@ -124,17 +124,21 @@ files:
124
124
  - README.md
125
125
  - lib/body_decorator.rb
126
126
  - lib/cache.rb
127
- - lib/parsers/description_parser.rb
127
+ - lib/el.rb
128
+ - lib/finders.rb
129
+ - lib/finders/favicon.rb
130
+ - lib/finders/meta_description.rb
131
+ - lib/finders/meta_image.rb
132
+ - lib/finders/meta_title.rb
128
133
  - lib/parsers/embed_parser.rb
129
134
  - lib/parsers/embed_parsers/base.rb
130
135
  - lib/parsers/embed_parsers/paste.rb
131
136
  - lib/parsers/embed_parsers/spotify.rb
132
137
  - lib/parsers/embed_parsers/youtube.rb
133
138
  - lib/parsers/embed_parsers/youtube_short.rb
134
- - lib/parsers/favicon_parser.rb
135
- - lib/parsers/image_parser.rb
139
+ - lib/parsers/property.rb
136
140
  - lib/parsers/provider_display_parser.rb
137
- - lib/parsers/title_parser.rb
141
+ - lib/parsers/url.rb
138
142
  - lib/richurls.rb
139
143
  - lib/url_fetcher.rb
140
144
  - lib/url_helper.rb
data/lib/parsers/description_parser.rb DELETED
@@ -1,7 +0,0 @@
1
- module Parsers
2
- DescriptionParser = lambda do |document, _|
3
- meta_el = document.find(:meta, property: 'og:description')
4
-
5
- meta_el && meta_el.attributes[:content]
6
- end
7
- end
data/lib/parsers/favicon_parser.rb DELETED
@@ -1,18 +0,0 @@
1
- module Parsers
2
- module FaviconParser
3
- KEYWORDS = ['shortcut icon', 'icon shortcut', 'icon'].freeze
4
-
5
- def self.call(document, url)
6
- KEYWORDS.each do |rel|
7
- found_document = document.find(:link, rel: rel)
8
-
9
- if found_document
10
- @el = found_document
11
- break
12
- end
13
- end
14
-
15
- @el && UrlHelper.url_for(url, @el.attributes[:href])
16
- end
17
- end
18
- end
data/lib/parsers/image_parser.rb DELETED
@@ -1,14 +0,0 @@
1
- module Parsers
2
- class ImageParser
3
- def self.call(document, url)
4
- meta_image = document.find(:meta, property: 'og:image')
5
- image_tag = document.find(:img)
6
-
7
- image_source =
8
- (meta_image && meta_image.attributes[:content]) ||
9
- (image_tag && image_tag.attributes[:src])
10
-
11
- image_source && UrlHelper.url_for(url, image_source)
12
- end
13
- end
14
- end
data/lib/parsers/title_parser.rb DELETED
@@ -1,9 +0,0 @@
1
- module Parsers
2
- TitleParser = lambda do |document, _|
3
- meta_el = document.find(:meta, property: 'og:title')
4
- title_el = document.find(:title)
5
-
6
- meta_el && meta_el.attributes[:content] ||
7
- title_el && title_el.attributes[:text]
8
- end
9
- end