richurls 0.3.0 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +6 -0
- data/Gemfile.lock +1 -1
- data/lib/body_decorator.rb +13 -9
- data/lib/el.rb +41 -0
- data/lib/finders/favicon.rb +14 -0
- data/lib/finders/meta_description.rb +12 -0
- data/lib/finders/meta_image.rb +12 -0
- data/lib/finders/meta_title.rb +12 -0
- data/lib/finders.rb +7 -0
- data/lib/parsers/embed_parser.rb +22 -20
- data/lib/parsers/embed_parsers/base.rb +11 -9
- data/lib/parsers/embed_parsers/paste.rb +14 -12
- data/lib/parsers/embed_parsers/spotify.rb +24 -22
- data/lib/parsers/embed_parsers/youtube.rb +17 -15
- data/lib/parsers/embed_parsers/youtube_short.rb +12 -10
- data/lib/parsers/property.rb +5 -0
- data/lib/parsers/provider_display_parser.rb +4 -2
- data/lib/parsers/url.rb +5 -0
- data/lib/url_fetcher.rb +1 -1
- data/lib/xml_handler.rb +73 -18
- data/richurls.gemspec +1 -1
- metadata +10 -6
- data/lib/parsers/description_parser.rb +0 -7
- data/lib/parsers/favicon_parser.rb +0 -18
- data/lib/parsers/image_parser.rb +0 -14
- data/lib/parsers/title_parser.rb +0 -9
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 42cff335ffa60073310cdec6c1b16f2a1f1a4cfce0bde60835a76538a0f138aa
|
4
|
+
data.tar.gz: cdded0800498b21f24b5c89910da73fd5150492c4f9c7ccedcb5499d538b1423
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a5633b6852996d4938f002e2e1e4a90e1b25037ace20ba5da9303abc4dc073efa9ba5819134080d6c467dbe905733226cd66554e99edcd9ab57fbfd49bdc4a57
|
7
|
+
data.tar.gz: '08bce91b2a1c2a301b3eb288883621f7f7d607e56e36765ee00d5d09a84348f281cff0e48f61e074c9c394d5478b9c7140ca9a4f02eb37778e129c3ec8ed6277'
|
data/.rubocop.yml
CHANGED
@@ -27,6 +27,12 @@ Style/FrozenStringLiteralComment:
|
|
27
27
|
Enabled: false
|
28
28
|
Style/GuardClause:
|
29
29
|
Enabled: false
|
30
|
+
Style/HashEachMethods:
|
31
|
+
Enabled: true
|
32
|
+
Style/HashTransformKeys:
|
33
|
+
Enabled: true
|
34
|
+
Style/HashTransformValues:
|
35
|
+
Enabled: true
|
30
36
|
Naming/MemoizedInstanceVariableName:
|
31
37
|
Enabled: false
|
32
38
|
Style/RegexpLiteral:
|
data/Gemfile.lock
CHANGED
data/lib/body_decorator.rb
CHANGED
@@ -2,26 +2,30 @@ require 'ox'
|
|
2
2
|
|
3
3
|
require_relative 'xml_handler'
|
4
4
|
require_relative 'url_helper'
|
5
|
-
require_relative 'parsers/
|
6
|
-
require_relative 'parsers/
|
7
|
-
require_relative 'parsers/image_parser'
|
5
|
+
require_relative 'parsers/property'
|
6
|
+
require_relative 'parsers/url'
|
8
7
|
require_relative 'parsers/embed_parser'
|
9
8
|
require_relative 'parsers/provider_display_parser'
|
10
|
-
require_relative 'parsers/favicon_parser'
|
11
9
|
|
12
10
|
module RichUrls
|
13
11
|
class BodyDecorator
|
14
12
|
NoXMLError = Class.new(StandardError)
|
15
13
|
|
16
14
|
PARSERS = {
|
17
|
-
'title' => Parsers::
|
18
|
-
'description' => Parsers::
|
19
|
-
'image' => Parsers::
|
15
|
+
'title' => Parsers::Property,
|
16
|
+
'description' => Parsers::Property,
|
17
|
+
'image' => Parsers::Url,
|
18
|
+
'favicon' => Parsers::Url,
|
20
19
|
'provider_display' => Parsers::ProviderDisplayParser,
|
21
|
-
'favicon' => Parsers::FaviconParser,
|
22
20
|
'embed' => Parsers::EmbedParser
|
23
21
|
}.freeze
|
24
22
|
|
23
|
+
def self.decorate(url, body)
|
24
|
+
new(url, body).decorate
|
25
|
+
end
|
26
|
+
|
27
|
+
private_class_method :new
|
28
|
+
|
25
29
|
def initialize(url, body)
|
26
30
|
@url = url
|
27
31
|
@xml = XMLHandler.new
|
@@ -37,7 +41,7 @@ module RichUrls
|
|
37
41
|
|
38
42
|
def decorate
|
39
43
|
PARSERS.each_with_object({}) do |(key, parser), object|
|
40
|
-
object[key] = parser.call(@xml, @url)
|
44
|
+
object[key] = parser.call(@xml.properties[key], @url)
|
41
45
|
end
|
42
46
|
end
|
43
47
|
end
|
data/lib/el.rb
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
module RichUrls
  # Record of a single element collected while parsing a document: its tag,
  # whether it is still open, and the attributes (including accumulated
  # text, stored under the :text key) gathered for it.
  class El
    # Text is no longer accumulated once the buffer has grown past this
    # many characters.
    MAX_TEXT_LENGTH = 1000

    attr_reader :tag, :open, :attributes

    # @param tag the element name this record stands for
    def initialize(tag)
      @tag = tag
      @open = true
      @attributes = {}
    end

    # Stores an attribute. The first truthy value recorded for a key wins;
    # later values for the same key are ignored.
    def add(key, value)
      return if @attributes[key]

      @attributes[key] = value
    end

    # Appends the stripped +str+ followed by a single space, truncating it so
    # the text portion never pushes the buffer past MAX_TEXT_LENGTH.
    def append_text(str)
      # Unary plus guarantees a mutable buffer even under frozen literals.
      buffer = (@attributes[:text] ||= +'')
      return if buffer.length > MAX_TEXT_LENGTH

      remaining = MAX_TEXT_LENGTH - buffer.length
      buffer << str.strip[0...remaining] << ' '
    end

    # @return [String, nil] the accumulated text without surrounding
    #   whitespace, or nil when no text was ever appended
    def text
      @attributes[:text]&.strip
    end

    # Marks the element as closed.
    def close!
      @open = false
    end
  end
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
module RichUrls
  module Finders
    # Extracts the favicon location from a <link> element.
    module Favicon
      KEYWORDS = ['shortcut icon', 'icon shortcut', 'icon'].freeze

      # @param elem an object responding to #tag and #attributes
      # @return [String, nil] the element's :href when it is a :link whose
      #   :rel is one of KEYWORDS, otherwise nil
      def self.call(elem)
        return unless elem.tag == :link

        # rel is a whitespace-separated, case-insensitive token list, so
        # normalise case and spacing before comparing against KEYWORDS.
        rel = elem.attributes[:rel].to_s.downcase.split.join(' ')
        return unless KEYWORDS.include?(rel)

        elem.attributes[:href]
      end
    end
  end
end
|
data/lib/finders.rb
ADDED
data/lib/parsers/embed_parser.rb
CHANGED
@@ -1,28 +1,30 @@
|
|
1
|
-
module
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
1
|
+
module RichUrls
  module Parsers
    # Produces embed markup for a URL by delegating to the first embed
    # parser that recognises it.
    class EmbedParser
      require_relative 'embed_parsers/base'
      require_relative 'embed_parsers/youtube'
      require_relative 'embed_parsers/youtube_short'
      require_relative 'embed_parsers/paste'
      require_relative 'embed_parsers/spotify'

      # Tried in order; the first parser whose #match? is truthy is used.
      PARSERS = [Youtube, YoutubeShort, Paste, Spotify].freeze

      # @param _ unused (kept so every parser shares the same call signature)
      # @param url the URL to inspect
      # @return the matching parser's #parse result, or nil when none matches
      def self.call(_, url)
        uri = URI(url)

        # Lazy so parsers after the first match are never instantiated.
        candidates = PARSERS.lazy.map { |klass| klass.new(uri) }
        candidates.find(&:match?)&.parse
      end
    end
  end
end
|
@@ -1,14 +1,16 @@
|
|
1
|
-
module
|
2
|
-
|
3
|
-
class
|
4
|
-
|
5
|
-
|
6
|
-
|
1
|
+
module RichUrls
  module Parsers
    class EmbedParser
      # Common base for embed parsers: keeps the URI under inspection and
      # exposes its decoded query parameters to subclasses.
      class Base
        # @param uri [URI] the URI to inspect
        def initialize(uri)
          @uri = uri
        end

        private

        # Decoded query parameters as a Hash (memoised).
        #
        # A URI without a query string has a nil #query, which
        # URI.decode_www_form cannot handle; #to_s turns that into an empty
        # string so the result is simply an empty Hash.
        #
        # @return [Hash{String => String}]
        def query
          @query ||= URI.decode_www_form(@uri.query.to_s).to_h
        end
      end
    end
  end
end
|
@@ -1,18 +1,20 @@
|
|
1
|
-
module
|
2
|
-
|
3
|
-
class
|
4
|
-
|
5
|
-
|
6
|
-
|
1
|
+
module RichUrls
  module Parsers
    class EmbedParser
      # Builds the embed iframe for pasteapp.com URLs shaped like
      # /p/<id>?view=<view>.
      class Paste < Base
        IFRAME = '<iframe src="https://pasteapp.com/p/%s/embed?view=%s" '\
                 'width="480" height="480" scrolling="no" '\
                 'frameborder="0" allowfullscreen></iframe>'.freeze

        # Anchored at the start so only paths beginning with /p/<id> qualify;
        # an unanchored pattern would also accept e.g. /foo/p/<id>.
        PATH = %r{\A/p/[a-zA-Z0-9]+}.freeze

        # True only when #parse has everything it needs: the right host, a
        # /p/<id> path and a `view` query parameter.
        def match?
          @uri.host == 'pasteapp.com' &&
            PATH.match?(@uri.path) &&
            !@uri.query.nil? && # avoid decoding a missing query string
            query.key?('view')
        end

        # @return [String] the iframe markup
        # @raise [KeyError] when called without a `view` parameter
        #   (#match? guards against this)
        def parse
          path_id = @uri.path.sub(%r{\A/p/}, '')
          # Query values are percent-decoded, so re-encode before placing the
          # value inside the src attribute to keep quotes and angle brackets
          # from breaking out of it.
          view = URI.encode_www_form_component(query.fetch('view'))

          IFRAME % [path_id, view]
        end
      end
    end
  end
end
|
@@ -1,30 +1,32 @@
|
|
1
|
-
module
|
2
|
-
|
3
|
-
class
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
1
|
+
module RichUrls
  module Parsers
    class EmbedParser
      # Builds the embed iframe for open.spotify.com album, track and
      # playlist URLs (/<scope>/<id>).
      class Spotify < Base
        IFRAME = '<iframe src="https://open.spotify.com/embed/%s/%s" '\
                 'width="300" height="380" frameborder="0" '\
                 'allowtransparency="true" allow="encrypted-media">'\
                 '</iframe>'.freeze

        SCOPES = %w[
          album
          track
          playlist
        ].freeze

        # True when the host is open.spotify.com and the path carries both a
        # supported scope and an id.
        def match?
          @uri.host == 'open.spotify.com' && !scope_and_id.nil?
        end

        # @return [String] the iframe markup
        def parse
          IFRAME % scope_and_id
        end

        private

        # First two path segments as [scope, id], or nil when the scope is
        # not one of SCOPES or the id is missing.
        #
        # Works on a split copy of the path: the previous `path[0] = ''`
        # edited the string owned by the URI object, so the URI lost its
        # leading slash after the first #parse.
        def scope_and_id
          scope, id = @uri.path.split('/').reject(&:empty?)

          [scope, id] if SCOPES.include?(scope) && id
        end
      end
    end
  end
end
|
@@ -1,20 +1,22 @@
|
|
1
|
-
module
|
2
|
-
|
3
|
-
class
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
1
|
+
module RichUrls
  module Parsers
    class EmbedParser
      # Builds the embed iframe for www.youtube.com/watch?v=<id> URLs.
      class Youtube < Base
        IFRAME = '<iframe width="560" height="315" '\
                 'src="https://www.youtube.com/embed/%s" frameborder="0" '\
                 'allow="accelerometer; autoplay; encrypted-media; '\
                 'gyroscope; picture-in-picture" allowfullscreen>'\
                 '</iframe>'.freeze

        # True for /watch URLs on www.youtube.com that carry a `v` parameter.
        def match?
          @uri.host == 'www.youtube.com' &&
            @uri.path == '/watch' &&
            !@uri.query.nil? && # avoid decoding a missing query string
            query.key?('v')
        end

        # @return [String] the iframe markup
        # @raise [KeyError] when there is no `v` parameter (#match? guards
        #   against this)
        def parse
          # Query values are percent-decoded, so re-encode before placing the
          # id inside the src attribute to keep quotes and angle brackets
          # from breaking out of it.
          IFRAME % URI.encode_www_form_component(query.fetch('v'))
        end
      end
    end
  end
end
|
@@ -1,15 +1,17 @@
|
|
1
|
-
module
|
2
|
-
|
3
|
-
class
|
4
|
-
|
5
|
-
|
6
|
-
|
1
|
+
module RichUrls
  module Parsers
    class EmbedParser
      # Builds the YouTube embed iframe for youtu.be/<id> short links.
      class YoutubeShort < Base
        # True for youtu.be URLs whose path holds something after the
        # leading slash.
        def match?
          @uri.host == 'youtu.be' && !video_id.empty?
        end

        # @return [String] the iframe markup (shares Youtube::IFRAME)
        def parse
          Youtube::IFRAME % video_id
        end

        private

        # Path without its leading slash.
        #
        # Uses #sub, which returns a new string: the previous
        # `path[0] = ''` edited the string owned by the URI object and
        # raised IndexError when the path was empty.
        def video_id
          @uri.path.sub(%r{\A/}, '')
        end
      end
    end
  end
end
|
data/lib/parsers/url.rb
ADDED
data/lib/url_fetcher.rb
CHANGED
@@ -38,7 +38,7 @@ module RichUrls
|
|
38
38
|
response = session.get(@url)
|
39
39
|
|
40
40
|
if response.status < 400
|
41
|
-
decorated = BodyDecorator.
|
41
|
+
decorated = BodyDecorator.decorate(response.url, response.body)
|
42
42
|
RichUrls.cache.set(digest, Oj.dump(decorated))
|
43
43
|
decorated
|
44
44
|
else
|
data/lib/xml_handler.rb
CHANGED
@@ -1,12 +1,18 @@
|
|
1
|
+
require_relative 'el'
|
2
|
+
require_relative 'finders'
|
3
|
+
require_relative 'finders/meta_title'
|
4
|
+
require_relative 'finders/meta_description'
|
5
|
+
require_relative 'finders/meta_image'
|
6
|
+
require_relative 'finders/favicon'
|
7
|
+
|
1
8
|
module RichUrls
|
2
9
|
class XMLHandler < ::Ox::Sax
|
3
10
|
WHITELISTED_EL_NAMES = %i[
|
4
|
-
html
|
5
|
-
head
|
6
11
|
title
|
7
12
|
meta
|
8
13
|
link
|
9
14
|
img
|
15
|
+
p
|
10
16
|
].freeze
|
11
17
|
|
12
18
|
WHITELISTED_ATTRS = %i[
|
@@ -17,48 +23,97 @@ module RichUrls
|
|
17
23
|
src
|
18
24
|
].freeze
|
19
25
|
|
26
|
+
FALLBACK_ELEMENTS = {
|
27
|
+
img: 'og:image',
|
28
|
+
p: 'og:description',
|
29
|
+
title: 'og:title'
|
30
|
+
}.freeze
|
31
|
+
|
32
|
+
FINDERS = {
|
33
|
+
Finders::MetaTitle => 'title',
|
34
|
+
Finders::MetaDescription => 'description',
|
35
|
+
Finders::MetaImage => 'image',
|
36
|
+
Finders::Favicon => 'favicon',
|
37
|
+
Finders::Title => 'title',
|
38
|
+
Finders::Description => 'description',
|
39
|
+
Finders::Image => 'image'
|
40
|
+
}.freeze
|
41
|
+
|
20
42
|
StopParsingError = Class.new(StandardError)
|
21
|
-
El = Struct.new(:name, :attributes)
|
22
43
|
|
23
|
-
|
44
|
+
attr_reader :elements, :properties
|
24
45
|
|
25
46
|
def initialize
|
26
47
|
@elements = []
|
48
|
+
@counts = Set.new
|
49
|
+
@properties = {
|
50
|
+
'title' => nil,
|
51
|
+
'description' => nil,
|
52
|
+
'image' => nil,
|
53
|
+
'favicon' => nil
|
54
|
+
}
|
27
55
|
end
|
28
56
|
|
29
|
-
def find(
|
57
|
+
def find(tag, attrs = {})
|
30
58
|
@elements.detect do |el|
|
31
59
|
matching_attributes = attrs.all? { |k, v| el.attributes[k] == v }
|
32
60
|
|
33
|
-
el.
|
61
|
+
el.tag == tag && matching_attributes
|
34
62
|
end
|
35
63
|
end
|
36
64
|
|
37
|
-
def start_element(
|
38
|
-
return unless WHITELISTED_EL_NAMES.include?(
|
65
|
+
def start_element(tag)
|
66
|
+
return unless WHITELISTED_EL_NAMES.include?(tag)
|
39
67
|
|
40
|
-
@elements << El.new(
|
68
|
+
@elements << El.new(tag) if add_element?(tag)
|
41
69
|
end
|
42
70
|
|
43
|
-
def
|
44
|
-
|
71
|
+
def end_element(tag)
|
72
|
+
return unless WHITELISTED_EL_NAMES.include?(tag)
|
45
73
|
|
46
|
-
|
74
|
+
el = @elements.reverse_each.detect { |e| e.open && e.tag == tag }
|
75
|
+
return unless el
|
47
76
|
|
48
|
-
el.
|
77
|
+
el.close!
|
78
|
+
find_element(el)
|
49
79
|
|
50
|
-
raise StopParsingError if
|
80
|
+
raise StopParsingError if @properties.values.all?
|
51
81
|
end
|
52
82
|
|
53
|
-
def
|
83
|
+
def attr(key, value)
|
84
|
+
return unless WHITELISTED_ATTRS.include?(key)
|
85
|
+
|
54
86
|
el = @elements.last
|
55
|
-
el
|
87
|
+
el&.add(key, value)
|
88
|
+
end
|
89
|
+
|
90
|
+
def text(str)
|
91
|
+
el = @elements.detect(&:open)
|
92
|
+
el&.append_text(str)
|
56
93
|
end
|
57
94
|
|
58
95
|
private
|
59
96
|
|
60
|
-
def
|
61
|
-
|
97
|
+
def find_element(elem)
|
98
|
+
FINDERS.each_pair do |finder, attribute|
|
99
|
+
next if @properties[attribute]
|
100
|
+
|
101
|
+
content = finder.call(elem)
|
102
|
+
|
103
|
+
if content
|
104
|
+
@properties[attribute] = content
|
105
|
+
break
|
106
|
+
end
|
107
|
+
end
|
108
|
+
end
|
109
|
+
|
110
|
+
def add_element?(tag)
|
111
|
+
return true unless FALLBACK_ELEMENTS.keys.include?(tag)
|
112
|
+
return false if @counts.include?(tag)
|
113
|
+
|
114
|
+
@counts.add(tag)
|
115
|
+
|
116
|
+
!find(:meta, property: FALLBACK_ELEMENTS.fetch(tag))
|
62
117
|
end
|
63
118
|
end
|
64
119
|
end
|
data/richurls.gemspec
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: richurls
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- grdw
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-03-
|
11
|
+
date: 2020-03-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: oj
|
@@ -124,17 +124,21 @@ files:
|
|
124
124
|
- README.md
|
125
125
|
- lib/body_decorator.rb
|
126
126
|
- lib/cache.rb
|
127
|
-
- lib/
|
127
|
+
- lib/el.rb
|
128
|
+
- lib/finders.rb
|
129
|
+
- lib/finders/favicon.rb
|
130
|
+
- lib/finders/meta_description.rb
|
131
|
+
- lib/finders/meta_image.rb
|
132
|
+
- lib/finders/meta_title.rb
|
128
133
|
- lib/parsers/embed_parser.rb
|
129
134
|
- lib/parsers/embed_parsers/base.rb
|
130
135
|
- lib/parsers/embed_parsers/paste.rb
|
131
136
|
- lib/parsers/embed_parsers/spotify.rb
|
132
137
|
- lib/parsers/embed_parsers/youtube.rb
|
133
138
|
- lib/parsers/embed_parsers/youtube_short.rb
|
134
|
-
- lib/parsers/
|
135
|
-
- lib/parsers/image_parser.rb
|
139
|
+
- lib/parsers/property.rb
|
136
140
|
- lib/parsers/provider_display_parser.rb
|
137
|
-
- lib/parsers/
|
141
|
+
- lib/parsers/url.rb
|
138
142
|
- lib/richurls.rb
|
139
143
|
- lib/url_fetcher.rb
|
140
144
|
- lib/url_helper.rb
|
@@ -1,18 +0,0 @@
|
|
1
|
-
module Parsers
  # Resolves a document's favicon <link> to a URL.
  module FaviconParser
    # rel values that mark a favicon link, tried in this order.
    KEYWORDS = ['shortcut icon', 'icon shortcut', 'icon'].freeze

    # @param document an object responding to #find(tag, attrs)
    # @param url the page URL, passed to UrlHelper.url_for together with the
    #   link's href
    # @return the result of UrlHelper.url_for for the first matching link,
    #   or nil when the document has no favicon link
    def self.call(document, url)
      # The match is kept in a local. Storing it in a module-level instance
      # variable let a favicon found for one document leak into later calls
      # for documents that have none.
      KEYWORDS.each do |rel|
        link = document.find(:link, rel: rel)

        return UrlHelper.url_for(url, link.attributes[:href]) if link
      end

      nil
    end
  end
end
|
data/lib/parsers/image_parser.rb
DELETED
@@ -1,14 +0,0 @@
|
|
1
|
-
module Parsers
  # Picks an image for a document: the og:image meta tag when it has
  # content, otherwise the first <img> found.
  class ImageParser
    # @param document an object responding to #find(tag, attrs)
    # @param url the page URL, passed to UrlHelper.url_for together with the
    #   image source
    # @return the result of UrlHelper.url_for for the chosen source, or nil
    #   when neither an og:image content nor an <img> src is present
    def self.call(document, url)
      # `||` short-circuits, so the <img> lookup only runs when og:image
      # did not yield a source.
      source = attribute(document.find(:meta, property: 'og:image'), :content) ||
               attribute(document.find(:img), :src)

      source && UrlHelper.url_for(url, source)
    end

    # Reads +name+ from +element+'s attributes; nil when there is no element.
    def self.attribute(element, name)
      element.attributes[name] if element
    end
    private_class_method :attribute
  end
end
|
data/lib/parsers/title_parser.rb
DELETED