uts58 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.github/workflows/ci.yml +21 -0
- data/Gemfile +5 -0
- data/README.md +76 -0
- data/lib/uts58/constants.rb +138233 -0
- data/lib/uts58/extractor.rb +201 -0
- data/lib/uts58.rb +46 -0
- data/uts58.gemspec +27 -0
- metadata +117 -0
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
require 'simpleidn'
|
|
3
|
+
require 'public_suffix'
|
|
4
|
+
require_relative 'constants'
|
|
5
|
+
|
|
6
|
+
module Uts58
  # Finds web links in arbitrary text per UTS #58. The public API
  # mirrors Twitter::TwitterText::Extractor closely enough that
  # twitter-text consumers (notably Mastodon) can swap one for the
  # other.
  #
  # Instances carry only optional configuration (see #max_length=); if
  # you don't need to set anything, the module-level
  # Uts58.extract_urls and Uts58.extract_urls_with_indices shortcuts
  # are simpler.
  class Extractor
    # Codepoints that end a path component in addition to the
    # termination classes: "#" (35), "/" (47) and "?" (63).
    PATH_CLOSERS = [35, 47, 63].freeze
    # Codepoints that end a query: "#" (35).
    QUERY_CLOSERS = [35].freeze # how about &?
    # A fragment has no extra closers; only the termination classes end it.
    FRAGMENT_CLOSERS = [].freeze

    # Zero-width trigger: a position that is not preceded by a
    # hostname-ish character and is followed by a label that ends in a
    # dot or colon.
    TRIGGER = /(?<![-\p{Alnum}\p{M}.\/])(?=\p{Alnum}[-\p{L}\p{N}\p{M}\u00DF\u03C2\u06FD\u06FE\u0F0B\u3007]*[\.:。])/

    # Optional run of letters from scripts written without spaces,
    # followed by an explicit http/https scheme. Anchored with \A so
    # that only the very start of the candidate is considered; "^"
    # would also match after any newline further along in the text.
    SCHEME = /\A([\p{Han}\p{Hiragana}\p{Katakana}\p{Hangul}\p{Thai}\p{Lao}\p{Khmer}\p{Myanmar}]*?)(https?:\/\/)/i

    # Two to five dot-separated labels at the very start of the
    # candidate. Deliberately sloppy; PublicSuffix has the final say.
    HOST_PREFIX = /\A([-\p{L}\p{N}\p{M}\u00DF\u03C2\u06FD\u06FE\u0F0B\u3007]+[\.。]){1,4}[-\p{L}\p{N}\p{M}]+(?![-\p{L}\p{N}\p{M}])/

    # A port directly after the hostname.
    PORT = /\A:(\d+)/

    private_constant :TRIGGER, :SCHEME, :HOST_PREFIX, :PORT

    # Maximum allowed length of the matched text, in input codepoints.
    # Matches whose input span exceeds this are dropped from the result
    # of #extract_urls_with_indices.
    #
    # "Matched text" means the substring that came out of +text+ — for
    # example 11 for <tt>"example.com"</tt>. The returned +:url+ can
    # be both longer and shorter, most commonly when a missing scheme
    # is filled in ( +"https://example.com"+ is 19 codepoints). The
    # limit is measured against the input, not against the returned
    # URL.
    #
    # Default is +nil+, meaning no limit.
    attr_accessor :max_length

    def initialize
      @max_length = nil
    end

    # Returns every URL found in +text+ as a list of hashes:
    #
    #   { url: String, indices: [start, end] }
    #
    # +url+ is the cleaned-up form: any A-labels in the hostname are
    # decoded to U-labels, and the scheme is filled in as +https://+
    # if the input had none. +indices+ are codepoint offsets into
    # +text+, with +end+ exclusive, so <tt>text[start...end]</tt>
    # gives the substring that matched.
    #
    # Note that the span between start and end may not match the
    # length of +url+. One very common example is input like
    # "foo example.com bar", where the URL will be
    # https://example.com, including "https://".
    #
    # Candidates are skipped (never raised on) when the hostname is
    # rejected by PublicSuffix, when its punycode cannot be decoded,
    # when an explicit port is outside 1..65535, or when the matched
    # span is longer than #max_length.
    #
    # Returns an empty array if +text+ contains no links. +options+ is
    # accepted for twitter-text compatibility and currently ignored.
    def extract_urls_with_indices(text, options = {})
      result = []
      text.to_enum(:scan, TRIGGER).map { Regexp.last_match }.each do |match|
        # Get rid of a leading protocol. We also tolerate letters of
        # unspaced scripts between the trigger and the scheme, so that
        # input like "テストhttp://example.com" attaches the scheme
        # correctly: the trigger fires at offset 0 (the start of
        # "テスト") because nothing precedes it, and the actual link
        # begins three codepoints later.
        candidate = match.post_match
        scheme_match = SCHEME.match(candidate)
        if scheme_match
          scheme_offset = scheme_match[1].length
          proto = scheme_match[2]
          candidate = scheme_match.post_match
        else
          scheme_offset = 0
          proto = "https://"
        end

        # Look for the prefix that might be a hostname or an IDN.
        prefix = HOST_PREFIX.match(candidate)
        next unless prefix && prefix[0].length < 254

        hn = decode_hostname(prefix[0])
        next unless hn

        # At this point we do have enough to mark something, the
        # question is how much. There may be a trailing port, then a
        # path, then a query, finally a fragment.
        rest = prefix.post_match
        port = PORT.match(rest)
        if port
          # a port number must be 1..65535
          n = port[1].to_i
          next if n < 1 || n > 65535
          rest = port.post_match
        end
        # Each pass consumes one "/segment"; repeat while more follow.
        rest = skip_component(rest, PATH_CLOSERS) while rest[0] == "/"
        rest = skip_component(rest, QUERY_CLOSERS) if rest[0] == "?"
        rest = skip_component(rest, FRAGMENT_CLOSERS) if rest[0] == "#"

        # rest is now whatever follows the link; everything between
        # the hostname and rest belongs to the URL.
        rest_length = prefix.post_match.length - rest.length
        match_length = match.post_match.length - rest.length - scheme_offset
        next if @max_length && match_length > @max_length

        start = match.begin(0) + scheme_offset
        result << {
          url: "#{proto}#{hn}#{prefix.post_match[...rest_length]}",
          indices: [start, start + match_length]
        }
      end
      result
    end

    # Returns just the URLs found in +text+, as an array of strings,
    # in the order they occur. Use #extract_urls_with_indices instead
    # if you also need the offsets, e.g. for adding HTML markup or for
    # pairing the found links with the form used in the text.
    #
    # For text such as "a example.com b", this returns ["https://example.com"].
    def extract_urls(text, options = {})
      extract_urls_with_indices(text, options).map { |r| r[:url] }
    end

    # Given a list of entities (hashes with an +:indices+ key of the
    # shape <tt>[start, end]</tt>, as produced by
    # #extract_urls_with_indices) drops every entity that overlaps an
    # earlier kept one and returns the survivors, ordered by start.
    #
    # Useful when merging the output of several extractors (URLs,
    # mentions, hashtags, …), or when #extract_urls_with_indices itself
    # finds several partly overlapping candidate URLs. The algorithm
    # prefers entries that start earlier; ties are broken by input
    # order.
    #
    # The input array is not modified.
    def remove_overlapping_entities(entities)
      # sort_by alone is not stable, so the input position is part of
      # the key; that is what makes "ties are broken by input order"
      # hold.
      ordered = entities.each_with_index.sort_by do |entity, position|
        [entity[:indices].first, position]
      end

      last_kept = nil
      ordered.filter_map do |entity, _position|
        next if last_kept && last_kept[:indices].last > entity[:indices].first

        last_kept = entity
      end
    end

    private

    # Normalises a raw hostname match (ideographic full stops become
    # ASCII dots, A-labels are decoded) and checks it with
    # PublicSuffix. Returns the decoded hostname, or nil if this is
    # evidently not the start of a link.
    def decode_hostname(raw)
      hn = SimpleIDN.to_unicode(raw.gsub(/。/, "."))
      about = PublicSuffix.parse(hn,
                                 ignore_private: true,
                                 default_rule: nil)
      hn if about && about.tld != "invalid"
    rescue PublicSuffix::DomainInvalid, PublicSuffix::DomainNotAllowed
      # evidently we're not looking at the start of a link
      nil
    rescue SimpleIDN::ConversionError
      # "xn--" label that is not valid punycode: not a link either
      nil
    end

    # Termination class recorded for +cp+ in Constants::TERMINATION
    # (:hard, :soft, :open or :close are the ones handled here), or
    # nil if the codepoint has no entry.
    def termination_class(cp)
      Constants::TERMINATION[cp] if Constants::TERMINATION.include?(cp)
    end

    # True if, after skipping the run of :soft codepoints starting at
    # index +i+, the input either ends or continues with a :hard
    # codepoint.
    def followed_by_hard(codepoints, i)
      j = i
      j += 1 while j < codepoints.length && termination_class(codepoints[j]) == :soft
      j >= codepoints.length || termination_class(codepoints[j]) == :hard
    end

    # Consumes one URL component from the front of +string+ and
    # returns what is left over. The first character is the lead-in
    # ("/", "?" or "#") and is always consumed. The component ends at
    # the first of: a codepoint in +extra_closers+, a :hard
    # terminator, a :soft terminator that #followed_by_hard, or a
    # :close codepoint that does not pair with the most recent
    # unmatched :open codepoint.
    def skip_component(string, extra_closers)
      openers = []
      codepoints = string.codepoints
      codepoints.each_with_index do |cp, i|
        next if i.zero? # it's the lead-in character
        return string[i..] if extra_closers.include?(cp)

        case termination_class(cp)
        when :hard
          return string[i..]
        when :soft
          return string[i..] if followed_by_hard(codepoints, i)
        when :close
          return string[i..] unless Constants::OPENERS[cp] == openers.last

          openers.pop
        when :open
          openers << cp
        end
        # anything else is a letter or something like that
      end
      # Input ran out before any terminator did: the whole component
      # belongs to the URL, so there is nothing left over.
      ""
    end
  end
end
|
data/lib/uts58.rb
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Ruby implementation of {UTS #58}[https://www.unicode.org/reports/tr58/],
|
|
4
|
+
# the Unicode spec for finding links in running text.
|
|
5
|
+
#
|
|
6
|
+
# The two entry points below are module-level shortcuts around a single
|
|
7
|
+
# memoised Uts58::Extractor instance. They also strip partly
|
|
8
|
+
# overlapping links — if the extractor finds two candidates that share
|
|
9
|
+
# any characters in the input, the wrappers keep the earlier one and
|
|
10
|
+
# drop the rest. Use Uts58::Extractor directly if you want the
|
|
11
|
+
# raw, possibly-overlapping list (e.g. to merge with hashtag/mention
|
|
12
|
+
# extractors before resolving overlap yourself).
|
|
13
|
+
#
|
|
14
|
+
# Uts58.extract_urls("see example.com here")
|
|
15
|
+
# # => ["https://example.com"]
|
|
16
|
+
#
|
|
17
|
+
# Uts58.extract_urls_with_indices("see example.com here")
|
|
18
|
+
# # => [{ url: "https://example.com", indices: [4, 15] }]
|
|
19
|
+
module Uts58
  VERSION = "0.1.0"

  # Runs the shared extractor over +text+ and then passes its result
  # through Uts58::Extractor#remove_overlapping_entities, so that
  # candidates overlapping an earlier one are dropped. +options+ is
  # handed to the extractor unchanged.
  def self.extract_urls_with_indices(text, options = {})
    candidates = extractor.extract_urls_with_indices(text, options)
    extractor.remove_overlapping_entities(candidates)
  end

  # Same as .extract_urls_with_indices, but returns only the +:url+
  # value of each surviving entry.
  def self.extract_urls(text, options = {})
    extract_urls_with_indices(text, options).map { |entry| entry[:url] }
  end

  # The Extractor instance shared by the shortcuts above; created on
  # first use and reused afterwards.
  def self.extractor
    @extractor ||= Extractor.new
  end
  private_class_method :extractor
end
|
|
45
|
+
|
|
46
|
+
require_relative "uts58/extractor"
|
data/uts58.gemspec
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
Gem::Specification.new do |gem|
  # Identity and descriptive fields.
  gem.name          = "uts58"
  gem.version       = "0.1.0"
  gem.authors       = ["Arnt Gulbrandsen"]
  gem.email         = ["arnt@gulbrandsen.priv.no"]

  gem.summary       = "Ruby implementation of Unicode UTS58"
  gem.description   = "Ruby code to detect links in text, as specified by UTS58"
  gem.homepage      = "https://github.com/arnt/uts58"
  gem.required_ruby_version = Gem::Requirement.new(">= 3.1")
  gem.licenses      = ["BSD-2-Clause"]

  # Package every file tracked by git, except those under the
  # directories matched below. `git ls-files -z` separates names with
  # NUL bytes; its stderr is discarded.
  gem.files = Dir.chdir(File.expand_path("..", __FILE__)) do
    tracked = `git ls-files -z 2>/dev/null`.split("\x0")
    tracked.reject do |path|
      path.match(%r{^(bin|test|spec|features|rfcs|tools)/})
    end
  end
  gem.require_paths = ["lib"]

  # Runtime dependencies.
  gem.add_dependency "public_suffix", "~> 6.0"
  gem.add_dependency "simpleidn", "~> 0.2"

  # Development-only dependencies.
  gem.add_development_dependency "byebug", "~> 12.0"
  gem.add_development_dependency "diff-lcs", "~> 1.5.1"
  gem.add_development_dependency "rspec", "~> 3.0"
end
|
metadata
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: uts58
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.1.0
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- Arnt Gulbrandsen
|
|
8
|
+
bindir: bin
|
|
9
|
+
cert_chain: []
|
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
|
11
|
+
dependencies:
|
|
12
|
+
- !ruby/object:Gem::Dependency
|
|
13
|
+
name: public_suffix
|
|
14
|
+
requirement: !ruby/object:Gem::Requirement
|
|
15
|
+
requirements:
|
|
16
|
+
- - "~>"
|
|
17
|
+
- !ruby/object:Gem::Version
|
|
18
|
+
version: '6.0'
|
|
19
|
+
type: :runtime
|
|
20
|
+
prerelease: false
|
|
21
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
22
|
+
requirements:
|
|
23
|
+
- - "~>"
|
|
24
|
+
- !ruby/object:Gem::Version
|
|
25
|
+
version: '6.0'
|
|
26
|
+
- !ruby/object:Gem::Dependency
|
|
27
|
+
name: simpleidn
|
|
28
|
+
requirement: !ruby/object:Gem::Requirement
|
|
29
|
+
requirements:
|
|
30
|
+
- - "~>"
|
|
31
|
+
- !ruby/object:Gem::Version
|
|
32
|
+
version: '0.2'
|
|
33
|
+
type: :runtime
|
|
34
|
+
prerelease: false
|
|
35
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
36
|
+
requirements:
|
|
37
|
+
- - "~>"
|
|
38
|
+
- !ruby/object:Gem::Version
|
|
39
|
+
version: '0.2'
|
|
40
|
+
- !ruby/object:Gem::Dependency
|
|
41
|
+
name: byebug
|
|
42
|
+
requirement: !ruby/object:Gem::Requirement
|
|
43
|
+
requirements:
|
|
44
|
+
- - "~>"
|
|
45
|
+
- !ruby/object:Gem::Version
|
|
46
|
+
version: '12.0'
|
|
47
|
+
type: :development
|
|
48
|
+
prerelease: false
|
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
50
|
+
requirements:
|
|
51
|
+
- - "~>"
|
|
52
|
+
- !ruby/object:Gem::Version
|
|
53
|
+
version: '12.0'
|
|
54
|
+
- !ruby/object:Gem::Dependency
|
|
55
|
+
name: diff-lcs
|
|
56
|
+
requirement: !ruby/object:Gem::Requirement
|
|
57
|
+
requirements:
|
|
58
|
+
- - "~>"
|
|
59
|
+
- !ruby/object:Gem::Version
|
|
60
|
+
version: 1.5.1
|
|
61
|
+
type: :development
|
|
62
|
+
prerelease: false
|
|
63
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
64
|
+
requirements:
|
|
65
|
+
- - "~>"
|
|
66
|
+
- !ruby/object:Gem::Version
|
|
67
|
+
version: 1.5.1
|
|
68
|
+
- !ruby/object:Gem::Dependency
|
|
69
|
+
name: rspec
|
|
70
|
+
requirement: !ruby/object:Gem::Requirement
|
|
71
|
+
requirements:
|
|
72
|
+
- - "~>"
|
|
73
|
+
- !ruby/object:Gem::Version
|
|
74
|
+
version: '3.0'
|
|
75
|
+
type: :development
|
|
76
|
+
prerelease: false
|
|
77
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
78
|
+
requirements:
|
|
79
|
+
- - "~>"
|
|
80
|
+
- !ruby/object:Gem::Version
|
|
81
|
+
version: '3.0'
|
|
82
|
+
description: Ruby code to detect links in text, as specified by UTS58
|
|
83
|
+
email:
|
|
84
|
+
- arnt@gulbrandsen.priv.no
|
|
85
|
+
executables: []
|
|
86
|
+
extensions: []
|
|
87
|
+
extra_rdoc_files: []
|
|
88
|
+
files:
|
|
89
|
+
- ".github/workflows/ci.yml"
|
|
90
|
+
- Gemfile
|
|
91
|
+
- README.md
|
|
92
|
+
- lib/uts58.rb
|
|
93
|
+
- lib/uts58/constants.rb
|
|
94
|
+
- lib/uts58/extractor.rb
|
|
95
|
+
- uts58.gemspec
|
|
96
|
+
homepage: https://github.com/arnt/uts58
|
|
97
|
+
licenses:
|
|
98
|
+
- BSD-2-Clause
|
|
99
|
+
metadata: {}
|
|
100
|
+
rdoc_options: []
|
|
101
|
+
require_paths:
|
|
102
|
+
- lib
|
|
103
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
104
|
+
requirements:
|
|
105
|
+
- - ">="
|
|
106
|
+
- !ruby/object:Gem::Version
|
|
107
|
+
version: '3.1'
|
|
108
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
109
|
+
requirements:
|
|
110
|
+
- - ">="
|
|
111
|
+
- !ruby/object:Gem::Version
|
|
112
|
+
version: '0'
|
|
113
|
+
requirements: []
|
|
114
|
+
rubygems_version: 3.6.7
|
|
115
|
+
specification_version: 4
|
|
116
|
+
summary: Ruby implementation of Unicode UTS58
|
|
117
|
+
test_files: []
|