uts58 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,201 @@
1
+ # frozen_string_literal: true
2
+ require 'simpleidn'
3
+ require 'public_suffix'
4
+ require_relative 'constants'
5
+
6
module Uts58
  # Finds web links in arbitrary text per UTS #58. The public API
  # mirrors Twitter::TwitterText::Extractor closely enough that
  # twitter-text consumers (notably Mastodon) can swap one for the
  # other.
  #
  # Instances carry only optional configuration (see #max_length=); if
  # you don't need to set anything, the module-level
  # Uts58.extract_urls and Uts58.extract_urls_with_indices shortcuts
  # are simpler.
  class Extractor
    # Codepoints that end a path segment: "#", "/" and "?".
    PATH_CLOSERS = [35, 47, 63].freeze
    # Codepoints that end a query: "#". TODO: how about "&"?
    QUERY_CLOSERS = [35].freeze
    # A fragment has no closers of its own; only the termination table
    # ends it.
    FRAGMENT_CLOSERS = [].freeze

    # Zero-width trigger: a position that is not preceded by an
    # alphanumeric, a mark, "-", "." or "/", and that is followed by an
    # alphanumeric, a run of label characters and then ".", ":" or "。".
    TRIGGER = /(?<![-\p{Alnum}\p{M}.\/])(?=\p{Alnum}[-\p{L}\p{N}\p{M}\u00DF\u03C2\u06FD\u06FE\u0F0B\u3007]*[\.:。])/

    # Optional run of Han/Hiragana/Katakana/Hangul/Thai/Lao/Khmer/Myanmar
    # characters (capture 1) followed by an explicit http:// or https://
    # scheme (capture 2). Anchored with \A, not ^, so that a scheme at
    # the start of a *later line* of the input is never attached to the
    # current trigger position.
    SCHEME = /\A([\p{Han}\p{Hiragana}\p{Katakana}\p{Hangul}\p{Thai}\p{Lao}\p{Khmer}\p{Myanmar}]*?)(https?:\/\/)/i

    # Something that might be a hostname or an IDN: two to five
    # dot-separated labels. This is a somewhat sloppy match, with a few
    # false positives; PublicSuffix has the final word.
    HOSTNAME = /\A([-\p{L}\p{N}\p{M}\u00DF\u03C2\u06FD\u06FE\u0F0B\u3007]+[\.。]){1,4}[-\p{L}\p{N}\p{M}]+(?![-\p{L}\p{N}\p{M}])/

    # ":" followed by decimal digits, directly after the hostname.
    PORT = /\A:(\d+)/

    private_constant :TRIGGER, :SCHEME, :HOSTNAME, :PORT

    # Maximum allowed length of the matched text, in input codepoints.
    # Matches whose input span exceeds this are dropped from the result
    # of #extract_urls_with_indices.
    #
    # "Matched text" means the substring that came out of +text+ — for
    # example 11 for <tt>"example.com"</tt>. The returned +:url+ can
    # be both longer and shorter, most commonly when a missing scheme
    # is filled in ( +"https://example.com"+ is 19 codepoints). The
    # limit is measured against the input, not against the returned
    # URL.
    #
    # Default is +nil+, meaning no limit.
    attr_accessor :max_length

    def initialize
      @max_length = nil
    end

    # Returns every URL found in +text+ as a list of hashes:
    #
    #   { url: String, indices: [start, end] }
    #
    # +url+ is the cleaned-up form: the hostname is passed through
    # SimpleIDN.to_unicode, an ideographic full stop (。) between labels
    # is replaced by ".", and the scheme is filled in as +https://+ if
    # the input had none. +indices+ are codepoint offsets into +text+,
    # with +end+ exclusive, so <tt>text[start...end]</tt> gives the
    # substring that matched.
    #
    # Note that +end - start+ need not equal the length of +url+. One
    # very common example is input like "foo example.com bar", where
    # the URL will be https://example.com, including "https://".
    #
    # Returns an empty array if +text+ contains no links. +options+ is
    # accepted for twitter-text compatibility and currently ignored.
    def extract_urls_with_indices(text, options = {})
      # Collect every MatchData first; the per-candidate work below runs
      # further regex matches of its own.
      triggers = text.to_enum(:scan, TRIGGER).map { Regexp.last_match }
      triggers.filter_map { |trigger| link_at(trigger) }
    end

    # Returns just the URLs found in +text+, as an array of strings,
    # in the order they occur. Use #extract_urls_with_indices instead
    # if you also need the offsets, e.g. for adding HTML markup or for
    # pairing the found links with the form used in the text.
    #
    # For text such as "a example.com b", this returns ["https://example.com"].
    def extract_urls(text, options = {})
      extract_urls_with_indices(text, options).map { |entity| entity[:url] }
    end

    # Given a list of entities (hashes with an +:indices+ key of the
    # shape <tt>[start, end]</tt>, as produced by
    # #extract_urls_with_indices) drops every entity that overlaps an
    # earlier kept one and returns the survivors, ordered by start.
    #
    # Useful when merging the output of several extractors (URLs,
    # mentions, hashtags, …), or when #extract_urls_with_indices itself
    # finds several partly overlapping candidate URLs. The algorithm
    # prefers entries that start earlier; ties are broken by input
    # order.
    #
    # The input array is not modified.
    def remove_overlapping_entities(entities)
      # sort_by alone is not guaranteed to be stable, so the original
      # position is made part of the key to honour "ties are broken by
      # input order".
      ordered = entities.each_with_index
                        .sort_by { |entity, position| [entity[:indices].first, position] }
                        .map(&:first)
      kept_end = nil
      ordered.reject do |entity|
        overlaps = kept_end && kept_end > entity[:indices].first
        kept_end = entity[:indices].last unless overlaps
        overlaps
      end
    end

    private

    # Tries to read one link starting at the position of +trigger+ (a
    # MatchData produced by TRIGGER). Returns the entity hash described
    # at #extract_urls_with_indices, or +nil+ if the text at that
    # position is not a link or exceeds #max_length.
    def link_at(trigger)
      tail = trigger.post_match

      # Get rid of a leading scheme. Characters from a few scripts that
      # are written without spaces are tolerated between the trigger and
      # the scheme, so that input like "テストhttp://example.com" attaches
      # the scheme correctly: the trigger fires at offset 0 (the start
      # of "テスト") because nothing precedes it, and the actual link
      # begins three codepoints later.
      scheme = SCHEME.match(tail)
      if scheme
        scheme_offset = scheme[1].length
        proto = scheme[2]
        tail = scheme.post_match
      else
        scheme_offset = 0
        proto = "https://"
      end

      host = HOSTNAME.match(tail)
      return unless host && host[0].length < 254

      hn = SimpleIDN.to_unicode(host[0].tr("。", "."))
      about = PublicSuffix.parse(hn, ignore_private: true, default_rule: nil)
      return unless about && about.tld != "invalid"

      # At this point we do have enough to mark something, the question
      # is how much. There may be a trailing port, then a path, then a
      # query, finally a fragment.
      after_host = host.post_match
      rest = after_host
      port = PORT.match(rest)
      if port
        # a port number must be 1..65535
        return unless (1..65_535).cover?(port[1].to_i)

        rest = port.post_match
      end
      rest = skip_component(rest, PATH_CLOSERS) while rest[0] == "/"
      rest = skip_component(rest, QUERY_CLOSERS) if rest[0] == "?"
      rest = skip_component(rest, FRAGMENT_CLOSERS) if rest[0] == "#"

      # +rest+ is now the unconsumed tail, so lengths fall out by
      # subtraction.
      consumed_after_host = after_host.length - rest.length
      match_length = trigger.post_match.length - rest.length - scheme_offset
      return if @max_length && match_length > @max_length

      start = trigger.begin(0) + scheme_offset
      {
        url: "#{proto}#{hn}#{after_host[...consumed_after_host]}",
        indices: [start, start + match_length]
      }
    rescue PublicSuffix::DomainInvalid, PublicSuffix::DomainNotAllowed
      # evidently we're not looking at the start of a link
      nil
    rescue SimpleIDN::ConversionError
      # NOTE(review): simpleidn is believed to raise this for malformed
      # A-labels (e.g. a non-ASCII character after "xn--"); confirm
      # against the simpleidn version in use. Such text is not a link
      # and must not abort extraction of the rest of the input.
      nil
    end

    # Returns the Constants::TERMINATION entry for +cp+, or +nil+ when
    # the table has no entry for that codepoint.
    def termination_kind(cp)
      Constants::TERMINATION[cp] if Constants::TERMINATION.include?(cp)
    end

    # True if, after skipping the run of :soft terminators that starts
    # at index +i+ of +codepoints+, the input either ends or continues
    # with a :hard terminator.
    def followed_by_hard(codepoints, i)
      j = i
      j += 1 while j < codepoints.length && termination_kind(codepoints[j]) == :soft
      j >= codepoints.length || termination_kind(codepoints[j]) == :hard
    end

    # Consumes one URL component (path segment, query or fragment) from
    # the front of +string+ and returns what is left over. The first
    # character is the component's lead-in ("/", "?" or "#") and is
    # always consumed. The component ends at the first codepoint that
    #
    # * is listed in +extra_closers+, or
    # * is a :hard terminator, or
    # * is a :soft terminator for which #followed_by_hard holds, or
    # * is a :close terminator whose Constants::OPENERS entry is not
    #   the most recently seen unclosed :open terminator.
    def skip_component(string, extra_closers)
      openers = []
      codepoints = string.codepoints
      codepoints.each_with_index do |cp, i|
        next if i.zero? # it's the lead-in character
        return string[i..] if extra_closers.include?(cp)

        case termination_kind(cp)
        when :hard
          return string[i..]
        when :soft
          return string[i..] if followed_by_hard(codepoints, i)
        when :close
          return string[i..] unless Constants::OPENERS[cp] == openers.last

          openers.pop
        when :open
          openers << cp
        end
        # anything else is a letter or something like that
      end
      # Input ran out before any terminator did: the whole component
      # belongs to the URL, so there is nothing left over.
      ""
    end
  end
end
data/lib/uts58.rb ADDED
@@ -0,0 +1,46 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Ruby implementation of {UTS #58}[https://www.unicode.org/reports/tr58/],
4
+ # the Unicode spec for finding links in running text.
5
+ #
6
+ # The two entry points below are module-level shortcuts around a single
7
+ # memoised Uts58::Extractor instance. They also strip partly
8
+ # overlapping links — if the extractor finds two candidates that share
9
+ # any characters in the input, the wrappers keep the earlier one and
10
+ # drop the rest. Use Uts58::Extractor directly if you want the
11
+ # raw, possibly-overlapping list (e.g. to merge with hashtag/mention
12
+ # extractors before resolving overlap yourself).
13
+ #
14
+ # Uts58.extract_urls("see example.com here")
15
+ # # => ["https://example.com"]
16
+ #
17
+ # Uts58.extract_urls_with_indices("see example.com here")
18
+ # # => [{ url: "https://example.com", indices: [4, 15] }]
19
module Uts58
  VERSION = "0.1.0"

  # Runs Uts58::Extractor#extract_urls_with_indices on +text+ and then
  # passes the candidates through
  # Uts58::Extractor#remove_overlapping_entities, so overlapping
  # candidates are dropped rather than returned.
  def self.extract_urls_with_indices(text, options = {})
    shared = extractor
    candidates = shared.extract_urls_with_indices(text, options)
    shared.remove_overlapping_entities(candidates)
  end

  # Same as ::extract_urls_with_indices, but returns only the +:url+
  # string of each surviving entity.
  def self.extract_urls(text, options = {})
    extract_urls_with_indices(text, options).map { |entity| entity[:url] }
  end

  # The single Extractor instance shared by the module-level
  # shortcuts, created on first use.
  def self.extractor
    @extractor ||= Extractor.new
  end
  private_class_method :extractor
end
45
+
46
+ require_relative "uts58/extractor"
data/uts58.gemspec ADDED
@@ -0,0 +1,27 @@
1
+ # frozen_string_literal: true
2
+
3
Gem::Specification.new do |spec|
  spec.name          = "uts58"
  spec.version       = "0.1.0"
  spec.authors       = ["Arnt Gulbrandsen"]
  spec.email         = ["arnt@gulbrandsen.priv.no"]

  spec.summary       = %q{Ruby implementation of Unicode UTS58}
  spec.description   = %q{Ruby code to detect links in text, as specified by UTS58}
  spec.homepage      = "https://github.com/arnt/uts58"
  spec.required_ruby_version = Gem::Requirement.new(">= 3.1")
  spec.licenses = ["BSD-2-Clause"]

  # Package every file tracked by git, except development-only
  # directories. git is invoked directly (no shell, no "2>/dev/null")
  # so this also works where /dev/null does not exist; stderr goes to
  # the platform's null device instead. If git is not installed or
  # this is not a git checkout, the list is simply empty.
  tracked =
    begin
      IO.popen(%w[git ls-files -z], chdir: __dir__, err: File::NULL, &:read)
    rescue SystemCallError
      ""
    end
  spec.files         = tracked.split("\x0").reject { |f| f.match(%r{^(bin|test|spec|features|rfcs|tools)/}) }
  spec.require_paths = ["lib"]

  spec.add_dependency "public_suffix", "~> 6.0"
  spec.add_dependency "simpleidn", "~> 0.2"
  spec.add_development_dependency "byebug", "~> 12.0"
  spec.add_development_dependency "diff-lcs", '~> 1.5.1'
  spec.add_development_dependency "rspec", "~> 3.0"
end
metadata ADDED
@@ -0,0 +1,117 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: uts58
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Arnt Gulbrandsen
8
+ bindir: bin
9
+ cert_chain: []
10
+ date: 1980-01-02 00:00:00.000000000 Z
11
+ dependencies:
12
+ - !ruby/object:Gem::Dependency
13
+ name: public_suffix
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - "~>"
17
+ - !ruby/object:Gem::Version
18
+ version: '6.0'
19
+ type: :runtime
20
+ prerelease: false
21
+ version_requirements: !ruby/object:Gem::Requirement
22
+ requirements:
23
+ - - "~>"
24
+ - !ruby/object:Gem::Version
25
+ version: '6.0'
26
+ - !ruby/object:Gem::Dependency
27
+ name: simpleidn
28
+ requirement: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - "~>"
31
+ - !ruby/object:Gem::Version
32
+ version: '0.2'
33
+ type: :runtime
34
+ prerelease: false
35
+ version_requirements: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - "~>"
38
+ - !ruby/object:Gem::Version
39
+ version: '0.2'
40
+ - !ruby/object:Gem::Dependency
41
+ name: byebug
42
+ requirement: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - "~>"
45
+ - !ruby/object:Gem::Version
46
+ version: '12.0'
47
+ type: :development
48
+ prerelease: false
49
+ version_requirements: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - "~>"
52
+ - !ruby/object:Gem::Version
53
+ version: '12.0'
54
+ - !ruby/object:Gem::Dependency
55
+ name: diff-lcs
56
+ requirement: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - "~>"
59
+ - !ruby/object:Gem::Version
60
+ version: 1.5.1
61
+ type: :development
62
+ prerelease: false
63
+ version_requirements: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - "~>"
66
+ - !ruby/object:Gem::Version
67
+ version: 1.5.1
68
+ - !ruby/object:Gem::Dependency
69
+ name: rspec
70
+ requirement: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - "~>"
73
+ - !ruby/object:Gem::Version
74
+ version: '3.0'
75
+ type: :development
76
+ prerelease: false
77
+ version_requirements: !ruby/object:Gem::Requirement
78
+ requirements:
79
+ - - "~>"
80
+ - !ruby/object:Gem::Version
81
+ version: '3.0'
82
+ description: Ruby code to detect links in text, as specified by UTS58
83
+ email:
84
+ - arnt@gulbrandsen.priv.no
85
+ executables: []
86
+ extensions: []
87
+ extra_rdoc_files: []
88
+ files:
89
+ - ".github/workflows/ci.yml"
90
+ - Gemfile
91
+ - README.md
92
+ - lib/uts58.rb
93
+ - lib/uts58/constants.rb
94
+ - lib/uts58/extractor.rb
95
+ - uts58.gemspec
96
+ homepage: https://github.com/arnt/uts58
97
+ licenses:
98
+ - BSD-2-Clause
99
+ metadata: {}
100
+ rdoc_options: []
101
+ require_paths:
102
+ - lib
103
+ required_ruby_version: !ruby/object:Gem::Requirement
104
+ requirements:
105
+ - - ">="
106
+ - !ruby/object:Gem::Version
107
+ version: '3.1'
108
+ required_rubygems_version: !ruby/object:Gem::Requirement
109
+ requirements:
110
+ - - ">="
111
+ - !ruby/object:Gem::Version
112
+ version: '0'
113
+ requirements: []
114
+ rubygems_version: 3.6.7
115
+ specification_version: 4
116
+ summary: Ruby implementation of Unicode UTS58
117
+ test_files: []