text_alignment 0.12.0 → 0.12.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/text_alignment/char_mapping.rb +32 -7
- data/lib/text_alignment/mixed_alignment.rb +1 -1
- data/lib/text_alignment/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3c2a36fe4cfde7dfb76f554fd4afcae7cb5a03e455887621217f5e5e633b20b3
|
4
|
+
data.tar.gz: f63070c6f423bc15d0fc8c742a21238a104a1b2c1d3fe56ac436effa8ef8eacf
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: '078a41bc6ab9b16e7747be6a3bb15aff4b23a1161bcea0b653a93f04d673799afcac2109cd1ce8d1a95c99c5c07d36842e3698c9f0997500e653fb4ab939e04a'
|
7
|
+
data.tar.gz: ce44d334779d43b3057317537f615ebf39b8049639d3e50c4e14272c952b76a5df2b060cfd3a15f1d28372c11a795ef8bf43cb04de32b5a78ce2f44433edddfb
|
data/lib/text_alignment/char_mapping.rb
CHANGED
@@ -91,6 +91,9 @@ class TextAlignment::CharMapping
|
|
91
91
|
@method_squeeze_ws = method(:squeeze_ws_1!)
|
92
92
|
end
|
93
93
|
|
94
|
+
@text = _text
|
95
|
+
|
96
|
+
# Sorting by the length of the spell-outs (longest first) is important
|
94
97
|
char_mapping ||= TextAlignment::CHAR_MAPPING.sort{|a, b| b[1].length <=> a[1].length}
|
95
98
|
@mapped_text, offset_mapping = enmap_text(_text, char_mapping)
|
96
99
|
@index_enmap = offset_mapping.to_h
|
@@ -98,11 +101,11 @@ class TextAlignment::CharMapping
|
|
98
101
|
end
|
99
102
|
|
100
103
|
def enmap_position(position)
|
101
|
-
@index_enmap[position] || raise(ArgumentError, "Unusual position
|
104
|
+
@index_enmap[position] || raise(ArgumentError, "Unusual position of annotation: #{position}")
|
102
105
|
end
|
103
106
|
|
104
107
|
def demap_position(position)
|
105
|
-
@index_demap[position] || raise(ArgumentError, "Unusual position
|
108
|
+
@index_demap[position] || raise(ArgumentError, "Unusual position of annotation: #{position}")
|
106
109
|
end
|
107
110
|
|
108
111
|
def enmap_denotations(denotations)
|
@@ -110,6 +113,23 @@ class TextAlignment::CharMapping
|
|
110
113
|
|
111
114
|
denotations.map do |d|
|
112
115
|
d.dup.merge(span:{begin:enmap_position(d[:span][:begin]), end:enmap_position(d[:span][:end])})
|
116
|
+
rescue ArgumentError => e
|
117
|
+
snippet_begin = d[:span][:begin] - 5
|
118
|
+
if snippet_begin < 0
|
119
|
+
snippet_begin = 0
|
120
|
+
end
|
121
|
+
snippet_end = d[:span][:end] + 5
|
122
|
+
if snippet_end > @text.length
|
123
|
+
snippet_end = @text.length
|
124
|
+
end
|
125
|
+
snippet = @text[snippet_begin ... d[:span][:begin]] + '[' + @text[d[:span][:begin] ... d[:span][:end]] + ']' + @text[d[:span][:end] ... snippet_end]
|
126
|
+
if snippet_begin > 0
|
127
|
+
snippet = '...' + snippet
|
128
|
+
end
|
129
|
+
if snippet_end < @text.length
|
130
|
+
snippet = snippet + '...'
|
131
|
+
end
|
132
|
+
raise ArgumentError, e.message + " (#{snippet})"
|
113
133
|
end
|
114
134
|
end
|
115
135
|
|
@@ -118,7 +138,7 @@ class TextAlignment::CharMapping
|
|
118
138
|
def enmap_text(_text, char_mapping, no_ws = false)
|
119
139
|
text = _text.dup
|
120
140
|
|
121
|
-
# To
|
141
|
+
# To perform the single letter mapping replacement
|
122
142
|
char_mapping.each do |one, long|
|
123
143
|
text.gsub!(one, long) if long.length == 1
|
124
144
|
end
|
@@ -130,12 +150,16 @@ class TextAlignment::CharMapping
|
|
130
150
|
|
131
151
|
init_next = 0
|
132
152
|
while loc = text.index(long, init_next)
|
133
|
-
|
153
|
+
# Heuristics to check if the surrounding letters are sufficiently distinguished.
|
154
|
+
if long.length > 3 || ((text[loc - 1, 2] !~ /[a-z][a-z]/) && (text[loc + long.length - 1, 2] !~ /[a-z][a-z]/))
|
155
|
+
# if true
|
156
|
+
rpositions << [loc, long.length, 1]
|
157
|
+
|
158
|
+
# a workaround to avoid messing-up due to embedding
|
159
|
+
text[loc, long.length] = one * long.length
|
160
|
+
end
|
134
161
|
init_next = loc + long.length
|
135
162
|
end
|
136
|
-
|
137
|
-
# a workaround to avoid messing-up due to embedding
|
138
|
-
text.gsub!(long, one * long.length)
|
139
163
|
end
|
140
164
|
|
141
165
|
# To get the replacement positions, (position, old_length, new_length), for consecutive whitespaces
|
@@ -163,6 +187,7 @@ class TextAlignment::CharMapping
|
|
163
187
|
|
164
188
|
# To execute the long letter mapping
|
165
189
|
char_mapping.each do |one, long|
|
190
|
+
next unless text =~ /#{one}/
|
166
191
|
text.gsub!(one * long.length, one) if long.length > 1
|
167
192
|
end
|
168
193
|
|
data/lib/text_alignment/mixed_alignment.rb
CHANGED
@@ -17,7 +17,7 @@ class TextAlignment::MixedAlignment
|
|
17
17
|
attr_reader :similarity
|
18
18
|
attr_reader :str1_match_initial, :str1_match_final, :str2_match_initial, :str2_match_final
|
19
19
|
|
20
|
-
def initialize(_str1, _str2,
|
20
|
+
def initialize(_str1, _str2, mappings = nil)
|
21
21
|
raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
|
22
22
|
|
23
23
|
mappings ||= TextAlignment::CHAR_MAPPING
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.12.
|
4
|
+
version: 0.12.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-01-03 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|