text_alignment 0.12.0 → 0.12.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/text_alignment/char_mapping.rb +32 -7
- data/lib/text_alignment/mixed_alignment.rb +1 -1
- data/lib/text_alignment/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3c2a36fe4cfde7dfb76f554fd4afcae7cb5a03e455887621217f5e5e633b20b3
|
4
|
+
data.tar.gz: f63070c6f423bc15d0fc8c742a21238a104a1b2c1d3fe56ac436effa8ef8eacf
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: '078a41bc6ab9b16e7747be6a3bb15aff4b23a1161bcea0b653a93f04d673799afcac2109cd1ce8d1a95c99c5c07d36842e3698c9f0997500e653fb4ab939e04a'
|
7
|
+
data.tar.gz: ce44d334779d43b3057317537f615ebf39b8049639d3e50c4e14272c952b76a5df2b060cfd3a15f1d28372c11a795ef8bf43cb04de32b5a78ce2f44433edddfb
|
@@ -91,6 +91,9 @@ class TextAlignment::CharMapping
|
|
91
91
|
@method_squeeze_ws = method(:squeeze_ws_1!)
|
92
92
|
end
|
93
93
|
|
94
|
+
@text = _text
|
95
|
+
|
96
|
+
# sorting by the length of the spell-outs is important
|
94
97
|
char_mapping ||= TextAlignment::CHAR_MAPPING.sort{|a, b| b[1].length <=> a[1].length}
|
95
98
|
@mapped_text, offset_mapping = enmap_text(_text, char_mapping)
|
96
99
|
@index_enmap = offset_mapping.to_h
|
@@ -98,11 +101,11 @@ class TextAlignment::CharMapping
|
|
98
101
|
end
|
99
102
|
|
100
103
|
def enmap_position(position)
|
101
|
-
@index_enmap[position] || raise(ArgumentError, "Unusual position
|
104
|
+
@index_enmap[position] || raise(ArgumentError, "Unusual position of annotation: #{position}")
|
102
105
|
end
|
103
106
|
|
104
107
|
def demap_position(position)
|
105
|
-
@index_demap[position] || raise(ArgumentError, "Unusual position
|
108
|
+
@index_demap[position] || raise(ArgumentError, "Unusual position of annotation: #{position}")
|
106
109
|
end
|
107
110
|
|
108
111
|
def enmap_denotations(denotations)
|
@@ -110,6 +113,23 @@ class TextAlignment::CharMapping
|
|
110
113
|
|
111
114
|
denotations.map do |d|
|
112
115
|
d.dup.merge(span:{begin:enmap_position(d[:span][:begin]), end:enmap_position(d[:span][:end])})
|
116
|
+
rescue ArgumentError => e
|
117
|
+
snippet_begin = d[:span][:begin] - 5
|
118
|
+
if snippet_begin < 0
|
119
|
+
snippet_begin = 0
|
120
|
+
end
|
121
|
+
snippet_end = d[:span][:end] + 5
|
122
|
+
if snippet_end > @text.length
|
123
|
+
snippet_end = @text.length
|
124
|
+
end
|
125
|
+
snippet = @text[snippet_begin ... d[:span][:begin]] + '[' + @text[d[:span][:begin] ... d[:span][:end]] + ']' + @text[d[:span][:end] ... snippet_end]
|
126
|
+
if snippet_begin > 0
|
127
|
+
snippet = '...' + snippet
|
128
|
+
end
|
129
|
+
if snippet_end < @text.length
|
130
|
+
snippet = snippet + '...'
|
131
|
+
end
|
132
|
+
raise ArgumentError, e.message + " (#{snippet})"
|
113
133
|
end
|
114
134
|
end
|
115
135
|
|
@@ -118,7 +138,7 @@ class TextAlignment::CharMapping
|
|
118
138
|
def enmap_text(_text, char_mapping, no_ws = false)
|
119
139
|
text = _text.dup
|
120
140
|
|
121
|
-
# To
|
141
|
+
# To perform the single letter mapping replacement
|
122
142
|
char_mapping.each do |one, long|
|
123
143
|
text.gsub!(one, long) if long.length == 1
|
124
144
|
end
|
@@ -130,12 +150,16 @@ class TextAlignment::CharMapping
|
|
130
150
|
|
131
151
|
init_next = 0
|
132
152
|
while loc = text.index(long, init_next)
|
133
|
-
|
153
|
+
# Heuristics to check if the surrounding letters are sufficiently distinguished.
|
154
|
+
if long.length > 3 || ((text[loc - 1, 2] !~ /[a-z][a-z]/) && (text[loc + long.length - 1, 2] !~ /[a-z][a-z]/))
|
155
|
+
# if true
|
156
|
+
rpositions << [loc, long.length, 1]
|
157
|
+
|
158
|
+
# a workaround to avoid messing-up due to embedding
|
159
|
+
text[loc, long.length] = one * long.length
|
160
|
+
end
|
134
161
|
init_next = loc + long.length
|
135
162
|
end
|
136
|
-
|
137
|
-
# a workaround to avoid messing-up due to embedding
|
138
|
-
text.gsub!(long, one * long.length)
|
139
163
|
end
|
140
164
|
|
141
165
|
# To get the replacement positions, (position, old_length, new_length), for consecutive whitespaces
|
@@ -163,6 +187,7 @@ class TextAlignment::CharMapping
|
|
163
187
|
|
164
188
|
# To execute the long letter mapping
|
165
189
|
char_mapping.each do |one, long|
|
190
|
+
next unless text =~ /#{one}/
|
166
191
|
text.gsub!(one * long.length, one) if long.length > 1
|
167
192
|
end
|
168
193
|
|
@@ -17,7 +17,7 @@ class TextAlignment::MixedAlignment
|
|
17
17
|
attr_reader :similarity
|
18
18
|
attr_reader :str1_match_initial, :str1_match_final, :str2_match_initial, :str2_match_final
|
19
19
|
|
20
|
-
def initialize(_str1, _str2,
|
20
|
+
def initialize(_str1, _str2, mappings = nil)
|
21
21
|
raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
|
22
22
|
|
23
23
|
mappings ||= TextAlignment::CHAR_MAPPING
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.12.0
|
4
|
+
version: 0.12.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-01-03 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|