text_alignment 0.12.1 → 0.12.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2345340266a0e66e9d26daaa51db1c9239bb837f52a5112a5c525a6d87b120d5
|
4
|
+
data.tar.gz: d3d1d118786e89a4bd7f9a6a9315643967c3ae099f8072913862ded9c895bfa5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9fc6c3235373f0e0174a922f006dd2cdf687361dfd567056137d20707b674ae75a40c13862d5a02946a225c19dfdae239ebf3f274a47bb4df1c8b2256bd968e2
|
7
|
+
data.tar.gz: 50dede22ed93d9e93a21dbbec5de39f3e342147c512d620a341331c4ddee6d0cd2cd08535e484732fd33e24ec1d8789d4daa7ef79191b13bcb9908634fcafe13
|
@@ -93,6 +93,7 @@ class TextAlignment::CharMapping
|
|
93
93
|
|
94
94
|
@text = _text
|
95
95
|
|
96
|
+
# sorting by the length of the spell-outs is important
|
96
97
|
char_mapping ||= TextAlignment::CHAR_MAPPING.sort{|a, b| b[1].length <=> a[1].length}
|
97
98
|
@mapped_text, offset_mapping = enmap_text(_text, char_mapping)
|
98
99
|
@index_enmap = offset_mapping.to_h
|
@@ -100,11 +101,11 @@ class TextAlignment::CharMapping
|
|
100
101
|
end
|
101
102
|
|
102
103
|
def enmap_position(position)
|
103
|
-
@index_enmap[position]
|
104
|
+
@index_enmap[position]
|
104
105
|
end
|
105
106
|
|
106
107
|
def demap_position(position)
|
107
|
-
@index_demap[position]
|
108
|
+
@index_demap[position]
|
108
109
|
end
|
109
110
|
|
110
111
|
def enmap_denotations(denotations)
|
@@ -112,23 +113,6 @@ class TextAlignment::CharMapping
|
|
112
113
|
|
113
114
|
denotations.map do |d|
|
114
115
|
d.dup.merge(span:{begin:enmap_position(d[:span][:begin]), end:enmap_position(d[:span][:end])})
|
115
|
-
rescue ArgumentError => e
|
116
|
-
snippet_begin = d[:span][:begin] - 5
|
117
|
-
if snippet_begin < 0
|
118
|
-
snippet_begin = 0
|
119
|
-
end
|
120
|
-
snippet_end = d[:span][:end] + 5
|
121
|
-
if snippet_end > @text.length
|
122
|
-
snippet_end = @text.length
|
123
|
-
end
|
124
|
-
snippet = @text[snippet_begin ... d[:span][:begin]] + '[' + @text[d[:span][:begin] ... d[:span][:end]] + ']' + @text[d[:span][:end] ... snippet_end]
|
125
|
-
if snippet_begin > 0
|
126
|
-
snippet = '...' + snippet
|
127
|
-
end
|
128
|
-
if snippet_end < @text.length
|
129
|
-
snippet = snippet + '...'
|
130
|
-
end
|
131
|
-
raise ArgumentError, e.message + " (#{snippet})"
|
132
116
|
end
|
133
117
|
end
|
134
118
|
|
@@ -137,7 +121,7 @@ class TextAlignment::CharMapping
|
|
137
121
|
def enmap_text(_text, char_mapping, no_ws = false)
|
138
122
|
text = _text.dup
|
139
123
|
|
140
|
-
# To
|
124
|
+
# To perform the single letter mapping replacement
|
141
125
|
char_mapping.each do |one, long|
|
142
126
|
text.gsub!(one, long) if long.length == 1
|
143
127
|
end
|
@@ -149,12 +133,16 @@ class TextAlignment::CharMapping
|
|
149
133
|
|
150
134
|
init_next = 0
|
151
135
|
while loc = text.index(long, init_next)
|
152
|
-
|
136
|
+
# Heuristics to check if the surrounding letters are sufficiently distinguished.
|
137
|
+
if long.length > 3 || ((text[loc - 1, 2] !~ /[a-z][a-z]/) && (text[loc + long.length - 1, 2] !~ /[a-z][a-z]/))
|
138
|
+
# if true
|
139
|
+
rpositions << [loc, long.length, 1]
|
140
|
+
|
141
|
+
# a workaround to avoid messing-up due to embedding
|
142
|
+
text[loc, long.length] = one * long.length
|
143
|
+
end
|
153
144
|
init_next = loc + long.length
|
154
145
|
end
|
155
|
-
|
156
|
-
# a workaround to avoid messing-up due to embedding
|
157
|
-
text.gsub!(long, one * long.length)
|
158
146
|
end
|
159
147
|
|
160
148
|
# To get the replacement positions, (position, old_length, new_length), for consecutive whitespaces
|
@@ -182,6 +170,7 @@ class TextAlignment::CharMapping
|
|
182
170
|
|
183
171
|
# To execute the long letter mapping
|
184
172
|
char_mapping.each do |one, long|
|
173
|
+
next unless text =~ /#{one}/
|
185
174
|
text.gsub!(one * long.length, one) if long.length > 1
|
186
175
|
end
|
187
176
|
|
@@ -17,7 +17,7 @@ class TextAlignment::MixedAlignment
|
|
17
17
|
attr_reader :similarity
|
18
18
|
attr_reader :str1_match_initial, :str1_match_final, :str2_match_initial, :str2_match_final
|
19
19
|
|
20
|
-
def initialize(_str1, _str2,
|
20
|
+
def initialize(_str1, _str2, mappings = nil)
|
21
21
|
raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
|
22
22
|
|
23
23
|
mappings ||= TextAlignment::CHAR_MAPPING
|
@@ -40,12 +40,12 @@ class TextAlignment::TextAlignment
|
|
40
40
|
end
|
41
41
|
|
42
42
|
@mapped_text = @text_mapping.mapped_text
|
43
|
-
denotations_mapped = @text_mapping.enmap_denotations(denotations)
|
44
43
|
|
45
44
|
## To generate the block_alignment of the input text against the reference text
|
46
45
|
@blocks = if r = whole_block_alignment(@mapped_text, @mapped_reference_text, @cultivation_map)
|
47
46
|
r
|
48
47
|
else
|
48
|
+
denotations_mapped = @text_mapping.enmap_denotations(denotations)
|
49
49
|
find_block_alignment(@mapped_text, @mapped_reference_text, denotations_mapped, @cultivation_map)
|
50
50
|
end
|
51
51
|
|
@@ -343,7 +343,7 @@ class TextAlignment::TextAlignment
|
|
343
343
|
|
344
344
|
## term-based alignment
|
345
345
|
tblocks = if denotations
|
346
|
-
denotations_in_scope = denotations.select{|d| d[:span][:begin] >= b1 && d[:span][:end] <= e1}.
|
346
|
+
denotations_in_scope = denotations.select{|d| d[:span][:begin] && d[:span][:end] && d[:span][:begin] >= b1 && d[:span][:end] <= e1}.
|
347
347
|
sort{|d1, d2| d1[:span][:begin] <=> d2[:span][:begin] || d2[:span][:end] <=> d1[:span][:end] }.
|
348
348
|
map{|d| d.merge(lex:str1[d[:span][:begin] ... d[:span][:end]])}
|
349
349
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.12.1
|
4
|
+
version: 0.12.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-01-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|