text_alignment 0.7.2 → 0.10.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/align_annotations +37 -138
- data/lib/text_alignment/anchor_finder.rb +130 -62
- data/lib/text_alignment/char_mapping.rb +187 -0
- data/lib/text_alignment/constants.rb +1 -1
- data/lib/text_alignment/cultivation_map.rb +19 -0
- data/lib/text_alignment/glcs_alignment_fast.rb +2 -2
- data/lib/text_alignment/mixed_alignment.rb +7 -63
- data/lib/text_alignment/text_alignment.rb +251 -182
- data/lib/text_alignment/version.rb +1 -1
- metadata +4 -3
- data/lib/text_alignment/mappings.rb +0 -74
@@ -0,0 +1,187 @@
|
|
1
|
+
module TextAlignment; end unless defined? TextAlignment
|
2
|
+
|
3
|
+
TextAlignment::CHAR_MAPPING = [
|
4
|
+
["©", "(c)"], #U+00A9 (Copyright Sign)
|
5
|
+
|
6
|
+
["α", "alpha"], #U+03B1 (greek small letter alpha)
|
7
|
+
["β", "beta"], #U+03B2 (greek small letter beta)
|
8
|
+
["γ", "gamma"], #U+03B3 (greek small letter gamma)
|
9
|
+
["δ", "delta"], #U+03B4 (greek small letter delta)
|
10
|
+
["ε", "epsilon"], #U+03B5 (greek small letter epsilon)
|
11
|
+
["ζ", "zeta"], #U+03B6 (greek small letter zeta)
|
12
|
+
["η", "eta"], #U+03B7 (greek small letter eta)
|
13
|
+
["θ", "theta"], #U+03B7 (greek small letter eta)
|
14
|
+
["ι", "iota"], #U+03B7 (greek small letter eta)
|
15
|
+
["κ", "kappa"], #U+03BA (greek small letter kappa)
|
16
|
+
["λ", "lambda"], #U+03BB (greek small letter lambda)
|
17
|
+
["λ", "lamda"], #U+03BB (greek small letter lambda)
|
18
|
+
["μ", "mu"], #U+03BC (greek small letter mu)
|
19
|
+
["ν", "nu"], #U+03BD (greek small letter nu)
|
20
|
+
["ξ", "xi"], #U+03BE (greek small letter xi)
|
21
|
+
["ο", "omicron"], #U+03BF (greek small letter omicron)
|
22
|
+
["π", "pi"], #U+03C0 (greek small letter pi)
|
23
|
+
["ρ", "rho"], #U+03C1 (greek small letter rho)
|
24
|
+
["σ", "sigma"], #U+03C3 (greek small letter sigma)
|
25
|
+
["τ", "tau"], #U+03C4 (greek small letter tau)
|
26
|
+
["υ", "upsilon"], #U+03C5 (greek small letter upsilon)
|
27
|
+
["φ", "phi"], #U+03C6 (greek small letter phi)
|
28
|
+
["χ", "chi"], #U+03C7 (greek small letter chi)
|
29
|
+
["ψ", "psi"], #U+03C8 (greek small letter psi)
|
30
|
+
["ω", "omega"], #U+03C9 (greek small letter omega)
|
31
|
+
|
32
|
+
["Α", "Alpha"], #U+0391 (greek capital letter alpha)
|
33
|
+
["Β", "Beta"], #U+0392 (greek capital letter beta)
|
34
|
+
["Γ", "Gamma"], #U+0393 (greek capital letter gamma)
|
35
|
+
["Δ", "Delta"], #U+0394 (greek capital letter delta)
|
36
|
+
["Ε", "Epsilon"], #U+0395 (greek capital letter epsilon)
|
37
|
+
["Ζ", "Zeta"], #U+0396 (greek capital letter zeta)
|
38
|
+
["Η", "Eta"], #U+0397 (greek capital letter eta)
|
39
|
+
["Θ", "Theta"], #U+0398 (greek capital letter theta)
|
40
|
+
["Ι", "Iota"], #U+0399 (greek capital letter iota)
|
41
|
+
["Κ", "Kappa"], #U+039A (greek capital letter kappa)
|
42
|
+
["Λ", "Lambda"], #U+039B (greek capital letter lambda)
|
43
|
+
["Λ", "Lamda"], #U+039B (greek capital letter lambda)
|
44
|
+
["Μ", "Mu"], #U+039C (greek capital letter mu)
|
45
|
+
["Ν", "Nu"], #U+039D (greek capital letter nu)
|
46
|
+
["Ξ", "Xi"], #U+039E (greek capital letter xi)
|
47
|
+
["Ο", "Omicron"], #U+039F (greek capital letter omicron)
|
48
|
+
["Π", "Pi"], #U+03A0 (greek capital letter pi)
|
49
|
+
["Ρ", "Rho"], #U+03A1 (greek capital letter rho)
|
50
|
+
["Σ", "Sigma"], #U+03A3 (greek capital letter sigma)
|
51
|
+
["Τ", "Tau"], #U+03A4 (greek capital letter tau)
|
52
|
+
["Υ", "Upsilon"], #U+03A5 (greek capital letter upsilon)
|
53
|
+
["Φ", "Phi"], #U+03A6 (greek capital letter phi)
|
54
|
+
["Χ", "Chi"], #U+03A7 (greek capital letter chi)
|
55
|
+
["Ψ", "Psi"], #U+03A8 (greek capital letter Psi)
|
56
|
+
["Ω", "Omega"], #U+03A9 (greek capital letter omega)
|
57
|
+
|
58
|
+
["ϕ", "phi"], #U+03D5 (greek phi symbol)
|
59
|
+
|
60
|
+
["×", "x"], #U+00D7 (multiplication sign)
|
61
|
+
["•", "*"], #U+2022 (bullet)
|
62
|
+
[" ", " "], #U+2009 (thin space)
|
63
|
+
[" ", " "], #U+200A (hair space)
|
64
|
+
[" ", " "], #U+00A0 (Non-Breaking space)
|
65
|
+
[" ", " "], #U+3000 (ideographic space)
|
66
|
+
["‐", "-"], #U+2010 (Hyphen)
|
67
|
+
["‑", "-"], #U+2011 (Non-Breaking Hyphen)
|
68
|
+
["−", "-"], #U+2212 (minus sign)
|
69
|
+
["–", "-"], #U+2013 (en dash)
|
70
|
+
["′", "'"], #U+2032 (prime)
|
71
|
+
["‘", "'"], #U+2018 (left single quotation mark)
|
72
|
+
["’", "'"], #U+2019 (right single quotation mark)
|
73
|
+
["“", '"'], #U+201C (left double quotation mark)
|
74
|
+
["”", '"'], #U+201D (right double quotation mark)
|
75
|
+
['"', "''"]
|
76
|
+
]
|
77
|
+
|
78
|
+
|
79
|
+
class TextAlignment::CharMapping
|
80
|
+
attr_reader :str
|
81
|
+
|
82
|
+
def initialize(_str, char_mapping = nil)
|
83
|
+
char_mapping ||= TextAlignment::CHAR_MAPPING
|
84
|
+
@str, offset_mapping = enmap_str(_str, char_mapping)
|
85
|
+
@index_enmap = offset_mapping.to_h
|
86
|
+
@index_demap = offset_mapping.map{|m| m.reverse}.to_h
|
87
|
+
end
|
88
|
+
|
89
|
+
def enmap_position(position)
|
90
|
+
@index_enmap[position]
|
91
|
+
end
|
92
|
+
|
93
|
+
def demap_position(position)
|
94
|
+
@index_demap[position]
|
95
|
+
end
|
96
|
+
|
97
|
+
def enmap_denotations(_denotations)
|
98
|
+
denotations = _denotations.map do |d|
|
99
|
+
d.dup.merge(span:{begin:enmap_position(d[:span][:begin]), end:enmap_position(d[:span][:end])})
|
100
|
+
end
|
101
|
+
end
|
102
|
+
|
103
|
+
private
|
104
|
+
|
105
|
+
def enmap_str(_str, char_mapping)
|
106
|
+
str = _str.dup
|
107
|
+
|
108
|
+
# To execute the single letter mapping
|
109
|
+
char_mapping.each do |one, long|
|
110
|
+
str.gsub!(one, long) if long.length == 1
|
111
|
+
end
|
112
|
+
|
113
|
+
# To get the (location, length) index for replacements
|
114
|
+
loc_len = []
|
115
|
+
char_mapping.each do |one, long|
|
116
|
+
next if long.length == 1
|
117
|
+
|
118
|
+
init_next = 0
|
119
|
+
while loc = str.index(long, init_next)
|
120
|
+
loc_len << [loc, long.length]
|
121
|
+
init_next = loc + long.length
|
122
|
+
end
|
123
|
+
|
124
|
+
# a workaround to avoid messing-up due to embedding
|
125
|
+
str.gsub!(long, one * long.length)
|
126
|
+
end
|
127
|
+
|
128
|
+
# To get the (location, length) index for consecutive whitespace sequences
|
129
|
+
init_next = 0
|
130
|
+
while loc = str.index(/\s{2,}/, init_next)
|
131
|
+
len = $~[0].length
|
132
|
+
loc_len << [loc, len]
|
133
|
+
init_next = loc + len
|
134
|
+
end
|
135
|
+
|
136
|
+
loc_len.sort!{|a, b| a[0] <=> b[0]}
|
137
|
+
|
138
|
+
# To get the offset_mapping before and after replacement
|
139
|
+
offset_mapping = []
|
140
|
+
init_next = 0
|
141
|
+
j = 0
|
142
|
+
|
143
|
+
loc_len.each do |loc, len|
|
144
|
+
offset_mapping += (init_next .. loc).map do |i|
|
145
|
+
j += 1
|
146
|
+
[i, j - 1]
|
147
|
+
end
|
148
|
+
init_next = loc + len
|
149
|
+
end
|
150
|
+
|
151
|
+
offset_mapping += (init_next .. str.length).map do |i|
|
152
|
+
j += 1
|
153
|
+
[i, j - 1]
|
154
|
+
end
|
155
|
+
|
156
|
+
# To execute the long letter mapping
|
157
|
+
char_mapping.each do |one, long|
|
158
|
+
str.gsub!(one * long.length, one) if long.length > 1
|
159
|
+
end
|
160
|
+
|
161
|
+
# To replace multi whitespace sequences to a space
|
162
|
+
str.gsub!(/\s{2,}/, ' ')
|
163
|
+
|
164
|
+
[str, offset_mapping]
|
165
|
+
end
|
166
|
+
end
|
167
|
+
|
168
|
+
# Command line entry point: reads one annotation JSON file, normalizes its
# text with TextAlignment::CharMapping and prints the text together with the
# re-based denotations as JSON.
if __FILE__ == $0
  require 'json'

  unless ARGV.length == 1
    warn "#{$0} an_annotation_json_file.json"
    exit
  end

  annotations = JSON.parse(File.read(ARGV[0]).strip, symbolize_names: true)

  # Fall back to the denotations of the first track; an absent or empty
  # track list simply leaves the denotations nil.
  first_track = (annotations[:tracks] || []).first
  denotations = annotations[:denotations] || (first_track && first_track[:denotations])

  str_mapping = TextAlignment::CharMapping.new(annotations[:text])
  # Nothing to re-base when the input carries no denotations.
  denotations_mapped = denotations && str_mapping.enmap_denotations(denotations)

  puts({text: str_mapping.str, denotations: denotations_mapped}.to_json)
end
|
@@ -1,7 +1,7 @@
|
|
1
1
|
module TextAlignment; end unless defined? TextAlignment
|
2
2
|
|
3
3
|
TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
|
4
|
-
TextAlignment::SIZE_WINDOW =
|
4
|
+
TextAlignment::SIZE_WINDOW = 10 unless defined? TextAlignment::SIZE_WINDOW
|
5
5
|
TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
|
6
6
|
TextAlignment::BUFFER_MIN = 20 unless defined? TextAlignment::BUFFER_MIN
|
7
7
|
TextAlignment::TEXT_SIMILARITY_THRESHOLD = 0.9 unless defined? TextAlignment::TEXT_SIMILARITY_THRESHOLD
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module TextAlignment; end unless defined? TextAlignment

# Remembers which positions belong to an already "cultivated" region.
# Every position of a region is stored with the end of that region as value.
class TextAlignment::CultivationMap
  # Hash of position => end of the region covering that position.
  attr_reader :map

  def initialize
    @map = {}
  end

  # Registers regions given as [begin, end) pairs (end is exclusive).
  def cultivate(regions)
    regions.each do |region_begin, region_end|
      (region_begin...region_end).each_with_object(@map) do |pos, covered|
        covered[pos] = region_end
      end
    end
  end

  # Returns the end of the region covering the position,
  # or nil when the position is not covered by any region.
  def search_again_position(position)
    @map[position]
  end
end
|
@@ -5,7 +5,7 @@ require 'text_alignment/find_divisions'
|
|
5
5
|
require 'text_alignment/lcs_comparison'
|
6
6
|
require 'text_alignment/lcs_alignment'
|
7
7
|
require 'text_alignment/glcs_alignment'
|
8
|
-
require 'text_alignment/
|
8
|
+
require 'text_alignment/char_mapping'
|
9
9
|
|
10
10
|
module TextAlignment; end unless defined? TextAlignment
|
11
11
|
|
@@ -106,7 +106,7 @@ if __FILE__ == $0
|
|
106
106
|
|
107
107
|
dictionary = [["β", "beta"]]
|
108
108
|
# align = TextAlignment::TextAlignment.new(str1, str2)
|
109
|
-
align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::
|
109
|
+
align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::CHAR_MAPPING)
|
110
110
|
p align.common_elements
|
111
111
|
p align.mapped_elements
|
112
112
|
end
|
@@ -6,7 +6,7 @@ require 'text_alignment/lcs_comparison'
|
|
6
6
|
require 'text_alignment/lcs_alignment'
|
7
7
|
require 'text_alignment/lcs_cdiff'
|
8
8
|
require 'text_alignment/glcs_alignment'
|
9
|
-
require 'text_alignment/
|
9
|
+
require 'text_alignment/char_mapping'
|
10
10
|
|
11
11
|
module TextAlignment; end unless defined? TextAlignment
|
12
12
|
|
@@ -17,10 +17,12 @@ class TextAlignment::MixedAlignment
|
|
17
17
|
attr_reader :similarity
|
18
18
|
attr_reader :str1_match_initial, :str1_match_final, :str2_match_initial, :str2_match_final
|
19
19
|
|
20
|
-
def initialize(_str1, _str2)
|
20
|
+
# Computes the mixed alignment between two strings.
#
# _str1, _str2 - the strings to align; ArgumentError is raised when either is nil.
#                Both are duplicated before being handed to the alignment.
# _mappings    - optional character mapping table; TextAlignment::CHAR_MAPPING
#                is used only when the caller does not supply one.
def initialize(_str1, _str2, _mappings = nil)
  raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?

  mappings = _mappings || TextAlignment::CHAR_MAPPING

  _compute_mixed_alignment(_str1.dup, _str2.dup, mappings)
end
|
@@ -139,72 +141,14 @@ class TextAlignment::MixedAlignment
|
|
139
141
|
@position_map_end = posmap_end.sort.to_h
|
140
142
|
end
|
141
143
|
|
142
|
-
|
143
|
-
|
144
|
-
def string_preprocessing(_str1, _str2)
|
145
|
-
str1 = _str1.dup
|
146
|
-
str2 = _str2.dup
|
147
|
-
mappings = TextAlignment::MAPPINGS.dup
|
148
|
-
|
149
|
-
## single character mappings
|
150
|
-
character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
|
151
|
-
characters_from = character_mappings.collect{|m| m[0]}.join
|
152
|
-
characters_to = character_mappings.collect{|m| m[1]}.join
|
153
|
-
characters_to.gsub!(/-/, '\-')
|
154
|
-
|
155
|
-
str1.tr!(characters_from, characters_to)
|
156
|
-
str2.tr!(characters_from, characters_to)
|
157
|
-
|
158
|
-
mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
|
159
|
-
|
160
|
-
## long to one character mappings
|
161
|
-
pletters = TextAlignment::PADDING_LETTERS
|
162
|
-
|
163
|
-
# find the padding letter for str1
|
164
|
-
@padding_letter1 = begin
|
165
|
-
i = pletters.index{|l| str2.index(l).nil?}
|
166
|
-
raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
|
167
|
-
TextAlignment::PADDING_LETTERS[i]
|
168
|
-
end
|
169
|
-
|
170
|
-
# find the padding letter for str2
|
171
|
-
@padding_letter2 = begin
|
172
|
-
i = pletters.index{|l| l != @padding_letter1 && str1.index(l).nil?}
|
173
|
-
raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
|
174
|
-
TextAlignment::PADDING_LETTERS[i]
|
175
|
-
end
|
176
|
-
|
177
|
-
# ASCII foldings
|
178
|
-
ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
|
179
|
-
ascii_foldings.each do |f|
|
180
|
-
from = f[1]
|
181
|
-
|
182
|
-
if str2.index(f[0])
|
183
|
-
to = f[0] + (@padding_letter1 * (f[1].length - 1))
|
184
|
-
str1.gsub!(from, to)
|
185
|
-
end
|
186
|
-
|
187
|
-
if str1.index(f[0])
|
188
|
-
to = f[0] + (@padding_letter2 * (f[1].length - 1))
|
189
|
-
str2.gsub!(from, to)
|
190
|
-
end
|
191
|
-
end
|
192
|
-
mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
|
193
|
-
|
194
|
-
[str1, str2, mappings]
|
195
|
-
end
|
196
|
-
|
197
|
-
def compute_similarity(_s1, _s2, sdiff)
|
144
|
+
# Returns the share of matched non-whitespace characters relative to the
# shorter of the two strings (counting non-whitespace characters only).
# Returns 0 when there is no sdiff or no non-whitespace match.
def compute_similarity(s1, s2, sdiff)
  return 0 if sdiff.nil?

  # only matches between non-whitespace letters count for the lcs
  matched = sdiff.select { |d| d.action == '=' }
                 .count { |d| /\S/.match?(d.old_element) && /\S/.match?(d.new_element) }
  return 0 if matched.zero?

  visible_lengths = [s1, s2].map { |s| s.scan(/\S/).count }
  matched.to_f / visible_lengths.min
end
|
209
153
|
|
210
154
|
end
|
@@ -2,39 +2,214 @@
|
|
2
2
|
require 'text_alignment/constants'
|
3
3
|
require 'text_alignment/anchor_finder'
|
4
4
|
require 'text_alignment/mixed_alignment'
|
5
|
+
require 'text_alignment/cultivation_map'
|
5
6
|
|
6
7
|
module TextAlignment; end unless defined? TextAlignment
|
7
8
|
|
8
|
-
TextAlignment::PADDING_LETTERS = ['@', '^', '|', '#', '$', '%', '&', '_'] unless defined? TextAlignment::PADDING_LETTERS
|
9
|
-
|
10
9
|
class TextAlignment::TextAlignment
|
11
10
|
attr_reader :block_alignment
|
12
11
|
attr_reader :similarity
|
13
12
|
attr_reader :lost_annotations
|
13
|
+
attr_reader :cultivation_map
|
14
14
|
|
15
|
-
def initialize(
|
16
|
-
raise ArgumentError, "nil string" if
|
15
|
+
# Aligns _str1 (source) to _str2 (target).
#
# _str1, _str2     - the texts to align; ArgumentError is raised when either is nil.
# _denotations     - optional denotation hashes (each with a :span) on _str1.
# _cultivation_map - optional map of target regions that were already used;
#                    a new TextAlignment::CultivationMap is created when nil.
#
# Both texts are normalized through TextAlignment::CharMapping before the
# alignment is computed; the resulting blocks are stored in
# @block_alignment[:blocks] and the aligned target regions are added to the
# cultivation map.
def initialize(_str1, _str2, _denotations = nil, _cultivation_map = nil)
  raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?

  @block_alignment = {source_text: _str1, target_text: _str2, denotations: _denotations}
  @original_str1 = _str1
  @original_str2 = _str2

  @str1_mapping = TextAlignment::CharMapping.new(_str1)
  @str2_mapping = TextAlignment::CharMapping.new(_str2)

  str1 = @str1_mapping.str
  str2 = @str2_mapping.str

  # denotations are optional: only re-base them when some were given
  denotations = _denotations && @str1_mapping.enmap_denotations(_denotations)

  @cultivation_map = _cultivation_map || TextAlignment::CultivationMap.new

  # try the whole block alignment first, then fall back to block-wise alignment
  @block_alignment[:blocks] =
    whole_block_alignment(str1, str2, @cultivation_map) ||
    find_block_alignment(str1, str2, denotations, @cultivation_map)

  # target regions covered by exactly aligned (:block / :term) blocks
  aligned_blocks = @block_alignment[:blocks].select { |b| b[:alignment] == :block || b[:alignment] == :term }
  newly_cultivated_regions = aligned_blocks.map { |b| [b[:target][:begin], b[:target][:end]] }

  # merge a region into the previous one unless there is a gap of more than one position
  newly_cultivated_regions_condensed = newly_cultivated_regions.each_with_object([]) do |region, condensed|
    if condensed.empty? || (condensed.last.last + 1 < region.first)
      condensed.push region
    else
      condensed.last[1] = region.last
    end
  end

  @cultivation_map.cultivate(newly_cultivated_regions_condensed)
end
|
57
|
+
|
58
|
+
# Translates a begin offset on the original source text to the corresponding
# begin offset on the original target text.
#
# Returns nil when the offset cannot be translated: it does not survive the
# character mapping, it lies beyond the last block, or it falls into an
# :empty block anywhere but at its very beginning. An :empty block may come
# without a :target (alignment_show handles that case as well); such a block
# yields nil too.
def transform_begin_position(_begin_position)
  begin_position = @str1_mapping.enmap_position(_begin_position)
  return nil if begin_position.nil?

  # the first block that ends after the position is the one containing it
  block = @block_alignment[:blocks].find { |b| b[:source][:end] > begin_position }
  return nil if block.nil?

  b = case block[:alignment]
      when :block, :term
        begin_position + block[:delta]
      when :empty
        block[:target][:begin] if block[:target] && begin_position == block[:source][:begin]
      else
        # a nested alignment works on offsets relative to its own block
        r = block[:alignment].transform_begin_position(begin_position - block[:source][:begin])
        r.nil? ? nil : r + block[:target][:begin]
      end

  @str2_mapping.demap_position(b)
end
|
79
|
+
|
80
|
+
# Translates an end offset on the original source text to the corresponding
# end offset on the original target text.
#
# Returns nil when the offset cannot be translated: it does not survive the
# character mapping, it lies beyond the last block, or it falls into an
# :empty block anywhere but at its very end. An :empty block may come
# without a :target (alignment_show handles that case as well); such a block
# yields nil too.
def transform_end_position(_end_position)
  end_position = @str1_mapping.enmap_position(_end_position)
  return nil if end_position.nil?

  # the first block that ends at or after the position is the one containing it
  block = @block_alignment[:blocks].find { |b| b[:source][:end] >= end_position }
  return nil if block.nil?

  e = case block[:alignment]
      when :block, :term
        end_position + block[:delta]
      when :empty
        block[:target][:end] if block[:target] && end_position == block[:source][:end]
      else
        # a nested alignment works on offsets relative to its own block
        r = block[:alignment].transform_end_position(end_position - block[:source][:begin])
        r.nil? ? nil : r + block[:target][:begin]
      end

  @str2_mapping.demap_position(e)
end
|
101
|
+
|
102
|
+
# Translates a {begin:, end:} span on the source text into a span on the
# target text. Either value of the result may be nil.
def transform_a_span(span)
  target_begin = transform_begin_position(span[:begin])
  target_end = transform_end_position(span[:end])

  {begin: target_begin, end: target_end}
end
|
35
105
|
|
106
|
+
# Translates every span of the list with transform_a_span, keeping the order.
def transform_spans(spans)
  spans.map(&method(:transform_a_span))
end
|
109
|
+
|
110
|
+
# Rewrites the begin/end of every denotation in place so that they refer to
# the target text. A denotation whose translated span is invalid (or whose
# translation raises) gets nil for both begin and end and is recorded in
# @lost_annotations together with its source span.
#
# Returns the list of lost annotations, or nil when denotations is nil.
def transform_denotations!(denotations)
  return nil if denotations.nil?

  @lost_annotations = []

  denotations.each do |d|
    begin
      source = {begin: d.begin, end: d.end}
      d.begin = transform_begin_position(d.begin)
      d.end = transform_end_position(d.end)

      if d.begin.nil? || d.end.nil? || d.begin < 0 || d.end <= d.begin || d.end > @original_str2.length
        raise "invalid transform"
      end
    rescue
      @lost_annotations << {source: source, target: {begin: d.begin, end: d.end}}
      d.begin = nil
      d.end = nil
    end
  end

  @lost_annotations
end
|
127
|
+
|
128
|
+
# Returns copies of the denotation hashes with their :span translated to the
# target text. A denotation whose translated span is invalid (or whose
# translation raises) is left out of the result and recorded in
# @lost_annotations instead.
#
# Returns nil when hdenotations is nil.
def transform_hdenotations(hdenotations)
  return nil if hdenotations.nil?

  @lost_annotations = []

  transformed = hdenotations.map do |d|
    begin
      t = transform_a_span(d[:span])

      if t[:begin].nil? || t[:end].nil? || t[:begin] < 0 || t[:end] <= t[:begin] || t[:end] > @original_str2.length
        raise "invalid transform"
      end

      d.merge(span: t)
    rescue
      @lost_annotations << {source: d[:span], target: t}
      nil
    end
  end

  transformed.compact
end
|
143
|
+
|
144
|
+
# Renders the block alignment as a human readable report: one section per
# block, showing the source/target offsets and the text (or the character
# level diff) of the block.
#
# NOTE(review): the block offsets are used to slice :source_text and
# :target_text directly - confirm that both are in the same coordinate space.
def alignment_show
  stext = @block_alignment[:source_text]
  ttext = @block_alignment[:target_text]

  @block_alignment[:blocks].inject('') do |show, a|
    source = a[:source]
    target = a[:target]

    section =
      case a[:alignment]
      when :block, :term
        # exactly aligned blocks: the source text is representative for both sides
        "===== common (#{a[:alignment]}) ===== [#{source[:begin]} - #{source[:end]}] [#{target[:begin]} - #{target[:end]}]\n" \
          "#{stext[source[:begin]...source[:end]]}\n\n"
      when :empty
        header = "xxxxx disparate texts (similarity: #{a[:similarity]})\n" \
                 "<<<<< string 1 [#{source[:begin]} - #{source[:end]}]\n" +
                 stext[source[:begin]...source[:end]] + "\n\n" \
                 ">>>>> string 2 "

        # an :empty block does not necessarily have a counterpart in the target
        if target
          header + "[#{target[:begin]} - #{target[:end]}]\n" + ttext[target[:begin]...target[:end]] + "\n\n"
        else
          header + "[-]\n\n"
        end
      else
        # a nested alignment: print both sides character by character,
        # with '_' marking the positions that have no counterpart
        astr1 = a[:alignment].sdiff.map do |c|
          if c.action == '+'
            '_'
          elsif c.action == '=' || c.action == '-'
            stext[c.old_position + source[:begin]]
          elsif c.action == '!'
            stext[c.old_position + source[:begin]] + '_'
          end
        end.join('')

        astr2 = a[:alignment].sdiff.map do |c|
          if c.action == '-'
            '_'
          elsif c.action == '=' || c.action == '+'
            ttext[c.new_position + target[:begin]]
          elsif c.action == '!'
            '_' + ttext[c.new_position + target[:begin]]
          end
        end.join('')

        "***** local mismatch [#{source[:begin]} - #{source[:end]}] [#{target[:begin]} - #{target[:end]}] (similarity: #{a[:similarity]})\n" \
          "[#{astr1}]\n" \
          "[#{astr2}]\n\n"
      end

    show + section
  end
end
|
207
|
+
|
208
|
+
private
|
209
|
+
|
210
|
+
def find_block_alignment(str1, str2, denotations, cultivation_map)
|
36
211
|
## to find block alignments
|
37
|
-
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2,
|
212
|
+
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, cultivation_map)
|
38
213
|
|
39
214
|
blocks = []
|
40
215
|
while block = anchor_finder.get_next_anchor
|
@@ -77,12 +252,13 @@ class TextAlignment::TextAlignment
|
|
77
252
|
|
78
253
|
if b2 == e2
|
79
254
|
[
|
80
|
-
{source:{begin:b1, end:e1},
|
255
|
+
{source:{begin:b1, end:e1}, alignment: :empty},
|
81
256
|
block
|
82
257
|
]
|
83
258
|
else
|
259
|
+
len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
260
|
+
|
84
261
|
if b1 == 0 && b2 == 0
|
85
|
-
len_buffer = (e1 * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
86
262
|
b2 = e2 - len_buffer if e2 > len_buffer
|
87
263
|
end
|
88
264
|
|
@@ -94,6 +270,10 @@ class TextAlignment::TextAlignment
|
|
94
270
|
{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
|
95
271
|
block
|
96
272
|
]
|
273
|
+
elsif ((e2 - b2) - (e1 - b1)) > len_buffer
|
274
|
+
la_block1 = local_alignment_blocks(str1, b1, e1, str2, b2, b2 + len_buffer, denotations)
|
275
|
+
la_block2 = local_alignment_blocks(str1, b1, e1, str2, e2 - len_buffer, e2, denotations)
|
276
|
+
[la_block2, la_block2].max{|a, b| a.first[:similarity] <=> b.first[:similarity]} << block
|
97
277
|
else
|
98
278
|
local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations) << block
|
99
279
|
end
|
@@ -111,21 +291,58 @@ class TextAlignment::TextAlignment
|
|
111
291
|
b1 = last_block[:source][:end]
|
112
292
|
if b1 < str1.length
|
113
293
|
e1 = str1.length
|
114
|
-
|
115
294
|
b2 = last_block[:target][:end]
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
295
|
+
|
296
|
+
_str1 = str1[b1 ... e1]
|
297
|
+
if _str1.strip.empty?
|
298
|
+
[{source:{begin:b1, end:e1}, alignment: :empty}]
|
120
299
|
else
|
121
|
-
|
300
|
+
if b2 < str2.length
|
301
|
+
len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
302
|
+
e2 = (str2.length - b2) > len_buffer ? b2 + len_buffer : str2.length
|
303
|
+
|
304
|
+
local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations)
|
305
|
+
else
|
306
|
+
[{source:{begin:b1, end:e1}, alignment: :empty}]
|
307
|
+
end
|
122
308
|
end
|
123
309
|
else
|
124
310
|
[]
|
125
311
|
end
|
126
312
|
end
|
313
|
+
end
|
314
|
+
|
315
|
+
# Tries to align str1 as one whole block somewhere in str2, first by an exact
# match and then by a case-insensitive match. Occurrences that begin inside an
# already cultivated region of str2 are skipped.
#
# Returns a one-element array holding the :block alignment, or nil when no
# usable occurrence exists.
def whole_block_alignment(str1, str2, cultivation_map)
  ## Block exact match, then block match ignoring case
  block_begin = _uncultivated_index(str1, str2, cultivation_map) ||
                _uncultivated_index(str1.downcase, str2.downcase, cultivation_map)
  return nil if block_begin.nil?

  [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
end

# Returns the position of the first occurrence of needle in haystack that does
# not begin in a cultivated region, or nil when every occurrence is cultivated
# (a cultivated hit is never reported, even if it was the last one found).
def _uncultivated_index(needle, haystack, cultivation_map)
  search_position = 0

  loop do
    found = haystack.index(needle, search_position)
    return nil if found.nil?

    # nil means the hit is free; otherwise resume behind the cultivated region
    search_position = cultivation_map.search_again_position(found)
    return found if search_position.nil?
  end
end
|
130
347
|
|
131
348
|
def local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations = nil)
|
@@ -138,7 +355,7 @@ class TextAlignment::TextAlignment
|
|
138
355
|
map{|d| d.merge(lex:str1[d[:span][:begin] ... d[:span][:end]])}
|
139
356
|
|
140
357
|
position = 0
|
141
|
-
|
358
|
+
_tblocks = ds_in_scope.map do |term|
|
142
359
|
lex = term[:lex]
|
143
360
|
r = block2.index(lex, position)
|
144
361
|
if r.nil?
|
@@ -146,11 +363,11 @@ class TextAlignment::TextAlignment
|
|
146
363
|
break
|
147
364
|
end
|
148
365
|
position = r + lex.length
|
149
|
-
{source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, delta: r - term[:span][:begin]}
|
366
|
+
{source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, similarity: 0.9, delta: r + b2 - term[:span][:begin]}
|
150
367
|
end
|
151
368
|
|
152
369
|
# missing term found
|
153
|
-
|
370
|
+
_tblocks = [] if position.nil?
|
154
371
|
|
155
372
|
# redundant matching found
|
156
373
|
unless position.nil?
|
@@ -158,19 +375,20 @@ class TextAlignment::TextAlignment
|
|
158
375
|
lex = term[:lex]
|
159
376
|
look_forward = block2.index(lex, position)
|
160
377
|
unless look_forward.nil?
|
161
|
-
|
162
|
-
tblocks = []
|
378
|
+
_tblocks = []
|
163
379
|
break
|
164
380
|
end
|
165
381
|
end
|
166
382
|
end
|
167
383
|
|
168
|
-
|
384
|
+
_tblocks
|
385
|
+
else
|
386
|
+
[]
|
169
387
|
end
|
170
388
|
|
171
389
|
if tblocks.empty?
|
172
390
|
if b1 == 0 && e1 == str1.length
|
173
|
-
if (e1 >
|
391
|
+
if (e1 > 2000) || (e2 > 2000)
|
174
392
|
[{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
|
175
393
|
else
|
176
394
|
block1 = str1[b1 ... e1]
|
@@ -237,153 +455,4 @@ class TextAlignment::TextAlignment
|
|
237
455
|
end
|
238
456
|
end
|
239
457
|
|
240
|
-
|
241
|
-
def indices(str, target)
|
242
|
-
position = 0
|
243
|
-
len = target.len
|
244
|
-
Enumerator.new do |yielder|
|
245
|
-
while idx = str.index(target, position)
|
246
|
-
yielder << idx
|
247
|
-
position = idx + len
|
248
|
-
end
|
249
|
-
end
|
250
|
-
end
|
251
|
-
|
252
|
-
def transform_begin_position(begin_position)
|
253
|
-
i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
|
254
|
-
block = @block_alignment[:blocks][i]
|
255
|
-
|
256
|
-
b = if block[:alignment] == :block || block[:alignment] == :term
|
257
|
-
begin_position + block[:delta]
|
258
|
-
elsif block[:alignment] == :empty
|
259
|
-
if begin_position == block[:source][:begin]
|
260
|
-
block[:target][:begin]
|
261
|
-
else
|
262
|
-
nil
|
263
|
-
end
|
264
|
-
else
|
265
|
-
r = block[:alignment].transform_begin_position(begin_position - block[:source][:begin])
|
266
|
-
r.nil? ? nil : r + block[:target][:begin]
|
267
|
-
end
|
268
|
-
end
|
269
|
-
|
270
|
-
def transform_end_position(end_position)
|
271
|
-
i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
|
272
|
-
block = @block_alignment[:blocks][i]
|
273
|
-
|
274
|
-
e = if block[:alignment] == :block || block[:alignment] == :term
|
275
|
-
end_position + block[:delta]
|
276
|
-
elsif block[:alignment] == :empty
|
277
|
-
if end_position == block[:source][:end]
|
278
|
-
block[:target][:end]
|
279
|
-
else
|
280
|
-
nil
|
281
|
-
end
|
282
|
-
else
|
283
|
-
r = block[:alignment].transform_end_position(end_position - block[:source][:begin])
|
284
|
-
r.nil? ? nil : r + block[:target][:begin]
|
285
|
-
end
|
286
|
-
end
|
287
|
-
|
288
|
-
def transform_a_span(span)
|
289
|
-
{begin: transform_begin_position(span[:begin]), end: transform_end_position(span[:end])}
|
290
|
-
end
|
291
|
-
|
292
|
-
def transform_spans(spans)
|
293
|
-
spans.map{|span| transform_a_span(span)}
|
294
|
-
end
|
295
|
-
|
296
|
-
def transform_denotations!(denotations)
|
297
|
-
return nil if denotations.nil?
|
298
|
-
@lost_annotations = []
|
299
|
-
|
300
|
-
denotations.each do |d|
|
301
|
-
source = {begin:d.begin, end:d.end}
|
302
|
-
d.begin = transform_begin_position(d.begin);
|
303
|
-
d.end = transform_end_position(d.end);
|
304
|
-
raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @str2.length
|
305
|
-
rescue
|
306
|
-
@lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
|
307
|
-
d.begin = nil
|
308
|
-
d.end = nil
|
309
|
-
end
|
310
|
-
|
311
|
-
@lost_annotations
|
312
|
-
end
|
313
|
-
|
314
|
-
def transform_hdenotations(hdenotations)
|
315
|
-
return nil if hdenotations.nil?
|
316
|
-
@lost_annotations = []
|
317
|
-
|
318
|
-
r = hdenotations.collect do |d|
|
319
|
-
t = transform_a_span(d[:span])
|
320
|
-
raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @str2.length
|
321
|
-
new_d = d.dup.merge({span:t})
|
322
|
-
rescue
|
323
|
-
@lost_annotations << {source: d[:span], target:t}
|
324
|
-
nil
|
325
|
-
end.compact
|
326
|
-
|
327
|
-
r
|
328
|
-
end
|
329
|
-
|
330
|
-
def alignment_show
|
331
|
-
stext = @block_alignment[:source_text]
|
332
|
-
ttext = @block_alignment[:target_text]
|
333
|
-
|
334
|
-
show = ''
|
335
|
-
@block_alignment[:blocks].each do |a|
|
336
|
-
show += case a[:alignment]
|
337
|
-
when :block
|
338
|
-
"===== common (block) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
339
|
-
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
|
340
|
-
when :term
|
341
|
-
"===== common (term) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
342
|
-
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
|
343
|
-
when :empty
|
344
|
-
"xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
|
345
|
-
"<<<<< string 1 [#{a[:source][:begin]} - #{a[:source][:end]}]\n" +
|
346
|
-
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n" +
|
347
|
-
">>>>> string 2 [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
348
|
-
ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
|
349
|
-
else
|
350
|
-
astr1 = ''
|
351
|
-
astr2 = ''
|
352
|
-
|
353
|
-
base = a[:source][:begin]
|
354
|
-
astr1 = a[:alignment].sdiff.map do |c|
|
355
|
-
case c.action
|
356
|
-
when '='
|
357
|
-
stext[c.old_position + base]
|
358
|
-
when '+'
|
359
|
-
'_'
|
360
|
-
when '-'
|
361
|
-
stext[c.old_position + base]
|
362
|
-
when '!'
|
363
|
-
stext[c.old_position + base] + '_'
|
364
|
-
end
|
365
|
-
end.join('')
|
366
|
-
|
367
|
-
base = a[:target][:begin]
|
368
|
-
astr2 = a[:alignment].sdiff.map do |c|
|
369
|
-
case c.action
|
370
|
-
when '='
|
371
|
-
ttext[c.new_position + base]
|
372
|
-
when '+'
|
373
|
-
ttext[c.new_position + base]
|
374
|
-
when '-'
|
375
|
-
'_'
|
376
|
-
when '!'
|
377
|
-
'_' + ttext[c.new_position + base]
|
378
|
-
end
|
379
|
-
end.join('')
|
380
|
-
|
381
|
-
"***** local mismatch [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}] (similarity: #{a[:similarity]})\n" +
|
382
|
-
"[#{astr1}]\n" +
|
383
|
-
"[#{astr2}]\n\n"
|
384
|
-
end
|
385
|
-
end
|
386
|
-
show
|
387
|
-
end
|
388
|
-
|
389
458
|
end
|