text_alignment 0.7.3 → 0.11.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/align_annotations +39 -140
- data/lib/text_alignment/anchor_finder.rb +130 -62
- data/lib/text_alignment/char_mapping.rb +189 -0
- data/lib/text_alignment/constants.rb +1 -1
- data/lib/text_alignment/cultivation_map.rb +19 -0
- data/lib/text_alignment/glcs_alignment_fast.rb +2 -2
- data/lib/text_alignment/mixed_alignment.rb +7 -63
- data/lib/text_alignment/text_alignment.rb +269 -181
- data/lib/text_alignment/version.rb +1 -1
- metadata +4 -3
- data/lib/text_alignment/mappings.rb +0 -75
@@ -0,0 +1,189 @@
|
|
1
|
+
module TextAlignment; end unless defined? TextAlignment

# Pairs of [character, spelling] used to normalize a text before alignment.
# A pair whose spelling is one character long is a plain substitution; a pair
# with a longer spelling lets the spelled-out form be folded into the single
# character. The list order is significant to code that walks it in sequence.
#
# Space and hyphen variants are written as \u escapes because they cannot be
# told apart by eye in an editor.
TextAlignment::CHAR_MAPPING = [
  ["©", "(c)"],	#U+00A9 (Copyright Sign)

  ["α", "alpha"],	#U+03B1 (greek small letter alpha)
  ["β", "beta"],	#U+03B2 (greek small letter beta)
  ["γ", "gamma"],	#U+03B3 (greek small letter gamma)
  ["δ", "delta"],	#U+03B4 (greek small letter delta)
  ["ε", "epsilon"],	#U+03B5 (greek small letter epsilon)
  ["ζ", "zeta"],	#U+03B6 (greek small letter zeta)
  ["η", "eta"],	#U+03B7 (greek small letter eta)
  ["θ", "theta"],	#U+03B8 (greek small letter theta)
  ["ι", "iota"],	#U+03B9 (greek small letter iota)
  ["κ", "kappa"],	#U+03BA (greek small letter kappa)
  ["λ", "lambda"],	#U+03BB (greek small letter lambda)
  ["λ", "lamda"],	#U+03BB (greek small letter lambda)
  ["μ", "mu"],	#U+03BC (greek small letter mu)
  ["ν", "nu"],	#U+03BD (greek small letter nu)
  ["ξ", "xi"],	#U+03BE (greek small letter xi)
  ["ο", "omicron"],	#U+03BF (greek small letter omicron)
  ["π", "pi"],	#U+03C0 (greek small letter pi)
  ["ρ", "rho"],	#U+03C1 (greek small letter rho)
  ["σ", "sigma"],	#U+03C3 (greek small letter sigma)
  ["τ", "tau"],	#U+03C4 (greek small letter tau)
  ["υ", "upsilon"],	#U+03C5 (greek small letter upsilon)
  ["φ", "phi"],	#U+03C6 (greek small letter phi)
  ["χ", "chi"],	#U+03C7 (greek small letter chi)
  ["ψ", "psi"],	#U+03C8 (greek small letter psi)
  ["ω", "omega"],	#U+03C9 (greek small letter omega)

  ["Α", "Alpha"],	#U+0391 (greek capital letter alpha)
  ["Β", "Beta"],	#U+0392 (greek capital letter beta)
  ["Γ", "Gamma"],	#U+0393 (greek capital letter gamma)
  ["Δ", "Delta"],	#U+0394 (greek capital letter delta)
  ["Ε", "Epsilon"],	#U+0395 (greek capital letter epsilon)
  ["Ζ", "Zeta"],	#U+0396 (greek capital letter zeta)
  ["Η", "Eta"],	#U+0397 (greek capital letter eta)
  ["Θ", "Theta"],	#U+0398 (greek capital letter theta)
  ["Ι", "Iota"],	#U+0399 (greek capital letter iota)
  ["Κ", "Kappa"],	#U+039A (greek capital letter kappa)
  ["Λ", "Lambda"],	#U+039B (greek capital letter lambda)
  ["Λ", "Lamda"],	#U+039B (greek capital letter lambda)
  ["Μ", "Mu"],	#U+039C (greek capital letter mu)
  ["Ν", "Nu"],	#U+039D (greek capital letter nu)
  ["Ξ", "Xi"],	#U+039E (greek capital letter xi)
  ["Ο", "Omicron"],	#U+039F (greek capital letter omicron)
  ["Π", "Pi"],	#U+03A0 (greek capital letter pi)
  ["Ρ", "Rho"],	#U+03A1 (greek capital letter rho)
  ["Σ", "Sigma"],	#U+03A3 (greek capital letter sigma)
  ["Τ", "Tau"],	#U+03A4 (greek capital letter tau)
  ["Υ", "Upsilon"],	#U+03A5 (greek capital letter upsilon)
  ["Φ", "Phi"],	#U+03A6 (greek capital letter phi)
  ["Χ", "Chi"],	#U+03A7 (greek capital letter chi)
  ["Ψ", "Psi"],	#U+03A8 (greek capital letter Psi)
  ["Ω", "Omega"],	#U+03A9 (greek capital letter omega)

  ["ϕ", "phi"],	#U+03D5 (greek phi symbol)

  ["×", "x"],	#U+00D7 (multiplication sign)
  ["•", "*"],	#U+2022 (bullet)
  ["\u2009", " "],	#U+2009 (thin space)
  ["\u200A", " "],	#U+200A (hair space)
  ["\u00A0", " "],	#U+00A0 (Non-Breaking space)
  ["\u3000", " "],	#U+3000 (ideographic space)
  ["\u2010", "-"],	#U+2010 (Hyphen)
  ["\u2011", "-"],	#U+2011 (Non-Breaking Hyphen)
  ["\u2212", "-"],	#U+2212 (minus sign)
  ["\u2013", "-"],	#U+2013 (en dash)
  ["′", "'"],	#U+2032 (prime)
  ["‘", "'"],	#U+2018 (left single quotation mark)
  ["’", "'"],	#U+2019 (right single quotation mark)
  ["“", '"'],	#U+201C (left double quotation mark)
  ["”", '"'],	#U+201D (right double quotation mark)
  ['"', "''"]	# two single quotes are treated as one double quote
]
|
77
|
+
|
78
|
+
|
79
|
+
# Namespace guard, so that this definition also loads on its own.
module TextAlignment; end unless defined? TextAlignment

# Normalizes a text with a character mapping and remembers how character
# offsets of the original text correspond to offsets of the normalized text.
#
# Normalization performs, in this order:
# 1. one-to-one character substitutions (pairs whose spelling is one character),
# 2. folding of multi-character spellings into their single character,
# 3. squeezing of runs of two or more whitespace characters into one space.
#
# NOTE(review): the offset bookkeeping assumes the first element of every
# pair is exactly one character long - confirm for custom mappings.
class TextAlignment::CharMapping
  # The normalized text.
  attr_reader :mapped_text

  # _text        - the text to normalize.
  # char_mapping - array of [character, spelling] pairs;
  #                TextAlignment::CHAR_MAPPING is used when nil.
  def initialize(_text, char_mapping = nil)
    char_mapping ||= TextAlignment::CHAR_MAPPING
    @mapped_text, offset_mapping = enmap_text(_text, char_mapping)
    @index_enmap = offset_mapping.to_h
    @index_demap = offset_mapping.map(&:reverse).to_h
  end

  # Offset in the normalized text for an offset of the original text.
  # Returns nil for an offset that falls inside a folded or squeezed span.
  def enmap_position(position)
    @index_enmap[position]
  end

  # Offset in the original text for an offset of the normalized text,
  # or nil when the offset is unknown.
  def demap_position(position)
    @index_demap[position]
  end

  # Returns copies of the given denotations (hashes with a :span of
  # :begin/:end offsets) with the spans converted to the normalized text.
  # The input hashes are not modified. Returns nil for nil.
  def enmap_denotations(_denotations)
    return nil if _denotations.nil?

    _denotations.map do |d|
      span = d[:span]
      d.merge(span: {begin: enmap_position(span[:begin]), end: enmap_position(span[:end])})
    end
  end

  private

  # Returns [normalized_text, offset_mapping], where offset_mapping is an
  # array of [original_offset, normalized_offset] pairs, including the pair
  # for the end-of-text offset.
  def enmap_text(_text, char_mapping)
    text = _text.dup

    # To execute the single letter mapping (lengths are unchanged)
    char_mapping.each do |one, long|
      text.gsub!(one, long) if long.length == 1
    end

    # Multi-character spellings, longest first, so that a spelling embedded
    # in a longer one ("eta" in "theta") cannot pre-empt the longer match.
    # The index keeps the original order among spellings of equal length.
    # Empty spellings are ignored: they would match everywhere, forever.
    folds = char_mapping.select { |_one, long| long.length > 1 }
    folds = folds.each_with_index.sort_by { |(_one, long), idx| [-long.length, idx] }.map(&:first)

    # To get the [location, length, replacement] index for replacements
    spans = []
    folds.each do |one, long|
      init_next = 0
      while (loc = text.index(long, init_next))
        spans << [loc, long.length, one]
        init_next = loc + long.length
      end

      # Mask the matches with same-length padding, so that later spellings
      # cannot match inside them while all offsets stay unchanged.
      text.gsub!(long, one * long.length)
    end

    # To get the index for consecutive whitespace sequences
    init_next = 0
    while (loc = text.index(/\s{2,}/, init_next))
      len = Regexp.last_match(0).length
      spans << [loc, len, ' ']
      init_next = loc + len
    end

    spans.sort_by!(&:first)

    # To build the offset mapping and the normalized text in one pass.
    # The text is assembled from the recorded spans only; re-scanning the
    # padded text would also collapse look-alike runs that were already in
    # the input (e.g. '""'), and the text would no longer fit the mapping.
    offset_mapping = []
    mapped = text[0, 0] # an empty string in the encoding of the text
    init_next = 0

    spans.each do |loc, len, replacement|
      (init_next..loc).each { |i| offset_mapping << [i, offset_mapping.length] }
      mapped << text[init_next...loc].to_s << replacement
      init_next = loc + len
    end

    (init_next..text.length).each { |i| offset_mapping << [i, offset_mapping.length] }
    mapped << text[init_next..-1].to_s

    [mapped, offset_mapping]
  end
end
|
169
|
+
|
170
|
+
# Command-line use: reads one annotation JSON file and prints its text and
# denotations after character mapping, as JSON.
if $0 == __FILE__
  require 'json'

  # Exactly one argument, the annotation file, is expected.
  if ARGV.length != 1
    warn "#{$0} an_annotation_json_file.json"
    exit
  end

  source = JSON.parse(File.read(ARGV[0]).strip, symbolize_names: true)

  # Fall back to the denotations of the first track when there are none at the top level.
  spans = source[:denotations]
  spans = source[:tracks].first[:denotations] if spans.nil? && source[:tracks]

  mapper = TextAlignment::CharMapping.new(source[:text])
  puts({text: mapper.mapped_text, denotations: mapper.enmap_denotations(spans)}.to_json)
end
|
@@ -1,7 +1,7 @@
|
|
1
1
|
module TextAlignment; end unless defined? TextAlignment
|
2
2
|
|
3
3
|
TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
|
4
|
-
TextAlignment::SIZE_WINDOW =
|
4
|
+
TextAlignment::SIZE_WINDOW = 10 unless defined? TextAlignment::SIZE_WINDOW
|
5
5
|
TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
|
6
6
|
TextAlignment::BUFFER_MIN = 20 unless defined? TextAlignment::BUFFER_MIN
|
7
7
|
TextAlignment::TEXT_SIMILARITY_THRESHOLD = 0.9 unless defined? TextAlignment::TEXT_SIMILARITY_THRESHOLD
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module TextAlignment; end unless defined? TextAlignment

# Remembers regions given as [begin, end) pairs. For every position inside
# a remembered region, the end of that region can be looked up.
class TextAlignment::CultivationMap
  # Hash from a position to the end of the region it was registered with.
  attr_reader :map

  def initialize
    @map = {}
  end

  # Registers each [begin, end) region: every position from begin up to,
  # but not including, end is mapped to end.
  def cultivate(regions)
    regions.each do |first, stop|
      (first...stop).each_with_object(@map) { |position, memo| memo[position] = stop }
    end
  end

  # Returns the end of the region registered for the position, or nil when
  # the position is in no registered region.
  def search_again_position(position)
    map[position]
  end
end
|
@@ -5,7 +5,7 @@ require 'text_alignment/find_divisions'
|
|
5
5
|
require 'text_alignment/lcs_comparison'
|
6
6
|
require 'text_alignment/lcs_alignment'
|
7
7
|
require 'text_alignment/glcs_alignment'
|
8
|
-
require 'text_alignment/
|
8
|
+
require 'text_alignment/char_mapping'
|
9
9
|
|
10
10
|
module TextAlignment; end unless defined? TextAlignment
|
11
11
|
|
@@ -106,7 +106,7 @@ if __FILE__ == $0
|
|
106
106
|
|
107
107
|
dictionary = [["β", "beta"]]
|
108
108
|
# align = TextAlignment::TextAlignment.new(str1, str2)
|
109
|
-
align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::
|
109
|
+
align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::CHAR_MAPPING)
|
110
110
|
p align.common_elements
|
111
111
|
p align.mapped_elements
|
112
112
|
end
|
@@ -6,7 +6,7 @@ require 'text_alignment/lcs_comparison'
|
|
6
6
|
require 'text_alignment/lcs_alignment'
|
7
7
|
require 'text_alignment/lcs_cdiff'
|
8
8
|
require 'text_alignment/glcs_alignment'
|
9
|
-
require 'text_alignment/
|
9
|
+
require 'text_alignment/char_mapping'
|
10
10
|
|
11
11
|
module TextAlignment; end unless defined? TextAlignment
|
12
12
|
|
@@ -17,10 +17,12 @@ class TextAlignment::MixedAlignment
|
|
17
17
|
attr_reader :similarity
|
18
18
|
attr_reader :str1_match_initial, :str1_match_final, :str2_match_initial, :str2_match_final
|
19
19
|
|
20
|
-
def initialize(_str1, _str2)
|
20
|
+
# Computes the mixed alignment of two strings.
#
# _str1, _str2 - the strings to align; copies are handed on, so the
#                arguments themselves are left untouched here.
# _mappings    - optional character mapping; TextAlignment::CHAR_MAPPING
#                is used when it is nil.
#
# Raises ArgumentError when either string is nil.
def initialize(_str1, _str2, _mappings = nil)
  raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?

  # Read the parameter: `mappings ||= ...` would create a fresh local and
  # always fall back to the default, silently ignoring the caller's mapping.
  mappings = _mappings || TextAlignment::CHAR_MAPPING
  str1 = _str1.dup
  str2 = _str2.dup

  _compute_mixed_alignment(str1, str2, mappings)
end
|
@@ -139,72 +141,14 @@ class TextAlignment::MixedAlignment
|
|
139
141
|
@position_map_end = posmap_end.sort.to_h
|
140
142
|
end
|
141
143
|
|
142
|
-
|
143
|
-
|
144
|
-
def string_preprocessing(_str1, _str2)
|
145
|
-
str1 = _str1.dup
|
146
|
-
str2 = _str2.dup
|
147
|
-
mappings = TextAlignment::MAPPINGS.dup
|
148
|
-
|
149
|
-
## single character mappings
|
150
|
-
character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
|
151
|
-
characters_from = character_mappings.collect{|m| m[0]}.join
|
152
|
-
characters_to = character_mappings.collect{|m| m[1]}.join
|
153
|
-
characters_to.gsub!(/-/, '\-')
|
154
|
-
|
155
|
-
str1.tr!(characters_from, characters_to)
|
156
|
-
str2.tr!(characters_from, characters_to)
|
157
|
-
|
158
|
-
mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
|
159
|
-
|
160
|
-
## long to one character mappings
|
161
|
-
pletters = TextAlignment::PADDING_LETTERS
|
162
|
-
|
163
|
-
# find the padding letter for str1
|
164
|
-
@padding_letter1 = begin
|
165
|
-
i = pletters.index{|l| str2.index(l).nil?}
|
166
|
-
raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
|
167
|
-
TextAlignment::PADDING_LETTERS[i]
|
168
|
-
end
|
169
|
-
|
170
|
-
# find the padding letter for str2
|
171
|
-
@padding_letter2 = begin
|
172
|
-
i = pletters.index{|l| l != @padding_letter1 && str1.index(l).nil?}
|
173
|
-
raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
|
174
|
-
TextAlignment::PADDING_LETTERS[i]
|
175
|
-
end
|
176
|
-
|
177
|
-
# ASCII foldings
|
178
|
-
ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
|
179
|
-
ascii_foldings.each do |f|
|
180
|
-
from = f[1]
|
181
|
-
|
182
|
-
if str2.index(f[0])
|
183
|
-
to = f[0] + (@padding_letter1 * (f[1].length - 1))
|
184
|
-
str1.gsub!(from, to)
|
185
|
-
end
|
186
|
-
|
187
|
-
if str1.index(f[0])
|
188
|
-
to = f[0] + (@padding_letter2 * (f[1].length - 1))
|
189
|
-
str2.gsub!(from, to)
|
190
|
-
end
|
191
|
-
end
|
192
|
-
mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
|
193
|
-
|
194
|
-
[str1, str2, mappings]
|
195
|
-
end
|
196
|
-
|
197
|
-
def compute_similarity(_s1, _s2, sdiff)
|
144
|
+
# Similarity of two strings given their sdiff: the number of unchanged
# elements that are non-whitespace on both sides, divided by the
# non-whitespace character count of the shorter string.
# Returns 0 when there is no sdiff or nothing matches.
def compute_similarity(s1, s2, sdiff)
  return 0 if sdiff.nil?

  # only non-whitespace letters count as common
  matched = sdiff.count do |change|
    change.action == '=' && change.old_element =~ /\S/ && change.new_element =~ /\S/
  end
  return 0 if matched == 0

  letter_counts = [s1, s2].map { |s| s.scan(/\S/).count }
  matched.to_f / letter_counts.min
end
|
209
153
|
|
210
154
|
end
|
@@ -2,39 +2,233 @@
|
|
2
2
|
require 'text_alignment/constants'
|
3
3
|
require 'text_alignment/anchor_finder'
|
4
4
|
require 'text_alignment/mixed_alignment'
|
5
|
+
require 'text_alignment/cultivation_map'
|
5
6
|
|
6
7
|
module TextAlignment; end unless defined? TextAlignment
|
7
8
|
|
8
|
-
TextAlignment::PADDING_LETTERS = ['@', '^', '|', '#', '$', '%', '&', '_'] unless defined? TextAlignment::PADDING_LETTERS
|
9
|
-
|
10
9
|
class TextAlignment::TextAlignment
|
11
10
|
attr_reader :block_alignment
|
12
11
|
attr_reader :similarity
|
13
12
|
attr_reader :lost_annotations
|
14
13
|
|
15
|
-
|
16
|
-
|
14
|
+
# Initialize with a reference text, against which texts will be aligned
|
15
|
+
# reference_text     - the text to align against; must not be nil.
# to_prevent_overlap - when true, align updates the cultivation map
#                      before each run.
#
# Raises ArgumentError for a nil reference text.
def initialize(reference_text, to_prevent_overlap = false)
  if reference_text.nil?
    raise ArgumentError, "nil text"
  end

  # State that is filled in by later calls to align
  @block_alignment = nil
  @original_text = nil

  # Settings and bookkeeping for overlap prevention
  @to_prevent_overlap = to_prevent_overlap
  @cultivation_map = TextAlignment::CultivationMap.new

  # The reference text, kept as given and with its character mapping
  @original_rtext = reference_text
  @rtext_mapping = TextAlignment::CharMapping.new(reference_text)
end
|
26
|
+
|
27
|
+
# Aligns a text against the reference text and stores the result in
# @block_alignment (:text, :reference_text, :denotations, :blocks).
#
# text        - the text to align.
# denotations - optional denotations of the text, passed on to the block
#               search after conversion to mapped offsets.
#
# Returns the blocks of the alignment.
def align(text, denotations = nil)
  # Keep the cultivation map current when overlaps are to be prevented
  update_cultivation_map if @to_prevent_overlap

  # The text mapping is rebuilt only when the text differs from the previous one
  if !@original_text || @original_text != text
    @original_text = text
    @text_mapping = TextAlignment::CharMapping.new(text)
  end

  source = @text_mapping.mapped_text
  source_denotations = @text_mapping.enmap_denotations(denotations)
  reference = @rtext_mapping.mapped_text

  @block_alignment = {text: @original_text, reference_text: @original_rtext, denotations: denotations}

  # A whole-text match is preferred; otherwise the blocks are searched for
  @block_alignment[:blocks] =
    whole_block_alignment(source, reference, @cultivation_map) ||
    find_block_alignment(source, reference, source_denotations, @cultivation_map)
end
|
54
|
+
|
55
|
+
# Registers the target regions of the :block and :term blocks of the
# current alignment in the cultivation map. A region that starts no more
# than one position after the end of the previous region is merged into it.
# Does nothing when there is no alignment yet.
def update_cultivation_map
  blocks = @block_alignment && @block_alignment[:blocks]
  return if blocks.nil?

  matched = blocks.select { |blk| blk[:alignment] == :block || blk[:alignment] == :term }

  regions = matched.each_with_object([]) do |blk, merged|
    first = blk[:target][:begin]
    stop = blk[:target][:end]

    if merged.empty? || merged.last.last + 1 < first
      merged << [first, stop]
    else
      # close enough to the previous region: extend that one instead
      merged.last[1] = stop
    end
  end

  @cultivation_map.cultivate(regions)
end
|
76
|
+
|
77
|
+
# Transforms a begin offset of the aligned text into the corresponding
# offset of the reference text.
#
# Returns nil when the position cannot be transformed: it has no mapped
# offset, no block of the alignment covers it, it lies inside an :empty
# block (or the :empty block has no target), or the local alignment of its
# block gives no position. These cases are reported as nil, like the other
# untransformable cases, rather than as a NoMethodError, TypeError or
# ArgumentError raised by an incidental nil.
def transform_begin_position(_begin_position)
  begin_position = @text_mapping.enmap_position(_begin_position)
  return nil if begin_position.nil?

  # The first block that ends after the position is the one that holds it
  block = @block_alignment[:blocks].find { |b| b[:source][:end] > begin_position }
  return nil if block.nil?

  alignment = block[:alignment]
  b = if alignment == :block || alignment == :term
    begin_position + block[:delta]
  elsif alignment == :empty
    # Only the very beginning of an empty block has a counterpart,
    # and only when the block has a target at all.
    target = block[:target]
    target[:begin] if target && begin_position == block[:source][:begin]
  else
    r = alignment.transform_begin_position(begin_position - block[:source][:begin])
    r.nil? ? nil : r + block[:target][:begin]
  end

  @rtext_mapping.demap_position(b)
end
|
98
|
+
|
99
|
+
# Transforms an end offset of the aligned text into the corresponding
# offset of the reference text.
#
# Returns nil when the position cannot be transformed: it has no mapped
# offset, no block of the alignment covers it, it lies inside an :empty
# block (or the :empty block has no target), or the local alignment of its
# block gives no position. These cases are reported as nil, like the other
# untransformable cases, rather than as a NoMethodError, TypeError or
# ArgumentError raised by an incidental nil.
def transform_end_position(_end_position)
  end_position = @text_mapping.enmap_position(_end_position)
  return nil if end_position.nil?

  # The first block that ends at or after the position is the one that holds it
  block = @block_alignment[:blocks].find { |b| b[:source][:end] >= end_position }
  return nil if block.nil?

  alignment = block[:alignment]
  e = if alignment == :block || alignment == :term
    end_position + block[:delta]
  elsif alignment == :empty
    # Only the very end of an empty block has a counterpart,
    # and only when the block has a target at all.
    target = block[:target]
    target[:end] if target && end_position == block[:source][:end]
  else
    r = alignment.transform_end_position(end_position - block[:source][:begin])
    r.nil? ? nil : r + block[:target][:begin]
  end

  @rtext_mapping.demap_position(e)
end
|
120
|
+
|
121
|
+
# Transforms the :begin and :end offsets of a span, in that order, and
# returns them as a new span hash.
def transform_a_span(span)
  new_begin = transform_begin_position(span[:begin])
  new_end = transform_end_position(span[:end])

  {begin: new_begin, end: new_end}
end
|
124
|
+
|
125
|
+
# Transforms every span of the collection; returns the new spans in order.
def transform_spans(spans)
  spans.map(&method(:transform_a_span))
end
|
128
|
+
|
129
|
+
# Transforms, in place, the begin and end offsets of denotation objects
# (anything with begin/end accessors) to offsets of the reference text.
#
# A denotation whose transformation fails, or yields offsets that are nil,
# negative, empty or beyond the reference text, gets nil offsets and is
# recorded in @lost_annotations with its source span and the target span
# as far as it was computed.
#
# Returns the lost annotations, or nil for nil input.
def transform_denotations!(denotations)
  return nil if denotations.nil?
  @lost_annotations = []

  denotations.each do |d|
    begin
      source = {begin: d.begin, end: d.end}
      d.begin = transform_begin_position(d.begin)
      d.end = transform_end_position(d.end)

      valid = !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_rtext.length
      raise "invalid transform" unless valid
    rescue
      @lost_annotations << {source: source, target: {begin: d.begin, end: d.end}}
      d.begin = nil
      d.end = nil
    end
  end

  @lost_annotations
end
|
146
|
+
|
147
|
+
# Transforms the spans of denotation hashes to spans of the reference text.
#
# Returns new hashes for the denotations that could be transformed; the
# input hashes are left as they are. A denotation whose transformation
# fails, or yields a span that is incomplete, negative, empty or beyond the
# reference text, is left out and recorded in @lost_annotations with its
# source span and the target span (nil when none was computed).
#
# Returns nil for nil input.
def transform_hdenotations(hdenotations)
  return nil if hdenotations.nil?
  @lost_annotations = []

  hdenotations.each_with_object([]) do |d, transformed|
    begin
      t = transform_a_span(d[:span])

      valid = !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_rtext.length
      raise "invalid transform" unless valid

      transformed << d.dup.merge({span: t})
    rescue
      @lost_annotations << {source: d[:span], target: t}
    end
  end
end
|
162
|
+
|
163
|
+
# Renders the current block alignment as a human-readable report, one
# section per block: common blocks and terms with their text, disparate
# (:empty) blocks with both sides (the second side only when the block has
# a target), and locally aligned blocks as two character rows in which '_'
# marks a gap.
def alignment_show
  stext = @block_alignment[:text]
  ttext = @block_alignment[:reference_text]

  @block_alignment[:blocks].inject('') do |show, a|
    src = a[:source]
    tgt = a[:target]

    section = case a[:alignment]
    when :block
      "===== common (block) ===== [#{src[:begin]} - #{src[:end]}] [#{tgt[:begin]} - #{tgt[:end]}]\n" +
      stext[src[:begin] ... src[:end]] + "\n\n"
    when :term
      "===== common (term) ===== [#{src[:begin]} - #{src[:end]}] [#{tgt[:begin]} - #{tgt[:end]}]\n" +
      stext[src[:begin] ... src[:end]] + "\n\n"
    when :empty
      other_side = if tgt
        "[#{tgt[:begin]} - #{tgt[:end]}]\n" +
        ttext[tgt[:begin] ... tgt[:end]] + "\n\n"
      else
        "[-]\n\n"
      end

      "xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
      "<<<<< string 1 [#{src[:begin]} - #{src[:end]}]\n" +
      stext[src[:begin] ... src[:end]] + "\n\n" +
      ">>>>> string 2 " +
      other_side
    else
      # a local alignment: one row per side, built from its sdiff
      sbase = src[:begin]
      row1 = a[:alignment].sdiff.map do |c|
        case c.action
        when '=', '-'
          stext[c.old_position + sbase]
        when '+'
          '_'
        when '!'
          stext[c.old_position + sbase] + '_'
        end
      end.join('')

      tbase = tgt[:begin]
      row2 = a[:alignment].sdiff.map do |c|
        case c.action
        when '=', '+'
          ttext[c.new_position + tbase]
        when '-'
          '_'
        when '!'
          '_' + ttext[c.new_position + tbase]
        end
      end.join('')

      "***** local mismatch [#{src[:begin]} - #{src[:end]}] [#{tgt[:begin]} - #{tgt[:end]}] (similarity: #{a[:similarity]})\n" +
      "[#{row1}]\n" +
      "[#{row2}]\n\n"
    end

    show + section
  end
end
|
226
|
+
|
227
|
+
private
|
228
|
+
|
229
|
+
def find_block_alignment(str1, str2, denotations, cultivation_map)
|
36
230
|
## to find block alignments
|
37
|
-
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2,
|
231
|
+
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, cultivation_map)
|
38
232
|
|
39
233
|
blocks = []
|
40
234
|
while block = anchor_finder.get_next_anchor
|
@@ -77,12 +271,13 @@ class TextAlignment::TextAlignment
|
|
77
271
|
|
78
272
|
if b2 == e2
|
79
273
|
[
|
80
|
-
{source:{begin:b1, end:e1},
|
274
|
+
{source:{begin:b1, end:e1}, alignment: :empty},
|
81
275
|
block
|
82
276
|
]
|
83
277
|
else
|
278
|
+
len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
279
|
+
|
84
280
|
if b1 == 0 && b2 == 0
|
85
|
-
len_buffer = (e1 * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
86
281
|
b2 = e2 - len_buffer if e2 > len_buffer
|
87
282
|
end
|
88
283
|
|
@@ -94,6 +289,10 @@ class TextAlignment::TextAlignment
|
|
94
289
|
{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
|
95
290
|
block
|
96
291
|
]
|
292
|
+
elsif ((e2 - b2) - (e1 - b1)) > len_buffer
|
293
|
+
la_block1 = local_alignment_blocks(str1, b1, e1, str2, b2, b2 + len_buffer, denotations)
|
294
|
+
la_block2 = local_alignment_blocks(str1, b1, e1, str2, e2 - len_buffer, e2, denotations)
|
295
|
+
# Keep whichever candidate window (head or tail of the gap) aligned better;
# both candidates have to take part in the comparison.
[la_block1, la_block2].max{|a, b| a.first[:similarity] <=> b.first[:similarity]} << block
|
97
296
|
else
|
98
297
|
local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations) << block
|
99
298
|
end
|
@@ -111,21 +310,58 @@ class TextAlignment::TextAlignment
|
|
111
310
|
b1 = last_block[:source][:end]
|
112
311
|
if b1 < str1.length
|
113
312
|
e1 = str1.length
|
114
|
-
|
115
313
|
b2 = last_block[:target][:end]
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
314
|
+
|
315
|
+
_str1 = str1[b1 ... e1]
|
316
|
+
if _str1.strip.empty?
|
317
|
+
[{source:{begin:b1, end:e1}, alignment: :empty}]
|
120
318
|
else
|
121
|
-
|
319
|
+
if b2 < str2.length
|
320
|
+
len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
|
321
|
+
e2 = (str2.length - b2) > len_buffer ? b2 + len_buffer : str2.length
|
322
|
+
|
323
|
+
local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations)
|
324
|
+
else
|
325
|
+
[{source:{begin:b1, end:e1}, alignment: :empty}]
|
326
|
+
end
|
122
327
|
end
|
123
328
|
else
|
124
329
|
[]
|
125
330
|
end
|
126
331
|
end
|
332
|
+
end
|
333
|
+
|
334
|
+
# Tries to align str1 as one whole block inside str2: first by an exact
# match, then by a case-insensitive one. Matches that begin at a position
# the cultivation map knows are skipped, and the search continues from the
# position the map gives.
#
# Returns a one-element array with the :block alignment, or nil when str1
# does not occur in str2 outside the cultivated regions.
def whole_block_alignment(str1, str2, cultivation_map)
  ## Block exact match, then case-insensitive match
  block_begin = first_uncultivated_index(str2, str1, cultivation_map) ||
                first_uncultivated_index(str2.downcase, str1.downcase, cultivation_map)
  return nil if block_begin.nil?

  [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
end

# Returns the first index of needle in haystack that the cultivation map
# does not know, or nil when there is none.
#
# An explicit loop is used on purpose: with
# `found = begin ... break ... end until cond` the assignment is part of
# the loop body, so a break on a later pass leaves the earlier, already
# rejected, index in the variable.
def first_uncultivated_index(haystack, needle, cultivation_map)
  search_position = 0

  while (found = haystack.index(needle, search_position))
    search_position = cultivation_map.search_again_position(found)
    return found if search_position.nil?
  end

  nil
end
|
130
366
|
|
131
367
|
def local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations = nil)
|
@@ -138,7 +374,7 @@ class TextAlignment::TextAlignment
|
|
138
374
|
map{|d| d.merge(lex:str1[d[:span][:begin] ... d[:span][:end]])}
|
139
375
|
|
140
376
|
position = 0
|
141
|
-
|
377
|
+
_tblocks = ds_in_scope.map do |term|
|
142
378
|
lex = term[:lex]
|
143
379
|
r = block2.index(lex, position)
|
144
380
|
if r.nil?
|
@@ -146,11 +382,11 @@ class TextAlignment::TextAlignment
|
|
146
382
|
break
|
147
383
|
end
|
148
384
|
position = r + lex.length
|
149
|
-
{source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, delta: r - term[:span][:begin]}
|
385
|
+
{source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, similarity: 0.9, delta: r + b2 - term[:span][:begin]}
|
150
386
|
end
|
151
387
|
|
152
388
|
# missing term found
|
153
|
-
|
389
|
+
_tblocks = [] if position.nil?
|
154
390
|
|
155
391
|
# redundant matching found
|
156
392
|
unless position.nil?
|
@@ -158,14 +394,15 @@ class TextAlignment::TextAlignment
|
|
158
394
|
lex = term[:lex]
|
159
395
|
look_forward = block2.index(lex, position)
|
160
396
|
unless look_forward.nil?
|
161
|
-
|
162
|
-
tblocks = []
|
397
|
+
_tblocks = []
|
163
398
|
break
|
164
399
|
end
|
165
400
|
end
|
166
401
|
end
|
167
402
|
|
168
|
-
|
403
|
+
_tblocks
|
404
|
+
else
|
405
|
+
[]
|
169
406
|
end
|
170
407
|
|
171
408
|
if tblocks.empty?
|
@@ -237,153 +474,4 @@ class TextAlignment::TextAlignment
|
|
237
474
|
end
|
238
475
|
end
|
239
476
|
|
240
|
-
|
241
|
-
def indices(str, target)
|
242
|
-
position = 0
|
243
|
-
len = target.len
|
244
|
-
Enumerator.new do |yielder|
|
245
|
-
while idx = str.index(target, position)
|
246
|
-
yielder << idx
|
247
|
-
position = idx + len
|
248
|
-
end
|
249
|
-
end
|
250
|
-
end
|
251
|
-
|
252
|
-
def transform_begin_position(begin_position)
|
253
|
-
i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
|
254
|
-
block = @block_alignment[:blocks][i]
|
255
|
-
|
256
|
-
b = if block[:alignment] == :block || block[:alignment] == :term
|
257
|
-
begin_position + block[:delta]
|
258
|
-
elsif block[:alignment] == :empty
|
259
|
-
if begin_position == block[:source][:begin]
|
260
|
-
block[:target][:begin]
|
261
|
-
else
|
262
|
-
nil
|
263
|
-
end
|
264
|
-
else
|
265
|
-
r = block[:alignment].transform_begin_position(begin_position - block[:source][:begin])
|
266
|
-
r.nil? ? nil : r + block[:target][:begin]
|
267
|
-
end
|
268
|
-
end
|
269
|
-
|
270
|
-
def transform_end_position(end_position)
|
271
|
-
i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
|
272
|
-
block = @block_alignment[:blocks][i]
|
273
|
-
|
274
|
-
e = if block[:alignment] == :block || block[:alignment] == :term
|
275
|
-
end_position + block[:delta]
|
276
|
-
elsif block[:alignment] == :empty
|
277
|
-
if end_position == block[:source][:end]
|
278
|
-
block[:target][:end]
|
279
|
-
else
|
280
|
-
nil
|
281
|
-
end
|
282
|
-
else
|
283
|
-
r = block[:alignment].transform_end_position(end_position - block[:source][:begin])
|
284
|
-
r.nil? ? nil : r + block[:target][:begin]
|
285
|
-
end
|
286
|
-
end
|
287
|
-
|
288
|
-
def transform_a_span(span)
|
289
|
-
{begin: transform_begin_position(span[:begin]), end: transform_end_position(span[:end])}
|
290
|
-
end
|
291
|
-
|
292
|
-
def transform_spans(spans)
|
293
|
-
spans.map{|span| transform_a_span(span)}
|
294
|
-
end
|
295
|
-
|
296
|
-
def transform_denotations!(denotations)
|
297
|
-
return nil if denotations.nil?
|
298
|
-
@lost_annotations = []
|
299
|
-
|
300
|
-
denotations.each do |d|
|
301
|
-
source = {begin:d.begin, end:d.end}
|
302
|
-
d.begin = transform_begin_position(d.begin);
|
303
|
-
d.end = transform_end_position(d.end);
|
304
|
-
raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @str2.length
|
305
|
-
rescue
|
306
|
-
@lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
|
307
|
-
d.begin = nil
|
308
|
-
d.end = nil
|
309
|
-
end
|
310
|
-
|
311
|
-
@lost_annotations
|
312
|
-
end
|
313
|
-
|
314
|
-
def transform_hdenotations(hdenotations)
|
315
|
-
return nil if hdenotations.nil?
|
316
|
-
@lost_annotations = []
|
317
|
-
|
318
|
-
r = hdenotations.collect do |d|
|
319
|
-
t = transform_a_span(d[:span])
|
320
|
-
raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @str2.length
|
321
|
-
new_d = d.dup.merge({span:t})
|
322
|
-
rescue
|
323
|
-
@lost_annotations << {source: d[:span], target:t}
|
324
|
-
nil
|
325
|
-
end.compact
|
326
|
-
|
327
|
-
r
|
328
|
-
end
|
329
|
-
|
330
|
-
def alignment_show
|
331
|
-
stext = @block_alignment[:source_text]
|
332
|
-
ttext = @block_alignment[:target_text]
|
333
|
-
|
334
|
-
show = ''
|
335
|
-
@block_alignment[:blocks].each do |a|
|
336
|
-
show += case a[:alignment]
|
337
|
-
when :block
|
338
|
-
"===== common (block) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
339
|
-
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
|
340
|
-
when :term
|
341
|
-
"===== common (term) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
342
|
-
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
|
343
|
-
when :empty
|
344
|
-
"xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
|
345
|
-
"<<<<< string 1 [#{a[:source][:begin]} - #{a[:source][:end]}]\n" +
|
346
|
-
stext[a[:source][:begin] ... a[:source][:end]] + "\n\n" +
|
347
|
-
">>>>> string 2 [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
|
348
|
-
ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
|
349
|
-
else
|
350
|
-
astr1 = ''
|
351
|
-
astr2 = ''
|
352
|
-
|
353
|
-
base = a[:source][:begin]
|
354
|
-
astr1 = a[:alignment].sdiff.map do |c|
|
355
|
-
case c.action
|
356
|
-
when '='
|
357
|
-
stext[c.old_position + base]
|
358
|
-
when '+'
|
359
|
-
'_'
|
360
|
-
when '-'
|
361
|
-
stext[c.old_position + base]
|
362
|
-
when '!'
|
363
|
-
stext[c.old_position + base] + '_'
|
364
|
-
end
|
365
|
-
end.join('')
|
366
|
-
|
367
|
-
base = a[:target][:begin]
|
368
|
-
astr2 = a[:alignment].sdiff.map do |c|
|
369
|
-
case c.action
|
370
|
-
when '='
|
371
|
-
ttext[c.new_position + base]
|
372
|
-
when '+'
|
373
|
-
ttext[c.new_position + base]
|
374
|
-
when '-'
|
375
|
-
'_'
|
376
|
-
when '!'
|
377
|
-
'_' + ttext[c.new_position + base]
|
378
|
-
end
|
379
|
-
end.join('')
|
380
|
-
|
381
|
-
"***** local mismatch [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}] (similarity: #{a[:similarity]})\n" +
|
382
|
-
"[#{astr1}]\n" +
|
383
|
-
"[#{astr2}]\n\n"
|
384
|
-
end
|
385
|
-
end
|
386
|
-
show
|
387
|
-
end
|
388
|
-
|
389
477
|
end
|