text_alignment 0.7.2 → 0.10.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,187 @@
1
+ module TextAlignment; end unless defined? TextAlignment
2
+
3
+ TextAlignment::CHAR_MAPPING = [
4
+ ["©", "(c)"], #U+00A9 (Copyright Sign)
5
+
6
+ ["α", "alpha"], #U+03B1 (greek small letter alpha)
7
+ ["β", "beta"], #U+03B2 (greek small letter beta)
8
+ ["γ", "gamma"], #U+03B3 (greek small letter gamma)
9
+ ["δ", "delta"], #U+03B4 (greek small letter delta)
10
+ ["ε", "epsilon"], #U+03B5 (greek small letter epsilon)
11
+ ["ζ", "zeta"], #U+03B6 (greek small letter zeta)
12
+ ["η", "eta"], #U+03B7 (greek small letter eta)
13
+ ["θ", "theta"], #U+03B8 (greek small letter theta)
14
+ ["ι", "iota"], #U+03B9 (greek small letter iota)
15
+ ["κ", "kappa"], #U+03BA (greek small letter kappa)
16
+ ["λ", "lambda"], #U+03BB (greek small letter lambda)
17
+ ["λ", "lamda"], #U+03BB (greek small letter lambda)
18
+ ["μ", "mu"], #U+03BC (greek small letter mu)
19
+ ["ν", "nu"], #U+03BD (greek small letter nu)
20
+ ["ξ", "xi"], #U+03BE (greek small letter xi)
21
+ ["ο", "omicron"], #U+03BF (greek small letter omicron)
22
+ ["π", "pi"], #U+03C0 (greek small letter pi)
23
+ ["ρ", "rho"], #U+03C1 (greek small letter rho)
24
+ ["σ", "sigma"], #U+03C3 (greek small letter sigma)
25
+ ["τ", "tau"], #U+03C4 (greek small letter tau)
26
+ ["υ", "upsilon"], #U+03C5 (greek small letter upsilon)
27
+ ["φ", "phi"], #U+03C6 (greek small letter phi)
28
+ ["χ", "chi"], #U+03C7 (greek small letter chi)
29
+ ["ψ", "psi"], #U+03C8 (greek small letter psi)
30
+ ["ω", "omega"], #U+03C9 (greek small letter omega)
31
+
32
+ ["Α", "Alpha"], #U+0391 (greek capital letter alpha)
33
+ ["Β", "Beta"], #U+0392 (greek capital letter beta)
34
+ ["Γ", "Gamma"], #U+0393 (greek capital letter gamma)
35
+ ["Δ", "Delta"], #U+0394 (greek capital letter delta)
36
+ ["Ε", "Epsilon"], #U+0395 (greek capital letter epsilon)
37
+ ["Ζ", "Zeta"], #U+0396 (greek capital letter zeta)
38
+ ["Η", "Eta"], #U+0397 (greek capital letter eta)
39
+ ["Θ", "Theta"], #U+0398 (greek capital letter theta)
40
+ ["Ι", "Iota"], #U+0399 (greek capital letter iota)
41
+ ["Κ", "Kappa"], #U+039A (greek capital letter kappa)
42
+ ["Λ", "Lambda"], #U+039B (greek capital letter lambda)
43
+ ["Λ", "Lamda"], #U+039B (greek capital letter lambda)
44
+ ["Μ", "Mu"], #U+039C (greek capital letter mu)
45
+ ["Ν", "Nu"], #U+039D (greek capital letter nu)
46
+ ["Ξ", "Xi"], #U+039E (greek capital letter xi)
47
+ ["Ο", "Omicron"], #U+039F (greek capital letter omicron)
48
+ ["Π", "Pi"], #U+03A0 (greek capital letter pi)
49
+ ["Ρ", "Rho"], #U+03A1 (greek capital letter rho)
50
+ ["Σ", "Sigma"], #U+03A3 (greek capital letter sigma)
51
+ ["Τ", "Tau"], #U+03A4 (greek capital letter tau)
52
+ ["Υ", "Upsilon"], #U+03A5 (greek capital letter upsilon)
53
+ ["Φ", "Phi"], #U+03A6 (greek capital letter phi)
54
+ ["Χ", "Chi"], #U+03A7 (greek capital letter chi)
55
+ ["Ψ", "Psi"], #U+03A8 (greek capital letter Psi)
56
+ ["Ω", "Omega"], #U+03A9 (greek capital letter omega)
57
+
58
+ ["ϕ", "phi"], #U+03D5 (greek phi symbol)
59
+
60
+ ["×", "x"], #U+00D7 (multiplication sign)
61
+ ["•", "*"], #U+2022 (bullet)
62
+ [" ", " "], #U+2009 (thin space)
63
+ [" ", " "], #U+200A (hair space)
64
+ [" ", " "], #U+00A0 (Non-Breaking space)
65
+ [" ", " "], #U+3000 (ideographic space)
66
+ ["‐", "-"], #U+2010 (Hyphen)
67
+ ["‑", "-"], #U+2011 (Non-Breaking Hyphen)
68
+ ["−", "-"], #U+2212 (minus sign)
69
+ ["–", "-"], #U+2013 (en dash)
70
+ ["′", "'"], #U+2032 (prime)
71
+ ["‘", "'"], #U+2018 (left single quotation mark)
72
+ ["’", "'"], #U+2019 (right single quotation mark)
73
+ ["“", '"'], #U+201C (left double quotation mark)
74
+ ["”", '"'], #U+201D (right double quotation mark)
75
+ ['"', "''"]
76
+ ]
77
+
78
+
79
# Make sure the namespace exists even when this class is loaded on its own.
module TextAlignment; end unless defined? TextAlignment

# Normalizes a string for alignment and remembers how character offsets move.
#
# Three normalizations are applied, in this order:
# 1. one-to-one character replacements (mapping pairs whose second element
#    is a single character, e.g. a thin space becomes a plain space);
# 2. spelled-out names are folded into their single-character form (pairs
#    whose second element is longer than one character, e.g. "beta" -> "β");
# 3. every run of two or more whitespace characters becomes a single space.
#
# The normalized text is available as #str. #enmap_position and
# #demap_position translate character offsets between the input text and #str.
class TextAlignment::CharMapping
  # @return [String] the normalized text
  attr_reader :str

  # @param _str [String] the text to normalize; it is duplicated, not modified
  # @param char_mapping [Array<Array(String, String)>, nil] pairs of
  #   [single character, alternative spelling]; TextAlignment::CHAR_MAPPING
  #   is used when nil
  def initialize(_str, char_mapping = nil)
    char_mapping ||= TextAlignment::CHAR_MAPPING
    @str, offset_mapping = enmap_str(_str, char_mapping)
    @index_enmap = offset_mapping.to_h
    @index_demap = offset_mapping.map(&:reverse).to_h
  end

  # Translates an offset in the input text into an offset in #str.
  #
  # @param position [Integer]
  # @return [Integer, nil] nil when the offset falls strictly inside a span
  #   that was folded away (or is otherwise unknown)
  def enmap_position(position)
    @index_enmap[position]
  end

  # Translates an offset in #str back into an offset in the input text.
  #
  # @param position [Integer, nil]
  # @return [Integer, nil] nil when the offset is unknown
  def demap_position(position)
    @index_demap[position]
  end

  # Returns copies of the given denotations with their :span translated
  # into #str coordinates. The input hashes are left untouched.
  #
  # @param _denotations [Array<Hash>, nil] hashes carrying span: {begin:, end:}
  # @return [Array<Hash>, nil] nil when no denotations were given
  def enmap_denotations(_denotations)
    # Callers pass nil when a text has no annotations; there is nothing to map.
    return nil if _denotations.nil?

    _denotations.map do |d|
      d.merge(span: {begin: enmap_position(d[:span][:begin]), end: enmap_position(d[:span][:end])})
    end
  end

  private

  # Produces the normalized text together with the offset table.
  #
  # @return [Array(String, Array<Array(Integer, Integer)>)] the normalized
  #   text and [input offset, normalized offset] pairs
  def enmap_str(_str, char_mapping)
    work = _str.dup

    # One-to-one replacements never move offsets, so apply them directly.
    char_mapping.each do |one, long|
      work.gsub!(one, long) if long.length == 1
    end

    # Each fold is [location, length in the working text, replacement].
    folds = locate_spelled_out_names(work, char_mapping)
    folds.concat(locate_whitespace_runs(work))
    folds.sort_by!(&:first)

    # Assemble the output straight from the recorded folds. Re-scanning the
    # text for placeholder runs would also shrink runs that were present in
    # the input (e.g. a literal "ββββ") and break the offset table.
    pieces = []
    offset_mapping = []
    cursor = 0
    folds.each do |loc, len, substitute|
      # The fold's first offset is mapped too; its interior offsets are not.
      (cursor..loc).each { |i| offset_mapping << [i, offset_mapping.length] }
      pieces << work[cursor...loc] << substitute
      cursor = loc + len
    end
    # The upper bound is inclusive so that an end offset equal to the text
    # length can be translated.
    (cursor..work.length).each { |i| offset_mapping << [i, offset_mapping.length] }
    pieces << work[cursor..-1]

    [pieces.join, offset_mapping]
  end

  # Finds every spelled-out name in +work+ and overwrites it in place with a
  # same-length run of its single-character form, so that a shorter name
  # cannot match inside a span that has already been claimed.
  #
  # Longer names are searched first (table order breaks ties); otherwise
  # "eta" would claim the tail of "theta" or "Beta".
  def locate_spelled_out_names(work, char_mapping)
    folds = []
    ordered = char_mapping.each_with_index.sort_by { |(_one, long), idx| [-long.length, idx] }
    ordered.each do |(one, long), _idx|
      next unless long.length > 1

      cursor = 0
      while (loc = work.index(long, cursor))
        folds << [loc, long.length, one]
        work[loc, long.length] = one * long.length
        cursor = loc + long.length
      end
    end
    folds
  end

  # Finds every run of two or more whitespace characters in +work+.
  def locate_whitespace_runs(work)
    folds = []
    cursor = 0
    while (loc = work.index(/\s{2,}/, cursor))
      len = Regexp.last_match(0).length
      folds << [loc, len, ' ']
      cursor = loc + len
    end
    folds
  end
end
167
+
168
# Command-line entry point: this section runs only when the file is executed
# directly. It reads one JSON annotation file, normalizes its text and the
# denotation spans with TextAlignment::CharMapping and prints the result as JSON.
+ if __FILE__ == $0
169
+ require 'json'
170
+
171
# Exactly one argument (the annotation file) is required; otherwise a usage
# line is written via warn and the script exits.
+ unless ARGV.length == 1
172
+ warn "#{$0} an_annotation_json_file.json"
173
# NOTE(review): exit is called without a status argument.
+ exit
174
+ end
175
+ annotations = JSON.parse File.read(ARGV[0]).strip, symbolize_names: true
176
+ denotations = annotations[:denotations]
177
# Fall back to the denotations of the first track when there are none at the top level.
+ if denotations.nil? && annotations[:tracks]
178
+ denotations = annotations[:tracks].first[:denotations]
179
+ end
180
+
181
+ str_mapping = TextAlignment::CharMapping.new(annotations[:text])
182
+ str_mapped = str_mapping.str
183
# NOTE(review): denotations can still be nil here when the file has neither
# :denotations nor :tracks - confirm enmap_denotations accepts nil.
+ denotations_mapped = str_mapping.enmap_denotations(denotations)
184
+ new_annotations = {text:str_mapped, denotations:denotations_mapped}
185
+
186
+ puts new_annotations.to_json
187
+ end
@@ -1,7 +1,7 @@
1
1
  module TextAlignment; end unless defined? TextAlignment
2
2
 
3
3
  TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
4
- TextAlignment::SIZE_WINDOW = 60 unless defined? TextAlignment::SIZE_WINDOW
4
+ TextAlignment::SIZE_WINDOW = 10 unless defined? TextAlignment::SIZE_WINDOW
5
5
  TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
6
6
  TextAlignment::BUFFER_MIN = 20 unless defined? TextAlignment::BUFFER_MIN
7
7
  TextAlignment::TEXT_SIMILARITY_THRESHOLD = 0.9 unless defined? TextAlignment::TEXT_SIMILARITY_THRESHOLD
@@ -0,0 +1,19 @@
1
+ module TextAlignment; end unless defined? TextAlignment
2
+
3
# Remembers regions that have been registered ("cultivated") and, for any
# position inside such a region, the end of that region.
+ class TextAlignment::CultivationMap
4
# Hash from a position inside a registered region to that region's end.
+ attr_reader :map
5
+
6
# Starts with no registered regions.
+ def initialize
7
+ @map = {}
8
+ end
9
+
10
# Registers regions given as [begin, end] pairs. Every position from begin
# up to, but not including, end is mapped to end; a later region overwrites
# the entries of an earlier one where they overlap.
+ def cultivate(regions)
11
+ regions.each do |b, e|
12
+ (b ... e).each{|p| @map[p] = e}
13
+ end
14
+ end
15
+
16
# Returns the end of the registered region containing position, or nil when
# the position is not inside any registered region.
+ def search_again_position(position)
17
+ @map[position]
18
+ end
19
+ end
@@ -5,7 +5,7 @@ require 'text_alignment/find_divisions'
5
5
  require 'text_alignment/lcs_comparison'
6
6
  require 'text_alignment/lcs_alignment'
7
7
  require 'text_alignment/glcs_alignment'
8
- require 'text_alignment/mappings'
8
+ require 'text_alignment/char_mapping'
9
9
 
10
10
  module TextAlignment; end unless defined? TextAlignment
11
11
 
@@ -106,7 +106,7 @@ if __FILE__ == $0
106
106
 
107
107
  dictionary = [["β", "beta"]]
108
108
  # align = TextAlignment::TextAlignment.new(str1, str2)
109
- align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::MAPPINGS)
109
+ align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::CHAR_MAPPING)
110
110
  p align.common_elements
111
111
  p align.mapped_elements
112
112
  end
@@ -6,7 +6,7 @@ require 'text_alignment/lcs_comparison'
6
6
  require 'text_alignment/lcs_alignment'
7
7
  require 'text_alignment/lcs_cdiff'
8
8
  require 'text_alignment/glcs_alignment'
9
- require 'text_alignment/mappings'
9
+ require 'text_alignment/char_mapping'
10
10
 
11
11
  module TextAlignment; end unless defined? TextAlignment
12
12
 
@@ -17,10 +17,12 @@ class TextAlignment::MixedAlignment
17
17
  attr_reader :similarity
18
18
  attr_reader :str1_match_initial, :str1_match_final, :str2_match_initial, :str2_match_final
19
19
 
20
- def initialize(_str1, _str2)
20
# Computes a mixed alignment between two strings.
#
# @param _str1 [String] first string; it is duplicated, not modified here
# @param _str2 [String] second string; it is duplicated, not modified here
# @param _mappings [Array<Array(String, String)>, nil] character mappings
#   handed to the alignment; TextAlignment::CHAR_MAPPING is used when nil
# @raise [ArgumentError] when either string is nil
def initialize(_str1, _str2, _mappings = nil)
  raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?

  # Use the caller's mappings when given; only fall back to the default
  # table when none were passed.
  mappings = _mappings || TextAlignment::CHAR_MAPPING
  str1 = _str1.dup
  str2 = _str2.dup

  _compute_mixed_alignment(str1, str2, mappings)
end
@@ -139,72 +141,14 @@ class TextAlignment::MixedAlignment
139
141
  @position_map_end = posmap_end.sort.to_h
140
142
  end
141
143
 
142
- private
143
-
144
- def string_preprocessing(_str1, _str2)
145
- str1 = _str1.dup
146
- str2 = _str2.dup
147
- mappings = TextAlignment::MAPPINGS.dup
148
-
149
- ## single character mappings
150
- character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
151
- characters_from = character_mappings.collect{|m| m[0]}.join
152
- characters_to = character_mappings.collect{|m| m[1]}.join
153
- characters_to.gsub!(/-/, '\-')
154
-
155
- str1.tr!(characters_from, characters_to)
156
- str2.tr!(characters_from, characters_to)
157
-
158
- mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
159
-
160
- ## long to one character mappings
161
- pletters = TextAlignment::PADDING_LETTERS
162
-
163
- # find the padding letter for str1
164
- @padding_letter1 = begin
165
- i = pletters.index{|l| str2.index(l).nil?}
166
- raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
167
- TextAlignment::PADDING_LETTERS[i]
168
- end
169
-
170
- # find the padding letter for str2
171
- @padding_letter2 = begin
172
- i = pletters.index{|l| l != @padding_letter1 && str1.index(l).nil?}
173
- raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
174
- TextAlignment::PADDING_LETTERS[i]
175
- end
176
-
177
- # ASCII foldings
178
- ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
179
- ascii_foldings.each do |f|
180
- from = f[1]
181
-
182
- if str2.index(f[0])
183
- to = f[0] + (@padding_letter1 * (f[1].length - 1))
184
- str1.gsub!(from, to)
185
- end
186
-
187
- if str1.index(f[0])
188
- to = f[0] + (@padding_letter2 * (f[1].length - 1))
189
- str2.gsub!(from, to)
190
- end
191
- end
192
- mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
193
-
194
- [str1, str2, mappings]
195
- end
196
-
197
- def compute_similarity(_s1, _s2, sdiff)
144
# Returns the number of matched non-whitespace characters in sdiff divided by
# the smaller non-whitespace character count of s1 and s2, as a Float.
# Returns 0 when sdiff is nil or when nothing but whitespace matched.
+ def compute_similarity(s1, s2, sdiff)
198
145
  return 0 if sdiff.nil?
199
146
 
200
147
  # compute the lcs only with non-whitespace letters
201
148
  lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
202
149
  return 0 if lcs == 0
203
150
 
204
- s1 = _s1.tr(@padding_letter1, ' ')
205
- s2 = _s2.tr(@padding_letter2, ' ')
206
-
207
- similarity = lcs / [s1.scan(/\S/).count, s2.scan(/\S/).count].min.to_f
151
# The value of this assignment is the method's return value.
+ similarity = lcs.to_f / [s1.scan(/\S/).count, s2.scan(/\S/).count].min
208
152
  end
209
153
 
210
154
  end
@@ -2,39 +2,214 @@
2
2
  require 'text_alignment/constants'
3
3
  require 'text_alignment/anchor_finder'
4
4
  require 'text_alignment/mixed_alignment'
5
+ require 'text_alignment/cultivation_map'
5
6
 
6
7
  module TextAlignment; end unless defined? TextAlignment
7
8
 
8
- TextAlignment::PADDING_LETTERS = ['@', '^', '|', '#', '$', '%', '&', '_'] unless defined? TextAlignment::PADDING_LETTERS
9
-
10
9
  class TextAlignment::TextAlignment
11
10
  attr_reader :block_alignment
12
11
  attr_reader :similarity
13
12
  attr_reader :lost_annotations
13
+ attr_reader :cultivation_map
14
14
 
15
- def initialize(str1, str2, denotations = nil, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
16
- raise ArgumentError, "nil string" if str1.nil? || str2.nil?
15
# Aligns _str1 (source) against _str2 (target).
#
# Both texts are first normalized with TextAlignment::CharMapping. The block
# alignment is computed on the normalized strings and stored under
# @block_alignment[:blocks]. Target regions covered by :block or :term
# blocks are then registered in the cultivation map.
#
# @param _str1 [String] source text
# @param _str2 [String] target text
# @param _denotations [Array<Hash>, nil] optional denotations on the source
#   text, each carrying span: {begin:, end:}
# @param _cultivation_map [TextAlignment::CultivationMap, nil] map to
#   consult and update; a fresh one is created when nil
# @raise [ArgumentError] when either string is nil
def initialize(_str1, _str2, _denotations = nil, _cultivation_map = nil)
  raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?

  @block_alignment = {source_text: _str1, target_text: _str2, denotations: _denotations}
  @original_str1 = _str1
  @original_str2 = _str2

  @str1_mapping = TextAlignment::CharMapping.new(_str1)
  @str2_mapping = TextAlignment::CharMapping.new(_str2)

  str1 = @str1_mapping.str
  str2 = @str2_mapping.str

  # Denotations are optional (the parameter defaults to nil); only translate
  # spans when some were actually given.
  denotations = _denotations.nil? ? nil : @str1_mapping.enmap_denotations(_denotations)

  @cultivation_map = _cultivation_map || TextAlignment::CultivationMap.new

  # Prefer a whole-block match; fall back to anchor-based block finding.
  @block_alignment[:blocks] =
    whole_block_alignment(str1, str2, @cultivation_map) ||
    find_block_alignment(str1, str2, denotations, @cultivation_map)

  matched_blocks = @block_alignment[:blocks].select do |b|
    b[:alignment] == :block || b[:alignment] == :term
  end
  newly_cultivated_regions = matched_blocks.map { |b| [b[:target][:begin], b[:target][:end]] }

  # Merge a region into the previous one unless more than one position
  # separates them.
  condensed_regions = newly_cultivated_regions.each_with_object([]) do |region, merged|
    if merged.empty? || (merged.last.last + 1 < region.first)
      merged.push region
    else
      merged.last[1] = region.last
    end
  end

  @cultivation_map.cultivate(condensed_regions)
end
57
+
58
# Translates a begin offset in the original source text into the matching
# begin offset in the original target text.
#
# The offset is first moved into normalized-source coordinates, then carried
# across the block that contains it, and finally moved back from
# normalized-target coordinates with @str2_mapping.
#
# @param _begin_position [Integer] offset in the original source text
# @return [Integer, nil] offset in the original target text, or nil when the
#   position has no counterpart
def transform_begin_position(_begin_position)
  begin_position = @str1_mapping.enmap_position(_begin_position)
  return nil if begin_position.nil?

  # The first block that ends after the position is the one containing it.
  block = @block_alignment[:blocks].find { |b| b[:source][:end] > begin_position }
  return nil if block.nil?

  b =
    case block[:alignment]
    when :block, :term
      begin_position + block[:delta]
    when :empty
      # An :empty block may be created without a :target; such a block has
      # nothing to map onto.
      block[:target][:begin] if block[:target] && begin_position == block[:source][:begin]
    else
      # Any other value is an alignment object that works with offsets
      # relative to the start of its block.
      r = block[:alignment].transform_begin_position(begin_position - block[:source][:begin])
      r.nil? ? nil : r + block[:target][:begin]
    end

  @str2_mapping.demap_position(b)
end
79
+
80
# Translates an end offset in the original source text into the matching
# end offset in the original target text.
#
# The offset is first moved into normalized-source coordinates, then carried
# across the block that contains it, and finally moved back from
# normalized-target coordinates with @str2_mapping.
#
# @param _end_position [Integer] offset in the original source text
# @return [Integer, nil] offset in the original target text, or nil when the
#   position has no counterpart
def transform_end_position(_end_position)
  end_position = @str1_mapping.enmap_position(_end_position)
  return nil if end_position.nil?

  # An end offset belongs to the first block that ends at or after it.
  block = @block_alignment[:blocks].find { |b| b[:source][:end] >= end_position }
  return nil if block.nil?

  e =
    case block[:alignment]
    when :block, :term
      end_position + block[:delta]
    when :empty
      # An :empty block may be created without a :target; such a block has
      # nothing to map onto.
      block[:target][:end] if block[:target] && end_position == block[:source][:end]
    else
      # Any other value is an alignment object that works with offsets
      # relative to the start of its block.
      r = block[:alignment].transform_end_position(end_position - block[:source][:begin])
      r.nil? ? nil : r + block[:target][:begin]
    end

  @str2_mapping.demap_position(e)
end
101
+
102
# Returns a new {begin:, end:} hash whose two ends are the results of
# transform_begin_position and transform_end_position for the given span.
# The two ends are transformed independently of each other.
+ def transform_a_span(span)
103
+ {begin: transform_begin_position(span[:begin]), end: transform_end_position(span[:end])}
104
+ end
35
105
 
106
# Applies transform_a_span to every span and returns the results as a new array.
+ def transform_spans(spans)
107
+ spans.map{|span| transform_a_span(span)}
108
+ end
109
+
110
# Transforms the begin/end of every denotation in place (each element must
# respond to begin, end, begin= and end=).
# Returns nil when denotations is nil; otherwise returns @lost_annotations,
# which is reset on every call and lists the denotations that failed.
+ def transform_denotations!(denotations)
111
+ return nil if denotations.nil?
112
+ @lost_annotations = []
113
+
114
+ denotations.each do |d|
115
# Remember the untransformed span for the lost-annotation report.
+ source = {begin:d.begin, end:d.end}
116
+ d.begin = transform_begin_position(d.begin);
117
+ d.end = transform_end_position(d.end);
118
# A result is valid only if both ends are present, non-empty, in order and
# within the original target text.
+ raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_str2.length
119
# This rescue has no exception class, so it also handles errors raised by
# the two transform calls above, not only the explicit raise.
+ rescue
120
# The reported target holds whatever had been assigned to d before the failure.
+ @lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
121
+ d.begin = nil
122
+ d.end = nil
123
+ end
124
+
125
+ @lost_annotations
126
+ end
127
+
128
# Hash-based counterpart of transform_denotations!: returns a new array of
# copies of the given hashes with :span replaced by the transformed span.
# Denotations that cannot be transformed are left out of the result and
# recorded in @lost_annotations, which is reset on every call.
# Returns nil when hdenotations is nil.
+ def transform_hdenotations(hdenotations)
129
+ return nil if hdenotations.nil?
130
+ @lost_annotations = []
131
+
132
+ r = hdenotations.collect do |d|
133
+ t = transform_a_span(d[:span])
134
# A result is valid only if both ends are present, non-empty, in order and
# within the original target text.
+ raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_str2.length
135
+ new_d = d.dup.merge({span:t})
136
# This rescue has no exception class, so it also handles errors raised inside
# transform_a_span; in that case t has not been assigned and is nil.
+ rescue
137
+ @lost_annotations << {source: d[:span], target:t}
138
# The nil is removed from the result by compact below.
+ nil
139
+ end.compact
140
+
141
+ r
142
+ end
143
+
144
# Builds and returns a human-readable, multi-line String describing every
# block in @block_alignment[:blocks].
# NOTE(review): stext/ttext are the texts stored under :source_text and
# :target_text, while the block offsets are used to slice them directly.
# If the offsets refer to normalized strings, the slices can be shifted
# wherever normalization changed the text length - confirm.
+ def alignment_show
145
+ stext = @block_alignment[:source_text]
146
+ ttext = @block_alignment[:target_text]
147
+
148
+ show = ''
149
+ @block_alignment[:blocks].each do |a|
150
# One section is appended per block, chosen by the block's :alignment value.
+ show += case a[:alignment]
151
+ when :block
152
+ "===== common (block) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
153
+ stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
154
+ when :term
155
+ "===== common (term) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
156
+ stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
157
+ when :empty
158
+ "xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
159
+ "<<<<< string 1 [#{a[:source][:begin]} - #{a[:source][:end]}]\n" +
160
+ stext[a[:source][:begin] ... a[:source][:end]] + "\n\n" +
161
+ ">>>>> string 2 " +
162
# A block without a :target is shown as "[-]".
+ if a[:target]
163
+ "[#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
164
+ ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
165
+ else
166
+ "[-]\n\n"
167
+ end
168
# Any other :alignment value is treated as an object that responds to sdiff.
+ else
169
+ astr1 = ''
170
+ astr2 = ''
171
+
172
+ base = a[:source][:begin]
173
# Source row: '_' is written where the sdiff entry has no source character.
+ astr1 = a[:alignment].sdiff.map do |c|
174
+ case c.action
175
+ when '='
176
+ stext[c.old_position + base]
177
+ when '+'
178
+ '_'
179
+ when '-'
180
+ stext[c.old_position + base]
181
+ when '!'
182
+ stext[c.old_position + base] + '_'
183
+ end
184
+ end.join('')
185
+
186
+ base = a[:target][:begin]
187
# Target row: '_' is written where the sdiff entry has no target character.
+ astr2 = a[:alignment].sdiff.map do |c|
188
+ case c.action
189
+ when '='
190
+ ttext[c.new_position + base]
191
+ when '+'
192
+ ttext[c.new_position + base]
193
+ when '-'
194
+ '_'
195
+ when '!'
196
+ '_' + ttext[c.new_position + base]
197
+ end
198
+ end.join('')
199
+
200
+ "***** local mismatch [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}] (similarity: #{a[:similarity]})\n" +
201
+ "[#{astr1}]\n" +
202
+ "[#{astr2}]\n\n"
203
+ end
204
+ end
205
+ show
206
+ end
207
+
208
+ private
209
+
210
+ def find_block_alignment(str1, str2, denotations, cultivation_map)
36
211
  ## to find block alignments
37
- anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
212
+ anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, cultivation_map)
38
213
 
39
214
  blocks = []
40
215
  while block = anchor_finder.get_next_anchor
@@ -77,12 +252,13 @@ class TextAlignment::TextAlignment
77
252
 
78
253
  if b2 == e2
79
254
  [
80
- {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
255
+ {source:{begin:b1, end:e1}, alignment: :empty},
81
256
  block
82
257
  ]
83
258
  else
259
+ len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
260
+
84
261
  if b1 == 0 && b2 == 0
85
- len_buffer = (e1 * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
86
262
  b2 = e2 - len_buffer if e2 > len_buffer
87
263
  end
88
264
 
@@ -94,6 +270,10 @@ class TextAlignment::TextAlignment
94
270
  {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
95
271
  block
96
272
  ]
273
+ elsif ((e2 - b2) - (e1 - b1)) > len_buffer
274
# The target gap is much longer than the source gap: try a local alignment
# against a window at each end of the target gap and keep the candidate
# whose first block has the higher similarity.
+ la_block1 = local_alignment_blocks(str1, b1, e1, str2, b2, b2 + len_buffer, denotations)
275
+ la_block2 = local_alignment_blocks(str1, b1, e1, str2, e2 - len_buffer, e2, denotations)
276
+ [la_block1, la_block2].max{|a, b| a.first[:similarity] <=> b.first[:similarity]} << block
97
277
  else
98
278
  local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations) << block
99
279
  end
@@ -111,21 +291,58 @@ class TextAlignment::TextAlignment
111
291
  b1 = last_block[:source][:end]
112
292
  if b1 < str1.length
113
293
  e1 = str1.length
114
-
115
294
  b2 = last_block[:target][:end]
116
- if b2 < str2.length
117
- len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
118
- e2 = (str2.length - b2) > len_buffer ? b2 + len_buffer : str2.length
119
- local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations)
295
+
296
+ _str1 = str1[b1 ... e1]
297
+ if _str1.strip.empty?
298
+ [{source:{begin:b1, end:e1}, alignment: :empty}]
120
299
  else
121
- [{source:{begin:last_block[:source][:end], end:str1.length}, alignment: :empty}]
300
+ if b2 < str2.length
301
+ len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
302
+ e2 = (str2.length - b2) > len_buffer ? b2 + len_buffer : str2.length
303
+
304
+ local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations)
305
+ else
306
+ [{source:{begin:b1, end:e1}, alignment: :empty}]
307
+ end
122
308
  end
123
309
  else
124
310
  []
125
311
  end
126
312
  end
313
+ end
314
+
315
# Tries to place the whole of str1 inside str2 as a single block: first by
# an exact match, then by a case-insensitive match. Occurrences that start
# inside a region known to the cultivation map are skipped.
#
# NOTE(review): the case-insensitive pass assumes downcase keeps the string
# lengths unchanged; some Unicode characters change length when downcased.
#
# @param str1 [String] source text
# @param str2 [String] target text
# @param cultivation_map [#search_again_position] returns the position to
#   resume searching from for a cultivated position, nil otherwise
# @return [Array<Hash>, nil] a one-element block list, or nil when str1
#   does not occur in an uncultivated part of str2
def whole_block_alignment(str1, str2, cultivation_map)
  block_begin = first_uncultivated_index(str2, str1, cultivation_map) ||
                first_uncultivated_index(str2.downcase, str1.downcase, cultivation_map)
  return nil if block_begin.nil?

  [{source: {begin: 0, end: str1.length}, target: {begin: block_begin, end: block_begin + str1.length}, delta: block_begin, alignment: :block}]
end

# Returns the index of the first occurrence of needle in haystack that does
# not start inside a cultivated region, or nil when there is none.
def first_uncultivated_index(haystack, needle, cultivation_map)
  search_position = 0
  while (found = haystack.index(needle, search_position))
    resume_at = cultivation_map.search_again_position(found)
    return found if resume_at.nil?

    # The hit lies in a cultivated region: search again from where the map
    # says to resume. Running out of hits must yield nil, never this hit.
    search_position = resume_at
  end
  nil
end
130
347
 
131
348
  def local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations = nil)
@@ -138,7 +355,7 @@ class TextAlignment::TextAlignment
138
355
  map{|d| d.merge(lex:str1[d[:span][:begin] ... d[:span][:end]])}
139
356
 
140
357
  position = 0
141
- tblocks = ds_in_scope.map do |term|
358
+ _tblocks = ds_in_scope.map do |term|
142
359
  lex = term[:lex]
143
360
  r = block2.index(lex, position)
144
361
  if r.nil?
@@ -146,11 +363,11 @@ class TextAlignment::TextAlignment
146
363
  break
147
364
  end
148
365
  position = r + lex.length
149
- {source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, delta: r - term[:span][:begin]}
366
+ {source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, similarity: 0.9, delta: r + b2 - term[:span][:begin]}
150
367
  end
151
368
 
152
369
  # missing term found
153
- tblocks = [] if position.nil?
370
+ _tblocks = [] if position.nil?
154
371
 
155
372
  # redundant matching found
156
373
  unless position.nil?
@@ -158,19 +375,20 @@ class TextAlignment::TextAlignment
158
375
  lex = term[:lex]
159
376
  look_forward = block2.index(lex, position)
160
377
  unless look_forward.nil?
161
- puts lex
162
- tblocks = []
378
+ _tblocks = []
163
379
  break
164
380
  end
165
381
  end
166
382
  end
167
383
 
168
- tblocks
384
+ _tblocks
385
+ else
386
+ []
169
387
  end
170
388
 
171
389
  if tblocks.empty?
172
390
  if b1 == 0 && e1 == str1.length
173
- if (e1 > 1000) || (e2 > 1000)
391
+ if (e1 > 2000) || (e2 > 2000)
174
392
  [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
175
393
  else
176
394
  block1 = str1[b1 ... e1]
@@ -237,153 +455,4 @@ class TextAlignment::TextAlignment
237
455
  end
238
456
  end
239
457
 
240
-
241
- def indices(str, target)
242
- position = 0
243
- len = target.len
244
- Enumerator.new do |yielder|
245
- while idx = str.index(target, position)
246
- yielder << idx
247
- position = idx + len
248
- end
249
- end
250
- end
251
-
252
- def transform_begin_position(begin_position)
253
- i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
254
- block = @block_alignment[:blocks][i]
255
-
256
- b = if block[:alignment] == :block || block[:alignment] == :term
257
- begin_position + block[:delta]
258
- elsif block[:alignment] == :empty
259
- if begin_position == block[:source][:begin]
260
- block[:target][:begin]
261
- else
262
- nil
263
- end
264
- else
265
- r = block[:alignment].transform_begin_position(begin_position - block[:source][:begin])
266
- r.nil? ? nil : r + block[:target][:begin]
267
- end
268
- end
269
-
270
- def transform_end_position(end_position)
271
- i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
272
- block = @block_alignment[:blocks][i]
273
-
274
- e = if block[:alignment] == :block || block[:alignment] == :term
275
- end_position + block[:delta]
276
- elsif block[:alignment] == :empty
277
- if end_position == block[:source][:end]
278
- block[:target][:end]
279
- else
280
- nil
281
- end
282
- else
283
- r = block[:alignment].transform_end_position(end_position - block[:source][:begin])
284
- r.nil? ? nil : r + block[:target][:begin]
285
- end
286
- end
287
-
288
- def transform_a_span(span)
289
- {begin: transform_begin_position(span[:begin]), end: transform_end_position(span[:end])}
290
- end
291
-
292
- def transform_spans(spans)
293
- spans.map{|span| transform_a_span(span)}
294
- end
295
-
296
- def transform_denotations!(denotations)
297
- return nil if denotations.nil?
298
- @lost_annotations = []
299
-
300
- denotations.each do |d|
301
- source = {begin:d.begin, end:d.end}
302
- d.begin = transform_begin_position(d.begin);
303
- d.end = transform_end_position(d.end);
304
- raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @str2.length
305
- rescue
306
- @lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
307
- d.begin = nil
308
- d.end = nil
309
- end
310
-
311
- @lost_annotations
312
- end
313
-
314
- def transform_hdenotations(hdenotations)
315
- return nil if hdenotations.nil?
316
- @lost_annotations = []
317
-
318
- r = hdenotations.collect do |d|
319
- t = transform_a_span(d[:span])
320
- raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @str2.length
321
- new_d = d.dup.merge({span:t})
322
- rescue
323
- @lost_annotations << {source: d[:span], target:t}
324
- nil
325
- end.compact
326
-
327
- r
328
- end
329
-
330
- def alignment_show
331
- stext = @block_alignment[:source_text]
332
- ttext = @block_alignment[:target_text]
333
-
334
- show = ''
335
- @block_alignment[:blocks].each do |a|
336
- show += case a[:alignment]
337
- when :block
338
- "===== common (block) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
339
- stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
340
- when :term
341
- "===== common (term) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
342
- stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
343
- when :empty
344
- "xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
345
- "<<<<< string 1 [#{a[:source][:begin]} - #{a[:source][:end]}]\n" +
346
- stext[a[:source][:begin] ... a[:source][:end]] + "\n\n" +
347
- ">>>>> string 2 [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
348
- ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
349
- else
350
- astr1 = ''
351
- astr2 = ''
352
-
353
- base = a[:source][:begin]
354
- astr1 = a[:alignment].sdiff.map do |c|
355
- case c.action
356
- when '='
357
- stext[c.old_position + base]
358
- when '+'
359
- '_'
360
- when '-'
361
- stext[c.old_position + base]
362
- when '!'
363
- stext[c.old_position + base] + '_'
364
- end
365
- end.join('')
366
-
367
- base = a[:target][:begin]
368
- astr2 = a[:alignment].sdiff.map do |c|
369
- case c.action
370
- when '='
371
- ttext[c.new_position + base]
372
- when '+'
373
- ttext[c.new_position + base]
374
- when '-'
375
- '_'
376
- when '!'
377
- '_' + ttext[c.new_position + base]
378
- end
379
- end.join('')
380
-
381
- "***** local mismatch [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}] (similarity: #{a[:similarity]})\n" +
382
- "[#{astr1}]\n" +
383
- "[#{astr2}]\n\n"
384
- end
385
- end
386
- show
387
- end
388
-
389
458
  end