text_alignment 0.7.2 → 0.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,187 @@
1
+ module TextAlignment; end unless defined? TextAlignment
2
+
3
+ TextAlignment::CHAR_MAPPING = [
4
+ ["©", "(c)"], #U+00A9 (Copyright Sign)
5
+
6
+ ["α", "alpha"], #U+03B1 (greek small letter alpha)
7
+ ["β", "beta"], #U+03B2 (greek small letter beta)
8
+ ["γ", "gamma"], #U+03B3 (greek small letter gamma)
9
+ ["δ", "delta"], #U+03B4 (greek small letter delta)
10
+ ["ε", "epsilon"], #U+03B5 (greek small letter epsilon)
11
+ ["ζ", "zeta"], #U+03B6 (greek small letter zeta)
12
+ ["η", "eta"], #U+03B7 (greek small letter eta)
13
+ ["θ", "theta"], #U+03B8 (greek small letter theta)
14
+ ["ι", "iota"], #U+03B9 (greek small letter iota)
15
+ ["κ", "kappa"], #U+03BA (greek small letter kappa)
16
+ ["λ", "lambda"], #U+03BB (greek small letter lambda)
17
+ ["λ", "lamda"], #U+03BB (greek small letter lambda)
18
+ ["μ", "mu"], #U+03BC (greek small letter mu)
19
+ ["ν", "nu"], #U+03BD (greek small letter nu)
20
+ ["ξ", "xi"], #U+03BE (greek small letter xi)
21
+ ["ο", "omicron"], #U+03BF (greek small letter omicron)
22
+ ["π", "pi"], #U+03C0 (greek small letter pi)
23
+ ["ρ", "rho"], #U+03C1 (greek small letter rho)
24
+ ["σ", "sigma"], #U+03C3 (greek small letter sigma)
25
+ ["τ", "tau"], #U+03C4 (greek small letter tau)
26
+ ["υ", "upsilon"], #U+03C5 (greek small letter upsilon)
27
+ ["φ", "phi"], #U+03C6 (greek small letter phi)
28
+ ["χ", "chi"], #U+03C7 (greek small letter chi)
29
+ ["ψ", "psi"], #U+03C8 (greek small letter psi)
30
+ ["ω", "omega"], #U+03C9 (greek small letter omega)
31
+
32
+ ["Α", "Alpha"], #U+0391 (greek capital letter alpha)
33
+ ["Β", "Beta"], #U+0392 (greek capital letter beta)
34
+ ["Γ", "Gamma"], #U+0393 (greek capital letter gamma)
35
+ ["Δ", "Delta"], #U+0394 (greek capital letter delta)
36
+ ["Ε", "Epsilon"], #U+0395 (greek capital letter epsilon)
37
+ ["Ζ", "Zeta"], #U+0396 (greek capital letter zeta)
38
+ ["Η", "Eta"], #U+0397 (greek capital letter eta)
39
+ ["Θ", "Theta"], #U+0398 (greek capital letter theta)
40
+ ["Ι", "Iota"], #U+0399 (greek capital letter iota)
41
+ ["Κ", "Kappa"], #U+039A (greek capital letter kappa)
42
+ ["Λ", "Lambda"], #U+039B (greek capital letter lambda)
43
+ ["Λ", "Lamda"], #U+039B (greek capital letter lambda)
44
+ ["Μ", "Mu"], #U+039C (greek capital letter mu)
45
+ ["Ν", "Nu"], #U+039D (greek capital letter nu)
46
+ ["Ξ", "Xi"], #U+039E (greek capital letter xi)
47
+ ["Ο", "Omicron"], #U+039F (greek capital letter omicron)
48
+ ["Π", "Pi"], #U+03A0 (greek capital letter pi)
49
+ ["Ρ", "Rho"], #U+03A1 (greek capital letter rho)
50
+ ["Σ", "Sigma"], #U+03A3 (greek capital letter sigma)
51
+ ["Τ", "Tau"], #U+03A4 (greek capital letter tau)
52
+ ["Υ", "Upsilon"], #U+03A5 (greek capital letter upsilon)
53
+ ["Φ", "Phi"], #U+03A6 (greek capital letter phi)
54
+ ["Χ", "Chi"], #U+03A7 (greek capital letter chi)
55
+ ["Ψ", "Psi"], #U+03A8 (greek capital letter Psi)
56
+ ["Ω", "Omega"], #U+03A9 (greek capital letter omega)
57
+
58
+ ["ϕ", "phi"], #U+03D5 (greek phi symbol)
59
+
60
+ ["×", "x"], #U+00D7 (multiplication sign)
61
+ ["•", "*"], #U+2022 (bullet)
62
+ [" ", " "], #U+2009 (thin space)
63
+ [" ", " "], #U+200A (hair space)
64
+ [" ", " "], #U+00A0 (Non-Breaking space)
65
+ [" ", " "], #U+3000 (ideographic space)
66
+ ["‐", "-"], #U+2010 (Hyphen)
67
+ ["‑", "-"], #U+2011 (Non-Breaking Hyphen)
68
+ ["−", "-"], #U+2212 (minus sign)
69
+ ["–", "-"], #U+2013 (en dash)
70
+ ["′", "'"], #U+2032 (prime)
71
+ ["‘", "'"], #U+2018 (left single quotation mark)
72
+ ["’", "'"], #U+2019 (right single quotation mark)
73
+ ["“", '"'], #U+201C (left double quotation mark)
74
+ ["”", '"'], #U+201D (right double quotation mark)
75
+ ['"', "''"]
76
+ ]
77
+
78
+
79
# Namespace guard (idempotent), so this definition also loads on its own.
module TextAlignment; end unless defined? TextAlignment

# Normalizes a text by folding the spellings listed in a character mapping
# (e.g. "alpha" -> "α") and collapsing runs of whitespace into one space,
# while recording how offsets of the given text correspond to offsets of the
# normalized text.
class TextAlignment::CharMapping
  # The normalized text.
  attr_reader :str

  # _str         - the text to normalize; it is duplicated, not modified.
  # char_mapping - array of [one, long] string pairs, where `one` is the single
  #                character that replaces the spelling `long`. Defaults to
  #                TextAlignment::CHAR_MAPPING when nil.
  def initialize(_str, char_mapping = nil)
    char_mapping ||= TextAlignment::CHAR_MAPPING
    @str, offset_mapping = enmap_str(_str, char_mapping)
    @index_enmap = offset_mapping.to_h
    @index_demap = offset_mapping.map(&:reverse).to_h
  end

  # Returns the offset in the normalized text for an offset of the given text,
  # or nil when the offset lies strictly inside a span that was folded.
  def enmap_position(position)
    @index_enmap[position]
  end

  # Returns the offset in the given text for an offset of the normalized text,
  # or nil when the offset is unknown.
  def demap_position(position)
    @index_demap[position]
  end

  # Returns copies of the denotations (hashes with a :span of :begin/:end)
  # whose spans are re-based onto the normalized text.
  # Returns nil when no denotations are given, instead of raising.
  def enmap_denotations(_denotations)
    return nil if _denotations.nil?

    _denotations.map do |d|
      d.merge(span: {begin: enmap_position(d[:span][:begin]), end: enmap_position(d[:span][:end])})
    end
  end

  private

  # Returns [normalized_str, offset_mapping], where offset_mapping is an array
  # of [offset_in_given_text, offset_in_normalized_text] pairs.
  def enmap_str(_str, char_mapping)
    str = _str.dup

    # One-to-one character replacements do not move any offset.
    char_mapping.each do |one, long|
      str.gsub!(one, long) if long.length == 1
    end

    # Collect [location, length] of every span that will shrink to one character.
    loc_len = []
    char_mapping.each do |one, long|
      # An empty spelling would match everywhere and never advance the search.
      next if long.length < 2

      init_next = 0
      while (loc = str.index(long, init_next))
        loc_len << [loc, long.length]
        init_next = loc + long.length
      end

      # Overwrite each occurrence with a same-length placeholder, so that a
      # later spelling embedded in this one (e.g. "eta" in "beta") is not
      # matched again and the string length stays unchanged for now.
      str.gsub!(long, one * long.length)
    end

    # Runs of two or more whitespace characters also shrink to one character.
    init_next = 0
    while (loc = str.index(/\s{2,}/, init_next))
      len = Regexp.last_match(0).length
      loc_len << [loc, len]
      init_next = loc + len
    end

    loc_len.sort_by!(&:first)

    # Walk the text: offsets up to and including the start of a shrinking span
    # advance one by one; offsets strictly inside the span get no entry.
    offset_mapping = []
    init_next = 0
    j = 0

    loc_len.each do |loc, len|
      (init_next..loc).each do |i|
        offset_mapping << [i, j]
        j += 1
      end
      init_next = loc + len
    end

    (init_next..str.length).each do |i|
      offset_mapping << [i, j]
      j += 1
    end

    # Shrink the placeholders down to the single replacement character.
    char_mapping.each do |one, long|
      str.gsub!(one * long.length, one) if long.length > 1
    end

    # Collapse the whitespace runs.
    str.gsub!(/\s{2,}/, ' ')

    [str, offset_mapping]
  end
end
167
+
168
# Command-line entry point: reads one JSON annotation file, normalizes its
# :text with TextAlignment::CharMapping, re-bases the denotation spans onto the
# normalized text and prints the result as JSON.
if __FILE__ == $0
  require 'json'

  unless ARGV.length == 1
    warn "#{$0} an_annotation_json_file.json"
    exit
  end

  annotations = JSON.parse File.read(ARGV[0]).strip, symbolize_names: true

  # Use the top-level denotations, else those of the first track, else none.
  # `dig` also copes with a missing or empty :tracks array.
  denotations = annotations[:denotations]
  denotations = annotations.dig(:tracks, 0, :denotations) if denotations.nil?
  denotations = [] if denotations.nil?

  str_mapping = TextAlignment::CharMapping.new(annotations[:text])
  new_annotations = {
    text: str_mapping.str,
    denotations: str_mapping.enmap_denotations(denotations)
  }

  puts new_annotations.to_json
end
@@ -1,7 +1,7 @@
1
1
  module TextAlignment; end unless defined? TextAlignment
2
2
 
3
3
  TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
4
- TextAlignment::SIZE_WINDOW = 60 unless defined? TextAlignment::SIZE_WINDOW
4
+ TextAlignment::SIZE_WINDOW = 10 unless defined? TextAlignment::SIZE_WINDOW
5
5
  TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
6
6
  TextAlignment::BUFFER_MIN = 20 unless defined? TextAlignment::BUFFER_MIN
7
7
  TextAlignment::TEXT_SIMILARITY_THRESHOLD = 0.9 unless defined? TextAlignment::TEXT_SIMILARITY_THRESHOLD
@@ -0,0 +1,19 @@
1
module TextAlignment; end unless defined? TextAlignment

# Records regions given as [begin, end) pairs: every position inside a
# recorded region is mapped to that region's end.
class TextAlignment::CultivationMap
  # Hash of position => end of the region recorded for that position.
  attr_reader :map

  def initialize
    @map = {}
  end

  # Records each [begin, end) region; `end` itself is not recorded.
  # Returns the regions it was given.
  def cultivate(regions)
    regions.each do |region_begin, region_end|
      region_begin.upto(region_end - 1) do |position|
        @map[position] = region_end
      end
    end
  end

  # Returns the end of the recorded region containing the position,
  # or nil when the position is in no recorded region.
  def search_again_position(position)
    @map[position]
  end
end
@@ -5,7 +5,7 @@ require 'text_alignment/find_divisions'
5
5
  require 'text_alignment/lcs_comparison'
6
6
  require 'text_alignment/lcs_alignment'
7
7
  require 'text_alignment/glcs_alignment'
8
- require 'text_alignment/mappings'
8
+ require 'text_alignment/char_mapping'
9
9
 
10
10
  module TextAlignment; end unless defined? TextAlignment
11
11
 
@@ -106,7 +106,7 @@ if __FILE__ == $0
106
106
 
107
107
  dictionary = [["β", "beta"]]
108
108
  # align = TextAlignment::TextAlignment.new(str1, str2)
109
- align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::MAPPINGS)
109
+ align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::CHAR_MAPPING)
110
110
  p align.common_elements
111
111
  p align.mapped_elements
112
112
  end
@@ -6,7 +6,7 @@ require 'text_alignment/lcs_comparison'
6
6
  require 'text_alignment/lcs_alignment'
7
7
  require 'text_alignment/lcs_cdiff'
8
8
  require 'text_alignment/glcs_alignment'
9
- require 'text_alignment/mappings'
9
+ require 'text_alignment/char_mapping'
10
10
 
11
11
  module TextAlignment; end unless defined? TextAlignment
12
12
 
@@ -17,10 +17,12 @@ class TextAlignment::MixedAlignment
17
17
  attr_reader :similarity
18
18
  attr_reader :str1_match_initial, :str1_match_final, :str2_match_initial, :str2_match_final
19
19
 
20
- def initialize(_str1, _str2)
20
# Aligns _str1 and _str2.
#
# _str1, _str2 - the strings to align; both are duplicated before use.
# _mappings    - optional character mapping; TextAlignment::CHAR_MAPPING is
#                used when it is nil.
#
# Raises ArgumentError when either string is nil.
def initialize(_str1, _str2, _mappings = nil)
  raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?

  # Honor the caller's mapping: `mappings ||= ...` alone would create a fresh
  # local and silently ignore the _mappings argument.
  mappings = _mappings || TextAlignment::CHAR_MAPPING
  str1 = _str1.dup
  str2 = _str2.dup

  _compute_mixed_alignment(str1, str2, mappings)
end
@@ -139,72 +141,14 @@ class TextAlignment::MixedAlignment
139
141
  @position_map_end = posmap_end.sort.to_h
140
142
  end
141
143
 
142
- private
143
-
144
- def string_preprocessing(_str1, _str2)
145
- str1 = _str1.dup
146
- str2 = _str2.dup
147
- mappings = TextAlignment::MAPPINGS.dup
148
-
149
- ## single character mappings
150
- character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
151
- characters_from = character_mappings.collect{|m| m[0]}.join
152
- characters_to = character_mappings.collect{|m| m[1]}.join
153
- characters_to.gsub!(/-/, '\-')
154
-
155
- str1.tr!(characters_from, characters_to)
156
- str2.tr!(characters_from, characters_to)
157
-
158
- mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
159
-
160
- ## long to one character mappings
161
- pletters = TextAlignment::PADDING_LETTERS
162
-
163
- # find the padding letter for str1
164
- @padding_letter1 = begin
165
- i = pletters.index{|l| str2.index(l).nil?}
166
- raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
167
- TextAlignment::PADDING_LETTERS[i]
168
- end
169
-
170
- # find the padding letter for str2
171
- @padding_letter2 = begin
172
- i = pletters.index{|l| l != @padding_letter1 && str1.index(l).nil?}
173
- raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
174
- TextAlignment::PADDING_LETTERS[i]
175
- end
176
-
177
- # ASCII foldings
178
- ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
179
- ascii_foldings.each do |f|
180
- from = f[1]
181
-
182
- if str2.index(f[0])
183
- to = f[0] + (@padding_letter1 * (f[1].length - 1))
184
- str1.gsub!(from, to)
185
- end
186
-
187
- if str1.index(f[0])
188
- to = f[0] + (@padding_letter2 * (f[1].length - 1))
189
- str2.gsub!(from, to)
190
- end
191
- end
192
- mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
193
-
194
- [str1, str2, mappings]
195
- end
196
-
197
- def compute_similarity(_s1, _s2, sdiff)
144
+ def compute_similarity(s1, s2, sdiff)
198
145
  return 0 if sdiff.nil?
199
146
 
200
147
  # compute the lcs only with non-whitespace letters
201
148
  lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
202
149
  return 0 if lcs == 0
203
150
 
204
- s1 = _s1.tr(@padding_letter1, ' ')
205
- s2 = _s2.tr(@padding_letter2, ' ')
206
-
207
- similarity = lcs / [s1.scan(/\S/).count, s2.scan(/\S/).count].min.to_f
151
+ similarity = lcs.to_f / [s1.scan(/\S/).count, s2.scan(/\S/).count].min
208
152
  end
209
153
 
210
154
  end
@@ -2,39 +2,214 @@
2
2
  require 'text_alignment/constants'
3
3
  require 'text_alignment/anchor_finder'
4
4
  require 'text_alignment/mixed_alignment'
5
+ require 'text_alignment/cultivation_map'
5
6
 
6
7
  module TextAlignment; end unless defined? TextAlignment
7
8
 
8
- TextAlignment::PADDING_LETTERS = ['@', '^', '|', '#', '$', '%', '&', '_'] unless defined? TextAlignment::PADDING_LETTERS
9
-
10
9
  class TextAlignment::TextAlignment
11
10
  attr_reader :block_alignment
12
11
  attr_reader :similarity
13
12
  attr_reader :lost_annotations
13
+ attr_reader :cultivation_map
14
14
 
15
- def initialize(str1, str2, denotations = nil, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
16
- raise ArgumentError, "nil string" if str1.nil? || str2.nil?
15
+ def initialize(_str1, _str2, _denotations = nil, _cultivation_map = nil)
16
+ raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
17
17
 
18
- @block_alignment = {source_text:str1, target_text:str2}
19
- @str1 = str1
20
- @str2 = str2
18
+ @block_alignment = {source_text: _str1, target_text: _str2, denotations: _denotations}
19
+ @original_str1 = _str1
20
+ @original_str2 = _str2
21
21
 
22
- ## Block exact match
23
- block_begin = str2.index(str1)
24
- unless block_begin.nil?
25
- @block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
26
- return
22
+ @str1_mapping = TextAlignment::CharMapping.new(_str1)
23
+ @str2_mapping = TextAlignment::CharMapping.new(_str2)
24
+
25
+ str1 = @str1_mapping.str
26
+ denotations = @str1_mapping.enmap_denotations(_denotations)
27
+
28
+ str2 = @str2_mapping.str
29
+
30
+ @cultivation_map = _cultivation_map || TextAlignment::CultivationMap.new
31
+
32
+ @block_alignment[:blocks] = if r = whole_block_alignment(str1, str2, @cultivation_map)
33
+ # whole block alignment
34
+ r
35
+ else
36
+ find_block_alignment(str1, str2, denotations, @cultivation_map)
27
37
  end
28
38
 
29
- block_begin = str2.downcase.index(str1.downcase)
30
- unless block_begin.nil?
31
- @block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
32
- return
39
+ newly_cultivated_regions = @block_alignment[:blocks].collect do |b|
40
+ if b[:alignment] == :block || b[:alignment] == :term
41
+ [b[:target][:begin], b[:target][:end]]
42
+ else
43
+ nil
44
+ end
45
+ end.compact
46
+ newly_cultivated_regions_condensed = newly_cultivated_regions.inject([]) do |condensed, region|
47
+ if condensed.empty? || (condensed.last.last + 1 < region.first)
48
+ condensed.push region
49
+ else
50
+ condensed.last[1] = region.last
51
+ end
52
+ condensed
53
+ end
54
+
55
+ @cultivation_map.cultivate(newly_cultivated_regions_condensed)
56
+ end
57
+
58
# Maps a begin offset of the original source text to the corresponding begin
# offset of the original target text.
#
# Returns nil when the offset cannot be transformed: it falls inside a folded
# span of the source mapping, no block covers it, the covering block is an
# :empty one without a usable target, or the nested alignment has no answer.
def transform_begin_position(_begin_position)
  begin_position = @str1_mapping.enmap_position(_begin_position)
  return nil if begin_position.nil?

  # The first block that ends after the position is the one containing it.
  block = @block_alignment[:blocks].find { |b| b[:source][:end] > begin_position }
  return nil if block.nil?

  b = case block[:alignment]
      when :block, :term
        begin_position + block[:delta]
      when :empty
        # Only the very start of an empty block has a counterpart, and only
        # when the block carries a target at all.
        block.dig(:target, :begin) if begin_position == block[:source][:begin]
      else
        # Otherwise :alignment holds a nested alignment object working with
        # offsets relative to the block.
        r = block[:alignment].transform_begin_position(begin_position - block[:source][:begin])
        r.nil? ? nil : r + block[:target][:begin]
      end

  b.nil? ? nil : @str2_mapping.demap_position(b)
end
79
+
80
# Maps an end offset of the original source text to the corresponding end
# offset of the original target text.
#
# Returns nil when the offset cannot be transformed: it falls inside a folded
# span of the source mapping, no block covers it, the covering block is an
# :empty one without a usable target, or the nested alignment has no answer.
def transform_end_position(_end_position)
  end_position = @str1_mapping.enmap_position(_end_position)
  return nil if end_position.nil?

  # An end offset may coincide with the end of its block, hence `>=`.
  block = @block_alignment[:blocks].find { |b| b[:source][:end] >= end_position }
  return nil if block.nil?

  e = case block[:alignment]
      when :block, :term
        end_position + block[:delta]
      when :empty
        # Only the very end of an empty block has a counterpart, and only
        # when the block carries a target at all.
        block.dig(:target, :end) if end_position == block[:source][:end]
      else
        # Otherwise :alignment holds a nested alignment object working with
        # offsets relative to the block.
        r = block[:alignment].transform_end_position(end_position - block[:source][:begin])
        r.nil? ? nil : r + block[:target][:begin]
      end

  e.nil? ? nil : @str2_mapping.demap_position(e)
end
101
+
102
+ def transform_a_span(span)
103
+ {begin: transform_begin_position(span[:begin]), end: transform_end_position(span[:end])}
104
+ end
35
105
 
106
+ def transform_spans(spans)
107
+ spans.map{|span| transform_a_span(span)}
108
+ end
109
+
110
+ def transform_denotations!(denotations)
111
+ return nil if denotations.nil?
112
+ @lost_annotations = []
113
+
114
+ denotations.each do |d|
115
+ source = {begin:d.begin, end:d.end}
116
+ d.begin = transform_begin_position(d.begin);
117
+ d.end = transform_end_position(d.end);
118
+ raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_str2.length
119
+ rescue
120
+ @lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
121
+ d.begin = nil
122
+ d.end = nil
123
+ end
124
+
125
+ @lost_annotations
126
+ end
127
+
128
+ def transform_hdenotations(hdenotations)
129
+ return nil if hdenotations.nil?
130
+ @lost_annotations = []
131
+
132
+ r = hdenotations.collect do |d|
133
+ t = transform_a_span(d[:span])
134
+ raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_str2.length
135
+ new_d = d.dup.merge({span:t})
136
+ rescue
137
+ @lost_annotations << {source: d[:span], target:t}
138
+ nil
139
+ end.compact
140
+
141
+ r
142
+ end
143
+
144
+ def alignment_show
145
+ stext = @block_alignment[:source_text]
146
+ ttext = @block_alignment[:target_text]
147
+
148
+ show = ''
149
+ @block_alignment[:blocks].each do |a|
150
+ show += case a[:alignment]
151
+ when :block
152
+ "===== common (block) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
153
+ stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
154
+ when :term
155
+ "===== common (term) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
156
+ stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
157
+ when :empty
158
+ "xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
159
+ "<<<<< string 1 [#{a[:source][:begin]} - #{a[:source][:end]}]\n" +
160
+ stext[a[:source][:begin] ... a[:source][:end]] + "\n\n" +
161
+ ">>>>> string 2 " +
162
+ if a[:target]
163
+ "[#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
164
+ ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
165
+ else
166
+ "[-]\n\n"
167
+ end
168
+ else
169
+ astr1 = ''
170
+ astr2 = ''
171
+
172
+ base = a[:source][:begin]
173
+ astr1 = a[:alignment].sdiff.map do |c|
174
+ case c.action
175
+ when '='
176
+ stext[c.old_position + base]
177
+ when '+'
178
+ '_'
179
+ when '-'
180
+ stext[c.old_position + base]
181
+ when '!'
182
+ stext[c.old_position + base] + '_'
183
+ end
184
+ end.join('')
185
+
186
+ base = a[:target][:begin]
187
+ astr2 = a[:alignment].sdiff.map do |c|
188
+ case c.action
189
+ when '='
190
+ ttext[c.new_position + base]
191
+ when '+'
192
+ ttext[c.new_position + base]
193
+ when '-'
194
+ '_'
195
+ when '!'
196
+ '_' + ttext[c.new_position + base]
197
+ end
198
+ end.join('')
199
+
200
+ "***** local mismatch [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}] (similarity: #{a[:similarity]})\n" +
201
+ "[#{astr1}]\n" +
202
+ "[#{astr2}]\n\n"
203
+ end
204
+ end
205
+ show
206
+ end
207
+
208
+ private
209
+
210
+ def find_block_alignment(str1, str2, denotations, cultivation_map)
36
211
  ## to find block alignments
37
- anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
212
+ anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, cultivation_map)
38
213
 
39
214
  blocks = []
40
215
  while block = anchor_finder.get_next_anchor
@@ -77,12 +252,13 @@ class TextAlignment::TextAlignment
77
252
 
78
253
  if b2 == e2
79
254
  [
80
- {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
255
+ {source:{begin:b1, end:e1}, alignment: :empty},
81
256
  block
82
257
  ]
83
258
  else
259
+ len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
260
+
84
261
  if b1 == 0 && b2 == 0
85
- len_buffer = (e1 * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
86
262
  b2 = e2 - len_buffer if e2 > len_buffer
87
263
  end
88
264
 
@@ -94,6 +270,10 @@ class TextAlignment::TextAlignment
94
270
  {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
95
271
  block
96
272
  ]
273
+ elsif ((e2 - b2) - (e1 - b1)) > len_buffer
274
+ la_block1 = local_alignment_blocks(str1, b1, e1, str2, b2, b2 + len_buffer, denotations)
275
+ la_block2 = local_alignment_blocks(str1, b1, e1, str2, e2 - len_buffer, e2, denotations)
276
+ [la_block2, la_block2].max{|a, b| a.first[:similarity] <=> b.first[:similarity]} << block
97
277
  else
98
278
  local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations) << block
99
279
  end
@@ -111,21 +291,58 @@ class TextAlignment::TextAlignment
111
291
  b1 = last_block[:source][:end]
112
292
  if b1 < str1.length
113
293
  e1 = str1.length
114
-
115
294
  b2 = last_block[:target][:end]
116
- if b2 < str2.length
117
- len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
118
- e2 = (str2.length - b2) > len_buffer ? b2 + len_buffer : str2.length
119
- local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations)
295
+
296
+ _str1 = str1[b1 ... e1]
297
+ if _str1.strip.empty?
298
+ [{source:{begin:b1, end:e1}, alignment: :empty}]
120
299
  else
121
- [{source:{begin:last_block[:source][:end], end:str1.length}, alignment: :empty}]
300
+ if b2 < str2.length
301
+ len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
302
+ e2 = (str2.length - b2) > len_buffer ? b2 + len_buffer : str2.length
303
+
304
+ local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations)
305
+ else
306
+ [{source:{begin:b1, end:e1}, alignment: :empty}]
307
+ end
122
308
  end
123
309
  else
124
310
  []
125
311
  end
126
312
  end
313
+ end
314
+
315
# Tries to place the whole of str1 inside str2, first exactly and then
# ignoring case, skipping occurrences that start in a region the cultivation
# map already knows.
#
# Returns a one-element array holding the :block alignment, or nil when no
# usable occurrence exists.
def whole_block_alignment(str1, str2, cultivation_map)
  block_begin = first_uncultivated_index(str2, str1, cultivation_map) ||
                first_uncultivated_index(str2.downcase, str1.downcase, cultivation_map)
  return nil if block_begin.nil?

  [{source: {begin: 0, end: str1.length}, target: {begin: block_begin, end: block_begin + str1.length}, delta: block_begin, alignment: :block}]
end

# Returns the first index of needle in haystack that the cultivation map does
# not redirect, or nil. Searching resumes at the position the map returns; a
# match found only inside cultivated regions is never reported.
def first_uncultivated_index(haystack, needle, cultivation_map)
  search_position = 0
  while (found = haystack.index(needle, search_position))
    resume_at = cultivation_map.search_again_position(found)
    return found if resume_at.nil?

    search_position = resume_at
  end
  nil
end
130
347
 
131
348
  def local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations = nil)
@@ -138,7 +355,7 @@ class TextAlignment::TextAlignment
138
355
  map{|d| d.merge(lex:str1[d[:span][:begin] ... d[:span][:end]])}
139
356
 
140
357
  position = 0
141
- tblocks = ds_in_scope.map do |term|
358
+ _tblocks = ds_in_scope.map do |term|
142
359
  lex = term[:lex]
143
360
  r = block2.index(lex, position)
144
361
  if r.nil?
@@ -146,11 +363,11 @@ class TextAlignment::TextAlignment
146
363
  break
147
364
  end
148
365
  position = r + lex.length
149
- {source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, delta: r - term[:span][:begin]}
366
+ {source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, similarity: 0.9, delta: r + b2 - term[:span][:begin]}
150
367
  end
151
368
 
152
369
  # missing term found
153
- tblocks = [] if position.nil?
370
+ _tblocks = [] if position.nil?
154
371
 
155
372
  # redundant matching found
156
373
  unless position.nil?
@@ -158,19 +375,20 @@ class TextAlignment::TextAlignment
158
375
  lex = term[:lex]
159
376
  look_forward = block2.index(lex, position)
160
377
  unless look_forward.nil?
161
- puts lex
162
- tblocks = []
378
+ _tblocks = []
163
379
  break
164
380
  end
165
381
  end
166
382
  end
167
383
 
168
- tblocks
384
+ _tblocks
385
+ else
386
+ []
169
387
  end
170
388
 
171
389
  if tblocks.empty?
172
390
  if b1 == 0 && e1 == str1.length
173
- if (e1 > 1000) || (e2 > 1000)
391
+ if (e1 > 2000) || (e2 > 2000)
174
392
  [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
175
393
  else
176
394
  block1 = str1[b1 ... e1]
@@ -237,153 +455,4 @@ class TextAlignment::TextAlignment
237
455
  end
238
456
  end
239
457
 
240
-
241
- def indices(str, target)
242
- position = 0
243
- len = target.len
244
- Enumerator.new do |yielder|
245
- while idx = str.index(target, position)
246
- yielder << idx
247
- position = idx + len
248
- end
249
- end
250
- end
251
-
252
- def transform_begin_position(begin_position)
253
- i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
254
- block = @block_alignment[:blocks][i]
255
-
256
- b = if block[:alignment] == :block || block[:alignment] == :term
257
- begin_position + block[:delta]
258
- elsif block[:alignment] == :empty
259
- if begin_position == block[:source][:begin]
260
- block[:target][:begin]
261
- else
262
- nil
263
- end
264
- else
265
- r = block[:alignment].transform_begin_position(begin_position - block[:source][:begin])
266
- r.nil? ? nil : r + block[:target][:begin]
267
- end
268
- end
269
-
270
- def transform_end_position(end_position)
271
- i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
272
- block = @block_alignment[:blocks][i]
273
-
274
- e = if block[:alignment] == :block || block[:alignment] == :term
275
- end_position + block[:delta]
276
- elsif block[:alignment] == :empty
277
- if end_position == block[:source][:end]
278
- block[:target][:end]
279
- else
280
- nil
281
- end
282
- else
283
- r = block[:alignment].transform_end_position(end_position - block[:source][:begin])
284
- r.nil? ? nil : r + block[:target][:begin]
285
- end
286
- end
287
-
288
- def transform_a_span(span)
289
- {begin: transform_begin_position(span[:begin]), end: transform_end_position(span[:end])}
290
- end
291
-
292
- def transform_spans(spans)
293
- spans.map{|span| transform_a_span(span)}
294
- end
295
-
296
- def transform_denotations!(denotations)
297
- return nil if denotations.nil?
298
- @lost_annotations = []
299
-
300
- denotations.each do |d|
301
- source = {begin:d.begin, end:d.end}
302
- d.begin = transform_begin_position(d.begin);
303
- d.end = transform_end_position(d.end);
304
- raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @str2.length
305
- rescue
306
- @lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
307
- d.begin = nil
308
- d.end = nil
309
- end
310
-
311
- @lost_annotations
312
- end
313
-
314
- def transform_hdenotations(hdenotations)
315
- return nil if hdenotations.nil?
316
- @lost_annotations = []
317
-
318
- r = hdenotations.collect do |d|
319
- t = transform_a_span(d[:span])
320
- raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @str2.length
321
- new_d = d.dup.merge({span:t})
322
- rescue
323
- @lost_annotations << {source: d[:span], target:t}
324
- nil
325
- end.compact
326
-
327
- r
328
- end
329
-
330
- def alignment_show
331
- stext = @block_alignment[:source_text]
332
- ttext = @block_alignment[:target_text]
333
-
334
- show = ''
335
- @block_alignment[:blocks].each do |a|
336
- show += case a[:alignment]
337
- when :block
338
- "===== common (block) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
339
- stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
340
- when :term
341
- "===== common (term) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
342
- stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
343
- when :empty
344
- "xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
345
- "<<<<< string 1 [#{a[:source][:begin]} - #{a[:source][:end]}]\n" +
346
- stext[a[:source][:begin] ... a[:source][:end]] + "\n\n" +
347
- ">>>>> string 2 [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
348
- ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
349
- else
350
- astr1 = ''
351
- astr2 = ''
352
-
353
- base = a[:source][:begin]
354
- astr1 = a[:alignment].sdiff.map do |c|
355
- case c.action
356
- when '='
357
- stext[c.old_position + base]
358
- when '+'
359
- '_'
360
- when '-'
361
- stext[c.old_position + base]
362
- when '!'
363
- stext[c.old_position + base] + '_'
364
- end
365
- end.join('')
366
-
367
- base = a[:target][:begin]
368
- astr2 = a[:alignment].sdiff.map do |c|
369
- case c.action
370
- when '='
371
- ttext[c.new_position + base]
372
- when '+'
373
- ttext[c.new_position + base]
374
- when '-'
375
- '_'
376
- when '!'
377
- '_' + ttext[c.new_position + base]
378
- end
379
- end.join('')
380
-
381
- "***** local mismatch [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}] (similarity: #{a[:similarity]})\n" +
382
- "[#{astr1}]\n" +
383
- "[#{astr2}]\n\n"
384
- end
385
- end
386
- show
387
- end
388
-
389
458
  end