text_alignment 0.7.3 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,189 @@
1
+ module TextAlignment; end unless defined? TextAlignment
2
+
3
+ TextAlignment::CHAR_MAPPING = [
4
+ ["©", "(c)"], #U+00A9 (Copyright Sign)
5
+
6
+ ["α", "alpha"], #U+03B1 (greek small letter alpha)
7
+ ["β", "beta"], #U+03B2 (greek small letter beta)
8
+ ["γ", "gamma"], #U+03B3 (greek small letter gamma)
9
+ ["δ", "delta"], #U+03B4 (greek small letter delta)
10
+ ["ε", "epsilon"], #U+03B5 (greek small letter epsilon)
11
+ ["ζ", "zeta"], #U+03B6 (greek small letter zeta)
12
+ ["η", "eta"], #U+03B7 (greek small letter eta)
13
+ ["θ", "theta"], #U+03B8 (greek small letter theta)
14
+ ["ι", "iota"], #U+03B9 (greek small letter iota)
15
+ ["κ", "kappa"], #U+03BA (greek small letter kappa)
16
+ ["λ", "lambda"], #U+03BB (greek small letter lambda)
17
+ ["λ", "lamda"], #U+03BB (greek small letter lambda)
18
+ ["μ", "mu"], #U+03BC (greek small letter mu)
19
+ ["ν", "nu"], #U+03BD (greek small letter nu)
20
+ ["ξ", "xi"], #U+03BE (greek small letter xi)
21
+ ["ο", "omicron"], #U+03BF (greek small letter omicron)
22
+ ["π", "pi"], #U+03C0 (greek small letter pi)
23
+ ["ρ", "rho"], #U+03C1 (greek small letter rho)
24
+ ["σ", "sigma"], #U+03C3 (greek small letter sigma)
25
+ ["τ", "tau"], #U+03C4 (greek small letter tau)
26
+ ["υ", "upsilon"], #U+03C5 (greek small letter upsilon)
27
+ ["φ", "phi"], #U+03C6 (greek small letter phi)
28
+ ["χ", "chi"], #U+03C7 (greek small letter chi)
29
+ ["ψ", "psi"], #U+03C8 (greek small letter psi)
30
+ ["ω", "omega"], #U+03C9 (greek small letter omega)
31
+
32
+ ["Α", "Alpha"], #U+0391 (greek capital letter alpha)
33
+ ["Β", "Beta"], #U+0392 (greek capital letter beta)
34
+ ["Γ", "Gamma"], #U+0393 (greek capital letter gamma)
35
+ ["Δ", "Delta"], #U+0394 (greek capital letter delta)
36
+ ["Ε", "Epsilon"], #U+0395 (greek capital letter epsilon)
37
+ ["Ζ", "Zeta"], #U+0396 (greek capital letter zeta)
38
+ ["Η", "Eta"], #U+0397 (greek capital letter eta)
39
+ ["Θ", "Theta"], #U+0398 (greek capital letter theta)
40
+ ["Ι", "Iota"], #U+0399 (greek capital letter iota)
41
+ ["Κ", "Kappa"], #U+039A (greek capital letter kappa)
42
+ ["Λ", "Lambda"], #U+039B (greek capital letter lambda)
43
+ ["Λ", "Lamda"], #U+039B (greek capital letter lambda)
44
+ ["Μ", "Mu"], #U+039C (greek capital letter mu)
45
+ ["Ν", "Nu"], #U+039D (greek capital letter nu)
46
+ ["Ξ", "Xi"], #U+039E (greek capital letter xi)
47
+ ["Ο", "Omicron"], #U+039F (greek capital letter omicron)
48
+ ["Π", "Pi"], #U+03A0 (greek capital letter pi)
49
+ ["Ρ", "Rho"], #U+03A1 (greek capital letter rho)
50
+ ["Σ", "Sigma"], #U+03A3 (greek capital letter sigma)
51
+ ["Τ", "Tau"], #U+03A4 (greek capital letter tau)
52
+ ["Υ", "Upsilon"], #U+03A5 (greek capital letter upsilon)
53
+ ["Φ", "Phi"], #U+03A6 (greek capital letter phi)
54
+ ["Χ", "Chi"], #U+03A7 (greek capital letter chi)
55
+ ["Ψ", "Psi"], #U+03A8 (greek capital letter Psi)
56
+ ["Ω", "Omega"], #U+03A9 (greek capital letter omega)
57
+
58
+ ["ϕ", "phi"], #U+03D5 (greek phi symbol)
59
+
60
+ ["×", "x"], #U+00D7 (multiplication sign)
61
+ ["•", "*"], #U+2022 (bullet)
62
+ [" ", " "], #U+2009 (thin space)
63
+ [" ", " "], #U+200A (hair space)
64
+ [" ", " "], #U+00A0 (Non-Breaking space)
65
+ [" ", " "], #U+3000 (ideographic space)
66
+ ["‐", "-"], #U+2010 (Hyphen)
67
+ ["‑", "-"], #U+2011 (Non-Breaking Hyphen)
68
+ ["−", "-"], #U+2212 (minus sign)
69
+ ["–", "-"], #U+2013 (en dash)
70
+ ["′", "'"], #U+2032 (prime)
71
+ ["‘", "'"], #U+2018 (left single quotation mark)
72
+ ["’", "'"], #U+2019 (right single quotation mark)
73
+ ["“", '"'], #U+201C (left double quotation mark)
74
+ ["”", '"'], #U+201D (right double quotation mark)
75
+ ['"', "''"]
76
+ ]
77
+
78
+
79
# Make sure the namespace exists even when this class is loaded on its own.
module TextAlignment; end unless defined? TextAlignment

# Normalizes a text with a character mapping and keeps the offset table needed
# to translate positions between the original text and the normalized text.
#
# A mapping is a list of [one, long] pairs. Pairs whose +long+ is a single
# character are applied as plain character replacements. For the other pairs,
# every occurrence of +long+ in the text is folded into the single character
# +one+. Runs of two or more whitespace characters are folded into one space.
#
# NOTE(review): the offset bookkeeping assumes +one+ is a single character in
# every pair (true for the pairs passed by the callers visible here) - confirm
# before passing other mappings.
class TextAlignment::CharMapping
  # The normalized text.
  attr_reader :mapped_text

  # _text        - the text to normalize (it is duplicated, not modified).
  # char_mapping - list of [one, long] pairs; TextAlignment::CHAR_MAPPING is
  #                used when nil.
  def initialize(_text, char_mapping = nil)
    char_mapping ||= TextAlignment::CHAR_MAPPING
    @mapped_text, offset_mapping = enmap_text(_text, char_mapping)
    @index_enmap = offset_mapping.to_h
    @index_demap = offset_mapping.map { |pair| pair.reverse }.to_h
  end

  # Translates a position in the original text into the normalized text.
  # Returns nil for a position that falls inside a folded span.
  def enmap_position(position)
    @index_enmap[position]
  end

  # Translates a position in the normalized text back into the original text.
  # Returns nil for an unknown position.
  def demap_position(position)
    @index_demap[position]
  end

  # Returns copies of the given denotation hashes whose :span is translated
  # into the normalized text, or nil when nil is given. The input hashes are
  # left untouched.
  def enmap_denotations(_denotations)
    return nil if _denotations.nil?

    _denotations.map do |d|
      d.merge(span: {begin: enmap_position(d[:span][:begin]), end: enmap_position(d[:span][:end])})
    end
  end

  private

  # Builds the normalized text and the offset table.
  #
  # Returns [mapped_text, offset_mapping] where offset_mapping is a list of
  # [position_before, position_after] pairs.
  def enmap_text(_text, char_mapping)
    text = _text.dup

    # To execute the single letter mapping (length preserving)
    char_mapping.each do |one, long|
      text.gsub!(one, long) if long.length == 1
    end

    # To collect the [location, length, substitute] of every span to fold
    spans = []
    char_mapping.each do |one, long|
      next if long.length == 1

      init_next = 0
      while (loc = text.index(long, init_next))
        spans << [loc, long.length, one]
        init_next = loc + long.length
      end

      # Mask the matched spans with a same-length placeholder, so that a later
      # pair cannot match inside a span which is already taken (e.g. "eta"
      # inside "beta"). The length of the text does not change.
      text.gsub!(long, one * long.length)
    end

    # To collect the consecutive whitespace sequences
    init_next = 0
    while (loc = text.index(/\s{2,}/, init_next))
      len = Regexp.last_match(0).length
      spans << [loc, len, ' ']
      init_next = loc + len
    end

    spans.sort_by!(&:first)

    # To build the mapped text and the offset mapping from the very same list
    # of spans. Collapsing the placeholders by pattern afterwards would also
    # collapse runs that were already in the text (e.g. '""'), which would make
    # the text and the offsets disagree.
    mapped_text = text[0, 0] # an empty string with the encoding of the text
    offset_mapping = []
    init_next = 0
    j = 0

    spans.each do |loc, len, substitute|
      next if loc < init_next # overlaps with a span which is already folded

      mapped_text << text[init_next...loc] << substitute
      (init_next..loc).each do |i|
        offset_mapping << [i, j]
        j += 1
      end
      init_next = loc + len
    end

    # The range includes text.length, so that an end offset can be mapped.
    mapped_text << text[init_next..-1]
    (init_next..text.length).each do |i|
      offset_mapping << [i, j]
      j += 1
    end

    [mapped_text, offset_mapping]
  end
end
169
+
170
# Command line entry point: reads an annotation JSON file, normalizes its text
# with the default character mapping, and prints the normalized text together
# with the re-positioned denotations as JSON.
if __FILE__ == $0
  require 'json'

  unless ARGV.length == 1
    warn "#{$0} an_annotation_json_file.json"
    exit
  end

  annotations = JSON.parse File.read(ARGV[0]).strip, symbolize_names: true

  # Fall back to the denotations of the first track. dig yields nil, instead
  # of raising, when the list of tracks is missing or empty.
  denotations = annotations[:denotations] || annotations.dig(:tracks, 0, :denotations)

  text_mapping = TextAlignment::CharMapping.new(annotations[:text])
  text_mapped = text_mapping.mapped_text
  denotations_mapped = text_mapping.enmap_denotations(denotations)
  new_annotations = {text: text_mapped, denotations: denotations_mapped}

  puts new_annotations.to_json
end
@@ -1,7 +1,7 @@
1
1
  module TextAlignment; end unless defined? TextAlignment
2
2
 
3
3
  TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
4
- TextAlignment::SIZE_WINDOW = 30 unless defined? TextAlignment::SIZE_WINDOW
4
+ TextAlignment::SIZE_WINDOW = 10 unless defined? TextAlignment::SIZE_WINDOW
5
5
  TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
6
6
  TextAlignment::BUFFER_MIN = 20 unless defined? TextAlignment::BUFFER_MIN
7
7
  TextAlignment::TEXT_SIMILARITY_THRESHOLD = 0.9 unless defined? TextAlignment::TEXT_SIMILARITY_THRESHOLD
@@ -0,0 +1,19 @@
1
module TextAlignment; end unless defined? TextAlignment

# Remembers the regions which are already taken ("cultivated"). For every
# position inside a cultivated region, the map holds the end of that region.
class TextAlignment::CultivationMap
  # Hash from a cultivated position to the end of its region.
  attr_reader :map

  def initialize
    @map = {}
  end

  # Registers regions given as [begin, end] pairs; the end is exclusive.
  # A position registered again is overwritten with the new region end.
  def cultivate(regions)
    regions.each do |region_begin, region_end|
      region_begin.upto(region_end - 1) do |position|
        @map[position] = region_end
      end
    end
  end

  # Returns the end of the cultivated region which covers the position,
  # or nil when the position is not cultivated.
  def search_again_position(position)
    @map[position]
  end
end
@@ -5,7 +5,7 @@ require 'text_alignment/find_divisions'
5
5
  require 'text_alignment/lcs_comparison'
6
6
  require 'text_alignment/lcs_alignment'
7
7
  require 'text_alignment/glcs_alignment'
8
- require 'text_alignment/mappings'
8
+ require 'text_alignment/char_mapping'
9
9
 
10
10
  module TextAlignment; end unless defined? TextAlignment
11
11
 
@@ -106,7 +106,7 @@ if __FILE__ == $0
106
106
 
107
107
  dictionary = [["β", "beta"]]
108
108
  # align = TextAlignment::TextAlignment.new(str1, str2)
109
- align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::MAPPINGS)
109
+ align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::CHAR_MAPPING)
110
110
  p align.common_elements
111
111
  p align.mapped_elements
112
112
  end
@@ -6,7 +6,7 @@ require 'text_alignment/lcs_comparison'
6
6
  require 'text_alignment/lcs_alignment'
7
7
  require 'text_alignment/lcs_cdiff'
8
8
  require 'text_alignment/glcs_alignment'
9
- require 'text_alignment/mappings'
9
+ require 'text_alignment/char_mapping'
10
10
 
11
11
  module TextAlignment; end unless defined? TextAlignment
12
12
 
@@ -17,10 +17,12 @@ class TextAlignment::MixedAlignment
17
17
  attr_reader :similarity
18
18
  attr_reader :str1_match_initial, :str1_match_final, :str2_match_initial, :str2_match_final
19
19
 
20
- def initialize(_str1, _str2)
20
# Computes the mixed alignment of two strings.
#
# _str1, _str2 - the strings to align; they are duplicated before use.
# _mappings    - optional list of character mappings; when nil,
#                TextAlignment::CHAR_MAPPING is used.
#
# Raises ArgumentError when either string is nil.
def initialize(_str1, _str2, _mappings = nil)
  raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?

  # The argument is named _mappings: reading the bare local `mappings` here
  # would always see nil and silently discard what the caller passed.
  mappings = _mappings || TextAlignment::CHAR_MAPPING
  str1 = _str1.dup
  str2 = _str2.dup

  _compute_mixed_alignment(str1, str2, mappings)
end
@@ -139,72 +141,14 @@ class TextAlignment::MixedAlignment
139
141
  @position_map_end = posmap_end.sort.to_h
140
142
  end
141
143
 
142
- private
143
-
144
- def string_preprocessing(_str1, _str2)
145
- str1 = _str1.dup
146
- str2 = _str2.dup
147
- mappings = TextAlignment::MAPPINGS.dup
148
-
149
- ## single character mappings
150
- character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
151
- characters_from = character_mappings.collect{|m| m[0]}.join
152
- characters_to = character_mappings.collect{|m| m[1]}.join
153
- characters_to.gsub!(/-/, '\-')
154
-
155
- str1.tr!(characters_from, characters_to)
156
- str2.tr!(characters_from, characters_to)
157
-
158
- mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
159
-
160
- ## long to one character mappings
161
- pletters = TextAlignment::PADDING_LETTERS
162
-
163
- # find the padding letter for str1
164
- @padding_letter1 = begin
165
- i = pletters.index{|l| str2.index(l).nil?}
166
- raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
167
- TextAlignment::PADDING_LETTERS[i]
168
- end
169
-
170
- # find the padding letter for str2
171
- @padding_letter2 = begin
172
- i = pletters.index{|l| l != @padding_letter1 && str1.index(l).nil?}
173
- raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
174
- TextAlignment::PADDING_LETTERS[i]
175
- end
176
-
177
- # ASCII foldings
178
- ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
179
- ascii_foldings.each do |f|
180
- from = f[1]
181
-
182
- if str2.index(f[0])
183
- to = f[0] + (@padding_letter1 * (f[1].length - 1))
184
- str1.gsub!(from, to)
185
- end
186
-
187
- if str1.index(f[0])
188
- to = f[0] + (@padding_letter2 * (f[1].length - 1))
189
- str2.gsub!(from, to)
190
- end
191
- end
192
- mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
193
-
194
- [str1, str2, mappings]
195
- end
196
-
197
- def compute_similarity(_s1, _s2, sdiff)
144
# Returns the similarity of two strings, given their sdiff: the number of
# unchanged non-whitespace letters divided by the number of non-whitespace
# letters of the string which has fewer of them.
#
# Returns 0 when sdiff is nil or when no non-whitespace letter is unchanged.
def compute_similarity(s1, s2, sdiff)
  return 0 if sdiff.nil?

  # count the common letters, ignoring whitespace
  common = sdiff.count do |change|
    change.action == '=' && change.old_element =~ /\S/ && change.new_element =~ /\S/
  end
  return 0 if common == 0

  letters_of_shorter = [s1, s2].map { |s| s.scan(/\S/).count }.min
  common.to_f / letters_of_shorter
end
209
153
 
210
154
  end
@@ -2,39 +2,233 @@
2
2
  require 'text_alignment/constants'
3
3
  require 'text_alignment/anchor_finder'
4
4
  require 'text_alignment/mixed_alignment'
5
+ require 'text_alignment/cultivation_map'
5
6
 
6
7
  module TextAlignment; end unless defined? TextAlignment
7
8
 
8
- TextAlignment::PADDING_LETTERS = ['@', '^', '|', '#', '$', '%', '&', '_'] unless defined? TextAlignment::PADDING_LETTERS
9
-
10
9
  class TextAlignment::TextAlignment
11
10
  attr_reader :block_alignment
12
11
  attr_reader :similarity
13
12
  attr_reader :lost_annotations
14
13
 
15
- def initialize(str1, str2, denotations = nil, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
16
- raise ArgumentError, "nil string" if str1.nil? || str2.nil?
14
+ # Initialize with a reference text, against which texts will be aligned
15
+ def initialize(reference_text, to_prevent_overlap = false)
16
+ raise ArgumentError, "nil text" if reference_text.nil?
17
17
 
18
- @block_alignment = {source_text:str1, target_text:str2}
19
- @str1 = str1
20
- @str2 = str2
18
+ @original_rtext = reference_text
19
+ @rtext_mapping = TextAlignment::CharMapping.new(reference_text)
20
+ @to_prevent_overlap = to_prevent_overlap
21
21
 
22
- ## Block exact match
23
- block_begin = str2.index(str1)
24
- unless block_begin.nil?
25
- @block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
26
- return
22
+ @original_text = nil
23
+ @block_alignment = nil
24
+ @cultivation_map = TextAlignment::CultivationMap.new
25
+ end
26
+
27
# Aligns a text against the reference text and stores the result in
# @block_alignment.
#
# text        - the text to align.
# denotations - optional denotations of the text; they are re-positioned into
#               the normalized text and handed to the block alignment search.
#
# Returns the list of alignment blocks.
# Raises ArgumentError when text is nil.
def align(text, denotations = nil)
  raise ArgumentError, "nil text" if text.nil?

  # To maintain the cultivation map
  update_cultivation_map if @to_prevent_overlap

  # In case the input text is the same as the previous one, reuse the previous text mapping
  unless @original_text == text
    @original_text = text
    @text_mapping = TextAlignment::CharMapping.new(text)
  end

  text_mapped = @text_mapping.mapped_text
  denotations_mapped = @text_mapping.enmap_denotations(denotations)
  rtext_mapped = @rtext_mapping.mapped_text

  ## To generate the block_alignment of the input text against the reference text
  @block_alignment = {text: @original_text, reference_text: @original_rtext, denotations: denotations}

  # A whole block match is tried first; the anchor based search is the fallback.
  @block_alignment[:blocks] =
    whole_block_alignment(text_mapped, rtext_mapped, @cultivation_map) ||
    find_block_alignment(text_mapped, rtext_mapped, denotations_mapped, @cultivation_map)
end
54
+
55
# Registers the target regions of the :block and :term blocks of the latest
# alignment in the cultivation map. Regions which touch or overlap their
# predecessor (a gap of at most one position) are merged before they are
# registered. Does nothing when there is no alignment yet.
def update_cultivation_map
  return if @block_alignment.nil?

  blocks = @block_alignment[:blocks]
  return if blocks.nil?

  matched_blocks = blocks.select do |blk|
    blk[:alignment] == :block || blk[:alignment] == :term
  end

  merged_regions = matched_blocks.each_with_object([]) do |blk, regions|
    region_begin = blk[:target][:begin]
    region_end = blk[:target][:end]

    if regions.empty? || (regions.last.last + 1 < region_begin)
      regions << [region_begin, region_end]
    else
      # extend the previous region instead of starting a new one
      regions.last[1] = region_end
    end
  end

  @cultivation_map.cultivate(merged_regions)
end
76
+
77
# Transforms a begin position in the aligned text into the corresponding
# position in the reference text, based on the latest block alignment.
#
# Returns nil when the position cannot be transformed: it falls inside a span
# folded by the character mapping, no block covers it, it lies inside a
# non-aligned (:empty) block, or the covering :empty block has no target.
def transform_begin_position(_begin_position)
  begin_position = @text_mapping.enmap_position(_begin_position)
  return nil if begin_position.nil?

  block = @block_alignment[:blocks].find { |blk| blk[:source][:end] > begin_position }
  return nil if block.nil?

  b = if block[:alignment] == :block || block[:alignment] == :term
    begin_position + block[:delta]
  elsif block[:alignment] == :empty
    # only the very begin of an :empty block is transformable, and only when
    # the block has a counterpart in the reference text
    if begin_position == block[:source][:begin] && block[:target]
      block[:target][:begin]
    else
      nil
    end
  else
    # a local alignment object: delegate with a block-relative position
    r = block[:alignment].transform_begin_position(begin_position - block[:source][:begin])
    r.nil? ? nil : r + block[:target][:begin]
  end

  b.nil? ? nil : @rtext_mapping.demap_position(b)
end
98
+
99
# Transforms an end position in the aligned text into the corresponding
# position in the reference text, based on the latest block alignment.
#
# Returns nil when the position cannot be transformed: it falls inside a span
# folded by the character mapping, no block covers it, it lies inside a
# non-aligned (:empty) block, or the covering :empty block has no target.
def transform_end_position(_end_position)
  end_position = @text_mapping.enmap_position(_end_position)
  return nil if end_position.nil?

  block = @block_alignment[:blocks].find { |blk| blk[:source][:end] >= end_position }
  return nil if block.nil?

  e = if block[:alignment] == :block || block[:alignment] == :term
    end_position + block[:delta]
  elsif block[:alignment] == :empty
    # only the very end of an :empty block is transformable, and only when
    # the block has a counterpart in the reference text
    if end_position == block[:source][:end] && block[:target]
      block[:target][:end]
    else
      nil
    end
  else
    # a local alignment object: delegate with a block-relative position
    r = block[:alignment].transform_end_position(end_position - block[:source][:begin])
    r.nil? ? nil : r + block[:target][:begin]
  end

  e.nil? ? nil : @rtext_mapping.demap_position(e)
end
120
+
121
+ def transform_a_span(span)
122
+ {begin: transform_begin_position(span[:begin]), end: transform_end_position(span[:end])}
123
+ end
124
+
125
+ def transform_spans(spans)
126
+ spans.map{|span| transform_a_span(span)}
127
+ end
128
+
129
# Transforms, in place, the begin and end of every denotation into positions
# in the reference text. The denotations are objects with begin/end accessors.
#
# A denotation whose transformation fails or yields an invalid span gets nil
# for both begin and end, and is recorded in @lost_annotations together with
# its source span.
#
# Returns the list of lost annotations, or nil when nil is given.
def transform_denotations!(denotations)
  return nil if denotations.nil?

  @lost_annotations = []

  denotations.each do |denotation|
    begin
      source = {begin: denotation.begin, end: denotation.end}
      denotation.begin = transform_begin_position(denotation.begin)
      denotation.end = transform_end_position(denotation.end)

      b = denotation.begin
      if b.nil? || denotation.end.nil? || b < 0 || denotation.end <= b || denotation.end > @original_rtext.length
        raise "invalid transform"
      end
    rescue
      # any failure above means that the annotation is lost
      @lost_annotations << {source: source, target: {begin: denotation.begin, end: denotation.end}}
      denotation.begin = nil
      denotation.end = nil
    end
  end

  @lost_annotations
end
146
+
147
# Transforms the :span of every denotation hash into the reference text.
#
# Returns a new list with copies of the denotations that could be transformed
# into a valid span; the input hashes are left untouched. The other
# denotations are recorded in @lost_annotations with their source span and
# whatever target span was obtained. Returns nil when nil is given.
def transform_hdenotations(hdenotations)
  return nil if hdenotations.nil?

  @lost_annotations = []

  hdenotations.each_with_object([]) do |denotation, transformed|
    begin
      span = transform_a_span(denotation[:span])
      if span[:begin].nil? || span[:end].nil? || span[:begin] < 0 || span[:end] <= span[:begin] || span[:end] > @original_rtext.length
        raise "invalid transform"
      end
      transformed << denotation.merge({span: span})
    rescue
      # any failure above means that the annotation is lost
      @lost_annotations << {source: denotation[:span], target: span}
    end
  end
end
162
+
163
+ def alignment_show
164
+ stext = @block_alignment[:text]
165
+ ttext = @block_alignment[:reference_text]
166
+
167
+ show = ''
168
+ @block_alignment[:blocks].each do |a|
169
+ show += case a[:alignment]
170
+ when :block
171
+ "===== common (block) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
172
+ stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
173
+ when :term
174
+ "===== common (term) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
175
+ stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
176
+ when :empty
177
+ "xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
178
+ "<<<<< string 1 [#{a[:source][:begin]} - #{a[:source][:end]}]\n" +
179
+ stext[a[:source][:begin] ... a[:source][:end]] + "\n\n" +
180
+ ">>>>> string 2 " +
181
+ if a[:target]
182
+ "[#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
183
+ ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
184
+ else
185
+ "[-]\n\n"
186
+ end
187
+ else
188
+ astr1 = ''
189
+ astr2 = ''
190
+
191
+ base = a[:source][:begin]
192
+ astr1 = a[:alignment].sdiff.map do |c|
193
+ case c.action
194
+ when '='
195
+ stext[c.old_position + base]
196
+ when '+'
197
+ '_'
198
+ when '-'
199
+ stext[c.old_position + base]
200
+ when '!'
201
+ stext[c.old_position + base] + '_'
202
+ end
203
+ end.join('')
35
204
 
205
+ base = a[:target][:begin]
206
+ astr2 = a[:alignment].sdiff.map do |c|
207
+ case c.action
208
+ when '='
209
+ ttext[c.new_position + base]
210
+ when '+'
211
+ ttext[c.new_position + base]
212
+ when '-'
213
+ '_'
214
+ when '!'
215
+ '_' + ttext[c.new_position + base]
216
+ end
217
+ end.join('')
218
+
219
+ "***** local mismatch [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}] (similarity: #{a[:similarity]})\n" +
220
+ "[#{astr1}]\n" +
221
+ "[#{astr2}]\n\n"
222
+ end
223
+ end
224
+ show
225
+ end
226
+
227
+ private
228
+
229
+ def find_block_alignment(str1, str2, denotations, cultivation_map)
36
230
  ## to find block alignments
37
- anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
231
+ anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, cultivation_map)
38
232
 
39
233
  blocks = []
40
234
  while block = anchor_finder.get_next_anchor
@@ -77,12 +271,13 @@ class TextAlignment::TextAlignment
77
271
 
78
272
  if b2 == e2
79
273
  [
80
- {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
274
+ {source:{begin:b1, end:e1}, alignment: :empty},
81
275
  block
82
276
  ]
83
277
  else
278
+ len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
279
+
84
280
  if b1 == 0 && b2 == 0
85
- len_buffer = (e1 * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
86
281
  b2 = e2 - len_buffer if e2 > len_buffer
87
282
  end
88
283
 
@@ -94,6 +289,10 @@ class TextAlignment::TextAlignment
94
289
  {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
95
290
  block
96
291
  ]
292
+ elsif ((e2 - b2) - (e1 - b1)) > len_buffer
293
+ la_block1 = local_alignment_blocks(str1, b1, e1, str2, b2, b2 + len_buffer, denotations)
294
+ la_block2 = local_alignment_blocks(str1, b1, e1, str2, e2 - len_buffer, e2, denotations)
295
+ [la_block2, la_block2].max{|a, b| a.first[:similarity] <=> b.first[:similarity]} << block
97
296
  else
98
297
  local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations) << block
99
298
  end
@@ -111,21 +310,58 @@ class TextAlignment::TextAlignment
111
310
  b1 = last_block[:source][:end]
112
311
  if b1 < str1.length
113
312
  e1 = str1.length
114
-
115
313
  b2 = last_block[:target][:end]
116
- if b2 < str2.length
117
- len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
118
- e2 = (str2.length - b2) > len_buffer ? b2 + len_buffer : str2.length
119
- local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations)
314
+
315
+ _str1 = str1[b1 ... e1]
316
+ if _str1.strip.empty?
317
+ [{source:{begin:b1, end:e1}, alignment: :empty}]
120
318
  else
121
- [{source:{begin:last_block[:source][:end], end:str1.length}, alignment: :empty}]
319
+ if b2 < str2.length
320
+ len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
321
+ e2 = (str2.length - b2) > len_buffer ? b2 + len_buffer : str2.length
322
+
323
+ local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations)
324
+ else
325
+ [{source:{begin:b1, end:e1}, alignment: :empty}]
326
+ end
122
327
  end
123
328
  else
124
329
  []
125
330
  end
126
331
  end
332
+ end
333
+
334
# Tries to find str1 as a whole within str2, first as it is, then ignoring
# case. Occurrences which begin at a cultivated position are skipped.
#
# Returns a list with the single :block alignment, or nil when str1 does not
# occur in str2 outside of the cultivated regions.
def whole_block_alignment(str1, str2, cultivation_map)
  ## Block exact match first, then case-insensitive match
  block_begin = uncultivated_index(str1, str2, cultivation_map) ||
                uncultivated_index(str1.downcase, str2.downcase, cultivation_map)
  return nil if block_begin.nil?

  [{source: {begin: 0, end: str1.length}, target: {begin: block_begin, end: block_begin + str1.length}, delta: block_begin, alignment: :block}]
end

# Returns the position of the first occurrence of needle in haystack which
# does not begin at a cultivated position, or nil when there is none.
def uncultivated_index(needle, haystack, cultivation_map)
  search_position = 0

  loop do
    found = haystack.index(needle, search_position)
    return nil if found.nil?

    # For a cultivated position, the map tells where to resume the search.
    # A hit which had to be skipped must never be returned as the result.
    search_position = cultivation_map.search_again_position(found)
    return found if search_position.nil?
  end
end
130
366
 
131
367
  def local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations = nil)
@@ -138,7 +374,7 @@ class TextAlignment::TextAlignment
138
374
  map{|d| d.merge(lex:str1[d[:span][:begin] ... d[:span][:end]])}
139
375
 
140
376
  position = 0
141
- tblocks = ds_in_scope.map do |term|
377
+ _tblocks = ds_in_scope.map do |term|
142
378
  lex = term[:lex]
143
379
  r = block2.index(lex, position)
144
380
  if r.nil?
@@ -146,11 +382,11 @@ class TextAlignment::TextAlignment
146
382
  break
147
383
  end
148
384
  position = r + lex.length
149
- {source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, delta: r - term[:span][:begin]}
385
+ {source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, similarity: 0.9, delta: r + b2 - term[:span][:begin]}
150
386
  end
151
387
 
152
388
  # missing term found
153
- tblocks = [] if position.nil?
389
+ _tblocks = [] if position.nil?
154
390
 
155
391
  # redundant matching found
156
392
  unless position.nil?
@@ -158,14 +394,15 @@ class TextAlignment::TextAlignment
158
394
  lex = term[:lex]
159
395
  look_forward = block2.index(lex, position)
160
396
  unless look_forward.nil?
161
- puts lex
162
- tblocks = []
397
+ _tblocks = []
163
398
  break
164
399
  end
165
400
  end
166
401
  end
167
402
 
168
- tblocks
403
+ _tblocks
404
+ else
405
+ []
169
406
  end
170
407
 
171
408
  if tblocks.empty?
@@ -237,153 +474,4 @@ class TextAlignment::TextAlignment
237
474
  end
238
475
  end
239
476
 
240
-
241
- def indices(str, target)
242
- position = 0
243
- len = target.len
244
- Enumerator.new do |yielder|
245
- while idx = str.index(target, position)
246
- yielder << idx
247
- position = idx + len
248
- end
249
- end
250
- end
251
-
252
- def transform_begin_position(begin_position)
253
- i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
254
- block = @block_alignment[:blocks][i]
255
-
256
- b = if block[:alignment] == :block || block[:alignment] == :term
257
- begin_position + block[:delta]
258
- elsif block[:alignment] == :empty
259
- if begin_position == block[:source][:begin]
260
- block[:target][:begin]
261
- else
262
- nil
263
- end
264
- else
265
- r = block[:alignment].transform_begin_position(begin_position - block[:source][:begin])
266
- r.nil? ? nil : r + block[:target][:begin]
267
- end
268
- end
269
-
270
- def transform_end_position(end_position)
271
- i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
272
- block = @block_alignment[:blocks][i]
273
-
274
- e = if block[:alignment] == :block || block[:alignment] == :term
275
- end_position + block[:delta]
276
- elsif block[:alignment] == :empty
277
- if end_position == block[:source][:end]
278
- block[:target][:end]
279
- else
280
- nil
281
- end
282
- else
283
- r = block[:alignment].transform_end_position(end_position - block[:source][:begin])
284
- r.nil? ? nil : r + block[:target][:begin]
285
- end
286
- end
287
-
288
- def transform_a_span(span)
289
- {begin: transform_begin_position(span[:begin]), end: transform_end_position(span[:end])}
290
- end
291
-
292
- def transform_spans(spans)
293
- spans.map{|span| transform_a_span(span)}
294
- end
295
-
296
- def transform_denotations!(denotations)
297
- return nil if denotations.nil?
298
- @lost_annotations = []
299
-
300
- denotations.each do |d|
301
- source = {begin:d.begin, end:d.end}
302
- d.begin = transform_begin_position(d.begin);
303
- d.end = transform_end_position(d.end);
304
- raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @str2.length
305
- rescue
306
- @lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
307
- d.begin = nil
308
- d.end = nil
309
- end
310
-
311
- @lost_annotations
312
- end
313
-
314
- def transform_hdenotations(hdenotations)
315
- return nil if hdenotations.nil?
316
- @lost_annotations = []
317
-
318
- r = hdenotations.collect do |d|
319
- t = transform_a_span(d[:span])
320
- raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @str2.length
321
- new_d = d.dup.merge({span:t})
322
- rescue
323
- @lost_annotations << {source: d[:span], target:t}
324
- nil
325
- end.compact
326
-
327
- r
328
- end
329
-
330
- def alignment_show
331
- stext = @block_alignment[:source_text]
332
- ttext = @block_alignment[:target_text]
333
-
334
- show = ''
335
- @block_alignment[:blocks].each do |a|
336
- show += case a[:alignment]
337
- when :block
338
- "===== common (block) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
339
- stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
340
- when :term
341
- "===== common (term) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
342
- stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
343
- when :empty
344
- "xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
345
- "<<<<< string 1 [#{a[:source][:begin]} - #{a[:source][:end]}]\n" +
346
- stext[a[:source][:begin] ... a[:source][:end]] + "\n\n" +
347
- ">>>>> string 2 [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
348
- ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
349
- else
350
- astr1 = ''
351
- astr2 = ''
352
-
353
- base = a[:source][:begin]
354
- astr1 = a[:alignment].sdiff.map do |c|
355
- case c.action
356
- when '='
357
- stext[c.old_position + base]
358
- when '+'
359
- '_'
360
- when '-'
361
- stext[c.old_position + base]
362
- when '!'
363
- stext[c.old_position + base] + '_'
364
- end
365
- end.join('')
366
-
367
- base = a[:target][:begin]
368
- astr2 = a[:alignment].sdiff.map do |c|
369
- case c.action
370
- when '='
371
- ttext[c.new_position + base]
372
- when '+'
373
- ttext[c.new_position + base]
374
- when '-'
375
- '_'
376
- when '!'
377
- '_' + ttext[c.new_position + base]
378
- end
379
- end.join('')
380
-
381
- "***** local mismatch [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}] (similarity: #{a[:similarity]})\n" +
382
- "[#{astr1}]\n" +
383
- "[#{astr2}]\n\n"
384
- end
385
- end
386
- show
387
- end
388
-
389
477
  end