text_alignment 0.7.3 → 0.11.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,189 @@
1
+ module TextAlignment; end unless defined? TextAlignment
2
+
3
+ TextAlignment::CHAR_MAPPING = [
4
+ ["©", "(c)"], #U+00A9 (Copyright Sign)
5
+
6
+ ["α", "alpha"], #U+03B1 (greek small letter alpha)
7
+ ["β", "beta"], #U+03B2 (greek small letter beta)
8
+ ["γ", "gamma"], #U+03B3 (greek small letter gamma)
9
+ ["δ", "delta"], #U+03B4 (greek small letter delta)
10
+ ["ε", "epsilon"], #U+03B5 (greek small letter epsilon)
11
+ ["ζ", "zeta"], #U+03B6 (greek small letter zeta)
12
+ ["η", "eta"], #U+03B7 (greek small letter eta)
13
+ ["θ", "theta"], #U+03B8 (greek small letter theta)
14
+ ["ι", "iota"], #U+03B9 (greek small letter iota)
15
+ ["κ", "kappa"], #U+03BA (greek small letter kappa)
16
+ ["λ", "lambda"], #U+03BB (greek small letter lambda)
17
+ ["λ", "lamda"], #U+03BB (greek small letter lambda)
18
+ ["μ", "mu"], #U+03BC (greek small letter mu)
19
+ ["ν", "nu"], #U+03BD (greek small letter nu)
20
+ ["ξ", "xi"], #U+03BE (greek small letter xi)
21
+ ["ο", "omicron"], #U+03BF (greek small letter omicron)
22
+ ["π", "pi"], #U+03C0 (greek small letter pi)
23
+ ["ρ", "rho"], #U+03C1 (greek small letter rho)
24
+ ["σ", "sigma"], #U+03C3 (greek small letter sigma)
25
+ ["τ", "tau"], #U+03C4 (greek small letter tau)
26
+ ["υ", "upsilon"], #U+03C5 (greek small letter upsilon)
27
+ ["φ", "phi"], #U+03C6 (greek small letter phi)
28
+ ["χ", "chi"], #U+03C7 (greek small letter chi)
29
+ ["ψ", "psi"], #U+03C8 (greek small letter psi)
30
+ ["ω", "omega"], #U+03C9 (greek small letter omega)
31
+
32
+ ["Α", "Alpha"], #U+0391 (greek capital letter alpha)
33
+ ["Β", "Beta"], #U+0392 (greek capital letter beta)
34
+ ["Γ", "Gamma"], #U+0393 (greek capital letter gamma)
35
+ ["Δ", "Delta"], #U+0394 (greek capital letter delta)
36
+ ["Ε", "Epsilon"], #U+0395 (greek capital letter epsilon)
37
+ ["Ζ", "Zeta"], #U+0396 (greek capital letter zeta)
38
+ ["Η", "Eta"], #U+0397 (greek capital letter eta)
39
+ ["Θ", "Theta"], #U+0398 (greek capital letter theta)
40
+ ["Ι", "Iota"], #U+0399 (greek capital letter iota)
41
+ ["Κ", "Kappa"], #U+039A (greek capital letter kappa)
42
+ ["Λ", "Lambda"], #U+039B (greek capital letter lambda)
43
+ ["Λ", "Lamda"], #U+039B (greek capital letter lambda)
44
+ ["Μ", "Mu"], #U+039C (greek capital letter mu)
45
+ ["Ν", "Nu"], #U+039D (greek capital letter nu)
46
+ ["Ξ", "Xi"], #U+039E (greek capital letter xi)
47
+ ["Ο", "Omicron"], #U+039F (greek capital letter omicron)
48
+ ["Π", "Pi"], #U+03A0 (greek capital letter pi)
49
+ ["Ρ", "Rho"], #U+03A1 (greek capital letter rho)
50
+ ["Σ", "Sigma"], #U+03A3 (greek capital letter sigma)
51
+ ["Τ", "Tau"], #U+03A4 (greek capital letter tau)
52
+ ["Υ", "Upsilon"], #U+03A5 (greek capital letter upsilon)
53
+ ["Φ", "Phi"], #U+03A6 (greek capital letter phi)
54
+ ["Χ", "Chi"], #U+03A7 (greek capital letter chi)
55
+ ["Ψ", "Psi"], #U+03A8 (greek capital letter psi)
56
+ ["Ω", "Omega"], #U+03A9 (greek capital letter omega)
57
+
58
+ ["ϕ", "phi"], #U+03D5 (greek phi symbol)
59
+
60
+ ["×", "x"], #U+00D7 (multiplication sign)
61
+ ["•", "*"], #U+2022 (bullet)
62
+ [" ", " "], #U+2009 (thin space)
63
+ [" ", " "], #U+200A (hair space)
64
+ [" ", " "], #U+00A0 (Non-Breaking space)
65
+ [" ", " "], #U+3000 (ideographic space)
66
+ ["‐", "-"], #U+2010 (Hyphen)
67
+ ["‑", "-"], #U+2011 (Non-Breaking Hyphen)
68
+ ["−", "-"], #U+2212 (minus sign)
69
+ ["–", "-"], #U+2013 (en dash)
70
+ ["′", "'"], #U+2032 (prime)
71
+ ["‘", "'"], #U+2018 (left single quotation mark)
72
+ ["’", "'"], #U+2019 (right single quotation mark)
73
+ ["“", '"'], #U+201C (left double quotation mark)
74
+ ["”", '"'], #U+201D (right double quotation mark)
75
+ ['"', "''"]
76
+ ]
77
+
78
+
79
+ class TextAlignment::CharMapping
80
+ attr_reader :mapped_text
81
+
82
+ def initialize(_text, char_mapping = nil)
83
+ char_mapping ||= TextAlignment::CHAR_MAPPING
84
+ @mapped_text, offset_mapping = enmap_text(_text, char_mapping)
85
+ @index_enmap = offset_mapping.to_h
86
+ @index_demap = offset_mapping.map{|m| m.reverse}.to_h
87
+ end
88
+
89
+ def enmap_position(position)
90
+ @index_enmap[position]
91
+ end
92
+
93
+ def demap_position(position)
94
+ @index_demap[position]
95
+ end
96
+
97
+ def enmap_denotations(_denotations)
98
+ return nil if _denotations.nil?
99
+
100
+ denotations = _denotations.map do |d|
101
+ d.dup.merge(span:{begin:enmap_position(d[:span][:begin]), end:enmap_position(d[:span][:end])})
102
+ end
103
+ end
104
+
105
+ private
106
+
107
+ def enmap_text(_text, char_mapping)
108
+ text = _text.dup
109
+
110
+ # To execute the single letter mapping
111
+ char_mapping.each do |one, long|
112
+ text.gsub!(one, long) if long.length == 1
113
+ end
114
+
115
+ # To get the (location, length) index for replacements
116
+ loc_len = []
117
+ char_mapping.each do |one, long|
118
+ next if long.length == 1
119
+
120
+ init_next = 0
121
+ while loc = text.index(long, init_next)
122
+ loc_len << [loc, long.length]
123
+ init_next = loc + long.length
124
+ end
125
+
126
+ # a workaround to avoid messing-up due to embedding
127
+ text.gsub!(long, one * long.length)
128
+ end
129
+
130
+ # To get the (location, length) index for consecutive whitespace sequences
131
+ init_next = 0
132
+ while loc = text.index(/\s{2,}/, init_next)
133
+ len = $~[0].length
134
+ loc_len << [loc, len]
135
+ init_next = loc + len
136
+ end
137
+
138
+ loc_len.sort!{|a, b| a[0] <=> b[0]}
139
+
140
+ # To get the offset_mapping before and after replacement
141
+ offset_mapping = []
142
+ init_next = 0
143
+ j = 0
144
+
145
+ loc_len.each do |loc, len|
146
+ offset_mapping += (init_next .. loc).map do |i|
147
+ j += 1
148
+ [i, j - 1]
149
+ end
150
+ init_next = loc + len
151
+ end
152
+
153
+ offset_mapping += (init_next .. text.length).map do |i|
154
+ j += 1
155
+ [i, j - 1]
156
+ end
157
+
158
+ # To execute the long letter mapping
159
+ char_mapping.each do |one, long|
160
+ text.gsub!(one * long.length, one) if long.length > 1
161
+ end
162
+
163
+ # To replace each multi-whitespace sequence with a single space
164
+ text.gsub!(/\s{2,}/, ' ')
165
+
166
+ [text, offset_mapping]
167
+ end
168
+ end
169
+
170
+ if __FILE__ == $0
171
+ require 'json'
172
+
173
+ unless ARGV.length == 1
174
+ warn "#{$0} an_annotation_json_file.json"
175
+ exit
176
+ end
177
+ annotations = JSON.parse File.read(ARGV[0]).strip, symbolize_names: true
178
+ denotations = annotations[:denotations]
179
+ if denotations.nil? && annotations[:tracks]
180
+ denotations = annotations[:tracks].first[:denotations]
181
+ end
182
+
183
+ text_mapping = TextAlignment::CharMapping.new(annotations[:text])
184
+ text_mapped = text_mapping.mapped_text
185
+ denotations_mapped = text_mapping.enmap_denotations(denotations)
186
+ new_annotations = {text:text_mapped, denotations:denotations_mapped}
187
+
188
+ puts new_annotations.to_json
189
+ end
@@ -1,7 +1,7 @@
1
1
  module TextAlignment; end unless defined? TextAlignment
2
2
 
3
3
  TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
4
- TextAlignment::SIZE_WINDOW = 30 unless defined? TextAlignment::SIZE_WINDOW
4
+ TextAlignment::SIZE_WINDOW = 10 unless defined? TextAlignment::SIZE_WINDOW
5
5
  TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
6
6
  TextAlignment::BUFFER_MIN = 20 unless defined? TextAlignment::BUFFER_MIN
7
7
  TextAlignment::TEXT_SIMILARITY_THRESHOLD = 0.9 unless defined? TextAlignment::TEXT_SIMILARITY_THRESHOLD
@@ -0,0 +1,19 @@
1
+ module TextAlignment; end unless defined? TextAlignment
2
+
3
+ class TextAlignment::CultivationMap
4
+ attr_reader :map
5
+
6
+ def initialize
7
+ @map = {}
8
+ end
9
+
10
+ def cultivate(regions)
11
+ regions.each do |b, e|
12
+ (b ... e).each{|p| @map[p] = e}
13
+ end
14
+ end
15
+
16
+ def search_again_position(position)
17
+ @map[position]
18
+ end
19
+ end
@@ -5,7 +5,7 @@ require 'text_alignment/find_divisions'
5
5
  require 'text_alignment/lcs_comparison'
6
6
  require 'text_alignment/lcs_alignment'
7
7
  require 'text_alignment/glcs_alignment'
8
- require 'text_alignment/mappings'
8
+ require 'text_alignment/char_mapping'
9
9
 
10
10
  module TextAlignment; end unless defined? TextAlignment
11
11
 
@@ -106,7 +106,7 @@ if __FILE__ == $0
106
106
 
107
107
  dictionary = [["β", "beta"]]
108
108
  # align = TextAlignment::TextAlignment.new(str1, str2)
109
- align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::MAPPINGS)
109
+ align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::CHAR_MAPPING)
110
110
  p align.common_elements
111
111
  p align.mapped_elements
112
112
  end
@@ -6,7 +6,7 @@ require 'text_alignment/lcs_comparison'
6
6
  require 'text_alignment/lcs_alignment'
7
7
  require 'text_alignment/lcs_cdiff'
8
8
  require 'text_alignment/glcs_alignment'
9
- require 'text_alignment/mappings'
9
+ require 'text_alignment/char_mapping'
10
10
 
11
11
  module TextAlignment; end unless defined? TextAlignment
12
12
 
@@ -17,10 +17,12 @@ class TextAlignment::MixedAlignment
17
17
  attr_reader :similarity
18
18
  attr_reader :str1_match_initial, :str1_match_final, :str2_match_initial, :str2_match_final
19
19
 
20
- def initialize(_str1, _str2)
20
+ def initialize(_str1, _str2, _mappings = nil)
21
21
  raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
22
22
 
23
- str1, str2, mappings = string_preprocessing(_str1, _str2)
23
+ mappings ||= TextAlignment::CHAR_MAPPING
24
+ str1 = _str1.dup
25
+ str2 = _str2.dup
24
26
 
25
27
  _compute_mixed_alignment(str1, str2, mappings)
26
28
  end
@@ -139,72 +141,14 @@ class TextAlignment::MixedAlignment
139
141
  @position_map_end = posmap_end.sort.to_h
140
142
  end
141
143
 
142
- private
143
-
144
- def string_preprocessing(_str1, _str2)
145
- str1 = _str1.dup
146
- str2 = _str2.dup
147
- mappings = TextAlignment::MAPPINGS.dup
148
-
149
- ## single character mappings
150
- character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
151
- characters_from = character_mappings.collect{|m| m[0]}.join
152
- characters_to = character_mappings.collect{|m| m[1]}.join
153
- characters_to.gsub!(/-/, '\-')
154
-
155
- str1.tr!(characters_from, characters_to)
156
- str2.tr!(characters_from, characters_to)
157
-
158
- mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
159
-
160
- ## long to one character mappings
161
- pletters = TextAlignment::PADDING_LETTERS
162
-
163
- # find the padding letter for str1
164
- @padding_letter1 = begin
165
- i = pletters.index{|l| str2.index(l).nil?}
166
- raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
167
- TextAlignment::PADDING_LETTERS[i]
168
- end
169
-
170
- # find the padding letter for str2
171
- @padding_letter2 = begin
172
- i = pletters.index{|l| l != @padding_letter1 && str1.index(l).nil?}
173
- raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
174
- TextAlignment::PADDING_LETTERS[i]
175
- end
176
-
177
- # ASCII foldings
178
- ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
179
- ascii_foldings.each do |f|
180
- from = f[1]
181
-
182
- if str2.index(f[0])
183
- to = f[0] + (@padding_letter1 * (f[1].length - 1))
184
- str1.gsub!(from, to)
185
- end
186
-
187
- if str1.index(f[0])
188
- to = f[0] + (@padding_letter2 * (f[1].length - 1))
189
- str2.gsub!(from, to)
190
- end
191
- end
192
- mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
193
-
194
- [str1, str2, mappings]
195
- end
196
-
197
- def compute_similarity(_s1, _s2, sdiff)
144
+ def compute_similarity(s1, s2, sdiff)
198
145
  return 0 if sdiff.nil?
199
146
 
200
147
  # compute the lcs only with non-whitespace letters
201
148
  lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
202
149
  return 0 if lcs == 0
203
150
 
204
- s1 = _s1.tr(@padding_letter1, ' ')
205
- s2 = _s2.tr(@padding_letter2, ' ')
206
-
207
- similarity = lcs / [s1.scan(/\S/).count, s2.scan(/\S/).count].min.to_f
151
+ similarity = lcs.to_f / [s1.scan(/\S/).count, s2.scan(/\S/).count].min
208
152
  end
209
153
 
210
154
  end
@@ -2,39 +2,233 @@
2
2
  require 'text_alignment/constants'
3
3
  require 'text_alignment/anchor_finder'
4
4
  require 'text_alignment/mixed_alignment'
5
+ require 'text_alignment/cultivation_map'
5
6
 
6
7
  module TextAlignment; end unless defined? TextAlignment
7
8
 
8
- TextAlignment::PADDING_LETTERS = ['@', '^', '|', '#', '$', '%', '&', '_'] unless defined? TextAlignment::PADDING_LETTERS
9
-
10
9
  class TextAlignment::TextAlignment
11
10
  attr_reader :block_alignment
12
11
  attr_reader :similarity
13
12
  attr_reader :lost_annotations
14
13
 
15
- def initialize(str1, str2, denotations = nil, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
16
- raise ArgumentError, "nil string" if str1.nil? || str2.nil?
14
+ # Initialize with a reference text, against which texts will be aligned
15
+ def initialize(reference_text, to_prevent_overlap = false)
16
+ raise ArgumentError, "nil text" if reference_text.nil?
17
17
 
18
- @block_alignment = {source_text:str1, target_text:str2}
19
- @str1 = str1
20
- @str2 = str2
18
+ @original_rtext = reference_text
19
+ @rtext_mapping = TextAlignment::CharMapping.new(reference_text)
20
+ @to_prevent_overlap = to_prevent_overlap
21
21
 
22
- ## Block exact match
23
- block_begin = str2.index(str1)
24
- unless block_begin.nil?
25
- @block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
26
- return
22
+ @original_text = nil
23
+ @block_alignment = nil
24
+ @cultivation_map = TextAlignment::CultivationMap.new
25
+ end
26
+
27
+ def align(text, denotations = nil)
28
+ # To maintain the cultivation map
29
+ update_cultivation_map if @to_prevent_overlap
30
+
31
+ # In case the input text is the same as the previous one, reuse the previous text mapping
32
+ unless @original_text && @original_text == text
33
+ @original_text = text
34
+ @text_mapping = TextAlignment::CharMapping.new(text)
27
35
  end
28
36
 
29
- block_begin = str2.downcase.index(str1.downcase)
30
- unless block_begin.nil?
31
- @block_alignment[:blocks] = [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
32
- return
37
+ text_mapped = @text_mapping.mapped_text
38
+ denotations_mapped = @text_mapping.enmap_denotations(denotations)
39
+
40
+ rtext_mapped = @rtext_mapping.mapped_text
41
+
42
+ ## To generate the block_alignment of the input text against the reference text
43
+
44
+ # Initialization
45
+ @block_alignment = {text: @original_text, reference_text: @original_rtext, denotations: denotations}
46
+
47
+ # Generation
48
+ @block_alignment[:blocks] = if r = whole_block_alignment(text_mapped, rtext_mapped, @cultivation_map)
49
+ r
50
+ else
51
+ find_block_alignment(text_mapped, rtext_mapped, denotations_mapped, @cultivation_map)
52
+ end
53
+ end
54
+
55
+ def update_cultivation_map
56
+ return if @block_alignment.nil? || @block_alignment[:blocks].nil?
57
+
58
+ ## To update the cultivation map
59
+ newly_cultivated_regions = @block_alignment[:blocks].collect do |b|
60
+ if b[:alignment] == :block || b[:alignment] == :term
61
+ [b[:target][:begin], b[:target][:end]]
62
+ else
63
+ nil
64
+ end
65
+ end.compact.inject([]) do |condensed, region|
66
+ if condensed.empty? || (condensed.last.last + 1 < region.first)
67
+ condensed.push region
68
+ else
69
+ condensed.last[1] = region.last
70
+ end
71
+ condensed
72
+ end
73
+
74
+ @cultivation_map.cultivate(newly_cultivated_regions)
75
+ end
76
+
77
+ def transform_begin_position(_begin_position)
78
+ begin_position = @text_mapping.enmap_position(_begin_position)
79
+
80
+ i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
81
+ block = @block_alignment[:blocks][i]
82
+
83
+ b = if block[:alignment] == :block || block[:alignment] == :term
84
+ begin_position + block[:delta]
85
+ elsif block[:alignment] == :empty
86
+ if begin_position == block[:source][:begin]
87
+ block[:target][:begin]
88
+ else
89
+ nil
90
+ end
91
+ else
92
+ r = block[:alignment].transform_begin_position(begin_position - block[:source][:begin])
93
+ r.nil? ? nil : r + block[:target][:begin]
94
+ end
95
+
96
+ @rtext_mapping.demap_position(b)
97
+ end
98
+
99
+ def transform_end_position(_end_position)
100
+ end_position = @text_mapping.enmap_position(_end_position)
101
+
102
+ i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
103
+ block = @block_alignment[:blocks][i]
104
+
105
+ e = if block[:alignment] == :block || block[:alignment] == :term
106
+ end_position + block[:delta]
107
+ elsif block[:alignment] == :empty
108
+ if end_position == block[:source][:end]
109
+ block[:target][:end]
110
+ else
111
+ nil
112
+ end
113
+ else
114
+ r = block[:alignment].transform_end_position(end_position - block[:source][:begin])
115
+ r.nil? ? nil : r + block[:target][:begin]
116
+ end
117
+
118
+ @rtext_mapping.demap_position(e)
119
+ end
120
+
121
+ def transform_a_span(span)
122
+ {begin: transform_begin_position(span[:begin]), end: transform_end_position(span[:end])}
123
+ end
124
+
125
+ def transform_spans(spans)
126
+ spans.map{|span| transform_a_span(span)}
127
+ end
128
+
129
+ def transform_denotations!(denotations)
130
+ return nil if denotations.nil?
131
+ @lost_annotations = []
132
+
133
+ denotations.each do |d|
134
+ source = {begin:d.begin, end:d.end}
135
+ d.begin = transform_begin_position(d.begin);
136
+ d.end = transform_end_position(d.end);
137
+ raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_rtext.length
138
+ rescue
139
+ @lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
140
+ d.begin = nil
141
+ d.end = nil
33
142
  end
34
143
 
144
+ @lost_annotations
145
+ end
146
+
147
+ def transform_hdenotations(hdenotations)
148
+ return nil if hdenotations.nil?
149
+ @lost_annotations = []
150
+
151
+ r = hdenotations.collect do |d|
152
+ t = transform_a_span(d[:span])
153
+ raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_rtext.length
154
+ new_d = d.dup.merge({span:t})
155
+ rescue
156
+ @lost_annotations << {source: d[:span], target:t}
157
+ nil
158
+ end.compact
159
+
160
+ r
161
+ end
162
+
163
+ def alignment_show
164
+ stext = @block_alignment[:text]
165
+ ttext = @block_alignment[:reference_text]
166
+
167
+ show = ''
168
+ @block_alignment[:blocks].each do |a|
169
+ show += case a[:alignment]
170
+ when :block
171
+ "===== common (block) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
172
+ stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
173
+ when :term
174
+ "===== common (term) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
175
+ stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
176
+ when :empty
177
+ "xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
178
+ "<<<<< string 1 [#{a[:source][:begin]} - #{a[:source][:end]}]\n" +
179
+ stext[a[:source][:begin] ... a[:source][:end]] + "\n\n" +
180
+ ">>>>> string 2 " +
181
+ if a[:target]
182
+ "[#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
183
+ ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
184
+ else
185
+ "[-]\n\n"
186
+ end
187
+ else
188
+ astr1 = ''
189
+ astr2 = ''
190
+
191
+ base = a[:source][:begin]
192
+ astr1 = a[:alignment].sdiff.map do |c|
193
+ case c.action
194
+ when '='
195
+ stext[c.old_position + base]
196
+ when '+'
197
+ '_'
198
+ when '-'
199
+ stext[c.old_position + base]
200
+ when '!'
201
+ stext[c.old_position + base] + '_'
202
+ end
203
+ end.join('')
35
204
 
205
+ base = a[:target][:begin]
206
+ astr2 = a[:alignment].sdiff.map do |c|
207
+ case c.action
208
+ when '='
209
+ ttext[c.new_position + base]
210
+ when '+'
211
+ ttext[c.new_position + base]
212
+ when '-'
213
+ '_'
214
+ when '!'
215
+ '_' + ttext[c.new_position + base]
216
+ end
217
+ end.join('')
218
+
219
+ "***** local mismatch [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}] (similarity: #{a[:similarity]})\n" +
220
+ "[#{astr1}]\n" +
221
+ "[#{astr2}]\n\n"
222
+ end
223
+ end
224
+ show
225
+ end
226
+
227
+ private
228
+
229
+ def find_block_alignment(str1, str2, denotations, cultivation_map)
36
230
  ## to find block alignments
37
- anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
231
+ anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, cultivation_map)
38
232
 
39
233
  blocks = []
40
234
  while block = anchor_finder.get_next_anchor
@@ -77,12 +271,13 @@ class TextAlignment::TextAlignment
77
271
 
78
272
  if b2 == e2
79
273
  [
80
- {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
274
+ {source:{begin:b1, end:e1}, alignment: :empty},
81
275
  block
82
276
  ]
83
277
  else
278
+ len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
279
+
84
280
  if b1 == 0 && b2 == 0
85
- len_buffer = (e1 * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
86
281
  b2 = e2 - len_buffer if e2 > len_buffer
87
282
  end
88
283
 
@@ -94,6 +289,10 @@ class TextAlignment::TextAlignment
94
289
  {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
95
290
  block
96
291
  ]
292
+ elsif ((e2 - b2) - (e1 - b1)) > len_buffer
293
+ la_block1 = local_alignment_blocks(str1, b1, e1, str2, b2, b2 + len_buffer, denotations)
294
+ la_block2 = local_alignment_blocks(str1, b1, e1, str2, e2 - len_buffer, e2, denotations)
295
+ [la_block2, la_block2].max{|a, b| a.first[:similarity] <=> b.first[:similarity]} << block
97
296
  else
98
297
  local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations) << block
99
298
  end
@@ -111,21 +310,58 @@ class TextAlignment::TextAlignment
111
310
  b1 = last_block[:source][:end]
112
311
  if b1 < str1.length
113
312
  e1 = str1.length
114
-
115
313
  b2 = last_block[:target][:end]
116
- if b2 < str2.length
117
- len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
118
- e2 = (str2.length - b2) > len_buffer ? b2 + len_buffer : str2.length
119
- local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations)
314
+
315
+ _str1 = str1[b1 ... e1]
316
+ if _str1.strip.empty?
317
+ [{source:{begin:b1, end:e1}, alignment: :empty}]
120
318
  else
121
- [{source:{begin:last_block[:source][:end], end:str1.length}, alignment: :empty}]
319
+ if b2 < str2.length
320
+ len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
321
+ e2 = (str2.length - b2) > len_buffer ? b2 + len_buffer : str2.length
322
+
323
+ local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations)
324
+ else
325
+ [{source:{begin:b1, end:e1}, alignment: :empty}]
326
+ end
122
327
  end
123
328
  else
124
329
  []
125
330
  end
126
331
  end
332
+ end
333
+
334
+ def whole_block_alignment(str1, str2, cultivation_map)
335
+ ## Block exact match
336
+ search_position = 0
337
+
338
+ block_begin = begin
339
+ _block_begin = str2.index(str1, search_position)
340
+ break if _block_begin.nil?
341
+ search_position = cultivation_map.search_again_position(_block_begin)
342
+ _block_begin
343
+ end until search_position.nil?
344
+
345
+ unless block_begin.nil?
346
+ return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
347
+ end
348
+
349
+ search_position = 0
127
350
 
128
- @block_alignment[:blocks] = blocks2
351
+ dstr1 = str1.downcase
352
+ dstr2 = str2.downcase
353
+ block_begin = begin
354
+ _block_begin = dstr2.index(dstr1, search_position)
355
+ break if _block_begin.nil?
356
+ search_position = cultivation_map.search_again_position(_block_begin)
357
+ _block_begin
358
+ end until search_position.nil?
359
+
360
+ unless block_begin.nil?
361
+ return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
362
+ end
363
+
364
+ nil
129
365
  end
130
366
 
131
367
  def local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations = nil)
@@ -138,7 +374,7 @@ class TextAlignment::TextAlignment
138
374
  map{|d| d.merge(lex:str1[d[:span][:begin] ... d[:span][:end]])}
139
375
 
140
376
  position = 0
141
- tblocks = ds_in_scope.map do |term|
377
+ _tblocks = ds_in_scope.map do |term|
142
378
  lex = term[:lex]
143
379
  r = block2.index(lex, position)
144
380
  if r.nil?
@@ -146,11 +382,11 @@ class TextAlignment::TextAlignment
146
382
  break
147
383
  end
148
384
  position = r + lex.length
149
- {source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, delta: r - term[:span][:begin]}
385
+ {source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, similarity: 0.9, delta: r + b2 - term[:span][:begin]}
150
386
  end
151
387
 
152
388
  # missing term found
153
- tblocks = [] if position.nil?
389
+ _tblocks = [] if position.nil?
154
390
 
155
391
  # redundant matching found
156
392
  unless position.nil?
@@ -158,14 +394,15 @@ class TextAlignment::TextAlignment
158
394
  lex = term[:lex]
159
395
  look_forward = block2.index(lex, position)
160
396
  unless look_forward.nil?
161
- puts lex
162
- tblocks = []
397
+ _tblocks = []
163
398
  break
164
399
  end
165
400
  end
166
401
  end
167
402
 
168
- tblocks
403
+ _tblocks
404
+ else
405
+ []
169
406
  end
170
407
 
171
408
  if tblocks.empty?
@@ -237,153 +474,4 @@ class TextAlignment::TextAlignment
237
474
  end
238
475
  end
239
476
 
240
-
241
- def indices(str, target)
242
- position = 0
243
- len = target.len
244
- Enumerator.new do |yielder|
245
- while idx = str.index(target, position)
246
- yielder << idx
247
- position = idx + len
248
- end
249
- end
250
- end
251
-
252
- def transform_begin_position(begin_position)
253
- i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
254
- block = @block_alignment[:blocks][i]
255
-
256
- b = if block[:alignment] == :block || block[:alignment] == :term
257
- begin_position + block[:delta]
258
- elsif block[:alignment] == :empty
259
- if begin_position == block[:source][:begin]
260
- block[:target][:begin]
261
- else
262
- nil
263
- end
264
- else
265
- r = block[:alignment].transform_begin_position(begin_position - block[:source][:begin])
266
- r.nil? ? nil : r + block[:target][:begin]
267
- end
268
- end
269
-
270
- def transform_end_position(end_position)
271
- i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
272
- block = @block_alignment[:blocks][i]
273
-
274
- e = if block[:alignment] == :block || block[:alignment] == :term
275
- end_position + block[:delta]
276
- elsif block[:alignment] == :empty
277
- if end_position == block[:source][:end]
278
- block[:target][:end]
279
- else
280
- nil
281
- end
282
- else
283
- r = block[:alignment].transform_end_position(end_position - block[:source][:begin])
284
- r.nil? ? nil : r + block[:target][:begin]
285
- end
286
- end
287
-
288
- def transform_a_span(span)
289
- {begin: transform_begin_position(span[:begin]), end: transform_end_position(span[:end])}
290
- end
291
-
292
- def transform_spans(spans)
293
- spans.map{|span| transform_a_span(span)}
294
- end
295
-
296
- def transform_denotations!(denotations)
297
- return nil if denotations.nil?
298
- @lost_annotations = []
299
-
300
- denotations.each do |d|
301
- source = {begin:d.begin, end:d.end}
302
- d.begin = transform_begin_position(d.begin);
303
- d.end = transform_end_position(d.end);
304
- raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @str2.length
305
- rescue
306
- @lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
307
- d.begin = nil
308
- d.end = nil
309
- end
310
-
311
- @lost_annotations
312
- end
313
-
314
- def transform_hdenotations(hdenotations)
315
- return nil if hdenotations.nil?
316
- @lost_annotations = []
317
-
318
- r = hdenotations.collect do |d|
319
- t = transform_a_span(d[:span])
320
- raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @str2.length
321
- new_d = d.dup.merge({span:t})
322
- rescue
323
- @lost_annotations << {source: d[:span], target:t}
324
- nil
325
- end.compact
326
-
327
- r
328
- end
329
-
330
- def alignment_show
331
- stext = @block_alignment[:source_text]
332
- ttext = @block_alignment[:target_text]
333
-
334
- show = ''
335
- @block_alignment[:blocks].each do |a|
336
- show += case a[:alignment]
337
- when :block
338
- "===== common (block) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
339
- stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
340
- when :term
341
- "===== common (term) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
342
- stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
343
- when :empty
344
- "xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
345
- "<<<<< string 1 [#{a[:source][:begin]} - #{a[:source][:end]}]\n" +
346
- stext[a[:source][:begin] ... a[:source][:end]] + "\n\n" +
347
- ">>>>> string 2 [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
348
- ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
349
- else
350
- astr1 = ''
351
- astr2 = ''
352
-
353
- base = a[:source][:begin]
354
- astr1 = a[:alignment].sdiff.map do |c|
355
- case c.action
356
- when '='
357
- stext[c.old_position + base]
358
- when '+'
359
- '_'
360
- when '-'
361
- stext[c.old_position + base]
362
- when '!'
363
- stext[c.old_position + base] + '_'
364
- end
365
- end.join('')
366
-
367
- base = a[:target][:begin]
368
- astr2 = a[:alignment].sdiff.map do |c|
369
- case c.action
370
- when '='
371
- ttext[c.new_position + base]
372
- when '+'
373
- ttext[c.new_position + base]
374
- when '-'
375
- '_'
376
- when '!'
377
- '_' + ttext[c.new_position + base]
378
- end
379
- end.join('')
380
-
381
- "***** local mismatch [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}] (similarity: #{a[:similarity]})\n" +
382
- "[#{astr1}]\n" +
383
- "[#{astr2}]\n\n"
384
- end
385
- end
386
- show
387
- end
388
-
389
477
  end