text_alignment 0.9.1 → 0.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: bc60f2422e09058c8abc037a5f4c7e28a2c26c4b0defa3e157a478f6c691e85e
4
- data.tar.gz: 3732d51c46d0597cec005396c13e5aa7c84c766232f5de0c5b90e789a2fa77f1
3
+ metadata.gz: 710fa3dfe07e268c62de77c67abb52e1893a36a9c0fbdfbf7400328ab7f8358a
4
+ data.tar.gz: 31a086666978c4776a60b0ce19fde48beeb8a5bce599b7572f20089cfb5c7500
5
5
  SHA512:
6
- metadata.gz: '0095e5682996e5ccb6d6cc7529c40901656f169670e49d26331acad139964b528a6b3ae9c48f32844fbe2a8737f0ab66fdc4f4da51dc37808bed65e7a7447f37'
7
- data.tar.gz: b8e00566dbcba94fbfd1d84bd7d10ac6ba7677124aa8a0676797223d4969e76917ea21013cb509762a46d14324eb28e38b1d6ad7dc26cd0fcb2a30af573e6612
6
+ metadata.gz: d0a752d2203f65e48b4f10ff29eb29a77504e7cef4da1dd3602e2ea5107a99b7edff4c3609c79c7dcba0f9cc458780dc4788ff8123a781a4d77ae89236170bf5
7
+ data.tar.gz: '094ff5f662ccdc2fc14f81549126362be49fd64610f994c6ba5c4457fbbbececc1931257589ebde49211c2bce744ca033bfe3673c8af5c8a01af7679bfb626cc'
@@ -26,8 +26,9 @@ def read_text(filename)
26
26
  end
27
27
  end
28
28
 
29
- def align_denotations(denotations, source_text, target_text, debug = false)
30
- alignment = TextAlignment::TextAlignment.new(source_text, target_text, denotations)
29
+ def align_denotations(denotations, source_text, target_text, debug = false, cm = nil)
30
+ alignment = TextAlignment::TextAlignment.new(source_text, target_text, denotations, cm)
31
+ cm = alignment.cultivation_map
31
32
  new_denotations = alignment.transform_hdenotations(denotations)
32
33
 
33
34
  if debug
@@ -47,7 +48,7 @@ def align_denotations(denotations, source_text, target_text, debug = false)
47
48
  warn
48
49
 
49
50
  # return target annotations
50
- new_denotations
51
+ [new_denotations, cm]
51
52
  end
52
53
 
53
54
  def align_mannotations(source_annotations, target_text, debug = false)
@@ -58,11 +59,13 @@ def align_mannotations(source_annotations, target_text, debug = false)
58
59
  idnum_attributes = 0
59
60
  idnum_modifications = 0
60
61
 
62
+ cm = nil
61
63
  source_annotations.each_with_index do |annotations, i|
62
64
  if annotations.has_key?(:denotations) && !annotations[:denotations].empty?
63
65
  ididx = {}
64
66
  warn "[#{i}]-=-=-=-=-"
65
- denotations = align_denotations(annotations[:denotations], annotations[:text], target_text, debug)
67
+ denotations, cm = align_denotations(annotations[:denotations], annotations[:text], target_text, debug, cm)
68
+
66
69
  denotations.each do |d|
67
70
  reid = 'T' + (idnum_denotations += 1).to_s
68
71
  ididx[d[:id]] = reid
@@ -114,8 +117,9 @@ target_text = read_text(ARGV[1])
114
117
  target_annotations = if source_annotations.class == Array
115
118
  align_mannotations(source_annotations, target_text, false)
116
119
  else
117
- denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], target_text, false)
120
+ denotations, cm = align_denotations(source_annotations[:denotations], source_annotations[:text], target_text, false)
121
+ # denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], target_text, true)
118
122
  source_annotations.merge({text:target_text, denotations:denotations})
119
123
  end
120
124
 
121
- # puts target_annotations.to_json
125
+ puts target_annotations.to_json
@@ -6,76 +6,65 @@ module TextAlignment; end unless defined? TextAlignment
6
6
 
7
7
  class TextAlignment::AnchorFinder
8
8
 
9
- def initialize(source_str, target_str, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
10
- @size_ngram = _size_ngram || TextAlignment::SIZE_NGRAM
11
- @size_window = _size_window || TextAlignment::SIZE_WINDOW
12
- @sim_threshold = _text_similiarity_threshold || TextAlignment::TEXT_SIMILARITY_THRESHOLD
13
-
14
- @reverse = (target_str.length < source_str.length)
15
-
16
- @s1, @s2 = if @reverse
9
+ def initialize(source_str, target_str, cultivation_map)
10
+ @s1, @s2 = if reverse?(source_str, target_str)
17
11
  [target_str.downcase, source_str.downcase]
18
12
  else
19
13
  [source_str.downcase, target_str.downcase]
20
14
  end
21
15
 
22
- # current position in s1
23
- @beg_s1 = 0
24
- @end_s1_prev = 0
25
- @end_s2_prev = 0
26
- end
27
-
28
- def get_next_anchor
29
- # find the position of an anchor ngram in s1 and s2
30
- while @beg_s1 < (@s1.length - @size_ngram)
31
- if [' ', "\n", "\t"].include? @s1[@beg_s1]
32
- @beg_s1 += 1
33
- next
34
- end
35
- anchor = @s1[@beg_s1, @size_ngram]
16
+ @cultivation_map = cultivation_map
36
17
 
37
- # search_position = 0
38
- search_position = @end_s2_prev
39
- while @beg_s2 = @s2.index(anchor, search_position)
40
- # if both the begining points are sufficiantly close to the end points of the last match
41
- break if @beg_s1 > 0 && @beg_s2 > 0 && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
18
+ @size_ngram = TextAlignment::SIZE_NGRAM
19
+ @size_window = TextAlignment::SIZE_WINDOW
20
+ @sim_threshold = TextAlignment::TEXT_SIMILARITY_THRESHOLD
21
+ @pos_s1_final_possible_begin = @s1.length - @size_ngram - 1
42
22
 
43
- left_window_s1, left_window_s2 = get_left_windows
44
- break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
23
+ # positions of last match
24
+ @pos_s1_last_match = 0
25
+ @pos_s2_last_match = 0
26
+ end
45
27
 
46
- right_window_s1, right_window_s2 = get_right_windows
47
- break if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
28
# Reports whether the finder treats the target text as s1 (i.e. the texts are swapped).
#
# Called with both strings, it (re)computes the flag: true when the target
# is strictly shorter than the source. The result is remembered in @reverse_p.
# Called without arguments, it only returns the remembered flag
# (nil if the flag has never been computed).
def reverse?(source_str = nil, target_str = nil)
  return @reverse_p if source_str.nil?

  @reverse_p = source_str.length > target_str.length
end
48
34
 
49
- search_position = @beg_s2 + 1
50
- end
35
+ def get_next_anchor
36
+ # To find the beginning positions of an anchor ngram in s1 and s2, beginning from the last positions matched
37
+ beg_s2 = for beg_s1 in @pos_s1_last_match .. @pos_s1_final_possible_begin
51
38
 
52
- break unless @beg_s2.nil?
39
+ # To skip whitespace letters
40
+ next if [' ', "\n", "\t"].include? @s1[beg_s1]
53
41
 
54
- @beg_s1 += 1
42
+ _beg_s2 = get_beg_s2(beg_s1)
43
+ break _beg_s2 unless _beg_s2.nil?
55
44
  end
56
45
 
57
- return nil if @beg_s1 >= (@s1.length - @size_ngram)
46
+ # To return nil when it fails to find an anchor
47
+ return nil if beg_s2.class == Range
58
48
 
59
- # extend the block
60
- b1 = @beg_s1
61
- b2 = @beg_s2
62
- while b1 >= @end_s1_prev && b2 >= @end_s2_prev && @s1[b1] == @s2[b2]
49
+ # To extend the block to the left
50
+ b1 = beg_s1
51
+ b2 = beg_s2
52
+ while b1 >= @pos_s1_last_match && b2 >= @pos_s2_last_match && @s1[b1] == @s2[b2]
63
53
  b1 -= 1; b2 -= 1
64
54
  end
65
-
66
55
  b1 += 1; b2 += 1
67
56
 
68
- e1 = @beg_s1 + @size_ngram
69
- e2 = @beg_s2 + @size_ngram
57
+ # To extend the block to the right
58
+ e1 = beg_s1 + @size_ngram
59
+ e2 = beg_s2 + @size_ngram
70
60
  while @s1[e1] && @s1[e1] == @s2[e2]
71
61
  e1 += 1; e2 += 1
72
62
  end
73
63
 
74
- @end_s1_prev = e1
75
- @end_s2_prev = e2
76
- @beg_s1 = e1
64
+ @pos_s1_last_match = e1
65
+ @pos_s2_last_match = e2
77
66
 
78
- if @reverse
67
+ if reverse?
79
68
  {source:{begin:b2 , end:e2}, target:{begin:b1, end:e1}}
80
69
  else
81
70
  {source:{begin:b1 , end:e1}, target:{begin:b2, end:e2}}
@@ -84,14 +73,92 @@ class TextAlignment::AnchorFinder
84
73
 
85
74
  private
86
75
 
87
- def get_left_windows
88
- # commend below with the assumption that the beginning of a document gives a significant locational information
89
- # return if @beg_s1 < @size_window || @beg_s2 < @size_window
76
# Looks up where the n-gram starting at beg_s1 in s1 occurs in s2.
#
# The search in s2 starts from the end of the previous match
# (@pos_s2_last_match) rather than from 0, on the assumption that both
# texts present their content in the same order.
#
# Returns the begin position selected by find_valid_beg_s2, or nil when
# find_beg_s2_candidates yields no candidate at all.
def get_beg_s2(beg_s1)
  ngram = @s1[beg_s1, @size_ngram]
  hits = find_beg_s2_candidates(ngram, @pos_s2_last_match)

  hits.empty? ? nil : find_valid_beg_s2(beg_s1, hits)
end
89
+
90
# Collects the positions in s2, at or after search_position, where the
# anchor occurs.
#
# A hit for which the cultivation map returns a "search again" position is
# not collected; the scan resumes from that position instead.
#
# Returns an array of positions. The array is empty (never nil) when the
# anchor is not found, and also when more than 5 hits are collected: such a
# frequent anchor is abandoned for the sake of speed.
def find_beg_s2_candidates(anchor, search_position)
  found = []
  cursor = search_position

  loop do
    hit = @s2.index(anchor, cursor)
    break if hit.nil?

    resume_at = @cultivation_map.search_again_position(hit)
    unless resume_at.nil?
      cursor = resume_at
      next
    end

    found << hit

    # too frequent to be a useful anchor
    return [] if found.length > 5

    cursor = hit + 1
  end

  found
end
113
+
114
# Chooses, among the candidate begin positions in s2, the one that matches
# the anchor beginning at beg_s1 in s1.
#
# The candidates are tested with context windows of 10, 20 and then 30
# characters. A round is conclusive when at most one candidate passes; a
# wider window is tried only when two or more candidates pass.
#
# Fix: the window size of the current round is now passed to
# get_left_windows / get_right_windows. It used to be dropped, so all three
# rounds ran the same test with the default window size.
#
# Returns the selected position, or nil when no candidate passes.
# If even the widest window leaves several candidates passing, the first
# passing candidate of that round is returned (unchanged behavior).
def find_valid_beg_s2(beg_s1, beg_s2_candidates)
  valid_beg_s2 = nil

  (10..30).step(10).each do |size_window|
    valid_beg_s2 = nil

    completed = beg_s2_candidates.each do |beg_s2|
      next unless plausible_beg_s2?(beg_s1, beg_s2, size_window)

      # a second passing candidate makes this round ambiguous
      break unless valid_beg_s2.nil?

      valid_beg_s2 = beg_s2
    end

    # completed == nil : the round was aborted because of ambiguity
    # completed != nil : the round ran to the end (with or w/o a valid beg_s2)
    break unless completed.nil?
  end

  valid_beg_s2
end

# Tells whether beg_s2 is a plausible counterpart of beg_s1: either both
# positions are within 5 characters of the end of the last match, or the
# left or the right context windows of the given size are similar enough.
def plausible_beg_s2?(beg_s1, beg_s2, size_window)
  return true if beg_s1 > 0 && beg_s2 > 0 && (beg_s1 - @pos_s1_last_match < 5) && (beg_s2 - @pos_s2_last_match < 5)

  left_window_s1, left_window_s2 = get_left_windows(beg_s1, beg_s2, size_window)
  return true if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)

  right_window_s1, right_window_s2 = get_right_windows(beg_s1, beg_s2, size_window)
  return true if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)

  false
end
151
+
152
+ def get_left_windows(beg_s1, beg_s2, size_window = nil)
153
+ size_window ||= @size_window
154
+
155
+ # comment out below with the assumption that the beginning of a document gives a significant locational information
156
+ # return if @beg_s1 < size_window || @beg_s2 < size_window
90
157
 
91
158
  window_s1 = ''
92
- loc = @beg_s1 - 1
159
+ loc = beg_s1 - 1
93
160
  count = 0
94
- while count < @size_window && loc >= 0
161
+ while count < size_window && loc >= 0
95
162
  if @s1[loc] =~ /[0-9a-zA-Z]/
96
163
  window_s1 += @s1[loc]
97
164
  count += 1
@@ -100,9 +167,9 @@ class TextAlignment::AnchorFinder
100
167
  end
101
168
 
102
169
  window_s2 = ''
103
- loc = @beg_s2 - 1
170
+ loc = beg_s2 - 1
104
171
  count = 0
105
- while count < @size_window && loc >= 0
172
+ while count < size_window && loc >= 0
106
173
  if @s2[loc] =~ /[0-9a-zA-Z]/
107
174
  window_s2 += @s2[loc]
108
175
  count += 1
@@ -113,15 +180,17 @@ class TextAlignment::AnchorFinder
113
180
  [window_s1, window_s2]
114
181
  end
115
182
 
116
- def get_right_windows
183
+ def get_right_windows(beg_s1, beg_s2, size_window = nil)
184
+ size_window ||= @size_window
185
+
117
186
  # commend below with the assumption that the end of a document gives a significant locational
118
- # return if (@beg_s1 + @size_ngram > (@s1.length - @size_window)) || (@beg_s2 + @size_ngram > (@s2.length - @size_window))
187
+ # return if (@beg_s1 + @size_ngram > (@s1.length - size_window)) || (@beg_s2 + @size_ngram > (@s2.length - size_window))
119
188
 
120
189
  window_s1 = ''
121
- loc = @beg_s1 + @size_ngram
190
+ loc = beg_s1 + @size_ngram
122
191
  len_s1 = @s1.length
123
192
  count = 0
124
- while count < @size_window && loc < len_s1
193
+ while count < size_window && loc < len_s1
125
194
  if @s1[loc] =~ /[0-9a-zA-Z]/
126
195
  window_s1 += @s1[loc]
127
196
  count += 1
@@ -130,10 +199,10 @@ class TextAlignment::AnchorFinder
130
199
  end
131
200
 
132
201
  window_s2 = ''
133
- loc = @beg_s2 + @size_ngram
202
+ loc = beg_s2 + @size_ngram
134
203
  len_s2 = @s2.length
135
204
  count = 0
136
- while count < @size_window && loc < len_s2
205
+ while count < size_window && loc < len_s2
137
206
  if @s2[loc] =~ /[0-9a-zA-Z]/
138
207
  window_s2 += @s2[loc]
139
208
  count += 1
@@ -148,5 +217,4 @@ class TextAlignment::AnchorFinder
148
217
  return 0 if str1.nil? || str2.nil?
149
218
  String::Similarity.cosine(str1, str2, ngram:ngram_order)
150
219
  end
151
-
152
- end
220
+ end
@@ -1,6 +1,6 @@
1
1
  module TextAlignment; end unless defined? TextAlignment
2
2
 
3
- TextAlignment::MAPPINGS = [
3
+ TextAlignment::CHAR_MAPPING = [
4
4
  ["©", "(c)"], #U+00A9 (Copyright Sign)
5
5
 
6
6
  ["α", "alpha"], #U+03B1 (greek small letter alpha)
@@ -61,9 +61,10 @@ TextAlignment::MAPPINGS = [
61
61
  ["•", "*"], #U+2022 (bullet)
62
62
  [" ", " "], #U+2009 (thin space)
63
63
  [" ", " "], #U+200A (hair space)
64
- [" ", " "], #U+00A0 (no-break space)
64
+ [" ", " "], #U+00A0 (Non-Breaking space)
65
65
  [" ", " "], #U+3000 (ideographic space)
66
- ["", "-"], #U+2211 (Non-Breaking Hyphen)
66
+ ["", "-"], #U+2010 (Hyphen)
67
+ ["‑", "-"], #U+2011 (Non-Breaking Hyphen)
67
68
  ["−", "-"], #U+2212 (minus sign)
68
69
  ["–", "-"], #U+2013 (en dash)
69
70
  ["′", "'"], #U+2032 (prime)
@@ -75,98 +76,112 @@ TextAlignment::MAPPINGS = [
75
76
  ]
76
77
 
77
78
 
78
- TextAlignment::PADDING_LETTERS = ['@', '^', '|', '#', '$', '%', '&', '_'] unless defined? TextAlignment::PADDING_LETTERS
79
+ class TextAlignment::CharMapping
80
+ attr_reader :str
79
81
 
82
+ def initialize(_str, char_mapping = nil)
83
+ char_mapping ||= TextAlignment::CHAR_MAPPING
84
+ @str, offset_mapping = enmap_str(_str, char_mapping)
85
+ @index_enmap = offset_mapping.to_h
86
+ @index_demap = offset_mapping.map{|m| m.reverse}.to_h
87
+ end
88
+
89
+ def enmap_position(position)
90
+ @index_enmap[position]
91
+ end
80
92
 
81
- class << TextAlignment
82
- def single_character_mapping_preprocessing(_str1, _str2, _mappings = nil)
83
- _mappings ||= TextAlignment::MAPPINGS
93
+ def demap_position(position)
94
+ @index_demap[position]
95
+ end
84
96
 
85
- character_mappings = _mappings.select{|m| m[0].length == 1 && m[1].length == 1}
86
- if character_mappings.empty?
87
- [_str1, _str2, _mappings]
88
- else
89
- characters_from = character_mappings.collect{|m| m[0]}.join
90
- characters_to = character_mappings.collect{|m| m[1]}.join
91
- characters_to.gsub!(/-/, '\-')
97
+ def enmap_denotations(_denotations)
98
+ denotations = _denotations.map do |d|
99
+ d.dup.merge(span:{begin:enmap_position(d[:span][:begin]), end:enmap_position(d[:span][:end])})
100
+ end
101
+ end
92
102
 
93
- str1 = _str1.tr(characters_from, characters_to)
94
- str2 = _str2.tr(characters_from, characters_to)
103
+ private
95
104
 
96
- mappings = _mappings.select{|m| m[0].length > 1 || m[1].length > 1}
105
+ def enmap_str(_str, char_mapping)
106
+ str = _str.dup
97
107
 
98
- [str1, str2, mappings]
108
+ # To execute the single letter mapping
109
+ char_mapping.each do |one, long|
110
+ str.gsub!(one, long) if long.length == 1
99
111
  end
100
- end
101
112
 
102
- def long_to_one_mapping_preprocessing(_str1, _str2, _mappings = nil)
103
- _mappings ||= TextAlignment::MAPPINGS
104
-
105
- long_to_one_mappings = _mappings.select{|m| m[0].length == 1 && m[1].length > 1}
106
- if long_to_one_mappings.empty?
107
- [_str1, _str2, _mappings]
108
- else
109
- ## long to one character mappings
110
- pletters = TextAlignment::PADDING_LETTERS
111
-
112
- # find the padding letter for str1
113
- @padding_letter1 = begin
114
- i = pletters.index{|l| _str2.index(l).nil?}
115
- raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
116
- TextAlignment::PADDING_LETTERS[i]
117
- end
113
+ # To get the (location, length) index for replacements
114
+ loc_len = []
115
+ char_mapping.each do |one, long|
116
+ next if long.length == 1
118
117
 
119
- # find the padding letter for str2
120
- @padding_letter2 = begin
121
- i = pletters.index{|l| l != @padding_letter1 && _str1.index(l).nil?}
122
- raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
123
- TextAlignment::PADDING_LETTERS[i]
118
+ init_next = 0
119
+ while loc = str.index(long, init_next)
120
+ loc_len << [loc, long.length]
121
+ init_next = loc + long.length
124
122
  end
125
123
 
126
- str1 = str2 = nil
127
- long_to_one_mappings.each do |f|
128
- from = f[1]
129
-
130
- str1 = if _str2.index(f[0])
131
- to = f[0] + (@padding_letter1 * (f[1].length - 1))
132
- _str1.gsub(from, to)
133
- else
134
- _str1
135
- end
136
-
137
- str2 = if _str1.index(f[0])
138
- to = f[0] + (@padding_letter2 * (f[1].length - 1))
139
- _str2.gsub(from, to)
140
- else
141
- _str2
142
- end
143
- end
144
- mappings = _mappings.select{|m| m[0].length > 1 || m[1].length == 1}
124
+ # a workaround to avoid messing-up due to embedding
125
+ str.gsub!(long, one * long.length)
126
+ end
145
127
 
146
- [str1, str2, mappings]
128
+ # To get the (location, length) index for consecutive whitespace sequences
129
+ init_next = 0
130
+ while loc = str.index(/\s{2,}/, init_next)
131
+ len = $~[0].length
132
+ loc_len << [loc, len]
133
+ init_next = loc + len
147
134
  end
148
- end
149
135
 
150
- def compute_similarity(_s1, _s2, sdiff)
151
- return 0 if sdiff.nil?
136
+ loc_len.sort!{|a, b| a[0] <=> b[0]}
152
137
 
153
- # compute the lcs only with non-whitespace letters
154
- lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
155
- return 0 if lcs == 0
138
+ # To get the offset_mapping before and after replacement
139
+ offset_mapping = []
140
+ init_next = 0
141
+ j = 0
156
142
 
157
- s1 = if @padding_letter1
158
- _s1.tr(@padding_letter1, ' ')
159
- else
160
- _s1
143
+ loc_len.each do |loc, len|
144
+ offset_mapping += (init_next .. loc).map do |i|
145
+ j += 1
146
+ [i, j - 1]
147
+ end
148
+ init_next = loc + len
149
+ end
150
+
151
+ offset_mapping += (init_next .. str.length).map do |i|
152
+ j += 1
153
+ [i, j - 1]
161
154
  end
162
155
 
163
- s2 = if @padding_letter2
164
- _s2.tr(@padding_letter2, ' ')
165
- else
166
- _s2
156
+ # To execute the long letter mapping
157
+ char_mapping.each do |one, long|
158
+ str.gsub!(one * long.length, one) if long.length > 1
167
159
  end
168
160
 
169
- similarity = lcs.to_f / [s1.scan(/\S/).count, s2.scan(/\S/).count].min
161
+ # To replace multi whitespace sequences to a space
162
+ str.gsub!(/\s{2,}/, ' ')
163
+
164
+ [str, offset_mapping]
165
+ end
166
+ end
167
+
168
+ if __FILE__ == $0
169
+ require 'json'
170
+
171
+ unless ARGV.length == 1
172
+ warn "#{$0} an_annotation_json_file.json"
173
+ exit
170
174
  end
175
+ annotations = JSON.parse File.read(ARGV[0]).strip, symbolize_names: true
176
+ denotations = annotations[:denotations]
177
+ if denotations.nil? && annotations[:tracks]
178
+ denotations = annotations[:tracks].first[:denotations]
179
+ end
180
+
181
+ str_mapping = TextAlignment::CharMapping.new(annotations[:text])
182
+ str_mapped = str_mapping.str
183
+ denotations_mapped = str_mapping.enmap_denotations(denotations)
184
+ new_annotations = {text:str_mapped, denotations:denotations_mapped}
171
185
 
186
+ puts new_annotations.to_json
172
187
  end
@@ -0,0 +1,19 @@
1
module TextAlignment; end unless defined? TextAlignment

# Keeps track of the regions ("cultivated" regions) registered so far.
# For every position inside a registered region, the map stores the end
# position of that region, i.e. where a search may be resumed.
class TextAlignment::CultivationMap
  # Hash from a cultivated position to the end of its region
  attr_reader :map

  def initialize
    @map = {}
  end

  # Registers regions, given as [begin, end] pairs with an exclusive end.
  # A position registered more than once keeps the end registered last.
  def cultivate(regions)
    regions.each do |first, past_last|
      first.upto(past_last - 1) { |pos| @map[pos] = past_last }
    end
  end

  # Returns the end of the cultivated region containing the position,
  # or nil when the position is not cultivated.
  def search_again_position(position)
    @map[position]
  end
end
@@ -5,7 +5,7 @@ require 'text_alignment/find_divisions'
5
5
  require 'text_alignment/lcs_comparison'
6
6
  require 'text_alignment/lcs_alignment'
7
7
  require 'text_alignment/glcs_alignment'
8
- require 'text_alignment/mappings'
8
+ require 'text_alignment/char_mapping'
9
9
 
10
10
  module TextAlignment; end unless defined? TextAlignment
11
11
 
@@ -106,7 +106,7 @@ if __FILE__ == $0
106
106
 
107
107
  dictionary = [["β", "beta"]]
108
108
  # align = TextAlignment::TextAlignment.new(str1, str2)
109
- align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::MAPPINGS)
109
+ align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::CHAR_MAPPING)
110
110
  p align.common_elements
111
111
  p align.mapped_elements
112
112
  end
@@ -6,7 +6,7 @@ require 'text_alignment/lcs_comparison'
6
6
  require 'text_alignment/lcs_alignment'
7
7
  require 'text_alignment/lcs_cdiff'
8
8
  require 'text_alignment/glcs_alignment'
9
- require 'text_alignment/mappings'
9
+ require 'text_alignment/char_mapping'
10
10
 
11
11
  module TextAlignment; end unless defined? TextAlignment
12
12
 
@@ -20,7 +20,9 @@ class TextAlignment::MixedAlignment
20
20
  def initialize(_str1, _str2, _mappings = nil)
21
21
  raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
22
22
 
23
- str1, str2, mappings = TextAlignment::long_to_one_mapping_preprocessing(_str1, _str2, _mappings)
23
+ mappings ||= TextAlignment::CHAR_MAPPING
24
+ str1 = _str1.dup
25
+ str2 = _str2.dup
24
26
 
25
27
  _compute_mixed_alignment(str1, str2, mappings)
26
28
  end
@@ -63,7 +65,7 @@ class TextAlignment::MixedAlignment
63
65
  end
64
66
 
65
67
  cmp = TextAlignment::LCSComparison.new(str1, str2, lcs, @sdiff)
66
- @similarity = TextAlignment::compute_similarity(str1, str2, @sdiff)
68
+ @similarity = compute_similarity(str1, str2, @sdiff)
67
69
  @str1_match_initial = cmp.str1_match_initial
68
70
  @str1_match_final = cmp.str1_match_final
69
71
  @str2_match_initial = cmp.str2_match_initial
@@ -139,4 +141,14 @@ class TextAlignment::MixedAlignment
139
141
  @position_map_end = posmap_end.sort.to_h
140
142
  end
141
143
 
144
# Computes the similarity of s1 and s2 from their sdiff.
#
# Only non-whitespace characters are considered: the number of unchanged
# ('=') sdiff elements is divided by the non-whitespace length of the
# shorter string.
#
# Returns 0 when sdiff is nil or when there is no common non-whitespace
# character; otherwise a Float.
def compute_similarity(s1, s2, sdiff)
  return 0 if sdiff.nil?

  common = sdiff.count do |d|
    d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/
  end
  return 0 if common == 0

  shorter_length = [s1, s2].map { |s| s.scan(/\S/).count }.min
  common.to_f / shorter_length
end
153
+
142
154
  end
@@ -2,6 +2,7 @@
2
2
  require 'text_alignment/constants'
3
3
  require 'text_alignment/anchor_finder'
4
4
  require 'text_alignment/mixed_alignment'
5
+ require 'text_alignment/cultivation_map'
5
6
 
6
7
  module TextAlignment; end unless defined? TextAlignment
7
8
 
@@ -9,23 +10,206 @@ class TextAlignment::TextAlignment
9
10
  attr_reader :block_alignment
10
11
  attr_reader :similarity
11
12
  attr_reader :lost_annotations
13
+ attr_reader :cultivation_map
12
14
 
13
- def initialize(_str1, _str2, denotations = nil, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
15
# Aligns _str1 (source) with _str2 (target).
#
# _denotations     : optional denotations on the source text; they are
#                    handed, in mapped coordinates, to the block alignment.
# _cultivation_map : optional TextAlignment::CultivationMap shared between
#                    successive alignments against the same target; a new
#                    one is created when omitted.
#
# The resulting blocks are stored in @block_alignment[:blocks], and the
# target regions of the :block and :term blocks are registered in the
# cultivation map.
#
# Fix: _denotations defaults to nil, but it used to be passed to
# CharMapping#enmap_denotations unconditionally, which raised NoMethodError
# for every call without denotations. nil is now passed through as it is.
# NOTE(review): this assumes find_block_alignment accepts nil denotations,
# as the signature default suggests - confirm.
#
# Raises ArgumentError when either string is nil.
def initialize(_str1, _str2, _denotations = nil, _cultivation_map = nil)
  raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?

  @block_alignment = {source_text: _str1, target_text: _str2, denotations: _denotations}
  @original_str1 = _str1
  @original_str2 = _str2

  # the alignment itself is computed on the character-mapped texts
  @str1_mapping = TextAlignment::CharMapping.new(_str1)
  @str2_mapping = TextAlignment::CharMapping.new(_str2)

  str1 = @str1_mapping.str
  str2 = @str2_mapping.str
  denotations = _denotations.nil? ? nil : @str1_mapping.enmap_denotations(_denotations)

  @cultivation_map = _cultivation_map || TextAlignment::CultivationMap.new

  # try the whole block alignment first; fall back to the block-wise alignment
  @block_alignment[:blocks] = whole_block_alignment(str1, str2, @cultivation_map) ||
                              find_block_alignment(str1, str2, denotations, @cultivation_map)

  # target regions which have just been matched exactly
  newly_cultivated_regions = @block_alignment[:blocks].map do |b|
    [b[:target][:begin], b[:target][:end]] if b[:alignment] == :block || b[:alignment] == :term
  end.compact

  # merge the regions which touch or overlap each other
  newly_cultivated_regions_condensed = newly_cultivated_regions.each_with_object([]) do |region, condensed|
    if condensed.empty? || (condensed.last.last + 1 < region.first)
      condensed.push region
    else
      condensed.last[1] = region.last
    end
  end

  @cultivation_map.cultivate(newly_cultivated_regions_condensed)
end
57
+
58
# Transforms a begin position in the source text into the corresponding
# begin position in the target text.
#
# The position is first mapped into the coordinates of the char-mapped
# source, then transformed through the first block whose source end is
# greater than it, and finally mapped back to the original target text.
#
# Returns nil when the position cannot be transformed.
#
# Fixes:
# - :empty blocks may come without :target; reading block[:target][:begin]
#   on such a block raised NoMethodError.
# - a position that cannot be mapped, or that no block covers, raised an
#   exception (comparison with nil / TypeError on blocks[nil]); nil is now
#   returned, like for the other untransformable positions.
def transform_begin_position(_begin_position)
  begin_position = @str1_mapping.enmap_position(_begin_position)
  return nil if begin_position.nil?

  block = @block_alignment[:blocks].find { |b| b[:source][:end] > begin_position }
  return nil if block.nil?

  b = case block[:alignment]
      when :block, :term
        begin_position + block[:delta]
      when :empty
        # only the very beginning of an empty block has a counterpart
        block.dig(:target, :begin) if begin_position == block[:source][:begin]
      else
        # block[:alignment] is an alignment object working on block-local offsets
        r = block[:alignment].transform_begin_position(begin_position - block[:source][:begin])
        r.nil? ? nil : r + block[:target][:begin]
      end

  b.nil? ? nil : @str2_mapping.demap_position(b)
end
79
+
80
# Transforms an end position in the source text into the corresponding
# end position in the target text.
#
# The position is first mapped into the coordinates of the char-mapped
# source, then transformed through the first block whose source end is
# greater than or equal to it, and finally mapped back to the original
# target text.
#
# Returns nil when the position cannot be transformed.
#
# Fixes:
# - :empty blocks may come without :target; reading block[:target][:end]
#   on such a block raised NoMethodError.
# - a position that cannot be mapped, or that no block covers, raised an
#   exception (comparison with nil / TypeError on blocks[nil]); nil is now
#   returned, like for the other untransformable positions.
def transform_end_position(_end_position)
  end_position = @str1_mapping.enmap_position(_end_position)
  return nil if end_position.nil?

  block = @block_alignment[:blocks].find { |b| b[:source][:end] >= end_position }
  return nil if block.nil?

  e = case block[:alignment]
      when :block, :term
        end_position + block[:delta]
      when :empty
        # only the very end of an empty block has a counterpart
        block.dig(:target, :end) if end_position == block[:source][:end]
      else
        # block[:alignment] is an alignment object working on block-local offsets
        r = block[:alignment].transform_end_position(end_position - block[:source][:begin])
        r.nil? ? nil : r + block[:target][:begin]
      end

  e.nil? ? nil : @str2_mapping.demap_position(e)
end
101
+
102
# Transforms a span ({begin:, end:}) of the source text into a span of the
# target text. Either side of the result is whatever
# transform_begin_position / transform_end_position returns for it.
def transform_a_span(span)
  target_begin = transform_begin_position(span[:begin])
  target_end = transform_end_position(span[:end])

  {begin: target_begin, end: target_end}
end
105
+
106
# Transforms every span of the given list with transform_a_span,
# and returns the results in the same order.
def transform_spans(spans)
  spans.collect do |span|
    transform_a_span(span)
  end
end
109
+
110
# Transforms, in place, the positions of denotation objects (which respond
# to begin/end and begin=/end=) from the source text to the target text.
#
# A denotation whose transformation raises, or results in an invalid span
# (nil position, negative begin, end not after begin, or end beyond the
# length of the original target text), is recorded in @lost_annotations
# and gets both its positions set to nil.
#
# Returns the list of the lost annotations, or nil when denotations is nil.
def transform_denotations!(denotations)
  return nil if denotations.nil?

  @lost_annotations = []

  denotations.each do |d|
    begin
      source = {begin: d.begin, end: d.end}

      d.begin = transform_begin_position(d.begin)
      d.end = transform_end_position(d.end)

      valid = !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_str2.length
      raise "invalid transform" unless valid
    rescue
      @lost_annotations << {source: source, target: {begin: d.begin, end: d.end}}
      d.begin = nil
      d.end = nil
    end
  end

  @lost_annotations
end
127
+
128
# Transforms hash denotations (each with a :span of {begin:, end:}) from
# the source text to the target text.
#
# Returns a new array with copies of the denotations carrying the
# transformed spans; the given hashes are not modified. A denotation whose
# transformation raises, or results in an invalid span (nil position,
# negative begin, end not after begin, or end beyond the length of the
# original target text), is left out and recorded in @lost_annotations.
#
# Returns nil when hdenotations is nil.
def transform_hdenotations(hdenotations)
  return nil if hdenotations.nil?

  @lost_annotations = []
  transformed = []

  hdenotations.each do |d|
    begin
      t = transform_a_span(d[:span])

      valid = !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_str2.length
      raise "invalid transform" unless valid

      transformed << d.merge({span: t})
    rescue
      # t is nil here when transform_a_span itself raised
      @lost_annotations << {source: d[:span], target: t}
    end
  end

  transformed
end
143
+
144
# Renders the block alignment as a human-readable string, one section per
# block:
# - :block / :term blocks show the common text, taken from the source;
# - :empty blocks show the source text and, when the block has a :target,
#   the target text ("[-]" otherwise);
# - any other block holds an alignment object, whose sdiff is rendered as
#   two parallel lines with '_' standing for the gaps.
#
# NOTE(review): the texts sliced here are the ones stored in
# @block_alignment (:source_text / :target_text), while the block offsets
# look like they are computed on the char-mapped strings - confirm that the
# offsets line up when the mapping changes the text length.
def alignment_show
  stext = @block_alignment[:source_text]
  ttext = @block_alignment[:target_text]

  # "[begin - end]" label of a span
  label = ->(span) { "[#{span[:begin]} - #{span[:end]}]" }

  @block_alignment[:blocks].reduce('') do |show, a|
    source = a[:source]
    target = a[:target]

    section =
      case a[:alignment]
      when :block, :term
        "===== common (#{a[:alignment]}) ===== #{label.call(source)} #{label.call(target)}\n" +
          stext[source[:begin]...source[:end]] + "\n\n"
      when :empty
        source_part = "xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
                      "<<<<< string 1 #{label.call(source)}\n" +
                      stext[source[:begin]...source[:end]] + "\n\n" +
                      ">>>>> string 2 "
        target_part = if target
                        "#{label.call(target)}\n" + ttext[target[:begin]...target[:end]] + "\n\n"
                      else
                        "[-]\n\n"
                      end
        source_part + target_part
      else
        source_base = source[:begin]
        astr1 = a[:alignment].sdiff.map do |c|
          case c.action
          when '=', '-'
            stext[c.old_position + source_base]
          when '+'
            '_'
          when '!'
            stext[c.old_position + source_base] + '_'
          end
        end.join('')

        target_base = target[:begin]
        astr2 = a[:alignment].sdiff.map do |c|
          case c.action
          when '=', '+'
            ttext[c.new_position + target_base]
          when '-'
            '_'
          when '!'
            '_' + ttext[c.new_position + target_base]
          end
        end.join('')

        "***** local mismatch #{label.call(source)} #{label.call(target)} (similarity: #{a[:similarity]})\n" +
          "[#{astr1}]\n" +
          "[#{astr2}]\n\n"
      end

    show + section
  end
end
207
+
208
+ private
209
+
210
+ def find_block_alignment(str1, str2, denotations, cultivation_map)
27
211
  ## to find block alignments
28
- anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
212
+ anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, cultivation_map)
29
213
 
30
214
  blocks = []
31
215
  while block = anchor_finder.get_next_anchor
@@ -68,12 +252,13 @@ class TextAlignment::TextAlignment
68
252
 
69
253
  if b2 == e2
70
254
  [
71
- {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
255
+ {source:{begin:b1, end:e1}, alignment: :empty},
72
256
  block
73
257
  ]
74
258
  else
259
+ len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
260
+
75
261
  if b1 == 0 && b2 == 0
76
- len_buffer = (e1 * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
77
262
  b2 = e2 - len_buffer if e2 > len_buffer
78
263
  end
79
264
 
@@ -85,6 +270,10 @@ class TextAlignment::TextAlignment
85
270
  {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
86
271
  block
87
272
  ]
273
+ elsif ((e2 - b2) - (e1 - b1)) > len_buffer
274
+ la_block1 = local_alignment_blocks(str1, b1, e1, str2, b2, b2 + len_buffer, denotations)
275
+ la_block2 = local_alignment_blocks(str1, b1, e1, str2, e2 - len_buffer, e2, denotations)
276
+ [la_block2, la_block2].max{|a, b| a.first[:similarity] <=> b.first[:similarity]} << block
88
277
  else
89
278
  local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations) << block
90
279
  end
@@ -102,31 +291,53 @@ class TextAlignment::TextAlignment
102
291
  b1 = last_block[:source][:end]
103
292
  if b1 < str1.length
104
293
  e1 = str1.length
105
-
106
294
  b2 = last_block[:target][:end]
107
- if b2 < str2.length
108
- len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
109
- e2 = (str2.length - b2) > len_buffer ? b2 + len_buffer : str2.length
110
- local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations)
295
+
296
+ _str1 = str1[b1 ... e1]
297
+ if _str1.strip.empty?
298
+ [{source:{begin:b1, end:e1}, alignment: :empty}]
111
299
  else
112
- [{source:{begin:last_block[:source][:end], end:str1.length}, alignment: :empty}]
300
+ if b2 < str2.length
301
+ len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
302
+ e2 = (str2.length - b2) > len_buffer ? b2 + len_buffer : str2.length
303
+
304
+ local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations)
305
+ else
306
+ [{source:{begin:b1, end:e1}, alignment: :empty}]
307
+ end
113
308
  end
114
309
  else
115
310
  []
116
311
  end
117
312
  end
118
-
119
- @block_alignment[:blocks] = blocks2
120
313
  end
121
314
 
122
- def whole_block_alignment(str1, str2)
315
+ def whole_block_alignment(str1, str2, cultivation_map)
123
316
  ## Block exact match
124
- block_begin = str2.index(str1)
317
+ search_position = 0
318
+
319
+ block_begin = begin
320
+ _block_begin = str2.index(str1, search_position)
321
+ break if _block_begin.nil?
322
+ search_position = cultivation_map.search_again_position(_block_begin)
323
+ _block_begin
324
+ end until search_position.nil?
325
+
125
326
  unless block_begin.nil?
126
327
  return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
127
328
  end
128
329
 
129
- block_begin = str2.downcase.index(str1.downcase)
330
+ search_position = 0
331
+
332
+ dstr1 = str1.downcase
333
+ dstr2 = str2.downcase
334
+ block_begin = begin
335
+ _block_begin = dstr2.index(dstr1, search_position)
336
+ break if _block_begin.nil?
337
+ search_position = cultivation_map.search_again_position(_block_begin)
338
+ _block_begin
339
+ end until search_position.nil?
340
+
130
341
  unless block_begin.nil?
131
342
  return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
132
343
  end
@@ -144,7 +355,7 @@ class TextAlignment::TextAlignment
144
355
  map{|d| d.merge(lex:str1[d[:span][:begin] ... d[:span][:end]])}
145
356
 
146
357
  position = 0
147
- tblocks = ds_in_scope.map do |term|
358
+ _tblocks = ds_in_scope.map do |term|
148
359
  lex = term[:lex]
149
360
  r = block2.index(lex, position)
150
361
  if r.nil?
@@ -152,11 +363,11 @@ class TextAlignment::TextAlignment
152
363
  break
153
364
  end
154
365
  position = r + lex.length
155
- {source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, delta: r + b2 - term[:span][:begin]}
366
+ {source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, similarity: 0.9, delta: r + b2 - term[:span][:begin]}
156
367
  end
157
368
 
158
369
  # missing term found
159
- tblocks = [] if position.nil?
370
+ _tblocks = [] if position.nil?
160
371
 
161
372
  # redundant matching found
162
373
  unless position.nil?
@@ -164,13 +375,13 @@ class TextAlignment::TextAlignment
164
375
  lex = term[:lex]
165
376
  look_forward = block2.index(lex, position)
166
377
  unless look_forward.nil?
167
- tblocks = []
378
+ _tblocks = []
168
379
  break
169
380
  end
170
381
  end
171
382
  end
172
383
 
173
- tblocks
384
+ _tblocks
174
385
  else
175
386
  []
176
387
  end
@@ -184,7 +395,7 @@ class TextAlignment::TextAlignment
184
395
  block2 = str2[b2 ... e2]
185
396
 
186
397
  ## character-based alignment
187
- alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase, @mappings)
398
+ alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
188
399
  if alignment.sdiff.nil?
189
400
  [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
190
401
  else
@@ -196,7 +407,7 @@ class TextAlignment::TextAlignment
196
407
  block2 = str2[b2 ... e2]
197
408
 
198
409
  ## character-based alignment
199
- alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase, @mappings)
410
+ alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
200
411
  if alignment.sdiff.nil?
201
412
  [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
202
413
  else
@@ -244,157 +455,4 @@ class TextAlignment::TextAlignment
244
455
  end
245
456
  end
246
457
 
247
-
248
- def indices(str, target)
249
- position = 0
250
- len = target.len
251
- Enumerator.new do |yielder|
252
- while idx = str.index(target, position)
253
- yielder << idx
254
- position = idx + len
255
- end
256
- end
257
- end
258
-
259
- def transform_begin_position(begin_position)
260
- i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
261
- block = @block_alignment[:blocks][i]
262
-
263
- b = if block[:alignment] == :block || block[:alignment] == :term
264
- begin_position + block[:delta]
265
- elsif block[:alignment] == :empty
266
- if begin_position == block[:source][:begin]
267
- block[:target][:begin]
268
- else
269
- nil
270
- end
271
- else
272
- r = block[:alignment].transform_begin_position(begin_position - block[:source][:begin])
273
- r.nil? ? nil : r + block[:target][:begin]
274
- end
275
- end
276
-
277
- def transform_end_position(end_position)
278
- i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
279
- block = @block_alignment[:blocks][i]
280
-
281
- e = if block[:alignment] == :block || block[:alignment] == :term
282
- end_position + block[:delta]
283
- elsif block[:alignment] == :empty
284
- if end_position == block[:source][:end]
285
- block[:target][:end]
286
- else
287
- nil
288
- end
289
- else
290
- r = block[:alignment].transform_end_position(end_position - block[:source][:begin])
291
- r.nil? ? nil : r + block[:target][:begin]
292
- end
293
- end
294
-
295
- def transform_a_span(span)
296
- {begin: transform_begin_position(span[:begin]), end: transform_end_position(span[:end])}
297
- end
298
-
299
- def transform_spans(spans)
300
- spans.map{|span| transform_a_span(span)}
301
- end
302
-
303
- def transform_denotations!(denotations)
304
- return nil if denotations.nil?
305
- @lost_annotations = []
306
-
307
- denotations.each do |d|
308
- source = {begin:d.begin, end:d.end}
309
- d.begin = transform_begin_position(d.begin);
310
- d.end = transform_end_position(d.end);
311
- raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_str2.length
312
- rescue
313
- @lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
314
- d.begin = nil
315
- d.end = nil
316
- end
317
-
318
- @lost_annotations
319
- end
320
-
321
- def transform_hdenotations(hdenotations)
322
- return nil if hdenotations.nil?
323
- @lost_annotations = []
324
-
325
- r = hdenotations.collect do |d|
326
- t = transform_a_span(d[:span])
327
- raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_str2.length
328
- new_d = d.dup.merge({span:t})
329
- rescue
330
- @lost_annotations << {source: d[:span], target:t}
331
- nil
332
- end.compact
333
-
334
- r
335
- end
336
-
337
- def alignment_show
338
- stext = @block_alignment[:source_text]
339
- ttext = @block_alignment[:target_text]
340
-
341
- show = ''
342
- @block_alignment[:blocks].each do |a|
343
- show += case a[:alignment]
344
- when :block
345
- "===== common (block) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
346
- stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
347
- when :term
348
- "===== common (term) ===== [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
349
- stext[a[:source][:begin] ... a[:source][:end]] + "\n\n"
350
- when :empty
351
- "xxxxx disparate texts (similarity: #{a[:similarity]})\n" +
352
- "<<<<< string 1 [#{a[:source][:begin]} - #{a[:source][:end]}]\n" +
353
- stext[a[:source][:begin] ... a[:source][:end]] + "\n\n" +
354
- ">>>>> string 2 " +
355
- if a[:target]
356
- "[#{a[:target][:begin]} - #{a[:target][:end]}]\n" +
357
- ttext[a[:target][:begin] ... a[:target][:end]] + "\n\n"
358
- else
359
- "[-]\n\n"
360
- end
361
- else
362
- astr1 = ''
363
- astr2 = ''
364
-
365
- base = a[:source][:begin]
366
- astr1 = a[:alignment].sdiff.map do |c|
367
- case c.action
368
- when '='
369
- stext[c.old_position + base]
370
- when '+'
371
- '_'
372
- when '-'
373
- stext[c.old_position + base]
374
- when '!'
375
- stext[c.old_position + base] + '_'
376
- end
377
- end.join('')
378
-
379
- base = a[:target][:begin]
380
- astr2 = a[:alignment].sdiff.map do |c|
381
- case c.action
382
- when '='
383
- ttext[c.new_position + base]
384
- when '+'
385
- ttext[c.new_position + base]
386
- when '-'
387
- '_'
388
- when '!'
389
- '_' + ttext[c.new_position + base]
390
- end
391
- end.join('')
392
-
393
- "***** local mismatch [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}] (similarity: #{a[:similarity]})\n" +
394
- "[#{astr1}]\n" +
395
- "[#{astr2}]\n\n"
396
- end
397
- end
398
- show
399
- end
400
458
  end
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.9.1'
2
+ VERSION = '0.10.1'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.9.1
4
+ version: 0.10.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-01-20 00:00:00.000000000 Z
11
+ date: 2021-03-03 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary
@@ -77,7 +77,9 @@ files:
77
77
  - lib/text_alignment.rb
78
78
  - lib/text_alignment/anchor_finder.rb
79
79
  - lib/text_alignment/approximate_fit.rb
80
+ - lib/text_alignment/char_mapping.rb
80
81
  - lib/text_alignment/constants.rb
82
+ - lib/text_alignment/cultivation_map.rb
81
83
  - lib/text_alignment/find_divisions.rb
82
84
  - lib/text_alignment/glcs_alignment.rb
83
85
  - lib/text_alignment/glcs_alignment_fast.rb
@@ -86,7 +88,6 @@ files:
86
88
  - lib/text_alignment/lcs_cdiff.rb
87
89
  - lib/text_alignment/lcs_comparison.rb
88
90
  - lib/text_alignment/lcs_min.rb
89
- - lib/text_alignment/mappings.rb
90
91
  - lib/text_alignment/mixed_alignment.rb
91
92
  - lib/text_alignment/text_alignment.rb
92
93
  - lib/text_alignment/version.rb