text_alignment 0.9.1 → 0.11.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: bc60f2422e09058c8abc037a5f4c7e28a2c26c4b0defa3e157a478f6c691e85e
4
- data.tar.gz: 3732d51c46d0597cec005396c13e5aa7c84c766232f5de0c5b90e789a2fa77f1
3
+ metadata.gz: 0f019e7fbd144890e96eda8f2cf9b27cf091930c96b81236452172a5142e2cf3
4
+ data.tar.gz: 47d32ec727511d53730bf56557992f972f7747832f9437f5e2f5798cd0764f41
5
5
  SHA512:
6
- metadata.gz: '0095e5682996e5ccb6d6cc7529c40901656f169670e49d26331acad139964b528a6b3ae9c48f32844fbe2a8737f0ab66fdc4f4da51dc37808bed65e7a7447f37'
7
- data.tar.gz: b8e00566dbcba94fbfd1d84bd7d10ac6ba7677124aa8a0676797223d4969e76917ea21013cb509762a46d14324eb28e38b1d6ad7dc26cd0fcb2a30af573e6612
6
+ metadata.gz: bf2720ce7af3612a8c0b1823bf6265e90f0d5e92f315d7eb697c4b13e1c9752e795adb5b1dbc840629379f3e96cf94115dd5d9400663f1d43a4caf428274f69a
7
+ data.tar.gz: d7bca56968c54fa68d83b5e9d89aa89107774cde761117e28d561cf9c63226b08a613e84b3484f2df58edaadfac7e1b286155bc3486559d6fd7c63d6ec082907
@@ -26,8 +26,8 @@ def read_text(filename)
26
26
  end
27
27
  end
28
28
 
29
- def align_denotations(denotations, source_text, target_text, debug = false)
30
- alignment = TextAlignment::TextAlignment.new(source_text, target_text, denotations)
29
+ def align_denotations(denotations, source_text, alignment, debug = false)
30
+ alignment.align(source_text, denotations)
31
31
  new_denotations = alignment.transform_hdenotations(denotations)
32
32
 
33
33
  if debug
@@ -37,7 +37,7 @@ def align_denotations(denotations, source_text, target_text, debug = false)
37
37
  end
38
38
 
39
39
  lost_annotations = alignment.lost_annotations
40
- unless lost_annotations.empty?
40
+ unless lost_annotations.nil? || lost_annotations.empty?
41
41
  warn "\n[lost annotations] #{lost_annotations.length}"
42
42
  lost_annotations.each do |a|
43
43
  warn "#{a}"
@@ -50,8 +50,8 @@ def align_denotations(denotations, source_text, target_text, debug = false)
50
50
  new_denotations
51
51
  end
52
52
 
53
- def align_mannotations(source_annotations, target_text, debug = false)
54
- target_annotations = {text:target_text}
53
+ def align_mannotations(source_annotations, reference_text, alignment, debug = false)
54
+ target_annotations = {text:reference_text}
55
55
 
56
56
  idnum_denotations = 0
57
57
  idnum_relations = 0
@@ -62,7 +62,8 @@ def align_mannotations(source_annotations, target_text, debug = false)
62
62
  if annotations.has_key?(:denotations) && !annotations[:denotations].empty?
63
63
  ididx = {}
64
64
  warn "[#{i}]-=-=-=-=-"
65
- denotations = align_denotations(annotations[:denotations], annotations[:text], target_text, debug)
65
+ denotations = align_denotations(annotations[:denotations], annotations[:text], alignment, debug)
66
+
66
67
  denotations.each do |d|
67
68
  reid = 'T' + (idnum_denotations += 1).to_s
68
69
  ididx[d[:id]] = reid
@@ -76,7 +77,9 @@ def align_mannotations(source_annotations, target_text, debug = false)
76
77
  annotations[:relations].each do |r|
77
78
  reid = 'R' + (idnum_relations += 1).to_s
78
79
  ididx[r[:id]] = reid
79
- target_annotations[:relations] << r.dup.merge({id:reid, subj:ididx[r[:subj]], obj:ididx[r[:obj]]})
80
+ sid = ididx[r[:subj]]
81
+ oid = ididx[r[:obj]]
82
+ target_annotations[:relations] << r.dup.merge({id:reid, subj:sid, obj:oid}) unless sid.nil? || oid.nil?
80
83
  end
81
84
  end
82
85
 
@@ -85,7 +88,8 @@ def align_mannotations(source_annotations, target_text, debug = false)
85
88
  annotations[:attributes].each do |a|
86
89
  reid = 'A' + (idnum_attributes += 1).to_s
87
90
  ididx[a[:id]] = reid
88
- target_annotations[:attributes] << a.dup.merge({id:reid, subj:ididx[a[:subj]]})
91
+ sid = ididx[a[:subj]]
92
+ target_annotations[:attributes] << a.dup.merge({id:reid, subj:sid}) unless sid.nil?
89
93
  end
90
94
  end
91
95
 
@@ -94,7 +98,8 @@ def align_mannotations(source_annotations, target_text, debug = false)
94
98
  annotations[:modifications].each do |m|
95
99
  reid = 'M' + (idnum_modifications += 1).to_s
96
100
  ididx[m[:id]] = reid
97
- target_annotations[:modifications] << m.dup.merge({id:reid, obj:ididx[m[:obj]]})
101
+ oid = ididx[m[:obj]]
102
+ target_annotations[:modifications] << m.dup.merge({id:reid, obj:oid}) unless oid.nil?
98
103
  end
99
104
  end
100
105
  end
@@ -109,13 +114,18 @@ unless ARGV.length == 2
109
114
  end
110
115
 
111
116
  source_annotations = read_annotations(ARGV[0])
112
- target_text = read_text(ARGV[1])
117
+ reference_text = read_text(ARGV[1])
118
+
119
+ alignment = TextAlignment::TextAlignment.new(reference_text, true)
113
120
 
114
121
  target_annotations = if source_annotations.class == Array
115
- align_mannotations(source_annotations, target_text, false)
122
+ # align_mannotations(source_annotations, reference_text, alignment, true)
123
+ align_mannotations(source_annotations, reference_text, alignment, false)
116
124
  else
117
- denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], target_text, false)
118
- source_annotations.merge({text:target_text, denotations:denotations})
125
+ # denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment)
126
+ denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment, false)
127
+ source_annotations.merge({text:reference_text, denotations:denotations})
119
128
  end
120
129
 
121
- # puts target_annotations.to_json
130
+ # pp alignment.block_alignment
131
+ # puts target_annotations.to_json
@@ -6,92 +6,145 @@ module TextAlignment; end unless defined? TextAlignment
6
6
 
7
7
  class TextAlignment::AnchorFinder
8
8
 
9
- def initialize(source_str, target_str, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
10
- @size_ngram = _size_ngram || TextAlignment::SIZE_NGRAM
11
- @size_window = _size_window || TextAlignment::SIZE_WINDOW
12
- @sim_threshold = _text_similiarity_threshold || TextAlignment::TEXT_SIMILARITY_THRESHOLD
9
+ def initialize(source_str, target_str, cultivation_map)
10
+ @s1 = source_str.downcase
11
+ @s2 = target_str.downcase
13
12
 
14
- @reverse = (target_str.length < source_str.length)
13
+ @cultivation_map = cultivation_map
15
14
 
16
- @s1, @s2 = if @reverse
17
- [target_str.downcase, source_str.downcase]
18
- else
19
- [source_str.downcase, target_str.downcase]
20
- end
15
+ @size_ngram = TextAlignment::SIZE_NGRAM
16
+ @size_window = TextAlignment::SIZE_WINDOW
17
+ @sim_threshold = TextAlignment::TEXT_SIMILARITY_THRESHOLD
18
+ @pos_s1_final_possible_begin = @s1.length - @size_ngram - 1
19
+ @pos_s2_final_possible_end = @s2.length
21
20
 
22
- # current position in s1
23
- @beg_s1 = 0
24
- @end_s1_prev = 0
25
- @end_s2_prev = 0
21
+ # positions of last match
22
+ @pos_s1_last_match = 0
23
+ @pos_s2_last_match = 0
26
24
  end
27
25
 
28
26
  def get_next_anchor
29
- # find the position of an anchor ngram in s1 and s2
30
- while @beg_s1 < (@s1.length - @size_ngram)
31
- if [' ', "\n", "\t"].include? @s1[@beg_s1]
32
- @beg_s1 += 1
33
- next
34
- end
35
- anchor = @s1[@beg_s1, @size_ngram]
36
-
37
- # search_position = 0
38
- search_position = @end_s2_prev
39
- while @beg_s2 = @s2.index(anchor, search_position)
40
- # if both the begining points are sufficiantly close to the end points of the last match
41
- break if @beg_s1 > 0 && @beg_s2 > 0 && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
27
+ # To find the beginning positions of an anchor ngram in s1 and s2, beginning from the last positions matched
28
+ beg_s2 = for beg_s1 in @pos_s1_last_match .. @pos_s1_final_possible_begin
42
29
 
43
- left_window_s1, left_window_s2 = get_left_windows
44
- break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
30
+ # To skip whitespace letters
31
+ next if [' ', "\n", "\t"].include? @s1[beg_s1]
45
32
 
46
- right_window_s1, right_window_s2 = get_right_windows
47
- break if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
33
+ _beg_s2 = get_beg_s2(beg_s1)
34
+ break _beg_s2 unless _beg_s2.nil?
35
+ end
48
36
 
49
- search_position = @beg_s2 + 1
50
- end
37
+ # To return nil when it fails to find an anchor
38
+ return nil if beg_s2.class == Range
51
39
 
52
- break unless @beg_s2.nil?
40
+ # To extend the block to the left
41
+ b1 = beg_s1
42
+ b2 = beg_s2
43
+ left_boundary_b2 = [@pos_s2_last_match, (@cultivation_map.last_cultivated_position(b2) || 0)].max
44
+ while b1 > @pos_s1_last_match && b2 > left_boundary_b2 && @s1[b1 - 1] == @s2[b2 - 1]
45
+ b1 -= 1; b2 -= 1
46
+ end
53
47
 
54
- @beg_s1 += 1
48
+ # To extend the block to the right
49
+ e1 = beg_s1 + @size_ngram
50
+ e2 = beg_s2 + @size_ngram
51
+ right_boundary_b2 = @cultivation_map.next_cultivated_position(e2) || @pos_s2_final_possible_end
52
+ while @s1[e1] && e2 < right_boundary_b2 && @s1[e1] == @s2[e2]
53
+ e1 += 1; e2 += 1
55
54
  end
56
55
 
57
- return nil if @beg_s1 >= (@s1.length - @size_ngram)
56
+ @pos_s1_last_match = e1
57
+ @pos_s2_last_match = e2
58
58
 
59
- # extend the block
60
- b1 = @beg_s1
61
- b2 = @beg_s2
62
- while b1 >= @end_s1_prev && b2 >= @end_s2_prev && @s1[b1] == @s2[b2]
63
- b1 -= 1; b2 -= 1
64
- end
59
+ {source:{begin:b1 , end:e1}, target:{begin:b2, end:e2}}
60
+ end
65
61
 
66
- b1 += 1; b2 += 1
62
+ private
67
63
 
68
- e1 = @beg_s1 + @size_ngram
69
- e2 = @beg_s2 + @size_ngram
70
- while @s1[e1] && @s1[e1] == @s2[e2]
71
- e1 += 1; e2 += 1
64
+ def get_beg_s2(beg_s1)
65
+ # to get the anchor to search for in s2
66
+ anchor = @s1[beg_s1, @size_ngram]
67
+
68
+ # comment out below with the assumption that texts are in the same order
69
+ # search_position = 0
70
+ search_position = @pos_s2_last_match
71
+
72
+ beg_s2_candidates = find_beg_s2_candidates(anchor, search_position)
73
+ return nil if beg_s2_candidates.empty?
74
+
75
+ find_valid_beg_s2(beg_s1, beg_s2_candidates)
76
+ end
77
+
78
+ # To find beg_s2 which match to the anchor
79
+ # return nil if the anchor is too much frequent
80
+ def find_beg_s2_candidates(anchor, search_position)
81
+ candidates = []
82
+ while _beg_s2 = @cultivation_map.index(anchor, @s2, search_position)
83
+ candidates << _beg_s2
84
+
85
+ # for speed, skip anchor of high frequency
86
+ if candidates.length > 5
87
+ candidates.clear
88
+ break
89
+ end
90
+
91
+ search_position = _beg_s2 + 1
72
92
  end
93
+ candidates
94
+ end
73
95
 
74
- @end_s1_prev = e1
75
- @end_s2_prev = e2
76
- @beg_s1 = e1
96
+ def find_valid_beg_s2(beg_s1, beg_s2_candidates)
97
+ valid_beg_s2 = nil
77
98
 
78
- if @reverse
79
- {source:{begin:b2 , end:e2}, target:{begin:b1, end:e1}}
80
- else
81
- {source:{begin:b1 , end:e1}, target:{begin:b2, end:e2}}
99
+ (10 .. 30).step(10).each do |size_window|
100
+ valid_beg_s2 = nil
101
+
102
+ r = beg_s2_candidates.each do |beg_s2|
103
+ # if both the begining points are sufficiantly close to the end points of the last match
104
+ # break if beg_s1 > 0 && beg_s2 > 0 && (beg_s1 - @pos_s1_last_match < 5) && (beg_s2 >= @pos_s2_last_match) && (beg_s2 - @pos_s2_last_match < 5)
105
+ if beg_s1 > 0 && beg_s2 > 0 && (beg_s1 - @pos_s1_last_match < 5) && (beg_s2 - @pos_s2_last_match < 5)
106
+ break unless valid_beg_s2.nil?
107
+ valid_beg_s2 = beg_s2
108
+ next
109
+ end
110
+
111
+ left_window_s1, left_window_s2 = get_left_windows(beg_s1, beg_s2, size_window)
112
+ if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
113
+ break unless valid_beg_s2.nil?
114
+ valid_beg_s2 = beg_s2
115
+ next
116
+ end
117
+
118
+ right_window_s1, right_window_s2 = get_right_windows(beg_s1, beg_s2, size_window)
119
+ if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
120
+ break unless valid_beg_s2.nil?
121
+ valid_beg_s2 = beg_s2
122
+ next
123
+ end
124
+ end
125
+
126
+ # r == nil means that the inner loop was broken (multiple candidates had passed the tests)
127
+ # r != nil means that the inner loop was completed (with or w/o a valid beg_s2 found)
128
+ if r.nil?
129
+ valid_beg_s2 = nil
130
+ else
131
+ break
132
+ end
82
133
  end
134
+
135
+ valid_beg_s2
83
136
  end
84
137
 
85
- private
138
+ def get_left_windows(beg_s1, beg_s2, size_window = nil)
139
+ size_window ||= @size_window
86
140
 
87
- def get_left_windows
88
- # commend below with the assumption that the beginning of a document gives a significant locational information
89
- # return if @beg_s1 < @size_window || @beg_s2 < @size_window
141
+ # comment out below with the assumption that the beginning of a document gives a significant locational information
142
+ # return if @beg_s1 < size_window || @beg_s2 < size_window
90
143
 
91
144
  window_s1 = ''
92
- loc = @beg_s1 - 1
145
+ loc = beg_s1 - 1
93
146
  count = 0
94
- while count < @size_window && loc >= 0
147
+ while count < size_window && loc >= 0
95
148
  if @s1[loc] =~ /[0-9a-zA-Z]/
96
149
  window_s1 += @s1[loc]
97
150
  count += 1
@@ -100,9 +153,9 @@ class TextAlignment::AnchorFinder
100
153
  end
101
154
 
102
155
  window_s2 = ''
103
- loc = @beg_s2 - 1
156
+ loc = beg_s2 - 1
104
157
  count = 0
105
- while count < @size_window && loc >= 0
158
+ while count < size_window && loc >= 0
106
159
  if @s2[loc] =~ /[0-9a-zA-Z]/
107
160
  window_s2 += @s2[loc]
108
161
  count += 1
@@ -113,15 +166,17 @@ class TextAlignment::AnchorFinder
113
166
  [window_s1, window_s2]
114
167
  end
115
168
 
116
- def get_right_windows
169
+ def get_right_windows(beg_s1, beg_s2, size_window = nil)
170
+ size_window ||= @size_window
171
+
117
172
  # commend below with the assumption that the end of a document gives a significant locational
118
- # return if (@beg_s1 + @size_ngram > (@s1.length - @size_window)) || (@beg_s2 + @size_ngram > (@s2.length - @size_window))
173
+ # return if (@beg_s1 + @size_ngram > (@s1.length - size_window)) || (@beg_s2 + @size_ngram > (@s2.length - size_window))
119
174
 
120
175
  window_s1 = ''
121
- loc = @beg_s1 + @size_ngram
176
+ loc = beg_s1 + @size_ngram
122
177
  len_s1 = @s1.length
123
178
  count = 0
124
- while count < @size_window && loc < len_s1
179
+ while count < size_window && loc < len_s1
125
180
  if @s1[loc] =~ /[0-9a-zA-Z]/
126
181
  window_s1 += @s1[loc]
127
182
  count += 1
@@ -130,10 +185,10 @@ class TextAlignment::AnchorFinder
130
185
  end
131
186
 
132
187
  window_s2 = ''
133
- loc = @beg_s2 + @size_ngram
188
+ loc = beg_s2 + @size_ngram
134
189
  len_s2 = @s2.length
135
190
  count = 0
136
- while count < @size_window && loc < len_s2
191
+ while count < size_window && loc < len_s2
137
192
  if @s2[loc] =~ /[0-9a-zA-Z]/
138
193
  window_s2 += @s2[loc]
139
194
  count += 1
@@ -148,5 +203,4 @@ class TextAlignment::AnchorFinder
148
203
  return 0 if str1.nil? || str2.nil?
149
204
  String::Similarity.cosine(str1, str2, ngram:ngram_order)
150
205
  end
151
-
152
- end
206
+ end
@@ -1,6 +1,6 @@
1
1
  module TextAlignment; end unless defined? TextAlignment
2
2
 
3
- TextAlignment::MAPPINGS = [
3
+ TextAlignment::CHAR_MAPPING = [
4
4
  ["©", "(c)"], #U+00A9 (Copyright Sign)
5
5
 
6
6
  ["α", "alpha"], #U+03B1 (greek small letter alpha)
@@ -61,9 +61,11 @@ TextAlignment::MAPPINGS = [
61
61
  ["•", "*"], #U+2022 (bullet)
62
62
  [" ", " "], #U+2009 (thin space)
63
63
  [" ", " "], #U+200A (hair space)
64
- [" ", " "], #U+00A0 (no-break space)
64
+ [" ", " "], #U+202F (narrow no-break space)
65
+ [" ", " "], #U+00A0 (Non-Breaking space)
65
66
  [" ", " "], #U+3000 (ideographic space)
66
- ["", "-"], #U+2211 (Non-Breaking Hyphen)
67
+ ["‐", "-"], #U+2010 (Hyphen)
68
+ ["‑", "-"], #U+2011 (Non-Breaking Hyphen)
67
69
  ["−", "-"], #U+2212 (minus sign)
68
70
  ["–", "-"], #U+2013 (en dash)
69
71
  ["′", "'"], #U+2032 (prime)
@@ -75,98 +77,114 @@ TextAlignment::MAPPINGS = [
75
77
  ]
76
78
 
77
79
 
78
- TextAlignment::PADDING_LETTERS = ['@', '^', '|', '#', '$', '%', '&', '_'] unless defined? TextAlignment::PADDING_LETTERS
80
+ class TextAlignment::CharMapping
81
+ attr_reader :mapped_text
79
82
 
83
+ def initialize(_text, char_mapping = nil)
84
+ char_mapping ||= TextAlignment::CHAR_MAPPING
85
+ @mapped_text, offset_mapping = enmap_text(_text, char_mapping)
86
+ @index_enmap = offset_mapping.to_h
87
+ @index_demap = offset_mapping.map{|m| m.reverse}.to_h
88
+ end
80
89
 
81
- class << TextAlignment
82
- def single_character_mapping_preprocessing(_str1, _str2, _mappings = nil)
83
- _mappings ||= TextAlignment::MAPPINGS
84
-
85
- character_mappings = _mappings.select{|m| m[0].length == 1 && m[1].length == 1}
86
- if character_mappings.empty?
87
- [_str1, _str2, _mappings]
88
- else
89
- characters_from = character_mappings.collect{|m| m[0]}.join
90
- characters_to = character_mappings.collect{|m| m[1]}.join
91
- characters_to.gsub!(/-/, '\-')
90
+ def enmap_position(position)
91
+ @index_enmap[position]
92
+ end
92
93
 
93
- str1 = _str1.tr(characters_from, characters_to)
94
- str2 = _str2.tr(characters_from, characters_to)
94
+ def demap_position(position)
95
+ @index_demap[position]
96
+ end
95
97
 
96
- mappings = _mappings.select{|m| m[0].length > 1 || m[1].length > 1}
98
+ def enmap_denotations(denotations)
99
+ return nil if denotations.nil?
97
100
 
98
- [str1, str2, mappings]
101
+ denotations.map do |d|
102
+ d.dup.merge(span:{begin:enmap_position(d[:span][:begin]), end:enmap_position(d[:span][:end])})
99
103
  end
100
104
  end
101
105
 
102
- def long_to_one_mapping_preprocessing(_str1, _str2, _mappings = nil)
103
- _mappings ||= TextAlignment::MAPPINGS
104
-
105
- long_to_one_mappings = _mappings.select{|m| m[0].length == 1 && m[1].length > 1}
106
- if long_to_one_mappings.empty?
107
- [_str1, _str2, _mappings]
108
- else
109
- ## long to one character mappings
110
- pletters = TextAlignment::PADDING_LETTERS
111
-
112
- # find the padding letter for str1
113
- @padding_letter1 = begin
114
- i = pletters.index{|l| _str2.index(l).nil?}
115
- raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
116
- TextAlignment::PADDING_LETTERS[i]
117
- end
106
+ private
118
107
 
119
- # find the padding letter for str2
120
- @padding_letter2 = begin
121
- i = pletters.index{|l| l != @padding_letter1 && _str1.index(l).nil?}
122
- raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
123
- TextAlignment::PADDING_LETTERS[i]
124
- end
108
+ def enmap_text(_text, char_mapping)
109
+ text = _text.dup
125
110
 
126
- str1 = str2 = nil
127
- long_to_one_mappings.each do |f|
128
- from = f[1]
129
-
130
- str1 = if _str2.index(f[0])
131
- to = f[0] + (@padding_letter1 * (f[1].length - 1))
132
- _str1.gsub(from, to)
133
- else
134
- _str1
135
- end
136
-
137
- str2 = if _str1.index(f[0])
138
- to = f[0] + (@padding_letter2 * (f[1].length - 1))
139
- _str2.gsub(from, to)
140
- else
141
- _str2
142
- end
111
+ # To execute the single letter mapping
112
+ char_mapping.each do |one, long|
113
+ text.gsub!(one, long) if long.length == 1
114
+ end
115
+
116
+ # To get the (location, length) index for replacements
117
+ loc_len = []
118
+ char_mapping.each do |one, long|
119
+ next if long.length == 1
120
+
121
+ init_next = 0
122
+ while loc = text.index(long, init_next)
123
+ loc_len << [loc, long.length]
124
+ init_next = loc + long.length
143
125
  end
144
- mappings = _mappings.select{|m| m[0].length > 1 || m[1].length == 1}
145
126
 
146
- [str1, str2, mappings]
127
+ # a workaround to avoid messing-up due to embedding
128
+ text.gsub!(long, one * long.length)
147
129
  end
148
- end
149
130
 
150
- def compute_similarity(_s1, _s2, sdiff)
151
- return 0 if sdiff.nil?
131
+ # To get the (location, length) index for consecutive whitespace sequences
132
+ init_next = 0
133
+ while loc = text.index(/\s{2,}/, init_next)
134
+ len = $~[0].length
135
+ loc_len << [loc, len]
136
+ init_next = loc + len
137
+ end
138
+
139
+ loc_len.sort!{|a, b| a[0] <=> b[0]}
140
+
141
+ # To get the offset_mapping before and after replacement
142
+ offset_mapping = []
143
+ init_next = 0
144
+ j = 0
152
145
 
153
- # compute the lcs only with non-whitespace letters
154
- lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
155
- return 0 if lcs == 0
146
+ loc_len.each do |loc, len|
147
+ offset_mapping += (init_next .. loc).map do |i|
148
+ j += 1
149
+ [i, j - 1]
150
+ end
151
+ init_next = loc + len
152
+ end
156
153
 
157
- s1 = if @padding_letter1
158
- _s1.tr(@padding_letter1, ' ')
159
- else
160
- _s1
154
+ offset_mapping += (init_next .. text.length).map do |i|
155
+ j += 1
156
+ [i, j - 1]
161
157
  end
162
158
 
163
- s2 = if @padding_letter2
164
- _s2.tr(@padding_letter2, ' ')
165
- else
166
- _s2
159
+ # To execute the long letter mapping
160
+ char_mapping.each do |one, long|
161
+ text.gsub!(one * long.length, one) if long.length > 1
167
162
  end
168
163
 
169
- similarity = lcs.to_f / [s1.scan(/\S/).count, s2.scan(/\S/).count].min
164
+ # To replace multi whitespace sequences to a space
165
+ text.gsub!(/\s{2,}/, ' ')
166
+
167
+ [text, offset_mapping]
170
168
  end
169
+ end
170
+
171
+ if __FILE__ == $0
172
+ require 'json'
173
+
174
+ unless ARGV.length == 1
175
+ warn "#{$0} an_annotation_json_file.json"
176
+ exit
177
+ end
178
+ annotations = JSON.parse File.read(ARGV[0]).strip, symbolize_names: true
179
+ denotations = annotations[:denotations] || []
180
+ if denotations.nil? && annotations[:tracks]
181
+ denotations = annotations[:tracks].first[:denotations]
182
+ end
183
+
184
+ text_mapping = TextAlignment::CharMapping.new(annotations[:text])
185
+ text_mapped = text_mapping.mapped_text
186
+ denotations_mapped = text_mapping.enmap_denotations(denotations)
187
+ new_annotations = {text:text_mapped, denotations:denotations_mapped}
171
188
 
189
+ puts new_annotations.to_json
172
190
  end