text_alignment 0.9 → 0.11.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1c44cc3036273c8c34800d8f78a79316c53efb80fe45ad81092a6172da3b03c6
4
- data.tar.gz: 50ab44cc66b50bf732e99f900c10584025c6ed498603ccf3afd75de90cac4b79
3
+ metadata.gz: 81994ec9a8c7c08d2aad32b351b5942fef1748e4035ba762af546d4f3fe7cee8
4
+ data.tar.gz: bfa75451d33b9d21c2baa1a52280f03486bf10a41b5ac2a97469f0ca3a4f7379
5
5
  SHA512:
6
- metadata.gz: 98645c1ba4566c822d1e6ba6488e4ecdfe100c30923cc7effe7d2a4390ebb6901707e8c9f6a12145e2f98515bc6792afef4f9bfa5fcd683c77d3a5cf599094c7
7
- data.tar.gz: 11657abdb8acb64c8edfd5271bbf78d2a75024753180988030c5ce6722b4da2781760e583ca6e33ed469cca85e4a2f8e28af6ef4dc62029ada5bd8a184200dfb
6
+ metadata.gz: 73d0ff212a89d6ad33751f87f14a9b292b45ce177c61efd0ede5f852eb3834d1bd3940d202c4d87cfb0422c4dc566dea30c560f8500a220dcd3e1dd492e29eac
7
+ data.tar.gz: 6c763a564e339267624bec5c809b334b0adf9951d25409eac21ae4b4582beae4a08739119dfa369022c44c19d4faacfd0e06766c6223bd219a43c3961cfab08c
@@ -26,8 +26,8 @@ def read_text(filename)
26
26
  end
27
27
  end
28
28
 
29
- def align_denotations(denotations, source_text, target_text, debug = false)
30
- alignment = TextAlignment::TextAlignment.new(source_text, target_text, denotations)
29
+ def align_denotations(denotations, source_text, alignment, debug = false)
30
+ alignment.align(source_text, denotations)
31
31
  new_denotations = alignment.transform_hdenotations(denotations)
32
32
 
33
33
  if debug
@@ -37,7 +37,7 @@ def align_denotations(denotations, source_text, target_text, debug = false)
37
37
  end
38
38
 
39
39
  lost_annotations = alignment.lost_annotations
40
- unless lost_annotations.empty?
40
+ unless lost_annotations.nil? || lost_annotations.empty?
41
41
  warn "\n[lost annotations] #{lost_annotations.length}"
42
42
  lost_annotations.each do |a|
43
43
  warn "#{a}"
@@ -50,8 +50,8 @@ def align_denotations(denotations, source_text, target_text, debug = false)
50
50
  new_denotations
51
51
  end
52
52
 
53
- def align_mannotations(source_annotations, target_text, debug = false)
54
- target_annotations = {text:target_text}
53
+ def align_mannotations(source_annotations, reference_text, alignment, debug = false)
54
+ target_annotations = {text:reference_text}
55
55
 
56
56
  idnum_denotations = 0
57
57
  idnum_relations = 0
@@ -62,7 +62,8 @@ def align_mannotations(source_annotations, target_text, debug = false)
62
62
  if annotations.has_key?(:denotations) && !annotations[:denotations].empty?
63
63
  ididx = {}
64
64
  warn "[#{i}]-=-=-=-=-"
65
- denotations = align_denotations(annotations[:denotations], annotations[:text], target_text, debug)
65
+ denotations = align_denotations(annotations[:denotations], annotations[:text], alignment, debug)
66
+
66
67
  denotations.each do |d|
67
68
  reid = 'T' + (idnum_denotations += 1).to_s
68
69
  ididx[d[:id]] = reid
@@ -76,7 +77,9 @@ def align_mannotations(source_annotations, target_text, debug = false)
76
77
  annotations[:relations].each do |r|
77
78
  reid = 'R' + (idnum_relations += 1).to_s
78
79
  ididx[r[:id]] = reid
79
- target_annotations[:relations] << r.dup.merge({id:reid, subj:ididx[r[:subj]], obj:ididx[r[:obj]]})
80
+ sid = ididx[r[:subj]]
81
+ oid = ididx[r[:obj]]
82
+ target_annotations[:relations] << r.dup.merge({id:reid, subj:sid, obj:oid}) unless sid.nil? || oid.nil?
80
83
  end
81
84
  end
82
85
 
@@ -85,7 +88,8 @@ def align_mannotations(source_annotations, target_text, debug = false)
85
88
  annotations[:attributes].each do |a|
86
89
  reid = 'A' + (idnum_attributes += 1).to_s
87
90
  ididx[a[:id]] = reid
88
- target_annotations[:attributes] << a.dup.merge({id:reid, subj:ididx[a[:subj]]})
91
+ sid = ididx[a[:subj]]
92
+ target_annotations[:attributes] << a.dup.merge({id:reid, subj:sid}) unless sid.nil?
89
93
  end
90
94
  end
91
95
 
@@ -94,7 +98,8 @@ def align_mannotations(source_annotations, target_text, debug = false)
94
98
  annotations[:modifications].each do |m|
95
99
  reid = 'M' + (idnum_modifications += 1).to_s
96
100
  ididx[m[:id]] = reid
97
- target_annotations[:modifications] << m.dup.merge({id:reid, obj:ididx[m[:obj]]})
101
+ oid = ididx[m[:obj]]
102
+ target_annotations[:modifications] << m.dup.merge({id:reid, obj:oid}) unless oid.nil?
98
103
  end
99
104
  end
100
105
  end
@@ -109,13 +114,18 @@ unless ARGV.length == 2
109
114
  end
110
115
 
111
116
  source_annotations = read_annotations(ARGV[0])
112
- target_text = read_text(ARGV[1])
117
+ reference_text = read_text(ARGV[1])
118
+
119
+ alignment = TextAlignment::TextAlignment.new(reference_text, true)
113
120
 
114
121
  target_annotations = if source_annotations.class == Array
115
- align_mannotations(source_annotations, target_text, false)
122
+ # align_mannotations(source_annotations, reference_text, alignment, true)
123
+ align_mannotations(source_annotations, reference_text, alignment, false)
116
124
  else
117
- denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], target_text, false)
118
- source_annotations.merge({text:target_text, denotations:denotations})
125
+ # denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment)
126
+ denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment, false)
127
+ source_annotations.merge({text:reference_text, denotations:denotations})
119
128
  end
120
129
 
121
- # puts target_annotations.to_json
130
+ # pp alignment.block_alignment
131
+ # puts target_annotations.to_json
@@ -6,92 +6,141 @@ module TextAlignment; end unless defined? TextAlignment
6
6
 
7
7
  class TextAlignment::AnchorFinder
8
8
 
9
- def initialize(source_str, target_str, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
10
- @size_ngram = _size_ngram || TextAlignment::SIZE_NGRAM
11
- @size_window = _size_window || TextAlignment::SIZE_WINDOW
12
- @sim_threshold = _text_similiarity_threshold || TextAlignment::TEXT_SIMILARITY_THRESHOLD
9
+ def initialize(source_str, target_str, cultivation_map)
10
+ @s1 = source_str.downcase
11
+ @s2 = target_str.downcase
13
12
 
14
- @reverse = (target_str.length < source_str.length)
13
+ @cultivation_map = cultivation_map
15
14
 
16
- @s1, @s2 = if @reverse
17
- [target_str.downcase, source_str.downcase]
18
- else
19
- [source_str.downcase, target_str.downcase]
20
- end
15
+ @size_ngram = TextAlignment::SIZE_NGRAM
16
+ @size_window = TextAlignment::SIZE_WINDOW
17
+ @sim_threshold = TextAlignment::TEXT_SIMILARITY_THRESHOLD
18
+ @pos_s1_final_possible_begin = @s1.length - @size_ngram - 1
19
+ @pos_s2_final_possible_end = @s2.length
21
20
 
22
- # current position in s1
23
- @beg_s1 = 0
24
- @end_s1_prev = 0
25
- @end_s2_prev = 0
21
+ # positions of last match
22
+ @pos_s1_last_match = 0
23
+ @pos_s2_last_match = 0
26
24
  end
27
25
 
28
26
  def get_next_anchor
29
- # find the position of an anchor ngram in s1 and s2
30
- while @beg_s1 < (@s1.length - @size_ngram)
31
- if [' ', "\n", "\t"].include? @s1[@beg_s1]
32
- @beg_s1 += 1
33
- next
34
- end
35
- anchor = @s1[@beg_s1, @size_ngram]
36
-
37
- # search_position = 0
38
- search_position = @end_s2_prev
39
- while @beg_s2 = @s2.index(anchor, search_position)
40
- # if both the begining points are sufficiantly close to the end points of the last match
41
- break if @beg_s1 > 0 && @beg_s2 > 0 && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
27
+ # To find the beginning positions of an anchor ngram in s1 and s2, beginning from the last positions matched
28
+ beg_s2 = for beg_s1 in @pos_s1_last_match .. @pos_s1_final_possible_begin
42
29
 
43
- left_window_s1, left_window_s2 = get_left_windows
44
- break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
30
+ # To skip whitespace letters
31
+ next if [' ', "\n", "\t"].include? @s1[beg_s1]
45
32
 
46
- right_window_s1, right_window_s2 = get_right_windows
47
- break if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
33
+ _beg_s2 = get_beg_s2(beg_s1)
34
+ break _beg_s2 unless _beg_s2.nil?
35
+ end
48
36
 
49
- search_position = @beg_s2 + 1
50
- end
37
+ # To return nil when it fails to find an anchor
38
+ return nil if beg_s2.class == Range
51
39
 
52
- break unless @beg_s2.nil?
40
+ # To extend the block to the left
41
+ b1 = beg_s1
42
+ b2 = beg_s2
43
+ left_boundary_b2 = [@pos_s2_last_match, (@cultivation_map.last_cultivated_position(b2) || 0)].max
44
+ while b1 > @pos_s1_last_match && b2 > left_boundary_b2 && @s1[b1 - 1] == @s2[b2 - 1]
45
+ b1 -= 1; b2 -= 1
46
+ end
53
47
 
54
- @beg_s1 += 1
48
+ # To extend the block to the right
49
+ e1 = beg_s1 + @size_ngram
50
+ e2 = beg_s2 + @size_ngram
51
+ right_boundary_b2 = @cultivation_map.next_cultivated_position(e2) || @pos_s2_final_possible_end
52
+ while @s1[e1] && e2 < right_boundary_b2 && @s1[e1] == @s2[e2]
53
+ e1 += 1; e2 += 1
55
54
  end
56
55
 
57
- return nil if @beg_s1 >= (@s1.length - @size_ngram)
56
+ @pos_s1_last_match = e1
57
+ @pos_s2_last_match = e2
58
58
 
59
- # extend the block
60
- b1 = @beg_s1
61
- b2 = @beg_s2
62
- while b1 >= @end_s1_prev && b2 >= @end_s2_prev && @s1[b1] == @s2[b2]
63
- b1 -= 1; b2 -= 1
64
- end
59
+ {source:{begin:b1 , end:e1}, target:{begin:b2, end:e2}}
60
+ end
65
61
 
66
- b1 += 1; b2 += 1
62
+ private
67
63
 
68
- e1 = @beg_s1 + @size_ngram
69
- e2 = @beg_s2 + @size_ngram
70
- while @s1[e1] && @s1[e1] == @s2[e2]
71
- e1 += 1; e2 += 1
64
+ def get_beg_s2(beg_s1)
65
+ # to get the anchor to search for in s2
66
+ anchor = @s1[beg_s1, @size_ngram]
67
+
68
+ # comment out below with the assumption that texts are in the same order
69
+ # search_position = 0
70
+ search_position = @pos_s2_last_match
71
+
72
+ beg_s2_candidates = find_beg_s2_candidates(anchor, search_position)
73
+ return nil if beg_s2_candidates.empty?
74
+
75
+ find_valid_beg_s2(beg_s1, beg_s2_candidates)
76
+ end
77
+
78
+ # To find beg_s2 which match to the anchor
79
+ # return nil if the anchor is too much frequent
80
+ def find_beg_s2_candidates(anchor, search_position)
81
+ candidates = []
82
+ while _beg_s2 = @cultivation_map.index(anchor, @s2, search_position)
83
+ candidates << _beg_s2
84
+
85
+ # for speed, skip anchor of high frequency
86
+ if candidates.length > 5
87
+ candidates.clear
88
+ break
89
+ end
90
+
91
+ search_position = _beg_s2 + 1
72
92
  end
93
+ candidates
94
+ end
95
+
96
+ def find_valid_beg_s2(beg_s1, beg_s2_candidates)
97
+ valid_beg_s2 = nil
73
98
 
74
- @end_s1_prev = e1
75
- @end_s2_prev = e2
76
- @beg_s1 = e1
99
+ (10 .. 30).step(10).each do |size_window|
100
+ valid_beg_s2 = nil
77
101
 
78
- if @reverse
79
- {source:{begin:b2 , end:e2}, target:{begin:b1, end:e1}}
80
- else
81
- {source:{begin:b1 , end:e1}, target:{begin:b2, end:e2}}
102
+ r = beg_s2_candidates.each do |beg_s2|
103
+ # if both the begining points are sufficiantly close to the end points of the last match
104
+ # break if beg_s1 > 0 && beg_s2 > 0 && (beg_s1 - @pos_s1_last_match < 5) && (beg_s2 >= @pos_s2_last_match) && (beg_s2 - @pos_s2_last_match < 5)
105
+ if beg_s1 > 0 && beg_s2 > 0 && (beg_s1 - @pos_s1_last_match < 5) && (beg_s2 - @pos_s2_last_match < 5)
106
+ break unless valid_beg_s2.nil?
107
+ valid_beg_s2 = beg_s2
108
+ next
109
+ end
110
+
111
+ left_window_s1, left_window_s2 = get_left_windows(beg_s1, beg_s2)
112
+ if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
113
+ break unless valid_beg_s2.nil?
114
+ valid_beg_s2 = beg_s2
115
+ next
116
+ end
117
+
118
+ right_window_s1, right_window_s2 = get_right_windows(beg_s1, beg_s2)
119
+ if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
120
+ break unless valid_beg_s2.nil?
121
+ valid_beg_s2 = beg_s2
122
+ next
123
+ end
124
+ end
125
+
126
+ # r == nil means that the inner loop was broken (multiple candidates had passed the tests)
127
+ # r != nil means that the inner loop was completed (with or w/o a valid beg_s2 found)
128
+ break unless r.nil?
82
129
  end
130
+
131
+ valid_beg_s2
83
132
  end
84
133
 
85
- private
134
+ def get_left_windows(beg_s1, beg_s2, size_window = nil)
135
+ size_window ||= @size_window
86
136
 
87
- def get_left_windows
88
- # commend below with the assumption that the beginning of a document gives a significant locational information
89
- # return if @beg_s1 < @size_window || @beg_s2 < @size_window
137
+ # comment out below with the assumption that the beginning of a document gives a significant locational information
138
+ # return if @beg_s1 < size_window || @beg_s2 < size_window
90
139
 
91
140
  window_s1 = ''
92
- loc = @beg_s1 - 1
141
+ loc = beg_s1 - 1
93
142
  count = 0
94
- while count < @size_window && loc >= 0
143
+ while count < size_window && loc >= 0
95
144
  if @s1[loc] =~ /[0-9a-zA-Z]/
96
145
  window_s1 += @s1[loc]
97
146
  count += 1
@@ -100,9 +149,9 @@ class TextAlignment::AnchorFinder
100
149
  end
101
150
 
102
151
  window_s2 = ''
103
- loc = @beg_s2 - 1
152
+ loc = beg_s2 - 1
104
153
  count = 0
105
- while count < @size_window && loc >= 0
154
+ while count < size_window && loc >= 0
106
155
  if @s2[loc] =~ /[0-9a-zA-Z]/
107
156
  window_s2 += @s2[loc]
108
157
  count += 1
@@ -113,15 +162,17 @@ class TextAlignment::AnchorFinder
113
162
  [window_s1, window_s2]
114
163
  end
115
164
 
116
- def get_right_windows
165
+ def get_right_windows(beg_s1, beg_s2, size_window = nil)
166
+ size_window ||= @size_window
167
+
117
168
  # commend below with the assumption that the end of a document gives a significant locational
118
- # return if (@beg_s1 + @size_ngram > (@s1.length - @size_window)) || (@beg_s2 + @size_ngram > (@s2.length - @size_window))
169
+ # return if (@beg_s1 + @size_ngram > (@s1.length - size_window)) || (@beg_s2 + @size_ngram > (@s2.length - size_window))
119
170
 
120
171
  window_s1 = ''
121
- loc = @beg_s1 + @size_ngram
172
+ loc = beg_s1 + @size_ngram
122
173
  len_s1 = @s1.length
123
174
  count = 0
124
- while count < @size_window && loc < len_s1
175
+ while count < size_window && loc < len_s1
125
176
  if @s1[loc] =~ /[0-9a-zA-Z]/
126
177
  window_s1 += @s1[loc]
127
178
  count += 1
@@ -130,10 +181,10 @@ class TextAlignment::AnchorFinder
130
181
  end
131
182
 
132
183
  window_s2 = ''
133
- loc = @beg_s2 + @size_ngram
184
+ loc = beg_s2 + @size_ngram
134
185
  len_s2 = @s2.length
135
186
  count = 0
136
- while count < @size_window && loc < len_s2
187
+ while count < size_window && loc < len_s2
137
188
  if @s2[loc] =~ /[0-9a-zA-Z]/
138
189
  window_s2 += @s2[loc]
139
190
  count += 1
@@ -148,5 +199,4 @@ class TextAlignment::AnchorFinder
148
199
  return 0 if str1.nil? || str2.nil?
149
200
  String::Similarity.cosine(str1, str2, ngram:ngram_order)
150
201
  end
151
-
152
- end
202
+ end
@@ -1,6 +1,6 @@
1
1
  module TextAlignment; end unless defined? TextAlignment
2
2
 
3
- TextAlignment::MAPPINGS = [
3
+ TextAlignment::CHAR_MAPPING = [
4
4
  ["©", "(c)"], #U+00A9 (Copyright Sign)
5
5
 
6
6
  ["α", "alpha"], #U+03B1 (greek small letter alpha)
@@ -61,9 +61,11 @@ TextAlignment::MAPPINGS = [
61
61
  ["•", "*"], #U+2022 (bullet)
62
62
  [" ", " "], #U+2009 (thin space)
63
63
  [" ", " "], #U+200A (hair space)
64
- [" ", " "], #U+00A0 (no-break space)
64
+ [" ", " "], #U+202F (narrow no-break space)
65
+ [" ", " "], #U+00A0 (Non-Breaking space)
65
66
  [" ", " "], #U+3000 (ideographic space)
66
- ["‐", "-"], #U+2211 (Non-Breaking Hyphen)
67
+ ["‐", "-"], #U+2010 (Hyphen)
68
+ ["‑", "-"], #U+2011 (Non-Breaking Hyphen)
67
69
  ["−", "-"], #U+2212 (minus sign)
68
70
  ["–", "-"], #U+2013 (en dash)
69
71
  ["′", "'"], #U+2032 (prime)
@@ -75,98 +77,114 @@ TextAlignment::MAPPINGS = [
75
77
  ]
76
78
 
77
79
 
78
- TextAlignment::PADDING_LETTERS = ['@', '^', '|', '#', '$', '%', '&', '_'] unless defined? TextAlignment::PADDING_LETTERS
80
+ class TextAlignment::CharMapping
81
+ attr_reader :mapped_text
79
82
 
83
+ def initialize(_text, char_mapping = nil)
84
+ char_mapping ||= TextAlignment::CHAR_MAPPING
85
+ @mapped_text, offset_mapping = enmap_text(_text, char_mapping)
86
+ @index_enmap = offset_mapping.to_h
87
+ @index_demap = offset_mapping.map{|m| m.reverse}.to_h
88
+ end
80
89
 
81
- class << TextAlignment
82
- def single_character_mapping_preprocessing(_str1, _str2, _mappings = nil)
83
- _mappings ||= TextAlignment::MAPPINGS
84
-
85
- character_mappings = _mappings.select{|m| m[0].length == 1 && m[1].length == 1}
86
- if character_mappings.empty?
87
- [_str1, _str2, _mappings]
88
- else
89
- characters_from = character_mappings.collect{|m| m[0]}.join
90
- characters_to = character_mappings.collect{|m| m[1]}.join
91
- characters_to.gsub!(/-/, '\-')
90
+ def enmap_position(position)
91
+ @index_enmap[position]
92
+ end
92
93
 
93
- str1 = _str1.tr(characters_from, characters_to)
94
- str2 = _str2.tr(characters_from, characters_to)
94
+ def demap_position(position)
95
+ @index_demap[position]
96
+ end
95
97
 
96
- mappings = _mappings.select{|m| m[0].length > 1 || m[1].length > 1}
98
+ def enmap_denotations(denotations)
99
+ return nil if denotations.nil?
97
100
 
98
- [str1, str2, mappings]
101
+ denotations.map do |d|
102
+ d.dup.merge(span:{begin:enmap_position(d[:span][:begin]), end:enmap_position(d[:span][:end])})
99
103
  end
100
104
  end
101
105
 
102
- def long_to_one_mapping_preprocessing(_str1, _str2, _mappings = nil)
103
- _mappings ||= TextAlignment::MAPPINGS
104
-
105
- long_to_one_mappings = _mappings.select{|m| m[0].length == 1 && m[1].length > 1}
106
- if long_to_one_mappings.empty?
107
- [_str1, _str2, _mappings]
108
- else
109
- ## long to one character mappings
110
- pletters = TextAlignment::PADDING_LETTERS
111
-
112
- # find the padding letter for str1
113
- @padding_letter1 = begin
114
- i = pletters.index{|l| _str2.index(l).nil?}
115
- raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
116
- TextAlignment::PADDING_LETTERS[i]
117
- end
106
+ private
118
107
 
119
- # find the padding letter for str2
120
- @padding_letter2 = begin
121
- i = pletters.index{|l| l != @padding_letter1 && _str1.index(l).nil?}
122
- raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
123
- TextAlignment::PADDING_LETTERS[i]
124
- end
108
+ def enmap_text(_text, char_mapping)
109
+ text = _text.dup
125
110
 
126
- str1 = str2 = nil
127
- long_to_one_mappings.each do |f|
128
- from = f[1]
129
-
130
- str1 = if _str2.index(f[0])
131
- to = f[0] + (@padding_letter1 * (f[1].length - 1))
132
- _str1.gsub(from, to)
133
- else
134
- _str1
135
- end
136
-
137
- str2 = if _str1.index(f[0])
138
- to = f[0] + (@padding_letter2 * (f[1].length - 1))
139
- _str2.gsub(from, to)
140
- else
141
- _str2
142
- end
111
+ # To execute the single letter mapping
112
+ char_mapping.each do |one, long|
113
+ text.gsub!(one, long) if long.length == 1
114
+ end
115
+
116
+ # To get the (location, length) index for replacements
117
+ loc_len = []
118
+ char_mapping.each do |one, long|
119
+ next if long.length == 1
120
+
121
+ init_next = 0
122
+ while loc = text.index(long, init_next)
123
+ loc_len << [loc, long.length]
124
+ init_next = loc + long.length
143
125
  end
144
- mappings = _mappings.select{|m| m[0].length > 1 || m[1].length == 1}
145
126
 
146
- [str1, str2, mappings]
127
+ # a workaround to avoid messing-up due to embedding
128
+ text.gsub!(long, one * long.length)
147
129
  end
148
- end
149
130
 
150
- def compute_similarity(_s1, _s2, sdiff)
151
- return 0 if sdiff.nil?
131
+ # To get the (location, length) index for consecutive whitespace sequences
132
+ init_next = 0
133
+ while loc = text.index(/\s{2,}/, init_next)
134
+ len = $~[0].length
135
+ loc_len << [loc, len]
136
+ init_next = loc + len
137
+ end
138
+
139
+ loc_len.sort!{|a, b| a[0] <=> b[0]}
140
+
141
+ # To get the offset_mapping before and after replacement
142
+ offset_mapping = []
143
+ init_next = 0
144
+ j = 0
152
145
 
153
- # compute the lcs only with non-whitespace letters
154
- lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
155
- return 0 if lcs == 0
146
+ loc_len.each do |loc, len|
147
+ offset_mapping += (init_next .. loc).map do |i|
148
+ j += 1
149
+ [i, j - 1]
150
+ end
151
+ init_next = loc + len
152
+ end
156
153
 
157
- s1 = if @padding_letter1
158
- _s1.tr(@padding_letter1, ' ')
159
- else
160
- _s1
154
+ offset_mapping += (init_next .. text.length).map do |i|
155
+ j += 1
156
+ [i, j - 1]
161
157
  end
162
158
 
163
- s2 = if @padding_letter2
164
- _s2.tr(@padding_letter2, ' ')
165
- else
166
- _s2
159
+ # To execute the long letter mapping
160
+ char_mapping.each do |one, long|
161
+ text.gsub!(one * long.length, one) if long.length > 1
167
162
  end
168
163
 
169
- similarity = lcs.to_f / [s1.scan(/\S/).count, s2.scan(/\S/).count].min
164
+ # To replace multi whitespace sequences to a space
165
+ text.gsub!(/\s{2,}/, ' ')
166
+
167
+ [text, offset_mapping]
170
168
  end
169
+ end
170
+
171
+ if __FILE__ == $0
172
+ require 'json'
173
+
174
+ unless ARGV.length == 1
175
+ warn "#{$0} an_annotation_json_file.json"
176
+ exit
177
+ end
178
+ annotations = JSON.parse File.read(ARGV[0]).strip, symbolize_names: true
179
+ denotations = annotations[:denotations] || []
180
+ if denotations.nil? && annotations[:tracks]
181
+ denotations = annotations[:tracks].first[:denotations]
182
+ end
183
+
184
+ text_mapping = TextAlignment::CharMapping.new(annotations[:text])
185
+ text_mapped = text_mapping.mapped_text
186
+ denotations_mapped = text_mapping.enmap_denotations(denotations)
187
+ new_annotations = {text:text_mapped, denotations:denotations_mapped}
171
188
 
189
+ puts new_annotations.to_json
172
190
  end