text_alignment 0.8.1 → 0.11.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0c49793b84e9ca5606a8fe9151530f6732337c2b92cfd1af1549a56ea3c2f39e
4
- data.tar.gz: 4a0b8328d4c6de43af50bd8c278f83facef311c74516dc3a9a0c9dd5f91fbfc0
3
+ metadata.gz: ef59c0cd578ed453a67edeb3b29059f4b76c4c541f777fa35a06e76b299e2564
4
+ data.tar.gz: 5594c0f6eb1d52cc331c210fdf936e7cb09e30277f21933027e55b0c8cfa0e24
5
5
  SHA512:
6
- metadata.gz: ad70de67b4a7b38290a59d89bed7cd9fa343ce3db62bb67b779e51b499929b9d7c4697871cf25017fa59a34c0525741d5da679559fc2f33d902fcda692f9f1ac
7
- data.tar.gz: 6242b35cbb8f53effc477b508d428d9060cba2e93895496134963755cff956f3931411a87a7bc0d33b7b91459f1a27eb67c422897693320e1b2ceb06eff6e22b
6
+ metadata.gz: ef5bae492d7e7b840c00943ac24e571392c4c992a085c6a63c8cb6db126ed9137ba94f8cd7af42a91e17aa327cb9a5ce24b909944c157f21beb8a88c8ce7528b
7
+ data.tar.gz: b6d84dcdc35399f91a0f6a5a24d84b11169de9d861a249acca52f0fec86e6f1fbf1bb4dbb47a5d43654fa43735b181aa64034447fb71ca675090818dcf67133a
@@ -26,8 +26,8 @@ def read_text(filename)
26
26
  end
27
27
  end
28
28
 
29
- def align_denotations(denotations, source_text, target_text, debug = false)
30
- alignment = TextAlignment::TextAlignment.new(source_text, target_text, denotations)
29
+ def align_denotations(denotations, source_text, alignment, debug = false)
30
+ alignment.align(source_text, denotations)
31
31
  new_denotations = alignment.transform_hdenotations(denotations)
32
32
 
33
33
  if debug
@@ -50,8 +50,8 @@ def align_denotations(denotations, source_text, target_text, debug = false)
50
50
  new_denotations
51
51
  end
52
52
 
53
- def align_mannotations(source_annotations, target_text, debug = false)
54
- target_annotations = {text:target_text}
53
+ def align_mannotations(source_annotations, reference_text, alignment, debug = false)
54
+ target_annotations = {text:reference_text}
55
55
 
56
56
  idnum_denotations = 0
57
57
  idnum_relations = 0
@@ -62,7 +62,8 @@ def align_mannotations(source_annotations, target_text, debug = false)
62
62
  if annotations.has_key?(:denotations) && !annotations[:denotations].empty?
63
63
  ididx = {}
64
64
  warn "[#{i}]-=-=-=-=-"
65
- denotations = align_denotations(annotations[:denotations], annotations[:text], target_text, debug)
65
+ denotations = align_denotations(annotations[:denotations], annotations[:text], alignment, debug)
66
+
66
67
  denotations.each do |d|
67
68
  reid = 'T' + (idnum_denotations += 1).to_s
68
69
  ididx[d[:id]] = reid
@@ -76,7 +77,9 @@ def align_mannotations(source_annotations, target_text, debug = false)
76
77
  annotations[:relations].each do |r|
77
78
  reid = 'R' + (idnum_relations += 1).to_s
78
79
  ididx[r[:id]] = reid
79
- target_annotations[:relations] << r.dup.merge({id:reid, subj:ididx[r[:subj]], obj:ididx[r[:obj]]})
80
+ sid = ididx[r[:subj]]
81
+ oid = ididx[r[:obj]]
82
+ target_annotations[:relations] << r.dup.merge({id:reid, subj:sid, obj:oid}) unless sid.nil? || oid.nil?
80
83
  end
81
84
  end
82
85
 
@@ -85,7 +88,8 @@ def align_mannotations(source_annotations, target_text, debug = false)
85
88
  annotations[:attributes].each do |a|
86
89
  reid = 'A' + (idnum_attributes += 1).to_s
87
90
  ididx[a[:id]] = reid
88
- target_annotations[:attributes] << a.dup.merge({id:reid, subj:ididx[a[:subj]]})
91
+ sid = ididx[a[:subj]]
92
+ target_annotations[:attributes] << a.dup.merge({id:reid, subj:sid}) unless sid.nil?
89
93
  end
90
94
  end
91
95
 
@@ -94,7 +98,8 @@ def align_mannotations(source_annotations, target_text, debug = false)
94
98
  annotations[:modifications].each do |m|
95
99
  reid = 'M' + (idnum_modifications += 1).to_s
96
100
  ididx[m[:id]] = reid
97
- target_annotations[:modifications] << m.dup.merge({id:reid, obj:ididx[m[:obj]]})
101
+ oid = ididx[m[:obj]]
102
+ target_annotations[:modifications] << m.dup.merge({id:reid, obj:oid}) unless oid.nil?
98
103
  end
99
104
  end
100
105
  end
@@ -109,13 +114,18 @@ unless ARGV.length == 2
109
114
  end
110
115
 
111
116
  source_annotations = read_annotations(ARGV[0])
112
- target_text = read_text(ARGV[1])
117
+ reference_text = read_text(ARGV[1])
118
+
119
+ alignment = TextAlignment::TextAlignment.new(reference_text, true)
113
120
 
114
121
  target_annotations = if source_annotations.class == Array
115
- align_mannotations(source_annotations, target_text, false)
122
+ # align_mannotations(source_annotations, reference_text, alignment, true)
123
+ align_mannotations(source_annotations, reference_text, alignment, false)
116
124
  else
117
- denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], target_text, false)
118
- source_annotations.merge({text:target_text, denotations:denotations})
125
+ # denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment)
126
+ denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment, false)
127
+ source_annotations.merge({text:reference_text, denotations:denotations})
119
128
  end
120
129
 
121
- # puts target_annotations.to_json
130
+ # pp alignment.block_alignment
131
+ puts target_annotations.to_json
@@ -6,92 +6,141 @@ module TextAlignment; end unless defined? TextAlignment
6
6
 
7
7
  class TextAlignment::AnchorFinder
8
8
 
9
- def initialize(source_str, target_str, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
10
- @size_ngram = _size_ngram || TextAlignment::SIZE_NGRAM
11
- @size_window = _size_window || TextAlignment::SIZE_WINDOW
12
- @sim_threshold = _text_similiarity_threshold || TextAlignment::TEXT_SIMILARITY_THRESHOLD
9
+ def initialize(source_str, target_str, cultivation_map)
10
+ @s1 = source_str.downcase
11
+ @s2 = target_str.downcase
13
12
 
14
- @reverse = (target_str.length < source_str.length)
13
+ @cultivation_map = cultivation_map
15
14
 
16
- @s1, @s2 = if @reverse
17
- [target_str.downcase, source_str.downcase]
18
- else
19
- [source_str.downcase, target_str.downcase]
20
- end
15
+ @size_ngram = TextAlignment::SIZE_NGRAM
16
+ @size_window = TextAlignment::SIZE_WINDOW
17
+ @sim_threshold = TextAlignment::TEXT_SIMILARITY_THRESHOLD
18
+ @pos_s1_final_possible_begin = @s1.length - @size_ngram - 1
19
+ @pos_s2_final_possible_end = @s2.length
21
20
 
22
- # current position in s1
23
- @beg_s1 = 0
24
- @end_s1_prev = 0
25
- @end_s2_prev = 0
21
+ # positions of last match
22
+ @pos_s1_last_match = 0
23
+ @pos_s2_last_match = 0
26
24
  end
27
25
 
28
26
  def get_next_anchor
29
- # find the position of an anchor ngram in s1 and s2
30
- while @beg_s1 < (@s1.length - @size_ngram)
31
- if [' ', "\n", "\t"].include? @s1[@beg_s1]
32
- @beg_s1 += 1
33
- next
34
- end
35
- anchor = @s1[@beg_s1, @size_ngram]
36
-
37
- # search_position = 0
38
- search_position = @end_s2_prev
39
- while @beg_s2 = @s2.index(anchor, search_position)
40
- # if both the begining points are sufficiantly close to the end points of the last match
41
- break if @beg_s1 > 0 && @beg_s2 > 0 && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
27
+ # To find the beginning positions of an anchor ngram in s1 and s2, beginning from the last positions matched
28
+ beg_s2 = for beg_s1 in @pos_s1_last_match .. @pos_s1_final_possible_begin
42
29
 
43
- left_window_s1, left_window_s2 = get_left_windows
44
- break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
30
+ # To skip whitespace letters
31
+ next if [' ', "\n", "\t"].include? @s1[beg_s1]
45
32
 
46
- right_window_s1, right_window_s2 = get_right_windows
47
- break if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
33
+ _beg_s2 = get_beg_s2(beg_s1)
34
+ break _beg_s2 unless _beg_s2.nil?
35
+ end
48
36
 
49
- search_position = @beg_s2 + 1
50
- end
37
+ # To return nil when it fails to find an anchor
38
+ return nil if beg_s2.class == Range
51
39
 
52
- break unless @beg_s2.nil?
40
+ # To extend the block to the left
41
+ b1 = beg_s1
42
+ b2 = beg_s2
43
+ left_boundary_b2 = [@pos_s2_last_match, (@cultivation_map.last_cultivated_position(b2) || 0)].max
44
+ while b1 > @pos_s1_last_match && b2 > left_boundary_b2 && @s1[b1 - 1] == @s2[b2 - 1]
45
+ b1 -= 1; b2 -= 1
46
+ end
53
47
 
54
- @beg_s1 += 1
48
+ # To extend the block to the right
49
+ e1 = beg_s1 + @size_ngram
50
+ e2 = beg_s2 + @size_ngram
51
+ right_boundary_b2 = @cultivation_map.next_cultivated_position(e2) || @pos_s2_final_possible_end
52
+ while @s1[e1] && e2 < right_boundary_b2 && @s1[e1] == @s2[e2]
53
+ e1 += 1; e2 += 1
55
54
  end
56
55
 
57
- return nil if @beg_s1 >= (@s1.length - @size_ngram)
56
+ @pos_s1_last_match = e1
57
+ @pos_s2_last_match = e2
58
58
 
59
- # extend the block
60
- b1 = @beg_s1
61
- b2 = @beg_s2
62
- while b1 >= @end_s1_prev && b2 >= @end_s2_prev && @s1[b1] == @s2[b2]
63
- b1 -= 1; b2 -= 1
64
- end
59
+ {source:{begin:b1 , end:e1}, target:{begin:b2, end:e2}}
60
+ end
65
61
 
66
- b1 += 1; b2 += 1
62
+ private
67
63
 
68
- e1 = @beg_s1 + @size_ngram
69
- e2 = @beg_s2 + @size_ngram
70
- while @s1[e1] && @s1[e1] == @s2[e2]
71
- e1 += 1; e2 += 1
64
# Looks up where the n-gram that starts at +beg_s1+ in s1 occurs in s2.
#
# The anchor is the @size_ngram characters of @s1 beginning at +beg_s1+.
# Returns nil when no candidate position is found; otherwise returns
# whatever find_valid_beg_s2 selects among the candidates (possibly nil).
def get_beg_s2(beg_s1)
  ngram = @s1[beg_s1, @size_ngram]

  # The search starts at the position of the last match in s2 rather than at 0,
  # on the assumption that both texts are in the same order.
  candidates = find_beg_s2_candidates(ngram, @pos_s2_last_match)

  candidates.empty? ? nil : find_valid_beg_s2(beg_s1, candidates)
end
77
+
78
# Collects the positions in @s2, at or after +search_position+, where +anchor+
# is found by @cultivation_map.index.
#
# anchor          - the string to search for
# search_position - the position in @s2 at which the search starts
# max_candidates  - the largest number of occurrences that is still accepted (default 5)
#
# Returns an array of begin positions in ascending order. Returns an empty
# array (not nil) both when the anchor is not found and when it occurs more
# than +max_candidates+ times: for speed, such a frequent anchor is skipped.
def find_beg_s2_candidates(anchor, search_position, max_candidates = 5)
  candidates = []

  while (beg_s2 = @cultivation_map.index(anchor, @s2, search_position))
    candidates << beg_s2

    # too frequent to be a useful anchor: give up on it
    return [] if candidates.length > max_candidates

    search_position = beg_s2 + 1
  end

  candidates
end
95
+
96
+ def find_valid_beg_s2(beg_s1, beg_s2_candidates)
97
+ valid_beg_s2 = nil
73
98
 
74
- @end_s1_prev = e1
75
- @end_s2_prev = e2
76
- @beg_s1 = e1
99
+ (10 .. 30).step(10).each do |size_window|
100
+ valid_beg_s2 = nil
77
101
 
78
- if @reverse
79
- {source:{begin:b2 , end:e2}, target:{begin:b1, end:e1}}
80
- else
81
- {source:{begin:b1 , end:e1}, target:{begin:b2, end:e2}}
102
+ r = beg_s2_candidates.each do |beg_s2|
103
+ # if both the begining points are sufficiantly close to the end points of the last match
104
+ # break if beg_s1 > 0 && beg_s2 > 0 && (beg_s1 - @pos_s1_last_match < 5) && (beg_s2 >= @pos_s2_last_match) && (beg_s2 - @pos_s2_last_match < 5)
105
+ if beg_s1 > 0 && beg_s2 > 0 && (beg_s1 - @pos_s1_last_match < 5) && (beg_s2 - @pos_s2_last_match < 5)
106
+ break unless valid_beg_s2.nil?
107
+ valid_beg_s2 = beg_s2
108
+ next
109
+ end
110
+
111
+ left_window_s1, left_window_s2 = get_left_windows(beg_s1, beg_s2)
112
+ if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
113
+ break unless valid_beg_s2.nil?
114
+ valid_beg_s2 = beg_s2
115
+ next
116
+ end
117
+
118
+ right_window_s1, right_window_s2 = get_right_windows(beg_s1, beg_s2)
119
+ if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
120
+ break unless valid_beg_s2.nil?
121
+ valid_beg_s2 = beg_s2
122
+ next
123
+ end
124
+ end
125
+
126
+ # r == nil means that the inner loop was broken (multiple candidates had passed the tests)
127
+ # r != nil means that the inner loop was completed (with or w/o a valid beg_s2 found)
128
+ break unless r.nil?
82
129
  end
130
+
131
+ valid_beg_s2
83
132
  end
84
133
 
85
- private
134
+ def get_left_windows(beg_s1, beg_s2, size_window = nil)
135
+ size_window ||= @size_window
86
136
 
87
- def get_left_windows
88
- # commend below with the assumption that the beginning of a document gives a significant locational information
89
- # return if @beg_s1 < @size_window || @beg_s2 < @size_window
137
+ # comment out below with the assumption that the beginning of a document gives a significant locational information
138
+ # return if @beg_s1 < size_window || @beg_s2 < size_window
90
139
 
91
140
  window_s1 = ''
92
- loc = @beg_s1 - 1
141
+ loc = beg_s1 - 1
93
142
  count = 0
94
- while count < @size_window && loc >= 0
143
+ while count < size_window && loc >= 0
95
144
  if @s1[loc] =~ /[0-9a-zA-Z]/
96
145
  window_s1 += @s1[loc]
97
146
  count += 1
@@ -100,9 +149,9 @@ class TextAlignment::AnchorFinder
100
149
  end
101
150
 
102
151
  window_s2 = ''
103
- loc = @beg_s2 - 1
152
+ loc = beg_s2 - 1
104
153
  count = 0
105
- while count < @size_window && loc >= 0
154
+ while count < size_window && loc >= 0
106
155
  if @s2[loc] =~ /[0-9a-zA-Z]/
107
156
  window_s2 += @s2[loc]
108
157
  count += 1
@@ -113,15 +162,17 @@ class TextAlignment::AnchorFinder
113
162
  [window_s1, window_s2]
114
163
  end
115
164
 
116
- def get_right_windows
165
+ def get_right_windows(beg_s1, beg_s2, size_window = nil)
166
+ size_window ||= @size_window
167
+
117
168
  # comment out below with the assumption that the end of a document gives significant locational information
118
- # return if (@beg_s1 + @size_ngram > (@s1.length - @size_window)) || (@beg_s2 + @size_ngram > (@s2.length - @size_window))
169
+ # return if (@beg_s1 + @size_ngram > (@s1.length - size_window)) || (@beg_s2 + @size_ngram > (@s2.length - size_window))
119
170
 
120
171
  window_s1 = ''
121
- loc = @beg_s1 + @size_ngram
172
+ loc = beg_s1 + @size_ngram
122
173
  len_s1 = @s1.length
123
174
  count = 0
124
- while count < @size_window && loc < len_s1
175
+ while count < size_window && loc < len_s1
125
176
  if @s1[loc] =~ /[0-9a-zA-Z]/
126
177
  window_s1 += @s1[loc]
127
178
  count += 1
@@ -130,10 +181,10 @@ class TextAlignment::AnchorFinder
130
181
  end
131
182
 
132
183
  window_s2 = ''
133
- loc = @beg_s2 + @size_ngram
184
+ loc = beg_s2 + @size_ngram
134
185
  len_s2 = @s2.length
135
186
  count = 0
136
- while count < @size_window && loc < len_s2
187
+ while count < size_window && loc < len_s2
137
188
  if @s2[loc] =~ /[0-9a-zA-Z]/
138
189
  window_s2 += @s2[loc]
139
190
  count += 1
@@ -148,5 +199,4 @@ class TextAlignment::AnchorFinder
148
199
  return 0 if str1.nil? || str2.nil?
149
200
  String::Similarity.cosine(str1, str2, ngram:ngram_order)
150
201
  end
151
-
152
- end
202
+ end
@@ -1,6 +1,6 @@
1
1
  module TextAlignment; end unless defined? TextAlignment
2
2
 
3
- TextAlignment::MAPPINGS = [
3
+ TextAlignment::CHAR_MAPPING = [
4
4
  ["©", "(c)"], #U+00A9 (Copyright Sign)
5
5
 
6
6
  ["α", "alpha"], #U+03B1 (greek small letter alpha)
@@ -61,9 +61,10 @@ TextAlignment::MAPPINGS = [
61
61
  ["•", "*"], #U+2022 (bullet)
62
62
  [" ", " "], #U+2009 (thin space)
63
63
  [" ", " "], #U+200A (hair space)
64
- [" ", " "], #U+00A0 (no-break space)
64
+ [" ", " "], #U+00A0 (Non-Breaking space)
65
65
  [" ", " "], #U+3000 (ideographic space)
66
- ["", "-"], #U+2211 (Non-Breaking Hyphen)
66
+ ["", "-"], #U+2010 (Hyphen)
67
+ ["‑", "-"], #U+2011 (Non-Breaking Hyphen)
67
68
  ["−", "-"], #U+2212 (minus sign)
68
69
  ["–", "-"], #U+2013 (en dash)
69
70
  ["′", "'"], #U+2032 (prime)
@@ -75,98 +76,114 @@ TextAlignment::MAPPINGS = [
75
76
  ]
76
77
 
77
78
 
78
- TextAlignment::PADDING_LETTERS = ['@', '^', '|', '#', '$', '%', '&', '_'] unless defined? TextAlignment::PADDING_LETTERS
79
+ class TextAlignment::CharMapping
80
+ attr_reader :mapped_text
79
81
 
82
+ def initialize(_text, char_mapping = nil)
83
+ char_mapping ||= TextAlignment::CHAR_MAPPING
84
+ @mapped_text, offset_mapping = enmap_text(_text, char_mapping)
85
+ @index_enmap = offset_mapping.to_h
86
+ @index_demap = offset_mapping.map{|m| m.reverse}.to_h
87
+ end
80
88
 
81
- class << TextAlignment
82
- def single_character_mapping_preprocessing(_str1, _str2, _mappings = nil)
83
- _mappings ||= TextAlignment::MAPPINGS
84
-
85
- character_mappings = _mappings.select{|m| m[0].length == 1 && m[1].length == 1}
86
- if character_mappings.empty?
87
- [_str1, _str2, _mappings]
88
- else
89
- characters_from = character_mappings.collect{|m| m[0]}.join
90
- characters_to = character_mappings.collect{|m| m[1]}.join
91
- characters_to.gsub!(/-/, '\-')
89
+ def enmap_position(position)
90
+ @index_enmap[position]
91
+ end
92
92
 
93
- str1 = _str1.tr(characters_from, characters_to)
94
- str2 = _str2.tr(characters_from, characters_to)
93
+ def demap_position(position)
94
+ @index_demap[position]
95
+ end
95
96
 
96
- mappings = _mappings.select{|m| m[0].length > 1 || m[1].length > 1}
97
+ def enmap_denotations(_denotations)
98
+ return nil if _denotations.nil?
97
99
 
98
- [str1, str2, mappings]
100
+ denotations = _denotations.map do |d|
101
+ d.dup.merge(span:{begin:enmap_position(d[:span][:begin]), end:enmap_position(d[:span][:end])})
99
102
  end
100
103
  end
101
104
 
102
- def long_to_one_mapping_preprocessing(_str1, _str2, _mappings = nil)
103
- _mappings ||= TextAlignment::MAPPINGS
104
-
105
- long_to_one_mappings = _mappings.select{|m| m[0].length == 1 && m[1].length > 1}
106
- if long_to_one_mappings.empty?
107
- [_str1, _str2, _mappings]
108
- else
109
- ## long to one character mappings
110
- pletters = TextAlignment::PADDING_LETTERS
111
-
112
- # find the padding letter for str1
113
- @padding_letter1 = begin
114
- i = pletters.index{|l| _str2.index(l).nil?}
115
- raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
116
- TextAlignment::PADDING_LETTERS[i]
117
- end
105
+ private
118
106
 
119
- # find the padding letter for str2
120
- @padding_letter2 = begin
121
- i = pletters.index{|l| l != @padding_letter1 && _str1.index(l).nil?}
122
- raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
123
- TextAlignment::PADDING_LETTERS[i]
124
- end
107
+ def enmap_text(_text, char_mapping)
108
+ text = _text.dup
125
109
 
126
- str1 = str2 = nil
127
- long_to_one_mappings.each do |f|
128
- from = f[1]
129
-
130
- str1 = if _str2.index(f[0])
131
- to = f[0] + (@padding_letter1 * (f[1].length - 1))
132
- _str1.gsub(from, to)
133
- else
134
- _str1
135
- end
136
-
137
- str2 = if _str1.index(f[0])
138
- to = f[0] + (@padding_letter2 * (f[1].length - 1))
139
- _str2.gsub(from, to)
140
- else
141
- _str2
142
- end
110
+ # To execute the single letter mapping
111
+ char_mapping.each do |one, long|
112
+ text.gsub!(one, long) if long.length == 1
113
+ end
114
+
115
+ # To get the (location, length) index for replacements
116
+ loc_len = []
117
+ char_mapping.each do |one, long|
118
+ next if long.length == 1
119
+
120
+ init_next = 0
121
+ while loc = text.index(long, init_next)
122
+ loc_len << [loc, long.length]
123
+ init_next = loc + long.length
143
124
  end
144
- mappings = _mappings.select{|m| m[0].length > 1 || m[1].length == 1}
145
125
 
146
- [str1, str2, mappings]
126
+ # a workaround to avoid messing-up due to embedding
127
+ text.gsub!(long, one * long.length)
147
128
  end
148
- end
149
129
 
150
- def compute_similarity(_s1, _s2, sdiff)
151
- return 0 if sdiff.nil?
130
+ # To get the (location, length) index for consecutive whitespace sequences
131
+ init_next = 0
132
+ while loc = text.index(/\s{2,}/, init_next)
133
+ len = $~[0].length
134
+ loc_len << [loc, len]
135
+ init_next = loc + len
136
+ end
137
+
138
+ loc_len.sort!{|a, b| a[0] <=> b[0]}
139
+
140
+ # To get the offset_mapping before and after replacement
141
+ offset_mapping = []
142
+ init_next = 0
143
+ j = 0
152
144
 
153
- # compute the lcs only with non-whitespace letters
154
- lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
155
- return 0 if lcs == 0
145
+ loc_len.each do |loc, len|
146
+ offset_mapping += (init_next .. loc).map do |i|
147
+ j += 1
148
+ [i, j - 1]
149
+ end
150
+ init_next = loc + len
151
+ end
156
152
 
157
- s1 = if @padding_letter1
158
- _s1.tr(@padding_letter1, ' ')
159
- else
160
- _s1
153
+ offset_mapping += (init_next .. text.length).map do |i|
154
+ j += 1
155
+ [i, j - 1]
161
156
  end
162
157
 
163
- s2 = if @padding_letter2
164
- _s2.tr(@padding_letter2, ' ')
165
- else
166
- _s2
158
+ # To execute the long letter mapping
159
+ char_mapping.each do |one, long|
160
+ text.gsub!(one * long.length, one) if long.length > 1
167
161
  end
168
162
 
169
- similarity = lcs.to_f / [s1.scan(/\S/).count, s2.scan(/\S/).count].min
163
+ # To replace multi whitespace sequences to a space
164
+ text.gsub!(/\s{2,}/, ' ')
165
+
166
+ [text, offset_mapping]
170
167
  end
168
+ end
169
+
170
+ if __FILE__ == $0
171
+ require 'json'
172
+
173
+ unless ARGV.length == 1
174
+ warn "#{$0} an_annotation_json_file.json"
175
+ exit
176
+ end
177
+ annotations = JSON.parse File.read(ARGV[0]).strip, symbolize_names: true
178
+ denotations = annotations[:denotations]
179
+ if denotations.nil? && annotations[:tracks]
180
+ denotations = annotations[:tracks].first[:denotations]
181
+ end
182
+
183
+ text_mapping = TextAlignment::CharMapping.new(annotations[:text])
184
+ text_mapped = text_mapping.mapped_text
185
+ denotations_mapped = text_mapping.enmap_denotations(denotations)
186
+ new_annotations = {text:text_mapped, denotations:denotations_mapped}
171
187
 
188
+ puts new_annotations.to_json
172
189
  end
@@ -0,0 +1,94 @@
1
module TextAlignment; end unless defined? TextAlignment

# Keeps a list of [begin, end] regions (the "cultivated" regions), sorted by
# begin position and free of overlaps, and answers position queries on it.
# It also offers a substring search (#index) that skips matches touching
# any of those regions. #index queries a match as [begin, begin + length],
# i.e. the end position is exclusive.
class TextAlignment::CultivationMap
  # The current list of regions, sorted by begin position.
  attr_reader :map

  def initialize
    @map = []
  end

  # Adds +regions+ (an array of [begin, end] pairs) to the map.
  #
  # A region that begins exactly where the previous one ends is merged into it.
  # The given arrays are copied, so the caller's regions are never modified.
  #
  # Raises RuntimeError when a region begins before the previous one ends;
  # in that case the map is left as it was before the call.
  #
  # Returns the updated map.
  def cultivate(regions)
    merged = []

    (@map + regions).sort_by { |r| r[0] }.each do |region|
      last = merged.last

      if last.nil? || last[1] < region[0]
        merged << region.dup
      elsif last[1] > region[0]
        raise "Overlapping regions: #{last} : #{region}"
      else
        # adjacent regions: extend the previous one to cover this one
        last[1] = region[1]
      end
    end

    @map = merged
  end

  # Finds the first region whose end is greater than +end_position+
  # (which defaults to +position+).
  #
  # Returns the end of that region when it begins at or before +position+,
  # i.e. the position from which a search may be resumed; otherwise nil.
  def search_again_position(position, end_position = nil)
    end_position ||= position
    region = @map.bsearch { |r| end_position < r[1] }

    return nil if region.nil? || region[0] > position

    region[1]
  end

  # Returns the end of the last region that ends at or before +position+,
  # or nil when there is no such region.
  def last_cultivated_position(position)
    region = @map.reverse_each.find { |r| r[1] <= position }
    region && region[1]
  end

  # Returns the begin of the first region that begins after +position+,
  # or nil when there is no such region.
  def next_cultivated_position(position)
    region = @map.bsearch { |r| position < r[0] }
    region && region[0]
  end

  # Returns the regions of the map whose end lies in (region[0], region[1]]
  # or whose begin lies in [region[0], region[1]).
  #
  # NOTE(review): a map region that strictly contains +region+ on both sides
  # satisfies neither condition and is not returned. #index checks
  # #search_again_position first, which covers that case there - confirm
  # whether other callers rely on this.
  def in_regions(region)
    @map.select do |r|
      (r[1] > region[0] && r[1] <= region[1]) || (r[0] < region[1] && r[0] >= region[0])
    end
  end

  # Classifies +region+ against the regions reported by #in_regions.
  #
  # Returns a pair [state, span]:
  #   [:open, region]                  - no map region is involved
  #   [:middle_closed, [first_end, last_begin]] - open at both the front and the rear
  #   [:front_open, [region[0], first_begin]]   - open at the front only
  #   [:rear_open, [last_end, region[1]]]       - open at the rear only
  #   [:closed, nil]                   - open at neither side
  def region_state(region)
    closed_parts = in_regions(region)
    return [:open, region] if closed_parts.empty?

    front = front_open?(region, closed_parts)
    rear = rear_open?(region, closed_parts)

    if front && rear
      [:middle_closed, [closed_parts.first[1], closed_parts.last[0]]]
    elsif front
      [:front_open, [region[0], closed_parts.first[0]]]
    elsif rear
      [:rear_open, [closed_parts.last[1], region[1]]]
    else
      [:closed, nil]
    end
  end

  # Searches +string+ for +target+ from +position+ on, skipping every
  # occurrence that touches a region of the map.
  #
  # Returns the begin position of the first occurrence whose span is
  # entirely open, or nil when there is none.
  def index(target, string, position)
    length = target.length
    loop do
      _begin = string.index(target, position)
      break if _begin.nil?

      # the match begins inside a region: resume the search at the end of that region
      position = search_again_position(_begin)
      next unless position.nil?

      break _begin if region_state([_begin, _begin + length])[0] == :open

      # the match runs into a region: try the next occurrence
      position = _begin + 1
    end
  end

  private

  # True when the first closed part begins after the begin of +region+.
  def front_open?(region, closed_parts)
    closed_parts.first[0] > region[0]
  end

  # True when the last closed part ends before the end of +region+.
  def rear_open?(region, closed_parts)
    closed_parts.last[1] < region[1]
  end
end
@@ -5,7 +5,7 @@ require 'text_alignment/find_divisions'
5
5
  require 'text_alignment/lcs_comparison'
6
6
  require 'text_alignment/lcs_alignment'
7
7
  require 'text_alignment/glcs_alignment'
8
- require 'text_alignment/mappings'
8
+ require 'text_alignment/char_mapping'
9
9
 
10
10
  module TextAlignment; end unless defined? TextAlignment
11
11
 
@@ -106,7 +106,7 @@ if __FILE__ == $0
106
106
 
107
107
  dictionary = [["β", "beta"]]
108
108
  # align = TextAlignment::TextAlignment.new(str1, str2)
109
- align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::MAPPINGS)
109
+ align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::CHAR_MAPPING)
110
110
  p align.common_elements
111
111
  p align.mapped_elements
112
112
  end
@@ -6,7 +6,7 @@ require 'text_alignment/lcs_comparison'
6
6
  require 'text_alignment/lcs_alignment'
7
7
  require 'text_alignment/lcs_cdiff'
8
8
  require 'text_alignment/glcs_alignment'
9
- require 'text_alignment/mappings'
9
+ require 'text_alignment/char_mapping'
10
10
 
11
11
  module TextAlignment; end unless defined? TextAlignment
12
12
 
@@ -20,7 +20,9 @@ class TextAlignment::MixedAlignment
20
20
  def initialize(_str1, _str2, _mappings = nil)
21
21
  raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
22
22
 
23
- str1, str2, mappings = TextAlignment::long_to_one_mapping_preprocessing(_str1, _str2, _mappings)
23
+ mappings ||= TextAlignment::CHAR_MAPPING
24
+ str1 = _str1.dup
25
+ str2 = _str2.dup
24
26
 
25
27
  _compute_mixed_alignment(str1, str2, mappings)
26
28
  end
@@ -63,7 +65,7 @@ class TextAlignment::MixedAlignment
63
65
  end
64
66
 
65
67
  cmp = TextAlignment::LCSComparison.new(str1, str2, lcs, @sdiff)
66
- @similarity = TextAlignment::compute_similarity(str1, str2, @sdiff)
68
+ @similarity = compute_similarity(str1, str2, @sdiff)
67
69
  @str1_match_initial = cmp.str1_match_initial
68
70
  @str1_match_final = cmp.str1_match_final
69
71
  @str2_match_initial = cmp.str2_match_initial
@@ -139,4 +141,20 @@ class TextAlignment::MixedAlignment
139
141
  @position_map_end = posmap_end.sort.to_h
140
142
  end
141
143
 
144
# Computes a similarity score from an element-wise diff.
#
# s1, s2 - not used by the computation; kept for the callers' interface
# sdiff  - a sequence of changes, each responding to #action and #old_element
#          (presumably the output of Diff::LCS.sdiff - confirm against caller)
#
# The score is coverage * fragmentation rate, where
#   coverage           = matched non-whitespace old elements / all non-whitespace old elements
#   fragmentation rate = whitespace old elements / number of runs of matched non-whitespace elements
#
# Returns 0 when +sdiff+ is nil, when the old side has no non-whitespace
# element, or when nothing matches (instead of a NaN from a division by zero).
def compute_similarity(s1, s2, sdiff)
  return 0 if sdiff.nil?

  # recoverability
  count_nws = sdiff.count { |d| d.old_element =~ /\S/ }
  return 0 if count_nws.zero?

  count_nws_match = sdiff.count { |d| d.action == '=' && d.old_element =~ /\S/ }
  coverage = count_nws_match.to_f / count_nws

  # fragmentation rate
  count_ofrag = sdiff.count { |d| d.old_element =~ /\s/ }
  # matched whitespace breaks a run, so every run of '=' is one matched fragment
  count_frag = sdiff.map { |d| (d.action == '=') && (d.old_element =~ /\s/) ? ' ' : d.action }.join.scan(/=+/).count
  return 0 if count_frag.zero?

  rate_frag = count_ofrag.to_f / count_frag

  coverage * rate_frag
end
142
160
  end
@@ -2,6 +2,7 @@
2
2
  require 'text_alignment/constants'
3
3
  require 'text_alignment/anchor_finder'
4
4
  require 'text_alignment/mixed_alignment'
5
+ require 'text_alignment/cultivation_map'
5
6
 
6
7
  module TextAlignment; end unless defined? TextAlignment
7
8
 
@@ -10,253 +11,71 @@ class TextAlignment::TextAlignment
10
11
  attr_reader :similarity
11
12
  attr_reader :lost_annotations
12
13
 
13
- def initialize(_str1, _str2, denotations = nil, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
14
- raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
14
+ # Initialize with a reference text, again which texts will be aligned
15
+ def initialize(reference_text, to_prevent_overlap = false)
16
+ raise ArgumentError, "nil text" if reference_text.nil?
15
17
 
16
- @block_alignment = {source_text:_str1, target_text:_str2}
17
- @original_str1 = _str1
18
- @original_str2 = _str2
18
+ @original_reference_text = reference_text
19
+ @rtext_mapping = TextAlignment::CharMapping.new(reference_text)
20
+ @mapped_reference_text = @rtext_mapping.mapped_text
21
+ @to_prevent_overlap = to_prevent_overlap
19
22
 
20
- str1, str2, @mappings = TextAlignment::single_character_mapping_preprocessing(_str1, _str2)
21
-
22
- if r = whole_block_alignment(str1, str2)
23
- @block_alignment[:blocks] = r
24
- return
25
- end
23
+ @original_text = nil
24
+ @block_alignment = nil
25
+ @cultivation_map = TextAlignment::CultivationMap.new
26
+ end
26
27
 
27
- ## to find block alignments
28
- anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
28
+ def align(text, denotations = nil)
29
+ # To maintain the cultivation map
30
+ update_cultivation_map if @to_prevent_overlap
29
31
 
30
- blocks = []
31
- while block = anchor_finder.get_next_anchor
32
- last = blocks.last
33
- if last && (block[:source][:begin] == last[:source][:end] + 1) && (block[:target][:begin] == last[:target][:end] + 1)
34
- last[:source][:end] = block[:source][:end]
35
- last[:target][:end] = block[:target][:end]
36
- else
37
- blocks << block.merge(alignment: :block, delta: block[:target][:begin] - block[:source][:begin])
38
- end
32
+ # In case the input text is the same as the previous one, reuse the previous text mapping
33
+ unless @original_text && @original_text == text
34
+ @original_text = text
35
+ @text_mapping = TextAlignment::CharMapping.new(text)
39
36
  end
40
37
 
41
- # pp blocks
42
- # puts "-----"
43
- # puts
44
- # exit
45
- # blocks.each do |b|
46
- # p [b[:source], b[:target]]
47
- # puts "---"
48
- # puts str1[b[:source][:begin] ... b[:source][:end]]
49
- # puts "---"
50
- # puts str2[b[:target][:begin] ... b[:target][:end]]
51
- # puts "====="
52
- # puts
53
- # end
54
- # puts "-=-=-=-=-"
55
- # puts
56
-
57
- ## to fill the gaps
58
- last_block = nil
59
- blocks2 = blocks.inject([]) do |sum, block|
60
- b1 = last_block ? last_block[:source][:end] : 0
61
- e1 = block[:source][:begin]
62
-
63
- sum += if b1 == e1
64
- [block]
65
- else
66
- b2 = last_block ? last_block[:target][:end] : 0
67
- e2 = block[:target][:begin]
68
-
69
- if b2 == e2
70
- [
71
- {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
72
- block
73
- ]
74
- else
75
- if b1 == 0 && b2 == 0
76
- len_buffer = (e1 * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
77
- b2 = e2 - len_buffer if e2 > len_buffer
78
- end
79
-
80
- _str1 = str1[b1 ... e1]
81
- _str2 = str2[b2 ... e2]
82
-
83
- if _str1.strip.empty? || _str2.strip.empty?
84
- [
85
- {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
86
- block
87
- ]
88
- else
89
- local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations) << block
90
- end
91
- end
92
- end
38
+ @mapped_text = @text_mapping.mapped_text
39
+ denotations_mapped = @text_mapping.enmap_denotations(denotations)
93
40
 
94
- last_block = block
95
- sum
96
- end
41
+ ## To generate the block_alignment of the input text against the reference text
42
+ # Initialization
43
+ # @block_alignment = {text: @original_text, reference_text: @original_reference_text, denotations: denotations}
44
+ @block_alignment = {text: @mapped_text, reference_text: @mapped_reference_text, denotations: denotations}
97
45
 
98
- # the last step
99
- blocks2 += if last_block.nil?
100
- local_alignment_blocks(str1, 0, str1.length, str2, 0, str2.length, denotations)
46
+ # Generation
47
+ @block_alignment[:blocks] = if r = whole_block_alignment(@mapped_text, @mapped_reference_text, @cultivation_map)
48
+ r
101
49
  else
102
- b1 = last_block[:source][:end]
103
- if b1 < str1.length
104
- e1 = str1.length
105
-
106
- b2 = last_block[:target][:end]
107
- if b2 < str2.length
108
- len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
109
- e2 = (str2.length - b2) > len_buffer ? b2 + len_buffer : str2.length
110
- local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations)
111
- else
112
- [{source:{begin:last_block[:source][:end], end:str1.length}, alignment: :empty}]
113
- end
114
- else
115
- []
116
- end
50
+ find_block_alignment(@mapped_text, @mapped_reference_text, denotations_mapped, @cultivation_map)
117
51
  end
118
-
119
- @block_alignment[:blocks] = blocks2
120
52
  end
121
53
 
122
- def whole_block_alignment(str1, str2)
123
- ## Block exact match
124
- block_begin = str2.index(str1)
125
- unless block_begin.nil?
126
- return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
127
- end
54
+ def update_cultivation_map
55
+ return if @block_alignment.nil? || @block_alignment[:blocks].nil?
128
56
 
129
- block_begin = str2.downcase.index(str1.downcase)
130
- unless block_begin.nil?
131
- return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
132
- end
133
-
134
- nil
135
- end
136
-
137
- def local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations = nil)
138
- block2 = str2[b2 ... e2]
139
-
140
- ## term-based alignment
141
- tblocks = if denotations
142
- ds_in_scope = denotations.select{|d| d[:span][:begin] >= b1 && d[:span][:end] <= e1}.
143
- sort{|d1, d2| d1[:span][:begin] <=> d2[:span][:begin] || d2[:span][:end] <=> d1[:span][:end] }.
144
- map{|d| d.merge(lex:str1[d[:span][:begin] ... d[:span][:end]])}
145
-
146
- position = 0
147
- tblocks = ds_in_scope.map do |term|
148
- lex = term[:lex]
149
- r = block2.index(lex, position)
150
- if r.nil?
151
- position = nil
152
- break
153
- end
154
- position = r + lex.length
155
- {source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, delta: r - term[:span][:begin]}
156
- end
157
-
158
- # missing term found
159
- tblocks = [] if position.nil?
160
-
161
- # redundant matching found
162
- unless position.nil?
163
- ds_in_scope.each do |term|
164
- lex = term[:lex]
165
- look_forward = block2.index(lex, position)
166
- unless look_forward.nil?
167
- tblocks = []
168
- break
169
- end
170
- end
171
- end
172
-
173
- tblocks
174
- else
175
- []
176
- end
177
-
178
- if tblocks.empty?
179
- if b1 == 0 && e1 == str1.length
180
- if (e1 > 2000) || (e2 > 2000)
181
- [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
182
- else
183
- block1 = str1[b1 ... e1]
184
- block2 = str2[b2 ... e2]
185
-
186
- ## character-based alignment
187
- alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase, @mappings)
188
- if alignment.sdiff.nil?
189
- [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
190
- else
191
- [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
192
- end
193
- end
57
+ ## To update the cultivation map
58
+ newly_cultivated_regions = @block_alignment[:blocks].collect do |b|
59
+ if b[:alignment] == :block || b[:alignment] == :term
60
+ [b[:target][:begin], b[:target][:end]]
194
61
  else
195
- block1 = str1[b1 ... e1]
196
- block2 = str2[b2 ... e2]
197
-
198
- ## character-based alignment
199
- alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase, @mappings)
200
- if alignment.sdiff.nil?
201
- [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
202
- else
203
- [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
204
- end
205
- end
206
- else
207
- last_tblock = nil
208
- lblocks = tblocks.inject([]) do |sum, tblock|
209
- tb1 = last_tblock ? last_tblock[:source][:end] : b1
210
- te1 = tblock[:source][:begin]
211
-
212
- sum += if te1 == tb1
213
- [tblock]
214
- else
215
- tb2 = last_tblock ? last_tblock[:target][:end] : b2
216
- te2 = tblock[:target][:begin]
217
-
218
- if b2 == e2
219
- [
220
- {source:{begin:tb1, end:te1}, alignment: :empty},
221
- tblock
222
- ]
223
- else
224
- [
225
- {source:{begin:tb1, end:te1}, target:{begin:tb2, end:te2}, alignment: :empty},
226
- tblock
227
- ]
228
- end
229
- end
230
-
231
- last_tblock = tblock
232
- sum
62
+ nil
233
63
  end
234
-
235
- if last_tblock[:source][:end] < e1
236
- if last_tblock[:target][:end] < e2
237
- lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, target:{begin:last_tblock[:target][:end], end:e2}, alignment: :empty}
238
- else
239
- lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, alignment: :empty}
240
- end
64
+ end.compact.inject([]) do |condensed, region|
65
+ if condensed.empty? || (condensed.last.last + 1 < region.first)
66
+ condensed.push region
67
+ else
68
+ condensed.last[1] = region.last
241
69
  end
242
-
243
- lblocks
70
+ condensed
244
71
  end
245
- end
246
-
247
72
 
248
- def indices(str, target)
249
- position = 0
250
- len = target.len
251
- Enumerator.new do |yielder|
252
- while idx = str.index(target, position)
253
- yielder << idx
254
- position = idx + len
255
- end
256
- end
73
+ @cultivation_map.cultivate(newly_cultivated_regions)
257
74
  end
258
75
 
259
- def transform_begin_position(begin_position)
76
+ def transform_begin_position(_begin_position)
77
+ begin_position = @text_mapping.enmap_position(_begin_position)
78
+
260
79
  i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
261
80
  block = @block_alignment[:blocks][i]
262
81
 
@@ -272,9 +91,13 @@ class TextAlignment::TextAlignment
272
91
  r = block[:alignment].transform_begin_position(begin_position - block[:source][:begin])
273
92
  r.nil? ? nil : r + block[:target][:begin]
274
93
  end
94
+
95
+ @rtext_mapping.demap_position(b)
275
96
  end
276
97
 
277
- def transform_end_position(end_position)
98
+ def transform_end_position(_end_position)
99
+ end_position = @text_mapping.enmap_position(_end_position)
100
+
278
101
  i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
279
102
  block = @block_alignment[:blocks][i]
280
103
 
@@ -290,6 +113,8 @@ class TextAlignment::TextAlignment
290
113
  r = block[:alignment].transform_end_position(end_position - block[:source][:begin])
291
114
  r.nil? ? nil : r + block[:target][:begin]
292
115
  end
116
+
117
+ @rtext_mapping.demap_position(e)
293
118
  end
294
119
 
295
120
  def transform_a_span(span)
@@ -308,7 +133,7 @@ class TextAlignment::TextAlignment
308
133
  source = {begin:d.begin, end:d.end}
309
134
  d.begin = transform_begin_position(d.begin);
310
135
  d.end = transform_end_position(d.end);
311
- raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_str2.length
136
+ raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_reference_text.length
312
137
  rescue
313
138
  @lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
314
139
  d.begin = nil
@@ -324,7 +149,7 @@ class TextAlignment::TextAlignment
324
149
 
325
150
  r = hdenotations.collect do |d|
326
151
  t = transform_a_span(d[:span])
327
- raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_str2.length
152
+ raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_reference_text.length
328
153
  new_d = d.dup.merge({span:t})
329
154
  rescue
330
155
  @lost_annotations << {source: d[:span], target:t}
@@ -335,8 +160,8 @@ class TextAlignment::TextAlignment
335
160
  end
336
161
 
337
162
  def alignment_show
338
- stext = @block_alignment[:source_text]
339
- ttext = @block_alignment[:target_text]
163
+ stext = @mapped_text
164
+ ttext = @mapped_reference_text
340
165
 
341
166
  show = ''
342
167
  @block_alignment[:blocks].each do |a|
@@ -392,9 +217,192 @@ class TextAlignment::TextAlignment
392
217
 
393
218
  "***** local mismatch [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}] (similarity: #{a[:similarity]})\n" +
394
219
  "[#{astr1}]\n" +
395
- "[#{astr2}]\n\n"
220
+ "[#{astr2.gsub("\n", " ")}]\n\n"
396
221
  end
397
222
  end
398
223
  show
399
224
  end
225
+
226
+ private
227
+
228
+ def find_block_alignment(str1, str2, denotations, cultivation_map)
229
+ ## to find block alignments
230
+ anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, cultivation_map)
231
+
232
+ blocks = []
233
+ while block = anchor_finder.get_next_anchor
234
+ last = blocks.last
235
+ if last && (block[:source][:begin] == last[:source][:end] + 1) && (block[:target][:begin] == last[:target][:end] + 1)
236
+ last[:source][:end] = block[:source][:end]
237
+ last[:target][:end] = block[:target][:end]
238
+ else
239
+ blocks << block.merge(alignment: :block, delta: block[:target][:begin] - block[:source][:begin])
240
+ end
241
+ end
242
+
243
+ # pp blocks
244
+ # puts "-----"
245
+ # puts
246
+ # exit
247
+ # blocks.each do |b|
248
+ # p [b[:source], b[:target]]
249
+ # puts "---"
250
+ # puts str1[b[:source][:begin] ... b[:source][:end]]
251
+ # puts "---"
252
+ # puts str2[b[:target][:begin] ... b[:target][:end]]
253
+ # puts "====="
254
+ # puts
255
+ # end
256
+ # puts "-=-=-=-=-"
257
+ # puts
258
+
259
+ ## To fill the gaps
260
+ ## lblock: last block, cblock: current block
261
+ lblock = nil
262
+ blocks2 = (blocks + [nil]).inject([]) do |sum, cblock|
263
+ b1 = lblock.nil? ? 0 : lblock[:source][:end]
264
+ e1 = cblock.nil? ? str1.length : cblock[:source][:begin]
265
+
266
+ if b1 < e1
267
+ b2 = lblock.nil? ? 0 : lblock[:target][:end]
268
+ e2 = cblock.nil? ? str2.length : cblock[:target][:begin]
269
+ _str1 = str1[b1 ... e1]
270
+ _str2 = str2[b2 ... e2]
271
+
272
+ sum += if _str1.strip.empty? || _str2.strip.empty?
273
+ [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
274
+ else
275
+ len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
276
+ region_state, state_region = cultivation_map.region_state([b2, e2])
277
+ case region_state
278
+ when :closed
279
+ []
280
+ when :front_open
281
+ oe2 = state_region[1]
282
+ me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
283
+ local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
284
+ when :rear_open
285
+ ob2 = state_region[0]
286
+ mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
287
+ local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
288
+ when :middle_closed
289
+ oe2 = state_region[0]
290
+ me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
291
+ attempt1 = local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
292
+ if attempt1.empty?
293
+ ob2 = state_region[1]
294
+ mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
295
+ local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
296
+ else
297
+ attempt1
298
+ end
299
+ else # :open
300
+ if (e2 - b2) > len_buffer
301
+ attempt1 = local_alignment(str1, b1, e1, str2, b2, b2 + len_buffer, denotations, cultivation_map)
302
+ if attempt1.empty?
303
+ local_alignment(str1, b1, e1, str2, e2 - len_buffer, e2, denotations, cultivation_map)
304
+ else
305
+ attempt1
306
+ end
307
+ else
308
+ local_alignment(str1, b1, e1, str2, b2, e2, denotations, cultivation_map)
309
+ end
310
+ end
311
+ end
312
+ end
313
+
314
+ lblock = cblock
315
+ cblock.nil? ? sum : sum << cblock
316
+ end
317
+
318
+ end
319
+
320
+ def whole_block_alignment(str1, str2, cultivation_map)
321
+ block_begin = cultivation_map.index(str1, str2, 0)
322
+ return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}] unless block_begin.nil?
323
+
324
+ block_begin = cultivation_map.index(str1.downcase, str2.downcase, 0)
325
+ return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}] unless block_begin.nil?
326
+
327
+ nil
328
+ end
329
+
330
+ def local_alignment(str1, b1, e1, str2, b2, e2, denotations = nil, cultivation_map)
331
+ tblocks = term_based_alignment(str1, b1, e1, str2, b2, e2, denotations, cultivation_map)
332
+ if tblocks.empty?
333
+ lcs_alignment(str1, b1, e1, str2, b2, e2, cultivation_map)
334
+ else
335
+ tblocks
336
+ end
337
+ end
338
+
339
+ def term_based_alignment(str1, b1, e1, str2, b2, e2, denotations = nil, cultivation_map)
340
+ str2_block = str2[0 ... e2]
341
+
342
+ ## term-based alignment
343
+ tblocks = if denotations
344
+ denotations_in_scope = denotations.select{|d| d[:span][:begin] >= b1 && d[:span][:end] <= e1}.
345
+ sort{|d1, d2| d1[:span][:begin] <=> d2[:span][:begin] || d2[:span][:end] <=> d1[:span][:end] }.
346
+ map{|d| d.merge(lex:str1[d[:span][:begin] ... d[:span][:end]])}
347
+
348
+ search_position = b2
349
+ _tblocks = denotations_in_scope.map do |denotation|
350
+ lex = denotation[:lex]
351
+ term_begin = cultivation_map.index(lex, str2_block, search_position)
352
+ break [] if term_begin.nil? # break the loop if a missing term is found
353
+ search_position = term_begin + lex.length
354
+ {source:denotation[:span], target:{begin:term_begin, end:term_begin + lex.length}, alignment: :term, similarity: 0.9, delta: term_begin - denotation[:span][:begin]}
355
+ end
356
+
357
+ # redundant matching found
358
+ unless _tblocks.empty?
359
+ search_position = _tblocks.last[:target][:end]
360
+ denotations_in_scope.each do |term|
361
+ look_forward = cultivation_map.index(term[:lex], str2_block, search_position)
362
+ unless look_forward.nil?
363
+ _tblocks = []
364
+ break
365
+ end
366
+ end
367
+ end
368
+
369
+ _tblocks
370
+ else
371
+ []
372
+ end
373
+
374
+ ltblock = nil
375
+ tblocks2 = (tblocks + [nil]).inject([]) do |sum, ctblock|
376
+ tb1 = ltblock.nil? ? b1 : ltblock[:source][:end]
377
+ te1 = ctblock.nil? ? e1 : ctblock[:source][:begin]
378
+
379
+ if te1 > tb1
380
+ tb2 = ltblock.nil? ? b2 : ltblock[:target][:end]
381
+ te2 = ctblock.nil? ? e2 : ctblock[:target][:begin]
382
+ sum << {source:{begin:tb1, end:te1}, target:{begin:tb2, end:te2}, alignment: :empty}
383
+ end
384
+
385
+ ltblock = ctblock
386
+ ctblock.nil? ? sum : sum << ctblock
387
+ end
388
+
389
+ tblocks2
390
+ end
391
+
392
+ def lcs_alignment(str1, b1, e1, str2, b2, e2, cultivation_map)
393
+ source = {begin:b1, end:e1}
394
+ target = {begin:b2, end:e2}
395
+
396
+ if (e1 - b1) > 2000
397
+ [{source:source, target:target, alignment: :empty}]
398
+ else
399
+ alignment = TextAlignment::MixedAlignment.new(str1[b1 ... e1].downcase, str2[b2 ... e2].downcase)
400
+ if alignment.similarity < 0.5
401
+ [{source:source, target:target, alignment: :empty}]
402
+ else
403
+ [{source:source, target:target, alignment: alignment, similarity: alignment.similarity}]
404
+ end
405
+ end
406
+ end
407
+
400
408
  end
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.8.1'
2
+ VERSION = '0.11.1'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.1
4
+ version: 0.11.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-10-26 00:00:00.000000000 Z
11
+ date: 2021-03-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary
@@ -77,7 +77,9 @@ files:
77
77
  - lib/text_alignment.rb
78
78
  - lib/text_alignment/anchor_finder.rb
79
79
  - lib/text_alignment/approximate_fit.rb
80
+ - lib/text_alignment/char_mapping.rb
80
81
  - lib/text_alignment/constants.rb
82
+ - lib/text_alignment/cultivation_map.rb
81
83
  - lib/text_alignment/find_divisions.rb
82
84
  - lib/text_alignment/glcs_alignment.rb
83
85
  - lib/text_alignment/glcs_alignment_fast.rb
@@ -86,7 +88,6 @@ files:
86
88
  - lib/text_alignment/lcs_cdiff.rb
87
89
  - lib/text_alignment/lcs_comparison.rb
88
90
  - lib/text_alignment/lcs_min.rb
89
- - lib/text_alignment/mappings.rb
90
91
  - lib/text_alignment/mixed_alignment.rb
91
92
  - lib/text_alignment/text_alignment.rb
92
93
  - lib/text_alignment/version.rb