text_alignment 0.10.1 → 0.11.5

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 710fa3dfe07e268c62de77c67abb52e1893a36a9c0fbdfbf7400328ab7f8358a
4
- data.tar.gz: 31a086666978c4776a60b0ce19fde48beeb8a5bce599b7572f20089cfb5c7500
3
+ metadata.gz: abf4387fb9ea598d924eb0a7369ca58ec0be7cdd35ac89abc57647d665c00207
4
+ data.tar.gz: 30ab0ccea5c04cae8132a70872cf5de4704ce7fd4e2104f7fddc6527794424c5
5
5
  SHA512:
6
- metadata.gz: d0a752d2203f65e48b4f10ff29eb29a77504e7cef4da1dd3602e2ea5107a99b7edff4c3609c79c7dcba0f9cc458780dc4788ff8123a781a4d77ae89236170bf5
7
- data.tar.gz: '094ff5f662ccdc2fc14f81549126362be49fd64610f994c6ba5c4457fbbbececc1931257589ebde49211c2bce744ca033bfe3673c8af5c8a01af7679bfb626cc'
6
+ metadata.gz: 2033123f2289c991b2a53826921ca19489f7f39c53918df5f962e321b1f3ff0996760e9df302872d07a7e638e6c56ad86428d1ce773d6e14c54180ad2360b1be
7
+ data.tar.gz: a902ca48bba502af787deb3abf6c5a2b4c9a121394452ef25e95c61212ee0cfa4e991f83315bc6e03a52d45ead909acedeaf0abd9cdc278ccc58b7cb315ab556
@@ -26,9 +26,8 @@ def read_text(filename)
26
26
  end
27
27
  end
28
28
 
29
- def align_denotations(denotations, source_text, target_text, debug = false, cm = nil)
30
- alignment = TextAlignment::TextAlignment.new(source_text, target_text, denotations, cm)
31
- cm = alignment.cultivation_map
29
+ def align_denotations(denotations, source_text, alignment, debug = false)
30
+ alignment.align(source_text, denotations)
32
31
  new_denotations = alignment.transform_hdenotations(denotations)
33
32
 
34
33
  if debug
@@ -38,7 +37,7 @@ def align_denotations(denotations, source_text, target_text, debug = false, cm =
38
37
  end
39
38
 
40
39
  lost_annotations = alignment.lost_annotations
41
- unless lost_annotations.empty?
40
+ unless lost_annotations.nil? || lost_annotations.empty?
42
41
  warn "\n[lost annotations] #{lost_annotations.length}"
43
42
  lost_annotations.each do |a|
44
43
  warn "#{a}"
@@ -48,23 +47,22 @@ def align_denotations(denotations, source_text, target_text, debug = false, cm =
48
47
  warn
49
48
 
50
49
  # return target annotations
51
- [new_denotations, cm]
50
+ new_denotations
52
51
  end
53
52
 
54
- def align_mannotations(source_annotations, target_text, debug = false)
55
- target_annotations = {text:target_text}
53
+ def align_mannotations(source_annotations, reference_text, alignment, debug = false)
54
+ target_annotations = {text:reference_text}
56
55
 
57
56
  idnum_denotations = 0
58
57
  idnum_relations = 0
59
58
  idnum_attributes = 0
60
59
  idnum_modifications = 0
61
60
 
62
- cm = nil
63
61
  source_annotations.each_with_index do |annotations, i|
64
62
  if annotations.has_key?(:denotations) && !annotations[:denotations].empty?
65
63
  ididx = {}
66
64
  warn "[#{i}]-=-=-=-=-"
67
- denotations, cm = align_denotations(annotations[:denotations], annotations[:text], target_text, debug, cm)
65
+ denotations = align_denotations(annotations[:denotations], annotations[:text], alignment, debug)
68
66
 
69
67
  denotations.each do |d|
70
68
  reid = 'T' + (idnum_denotations += 1).to_s
@@ -79,7 +77,9 @@ def align_mannotations(source_annotations, target_text, debug = false)
79
77
  annotations[:relations].each do |r|
80
78
  reid = 'R' + (idnum_relations += 1).to_s
81
79
  ididx[r[:id]] = reid
82
- target_annotations[:relations] << r.dup.merge({id:reid, subj:ididx[r[:subj]], obj:ididx[r[:obj]]})
80
+ sid = ididx[r[:subj]]
81
+ oid = ididx[r[:obj]]
82
+ target_annotations[:relations] << r.dup.merge({id:reid, subj:sid, obj:oid}) unless sid.nil? || oid.nil?
83
83
  end
84
84
  end
85
85
 
@@ -88,7 +88,8 @@ def align_mannotations(source_annotations, target_text, debug = false)
88
88
  annotations[:attributes].each do |a|
89
89
  reid = 'A' + (idnum_attributes += 1).to_s
90
90
  ididx[a[:id]] = reid
91
- target_annotations[:attributes] << a.dup.merge({id:reid, subj:ididx[a[:subj]]})
91
+ sid = ididx[a[:subj]]
92
+ target_annotations[:attributes] << a.dup.merge({id:reid, subj:sid}) unless sid.nil?
92
93
  end
93
94
  end
94
95
 
@@ -97,7 +98,8 @@ def align_mannotations(source_annotations, target_text, debug = false)
97
98
  annotations[:modifications].each do |m|
98
99
  reid = 'M' + (idnum_modifications += 1).to_s
99
100
  ididx[m[:id]] = reid
100
- target_annotations[:modifications] << m.dup.merge({id:reid, obj:ididx[m[:obj]]})
101
+ oid = ididx[m[:obj]]
102
+ target_annotations[:modifications] << m.dup.merge({id:reid, obj:oid}) unless oid.nil?
101
103
  end
102
104
  end
103
105
  end
@@ -112,14 +114,18 @@ unless ARGV.length == 2
112
114
  end
113
115
 
114
116
  source_annotations = read_annotations(ARGV[0])
115
- target_text = read_text(ARGV[1])
117
+ reference_text = read_text(ARGV[1])
118
+
119
+ alignment = TextAlignment::TextAlignment.new(reference_text, true)
116
120
 
117
121
  target_annotations = if source_annotations.class == Array
118
- align_mannotations(source_annotations, target_text, false)
122
+ # align_mannotations(source_annotations, reference_text, alignment, true)
123
+ align_mannotations(source_annotations, reference_text, alignment, false)
119
124
  else
120
- denotations, cm = align_denotations(source_annotations[:denotations], source_annotations[:text], target_text, false)
121
- # denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], target_text, true)
122
- source_annotations.merge({text:target_text, denotations:denotations})
125
+ # denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment)
126
+ denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment, false)
127
+ source_annotations.merge({text:reference_text, denotations:denotations})
123
128
  end
124
129
 
125
- puts target_annotations.to_json
130
+ # pp alignment.block_alignment
131
+ # puts target_annotations.to_json
@@ -7,11 +7,8 @@ module TextAlignment; end unless defined? TextAlignment
7
7
  class TextAlignment::AnchorFinder
8
8
 
9
9
  def initialize(source_str, target_str, cultivation_map)
10
- @s1, @s2 = if reverse?(source_str, target_str)
11
- [target_str.downcase, source_str.downcase]
12
- else
13
- [source_str.downcase, target_str.downcase]
14
- end
10
+ @s1 = source_str.downcase
11
+ @s2 = target_str.downcase
15
12
 
16
13
  @cultivation_map = cultivation_map
17
14
 
@@ -19,19 +16,13 @@ class TextAlignment::AnchorFinder
19
16
  @size_window = TextAlignment::SIZE_WINDOW
20
17
  @sim_threshold = TextAlignment::TEXT_SIMILARITY_THRESHOLD
21
18
  @pos_s1_final_possible_begin = @s1.length - @size_ngram - 1
19
+ @pos_s2_final_possible_end = @s2.length
22
20
 
23
21
  # positions of last match
24
22
  @pos_s1_last_match = 0
25
23
  @pos_s2_last_match = 0
26
24
  end
27
25
 
28
- def reverse?(source_str = nil, target_str = nil)
29
- unless source_str.nil?
30
- @reverse_p = target_str.length < source_str.length
31
- end
32
- @reverse_p
33
- end
34
-
35
26
  def get_next_anchor
36
27
  # To find the beginning positions of an anchor ngram in s1 and s2, beginning from the last positions matched
37
28
  beg_s2 = for beg_s1 in @pos_s1_last_match .. @pos_s1_final_possible_begin
@@ -49,26 +40,23 @@ class TextAlignment::AnchorFinder
49
40
  # To extend the block to the left
50
41
  b1 = beg_s1
51
42
  b2 = beg_s2
52
- while b1 >= @pos_s1_last_match && b2 >= @pos_s2_last_match && @s1[b1] == @s2[b2]
43
+ left_boundary_b2 = [@pos_s2_last_match, (@cultivation_map.last_cultivated_position(b2) || 0)].max
44
+ while b1 > @pos_s1_last_match && b2 > left_boundary_b2 && @s1[b1 - 1] == @s2[b2 - 1]
53
45
  b1 -= 1; b2 -= 1
54
46
  end
55
- b1 += 1; b2 += 1
56
47
 
57
48
  # To extend the block to the right
58
49
  e1 = beg_s1 + @size_ngram
59
50
  e2 = beg_s2 + @size_ngram
60
- while @s1[e1] && @s1[e1] == @s2[e2]
51
+ right_boundary_b2 = @cultivation_map.next_cultivated_position(e2) || @pos_s2_final_possible_end
52
+ while @s1[e1] && e2 < right_boundary_b2 && @s1[e1] == @s2[e2]
61
53
  e1 += 1; e2 += 1
62
54
  end
63
55
 
64
56
  @pos_s1_last_match = e1
65
57
  @pos_s2_last_match = e2
66
58
 
67
- if reverse?
68
- {source:{begin:b2 , end:e2}, target:{begin:b1, end:e1}}
69
- else
70
- {source:{begin:b1 , end:e1}, target:{begin:b2, end:e2}}
71
- end
59
+ {source:{begin:b1 , end:e1}, target:{begin:b2, end:e2}}
72
60
  end
73
61
 
74
62
  private
@@ -91,13 +79,7 @@ class TextAlignment::AnchorFinder
91
79
  # return nil if the anchor is too frequent
92
80
  def find_beg_s2_candidates(anchor, search_position)
93
81
  candidates = []
94
- while _beg_s2 = @s2.index(anchor, search_position)
95
- search_again_position = @cultivation_map.search_again_position(_beg_s2)
96
- unless search_again_position.nil?
97
- search_position = search_again_position
98
- next
99
- end
100
-
82
+ while _beg_s2 = @cultivation_map.index(anchor, @s2, search_position)
101
83
  candidates << _beg_s2
102
84
 
103
85
  # for speed, skip anchor of high frequency
@@ -126,14 +108,14 @@ class TextAlignment::AnchorFinder
126
108
  next
127
109
  end
128
110
 
129
- left_window_s1, left_window_s2 = get_left_windows(beg_s1, beg_s2)
111
+ left_window_s1, left_window_s2 = get_left_windows(beg_s1, beg_s2, size_window)
130
112
  if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
131
113
  break unless valid_beg_s2.nil?
132
114
  valid_beg_s2 = beg_s2
133
115
  next
134
116
  end
135
117
 
136
- right_window_s1, right_window_s2 = get_right_windows(beg_s1, beg_s2)
118
+ right_window_s1, right_window_s2 = get_right_windows(beg_s1, beg_s2, size_window)
137
119
  if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
138
120
  break unless valid_beg_s2.nil?
139
121
  valid_beg_s2 = beg_s2
@@ -143,7 +125,11 @@ class TextAlignment::AnchorFinder
143
125
 
144
126
  # r == nil means that the inner loop was broken (multiple candidates had passed the tests)
145
127
  # r != nil means that the inner loop was completed (with or w/o a valid beg_s2 found)
146
- break unless r.nil?
128
+ if r.nil?
129
+ valid_beg_s2 = nil
130
+ else
131
+ break
132
+ end
147
133
  end
148
134
 
149
135
  valid_beg_s2
@@ -61,6 +61,7 @@ TextAlignment::CHAR_MAPPING = [
61
61
  ["•", "*"], #U+2022 (bullet)
62
62
  [" ", " "], #U+2009 (thin space)
63
63
  [" ", " "], #U+200A (hair space)
64
+ [" ", " "], #U+202F (narrow no-break space)
64
65
  [" ", " "], #U+00A0 (Non-Breaking space)
65
66
  [" ", " "], #U+3000 (ideographic space)
66
67
  ["‐", "-"], #U+2010 (Hyphen)
@@ -77,11 +78,11 @@ TextAlignment::CHAR_MAPPING = [
77
78
 
78
79
 
79
80
  class TextAlignment::CharMapping
80
- attr_reader :str
81
+ attr_reader :mapped_text
81
82
 
82
- def initialize(_str, char_mapping = nil)
83
+ def initialize(_text, char_mapping = nil)
83
84
  char_mapping ||= TextAlignment::CHAR_MAPPING
84
- @str, offset_mapping = enmap_str(_str, char_mapping)
85
+ @mapped_text, offset_mapping = enmap_text(_text, char_mapping)
85
86
  @index_enmap = offset_mapping.to_h
86
87
  @index_demap = offset_mapping.map{|m| m.reverse}.to_h
87
88
  end
@@ -94,20 +95,22 @@ class TextAlignment::CharMapping
94
95
  @index_demap[position]
95
96
  end
96
97
 
97
- def enmap_denotations(_denotations)
98
- denotations = _denotations.map do |d|
98
+ def enmap_denotations(denotations)
99
+ return nil if denotations.nil?
100
+
101
+ denotations.map do |d|
99
102
  d.dup.merge(span:{begin:enmap_position(d[:span][:begin]), end:enmap_position(d[:span][:end])})
100
103
  end
101
104
  end
102
105
 
103
106
  private
104
107
 
105
- def enmap_str(_str, char_mapping)
106
- str = _str.dup
108
+ def enmap_text(_text, char_mapping)
109
+ text = _text.dup
107
110
 
108
111
  # To execute the single letter mapping
109
112
  char_mapping.each do |one, long|
110
- str.gsub!(one, long) if long.length == 1
113
+ text.gsub!(one, long) if long.length == 1
111
114
  end
112
115
 
113
116
  # To get the (location, length) index for replacements
@@ -116,18 +119,18 @@ class TextAlignment::CharMapping
116
119
  next if long.length == 1
117
120
 
118
121
  init_next = 0
119
- while loc = str.index(long, init_next)
122
+ while loc = text.index(long, init_next)
120
123
  loc_len << [loc, long.length]
121
124
  init_next = loc + long.length
122
125
  end
123
126
 
124
127
  # a workaround to avoid messing-up due to embedding
125
- str.gsub!(long, one * long.length)
128
+ text.gsub!(long, one * long.length)
126
129
  end
127
130
 
128
131
  # To get the (location, length) index for consecutive whitespace sequences
129
132
  init_next = 0
130
- while loc = str.index(/\s{2,}/, init_next)
133
+ while loc = text.index(/\s{2,}/, init_next)
131
134
  len = $~[0].length
132
135
  loc_len << [loc, len]
133
136
  init_next = loc + len
@@ -148,20 +151,20 @@ class TextAlignment::CharMapping
148
151
  init_next = loc + len
149
152
  end
150
153
 
151
- offset_mapping += (init_next .. str.length).map do |i|
154
+ offset_mapping += (init_next .. text.length).map do |i|
152
155
  j += 1
153
156
  [i, j - 1]
154
157
  end
155
158
 
156
159
  # To execute the long letter mapping
157
160
  char_mapping.each do |one, long|
158
- str.gsub!(one * long.length, one) if long.length > 1
161
+ text.gsub!(one * long.length, one) if long.length > 1
159
162
  end
160
163
 
161
164
  # To replace multi-whitespace sequences with a single space
162
- str.gsub!(/\s{2,}/, ' ')
165
+ text.gsub!(/\s{2,}/, ' ')
163
166
 
164
- [str, offset_mapping]
167
+ [text, offset_mapping]
165
168
  end
166
169
  end
167
170
 
@@ -173,15 +176,15 @@ if __FILE__ == $0
173
176
  exit
174
177
  end
175
178
  annotations = JSON.parse File.read(ARGV[0]).strip, symbolize_names: true
176
- denotations = annotations[:denotations]
179
+ denotations = annotations[:denotations] || []
177
180
  if denotations.nil? && annotations[:tracks]
178
181
  denotations = annotations[:tracks].first[:denotations]
179
182
  end
180
183
 
181
- str_mapping = TextAlignment::CharMapping.new(annotations[:text])
182
- str_mapped = str_mapping.str
183
- denotations_mapped = str_mapping.enmap_denotations(denotations)
184
- new_annotations = {text:str_mapped, denotations:denotations_mapped}
184
+ text_mapping = TextAlignment::CharMapping.new(annotations[:text])
185
+ text_mapped = text_mapping.mapped_text
186
+ denotations_mapped = text_mapping.enmap_denotations(denotations)
187
+ new_annotations = {text:text_mapped, denotations:denotations_mapped}
185
188
 
186
189
  puts new_annotations.to_json
187
190
  end
@@ -4,16 +4,91 @@ class TextAlignment::CultivationMap
4
4
  attr_reader :map
5
5
 
6
6
  def initialize
7
- @map = {}
7
+ @map = []
8
8
  end
9
9
 
10
10
  def cultivate(regions)
11
- regions.each do |b, e|
12
- (b ... e).each{|p| @map[p] = e}
11
+ @map += regions
12
+ @map.sort!{|a, b| a[0] <=> b[0]}
13
+ new_map = []
14
+ @map.each do |region|
15
+ if new_map.empty?
16
+ new_map << region
17
+ elsif new_map.last[1] > region[0]
18
+ raise "Overlapping regions: #{new_map.last} : #{region}"
19
+ elsif new_map.last[1] == region[0]
20
+ new_map.last[1] == region[1]
21
+ else
22
+ new_map << region
23
+ end
13
24
  end
25
+ @map = new_map
14
26
  end
15
27
 
16
- def search_again_position(position)
17
- @map[position]
28
+ def search_again_position(position, end_position = nil)
29
+ end_position ||= position
30
+ region = @map.bsearch{|r| end_position < r[1]}
31
+ if region.nil? || region[0] > position
32
+ nil
33
+ else
34
+ region[1]
35
+ end
36
+ end
37
+
38
+ def last_cultivated_position(position)
39
+ ridx = @map.rindex{|r| r[1] <= position}
40
+ ridx.nil? ? nil : @map[ridx][1]
41
+ end
42
+
43
+ def next_cultivated_position(position)
44
+ region = @map.bsearch{|r| position <= r[0]}
45
+ region.nil? ? nil : region[0]
46
+ end
47
+
48
+ def in_regions(region)
49
+ @map.select{|r| (r[1] > region[0] && r[1] <= region[1]) || (r[0] < region[1] && r[0] >= region[0])}
50
+ end
51
+
52
+ def region_state(region)
53
+ closed_parts = in_regions(region)
54
+ if closed_parts.empty?
55
+ [:open, region]
56
+ else
57
+ if front_open?(region, closed_parts)
58
+ if rear_open?(region, closed_parts)
59
+ [:middle_closed, [closed_parts.first[0], closed_parts.last[1]]]
60
+ else
61
+ [:front_open, [region[0], closed_parts.first[0]]]
62
+ end
63
+ else
64
+ if rear_open?(region, closed_parts)
65
+ [:rear_open, [closed_parts.last[1], region[1]]]
66
+ else
67
+ [:closed, nil]
68
+ end
69
+ end
70
+ end
71
+ end
72
+
73
+ def index(target, string, position = 0)
74
+ length = target.length
75
+ loop do
76
+ _begin = string.index(target, position)
77
+ break if _begin.nil?
78
+ position = search_again_position(_begin)
79
+ next unless position.nil?
80
+ break _begin if region_state([_begin, _begin + length])[0] == :open
81
+ position = _begin + 1
82
+ end
83
+ end
84
+
85
+ private
86
+
87
+ def front_open?(region, closed_parts)
88
+ closed_parts.first[0] > region[0]
89
+ end
90
+
91
+ def rear_open?(region, closed_parts)
92
+ closed_parts.last[1] < region[1]
18
93
  end
19
94
  end
@@ -144,11 +144,17 @@ class TextAlignment::MixedAlignment
144
144
  def compute_similarity(s1, s2, sdiff)
145
145
  return 0 if sdiff.nil?
146
146
 
147
- # compute the lcs only with non-whitespace letters
148
- lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
149
- return 0 if lcs == 0
147
+ # recoverability
148
+ count_nws = sdiff.count{|d| d.old_element =~ /\S/}
149
+ count_nws_match = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/}
150
150
 
151
- similarity = lcs.to_f / [s1.scan(/\S/).count, s2.scan(/\S/).count].min
152
- end
151
+ coverage = count_nws_match.to_f / count_nws
152
+
153
+ # fragmentation rate
154
+ count_ofrag = sdiff.count{|d| d.old_element =~ /\s/} + 1
155
+ count_frag = sdiff.collect{|d| (d.action == '=') && (d.old_element =~/\s/) ? ' ' : d.action}.join.scan(/=+/).count
156
+ rate_frag = count_ofrag.to_f / count_frag
153
157
 
158
+ similarity = coverage * rate_frag
159
+ end
154
160
  end
@@ -10,56 +10,49 @@ class TextAlignment::TextAlignment
10
10
  attr_reader :block_alignment
11
11
  attr_reader :similarity
12
12
  attr_reader :lost_annotations
13
- attr_reader :cultivation_map
14
13
 
15
- def initialize(_str1, _str2, _denotations = nil, _cultivation_map = nil)
16
- raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
14
+ # Initialize with a reference text, against which texts will be aligned
15
+ def initialize(reference_text, to_prevent_overlap = false)
16
+ raise ArgumentError, "nil text" if reference_text.nil?
17
17
 
18
- @block_alignment = {source_text: _str1, target_text: _str2, denotations: _denotations}
19
- @original_str1 = _str1
20
- @original_str2 = _str2
18
+ @original_reference_text = reference_text
19
+ @rtext_mapping = TextAlignment::CharMapping.new(reference_text)
20
+ @mapped_reference_text = @rtext_mapping.mapped_text
21
+ @to_prevent_overlap = to_prevent_overlap
21
22
 
22
- @str1_mapping = TextAlignment::CharMapping.new(_str1)
23
- @str2_mapping = TextAlignment::CharMapping.new(_str2)
23
+ @original_text = nil
24
+ @blocks = nil
25
+ @cultivation_map = TextAlignment::CultivationMap.new
26
+ end
24
27
 
25
- str1 = @str1_mapping.str
26
- denotations = @str1_mapping.enmap_denotations(_denotations)
28
+ def align(text, denotations = nil)
29
+ # To maintain the cultivation map
30
+ update_cultivation_map if @to_prevent_overlap
27
31
 
28
- str2 = @str2_mapping.str
32
+ # In case the input text is the same as the previous one, reuse the previous text mapping
33
+ unless @original_text && @original_text == text
34
+ @original_text = text
35
+ @text_mapping = TextAlignment::CharMapping.new(text)
36
+ end
29
37
 
30
- @cultivation_map = _cultivation_map || TextAlignment::CultivationMap.new
38
+ @mapped_text = @text_mapping.mapped_text
39
+ denotations_mapped = @text_mapping.enmap_denotations(denotations)
31
40
 
32
- @block_alignment[:blocks] = if r = whole_block_alignment(str1, str2, @cultivation_map)
33
- # whole block alignment
41
+ ## To generate the block_alignment of the input text against the reference text
42
+ @blocks = if r = whole_block_alignment(@mapped_text, @mapped_reference_text, @cultivation_map)
34
43
  r
35
44
  else
36
- find_block_alignment(str1, str2, denotations, @cultivation_map)
45
+ find_block_alignment(@mapped_text, @mapped_reference_text, denotations_mapped, @cultivation_map)
37
46
  end
38
47
 
39
- newly_cultivated_regions = @block_alignment[:blocks].collect do |b|
40
- if b[:alignment] == :block || b[:alignment] == :term
41
- [b[:target][:begin], b[:target][:end]]
42
- else
43
- nil
44
- end
45
- end.compact
46
- newly_cultivated_regions_condensed = newly_cultivated_regions.inject([]) do |condensed, region|
47
- if condensed.empty? || (condensed.last.last + 1 < region.first)
48
- condensed.push region
49
- else
50
- condensed.last[1] = region.last
51
- end
52
- condensed
53
- end
54
-
55
- @cultivation_map.cultivate(newly_cultivated_regions_condensed)
48
+ @block_alignment = {text: @original_text, reference_text: @original_reference_text, denotations: denotations, blocks: demap_blocks(@blocks)}
56
49
  end
57
50
 
58
51
  def transform_begin_position(_begin_position)
59
- begin_position = @str1_mapping.enmap_position(_begin_position)
52
+ begin_position = @text_mapping.enmap_position(_begin_position)
60
53
 
61
- i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
62
- block = @block_alignment[:blocks][i]
54
+ i = @blocks.index{|b| b[:source][:end] > begin_position}
55
+ block = @blocks[i]
63
56
 
64
57
  b = if block[:alignment] == :block || block[:alignment] == :term
65
58
  begin_position + block[:delta]
@@ -74,14 +67,14 @@ class TextAlignment::TextAlignment
74
67
  r.nil? ? nil : r + block[:target][:begin]
75
68
  end
76
69
 
77
- @str2_mapping.demap_position(b)
70
+ @rtext_mapping.demap_position(b)
78
71
  end
79
72
 
80
73
  def transform_end_position(_end_position)
81
- end_position = @str1_mapping.enmap_position(_end_position)
74
+ end_position = @text_mapping.enmap_position(_end_position)
82
75
 
83
- i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
84
- block = @block_alignment[:blocks][i]
76
+ i = @blocks.index{|b| b[:source][:end] >= end_position}
77
+ block = @blocks[i]
85
78
 
86
79
  e = if block[:alignment] == :block || block[:alignment] == :term
87
80
  end_position + block[:delta]
@@ -96,7 +89,7 @@ class TextAlignment::TextAlignment
96
89
  r.nil? ? nil : r + block[:target][:begin]
97
90
  end
98
91
 
99
- @str2_mapping.demap_position(e)
92
+ @rtext_mapping.demap_position(e)
100
93
  end
101
94
 
102
95
  def transform_a_span(span)
@@ -115,7 +108,7 @@ class TextAlignment::TextAlignment
115
108
  source = {begin:d.begin, end:d.end}
116
109
  d.begin = transform_begin_position(d.begin);
117
110
  d.end = transform_end_position(d.end);
118
- raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_str2.length
111
+ raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_reference_text.length
119
112
  rescue
120
113
  @lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
121
114
  d.begin = nil
@@ -131,7 +124,7 @@ class TextAlignment::TextAlignment
131
124
 
132
125
  r = hdenotations.collect do |d|
133
126
  t = transform_a_span(d[:span])
134
- raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_str2.length
127
+ raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_reference_text.length
135
128
  new_d = d.dup.merge({span:t})
136
129
  rescue
137
130
  @lost_annotations << {source: d[:span], target:t}
@@ -142,8 +135,8 @@ class TextAlignment::TextAlignment
142
135
  end
143
136
 
144
137
  def alignment_show
145
- stext = @block_alignment[:source_text]
146
- ttext = @block_alignment[:target_text]
138
+ stext = @block_alignment[:text]
139
+ ttext = @block_alignment[:reference_text]
147
140
 
148
141
  show = ''
149
142
  @block_alignment[:blocks].each do |a|
@@ -199,7 +192,7 @@ class TextAlignment::TextAlignment
199
192
 
200
193
  "***** local mismatch [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}] (similarity: #{a[:similarity]})\n" +
201
194
  "[#{astr1}]\n" +
202
- "[#{astr2}]\n\n"
195
+ "[#{astr2.gsub("\n", " ")}]\n\n"
203
196
  end
204
197
  end
205
198
  show
@@ -238,142 +231,125 @@ class TextAlignment::TextAlignment
238
231
  # puts "-=-=-=-=-"
239
232
  # puts
240
233
 
241
- ## to fill the gaps
242
- last_block = nil
243
- blocks2 = blocks.inject([]) do |sum, block|
244
- b1 = last_block ? last_block[:source][:end] : 0
245
- e1 = block[:source][:begin]
234
+ ## To fill the gaps
235
+ ## lblock: last block, cblock: current block
236
+ lblock = nil
237
+ blocks2 = (blocks + [nil]).inject([]) do |sum, cblock|
238
+ b1 = lblock.nil? ? 0 : lblock[:source][:end]
239
+ e1 = cblock.nil? ? str1.length : cblock[:source][:begin]
246
240
 
247
- sum += if b1 == e1
248
- [block]
249
- else
250
- b2 = last_block ? last_block[:target][:end] : 0
251
- e2 = block[:target][:begin]
252
-
253
- if b2 == e2
254
- [
255
- {source:{begin:b1, end:e1}, alignment: :empty},
256
- block
257
- ]
241
+ if b1 < e1
242
+ b2 = lblock.nil? ? 0 : lblock[:target][:end]
243
+ e2 = cblock.nil? ? str2.length : cblock[:target][:begin]
244
+ _str1 = str1[b1 ... e1]
245
+ _str2 = str2[b2 ... e2]
246
+
247
+ sum += if _str1.strip.empty? || _str2.strip.empty?
248
+ [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
258
249
  else
259
250
  len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
260
-
261
- if b1 == 0 && b2 == 0
262
- b2 = e2 - len_buffer if e2 > len_buffer
263
- end
264
-
265
- _str1 = str1[b1 ... e1]
266
- _str2 = str2[b2 ... e2]
267
-
268
- if _str1.strip.empty? || _str2.strip.empty?
269
- [
270
- {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
271
- block
272
- ]
273
- elsif ((e2 - b2) - (e1 - b1)) > len_buffer
274
- la_block1 = local_alignment_blocks(str1, b1, e1, str2, b2, b2 + len_buffer, denotations)
275
- la_block2 = local_alignment_blocks(str1, b1, e1, str2, e2 - len_buffer, e2, denotations)
276
- [la_block2, la_block2].max{|a, b| a.first[:similarity] <=> b.first[:similarity]} << block
277
- else
278
- local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations) << block
251
+ region_state, state_region = cultivation_map.region_state([b2, e2])
252
+ case region_state
253
+ when :closed
254
+ [{source:{begin:b1, end:e1}, alignment: :empty}]
255
+ when :front_open
256
+ if sum.empty? # when there is no preceding matched block
257
+ [{source:{begin:b1, end:e1}, alignment: :empty}]
258
+ else
259
+ oe2 = state_region[1]
260
+ me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
261
+ local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
262
+ end
263
+ when :rear_open
264
+ if cblock.nil? # when there is no following matched block
265
+ [{source:{begin:b1, end:e1}, alignment: :empty}]
266
+ else
267
+ ob2 = state_region[0]
268
+ mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
269
+ local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
270
+ end
271
+ when :middle_closed
272
+ attempt1 = if sum.empty?
273
+ [{source:{begin:b1, end:e1}, alignment: :empty}]
274
+ else
275
+ oe2 = state_region[0]
276
+ me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
277
+ local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
278
+ end
279
+ if (attempt1.empty? || attempt1.first[:alignment] == :empty) && !cblock.nil?
280
+ ob2 = state_region[1]
281
+ mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
282
+ local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
283
+ else
284
+ attempt1
285
+ end
286
+ else # :open
287
+ if (e2 - b2) > len_buffer
288
+ attempt1 = if sum.empty?
289
+ [{source:{begin:b1, end:e1}, alignment: :empty}]
290
+ else
291
+ local_alignment(str1, b1, e1, str2, b2, b2 + len_buffer, denotations, cultivation_map)
292
+ end
293
+ if (attempt1.empty? || attempt1.first[:alignment] == :empty) && !cblock.nil?
294
+ local_alignment(str1, b1, e1, str2, e2 - len_buffer, e2, denotations, cultivation_map)
295
+ else
296
+ attempt1
297
+ end
298
+ else
299
+ local_alignment(str1, b1, e1, str2, b2, e2, denotations, cultivation_map)
300
+ end
279
301
  end
280
302
  end
281
303
  end
282
304
 
283
- last_block = block
284
- sum
305
+ lblock = cblock
306
+ cblock.nil? ? sum : sum << cblock
285
307
  end
286
308
 
287
- # the last step
288
- blocks2 += if last_block.nil?
289
- local_alignment_blocks(str1, 0, str1.length, str2, 0, str2.length, denotations)
290
- else
291
- b1 = last_block[:source][:end]
292
- if b1 < str1.length
293
- e1 = str1.length
294
- b2 = last_block[:target][:end]
295
-
296
- _str1 = str1[b1 ... e1]
297
- if _str1.strip.empty?
298
- [{source:{begin:b1, end:e1}, alignment: :empty}]
299
- else
300
- if b2 < str2.length
301
- len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
302
- e2 = (str2.length - b2) > len_buffer ? b2 + len_buffer : str2.length
303
-
304
- local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations)
305
- else
306
- [{source:{begin:b1, end:e1}, alignment: :empty}]
307
- end
308
- end
309
- else
310
- []
311
- end
312
- end
313
309
  end
314
310
 
315
311
  def whole_block_alignment(str1, str2, cultivation_map)
316
- ## Block exact match
317
- search_position = 0
318
-
319
- block_begin = begin
320
- _block_begin = str2.index(str1, search_position)
321
- break if _block_begin.nil?
322
- search_position = cultivation_map.search_again_position(_block_begin)
323
- _block_begin
324
- end until search_position.nil?
325
-
326
- unless block_begin.nil?
327
- return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
328
- end
312
+ block_begin = cultivation_map.index(str1, str2)
313
+ return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}] unless block_begin.nil?
329
314
 
330
- search_position = 0
315
+ block_begin = cultivation_map.index(str1.downcase, str2.downcase)
316
+ return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}] unless block_begin.nil?
331
317
 
332
- dstr1 = str1.downcase
333
- dstr2 = str2.downcase
334
- block_begin = begin
335
- _block_begin = dstr2.index(dstr1, search_position)
336
- break if _block_begin.nil?
337
- search_position = cultivation_map.search_again_position(_block_begin)
338
- _block_begin
339
- end until search_position.nil?
318
+ nil
319
+ end
340
320
 
341
- unless block_begin.nil?
342
- return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
321
+ def local_alignment(str1, b1, e1, str2, b2, e2, denotations = nil, cultivation_map)
322
+ tblocks = term_based_alignment(str1, b1, e1, str2, b2, e2, denotations, cultivation_map)
323
+ if tblocks.empty? || tblocks.first[:alignment] == :empty
324
+ lcs_alignment(str1, b1, e1, str2, b2, e2, cultivation_map)
325
+ else
326
+ tblocks
343
327
  end
344
-
345
- nil
346
328
  end
347
329
 
348
- def local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations = nil)
349
- block2 = str2[b2 ... e2]
330
+ def term_based_alignment(str1, b1, e1, str2, b2, e2, denotations = nil, cultivation_map)
331
+ str2_block = str2[0 ... e2]
350
332
 
351
333
  ## term-based alignment
352
334
  tblocks = if denotations
353
- ds_in_scope = denotations.select{|d| d[:span][:begin] >= b1 && d[:span][:end] <= e1}.
335
+ denotations_in_scope = denotations.select{|d| d[:span][:begin] >= b1 && d[:span][:end] <= e1}.
354
336
  sort{|d1, d2| d1[:span][:begin] <=> d2[:span][:begin] || d2[:span][:end] <=> d1[:span][:end] }.
355
337
  map{|d| d.merge(lex:str1[d[:span][:begin] ... d[:span][:end]])}
356
338
 
357
- position = 0
358
- _tblocks = ds_in_scope.map do |term|
359
- lex = term[:lex]
360
- r = block2.index(lex, position)
361
- if r.nil?
362
- position = nil
363
- break
364
- end
365
- position = r + lex.length
366
- {source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, similarity: 0.9, delta: r + b2 - term[:span][:begin]}
339
+ search_position = b2
340
+ _tblocks = denotations_in_scope.map do |denotation|
341
+ lex = denotation[:lex]
342
+ term_begin = cultivation_map.index(lex, str2_block, search_position)
343
+ break [] if term_begin.nil? # break the loop if a missing term is found
344
+ search_position = term_begin + lex.length
345
+ {source:denotation[:span], target:{begin:term_begin, end:term_begin + lex.length}, alignment: :term, similarity: 0.9, delta: term_begin - denotation[:span][:begin]}
367
346
  end
368
347
 
369
- # missing term found
370
- _tblocks = [] if position.nil?
371
-
372
348
  # redundant matching found
373
- unless position.nil?
374
- ds_in_scope.each do |term|
375
- lex = term[:lex]
376
- look_forward = block2.index(lex, position)
349
+ unless _tblocks.empty?
350
+ search_position = _tblocks.last[:target][:end]
351
+ denotations_in_scope.each do |term|
352
+ look_forward = cultivation_map.index(term[:lex], str2_block, search_position)
377
353
  unless look_forward.nil?
378
354
  _tblocks = []
379
355
  break
@@ -386,73 +362,72 @@ class TextAlignment::TextAlignment
386
362
  []
387
363
  end
388
364
 
389
- if tblocks.empty?
390
- if b1 == 0 && e1 == str1.length
391
- if (e1 > 2000) || (e2 > 2000)
392
- [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
393
- else
394
- block1 = str1[b1 ... e1]
395
- block2 = str2[b2 ... e2]
396
-
397
- ## character-based alignment
398
- alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
399
- if alignment.sdiff.nil?
400
- [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
401
- else
402
- [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
403
- end
404
- end
405
- else
406
- block1 = str1[b1 ... e1]
407
- block2 = str2[b2 ... e2]
365
+ ltblock = nil
366
+ tblocks2 = (tblocks + [nil]).inject([]) do |sum, ctblock|
367
+ tb1 = ltblock.nil? ? b1 : ltblock[:source][:end]
368
+ te1 = ctblock.nil? ? e1 : ctblock[:source][:begin]
408
369
 
409
- ## character-based alignment
410
- alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
411
- if alignment.sdiff.nil?
412
- [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
413
- else
414
- [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
415
- end
370
+ if te1 > tb1
371
+ tb2 = ltblock.nil? ? b2 : ltblock[:target][:end]
372
+ te2 = ctblock.nil? ? e2 : ctblock[:target][:begin]
373
+ sum << {source:{begin:tb1, end:te1}, target:{begin:tb2, end:te2}, alignment: :empty}
416
374
  end
417
- else
418
- last_tblock = nil
419
- lblocks = tblocks.inject([]) do |sum, tblock|
420
- tb1 = last_tblock ? last_tblock[:source][:end] : b1
421
- te1 = tblock[:source][:begin]
422
375
 
423
- sum += if te1 == tb1
424
- [tblock]
425
- else
426
- tb2 = last_tblock ? last_tblock[:target][:end] : b2
427
- te2 = tblock[:target][:begin]
428
-
429
- if b2 == e2
430
- [
431
- {source:{begin:tb1, end:te1}, alignment: :empty},
432
- tblock
433
- ]
434
- else
435
- [
436
- {source:{begin:tb1, end:te1}, target:{begin:tb2, end:te2}, alignment: :empty},
437
- tblock
438
- ]
439
- end
440
- end
376
+ ltblock = ctblock
377
+ ctblock.nil? ? sum : sum << ctblock
378
+ end
441
379
 
442
- last_tblock = tblock
443
- sum
380
+ tblocks2
381
+ end
382
+
383
+ def lcs_alignment(str1, b1, e1, str2, b2, e2, cultivation_map)
384
+ source = {begin:b1, end:e1}
385
+ target = {begin:b2, end:e2}
386
+
387
+ if (e1 - b1) > 2000
388
+ [{source:source, target:target, alignment: :empty}]
389
+ else
390
+ alignment = TextAlignment::MixedAlignment.new(str1[b1 ... e1].downcase, str2[b2 ... e2].downcase)
391
+ if alignment.similarity < 0.5
392
+ [{source:source, target:target, alignment: :empty}]
393
+ else
394
+ [{source:source, target:target, alignment: alignment, similarity: alignment.similarity}]
444
395
  end
396
+ end
397
+ end
445
398
 
446
- if last_tblock[:source][:end] < e1
447
- if last_tblock[:target][:end] < e2
448
- lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, target:{begin:last_tblock[:target][:end], end:e2}, alignment: :empty}
449
- else
450
- lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, alignment: :empty}
451
- end
399
+ def update_cultivation_map
400
+ return if @blocks.nil?
401
+
402
+ ## To update the cultivation map
403
+ newly_cultivated_regions = @blocks.collect do |b|
404
+ if b[:alignment] == :block || b[:alignment] == :term
405
+ [b[:target][:begin], b[:target][:end]]
406
+ else
407
+ nil
408
+ end
409
+ end.compact.inject([]) do |condensed, region|
410
+ if condensed.empty? || (condensed.last.last + 1 < region.first)
411
+ condensed.push region
412
+ else
413
+ condensed.last[1] = region.last
452
414
  end
415
+ condensed
416
+ end
417
+
418
+ @cultivation_map.cultivate(newly_cultivated_regions)
419
+ end
453
420
 
454
- lblocks
421
+ def demap_blocks(_blocks)
422
+ return nil if _blocks.nil?
423
+
424
+ blocks = _blocks.map{|b| b.dup}
425
+ blocks.each do |b|
426
+ b[:source] = {begin:@text_mapping.demap_position(b[:source][:begin]), end:@text_mapping.demap_position(b[:source][:end])} if b[:source]
427
+ b[:target] = {begin:@rtext_mapping.demap_position(b[:target][:begin]), end:@rtext_mapping.demap_position(b[:target][:end])} if b[:target]
455
428
  end
429
+
430
+ blocks
456
431
  end
457
432
 
458
433
  end
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.10.1'
2
+ VERSION = '0.11.5'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.10.1
4
+ version: 0.11.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-03-03 00:00:00.000000000 Z
11
+ date: 2021-03-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary