text_alignment 0.10.1 → 0.11.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 710fa3dfe07e268c62de77c67abb52e1893a36a9c0fbdfbf7400328ab7f8358a
4
- data.tar.gz: 31a086666978c4776a60b0ce19fde48beeb8a5bce599b7572f20089cfb5c7500
3
+ metadata.gz: abf4387fb9ea598d924eb0a7369ca58ec0be7cdd35ac89abc57647d665c00207
4
+ data.tar.gz: 30ab0ccea5c04cae8132a70872cf5de4704ce7fd4e2104f7fddc6527794424c5
5
5
  SHA512:
6
- metadata.gz: d0a752d2203f65e48b4f10ff29eb29a77504e7cef4da1dd3602e2ea5107a99b7edff4c3609c79c7dcba0f9cc458780dc4788ff8123a781a4d77ae89236170bf5
7
- data.tar.gz: '094ff5f662ccdc2fc14f81549126362be49fd64610f994c6ba5c4457fbbbececc1931257589ebde49211c2bce744ca033bfe3673c8af5c8a01af7679bfb626cc'
6
+ metadata.gz: 2033123f2289c991b2a53826921ca19489f7f39c53918df5f962e321b1f3ff0996760e9df302872d07a7e638e6c56ad86428d1ce773d6e14c54180ad2360b1be
7
+ data.tar.gz: a902ca48bba502af787deb3abf6c5a2b4c9a121394452ef25e95c61212ee0cfa4e991f83315bc6e03a52d45ead909acedeaf0abd9cdc278ccc58b7cb315ab556
@@ -26,9 +26,8 @@ def read_text(filename)
26
26
  end
27
27
  end
28
28
 
29
- def align_denotations(denotations, source_text, target_text, debug = false, cm = nil)
30
- alignment = TextAlignment::TextAlignment.new(source_text, target_text, denotations, cm)
31
- cm = alignment.cultivation_map
29
+ def align_denotations(denotations, source_text, alignment, debug = false)
30
+ alignment.align(source_text, denotations)
32
31
  new_denotations = alignment.transform_hdenotations(denotations)
33
32
 
34
33
  if debug
@@ -38,7 +37,7 @@ def align_denotations(denotations, source_text, target_text, debug = false, cm =
38
37
  end
39
38
 
40
39
  lost_annotations = alignment.lost_annotations
41
- unless lost_annotations.empty?
40
+ unless lost_annotations.nil? || lost_annotations.empty?
42
41
  warn "\n[lost annotations] #{lost_annotations.length}"
43
42
  lost_annotations.each do |a|
44
43
  warn "#{a}"
@@ -48,23 +47,22 @@ def align_denotations(denotations, source_text, target_text, debug = false, cm =
48
47
  warn
49
48
 
50
49
  # return target annotations
51
- [new_denotations, cm]
50
+ new_denotations
52
51
  end
53
52
 
54
- def align_mannotations(source_annotations, target_text, debug = false)
55
- target_annotations = {text:target_text}
53
+ def align_mannotations(source_annotations, reference_text, alignment, debug = false)
54
+ target_annotations = {text:reference_text}
56
55
 
57
56
  idnum_denotations = 0
58
57
  idnum_relations = 0
59
58
  idnum_attributes = 0
60
59
  idnum_modifications = 0
61
60
 
62
- cm = nil
63
61
  source_annotations.each_with_index do |annotations, i|
64
62
  if annotations.has_key?(:denotations) && !annotations[:denotations].empty?
65
63
  ididx = {}
66
64
  warn "[#{i}]-=-=-=-=-"
67
- denotations, cm = align_denotations(annotations[:denotations], annotations[:text], target_text, debug, cm)
65
+ denotations = align_denotations(annotations[:denotations], annotations[:text], alignment, debug)
68
66
 
69
67
  denotations.each do |d|
70
68
  reid = 'T' + (idnum_denotations += 1).to_s
@@ -79,7 +77,9 @@ def align_mannotations(source_annotations, target_text, debug = false)
79
77
  annotations[:relations].each do |r|
80
78
  reid = 'R' + (idnum_relations += 1).to_s
81
79
  ididx[r[:id]] = reid
82
- target_annotations[:relations] << r.dup.merge({id:reid, subj:ididx[r[:subj]], obj:ididx[r[:obj]]})
80
+ sid = ididx[r[:subj]]
81
+ oid = ididx[r[:obj]]
82
+ target_annotations[:relations] << r.dup.merge({id:reid, subj:sid, obj:oid}) unless sid.nil? || oid.nil?
83
83
  end
84
84
  end
85
85
 
@@ -88,7 +88,8 @@ def align_mannotations(source_annotations, target_text, debug = false)
88
88
  annotations[:attributes].each do |a|
89
89
  reid = 'A' + (idnum_attributes += 1).to_s
90
90
  ididx[a[:id]] = reid
91
- target_annotations[:attributes] << a.dup.merge({id:reid, subj:ididx[a[:subj]]})
91
+ sid = ididx[a[:subj]]
92
+ target_annotations[:attributes] << a.dup.merge({id:reid, subj:sid}) unless sid.nil?
92
93
  end
93
94
  end
94
95
 
@@ -97,7 +98,8 @@ def align_mannotations(source_annotations, target_text, debug = false)
97
98
  annotations[:modifications].each do |m|
98
99
  reid = 'M' + (idnum_modifications += 1).to_s
99
100
  ididx[m[:id]] = reid
100
- target_annotations[:modifications] << m.dup.merge({id:reid, obj:ididx[m[:obj]]})
101
+ oid = ididx[m[:obj]]
102
+ target_annotations[:modifications] << m.dup.merge({id:reid, obj:oid}) unless oid.nil?
101
103
  end
102
104
  end
103
105
  end
@@ -112,14 +114,18 @@ unless ARGV.length == 2
112
114
  end
113
115
 
114
116
  source_annotations = read_annotations(ARGV[0])
115
- target_text = read_text(ARGV[1])
117
+ reference_text = read_text(ARGV[1])
118
+
119
+ alignment = TextAlignment::TextAlignment.new(reference_text, true)
116
120
 
117
121
  target_annotations = if source_annotations.class == Array
118
- align_mannotations(source_annotations, target_text, false)
122
+ # align_mannotations(source_annotations, reference_text, alignment, true)
123
+ align_mannotations(source_annotations, reference_text, alignment, false)
119
124
  else
120
- denotations, cm = align_denotations(source_annotations[:denotations], source_annotations[:text], target_text, false)
121
- # denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], target_text, true)
122
- source_annotations.merge({text:target_text, denotations:denotations})
125
+ # denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment)
126
+ denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment, false)
127
+ source_annotations.merge({text:reference_text, denotations:denotations})
123
128
  end
124
129
 
125
- puts target_annotations.to_json
130
+ # pp alignment.block_alignment
131
+ # puts target_annotations.to_json
@@ -7,11 +7,8 @@ module TextAlignment; end unless defined? TextAlignment
7
7
  class TextAlignment::AnchorFinder
8
8
 
9
9
  def initialize(source_str, target_str, cultivation_map)
10
- @s1, @s2 = if reverse?(source_str, target_str)
11
- [target_str.downcase, source_str.downcase]
12
- else
13
- [source_str.downcase, target_str.downcase]
14
- end
10
+ @s1 = source_str.downcase
11
+ @s2 = target_str.downcase
15
12
 
16
13
  @cultivation_map = cultivation_map
17
14
 
@@ -19,19 +16,13 @@ class TextAlignment::AnchorFinder
19
16
  @size_window = TextAlignment::SIZE_WINDOW
20
17
  @sim_threshold = TextAlignment::TEXT_SIMILARITY_THRESHOLD
21
18
  @pos_s1_final_possible_begin = @s1.length - @size_ngram - 1
19
+ @pos_s2_final_possible_end = @s2.length
22
20
 
23
21
  # positions of last match
24
22
  @pos_s1_last_match = 0
25
23
  @pos_s2_last_match = 0
26
24
  end
27
25
 
28
- def reverse?(source_str = nil, target_str = nil)
29
- unless source_str.nil?
30
- @reverse_p = target_str.length < source_str.length
31
- end
32
- @reverse_p
33
- end
34
-
35
26
  def get_next_anchor
36
27
  # To find the beginning positions of an anchor ngram in s1 and s2, beginning from the last positions matched
37
28
  beg_s2 = for beg_s1 in @pos_s1_last_match .. @pos_s1_final_possible_begin
@@ -49,26 +40,23 @@ class TextAlignment::AnchorFinder
49
40
  # To extend the block to the left
50
41
  b1 = beg_s1
51
42
  b2 = beg_s2
52
- while b1 >= @pos_s1_last_match && b2 >= @pos_s2_last_match && @s1[b1] == @s2[b2]
43
+ left_boundary_b2 = [@pos_s2_last_match, (@cultivation_map.last_cultivated_position(b2) || 0)].max
44
+ while b1 > @pos_s1_last_match && b2 > left_boundary_b2 && @s1[b1 - 1] == @s2[b2 - 1]
53
45
  b1 -= 1; b2 -= 1
54
46
  end
55
- b1 += 1; b2 += 1
56
47
 
57
48
  # To extend the block to the right
58
49
  e1 = beg_s1 + @size_ngram
59
50
  e2 = beg_s2 + @size_ngram
60
- while @s1[e1] && @s1[e1] == @s2[e2]
51
+ right_boundary_b2 = @cultivation_map.next_cultivated_position(e2) || @pos_s2_final_possible_end
52
+ while @s1[e1] && e2 < right_boundary_b2 && @s1[e1] == @s2[e2]
61
53
  e1 += 1; e2 += 1
62
54
  end
63
55
 
64
56
  @pos_s1_last_match = e1
65
57
  @pos_s2_last_match = e2
66
58
 
67
- if reverse?
68
- {source:{begin:b2 , end:e2}, target:{begin:b1, end:e1}}
69
- else
70
- {source:{begin:b1 , end:e1}, target:{begin:b2, end:e2}}
71
- end
59
+ {source:{begin:b1 , end:e1}, target:{begin:b2, end:e2}}
72
60
  end
73
61
 
74
62
  private
@@ -91,13 +79,7 @@ class TextAlignment::AnchorFinder
91
79
  # return nil if the anchor is too much frequent
92
80
  def find_beg_s2_candidates(anchor, search_position)
93
81
  candidates = []
94
- while _beg_s2 = @s2.index(anchor, search_position)
95
- search_again_position = @cultivation_map.search_again_position(_beg_s2)
96
- unless search_again_position.nil?
97
- search_position = search_again_position
98
- next
99
- end
100
-
82
+ while _beg_s2 = @cultivation_map.index(anchor, @s2, search_position)
101
83
  candidates << _beg_s2
102
84
 
103
85
  # for speed, skip anchor of high frequency
@@ -126,14 +108,14 @@ class TextAlignment::AnchorFinder
126
108
  next
127
109
  end
128
110
 
129
- left_window_s1, left_window_s2 = get_left_windows(beg_s1, beg_s2)
111
+ left_window_s1, left_window_s2 = get_left_windows(beg_s1, beg_s2, size_window)
130
112
  if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
131
113
  break unless valid_beg_s2.nil?
132
114
  valid_beg_s2 = beg_s2
133
115
  next
134
116
  end
135
117
 
136
- right_window_s1, right_window_s2 = get_right_windows(beg_s1, beg_s2)
118
+ right_window_s1, right_window_s2 = get_right_windows(beg_s1, beg_s2, size_window)
137
119
  if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
138
120
  break unless valid_beg_s2.nil?
139
121
  valid_beg_s2 = beg_s2
@@ -143,7 +125,11 @@ class TextAlignment::AnchorFinder
143
125
 
144
126
  # r == nil means that the inner loop was broken (multiple candidates had passed the tests)
145
127
  # r != nil means that the inner loop was completed (with or w/o a valid beg_s2 found)
146
- break unless r.nil?
128
+ if r.nil?
129
+ valid_beg_s2 = nil
130
+ else
131
+ break
132
+ end
147
133
  end
148
134
 
149
135
  valid_beg_s2
@@ -61,6 +61,7 @@ TextAlignment::CHAR_MAPPING = [
61
61
  ["•", "*"], #U+2022 (bullet)
62
62
  [" ", " "], #U+2009 (thin space)
63
63
  [" ", " "], #U+200A (hair space)
64
+ [" ", " "], #U+202F (narrow no-break space)
64
65
  [" ", " "], #U+00A0 (Non-Breaking space)
65
66
  [" ", " "], #U+3000 (ideographic space)
66
67
  ["‐", "-"], #U+2010 (Hyphen)
@@ -77,11 +78,11 @@ TextAlignment::CHAR_MAPPING = [
77
78
 
78
79
 
79
80
  class TextAlignment::CharMapping
80
- attr_reader :str
81
+ attr_reader :mapped_text
81
82
 
82
- def initialize(_str, char_mapping = nil)
83
+ def initialize(_text, char_mapping = nil)
83
84
  char_mapping ||= TextAlignment::CHAR_MAPPING
84
- @str, offset_mapping = enmap_str(_str, char_mapping)
85
+ @mapped_text, offset_mapping = enmap_text(_text, char_mapping)
85
86
  @index_enmap = offset_mapping.to_h
86
87
  @index_demap = offset_mapping.map{|m| m.reverse}.to_h
87
88
  end
@@ -94,20 +95,22 @@ class TextAlignment::CharMapping
94
95
  @index_demap[position]
95
96
  end
96
97
 
97
- def enmap_denotations(_denotations)
98
- denotations = _denotations.map do |d|
98
+ def enmap_denotations(denotations)
99
+ return nil if denotations.nil?
100
+
101
+ denotations.map do |d|
99
102
  d.dup.merge(span:{begin:enmap_position(d[:span][:begin]), end:enmap_position(d[:span][:end])})
100
103
  end
101
104
  end
102
105
 
103
106
  private
104
107
 
105
- def enmap_str(_str, char_mapping)
106
- str = _str.dup
108
+ def enmap_text(_text, char_mapping)
109
+ text = _text.dup
107
110
 
108
111
  # To execute the single letter mapping
109
112
  char_mapping.each do |one, long|
110
- str.gsub!(one, long) if long.length == 1
113
+ text.gsub!(one, long) if long.length == 1
111
114
  end
112
115
 
113
116
  # To get the (location, length) index for replacements
@@ -116,18 +119,18 @@ class TextAlignment::CharMapping
116
119
  next if long.length == 1
117
120
 
118
121
  init_next = 0
119
- while loc = str.index(long, init_next)
122
+ while loc = text.index(long, init_next)
120
123
  loc_len << [loc, long.length]
121
124
  init_next = loc + long.length
122
125
  end
123
126
 
124
127
  # a workaround to avoid messing-up due to embedding
125
- str.gsub!(long, one * long.length)
128
+ text.gsub!(long, one * long.length)
126
129
  end
127
130
 
128
131
  # To get the (location, length) index for consecutive whitespace sequences
129
132
  init_next = 0
130
- while loc = str.index(/\s{2,}/, init_next)
133
+ while loc = text.index(/\s{2,}/, init_next)
131
134
  len = $~[0].length
132
135
  loc_len << [loc, len]
133
136
  init_next = loc + len
@@ -148,20 +151,20 @@ class TextAlignment::CharMapping
148
151
  init_next = loc + len
149
152
  end
150
153
 
151
- offset_mapping += (init_next .. str.length).map do |i|
154
+ offset_mapping += (init_next .. text.length).map do |i|
152
155
  j += 1
153
156
  [i, j - 1]
154
157
  end
155
158
 
156
159
  # To execute the long letter mapping
157
160
  char_mapping.each do |one, long|
158
- str.gsub!(one * long.length, one) if long.length > 1
161
+ text.gsub!(one * long.length, one) if long.length > 1
159
162
  end
160
163
 
161
164
  # To replace multi whitespace sequences to a space
162
- str.gsub!(/\s{2,}/, ' ')
165
+ text.gsub!(/\s{2,}/, ' ')
163
166
 
164
- [str, offset_mapping]
167
+ [text, offset_mapping]
165
168
  end
166
169
  end
167
170
 
@@ -173,15 +176,15 @@ if __FILE__ == $0
173
176
  exit
174
177
  end
175
178
  annotations = JSON.parse File.read(ARGV[0]).strip, symbolize_names: true
176
- denotations = annotations[:denotations]
179
+ denotations = annotations[:denotations] || []
177
180
  if denotations.nil? && annotations[:tracks]
178
181
  denotations = annotations[:tracks].first[:denotations]
179
182
  end
180
183
 
181
- str_mapping = TextAlignment::CharMapping.new(annotations[:text])
182
- str_mapped = str_mapping.str
183
- denotations_mapped = str_mapping.enmap_denotations(denotations)
184
- new_annotations = {text:str_mapped, denotations:denotations_mapped}
184
+ text_mapping = TextAlignment::CharMapping.new(annotations[:text])
185
+ text_mapped = text_mapping.mapped_text
186
+ denotations_mapped = text_mapping.enmap_denotations(denotations)
187
+ new_annotations = {text:text_mapped, denotations:denotations_mapped}
185
188
 
186
189
  puts new_annotations.to_json
187
190
  end
@@ -4,16 +4,91 @@ class TextAlignment::CultivationMap
4
4
  attr_reader :map
5
5
 
6
6
  def initialize
7
- @map = {}
7
+ @map = []
8
8
  end
9
9
 
10
10
  def cultivate(regions)
11
- regions.each do |b, e|
12
- (b ... e).each{|p| @map[p] = e}
11
+ @map += regions
12
+ @map.sort!{|a, b| a[0] <=> b[0]}
13
+ new_map = []
14
+ @map.each do |region|
15
+ if new_map.empty?
16
+ new_map << region
17
+ elsif new_map.last[1] > region[0]
18
+ raise "Overlapping regions: #{new_map.last} : #{region}"
19
+ elsif new_map.last[1] == region[0]
20
+ new_map.last[1] == region[1]
21
+ else
22
+ new_map << region
23
+ end
13
24
  end
25
+ @map = new_map
14
26
  end
15
27
 
16
- def search_again_position(position)
17
- @map[position]
28
+ def search_again_position(position, end_position = nil)
29
+ end_position ||= position
30
+ region = @map.bsearch{|r| end_position < r[1]}
31
+ if region.nil? || region[0] > position
32
+ nil
33
+ else
34
+ region[1]
35
+ end
36
+ end
37
+
38
+ def last_cultivated_position(position)
39
+ ridx = @map.rindex{|r| r[1] <= position}
40
+ ridx.nil? ? nil : @map[ridx][1]
41
+ end
42
+
43
+ def next_cultivated_position(position)
44
+ region = @map.bsearch{|r| position <= r[0]}
45
+ region.nil? ? nil : region[0]
46
+ end
47
+
48
+ def in_regions(region)
49
+ @map.select{|r| (r[1] > region[0] && r[1] <= region[1]) || (r[0] < region[1] && r[0] >= region[0])}
50
+ end
51
+
52
+ def region_state(region)
53
+ closed_parts = in_regions(region)
54
+ if closed_parts.empty?
55
+ [:open, region]
56
+ else
57
+ if front_open?(region, closed_parts)
58
+ if rear_open?(region, closed_parts)
59
+ [:middle_closed, [closed_parts.first[0], closed_parts.last[1]]]
60
+ else
61
+ [:front_open, [region[0], closed_parts.first[0]]]
62
+ end
63
+ else
64
+ if rear_open?(region, closed_parts)
65
+ [:rear_open, [closed_parts.last[1], region[1]]]
66
+ else
67
+ [:closed, nil]
68
+ end
69
+ end
70
+ end
71
+ end
72
+
73
+ def index(target, string, position = 0)
74
+ length = target.length
75
+ loop do
76
+ _begin = string.index(target, position)
77
+ break if _begin.nil?
78
+ position = search_again_position(_begin)
79
+ next unless position.nil?
80
+ break _begin if region_state([_begin, _begin + length])[0] == :open
81
+ position = _begin + 1
82
+ end
83
+ end
84
+
85
+ private
86
+
87
+ def front_open?(region, closed_parts)
88
+ closed_parts.first[0] > region[0]
89
+ end
90
+
91
+ def rear_open?(region, closed_parts)
92
+ closed_parts.last[1] < region[1]
18
93
  end
19
94
  end
@@ -144,11 +144,17 @@ class TextAlignment::MixedAlignment
144
144
  def compute_similarity(s1, s2, sdiff)
145
145
  return 0 if sdiff.nil?
146
146
 
147
- # compute the lcs only with non-whitespace letters
148
- lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
149
- return 0 if lcs == 0
147
+ # recoverbility
148
+ count_nws = sdiff.count{|d| d.old_element =~ /\S/}
149
+ count_nws_match = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/}
150
150
 
151
- similarity = lcs.to_f / [s1.scan(/\S/).count, s2.scan(/\S/).count].min
152
- end
151
+ coverage = count_nws_match.to_f / count_nws
152
+
153
+ # fragmentation rate
154
+ count_ofrag = sdiff.count{|d| d.old_element =~ /\s/} + 1
155
+ count_frag = sdiff.collect{|d| (d.action == '=') && (d.old_element =~/\s/) ? ' ' : d.action}.join.scan(/=+/).count
156
+ rate_frag = count_ofrag.to_f / count_frag
153
157
 
158
+ similarity = coverage * rate_frag
159
+ end
154
160
  end
@@ -10,56 +10,49 @@ class TextAlignment::TextAlignment
10
10
  attr_reader :block_alignment
11
11
  attr_reader :similarity
12
12
  attr_reader :lost_annotations
13
- attr_reader :cultivation_map
14
13
 
15
- def initialize(_str1, _str2, _denotations = nil, _cultivation_map = nil)
16
- raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
14
+ # Initialize with a reference text, again which texts will be aligned
15
+ def initialize(reference_text, to_prevent_overlap = false)
16
+ raise ArgumentError, "nil text" if reference_text.nil?
17
17
 
18
- @block_alignment = {source_text: _str1, target_text: _str2, denotations: _denotations}
19
- @original_str1 = _str1
20
- @original_str2 = _str2
18
+ @original_reference_text = reference_text
19
+ @rtext_mapping = TextAlignment::CharMapping.new(reference_text)
20
+ @mapped_reference_text = @rtext_mapping.mapped_text
21
+ @to_prevent_overlap = to_prevent_overlap
21
22
 
22
- @str1_mapping = TextAlignment::CharMapping.new(_str1)
23
- @str2_mapping = TextAlignment::CharMapping.new(_str2)
23
+ @original_text = nil
24
+ @blocks = nil
25
+ @cultivation_map = TextAlignment::CultivationMap.new
26
+ end
24
27
 
25
- str1 = @str1_mapping.str
26
- denotations = @str1_mapping.enmap_denotations(_denotations)
28
+ def align(text, denotations = nil)
29
+ # To maintain the cultivation map
30
+ update_cultivation_map if @to_prevent_overlap
27
31
 
28
- str2 = @str2_mapping.str
32
+ # In case the input text is the same as the previous one, reuse the previous text mapping
33
+ unless @original_text && @original_text == text
34
+ @original_text = text
35
+ @text_mapping = TextAlignment::CharMapping.new(text)
36
+ end
29
37
 
30
- @cultivation_map = _cultivation_map || TextAlignment::CultivationMap.new
38
+ @mapped_text = @text_mapping.mapped_text
39
+ denotations_mapped = @text_mapping.enmap_denotations(denotations)
31
40
 
32
- @block_alignment[:blocks] = if r = whole_block_alignment(str1, str2, @cultivation_map)
33
- # whole block alignment
41
+ ## To generate the block_alignment of the input text against the reference text
42
+ @blocks = if r = whole_block_alignment(@mapped_text, @mapped_reference_text, @cultivation_map)
34
43
  r
35
44
  else
36
- find_block_alignment(str1, str2, denotations, @cultivation_map)
45
+ find_block_alignment(@mapped_text, @mapped_reference_text, denotations_mapped, @cultivation_map)
37
46
  end
38
47
 
39
- newly_cultivated_regions = @block_alignment[:blocks].collect do |b|
40
- if b[:alignment] == :block || b[:alignment] == :term
41
- [b[:target][:begin], b[:target][:end]]
42
- else
43
- nil
44
- end
45
- end.compact
46
- newly_cultivated_regions_condensed = newly_cultivated_regions.inject([]) do |condensed, region|
47
- if condensed.empty? || (condensed.last.last + 1 < region.first)
48
- condensed.push region
49
- else
50
- condensed.last[1] = region.last
51
- end
52
- condensed
53
- end
54
-
55
- @cultivation_map.cultivate(newly_cultivated_regions_condensed)
48
+ @block_alignment = {text: @original_text, reference_text: @original_reference_text, denotations: denotations, blocks: demap_blocks(@blocks)}
56
49
  end
57
50
 
58
51
  def transform_begin_position(_begin_position)
59
- begin_position = @str1_mapping.enmap_position(_begin_position)
52
+ begin_position = @text_mapping.enmap_position(_begin_position)
60
53
 
61
- i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
62
- block = @block_alignment[:blocks][i]
54
+ i = @blocks.index{|b| b[:source][:end] > begin_position}
55
+ block = @blocks[i]
63
56
 
64
57
  b = if block[:alignment] == :block || block[:alignment] == :term
65
58
  begin_position + block[:delta]
@@ -74,14 +67,14 @@ class TextAlignment::TextAlignment
74
67
  r.nil? ? nil : r + block[:target][:begin]
75
68
  end
76
69
 
77
- @str2_mapping.demap_position(b)
70
+ @rtext_mapping.demap_position(b)
78
71
  end
79
72
 
80
73
  def transform_end_position(_end_position)
81
- end_position = @str1_mapping.enmap_position(_end_position)
74
+ end_position = @text_mapping.enmap_position(_end_position)
82
75
 
83
- i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
84
- block = @block_alignment[:blocks][i]
76
+ i = @blocks.index{|b| b[:source][:end] >= end_position}
77
+ block = @blocks[i]
85
78
 
86
79
  e = if block[:alignment] == :block || block[:alignment] == :term
87
80
  end_position + block[:delta]
@@ -96,7 +89,7 @@ class TextAlignment::TextAlignment
96
89
  r.nil? ? nil : r + block[:target][:begin]
97
90
  end
98
91
 
99
- @str2_mapping.demap_position(e)
92
+ @rtext_mapping.demap_position(e)
100
93
  end
101
94
 
102
95
  def transform_a_span(span)
@@ -115,7 +108,7 @@ class TextAlignment::TextAlignment
115
108
  source = {begin:d.begin, end:d.end}
116
109
  d.begin = transform_begin_position(d.begin);
117
110
  d.end = transform_end_position(d.end);
118
- raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_str2.length
111
+ raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_reference_text.length
119
112
  rescue
120
113
  @lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
121
114
  d.begin = nil
@@ -131,7 +124,7 @@ class TextAlignment::TextAlignment
131
124
 
132
125
  r = hdenotations.collect do |d|
133
126
  t = transform_a_span(d[:span])
134
- raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_str2.length
127
+ raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_reference_text.length
135
128
  new_d = d.dup.merge({span:t})
136
129
  rescue
137
130
  @lost_annotations << {source: d[:span], target:t}
@@ -142,8 +135,8 @@ class TextAlignment::TextAlignment
142
135
  end
143
136
 
144
137
  def alignment_show
145
- stext = @block_alignment[:source_text]
146
- ttext = @block_alignment[:target_text]
138
+ stext = @block_alignment[:text]
139
+ ttext = @block_alignment[:reference_text]
147
140
 
148
141
  show = ''
149
142
  @block_alignment[:blocks].each do |a|
@@ -199,7 +192,7 @@ class TextAlignment::TextAlignment
199
192
 
200
193
  "***** local mismatch [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}] (similarity: #{a[:similarity]})\n" +
201
194
  "[#{astr1}]\n" +
202
- "[#{astr2}]\n\n"
195
+ "[#{astr2.gsub("\n", " ")}]\n\n"
203
196
  end
204
197
  end
205
198
  show
@@ -238,142 +231,125 @@ class TextAlignment::TextAlignment
238
231
  # puts "-=-=-=-=-"
239
232
  # puts
240
233
 
241
- ## to fill the gaps
242
- last_block = nil
243
- blocks2 = blocks.inject([]) do |sum, block|
244
- b1 = last_block ? last_block[:source][:end] : 0
245
- e1 = block[:source][:begin]
234
+ ## To fill the gaps
235
+ ## lblock: last block, cblock: current block
236
+ lblock = nil
237
+ blocks2 = (blocks + [nil]).inject([]) do |sum, cblock|
238
+ b1 = lblock.nil? ? 0 : lblock[:source][:end]
239
+ e1 = cblock.nil? ? str1.length : cblock[:source][:begin]
246
240
 
247
- sum += if b1 == e1
248
- [block]
249
- else
250
- b2 = last_block ? last_block[:target][:end] : 0
251
- e2 = block[:target][:begin]
252
-
253
- if b2 == e2
254
- [
255
- {source:{begin:b1, end:e1}, alignment: :empty},
256
- block
257
- ]
241
+ if b1 < e1
242
+ b2 = lblock.nil? ? 0 : lblock[:target][:end]
243
+ e2 = cblock.nil? ? str2.length : cblock[:target][:begin]
244
+ _str1 = str1[b1 ... e1]
245
+ _str2 = str2[b2 ... e2]
246
+
247
+ sum += if _str1.strip.empty? || _str2.strip.empty?
248
+ [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
258
249
  else
259
250
  len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
260
-
261
- if b1 == 0 && b2 == 0
262
- b2 = e2 - len_buffer if e2 > len_buffer
263
- end
264
-
265
- _str1 = str1[b1 ... e1]
266
- _str2 = str2[b2 ... e2]
267
-
268
- if _str1.strip.empty? || _str2.strip.empty?
269
- [
270
- {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
271
- block
272
- ]
273
- elsif ((e2 - b2) - (e1 - b1)) > len_buffer
274
- la_block1 = local_alignment_blocks(str1, b1, e1, str2, b2, b2 + len_buffer, denotations)
275
- la_block2 = local_alignment_blocks(str1, b1, e1, str2, e2 - len_buffer, e2, denotations)
276
- [la_block2, la_block2].max{|a, b| a.first[:similarity] <=> b.first[:similarity]} << block
277
- else
278
- local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations) << block
251
+ region_state, state_region = cultivation_map.region_state([b2, e2])
252
+ case region_state
253
+ when :closed
254
+ [{source:{begin:b1, end:e1}, alignment: :empty}]
255
+ when :front_open
256
+ if sum.empty? # when there is no preceding matched block
257
+ [{source:{begin:b1, end:e1}, alignment: :empty}]
258
+ else
259
+ oe2 = state_region[1]
260
+ me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
261
+ local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
262
+ end
263
+ when :rear_open
264
+ if cblock.nil? # when there is no following matched block
265
+ [{source:{begin:b1, end:e1}, alignment: :empty}]
266
+ else
267
+ ob2 = state_region[0]
268
+ mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
269
+ local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
270
+ end
271
+ when :middle_closed
272
+ attempt1 = if sum.empty?
273
+ [{source:{begin:b1, end:e1}, alignment: :empty}]
274
+ else
275
+ oe2 = state_region[0]
276
+ me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
277
+ local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
278
+ end
279
+ if (attempt1.empty? || attempt1.first[:alignment] == :empty) && !cblock.nil?
280
+ ob2 = state_region[1]
281
+ mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
282
+ local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
283
+ else
284
+ attempt1
285
+ end
286
+ else # :open
287
+ if (e2 - b2) > len_buffer
288
+ attempt1 = if sum.empty?
289
+ [{source:{begin:b1, end:e1}, alignment: :empty}]
290
+ else
291
+ local_alignment(str1, b1, e1, str2, b2, b2 + len_buffer, denotations, cultivation_map)
292
+ end
293
+ if (attempt1.empty? || attempt1.first[:alignment] == :empty) && !cblock.nil?
294
+ local_alignment(str1, b1, e1, str2, e2 - len_buffer, e2, denotations, cultivation_map)
295
+ else
296
+ attempt1
297
+ end
298
+ else
299
+ local_alignment(str1, b1, e1, str2, b2, e2, denotations, cultivation_map)
300
+ end
279
301
  end
280
302
  end
281
303
  end
282
304
 
283
- last_block = block
284
- sum
305
+ lblock = cblock
306
+ cblock.nil? ? sum : sum << cblock
285
307
  end
286
308
 
287
- # the last step
288
- blocks2 += if last_block.nil?
289
- local_alignment_blocks(str1, 0, str1.length, str2, 0, str2.length, denotations)
290
- else
291
- b1 = last_block[:source][:end]
292
- if b1 < str1.length
293
- e1 = str1.length
294
- b2 = last_block[:target][:end]
295
-
296
- _str1 = str1[b1 ... e1]
297
- if _str1.strip.empty?
298
- [{source:{begin:b1, end:e1}, alignment: :empty}]
299
- else
300
- if b2 < str2.length
301
- len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
302
- e2 = (str2.length - b2) > len_buffer ? b2 + len_buffer : str2.length
303
-
304
- local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations)
305
- else
306
- [{source:{begin:b1, end:e1}, alignment: :empty}]
307
- end
308
- end
309
- else
310
- []
311
- end
312
- end
313
309
  end
314
310
 
315
311
  def whole_block_alignment(str1, str2, cultivation_map)
316
- ## Block exact match
317
- search_position = 0
318
-
319
- block_begin = begin
320
- _block_begin = str2.index(str1, search_position)
321
- break if _block_begin.nil?
322
- search_position = cultivation_map.search_again_position(_block_begin)
323
- _block_begin
324
- end until search_position.nil?
325
-
326
- unless block_begin.nil?
327
- return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
328
- end
312
+ block_begin = cultivation_map.index(str1, str2)
313
+ return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}] unless block_begin.nil?
329
314
 
330
- search_position = 0
315
+ block_begin = cultivation_map.index(str1.downcase, str2.downcase)
316
+ return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}] unless block_begin.nil?
331
317
 
332
- dstr1 = str1.downcase
333
- dstr2 = str2.downcase
334
- block_begin = begin
335
- _block_begin = dstr2.index(dstr1, search_position)
336
- break if _block_begin.nil?
337
- search_position = cultivation_map.search_again_position(_block_begin)
338
- _block_begin
339
- end until search_position.nil?
318
+ nil
319
+ end
340
320
 
341
- unless block_begin.nil?
342
- return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
321
+ def local_alignment(str1, b1, e1, str2, b2, e2, denotations = nil, cultivation_map)
322
+ tblocks = term_based_alignment(str1, b1, e1, str2, b2, e2, denotations, cultivation_map)
323
+ if tblocks.empty? || tblocks.first[:alignment] == :empty
324
+ lcs_alignment(str1, b1, e1, str2, b2, e2, cultivation_map)
325
+ else
326
+ tblocks
343
327
  end
344
-
345
- nil
346
328
  end
347
329
 
348
- def local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations = nil)
349
- block2 = str2[b2 ... e2]
330
+ def term_based_alignment(str1, b1, e1, str2, b2, e2, denotations = nil, cultivation_map)
331
+ str2_block = str2[0 ... e2]
350
332
 
351
333
  ## term-based alignment
352
334
  tblocks = if denotations
353
- ds_in_scope = denotations.select{|d| d[:span][:begin] >= b1 && d[:span][:end] <= e1}.
335
+ denotations_in_scope = denotations.select{|d| d[:span][:begin] >= b1 && d[:span][:end] <= e1}.
354
336
  sort{|d1, d2| d1[:span][:begin] <=> d2[:span][:begin] || d2[:span][:end] <=> d1[:span][:end] }.
355
337
  map{|d| d.merge(lex:str1[d[:span][:begin] ... d[:span][:end]])}
356
338
 
357
- position = 0
358
- _tblocks = ds_in_scope.map do |term|
359
- lex = term[:lex]
360
- r = block2.index(lex, position)
361
- if r.nil?
362
- position = nil
363
- break
364
- end
365
- position = r + lex.length
366
- {source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, similarity: 0.9, delta: r + b2 - term[:span][:begin]}
339
+ search_position = b2
340
+ _tblocks = denotations_in_scope.map do |denotation|
341
+ lex = denotation[:lex]
342
+ term_begin = cultivation_map.index(lex, str2_block, search_position)
343
+ break [] if term_begin.nil? # break the loop if a missing term is found
344
+ search_position = term_begin + lex.length
345
+ {source:denotation[:span], target:{begin:term_begin, end:term_begin + lex.length}, alignment: :term, similarity: 0.9, delta: term_begin - denotation[:span][:begin]}
367
346
  end
368
347
 
369
- # missing term found
370
- _tblocks = [] if position.nil?
371
-
372
348
  # redundant matching found
373
- unless position.nil?
374
- ds_in_scope.each do |term|
375
- lex = term[:lex]
376
- look_forward = block2.index(lex, position)
349
+ unless _tblocks.empty?
350
+ search_position = _tblocks.last[:target][:end]
351
+ denotations_in_scope.each do |term|
352
+ look_forward = cultivation_map.index(term[:lex], str2_block, search_position)
377
353
  unless look_forward.nil?
378
354
  _tblocks = []
379
355
  break
@@ -386,73 +362,72 @@ class TextAlignment::TextAlignment
386
362
  []
387
363
  end
388
364
 
389
- if tblocks.empty?
390
- if b1 == 0 && e1 == str1.length
391
- if (e1 > 2000) || (e2 > 2000)
392
- [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
393
- else
394
- block1 = str1[b1 ... e1]
395
- block2 = str2[b2 ... e2]
396
-
397
- ## character-based alignment
398
- alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
399
- if alignment.sdiff.nil?
400
- [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
401
- else
402
- [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
403
- end
404
- end
405
- else
406
- block1 = str1[b1 ... e1]
407
- block2 = str2[b2 ... e2]
365
+ ltblock = nil
366
+ tblocks2 = (tblocks + [nil]).inject([]) do |sum, ctblock|
367
+ tb1 = ltblock.nil? ? b1 : ltblock[:source][:end]
368
+ te1 = ctblock.nil? ? e1 : ctblock[:source][:begin]
408
369
 
409
- ## character-based alignment
410
- alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
411
- if alignment.sdiff.nil?
412
- [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
413
- else
414
- [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
415
- end
370
+ if te1 > tb1
371
+ tb2 = ltblock.nil? ? b2 : ltblock[:target][:end]
372
+ te2 = ctblock.nil? ? e2 : ctblock[:target][:begin]
373
+ sum << {source:{begin:tb1, end:te1}, target:{begin:tb2, end:te2}, alignment: :empty}
416
374
  end
417
- else
418
- last_tblock = nil
419
- lblocks = tblocks.inject([]) do |sum, tblock|
420
- tb1 = last_tblock ? last_tblock[:source][:end] : b1
421
- te1 = tblock[:source][:begin]
422
375
 
423
- sum += if te1 == tb1
424
- [tblock]
425
- else
426
- tb2 = last_tblock ? last_tblock[:target][:end] : b2
427
- te2 = tblock[:target][:begin]
428
-
429
- if b2 == e2
430
- [
431
- {source:{begin:tb1, end:te1}, alignment: :empty},
432
- tblock
433
- ]
434
- else
435
- [
436
- {source:{begin:tb1, end:te1}, target:{begin:tb2, end:te2}, alignment: :empty},
437
- tblock
438
- ]
439
- end
440
- end
376
+ ltblock = ctblock
377
+ ctblock.nil? ? sum : sum << ctblock
378
+ end
441
379
 
442
- last_tblock = tblock
443
- sum
380
+ tblocks2
381
+ end
382
+
383
+ def lcs_alignment(str1, b1, e1, str2, b2, e2, cultivation_map)
384
+ source = {begin:b1, end:e1}
385
+ target = {begin:b2, end:e2}
386
+
387
+ if (e1 - b1) > 2000
388
+ [{source:source, target:target, alignment: :empty}]
389
+ else
390
+ alignment = TextAlignment::MixedAlignment.new(str1[b1 ... e1].downcase, str2[b2 ... e2].downcase)
391
+ if alignment.similarity < 0.5
392
+ [{source:source, target:target, alignment: :empty}]
393
+ else
394
+ [{source:source, target:target, alignment: alignment, similarity: alignment.similarity}]
444
395
  end
396
+ end
397
+ end
445
398
 
446
- if last_tblock[:source][:end] < e1
447
- if last_tblock[:target][:end] < e2
448
- lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, target:{begin:last_tblock[:target][:end], end:e2}, alignment: :empty}
449
- else
450
- lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, alignment: :empty}
451
- end
399
+ def update_cultivation_map
400
+ return if @blocks.nil?
401
+
402
+ ## To update the cultivation map
403
+ newly_cultivated_regions = @blocks.collect do |b|
404
+ if b[:alignment] == :block || b[:alignment] == :term
405
+ [b[:target][:begin], b[:target][:end]]
406
+ else
407
+ nil
408
+ end
409
+ end.compact.inject([]) do |condensed, region|
410
+ if condensed.empty? || (condensed.last.last + 1 < region.first)
411
+ condensed.push region
412
+ else
413
+ condensed.last[1] = region.last
452
414
  end
415
+ condensed
416
+ end
417
+
418
+ @cultivation_map.cultivate(newly_cultivated_regions)
419
+ end
453
420
 
454
- lblocks
421
+ def demap_blocks(_blocks)
422
+ return nil if _blocks.nil?
423
+
424
+ blocks = _blocks.map{|b| b.dup}
425
+ blocks.each do |b|
426
+ b[:source] = {begin:@text_mapping.demap_position(b[:source][:begin]), end:@text_mapping.demap_position(b[:source][:end])} if b[:source]
427
+ b[:target] = {begin:@rtext_mapping.demap_position(b[:target][:begin]), end:@rtext_mapping.demap_position(b[:target][:end])} if b[:target]
455
428
  end
429
+
430
+ blocks
456
431
  end
457
432
 
458
433
  end
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.10.1'
2
+ VERSION = '0.11.5'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.10.1
4
+ version: 0.11.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-03-03 00:00:00.000000000 Z
11
+ date: 2021-03-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary