text_alignment 0.11.0 → 0.11.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4d626b64acdca0630dc344e2f0f4c2152e481ca7c8209ac1aa8c025fb1ae7c0f
4
- data.tar.gz: 972222f1f3a575cdb30cd83f886d6d3f36109b79a35cc2c970aa89cbe8fd007d
3
+ metadata.gz: 5ed6071b0293e9b7fa86acf4a737c80c9d784f453d3fb77992e3d9f1acc02bbe
4
+ data.tar.gz: 7a57b7afbe21061d9aac2f96cd9f7a3c83ff2f01b6c3973905f7681a40463287
5
5
  SHA512:
6
- metadata.gz: 288691e9ee650af3b9dfe5b47f1a3d05ff952292958ef733a5a5f47aeeaa73e6e1b7b0591afe3549b6cbabf04817bf406d3ca2ee55faf65b7e3816dfe52be644
7
- data.tar.gz: 7f45baecc36f00310f868a596ca60bce709078bdc51a5f9438a916cd42062bb93dacadafb64e90e42276d0633914c978045fed4b597ea7411c8762761be6bd36
6
+ metadata.gz: 141158c2f6b80975bacf68babc713b06d809f02b96cfd7accdecb00c6b4c362b9d130e29820074be7b89ac53c7cc93f850ec3d7ea9796eaad5021d441528e82b
7
+ data.tar.gz: 8bc4cc9ca9becc1c5638b6501268d33dd7700e8369d7c20c1ec6fd9ef11e9fd73e6a1354cb37376af250c8462e945cccf08417ba6d258f08660c53eb418e4658
@@ -2,6 +2,7 @@
2
2
  require 'text_alignment'
3
3
  require 'json'
4
4
  require 'pp'
5
+ require 'optparse'
5
6
 
6
7
  def read_annotations(filename)
7
8
  case File.extname(filename)
@@ -37,7 +38,7 @@ def align_denotations(denotations, source_text, alignment, debug = false)
37
38
  end
38
39
 
39
40
  lost_annotations = alignment.lost_annotations
40
- unless lost_annotations.empty?
41
+ unless lost_annotations.nil? || lost_annotations.empty?
41
42
  warn "\n[lost annotations] #{lost_annotations.length}"
42
43
  lost_annotations.each do |a|
43
44
  warn "#{a}"
@@ -77,7 +78,9 @@ def align_mannotations(source_annotations, reference_text, alignment, debug = fa
77
78
  annotations[:relations].each do |r|
78
79
  reid = 'R' + (idnum_relations += 1).to_s
79
80
  ididx[r[:id]] = reid
80
- target_annotations[:relations] << r.dup.merge({id:reid, subj:ididx[r[:subj]], obj:ididx[r[:obj]]})
81
+ sid = ididx[r[:subj]]
82
+ oid = ididx[r[:obj]]
83
+ target_annotations[:relations] << r.dup.merge({id:reid, subj:sid, obj:oid}) unless sid.nil? || oid.nil?
81
84
  end
82
85
  end
83
86
 
@@ -86,7 +89,8 @@ def align_mannotations(source_annotations, reference_text, alignment, debug = fa
86
89
  annotations[:attributes].each do |a|
87
90
  reid = 'A' + (idnum_attributes += 1).to_s
88
91
  ididx[a[:id]] = reid
89
- target_annotations[:attributes] << a.dup.merge({id:reid, subj:ididx[a[:subj]]})
92
+ sid = ididx[a[:subj]]
93
+ target_annotations[:attributes] << a.dup.merge({id:reid, subj:sid}) unless sid.nil?
90
94
  end
91
95
  end
92
96
 
@@ -95,7 +99,8 @@ def align_mannotations(source_annotations, reference_text, alignment, debug = fa
95
99
  annotations[:modifications].each do |m|
96
100
  reid = 'M' + (idnum_modifications += 1).to_s
97
101
  ididx[m[:id]] = reid
98
- target_annotations[:modifications] << m.dup.merge({id:reid, obj:ididx[m[:obj]]})
102
+ oid = ididx[m[:obj]]
103
+ target_annotations[:modifications] << m.dup.merge({id:reid, obj:oid}) unless oid.nil?
99
104
  end
100
105
  end
101
106
  end
@@ -104,21 +109,45 @@ def align_mannotations(source_annotations, reference_text, alignment, debug = fa
104
109
  end
105
110
 
106
111
 
112
+ ## Options
113
+ overlap_p = false
114
+ debug_p = false
115
+
116
+ ## command line option processing
117
+ require 'optparse'
118
+ optparse = OptionParser.new do |opts|
119
+ opts.banner = "Usage: align_annotations [options] target_annotations(.json|.txt) reference_text(.json|.txt)"
120
+
121
+ opts.on('-o', '--overlap', 'tells it to assume there may be overlapping texts.') do
122
+ overlap_p = true
123
+ end
124
+
125
+ opts.on('-d', '--debug', 'tells it to show debugging information.') do
126
+ debug_p = true
127
+ end
128
+
129
+ opts.on('-h', '--help', 'displays this screen.') do
130
+ puts opts
131
+ exit
132
+ end
133
+ end
134
+
135
+ optparse.parse!
136
+
107
137
  unless ARGV.length == 2
108
- warn "align_annotations target_annotations(.json|.txt) reference_annotations(.json|.txt)"
109
- exit
138
+ puts optparse.help
139
+ exit 1
110
140
  end
111
141
 
112
142
  source_annotations = read_annotations(ARGV[0])
113
143
  reference_text = read_text(ARGV[1])
114
144
 
115
- alignment = TextAlignment::TextAlignment.new(reference_text, true)
145
+ alignment = TextAlignment::TextAlignment.new(reference_text, !overlap_p)
116
146
 
117
147
  target_annotations = if source_annotations.class == Array
118
- align_mannotations(source_annotations, reference_text, alignment, false)
148
+ align_mannotations(source_annotations, reference_text, alignment, debug_p)
119
149
  else
120
- denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment)
121
- # denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], reference_text, true)
150
+ denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment, debug_p)
122
151
  source_annotations.merge({text:reference_text, denotations:denotations})
123
152
  end
124
153
 
@@ -7,11 +7,8 @@ module TextAlignment; end unless defined? TextAlignment
7
7
  class TextAlignment::AnchorFinder
8
8
 
9
9
  def initialize(source_str, target_str, cultivation_map)
10
- @s1, @s2 = if reverse?(source_str, target_str)
11
- [target_str.downcase, source_str.downcase]
12
- else
13
- [source_str.downcase, target_str.downcase]
14
- end
10
+ @s1 = source_str.downcase
11
+ @s2 = target_str.downcase
15
12
 
16
13
  @cultivation_map = cultivation_map
17
14
 
@@ -19,19 +16,13 @@ class TextAlignment::AnchorFinder
19
16
  @size_window = TextAlignment::SIZE_WINDOW
20
17
  @sim_threshold = TextAlignment::TEXT_SIMILARITY_THRESHOLD
21
18
  @pos_s1_final_possible_begin = @s1.length - @size_ngram - 1
19
+ @pos_s2_final_possible_end = @s2.length
22
20
 
23
21
  # positions of last match
24
22
  @pos_s1_last_match = 0
25
23
  @pos_s2_last_match = 0
26
24
  end
27
25
 
28
- def reverse?(source_str = nil, target_str = nil)
29
- unless source_str.nil?
30
- @reverse_p = target_str.length < source_str.length
31
- end
32
- @reverse_p
33
- end
34
-
35
26
  def get_next_anchor
36
27
  # To find the beginning positions of an anchor ngram in s1 and s2, beginning from the last positions matched
37
28
  beg_s2 = for beg_s1 in @pos_s1_last_match .. @pos_s1_final_possible_begin
@@ -49,26 +40,23 @@ class TextAlignment::AnchorFinder
49
40
  # To extend the block to the left
50
41
  b1 = beg_s1
51
42
  b2 = beg_s2
52
- while b1 >= @pos_s1_last_match && b2 >= @pos_s2_last_match && @s1[b1] == @s2[b2]
43
+ left_boundary_b2 = [@pos_s2_last_match, (@cultivation_map.last_cultivated_position(b2) || 0)].max
44
+ while b1 > @pos_s1_last_match && b2 > left_boundary_b2 && @s1[b1 - 1] == @s2[b2 - 1]
53
45
  b1 -= 1; b2 -= 1
54
46
  end
55
- b1 += 1; b2 += 1
56
47
 
57
48
  # To extend the block to the right
58
49
  e1 = beg_s1 + @size_ngram
59
50
  e2 = beg_s2 + @size_ngram
60
- while @s1[e1] && @s1[e1] == @s2[e2]
51
+ right_boundary_b2 = @cultivation_map.next_cultivated_position(e2) || @pos_s2_final_possible_end
52
+ while @s1[e1] && e2 < right_boundary_b2 && @s1[e1] == @s2[e2]
61
53
  e1 += 1; e2 += 1
62
54
  end
63
55
 
64
56
  @pos_s1_last_match = e1
65
57
  @pos_s2_last_match = e2
66
58
 
67
- if reverse?
68
- {source:{begin:b2 , end:e2}, target:{begin:b1, end:e1}}
69
- else
70
- {source:{begin:b1 , end:e1}, target:{begin:b2, end:e2}}
71
- end
59
+ {source:{begin:b1 , end:e1}, target:{begin:b2, end:e2}}
72
60
  end
73
61
 
74
62
  private
@@ -91,13 +79,7 @@ class TextAlignment::AnchorFinder
91
79
  # return nil if the anchor is too much frequent
92
80
  def find_beg_s2_candidates(anchor, search_position)
93
81
  candidates = []
94
- while _beg_s2 = @s2.index(anchor, search_position)
95
- search_again_position = @cultivation_map.search_again_position(_beg_s2)
96
- unless search_again_position.nil?
97
- search_position = search_again_position
98
- next
99
- end
100
-
82
+ while _beg_s2 = @cultivation_map.index(anchor, @s2, search_position)
101
83
  candidates << _beg_s2
102
84
 
103
85
  # for speed, skip anchor of high frequency
@@ -126,14 +108,14 @@ class TextAlignment::AnchorFinder
126
108
  next
127
109
  end
128
110
 
129
- left_window_s1, left_window_s2 = get_left_windows(beg_s1, beg_s2)
111
+ left_window_s1, left_window_s2 = get_left_windows(beg_s1, beg_s2, size_window)
130
112
  if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
131
113
  break unless valid_beg_s2.nil?
132
114
  valid_beg_s2 = beg_s2
133
115
  next
134
116
  end
135
117
 
136
- right_window_s1, right_window_s2 = get_right_windows(beg_s1, beg_s2)
118
+ right_window_s1, right_window_s2 = get_right_windows(beg_s1, beg_s2, size_window)
137
119
  if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
138
120
  break unless valid_beg_s2.nil?
139
121
  valid_beg_s2 = beg_s2
@@ -143,7 +125,11 @@ class TextAlignment::AnchorFinder
143
125
 
144
126
  # r == nil means that the inner loop was broken (multiple candidates had passed the tests)
145
127
  # r != nil means that the inner loop was completed (with or w/o a valid beg_s2 found)
146
- break unless r.nil?
128
+ if r.nil?
129
+ valid_beg_s2 = nil
130
+ else
131
+ break
132
+ end
147
133
  end
148
134
 
149
135
  valid_beg_s2
@@ -61,6 +61,7 @@ TextAlignment::CHAR_MAPPING = [
61
61
  ["•", "*"], #U+2022 (bullet)
62
62
  [" ", " "], #U+2009 (thin space)
63
63
  [" ", " "], #U+200A (hair space)
64
+ [" ", " "], #U+202F (narrow no-break space)
64
65
  [" ", " "], #U+00A0 (Non-Breaking space)
65
66
  [" ", " "], #U+3000 (ideographic space)
66
67
  ["‐", "-"], #U+2010 (Hyphen)
@@ -94,10 +95,10 @@ class TextAlignment::CharMapping
94
95
  @index_demap[position]
95
96
  end
96
97
 
97
- def enmap_denotations(_denotations)
98
- return nil if _denotations.nil?
98
+ def enmap_denotations(denotations)
99
+ return nil if denotations.nil?
99
100
 
100
- denotations = _denotations.map do |d|
101
+ denotations.map do |d|
101
102
  d.dup.merge(span:{begin:enmap_position(d[:span][:begin]), end:enmap_position(d[:span][:end])})
102
103
  end
103
104
  end
@@ -107,7 +108,7 @@ class TextAlignment::CharMapping
107
108
  def enmap_text(_text, char_mapping)
108
109
  text = _text.dup
109
110
 
110
- # To execute the single letter mapping
111
+ # To execute the single letter mapping replacement
111
112
  char_mapping.each do |one, long|
112
113
  text.gsub!(one, long) if long.length == 1
113
114
  end
@@ -175,7 +176,7 @@ if __FILE__ == $0
175
176
  exit
176
177
  end
177
178
  annotations = JSON.parse File.read(ARGV[0]).strip, symbolize_names: true
178
- denotations = annotations[:denotations]
179
+ denotations = annotations[:denotations] || []
179
180
  if denotations.nil? && annotations[:tracks]
180
181
  denotations = annotations[:tracks].first[:denotations]
181
182
  end
@@ -4,16 +4,91 @@ class TextAlignment::CultivationMap
4
4
  attr_reader :map
5
5
 
6
6
  def initialize
7
- @map = {}
7
+ @map = []
8
8
  end
9
9
 
10
10
  def cultivate(regions)
11
- regions.each do |b, e|
12
- (b ... e).each{|p| @map[p] = e}
11
+ @map += regions
12
+ @map.sort!{|a, b| a[0] <=> b[0]}
13
+ new_map = []
14
+ @map.each do |region|
15
+ if new_map.empty?
16
+ new_map << region
17
+ elsif new_map.last[1] > region[0]
18
+ raise "Overlapping regions: #{new_map.last} : #{region}"
19
+ elsif new_map.last[1] == region[0]
20
+ new_map.last[1] == region[1]
21
+ else
22
+ new_map << region
23
+ end
13
24
  end
25
+ @map = new_map
14
26
  end
15
27
 
16
- def search_again_position(position)
17
- @map[position]
28
+ def search_again_position(position, end_position = nil)
29
+ end_position ||= position
30
+ region = @map.bsearch{|r| end_position < r[1]}
31
+ if region.nil? || region[0] > position
32
+ nil
33
+ else
34
+ region[1]
35
+ end
36
+ end
37
+
38
+ def last_cultivated_position(position)
39
+ ridx = @map.rindex{|r| r[1] <= position}
40
+ ridx.nil? ? nil : @map[ridx][1]
41
+ end
42
+
43
+ def next_cultivated_position(position)
44
+ region = @map.bsearch{|r| position <= r[0]}
45
+ region.nil? ? nil : region[0]
46
+ end
47
+
48
+ def in_regions(region)
49
+ @map.select{|r| (r[1] > region[0] && r[1] <= region[1]) || (r[0] < region[1] && r[0] >= region[0])}
50
+ end
51
+
52
+ def region_state(region)
53
+ closed_parts = in_regions(region)
54
+ if closed_parts.empty?
55
+ [:open, region]
56
+ else
57
+ if front_open?(region, closed_parts)
58
+ if rear_open?(region, closed_parts)
59
+ [:middle_closed, [closed_parts.first[0], closed_parts.last[1]]]
60
+ else
61
+ [:front_open, [region[0], closed_parts.first[0]]]
62
+ end
63
+ else
64
+ if rear_open?(region, closed_parts)
65
+ [:rear_open, [closed_parts.last[1], region[1]]]
66
+ else
67
+ [:closed, nil]
68
+ end
69
+ end
70
+ end
71
+ end
72
+
73
+ def index(target, string, position = 0)
74
+ length = target.length
75
+ loop do
76
+ _begin = string.index(target, position)
77
+ break if _begin.nil?
78
+ position = search_again_position(_begin)
79
+ next unless position.nil?
80
+ break _begin if region_state([_begin, _begin + length])[0] == :open
81
+ position = _begin + 1
82
+ end
83
+ end
84
+
85
+ private
86
+
87
+ def front_open?(region, closed_parts)
88
+ closed_parts.first[0] > region[0]
89
+ end
90
+
91
+ def rear_open?(region, closed_parts)
92
+ closed_parts.last[1] < region[1]
18
93
  end
19
94
  end
@@ -144,11 +144,28 @@ class TextAlignment::MixedAlignment
144
144
  def compute_similarity(s1, s2, sdiff)
145
145
  return 0 if sdiff.nil?
146
146
 
147
- # compute the lcs only with non-whitespace letters
148
- lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
149
- return 0 if lcs == 0
147
+ # recoverbility
148
+ count_nws = sdiff.count{|d| d.old_element =~ /\S/}
149
+ count_nws_match = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/}
150
+ coverage = count_nws_match.to_f / count_nws
151
+
152
+ # fragmentation rate
153
+ frag_str = sdiff.collect do |d|
154
+ case d.action
155
+ when '='
156
+ '='
157
+ when '-'
158
+ ''
159
+ when '+'
160
+ (d.new_element =~ /\S/) ? '+' : ''
161
+ else
162
+ ''
163
+ end
164
+ end.join.sub(/^[^=]++/, '').sub(/[^=]+$/, '')
150
165
 
151
- similarity = lcs.to_f / [s1.scan(/\S/).count, s2.scan(/\S/).count].min
152
- end
166
+ count_frag = frag_str.scan(/=+/).count
167
+ rate_frag = 1.0 / count_frag
153
168
 
169
+ similarity = coverage * rate_frag
170
+ end
154
171
  end
@@ -15,12 +15,13 @@ class TextAlignment::TextAlignment
15
15
  def initialize(reference_text, to_prevent_overlap = false)
16
16
  raise ArgumentError, "nil text" if reference_text.nil?
17
17
 
18
- @original_rtext = reference_text
18
+ @original_reference_text = reference_text
19
19
  @rtext_mapping = TextAlignment::CharMapping.new(reference_text)
20
+ @mapped_reference_text = @rtext_mapping.mapped_text
20
21
  @to_prevent_overlap = to_prevent_overlap
21
22
 
22
23
  @original_text = nil
23
- @block_alignment = nil
24
+ @blocks = nil
24
25
  @cultivation_map = TextAlignment::CultivationMap.new
25
26
  end
26
27
 
@@ -34,51 +35,24 @@ class TextAlignment::TextAlignment
34
35
  @text_mapping = TextAlignment::CharMapping.new(text)
35
36
  end
36
37
 
37
- text_mapped = @text_mapping.mapped_text
38
+ @mapped_text = @text_mapping.mapped_text
38
39
  denotations_mapped = @text_mapping.enmap_denotations(denotations)
39
40
 
40
- rtext_mapped = @rtext_mapping.mapped_text
41
-
42
41
  ## To generate the block_alignment of the input text against the reference text
43
-
44
- # Initialization
45
- @block_alignment = {text: @original_text, reference_text: @original_rtext, denotations: denotations}
46
-
47
- # Generation
48
- @block_alignment[:blocks] = if r = whole_block_alignment(text_mapped, rtext_mapped, @cultivation_map)
42
+ @blocks = if r = whole_block_alignment(@mapped_text, @mapped_reference_text, @cultivation_map)
49
43
  r
50
44
  else
51
- find_block_alignment(text_mapped, rtext_mapped, denotations_mapped, @cultivation_map)
45
+ find_block_alignment(@mapped_text, @mapped_reference_text, denotations_mapped, @cultivation_map)
52
46
  end
53
- end
54
-
55
- def update_cultivation_map
56
- return if @block_alignment.nil? || @block_alignment[:blocks].nil?
57
47
 
58
- ## To update the cultivation map
59
- newly_cultivated_regions = @block_alignment[:blocks].collect do |b|
60
- if b[:alignment] == :block || b[:alignment] == :term
61
- [b[:target][:begin], b[:target][:end]]
62
- else
63
- nil
64
- end
65
- end.compact.inject([]) do |condensed, region|
66
- if condensed.empty? || (condensed.last.last + 1 < region.first)
67
- condensed.push region
68
- else
69
- condensed.last[1] = region.last
70
- end
71
- condensed
72
- end
73
-
74
- @cultivation_map.cultivate(newly_cultivated_regions)
48
+ @block_alignment = {text: @original_text, reference_text: @original_reference_text, denotations: denotations, blocks: demap_blocks(@blocks)}
75
49
  end
76
50
 
77
51
  def transform_begin_position(_begin_position)
78
52
  begin_position = @text_mapping.enmap_position(_begin_position)
79
53
 
80
- i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
81
- block = @block_alignment[:blocks][i]
54
+ i = @blocks.index{|b| b[:source][:end] > begin_position}
55
+ block = @blocks[i]
82
56
 
83
57
  b = if block[:alignment] == :block || block[:alignment] == :term
84
58
  begin_position + block[:delta]
@@ -99,8 +73,8 @@ class TextAlignment::TextAlignment
99
73
  def transform_end_position(_end_position)
100
74
  end_position = @text_mapping.enmap_position(_end_position)
101
75
 
102
- i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
103
- block = @block_alignment[:blocks][i]
76
+ i = @blocks.index{|b| b[:source][:end] >= end_position}
77
+ block = @blocks[i]
104
78
 
105
79
  e = if block[:alignment] == :block || block[:alignment] == :term
106
80
  end_position + block[:delta]
@@ -134,7 +108,7 @@ class TextAlignment::TextAlignment
134
108
  source = {begin:d.begin, end:d.end}
135
109
  d.begin = transform_begin_position(d.begin);
136
110
  d.end = transform_end_position(d.end);
137
- raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_rtext.length
111
+ raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_reference_text.length
138
112
  rescue
139
113
  @lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
140
114
  d.begin = nil
@@ -150,7 +124,7 @@ class TextAlignment::TextAlignment
150
124
 
151
125
  r = hdenotations.collect do |d|
152
126
  t = transform_a_span(d[:span])
153
- raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_rtext.length
127
+ raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_reference_text.length
154
128
  new_d = d.dup.merge({span:t})
155
129
  rescue
156
130
  @lost_annotations << {source: d[:span], target:t}
@@ -218,7 +192,7 @@ class TextAlignment::TextAlignment
218
192
 
219
193
  "***** local mismatch [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}] (similarity: #{a[:similarity]})\n" +
220
194
  "[#{astr1}]\n" +
221
- "[#{astr2}]\n\n"
195
+ "[#{astr2.gsub("\n", " ")}]\n\n"
222
196
  end
223
197
  end
224
198
  show
@@ -257,142 +231,125 @@ class TextAlignment::TextAlignment
257
231
  # puts "-=-=-=-=-"
258
232
  # puts
259
233
 
260
- ## to fill the gaps
261
- last_block = nil
262
- blocks2 = blocks.inject([]) do |sum, block|
263
- b1 = last_block ? last_block[:source][:end] : 0
264
- e1 = block[:source][:begin]
234
+ ## To fill the gaps
235
+ ## lblock: last block, cblock: current block
236
+ lblock = nil
237
+ blocks2 = (blocks + [nil]).inject([]) do |sum, cblock|
238
+ b1 = lblock.nil? ? 0 : lblock[:source][:end]
239
+ e1 = cblock.nil? ? str1.length : cblock[:source][:begin]
265
240
 
266
- sum += if b1 == e1
267
- [block]
268
- else
269
- b2 = last_block ? last_block[:target][:end] : 0
270
- e2 = block[:target][:begin]
271
-
272
- if b2 == e2
273
- [
274
- {source:{begin:b1, end:e1}, alignment: :empty},
275
- block
276
- ]
241
+ if b1 < e1
242
+ b2 = lblock.nil? ? 0 : lblock[:target][:end]
243
+ e2 = cblock.nil? ? str2.length : cblock[:target][:begin]
244
+ _str1 = str1[b1 ... e1]
245
+ _str2 = str2[b2 ... e2]
246
+
247
+ sum += if _str1.strip.empty? || _str2.strip.empty?
248
+ [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
277
249
  else
278
250
  len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
279
-
280
- if b1 == 0 && b2 == 0
281
- b2 = e2 - len_buffer if e2 > len_buffer
282
- end
283
-
284
- _str1 = str1[b1 ... e1]
285
- _str2 = str2[b2 ... e2]
286
-
287
- if _str1.strip.empty? || _str2.strip.empty?
288
- [
289
- {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
290
- block
291
- ]
292
- elsif ((e2 - b2) - (e1 - b1)) > len_buffer
293
- la_block1 = local_alignment_blocks(str1, b1, e1, str2, b2, b2 + len_buffer, denotations)
294
- la_block2 = local_alignment_blocks(str1, b1, e1, str2, e2 - len_buffer, e2, denotations)
295
- [la_block2, la_block2].max{|a, b| a.first[:similarity] <=> b.first[:similarity]} << block
296
- else
297
- local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations) << block
251
+ region_state, state_region = cultivation_map.region_state([b2, e2])
252
+ case region_state
253
+ when :closed
254
+ [{source:{begin:b1, end:e1}, alignment: :empty}]
255
+ when :front_open
256
+ if sum.empty? # when there is no preceding matched block
257
+ [{source:{begin:b1, end:e1}, alignment: :empty}]
258
+ else
259
+ oe2 = state_region[1]
260
+ me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
261
+ local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
262
+ end
263
+ when :rear_open
264
+ if cblock.nil? # when there is no following matched block
265
+ [{source:{begin:b1, end:e1}, alignment: :empty}]
266
+ else
267
+ ob2 = state_region[0]
268
+ mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
269
+ local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
270
+ end
271
+ when :middle_closed
272
+ attempt1 = if sum.empty?
273
+ [{source:{begin:b1, end:e1}, alignment: :empty}]
274
+ else
275
+ oe2 = state_region[0]
276
+ me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
277
+ local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
278
+ end
279
+ if (attempt1.empty? || attempt1.first[:alignment] == :empty) && !cblock.nil?
280
+ ob2 = state_region[1]
281
+ mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
282
+ local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
283
+ else
284
+ attempt1
285
+ end
286
+ else # :open
287
+ if (e2 - b2) > len_buffer
288
+ attempt1 = if sum.empty?
289
+ [{source:{begin:b1, end:e1}, alignment: :empty}]
290
+ else
291
+ local_alignment(str1, b1, e1, str2, b2, b2 + len_buffer, denotations, cultivation_map)
292
+ end
293
+ if (attempt1.empty? || attempt1.first[:alignment] == :empty) && !cblock.nil?
294
+ local_alignment(str1, b1, e1, str2, e2 - len_buffer, e2, denotations, cultivation_map)
295
+ else
296
+ attempt1
297
+ end
298
+ else
299
+ local_alignment(str1, b1, e1, str2, b2, e2, denotations, cultivation_map)
300
+ end
298
301
  end
299
302
  end
300
303
  end
301
304
 
302
- last_block = block
303
- sum
305
+ lblock = cblock
306
+ cblock.nil? ? sum : sum << cblock
304
307
  end
305
308
 
306
- # the last step
307
- blocks2 += if last_block.nil?
308
- local_alignment_blocks(str1, 0, str1.length, str2, 0, str2.length, denotations)
309
- else
310
- b1 = last_block[:source][:end]
311
- if b1 < str1.length
312
- e1 = str1.length
313
- b2 = last_block[:target][:end]
314
-
315
- _str1 = str1[b1 ... e1]
316
- if _str1.strip.empty?
317
- [{source:{begin:b1, end:e1}, alignment: :empty}]
318
- else
319
- if b2 < str2.length
320
- len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
321
- e2 = (str2.length - b2) > len_buffer ? b2 + len_buffer : str2.length
322
-
323
- local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations)
324
- else
325
- [{source:{begin:b1, end:e1}, alignment: :empty}]
326
- end
327
- end
328
- else
329
- []
330
- end
331
- end
332
309
  end
333
310
 
334
311
  def whole_block_alignment(str1, str2, cultivation_map)
335
- ## Block exact match
336
- search_position = 0
337
-
338
- block_begin = begin
339
- _block_begin = str2.index(str1, search_position)
340
- break if _block_begin.nil?
341
- search_position = cultivation_map.search_again_position(_block_begin)
342
- _block_begin
343
- end until search_position.nil?
344
-
345
- unless block_begin.nil?
346
- return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
347
- end
312
+ block_begin = cultivation_map.index(str1, str2)
313
+ return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}] unless block_begin.nil?
348
314
 
349
- search_position = 0
315
+ block_begin = cultivation_map.index(str1.downcase, str2.downcase)
316
+ return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}] unless block_begin.nil?
350
317
 
351
- dstr1 = str1.downcase
352
- dstr2 = str2.downcase
353
- block_begin = begin
354
- _block_begin = dstr2.index(dstr1, search_position)
355
- break if _block_begin.nil?
356
- search_position = cultivation_map.search_again_position(_block_begin)
357
- _block_begin
358
- end until search_position.nil?
318
+ nil
319
+ end
359
320
 
360
- unless block_begin.nil?
361
- return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
321
+ def local_alignment(str1, b1, e1, str2, b2, e2, denotations = nil, cultivation_map)
322
+ tblocks = term_based_alignment(str1, b1, e1, str2, b2, e2, denotations, cultivation_map)
323
+ if tblocks.empty? || tblocks.first[:alignment] == :empty
324
+ lcs_alignment(str1, b1, e1, str2, b2, e2, cultivation_map)
325
+ else
326
+ tblocks
362
327
  end
363
-
364
- nil
365
328
  end
366
329
 
367
- def local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations = nil)
368
- block2 = str2[b2 ... e2]
330
+ def term_based_alignment(str1, b1, e1, str2, b2, e2, denotations = nil, cultivation_map)
331
+ str2_block = str2[0 ... e2]
369
332
 
370
333
  ## term-based alignment
371
334
  tblocks = if denotations
372
- ds_in_scope = denotations.select{|d| d[:span][:begin] >= b1 && d[:span][:end] <= e1}.
335
+ denotations_in_scope = denotations.select{|d| d[:span][:begin] >= b1 && d[:span][:end] <= e1}.
373
336
  sort{|d1, d2| d1[:span][:begin] <=> d2[:span][:begin] || d2[:span][:end] <=> d1[:span][:end] }.
374
337
  map{|d| d.merge(lex:str1[d[:span][:begin] ... d[:span][:end]])}
375
338
 
376
- position = 0
377
- _tblocks = ds_in_scope.map do |term|
378
- lex = term[:lex]
379
- r = block2.index(lex, position)
380
- if r.nil?
381
- position = nil
382
- break
383
- end
384
- position = r + lex.length
385
- {source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, similarity: 0.9, delta: r + b2 - term[:span][:begin]}
339
+ search_position = b2
340
+ _tblocks = denotations_in_scope.map do |denotation|
341
+ lex = denotation[:lex]
342
+ term_begin = cultivation_map.index(lex, str2_block, search_position)
343
+ break [] if term_begin.nil? # break the loop if a missing term is found
344
+ search_position = term_begin + lex.length
345
+ {source:denotation[:span], target:{begin:term_begin, end:term_begin + lex.length}, alignment: :term, similarity: 0.9, delta: term_begin - denotation[:span][:begin]}
386
346
  end
387
347
 
388
- # missing term found
389
- _tblocks = [] if position.nil?
390
-
391
348
  # redundant matching found
392
- unless position.nil?
393
- ds_in_scope.each do |term|
394
- lex = term[:lex]
395
- look_forward = block2.index(lex, position)
349
+ unless _tblocks.empty?
350
+ search_position = _tblocks.last[:target][:end]
351
+ denotations_in_scope.each do |term|
352
+ look_forward = cultivation_map.index(term[:lex], str2_block, search_position)
396
353
  unless look_forward.nil?
397
354
  _tblocks = []
398
355
  break
@@ -405,73 +362,72 @@ class TextAlignment::TextAlignment
405
362
  []
406
363
  end
407
364
 
408
- if tblocks.empty?
409
- if b1 == 0 && e1 == str1.length
410
- if (e1 > 2000) || (e2 > 2000)
411
- [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
412
- else
413
- block1 = str1[b1 ... e1]
414
- block2 = str2[b2 ... e2]
415
-
416
- ## character-based alignment
417
- alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
418
- if alignment.sdiff.nil?
419
- [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
420
- else
421
- [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
422
- end
423
- end
424
- else
425
- block1 = str1[b1 ... e1]
426
- block2 = str2[b2 ... e2]
365
+ ltblock = nil
366
+ tblocks2 = (tblocks + [nil]).inject([]) do |sum, ctblock|
367
+ tb1 = ltblock.nil? ? b1 : ltblock[:source][:end]
368
+ te1 = ctblock.nil? ? e1 : ctblock[:source][:begin]
427
369
 
428
- ## character-based alignment
429
- alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
430
- if alignment.sdiff.nil?
431
- [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
432
- else
433
- [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
434
- end
370
+ if te1 > tb1
371
+ tb2 = ltblock.nil? ? b2 : ltblock[:target][:end]
372
+ te2 = ctblock.nil? ? e2 : ctblock[:target][:begin]
373
+ sum << {source:{begin:tb1, end:te1}, target:{begin:tb2, end:te2}, alignment: :empty}
435
374
  end
436
- else
437
- last_tblock = nil
438
- lblocks = tblocks.inject([]) do |sum, tblock|
439
- tb1 = last_tblock ? last_tblock[:source][:end] : b1
440
- te1 = tblock[:source][:begin]
441
375
 
442
- sum += if te1 == tb1
443
- [tblock]
444
- else
445
- tb2 = last_tblock ? last_tblock[:target][:end] : b2
446
- te2 = tblock[:target][:begin]
447
-
448
- if b2 == e2
449
- [
450
- {source:{begin:tb1, end:te1}, alignment: :empty},
451
- tblock
452
- ]
453
- else
454
- [
455
- {source:{begin:tb1, end:te1}, target:{begin:tb2, end:te2}, alignment: :empty},
456
- tblock
457
- ]
458
- end
459
- end
376
+ ltblock = ctblock
377
+ ctblock.nil? ? sum : sum << ctblock
378
+ end
379
+
380
+ tblocks2
381
+ end
460
382
 
461
- last_tblock = tblock
462
- sum
383
+ def lcs_alignment(str1, b1, e1, str2, b2, e2, cultivation_map)
384
+ source = {begin:b1, end:e1}
385
+ target = {begin:b2, end:e2}
386
+
387
+ if (e1 - b1) > 2000
388
+ [{source:source, target:target, alignment: :empty}]
389
+ else
390
+ alignment = TextAlignment::MixedAlignment.new(str1[b1 ... e1].downcase, str2[b2 ... e2].downcase)
391
+ if alignment.similarity < 0.5
392
+ [{source:source, target:target, alignment: :empty}]
393
+ else
394
+ [{source:source, target:target, alignment: alignment, similarity: alignment.similarity}]
463
395
  end
396
+ end
397
+ end
464
398
 
465
- if last_tblock[:source][:end] < e1
466
- if last_tblock[:target][:end] < e2
467
- lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, target:{begin:last_tblock[:target][:end], end:e2}, alignment: :empty}
468
- else
469
- lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, alignment: :empty}
470
- end
399
+ def update_cultivation_map
400
+ return if @blocks.nil?
401
+
402
+ ## To update the cultivation map
403
+ newly_cultivated_regions = @blocks.collect do |b|
404
+ if b[:alignment] == :block || b[:alignment] == :term
405
+ [b[:target][:begin], b[:target][:end]]
406
+ else
407
+ nil
408
+ end
409
+ end.compact.inject([]) do |condensed, region|
410
+ if condensed.empty? || (condensed.last.last + 1 < region.first)
411
+ condensed.push region
412
+ else
413
+ condensed.last[1] = region.last
471
414
  end
415
+ condensed
416
+ end
472
417
 
473
- lblocks
418
+ @cultivation_map.cultivate(newly_cultivated_regions)
419
+ end
420
+
421
+ def demap_blocks(_blocks)
422
+ return nil if _blocks.nil?
423
+
424
+ blocks = _blocks.map{|b| b.dup}
425
+ blocks.each do |b|
426
+ b[:source] = {begin:@text_mapping.demap_position(b[:source][:begin]), end:@text_mapping.demap_position(b[:source][:end])} if b[:source]
427
+ b[:target] = {begin:@rtext_mapping.demap_position(b[:target][:begin]), end:@rtext_mapping.demap_position(b[:target][:end])} if b[:target]
474
428
  end
429
+
430
+ blocks
475
431
  end
476
432
 
477
433
  end
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.11.0'
2
+ VERSION = '0.11.6'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.11.0
4
+ version: 0.11.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-03-04 00:00:00.000000000 Z
11
+ date: 2021-03-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary