text_alignment 0.11.0 → 0.11.6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4d626b64acdca0630dc344e2f0f4c2152e481ca7c8209ac1aa8c025fb1ae7c0f
4
- data.tar.gz: 972222f1f3a575cdb30cd83f886d6d3f36109b79a35cc2c970aa89cbe8fd007d
3
+ metadata.gz: 5ed6071b0293e9b7fa86acf4a737c80c9d784f453d3fb77992e3d9f1acc02bbe
4
+ data.tar.gz: 7a57b7afbe21061d9aac2f96cd9f7a3c83ff2f01b6c3973905f7681a40463287
5
5
  SHA512:
6
- metadata.gz: 288691e9ee650af3b9dfe5b47f1a3d05ff952292958ef733a5a5f47aeeaa73e6e1b7b0591afe3549b6cbabf04817bf406d3ca2ee55faf65b7e3816dfe52be644
7
- data.tar.gz: 7f45baecc36f00310f868a596ca60bce709078bdc51a5f9438a916cd42062bb93dacadafb64e90e42276d0633914c978045fed4b597ea7411c8762761be6bd36
6
+ metadata.gz: 141158c2f6b80975bacf68babc713b06d809f02b96cfd7accdecb00c6b4c362b9d130e29820074be7b89ac53c7cc93f850ec3d7ea9796eaad5021d441528e82b
7
+ data.tar.gz: 8bc4cc9ca9becc1c5638b6501268d33dd7700e8369d7c20c1ec6fd9ef11e9fd73e6a1354cb37376af250c8462e945cccf08417ba6d258f08660c53eb418e4658
@@ -2,6 +2,7 @@
2
2
  require 'text_alignment'
3
3
  require 'json'
4
4
  require 'pp'
5
+ require 'optparse'
5
6
 
6
7
  def read_annotations(filename)
7
8
  case File.extname(filename)
@@ -37,7 +38,7 @@ def align_denotations(denotations, source_text, alignment, debug = false)
37
38
  end
38
39
 
39
40
  lost_annotations = alignment.lost_annotations
40
- unless lost_annotations.empty?
41
+ unless lost_annotations.nil? || lost_annotations.empty?
41
42
  warn "\n[lost annotations] #{lost_annotations.length}"
42
43
  lost_annotations.each do |a|
43
44
  warn "#{a}"
@@ -77,7 +78,9 @@ def align_mannotations(source_annotations, reference_text, alignment, debug = fa
77
78
  annotations[:relations].each do |r|
78
79
  reid = 'R' + (idnum_relations += 1).to_s
79
80
  ididx[r[:id]] = reid
80
- target_annotations[:relations] << r.dup.merge({id:reid, subj:ididx[r[:subj]], obj:ididx[r[:obj]]})
81
+ sid = ididx[r[:subj]]
82
+ oid = ididx[r[:obj]]
83
+ target_annotations[:relations] << r.dup.merge({id:reid, subj:sid, obj:oid}) unless sid.nil? || oid.nil?
81
84
  end
82
85
  end
83
86
 
@@ -86,7 +89,8 @@ def align_mannotations(source_annotations, reference_text, alignment, debug = fa
86
89
  annotations[:attributes].each do |a|
87
90
  reid = 'A' + (idnum_attributes += 1).to_s
88
91
  ididx[a[:id]] = reid
89
- target_annotations[:attributes] << a.dup.merge({id:reid, subj:ididx[a[:subj]]})
92
+ sid = ididx[a[:subj]]
93
+ target_annotations[:attributes] << a.dup.merge({id:reid, subj:sid}) unless sid.nil?
90
94
  end
91
95
  end
92
96
 
@@ -95,7 +99,8 @@ def align_mannotations(source_annotations, reference_text, alignment, debug = fa
95
99
  annotations[:modifications].each do |m|
96
100
  reid = 'M' + (idnum_modifications += 1).to_s
97
101
  ididx[m[:id]] = reid
98
- target_annotations[:modifications] << m.dup.merge({id:reid, obj:ididx[m[:obj]]})
102
+ oid = ididx[m[:obj]]
103
+ target_annotations[:modifications] << m.dup.merge({id:reid, obj:oid}) unless oid.nil?
99
104
  end
100
105
  end
101
106
  end
@@ -104,21 +109,45 @@ def align_mannotations(source_annotations, reference_text, alignment, debug = fa
104
109
  end
105
110
 
106
111
 
112
+ ## Options
113
+ overlap_p = false
114
+ debug_p = false
115
+
116
+ ## command line option processing
117
+ require 'optparse'
118
+ optparse = OptionParser.new do |opts|
119
+ opts.banner = "Usage: align_annotations [options] target_annotations(.json|.txt) reference_text(.json|.txt)"
120
+
121
+ opts.on('-o', '--overlap', 'tells it to assume there may be overlapping texts.') do
122
+ overlap_p = true
123
+ end
124
+
125
+ opts.on('-d', '--debug', 'tells it to show debugging information.') do
126
+ debug_p = true
127
+ end
128
+
129
+ opts.on('-h', '--help', 'displays this screen.') do
130
+ puts opts
131
+ exit
132
+ end
133
+ end
134
+
135
+ optparse.parse!
136
+
107
137
  unless ARGV.length == 2
108
- warn "align_annotations target_annotations(.json|.txt) reference_annotations(.json|.txt)"
109
- exit
138
+ puts optparse.help
139
+ exit 1
110
140
  end
111
141
 
112
142
  source_annotations = read_annotations(ARGV[0])
113
143
  reference_text = read_text(ARGV[1])
114
144
 
115
- alignment = TextAlignment::TextAlignment.new(reference_text, true)
145
+ alignment = TextAlignment::TextAlignment.new(reference_text, !overlap_p)
116
146
 
117
147
  target_annotations = if source_annotations.class == Array
118
- align_mannotations(source_annotations, reference_text, alignment, false)
148
+ align_mannotations(source_annotations, reference_text, alignment, debug_p)
119
149
  else
120
- denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment)
121
- # denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], reference_text, true)
150
+ denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment, debug_p)
122
151
  source_annotations.merge({text:reference_text, denotations:denotations})
123
152
  end
124
153
 
@@ -7,11 +7,8 @@ module TextAlignment; end unless defined? TextAlignment
7
7
  class TextAlignment::AnchorFinder
8
8
 
9
9
  def initialize(source_str, target_str, cultivation_map)
10
- @s1, @s2 = if reverse?(source_str, target_str)
11
- [target_str.downcase, source_str.downcase]
12
- else
13
- [source_str.downcase, target_str.downcase]
14
- end
10
+ @s1 = source_str.downcase
11
+ @s2 = target_str.downcase
15
12
 
16
13
  @cultivation_map = cultivation_map
17
14
 
@@ -19,19 +16,13 @@ class TextAlignment::AnchorFinder
19
16
  @size_window = TextAlignment::SIZE_WINDOW
20
17
  @sim_threshold = TextAlignment::TEXT_SIMILARITY_THRESHOLD
21
18
  @pos_s1_final_possible_begin = @s1.length - @size_ngram - 1
19
+ @pos_s2_final_possible_end = @s2.length
22
20
 
23
21
  # positions of last match
24
22
  @pos_s1_last_match = 0
25
23
  @pos_s2_last_match = 0
26
24
  end
27
25
 
28
- def reverse?(source_str = nil, target_str = nil)
29
- unless source_str.nil?
30
- @reverse_p = target_str.length < source_str.length
31
- end
32
- @reverse_p
33
- end
34
-
35
26
  def get_next_anchor
36
27
  # To find the beginning positions of an anchor ngram in s1 and s2, beginning from the last positions matched
37
28
  beg_s2 = for beg_s1 in @pos_s1_last_match .. @pos_s1_final_possible_begin
@@ -49,26 +40,23 @@ class TextAlignment::AnchorFinder
49
40
  # To extend the block to the left
50
41
  b1 = beg_s1
51
42
  b2 = beg_s2
52
- while b1 >= @pos_s1_last_match && b2 >= @pos_s2_last_match && @s1[b1] == @s2[b2]
43
+ left_boundary_b2 = [@pos_s2_last_match, (@cultivation_map.last_cultivated_position(b2) || 0)].max
44
+ while b1 > @pos_s1_last_match && b2 > left_boundary_b2 && @s1[b1 - 1] == @s2[b2 - 1]
53
45
  b1 -= 1; b2 -= 1
54
46
  end
55
- b1 += 1; b2 += 1
56
47
 
57
48
  # To extend the block to the right
58
49
  e1 = beg_s1 + @size_ngram
59
50
  e2 = beg_s2 + @size_ngram
60
- while @s1[e1] && @s1[e1] == @s2[e2]
51
+ right_boundary_b2 = @cultivation_map.next_cultivated_position(e2) || @pos_s2_final_possible_end
52
+ while @s1[e1] && e2 < right_boundary_b2 && @s1[e1] == @s2[e2]
61
53
  e1 += 1; e2 += 1
62
54
  end
63
55
 
64
56
  @pos_s1_last_match = e1
65
57
  @pos_s2_last_match = e2
66
58
 
67
- if reverse?
68
- {source:{begin:b2 , end:e2}, target:{begin:b1, end:e1}}
69
- else
70
- {source:{begin:b1 , end:e1}, target:{begin:b2, end:e2}}
71
- end
59
+ {source:{begin:b1 , end:e1}, target:{begin:b2, end:e2}}
72
60
  end
73
61
 
74
62
  private
@@ -91,13 +79,7 @@ class TextAlignment::AnchorFinder
91
79
  # return nil if the anchor is too much frequent
92
80
  def find_beg_s2_candidates(anchor, search_position)
93
81
  candidates = []
94
- while _beg_s2 = @s2.index(anchor, search_position)
95
- search_again_position = @cultivation_map.search_again_position(_beg_s2)
96
- unless search_again_position.nil?
97
- search_position = search_again_position
98
- next
99
- end
100
-
82
+ while _beg_s2 = @cultivation_map.index(anchor, @s2, search_position)
101
83
  candidates << _beg_s2
102
84
 
103
85
  # for speed, skip anchor of high frequency
@@ -126,14 +108,14 @@ class TextAlignment::AnchorFinder
126
108
  next
127
109
  end
128
110
 
129
- left_window_s1, left_window_s2 = get_left_windows(beg_s1, beg_s2)
111
+ left_window_s1, left_window_s2 = get_left_windows(beg_s1, beg_s2, size_window)
130
112
  if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
131
113
  break unless valid_beg_s2.nil?
132
114
  valid_beg_s2 = beg_s2
133
115
  next
134
116
  end
135
117
 
136
- right_window_s1, right_window_s2 = get_right_windows(beg_s1, beg_s2)
118
+ right_window_s1, right_window_s2 = get_right_windows(beg_s1, beg_s2, size_window)
137
119
  if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
138
120
  break unless valid_beg_s2.nil?
139
121
  valid_beg_s2 = beg_s2
@@ -143,7 +125,11 @@ class TextAlignment::AnchorFinder
143
125
 
144
126
  # r == nil means that the inner loop was broken (multiple candidates had passed the tests)
145
127
  # r != nil means that the inner loop was completed (with or w/o a valid beg_s2 found)
146
- break unless r.nil?
128
+ if r.nil?
129
+ valid_beg_s2 = nil
130
+ else
131
+ break
132
+ end
147
133
  end
148
134
 
149
135
  valid_beg_s2
@@ -61,6 +61,7 @@ TextAlignment::CHAR_MAPPING = [
61
61
  ["•", "*"], #U+2022 (bullet)
62
62
  [" ", " "], #U+2009 (thin space)
63
63
  [" ", " "], #U+200A (hair space)
64
+ [" ", " "], #U+202F (narrow no-break space)
64
65
  [" ", " "], #U+00A0 (Non-Breaking space)
65
66
  [" ", " "], #U+3000 (ideographic space)
66
67
  ["‐", "-"], #U+2010 (Hyphen)
@@ -94,10 +95,10 @@ class TextAlignment::CharMapping
94
95
  @index_demap[position]
95
96
  end
96
97
 
97
- def enmap_denotations(_denotations)
98
- return nil if _denotations.nil?
98
+ def enmap_denotations(denotations)
99
+ return nil if denotations.nil?
99
100
 
100
- denotations = _denotations.map do |d|
101
+ denotations.map do |d|
101
102
  d.dup.merge(span:{begin:enmap_position(d[:span][:begin]), end:enmap_position(d[:span][:end])})
102
103
  end
103
104
  end
@@ -107,7 +108,7 @@ class TextAlignment::CharMapping
107
108
  def enmap_text(_text, char_mapping)
108
109
  text = _text.dup
109
110
 
110
- # To execute the single letter mapping
111
+ # To execute the single letter mapping replacement
111
112
  char_mapping.each do |one, long|
112
113
  text.gsub!(one, long) if long.length == 1
113
114
  end
@@ -175,7 +176,7 @@ if __FILE__ == $0
175
176
  exit
176
177
  end
177
178
  annotations = JSON.parse File.read(ARGV[0]).strip, symbolize_names: true
178
- denotations = annotations[:denotations]
179
+ denotations = annotations[:denotations] || []
179
180
  if denotations.nil? && annotations[:tracks]
180
181
  denotations = annotations[:tracks].first[:denotations]
181
182
  end
@@ -4,16 +4,91 @@ class TextAlignment::CultivationMap
4
4
  attr_reader :map
5
5
 
6
6
  def initialize
7
- @map = {}
7
+ @map = []
8
8
  end
9
9
 
10
10
  def cultivate(regions)
11
- regions.each do |b, e|
12
- (b ... e).each{|p| @map[p] = e}
11
+ @map += regions
12
+ @map.sort!{|a, b| a[0] <=> b[0]}
13
+ new_map = []
14
+ @map.each do |region|
15
+ if new_map.empty?
16
+ new_map << region
17
+ elsif new_map.last[1] > region[0]
18
+ raise "Overlapping regions: #{new_map.last} : #{region}"
19
+ elsif new_map.last[1] == region[0]
20
+ new_map.last[1] == region[1]
21
+ else
22
+ new_map << region
23
+ end
13
24
  end
25
+ @map = new_map
14
26
  end
15
27
 
16
- def search_again_position(position)
17
- @map[position]
28
+ def search_again_position(position, end_position = nil)
29
+ end_position ||= position
30
+ region = @map.bsearch{|r| end_position < r[1]}
31
+ if region.nil? || region[0] > position
32
+ nil
33
+ else
34
+ region[1]
35
+ end
36
+ end
37
+
38
+ def last_cultivated_position(position)
39
+ ridx = @map.rindex{|r| r[1] <= position}
40
+ ridx.nil? ? nil : @map[ridx][1]
41
+ end
42
+
43
+ def next_cultivated_position(position)
44
+ region = @map.bsearch{|r| position <= r[0]}
45
+ region.nil? ? nil : region[0]
46
+ end
47
+
48
+ def in_regions(region)
49
+ @map.select{|r| (r[1] > region[0] && r[1] <= region[1]) || (r[0] < region[1] && r[0] >= region[0])}
50
+ end
51
+
52
+ def region_state(region)
53
+ closed_parts = in_regions(region)
54
+ if closed_parts.empty?
55
+ [:open, region]
56
+ else
57
+ if front_open?(region, closed_parts)
58
+ if rear_open?(region, closed_parts)
59
+ [:middle_closed, [closed_parts.first[0], closed_parts.last[1]]]
60
+ else
61
+ [:front_open, [region[0], closed_parts.first[0]]]
62
+ end
63
+ else
64
+ if rear_open?(region, closed_parts)
65
+ [:rear_open, [closed_parts.last[1], region[1]]]
66
+ else
67
+ [:closed, nil]
68
+ end
69
+ end
70
+ end
71
+ end
72
+
73
+ def index(target, string, position = 0)
74
+ length = target.length
75
+ loop do
76
+ _begin = string.index(target, position)
77
+ break if _begin.nil?
78
+ position = search_again_position(_begin)
79
+ next unless position.nil?
80
+ break _begin if region_state([_begin, _begin + length])[0] == :open
81
+ position = _begin + 1
82
+ end
83
+ end
84
+
85
+ private
86
+
87
+ def front_open?(region, closed_parts)
88
+ closed_parts.first[0] > region[0]
89
+ end
90
+
91
+ def rear_open?(region, closed_parts)
92
+ closed_parts.last[1] < region[1]
18
93
  end
19
94
  end
@@ -144,11 +144,28 @@ class TextAlignment::MixedAlignment
144
144
  def compute_similarity(s1, s2, sdiff)
145
145
  return 0 if sdiff.nil?
146
146
 
147
- # compute the lcs only with non-whitespace letters
148
- lcs = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/ && d.new_element =~ /\S/}
149
- return 0 if lcs == 0
147
+ # recoverbility
148
+ count_nws = sdiff.count{|d| d.old_element =~ /\S/}
149
+ count_nws_match = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/}
150
+ coverage = count_nws_match.to_f / count_nws
151
+
152
+ # fragmentation rate
153
+ frag_str = sdiff.collect do |d|
154
+ case d.action
155
+ when '='
156
+ '='
157
+ when '-'
158
+ ''
159
+ when '+'
160
+ (d.new_element =~ /\S/) ? '+' : ''
161
+ else
162
+ ''
163
+ end
164
+ end.join.sub(/^[^=]++/, '').sub(/[^=]+$/, '')
150
165
 
151
- similarity = lcs.to_f / [s1.scan(/\S/).count, s2.scan(/\S/).count].min
152
- end
166
+ count_frag = frag_str.scan(/=+/).count
167
+ rate_frag = 1.0 / count_frag
153
168
 
169
+ similarity = coverage * rate_frag
170
+ end
154
171
  end
@@ -15,12 +15,13 @@ class TextAlignment::TextAlignment
15
15
  def initialize(reference_text, to_prevent_overlap = false)
16
16
  raise ArgumentError, "nil text" if reference_text.nil?
17
17
 
18
- @original_rtext = reference_text
18
+ @original_reference_text = reference_text
19
19
  @rtext_mapping = TextAlignment::CharMapping.new(reference_text)
20
+ @mapped_reference_text = @rtext_mapping.mapped_text
20
21
  @to_prevent_overlap = to_prevent_overlap
21
22
 
22
23
  @original_text = nil
23
- @block_alignment = nil
24
+ @blocks = nil
24
25
  @cultivation_map = TextAlignment::CultivationMap.new
25
26
  end
26
27
 
@@ -34,51 +35,24 @@ class TextAlignment::TextAlignment
34
35
  @text_mapping = TextAlignment::CharMapping.new(text)
35
36
  end
36
37
 
37
- text_mapped = @text_mapping.mapped_text
38
+ @mapped_text = @text_mapping.mapped_text
38
39
  denotations_mapped = @text_mapping.enmap_denotations(denotations)
39
40
 
40
- rtext_mapped = @rtext_mapping.mapped_text
41
-
42
41
  ## To generate the block_alignment of the input text against the reference text
43
-
44
- # Initialization
45
- @block_alignment = {text: @original_text, reference_text: @original_rtext, denotations: denotations}
46
-
47
- # Generation
48
- @block_alignment[:blocks] = if r = whole_block_alignment(text_mapped, rtext_mapped, @cultivation_map)
42
+ @blocks = if r = whole_block_alignment(@mapped_text, @mapped_reference_text, @cultivation_map)
49
43
  r
50
44
  else
51
- find_block_alignment(text_mapped, rtext_mapped, denotations_mapped, @cultivation_map)
45
+ find_block_alignment(@mapped_text, @mapped_reference_text, denotations_mapped, @cultivation_map)
52
46
  end
53
- end
54
-
55
- def update_cultivation_map
56
- return if @block_alignment.nil? || @block_alignment[:blocks].nil?
57
47
 
58
- ## To update the cultivation map
59
- newly_cultivated_regions = @block_alignment[:blocks].collect do |b|
60
- if b[:alignment] == :block || b[:alignment] == :term
61
- [b[:target][:begin], b[:target][:end]]
62
- else
63
- nil
64
- end
65
- end.compact.inject([]) do |condensed, region|
66
- if condensed.empty? || (condensed.last.last + 1 < region.first)
67
- condensed.push region
68
- else
69
- condensed.last[1] = region.last
70
- end
71
- condensed
72
- end
73
-
74
- @cultivation_map.cultivate(newly_cultivated_regions)
48
+ @block_alignment = {text: @original_text, reference_text: @original_reference_text, denotations: denotations, blocks: demap_blocks(@blocks)}
75
49
  end
76
50
 
77
51
  def transform_begin_position(_begin_position)
78
52
  begin_position = @text_mapping.enmap_position(_begin_position)
79
53
 
80
- i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
81
- block = @block_alignment[:blocks][i]
54
+ i = @blocks.index{|b| b[:source][:end] > begin_position}
55
+ block = @blocks[i]
82
56
 
83
57
  b = if block[:alignment] == :block || block[:alignment] == :term
84
58
  begin_position + block[:delta]
@@ -99,8 +73,8 @@ class TextAlignment::TextAlignment
99
73
  def transform_end_position(_end_position)
100
74
  end_position = @text_mapping.enmap_position(_end_position)
101
75
 
102
- i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
103
- block = @block_alignment[:blocks][i]
76
+ i = @blocks.index{|b| b[:source][:end] >= end_position}
77
+ block = @blocks[i]
104
78
 
105
79
  e = if block[:alignment] == :block || block[:alignment] == :term
106
80
  end_position + block[:delta]
@@ -134,7 +108,7 @@ class TextAlignment::TextAlignment
134
108
  source = {begin:d.begin, end:d.end}
135
109
  d.begin = transform_begin_position(d.begin);
136
110
  d.end = transform_end_position(d.end);
137
- raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_rtext.length
111
+ raise "invalid transform" unless !d.begin.nil? && !d.end.nil? && d.begin >= 0 && d.end > d.begin && d.end <= @original_reference_text.length
138
112
  rescue
139
113
  @lost_annotations << {source: source, target:{begin:d.begin, end:d.end}}
140
114
  d.begin = nil
@@ -150,7 +124,7 @@ class TextAlignment::TextAlignment
150
124
 
151
125
  r = hdenotations.collect do |d|
152
126
  t = transform_a_span(d[:span])
153
- raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_rtext.length
127
+ raise "invalid transform" unless !t[:begin].nil? && !t[:end].nil? && t[:begin] >= 0 && t[:end] > t[:begin] && t[:end] <= @original_reference_text.length
154
128
  new_d = d.dup.merge({span:t})
155
129
  rescue
156
130
  @lost_annotations << {source: d[:span], target:t}
@@ -218,7 +192,7 @@ class TextAlignment::TextAlignment
218
192
 
219
193
  "***** local mismatch [#{a[:source][:begin]} - #{a[:source][:end]}] [#{a[:target][:begin]} - #{a[:target][:end]}] (similarity: #{a[:similarity]})\n" +
220
194
  "[#{astr1}]\n" +
221
- "[#{astr2}]\n\n"
195
+ "[#{astr2.gsub("\n", " ")}]\n\n"
222
196
  end
223
197
  end
224
198
  show
@@ -257,142 +231,125 @@ class TextAlignment::TextAlignment
257
231
  # puts "-=-=-=-=-"
258
232
  # puts
259
233
 
260
- ## to fill the gaps
261
- last_block = nil
262
- blocks2 = blocks.inject([]) do |sum, block|
263
- b1 = last_block ? last_block[:source][:end] : 0
264
- e1 = block[:source][:begin]
234
+ ## To fill the gaps
235
+ ## lblock: last block, cblock: current block
236
+ lblock = nil
237
+ blocks2 = (blocks + [nil]).inject([]) do |sum, cblock|
238
+ b1 = lblock.nil? ? 0 : lblock[:source][:end]
239
+ e1 = cblock.nil? ? str1.length : cblock[:source][:begin]
265
240
 
266
- sum += if b1 == e1
267
- [block]
268
- else
269
- b2 = last_block ? last_block[:target][:end] : 0
270
- e2 = block[:target][:begin]
271
-
272
- if b2 == e2
273
- [
274
- {source:{begin:b1, end:e1}, alignment: :empty},
275
- block
276
- ]
241
+ if b1 < e1
242
+ b2 = lblock.nil? ? 0 : lblock[:target][:end]
243
+ e2 = cblock.nil? ? str2.length : cblock[:target][:begin]
244
+ _str1 = str1[b1 ... e1]
245
+ _str2 = str2[b2 ... e2]
246
+
247
+ sum += if _str1.strip.empty? || _str2.strip.empty?
248
+ [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
277
249
  else
278
250
  len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
279
-
280
- if b1 == 0 && b2 == 0
281
- b2 = e2 - len_buffer if e2 > len_buffer
282
- end
283
-
284
- _str1 = str1[b1 ... e1]
285
- _str2 = str2[b2 ... e2]
286
-
287
- if _str1.strip.empty? || _str2.strip.empty?
288
- [
289
- {source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty},
290
- block
291
- ]
292
- elsif ((e2 - b2) - (e1 - b1)) > len_buffer
293
- la_block1 = local_alignment_blocks(str1, b1, e1, str2, b2, b2 + len_buffer, denotations)
294
- la_block2 = local_alignment_blocks(str1, b1, e1, str2, e2 - len_buffer, e2, denotations)
295
- [la_block2, la_block2].max{|a, b| a.first[:similarity] <=> b.first[:similarity]} << block
296
- else
297
- local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations) << block
251
+ region_state, state_region = cultivation_map.region_state([b2, e2])
252
+ case region_state
253
+ when :closed
254
+ [{source:{begin:b1, end:e1}, alignment: :empty}]
255
+ when :front_open
256
+ if sum.empty? # when there is no preceding matched block
257
+ [{source:{begin:b1, end:e1}, alignment: :empty}]
258
+ else
259
+ oe2 = state_region[1]
260
+ me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
261
+ local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
262
+ end
263
+ when :rear_open
264
+ if cblock.nil? # when there is no following matched block
265
+ [{source:{begin:b1, end:e1}, alignment: :empty}]
266
+ else
267
+ ob2 = state_region[0]
268
+ mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
269
+ local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
270
+ end
271
+ when :middle_closed
272
+ attempt1 = if sum.empty?
273
+ [{source:{begin:b1, end:e1}, alignment: :empty}]
274
+ else
275
+ oe2 = state_region[0]
276
+ me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
277
+ local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
278
+ end
279
+ if (attempt1.empty? || attempt1.first[:alignment] == :empty) && !cblock.nil?
280
+ ob2 = state_region[1]
281
+ mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
282
+ local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
283
+ else
284
+ attempt1
285
+ end
286
+ else # :open
287
+ if (e2 - b2) > len_buffer
288
+ attempt1 = if sum.empty?
289
+ [{source:{begin:b1, end:e1}, alignment: :empty}]
290
+ else
291
+ local_alignment(str1, b1, e1, str2, b2, b2 + len_buffer, denotations, cultivation_map)
292
+ end
293
+ if (attempt1.empty? || attempt1.first[:alignment] == :empty) && !cblock.nil?
294
+ local_alignment(str1, b1, e1, str2, e2 - len_buffer, e2, denotations, cultivation_map)
295
+ else
296
+ attempt1
297
+ end
298
+ else
299
+ local_alignment(str1, b1, e1, str2, b2, e2, denotations, cultivation_map)
300
+ end
298
301
  end
299
302
  end
300
303
  end
301
304
 
302
- last_block = block
303
- sum
305
+ lblock = cblock
306
+ cblock.nil? ? sum : sum << cblock
304
307
  end
305
308
 
306
- # the last step
307
- blocks2 += if last_block.nil?
308
- local_alignment_blocks(str1, 0, str1.length, str2, 0, str2.length, denotations)
309
- else
310
- b1 = last_block[:source][:end]
311
- if b1 < str1.length
312
- e1 = str1.length
313
- b2 = last_block[:target][:end]
314
-
315
- _str1 = str1[b1 ... e1]
316
- if _str1.strip.empty?
317
- [{source:{begin:b1, end:e1}, alignment: :empty}]
318
- else
319
- if b2 < str2.length
320
- len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
321
- e2 = (str2.length - b2) > len_buffer ? b2 + len_buffer : str2.length
322
-
323
- local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations)
324
- else
325
- [{source:{begin:b1, end:e1}, alignment: :empty}]
326
- end
327
- end
328
- else
329
- []
330
- end
331
- end
332
309
  end
333
310
 
334
311
  def whole_block_alignment(str1, str2, cultivation_map)
335
- ## Block exact match
336
- search_position = 0
337
-
338
- block_begin = begin
339
- _block_begin = str2.index(str1, search_position)
340
- break if _block_begin.nil?
341
- search_position = cultivation_map.search_again_position(_block_begin)
342
- _block_begin
343
- end until search_position.nil?
344
-
345
- unless block_begin.nil?
346
- return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
347
- end
312
+ block_begin = cultivation_map.index(str1, str2)
313
+ return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}] unless block_begin.nil?
348
314
 
349
- search_position = 0
315
+ block_begin = cultivation_map.index(str1.downcase, str2.downcase)
316
+ return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}] unless block_begin.nil?
350
317
 
351
- dstr1 = str1.downcase
352
- dstr2 = str2.downcase
353
- block_begin = begin
354
- _block_begin = dstr2.index(dstr1, search_position)
355
- break if _block_begin.nil?
356
- search_position = cultivation_map.search_again_position(_block_begin)
357
- _block_begin
358
- end until search_position.nil?
318
+ nil
319
+ end
359
320
 
360
- unless block_begin.nil?
361
- return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}]
321
+ def local_alignment(str1, b1, e1, str2, b2, e2, denotations = nil, cultivation_map)
322
+ tblocks = term_based_alignment(str1, b1, e1, str2, b2, e2, denotations, cultivation_map)
323
+ if tblocks.empty? || tblocks.first[:alignment] == :empty
324
+ lcs_alignment(str1, b1, e1, str2, b2, e2, cultivation_map)
325
+ else
326
+ tblocks
362
327
  end
363
-
364
- nil
365
328
  end
366
329
 
367
- def local_alignment_blocks(str1, b1, e1, str2, b2, e2, denotations = nil)
368
- block2 = str2[b2 ... e2]
330
+ def term_based_alignment(str1, b1, e1, str2, b2, e2, denotations = nil, cultivation_map)
331
+ str2_block = str2[0 ... e2]
369
332
 
370
333
  ## term-based alignment
371
334
  tblocks = if denotations
372
- ds_in_scope = denotations.select{|d| d[:span][:begin] >= b1 && d[:span][:end] <= e1}.
335
+ denotations_in_scope = denotations.select{|d| d[:span][:begin] >= b1 && d[:span][:end] <= e1}.
373
336
  sort{|d1, d2| d1[:span][:begin] <=> d2[:span][:begin] || d2[:span][:end] <=> d1[:span][:end] }.
374
337
  map{|d| d.merge(lex:str1[d[:span][:begin] ... d[:span][:end]])}
375
338
 
376
- position = 0
377
- _tblocks = ds_in_scope.map do |term|
378
- lex = term[:lex]
379
- r = block2.index(lex, position)
380
- if r.nil?
381
- position = nil
382
- break
383
- end
384
- position = r + lex.length
385
- {source:term[:span], target:{begin:r + b2, end:r + b2 + lex.length}, alignment: :term, similarity: 0.9, delta: r + b2 - term[:span][:begin]}
339
+ search_position = b2
340
+ _tblocks = denotations_in_scope.map do |denotation|
341
+ lex = denotation[:lex]
342
+ term_begin = cultivation_map.index(lex, str2_block, search_position)
343
+ break [] if term_begin.nil? # break the loop if a missing term is found
344
+ search_position = term_begin + lex.length
345
+ {source:denotation[:span], target:{begin:term_begin, end:term_begin + lex.length}, alignment: :term, similarity: 0.9, delta: term_begin - denotation[:span][:begin]}
386
346
  end
387
347
 
388
- # missing term found
389
- _tblocks = [] if position.nil?
390
-
391
348
  # redundant matching found
392
- unless position.nil?
393
- ds_in_scope.each do |term|
394
- lex = term[:lex]
395
- look_forward = block2.index(lex, position)
349
+ unless _tblocks.empty?
350
+ search_position = _tblocks.last[:target][:end]
351
+ denotations_in_scope.each do |term|
352
+ look_forward = cultivation_map.index(term[:lex], str2_block, search_position)
396
353
  unless look_forward.nil?
397
354
  _tblocks = []
398
355
  break
@@ -405,73 +362,72 @@ class TextAlignment::TextAlignment
405
362
  []
406
363
  end
407
364
 
408
- if tblocks.empty?
409
- if b1 == 0 && e1 == str1.length
410
- if (e1 > 2000) || (e2 > 2000)
411
- [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
412
- else
413
- block1 = str1[b1 ... e1]
414
- block2 = str2[b2 ... e2]
415
-
416
- ## character-based alignment
417
- alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
418
- if alignment.sdiff.nil?
419
- [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
420
- else
421
- [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
422
- end
423
- end
424
- else
425
- block1 = str1[b1 ... e1]
426
- block2 = str2[b2 ... e2]
365
+ ltblock = nil
366
+ tblocks2 = (tblocks + [nil]).inject([]) do |sum, ctblock|
367
+ tb1 = ltblock.nil? ? b1 : ltblock[:source][:end]
368
+ te1 = ctblock.nil? ? e1 : ctblock[:source][:begin]
427
369
 
428
- ## character-based alignment
429
- alignment = TextAlignment::MixedAlignment.new(block1.downcase, block2.downcase)
430
- if alignment.sdiff.nil?
431
- [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
432
- else
433
- [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: alignment, similarity: alignment.similarity}]
434
- end
370
+ if te1 > tb1
371
+ tb2 = ltblock.nil? ? b2 : ltblock[:target][:end]
372
+ te2 = ctblock.nil? ? e2 : ctblock[:target][:begin]
373
+ sum << {source:{begin:tb1, end:te1}, target:{begin:tb2, end:te2}, alignment: :empty}
435
374
  end
436
- else
437
- last_tblock = nil
438
- lblocks = tblocks.inject([]) do |sum, tblock|
439
- tb1 = last_tblock ? last_tblock[:source][:end] : b1
440
- te1 = tblock[:source][:begin]
441
375
 
442
- sum += if te1 == tb1
443
- [tblock]
444
- else
445
- tb2 = last_tblock ? last_tblock[:target][:end] : b2
446
- te2 = tblock[:target][:begin]
447
-
448
- if b2 == e2
449
- [
450
- {source:{begin:tb1, end:te1}, alignment: :empty},
451
- tblock
452
- ]
453
- else
454
- [
455
- {source:{begin:tb1, end:te1}, target:{begin:tb2, end:te2}, alignment: :empty},
456
- tblock
457
- ]
458
- end
459
- end
376
+ ltblock = ctblock
377
+ ctblock.nil? ? sum : sum << ctblock
378
+ end
379
+
380
+ tblocks2
381
+ end
460
382
 
461
- last_tblock = tblock
462
- sum
383
+ def lcs_alignment(str1, b1, e1, str2, b2, e2, cultivation_map)
384
+ source = {begin:b1, end:e1}
385
+ target = {begin:b2, end:e2}
386
+
387
+ if (e1 - b1) > 2000
388
+ [{source:source, target:target, alignment: :empty}]
389
+ else
390
+ alignment = TextAlignment::MixedAlignment.new(str1[b1 ... e1].downcase, str2[b2 ... e2].downcase)
391
+ if alignment.similarity < 0.5
392
+ [{source:source, target:target, alignment: :empty}]
393
+ else
394
+ [{source:source, target:target, alignment: alignment, similarity: alignment.similarity}]
463
395
  end
396
+ end
397
+ end
464
398
 
465
- if last_tblock[:source][:end] < e1
466
- if last_tblock[:target][:end] < e2
467
- lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, target:{begin:last_tblock[:target][:end], end:e2}, alignment: :empty}
468
- else
469
- lblocks << {source:{begin:last_tblock[:source][:end], end:e1}, alignment: :empty}
470
- end
399
+ def update_cultivation_map
400
+ return if @blocks.nil?
401
+
402
+ ## To update the cultivation map
403
+ newly_cultivated_regions = @blocks.collect do |b|
404
+ if b[:alignment] == :block || b[:alignment] == :term
405
+ [b[:target][:begin], b[:target][:end]]
406
+ else
407
+ nil
408
+ end
409
+ end.compact.inject([]) do |condensed, region|
410
+ if condensed.empty? || (condensed.last.last + 1 < region.first)
411
+ condensed.push region
412
+ else
413
+ condensed.last[1] = region.last
471
414
  end
415
+ condensed
416
+ end
472
417
 
473
- lblocks
418
+ @cultivation_map.cultivate(newly_cultivated_regions)
419
+ end
420
+
421
+ def demap_blocks(_blocks)
422
+ return nil if _blocks.nil?
423
+
424
+ blocks = _blocks.map{|b| b.dup}
425
+ blocks.each do |b|
426
+ b[:source] = {begin:@text_mapping.demap_position(b[:source][:begin]), end:@text_mapping.demap_position(b[:source][:end])} if b[:source]
427
+ b[:target] = {begin:@rtext_mapping.demap_position(b[:target][:begin]), end:@rtext_mapping.demap_position(b[:target][:end])} if b[:target]
474
428
  end
429
+
430
+ blocks
475
431
  end
476
432
 
477
433
  end
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.11.0'
2
+ VERSION = '0.11.6'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.11.0
4
+ version: 0.11.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-03-04 00:00:00.000000000 Z
11
+ date: 2021-03-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary