text_alignment 0.11.1 → 0.11.7

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ef59c0cd578ed453a67edeb3b29059f4b76c4c541f777fa35a06e76b299e2564
4
- data.tar.gz: 5594c0f6eb1d52cc331c210fdf936e7cb09e30277f21933027e55b0c8cfa0e24
3
+ metadata.gz: 52ad7c8b5822308ae15153e35626e13b5ec60d77428c80cbe3f1dd69f36edac1
4
+ data.tar.gz: efbcbbea9eb87606b1614661187356e15f65965b85c30ebf7d349d2f7e1028b0
5
5
  SHA512:
6
- metadata.gz: ef5bae492d7e7b840c00943ac24e571392c4c992a085c6a63c8cb6db126ed9137ba94f8cd7af42a91e17aa327cb9a5ce24b909944c157f21beb8a88c8ce7528b
7
- data.tar.gz: b6d84dcdc35399f91a0f6a5a24d84b11169de9d861a249acca52f0fec86e6f1fbf1bb4dbb47a5d43654fa43735b181aa64034447fb71ca675090818dcf67133a
6
+ metadata.gz: 59a2a8dd68066271b61b950c18f513b03d2b674e86c0d6d065e265367e079c99bfa973cb41c75bdc2f2d19443a9b6f8e46dcca27bb71eecd27ed00d00c07533e
7
+ data.tar.gz: 4779c1cbe899021125f56204d87462a8c1222c961f547e1ddc9ba4c46e29211da564bdc6492346baaa16ad4eaaedce4601f6afdcc0a16031a29be79f624c4935
@@ -2,6 +2,7 @@
2
2
  require 'text_alignment'
3
3
  require 'json'
4
4
  require 'pp'
5
+ require 'optparse'
5
6
 
6
7
  def read_annotations(filename)
7
8
  case File.extname(filename)
@@ -37,7 +38,7 @@ def align_denotations(denotations, source_text, alignment, debug = false)
37
38
  end
38
39
 
39
40
  lost_annotations = alignment.lost_annotations
40
- unless lost_annotations.empty?
41
+ unless lost_annotations.nil? || lost_annotations.empty?
41
42
  warn "\n[lost annotations] #{lost_annotations.length}"
42
43
  lost_annotations.each do |a|
43
44
  warn "#{a}"
@@ -108,24 +109,46 @@ def align_mannotations(source_annotations, reference_text, alignment, debug = fa
108
109
  end
109
110
 
110
111
 
112
+ ## Options
113
+ overlap_p = false
114
+ debug_p = false
115
+
116
+ ## command line option processing
117
+ require 'optparse'
118
+ optparse = OptionParser.new do |opts|
119
+ opts.banner = "Usage: align_annotations [options] target_annotations(.json|.txt) reference_text(.json|.txt)"
120
+
121
+ opts.on('-o', '--overlap', 'tells it to assume there may be overlapping texts.') do
122
+ overlap_p = true
123
+ end
124
+
125
+ opts.on('-d', '--debug', 'tells it to show debugging information.') do
126
+ debug_p = true
127
+ end
128
+
129
+ opts.on('-h', '--help', 'displays this screen.') do
130
+ puts opts
131
+ exit
132
+ end
133
+ end
134
+
135
+ optparse.parse!
136
+
111
137
  unless ARGV.length == 2
112
- warn "align_annotations target_annotations(.json|.txt) reference_annotations(.json|.txt)"
113
- exit
138
+ puts optparse.help
139
+ exit 1
114
140
  end
115
141
 
116
142
  source_annotations = read_annotations(ARGV[0])
117
143
  reference_text = read_text(ARGV[1])
118
144
 
119
- alignment = TextAlignment::TextAlignment.new(reference_text, true)
145
+ alignment = TextAlignment::TextAlignment.new(reference_text, !overlap_p)
120
146
 
121
147
  target_annotations = if source_annotations.class == Array
122
- # align_mannotations(source_annotations, reference_text, alignment, true)
123
- align_mannotations(source_annotations, reference_text, alignment, false)
148
+ align_mannotations(source_annotations, reference_text, alignment, debug_p)
124
149
  else
125
- # denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment)
126
- denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment, false)
150
+ denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment, debug_p)
127
151
  source_annotations.merge({text:reference_text, denotations:denotations})
128
152
  end
129
153
 
130
- # pp alignment.block_alignment
131
- puts target_annotations.to_json
154
+ # puts target_annotations.to_json
@@ -108,14 +108,14 @@ class TextAlignment::AnchorFinder
108
108
  next
109
109
  end
110
110
 
111
- left_window_s1, left_window_s2 = get_left_windows(beg_s1, beg_s2)
111
+ left_window_s1, left_window_s2 = get_left_windows(beg_s1, beg_s2, size_window)
112
112
  if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
113
113
  break unless valid_beg_s2.nil?
114
114
  valid_beg_s2 = beg_s2
115
115
  next
116
116
  end
117
117
 
118
- right_window_s1, right_window_s2 = get_right_windows(beg_s1, beg_s2)
118
+ right_window_s1, right_window_s2 = get_right_windows(beg_s1, beg_s2, size_window)
119
119
  if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
120
120
  break unless valid_beg_s2.nil?
121
121
  valid_beg_s2 = beg_s2
@@ -125,7 +125,11 @@ class TextAlignment::AnchorFinder
125
125
 
126
126
  # r == nil means that the inner loop was broken (multiple candidates had passed the tests)
127
127
  # r != nil means that the inner loop was completed (with or w/o a valid beg_s2 found)
128
- break unless r.nil?
128
+ if r.nil?
129
+ valid_beg_s2 = nil
130
+ else
131
+ break
132
+ end
129
133
  end
130
134
 
131
135
  valid_beg_s2
@@ -61,6 +61,7 @@ TextAlignment::CHAR_MAPPING = [
61
61
  ["•", "*"], #U+2022 (bullet)
62
62
  [" ", " "], #U+2009 (thin space)
63
63
  [" ", " "], #U+200A (hair space)
64
+ [" ", " "], #U+202F (narrow no-break space)
64
65
  [" ", " "], #U+00A0 (Non-Breaking space)
65
66
  [" ", " "], #U+3000 (ideographic space)
66
67
  ["‐", "-"], #U+2010 (Hyphen)
@@ -94,10 +95,10 @@ class TextAlignment::CharMapping
94
95
  @index_demap[position]
95
96
  end
96
97
 
97
- def enmap_denotations(_denotations)
98
- return nil if _denotations.nil?
98
+ def enmap_denotations(denotations)
99
+ return nil if denotations.nil?
99
100
 
100
- denotations = _denotations.map do |d|
101
+ denotations.map do |d|
101
102
  d.dup.merge(span:{begin:enmap_position(d[:span][:begin]), end:enmap_position(d[:span][:end])})
102
103
  end
103
104
  end
@@ -107,7 +108,7 @@ class TextAlignment::CharMapping
107
108
  def enmap_text(_text, char_mapping)
108
109
  text = _text.dup
109
110
 
110
- # To execute the single letter mapping
111
+ # To execute the single letter mapping replacement
111
112
  char_mapping.each do |one, long|
112
113
  text.gsub!(one, long) if long.length == 1
113
114
  end
@@ -119,7 +120,7 @@ class TextAlignment::CharMapping
119
120
 
120
121
  init_next = 0
121
122
  while loc = text.index(long, init_next)
122
- loc_len << [loc, long.length]
123
+ loc_len << [loc, long.length, 1]
123
124
  init_next = loc + long.length
124
125
  end
125
126
 
@@ -129,9 +130,9 @@ class TextAlignment::CharMapping
129
130
 
130
131
  # To get the (location, length) index for consecutive whitespace sequences
131
132
  init_next = 0
132
- while loc = text.index(/\s{2,}/, init_next)
133
+ while loc = text.index(/\s{1,}/, init_next)
133
134
  len = $~[0].length
134
- loc_len << [loc, len]
135
+ loc_len << [loc, len, 0]
135
136
  init_next = loc + len
136
137
  end
137
138
 
@@ -142,17 +143,21 @@ class TextAlignment::CharMapping
142
143
  init_next = 0
143
144
  j = 0
144
145
 
145
- loc_len.each do |loc, len|
146
+ loc_len.each do |loc, old_len, new_len|
146
147
  offset_mapping += (init_next .. loc).map do |i|
148
+ m = [i, j]
147
149
  j += 1
148
- [i, j - 1]
150
+ m
149
151
  end
150
- init_next = loc + len
152
+
153
+ init_next = loc + old_len
154
+ j += (new_len - 1)
151
155
  end
152
156
 
153
157
  offset_mapping += (init_next .. text.length).map do |i|
158
+ m = [i, j]
154
159
  j += 1
155
- [i, j - 1]
160
+ m
156
161
  end
157
162
 
158
163
  # To execute the long letter mapping
@@ -161,7 +166,7 @@ class TextAlignment::CharMapping
161
166
  end
162
167
 
163
168
  # To replace multi whitespace sequences to a space
164
- text.gsub!(/\s{2,}/, ' ')
169
+ text.gsub!(/\s{1,}/, '')
165
170
 
166
171
  [text, offset_mapping]
167
172
  end
@@ -175,7 +180,7 @@ if __FILE__ == $0
175
180
  exit
176
181
  end
177
182
  annotations = JSON.parse File.read(ARGV[0]).strip, symbolize_names: true
178
- denotations = annotations[:denotations]
183
+ denotations = annotations[:denotations] || []
179
184
  if denotations.nil? && annotations[:tracks]
180
185
  denotations = annotations[:tracks].first[:denotations]
181
186
  end
@@ -41,7 +41,7 @@ class TextAlignment::CultivationMap
41
41
  end
42
42
 
43
43
  def next_cultivated_position(position)
44
- region = @map.bsearch{|r| position < r[0]}
44
+ region = @map.bsearch{|r| position <= r[0]}
45
45
  region.nil? ? nil : region[0]
46
46
  end
47
47
 
@@ -56,7 +56,7 @@ class TextAlignment::CultivationMap
56
56
  else
57
57
  if front_open?(region, closed_parts)
58
58
  if rear_open?(region, closed_parts)
59
- [:middle_closed, [closed_parts.first[1], closed_parts.last[0]]]
59
+ [:middle_closed, [closed_parts.first[0], closed_parts.last[1]]]
60
60
  else
61
61
  [:front_open, [region[0], closed_parts.first[0]]]
62
62
  end
@@ -70,7 +70,7 @@ class TextAlignment::CultivationMap
70
70
  end
71
71
  end
72
72
 
73
- def index(target, string, position)
73
+ def index(target, string, position = 0)
74
74
  length = target.length
75
75
  loop do
76
76
  _begin = string.index(target, position)
@@ -147,13 +147,25 @@ class TextAlignment::MixedAlignment
147
147
  # recoverbility
148
148
  count_nws = sdiff.count{|d| d.old_element =~ /\S/}
149
149
  count_nws_match = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/}
150
-
151
150
  coverage = count_nws_match.to_f / count_nws
152
151
 
153
152
  # fragmentation rate
154
- count_ofrag = sdiff.count{|d| d.old_element =~ /\s/}
155
- count_frag = sdiff.collect{|d| (d.action == '=') && (d.old_element =~/\s/) ? ' ' : d.action}.join.scan(/=+/).count
156
- rate_frag = count_ofrag.to_f / count_frag
153
+ frag_str = sdiff.collect do |d|
154
+ case d.action
155
+ when '='
156
+ '='
157
+ when '-'
158
+ ''
159
+ when '+'
160
+ # (d.new_element =~ /\S/) ? '+' : ''
161
+ '+'
162
+ else
163
+ ''
164
+ end
165
+ end.join.sub(/^[^=]++/, '').sub(/[^=]+$/, '')
166
+
167
+ count_frag = frag_str.scan(/=+/).count
168
+ rate_frag = 1.0 / count_frag
157
169
 
158
170
  similarity = coverage * rate_frag
159
171
  end
@@ -21,7 +21,7 @@ class TextAlignment::TextAlignment
21
21
  @to_prevent_overlap = to_prevent_overlap
22
22
 
23
23
  @original_text = nil
24
- @block_alignment = nil
24
+ @blocks = nil
25
25
  @cultivation_map = TextAlignment::CultivationMap.new
26
26
  end
27
27
 
@@ -39,45 +39,20 @@ class TextAlignment::TextAlignment
39
39
  denotations_mapped = @text_mapping.enmap_denotations(denotations)
40
40
 
41
41
  ## To generate the block_alignment of the input text against the reference text
42
- # Initialization
43
- # @block_alignment = {text: @original_text, reference_text: @original_reference_text, denotations: denotations}
44
- @block_alignment = {text: @mapped_text, reference_text: @mapped_reference_text, denotations: denotations}
45
-
46
- # Generation
47
- @block_alignment[:blocks] = if r = whole_block_alignment(@mapped_text, @mapped_reference_text, @cultivation_map)
42
+ @blocks = if r = whole_block_alignment(@mapped_text, @mapped_reference_text, @cultivation_map)
48
43
  r
49
44
  else
50
45
  find_block_alignment(@mapped_text, @mapped_reference_text, denotations_mapped, @cultivation_map)
51
46
  end
52
- end
53
-
54
- def update_cultivation_map
55
- return if @block_alignment.nil? || @block_alignment[:blocks].nil?
56
-
57
- ## To update the cultivation map
58
- newly_cultivated_regions = @block_alignment[:blocks].collect do |b|
59
- if b[:alignment] == :block || b[:alignment] == :term
60
- [b[:target][:begin], b[:target][:end]]
61
- else
62
- nil
63
- end
64
- end.compact.inject([]) do |condensed, region|
65
- if condensed.empty? || (condensed.last.last + 1 < region.first)
66
- condensed.push region
67
- else
68
- condensed.last[1] = region.last
69
- end
70
- condensed
71
- end
72
47
 
73
- @cultivation_map.cultivate(newly_cultivated_regions)
48
+ @block_alignment = {text: @original_text, reference_text: @original_reference_text, denotations: denotations, blocks: demap_blocks(@blocks)}
74
49
  end
75
50
 
76
51
  def transform_begin_position(_begin_position)
77
52
  begin_position = @text_mapping.enmap_position(_begin_position)
78
53
 
79
- i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
80
- block = @block_alignment[:blocks][i]
54
+ i = @blocks.index{|b| b[:source][:end] > begin_position}
55
+ block = @blocks[i]
81
56
 
82
57
  b = if block[:alignment] == :block || block[:alignment] == :term
83
58
  begin_position + block[:delta]
@@ -98,8 +73,8 @@ class TextAlignment::TextAlignment
98
73
  def transform_end_position(_end_position)
99
74
  end_position = @text_mapping.enmap_position(_end_position)
100
75
 
101
- i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
102
- block = @block_alignment[:blocks][i]
76
+ i = @blocks.index{|b| b[:source][:end] >= end_position}
77
+ block = @blocks[i]
103
78
 
104
79
  e = if block[:alignment] == :block || block[:alignment] == :term
105
80
  end_position + block[:delta]
@@ -160,8 +135,8 @@ class TextAlignment::TextAlignment
160
135
  end
161
136
 
162
137
  def alignment_show
163
- stext = @mapped_text
164
- ttext = @mapped_reference_text
138
+ stext = @block_alignment[:text]
139
+ ttext = @block_alignment[:reference_text]
165
140
 
166
141
  show = ''
167
142
  @block_alignment[:blocks].each do |a|
@@ -276,20 +251,32 @@ class TextAlignment::TextAlignment
276
251
  region_state, state_region = cultivation_map.region_state([b2, e2])
277
252
  case region_state
278
253
  when :closed
279
- []
254
+ [{source:{begin:b1, end:e1}, alignment: :empty}]
280
255
  when :front_open
281
- oe2 = state_region[1]
282
- me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
283
- local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
256
+ if sum.empty? # when there is no preceding matched block
257
+ [{source:{begin:b1, end:e1}, alignment: :empty}]
258
+ else
259
+ oe2 = state_region[1]
260
+ me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
261
+ local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
262
+ end
284
263
  when :rear_open
285
- ob2 = state_region[0]
286
- mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
287
- local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
264
+ if cblock.nil? # when there is no following matched block
265
+ [{source:{begin:b1, end:e1}, alignment: :empty}]
266
+ else
267
+ ob2 = state_region[0]
268
+ mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
269
+ local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
270
+ end
288
271
  when :middle_closed
289
- oe2 = state_region[0]
290
- me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
291
- attempt1 = local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
292
- if attempt1.empty?
272
+ attempt1 = if sum.empty?
273
+ [{source:{begin:b1, end:e1}, alignment: :empty}]
274
+ else
275
+ oe2 = state_region[0]
276
+ me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
277
+ local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
278
+ end
279
+ if (attempt1.empty? || attempt1.first[:alignment] == :empty) && !cblock.nil?
293
280
  ob2 = state_region[1]
294
281
  mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
295
282
  local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
@@ -298,8 +285,12 @@ class TextAlignment::TextAlignment
298
285
  end
299
286
  else # :open
300
287
  if (e2 - b2) > len_buffer
301
- attempt1 = local_alignment(str1, b1, e1, str2, b2, b2 + len_buffer, denotations, cultivation_map)
302
- if attempt1.empty?
288
+ attempt1 = if sum.empty?
289
+ [{source:{begin:b1, end:e1}, alignment: :empty}]
290
+ else
291
+ local_alignment(str1, b1, e1, str2, b2, b2 + len_buffer, denotations, cultivation_map)
292
+ end
293
+ if (attempt1.empty? || attempt1.first[:alignment] == :empty) && !cblock.nil?
303
294
  local_alignment(str1, b1, e1, str2, e2 - len_buffer, e2, denotations, cultivation_map)
304
295
  else
305
296
  attempt1
@@ -318,10 +309,10 @@ class TextAlignment::TextAlignment
318
309
  end
319
310
 
320
311
  def whole_block_alignment(str1, str2, cultivation_map)
321
- block_begin = cultivation_map.index(str1, str2, 0)
312
+ block_begin = cultivation_map.index(str1, str2)
322
313
  return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}] unless block_begin.nil?
323
314
 
324
- block_begin = cultivation_map.index(str1.downcase, str2.downcase, 0)
315
+ block_begin = cultivation_map.index(str1.downcase, str2.downcase)
325
316
  return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}] unless block_begin.nil?
326
317
 
327
318
  nil
@@ -329,7 +320,7 @@ class TextAlignment::TextAlignment
329
320
 
330
321
  def local_alignment(str1, b1, e1, str2, b2, e2, denotations = nil, cultivation_map)
331
322
  tblocks = term_based_alignment(str1, b1, e1, str2, b2, e2, denotations, cultivation_map)
332
- if tblocks.empty?
323
+ if tblocks.empty? || tblocks.first[:alignment] == :empty
333
324
  lcs_alignment(str1, b1, e1, str2, b2, e2, cultivation_map)
334
325
  else
335
326
  tblocks
@@ -405,4 +396,38 @@ class TextAlignment::TextAlignment
405
396
  end
406
397
  end
407
398
 
399
+ def update_cultivation_map
400
+ return if @blocks.nil?
401
+
402
+ ## To update the cultivation map
403
+ newly_cultivated_regions = @blocks.collect do |b|
404
+ if b[:alignment] == :block || b[:alignment] == :term
405
+ [b[:target][:begin], b[:target][:end]]
406
+ else
407
+ nil
408
+ end
409
+ end.compact.inject([]) do |condensed, region|
410
+ if condensed.empty? || (condensed.last.last + 1 < region.first)
411
+ condensed.push region
412
+ else
413
+ condensed.last[1] = region.last
414
+ end
415
+ condensed
416
+ end
417
+
418
+ @cultivation_map.cultivate(newly_cultivated_regions)
419
+ end
420
+
421
+ def demap_blocks(_blocks)
422
+ return nil if _blocks.nil?
423
+
424
+ blocks = _blocks.map{|b| b.dup}
425
+ blocks.each do |b|
426
+ b[:source] = {begin:@text_mapping.demap_position(b[:source][:begin]), end:@text_mapping.demap_position(b[:source][:end])} if b[:source]
427
+ b[:target] = {begin:@rtext_mapping.demap_position(b[:target][:begin]), end:@rtext_mapping.demap_position(b[:target][:end])} if b[:target]
428
+ end
429
+
430
+ blocks
431
+ end
432
+
408
433
  end
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.11.1'
2
+ VERSION = '0.11.7'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.11.1
4
+ version: 0.11.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-03-08 00:00:00.000000000 Z
11
+ date: 2021-03-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary