text_alignment 0.11.1 → 0.11.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ef59c0cd578ed453a67edeb3b29059f4b76c4c541f777fa35a06e76b299e2564
4
- data.tar.gz: 5594c0f6eb1d52cc331c210fdf936e7cb09e30277f21933027e55b0c8cfa0e24
3
+ metadata.gz: 52ad7c8b5822308ae15153e35626e13b5ec60d77428c80cbe3f1dd69f36edac1
4
+ data.tar.gz: efbcbbea9eb87606b1614661187356e15f65965b85c30ebf7d349d2f7e1028b0
5
5
  SHA512:
6
- metadata.gz: ef5bae492d7e7b840c00943ac24e571392c4c992a085c6a63c8cb6db126ed9137ba94f8cd7af42a91e17aa327cb9a5ce24b909944c157f21beb8a88c8ce7528b
7
- data.tar.gz: b6d84dcdc35399f91a0f6a5a24d84b11169de9d861a249acca52f0fec86e6f1fbf1bb4dbb47a5d43654fa43735b181aa64034447fb71ca675090818dcf67133a
6
+ metadata.gz: 59a2a8dd68066271b61b950c18f513b03d2b674e86c0d6d065e265367e079c99bfa973cb41c75bdc2f2d19443a9b6f8e46dcca27bb71eecd27ed00d00c07533e
7
+ data.tar.gz: 4779c1cbe899021125f56204d87462a8c1222c961f547e1ddc9ba4c46e29211da564bdc6492346baaa16ad4eaaedce4601f6afdcc0a16031a29be79f624c4935
@@ -2,6 +2,7 @@
2
2
  require 'text_alignment'
3
3
  require 'json'
4
4
  require 'pp'
5
+ require 'optparse'
5
6
 
6
7
  def read_annotations(filename)
7
8
  case File.extname(filename)
@@ -37,7 +38,7 @@ def align_denotations(denotations, source_text, alignment, debug = false)
37
38
  end
38
39
 
39
40
  lost_annotations = alignment.lost_annotations
40
- unless lost_annotations.empty?
41
+ unless lost_annotations.nil? || lost_annotations.empty?
41
42
  warn "\n[lost annotations] #{lost_annotations.length}"
42
43
  lost_annotations.each do |a|
43
44
  warn "#{a}"
@@ -108,24 +109,46 @@ def align_mannotations(source_annotations, reference_text, alignment, debug = fa
108
109
  end
109
110
 
110
111
 
112
+ ## Options
113
+ overlap_p = false
114
+ debug_p = false
115
+
116
+ ## command line option processing
117
+ require 'optparse'
118
+ optparse = OptionParser.new do |opts|
119
+ opts.banner = "Usage: align_annotations [options] target_annotations(.json|.txt) reference_text(.json|.txt)"
120
+
121
+ opts.on('-o', '--overlap', 'tells it to assume there may be overlapping texts.') do
122
+ overlap_p = true
123
+ end
124
+
125
+ opts.on('-d', '--debug', 'tells it to show debugging information.') do
126
+ debug_p = true
127
+ end
128
+
129
+ opts.on('-h', '--help', 'displays this screen.') do
130
+ puts opts
131
+ exit
132
+ end
133
+ end
134
+
135
+ optparse.parse!
136
+
111
137
  unless ARGV.length == 2
112
- warn "align_annotations target_annotations(.json|.txt) reference_annotations(.json|.txt)"
113
- exit
138
+ puts optparse.help
139
+ exit 1
114
140
  end
115
141
 
116
142
  source_annotations = read_annotations(ARGV[0])
117
143
  reference_text = read_text(ARGV[1])
118
144
 
119
- alignment = TextAlignment::TextAlignment.new(reference_text, true)
145
+ alignment = TextAlignment::TextAlignment.new(reference_text, !overlap_p)
120
146
 
121
147
  target_annotations = if source_annotations.class == Array
122
- # align_mannotations(source_annotations, reference_text, alignment, true)
123
- align_mannotations(source_annotations, reference_text, alignment, false)
148
+ align_mannotations(source_annotations, reference_text, alignment, debug_p)
124
149
  else
125
- # denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment)
126
- denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment, false)
150
+ denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment, debug_p)
127
151
  source_annotations.merge({text:reference_text, denotations:denotations})
128
152
  end
129
153
 
130
- # pp alignment.block_alignment
131
- puts target_annotations.to_json
154
+ # puts target_annotations.to_json
@@ -108,14 +108,14 @@ class TextAlignment::AnchorFinder
108
108
  next
109
109
  end
110
110
 
111
- left_window_s1, left_window_s2 = get_left_windows(beg_s1, beg_s2)
111
+ left_window_s1, left_window_s2 = get_left_windows(beg_s1, beg_s2, size_window)
112
112
  if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
113
113
  break unless valid_beg_s2.nil?
114
114
  valid_beg_s2 = beg_s2
115
115
  next
116
116
  end
117
117
 
118
- right_window_s1, right_window_s2 = get_right_windows(beg_s1, beg_s2)
118
+ right_window_s1, right_window_s2 = get_right_windows(beg_s1, beg_s2, size_window)
119
119
  if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
120
120
  break unless valid_beg_s2.nil?
121
121
  valid_beg_s2 = beg_s2
@@ -125,7 +125,11 @@ class TextAlignment::AnchorFinder
125
125
 
126
126
  # r == nil means that the inner loop was broken (multiple candidates had passed the tests)
127
127
  # r != nil means that the inner loop was completed (with or w/o a valid beg_s2 found)
128
- break unless r.nil?
128
+ if r.nil?
129
+ valid_beg_s2 = nil
130
+ else
131
+ break
132
+ end
129
133
  end
130
134
 
131
135
  valid_beg_s2
@@ -61,6 +61,7 @@ TextAlignment::CHAR_MAPPING = [
61
61
  ["•", "*"], #U+2022 (bullet)
62
62
  [" ", " "], #U+2009 (thin space)
63
63
  [" ", " "], #U+200A (hair space)
64
+ [" ", " "], #U+202F (narrow no-break space)
64
65
  [" ", " "], #U+00A0 (Non-Breaking space)
65
66
  [" ", " "], #U+3000 (ideographic space)
66
67
  ["‐", "-"], #U+2010 (Hyphen)
@@ -94,10 +95,10 @@ class TextAlignment::CharMapping
94
95
  @index_demap[position]
95
96
  end
96
97
 
97
- def enmap_denotations(_denotations)
98
- return nil if _denotations.nil?
98
+ def enmap_denotations(denotations)
99
+ return nil if denotations.nil?
99
100
 
100
- denotations = _denotations.map do |d|
101
+ denotations.map do |d|
101
102
  d.dup.merge(span:{begin:enmap_position(d[:span][:begin]), end:enmap_position(d[:span][:end])})
102
103
  end
103
104
  end
@@ -107,7 +108,7 @@ class TextAlignment::CharMapping
107
108
  def enmap_text(_text, char_mapping)
108
109
  text = _text.dup
109
110
 
110
- # To execute the single letter mapping
111
+ # To execute the single letter mapping replacement
111
112
  char_mapping.each do |one, long|
112
113
  text.gsub!(one, long) if long.length == 1
113
114
  end
@@ -119,7 +120,7 @@ class TextAlignment::CharMapping
119
120
 
120
121
  init_next = 0
121
122
  while loc = text.index(long, init_next)
122
- loc_len << [loc, long.length]
123
+ loc_len << [loc, long.length, 1]
123
124
  init_next = loc + long.length
124
125
  end
125
126
 
@@ -129,9 +130,9 @@ class TextAlignment::CharMapping
129
130
 
130
131
  # To get the (location, length) index for consecutive whitespace sequences
131
132
  init_next = 0
132
- while loc = text.index(/\s{2,}/, init_next)
133
+ while loc = text.index(/\s{1,}/, init_next)
133
134
  len = $~[0].length
134
- loc_len << [loc, len]
135
+ loc_len << [loc, len, 0]
135
136
  init_next = loc + len
136
137
  end
137
138
 
@@ -142,17 +143,21 @@ class TextAlignment::CharMapping
142
143
  init_next = 0
143
144
  j = 0
144
145
 
145
- loc_len.each do |loc, len|
146
+ loc_len.each do |loc, old_len, new_len|
146
147
  offset_mapping += (init_next .. loc).map do |i|
148
+ m = [i, j]
147
149
  j += 1
148
- [i, j - 1]
150
+ m
149
151
  end
150
- init_next = loc + len
152
+
153
+ init_next = loc + old_len
154
+ j += (new_len - 1)
151
155
  end
152
156
 
153
157
  offset_mapping += (init_next .. text.length).map do |i|
158
+ m = [i, j]
154
159
  j += 1
155
- [i, j - 1]
160
+ m
156
161
  end
157
162
 
158
163
  # To execute the long letter mapping
@@ -161,7 +166,7 @@ class TextAlignment::CharMapping
161
166
  end
162
167
 
163
168
  # To replace multi whitespace sequences to a space
164
- text.gsub!(/\s{2,}/, ' ')
169
+ text.gsub!(/\s{1,}/, '')
165
170
 
166
171
  [text, offset_mapping]
167
172
  end
@@ -175,7 +180,7 @@ if __FILE__ == $0
175
180
  exit
176
181
  end
177
182
  annotations = JSON.parse File.read(ARGV[0]).strip, symbolize_names: true
178
- denotations = annotations[:denotations]
183
+ denotations = annotations[:denotations] || []
179
184
  if denotations.nil? && annotations[:tracks]
180
185
  denotations = annotations[:tracks].first[:denotations]
181
186
  end
@@ -41,7 +41,7 @@ class TextAlignment::CultivationMap
41
41
  end
42
42
 
43
43
  def next_cultivated_position(position)
44
- region = @map.bsearch{|r| position < r[0]}
44
+ region = @map.bsearch{|r| position <= r[0]}
45
45
  region.nil? ? nil : region[0]
46
46
  end
47
47
 
@@ -56,7 +56,7 @@ class TextAlignment::CultivationMap
56
56
  else
57
57
  if front_open?(region, closed_parts)
58
58
  if rear_open?(region, closed_parts)
59
- [:middle_closed, [closed_parts.first[1], closed_parts.last[0]]]
59
+ [:middle_closed, [closed_parts.first[0], closed_parts.last[1]]]
60
60
  else
61
61
  [:front_open, [region[0], closed_parts.first[0]]]
62
62
  end
@@ -70,7 +70,7 @@ class TextAlignment::CultivationMap
70
70
  end
71
71
  end
72
72
 
73
- def index(target, string, position)
73
+ def index(target, string, position = 0)
74
74
  length = target.length
75
75
  loop do
76
76
  _begin = string.index(target, position)
@@ -147,13 +147,25 @@ class TextAlignment::MixedAlignment
147
147
  # recoverability
148
148
  count_nws = sdiff.count{|d| d.old_element =~ /\S/}
149
149
  count_nws_match = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/}
150
-
151
150
  coverage = count_nws_match.to_f / count_nws
152
151
 
153
152
  # fragmentation rate
154
- count_ofrag = sdiff.count{|d| d.old_element =~ /\s/}
155
- count_frag = sdiff.collect{|d| (d.action == '=') && (d.old_element =~/\s/) ? ' ' : d.action}.join.scan(/=+/).count
156
- rate_frag = count_ofrag.to_f / count_frag
153
+ frag_str = sdiff.collect do |d|
154
+ case d.action
155
+ when '='
156
+ '='
157
+ when '-'
158
+ ''
159
+ when '+'
160
+ # (d.new_element =~ /\S/) ? '+' : ''
161
+ '+'
162
+ else
163
+ ''
164
+ end
165
+ end.join.sub(/^[^=]++/, '').sub(/[^=]+$/, '')
166
+
167
+ count_frag = frag_str.scan(/=+/).count
168
+ rate_frag = 1.0 / count_frag
157
169
 
158
170
  similarity = coverage * rate_frag
159
171
  end
@@ -21,7 +21,7 @@ class TextAlignment::TextAlignment
21
21
  @to_prevent_overlap = to_prevent_overlap
22
22
 
23
23
  @original_text = nil
24
- @block_alignment = nil
24
+ @blocks = nil
25
25
  @cultivation_map = TextAlignment::CultivationMap.new
26
26
  end
27
27
 
@@ -39,45 +39,20 @@ class TextAlignment::TextAlignment
39
39
  denotations_mapped = @text_mapping.enmap_denotations(denotations)
40
40
 
41
41
  ## To generate the block_alignment of the input text against the reference text
42
- # Initialization
43
- # @block_alignment = {text: @original_text, reference_text: @original_reference_text, denotations: denotations}
44
- @block_alignment = {text: @mapped_text, reference_text: @mapped_reference_text, denotations: denotations}
45
-
46
- # Generation
47
- @block_alignment[:blocks] = if r = whole_block_alignment(@mapped_text, @mapped_reference_text, @cultivation_map)
42
+ @blocks = if r = whole_block_alignment(@mapped_text, @mapped_reference_text, @cultivation_map)
48
43
  r
49
44
  else
50
45
  find_block_alignment(@mapped_text, @mapped_reference_text, denotations_mapped, @cultivation_map)
51
46
  end
52
- end
53
-
54
- def update_cultivation_map
55
- return if @block_alignment.nil? || @block_alignment[:blocks].nil?
56
-
57
- ## To update the cultivation map
58
- newly_cultivated_regions = @block_alignment[:blocks].collect do |b|
59
- if b[:alignment] == :block || b[:alignment] == :term
60
- [b[:target][:begin], b[:target][:end]]
61
- else
62
- nil
63
- end
64
- end.compact.inject([]) do |condensed, region|
65
- if condensed.empty? || (condensed.last.last + 1 < region.first)
66
- condensed.push region
67
- else
68
- condensed.last[1] = region.last
69
- end
70
- condensed
71
- end
72
47
 
73
- @cultivation_map.cultivate(newly_cultivated_regions)
48
+ @block_alignment = {text: @original_text, reference_text: @original_reference_text, denotations: denotations, blocks: demap_blocks(@blocks)}
74
49
  end
75
50
 
76
51
  def transform_begin_position(_begin_position)
77
52
  begin_position = @text_mapping.enmap_position(_begin_position)
78
53
 
79
- i = @block_alignment[:blocks].index{|b| b[:source][:end] > begin_position}
80
- block = @block_alignment[:blocks][i]
54
+ i = @blocks.index{|b| b[:source][:end] > begin_position}
55
+ block = @blocks[i]
81
56
 
82
57
  b = if block[:alignment] == :block || block[:alignment] == :term
83
58
  begin_position + block[:delta]
@@ -98,8 +73,8 @@ class TextAlignment::TextAlignment
98
73
  def transform_end_position(_end_position)
99
74
  end_position = @text_mapping.enmap_position(_end_position)
100
75
 
101
- i = @block_alignment[:blocks].index{|b| b[:source][:end] >= end_position}
102
- block = @block_alignment[:blocks][i]
76
+ i = @blocks.index{|b| b[:source][:end] >= end_position}
77
+ block = @blocks[i]
103
78
 
104
79
  e = if block[:alignment] == :block || block[:alignment] == :term
105
80
  end_position + block[:delta]
@@ -160,8 +135,8 @@ class TextAlignment::TextAlignment
160
135
  end
161
136
 
162
137
  def alignment_show
163
- stext = @mapped_text
164
- ttext = @mapped_reference_text
138
+ stext = @block_alignment[:text]
139
+ ttext = @block_alignment[:reference_text]
165
140
 
166
141
  show = ''
167
142
  @block_alignment[:blocks].each do |a|
@@ -276,20 +251,32 @@ class TextAlignment::TextAlignment
276
251
  region_state, state_region = cultivation_map.region_state([b2, e2])
277
252
  case region_state
278
253
  when :closed
279
- []
254
+ [{source:{begin:b1, end:e1}, alignment: :empty}]
280
255
  when :front_open
281
- oe2 = state_region[1]
282
- me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
283
- local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
256
+ if sum.empty? # when there is no preceding matched block
257
+ [{source:{begin:b1, end:e1}, alignment: :empty}]
258
+ else
259
+ oe2 = state_region[1]
260
+ me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
261
+ local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
262
+ end
284
263
  when :rear_open
285
- ob2 = state_region[0]
286
- mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
287
- local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
264
+ if cblock.nil? # when there is no following matched block
265
+ [{source:{begin:b1, end:e1}, alignment: :empty}]
266
+ else
267
+ ob2 = state_region[0]
268
+ mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
269
+ local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
270
+ end
288
271
  when :middle_closed
289
- oe2 = state_region[0]
290
- me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
291
- attempt1 = local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
292
- if attempt1.empty?
272
+ attempt1 = if sum.empty?
273
+ [{source:{begin:b1, end:e1}, alignment: :empty}]
274
+ else
275
+ oe2 = state_region[0]
276
+ me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
277
+ local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
278
+ end
279
+ if (attempt1.empty? || attempt1.first[:alignment] == :empty) && !cblock.nil?
293
280
  ob2 = state_region[1]
294
281
  mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
295
282
  local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
@@ -298,8 +285,12 @@ class TextAlignment::TextAlignment
298
285
  end
299
286
  else # :open
300
287
  if (e2 - b2) > len_buffer
301
- attempt1 = local_alignment(str1, b1, e1, str2, b2, b2 + len_buffer, denotations, cultivation_map)
302
- if attempt1.empty?
288
+ attempt1 = if sum.empty?
289
+ [{source:{begin:b1, end:e1}, alignment: :empty}]
290
+ else
291
+ local_alignment(str1, b1, e1, str2, b2, b2 + len_buffer, denotations, cultivation_map)
292
+ end
293
+ if (attempt1.empty? || attempt1.first[:alignment] == :empty) && !cblock.nil?
303
294
  local_alignment(str1, b1, e1, str2, e2 - len_buffer, e2, denotations, cultivation_map)
304
295
  else
305
296
  attempt1
@@ -318,10 +309,10 @@ class TextAlignment::TextAlignment
318
309
  end
319
310
 
320
311
  def whole_block_alignment(str1, str2, cultivation_map)
321
- block_begin = cultivation_map.index(str1, str2, 0)
312
+ block_begin = cultivation_map.index(str1, str2)
322
313
  return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}] unless block_begin.nil?
323
314
 
324
- block_begin = cultivation_map.index(str1.downcase, str2.downcase, 0)
315
+ block_begin = cultivation_map.index(str1.downcase, str2.downcase)
325
316
  return [{source:{begin:0, end:str1.length}, target:{begin:block_begin, end:block_begin + str1.length}, delta:block_begin, alignment: :block}] unless block_begin.nil?
326
317
 
327
318
  nil
@@ -329,7 +320,7 @@ class TextAlignment::TextAlignment
329
320
 
330
321
  def local_alignment(str1, b1, e1, str2, b2, e2, denotations = nil, cultivation_map)
331
322
  tblocks = term_based_alignment(str1, b1, e1, str2, b2, e2, denotations, cultivation_map)
332
- if tblocks.empty?
323
+ if tblocks.empty? || tblocks.first[:alignment] == :empty
333
324
  lcs_alignment(str1, b1, e1, str2, b2, e2, cultivation_map)
334
325
  else
335
326
  tblocks
@@ -405,4 +396,38 @@ class TextAlignment::TextAlignment
405
396
  end
406
397
  end
407
398
 
399
+ def update_cultivation_map
400
+ return if @blocks.nil?
401
+
402
+ ## To update the cultivation map
403
+ newly_cultivated_regions = @blocks.collect do |b|
404
+ if b[:alignment] == :block || b[:alignment] == :term
405
+ [b[:target][:begin], b[:target][:end]]
406
+ else
407
+ nil
408
+ end
409
+ end.compact.inject([]) do |condensed, region|
410
+ if condensed.empty? || (condensed.last.last + 1 < region.first)
411
+ condensed.push region
412
+ else
413
+ condensed.last[1] = region.last
414
+ end
415
+ condensed
416
+ end
417
+
418
+ @cultivation_map.cultivate(newly_cultivated_regions)
419
+ end
420
+
421
+ def demap_blocks(_blocks)
422
+ return nil if _blocks.nil?
423
+
424
+ blocks = _blocks.map{|b| b.dup}
425
+ blocks.each do |b|
426
+ b[:source] = {begin:@text_mapping.demap_position(b[:source][:begin]), end:@text_mapping.demap_position(b[:source][:end])} if b[:source]
427
+ b[:target] = {begin:@rtext_mapping.demap_position(b[:target][:begin]), end:@rtext_mapping.demap_position(b[:target][:end])} if b[:target]
428
+ end
429
+
430
+ blocks
431
+ end
432
+
408
433
  end
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.11.1'
2
+ VERSION = '0.11.7'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.11.1
4
+ version: 0.11.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-03-08 00:00:00.000000000 Z
11
+ date: 2021-03-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary