text_alignment 0.11.8 → 0.11.10

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d07269e998620f3e8f1564a4b81d3710e1898058248377224be0c5398690872f
4
- data.tar.gz: 7f457820b2d5a9a9dcbf00ceb89342ca7264e2ef85afd02bd2463256e71680dd
3
+ metadata.gz: 5f95c7fb8bfdeb768fbd1400f7e785a2ed18016322f2cdcba3ea7196aa4e86ac
4
+ data.tar.gz: cad0296a218108884703af07bcbf1b303a6e12c51ab14ad872586c50cfc8e82c
5
5
  SHA512:
6
- metadata.gz: 299320641fb58973b89c64f1db0c79007f43103cb063bd171f386d9118cb49257765d0652024bbd7869daa88c4f18da7338b8d60285f3e0a95258049b9e62562
7
- data.tar.gz: 3bdd474509910999521c52808a535227dbf5a6aa74345e39dc0e37eae7a000b51b1df3dbb50598abc865a036a4990cee0f3ccce35471d69cb2ef4bb9633fc83a
6
+ metadata.gz: 5d18a8d142974967fcfe358b561c83d76d1c5d8d2a277092ed4a8b42de14cd791b004830ff9f145735e6bc0a1265295f4fd6190c674ba62b57190d7e035a863b
7
+ data.tar.gz: c155ff9780d5f82893825787ee39ba319ddb1f16396d6a933334a83e485ff4d2eb2840d5fc1399f9873cd69a2458f029ced0d128d9099c4f08137d360e3e2007
@@ -110,20 +110,28 @@ end
110
110
 
111
111
 
112
112
  ## Options
113
- overlap_p = false
114
- debug_p = false
113
+ options = {}
114
+ verbose = false
115
115
 
116
116
  ## command line option processing
117
117
  require 'optparse'
118
118
  optparse = OptionParser.new do |opts|
119
119
  opts.banner = "Usage: align_annotations [options] target_annotations(.json|.txt) reference_text(.json|.txt)"
120
120
 
121
- opts.on('-o', '--overlap', 'tells it to assume there may be overlapping texts.') do
122
- overlap_p = true
121
+ opts.on('-d', '--duplicate', 'tells it to assume there may be duplicate texts.') do
122
+ options[:duplicate_texts] = true
123
123
  end
124
124
 
125
- opts.on('-d', '--debug', 'tells it to show debugging information.') do
126
- debug_p = true
125
+ opts.on('-w', '--no-whitespaces', 'tells it to ignore whitespaces.') do
126
+ options[:to_ignore_whitespaces] = true
127
+ end
128
+
129
+ opts.on('-o', '--no-order', 'tells it to ignore the order of the texts.') do
130
+ options[:to_ignore_text_order] = true
131
+ end
132
+
133
+ opts.on('-v', '--verbose', 'tells it to show the state verbosely for debugging.') do
134
+ verbose = true
127
135
  end
128
136
 
129
137
  opts.on('-h', '--help', 'displays this screen.') do
@@ -142,12 +150,12 @@ end
142
150
  source_annotations = read_annotations(ARGV[0])
143
151
  reference_text = read_text(ARGV[1])
144
152
 
145
- alignment = TextAlignment::TextAlignment.new(reference_text, !overlap_p)
153
+ alignment = TextAlignment::TextAlignment.new(reference_text, options)
146
154
 
147
155
  target_annotations = if source_annotations.class == Array
148
- align_mannotations(source_annotations, reference_text, alignment, debug_p)
156
+ align_mannotations(source_annotations, reference_text, alignment, verbose)
149
157
  else
150
- denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment, debug_p)
158
+ denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment, verbose)
151
159
  source_annotations.merge({text:reference_text, denotations:denotations})
152
160
  end
153
161
 
@@ -6,17 +6,18 @@ module TextAlignment; end unless defined? TextAlignment
6
6
 
7
7
  class TextAlignment::AnchorFinder
8
8
 
9
- def initialize(source_str, target_str, cultivation_map, squeeze_ws = true)
10
- @method_get_left_windows, @method_get_right_windows = if squeeze_ws
11
- [method(:get_left_windows), method(:get_right_windows)]
12
- else
9
+ def initialize(source_str, target_str, cultivation_map, to_ignore_whitespaces = false, to_ignore_text_order = false)
10
+ @method_get_left_windows, @method_get_right_windows = if to_ignore_whitespaces
13
11
  [method(:get_left_windows_no_squeeze_ws), method(:get_right_windows_no_squeeze_ws)]
12
+ else
13
+ [method(:get_left_windows), method(:get_right_windows)]
14
14
  end
15
15
 
16
16
  @s1 = source_str.downcase
17
17
  @s2 = target_str.downcase
18
18
 
19
19
  @cultivation_map = cultivation_map
20
+ @to_ignore_text_order = to_ignore_text_order
20
21
 
21
22
  @size_ngram = TextAlignment::SIZE_NGRAM
22
23
  @size_window = TextAlignment::SIZE_WINDOW
@@ -71,10 +72,7 @@ class TextAlignment::AnchorFinder
71
72
  # to get the anchor to search for in s2
72
73
  anchor = @s1[beg_s1, @size_ngram]
73
74
 
74
- # comment out below with the assumption that texts are in the same order
75
- # search_position = 0
76
- search_position = @pos_s2_last_match
77
-
75
+ search_position = @to_ignore_text_order ? 0 : @pos_s2_last_match
78
76
  beg_s2_candidates = find_beg_s2_candidates(anchor, search_position)
79
77
  return nil if beg_s2_candidates.empty?
80
78
 
@@ -80,10 +80,10 @@ TextAlignment::CHAR_MAPPING = [
80
80
 
81
81
 
82
82
  class TextAlignment::CharMapping
83
- attr_reader :mapped_text
83
+ attr_reader :mapped_text, :index_enmap
84
84
 
85
- def initialize(_text, char_mapping = nil, squeeze_ws_to = 1)
86
- if squeeze_ws_to == 0
85
+ def initialize(_text, char_mapping = nil, to_ignore_whitespaces = false)
86
+ if to_ignore_whitespaces
87
87
  @method_get_positions_squeeze_ws = method(:get_positions_squeeze_ws_0)
88
88
  @method_squeeze_ws = method(:squeeze_ws_0!)
89
89
  else
@@ -140,29 +140,25 @@ class TextAlignment::CharMapping
140
140
 
141
141
  # To get the replacement positions, (position, old_length, new_length), for consecutive whitespaces
142
142
  rpositions += @method_get_positions_squeeze_ws.call(text)
143
-
144
143
  rpositions.sort!{|a, b| a[0] <=> b[0]}
145
144
 
146
145
  # To get the offset_mapping before and after replacement
147
- offset_mapping = []
148
- init_next = 0
149
- j = 0
150
-
151
- rpositions.each do |loc, old_len, new_len|
152
- offset_mapping += (init_next .. loc).map do |i|
153
- m = [i, j]
154
- j += 1
146
+ offset_mapping = begin
147
+ i, j = 0, 0
148
+
149
+ offset_mappings = rpositions.map do |loc, old_len, new_len|
150
+ pre_len = loc - i
151
+ m = (0 .. pre_len).map{|c| [i + c, j + c]}
152
+ i = loc + old_len
153
+ j += pre_len + new_len
154
+
155
155
  m
156
156
  end
157
157
 
158
- init_next = loc + old_len
159
- j += (new_len - 1)
160
- end
158
+ pre_len = text.length - i
159
+ offset_mappings << (0 .. pre_len).map{|c| [i + c, j + c]}
161
160
 
162
- offset_mapping += (init_next .. text.length).map do |i|
163
- m = [i, j]
164
- j += 1
165
- m
161
+ offset_mappings.reduce(:+)
166
162
  end
167
163
 
168
164
  # To execute the long letter mapping
@@ -179,7 +175,7 @@ class TextAlignment::CharMapping
179
175
  # To get squeeze positions of whitespaces to one
180
176
  def get_positions_squeeze_ws_1(text)
181
177
  rpositions = []
182
- text.scan(/s{2,}/) do |s|
178
+ text.scan(/\s{2,}/) do |s|
183
179
  loc = $~.begin(0)
184
180
  len = $~.end(0) - loc
185
181
  rpositions << [loc, len, 1]
@@ -189,13 +185,7 @@ class TextAlignment::CharMapping
189
185
 
190
186
  # To get squeeze positions of whitespaces to zero
191
187
  def get_positions_squeeze_ws_0(text)
192
- rpositions = []
193
- text.scan(/\s+/) do |s|
194
- loc = $~.begin(0)
195
- len = $~.end(0) - loc
196
- rpositions << [loc, len, 0]
197
- end
198
- rpositions
188
+ text.enum_for(:scan, /\s+/).map{[b = $~.begin(0), $~.end(0) - b, 0]}
199
189
  end
200
190
 
201
191
  def squeeze_ws_1!(text)
@@ -210,6 +200,7 @@ end
210
200
 
211
201
  if __FILE__ == $0
212
202
  require 'json'
203
+ # require 'profile'
213
204
 
214
205
  unless ARGV.length == 1
215
206
  warn "#{$0} an_annotation_json_file.json"
@@ -221,7 +212,8 @@ if __FILE__ == $0
221
212
  denotations = annotations[:tracks].first[:denotations]
222
213
  end
223
214
 
224
- text_mapping = TextAlignment::CharMapping.new(annotations[:text])
215
+ text_mapping = TextAlignment::CharMapping.new(annotations[:text], nil, false)
216
+ # text_mapping = TextAlignment::CharMapping.new(annotations[:text], nil, true)
225
217
  text_mapped = text_mapping.mapped_text
226
218
  denotations_mapped = text_mapping.enmap_denotations(denotations)
227
219
  new_annotations = {text:text_mapped, denotations:denotations_mapped}
@@ -16,11 +16,12 @@ class TextAlignment::TextAlignment
16
16
  raise ArgumentError, "nil text" if reference_text.nil?
17
17
 
18
18
  options ||= {}
19
- @to_prevent_overlap = options[:to_prevent_overlap] || false
20
- @squeeze_ws_to = options[:squeeze_ws_to] || 0
19
+ @duplicate_texts = options[:duplicate_texts] || false
20
+ @to_ignore_whitespaces = options[:to_ignore_whitespaces] || false
21
+ @to_ignore_text_order = options[:to_ignore_text_order] || false
21
22
 
22
23
  @original_reference_text = reference_text
23
- @rtext_mapping = TextAlignment::CharMapping.new(reference_text, nil, @squeeze_ws_to)
24
+ @rtext_mapping = TextAlignment::CharMapping.new(reference_text, nil, @to_ignore_whitespaces)
24
25
  @mapped_reference_text = @rtext_mapping.mapped_text
25
26
 
26
27
  @original_text = nil
@@ -30,12 +31,12 @@ class TextAlignment::TextAlignment
30
31
 
31
32
  def align(text, denotations = nil)
32
33
  # To maintain the cultivation map
33
- update_cultivation_map if @to_prevent_overlap
34
+ update_cultivation_map unless @duplicate_texts
34
35
 
35
36
  # In case the input text is the same as the previous one, reuse the previous text mapping
36
37
  unless @original_text && @original_text == text
37
38
  @original_text = text
38
- @text_mapping = TextAlignment::CharMapping.new(text, nil, @squeeze_ws_to)
39
+ @text_mapping = TextAlignment::CharMapping.new(text, nil, @to_ignore_whitespaces)
39
40
  end
40
41
 
41
42
  @mapped_text = @text_mapping.mapped_text
@@ -205,7 +206,7 @@ class TextAlignment::TextAlignment
205
206
 
206
207
  def find_block_alignment(str1, str2, denotations, cultivation_map)
207
208
  ## to find block alignments
208
- anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, cultivation_map, @squeeze_ws_to == 1)
209
+ anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, cultivation_map, @to_ignore_whitespaces, @to_ignore_text_order)
209
210
 
210
211
  blocks = []
211
212
  while block = anchor_finder.get_next_anchor
@@ -241,68 +242,75 @@ class TextAlignment::TextAlignment
241
242
  b1 = lblock.nil? ? 0 : lblock[:source][:end]
242
243
  e1 = cblock.nil? ? str1.length : cblock[:source][:begin]
243
244
 
244
- if b1 < e1
245
+ if b1 <= e1
246
+ _str1 = str1[b1 ... e1]
247
+
245
248
  b2 = lblock.nil? ? 0 : lblock[:target][:end]
246
249
  e2 = cblock.nil? ? str2.length : cblock[:target][:begin]
247
- _str1 = str1[b1 ... e1]
248
- _str2 = str2[b2 ... e2]
249
250
 
250
- sum += if _str1.strip.empty? || _str2.strip.empty?
251
- [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
252
- else
253
- len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
254
- region_state, state_region = cultivation_map.region_state([b2, e2])
255
- case region_state
256
- when :closed
257
- [{source:{begin:b1, end:e1}, alignment: :empty}]
258
- when :front_open
259
- if sum.empty? # when there is no preceding matched block
260
- [{source:{begin:b1, end:e1}, alignment: :empty}]
261
- else
262
- oe2 = state_region[1]
263
- me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
264
- local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
265
- end
266
- when :rear_open
267
- if cblock.nil? # when there is no following matched block
268
- [{source:{begin:b1, end:e1}, alignment: :empty}]
269
- else
270
- ob2 = state_region[0]
271
- mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
272
- local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
273
- end
274
- when :middle_closed
275
- attempt1 = if sum.empty?
251
+ if b2 < e2
252
+ _str2 = str2[b2 ... e2]
253
+
254
+ sum += if _str1.strip.empty? || _str2.strip.empty?
255
+ [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
256
+ else
257
+ len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
258
+ region_state, state_region = cultivation_map.region_state([b2, e2])
259
+ case region_state
260
+ when :closed
276
261
  [{source:{begin:b1, end:e1}, alignment: :empty}]
277
- else
278
- oe2 = state_region[0]
279
- me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
280
- local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
281
- end
282
- if (attempt1.empty? || attempt1.first[:alignment] == :empty) && !cblock.nil?
283
- ob2 = state_region[1]
284
- mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
285
- local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
286
- else
287
- attempt1
288
- end
289
- else # :open
290
- if (e2 - b2) > len_buffer
262
+ when :front_open
263
+ if sum.empty? # when there is no preceding matched block
264
+ [{source:{begin:b1, end:e1}, alignment: :empty}]
265
+ else
266
+ oe2 = state_region[1]
267
+ me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
268
+ local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
269
+ end
270
+ when :rear_open
271
+ if cblock.nil? # when there is no following matched block
272
+ [{source:{begin:b1, end:e1}, alignment: :empty}]
273
+ else
274
+ ob2 = state_region[0]
275
+ mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
276
+ local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
277
+ end
278
+ when :middle_closed
291
279
  attempt1 = if sum.empty?
292
280
  [{source:{begin:b1, end:e1}, alignment: :empty}]
293
281
  else
294
- local_alignment(str1, b1, e1, str2, b2, b2 + len_buffer, denotations, cultivation_map)
282
+ oe2 = state_region[0]
283
+ me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
284
+ local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
295
285
  end
296
286
  if (attempt1.empty? || attempt1.first[:alignment] == :empty) && !cblock.nil?
297
- local_alignment(str1, b1, e1, str2, e2 - len_buffer, e2, denotations, cultivation_map)
287
+ ob2 = state_region[1]
288
+ mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
289
+ local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
298
290
  else
299
291
  attempt1
300
292
  end
301
- else
302
- local_alignment(str1, b1, e1, str2, b2, e2, denotations, cultivation_map)
293
+ else # :open
294
+ if (e2 - b2) > len_buffer
295
+ attempt1 = if sum.empty?
296
+ [{source:{begin:b1, end:e1}, alignment: :empty}]
297
+ else
298
+ local_alignment(str1, b1, e1, str2, b2, b2 + len_buffer, denotations, cultivation_map)
299
+ end
300
+ if (attempt1.empty? || attempt1.first[:alignment] == :empty) && !cblock.nil?
301
+ local_alignment(str1, b1, e1, str2, e2 - len_buffer, e2, denotations, cultivation_map)
302
+ else
303
+ attempt1
304
+ end
305
+ else
306
+ local_alignment(str1, b1, e1, str2, b2, e2, denotations, cultivation_map)
307
+ end
303
308
  end
304
309
  end
310
+ elsif b2 > e2 # when out of order
311
+ # ToDo
305
312
  end
313
+
306
314
  end
307
315
 
308
316
  lblock = cblock
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.11.8'
2
+ VERSION = '0.11.10'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.11.8
4
+ version: 0.11.10
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-04-06 00:00:00.000000000 Z
11
+ date: 2021-04-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary
@@ -111,7 +111,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
111
111
  - !ruby/object:Gem::Version
112
112
  version: '0'
113
113
  requirements: []
114
- rubygems_version: 3.0.8
114
+ rubygems_version: 3.0.9
115
115
  signing_key:
116
116
  specification_version: 4
117
117
  summary: Ruby class for aligning two character strings