text_alignment 0.11.8 → 0.11.10

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d07269e998620f3e8f1564a4b81d3710e1898058248377224be0c5398690872f
4
- data.tar.gz: 7f457820b2d5a9a9dcbf00ceb89342ca7264e2ef85afd02bd2463256e71680dd
3
+ metadata.gz: 5f95c7fb8bfdeb768fbd1400f7e785a2ed18016322f2cdcba3ea7196aa4e86ac
4
+ data.tar.gz: cad0296a218108884703af07bcbf1b303a6e12c51ab14ad872586c50cfc8e82c
5
5
  SHA512:
6
- metadata.gz: 299320641fb58973b89c64f1db0c79007f43103cb063bd171f386d9118cb49257765d0652024bbd7869daa88c4f18da7338b8d60285f3e0a95258049b9e62562
7
- data.tar.gz: 3bdd474509910999521c52808a535227dbf5a6aa74345e39dc0e37eae7a000b51b1df3dbb50598abc865a036a4990cee0f3ccce35471d69cb2ef4bb9633fc83a
6
+ metadata.gz: 5d18a8d142974967fcfe358b561c83d76d1c5d8d2a277092ed4a8b42de14cd791b004830ff9f145735e6bc0a1265295f4fd6190c674ba62b57190d7e035a863b
7
+ data.tar.gz: c155ff9780d5f82893825787ee39ba319ddb1f16396d6a933334a83e485ff4d2eb2840d5fc1399f9873cd69a2458f029ced0d128d9099c4f08137d360e3e2007
@@ -110,20 +110,28 @@ end
110
110
 
111
111
 
112
112
  ## Options
113
- overlap_p = false
114
- debug_p = false
113
+ options = {}
114
+ verbose = false
115
115
 
116
116
  ## command line option processing
117
117
  require 'optparse'
118
118
  optparse = OptionParser.new do |opts|
119
119
  opts.banner = "Usage: align_annotations [options] target_annotations(.json|.txt) reference_text(.json|.txt)"
120
120
 
121
- opts.on('-o', '--overlap', 'tells it to assume there may be overlapping texts.') do
122
- overlap_p = true
121
+ opts.on('-d', '--duplicate', 'tells it to assume there may be duplicate texts.') do
122
+ options[:duplicate_texts] = true
123
123
  end
124
124
 
125
- opts.on('-d', '--debug', 'tells it to show debugging information.') do
126
- debug_p = true
125
+ opts.on('-w', '--no-whitespaces', 'tells it to ignore whitespaces.') do
126
+ options[:to_ignore_whitespaces] = true
127
+ end
128
+
129
+ opts.on('-o', '--no-order', 'tells it to ignore the order of the texts.') do
130
+ options[:to_ignore_text_order] = true
131
+ end
132
+
133
+ opts.on('-v', '--verbose', 'tells it to show the state verbosely for debugging.') do
134
+ verbose = true
127
135
  end
128
136
 
129
137
  opts.on('-h', '--help', 'displays this screen.') do
@@ -142,12 +150,12 @@ end
142
150
  source_annotations = read_annotations(ARGV[0])
143
151
  reference_text = read_text(ARGV[1])
144
152
 
145
- alignment = TextAlignment::TextAlignment.new(reference_text, !overlap_p)
153
+ alignment = TextAlignment::TextAlignment.new(reference_text, options)
146
154
 
147
155
  target_annotations = if source_annotations.class == Array
148
- align_mannotations(source_annotations, reference_text, alignment, debug_p)
156
+ align_mannotations(source_annotations, reference_text, alignment, verbose)
149
157
  else
150
- denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment, debug_p)
158
+ denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment, verbose)
151
159
  source_annotations.merge({text:reference_text, denotations:denotations})
152
160
  end
153
161
 
@@ -6,17 +6,18 @@ module TextAlignment; end unless defined? TextAlignment
6
6
 
7
7
  class TextAlignment::AnchorFinder
8
8
 
9
- def initialize(source_str, target_str, cultivation_map, squeeze_ws = true)
10
- @method_get_left_windows, @method_get_right_windows = if squeeze_ws
11
- [method(:get_left_windows), method(:get_right_windows)]
12
- else
9
+ def initialize(source_str, target_str, cultivation_map, to_ignore_whitespaces = false, to_ignore_text_order = false)
10
+ @method_get_left_windows, @method_get_right_windows = if to_ignore_whitespaces
13
11
  [method(:get_left_windows_no_squeeze_ws), method(:get_right_windows_no_squeeze_ws)]
12
+ else
13
+ [method(:get_left_windows), method(:get_right_windows)]
14
14
  end
15
15
 
16
16
  @s1 = source_str.downcase
17
17
  @s2 = target_str.downcase
18
18
 
19
19
  @cultivation_map = cultivation_map
20
+ @to_ignore_text_order = to_ignore_text_order
20
21
 
21
22
  @size_ngram = TextAlignment::SIZE_NGRAM
22
23
  @size_window = TextAlignment::SIZE_WINDOW
@@ -71,10 +72,7 @@ class TextAlignment::AnchorFinder
71
72
  # to get the anchor to search for in s2
72
73
  anchor = @s1[beg_s1, @size_ngram]
73
74
 
74
- # comment out below with the assumption that texts are in the same order
75
- # search_position = 0
76
- search_position = @pos_s2_last_match
77
-
75
+ search_position = @to_ignore_text_order ? 0 : @pos_s2_last_match
78
76
  beg_s2_candidates = find_beg_s2_candidates(anchor, search_position)
79
77
  return nil if beg_s2_candidates.empty?
80
78
 
@@ -80,10 +80,10 @@ TextAlignment::CHAR_MAPPING = [
80
80
 
81
81
 
82
82
  class TextAlignment::CharMapping
83
- attr_reader :mapped_text
83
+ attr_reader :mapped_text, :index_enmap
84
84
 
85
- def initialize(_text, char_mapping = nil, squeeze_ws_to = 1)
86
- if squeeze_ws_to == 0
85
+ def initialize(_text, char_mapping = nil, to_ignore_whitespaces = false)
86
+ if to_ignore_whitespaces
87
87
  @method_get_positions_squeeze_ws = method(:get_positions_squeeze_ws_0)
88
88
  @method_squeeze_ws = method(:squeeze_ws_0!)
89
89
  else
@@ -140,29 +140,25 @@ class TextAlignment::CharMapping
140
140
 
141
141
  # To get the replacement positions, (position, old_length, new_length), for consecutive whitespaces
142
142
  rpositions += @method_get_positions_squeeze_ws.call(text)
143
-
144
143
  rpositions.sort!{|a, b| a[0] <=> b[0]}
145
144
 
146
145
  # To get the offset_mapping before and after replacement
147
- offset_mapping = []
148
- init_next = 0
149
- j = 0
150
-
151
- rpositions.each do |loc, old_len, new_len|
152
- offset_mapping += (init_next .. loc).map do |i|
153
- m = [i, j]
154
- j += 1
146
+ offset_mapping = begin
147
+ i, j = 0, 0
148
+
149
+ offset_mappings = rpositions.map do |loc, old_len, new_len|
150
+ pre_len = loc - i
151
+ m = (0 .. pre_len).map{|c| [i + c, j + c]}
152
+ i = loc + old_len
153
+ j += pre_len + new_len
154
+
155
155
  m
156
156
  end
157
157
 
158
- init_next = loc + old_len
159
- j += (new_len - 1)
160
- end
158
+ pre_len = text.length - i
159
+ offset_mappings << (0 .. pre_len).map{|c| [i + c, j + c]}
161
160
 
162
- offset_mapping += (init_next .. text.length).map do |i|
163
- m = [i, j]
164
- j += 1
165
- m
161
+ offset_mappings.reduce(:+)
166
162
  end
167
163
 
168
164
  # To execute the long letter mapping
@@ -179,7 +175,7 @@ class TextAlignment::CharMapping
179
175
  # To get squeeze positions of whitespaces to one
180
176
  def get_positions_squeeze_ws_1(text)
181
177
  rpositions = []
182
- text.scan(/s{2,}/) do |s|
178
+ text.scan(/\s{2,}/) do |s|
183
179
  loc = $~.begin(0)
184
180
  len = $~.end(0) - loc
185
181
  rpositions << [loc, len, 1]
@@ -189,13 +185,7 @@ class TextAlignment::CharMapping
189
185
 
190
186
  # To get squeeze positions of whitespaces to zero
191
187
  def get_positions_squeeze_ws_0(text)
192
- rpositions = []
193
- text.scan(/\s+/) do |s|
194
- loc = $~.begin(0)
195
- len = $~.end(0) - loc
196
- rpositions << [loc, len, 0]
197
- end
198
- rpositions
188
+ text.enum_for(:scan, /\s+/).map{[b = $~.begin(0), $~.end(0) - b, 0]}
199
189
  end
200
190
 
201
191
  def squeeze_ws_1!(text)
@@ -210,6 +200,7 @@ end
210
200
 
211
201
  if __FILE__ == $0
212
202
  require 'json'
203
+ # require 'profile'
213
204
 
214
205
  unless ARGV.length == 1
215
206
  warn "#{$0} an_annotation_json_file.json"
@@ -221,7 +212,8 @@ if __FILE__ == $0
221
212
  denotations = annotations[:tracks].first[:denotations]
222
213
  end
223
214
 
224
- text_mapping = TextAlignment::CharMapping.new(annotations[:text])
215
+ text_mapping = TextAlignment::CharMapping.new(annotations[:text], nil, false)
216
+ # text_mapping = TextAlignment::CharMapping.new(annotations[:text], nil, true)
225
217
  text_mapped = text_mapping.mapped_text
226
218
  denotations_mapped = text_mapping.enmap_denotations(denotations)
227
219
  new_annotations = {text:text_mapped, denotations:denotations_mapped}
@@ -16,11 +16,12 @@ class TextAlignment::TextAlignment
16
16
  raise ArgumentError, "nil text" if reference_text.nil?
17
17
 
18
18
  options ||= {}
19
- @to_prevent_overlap = options[:to_prevent_overlap] || false
20
- @squeeze_ws_to = options[:squeeze_ws_to] || 0
19
+ @duplicate_texts = options[:duplicate_texts] || false
20
+ @to_ignore_whitespaces = options[:to_ignore_whitespaces] || false
21
+ @to_ignore_text_order = options[:to_ignore_text_order] || false
21
22
 
22
23
  @original_reference_text = reference_text
23
- @rtext_mapping = TextAlignment::CharMapping.new(reference_text, nil, @squeeze_ws_to)
24
+ @rtext_mapping = TextAlignment::CharMapping.new(reference_text, nil, @to_ignore_whitespaces)
24
25
  @mapped_reference_text = @rtext_mapping.mapped_text
25
26
 
26
27
  @original_text = nil
@@ -30,12 +31,12 @@ class TextAlignment::TextAlignment
30
31
 
31
32
  def align(text, denotations = nil)
32
33
  # To maintain the cultivation map
33
- update_cultivation_map if @to_prevent_overlap
34
+ update_cultivation_map unless @duplicate_texts
34
35
 
35
36
  # In case the input text is the same as the previous one, reuse the previous text mapping
36
37
  unless @original_text && @original_text == text
37
38
  @original_text = text
38
- @text_mapping = TextAlignment::CharMapping.new(text, nil, @squeeze_ws_to)
39
+ @text_mapping = TextAlignment::CharMapping.new(text, nil, @to_ignore_whitespaces)
39
40
  end
40
41
 
41
42
  @mapped_text = @text_mapping.mapped_text
@@ -205,7 +206,7 @@ class TextAlignment::TextAlignment
205
206
 
206
207
  def find_block_alignment(str1, str2, denotations, cultivation_map)
207
208
  ## to find block alignments
208
- anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, cultivation_map, @squeeze_ws_to == 1)
209
+ anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, cultivation_map, @to_ignore_whitespaces, @to_ignore_text_order)
209
210
 
210
211
  blocks = []
211
212
  while block = anchor_finder.get_next_anchor
@@ -241,68 +242,75 @@ class TextAlignment::TextAlignment
241
242
  b1 = lblock.nil? ? 0 : lblock[:source][:end]
242
243
  e1 = cblock.nil? ? str1.length : cblock[:source][:begin]
243
244
 
244
- if b1 < e1
245
+ if b1 <= e1
246
+ _str1 = str1[b1 ... e1]
247
+
245
248
  b2 = lblock.nil? ? 0 : lblock[:target][:end]
246
249
  e2 = cblock.nil? ? str2.length : cblock[:target][:begin]
247
- _str1 = str1[b1 ... e1]
248
- _str2 = str2[b2 ... e2]
249
250
 
250
- sum += if _str1.strip.empty? || _str2.strip.empty?
251
- [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
252
- else
253
- len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
254
- region_state, state_region = cultivation_map.region_state([b2, e2])
255
- case region_state
256
- when :closed
257
- [{source:{begin:b1, end:e1}, alignment: :empty}]
258
- when :front_open
259
- if sum.empty? # when there is no preceding matched block
260
- [{source:{begin:b1, end:e1}, alignment: :empty}]
261
- else
262
- oe2 = state_region[1]
263
- me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
264
- local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
265
- end
266
- when :rear_open
267
- if cblock.nil? # when there is no following matched block
268
- [{source:{begin:b1, end:e1}, alignment: :empty}]
269
- else
270
- ob2 = state_region[0]
271
- mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
272
- local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
273
- end
274
- when :middle_closed
275
- attempt1 = if sum.empty?
251
+ if b2 < e2
252
+ _str2 = str2[b2 ... e2]
253
+
254
+ sum += if _str1.strip.empty? || _str2.strip.empty?
255
+ [{source:{begin:b1, end:e1}, target:{begin:b2, end:e2}, alignment: :empty}]
256
+ else
257
+ len_buffer = ((e1 - b1) * (1 + TextAlignment::BUFFER_RATE)).to_i + TextAlignment::BUFFER_MIN
258
+ region_state, state_region = cultivation_map.region_state([b2, e2])
259
+ case region_state
260
+ when :closed
276
261
  [{source:{begin:b1, end:e1}, alignment: :empty}]
277
- else
278
- oe2 = state_region[0]
279
- me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
280
- local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
281
- end
282
- if (attempt1.empty? || attempt1.first[:alignment] == :empty) && !cblock.nil?
283
- ob2 = state_region[1]
284
- mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
285
- local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
286
- else
287
- attempt1
288
- end
289
- else # :open
290
- if (e2 - b2) > len_buffer
262
+ when :front_open
263
+ if sum.empty? # when there is no preceding matched block
264
+ [{source:{begin:b1, end:e1}, alignment: :empty}]
265
+ else
266
+ oe2 = state_region[1]
267
+ me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
268
+ local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
269
+ end
270
+ when :rear_open
271
+ if cblock.nil? # when there is no following matched block
272
+ [{source:{begin:b1, end:e1}, alignment: :empty}]
273
+ else
274
+ ob2 = state_region[0]
275
+ mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
276
+ local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
277
+ end
278
+ when :middle_closed
291
279
  attempt1 = if sum.empty?
292
280
  [{source:{begin:b1, end:e1}, alignment: :empty}]
293
281
  else
294
- local_alignment(str1, b1, e1, str2, b2, b2 + len_buffer, denotations, cultivation_map)
282
+ oe2 = state_region[0]
283
+ me2 = (oe2 - b2) > len_buffer ? b2 + len_buffer : oe2
284
+ local_alignment(str1, b1, e1, str2, b2, me2, denotations, cultivation_map)
295
285
  end
296
286
  if (attempt1.empty? || attempt1.first[:alignment] == :empty) && !cblock.nil?
297
- local_alignment(str1, b1, e1, str2, e2 - len_buffer, e2, denotations, cultivation_map)
287
+ ob2 = state_region[1]
288
+ mb2 = (e2 - ob2) > len_buffer ? e2 - len_buffer : ob2
289
+ local_alignment(str1, b1, e1, str2, mb2, e2, denotations, cultivation_map)
298
290
  else
299
291
  attempt1
300
292
  end
301
- else
302
- local_alignment(str1, b1, e1, str2, b2, e2, denotations, cultivation_map)
293
+ else # :open
294
+ if (e2 - b2) > len_buffer
295
+ attempt1 = if sum.empty?
296
+ [{source:{begin:b1, end:e1}, alignment: :empty}]
297
+ else
298
+ local_alignment(str1, b1, e1, str2, b2, b2 + len_buffer, denotations, cultivation_map)
299
+ end
300
+ if (attempt1.empty? || attempt1.first[:alignment] == :empty) && !cblock.nil?
301
+ local_alignment(str1, b1, e1, str2, e2 - len_buffer, e2, denotations, cultivation_map)
302
+ else
303
+ attempt1
304
+ end
305
+ else
306
+ local_alignment(str1, b1, e1, str2, b2, e2, denotations, cultivation_map)
307
+ end
303
308
  end
304
309
  end
310
+ elsif b2 > e2 # when out of order
311
+ # ToDo
305
312
  end
313
+
306
314
  end
307
315
 
308
316
  lblock = cblock
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.11.8'
2
+ VERSION = '0.11.10'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.11.8
4
+ version: 0.11.10
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-04-06 00:00:00.000000000 Z
11
+ date: 2021-04-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary
@@ -111,7 +111,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
111
111
  - !ruby/object:Gem::Version
112
112
  version: '0'
113
113
  requirements: []
114
- rubygems_version: 3.0.8
114
+ rubygems_version: 3.0.9
115
115
  signing_key:
116
116
  specification_version: 4
117
117
  summary: Ruby class for aligning two character strings