text_alignment 0.11.2 → 0.11.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 81994ec9a8c7c08d2aad32b351b5942fef1748e4035ba762af546d4f3fe7cee8
4
- data.tar.gz: bfa75451d33b9d21c2baa1a52280f03486bf10a41b5ac2a97469f0ca3a4f7379
3
+ metadata.gz: d07269e998620f3e8f1564a4b81d3710e1898058248377224be0c5398690872f
4
+ data.tar.gz: 7f457820b2d5a9a9dcbf00ceb89342ca7264e2ef85afd02bd2463256e71680dd
5
5
  SHA512:
6
- metadata.gz: 73d0ff212a89d6ad33751f87f14a9b292b45ce177c61efd0ede5f852eb3834d1bd3940d202c4d87cfb0422c4dc566dea30c560f8500a220dcd3e1dd492e29eac
7
- data.tar.gz: 6c763a564e339267624bec5c809b334b0adf9951d25409eac21ae4b4582beae4a08739119dfa369022c44c19d4faacfd0e06766c6223bd219a43c3961cfab08c
6
+ metadata.gz: 299320641fb58973b89c64f1db0c79007f43103cb063bd171f386d9118cb49257765d0652024bbd7869daa88c4f18da7338b8d60285f3e0a95258049b9e62562
7
+ data.tar.gz: 3bdd474509910999521c52808a535227dbf5a6aa74345e39dc0e37eae7a000b51b1df3dbb50598abc865a036a4990cee0f3ccce35471d69cb2ef4bb9633fc83a
@@ -2,6 +2,7 @@
2
2
  require 'text_alignment'
3
3
  require 'json'
4
4
  require 'pp'
5
+ require 'optparse'
5
6
 
6
7
  def read_annotations(filename)
7
8
  case File.extname(filename)
@@ -108,24 +109,46 @@ def align_mannotations(source_annotations, reference_text, alignment, debug = fa
108
109
  end
109
110
 
110
111
 
112
+ ## Options
113
+ overlap_p = false
114
+ debug_p = false
115
+
116
+ ## command line option processing
117
+ require 'optparse'
118
+ optparse = OptionParser.new do |opts|
119
+ opts.banner = "Usage: align_annotations [options] target_annotations(.json|.txt) reference_text(.json|.txt)"
120
+
121
+ opts.on('-o', '--overlap', 'tells it to assume there may be overlapping texts.') do
122
+ overlap_p = true
123
+ end
124
+
125
+ opts.on('-d', '--debug', 'tells it to show debugging information.') do
126
+ debug_p = true
127
+ end
128
+
129
+ opts.on('-h', '--help', 'displays this screen.') do
130
+ puts opts
131
+ exit
132
+ end
133
+ end
134
+
135
+ optparse.parse!
136
+
111
137
  unless ARGV.length == 2
112
- warn "align_annotations target_annotations(.json|.txt) reference_annotations(.json|.txt)"
113
- exit
138
+ puts optparse.help
139
+ exit 1
114
140
  end
115
141
 
116
142
  source_annotations = read_annotations(ARGV[0])
117
143
  reference_text = read_text(ARGV[1])
118
144
 
119
- alignment = TextAlignment::TextAlignment.new(reference_text, true)
145
+ alignment = TextAlignment::TextAlignment.new(reference_text, !overlap_p)
120
146
 
121
147
  target_annotations = if source_annotations.class == Array
122
- # align_mannotations(source_annotations, reference_text, alignment, true)
123
- align_mannotations(source_annotations, reference_text, alignment, false)
148
+ align_mannotations(source_annotations, reference_text, alignment, debug_p)
124
149
  else
125
- # denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment)
126
- denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment, false)
150
+ denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment, debug_p)
127
151
  source_annotations.merge({text:reference_text, denotations:denotations})
128
152
  end
129
153
 
130
- # pp alignment.block_alignment
131
154
  # puts target_annotations.to_json
@@ -6,7 +6,13 @@ module TextAlignment; end unless defined? TextAlignment
6
6
 
7
7
  class TextAlignment::AnchorFinder
8
8
 
9
- def initialize(source_str, target_str, cultivation_map)
9
+ def initialize(source_str, target_str, cultivation_map, squeeze_ws = true)
10
+ @method_get_left_windows, @method_get_right_windows = if squeeze_ws
11
+ [method(:get_left_windows), method(:get_right_windows)]
12
+ else
13
+ [method(:get_left_windows_no_squeeze_ws), method(:get_right_windows_no_squeeze_ws)]
14
+ end
15
+
10
16
  @s1 = source_str.downcase
11
17
  @s2 = target_str.downcase
12
18
 
@@ -108,14 +114,14 @@ class TextAlignment::AnchorFinder
108
114
  next
109
115
  end
110
116
 
111
- left_window_s1, left_window_s2 = get_left_windows(beg_s1, beg_s2)
117
+ left_window_s1, left_window_s2 = @method_get_left_windows.call(beg_s1, beg_s2, size_window)
112
118
  if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
113
119
  break unless valid_beg_s2.nil?
114
120
  valid_beg_s2 = beg_s2
115
121
  next
116
122
  end
117
123
 
118
- right_window_s1, right_window_s2 = get_right_windows(beg_s1, beg_s2)
124
+ right_window_s1, right_window_s2 = @method_get_right_windows.call(beg_s1, beg_s2, size_window)
119
125
  if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
120
126
  break unless valid_beg_s2.nil?
121
127
  valid_beg_s2 = beg_s2
@@ -125,7 +131,11 @@ class TextAlignment::AnchorFinder
125
131
 
126
132
  # r == nil means that the inner loop was broken (multiple candidates had passed the tests)
127
133
  # r != nil means that the inner loop was completed (with or w/o a valid beg_s2 found)
128
- break unless r.nil?
134
+ if r.nil?
135
+ valid_beg_s2 = nil
136
+ else
137
+ break
138
+ end
129
139
  end
130
140
 
131
141
  valid_beg_s2
@@ -135,7 +145,7 @@ class TextAlignment::AnchorFinder
135
145
  size_window ||= @size_window
136
146
 
137
147
  # comment out below with the assumption that the beginning of a document gives significant locational information
138
- # return if @beg_s1 < size_window || @beg_s2 < size_window
148
+ # return if beg_s1 < size_window || beg_s2 < size_window
139
149
 
140
150
  window_s1 = ''
141
151
  loc = beg_s1 - 1
@@ -166,7 +176,7 @@ class TextAlignment::AnchorFinder
166
176
  size_window ||= @size_window
167
177
 
168
178
  # comment out below with the assumption that the end of a document gives significant locational information
169
- # return if (@beg_s1 + @size_ngram > (@s1.length - size_window)) || (@beg_s2 + @size_ngram > (@s2.length - size_window))
179
+ # return if (beg_s1 + @size_ngram > (@s1.length - size_window)) || (beg_s2 + @size_ngram > (@s2.length - size_window))
170
180
 
171
181
  window_s1 = ''
172
182
  loc = beg_s1 + @size_ngram
@@ -195,6 +205,44 @@ class TextAlignment::AnchorFinder
195
205
  [window_s1, window_s2]
196
206
  end
197
207
 
208
+ def get_left_windows_no_squeeze_ws(beg_s1, beg_s2, size_window = nil)
209
+ size_window ||= @size_window
210
+
211
+ # comment out below with the assumption that the beginning of a document gives significant locational information
212
+ # return if beg_s1 < size_window || beg_s2 < size_window
213
+
214
+ wbeg = beg_s1 - size_window
215
+ wbeg = 0 if wbeg < 0
216
+ window_s1 = @s1[wbeg ... beg_s1]
217
+
218
+ wbeg = beg_s2 - size_window
219
+ wbeg = 0 if wbeg < 0
220
+ window_s2 = @s2[wbeg ... beg_s2]
221
+
222
+ [window_s1, window_s2]
223
+ end
224
+
225
+ def get_right_windows_no_squeeze_ws(beg_s1, beg_s2, size_window = nil)
226
+ size_window ||= @size_window
227
+
228
+ # comment out below with the assumption that the end of a document gives significant locational information
229
+ # return if (beg_s1 + @size_ngram > (@s1.length - size_window)) || (beg_s2 + @size_ngram > (@s2.length - size_window))
230
+
231
+ slen = @s1.length
232
+ wbeg = beg_s1 + @size_ngram
233
+ wend = wbeg + size_window
234
+ wend = slen if wend > slen
235
+ window_s1 = @s1[wbeg ... wend]
236
+
237
+ slen = @s2.length
238
+ wbeg = beg_s2 + @size_ngram
239
+ wend = wbeg + size_window
240
+ wend = slen if wend > slen
241
+ window_s2 = @s2[wbeg ... wend]
242
+
243
+ [window_s1, window_s2]
244
+ end
245
+
198
246
  def text_similarity(str1, str2, ngram_order = 2)
199
247
  return 0 if str1.nil? || str2.nil?
200
248
  String::Similarity.cosine(str1, str2, ngram:ngram_order)
@@ -1,3 +1,5 @@
1
+ require 'strscan'
2
+
1
3
  module TextAlignment; end unless defined? TextAlignment
2
4
 
3
5
  TextAlignment::CHAR_MAPPING = [
@@ -80,8 +82,16 @@ TextAlignment::CHAR_MAPPING = [
80
82
  class TextAlignment::CharMapping
81
83
  attr_reader :mapped_text
82
84
 
83
- def initialize(_text, char_mapping = nil)
84
- char_mapping ||= TextAlignment::CHAR_MAPPING
85
+ def initialize(_text, char_mapping = nil, squeeze_ws_to = 1)
86
+ if squeeze_ws_to == 0
87
+ @method_get_positions_squeeze_ws = method(:get_positions_squeeze_ws_0)
88
+ @method_squeeze_ws = method(:squeeze_ws_0!)
89
+ else
90
+ @method_get_positions_squeeze_ws = method(:get_positions_squeeze_ws_1)
91
+ @method_squeeze_ws = method(:squeeze_ws_1!)
92
+ end
93
+
94
+ char_mapping ||= TextAlignment::CHAR_MAPPING.sort{|a, b| b[1].length <=> a[1].length}
85
95
  @mapped_text, offset_mapping = enmap_text(_text, char_mapping)
86
96
  @index_enmap = offset_mapping.to_h
87
97
  @index_demap = offset_mapping.map{|m| m.reverse}.to_h
@@ -105,22 +115,22 @@ class TextAlignment::CharMapping
105
115
 
106
116
  private
107
117
 
108
- def enmap_text(_text, char_mapping)
118
+ def enmap_text(_text, char_mapping, no_ws = false)
109
119
  text = _text.dup
110
120
 
111
- # To execute the single letter mapping
121
+ # To execute the single letter mapping replacement
112
122
  char_mapping.each do |one, long|
113
123
  text.gsub!(one, long) if long.length == 1
114
124
  end
115
125
 
116
- # To get the (location, length) index for replacements
117
- loc_len = []
126
+ # To get the replacement positions, (position, old_length, new_length), for char mappings
127
+ rpositions = []
118
128
  char_mapping.each do |one, long|
119
129
  next if long.length == 1
120
130
 
121
131
  init_next = 0
122
132
  while loc = text.index(long, init_next)
123
- loc_len << [loc, long.length]
133
+ rpositions << [loc, long.length, 1]
124
134
  init_next = loc + long.length
125
135
  end
126
136
 
@@ -128,32 +138,31 @@ class TextAlignment::CharMapping
128
138
  text.gsub!(long, one * long.length)
129
139
  end
130
140
 
131
- # To get the (location, length) index for consecutive whitespace sequences
132
- init_next = 0
133
- while loc = text.index(/\s{2,}/, init_next)
134
- len = $~[0].length
135
- loc_len << [loc, len]
136
- init_next = loc + len
137
- end
141
+ # To get the replacement positions, (position, old_length, new_length), for consecutive whitespaces
142
+ rpositions += @method_get_positions_squeeze_ws.call(text)
138
143
 
139
- loc_len.sort!{|a, b| a[0] <=> b[0]}
144
+ rpositions.sort!{|a, b| a[0] <=> b[0]}
140
145
 
141
146
  # To get the offset_mapping before and after replacement
142
147
  offset_mapping = []
143
148
  init_next = 0
144
149
  j = 0
145
150
 
146
- loc_len.each do |loc, len|
151
+ rpositions.each do |loc, old_len, new_len|
147
152
  offset_mapping += (init_next .. loc).map do |i|
153
+ m = [i, j]
148
154
  j += 1
149
- [i, j - 1]
155
+ m
150
156
  end
151
- init_next = loc + len
157
+
158
+ init_next = loc + old_len
159
+ j += (new_len - 1)
152
160
  end
153
161
 
154
162
  offset_mapping += (init_next .. text.length).map do |i|
163
+ m = [i, j]
155
164
  j += 1
156
- [i, j - 1]
165
+ m
157
166
  end
158
167
 
159
168
  # To execute the long letter mapping
@@ -162,10 +171,41 @@ class TextAlignment::CharMapping
162
171
  end
163
172
 
164
173
  # To squeeze whitespace sequences (to a single space, or to nothing when squeeze_ws_to is 0)
165
- text.gsub!(/\s{2,}/, ' ')
174
+ @method_squeeze_ws.call(text)
166
175
 
167
176
  [text, offset_mapping]
168
177
  end
178
+
179
+ # To get squeeze positions of whitespaces to one
180
+ def get_positions_squeeze_ws_1(text)
181
+ rpositions = []
182
+ text.scan(/s{2,}/) do |s|
183
+ loc = $~.begin(0)
184
+ len = $~.end(0) - loc
185
+ rpositions << [loc, len, 1]
186
+ end
187
+ rpositions
188
+ end
189
+
190
+ # To get squeeze positions of whitespaces to zero
191
+ def get_positions_squeeze_ws_0(text)
192
+ rpositions = []
193
+ text.scan(/\s+/) do |s|
194
+ loc = $~.begin(0)
195
+ len = $~.end(0) - loc
196
+ rpositions << [loc, len, 0]
197
+ end
198
+ rpositions
199
+ end
200
+
201
+ def squeeze_ws_1!(text)
202
+ text.gsub!(/\s{2,}/, ' ')
203
+ end
204
+
205
+ def squeeze_ws_0!(text)
206
+ text.gsub!(/\s+/, '')
207
+ end
208
+
169
209
  end
170
210
 
171
211
  if __FILE__ == $0
@@ -186,5 +226,5 @@ if __FILE__ == $0
186
226
  denotations_mapped = text_mapping.enmap_denotations(denotations)
187
227
  new_annotations = {text:text_mapped, denotations:denotations_mapped}
188
228
 
189
- puts new_annotations.to_json
229
+ # puts new_annotations.to_json
190
230
  end
@@ -41,7 +41,7 @@ class TextAlignment::CultivationMap
41
41
  end
42
42
 
43
43
  def next_cultivated_position(position)
44
- region = @map.bsearch{|r| position < r[0]}
44
+ region = @map.bsearch{|r| position <= r[0]}
45
45
  region.nil? ? nil : region[0]
46
46
  end
47
47
 
@@ -147,13 +147,24 @@ class TextAlignment::MixedAlignment
147
147
  # recoverability
148
148
  count_nws = sdiff.count{|d| d.old_element =~ /\S/}
149
149
  count_nws_match = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/}
150
-
151
150
  coverage = count_nws_match.to_f / count_nws
152
151
 
153
152
  # fragmentation rate
154
- count_ofrag = sdiff.count{|d| d.old_element =~ /\s/}
155
- count_frag = sdiff.collect{|d| (d.action == '=') && (d.old_element =~/\s/) ? ' ' : d.action}.join.scan(/=+/).count
156
- rate_frag = count_ofrag.to_f / count_frag
153
+ frag_str = sdiff.collect do |d|
154
+ case d.action
155
+ when '='
156
+ '='
157
+ when '-'
158
+ ''
159
+ when '+'
160
+ (d.new_element =~ /\S/) ? '+' : ''
161
+ else
162
+ ''
163
+ end
164
+ end.join.sub(/^[^=]++/, '').sub(/[^=]+$/, '')
165
+
166
+ count_frag = frag_str.scan(/=+/).count
167
+ rate_frag = 1.0 / count_frag
157
168
 
158
169
  similarity = coverage * rate_frag
159
170
  end
@@ -11,14 +11,17 @@ class TextAlignment::TextAlignment
11
11
  attr_reader :similarity
12
12
  attr_reader :lost_annotations
13
13
 
14
- # Initialize with a reference text, again which texts will be aligned
15
- def initialize(reference_text, to_prevent_overlap = false)
14
+ # Initialize with a reference text, against which texts will be aligned
15
+ def initialize(reference_text, options = {})
16
16
  raise ArgumentError, "nil text" if reference_text.nil?
17
17
 
18
+ options ||= {}
19
+ @to_prevent_overlap = options[:to_prevent_overlap] || false
20
+ @squeeze_ws_to = options[:squeeze_ws_to] || 0
21
+
18
22
  @original_reference_text = reference_text
19
- @rtext_mapping = TextAlignment::CharMapping.new(reference_text)
23
+ @rtext_mapping = TextAlignment::CharMapping.new(reference_text, nil, @squeeze_ws_to)
20
24
  @mapped_reference_text = @rtext_mapping.mapped_text
21
- @to_prevent_overlap = to_prevent_overlap
22
25
 
23
26
  @original_text = nil
24
27
  @blocks = nil
@@ -32,7 +35,7 @@ class TextAlignment::TextAlignment
32
35
  # In case the input text is the same as the previous one, reuse the previous text mapping
33
36
  unless @original_text && @original_text == text
34
37
  @original_text = text
35
- @text_mapping = TextAlignment::CharMapping.new(text)
38
+ @text_mapping = TextAlignment::CharMapping.new(text, nil, @squeeze_ws_to)
36
39
  end
37
40
 
38
41
  @mapped_text = @text_mapping.mapped_text
@@ -202,7 +205,7 @@ class TextAlignment::TextAlignment
202
205
 
203
206
  def find_block_alignment(str1, str2, denotations, cultivation_map)
204
207
  ## to find block alignments
205
- anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, cultivation_map)
208
+ anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, cultivation_map, @squeeze_ws_to == 1)
206
209
 
207
210
  blocks = []
208
211
  while block = anchor_finder.get_next_anchor
@@ -320,7 +323,7 @@ class TextAlignment::TextAlignment
320
323
 
321
324
  def local_alignment(str1, b1, e1, str2, b2, e2, denotations = nil, cultivation_map)
322
325
  tblocks = term_based_alignment(str1, b1, e1, str2, b2, e2, denotations, cultivation_map)
323
- if tblocks.empty?
326
+ if tblocks.empty? || tblocks.first[:alignment] == :empty
324
327
  lcs_alignment(str1, b1, e1, str2, b2, e2, cultivation_map)
325
328
  else
326
329
  tblocks
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.11.2'
2
+ VERSION = '0.11.8'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.11.2
4
+ version: 0.11.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-03-14 00:00:00.000000000 Z
11
+ date: 2021-04-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary