text_alignment 0.11.2 → 0.11.8

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 81994ec9a8c7c08d2aad32b351b5942fef1748e4035ba762af546d4f3fe7cee8
4
- data.tar.gz: bfa75451d33b9d21c2baa1a52280f03486bf10a41b5ac2a97469f0ca3a4f7379
3
+ metadata.gz: d07269e998620f3e8f1564a4b81d3710e1898058248377224be0c5398690872f
4
+ data.tar.gz: 7f457820b2d5a9a9dcbf00ceb89342ca7264e2ef85afd02bd2463256e71680dd
5
5
  SHA512:
6
- metadata.gz: 73d0ff212a89d6ad33751f87f14a9b292b45ce177c61efd0ede5f852eb3834d1bd3940d202c4d87cfb0422c4dc566dea30c560f8500a220dcd3e1dd492e29eac
7
- data.tar.gz: 6c763a564e339267624bec5c809b334b0adf9951d25409eac21ae4b4582beae4a08739119dfa369022c44c19d4faacfd0e06766c6223bd219a43c3961cfab08c
6
+ metadata.gz: 299320641fb58973b89c64f1db0c79007f43103cb063bd171f386d9118cb49257765d0652024bbd7869daa88c4f18da7338b8d60285f3e0a95258049b9e62562
7
+ data.tar.gz: 3bdd474509910999521c52808a535227dbf5a6aa74345e39dc0e37eae7a000b51b1df3dbb50598abc865a036a4990cee0f3ccce35471d69cb2ef4bb9633fc83a
@@ -2,6 +2,7 @@
2
2
  require 'text_alignment'
3
3
  require 'json'
4
4
  require 'pp'
5
+ require 'optparse'
5
6
 
6
7
  def read_annotations(filename)
7
8
  case File.extname(filename)
@@ -108,24 +109,46 @@ def align_mannotations(source_annotations, reference_text, alignment, debug = fa
108
109
  end
109
110
 
110
111
 
112
+ ## Options
113
+ overlap_p = false
114
+ debug_p = false
115
+
116
+ ## command line option processing
117
+ require 'optparse'
118
+ optparse = OptionParser.new do |opts|
119
+ opts.banner = "Usage: align_annotations [options] target_annotations(.json|.txt) reference_text(.json|.txt)"
120
+
121
+ opts.on('-o', '--overlap', 'tells it to assume there may be overlapping texts.') do
122
+ overlap_p = true
123
+ end
124
+
125
+ opts.on('-d', '--debug', 'tells it to show debugging information.') do
126
+ debug_p = true
127
+ end
128
+
129
+ opts.on('-h', '--help', 'displays this screen.') do
130
+ puts opts
131
+ exit
132
+ end
133
+ end
134
+
135
+ optparse.parse!
136
+
111
137
  unless ARGV.length == 2
112
- warn "align_annotations target_annotations(.json|.txt) reference_annotations(.json|.txt)"
113
- exit
138
+ puts optparse.help
139
+ exit 1
114
140
  end
115
141
 
116
142
  source_annotations = read_annotations(ARGV[0])
117
143
  reference_text = read_text(ARGV[1])
118
144
 
119
- alignment = TextAlignment::TextAlignment.new(reference_text, true)
145
# TextAlignment::TextAlignment#initialize takes an options hash as its second
# argument and reads options[:to_prevent_overlap]; passing a bare boolean
# would raise NoMethodError whenever that boolean is true (the default here).
alignment = TextAlignment::TextAlignment.new(reference_text, {to_prevent_overlap: !overlap_p})
120
146
 
121
147
  target_annotations = if source_annotations.class == Array
122
- # align_mannotations(source_annotations, reference_text, alignment, true)
123
- align_mannotations(source_annotations, reference_text, alignment, false)
148
+ align_mannotations(source_annotations, reference_text, alignment, debug_p)
124
149
  else
125
- # denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment)
126
- denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment, false)
150
+ denotations = align_denotations(source_annotations[:denotations], source_annotations[:text], alignment, debug_p)
127
151
  source_annotations.merge({text:reference_text, denotations:denotations})
128
152
  end
129
153
 
130
- # pp alignment.block_alignment
131
154
  # puts target_annotations.to_json
@@ -6,7 +6,13 @@ module TextAlignment; end unless defined? TextAlignment
6
6
 
7
7
  class TextAlignment::AnchorFinder
8
8
 
9
- def initialize(source_str, target_str, cultivation_map)
9
+ def initialize(source_str, target_str, cultivation_map, squeeze_ws = true)
10
+ @method_get_left_windows, @method_get_right_windows = if squeeze_ws
11
+ [method(:get_left_windows), method(:get_right_windows)]
12
+ else
13
+ [method(:get_left_windows_no_squeeze_ws), method(:get_right_windows_no_squeeze_ws)]
14
+ end
15
+
10
16
  @s1 = source_str.downcase
11
17
  @s2 = target_str.downcase
12
18
 
@@ -108,14 +114,14 @@ class TextAlignment::AnchorFinder
108
114
  next
109
115
  end
110
116
 
111
- left_window_s1, left_window_s2 = get_left_windows(beg_s1, beg_s2)
117
+ left_window_s1, left_window_s2 = @method_get_left_windows.call(beg_s1, beg_s2, size_window)
112
118
  if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
113
119
  break unless valid_beg_s2.nil?
114
120
  valid_beg_s2 = beg_s2
115
121
  next
116
122
  end
117
123
 
118
- right_window_s1, right_window_s2 = get_right_windows(beg_s1, beg_s2)
124
+ right_window_s1, right_window_s2 = @method_get_right_windows.call(beg_s1, beg_s2, size_window)
119
125
  if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
120
126
  break unless valid_beg_s2.nil?
121
127
  valid_beg_s2 = beg_s2
@@ -125,7 +131,11 @@ class TextAlignment::AnchorFinder
125
131
 
126
132
  # r == nil means that the inner loop was broken (multiple candidates had passed the tests)
127
133
  # r != nil means that the inner loop was completed (with or w/o a valid beg_s2 found)
128
- break unless r.nil?
134
+ if r.nil?
135
+ valid_beg_s2 = nil
136
+ else
137
+ break
138
+ end
129
139
  end
130
140
 
131
141
  valid_beg_s2
@@ -135,7 +145,7 @@ class TextAlignment::AnchorFinder
135
145
  size_window ||= @size_window
136
146
 
137
147
  # comment out below with the assumption that the beginning of a document gives a significant locational information
138
- # return if @beg_s1 < size_window || @beg_s2 < size_window
148
+ # return if beg_s1 < size_window || beg_s2 < size_window
139
149
 
140
150
  window_s1 = ''
141
151
  loc = beg_s1 - 1
@@ -166,7 +176,7 @@ class TextAlignment::AnchorFinder
166
176
  size_window ||= @size_window
167
177
 
168
178
  # commend below with the assumption that the end of a document gives a significant locational
169
- # return if (@beg_s1 + @size_ngram > (@s1.length - size_window)) || (@beg_s2 + @size_ngram > (@s2.length - size_window))
179
+ # return if (beg_s1 + @size_ngram > (@s1.length - size_window)) || (beg_s2 + @size_ngram > (@s2.length - size_window))
170
180
 
171
181
  window_s1 = ''
172
182
  loc = beg_s1 + @size_ngram
@@ -195,6 +205,44 @@ class TextAlignment::AnchorFinder
195
205
  [window_s1, window_s2]
196
206
  end
197
207
 
208
# Returns the text windows immediately to the left of the given positions,
# taken verbatim (no whitespace squeezing) from @s1 and @s2.
#
# beg_s1, beg_s2 - positions in @s1 and @s2 where the windows end (exclusive).
# size_window    - maximum window length; falls back to @size_window when nil.
#
# Returns [window_s1, window_s2]. A window is shorter than size_window when
# it would otherwise start before the beginning of the string.
def get_left_windows_no_squeeze_ws(beg_s1, beg_s2, size_window = nil)
  span = size_window || @size_window

  # No early return for positions closer than `span` to the start: the
  # beginning of a document is assumed to give significant locational
  # information, so a truncated window is still used.
  [[@s1, beg_s1], [@s2, beg_s2]].map do |str, stop|
    start = [stop - span, 0].max
    str[start...stop]
  end
end
224
+
225
# Returns the text windows immediately to the right of the n-grams starting at
# the given positions, taken verbatim (no whitespace squeezing) from @s1 and @s2.
#
# beg_s1, beg_s2 - positions in @s1 and @s2 where the n-grams begin; each
#                  window starts @size_ngram characters after that position.
# size_window    - maximum window length; falls back to @size_window when nil.
#
# Returns [window_s1, window_s2]. A window is clipped at the end of its string,
# and is nil when its start lies beyond the end of the string.
def get_right_windows_no_squeeze_ws(beg_s1, beg_s2, size_window = nil)
  span = size_window || @size_window

  # No early return for positions closer than `span` to the end: the end of a
  # document is assumed to give significant locational information, so a
  # truncated window is still used.
  [[@s1, beg_s1], [@s2, beg_s2]].map do |str, beg|
    start = beg + @size_ngram
    stop = [start + span, str.length].min
    str[start...stop]
  end
end
245
+
198
246
  def text_similarity(str1, str2, ngram_order = 2)
199
247
  return 0 if str1.nil? || str2.nil?
200
248
  String::Similarity.cosine(str1, str2, ngram:ngram_order)
@@ -1,3 +1,5 @@
1
+ require 'strscan'
2
+
1
3
  module TextAlignment; end unless defined? TextAlignment
2
4
 
3
5
  TextAlignment::CHAR_MAPPING = [
@@ -80,8 +82,16 @@ TextAlignment::CHAR_MAPPING = [
80
82
  class TextAlignment::CharMapping
81
83
  attr_reader :mapped_text
82
84
 
83
- def initialize(_text, char_mapping = nil)
84
- char_mapping ||= TextAlignment::CHAR_MAPPING
85
+ def initialize(_text, char_mapping = nil, squeeze_ws_to = 1)
86
+ if squeeze_ws_to == 0
87
+ @method_get_positions_squeeze_ws = method(:get_positions_squeeze_ws_0)
88
+ @method_squeeze_ws = method(:squeeze_ws_0!)
89
+ else
90
+ @method_get_positions_squeeze_ws = method(:get_positions_squeeze_ws_1)
91
+ @method_squeeze_ws = method(:squeeze_ws_1!)
92
+ end
93
+
94
+ char_mapping ||= TextAlignment::CHAR_MAPPING.sort{|a, b| b[1].length <=> a[1].length}
85
95
  @mapped_text, offset_mapping = enmap_text(_text, char_mapping)
86
96
  @index_enmap = offset_mapping.to_h
87
97
  @index_demap = offset_mapping.map{|m| m.reverse}.to_h
@@ -105,22 +115,22 @@ class TextAlignment::CharMapping
105
115
 
106
116
  private
107
117
 
108
- def enmap_text(_text, char_mapping)
118
+ def enmap_text(_text, char_mapping, no_ws = false)
109
119
  text = _text.dup
110
120
 
111
- # To execute the single letter mapping
121
+ # To execute the single letter mapping replacement
112
122
  char_mapping.each do |one, long|
113
123
  text.gsub!(one, long) if long.length == 1
114
124
  end
115
125
 
116
- # To get the (location, length) index for replacements
117
- loc_len = []
126
+ # To get the replacement positions, (position, old_length, new_length), for char mappings
127
+ rpositions = []
118
128
  char_mapping.each do |one, long|
119
129
  next if long.length == 1
120
130
 
121
131
  init_next = 0
122
132
  while loc = text.index(long, init_next)
123
- loc_len << [loc, long.length]
133
+ rpositions << [loc, long.length, 1]
124
134
  init_next = loc + long.length
125
135
  end
126
136
 
@@ -128,32 +138,31 @@ class TextAlignment::CharMapping
128
138
  text.gsub!(long, one * long.length)
129
139
  end
130
140
 
131
- # To get the (location, length) index for consecutive whitespace sequences
132
- init_next = 0
133
- while loc = text.index(/\s{2,}/, init_next)
134
- len = $~[0].length
135
- loc_len << [loc, len]
136
- init_next = loc + len
137
- end
141
+ # To get the replacement positions, (position, old_length, new_length), for consecutive whitespaces
142
+ rpositions += @method_get_positions_squeeze_ws.call(text)
138
143
 
139
- loc_len.sort!{|a, b| a[0] <=> b[0]}
144
+ rpositions.sort!{|a, b| a[0] <=> b[0]}
140
145
 
141
146
  # To get the offset_mapping before and after replacement
142
147
  offset_mapping = []
143
148
  init_next = 0
144
149
  j = 0
145
150
 
146
- loc_len.each do |loc, len|
151
+ rpositions.each do |loc, old_len, new_len|
147
152
  offset_mapping += (init_next .. loc).map do |i|
153
+ m = [i, j]
148
154
  j += 1
149
- [i, j - 1]
155
+ m
150
156
  end
151
- init_next = loc + len
157
+
158
+ init_next = loc + old_len
159
+ j += (new_len - 1)
152
160
  end
153
161
 
154
162
  offset_mapping += (init_next .. text.length).map do |i|
163
+ m = [i, j]
155
164
  j += 1
156
- [i, j - 1]
165
+ m
157
166
  end
158
167
 
159
168
  # To execute the long letter mapping
@@ -162,10 +171,41 @@ class TextAlignment::CharMapping
162
171
  end
163
172
 
164
173
  # To replace multi whitespace sequences to a space
165
- text.gsub!(/\s{2,}/, ' ')
174
+ @method_squeeze_ws.call(text)
166
175
 
167
176
  [text, offset_mapping]
168
177
  end
178
+
179
# To get squeeze positions of whitespaces to one
#
# Finds every run of two or more whitespace characters in +text+.
#
# Returns an array of [position, old_length, new_length] triples in order of
# appearance, where new_length is always 1.
def get_positions_squeeze_ws_1(text)
  rpositions = []
  # The pattern has to be the same one squeeze_ws_1! substitutes with; without
  # the backslash it would match runs of the letter "s" instead of whitespace.
  text.scan(/\s{2,}/) do
    m = Regexp.last_match
    rpositions << [m.begin(0), m.end(0) - m.begin(0), 1]
  end
  rpositions
end
189
+
190
# To get squeeze positions of whitespaces to zero
#
# Finds every run of one or more whitespace characters in +text+.
#
# Returns an array of [position, old_length, new_length] triples in order of
# appearance, where new_length is always 0.
def get_positions_squeeze_ws_0(text)
  found = []
  cursor = 0
  # Walk the text match by match, resuming right after the previous run.
  while (run = text.match(/\s+/, cursor))
    found << [run.begin(0), run.end(0) - run.begin(0), 0]
    cursor = run.end(0)
  end
  found
end
200
+
201
# Collapses, in place, every run of two or more whitespace characters in
# +text+ into a single space. Returns what String#gsub! returns, i.e. nil when
# nothing was replaced.
def squeeze_ws_1!(text)
  text.gsub!(/\s{2,}/) { ' ' }
end
204
+
205
# Removes, in place, every whitespace character from +text+. Returns what
# String#gsub! returns, i.e. nil when nothing was replaced.
def squeeze_ws_0!(text)
  text.gsub!(/\s+/) { '' }
end
208
+
169
209
  end
170
210
 
171
211
  if __FILE__ == $0
@@ -186,5 +226,5 @@ if __FILE__ == $0
186
226
  denotations_mapped = text_mapping.enmap_denotations(denotations)
187
227
  new_annotations = {text:text_mapped, denotations:denotations_mapped}
188
228
 
189
- puts new_annotations.to_json
229
+ # puts new_annotations.to_json
190
230
  end
@@ -41,7 +41,7 @@ class TextAlignment::CultivationMap
41
41
  end
42
42
 
43
43
  def next_cultivated_position(position)
44
- region = @map.bsearch{|r| position < r[0]}
44
+ region = @map.bsearch{|r| position <= r[0]}
45
45
  region.nil? ? nil : region[0]
46
46
  end
47
47
 
@@ -147,13 +147,24 @@ class TextAlignment::MixedAlignment
147
147
  # recoverbility
148
148
  count_nws = sdiff.count{|d| d.old_element =~ /\S/}
149
149
  count_nws_match = sdiff.count{|d| d.action == '=' && d.old_element =~ /\S/}
150
-
151
150
  coverage = count_nws_match.to_f / count_nws
152
151
 
153
152
  # fragmentation rate
154
- count_ofrag = sdiff.count{|d| d.old_element =~ /\s/}
155
- count_frag = sdiff.collect{|d| (d.action == '=') && (d.old_element =~/\s/) ? ' ' : d.action}.join.scan(/=+/).count
156
- rate_frag = count_ofrag.to_f / count_frag
153
+ frag_str = sdiff.collect do |d|
154
+ case d.action
155
+ when '='
156
+ '='
157
+ when '-'
158
+ ''
159
+ when '+'
160
+ (d.new_element =~ /\S/) ? '+' : ''
161
+ else
162
+ ''
163
+ end
164
+ end.join.sub(/^[^=]++/, '').sub(/[^=]+$/, '')
165
+
166
+ count_frag = frag_str.scan(/=+/).count
167
+ rate_frag = 1.0 / count_frag
157
168
 
158
169
  similarity = coverage * rate_frag
159
170
  end
@@ -11,14 +11,17 @@ class TextAlignment::TextAlignment
11
11
  attr_reader :similarity
12
12
  attr_reader :lost_annotations
13
13
 
14
- # Initialize with a reference text, again which texts will be aligned
15
- def initialize(reference_text, to_prevent_overlap = false)
14
+ # Initialize with a reference text, against which texts will be aligned
15
+ def initialize(reference_text, options = {})
16
16
  raise ArgumentError, "nil text" if reference_text.nil?
17
17
 
18
+ options ||= {}
19
+ @to_prevent_overlap = options[:to_prevent_overlap] || false
20
+ @squeeze_ws_to = options[:squeeze_ws_to] || 0
21
+
18
22
  @original_reference_text = reference_text
19
- @rtext_mapping = TextAlignment::CharMapping.new(reference_text)
23
+ @rtext_mapping = TextAlignment::CharMapping.new(reference_text, nil, @squeeze_ws_to)
20
24
  @mapped_reference_text = @rtext_mapping.mapped_text
21
- @to_prevent_overlap = to_prevent_overlap
22
25
 
23
26
  @original_text = nil
24
27
  @blocks = nil
@@ -32,7 +35,7 @@ class TextAlignment::TextAlignment
32
35
  # In case the input text is the same as the previous one, reuse the previous text mapping
33
36
  unless @original_text && @original_text == text
34
37
  @original_text = text
35
- @text_mapping = TextAlignment::CharMapping.new(text)
38
+ @text_mapping = TextAlignment::CharMapping.new(text, nil, @squeeze_ws_to)
36
39
  end
37
40
 
38
41
  @mapped_text = @text_mapping.mapped_text
@@ -202,7 +205,7 @@ class TextAlignment::TextAlignment
202
205
 
203
206
  def find_block_alignment(str1, str2, denotations, cultivation_map)
204
207
  ## to find block alignments
205
- anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, cultivation_map)
208
+ anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, cultivation_map, @squeeze_ws_to == 1)
206
209
 
207
210
  blocks = []
208
211
  while block = anchor_finder.get_next_anchor
@@ -320,7 +323,7 @@ class TextAlignment::TextAlignment
320
323
 
321
324
  def local_alignment(str1, b1, e1, str2, b2, e2, denotations = nil, cultivation_map)
322
325
  tblocks = term_based_alignment(str1, b1, e1, str2, b2, e2, denotations, cultivation_map)
323
- if tblocks.empty?
326
+ if tblocks.empty? || tblocks.first[:alignment] == :empty
324
327
  lcs_alignment(str1, b1, e1, str2, b2, e2, cultivation_map)
325
328
  else
326
329
  tblocks
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.11.2'
2
+ VERSION = '0.11.8'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.11.2
4
+ version: 0.11.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-03-14 00:00:00.000000000 Z
11
+ date: 2021-04-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary