text_alignment 0.11.7 → 0.11.8

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 52ad7c8b5822308ae15153e35626e13b5ec60d77428c80cbe3f1dd69f36edac1
4
- data.tar.gz: efbcbbea9eb87606b1614661187356e15f65965b85c30ebf7d349d2f7e1028b0
3
+ metadata.gz: d07269e998620f3e8f1564a4b81d3710e1898058248377224be0c5398690872f
4
+ data.tar.gz: 7f457820b2d5a9a9dcbf00ceb89342ca7264e2ef85afd02bd2463256e71680dd
5
5
  SHA512:
6
- metadata.gz: 59a2a8dd68066271b61b950c18f513b03d2b674e86c0d6d065e265367e079c99bfa973cb41c75bdc2f2d19443a9b6f8e46dcca27bb71eecd27ed00d00c07533e
7
- data.tar.gz: 4779c1cbe899021125f56204d87462a8c1222c961f547e1ddc9ba4c46e29211da564bdc6492346baaa16ad4eaaedce4601f6afdcc0a16031a29be79f624c4935
6
+ metadata.gz: 299320641fb58973b89c64f1db0c79007f43103cb063bd171f386d9118cb49257765d0652024bbd7869daa88c4f18da7338b8d60285f3e0a95258049b9e62562
7
+ data.tar.gz: 3bdd474509910999521c52808a535227dbf5a6aa74345e39dc0e37eae7a000b51b1df3dbb50598abc865a036a4990cee0f3ccce35471d69cb2ef4bb9633fc83a
@@ -6,7 +6,13 @@ module TextAlignment; end unless defined? TextAlignment
6
6
 
7
7
  class TextAlignment::AnchorFinder
8
8
 
9
- def initialize(source_str, target_str, cultivation_map)
9
+ def initialize(source_str, target_str, cultivation_map, squeeze_ws = true)
10
+ @method_get_left_windows, @method_get_right_windows = if squeeze_ws
11
+ [method(:get_left_windows), method(:get_right_windows)]
12
+ else
13
+ [method(:get_left_windows_no_squeeze_ws), method(:get_right_windows_no_squeeze_ws)]
14
+ end
15
+
10
16
  @s1 = source_str.downcase
11
17
  @s2 = target_str.downcase
12
18
 
@@ -108,14 +114,14 @@ class TextAlignment::AnchorFinder
108
114
  next
109
115
  end
110
116
 
111
- left_window_s1, left_window_s2 = get_left_windows(beg_s1, beg_s2, size_window)
117
+ left_window_s1, left_window_s2 = @method_get_left_windows.call(beg_s1, beg_s2, size_window)
112
118
  if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
113
119
  break unless valid_beg_s2.nil?
114
120
  valid_beg_s2 = beg_s2
115
121
  next
116
122
  end
117
123
 
118
- right_window_s1, right_window_s2 = get_right_windows(beg_s1, beg_s2, size_window)
124
+ right_window_s1, right_window_s2 = @method_get_right_windows.call(beg_s1, beg_s2, size_window)
119
125
  if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
120
126
  break unless valid_beg_s2.nil?
121
127
  valid_beg_s2 = beg_s2
@@ -139,7 +145,7 @@ class TextAlignment::AnchorFinder
139
145
  size_window ||= @size_window
140
146
 
141
147
  # The check below is commented out on the assumption that the beginning of a document gives significant locational information
142
- # return if @beg_s1 < size_window || @beg_s2 < size_window
148
+ # return if beg_s1 < size_window || beg_s2 < size_window
143
149
 
144
150
  window_s1 = ''
145
151
  loc = beg_s1 - 1
@@ -170,7 +176,7 @@ class TextAlignment::AnchorFinder
170
176
  size_window ||= @size_window
171
177
 
172
178
  # The check below is commented out on the assumption that the end of a document gives significant locational information
173
- # return if (@beg_s1 + @size_ngram > (@s1.length - size_window)) || (@beg_s2 + @size_ngram > (@s2.length - size_window))
179
+ # return if (beg_s1 + @size_ngram > (@s1.length - size_window)) || (beg_s2 + @size_ngram > (@s2.length - size_window))
174
180
 
175
181
  window_s1 = ''
176
182
  loc = beg_s1 + @size_ngram
@@ -199,6 +205,44 @@ class TextAlignment::AnchorFinder
199
205
  [window_s1, window_s2]
200
206
  end
201
207
 
208
+ def get_left_windows_no_squeeze_ws(beg_s1, beg_s2, size_window = nil)
209
+ size_window ||= @size_window
210
+
211
+ # The check below is commented out on the assumption that the beginning of a document gives significant locational information
212
+ # return if beg_s1 < size_window || beg_s2 < size_window
213
+
214
+ wbeg = beg_s1 - size_window
215
+ wbeg = 0 if wbeg < 0
216
+ window_s1 = @s1[wbeg ... beg_s1]
217
+
218
+ wbeg = beg_s2 - size_window
219
+ wbeg = 0 if wbeg < 0
220
+ window_s2 = @s2[wbeg ... beg_s2]
221
+
222
+ [window_s1, window_s2]
223
+ end
224
+
225
+ def get_right_windows_no_squeeze_ws(beg_s1, beg_s2, size_window = nil)
226
+ size_window ||= @size_window
227
+
228
+ # The check below is commented out on the assumption that the end of a document gives significant locational information
229
+ # return if (beg_s1 + @size_ngram > (@s1.length - size_window)) || (beg_s2 + @size_ngram > (@s2.length - size_window))
230
+
231
+ slen = @s1.length
232
+ wbeg = beg_s1 + @size_ngram
233
+ wend = wbeg + size_window
234
+ wend = slen if wend > slen
235
+ window_s1 = @s1[wbeg ... wend]
236
+
237
+ slen = @s2.length
238
+ wbeg = beg_s2 + @size_ngram
239
+ wend = wbeg + size_window
240
+ wend = slen if wend > slen
241
+ window_s2 = @s2[wbeg ... wend]
242
+
243
+ [window_s1, window_s2]
244
+ end
245
+
202
246
  def text_similarity(str1, str2, ngram_order = 2)
203
247
  return 0 if str1.nil? || str2.nil?
204
248
  String::Similarity.cosine(str1, str2, ngram:ngram_order)
@@ -1,3 +1,5 @@
1
+ require 'strscan'
2
+
1
3
  module TextAlignment; end unless defined? TextAlignment
2
4
 
3
5
  TextAlignment::CHAR_MAPPING = [
@@ -80,8 +82,16 @@ TextAlignment::CHAR_MAPPING = [
80
82
  class TextAlignment::CharMapping
81
83
  attr_reader :mapped_text
82
84
 
83
- def initialize(_text, char_mapping = nil)
84
- char_mapping ||= TextAlignment::CHAR_MAPPING
85
+ def initialize(_text, char_mapping = nil, squeeze_ws_to = 1)
86
+ if squeeze_ws_to == 0
87
+ @method_get_positions_squeeze_ws = method(:get_positions_squeeze_ws_0)
88
+ @method_squeeze_ws = method(:squeeze_ws_0!)
89
+ else
90
+ @method_get_positions_squeeze_ws = method(:get_positions_squeeze_ws_1)
91
+ @method_squeeze_ws = method(:squeeze_ws_1!)
92
+ end
93
+
94
+ char_mapping ||= TextAlignment::CHAR_MAPPING.sort{|a, b| b[1].length <=> a[1].length}
85
95
  @mapped_text, offset_mapping = enmap_text(_text, char_mapping)
86
96
  @index_enmap = offset_mapping.to_h
87
97
  @index_demap = offset_mapping.map{|m| m.reverse}.to_h
@@ -105,7 +115,7 @@ class TextAlignment::CharMapping
105
115
 
106
116
  private
107
117
 
108
- def enmap_text(_text, char_mapping)
118
+ def enmap_text(_text, char_mapping, no_ws = false)
109
119
  text = _text.dup
110
120
 
111
121
  # To execute the single letter mapping replacement
@@ -113,14 +123,14 @@ class TextAlignment::CharMapping
113
123
  text.gsub!(one, long) if long.length == 1
114
124
  end
115
125
 
116
- # To get the (location, length) index for replacements
117
- loc_len = []
126
+ # To get the replacement positions, (position, old_length, new_length), for char mappings
127
+ rpositions = []
118
128
  char_mapping.each do |one, long|
119
129
  next if long.length == 1
120
130
 
121
131
  init_next = 0
122
132
  while loc = text.index(long, init_next)
123
- loc_len << [loc, long.length, 1]
133
+ rpositions << [loc, long.length, 1]
124
134
  init_next = loc + long.length
125
135
  end
126
136
 
@@ -128,22 +138,17 @@ class TextAlignment::CharMapping
128
138
  text.gsub!(long, one * long.length)
129
139
  end
130
140
 
131
- # To get the (location, length) index for consecutive whitespace sequences
132
- init_next = 0
133
- while loc = text.index(/\s{1,}/, init_next)
134
- len = $~[0].length
135
- loc_len << [loc, len, 0]
136
- init_next = loc + len
137
- end
141
+ # To get the replacement positions, (position, old_length, new_length), for consecutive whitespaces
142
+ rpositions += @method_get_positions_squeeze_ws.call(text)
138
143
 
139
- loc_len.sort!{|a, b| a[0] <=> b[0]}
144
+ rpositions.sort!{|a, b| a[0] <=> b[0]}
140
145
 
141
146
  # To get the offset_mapping before and after replacement
142
147
  offset_mapping = []
143
148
  init_next = 0
144
149
  j = 0
145
150
 
146
- loc_len.each do |loc, old_len, new_len|
151
+ rpositions.each do |loc, old_len, new_len|
147
152
  offset_mapping += (init_next .. loc).map do |i|
148
153
  m = [i, j]
149
154
  j += 1
@@ -166,10 +171,41 @@ class TextAlignment::CharMapping
166
171
  end
167
172
 
168
173
  # To replace multi whitespace sequences to a space
169
- text.gsub!(/\s{1,}/, '')
174
+ @method_squeeze_ws.call(text)
170
175
 
171
176
  [text, offset_mapping]
172
177
  end
178
+
179
+ # To get squeeze positions of whitespaces to one
180
+ def get_positions_squeeze_ws_1(text)
181
+ rpositions = []
182
+ text.scan(/s{2,}/) do |s|
183
+ loc = $~.begin(0)
184
+ len = $~.end(0) - loc
185
+ rpositions << [loc, len, 1]
186
+ end
187
+ rpositions
188
+ end
189
+
190
+ # To get squeeze positions of whitespaces to zero
191
+ def get_positions_squeeze_ws_0(text)
192
+ rpositions = []
193
+ text.scan(/\s+/) do |s|
194
+ loc = $~.begin(0)
195
+ len = $~.end(0) - loc
196
+ rpositions << [loc, len, 0]
197
+ end
198
+ rpositions
199
+ end
200
+
201
+ def squeeze_ws_1!(text)
202
+ text.gsub!(/\s{2,}/, ' ')
203
+ end
204
+
205
+ def squeeze_ws_0!(text)
206
+ text.gsub!(/\s+/, '')
207
+ end
208
+
173
209
  end
174
210
 
175
211
  if __FILE__ == $0
@@ -190,5 +226,5 @@ if __FILE__ == $0
190
226
  denotations_mapped = text_mapping.enmap_denotations(denotations)
191
227
  new_annotations = {text:text_mapped, denotations:denotations_mapped}
192
228
 
193
- puts new_annotations.to_json
229
+ # puts new_annotations.to_json
194
230
  end
@@ -157,8 +157,7 @@ class TextAlignment::MixedAlignment
157
157
  when '-'
158
158
  ''
159
159
  when '+'
160
- # (d.new_element =~ /\S/) ? '+' : ''
161
- '+'
160
+ (d.new_element =~ /\S/) ? '+' : ''
162
161
  else
163
162
  ''
164
163
  end
@@ -11,14 +11,17 @@ class TextAlignment::TextAlignment
11
11
  attr_reader :similarity
12
12
  attr_reader :lost_annotations
13
13
 
14
- # Initialize with a reference text, again which texts will be aligned
15
- def initialize(reference_text, to_prevent_overlap = false)
14
+ # Initialize with a reference text, against which texts will be aligned
15
+ def initialize(reference_text, options = {})
16
16
  raise ArgumentError, "nil text" if reference_text.nil?
17
17
 
18
+ options ||= {}
19
+ @to_prevent_overlap = options[:to_prevent_overlap] || false
20
+ @squeeze_ws_to = options[:squeeze_ws_to] || 0
21
+
18
22
  @original_reference_text = reference_text
19
- @rtext_mapping = TextAlignment::CharMapping.new(reference_text)
23
+ @rtext_mapping = TextAlignment::CharMapping.new(reference_text, nil, @squeeze_ws_to)
20
24
  @mapped_reference_text = @rtext_mapping.mapped_text
21
- @to_prevent_overlap = to_prevent_overlap
22
25
 
23
26
  @original_text = nil
24
27
  @blocks = nil
@@ -32,7 +35,7 @@ class TextAlignment::TextAlignment
32
35
  # In case the input text is the same as the previous one, reuse the previous text mapping
33
36
  unless @original_text && @original_text == text
34
37
  @original_text = text
35
- @text_mapping = TextAlignment::CharMapping.new(text)
38
+ @text_mapping = TextAlignment::CharMapping.new(text, nil, @squeeze_ws_to)
36
39
  end
37
40
 
38
41
  @mapped_text = @text_mapping.mapped_text
@@ -202,7 +205,7 @@ class TextAlignment::TextAlignment
202
205
 
203
206
  def find_block_alignment(str1, str2, denotations, cultivation_map)
204
207
  ## to find block alignments
205
- anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, cultivation_map)
208
+ anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, cultivation_map, @squeeze_ws_to == 1)
206
209
 
207
210
  blocks = []
208
211
  while block = anchor_finder.get_next_anchor
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.11.7'
2
+ VERSION = '0.11.8'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.11.7
4
+ version: 0.11.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-03-27 00:00:00.000000000 Z
11
+ date: 2021-04-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary