text_alignment 0.11.7 → 0.11.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 52ad7c8b5822308ae15153e35626e13b5ec60d77428c80cbe3f1dd69f36edac1
4
- data.tar.gz: efbcbbea9eb87606b1614661187356e15f65965b85c30ebf7d349d2f7e1028b0
3
+ metadata.gz: d07269e998620f3e8f1564a4b81d3710e1898058248377224be0c5398690872f
4
+ data.tar.gz: 7f457820b2d5a9a9dcbf00ceb89342ca7264e2ef85afd02bd2463256e71680dd
5
5
  SHA512:
6
- metadata.gz: 59a2a8dd68066271b61b950c18f513b03d2b674e86c0d6d065e265367e079c99bfa973cb41c75bdc2f2d19443a9b6f8e46dcca27bb71eecd27ed00d00c07533e
7
- data.tar.gz: 4779c1cbe899021125f56204d87462a8c1222c961f547e1ddc9ba4c46e29211da564bdc6492346baaa16ad4eaaedce4601f6afdcc0a16031a29be79f624c4935
6
+ metadata.gz: 299320641fb58973b89c64f1db0c79007f43103cb063bd171f386d9118cb49257765d0652024bbd7869daa88c4f18da7338b8d60285f3e0a95258049b9e62562
7
+ data.tar.gz: 3bdd474509910999521c52808a535227dbf5a6aa74345e39dc0e37eae7a000b51b1df3dbb50598abc865a036a4990cee0f3ccce35471d69cb2ef4bb9633fc83a
@@ -6,7 +6,13 @@ module TextAlignment; end unless defined? TextAlignment
6
6
 
7
7
  class TextAlignment::AnchorFinder
8
8
 
9
- def initialize(source_str, target_str, cultivation_map)
9
+ def initialize(source_str, target_str, cultivation_map, squeeze_ws = true)
10
+ @method_get_left_windows, @method_get_right_windows = if squeeze_ws
11
+ [method(:get_left_windows), method(:get_right_windows)]
12
+ else
13
+ [method(:get_left_windows_no_squeeze_ws), method(:get_right_windows_no_squeeze_ws)]
14
+ end
15
+
10
16
  @s1 = source_str.downcase
11
17
  @s2 = target_str.downcase
12
18
 
@@ -108,14 +114,14 @@ class TextAlignment::AnchorFinder
108
114
  next
109
115
  end
110
116
 
111
- left_window_s1, left_window_s2 = get_left_windows(beg_s1, beg_s2, size_window)
117
+ left_window_s1, left_window_s2 = @method_get_left_windows.call(beg_s1, beg_s2, size_window)
112
118
  if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
113
119
  break unless valid_beg_s2.nil?
114
120
  valid_beg_s2 = beg_s2
115
121
  next
116
122
  end
117
123
 
118
- right_window_s1, right_window_s2 = get_right_windows(beg_s1, beg_s2, size_window)
124
+ right_window_s1, right_window_s2 = @method_get_right_windows.call(beg_s1, beg_s2, size_window)
119
125
  if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
120
126
  break unless valid_beg_s2.nil?
121
127
  valid_beg_s2 = beg_s2
@@ -139,7 +145,7 @@ class TextAlignment::AnchorFinder
139
145
  size_window ||= @size_window
140
146
 
141
147
  # comment out below with the assumption that the beginning of a document gives a significant locational information
142
- # return if @beg_s1 < size_window || @beg_s2 < size_window
148
+ # return if beg_s1 < size_window || beg_s2 < size_window
143
149
 
144
150
  window_s1 = ''
145
151
  loc = beg_s1 - 1
@@ -170,7 +176,7 @@ class TextAlignment::AnchorFinder
170
176
  size_window ||= @size_window
171
177
 
172
178
  # comment out below, on the assumption that the end of a document gives significant locational information
173
- # return if (@beg_s1 + @size_ngram > (@s1.length - size_window)) || (@beg_s2 + @size_ngram > (@s2.length - size_window))
179
+ # return if (beg_s1 + @size_ngram > (@s1.length - size_window)) || (beg_s2 + @size_ngram > (@s2.length - size_window))
174
180
 
175
181
  window_s1 = ''
176
182
  loc = beg_s1 + @size_ngram
@@ -199,6 +205,44 @@ class TextAlignment::AnchorFinder
199
205
  [window_s1, window_s2]
200
206
  end
201
207
 
208
+ def get_left_windows_no_squeeze_ws(beg_s1, beg_s2, size_window = nil)
209
+ size_window ||= @size_window
210
+
211
+ # comment out below with the assumption that the beginning of a document gives a significant locational information
212
+ # return if beg_s1 < size_window || beg_s2 < size_window
213
+
214
+ wbeg = beg_s1 - size_window
215
+ wbeg = 0 if wbeg < 0
216
+ window_s1 = @s1[wbeg ... beg_s1]
217
+
218
+ wbeg = beg_s2 - size_window
219
+ wbeg = 0 if wbeg < 0
220
+ window_s2 = @s2[wbeg ... beg_s2]
221
+
222
+ [window_s1, window_s2]
223
+ end
224
+
225
+ def get_right_windows_no_squeeze_ws(beg_s1, beg_s2, size_window = nil)
226
+ size_window ||= @size_window
227
+
228
+ # comment out below, on the assumption that the end of a document gives significant locational information
229
+ # return if (beg_s1 + @size_ngram > (@s1.length - size_window)) || (beg_s2 + @size_ngram > (@s2.length - size_window))
230
+
231
+ slen = @s1.length
232
+ wbeg = beg_s1 + @size_ngram
233
+ wend = wbeg + size_window
234
+ wend = slen if wend > slen
235
+ window_s1 = @s1[wbeg ... wend]
236
+
237
+ slen = @s2.length
238
+ wbeg = beg_s2 + @size_ngram
239
+ wend = wbeg + size_window
240
+ wend = slen if wend > slen
241
+ window_s2 = @s2[wbeg ... wend]
242
+
243
+ [window_s1, window_s2]
244
+ end
245
+
202
246
  def text_similarity(str1, str2, ngram_order = 2)
203
247
  return 0 if str1.nil? || str2.nil?
204
248
  String::Similarity.cosine(str1, str2, ngram:ngram_order)
@@ -1,3 +1,5 @@
1
+ require 'strscan'
2
+
1
3
  module TextAlignment; end unless defined? TextAlignment
2
4
 
3
5
  TextAlignment::CHAR_MAPPING = [
@@ -80,8 +82,16 @@ TextAlignment::CHAR_MAPPING = [
80
82
  class TextAlignment::CharMapping
81
83
  attr_reader :mapped_text
82
84
 
83
- def initialize(_text, char_mapping = nil)
84
- char_mapping ||= TextAlignment::CHAR_MAPPING
85
+ def initialize(_text, char_mapping = nil, squeeze_ws_to = 1)
86
+ if squeeze_ws_to == 0
87
+ @method_get_positions_squeeze_ws = method(:get_positions_squeeze_ws_0)
88
+ @method_squeeze_ws = method(:squeeze_ws_0!)
89
+ else
90
+ @method_get_positions_squeeze_ws = method(:get_positions_squeeze_ws_1)
91
+ @method_squeeze_ws = method(:squeeze_ws_1!)
92
+ end
93
+
94
+ char_mapping ||= TextAlignment::CHAR_MAPPING.sort{|a, b| b[1].length <=> a[1].length}
85
95
  @mapped_text, offset_mapping = enmap_text(_text, char_mapping)
86
96
  @index_enmap = offset_mapping.to_h
87
97
  @index_demap = offset_mapping.map{|m| m.reverse}.to_h
@@ -105,7 +115,7 @@ class TextAlignment::CharMapping
105
115
 
106
116
  private
107
117
 
108
- def enmap_text(_text, char_mapping)
118
+ def enmap_text(_text, char_mapping, no_ws = false)
109
119
  text = _text.dup
110
120
 
111
121
  # To execute the single letter mapping replacement
@@ -113,14 +123,14 @@ class TextAlignment::CharMapping
113
123
  text.gsub!(one, long) if long.length == 1
114
124
  end
115
125
 
116
- # To get the (location, length) index for replacements
117
- loc_len = []
126
+ # To get the replacement positions, (position, old_length, new_length), for char mappings
127
+ rpositions = []
118
128
  char_mapping.each do |one, long|
119
129
  next if long.length == 1
120
130
 
121
131
  init_next = 0
122
132
  while loc = text.index(long, init_next)
123
- loc_len << [loc, long.length, 1]
133
+ rpositions << [loc, long.length, 1]
124
134
  init_next = loc + long.length
125
135
  end
126
136
 
@@ -128,22 +138,17 @@ class TextAlignment::CharMapping
128
138
  text.gsub!(long, one * long.length)
129
139
  end
130
140
 
131
- # To get the (location, length) index for consecutive whitespace sequences
132
- init_next = 0
133
- while loc = text.index(/\s{1,}/, init_next)
134
- len = $~[0].length
135
- loc_len << [loc, len, 0]
136
- init_next = loc + len
137
- end
141
+ # To get the replacement positions, (position, old_length, new_length), for consecutive whitespaces
142
+ rpositions += @method_get_positions_squeeze_ws.call(text)
138
143
 
139
- loc_len.sort!{|a, b| a[0] <=> b[0]}
144
+ rpositions.sort!{|a, b| a[0] <=> b[0]}
140
145
 
141
146
  # To get the offset_mapping before and after replacement
142
147
  offset_mapping = []
143
148
  init_next = 0
144
149
  j = 0
145
150
 
146
- loc_len.each do |loc, old_len, new_len|
151
+ rpositions.each do |loc, old_len, new_len|
147
152
  offset_mapping += (init_next .. loc).map do |i|
148
153
  m = [i, j]
149
154
  j += 1
@@ -166,10 +171,41 @@ class TextAlignment::CharMapping
166
171
  end
167
172
 
168
173
  # To replace multi whitespace sequences to a space
169
- text.gsub!(/\s{1,}/, '')
174
+ @method_squeeze_ws.call(text)
170
175
 
171
176
  [text, offset_mapping]
172
177
  end
178
+
179
+ # To get squeeze positions of whitespaces to one
180
+ def get_positions_squeeze_ws_1(text)
181
+ rpositions = []
182
+ text.scan(/\s{2,}/) do |s|
183
+ loc = $~.begin(0)
184
+ len = $~.end(0) - loc
185
+ rpositions << [loc, len, 1]
186
+ end
187
+ rpositions
188
+ end
189
+
190
+ # To get squeeze positions of whitespaces to zero
191
+ def get_positions_squeeze_ws_0(text)
192
+ rpositions = []
193
+ text.scan(/\s+/) do |s|
194
+ loc = $~.begin(0)
195
+ len = $~.end(0) - loc
196
+ rpositions << [loc, len, 0]
197
+ end
198
+ rpositions
199
+ end
200
+
201
+ def squeeze_ws_1!(text)
202
+ text.gsub!(/\s{2,}/, ' ')
203
+ end
204
+
205
+ def squeeze_ws_0!(text)
206
+ text.gsub!(/\s+/, '')
207
+ end
208
+
173
209
  end
174
210
 
175
211
  if __FILE__ == $0
@@ -190,5 +226,5 @@ if __FILE__ == $0
190
226
  denotations_mapped = text_mapping.enmap_denotations(denotations)
191
227
  new_annotations = {text:text_mapped, denotations:denotations_mapped}
192
228
 
193
- puts new_annotations.to_json
229
+ # puts new_annotations.to_json
194
230
  end
@@ -157,8 +157,7 @@ class TextAlignment::MixedAlignment
157
157
  when '-'
158
158
  ''
159
159
  when '+'
160
- # (d.new_element =~ /\S/) ? '+' : ''
161
- '+'
160
+ (d.new_element =~ /\S/) ? '+' : ''
162
161
  else
163
162
  ''
164
163
  end
@@ -11,14 +11,17 @@ class TextAlignment::TextAlignment
11
11
  attr_reader :similarity
12
12
  attr_reader :lost_annotations
13
13
 
14
- # Initialize with a reference text, again which texts will be aligned
15
- def initialize(reference_text, to_prevent_overlap = false)
14
+ # Initialize with a reference text, against which texts will be aligned
15
+ def initialize(reference_text, options = {})
16
16
  raise ArgumentError, "nil text" if reference_text.nil?
17
17
 
18
+ options ||= {}
19
+ @to_prevent_overlap = options[:to_prevent_overlap] || false
20
+ @squeeze_ws_to = options[:squeeze_ws_to] || 0
21
+
18
22
  @original_reference_text = reference_text
19
- @rtext_mapping = TextAlignment::CharMapping.new(reference_text)
23
+ @rtext_mapping = TextAlignment::CharMapping.new(reference_text, nil, @squeeze_ws_to)
20
24
  @mapped_reference_text = @rtext_mapping.mapped_text
21
- @to_prevent_overlap = to_prevent_overlap
22
25
 
23
26
  @original_text = nil
24
27
  @blocks = nil
@@ -32,7 +35,7 @@ class TextAlignment::TextAlignment
32
35
  # In case the input text is the same as the previous one, reuse the previous text mapping
33
36
  unless @original_text && @original_text == text
34
37
  @original_text = text
35
- @text_mapping = TextAlignment::CharMapping.new(text)
38
+ @text_mapping = TextAlignment::CharMapping.new(text, nil, @squeeze_ws_to)
36
39
  end
37
40
 
38
41
  @mapped_text = @text_mapping.mapped_text
@@ -202,7 +205,7 @@ class TextAlignment::TextAlignment
202
205
 
203
206
  def find_block_alignment(str1, str2, denotations, cultivation_map)
204
207
  ## to find block alignments
205
- anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, cultivation_map)
208
+ anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, cultivation_map, @squeeze_ws_to == 1)
206
209
 
207
210
  blocks = []
208
211
  while block = anchor_finder.get_next_anchor
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.11.7'
2
+ VERSION = '0.11.8'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.11.7
4
+ version: 0.11.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-03-27 00:00:00.000000000 Z
11
+ date: 2021-04-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary