text_alignment 0.4.2 → 0.4.3

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d56b44ee3cc3451b78ddb87134fefd6c043b4730f1170b07069a417bb162bad1
4
- data.tar.gz: 67318d70bee5bb82127c98e79e0b45e5e337832a2b8d23469474215168579f20
3
+ metadata.gz: 52bc948955e2df858e397b14eabb4411f73b3ff1e4d879ff4b7015d3b5e03308
4
+ data.tar.gz: fd20caec51c95bdc475e0698a52bb7fdebc9e22c43bb47267a883bcc75862268
5
5
  SHA512:
6
- metadata.gz: 60ae2292556d15d860e0c7377be9c1fd5aec6741b26f562fee8c0c2a9a123d0b8c656b2f7f34b8f74f83ea9ac4878b53f17454e3b25d27d00cd98ecf8175a387
7
- data.tar.gz: fe8fe545101cddb638c00ffb6f6785f8b2f5ddb109b4010739e399a2c50fe41164d791fe4a5599a76fca5553c494f78fd49630368d48e406a8211fc0516916f5
6
+ metadata.gz: dbcb7ab70a64d4a398a5c5761cc5b2f5de6835ccc0e2d0854556f03ef91d0c0294986cc2ff1273788e6b7b0c73dfdf86fd16ee1ef8ce35ecc11d61f8eaab9521
7
+ data.tar.gz: 01d21cdcc0ab81d61e08ff1f52360ba35973756fd5060ce866391ff622d4cf87da945dba43a622a0581d63ee96c8723e1cb28991bfa02f4e1e803896bdc64d7f
@@ -4,4 +4,4 @@ TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
4
4
  TextAlignment::SIZE_WINDOW = 60 unless defined? TextAlignment::SIZE_WINDOW
5
5
  TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
6
6
  TextAlignment::BUFFER_MIN = 20 unless defined? TextAlignment::BUFFER_MIN
7
- TextAlignment::TEXT_SIMILARITY_THRESHOLD = 0.87 unless defined? TextAlignment::TEXT_SIMILARITY_THRESHOLD
7
+ TextAlignment::TEXT_SIMILARITY_THRESHOLD = 0.9 unless defined? TextAlignment::TEXT_SIMILARITY_THRESHOLD
@@ -10,8 +10,6 @@ require 'text_alignment/mappings'
10
10
 
11
11
  module TextAlignment; end unless defined? TextAlignment
12
12
 
13
- TextAlignment::NOMATCH_CHARS = "@^|#$%&_" unless defined? TextAlignment::NOMATCH_CHARS
14
-
15
13
  class TextAlignment::MixedAlignment
16
14
  attr_reader :sdiff
17
15
  attr_reader :position_map_begin, :position_map_end
@@ -21,58 +19,7 @@ class TextAlignment::MixedAlignment
21
19
 
22
20
  def initialize(str1, str2, mappings = [])
23
21
  raise ArgumentError, "nil string" if str1.nil? || str2.nil?
24
- raise ArgumentError, "nil mappings" if mappings.nil?
25
-
26
- ## preprocessing
27
- str1 = str1.dup
28
- str2 = str2.dup
29
- mappings = mappings.dup
30
-
31
- ## find the first nomatch character
32
- TextAlignment::NOMATCH_CHARS.each_char do |c|
33
- if str2.index(c).nil?
34
- @nomatch_char1 = c
35
- break
36
- end
37
- end
38
- raise RuntimeError, "Cannot find nomatch character" if @nomatch_char1.nil?
39
-
40
- ## find the first nomatch character
41
- TextAlignment::NOMATCH_CHARS.each_char do |c|
42
- if c != @nomatch_char1 && str1.index(c).nil?
43
- @nomatch_char2 = c
44
- break
45
- end
46
- end
47
- raise RuntimeError, "Cannot find nomatch character" if @nomatch_char2.nil?
48
-
49
- # single character mappings
50
- character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
51
- characters_from = character_mappings.collect{|m| m[0]}.join
52
- characters_to = character_mappings.collect{|m| m[1]}.join
53
- characters_to.gsub!(/-/, '\-')
54
-
55
- str1.tr!(characters_from, characters_to)
56
- str2.tr!(characters_from, characters_to)
57
-
58
- mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
59
-
60
- # ASCII foldings
61
- ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
62
- ascii_foldings.each do |f|
63
- from = f[1]
64
-
65
- if str2.index(f[0])
66
- to = f[0] + (@nomatch_char1 * (f[1].length - 1))
67
- str1.gsub!(from, to)
68
- end
69
-
70
- if str1.index(f[0])
71
- to = f[0] + (@nomatch_char2 * (f[1].length - 1))
72
- str2.gsub!(from, to)
73
- end
74
- end
75
- mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
22
+ mappings ||= []
76
23
 
77
24
  _compute_mixed_alignment(str1, str2, mappings)
78
25
  end
@@ -5,19 +5,17 @@ require 'text_alignment/mixed_alignment'
5
5
 
6
6
  module TextAlignment; end unless defined? TextAlignment
7
7
 
8
+ TextAlignment::PADDING_LETTERS = ['@', '^', '|', '#', '$', '%', '&', '_'] unless defined? TextAlignment::PADDING_LETTERS
9
+
8
10
  class TextAlignment::TextAlignment
9
11
  attr_reader :block_alignments
10
12
  attr_reader :similarity
11
13
  attr_reader :lost_annotations
12
14
 
13
- def initialize(str1, str2, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
14
- raise ArgumentError, "nil string" if str1.nil? || str2.nil?
15
-
16
- size_ngram = _size_ngram || TextAlignment::SIZE_NGRAM
17
- size_window = _size_window || TextAlignment::SIZE_WINDOW
18
- sim_threshold = _text_similiarity_threshold || TextAlignment::TEXT_SIMILARITY_THRESHOLD
15
+ def initialize(_str1, _str2, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
16
+ raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
19
17
 
20
- mappings ||= TextAlignment::MAPPINGS
18
+ str1, str2, mappings = string_preprocessing(_str1, _str2)
21
19
 
22
20
  # try exact match
23
21
  block_begin = str2.index(str1)
@@ -26,7 +24,7 @@ class TextAlignment::TextAlignment
26
24
  return @block_alignments
27
25
  end
28
26
 
29
- anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, size_ngram, size_window, sim_threshold)
27
+ anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
30
28
 
31
29
  # To collect matched blocks
32
30
  mblocks = []
@@ -232,4 +230,60 @@ class TextAlignment::TextAlignment
232
230
  r
233
231
  end
234
232
 
233
+ private
234
+
235
+
236
+ def string_preprocessing(_str1, _str2)
237
+ str1 = _str1.dup
238
+ str2 = _str2.dup
239
+ mappings = TextAlignment::MAPPINGS.dup
240
+
241
+ ## single character mappings
242
+ character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
243
+ characters_from = character_mappings.collect{|m| m[0]}.join
244
+ characters_to = character_mappings.collect{|m| m[1]}.join
245
+ characters_to.gsub!(/-/, '\-')
246
+
247
+ str1.tr!(characters_from, characters_to)
248
+ str2.tr!(characters_from, characters_to)
249
+
250
+ mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
251
+
252
+ ## long to one character mappings
253
+ pletters = TextAlignment::PADDING_LETTERS
254
+
255
+ # find the padding letter for str1
256
+ padding_letter1 = begin
257
+ i = pletters.index{|l| str2.index(l).nil?}
258
+ raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
259
+ TextAlignment::PADDING_LETTERS[i]
260
+ end
261
+
262
+ # find the padding letter for str2
263
+ padding_letter2 = begin
264
+ i = pletters.index{|l| l != padding_letter1 && str1.index(l).nil?}
265
+ raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
266
+ TextAlignment::PADDING_LETTERS[i]
267
+ end
268
+
269
+ # ASCII foldings
270
+ ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
271
+ ascii_foldings.each do |f|
272
+ from = f[1]
273
+
274
+ if str2.index(f[0])
275
+ to = f[0] + (padding_letter1 * (f[1].length - 1))
276
+ str1.gsub!(from, to)
277
+ end
278
+
279
+ if str1.index(f[0])
280
+ to = f[0] + (padding_letter2 * (f[1].length - 1))
281
+ str2.gsub!(from, to)
282
+ end
283
+ end
284
+ mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
285
+
286
+ [str1, str2, mappings]
287
+ end
288
+
235
289
  end
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.4.2'
2
+ VERSION = '0.4.3'
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.2
4
+ version: 0.4.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim