text_alignment 0.4.2 → 0.4.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d56b44ee3cc3451b78ddb87134fefd6c043b4730f1170b07069a417bb162bad1
4
- data.tar.gz: 67318d70bee5bb82127c98e79e0b45e5e337832a2b8d23469474215168579f20
3
+ metadata.gz: 52bc948955e2df858e397b14eabb4411f73b3ff1e4d879ff4b7015d3b5e03308
4
+ data.tar.gz: fd20caec51c95bdc475e0698a52bb7fdebc9e22c43bb47267a883bcc75862268
5
5
  SHA512:
6
- metadata.gz: 60ae2292556d15d860e0c7377be9c1fd5aec6741b26f562fee8c0c2a9a123d0b8c656b2f7f34b8f74f83ea9ac4878b53f17454e3b25d27d00cd98ecf8175a387
7
- data.tar.gz: fe8fe545101cddb638c00ffb6f6785f8b2f5ddb109b4010739e399a2c50fe41164d791fe4a5599a76fca5553c494f78fd49630368d48e406a8211fc0516916f5
6
+ metadata.gz: dbcb7ab70a64d4a398a5c5761cc5b2f5de6835ccc0e2d0854556f03ef91d0c0294986cc2ff1273788e6b7b0c73dfdf86fd16ee1ef8ce35ecc11d61f8eaab9521
7
+ data.tar.gz: 01d21cdcc0ab81d61e08ff1f52360ba35973756fd5060ce866391ff622d4cf87da945dba43a622a0581d63ee96c8723e1cb28991bfa02f4e1e803896bdc64d7f
@@ -4,4 +4,4 @@ TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
4
4
  TextAlignment::SIZE_WINDOW = 60 unless defined? TextAlignment::SIZE_WINDOW
5
5
  TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
6
6
  TextAlignment::BUFFER_MIN = 20 unless defined? TextAlignment::BUFFER_MIN
7
- TextAlignment::TEXT_SIMILARITY_THRESHOLD = 0.87 unless defined? TextAlignment::TEXT_SIMILARITY_THRESHOLD
7
+ TextAlignment::TEXT_SIMILARITY_THRESHOLD = 0.9 unless defined? TextAlignment::TEXT_SIMILARITY_THRESHOLD
@@ -10,8 +10,6 @@ require 'text_alignment/mappings'
10
10
 
11
11
  module TextAlignment; end unless defined? TextAlignment
12
12
 
13
- TextAlignment::NOMATCH_CHARS = "@^|#$%&_" unless defined? TextAlignment::NOMATCH_CHARS
14
-
15
13
  class TextAlignment::MixedAlignment
16
14
  attr_reader :sdiff
17
15
  attr_reader :position_map_begin, :position_map_end
@@ -21,58 +19,7 @@ class TextAlignment::MixedAlignment
21
19
 
22
20
  def initialize(str1, str2, mappings = [])
23
21
  raise ArgumentError, "nil string" if str1.nil? || str2.nil?
24
- raise ArgumentError, "nil mappings" if mappings.nil?
25
-
26
- ## preprocessing
27
- str1 = str1.dup
28
- str2 = str2.dup
29
- mappings = mappings.dup
30
-
31
- ## find the first nomatch character
32
- TextAlignment::NOMATCH_CHARS.each_char do |c|
33
- if str2.index(c).nil?
34
- @nomatch_char1 = c
35
- break
36
- end
37
- end
38
- raise RuntimeError, "Cannot find nomatch character" if @nomatch_char1.nil?
39
-
40
- ## find the first nomatch character
41
- TextAlignment::NOMATCH_CHARS.each_char do |c|
42
- if c != @nomatch_char1 && str1.index(c).nil?
43
- @nomatch_char2 = c
44
- break
45
- end
46
- end
47
- raise RuntimeError, "Cannot find nomatch character" if @nomatch_char2.nil?
48
-
49
- # single character mappings
50
- character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
51
- characters_from = character_mappings.collect{|m| m[0]}.join
52
- characters_to = character_mappings.collect{|m| m[1]}.join
53
- characters_to.gsub!(/-/, '\-')
54
-
55
- str1.tr!(characters_from, characters_to)
56
- str2.tr!(characters_from, characters_to)
57
-
58
- mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
59
-
60
- # ASCII foldings
61
- ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
62
- ascii_foldings.each do |f|
63
- from = f[1]
64
-
65
- if str2.index(f[0])
66
- to = f[0] + (@nomatch_char1 * (f[1].length - 1))
67
- str1.gsub!(from, to)
68
- end
69
-
70
- if str1.index(f[0])
71
- to = f[0] + (@nomatch_char2 * (f[1].length - 1))
72
- str2.gsub!(from, to)
73
- end
74
- end
75
- mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
22
+ mappings ||= []
76
23
 
77
24
  _compute_mixed_alignment(str1, str2, mappings)
78
25
  end
@@ -5,19 +5,17 @@ require 'text_alignment/mixed_alignment'
5
5
 
6
6
  module TextAlignment; end unless defined? TextAlignment
7
7
 
8
+ TextAlignment::PADDING_LETTERS = ['@', '^', '|', '#', '$', '%', '&', '_'] unless defined? TextAlignment::PADDING_LETTERS
9
+
8
10
  class TextAlignment::TextAlignment
9
11
  attr_reader :block_alignments
10
12
  attr_reader :similarity
11
13
  attr_reader :lost_annotations
12
14
 
13
- def initialize(str1, str2, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
14
- raise ArgumentError, "nil string" if str1.nil? || str2.nil?
15
-
16
- size_ngram = _size_ngram || TextAlignment::SIZE_NGRAM
17
- size_window = _size_window || TextAlignment::SIZE_WINDOW
18
- sim_threshold = _text_similiarity_threshold || TextAlignment::TEXT_SIMILARITY_THRESHOLD
15
+ def initialize(_str1, _str2, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
16
+ raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
19
17
 
20
- mappings ||= TextAlignment::MAPPINGS
18
+ str1, str2, mappings = string_preprocessing(_str1, _str2)
21
19
 
22
20
  # try exact match
23
21
  block_begin = str2.index(str1)
@@ -26,7 +24,7 @@ class TextAlignment::TextAlignment
26
24
  return @block_alignments
27
25
  end
28
26
 
29
- anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, size_ngram, size_window, sim_threshold)
27
+ anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
30
28
 
31
29
  # To collect matched blocks
32
30
  mblocks = []
@@ -232,4 +230,60 @@ class TextAlignment::TextAlignment
232
230
  r
233
231
  end
234
232
 
233
+ private
234
+
235
+
236
+ def string_preprocessing(_str1, _str2)
237
+ str1 = _str1.dup
238
+ str2 = _str2.dup
239
+ mappings = TextAlignment::MAPPINGS.dup
240
+
241
+ ## single character mappings
242
+ character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
243
+ characters_from = character_mappings.collect{|m| m[0]}.join
244
+ characters_to = character_mappings.collect{|m| m[1]}.join
245
+ characters_to.gsub!(/-/, '\-')
246
+
247
+ str1.tr!(characters_from, characters_to)
248
+ str2.tr!(characters_from, characters_to)
249
+
250
+ mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
251
+
252
+ ## long to one character mappings
253
+ pletters = TextAlignment::PADDING_LETTERS
254
+
255
+ # find the padding letter for str1
256
+ padding_letter1 = begin
257
+ i = pletters.index{|l| str2.index(l).nil?}
258
+ raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
259
+ TextAlignment::PADDING_LETTERS[i]
260
+ end
261
+
262
+ # find the padding letter for str2
263
+ padding_letter2 = begin
264
+ i = pletters.index{|l| l != padding_letter1 && str1.index(l).nil?}
265
+ raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
266
+ TextAlignment::PADDING_LETTERS[i]
267
+ end
268
+
269
+ # ASCII foldings
270
+ ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
271
+ ascii_foldings.each do |f|
272
+ from = f[1]
273
+
274
+ if str2.index(f[0])
275
+ to = f[0] + (padding_letter1 * (f[1].length - 1))
276
+ str1.gsub!(from, to)
277
+ end
278
+
279
+ if str1.index(f[0])
280
+ to = f[0] + (padding_letter2 * (f[1].length - 1))
281
+ str2.gsub!(from, to)
282
+ end
283
+ end
284
+ mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
285
+
286
+ [str1, str2, mappings]
287
+ end
288
+
235
289
  end
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.4.2'
2
+ VERSION = '0.4.3'
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.2
4
+ version: 0.4.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim