text_alignment 0.4.2 → 0.4.3
Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 52bc948955e2df858e397b14eabb4411f73b3ff1e4d879ff4b7015d3b5e03308
|
4
|
+
data.tar.gz: fd20caec51c95bdc475e0698a52bb7fdebc9e22c43bb47267a883bcc75862268
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: dbcb7ab70a64d4a398a5c5761cc5b2f5de6835ccc0e2d0854556f03ef91d0c0294986cc2ff1273788e6b7b0c73dfdf86fd16ee1ef8ce35ecc11d61f8eaab9521
|
7
|
+
data.tar.gz: 01d21cdcc0ab81d61e08ff1f52360ba35973756fd5060ce866391ff622d4cf87da945dba43a622a0581d63ee96c8723e1cb28991bfa02f4e1e803896bdc64d7f
|
@@ -4,4 +4,4 @@ TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
|
|
4
4
|
TextAlignment::SIZE_WINDOW = 60 unless defined? TextAlignment::SIZE_WINDOW
|
5
5
|
TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
|
6
6
|
TextAlignment::BUFFER_MIN = 20 unless defined? TextAlignment::BUFFER_MIN
|
7
|
-
TextAlignment::TEXT_SIMILARITY_THRESHOLD = 0.
|
7
|
+
TextAlignment::TEXT_SIMILARITY_THRESHOLD = 0.9 unless defined? TextAlignment::TEXT_SIMILARITY_THRESHOLD
|
@@ -10,8 +10,6 @@ require 'text_alignment/mappings'
|
|
10
10
|
|
11
11
|
module TextAlignment; end unless defined? TextAlignment
|
12
12
|
|
13
|
-
TextAlignment::NOMATCH_CHARS = "@^|#$%&_" unless defined? TextAlignment::NOMATCH_CHARS
|
14
|
-
|
15
13
|
class TextAlignment::MixedAlignment
|
16
14
|
attr_reader :sdiff
|
17
15
|
attr_reader :position_map_begin, :position_map_end
|
@@ -21,58 +19,7 @@ class TextAlignment::MixedAlignment
|
|
21
19
|
|
22
20
|
def initialize(str1, str2, mappings = [])
|
23
21
|
raise ArgumentError, "nil string" if str1.nil? || str2.nil?
|
24
|
-
|
25
|
-
|
26
|
-
## preprocessing
|
27
|
-
str1 = str1.dup
|
28
|
-
str2 = str2.dup
|
29
|
-
mappings = mappings.dup
|
30
|
-
|
31
|
-
## find the first nomatch character
|
32
|
-
TextAlignment::NOMATCH_CHARS.each_char do |c|
|
33
|
-
if str2.index(c).nil?
|
34
|
-
@nomatch_char1 = c
|
35
|
-
break
|
36
|
-
end
|
37
|
-
end
|
38
|
-
raise RuntimeError, "Cannot find nomatch character" if @nomatch_char1.nil?
|
39
|
-
|
40
|
-
## find the first nomatch character
|
41
|
-
TextAlignment::NOMATCH_CHARS.each_char do |c|
|
42
|
-
if c != @nomatch_char1 && str1.index(c).nil?
|
43
|
-
@nomatch_char2 = c
|
44
|
-
break
|
45
|
-
end
|
46
|
-
end
|
47
|
-
raise RuntimeError, "Cannot find nomatch character" if @nomatch_char2.nil?
|
48
|
-
|
49
|
-
# single character mappings
|
50
|
-
character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
|
51
|
-
characters_from = character_mappings.collect{|m| m[0]}.join
|
52
|
-
characters_to = character_mappings.collect{|m| m[1]}.join
|
53
|
-
characters_to.gsub!(/-/, '\-')
|
54
|
-
|
55
|
-
str1.tr!(characters_from, characters_to)
|
56
|
-
str2.tr!(characters_from, characters_to)
|
57
|
-
|
58
|
-
mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
|
59
|
-
|
60
|
-
# ASCII foldings
|
61
|
-
ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
|
62
|
-
ascii_foldings.each do |f|
|
63
|
-
from = f[1]
|
64
|
-
|
65
|
-
if str2.index(f[0])
|
66
|
-
to = f[0] + (@nomatch_char1 * (f[1].length - 1))
|
67
|
-
str1.gsub!(from, to)
|
68
|
-
end
|
69
|
-
|
70
|
-
if str1.index(f[0])
|
71
|
-
to = f[0] + (@nomatch_char2 * (f[1].length - 1))
|
72
|
-
str2.gsub!(from, to)
|
73
|
-
end
|
74
|
-
end
|
75
|
-
mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
|
22
|
+
mappings ||= []
|
76
23
|
|
77
24
|
_compute_mixed_alignment(str1, str2, mappings)
|
78
25
|
end
|
@@ -5,19 +5,17 @@ require 'text_alignment/mixed_alignment'
|
|
5
5
|
|
6
6
|
module TextAlignment; end unless defined? TextAlignment
|
7
7
|
|
8
|
+
TextAlignment::PADDING_LETTERS = ['@', '^', '|', '#', '$', '%', '&', '_'] unless defined? TextAlignment::PADDING_LETTERS
|
9
|
+
|
8
10
|
class TextAlignment::TextAlignment
|
9
11
|
attr_reader :block_alignments
|
10
12
|
attr_reader :similarity
|
11
13
|
attr_reader :lost_annotations
|
12
14
|
|
13
|
-
def initialize(
|
14
|
-
raise ArgumentError, "nil string" if
|
15
|
-
|
16
|
-
size_ngram = _size_ngram || TextAlignment::SIZE_NGRAM
|
17
|
-
size_window = _size_window || TextAlignment::SIZE_WINDOW
|
18
|
-
sim_threshold = _text_similiarity_threshold || TextAlignment::TEXT_SIMILARITY_THRESHOLD
|
15
|
+
def initialize(_str1, _str2, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
|
16
|
+
raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
|
19
17
|
|
20
|
-
mappings
|
18
|
+
str1, str2, mappings = string_preprocessing(_str1, _str2)
|
21
19
|
|
22
20
|
# try exact match
|
23
21
|
block_begin = str2.index(str1)
|
@@ -26,7 +24,7 @@ class TextAlignment::TextAlignment
|
|
26
24
|
return @block_alignments
|
27
25
|
end
|
28
26
|
|
29
|
-
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2,
|
27
|
+
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
|
30
28
|
|
31
29
|
# To collect matched blocks
|
32
30
|
mblocks = []
|
@@ -232,4 +230,60 @@ class TextAlignment::TextAlignment
|
|
232
230
|
r
|
233
231
|
end
|
234
232
|
|
233
|
+
private
|
234
|
+
|
235
|
+
|
236
|
+
def string_preprocessing(_str1, _str2)
|
237
|
+
str1 = _str1.dup
|
238
|
+
str2 = _str2.dup
|
239
|
+
mappings = TextAlignment::MAPPINGS.dup
|
240
|
+
|
241
|
+
## single character mappings
|
242
|
+
character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
|
243
|
+
characters_from = character_mappings.collect{|m| m[0]}.join
|
244
|
+
characters_to = character_mappings.collect{|m| m[1]}.join
|
245
|
+
characters_to.gsub!(/-/, '\-')
|
246
|
+
|
247
|
+
str1.tr!(characters_from, characters_to)
|
248
|
+
str2.tr!(characters_from, characters_to)
|
249
|
+
|
250
|
+
mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
|
251
|
+
|
252
|
+
## long to one character mappings
|
253
|
+
pletters = TextAlignment::PADDING_LETTERS
|
254
|
+
|
255
|
+
# find the padding letter for str1
|
256
|
+
padding_letter1 = begin
|
257
|
+
i = pletters.index{|l| str2.index(l).nil?}
|
258
|
+
raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
|
259
|
+
TextAlignment::PADDING_LETTERS[i]
|
260
|
+
end
|
261
|
+
|
262
|
+
# find the padding letter for str2
|
263
|
+
padding_letter2 = begin
|
264
|
+
i = pletters.index{|l| l != padding_letter1 && str1.index(l).nil?}
|
265
|
+
raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
|
266
|
+
TextAlignment::PADDING_LETTERS[i]
|
267
|
+
end
|
268
|
+
|
269
|
+
# ASCII foldings
|
270
|
+
ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
|
271
|
+
ascii_foldings.each do |f|
|
272
|
+
from = f[1]
|
273
|
+
|
274
|
+
if str2.index(f[0])
|
275
|
+
to = f[0] + (padding_letter1 * (f[1].length - 1))
|
276
|
+
str1.gsub!(from, to)
|
277
|
+
end
|
278
|
+
|
279
|
+
if str1.index(f[0])
|
280
|
+
to = f[0] + (padding_letter2 * (f[1].length - 1))
|
281
|
+
str2.gsub!(from, to)
|
282
|
+
end
|
283
|
+
end
|
284
|
+
mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
|
285
|
+
|
286
|
+
[str1, str2, mappings]
|
287
|
+
end
|
288
|
+
|
235
289
|
end
|