text_alignment 0.4.2 → 0.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 52bc948955e2df858e397b14eabb4411f73b3ff1e4d879ff4b7015d3b5e03308
|
4
|
+
data.tar.gz: fd20caec51c95bdc475e0698a52bb7fdebc9e22c43bb47267a883bcc75862268
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: dbcb7ab70a64d4a398a5c5761cc5b2f5de6835ccc0e2d0854556f03ef91d0c0294986cc2ff1273788e6b7b0c73dfdf86fd16ee1ef8ce35ecc11d61f8eaab9521
|
7
|
+
data.tar.gz: 01d21cdcc0ab81d61e08ff1f52360ba35973756fd5060ce866391ff622d4cf87da945dba43a622a0581d63ee96c8723e1cb28991bfa02f4e1e803896bdc64d7f
|
@@ -4,4 +4,4 @@ TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
|
|
4
4
|
TextAlignment::SIZE_WINDOW = 60 unless defined? TextAlignment::SIZE_WINDOW
|
5
5
|
TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
|
6
6
|
TextAlignment::BUFFER_MIN = 20 unless defined? TextAlignment::BUFFER_MIN
|
7
|
-
TextAlignment::TEXT_SIMILARITY_THRESHOLD = 0.
|
7
|
+
TextAlignment::TEXT_SIMILARITY_THRESHOLD = 0.9 unless defined? TextAlignment::TEXT_SIMILARITY_THRESHOLD
|
@@ -10,8 +10,6 @@ require 'text_alignment/mappings'
|
|
10
10
|
|
11
11
|
module TextAlignment; end unless defined? TextAlignment
|
12
12
|
|
13
|
-
TextAlignment::NOMATCH_CHARS = "@^|#$%&_" unless defined? TextAlignment::NOMATCH_CHARS
|
14
|
-
|
15
13
|
class TextAlignment::MixedAlignment
|
16
14
|
attr_reader :sdiff
|
17
15
|
attr_reader :position_map_begin, :position_map_end
|
@@ -21,58 +19,7 @@ class TextAlignment::MixedAlignment
|
|
21
19
|
|
22
20
|
def initialize(str1, str2, mappings = [])
|
23
21
|
raise ArgumentError, "nil string" if str1.nil? || str2.nil?
|
24
|
-
|
25
|
-
|
26
|
-
## preprocessing
|
27
|
-
str1 = str1.dup
|
28
|
-
str2 = str2.dup
|
29
|
-
mappings = mappings.dup
|
30
|
-
|
31
|
-
## find the first nomatch character
|
32
|
-
TextAlignment::NOMATCH_CHARS.each_char do |c|
|
33
|
-
if str2.index(c).nil?
|
34
|
-
@nomatch_char1 = c
|
35
|
-
break
|
36
|
-
end
|
37
|
-
end
|
38
|
-
raise RuntimeError, "Cannot find nomatch character" if @nomatch_char1.nil?
|
39
|
-
|
40
|
-
## find the first nomatch character
|
41
|
-
TextAlignment::NOMATCH_CHARS.each_char do |c|
|
42
|
-
if c != @nomatch_char1 && str1.index(c).nil?
|
43
|
-
@nomatch_char2 = c
|
44
|
-
break
|
45
|
-
end
|
46
|
-
end
|
47
|
-
raise RuntimeError, "Cannot find nomatch character" if @nomatch_char2.nil?
|
48
|
-
|
49
|
-
# single character mappings
|
50
|
-
character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
|
51
|
-
characters_from = character_mappings.collect{|m| m[0]}.join
|
52
|
-
characters_to = character_mappings.collect{|m| m[1]}.join
|
53
|
-
characters_to.gsub!(/-/, '\-')
|
54
|
-
|
55
|
-
str1.tr!(characters_from, characters_to)
|
56
|
-
str2.tr!(characters_from, characters_to)
|
57
|
-
|
58
|
-
mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
|
59
|
-
|
60
|
-
# ASCII foldings
|
61
|
-
ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
|
62
|
-
ascii_foldings.each do |f|
|
63
|
-
from = f[1]
|
64
|
-
|
65
|
-
if str2.index(f[0])
|
66
|
-
to = f[0] + (@nomatch_char1 * (f[1].length - 1))
|
67
|
-
str1.gsub!(from, to)
|
68
|
-
end
|
69
|
-
|
70
|
-
if str1.index(f[0])
|
71
|
-
to = f[0] + (@nomatch_char2 * (f[1].length - 1))
|
72
|
-
str2.gsub!(from, to)
|
73
|
-
end
|
74
|
-
end
|
75
|
-
mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
|
22
|
+
mappings ||= []
|
76
23
|
|
77
24
|
_compute_mixed_alignment(str1, str2, mappings)
|
78
25
|
end
|
@@ -5,19 +5,17 @@ require 'text_alignment/mixed_alignment'
|
|
5
5
|
|
6
6
|
module TextAlignment; end unless defined? TextAlignment
|
7
7
|
|
8
|
+
TextAlignment::PADDING_LETTERS = ['@', '^', '|', '#', '$', '%', '&', '_'] unless defined? TextAlignment::PADDING_LETTERS
|
9
|
+
|
8
10
|
class TextAlignment::TextAlignment
|
9
11
|
attr_reader :block_alignments
|
10
12
|
attr_reader :similarity
|
11
13
|
attr_reader :lost_annotations
|
12
14
|
|
13
|
-
def initialize(
|
14
|
-
raise ArgumentError, "nil string" if
|
15
|
-
|
16
|
-
size_ngram = _size_ngram || TextAlignment::SIZE_NGRAM
|
17
|
-
size_window = _size_window || TextAlignment::SIZE_WINDOW
|
18
|
-
sim_threshold = _text_similiarity_threshold || TextAlignment::TEXT_SIMILARITY_THRESHOLD
|
15
|
+
def initialize(_str1, _str2, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
|
16
|
+
raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
|
19
17
|
|
20
|
-
mappings
|
18
|
+
str1, str2, mappings = string_preprocessing(_str1, _str2)
|
21
19
|
|
22
20
|
# try exact match
|
23
21
|
block_begin = str2.index(str1)
|
@@ -26,7 +24,7 @@ class TextAlignment::TextAlignment
|
|
26
24
|
return @block_alignments
|
27
25
|
end
|
28
26
|
|
29
|
-
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2,
|
27
|
+
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
|
30
28
|
|
31
29
|
# To collect matched blocks
|
32
30
|
mblocks = []
|
@@ -232,4 +230,60 @@ class TextAlignment::TextAlignment
|
|
232
230
|
r
|
233
231
|
end
|
234
232
|
|
233
|
+
private
|
234
|
+
|
235
|
+
|
236
|
+
def string_preprocessing(_str1, _str2)
|
237
|
+
str1 = _str1.dup
|
238
|
+
str2 = _str2.dup
|
239
|
+
mappings = TextAlignment::MAPPINGS.dup
|
240
|
+
|
241
|
+
## single character mappings
|
242
|
+
character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
|
243
|
+
characters_from = character_mappings.collect{|m| m[0]}.join
|
244
|
+
characters_to = character_mappings.collect{|m| m[1]}.join
|
245
|
+
characters_to.gsub!(/-/, '\-')
|
246
|
+
|
247
|
+
str1.tr!(characters_from, characters_to)
|
248
|
+
str2.tr!(characters_from, characters_to)
|
249
|
+
|
250
|
+
mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
|
251
|
+
|
252
|
+
## long to one character mappings
|
253
|
+
pletters = TextAlignment::PADDING_LETTERS
|
254
|
+
|
255
|
+
# find the padding letter for str1
|
256
|
+
padding_letter1 = begin
|
257
|
+
i = pletters.index{|l| str2.index(l).nil?}
|
258
|
+
raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
|
259
|
+
TextAlignment::PADDING_LETTERS[i]
|
260
|
+
end
|
261
|
+
|
262
|
+
# find the padding letter for str2
|
263
|
+
padding_letter2 = begin
|
264
|
+
i = pletters.index{|l| l != padding_letter1 && str1.index(l).nil?}
|
265
|
+
raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
|
266
|
+
TextAlignment::PADDING_LETTERS[i]
|
267
|
+
end
|
268
|
+
|
269
|
+
# ASCII foldings
|
270
|
+
ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
|
271
|
+
ascii_foldings.each do |f|
|
272
|
+
from = f[1]
|
273
|
+
|
274
|
+
if str2.index(f[0])
|
275
|
+
to = f[0] + (padding_letter1 * (f[1].length - 1))
|
276
|
+
str1.gsub!(from, to)
|
277
|
+
end
|
278
|
+
|
279
|
+
if str1.index(f[0])
|
280
|
+
to = f[0] + (padding_letter2 * (f[1].length - 1))
|
281
|
+
str2.gsub!(from, to)
|
282
|
+
end
|
283
|
+
end
|
284
|
+
mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
|
285
|
+
|
286
|
+
[str1, str2, mappings]
|
287
|
+
end
|
288
|
+
|
235
289
|
end
|