text_alignment 0.2.9 → 0.3.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/align_annotations +190 -39
- data/lib/text_alignment/anchor_finder.rb +149 -0
- data/lib/text_alignment/approximate_fit.rb +50 -52
- data/lib/text_alignment/find_divisions.rb +198 -200
- data/lib/text_alignment/glcs_alignment.rb +297 -297
- data/lib/text_alignment/glcs_alignment_fast.rb +94 -94
- data/lib/text_alignment/glcs_required.rb +50 -50
- data/lib/text_alignment/lcs_alignment.rb +115 -115
- data/lib/text_alignment/lcs_cdiff.rb +46 -48
- data/lib/text_alignment/lcs_comparison.rb +53 -53
- data/lib/text_alignment/lcs_min.rb +144 -138
- data/lib/text_alignment/mappings.rb +68 -69
- data/lib/text_alignment/mixed_alignment.rb +193 -0
- data/lib/text_alignment/text_alignment.rb +232 -174
- data/lib/text_alignment/version.rb +1 -1
- data/text_alignment.gemspec +1 -1
- metadata +5 -13
- data/spec/spec_helper.rb +0 -1
- data/spec/text_alignment/glcs_alignment_spec.rb +0 -302
- data/spec/text_alignment/lcs_alignment_spec.rb +0 -98
- data/spec/text_alignment/lcs_comparision_spec.rb +0 -322
- data/spec/text_alignment/text_alignment_spec.rb +0 -302
@@ -12,103 +12,103 @@ module TextAlignment; end unless defined? TextAlignment
|
|
12
12
|
TextAlignment::SIGNATURE_NGRAM = 5 unless defined? TextAlignment::SIGNATURE_NGRAM
|
13
13
|
|
14
14
|
class TextAlignment::GLCSTextAlignment
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
15
|
+
attr_reader :position_map_begin, :position_map_end
|
16
|
+
attr_reader :common_elements, :mapped_elements
|
17
|
+
attr_reader :similarity
|
18
|
+
attr_reader :str1_match_initial, :str1_match_final, :str2_match_initial, :str2_match_final
|
19
|
+
|
20
|
+
# Validates the arguments and runs the alignment of str1 against str2.
#
# str1, str2 - the two texts to align; neither may be nil
# mappings   - forwarded unchanged to _glcs_alignment_fast; may not be nil
# lcs        - forwarded unchanged to _glcs_alignment_fast
# sdiff      - forwarded unchanged to _glcs_alignment_fast
#
# Raises ArgumentError when str1, str2 or mappings is nil.
def initialize(str1, str2, mappings = [], lcs = nil, sdiff = nil)
  raise ArgumentError, "nil string" if str1.nil? || str2.nil?
  raise ArgumentError, "nil mappings" if mappings.nil?

  # The argument used to be spelled `mapptings`, an undefined name, so every
  # construction failed with NameError before any alignment took place.
  _glcs_alignment_fast(str1, str2, mappings, lcs, sdiff)
end
|
26
|
+
|
27
|
+
private
|
28
|
+
|
29
|
+
# Walks an sdiff of str1 against str2 and builds two maps from str1 offsets to
# str2 offsets (one used for span beginnings, one for span ends), collecting
# the matched and the substituted fragments along the way.
#
# Sets @common_elements, @mapped_elements, @position_map_begin and
# @position_map_end.
#
# str1, str2 - the two texts being aligned
# mappings   - handed unchanged to TextAlignment::GLCSAlignment when a hunk is
#              re-aligned
# lcs        - not used by this method
# sdiff      - sequence of changes responding to action, old_position and
#              new_position; computed with TextAlignment::LCSMin when nil
def _glcs_alignment_fast(str1, str2, mappings, lcs, sdiff)
  if sdiff.nil?
    sdiff = TextAlignment::LCSMin.new(str1, str2).sdiff
  end

  begin_map = {}
  end_map = {}
  @common_elements = []
  @mapped_elements = []

  # Offsets of the hunk that is still open: str2 positions added and str1
  # positions removed since the last matching character.
  inserted = []
  removed = []

  sdiff.each do |change|
    case change.action
    when '-'
      removed << change.old_position
    when '+'
      inserted << change.new_position
    when '!'
      removed << change.old_position
      inserted << change.new_position
    when '='
      pos1 = change.old_position
      pos2 = change.new_position

      @common_elements << [str1[pos1], str2[pos2]]
      begin_map[pos1] = pos2
      end_map[pos1] = pos2

      if removed.empty?
        # Insertion only: an end offset at this match is pulled back to just
        # before the inserted characters (except at the very start of str1).
        end_map[pos1] = pos2 - inserted.length unless inserted.empty? || pos1 == 0
      elsif inserted.empty?
        # Deletion only: every removed offset collapses onto this match.
        removed.each do |pos|
          begin_map[pos] = pos2
          end_map[pos] = pos2
        end
      elsif inserted.length > 1 || removed.length > 1
        # Substitution spanning more than one character on either side:
        # re-align just that hunk and shift its maps back into place.
        sub = TextAlignment::GLCSAlignment.new(str1[removed.first..removed.last], str2[inserted.first..inserted.last], mappings)
        sub.position_map_begin.each do |k, v|
          begin_map[k + removed.first] = (v + inserted.first unless v.nil?)
        end
        sub.position_map_end.each do |k, v|
          end_map[k + removed.first] = (v + inserted.first unless v.nil?)
        end
        # The sub-alignment may have written an entry for pos1; restore it.
        begin_map[pos1] = pos2
        end_map[pos1] = pos2
        @common_elements += sub.common_elements
        @mapped_elements += sub.mapped_elements
      else
        # One character replaced by one character.
        begin_map[removed.first] = inserted.first
        end_map[removed.first] = inserted.first
        removed.drop(1).each do |pos|
          begin_map[pos] = nil
          end_map[pos] = nil
        end
        @mapped_elements << [str1[removed.first, removed.length], str2[inserted.first, inserted.length]]
      end

      inserted.clear
      removed.clear
    end
  end

  # Close the alignment with an entry for the end of both strings, then
  # resolve whatever hunk is still open.
  pos1 = str1.length
  pos2 = str2.length
  begin_map[pos1] = pos2
  end_map[pos1] = pos2

  if removed.empty?
    end_map[pos1] = pos2 - inserted.length unless inserted.empty? || pos1 == 0
  elsif inserted.empty?
    removed.each do |pos|
      begin_map[pos] = pos2
      end_map[pos] = pos2
    end
  elsif inserted.length > 1 && removed.length > 1
    # NOTE(review): the in-loop branch uses `||` here and keeps common and
    # mapped elements apart; this trailing branch uses `&&` and puts both into
    # @mapped_elements. Confirm whether the difference is intended.
    sub = TextAlignment::GLCSAlignment.new(str1[removed.first..removed.last], str2[inserted.first..inserted.last], mappings)
    sub.position_map_begin.each do |k, v|
      begin_map[k + removed.first] = (v + inserted.first unless v.nil?)
    end
    sub.position_map_end.each do |k, v|
      end_map[k + removed.first] = (v + inserted.first unless v.nil?)
    end
    begin_map[pos1] = pos2
    end_map[pos1] = pos2
    @mapped_elements += sub.common_elements + sub.mapped_elements
  else
    begin_map[removed.first] = inserted.first
    end_map[removed.first] = inserted.first
    removed.drop(1).each do |pos|
      begin_map[pos] = nil
      end_map[pos] = nil
    end
    @mapped_elements << [str1[removed.first, removed.length], str2[inserted.first, inserted.length]]
  end

  @position_map_begin = begin_map.sort.to_h
  @position_map_end = end_map.sort.to_h
end
|
100
100
|
end
|
101
101
|
|
102
102
|
# Ad-hoc demo: runs only when this file is executed directly as a script.
# It aligns two hard-coded strings and prints the common and mapped elements.
if __FILE__ == $0
  str1 = '-βκ-'
  str2 = '-betakappa-'

  # anns1 = JSON.parse File.read(ARGV[0]), :symbolize_names => true
  # anns2 = JSON.parse File.read(ARGV[1]), :symbolize_names => true

  # NOTE(review): `dictionary` is assigned but never read below; the
  # alignment is given TextAlignment::MAPPINGS instead. Confirm which is meant.
  dictionary = [["β", "beta"]]
  # align = TextAlignment::TextAlignment.new(str1, str2)
  # NOTE(review): TextAlignment::TextAlignment and TextAlignment::MAPPINGS are
  # not defined in the visible part of this file; verify they are loaded, and
  # whether TextAlignment::GLCSTextAlignment was the intended class here.
  align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::MAPPINGS)
  p align.common_elements
  p align.mapped_elements
end
|
@@ -2,67 +2,67 @@
|
|
2
2
|
module TextAlignment; end unless defined? TextAlignment
|
3
3
|
|
4
4
|
class << TextAlignment
|
5
|
-
|
6
|
-
|
7
|
-
|
5
|
+
# Looks for two adjacent non-ASCII characters in str1 after the
# single-character mappings have been applied to a copy of it.
#
# Only mappings whose two sides are both exactly one character long are
# applied; longer mappings are ignored by this check.
#
# str1     - the text to inspect; it is NOT modified (earlier versions
#            transliterated it in place, which also failed on frozen strings)
# mappings - array of [from, to] pairs
#
# Returns the first pair of adjacent non-ASCII characters as a String, or nil
# when there is none.
# Raises ArgumentError when str1 or mappings is nil.
def glcs_required?(str1, mappings = [])
  raise ArgumentError, "nil string" if str1.nil?
  raise ArgumentError, "nil mappings" if mappings.nil?

  # character mappings can be safely applied to the string without changing the position of other characters
  character_mappings = mappings.select { |m| m[0].length == 1 && m[1].length == 1 }

  # `-`, `^` and `\` have special meaning in String#tr arguments, so they are
  # escaped on BOTH sides; otherwise a literal "-" between two characters
  # would be read as a range and shift the from/to correspondence.
  escape = ->(chars) { chars.gsub(/[\\\^\-]/) { |c| "\\#{c}" } }
  characters_from = escape.call(character_mappings.map { |m| m[0] }.join)
  characters_to = escape.call(character_mappings.map { |m| m[1] }.join)

  # Non-destructive transliteration: the caller's string is left untouched.
  normalized = str1.tr(characters_from, characters_to)

  normalized[/[^\p{ASCII}][^\p{ASCII}]/]
end
|
20
20
|
end
|
21
21
|
|
22
22
|
# Ad-hoc demo: runs only when this file is executed directly as a script.
# It prints the result of TextAlignment.glcs_required? for a sample string,
# or for the contents of the file named by the single command-line argument.
if __FILE__ == $0

  # Sample [from, to] mapping pairs. Several of the "from" entries are
  # look-alike whitespace or punctuation characters; the trailing comment on
  # each line names the code point.
  dictionary = [
    ["×", "x"], #U+00D7 (multiplication sign)
    ["•", "*"], #U+2022 (bullet)
    ["Δ", "delta"], #U+0394 (greek capital letter delta)
    ["Φ", "phi"], #U+03A6 (greek capital letter phi)
    ["α", "alpha"], #U+03B1 (greek small letter alpha)
    ["β", "beta"], #U+03B2 (greek small letter beta)
    ["γ", "gamma"], #U+03B3 (greek small letter gamma)
    ["δ", "delta"], #U+03B4 (greek small letter delta)
    ["ε", "epsilon"], #U+03B5 (greek small letter epsilon)
    ["κ", "kappa"], #U+03BA (greek small letter kappa)
    ["λ", "lambda"], #U+03BB (greek small letter lambda)
    ["μ", "mu"], #U+03BC (greek small letter mu)
    ["χ", "chi"], #U+03C7 (greek small letter chi)
    ["ϕ", "phi"], #U+03D5 (greek phi symbol)
    [" ", " "], #U+2009 (thin space)
    [" ", " "], #U+200A (hair space)
    [" ", " "], #U+00A0 (no-break space)
    ["　", " "], #U+3000 (ideographic space)
    ["−", "-"], #U+2212 (minus sign)
    ["–", "-"], #U+2013 (en dash)
    ["′", "'"], #U+2032 (prime)
    ["‘", "'"], #U+2018 (left single quotation mark)
    ["’", "'"], #U+2019 (right single quotation mark)
    ["“", '"'], #U+201C (left double quotation mark)
    ["”", '"'] #U+201D (right double quotation mark)
  ]

  # Default input, used when no file is given on the command line.
  str = "TGF-β–induced"

  # from_text = "TGF-beta-induced"
  # to_text = "TGF-β–induced"

  # from_text = "TGF-β–β induced"
  # to_text = "TGF-beta-beta induced"

  # str = "-βκ-"

  # With exactly one argument, the named file's contents replace the sample.
  if ARGV.length == 1
    str = File.read(ARGV[0])
  end
  # anns2 = JSON.parse File.read(ARGV[1]), :symbolize_names => true

  p TextAlignment.glcs_required?(str, dictionary)
end
|
@@ -2,145 +2,145 @@
|
|
2
2
|
require 'text_alignment/lcs_min'
|
3
3
|
|
4
4
|
class TextAlignment::LCSAlignment
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
5
|
+
attr_reader :position_map_begin, :position_map_end
|
6
|
+
attr_reader :common_elements, :mapped_elements
|
7
|
+
|
8
|
+
# Aligns str1 against str2 and builds the position maps and element lists.
#
# str1, str2 - the two texts to align; ArgumentError is raised when either
#              is nil
# lcs        - accepted but not used by this method
# sdiff      - a precomputed diff of the two texts; when nil it is computed
#              with TextAlignment::LCSMin
def initialize(str1, str2, lcs = nil, sdiff = nil)
  raise ArgumentError, "nil string" if str1.nil? || str2.nil?

  sdiff = TextAlignment::LCSMin.new(str1, str2).sdiff if sdiff.nil?
  _compute_position_map(str1, str2, sdiff)
end
|
15
|
+
|
16
|
+
private
|
17
|
+
|
18
|
+
# Walks an sdiff of str1 against str2 and builds two maps from str1 offsets to
# str2 offsets (one used for span beginnings, one for span ends).
#
# Sets @common_elements (pairs of matching characters), @mapped_elements
# (pairs of substituted fragments), @position_map_begin and @position_map_end.
#
# str1, str2 - the two texts being aligned
# sdiff      - sequence of changes responding to action, old_position and
#              new_position
def _compute_position_map(str1, str2, sdiff)
  begin_map = {}
  end_map = {}
  @common_elements = []
  @mapped_elements = []

  # Offsets of the hunk that is still open: str2 positions added and str1
  # positions removed since the last matching character.
  inserted = []
  removed = []

  sdiff.each do |change|
    case change.action
    when '-'
      removed << change.old_position
    when '+'
      inserted << change.new_position
    when '!'
      removed << change.old_position
      inserted << change.new_position
    when '='
      pos1 = change.old_position
      pos2 = change.new_position

      @common_elements << [str1[pos1], str2[pos2]]
      begin_map[pos1] = pos2
      end_map[pos1] = pos2

      if removed.empty?
        # Insertion only: correct the end position so that it points just
        # before the inserted characters (except at the very start of str1).
        end_map[pos1] = pos2 - inserted.length unless inserted.empty? || pos1 == 0
      elsif inserted.empty?
        # Deletion only: every removed offset collapses onto this match.
        removed.each do |pos|
          begin_map[pos] = pos2
          end_map[pos] = pos2
        end
      else
        # Substitution: the first removed offset maps to the first inserted
        # one; the remaining removed offsets have no counterpart.
        @mapped_elements << [str1[removed.first, removed.length], str2[inserted.first, inserted.length]]
        begin_map[removed.first] = inserted.first
        end_map[removed.first] = inserted.first
        removed.drop(1).each do |pos|
          begin_map[pos] = nil
          end_map[pos] = nil
        end
      end

      inserted.clear
      removed.clear
    end
  end

  # Close the alignment with an entry for the end of both strings, then
  # resolve whatever hunk is still open in the same way as above.
  pos1 = str1.length
  pos2 = str2.length
  begin_map[pos1] = pos2
  end_map[pos1] = pos2

  if removed.empty?
    end_map[pos1] = pos2 - inserted.length unless inserted.empty? || pos1 == 0
  elsif inserted.empty?
    removed.each do |pos|
      begin_map[pos] = pos2
      end_map[pos] = pos2
    end
  else
    @mapped_elements << [str1[removed.first, removed.length], str2[inserted.first, inserted.length]]
    begin_map[removed.first] = inserted.first
    end_map[removed.first] = inserted.first
    removed.drop(1).each do |pos|
      begin_map[pos] = nil
      end_map[pos] = nil
    end
  end

  @position_map_begin = begin_map.sort.to_h
  @position_map_end = end_map.sort.to_h
end
|
74
74
|
|
75
75
|
end
|
76
76
|
|
77
77
|
# Ad-hoc demo: runs only when this file is executed directly as a script.
# It aligns one hard-coded pair of strings and prints both position maps.
# The commented-out from_text/to_text pairs are alternative inputs kept for
# manual experiments.
if __FILE__ == $0

  # from_text = "TGF-β mRNA"
  # to_text = "TGF-beta mRNA"

  # from_text = "TGF-beta mRNA"
  # to_text = "TGF-β mRNA"

  # from_text = "TGF-beta mRNA"
  # to_text = "TGF- mRNA"

  # from_text = "TGF-β–induced"
  # to_text = "TGF-beta-induced"

  # The pair that is actually aligned below.
  from_text = 'abxyzcd'
  to_text = 'abcd'

  # from_text = "TGF-beta-induced"
  # to_text = "TGF-β–induced"

  # from_text = "beta-induced"
  # to_text = "TGF-beta-induced"

  # from_text = "TGF-beta-induced"
  # to_text = "beta-induced"

  # from_text = "TGF-β–β induced"
  # to_text = "TGF-beta-beta induced"

  # from_text = "-βκ-"
  # to_text = "-betakappa-"

  # from_text = "-betakappa-beta-z"
  # to_text = "-βκ-β–z"

  # from_text = "affect C/EBP-β’s ability"
  # to_text = "affect C/EBP-beta's ability"

  # from_text = "12 ± 34"
  # to_text = "12 +/- 34"

  # from_text = "TGF-β–treated"
  # to_text = "TGF-beta-treated"

  # from_text = "in TGF-β–treated cells"
  # to_text = "in TGF-beta-treated cells"

  # from_text = "TGF-β–induced"
  # to_text = "TGF-beta-induced"

  # anns1 = JSON.parse File.read(ARGV[0]), :symbolize_names => true
  # anns2 = JSON.parse File.read(ARGV[1]), :symbolize_names => true

  # aligner = TextAlignment.new(anns1[:text], anns2[:text], [["Δ", "delta"], [" ", " "], ["–", "-"], ["′", "'"]])
  # denotations = aligner.transform_denotations(anns1[:denotations])

  # NOTE(review): denotations_s is only read by the commented-out JSON.parse
  # line below, so it is currently unused. The heredoc body is kept flush
  # left because `<<-` preserves the content's leading whitespace.
  denotations_s = <<-'ANN'
[{"id":"T0", "span":{"begin":1,"end":2}, "category":"Protein"}]
ANN

  # denotations = JSON.parse denotations_s, :symbolize_names => true

  a = TextAlignment::LCSAlignment.new(from_text, to_text)
  p a.position_map_begin
  puts "-----"
  p a.position_map_end
  # aligner = TextAlignment.new(from_text, to_text, [["Δ", "delta"], [" ", " "], ["–", "-"], ["′", "'"], ["β", "beta"]])

  # p denotations
end
|