text_alignment 0.2.5 → 0.3.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +2 -1
- data/Gemfile.lock +3 -1
- data/bin/align_annotations +190 -35
- data/lib/text_alignment/anchor_finder.rb +149 -0
- data/lib/text_alignment/approximate_fit.rb +63 -48
- data/lib/text_alignment/find_divisions.rb +199 -101
- data/lib/text_alignment/glcs_alignment.rb +297 -297
- data/lib/text_alignment/glcs_alignment_fast.rb +94 -94
- data/lib/text_alignment/glcs_required.rb +50 -50
- data/lib/text_alignment/lcs_alignment.rb +115 -115
- data/lib/text_alignment/lcs_cdiff.rb +46 -48
- data/lib/text_alignment/lcs_comparison.rb +53 -53
- data/lib/text_alignment/lcs_min.rb +144 -138
- data/lib/text_alignment/mappings.rb +68 -69
- data/lib/text_alignment/mixed_alignment.rb +193 -0
- data/lib/text_alignment/text_alignment.rb +232 -174
- data/lib/text_alignment/version.rb +1 -1
- data/text_alignment.gemspec +3 -2
- metadata +21 -15
- data/spec/spec_helper.rb +0 -1
- data/spec/text_alignment/glcs_alignment_spec.rb +0 -302
- data/spec/text_alignment/lcs_alignment_spec.rb +0 -98
- data/spec/text_alignment/lcs_comparision_spec.rb +0 -322
- data/spec/text_alignment/text_alignment_spec.rb +0 -302
@@ -12,103 +12,103 @@ module TextAlignment; end unless defined? TextAlignment
|
|
12
12
|
TextAlignment::SIGNATURE_NGRAM = 5 unless defined? TextAlignment::SIGNATURE_NGRAM
|
13
13
|
|
14
14
|
# Builds a position map from +str1+ to +str2+ out of an sdiff (a sequence of
# change records), delegating multi-character replaced regions to
# TextAlignment::GLCSAlignment together with the given +mappings+.
#
# After construction:
# * +position_map_begin+ / +position_map_end+ are Hashes, ordered by key, from
#   positions in +str1+ to positions in +str2+ (or +nil+ for positions inside a
#   replaced region that have no counterpart of their own).
# * +common_elements+ is an Array of <tt>[str1_char, str2_char]</tt> pairs for
#   the records whose action is '='.
# * +mapped_elements+ is an Array of <tt>[str1_fragment, str2_fragment]</tt>
#   pairs for replaced regions.
#
# NOTE(review): +similarity+ and the four +*_match_*+ readers are declared here
# but never assigned anywhere in this class as far as this file shows.
class TextAlignment::GLCSTextAlignment
  attr_reader :position_map_begin, :position_map_end
  attr_reader :common_elements, :mapped_elements
  attr_reader :similarity
  attr_reader :str1_match_initial, :str1_match_final, :str2_match_initial, :str2_match_final

  # str1, str2 - the two strings to align.
  # mappings   - passed through to TextAlignment::GLCSAlignment for replaced
  #              regions (default: []).
  # lcs        - accepted for interface compatibility; not used by this class.
  # sdiff      - precomputed change records; when nil they are obtained from
  #              TextAlignment::LCSMin.new(str1, str2).sdiff.
  #
  # Raises ArgumentError when str1, str2 or mappings is nil.
  def initialize(str1, str2, mappings = [], lcs = nil, sdiff = nil)
    raise ArgumentError, "nil string" if str1.nil? || str2.nil?
    raise ArgumentError, "nil mappings" if mappings.nil?

    # The argument used to be misspelled here, which raised NameError on
    # every construction.
    _glcs_alignment_fast(str1, str2, mappings, lcs, sdiff)
  end

  private

  # Walks the change records once. Positions reported by '!', '-' and '+'
  # records are buffered in +deletion+ (str1 side) and +addition+ (str2 side)
  # until the next '=' record, or the end of the strings, closes the gap.
  def _glcs_alignment_fast(str1, str2, mappings, lcs, sdiff)
    sdiff = TextAlignment::LCSMin.new(str1, str2).sdiff if sdiff.nil?

    posmap_begin, posmap_end = {}, {}
    @common_elements, @mapped_elements = [], []

    addition, deletion = [], []

    sdiff.each do |h|
      case h.action
      when '='
        p1, p2 = h.old_position, h.new_position

        @common_elements << [str1[p1], str2[p2]]
        posmap_begin[p1], posmap_end[p1] = p2, p2

        if !addition.empty? && deletion.empty?
          # Pure insertion before this match: a span ending at p1 should end
          # before the inserted text, except at the very start of str1.
          posmap_end[p1] = p2 - addition.length unless p1 == 0
        elsif addition.empty? && !deletion.empty?
          # Pure deletion: every deleted position collapses onto this match.
          deletion.each{|p| posmap_begin[p], posmap_end[p] = p2, p2}
        elsif !addition.empty? && !deletion.empty?
          if addition.length > 1 || deletion.length > 1
            # Replaced region: align the two fragments on their own and shift
            # the resulting positions back into the coordinates of str1/str2.
            galign = TextAlignment::GLCSAlignment.new(str1[deletion[0] .. deletion[-1]], str2[addition[0] .. addition[-1]], mappings)
            galign.position_map_begin.each {|k, v| posmap_begin[k + deletion[0]] = v.nil? ? nil : v + addition[0]}
            galign.position_map_end.each   {|k, v|   posmap_end[k + deletion[0]] = v.nil? ? nil : v + addition[0]}
            # Re-assert the entry for the matching position itself after the
            # shifted entries were written (presumably because the fragment
            # alignment can emit an entry that lands on p1 -- TODO confirm).
            posmap_begin[p1], posmap_end[p1] = p2, p2
            @common_elements += galign.common_elements
            @mapped_elements += galign.mapped_elements
          else
            # One character replaced by one character.
            posmap_begin[deletion[0]], posmap_end[deletion[0]] = addition[0], addition[0]
            deletion[1..-1].each{|p| posmap_begin[p], posmap_end[p] = nil, nil}
            @mapped_elements << [str1[deletion[0], deletion.length], str2[addition[0], addition.length]]
          end
        end

        addition.clear; deletion.clear

      when '!'
        deletion << h.old_position
        addition << h.new_position
      when '-'
        deletion << h.old_position
      when '+'
        addition << h.new_position
      end
    end

    # Close a gap left open at the end of the strings, using the
    # end-of-string positions as the anchor.
    p1, p2 = str1.length, str2.length
    posmap_begin[p1], posmap_end[p1] = p2, p2

    if !addition.empty? && deletion.empty?
      posmap_end[p1] = p2 - addition.length unless p1 == 0
    elsif addition.empty? && !deletion.empty?
      deletion.each{|p| posmap_begin[p], posmap_end[p] = p2, p2}
    elsif !addition.empty? && !deletion.empty?
      # NOTE(review): this trailing branch uses `&&` where the in-loop branch
      # uses `||`, and it files the fragment's common elements under
      # @mapped_elements. Left unchanged; confirm whether the asymmetry is
      # intentional.
      if addition.length > 1 && deletion.length > 1
        galign = TextAlignment::GLCSAlignment.new(str1[deletion[0] .. deletion[-1]], str2[addition[0] .. addition[-1]], mappings)
        galign.position_map_begin.each {|k, v| posmap_begin[k + deletion[0]] = v.nil? ? nil : v + addition[0]}
        galign.position_map_end.each   {|k, v|   posmap_end[k + deletion[0]] = v.nil? ? nil : v + addition[0]}
        posmap_begin[p1], posmap_end[p1] = p2, p2
        @mapped_elements += galign.common_elements + galign.mapped_elements
      else
        posmap_begin[deletion[0]], posmap_end[deletion[0]] = addition[0], addition[0]
        deletion[1..-1].each{|p| posmap_begin[p], posmap_end[p] = nil, nil}
        @mapped_elements << [str1[deletion[0], deletion.length], str2[addition[0], addition.length]]
      end
    end

    @position_map_begin = posmap_begin.sort.to_h
    @position_map_end = posmap_end.sort.to_h
  end
end
|
101
101
|
|
102
102
|
# Manual smoke test: runs only when this file is executed directly.
if $PROGRAM_NAME == __FILE__
  source_text = '-βκ-'
  target_text = '-betakappa-'

  # anns1 = JSON.parse File.read(ARGV[0]), :symbolize_names => true
  # anns2 = JSON.parse File.read(ARGV[1]), :symbolize_names => true

  # Sample one-entry mapping; the alignment below is built with
  # TextAlignment::MAPPINGS, so this local is not passed anywhere.
  dictionary = [["β", "beta"]]

  # alignment = TextAlignment::TextAlignment.new(source_text, target_text)
  alignment = TextAlignment::TextAlignment.new(source_text, target_text, TextAlignment::MAPPINGS)

  # Dump both result lists for inspection.
  p alignment.common_elements
  p alignment.mapped_elements
end
|
@@ -2,67 +2,67 @@
|
|
2
2
|
module TextAlignment; end unless defined? TextAlignment

class << TextAlignment
  # Applies the single-character entries of +mappings+ to +str1+ and reports
  # whether two adjacent non-ASCII characters are still present afterwards.
  #
  # str1     - the String to inspect. It is transliterated IN PLACE (tr!), so
  #            the caller's string is modified and must not be frozen.
  # mappings - Array of [from, to] String pairs (default: []). Only pairs in
  #            which both strings are exactly one character long are applied.
  #
  # Returns the first run of two consecutive non-ASCII characters left in
  # +str1+ (a truthy String), or nil when there is none.
  # Raises ArgumentError when str1 or mappings is nil.
  def glcs_required?(str1, mappings = [])
    raise ArgumentError, "nil string" if str1.nil?
    raise ArgumentError, "nil mappings" if mappings.nil?

    # character mappings can be safely applied to the strings without changing the position of other characters
    character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}

    # String#tr gives '-', '^' and '\' a special meaning (range, negation,
    # escape). Escape them in BOTH sets so that every mapping character is
    # taken literally and the two sets stay aligned one-to-one.
    escape = ->(ch) { ch.gsub(/[\\\^\-]/) { |c| "\\#{c}" } }
    characters_from = character_mappings.collect{|m| escape.call(m[0])}.join
    characters_to   = character_mappings.collect{|m| escape.call(m[1])}.join

    str1.tr!(characters_from, characters_to)

    str1[/[^\p{ASCII}][^\p{ASCII}]/]
  end
end
|
21
21
|
|
22
22
|
# Manual smoke test: runs only when this file is executed directly.
# With one command-line argument, the text is read from that file instead of
# using the built-in sample.
if __FILE__ == $0

  dictionary = [
    ["×", "x"],        #U+00D7 (multiplication sign)
    ["•", "*"],        #U+2022 (bullet)
    ["Δ", "delta"],    #U+0394 (greek capital letter delta)
    ["Φ", "phi"],      #U+03A6 (greek capital letter phi)
    ["α", "alpha"],    #U+03B1 (greek small letter alpha)
    ["β", "beta"],     #U+03B2 (greek small letter beta)
    ["γ", "gamma"],    #U+03B3 (greek small letter gamma)
    ["δ", "delta"],    #U+03B4 (greek small letter delta)
    ["ε", "epsilon"],  #U+03B5 (greek small letter epsilon)
    ["κ", "kappa"],    #U+03BA (greek small letter kappa)
    ["λ", "lambda"],   #U+03BB (greek small letter lambda)
    ["μ", "mu"],       #U+03BC (greek small letter mu)
    ["χ", "chi"],      #U+03C7 (greek small letter chi)
    ["ϕ", "phi"],      #U+03D5 (greek phi symbol)
    # The whitespace variants are written as escapes: as literals they are
    # indistinguishable from a plain space and are easily lost when the file
    # is copied or reformatted.
    ["\u2009", " "],   #U+2009 (thin space)
    ["\u200A", " "],   #U+200A (hair space)
    ["\u00A0", " "],   #U+00A0 (no-break space)
    ["\u3000", " "],   #U+3000 (ideographic space)
    ["−", "-"],        #U+2212 (minus sign)
    ["–", "-"],        #U+2013 (en dash)
    ["′", "'"],        #U+2032 (prime)
    ["‘", "'"],        #U+2018 (left single quotation mark)
    ["’", "'"],        #U+2019 (right single quotation mark)
    ["“", '"'],        #U+201C (left double quotation mark)
    ["”", '"']         #U+201D (right double quotation mark)
  ]

  str = "TGF-β–induced"

  # from_text = "TGF-beta-induced"
  # to_text = "TGF-β–induced"

  # from_text = "TGF-β–β induced"
  # to_text = "TGF-beta-beta induced"

  # str = "-βκ-"

  if ARGV.length == 1
    str = File.read(ARGV[0])
  end
  # anns2 = JSON.parse File.read(ARGV[1]), :symbolize_names => true

  p TextAlignment.glcs_required?(str, dictionary)
end
|
@@ -2,145 +2,145 @@
|
|
2
2
|
require 'text_alignment/lcs_min'
|
3
3
|
|
4
4
|
# Builds a position map from +str1+ to +str2+ out of an sdiff (a sequence of
# change records whose +action+ is '=', '!', '-' or '+').
#
# After construction:
# * +position_map_begin+ / +position_map_end+ are Hashes, ordered by key, from
#   positions in +str1+ to positions in +str2+ (or +nil+ for the non-leading
#   positions of a replaced region).
# * +common_elements+ is an Array of <tt>[str1_char, str2_char]</tt> pairs for
#   the '=' records.
# * +mapped_elements+ is an Array of <tt>[str1_fragment, str2_fragment]</tt>
#   pairs for replaced regions.
class TextAlignment::LCSAlignment
  attr_reader :position_map_begin, :position_map_end
  attr_reader :common_elements, :mapped_elements

  # str1, str2 - the two strings to align.
  # lcs        - accepted for interface compatibility; not used by this class.
  # sdiff      - precomputed change records; when nil they are obtained from
  #              TextAlignment::LCSMin.new(str1, str2).sdiff.
  #
  # Raises ArgumentError when str1 or str2 is nil.
  def initialize(str1, str2, lcs = nil, sdiff = nil)
    raise ArgumentError, "nil string" if str1.nil? || str2.nil?

    sdiff = TextAlignment::LCSMin.new(str1, str2).sdiff if sdiff.nil?
    _compute_position_map(str1, str2, sdiff)
  end

  private

  # Walks the change records once. Positions reported by '!', '-' and '+'
  # records are buffered until the next '=' record, or the end of the strings,
  # provides an anchor that closes the gap.
  def _compute_position_map(str1, str2, sdiff)
    posmap_begin, posmap_end = {}, {}
    @common_elements, @mapped_elements = [], []

    addition, deletion = [], []

    sdiff.each do |h|
      case h.action
      when '='
        p1, p2 = h.old_position, h.new_position

        @common_elements << [str1[p1], str2[p2]]
        _close_gap(str1, str2, p1, p2, addition, deletion, posmap_begin, posmap_end)

        addition.clear
        deletion.clear
      when '!'
        deletion << h.old_position
        addition << h.new_position
      when '-'
        deletion << h.old_position
      when '+'
        addition << h.new_position
      end
    end

    # A gap still open at the end is closed against the end-of-string positions.
    _close_gap(str1, str2, str1.length, str2.length, addition, deletion, posmap_begin, posmap_end)

    @position_map_begin = posmap_begin.sort.to_h
    @position_map_end = posmap_end.sort.to_h
  end

  # Records the anchor p1 -> p2 and resolves the positions buffered since the
  # previous anchor. Writes into posmap_begin / posmap_end and may append to
  # @mapped_elements; the caller is responsible for clearing the buffers.
  def _close_gap(str1, str2, p1, p2, addition, deletion, posmap_begin, posmap_end)
    posmap_begin[p1], posmap_end[p1] = p2, p2

    if deletion.empty?
      # Pure insertion: correct the position for end so that a span ending at
      # p1 stops before the inserted text (not applied at the start of str1).
      posmap_end[p1] = p2 - addition.length unless addition.empty? || p1 == 0
    elsif addition.empty?
      # Pure deletion: every deleted position collapses onto the anchor.
      deletion.each { |p| posmap_begin[p], posmap_end[p] = p2, p2 }
    else
      # Replacement: the first replaced position maps to the first new
      # position; the remaining replaced positions have no counterpart.
      @mapped_elements << [str1[deletion[0], deletion.length], str2[addition[0], addition.length]]

      posmap_begin[deletion[0]], posmap_end[deletion[0]] = addition[0], addition[0]
      deletion[1..-1].each { |p| posmap_begin[p], posmap_end[p] = nil, nil }
    end
  end

end
|
76
76
|
|
77
77
|
# Manual smoke test: runs only when this file is executed directly.
# Prints the begin and end position maps for one sample pair of strings.
if $PROGRAM_NAME == __FILE__

  # Other sample pairs that can be swapped in for the active one below:
  #
  # source_text = "TGF-β mRNA"
  # target_text = "TGF-beta mRNA"
  #
  # source_text = "TGF-beta mRNA"
  # target_text = "TGF-β mRNA"
  #
  # source_text = "TGF-beta mRNA"
  # target_text = "TGF- mRNA"
  #
  # source_text = "TGF-β–induced"
  # target_text = "TGF-beta-induced"

  source_text = 'abxyzcd'
  target_text = 'abcd'

  # source_text = "TGF-beta-induced"
  # target_text = "TGF-β–induced"
  #
  # source_text = "beta-induced"
  # target_text = "TGF-beta-induced"
  #
  # source_text = "TGF-beta-induced"
  # target_text = "beta-induced"
  #
  # source_text = "TGF-β–β induced"
  # target_text = "TGF-beta-beta induced"
  #
  # source_text = "-βκ-"
  # target_text = "-betakappa-"
  #
  # source_text = "-betakappa-beta-z"
  # target_text = "-βκ-β–z"
  #
  # source_text = "affect C/EBP-β’s ability"
  # target_text = "affect C/EBP-beta's ability"
  #
  # source_text = "12 ± 34"
  # target_text = "12 +/- 34"
  #
  # source_text = "TGF-β–treated"
  # target_text = "TGF-beta-treated"
  #
  # source_text = "in TGF-β–treated cells"
  # target_text = "in TGF-beta-treated cells"
  #
  # source_text = "TGF-β–induced"
  # target_text = "TGF-beta-induced"

  # anns1 = JSON.parse File.read(ARGV[0]), :symbolize_names => true
  # anns2 = JSON.parse File.read(ARGV[1]), :symbolize_names => true

  # aligner = TextAlignment.new(anns1[:text], anns2[:text], [["Δ", "delta"], [" ", " "], ["–", "-"], ["′", "'"]])
  # denotations = aligner.transform_denotations(anns1[:denotations])

  # Sample annotation JSON; it is only parsed by the disabled line below.
  denotations_s = <<-'ANN'
[{"id":"T0", "span":{"begin":1,"end":2}, "category":"Protein"}]
  ANN

  # denotations = JSON.parse denotations_s, :symbolize_names => true

  alignment = TextAlignment::LCSAlignment.new(source_text, target_text)
  p alignment.position_map_begin
  puts "-----"
  p alignment.position_map_end
  # aligner = TextAlignment.new(source_text, target_text, [["Δ", "delta"], [" ", " "], ["–", "-"], ["′", "'"], ["β", "beta"]])

  # p denotations
end
|