text_alignment 0.2.9 → 0.3.14
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/align_annotations +225 -39
- data/lib/text_alignment/anchor_finder.rb +146 -0
- data/lib/text_alignment/approximate_fit.rb +50 -52
- data/lib/text_alignment/find_divisions.rb +198 -200
- data/lib/text_alignment/glcs_alignment.rb +297 -297
- data/lib/text_alignment/glcs_alignment_fast.rb +94 -94
- data/lib/text_alignment/glcs_required.rb +50 -50
- data/lib/text_alignment/lcs_alignment.rb +115 -115
- data/lib/text_alignment/lcs_cdiff.rb +46 -48
- data/lib/text_alignment/lcs_comparison.rb +53 -53
- data/lib/text_alignment/lcs_min.rb +144 -138
- data/lib/text_alignment/mappings.rb +68 -69
- data/lib/text_alignment/mixed_alignment.rb +193 -0
- data/lib/text_alignment/text_alignment.rb +232 -174
- data/lib/text_alignment/version.rb +1 -1
- data/text_alignment.gemspec +1 -1
- metadata +5 -13
- data/spec/spec_helper.rb +0 -1
- data/spec/text_alignment/glcs_alignment_spec.rb +0 -302
- data/spec/text_alignment/lcs_alignment_spec.rb +0 -98
- data/spec/text_alignment/lcs_comparision_spec.rb +0 -322
- data/spec/text_alignment/text_alignment_spec.rb +0 -302
@@ -3,59 +3,57 @@ require 'diff-lcs'
|
|
3
3
|
|
4
4
|
module TextAlignment; end unless defined? TextAlignment
|
5
5
|
|
6
|
-
|
7
|
-
NIL_CHARACTER = '_'
|
8
|
-
end
|
6
|
+
TextAlignment::NIL_CHARACTER = '_' unless defined? TextAlignment::NIL_CHARACTER
|
9
7
|
|
10
8
|
class << TextAlignment
|
11
9
|
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
10
|
+
# Produces a character-level diff of two strings, rendered as text.
#
# The strings are compared with Diff::LCS.sdiff and the result is handed to
# sdiff2cdiff for rendering.
#
# @param str1 [String] the first ("old") string
# @param str2 [String] the second ("new") string
# @return whatever sdiff2cdiff returns for the computed sdiff
# @raise [ArgumentError] if either string is nil
# @raise [RuntimeError] if either string contains TextAlignment::NIL_CHARACTER
def cdiff(str1, str2)
  inputs = [str1, str2]
  raise ArgumentError, "nil string" if inputs.any?(&:nil?)

  nil_char = TextAlignment::NIL_CHARACTER
  if inputs.any? { |text| text.index(nil_char) }
    raise "a nil character appears in the input string"
  end

  char_sdiff = Diff::LCS.sdiff(str1, str2)
  sdiff2cdiff(char_sdiff)
end
|
15
|
+
|
16
|
+
# Renders an sdiff (a sequence of change objects responding to #action,
# #old_element and #new_element) as two aligned lines of text separated by
# a ">>>>><<<<<" line.
#
# Per action:
#   '=' : the element is appended to both lines
#   '!' : line 1 gets old element + filler, line 2 gets filler + new element,
#         so a substitution occupies two columns on each line
#   '-' : line 1 gets the old element, line 2 gets the filler
#   '+' : line 1 gets the filler, line 2 gets the new element
# Entries with any other action are ignored. Newlines inside either line are
# shown as spaces so that each side stays on a single output line.
#
# The lines are built by appending in place (String#<<) rather than with +=,
# which would copy the whole accumulated string for every element.
#
# @param sdiff [Enumerable] the change objects to render
# @return [String] line 1, the separator line, and line 2
# @raise [ArgumentError] if sdiff is nil
def sdiff2cdiff(sdiff)
  raise ArgumentError, "nil sdiff" if sdiff.nil?

  filler = TextAlignment::NIL_CHARACTER
  # ''.dup yields a mutable buffer even under frozen_string_literal.
  line1 = ''.dup
  line2 = ''.dup

  sdiff.each do |h|
    case h.action
    when '='
      line1 << h.old_element
      line2 << h.new_element
    when '!'
      line1 << h.old_element << filler
      line2 << filler << h.new_element
    when '-'
      line1 << h.old_element
      line2 << filler
    when '+'
      line1 << filler
      line2 << h.new_element
    end
  end

  line1.tr("\n", ' ') + "\n>>>>><<<<<\n" + line2.tr("\n", ' ')
end
|
42
40
|
|
43
41
|
end
|
44
42
|
|
45
43
|
# Demo driver, executed only when this file is run directly.
# Uses two built-in sample strings unless exactly two command-line arguments
# are given, in which case each argument is read as a JSON file and its
# "text" member is used instead.
if __FILE__ == $0
  require 'json'

  str1, str2 = 'abcde', 'naxbyzabcdexydzem'

  if ARGV.length == 2
    str1, str2 = ARGV.map { |path| JSON.parse(File.read(path).strip)["text"] }
  end

  puts "string 1: #{str1}", "-----"
  puts "string 2: #{str2}", "-----"
  puts "[cdiff]"
  puts TextAlignment::cdiff(str1, str2)
end
|
@@ -4,60 +4,60 @@ require 'text_alignment/lcs_min'
|
|
4
4
|
module TextAlignment; end unless defined? TextAlignment
|
5
5
|
|
6
6
|
class TextAlignment::LCSComparison
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
7
|
+
# The similarity ratio of the given two strings after stripping unmatched prefixes and suffixes
|
8
|
+
attr_reader :similarity
|
9
|
+
|
10
|
+
# The initial and final matching positions of str1 and str2
|
11
|
+
attr_reader :str1_match_initial, :str1_match_final, :str2_match_initial, :str2_match_final
|
12
|
+
|
13
|
+
# Stores the two strings and runs the comparison via _lcs_comparison.
#
# @param str1 [String] the first string
# @param str2 [String] the second string
# @param lcs passed through unchanged to _lcs_comparison (defaults to nil)
# @param sdiff passed through unchanged to _lcs_comparison (defaults to nil)
# @raise [ArgumentError] if either string is nil
def initialize(str1, str2, lcs = nil, sdiff = nil)
  if str1.nil? || str2.nil?
    raise ArgumentError, "nil string"
  end

  @str1 = str1
  @str2 = str2
  _lcs_comparison(str1, str2, lcs, sdiff)
end
|
18
|
+
|
19
|
+
private
|
20
|
+
|
21
|
+
# Computes the match span of each string and the similarity ratio, storing
# them in @str1_match_initial/@str1_match_final, @str2_match_initial/
# @str2_match_final and @similarity.
#
# When lcs is nil, both lcs and sdiff are obtained from TextAlignment::LCSMin.
# The same happens when a positive lcs is supplied without an sdiff, because
# the match positions can only be read from an sdiff (previously that
# combination failed with NoMethodError on nil).
#
# With lcs > 0 the span of each string runs from the first to the last '='
# entry of sdiff, and
#   similarity = 2 * lcs / (length of str1 span + length of str2 span)
# Otherwise all four positions and the similarity are set to 0.
#
# @param str1 [String] the first string
# @param str2 [String] the second string
# @param lcs [Integer, nil] a precomputed LCS length, or nil to compute it
# @param sdiff [Array, nil] the sdiff matching lcs; entries respond to
#   #action, #old_position and #new_position
def _lcs_comparison(str1, str2, lcs = nil, sdiff = nil)
  if lcs.nil? || (lcs > 0 && sdiff.nil?)
    lcsmin = TextAlignment::LCSMin.new(str1, str2)
    lcs = lcsmin.lcs
    sdiff = lcsmin.sdiff
  end

  if lcs > 0
    first_match = sdiff[sdiff.index { |d| d.action == '=' }]
    last_match = sdiff[sdiff.rindex { |d| d.action == '=' }]

    @str1_match_initial = first_match.old_position
    @str2_match_initial = first_match.new_position
    @str1_match_final = last_match.old_position
    @str2_match_final = last_match.new_position

    span1 = @str1_match_final - @str1_match_initial + 1
    span2 = @str2_match_final - @str2_match_initial + 1
    @similarity = 2 * lcs / (span1 + span2).to_f
  else
    @str1_match_initial = 0
    @str2_match_initial = 0
    @str1_match_final = 0
    @str2_match_final = 0
    @similarity = 0
  end
end
|
45
45
|
end
|
46
46
|
|
47
47
|
# Demo driver, executed only when this file is run directly.
# Compares two built-in sample strings unless exactly two command-line
# arguments are given, in which case each argument is read as a JSON file and
# its "text" member is used instead. Prints the similarity, the match span of
# each string, and the matched slice of each string in brackets.
if __FILE__ == $0
  require 'json'

  str1, str2 = 'naxbyzabcdexydzem', 'abcde'

  if ARGV.length == 2
    str1, str2 = ARGV.map { |path| JSON.parse(File.read(path).strip)["text"] }
  end

  comparison = TextAlignment::LCSComparison.new(str1, str2)

  puts "Similarity: #{comparison.similarity}"
  puts "String 1 match: (#{comparison.str1_match_initial}, #{comparison.str1_match_final})"
  puts "String 2 match: (#{comparison.str2_match_initial}, #{comparison.str2_match_final})"

  [
    [str1, comparison.str1_match_initial, comparison.str1_match_final],
    [str2, comparison.str2_match_initial, comparison.str2_match_final]
  ].each do |text, first, last|
    puts "-----"
    puts '[' + text[first .. last] + ']'
  end
end
|
@@ -5,156 +5,162 @@ module TextAlignment; end unless defined? TextAlignment
|
|
5
5
|
|
6
6
|
# Reopens diff-lcs's ContextChange so that its two position attributes can be
# reassigned after construction.
class Diff::LCS::ContextChange
  attr_accessor :old_position
  attr_accessor :new_position
end
|
10
10
|
|
11
11
|
# It finds minimal lcs and sdiff of the given strings, str1 and str2.
|
12
12
|
# It relies on the diff-lcs gem for the computation of lcs table.
|
13
13
|
class TextAlignment::LCSMin
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
14
|
+
attr_reader :sdiff, :lcs, :m1_initial, :m1_final, :m2_initial, :m2_final
|
15
|
+
|
16
|
+
PLACEHOLDER_CHAR = '_'
|
17
|
+
|
18
|
+
# Finds the minimal matching range between str1 and str2 and builds an sdiff
# covering both strings in full.
#
# Results are exposed through instance variables:
#   @m1_initial/@m1_final, @m2_initial/@m2_final : the range found by
#     _find_min_range in str1 and str2 (left unset when no range is found)
#   @sdiff : Diff::LCS.sdiff of the two ranges, with positions shifted to be
#     relative to the whole strings, padded with '-' entries for the parts of
#     str1 outside its range and '+' entries for the parts of str2 outside
#     its range; nil when no range is found
#   @lcs   : number of '=' entries in the sdiff of the ranges; 0 when no
#     range is found
#
# The range search runs on a copy of str2 whose whitespace characters are
# replaced by PLACEHOLDER_CHAR, to avoid overfitting to whitespace. The sdiff
# itself, including the padding entries, is built from the original str2 so
# that every element it reports is the character actually present in the
# input. (The padding entries were previously taken from the masked copy and
# therefore reported whitespace as the placeholder.)
#
# @param str1 [String] the first string
# @param str2 [String] the second string
# @raise [ArgumentError] if either string is nil or empty
def initialize(str1, str2)
  raise ArgumentError, "nil string" if str1.nil? || str2.nil?
  raise ArgumentError, "empty string" if str1.empty? || str2.empty?

  @str1 = str1
  @str2 = str2.gsub(/\s/, PLACEHOLDER_CHAR)

  # find the corresponding minimal range of the two strings
  range = _find_min_range(0, @str1.length - 1, 0, @str2.length - 1)
  unless range.nil?
    @m1_initial, @m1_final, @m2_initial, @m2_final =
      range.values_at(:m1_initial, :m1_final, :m2_initial, :m2_final)
  end

  if range.nil? || @m1_initial.nil?
    @sdiff = nil
    @lcs = 0
    return
  end

  # sdiff and lcs of the matched ranges, on the original (unmasked) str2
  @sdiff = Diff::LCS.sdiff(@str1[@m1_initial..@m1_final], str2[@m2_initial..@m2_final])
  @lcs = @sdiff.count { |d| d.action == '=' }

  # positions reported by sdiff are relative to the slices; make them absolute
  @sdiff.each do |h|
    h.old_position += @m1_initial unless h.old_position.nil?
    h.new_position += @m2_initial unless h.new_position.nil?
  end

  # pad with the unmatched head and tail of both strings
  (0...@m2_initial).reverse_each { |i| @sdiff.unshift(Diff::LCS::ContextChange.new('+', nil, nil, i, str2[i])) }
  (0...@m1_initial).reverse_each { |i| @sdiff.unshift(Diff::LCS::ContextChange.new('-', i, @str1[i], nil, nil)) }
  (@m1_final + 1...@str1.length).each { |i| @sdiff.push(Diff::LCS::ContextChange.new('-', i, @str1[i], nil, nil)) }
  (@m2_final + 1...str2.length).each { |i| @sdiff.push(Diff::LCS::ContextChange.new('+', nil, nil, i, str2[i])) }
end
|
59
|
+
|
60
|
+
# Recursively narrows the inclusive ranges [m1_initial, m1_final] of @str1 and
# [m2_initial, m2_final] of @str2 while the number of '=' entries in their
# sdiff does not fall below clcs.
#
# Each call first clips both ranges to the first and last '=' entry of the
# sdiff. It then tries to drop one character from the front and, failing
# that, from the back: of the @str1 range when that range is the longer one,
# of the @str2 range otherwise. The first attempt that succeeds is returned.
#
# @param m1_initial [Integer] first index of the @str1 range
# @param m1_final [Integer] last index of the @str1 range
# @param m2_initial [Integer] first index of the @str2 range
# @param m2_final [Integer] last index of the @str2 range
# @param clcs [Integer] the '=' count that a narrower range must still reach
# @return [Hash, nil] the narrowed bounds keyed by :m1_initial, :m1_final,
#   :m2_initial and :m2_final; nil if either range is empty, nothing matches,
#   or the '=' count is below clcs
def _find_min_range(m1_initial, m1_final, m2_initial, m2_final, clcs = 0)
  return nil if m1_final < m1_initial || m2_final < m2_initial

  slice1 = @str1[m1_initial..m1_final]
  slice2 = @str2[m2_initial..m2_final]
  matches = Diff::LCS.sdiff(slice1, slice2).select { |d| d.action == '=' }
  lcs = matches.length

  return nil if lcs.zero? || lcs < clcs

  first_match = matches.first
  last_match = matches.last

  # the final bounds are offsets from the incoming initial bounds, so they
  # are computed before the initial bounds are moved
  m1_final = last_match.old_position + m1_initial
  m2_final = last_match.new_position + m2_initial
  m1_initial += first_match.old_position
  m2_initial += first_match.new_position

  # attempt for shorter match
  candidates =
    if (m1_final - m1_initial) > (m2_final - m2_initial)
      [[m1_initial + 1, m1_final, m2_initial, m2_final],
       [m1_initial, m1_final - 1, m2_initial, m2_final]]
    else
      [[m1_initial, m1_final, m2_initial + 1, m2_final],
       [m1_initial, m1_final, m2_initial, m2_final - 1]]
    end

  candidates.each do |bounds|
    narrowed = _find_min_range(*bounds, lcs)
    return narrowed unless narrowed.nil?
  end

  {
    m1_initial: m1_initial,
    m1_final: m1_final,
    m2_initial: m2_initial,
    m2_final: m2_final
  }
end
|
96
|
+
|
97
|
+
# Counts the "big" gaps in sdiff[initial..last].
#
# Two lists of gap lengths are kept, one per side. A gap is a run of
# consecutive non-'=' entries that affect that side:
#   '+' contributes to the first list,
#   '-' contributes to the second list,
#   '!' contributes to both,
#   '=' closes any open gap on both sides.
# A gap is big when its length exceeds MAX_LEN_BIG_GAP.
#
# A run of '!' entries is accumulated into a single gap on both sides, the
# same way runs of '+' and '-' are. (Previously every '!' started a new gap
# in the first list and closed the gap in the second, so substitutions could
# never add up to a big gap.)
#
# NOTE(review): MAX_LEN_BIG_GAP is not defined in the part of the file visible
# here; confirm it is resolvable from this class.
#
# @param sdiff [Array] change objects responding to #action
# @param initial [Integer] index of the first entry to examine
# @param last [Integer] index of the last entry to examine (inclusive)
# @return [Integer] the number of big gaps, both lists added together
# @raise [ArgumentError] if sdiff is nil or last < initial
def num_big_gaps(sdiff, initial, last)
  raise ArgumentError, "nil sdiff" if sdiff.nil?
  raise ArgumentError, "invalid indice: #{initial}, #{last}" unless last >= initial

  gaps1 = []
  gaps2 = []
  open1 = false # a gap in gaps1 is currently being extended
  open2 = false # a gap in gaps2 is currently being extended

  (initial..last).each do |i|
    action = sdiff[i].action

    if action == '='
      open1 = false
      open2 = false
      next
    end

    if action == '!' || action == '+'
      if open1
        gaps1[-1] += 1
      else
        gaps1 << 1
      end
      open1 = true
    end

    if action == '!' || action == '-'
      if open2
        gaps2[-1] += 1
      else
        gaps2 << 1
      end
      open2 = true
    end
  end

  gaps1.count { |g| g > MAX_LEN_BIG_GAP } + gaps2.count { |g| g > MAX_LEN_BIG_GAP }
end
|
136
142
|
|
137
143
|
end
|
138
144
|
|
139
145
|
|
140
146
|
# Demo driver, executed only when this file is run directly.
# Aligns a built-in sample (a citation title against a full abstract) unless
# exactly two command-line arguments are given, in which case each argument is
# read as a JSON file and its "text" member is used instead. The resulting
# sdiff is printed through TextAlignment.sdiff2cdiff.
#
# Two short sample assignments that were overwritten immediately, and
# commented-out debugging statements, have been removed.
if __FILE__ == $0
  require 'json'
  require 'text_alignment/lcs_cdiff'

  str1 = "TI - Identification of a region which directs the monocytic activity of the\n colony-stimulating factor 1 (macrophage colony-stimulating factor) receptor\n promoter and binds PEBP2/CBF (AML1)."
  str2 = "Identification of a region which directs the monocytic activity of the colony-stimulating factor 1 (macrophage colony-stimulating factor) receptor promoter and binds PEBP2/CBF (AML1).\nThe receptor for the macrophage colony-stimulating factor (or colony-stimulating factor 1 [CSF-1]) is expressed from different promoters in monocytic cells and placental trophoblasts. We have demonstrated that the monocyte-specific expression of the CSF-1 receptor is regulated at the level of transcription by a tissue-specific promoter whose activity is stimulated by the monocyte/B-cell-specific transcription factor PU.1 (D.-E. Zhang, C.J. Hetherington, H.-M. Chen, and D.G. Tenen, Mol. Cell. Biol. 14:373-381, 1994). Here we report that the tissue specificity of this promoter is also mediated by sequences in a region II (bp -88 to -59), which lies 10 bp upstream from the PU.1-binding site. When analyzed by DNase footprinting, region II was protected preferentially in monocytic cells. Electrophoretic mobility shift assays confirmed that region II interacts specifically with nuclear proteins from monocytic cells. Two gel shift complexes (Mono A and Mono B) were formed with separate sequence elements within this region. Competition and supershift experiments indicate that Mono B contains a member of the polyomavirus enhancer-binding protein 2/core-binding factor (PEBP2/CBF) family, which includes the AML1 gene product, while Mono A is a distinct complex preferentially expressed in monocytic cells. Promoter constructs with mutations in these sequence elements were no longer expressed specifically in monocytes. Furthermore, multimerized region II sequence elements enhanced the activity of a heterologous thymidine kinase promoter in monocytic cells but not other cell types tested. 
These results indicate that the monocyte/B-cell-specific transcription factor PU.1 and the Mono A and Mono B protein complexes act in concert to regulate monocyte-specific transcription of the CSF-1 receptor."

  if ARGV.length == 2
    str1 = JSON.parse(File.read(ARGV[0]).strip)["text"]
    str2 = JSON.parse(File.read(ARGV[1]).strip)["text"]
  end

  lcsmin = TextAlignment::LCSMin.new(str1, str2)
  puts TextAlignment.sdiff2cdiff(lcsmin.sdiff)
end
|