text_alignment 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +1 -0
- data/Gemfile +11 -0
- data/Gemfile.lock +30 -0
- data/LICENSE.txt +22 -0
- data/README.md +27 -0
- data/lib/text_alignment.rb +1 -0
- data/lib/text_alignment/approximate_fit.rb +61 -0
- data/lib/text_alignment/find_divisions.rb +117 -0
- data/lib/text_alignment/glcs_alignment.rb +311 -0
- data/lib/text_alignment/glcs_alignment_fast.rb +114 -0
- data/lib/text_alignment/glcs_required.rb +68 -0
- data/lib/text_alignment/lcs_alignment.rb +146 -0
- data/lib/text_alignment/lcs_cdiff.rb +61 -0
- data/lib/text_alignment/lcs_comparison.rb +63 -0
- data/lib/text_alignment/lcs_min.rb +160 -0
- data/lib/text_alignment/mappings.rb +75 -0
- data/lib/text_alignment/text_alignment.rb +223 -0
- data/lib/text_alignment/version.rb +3 -0
- data/spec/spec_helper.rb +1 -0
- data/spec/text_alignment/glcs_alignment_spec.rb +302 -0
- data/spec/text_alignment/lcs_alignment_spec.rb +98 -0
- data/spec/text_alignment/lcs_comparision_spec.rb +322 -0
- data/spec/text_alignment/text_alignment_spec.rb +302 -0
- data/text_alignment.gemspec +22 -0
- metadata +108 -0
@@ -0,0 +1,160 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'diff-lcs'
|
3
|
+
|
4
|
+
# Make sure the TextAlignment namespace exists before anything is added to it.
unless defined?(TextAlignment)
  module TextAlignment
  end
end

# Reopen the ContextChange class of diff-lcs and add writers (next to the
# readers) for its two position attributes, so that they can be reassigned
# on an existing change object.
Diff::LCS::ContextChange.class_eval do
  attr_accessor :old_position, :new_position
end
|
10
|
+
|
11
|
+
# Finds the minimal ranges of two strings, str1 and str2, that still cover
# their longest common subsequence (LCS), together with the sdiff of the two
# strings. The LCS computation itself is delegated to the diff-lcs gem.
#
# Readers:
#   sdiff                  - Array of Diff::LCS::ContextChange covering both
#                            strings entirely, or nil when nothing matches
#   lcs                    - number of '=' entries within the matched ranges
#   m1_initial / m1_final  - first / last matched position in str1 (or nil)
#   m2_initial / m2_final  - first / last matched position in str2 (or nil)
class TextAlignment::LCSMin
  attr_reader :sdiff, :lcs, :m1_initial, :m1_final, :m2_initial, :m2_final

  # Replaces every whitespace character of str2 while the minimal range is
  # being searched.
  PLACEHOLDER_CHAR = '_'

  # @param str1 [String] first string, used as it is
  # @param str2 [String] second string
  # @raise [ArgumentError] if either string is nil or empty
  def initialize(str1, str2)
    raise ArgumentError, "nil string" if str1.nil? || str2.nil?
    raise ArgumentError, "empty string" if str1.empty? || str2.empty?

    # str1 is kept as it is.
    # str2 is kept with w/s characters replaced by the placeholder character,
    # to avoid overfitting to w/s characters during the LCS computation.
    @str1 = str1
    @str2 = str2.gsub(/\s/, PLACEHOLDER_CHAR)

    # Find the corresponding minimal ranges of the two strings.
    # _find_min_range returns nil when the strings have nothing in common;
    # fall back to an empty hash so that all four bounds become nil instead
    # of failing with NoMethodError on nil.
    range = _find_min_range(0, @str1.length - 1, 0, @str2.length - 1) || {}
    @m1_initial, @m1_final, @m2_initial, @m2_final =
      range.values_at(:m1_initial, :m1_final, :m2_initial, :m2_final)

    if @m1_initial.nil?
      @sdiff = nil
      @lcs = 0
      return
    end

    # Compute sdiff and lcs within the minimal ranges.
    # Here the original str2 is used, with all the w/s characters preserved.
    @sdiff = Diff::LCS.sdiff(@str1[@m1_initial..@m1_final], str2[@m2_initial..@m2_final])
    @lcs = @sdiff.count { |d| d.action == '=' }

    # The positions reported by sdiff are relative to the sliced ranges;
    # shift them back to positions in the whole strings.
    @sdiff.each do |h|
      h.old_position += @m1_initial unless h.old_position.nil?
      h.new_position += @m2_initial unless h.new_position.nil?
    end

    # Pad the sdiff with the characters outside the matched ranges: leading
    # str1 characters as deletions followed by leading str2 characters as
    # additions, then the trailing ones. The original str2 is used here as
    # well, so that w/s characters are not reported as placeholders.
    (0...@m2_initial).reverse_each { |i| @sdiff.unshift(Diff::LCS::ContextChange.new('+', nil, nil, i, str2[i])) }
    (0...@m1_initial).reverse_each { |i| @sdiff.unshift(Diff::LCS::ContextChange.new('-', i, @str1[i], nil, nil)) }
    (@m1_final + 1...@str1.length).each { |i| @sdiff.push(Diff::LCS::ContextChange.new('-', i, @str1[i], nil, nil)) }
    (@m2_final + 1...str2.length).each { |i| @sdiff.push(Diff::LCS::ContextChange.new('+', nil, nil, i, str2[i])) }
  end

  # Recursively narrows the given ranges (inclusive bounds) of @str1 and @str2
  # while the length of their LCS does not drop below clcs.
  #
  # The ranges are first trimmed to the first and last matched characters.
  # Then the side with the longer span is shortened by one character, first
  # from its beginning and then from its end, and the search is repeated.
  #
  # @param clcs [Integer] LCS length that a narrower range has to retain
  # @return [Hash, nil] the bounds under :m1_initial, :m1_final, :m2_initial
  #   and :m2_final; nil when a range is empty, when nothing matches, or when
  #   the LCS is shorter than clcs
  def _find_min_range(m1_initial, m1_final, m2_initial, m2_final, clcs = 0)
    return nil if (m1_final - m1_initial < 0) || (m2_final - m2_initial < 0)

    sdiff = Diff::LCS.sdiff(@str1[m1_initial..m1_final], @str2[m2_initial..m2_final])
    lcs = sdiff.count { |d| d.action == '=' }

    return nil if lcs == 0
    return nil if lcs < clcs

    # The final bounds are computed before the initial ones are overwritten,
    # because both are offsets from the incoming initial positions.
    match_last = sdiff.rindex { |d| d.action == '=' }
    m1_final = sdiff[match_last].old_position + m1_initial
    m2_final = sdiff[match_last].new_position + m2_initial

    match_first = sdiff.index { |d| d.action == '=' }
    m1_initial = sdiff[match_first].old_position + m1_initial
    m2_initial = sdiff[match_first].new_position + m2_initial

    # attempt for shorter match
    shorter =
      if (m1_final - m1_initial) > (m2_final - m2_initial)
        _find_min_range(m1_initial + 1, m1_final, m2_initial, m2_final, lcs) ||
          _find_min_range(m1_initial, m1_final - 1, m2_initial, m2_final, lcs)
      else
        _find_min_range(m1_initial, m1_final, m2_initial + 1, m2_final, lcs) ||
          _find_min_range(m1_initial, m1_final, m2_initial, m2_final - 1, lcs)
      end
    return shorter unless shorter.nil?

    {
      m1_initial: m1_initial,
      m1_final: m1_final,
      m2_initial: m2_initial,
      m2_final: m2_final
    }
  end

  # Counts the gaps between sdiff[initial] and sdiff[last] (inclusive) that
  # are longer than a threshold. Two gap lists are kept: one fed by '+' and
  # '!' entries, the other by '-' and '!' entries.
  #
  # @param sdiff [Array] sdiff entries responding to #action
  # @param initial [Integer] index of the first entry to look at
  # @param last [Integer] index of the last entry to look at
  # @param max_len [Integer, nil] gaps longer than this are counted; when
  #   omitted, the constant MAX_LEN_BIG_GAP is used
  # @return [Integer] the number of big gaps in both lists together
  # @raise [ArgumentError] if sdiff is nil or last < initial
  #
  # NOTE(review): MAX_LEN_BIG_GAP is not defined in this class; confirm that
  # it is reachable from here, or pass max_len explicitly.
  def num_big_gaps(sdiff, initial, last, max_len = nil)
    raise ArgumentError, "nil sdiff" if sdiff.nil?
    raise ArgumentError, "invalid indice: #{initial}, #{last}" unless last >= initial

    state1 = :initial
    state2 = :initial
    gaps1 = []
    gaps2 = []

    (initial..last).each do |i|
      case sdiff[i].action
      when '='
        state1 = :continue
        state2 = :continue
      when '!'
        # NOTE(review): unlike '+' and '-', a substitution always opens a new
        # gap in gaps1 and leaves state2 at :continue, so a run of
        # substitutions never grows a gap - confirm that this is intended.
        gaps1 << 1
        state1 = :break

        if state2 == :break
          gaps2[-1] += 1
        else
          gaps2 << 1
        end
        state2 = :continue
      when '+'
        if state1 == :break
          gaps1[-1] += 1
        else
          gaps1 << 1
        end
        state1 = :break
      when '-'
        if state2 == :break
          gaps2[-1] += 1
        else
          gaps2 << 1
        end
        state2 = :break
      end
    end

    # The constant is only looked up when no explicit threshold was given.
    threshold = max_len.nil? ? MAX_LEN_BIG_GAP : max_len
    gaps1.count { |g| g > threshold } + gaps2.count { |g| g > threshold }
  end

end
|
138
|
+
|
139
|
+
|
140
|
+
# Manual check: when this file is run directly, the cdiff of two texts is
# printed. With exactly two command-line arguments, each one is read as a
# JSON file and its "text" member is used; otherwise the built-in sample
# below is aligned.
if __FILE__ == $0
  require 'json'
  require 'text_alignment/lcs_cdiff'

  # A tiny alternative sample for quick experiments:
  # str1 = 'naxbyzabcdexydzem'
  # str2 = 'abcde'

  str1 = "TI - Identification of a region which directs the monocytic activity of the\n colony-stimulating factor 1 (macrophage colony-stimulating factor) receptor\n promoter and binds PEBP2/CBF (AML1)."
  str2 = "Identification of a region which directs the monocytic activity of the colony-stimulating factor 1 (macrophage colony-stimulating factor) receptor promoter and binds PEBP2/CBF (AML1).\nThe receptor for the macrophage colony-stimulating factor (or colony-stimulating factor 1 [CSF-1]) is expressed from different promoters in monocytic cells and placental trophoblasts. We have demonstrated that the monocyte-specific expression of the CSF-1 receptor is regulated at the level of transcription by a tissue-specific promoter whose activity is stimulated by the monocyte/B-cell-specific transcription factor PU.1 (D.-E. Zhang, C.J. Hetherington, H.-M. Chen, and D.G. Tenen, Mol. Cell. Biol. 14:373-381, 1994). Here we report that the tissue specificity of this promoter is also mediated by sequences in a region II (bp -88 to -59), which lies 10 bp upstream from the PU.1-binding site. When analyzed by DNase footprinting, region II was protected preferentially in monocytic cells. Electrophoretic mobility shift assays confirmed that region II interacts specifically with nuclear proteins from monocytic cells. Two gel shift complexes (Mono A and Mono B) were formed with separate sequence elements within this region. Competition and supershift experiments indicate that Mono B contains a member of the polyomavirus enhancer-binding protein 2/core-binding factor (PEBP2/CBF) family, which includes the AML1 gene product, while Mono A is a distinct complex preferentially expressed in monocytic cells. Promoter constructs with mutations in these sequence elements were no longer expressed specifically in monocytes. Furthermore, multimerized region II sequence elements enhanced the activity of a heterologous thymidine kinase promoter in monocytic cells but not other cell types tested. 
These results indicate that the monocyte/B-cell-specific transcription factor PU.1 and the Mono A and Mono B protein complexes act in concert to regulate monocyte-specific transcription of the CSF-1 receptor."

  # Two arguments: take the texts from the given JSON files instead.
  if ARGV.length == 2
    str1, str2 = ARGV.map { |path| JSON.parse(File.read(path).strip)["text"] }
  end

  lcsmin = TextAlignment::LCSMin.new(str1, str2)
  puts TextAlignment.sdiff2cdiff(lcsmin.sdiff)
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
module TextAlignment

  # Table of [character, ASCII rendering] string pairs.
  # Each comment gives the code point and Unicode name of the first element.
  # Invisible characters are written as \u escapes so that they cannot be
  # confused with (or silently normalised to) an ordinary space.
  TextAlignment::MAPPINGS = [
    ["©", "(c)"],        #U+00A9 (Copyright Sign)

    ["α", "alpha"],      #U+03B1 (greek small letter alpha)
    ["β", "beta"],       #U+03B2 (greek small letter beta)
    ["γ", "gamma"],      #U+03B3 (greek small letter gamma)
    ["δ", "delta"],      #U+03B4 (greek small letter delta)
    ["ε", "epsilon"],    #U+03B5 (greek small letter epsilon)
    ["ζ", "zeta"],       #U+03B6 (greek small letter zeta)
    ["η", "eta"],        #U+03B7 (greek small letter eta)
    ["θ", "theta"],      #U+03B8 (greek small letter theta)
    ["ι", "iota"],       #U+03B9 (greek small letter iota)
    ["κ", "kappa"],      #U+03BA (greek small letter kappa)
    ["λ", "lambda"],     #U+03BB (greek small letter lambda)
    ["λ", "lamda"],      #U+03BB (greek small letter lambda), alternative spelling
    ["μ", "mu"],         #U+03BC (greek small letter mu)
    ["ν", "nu"],         #U+03BD (greek small letter nu)
    ["ξ", "xi"],         #U+03BE (greek small letter xi)
    ["ο", "omicron"],    #U+03BF (greek small letter omicron)
    ["π", "pi"],         #U+03C0 (greek small letter pi)
    ["ρ", "rho"],        #U+03C1 (greek small letter rho)
    ["σ", "sigma"],      #U+03C3 (greek small letter sigma)
    ["τ", "tau"],        #U+03C4 (greek small letter tau)
    ["υ", "upsilon"],    #U+03C5 (greek small letter upsilon)
    ["φ", "phi"],        #U+03C6 (greek small letter phi)
    ["χ", "chi"],        #U+03C7 (greek small letter chi)
    ["ψ", "psi"],        #U+03C8 (greek small letter psi)
    ["ω", "omega"],      #U+03C9 (greek small letter omega)

    ["Α", "Alpha"],      #U+0391 (greek capital letter alpha)
    ["Β", "Beta"],       #U+0392 (greek capital letter beta)
    ["Γ", "Gamma"],      #U+0393 (greek capital letter gamma)
    ["Δ", "Delta"],      #U+0394 (greek capital letter delta)
    ["Ε", "Epsilon"],    #U+0395 (greek capital letter epsilon)
    ["Ζ", "Zeta"],       #U+0396 (greek capital letter zeta)
    ["Η", "Eta"],        #U+0397 (greek capital letter eta)
    ["Θ", "Theta"],      #U+0398 (greek capital letter theta)
    ["Ι", "Iota"],       #U+0399 (greek capital letter iota)
    ["Κ", "Kappa"],      #U+039A (greek capital letter kappa)
    ["Λ", "Lambda"],     #U+039B (greek capital letter lambda)
    ["Λ", "Lamda"],      #U+039B (greek capital letter lambda), alternative spelling
    ["Μ", "Mu"],         #U+039C (greek capital letter mu)
    ["Ν", "Nu"],         #U+039D (greek capital letter nu)
    ["Ξ", "Xi"],         #U+039E (greek capital letter xi)
    ["Ο", "Omicron"],    #U+039F (greek capital letter omicron)
    ["Π", "Pi"],         #U+03A0 (greek capital letter pi)
    ["Ρ", "Rho"],        #U+03A1 (greek capital letter rho)
    ["Σ", "Sigma"],      #U+03A3 (greek capital letter sigma)
    ["Τ", "Tau"],        #U+03A4 (greek capital letter tau)
    ["Υ", "Upsilon"],    #U+03A5 (greek capital letter upsilon)
    ["Φ", "Phi"],        #U+03A6 (greek capital letter phi)
    ["Χ", "Chi"],        #U+03A7 (greek capital letter chi)
    ["Ψ", "Psi"],        #U+03A8 (greek capital letter psi)
    ["Ω", "Omega"],      #U+03A9 (greek capital letter omega)

    ["ϕ", "phi"],        #U+03D5 (greek phi symbol)

    ["×", "x"],          #U+00D7 (multiplication sign)
    ["•", "*"],          #U+2022 (bullet)
    ["\u2009", " "],     #U+2009 (thin space)
    ["\u200A", " "],     #U+200A (hair space)
    ["\u00A0", " "],     #U+00A0 (no-break space)
    ["\u3000", " "],     #U+3000 (ideographic space)
    ["−", "-"],          #U+2212 (minus sign)
    ["–", "-"],          #U+2013 (en dash)
    ["′", "'"],          #U+2032 (prime)
    ["‘", "'"],          #U+2018 (left single quotation mark)
    ["’", "'"],          #U+2019 (right single quotation mark)
    ["“", '"'],          #U+201C (left double quotation mark)
    ["”", '"'],          #U+201D (right double quotation mark)
    ['"', "''"]          #U+0022 (quotation mark), rendered as two apostrophes
  ]
end
|
@@ -0,0 +1,223 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'diff-lcs'
|
3
|
+
require 'text_alignment/lcs_min'
|
4
|
+
require 'text_alignment/find_divisions'
|
5
|
+
require 'text_alignment/lcs_comparison'
|
6
|
+
require 'text_alignment/lcs_alignment'
|
7
|
+
require 'text_alignment/glcs_alignment'
|
8
|
+
require 'text_alignment/mappings'
|
9
|
+
|
10
|
+
module TextAlignment; end unless defined? TextAlignment

TextAlignment::SIGNATURE_NGRAM = 5 unless defined? TextAlignment::SIGNATURE_NGRAM
# Candidates for the two "nomatch" characters picked in TextAlignment#initialize.
TextAlignment::NOMATCH_CHARS = "@^|#$%&_" unless defined? TextAlignment::NOMATCH_CHARS

# Aligns two strings and builds maps from character positions of str1 to
# character positions of str2, which can then be used to transform spans.
#
# Readers:
#   sdiff                - the sdiff obtained from TextAlignment::LCSMin
#   position_map_begin   - Hash: position in str1 => begin position in str2
#   position_map_end     - Hash: position in str1 => end position in str2
#   common_elements      - pairs of characters matched as equal
#   mapped_elements      - pairs of differing substrings mapped to each other
#   similarity, str1_match_initial, str1_match_final, str2_match_initial,
#   str2_match_final     - values taken over from TextAlignment::LCSComparison
class TextAlignment::TextAlignment
  attr_reader :sdiff
  attr_reader :position_map_begin, :position_map_end
  attr_reader :common_elements, :mapped_elements
  attr_reader :similarity
  attr_reader :str1_match_initial, :str1_match_final, :str2_match_initial, :str2_match_final

  # @param str1 [String] the text whose positions are to be mapped
  # @param str2 [String] the text to map the positions to
  # @param mappings [Array<Array(String, String)>] pairs of equivalent
  #   strings; the array is only read, never modified
  # @raise [ArgumentError] if str1, str2 or mappings is nil
  # @raise [RuntimeError] if no nomatch character can be found
  def initialize(str1, str2, mappings = [])
    raise ArgumentError, "nil string" if str1.nil? || str2.nil?
    raise ArgumentError, "nil mappings" if mappings.nil?

    ## preprocessing
    # Work on copies: the strings are modified in place below.
    str1 = str1.dup
    str2 = str2.dup

    ## find the first nomatch character: one that does not occur in str2
    @nomatch_char1 = TextAlignment::NOMATCH_CHARS.each_char.find { |c| str2.index(c).nil? }
    raise RuntimeError, "Cannot find nomatch character" if @nomatch_char1.nil?

    ## find the second nomatch character: a different one that does not occur in str1
    @nomatch_char2 = TextAlignment::NOMATCH_CHARS.each_char.find { |c| c != @nomatch_char1 && str1.index(c).nil? }
    raise RuntimeError, "Cannot find nomatch character" if @nomatch_char2.nil?

    # Split the mappings without touching the caller's array (which may be
    # the shared TextAlignment::MAPPINGS constant): one-to-one character
    # mappings are applied right here, the rest is handed to the alignment.
    character_mappings, other_mappings = mappings.partition { |m| m[0].length == 1 && m[1].length == 1 }

    # single character mappings
    characters_from = character_mappings.map { |m| m[0] }.join
    characters_to = character_mappings.map { |m| m[1] }.join
    # An unescaped '-' would be read by String#tr as a range operator.
    characters_to.gsub!(/-/, '\-')

    str1.tr!(characters_from, characters_to)
    str2.tr!(characters_from, characters_to)

    # ASCII foldings (replacing a multi-character spelling by its single
    # character padded with a nomatch character) are currently disabled, so
    # the two nomatch characters found above are not used any further here.

    _compute_mixed_alignment(str1, str2, other_mappings)
  end

  # Maps a span of str1 to the corresponding span of str2.
  #
  # @param span [Hash] with :begin and :end positions in str1
  # @return [Hash] with :begin and :end positions in str2
  def transform_a_span(span)
    {:begin=>@position_map_begin[span[:begin]], :end=>@position_map_end[span[:end]]}
  end

  # Maps every span of the given collection with #transform_a_span.
  def transform_spans(spans)
    spans.map{|span| transform_a_span(span)}
  end

  # Rewrites #begin and #end of every denotation in place.
  # Does nothing (and returns nil) when denotations is nil.
  def transform_denotations!(denotations)
    denotations.map!{|d| d.begin = @position_map_begin[d.begin]; d.end = @position_map_end[d.end]; d} unless denotations.nil?
  end

  # Returns copies of the given denotations with their :span transformed.
  # Each denotation is duplicated before its :span is replaced, so neither
  # the given array nor its elements are modified.
  #
  # @param hdenotations [Array, nil] denotations holding a span under :span
  # @return [Array, nil] the transformed copies; nil when nil was given
  def transform_hdenotations(hdenotations)
    return nil if hdenotations.nil?

    hdenotations.map do |denotation|
      transformed = denotation.dup
      transformed[:span] = transform_a_span(denotation[:span])
      transformed
    end
  end

  private

  # Computes the sdiff of the two strings and derives from it the position
  # maps, the common elements and the mapped elements.
  #
  # Additions ('+', positions of str2) and deletions ('-', positions of str1)
  # are collected until the next '=' entry, or the end of the sdiff, where
  # the collected gap is resolved; a '!' entry counts as both.
  def _compute_mixed_alignment(str1, str2, mappings = [])
    lcsmin = TextAlignment::LCSMin.new(str1, str2)
    lcs = lcsmin.lcs
    @sdiff = lcsmin.sdiff

    cmp = TextAlignment::LCSComparison.new(str1, str2, lcs, @sdiff)
    @similarity         = cmp.similarity
    @str1_match_initial = cmp.str1_match_initial
    @str1_match_final   = cmp.str1_match_final
    @str2_match_initial = cmp.str2_match_initial
    @str2_match_final   = cmp.str2_match_final

    posmap_begin, posmap_end = {}, {}
    @common_elements, @mapped_elements = [], []

    addition, deletion = [], []

    @sdiff.each do |h|
      case h.action
      when '='
        p1, p2 = h.old_position, h.new_position

        @common_elements << [str1[p1], str2[p2]]
        posmap_begin[p1], posmap_end[p1] = p2, p2

        if !addition.empty? && deletion.empty?
          # Only str2 has extra characters: a span ending here ends before them.
          posmap_end[p1] = p2 - addition.length unless p1 == 0
        elsif addition.empty? && !deletion.empty?
          # Only str1 has extra characters: all of them map to this match.
          deletion.each{|p| posmap_begin[p], posmap_end[p] = p2, p2}
        elsif !addition.empty? && !deletion.empty?
          if addition.length > 1 || deletion.length > 1
            # Align the two differing stretches with the help of the mappings;
            # the resulting positions are relative to the stretches.
            galign = TextAlignment::GLCSAlignment.new(str1[deletion[0] .. deletion[-1]], str2[addition[0] .. addition[-1]], mappings)
            galign.position_map_begin.each {|k, v| posmap_begin[k + deletion[0]] = v.nil? ? nil : v + addition[0]}
            galign.position_map_end.each   {|k, v|   posmap_end[k + deletion[0]] = v.nil? ? nil : v + addition[0]}
            posmap_begin[p1], posmap_end[p1] = p2, p2
            @common_elements += galign.common_elements
            @mapped_elements += galign.mapped_elements
          else
            posmap_begin[deletion[0]], posmap_end[deletion[0]] = addition[0], addition[0]
            deletion[1..-1].each{|p| posmap_begin[p], posmap_end[p] = nil, nil}
            @mapped_elements << [str1[deletion[0], deletion.length], str2[addition[0], addition.length]]
          end
        end

        addition.clear; deletion.clear

      when '!'
        deletion << h.old_position
        addition << h.new_position
      when '-'
        deletion << h.old_position
      when '+'
        addition << h.new_position
      end
    end

    # Resolve whatever gap is left after the last match, against the
    # positions just past the ends of the two strings.
    p1, p2 = str1.length, str2.length
    posmap_begin[p1], posmap_end[p1] = p2, p2

    if !addition.empty? && deletion.empty?
      posmap_end[p1] = p2 - addition.length unless p1 == 0
    elsif addition.empty? && !deletion.empty?
      deletion.each{|p| posmap_begin[p], posmap_end[p] = p2, p2}
    elsif !addition.empty? && !deletion.empty?
      # NOTE(review): this condition uses '&&' whereas the same step inside
      # the loop above uses '||' - confirm which of the two is intended.
      if addition.length > 1 && deletion.length > 1
        galign = TextAlignment::GLCSAlignment.new(str1[deletion[0] .. deletion[-1]], str2[addition[0] .. addition[-1]], mappings)
        galign.position_map_begin.each {|k, v| posmap_begin[k + deletion[0]] = v.nil? ? nil : v + addition[0]}
        galign.position_map_end.each   {|k, v|   posmap_end[k + deletion[0]] = v.nil? ? nil : v + addition[0]}
        posmap_begin[p1], posmap_end[p1] = p2, p2
        @common_elements += galign.common_elements
        @mapped_elements += galign.mapped_elements
      else
        posmap_begin[deletion[0]], posmap_end[deletion[0]] = addition[0], addition[0]
        deletion[1..-1].each{|p| posmap_begin[p], posmap_end[p] = nil, nil}
        @mapped_elements << [str1[deletion[0], deletion.length], str2[addition[0], addition.length]]
      end
    end

    @position_map_begin = posmap_begin.sort.to_h
    @position_map_end = posmap_end.sort.to_h
  end
end
|
184
|
+
|
185
|
+
# Manual check: when this file is run directly, two texts are aligned with
# TextAlignment::MAPPINGS and the cdiff of the alignment is printed.
if $PROGRAM_NAME == __FILE__
  require 'json'
  require 'text_alignment/lcs_cdiff'

  # Built-in sample pair, used unless two file paths are given.
  str1 = "TI - Identification of a region which directs the monocytic activity of the\n colony-stimulating factor 1 (macrophage colony-stimulating factor) receptor\n promoter and binds PEBP2/CBF (AML1)."
  str2 = "Identification of a region which directs the monocytic activity of the colony-stimulating factor 1 (macrophage colony-stimulating factor) receptor promoter and binds PEBP2/CBF (AML1).\nThe receptor for the macrophage colony-stimulating factor (or colony-stimulating factor 1 [CSF-1]) is expressed from different promoters in monocytic cells and placental trophoblasts. We have demonstrated that the monocyte-specific expression of the CSF-1 receptor is regulated at the level of transcription by a tissue-specific promoter whose activity is stimulated by the monocyte/B-cell-specific transcription factor PU.1 (D.-E. Zhang, C.J. Hetherington, H.-M. Chen, and D.G. Tenen, Mol. Cell. Biol. 14:373-381, 1994). Here we report that the tissue specificity of this promoter is also mediated by sequences in a region II (bp -88 to -59), which lies 10 bp upstream from the PU.1-binding site. When analyzed by DNase footprinting, region II was protected preferentially in monocytic cells. Electrophoretic mobility shift assays confirmed that region II interacts specifically with nuclear proteins from monocytic cells. Two gel shift complexes (Mono A and Mono B) were formed with separate sequence elements within this region. Competition and supershift experiments indicate that Mono B contains a member of the polyomavirus enhancer-binding protein 2/core-binding factor (PEBP2/CBF) family, which includes the AML1 gene product, while Mono A is a distinct complex preferentially expressed in monocytic cells. Promoter constructs with mutations in these sequence elements were no longer expressed specifically in monocytes. Furthermore, multimerized region II sequence elements enhanced the activity of a heterologous thymidine kinase promoter in monocytic cells but not other cell types tested. 
These results indicate that the monocyte/B-cell-specific transcription factor PU.1 and the Mono A and Mono B protein complexes act in concert to regulate monocyte-specific transcription of the CSF-1 receptor."

  # With exactly two arguments, the raw contents of the two files are used.
  # (Reading the "text" and "denotations" members of JSON files, and
  # transforming the denotations, was tried here earlier and is disabled.)
  str1, str2 = ARGV.map { |path| File.read(path) } if ARGV.length == 2

  alignment = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::MAPPINGS)

  puts TextAlignment::sdiff2cdiff(alignment.sdiff)
end
|