text_alignment 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,114 @@
1
+ #!/usr/bin/env ruby
2
+ require 'diff-lcs'
3
+ require 'text_alignment/lcs_min'
4
+ require 'text_alignment/find_divisions'
5
+ require 'text_alignment/lcs_comparison'
6
+ require 'text_alignment/lcs_alignment'
7
+ require 'text_alignment/glcs_alignment'
8
+ require 'text_alignment/mappings'
9
+
10
module TextAlignment; end unless defined? TextAlignment

TextAlignment::SIGNATURE_NGRAM = 5 unless defined? TextAlignment::SIGNATURE_NGRAM

# Builds position maps from the character offsets of str1 to those of str2.
#
# The maps are derived from an sdiff-style edit script (objects responding to
# +action+, +old_position+ and +new_position+). Runs of unmatched characters
# between two matches are either mapped as a whole or, when a run is longer
# than one character, handed to TextAlignment::GLCSAlignment together with
# the mappings.
#
# Readers:
# * +position_map_begin+ / +position_map_end+ - Hash of str1 offset => str2
#   offset (or nil), sorted by str1 offset.
# * +common_elements+ - pairs of characters matched by the edit script.
# * +mapped_elements+ - pairs of unmatched substrings that were mapped onto
#   each other.
#
# NOTE(review): +similarity+ and the +str*_match_*+ readers are declared but
# never assigned anywhere in this class, so they always return nil here.
class TextAlignment::GLCSTextAlignment
  attr_reader :position_map_begin, :position_map_end
  attr_reader :common_elements, :mapped_elements
  attr_reader :similarity
  attr_reader :str1_match_initial, :str1_match_final, :str2_match_initial, :str2_match_final

  # str1, str2 - the strings to align.
  # mappings   - passed through to TextAlignment::GLCSAlignment.
  # lcs        - accepted for interface compatibility; not used by this class.
  # sdiff      - precomputed edit script; computed with TextAlignment::LCSMin
  #              when nil.
  #
  # Raises ArgumentError when str1, str2 or mappings is nil.
  def initialize(str1, str2, mappings = [], lcs = nil, sdiff = nil)
    raise ArgumentError, "nil string" if str1.nil? || str2.nil?
    raise ArgumentError, "nil mappings" if mappings.nil?

    # Fixed: this used to pass the misspelled local `mapptings`, which raised
    # NameError on every instantiation.
    _glcs_alignment_fast(str1, str2, mappings, lcs, sdiff)
  end

  private

  # Walks the edit script once. '+', '-' and '!' entries are collected into
  # +addition+ (str2 offsets) and +deletion+ (str1 offsets); every '=' entry
  # closes the pending gap, and whatever is still pending after the last entry
  # is closed against the end-of-string offsets.
  def _glcs_alignment_fast(str1, str2, mappings, lcs, sdiff)
    sdiff = TextAlignment::LCSMin.new(str1, str2).sdiff if sdiff.nil?

    posmap_begin, posmap_end = {}, {}
    @common_elements, @mapped_elements = [], []

    addition, deletion = [], []

    sdiff.each do |h|
      case h.action
      when '='
        p1, p2 = h.old_position, h.new_position

        @common_elements << [str1[p1], str2[p2]]
        posmap_begin[p1], posmap_end[p1] = p2, p2

        if !addition.empty? && deletion.empty?
          # Only str2 gained characters: an end offset at p1 points before
          # the inserted run (except at the very start of str1).
          posmap_end[p1] = p2 - addition.length unless p1 == 0
        elsif addition.empty? && !deletion.empty?
          # Only str1 lost characters: they all collapse onto p2.
          deletion.each { |pos| posmap_begin[pos], posmap_end[pos] = p2, p2 }
        elsif !addition.empty? && !deletion.empty?
          if addition.length > 1 || deletion.length > 1
            # Align the two unmatched runs against each other and shift the
            # resulting offsets back into whole-string coordinates.
            galign = TextAlignment::GLCSAlignment.new(str1[deletion[0]..deletion[-1]], str2[addition[0]..addition[-1]], mappings)
            galign.position_map_begin.each { |k, v| posmap_begin[k + deletion[0]] = v.nil? ? nil : v + addition[0] }
            galign.position_map_end.each { |k, v| posmap_end[k + deletion[0]] = v.nil? ? nil : v + addition[0] }
            # The sub-alignment also reports its own end-of-string offset,
            # which lands on p1; restore the value of the actual match.
            posmap_begin[p1], posmap_end[p1] = p2, p2
            @common_elements += galign.common_elements
            @mapped_elements += galign.mapped_elements
          else
            posmap_begin[deletion[0]], posmap_end[deletion[0]] = addition[0], addition[0]
            deletion[1..-1].each { |pos| posmap_begin[pos], posmap_end[pos] = nil, nil }
            @mapped_elements << [str1[deletion[0], deletion.length], str2[addition[0], addition.length]]
          end
        end

        addition.clear
        deletion.clear

      when '!'
        deletion << h.old_position
        addition << h.new_position
      when '-'
        deletion << h.old_position
      when '+'
        addition << h.new_position
      end
    end

    # Close the gap (if any) between the last match and the end of both strings.
    p1, p2 = str1.length, str2.length
    posmap_begin[p1], posmap_end[p1] = p2, p2

    if !addition.empty? && deletion.empty?
      posmap_end[p1] = p2 - addition.length unless p1 == 0
    elsif addition.empty? && !deletion.empty?
      deletion.each { |pos| posmap_begin[pos], posmap_end[pos] = p2, p2 }
    elsif !addition.empty? && !deletion.empty?
      # NOTE(review): this trailing branch differs from the in-loop one in two
      # ways - it requires BOTH runs to be longer than one character (`&&`
      # instead of `||`), and it files the sub-alignment's common elements
      # under @mapped_elements. Kept as found; confirm which is intended.
      if addition.length > 1 && deletion.length > 1
        galign = TextAlignment::GLCSAlignment.new(str1[deletion[0]..deletion[-1]], str2[addition[0]..addition[-1]], mappings)
        galign.position_map_begin.each { |k, v| posmap_begin[k + deletion[0]] = v.nil? ? nil : v + addition[0] }
        galign.position_map_end.each { |k, v| posmap_end[k + deletion[0]] = v.nil? ? nil : v + addition[0] }
        posmap_begin[p1], posmap_end[p1] = p2, p2
        @mapped_elements += galign.common_elements + galign.mapped_elements
      else
        posmap_begin[deletion[0]], posmap_end[deletion[0]] = addition[0], addition[0]
        deletion[1..-1].each { |pos| posmap_begin[pos], posmap_end[pos] = nil, nil }
        @mapped_elements << [str1[deletion[0], deletion.length], str2[addition[0], addition.length]]
      end
    end

    @position_map_begin = posmap_begin.sort.to_h
    @position_map_end = posmap_end.sort.to_h
  end
end
101
+
102
# Manual smoke test: aligns two sample strings and prints the matched and
# mapped element pairs.
if __FILE__ == $0
  str1 = '-βκ-'
  str2 = '-betakappa-'

  # anns1 = JSON.parse File.read(ARGV[0]), :symbolize_names => true
  # anns2 = JSON.parse File.read(ARGV[1]), :symbolize_names => true

  # Fixed: this used to instantiate TextAlignment::TextAlignment, which this
  # file neither defines nor requires; the class defined here is
  # GLCSTextAlignment.
  # align = TextAlignment::GLCSTextAlignment.new(str1, str2)
  # align = TextAlignment::GLCSTextAlignment.new(str1, str2, [["β", "beta"]])
  align = TextAlignment::GLCSTextAlignment.new(str1, str2, TextAlignment::MAPPINGS)
  p align.common_elements
  p align.mapped_elements
end
@@ -0,0 +1,68 @@
1
+ #!/usr/bin/env ruby
2
module TextAlignment; end unless defined? TextAlignment

class << TextAlignment
  # Reports whether str1 still contains two consecutive non-ASCII characters
  # after the single-character mappings have been applied.
  #
  # Only mappings whose source and target are both exactly one character long
  # are applied: they can be substituted without shifting the position of any
  # other character. Longer mappings are ignored here.
  #
  # str1     - the String to inspect. It is NOT modified (earlier versions
  #            rewrote it in place with tr!, which also failed on frozen
  #            strings).
  # mappings - Array of [from, to] String pairs.
  #
  # Returns the first pair of adjacent non-ASCII characters as a String, or
  # nil when there is none.
  # Raises ArgumentError when str1 or mappings is nil.
  def glcs_required?(str1, mappings = [])
    raise ArgumentError, "nil string" if str1.nil?
    raise ArgumentError, "nil mappings" if mappings.nil?

    char_table = mappings.each_with_object({}) do |(from, to), table|
      table[from] = to if from.length == 1 && to.length == 1
    end

    # A hash-driven gsub needs no escaping of tr's special characters
    # ('-', '^', '\') in either the source or the target set.
    normalized = char_table.empty? ? str1 : str1.gsub(Regexp.union(char_table.keys), char_table)

    normalized[/([^\p{ASCII}][^\p{ASCII}])/, 1]
  end
end
21
+
22
# Manual smoke test: prints the result of glcs_required? for a sample string,
# or for the contents of the file given as the only command-line argument.
if __FILE__ == $0

  # The four space variants are written as \u escapes: as literals they are
  # indistinguishable from a plain space and are easily lost when the file
  # is edited or copied.
  dictionary = [
    ["×", "x"],        #U+00D7 (multiplication sign)
    ["•", "*"],        #U+2022 (bullet)
    ["Δ", "delta"],    #U+0394 (greek capital letter delta)
    ["Φ", "phi"],      #U+03A6 (greek capital letter phi)
    ["α", "alpha"],    #U+03B1 (greek small letter alpha)
    ["β", "beta"],     #U+03B2 (greek small letter beta)
    ["γ", "gamma"],    #U+03B3 (greek small letter gamma)
    ["δ", "delta"],    #U+03B4 (greek small letter delta)
    ["ε", "epsilon"],  #U+03B5 (greek small letter epsilon)
    ["κ", "kappa"],    #U+03BA (greek small letter kappa)
    ["λ", "lambda"],   #U+03BB (greek small letter lambda)
    ["μ", "mu"],       #U+03BC (greek small letter mu)
    ["χ", "chi"],      #U+03C7 (greek small letter chi)
    ["ϕ", "phi"],      #U+03D5 (greek phi symbol)
    ["\u2009", " "],   #U+2009 (thin space)
    ["\u200A", " "],   #U+200A (hair space)
    ["\u00A0", " "],   #U+00A0 (no-break space)
    ["\u3000", " "],   #U+3000 (ideographic space)
    ["−", "-"],        #U+2212 (minus sign)
    ["–", "-"],        #U+2013 (en dash)
    ["′", "'"],        #U+2032 (prime)
    ["‘", "'"],        #U+2018 (left single quotation mark)
    ["’", "'"],        #U+2019 (right single quotation mark)
    ["“", '"'],        #U+201C (left double quotation mark)
    ["”", '"']         #U+201D (right double quotation mark)
  ]

  str = "TGF-β–induced"

  # from_text = "TGF-beta-induced"
  # to_text = "TGF-β–induced"

  # from_text = "TGF-β–β induced"
  # to_text = "TGF-beta-beta induced"

  # str = "-βκ-"

  if ARGV.length == 1
    str = File.read(ARGV[0])
  end
  # anns2 = JSON.parse File.read(ARGV[1]), :symbolize_names => true

  p TextAlignment.glcs_required?(str, dictionary)
end
@@ -0,0 +1,146 @@
1
+ #!/usr/bin/env ruby
2
+ require 'text_alignment/lcs_min'
3
+
4
module TextAlignment; end unless defined? TextAlignment

# Builds position maps from the character offsets of str1 to those of str2
# out of an sdiff-style edit script (objects responding to +action+,
# +old_position+ and +new_position+).
#
# Readers:
# * +position_map_begin+ / +position_map_end+ - Hash of str1 offset => str2
#   offset (or nil), sorted by str1 offset.
# * +common_elements+ - pairs of characters matched by the edit script.
# * +mapped_elements+ - pairs of unmatched substrings (one from each string)
#   found between two matches.
class TextAlignment::LCSAlignment
  attr_reader :position_map_begin, :position_map_end
  attr_reader :common_elements, :mapped_elements

  # str1, str2 - the strings to align.
  # lcs        - accepted for interface compatibility; not used by this class.
  # sdiff      - precomputed edit script; computed with TextAlignment::LCSMin
  #              when nil.
  #
  # Raises ArgumentError when str1 or str2 is nil.
  def initialize(str1, str2, lcs = nil, sdiff = nil)
    raise ArgumentError, "nil string" if str1.nil? || str2.nil?

    sdiff = TextAlignment::LCSMin.new(str1, str2).sdiff if sdiff.nil?
    _compute_position_map(str1, str2, sdiff)
  end

  private

  # Walks the edit script once. '+', '-' and '!' entries are collected into
  # +addition+ (str2 offsets) and +deletion+ (str1 offsets); every '=' entry
  # closes the pending gap, and whatever is still pending after the last
  # entry is closed against the end-of-string offsets.
  def _compute_position_map(str1, str2, sdiff)
    @position_map_begin, @position_map_end = {}, {}
    @common_elements, @mapped_elements = [], []

    addition, deletion = [], []

    sdiff.each do |h|
      case h.action
      when '='
        p1, p2 = h.old_position, h.new_position

        @common_elements << [str1[p1], str2[p2]]
        _close_gap(str1, str2, p1, p2, addition, deletion)

        addition.clear
        deletion.clear
      when '!'
        deletion << h.old_position
        addition << h.new_position
      when '-'
        deletion << h.old_position
      when '+'
        addition << h.new_position
      end
    end

    _close_gap(str1, str2, str1.length, str2.length, addition, deletion)

    @position_map_begin = @position_map_begin.sort.to_h
    @position_map_end = @position_map_end.sort.to_h
  end

  # Records the mapping p1 => p2 and resolves the unmatched run that
  # precedes it (str2 offsets in +addition+, str1 offsets in +deletion+).
  def _close_gap(str1, str2, p1, p2, addition, deletion)
    @position_map_begin[p1], @position_map_end[p1] = p2, p2

    if deletion.empty?
      # Only str2 gained characters: an end offset at p1 points before the
      # inserted run (except at the very start of str1).
      @position_map_end[p1] = p2 - addition.length unless addition.empty? || p1 == 0
    elsif addition.empty?
      # Only str1 lost characters: they all collapse onto p2.
      deletion.each { |pos| @position_map_begin[pos], @position_map_end[pos] = p2, p2 }
    else
      # Both sides differ: map the two runs onto each other as a whole. Only
      # the first deleted offset gets a target; the rest map to nil.
      @mapped_elements << [str1[deletion[0], deletion.length], str2[addition[0], addition.length]]

      @position_map_begin[deletion[0]], @position_map_end[deletion[0]] = addition[0], addition[0]
      deletion[1..-1].each { |pos| @position_map_begin[pos], @position_map_end[pos] = nil, nil }
    end
  end
end
76
+
77
# Manual smoke test: aligns one sample pair and prints both position maps.
if $PROGRAM_NAME == __FILE__

  # Other sample pairs that can be swapped in for from_text / to_text:
  #   "TGF-β mRNA"               -> "TGF-beta mRNA"
  #   "TGF-beta mRNA"            -> "TGF-β mRNA"
  #   "TGF-beta mRNA"            -> "TGF- mRNA"
  #   "TGF-β–induced"            -> "TGF-beta-induced"
  #   "TGF-beta-induced"         -> "TGF-β–induced"
  #   "beta-induced"             -> "TGF-beta-induced"
  #   "TGF-beta-induced"         -> "beta-induced"
  #   "TGF-β–β induced"          -> "TGF-beta-beta induced"
  #   "-βκ-"                     -> "-betakappa-"
  #   "-betakappa-beta-z"        -> "-βκ-β–z"
  #   "affect C/EBP-β’s ability" -> "affect C/EBP-beta's ability"
  #   "12 ± 34"                  -> "12 +/- 34"
  #   "TGF-β–treated"            -> "TGF-beta-treated"
  #   "in TGF-β–treated cells"   -> "in TGF-beta-treated cells"
  from_text = 'abxyzcd'
  to_text = 'abcd'

  # anns1 = JSON.parse File.read(ARGV[0]), :symbolize_names => true
  # anns2 = JSON.parse File.read(ARGV[1]), :symbolize_names => true

  # aligner = TextAlignment.new(anns1[:text], anns2[:text], [["Δ", "delta"], [" ", " "], ["–", "-"], ["′", "'"]])
  # denotations = aligner.transform_denotations(anns1[:denotations])

  # Sample annotation; only referenced by the commented-out code below.
  denotations_s = <<-'ANN'
[{"id":"T0", "span":{"begin":1,"end":2}, "category":"Protein"}]
  ANN

  # denotations = JSON.parse denotations_s, :symbolize_names => true

  alignment = TextAlignment::LCSAlignment.new(from_text, to_text)
  p alignment.position_map_begin
  puts "-----"
  p alignment.position_map_end
  # aligner = TextAlignment.new(from_text, to_text, [["Δ", "delta"], [" ", " "], ["–", "-"], ["′", "'"], ["β", "beta"]])

  # p denotations
end
@@ -0,0 +1,61 @@
1
+ #!/usr/bin/env ruby
2
+ require 'diff-lcs'
3
+
4
module TextAlignment; end unless defined? TextAlignment

module TextAlignment
  # Placeholder written into a row wherever the other string has a character
  # with no counterpart.
  NIL_CHARACTER = '_'.freeze
end

class << TextAlignment

  # Renders a two-line, column-aligned character diff of str1 and str2.
  #
  # Returns the String produced by sdiff2cdiff for Diff::LCS.sdiff(str1, str2).
  # Raises ArgumentError when either string is nil, and RuntimeError when
  # either string already contains NIL_CHARACTER (the output would be
  # ambiguous).
  def cdiff(str1, str2)
    raise ArgumentError, "nil string" if str1.nil? || str2.nil?
    if str1.include?(TextAlignment::NIL_CHARACTER) || str2.include?(TextAlignment::NIL_CHARACTER)
      raise "a nil character appears in the input string"
    end

    sdiff2cdiff(Diff::LCS.sdiff(str1, str2))
  end

  # Turns an sdiff-style edit script (objects responding to +action+,
  # +old_element+ and +new_element+) into two rows of equal length joined by
  # a newline: the first row shows the old elements, the second the new ones,
  # with NIL_CHARACTER filling the side that has no counterpart. Newlines
  # inside either row are shown as spaces so each row stays on one line.
  #
  # Raises ArgumentError when sdiff is nil.
  def sdiff2cdiff(sdiff)
    raise ArgumentError, "nil sdiff" if sdiff.nil?

    gap = TextAlignment::NIL_CHARACTER
    # Rows are appended to in place; rebuilding them with += for every
    # character made the rendering quadratic in the input length.
    row1, row2 = ''.dup, ''.dup

    sdiff.each do |h|
      case h.action
      when '='
        row1 << h.old_element
        row2 << h.new_element
      when '!'
        # A substitution takes two columns: the old element over a gap,
        # then a gap over the new element.
        row1 << h.old_element << gap
        row2 << gap << h.new_element
      when '-'
        row1 << h.old_element
        row2 << gap
      when '+'
        row1 << gap
        row2 << h.new_element
      end
    end

    "#{row1.tr("\n", ' ')}\n#{row2.tr("\n", ' ')}"
  end

end
44
+
45
# Manual smoke test: prints two strings and their character diff. With two
# command-line arguments, each is read as a JSON file and its "text" entry
# is used instead of the built-in samples.
if $PROGRAM_NAME == __FILE__
  require 'json'

  str1, str2 =
    if ARGV.length == 2
      ARGV.map { |path| JSON.parse(File.read(path).strip)["text"] }
    else
      ['abcde', 'naxbyzabcdexydzem']
    end

  separator = "-----"

  puts "string 1: #{str1}"
  puts separator
  puts "string 2: #{str2}"
  puts separator
  puts "[cdiff]"
  puts TextAlignment::cdiff(str1, str2)
end
@@ -0,0 +1,63 @@
1
+ #!/usr/bin/env ruby
2
+ require 'text_alignment/lcs_min'
3
+
4
module TextAlignment; end unless defined? TextAlignment

# Compares two strings through their LCS length and sdiff-style edit script
# (objects responding to +action+, +old_position+ and +new_position+).
class TextAlignment::LCSComparison
  # The similarity ratio of the given two strings after stripping unmatched prefixes and suffixes
  attr_reader :similarity

  # The initial and final matching positions of str1 and str2
  attr_reader :str1_match_initial, :str1_match_final, :str2_match_initial, :str2_match_final

  # str1, str2 - the strings to compare.
  # lcs        - precomputed LCS length; when nil, both lcs and sdiff are
  #              computed with TextAlignment::LCSMin.
  # sdiff      - precomputed edit script belonging to lcs.
  #
  # Raises ArgumentError when str1 or str2 is nil.
  def initialize(str1, str2, lcs = nil, sdiff = nil)
    raise ArgumentError, "nil string" if str1.nil? || str2.nil?

    @str1, @str2 = str1, str2
    _lcs_comparison(str1, str2, lcs, sdiff)
  end

  private

  # Sets the match boundaries and the similarity. With no common character
  # (lcs == 0) every boundary and the similarity are 0.
  def _lcs_comparison(str1, str2, lcs = nil, sdiff = nil)
    if lcs.nil?
      lcsmin = TextAlignment::LCSMin.new(str1, str2)
      lcs = lcsmin.lcs
      sdiff = lcsmin.sdiff
    elsif sdiff.nil? && lcs > 0
      # A caller may pass the LCS length alone; the edit script is still
      # needed to locate the match boundaries (this used to fail on nil).
      sdiff = TextAlignment::LCSMin.new(str1, str2).sdiff
    end

    if lcs > 0
      first_match = sdiff.find { |d| d.action == '=' }
      last_match = sdiff.reverse_each.find { |d| d.action == '=' }

      @str1_match_initial = first_match.old_position
      @str2_match_initial = first_match.new_position
      @str1_match_final = last_match.old_position
      @str2_match_final = last_match.new_position

      # Lengths of the two spans between the first and the last match.
      span1 = @str1_match_final - @str1_match_initial + 1
      span2 = @str2_match_final - @str2_match_initial + 1
      @similarity = 2 * lcs / (span1 + span2).to_f
    else
      @str1_match_initial = 0
      @str2_match_initial = 0
      @str1_match_final = 0
      @str2_match_final = 0
      @similarity = 0
    end
  end
end
46
+
47
# Manual smoke test: prints the similarity and the matched span of each
# string. With two command-line arguments, each is read as a JSON file and
# its "text" entry is used instead of the built-in samples.
if $PROGRAM_NAME == __FILE__
  require 'json'

  str1, str2 =
    if ARGV.length == 2
      ARGV.map { |path| JSON.parse(File.read(path).strip)["text"] }
    else
      ['naxbyzabcdexydzem', 'abcde']
    end

  comparison = TextAlignment::LCSComparison.new(str1, str2)
  separator = "-----"

  puts "Similarity: #{comparison.similarity}"
  puts "String 1 match: (#{comparison.str1_match_initial}, #{comparison.str1_match_final})"
  puts "String 2 match: (#{comparison.str2_match_initial}, #{comparison.str2_match_final})"
  puts separator

  span1 = str1[comparison.str1_match_initial .. comparison.str1_match_final]
  puts '[' + span1 + ']'
  puts separator

  span2 = str2[comparison.str2_match_initial .. comparison.str2_match_final]
  puts '[' + span2 + ']'
end