text_alignment 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,114 @@
1
+ #!/usr/bin/env ruby
2
+ require 'diff-lcs'
3
+ require 'text_alignment/lcs_min'
4
+ require 'text_alignment/find_divisions'
5
+ require 'text_alignment/lcs_comparison'
6
+ require 'text_alignment/lcs_alignment'
7
+ require 'text_alignment/glcs_alignment'
8
+ require 'text_alignment/mappings'
9
+
10
module TextAlignment; end unless defined? TextAlignment

# Not referenced by the code visible in this file; presumably consumed by
# other text_alignment components -- TODO confirm.
TextAlignment::SIGNATURE_NGRAM = 5 unless defined? TextAlignment::SIGNATURE_NGRAM

# Aligns two strings by walking an sdiff (a sequence of '=', '!', '-', '+'
# edit operations) and building, for every position of str1, the matching
# begin and end position in str2.
#
# Runs of non-matching edits that contain both deletions and additions are
# either mapped as one unit or handed to TextAlignment::GLCSAlignment
# together with the caller's mappings.
class TextAlignment::GLCSTextAlignment
  # Hashes (sorted by key) from a position in str1 to a position in str2.
  # A value may be nil for deleted positions that have no counterpart.
  attr_reader :position_map_begin, :position_map_end

  # Arrays of [str1_fragment, str2_fragment] pairs collected during alignment.
  attr_reader :common_elements, :mapped_elements

  # NOTE(review): the readers below are declared but never assigned by the
  # code visible in this class, so they always return nil here.
  attr_reader :similarity
  attr_reader :str1_match_initial, :str1_match_final, :str2_match_initial, :str2_match_final

  # @param str1 [String] source text
  # @param str2 [String] target text
  # @param mappings [Array] passed through to TextAlignment::GLCSAlignment
  # @param lcs [Object, nil] accepted for signature compatibility; not read
  # @param sdiff [#each, nil] precomputed edit script; computed with
  #   TextAlignment::LCSMin when nil
  # @raise [ArgumentError] if str1, str2 or mappings is nil
  def initialize(str1, str2, mappings = [], lcs = nil, sdiff = nil)
    raise ArgumentError, "nil string" if str1.nil? || str2.nil?
    raise ArgumentError, "nil mappings" if mappings.nil?

    # Fixed: the original passed the misspelled local `mapptings`, which
    # raised NameError on every instantiation.
    _glcs_alignment_fast(str1, str2, mappings, lcs, sdiff)
  end

  private

  # Builds @position_map_begin, @position_map_end, @common_elements and
  # @mapped_elements from the edit script. `lcs` is unused.
  def _glcs_alignment_fast(str1, str2, mappings, lcs, sdiff)
    sdiff = TextAlignment::LCSMin.new(str1, str2).sdiff if sdiff.nil?

    posmap_begin, posmap_end = {}, {}
    @common_elements, @mapped_elements = [], []

    # Positions of the non-matching edits seen since the last '=' entry.
    addition, deletion = [], []

    sdiff.each do |h|
      case h.action
      when '='
        p1, p2 = h.old_position, h.new_position

        @common_elements << [str1[p1], str2[p2]]
        posmap_begin[p1], posmap_end[p1] = p2, p2

        if !addition.empty? && deletion.empty?
          # Pure insertion before this match: a span ending at p1 should end
          # before the inserted characters (except at the very start).
          posmap_end[p1] = p2 - addition.length unless p1 == 0
        elsif addition.empty? && !deletion.empty?
          # Pure deletion: every deleted position collapses onto this match.
          deletion.each { |p| posmap_begin[p], posmap_end[p] = p2, p2 }
        elsif !addition.empty? && !deletion.empty?
          if addition.length > 1 || deletion.length > 1
            # Multi-character replacement: align the two fragments with the
            # mapping-aware aligner and shift its local offsets back into
            # the coordinates of the full strings.
            galign = TextAlignment::GLCSAlignment.new(str1[deletion[0]..deletion[-1]], str2[addition[0]..addition[-1]], mappings)
            galign.position_map_begin.each { |k, v| posmap_begin[k + deletion[0]] = v.nil? ? nil : v + addition[0] }
            galign.position_map_end.each { |k, v| posmap_end[k + deletion[0]] = v.nil? ? nil : v + addition[0] }
            # The sub-alignment also writes an entry for its own end
            # position, which lands on p1; restore the exact match.
            posmap_begin[p1], posmap_end[p1] = p2, p2
            @common_elements += galign.common_elements
            @mapped_elements += galign.mapped_elements
          else
            # One character replaced by one character.
            posmap_begin[deletion[0]], posmap_end[deletion[0]] = addition[0], addition[0]
            deletion[1..-1].each { |p| posmap_begin[p], posmap_end[p] = nil, nil }
            @mapped_elements << [str1[deletion[0], deletion.length], str2[addition[0], addition.length]]
          end
        end

        addition.clear
        deletion.clear

      when '!'
        deletion << h.old_position
        addition << h.new_position
      when '-'
        deletion << h.old_position
      when '+'
        addition << h.new_position
      end
    end

    # Flush edits that trail the last match, treating the end of both
    # strings as a final synchronisation point.
    p1, p2 = str1.length, str2.length
    posmap_begin[p1], posmap_end[p1] = p2, p2

    if !addition.empty? && deletion.empty?
      posmap_end[p1] = p2 - addition.length unless p1 == 0
    elsif addition.empty? && !deletion.empty?
      deletion.each { |p| posmap_begin[p], posmap_end[p] = p2, p2 }
    elsif !addition.empty? && !deletion.empty?
      # NOTE(review): this tail branch differs from the in-loop branch in two
      # ways -- it requires BOTH runs to be longer than one ('&&' vs '||'),
      # and it appends the sub-alignment's common elements to
      # @mapped_elements instead of @common_elements. Behavior is preserved
      # here; confirm whether the divergence is intentional.
      if addition.length > 1 && deletion.length > 1
        galign = TextAlignment::GLCSAlignment.new(str1[deletion[0]..deletion[-1]], str2[addition[0]..addition[-1]], mappings)
        galign.position_map_begin.each { |k, v| posmap_begin[k + deletion[0]] = v.nil? ? nil : v + addition[0] }
        galign.position_map_end.each { |k, v| posmap_end[k + deletion[0]] = v.nil? ? nil : v + addition[0] }
        posmap_begin[p1], posmap_end[p1] = p2, p2
        @mapped_elements += galign.common_elements + galign.mapped_elements
      else
        posmap_begin[deletion[0]], posmap_end[deletion[0]] = addition[0], addition[0]
        deletion[1..-1].each { |p| posmap_begin[p], posmap_end[p] = nil, nil }
        @mapped_elements << [str1[deletion[0], deletion.length], str2[addition[0], addition.length]]
      end
    end

    @position_map_begin = posmap_begin.sort.to_h
    @position_map_end = posmap_end.sort.to_h
  end
end
101
+
102
# Demo: align two sample strings and print the collected element pairs.
if __FILE__ == $0
  str1 = '-βκ-'
  str2 = '-betakappa-'

  # anns1 = JSON.parse File.read(ARGV[0]), :symbolize_names => true
  # anns2 = JSON.parse File.read(ARGV[1]), :symbolize_names => true

  # Alternative invocations:
  # align = TextAlignment::GLCSTextAlignment.new(str1, str2)
  # align = TextAlignment::GLCSTextAlignment.new(str1, str2, [["β", "beta"]])

  # Fixed: the original instantiated TextAlignment::TextAlignment, which is
  # not defined in this file; the class defined here is GLCSTextAlignment.
  align = TextAlignment::GLCSTextAlignment.new(str1, str2, TextAlignment::MAPPINGS)
  p align.common_elements
  p align.mapped_elements
end
@@ -0,0 +1,68 @@
1
+ #!/usr/bin/env ruby
2
module TextAlignment; end unless defined? TextAlignment

class << TextAlignment
  # Applies the single-character mappings to a copy of str1 and reports
  # whether two consecutive non-ASCII characters remain.
  #
  # Single-character mappings can be applied safely because they do not
  # change the position of any other character.
  #
  # @param str1 [String] text to inspect; it is NOT modified
  # @param mappings [Array<Array(String, String)>] [from, to] pairs; only
  #   pairs whose both sides are exactly one character long are applied
  # @return [String, nil] the first pair of adjacent non-ASCII characters
  #   left after the substitution, or nil when there is none
  # @raise [ArgumentError] if str1 or mappings is nil
  def glcs_required?(str1, mappings = [])
    raise ArgumentError, "nil string" if str1.nil?
    raise ArgumentError, "nil mappings" if mappings.nil?

    character_mappings = mappings.select { |m| m[0].length == 1 && m[1].length == 1 }

    # Escape both sets: the original escaped only '-' on the target side,
    # so '^', '\' or '-' in a mapping were read as tr syntax.
    characters_from = _escape_for_tr(character_mappings.map { |m| m[0] }.join)
    characters_to = _escape_for_tr(character_mappings.map { |m| m[1] }.join)

    # Non-destructive tr: the original used tr!, which rewrote the caller's
    # string in place and raised on frozen input.
    normalized = str1.tr(characters_from, characters_to)

    normalized[/[^\p{ASCII}][^\p{ASCII}]/]
  end

  private

  # Backslash-escapes the characters String#tr treats specially
  # ('\', '^' and '-') so each one is taken literally.
  def _escape_for_tr(chars)
    chars.gsub(/[\\\^\-]/) { |c| "\\#{c}" }
  end
end
21
+
22
# Demo: report whether a sample string (or the file named by the single
# command-line argument) still contains adjacent non-ASCII characters.
if __FILE__ == $0

  # Invisible whitespace characters are written as \u escapes so editors
  # and copy/paste cannot silently turn them into plain spaces.
  dictionary = [
    ["×", "x"],        #U+00D7 (multiplication sign)
    ["•", "*"],        #U+2022 (bullet)
    ["Δ", "delta"],    #U+0394 (greek capital letter delta)
    ["Φ", "phi"],      #U+03A6 (greek capital letter phi)
    ["α", "alpha"],    #U+03B1 (greek small letter alpha)
    ["β", "beta"],     #U+03B2 (greek small letter beta)
    ["γ", "gamma"],    #U+03B3 (greek small letter gamma)
    ["δ", "delta"],    #U+03B4 (greek small letter delta)
    ["ε", "epsilon"],  #U+03B5 (greek small letter epsilon)
    ["κ", "kappa"],    #U+03BA (greek small letter kappa)
    ["λ", "lambda"],   #U+03BB (greek small letter lambda)
    ["μ", "mu"],       #U+03BC (greek small letter mu)
    ["χ", "chi"],      #U+03C7 (greek small letter chi)
    ["ϕ", "phi"],      #U+03D5 (greek phi symbol)
    ["\u2009", " "],   #U+2009 (thin space)
    ["\u200A", " "],   #U+200A (hair space)
    ["\u00A0", " "],   #U+00A0 (no-break space)
    ["\u3000", " "],   #U+3000 (ideographic space)
    ["−", "-"],        #U+2212 (minus sign)
    ["–", "-"],        #U+2013 (en dash)
    ["′", "'"],        #U+2032 (prime)
    ["‘", "'"],        #U+2018 (left single quotation mark)
    ["’", "'"],        #U+2019 (right single quotation mark)
    ["“", '"'],        #U+201C (left double quotation mark)
    ["”", '"']         #U+201D (right double quotation mark)
  ]

  str = "TGF-β–induced"

  # from_text = "TGF-beta-induced"
  # to_text = "TGF-β–induced"

  # from_text = "TGF-β–β induced"
  # to_text = "TGF-beta-beta induced"

  # str = "-βκ-"

  # A single argument names a file whose content replaces the sample.
  if ARGV.length == 1
    str = File.read(ARGV[0])
  end
  # anns2 = JSON.parse File.read(ARGV[1]), :symbolize_names => true

  p TextAlignment.glcs_required?(str, dictionary)
end
@@ -0,0 +1,146 @@
1
+ #!/usr/bin/env ruby
2
+ require 'text_alignment/lcs_min'
3
+
4
module TextAlignment; end unless defined? TextAlignment

# Builds position maps from str1 to str2 out of an sdiff (a sequence of
# '=', '!', '-', '+' edit operations).
class TextAlignment::LCSAlignment
  # Hashes (sorted by key) from a position in str1 to a position in str2.
  # Values are nil for deleted positions that follow the first one in a
  # replaced run.
  attr_reader :position_map_begin, :position_map_end

  # Arrays of [str1_fragment, str2_fragment] pairs: matching characters and
  # replaced runs respectively.
  attr_reader :common_elements, :mapped_elements

  # @param str1 [String] source text
  # @param str2 [String] target text
  # @param lcs [Object, nil] accepted for signature compatibility; not read
  # @param sdiff [#each, nil] precomputed edit script; computed with
  #   TextAlignment::LCSMin when nil
  # @raise [ArgumentError] if str1 or str2 is nil
  def initialize(str1, str2, lcs = nil, sdiff = nil)
    raise ArgumentError, "nil string" if str1.nil? || str2.nil?

    sdiff = TextAlignment::LCSMin.new(str1, str2).sdiff if sdiff.nil?
    _compute_position_map(str1, str2, sdiff)
  end

  private

  # Walks the edit script, buffering non-matching edits and resolving them
  # at every match and once more at the end of both strings.
  def _compute_position_map(str1, str2, sdiff)
    posmap_begin, posmap_end = {}, {}
    @common_elements, @mapped_elements = [], []

    # Positions of the non-matching edits seen since the last sync point.
    addition, deletion = [], []

    sdiff.each do |h|
      case h.action
      when '='
        p1, p2 = h.old_position, h.new_position
        @common_elements << [str1[p1], str2[p2]]
        _resolve_pending(str1, str2, p1, p2, addition, deletion, posmap_begin, posmap_end)
      when '!'
        deletion << h.old_position
        addition << h.new_position
      when '-'
        deletion << h.old_position
      when '+'
        addition << h.new_position
      end
    end

    # The end of both strings acts as the final synchronisation point.
    _resolve_pending(str1, str2, str1.length, str2.length, addition, deletion, posmap_begin, posmap_end)

    @position_map_begin = posmap_begin.sort.to_h
    @position_map_end = posmap_end.sort.to_h
  end

  # Records the sync point p1 -> p2, maps the buffered edits that precede
  # it, and empties both buffers.
  def _resolve_pending(str1, str2, p1, p2, addition, deletion, posmap_begin, posmap_end)
    posmap_begin[p1] = p2
    posmap_end[p1] = p2

    if deletion.empty?
      # Pure insertion: a span ending at p1 should end before the inserted
      # characters, except at the very start of str1.
      posmap_end[p1] = p2 - addition.length unless addition.empty? || p1 == 0
    elsif addition.empty?
      # Pure deletion: every deleted position collapses onto the sync point.
      deletion.each { |p| posmap_begin[p] = posmap_end[p] = p2 }
    else
      # Replacement: the whole deleted run maps to the whole added run,
      # anchored on the first position of each.
      @mapped_elements << [str1[deletion[0], deletion.length], str2[addition[0], addition.length]]
      posmap_begin[deletion[0]] = posmap_end[deletion[0]] = addition[0]
      deletion[1..-1].each { |p| posmap_begin[p] = posmap_end[p] = nil }
    end

    addition.clear
    deletion.clear
  end
end
76
+
77
# Demo: align one sample pair and dump both position maps.
if __FILE__ == $0

  # Other sample pairs that can be swapped in for the active one below:
  #
  #   from_text                     to_text
  #   "TGF-β mRNA"                  "TGF-beta mRNA"
  #   "TGF-beta mRNA"               "TGF-β mRNA"
  #   "TGF-beta mRNA"               "TGF- mRNA"
  #   "TGF-β–induced"               "TGF-beta-induced"
  #   "TGF-beta-induced"            "TGF-β–induced"
  #   "beta-induced"                "TGF-beta-induced"
  #   "TGF-beta-induced"            "beta-induced"
  #   "TGF-β–β induced"             "TGF-beta-beta induced"
  #   "-βκ-"                        "-betakappa-"
  #   "-betakappa-beta-z"           "-βκ-β–z"
  #   "affect C/EBP-β’s ability"    "affect C/EBP-beta's ability"
  #   "12 ± 34"                     "12 +/- 34"
  #   "TGF-β–treated"               "TGF-beta-treated"
  #   "in TGF-β–treated cells"      "in TGF-beta-treated cells"

  from_text = 'abxyzcd'
  to_text = 'abcd'

  # anns1 = JSON.parse File.read(ARGV[0]), :symbolize_names => true
  # anns2 = JSON.parse File.read(ARGV[1]), :symbolize_names => true

  # aligner = TextAlignment.new(anns1[:text], anns2[:text], [["Δ", "delta"], [" ", " "], ["–", "-"], ["′", "'"]])
  # denotations = aligner.transform_denotations(anns1[:denotations])

  # Sample annotation; only the commented-out experiments read it.
  denotations_s = <<-'ANN'
[{"id":"T0", "span":{"begin":1,"end":2}, "category":"Protein"}]
  ANN

  # denotations = JSON.parse denotations_s, :symbolize_names => true

  alignment = TextAlignment::LCSAlignment.new(from_text, to_text)

  p alignment.position_map_begin
  puts "-----"
  p alignment.position_map_end
  # aligner = TextAlignment.new(from_text, to_text, [["Δ", "delta"], [" ", " "], ["–", "-"], ["′", "'"], ["β", "beta"]])

  # p denotations
end
@@ -0,0 +1,61 @@
1
+ #!/usr/bin/env ruby
2
+ require 'diff-lcs'
3
+
4
module TextAlignment; end unless defined? TextAlignment

module TextAlignment
  # Placeholder written into a cdiff line where the other string has a
  # character and this one has none.
  NIL_CHARACTER = '_'
end

class << TextAlignment

  # Renders a character-level, two-line diff of str1 and str2.
  #
  # @param str1 [String]
  # @param str2 [String]
  # @return [String] see #sdiff2cdiff
  # @raise [ArgumentError] if either string is nil
  # @raise [RuntimeError] if either string already contains NIL_CHARACTER,
  #   which would make the output ambiguous
  def cdiff(str1, str2)
    raise ArgumentError, "nil string" if str1.nil? || str2.nil?
    raise "a nil character appears in the input string" if str1.include?(TextAlignment::NIL_CHARACTER) || str2.include?(TextAlignment::NIL_CHARACTER)

    sdiff2cdiff(Diff::LCS.sdiff(str1, str2))
  end

  # Converts an sdiff into two aligned lines joined by a newline: the first
  # holds the old elements, the second the new ones, with NIL_CHARACTER
  # padding wherever one side has no counterpart. Newlines inside either
  # line are replaced by spaces so the result is always exactly two lines.
  #
  # @param sdiff [#each] entries responding to action, old_element and
  #   new_element
  # @return [String]
  # @raise [ArgumentError] if sdiff is nil
  def sdiff2cdiff(sdiff)
    raise ArgumentError, "nil sdiff" if sdiff.nil?

    filler = TextAlignment::NIL_CHARACTER
    # Unfrozen buffers appended to in place; the original rebuilt both
    # strings with += on every entry, which is quadratic in input length.
    old_line = +''
    new_line = +''

    sdiff.each do |h|
      case h.action
      when '='
        old_line << h.old_element
        new_line << h.new_element
      when '!'
        # A substitution takes two columns: old char over filler, then
        # filler over new char.
        old_line << h.old_element << filler
        new_line << filler << h.new_element
      when '-'
        old_line << h.old_element
        new_line << filler
      when '+'
        old_line << filler
        new_line << h.new_element
      end
    end

    old_line.gsub(/\n/, ' ') + "\n" + new_line.gsub(/\n/, ' ')
  end

end
44
+
45
# Demo: print two strings and their character diff. With exactly two
# arguments, each names a JSON file whose "text" entry supplies a string.
if __FILE__ == $0
  require 'json'

  left, right =
    if ARGV.length == 2
      ARGV.map { |path| JSON.parse(File.read(path).strip)["text"] }
    else
      ['abcde', 'naxbyzabcdexydzem']
    end

  puts "string 1: #{left}", "-----", "string 2: #{right}", "-----", "[cdiff]"
  puts TextAlignment.cdiff(left, right)
end
@@ -0,0 +1,63 @@
1
+ #!/usr/bin/env ruby
2
+ require 'text_alignment/lcs_min'
3
+
4
module TextAlignment; end unless defined? TextAlignment

# Compares two strings through their LCS: finds the first and last matching
# positions in each and scores similarity over the spans between them.
class TextAlignment::LCSComparison
  # The similarity ratio of the given two strings after stripping unmatched
  # prefixes and suffixes (always a Float; 0.0 when nothing matches)
  attr_reader :similarity

  # The initial and final matching positions of str1 and str2
  # (all 0 when nothing matches)
  attr_reader :str1_match_initial, :str1_match_final, :str2_match_initial, :str2_match_final

  # @param str1 [String]
  # @param str2 [String]
  # @param lcs [Numeric, nil] precomputed LCS length; computed with
  #   TextAlignment::LCSMin when nil
  # @param sdiff [Array, nil] precomputed edit script; ignored and
  #   recomputed when lcs is nil
  # @raise [ArgumentError] if str1 or str2 is nil
  def initialize(str1, str2, lcs = nil, sdiff = nil)
    raise ArgumentError, "nil string" if str1.nil? || str2.nil?

    @str1, @str2 = str1, str2
    _lcs_comparison(str1, str2, lcs, sdiff)
  end

  private

  # Fills in the match boundaries and the similarity score.
  def _lcs_comparison(str1, str2, lcs = nil, sdiff = nil)
    if lcs.nil?
      lcsmin = TextAlignment::LCSMin.new(str1, str2)
      lcs = lcsmin.lcs
      sdiff = lcsmin.sdiff
    elsif sdiff.nil? && lcs > 0
      # Fixed: passing lcs without sdiff used to crash on nil.index below.
      # Only the missing edit script is computed; the caller's lcs is kept.
      sdiff = TextAlignment::LCSMin.new(str1, str2).sdiff
    end

    if lcs > 0
      first_match = sdiff.find { |d| d.action == '=' }
      last_match = sdiff.reverse_each.find { |d| d.action == '=' }

      @str1_match_initial = first_match.old_position
      @str2_match_initial = first_match.new_position
      @str1_match_final = last_match.old_position
      @str2_match_final = last_match.new_position

      # Dice-style ratio: twice the LCS length over the combined length of
      # the two matched spans (inclusive, hence the +1).
      span1 = @str1_match_final - @str1_match_initial + 1
      span2 = @str2_match_final - @str2_match_initial + 1
      @similarity = 2 * lcs / (span1 + span2).to_f
    else
      @str1_match_initial = 0
      @str2_match_initial = 0
      @str1_match_final = 0
      @str2_match_final = 0
      # Float for consistency with the branch above (was Integer 0).
      @similarity = 0.0
    end
  end
end
46
+
47
# Demo: compare two strings and print the similarity together with the
# matched region of each. With exactly two arguments, each names a JSON
# file whose "text" entry supplies a string.
if __FILE__ == $0
  require 'json'

  str1, str2 =
    if ARGV.length == 2
      ARGV.map { |path| JSON.parse(File.read(path).strip)["text"] }
    else
      ['naxbyzabcdexydzem', 'abcde']
    end

  result = TextAlignment::LCSComparison.new(str1, str2)
  range1 = result.str1_match_initial..result.str1_match_final
  range2 = result.str2_match_initial..result.str2_match_final

  puts "Similarity: #{result.similarity}"
  puts "String 1 match: (#{range1.begin}, #{range1.end})"
  puts "String 2 match: (#{range2.begin}, #{range2.end})"
  puts "-----"
  puts '[' + str1[range1] + ']'
  puts "-----"
  puts '[' + str2[range2] + ']'
end