text_alignment 0.2.2 → 0.2.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +2 -1
- data/Gemfile.lock +3 -1
- data/bin/align_annotations +52 -0
- data/lib/text_alignment/approximate_fit.rb +31 -17
- data/lib/text_alignment/lcs_cdiff.rb +1 -1
- data/lib/text_alignment/text_alignment.rb +5 -46
- data/lib/text_alignment/version.rb +1 -1
- data/text_alignment.gemspec +1 -0
- metadata +20 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 45bf8e55a66daff4d79765aec0e4d7482151aa7b86749e054e5d93030ffffabe
|
4
|
+
data.tar.gz: 6251835e35077e2ae828666026a663ae63be7f8d7b7d785b4898763021f546e2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4750b2507b4eb172123ca9a82320c2456e66d3f06ca3e3bdd52cf31055e9cb46af53013a484dbb87b1bb3236701cc8a69e1c4c64b576ee41a3a4957c3431be65
|
7
|
+
data.tar.gz: 1819b94e47bdf064d53712bb667f15d74f8e37bd49df7976075c3cbdc0a992d25bb318f76bfc0522cea0b594d002bcab0ebcef6112bf9e26128c245a7fb67bcf
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -15,6 +15,7 @@ GEM
|
|
15
15
|
rspec-support (~> 3.0.0)
|
16
16
|
rspec-support (3.0.4)
|
17
17
|
ruby-dictionary (1.1.1)
|
18
|
+
string-similarity (2.1.0)
|
18
19
|
|
19
20
|
PLATFORMS
|
20
21
|
ruby
|
@@ -23,9 +24,10 @@ DEPENDENCIES
|
|
23
24
|
diff-lcs (~> 1.3)
|
24
25
|
rspec (~> 3.0)
|
25
26
|
ruby-dictionary (~> 1.1, >= 1.1.1)
|
27
|
+
string-similarity (~> 2.1)
|
26
28
|
|
27
29
|
RUBY VERSION
|
28
|
-
ruby 2.
|
30
|
+
ruby 2.5.5p157
|
29
31
|
|
30
32
|
BUNDLED WITH
|
31
33
|
1.17.3
|
@@ -0,0 +1,52 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'text_alignment'
|
3
|
+
|
4
|
+
require 'json'
|
5
|
+
require 'pp'
|
6
|
+
|
7
|
+
unless ARGV.length == 2
|
8
|
+
warn "align_annotations target_annotations(.json) reference_annotations(.json)"
|
9
|
+
exit
|
10
|
+
end
|
11
|
+
|
12
|
+
anns1 = JSON.parse File.read(ARGV[0].strip), :symbolize_names => true
|
13
|
+
anns2 = JSON.parse File.read(ARGV[1].strip), :symbolize_names => true
|
14
|
+
|
15
|
+
str1 = anns1[:text]
|
16
|
+
str2 = anns2[:text]
|
17
|
+
|
18
|
+
denotations = anns1[:denotations]
|
19
|
+
|
20
|
+
puts "[Alignment1]====="
|
21
|
+
align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::MAPPINGS)
|
22
|
+
puts TextAlignment::sdiff2cdiff(align.sdiff)
|
23
|
+
puts
|
24
|
+
puts "[Similarity]\n#{align.similarity}"
|
25
|
+
puts
|
26
|
+
puts '[Denotations original]'
|
27
|
+
pp denotations
|
28
|
+
puts
|
29
|
+
puts '[Denotations transformed]'
|
30
|
+
new_denotations = align.transform_hdenotations(denotations)
|
31
|
+
pp new_denotations
|
32
|
+
puts
|
33
|
+
puts "[Alignment2 (downcased)]====="
|
34
|
+
align = TextAlignment::TextAlignment.new(str1.downcase, str2.downcase, TextAlignment::MAPPINGS)
|
35
|
+
puts TextAlignment::sdiff2cdiff(align.sdiff)
|
36
|
+
puts
|
37
|
+
puts "[Similarity]\n#{align.similarity}"
|
38
|
+
puts
|
39
|
+
puts '[Denotations original]'
|
40
|
+
pp denotations
|
41
|
+
puts
|
42
|
+
puts '[Denotations transformed]'
|
43
|
+
new_denotations = align.transform_hdenotations(denotations)
|
44
|
+
pp new_denotations
|
45
|
+
puts
|
46
|
+
puts '[Annotations transformed]'
|
47
|
+
anns2[:denotations] = new_denotations
|
48
|
+
puts anns2.to_json
|
49
|
+
|
50
|
+
# p align.common_elements
|
51
|
+
# puts "---------------"
|
52
|
+
# p align.mapped_elements
|
@@ -1,11 +1,14 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
require 'string-similarity'
|
3
|
+
|
2
4
|
module TextAlignment; end unless defined? TextAlignment
|
3
5
|
|
4
6
|
# approximate the location of str1 in str2
|
5
7
|
module TextAlignment
|
6
8
|
SIGNATURE_NGRAM = 5
|
7
9
|
MIN_LENGTH_FOR_APPROXIMATION = 50
|
8
|
-
BUFFER_RATE = 0.
|
10
|
+
BUFFER_RATE = 0.1
|
11
|
+
TEXT_SIMILARITY_TRESHOLD = 0.8
|
9
12
|
end
|
10
13
|
|
11
14
|
class << TextAlignment
|
@@ -22,29 +25,40 @@ class << TextAlignment
|
|
22
25
|
# If there is no shared n-gram found, it may mean there is no serious overlap between the two strings
|
23
26
|
return nil, nil if ngram_shared.empty?
|
24
27
|
|
25
|
-
|
26
|
-
|
28
|
+
signature_ngrams = ngram_shared.select{|g| ngram2.count(g) == 1}
|
29
|
+
return nil, nil if signature_ngrams.empty? #raise "no signature ngram"
|
30
|
+
|
31
|
+
fit_begin, fit_end = nil, nil
|
32
|
+
signature_ngrams.each do |signature_ngram|
|
33
|
+
loc_signature_ngram_in_str1 = str1.index(signature_ngram)
|
34
|
+
loc_signature_ngram_in_str2 = str2.index(signature_ngram)
|
27
35
|
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
fit_begin = 0 if fit_begin < 0
|
36
|
+
# approximate the beginning of the fit
|
37
|
+
fit_begin = loc_signature_ngram_in_str2 - loc_signature_ngram_in_str1 - (loc_signature_ngram_in_str1 * TextAlignment::BUFFER_RATE).to_i
|
38
|
+
fit_begin = 0 if fit_begin < 0
|
32
39
|
|
33
|
-
|
34
|
-
|
40
|
+
# approximate the end of the fit
|
41
|
+
offset_end = str1.length - loc_signature_ngram_in_str1
|
42
|
+
fit_end = loc_signature_ngram_in_str2 + offset_end + (offset_end * TextAlignment::BUFFER_RATE).to_i
|
43
|
+
fit_end = str2.length if fit_end > str2.length
|
35
44
|
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
return nil, nil if signature_ngram.nil? # raise "no signature ngram"
|
41
|
-
offset = str1.length - str1.rindex(signature_ngram)
|
42
|
-
fit_end = str2.rindex(signature_ngram) + offset + (offset * TextAlignment::BUFFER_RATE).to_i
|
43
|
-
fit_end = str2.length if fit_end > str2.length
|
45
|
+
text_similarity = text_similarity(str1, str2[fit_begin ... fit_end])
|
46
|
+
break if text_similarity > TextAlignment::TEXT_SIMILARITY_TRESHOLD
|
47
|
+
fit_begin, fit_end = nil, nil
|
48
|
+
end
|
44
49
|
|
45
50
|
return nil, nil if fit_begin >= fit_end
|
46
51
|
return fit_begin, fit_end
|
47
52
|
end
|
53
|
+
|
54
|
+
private
|
55
|
+
|
56
|
+
def text_similarity(str1, str2, ngram_order = 3)
|
57
|
+
_str1 = str1.delete(" \t\r\n")
|
58
|
+
_str2 = str2.delete(" \t\r\n")
|
59
|
+
String::Similarity.cosine(_str1, _str2, ngram:2)
|
60
|
+
end
|
61
|
+
|
48
62
|
end
|
49
63
|
|
50
64
|
if __FILE__ == $0
|
@@ -4,6 +4,7 @@ require 'text_alignment/lcs_min'
|
|
4
4
|
require 'text_alignment/find_divisions'
|
5
5
|
require 'text_alignment/lcs_comparison'
|
6
6
|
require 'text_alignment/lcs_alignment'
|
7
|
+
require 'text_alignment/lcs_cdiff'
|
7
8
|
require 'text_alignment/glcs_alignment'
|
8
9
|
require 'text_alignment/mappings'
|
9
10
|
|
@@ -26,6 +27,7 @@ class TextAlignment::TextAlignment
|
|
26
27
|
## preprocessing
|
27
28
|
str1 = str1.dup
|
28
29
|
str2 = str2.dup
|
30
|
+
mappings = mappings.dup
|
29
31
|
|
30
32
|
## find the first nomatch character
|
31
33
|
TextAlignment::NOMATCH_CHARS.each_char do |c|
|
@@ -77,7 +79,7 @@ class TextAlignment::TextAlignment
|
|
77
79
|
end
|
78
80
|
|
79
81
|
def transform_a_span(span)
|
80
|
-
{:
|
82
|
+
{begin: @position_map_begin[span[:begin]], end: @position_map_end[span[:end]]}
|
81
83
|
end
|
82
84
|
|
83
85
|
def transform_spans(spans)
|
@@ -89,11 +91,8 @@ class TextAlignment::TextAlignment
|
|
89
91
|
end
|
90
92
|
|
91
93
|
def transform_hdenotations(hdenotations)
|
92
|
-
|
93
|
-
|
94
|
-
(0...hdenotations.length).each {|i| hdenotations_new[i][:span] = transform_a_span(hdenotations[i][:span])}
|
95
|
-
hdenotations_new
|
96
|
-
end
|
94
|
+
return nil if hdenotations.nil?
|
95
|
+
hdenotations.collect{|d| d.dup.merge({span:transform_a_span(d[:span])})}
|
97
96
|
end
|
98
97
|
|
99
98
|
private
|
@@ -180,43 +179,3 @@ class TextAlignment::TextAlignment
|
|
180
179
|
@position_map_end = posmap_end.sort.to_h
|
181
180
|
end
|
182
181
|
end
|
183
|
-
|
184
|
-
if __FILE__ == $0
|
185
|
-
require 'json'
|
186
|
-
require 'text_alignment/lcs_cdiff'
|
187
|
-
|
188
|
-
str1 = "TI - Identification of a region which directs the monocytic activity of the\n colony-stimulating factor 1 (macrophage colony-stimulating factor) receptor\n promoter and binds PEBP2/CBF (AML1)."
|
189
|
-
str2 = "Identification of a region which directs the monocytic activity of the colony-stimulating factor 1 (macrophage colony-stimulating factor) receptor promoter and binds PEBP2/CBF (AML1).\nThe receptor for the macrophage colony-stimulating factor (or colony-stimulating factor 1 [CSF-1]) is expressed from different promoters in monocytic cells and placental trophoblasts. We have demonstrated that the monocyte-specific expression of the CSF-1 receptor is regulated at the level of transcription by a tissue-specific promoter whose activity is stimulated by the monocyte/B-cell-specific transcription factor PU.1 (D.-E. Zhang, C.J. Hetherington, H.-M. Chen, and D.G. Tenen, Mol. Cell. Biol. 14:373-381, 1994). Here we report that the tissue specificity of this promoter is also mediated by sequences in a region II (bp -88 to -59), which lies 10 bp upstream from the PU.1-binding site. When analyzed by DNase footprinting, region II was protected preferentially in monocytic cells. Electrophoretic mobility shift assays confirmed that region II interacts specifically with nuclear proteins from monocytic cells. Two gel shift complexes (Mono A and Mono B) were formed with separate sequence elements within this region. Competition and supershift experiments indicate that Mono B contains a member of the polyomavirus enhancer-binding protein 2/core-binding factor (PEBP2/CBF) family, which includes the AML1 gene product, while Mono A is a distinct complex preferentially expressed in monocytic cells. Promoter constructs with mutations in these sequence elements were no longer expressed specifically in monocytes. Furthermore, multimerized region II sequence elements enhanced the activity of a heterologous thymidine kinase promoter in monocytic cells but not other cell types tested. 
These results indicate that the monocyte/B-cell-specific transcription factor PU.1 and the Mono A and Mono B protein complexes act in concert to regulate monocyte-specific transcription of the CSF-1 receptor."
|
190
|
-
|
191
|
-
# anns1 = JSON.parse File.read(ARGV[0]), :symbolize_names => true
|
192
|
-
# anns2 = JSON.parse File.read(ARGV[1]), :symbolize_names => true
|
193
|
-
|
194
|
-
if ARGV.length == 2
|
195
|
-
str1 = JSON.parse(File.read(ARGV[0]).strip)["text"]
|
196
|
-
denotations = JSON.parse(File.read(ARGV[0]).strip, symbolize_names:true)[:denotations]
|
197
|
-
str2 = JSON.parse(File.read(ARGV[1]).strip)["text"]
|
198
|
-
# str1 = File.read(ARGV[0])
|
199
|
-
# str2 = File.read(ARGV[1])
|
200
|
-
end
|
201
|
-
|
202
|
-
# dictionary = [["β", "beta"]]
|
203
|
-
# align = TextAlignment::TextAlignment.new(str1, str2)
|
204
|
-
align = TextAlignment::TextAlignment.new(str1, str2, TextAlignment::MAPPINGS)
|
205
|
-
|
206
|
-
# p align.common_elements
|
207
|
-
# puts "---------------"
|
208
|
-
# p align.mapped_elements
|
209
|
-
|
210
|
-
puts TextAlignment::sdiff2cdiff(align.sdiff)
|
211
|
-
# p align
|
212
|
-
# puts "-----"
|
213
|
-
|
214
|
-
# p denotations
|
215
|
-
# puts "-----"
|
216
|
-
|
217
|
-
# new_denotations = align.transform_hdenotations(denotations)
|
218
|
-
|
219
|
-
# p new_denotations
|
220
|
-
# puts "-----"
|
221
|
-
|
222
|
-
end
|
data/text_alignment.gemspec
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.2
|
4
|
+
version: 0.2.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-07-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|
@@ -30,6 +30,20 @@ dependencies:
|
|
30
30
|
- - ">="
|
31
31
|
- !ruby/object:Gem::Version
|
32
32
|
version: 1.1.1
|
33
|
+
- !ruby/object:Gem::Dependency
|
34
|
+
name: string-similarity
|
35
|
+
requirement: !ruby/object:Gem::Requirement
|
36
|
+
requirements:
|
37
|
+
- - "~>"
|
38
|
+
- !ruby/object:Gem::Version
|
39
|
+
version: '2.1'
|
40
|
+
type: :development
|
41
|
+
prerelease: false
|
42
|
+
version_requirements: !ruby/object:Gem::Requirement
|
43
|
+
requirements:
|
44
|
+
- - "~>"
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: '2.1'
|
33
47
|
- !ruby/object:Gem::Dependency
|
34
48
|
name: rspec
|
35
49
|
requirement: !ruby/object:Gem::Requirement
|
@@ -49,7 +63,8 @@ description: |-
|
|
49
63
|
of two character strings and annotations made to them.
|
50
64
|
email:
|
51
65
|
- jdkim@dbcls.rois.ac.jp
|
52
|
-
executables:
|
66
|
+
executables:
|
67
|
+
- align_annotations
|
53
68
|
extensions: []
|
54
69
|
extra_rdoc_files: []
|
55
70
|
files:
|
@@ -58,6 +73,7 @@ files:
|
|
58
73
|
- Gemfile.lock
|
59
74
|
- LICENSE.txt
|
60
75
|
- README.md
|
76
|
+
- bin/align_annotations
|
61
77
|
- lib/text_alignment.rb
|
62
78
|
- lib/text_alignment/approximate_fit.rb
|
63
79
|
- lib/text_alignment/find_divisions.rb
|
@@ -96,7 +112,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
96
112
|
- !ruby/object:Gem::Version
|
97
113
|
version: '0'
|
98
114
|
requirements: []
|
99
|
-
rubygems_version: 3.0.
|
115
|
+
rubygems_version: 3.0.8
|
100
116
|
signing_key:
|
101
117
|
specification_version: 4
|
102
118
|
summary: Ruby class for aligning two character strings
|