rbbt-text 1.5.1 → 1.5.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/rbbt/ner/pubtator.rb +67 -0
- data/lib/rbbt/segment/transformed.rb +23 -9
- data/lib/rbbt/text/misc.rb +5 -0
- data/share/text/greek.tsv +51 -0
- data/test/rbbt/ner/test_pubtator.rb +70 -0
- data/test/rbbt/segment/test_transformed.rb +23 -0
- metadata +7 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 80e662635b01507c60638565bb8f00f789b4e6423227b98cee0ac38d8dd4b344
|
4
|
+
data.tar.gz: 6b9a1be5fdea2bb56a770ca2fe6838528102dfe2d15f27abb5f0e5b1849e6574
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 383affa64fa2b1e6d54817e343b3710d134c01d68867699490c604258324a6806bd829492873e819a5666780e0282ab1a375ae03147691aa64ef78705d7d0097
|
7
|
+
data.tar.gz: 67a52fca24335775faacf2a68cd101b24ff19d0fbff9560b4468c61546cee62aa02cdf8c5a1d3db39e5784856bd4df4573c9a00f901e8ca368cf169f8a88101d
|
@@ -0,0 +1,67 @@
|
|
1
|
+
|
2
|
+
require 'rbbt/segment'
|
3
|
+
require 'rbbt/segment/named_entity'
|
4
|
+
require 'rbbt/segment/transformed'
|
5
|
+
require 'rbbt/text/misc'
|
6
|
+
require 'rest-client'
|
7
|
+
# Retrieves named-entity annotations from the NCBI PubTator export API and,
# optionally, relocates them onto caller-provided versions of the texts.
module Pubtator
  PUBTATOR_URL="https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/pubtator"

  # Maximum number of PMIDs sent in a single request.
  CHUNK_SIZE = 1000

  # Minimum number of seconds between the start of two consecutive requests.
  REQUEST_INTERVAL = 3

  # Fetches the PubTator annotations for a list of PMIDs.
  #
  # pmids      - list of PubMed identifiers; sent to the service in chunks of
  #              CHUNK_SIZE.
  # concepts   - list of concept names sent as the `concepts` field of the
  #              request (default: ['gene']).
  # alignments - optional Hash from PMID to the text the entities should be
  #              relocated onto. When given, every PMID that has entities must
  #              have an entry.
  #
  # Returns a Hash from PMID (as it appears in the response) to an Array of
  # NamedEntity-annotated literals carrying :code, :type and :offset. When
  # alignments are given, entities that cannot be relocated are logged and
  # dropped from the result.
  #
  # Raises RuntimeError when alignments are given but the entry for a PMID with
  # entities is missing.
  def self.pubtator_entities(pmids, concepts = ['gene'], alignments = nil)
    texts = {}
    entities = {}

    last = nil
    Misc.chunk(pmids, CHUNK_SIZE) do |chunk|
      if last
        elapsed = Time.now - last
        sleep(REQUEST_INTERVAL - elapsed) if elapsed < REQUEST_INTERVAL
      end
      # Stamp the request after any sleep: stamping before it lets the sleep
      # itself count as waiting time for the next chunk, so later requests
      # could be fired less than REQUEST_INTERVAL apart.
      last = Time.now

      response = RestClient.post(PUBTATOR_URL, {pmids: chunk, concepts: concepts}.to_json, {content_type: 'json', accept: 'json'}).body
      parse_export(response, texts, entities)
    end

    return entities unless alignments

    greek_characters = Misc.greek_characters
    aligned = {}
    entities.each do |pmid, list|
      alignment = alignments[pmid]
      raise "Alignment for #{pmid} not found" if alignment.nil?

      # A PMID may have annotation lines but no text lines; use an empty text
      # so its entities are reported as not found instead of crashing on nil.
      text = (texts[pmid] || []) * " "
      aligned[pmid] = relocate_entities(pmid, list, text, alignment, greek_characters)
    end

    aligned
  end

  # Parses a response in PubTator export format, appending to the two
  # accumulators: `texts` (PMID => list of text passages) and `entities`
  # (PMID => list of named entities).
  def self.parse_export(response, texts, entities)
    response.split("\n").each do |line|
      next if line.empty?

      if line =~ /^\d+\|/
        # Text line: "<pmid>|<type>|<content>". Limit the split to three
        # fields so a "|" inside the content does not truncate it.
        pmid, _text_type, content = line.split("|", 3)
        (texts[pmid] ||= []) << content
      else
        # Annotation line: "<pmid>\t<start>\t<end>\t<literal>\t<type>\t<code>"
        pmid, start, _eend, literal, type, code = line.split("\t")
        if literal.nil?
          Log.low "Skipping malformed PubTator line: #{line}"
          next
        end

        ne = NamedEntity.setup(literal, code: code, type: type, offset: start.to_i)
        (entities[pmid] ||= []) << ne
      end
    end
  end
  private_class_method :parse_export

  # Relocates the entities of one PMID from the PubTator text onto the
  # alignment text, returning only those that could be relocated.
  #
  # The relocation runs inside a transformation of the alignment text that
  # replaces Greek characters using the Misc.greek_characters mapping.
  # NOTE(review): presumably because PubTator spells those letters out --
  # confirm against the service output.
  def self.relocate_entities(pmid, list, text, alignment, greek_characters)
    relocated = Transformed.with_transform(alignment, greek_characters.keys, lambda{|k| greek_characters[k] }) do
      list.collect do |entity|
        begin
          Segment.relocate(entity, text, alignment, 10)
          entity
        rescue StandardError
          # Only ordinary errors are treated as "entity not found"; interrupts
          # and exit requests must still propagate.
          Log.low "Entity #{entity} (#{entity.range}) not found in alignment text for #{pmid}"
          nil
        end
      end
    end

    relocated.compact
  end
  private_class_method :relocate_entities
end
|
@@ -10,10 +10,10 @@ module Transformed
|
|
10
10
|
text
|
11
11
|
end
|
12
12
|
|
13
|
-
def self.with_transform(text,
|
13
|
+
def self.with_transform(text, replace_segments, replacement = nil)
|
14
14
|
|
15
15
|
text.extend Transformed
|
16
|
-
text.replace_segments(
|
16
|
+
text.replace_segments(replace_segments, replacement)
|
17
17
|
|
18
18
|
segments = yield text
|
19
19
|
|
@@ -61,7 +61,7 @@ module Transformed
|
|
61
61
|
[begin_shift, end_shift]
|
62
62
|
end
|
63
63
|
|
64
|
-
def replace_segments(segments, replacement = nil, &block)
|
64
|
+
def replace_segments(segments, replacement = nil, strict = false, &block)
|
65
65
|
@transformed_segments ||= {}
|
66
66
|
@transformation_stack ||= []
|
67
67
|
stack = []
|
@@ -71,6 +71,15 @@ module Transformed
|
|
71
71
|
|
72
72
|
offset = self.respond_to?(:offset) ? self.offset.to_i : 0
|
73
73
|
|
74
|
+
segments = segments.collect do |s|
|
75
|
+
if Segment === s
|
76
|
+
s
|
77
|
+
elsif String === s
|
78
|
+
matches = self.scan(s)
|
79
|
+
Segment.align(self, matches)
|
80
|
+
end
|
81
|
+
end.flatten
|
82
|
+
|
74
83
|
segments = segments.select do |s|
|
75
84
|
shift = shift s.range
|
76
85
|
s_offset = s.offset.to_i
|
@@ -82,7 +91,6 @@ module Transformed
|
|
82
91
|
|
83
92
|
Segment.clean_sort(segments).each do |segment|
|
84
93
|
next if segment.offset.nil?
|
85
|
-
|
86
94
|
shift = shift segment.range
|
87
95
|
|
88
96
|
next if shift.nil?
|
@@ -139,7 +147,7 @@ module Transformed
|
|
139
147
|
when (segment.offset.to_i <= range.begin and segment.eend >= range.end + diff)
|
140
148
|
segment.replace self[segment.offset.to_i..segment.eend - diff]
|
141
149
|
else
|
142
|
-
raise "Segment
|
150
|
+
raise "Segment overlaps with transformation: #{Misc.fingerprint(segment)} (#{segment.range} & #{range.begin}..#{range.end + diff})"
|
143
151
|
end
|
144
152
|
end
|
145
153
|
|
@@ -155,10 +163,16 @@ module Transformed
|
|
155
163
|
|
156
164
|
new_range = (range.begin..range.last + diff)
|
157
165
|
self[new_range] = text
|
158
|
-
segments.
|
159
|
-
next unless Segment === segment
|
160
|
-
|
161
|
-
|
166
|
+
segments = segments.collect do |segment|
|
167
|
+
next segment unless Segment === segment
|
168
|
+
begin
|
169
|
+
fix_segment(segment, range, diff)
|
170
|
+
segment
|
171
|
+
rescue
|
172
|
+
Log.low "Skipped: " + $!.message
|
173
|
+
next
|
174
|
+
end
|
175
|
+
end.compact if Array === segments
|
162
176
|
end
|
163
177
|
segments
|
164
178
|
else
|
@@ -0,0 +1,51 @@
|
|
1
|
+
#: :type=:single
|
2
|
+
#Character Greek letter
|
3
|
+
Α Alpha
|
4
|
+
α alpha
|
5
|
+
Β Beta
|
6
|
+
β beta
|
7
|
+
γ gamma
|
8
|
+
Γ Gamma
|
9
|
+
Δ Delta
|
10
|
+
δ delta
|
11
|
+
Ε Epsilon
|
12
|
+
ε epsilon
|
13
|
+
Ζ Zeta
|
14
|
+
ζ zeta
|
15
|
+
η eta
|
16
|
+
Η Eta
|
17
|
+
Θ Theta
|
18
|
+
θ theta
|
19
|
+
ι iota
|
20
|
+
Ι Iota
|
21
|
+
Κ Kappa
|
22
|
+
κ kappa
|
23
|
+
λ lamda
|
24
|
+
Λ Lamda
|
25
|
+
Μ Mu
|
26
|
+
μ mu
|
27
|
+
ν nu
|
28
|
+
Ν Nu
|
29
|
+
Ξ Xi
|
30
|
+
ξ xi
|
31
|
+
ο omicron
|
32
|
+
Ο Omicron
|
33
|
+
π pi
|
34
|
+
Π Pi
|
35
|
+
Ρ Rho
|
36
|
+
ρ rho
|
37
|
+
ς final sigma
|
38
|
+
σ sigma
|
39
|
+
Σ Sigma
|
40
|
+
τ tau
|
41
|
+
Τ Tau
|
42
|
+
υ upsilon
|
43
|
+
Υ Upsilon
|
44
|
+
φ phi
|
45
|
+
Φ Phi
|
46
|
+
Χ Chi
|
47
|
+
χ chi
|
48
|
+
Ψ Psi
|
49
|
+
ψ psi
|
50
|
+
ω omega
|
51
|
+
Ω Omega
|
@@ -0,0 +1,70 @@
|
|
1
|
+
require File.expand_path(__FILE__).sub(%r(/test/.*), '/test/test_helper.rb')
|
2
|
+
require File.expand_path(__FILE__).sub(%r(.*/test/), '').sub(/test_(.*)\.rb/,'\1')
|
3
|
+
|
4
|
+
require 'rbbt/ner/NER'
|
5
|
+
require 'rbbt/document'
|
6
|
+
require 'rbbt/document/corpus'
|
7
|
+
require 'rbbt/document/corpus/pubmed'
|
8
|
+
# Tests for Pubtator.pubtator_entities when entities are relocated onto the
# documents of a temporary corpus.
class TestPubtator < Test::Unit::TestCase
  # Yields a Document::Corpus set up on a temporary file.
  def with_corpus(&block)
    TmpFile.with_file do |corpus|
      yield Document::Corpus.setup(corpus)
    end
  end

  # Shared body of the tests below: loads the PMIDs into a temporary corpus,
  # uses the resulting documents as alignments, and checks that every entity
  # returned equals the slice of its document at the entity's range.
  # When a block is given it receives each (pmid, list) pair right after that
  # list has been checked.
  def check_entities_against_documents(pmids)
    alignments = {}
    with_corpus do |corpus|
      corpus.add_pmid(pmids).each { |document| alignments[document.code] = document }

      Pubtator.pubtator_entities(pmids, ['gene'], alignments).each do |pmid, list|
        document = corpus.add_pmid(pmid)
        list.each { |entity| assert_equal entity, document[entity.range] }
        yield pmid, list if block_given?
      end
    end
  end

  # The leading underscore keeps this method out of the test run.
  def _test_align
    check_entities_against_documents(%w[19522013 20861254 38267746])
  end

  def test_pmid
    Log.severity = 0
    check_entities_against_documents(%w[22291955])
  end

  # The leading underscore keeps this method out of the test run.
  def _test_greek
    check_entities_against_documents(%w[20861254]) do |_pmid, list|
      # At least one relocated entity must still contain the Greek character
      assert list.select{|e| e.include? 'α' }.any?
    end
  end
end
|
70
|
+
|
@@ -279,6 +279,29 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
279
279
|
end
|
280
280
|
end
|
281
281
|
|
282
|
+
# Checks that plain strings can be given to Transformed.with_transform in
# place of segments, both in a single call and in nested calls.
def test_string_transform
  sentence = "This sentence mentions the TP53 gene and the CDK5R1 protein"
  tp53, cdk5r1 = "TP53", "CDK5R1"

  # Both mentions replaced by the same tag in one transformation
  Transformed.with_transform(sentence, [tp53, cdk5r1], "[G]") do
    assert_equal "This sentence mentions the [G] gene and the [G] protein", sentence
  end

  # Nested transformations, first mention on the outside
  Transformed.with_transform(sentence, [tp53], "[G1]") do
    Transformed.with_transform(sentence, [cdk5r1], "[G2]") do
      assert_equal "This sentence mentions the [G1] gene and the [G2] protein", sentence
    end
  end

  # Same nesting in the opposite order must give the same text
  Transformed.with_transform(sentence, [cdk5r1], "[G2]") do
    Transformed.with_transform(sentence, [tp53], "[G1]") do
      assert_equal "This sentence mentions the [G1] gene and the [G2] protein", sentence
    end
  end
end
|
303
|
+
|
304
|
+
|
282
305
|
def test_offset_transform
|
283
306
|
a = "ILF can bind to purine-rich regulatory motifs such as the human T-cell leukemia virus-long terminal region and the interleukin-2 promoter."
|
284
307
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt-text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.5.1
|
4
|
+
version: 1.5.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Miguel Vazquez
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-02-
|
11
|
+
date: 2024-02-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rbbt-util
|
@@ -82,6 +82,7 @@ files:
|
|
82
82
|
- lib/rbbt/ner/oscar3.rb
|
83
83
|
- lib/rbbt/ner/oscar4.rb
|
84
84
|
- lib/rbbt/ner/patterns.rb
|
85
|
+
- lib/rbbt/ner/pubtator.rb
|
85
86
|
- lib/rbbt/ner/regexpNER.rb
|
86
87
|
- lib/rbbt/ner/rner.rb
|
87
88
|
- lib/rbbt/ner/rnorm.rb
|
@@ -104,6 +105,7 @@ files:
|
|
104
105
|
- lib/rbbt/segment/token.rb
|
105
106
|
- lib/rbbt/segment/transformed.rb
|
106
107
|
- lib/rbbt/segment/tsv.rb
|
108
|
+
- lib/rbbt/text/misc.rb
|
107
109
|
- share/install/software/ABNER
|
108
110
|
- share/install/software/BANNER
|
109
111
|
- share/install/software/ChemicalTagger
|
@@ -119,6 +121,7 @@ files:
|
|
119
121
|
- share/rner/config.rb
|
120
122
|
- share/rnorm/cue_default
|
121
123
|
- share/rnorm/tokens_default
|
124
|
+
- share/text/greek.tsv
|
122
125
|
- share/wordlists/stopwords
|
123
126
|
- test/rbbt/bow/test_bow.rb
|
124
127
|
- test/rbbt/bow/test_dictionary.rb
|
@@ -139,6 +142,7 @@ files:
|
|
139
142
|
- test/rbbt/ner/test_ngram_prefix_dictionary.rb
|
140
143
|
- test/rbbt/ner/test_oscar4.rb
|
141
144
|
- test/rbbt/ner/test_patterns.rb
|
145
|
+
- test/rbbt/ner/test_pubtator.rb
|
142
146
|
- test/rbbt/ner/test_regexpNER.rb
|
143
147
|
- test/rbbt/ner/test_rner.rb
|
144
148
|
- test/rbbt/ner/test_rnorm.rb
|
@@ -199,6 +203,7 @@ test_files:
|
|
199
203
|
- test/rbbt/ner/test_ngram_prefix_dictionary.rb
|
200
204
|
- test/rbbt/ner/test_oscar4.rb
|
201
205
|
- test/rbbt/ner/test_patterns.rb
|
206
|
+
- test/rbbt/ner/test_pubtator.rb
|
202
207
|
- test/rbbt/ner/test_regexpNER.rb
|
203
208
|
- test/rbbt/ner/test_rner.rb
|
204
209
|
- test/rbbt/ner/test_rnorm.rb
|