rbbt-text 1.5.1 → 1.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/rbbt/ner/pubtator.rb +67 -0
- data/lib/rbbt/segment/transformed.rb +23 -9
- data/lib/rbbt/text/misc.rb +5 -0
- data/share/text/greek.tsv +51 -0
- data/test/rbbt/ner/test_pubtator.rb +70 -0
- data/test/rbbt/segment/test_transformed.rb +23 -0
- metadata +7 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 80e662635b01507c60638565bb8f00f789b4e6423227b98cee0ac38d8dd4b344
|
4
|
+
data.tar.gz: 6b9a1be5fdea2bb56a770ca2fe6838528102dfe2d15f27abb5f0e5b1849e6574
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 383affa64fa2b1e6d54817e343b3710d134c01d68867699490c604258324a6806bd829492873e819a5666780e0282ab1a375ae03147691aa64ef78705d7d0097
|
7
|
+
data.tar.gz: 67a52fca24335775faacf2a68cd101b24ff19d0fbff9560b4468c61546cee62aa02cdf8c5a1d3db39e5784856bd4df4573c9a00f901e8ca368cf169f8a88101d
|
@@ -0,0 +1,67 @@
|
|
1
|
+
|
2
|
+
require 'rbbt/segment'
|
3
|
+
require 'rbbt/segment/named_entity'
|
4
|
+
require 'rbbt/segment/transformed'
|
5
|
+
require 'rbbt/text/misc'
|
6
|
+
require 'rest-client'
|
7
|
+
module Pubtator
|
8
|
+
PUBTATOR_URL="https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/pubtator"
|
9
|
+
|
10
|
+
def self.pubtator_entities(pmids, concepts = ['gene'], alignments = nil)
|
11
|
+
|
12
|
+
texts = {}
|
13
|
+
entities = {}
|
14
|
+
|
15
|
+
last = nil
|
16
|
+
Misc.chunk(pmids, 1000) do |chunk|
|
17
|
+
time = Time.now
|
18
|
+
if last
|
19
|
+
diff = time - last
|
20
|
+
if diff < 3
|
21
|
+
sleep(3 - diff)
|
22
|
+
end
|
23
|
+
end
|
24
|
+
last = time
|
25
|
+
|
26
|
+
response = RestClient.post(PUBTATOR_URL, {pmids: chunk, concepts: concepts}.to_json, {content_type: 'json', accept: 'json'}).body
|
27
|
+
response.split("\n").each do |line|
|
28
|
+
next if line.empty?
|
29
|
+
if line =~ /^\d+\|/
|
30
|
+
pmid, text_type, content = line.split("|")
|
31
|
+
texts[pmid] ||= []
|
32
|
+
texts[pmid] << content
|
33
|
+
else
|
34
|
+
pmid, start, eend, literal, type, code = line.split("\t")
|
35
|
+
ne = NamedEntity.setup(literal, code: code, type: type, offset: start.to_i)
|
36
|
+
entities[pmid] ||= []
|
37
|
+
entities[pmid] << ne
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
if alignments
|
43
|
+
new_entities = {}
|
44
|
+
entities.each do |pmid,list|
|
45
|
+
text = texts[pmid] * " "
|
46
|
+
alignment = alignments[pmid]
|
47
|
+
raise "Alignment for #{pmid} not found" if alignment.nil?
|
48
|
+
greek_characters = Misc.greek_characters
|
49
|
+
new_list = Transformed.with_transform(alignment, greek_characters.keys, lambda{|k| greek_characters[k] }) do
|
50
|
+
list.collect do |entity|
|
51
|
+
begin
|
52
|
+
Segment.relocate(entity, text, alignment, 10)
|
53
|
+
entity
|
54
|
+
rescue Exception
|
55
|
+
Log.low "Entity #{entity} (#{entity.range}) not found in alignment text for #{pmid}"
|
56
|
+
next
|
57
|
+
end
|
58
|
+
end
|
59
|
+
end
|
60
|
+
new_entities[pmid] = new_list.compact
|
61
|
+
end
|
62
|
+
entities = new_entities
|
63
|
+
end
|
64
|
+
|
65
|
+
entities
|
66
|
+
end
|
67
|
+
end
|
@@ -10,10 +10,10 @@ module Transformed
|
|
10
10
|
text
|
11
11
|
end
|
12
12
|
|
13
|
-
def self.with_transform(text,
|
13
|
+
def self.with_transform(text, replace_segments, replacement = nil)
|
14
14
|
|
15
15
|
text.extend Transformed
|
16
|
-
text.replace_segments(
|
16
|
+
text.replace_segments(replace_segments, replacement)
|
17
17
|
|
18
18
|
segments = yield text
|
19
19
|
|
@@ -61,7 +61,7 @@ module Transformed
|
|
61
61
|
[begin_shift, end_shift]
|
62
62
|
end
|
63
63
|
|
64
|
-
def replace_segments(segments, replacement = nil, &block)
|
64
|
+
def replace_segments(segments, replacement = nil, strict = false, &block)
|
65
65
|
@transformed_segments ||= {}
|
66
66
|
@transformation_stack ||= []
|
67
67
|
stack = []
|
@@ -71,6 +71,15 @@ module Transformed
|
|
71
71
|
|
72
72
|
offset = self.respond_to?(:offset) ? self.offset.to_i : 0
|
73
73
|
|
74
|
+
segments = segments.collect do |s|
|
75
|
+
if Segment === s
|
76
|
+
s
|
77
|
+
elsif String === s
|
78
|
+
matches = self.scan(s)
|
79
|
+
Segment.align(self, matches)
|
80
|
+
end
|
81
|
+
end.flatten
|
82
|
+
|
74
83
|
segments = segments.select do |s|
|
75
84
|
shift = shift s.range
|
76
85
|
s_offset = s.offset.to_i
|
@@ -82,7 +91,6 @@ module Transformed
|
|
82
91
|
|
83
92
|
Segment.clean_sort(segments).each do |segment|
|
84
93
|
next if segment.offset.nil?
|
85
|
-
|
86
94
|
shift = shift segment.range
|
87
95
|
|
88
96
|
next if shift.nil?
|
@@ -139,7 +147,7 @@ module Transformed
|
|
139
147
|
when (segment.offset.to_i <= range.begin and segment.eend >= range.end + diff)
|
140
148
|
segment.replace self[segment.offset.to_i..segment.eend - diff]
|
141
149
|
else
|
142
|
-
raise "Segment
|
150
|
+
raise "Segment overlaps with transformation: #{Misc.fingerprint(segment)} (#{segment.range} & #{range.begin}..#{range.end + diff})"
|
143
151
|
end
|
144
152
|
end
|
145
153
|
|
@@ -155,10 +163,16 @@ module Transformed
|
|
155
163
|
|
156
164
|
new_range = (range.begin..range.last + diff)
|
157
165
|
self[new_range] = text
|
158
|
-
segments.
|
159
|
-
next unless Segment === segment
|
160
|
-
|
161
|
-
|
166
|
+
segments = segments.collect do |segment|
|
167
|
+
next segment unless Segment === segment
|
168
|
+
begin
|
169
|
+
fix_segment(segment, range, diff)
|
170
|
+
segment
|
171
|
+
rescue
|
172
|
+
Log.low "Skipped: " + $!.message
|
173
|
+
next
|
174
|
+
end
|
175
|
+
end.compact if Array === segments
|
162
176
|
end
|
163
177
|
segments
|
164
178
|
else
|
@@ -0,0 +1,51 @@
|
|
1
|
+
#: :type=:single
|
2
|
+
#Character Greek letter
|
3
|
+
Α Alpha
|
4
|
+
α alpha
|
5
|
+
Β Beta
|
6
|
+
β beta
|
7
|
+
γ gamma
|
8
|
+
Γ Gamma
|
9
|
+
Δ Delta
|
10
|
+
δ Delta
|
11
|
+
Ε Epsilon
|
12
|
+
ε epsilon
|
13
|
+
Ζ Zeta
|
14
|
+
ζ zeta
|
15
|
+
η eta
|
16
|
+
Η Eta
|
17
|
+
Θ Theta
|
18
|
+
θ theta
|
19
|
+
ι iota
|
20
|
+
Ι Iota
|
21
|
+
Κ Kappa
|
22
|
+
κ kappa
|
23
|
+
λ lamda
|
24
|
+
Λ Lamda
|
25
|
+
Μ Mu
|
26
|
+
μ mu
|
27
|
+
ν nu
|
28
|
+
Ν Nu
|
29
|
+
Ξ Xi
|
30
|
+
ξ xi
|
31
|
+
ο omicron
|
32
|
+
Ο Omicron
|
33
|
+
π pi
|
34
|
+
Π Pi
|
35
|
+
Ρ Rho
|
36
|
+
ρ rho
|
37
|
+
ς final sigma
|
38
|
+
σ sigma
|
39
|
+
Σ Sigma
|
40
|
+
τ tau
|
41
|
+
Τ Tau
|
42
|
+
υ upsilon
|
43
|
+
Υ Upsilon
|
44
|
+
φ phi
|
45
|
+
Φ Phi
|
46
|
+
Χ Chi
|
47
|
+
χ chi
|
48
|
+
Ψ Psi
|
49
|
+
ψ psi
|
50
|
+
ω omega
|
51
|
+
Ω Omega
|
@@ -0,0 +1,70 @@
|
|
1
|
+
require File.expand_path(__FILE__).sub(%r(/test/.*), '/test/test_helper.rb')
|
2
|
+
require File.expand_path(__FILE__).sub(%r(.*/test/), '').sub(/test_(.*)\.rb/,'\1')
|
3
|
+
|
4
|
+
require 'rbbt/ner/NER'
|
5
|
+
require 'rbbt/document'
|
6
|
+
require 'rbbt/document/corpus'
|
7
|
+
require 'rbbt/document/corpus/pubmed'
|
8
|
+
class TestPubtator < Test::Unit::TestCase
|
9
|
+
def with_corpus(&block)
|
10
|
+
TmpFile.with_file do |corpus|
|
11
|
+
yield Document::Corpus.setup(corpus)
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
def _test_align
|
16
|
+
pmids = "19522013|20861254|38267746".split("|")
|
17
|
+
alignments = {}
|
18
|
+
with_corpus do |corpus|
|
19
|
+
corpus.add_pmid(pmids).each do |document|
|
20
|
+
alignments[document.code] = document
|
21
|
+
end
|
22
|
+
entities = Pubtator.pubtator_entities(pmids, ['gene'], alignments)
|
23
|
+
entities.each do |pmid,list|
|
24
|
+
document = corpus.add_pmid(pmid)
|
25
|
+
list.each do |entity|
|
26
|
+
assert_equal entity, document[entity.range]
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
def test_pmid
|
33
|
+
Log.severity = 0
|
34
|
+
pmids = "22291955".split("|")
|
35
|
+
alignments = {}
|
36
|
+
with_corpus do |corpus|
|
37
|
+
corpus.add_pmid(pmids).each do |document|
|
38
|
+
alignments[document.code] = document
|
39
|
+
end
|
40
|
+
entities = Pubtator.pubtator_entities(pmids, ['gene'], alignments)
|
41
|
+
entities.each do |pmid,list|
|
42
|
+
document = corpus.add_pmid(pmid)
|
43
|
+
list.each do |entity|
|
44
|
+
assert_equal entity, document[entity.range]
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
def _test_greek
|
51
|
+
|
52
|
+
pmids = "20861254".split("|")
|
53
|
+
alignments = {}
|
54
|
+
with_corpus do |corpus|
|
55
|
+
corpus.add_pmid(pmids).each do |document|
|
56
|
+
alignments[document.code] = document
|
57
|
+
end
|
58
|
+
entities = Pubtator.pubtator_entities(pmids, ['gene'], alignments)
|
59
|
+
entities.each do |pmid,list|
|
60
|
+
document = corpus.add_pmid(pmid)
|
61
|
+
list.each do |entity|
|
62
|
+
assert_equal entity, document[entity.range]
|
63
|
+
end
|
64
|
+
assert list.select{|e| e.include? 'α' }.any?
|
65
|
+
end
|
66
|
+
|
67
|
+
end
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
@@ -279,6 +279,29 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
279
279
|
end
|
280
280
|
end
|
281
281
|
|
282
|
+
def test_string_transform
|
283
|
+
a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
284
|
+
|
285
|
+
gene1 = "TP53"
|
286
|
+
|
287
|
+
gene2 = "CDK5R1"
|
288
|
+
|
289
|
+
Transformed.with_transform(a, [gene1,gene2], "[G]") do
|
290
|
+
assert_equal "This sentence mentions the [G] gene and the [G] protein", a
|
291
|
+
end
|
292
|
+
Transformed.with_transform(a, [gene1], "[G1]") do
|
293
|
+
Transformed.with_transform(a, [gene2], "[G2]") do
|
294
|
+
assert_equal "This sentence mentions the [G1] gene and the [G2] protein", a
|
295
|
+
end
|
296
|
+
end
|
297
|
+
Transformed.with_transform(a, [gene2], "[G2]") do
|
298
|
+
Transformed.with_transform(a, [gene1], "[G1]") do
|
299
|
+
assert_equal "This sentence mentions the [G1] gene and the [G2] protein", a
|
300
|
+
end
|
301
|
+
end
|
302
|
+
end
|
303
|
+
|
304
|
+
|
282
305
|
def test_offset_transform
|
283
306
|
a = "ILF can bind to purine-rich regulatory motifs such as the human T-cell leukemia virus-long terminal region and the interleukin-2 promoter."
|
284
307
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt-text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.5.1
|
4
|
+
version: 1.5.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Miguel Vazquez
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-02-
|
11
|
+
date: 2024-02-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rbbt-util
|
@@ -82,6 +82,7 @@ files:
|
|
82
82
|
- lib/rbbt/ner/oscar3.rb
|
83
83
|
- lib/rbbt/ner/oscar4.rb
|
84
84
|
- lib/rbbt/ner/patterns.rb
|
85
|
+
- lib/rbbt/ner/pubtator.rb
|
85
86
|
- lib/rbbt/ner/regexpNER.rb
|
86
87
|
- lib/rbbt/ner/rner.rb
|
87
88
|
- lib/rbbt/ner/rnorm.rb
|
@@ -104,6 +105,7 @@ files:
|
|
104
105
|
- lib/rbbt/segment/token.rb
|
105
106
|
- lib/rbbt/segment/transformed.rb
|
106
107
|
- lib/rbbt/segment/tsv.rb
|
108
|
+
- lib/rbbt/text/misc.rb
|
107
109
|
- share/install/software/ABNER
|
108
110
|
- share/install/software/BANNER
|
109
111
|
- share/install/software/ChemicalTagger
|
@@ -119,6 +121,7 @@ files:
|
|
119
121
|
- share/rner/config.rb
|
120
122
|
- share/rnorm/cue_default
|
121
123
|
- share/rnorm/tokens_default
|
124
|
+
- share/text/greek.tsv
|
122
125
|
- share/wordlists/stopwords
|
123
126
|
- test/rbbt/bow/test_bow.rb
|
124
127
|
- test/rbbt/bow/test_dictionary.rb
|
@@ -139,6 +142,7 @@ files:
|
|
139
142
|
- test/rbbt/ner/test_ngram_prefix_dictionary.rb
|
140
143
|
- test/rbbt/ner/test_oscar4.rb
|
141
144
|
- test/rbbt/ner/test_patterns.rb
|
145
|
+
- test/rbbt/ner/test_pubtator.rb
|
142
146
|
- test/rbbt/ner/test_regexpNER.rb
|
143
147
|
- test/rbbt/ner/test_rner.rb
|
144
148
|
- test/rbbt/ner/test_rnorm.rb
|
@@ -199,6 +203,7 @@ test_files:
|
|
199
203
|
- test/rbbt/ner/test_ngram_prefix_dictionary.rb
|
200
204
|
- test/rbbt/ner/test_oscar4.rb
|
201
205
|
- test/rbbt/ner/test_patterns.rb
|
206
|
+
- test/rbbt/ner/test_pubtator.rb
|
202
207
|
- test/rbbt/ner/test_regexpNER.rb
|
203
208
|
- test/rbbt/ner/test_rner.rb
|
204
209
|
- test/rbbt/ner/test_rnorm.rb
|