rbbt-text 1.5.1 → 1.5.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2a1a38b7a9c9f9fe0ce8fd7b8fd19a3ca39f483e8ccaaa1412af023262181fd8
4
- data.tar.gz: ddec2f95b5c6fe9a69e67cf79a29c3ef2d9c37301e442d8664c7bbff1298a365
3
+ metadata.gz: 80e662635b01507c60638565bb8f00f789b4e6423227b98cee0ac38d8dd4b344
4
+ data.tar.gz: 6b9a1be5fdea2bb56a770ca2fe6838528102dfe2d15f27abb5f0e5b1849e6574
5
5
  SHA512:
6
- metadata.gz: b554f4db313a65e0b682e2fd4456aaf9d1b4a489ca025ad3dc89cf71df772d97807b8b2ec62e3753eb62cf93bd88d7d0555c7ca5f9d76bd52a6c6d5fd56b313d
7
- data.tar.gz: c419af9eb52723c0a761ddc025aadad99cebd1891cbd178aff466d509b84fdf76f449dfa62ba574a0b9dbeb0c519a4341b53375f18c09e52d19fd912cf5c1188
6
+ metadata.gz: 383affa64fa2b1e6d54817e343b3710d134c01d68867699490c604258324a6806bd829492873e819a5666780e0282ab1a375ae03147691aa64ef78705d7d0097
7
+ data.tar.gz: 67a52fca24335775faacf2a68cd101b24ff19d0fbff9560b4468c61546cee62aa02cdf8c5a1d3db39e5784856bd4df4573c9a00f901e8ca368cf169f8a88101d
@@ -0,0 +1,67 @@
1
+
2
+ require 'rbbt/segment'
3
+ require 'rbbt/segment/named_entity'
4
+ require 'rbbt/segment/transformed'
5
+ require 'rbbt/text/misc'
6
+ require 'rest-client'
7
module Pubtator
  PUBTATOR_URL="https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/pubtator"

  # Retrieves PubTator annotations for a list of PubMed ids.
  #
  # The ids are posted to PUBTATOR_URL in chunks of 1000, leaving at least
  # 3 seconds between consecutive requests. Each response line is either a
  # text line ("pmid|type|content") or a tab-separated annotation line
  # (pmid, start, end, literal, type, code).
  #
  # @param pmids [Array] PubMed ids to query
  # @param concepts [Array<String>] concept types sent in the request
  # @param alignments [Hash, nil] optional pmid => text map; when given, every
  #   entity is relocated onto that text with Segment.relocate, and entities
  #   that cannot be relocated are logged and dropped
  # @return [Hash] pmid => list of entities built with NamedEntity.setup
  # @raise [RuntimeError] when +alignments+ is given but has no entry for a
  #   pmid that has entities
  def self.pubtator_entities(pmids, concepts = ['gene'], alignments = nil)
    texts = {}
    entities = {}

    last = nil
    Misc.chunk(pmids, 1000) do |chunk|
      if last
        diff = Time.now - last
        sleep(3 - diff) if diff < 3
      end
      # Take the timestamp after any wait, so the next interval is measured
      # from the moment this request is actually sent.
      last = Time.now

      response = RestClient.post(PUBTATOR_URL, {pmids: chunk, concepts: concepts}.to_json, {content_type: 'json', accept: 'json'}).body
      response.split("\n").each do |line|
        next if line.empty?
        if line =~ /^\d+\|/
          # Limit the split to three fields: the content itself may contain "|".
          pmid, _text_type, content = line.split("|", 3)
          (texts[pmid] ||= []) << content
        else
          pmid, start, _eend, literal, type, code = line.split("\t")
          ne = NamedEntity.setup(literal, code: code, type: type, offset: start.to_i)
          (entities[pmid] ||= []) << ne
        end
      end
    end

    return entities unless alignments

    new_entities = {}
    entities.each do |pmid, list|
      # A pmid may have annotations but no text lines; treat that as empty text.
      text = texts.fetch(pmid, []) * " "
      alignment = alignments[pmid]
      raise "Alignment for #{pmid} not found" if alignment.nil?
      greek_characters = Misc.greek_characters
      # Greek characters in the alignment text are replaced by the values of
      # Misc.greek_characters while the entities are being relocated.
      new_list = Transformed.with_transform(alignment, greek_characters.keys, lambda{|k| greek_characters[k] }) do
        list.collect do |entity|
          begin
            Segment.relocate(entity, text, alignment, 10)
            entity
          rescue StandardError
            # Only ordinary errors are skipped; interrupts and exits propagate.
            Log.low "Entity #{entity} (#{entity.range}) not found in alignment text for #{pmid}"
            nil
          end
        end
      end
      new_entities[pmid] = new_list.compact
    end

    new_entities
  end
end
@@ -10,10 +10,10 @@ module Transformed
10
10
  text
11
11
  end
12
12
 
13
- def self.with_transform(text, segments, replacement = nil)
13
+ def self.with_transform(text, replace_segments, replacement = nil)
14
14
 
15
15
  text.extend Transformed
16
- text.replace_segments(segments, replacement)
16
+ text.replace_segments(replace_segments, replacement)
17
17
 
18
18
  segments = yield text
19
19
 
@@ -61,7 +61,7 @@ module Transformed
61
61
  [begin_shift, end_shift]
62
62
  end
63
63
 
64
- def replace_segments(segments, replacement = nil, &block)
64
+ def replace_segments(segments, replacement = nil, strict = false, &block)
65
65
  @transformed_segments ||= {}
66
66
  @transformation_stack ||= []
67
67
  stack = []
@@ -71,6 +71,15 @@ module Transformed
71
71
 
72
72
  offset = self.respond_to?(:offset) ? self.offset.to_i : 0
73
73
 
74
+ segments = segments.collect do |s|
75
+ if Segment === s
76
+ s
77
+ elsif String === s
78
+ matches = self.scan(s)
79
+ Segment.align(self, matches)
80
+ end
81
+ end.flatten
82
+
74
83
  segments = segments.select do |s|
75
84
  shift = shift s.range
76
85
  s_offset = s.offset.to_i
@@ -82,7 +91,6 @@ module Transformed
82
91
 
83
92
  Segment.clean_sort(segments).each do |segment|
84
93
  next if segment.offset.nil?
85
-
86
94
  shift = shift segment.range
87
95
 
88
96
  next if shift.nil?
@@ -139,7 +147,7 @@ module Transformed
139
147
  when (segment.offset.to_i <= range.begin and segment.eend >= range.end + diff)
140
148
  segment.replace self[segment.offset.to_i..segment.eend - diff]
141
149
  else
142
- raise "Segment Overlaps"
150
+ raise "Segment overlaps with transformation: #{Misc.fingerprint(segment)} (#{segment.range} & #{range.begin}..#{range.end + diff})"
143
151
  end
144
152
  end
145
153
 
@@ -155,10 +163,16 @@ module Transformed
155
163
 
156
164
  new_range = (range.begin..range.last + diff)
157
165
  self[new_range] = text
158
- segments.each do |segment|
159
- next unless Segment === segment
160
- fix_segment(segment, range, diff)
161
- end if Array === segments
166
+ segments = segments.collect do |segment|
167
+ next segment unless Segment === segment
168
+ begin
169
+ fix_segment(segment, range, diff)
170
+ segment
171
+ rescue
172
+ Log.low "Skipped: " + $!.message
173
+ next
174
+ end
175
+ end.compact if Array === segments
162
176
  end
163
177
  segments
164
178
  else
@@ -0,0 +1,5 @@
1
module Misc
  # Returns the table loaded from Rbbt.share.text.greek.tsv, loading it on the
  # first call and reusing the same object afterwards.
  #
  # The value is memoized in a module-level instance variable rather than a
  # class variable, so it is not shared with classes that include Misc.
  #
  # NOTE(review): presumably maps each Greek character to its spelled-out
  # name (as in share/text/greek.tsv) -- confirm against the TSV loader.
  def self.greek_characters
    @greek_characters ||= Rbbt.share.text.greek.tsv
  end
end
@@ -0,0 +1,51 @@
1
+ #: :type=:single
2
+ #Character Greek letter
3
+ Α Alpha
4
+ α alpha
5
+ Β Beta
6
+ β beta
7
+ γ gamma
8
+ Γ Gamma
9
+ Δ Delta
10
+ δ delta
11
+ Ε Epsilon
12
+ ε epsilon
13
+ Ζ Zeta
14
+ ζ zeta
15
+ η eta
16
+ Η Eta
17
+ Θ Theta
18
+ θ theta
19
+ ι iota
20
+ Ι Iota
21
+ Κ Kappa
22
+ κ kappa
23
+ λ lamda
24
+ Λ Lamda
25
+ Μ Mu
26
+ μ mu
27
+ ν nu
28
+ Ν Nu
29
+ Ξ Xi
30
+ ξ xi
31
+ ο omicron
32
+ Ο Omicron
33
+ π pi
34
+ Π Pi
35
+ Ρ Rho
36
+ ρ rho
37
+ ς final sigma
38
+ σ sigma
39
+ Σ Sigma
40
+ τ tau
41
+ Τ Tau
42
+ υ upsilon
43
+ Υ Upsilon
44
+ φ phi
45
+ Φ Phi
46
+ Χ Chi
47
+ χ chi
48
+ Ψ Psi
49
+ ψ psi
50
+ ω omega
51
+ Ω Omega
@@ -0,0 +1,70 @@
1
+ require File.expand_path(__FILE__).sub(%r(/test/.*), '/test/test_helper.rb')
2
+ require File.expand_path(__FILE__).sub(%r(.*/test/), '').sub(/test_(.*)\.rb/,'\1')
3
+
4
+ require 'rbbt/ner/NER'
5
+ require 'rbbt/document'
6
+ require 'rbbt/document/corpus'
7
+ require 'rbbt/document/corpus/pubmed'
8
class TestPubtator < Test::Unit::TestCase
  # Yields a Document::Corpus set up on a temporary file.
  def with_corpus(&block)
    TmpFile.with_file { |path| yield Document::Corpus.setup(path) }
  end

  # Loads the documents for the "|"-separated PMIDs into a temporary corpus,
  # asks Pubtator for gene entities aligned to those documents and asserts
  # that each entity equals the document text at the entity's range.
  # When a block is given, it receives each per-document entity list after
  # that document's entities have been checked.
  def check_aligned_entities(pmid_string)
    pmids = pmid_string.split("|")
    alignments = {}
    with_corpus do |corpus|
      corpus.add_pmid(pmids).each { |doc| alignments[doc.code] = doc }

      Pubtator.pubtator_entities(pmids, ['gene'], alignments).each do |pmid, found|
        doc = corpus.add_pmid(pmid)
        found.each { |entity| assert_equal entity, doc[entity.range] }
        yield found if block_given?
      end
    end
  end

  # Disabled (leading underscore): alignment over several PMIDs.
  def _test_align
    check_aligned_entities "19522013|20861254|38267746"
  end

  # Alignment for a single PMID, with Log.severity set to 0.
  def test_pmid
    Log.severity = 0
    check_aligned_entities "22291955"
  end

  # Disabled (leading underscore): expects at least one aligned entity
  # containing the Greek character itself.
  def _test_greek
    check_aligned_entities "20861254" do |found|
      assert found.any? { |entity| entity.include?('α') }
    end
  end
end
70
+
@@ -279,6 +279,29 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
279
279
  end
280
280
  end
281
281
 
282
# Plain strings passed as segments are located in the text and replaced for
# the duration of the block, for a single call with two strings and for
# nested calls in either order.
def test_string_transform
  a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
  first_gene, second_gene = "TP53", "CDK5R1"
  nested_expectation = "This sentence mentions the [G1] gene and the [G2] protein"

  # Both strings replaced in one call
  Transformed.with_transform(a, [first_gene, second_gene], "[G]") do
    assert_equal "This sentence mentions the [G] gene and the [G] protein", a
  end

  # Nested: first gene outside, second gene inside
  Transformed.with_transform(a, [first_gene], "[G1]") do
    Transformed.with_transform(a, [second_gene], "[G2]") do
      assert_equal nested_expectation, a
    end
  end

  # Nested: second gene outside, first gene inside
  Transformed.with_transform(a, [second_gene], "[G2]") do
    Transformed.with_transform(a, [first_gene], "[G1]") do
      assert_equal nested_expectation, a
    end
  end
end
303
+
304
+
282
305
  def test_offset_transform
283
306
  a = "ILF can bind to purine-rich regulatory motifs such as the human T-cell leukemia virus-long terminal region and the interleukin-2 promoter."
284
307
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-text
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.5.1
4
+ version: 1.5.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Miguel Vazquez
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-02-14 00:00:00.000000000 Z
11
+ date: 2024-02-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rbbt-util
@@ -82,6 +82,7 @@ files:
82
82
  - lib/rbbt/ner/oscar3.rb
83
83
  - lib/rbbt/ner/oscar4.rb
84
84
  - lib/rbbt/ner/patterns.rb
85
+ - lib/rbbt/ner/pubtator.rb
85
86
  - lib/rbbt/ner/regexpNER.rb
86
87
  - lib/rbbt/ner/rner.rb
87
88
  - lib/rbbt/ner/rnorm.rb
@@ -104,6 +105,7 @@ files:
104
105
  - lib/rbbt/segment/token.rb
105
106
  - lib/rbbt/segment/transformed.rb
106
107
  - lib/rbbt/segment/tsv.rb
108
+ - lib/rbbt/text/misc.rb
107
109
  - share/install/software/ABNER
108
110
  - share/install/software/BANNER
109
111
  - share/install/software/ChemicalTagger
@@ -119,6 +121,7 @@ files:
119
121
  - share/rner/config.rb
120
122
  - share/rnorm/cue_default
121
123
  - share/rnorm/tokens_default
124
+ - share/text/greek.tsv
122
125
  - share/wordlists/stopwords
123
126
  - test/rbbt/bow/test_bow.rb
124
127
  - test/rbbt/bow/test_dictionary.rb
@@ -139,6 +142,7 @@ files:
139
142
  - test/rbbt/ner/test_ngram_prefix_dictionary.rb
140
143
  - test/rbbt/ner/test_oscar4.rb
141
144
  - test/rbbt/ner/test_patterns.rb
145
+ - test/rbbt/ner/test_pubtator.rb
142
146
  - test/rbbt/ner/test_regexpNER.rb
143
147
  - test/rbbt/ner/test_rner.rb
144
148
  - test/rbbt/ner/test_rnorm.rb
@@ -199,6 +203,7 @@ test_files:
199
203
  - test/rbbt/ner/test_ngram_prefix_dictionary.rb
200
204
  - test/rbbt/ner/test_oscar4.rb
201
205
  - test/rbbt/ner/test_patterns.rb
206
+ - test/rbbt/ner/test_pubtator.rb
202
207
  - test/rbbt/ner/test_regexpNER.rb
203
208
  - test/rbbt/ner/test_rner.rb
204
209
  - test/rbbt/ner/test_rnorm.rb