rbbt-text 1.5.1 → 1.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2a1a38b7a9c9f9fe0ce8fd7b8fd19a3ca39f483e8ccaaa1412af023262181fd8
4
- data.tar.gz: ddec2f95b5c6fe9a69e67cf79a29c3ef2d9c37301e442d8664c7bbff1298a365
3
+ metadata.gz: 80e662635b01507c60638565bb8f00f789b4e6423227b98cee0ac38d8dd4b344
4
+ data.tar.gz: 6b9a1be5fdea2bb56a770ca2fe6838528102dfe2d15f27abb5f0e5b1849e6574
5
5
  SHA512:
6
- metadata.gz: b554f4db313a65e0b682e2fd4456aaf9d1b4a489ca025ad3dc89cf71df772d97807b8b2ec62e3753eb62cf93bd88d7d0555c7ca5f9d76bd52a6c6d5fd56b313d
7
- data.tar.gz: c419af9eb52723c0a761ddc025aadad99cebd1891cbd178aff466d509b84fdf76f449dfa62ba574a0b9dbeb0c519a4341b53375f18c09e52d19fd912cf5c1188
6
+ metadata.gz: 383affa64fa2b1e6d54817e343b3710d134c01d68867699490c604258324a6806bd829492873e819a5666780e0282ab1a375ae03147691aa64ef78705d7d0097
7
+ data.tar.gz: 67a52fca24335775faacf2a68cd101b24ff19d0fbff9560b4468c61546cee62aa02cdf8c5a1d3db39e5784856bd4df4573c9a00f901e8ca368cf169f8a88101d
@@ -0,0 +1,67 @@
1
+
2
+ require 'rbbt/segment'
3
+ require 'rbbt/segment/named_entity'
4
+ require 'rbbt/segment/transformed'
5
+ require 'rbbt/text/misc'
6
+ require 'rest-client'
7
# Client for the PubTator export endpoint: fetches annotations for a list of
# PMIDs and returns them as NamedEntity segments.
module Pubtator
  PUBTATOR_URL="https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/pubtator"

  # Retrieves the entities PubTator reports for +pmids+.
  #
  # The PMIDs are sent in chunks of 1000, leaving at least 3 seconds between
  # consecutive requests. Each response is read line by line: lines that
  # start with "<digits>|" carry article text ("pmid|type|content"), every
  # other non-empty line is read as a tab-separated annotation
  # ("pmid, start, end, literal, type, code").
  #
  # @param pmids [Array] PMIDs to query
  # @param concepts [Array<String>] concept types requested from the service
  # @param alignments [Hash, nil] optional map from PMID to the text the
  #   entities should be relocated onto; when given, each entity is passed
  #   through Segment.relocate and entities that cannot be relocated are
  #   logged and dropped
  # @return [Hash] PMID => list of entities set up with NamedEntity.setup
  # @raise [RuntimeError] when +alignments+ is given but has no entry for a
  #   PMID that has entities
  def self.pubtator_entities(pmids, concepts = ['gene'], alignments = nil)
    texts = {}
    entities = {}

    last = nil
    Misc.chunk(pmids, 1000) do |chunk|
      if last
        elapsed = Time.now - last
        sleep(3 - elapsed) if elapsed < 3
      end
      # Stamp the request time after any sleep; stamping before it would let
      # the next request go out less than 3 seconds after this one.
      last = Time.now

      response = RestClient.post(PUBTATOR_URL, {pmids: chunk, concepts: concepts}.to_json, {content_type: 'json', accept: 'json'}).body
      response.split("\n").each do |line|
        next if line.empty?
        if line =~ /^\d+\|/
          # Limit the split so a "|" inside the title/abstract is kept;
          # dropping it would shift every offset that follows.
          pmid, _text_type, content = line.split("|", 3)
          texts[pmid] ||= []
          texts[pmid] << content
        else
          pmid, start, _eend, literal, type, code = line.split("\t")
          # Not an entity line (too few columns): nothing to set up.
          next if literal.nil?
          ne = NamedEntity.setup(literal, code: code, type: type, offset: start.to_i)
          entities[pmid] ||= []
          entities[pmid] << ne
        end
      end
    end

    return entities unless alignments

    new_entities = {}
    entities.each do |pmid, list|
      # Text lines are joined with a single space to rebuild the text the
      # annotation offsets refer to.
      text = Array(texts[pmid]).join(" ")
      alignment = alignments[pmid]
      raise "Alignment for #{pmid} not found" if alignment.nil?
      greek_characters = Misc.greek_characters
      # Greek characters in the alignment text are replaced by their names
      # while relocating (presumably because PubTator spells them out --
      # confirm against the service output).
      new_list = Transformed.with_transform(alignment, greek_characters.keys, lambda{|k| greek_characters[k] }) do
        list.collect do |entity|
          begin
            Segment.relocate(entity, text, alignment, 10)
            entity
          rescue StandardError
            # StandardError only: Interrupt and SystemExit must propagate.
            Log.low "Entity #{entity} (#{entity.range}) not found in alignment text for #{pmid}"
            nil
          end
        end
      end
      new_entities[pmid] = new_list.compact
    end

    new_entities
  end
end
@@ -10,10 +10,10 @@ module Transformed
10
10
  text
11
11
  end
12
12
 
13
- def self.with_transform(text, segments, replacement = nil)
13
+ def self.with_transform(text, replace_segments, replacement = nil)
14
14
 
15
15
  text.extend Transformed
16
- text.replace_segments(segments, replacement)
16
+ text.replace_segments(replace_segments, replacement)
17
17
 
18
18
  segments = yield text
19
19
 
@@ -61,7 +61,7 @@ module Transformed
61
61
  [begin_shift, end_shift]
62
62
  end
63
63
 
64
- def replace_segments(segments, replacement = nil, &block)
64
+ def replace_segments(segments, replacement = nil, strict = false, &block)
65
65
  @transformed_segments ||= {}
66
66
  @transformation_stack ||= []
67
67
  stack = []
@@ -71,6 +71,15 @@ module Transformed
71
71
 
72
72
  offset = self.respond_to?(:offset) ? self.offset.to_i : 0
73
73
 
74
+ segments = segments.collect do |s|
75
+ if Segment === s
76
+ s
77
+ elsif String === s
78
+ matches = self.scan(s)
79
+ Segment.align(self, matches)
80
+ end
81
+ end.flatten
82
+
74
83
  segments = segments.select do |s|
75
84
  shift = shift s.range
76
85
  s_offset = s.offset.to_i
@@ -82,7 +91,6 @@ module Transformed
82
91
 
83
92
  Segment.clean_sort(segments).each do |segment|
84
93
  next if segment.offset.nil?
85
-
86
94
  shift = shift segment.range
87
95
 
88
96
  next if shift.nil?
@@ -139,7 +147,7 @@ module Transformed
139
147
  when (segment.offset.to_i <= range.begin and segment.eend >= range.end + diff)
140
148
  segment.replace self[segment.offset.to_i..segment.eend - diff]
141
149
  else
142
- raise "Segment Overlaps"
150
+ raise "Segment overlaps with transformation: #{Misc.fingerprint(segment)} (#{segment.range} & #{range.begin}..#{range.end + diff})"
143
151
  end
144
152
  end
145
153
 
@@ -155,10 +163,16 @@ module Transformed
155
163
 
156
164
  new_range = (range.begin..range.last + diff)
157
165
  self[new_range] = text
158
- segments.each do |segment|
159
- next unless Segment === segment
160
- fix_segment(segment, range, diff)
161
- end if Array === segments
166
+ segments = segments.collect do |segment|
167
+ next segment unless Segment === segment
168
+ begin
169
+ fix_segment(segment, range, diff)
170
+ segment
171
+ rescue
172
+ Log.low "Skipped: " + $!.message
173
+ next
174
+ end
175
+ end.compact if Array === segments
162
176
  end
163
177
  segments
164
178
  else
@@ -0,0 +1,5 @@
1
module Misc
  # Returns the table loaded from Rbbt.share.text.greek.tsv (per the shipped
  # share/text/greek.tsv header: Greek character => letter name).
  #
  # The table is loaded on first use and memoized in a module-level instance
  # variable rather than a @@class variable, so the cache is not shared with
  # whatever includes this module.
  def self.greek_characters
    @greek_characters ||= Rbbt.share.text.greek.tsv
  end
end
@@ -0,0 +1,51 @@
1
+ #: :type=:single
2
+ #Character Greek letter
3
+ Α Alpha
4
+ α alpha
5
+ Β Beta
6
+ β beta
7
+ γ gamma
8
+ Γ Gamma
9
+ Δ Delta
10
+ δ delta
11
+ Ε Epsilon
12
+ ε epsilon
13
+ Ζ Zeta
14
+ ζ zeta
15
+ η eta
16
+ Η Eta
17
+ Θ Theta
18
+ θ theta
19
+ ι iota
20
+ Ι Iota
21
+ Κ Kappa
22
+ κ kappa
23
+ λ lamda
24
+ Λ Lamda
25
+ Μ Mu
26
+ μ mu
27
+ ν nu
28
+ Ν Nu
29
+ Ξ Xi
30
+ ξ xi
31
+ ο omicron
32
+ Ο Omicron
33
+ π pi
34
+ Π Pi
35
+ Ρ Rho
36
+ ρ rho
37
+ ς final sigma
38
+ σ sigma
39
+ Σ Sigma
40
+ τ tau
41
+ Τ Tau
42
+ υ upsilon
43
+ Υ Upsilon
44
+ φ phi
45
+ Φ Phi
46
+ Χ Chi
47
+ χ chi
48
+ Ψ Psi
49
+ ψ psi
50
+ ω omega
51
+ Ω Omega
@@ -0,0 +1,70 @@
1
+ require File.expand_path(__FILE__).sub(%r(/test/.*), '/test/test_helper.rb')
2
+ require File.expand_path(__FILE__).sub(%r(.*/test/), '').sub(/test_(.*)\.rb/,'\1')
3
+
4
+ require 'rbbt/ner/NER'
5
+ require 'rbbt/document'
6
+ require 'rbbt/document/corpus'
7
+ require 'rbbt/document/corpus/pubmed'
8
# Tests for Pubtator.pubtator_entities run against documents added to a
# temporary corpus. Only test_pmid is active; the methods prefixed with an
# underscore are disabled.
class TestPubtator < Test::Unit::TestCase
  # Yields a Document::Corpus set up on a temporary file.
  def with_corpus(&block)
    TmpFile.with_file do |path|
      yield Document::Corpus.setup(path)
    end
  end

  # Adds +pmids+ to a fresh corpus, uses the resulting documents as the
  # alignment texts and asserts that every returned entity equals the
  # document text at the entity's range. When a block is given it receives
  # each PMID's entity list after those assertions.
  def assert_entities_aligned(pmids)
    with_corpus do |corpus|
      alignments = {}
      corpus.add_pmid(pmids).each do |document|
        alignments[document.code] = document
      end

      found = Pubtator.pubtator_entities(pmids, ['gene'], alignments)
      found.each do |pmid, list|
        document = corpus.add_pmid(pmid)
        list.each do |entity|
          assert_equal entity, document[entity.range]
        end
        yield list if block_given?
      end
    end
  end

  def _test_align
    assert_entities_aligned "19522013|20861254|38267746".split("|")
  end

  def test_pmid
    Log.severity = 0
    assert_entities_aligned "22291955".split("|")
  end

  def _test_greek
    assert_entities_aligned("20861254".split("|")) do |list|
      assert list.select{|e| e.include? 'α' }.any?
    end
  end
end
70
+
@@ -279,6 +279,29 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
279
279
  end
280
280
  end
281
281
 
282
# Plain strings (rather than Segment objects) can be passed as the segments
# to replace; checked for a single call with both strings and for nested
# calls in either order.
def test_string_transform
  sentence = "This sentence mentions the TP53 gene and the CDK5R1 protein"
  tp53 = "TP53"
  cdk5r1 = "CDK5R1"

  Transformed.with_transform(sentence, [tp53, cdk5r1], "[G]") do
    assert_equal "This sentence mentions the [G] gene and the [G] protein", sentence
  end

  both_tagged = "This sentence mentions the [G1] gene and the [G2] protein"

  Transformed.with_transform(sentence, [tp53], "[G1]") do
    Transformed.with_transform(sentence, [cdk5r1], "[G2]") do
      assert_equal both_tagged, sentence
    end
  end

  Transformed.with_transform(sentence, [cdk5r1], "[G2]") do
    Transformed.with_transform(sentence, [tp53], "[G1]") do
      assert_equal both_tagged, sentence
    end
  end
end
303
+
304
+
282
305
  def test_offset_transform
283
306
  a = "ILF can bind to purine-rich regulatory motifs such as the human T-cell leukemia virus-long terminal region and the interleukin-2 promoter."
284
307
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-text
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.5.1
4
+ version: 1.5.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Miguel Vazquez
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-02-14 00:00:00.000000000 Z
11
+ date: 2024-02-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rbbt-util
@@ -82,6 +82,7 @@ files:
82
82
  - lib/rbbt/ner/oscar3.rb
83
83
  - lib/rbbt/ner/oscar4.rb
84
84
  - lib/rbbt/ner/patterns.rb
85
+ - lib/rbbt/ner/pubtator.rb
85
86
  - lib/rbbt/ner/regexpNER.rb
86
87
  - lib/rbbt/ner/rner.rb
87
88
  - lib/rbbt/ner/rnorm.rb
@@ -104,6 +105,7 @@ files:
104
105
  - lib/rbbt/segment/token.rb
105
106
  - lib/rbbt/segment/transformed.rb
106
107
  - lib/rbbt/segment/tsv.rb
108
+ - lib/rbbt/text/misc.rb
107
109
  - share/install/software/ABNER
108
110
  - share/install/software/BANNER
109
111
  - share/install/software/ChemicalTagger
@@ -119,6 +121,7 @@ files:
119
121
  - share/rner/config.rb
120
122
  - share/rnorm/cue_default
121
123
  - share/rnorm/tokens_default
124
+ - share/text/greek.tsv
122
125
  - share/wordlists/stopwords
123
126
  - test/rbbt/bow/test_bow.rb
124
127
  - test/rbbt/bow/test_dictionary.rb
@@ -139,6 +142,7 @@ files:
139
142
  - test/rbbt/ner/test_ngram_prefix_dictionary.rb
140
143
  - test/rbbt/ner/test_oscar4.rb
141
144
  - test/rbbt/ner/test_patterns.rb
145
+ - test/rbbt/ner/test_pubtator.rb
142
146
  - test/rbbt/ner/test_regexpNER.rb
143
147
  - test/rbbt/ner/test_rner.rb
144
148
  - test/rbbt/ner/test_rnorm.rb
@@ -199,6 +203,7 @@ test_files:
199
203
  - test/rbbt/ner/test_ngram_prefix_dictionary.rb
200
204
  - test/rbbt/ner/test_oscar4.rb
201
205
  - test/rbbt/ner/test_patterns.rb
206
+ - test/rbbt/ner/test_pubtator.rb
202
207
  - test/rbbt/ner/test_regexpNER.rb
203
208
  - test/rbbt/ner/test_rner.rb
204
209
  - test/rbbt/ner/test_rnorm.rb