rbbt-text 1.3.9 → 1.3.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 92c6b4b9d3452c6b495fc9f291b551a73c8c150faee05053b7ecadc62ccbbd53
4
- data.tar.gz: 70e341cf31466628c42b9947c882b64ff592e4703e06c8629ab56d513fe0a975
3
+ metadata.gz: 2f10312d9b6598ddc9b6fa98b38909afdd575b33a497ae1ff3f17c7a9c6e37bf
4
+ data.tar.gz: f79c61c7e34dd113a2c5002342c0c2df92a4a28c770394bf2c456a34a2730cc7
5
5
  SHA512:
6
- metadata.gz: 2e8fdff40dd93072c3c377c59ff02d7374f3f81961dfc0f2596386776408c623543eb2e1f0da0112b3a8384d865c8331c659c650f2f2288a3d6282eca80e804e
7
- data.tar.gz: d7b335d138eb48de51af8922d80d01715c4c61c025b525a9f63fcae789de329eedc2557cb10ba369f5987205ef87ffd85fc407fc3a29f6a75ef0b41951e4962b
6
+ metadata.gz: a9fb4dc49c538a58a8aa04e81947df212668c5ef9097434fa7d3eff54dd17a8657f581451b64e6b247cb64428436823a305dd64ae6a5fed2126b92285c02ad81
7
+ data.tar.gz: 0d31423660cd232102aa2b9914dab61ff929cf02a37b5094bd58481cac733c167d0e4fcdb4b3025e41a4775bd8033566ed3f402f66c317b3955406d1a3d3eb6f
@@ -6,7 +6,9 @@ module Document
6
6
  send :property, type do
7
7
  segments = self.instance_exec &block
8
8
 
9
- Segment.align(self, segments) unless segments.empty? || (Segment === segments && segments.offset) || (Segment === segments.first && segments.first.offset)
9
+ Segment.align(self, segments) unless segments.empty? ||
10
+ (Segment === segments && segments.offset) ||
11
+ (Array === segments && Segment === segments.first && segments.first.offset)
10
12
 
11
13
  segments.each do |segment|
12
14
  SegmentAnnotation.setup(segment, :type => type.to_s) unless SegmentAnnotation === segment && segment.type
@@ -17,6 +19,36 @@ module Document
17
19
 
18
20
  segments
19
21
  end
22
+
23
+ DocID.property type do
24
+ self.document.send(type)
25
+ end
26
+
27
+ SegID.property type do
28
+ self.overlaps(self.docid.send(type))
29
+ end
30
+
31
+ Segment.property type do
32
+ self.overlaps(self.docid.send(type))
33
+ end
34
+
35
+ seg_type = "segids_for_" + type.to_s
36
+
37
+ send :property, seg_type do
38
+ SegID.setup(self.send(type).collect{|s| s.segid })
39
+ end
40
+
41
+ DocID.property seg_type do
42
+ self.document.send(seg_type)
43
+ end
44
+
45
+ SegID.property seg_type do
46
+ self.overlaps(self.docid.send(seg_type))
47
+ end
48
+
49
+ Segment.property seg_type do
50
+ self.overlaps(self.docid.send(seg_type))
51
+ end
20
52
  end
21
53
 
22
54
  def self.define_multiple(type, &block)
@@ -28,7 +60,10 @@ module Document
28
60
  doc_segments.each_with_index do |segments,i|
29
61
  next if segments.nil?
30
62
  document = list[i]
31
- Segment.align(document, segments) unless segments.nil? || segments.empty? || (Segment === segments && segments.offset) || (Segment === segments.first && segments.first.offset)
63
+ Segment.align(document, segments) unless segments.nil? ||
64
+ segments.empty? ||
65
+ (Segment === segments && segments.offset) ||
66
+ (Array === segments && Segment === segments.first && segments.first.offset)
32
67
 
33
68
  segments.each do |segment|
34
69
  SegmentAnnotation.setup(segment, :type => type.to_s) unless SegmentAnnotation === segment && segment.type
@@ -41,5 +76,35 @@ module Document
41
76
  segments
42
77
  end
43
78
  end
79
+
80
+ DocID.property type do
81
+ self.document.send(type)
82
+ end
83
+
84
+ SegID.property type do
85
+ self.overlaps(self.docid.send(type))
86
+ end
87
+
88
+ Segment.property type do
89
+ self.overlaps(self.docid.send(type))
90
+ end
91
+
92
+ seg_type = "segids_for_" + type.to_s
93
+
94
+ send :property, seg_type do
95
+ SegID.setup(self.send(type).collect{|s| s.segid })
96
+ end
97
+
98
+ DocID.property seg_type do
99
+ self.document.send(seg_type)
100
+ end
101
+
102
+ SegID.property seg_type do
103
+ self.overlaps(self.docid.send(seg_type))
104
+ end
105
+
106
+ Segment.property seg_type do
107
+ self.overlaps(self.docid.send(seg_type))
108
+ end
44
109
  end
45
110
  end
@@ -2,8 +2,8 @@ require 'rbbt/sources/pubmed'
2
2
 
3
3
  module Document::Corpus
4
4
  PUBMED_NAMESPACE="PMID"
5
- def add_pmid(pmid, type = nil, update = false)
6
- type = :abstract if type.nil?
5
+ def add_pmid(pmid, type = :title_and_abstract, update = false)
6
+ type = :title_and_abstract if type.nil?
7
7
 
8
8
  if ! (update || Array === pmid)
9
9
  id = [PUBMED_NAMESPACE, pmid, type].collect{|e| e.to_s}*":"
@@ -16,9 +16,11 @@ module Document::Corpus
16
16
 
17
17
  res = PubMed.get_article(pmids).collect do |pmid, article|
18
18
  document = if type.to_sym == :abstract
19
- Document.setup(article.abstract || "", PUBMED_NAMESPACE, pmid, :abstract, self, :corpus => self)
19
+ Document.setup(article.abstract || "", PUBMED_NAMESPACE, pmid, type.to_sym , self, :corpus => self)
20
20
  elsif type.to_sym == :title
21
- Document.setup(article.title, PUBMED_NAMESPACE, pmid, :title, self)
21
+ Document.setup(article.title || "", PUBMED_NAMESPACE, pmid, type.to_sym, self)
22
+ elsif type.to_sym == :title_and_abstract
23
+ Document.setup((article.title || "") + "\n\n" + (article.abstract || ""), PUBMED_NAMESPACE, pmid, type.to_sym, self)
22
24
  else
23
25
  raise "No FullText available for #{ pmid }" if article.full_text.nil?
24
26
  Document.setup(article.full_text, PUBMED_NAMESPACE, pmid, :fulltext, self, :corpus => self)
data/lib/rbbt/document.rb CHANGED
@@ -9,6 +9,10 @@ module DocID
9
9
  attr_accessor :default_corpus
10
10
  end
11
11
 
12
+ def id
13
+ self
14
+ end
15
+
12
16
  def corpus
13
17
  annotation_values[:corpus] || DocID.default_corpus
14
18
  end
@@ -66,7 +66,8 @@ EOF
66
66
  end
67
67
 
68
68
  Open.write('config', CONFIG)
69
- CMD.cmd_log("java -Xmx20G -Xms20G -jar '#{Rbbt.software.opt.GNormPlus.produce.find}/GNormPlus.jar' 'input' 'output' 'config'")
69
+ mem = Rbbt::Config.get(:java_mem, :GNormPlus, :g_norm_plus, :gnormplus, :gnp, :default => "2G")
70
+ CMD.cmd_log("java -Xmx#{mem} -Xms#{mem} -jar '#{Rbbt.software.opt.GNormPlus.produce.find}/GNormPlus.jar' 'input' 'output' 'config'")
70
71
 
71
72
  if texts.respond_to? :key_field
72
73
  key_field = texts.key_field
@@ -11,7 +11,15 @@ class RegExpNER < NER
11
11
  pre = matchdata.pre_match
12
12
  post = matchdata.post_match
13
13
 
14
- if matchdata.captures.any?
14
+ if matchdata.named_captures.any?
15
+ match = matchdata[0]
16
+ code = matchdata.named_captures.collect{|k,v| [k,v] * "=" } * ";"
17
+ NamedEntity.setup(match, :offset => pre.length + start, :entity_type => type, :code => code)
18
+ matches << match
19
+ eend = match.length + pre.length
20
+ text = text[eend..-1]
21
+ start += match.length + pre.length
22
+ elsif matchdata.captures.any?
15
23
  match = matchdata.captures.first
16
24
  offset, eend = matchdata.offset(1)
17
25
  NamedEntity.setup(match, :offset => start + offset, :entity_type => type)
@@ -88,7 +96,7 @@ class RegExpNER < NER
88
96
  def match(text)
89
97
  matches = RegExpNER.match_regexp_hash(text, @regexps, @split_on_matches)
90
98
  matches.collect do |m|
91
- NamedEntity.setup(m, :offset => m.offset, :type => m.type, :code => m)
99
+ NamedEntity.setup(m, :offset => m.offset, :type => m.type, :code => m.code || m)
92
100
  end
93
101
  end
94
102
 
@@ -4,6 +4,7 @@ require 'rbbt/entity'
4
4
 
5
5
  module AnnotID
6
6
  extend Entity
7
+ include SegID
7
8
  self.annotation :corpus
8
9
 
9
10
  def _parts
@@ -1,4 +1,4 @@
1
- module Segment
1
+ module SegmentRanges
2
2
  def pull(offset)
3
3
  if self.offset.nil? or offset.nil?
4
4
  self.offset = nil
@@ -61,3 +61,11 @@ module Segment
61
61
  end
62
62
  end
63
63
  end
64
+
65
+ module Segment
66
+ include SegmentRanges
67
+ end
68
+
69
+ module SegID
70
+ include SegmentRanges
71
+ end
data/lib/rbbt/segment.rb CHANGED
@@ -22,6 +22,10 @@ module SegID
22
22
  range.begin
23
23
  end
24
24
 
25
+ def eend
26
+ offset.to_i + length - 1
27
+ end
28
+
25
29
  def segment_length
26
30
  range.end - range.begin + 1
27
31
  end
@@ -13,6 +13,10 @@ class TestAnnotation < Test::Unit::TestCase
13
13
  self.split(" ")
14
14
  end
15
15
 
16
+ Document.define :lines do
17
+ self.split("\n")
18
+ end
19
+
16
20
  $called_once = false
17
21
  Document.define :persisted_words do
18
22
  raise CalledOnce if $called_once
@@ -145,5 +149,22 @@ class TestAnnotation < Test::Unit::TestCase
145
149
 
146
150
  assert text.ner.first.segid.include?("TEST:")
147
151
  end
152
+
153
+ def test_sentence_words
154
+ text =<<-EOF
155
+ This is sentence 1
156
+ This is sentence 2
157
+ EOF
158
+
159
+ Document.setup(text)
160
+
161
+ words = text.words
162
+ numbers = words.select{|w| w =~ /\d/}
163
+ text.lines.each do |sentence|
164
+ Transformed.with_transform(sentence, numbers, "[NUM]") do
165
+ puts sentence
166
+ end
167
+ end
168
+ end
148
169
  end
149
170
 
@@ -12,6 +12,7 @@ We found that TP53 is regulated by MDM2 in Homo
12
12
  sapiens
13
13
  EOF
14
14
 
15
+ Rbbt::Config.add_entry :java_mem, "2G", :gnp
15
16
  mentions = GNormPlus.process({:file => text})
16
17
 
17
18
  assert_equal 1, mentions.length
@@ -23,6 +24,7 @@ sapiens
23
24
  We found that TP53 is regulated by MDM2 in Homo sapiens
24
25
  EOF
25
26
 
27
+ Rbbt::Config.add_entry :java_mem, "2G", :gnp
26
28
  mentions = GNormPlus.entities({:file => text})
27
29
  assert mentions["file"].include?("TP53")
28
30
  mentions["file"].each do |mention|
@@ -88,6 +88,14 @@ class TestRegExpNER < Test::Unit::TestCase
88
88
  end
89
89
 
90
90
 
91
+ def test_entities_named_captures
92
+ sentence = "In a sentence I should find not this but this"
93
+
94
+ ner = RegExpNER.new({:this => /(?<who>I) should find not this but (this)/})
95
+ matches = ner.entities(sentence)
96
+ end
97
+
98
+
91
99
 
92
100
  def test_regexp_order
93
101
  text =<<-EOF
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-text
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.9
4
+ version: 1.3.10
5
5
  platform: ruby
6
6
  authors:
7
7
  - Miguel Vazquez
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-01-12 00:00:00.000000000 Z
11
+ date: 2023-02-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rbbt-util
@@ -175,45 +175,45 @@ required_rubygems_version: !ruby/object:Gem::Requirement
175
175
  - !ruby/object:Gem::Version
176
176
  version: '0'
177
177
  requirements: []
178
- rubygems_version: 3.1.4
178
+ rubygems_version: 3.1.2
179
179
  signing_key:
180
180
  specification_version: 4
181
181
  summary: Text mining tools for the Ruby Bioinformatics Toolkit (rbbt)
182
182
  test_files:
183
- - test/rbbt/nlp/test_nlp.rb
184
- - test/rbbt/nlp/open_nlp/test_sentence_splitter.rb
185
- - test/rbbt/nlp/genia/test_sentence_splitter.rb
183
+ - test/test_spaCy.rb
184
+ - test/test_helper.rb
185
+ - test/rbbt/bow/test_dictionary.rb
186
186
  - test/rbbt/bow/test_bow.rb
187
187
  - test/rbbt/bow/test_misc.rb
188
- - test/rbbt/bow/test_dictionary.rb
189
- - test/rbbt/test_document.rb
190
- - test/rbbt/document/test_annotation.rb
188
+ - test/rbbt/segment/test_encoding.rb
189
+ - test/rbbt/segment/test_transformed.rb
190
+ - test/rbbt/segment/test_overlaps.rb
191
+ - test/rbbt/segment/test_named_entity.rb
192
+ - test/rbbt/segment/test_corpus.rb
193
+ - test/rbbt/segment/test_range_index.rb
194
+ - test/rbbt/segment/test_annotation.rb
195
+ - test/rbbt/entity/test_document.rb
191
196
  - test/rbbt/document/corpus/test_pubmed.rb
192
197
  - test/rbbt/document/test_corpus.rb
193
- - test/rbbt/entity/test_document.rb
198
+ - test/rbbt/document/test_annotation.rb
199
+ - test/rbbt/test_document.rb
194
200
  - test/rbbt/ner/test_patterns.rb
195
- - test/rbbt/ner/test_NER.rb
196
- - test/rbbt/ner/test_abner.rb
197
201
  - test/rbbt/ner/rnorm/test_tokens.rb
198
- - test/rbbt/ner/test_rnorm.rb
199
- - test/rbbt/ner/test_regexpNER.rb
200
202
  - test/rbbt/ner/test_ngram_prefix_dictionary.rb
203
+ - test/rbbt/ner/test_token_trieNER.rb
204
+ - test/rbbt/ner/test_finder.rb
201
205
  - test/rbbt/ner/test_brat.rb
206
+ - test/rbbt/ner/test_regexpNER.rb
202
207
  - test/rbbt/ner/test_g_norm_plus.rb
208
+ - test/rbbt/ner/test_rnorm.rb
209
+ - test/rbbt/ner/test_linnaeus.rb
203
210
  - test/rbbt/ner/test_chemical_tagger.rb
204
- - test/rbbt/ner/test_banner.rb
205
- - test/rbbt/ner/test_token_trieNER.rb
206
- - test/rbbt/ner/test_finder.rb
211
+ - test/rbbt/ner/test_NER.rb
212
+ - test/rbbt/ner/test_abner.rb
207
213
  - test/rbbt/ner/test_rner.rb
208
- - test/rbbt/ner/test_linnaeus.rb
209
214
  - test/rbbt/ner/test_oscar4.rb
215
+ - test/rbbt/ner/test_banner.rb
210
216
  - test/rbbt/test_segment.rb
211
- - test/rbbt/segment/test_transformed.rb
212
- - test/rbbt/segment/test_overlaps.rb
213
- - test/rbbt/segment/test_annotation.rb
214
- - test/rbbt/segment/test_named_entity.rb
215
- - test/rbbt/segment/test_encoding.rb
216
- - test/rbbt/segment/test_range_index.rb
217
- - test/rbbt/segment/test_corpus.rb
218
- - test/test_spaCy.rb
219
- - test/test_helper.rb
217
+ - test/rbbt/nlp/open_nlp/test_sentence_splitter.rb
218
+ - test/rbbt/nlp/test_nlp.rb
219
+ - test/rbbt/nlp/genia/test_sentence_splitter.rb