rbbt-text 1.3.9 → 1.3.10

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 92c6b4b9d3452c6b495fc9f291b551a73c8c150faee05053b7ecadc62ccbbd53
4
- data.tar.gz: 70e341cf31466628c42b9947c882b64ff592e4703e06c8629ab56d513fe0a975
3
+ metadata.gz: 2f10312d9b6598ddc9b6fa98b38909afdd575b33a497ae1ff3f17c7a9c6e37bf
4
+ data.tar.gz: f79c61c7e34dd113a2c5002342c0c2df92a4a28c770394bf2c456a34a2730cc7
5
5
  SHA512:
6
- metadata.gz: 2e8fdff40dd93072c3c377c59ff02d7374f3f81961dfc0f2596386776408c623543eb2e1f0da0112b3a8384d865c8331c659c650f2f2288a3d6282eca80e804e
7
- data.tar.gz: d7b335d138eb48de51af8922d80d01715c4c61c025b525a9f63fcae789de329eedc2557cb10ba369f5987205ef87ffd85fc407fc3a29f6a75ef0b41951e4962b
6
+ metadata.gz: a9fb4dc49c538a58a8aa04e81947df212668c5ef9097434fa7d3eff54dd17a8657f581451b64e6b247cb64428436823a305dd64ae6a5fed2126b92285c02ad81
7
+ data.tar.gz: 0d31423660cd232102aa2b9914dab61ff929cf02a37b5094bd58481cac733c167d0e4fcdb4b3025e41a4775bd8033566ed3f402f66c317b3955406d1a3d3eb6f
@@ -6,7 +6,9 @@ module Document
6
6
  send :property, type do
7
7
  segments = self.instance_exec &block
8
8
 
9
- Segment.align(self, segments) unless segments.empty? || (Segment === segments && segments.offset) || (Segment === segments.first && segments.first.offset)
9
+ Segment.align(self, segments) unless segments.empty? ||
10
+ (Segment === segments && segments.offset) ||
11
+ (Array === segments && Segment === segments.first && segments.first.offset)
10
12
 
11
13
  segments.each do |segment|
12
14
  SegmentAnnotation.setup(segment, :type => type.to_s) unless SegmentAnnotation === segment && segment.type
@@ -17,6 +19,36 @@ module Document
17
19
 
18
20
  segments
19
21
  end
22
+
23
+ DocID.property type do
24
+ self.document.send(type)
25
+ end
26
+
27
+ SegID.property type do
28
+ self.overlaps(self.docid.send(type))
29
+ end
30
+
31
+ Segment.property type do
32
+ self.overlaps(self.docid.send(type))
33
+ end
34
+
35
+ seg_type = "segids_for_" + type.to_s
36
+
37
+ send :property, seg_type do
38
+ SegID.setup(self.send(type).collect{|s| s.segid })
39
+ end
40
+
41
+ DocID.property seg_type do
42
+ self.document.send(seg_type)
43
+ end
44
+
45
+ SegID.property seg_type do
46
+ self.overlaps(self.docid.send(seg_type))
47
+ end
48
+
49
+ Segment.property seg_type do
50
+ self.overlaps(self.docid.send(seg_type))
51
+ end
20
52
  end
21
53
 
22
54
  def self.define_multiple(type, &block)
@@ -28,7 +60,10 @@ module Document
28
60
  doc_segments.each_with_index do |segments,i|
29
61
  next if segments.nil?
30
62
  document = list[i]
31
- Segment.align(document, segments) unless segments.nil? || segments.empty? || (Segment === segments && segments.offset) || (Segment === segments.first && segments.first.offset)
63
+ Segment.align(document, segments) unless segments.nil? ||
64
+ segments.empty? ||
65
+ (Segment === segments && segments.offset) ||
66
+ (Array === segments && Segment === segments.first && segments.first.offset)
32
67
 
33
68
  segments.each do |segment|
34
69
  SegmentAnnotation.setup(segment, :type => type.to_s) unless SegmentAnnotation === segment && segment.type
@@ -41,5 +76,35 @@ module Document
41
76
  segments
42
77
  end
43
78
  end
79
+
80
+ DocID.property type do
81
+ self.document.send(type)
82
+ end
83
+
84
+ SegID.property type do
85
+ self.overlaps(self.docid.send(type))
86
+ end
87
+
88
+ Segment.property type do
89
+ self.overlaps(self.docid.send(type))
90
+ end
91
+
92
+ seg_type = "segids_for_" + type.to_s
93
+
94
+ send :property, seg_type do
95
+ SegID.setup(self.send(type).collect{|s| s.segid })
96
+ end
97
+
98
+ DocID.property seg_type do
99
+ self.document.send(seg_type)
100
+ end
101
+
102
+ SegID.property seg_type do
103
+ self.overlaps(self.docid.send(seg_type))
104
+ end
105
+
106
+ Segment.property seg_type do
107
+ self.overlaps(self.docid.send(seg_type))
108
+ end
44
109
  end
45
110
  end
@@ -2,8 +2,8 @@ require 'rbbt/sources/pubmed'
2
2
 
3
3
  module Document::Corpus
4
4
  PUBMED_NAMESPACE="PMID"
5
- def add_pmid(pmid, type = nil, update = false)
6
- type = :abstract if type.nil?
5
+ def add_pmid(pmid, type = :title_and_abstract, update = false)
6
+ type = :title_and_abstract if type.nil?
7
7
 
8
8
  if ! (update || Array === pmid)
9
9
  id = [PUBMED_NAMESPACE, pmid, type].collect{|e| e.to_s}*":"
@@ -16,9 +16,11 @@ module Document::Corpus
16
16
 
17
17
  res = PubMed.get_article(pmids).collect do |pmid, article|
18
18
  document = if type.to_sym == :abstract
19
- Document.setup(article.abstract || "", PUBMED_NAMESPACE, pmid, :abstract, self, :corpus => self)
19
+ Document.setup(article.abstract || "", PUBMED_NAMESPACE, pmid, type.to_sym , self, :corpus => self)
20
20
  elsif type.to_sym == :title
21
- Document.setup(article.title, PUBMED_NAMESPACE, pmid, :title, self)
21
+ Document.setup(article.title || "", PUBMED_NAMESPACE, pmid, type.to_sym, self)
22
+ elsif type.to_sym == :title_and_abstract
23
+ Document.setup((article.title || "") + "\n\n" + (article.abstract || ""), PUBMED_NAMESPACE, pmid, type.to_sym, self)
22
24
  else
23
25
  raise "No FullText available for #{ pmid }" if article.full_text.nil?
24
26
  Document.setup(article.full_text, PUBMED_NAMESPACE, pmid, :fulltext, self, :corpus => self)
data/lib/rbbt/document.rb CHANGED
@@ -9,6 +9,10 @@ module DocID
9
9
  attr_accessor :default_corpus
10
10
  end
11
11
 
12
+ def id
13
+ self
14
+ end
15
+
12
16
  def corpus
13
17
  annotation_values[:corpus] || DocID.default_corpus
14
18
  end
@@ -66,7 +66,8 @@ EOF
66
66
  end
67
67
 
68
68
  Open.write('config', CONFIG)
69
- CMD.cmd_log("java -Xmx20G -Xms20G -jar '#{Rbbt.software.opt.GNormPlus.produce.find}/GNormPlus.jar' 'input' 'output' 'config'")
69
+ mem = Rbbt::Config.get(:java_mem, :GNormPlus, :g_norm_plus, :gnormplus, :gnp, :default => "2G")
70
+ CMD.cmd_log("java -Xmx#{mem} -Xms#{mem} -jar '#{Rbbt.software.opt.GNormPlus.produce.find}/GNormPlus.jar' 'input' 'output' 'config'")
70
71
 
71
72
  if texts.respond_to? :key_field
72
73
  key_field = texts.key_field
@@ -11,7 +11,15 @@ class RegExpNER < NER
11
11
  pre = matchdata.pre_match
12
12
  post = matchdata.post_match
13
13
 
14
- if matchdata.captures.any?
14
+ if matchdata.named_captures.any?
15
+ match = matchdata[0]
16
+ code = matchdata.named_captures.collect{|k,v| [k,v] * "=" } * ";"
17
+ NamedEntity.setup(match, :offset => pre.length + start, :entity_type => type, :code => code)
18
+ matches << match
19
+ eend = match.length + pre.length
20
+ text = text[eend..-1]
21
+ start += match.length + pre.length
22
+ elsif matchdata.captures.any?
15
23
  match = matchdata.captures.first
16
24
  offset, eend = matchdata.offset(1)
17
25
  NamedEntity.setup(match, :offset => start + offset, :entity_type => type)
@@ -88,7 +96,7 @@ class RegExpNER < NER
88
96
  def match(text)
89
97
  matches = RegExpNER.match_regexp_hash(text, @regexps, @split_on_matches)
90
98
  matches.collect do |m|
91
- NamedEntity.setup(m, :offset => m.offset, :type => m.type, :code => m)
99
+ NamedEntity.setup(m, :offset => m.offset, :type => m.type, :code => m.code || m)
92
100
  end
93
101
  end
94
102
 
@@ -4,6 +4,7 @@ require 'rbbt/entity'
4
4
 
5
5
  module AnnotID
6
6
  extend Entity
7
+ include SegID
7
8
  self.annotation :corpus
8
9
 
9
10
  def _parts
@@ -1,4 +1,4 @@
1
- module Segment
1
+ module SegmentRanges
2
2
  def pull(offset)
3
3
  if self.offset.nil? or offset.nil?
4
4
  self.offset = nil
@@ -61,3 +61,11 @@ module Segment
61
61
  end
62
62
  end
63
63
  end
64
+
65
+ module Segment
66
+ include SegmentRanges
67
+ end
68
+
69
+ module SegID
70
+ include SegmentRanges
71
+ end
data/lib/rbbt/segment.rb CHANGED
@@ -22,6 +22,10 @@ module SegID
22
22
  range.begin
23
23
  end
24
24
 
25
+ def eend
26
+ offset.to_i + length - 1
27
+ end
28
+
25
29
  def segment_length
26
30
  range.end - range.begin + 1
27
31
  end
@@ -13,6 +13,10 @@ class TestAnnotation < Test::Unit::TestCase
13
13
  self.split(" ")
14
14
  end
15
15
 
16
+ Document.define :lines do
17
+ self.split("\n")
18
+ end
19
+
16
20
  $called_once = false
17
21
  Document.define :persisted_words do
18
22
  raise CalledOnce if $called_once
@@ -145,5 +149,22 @@ class TestAnnotation < Test::Unit::TestCase
145
149
 
146
150
  assert text.ner.first.segid.include?("TEST:")
147
151
  end
152
+
153
+ def test_sentence_words
154
+ text =<<-EOF
155
+ This is sentence 1
156
+ This is sentence 2
157
+ EOF
158
+
159
+ Document.setup(text)
160
+
161
+ words = text.words
162
+ numbers = words.select{|w| w =~ /\d/}
163
+ text.lines.each do |sentence|
164
+ Transformed.with_transform(sentence, numbers, "[NUM]") do
165
+ puts sentence
166
+ end
167
+ end
168
+ end
148
169
  end
149
170
 
@@ -12,6 +12,7 @@ We found that TP53 is regulated by MDM2 in Homo
12
12
  sapiens
13
13
  EOF
14
14
 
15
+ Rbbt::Config.add_entry :java_mem, "2G", :gnp
15
16
  mentions = GNormPlus.process({:file => text})
16
17
 
17
18
  assert_equal 1, mentions.length
@@ -23,6 +24,7 @@ sapiens
23
24
  We found that TP53 is regulated by MDM2 in Homo sapiens
24
25
  EOF
25
26
 
27
+ Rbbt::Config.add_entry :java_mem, "2G", :gnp
26
28
  mentions = GNormPlus.entities({:file => text})
27
29
  assert mentions["file"].include?("TP53")
28
30
  mentions["file"].each do |mention|
@@ -88,6 +88,14 @@ class TestRegExpNER < Test::Unit::TestCase
88
88
  end
89
89
 
90
90
 
91
+ def test_entities_named_captures
92
+ sentence = "In a sentence I should find not this but this"
93
+
94
+ ner = RegExpNER.new({:this => /(?<who>I) should find not this but (this)/})
95
+ matches = ner.entities(sentence)
96
+ end
97
+
98
+
91
99
 
92
100
  def test_regexp_order
93
101
  text =<<-EOF
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-text
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.9
4
+ version: 1.3.10
5
5
  platform: ruby
6
6
  authors:
7
7
  - Miguel Vazquez
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-01-12 00:00:00.000000000 Z
11
+ date: 2023-02-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rbbt-util
@@ -175,45 +175,45 @@ required_rubygems_version: !ruby/object:Gem::Requirement
175
175
  - !ruby/object:Gem::Version
176
176
  version: '0'
177
177
  requirements: []
178
- rubygems_version: 3.1.4
178
+ rubygems_version: 3.1.2
179
179
  signing_key:
180
180
  specification_version: 4
181
181
  summary: Text mining tools for the Ruby Bioinformatics Toolkit (rbbt)
182
182
  test_files:
183
- - test/rbbt/nlp/test_nlp.rb
184
- - test/rbbt/nlp/open_nlp/test_sentence_splitter.rb
185
- - test/rbbt/nlp/genia/test_sentence_splitter.rb
183
+ - test/test_spaCy.rb
184
+ - test/test_helper.rb
185
+ - test/rbbt/bow/test_dictionary.rb
186
186
  - test/rbbt/bow/test_bow.rb
187
187
  - test/rbbt/bow/test_misc.rb
188
- - test/rbbt/bow/test_dictionary.rb
189
- - test/rbbt/test_document.rb
190
- - test/rbbt/document/test_annotation.rb
188
+ - test/rbbt/segment/test_encoding.rb
189
+ - test/rbbt/segment/test_transformed.rb
190
+ - test/rbbt/segment/test_overlaps.rb
191
+ - test/rbbt/segment/test_named_entity.rb
192
+ - test/rbbt/segment/test_corpus.rb
193
+ - test/rbbt/segment/test_range_index.rb
194
+ - test/rbbt/segment/test_annotation.rb
195
+ - test/rbbt/entity/test_document.rb
191
196
  - test/rbbt/document/corpus/test_pubmed.rb
192
197
  - test/rbbt/document/test_corpus.rb
193
- - test/rbbt/entity/test_document.rb
198
+ - test/rbbt/document/test_annotation.rb
199
+ - test/rbbt/test_document.rb
194
200
  - test/rbbt/ner/test_patterns.rb
195
- - test/rbbt/ner/test_NER.rb
196
- - test/rbbt/ner/test_abner.rb
197
201
  - test/rbbt/ner/rnorm/test_tokens.rb
198
- - test/rbbt/ner/test_rnorm.rb
199
- - test/rbbt/ner/test_regexpNER.rb
200
202
  - test/rbbt/ner/test_ngram_prefix_dictionary.rb
203
+ - test/rbbt/ner/test_token_trieNER.rb
204
+ - test/rbbt/ner/test_finder.rb
201
205
  - test/rbbt/ner/test_brat.rb
206
+ - test/rbbt/ner/test_regexpNER.rb
202
207
  - test/rbbt/ner/test_g_norm_plus.rb
208
+ - test/rbbt/ner/test_rnorm.rb
209
+ - test/rbbt/ner/test_linnaeus.rb
203
210
  - test/rbbt/ner/test_chemical_tagger.rb
204
- - test/rbbt/ner/test_banner.rb
205
- - test/rbbt/ner/test_token_trieNER.rb
206
- - test/rbbt/ner/test_finder.rb
211
+ - test/rbbt/ner/test_NER.rb
212
+ - test/rbbt/ner/test_abner.rb
207
213
  - test/rbbt/ner/test_rner.rb
208
- - test/rbbt/ner/test_linnaeus.rb
209
214
  - test/rbbt/ner/test_oscar4.rb
215
+ - test/rbbt/ner/test_banner.rb
210
216
  - test/rbbt/test_segment.rb
211
- - test/rbbt/segment/test_transformed.rb
212
- - test/rbbt/segment/test_overlaps.rb
213
- - test/rbbt/segment/test_annotation.rb
214
- - test/rbbt/segment/test_named_entity.rb
215
- - test/rbbt/segment/test_encoding.rb
216
- - test/rbbt/segment/test_range_index.rb
217
- - test/rbbt/segment/test_corpus.rb
218
- - test/test_spaCy.rb
219
- - test/test_helper.rb
217
+ - test/rbbt/nlp/open_nlp/test_sentence_splitter.rb
218
+ - test/rbbt/nlp/test_nlp.rb
219
+ - test/rbbt/nlp/genia/test_sentence_splitter.rb