rbbt-text 1.3.9 → 1.3.11

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 92c6b4b9d3452c6b495fc9f291b551a73c8c150faee05053b7ecadc62ccbbd53
4
- data.tar.gz: 70e341cf31466628c42b9947c882b64ff592e4703e06c8629ab56d513fe0a975
3
+ metadata.gz: 587b7971fd0f8fec2602323ea480521445ca417c3a407d057cf95df1f3a36216
4
+ data.tar.gz: 4f66306f80e838a0a27299705d79e5856b38dd936005d2b18004539bbb192431
5
5
  SHA512:
6
- metadata.gz: 2e8fdff40dd93072c3c377c59ff02d7374f3f81961dfc0f2596386776408c623543eb2e1f0da0112b3a8384d865c8331c659c650f2f2288a3d6282eca80e804e
7
- data.tar.gz: d7b335d138eb48de51af8922d80d01715c4c61c025b525a9f63fcae789de329eedc2557cb10ba369f5987205ef87ffd85fc407fc3a29f6a75ef0b41951e4962b
6
+ metadata.gz: 4aa191aa4e5cb5e3f7d4a49b30beb1eb7259a34074a7521a10b9951cd1cc7a097a06ba6a97d9f4f4e100b2058de3f94f5199cb069a030f93a3f69bf1ecec09ff
7
+ data.tar.gz: f3d5eb11d12f8a9d951d1073abd7e6cb5ace99bd075e7dc897f0aa715ae1552271019b4eb6849a172529d830f30bbc09ce40dd351fd81f9c06f338b075523e36
data/lib/rbbt/bow/misc.rb CHANGED
@@ -37,7 +37,7 @@ $greek.each{|l,s| $inverse_greek[s] = l }
37
37
 
38
38
  class String
39
39
  CONSONANTS = []
40
- if File.exists? File.join(Rbbt.datadir, 'wordlists/consonants')
40
+ if File.exist? File.join(Rbbt.datadir, 'wordlists/consonants')
41
41
  Object::Open.read(File.join(Rbbt.datadir, 'wordlists/consonants')).each_line{|l| CONSONANTS << l.chomp}
42
42
  end
43
43
 
@@ -6,7 +6,9 @@ module Document
6
6
  send :property, type do
7
7
  segments = self.instance_exec &block
8
8
 
9
- Segment.align(self, segments) unless segments.empty? || (Segment === segments && segments.offset) || (Segment === segments.first && segments.first.offset)
9
+ Segment.align(self, segments) unless segments.empty? ||
10
+ (Segment === segments && segments.offset) ||
11
+ (Array === segments && Segment === segments.first && segments.first.offset)
10
12
 
11
13
  segments.each do |segment|
12
14
  SegmentAnnotation.setup(segment, :type => type.to_s) unless SegmentAnnotation === segment && segment.type
@@ -17,6 +19,36 @@ module Document
17
19
 
18
20
  segments
19
21
  end
22
+
23
+ DocID.property type do
24
+ self.document.send(type)
25
+ end
26
+
27
+ SegID.property type do
28
+ self.overlaps(self.docid.send(type))
29
+ end
30
+
31
+ Segment.property type do
32
+ self.overlaps(self.docid.send(type))
33
+ end
34
+
35
+ seg_type = "segids_for_" + type.to_s
36
+
37
+ send :property, seg_type do
38
+ SegID.setup(self.send(type).collect{|s| s.segid })
39
+ end
40
+
41
+ DocID.property seg_type do
42
+ self.document.send(seg_type)
43
+ end
44
+
45
+ SegID.property seg_type do
46
+ self.overlaps(self.docid.send(seg_type))
47
+ end
48
+
49
+ Segment.property seg_type do
50
+ self.overlaps(self.docid.send(seg_type))
51
+ end
20
52
  end
21
53
 
22
54
  def self.define_multiple(type, &block)
@@ -28,7 +60,10 @@ module Document
28
60
  doc_segments.each_with_index do |segments,i|
29
61
  next if segments.nil?
30
62
  document = list[i]
31
- Segment.align(document, segments) unless segments.nil? || segments.empty? || (Segment === segments && segments.offset) || (Segment === segments.first && segments.first.offset)
63
+ Segment.align(document, segments) unless segments.nil? ||
64
+ segments.empty? ||
65
+ (Segment === segments && segments.offset) ||
66
+ (Array === segments && Segment === segments.first && segments.first.offset)
32
67
 
33
68
  segments.each do |segment|
34
69
  SegmentAnnotation.setup(segment, :type => type.to_s) unless SegmentAnnotation === segment && segment.type
@@ -41,5 +76,35 @@ module Document
41
76
  segments
42
77
  end
43
78
  end
79
+
80
+ DocID.property type do
81
+ self.document.send(type)
82
+ end
83
+
84
+ SegID.property type do
85
+ self.overlaps(self.docid.send(type))
86
+ end
87
+
88
+ Segment.property type do
89
+ self.overlaps(self.docid.send(type))
90
+ end
91
+
92
+ seg_type = "segids_for_" + type.to_s
93
+
94
+ send :property, seg_type do
95
+ SegID.setup(self.send(type).collect{|s| s.segid })
96
+ end
97
+
98
+ DocID.property seg_type do
99
+ self.document.send(seg_type)
100
+ end
101
+
102
+ SegID.property seg_type do
103
+ self.overlaps(self.docid.send(seg_type))
104
+ end
105
+
106
+ Segment.property seg_type do
107
+ self.overlaps(self.docid.send(seg_type))
108
+ end
44
109
  end
45
110
  end
@@ -2,8 +2,8 @@ require 'rbbt/sources/pubmed'
2
2
 
3
3
  module Document::Corpus
4
4
  PUBMED_NAMESPACE="PMID"
5
- def add_pmid(pmid, type = nil, update = false)
6
- type = :abstract if type.nil?
5
+ def add_pmid(pmid, type = :title_and_abstract, update = false)
6
+ type = :title_and_abstract if type.nil?
7
7
 
8
8
  if ! (update || Array === pmid)
9
9
  id = [PUBMED_NAMESPACE, pmid, type].collect{|e| e.to_s}*":"
@@ -16,9 +16,11 @@ module Document::Corpus
16
16
 
17
17
  res = PubMed.get_article(pmids).collect do |pmid, article|
18
18
  document = if type.to_sym == :abstract
19
- Document.setup(article.abstract || "", PUBMED_NAMESPACE, pmid, :abstract, self, :corpus => self)
19
+ Document.setup(article.abstract || "", PUBMED_NAMESPACE, pmid, type.to_sym , self, :corpus => self)
20
20
  elsif type.to_sym == :title
21
- Document.setup(article.title, PUBMED_NAMESPACE, pmid, :title, self)
21
+ Document.setup(article.title || "", PUBMED_NAMESPACE, pmid, type.to_sym, self)
22
+ elsif type.to_sym == :title_and_abstract
23
+ Document.setup((article.title || "") + " " + (article.abstract || ""), PUBMED_NAMESPACE, pmid, type.to_sym, self)
22
24
  else
23
25
  raise "No FullText available for #{ pmid }" if article.full_text.nil?
24
26
  Document.setup(article.full_text, PUBMED_NAMESPACE, pmid, :fulltext, self, :corpus => self)
data/lib/rbbt/document.rb CHANGED
@@ -9,6 +9,10 @@ module DocID
9
9
  attr_accessor :default_corpus
10
10
  end
11
11
 
12
+ def id
13
+ self
14
+ end
15
+
12
16
  def corpus
13
17
  annotation_values[:corpus] || DocID.default_corpus
14
18
  end
@@ -66,7 +66,8 @@ EOF
66
66
  end
67
67
 
68
68
  Open.write('config', CONFIG)
69
- CMD.cmd_log("java -Xmx20G -Xms20G -jar '#{Rbbt.software.opt.GNormPlus.produce.find}/GNormPlus.jar' 'input' 'output' 'config'")
69
+ mem = Rbbt::Config.get(:java_mem, :GNormPlus, :g_norm_plus, :gnormplus, :gnp, :default => "2G")
70
+ CMD.cmd_log("java -Xmx#{mem} -Xms#{mem} -jar '#{Rbbt.software.opt.GNormPlus.produce.find}/GNormPlus.jar' 'input' 'output' 'config'")
70
71
 
71
72
  if texts.respond_to? :key_field
72
73
  key_field = texts.key_field
@@ -11,7 +11,15 @@ class RegExpNER < NER
11
11
  pre = matchdata.pre_match
12
12
  post = matchdata.post_match
13
13
 
14
- if matchdata.captures.any?
14
+ if matchdata.named_captures.any?
15
+ match = matchdata[0]
16
+ code = matchdata.named_captures.collect{|k,v| [k,v] * "=" } * ";"
17
+ NamedEntity.setup(match, :offset => pre.length + start, :entity_type => type, :code => code)
18
+ matches << match
19
+ eend = match.length + pre.length
20
+ text = text[eend..-1]
21
+ start += match.length + pre.length
22
+ elsif matchdata.captures.any?
15
23
  match = matchdata.captures.first
16
24
  offset, eend = matchdata.offset(1)
17
25
  NamedEntity.setup(match, :offset => start + offset, :entity_type => type)
@@ -88,7 +96,7 @@ class RegExpNER < NER
88
96
  def match(text)
89
97
  matches = RegExpNER.match_regexp_hash(text, @regexps, @split_on_matches)
90
98
  matches.collect do |m|
91
- NamedEntity.setup(m, :offset => m.offset, :type => m.type, :code => m)
99
+ NamedEntity.setup(m, :offset => m.offset, :type => m.type, :code => m.code || m)
92
100
  end
93
101
  end
94
102
 
@@ -4,6 +4,7 @@ require 'rbbt/entity'
4
4
 
5
5
  module AnnotID
6
6
  extend Entity
7
+ include SegID
7
8
  self.annotation :corpus
8
9
 
9
10
  def _parts
@@ -1,4 +1,4 @@
1
- module Segment
1
+ module SegmentRanges
2
2
  def pull(offset)
3
3
  if self.offset.nil? or offset.nil?
4
4
  self.offset = nil
@@ -61,3 +61,11 @@ module Segment
61
61
  end
62
62
  end
63
63
  end
64
+
65
+ module Segment
66
+ include SegmentRanges
67
+ end
68
+
69
+ module SegID
70
+ include SegmentRanges
71
+ end
data/lib/rbbt/segment.rb CHANGED
@@ -22,6 +22,10 @@ module SegID
22
22
  range.begin
23
23
  end
24
24
 
25
+ def eend
26
+ offset.to_i + length - 1
27
+ end
28
+
25
29
  def segment_length
26
30
  range.end - range.begin + 1
27
31
  end
@@ -13,6 +13,10 @@ class TestAnnotation < Test::Unit::TestCase
13
13
  self.split(" ")
14
14
  end
15
15
 
16
+ Document.define :lines do
17
+ self.split("\n")
18
+ end
19
+
16
20
  $called_once = false
17
21
  Document.define :persisted_words do
18
22
  raise CalledOnce if $called_once
@@ -145,5 +149,22 @@ class TestAnnotation < Test::Unit::TestCase
145
149
 
146
150
  assert text.ner.first.segid.include?("TEST:")
147
151
  end
152
+
153
+ def test_sentence_words
154
+ text =<<-EOF
155
+ This is sentence 1
156
+ This is sentence 2
157
+ EOF
158
+
159
+ Document.setup(text)
160
+
161
+ words = text.words
162
+ numbers = words.select{|w| w =~ /\d/}
163
+ text.lines.each do |sentence|
164
+ Transformed.with_transform(sentence, numbers, "[NUM]") do
165
+ puts sentence
166
+ end
167
+ end
168
+ end
148
169
  end
149
170
 
@@ -12,6 +12,7 @@ We found that TP53 is regulated by MDM2 in Homo
12
12
  sapiens
13
13
  EOF
14
14
 
15
+ Rbbt::Config.add_entry :java_mem, "2G", :gnp
15
16
  mentions = GNormPlus.process({:file => text})
16
17
 
17
18
  assert_equal 1, mentions.length
@@ -23,6 +24,7 @@ sapiens
23
24
  We found that TP53 is regulated by MDM2 in Homo sapiens
24
25
  EOF
25
26
 
27
+ Rbbt::Config.add_entry :java_mem, "2G", :gnp
26
28
  mentions = GNormPlus.entities({:file => text})
27
29
  assert mentions["file"].include?("TP53")
28
30
  mentions["file"].each do |mention|
@@ -88,6 +88,14 @@ class TestRegExpNER < Test::Unit::TestCase
88
88
  end
89
89
 
90
90
 
91
+ def test_entities_named_captures
92
+ sentence = "In a sentence I should find not this but this"
93
+
94
+ ner = RegExpNER.new({:this => /(?<who>I) should find not this but (this)/})
95
+ matches = ner.entities(sentence)
96
+ end
97
+
98
+
91
99
 
92
100
  def test_regexp_order
93
101
  text =<<-EOF
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-text
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.9
4
+ version: 1.3.11
5
5
  platform: ruby
6
6
  authors:
7
7
  - Miguel Vazquez
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-01-12 00:00:00.000000000 Z
11
+ date: 2023-04-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rbbt-util
@@ -175,7 +175,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
175
175
  - !ruby/object:Gem::Version
176
176
  version: '0'
177
177
  requirements: []
178
- rubygems_version: 3.1.4
178
+ rubygems_version: 3.4.8
179
179
  signing_key:
180
180
  specification_version: 4
181
181
  summary: Text mining tools for the Ruby Bioinformatics Toolkit (rbbt)