rbbt-text 1.3.9 → 1.3.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/rbbt/document/annotation.rb +67 -2
- data/lib/rbbt/document/corpus/pubmed.rb +6 -4
- data/lib/rbbt/document.rb +4 -0
- data/lib/rbbt/ner/g_norm_plus.rb +2 -1
- data/lib/rbbt/ner/regexpNER.rb +10 -2
- data/lib/rbbt/segment/annotation.rb +1 -0
- data/lib/rbbt/segment/overlaps.rb +9 -1
- data/lib/rbbt/segment.rb +4 -0
- data/test/rbbt/document/test_annotation.rb +21 -0
- data/test/rbbt/ner/test_g_norm_plus.rb +2 -0
- data/test/rbbt/ner/test_regexpNER.rb +8 -0
- metadata +27 -27
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2f10312d9b6598ddc9b6fa98b38909afdd575b33a497ae1ff3f17c7a9c6e37bf
|
4
|
+
data.tar.gz: f79c61c7e34dd113a2c5002342c0c2df92a4a28c770394bf2c456a34a2730cc7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a9fb4dc49c538a58a8aa04e81947df212668c5ef9097434fa7d3eff54dd17a8657f581451b64e6b247cb64428436823a305dd64ae6a5fed2126b92285c02ad81
|
7
|
+
data.tar.gz: 0d31423660cd232102aa2b9914dab61ff929cf02a37b5094bd58481cac733c167d0e4fcdb4b3025e41a4775bd8033566ed3f402f66c317b3955406d1a3d3eb6f
|
@@ -6,7 +6,9 @@ module Document
|
|
6
6
|
send :property, type do
|
7
7
|
segments = self.instance_exec &block
|
8
8
|
|
9
|
-
Segment.align(self, segments) unless segments.empty? ||
|
9
|
+
Segment.align(self, segments) unless segments.empty? ||
|
10
|
+
(Segment === segments && segments.offset) ||
|
11
|
+
(Array === segments && Segment === segments.first && segments.first.offset)
|
10
12
|
|
11
13
|
segments.each do |segment|
|
12
14
|
SegmentAnnotation.setup(segment, :type => type.to_s) unless SegmentAnnotation === segment && segment.type
|
@@ -17,6 +19,36 @@ module Document
|
|
17
19
|
|
18
20
|
segments
|
19
21
|
end
|
22
|
+
|
23
|
+
DocID.property type do
|
24
|
+
self.document.send(type)
|
25
|
+
end
|
26
|
+
|
27
|
+
SegID.property type do
|
28
|
+
self.overlaps(self.docid.send(type))
|
29
|
+
end
|
30
|
+
|
31
|
+
Segment.property type do
|
32
|
+
self.overlaps(self.docid.send(type))
|
33
|
+
end
|
34
|
+
|
35
|
+
seg_type = "segids_for_" + type.to_s
|
36
|
+
|
37
|
+
send :property, seg_type do
|
38
|
+
SegID.setup(self.send(type).collect{|s| s.segid })
|
39
|
+
end
|
40
|
+
|
41
|
+
DocID.property seg_type do
|
42
|
+
self.document.send(seg_type)
|
43
|
+
end
|
44
|
+
|
45
|
+
SegID.property seg_type do
|
46
|
+
self.overlaps(self.docid.send(seg_type))
|
47
|
+
end
|
48
|
+
|
49
|
+
Segment.property seg_type do
|
50
|
+
self.overlaps(self.docid.send(seg_type))
|
51
|
+
end
|
20
52
|
end
|
21
53
|
|
22
54
|
def self.define_multiple(type, &block)
|
@@ -28,7 +60,10 @@ module Document
|
|
28
60
|
doc_segments.each_with_index do |segments,i|
|
29
61
|
next if segments.nil?
|
30
62
|
document = list[i]
|
31
|
-
Segment.align(document, segments) unless segments.nil? ||
|
63
|
+
Segment.align(document, segments) unless segments.nil? ||
|
64
|
+
segments.empty? ||
|
65
|
+
(Segment === segments && segments.offset) ||
|
66
|
+
(Array === segments && Segment === segments.first && segments.first.offset)
|
32
67
|
|
33
68
|
segments.each do |segment|
|
34
69
|
SegmentAnnotation.setup(segment, :type => type.to_s) unless SegmentAnnotation === segment && segment.type
|
@@ -41,5 +76,35 @@ module Document
|
|
41
76
|
segments
|
42
77
|
end
|
43
78
|
end
|
79
|
+
|
80
|
+
DocID.property type do
|
81
|
+
self.document.send(type)
|
82
|
+
end
|
83
|
+
|
84
|
+
SegID.property type do
|
85
|
+
self.overlaps(self.docid.send(type))
|
86
|
+
end
|
87
|
+
|
88
|
+
Segment.property type do
|
89
|
+
self.overlaps(self.docid.send(type))
|
90
|
+
end
|
91
|
+
|
92
|
+
seg_type = "segids_for_" + type.to_s
|
93
|
+
|
94
|
+
send :property, seg_type do
|
95
|
+
SegID.setup(self.send(type).collect{|s| s.segid })
|
96
|
+
end
|
97
|
+
|
98
|
+
DocID.property seg_type do
|
99
|
+
self.document.send(seg_type)
|
100
|
+
end
|
101
|
+
|
102
|
+
SegID.property seg_type do
|
103
|
+
self.overlaps(self.docid.send(seg_type))
|
104
|
+
end
|
105
|
+
|
106
|
+
Segment.property seg_type do
|
107
|
+
self.overlaps(self.docid.send(seg_type))
|
108
|
+
end
|
44
109
|
end
|
45
110
|
end
|
@@ -2,8 +2,8 @@ require 'rbbt/sources/pubmed'
|
|
2
2
|
|
3
3
|
module Document::Corpus
|
4
4
|
PUBMED_NAMESPACE="PMID"
|
5
|
-
def add_pmid(pmid, type =
|
6
|
-
type = :
|
5
|
+
def add_pmid(pmid, type = :title_and_abstract, update = false)
|
6
|
+
type = :title_and_abstract if type.nil?
|
7
7
|
|
8
8
|
if ! (update || Array === pmid)
|
9
9
|
id = [PUBMED_NAMESPACE, pmid, type].collect{|e| e.to_s}*":"
|
@@ -16,9 +16,11 @@ module Document::Corpus
|
|
16
16
|
|
17
17
|
res = PubMed.get_article(pmids).collect do |pmid, article|
|
18
18
|
document = if type.to_sym == :abstract
|
19
|
-
Document.setup(article.abstract || "", PUBMED_NAMESPACE, pmid,
|
19
|
+
Document.setup(article.abstract || "", PUBMED_NAMESPACE, pmid, type.to_sym , self, :corpus => self)
|
20
20
|
elsif type.to_sym == :title
|
21
|
-
Document.setup(article.title, PUBMED_NAMESPACE, pmid,
|
21
|
+
Document.setup(article.title || "", PUBMED_NAMESPACE, pmid, type.to_sym, self)
|
22
|
+
elsif type.to_sym == :title_and_abstract
|
23
|
+
Document.setup((article.title || "") + "\n\n" + (article.abstract || ""), PUBMED_NAMESPACE, pmid, type.to_sym, self)
|
22
24
|
else
|
23
25
|
raise "No FullText available for #{ pmid }" if article.full_text.nil?
|
24
26
|
Document.setup(article.full_text, PUBMED_NAMESPACE, pmid, :fulltext, self, :corpus => self)
|
data/lib/rbbt/document.rb
CHANGED
data/lib/rbbt/ner/g_norm_plus.rb
CHANGED
@@ -66,7 +66,8 @@ EOF
|
|
66
66
|
end
|
67
67
|
|
68
68
|
Open.write('config', CONFIG)
|
69
|
-
|
69
|
+
mem = Rbbt::Config.get(:java_mem, :GNormPlus, :g_norm_plus, :gnormplus, :gnp, :default => "2G")
|
70
|
+
CMD.cmd_log("java -Xmx#{mem} -Xms#{mem} -jar '#{Rbbt.software.opt.GNormPlus.produce.find}/GNormPlus.jar' 'input' 'output' 'config'")
|
70
71
|
|
71
72
|
if texts.respond_to? :key_field
|
72
73
|
key_field = texts.key_field
|
data/lib/rbbt/ner/regexpNER.rb
CHANGED
@@ -11,7 +11,15 @@ class RegExpNER < NER
|
|
11
11
|
pre = matchdata.pre_match
|
12
12
|
post = matchdata.post_match
|
13
13
|
|
14
|
-
if matchdata.
|
14
|
+
if matchdata.named_captures.any?
|
15
|
+
match = matchdata[0]
|
16
|
+
code = matchdata.named_captures.collect{|k,v| [k,v] * "=" } * ";"
|
17
|
+
NamedEntity.setup(match, :offset => pre.length + start, :entity_type => type, :code => code)
|
18
|
+
matches << match
|
19
|
+
eend = match.length + pre.length
|
20
|
+
text = text[eend..-1]
|
21
|
+
start += match.length + pre.length
|
22
|
+
elsif matchdata.captures.any?
|
15
23
|
match = matchdata.captures.first
|
16
24
|
offset, eend = matchdata.offset(1)
|
17
25
|
NamedEntity.setup(match, :offset => start + offset, :entity_type => type)
|
@@ -88,7 +96,7 @@ class RegExpNER < NER
|
|
88
96
|
def match(text)
|
89
97
|
matches = RegExpNER.match_regexp_hash(text, @regexps, @split_on_matches)
|
90
98
|
matches.collect do |m|
|
91
|
-
NamedEntity.setup(m, :offset => m.offset, :type => m.type, :code => m)
|
99
|
+
NamedEntity.setup(m, :offset => m.offset, :type => m.type, :code => m.code || m)
|
92
100
|
end
|
93
101
|
end
|
94
102
|
|
@@ -1,4 +1,4 @@
|
|
1
|
-
module
|
1
|
+
module SegmentRanges
|
2
2
|
def pull(offset)
|
3
3
|
if self.offset.nil? or offset.nil?
|
4
4
|
self.offset = nil
|
@@ -61,3 +61,11 @@ module Segment
|
|
61
61
|
end
|
62
62
|
end
|
63
63
|
end
|
64
|
+
|
65
|
+
module Segment
|
66
|
+
include SegmentRanges
|
67
|
+
end
|
68
|
+
|
69
|
+
module SegID
|
70
|
+
include SegmentRanges
|
71
|
+
end
|
data/lib/rbbt/segment.rb
CHANGED
@@ -13,6 +13,10 @@ class TestAnnotation < Test::Unit::TestCase
|
|
13
13
|
self.split(" ")
|
14
14
|
end
|
15
15
|
|
16
|
+
Document.define :lines do
|
17
|
+
self.split("\n")
|
18
|
+
end
|
19
|
+
|
16
20
|
$called_once = false
|
17
21
|
Document.define :persisted_words do
|
18
22
|
raise CalledOnce if $called_once
|
@@ -145,5 +149,22 @@ class TestAnnotation < Test::Unit::TestCase
|
|
145
149
|
|
146
150
|
assert text.ner.first.segid.include?("TEST:")
|
147
151
|
end
|
152
|
+
|
153
|
+
def test_sentence_words
|
154
|
+
text =<<-EOF
|
155
|
+
This is sentence 1
|
156
|
+
This is sentence 2
|
157
|
+
EOF
|
158
|
+
|
159
|
+
Document.setup(text)
|
160
|
+
|
161
|
+
words = text.words
|
162
|
+
numbers = words.select{|w| w =~ /\d/}
|
163
|
+
text.lines.each do |sentence|
|
164
|
+
Transformed.with_transform(sentence, numbers, "[NUM]") do
|
165
|
+
puts sentence
|
166
|
+
end
|
167
|
+
end
|
168
|
+
end
|
148
169
|
end
|
149
170
|
|
@@ -12,6 +12,7 @@ We found that TP53 is regulated by MDM2 in Homo
|
|
12
12
|
sapiens
|
13
13
|
EOF
|
14
14
|
|
15
|
+
Rbbt::Config.add_entry :java_mem, "2G", :gnp
|
15
16
|
mentions = GNormPlus.process({:file => text})
|
16
17
|
|
17
18
|
assert_equal 1, mentions.length
|
@@ -23,6 +24,7 @@ sapiens
|
|
23
24
|
We found that TP53 is regulated by MDM2 in Homo sapiens
|
24
25
|
EOF
|
25
26
|
|
27
|
+
Rbbt::Config.add_entry :java_mem, "2G", :gnp
|
26
28
|
mentions = GNormPlus.entities({:file => text})
|
27
29
|
assert mentions["file"].include?("TP53")
|
28
30
|
mentions["file"].each do |mention|
|
@@ -88,6 +88,14 @@ class TestRegExpNER < Test::Unit::TestCase
|
|
88
88
|
end
|
89
89
|
|
90
90
|
|
91
|
+
def test_entities_named_captures
|
92
|
+
sentence = "In a sentence I should find not this but this"
|
93
|
+
|
94
|
+
ner = RegExpNER.new({:this => /(?<who>I) should find not this but (this)/})
|
95
|
+
matches = ner.entities(sentence)
|
96
|
+
end
|
97
|
+
|
98
|
+
|
91
99
|
|
92
100
|
def test_regexp_order
|
93
101
|
text =<<-EOF
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt-text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.
|
4
|
+
version: 1.3.10
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Miguel Vazquez
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-02-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rbbt-util
|
@@ -175,45 +175,45 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
175
175
|
- !ruby/object:Gem::Version
|
176
176
|
version: '0'
|
177
177
|
requirements: []
|
178
|
-
rubygems_version: 3.1.
|
178
|
+
rubygems_version: 3.1.2
|
179
179
|
signing_key:
|
180
180
|
specification_version: 4
|
181
181
|
summary: Text mining tools for the Ruby Bioinformatics Toolkit (rbbt)
|
182
182
|
test_files:
|
183
|
-
- test/
|
184
|
-
- test/
|
185
|
-
- test/rbbt/
|
183
|
+
- test/test_spaCy.rb
|
184
|
+
- test/test_helper.rb
|
185
|
+
- test/rbbt/bow/test_dictionary.rb
|
186
186
|
- test/rbbt/bow/test_bow.rb
|
187
187
|
- test/rbbt/bow/test_misc.rb
|
188
|
-
- test/rbbt/
|
189
|
-
- test/rbbt/
|
190
|
-
- test/rbbt/
|
188
|
+
- test/rbbt/segment/test_encoding.rb
|
189
|
+
- test/rbbt/segment/test_transformed.rb
|
190
|
+
- test/rbbt/segment/test_overlaps.rb
|
191
|
+
- test/rbbt/segment/test_named_entity.rb
|
192
|
+
- test/rbbt/segment/test_corpus.rb
|
193
|
+
- test/rbbt/segment/test_range_index.rb
|
194
|
+
- test/rbbt/segment/test_annotation.rb
|
195
|
+
- test/rbbt/entity/test_document.rb
|
191
196
|
- test/rbbt/document/corpus/test_pubmed.rb
|
192
197
|
- test/rbbt/document/test_corpus.rb
|
193
|
-
- test/rbbt/
|
198
|
+
- test/rbbt/document/test_annotation.rb
|
199
|
+
- test/rbbt/test_document.rb
|
194
200
|
- test/rbbt/ner/test_patterns.rb
|
195
|
-
- test/rbbt/ner/test_NER.rb
|
196
|
-
- test/rbbt/ner/test_abner.rb
|
197
201
|
- test/rbbt/ner/rnorm/test_tokens.rb
|
198
|
-
- test/rbbt/ner/test_rnorm.rb
|
199
|
-
- test/rbbt/ner/test_regexpNER.rb
|
200
202
|
- test/rbbt/ner/test_ngram_prefix_dictionary.rb
|
203
|
+
- test/rbbt/ner/test_token_trieNER.rb
|
204
|
+
- test/rbbt/ner/test_finder.rb
|
201
205
|
- test/rbbt/ner/test_brat.rb
|
206
|
+
- test/rbbt/ner/test_regexpNER.rb
|
202
207
|
- test/rbbt/ner/test_g_norm_plus.rb
|
208
|
+
- test/rbbt/ner/test_rnorm.rb
|
209
|
+
- test/rbbt/ner/test_linnaeus.rb
|
203
210
|
- test/rbbt/ner/test_chemical_tagger.rb
|
204
|
-
- test/rbbt/ner/
|
205
|
-
- test/rbbt/ner/
|
206
|
-
- test/rbbt/ner/test_finder.rb
|
211
|
+
- test/rbbt/ner/test_NER.rb
|
212
|
+
- test/rbbt/ner/test_abner.rb
|
207
213
|
- test/rbbt/ner/test_rner.rb
|
208
|
-
- test/rbbt/ner/test_linnaeus.rb
|
209
214
|
- test/rbbt/ner/test_oscar4.rb
|
215
|
+
- test/rbbt/ner/test_banner.rb
|
210
216
|
- test/rbbt/test_segment.rb
|
211
|
-
- test/rbbt/
|
212
|
-
- test/rbbt/
|
213
|
-
- test/rbbt/
|
214
|
-
- test/rbbt/segment/test_named_entity.rb
|
215
|
-
- test/rbbt/segment/test_encoding.rb
|
216
|
-
- test/rbbt/segment/test_range_index.rb
|
217
|
-
- test/rbbt/segment/test_corpus.rb
|
218
|
-
- test/test_spaCy.rb
|
219
|
-
- test/test_helper.rb
|
217
|
+
- test/rbbt/nlp/open_nlp/test_sentence_splitter.rb
|
218
|
+
- test/rbbt/nlp/test_nlp.rb
|
219
|
+
- test/rbbt/nlp/genia/test_sentence_splitter.rb
|