rbbt-text 1.3.9 → 1.3.10
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/rbbt/document/annotation.rb +67 -2
- data/lib/rbbt/document/corpus/pubmed.rb +6 -4
- data/lib/rbbt/document.rb +4 -0
- data/lib/rbbt/ner/g_norm_plus.rb +2 -1
- data/lib/rbbt/ner/regexpNER.rb +10 -2
- data/lib/rbbt/segment/annotation.rb +1 -0
- data/lib/rbbt/segment/overlaps.rb +9 -1
- data/lib/rbbt/segment.rb +4 -0
- data/test/rbbt/document/test_annotation.rb +21 -0
- data/test/rbbt/ner/test_g_norm_plus.rb +2 -0
- data/test/rbbt/ner/test_regexpNER.rb +8 -0
- metadata +27 -27
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2f10312d9b6598ddc9b6fa98b38909afdd575b33a497ae1ff3f17c7a9c6e37bf
|
4
|
+
data.tar.gz: f79c61c7e34dd113a2c5002342c0c2df92a4a28c770394bf2c456a34a2730cc7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a9fb4dc49c538a58a8aa04e81947df212668c5ef9097434fa7d3eff54dd17a8657f581451b64e6b247cb64428436823a305dd64ae6a5fed2126b92285c02ad81
|
7
|
+
data.tar.gz: 0d31423660cd232102aa2b9914dab61ff929cf02a37b5094bd58481cac733c167d0e4fcdb4b3025e41a4775bd8033566ed3f402f66c317b3955406d1a3d3eb6f
|
data/lib/rbbt/document/annotation.rb
CHANGED
@@ -6,7 +6,9 @@ module Document
|
|
6
6
|
send :property, type do
|
7
7
|
segments = self.instance_exec &block
|
8
8
|
|
9
|
-
Segment.align(self, segments) unless segments.empty? ||
|
9
|
+
Segment.align(self, segments) unless segments.empty? ||
|
10
|
+
(Segment === segments && segments.offset) ||
|
11
|
+
(Array === segments && Segment === segments.first && segments.first.offset)
|
10
12
|
|
11
13
|
segments.each do |segment|
|
12
14
|
SegmentAnnotation.setup(segment, :type => type.to_s) unless SegmentAnnotation === segment && segment.type
|
@@ -17,6 +19,36 @@ module Document
|
|
17
19
|
|
18
20
|
segments
|
19
21
|
end
|
22
|
+
|
23
|
+
DocID.property type do
|
24
|
+
self.document.send(type)
|
25
|
+
end
|
26
|
+
|
27
|
+
SegID.property type do
|
28
|
+
self.overlaps(self.docid.send(type))
|
29
|
+
end
|
30
|
+
|
31
|
+
Segment.property type do
|
32
|
+
self.overlaps(self.docid.send(type))
|
33
|
+
end
|
34
|
+
|
35
|
+
seg_type = "segids_for_" + type.to_s
|
36
|
+
|
37
|
+
send :property, seg_type do
|
38
|
+
SegID.setup(self.send(type).collect{|s| s.segid })
|
39
|
+
end
|
40
|
+
|
41
|
+
DocID.property seg_type do
|
42
|
+
self.document.send(seg_type)
|
43
|
+
end
|
44
|
+
|
45
|
+
SegID.property seg_type do
|
46
|
+
self.overlaps(self.docid.send(seg_type))
|
47
|
+
end
|
48
|
+
|
49
|
+
Segment.property seg_type do
|
50
|
+
self.overlaps(self.docid.send(seg_type))
|
51
|
+
end
|
20
52
|
end
|
21
53
|
|
22
54
|
def self.define_multiple(type, &block)
|
@@ -28,7 +60,10 @@ module Document
|
|
28
60
|
doc_segments.each_with_index do |segments,i|
|
29
61
|
next if segments.nil?
|
30
62
|
document = list[i]
|
31
|
-
Segment.align(document, segments) unless segments.nil? ||
|
63
|
+
Segment.align(document, segments) unless segments.nil? ||
|
64
|
+
segments.empty? ||
|
65
|
+
(Segment === segments && segments.offset) ||
|
66
|
+
(Array === segments && Segment === segments.first && segments.first.offset)
|
32
67
|
|
33
68
|
segments.each do |segment|
|
34
69
|
SegmentAnnotation.setup(segment, :type => type.to_s) unless SegmentAnnotation === segment && segment.type
|
@@ -41,5 +76,35 @@ module Document
|
|
41
76
|
segments
|
42
77
|
end
|
43
78
|
end
|
79
|
+
|
80
|
+
DocID.property type do
|
81
|
+
self.document.send(type)
|
82
|
+
end
|
83
|
+
|
84
|
+
SegID.property type do
|
85
|
+
self.overlaps(self.docid.send(type))
|
86
|
+
end
|
87
|
+
|
88
|
+
Segment.property type do
|
89
|
+
self.overlaps(self.docid.send(type))
|
90
|
+
end
|
91
|
+
|
92
|
+
seg_type = "segids_for_" + type.to_s
|
93
|
+
|
94
|
+
send :property, seg_type do
|
95
|
+
SegID.setup(self.send(type).collect{|s| s.segid })
|
96
|
+
end
|
97
|
+
|
98
|
+
DocID.property seg_type do
|
99
|
+
self.document.send(seg_type)
|
100
|
+
end
|
101
|
+
|
102
|
+
SegID.property seg_type do
|
103
|
+
self.overlaps(self.docid.send(seg_type))
|
104
|
+
end
|
105
|
+
|
106
|
+
Segment.property seg_type do
|
107
|
+
self.overlaps(self.docid.send(seg_type))
|
108
|
+
end
|
44
109
|
end
|
45
110
|
end
|
data/lib/rbbt/document/corpus/pubmed.rb
CHANGED
@@ -2,8 +2,8 @@ require 'rbbt/sources/pubmed'
|
|
2
2
|
|
3
3
|
module Document::Corpus
|
4
4
|
PUBMED_NAMESPACE="PMID"
|
5
|
-
def add_pmid(pmid, type =
|
6
|
-
type = :
|
5
|
+
def add_pmid(pmid, type = :title_and_abstract, update = false)
|
6
|
+
type = :title_and_abstract if type.nil?
|
7
7
|
|
8
8
|
if ! (update || Array === pmid)
|
9
9
|
id = [PUBMED_NAMESPACE, pmid, type].collect{|e| e.to_s}*":"
|
@@ -16,9 +16,11 @@ module Document::Corpus
|
|
16
16
|
|
17
17
|
res = PubMed.get_article(pmids).collect do |pmid, article|
|
18
18
|
document = if type.to_sym == :abstract
|
19
|
-
Document.setup(article.abstract || "", PUBMED_NAMESPACE, pmid,
|
19
|
+
Document.setup(article.abstract || "", PUBMED_NAMESPACE, pmid, type.to_sym , self, :corpus => self)
|
20
20
|
elsif type.to_sym == :title
|
21
|
-
Document.setup(article.title, PUBMED_NAMESPACE, pmid,
|
21
|
+
Document.setup(article.title || "", PUBMED_NAMESPACE, pmid, type.to_sym, self)
|
22
|
+
elsif type.to_sym == :title_and_abstract
|
23
|
+
Document.setup((article.title || "") + "\n\n" + (article.abstract || ""), PUBMED_NAMESPACE, pmid, type.to_sym, self)
|
22
24
|
else
|
23
25
|
raise "No FullText available for #{ pmid }" if article.full_text.nil?
|
24
26
|
Document.setup(article.full_text, PUBMED_NAMESPACE, pmid, :fulltext, self, :corpus => self)
|
data/lib/rbbt/document.rb
CHANGED
data/lib/rbbt/ner/g_norm_plus.rb
CHANGED
@@ -66,7 +66,8 @@ EOF
|
|
66
66
|
end
|
67
67
|
|
68
68
|
Open.write('config', CONFIG)
|
69
|
-
|
69
|
+
mem = Rbbt::Config.get(:java_mem, :GNormPlus, :g_norm_plus, :gnormplus, :gnp, :default => "2G")
|
70
|
+
CMD.cmd_log("java -Xmx#{mem} -Xms#{mem} -jar '#{Rbbt.software.opt.GNormPlus.produce.find}/GNormPlus.jar' 'input' 'output' 'config'")
|
70
71
|
|
71
72
|
if texts.respond_to? :key_field
|
72
73
|
key_field = texts.key_field
|
data/lib/rbbt/ner/regexpNER.rb
CHANGED
@@ -11,7 +11,15 @@ class RegExpNER < NER
|
|
11
11
|
pre = matchdata.pre_match
|
12
12
|
post = matchdata.post_match
|
13
13
|
|
14
|
-
if matchdata.
|
14
|
+
if matchdata.named_captures.any?
|
15
|
+
match = matchdata[0]
|
16
|
+
code = matchdata.named_captures.collect{|k,v| [k,v] * "=" } * ";"
|
17
|
+
NamedEntity.setup(match, :offset => pre.length + start, :entity_type => type, :code => code)
|
18
|
+
matches << match
|
19
|
+
eend = match.length + pre.length
|
20
|
+
text = text[eend..-1]
|
21
|
+
start += match.length + pre.length
|
22
|
+
elsif matchdata.captures.any?
|
15
23
|
match = matchdata.captures.first
|
16
24
|
offset, eend = matchdata.offset(1)
|
17
25
|
NamedEntity.setup(match, :offset => start + offset, :entity_type => type)
|
@@ -88,7 +96,7 @@ class RegExpNER < NER
|
|
88
96
|
def match(text)
|
89
97
|
matches = RegExpNER.match_regexp_hash(text, @regexps, @split_on_matches)
|
90
98
|
matches.collect do |m|
|
91
|
-
NamedEntity.setup(m, :offset => m.offset, :type => m.type, :code => m)
|
99
|
+
NamedEntity.setup(m, :offset => m.offset, :type => m.type, :code => m.code || m)
|
92
100
|
end
|
93
101
|
end
|
94
102
|
|
data/lib/rbbt/segment/overlaps.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
module Segment
|
1
|
+
module SegmentRanges
|
2
2
|
def pull(offset)
|
3
3
|
if self.offset.nil? or offset.nil?
|
4
4
|
self.offset = nil
|
@@ -61,3 +61,11 @@ module Segment
|
|
61
61
|
end
|
62
62
|
end
|
63
63
|
end
|
64
|
+
|
65
|
+
module Segment
|
66
|
+
include SegmentRanges
|
67
|
+
end
|
68
|
+
|
69
|
+
module SegID
|
70
|
+
include SegmentRanges
|
71
|
+
end
|
data/lib/rbbt/segment.rb
CHANGED
data/test/rbbt/document/test_annotation.rb
CHANGED
@@ -13,6 +13,10 @@ class TestAnnotation < Test::Unit::TestCase
|
|
13
13
|
self.split(" ")
|
14
14
|
end
|
15
15
|
|
16
|
+
Document.define :lines do
|
17
|
+
self.split("\n")
|
18
|
+
end
|
19
|
+
|
16
20
|
$called_once = false
|
17
21
|
Document.define :persisted_words do
|
18
22
|
raise CalledOnce if $called_once
|
@@ -145,5 +149,22 @@ class TestAnnotation < Test::Unit::TestCase
|
|
145
149
|
|
146
150
|
assert text.ner.first.segid.include?("TEST:")
|
147
151
|
end
|
152
|
+
|
153
|
+
def test_sentence_words
|
154
|
+
text =<<-EOF
|
155
|
+
This is sentence 1
|
156
|
+
This is sentence 2
|
157
|
+
EOF
|
158
|
+
|
159
|
+
Document.setup(text)
|
160
|
+
|
161
|
+
words = text.words
|
162
|
+
numbers = words.select{|w| w =~ /\d/}
|
163
|
+
text.lines.each do |sentence|
|
164
|
+
Transformed.with_transform(sentence, numbers, "[NUM]") do
|
165
|
+
puts sentence
|
166
|
+
end
|
167
|
+
end
|
168
|
+
end
|
148
169
|
end
|
149
170
|
|
data/test/rbbt/ner/test_g_norm_plus.rb
CHANGED
@@ -12,6 +12,7 @@ We found that TP53 is regulated by MDM2 in Homo
|
|
12
12
|
sapiens
|
13
13
|
EOF
|
14
14
|
|
15
|
+
Rbbt::Config.add_entry :java_mem, "2G", :gnp
|
15
16
|
mentions = GNormPlus.process({:file => text})
|
16
17
|
|
17
18
|
assert_equal 1, mentions.length
|
@@ -23,6 +24,7 @@ sapiens
|
|
23
24
|
We found that TP53 is regulated by MDM2 in Homo sapiens
|
24
25
|
EOF
|
25
26
|
|
27
|
+
Rbbt::Config.add_entry :java_mem, "2G", :gnp
|
26
28
|
mentions = GNormPlus.entities({:file => text})
|
27
29
|
assert mentions["file"].include?("TP53")
|
28
30
|
mentions["file"].each do |mention|
|
data/test/rbbt/ner/test_regexpNER.rb
CHANGED
@@ -88,6 +88,14 @@ class TestRegExpNER < Test::Unit::TestCase
|
|
88
88
|
end
|
89
89
|
|
90
90
|
|
91
|
+
def test_entities_named_captures
|
92
|
+
sentence = "In a sentence I should find not this but this"
|
93
|
+
|
94
|
+
ner = RegExpNER.new({:this => /(?<who>I) should find not this but (this)/})
|
95
|
+
matches = ner.entities(sentence)
|
96
|
+
end
|
97
|
+
|
98
|
+
|
91
99
|
|
92
100
|
def test_regexp_order
|
93
101
|
text =<<-EOF
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt-text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.9
|
4
|
+
version: 1.3.10
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Miguel Vazquez
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-02-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rbbt-util
|
@@ -175,45 +175,45 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
175
175
|
- !ruby/object:Gem::Version
|
176
176
|
version: '0'
|
177
177
|
requirements: []
|
178
|
-
rubygems_version: 3.1.
|
178
|
+
rubygems_version: 3.1.2
|
179
179
|
signing_key:
|
180
180
|
specification_version: 4
|
181
181
|
summary: Text mining tools for the Ruby Bioinformatics Toolkit (rbbt)
|
182
182
|
test_files:
|
183
|
-
- test/
|
184
|
-
- test/
|
185
|
-
- test/rbbt/
|
183
|
+
- test/test_spaCy.rb
|
184
|
+
- test/test_helper.rb
|
185
|
+
- test/rbbt/bow/test_dictionary.rb
|
186
186
|
- test/rbbt/bow/test_bow.rb
|
187
187
|
- test/rbbt/bow/test_misc.rb
|
188
|
-
- test/rbbt/
|
189
|
-
- test/rbbt/
|
190
|
-
- test/rbbt/
|
188
|
+
- test/rbbt/segment/test_encoding.rb
|
189
|
+
- test/rbbt/segment/test_transformed.rb
|
190
|
+
- test/rbbt/segment/test_overlaps.rb
|
191
|
+
- test/rbbt/segment/test_named_entity.rb
|
192
|
+
- test/rbbt/segment/test_corpus.rb
|
193
|
+
- test/rbbt/segment/test_range_index.rb
|
194
|
+
- test/rbbt/segment/test_annotation.rb
|
195
|
+
- test/rbbt/entity/test_document.rb
|
191
196
|
- test/rbbt/document/corpus/test_pubmed.rb
|
192
197
|
- test/rbbt/document/test_corpus.rb
|
193
|
-
- test/rbbt/
|
198
|
+
- test/rbbt/document/test_annotation.rb
|
199
|
+
- test/rbbt/test_document.rb
|
194
200
|
- test/rbbt/ner/test_patterns.rb
|
195
|
-
- test/rbbt/ner/test_NER.rb
|
196
|
-
- test/rbbt/ner/test_abner.rb
|
197
201
|
- test/rbbt/ner/rnorm/test_tokens.rb
|
198
|
-
- test/rbbt/ner/test_rnorm.rb
|
199
|
-
- test/rbbt/ner/test_regexpNER.rb
|
200
202
|
- test/rbbt/ner/test_ngram_prefix_dictionary.rb
|
203
|
+
- test/rbbt/ner/test_token_trieNER.rb
|
204
|
+
- test/rbbt/ner/test_finder.rb
|
201
205
|
- test/rbbt/ner/test_brat.rb
|
206
|
+
- test/rbbt/ner/test_regexpNER.rb
|
202
207
|
- test/rbbt/ner/test_g_norm_plus.rb
|
208
|
+
- test/rbbt/ner/test_rnorm.rb
|
209
|
+
- test/rbbt/ner/test_linnaeus.rb
|
203
210
|
- test/rbbt/ner/test_chemical_tagger.rb
|
204
|
-
- test/rbbt/ner/
|
205
|
-
- test/rbbt/ner/
|
206
|
-
- test/rbbt/ner/test_finder.rb
|
211
|
+
- test/rbbt/ner/test_NER.rb
|
212
|
+
- test/rbbt/ner/test_abner.rb
|
207
213
|
- test/rbbt/ner/test_rner.rb
|
208
|
-
- test/rbbt/ner/test_linnaeus.rb
|
209
214
|
- test/rbbt/ner/test_oscar4.rb
|
215
|
+
- test/rbbt/ner/test_banner.rb
|
210
216
|
- test/rbbt/test_segment.rb
|
211
|
-
- test/rbbt/
|
212
|
-
- test/rbbt/
|
213
|
-
- test/rbbt/
|
214
|
-
- test/rbbt/segment/test_named_entity.rb
|
215
|
-
- test/rbbt/segment/test_encoding.rb
|
216
|
-
- test/rbbt/segment/test_range_index.rb
|
217
|
-
- test/rbbt/segment/test_corpus.rb
|
218
|
-
- test/test_spaCy.rb
|
219
|
-
- test/test_helper.rb
|
217
|
+
- test/rbbt/nlp/open_nlp/test_sentence_splitter.rb
|
218
|
+
- test/rbbt/nlp/test_nlp.rb
|
219
|
+
- test/rbbt/nlp/genia/test_sentence_splitter.rb
|