rbbt-text 1.3.9 → 1.3.11
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/rbbt/bow/misc.rb +1 -1
- data/lib/rbbt/document/annotation.rb +67 -2
- data/lib/rbbt/document/corpus/pubmed.rb +6 -4
- data/lib/rbbt/document.rb +4 -0
- data/lib/rbbt/ner/g_norm_plus.rb +2 -1
- data/lib/rbbt/ner/regexpNER.rb +10 -2
- data/lib/rbbt/segment/annotation.rb +1 -0
- data/lib/rbbt/segment/overlaps.rb +9 -1
- data/lib/rbbt/segment.rb +4 -0
- data/test/rbbt/document/test_annotation.rb +21 -0
- data/test/rbbt/ner/test_g_norm_plus.rb +2 -0
- data/test/rbbt/ner/test_regexpNER.rb +8 -0
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 587b7971fd0f8fec2602323ea480521445ca417c3a407d057cf95df1f3a36216
|
4
|
+
data.tar.gz: 4f66306f80e838a0a27299705d79e5856b38dd936005d2b18004539bbb192431
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4aa191aa4e5cb5e3f7d4a49b30beb1eb7259a34074a7521a10b9951cd1cc7a097a06ba6a97d9f4f4e100b2058de3f94f5199cb069a030f93a3f69bf1ecec09ff
|
7
|
+
data.tar.gz: f3d5eb11d12f8a9d951d1073abd7e6cb5ace99bd075e7dc897f0aa715ae1552271019b4eb6849a172529d830f30bbc09ce40dd351fd81f9c06f338b075523e36
|
data/lib/rbbt/bow/misc.rb
CHANGED
@@ -37,7 +37,7 @@ $greek.each{|l,s| $inverse_greek[s] = l }
|
|
37
37
|
|
38
38
|
class String
|
39
39
|
CONSONANTS = []
|
40
|
-
if File.
|
40
|
+
if File.exist? File.join(Rbbt.datadir, 'wordlists/consonants')
|
41
41
|
Object::Open.read(File.join(Rbbt.datadir, 'wordlists/consonants')).each_line{|l| CONSONANTS << l.chomp}
|
42
42
|
end
|
43
43
|
|
data/lib/rbbt/document/annotation.rb
CHANGED
@@ -6,7 +6,9 @@ module Document
|
|
6
6
|
send :property, type do
|
7
7
|
segments = self.instance_exec &block
|
8
8
|
|
9
|
-
Segment.align(self, segments) unless segments.empty? ||
|
9
|
+
Segment.align(self, segments) unless segments.empty? ||
|
10
|
+
(Segment === segments && segments.offset) ||
|
11
|
+
(Array === segments && Segment === segments.first && segments.first.offset)
|
10
12
|
|
11
13
|
segments.each do |segment|
|
12
14
|
SegmentAnnotation.setup(segment, :type => type.to_s) unless SegmentAnnotation === segment && segment.type
|
@@ -17,6 +19,36 @@ module Document
|
|
17
19
|
|
18
20
|
segments
|
19
21
|
end
|
22
|
+
|
23
|
+
DocID.property type do
|
24
|
+
self.document.send(type)
|
25
|
+
end
|
26
|
+
|
27
|
+
SegID.property type do
|
28
|
+
self.overlaps(self.docid.send(type))
|
29
|
+
end
|
30
|
+
|
31
|
+
Segment.property type do
|
32
|
+
self.overlaps(self.docid.send(type))
|
33
|
+
end
|
34
|
+
|
35
|
+
seg_type = "segids_for_" + type.to_s
|
36
|
+
|
37
|
+
send :property, seg_type do
|
38
|
+
SegID.setup(self.send(type).collect{|s| s.segid })
|
39
|
+
end
|
40
|
+
|
41
|
+
DocID.property seg_type do
|
42
|
+
self.document.send(seg_type)
|
43
|
+
end
|
44
|
+
|
45
|
+
SegID.property seg_type do
|
46
|
+
self.overlaps(self.docid.send(seg_type))
|
47
|
+
end
|
48
|
+
|
49
|
+
Segment.property seg_type do
|
50
|
+
self.overlaps(self.docid.send(seg_type))
|
51
|
+
end
|
20
52
|
end
|
21
53
|
|
22
54
|
def self.define_multiple(type, &block)
|
@@ -28,7 +60,10 @@ module Document
|
|
28
60
|
doc_segments.each_with_index do |segments,i|
|
29
61
|
next if segments.nil?
|
30
62
|
document = list[i]
|
31
|
-
Segment.align(document, segments) unless segments.nil? ||
|
63
|
+
Segment.align(document, segments) unless segments.nil? ||
|
64
|
+
segments.empty? ||
|
65
|
+
(Segment === segments && segments.offset) ||
|
66
|
+
(Array === segments && Segment === segments.first && segments.first.offset)
|
32
67
|
|
33
68
|
segments.each do |segment|
|
34
69
|
SegmentAnnotation.setup(segment, :type => type.to_s) unless SegmentAnnotation === segment && segment.type
|
@@ -41,5 +76,35 @@ module Document
|
|
41
76
|
segments
|
42
77
|
end
|
43
78
|
end
|
79
|
+
|
80
|
+
DocID.property type do
|
81
|
+
self.document.send(type)
|
82
|
+
end
|
83
|
+
|
84
|
+
SegID.property type do
|
85
|
+
self.overlaps(self.docid.send(type))
|
86
|
+
end
|
87
|
+
|
88
|
+
Segment.property type do
|
89
|
+
self.overlaps(self.docid.send(type))
|
90
|
+
end
|
91
|
+
|
92
|
+
seg_type = "segids_for_" + type.to_s
|
93
|
+
|
94
|
+
send :property, seg_type do
|
95
|
+
SegID.setup(self.send(type).collect{|s| s.segid })
|
96
|
+
end
|
97
|
+
|
98
|
+
DocID.property seg_type do
|
99
|
+
self.document.send(seg_type)
|
100
|
+
end
|
101
|
+
|
102
|
+
SegID.property seg_type do
|
103
|
+
self.overlaps(self.docid.send(seg_type))
|
104
|
+
end
|
105
|
+
|
106
|
+
Segment.property seg_type do
|
107
|
+
self.overlaps(self.docid.send(seg_type))
|
108
|
+
end
|
44
109
|
end
|
45
110
|
end
|
data/lib/rbbt/document/corpus/pubmed.rb
CHANGED
@@ -2,8 +2,8 @@ require 'rbbt/sources/pubmed'
|
|
2
2
|
|
3
3
|
module Document::Corpus
|
4
4
|
PUBMED_NAMESPACE="PMID"
|
5
|
-
def add_pmid(pmid, type =
|
6
|
-
type = :
|
5
|
+
def add_pmid(pmid, type = :title_and_abstract, update = false)
|
6
|
+
type = :title_and_abstract if type.nil?
|
7
7
|
|
8
8
|
if ! (update || Array === pmid)
|
9
9
|
id = [PUBMED_NAMESPACE, pmid, type].collect{|e| e.to_s}*":"
|
@@ -16,9 +16,11 @@ module Document::Corpus
|
|
16
16
|
|
17
17
|
res = PubMed.get_article(pmids).collect do |pmid, article|
|
18
18
|
document = if type.to_sym == :abstract
|
19
|
-
Document.setup(article.abstract || "", PUBMED_NAMESPACE, pmid,
|
19
|
+
Document.setup(article.abstract || "", PUBMED_NAMESPACE, pmid, type.to_sym , self, :corpus => self)
|
20
20
|
elsif type.to_sym == :title
|
21
|
-
Document.setup(article.title, PUBMED_NAMESPACE, pmid,
|
21
|
+
Document.setup(article.title || "", PUBMED_NAMESPACE, pmid, type.to_sym, self)
|
22
|
+
elsif type.to_sym == :title_and_abstract
|
23
|
+
Document.setup((article.title || "") + " " + (article.abstract || ""), PUBMED_NAMESPACE, pmid, type.to_sym, self)
|
22
24
|
else
|
23
25
|
raise "No FullText available for #{ pmid }" if article.full_text.nil?
|
24
26
|
Document.setup(article.full_text, PUBMED_NAMESPACE, pmid, :fulltext, self, :corpus => self)
|
data/lib/rbbt/document.rb
CHANGED
data/lib/rbbt/ner/g_norm_plus.rb
CHANGED
@@ -66,7 +66,8 @@ EOF
|
|
66
66
|
end
|
67
67
|
|
68
68
|
Open.write('config', CONFIG)
|
69
|
-
|
69
|
+
mem = Rbbt::Config.get(:java_mem, :GNormPlus, :g_norm_plus, :gnormplus, :gnp, :default => "2G")
|
70
|
+
CMD.cmd_log("java -Xmx#{mem} -Xms#{mem} -jar '#{Rbbt.software.opt.GNormPlus.produce.find}/GNormPlus.jar' 'input' 'output' 'config'")
|
70
71
|
|
71
72
|
if texts.respond_to? :key_field
|
72
73
|
key_field = texts.key_field
|
data/lib/rbbt/ner/regexpNER.rb
CHANGED
@@ -11,7 +11,15 @@ class RegExpNER < NER
|
|
11
11
|
pre = matchdata.pre_match
|
12
12
|
post = matchdata.post_match
|
13
13
|
|
14
|
-
if matchdata.
|
14
|
+
if matchdata.named_captures.any?
|
15
|
+
match = matchdata[0]
|
16
|
+
code = matchdata.named_captures.collect{|k,v| [k,v] * "=" } * ";"
|
17
|
+
NamedEntity.setup(match, :offset => pre.length + start, :entity_type => type, :code => code)
|
18
|
+
matches << match
|
19
|
+
eend = match.length + pre.length
|
20
|
+
text = text[eend..-1]
|
21
|
+
start += match.length + pre.length
|
22
|
+
elsif matchdata.captures.any?
|
15
23
|
match = matchdata.captures.first
|
16
24
|
offset, eend = matchdata.offset(1)
|
17
25
|
NamedEntity.setup(match, :offset => start + offset, :entity_type => type)
|
@@ -88,7 +96,7 @@ class RegExpNER < NER
|
|
88
96
|
def match(text)
|
89
97
|
matches = RegExpNER.match_regexp_hash(text, @regexps, @split_on_matches)
|
90
98
|
matches.collect do |m|
|
91
|
-
NamedEntity.setup(m, :offset => m.offset, :type => m.type, :code => m)
|
99
|
+
NamedEntity.setup(m, :offset => m.offset, :type => m.type, :code => m.code || m)
|
92
100
|
end
|
93
101
|
end
|
94
102
|
|
data/lib/rbbt/segment/overlaps.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
module Segment
|
1
|
+
module SegmentRanges
|
2
2
|
def pull(offset)
|
3
3
|
if self.offset.nil? or offset.nil?
|
4
4
|
self.offset = nil
|
@@ -61,3 +61,11 @@ module Segment
|
|
61
61
|
end
|
62
62
|
end
|
63
63
|
end
|
64
|
+
|
65
|
+
module Segment
|
66
|
+
include SegmentRanges
|
67
|
+
end
|
68
|
+
|
69
|
+
module SegID
|
70
|
+
include SegmentRanges
|
71
|
+
end
|
data/lib/rbbt/segment.rb
CHANGED
data/test/rbbt/document/test_annotation.rb
CHANGED
@@ -13,6 +13,10 @@ class TestAnnotation < Test::Unit::TestCase
|
|
13
13
|
self.split(" ")
|
14
14
|
end
|
15
15
|
|
16
|
+
Document.define :lines do
|
17
|
+
self.split("\n")
|
18
|
+
end
|
19
|
+
|
16
20
|
$called_once = false
|
17
21
|
Document.define :persisted_words do
|
18
22
|
raise CalledOnce if $called_once
|
@@ -145,5 +149,22 @@ class TestAnnotation < Test::Unit::TestCase
|
|
145
149
|
|
146
150
|
assert text.ner.first.segid.include?("TEST:")
|
147
151
|
end
|
152
|
+
|
153
|
+
def test_sentence_words
|
154
|
+
text =<<-EOF
|
155
|
+
This is sentence 1
|
156
|
+
This is sentence 2
|
157
|
+
EOF
|
158
|
+
|
159
|
+
Document.setup(text)
|
160
|
+
|
161
|
+
words = text.words
|
162
|
+
numbers = words.select{|w| w =~ /\d/}
|
163
|
+
text.lines.each do |sentence|
|
164
|
+
Transformed.with_transform(sentence, numbers, "[NUM]") do
|
165
|
+
puts sentence
|
166
|
+
end
|
167
|
+
end
|
168
|
+
end
|
148
169
|
end
|
149
170
|
|
data/test/rbbt/ner/test_g_norm_plus.rb
CHANGED
@@ -12,6 +12,7 @@ We found that TP53 is regulated by MDM2 in Homo
|
|
12
12
|
sapiens
|
13
13
|
EOF
|
14
14
|
|
15
|
+
Rbbt::Config.add_entry :java_mem, "2G", :gnp
|
15
16
|
mentions = GNormPlus.process({:file => text})
|
16
17
|
|
17
18
|
assert_equal 1, mentions.length
|
@@ -23,6 +24,7 @@ sapiens
|
|
23
24
|
We found that TP53 is regulated by MDM2 in Homo sapiens
|
24
25
|
EOF
|
25
26
|
|
27
|
+
Rbbt::Config.add_entry :java_mem, "2G", :gnp
|
26
28
|
mentions = GNormPlus.entities({:file => text})
|
27
29
|
assert mentions["file"].include?("TP53")
|
28
30
|
mentions["file"].each do |mention|
|
data/test/rbbt/ner/test_regexpNER.rb
CHANGED
@@ -88,6 +88,14 @@ class TestRegExpNER < Test::Unit::TestCase
|
|
88
88
|
end
|
89
89
|
|
90
90
|
|
91
|
+
def test_entities_named_captures
|
92
|
+
sentence = "In a sentence I should find not this but this"
|
93
|
+
|
94
|
+
ner = RegExpNER.new({:this => /(?<who>I) should find not this but (this)/})
|
95
|
+
matches = ner.entities(sentence)
|
96
|
+
end
|
97
|
+
|
98
|
+
|
91
99
|
|
92
100
|
def test_regexp_order
|
93
101
|
text =<<-EOF
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt-text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.9
|
4
|
+
version: 1.3.11
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Miguel Vazquez
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-04-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rbbt-util
|
@@ -175,7 +175,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
175
175
|
- !ruby/object:Gem::Version
|
176
176
|
version: '0'
|
177
177
|
requirements: []
|
178
|
-
rubygems_version: 3.
|
178
|
+
rubygems_version: 3.4.8
|
179
179
|
signing_key:
|
180
180
|
specification_version: 4
|
181
181
|
summary: Text mining tools for the Ruby Bioinformatics Toolkit (rbbt)
|