rbbt-text 1.3.6 → 1.3.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE +20 -0
- data/lib/rbbt/document/corpus/pubmed.rb +13 -5
- data/lib/rbbt/document/corpus.rb +4 -4
- data/lib/rbbt/document.rb +1 -3
- data/lib/rbbt/ner/oscar3.rb +0 -1
- data/lib/rbbt/ner/oscar4.rb +0 -1
- data/lib/rbbt/ner/rnorm/tokens.rb +3 -1
- data/lib/rbbt/ner/rnorm.rb +5 -1
- data/lib/rbbt/nlp/spaCy.rb +2 -2
- data/lib/rbbt/segment/transformed.rb +9 -1
- data/share/install/software/OpenNLP +3 -2
- data/test/rbbt/document/corpus/test_pubmed.rb +1 -1
- data/test/rbbt/ner/rnorm/test_tokens.rb +11 -0
- data/test/rbbt/ner/test_rnorm.rb +5 -0
- data/test/rbbt/segment/test_transformed.rb +9 -26
- metadata +7 -17
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e1b07b2646ecdc51599e2a2356fd18708e88d819944910a07930f67ec3fc012d
|
4
|
+
data.tar.gz: 03bcbe61f41d830668b50fcfc253fa2b43285774040f61fb3fb0a58f80e9dfd3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ae6de2dd809642ca38276ff82e243efeb193cf432bc78aea92e772ab21ff489f23224b9e93de726dcacdb06910716f1107171433cc39e7b022ba14ee4ed284f6
|
7
|
+
data.tar.gz: 82768060a28248d459031030b6ba49b500b63a9d3ae2199ccdf1417fd3b1f66ce0d962db17875615ee36bb3b5879d8ccbbdec892942f544fa08481b4551a1003
|
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2010-2022 Miguel Vázquez García
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/lib/rbbt/document/corpus/pubmed.rb
CHANGED
@@ -4,10 +4,11 @@ module Document::Corpus
|
|
4
4
|
PUBMED_NAMESPACE="PMID"
|
5
5
|
def add_pmid(pmid, type = nil, update = false)
|
6
6
|
type = :abstract if type.nil?
|
7
|
-
|
7
|
+
|
8
|
+
if ! (update || Array === pmid)
|
8
9
|
id = [PUBMED_NAMESPACE, pmid, type].collect{|e| e.to_s}*":"
|
9
10
|
documents = self.documents(id)
|
10
|
-
return documents if documents.any?
|
11
|
+
return documents.first if documents.any?
|
11
12
|
end
|
12
13
|
|
13
14
|
pmids = Array === pmid ? pmid : [pmid]
|
@@ -27,7 +28,14 @@ module Document::Corpus
|
|
27
28
|
document
|
28
29
|
end
|
29
30
|
|
30
|
-
|
31
|
+
if Array === pmid
|
32
|
+
corpus = res.first.corpus if res.first
|
33
|
+
Document.setup(res, :corpus => corpus)
|
34
|
+
else
|
35
|
+
res = res.first
|
36
|
+
end
|
37
|
+
|
38
|
+
res
|
31
39
|
end
|
32
40
|
|
33
41
|
def add_pubmed_query(query, max = 3000, type = nil)
|
@@ -35,8 +43,8 @@ module Document::Corpus
|
|
35
43
|
add_pmid(pmids, type)
|
36
44
|
end
|
37
45
|
|
38
|
-
self.claim "PMID" do |id,
|
46
|
+
self.claim "PMID" do |id,type,update|
|
39
47
|
Log.debug "Claiming #{id}"
|
40
|
-
self.add_pmid(id, type)
|
48
|
+
self.add_pmid(id, type,update)
|
41
49
|
end
|
42
50
|
end
|
data/lib/rbbt/document/corpus.rb
CHANGED
@@ -20,9 +20,9 @@ module Document::Corpus
|
|
20
20
|
|
21
21
|
def docids(*prefix)
|
22
22
|
prefix = prefix * ":"
|
23
|
-
prefix += ":" unless prefix == :all || prefix[-1] == ":"
|
23
|
+
prefix += ":" unless prefix == :all || prefix == "all" || prefix[-1] == ":"
|
24
24
|
docids = self.read_and_close do
|
25
|
-
prefix ==
|
25
|
+
prefix == "all" ? self.keys : self.prefix(prefix)
|
26
26
|
end
|
27
27
|
DocID.setup(docids, :corpus => self)
|
28
28
|
end
|
@@ -34,7 +34,7 @@ module Document::Corpus
|
|
34
34
|
def [](*args)
|
35
35
|
docid, *rest = args
|
36
36
|
|
37
|
-
res = self.
|
37
|
+
res = self.with_read do
|
38
38
|
super(*args)
|
39
39
|
end
|
40
40
|
|
@@ -44,7 +44,7 @@ module Document::Corpus
|
|
44
44
|
namespace, id, type = docid.split(":")
|
45
45
|
|
46
46
|
if res.nil?
|
47
|
-
if Document::Corpus.claims.include?(namespace.to_s)
|
47
|
+
if Document::Corpus.claims && Document::Corpus.claims.include?(namespace.to_s)
|
48
48
|
res = self.instance_exec(id, type, &Document::Corpus.claims[namespace.to_s])
|
49
49
|
end
|
50
50
|
end
|
data/lib/rbbt/document.rb
CHANGED
@@ -22,9 +22,7 @@ module DocID
|
|
22
22
|
if Array === self
|
23
23
|
namespace, id, type = nil, nil, nil
|
24
24
|
docs = self.collect do |docid|
|
25
|
-
|
26
|
-
namespace, id, type = docid.split(":")
|
27
|
-
text
|
25
|
+
self.corpus[docid]
|
28
26
|
end
|
29
27
|
Document.setup(docs, :corpus => corpus)
|
30
28
|
else
|
data/lib/rbbt/ner/oscar3.rb
CHANGED
data/lib/rbbt/ner/oscar4.rb
CHANGED
data/lib/rbbt/ner/rnorm/tokens.rb
CHANGED
@@ -172,6 +172,7 @@ class Tokenizer
|
|
172
172
|
|
173
173
|
#{{{ Token Types
|
174
174
|
GREEK_RE = "(?:" + $greek.keys.select{|w| w.length > 3}.collect{|w| w.downcase}.join("|") + ")"
|
175
|
+
GREEK_LETTER_RE = "(?:" + $inverse_greek.keys.select{|w| w.length == 1}.collect{|w| w.upcase}.join("|") + ")"
|
175
176
|
def tokenize(word)
|
176
177
|
return word.
|
177
178
|
gsub(/([^IVX])I$/,'\1|I|'). # Separate last roman number
|
@@ -180,6 +181,7 @@ class Tokenizer
|
|
180
181
|
gsub(/([A-Z]{2,})([a-z])/,'\1-\2').
|
181
182
|
gsub(/^(#{GREEK_RE})/,'\1-').
|
182
183
|
gsub(/(#{GREEK_RE})$/,'-\1').
|
184
|
+
gsub(/(#{GREEK_LETTER_RE})$/,'-\1').
|
183
185
|
split( /[^\w.]+/). # Split by separator char
|
184
186
|
select{|t| !t.empty? }
|
185
187
|
end
|
@@ -204,7 +206,7 @@ class Tokenizer
|
|
204
206
|
end
|
205
207
|
|
206
208
|
#{{{ Comparisons
|
207
|
-
|
209
|
+
|
208
210
|
def evaluate_tokens(list1, list2)
|
209
211
|
@operations.inject(0){|acc, o|
|
210
212
|
acc + o.eval(list1, list2)
|
data/lib/rbbt/ner/rnorm.rb
CHANGED
@@ -18,6 +18,10 @@ class Normalizer
|
|
18
18
|
values.select{|p| p[1] == best}
|
19
19
|
end
|
20
20
|
|
21
|
+
def token_evaluate(mention, name)
|
22
|
+
@tokens.evaluate(mention, name)
|
23
|
+
end
|
24
|
+
|
21
25
|
# Compares the tokens and gives each candidate a score based on the
|
22
26
|
# commonalities and differences amongst the tokens.
|
23
27
|
def token_score(code, mention)
|
@@ -31,7 +35,7 @@ class Normalizer
|
|
31
35
|
when mention.downcase.gsub(/\s/,'') == name.downcase.gsub(/\s/,'')
|
32
36
|
80
|
33
37
|
else
|
34
|
-
|
38
|
+
token_evaluate(mention, name)
|
35
39
|
end
|
36
40
|
[value, name]
|
37
41
|
}.sort_by{|value, name| value }.last
|
data/lib/rbbt/nlp/spaCy.rb
CHANGED
@@ -153,8 +153,8 @@ module SpaCy
|
|
153
153
|
|
154
154
|
chunk_index = Segment.index(SpaCy.chunk_segments(text, lang))
|
155
155
|
|
156
|
-
source_id = chunk_index[source.offset].first || source.segid
|
157
|
-
target_id = chunk_index[target.offset].first || target.segid
|
156
|
+
source_id = chunk_index[source.offset.to_i].first || source.segid
|
157
|
+
target_id = chunk_index[target.offset.to_i].first || target.segid
|
158
158
|
|
159
159
|
path = Paths.dijkstra(graph, source_id, [target_id])
|
160
160
|
|
data/lib/rbbt/segment/transformed.rb
CHANGED
@@ -70,7 +70,15 @@ module Transformed
|
|
70
70
|
orig_length = self.length
|
71
71
|
|
72
72
|
offset = self.respond_to?(:offset) ? self.offset.to_i : 0
|
73
|
-
|
73
|
+
|
74
|
+
segments = segments.select do |s|
|
75
|
+
shift = shift s.range
|
76
|
+
s_offset = s.offset.to_i
|
77
|
+
s_offset += shift.first if shift
|
78
|
+
|
79
|
+
s_offset >= offset &&
|
80
|
+
s_offset <= offset + self.length - 1
|
81
|
+
end
|
74
82
|
|
75
83
|
Segment.clean_sort(segments).each do |segment|
|
76
84
|
next if segment.offset.nil?
|
data/share/install/software/OpenNLP
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
#!/bin/bash
|
2
2
|
|
3
3
|
name='OpenNLP'
|
4
|
-
url="http://apache.rediris.es/opennlp/opennlp-1.9.
|
4
|
+
url="http://apache.rediris.es/opennlp/opennlp-1.9.4/apache-opennlp-1.9.4-bin.tar.gz"
|
5
5
|
|
6
|
-
|
6
|
+
install_src $name $url
|
7
|
+
(cd $OPT_DIR/jars; ln -s $OPT_DIR/$name/lib/*.jar .)
|
data/test/rbbt/document/corpus/test_pubmed.rb
CHANGED
@@ -7,7 +7,7 @@ class TestCorpusPubmed < Test::Unit::TestCase
|
|
7
7
|
def test_add_pmid
|
8
8
|
corpus = Document::Corpus.setup({})
|
9
9
|
|
10
|
-
document = corpus.add_pmid("33359141", :abstract)
|
10
|
+
document = corpus.add_pmid("33359141", :abstract, true)
|
11
11
|
title = document.to(:title)
|
12
12
|
assert title.include?("COVID-19")
|
13
13
|
end
|
data/test/rbbt/ner/rnorm/test_tokens.rb
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/ner/rnorm'
|
3
|
+
|
4
|
+
class TestRNorm < Test::Unit::TestCase
|
5
|
+
def test_evaluate
|
6
|
+
t = Tokenizer.new
|
7
|
+
assert t.evaluate("PDGFRA","PDGFRalpha") > 0
|
8
|
+
iii t.evaluate("JUNB","JunB")
|
9
|
+
end
|
10
|
+
end
|
11
|
+
|
data/test/rbbt/ner/test_rnorm.rb
CHANGED
data/test/rbbt/segment/test_transformed.rb
CHANGED
@@ -393,43 +393,26 @@ This is another sentence. Among the nonstructural proteins, the leader protein (
|
|
393
393
|
end
|
394
394
|
end
|
395
395
|
|
396
|
-
def
|
397
|
-
a = "
|
396
|
+
def test_transform_sorter_end
|
397
|
+
a = "The transcription factors farnesoid X receptor, small heterodimer partner, liver receptor homolog-1, and liver X receptor comprise the signaling cascade network that regulates the expression and secretion of apoM."
|
398
398
|
original = a.dup
|
399
399
|
|
400
|
-
gene1 = "
|
400
|
+
gene1 = "liver receptor homolog-1"
|
401
401
|
gene1.extend Segment
|
402
402
|
gene1.offset = a.index gene1
|
403
403
|
|
404
|
-
gene2 = "
|
404
|
+
gene2 = "apoM"
|
405
405
|
gene2.extend Segment
|
406
406
|
gene2.offset = a.index gene2
|
407
407
|
|
408
408
|
assert_equal gene1, a[gene1.range]
|
409
409
|
assert_equal gene2, a[gene2.range]
|
410
410
|
|
411
|
-
|
412
|
-
|
413
|
-
|
414
|
-
|
415
|
-
|
416
|
-
assert_equal c, Transformed.transform(a,[gene1], "GN")
|
417
|
-
|
418
|
-
iii a.transformation_offset_differences
|
419
|
-
raise
|
420
|
-
assert_equal gene2.offset, a.transformation_offset_differences.first.first.first
|
421
|
-
assert_equal gene1.offset, a.transformation_offset_differences.last.first.first
|
422
|
-
|
423
|
-
|
424
|
-
gene3 = "GN gene"
|
425
|
-
gene3.extend Segment
|
426
|
-
gene3.offset = a.index gene3
|
427
|
-
|
428
|
-
assert_equal gene3, a[gene3.range]
|
429
|
-
|
430
|
-
a.restore([gene3])
|
431
|
-
assert_equal original, a
|
432
|
-
assert_equal "TP53 gene", a[gene3.range]
|
411
|
+
Transformed.with_transform(a, [gene1], "[TF]") do
|
412
|
+
Transformed.with_transform(a, [gene2], "[TG]") do
|
413
|
+
assert_equal "The transcription factors farnesoid X receptor, small heterodimer partner, [TF], and liver X receptor comprise the signaling cascade network that regulates the expression and secretion of [TG].", a
|
414
|
+
end
|
415
|
+
end
|
433
416
|
|
434
417
|
end
|
435
418
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt-text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.
|
4
|
+
version: 1.3.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Miguel Vazquez
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-11-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rbbt-util
|
@@ -38,20 +38,6 @@ dependencies:
|
|
38
38
|
- - ">="
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0'
|
41
|
-
- !ruby/object:Gem::Dependency
|
42
|
-
name: libxml-ruby
|
43
|
-
requirement: !ruby/object:Gem::Requirement
|
44
|
-
requirements:
|
45
|
-
- - ">="
|
46
|
-
- !ruby/object:Gem::Version
|
47
|
-
version: '0'
|
48
|
-
type: :runtime
|
49
|
-
prerelease: false
|
50
|
-
version_requirements: !ruby/object:Gem::Requirement
|
51
|
-
requirements:
|
52
|
-
- - ">="
|
53
|
-
- !ruby/object:Gem::Version
|
54
|
-
version: '0'
|
55
41
|
- !ruby/object:Gem::Dependency
|
56
42
|
name: json
|
57
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -72,8 +58,10 @@ email: miguel.vazquez@fdi.ucm.es
|
|
72
58
|
executables:
|
73
59
|
- get_ppis.rb
|
74
60
|
extensions: []
|
75
|
-
extra_rdoc_files:
|
61
|
+
extra_rdoc_files:
|
62
|
+
- LICENSE
|
76
63
|
files:
|
64
|
+
- LICENSE
|
77
65
|
- bin/get_ppis.rb
|
78
66
|
- lib/rbbt/bow/bow.rb
|
79
67
|
- lib/rbbt/bow/dictionary.rb
|
@@ -139,6 +127,7 @@ files:
|
|
139
127
|
- test/rbbt/document/test_annotation.rb
|
140
128
|
- test/rbbt/document/test_corpus.rb
|
141
129
|
- test/rbbt/entity/test_document.rb
|
130
|
+
- test/rbbt/ner/rnorm/test_tokens.rb
|
142
131
|
- test/rbbt/ner/test_NER.rb
|
143
132
|
- test/rbbt/ner/test_abner.rb
|
144
133
|
- test/rbbt/ner/test_banner.rb
|
@@ -205,6 +194,7 @@ test_files:
|
|
205
194
|
- test/rbbt/ner/test_patterns.rb
|
206
195
|
- test/rbbt/ner/test_NER.rb
|
207
196
|
- test/rbbt/ner/test_abner.rb
|
197
|
+
- test/rbbt/ner/rnorm/test_tokens.rb
|
208
198
|
- test/rbbt/ner/test_rnorm.rb
|
209
199
|
- test/rbbt/ner/test_regexpNER.rb
|
210
200
|
- test/rbbt/ner/test_ngram_prefix_dictionary.rb
|