rbbt-text 1.3.6 → 1.3.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/LICENSE +20 -0
- data/lib/rbbt/document/corpus.rb +4 -4
- data/lib/rbbt/document.rb +1 -3
- data/lib/rbbt/ner/oscar3.rb +0 -1
- data/lib/rbbt/ner/oscar4.rb +0 -1
- data/lib/rbbt/ner/rnorm/tokens.rb +3 -1
- data/lib/rbbt/ner/rnorm.rb +5 -1
- data/lib/rbbt/segment/transformed.rb +9 -1
- data/share/install/software/OpenNLP +3 -2
- data/test/rbbt/ner/rnorm/test_tokens.rb +11 -0
- data/test/rbbt/ner/test_rnorm.rb +5 -0
- data/test/rbbt/segment/test_transformed.rb +9 -26
- metadata +7 -17
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8dfc374254fcbe88c8be6bfffd9a3cfabf6e23c953c11ecd2f61cf41027ff3d6
|
4
|
+
data.tar.gz: 3d3211f41cfecea05862505d1508a4b7b76eecb3c90b3b0000194eb08033715e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7ed870e46bae2c113d0885697bfbade6064732a89477833c640eaf4ee8bdb2c0fbf52f69f456af5eb30a82e56a7f0aeb37e71127f884430c3d315202a07fa3cb
|
7
|
+
data.tar.gz: e31853e816321a5ead788036b5f67eecaca179c75168c0bb2804be1f18ae844031ab808a4e3c9d67e1f9a52f94ca478949798b8101e164eba32481c0182a1f58
|
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2010-2022 Miguel Vázquez García
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/lib/rbbt/document/corpus.rb
CHANGED
@@ -20,9 +20,9 @@ module Document::Corpus
|
|
20
20
|
|
21
21
|
def docids(*prefix)
|
22
22
|
prefix = prefix * ":"
|
23
|
-
prefix += ":" unless prefix == :all || prefix[-1] == ":"
|
23
|
+
prefix += ":" unless prefix == :all || prefix == "all" || prefix[-1] == ":"
|
24
24
|
docids = self.read_and_close do
|
25
|
-
prefix ==
|
25
|
+
prefix == "all" ? self.keys : self.prefix(prefix)
|
26
26
|
end
|
27
27
|
DocID.setup(docids, :corpus => self)
|
28
28
|
end
|
@@ -34,7 +34,7 @@ module Document::Corpus
|
|
34
34
|
def [](*args)
|
35
35
|
docid, *rest = args
|
36
36
|
|
37
|
-
res = self.
|
37
|
+
res = self.with_read do
|
38
38
|
super(*args)
|
39
39
|
end
|
40
40
|
|
@@ -44,7 +44,7 @@ module Document::Corpus
|
|
44
44
|
namespace, id, type = docid.split(":")
|
45
45
|
|
46
46
|
if res.nil?
|
47
|
-
if Document::Corpus.claims.include?(namespace.to_s)
|
47
|
+
if Document::Corpus.claims && Document::Corpus.claims.include?(namespace.to_s)
|
48
48
|
res = self.instance_exec(id, type, &Document::Corpus.claims[namespace.to_s])
|
49
49
|
end
|
50
50
|
end
|
data/lib/rbbt/document.rb
CHANGED
@@ -22,9 +22,7 @@ module DocID
|
|
22
22
|
if Array === self
|
23
23
|
namespace, id, type = nil, nil, nil
|
24
24
|
docs = self.collect do |docid|
|
25
|
-
|
26
|
-
namespace, id, type = docid.split(":")
|
27
|
-
text
|
25
|
+
self.corpus[docid]
|
28
26
|
end
|
29
27
|
Document.setup(docs, :corpus => corpus)
|
30
28
|
else
|
data/lib/rbbt/ner/oscar3.rb
CHANGED
data/lib/rbbt/ner/oscar4.rb
CHANGED
@@ -172,6 +172,7 @@ class Tokenizer
|
|
172
172
|
|
173
173
|
#{{{ Token Types
|
174
174
|
GREEK_RE = "(?:" + $greek.keys.select{|w| w.length > 3}.collect{|w| w.downcase}.join("|") + ")"
|
175
|
+
GREEK_LETTER_RE = "(?:" + $inverse_greek.keys.select{|w| w.length == 1}.collect{|w| w.upcase}.join("|") + ")"
|
175
176
|
def tokenize(word)
|
176
177
|
return word.
|
177
178
|
gsub(/([^IVX])I$/,'\1|I|'). # Separate last roman number
|
@@ -180,6 +181,7 @@ class Tokenizer
|
|
180
181
|
gsub(/([A-Z]{2,})([a-z])/,'\1-\2').
|
181
182
|
gsub(/^(#{GREEK_RE})/,'\1-').
|
182
183
|
gsub(/(#{GREEK_RE})$/,'-\1').
|
184
|
+
gsub(/(#{GREEK_LETTER_RE})$/,'-\1').
|
183
185
|
split( /[^\w.]+/). # Split by separator char
|
184
186
|
select{|t| !t.empty? }
|
185
187
|
end
|
@@ -204,7 +206,7 @@ class Tokenizer
|
|
204
206
|
end
|
205
207
|
|
206
208
|
#{{{ Comparisons
|
207
|
-
|
209
|
+
|
208
210
|
def evaluate_tokens(list1, list2)
|
209
211
|
@operations.inject(0){|acc, o|
|
210
212
|
acc + o.eval(list1, list2)
|
data/lib/rbbt/ner/rnorm.rb
CHANGED
@@ -18,6 +18,10 @@ class Normalizer
|
|
18
18
|
values.select{|p| p[1] == best}
|
19
19
|
end
|
20
20
|
|
21
|
+
def token_evaluate(mention, name)
|
22
|
+
@tokens.evaluate(mention, name)
|
23
|
+
end
|
24
|
+
|
21
25
|
# Compares the tokens and gives each candidate a score based on the
|
22
26
|
# commonalities and differences amongst the tokens.
|
23
27
|
def token_score(code, mention)
|
@@ -31,7 +35,7 @@ class Normalizer
|
|
31
35
|
when mention.downcase.gsub(/\s/,'') == name.downcase.gsub(/\s/,'')
|
32
36
|
80
|
33
37
|
else
|
34
|
-
|
38
|
+
token_evaluate(mention, name)
|
35
39
|
end
|
36
40
|
[value, name]
|
37
41
|
}.sort_by{|value, name| value }.last
|
@@ -70,7 +70,15 @@ module Transformed
|
|
70
70
|
orig_length = self.length
|
71
71
|
|
72
72
|
offset = self.respond_to?(:offset) ? self.offset.to_i : 0
|
73
|
-
|
73
|
+
|
74
|
+
segments = segments.select do |s|
|
75
|
+
shift = shift s.range
|
76
|
+
s_offset = s.offset.to_i
|
77
|
+
s_offset += shift.first if shift
|
78
|
+
|
79
|
+
s_offset >= offset &&
|
80
|
+
s_offset <= offset + self.length - 1
|
81
|
+
end
|
74
82
|
|
75
83
|
Segment.clean_sort(segments).each do |segment|
|
76
84
|
next if segment.offset.nil?
|
@@ -1,6 +1,7 @@
|
|
1
1
|
#!/bin/bash
|
2
2
|
|
3
3
|
name='OpenNLP'
|
4
|
-
url="http://apache.rediris.es/opennlp/opennlp-1.9.
|
4
|
+
url="http://apache.rediris.es/opennlp/opennlp-1.9.4/apache-opennlp-1.9.4-bin.tar.gz"
|
5
5
|
|
6
|
-
|
6
|
+
install_src $name $url
|
7
|
+
(cd $OPT_DIR/jars; ln -s $OPT_DIR/$name/lib/*.jar .)
|
@@ -0,0 +1,11 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/ner/rnorm'
|
3
|
+
|
4
|
+
class TestRNorm < Test::Unit::TestCase
|
5
|
+
def test_evaluate
|
6
|
+
t = Tokenizer.new
|
7
|
+
assert t.evaluate("PDGFRA","PDGFRalpha") > 0
|
8
|
+
iii t.evaluate("JUNB","JunB")
|
9
|
+
end
|
10
|
+
end
|
11
|
+
|
data/test/rbbt/ner/test_rnorm.rb
CHANGED
@@ -393,43 +393,26 @@ This is another sentence. Among the nonstructural proteins, the leader protein (
|
|
393
393
|
end
|
394
394
|
end
|
395
395
|
|
396
|
-
def
|
397
|
-
a = "
|
396
|
+
def test_transform_sorter_end
|
397
|
+
a = "The transcription factors farnesoid X receptor, small heterodimer partner, liver receptor homolog-1, and liver X receptor comprise the signaling cascade network that regulates the expression and secretion of apoM."
|
398
398
|
original = a.dup
|
399
399
|
|
400
|
-
gene1 = "
|
400
|
+
gene1 = "liver receptor homolog-1"
|
401
401
|
gene1.extend Segment
|
402
402
|
gene1.offset = a.index gene1
|
403
403
|
|
404
|
-
gene2 = "
|
404
|
+
gene2 = "apoM"
|
405
405
|
gene2.extend Segment
|
406
406
|
gene2.offset = a.index gene2
|
407
407
|
|
408
408
|
assert_equal gene1, a[gene1.range]
|
409
409
|
assert_equal gene2, a[gene2.range]
|
410
410
|
|
411
|
-
|
412
|
-
|
413
|
-
|
414
|
-
|
415
|
-
|
416
|
-
assert_equal c, Transformed.transform(a,[gene1], "GN")
|
417
|
-
|
418
|
-
iii a.transformation_offset_differences
|
419
|
-
raise
|
420
|
-
assert_equal gene2.offset, a.transformation_offset_differences.first.first.first
|
421
|
-
assert_equal gene1.offset, a.transformation_offset_differences.last.first.first
|
422
|
-
|
423
|
-
|
424
|
-
gene3 = "GN gene"
|
425
|
-
gene3.extend Segment
|
426
|
-
gene3.offset = a.index gene3
|
427
|
-
|
428
|
-
assert_equal gene3, a[gene3.range]
|
429
|
-
|
430
|
-
a.restore([gene3])
|
431
|
-
assert_equal original, a
|
432
|
-
assert_equal "TP53 gene", a[gene3.range]
|
411
|
+
Transformed.with_transform(a, [gene1], "[TF]") do
|
412
|
+
Transformed.with_transform(a, [gene2], "[TG]") do
|
413
|
+
assert_equal "The transcription factors farnesoid X receptor, small heterodimer partner, [TF], and liver X receptor comprise the signaling cascade network that regulates the expression and secretion of [TG].", a
|
414
|
+
end
|
415
|
+
end
|
433
416
|
|
434
417
|
end
|
435
418
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt-text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.
|
4
|
+
version: 1.3.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Miguel Vazquez
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-06-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rbbt-util
|
@@ -38,20 +38,6 @@ dependencies:
|
|
38
38
|
- - ">="
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0'
|
41
|
-
- !ruby/object:Gem::Dependency
|
42
|
-
name: libxml-ruby
|
43
|
-
requirement: !ruby/object:Gem::Requirement
|
44
|
-
requirements:
|
45
|
-
- - ">="
|
46
|
-
- !ruby/object:Gem::Version
|
47
|
-
version: '0'
|
48
|
-
type: :runtime
|
49
|
-
prerelease: false
|
50
|
-
version_requirements: !ruby/object:Gem::Requirement
|
51
|
-
requirements:
|
52
|
-
- - ">="
|
53
|
-
- !ruby/object:Gem::Version
|
54
|
-
version: '0'
|
55
41
|
- !ruby/object:Gem::Dependency
|
56
42
|
name: json
|
57
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -72,8 +58,10 @@ email: miguel.vazquez@fdi.ucm.es
|
|
72
58
|
executables:
|
73
59
|
- get_ppis.rb
|
74
60
|
extensions: []
|
75
|
-
extra_rdoc_files:
|
61
|
+
extra_rdoc_files:
|
62
|
+
- LICENSE
|
76
63
|
files:
|
64
|
+
- LICENSE
|
77
65
|
- bin/get_ppis.rb
|
78
66
|
- lib/rbbt/bow/bow.rb
|
79
67
|
- lib/rbbt/bow/dictionary.rb
|
@@ -139,6 +127,7 @@ files:
|
|
139
127
|
- test/rbbt/document/test_annotation.rb
|
140
128
|
- test/rbbt/document/test_corpus.rb
|
141
129
|
- test/rbbt/entity/test_document.rb
|
130
|
+
- test/rbbt/ner/rnorm/test_tokens.rb
|
142
131
|
- test/rbbt/ner/test_NER.rb
|
143
132
|
- test/rbbt/ner/test_abner.rb
|
144
133
|
- test/rbbt/ner/test_banner.rb
|
@@ -205,6 +194,7 @@ test_files:
|
|
205
194
|
- test/rbbt/ner/test_patterns.rb
|
206
195
|
- test/rbbt/ner/test_NER.rb
|
207
196
|
- test/rbbt/ner/test_abner.rb
|
197
|
+
- test/rbbt/ner/rnorm/test_tokens.rb
|
208
198
|
- test/rbbt/ner/test_rnorm.rb
|
209
199
|
- test/rbbt/ner/test_regexpNER.rb
|
210
200
|
- test/rbbt/ner/test_ngram_prefix_dictionary.rb
|