rbbt-text 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/rbbt/corpus/corpus.rb +15 -6
- data/lib/rbbt/corpus/document.rb +100 -127
- data/lib/rbbt/corpus/document_repo.rb +72 -51
- data/lib/rbbt/ner/NER.rb +4 -4
- data/lib/rbbt/ner/abner.rb +5 -4
- data/lib/rbbt/ner/banner.rb +3 -3
- data/lib/rbbt/ner/chemical_tagger.rb +3 -3
- data/lib/rbbt/ner/ngram_prefix_dictionary.rb +45 -10
- data/lib/rbbt/ner/oscar3.rb +3 -3
- data/lib/rbbt/ner/oscar4.rb +3 -3
- data/lib/rbbt/ner/patterns.rb +15 -13
- data/lib/rbbt/ner/regexpNER.rb +3 -2
- data/lib/rbbt/ner/rnorm.rb +2 -2
- data/lib/rbbt/ner/rnorm/cue_index.rb +2 -2
- data/lib/rbbt/ner/{annotations.rb → segment.rb} +161 -109
- data/lib/rbbt/ner/{annotations → segment}/named_entity.rb +3 -11
- data/lib/rbbt/ner/segment/relationship.rb +20 -0
- data/lib/rbbt/ner/segment/segmented.rb +13 -0
- data/lib/rbbt/ner/segment/token.rb +24 -0
- data/lib/rbbt/ner/{annotations → segment}/transformed.rb +10 -10
- data/lib/rbbt/ner/token_trieNER.rb +30 -22
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +2 -1
- data/lib/rbbt/nlp/nlp.rb +23 -37
- data/test/rbbt/corpus/test_document.rb +39 -37
- data/test/rbbt/ner/segment/test_named_entity.rb +29 -0
- data/test/rbbt/ner/segment/test_segmented.rb +23 -0
- data/test/rbbt/ner/{annotations → segment}/test_transformed.rb +6 -6
- data/test/rbbt/ner/test_ngram_prefix_dictionary.rb +15 -1
- data/test/rbbt/ner/test_patterns.rb +11 -12
- data/test/rbbt/ner/test_regexpNER.rb +5 -4
- data/test/rbbt/ner/test_segment.rb +101 -0
- data/test/rbbt/ner/test_token_trieNER.rb +8 -9
- data/test/test_helper.rb +6 -6
- metadata +40 -22
- data/lib/rbbt/ner/annotations/annotated.rb +0 -15
- data/lib/rbbt/ner/annotations/relations.rb +0 -25
- data/lib/rbbt/ner/annotations/token.rb +0 -28
- data/test/rbbt/ner/annotations/test_named_entity.rb +0 -14
- data/test/rbbt/ner/test_annotations.rb +0 -70
@@ -1,15 +0,0 @@
|
|
1
|
-
require 'rbbt/ner/annotations'
|
2
|
-
# Mixin that lets a String carry a list of annotations alongside its text.
module Annotated
  attr_accessor :annotations

  # Extends +string+ with Annotated and stores +annotations+ on it; an empty
  # Array is stored when +annotations+ is nil or false.
  #
  # Returns the very same (now extended) string object.
  def self.annotate(string, annotations = nil)
    string.extend(Annotated).tap do |annotated|
      annotated.annotations = annotations || []
    end
  end

  # Delegates to Segment.split, passing this string, its stored annotations
  # and the +skip_segments+ flag through unchanged.
  def split_segments(skip_segments = false)
    Segment.split(self, @annotations, skip_segments)
  end
end
|
14
|
-
|
15
|
-
|
@@ -1,25 +0,0 @@
|
|
1
|
-
require 'rbbt/ner/annotations'
|
2
|
-
|
3
|
-
# Segment mixin for strings that describe a relationship between terms.
module Relationship
  attr_accessor :terms, :segment_types
  include Segment

  # Extends +string+ with Relationship and fills in its attributes.
  #
  # string - the String to extend (modified in place).
  # offset - value assigned through +offset=+; left untouched when nil.
  # terms  - value assigned through +terms=+; left untouched when nil.
  #
  # Returns the same, now extended, string.
  def self.annotate(string, offset = nil, terms = nil)
    # Extend with this module: it is the one that declares +terms=+, which is
    # used a couple of lines below.
    string.extend Relationship
    string.offset = offset unless offset.nil?
    string.terms = terms unless terms.nil?
    string
  end

  # Returns the text wrapped in a <span> tag whose class is 'Relationship'.
  def html
    "<span class='Relationship'>#{ self }</span>"
  end

  # Looks up the annotation lists for each of the given +types+.
  #
  # NOTE(review): the loop body is empty, so nothing is rendered yet; the call
  # only returns what +annotations.values_at+ produced. +annotations+ is not
  # declared in this module -- presumably it comes from another mixin; confirm.
  def html_with_entities(*types)
    annotations.values_at(*types).each do |segments|
    end
  end
end
|
@@ -1,28 +0,0 @@
|
|
1
|
-
require 'rbbt/ner/annotations'
|
2
|
-
|
3
|
-
# Segment mixin for the individual tokens obtained by splitting a text.
module Token
  include Segment
  attr_accessor :original

  # Extends +string+ with Token and fills in its attributes.
  #
  # string   - the String to extend (modified in place).
  # offset   - value assigned through +offset=+; left untouched when nil.
  # original - stored as +original+; defaults to a copy of +string+.
  #
  # Returns the same, now extended, string.
  def self.annotate(string, offset = nil, original = nil)
    string.extend Token
    string.offset = offset unless offset.nil?
    string.original = original || string.dup
    string
  end

  # Splits +text+ into Token-annotated strings.
  #
  # text     - the String to tokenize.
  # split_at - Regexp marking the split points. Whatever its first capture
  #            group matches is kept as a token of its own; the rest of the
  #            match is dropped.
  # start    - offset assigned to the first character of +text+.
  #
  # Returns an Array of tokens, each annotated with its offset.
  # Raises ArgumentError when +split_at+ matches an empty string at the very
  # start of the remaining text, since no input would ever be consumed.
  def self.tokenize(text, split_at = /\s|(\(|\)|[-."':,])/, start = 0)
    tokens = []

    while (matchdata = text.match(split_at))
      # A match ending at position 0 leaves the text unchanged, so the loop
      # could never finish; fail loudly instead of hanging.
      if matchdata.end(0).zero?
        raise ArgumentError, "split_at must not match an empty string at the start of the text"
      end

      leading = matchdata.pre_match
      tokens << Token.annotate(leading, start) unless leading.empty?

      # The first group is nil when the pattern has no groups or when that
      # group did not take part in the match (e.g. plain whitespace).
      separator = matchdata.captures.first
      unless separator.nil? || separator.empty?
        tokens << Token.annotate(separator, start + matchdata.begin(1))
      end

      start += matchdata.end(0)
      text = matchdata.post_match
    end

    tokens << Token.annotate(text, start) unless text.empty?

    tokens
  end
end
|
28
|
-
|
@@ -1,14 +0,0 @@
|
|
1
|
-
require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
|
2
|
-
require 'rbbt/ner/annotations'
|
3
|
-
require 'rbbt/ner/annotations/named_entity'
|
4
|
-
require 'rbbt/ner/annotations/transformed'
|
5
|
-
|
6
|
-
# Checks which keys NamedEntity#info reports for an extended string.
class TestClass < Test::Unit::TestCase
  # "offset" is expected among the info keys only after an offset is assigned.
  def test_info
    entity = "test"
    entity.extend NamedEntity

    keys_before = entity.info.keys
    assert(! keys_before.include?("offset"))

    entity.offset = 10

    keys_after = entity.info.keys
    assert keys_after.include?("offset")
  end
end
|
@@ -1,70 +0,0 @@
|
|
1
|
-
require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
|
2
|
-
require 'rbbt/ner/annotations'
|
3
|
-
require 'rbbt/ner/annotations/named_entity'
|
4
|
-
require 'rbbt/ner/annotations/transformed'
|
5
|
-
|
6
|
-
# Exercises NamedEntity info/segment types and the Segment align/sort helpers.
class TestClass < Test::Unit::TestCase
  # Assigning a type makes "type" show up among the info keys.
  def test_info
    entity = "test"
    entity.extend NamedEntity
    entity.type = "type"

    assert entity.info.keys.include?("type")
  end

  # A string extended with NamedEntity lists "NamedEntity" in its segment types.
  def test_segment_type
    entity = "test"
    entity.extend NamedEntity

    assert entity.segment_types.include?("NamedEntity")
  end

  # After Segment.align, each part carries its offset within the text.
  def test_align
    text = "Atypical teratoid/rhabdoid tumors (AT/RTs) are highly aggressive brain tumors of early childhood poorly responding to therapy.\n"

    parts = text.split(/\W/)
    Segment.align(text, parts)

    rhabdoid = parts.find { |part| part == "rhabdoid" }
    assert_equal "Atypical teratoid/".length, rhabdoid.offset
  end

  # Segment.sort is expected to return the two genes with TP53 first.
  def test_sort
    sentence = "This sentence mentions the TP53 gene and the CDK5R1 protein"

    gene1 = gene_in(sentence, "TP53")
    gene2 = gene_in(sentence, "CDK5R1")

    assert_equal [gene1,gene2], Segment.sort([gene2,gene1])
  end

  # Segment.clean_sort is expected to return "TP53 gene" and "CDK5R1",
  # leaving out the plain "TP53" mention.
  def test_clean_sort
    sentence = "This sentence mentions the TP53 gene and the CDK5R1 protein"

    gene1 = gene_in(sentence, "TP53")
    gene2 = gene_in(sentence, "CDK5R1")
    gene3 = gene_in(sentence, "TP53 gene")

    assert_equal [gene3,gene2], Segment.clean_sort([gene2,gene1,gene3])
  end

  private

  # Turns +mention+ into a NamedEntity of type "Gene" whose offset is its
  # position inside +sentence+, and returns it.
  def gene_in(sentence, mention)
    mention.extend NamedEntity
    mention.offset = sentence.index mention
    mention.type = "Gene"
    mention
  end
end
|
70
|
-
|