rbbt-text 0.5.0 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/rbbt/corpus/corpus.rb +15 -6
- data/lib/rbbt/corpus/document.rb +100 -127
- data/lib/rbbt/corpus/document_repo.rb +72 -51
- data/lib/rbbt/ner/NER.rb +4 -4
- data/lib/rbbt/ner/abner.rb +5 -4
- data/lib/rbbt/ner/banner.rb +3 -3
- data/lib/rbbt/ner/chemical_tagger.rb +3 -3
- data/lib/rbbt/ner/ngram_prefix_dictionary.rb +45 -10
- data/lib/rbbt/ner/oscar3.rb +3 -3
- data/lib/rbbt/ner/oscar4.rb +3 -3
- data/lib/rbbt/ner/patterns.rb +15 -13
- data/lib/rbbt/ner/regexpNER.rb +3 -2
- data/lib/rbbt/ner/rnorm.rb +2 -2
- data/lib/rbbt/ner/rnorm/cue_index.rb +2 -2
- data/lib/rbbt/ner/{annotations.rb → segment.rb} +161 -109
- data/lib/rbbt/ner/{annotations → segment}/named_entity.rb +3 -11
- data/lib/rbbt/ner/segment/relationship.rb +20 -0
- data/lib/rbbt/ner/segment/segmented.rb +13 -0
- data/lib/rbbt/ner/segment/token.rb +24 -0
- data/lib/rbbt/ner/{annotations → segment}/transformed.rb +10 -10
- data/lib/rbbt/ner/token_trieNER.rb +30 -22
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +2 -1
- data/lib/rbbt/nlp/nlp.rb +23 -37
- data/test/rbbt/corpus/test_document.rb +39 -37
- data/test/rbbt/ner/segment/test_named_entity.rb +29 -0
- data/test/rbbt/ner/segment/test_segmented.rb +23 -0
- data/test/rbbt/ner/{annotations → segment}/test_transformed.rb +6 -6
- data/test/rbbt/ner/test_ngram_prefix_dictionary.rb +15 -1
- data/test/rbbt/ner/test_patterns.rb +11 -12
- data/test/rbbt/ner/test_regexpNER.rb +5 -4
- data/test/rbbt/ner/test_segment.rb +101 -0
- data/test/rbbt/ner/test_token_trieNER.rb +8 -9
- data/test/test_helper.rb +6 -6
- metadata +40 -22
- data/lib/rbbt/ner/annotations/annotated.rb +0 -15
- data/lib/rbbt/ner/annotations/relations.rb +0 -25
- data/lib/rbbt/ner/annotations/token.rb +0 -28
- data/test/rbbt/ner/annotations/test_named_entity.rb +0 -14
- data/test/rbbt/ner/test_annotations.rb +0 -70
@@ -1,15 +0,0 @@
|
|
1
|
-
require 'rbbt/ner/annotations'
|
2
|
-
# Mixin that lets a String carry a list of annotations next to its text.
module Annotated
  # The annotations attached to this string (set by Annotated.annotate).
  attr_accessor :annotations

  # Extends +string+ with Annotated and stores +annotations+ on it.
  #
  # string      - the String to annotate; it is modified in place.
  # annotations - the annotations to attach; an empty Array is used when
  #               nil (or nothing) is given.
  #
  # Returns the same +string+ object.
  def self.annotate(string, annotations = nil)
    string.extend Annotated
    string.annotations = annotations || []
    string
  end

  # Hands this string, its annotations and +skip_segments+ to Segment.split
  # and returns whatever that call returns.
  #
  # A string that was mixed in with a plain `extend Annotated` has no
  # annotations assigned yet; it is treated like one annotated with an empty
  # list, matching what Annotated.annotate does for a nil argument.
  def split_segments(skip_segments = false)
    Segment.split(self, @annotations || [], skip_segments)
  end
end
|
14
|
-
|
15
|
-
|
@@ -1,25 +0,0 @@
|
|
1
|
-
require 'rbbt/ner/annotations'
|
2
|
-
|
3
|
-
# Segment mixin for strings that represent a relationship between terms.
module Relationship
  attr_accessor :terms, :segment_types
  include Segment

  # Extends +string+ with Relationship and fills in the optional fields.
  #
  # string - the String to annotate; it is modified in place.
  # offset - assigned through +offset=+ when not nil (the writer is not
  #          defined in this module; presumably it comes from Segment).
  # terms  - assigned to +terms+ when not nil.
  #
  # Returns the same +string+ object.
  def self.annotate(string, offset = nil, terms = nil)
    # The string must be extended with this module; otherwise the +terms=+
    # writer used below and the #html helper are not available on it.
    string.extend Relationship
    string.offset = offset unless offset.nil?
    string.terms = terms unless terms.nil?
    string
  end

  # Returns the text wrapped in a <span> element of class 'Relationship'.
  def html
    "<span class='Relationship'>#{self}</span>"
  end

  # Looks up the entries of +annotations+ for each of the given +types+ and
  # iterates over them.
  #
  # NOTE(review): the iteration body is empty, so no entity markup is
  # produced; the method just returns the Array from +values_at+. The
  # +annotations+ reader is not defined in this module - confirm where it
  # is expected to come from.
  def html_with_entities(*types)
    annotations.values_at(*types).each do |segments|
    end
  end
end
|
@@ -1,28 +0,0 @@
|
|
1
|
-
require 'rbbt/ner/annotations'
|
2
|
-
|
3
|
-
# Segment mixin for strings that are tokens cut out of a longer text.
module Token
  include Segment
  # The text the token was created from (see Token.annotate).
  attr_accessor :original

  # Extends +string+ with Token and fills in its fields.
  #
  # string   - the String to annotate; it is modified in place.
  # offset   - assigned through +offset=+ when not nil (the writer is not
  #            defined in this module; presumably it comes from Segment).
  # original - stored in +original+; a copy of +string+ is stored when nil.
  #
  # Returns the same +string+ object.
  def self.annotate(string, offset = nil, original = nil)
    string.extend Token
    string.offset = offset unless offset.nil?
    string.original = original || string.dup
    string
  end

  # Splits +text+ into Token-annotated strings.
  #
  # text     - the String to split.
  # split_at - Regexp marking the separators. The text matched by its first
  #            capture group, when that group matched something non-empty,
  #            is emitted as a token of its own; any other matched text is
  #            dropped. With the default pattern, whitespace is dropped and
  #            the listed punctuation characters become tokens.
  # start    - value added to every token offset (the position of +text+
  #            inside a larger document).
  #
  # Returns an Array of tokens in text order, each annotated with its offset.
  # Raises ArgumentError when +split_at+ matches an empty string at the very
  # start of the remaining text, because no progress could be made.
  def self.tokenize(text, split_at = /\s|(\(|\)|[-."':,])/, start = 0)
    tokens = []

    while (matchdata = text.match(split_at))
      # A zero-width match at position 0 consumes nothing; without this
      # guard the loop would spin forever on the same text.
      if matchdata.end(0).zero?
        raise ArgumentError, "split_at must not match an empty string: #{split_at.inspect}"
      end

      leading = matchdata.pre_match
      tokens << Token.annotate(leading, start) unless leading.empty?

      # Only the first group is ever emitted. It is nil when the pattern has
      # no groups or when the match came from another alternative.
      separator = matchdata.captures.first
      unless separator.nil? || separator.empty?
        tokens << Token.annotate(separator, start + matchdata.begin(1))
      end

      start += matchdata.end(0)
      text = matchdata.post_match
    end

    # Whatever follows the last separator is the final token.
    tokens << Token.annotate(text, start) unless text.empty?

    tokens
  end
end
|
28
|
-
|
@@ -1,14 +0,0 @@
|
|
1
|
-
require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
|
2
|
-
require 'rbbt/ner/annotations'
|
3
|
-
require 'rbbt/ner/annotations/named_entity'
|
4
|
-
require 'rbbt/ner/annotations/transformed'
|
5
|
-
|
6
|
-
# Checks that NamedEntity#info lists "offset" only after an offset is set.
class TestClass < Test::Unit::TestCase
  def test_info
    entity = "test"
    entity.extend NamedEntity

    # No offset assigned yet, so the key must be absent.
    assert(!entity.info.keys.include?("offset"))

    entity.offset = 10
    assert(entity.info.keys.include?("offset"))
  end
end
|
@@ -1,70 +0,0 @@
|
|
1
|
-
require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
|
2
|
-
require 'rbbt/ner/annotations'
|
3
|
-
require 'rbbt/ner/annotations/named_entity'
|
4
|
-
require 'rbbt/ner/annotations/transformed'
|
5
|
-
|
6
|
-
# Tests for NamedEntity info/segment types and for Segment.align,
# Segment.sort and Segment.clean_sort.
class TestClass < Test::Unit::TestCase
  # "type" must show up among the info keys once a type has been assigned.
  def test_info
    entity = "test"
    entity.extend NamedEntity
    entity.type = "type"

    assert(entity.info.keys.include?("type"))
  end

  # A string extended with NamedEntity reports "NamedEntity" as a segment type.
  def test_segment_type
    entity = "test"
    entity.extend NamedEntity

    assert(entity.segment_types.include?("NamedEntity"))
  end

  # After Segment.align, the "rhabdoid" part carries its position in the text.
  def test_align
    text =<<-EOF
Atypical teratoid/rhabdoid tumors (AT/RTs) are highly aggressive brain tumors of early childhood poorly responding to therapy.
    EOF

    parts = text.split(/\W/)
    Segment.align(text, parts)

    rhabdoid = parts.find { |part| part == "rhabdoid" }
    assert_equal "Atypical teratoid/".length, rhabdoid.offset
  end

  # Segment.sort returns the two gene mentions in text order.
  def test_sort
    sentence = "This sentence mentions the TP53 gene and the CDK5R1 protein"

    gene1 = gene_mention(sentence, "TP53")
    gene2 = gene_mention(sentence, "CDK5R1")

    assert_equal [gene1, gene2], Segment.sort([gene2, gene1])
  end

  # Segment.clean_sort is expected to keep "TP53 gene" and "CDK5R1" and to
  # leave out the "TP53" mention.
  def test_clean_sort
    sentence = "This sentence mentions the TP53 gene and the CDK5R1 protein"

    gene1 = gene_mention(sentence, "TP53")
    gene2 = gene_mention(sentence, "CDK5R1")
    gene3 = gene_mention(sentence, "TP53 gene")

    assert_equal [gene3, gene2], Segment.clean_sort([gene2, gene1, gene3])
  end

  private

  # Turns +mention+ into a NamedEntity of type "Gene" whose offset is the
  # position of its first occurrence in +sentence+. Returns +mention+.
  def gene_mention(sentence, mention)
    mention.extend NamedEntity
    mention.offset = sentence.index(mention)
    mention.type = "Gene"
    mention
  end
end
|
70
|
-
|