rbbt-text 0.5.0 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (39) hide show
  1. data/lib/rbbt/corpus/corpus.rb +15 -6
  2. data/lib/rbbt/corpus/document.rb +100 -127
  3. data/lib/rbbt/corpus/document_repo.rb +72 -51
  4. data/lib/rbbt/ner/NER.rb +4 -4
  5. data/lib/rbbt/ner/abner.rb +5 -4
  6. data/lib/rbbt/ner/banner.rb +3 -3
  7. data/lib/rbbt/ner/chemical_tagger.rb +3 -3
  8. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +45 -10
  9. data/lib/rbbt/ner/oscar3.rb +3 -3
  10. data/lib/rbbt/ner/oscar4.rb +3 -3
  11. data/lib/rbbt/ner/patterns.rb +15 -13
  12. data/lib/rbbt/ner/regexpNER.rb +3 -2
  13. data/lib/rbbt/ner/rnorm.rb +2 -2
  14. data/lib/rbbt/ner/rnorm/cue_index.rb +2 -2
  15. data/lib/rbbt/ner/{annotations.rb → segment.rb} +161 -109
  16. data/lib/rbbt/ner/{annotations → segment}/named_entity.rb +3 -11
  17. data/lib/rbbt/ner/segment/relationship.rb +20 -0
  18. data/lib/rbbt/ner/segment/segmented.rb +13 -0
  19. data/lib/rbbt/ner/segment/token.rb +24 -0
  20. data/lib/rbbt/ner/{annotations → segment}/transformed.rb +10 -10
  21. data/lib/rbbt/ner/token_trieNER.rb +30 -22
  22. data/lib/rbbt/nlp/genia/sentence_splitter.rb +2 -1
  23. data/lib/rbbt/nlp/nlp.rb +23 -37
  24. data/test/rbbt/corpus/test_document.rb +39 -37
  25. data/test/rbbt/ner/segment/test_named_entity.rb +29 -0
  26. data/test/rbbt/ner/segment/test_segmented.rb +23 -0
  27. data/test/rbbt/ner/{annotations → segment}/test_transformed.rb +6 -6
  28. data/test/rbbt/ner/test_ngram_prefix_dictionary.rb +15 -1
  29. data/test/rbbt/ner/test_patterns.rb +11 -12
  30. data/test/rbbt/ner/test_regexpNER.rb +5 -4
  31. data/test/rbbt/ner/test_segment.rb +101 -0
  32. data/test/rbbt/ner/test_token_trieNER.rb +8 -9
  33. data/test/test_helper.rb +6 -6
  34. metadata +40 -22
  35. data/lib/rbbt/ner/annotations/annotated.rb +0 -15
  36. data/lib/rbbt/ner/annotations/relations.rb +0 -25
  37. data/lib/rbbt/ner/annotations/token.rb +0 -28
  38. data/test/rbbt/ner/annotations/test_named_entity.rb +0 -14
  39. data/test/rbbt/ner/test_annotations.rb +0 -70
@@ -1,15 +0,0 @@
1
- require 'rbbt/ner/annotations'
2
- module Annotated
3
- attr_accessor :annotations
4
- def self.annotate(string, annotations = nil)
5
- string.extend Annotated
6
- string.annotations = annotations || []
7
- string
8
- end
9
-
10
- def split_segments(skip_segments = false)
11
- Segment.split(self, @annotations, skip_segments)
12
- end
13
- end
14
-
15
-
@@ -1,25 +0,0 @@
1
- require 'rbbt/ner/annotations'
2
-
3
- module Relationship
4
- attr_accessor :terms, :segment_types
5
- include Segment
6
- def self.annotate(string, offset = nil, terms = nil)
7
- string.extend PPI
8
- string.offset = offset unless offset.nil?
9
- string.terms = terms unless terms.nil?
10
- string
11
- end
12
-
13
- def html
14
- text = <<-EOF
15
- <span class='Relationship'\
16
- >#{ self }</span>
17
- EOF
18
- text.chomp
19
- end
20
-
21
- def html_with_entities(*types)
22
- annotations.values_at(*types).each do |segments|
23
- end
24
- end
25
- end
@@ -1,28 +0,0 @@
1
- require 'rbbt/ner/annotations'
2
-
3
- module Token
4
- include Segment
5
- attr_accessor :original
6
- def self.annotate(string, offset = nil, original = nil)
7
- string.extend Token
8
- string.offset = offset unless offset.nil?
9
- string.original = original || string.dup
10
- string
11
- end
12
-
13
- def self.tokenize(text, split_at = /\s|(\(|\)|[-."':,])/, start = 0)
14
-
15
- tokens = []
16
- while matchdata = text.match(split_at)
17
- tokens << Token.annotate(matchdata.pre_match, start) unless matchdata.pre_match.empty?
18
- tokens << Token.annotate(matchdata.captures.first, start + matchdata.begin(1)) if matchdata.captures.any? and not matchdata.captures.first.empty?
19
- start += matchdata.end(0)
20
- text = matchdata.post_match
21
- end
22
-
23
- tokens << Token.annotate(text, start) unless text.empty?
24
-
25
- tokens
26
- end
27
- end
28
-
@@ -1,14 +0,0 @@
1
- require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
2
- require 'rbbt/ner/annotations'
3
- require 'rbbt/ner/annotations/named_entity'
4
- require 'rbbt/ner/annotations/transformed'
5
-
6
- class TestClass < Test::Unit::TestCase
7
- def test_info
8
- a = "test"
9
- a.extend NamedEntity
10
- assert(! a.info.keys.include?("offset"))
11
- a.offset = 10
12
- assert a.info.keys.include? "offset"
13
- end
14
- end
@@ -1,70 +0,0 @@
1
- require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
2
- require 'rbbt/ner/annotations'
3
- require 'rbbt/ner/annotations/named_entity'
4
- require 'rbbt/ner/annotations/transformed'
5
-
6
- class TestClass < Test::Unit::TestCase
7
- def test_info
8
- a = "test"
9
- a.extend NamedEntity
10
- a.type = "type"
11
- assert a.info.keys.include? "type"
12
- end
13
-
14
- def test_segment_type
15
- a = "test"
16
- a.extend NamedEntity
17
- assert a.segment_types.include? "NamedEntity"
18
- end
19
-
20
- def test_align
21
- text =<<-EOF
22
- Atypical teratoid/rhabdoid tumors (AT/RTs) are highly aggressive brain tumors of early childhood poorly responding to therapy.
23
- EOF
24
-
25
- parts = text.split(/\W/)
26
- Segment.align(text, parts)
27
-
28
- assert_equal "Atypical teratoid/".length, parts.select{|s| s == "rhabdoid"}.first.offset
29
- end
30
-
31
- def test_sort
32
- a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
33
-
34
- gene1 = "TP53"
35
- gene1.extend NamedEntity
36
- gene1.offset = a.index gene1
37
- gene1.type = "Gene"
38
-
39
- gene2 = "CDK5R1"
40
- gene2.extend NamedEntity
41
- gene2.offset = a.index gene2
42
- gene2.type = "Gene"
43
-
44
- assert_equal [gene1,gene2], Segment.sort([gene2,gene1])
45
-
46
- end
47
-
48
- def test_clean_sort
49
- a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
50
-
51
- gene1 = "TP53"
52
- gene1.extend NamedEntity
53
- gene1.offset = a.index gene1
54
- gene1.type = "Gene"
55
-
56
- gene2 = "CDK5R1"
57
- gene2.extend NamedEntity
58
- gene2.offset = a.index gene2
59
- gene2.type = "Gene"
60
-
61
- gene3 = "TP53 gene"
62
- gene3.extend NamedEntity
63
- gene3.offset = a.index gene3
64
- gene3.type = "Gene"
65
-
66
- assert_equal [gene3,gene2], Segment.clean_sort([gene2,gene1,gene3])
67
-
68
- end
69
- end
70
-