rbbt-text 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. data/lib/rbbt/corpus/corpus.rb +15 -6
  2. data/lib/rbbt/corpus/document.rb +100 -127
  3. data/lib/rbbt/corpus/document_repo.rb +72 -51
  4. data/lib/rbbt/ner/NER.rb +4 -4
  5. data/lib/rbbt/ner/abner.rb +5 -4
  6. data/lib/rbbt/ner/banner.rb +3 -3
  7. data/lib/rbbt/ner/chemical_tagger.rb +3 -3
  8. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +45 -10
  9. data/lib/rbbt/ner/oscar3.rb +3 -3
  10. data/lib/rbbt/ner/oscar4.rb +3 -3
  11. data/lib/rbbt/ner/patterns.rb +15 -13
  12. data/lib/rbbt/ner/regexpNER.rb +3 -2
  13. data/lib/rbbt/ner/rnorm.rb +2 -2
  14. data/lib/rbbt/ner/rnorm/cue_index.rb +2 -2
  15. data/lib/rbbt/ner/{annotations.rb → segment.rb} +161 -109
  16. data/lib/rbbt/ner/{annotations → segment}/named_entity.rb +3 -11
  17. data/lib/rbbt/ner/segment/relationship.rb +20 -0
  18. data/lib/rbbt/ner/segment/segmented.rb +13 -0
  19. data/lib/rbbt/ner/segment/token.rb +24 -0
  20. data/lib/rbbt/ner/{annotations → segment}/transformed.rb +10 -10
  21. data/lib/rbbt/ner/token_trieNER.rb +30 -22
  22. data/lib/rbbt/nlp/genia/sentence_splitter.rb +2 -1
  23. data/lib/rbbt/nlp/nlp.rb +23 -37
  24. data/test/rbbt/corpus/test_document.rb +39 -37
  25. data/test/rbbt/ner/segment/test_named_entity.rb +29 -0
  26. data/test/rbbt/ner/segment/test_segmented.rb +23 -0
  27. data/test/rbbt/ner/{annotations → segment}/test_transformed.rb +6 -6
  28. data/test/rbbt/ner/test_ngram_prefix_dictionary.rb +15 -1
  29. data/test/rbbt/ner/test_patterns.rb +11 -12
  30. data/test/rbbt/ner/test_regexpNER.rb +5 -4
  31. data/test/rbbt/ner/test_segment.rb +101 -0
  32. data/test/rbbt/ner/test_token_trieNER.rb +8 -9
  33. data/test/test_helper.rb +6 -6
  34. metadata +40 -22
  35. data/lib/rbbt/ner/annotations/annotated.rb +0 -15
  36. data/lib/rbbt/ner/annotations/relations.rb +0 -25
  37. data/lib/rbbt/ner/annotations/token.rb +0 -28
  38. data/test/rbbt/ner/annotations/test_named_entity.rb +0 -14
  39. data/test/rbbt/ner/test_annotations.rb +0 -70
@@ -1,15 +0,0 @@
1
- require 'rbbt/ner/annotations'
2
- module Annotated
3
- attr_accessor :annotations
4
- def self.annotate(string, annotations = nil)
5
- string.extend Annotated
6
- string.annotations = annotations || []
7
- string
8
- end
9
-
10
- def split_segments(skip_segments = false)
11
- Segment.split(self, @annotations, skip_segments)
12
- end
13
- end
14
-
15
-
@@ -1,25 +0,0 @@
1
- require 'rbbt/ner/annotations'
2
-
3
- module Relationship
4
- attr_accessor :terms, :segment_types
5
- include Segment
6
- def self.annotate(string, offset = nil, terms = nil)
7
- string.extend PPI
8
- string.offset = offset unless offset.nil?
9
- string.terms = terms unless terms.nil?
10
- string
11
- end
12
-
13
- def html
14
- text = <<-EOF
15
- <span class='Relationship'\
16
- >#{ self }</span>
17
- EOF
18
- text.chomp
19
- end
20
-
21
- def html_with_entities(*types)
22
- annotations.values_at(*types).each do |segments|
23
- end
24
- end
25
- end
@@ -1,28 +0,0 @@
1
- require 'rbbt/ner/annotations'
2
-
3
- module Token
4
- include Segment
5
- attr_accessor :original
6
- def self.annotate(string, offset = nil, original = nil)
7
- string.extend Token
8
- string.offset = offset unless offset.nil?
9
- string.original = original || string.dup
10
- string
11
- end
12
-
13
- def self.tokenize(text, split_at = /\s|(\(|\)|[-."':,])/, start = 0)
14
-
15
- tokens = []
16
- while matchdata = text.match(split_at)
17
- tokens << Token.annotate(matchdata.pre_match, start) unless matchdata.pre_match.empty?
18
- tokens << Token.annotate(matchdata.captures.first, start + matchdata.begin(1)) if matchdata.captures.any? and not matchdata.captures.first.empty?
19
- start += matchdata.end(0)
20
- text = matchdata.post_match
21
- end
22
-
23
- tokens << Token.annotate(text, start) unless text.empty?
24
-
25
- tokens
26
- end
27
- end
28
-
@@ -1,14 +0,0 @@
1
- require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
2
- require 'rbbt/ner/annotations'
3
- require 'rbbt/ner/annotations/named_entity'
4
- require 'rbbt/ner/annotations/transformed'
5
-
6
- class TestClass < Test::Unit::TestCase
7
- def test_info
8
- a = "test"
9
- a.extend NamedEntity
10
- assert(! a.info.keys.include?("offset"))
11
- a.offset = 10
12
- assert a.info.keys.include? "offset"
13
- end
14
- end
@@ -1,70 +0,0 @@
1
- require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
2
- require 'rbbt/ner/annotations'
3
- require 'rbbt/ner/annotations/named_entity'
4
- require 'rbbt/ner/annotations/transformed'
5
-
6
- class TestClass < Test::Unit::TestCase
7
- def test_info
8
- a = "test"
9
- a.extend NamedEntity
10
- a.type = "type"
11
- assert a.info.keys.include? "type"
12
- end
13
-
14
- def test_segment_type
15
- a = "test"
16
- a.extend NamedEntity
17
- assert a.segment_types.include? "NamedEntity"
18
- end
19
-
20
- def test_align
21
- text =<<-EOF
22
- Atypical teratoid/rhabdoid tumors (AT/RTs) are highly aggressive brain tumors of early childhood poorly responding to therapy.
23
- EOF
24
-
25
- parts = text.split(/\W/)
26
- Segment.align(text, parts)
27
-
28
- assert_equal "Atypical teratoid/".length, parts.select{|s| s == "rhabdoid"}.first.offset
29
- end
30
-
31
- def test_sort
32
- a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
33
-
34
- gene1 = "TP53"
35
- gene1.extend NamedEntity
36
- gene1.offset = a.index gene1
37
- gene1.type = "Gene"
38
-
39
- gene2 = "CDK5R1"
40
- gene2.extend NamedEntity
41
- gene2.offset = a.index gene2
42
- gene2.type = "Gene"
43
-
44
- assert_equal [gene1,gene2], Segment.sort([gene2,gene1])
45
-
46
- end
47
-
48
- def test_clean_sort
49
- a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
50
-
51
- gene1 = "TP53"
52
- gene1.extend NamedEntity
53
- gene1.offset = a.index gene1
54
- gene1.type = "Gene"
55
-
56
- gene2 = "CDK5R1"
57
- gene2.extend NamedEntity
58
- gene2.offset = a.index gene2
59
- gene2.type = "Gene"
60
-
61
- gene3 = "TP53 gene"
62
- gene3.extend NamedEntity
63
- gene3.offset = a.index gene3
64
- gene3.type = "Gene"
65
-
66
- assert_equal [gene3,gene2], Segment.clean_sort([gene2,gene1,gene3])
67
-
68
- end
69
- end
70
-