rbbt-text 1.3.0 → 1.3.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (40) hide show
  1. checksums.yaml +4 -4
  2. data/lib/rbbt/bow/bow.rb +5 -2
  3. data/lib/rbbt/bow/dictionary.rb +27 -23
  4. data/lib/rbbt/document.rb +20 -5
  5. data/lib/rbbt/document/annotation.rb +7 -4
  6. data/lib/rbbt/document/corpus.rb +30 -3
  7. data/lib/rbbt/document/corpus/pubmed.rb +2 -1
  8. data/lib/rbbt/ner/abner.rb +3 -2
  9. data/lib/rbbt/ner/banner.rb +3 -1
  10. data/lib/rbbt/ner/brat.rb +1 -1
  11. data/lib/rbbt/ner/g_norm_plus.rb +7 -1
  12. data/lib/rbbt/ner/linnaeus.rb +2 -1
  13. data/lib/rbbt/ner/patterns.rb +0 -1
  14. data/lib/rbbt/ner/rner.rb +229 -0
  15. data/lib/rbbt/ner/token_trieNER.rb +32 -18
  16. data/lib/rbbt/nlp/genia/sentence_splitter.rb +2 -1
  17. data/lib/rbbt/nlp/spaCy.rb +195 -0
  18. data/lib/rbbt/relationship.rb +24 -0
  19. data/lib/rbbt/segment.rb +9 -4
  20. data/lib/rbbt/segment/annotation.rb +3 -3
  21. data/lib/rbbt/segment/named_entity.rb +7 -0
  22. data/lib/rbbt/segment/range_index.rb +1 -1
  23. data/lib/rbbt/segment/relationship.rb +7 -0
  24. data/lib/rbbt/segment/transformed.rb +5 -1
  25. data/share/install/software/OpenNLP +1 -1
  26. data/share/rner/config.rb +51 -0
  27. data/test/rbbt/document/corpus/test_pubmed.rb +2 -1
  28. data/test/rbbt/document/test_annotation.rb +15 -6
  29. data/test/rbbt/document/test_corpus.rb +15 -1
  30. data/test/rbbt/ner/test_g_norm_plus.rb +11 -3
  31. data/test/rbbt/ner/test_rner.rb +132 -0
  32. data/test/rbbt/nlp/genia/test_sentence_splitter.rb +27 -3
  33. data/test/rbbt/segment/test_annotation.rb +3 -4
  34. data/test/rbbt/segment/test_encoding.rb +1 -1
  35. data/test/rbbt/segment/test_named_entity.rb +7 -5
  36. data/test/rbbt/segment/test_range_index.rb +1 -2
  37. data/test/rbbt/segment/test_transformed.rb +33 -4
  38. data/test/rbbt/test_segment.rb +5 -10
  39. data/test/test_spaCy.rb +144 -0
  40. metadata +12 -3
@@ -6,7 +6,7 @@ module Segment::RangeIndex
6
6
  SegID.setup(res, :corpus => corpus)
7
7
  end
8
8
 
9
- def self.index(segments, corpus, persist_file = :memory)
9
+ def self.index(segments, corpus = nil, persist_file = :memory)
10
10
  segments = segments.values.flatten if Hash === segments
11
11
 
12
12
  annotation_index =
@@ -0,0 +1,7 @@
1
+ module Relationship
2
+ extend Entity
3
+
4
+ self.annotation :segments
5
+ self.annotation :type
6
+
7
+ end
@@ -68,6 +68,10 @@ module Transformed
68
68
 
69
69
  segments = [segments] unless Array === segments
70
70
  orig_length = self.length
71
+
72
+ offset = self.respond_to?(:offset) ? self.offset.to_i : 0
73
+ segments = segments.select{|s| s.offset.to_i >= offset && s.offset.to_i <= offset + self.length - 1 }
74
+
71
75
  Segment.clean_sort(segments).each do |segment|
72
76
  next if segment.offset.nil?
73
77
 
@@ -86,7 +90,7 @@ module Transformed
86
90
 
87
91
  updated_text = self[updated_begin..updated_end]
88
92
  if updated_text.nil?
89
- Log.warn "Range outside of segment: #{self.length} #{segment.locus} (#{updated_range})"
93
+ Log.warn "Range outside of segment: #{self.length} #{segment.range} (#{updated_range})"
90
94
  next
91
95
  end
92
96
 
@@ -1,7 +1,7 @@
1
1
  #!/bin/bash
2
2
 
3
3
  name='OpenNLP'
4
- url="http://apache.rediris.es/opennlp/opennlp-1.9.1/apache-opennlp-1.9.1-bin.tar.gz"
4
+ url="http://apache.rediris.es/opennlp/opennlp-1.9.2/apache-opennlp-1.9.2-bin.tar.gz"
5
5
 
6
6
  get_src "$name" "$url"
7
7
  move_opt "$name"
@@ -0,0 +1,51 @@
1
+ isLetters /^[A-Z]+$/i
2
+ isUpper /^[A-Z]+$/
3
+ isLower /^[a-z]+$/
4
+ isDigits /^[0-9]+$/i
5
+ isRoman /^[IVX]+$/
6
+ isGreek /^(?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)$/i
7
+ isPunctuation /^[,.;]$/
8
+ isDelim /^[\/()\[\]{}\-]$/
9
+ isNonWord /^[^\w]+$/
10
+ isConjunction /^and|or|&|,$/
11
+
12
+ hasLetters /[A-Z]/i
13
+ hasUpper /.[A-Z]/
14
+ hasLower /[a-z]/
15
+ hasDigits /[0-9]/i
16
+ hasGreek /(?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)/i
17
+ hasPunctuation /[,.;]/
18
+ hasDelim /[\/()\[\]{}\-]/
19
+ hasNonWord /[^\w]/
20
+ caspMix /[a-z].[A-Z]/
21
+ keywords /(?:protein|gene|domain|ase)s?$/
22
+ hasSuffix /[a-z][A-Z0-9]$/
23
+
24
+ numLetters do |w| w.scan(/[A-Z]/i).length end
25
+ numDigits do |w| w.scan(/[0-9]/).length end
26
+ #
27
+ prefix_3 /^(...)/
28
+ prefix_4 /^(....)/
29
+ suffix_3 /(...)$/
30
+ suffix_4 /(....)$/
31
+
32
+
33
+ token1 do |w|
34
+ w.sub(/[A-Z]/,'A').
35
+ sub(/[a-z]/,'a').
36
+ sub(/[0-9]/,'0').
37
+ sub(/[^0-9a-z]/i,'x')
38
+ end
39
+ token2 do |w|
40
+ w.sub(/[A-Z]+/,'A').
41
+ sub(/[a-z]+/,'a').
42
+ sub(/[0-9]+/,'0').
43
+ sub(/[^0-9a-z]+/i,'x')
44
+ end
45
+ token3 do |w| w.downcase end
46
+ special do |w| w.is_special? end
47
+
48
+ context %w(special token2 isPunctuation isDelim)
49
+ window %w(1 2 3 -1 -2 -3)
50
+ #direction :reverse
51
+
@@ -7,7 +7,8 @@ class TestCorpusPubmed < Test::Unit::TestCase
7
7
  def test_add_pmid
8
8
  corpus = Document::Corpus.setup({})
9
9
 
10
- document = corpus.add_pmid("32299157", :abstract).first
10
+ document = corpus.add_pmid("33359141", :abstract).first
11
+ iii document.docid
11
12
  title = document.to(:title)
12
13
  assert title.include?("COVID-19")
13
14
  end
@@ -4,6 +4,7 @@ require 'rbbt/document/corpus'
4
4
  require 'rbbt/segment'
5
5
  require 'rbbt/document/annotation'
6
6
  require 'rbbt/segment/named_entity'
7
+ require 'rbbt/ner/abner'
7
8
 
8
9
  class TestAnnotation < Test::Unit::TestCase
9
10
  class CalledOnce < Exception; end
@@ -28,6 +29,12 @@ class TestAnnotation < Test::Unit::TestCase
28
29
  self.split(" ").collect{|e| NamedEntity.setup(e, :code => Misc.digest(e)) }
29
30
  end
30
31
 
32
+ Document.define :abner do
33
+ $called_once = true
34
+ Abner.new.match(self)
35
+ end
36
+
37
+
31
38
  Document.persist :ner
32
39
  end
33
40
 
@@ -36,7 +43,7 @@ class TestAnnotation < Test::Unit::TestCase
36
43
  Document.setup(text, "TEST", "test_doc1", nil)
37
44
 
38
45
  corpus = {}
39
- corpus.extend Document::Corpus
46
+ Document::Corpus.setup corpus
40
47
 
41
48
  corpus.add_document(text)
42
49
 
@@ -50,7 +57,7 @@ class TestAnnotation < Test::Unit::TestCase
50
57
  Document.setup(text2, "TEST", "test_doc2", nil)
51
58
 
52
59
  corpus = {}
53
- corpus.extend Document::Corpus
60
+ Document::Corpus.setup corpus
54
61
 
55
62
  corpus.add_document(text1)
56
63
  corpus.add_document(text2)
@@ -68,7 +75,7 @@ class TestAnnotation < Test::Unit::TestCase
68
75
  Document.setup(text, "TEST", "test_doc1", nil)
69
76
 
70
77
  corpus = {}
71
- corpus.extend Document::Corpus
78
+ Document::Corpus.setup corpus
72
79
 
73
80
  corpus.add_document(text)
74
81
 
@@ -95,7 +102,7 @@ class TestAnnotation < Test::Unit::TestCase
95
102
  Document.setup(text, "TEST", "test_doc1", nil)
96
103
 
97
104
  corpus = {}
98
- corpus.extend Document::Corpus
105
+ Document::Corpus.setup corpus
99
106
 
100
107
  corpus.add_document(text)
101
108
 
@@ -122,7 +129,7 @@ class TestAnnotation < Test::Unit::TestCase
122
129
  Document.setup(text, "TEST", "test_doc1", nil)
123
130
 
124
131
  corpus = {}
125
- corpus.extend Document::Corpus
132
+ Document::Corpus.setup corpus
126
133
 
127
134
  corpus.add_document(text)
128
135
 
@@ -133,7 +140,9 @@ class TestAnnotation < Test::Unit::TestCase
133
140
  text.ner
134
141
 
135
142
  assert ! $called_once
136
-
143
+
144
+ assert_equal text.abner.first.docid, text.docid
145
+
137
146
  assert text.ner.first.segid.include?("TEST:")
138
147
  end
139
148
  end
@@ -26,7 +26,21 @@ class TestDocumentCorpus < Test::Unit::TestCase
26
26
 
27
27
  corpus.add_document(text)
28
28
 
29
- assert corpus.prefix("TEST:").include?(text.docid)
29
+ assert corpus.docids("TEST:").include?(text.docid)
30
+ end
31
+ end
32
+
33
+ def test_load
34
+ text = "This is a document"
35
+ Document.setup(text, "TEST", "test_doc1", nil)
36
+
37
+ TmpFile.with_file do |path|
38
+ corpus = Persist.open_tokyocabinet(path, true, :single, "BDB")
39
+ corpus.extend Document::Corpus
40
+
41
+ corpus.add_document(text)
42
+
43
+ assert corpus.docids("TEST:").include?(text.docid)
30
44
  end
31
45
  end
32
46
  end
@@ -5,12 +5,17 @@ Log.severity = 0
5
5
  class TestGNormPlus < Test::Unit::TestCase
6
6
  def test_match
7
7
  text =<<-EOF
8
- We found that TP53 is regulated by MDM2 in Homo sapiens
8
+
9
+ Introduction
10
+
11
+ We found that TP53 is regulated by MDM2 in Homo
12
+ sapiens
9
13
  EOF
10
14
 
11
15
  mentions = GNormPlus.process({:file => text})
16
+
12
17
  assert_equal 1, mentions.length
13
- assert_equal 2, mentions["file"].length
18
+ assert_equal 3, mentions["file"].length
14
19
  end
15
20
 
16
21
  def test_entities
@@ -19,7 +24,10 @@ We found that TP53 is regulated by MDM2 in Homo sapiens
19
24
  EOF
20
25
 
21
26
  mentions = GNormPlus.entities({:file => text})
22
- mentions["file"].include? "TP53"
27
+ assert mentions["file"].include?("TP53")
28
+ mentions["file"].each do |mention|
29
+ assert_equal mention, text[mention.range].sub("\n", ' ')
30
+ end
23
31
  end
24
32
  end
25
33
 
@@ -0,0 +1,132 @@
1
+ require File.dirname(__FILE__) + '/../../test_helper'
2
+ require 'rbbt'
3
+ require 'rbbt/ner/rner'
4
+ require 'test/unit'
5
+
6
+ class TestRNer < Test::Unit::TestCase
7
+
8
+ def setup
9
+ @parser = NERFeatures.new() do
10
+ isLetters /^[A-Z]+$/i
11
+ context prefix_3 /^(...)/
12
+ downcase do |w| w.downcase end
13
+
14
+ context %w(downcase)
15
+ end
16
+ end
17
+
18
+ def test_config
19
+ config = <<-EOC
20
+ isLetters /^[A-Z]+$/i
21
+ context prefix_3 /^(...)/
22
+ downcase do |w| w.downcase end
23
+
24
+ context %w(downcase)
25
+ EOC
26
+
27
+ assert_equal config.strip, @parser.config.strip
28
+ end
29
+
30
+ def test_reverse
31
+ assert_equal("protein P53", NERFeatures.reverse("P53 protein"))
32
+ assert_equal(
33
+ ". LH of assay - radioimmuno serum the with compared was LH urinary for ) GONAVIS - HI ( test hemagglutination direct new A",
34
+ NERFeatures.reverse(
35
+ "A new direct hemagglutination test (HI-GONAVIS) for urinary LH was compared with the serum\n radioimmuno-assay of LH."
36
+ ))
37
+ end
38
+
39
+ def test_features
40
+ assert_equal @parser.features("abCdE"), ["abCdE",true,'abC','abcde']
41
+ end
42
+
43
+ def test_template
44
+ template =<<-EOT
45
+ UisLetters: %x[0,1]
46
+ Uprefix_3: %x[0,2]
47
+ Uprefix_3#1: %x[1,2]
48
+ Uprefix_3#-1: %x[-1,2]
49
+ Udowncase: %x[0,3]
50
+ Udowncase#1: %x[1,3]
51
+ Udowncase#-1: %x[-1,3]
52
+ B
53
+ EOT
54
+
55
+ assert(@parser.template == template)
56
+ end
57
+
58
+ def test_tokens
59
+ assert( NERFeatures.tokens("A new direct hemagglutination test (HI-GONAVIS) for urinary LH was compared with the serum\n radioimmuno-assay of LH.")==
60
+ ["A", "new", "direct", "hemagglutination", "test", "(", "HI", "-", "GONAVIS", ")", "for", "urinary", "LH", "was", "compared", "with", "the", "serum", "radioimmuno", "-", "assay", "of", "LH", "."])
61
+
62
+
63
+ end
64
+ def test_text_features
65
+
66
+ assert(@parser.text_features("abCdE 1234") == [["abCdE",true, "abC", "abcde"], ["1234",false, "123", "1234"]])
67
+ assert(@parser.text_features("abCdE 1234",true) == [["abCdE",true, "abC", "abcde",1], ["1234",false, "123", "1234",2]])
68
+ assert(@parser.text_features("abCdE 1234",false) == [["abCdE",true, "abC", "abcde",0], ["1234",false, "123", "1234",0]])
69
+
70
+ end
71
+
72
+ def test_tagged_features
73
+ assert_equal(
74
+ [["phosphorilation",true, "pho", "phosphorilation", 0],
75
+ ["of",true, false, "of", 0],
76
+ ["GENE1",false, "GEN", "gene1", 1],
77
+ [".", false, false, ".", 0]],
78
+ @parser.tagged_features("phosphorilation of GENE1.",['GENE1']))
79
+
80
+ assert_equal(
81
+ [["GENE1",false, "GEN", "gene1", 1],
82
+ ["phosphorilation",true, "pho", "phosphorilation", 0]],
83
+ @parser.tagged_features("GENE1 phosphorilation",['GENE1']))
84
+
85
+
86
+ assert_equal(
87
+ [["phosphorilation",true, "pho", "phosphorilation", 0],
88
+ ["of",true, false, "of", 0],
89
+ ["GENE",true, "GEN", "gene", 1],
90
+ ["1",false, false, "1", 2],
91
+ [".", false, false, ".", 0]],
92
+ @parser.tagged_features("phosphorilation of GENE 1.",['GENE 1']))
93
+ end
94
+
95
+ def test_tagged_features_reverse
96
+ @parser.reverse = true
97
+ assert_equal(
98
+ [
99
+ ["GENE1",false, "GEN", "gene1", 1],
100
+ ["of",true, false, "of", 0],
101
+ ["phosphorilation",true, "pho", "phosphorilation", 0]
102
+ ],
103
+ @parser.tagged_features("phosphorilation of GENE1",['GENE1']))
104
+
105
+ assert_equal(
106
+ [
107
+ [".", false, false, ".", 0],
108
+ ["1",false, false, "1", 1],
109
+ ["GENE",true, "GEN", "gene", 2],
110
+ ["of",true, false, "of", 0],
111
+ ["phosphorilation",true, "pho", "phosphorilation", 0]
112
+ ],
113
+ @parser.tagged_features("phosphorilation of GENE 1.",['GENE 1']))
114
+ end
115
+
116
+ def test_default_config
117
+ require 'rbbt/bow/misc'
118
+ text =<<-EOF
119
+ This text explains how MDM2 interacts with TP53.
120
+ EOF
121
+ @parser = NERFeatures.new Rbbt.share.rner["config.rb"].find
122
+ features = @parser.tagged_features text, %w(TP53 MDM2)
123
+ assert features.first.first == "This"
124
+ end
125
+
126
+
127
+
128
+ def __test_CRFPP_install
129
+ assert(require File.join(Rbbt.datadir, 'third_party/crf++/ruby/CRFPP'))
130
+ end
131
+
132
+ end
@@ -7,13 +7,37 @@ class TestNLP < Test::Unit::TestCase
7
7
  This is a sentence.
8
8
  A funky character ™ in a sentence.
9
9
  This is a sentence.
10
- This is a
10
+ This is a broken
11
11
  sentence. This is
12
- another sentence.
12
+ another broken sentence.
13
13
  EOF
14
14
 
15
- assert_equal "This is a \nsentence.", NLP.geniass_sentence_splitter(text)[3]
15
+ iii NLP.geniass_sentence_splitter(text)
16
+ assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter(text)[2].strip
16
17
  end
17
18
 
19
+ def test_sentences_2
20
+ text =<<-EOF
21
+ This is a sentence.
22
+ This is a sentence.
23
+ This is a broken
24
+ sentence. This is
25
+ another broken sentence.
26
+ EOF
27
+
28
+ assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter(text)[2].strip
29
+ end
30
+
31
+ def test_sentences_ext
32
+ text =<<-EOF
33
+ This is a sentence.
34
+ This is a sentence.
35
+ This is a broken
36
+ sentence. This is
37
+ another broken sentence.
38
+ EOF
39
+
40
+ assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter_extension(text)[2].strip
41
+ end
18
42
  end
19
43
 
@@ -12,18 +12,17 @@ class TestAnnotation < Test::Unit::TestCase
12
12
  segment = Segment.setup("is", :offset => text.index("is"), :docid => text.docid)
13
13
  annotation = SegmentAnnotation.setup(segment, :type => :verb)
14
14
 
15
- assert_equal 'verb', annotation.annotid.split(":").last
15
+ assert_equal 'verb', annotation.annotid.split(":")[5]
16
16
 
17
17
  annotation = SegmentAnnotation.setup(segment.segid, :type => :verb)
18
- assert_equal 'verb', annotation.annotid.split(":").last
18
+ assert_equal 'verb', annotation.annotid.split(":")[5]
19
19
  end
20
20
 
21
21
  def test_annotid
22
22
  text = "This is a document"
23
23
  Document.setup(text, "TEST", "test_doc1", nil)
24
24
 
25
- corpus = {}
26
- corpus.extend Document::Corpus
25
+ corpus = Document::Corpus.setup({})
27
26
 
28
27
  corpus.add_document(text)
29
28
 
@@ -2,7 +2,7 @@ require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helpe
2
2
  require 'rbbt/segment/encoding'
3
3
 
4
4
  class TestEncoding < Test::Unit::TestCase
5
- def _test_bad_chars
5
+ def test_bad_chars
6
6
  text = "A funky character ™ in a sentence."
7
7
 
8
8
  assert_equal ["™"], Segment.bad_chars(text)