rbbt-text 1.3.0 → 1.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. checksums.yaml +4 -4
  2. data/lib/rbbt/bow/bow.rb +5 -2
  3. data/lib/rbbt/bow/dictionary.rb +27 -23
  4. data/lib/rbbt/document.rb +20 -5
  5. data/lib/rbbt/document/annotation.rb +7 -4
  6. data/lib/rbbt/document/corpus.rb +30 -3
  7. data/lib/rbbt/document/corpus/pubmed.rb +2 -1
  8. data/lib/rbbt/ner/abner.rb +3 -2
  9. data/lib/rbbt/ner/banner.rb +3 -1
  10. data/lib/rbbt/ner/brat.rb +1 -1
  11. data/lib/rbbt/ner/g_norm_plus.rb +7 -1
  12. data/lib/rbbt/ner/linnaeus.rb +2 -1
  13. data/lib/rbbt/ner/patterns.rb +0 -1
  14. data/lib/rbbt/ner/rner.rb +229 -0
  15. data/lib/rbbt/ner/token_trieNER.rb +32 -18
  16. data/lib/rbbt/nlp/genia/sentence_splitter.rb +2 -1
  17. data/lib/rbbt/nlp/spaCy.rb +195 -0
  18. data/lib/rbbt/relationship.rb +24 -0
  19. data/lib/rbbt/segment.rb +9 -4
  20. data/lib/rbbt/segment/annotation.rb +3 -3
  21. data/lib/rbbt/segment/named_entity.rb +7 -0
  22. data/lib/rbbt/segment/range_index.rb +1 -1
  23. data/lib/rbbt/segment/relationship.rb +7 -0
  24. data/lib/rbbt/segment/transformed.rb +5 -1
  25. data/share/install/software/OpenNLP +1 -1
  26. data/share/rner/config.rb +51 -0
  27. data/test/rbbt/document/corpus/test_pubmed.rb +2 -1
  28. data/test/rbbt/document/test_annotation.rb +15 -6
  29. data/test/rbbt/document/test_corpus.rb +15 -1
  30. data/test/rbbt/ner/test_g_norm_plus.rb +11 -3
  31. data/test/rbbt/ner/test_rner.rb +132 -0
  32. data/test/rbbt/nlp/genia/test_sentence_splitter.rb +27 -3
  33. data/test/rbbt/segment/test_annotation.rb +3 -4
  34. data/test/rbbt/segment/test_encoding.rb +1 -1
  35. data/test/rbbt/segment/test_named_entity.rb +7 -5
  36. data/test/rbbt/segment/test_range_index.rb +1 -2
  37. data/test/rbbt/segment/test_transformed.rb +33 -4
  38. data/test/rbbt/test_segment.rb +5 -10
  39. data/test/test_spaCy.rb +144 -0
  40. metadata +12 -3
@@ -6,7 +6,7 @@ module Segment::RangeIndex
6
6
  SegID.setup(res, :corpus => corpus)
7
7
  end
8
8
 
9
- def self.index(segments, corpus, persist_file = :memory)
9
+ def self.index(segments, corpus = nil, persist_file = :memory)
10
10
  segments = segments.values.flatten if Hash === segments
11
11
 
12
12
  annotation_index =
@@ -0,0 +1,7 @@
1
+ module Relationship # Module extended with Entity; declares two annotation fields, :segments and :type
2
+ extend Entity
3
+
4
+ self.annotation :segments
5
+ self.annotation :type
6
+
7
+ end
@@ -68,6 +68,10 @@ module Transformed
68
68
 
69
69
  segments = [segments] unless Array === segments
70
70
  orig_length = self.length
71
+
72
+ offset = self.respond_to?(:offset) ? self.offset.to_i : 0
73
+ segments = segments.select{|s| s.offset.to_i >= offset && s.offset.to_i <= offset + self.length - 1 }
74
+
71
75
  Segment.clean_sort(segments).each do |segment|
72
76
  next if segment.offset.nil?
73
77
 
@@ -86,7 +90,7 @@ module Transformed
86
90
 
87
91
  updated_text = self[updated_begin..updated_end]
88
92
  if updated_text.nil?
89
- Log.warn "Range outside of segment: #{self.length} #{segment.locus} (#{updated_range})"
93
+ Log.warn "Range outside of segment: #{self.length} #{segment.range} (#{updated_range})"
90
94
  next
91
95
  end
92
96
 
@@ -1,7 +1,7 @@
1
1
  #!/bin/bash
2
2
 
3
3
  name='OpenNLP'
4
- url="http://apache.rediris.es/opennlp/opennlp-1.9.1/apache-opennlp-1.9.1-bin.tar.gz"
4
+ url="http://apache.rediris.es/opennlp/opennlp-1.9.2/apache-opennlp-1.9.2-bin.tar.gz"
5
5
 
6
6
  get_src "$name" "$url"
7
7
  move_opt "$name"
@@ -0,0 +1,51 @@
1
+ isLetters /^[A-Z]+$/i
2
+ isUpper /^[A-Z]+$/
3
+ isLower /^[a-z]+$/
4
+ isDigits /^[0-9]+$/i
5
+ isRoman /^[IVX]+$/
6
+ isGreek /^(?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)$/i
7
+ isPunctuation /^[,.;]$/
8
+ isDelim /^[\/()\[\]{}\-]$/
9
+ isNonWord /^[^\w]+$/
10
+ isConjunction /^(?:and|or|&|,)$/ # alternatives grouped so ^ and $ anchor every branch (whole-token match, like the other is* features)
11
+
12
+ hasLetters /[A-Z]/i
13
+ hasUpper /.[A-Z]/
14
+ hasLower /[a-z]/
15
+ hasDigits /[0-9]/i
16
+ hasGreek /(?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)/i
17
+ hasPunctuation /[,.;]/
18
+ hasDelim /[\/()\[\]{}\-]/
19
+ hasNonWord /[^\w]/
20
+ caspMix /[a-z].[A-Z]/
21
+ keywords /(?:protein|gene|domain|ase)s?$/
22
+ hasSuffix /[a-z][A-Z0-9]$/
23
+
24
+ numLetters do |w| w.scan(/[A-Z]/i).length end
25
+ numDigits do |w| w.scan(/[0-9]/).length end
26
+ #
27
+ prefix_3 /^(...)/
28
+ prefix_4 /^(....)/
29
+ suffix_3 /(...)$/
30
+ suffix_4 /(....)$/
31
+
32
+
33
+ token1 do |w|
34
+ w.sub(/[A-Z]/,'A').
35
+ sub(/[a-z]/,'a').
36
+ sub(/[0-9]/,'0').
37
+ sub(/[^0-9a-z]/i,'x')
38
+ end
39
+ token2 do |w|
40
+ w.sub(/[A-Z]+/,'A').
41
+ sub(/[a-z]+/,'a').
42
+ sub(/[0-9]+/,'0').
43
+ sub(/[^0-9a-z]+/i,'x')
44
+ end
45
+ token3 do |w| w.downcase end
46
+ special do |w| w.is_special? end
47
+
48
+ context %w(special token2 isPunctuation isDelim)
49
+ window %w(1 2 3 -1 -2 -3)
50
+ #direction :reverse
51
+
@@ -7,7 +7,8 @@ class TestCorpusPubmed < Test::Unit::TestCase
7
7
  def test_add_pmid
8
8
  corpus = Document::Corpus.setup({})
9
9
 
10
- document = corpus.add_pmid("32299157", :abstract).first
10
+ document = corpus.add_pmid("33359141", :abstract).first
11
+ iii document.docid
11
12
  title = document.to(:title)
12
13
  assert title.include?("COVID-19")
13
14
  end
@@ -4,6 +4,7 @@ require 'rbbt/document/corpus'
4
4
  require 'rbbt/segment'
5
5
  require 'rbbt/document/annotation'
6
6
  require 'rbbt/segment/named_entity'
7
+ require 'rbbt/ner/abner'
7
8
 
8
9
  class TestAnnotation < Test::Unit::TestCase
9
10
  class CalledOnce < Exception; end
@@ -28,6 +29,12 @@ class TestAnnotation < Test::Unit::TestCase
28
29
  self.split(" ").collect{|e| NamedEntity.setup(e, :code => Misc.digest(e)) }
29
30
  end
30
31
 
32
+ Document.define :abner do
33
+ $called_once = true
34
+ Abner.new.match(self)
35
+ end
36
+
37
+
31
38
  Document.persist :ner
32
39
  end
33
40
 
@@ -36,7 +43,7 @@ class TestAnnotation < Test::Unit::TestCase
36
43
  Document.setup(text, "TEST", "test_doc1", nil)
37
44
 
38
45
  corpus = {}
39
- corpus.extend Document::Corpus
46
+ Document::Corpus.setup corpus
40
47
 
41
48
  corpus.add_document(text)
42
49
 
@@ -50,7 +57,7 @@ class TestAnnotation < Test::Unit::TestCase
50
57
  Document.setup(text2, "TEST", "test_doc2", nil)
51
58
 
52
59
  corpus = {}
53
- corpus.extend Document::Corpus
60
+ Document::Corpus.setup corpus
54
61
 
55
62
  corpus.add_document(text1)
56
63
  corpus.add_document(text2)
@@ -68,7 +75,7 @@ class TestAnnotation < Test::Unit::TestCase
68
75
  Document.setup(text, "TEST", "test_doc1", nil)
69
76
 
70
77
  corpus = {}
71
- corpus.extend Document::Corpus
78
+ Document::Corpus.setup corpus
72
79
 
73
80
  corpus.add_document(text)
74
81
 
@@ -95,7 +102,7 @@ class TestAnnotation < Test::Unit::TestCase
95
102
  Document.setup(text, "TEST", "test_doc1", nil)
96
103
 
97
104
  corpus = {}
98
- corpus.extend Document::Corpus
105
+ Document::Corpus.setup corpus
99
106
 
100
107
  corpus.add_document(text)
101
108
 
@@ -122,7 +129,7 @@ class TestAnnotation < Test::Unit::TestCase
122
129
  Document.setup(text, "TEST", "test_doc1", nil)
123
130
 
124
131
  corpus = {}
125
- corpus.extend Document::Corpus
132
+ Document::Corpus.setup corpus
126
133
 
127
134
  corpus.add_document(text)
128
135
 
@@ -133,7 +140,9 @@ class TestAnnotation < Test::Unit::TestCase
133
140
  text.ner
134
141
 
135
142
  assert ! $called_once
136
-
143
+
144
+ assert_equal text.abner.first.docid, text.docid
145
+
137
146
  assert text.ner.first.segid.include?("TEST:")
138
147
  end
139
148
  end
@@ -26,7 +26,21 @@ class TestDocumentCorpus < Test::Unit::TestCase
26
26
 
27
27
  corpus.add_document(text)
28
28
 
29
- assert corpus.prefix("TEST:").include?(text.docid)
29
+ assert corpus.docids("TEST:").include?(text.docid)
30
+ end
31
+ end
32
+
33
+ def test_load
34
+ text = "This is a document"
35
+ Document.setup(text, "TEST", "test_doc1", nil)
36
+
37
+ TmpFile.with_file do |path|
38
+ corpus = Persist.open_tokyocabinet(path, true, :single, "BDB")
39
+ corpus.extend Document::Corpus
40
+
41
+ corpus.add_document(text)
42
+
43
+ assert corpus.docids("TEST:").include?(text.docid)
30
44
  end
31
45
  end
32
46
  end
@@ -5,12 +5,17 @@ Log.severity = 0
5
5
  class TestGNormPlus < Test::Unit::TestCase
6
6
  def test_match
7
7
  text =<<-EOF
8
- We found that TP53 is regulated by MDM2 in Homo sapiens
8
+
9
+ Introduction
10
+
11
+ We found that TP53 is regulated by MDM2 in Homo
12
+ sapiens
9
13
  EOF
10
14
 
11
15
  mentions = GNormPlus.process({:file => text})
16
+
12
17
  assert_equal 1, mentions.length
13
- assert_equal 2, mentions["file"].length
18
+ assert_equal 3, mentions["file"].length
14
19
  end
15
20
 
16
21
  def test_entities
@@ -19,7 +24,10 @@ We found that TP53 is regulated by MDM2 in Homo sapiens
19
24
  EOF
20
25
 
21
26
  mentions = GNormPlus.entities({:file => text})
22
- mentions["file"].include? "TP53"
27
+ assert mentions["file"].include?("TP53")
28
+ mentions["file"].each do |mention|
29
+ assert_equal mention, text[mention.range].sub("\n", ' ')
30
+ end
23
31
  end
24
32
  end
25
33
 
@@ -0,0 +1,132 @@
1
+ require File.dirname(__FILE__) + '/../../test_helper'
2
+ require 'rbbt'
3
+ require 'rbbt/ner/rner'
4
+ require 'test/unit'
5
+
6
+ class TestRNer < Test::Unit::TestCase
7
+
8
+ def setup
9
+ @parser = NERFeatures.new() do
10
+ isLetters /^[A-Z]+$/i
11
+ context prefix_3 /^(...)/
12
+ downcase do |w| w.downcase end
13
+
14
+ context %w(downcase)
15
+ end
16
+ end
17
+
18
+ def test_config
19
+ config = <<-EOC
20
+ isLetters /^[A-Z]+$/i
21
+ context prefix_3 /^(...)/
22
+ downcase do |w| w.downcase end
23
+
24
+ context %w(downcase)
25
+ EOC
26
+
27
+ assert_equal config.strip, @parser.config.strip
28
+ end
29
+
30
+ def test_reverse
31
+ assert_equal("protein P53", NERFeatures.reverse("P53 protein"))
32
+ assert_equal(
33
+ ". LH of assay - radioimmuno serum the with compared was LH urinary for ) GONAVIS - HI ( test hemagglutination direct new A",
34
+ NERFeatures.reverse(
35
+ "A new direct hemagglutination test (HI-GONAVIS) for urinary LH was compared with the serum\n radioimmuno-assay of LH."
36
+ ))
37
+ end
38
+
39
+ def test_features
40
+ assert_equal @parser.features("abCdE"), ["abCdE",true,'abC','abcde']
41
+ end
42
+
43
+ def test_template
44
+ template =<<-EOT
45
+ UisLetters: %x[0,1]
46
+ Uprefix_3: %x[0,2]
47
+ Uprefix_3#1: %x[1,2]
48
+ Uprefix_3#-1: %x[-1,2]
49
+ Udowncase: %x[0,3]
50
+ Udowncase#1: %x[1,3]
51
+ Udowncase#-1: %x[-1,3]
52
+ B
53
+ EOT
54
+
55
+ assert(@parser.template == template)
56
+ end
57
+
58
+ def test_tokens
59
+ assert( NERFeatures.tokens("A new direct hemagglutination test (HI-GONAVIS) for urinary LH was compared with the serum\n radioimmuno-assay of LH.")==
60
+ ["A", "new", "direct", "hemagglutination", "test", "(", "HI", "-", "GONAVIS", ")", "for", "urinary", "LH", "was", "compared", "with", "the", "serum", "radioimmuno", "-", "assay", "of", "LH", "."])
61
+
62
+
63
+ end
64
+ def test_text_features
65
+
66
+ assert(@parser.text_features("abCdE 1234") == [["abCdE",true, "abC", "abcde"], ["1234",false, "123", "1234"]])
67
+ assert(@parser.text_features("abCdE 1234",true) == [["abCdE",true, "abC", "abcde",1], ["1234",false, "123", "1234",2]])
68
+ assert(@parser.text_features("abCdE 1234",false) == [["abCdE",true, "abC", "abcde",0], ["1234",false, "123", "1234",0]])
69
+
70
+ end
71
+
72
+ def test_tagged_features
73
+ assert_equal(
74
+ [["phosphorilation",true, "pho", "phosphorilation", 0],
75
+ ["of",true, false, "of", 0],
76
+ ["GENE1",false, "GEN", "gene1", 1],
77
+ [".", false, false, ".", 0]],
78
+ @parser.tagged_features("phosphorilation of GENE1.",['GENE1']))
79
+
80
+ assert_equal(
81
+ [["GENE1",false, "GEN", "gene1", 1],
82
+ ["phosphorilation",true, "pho", "phosphorilation", 0]],
83
+ @parser.tagged_features("GENE1 phosphorilation",['GENE1']))
84
+
85
+
86
+ assert_equal(
87
+ [["phosphorilation",true, "pho", "phosphorilation", 0],
88
+ ["of",true, false, "of", 0],
89
+ ["GENE",true, "GEN", "gene", 1],
90
+ ["1",false, false, "1", 2],
91
+ [".", false, false, ".", 0]],
92
+ @parser.tagged_features("phosphorilation of GENE 1.",['GENE 1']))
93
+ end
94
+
95
+ def test_tagged_features_reverse
96
+ @parser.reverse = true
97
+ assert_equal(
98
+ [
99
+ ["GENE1",false, "GEN", "gene1", 1],
100
+ ["of",true, false, "of", 0],
101
+ ["phosphorilation",true, "pho", "phosphorilation", 0]
102
+ ],
103
+ @parser.tagged_features("phosphorilation of GENE1",['GENE1']))
104
+
105
+ assert_equal(
106
+ [
107
+ [".", false, false, ".", 0],
108
+ ["1",false, false, "1", 1],
109
+ ["GENE",true, "GEN", "gene", 2],
110
+ ["of",true, false, "of", 0],
111
+ ["phosphorilation",true, "pho", "phosphorilation", 0]
112
+ ],
113
+ @parser.tagged_features("phosphorilation of GENE 1.",['GENE 1']))
114
+ end
115
+
116
+ def test_default_config
117
+ require 'rbbt/bow/misc'
118
+ text =<<-EOF
119
+ This text explains how MDM2 interacts with TP53.
120
+ EOF
121
+ @parser = NERFeatures.new Rbbt.share.rner["config.rb"].find
122
+ features = @parser.tagged_features text, %w(TP53 MDM2)
123
+ assert features.first.first == "This"
124
+ end
125
+
126
+
127
+
128
+ def __test_CRFPP_install
129
+ assert(require File.join(Rbbt.datadir, 'third_party/crf++/ruby/CRFPP'))
130
+ end
131
+
132
+ end
@@ -7,13 +7,37 @@ class TestNLP < Test::Unit::TestCase
7
7
  This is a sentence.
8
8
  A funky character ™ in a sentence.
9
9
  This is a sentence.
10
- This is a
10
+ This is a broken
11
11
  sentence. This is
12
- another sentence.
12
+ another broken sentence.
13
13
  EOF
14
14
 
15
- assert_equal "This is a \nsentence.", NLP.geniass_sentence_splitter(text)[3]
15
+ iii NLP.geniass_sentence_splitter(text)
16
+ assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter(text)[2].strip
16
17
  end
17
18
 
19
+ def test_sentences_2
20
+ text =<<-EOF
21
+ This is a sentence.
22
+ This is a sentence.
23
+ This is a broken
24
+ sentence. This is
25
+ another broken sentence.
26
+ EOF
27
+
28
+ assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter(text)[2].strip
29
+ end
30
+
31
+ def test_sentences_ext
32
+ text =<<-EOF
33
+ This is a sentence.
34
+ This is a sentence.
35
+ This is a broken
36
+ sentence. This is
37
+ another broken sentence.
38
+ EOF
39
+
40
+ assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter_extension(text)[2].strip
41
+ end
18
42
  end
19
43
 
@@ -12,18 +12,17 @@ class TestAnnotation < Test::Unit::TestCase
12
12
  segment = Segment.setup("is", :offset => text.index("is"), :docid => text.docid)
13
13
  annotation = SegmentAnnotation.setup(segment, :type => :verb)
14
14
 
15
- assert_equal 'verb', annotation.annotid.split(":").last
15
+ assert_equal 'verb', annotation.annotid.split(":")[5]
16
16
 
17
17
  annotation = SegmentAnnotation.setup(segment.segid, :type => :verb)
18
- assert_equal 'verb', annotation.annotid.split(":").last
18
+ assert_equal 'verb', annotation.annotid.split(":")[5]
19
19
  end
20
20
 
21
21
  def test_annotid
22
22
  text = "This is a document"
23
23
  Document.setup(text, "TEST", "test_doc1", nil)
24
24
 
25
- corpus = {}
26
- corpus.extend Document::Corpus
25
+ corpus = Document::Corpus.setup({})
27
26
 
28
27
  corpus.add_document(text)
29
28
 
@@ -2,7 +2,7 @@ require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helpe
2
2
  require 'rbbt/segment/encoding'
3
3
 
4
4
  class TestEncoding < Test::Unit::TestCase
5
- def _test_bad_chars
5
+ def test_bad_chars
6
6
  text = "A funky character ™ in a sentence."
7
7
 
8
8
  assert_equal ["™"], Segment.bad_chars(text)