rbbt-text 1.3.0 → 1.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/rbbt/bow/bow.rb +5 -2
- data/lib/rbbt/bow/dictionary.rb +27 -23
- data/lib/rbbt/document.rb +20 -5
- data/lib/rbbt/document/annotation.rb +7 -4
- data/lib/rbbt/document/corpus.rb +30 -3
- data/lib/rbbt/document/corpus/pubmed.rb +2 -1
- data/lib/rbbt/ner/abner.rb +3 -2
- data/lib/rbbt/ner/banner.rb +3 -1
- data/lib/rbbt/ner/brat.rb +1 -1
- data/lib/rbbt/ner/g_norm_plus.rb +7 -1
- data/lib/rbbt/ner/linnaeus.rb +2 -1
- data/lib/rbbt/ner/patterns.rb +0 -1
- data/lib/rbbt/ner/rner.rb +229 -0
- data/lib/rbbt/ner/token_trieNER.rb +32 -18
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +2 -1
- data/lib/rbbt/nlp/spaCy.rb +195 -0
- data/lib/rbbt/relationship.rb +24 -0
- data/lib/rbbt/segment.rb +9 -4
- data/lib/rbbt/segment/annotation.rb +3 -3
- data/lib/rbbt/segment/named_entity.rb +7 -0
- data/lib/rbbt/segment/range_index.rb +1 -1
- data/lib/rbbt/segment/relationship.rb +7 -0
- data/lib/rbbt/segment/transformed.rb +5 -1
- data/share/install/software/OpenNLP +1 -1
- data/share/rner/config.rb +51 -0
- data/test/rbbt/document/corpus/test_pubmed.rb +2 -1
- data/test/rbbt/document/test_annotation.rb +15 -6
- data/test/rbbt/document/test_corpus.rb +15 -1
- data/test/rbbt/ner/test_g_norm_plus.rb +11 -3
- data/test/rbbt/ner/test_rner.rb +132 -0
- data/test/rbbt/nlp/genia/test_sentence_splitter.rb +27 -3
- data/test/rbbt/segment/test_annotation.rb +3 -4
- data/test/rbbt/segment/test_encoding.rb +1 -1
- data/test/rbbt/segment/test_named_entity.rb +7 -5
- data/test/rbbt/segment/test_range_index.rb +1 -2
- data/test/rbbt/segment/test_transformed.rb +33 -4
- data/test/rbbt/test_segment.rb +5 -10
- data/test/test_spaCy.rb +144 -0
- metadata +12 -3
| @@ -6,7 +6,7 @@ module Segment::RangeIndex | |
| 6 6 | 
             
                SegID.setup(res, :corpus => corpus)
         | 
| 7 7 | 
             
              end
         | 
| 8 8 |  | 
| 9 | 
            -
              def self.index(segments, corpus, persist_file = :memory)
         | 
| 9 | 
            +
              def self.index(segments, corpus = nil, persist_file = :memory)
         | 
| 10 10 | 
             
                segments = segments.values.flatten if Hash === segments
         | 
| 11 11 |  | 
| 12 12 | 
             
                annotation_index = 
         | 
| @@ -68,6 +68,10 @@ module Transformed | |
| 68 68 |  | 
| 69 69 | 
             
                segments = [segments] unless Array === segments 
         | 
| 70 70 | 
             
                orig_length = self.length
         | 
| 71 | 
            +
             | 
| 72 | 
            +
                offset = self.respond_to?(:offset) ? self.offset.to_i : 0
         | 
| 73 | 
            +
                segments = segments.select{|s| s.offset.to_i >= offset && s.offset.to_i <= offset + self.length - 1 }
         | 
| 74 | 
            +
             | 
| 71 75 | 
             
                Segment.clean_sort(segments).each do |segment|
         | 
| 72 76 | 
             
                  next if segment.offset.nil?
         | 
| 73 77 |  | 
| @@ -86,7 +90,7 @@ module Transformed | |
| 86 90 |  | 
| 87 91 | 
             
                  updated_text = self[updated_begin..updated_end]
         | 
| 88 92 | 
             
                  if updated_text.nil?
         | 
| 89 | 
            -
                    Log.warn "Range outside of segment: #{self.length} #{segment. | 
| 93 | 
            +
                    Log.warn "Range outside of segment: #{self.length} #{segment.range} (#{updated_range})"
         | 
| 90 94 | 
             
                    next
         | 
| 91 95 | 
             
                  end
         | 
| 92 96 |  | 
| @@ -0,0 +1,51 @@ | |
| 1 | 
            +
            isLetters     /^[A-Z]+$/i
         | 
| 2 | 
            +
            isUpper       /^[A-Z]+$/
         | 
| 3 | 
            +
            isLower       /^[a-z]+$/
         | 
| 4 | 
            +
            isDigits      /^[0-9]+$/i
         | 
| 5 | 
            +
            isRoman       /^[IVX]+$/
         | 
| 6 | 
            +
            isGreek       /^(?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)$/i
         | 
| 7 | 
            +
            isPunctuation /^[,.;]$/
         | 
| 8 | 
            +
            isDelim       /^[\/()\[\]{}\-]$/
         | 
| 9 | 
            +
            isNonWord     /^[^\w]+$/
         | 
| 10 | 
            +
            isConjunction /^and|or|&|,$/
         | 
| 11 | 
            +
             | 
| 12 | 
            +
            hasLetters    /[A-Z]/i
         | 
| 13 | 
            +
            hasUpper      /.[A-Z]/
         | 
| 14 | 
            +
            hasLower      /[a-z]/
         | 
| 15 | 
            +
            hasDigits     /[0-9]/i
         | 
| 16 | 
            +
            hasGreek      /(?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)/i
         | 
| 17 | 
            +
            hasPunctuation /[,.;]/
         | 
| 18 | 
            +
            hasDelim      /[\/()\[\]{}\-]/
         | 
| 19 | 
            +
            hasNonWord    /[^\w]/
         | 
| 20 | 
            +
            caspMix       /[a-z].[A-Z]/
         | 
| 21 | 
            +
            keywords      /(?:protein|gene|domain|ase)s?$/
         | 
| 22 | 
            +
            hasSuffix     /[a-z][A-Z0-9]$/
         | 
| 23 | 
            +
             | 
| 24 | 
            +
            numLetters    do |w| w.scan(/[A-Z]/i).length end
         | 
| 25 | 
            +
            numDigits     do |w| w.scan(/[0-9]/).length end
         | 
| 26 | 
            +
            #
         | 
| 27 | 
            +
            prefix_3      /^(...)/
         | 
| 28 | 
            +
            prefix_4      /^(....)/
         | 
| 29 | 
            +
            suffix_3      /(...)$/
         | 
| 30 | 
            +
            suffix_4      /(....)$/
         | 
| 31 | 
            +
             | 
| 32 | 
            +
             | 
| 33 | 
            +
            token1        do |w|
         | 
| 34 | 
            +
                             w.sub(/[A-Z]/,'A').
         | 
| 35 | 
            +
                               sub(/[a-z]/,'a').
         | 
| 36 | 
            +
                               sub(/[0-9]/,'0').
         | 
| 37 | 
            +
                               sub(/[^0-9a-z]/i,'x')
         | 
| 38 | 
            +
                          end
         | 
| 39 | 
            +
            token2        do  |w|
         | 
| 40 | 
            +
                             w.sub(/[A-Z]+/,'A').
         | 
| 41 | 
            +
                               sub(/[a-z]+/,'a').
         | 
| 42 | 
            +
                               sub(/[0-9]+/,'0').
         | 
| 43 | 
            +
                               sub(/[^0-9a-z]+/i,'x')
         | 
| 44 | 
            +
                           end
         | 
| 45 | 
            +
            token3         do |w| w.downcase end
         | 
| 46 | 
            +
            special        do |w| w.is_special? end
         | 
| 47 | 
            +
             | 
| 48 | 
            +
            context   %w(special token2 isPunctuation isDelim)
         | 
| 49 | 
            +
            window     %w(1 2 3 -1 -2 -3)
         | 
| 50 | 
            +
            #direction :reverse
         | 
| 51 | 
            +
             | 
| @@ -7,7 +7,8 @@ class TestCorpusPubmed < Test::Unit::TestCase | |
| 7 7 | 
             
              def test_add_pmid
         | 
| 8 8 | 
             
                corpus = Document::Corpus.setup({})
         | 
| 9 9 |  | 
| 10 | 
            -
                document = corpus.add_pmid(" | 
| 10 | 
            +
                document = corpus.add_pmid("33359141", :abstract).first
         | 
| 11 | 
            +
                iii document.docid
         | 
| 11 12 | 
             
                title = document.to(:title)
         | 
| 12 13 | 
             
                assert title.include?("COVID-19")
         | 
| 13 14 | 
             
              end
         | 
| @@ -4,6 +4,7 @@ require 'rbbt/document/corpus' | |
| 4 4 | 
             
            require 'rbbt/segment'
         | 
| 5 5 | 
             
            require 'rbbt/document/annotation'
         | 
| 6 6 | 
             
            require 'rbbt/segment/named_entity'
         | 
| 7 | 
            +
            require 'rbbt/ner/abner'
         | 
| 7 8 |  | 
| 8 9 | 
             
            class TestAnnotation < Test::Unit::TestCase
         | 
| 9 10 | 
             
              class CalledOnce < Exception; end
         | 
| @@ -28,6 +29,12 @@ class TestAnnotation < Test::Unit::TestCase | |
| 28 29 | 
             
                  self.split(" ").collect{|e| NamedEntity.setup(e, :code => Misc.digest(e)) }
         | 
| 29 30 | 
             
                end
         | 
| 30 31 |  | 
| 32 | 
            +
                Document.define :abner do
         | 
| 33 | 
            +
                  $called_once = true
         | 
| 34 | 
            +
                  Abner.new.match(self)
         | 
| 35 | 
            +
                end
         | 
| 36 | 
            +
             | 
| 37 | 
            +
             | 
| 31 38 | 
             
                Document.persist :ner
         | 
| 32 39 | 
             
              end
         | 
| 33 40 |  | 
| @@ -36,7 +43,7 @@ class TestAnnotation < Test::Unit::TestCase | |
| 36 43 | 
             
                Document.setup(text, "TEST", "test_doc1", nil)
         | 
| 37 44 |  | 
| 38 45 | 
             
                corpus = {}
         | 
| 39 | 
            -
                 | 
| 46 | 
            +
                Document::Corpus.setup corpus
         | 
| 40 47 |  | 
| 41 48 | 
             
                corpus.add_document(text)
         | 
| 42 49 |  | 
| @@ -50,7 +57,7 @@ class TestAnnotation < Test::Unit::TestCase | |
| 50 57 | 
             
                Document.setup(text2, "TEST", "test_doc2", nil)
         | 
| 51 58 |  | 
| 52 59 | 
             
                corpus = {}
         | 
| 53 | 
            -
                 | 
| 60 | 
            +
                Document::Corpus.setup corpus
         | 
| 54 61 |  | 
| 55 62 | 
             
                corpus.add_document(text1)
         | 
| 56 63 | 
             
                corpus.add_document(text2)
         | 
| @@ -68,7 +75,7 @@ class TestAnnotation < Test::Unit::TestCase | |
| 68 75 | 
             
                Document.setup(text, "TEST", "test_doc1", nil)
         | 
| 69 76 |  | 
| 70 77 | 
             
                corpus = {}
         | 
| 71 | 
            -
                 | 
| 78 | 
            +
                Document::Corpus.setup corpus
         | 
| 72 79 |  | 
| 73 80 | 
             
                corpus.add_document(text)
         | 
| 74 81 |  | 
| @@ -95,7 +102,7 @@ class TestAnnotation < Test::Unit::TestCase | |
| 95 102 | 
             
                Document.setup(text, "TEST", "test_doc1", nil)
         | 
| 96 103 |  | 
| 97 104 | 
             
                corpus = {}
         | 
| 98 | 
            -
                 | 
| 105 | 
            +
                Document::Corpus.setup corpus
         | 
| 99 106 |  | 
| 100 107 | 
             
                corpus.add_document(text)
         | 
| 101 108 |  | 
| @@ -122,7 +129,7 @@ class TestAnnotation < Test::Unit::TestCase | |
| 122 129 | 
             
                Document.setup(text, "TEST", "test_doc1", nil)
         | 
| 123 130 |  | 
| 124 131 | 
             
                corpus = {}
         | 
| 125 | 
            -
                 | 
| 132 | 
            +
                Document::Corpus.setup corpus
         | 
| 126 133 |  | 
| 127 134 | 
             
                corpus.add_document(text)
         | 
| 128 135 |  | 
| @@ -133,7 +140,9 @@ class TestAnnotation < Test::Unit::TestCase | |
| 133 140 | 
             
                text.ner
         | 
| 134 141 |  | 
| 135 142 | 
             
                assert ! $called_once
         | 
| 136 | 
            -
             | 
| 143 | 
            +
             | 
| 144 | 
            +
                assert_equal text.abner.first.docid, text.docid
         | 
| 145 | 
            +
             | 
| 137 146 | 
             
                assert  text.ner.first.segid.include?("TEST:")
         | 
| 138 147 | 
             
              end
         | 
| 139 148 | 
             
            end
         | 
| @@ -26,7 +26,21 @@ class TestDocumentCorpus < Test::Unit::TestCase | |
| 26 26 |  | 
| 27 27 | 
             
                  corpus.add_document(text)
         | 
| 28 28 |  | 
| 29 | 
            -
                  assert corpus. | 
| 29 | 
            +
                  assert corpus.docids("TEST:").include?(text.docid)
         | 
| 30 | 
            +
                end
         | 
| 31 | 
            +
              end
         | 
| 32 | 
            +
             | 
| 33 | 
            +
              def test_load
         | 
| 34 | 
            +
                text = "This is a document"
         | 
| 35 | 
            +
                Document.setup(text, "TEST", "test_doc1", nil)
         | 
| 36 | 
            +
             | 
| 37 | 
            +
                TmpFile.with_file do |path|
         | 
| 38 | 
            +
                  corpus = Persist.open_tokyocabinet(path, true, :single, "BDB")
         | 
| 39 | 
            +
                  corpus.extend Document::Corpus
         | 
| 40 | 
            +
             | 
| 41 | 
            +
                  corpus.add_document(text)
         | 
| 42 | 
            +
             | 
| 43 | 
            +
                  assert corpus.docids("TEST:").include?(text.docid)
         | 
| 30 44 | 
             
                end
         | 
| 31 45 | 
             
              end
         | 
| 32 46 | 
             
            end
         | 
| @@ -5,12 +5,17 @@ Log.severity = 0 | |
| 5 5 | 
             
            class TestGNormPlus < Test::Unit::TestCase
         | 
| 6 6 | 
             
              def test_match
         | 
| 7 7 | 
             
                text =<<-EOF
         | 
| 8 | 
            -
             | 
| 8 | 
            +
             | 
| 9 | 
            +
            Introduction
         | 
| 10 | 
            +
             | 
| 11 | 
            +
            We found that TP53 is regulated by MDM2 in Homo 
         | 
| 12 | 
            +
            sapiens
         | 
| 9 13 | 
             
                EOF
         | 
| 10 14 |  | 
| 11 15 | 
             
                mentions = GNormPlus.process({:file => text})
         | 
| 16 | 
            +
             | 
| 12 17 | 
             
                assert_equal 1, mentions.length
         | 
| 13 | 
            -
                assert_equal  | 
| 18 | 
            +
                assert_equal 3, mentions["file"].length
         | 
| 14 19 | 
             
              end
         | 
| 15 20 |  | 
| 16 21 | 
             
              def test_entities
         | 
| @@ -19,7 +24,10 @@ We found that TP53 is regulated by MDM2 in Homo sapiens | |
| 19 24 | 
             
                EOF
         | 
| 20 25 |  | 
| 21 26 | 
             
                mentions = GNormPlus.entities({:file => text})
         | 
| 22 | 
            -
                mentions["file"].include? | 
| 27 | 
            +
                assert mentions["file"].include?("TP53")
         | 
| 28 | 
            +
                mentions["file"].each do |mention|
         | 
| 29 | 
            +
                  assert_equal mention, text[mention.range].sub("\n", ' ')
         | 
| 30 | 
            +
                end
         | 
| 23 31 | 
             
              end
         | 
| 24 32 | 
             
            end
         | 
| 25 33 |  | 
| @@ -0,0 +1,132 @@ | |
| 1 | 
            +
            require File.dirname(__FILE__) + '/../../test_helper'
         | 
| 2 | 
            +
            require 'rbbt'
         | 
| 3 | 
            +
            require 'rbbt/ner/rner'
         | 
| 4 | 
            +
            require 'test/unit'
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            class TestRNer < Test::Unit::TestCase
         | 
| 7 | 
            +
             | 
| 8 | 
            +
              def setup
         | 
| 9 | 
            +
                @parser = NERFeatures.new() do
         | 
| 10 | 
            +
                  isLetters     /^[A-Z]+$/i 
         | 
| 11 | 
            +
                  context prefix_3      /^(...)/ 
         | 
| 12 | 
            +
                  downcase do |w| w.downcase end
         | 
| 13 | 
            +
             | 
| 14 | 
            +
                  context %w(downcase)
         | 
| 15 | 
            +
                end
         | 
| 16 | 
            +
              end
         | 
| 17 | 
            +
             | 
| 18 | 
            +
              def test_config
         | 
| 19 | 
            +
                config = <<-EOC
         | 
| 20 | 
            +
                  isLetters     /^[A-Z]+$/i 
         | 
| 21 | 
            +
                  context prefix_3      /^(...)/ 
         | 
| 22 | 
            +
                  downcase do |w| w.downcase end
         | 
| 23 | 
            +
             | 
| 24 | 
            +
                  context %w(downcase)
         | 
| 25 | 
            +
                EOC
         | 
| 26 | 
            +
             | 
| 27 | 
            +
                assert_equal config.strip, @parser.config.strip
         | 
| 28 | 
            +
              end
         | 
| 29 | 
            +
             | 
| 30 | 
            +
              def test_reverse
         | 
| 31 | 
            +
                assert_equal("protein P53", NERFeatures.reverse("P53 protein"))
         | 
| 32 | 
            +
                assert_equal(
         | 
| 33 | 
            +
                   ". LH of assay - radioimmuno serum the with compared was LH urinary for ) GONAVIS - HI ( test hemagglutination direct new A", 
         | 
| 34 | 
            +
                 NERFeatures.reverse(
         | 
| 35 | 
            +
                   "A new direct hemagglutination test (HI-GONAVIS) for urinary LH was compared with the serum\n radioimmuno-assay of LH."
         | 
| 36 | 
            +
                  ))
         | 
| 37 | 
            +
              end
         | 
| 38 | 
            +
             | 
| 39 | 
            +
              def test_features
         | 
| 40 | 
            +
                assert_equal  @parser.features("abCdE"), ["abCdE",true,'abC','abcde']
         | 
| 41 | 
            +
              end
         | 
| 42 | 
            +
             | 
| 43 | 
            +
              def test_template
         | 
| 44 | 
            +
                template =<<-EOT
         | 
| 45 | 
            +
            UisLetters: %x[0,1]
         | 
| 46 | 
            +
            Uprefix_3: %x[0,2]
         | 
| 47 | 
            +
            Uprefix_3#1: %x[1,2]
         | 
| 48 | 
            +
            Uprefix_3#-1: %x[-1,2]
         | 
| 49 | 
            +
            Udowncase: %x[0,3]
         | 
| 50 | 
            +
            Udowncase#1: %x[1,3]
         | 
| 51 | 
            +
            Udowncase#-1: %x[-1,3]
         | 
| 52 | 
            +
            B
         | 
| 53 | 
            +
                EOT
         | 
| 54 | 
            +
                
         | 
| 55 | 
            +
                assert(@parser.template == template)
         | 
| 56 | 
            +
              end
         | 
| 57 | 
            +
             | 
| 58 | 
            +
              def test_tokens
         | 
| 59 | 
            +
                assert( NERFeatures.tokens("A new direct hemagglutination test (HI-GONAVIS) for urinary LH was compared with the serum\n radioimmuno-assay of LH.")==
         | 
| 60 | 
            +
                       ["A", "new", "direct", "hemagglutination", "test", "(", "HI", "-", "GONAVIS", ")", "for", "urinary", "LH", "was", "compared", "with", "the", "serum", "radioimmuno", "-", "assay", "of", "LH", "."])
         | 
| 61 | 
            +
             | 
| 62 | 
            +
             | 
| 63 | 
            +
              end
         | 
| 64 | 
            +
              def test_text_features
         | 
| 65 | 
            +
             
         | 
| 66 | 
            +
                assert(@parser.text_features("abCdE 1234") == [["abCdE",true, "abC", "abcde"], ["1234",false, "123", "1234"]])
         | 
| 67 | 
            +
                assert(@parser.text_features("abCdE 1234",true) == [["abCdE",true, "abC", "abcde",1], ["1234",false, "123", "1234",2]])
         | 
| 68 | 
            +
                assert(@parser.text_features("abCdE 1234",false) == [["abCdE",true, "abC", "abcde",0], ["1234",false, "123", "1234",0]])
         | 
| 69 | 
            +
               
         | 
| 70 | 
            +
              end
         | 
| 71 | 
            +
             | 
| 72 | 
            +
              def test_tagged_features
         | 
| 73 | 
            +
                assert_equal(
         | 
| 74 | 
            +
                  [["phosphorilation",true, "pho", "phosphorilation", 0], 
         | 
| 75 | 
            +
                    ["of",true, false, "of", 0], 
         | 
| 76 | 
            +
                    ["GENE1",false, "GEN", "gene1", 1],
         | 
| 77 | 
            +
                    [".", false, false, ".", 0]],
         | 
| 78 | 
            +
                  @parser.tagged_features("phosphorilation of GENE1.",['GENE1']))
         | 
| 79 | 
            +
             | 
| 80 | 
            +
                  assert_equal(
         | 
| 81 | 
            +
                    [["GENE1",false, "GEN", "gene1", 1],
         | 
| 82 | 
            +
                      ["phosphorilation",true, "pho", "phosphorilation", 0]], 
         | 
| 83 | 
            +
                  @parser.tagged_features("GENE1 phosphorilation",['GENE1']))
         | 
| 84 | 
            +
             | 
| 85 | 
            +
             
         | 
| 86 | 
            +
                assert_equal(
         | 
| 87 | 
            +
                       [["phosphorilation",true, "pho", "phosphorilation", 0], 
         | 
| 88 | 
            +
                        ["of",true, false, "of", 0], 
         | 
| 89 | 
            +
                        ["GENE",true, "GEN", "gene", 1],
         | 
| 90 | 
            +
                        ["1",false, false, "1", 2],
         | 
| 91 | 
            +
                        [".", false, false, ".", 0]],
         | 
| 92 | 
            +
                  @parser.tagged_features("phosphorilation of GENE 1.",['GENE 1']))
         | 
| 93 | 
            +
              end
         | 
| 94 | 
            +
             | 
| 95 | 
            +
              def test_tagged_features_reverse
         | 
| 96 | 
            +
                @parser.reverse = true
         | 
| 97 | 
            +
                assert_equal(
         | 
| 98 | 
            +
                  [
         | 
| 99 | 
            +
                    ["GENE1",false, "GEN", "gene1", 1],
         | 
| 100 | 
            +
                    ["of",true, false, "of", 0], 
         | 
| 101 | 
            +
                    ["phosphorilation",true, "pho", "phosphorilation", 0]
         | 
| 102 | 
            +
                ],
         | 
| 103 | 
            +
                @parser.tagged_features("phosphorilation of GENE1",['GENE1']))
         | 
| 104 | 
            +
             | 
| 105 | 
            +
                assert_equal(
         | 
| 106 | 
            +
                      [
         | 
| 107 | 
            +
                        [".", false, false, ".", 0],
         | 
| 108 | 
            +
                        ["1",false, false, "1", 1],
         | 
| 109 | 
            +
                        ["GENE",true, "GEN", "gene", 2],
         | 
| 110 | 
            +
                        ["of",true, false, "of", 0], 
         | 
| 111 | 
            +
                        ["phosphorilation",true, "pho", "phosphorilation", 0]
         | 
| 112 | 
            +
                    ],
         | 
| 113 | 
            +
                @parser.tagged_features("phosphorilation of GENE 1.",['GENE 1']))
         | 
| 114 | 
            +
              end
         | 
| 115 | 
            +
             | 
| 116 | 
            +
              def test_default_config
         | 
| 117 | 
            +
                require 'rbbt/bow/misc'
         | 
| 118 | 
            +
                text =<<-EOF
         | 
| 119 | 
            +
            This text explains how MDM2 interacts with TP53.
         | 
| 120 | 
            +
                EOF
         | 
| 121 | 
            +
                @parser = NERFeatures.new Rbbt.share.rner["config.rb"].find
         | 
| 122 | 
            +
                features = @parser.tagged_features text, %w(TP53 MDM2)
         | 
| 123 | 
            +
                assert features.first.first == "This"
         | 
| 124 | 
            +
              end
         | 
| 125 | 
            +
             | 
| 126 | 
            +
             | 
| 127 | 
            +
             | 
| 128 | 
            +
              def __test_CRFPP_install
         | 
| 129 | 
            +
                assert(require File.join(Rbbt.datadir, 'third_party/crf++/ruby/CRFPP'))
         | 
| 130 | 
            +
              end
         | 
| 131 | 
            +
             | 
| 132 | 
            +
            end
         | 
| @@ -7,13 +7,37 @@ class TestNLP < Test::Unit::TestCase | |
| 7 7 | 
             
            This is a sentence.    
         | 
| 8 8 | 
             
            A funky character ™ in a sentence.
         | 
| 9 9 | 
             
            This is a sentence.    
         | 
| 10 | 
            -
            This is a 
         | 
| 10 | 
            +
            This is a broken
         | 
| 11 11 | 
             
            sentence. This is
         | 
| 12 | 
            -
            another sentence. 
         | 
| 12 | 
            +
            another broken sentence. 
         | 
| 13 13 | 
             
                EOF
         | 
| 14 14 |  | 
| 15 | 
            -
                 | 
| 15 | 
            +
                iii NLP.geniass_sentence_splitter(text)
         | 
| 16 | 
            +
                assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter(text)[2].strip
         | 
| 16 17 | 
             
              end
         | 
| 17 18 |  | 
| 19 | 
            +
              def test_sentences_2
         | 
| 20 | 
            +
                text =<<-EOF
         | 
| 21 | 
            +
            This is a sentence.    
         | 
| 22 | 
            +
            This is a sentence.    
         | 
| 23 | 
            +
            This is a broken
         | 
| 24 | 
            +
            sentence. This is
         | 
| 25 | 
            +
            another broken sentence. 
         | 
| 26 | 
            +
                EOF
         | 
| 27 | 
            +
             | 
| 28 | 
            +
                assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter(text)[2].strip
         | 
| 29 | 
            +
              end
         | 
| 30 | 
            +
             | 
| 31 | 
            +
              def test_sentences_ext
         | 
| 32 | 
            +
                text =<<-EOF
         | 
| 33 | 
            +
            This is a sentence.    
         | 
| 34 | 
            +
            This is a sentence.    
         | 
| 35 | 
            +
            This is a broken
         | 
| 36 | 
            +
            sentence. This is
         | 
| 37 | 
            +
            another broken sentence. 
         | 
| 38 | 
            +
                EOF
         | 
| 39 | 
            +
             | 
| 40 | 
            +
                assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter_extension(text)[2].strip
         | 
| 41 | 
            +
              end
         | 
| 18 42 | 
             
            end
         | 
| 19 43 |  | 
| @@ -12,18 +12,17 @@ class TestAnnotation < Test::Unit::TestCase | |
| 12 12 | 
             
                segment = Segment.setup("is", :offset => text.index("is"), :docid => text.docid)
         | 
| 13 13 | 
             
                annotation = SegmentAnnotation.setup(segment, :type => :verb)
         | 
| 14 14 |  | 
| 15 | 
            -
                assert_equal 'verb', annotation.annotid.split(":") | 
| 15 | 
            +
                assert_equal 'verb', annotation.annotid.split(":")[5]
         | 
| 16 16 |  | 
| 17 17 | 
             
                annotation = SegmentAnnotation.setup(segment.segid, :type => :verb)
         | 
| 18 | 
            -
                assert_equal 'verb', annotation.annotid.split(":") | 
| 18 | 
            +
                assert_equal 'verb', annotation.annotid.split(":")[5]
         | 
| 19 19 | 
             
              end
         | 
| 20 20 |  | 
| 21 21 | 
             
              def test_annotid
         | 
| 22 22 | 
             
                text = "This is a document"
         | 
| 23 23 | 
             
                Document.setup(text, "TEST", "test_doc1", nil)
         | 
| 24 24 |  | 
| 25 | 
            -
                corpus = {}
         | 
| 26 | 
            -
                corpus.extend Document::Corpus
         | 
| 25 | 
            +
                corpus = Document::Corpus.setup({})
         | 
| 27 26 |  | 
| 28 27 | 
             
                corpus.add_document(text)
         | 
| 29 28 |  | 
| @@ -2,7 +2,7 @@ require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helpe | |
| 2 2 | 
             
            require 'rbbt/segment/encoding'
         | 
| 3 3 |  | 
| 4 4 | 
             
            class TestEncoding < Test::Unit::TestCase
         | 
| 5 | 
            -
              def  | 
| 5 | 
            +
              def test_bad_chars
         | 
| 6 6 | 
             
                text = "A funky character ™ in a sentence."
         | 
| 7 7 |  | 
| 8 8 | 
             
                assert_equal ["™"], Segment.bad_chars(text)
         |