rbbt-text 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/rbbt/corpus/corpus.rb +15 -6
- data/lib/rbbt/corpus/document.rb +100 -127
- data/lib/rbbt/corpus/document_repo.rb +72 -51
- data/lib/rbbt/ner/NER.rb +4 -4
- data/lib/rbbt/ner/abner.rb +5 -4
- data/lib/rbbt/ner/banner.rb +3 -3
- data/lib/rbbt/ner/chemical_tagger.rb +3 -3
- data/lib/rbbt/ner/ngram_prefix_dictionary.rb +45 -10
- data/lib/rbbt/ner/oscar3.rb +3 -3
- data/lib/rbbt/ner/oscar4.rb +3 -3
- data/lib/rbbt/ner/patterns.rb +15 -13
- data/lib/rbbt/ner/regexpNER.rb +3 -2
- data/lib/rbbt/ner/rnorm.rb +2 -2
- data/lib/rbbt/ner/rnorm/cue_index.rb +2 -2
- data/lib/rbbt/ner/{annotations.rb → segment.rb} +161 -109
- data/lib/rbbt/ner/{annotations → segment}/named_entity.rb +3 -11
- data/lib/rbbt/ner/segment/relationship.rb +20 -0
- data/lib/rbbt/ner/segment/segmented.rb +13 -0
- data/lib/rbbt/ner/segment/token.rb +24 -0
- data/lib/rbbt/ner/{annotations → segment}/transformed.rb +10 -10
- data/lib/rbbt/ner/token_trieNER.rb +30 -22
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +2 -1
- data/lib/rbbt/nlp/nlp.rb +23 -37
- data/test/rbbt/corpus/test_document.rb +39 -37
- data/test/rbbt/ner/segment/test_named_entity.rb +29 -0
- data/test/rbbt/ner/segment/test_segmented.rb +23 -0
- data/test/rbbt/ner/{annotations → segment}/test_transformed.rb +6 -6
- data/test/rbbt/ner/test_ngram_prefix_dictionary.rb +15 -1
- data/test/rbbt/ner/test_patterns.rb +11 -12
- data/test/rbbt/ner/test_regexpNER.rb +5 -4
- data/test/rbbt/ner/test_segment.rb +101 -0
- data/test/rbbt/ner/test_token_trieNER.rb +8 -9
- data/test/test_helper.rb +6 -6
- metadata +40 -22
- data/lib/rbbt/ner/annotations/annotated.rb +0 -15
- data/lib/rbbt/ner/annotations/relations.rb +0 -25
- data/lib/rbbt/ner/annotations/token.rb +0 -28
- data/test/rbbt/ner/annotations/test_named_entity.rb +0 -14
- data/test/rbbt/ner/test_annotations.rb +0 -70
| @@ -0,0 +1,29 @@ | |
| 1 | 
            +
            require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
         | 
| 2 | 
            +
            require 'rbbt/ner/segment'
         | 
| 3 | 
            +
            require 'rbbt/ner/segment/named_entity'
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            class TestClass < Test::Unit::TestCase
         | 
| 6 | 
            +
              def test_info
         | 
| 7 | 
            +
                a = "test"
         | 
| 8 | 
            +
                NamedEntity.setup a
         | 
| 9 | 
            +
                assert(! a.info.keys.include?(:code))
         | 
| 10 | 
            +
                a.code = 10
         | 
| 11 | 
            +
                a.offset = 100
         | 
| 12 | 
            +
                assert a.info.include? :code
         | 
| 13 | 
            +
                assert a.info.include? :offset
         | 
| 14 | 
            +
              end
         | 
| 15 | 
            +
             | 
| 16 | 
            +
              def test_all_args
         | 
| 17 | 
            +
                a = "test"
         | 
| 18 | 
            +
                NamedEntity.setup a, 10, "TYPE", "CODE", "SCORE"
         | 
| 19 | 
            +
                assert_equal 10, a.offset
         | 
| 20 | 
            +
              end
         | 
| 21 | 
            +
             | 
| 22 | 
            +
              def test_tsv
         | 
| 23 | 
            +
                a = "test"
         | 
| 24 | 
            +
                NamedEntity.setup a, 10, "TYPE", "CODE", "SCORE"
         | 
| 25 | 
            +
                assert Segment.tsv([a]).fields.include? "code"
         | 
| 26 | 
            +
                assert Segment.tsv([a], nil).fields.include? "code"
         | 
| 27 | 
            +
                assert Segment.tsv([a], "literal").fields.include? "code"
         | 
| 28 | 
            +
              end
         | 
| 29 | 
            +
            end
         | 
| @@ -0,0 +1,23 @@ | |
| 1 | 
            +
            require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
         | 
| 2 | 
            +
            require 'rbbt/ner/segment/segmented'
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            class TestClass < Test::Unit::TestCase
         | 
| 5 | 
            +
              def test_split
         | 
| 6 | 
            +
                a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
         | 
| 7 | 
            +
             | 
| 8 | 
            +
                gene1 = "TP53"
         | 
| 9 | 
            +
                gene1.extend Segment
         | 
| 10 | 
            +
                gene1.offset = a.index gene1
         | 
| 11 | 
            +
             | 
| 12 | 
            +
                gene2 = "CDK5R1"
         | 
| 13 | 
            +
                gene2.extend Segment
         | 
| 14 | 
            +
                gene2.offset = a.index gene2
         | 
| 15 | 
            +
             | 
| 16 | 
            +
                gene3 = "TP53 gene"
         | 
| 17 | 
            +
                gene3.extend Segment
         | 
| 18 | 
            +
                gene3.offset = a.index gene3
         | 
| 19 | 
            +
             | 
| 20 | 
            +
                Segmented.setup(a, [gene2, gene1, gene3])
         | 
| 21 | 
            +
                assert_equal ["This sentence mentions the ", gene3, " and the ", gene2, " protein"], a.split_segments
         | 
| 22 | 
            +
              end
         | 
| 23 | 
            +
            end
         | 
| @@ -1,6 +1,6 @@ | |
| 1 1 | 
             
            require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
         | 
| 2 | 
            -
            require 'rbbt/ner/ | 
| 3 | 
            -
            require 'rbbt/ner/ | 
| 2 | 
            +
            require 'rbbt/ner/segment/transformed'
         | 
| 3 | 
            +
            require 'rbbt/ner/segment/named_entity'
         | 
| 4 4 |  | 
| 5 5 | 
             
            class TestClass < Test::Unit::TestCase
         | 
| 6 6 | 
             
              def test_transform
         | 
| @@ -8,11 +8,11 @@ class TestClass < Test::Unit::TestCase | |
| 8 8 | 
             
                original = a.dup
         | 
| 9 9 |  | 
| 10 10 | 
             
                gene1 = "TP53"
         | 
| 11 | 
            -
                gene1.extend  | 
| 11 | 
            +
                gene1.extend Segment
         | 
| 12 12 | 
             
                gene1.offset = a.index gene1
         | 
| 13 13 |  | 
| 14 14 | 
             
                gene2 = "CDK5"
         | 
| 15 | 
            -
                gene2.extend  | 
| 15 | 
            +
                gene2.extend Segment
         | 
| 16 16 | 
             
                gene2.offset = a.index gene2
         | 
| 17 17 |  | 
| 18 18 | 
             
                assert_equal gene1, a[gene1.range]
         | 
| @@ -30,7 +30,7 @@ class TestClass < Test::Unit::TestCase | |
| 30 30 |  | 
| 31 31 |  | 
| 32 32 | 
             
                gene3 = "GN gene"
         | 
| 33 | 
            -
                gene3.extend  | 
| 33 | 
            +
                gene3.extend Segment
         | 
| 34 34 | 
             
                gene3.offset = a.index gene3
         | 
| 35 35 |  | 
| 36 36 | 
             
                assert_equal gene3, a[gene3.range]
         | 
| @@ -108,7 +108,7 @@ class TestClass < Test::Unit::TestCase | |
| 108 108 |  | 
| 109 109 | 
             
              def test_html_with_offset
         | 
| 110 110 | 
             
                a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
         | 
| 111 | 
            -
                Segment. | 
| 111 | 
            +
                Segment.setup(a, 10)
         | 
| 112 112 |  | 
| 113 113 | 
             
                gene1 = "TP53"
         | 
| 114 114 | 
             
                gene1.extend NamedEntity
         | 
| @@ -10,7 +10,21 @@ C2;11;22;3 3;bb | |
| 10 10 | 
             
                EOF
         | 
| 11 11 |  | 
| 12 12 | 
             
                TmpFile.with_file(lexicon) do |file|
         | 
| 13 | 
            -
                  index = NGramPrefixDictionary.new(TSV. | 
| 13 | 
            +
                  index = NGramPrefixDictionary.new(TSV.open(file, :flat, :sep => ';'), "test")
         | 
| 14 | 
            +
             | 
| 15 | 
            +
                  matches = index.match(' asdfa dsf asdf aa asdfasdf ')
         | 
| 16 | 
            +
                  assert matches.select{|m| m.code.include? 'C1'}.any?
         | 
| 17 | 
            +
                end
         | 
| 18 | 
            +
              end
         | 
| 19 | 
            +
             | 
| 20 | 
            +
              def test_stream
         | 
| 21 | 
            +
                lexicon =<<-EOF
         | 
| 22 | 
            +
            C1;aa;AA;bb b
         | 
| 23 | 
            +
            C2;11;22;3 3;bb
         | 
| 24 | 
            +
                EOF
         | 
| 25 | 
            +
             | 
| 26 | 
            +
                TmpFile.with_file(lexicon.gsub(/;/,"\t")) do |file|
         | 
| 27 | 
            +
                  index = NGramPrefixDictionary.new(file, "test")
         | 
| 14 28 |  | 
| 15 29 | 
             
                  matches = index.match(' asdfa dsf asdf aa asdfasdf ')
         | 
| 16 30 | 
             
                  assert matches.select{|m| m.code.include? 'C1'}.any?
         | 
| @@ -6,33 +6,32 @@ class TestPatternRelExt < Test::Unit::TestCase | |
| 6 6 | 
             
                text = "Experiments have shown that TP53 interacts with CDK5 under certain conditions"
         | 
| 7 7 |  | 
| 8 8 | 
             
                gene1 = "TP53"
         | 
| 9 | 
            -
                NamedEntity. | 
| 9 | 
            +
                NamedEntity.setup(gene1, text.index(gene1), "Gene")
         | 
| 10 10 |  | 
| 11 11 | 
             
                gene2 = "CDK5"
         | 
| 12 | 
            -
                NamedEntity. | 
| 12 | 
            +
                NamedEntity.setup(gene2, text.index(gene2), "Gene")
         | 
| 13 13 |  | 
| 14 14 | 
             
                interaction = "interacts"
         | 
| 15 | 
            -
                NamedEntity. | 
| 15 | 
            +
                NamedEntity.setup(interaction, text.index(interaction), "Interaction")
         | 
| 16 16 |  | 
| 17 | 
            -
                 | 
| 17 | 
            +
                Segmented.setup(text, [gene1, gene2, interaction])
         | 
| 18 18 |  | 
| 19 19 | 
             
                assert_equal "TP53 interacts with CDK5", PatternRelExt.simple_pattern(text, "GENE INTERACTION with GENE").first
         | 
| 20 | 
            -
             | 
| 21 20 | 
             
              end
         | 
| 22 21 |  | 
| 23 22 | 
             
              def test_chunk_pattern
         | 
| 24 23 | 
             
                text = "Experiments have shown that TP53 found in cultivated cells interacts with CDK5 under certain conditions"
         | 
| 25 24 |  | 
| 26 25 | 
             
                gene1 = "TP53"
         | 
| 27 | 
            -
                NamedEntity. | 
| 26 | 
            +
                NamedEntity.setup(gene1, text.index(gene1), "Gene")
         | 
| 28 27 |  | 
| 29 28 | 
             
                gene2 = "CDK5"
         | 
| 30 | 
            -
                NamedEntity. | 
| 29 | 
            +
                NamedEntity.setup(gene2, text.index(gene2), "Gene")
         | 
| 31 30 |  | 
| 32 31 | 
             
                interaction = "interacts"
         | 
| 33 | 
            -
                NamedEntity. | 
| 32 | 
            +
                NamedEntity.setup(interaction, text.index(interaction), "Interaction")
         | 
| 34 33 |  | 
| 35 | 
            -
                 | 
| 34 | 
            +
                Segmented.setup(text, {:entities => [gene1, gene2, interaction]})
         | 
| 36 35 |  | 
| 37 36 | 
             
                assert_equal "TP53 found in cultivated cells interacts with CDK5", 
         | 
| 38 37 | 
             
                  PatternRelExt.new("NP[entity:Gene] VP[stem:interacts] with NP[entity:Gene]").match_sentences([text]).first.first
         | 
| @@ -45,12 +44,12 @@ class TestPatternRelExt < Test::Unit::TestCase | |
| 45 44 | 
             
                text = "There is a concern with the use of thiazolidinediones in patients with an increased risk of colon cancer (e.g., familial colon polyposis)."
         | 
| 46 45 |  | 
| 47 46 | 
             
                drug = "thiazolidinediones"
         | 
| 48 | 
            -
                NamedEntity. | 
| 47 | 
            +
                NamedEntity.setup(drug, text.index(drug), "Chemical Mention")
         | 
| 49 48 |  | 
| 50 49 | 
             
                disease = "colon cancer"
         | 
| 51 | 
            -
                NamedEntity. | 
| 50 | 
            +
                NamedEntity.setup(disease, text.index(disease), "disease")
         | 
| 52 51 |  | 
| 53 | 
            -
                 | 
| 52 | 
            +
                Segmented.setup(text, {:entitites => [drug, disease]})
         | 
| 54 53 |  | 
| 55 54 | 
             
                assert_equal "thiazolidinediones in patients with an increased risk of colon cancer", 
         | 
| 56 55 | 
             
                  PatternRelExt.new("NP[entity:Chemical Mention] NP[stem:risk] NP[entity:disease]").match_sentences([text]).first.first
         | 
| @@ -55,12 +55,13 @@ class TestRegExpNER < Test::Unit::TestCase | |
| 55 55 | 
             
                assert_equal "In this sentence I should find ".length, matches.select{|m| m.type == :this}[1].offset
         | 
| 56 56 | 
             
                assert_equal :this, matches.select{|m| m.type == :this}[0].type
         | 
| 57 57 |  | 
| 58 | 
            -
                 | 
| 58 | 
            +
                Segmented.setup(sentence)
         | 
| 59 59 | 
             
                ner_this = RegExpNER.new({:this => /this/})
         | 
| 60 60 | 
             
                ner_that = RegExpNER.new({:that => /that/})
         | 
| 61 | 
            -
                sentence. | 
| 62 | 
            -
                sentence. | 
| 63 | 
            -
                 | 
| 61 | 
            +
                sentence.segments ||= []
         | 
| 62 | 
            +
                sentence.segments += ner_this.entities(sentence)
         | 
| 63 | 
            +
                sentence.segments += ner_that.entities(sentence)
         | 
| 64 | 
            +
                matches = sentence.segments
         | 
| 64 65 |  | 
| 65 66 | 
             
                assert_equal ["this", "this", "that"].sort, matches.sort
         | 
| 66 67 | 
             
                assert_equal "In ".length, matches.select{|m| m.type == :this}[0].offset
         | 
| @@ -0,0 +1,101 @@ | |
| 1 | 
            +
            require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
         | 
| 2 | 
            +
            require 'rbbt/ner/segment'
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            class TestClass < Test::Unit::TestCase
         | 
| 5 | 
            +
              def test_info
         | 
| 6 | 
            +
                a = "test"
         | 
| 7 | 
            +
                a.extend Segment
         | 
| 8 | 
            +
                a.offset = 10
         | 
| 9 | 
            +
                assert a.info.include? :offset
         | 
| 10 | 
            +
              end
         | 
| 11 | 
            +
             | 
| 12 | 
            +
              def test_sort
         | 
| 13 | 
            +
                a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
         | 
| 14 | 
            +
             | 
| 15 | 
            +
                gene1 = "TP53"
         | 
| 16 | 
            +
                gene1.extend Segment
         | 
| 17 | 
            +
                gene1.offset = a.index gene1
         | 
| 18 | 
            +
             | 
| 19 | 
            +
                gene2 = "CDK5R1"
         | 
| 20 | 
            +
                gene2.extend Segment
         | 
| 21 | 
            +
                gene2.offset = a.index gene2
         | 
| 22 | 
            +
             | 
| 23 | 
            +
                assert_equal [gene1,gene2], Segment.sort([gene2,gene1])
         | 
| 24 | 
            +
              end
         | 
| 25 | 
            +
             | 
| 26 | 
            +
              def test_clean_sort
         | 
| 27 | 
            +
                a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
         | 
| 28 | 
            +
             | 
| 29 | 
            +
                gene1 = "TP53"
         | 
| 30 | 
            +
                gene1.extend Segment
         | 
| 31 | 
            +
                gene1.offset = a.index gene1
         | 
| 32 | 
            +
             | 
| 33 | 
            +
                gene2 = "CDK5R1"
         | 
| 34 | 
            +
                gene2.extend Segment
         | 
| 35 | 
            +
                gene2.offset = a.index gene2
         | 
| 36 | 
            +
             | 
| 37 | 
            +
                gene3 = "TP53 gene"
         | 
| 38 | 
            +
                gene3.extend Segment
         | 
| 39 | 
            +
                gene3.offset = a.index gene3
         | 
| 40 | 
            +
             | 
| 41 | 
            +
                assert_equal [gene3,gene2], Segment.clean_sort([gene2,gene1,gene3])
         | 
| 42 | 
            +
              end
         | 
| 43 | 
            +
             | 
| 44 | 
            +
              def test_split
         | 
| 45 | 
            +
                a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
         | 
| 46 | 
            +
             | 
| 47 | 
            +
                gene1 = "TP53"
         | 
| 48 | 
            +
                gene1.extend Segment
         | 
| 49 | 
            +
                gene1.offset = a.index gene1
         | 
| 50 | 
            +
             | 
| 51 | 
            +
                gene2 = "CDK5R1"
         | 
| 52 | 
            +
                gene2.extend Segment
         | 
| 53 | 
            +
                gene2.offset = a.index gene2
         | 
| 54 | 
            +
             | 
| 55 | 
            +
                gene3 = "TP53 gene"
         | 
| 56 | 
            +
                gene3.extend Segment
         | 
| 57 | 
            +
                gene3.offset = a.index gene3
         | 
| 58 | 
            +
             | 
| 59 | 
            +
                assert_equal ["This sentence mentions the ", gene3, " and the ", gene2, " protein"], Segment.split(a, [gene2,gene1,gene3])
         | 
| 60 | 
            +
              end
         | 
| 61 | 
            +
             | 
| 62 | 
            +
             | 
| 63 | 
            +
              def test_align
         | 
| 64 | 
            +
                text =<<-EOF
         | 
| 65 | 
            +
            Atypical teratoid/rhabdoid tumors (AT/RTs) are highly aggressive brain tumors of early childhood poorly responding to therapy.
         | 
| 66 | 
            +
                EOF
         | 
| 67 | 
            +
             | 
| 68 | 
            +
                parts = text.split(/\W/)
         | 
| 69 | 
            +
                Segment.align(text, parts)
         | 
| 70 | 
            +
             | 
| 71 | 
            +
                assert_equal "Atypical teratoid/".length, parts.select{|s| s == "rhabdoid"}.first.offset
         | 
| 72 | 
            +
              end
         | 
| 73 | 
            +
             | 
| 74 | 
            +
              def test_segment_index
         | 
| 75 | 
            +
                a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
         | 
| 76 | 
            +
             | 
| 77 | 
            +
                gene1 = "TP53"
         | 
| 78 | 
            +
                gene1.extend Segment
         | 
| 79 | 
            +
                gene1.offset = a.index gene1
         | 
| 80 | 
            +
             | 
| 81 | 
            +
                gene2 = "CDK5R1"
         | 
| 82 | 
            +
                gene2.extend Segment
         | 
| 83 | 
            +
                gene2.offset = a.index gene2
         | 
| 84 | 
            +
             | 
| 85 | 
            +
                gene3 = "TP53 gene"
         | 
| 86 | 
            +
                gene3.extend Segment
         | 
| 87 | 
            +
                gene3.offset = a.index gene3
         | 
| 88 | 
            +
             | 
| 89 | 
            +
                index = Segment.index([gene1, gene2, gene3])
         | 
| 90 | 
            +
                assert_equal %w(CDK5R1), index[gene2.offset + 1]
         | 
| 91 | 
            +
             | 
| 92 | 
            +
                TmpFile.with_file do |fwt|
         | 
| 93 | 
            +
                  index = Segment.index([gene1, gene2, gene3], fwt)
         | 
| 94 | 
            +
                  assert_equal %w(CDK5R1), index[gene2.offset + 1]
         | 
| 95 | 
            +
                  index = Segment.index([gene1, gene2, gene3], fwt)
         | 
| 96 | 
            +
                  assert_equal %w(CDK5R1), index[gene2.offset + 1]
         | 
| 97 | 
            +
                end
         | 
| 98 | 
            +
              end
         | 
| 99 | 
            +
             | 
| 100 | 
            +
            end
         | 
| 101 | 
            +
             | 
| @@ -30,7 +30,7 @@ C2;11;22;3 3;bb | |
| 30 30 |  | 
| 31 31 | 
             
                TmpFile.with_file(lexicon) do |file|
         | 
| 32 32 |  | 
| 33 | 
            -
                  index = TokenTrieNER.process({}, TSV. | 
| 33 | 
            +
                  index = TokenTrieNER.process({}, TSV.open(file, :flat, :sep => ';'))
         | 
| 34 34 |  | 
| 35 35 | 
             
                  assert_equal ['AA', 'aa', 'bb', '11', '22', '3'].sort, index.keys.sort
         | 
| 36 36 | 
             
                  assert_equal [:END], index['aa'].keys
         | 
| @@ -47,7 +47,7 @@ C2;11;22;3 3;bb | |
| 47 47 |  | 
| 48 48 |  | 
| 49 49 | 
             
                TmpFile.with_file(lexicon) do |file|
         | 
| 50 | 
            -
                  index = TokenTrieNER.process({}, TSV. | 
| 50 | 
            +
                  index = TokenTrieNER.process({}, TSV.open(file, :sep => ';', :type => :flat ))
         | 
| 51 51 |  | 
| 52 52 | 
             
                  assert TokenTrieNER.find(index, TokenTrieNER.tokenize('aa asdf').extend(TokenTrieNER::EnumeratedArray), false).first.collect{|c| c.code}.include?   'C1'
         | 
| 53 53 | 
             
                  assert_equal %w(aa), TokenTrieNER.find(index, TokenTrieNER.tokenize('aa asdf').extend(TokenTrieNER::EnumeratedArray), false).last
         | 
| @@ -71,9 +71,8 @@ C2;11;22;3 3;bb | |
| 71 71 | 
             
                EOF
         | 
| 72 72 |  | 
| 73 73 | 
             
                TmpFile.with_file(lexicon) do |file|
         | 
| 74 | 
            -
                  index = TokenTrieNER.new("test", TSV. | 
| 74 | 
            +
                  index = TokenTrieNER.new("test", TSV.open(file, :flat, :sep => ';'))
         | 
| 75 75 |  | 
| 76 | 
            -
                  index.match(' asdfa dsf asdf aa asdfasdf ')
         | 
| 77 76 | 
             
                  assert index.match(' asdfa dsf asdf aa asdfasdf ').select{|m| m.code.include? 'C1'}.any?
         | 
| 78 77 | 
             
                end
         | 
| 79 78 | 
             
              end
         | 
| @@ -81,18 +80,19 @@ C2;11;22;3 3;bb | |
| 81 80 | 
             
              def test_slack
         | 
| 82 81 | 
             
                lexicon =<<-EOF
         | 
| 83 82 | 
             
            C1;aa;AA;bb cc cc b
         | 
| 84 | 
            -
            C2;11;22;3 3;bb
         | 
| 83 | 
            +
            C2;11;22;3 3;bb;bbbb
         | 
| 85 84 | 
             
                EOF
         | 
| 86 85 |  | 
| 87 86 | 
             
                TmpFile.with_file(lexicon) do |file|
         | 
| 88 87 | 
             
                  index = TokenTrieNER.new({})
         | 
| 89 88 | 
             
                  index.slack = Proc.new{|t| t =~ /^c*$/}
         | 
| 90 89 |  | 
| 91 | 
            -
                  index.merge TSV. | 
| 90 | 
            +
                  index.merge TSV.open(file, :flat, :sep => ';')
         | 
| 92 91 |  | 
| 93 92 | 
             
                  assert index.match(' aaaaa 3 cc 3').select{|m| m.code.include? 'C2'}.any?
         | 
| 94 93 | 
             
                  assert index.match(' bb cc b').select{|m| m.code.include? 'C1'}.any?
         | 
| 95 94 | 
             
                  assert index.match(' bb b').select{|m| m.code.include? 'C1'}.any?
         | 
| 95 | 
            +
                  assert index.match(' BBBB b').select{|m| m.code.include? 'C2'}.any?
         | 
| 96 96 | 
             
                end
         | 
| 97 97 | 
             
              end
         | 
| 98 98 |  | 
| @@ -106,7 +106,7 @@ C2;11;22;3 3;bb | |
| 106 106 | 
             
                  index = TokenTrieNER.new({})
         | 
| 107 107 | 
             
                  index.slack = Proc.new{|t| t =~ /^c*$/}
         | 
| 108 108 |  | 
| 109 | 
            -
                  index.merge TSV. | 
| 109 | 
            +
                  index.merge TSV.open(file, :flat, :sep => ';')
         | 
| 110 110 |  | 
| 111 111 | 
             
                  assert index.match(Token.tokenize('3 cc 3')).select{|m| m.code.include? 'C2'}.any?
         | 
| 112 112 | 
             
                end
         | 
| @@ -126,9 +126,8 @@ C2;11;22;3 3;bb | |
| 126 126 | 
             
                EOF
         | 
| 127 127 |  | 
| 128 128 | 
             
                TmpFile.with_file(lexicon) do |file|
         | 
| 129 | 
            -
                  index = TokenTrieNER.new("test", TSV. | 
| 129 | 
            +
                  index = TokenTrieNER.new("test", TSV.open(file, :flat, :sep => ';'), :persistence => true)
         | 
| 130 130 |  | 
| 131 | 
            -
                  index.match(' asdfa dsf asdf aa asdfasdf ')
         | 
| 132 131 | 
             
                  assert index.match(' asdfa dsf asdf aa asdfasdf ').select{|m| m.code.include? 'C1'}.any?
         | 
| 133 132 | 
             
                end
         | 
| 134 133 | 
             
              end
         | 
    
        data/test/test_helper.rb
    CHANGED
    
    | @@ -3,7 +3,7 @@ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib')) | |
| 3 3 | 
             
            $LOAD_PATH.unshift(File.dirname(__FILE__))
         | 
| 4 4 |  | 
| 5 5 | 
             
            require 'rbbt'
         | 
| 6 | 
            -
            require 'rbbt/ | 
| 6 | 
            +
            require 'rbbt/persist'
         | 
| 7 7 | 
             
            require 'rbbt/util/tmpfile'
         | 
| 8 8 | 
             
            require 'rbbt/util/log'
         | 
| 9 9 | 
             
            require 'rbbt/corpus/document_repo'
         | 
| @@ -15,15 +15,15 @@ class Test::Unit::TestCase | |
| 15 15 |  | 
| 16 16 | 
             
              def setup
         | 
| 17 17 | 
             
                FileUtils.mkdir_p Rbbt.tmp.test.persistence.find(:user)
         | 
| 18 | 
            -
                 | 
| 18 | 
            +
                Persist.cachedir = Rbbt.tmp.test.persistence.find :user
         | 
| 19 19 | 
             
              end
         | 
| 20 20 |  | 
| 21 21 | 
             
              def teardown
         | 
| 22 22 | 
             
                FileUtils.rm_rf Rbbt.tmp.test.find :user
         | 
| 23 | 
            -
                 | 
| 24 | 
            -
                 | 
| 25 | 
            -
                DocumentRepo:: | 
| 26 | 
            -
                DocumentRepo:: | 
| 23 | 
            +
                Persist::TC_CONNECTIONS.values.each do |c| c.close end
         | 
| 24 | 
            +
                Persist::TC_CONNECTIONS.clear
         | 
| 25 | 
            +
                DocumentRepo::TC_CONNECTIONS.values.each do |c| c.close end
         | 
| 26 | 
            +
                DocumentRepo::TC_CONNECTIONS.clear
         | 
| 27 27 | 
             
              end
         | 
| 28 28 |  | 
| 29 29 | 
             
            end
         | 
    
        metadata
    CHANGED
    
    | @@ -1,13 +1,13 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification 
         | 
| 2 2 | 
             
            name: rbbt-text
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version 
         | 
| 4 | 
            -
              hash:  | 
| 5 | 
            -
              prerelease: 
         | 
| 4 | 
            +
              hash: 7
         | 
| 5 | 
            +
              prerelease: false
         | 
| 6 6 | 
             
              segments: 
         | 
| 7 7 | 
             
              - 0
         | 
| 8 | 
            -
              -  | 
| 8 | 
            +
              - 6
         | 
| 9 9 | 
             
              - 0
         | 
| 10 | 
            -
              version: 0. | 
| 10 | 
            +
              version: 0.6.0
         | 
| 11 11 | 
             
            platform: ruby
         | 
| 12 12 | 
             
            authors: 
         | 
| 13 13 | 
             
            - Miguel Vazquez
         | 
| @@ -15,7 +15,7 @@ autorequire: | |
| 15 15 | 
             
            bindir: bin
         | 
| 16 16 | 
             
            cert_chain: []
         | 
| 17 17 |  | 
| 18 | 
            -
            date: 2011-07 | 
| 18 | 
            +
            date: 2011-09-07 00:00:00 +02:00
         | 
| 19 19 | 
             
            default_executable: get_ppis.rb
         | 
| 20 20 | 
             
            dependencies: 
         | 
| 21 21 | 
             
            - !ruby/object:Gem::Dependency 
         | 
| @@ -26,10 +26,12 @@ dependencies: | |
| 26 26 | 
             
                requirements: 
         | 
| 27 27 | 
             
                - - ">="
         | 
| 28 28 | 
             
                  - !ruby/object:Gem::Version 
         | 
| 29 | 
            -
                    hash:  | 
| 29 | 
            +
                    hash: 63
         | 
| 30 30 | 
             
                    segments: 
         | 
| 31 | 
            +
                    - 4
         | 
| 31 32 | 
             
                    - 0
         | 
| 32 | 
            -
                     | 
| 33 | 
            +
                    - 0
         | 
| 34 | 
            +
                    version: 4.0.0
         | 
| 33 35 | 
             
              type: :runtime
         | 
| 34 36 | 
             
              version_requirements: *id001
         | 
| 35 37 | 
             
            - !ruby/object:Gem::Dependency 
         | 
| @@ -74,6 +76,20 @@ dependencies: | |
| 74 76 | 
             
                    version: "0"
         | 
| 75 77 | 
             
              type: :runtime
         | 
| 76 78 | 
             
              version_requirements: *id004
         | 
| 79 | 
            +
            - !ruby/object:Gem::Dependency 
         | 
| 80 | 
            +
              name: rjb
         | 
| 81 | 
            +
              prerelease: false
         | 
| 82 | 
            +
              requirement: &id005 !ruby/object:Gem::Requirement 
         | 
| 83 | 
            +
                none: false
         | 
| 84 | 
            +
                requirements: 
         | 
| 85 | 
            +
                - - ">="
         | 
| 86 | 
            +
                  - !ruby/object:Gem::Version 
         | 
| 87 | 
            +
                    hash: 3
         | 
| 88 | 
            +
                    segments: 
         | 
| 89 | 
            +
                    - 0
         | 
| 90 | 
            +
                    version: "0"
         | 
| 91 | 
            +
              type: :runtime
         | 
| 92 | 
            +
              version_requirements: *id005
         | 
| 77 93 | 
             
            description: "Text mining tools: named entity recognition and normalization, document classification, bag-of-words, dictionaries, etc"
         | 
| 78 94 | 
             
            email: miguel.vazquez@fdi.ucm.es
         | 
| 79 95 | 
             
            executables: 
         | 
| @@ -92,12 +108,6 @@ files: | |
| 92 108 | 
             
            - lib/rbbt/corpus/sources/pubmed.rb
         | 
| 93 109 | 
             
            - lib/rbbt/ner/NER.rb
         | 
| 94 110 | 
             
            - lib/rbbt/ner/abner.rb
         | 
| 95 | 
            -
            - lib/rbbt/ner/annotations.rb
         | 
| 96 | 
            -
            - lib/rbbt/ner/annotations/annotated.rb
         | 
| 97 | 
            -
            - lib/rbbt/ner/annotations/named_entity.rb
         | 
| 98 | 
            -
            - lib/rbbt/ner/annotations/relations.rb
         | 
| 99 | 
            -
            - lib/rbbt/ner/annotations/token.rb
         | 
| 100 | 
            -
            - lib/rbbt/ner/annotations/transformed.rb
         | 
| 101 111 | 
             
            - lib/rbbt/ner/banner.rb
         | 
| 102 112 | 
             
            - lib/rbbt/ner/chemical_tagger.rb
         | 
| 103 113 | 
             
            - lib/rbbt/ner/ngram_prefix_dictionary.rb
         | 
| @@ -108,6 +118,12 @@ files: | |
| 108 118 | 
             
            - lib/rbbt/ner/rnorm.rb
         | 
| 109 119 | 
             
            - lib/rbbt/ner/rnorm/cue_index.rb
         | 
| 110 120 | 
             
            - lib/rbbt/ner/rnorm/tokens.rb
         | 
| 121 | 
            +
            - lib/rbbt/ner/segment.rb
         | 
| 122 | 
            +
            - lib/rbbt/ner/segment/named_entity.rb
         | 
| 123 | 
            +
            - lib/rbbt/ner/segment/relationship.rb
         | 
| 124 | 
            +
            - lib/rbbt/ner/segment/segmented.rb
         | 
| 125 | 
            +
            - lib/rbbt/ner/segment/token.rb
         | 
| 126 | 
            +
            - lib/rbbt/ner/segment/transformed.rb
         | 
| 111 127 | 
             
            - lib/rbbt/ner/token_trieNER.rb
         | 
| 112 128 | 
             
            - lib/rbbt/nlp/genia/sentence_splitter.rb
         | 
| 113 129 | 
             
            - lib/rbbt/nlp/nlp.rb
         | 
| @@ -130,12 +146,13 @@ files: | |
| 130 146 | 
             
            - test/rbbt/ner/test_regexpNER.rb
         | 
| 131 147 | 
             
            - test/rbbt/ner/test_abner.rb
         | 
| 132 148 | 
             
            - test/rbbt/ner/test_banner.rb
         | 
| 133 | 
            -
            - test/rbbt/ner/ | 
| 134 | 
            -
            - test/rbbt/ner/annotations/test_named_entity.rb
         | 
| 149 | 
            +
            - test/rbbt/ner/test_NER.rb
         | 
| 135 150 | 
             
            - test/rbbt/ner/test_token_trieNER.rb
         | 
| 136 | 
            -
            - test/rbbt/ner/test_annotations.rb
         | 
| 137 151 | 
             
            - test/rbbt/ner/test_patterns.rb
         | 
| 138 | 
            -
            - test/rbbt/ner/ | 
| 152 | 
            +
            - test/rbbt/ner/segment/test_named_entity.rb
         | 
| 153 | 
            +
            - test/rbbt/ner/segment/test_segmented.rb
         | 
| 154 | 
            +
            - test/rbbt/ner/segment/test_transformed.rb
         | 
| 155 | 
            +
            - test/rbbt/ner/test_segment.rb
         | 
| 139 156 | 
             
            - test/rbbt/ner/test_rnorm.rb
         | 
| 140 157 | 
             
            - test/rbbt/ner/test_oscar4.rb
         | 
| 141 158 | 
             
            - test/rbbt/ner/test_chemical_tagger.rb
         | 
| @@ -174,7 +191,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement | |
| 174 191 | 
             
            requirements: []
         | 
| 175 192 |  | 
| 176 193 | 
             
            rubyforge_project: 
         | 
| 177 | 
            -
            rubygems_version: 1. | 
| 194 | 
            +
            rubygems_version: 1.3.7
         | 
| 178 195 | 
             
            signing_key: 
         | 
| 179 196 | 
             
            specification_version: 3
         | 
| 180 197 | 
             
            summary: Text mining tools for the Ruby Bioinformatics Toolkit (rbbt)
         | 
| @@ -186,12 +203,13 @@ test_files: | |
| 186 203 | 
             
            - test/rbbt/ner/test_regexpNER.rb
         | 
| 187 204 | 
             
            - test/rbbt/ner/test_abner.rb
         | 
| 188 205 | 
             
            - test/rbbt/ner/test_banner.rb
         | 
| 189 | 
            -
            - test/rbbt/ner/ | 
| 190 | 
            -
            - test/rbbt/ner/annotations/test_named_entity.rb
         | 
| 206 | 
            +
            - test/rbbt/ner/test_NER.rb
         | 
| 191 207 | 
             
            - test/rbbt/ner/test_token_trieNER.rb
         | 
| 192 | 
            -
            - test/rbbt/ner/test_annotations.rb
         | 
| 193 208 | 
             
            - test/rbbt/ner/test_patterns.rb
         | 
| 194 | 
            -
            - test/rbbt/ner/ | 
| 209 | 
            +
            - test/rbbt/ner/segment/test_named_entity.rb
         | 
| 210 | 
            +
            - test/rbbt/ner/segment/test_segmented.rb
         | 
| 211 | 
            +
            - test/rbbt/ner/segment/test_transformed.rb
         | 
| 212 | 
            +
            - test/rbbt/ner/test_segment.rb
         | 
| 195 213 | 
             
            - test/rbbt/ner/test_rnorm.rb
         | 
| 196 214 | 
             
            - test/rbbt/ner/test_oscar4.rb
         | 
| 197 215 | 
             
            - test/rbbt/ner/test_chemical_tagger.rb
         |