rbbt 1.1.8 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. data/README.rdoc +12 -12
  2. data/bin/rbbt_config +2 -3
  3. data/install_scripts/norm/Rakefile +4 -4
  4. data/install_scripts/organisms/{tair.Rakefile → Ath.Rakefile} +4 -3
  5. data/install_scripts/organisms/{cgd.Rakefile → Cal.Rakefile} +0 -0
  6. data/install_scripts/organisms/{worm.Rakefile → Cel.Rakefile} +0 -0
  7. data/install_scripts/organisms/{human.Rakefile → Hsa.Rakefile} +4 -8
  8. data/install_scripts/organisms/{mgi.Rakefile → Mmu.Rakefile} +0 -0
  9. data/install_scripts/organisms/{rgd.Rakefile → Rno.Rakefile} +0 -0
  10. data/install_scripts/organisms/{sgd.Rakefile → Sce.Rakefile} +0 -0
  11. data/install_scripts/organisms/{pombe.Rakefile → Spo.Rakefile} +0 -0
  12. data/install_scripts/organisms/rake-include.rb +15 -19
  13. data/lib/rbbt.rb +0 -3
  14. data/lib/rbbt/ner/rnorm.rb +2 -2
  15. data/lib/rbbt/sources/go.rb +48 -3
  16. data/lib/rbbt/sources/organism.rb +12 -17
  17. data/lib/rbbt/util/open.rb +27 -27
  18. data/lib/rbbt/util/tmpfile.rb +16 -0
  19. data/tasks/install.rake +1 -1
  20. data/test/rbbt/bow/test_bow.rb +33 -0
  21. data/test/rbbt/bow/test_classifier.rb +72 -0
  22. data/test/rbbt/bow/test_dictionary.rb +91 -0
  23. data/test/rbbt/ner/rnorm/test_cue_index.rb +57 -0
  24. data/test/rbbt/ner/rnorm/test_tokens.rb +70 -0
  25. data/test/rbbt/ner/test_abner.rb +17 -0
  26. data/test/rbbt/ner/test_banner.rb +17 -0
  27. data/test/rbbt/ner/test_dictionaryNER.rb +122 -0
  28. data/test/rbbt/ner/test_regexpNER.rb +33 -0
  29. data/test/rbbt/ner/test_rner.rb +126 -0
  30. data/test/rbbt/ner/test_rnorm.rb +47 -0
  31. data/test/rbbt/sources/test_biocreative.rb +38 -0
  32. data/test/rbbt/sources/test_biomart.rb +31 -0
  33. data/test/rbbt/sources/test_entrez.rb +49 -0
  34. data/test/rbbt/sources/test_go.rb +24 -0
  35. data/test/rbbt/sources/test_organism.rb +59 -0
  36. data/test/rbbt/sources/test_polysearch.rb +27 -0
  37. data/test/rbbt/sources/test_pubmed.rb +29 -0
  38. data/test/rbbt/util/test_arrayHash.rb +257 -0
  39. data/test/rbbt/util/test_filecache.rb +37 -0
  40. data/test/rbbt/util/test_index.rb +31 -0
  41. data/test/rbbt/util/test_misc.rb +20 -0
  42. data/test/rbbt/util/test_open.rb +97 -0
  43. data/test/rbbt/util/test_simpleDSL.rb +57 -0
  44. data/test/rbbt/util/test_tmpfile.rb +21 -0
  45. data/test/test_helper.rb +4 -0
  46. data/test/test_rbbt.rb +11 -0
  47. metadata +39 -12
@@ -0,0 +1,33 @@
1
+ require File.dirname(__FILE__) + '/../../test_helper'
2
+ require 'rbbt'
3
+ require 'rbbt/util/tmpfile'
4
+ require 'rbbt/ner/regexpNER'
5
+ require 'test/unit'
6
+
7
+ class TestRegExpNER < Test::Unit::TestCase
8
+
9
+ def test_class
10
+ text = "a bc d e f g h i j k l m n o p q one two"
11
+
12
+ lexicon =<<-EOF
13
+ C1,a,x,xx,xxx
14
+ C2,bc,y,yy,yyy
15
+ C3,i,z,zz,zzz,m,one two
16
+ EOF
17
+
18
+ file = TmpFile.tmp_file
19
+ File.open(file, 'w'){|f| f.write lexicon}
20
+
21
+ r = RegExpNER.new(file, :sep => ',', :stopwords => false)
22
+ assert_equal(['a', 'bc', 'i', 'm','one two'].sort,r.match_hash(text).values.flatten.sort)
23
+
24
+ r = RegExpNER.new(file, :sep => ',', :stopwords => true)
25
+ assert_equal(['bc', 'm','one two'].sort,r.match_hash(text).values.flatten.sort)
26
+
27
+
28
+ FileUtils.rm file
29
+ end
30
+
31
+ end
32
+
33
+
@@ -0,0 +1,126 @@
1
+ require File.dirname(__FILE__) + '/../../test_helper'
2
+ require 'rbbt'
3
+ require 'rbbt/ner/rner'
4
+ require 'test/unit'
5
+
6
+ class TestRNer < Test::Unit::TestCase
7
+
8
+ def setup
9
+ @parser = NERFeatures.new do
10
+ isLetters /^[A-Z]+$/i
11
+ context prefix_3 /^(...)/
12
+ downcase do |w| w.downcase end
13
+
14
+ context %w(downcase)
15
+ end
16
+ end
17
+
18
+ def test_config
19
+ config = <<-EOC
20
+ isLetters(/^[A-Z]+$/i)
21
+ context(prefix_3(/^(...)/))
22
+ downcase { |w| w.downcase }
23
+ context(["downcase"])
24
+ EOC
25
+
26
+ assert(@parser.config == config)
27
+ end
28
+
29
+ def test_reverse
30
+ assert_equal("protein P53", NERFeatures.reverse("P53 protein"))
31
+ assert_equal(
32
+ ". LH of assay - radioimmuno serum the with compared was LH urinary for ) GONAVIS - HI ( test hemagglutination direct new A",
33
+ NERFeatures.reverse(
34
+ "A new direct hemagglutination test (HI-GONAVIS) for urinary LH was compared with the serum\n radioimmuno-assay of LH."
35
+ ))
36
+ end
37
+
38
+ def test_features
39
+ assert(@parser.features("abCdE"),["abCdE",true,'abc','abcde'])
40
+ end
41
+
42
+ def test_template
43
+ template =<<-EOT
44
+ UisLetters: %x[0,1]
45
+ Uprefix_3: %x[0,2]
46
+ Uprefix_3#1: %x[1,2]
47
+ Uprefix_3#-1: %x[-1,2]
48
+ Udowncase: %x[0,3]
49
+ Udowncase#1: %x[1,3]
50
+ Udowncase#-1: %x[-1,3]
51
+ B
52
+ EOT
53
+
54
+ assert(@parser.template == template)
55
+ end
56
+
57
+ def test_tokens
58
+ assert( NERFeatures.tokens("A new direct hemagglutination test (HI-GONAVIS) for urinary LH was compared with the serum\n radioimmuno-assay of LH.")==
59
+ ["A", "new", "direct", "hemagglutination", "test", "(", "HI", "-", "GONAVIS", ")", "for", "urinary", "LH", "was", "compared", "with", "the", "serum", "radioimmuno", "-", "assay", "of", "LH", "."])
60
+
61
+
62
+ end
63
+ def test_text_features
64
+
65
+ assert(@parser.text_features("abCdE 1234") == [["abCdE",true, "abC", "abcde"], ["1234",false, "123", "1234"]])
66
+ assert(@parser.text_features("abCdE 1234",true) == [["abCdE",true, "abC", "abcde",1], ["1234",false, "123", "1234",2]])
67
+ assert(@parser.text_features("abCdE 1234",false) == [["abCdE",true, "abC", "abcde",0], ["1234",false, "123", "1234",0]])
68
+
69
+ end
70
+
71
+ def test_tagged_features
72
+ assert_equal(
73
+ [["phosphorilation",true, "pho", "phosphorilation", 0],
74
+ ["of",true, false, "of", 0],
75
+ ["GENE1",false, "GEN", "gene1", 1],
76
+ [".", false, false, ".", 0]],
77
+ @parser.tagged_features("phosphorilation of GENE1.",['GENE1']))
78
+
79
+ assert_equal(
80
+ [["GENE1",false, "GEN", "gene1", 1],
81
+ ["phosphorilation",true, "pho", "phosphorilation", 0]],
82
+ @parser.tagged_features("GENE1 phosphorilation",['GENE1']))
83
+
84
+
85
+ assert_equal(
86
+ [["phosphorilation",true, "pho", "phosphorilation", 0],
87
+ ["of",true, false, "of", 0],
88
+ ["GENE",true, "GEN", "gene", 1],
89
+ ["1",false, false, "1", 2],
90
+ [".", false, false, ".", 0]],
91
+ @parser.tagged_features("phosphorilation of GENE 1.",['GENE 1']))
92
+ end
93
+
94
+ def test_tagged_features_reverse
95
+ @parser.reverse = true
96
+ assert_equal(
97
+ [
98
+ ["GENE1",false, "GEN", "gene1", 1],
99
+ ["of",true, false, "of", 0],
100
+ ["phosphorilation",true, "pho", "phosphorilation", 0]
101
+ ],
102
+ @parser.tagged_features("phosphorilation of GENE1",['GENE1']))
103
+
104
+ assert_equal(
105
+ [
106
+ [".", false, false, ".", 0],
107
+ ["1",false, false, "1", 1],
108
+ ["GENE",true, "GEN", "gene", 2],
109
+ ["of",true, false, "of", 0],
110
+ ["phosphorilation",true, "pho", "phosphorilation", 0]
111
+ ],
112
+ @parser.tagged_features("phosphorilation of GENE 1.",['GENE 1']))
113
+ end
114
+
115
+
116
+ def test_NER_default
117
+ parser = NERFeatures.new
118
+
119
+ assert(parser.template =~ /UisLetter/)
120
+ end
121
+
122
+ def test_CRFPP_install
123
+ assert(require File.join(Rbbt.datadir, 'third_party/crf++/ruby/CRFPP'))
124
+ end
125
+
126
+ end
@@ -0,0 +1,47 @@
1
+ require File.dirname(__FILE__) + '/../../test_helper'
2
+ require 'rbbt/ner/rnorm'
3
+ require 'rbbt/util/open'
4
+ require 'rbbt/util/tmpfile'
5
+ require 'test/unit'
6
+
7
+ class TestRNORM < Test::Unit::TestCase
8
+
9
+ def setup # Builds @norm from a six-row temporary lexicon; GENE1 deliberately appears in two rows (see test_order).
10
+ tmp = TmpFile.tmp_file("test-rnorm-")
11
+ lexicon =<<-EOT
12
+ S000000029 YAL031C GIP4 FUN21
13
+ S000000030 YAL032C PRP45 FUN20
14
+ S000000031 YAL033W POP5 FUN53
15
+ S000000374 YBR170C NPL4 HRD4
16
+ S000000375 GENE1 BBB CCC
17
+ S000000376 AAA GENE1 DDD
18
+ EOT
19
+
20
+ Open.write(tmp, lexicon)
21
+
22
+ @norm = Normalizer.new(tmp)
23
+ FileUtils.rm tmp # NOTE(review): removed right after construction, so Normalizer presumably loads the file eagerly - confirm
24
+ end
25
+
26
+ def test_match # match returns candidate ids; only the exact name "FUN21" narrows to a single id here.
27
+ assert_equal(["S000000029"], @norm.match("FUN21"))
28
+ assert_equal(["S000000030", "S000000029", "S000000031"], @norm.match("FUN"))
29
+ assert_equal(["S000000030", "S000000029", "S000000031"], @norm.match("FUN 2"))
30
+ assert_equal(["S000000030", "S000000029", "S000000031"], @norm.match("FUN 21"))
31
+ assert_equal([], @norm.match("GER4"))
32
+
33
+ @norm.match("FUN21") # NOTE(review): result is not asserted; this only checks that a repeated call does not raise
34
+ end
35
+
36
+ def test_select
37
+ assert_equal(["S000000029"], @norm.select(["S000000030", "S000000029", "S000000031"],"FUN 21"))
38
+ end
39
+
40
+ def test_resolve
41
+ assert_equal(["S000000029"], @norm.resolve("FUN 21"))
42
+ end
43
+
44
+ def test_order # GENE1 is listed in two setup lexicon rows (S000000375 and S000000376); resolve is expected to return only S000000375.
45
+ assert_equal(["S000000375"], @norm.resolve("GENE1"))
46
+ end
47
+ end
@@ -0,0 +1,38 @@
1
+ require File.dirname(__FILE__) + '/../../test_helper'
2
+ require 'rbbt/sources/biocreative'
3
+ require 'test/unit'
4
+
5
+ class TestBiocreative < Test::Unit::TestCase
6
+
7
+ def test_BC2GM
8
+ assert(Biocreative.BC2GM(:test)['BC2GM000008491'][:text] == "Phenotypic analysis demonstrates that trio and Abl cooperate in regulating axon outgrowth in the embryonic central nervous system (CNS).")
9
+ assert(Biocreative.BC2GM(:test)['BC2GM000008491'][:mentions] == ["trio", "Abl"] )
10
+ end
11
+
12
+ def test_position
13
+ mention = "IgA"
14
+ text = "Early complement components, C1q and C4, and IgA secretory piece were absent."
15
+ pos = [[38, 40]]
16
+ assert(Biocreative.position(text,mention) == pos)
17
+
18
+ mention = "tyrosine-specific phosphatase"
19
+ text = "When expressed in Escherichia coli, SH-PTP2 displays tyrosine-specific phosphatase activity."
20
+ pos = [[46, 73]]
21
+ assert(Biocreative.position(text,mention) == pos)
22
+
23
+ mention = "tyrosine - specific phosphatase"
24
+ text = "When expressed in Escherichia coli, SH-PTP2 displays tyrosine-specific phosphatase activity."
25
+ pos = [[46, 73]]
26
+ assert(Biocreative.position(text,mention) == pos)
27
+
28
+ mention = "LH"
29
+ text = "A new direct hemagglutination test (HI-GONAVIS) for urinary LH was compared with the serum radioimmuno-assay of LH."
30
+ pos = [[52, 53],[96, 97]]
31
+ assert(Biocreative.position(text,mention) == pos)
32
+
33
+ end
34
+
35
+
36
+ end
37
+
38
+
@@ -0,0 +1,31 @@
1
+ require File.dirname(__FILE__) + '/../../test_helper'
2
+ require 'rbbt/sources/biomart'
3
+ require 'test/unit'
4
+
5
+ class TestBioMart < Test::Unit::TestCase
6
+
7
+ def test_get
8
+ assert_raise BioMart::QueryError do
9
+ BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['protein_id'],['with_unknownattr'])
10
+ end
11
+
12
+ data = BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['protein_id'],[])
13
+ assert(data['856452']['protein_id'].include? 'AAB68382')
14
+
15
+ data = BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['external_gene_id'],[], data )
16
+ assert(data['856452']['protein_id'].include? 'AAB68382')
17
+ assert(data['856452']['external_gene_id'].include? 'CUP1-2')
18
+
19
+ end
20
+
21
+ def test_query
22
+ data = BioMart.query('scerevisiae_gene_ensembl','entrezgene', ['protein_id','refseq_peptide','external_gene_id','ensembl_gene_id'],[])
23
+
24
+ assert(data['856452']['protein_id'].include? 'AAB68382')
25
+ assert(data['856452']['external_gene_id'].include? 'CUP1-2')
26
+
27
+ end
28
+
29
+ end
30
+
31
+
@@ -0,0 +1,49 @@
1
+ require File.dirname(__FILE__) + '/../../test_helper'
2
+ require 'rbbt/sources/entrez'
3
+ require 'test/unit'
4
+
5
+ class TestEntrez < Test::Unit::TestCase
6
+
7
+ def test_entrez2native
8
+ tax = 4932
9
+ fix = proc{|code| code.sub(/SGD:S0/,'S0') }
10
+ check = proc{|code| code.match(/^S0/)}
11
+
12
+ lexicon = Entrez.entrez2native(tax, 5, fix, check)
13
+ assert(lexicon['855611'].include? 'S000005056')
14
+ end
15
+
16
+ def test_entrez2pubmed
17
+ tax = 4932
18
+
19
+ data = Entrez.entrez2pubmed(tax)
20
+ assert(data['850320'].include? '15102838')
21
+ end
22
+
23
+ def test_getonline
24
+ geneids = 9129
25
+
26
+ assert_match(/PRP3 pre-mRNA processing factor/s, Entrez.get_online(geneids))
27
+
28
+ geneids = [9129,9]
29
+ assert_match(/PRP3 pre-mRNA processing factor/s, Entrez.get_online(geneids)[9129])
30
+ end
31
+
32
+ def test_getgene
33
+ geneids = 9129
34
+ assert_equal([["PRP3 pre-mRNA processing factor 3 homolog (S. cerevisiae)"]], Entrez.get_gene(geneids).description)
35
+
36
+ geneids = [9129,728049]
37
+ assert_equal([["PRP3 pre-mRNA processing factor 3 homolog (S. cerevisiae)"]], Entrez.get_gene(geneids)[9129].description)
38
+
39
+ end
40
+
41
+ def test_similarity
42
+ assert(Entrez.gene_text_similarity(9129, "PRP3 pre-mRNA processing factor 3 homolog (S. cerevisiae)") > 0)
43
+ assert_equal(0, Entrez.gene_text_similarity("NON EXISTEN GENEID", "PRP3 pre-mRNA processing factor 3 homolog (S. cerevisiae)"))
44
+
45
+ end
46
+
47
+ end
48
+
49
+
@@ -0,0 +1,24 @@
1
+ require File.dirname(__FILE__) + '/../../test_helper'
2
+
3
+ require 'rbbt/sources/go'
4
+ require 'test/unit'
5
+
6
+ class TestGo < Test::Unit::TestCase
7
+
8
+ def test_go
9
+ assert_match('vacuole inheritance',GO::id2name('GO:0000011'))
10
+ assert_equal(['vacuole inheritance','alpha-glucoside transport'], GO::id2name(['GO:0000011','GO:0000017']))
11
+ end
12
+
13
+ def test_ancestors
14
+ assert GO.id2ancestors('GO:0000001').include? 'GO:0048308'
15
+ end
16
+
17
+ def test_namespace
18
+ assert_equal 'biological_process', GO.id2namespace('GO:0000001')
19
+ end
20
+
21
+
22
+ end
23
+
24
+
@@ -0,0 +1,59 @@
1
+ require File.dirname(__FILE__) + '/../../test_helper'
2
+ require 'rbbt/sources/organism'
3
+ require 'test/unit'
4
+
5
+ class TestOrganism < Test::Unit::TestCase
6
+
7
+ def test_all
8
+ assert Organism.all.include? 'Sce'
9
+ end
10
+
11
+ def test_ner
12
+ assert(Organism.ner(:Sce, :abner).is_a? Abner)
13
+ end
14
+
15
+ def test_norm
16
+ assert_equal(["S000003008"], Organism.norm(:Sce).select(['S000029454','S000003008'],'SLU1', 'SLU1 has been used in the literature to refer to both HEM2/YGL040C, which encodes a porphobilinogen synthase and SLU1, which is essential for splicing.'))
17
+
18
+ end
19
+
20
+ def test_supported_ids
21
+
22
+ ids = Organism.supported_ids('Sce', :examples => true)
23
+ assert(ids.first[0] == 'SGD DB Id' && ids.first[1] =~ /^S00/)
24
+
25
+ ids = Organism.supported_ids('Sce')
26
+ assert(ids.first == 'SGD DB Id')
27
+ end
28
+
29
+ def test_index
30
+ index = Organism.id_index('Sce')
31
+ assert_equal("S000004431", index['851160'])
32
+ end
33
+
34
+ def test_index_partial
35
+ index = Organism.id_index('Sce',:other => ['Ensembl Gene ID', 'Protein ID'])
36
+ assert_nil(index['851160'])
37
+ assert_equal("S000000838", index['YER036C'])
38
+
39
+ index = Organism.id_index('Sce',:other => ['Ensembl Gene ID', 'Protein ID'], :native => "Entrez Gene ID")
40
+ assert_equal("856758", index['YER036C'])
41
+
42
+ end
43
+
44
+ def test_go_terms # Best-effort check: any StandardError is printed instead of failing the test.
45
+
46
+ begin
47
+ goterms = Organism.goterms('Sce')
48
+ assert(goterms["S000000838"].include? "GO:0016887")
49
+ rescue # NOTE(review): bare rescue catches every StandardError raised above, not only a missing-data error
50
+ puts $!
51
+ puts "No goterms produced, see if it is all installed"
52
+ end
53
+
54
+ end
55
+
56
+
57
+ end
58
+
59
+
@@ -0,0 +1,27 @@
1
+ require File.dirname(__FILE__) + '/../../test_helper'
2
+ require 'rbbt'
3
+ require 'rbbt/util/tmpfile'
4
+ require 'rbbt/sources/polysearch'
5
+ require 'test/unit'
6
+
7
+ class TestPolysearch < Test::Unit::TestCase
8
+
9
+ def test_match
10
+ text =<<-EOT
11
+
12
+ Analysis of sorted peripheral blood lymphocytes (CD8 T cells, CD4 T cells,
13
+ B cells, NK cells) from patients with melanoma. These subpopulations are
14
+ involved in antitumor responses and negatively impacted by cancer. Results
15
+ provide insight into molecular mechanisms of immune dysfunction in cancer.
16
+
17
+ EOT
18
+
19
+ assert_equal(["B cells", "T cells", "blood", "lymphocytes", "peripheral blood", "peripheral blood lymphocytes"].sort, Polysearch.match(text,nil).values.flatten.uniq.sort)
20
+ end
21
+
22
+ def test_name
23
+ assert_equal('ligament', Polysearch.name('organ','OR00039'))
24
+ end
25
+ end
26
+
27
+