rbbt 1.1.8 → 1.2.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (47) hide show
  1. data/README.rdoc +12 -12
  2. data/bin/rbbt_config +2 -3
  3. data/install_scripts/norm/Rakefile +4 -4
  4. data/install_scripts/organisms/{tair.Rakefile → Ath.Rakefile} +4 -3
  5. data/install_scripts/organisms/{cgd.Rakefile → Cal.Rakefile} +0 -0
  6. data/install_scripts/organisms/{worm.Rakefile → Cel.Rakefile} +0 -0
  7. data/install_scripts/organisms/{human.Rakefile → Hsa.Rakefile} +4 -8
  8. data/install_scripts/organisms/{mgi.Rakefile → Mmu.Rakefile} +0 -0
  9. data/install_scripts/organisms/{rgd.Rakefile → Rno.Rakefile} +0 -0
  10. data/install_scripts/organisms/{sgd.Rakefile → Sce.Rakefile} +0 -0
  11. data/install_scripts/organisms/{pombe.Rakefile → Spo.Rakefile} +0 -0
  12. data/install_scripts/organisms/rake-include.rb +15 -19
  13. data/lib/rbbt.rb +0 -3
  14. data/lib/rbbt/ner/rnorm.rb +2 -2
  15. data/lib/rbbt/sources/go.rb +48 -3
  16. data/lib/rbbt/sources/organism.rb +12 -17
  17. data/lib/rbbt/util/open.rb +27 -27
  18. data/lib/rbbt/util/tmpfile.rb +16 -0
  19. data/tasks/install.rake +1 -1
  20. data/test/rbbt/bow/test_bow.rb +33 -0
  21. data/test/rbbt/bow/test_classifier.rb +72 -0
  22. data/test/rbbt/bow/test_dictionary.rb +91 -0
  23. data/test/rbbt/ner/rnorm/test_cue_index.rb +57 -0
  24. data/test/rbbt/ner/rnorm/test_tokens.rb +70 -0
  25. data/test/rbbt/ner/test_abner.rb +17 -0
  26. data/test/rbbt/ner/test_banner.rb +17 -0
  27. data/test/rbbt/ner/test_dictionaryNER.rb +122 -0
  28. data/test/rbbt/ner/test_regexpNER.rb +33 -0
  29. data/test/rbbt/ner/test_rner.rb +126 -0
  30. data/test/rbbt/ner/test_rnorm.rb +47 -0
  31. data/test/rbbt/sources/test_biocreative.rb +38 -0
  32. data/test/rbbt/sources/test_biomart.rb +31 -0
  33. data/test/rbbt/sources/test_entrez.rb +49 -0
  34. data/test/rbbt/sources/test_go.rb +24 -0
  35. data/test/rbbt/sources/test_organism.rb +59 -0
  36. data/test/rbbt/sources/test_polysearch.rb +27 -0
  37. data/test/rbbt/sources/test_pubmed.rb +29 -0
  38. data/test/rbbt/util/test_arrayHash.rb +257 -0
  39. data/test/rbbt/util/test_filecache.rb +37 -0
  40. data/test/rbbt/util/test_index.rb +31 -0
  41. data/test/rbbt/util/test_misc.rb +20 -0
  42. data/test/rbbt/util/test_open.rb +97 -0
  43. data/test/rbbt/util/test_simpleDSL.rb +57 -0
  44. data/test/rbbt/util/test_tmpfile.rb +21 -0
  45. data/test/test_helper.rb +4 -0
  46. data/test/test_rbbt.rb +11 -0
  47. metadata +39 -12
@@ -0,0 +1,33 @@
1
+ require File.dirname(__FILE__) + '/../../test_helper'
2
+ require 'rbbt'
3
+ require 'rbbt/util/tmpfile'
4
+ require 'rbbt/ner/regexpNER'
5
+ require 'test/unit'
6
+
7
# Exercises RegExpNER against a small comma-separated lexicon that is written
# to a temporary file for the duration of the test.
class TestRegExpNER < Test::Unit::TestCase

  # Builds a three-concept lexicon and checks which lexicon terms
  # RegExpNER#match_hash reports for a fixed sentence, first with
  # :stopwords => false and then with :stopwords => true.
  def test_class
    text = "a bc d e f g h i j k l m n o p q one two"

    # Heredoc payload stays at column 0: <<- only allows the terminator to be
    # indented, it does not strip leading whitespace from the content.
    lexicon =<<-EOF
C1,a,x,xx,xxx
C2,bc,y,yy,yyy
C3,i,z,zz,zzz,m,one two
    EOF

    file = TmpFile.tmp_file
    begin
      File.open(file, 'w') { |f| f.write lexicon }

      r = RegExpNER.new(file, :sep => ',', :stopwords => false)
      assert_equal(['a', 'bc', 'i', 'm', 'one two'].sort, r.match_hash(text).values.flatten.sort)

      # With :stopwords => true the expected matches no longer contain 'a' and 'i'.
      r = RegExpNER.new(file, :sep => ',', :stopwords => true)
      assert_equal(['bc', 'm', 'one two'].sort, r.match_hash(text).values.flatten.sort)
    ensure
      # Remove the temporary lexicon even when an assertion above fails.
      FileUtils.rm_f file
    end
  end

end
32
+
33
+
@@ -0,0 +1,126 @@
1
+ require File.dirname(__FILE__) + '/../../test_helper'
2
+ require 'rbbt'
3
+ require 'rbbt/ner/rner'
4
+ require 'test/unit'
5
+
6
# Tests for NERFeatures: its configuration DSL, tokenization, per-token
# feature extraction, CRF++ template generation and tagged feature output.
class TestRNer < Test::Unit::TestCase

  # Builds the parser shared by the tests. The block is written exactly as
  # test_config expects NERFeatures#config to render it back.
  def setup
    @parser = NERFeatures.new do
      isLetters /^[A-Z]+$/i
      context prefix_3 /^(...)/
      downcase do |w| w.downcase end

      context %w(downcase)
    end
  end

  # The parser must report its configuration as normalized source text.
  def test_config
    config = <<-EOC
isLetters(/^[A-Z]+$/i)
context(prefix_3(/^(...)/))
downcase { |w| w.downcase }
context(["downcase"])
    EOC

    assert_equal(config, @parser.config)
  end

  # NERFeatures.reverse returns the tokens of a text in reverse order,
  # joined by single spaces.
  def test_reverse
    assert_equal("protein P53", NERFeatures.reverse("P53 protein"))
    assert_equal(
      ". LH of assay - radioimmuno serum the with compared was LH urinary for ) GONAVIS - HI ( test hemagglutination direct new A",
      NERFeatures.reverse(
        "A new direct hemagglutination test (HI-GONAVIS) for urinary LH was compared with the serum\n radioimmuno-assay of LH."
      ))
  end

  # Features of a single token: the token, isLetters, prefix_3, downcase.
  def test_features
    # assert_equal is required here: assert(value, array) would treat the
    # array as the failure message and pass for any truthy value. The
    # expected row matches the one test_text_features uses for "abCdE".
    assert_equal(["abCdE", true, "abC", "abcde"], @parser.features("abCdE"))
  end

  # The generated template has one unigram line per feature, extra lines for
  # the context features at offsets 1 and -1, and a final bigram marker.
  def test_template
    template =<<-EOT
UisLetters: %x[0,1]
Uprefix_3: %x[0,2]
Uprefix_3#1: %x[1,2]
Uprefix_3#-1: %x[-1,2]
Udowncase: %x[0,3]
Udowncase#1: %x[1,3]
Udowncase#-1: %x[-1,3]
B
    EOT

    assert_equal(template, @parser.template)
  end

  # Punctuation and hyphens become separate tokens; the newline is dropped.
  def test_tokens
    assert_equal(
      ["A", "new", "direct", "hemagglutination", "test", "(", "HI", "-", "GONAVIS", ")", "for", "urinary", "LH", "was", "compared", "with", "the", "serum", "radioimmuno", "-", "assay", "of", "LH", "."],
      NERFeatures.tokens("A new direct hemagglutination test (HI-GONAVIS) for urinary LH was compared with the serum\n radioimmuno-assay of LH."))
  end

  # text_features yields one feature row per token; the optional second
  # argument appends a label column (1, 2, ... when true; 0 when false).
  def test_text_features
    assert_equal([["abCdE",true, "abC", "abcde"], ["1234",false, "123", "1234"]],
                 @parser.text_features("abCdE 1234"))
    assert_equal([["abCdE",true, "abC", "abcde",1], ["1234",false, "123", "1234",2]],
                 @parser.text_features("abCdE 1234",true))
    assert_equal([["abCdE",true, "abC", "abcde",0], ["1234",false, "123", "1234",0]],
                 @parser.text_features("abCdE 1234",false))
  end

  # Tokens belonging to a mention are labelled 1 (first) and 2 (following);
  # every other token is labelled 0.
  def test_tagged_features
    assert_equal(
      [["phosphorilation",true, "pho", "phosphorilation", 0],
       ["of",true, false, "of", 0],
       ["GENE1",false, "GEN", "gene1", 1],
       [".", false, false, ".", 0]],
      @parser.tagged_features("phosphorilation of GENE1.",['GENE1']))

    assert_equal(
      [["GENE1",false, "GEN", "gene1", 1],
       ["phosphorilation",true, "pho", "phosphorilation", 0]],
      @parser.tagged_features("GENE1 phosphorilation",['GENE1']))

    assert_equal(
      [["phosphorilation",true, "pho", "phosphorilation", 0],
       ["of",true, false, "of", 0],
       ["GENE",true, "GEN", "gene", 1],
       ["1",false, false, "1", 2],
       [".", false, false, ".", 0]],
      @parser.tagged_features("phosphorilation of GENE 1.",['GENE 1']))
  end

  # With reverse enabled the rows come out last token first, and the mention
  # labels are assigned in that reversed order.
  def test_tagged_features_reverse
    @parser.reverse = true
    assert_equal(
      [
        ["GENE1",false, "GEN", "gene1", 1],
        ["of",true, false, "of", 0],
        ["phosphorilation",true, "pho", "phosphorilation", 0]
      ],
      @parser.tagged_features("phosphorilation of GENE1",['GENE1']))

    assert_equal(
      [
        [".", false, false, ".", 0],
        ["1",false, false, "1", 1],
        ["GENE",true, "GEN", "gene", 2],
        ["of",true, false, "of", 0],
        ["phosphorilation",true, "pho", "phosphorilation", 0]
      ],
      @parser.tagged_features("phosphorilation of GENE 1.",['GENE 1']))
  end

  # A parser created without a block still produces an isLetter(s) template line.
  def test_NER_default
    parser = NERFeatures.new

    assert_match(/UisLetter/, parser.template)
  end

  # Checks that the CRF++ Ruby binding under the rbbt data directory loads.
  # NOTE(review): Kernel#require returns false when the file was already
  # loaded, so this assertion depends on load order -- confirm it is intended.
  def test_CRFPP_install
    assert(require(File.join(Rbbt.datadir, 'third_party/crf++/ruby/CRFPP')))
  end

end
@@ -0,0 +1,47 @@
1
+ require File.dirname(__FILE__) + '/../../test_helper'
2
+ require 'rbbt/ner/rnorm'
3
+ require 'rbbt/util/open'
4
+ require 'rbbt/util/tmpfile'
5
+ require 'test/unit'
6
+
7
# Tests for Normalizer, which maps gene mentions to the identifiers listed
# in a lexicon file (identifier first, then its names).
class TestRNORM < Test::Unit::TestCase

  # Writes a six-entry lexicon to a temporary file, builds the Normalizer
  # from it, and removes the file again.
  def setup
    tmp = TmpFile.tmp_file("test-rnorm-")
    lexicon =<<-EOT
S000000029 YAL031C GIP4 FUN21
S000000030 YAL032C PRP45 FUN20
S000000031 YAL033W POP5 FUN53
S000000374 YBR170C NPL4 HRD4
S000000375 GENE1 BBB CCC
S000000376 AAA GENE1 DDD
    EOT

    begin
      Open.write(tmp, lexicon)

      @norm = Normalizer.new(tmp)
    ensure
      # Clean up even if writing or loading the lexicon raised.
      FileUtils.rm_f tmp
    end
  end

  # match returns every candidate identifier for a mention, or [] for none.
  def test_match
    assert_equal(["S000000029"], @norm.match("FUN21"))
    assert_equal(["S000000030", "S000000029", "S000000031"], @norm.match("FUN"))
    assert_equal(["S000000030", "S000000029", "S000000031"], @norm.match("FUN 2"))
    assert_equal(["S000000030", "S000000029", "S000000031"], @norm.match("FUN 21"))
    assert_equal([], @norm.match("GER4"))

    # NOTE(review): unchecked repeat call kept from the original; presumably
    # a smoke test that a second lookup does not raise -- confirm.
    @norm.match("FUN21")
  end

  # select narrows a candidate list down to the best entry for the mention.
  def test_select
    assert_equal(["S000000029"], @norm.select(["S000000030", "S000000029", "S000000031"],"FUN 21"))
  end

  # resolve goes from mention to the selected identifier in one call.
  def test_resolve
    assert_equal(["S000000029"], @norm.resolve("FUN 21"))
  end

  # GENE1 is listed for two identifiers; the entry where it appears first
  # among the names is the one expected back.
  def test_order
    assert_equal(["S000000375"], @norm.resolve("GENE1"))
  end
end
@@ -0,0 +1,38 @@
1
+ require File.dirname(__FILE__) + '/../../test_helper'
2
+ require 'rbbt/sources/biocreative'
3
+ require 'test/unit'
4
+
5
# Tests for the Biocreative helpers: loading the BC2GM data set and locating
# mention offsets inside a sentence.
class TestBiocreative < Test::Unit::TestCase

  # One known sentence of the :test split must carry its text and mentions.
  def test_BC2GM
    # Load the data set once and reuse the entry for both checks.
    entry = Biocreative.BC2GM(:test)['BC2GM000008491']

    assert_equal("Phenotypic analysis demonstrates that trio and Abl cooperate in regulating axon outgrowth in the embryonic central nervous system (CNS).",
                 entry[:text])
    assert_equal(["trio", "Abl"], entry[:mentions])
  end

  # Biocreative.position returns [start, end] pairs for every occurrence of
  # the mention. The third case shows a mention tokenized with spaces around
  # the hyphen resolving to the same offsets as the untokenized form.
  def test_position
    sh_ptp2 = "When expressed in Escherichia coli, SH-PTP2 displays tyrosine-specific phosphatase activity."

    cases = [
      ["IgA",
       "Early complement components, C1q and C4, and IgA secretory piece were absent.",
       [[38, 40]]],
      ["tyrosine-specific phosphatase", sh_ptp2, [[46, 73]]],
      ["tyrosine - specific phosphatase", sh_ptp2, [[46, 73]]],
      ["LH",
       "A new direct hemagglutination test (HI-GONAVIS) for urinary LH was compared with the serum radioimmuno-assay of LH.",
       [[52, 53],[96, 97]]]
    ]

    cases.each do |mention, text, pos|
      # assert_equal reports both values on failure, unlike assert(a == b).
      assert_equal(pos, Biocreative.position(text, mention))
    end
  end

end
37
+
38
+
@@ -0,0 +1,31 @@
1
+ require File.dirname(__FILE__) + '/../../test_helper'
2
+ require 'rbbt/sources/biomart'
3
+ require 'test/unit'
4
+
5
# Queries BioMart for yeast ('scerevisiae_gene_ensembl') attributes keyed by
# Entrez gene id and checks a few known values for gene 856452.
class TestBioMart < Test::Unit::TestCase

  # BioMart.get must raise BioMart::QueryError for the 'with_unknownattr'
  # argument, return the requested attribute, and still expose the earlier
  # attribute when a previous result is passed back in.
  def test_get
    assert_raise(BioMart::QueryError) {
      BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['protein_id'],['with_unknownattr'])
    }

    proteins = BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['protein_id'],[])
    assert(proteins['856452']['protein_id'].include?('AAB68382'))

    combined = BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['external_gene_id'],[], proteins )
    gene = combined['856452']
    assert(gene['protein_id'].include?('AAB68382'))
    assert(gene['external_gene_id'].include?('CUP1-2'))
  end

  # BioMart.query fetches several attributes in a single call.
  def test_query
    attributes = ['protein_id','refseq_peptide','external_gene_id','ensembl_gene_id']
    gene = BioMart.query('scerevisiae_gene_ensembl','entrezgene', attributes,[])['856452']

    assert(gene['protein_id'].include?('AAB68382'))
    assert(gene['external_gene_id'].include?('CUP1-2'))
  end

end
30
+
31
+
@@ -0,0 +1,49 @@
1
+ require File.dirname(__FILE__) + '/../../test_helper'
2
+ require 'rbbt/sources/entrez'
3
+ require 'test/unit'
4
+
5
# Tests for the Entrez source: id translation tables, gene-to-PubMed links,
# online gene records and gene/text similarity.
class TestEntrez < Test::Unit::TestCase

  # Maps Entrez gene ids of taxon 4932 to native ids, using `fix` to rewrite
  # each code and `check` to keep only codes that start with "S0".
  def test_entrez2native
    tax = 4932
    fix = proc{|code| code.sub(/SGD:S0/,'S0') }
    check = proc{|code| code.match(/^S0/)}

    lexicon = Entrez.entrez2native(tax, 5, fix, check)
    assert(lexicon['855611'].include?('S000005056'))
  end

  # A known gene of taxon 4932 must list a known PubMed id.
  def test_entrez2pubmed
    tax = 4932

    data = Entrez.entrez2pubmed(tax)
    assert(data['850320'].include?('15102838'))
  end

  # get_online accepts a single id (returning the record text) or a list of
  # ids (returning a collection indexed by id).
  def test_getonline
    # The original literals carried the /s modifier, which in Ruby selects a
    # Shift_JIS/Windows-31J pattern encoding rather than Perl's dot-all mode.
    # It is dropped so the pattern stays compatible with any record encoding.
    geneids = 9129

    assert_match(/PRP3 pre-mRNA processing factor/, Entrez.get_online(geneids))

    geneids = [9129,9]
    assert_match(/PRP3 pre-mRNA processing factor/, Entrez.get_online(geneids)[9129])
  end

  # get_gene exposes the parsed description for one id or for each of several.
  def test_getgene
    geneids = 9129
    assert_equal([["PRP3 pre-mRNA processing factor 3 homolog (S. cerevisiae)"]], Entrez.get_gene(geneids).description)

    geneids = [9129,728049]
    assert_equal([["PRP3 pre-mRNA processing factor 3 homolog (S. cerevisiae)"]], Entrez.get_gene(geneids)[9129].description)
  end

  # Similarity is positive for a gene's own description and 0 for an id that
  # does not exist.
  def test_similarity
    assert(Entrez.gene_text_similarity(9129, "PRP3 pre-mRNA processing factor 3 homolog (S. cerevisiae)") > 0)
    assert_equal(0, Entrez.gene_text_similarity("NON EXISTEN GENEID", "PRP3 pre-mRNA processing factor 3 homolog (S. cerevisiae)"))
  end

end
48
+
49
+
@@ -0,0 +1,24 @@
1
+ require File.dirname(__FILE__) + '/../../test_helper'
2
+
3
+ require 'rbbt/sources/go'
4
+ require 'test/unit'
5
+
6
# Tests for the GO source: term names, ancestors and namespaces by GO id.
class TestGo < Test::Unit::TestCase

  # id2name accepts a single id or an array of ids.
  def test_go
    single = GO.id2name('GO:0000011')
    assert_match('vacuole inheritance', single)

    several = GO.id2name(['GO:0000011','GO:0000017'])
    assert_equal(['vacuole inheritance','alpha-glucoside transport'], several)
  end

  # The ancestors of GO:0000001 must include GO:0048308.
  def test_ancestors
    ancestors = GO.id2ancestors('GO:0000001')
    assert(ancestors.include?('GO:0048308'))
  end

  # GO:0000001 belongs to the biological_process namespace.
  def test_namespace
    assert_equal('biological_process', GO.id2namespace('GO:0000001'))
  end

end
23
+
24
+
@@ -0,0 +1,59 @@
1
+ require File.dirname(__FILE__) + '/../../test_helper'
2
+ require 'rbbt/sources/organism'
3
+ require 'test/unit'
4
+
5
# Tests for Organism, the per-organism entry point to NER, normalization,
# identifier indexes and GO annotations. All cases use 'Sce'.
class TestOrganism < Test::Unit::TestCase

  # 'Sce' must be among the available organisms.
  def test_all
    assert(Organism.all.include?('Sce'))
  end

  # Requesting the :abner NER for Sce yields an Abner instance.
  def test_ner
    assert(Organism.ner(:Sce, :abner).is_a?(Abner))
  end

  # The normalizer picks S000003008 out of two candidates for the mention
  # 'SLU1' when given the surrounding text.
  def test_norm
    assert_equal(["S000003008"], Organism.norm(:Sce).select(['S000029454','S000003008'],'SLU1', 'SLU1 has been used in the literature to refer to both HEM2/YGL040C, which encodes a porphobilinogen synthase and SLU1, which is essential for splicing.'))
  end

  # supported_ids lists identifier formats; with :examples => true each entry
  # is a [name, example] pair.
  def test_supported_ids
    ids = Organism.supported_ids('Sce', :examples => true)
    # Two separate assertions so a failure names the part that is wrong.
    assert_equal('SGD DB Id', ids.first[0])
    assert_match(/^S00/, ids.first[1])

    ids = Organism.supported_ids('Sce')
    assert_equal('SGD DB Id', ids.first)
  end

  # The default index translates an Entrez gene id to the native id.
  def test_index
    index = Organism.id_index('Sce')
    assert_equal("S000004431", index['851160'])
  end

  # :other restricts which formats are indexed; :native changes the target.
  def test_index_partial
    index = Organism.id_index('Sce',:other => ['Ensembl Gene ID', 'Protein ID'])
    assert_nil(index['851160'])
    assert_equal("S000000838", index['YER036C'])

    index = Organism.id_index('Sce',:other => ['Ensembl Gene ID', 'Protein ID'], :native => "Entrez Gene ID")
    assert_equal("856758", index['YER036C'])
  end

  # GO terms are optional data: when they cannot be loaded the test prints a
  # hint and is skipped instead of failing.
  def test_go_terms
    begin
      goterms = Organism.goterms('Sce')
    rescue
      puts $!
      puts "No goterms produced, see if it is all installed"
      return
    end

    # Kept outside the rescue: Test::Unit assertion failures are
    # StandardErrors, so a bare rescue around this line would swallow them.
    assert(goterms["S000000838"].include?("GO:0016887"))
  end

end
58
+
59
+
@@ -0,0 +1,27 @@
1
+ require File.dirname(__FILE__) + '/../../test_helper'
2
+ require 'rbbt'
3
+ require 'rbbt/util/tmpfile'
4
+ require 'rbbt/sources/polysearch'
5
+ require 'test/unit'
6
+
7
# Tests for Polysearch term matching and id-to-name lookup.
class TestPolysearch < Test::Unit::TestCase

  # Matching a short abstract (second argument nil) must report exactly the
  # expected set of terms, ignoring duplicates and order.
  def test_match
    text =<<-EOT

Analysis of sorted peripheral blood lymphocytes (CD8 T cells, CD4 T cells,
B cells, NK cells) from patients with melanoma. These subpopulations are
involved in antitumor responses and negatively impacted by cancer. Results
provide insight into molecular mechanisms of immune dysfunction in cancer.

    EOT

    expected = ["B cells", "T cells", "blood", "lymphocytes", "peripheral blood", "peripheral blood lymphocytes"].sort
    found = Polysearch.match(text,nil).values.flatten.uniq.sort

    assert_equal(expected, found)
  end

  # Entry OR00039 of the 'organ' thesaurus is named 'ligament'.
  def test_name
    name = Polysearch.name('organ','OR00039')
    assert_equal('ligament', name)
  end
end
26
+
27
+