rbbt 1.2.5 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (88) hide show
  1. checksums.yaml +7 -0
  2. data/README.rdoc +2 -138
  3. metadata +69 -214
  4. data/LICENSE +0 -20
  5. data/bin/rbbt_config +0 -245
  6. data/install_scripts/classifier/R/classify.R +0 -36
  7. data/install_scripts/classifier/Rakefile +0 -140
  8. data/install_scripts/get_abner.sh +0 -2
  9. data/install_scripts/get_banner.sh +0 -25
  10. data/install_scripts/get_biocreative.sh +0 -72
  11. data/install_scripts/get_crf++.sh +0 -26
  12. data/install_scripts/get_entrez.sh +0 -4
  13. data/install_scripts/get_go.sh +0 -4
  14. data/install_scripts/get_polysearch.sh +0 -8
  15. data/install_scripts/ner/Rakefile +0 -206
  16. data/install_scripts/ner/config/default.rb +0 -52
  17. data/install_scripts/norm/Rakefile +0 -219
  18. data/install_scripts/norm/config/cue_default.rb +0 -10
  19. data/install_scripts/norm/config/tokens_default.rb +0 -86
  20. data/install_scripts/norm/functions.sh +0 -23
  21. data/install_scripts/organisms/Ath.Rakefile +0 -55
  22. data/install_scripts/organisms/Cal.Rakefile +0 -84
  23. data/install_scripts/organisms/Cel.Rakefile +0 -109
  24. data/install_scripts/organisms/Hsa.Rakefile +0 -140
  25. data/install_scripts/organisms/Mmu.Rakefile +0 -77
  26. data/install_scripts/organisms/Rakefile +0 -43
  27. data/install_scripts/organisms/Rno.Rakefile +0 -88
  28. data/install_scripts/organisms/Sce.Rakefile +0 -66
  29. data/install_scripts/organisms/Spo.Rakefile +0 -40
  30. data/install_scripts/organisms/rake-include.rb +0 -252
  31. data/install_scripts/wordlists/consonants +0 -897
  32. data/install_scripts/wordlists/stopwords +0 -1
  33. data/lib/rbbt.rb +0 -83
  34. data/lib/rbbt/bow/bow.rb +0 -88
  35. data/lib/rbbt/bow/classifier.rb +0 -116
  36. data/lib/rbbt/bow/dictionary.rb +0 -187
  37. data/lib/rbbt/ner/abner.rb +0 -34
  38. data/lib/rbbt/ner/banner.rb +0 -73
  39. data/lib/rbbt/ner/dictionaryNER.rb +0 -98
  40. data/lib/rbbt/ner/regexpNER.rb +0 -70
  41. data/lib/rbbt/ner/rner.rb +0 -227
  42. data/lib/rbbt/ner/rnorm.rb +0 -143
  43. data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
  44. data/lib/rbbt/ner/rnorm/tokens.rb +0 -217
  45. data/lib/rbbt/sources/biocreative.rb +0 -75
  46. data/lib/rbbt/sources/biomart.rb +0 -105
  47. data/lib/rbbt/sources/entrez.rb +0 -211
  48. data/lib/rbbt/sources/go.rb +0 -85
  49. data/lib/rbbt/sources/gscholar.rb +0 -74
  50. data/lib/rbbt/sources/organism.rb +0 -241
  51. data/lib/rbbt/sources/polysearch.rb +0 -117
  52. data/lib/rbbt/sources/pubmed.rb +0 -248
  53. data/lib/rbbt/util/arrayHash.rb +0 -266
  54. data/lib/rbbt/util/filecache.rb +0 -72
  55. data/lib/rbbt/util/index.rb +0 -47
  56. data/lib/rbbt/util/misc.rb +0 -106
  57. data/lib/rbbt/util/open.rb +0 -251
  58. data/lib/rbbt/util/rake.rb +0 -183
  59. data/lib/rbbt/util/simpleDSL.rb +0 -87
  60. data/lib/rbbt/util/tmpfile.rb +0 -35
  61. data/tasks/install.rake +0 -124
  62. data/test/rbbt/bow/test_bow.rb +0 -33
  63. data/test/rbbt/bow/test_classifier.rb +0 -72
  64. data/test/rbbt/bow/test_dictionary.rb +0 -91
  65. data/test/rbbt/ner/rnorm/test_cue_index.rb +0 -57
  66. data/test/rbbt/ner/rnorm/test_tokens.rb +0 -70
  67. data/test/rbbt/ner/test_abner.rb +0 -17
  68. data/test/rbbt/ner/test_banner.rb +0 -17
  69. data/test/rbbt/ner/test_dictionaryNER.rb +0 -122
  70. data/test/rbbt/ner/test_regexpNER.rb +0 -33
  71. data/test/rbbt/ner/test_rner.rb +0 -126
  72. data/test/rbbt/ner/test_rnorm.rb +0 -47
  73. data/test/rbbt/sources/test_biocreative.rb +0 -38
  74. data/test/rbbt/sources/test_biomart.rb +0 -31
  75. data/test/rbbt/sources/test_entrez.rb +0 -49
  76. data/test/rbbt/sources/test_go.rb +0 -24
  77. data/test/rbbt/sources/test_organism.rb +0 -59
  78. data/test/rbbt/sources/test_polysearch.rb +0 -27
  79. data/test/rbbt/sources/test_pubmed.rb +0 -39
  80. data/test/rbbt/util/test_arrayHash.rb +0 -257
  81. data/test/rbbt/util/test_filecache.rb +0 -37
  82. data/test/rbbt/util/test_index.rb +0 -31
  83. data/test/rbbt/util/test_misc.rb +0 -20
  84. data/test/rbbt/util/test_open.rb +0 -110
  85. data/test/rbbt/util/test_simpleDSL.rb +0 -57
  86. data/test/rbbt/util/test_tmpfile.rb +0 -21
  87. data/test/test_helper.rb +0 -4
  88. data/test/test_rbbt.rb +0 -11
@@ -1,17 +0,0 @@
1
- require File.dirname(__FILE__) + '/../../test_helper'
2
- require 'rbbt/ner/abner'
3
- require 'test/unit'
4
-
5
- class TestAbner < Test::Unit::TestCase
6
-
7
- def test_extract
8
- ner = Abner.new
9
-
10
- mentions = ner.extract(" The P-ITIM-compelled multi-phosphoprotein complex binds to and activates SHP-2, which in turn dephosphorylates SHIP and Shc and probably other substrates.")
11
- ["SHP-2", "SHIP", "Shc"].each{|mention|
12
- assert(mentions.include? mention)
13
- }
14
-
15
- end
16
-
17
- end
@@ -1,17 +0,0 @@
1
- require File.dirname(__FILE__) + '/../../test_helper'
2
- require 'rbbt/ner/banner'
3
- require 'test/unit'
4
-
5
- class TestBanner < Test::Unit::TestCase
6
-
7
- def test_extract
8
- ner = Banner.new
9
-
10
- mentions = ner.extract(" The P-ITIM-compelled multi-phosphoprotein complex binds to and activates SHP-2, which in turn dephosphorylates SHIP and Shc and probably other substrates.")
11
- ["SHP - 2", "SHIP", "Shc"].each{|mention|
12
- assert(mentions.include? mention)
13
- }
14
-
15
- end
16
-
17
- end
@@ -1,122 +0,0 @@
1
- require 'rbbt'
2
- require 'rbbt/util/tmpfile'
3
- require 'rbbt/util/open'
4
- require 'rbbt/ner/dictionaryNER'
5
- require 'test/unit'
6
-
7
- class TestDictionaryNER < Test::Unit::TestCase
8
-
9
- def setup
10
- @dictionary =<<-EOT
11
- DICT1\tWord1 Word2\tWord1
12
- DICT2\tWord3-Word4\tWord4
13
- EOT
14
-
15
- @dict = {
16
- "word1" => [{'word2' => ['DICT1'] }, 'DICT1'],
17
- "word3" => [{'word4' => ['DICT2'] }],
18
- "word4" => ['DICT2'],
19
- }
20
- end
21
-
22
- def test_simplify
23
- assert_equal('word1', DictionaryNER.simplify( "Word1"))
24
- assert_equal('ACL', DictionaryNER.simplify("ACL"))
25
- end
26
-
27
- def test_chunk
28
- assert_equal(["Word1","Word2"], DictionaryNER.chunk('Word1-Word2'))
29
- assert_equal(["Word1-1"], DictionaryNER.chunk('Word1-1'))
30
- end
31
-
32
- def test_match
33
-
34
- [
35
-
36
- ["Word1", {"word1" => ["D1"]}, {"Word1" => ["D1"]}],
37
-
38
- ["Word1 Word1", {"word1" => ["D1"]}, {"Word1" => ["D1"]}],
39
-
40
- ["Word2 Word1 Word3", {"word1" => ["D1"]}, {"Word1" => ["D1"]} ],
41
-
42
- ["Word2 Word1 Word4", {"word1" => ["D1","D2"]}, {"Word1" => ["D1","D2"]} ],
43
-
44
- ["Word2 Word1 Word4",
45
- {"word1" => [{'word2' => ['D1']}]},
46
- {} ],
47
-
48
- [
49
- "Word2 Word1 Word4",
50
- {"word1" => [ {'word4' => ['D1']} ] },
51
- {"Word1 Word4" => ["D1"]},
52
- ],
53
-
54
- [
55
- "Word2 Word1 Word4",
56
- {"word1" => [ {'word4' => ['D1']} ], "word4" => ['D2'] },
57
- {"Word1 Word4" => ["D1"], "Word4" => ['D2']},
58
- ],
59
-
60
-
61
- ].each{|match_info|
62
- text = match_info[0]
63
- dict = match_info[1]
64
- result = match_info[2]
65
- assert_equal(result, DictionaryNER.match(dict, text))
66
- }
67
-
68
- end
69
-
70
- def test_add_name
71
-
72
- [
73
-
74
- ["Word1", {"word1" => ['code']}],
75
-
76
- ["Word1 Word2", {"word1" => [{"word2" => ['code']}]}],
77
-
78
- ["Cerebellar stroke syndrome", {"cerebellar" => [{'stroke' => [{'syndrome' => ['code']}]}]}]
79
-
80
- ].each{|info|
81
- name = info[0]
82
- result = info[1]
83
-
84
- dict = {}
85
- DictionaryNER.add_name(dict, name, 'code')
86
- assert_equal(result, dict)
87
- }
88
-
89
- end
90
-
91
- def test_load
92
- assert_equal(@dict, DictionaryNER.load(@dictionary))
93
- end
94
-
95
- def test_class
96
- ner = DictionaryNER.new(@dictionary)
97
-
98
- [
99
- [ "Word1 Word2", ["Word1 Word2", "Word1"] ],
100
- [ "foo Word1 Word2 foo", ["Word1 Word2", "Word1"] ],
101
- [ "Word1-Word2", ["Word1 Word2", "Word1"] ],
102
- [ "Word1\nWord2", ["Word1 Word2", "Word1"] ],
103
- ].each{|info|
104
- text = info[0]
105
- keys = info[1]
106
-
107
- assert_equal(keys.sort, ner.match(text).keys.sort)
108
- }
109
- end
110
-
111
- def test_load_from_file
112
- tmpfile = TmpFile.tmp_file
113
-
114
- Open.write(tmpfile, @dictionary)
115
-
116
- ner = DictionaryNER.new(tmpfile)
117
-
118
- assert(ner.match("Word1").any?)
119
- end
120
-
121
- end
122
-
@@ -1,33 +0,0 @@
1
- require File.dirname(__FILE__) + '/../../test_helper'
2
- require 'rbbt'
3
- require 'rbbt/util/tmpfile'
4
- require 'rbbt/ner/regexpNER'
5
- require 'test/unit'
6
-
7
- class TestRegExpNER < Test::Unit::TestCase
8
-
9
- def test_class
10
- text = "a bc d e f g h i j k l m n o p q one two"
11
-
12
- lexicon =<<-EOF
13
- C1,a,x,xx,xxx
14
- C2,bc,y,yy,yyy
15
- C3,i,z,zz,zzz,m,one two
16
- EOF
17
-
18
- file = TmpFile.tmp_file
19
- File.open(file, 'w'){|f| f.write lexicon}
20
-
21
- r = RegExpNER.new(file, :sep => ',', :stopwords => false)
22
- assert_equal(['a', 'bc', 'i', 'm','one two'].sort,r.match_hash(text).values.flatten.sort)
23
-
24
- r = RegExpNER.new(file, :sep => ',', :stopwords => true)
25
- assert_equal(['bc', 'm','one two'].sort,r.match_hash(text).values.flatten.sort)
26
-
27
-
28
- FileUtils.rm file
29
- end
30
-
31
- end
32
-
33
-
@@ -1,126 +0,0 @@
1
- require File.dirname(__FILE__) + '/../../test_helper'
2
- require 'rbbt'
3
- require 'rbbt/ner/rner'
4
- require 'test/unit'
5
-
6
- class TestRNer < Test::Unit::TestCase
7
-
8
- def setup
9
- @parser = NERFeatures.new do
10
- isLetters /^[A-Z]+$/i
11
- context prefix_3 /^(...)/
12
- downcase do |w| w.downcase end
13
-
14
- context %w(downcase)
15
- end
16
- end
17
-
18
- def test_config
19
- config = <<-EOC
20
- isLetters(/^[A-Z]+$/i)
21
- context(prefix_3(/^(...)/))
22
- downcase { |w| w.downcase }
23
- context(["downcase"])
24
- EOC
25
-
26
- assert(@parser.config == config)
27
- end
28
-
29
- def test_reverse
30
- assert_equal("protein P53", NERFeatures.reverse("P53 protein"))
31
- assert_equal(
32
- ". LH of assay - radioimmuno serum the with compared was LH urinary for ) GONAVIS - HI ( test hemagglutination direct new A",
33
- NERFeatures.reverse(
34
- "A new direct hemagglutination test (HI-GONAVIS) for urinary LH was compared with the serum\n radioimmuno-assay of LH."
35
- ))
36
- end
37
-
38
- def test_features
39
- assert(@parser.features("abCdE"),["abCdE",true,'abc','abcde'])
40
- end
41
-
42
- def test_template
43
- template =<<-EOT
44
- UisLetters: %x[0,1]
45
- Uprefix_3: %x[0,2]
46
- Uprefix_3#1: %x[1,2]
47
- Uprefix_3#-1: %x[-1,2]
48
- Udowncase: %x[0,3]
49
- Udowncase#1: %x[1,3]
50
- Udowncase#-1: %x[-1,3]
51
- B
52
- EOT
53
-
54
- assert(@parser.template == template)
55
- end
56
-
57
- def test_tokens
58
- assert( NERFeatures.tokens("A new direct hemagglutination test (HI-GONAVIS) for urinary LH was compared with the serum\n radioimmuno-assay of LH.")==
59
- ["A", "new", "direct", "hemagglutination", "test", "(", "HI", "-", "GONAVIS", ")", "for", "urinary", "LH", "was", "compared", "with", "the", "serum", "radioimmuno", "-", "assay", "of", "LH", "."])
60
-
61
-
62
- end
63
- def test_text_features
64
-
65
- assert(@parser.text_features("abCdE 1234") == [["abCdE",true, "abC", "abcde"], ["1234",false, "123", "1234"]])
66
- assert(@parser.text_features("abCdE 1234",true) == [["abCdE",true, "abC", "abcde",1], ["1234",false, "123", "1234",2]])
67
- assert(@parser.text_features("abCdE 1234",false) == [["abCdE",true, "abC", "abcde",0], ["1234",false, "123", "1234",0]])
68
-
69
- end
70
-
71
- def test_tagged_features
72
- assert_equal(
73
- [["phosphorilation",true, "pho", "phosphorilation", 0],
74
- ["of",true, false, "of", 0],
75
- ["GENE1",false, "GEN", "gene1", 1],
76
- [".", false, false, ".", 0]],
77
- @parser.tagged_features("phosphorilation of GENE1.",['GENE1']))
78
-
79
- assert_equal(
80
- [["GENE1",false, "GEN", "gene1", 1],
81
- ["phosphorilation",true, "pho", "phosphorilation", 0]],
82
- @parser.tagged_features("GENE1 phosphorilation",['GENE1']))
83
-
84
-
85
- assert_equal(
86
- [["phosphorilation",true, "pho", "phosphorilation", 0],
87
- ["of",true, false, "of", 0],
88
- ["GENE",true, "GEN", "gene", 1],
89
- ["1",false, false, "1", 2],
90
- [".", false, false, ".", 0]],
91
- @parser.tagged_features("phosphorilation of GENE 1.",['GENE 1']))
92
- end
93
-
94
- def test_tagged_features_reverse
95
- @parser.reverse = true
96
- assert_equal(
97
- [
98
- ["GENE1",false, "GEN", "gene1", 1],
99
- ["of",true, false, "of", 0],
100
- ["phosphorilation",true, "pho", "phosphorilation", 0]
101
- ],
102
- @parser.tagged_features("phosphorilation of GENE1",['GENE1']))
103
-
104
- assert_equal(
105
- [
106
- [".", false, false, ".", 0],
107
- ["1",false, false, "1", 1],
108
- ["GENE",true, "GEN", "gene", 2],
109
- ["of",true, false, "of", 0],
110
- ["phosphorilation",true, "pho", "phosphorilation", 0]
111
- ],
112
- @parser.tagged_features("phosphorilation of GENE 1.",['GENE 1']))
113
- end
114
-
115
-
116
- def test_NER_default
117
- parser = NERFeatures.new
118
-
119
- assert(parser.template =~ /UisLetter/)
120
- end
121
-
122
- def test_CRFPP_install
123
- assert(require File.join(Rbbt.datadir, 'third_party/crf++/ruby/CRFPP'))
124
- end
125
-
126
- end
@@ -1,47 +0,0 @@
1
- require File.dirname(__FILE__) + '/../../test_helper'
2
- require 'rbbt/ner/rnorm'
3
- require 'rbbt/util/open'
4
- require 'rbbt/util/tmpfile'
5
- require 'test/unit'
6
-
7
- class TestRNORM < Test::Unit::TestCase
8
-
9
- def setup
10
- tmp = TmpFile.tmp_file("test-rnorm-")
11
- lexicon =<<-EOT
12
- S000000029 YAL031C GIP4 FUN21
13
- S000000030 YAL032C PRP45 FUN20
14
- S000000031 YAL033W POP5 FUN53
15
- S000000374 YBR170C NPL4 HRD4
16
- S000000375 GENE1 BBB CCC
17
- S000000376 AAA GENE1 DDD
18
- EOT
19
-
20
- Open.write(tmp, lexicon)
21
-
22
- @norm = Normalizer.new(tmp)
23
- FileUtils.rm tmp
24
- end
25
-
26
- def test_match
27
- assert_equal(["S000000029"], @norm.match("FUN21"))
28
- assert_equal(["S000000030", "S000000029", "S000000031"], @norm.match("FUN"))
29
- assert_equal(["S000000030", "S000000029", "S000000031"], @norm.match("FUN 2"))
30
- assert_equal(["S000000030", "S000000029", "S000000031"], @norm.match("FUN 21"))
31
- assert_equal([], @norm.match("GER4"))
32
-
33
- @norm.match("FUN21")
34
- end
35
-
36
- def test_select
37
- assert_equal(["S000000029"], @norm.select(["S000000030", "S000000029", "S000000031"],"FUN 21"))
38
- end
39
-
40
- def test_resolve
41
- assert_equal(["S000000029"], @norm.resolve("FUN 21"))
42
- end
43
-
44
- def test_order
45
- assert_equal(["S000000375"], @norm.resolve("GENE1"))
46
- end
47
- end
@@ -1,38 +0,0 @@
1
- require File.dirname(__FILE__) + '/../../test_helper'
2
- require 'rbbt/sources/biocreative'
3
- require 'test/unit'
4
-
5
- class TestBiocreative < Test::Unit::TestCase
6
-
7
- def test_BC2GM
8
- assert(Biocreative.BC2GM(:test)['BC2GM000008491'][:text] == "Phenotypic analysis demonstrates that trio and Abl cooperate in regulating axon outgrowth in the embryonic central nervous system (CNS).")
9
- assert(Biocreative.BC2GM(:test)['BC2GM000008491'][:mentions] == ["trio", "Abl"] )
10
- end
11
-
12
- def test_position
13
- mention = "IgA"
14
- text = "Early complement components, C1q and C4, and IgA secretory piece were absent."
15
- pos = [[38, 40]]
16
- assert(Biocreative.position(text,mention) == pos)
17
-
18
- mention = "tyrosine-specific phosphatase"
19
- text = "When expressed in Escherichia coli, SH-PTP2 displays tyrosine-specific phosphatase activity."
20
- pos = [[46, 73]]
21
- assert(Biocreative.position(text,mention) == pos)
22
-
23
- mention = "tyrosine - specific phosphatase"
24
- text = "When expressed in Escherichia coli, SH-PTP2 displays tyrosine-specific phosphatase activity."
25
- pos = [[46, 73]]
26
- assert(Biocreative.position(text,mention) == pos)
27
-
28
- mention = "LH"
29
- text = "A new direct hemagglutination test (HI-GONAVIS) for urinary LH was compared with the serum radioimmuno-assay of LH."
30
- pos = [[52, 53],[96, 97]]
31
- assert(Biocreative.position(text,mention) == pos)
32
-
33
- end
34
-
35
-
36
- end
37
-
38
-
@@ -1,31 +0,0 @@
1
- require File.dirname(__FILE__) + '/../../test_helper'
2
- require 'rbbt/sources/biomart'
3
- require 'test/unit'
4
-
5
- class TestBioMart < Test::Unit::TestCase
6
-
7
- def test_get
8
- assert_raise BioMart::QueryError do
9
- BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['protein_id'],['with_unknownattr'])
10
- end
11
-
12
- data = BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['protein_id'],[])
13
- assert(data['856452']['protein_id'].include? 'AAB68382')
14
-
15
- data = BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['external_gene_id'],[], data )
16
- assert(data['856452']['protein_id'].include? 'AAB68382')
17
- assert(data['856452']['external_gene_id'].include? 'CUP1-2')
18
-
19
- end
20
-
21
- def test_query
22
- data = BioMart.query('scerevisiae_gene_ensembl','entrezgene', ['protein_id','refseq_peptide','external_gene_id','ensembl_gene_id'],[])
23
-
24
- assert(data['856452']['protein_id'].include? 'AAB68382')
25
- assert(data['856452']['external_gene_id'].include? 'CUP1-2')
26
-
27
- end
28
-
29
- end
30
-
31
-
@@ -1,49 +0,0 @@
1
- require File.dirname(__FILE__) + '/../../test_helper'
2
- require 'rbbt/sources/entrez'
3
- require 'test/unit'
4
-
5
- class TestEntrez < Test::Unit::TestCase
6
-
7
- def test_entrez2native
8
- tax = 4932
9
- fix = proc{|code| code.sub(/SGD:S0/,'S0') }
10
- check = proc{|code| code.match(/^S0/)}
11
-
12
- lexicon = Entrez.entrez2native(tax, 5, fix, check)
13
- assert(lexicon['855611'].include? 'S000005056')
14
- end
15
-
16
- def test_entrez2pubmed
17
- tax = 4932
18
-
19
- data = Entrez.entrez2pubmed(tax)
20
- assert(data['850320'].include? '15102838')
21
- end
22
-
23
- def test_getonline
24
- geneids = 9129
25
-
26
- assert_match(/PRP3 pre-mRNA processing factor/s, Entrez.get_online(geneids))
27
-
28
- geneids = [9129,9]
29
- assert_match(/PRP3 pre-mRNA processing factor/s, Entrez.get_online(geneids)[9129])
30
- end
31
-
32
- def test_getgene
33
- geneids = 9129
34
- assert_equal([["PRP3 pre-mRNA processing factor 3 homolog (S. cerevisiae)"]], Entrez.get_gene(geneids).description)
35
-
36
- geneids = [9129,728049]
37
- assert_equal([["PRP3 pre-mRNA processing factor 3 homolog (S. cerevisiae)"]], Entrez.get_gene(geneids)[9129].description)
38
-
39
- end
40
-
41
- def test_similarity
42
- assert(Entrez.gene_text_similarity(9129, "PRP3 pre-mRNA processing factor 3 homolog (S. cerevisiae)") > 0)
43
- assert_equal(0, Entrez.gene_text_similarity("NON EXISTEN GENEID", "PRP3 pre-mRNA processing factor 3 homolog (S. cerevisiae)"))
44
-
45
- end
46
-
47
- end
48
-
49
-