rbbt 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (59) hide show
  1. data/LICENSE +20 -0
  2. data/README.rdoc +17 -0
  3. data/bin/rbbt_config +180 -0
  4. data/install_scripts/classifier/R/classify.R +36 -0
  5. data/install_scripts/classifier/Rakefile +140 -0
  6. data/install_scripts/get_abner.sh +2 -0
  7. data/install_scripts/get_banner.sh +25 -0
  8. data/install_scripts/get_biocreative.sh +72 -0
  9. data/install_scripts/get_crf++.sh +26 -0
  10. data/install_scripts/get_entrez.sh +4 -0
  11. data/install_scripts/get_go.sh +4 -0
  12. data/install_scripts/get_polysearch.sh +8 -0
  13. data/install_scripts/ner/Rakefile +206 -0
  14. data/install_scripts/ner/config/default.rb +52 -0
  15. data/install_scripts/norm/Rakefile +218 -0
  16. data/install_scripts/norm/config/cue_default.rb +10 -0
  17. data/install_scripts/norm/config/tokens_default.rb +79 -0
  18. data/install_scripts/norm/functions.sh +21 -0
  19. data/install_scripts/organisms/Rakefile +25 -0
  20. data/install_scripts/organisms/cgd.Rakefile +84 -0
  21. data/install_scripts/organisms/human.Rakefile +145 -0
  22. data/install_scripts/organisms/mgi.Rakefile +77 -0
  23. data/install_scripts/organisms/pombe.Rakefile +40 -0
  24. data/install_scripts/organisms/rake-include.rb +258 -0
  25. data/install_scripts/organisms/rgd.Rakefile +88 -0
  26. data/install_scripts/organisms/sgd.Rakefile +66 -0
  27. data/install_scripts/organisms/tair.Rakefile +54 -0
  28. data/install_scripts/organisms/worm.Rakefile +109 -0
  29. data/install_scripts/stopwords +1 -0
  30. data/install_scripts/wordlists/consonants +897 -0
  31. data/install_scripts/wordlists/stopwords +1 -0
  32. data/lib/rbbt/bow/bow.rb +87 -0
  33. data/lib/rbbt/bow/classifier.rb +118 -0
  34. data/lib/rbbt/bow/dictionary.rb +218 -0
  35. data/lib/rbbt/ner/abner.rb +34 -0
  36. data/lib/rbbt/ner/banner.rb +73 -0
  37. data/lib/rbbt/ner/regexpNER.rb +62 -0
  38. data/lib/rbbt/ner/rner.rb +227 -0
  39. data/lib/rbbt/ner/rnorm/cue_index.rb +80 -0
  40. data/lib/rbbt/ner/rnorm/tokens.rb +213 -0
  41. data/lib/rbbt/ner/rnorm.rb +142 -0
  42. data/lib/rbbt/sources/biocreative.rb +75 -0
  43. data/lib/rbbt/sources/biomart.rb +106 -0
  44. data/lib/rbbt/sources/entrez.rb +211 -0
  45. data/lib/rbbt/sources/go.rb +40 -0
  46. data/lib/rbbt/sources/organism.rb +197 -0
  47. data/lib/rbbt/sources/polysearch.rb +88 -0
  48. data/lib/rbbt/sources/pubmed.rb +111 -0
  49. data/lib/rbbt/util/arrayHash.rb +255 -0
  50. data/lib/rbbt/util/filecache.rb +72 -0
  51. data/lib/rbbt/util/index.rb +69 -0
  52. data/lib/rbbt/util/misc.rb +101 -0
  53. data/lib/rbbt/util/open.rb +207 -0
  54. data/lib/rbbt/util/simpleDSL.rb +87 -0
  55. data/lib/rbbt/util/tmpfile.rb +19 -0
  56. data/lib/rbbt/version.rb +10 -0
  57. data/lib/rbbt.rb +86 -0
  58. data/tasks/install.rake +123 -0
  59. metadata +114 -0
@@ -0,0 +1,54 @@
1
+ require __FILE__.sub(/[^\/]*$/,'') + '../rake-include' # Path = this file's path with the file name stripped, plus '../rake-include'.
2
+ # Settings for Arabidopsis thaliana, stored in globals; presumably read by rake-include - verify.
3
+ $name = "Arabidopsis thaliana"
4
+
5
+
6
+ $native_id = "TAIR Locus"
7
+ # :fix removes a leading "TAIR:" from a code; :check accepts every code. :tax is presumably an NCBI taxonomy id - confirm.
8
+ $entrez2native = {
9
+ :tax =>3702,
10
+ :fix => proc{|code| code.sub(/^TAIR:/,'')},
11
+ :check => proc{|code| true },
12
+ }
13
+ # Alias file used for the lexicon; :native and :extra look like column positions - TODO confirm against rake-include.
14
+ $lexicon = {
15
+ :file => {
16
+ :url => "ftp://ftp.arabidopsis.org/home/tair/Genes/gene_aliases.20090313",
17
+ :native => 0,
18
+ :extra => [1,2],
19
+ },
20
+ }
21
+ # Same alias file URL as $lexicon but with an empty :extra list, plus a :biomart section.
22
+ $identifiers = {
23
+ :file => {
24
+ :url => "ftp://ftp.arabidopsis.org/home/tair/Genes/gene_aliases.20090313",
25
+ :native => 0,
26
+ :extra => [],
27
+ },
28
+ :biomart => {
29
+ :database => 'athaliana_eg_gene',
30
+ :main => ['TAIR Locus', 'tair_locus'],
31
+ :extra => [ # Each entry is a pair of strings; presumably [display name, BioMart attribute] - confirm.
32
+ ['Associated Gene Name' , "external_gene_id"] ,
33
+ ['Gramene Gene ID' , "ensembl_gene_id"] ,
34
+ ['RefSeq peptide' , "refseq_peptide"] ,
35
+ ['Unigene' , "unigene"] ,
36
+ ['Interpro ID' , "interpro"] ,
37
+
38
+
39
+ ],
40
+ :filter => ['with_tair_locus'], # This is needed as the filter is not with_mgi_id as was expected. NOTE(review): "with_mgi_id" looks carried over from the MGI configuration - confirm the intended wording.
41
+ }
42
+
43
+ }
44
+ # GO annotation file; :code, :go and :pmid are presumably column positions - confirm against rake-include.
45
+ $go = {
46
+ :url => "ftp://ftp.arabidopsis.org/home/tair/Ontologies/Gene_Ontology/ATH_GO_GOSLIM.txt",
47
+ :code => 0,
48
+ :go => 5,
49
+ :pmid => 12,
50
+ }
51
+ # Search expression using [MeSH Terms], [TIAB] and [Text Word] field tags; presumably a PubMed query - verify.
52
+ $query = '("arabidopsis"[MeSH Terms] OR Arabidopsis[Text Word]) AND ((("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word]) OR (("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word]))'
53
+
54
+
@@ -0,0 +1,109 @@
1
+ require __FILE__.sub(/[^\/]*$/,'') + '../rake-include' # Path = this file's path with the file name stripped, plus '../rake-include'.
2
+ # Settings for Caenorhabditis elegans, stored in globals; presumably read by rake-include - verify.
3
+ $name = "Caenorhabditis elegans " # NOTE(review): the string ends with a space - confirm whether that is intended.
4
+
5
+
6
+ $native_id = "WormBase ID"
7
+ # :fix removes a leading "WormBase:" from a code; :check is truthy only for codes matching /^WBGene/.
8
+ $entrez2native = {
9
+ :tax => 6239,
10
+ :fix => proc{|code| code.sub(/^WormBase:/,'')},
11
+ :check => proc{|code| code.match(/^WBGene/)},
12
+ }
13
+ # The active :url contains "GO", so the Open.read override at the end of this file reshapes its content.
14
+ $lexicon = {
15
+
16
+ :file =>{
17
+
18
+ :url => "ftp://ftp.wormbase.org/pub/wormbase/genomes/elegans/annotations/GO/current.txt.gz",
19
+ :native => 0,
20
+ :extra => [1,2],
21
+ # Disabled alternative source (gene_ids file), kept for reference:
22
+ # :url => "ftp://ftp.wormbase.org/pub/wormbase/genomes/elegans/annotations/gene_ids/current.gz",
23
+ # :native => 0,
24
+ # :extra => [2,3,4,5],
25
+
26
+ },
27
+ }
28
+
29
+ # Same :file settings as $lexicon, plus a :biomart section.
30
+ $identifiers = {
31
+
32
+ :file =>{
33
+
34
+ :url => "ftp://ftp.wormbase.org/pub/wormbase/genomes/elegans/annotations/GO/current.txt.gz",
35
+ :native => 0,
36
+ :extra => [1,2],
37
+ # Disabled alternative source (gene_ids file), kept for reference:
38
+ # :url => "ftp://ftp.wormbase.org/pub/wormbase/genomes/elegans/annotations/gene_ids/current.gz",
39
+ # :native => 0,
40
+ # :extra => [2,3,4,5],
41
+
42
+ },
43
+
44
+ :biomart => {
45
+ :database => 'celegans_gene_ensembl',
46
+ :main => ['Entrez Gene ID' , "entrezgene"],
47
+ :extra => [ # Pairs of strings, presumably [display name, BioMart attribute]. NOTE(review): several labels end with a space - confirm that is harmless.
48
+ ['WormBase gene', "wormbase_gene" ],
49
+ ['Associated Gene Name ', "external_gene_id" ],
50
+ ['WormPep id', "wormpep_id" ],
51
+ [ 'Ensembl Gene ID', "ensembl_gene_id" ],
52
+ [ 'Ensembl Protein ID', "ensembl_peptide_id" ],
53
+ [ 'Protein ID ', "protein_id" ],
54
+ [ 'RefSeq Protein ID ', "refseq_peptide" ],
55
+ [ 'Unigene ID ', "unigene" ],
56
+ [ 'UniProt/SwissProt ID', "uniprot_swissprot" ],
57
+ [ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
58
+ ['EMBL (Genbank) ID' , "embl"] ,
59
+ ],
60
+ :filter => [],
61
+ }
62
+ }
63
+ # The reshaped GO rows are id, name, extra, go joined by tabs, so index 3 is the GO id if these are column positions.
64
+ $go = {
65
+ :url => "ftp://ftp.wormbase.org/pub/wormbase/genomes/elegans/annotations/GO/current.txt.gz",
66
+ :code => 0,
67
+ :go => 3,
68
+ :pmid => 3, # NOTE(review): same value as :go - confirm this is intended.
69
+ }
70
+ # Search expression using [MeSH Terms] and [Text Word] field tags; presumably a PubMed query - verify.
71
+ $query = '"caenorhabditis elegans"[MeSH Terms] OR Caenorhabditis elegans[Text Word]'
72
+ ##########################
73
+
74
+
75
+ module Open
76
+ # Overrides Open.read so content fetched from the URLs configured in this file comes back reshaped.
77
+ class << self
78
+ alias_method :old_read, :read # Keep the previously defined read available as old_read.
79
+ # Reads url via old_read, then: URLs matching /GO/ are rebuilt into tab-separated rows, URLs matching /gene_ids/ get commas turned into tabs, anything else is returned untouched.
80
+ def read(url, options = {})
81
+ content = old_read(url, options)
82
+ # First gsub: a line portion like "...:...(GO:123)" collapses to "GO:123". Second gsub: each newline followed by "GO" becomes "|GO", appending those ids to the preceding line.
83
+ if url =~ /GO/
84
+ return content.gsub(/.*:.*\((GO:\d+)\)/,'\1').gsub(/\nGO/,"|GO").
85
+ collect{|l| # NOTE(review): relies on String responding to collect (String as Enumerable, Ruby 1.8 era) - confirm target Ruby version; the result would be an Array, unlike the String returned by the other branches.
86
+ l = l.sub(/\|/,"\t") # Only the first "|" becomes a tab, separating the name part from the GO ids.
87
+ names, gos = l.chomp.split(/\t/)
88
+ # The name part is split on single spaces into id, name and an optional third field.
89
+ id, name, extra = names.split(/ /)
90
+ extra = extra.gsub(/[()]/,'') if extra # Drop parentheses from the third field when present.
91
+ # With GO ids: one tab-joined row per id, rows joined by newlines (no trailing newline). Without: a single row ending in a newline.
92
+ if gos
93
+ gos.split(/\|/).collect{|go|
94
+ [id, name, extra, go].join("\t")
95
+ }.join("\n")
96
+ else
97
+ [id, name, extra].join("\t") + "\n"
98
+ end
99
+ }
100
+ elsif url =~ /gene_ids/
101
+ return content.gsub(/,/,"\t") # Every comma becomes a tab.
102
+ else
103
+ return content
104
+ end
105
+
106
+ end
107
+ end
108
+ end
109
+
@@ -0,0 +1 @@
1
+ a been get least our them whether about before getting left ourselves then which after being go less out there while again between goes let over these who ago but going like per they whoever all by gone make put this whom almost came got many putting those whose also can gotten may same through why always cannot had maybe saw till will am come has me see to with an could have mine seen too within and did having more shall two without another do he most she unless won't any does her much should until would anybody doing here my so up wouldn't anyhow done him myself some upon yet anyone down his never somebody us you anything each how no someone very your anyway else i none something was are even if not stand we as ever in now such went at every into of sure were away everyone is off take what back everything isn't on than whatever be for it one that what's became from just onto the when because front last or their where