rbbt 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (59) hide show
  1. data/LICENSE +20 -0
  2. data/README.rdoc +17 -0
  3. data/bin/rbbt_config +180 -0
  4. data/install_scripts/classifier/R/classify.R +36 -0
  5. data/install_scripts/classifier/Rakefile +140 -0
  6. data/install_scripts/get_abner.sh +2 -0
  7. data/install_scripts/get_banner.sh +25 -0
  8. data/install_scripts/get_biocreative.sh +72 -0
  9. data/install_scripts/get_crf++.sh +26 -0
  10. data/install_scripts/get_entrez.sh +4 -0
  11. data/install_scripts/get_go.sh +4 -0
  12. data/install_scripts/get_polysearch.sh +8 -0
  13. data/install_scripts/ner/Rakefile +206 -0
  14. data/install_scripts/ner/config/default.rb +52 -0
  15. data/install_scripts/norm/Rakefile +218 -0
  16. data/install_scripts/norm/config/cue_default.rb +10 -0
  17. data/install_scripts/norm/config/tokens_default.rb +79 -0
  18. data/install_scripts/norm/functions.sh +21 -0
  19. data/install_scripts/organisms/Rakefile +25 -0
  20. data/install_scripts/organisms/cgd.Rakefile +84 -0
  21. data/install_scripts/organisms/human.Rakefile +145 -0
  22. data/install_scripts/organisms/mgi.Rakefile +77 -0
  23. data/install_scripts/organisms/pombe.Rakefile +40 -0
  24. data/install_scripts/organisms/rake-include.rb +258 -0
  25. data/install_scripts/organisms/rgd.Rakefile +88 -0
  26. data/install_scripts/organisms/sgd.Rakefile +66 -0
  27. data/install_scripts/organisms/tair.Rakefile +54 -0
  28. data/install_scripts/organisms/worm.Rakefile +109 -0
  29. data/install_scripts/stopwords +1 -0
  30. data/install_scripts/wordlists/consonants +897 -0
  31. data/install_scripts/wordlists/stopwords +1 -0
  32. data/lib/rbbt/bow/bow.rb +87 -0
  33. data/lib/rbbt/bow/classifier.rb +118 -0
  34. data/lib/rbbt/bow/dictionary.rb +218 -0
  35. data/lib/rbbt/ner/abner.rb +34 -0
  36. data/lib/rbbt/ner/banner.rb +73 -0
  37. data/lib/rbbt/ner/regexpNER.rb +62 -0
  38. data/lib/rbbt/ner/rner.rb +227 -0
  39. data/lib/rbbt/ner/rnorm/cue_index.rb +80 -0
  40. data/lib/rbbt/ner/rnorm/tokens.rb +213 -0
  41. data/lib/rbbt/ner/rnorm.rb +142 -0
  42. data/lib/rbbt/sources/biocreative.rb +75 -0
  43. data/lib/rbbt/sources/biomart.rb +106 -0
  44. data/lib/rbbt/sources/entrez.rb +211 -0
  45. data/lib/rbbt/sources/go.rb +40 -0
  46. data/lib/rbbt/sources/organism.rb +197 -0
  47. data/lib/rbbt/sources/polysearch.rb +88 -0
  48. data/lib/rbbt/sources/pubmed.rb +111 -0
  49. data/lib/rbbt/util/arrayHash.rb +255 -0
  50. data/lib/rbbt/util/filecache.rb +72 -0
  51. data/lib/rbbt/util/index.rb +69 -0
  52. data/lib/rbbt/util/misc.rb +101 -0
  53. data/lib/rbbt/util/open.rb +207 -0
  54. data/lib/rbbt/util/simpleDSL.rb +87 -0
  55. data/lib/rbbt/util/tmpfile.rb +19 -0
  56. data/lib/rbbt/version.rb +10 -0
  57. data/lib/rbbt.rb +86 -0
  58. data/tasks/install.rake +123 -0
  59. metadata +114 -0
@@ -0,0 +1,145 @@
1
+ require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
2
+
3
+ $name = "Homo sapiens"
4
+
5
+
6
+ $native_id = "Entrez Gene ID"
7
+
8
+ $entrez2native = {
9
+ :tax => 9606,
10
+ :fix => nil,
11
+ :check => proc{|code| false},
12
+ }
13
+
14
+ $lexicon = {
15
+ :biomart => {
16
+ :database => 'hsapiens_gene_ensembl',
17
+ :main => ['Entrez Gene ID' , "entrezgene"],
18
+ :extra => [
19
+ [ 'Associated Gene Name' , "external_gene_id"],
20
+ [ 'HGNC symbol', "hgnc_symbol" ],
21
+ [ 'HGNC automatic gene name', "hgnc_automatic_gene_name" ],
22
+ [ 'HGNC curated gene name ', "hgnc_curated_gene_name" ],
23
+ ],
24
+ }
25
+
26
+ }
27
+
28
+ $identifiers = {
29
+ :biomart => {
30
+ :database => 'hsapiens_gene_ensembl',
31
+ :main => ['Entrez Gene ID' , "entrezgene"],
32
+ :extra => [
33
+ [ 'Ensembl Gene ID', "ensembl_gene_id" ],
34
+ [ 'Ensembl Protein ID', "ensembl_peptide_id" ],
35
+ [ 'Associated Gene Name', "external_gene_id" ],
36
+ [ 'CCDS ID', "ccds" ],
37
+ [ 'Protein ID', "protein_id" ],
38
+ [ 'RefSeq Protein ID', "refseq_peptide" ],
39
+ [ 'Unigene ID', "unigene" ],
40
+ [ 'UniProt/SwissProt ID', "uniprot_swissprot" ],
41
+ [ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
42
+ [ 'HGNC ID', "hgnc_id", 'HGNC'],
43
+ ['EMBL (Genbank) ID' , "embl"] ,
44
+
45
+ # Affymetrix
46
+ [ 'AFFY HC G110', 'affy_hc_g110' ],
47
+ [ 'AFFY HG FOCUS', 'affy_hg_focus' ],
48
+ [ 'AFFY HG U133-PLUS-2', 'affy_hg_u133_plus_2' ],
49
+ [ 'AFFY HG U133A_2', 'affy_hg_u133a_2' ],
50
+ [ 'AFFY HG U133A', 'affy_hg_u133a' ],
51
+ [ 'AFFY HG U133B', 'affy_hg_u133b' ],
52
+ [ 'AFFY HG U95AV2', 'affy_hg_u95av2' ],
53
+ [ 'AFFY HG U95B', 'affy_hg_u95b' ],
54
+ [ 'AFFY HG U95C', 'affy_hg_u95c' ],
55
+ [ 'AFFY HG U95D', 'affy_hg_u95d' ],
56
+ [ 'AFFY HG U95E', 'affy_hg_u95e' ],
57
+ [ 'AFFY HG U95A', 'affy_hg_u95a' ],
58
+ [ 'AFFY HUGENEFL', 'affy_hugenefl' ],
59
+ [ 'AFFY HuEx', 'affy_huex_1_0_st_v2' ],
60
+ [ 'AFFY HuGene', 'affy_hugene_1_0_st_v1' ],
61
+ [ 'AFFY U133 X3P', 'affy_u133_x3p' ],
62
+ [ 'Agilent WholeGenome',"agilent_wholegenome" ],
63
+ [ 'Agilent CGH 44b', 'agilent_cgh_44b' ],
64
+ [ 'Codelink ID', 'codelink' ],
65
+ [ 'Illumina HumanWG 6 v2', 'illumina_humanwg_6_v2' ],
66
+ [ 'Illumina HumanWG 6 v3', 'illumina_humanwg_6_v3' ],
67
+
68
+ ],
69
+ :filter => [],
70
+ }
71
+ }
72
+
73
+ $go = {
74
+ :url => "http://cvsweb.geneontology.org/cgi-bin/cvsweb.cgi/go/gene-associations/gene_association.goa_human.gz?rev=HEAD",
75
+ :code => 2,
76
+ :go => 4,
77
+ :pmid => 5,
78
+ }
79
+
80
+ $query = '"humans"[MeSH Terms] AND ((("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word]) OR (("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word])) AND (hasabstract[text] AND "humans"[MeSH Terms] AND English[lang])'
81
+ ##########################
82
+
83
+ require 'rbbt/util/index'
84
+
85
# Human-specific replacement for the 'gene.go' file task: the gene codes found
# in the GO association file are translated through an index built with
# Organism.id_index before being written out, one "<gene>\t<GO term>" per line.
#
# Any previously defined 'gene.go' task is cleared first; the guard keeps this
# file loadable when no such task has been defined yet.
Rake::Task['gene.go'].clear if Rake::Task.task_defined?('gene.go')
file 'gene.go' => ['identifiers'] do
  # Nothing is written when the 'identifiers' file is not available.
  # File.exist? is used because File.exists? was removed in Ruby 3.2.
  if File.exist? 'identifiers'
    require 'rbbt/sources/organism'
    index = Organism.id_index('human', :other => ['Associated Gene Name'])
    data  = Open.to_hash($go[:url], :native => $go[:code], :extra => $go[:go], :exclude => $go[:exclude])

    lines = []
    data.each do |code, value_lists|
      gene = index[code]
      next unless gene # code with no translation in the index

      # Keep the first GO accession found in each reference, once per gene.
      terms = value_lists.flatten.collect { |ref| ref.to_s[/GO:\d+/] }.compact.uniq
      terms.each { |go| lines << "#{gene}\t#{go}" }
    end

    Open.write('gene.go', lines.join("\n"))
  end
end
106
+
107
# Human-specific replacement for the 'gene_go.pmid' file task: gene codes from
# the GO association file are translated through Index.index('identifiers') and
# written with the PubMed ids that support them, one "<gene>\t<pmid>" per line.
#
# Any previously defined 'gene_go.pmid' task is cleared first; the guard keeps
# this file loadable when no such task has been defined yet.
Rake::Task['gene_go.pmid'].clear if Rake::Task.task_defined?('gene_go.pmid')
file 'gene_go.pmid' => ['identifiers'] do
  # Nothing is written when the 'identifiers' file is not available.
  # File.exist? is used because File.exists? was removed in Ruby 3.2.
  if File.exist? 'identifiers'
    index = Index.index('identifiers')
    data  = Open.to_hash($go[:url], :native => $go[:code], :extra => $go[:pmid], :exclude => $go[:exclude])

    lines = []
    data.each do |code, value_lists|
      gene = index[code]
      next unless gene # code with no translation in the index

      # Keep the digits of the first "PMID:<digits>" found in each reference.
      pmids = value_lists.flatten.collect { |ref| ref.to_s[/PMID:(\d+)/, 1] }.compact.uniq
      pmids.each { |pmid| lines << "#{gene}\t#{pmid}" }
    end

    Open.write('gene_go.pmid', lines.join("\n"))
  end
end
127
+
128
+
129
# Download URL for the HGNC table used to build the human lexicon. Defined once
# at load time (instead of inside the task body) so that running the task more
# than once does not re-assign the constant.
HGNC_URL = 'http://www.genenames.org/cgi-bin/hgnc_downloads.cgi?title=HGNC+output+data&hgnc_dbtag=on&col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_prev_name&col=gd_aliases&col=gd_name_aliases&col=gd_pub_acc_ids&status=Approved&status_opt=2&level=pri&=on&where=&order_by=gd_app_sym_sort&limit=&format=text&submit=submit&.cgifields=&.cgifields=level&.cgifields=chr&.cgifields=status&.cgifields=hgnc_dbtag' unless defined?(HGNC_URL)

# Human-specific replacement for the 'lexicon' file task: names come from the
# HGNC download and are keyed by the id obtained from an HGNC ID index
# (Organism.id_index). One line per gene: "<id>\t<name>\t<name>...".
#
# Any previously defined 'lexicon' task is cleared first; the guard keeps this
# file loadable when no such task has been defined yet.
Rake::Task['lexicon'].clear if Rake::Task.task_defined?('lexicon')
file 'lexicon' => ['identifiers'] do
  # Nothing is written when the 'identifiers' file is not available.
  # File.exist? is used because File.exists? was removed in Ruby 3.2.
  if File.exist? 'identifiers'
    require 'rbbt/sources/organism'
    # The header line of the download (starting with "HGNC ID") is excluded.
    names = Open.to_hash(HGNC_URL, :exclude => proc{|l| l.match(/^HGNC ID/)}, :flatten => true)
    translations = Organism.id_index('human', :native => 'Entrez Gene ID', :other => ['HGNC ID'])

    entries = names.collect { |code, synonyms|
      gene = translations[code]
      next unless gene # HGNC entry with no translation
      ([gene] + synonyms).join("\t")
    }.compact

    Open.write('lexicon', entries.join("\n"))
  end
end
@@ -0,0 +1,77 @@
1
+ require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
2
+
3
+ $name = "Mus musculus"
4
+
5
+
6
+ $native_id = "MGI DB ID"
7
+
8
+ $entrez2native = {
9
+ :tax => 10090,
10
+ :fix => nil,
11
+ :check => proc{|code| code.match(/^MGI/)},
12
+ }
13
+
14
+ $lexicon = {
15
+ :file => {
16
+ :url => "ftp://ftp.informatics.jax.org/pub/reports/MGI_Coordinate.rpt",
17
+ :native => 0,
18
+ :extra => [2,3],
19
+ :exclude => proc{|l| l.split(/\t/)[1] != "Gene"},
20
+ },
21
+ }
22
+
23
+ $identifiers = {
24
+ :file => {
25
+ :url => "ftp://ftp.informatics.jax.org/pub/reports/MGI_Coordinate.rpt",
26
+ :native => 0,
27
+ :extra => [],
28
+ :exclude => proc{|l| l.split(/\t/)[1] != "Gene"},
29
+ },
30
+ :biomart => {
31
+ :database => 'mmusculus_gene_ensembl',
32
+ :main => ['MGI DB ID', 'mgi_id'] ,
33
+ :extra => [
34
+ ['Associated Gene Name' , "external_gene_id"],
35
+ ['Protein ID' , "protein_id"] ,
36
+ ['UniProt/SwissProt ID' , "uniprot_swissprot"] ,
37
+ ['Unigene ID' , "unigene"] ,
38
+ ['UniProt/SwissProt Accession' , "uniprot_swissprot_accession"] ,
39
+ ['RefSeq Protein ID' , "refseq_peptide"] ,
40
+ ['EMBL (Genbank) ID' , "embl"] ,
41
+
42
+ ['Affy mg u74a',"affy_mg_u74a" ],
43
+ ['Affy mg u74av2',"affy_mg_u74av2" ],
44
+ ['Affy mg u74b',"affy_mg_u74b" ],
45
+ ['Affy mg u74bv2',"affy_mg_u74bv2" ],
46
+ ['Affy mg u74c',"affy_mg_u74c" ],
47
+ ['Affy mg u74cv2',"affy_mg_u74cv2" ],
48
+ ['Affy moe430a',"affy_moe430a" ],
49
+ ['Affy moe430b',"affy_moe430b" ],
50
+ ['AFFY MoEx',"affy_moex_1_0_st_v1" ],
51
+ ['AFFY MoGene',"affy_mogene_1_0_st_v1" ],
52
+ ['Affy mouse430 2',"affy_mouse430_2" ],
53
+ ['Affy mouse430a 2',"affy_mouse430a_2" ],
54
+ ['Affy mu11ksuba',"affy_mu11ksuba" ],
55
+ ['Affy mu11ksubb',"affy_mu11ksubb" ],
56
+ ['Agilent WholeGenome',"agilent_wholegenome" ],
57
+ ['Codelink ID',"codelink" ],
58
+ ['Illumina MouseWG 6 v1',"illumina_mousewg_6_v1" ],
59
+ ['Illumina MouseWG 6 v2',"illumina_mousewg_6_v2" ],
60
+
61
+ ],
62
+ :filter => ['with_mgi'], # This is needed as the filter is not with_mgi_id as was expected
63
+ }
64
+ }
65
+
66
+ $go = {
67
+ :url => "ftp://ftp.geneontology.org/go/gene-associations/gene_association.mgi.gz",
68
+ :code => 1,
69
+ :go => 4,
70
+ :pmid => 5,
71
+ }
72
+
73
+ $query = '(("mice"[TIAB] NOT Medline[SB]) OR "mice"[MeSH Terms] OR mouse[Text Word]) AND ((("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word]) OR (("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word]))'
74
+ ##########################
75
+
76
+
77
+
@@ -0,0 +1,40 @@
1
+ require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
2
+
3
+ $name = "Schizosaccharomyces pombe"
4
+
5
+
6
+ $native_id = "GeneDB Id"
7
+
8
+ $entrez2native = {
9
+ :tax => 4896,
10
+ :fix => proc{|code| code.sub(/GeneDB:SP/,'SP') },
11
+ :check => proc{|code| code.match(/^SP/)},
12
+ }
13
+
14
+ $lexicon = {
15
+ :file => {
16
+ :url => 'ftp://ftp.sanger.ac.uk/pub/yeast/pombe/Mappings/allNames.txt',
17
+ :native => 0,
18
+ :extra => [1,2,3,4,5,6,7,8]
19
+ },
20
+ }
21
+
22
+ $identifiers = {
23
+ :file => {
24
+ :url => 'ftp://ftp.sanger.ac.uk/pub/yeast/pombe/Mappings/allNames.txt',
25
+ :native => 0,
26
+ :extra => [],
27
+ },
28
+ }
29
+
30
+ $go = {
31
+ :url => "ftp://ftp.sanger.ac.uk/pub/yeast/pombe/Gene_ontology/gene_association.GeneDB_Spombe",
32
+ :code => 1,
33
+ :go => 4,
34
+ :pmid => 5,
35
+ }
36
+
37
+ $query = 'pombe[All Fields] AND (hasabstract[text] AND English[lang])'
38
+ ####
39
+
40
+
@@ -0,0 +1,258 @@
1
+ require 'rbbt'
2
+ require 'rbbt/util/open'
3
+ require 'rbbt/util/arrayHash'
4
+ require 'rbbt/sources/biomart'
5
+ require 'rbbt/sources/entrez'
6
+ require 'rbbt/sources/pubmed'
7
+
8
+
9
+
10
# Writes the organism name held in $name to the 'name' file.
file('name') { Open.write('name', $name) }
13
+
14
# Runs the organism's PubMed query ($query) and stores the returned ids in
# 'all.pmid', one per line.
file 'all.pmid' do
  pmids = PubMed.query($query)
  Open.write('all.pmid', pmids.join("\n"))
end
17
+
18
# Builds the 'lexicon' file: one line per gene holding its code followed by its
# non-empty, de-duplicated names separated by tabs. Names are collected, in
# this order, from the flat file in $lexicon[:file], the Entrez translation
# table, the BioMart query in $lexicon[:biomart] and the Entrez gene aliases.
# When Entrez raises Entrez::NoFile a hint is printed and nothing is written.
file 'lexicon' do
  begin
    data = nil

    # Source 1: flat file described by $lexicon[:file]
    file_conf = $lexicon[:file]
    if file_conf
      data = ArrayHash.new(Open.to_hash(file_conf[:url], file_conf), $native_id)
    end

    # Source 2: Entrez Gene ID -> native id translations
    if $entrez2native
      translations = {}
      Entrez.entrez2native(*$entrez2native.values_at(:tax, :native, :fix, :check)).each do |entrez, natives|
        translations[entrez] = [natives.join("|")]
      end

      translations_data = ArrayHash.new(translations, 'Entrez Gene ID', [$native_id])
      if data.nil?
        data = translations_data
      else
        data.merge(translations_data)
      end
    end

    # Source 3: BioMart attributes, merged with whatever was gathered so far
    mart = $lexicon[:biomart]
    if mart
      attributes = mart[:extra].collect { |v| v[1] }
      labels     = mart[:extra].collect { |v| v[0] }

      biomart = {}
      # A copy is handed to the query so the list used below is never shared with it.
      BioMart.query(mart[:database], mart[:main][1], attributes.dup, mart[:filter]).each do |key, values_list|
        biomart[key] = values_list.values_at(*attributes).compact.collect { |list|
          list.reject { |e| e.to_s == "" }.uniq.join("|")
        }
      end

      biomart_data = ArrayHash.new(biomart, mart[:main][0], labels)

      if data.nil?
        data = biomart_data
      else
        # NOTE(review): $native_id is a display name but is looked up among the
        # BioMart attribute names (v[1]) rather than the labels (v[0]) - confirm
        # whether that is intended.
        native_in_mart = attributes.include?($native_id) || mart[:main][0] == $native_id
        field = native_in_mart ? $native_id : 'Entrez Gene ID'
        data.merge(biomart_data, field)
      end
    end

    # Source 4: Entrez gene aliases
    if $entrez2native
      gene_alias = {}
      Entrez.entrez2native($entrez2native[:tax], 4).each do |entrez, aliases|
        gene_alias[entrez] = [aliases.reject { |e| e.to_s == "" }.join("|")]
      end

      if gene_alias.keys.any?
        gene_alias_data = ArrayHash.new(gene_alias, 'Entrez Gene ID', ['Entrez Gene Alias'])
        data.merge(gene_alias_data, 'Entrez Gene ID')
      end
    end

    # The Entrez ids were only needed for merging; they are not names.
    data.remove('Entrez Gene ID')
    data.clean

    lines = data.data.collect do |code, name_lists|
      "#{ code }\t" + name_lists.flatten.reject { |n| n.to_s == "" }.uniq.join("\t")
    end
    Open.write('lexicon', lines.join("\n"))

  rescue Entrez::NoFile
    puts "Lexicon not produced for #{$name}, install the entrez gene_info file (rbbt_config install entrez)."
  end
end
95
+
96
+
97
# Builds the 'identifiers' file: a "#<native id>\t<field>\t<field>..." header
# followed by one tab-separated line per gene. Identifiers are collected, in
# this order, from the flat file in $identifiers[:file], the Entrez translation
# table, the BioMart query in $identifiers[:biomart] and the Entrez gene
# aliases. When Entrez raises Entrez::NoFile a hint is printed and nothing is
# written.
file 'identifiers' do
  begin
    data = nil

    # Source 1: flat file described by $identifiers[:file]
    if $identifiers[:file]
      file = Open.to_hash($identifiers[:file][:url], $identifiers[:file])
      data = ArrayHash.new(file, $native_id, $identifiers[:file][:fields])
    end

    # Source 2: Entrez Gene ID -> native id translations, if any were found
    if $entrez2native
      translations = {}
      Entrez.entrez2native(*$entrez2native.values_at(:tax, :native, :fix, :check)).each do |entrez, natives|
        translations[entrez] = [natives.join("|")]
      end

      if translations.keys.any?
        translations_data = ArrayHash.new(translations, 'Entrez Gene ID', [$native_id])
        if data
          data.merge(translations_data)
        else
          data = translations_data
        end
      end
    end

    # Source 3: BioMart attributes, merged with whatever was gathered so far
    mart = $identifiers[:biomart]
    if mart
      attributes = mart[:extra].collect { |v| v[1] }

      biomart = {}
      # A copy is handed to the query so the list used below is never shared with it.
      BioMart.query(mart[:database], mart[:main][1], attributes.dup, mart[:filter]).each do |key, values_list|
        biomart[key] = values_list.values_at(*attributes).compact.collect { |list|
          list.select { |e| e.to_s != "" }.uniq.join("|")
        }
      end

      biomart_data = ArrayHash.new(biomart, mart[:main][0], mart[:extra].collect { |v| v[0] })

      # An :extra entry with a third element (e.g. 'HGNC') has its field passed
      # through ArrayHash#process with a block that prepends "<third element>:".
      mart[:extra].each do |values|
        biomart_data.process(values[0]) { |n| "#{values[2]}:#{n}" } if values[2]
      end

      if data
        # NOTE(review): $native_id is a display name but is looked up among the
        # BioMart attribute names (v[1]) rather than the labels (v[0]) - confirm
        # whether that is intended.
        if attributes.include?($native_id) || mart[:main][0] == $native_id
          field = $native_id
        else
          field = 'Entrez Gene ID'
        end
        data.merge(biomart_data, field)
      else
        data = biomart_data
      end
    end

    # Source 4: Entrez gene aliases, added at the end
    if $entrez2native
      gene_alias = {}
      Entrez.entrez2native($entrez2native[:tax], 4).each do |entrez, aliases|
        gene_alias[entrez] = [aliases.join("|")]
      end

      if gene_alias.keys.any?
        gene_alias_data = ArrayHash.new(gene_alias, 'Entrez Gene ID', ['Entrez Gene Alias'])
        if data
          data.merge(gene_alias_data, 'Entrez Gene ID')
        else
          data = gene_alias_data
        end
      end
    end

    # Write ids to file. The header is computed (from the fields as they are
    # before #clean) and the data cleaned BEFORE the file is created, so a
    # failure here does not leave an empty 'identifiers' target behind; the
    # block form of File.open closes the handle even if writing raises.
    header = "##{$native_id}\t" + data.fields.join("\t")
    data.clean
    File.open('identifiers', 'w') do |fout|
      fout.puts header
      data.data.each do |code, values|
        fout.puts "#{code}\t" + values.join("\t")
      end
    end

  rescue Entrez::NoFile
    puts "Identifiers not produced for #{$name}, install the entrez gene_info file (rbbt_config install entrez)."
  end
end
192
+
193
+
194
# Builds 'gene.go' from the GO association file described by $go: one
# "<gene code>\t<GO term>" line for every distinct GO accession found among a
# gene's references. Genes without any GO accession are left out.
file 'gene.go' do
  data = Open.to_hash($go[:url], :native => $go[:code], :extra => $go[:go], :exclude => $go[:exclude], :fix => $go[:fix])

  chunks = data.collect { |code, value_lists|
    # One regex pass per reference: take its first GO accession, if it has one
    # (to_s keeps nil or non-String entries from raising).
    terms = value_lists.flatten.collect { |ref| ref.to_s[/GO:\d+/] }.compact.uniq
    terms.collect { |go| "#{code}\t#{go}" }.join("\n") if terms.any?
  }.compact

  Open.write('gene.go', chunks.join("\n"))
end
209
+
210
# Builds 'gene_go.pmid' from the GO association file described by $go: one
# "<gene code>\t<pmid>" line for every distinct PubMed id found among a gene's
# references. Genes without any "PMID:<digits>" reference are left out.
file 'gene_go.pmid' do
  data = Open.to_hash($go[:url], :native => $go[:code], :extra => $go[:pmid], :exclude => $go[:exclude], :fix => $go[:fix])

  chunks = data.collect { |code, value_lists|
    # One regex pass per reference: take the digits of its first PMID, if any
    # (to_s keeps nil or non-String entries from raising).
    pmids = value_lists.flatten.collect { |ref| ref.to_s[/PMID:(\d+)/, 1] }.compact.uniq
    pmids.collect { |pmid| "#{code}\t#{pmid}" }.join("\n") if pmids.any?
  }.compact

  Open.write('gene_go.pmid', chunks.join("\n"))
end
223
+
224
+
225
# Builds 'gene.pmid' from the Entrez gene/PubMed associations of the organism
# ($entrez2native[:tax]): one "<gene code>\t<pmid>" line per association.
# Unless the native id already is the Entrez Gene ID, Entrez ids are replaced
# by the first native id they translate to and untranslatable genes are
# skipped. When Entrez raises Entrez::NoFile a hint is printed and nothing is
# written.
file 'gene.pmid' do
  begin
    translations = nil
    if $native_id != "Entrez Gene ID"
      translations = Entrez.entrez2native(*$entrez2native.values_at(:tax, :native, :fix, :check))
    end

    data = Entrez.entrez2pubmed($entrez2native[:tax])

    chunks = data.collect { |code, pmids|
      if translations
        next unless translations[code] # no native id for this Entrez gene
        code = translations[code].first
      end
      pmids.collect { |pmid| "#{ code }\t#{pmid}" }.join("\n")
    }.compact

    Open.write('gene.pmid', chunks.join("\n"))
  rescue Entrez::NoFile
    puts "Gene article associations from entrez not produced, install the gene2pubmed file (rbbt_config install entrez)."
  end
end
245
+
246
+
247
+
248
+
249
# Files produced by the file tasks above, in the order 'all' lists them.
generated_files = ['name', 'lexicon', 'identifiers', 'gene_go.pmid', 'gene.pmid', 'gene.go', 'all.pmid']

# Builds every generated file.
task 'all' => generated_files.dup

# Removes every generated file from the current directory; missing files are
# ignored. FileUtils is used instead of shelling out to `rm -f`, so the task
# does not depend on a Unix shell.
task 'clean' do
  FileUtils.rm_f(generated_files)
end

# Rebuilds the files; everything is removed first when $force is set.
task 'update' do
  Rake::Task['clean'].invoke if $force
  Rake::Task['all'].invoke
end
258
+
@@ -0,0 +1,88 @@
1
+ require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
2
+
3
+ $name = "Rattus norvegicus"
4
+
5
+
6
+ $native_id = "RGD DB ID"
7
+
8
+ $entrez2native = {
9
+ :tax => 10116,
10
+ :check => proc{|code| code.match(/^RGD/)},
11
+ }
12
+
13
+ $lexicon = {
14
+ :file => {
15
+ :url => "ftp://rgd.mcw.edu/pub/data_release/gene_association.rgd.gz",
16
+ :native => 1,
17
+ :extra => [2,9],
18
+ :exclude => proc{|l| !l.match(/^RGD/)}
19
+ },
20
+ }
21
+
22
+ $identifiers = {
23
+ :file => {
24
+ :url => "ftp://rgd.mcw.edu/pub/data_release/gene_association.rgd.gz",
25
+ :native => 1,
26
+ :extra => [],
27
+ :exclude => proc{|l| !l.match(/^RGD/)}
28
+ },
29
+ :biomart => {
30
+ :database => 'rnorvegicus_gene_ensembl',
31
+ :main => ['Entrez Gene ID' , "entrezgene"],
32
+ :extra => [
33
+ ['Associated Gene Name' , "external_gene_id"],
34
+ ['Protein ID' , "protein_id"] ,
35
+ ['UniProt/SwissProt ID' , "uniprot_swissprot"] ,
36
+ ['UniProt/SwissProt Accession' , "uniprot_swissprot_accession"] ,
37
+ ['RefSeq Protein ID' , "refseq_peptide"] ,
38
+ ['EMBL (Genbank) ID' , "embl"] ,
39
+
40
+ ['Affy rae230a', "affy_rae230a"],
41
+ ['Affy rae230b', "affy_rae230b"],
42
+ ['Affy RaGene', "affy_ragene_1_0_st_v1"],
43
+ ['Affy rat230 2', "affy_rat230_2"],
44
+ ['Affy RaEx', "affy_raex_1_0_st_v1"],
45
+ ['Affy rg u34a', "affy_rg_u34a"],
46
+ ['Affy rg u34b', "affy_rg_u34b"],
47
+ ['Affy rg u34c', "affy_rg_u34c"],
48
+ ['Affy rn u34', "affy_rn_u34"],
49
+ ['Affy rt u34', "affy_rt_u34"],
50
+ ['Agilent WholeGenome',"agilent_wholegenome" ],
51
+ ['Codelink ID ', "codelink"],
52
+
53
+
54
+ ],
55
+ :filter => [],
56
+ }
57
+ }
58
+
59
+ $go = {
60
+ :url => "ftp://rgd.mcw.edu/pub/data_release/gene_association.rgd.gz",
61
+ :exclude => proc{|l| !l.match(/^RGD/)},
62
+ :code => 1,
63
+ :go => 4,
64
+ :pmid => 5,
65
+ }
66
+
67
# PubMed query used to collect the rat literature. The species clause had been
# copied verbatim from the mouse configuration ("mice" / mouse); it now selects
# rat articles, matching $name ("Rattus norvegicus").
$query = '(("rats"[TIAB] NOT Medline[SB]) OR "rats"[MeSH Terms] OR rat[Text Word]) AND ((("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word]) OR (("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word])) AND hasabstract[text] AND English[lang]'
68
+
69
+ #{{{ Redefines
70
+
71
# Wraps Open.read so that, for the RGD gene association file, every line that
# starts with "RGD\t" gets "RGD:" inserted right after that first column
# (presumably so the ids read from column 1 carry the "RGD:" prefix that the
# /^RGD/ checks in this configuration rely on - confirm). All other URLs are
# returned untouched.
module Open
  class << self
    # Keep the original reader. The guard makes loading this file twice safe:
    # re-aliasing would point old_read at the wrapper and recurse forever.
    alias_method :old_read, :read unless method_defined?(:old_read)

    # url     - location handed to the original Open.read
    # options - passed through unchanged
    #
    # Returns what the original read returns, or, for the RGD association
    # file, its lines (patched as described above) joined with "\n".
    def read(url, options = {})
      data = old_read(url, options)
      return data unless url =~ /gene_association\.rgd\.gz/

      # String#collect only exists on Ruby 1.8; iterate lines explicitly and
      # drop each terminator so that joining does not produce blank lines.
      lines = data.respond_to?(:each_line) ? data.each_line : data
      lines.collect { |l| l.chomp.gsub(/^RGD\t/, "RGD\tRGD:") }.join("\n")
    end
  end
end
88
+
@@ -0,0 +1,66 @@
1
+ require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
2
+
3
+ $name = "Saccharomyces cerevisiae"
4
+
5
+
6
+ $native_id = "SGD DB Id"
7
+
8
+ $entrez2native = {
9
+ :tax => 4932,
10
+ :fix => proc{|code| code.sub(/SGD:S0/,'S0') },
11
+ :check => proc{|code| code.match(/^S0/)},
12
+ }
13
+
14
+ $lexicon = {
15
+ :file => {
16
+ :url => "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/chromosomal_feature/SGD_features.tab",
17
+ :native => 0,
18
+ :extra => [4,3,5]
19
+ },
20
+ :biomart => {
21
+ :database => 'scerevisiae_gene_ensembl',
22
+ :main => ['Entrez Gene ID', 'entrezgene'],
23
+ :extra => [
24
+ ['Interpro Description' , "interpro_description"],
25
+ ],
26
+ :filter => [],
27
+ }
28
+
29
+ }
30
+
31
+ $identifiers = {
32
+ :file => {
33
+ :url => "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/chromosomal_feature/SGD_features.tab",
34
+ :native => 0,
35
+ :extra => [],
36
+ },
37
+ :biomart => {
38
+ :database => 'scerevisiae_gene_ensembl',
39
+ :main => ['Entrez Gene ID', 'entrezgene'],
40
+ :extra => [
41
+ ['Associated Gene Name' , "external_gene_id"],
42
+ ['Ensembl Gene ID', "ensembl_gene_id" ],
43
+ ['Ensembl Protein ID', "ensembl_peptide_id" ],
44
+ ['RefSeq Protein ID' , "refseq_peptide"] ,
45
+ ['UniProt/SwissProt ID' , "uniprot_swissprot"] ,
46
+ ['UniProt/SwissProt Accession' , "uniprot_swissprot_accession"] ,
47
+ ['Protein ID' , "protein_id"] ,
48
+ ['EMBL (Genbank) ID' , "embl"] ,
49
+ # Affymetrix
50
+ ['Affy yeast 2',"affy_yeast_2"],
51
+ ['Affy yg s98', "affy_yg_s98"],
52
+ ],
53
+ :filter => [],
54
+ }
55
+ }
56
+
57
+ $go = {
58
+ :url => "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/literature_curation/gene_association.sgd.gz",
59
+ :code => 1,
60
+ :go => 4,
61
+ :pmid => 5,
62
+ }
63
+
64
+ $query = '"saccharomyces cerevisiae"[All Fields] AND ((("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word]) OR (("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word])) AND hasabstract[text] AND English[lang]'
65
+
66
+