rbbt 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. data/LICENSE +20 -0
  2. data/README.rdoc +17 -0
  3. data/bin/rbbt_config +180 -0
  4. data/install_scripts/classifier/R/classify.R +36 -0
  5. data/install_scripts/classifier/Rakefile +140 -0
  6. data/install_scripts/get_abner.sh +2 -0
  7. data/install_scripts/get_banner.sh +25 -0
  8. data/install_scripts/get_biocreative.sh +72 -0
  9. data/install_scripts/get_crf++.sh +26 -0
  10. data/install_scripts/get_entrez.sh +4 -0
  11. data/install_scripts/get_go.sh +4 -0
  12. data/install_scripts/get_polysearch.sh +8 -0
  13. data/install_scripts/ner/Rakefile +206 -0
  14. data/install_scripts/ner/config/default.rb +52 -0
  15. data/install_scripts/norm/Rakefile +218 -0
  16. data/install_scripts/norm/config/cue_default.rb +10 -0
  17. data/install_scripts/norm/config/tokens_default.rb +79 -0
  18. data/install_scripts/norm/functions.sh +21 -0
  19. data/install_scripts/organisms/Rakefile +25 -0
  20. data/install_scripts/organisms/cgd.Rakefile +84 -0
  21. data/install_scripts/organisms/human.Rakefile +145 -0
  22. data/install_scripts/organisms/mgi.Rakefile +77 -0
  23. data/install_scripts/organisms/pombe.Rakefile +40 -0
  24. data/install_scripts/organisms/rake-include.rb +258 -0
  25. data/install_scripts/organisms/rgd.Rakefile +88 -0
  26. data/install_scripts/organisms/sgd.Rakefile +66 -0
  27. data/install_scripts/organisms/tair.Rakefile +54 -0
  28. data/install_scripts/organisms/worm.Rakefile +109 -0
  29. data/install_scripts/stopwords +1 -0
  30. data/install_scripts/wordlists/consonants +897 -0
  31. data/install_scripts/wordlists/stopwords +1 -0
  32. data/lib/rbbt/bow/bow.rb +87 -0
  33. data/lib/rbbt/bow/classifier.rb +118 -0
  34. data/lib/rbbt/bow/dictionary.rb +218 -0
  35. data/lib/rbbt/ner/abner.rb +34 -0
  36. data/lib/rbbt/ner/banner.rb +73 -0
  37. data/lib/rbbt/ner/regexpNER.rb +62 -0
  38. data/lib/rbbt/ner/rner.rb +227 -0
  39. data/lib/rbbt/ner/rnorm/cue_index.rb +80 -0
  40. data/lib/rbbt/ner/rnorm/tokens.rb +213 -0
  41. data/lib/rbbt/ner/rnorm.rb +142 -0
  42. data/lib/rbbt/sources/biocreative.rb +75 -0
  43. data/lib/rbbt/sources/biomart.rb +106 -0
  44. data/lib/rbbt/sources/entrez.rb +211 -0
  45. data/lib/rbbt/sources/go.rb +40 -0
  46. data/lib/rbbt/sources/organism.rb +197 -0
  47. data/lib/rbbt/sources/polysearch.rb +88 -0
  48. data/lib/rbbt/sources/pubmed.rb +111 -0
  49. data/lib/rbbt/util/arrayHash.rb +255 -0
  50. data/lib/rbbt/util/filecache.rb +72 -0
  51. data/lib/rbbt/util/index.rb +69 -0
  52. data/lib/rbbt/util/misc.rb +101 -0
  53. data/lib/rbbt/util/open.rb +207 -0
  54. data/lib/rbbt/util/simpleDSL.rb +87 -0
  55. data/lib/rbbt/util/tmpfile.rb +19 -0
  56. data/lib/rbbt/version.rb +10 -0
  57. data/lib/rbbt.rb +86 -0
  58. data/tasks/install.rake +123 -0
  59. metadata +114 -0
@@ -0,0 +1,145 @@
1
+ require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
2
+
3
# Scientific name of the organism.
$name = "Homo sapiens"

# Identifier format used as the primary key of the generated files.
$native_id = "Entrez Gene ID"

# Settings handed to Entrez.entrez2native by the shared tasks.
$entrez2native = {
  :tax   => 9606,
  :fix   => nil,
  :check => proc { |code| false }, # rejects every code
}

# BioMart attributes used as gene names: [display name, attribute] pairs.
human_name_fields = [
  ['Associated Gene Name',     "external_gene_id"],
  ['HGNC symbol',              "hgnc_symbol"],
  ['HGNC automatic gene name', "hgnc_automatic_gene_name"],
  ['HGNC curated gene name ',  "hgnc_curated_gene_name"],
]

$lexicon = {
  :biomart => {
    :database => 'hsapiens_gene_ensembl',
    :main     => ['Entrez Gene ID', "entrezgene"],
    :extra    => human_name_fields,
  },
}

# Gene and protein identifier formats. An optional third element is a
# prefix that the 'identifiers' task prepends to each value ("HGNC:...").
human_gene_fields = [
  ['Ensembl Gene ID',             "ensembl_gene_id"],
  ['Ensembl Protein ID',          "ensembl_peptide_id"],
  ['Associated Gene Name',        "external_gene_id"],
  ['CCDS ID',                     "ccds"],
  ['Protein ID',                  "protein_id"],
  ['RefSeq Protein ID',           "refseq_peptide"],
  ['Unigene ID',                  "unigene"],
  ['UniProt/SwissProt ID',        "uniprot_swissprot"],
  ['UniProt/SwissProt Accession', "uniprot_swissprot_accession"],
  ['HGNC ID',                     "hgnc_id", 'HGNC'],
  ['EMBL (Genbank) ID',           "embl"],
]

# Microarray probe identifier formats (Affymetrix and others).
human_probe_fields = [
  ['AFFY HC G110',          'affy_hc_g110'],
  ['AFFY HG FOCUS',         'affy_hg_focus'],
  ['AFFY HG U133-PLUS-2',   'affy_hg_u133_plus_2'],
  ['AFFY HG U133A_2',       'affy_hg_u133a_2'],
  ['AFFY HG U133A',         'affy_hg_u133a'],
  ['AFFY HG U133B',         'affy_hg_u133b'],
  ['AFFY HG U95AV2',        'affy_hg_u95av2'],
  ['AFFY HG U95B',          'affy_hg_u95b'],
  ['AFFY HG U95C',          'affy_hg_u95c'],
  ['AFFY HG U95D',          'affy_hg_u95d'],
  ['AFFY HG U95E',          'affy_hg_u95e'],
  ['AFFY HG U95A',          'affy_hg_u95a'],
  ['AFFY HUGENEFL',         'affy_hugenefl'],
  ['AFFY HuEx',             'affy_huex_1_0_st_v2'],
  ['AFFY HuGene',           'affy_hugene_1_0_st_v1'],
  ['AFFY U133 X3P',         'affy_u133_x3p'],
  ['Agilent WholeGenome',   "agilent_wholegenome"],
  ['Agilent CGH 44b',       'agilent_cgh_44b'],
  ['Codelink ID',           'codelink'],
  ['Illumina HumanWG 6 v2', 'illumina_humanwg_6_v2'],
  ['Illumina HumanWG 6 v3', 'illumina_humanwg_6_v3'],
]

$identifiers = {
  :biomart => {
    :database => 'hsapiens_gene_ensembl',
    :main     => ['Entrez Gene ID', "entrezgene"],
    :extra    => human_gene_fields + human_probe_fields,
    :filter   => [],
  },
}

# Gene Ontology association file and the columns holding the gene code,
# the GO term and the PubMed reference.
$go = {
  :url  => "http://cvsweb.geneontology.org/cgi-bin/cvsweb.cgi/go/gene-associations/gene_association.goa_human.gz?rev=HEAD",
  :code => 2,
  :go   => 4,
  :pmid => 5,
}

# PubMed query passed to PubMed.query by the 'all.pmid' task.
$query = '"humans"[MeSH Terms] AND ((("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word]) OR (("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word])) AND (hasabstract[text] AND "humans"[MeSH Terms] AND English[lang])'
81
+ ##########################
82
+
83
+ require 'rbbt/util/index'
84
+
85
# Human-specific replacement for the generic 'gene.go' task. Each code read
# from the GO association file is translated through an index built by
# Organism.id_index (with 'Associated Gene Name' as the other format);
# codes missing from the index are dropped. Output is one
# "<native id>\t<GO term>" line per distinct association.
Rake::Task['gene.go'].clear
file 'gene.go' => ['identifiers'] do
  # File.exist? is used because File.exists? was removed in Ruby 3.2.
  if File.exist? 'identifiers'
    require 'rbbt/sources/organism'
    index = Organism.id_index('human', :other => ['Associated Gene Name'])
    data  = Open.to_hash($go[:url], :native => $go[:code], :extra => $go[:go], :exclude => $go[:exclude])

    # Keep only values that contain a GO term, reduced to the bare term.
    associations = data.collect { |code, value_lists|
      terms = value_lists.flatten.select { |ref| ref =~ /GO:\d+/ }.collect { |ref| ref.match(/(GO:\d+)/)[1] }
      [index[code], terms]
    }.select { |pair| pair[0] && pair[1].any? }

    lines = associations.collect { |pair|
      pair[1].uniq.collect { |go| "#{pair[0]}\t#{go}" }.join("\n")
    }

    Open.write('gene.go', lines.join("\n"))
  end
end
106
+
107
# Human-specific replacement for the generic 'gene_go.pmid' task. Codes read
# from the GO association file are translated through an index built from
# the local 'identifiers' file; codes missing from the index are dropped.
# Output is one "<native id>\t<pmid>" line per distinct association.
Rake::Task['gene_go.pmid'].clear
file 'gene_go.pmid' => ['identifiers'] do
  # File.exist? is used because File.exists? was removed in Ruby 3.2.
  if File.exist? 'identifiers'
    index = Index.index('identifiers')
    data  = Open.to_hash($go[:url], :native => $go[:code], :extra => $go[:pmid], :exclude => $go[:exclude])

    # Keep only PubMed references, reduced to the numeric id.
    associations = data.collect { |code, value_lists|
      pmids = value_lists.flatten.select { |ref| ref =~ /PMID:\d+/ }.collect { |ref| ref.match(/PMID:(\d+)/)[1] }
      [index[code], pmids]
    }.select { |pair| pair[0] && pair[1].any? }

    lines = associations.collect { |pair|
      pair[1].uniq.collect { |pmid| "#{pair[0]}\t#{pmid}" }.join("\n")
    }

    Open.write('gene_go.pmid', lines.join("\n"))
  end
end
127
+
128
+
129
# Human-specific replacement for the generic 'lexicon' task. Gene names are
# downloaded from the HGNC site and keyed by HGNC ID; each entry is written
# as "<Entrez Gene ID>\t<name>\t<name>..." when the HGNC ID can be
# translated, and skipped otherwise.
Rake::Task['lexicon'].clear
file 'lexicon' => ['identifiers'] do
  # File.exist? is used because File.exists? was removed in Ruby 3.2.
  if File.exist? 'identifiers'
    require 'rbbt/sources/organism'
    HGNC_URL = 'http://www.genenames.org/cgi-bin/hgnc_downloads.cgi?title=HGNC+output+data&hgnc_dbtag=on&col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_prev_name&col=gd_aliases&col=gd_name_aliases&col=gd_pub_acc_ids&status=Approved&status_opt=2&level=pri&=on&where=&order_by=gd_app_sym_sort&limit=&format=text&submit=submit&.cgifields=&.cgifields=level&.cgifields=chr&.cgifields=status&.cgifields=hgnc_dbtag'
    # The exclude proc drops the header line of the download.
    names        = Open.to_hash(HGNC_URL, :exclude => proc { |l| l.match(/^HGNC ID/) }, :flatten => true)
    translations = Organism.id_index('human', :native => 'Entrez Gene ID', :other => ['HGNC ID'])

    # The block parameter is called `synonyms` so that it does not shadow
    # (or, on Ruby 1.8, overwrite) the outer `names` being iterated.
    Open.write('lexicon',
      names.collect { |code, synonyms|
        next unless translations[code]
        ([translations[code]] + synonyms).join("\t")
      }.compact.join("\n")
    )
  end
end
@@ -0,0 +1,77 @@
1
+ require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
2
+
3
# Scientific name of the organism.
$name = "Mus musculus"

# Identifier format used as the primary key of the generated files.
$native_id = "MGI DB ID"

# Settings handed to Entrez.entrez2native by the shared tasks; only codes
# starting with "MGI" pass the check.
$entrez2native = {
  :tax   => 10090,
  :fix   => nil,
  :check => proc { |code| code.match(/^MGI/) },
}

# Gene names come from the MGI coordinate report; lines whose second
# tab-separated column is not "Gene" are excluded.
$lexicon = {
  :file => {
    :url     => "ftp://ftp.informatics.jax.org/pub/reports/MGI_Coordinate.rpt",
    :native  => 0,
    :extra   => [2, 3],
    :exclude => proc { |line| line.split(/\t/)[1] != "Gene" },
  },
}

# Gene and protein identifier formats: [display name, BioMart attribute].
mouse_gene_fields = [
  ['Associated Gene Name',        "external_gene_id"],
  ['Protein ID',                  "protein_id"],
  ['UniProt/SwissProt ID',        "uniprot_swissprot"],
  ['Unigene ID',                  "unigene"],
  ['UniProt/SwissProt Accession', "uniprot_swissprot_accession"],
  ['RefSeq Protein ID',           "refseq_peptide"],
  ['EMBL (Genbank) ID',           "embl"],
]

# Microarray probe identifier formats.
mouse_probe_fields = [
  ['Affy mg u74a',          "affy_mg_u74a"],
  ['Affy mg u74av2',        "affy_mg_u74av2"],
  ['Affy mg u74b',          "affy_mg_u74b"],
  ['Affy mg u74bv2',        "affy_mg_u74bv2"],
  ['Affy mg u74c',          "affy_mg_u74c"],
  ['Affy mg u74cv2',        "affy_mg_u74cv2"],
  ['Affy moe430a',          "affy_moe430a"],
  ['Affy moe430b',          "affy_moe430b"],
  ['AFFY MoEx',             "affy_moex_1_0_st_v1"],
  ['AFFY MoGene',           "affy_mogene_1_0_st_v1"],
  ['Affy mouse430 2',       "affy_mouse430_2"],
  ['Affy mouse430a 2',      "affy_mouse430a_2"],
  ['Affy mu11ksuba',        "affy_mu11ksuba"],
  ['Affy mu11ksubb',        "affy_mu11ksubb"],
  ['Agilent WholeGenome',   "agilent_wholegenome"],
  ['Codelink ID',           "codelink"],
  ['Illumina MouseWG 6 v1', "illumina_mousewg_6_v1"],
  ['Illumina MouseWG 6 v2', "illumina_mousewg_6_v2"],
]

$identifiers = {
  :file => {
    :url     => "ftp://ftp.informatics.jax.org/pub/reports/MGI_Coordinate.rpt",
    :native  => 0,
    :extra   => [],
    :exclude => proc { |line| line.split(/\t/)[1] != "Gene" },
  },
  :biomart => {
    :database => 'mmusculus_gene_ensembl',
    :main     => ['MGI DB ID', 'mgi_id'],
    :extra    => mouse_gene_fields + mouse_probe_fields,
    # This is needed as the filter is not with_mgi_id as was expected
    :filter   => ['with_mgi'],
  },
}

# Gene Ontology association file and the columns holding the gene code,
# the GO term and the PubMed reference.
$go = {
  :url  => "ftp://ftp.geneontology.org/go/gene-associations/gene_association.mgi.gz",
  :code => 1,
  :go   => 4,
  :pmid => 5,
}

# PubMed query passed to PubMed.query by the 'all.pmid' task.
$query = '(("mice"[TIAB] NOT Medline[SB]) OR "mice"[MeSH Terms] OR mouse[Text Word]) AND ((("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word]) OR (("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word]))'
74
+ ##########################
75
+
76
+
77
+
@@ -0,0 +1,40 @@
1
+ require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
2
+
3
# Scientific name of the organism.
$name = "Schizosaccharomyces pombe"

# Identifier format used as the primary key of the generated files.
$native_id = "GeneDB Id"

# Settings handed to Entrez.entrez2native by the shared tasks: the fix proc
# strips the "GeneDB:" prefix and the check proc accepts codes starting
# with "SP".
$entrez2native = {
  :tax   => 4896,
  :fix   => proc { |code| code.sub(/GeneDB:SP/, 'SP') },
  :check => proc { |code| code.match(/^SP/) },
}

# Gene names: column 0 is the native code, columns 1 to 8 are names.
$lexicon = {
  :file => {
    :url    => 'ftp://ftp.sanger.ac.uk/pub/yeast/pombe/Mappings/allNames.txt',
    :native => 0,
    :extra  => [1, 2, 3, 4, 5, 6, 7, 8],
  },
}

# Identifiers: only the native code column is read from the file.
$identifiers = {
  :file => {
    :url    => 'ftp://ftp.sanger.ac.uk/pub/yeast/pombe/Mappings/allNames.txt',
    :native => 0,
    :extra  => [],
  },
}

# Gene Ontology association file and the columns holding the gene code,
# the GO term and the PubMed reference.
$go = {
  :url  => "ftp://ftp.sanger.ac.uk/pub/yeast/pombe/Gene_ontology/gene_association.GeneDB_Spombe",
  :code => 1,
  :go   => 4,
  :pmid => 5,
}

# PubMed query passed to PubMed.query by the 'all.pmid' task.
$query = 'pombe[All Fields] AND (hasabstract[text] AND English[lang])'
38
+ ####
39
+
40
+
@@ -0,0 +1,258 @@
1
+ require 'rbbt'
2
+ require 'rbbt/util/open'
3
+ require 'rbbt/util/arrayHash'
4
+ require 'rbbt/sources/biomart'
5
+ require 'rbbt/sources/entrez'
6
+ require 'rbbt/sources/pubmed'
7
+
8
+
9
+
10
# Writes the organism's name ($name, set by the organism Rakefile) to the
# file 'name'.
file('name') { Open.write('name', $name) }
13
+
14
# Runs the organism's PubMed query ($query) and writes the resulting ids to
# 'all.pmid', one per line.
file 'all.pmid' do
  pmids = PubMed.query($query)
  Open.write('all.pmid', pmids.join("\n"))
end
17
+
18
# Builds the 'lexicon' file: one line per gene with the native code followed
# by its names, tab separated. Names are gathered, in this order, from the
# organism file ($lexicon[:file]), the Entrez translations, BioMart
# ($lexicon[:biomart]) and the Entrez gene aliases. If Entrez raises
# Entrez::NoFile a message is printed and nothing is written.
file 'lexicon' do
  begin
    data = nil

    # Read from the organism specific file
    if $lexicon[:file]
      table = Open.to_hash($lexicon[:file][:url], $lexicon[:file])
      data  = ArrayHash.new(table, $native_id)
    end

    # Translate from entrez to native if needed
    if $entrez2native
      translations = {}
      native_codes = Entrez.entrez2native(*$entrez2native.values_at(:tax, :native, :fix, :check))
      native_codes.each do |entrez, natives|
        translations[entrez] = [natives.join("|")]
      end

      translations_data = ArrayHash.new(translations, 'Entrez Gene ID', [$native_id])
      if data.nil?
        data = translations_data
      else
        data.merge(translations_data)
      end
    end

    # Read from Biomart and merge with previous data
    if $lexicon[:biomart]
      mart    = $lexicon[:biomart]
      biomart = {}

      results = BioMart.query(
        mart[:database],
        mart[:main][1],
        mart[:extra].collect { |v| v[1] },
        mart[:filter]
      )
      results.each do |key, values_list|
        lists = values_list.values_at(*mart[:extra].collect { |v| v[1] }).compact
        biomart[key] = lists.collect { |list| list.select { |e| e.to_s != "" }.uniq.join("|") }
      end

      biomart_data = ArrayHash.new(biomart, mart[:main][0], mart[:extra].collect { |v| v[0] })

      if data.nil?
        data = biomart_data
      else
        # NOTE(review): the first test compares BioMart attribute names
        # (v[1]) with $native_id, which the organism files set to a display
        # name; confirm whether v[0] was intended.
        shares_native = mart[:extra].collect { |v| v[1] }.include?($native_id) || mart[:main][0] == $native_id
        field = shares_native ? $native_id : 'Entrez Gene ID'
        data.merge(biomart_data, field)
      end
    end

    # Add the Entrez gene aliases
    if $entrez2native
      gene_alias = {}
      Entrez.entrez2native($entrez2native[:tax], 4).each do |entrez, aliases|
        gene_alias[entrez] = [aliases.select { |e| e.to_s != "" }.join("|")]
      end

      if gene_alias.keys.any?
        gene_alias_data = ArrayHash.new(gene_alias, 'Entrez Gene ID', ['Entrez Gene Alias'])
        data.merge(gene_alias_data, 'Entrez Gene ID')
      end
    end

    # The Entrez id was only needed to join the sources
    data.remove('Entrez Gene ID')
    data.clean

    lines = data.data.collect do |code, name_lists|
      names = name_lists.flatten.select { |n| n.to_s != "" }.uniq
      "#{ code }\t" + names.join("\t")
    end
    Open.write('lexicon', lines.join("\n"))

  rescue Entrez::NoFile
    puts "Lexicon not produced for #{$name}, install the entrez gene_info file (rbbt_config install entrez)."
  end
end
95
+
96
+
97
# Builds the 'identifiers' file: a header line "#<native id>\t<fields...>"
# followed by one tab separated line per gene. Data is gathered, in this
# order, from the organism file ($identifiers[:file]), the Entrez
# translations, BioMart ($identifiers[:biomart]) and the Entrez gene
# aliases. If Entrez raises Entrez::NoFile a message is printed instead.
#
# The whole content is rendered before the output file is opened, and the
# file is written through the block form of File.open. A failure while
# gathering data therefore no longer leaves a truncated 'identifiers' file
# behind (which Rake would treat as already built), and the handle is
# always closed.
file 'identifiers' do

  begin
    data = nil
    if $identifiers[:file]
      table = Open.to_hash($identifiers[:file][:url], $identifiers[:file])
      data  = ArrayHash.new(table, $native_id, $identifiers[:file][:fields])
    end

    # Translate from entrez to native if needed
    if $entrez2native
      translations = {}
      Entrez.entrez2native(*$entrez2native.values_at(:tax,:native,:fix,:check)).
        each{|k,v|
          translations[k] = [v.join("|")]
        }
      if translations.keys.any?
        translations_data = ArrayHash.new(translations,'Entrez Gene ID', [$native_id])
        if data
          data.merge(translations_data)
        else
          data = translations_data
        end
      end
    end

    # Read from Biomart and merge with previous data
    if $identifiers[:biomart]
      biomart = {}

      BioMart.query(
        $identifiers[:biomart][:database],
        $identifiers[:biomart][:main][1],
        $identifiers[:biomart][:extra].collect{|v| v[1]},
        $identifiers[:biomart][:filter]
      ).each{|key, values_list|
        values = values_list.values_at(*$identifiers[:biomart][:extra].collect{|v| v[1]}).compact.collect{|list| list.select{|e| e.to_s != ""}.uniq.join("|")}
        biomart[key] = values
      }

      biomart_data = ArrayHash.new(biomart, $identifiers[:biomart][:main][0], $identifiers[:biomart][:extra].collect{|v| v[0]})

      # Fields declared with a third element get it prepended as a prefix
      $identifiers[:biomart][:extra].each{|values|
        if values[2]
          biomart_data.process(values[0]){|n| "#{values[2]}:#{n}"}
        end
      }

      if data
        if $identifiers[:biomart][:extra].collect{|v| v[1]}.include?( $native_id ) || $identifiers[:biomart][:main][0] == $native_id
          field = $native_id
        else
          field = 'Entrez Gene ID'
        end
        data.merge(biomart_data, field)
      else
        data = biomart_data
      end
    end

    # Add the alias at the end
    if $entrez2native
      gene_alias = {}
      Entrez.entrez2native($entrez2native[:tax],4).
        each{|k,v|
          gene_alias[k] = [v.join("|")]
        }
      if gene_alias.keys.any?
        gene_alias_data = ArrayHash.new(gene_alias,'Entrez Gene ID', ['Entrez Gene Alias'])
        if data
          data.merge(gene_alias_data, 'Entrez Gene ID')
        else
          data = gene_alias_data
        end
      end
    end

    # Render the content; the header is built before data.clean, as before
    header = "##{$native_id}\t" + data.fields.join("\t")
    data.clean
    rows = data.data.collect{|code, values|
      code + "\t" + values.join("\t")
    }

    # Write ids to file
    File.open('identifiers', 'w') do |fout|
      fout.puts header
      rows.each{|row| fout.puts row }
    end

  rescue Entrez::NoFile
    puts "Identifiers not produced for #{$name}, install the entrez gene_info file (rbbt_config install entrez)."
  end
end
192
+
193
+
194
# Builds 'gene.go' from the organism's GO association file ($go): one
# "<code>\t<GO term>" line per distinct association. Codes with no GO term
# are left out.
file 'gene.go' do
  options = {:native => $go[:code], :extra => $go[:go], :exclude => $go[:exclude], :fix => $go[:fix]}
  data    = Open.to_hash($go[:url], options)

  # Reduce every value to its bare GO term, dropping values without one
  annotated = data.collect do |code, value_lists|
    matching = value_lists.flatten.select { |ref| ref =~ /GO:\d+/ }
    [code, matching.collect { |ref| ref.match(/(GO:\d+)/)[1] }]
  end
  annotated = annotated.select { |pair| pair[1].any? }

  lines = annotated.collect do |pair|
    pair[1].uniq.collect { |go| "#{pair[0]}\t#{go}" }.join("\n")
  end

  Open.write('gene.go', lines.join("\n"))
end
209
+
210
# Builds 'gene_go.pmid' from the organism's GO association file ($go): one
# "<code>\t<pmid>" line per distinct PubMed reference. Codes with no
# reference are left out.
file 'gene_go.pmid' do
  options = {:native => $go[:code], :extra => $go[:pmid], :exclude => $go[:exclude], :fix => $go[:fix]}
  data    = Open.to_hash($go[:url], options)

  # Reduce every value to its numeric PubMed id, dropping values without one
  referenced = data.collect do |code, value_lists|
    matching = value_lists.flatten.select { |ref| ref =~ /PMID:\d+/ }
    [code, matching.collect { |ref| ref.match(/PMID:(\d+)/)[1] }]
  end
  referenced = referenced.select { |pair| pair[1].any? }

  lines = referenced.collect do |pair|
    pair[1].uniq.collect { |pmid| "#{pair[0]}\t#{pmid}" }.join("\n")
  end

  Open.write('gene_go.pmid', lines.join("\n"))
end
223
+
224
+
225
# Builds 'gene.pmid' from the Entrez gene/article associations: one
# "<code>\t<pmid>" line per association. When the native format is not
# "Entrez Gene ID", Entrez codes are translated to the first native code and
# genes without a translation are skipped. If Entrez raises Entrez::NoFile a
# message is printed and nothing is written.
file 'gene.pmid' do
  begin
    # translations stays nil when the native format is already Entrez
    translations = Entrez.entrez2native(*$entrez2native.values_at(:tax,:native,:fix,:check)) if $native_id != "Entrez Gene ID"

    data = Entrez.entrez2pubmed($entrez2native[:tax])

    Open.write('gene.pmid',
      data.collect{|code, pmids|
        next if translations && ! translations[code]
        code = translations[code].first if translations
        pmids.collect{|pmid| "#{ code }\t#{pmid}" }.join("\n")
      }.compact.join("\n")
    )
  rescue Entrez::NoFile
    # The message previously misspelled the file name as "gene2pumbed".
    puts "Gene article associations from entrez not produced, install the gene2pubmed file (rbbt_config install entrez)."
  end

end
245
+
246
+
247
+
248
+
249
# 'all' builds every data file of the organism.
task 'all' => ['name', 'lexicon', 'identifiers', 'gene_go.pmid', 'gene.pmid', 'gene.go', 'all.pmid']

# 'clean' removes the generated files. FileUtils.rm_f is used instead of
# shelling out to `rm -f`, so no external command is needed; like `rm -f`
# it ignores files that do not exist.
task 'clean' do
  require 'fileutils'
  FileUtils.rm_f(['name', 'lexicon', 'identifiers', 'gene_go.pmid', 'gene.pmid', 'gene.go', 'all.pmid'])
end

# 'update' rebuilds whatever is missing; when $force is set everything is
# removed first so that all files are regenerated.
task 'update' do
  Rake::Task['clean'].invoke if $force
  Rake::Task['all'].invoke
end
258
+
@@ -0,0 +1,88 @@
1
+ require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
2
+
3
# Scientific name of the organism.
$name = "Rattus norvegicus"

# Identifier format used as the primary key of the generated files.
$native_id = "RGD DB ID"

# Settings handed to Entrez.entrez2native by the shared tasks; only codes
# starting with "RGD" pass the check.
$entrez2native = {
  :tax => 10116,
  :check => proc{|code| code.match(/^RGD/)},
}

# Gene names are read from the RGD association file; lines that do not
# start with "RGD" are excluded.
$lexicon = {
  :file => {
    :url => "ftp://rgd.mcw.edu/pub/data_release/gene_association.rgd.gz",
    :native => 1,
    :extra => [2,9],
    :exclude => proc{|l| !l.match(/^RGD/)}
  },
}

$identifiers = {
  :file => {
    :url => "ftp://rgd.mcw.edu/pub/data_release/gene_association.rgd.gz",
    :native => 1,
    :extra => [],
    :exclude => proc{|l| !l.match(/^RGD/)}
  },
  :biomart => {
    :database => 'rnorvegicus_gene_ensembl',
    :main => ['Entrez Gene ID' , "entrezgene"],
    :extra => [
      ['Associated Gene Name' , "external_gene_id"],
      ['Protein ID' , "protein_id"] ,
      ['UniProt/SwissProt ID' , "uniprot_swissprot"] ,
      ['UniProt/SwissProt Accession' , "uniprot_swissprot_accession"] ,
      ['RefSeq Protein ID' , "refseq_peptide"] ,
      ['EMBL (Genbank) ID' , "embl"] ,

      ['Affy rae230a', "affy_rae230a"],
      ['Affy rae230b', "affy_rae230b"],
      ['Affy RaGene', "affy_ragene_1_0_st_v1"],
      ['Affy rat230 2', "affy_rat230_2"],
      ['Affy RaEx', "affy_raex_1_0_st_v1"],
      ['Affy rg u34a', "affy_rg_u34a"],
      ['Affy rg u34b', "affy_rg_u34b"],
      ['Affy rg u34c', "affy_rg_u34c"],
      ['Affy rn u34', "affy_rn_u34"],
      ['Affy rt u34', "affy_rt_u34"],
      ['Agilent WholeGenome',"agilent_wholegenome" ],
      # The display name had a trailing space ('Codelink ID '); removed so
      # that the field is called 'Codelink ID' like in the other organisms.
      ['Codelink ID', "codelink"],
    ],
    :filter => [],
  }
}

# Gene Ontology association file and the columns holding the gene code,
# the GO term and the PubMed reference.
$go = {
  :url => "ftp://rgd.mcw.edu/pub/data_release/gene_association.rgd.gz",
  :exclude => proc{|l| !l.match(/^RGD/)},
  :code => 1,
  :go => 4,
  :pmid => 5,
}

# PubMed query passed to PubMed.query by the 'all.pmid' task. The organism
# clause previously searched for mice/mouse; it now searches for rats.
$query = '(("rats"[TIAB] NOT Medline[SB]) OR "rats"[MeSH Terms] OR rat[Text Word]) AND ((("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word]) OR (("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word])) AND hasabstract[text] AND English[lang]'
68
+
69
+ #{{{ Redefines
70
+
71
# Wraps Open.read so that, for the RGD association file, every line starting
# with "RGD\t" gets the "RGD:" prefix added to its second column. Any other
# url is returned untouched.
module Open

  class << self
    # Keep the first alias: if this file is loaded twice, aliasing again
    # would make old_read point at the override below and recurse forever.
    alias_method :old_read, :read unless method_defined?(:old_read)

    # url     - location handed to the original Open.read
    # options - passed through unchanged
    #
    # Returns what the original read returns; for the RGD association file
    # the content is returned as a single string with the prefix added.
    def read(url, options = {})
      data = old_read(url, options)

      # The dots are escaped so that only the literal file name matches
      return data unless url =~ /gene_association\.rgd\.gz/

      # String#collect does not exist on Ruby 1.9 and later, so the content
      # is handled as a whole: `^` matches at the start of every line. A
      # list of lines is joined first.
      data = data.join("\n") if data.respond_to?(:join)
      data.gsub(/^RGD\t/, "RGD\tRGD:")
    end
  end
end
88
+
@@ -0,0 +1,66 @@
1
+ require __FILE__.sub(/[^\/]*$/,'') + '../rake-include'
2
+
3
# Scientific name of the organism.
$name = "Saccharomyces cerevisiae"

# Identifier format used as the primary key of the generated files.
$native_id = "SGD DB Id"

# Settings handed to Entrez.entrez2native by the shared tasks: the fix proc
# strips the "SGD:" prefix and the check proc accepts codes starting with
# "S0".
$entrez2native = {
  :tax   => 4932,
  :fix   => proc { |code| code.sub(/SGD:S0/, 'S0') },
  :check => proc { |code| code.match(/^S0/) },
}

# Gene names come from the SGD features file plus one BioMart attribute.
$lexicon = {
  :file => {
    :url    => "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/chromosomal_feature/SGD_features.tab",
    :native => 0,
    :extra  => [4, 3, 5],
  },
  :biomart => {
    :database => 'scerevisiae_gene_ensembl',
    :main     => ['Entrez Gene ID', 'entrezgene'],
    :extra    => [['Interpro Description', "interpro_description"]],
    :filter   => [],
  },
}

# Gene and protein identifier formats: [display name, BioMart attribute].
yeast_gene_fields = [
  ['Associated Gene Name',        "external_gene_id"],
  ['Ensembl Gene ID',             "ensembl_gene_id"],
  ['Ensembl Protein ID',          "ensembl_peptide_id"],
  ['RefSeq Protein ID',           "refseq_peptide"],
  ['UniProt/SwissProt ID',        "uniprot_swissprot"],
  ['UniProt/SwissProt Accession', "uniprot_swissprot_accession"],
  ['Protein ID',                  "protein_id"],
  ['EMBL (Genbank) ID',           "embl"],
]

# Affymetrix probe identifier formats.
yeast_probe_fields = [
  ['Affy yeast 2', "affy_yeast_2"],
  ['Affy yg s98',  "affy_yg_s98"],
]

$identifiers = {
  :file => {
    :url    => "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/chromosomal_feature/SGD_features.tab",
    :native => 0,
    :extra  => [],
  },
  :biomart => {
    :database => 'scerevisiae_gene_ensembl',
    :main     => ['Entrez Gene ID', 'entrezgene'],
    :extra    => yeast_gene_fields + yeast_probe_fields,
    :filter   => [],
  },
}

# Gene Ontology association file and the columns holding the gene code,
# the GO term and the PubMed reference.
$go = {
  :url  => "ftp://genome-ftp.stanford.edu/pub/yeast/data_download/literature_curation/gene_association.sgd.gz",
  :code => 1,
  :go   => 4,
  :pmid => 5,
}

# PubMed query passed to PubMed.query by the 'all.pmid' task.
$query = '"saccharomyces cerevisiae"[All Fields] AND ((("proteins"[TIAB] NOT Medline[SB]) OR "proteins"[MeSH Terms] OR protein[Text Word]) OR (("genes"[TIAB] NOT Medline[SB]) OR "genes"[MeSH Terms] OR gene[Text Word])) AND hasabstract[text] AND English[lang]'
65
+
66
+