rbbt-sources 3.3.0 → 3.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b10dbe140b4c0733476823e5f5d94e57a3d9a755fc370f6b9640d1e7b8efc368
4
- data.tar.gz: 38aaf56670a07537ad0ef0c025d17e655fc5d7fb87d97ee1c08d0af82c44fbbd
3
+ metadata.gz: 3de2796d78be0d34330313646a9885e147eafff0358471450bfe4f2120358aa0
4
+ data.tar.gz: 54c04d6c10cf6a5e9a442b5151c89951f644154c2144f9bbcd36cfbc7ab939a9
5
5
  SHA512:
6
- metadata.gz: a8ac9df1da30fc7aec3c54a5a200a0c7a9629807b9238089a1e8064e78b0ecd5bad36c4b6a77fac7e7cfdf332ad56be06149b12d0e0fd7f6506b0b82d2e03bcf
7
- data.tar.gz: acff50e8bdb0d4443c3e1dbd237539953206b7d5dcb886db64ec0677f7bba43cf3a9782e4147985a9b3fc1b34df692e89fb2b7185f2aa0f93ccd196a4d19d54a
6
+ metadata.gz: 489a161942fbd6ab46217446c321ccd7d2e72f1e0484f87f1adecb2291fde8ccdf51f7829a48ebf814805096bdc8e4c50d8ce6e33ec5749385b9def84f638198
7
+ data.tar.gz: 6b517de298e5b72667a6a08cda86662c4dbae379215806689a126bb0ab34b7c2d0cb74d63c43e3a23106a438bf3e632484e0416d1ed79741e589a53d503868f0
@@ -1,8 +1,6 @@
1
1
  may2009
2
2
  feb2014
3
- may2017
4
- oct2018
5
- apr2019
3
+ may2015
4
+ sep2019
6
5
  feb2021
7
6
  feb2023
8
- oct2016
@@ -19,6 +19,8 @@
19
19
  - refseq_ncrna_predicted
20
20
  ">jun2015":
21
21
  - uniprot_swissprot_accession~uniprot_swissprot
22
+ ">jan2023":
23
+ - external_transcript_id~external_transcript_name
22
24
  <aug2014:
23
25
  - external_gene_name~external_gene_id
24
26
  may2010:
data/etc/build_organism CHANGED
@@ -2,7 +2,7 @@
2
2
  hg18 Hsa/may2008
3
3
  hg19 Hsa/feb2014
4
4
  b37 Hsa/feb2014
5
- hg38 Hsa/may2017
6
- GRCh38 Hsa/may2017
7
- mm10 Mmu/may2017
8
- GRCm38 Mmu/may2017
5
+ hg38 Hsa/feb2023
6
+ GRCh38 Hsa/feb2023
7
+ mm10 Mmu/feb2023
8
+ GRCm38 Mmu/feb2023
data/etc/organisms CHANGED
@@ -1,2 +1,3 @@
1
1
  Hsa
2
2
  Mmu
3
+ Sce
@@ -3,6 +3,7 @@ require 'rbbt/tsv'
3
3
  require 'rbbt/tsv/attach'
4
4
  require 'rbbt/util/log'
5
5
  require 'cgi'
6
+ require 'rbbt/sources/organism'
6
7
 
7
8
  # This module interacts with BioMart. It performs queries to BioMart and
8
9
  # synthesises a hash with the results. Note that this module connects to the
@@ -13,7 +14,7 @@ module BioMart
13
14
 
14
15
  class BioMart::QueryError < StandardError; end
15
16
 
16
- BIOMART_URL = 'http://www.ensembl.org/biomart/martservice?query='
17
+ BIOMART_URL = 'ensembl.org/biomart/martservice'
17
18
 
18
19
  MISSING_IN_ARCHIVE = Rbbt.etc.biomart.missing_in_archive.exists? ? Rbbt.etc.biomart.missing_in_archive.find.yaml : {}
19
20
 
@@ -22,7 +23,7 @@ module BioMart
22
23
  @@biomart_query_xml = <<-EOT
23
24
  <?xml version="1.0" encoding="UTF-8"?>
24
25
  <!DOCTYPE Query>
25
- <Query completionStamp="1" virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" >
26
+ <Query completionStamp="1" virtualSchemaName = "<!--VIRTUALSCHEMANAME-->" formatter = "TSV" header = "0" uniqueRows = "1" datasetConfigVersion = "0.6" >
26
27
  <Dataset name = "<!--DATABASE-->" interface = "default" >
27
28
  <!--FILTERS-->
28
29
  <!--MAIN-->
@@ -36,14 +37,10 @@ module BioMart
36
37
  raise "Biomart archive #{ date } is not allowed in this installation" unless Rbbt.etc.allowed_biomart_archives.find.read.split("\n").include? date
37
38
  end
38
39
  Thread.current['archive'] = date
39
- Thread.current['archive_url'] = BIOMART_URL.sub(/www/, date + '.archive')
40
- Log.debug "Using Archive URL #{ Thread.current['archive_url'] }"
41
40
  end
42
41
 
43
42
  def self.unset_archive
44
- Log.debug "Restoring current version URL #{BIOMART_URL}"
45
43
  Thread.current['archive'] = nil
46
- Thread.current['archive_url'] = nil
47
44
  end
48
45
 
49
46
  def self.with_archive(data)
@@ -55,6 +52,21 @@ module BioMart
55
52
  end
56
53
  end
57
54
 
55
+ def self.final_url(query, archive = nil, ensembl_domain = nil)
56
+ url_domain = if archive.nil?
57
+ if ensembl_domain.nil?
58
+ 'www'
59
+ else
60
+ ensembl_domain
61
+ end
62
+ elsif ensembl_domain
63
+ [archive, ensembl_domain] * "-"
64
+ else
65
+ [archive, 'archive'] * "."
66
+ end
67
+ "http://" + url_domain + "." + BIOMART_URL + "?query=#{query}"
68
+ end
69
+
58
70
  def self.get(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
59
71
  open_options = Misc.add_defaults open_options, :wget_options => {"--read-timeout=" => 9000, "--tries=" => 1}
60
72
  repeats = true
@@ -75,11 +87,17 @@ module BioMart
75
87
 
76
88
  query = @@biomart_query_xml.dup
77
89
  query.sub!(/<!--DATABASE-->/,database)
90
+ if Thread.current["ensembl_domain"]
91
+ query.sub!(/<!--VIRTUALSCHEMANAME-->/, Thread.current["ensembl_domain"] + "_mart")
92
+ else
93
+ query.sub!(/<!--VIRTUALSCHEMANAME-->/,'default')
94
+ end
78
95
  query.sub!(/<!--FILTERS-->/, filters.collect{|name, v| v.nil? ? "<Filter name = \"#{ name }\" excluded = \"0\"/>" : "<Filter name = \"#{ name }\" value = \"#{Array === v ? v * "," : v}\"/>" }.join("\n") )
79
96
  query.sub!(/<!--MAIN-->/,"<Attribute name = \"#{main}\" />")
80
97
  query.sub!(/<!--ATTRIBUTES-->/, attrs.collect{|name| "<Attribute name = \"#{ name }\"/>"}.join("\n") )
81
98
 
82
- url = Thread.current['archive_url'] ? Thread.current['archive_url'] + query.gsub(/\n/,' ') : BIOMART_URL + query.gsub(/\n/,' ')
99
+ url = final_url(query, Thread.current["archive"], Thread.current["ensembl_domain"])
100
+
83
101
 
84
102
  begin
85
103
  response = Open.read(url, open_options.dup)
@@ -105,10 +123,17 @@ module BioMart
105
123
 
106
124
  new_datafile = TmpFile.tmp_file
107
125
  if data.nil?
108
- TSV.merge_row_fields Open.open(result_file), new_datafile
126
+ Open.open(result_file) do |file|
127
+ Open.write(new_datafile, Open.collapse_stream(file))
128
+ end
109
129
  data = new_datafile
110
130
  else
111
- TSV.merge_different_fields data, result_file, new_datafile
131
+ Open.open(result_file) do |stream_result|
132
+ Open.open(data) do |stream_data|
133
+ Open.write(new_datafile, Open.collapse_stream(TSV.paste_streams([stream_data, stream_result], sort: true, sort_cmd_args: '-s -k1,1'), compact: true))
134
+ end
135
+ end
136
+ #TSV.merge_different_fields Open.open(data), Open.open(result_file), new_datafile, one2one: false, sort: :first
112
137
  FileUtils.rm data
113
138
  data = new_datafile
114
139
  end
@@ -142,9 +167,9 @@ module BioMart
142
167
 
143
168
  IndiferentHash.setup(open_options)
144
169
 
145
- Log.low "BioMart query: '#{main}' [#{(attrs || []) * ', '}] [#{(filters || []) * ', '}] #{open_options.inspect}"
170
+ Log.low "BioMart query: '#{main}' [#{(attrs || []) * ', '}] [#{Log.fingerprint filters}] #{open_options.inspect}"
146
171
 
147
- max_items = 2
172
+ max_items = 1
148
173
  chunks = []
149
174
  chunk = []
150
175
  attrs.each{|a|
@@ -178,7 +203,7 @@ module BioMart
178
203
  results
179
204
  else
180
205
  Open.write(filename) do |f|
181
- f.puts "#: " << Misc.hash2string(TSV::ENTRIES.collect{|key| [key, open_options[key]]})
206
+ f.puts "#: " << Misc.hash2string(TSV.annotations{|key| [key, open_options[key]]})
182
207
  if field_names.nil?
183
208
  f.puts "#" << [main, attrs].flatten * "\t"
184
209
  else
@@ -211,7 +236,17 @@ module BioMart
211
236
  changes = {}
212
237
  missing.select{|m| m.include? "~" }.each do |str|
213
238
  orig,_sep, new = str.partition "~"
214
- changes[orig] = new
239
+ if orig.include?(":")
240
+ target_db, _sep, orig = orig.partition(":")
241
+ if target_db[0] == "-"
242
+ next if database == target_db[1..-1]
243
+ else
244
+ next unless database == target_db
245
+ end
246
+ changes[orig] = new
247
+ else
248
+ changes[orig] = new
249
+ end
215
250
  end
216
251
  changed = true
217
252
  while changed
@@ -9,11 +9,29 @@ module Ensembl
9
9
  module FTP
10
10
 
11
11
  SERVER = "ftp.ensembl.org"
12
+ DOMAIN_SERVER = "ftp.ensemblgenomes.org"
12
13
 
13
- def self.mysql_path(release)
14
+ def self.ftp_name_for_domain(domain, organism, subdir='mysql')
15
+ code, build = organism.split "/"
16
+ build ||= "current"
17
+
18
+ release = build == "current" ? 'current' : Ensembl.releases[build]
19
+ name = Organism.scientific_name(organism)
20
+ ftp = Net::FTP.new(Ensembl::FTP::DOMAIN_SERVER)
21
+ ftp.passive = true
22
+ ftp.login
23
+ dir = File.join('pub', domain, 'current', subdir)
24
+ ftp.chdir(dir)
25
+ file = ftp.list(name.downcase.gsub(" ",'_') + "*").reject{|f| f.split("_").length > 3 && ! f.include?("_core_") }.reject{|f| f =~ /\.gz$/}.collect{|l| l.split(" ").last}.last
26
+ ftp.close
27
+ [release, File.join(Ensembl::FTP::DOMAIN_SERVER, dir, file)]
14
28
  end
15
29
 
16
- def self.ftp_name_for(organism)
30
+ def self.ftp_name_for(organism, subdir='mysql')
31
+ if domain = Thread.current["ensembl_domain"]
32
+ return ftp_name_for_domain(domain, organism,subdir)
33
+ end
34
+
17
35
  code, build = organism.split "/"
18
36
  build ||= "current"
19
37
 
@@ -23,8 +41,9 @@ module Ensembl
23
41
  ftp = Net::FTP.new(Ensembl::FTP::SERVER)
24
42
  ftp.passive = true
25
43
  ftp.login
26
- ftp.chdir(File.join('pub', 'current_mysql'))
27
- file = ftp.list(name.downcase.gsub(" ",'_') + "_core_*").collect{|l| l.split(" ").last}.last
44
+ dir = File.join('pub', "current_#{subdir}")
45
+ ftp.chdir(dir)
46
+ file = ftp.list(name.downcase.gsub(" ",'_') + "*").reject{|f| f.split("_").length > 3 && ! f.include?("_core_") }.collect{|l| l.split(" ").last}.last
28
47
  ftp.close
29
48
  else
30
49
  release = Ensembl.releases[build]
@@ -32,24 +51,21 @@ module Ensembl
32
51
  ftp = Net::FTP.new(Ensembl::FTP::SERVER)
33
52
  ftp.passive = true
34
53
  ftp.login
35
- ftp.chdir(File.join('pub', release, 'mysql'))
36
- file = ftp.list(name.downcase.gsub(" ",'_') + "_core_*").reject{|f| f =~ /\.gz$/}.collect{|l| l.split(" ").last}.last
54
+ dir = File.join('pub', release, subdir)
55
+ ftp.chdir(dir)
56
+ file = ftp.list(name.downcase.gsub(" ",'_') + "*").reject{|f| f.split("_").length > 3 && ! f.include?("_core_") }.collect{|l| l.split(" ").last}.last
37
57
  ftp.close
38
58
  end
39
- [release, file]
59
+ [release, File.join(Ensembl::FTP::SERVER, dir, file)]
40
60
  end
41
61
 
42
- def self.ftp_directory_for(organism)
43
- release, ftp_name = ftp_name_for(organism)
44
- if release == 'current'
45
- File.join('/pub/', 'current_mysql', ftp_name)
46
- else
47
- File.join('/pub/', release, 'mysql', ftp_name)
48
- end
62
+ def self.ftp_url_for(organism)
63
+ release, ftp_url = ftp_name_for(organism)
64
+ ftp_url
49
65
  end
50
66
 
51
67
  def self.base_url(organism)
52
- File.join("ftp://" + SERVER, ftp_directory_for(organism) )
68
+ File.join("ftp://", ftp_url_for(organism) )
53
69
  end
54
70
 
55
71
  def self.url_for(organism, table, extension)
@@ -8,6 +8,19 @@ module Entrez
8
8
 
9
9
  Rbbt.claim Rbbt.share.databases.entrez.gene_info, :url, 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene_info.gz'
10
10
  Rbbt.claim Rbbt.share.databases.entrez.gene2pubmed, :url, 'ftp://ftp.ncbi.nih.gov/gene/DATA/gene2pubmed.gz'
11
+ Rbbt.claim Rbbt.share.databases.entrez.tax_ids, :proc do |filename|
12
+ TmpFile.with_dir do |dir|
13
+ Misc.in_dir dir do
14
+ CMD.cmd("wget 'https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz'")
15
+ CMD.cmd("tar xvfz taxdump.tar.gz")
16
+ CMD.cmd("grep 'scientific name' names.dmp |cut -f 1,3 > tmp.tsv")
17
+ tsv = TSV.open('tmp.tsv', type: :single)
18
+ tsv.key_field = "Entrez Tax ID"
19
+ tsv.fields = ["Scientific Name"]
20
+ Open.write(filename, tsv.to_s)
21
+ end
22
+ end
23
+ end
11
24
 
12
25
  def self.entrez2native(taxs, options = {})
13
26
  options = Misc.add_defaults options, :key_field => 1, :fields => [5], :persist => true, :merge => true
@@ -25,8 +25,8 @@ module GO
25
25
  # the gene_ontology.obo file and extracts all the fields, although right now,
26
26
  # only the name field is used.
27
27
  def self.init
28
- Persist.persist_tsv(nil, 'gene_ontology', {}, :persist => true) do |info|
29
- info.serializer = :marshal if info.respond_to? :serializer
28
+ Persist.persist_tsv(nil, 'gene_ontology', {}, :persist => true, serializer: :marshal) do |info|
29
+ #info.serializer = :marshal if info.respond_to? :serializer
30
30
  Rbbt.share.databases.GO.gene_ontology.produce.read.split(/\[Term\]/).each{|term|
31
31
  term_info = {}
32
32
 
@@ -0,0 +1,26 @@
1
+ require 'rbbt-util'
2
+ require 'rbbt/resource'
3
+
4
+ module MeSH
5
+ extend Resource
6
+
7
+ self.subdir = "share/databases/MeSH"
8
+
9
+ MeSH.claim MeSH["data.gz"], :url, "https://nlmpubs.nlm.nih.gov/projects/mesh/rdf/mesh.nt.gz"
10
+
11
+ MeSH.claim MeSH.vocabulary, :proc do
12
+ dumper = TSV::Dumper.new :key_field => "MeSH ID", :fields => ["Label"], :type => :single
13
+ dumper.init
14
+ TSV.traverse MeSH.data, :type => :array, :into => dumper, :bar => "Processing MeSH vocab" do |line|
15
+ sub, verb, obj = line.split("\t")
16
+
17
+ next unless verb && verb.include?("rdf-schema#label")
18
+
19
+ id = sub.split("/").last[0..-2]
20
+ label = obj.split('"')[1]
21
+
22
+ [id, label]
23
+ end
24
+ end
25
+
26
+ end
@@ -9,6 +9,10 @@ module Organism
9
9
  ARCHIVE_MONTH_INDEX = {}
10
10
  %w(jan feb mar apr may jun jul aug sep oct nov dec).each_with_index{|d,i| ARCHIVE_MONTH_INDEX[d] = i }
11
11
 
12
+ def self.rake_organism_helper
13
+ Rbbt.share.install.Organism["organism_helpers.rb"].find
14
+ end
15
+
12
16
  def self.compare_archives(a1, a2)
13
17
  a1 = a1.partition("/").last if a1 and a1.include? "/"
14
18
  a2 = a2.partition("/").last if a2 and a2.include? "/"
@@ -29,7 +33,8 @@ module Organism
29
33
  end
30
34
 
31
35
  def self.default_code(organism = "Hsa")
32
- organism.split("/").first << "/feb2014"
36
+ latest = Rbbt.etc.allowed_biomart_archives.list.sort{|a,b| compare_archives(a, b)}.last
37
+ organism.split("/").first << "/" << latest
33
38
  end
34
39
 
35
40
  def self.organism_codes(organism = nil)
@@ -43,7 +48,8 @@ module Organism
43
48
  end
44
49
 
45
50
  def self.installed_organisms
46
- Rbbt.share.install.Organism.find.glob('???').collect{|f| File.basename(f)}
51
+ Rbbt.share.install.Organism.find.glob('???').collect{|f| File.basename(f) } +
52
+ Rbbt.share.install.Organism.find.glob('*.rake').collect{|f| File.basename(f).sub(/\.rake/, '') }
47
53
  end
48
54
 
49
55
  def self.prepared_organisms
@@ -62,25 +68,6 @@ module Organism
62
68
  nil
63
69
  end
64
70
 
65
- Organism.installable_organisms.each do |organism|
66
- claim Organism[organism], :rake, Rbbt.share.install.Organism[organism].Rakefile.find
67
-
68
- module_eval "#{ organism } = with_key '#{organism}'"
69
- end
70
-
71
- Rbbt.claim Rbbt.software.opt.bin.liftOver, :proc do |file|
72
- Open.mkdir File.dirname(file) unless File.directory?(file)
73
- url = "http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/liftOver"
74
- CMD.cmd_log("wget '#{url}' -O '#{file}'")
75
- CMD.cmd("chmod 0755 '#{file}'")
76
- Rbbt.set_software_env
77
- nil
78
- end
79
-
80
- CMD.tool :liftOver, Rbbt.software.opt.bin.liftOver
81
-
82
- Rbbt.set_software_env
83
-
84
71
  def self.hg_build(organism)
85
72
  require 'rbbt/sources/ensembl_ftp'
86
73
  organism = organism.strip
@@ -257,7 +244,16 @@ module Organism
257
244
  end
258
245
 
259
246
  def self.scientific_name(organism)
260
- Organism[organism]["scientific_name"].produce.read.strip
247
+ Organism[organism].scientific_name.read.strip
248
+ end
249
+
250
+ def self.make_organism(name, long = false)
251
+ first, _, second = name.partition(/[ _]/)
252
+ if long
253
+ first[0].upcase + second.downcase.gsub(/[^a-z]/,'')
254
+ else
255
+ first[0].upcase + second[0..1].downcase
256
+ end
261
257
  end
262
258
 
263
259
  def self.organism(name)
@@ -295,7 +291,7 @@ module Organism
295
291
  organism ||= "Hsa"
296
292
 
297
293
  @@gene_start_end ||= {}
298
- gene_start_end = @@gene_start_end[organism] ||= Organism.gene_positions(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["Gene Start", "Gene End"], :type => :list, :cast => :to_i, :unmamed => true)
294
+ gene_start_end = @@gene_start_end[organism] ||= Organism.gene_positions(organism).tsv(:persist => true, :key_field => "Ensembl Gene ID", :fields => ["Gene Start", "Gene End"], :type => :list, :cast => :to_i, :unnamed => true)
299
295
 
300
296
  ranges = genes.collect{|gene|
301
297
  start, eend = gene_start_end[gene]
@@ -339,7 +335,8 @@ module Organism
339
335
  def self.chromosome_sizes(organism = Organism.default_code("Hsa"))
340
336
  chromosome_sizes = {}
341
337
 
342
- Organism[organism].glob_all("chromosome_*").each do |file|
338
+ Organism.chromosomes(organism).produce.tsv.each do |chr|
339
+ file = Organism[organism]["chromosome_#{chr}"].produce.find
343
340
  chromosome = file.split("_").last.split(".").first
344
341
  size = if Open.gzip?(file) || Open.bgzip?(file)
345
342
  CMD.cmd("zcat '#{ file }' | wc -c ").read
@@ -352,4 +349,28 @@ module Organism
352
349
  chromosome_sizes
353
350
  end
354
351
 
352
+ Rbbt.claim Rbbt.software.opt.bin.liftOver, :proc do |file|
353
+ Open.mkdir File.dirname(file) unless File.directory?(file)
354
+ url = "http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/liftOver"
355
+ CMD.cmd_log("wget '#{url}' -O '#{file}'")
356
+ CMD.cmd("chmod 0755 '#{file}'")
357
+ Rbbt.set_software_env
358
+ nil
359
+ end
360
+
361
+ CMD.tool :liftOver, Rbbt.software.opt.bin.liftOver
362
+
363
+ Rbbt.set_software_env
364
+
365
+ Organism.installable_organisms.each do |organism|
366
+ if Rbbt.share.install.Organism[organism].Rakefile.exists?
367
+ rakefile = Rbbt.share.install.Organism[organism].Rakefile.find
368
+ else
369
+ rakefile = Rbbt.share.install.Organism[organism + '.rake'].find
370
+ end
371
+
372
+ claim Organism[organism], :rake, rakefile
373
+
374
+ module_eval "#{ organism } = with_key '#{organism}'"
375
+ end
355
376
  end
@@ -51,6 +51,7 @@ module PubMed
51
51
  end
52
52
  [lastname.gsub(/\s/,'_'), year || "NOYEAR", abrev] * ""
53
53
  end
54
+
54
55
  def self.parse_xml(xml)
55
56
  require 'nokogiri'
56
57
 
@@ -91,6 +92,16 @@ module PubMed
91
92
  [lastname, forename] * ", "
92
93
  end * " and "
93
94
 
95
+ info[:mesh] = parser.search("MeshHeadingList/MeshHeading").collect do |mesh|
96
+ descriptor = mesh.search("DescriptorName").first.attr('UI')
97
+ qualifiers = mesh.search("QualifierName").collect{|q| q.attr('UI')}
98
+ [descriptor] + qualifiers.collect{|q| descriptor + q }
99
+ end.compact.flatten
100
+
101
+ info[:substance] = parser.search("NameOfSubstance").collect do |substance|
102
+ substance.attr('UI')
103
+ end
104
+
94
105
  info[:bibentry] = bibentry.downcase if bibentry
95
106
 
96
107
  info[:pmc_pdf] = parser.search("PubmedData/ArticleIdList/ArticleId").select{|id| id[:IdType] == "pmc"}.first
@@ -102,7 +113,7 @@ module PubMed
102
113
  info
103
114
  end
104
115
 
105
- attr_accessor :title, :abstract, :journal, :author, :pmid, :bibentry, :pmc_pdf, :gscholar_pdf, :pdf_url
116
+ attr_accessor :title, :abstract, :journal, :author, :pmid, :bibentry, :pmc_pdf, :gscholar_pdf, :pdf_url, :mesh, :substance
106
117
  attr_accessor *XML_KEYS.collect{|p| p.first }
107
118
 
108
119
  def initialize(xml)
@@ -141,7 +152,7 @@ module PubMed
141
152
  `wget --user-agent=firefox #{ pdf_url } -O #{ pdf } -t 3`
142
153
  TmpFile.with_file do |txt|
143
154
  `pdftotext #{ pdf } #{ txt }`
144
- text = Open.read(txt) if File.exists? txt
155
+ text = Open.read(txt) if File.exist?(txt)
145
156
  end
146
157
  end
147
158
  text
@@ -1,8 +1,3 @@
1
- $LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', '..', 'lib'))
2
- require 'rbbt/sources/biomart'
3
- require 'rbbt/sources/entrez'
4
- require File.join(File.dirname(__FILE__), '../../lib/helpers')
5
-
6
1
  $taxs = [9606]
7
2
  $scientific_name = "Homo sapiens"
8
3
  $ortholog_key = "hsapiens_homolog_ensembl_gene"
@@ -95,17 +90,30 @@ $biomart_identifiers = [
95
90
  [ 'Illumina HumanWG 6 v3', 'illumina_humanwg_6_v3' ],
96
91
  ]
97
92
 
98
- $biomart_go= [
99
- ["GO ID", 'go_id'],
100
- ["GO Namespace", 'namespace_1003'],
93
+ $namespace = File.basename(__FILE__).sub(/\.rake$/,'')
94
+ Thread.current["namespace"] = $namespace
95
+ load Organism.rake_organism_helper
96
+
97
+ file 'regulators' do |t|
98
+ regulatory_id = ['Regulatory stable ID', 'regulatory_stable_id']
99
+ regulatory_fields = [
100
+ ['Chromosome Name','chromosome_name'],
101
+ ['Region Start', 'chromosome_start'],
102
+ ['Region End', 'chromosome_end'],
103
+ ['Feature type', 'feature_type_name'],
101
104
  ]
105
+ regulators = BioMart.tsv('hsapiens_regulatory_feature', regulatory_id, regulatory_fields, [], nil, :type => :list, :namespace => Thread.current['namespace'])
106
+
107
+ Misc.sensiblewrite(t.name, regulators.to_s)
108
+ end
102
109
 
103
- $biomart_go_2009= [
104
- ["GO BP ID", 'go_biological_process_id'],
105
- ["GO MF ID", 'go_molecular_function_id'],
106
- ["GO CC ID", 'go_cellular_component_id'],
110
+ file 'regulator_activity' do |t|
111
+ regulatory_id = ['Regulatory stable ID', 'regulatory_stable_id']
112
+ regulatory_fields = [
113
+ ['Epigenome name','epigenome_name'],
114
+ ['Activity', 'activity'],
107
115
  ]
116
+ regulators = BioMart.tsv('hsapiens_regulatory_feature', regulatory_id, regulatory_fields, [], nil, :type => :double, :namespace => Thread.current['namespace'])
108
117
 
109
- #$namespace = File.basename(File.dirname(File.expand_path(__FILE__)))
110
- Thread.current["namespace"] = File.basename(File.dirname(File.expand_path(__FILE__)))
111
- load File.join(File.dirname(__FILE__), '../organism_helpers.rb')
118
+ Misc.sensiblewrite(t.name, regulators.to_s)
119
+ end
@@ -1,8 +1,3 @@
1
- $LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', '..', 'lib'))
2
- require 'rbbt/sources/biomart'
3
- require 'rbbt/sources/entrez'
4
- require File.join(File.dirname(__FILE__), '../../lib/helpers')
5
-
6
1
  $taxs = [10090]
7
2
  $scientific_name = "Mus musculus"
8
3
  $ortholog_key = "mmusculus_homolog_ensembl_gene"
@@ -43,18 +38,6 @@ $biomart_identifiers = [
43
38
  [ 'EMBL (Genbank) ID' , "embl"] ,
44
39
  ]
45
40
 
46
- $biomart_go= [
47
- ["GO ID", 'go_id'],
48
- ["GO Namespace", 'namespace_1003'],
49
- ]
50
-
51
- $biomart_go_2009= [
52
- ["GO BP ID", 'go_biological_process_id'],
53
- ["GO MF ID", 'go_molecular_function_id'],
54
- ["GO CC ID", 'go_cellular_component_id'],
55
- ]
56
-
57
- $namespace = File.basename(File.dirname(File.expand_path(__FILE__)))
58
- Thread.current["namespace"] = File.basename(File.dirname(File.expand_path(__FILE__)))
59
- load File.join(File.dirname(__FILE__), '../organism_helpers.rb')
60
-
41
+ $namespace = File.basename(__FILE__).sub(/\.rake$/,'')
42
+ Thread.current["namespace"] = $namespace
43
+ load Organism.rake_organism_helper
@@ -1,8 +1,3 @@
1
- $LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', '..', 'lib'))
2
- require 'rbbt/sources/biomart'
3
- require 'rbbt/sources/entrez'
4
- require File.join(File.dirname(__FILE__), '../../lib/helpers')
5
-
6
1
  $taxs = [10116]
7
2
  $scientific_name = "Rattus norvegicus"
8
3
 
@@ -50,6 +45,6 @@ $biomart_protein_identifiers = [
50
45
  [ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession"],
51
46
  ]
52
47
 
53
- $namespace = File.basename(File.dirname(File.expand_path(__FILE__)))
54
- Thread.current["namespace"] = File.basename(File.dirname(File.expand_path(__FILE__)))
55
- load File.join(File.dirname(__FILE__), '../organism_helpers.rb')
48
+ $namespace = File.basename(__FILE__).sub(/\.rake$/,'')
49
+ Thread.current["namespace"] = $namespace
50
+ load Organism.rake_organism_helper
@@ -0,0 +1,38 @@
1
+ $taxs = [559292,4932]
2
+ $scientific_name = "Saccharomyces cerevisiae"
3
+ $ensembl_domain = 'fungi'
4
+ #$ortholog_key = "yeast_ensembl_gene"
5
+
6
+ $biomart_db = 'scerevisiae_eg_gene'
7
+
8
+ $biomart_lexicon = [
9
+ [ 'Associated Gene Name' , "external_gene_name"],
10
+ ]
11
+
12
+ $biomart_protein_identifiers = [
13
+ [ 'Protein ID', "protein_id" ],
14
+ [ 'RefSeq Protein ID', "refseq_peptide" ],
15
+ [ 'Unigene ID', "unigene" ],
16
+ [ 'UniProt/SwissProt ID', "uniprot_swissprot" ],
17
+ [ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
18
+ ]
19
+
20
+ $biomart_probe_identifiers = [
21
+ ]
22
+
23
+ $biomart_identifiers = [
24
+ [ 'Entrez Gene ID', "entrezgene"],
25
+ [ 'Ensembl Protein ID', "ensembl_peptide_id" ],
26
+ [ 'Associated Gene Name', "external_gene_name" ],
27
+ [ 'Protein ID', "protein_id" ],
28
+ [ 'RefSeq Protein ID', "refseq_peptide" ],
29
+ [ 'UniProt/SwissProt ID', "uniprot_swissprot" ],
30
+ [ 'UniProt/SwissProt Accession', "uniprot_swissprot_accession" ],
31
+ [ 'EMBL (Genbank) ID' , "embl"] ,
32
+ [ 'RefSeq DNA' , "refseq_dna"] ,
33
+ ]
34
+
35
+ $namespace = File.basename(__FILE__).sub(/\.rake$/,'')
36
+ Thread.current["namespace"] = $namespace
37
+ Thread.current["ensembl_domain"] = $ensembl_domain
38
+ load Organism.rake_organism_helper