rbbt-sources 3.3.0 → 3.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,8 +1,11 @@
1
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', 'lib'))
2
+
1
3
  require 'net/ftp'
4
+ require 'rbbt/sources/biomart'
5
+ require 'rbbt/sources/entrez'
6
+ require File.join(File.dirname(__FILE__), '../lib/helpers')
2
7
  require 'rbbt/sources/ensembl_ftp'
3
8
 
4
- #Thread.current['namespace'] = $namespace
5
-
6
9
  $biomart_ensembl_gene = ['Ensembl Gene ID', 'ensembl_gene_id']
7
10
  $biomart_ensembl_protein = ['Ensembl Protein ID', 'ensembl_peptide_id']
8
11
  $biomart_ensembl_exon = ['Ensembl Exon ID', 'ensembl_exon_id']
@@ -77,6 +80,17 @@ $biomart_pfam= [
77
80
  ["Pfam Domain", 'pfam'],
78
81
  ]
79
82
 
83
+ $biomart_go= [
84
+ ["GO ID", 'go_id'],
85
+ ["GO Namespace", 'namespace_1003'],
86
+ ]
87
+
88
+ $biomart_go_2009= [
89
+ ["GO BP ID", 'go_biological_process_id'],
90
+ ["GO MF ID", 'go_molecular_function_id'],
91
+ ["GO CC ID", 'go_cellular_component_id'],
92
+ ]
93
+
80
94
  $biomart_gene_biotype= [
81
95
  ["Biotype", 'gene_biotype'],
82
96
  ]
@@ -91,7 +105,13 @@ $biomart_exons = [
91
105
  #{{{ Rules
92
106
 
93
107
  file 'entrez_taxids' do |t|
94
- Misc.sensiblewrite(t.name, $taxs * "\n")
108
+ if $tax && $tax.any?
109
+ Misc.sensiblewrite(t.name, $taxs * "\n")
110
+ else
111
+ tsv = Rbbt.share.databases.entrez.tax_ids.tsv(:key_field => "Scientific Name", merge: true, type: :flat)
112
+ taxs = tsv[$scientific_name] || []
113
+ Misc.sensiblewrite(t.name, taxs * "\n")
114
+ end
95
115
  end
96
116
 
97
117
  file 'scientific_name' do |t|
@@ -104,7 +124,8 @@ file 'ortholog_key' do |t|
104
124
  Misc.sensiblewrite(t.name, $ortholog_key)
105
125
  end
106
126
 
107
- file 'identifiers' do |t|
127
+ file 'identifiers' => 'entrez_taxids' do |t|
128
+ tax_codes = Open.read(t.prerequisites.first).strip.split("\n")
108
129
  identifiers = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_identifiers, [], nil, :namespace => Thread.current['namespace'])
109
130
  identifiers.unnamed = true
110
131
 
@@ -116,18 +137,20 @@ file 'identifiers' do |t|
116
137
  end
117
138
 
118
139
  name_pos = identifiers.identify_field "Associated Gene Name"
119
- entrez2name = Entrez.entrez2name($taxs)
120
- identifiers.process "Entrez Gene ID" do |entrez, ensembl, values|
121
- names = values[name_pos]
140
+ if tax_codes and tax_codes.any?
141
+ entrez2name = Entrez.entrez2name(tax_codes)
142
+ identifiers.process "Entrez Gene ID" do |entrez, ensembl, values|
143
+ names = values[name_pos] || []
122
144
 
123
- matches = entrez.select do |e|
124
- entrez2name.include?(e) && (names & entrez2name[e]).any?
125
- end
145
+ matches = entrez.select do |e|
146
+ entrez2name.include?(e) && (names & entrez2name[e]).any?
147
+ end
126
148
 
127
- if matches.any?
128
- matches
129
- else
130
- entrez
149
+ if matches.any?
150
+ matches
151
+ else
152
+ entrez
153
+ end
131
154
  end
132
155
  end
133
156
 
@@ -147,15 +170,18 @@ file 'identifiers' do |t|
147
170
  identifiers = identifiers.reorder(:key, ordered_fields)
148
171
  end
149
172
 
150
- entrez_synonyms = Rbbt.share.databases.entrez.gene_info.find.tsv :grep => $taxs.collect{|tax| "^#{tax}"}, :fixed_grep => false, :key_field => 1, :fields => [4]
151
- entrez_synonyms.key_field = "Entrez Gene ID"
152
- entrez_synonyms.fields = ["Entrez Gene Name Synonyms"]
173
+ if tax_codes and tax_codes.any?
174
+ entrez_synonyms = Rbbt.share.databases.entrez.gene_info.find.tsv :grep => tax_codes.collect{|tax| "^#{tax}"}, :fixed_grep => false, :key_field => 1, :fields => [4]
175
+ entrez_synonyms.key_field = "Entrez Gene ID"
176
+ entrez_synonyms.fields = ["Entrez Gene Name Synonyms"]
153
177
 
154
- identifiers.attach entrez_synonyms
178
+ identifiers.attach entrez_synonyms
179
+ end
155
180
 
156
181
  identifiers.with_unnamed do
157
182
  identifiers.each do |key, values|
158
183
  values.each do |list|
184
+ list ||= []
159
185
  list.reject!{|v| v.nil? or v.empty?}
160
186
  list.uniq!
161
187
  end
@@ -166,10 +192,11 @@ file 'identifiers' do |t|
166
192
  Misc.sensiblewrite(t.name, identifiers.to_s)
167
193
  end
168
194
 
169
- file 'lexicon' => 'identifiers' do |t|
195
+ file 'lexicon' => ['identifiers', 'entrez_taxids'] do |t|
170
196
  tsv = TSV.open(t.prerequisites.first).slice(["Associated Gene Name", "Entrez Gene Name Synonyms"])
197
+ tax_codes = Open.read(t.prerequisites.last).strip.split("\n")
171
198
 
172
- entrez_description = Rbbt.share.databases.entrez.gene_info.tsv :grep => $taxs.collect{|tax| "^#{tax}"}, :fixed_grep => false, :key_field => 1, :fields => 8
199
+ entrez_description = Rbbt.share.databases.entrez.gene_info.tsv :grep => tax_codes.collect{|tax| "^#{tax}"}, :fixed_grep => false, :key_field => 1, :fields => 8
173
200
  entrez_description.key_field = "Entrez Gene ID"
174
201
  entrez_description.fields = ["Entrez Gene Description"]
175
202
 
@@ -308,8 +335,9 @@ end
308
335
 
309
336
  # {{{ Other info
310
337
 
311
- file 'gene_pmids' do |t|
312
- tsv = Entrez.entrez2pubmed($taxs)
338
+ file 'gene_pmids' => 'entrez_taxids' do |t|
339
+ tax_codes = Open.read(t.prerequisites.first).strip.split("\n")
340
+ tsv = Entrez.entrez2pubmed(tax_codes)
313
341
  text = "#: :namespace=#{Thread.current['namespace']}\n"
314
342
  text += "#Entrez Gene ID\tPMID"
315
343
  tsv.each do |gene, pmids|
@@ -417,7 +445,7 @@ file 'gene_go_bp' => 'gene_go' do |t|
417
445
 
418
446
  gene_go.monitor = true
419
447
  gene_go.process "GO ID" do |key, go_id, values|
420
- clean = values.zip_fields.select do |id, type|
448
+ clean = NamedArray.zip_fields(values).select do |id, type|
421
449
  type == "biological_process"
422
450
  end
423
451
  clean.collect{|id, type| id}
@@ -487,9 +515,9 @@ file 'gene_pfam' do |t|
487
515
  end
488
516
 
489
517
  file 'chromosomes' do |t|
490
- goterms = BioMart.tsv($biomart_db, ['Chromosome Name', "chromosome_name"] , [] , [], nil, :type => :double, :namespace => Thread.current['namespace'])
518
+ tsv = BioMart.tsv($biomart_db, ['Chromosome Name', "chromosome_name"] , [] , [], nil, :type => :double, :namespace => Thread.current['namespace'])
491
519
 
492
- Misc.sensiblewrite(t.name, goterms.to_s)
520
+ Misc.sensiblewrite(t.name, tsv.keys * "\n")
493
521
  end
494
522
 
495
523
  file 'blacklist_chromosomes' => 'chromosomes' do |t|
@@ -511,6 +539,15 @@ end
511
539
 
512
540
  rule /^chromosome_.*/ do |t|
513
541
  chr = t.name.match(/chromosome_(.*)/)[1]
542
+ path = File.expand_path(t.name)
543
+ dirname = File.dirname(path)
544
+
545
+ organism = File.basename(dirname)
546
+ if organism =~ /^[a-z]{3}20[0-9]{2}/
547
+ archive = organism
548
+ organism = File.basename(File.dirname(dirname))
549
+ organism = File.join(organism, archive)
550
+ end
514
551
 
515
552
  # HACK: Skip LRG chromosomes
516
553
  raise "LRG and GL chromosomes not supported: #{ chr }" if chr =~ /^(?:LRG_|GL0)/
@@ -519,28 +556,51 @@ rule /^chromosome_.*/ do |t|
519
556
 
520
557
  release = Ensembl.releases[archive]
521
558
 
522
- ftp = Net::FTP.new("ftp.ensembl.org")
559
+ fasta_url = Ensembl::FTP.ftp_name_for(organism, 'fasta').last
560
+ server, _, path = fasta_url.partition("/")
561
+ path = "/" + path
562
+
563
+ ftp = Net::FTP.new(server)
523
564
  ftp.passive = true
524
565
  ftp.login
525
- if release.nil? or release == 'current'
526
- ftp.chdir("pub/current_fasta/")
527
- else
528
- ftp.chdir("pub/#{ release }/fasta/")
529
- end
530
- ftp.chdir($scientific_name.downcase.sub(" ",'_'))
566
+ ftp.chdir(path)
531
567
  ftp.chdir('dna')
532
- file = ftp.nlst.select{|file| file =~ /chromosome\.#{ chr }\.fa/}.first
533
-
534
- raise "Fasta file for chromosome not found: '#{ chr }' - #{ archive }, #{ release }" if file.nil?
535
568
 
536
- Log.debug("Downloading chromosome sequence: #{ file } - #{release} #{t.name}")
569
+ file = ftp.nlst.select{|file| file =~ /dna_sm\.chromosome\.#{ chr }\.fa/}.first
570
+ if file
571
+ Log.debug("Downloading chromosome sequence: #{ file } - #{release} #{t.name}")
537
572
 
538
- Misc.lock t.name + '.rake' do
539
- TmpFile.with_file do |tmpfile|
540
- ftp.getbinaryfile(file, tmpfile)
541
- Misc.sensiblewrite(t.name, Open.read(tmpfile, :gzip => true).sub(/^>.*\n/,'').gsub(/\s/,''))
542
- ftp.close
573
+ Misc.lock t.name + '.rake' do
574
+ TmpFile.with_file do |tmpfile|
575
+ ftp.getbinaryfile(file, tmpfile)
576
+ Misc.sensiblewrite(t.name, Open.read(tmpfile, :gzip => true).sub(/^>.*\n/,'').gsub(/\s/,''))
577
+ ftp.close
578
+ end
543
579
  end
580
+ else
581
+ file = ftp.nlst.select{|file| file =~ /dna_sm\.toplevel\.fa\.gz/}.first if file.nil?
582
+ Misc.lock t.name + '.rake' do
583
+ TmpFile.with_file do |tmpfile|
584
+ ftp.getbinaryfile(file, tmpfile)
585
+ txt = Open.read(tmpfile, :gzip => true)
586
+
587
+ chr_txt = []
588
+
589
+ in_chr = false
590
+ txt.split("\n").each do |line|
591
+ if line.start_with?(">#{chr}")
592
+ in_chr = true
593
+ elsif line.start_with?(">")
594
+ in_chr = false
595
+ else
596
+ chr_txt << line if in_chr
597
+ end
598
+ end
599
+ Misc.sensiblewrite(t.name, chr_txt * "" )
600
+ ftp.close
601
+ end
602
+ end
603
+ raise "Fasta file for chromosome not found: '#{ chr }' - #{ archive }, #{ release }" if file.nil?
544
604
  end
545
605
  end
546
606
 
@@ -584,6 +644,16 @@ end
584
644
  require 'bio'
585
645
 
586
646
  file 'transcript_sequence' => ["exons", "transcript_exons", "blacklist_chromosomes"] do |t|
647
+ path = File.expand_path(t.name)
648
+ dirname = File.dirname(path)
649
+
650
+ organism = File.basename(dirname)
651
+ if organism =~ /^[a-z]{3}20[0-9]{2}/
652
+ archive = organism
653
+ organism = File.basename(File.dirname(dirname))
654
+ organism = File.join(organism, archive)
655
+ end
656
+
587
657
  exon_info = TSV.open('exons', :type => :list, :fields => ["Exon Strand", "Exon Chr Start", "Exon Chr End", "Chromosome Name"], :unnamed => true)
588
658
 
589
659
  chr_transcript_ranges ||= {}
@@ -616,10 +686,10 @@ file 'transcript_sequence' => ["exons", "transcript_exons", "blacklist_chromosom
616
686
  chr_transcript_ranges.each do |chr, transcript_ranges|
617
687
  begin
618
688
  raise "LRG, GL, HG, NT, KI, and HSCHR chromosomes not supported: #{chr}" if blacklist_chromosomes.include? chr
619
- p = File.expand_path("./chromosome_#{chr}")
620
- Organism.root.annotate p
621
- p.sub!(%r{.*/organisms/},'share/organisms/')
622
- chr_str = p.produce.read
689
+ pkgdir = Thread.current["resource"]
690
+ p = pkgdir[organism]["chromosome_#{chr}"]
691
+ p.produce or raise "Could not produce #{p}; pkgdir: #{p.pkgdir}"
692
+ chr_str = p.read
623
693
  rescue Exception
624
694
  Log.warn("Chr #{ chr } failed (#{transcript_ranges.length} transcripts not covered): #{$!.message}")
625
695
  raise $! unless $!.message =~ /not supported/
@@ -656,7 +726,7 @@ file 'transcript_5utr' => ["exons", "transcript_exons", "transcripts"] do |t|
656
726
  organism = File.join(organism, archive)
657
727
  end
658
728
 
659
- translation = Ensembl::FTP.ensembl_tsv(organism, 'translation', 'transcript_id', %w(seq_start start_exon_id seq_end end_exon_id), :type => :list, :unmamed => true)
729
+ translation = Ensembl::FTP.ensembl_tsv(organism, 'translation', 'transcript_id', %w(seq_start start_exon_id seq_end end_exon_id), :type => :list, :unnamed => true)
660
730
 
661
731
  if Ensembl::FTP.has_table?(organism, 'exon_stable_id')
662
732
  exon2ensembl = Ensembl::FTP.ensembl_tsv(organism, 'exon_stable_id', 'exon_id', ['stable_id'], :type => :single, :unnamed => true)
@@ -670,9 +740,9 @@ file 'transcript_5utr' => ["exons", "transcript_exons", "transcripts"] do |t|
670
740
  transcript2ensembl = Ensembl::FTP.ensembl_tsv(organism, 'transcript', 'transcript_id', ['stable_id'], :type => :single, :unnamed => true)
671
741
  end
672
742
 
673
- transcript_protein = TSV.open("./transcripts", :key_field => "Ensembl Transcript ID", :fields => ["Ensembl Protein ID"], :type => :single, :unmamed => true)
674
- transcript_exons = TSV.open("./transcript_exons", :unmamed => true)
675
- exon_ranges = TSV.open("./exons",:fields => ["Exon Chr Start", "Exon Chr End"], :cast => :to_i, :unmamed => true)
743
+ transcript_protein = TSV.open("./transcripts", :key_field => "Ensembl Transcript ID", :fields => ["Ensembl Protein ID"], :type => :single, :unnamed => true)
744
+ transcript_exons = TSV.open("./transcript_exons", :unnamed => true)
745
+ exon_ranges = TSV.open("./exons",:fields => ["Exon Chr Start", "Exon Chr End"], :cast => :to_i, :unnamed => true)
676
746
 
677
747
  transcript_utr5 = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["5' UTR Length"], :cast => :to_i, :type => :single)
678
748
  transcript_utr3 = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["3' UTR Length"], :cast => :to_i, :type => :single)
@@ -719,12 +789,13 @@ end
719
789
  file 'protein_sequence' => ["transcripts", "transcript_5utr", "transcript_3utr", "transcript_phase", "transcript_sequence"] do |t|
720
790
  transcript_5utr = TSV.open(File.expand_path('./transcript_5utr'), :unnamed => true)
721
791
  transcript_3utr = TSV.open(File.expand_path('./transcript_3utr'), :unnamed => true)
722
- transcript_phase = TSV.open(File.expand_path('./transcript_phase'), :unnamed => true)
792
+ transcript_phase = TSV.open(File.expand_path('./transcript_phase'), :unnamed => true)
723
793
  transcript_sequence = TSV.open(File.expand_path('./transcript_sequence'), :unnamed => true)
724
794
  transcript_protein = TSV.open(File.expand_path('./transcripts'), :fields => ["Ensembl Protein ID"], :type => :single, :unnamed => true)
725
795
 
726
796
 
727
797
  protein_sequence = TSV.setup({}, :key_field => "Ensembl Protein ID", :fields => ["Sequence"], :type => :single)
798
+ transcript_sequence.monitor = true
728
799
  transcript_sequence.through do |transcript, sequence|
729
800
  protein = transcript_protein[transcript]
730
801
  next if protein.nil? or protein.empty?
@@ -777,6 +848,7 @@ file 'uniprot2ensembl' => ["protein_sequence", "protein_identifiers"] do |t|
777
848
  uni_seq = UniProt.get_uniprot_sequence(uni)
778
849
  ensps = uni2ensps[uni]
779
850
  next if ensps.nil? or ensps.empty?
851
+
780
852
  best_ensp = ensps.sort_by do |ensp|
781
853
  ensp_seq = ensp2seq[ensp]
782
854
  if ensp_seq
@@ -806,7 +878,7 @@ file 'gene_set' do |t|
806
878
  build_code = Organism.GRC_build(organism)
807
879
  scientific_name = $scientific_name
808
880
  url = "ftp://ftp.ensembl.org/pub/release-#{num}/gtf/#{scientific_name.downcase.sub(" ", '_')}/#{scientific_name.sub(" ", '_')}.#{build_code}.#{num}.gtf.gz"
809
- CMD.cmd("wget '#{url}' -O #{t.name}.gz")
881
+ Open.download(url, "#{t.name}.gz")
810
882
  nil
811
883
  end
812
884
 
@@ -825,7 +897,8 @@ file 'cdna_fasta' do |t|
825
897
  num = release.split("-").last
826
898
  build_code = Organism.GRC_build(organism)
827
899
  scientific_name = Organism.scientific_name(organism)
828
- url = "ftp://ftp.ensembl.org/pub/release-#{num}/fasta/#{scientific_name.downcase.sub(" ", '_')}/cdna/#{scientific_name.sub(" ", '_')}.#{build_code}.cdna.all.fa.gz"
829
- CMD.cmd("wget '#{url}' -O #{t.name}.gz")
900
+ url = "ftp://ftp.ensembl.org/pub/release-#{num}/fasta/#{scientific_name.downcase.sub(" ", '_')}/cdna/#{scientific_name.sub(" ", '_')}.#{build_code}.#{num}.cdna.all.fa.gz"
901
+ Open.download(url, "#{t.name}.gz")
830
902
  nil
831
903
  end
904
+
@@ -9,7 +9,7 @@ SOURCE_DIR = 'source'
9
9
  def define_source_tasks(sources)
10
10
  sources.each do |name, url|
11
11
  file File.join(SOURCE_DIR, name) do |t|
12
- FileUtils.mkdir SOURCE_DIR unless File.exists? SOURCE_DIR
12
+ FileUtils.mkdir SOURCE_DIR unless File.exist? SOURCE_DIR
13
13
  Log.log "Retrieving file '#{name}' into '#{t.name}': '#{url}'", Log::LOW
14
14
  Open.write(t.name, Open.open(url, :cache => false, :wget_options => {"--no-check-certificate" => true, "--quiet" => false, :pipe => true}))
15
15
  end
@@ -87,5 +87,5 @@ end
87
87
  task :all => :default
88
88
 
89
89
  task :clean do
90
- ($__tsv_tasks + $__files).each do |file| FileUtils.rm file.to_s if File.exists?(file.to_s) end
90
+ ($__tsv_tasks + $__files).each do |file| FileUtils.rm file.to_s if File.exist?(file.to_s) end
91
91
  end
@@ -3,34 +3,44 @@ require File.expand_path(File.dirname(__FILE__) + '/../../test_helper')
3
3
  require 'rbbt/sources/pubmed'
4
4
  require 'test/unit'
5
5
  require 'rbbt/sources/biomart'
6
+ require 'rbbt/sources/organism'
6
7
  require 'rbbt/util/tmpfile'
7
8
  require 'test/unit'
8
9
 
9
10
  class TestBioMart < Test::Unit::TestCase
10
11
 
11
12
  def setup
12
- BioMart.set_archive Organism.default_code("Hsa")
13
+ BioMart.set_archive "feb2014"
13
14
  end
14
15
 
15
16
  def teardown
16
17
  BioMart.unset_archive
17
18
  end
18
19
 
19
- def _test_get
20
+ def test_get_Sce
20
21
  assert_raise BioMart::QueryError do
21
22
  BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['protein_id'],['with_unknownattr'])
22
23
  end
23
24
 
24
- data = BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['protein_id'],[], nil, :nocache => false, :merge => true, :wget_options => {:quiet => false})
25
+ BioMart.set_archive "feb2023-fungi"
26
+ data = BioMart.get('scerevisiae_eg_gene','entrezgene_id', ['protein_id'],[], nil, :nocache => true, :merge => true, :wget_options => {:quiet => false})
25
27
  tsv = TSV.open data, :double, :merge => true
26
- assert(tsv['852236'][0].include? 'CAA84864')
28
+ assert(tsv['852236'][0].include? 'CAA84864.1')
27
29
 
28
- data = BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['external_gene_id'],[], data, :nocache => false, :wget_options => { :quiet => false} )
30
+ data = BioMart.get('scerevisiae_eg_gene','entrezgene_id', ['external_gene_id'],[], data, :nocache => false, :wget_options => { :quiet => false} )
29
31
  tsv = TSV.open data, :double, :merge => true
30
32
  assert(tsv['852236'][1].include? 'YBL044W')
31
33
  end
32
34
 
33
- def _test_query
35
+ def test_get_Hsa
36
+ Log.severity = 0
37
+ data = BioMart.get('hsapiens_gene_ensembl','entrezgene', ['protein_id'],[], nil, :nocache => true, :merge => true, :wget_options => {:quiet => false})
38
+ tsv = TSV.open data, :double, :merge => true
39
+ assert(tsv['852236'][0].include? 'CAA84864.1')
40
+ end
41
+
42
+
43
+ def test_query
34
44
  data = BioMart.query('scerevisiae_gene_ensembl','entrezgene', ['protein_id','refseq_peptide','external_gene_id','ensembl_gene_id'], [], nil, :nocache => false, :wget_options => { :quiet => false})
35
45
  assert(data['852236']['external_gene_id'].include? 'YBL044W')
36
46
 
@@ -41,6 +51,34 @@ class TestBioMart < Test::Unit::TestCase
41
51
  end
42
52
  end
43
53
 
54
+ def __test_transcrip_exons
55
+ Log.with_severity 1 do
56
+ TmpFile.with_file do |f|
57
+ fields = ['ensembl_transcript_id','ensembl_exon_id','rank']
58
+ main = fields[0]
59
+ attrs = fields.values_at(1, 2)
60
+ attrs_first = [attrs.first]
61
+ attrs_last = [attrs.last]
62
+ database = 'hsapiens_gene_ensembl'
63
+
64
+ filename = BioMart.get(database, main, attrs, {"ensembl_transcript_id" => ['ENST00000357654']}, nil, :nocache => false, :wget_options => {:quiet => false}, :filename => f)
65
+ ppp Open.read(filename)
66
+
67
+ filename = BioMart.get(database, main, attrs_first, {"ensembl_transcript_id" => ['ENST00000357654']}, nil, :nocache => false, :wget_options => {:quiet => false}, :filename => f)
68
+ ppp Open.read(filename)
69
+
70
+ filename = BioMart.get(database, main, attrs_last, {"ensembl_transcript_id" => ['ENST00000357654']}, nil, :nocache => false, :wget_options => {:quiet => false}, :filename => f)
71
+ ppp Open.read(filename)
72
+
73
+ filename = BioMart.query(database, main, attrs, {"ensembl_transcript_id" => ['ENST00000357654']}, nil, :nocache => true, :wget_options => {:quiet => false}, :filename => f)
74
+ ppp Open.read(filename)
75
+
76
+ data = TSV.open Open.open(filename)
77
+ assert(data['852236']['external_gene_id'].include? 'YBL044W')
78
+ end
79
+ end
80
+ end
81
+
44
82
  def test_tsv
45
83
  data = BioMart.tsv('scerevisiae_gene_ensembl',['Entrez Gene', 'entrezgene'], [['Protein ID', 'protein_id'],['RefSeq Peptide','refseq_peptide']], [], nil, :nocache => false, :wget_options => { :quiet => false})
46
84
  assert(data['852236']['Protein ID'].include? 'CAA84864')
@@ -0,0 +1,11 @@
1
+ require File.expand_path(__FILE__).sub(%r(/test/.*), '/test/test_helper.rb')
2
+ require File.expand_path(__FILE__).sub(%r(.*/test/), '').sub(/test_(.*)\.rb/,'\1')
3
+
4
+ class TestEnsemblFTP < Test::Unit::TestCase
5
+ def test_ftp_for
6
+ assert_nothing_raised do
7
+ Ensembl::FTP.ftp_name_for("Hsa/feb2023", 'fasta')
8
+ end
9
+ end
10
+ end
11
+
@@ -14,6 +14,11 @@ class TestEntrez < Test::Unit::TestCase
14
14
  assert(lexicon['855611'].include? 'S000005056')
15
15
  end
16
16
 
17
+ def test_entrez2name
18
+ tax = $yeast_tax
19
+ Entrez.entrez2name(tax)
20
+ end
21
+
17
22
  def test_entrez2pubmed
18
23
  tax = $yeast_tax
19
24
 
@@ -0,0 +1,10 @@
1
+ require File.expand_path(__FILE__).sub(%r(/test/.*), '/test/test_helper.rb')
2
+ require File.expand_path(__FILE__).sub(%r(.*/test/), '').sub(/test_(.*)\.rb/,'\1')
3
+
4
+ class TestMESH < Test::Unit::TestCase
5
+ def test_vocab
6
+ tsv = MeSH.vocabulary.tsv
7
+ assert_equal "3T3 Cells", tsv["D016475"]
8
+ end
9
+ end
10
+
@@ -5,37 +5,37 @@ require 'rbbt/sources/ensembl_ftp'
5
5
 
6
6
  class TestOrganism < Test::Unit::TestCase
7
7
 
8
- def test_known_ids
8
+ def _test_known_ids
9
9
  assert Organism.known_ids("Hsa").include?("Associated Gene Name")
10
10
  end
11
11
 
12
- def test_location
12
+ def _test_location
13
13
  assert_equal "share/organisms/Sce/identifiers", Organism.identifiers('Sce')
14
14
  end
15
15
 
16
- def test_identifiers
16
+ def _test_identifiers
17
17
  assert Organism.identifiers('Hsa/feb2014').tsv(:key_field => "Entrez Gene ID", :persist => true)['1020']["Associated Gene Name"].include?('CDK5')
18
18
  assert Organism.identifiers('Sce').tsv(:persist => true)['S000006120']["Ensembl Gene ID"].include?('YPL199C')
19
19
  assert Organism.identifiers("Sce").tsv(:persist => true)['S000006120']["Ensembl Gene ID"].include?('YPL199C')
20
20
  end
21
21
 
22
- def test_lexicon
22
+ def _test_lexicon
23
23
  assert TSV.open(Organism.lexicon('Sce'))['S000006120'].flatten.include?('YPL199C')
24
24
  end
25
25
 
26
- def test_guess_id
26
+ def _test_guess_id
27
27
  ensembl = %w(YOL044W YDR289C YAL034C YGR246C ARS519 tH(GUG)E2 YDR218C YLR002C YGL224C)
28
28
  gene_name = %w(SNR64 MIP1 MRPS18 TFB2 JEN1 IVY1 TRS33 GAS3)
29
29
  assert_equal "Associated Gene Name", Organism.guess_id("Sce", gene_name).first
30
30
  assert_equal "Ensembl Gene ID", Organism.guess_id("Sce", ensembl).first
31
31
  end
32
32
 
33
- def test_organisms
33
+ def _test_organisms
34
34
  assert Organism.organisms.include? "Hsa"
35
35
  assert_equal "Hsa", Organism.organism("Homo sapiens")
36
36
  end
37
37
 
38
- def test_attach_translations
38
+ def _test_attach_translations
39
39
  tsv = TSV.setup({"1020" => []}, :type => :list)
40
40
  tsv.key_field = "Entrez Gene ID"
41
41
  tsv.fields = []
@@ -47,21 +47,21 @@ class TestOrganism < Test::Unit::TestCase
47
47
  assert_equal "CDK5", tsv["1020"]["Associated Gene Name"]
48
48
  end
49
49
 
50
- def test_entrez_taxids
50
+ def _test_entrez_taxids
51
51
  assert_equal "Hsa", Organism.entrez_taxid_organism('9606')
52
52
  end
53
53
 
54
- def test_lift_over
54
+ def _test_lift_over
55
55
  mutation_19 = "19:21131664:T"
56
56
  mutation_18 = "19:20923504:T"
57
- source_build = Organism.default_code("Hsa")
57
+ source_build = "Hsa/feb2014"
58
58
  target_build = "Hsa/may2009"
59
59
 
60
60
  assert_equal mutation_18, Organism.liftOver([mutation_19], source_build, target_build).first
61
61
  assert_equal mutation_19, Organism.liftOver([mutation_18], target_build, source_build).first
62
62
  end
63
63
 
64
- def test_orhtolog
64
+ def _test_orhtolog
65
65
  require 'rbbt/entity/gene'
66
66
  assert_equal ["ENSG00000133703"], Gene.setup("Kras", "Associated Gene Name", "Mmu/jun2011").ensembl.ortholog(Organism.default_code("Hsa"))
67
67
  end
@@ -70,23 +70,23 @@ class TestOrganism < Test::Unit::TestCase
70
70
  assert Organism.chromosome_sizes["2"].to_i > 10_000_000
71
71
  end
72
72
 
73
- def test_build_organism
73
+ def _test_build_organism
74
74
  assert_equal 'Hsa/may2017', Organism.organism_for_build('hg38')
75
75
  assert_equal 'Hsa/feb2014', Organism.organism_for_build('b37')
76
76
  assert_equal 'Mmu/may2017', Organism.organism_for_build('mm10')
77
77
  end
78
78
 
79
- #def test_genes_at_chromosome
79
+ #def _test_genes_at_chromosome
80
80
  # pos = [12, 117799500]
81
81
  # assert_equal "ENSG00000089250", Organism::Hsa.genes_at_chromosome_positions(pos.first, pos.last)
82
82
  #end
83
83
 
84
- #def test_genes_at_chromosome_array
84
+ #def _test_genes_at_chromosome_array
85
85
  # pos = [12, [117799500, 106903900]]
86
86
  # assert_equal ["ENSG00000089250", "ENSG00000013503"], Organism::Hsa.genes_at_chromosome_positions(pos.first, pos.last)
87
87
  #end
88
88
 
89
- #def test_genes_at_genomic_positions
89
+ #def _test_genes_at_genomic_positions
90
90
  # pos = [[12, 117799500], [12, 106903900], [1, 115259500]]
91
91
  # assert_equal ["ENSG00000089250", "ENSG00000013503", "ENSG00000213281"], Organism::Hsa.genes_at_genomic_positions(pos)
92
92
  #end
@@ -5,7 +5,17 @@ require 'test/unit'
5
5
 
6
6
  class TestPubMed < Test::Unit::TestCase
7
7
 
8
- def test_get_article
8
+ def test_mesh
9
+ pmid = '10866666'
10
+ assert_include PubMed.get_article(pmid).mesh, "D016475"
11
+ end
12
+
13
+ def _test_substance
14
+ pmid = '10866666'
15
+ assert_include PubMed.get_article(pmid).substance, "C000717247"
16
+ end
17
+
18
+ def _test_get_article
9
19
  pmid = '16438716'
10
20
  assert(PubMed.get_article(pmid).title == "Discovering semantic features in the literature: a foundation for building functional associations.")
11
21
 
@@ -13,38 +23,38 @@ class TestPubMed < Test::Unit::TestCase
13
23
  assert(PubMed.get_article(pmids)[pmid].title == "Discovering semantic features in the literature: a foundation for building functional associations.")
14
24
  end
15
25
 
16
- def test_get_multi_abstract
26
+ def _test_get_multi_abstract
17
27
  pmid = "32141403"
18
28
 
19
29
  assert PubMed.get_article(pmid).abstract.include?("This study shows PCOS patients are at increased risk of incident schizophrenia, and the metformin treatment has a protective effect against incident schizophrenia.")
20
30
  end
21
31
 
22
- def test_full_text
32
+ def _test_full_text
23
33
  pmid = '16438716'
24
34
  assert(PubMed.get_article(pmid).full_text =~ /Discovering/)
25
35
  end
26
36
 
27
- def test_pmc_full_xml
37
+ def _test_pmc_full_xml
28
38
  pmid = '4304705'
29
39
  assert PubMed.get_article(pmid).pmc_full_xml.include?("HBV antigen")
30
40
  end
31
41
 
32
42
 
33
- def test_query
43
+ def _test_query
34
44
  assert(PubMed.query('chagoyen[All Fields] AND ("loattrfull text"[sb] AND hasabstract[text])').include? '16438716')
35
45
  end
36
46
 
37
- def test_year
47
+ def _test_year
38
48
  pmid = '16438716'
39
49
  assert_equal "2006", PubMed.get_article(pmid).year
40
50
  end
41
51
 
42
- def test_bibentry
52
+ def _test_bibentry
43
53
  assert("vazquez2008sent", PubMed::Article.make_bibentry('vazquez', 2008, "SENT: Semantic features in text"))
44
54
  assert("vazquez2008aes", PubMed::Article.make_bibentry('vazquez', 2008, "An Example System"))
45
55
  end
46
56
 
47
- def test_missing
57
+ def _test_missing
48
58
  pmids = '18627426,014966295'.split(",")
49
59
  Log.severity = 0
50
60
  assert PubMed.get_article(pmids).include? "014966295"