rbbt-sources 3.3.0 → 3.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/etc/allowed_biomart_archives +2 -4
- data/etc/biomart/missing_in_archive +2 -0
- data/etc/build_organism +4 -4
- data/etc/organisms +1 -0
- data/lib/rbbt/sources/biomart.rb +48 -13
- data/lib/rbbt/sources/ensembl_ftp.rb +31 -15
- data/lib/rbbt/sources/entrez.rb +13 -0
- data/lib/rbbt/sources/go.rb +2 -2
- data/lib/rbbt/sources/mesh.rb +26 -0
- data/lib/rbbt/sources/organism.rb +45 -24
- data/lib/rbbt/sources/pubmed.rb +13 -2
- data/share/install/Organism/{Hsa/Rakefile → Hsa.rake} +23 -15
- data/share/install/Organism/{Mmu/Rakefile → Mmu.rake} +3 -20
- data/share/install/Organism/{Rno/Rakefile → Rno.rake} +3 -8
- data/share/install/Organism/Sce.rake +38 -0
- data/share/install/Organism/organism_helpers.rb +126 -53
- data/share/install/lib/rake_helper.rb +2 -2
- data/test/rbbt/sources/test_biomart.rb +44 -6
- data/test/rbbt/sources/test_ensembl_ftp.rb +11 -0
- data/test/rbbt/sources/test_entrez.rb +5 -0
- data/test/rbbt/sources/test_mesh.rb +10 -0
- data/test/rbbt/sources/test_organism.rb +15 -15
- data/test/rbbt/sources/test_pubmed.rb +18 -8
- metadata +12 -7
- data/share/install/Organism/Sce/Rakefile +0 -52
@@ -1,8 +1,11 @@
|
|
1
|
+
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', 'lib'))
|
2
|
+
|
1
3
|
require 'net/ftp'
|
4
|
+
require 'rbbt/sources/biomart'
|
5
|
+
require 'rbbt/sources/entrez'
|
6
|
+
require File.join(File.dirname(__FILE__), '../lib/helpers')
|
2
7
|
require 'rbbt/sources/ensembl_ftp'
|
3
8
|
|
4
|
-
#Thread.current['namespace'] = $namespace
|
5
|
-
|
6
9
|
$biomart_ensembl_gene = ['Ensembl Gene ID', 'ensembl_gene_id']
|
7
10
|
$biomart_ensembl_protein = ['Ensembl Protein ID', 'ensembl_peptide_id']
|
8
11
|
$biomart_ensembl_exon = ['Ensembl Exon ID', 'ensembl_exon_id']
|
@@ -77,6 +80,17 @@ $biomart_pfam= [
|
|
77
80
|
["Pfam Domain", 'pfam'],
|
78
81
|
]
|
79
82
|
|
83
|
+
$biomart_go= [
|
84
|
+
["GO ID", 'go_id'],
|
85
|
+
["GO Namespace", 'namespace_1003'],
|
86
|
+
]
|
87
|
+
|
88
|
+
$biomart_go_2009= [
|
89
|
+
["GO BP ID", 'go_biological_process_id'],
|
90
|
+
["GO MF ID", 'go_molecular_function_id'],
|
91
|
+
["GO CC ID", 'go_cellular_component_id'],
|
92
|
+
]
|
93
|
+
|
80
94
|
$biomart_gene_biotype= [
|
81
95
|
["Biotype", 'gene_biotype'],
|
82
96
|
]
|
@@ -91,7 +105,13 @@ $biomart_exons = [
|
|
91
105
|
#{{{ Rules
|
92
106
|
|
93
107
|
file 'entrez_taxids' do |t|
|
94
|
-
|
108
|
+
if $tax && $tax.any?
|
109
|
+
Misc.sensiblewrite(t.name, $taxs * "\n")
|
110
|
+
else
|
111
|
+
tsv = Rbbt.share.databases.entrez.tax_ids.tsv(:key_field => "Scientific Name", merge: true, type: :flat)
|
112
|
+
taxs = tsv[$scientific_name] || []
|
113
|
+
Misc.sensiblewrite(t.name, taxs * "\n")
|
114
|
+
end
|
95
115
|
end
|
96
116
|
|
97
117
|
file 'scientific_name' do |t|
|
@@ -104,7 +124,8 @@ file 'ortholog_key' do |t|
|
|
104
124
|
Misc.sensiblewrite(t.name, $ortholog_key)
|
105
125
|
end
|
106
126
|
|
107
|
-
file 'identifiers' do |t|
|
127
|
+
file 'identifiers' => 'entrez_taxids' do |t|
|
128
|
+
tax_codes = Open.read(t.prerequisites.first).strip.split("\n")
|
108
129
|
identifiers = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_identifiers, [], nil, :namespace => Thread.current['namespace'])
|
109
130
|
identifiers.unnamed = true
|
110
131
|
|
@@ -116,18 +137,20 @@ file 'identifiers' do |t|
|
|
116
137
|
end
|
117
138
|
|
118
139
|
name_pos = identifiers.identify_field "Associated Gene Name"
|
119
|
-
|
120
|
-
|
121
|
-
|
140
|
+
if tax_codes and tax_codes.any?
|
141
|
+
entrez2name = Entrez.entrez2name(tax_codes)
|
142
|
+
identifiers.process "Entrez Gene ID" do |entrez, ensembl, values|
|
143
|
+
names = values[name_pos] || []
|
122
144
|
|
123
|
-
|
124
|
-
|
125
|
-
|
145
|
+
matches = entrez.select do |e|
|
146
|
+
entrez2name.include?(e) && (names & entrez2name[e]).any?
|
147
|
+
end
|
126
148
|
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
149
|
+
if matches.any?
|
150
|
+
matches
|
151
|
+
else
|
152
|
+
entrez
|
153
|
+
end
|
131
154
|
end
|
132
155
|
end
|
133
156
|
|
@@ -147,15 +170,18 @@ file 'identifiers' do |t|
|
|
147
170
|
identifiers = identifiers.reorder(:key, ordered_fields)
|
148
171
|
end
|
149
172
|
|
150
|
-
|
151
|
-
|
152
|
-
|
173
|
+
if tax_codes and tax_codes.any?
|
174
|
+
entrez_synonyms = Rbbt.share.databases.entrez.gene_info.find.tsv :grep => tax_codes.collect{|tax| "^#{tax}"}, :fixed_grep => false, :key_field => 1, :fields => [4]
|
175
|
+
entrez_synonyms.key_field = "Entrez Gene ID"
|
176
|
+
entrez_synonyms.fields = ["Entrez Gene Name Synonyms"]
|
153
177
|
|
154
|
-
|
178
|
+
identifiers.attach entrez_synonyms
|
179
|
+
end
|
155
180
|
|
156
181
|
identifiers.with_unnamed do
|
157
182
|
identifiers.each do |key, values|
|
158
183
|
values.each do |list|
|
184
|
+
list ||= []
|
159
185
|
list.reject!{|v| v.nil? or v.empty?}
|
160
186
|
list.uniq!
|
161
187
|
end
|
@@ -166,10 +192,11 @@ file 'identifiers' do |t|
|
|
166
192
|
Misc.sensiblewrite(t.name, identifiers.to_s)
|
167
193
|
end
|
168
194
|
|
169
|
-
file 'lexicon' => 'identifiers' do |t|
|
195
|
+
file 'lexicon' => ['identifiers', 'entrez_taxids'] do |t|
|
170
196
|
tsv = TSV.open(t.prerequisites.first).slice(["Associated Gene Name", "Entrez Gene Name Synonyms"])
|
197
|
+
tax_codes = Open.read(t.prerequisites.last).strip.split("\n")
|
171
198
|
|
172
|
-
entrez_description = Rbbt.share.databases.entrez.gene_info.tsv :grep =>
|
199
|
+
entrez_description = Rbbt.share.databases.entrez.gene_info.tsv :grep => tax_codes.collect{|tax| "^#{tax}"}, :fixed_grep => false, :key_field => 1, :fields => 8
|
173
200
|
entrez_description.key_field = "Entrez Gene ID"
|
174
201
|
entrez_description.fields = ["Entrez Gene Description"]
|
175
202
|
|
@@ -308,8 +335,9 @@ end
|
|
308
335
|
|
309
336
|
# {{{ Other info
|
310
337
|
|
311
|
-
file 'gene_pmids' do |t|
|
312
|
-
|
338
|
+
file 'gene_pmids' => 'entrez_taxids' do |t|
|
339
|
+
tax_codes = Open.read(t.prerequisites.first).strip.split("\n")
|
340
|
+
tsv = Entrez.entrez2pubmed(tax_codes)
|
313
341
|
text = "#: :namespace=#{Thread.current['namespace']}\n"
|
314
342
|
text += "#Entrez Gene ID\tPMID"
|
315
343
|
tsv.each do |gene, pmids|
|
@@ -417,7 +445,7 @@ file 'gene_go_bp' => 'gene_go' do |t|
|
|
417
445
|
|
418
446
|
gene_go.monitor = true
|
419
447
|
gene_go.process "GO ID" do |key, go_id, values|
|
420
|
-
clean =
|
448
|
+
clean = NamedArray.zip_fields(values).select do |id, type|
|
421
449
|
type == "biological_process"
|
422
450
|
end
|
423
451
|
clean.collect{|id, type| id}
|
@@ -487,9 +515,9 @@ file 'gene_pfam' do |t|
|
|
487
515
|
end
|
488
516
|
|
489
517
|
file 'chromosomes' do |t|
|
490
|
-
|
518
|
+
tsv = BioMart.tsv($biomart_db, ['Chromosome Name', "chromosome_name"] , [] , [], nil, :type => :double, :namespace => Thread.current['namespace'])
|
491
519
|
|
492
|
-
Misc.sensiblewrite(t.name,
|
520
|
+
Misc.sensiblewrite(t.name, tsv.keys * "\n")
|
493
521
|
end
|
494
522
|
|
495
523
|
file 'blacklist_chromosomes' => 'chromosomes' do |t|
|
@@ -511,6 +539,15 @@ end
|
|
511
539
|
|
512
540
|
rule /^chromosome_.*/ do |t|
|
513
541
|
chr = t.name.match(/chromosome_(.*)/)[1]
|
542
|
+
path = File.expand_path(t.name)
|
543
|
+
dirname = File.dirname(path)
|
544
|
+
|
545
|
+
organism = File.basename(dirname)
|
546
|
+
if organism =~ /^[a-z]{3}20[0-9]{2}/
|
547
|
+
archive = organism
|
548
|
+
organism = File.basename(File.dirname(dirname))
|
549
|
+
organism = File.join(organism, archive)
|
550
|
+
end
|
514
551
|
|
515
552
|
# HACK: Skip LRG chromosomes
|
516
553
|
raise "LRG and GL chromosomes not supported: #{ chr }" if chr =~ /^(?:LRG_|GL0)/
|
@@ -519,28 +556,51 @@ rule /^chromosome_.*/ do |t|
|
|
519
556
|
|
520
557
|
release = Ensembl.releases[archive]
|
521
558
|
|
522
|
-
|
559
|
+
fasta_url = Ensembl::FTP.ftp_name_for(organism, 'fasta').last
|
560
|
+
server, _, path = fasta_url.partition("/")
|
561
|
+
path = "/" + path
|
562
|
+
|
563
|
+
ftp = Net::FTP.new(server)
|
523
564
|
ftp.passive = true
|
524
565
|
ftp.login
|
525
|
-
|
526
|
-
ftp.chdir("pub/current_fasta/")
|
527
|
-
else
|
528
|
-
ftp.chdir("pub/#{ release }/fasta/")
|
529
|
-
end
|
530
|
-
ftp.chdir($scientific_name.downcase.sub(" ",'_'))
|
566
|
+
ftp.chdir(path)
|
531
567
|
ftp.chdir('dna')
|
532
|
-
file = ftp.nlst.select{|file| file =~ /chromosome\.#{ chr }\.fa/}.first
|
533
|
-
|
534
|
-
raise "Fasta file for chromosome not found: '#{ chr }' - #{ archive }, #{ release }" if file.nil?
|
535
568
|
|
536
|
-
|
569
|
+
file = ftp.nlst.select{|file| file =~ /dna_sm\.chromosome\.#{ chr }\.fa/}.first
|
570
|
+
if file
|
571
|
+
Log.debug("Downloading chromosome sequence: #{ file } - #{release} #{t.name}")
|
537
572
|
|
538
|
-
|
539
|
-
|
540
|
-
|
541
|
-
|
542
|
-
|
573
|
+
Misc.lock t.name + '.rake' do
|
574
|
+
TmpFile.with_file do |tmpfile|
|
575
|
+
ftp.getbinaryfile(file, tmpfile)
|
576
|
+
Misc.sensiblewrite(t.name, Open.read(tmpfile, :gzip => true).sub(/^>.*\n/,'').gsub(/\s/,''))
|
577
|
+
ftp.close
|
578
|
+
end
|
543
579
|
end
|
580
|
+
else
|
581
|
+
file = ftp.nlst.select{|file| file =~ /dna_sm\.toplevel\.fa\.gz/}.first if file.nil?
|
582
|
+
Misc.lock t.name + '.rake' do
|
583
|
+
TmpFile.with_file do |tmpfile|
|
584
|
+
ftp.getbinaryfile(file, tmpfile)
|
585
|
+
txt = Open.read(tmpfile, :gzip => true)
|
586
|
+
|
587
|
+
chr_txt = []
|
588
|
+
|
589
|
+
in_chr = false
|
590
|
+
txt.split("\n").each do |line|
|
591
|
+
if line.start_with?(">#{chr}")
|
592
|
+
in_chr = true
|
593
|
+
elsif line.start_with?(">")
|
594
|
+
in_chr = false
|
595
|
+
else
|
596
|
+
chr_txt << line if in_chr
|
597
|
+
end
|
598
|
+
end
|
599
|
+
Misc.sensiblewrite(t.name, chr_txt * "" )
|
600
|
+
ftp.close
|
601
|
+
end
|
602
|
+
end
|
603
|
+
raise "Fasta file for chromosome not found: '#{ chr }' - #{ archive }, #{ release }" if file.nil?
|
544
604
|
end
|
545
605
|
end
|
546
606
|
|
@@ -584,6 +644,16 @@ end
|
|
584
644
|
require 'bio'
|
585
645
|
|
586
646
|
file 'transcript_sequence' => ["exons", "transcript_exons", "blacklist_chromosomes"] do |t|
|
647
|
+
path = File.expand_path(t.name)
|
648
|
+
dirname = File.dirname(path)
|
649
|
+
|
650
|
+
organism = File.basename(dirname)
|
651
|
+
if organism =~ /^[a-z]{3}20[0-9]{2}/
|
652
|
+
archive = organism
|
653
|
+
organism = File.basename(File.dirname(dirname))
|
654
|
+
organism = File.join(organism, archive)
|
655
|
+
end
|
656
|
+
|
587
657
|
exon_info = TSV.open('exons', :type => :list, :fields => ["Exon Strand", "Exon Chr Start", "Exon Chr End", "Chromosome Name"], :unnamed => true)
|
588
658
|
|
589
659
|
chr_transcript_ranges ||= {}
|
@@ -616,10 +686,10 @@ file 'transcript_sequence' => ["exons", "transcript_exons", "blacklist_chromosom
|
|
616
686
|
chr_transcript_ranges.each do |chr, transcript_ranges|
|
617
687
|
begin
|
618
688
|
raise "LRG, GL, HG, NT, KI, and HSCHR chromosomes not supported: #{chr}" if blacklist_chromosomes.include? chr
|
619
|
-
|
620
|
-
|
621
|
-
p.
|
622
|
-
chr_str = p.
|
689
|
+
pkgdir = Thread.current["resource"]
|
690
|
+
p = pkgdir[organism]["chromosome_#{chr}"]
|
691
|
+
p.produce or raise "Could not produce #{p}; pkgdir: #{p.pkgdir}"
|
692
|
+
chr_str = p.read
|
623
693
|
rescue Exception
|
624
694
|
Log.warn("Chr #{ chr } failed (#{transcript_ranges.length} transcripts not covered): #{$!.message}")
|
625
695
|
raise $! unless $!.message =~ /not supported/
|
@@ -656,7 +726,7 @@ file 'transcript_5utr' => ["exons", "transcript_exons", "transcripts"] do |t|
|
|
656
726
|
organism = File.join(organism, archive)
|
657
727
|
end
|
658
728
|
|
659
|
-
translation = Ensembl::FTP.ensembl_tsv(organism, 'translation', 'transcript_id', %w(seq_start start_exon_id seq_end end_exon_id), :type => :list, :
|
729
|
+
translation = Ensembl::FTP.ensembl_tsv(organism, 'translation', 'transcript_id', %w(seq_start start_exon_id seq_end end_exon_id), :type => :list, :unnamed => true)
|
660
730
|
|
661
731
|
if Ensembl::FTP.has_table?(organism, 'exon_stable_id')
|
662
732
|
exon2ensembl = Ensembl::FTP.ensembl_tsv(organism, 'exon_stable_id', 'exon_id', ['stable_id'], :type => :single, :unnamed => true)
|
@@ -670,9 +740,9 @@ file 'transcript_5utr' => ["exons", "transcript_exons", "transcripts"] do |t|
|
|
670
740
|
transcript2ensembl = Ensembl::FTP.ensembl_tsv(organism, 'transcript', 'transcript_id', ['stable_id'], :type => :single, :unnamed => true)
|
671
741
|
end
|
672
742
|
|
673
|
-
transcript_protein = TSV.open("./transcripts", :key_field => "Ensembl Transcript ID", :fields => ["Ensembl Protein ID"], :type => :single, :
|
674
|
-
transcript_exons = TSV.open("./transcript_exons", :
|
675
|
-
exon_ranges = TSV.open("./exons",:fields => ["Exon Chr Start", "Exon Chr End"], :cast => :to_i, :
|
743
|
+
transcript_protein = TSV.open("./transcripts", :key_field => "Ensembl Transcript ID", :fields => ["Ensembl Protein ID"], :type => :single, :unnamed => true)
|
744
|
+
transcript_exons = TSV.open("./transcript_exons", :unnamed => true)
|
745
|
+
exon_ranges = TSV.open("./exons",:fields => ["Exon Chr Start", "Exon Chr End"], :cast => :to_i, :unnamed => true)
|
676
746
|
|
677
747
|
transcript_utr5 = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["5' UTR Length"], :cast => :to_i, :type => :single)
|
678
748
|
transcript_utr3 = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["3' UTR Length"], :cast => :to_i, :type => :single)
|
@@ -719,12 +789,13 @@ end
|
|
719
789
|
file 'protein_sequence' => ["transcripts", "transcript_5utr", "transcript_3utr", "transcript_phase", "transcript_sequence"] do |t|
|
720
790
|
transcript_5utr = TSV.open(File.expand_path('./transcript_5utr'), :unnamed => true)
|
721
791
|
transcript_3utr = TSV.open(File.expand_path('./transcript_3utr'), :unnamed => true)
|
722
|
-
transcript_phase
|
792
|
+
transcript_phase = TSV.open(File.expand_path('./transcript_phase'), :unnamed => true)
|
723
793
|
transcript_sequence = TSV.open(File.expand_path('./transcript_sequence'), :unnamed => true)
|
724
794
|
transcript_protein = TSV.open(File.expand_path('./transcripts'), :fields => ["Ensembl Protein ID"], :type => :single, :unnamed => true)
|
725
795
|
|
726
796
|
|
727
797
|
protein_sequence = TSV.setup({}, :key_field => "Ensembl Protein ID", :fields => ["Sequence"], :type => :single)
|
798
|
+
transcript_sequence.monitor = true
|
728
799
|
transcript_sequence.through do |transcript, sequence|
|
729
800
|
protein = transcript_protein[transcript]
|
730
801
|
next if protein.nil? or protein.empty?
|
@@ -777,6 +848,7 @@ file 'uniprot2ensembl' => ["protein_sequence", "protein_identifiers"] do |t|
|
|
777
848
|
uni_seq = UniProt.get_uniprot_sequence(uni)
|
778
849
|
ensps = uni2ensps[uni]
|
779
850
|
next if ensps.nil? or ensps.empty?
|
851
|
+
|
780
852
|
best_ensp = ensps.sort_by do |ensp|
|
781
853
|
ensp_seq = ensp2seq[ensp]
|
782
854
|
if ensp_seq
|
@@ -806,7 +878,7 @@ file 'gene_set' do |t|
|
|
806
878
|
build_code = Organism.GRC_build(organism)
|
807
879
|
scientific_name = $scientific_name
|
808
880
|
url = "ftp://ftp.ensembl.org/pub/release-#{num}/gtf/#{scientific_name.downcase.sub(" ", '_')}/#{scientific_name.sub(" ", '_')}.#{build_code}.#{num}.gtf.gz"
|
809
|
-
|
881
|
+
Open.download(url, "#{t.name}.gz")
|
810
882
|
nil
|
811
883
|
end
|
812
884
|
|
@@ -825,7 +897,8 @@ file 'cdna_fasta' do |t|
|
|
825
897
|
num = release.split("-").last
|
826
898
|
build_code = Organism.GRC_build(organism)
|
827
899
|
scientific_name = Organism.scientific_name(organism)
|
828
|
-
url = "ftp://ftp.ensembl.org/pub/release-#{num}/fasta/#{scientific_name.downcase.sub(" ", '_')}/cdna/#{scientific_name.sub(" ", '_')}.#{build_code}.cdna.all.fa.gz"
|
829
|
-
|
900
|
+
url = "ftp://ftp.ensembl.org/pub/release-#{num}/fasta/#{scientific_name.downcase.sub(" ", '_')}/cdna/#{scientific_name.sub(" ", '_')}.#{build_code}.#{num}.cdna.all.fa.gz"
|
901
|
+
Open.download(url, "#{t.name}.gz")
|
830
902
|
nil
|
831
903
|
end
|
904
|
+
|
@@ -9,7 +9,7 @@ SOURCE_DIR = 'source'
|
|
9
9
|
def define_source_tasks(sources)
|
10
10
|
sources.each do |name, url|
|
11
11
|
file File.join(SOURCE_DIR, name) do |t|
|
12
|
-
FileUtils.mkdir SOURCE_DIR unless File.
|
12
|
+
FileUtils.mkdir SOURCE_DIR unless File.exist? SOURCE_DIR
|
13
13
|
Log.log "Retrieving file '#{name}' into '#{t.name}': '#{url}'", Log::LOW
|
14
14
|
Open.write(t.name, Open.open(url, :cache => false, :wget_options => {"--no-check-certificate" => true, "--quiet" => false, :pipe => true}))
|
15
15
|
end
|
@@ -87,5 +87,5 @@ end
|
|
87
87
|
task :all => :default
|
88
88
|
|
89
89
|
task :clean do
|
90
|
-
($__tsv_tasks + $__files).each do |file| FileUtils.rm file.to_s if File.
|
90
|
+
($__tsv_tasks + $__files).each do |file| FileUtils.rm file.to_s if File.exist?(file.to_s) end
|
91
91
|
end
|
@@ -3,34 +3,44 @@ require File.expand_path(File.dirname(__FILE__) + '/../../test_helper')
|
|
3
3
|
require 'rbbt/sources/pubmed'
|
4
4
|
require 'test/unit'
|
5
5
|
require 'rbbt/sources/biomart'
|
6
|
+
require 'rbbt/sources/organism'
|
6
7
|
require 'rbbt/util/tmpfile'
|
7
8
|
require 'test/unit'
|
8
9
|
|
9
10
|
class TestBioMart < Test::Unit::TestCase
|
10
11
|
|
11
12
|
def setup
|
12
|
-
BioMart.set_archive
|
13
|
+
BioMart.set_archive "feb2014"
|
13
14
|
end
|
14
15
|
|
15
16
|
def teardown
|
16
17
|
BioMart.unset_archive
|
17
18
|
end
|
18
19
|
|
19
|
-
def
|
20
|
+
def test_get_Sce
|
20
21
|
assert_raise BioMart::QueryError do
|
21
22
|
BioMart.get('scerevisiae_gene_ensembl','entrezgene', ['protein_id'],['with_unknownattr'])
|
22
23
|
end
|
23
24
|
|
24
|
-
|
25
|
+
BioMart.set_archive "feb2023-fungi"
|
26
|
+
data = BioMart.get('scerevisiae_eg_gene','entrezgene_id', ['protein_id'],[], nil, :nocache => true, :merge => true, :wget_options => {:quiet => false})
|
25
27
|
tsv = TSV.open data, :double, :merge => true
|
26
|
-
assert(tsv['852236'][0].include? 'CAA84864')
|
28
|
+
assert(tsv['852236'][0].include? 'CAA84864.1')
|
27
29
|
|
28
|
-
data = BioMart.get('
|
30
|
+
data = BioMart.get('scerevisiae_eg_gene','entrezgene_id', ['external_gene_id'],[], data, :nocache => false, :wget_options => { :quiet => false} )
|
29
31
|
tsv = TSV.open data, :double, :merge => true
|
30
32
|
assert(tsv['852236'][1].include? 'YBL044W')
|
31
33
|
end
|
32
34
|
|
33
|
-
def
|
35
|
+
def test_get_Hsa
|
36
|
+
Log.severity = 0
|
37
|
+
data = BioMart.get('hsapiens_gene_ensembl','entrezgene', ['protein_id'],[], nil, :nocache => true, :merge => true, :wget_options => {:quiet => false})
|
38
|
+
tsv = TSV.open data, :double, :merge => true
|
39
|
+
assert(tsv['852236'][0].include? 'CAA84864.1')
|
40
|
+
end
|
41
|
+
|
42
|
+
|
43
|
+
def test_query
|
34
44
|
data = BioMart.query('scerevisiae_gene_ensembl','entrezgene', ['protein_id','refseq_peptide','external_gene_id','ensembl_gene_id'], [], nil, :nocache => false, :wget_options => { :quiet => false})
|
35
45
|
assert(data['852236']['external_gene_id'].include? 'YBL044W')
|
36
46
|
|
@@ -41,6 +51,34 @@ class TestBioMart < Test::Unit::TestCase
|
|
41
51
|
end
|
42
52
|
end
|
43
53
|
|
54
|
+
def __test_transcrip_exons
|
55
|
+
Log.with_severity 1 do
|
56
|
+
TmpFile.with_file do |f|
|
57
|
+
fields = ['ensembl_transcript_id','ensembl_exon_id','rank']
|
58
|
+
main = fields[0]
|
59
|
+
attrs = fields.values_at(1, 2)
|
60
|
+
attrs_first = [attrs.first]
|
61
|
+
attrs_last = [attrs.last]
|
62
|
+
database = 'hsapiens_gene_ensembl'
|
63
|
+
|
64
|
+
filename = BioMart.get(database, main, attrs, {"ensembl_transcript_id" => ['ENST00000357654']}, nil, :nocache => false, :wget_options => {:quiet => false}, :filename => f)
|
65
|
+
ppp Open.read(filename)
|
66
|
+
|
67
|
+
filename = BioMart.get(database, main, attrs_first, {"ensembl_transcript_id" => ['ENST00000357654']}, nil, :nocache => false, :wget_options => {:quiet => false}, :filename => f)
|
68
|
+
ppp Open.read(filename)
|
69
|
+
|
70
|
+
filename = BioMart.get(database, main, attrs_last, {"ensembl_transcript_id" => ['ENST00000357654']}, nil, :nocache => false, :wget_options => {:quiet => false}, :filename => f)
|
71
|
+
ppp Open.read(filename)
|
72
|
+
|
73
|
+
filename = BioMart.query(database, main, attrs, {"ensembl_transcript_id" => ['ENST00000357654']}, nil, :nocache => true, :wget_options => {:quiet => false}, :filename => f)
|
74
|
+
ppp Open.read(filename)
|
75
|
+
|
76
|
+
data = TSV.open Open.open(filename)
|
77
|
+
assert(data['852236']['external_gene_id'].include? 'YBL044W')
|
78
|
+
end
|
79
|
+
end
|
80
|
+
end
|
81
|
+
|
44
82
|
def test_tsv
|
45
83
|
data = BioMart.tsv('scerevisiae_gene_ensembl',['Entrez Gene', 'entrezgene'], [['Protein ID', 'protein_id'],['RefSeq Peptide','refseq_peptide']], [], nil, :nocache => false, :wget_options => { :quiet => false})
|
46
84
|
assert(data['852236']['Protein ID'].include? 'CAA84864')
|
@@ -0,0 +1,11 @@
|
|
1
|
+
require File.expand_path(__FILE__).sub(%r(/test/.*), '/test/test_helper.rb')
|
2
|
+
require File.expand_path(__FILE__).sub(%r(.*/test/), '').sub(/test_(.*)\.rb/,'\1')
|
3
|
+
|
4
|
+
class TestEnsemblFTP < Test::Unit::TestCase
|
5
|
+
def test_ftp_for
|
6
|
+
assert_nothing_raised do
|
7
|
+
Ensembl::FTP.ftp_name_for("Hsa/feb2023", 'fasta')
|
8
|
+
end
|
9
|
+
end
|
10
|
+
end
|
11
|
+
|
@@ -0,0 +1,10 @@
|
|
1
|
+
require File.expand_path(__FILE__).sub(%r(/test/.*), '/test/test_helper.rb')
|
2
|
+
require File.expand_path(__FILE__).sub(%r(.*/test/), '').sub(/test_(.*)\.rb/,'\1')
|
3
|
+
|
4
|
+
class TestMESH < Test::Unit::TestCase
|
5
|
+
def test_vocab
|
6
|
+
tsv = MeSH.vocabulary.tsv
|
7
|
+
assert_equal "3T3 Cells", tsv["D016475"]
|
8
|
+
end
|
9
|
+
end
|
10
|
+
|
@@ -5,37 +5,37 @@ require 'rbbt/sources/ensembl_ftp'
|
|
5
5
|
|
6
6
|
class TestOrganism < Test::Unit::TestCase
|
7
7
|
|
8
|
-
def
|
8
|
+
def _test_known_ids
|
9
9
|
assert Organism.known_ids("Hsa").include?("Associated Gene Name")
|
10
10
|
end
|
11
11
|
|
12
|
-
def
|
12
|
+
def _test_location
|
13
13
|
assert_equal "share/organisms/Sce/identifiers", Organism.identifiers('Sce')
|
14
14
|
end
|
15
15
|
|
16
|
-
def
|
16
|
+
def _test_identifiers
|
17
17
|
assert Organism.identifiers('Hsa/feb2014').tsv(:key_field => "Entrez Gene ID", :persist => true)['1020']["Associated Gene Name"].include?('CDK5')
|
18
18
|
assert Organism.identifiers('Sce').tsv(:persist => true)['S000006120']["Ensembl Gene ID"].include?('YPL199C')
|
19
19
|
assert Organism.identifiers("Sce").tsv(:persist => true)['S000006120']["Ensembl Gene ID"].include?('YPL199C')
|
20
20
|
end
|
21
21
|
|
22
|
-
def
|
22
|
+
def _test_lexicon
|
23
23
|
assert TSV.open(Organism.lexicon('Sce'))['S000006120'].flatten.include?('YPL199C')
|
24
24
|
end
|
25
25
|
|
26
|
-
def
|
26
|
+
def _test_guess_id
|
27
27
|
ensembl = %w(YOL044W YDR289C YAL034C YGR246C ARS519 tH(GUG)E2 YDR218C YLR002C YGL224C)
|
28
28
|
gene_name = %w(SNR64 MIP1 MRPS18 TFB2 JEN1 IVY1 TRS33 GAS3)
|
29
29
|
assert_equal "Associated Gene Name", Organism.guess_id("Sce", gene_name).first
|
30
30
|
assert_equal "Ensembl Gene ID", Organism.guess_id("Sce", ensembl).first
|
31
31
|
end
|
32
32
|
|
33
|
-
def
|
33
|
+
def _test_organisms
|
34
34
|
assert Organism.organisms.include? "Hsa"
|
35
35
|
assert_equal "Hsa", Organism.organism("Homo sapiens")
|
36
36
|
end
|
37
37
|
|
38
|
-
def
|
38
|
+
def _test_attach_translations
|
39
39
|
tsv = TSV.setup({"1020" => []}, :type => :list)
|
40
40
|
tsv.key_field = "Entrez Gene ID"
|
41
41
|
tsv.fields = []
|
@@ -47,21 +47,21 @@ class TestOrganism < Test::Unit::TestCase
|
|
47
47
|
assert_equal "CDK5", tsv["1020"]["Associated Gene Name"]
|
48
48
|
end
|
49
49
|
|
50
|
-
def
|
50
|
+
def _test_entrez_taxids
|
51
51
|
assert_equal "Hsa", Organism.entrez_taxid_organism('9606')
|
52
52
|
end
|
53
53
|
|
54
|
-
def
|
54
|
+
def _test_lift_over
|
55
55
|
mutation_19 = "19:21131664:T"
|
56
56
|
mutation_18 = "19:20923504:T"
|
57
|
-
source_build =
|
57
|
+
source_build = "Hsa/feb2014"
|
58
58
|
target_build = "Hsa/may2009"
|
59
59
|
|
60
60
|
assert_equal mutation_18, Organism.liftOver([mutation_19], source_build, target_build).first
|
61
61
|
assert_equal mutation_19, Organism.liftOver([mutation_18], target_build, source_build).first
|
62
62
|
end
|
63
63
|
|
64
|
-
def
|
64
|
+
def _test_orhtolog
|
65
65
|
require 'rbbt/entity/gene'
|
66
66
|
assert_equal ["ENSG00000133703"], Gene.setup("Kras", "Associated Gene Name", "Mmu/jun2011").ensembl.ortholog(Organism.default_code("Hsa"))
|
67
67
|
end
|
@@ -70,23 +70,23 @@ class TestOrganism < Test::Unit::TestCase
|
|
70
70
|
assert Organism.chromosome_sizes["2"].to_i > 10_000_000
|
71
71
|
end
|
72
72
|
|
73
|
-
def
|
73
|
+
def _test_build_organism
|
74
74
|
assert_equal 'Hsa/may2017', Organism.organism_for_build('hg38')
|
75
75
|
assert_equal 'Hsa/feb2014', Organism.organism_for_build('b37')
|
76
76
|
assert_equal 'Mmu/may2017', Organism.organism_for_build('mm10')
|
77
77
|
end
|
78
78
|
|
79
|
-
#def
|
79
|
+
#def _test_genes_at_chromosome
|
80
80
|
# pos = [12, 117799500]
|
81
81
|
# assert_equal "ENSG00000089250", Organism::Hsa.genes_at_chromosome_positions(pos.first, pos.last)
|
82
82
|
#end
|
83
83
|
|
84
|
-
#def
|
84
|
+
#def _test_genes_at_chromosome_array
|
85
85
|
# pos = [12, [117799500, 106903900]]
|
86
86
|
# assert_equal ["ENSG00000089250", "ENSG00000013503"], Organism::Hsa.genes_at_chromosome_positions(pos.first, pos.last)
|
87
87
|
#end
|
88
88
|
|
89
|
-
#def
|
89
|
+
#def _test_genes_at_genomic_positions
|
90
90
|
# pos = [[12, 117799500], [12, 106903900], [1, 115259500]]
|
91
91
|
# assert_equal ["ENSG00000089250", "ENSG00000013503", "ENSG00000213281"], Organism::Hsa.genes_at_genomic_positions(pos)
|
92
92
|
#end
|
@@ -5,7 +5,17 @@ require 'test/unit'
|
|
5
5
|
|
6
6
|
class TestPubMed < Test::Unit::TestCase
|
7
7
|
|
8
|
-
def
|
8
|
+
def test_mesh
|
9
|
+
pmid = '10866666'
|
10
|
+
assert_include PubMed.get_article(pmid).mesh, "D016475"
|
11
|
+
end
|
12
|
+
|
13
|
+
def _test_substance
|
14
|
+
pmid = '10866666'
|
15
|
+
assert_include PubMed.get_article(pmid).substance, "C000717247"
|
16
|
+
end
|
17
|
+
|
18
|
+
def _test_get_article
|
9
19
|
pmid = '16438716'
|
10
20
|
assert(PubMed.get_article(pmid).title == "Discovering semantic features in the literature: a foundation for building functional associations.")
|
11
21
|
|
@@ -13,38 +23,38 @@ class TestPubMed < Test::Unit::TestCase
|
|
13
23
|
assert(PubMed.get_article(pmids)[pmid].title == "Discovering semantic features in the literature: a foundation for building functional associations.")
|
14
24
|
end
|
15
25
|
|
16
|
-
def
|
26
|
+
def _test_get_multi_abstract
|
17
27
|
pmid = "32141403"
|
18
28
|
|
19
29
|
assert PubMed.get_article(pmid).abstract.include?("This study shows PCOS patients are at increased risk of incident schizophrenia, and the metformin treatment has a protective effect against incident schizophrenia.")
|
20
30
|
end
|
21
31
|
|
22
|
-
def
|
32
|
+
def _test_full_text
|
23
33
|
pmid = '16438716'
|
24
34
|
assert(PubMed.get_article(pmid).full_text =~ /Discovering/)
|
25
35
|
end
|
26
36
|
|
27
|
-
def
|
37
|
+
def _test_pmc_full_xml
|
28
38
|
pmid = '4304705'
|
29
39
|
assert PubMed.get_article(pmid).pmc_full_xml.include?("HBV antigen")
|
30
40
|
end
|
31
41
|
|
32
42
|
|
33
|
-
def
|
43
|
+
def _test_query
|
34
44
|
assert(PubMed.query('chagoyen[All Fields] AND ("loattrfull text"[sb] AND hasabstract[text])').include? '16438716')
|
35
45
|
end
|
36
46
|
|
37
|
-
def
|
47
|
+
def _test_year
|
38
48
|
pmid = '16438716'
|
39
49
|
assert_equal "2006", PubMed.get_article(pmid).year
|
40
50
|
end
|
41
51
|
|
42
|
-
def
|
52
|
+
def _test_bibentry
|
43
53
|
assert("vazquez2008sent", PubMed::Article.make_bibentry('vazquez', 2008, "SENT: Semantic features in text"))
|
44
54
|
assert("vazquez2008aes", PubMed::Article.make_bibentry('vazquez', 2008, "An Example System"))
|
45
55
|
end
|
46
56
|
|
47
|
-
def
|
57
|
+
def _test_missing
|
48
58
|
pmids = '18627426,014966295'.split(",")
|
49
59
|
Log.severity = 0
|
50
60
|
assert PubMed.get_article(pmids).include? "014966295"
|