rbbt-sources 3.1.42 → 3.1.48
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/etc/biomart/missing_in_archive +4 -0
- data/lib/rbbt/sources/biomart.rb +4 -0
- data/lib/rbbt/sources/ensembl_ftp.rb +19 -7
- data/lib/rbbt/sources/entrez.rb +3 -3
- data/lib/rbbt/sources/go.rb +1 -1
- data/lib/rbbt/sources/pina.rb +1 -1
- data/lib/rbbt/sources/pubmed.rb +24 -12
- data/lib/rbbt/sources/tfacts.rb +14 -14
- data/share/install/Organism/Hsa/Rakefile +1 -1
- data/share/install/Organism/Mmu/Rakefile +1 -1
- data/share/install/Organism/Rno/Rakefile +1 -1
- data/share/install/Organism/organism_helpers.rb +8 -3
- data/test/rbbt/sources/test_entrez.rb +3 -0
- data/test/rbbt/sources/test_pubmed.rb +6 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 92bb6e2852266c73666e0b07221484822441d19603b2ce9d4fe9ceb6faa44d37
|
4
|
+
data.tar.gz: 9eda4d907bb34ae5b5b95dbdb972b6c659111b042c5b68661d41cf3ccd93e3b8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4ac47f700cfc54144bfbc2dc7b3eeaf2e9dce885ccfc1e47c26d978a79dd02d99802a83a888bb1bd80de96ecf9bf023ae548041f3b1867308228c319777dabd4
|
7
|
+
data.tar.gz: 05b07ac77a194f0ad87a36b050eb97d2285490974d0bca039c2163d762c11461f4ccf6c4585df62b24298524bb9f02e4d1d65dae51418e491644b4725fbd864f
|
data/lib/rbbt/sources/biomart.rb
CHANGED
@@ -134,11 +134,14 @@ module BioMart
|
|
134
134
|
# cause an error if the BioMart WS does not allow filtering with that
|
135
135
|
# attribute.
|
136
136
|
def self.query(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
|
137
|
+
IndiferentHash.setup(open_options)
|
137
138
|
open_options = Misc.add_defaults open_options, :nocache => false, :filename => nil, :field_names => nil, :by_chr => false
|
138
139
|
filename, field_names, by_chr = Misc.process_options open_options, :filename, :field_names, :by_chr
|
139
140
|
attrs ||= []
|
140
141
|
open_options = Misc.add_defaults open_options, :keep_empty => false, :merge => true
|
141
142
|
|
143
|
+
IndiferentHash.setup(open_options)
|
144
|
+
|
142
145
|
Log.low "BioMart query: '#{main}' [#{(attrs || []) * ', '}] [#{(filters || []) * ', '}] #{open_options.inspect}"
|
143
146
|
|
144
147
|
max_items = 2
|
@@ -167,6 +170,7 @@ module BioMart
|
|
167
170
|
end
|
168
171
|
|
169
172
|
open_options[:filename] = "BioMart[#{main}+#{attrs.length}]"
|
173
|
+
|
170
174
|
if filename.nil?
|
171
175
|
results = TSV.open data, open_options
|
172
176
|
results.key_field = main
|
data/lib/rbbt/sources/ensembl_ftp.rb
CHANGED
@@ -52,24 +52,35 @@ module Ensembl
|
|
52
52
|
File.join("ftp://" + SERVER, ftp_directory_for(organism) )
|
53
53
|
end
|
54
54
|
|
55
|
-
def self.url_for(organism, table)
|
56
|
-
|
55
|
+
def self.url_for(organism, table, extension)
|
56
|
+
File.join(base_url(organism), table) + ".#{extension}.gz"
|
57
|
+
end
|
58
|
+
|
59
|
+
def self._get_gz(url)
|
60
|
+
begin
|
61
|
+
CMD.cmd("wget '#{url}' -O - | gunzip").read
|
62
|
+
rescue
|
63
|
+
CMD.cmd("wget '#{url}.bz2' -O - | bunzip2 | gunzip").read
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
67
|
+
def self._get_file(organism, table, extension)
|
68
|
+
url = url_for(organism, table, extension)
|
69
|
+
self._get_gz(url)
|
57
70
|
end
|
58
71
|
|
59
72
|
def self.has_table?(organism, table)
|
60
|
-
sql_file =
|
73
|
+
sql_file = _get_file(organism, File.basename(base_url(organism)), 'sql')
|
61
74
|
! sql_file.match(/^CREATE TABLE .#{table}. \((.*?)^\)/sm).nil?
|
62
75
|
end
|
63
76
|
|
64
77
|
def self.fields_for(organism, table)
|
65
|
-
sql_file =
|
66
|
-
|
78
|
+
sql_file = _get_file(organism, File.basename(base_url(organism)), 'sql')
|
67
79
|
chunk = sql_file.match(/^CREATE TABLE .#{table}. \((.*?)^\)/sm)[1]
|
68
80
|
chunk.scan(/^\s+`(.*?)`/).flatten
|
69
81
|
end
|
70
82
|
|
71
83
|
def self.ensembl_tsv(organism, table, key_field = nil, fields = nil, options = {})
|
72
|
-
url = url_for(organism, table)
|
73
84
|
if key_field and fields
|
74
85
|
all_fields = fields_for(organism, table)
|
75
86
|
key_pos = all_fields.index key_field
|
@@ -78,7 +89,8 @@ module Ensembl
|
|
78
89
|
options[:key_field] = key_pos
|
79
90
|
options[:fields] = field_pos
|
80
91
|
end
|
81
|
-
|
92
|
+
|
93
|
+
tsv = TSV.open(StringIO.new(_get_file(organism, table, "txt")), options)
|
82
94
|
tsv.key_field = key_field
|
83
95
|
tsv.fields = fields
|
84
96
|
tsv
|
data/lib/rbbt/sources/entrez.rb
CHANGED
@@ -14,7 +14,7 @@ module Entrez
|
|
14
14
|
options = Misc.add_defaults options, :key_field => 1, :fields => [5], :persist => true, :merge => true
|
15
15
|
|
16
16
|
taxs = [taxs] unless Array === taxs
|
17
|
-
options.merge! :grep => taxs.collect{|t| "^" + t.to_s}
|
17
|
+
options.merge! :grep => taxs.collect{|t| "^" + t.to_s}, :fixed_grep => false
|
18
18
|
|
19
19
|
tsv = Rbbt.share.databases.entrez.gene_info.tsv :flat, options
|
20
20
|
tsv.key_field = "Entrez Gene ID"
|
@@ -26,7 +26,7 @@ module Entrez
|
|
26
26
|
options = Misc.add_defaults options, :key_field => 1, :fields => [2], :persist => true, :merge => true
|
27
27
|
|
28
28
|
taxs = [taxs] unless Array === taxs
|
29
|
-
options.merge! :grep => taxs.collect{|t| "^" + t.to_s}
|
29
|
+
options.merge! :grep => taxs.collect{|t| "^" + t.to_s}, :fixed_grep => false
|
30
30
|
|
31
31
|
tsv = Rbbt.share.databases.entrez.gene_info.tsv :flat, options
|
32
32
|
tsv.key_field = "Entrez Gene ID"
|
@@ -39,7 +39,7 @@ module Entrez
|
|
39
39
|
options = {:key_field => 1, :fields => [2], :persist => true, :merge => true}
|
40
40
|
|
41
41
|
taxs = [taxs] unless taxs.is_a?(Array)
|
42
|
-
options.merge! :grep => taxs.collect{|t| "^" + t.to_s}
|
42
|
+
options.merge! :grep => taxs.collect{|t| "^" + t.to_s}, :fixed_grep => false
|
43
43
|
|
44
44
|
Rbbt.share.databases.entrez.gene2pubmed.tsv :flat, options
|
45
45
|
end
|
data/lib/rbbt/sources/go.rb
CHANGED
@@ -27,7 +27,7 @@ module GO
|
|
27
27
|
def self.init
|
28
28
|
Persist.persist_tsv(nil, 'gene_ontology', {}, :persist => true) do |info|
|
29
29
|
info.serializer = :marshal if info.respond_to? :serializer
|
30
|
-
Rbbt.share.databases.GO.gene_ontology.read.split(/\[Term\]/).each{|term|
|
30
|
+
Rbbt.share.databases.GO.gene_ontology.produce.read.split(/\[Term\]/).each{|term|
|
31
31
|
term_info = {}
|
32
32
|
|
33
33
|
term.split(/\n/). select{|l| l =~ /:/}.each{|l|
|
data/lib/rbbt/sources/pina.rb
CHANGED
@@ -7,7 +7,7 @@ module Pina
|
|
7
7
|
Pina.claim Pina.protein_protein, :proc do
|
8
8
|
require 'rbbt/sources/organism'
|
9
9
|
|
10
|
-
url = "
|
10
|
+
url = "https://omics.bjcancer.org/pina/download/Homo%20sapiens-20140521.tsv"
|
11
11
|
|
12
12
|
dumper = TSV::Dumper.new :type => :double,
|
13
13
|
:key_field => 'UniProt/SwissProt Accession', :namespace => Organism.default_code("Hsa"),
|
data/lib/rbbt/sources/pubmed.rb
CHANGED
@@ -109,6 +109,14 @@ module PubMed
|
|
109
109
|
end
|
110
110
|
end
|
111
111
|
|
112
|
+
def pmc_full_xml
|
113
|
+
begin
|
114
|
+
Open.read("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=#{pmid}")
|
115
|
+
rescue
|
116
|
+
nil
|
117
|
+
end
|
118
|
+
end
|
119
|
+
|
112
120
|
def pdf_url
|
113
121
|
return pmc_pdf if pmc_pdf
|
114
122
|
@gscholar_pdf ||= begin
|
@@ -121,18 +129,22 @@ module PubMed
|
|
121
129
|
end
|
122
130
|
|
123
131
|
def full_text
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
132
|
+
text = if pdf_url
|
133
|
+
text = nil
|
134
|
+
TmpFile.with_file do |pdf|
|
135
|
+
# Change user-agent, oh well...
|
136
|
+
`wget --user-agent=firefox #{ pdf_url } -O #{ pdf } -t 3`
|
137
|
+
TmpFile.with_file do |txt|
|
138
|
+
`pdftotext #{ pdf } #{ txt }`
|
139
|
+
text = Open.read(txt) if File.exists? txt
|
140
|
+
end
|
141
|
+
end
|
142
|
+
text
|
143
|
+
elsif pmc_full_xml
|
144
|
+
pmc_full_xml
|
145
|
+
else
|
146
|
+
nil
|
147
|
+
end
|
136
148
|
|
137
149
|
Misc.fixutf8(text)
|
138
150
|
end
|
data/lib/rbbt/sources/tfacts.rb
CHANGED
@@ -2,15 +2,15 @@ require 'rbbt'
|
|
2
2
|
require 'rbbt/tsv'
|
3
3
|
require 'rbbt/resource'
|
4
4
|
|
5
|
-
module
|
5
|
+
module TFactS
|
6
6
|
extend Resource
|
7
|
-
self.subdir = "share/databases/
|
7
|
+
self.subdir = "share/databases/TFactS"
|
8
8
|
|
9
|
-
|
9
|
+
TFactS.claim TFactS[".source"]["Catalogues.xls"], :url, "http://www.tfacts.org/TFactS-new/TFactS-v2/tfacts/data/Catalogues.xls"
|
10
10
|
|
11
|
-
|
11
|
+
TFactS.claim TFactS.targets, :proc do
|
12
12
|
require 'spreadsheet'
|
13
|
-
book = Spreadsheet.open
|
13
|
+
book = Spreadsheet.open TFactS[".source"]["Catalogues.xls"].produce.find
|
14
14
|
sheet = book.worksheet 0
|
15
15
|
|
16
16
|
tsv = TSV.setup({}, :key_field => "Target Gene (Associated Gene Name)", :fields => ["Transcription Factor (Associated Gene Name)"], :namespace => "Hsa", :type => :flat)
|
@@ -24,9 +24,9 @@ module TFacts
|
|
24
24
|
tsv.to_s
|
25
25
|
end
|
26
26
|
|
27
|
-
|
27
|
+
TFactS.claim TFactS.targets_signed, :proc do
|
28
28
|
require 'spreadsheet'
|
29
|
-
book = Spreadsheet.open
|
29
|
+
book = Spreadsheet.open TFactS[".source"]["Catalogues.xls"].produce.find
|
30
30
|
sheet = book.worksheet 1
|
31
31
|
|
32
32
|
tsv = TSV.setup({}, :key_field => "Target Gene (Associated Gene Name)", :fields => ["Transcription Factor (Associated Gene Name)", "Sign", "PMID"], :namespace => "Hsa", :type => :double)
|
@@ -43,13 +43,13 @@ module TFacts
|
|
43
43
|
tsv.to_s
|
44
44
|
end
|
45
45
|
|
46
|
-
|
47
|
-
|
46
|
+
TFactS.claim TFactS.regulators, :proc do
|
47
|
+
TFactS.targets.tsv.reorder("Transcription Factor (Associated Gene Name)").to_s
|
48
48
|
end
|
49
49
|
|
50
|
-
|
50
|
+
TFactS.claim TFactS.tf_tg, :proc do
|
51
51
|
require 'spreadsheet'
|
52
|
-
book = Spreadsheet.open
|
52
|
+
book = Spreadsheet.open TFactS[".source"]["Catalogues.xls"].produce.find
|
53
53
|
|
54
54
|
tsv = TSV.setup({}, :key_field => "Transcription Factor (Associated Gene Name)", :fields => ["Target Gene (Associated Gene Name)", "Sign", "Species", "Source", "PMID"], :namespace => "Hsa", :type => :double)
|
55
55
|
|
@@ -128,16 +128,16 @@ if defined? Entity and defined? Gene and Entity === Gene
|
|
128
128
|
|
129
129
|
module Gene
|
130
130
|
property :is_transcription_factor? => :array2single do
|
131
|
-
tfs =
|
131
|
+
tfs = TFactS.targets.keys
|
132
132
|
self.name.collect{|gene| tfs.include? gene}
|
133
133
|
end
|
134
134
|
|
135
135
|
property :transcription_regulators => :array2single do
|
136
|
-
Gene.setup(
|
136
|
+
Gene.setup(TFactS.regulators.tsv(:persist => true).values_at(*self.name), "Associated Gene Name", self.organism)
|
137
137
|
end
|
138
138
|
|
139
139
|
property :transcription_targets => :array2single do
|
140
|
-
Gene.setup(
|
140
|
+
Gene.setup(TFactS.targets.tsv(:persist => true).values_at(*self.name), "Associated Gene Name", self.organism)
|
141
141
|
end
|
142
142
|
end
|
143
143
|
end
|
data/share/install/Organism/Hsa/Rakefile
CHANGED
@@ -5,7 +5,7 @@ require File.join(File.dirname(__FILE__), '../../lib/helpers')
|
|
5
5
|
|
6
6
|
$taxs = [9606]
|
7
7
|
$scientific_name = "Homo sapiens"
|
8
|
-
$ortholog_key = "
|
8
|
+
$ortholog_key = "hsapiens_homolog_ensembl_gene"
|
9
9
|
|
10
10
|
$biomart_db = 'hsapiens_gene_ensembl'
|
11
11
|
$biomart_db_germline_variation = 'hsapiens_snp'
|
data/share/install/Organism/Mmu/Rakefile
CHANGED
@@ -5,7 +5,7 @@ require File.join(File.dirname(__FILE__), '../../lib/helpers')
|
|
5
5
|
|
6
6
|
$taxs = [10090]
|
7
7
|
$scientific_name = "Mus musculus"
|
8
|
-
$ortholog_key = "
|
8
|
+
$ortholog_key = "mmusculus_homolog_ensembl_gene"
|
9
9
|
|
10
10
|
$biomart_db = 'mmusculus_gene_ensembl'
|
11
11
|
$biomart_db_germline_variation = 'mmusculus_snp'
|
data/share/install/Organism/Rno/Rakefile
CHANGED
@@ -9,7 +9,7 @@ $scientific_name = "Rattus norvegicus"
|
|
9
9
|
$biomart_db = 'rnorvegicus_gene_ensembl'
|
10
10
|
$biomart_db_germline_variation = 'rnorvegicus_snp'
|
11
11
|
$biomart_db_somatic_variation = 'rnorvegicus_snp_som'
|
12
|
-
$ortholog_key = "
|
12
|
+
$ortholog_key = "rnorvegicus_homolog_ensembl_gene"
|
13
13
|
|
14
14
|
$biomart_lexicon = [
|
15
15
|
[ 'Associated Gene Name' , "external_gene_id"],
|
data/share/install/Organism/organism_helpers.rb
CHANGED
@@ -547,13 +547,13 @@ end
|
|
547
547
|
rule /^possible_ortholog_(.*)/ do |t|
|
548
548
|
other = t.name.match(/ortholog_(.*)/)[1]
|
549
549
|
other_key = Organism.ortholog_key(other).produce.read
|
550
|
-
BioMart.tsv($biomart_db, $biomart_ensembl_gene, [["Ortholog Ensembl Gene ID", "inter_paralog_" + other_key]], [], nil, :keep_empty => false, :type => :
|
550
|
+
BioMart.tsv($biomart_db, $biomart_ensembl_gene, [["Ortholog Ensembl Gene ID", "inter_paralog_" + other_key]], [], nil, :keep_empty => false, :type => :double, :filename => t.name, :namespace => Thread.current['namespace'])
|
551
551
|
end
|
552
552
|
|
553
553
|
rule /^ortholog_(.*)/ do |t|
|
554
554
|
other = t.name.match(/ortholog_(.*)/)[1]
|
555
555
|
other_key = Organism.ortholog_key(other).produce.read
|
556
|
-
BioMart.tsv($biomart_db, $biomart_ensembl_gene, [["Ortholog Ensembl Gene ID", other_key]], [], nil, :keep_empty => false, :type => :
|
556
|
+
BioMart.tsv($biomart_db, $biomart_ensembl_gene, [["Ortholog Ensembl Gene ID", other_key]], [], nil, :keep_empty => false, :type => :double, :filename => t.name, :namespace => Thread.current['namespace'])
|
557
557
|
end
|
558
558
|
|
559
559
|
rule /[a-z]{3}[0-9]{4}\/.*/i do |t|
|
@@ -728,13 +728,18 @@ file 'protein_sequence' => ["transcripts", "transcript_5utr", "transcript_3utr",
|
|
728
728
|
transcript_sequence.through do |transcript, sequence|
|
729
729
|
protein = transcript_protein[transcript]
|
730
730
|
next if protein.nil? or protein.empty?
|
731
|
+
|
731
732
|
utr5 = transcript_5utr[transcript]
|
732
733
|
utr3 = transcript_3utr[transcript]
|
733
734
|
phase = transcript_phase[transcript] || 0
|
735
|
+
|
734
736
|
if phase < 0
|
735
|
-
utr5
|
737
|
+
if utr5.nil? || utr5 == 0 || utr5 == "0"
|
738
|
+
utr5 = 0
|
739
|
+
end
|
736
740
|
phase = 0
|
737
741
|
end
|
742
|
+
|
738
743
|
psequence = Bio::Sequence::NA.new(("N" * phase) << sequence[utr5..sequence.length-utr3-1]).translate
|
739
744
|
protein_sequence[protein]=psequence
|
740
745
|
end
|
data/test/rbbt/sources/test_pubmed.rb
CHANGED
@@ -17,6 +17,12 @@ class TestPubMed < Test::Unit::TestCase
|
|
17
17
|
pmid = '16438716'
|
18
18
|
assert(PubMed.get_article(pmid).full_text =~ /Discovering/)
|
19
19
|
end
|
20
|
+
|
21
|
+
def test_pmc_full_xml
|
22
|
+
pmid = '4304705'
|
23
|
+
assert PubMed.get_article(pmid).pmc_full_xml.include?("HBV antigen")
|
24
|
+
end
|
25
|
+
|
20
26
|
|
21
27
|
def test_query
|
22
28
|
assert(PubMed.query('chagoyen[All Fields] AND ("loattrfull text"[sb] AND hasabstract[text])').include? '16438716')
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt-sources
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.1.
|
4
|
+
version: 3.1.48
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Miguel Vazquez
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-08-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rbbt-util
|