rbbt-sources 3.1.43 → 3.1.49
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/etc/biomart/missing_in_archive +4 -0
- data/lib/rbbt/sources/biomart.rb +4 -0
- data/lib/rbbt/sources/ensembl_ftp.rb +19 -7
- data/lib/rbbt/sources/entrez.rb +3 -3
- data/lib/rbbt/sources/go.rb +1 -1
- data/lib/rbbt/sources/organism.rb +5 -1
- data/lib/rbbt/sources/pubmed.rb +24 -12
- data/lib/rbbt/sources/tfacts.rb +14 -14
- data/share/Ensembl/release_dates +5 -0
- data/share/install/Organism/Hsa/Rakefile +1 -1
- data/share/install/Organism/Mmu/Rakefile +1 -1
- data/share/install/Organism/Rno/Rakefile +1 -1
- data/share/install/Organism/organism_helpers.rb +8 -3
- data/test/rbbt/sources/test_entrez.rb +3 -0
- data/test/rbbt/sources/test_pubmed.rb +6 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e5e2516cf7cd0ae996164a41f940843369c5ab8c5092a76fcdb81ca80b43b1b6
|
4
|
+
data.tar.gz: 5d90eb802ea522d2c760910ce1e19a6c197b7afef6a183cb5ed8335adf8b083b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5451de956158d1f9e40c216cdf4107fede82080423acb32924831168d5f60ea21590a1e996a08bb68c1951cb8da42c793fffa84ea873cc889073fda2cfec07b7
|
7
|
+
data.tar.gz: 80a79bada240deec85485f4f831a0e1aaa520a10889ac28d7665424d61173938c95638753cd2eb190d744e0a978e04eb891ee7bd88bd5a4442792c9beba45ad1
|
data/lib/rbbt/sources/biomart.rb
CHANGED
@@ -134,11 +134,14 @@ module BioMart
|
|
134
134
|
# cause an error if the BioMart WS does not allow filtering with that
|
135
135
|
# attribute.
|
136
136
|
def self.query(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
|
137
|
+
IndiferentHash.setup(open_options)
|
137
138
|
open_options = Misc.add_defaults open_options, :nocache => false, :filename => nil, :field_names => nil, :by_chr => false
|
138
139
|
filename, field_names, by_chr = Misc.process_options open_options, :filename, :field_names, :by_chr
|
139
140
|
attrs ||= []
|
140
141
|
open_options = Misc.add_defaults open_options, :keep_empty => false, :merge => true
|
141
142
|
|
143
|
+
IndiferentHash.setup(open_options)
|
144
|
+
|
142
145
|
Log.low "BioMart query: '#{main}' [#{(attrs || []) * ', '}] [#{(filters || []) * ', '}] #{open_options.inspect}"
|
143
146
|
|
144
147
|
max_items = 2
|
@@ -167,6 +170,7 @@ module BioMart
|
|
167
170
|
end
|
168
171
|
|
169
172
|
open_options[:filename] = "BioMart[#{main}+#{attrs.length}]"
|
173
|
+
|
170
174
|
if filename.nil?
|
171
175
|
results = TSV.open data, open_options
|
172
176
|
results.key_field = main
|
data/lib/rbbt/sources/ensembl_ftp.rb
CHANGED
@@ -52,24 +52,35 @@ module Ensembl
|
|
52
52
|
File.join("ftp://" + SERVER, ftp_directory_for(organism) )
|
53
53
|
end
|
54
54
|
|
55
|
-
def self.url_for(organism, table)
|
56
|
-
|
55
|
+
def self.url_for(organism, table, extension)
|
56
|
+
File.join(base_url(organism), table) + ".#{extension}.gz"
|
57
|
+
end
|
58
|
+
|
59
|
+
def self._get_gz(url)
|
60
|
+
begin
|
61
|
+
CMD.cmd("wget '#{url}' -O - | gunzip").read
|
62
|
+
rescue
|
63
|
+
CMD.cmd("wget '#{url}.bz2' -O - | bunzip2 | gunzip").read
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
67
|
+
def self._get_file(organism, table, extension)
|
68
|
+
url = url_for(organism, table, extension)
|
69
|
+
self._get_gz(url)
|
57
70
|
end
|
58
71
|
|
59
72
|
def self.has_table?(organism, table)
|
60
|
-
sql_file =
|
73
|
+
sql_file = _get_file(organism, File.basename(base_url(organism)), 'sql')
|
61
74
|
! sql_file.match(/^CREATE TABLE .#{table}. \((.*?)^\)/sm).nil?
|
62
75
|
end
|
63
76
|
|
64
77
|
def self.fields_for(organism, table)
|
65
|
-
sql_file =
|
66
|
-
|
78
|
+
sql_file = _get_file(organism, File.basename(base_url(organism)), 'sql')
|
67
79
|
chunk = sql_file.match(/^CREATE TABLE .#{table}. \((.*?)^\)/sm)[1]
|
68
80
|
chunk.scan(/^\s+`(.*?)`/).flatten
|
69
81
|
end
|
70
82
|
|
71
83
|
def self.ensembl_tsv(organism, table, key_field = nil, fields = nil, options = {})
|
72
|
-
url = url_for(organism, table)
|
73
84
|
if key_field and fields
|
74
85
|
all_fields = fields_for(organism, table)
|
75
86
|
key_pos = all_fields.index key_field
|
@@ -78,7 +89,8 @@ module Ensembl
|
|
78
89
|
options[:key_field] = key_pos
|
79
90
|
options[:fields] = field_pos
|
80
91
|
end
|
81
|
-
|
92
|
+
|
93
|
+
tsv = TSV.open(StringIO.new(_get_file(organism, table, "txt")), options)
|
82
94
|
tsv.key_field = key_field
|
83
95
|
tsv.fields = fields
|
84
96
|
tsv
|
data/lib/rbbt/sources/entrez.rb
CHANGED
@@ -14,7 +14,7 @@ module Entrez
|
|
14
14
|
options = Misc.add_defaults options, :key_field => 1, :fields => [5], :persist => true, :merge => true
|
15
15
|
|
16
16
|
taxs = [taxs] unless Array === taxs
|
17
|
-
options.merge! :grep => taxs.collect{|t| "^" + t.to_s}
|
17
|
+
options.merge! :grep => taxs.collect{|t| "^" + t.to_s}, :fixed_grep => false
|
18
18
|
|
19
19
|
tsv = Rbbt.share.databases.entrez.gene_info.tsv :flat, options
|
20
20
|
tsv.key_field = "Entrez Gene ID"
|
@@ -26,7 +26,7 @@ module Entrez
|
|
26
26
|
options = Misc.add_defaults options, :key_field => 1, :fields => [2], :persist => true, :merge => true
|
27
27
|
|
28
28
|
taxs = [taxs] unless Array === taxs
|
29
|
-
options.merge! :grep => taxs.collect{|t| "^" + t.to_s}
|
29
|
+
options.merge! :grep => taxs.collect{|t| "^" + t.to_s}, :fixed_grep => false
|
30
30
|
|
31
31
|
tsv = Rbbt.share.databases.entrez.gene_info.tsv :flat, options
|
32
32
|
tsv.key_field = "Entrez Gene ID"
|
@@ -39,7 +39,7 @@ module Entrez
|
|
39
39
|
options = {:key_field => 1, :fields => [2], :persist => true, :merge => true}
|
40
40
|
|
41
41
|
taxs = [taxs] unless taxs.is_a?(Array)
|
42
|
-
options.merge! :grep => taxs.collect{|t| "^" + t.to_s}
|
42
|
+
options.merge! :grep => taxs.collect{|t| "^" + t.to_s}, :fixed_grep => false
|
43
43
|
|
44
44
|
Rbbt.share.databases.entrez.gene2pubmed.tsv :flat, options
|
45
45
|
end
|
data/lib/rbbt/sources/go.rb
CHANGED
@@ -27,7 +27,7 @@ module GO
|
|
27
27
|
def self.init
|
28
28
|
Persist.persist_tsv(nil, 'gene_ontology', {}, :persist => true) do |info|
|
29
29
|
info.serializer = :marshal if info.respond_to? :serializer
|
30
|
-
Rbbt.share.databases.GO.gene_ontology.read.split(/\[Term\]/).each{|term|
|
30
|
+
Rbbt.share.databases.GO.gene_ontology.produce.read.split(/\[Term\]/).each{|term|
|
31
31
|
term_info = {}
|
32
32
|
|
33
33
|
term.split(/\n/). select{|l| l =~ /:/}.each{|l|
|
data/lib/rbbt/sources/organism.rb
CHANGED
@@ -88,8 +88,10 @@ module Organism
|
|
88
88
|
end
|
89
89
|
when "Mmu"
|
90
90
|
"mm10"
|
91
|
+
when "Rno"
|
92
|
+
"rn6"
|
91
93
|
else
|
92
|
-
raise "Only organism 'Hsa' (Homo sapiens) and Mmu (Mus musculus) supported"
|
94
|
+
raise "Only organism 'Hsa' (Homo sapiens), 'Rno' (Rattus norvegicus), and Mmu (Mus musculus) supported"
|
93
95
|
end
|
94
96
|
end
|
95
97
|
|
@@ -117,6 +119,8 @@ module Organism
|
|
117
119
|
end
|
118
120
|
when "Mmu"
|
119
121
|
"GRCm38"
|
122
|
+
when "Rno"
|
123
|
+
"Rnor_6.0"
|
120
124
|
else
|
121
125
|
raise "Only organism 'Hsa' (Homo sapiens) and Mmu (Mus musculus) supported"
|
122
126
|
end
|
data/lib/rbbt/sources/pubmed.rb
CHANGED
@@ -109,6 +109,14 @@ module PubMed
|
|
109
109
|
end
|
110
110
|
end
|
111
111
|
|
112
|
+
def pmc_full_xml
|
113
|
+
begin
|
114
|
+
Open.read("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=#{pmid}")
|
115
|
+
rescue
|
116
|
+
nil
|
117
|
+
end
|
118
|
+
end
|
119
|
+
|
112
120
|
def pdf_url
|
113
121
|
return pmc_pdf if pmc_pdf
|
114
122
|
@gscholar_pdf ||= begin
|
@@ -121,18 +129,22 @@ module PubMed
|
|
121
129
|
end
|
122
130
|
|
123
131
|
def full_text
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
132
|
+
text = if pdf_url
|
133
|
+
text = nil
|
134
|
+
TmpFile.with_file do |pdf|
|
135
|
+
# Change user-agent, oh well...
|
136
|
+
`wget --user-agent=firefox #{ pdf_url } -O #{ pdf } -t 3`
|
137
|
+
TmpFile.with_file do |txt|
|
138
|
+
`pdftotext #{ pdf } #{ txt }`
|
139
|
+
text = Open.read(txt) if File.exists? txt
|
140
|
+
end
|
141
|
+
end
|
142
|
+
text
|
143
|
+
elsif pmc_full_xml
|
144
|
+
pmc_full_xml
|
145
|
+
else
|
146
|
+
nil
|
147
|
+
end
|
136
148
|
|
137
149
|
Misc.fixutf8(text)
|
138
150
|
end
|
data/lib/rbbt/sources/tfacts.rb
CHANGED
@@ -2,15 +2,15 @@ require 'rbbt'
|
|
2
2
|
require 'rbbt/tsv'
|
3
3
|
require 'rbbt/resource'
|
4
4
|
|
5
|
-
module
|
5
|
+
module TFactS
|
6
6
|
extend Resource
|
7
|
-
self.subdir = "share/databases/
|
7
|
+
self.subdir = "share/databases/TFactS"
|
8
8
|
|
9
|
-
|
9
|
+
TFactS.claim TFactS[".source"]["Catalogues.xls"], :url, "http://www.tfacts.org/TFactS-new/TFactS-v2/tfacts/data/Catalogues.xls"
|
10
10
|
|
11
|
-
|
11
|
+
TFactS.claim TFactS.targets, :proc do
|
12
12
|
require 'spreadsheet'
|
13
|
-
book = Spreadsheet.open
|
13
|
+
book = Spreadsheet.open TFactS[".source"]["Catalogues.xls"].produce.find
|
14
14
|
sheet = book.worksheet 0
|
15
15
|
|
16
16
|
tsv = TSV.setup({}, :key_field => "Target Gene (Associated Gene Name)", :fields => ["Transcription Factor (Associated Gene Name)"], :namespace => "Hsa", :type => :flat)
|
@@ -24,9 +24,9 @@ module TFacts
|
|
24
24
|
tsv.to_s
|
25
25
|
end
|
26
26
|
|
27
|
-
|
27
|
+
TFactS.claim TFactS.targets_signed, :proc do
|
28
28
|
require 'spreadsheet'
|
29
|
-
book = Spreadsheet.open
|
29
|
+
book = Spreadsheet.open TFactS[".source"]["Catalogues.xls"].produce.find
|
30
30
|
sheet = book.worksheet 1
|
31
31
|
|
32
32
|
tsv = TSV.setup({}, :key_field => "Target Gene (Associated Gene Name)", :fields => ["Transcription Factor (Associated Gene Name)", "Sign", "PMID"], :namespace => "Hsa", :type => :double)
|
@@ -43,13 +43,13 @@ module TFacts
|
|
43
43
|
tsv.to_s
|
44
44
|
end
|
45
45
|
|
46
|
-
|
47
|
-
|
46
|
+
TFactS.claim TFactS.regulators, :proc do
|
47
|
+
TFactS.targets.tsv.reorder("Transcription Factor (Associated Gene Name)").to_s
|
48
48
|
end
|
49
49
|
|
50
|
-
|
50
|
+
TFactS.claim TFactS.tf_tg, :proc do
|
51
51
|
require 'spreadsheet'
|
52
|
-
book = Spreadsheet.open
|
52
|
+
book = Spreadsheet.open TFactS[".source"]["Catalogues.xls"].produce.find
|
53
53
|
|
54
54
|
tsv = TSV.setup({}, :key_field => "Transcription Factor (Associated Gene Name)", :fields => ["Target Gene (Associated Gene Name)", "Sign", "Species", "Source", "PMID"], :namespace => "Hsa", :type => :double)
|
55
55
|
|
@@ -128,16 +128,16 @@ if defined? Entity and defined? Gene and Entity === Gene
|
|
128
128
|
|
129
129
|
module Gene
|
130
130
|
property :is_transcription_factor? => :array2single do
|
131
|
-
tfs =
|
131
|
+
tfs = TFactS.targets.keys
|
132
132
|
self.name.collect{|gene| tfs.include? gene}
|
133
133
|
end
|
134
134
|
|
135
135
|
property :transcription_regulators => :array2single do
|
136
|
-
Gene.setup(
|
136
|
+
Gene.setup(TFactS.regulators.tsv(:persist => true).values_at(*self.name), "Associated Gene Name", self.organism)
|
137
137
|
end
|
138
138
|
|
139
139
|
property :transcription_targets => :array2single do
|
140
|
-
Gene.setup(
|
140
|
+
Gene.setup(TFactS.targets.tsv(:persist => true).values_at(*self.name), "Associated Gene Name", self.organism)
|
141
141
|
end
|
142
142
|
end
|
143
143
|
end
|
data/share/Ensembl/release_dates
CHANGED
data/share/install/Organism/Hsa/Rakefile
CHANGED
@@ -5,7 +5,7 @@ require File.join(File.dirname(__FILE__), '../../lib/helpers')
|
|
5
5
|
|
6
6
|
$taxs = [9606]
|
7
7
|
$scientific_name = "Homo sapiens"
|
8
|
-
$ortholog_key = "
|
8
|
+
$ortholog_key = "hsapiens_homolog_ensembl_gene"
|
9
9
|
|
10
10
|
$biomart_db = 'hsapiens_gene_ensembl'
|
11
11
|
$biomart_db_germline_variation = 'hsapiens_snp'
|
data/share/install/Organism/Mmu/Rakefile
CHANGED
@@ -5,7 +5,7 @@ require File.join(File.dirname(__FILE__), '../../lib/helpers')
|
|
5
5
|
|
6
6
|
$taxs = [10090]
|
7
7
|
$scientific_name = "Mus musculus"
|
8
|
-
$ortholog_key = "
|
8
|
+
$ortholog_key = "mmusculus_homolog_ensembl_gene"
|
9
9
|
|
10
10
|
$biomart_db = 'mmusculus_gene_ensembl'
|
11
11
|
$biomart_db_germline_variation = 'mmusculus_snp'
|
data/share/install/Organism/Rno/Rakefile
CHANGED
@@ -9,7 +9,7 @@ $scientific_name = "Rattus norvegicus"
|
|
9
9
|
$biomart_db = 'rnorvegicus_gene_ensembl'
|
10
10
|
$biomart_db_germline_variation = 'rnorvegicus_snp'
|
11
11
|
$biomart_db_somatic_variation = 'rnorvegicus_snp_som'
|
12
|
-
$ortholog_key = "
|
12
|
+
$ortholog_key = "rnorvegicus_homolog_ensembl_gene"
|
13
13
|
|
14
14
|
$biomart_lexicon = [
|
15
15
|
[ 'Associated Gene Name' , "external_gene_id"],
|
data/share/install/Organism/organism_helpers.rb
CHANGED
@@ -547,13 +547,13 @@ end
|
|
547
547
|
rule /^possible_ortholog_(.*)/ do |t|
|
548
548
|
other = t.name.match(/ortholog_(.*)/)[1]
|
549
549
|
other_key = Organism.ortholog_key(other).produce.read
|
550
|
-
BioMart.tsv($biomart_db, $biomart_ensembl_gene, [["Ortholog Ensembl Gene ID", "inter_paralog_" + other_key]], [], nil, :keep_empty => false, :type => :
|
550
|
+
BioMart.tsv($biomart_db, $biomart_ensembl_gene, [["Ortholog Ensembl Gene ID", "inter_paralog_" + other_key]], [], nil, :keep_empty => false, :type => :double, :filename => t.name, :namespace => Thread.current['namespace'])
|
551
551
|
end
|
552
552
|
|
553
553
|
rule /^ortholog_(.*)/ do |t|
|
554
554
|
other = t.name.match(/ortholog_(.*)/)[1]
|
555
555
|
other_key = Organism.ortholog_key(other).produce.read
|
556
|
-
BioMart.tsv($biomart_db, $biomart_ensembl_gene, [["Ortholog Ensembl Gene ID", other_key]], [], nil, :keep_empty => false, :type => :
|
556
|
+
BioMart.tsv($biomart_db, $biomart_ensembl_gene, [["Ortholog Ensembl Gene ID", other_key]], [], nil, :keep_empty => false, :type => :double, :filename => t.name, :namespace => Thread.current['namespace'])
|
557
557
|
end
|
558
558
|
|
559
559
|
rule /[a-z]{3}[0-9]{4}\/.*/i do |t|
|
@@ -728,13 +728,18 @@ file 'protein_sequence' => ["transcripts", "transcript_5utr", "transcript_3utr",
|
|
728
728
|
transcript_sequence.through do |transcript, sequence|
|
729
729
|
protein = transcript_protein[transcript]
|
730
730
|
next if protein.nil? or protein.empty?
|
731
|
+
|
731
732
|
utr5 = transcript_5utr[transcript]
|
732
733
|
utr3 = transcript_3utr[transcript]
|
733
734
|
phase = transcript_phase[transcript] || 0
|
735
|
+
|
734
736
|
if phase < 0
|
735
|
-
utr5
|
737
|
+
if utr5.nil? || utr5 == 0 || utr5 == "0"
|
738
|
+
utr5 = 0
|
739
|
+
end
|
736
740
|
phase = 0
|
737
741
|
end
|
742
|
+
|
738
743
|
psequence = Bio::Sequence::NA.new(("N" * phase) << sequence[utr5..sequence.length-utr3-1]).translate
|
739
744
|
protein_sequence[protein]=psequence
|
740
745
|
end
|
data/test/rbbt/sources/test_pubmed.rb
CHANGED
@@ -17,6 +17,12 @@ class TestPubMed < Test::Unit::TestCase
|
|
17
17
|
pmid = '16438716'
|
18
18
|
assert(PubMed.get_article(pmid).full_text =~ /Discovering/)
|
19
19
|
end
|
20
|
+
|
21
|
+
def test_pmc_full_xml
|
22
|
+
pmid = '4304705'
|
23
|
+
assert PubMed.get_article(pmid).pmc_full_xml.include?("HBV antigen")
|
24
|
+
end
|
25
|
+
|
20
26
|
|
21
27
|
def test_query
|
22
28
|
assert(PubMed.query('chagoyen[All Fields] AND ("loattrfull text"[sb] AND hasabstract[text])').include? '16438716')
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt-sources
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.1.
|
4
|
+
version: 3.1.49
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Miguel Vazquez
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-10-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rbbt-util
|