rbbt-sources 3.1.43 → 3.1.49

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: cb929c25537c93976e03cac4193b7bcbde8c6b8fa1975506779290cc9579d2cb
4
- data.tar.gz: d316a40f9ac9c00997dbecf35246290ee5c87704bf42fab5b687b1e595f803c3
3
+ metadata.gz: e5e2516cf7cd0ae996164a41f940843369c5ab8c5092a76fcdb81ca80b43b1b6
4
+ data.tar.gz: 5d90eb802ea522d2c760910ce1e19a6c197b7afef6a183cb5ed8335adf8b083b
5
5
  SHA512:
6
- metadata.gz: 93b9260a58ed0bfdfa2be1d63160fae4a70a71df3f9c8510e42b99ac740d2c90b1433c8b2aae28e2978ddccfb8abb822bc106341bf8dcef31092db768b5caf19
7
- data.tar.gz: 7931ac20e016779659aca2bf6966abdefe21eec7dc9ca6bf843ddd09004f6fda3c022a74ed48e6d269d2b702aa4bf77cfb22ae760ea65d80db916efeb6998ebd
6
+ metadata.gz: 5451de956158d1f9e40c216cdf4107fede82080423acb32924831168d5f60ea21590a1e996a08bb68c1951cb8da42c793fffa84ea873cc889073fda2cfec07b7
7
+ data.tar.gz: 80a79bada240deec85485f4f831a0e1aaa520a10889ac28d7665424d61173938c95638753cd2eb190d744e0a978e04eb891ee7bd88bd5a4442792c9beba45ad1
@@ -1,3 +1,7 @@
1
+ ">oct2014":
2
+ - rgd~rgd_id
3
+ ">jun2019":
4
+ - entrezgene~entrezgene_id
1
5
  ">dec2017":
2
6
  - unigene
3
7
  ">dec2016":
@@ -134,11 +134,14 @@ module BioMart
134
134
  # cause an error if the BioMart WS does not allow filtering with that
135
135
  # attribute.
136
136
  def self.query(database, main, attrs = nil, filters = nil, data = nil, open_options = {})
137
+ IndiferentHash.setup(open_options)
137
138
  open_options = Misc.add_defaults open_options, :nocache => false, :filename => nil, :field_names => nil, :by_chr => false
138
139
  filename, field_names, by_chr = Misc.process_options open_options, :filename, :field_names, :by_chr
139
140
  attrs ||= []
140
141
  open_options = Misc.add_defaults open_options, :keep_empty => false, :merge => true
141
142
 
143
+ IndiferentHash.setup(open_options)
144
+
142
145
  Log.low "BioMart query: '#{main}' [#{(attrs || []) * ', '}] [#{(filters || []) * ', '}] #{open_options.inspect}"
143
146
 
144
147
  max_items = 2
@@ -167,6 +170,7 @@ module BioMart
167
170
  end
168
171
 
169
172
  open_options[:filename] = "BioMart[#{main}+#{attrs.length}]"
173
+
170
174
  if filename.nil?
171
175
  results = TSV.open data, open_options
172
176
  results.key_field = main
@@ -52,24 +52,35 @@ module Ensembl
52
52
  File.join("ftp://" + SERVER, ftp_directory_for(organism) )
53
53
  end
54
54
 
55
- def self.url_for(organism, table)
56
- "#{base_url(organism)}/#{table}.txt.gz.bz2"
55
+ def self.url_for(organism, table, extension)
56
+ File.join(base_url(organism), table) + ".#{extension}.gz"
57
+ end
58
+
59
+ def self._get_gz(url)
60
+ begin
61
+ CMD.cmd("wget '#{url}' -O - | gunzip").read
62
+ rescue
63
+ CMD.cmd("wget '#{url}.bz2' -O - | bunzip2 | gunzip").read
64
+ end
65
+ end
66
+
67
+ def self._get_file(organism, table, extension)
68
+ url = url_for(organism, table, extension)
69
+ self._get_gz(url)
57
70
  end
58
71
 
59
72
  def self.has_table?(organism, table)
60
- sql_file = CMD.cmd("wget '#{base_url(organism)}/#{File.basename(base_url(organism))}.sql.gz.bz2' -O -| bunzip2| gunzip").read
73
+ sql_file = _get_file(organism, File.basename(base_url(organism)), 'sql')
61
74
  ! sql_file.match(/^CREATE TABLE .#{table}. \((.*?)^\)/sm).nil?
62
75
  end
63
76
 
64
77
  def self.fields_for(organism, table)
65
- sql_file = CMD.cmd("wget '#{base_url(organism)}/#{File.basename(base_url(organism))}.sql.gz.bz2' -O -| bunzip2| gunzip").read
66
-
78
+ sql_file = _get_file(organism, File.basename(base_url(organism)), 'sql')
67
79
  chunk = sql_file.match(/^CREATE TABLE .#{table}. \((.*?)^\)/sm)[1]
68
80
  chunk.scan(/^\s+`(.*?)`/).flatten
69
81
  end
70
82
 
71
83
  def self.ensembl_tsv(organism, table, key_field = nil, fields = nil, options = {})
72
- url = url_for(organism, table)
73
84
  if key_field and fields
74
85
  all_fields = fields_for(organism, table)
75
86
  key_pos = all_fields.index key_field
@@ -78,7 +89,8 @@ module Ensembl
78
89
  options[:key_field] = key_pos
79
90
  options[:fields] = field_pos
80
91
  end
81
- tsv = TSV.open(CMD.cmd("wget '#{url}' -O - |bunzip2|gunzip", :pipe => true), options)
92
+
93
+ tsv = TSV.open(StringIO.new(_get_file(organism, table, "txt")), options)
82
94
  tsv.key_field = key_field
83
95
  tsv.fields = fields
84
96
  tsv
@@ -14,7 +14,7 @@ module Entrez
14
14
  options = Misc.add_defaults options, :key_field => 1, :fields => [5], :persist => true, :merge => true
15
15
 
16
16
  taxs = [taxs] unless Array === taxs
17
- options.merge! :grep => taxs.collect{|t| "^" + t.to_s}
17
+ options.merge! :grep => taxs.collect{|t| "^" + t.to_s}, :fixed_grep => false
18
18
 
19
19
  tsv = Rbbt.share.databases.entrez.gene_info.tsv :flat, options
20
20
  tsv.key_field = "Entrez Gene ID"
@@ -26,7 +26,7 @@ module Entrez
26
26
  options = Misc.add_defaults options, :key_field => 1, :fields => [2], :persist => true, :merge => true
27
27
 
28
28
  taxs = [taxs] unless Array === taxs
29
- options.merge! :grep => taxs.collect{|t| "^" + t.to_s}
29
+ options.merge! :grep => taxs.collect{|t| "^" + t.to_s}, :fixed_grep => false
30
30
 
31
31
  tsv = Rbbt.share.databases.entrez.gene_info.tsv :flat, options
32
32
  tsv.key_field = "Entrez Gene ID"
@@ -39,7 +39,7 @@ module Entrez
39
39
  options = {:key_field => 1, :fields => [2], :persist => true, :merge => true}
40
40
 
41
41
  taxs = [taxs] unless taxs.is_a?(Array)
42
- options.merge! :grep => taxs.collect{|t| "^" + t.to_s}
42
+ options.merge! :grep => taxs.collect{|t| "^" + t.to_s}, :fixed_grep => false
43
43
 
44
44
  Rbbt.share.databases.entrez.gene2pubmed.tsv :flat, options
45
45
  end
@@ -27,7 +27,7 @@ module GO
27
27
  def self.init
28
28
  Persist.persist_tsv(nil, 'gene_ontology', {}, :persist => true) do |info|
29
29
  info.serializer = :marshal if info.respond_to? :serializer
30
- Rbbt.share.databases.GO.gene_ontology.read.split(/\[Term\]/).each{|term|
30
+ Rbbt.share.databases.GO.gene_ontology.produce.read.split(/\[Term\]/).each{|term|
31
31
  term_info = {}
32
32
 
33
33
  term.split(/\n/). select{|l| l =~ /:/}.each{|l|
@@ -88,8 +88,10 @@ module Organism
88
88
  end
89
89
  when "Mmu"
90
90
  "mm10"
91
+ when "Rno"
92
+ "rn6"
91
93
  else
92
- raise "Only organism 'Hsa' (Homo sapiens) and Mmu (Mus musculus) supported"
94
+ raise "Only organism 'Hsa' (Homo sapiens), 'Rno' (Rattus norvegicus), and Mmu (Mus musculus) supported"
93
95
  end
94
96
  end
95
97
 
@@ -117,6 +119,8 @@ module Organism
117
119
  end
118
120
  when "Mmu"
119
121
  "GRCm38"
122
+ when "Rno"
123
+ "Rnor_6.0"
120
124
  else
121
125
  raise "Only organism 'Hsa' (Homo sapiens) and Mmu (Mus musculus) supported"
122
126
  end
@@ -109,6 +109,14 @@ module PubMed
109
109
  end
110
110
  end
111
111
 
112
+ def pmc_full_xml
113
+ begin
114
+ Open.read("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=#{pmid}")
115
+ rescue
116
+ nil
117
+ end
118
+ end
119
+
112
120
  def pdf_url
113
121
  return pmc_pdf if pmc_pdf
114
122
  @gscholar_pdf ||= begin
@@ -121,18 +129,22 @@ module PubMed
121
129
  end
122
130
 
123
131
  def full_text
124
- return nil if pdf_url.nil?
125
-
126
- text = nil
127
- TmpFile.with_file do |pdf|
128
-
129
- # Change user-agent, oh well...
130
- `wget --user-agent=firefox #{ pdf_url } -O #{ pdf } -t 3`
131
- TmpFile.with_file do |txt|
132
- `pdftotext #{ pdf } #{ txt }`
133
- text = Open.read(txt) if File.exists? txt
134
- end
135
- end
132
+ text = if pdf_url
133
+ text = nil
134
+ TmpFile.with_file do |pdf|
135
+ # Change user-agent, oh well...
136
+ `wget --user-agent=firefox #{ pdf_url } -O #{ pdf } -t 3`
137
+ TmpFile.with_file do |txt|
138
+ `pdftotext #{ pdf } #{ txt }`
139
+ text = Open.read(txt) if File.exists? txt
140
+ end
141
+ end
142
+ text
143
+ elsif pmc_full_xml
144
+ pmc_full_xml
145
+ else
146
+ nil
147
+ end
136
148
 
137
149
  Misc.fixutf8(text)
138
150
  end
@@ -2,15 +2,15 @@ require 'rbbt'
2
2
  require 'rbbt/tsv'
3
3
  require 'rbbt/resource'
4
4
 
5
- module TFacts
5
+ module TFactS
6
6
  extend Resource
7
- self.subdir = "share/databases/TFacts"
7
+ self.subdir = "share/databases/TFactS"
8
8
 
9
- TFacts.claim TFacts[".source"]["Catalogues.xls"], :url, "http://www.tfacts.org/TFactS-new/TFactS-v2/tfacts/data/Catalogues.xls"
9
+ TFactS.claim TFactS[".source"]["Catalogues.xls"], :url, "http://www.tfacts.org/TFactS-new/TFactS-v2/tfacts/data/Catalogues.xls"
10
10
 
11
- TFacts.claim TFacts.targets, :proc do
11
+ TFactS.claim TFactS.targets, :proc do
12
12
  require 'spreadsheet'
13
- book = Spreadsheet.open TFacts[".source"]["Catalogues.xls"].produce.find
13
+ book = Spreadsheet.open TFactS[".source"]["Catalogues.xls"].produce.find
14
14
  sheet = book.worksheet 0
15
15
 
16
16
  tsv = TSV.setup({}, :key_field => "Target Gene (Associated Gene Name)", :fields => ["Transcription Factor (Associated Gene Name)"], :namespace => "Hsa", :type => :flat)
@@ -24,9 +24,9 @@ module TFacts
24
24
  tsv.to_s
25
25
  end
26
26
 
27
- TFacts.claim TFacts.targets_signed, :proc do
27
+ TFactS.claim TFactS.targets_signed, :proc do
28
28
  require 'spreadsheet'
29
- book = Spreadsheet.open TFacts[".source"]["Catalogues.xls"].produce.find
29
+ book = Spreadsheet.open TFactS[".source"]["Catalogues.xls"].produce.find
30
30
  sheet = book.worksheet 1
31
31
 
32
32
  tsv = TSV.setup({}, :key_field => "Target Gene (Associated Gene Name)", :fields => ["Transcription Factor (Associated Gene Name)", "Sign", "PMID"], :namespace => "Hsa", :type => :double)
@@ -43,13 +43,13 @@ module TFacts
43
43
  tsv.to_s
44
44
  end
45
45
 
46
- TFacts.claim TFacts.regulators, :proc do
47
- TFacts.targets.tsv.reorder("Transcription Factor (Associated Gene Name)").to_s
46
+ TFactS.claim TFactS.regulators, :proc do
47
+ TFactS.targets.tsv.reorder("Transcription Factor (Associated Gene Name)").to_s
48
48
  end
49
49
 
50
- TFacts.claim TFacts.tf_tg, :proc do
50
+ TFactS.claim TFactS.tf_tg, :proc do
51
51
  require 'spreadsheet'
52
- book = Spreadsheet.open TFacts[".source"]["Catalogues.xls"].produce.find
52
+ book = Spreadsheet.open TFactS[".source"]["Catalogues.xls"].produce.find
53
53
 
54
54
  tsv = TSV.setup({}, :key_field => "Transcription Factor (Associated Gene Name)", :fields => ["Target Gene (Associated Gene Name)", "Sign", "Species", "Source", "PMID"], :namespace => "Hsa", :type => :double)
55
55
 
@@ -128,16 +128,16 @@ if defined? Entity and defined? Gene and Entity === Gene
128
128
 
129
129
  module Gene
130
130
  property :is_transcription_factor? => :array2single do
131
- tfs = TFacts.targets.keys
131
+ tfs = TFactS.targets.keys
132
132
  self.name.collect{|gene| tfs.include? gene}
133
133
  end
134
134
 
135
135
  property :transcription_regulators => :array2single do
136
- Gene.setup(TFacts.regulators.tsv(:persist => true).values_at(*self.name), "Associated Gene Name", self.organism)
136
+ Gene.setup(TFactS.regulators.tsv(:persist => true).values_at(*self.name), "Associated Gene Name", self.organism)
137
137
  end
138
138
 
139
139
  property :transcription_targets => :array2single do
140
- Gene.setup(TFacts.targets.tsv(:persist => true).values_at(*self.name), "Associated Gene Name", self.organism)
140
+ Gene.setup(TFactS.targets.tsv(:persist => true).values_at(*self.name), "Associated Gene Name", self.organism)
141
141
  end
142
142
  end
143
143
  end
@@ -1,5 +1,10 @@
1
1
  #: :type=:single
2
2
  #Release build
3
+ release-100 apr2020
4
+ release-99 jan2020
5
+ release-98 sep2019
6
+ release-97 jul2019
7
+ release-96 apr2019
3
8
  release-95 jan2019
4
9
  release-94 oct2018
5
10
  release-93 jul2018
@@ -5,7 +5,7 @@ require File.join(File.dirname(__FILE__), '../../lib/helpers')
5
5
 
6
6
  $taxs = [9606]
7
7
  $scientific_name = "Homo sapiens"
8
- $ortholog_key = "human_ensembl_gene"
8
+ $ortholog_key = "hsapiens_homolog_ensembl_gene"
9
9
 
10
10
  $biomart_db = 'hsapiens_gene_ensembl'
11
11
  $biomart_db_germline_variation = 'hsapiens_snp'
@@ -5,7 +5,7 @@ require File.join(File.dirname(__FILE__), '../../lib/helpers')
5
5
 
6
6
  $taxs = [10090]
7
7
  $scientific_name = "Mus musculus"
8
- $ortholog_key = "mouse_ensembl_gene"
8
+ $ortholog_key = "mmusculus_homolog_ensembl_gene"
9
9
 
10
10
  $biomart_db = 'mmusculus_gene_ensembl'
11
11
  $biomart_db_germline_variation = 'mmusculus_snp'
@@ -9,7 +9,7 @@ $scientific_name = "Rattus norvegicus"
9
9
  $biomart_db = 'rnorvegicus_gene_ensembl'
10
10
  $biomart_db_germline_variation = 'rnorvegicus_snp'
11
11
  $biomart_db_somatic_variation = 'rnorvegicus_snp_som'
12
- $ortholog_key = "rat_ensembl_gene"
12
+ $ortholog_key = "rnorvegicus_homolog_ensembl_gene"
13
13
 
14
14
  $biomart_lexicon = [
15
15
  [ 'Associated Gene Name' , "external_gene_id"],
@@ -547,13 +547,13 @@ end
547
547
  rule /^possible_ortholog_(.*)/ do |t|
548
548
  other = t.name.match(/ortholog_(.*)/)[1]
549
549
  other_key = Organism.ortholog_key(other).produce.read
550
- BioMart.tsv($biomart_db, $biomart_ensembl_gene, [["Ortholog Ensembl Gene ID", "inter_paralog_" + other_key]], [], nil, :keep_empty => false, :type => :flat, :filename => t.name, :namespace => Thread.current['namespace'])
550
+ BioMart.tsv($biomart_db, $biomart_ensembl_gene, [["Ortholog Ensembl Gene ID", "inter_paralog_" + other_key]], [], nil, :keep_empty => false, :type => :double, :filename => t.name, :namespace => Thread.current['namespace'])
551
551
  end
552
552
 
553
553
  rule /^ortholog_(.*)/ do |t|
554
554
  other = t.name.match(/ortholog_(.*)/)[1]
555
555
  other_key = Organism.ortholog_key(other).produce.read
556
- BioMart.tsv($biomart_db, $biomart_ensembl_gene, [["Ortholog Ensembl Gene ID", other_key]], [], nil, :keep_empty => false, :type => :flat, :filename => t.name, :namespace => Thread.current['namespace'])
556
+ BioMart.tsv($biomart_db, $biomart_ensembl_gene, [["Ortholog Ensembl Gene ID", other_key]], [], nil, :keep_empty => false, :type => :double, :filename => t.name, :namespace => Thread.current['namespace'])
557
557
  end
558
558
 
559
559
  rule /[a-z]{3}[0-9]{4}\/.*/i do |t|
@@ -728,13 +728,18 @@ file 'protein_sequence' => ["transcripts", "transcript_5utr", "transcript_3utr",
728
728
  transcript_sequence.through do |transcript, sequence|
729
729
  protein = transcript_protein[transcript]
730
730
  next if protein.nil? or protein.empty?
731
+
731
732
  utr5 = transcript_5utr[transcript]
732
733
  utr3 = transcript_3utr[transcript]
733
734
  phase = transcript_phase[transcript] || 0
735
+
734
736
  if phase < 0
735
- utr5 = - phase if utr5 == 0
737
+ if utr5.nil? || utr5 == 0 || utr5 == "0"
738
+ utr5 = 0
739
+ end
736
740
  phase = 0
737
741
  end
742
+
738
743
  psequence = Bio::Sequence::NA.new(("N" * phase) << sequence[utr5..sequence.length-utr3-1]).translate
739
744
  protein_sequence[protein]=psequence
740
745
  end
@@ -17,7 +17,10 @@ class TestEntrez < Test::Unit::TestCase
17
17
  def test_entrez2pubmed
18
18
  tax = $yeast_tax
19
19
 
20
+ Log.severity = 0
20
21
  data = Entrez.entrez2pubmed(tax)
22
+ data.read
23
+ Log.tsv data
21
24
  assert(data['850320'].include? '1574125')
22
25
  end
23
26
 
@@ -17,6 +17,12 @@ class TestPubMed < Test::Unit::TestCase
17
17
  pmid = '16438716'
18
18
  assert(PubMed.get_article(pmid).full_text =~ /Discovering/)
19
19
  end
20
+
21
+ def test_pmc_full_xml
22
+ pmid = '4304705'
23
+ assert PubMed.get_article(pmid).pmc_full_xml.include?("HBV antigen")
24
+ end
25
+
20
26
 
21
27
  def test_query
22
28
  assert(PubMed.query('chagoyen[All Fields] AND ("loattrfull text"[sb] AND hasabstract[text])').include? '16438716')
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-sources
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.1.43
4
+ version: 3.1.49
5
5
  platform: ruby
6
6
  authors:
7
7
  - Miguel Vazquez
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-04-01 00:00:00.000000000 Z
11
+ date: 2020-10-19 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rbbt-util