rbbt-sources 3.1.52 → 3.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE +20 -0
- data/etc/allowed_biomart_archives +1 -0
- data/lib/rbbt/sources/clinvar.rb +4 -2
- data/lib/rbbt/sources/organism.rb +34 -24
- data/lib/rbbt/sources/pubmed.rb +20 -15
- data/lib/rbbt/sources/uniprot.rb +7 -0
- data/share/Ensembl/release_dates +3 -0
- data/share/install/Organism/organism_helpers.rb +3 -3
- data/test/rbbt/sources/test_pubmed.rb +6 -0
- metadata +6 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 973d827d476c42023692f813a588ad500da04dd27f6903e1da9c74673985b40e
|
4
|
+
data.tar.gz: ed995de19a62f17908c2b7ed3f327782c75ece60aa0f7c5f373cc9309228c8b5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cff68ce8c57381ec1581a6df61b56d40384b108030f87858ea1a735a82b8922b9d71a8b8a42930846f30186610058cbf738dccfb788ebf49e3f4d5e1b000809f
|
7
|
+
data.tar.gz: 771625cad70732c54cc2a1b798f6c0ac44004e73a9435e6699a7988a0904e987fecd9e375e6f888061b0f88a4c03ff3200322c59ca75a7f8f83dcdad8b19b156
|
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2010-2022 Miguel Vázquez García
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/lib/rbbt/sources/clinvar.rb
CHANGED
@@ -58,8 +58,10 @@ module ClinVar
|
|
58
58
|
require 'rbbt/workflow'
|
59
59
|
Workflow.require_workflow "Sequence"
|
60
60
|
variants = ClinVar.hg19.snv_summary.produce
|
61
|
-
muts = CMD.cmd(
|
62
|
-
consequence = Sequence.job(:mutated_isoforms_fast, "Clinvar", :mutations => muts, :non_synonymous => true, :organism => ClinVar.organism_hg19)
|
61
|
+
muts = CMD.cmd("cut -f 1 #{ variants.find }", :pipe => true)
|
62
|
+
consequence = Sequence.job(:mutated_isoforms_fast, "Clinvar", :mutations => muts, :non_synonymous => true, :organism => ClinVar.organism_hg19)
|
63
|
+
consequence.run
|
64
|
+
iif consequence
|
63
65
|
|
64
66
|
options = TSV.parse_header(variants).options.merge({:key_field => "Mutated Isoform"})
|
65
67
|
fields = options[:fields].length
|
data/lib/rbbt/sources/organism.rb
CHANGED
@@ -72,9 +72,10 @@ module Organism
|
|
72
72
|
|
73
73
|
def self.hg_build(organism)
|
74
74
|
require 'rbbt/sources/ensembl_ftp'
|
75
|
+
organism = organism.strip
|
75
76
|
return organism if organism =~ /^hg\d\d$/
|
76
77
|
|
77
|
-
return
|
78
|
+
return organism unless organism =~ /\//
|
78
79
|
|
79
80
|
species, date = organism.split("/")
|
80
81
|
|
@@ -101,35 +102,43 @@ module Organism
|
|
101
102
|
end
|
102
103
|
end
|
103
104
|
|
104
|
-
def self.GRC_build(organism)
|
105
|
+
def self.GRC_build(organism, with_release = false)
|
105
106
|
require 'rbbt/sources/ensembl_ftp'
|
106
|
-
return organism if organism =~ /^
|
107
|
+
return organism if organism =~ /^GRC$/
|
107
108
|
|
108
|
-
|
109
|
+
if organism == "hg19" || organism == "b37"
|
110
|
+
return "GRCh37"
|
111
|
+
elsif organism == "hg38"
|
112
|
+
return "GRCh38"
|
113
|
+
end
|
109
114
|
|
110
|
-
|
115
|
+
return self.GRC_build(default_code(organism)) unless organism =~ /\//
|
111
116
|
|
112
|
-
|
113
|
-
when "Hsa"
|
114
|
-
date = organism.split("/")[1]
|
117
|
+
species, date = organism.split("/")
|
115
118
|
|
116
|
-
|
119
|
+
build = case species
|
120
|
+
when "Hsa"
|
121
|
+
date = organism.split("/")[1]
|
122
|
+
|
123
|
+
release = Ensembl.releases[date]
|
124
|
+
|
125
|
+
release_number = release.sub(/.*-/,'').to_i
|
126
|
+
if release_number <= 54
|
127
|
+
'GRCh36'
|
128
|
+
elsif release_number <= 75
|
129
|
+
'GRCh37'
|
130
|
+
else
|
131
|
+
'GRCh38'
|
132
|
+
end
|
133
|
+
when "Mmu"
|
134
|
+
"GRCm38"
|
135
|
+
when "Rno"
|
136
|
+
"Rnor_6.0"
|
137
|
+
else
|
138
|
+
raise "Only organism 'Hsa' (Homo sapiens) and Mmu (Mus musculus) supported"
|
139
|
+
end
|
117
140
|
|
118
|
-
|
119
|
-
if release_number <= 54
|
120
|
-
'GRCh36'
|
121
|
-
elsif release_number <= 75
|
122
|
-
'GRCh37'
|
123
|
-
else
|
124
|
-
'GRCh38'
|
125
|
-
end
|
126
|
-
when "Mmu"
|
127
|
-
"GRCm38"
|
128
|
-
when "Rno"
|
129
|
-
"Rnor_6.0"
|
130
|
-
else
|
131
|
-
raise "Only organism 'Hsa' (Homo sapiens) and Mmu (Mus musculus) supported"
|
132
|
-
end
|
141
|
+
(release_number && with_release) ? build + "." + release_number.to_s : build
|
133
142
|
end
|
134
143
|
|
135
144
|
def self.organism_for_build(build)
|
@@ -332,4 +341,5 @@ module Organism
|
|
332
341
|
|
333
342
|
chromosome_sizes
|
334
343
|
end
|
344
|
+
|
335
345
|
end
|
data/lib/rbbt/sources/pubmed.rb
CHANGED
@@ -1,5 +1,4 @@
|
|
1
1
|
require 'rbbt-util'
|
2
|
-
require 'libxml'
|
3
2
|
require 'rbbt/sources/gscholar'
|
4
3
|
require 'rbbt/util/filecache'
|
5
4
|
|
@@ -53,32 +52,38 @@ module PubMed
|
|
53
52
|
[lastname.gsub(/\s/,'_'), year || "NOYEAR", abrev] * ""
|
54
53
|
end
|
55
54
|
def self.parse_xml(xml)
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
55
|
+
require 'nokogiri'
|
56
|
+
|
57
|
+
#parser = LibXML::XML::Parser.string(xml)
|
58
|
+
#pubmed = parser.parse.find("/PubmedArticle").first
|
59
|
+
#medline = parser.find("MedlineCitation").first
|
60
|
+
#article = medline.find("Article").first
|
61
|
+
|
62
|
+
parser = Nokogiri.XML(xml)
|
63
|
+
medline = parser.search("MedlineCitation").first
|
64
|
+
article = medline.search("Article").first
|
60
65
|
|
61
66
|
info = {}
|
62
67
|
|
63
|
-
info[:pmid] = medline.
|
68
|
+
info[:pmid] = medline.search("PMID").first.content
|
64
69
|
|
65
70
|
XML_KEYS.each do |p|
|
66
71
|
name, key = p
|
67
|
-
|
72
|
+
nodes = article.search(key)
|
68
73
|
|
69
|
-
next if
|
74
|
+
next if nodes.nil? || nodes.empty?
|
70
75
|
|
71
|
-
info[name] =
|
76
|
+
info[name] = nodes.collect{|n| n.content } * "\n\n"
|
72
77
|
end
|
73
78
|
|
74
79
|
bibentry = nil
|
75
|
-
info[:author] = article.
|
80
|
+
info[:author] = article.search("AuthorList/Author").collect do |author|
|
76
81
|
begin
|
77
|
-
lastname = author.
|
78
|
-
if author.
|
82
|
+
lastname = author.search("LastName").first.content
|
83
|
+
if author.search("ForeName").first.nil?
|
79
84
|
forename = nil
|
80
85
|
else
|
81
|
-
forename = author.
|
86
|
+
forename = author.search("ForeName").first.content.split(/\s/).collect{|word| if word.length == 1; then word + '.'; else word; end} * " "
|
82
87
|
end
|
83
88
|
bibentry ||= make_bibentry lastname, info[:year], info[:title]
|
84
89
|
rescue
|
@@ -88,7 +93,7 @@ module PubMed
|
|
88
93
|
|
89
94
|
info[:bibentry] = bibentry.downcase if bibentry
|
90
95
|
|
91
|
-
info[:pmc_pdf] =
|
96
|
+
info[:pmc_pdf] = parser.search("PubmedData/ArticleIdList/ArticleId").select{|id| id[:IdType] == "pmc"}.first
|
92
97
|
|
93
98
|
if info[:pmc_pdf]
|
94
99
|
info[:pmc_pdf] = PMC_PDF_URL.sub(/PMCID/, info[:pmc_pdf].content)
|
@@ -270,7 +275,7 @@ module PubMed
|
|
270
275
|
result[pmid] = xml
|
271
276
|
end
|
272
277
|
|
273
|
-
ids.each{|id| next if id.nil? or result[id]; fid = id.sub(/^0+/,''); next unless result[fid]; result[id] = result[fid]}
|
278
|
+
ids.each{|id| next if id.nil? or result[id]; fid = String === id ? id.sub(/^0+/,'') : id; next unless result[fid]; result[id] = result[fid]}
|
274
279
|
ids.each{|id| next if id.nil? or result[id]; result[id] = ""}
|
275
280
|
|
276
281
|
result
|
data/lib/rbbt/sources/uniprot.rb
CHANGED
@@ -33,6 +33,8 @@ module UniProt
|
|
33
33
|
"Ensembl Transcript ID"
|
34
34
|
when "Ensembl_PRO"
|
35
35
|
"Ensembl Protein ID"
|
36
|
+
when "GeneID"
|
37
|
+
"Entrez Gene ID"
|
36
38
|
else
|
37
39
|
field
|
38
40
|
end
|
@@ -64,6 +66,11 @@ module UniProt
|
|
64
66
|
tsv.to_s
|
65
67
|
end
|
66
68
|
|
69
|
+
UniProt.claim UniProt.identifiers.Rno, :proc do
|
70
|
+
url = "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/RAT_10116_idmapping.dat.gz"
|
71
|
+
tsv = UniProt.get_organism_ids(url, "Rno")
|
72
|
+
tsv.to_s
|
73
|
+
end
|
67
74
|
|
68
75
|
UniProt.claim UniProt.annotated_variants, :proc do
|
69
76
|
url = "https://www.uniprot.org/docs/humsavar.txt"
|
data/share/Ensembl/release_dates
CHANGED
data/share/install/Organism/organism_helpers.rb
CHANGED
@@ -169,11 +169,11 @@ end
|
|
169
169
|
file 'lexicon' => 'identifiers' do |t|
|
170
170
|
tsv = TSV.open(t.prerequisites.first).slice(["Associated Gene Name", "Entrez Gene Name Synonyms"])
|
171
171
|
|
172
|
-
entrez_description = Rbbt.share.databases.entrez.gene_info.tsv :grep => $taxs.collect{|tax| "^#{tax}"}, :key_field => 1, :fields => 8
|
172
|
+
entrez_description = Rbbt.share.databases.entrez.gene_info.tsv :grep => $taxs.collect{|tax| "^#{tax}"}, :fixed_grep => false, :key_field => 1, :fields => 8
|
173
173
|
entrez_description.key_field = "Entrez Gene ID"
|
174
174
|
entrez_description.fields = ["Entrez Gene Description"]
|
175
175
|
|
176
|
-
tsv.attach entrez_description
|
176
|
+
tsv = tsv.attach entrez_description
|
177
177
|
Misc.sensiblewrite(t.name, tsv.to_s)
|
178
178
|
end
|
179
179
|
|
@@ -220,7 +220,7 @@ file 'transcript_cds' do |t|
|
|
220
220
|
end
|
221
221
|
|
222
222
|
file 'gene_positions' do |t|
|
223
|
-
sequences = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_gene_positions, [])
|
223
|
+
sequences = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_gene_positions, [], nil, :type => :list, :namespace => Thread.current['namespace'])
|
224
224
|
|
225
225
|
Misc.sensiblewrite(t.name, sequences.to_s)
|
226
226
|
end
|
data/test/rbbt/sources/test_pubmed.rb
CHANGED
@@ -12,6 +12,12 @@ class TestPubMed < Test::Unit::TestCase
|
|
12
12
|
pmids = ['16438716', 17204154]
|
13
13
|
assert(PubMed.get_article(pmids)[pmid].title == "Discovering semantic features in the literature: a foundation for building functional associations.")
|
14
14
|
end
|
15
|
+
|
16
|
+
def test_get_multi_abstract
|
17
|
+
pmid = "32141403"
|
18
|
+
|
19
|
+
assert PubMed.get_article(pmid).abstract.include?("This study shows PCOS patients are at increased risk of incident schizophrenia, and the metformin treatment has a protective effect against incident schizophrenia.")
|
20
|
+
end
|
15
21
|
|
16
22
|
def test_full_text
|
17
23
|
pmid = '16438716'
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt-sources
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.1.52
|
4
|
+
version: 3.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Miguel Vazquez
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-12-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rbbt-util
|
@@ -39,7 +39,7 @@ dependencies:
|
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
42
|
+
name: nokogiri
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
45
|
- - ">="
|
@@ -70,8 +70,10 @@ description: Data sources like PubMed, Entrez Gene, or Gene Ontology
|
|
70
70
|
email: miguel.vazquez@fdi.ucm.es
|
71
71
|
executables: []
|
72
72
|
extensions: []
|
73
|
-
extra_rdoc_files:
|
73
|
+
extra_rdoc_files:
|
74
|
+
- LICENSE
|
74
75
|
files:
|
76
|
+
- LICENSE
|
75
77
|
- etc/allowed_biomart_archives
|
76
78
|
- etc/biomart/missing_in_archive
|
77
79
|
- etc/build_organism
|