rbbt-sources 3.1.52 → 3.2.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/LICENSE +20 -0
- data/etc/allowed_biomart_archives +1 -0
- data/lib/rbbt/sources/clinvar.rb +4 -2
- data/lib/rbbt/sources/organism.rb +34 -24
- data/lib/rbbt/sources/pubmed.rb +20 -15
- data/lib/rbbt/sources/uniprot.rb +7 -0
- data/share/Ensembl/release_dates +3 -0
- data/share/install/Organism/organism_helpers.rb +3 -3
- data/test/rbbt/sources/test_pubmed.rb +6 -0
- metadata +6 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 973d827d476c42023692f813a588ad500da04dd27f6903e1da9c74673985b40e
|
4
|
+
data.tar.gz: ed995de19a62f17908c2b7ed3f327782c75ece60aa0f7c5f373cc9309228c8b5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cff68ce8c57381ec1581a6df61b56d40384b108030f87858ea1a735a82b8922b9d71a8b8a42930846f30186610058cbf738dccfb788ebf49e3f4d5e1b000809f
|
7
|
+
data.tar.gz: 771625cad70732c54cc2a1b798f6c0ac44004e73a9435e6699a7988a0904e987fecd9e375e6f888061b0f88a4c03ff3200322c59ca75a7f8f83dcdad8b19b156
|
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2010-2022 Miguel Vázquez García
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/lib/rbbt/sources/clinvar.rb
CHANGED
@@ -58,8 +58,10 @@ module ClinVar
|
|
58
58
|
require 'rbbt/workflow'
|
59
59
|
Workflow.require_workflow "Sequence"
|
60
60
|
variants = ClinVar.hg19.snv_summary.produce
|
61
|
-
muts = CMD.cmd(
|
62
|
-
consequence = Sequence.job(:mutated_isoforms_fast, "Clinvar", :mutations => muts, :non_synonymous => true, :organism => ClinVar.organism_hg19)
|
61
|
+
muts = CMD.cmd("cut -f 1 #{ variants.find }", :pipe => true)
|
62
|
+
consequence = Sequence.job(:mutated_isoforms_fast, "Clinvar", :mutations => muts, :non_synonymous => true, :organism => ClinVar.organism_hg19)
|
63
|
+
consequence.run
|
64
|
+
iif consequence
|
63
65
|
|
64
66
|
options = TSV.parse_header(variants).options.merge({:key_field => "Mutated Isoform"})
|
65
67
|
fields = options[:fields].length
|
@@ -72,9 +72,10 @@ module Organism
|
|
72
72
|
|
73
73
|
def self.hg_build(organism)
|
74
74
|
require 'rbbt/sources/ensembl_ftp'
|
75
|
+
organism = organism.strip
|
75
76
|
return organism if organism =~ /^hg\d\d$/
|
76
77
|
|
77
|
-
return
|
78
|
+
return organism unless organism =~ /\//
|
78
79
|
|
79
80
|
species, date = organism.split("/")
|
80
81
|
|
@@ -101,35 +102,43 @@ module Organism
|
|
101
102
|
end
|
102
103
|
end
|
103
104
|
|
104
|
-
def self.GRC_build(organism)
|
105
|
+
def self.GRC_build(organism, with_release = false)
|
105
106
|
require 'rbbt/sources/ensembl_ftp'
|
106
|
-
return organism if organism =~ /^
|
107
|
+
return organism if organism =~ /^GRC$/
|
107
108
|
|
108
|
-
|
109
|
+
if organism == "hg19" || organism == "b37"
|
110
|
+
return "GRCh37"
|
111
|
+
elsif organism == "hg38"
|
112
|
+
return "GRCh38"
|
113
|
+
end
|
109
114
|
|
110
|
-
|
115
|
+
return self.GRC_build(default_code(organism)) unless organism =~ /\//
|
111
116
|
|
112
|
-
|
113
|
-
when "Hsa"
|
114
|
-
date = organism.split("/")[1]
|
117
|
+
species, date = organism.split("/")
|
115
118
|
|
116
|
-
|
119
|
+
build = case species
|
120
|
+
when "Hsa"
|
121
|
+
date = organism.split("/")[1]
|
122
|
+
|
123
|
+
release = Ensembl.releases[date]
|
124
|
+
|
125
|
+
release_number = release.sub(/.*-/,'').to_i
|
126
|
+
if release_number <= 54
|
127
|
+
'GRCh36'
|
128
|
+
elsif release_number <= 75
|
129
|
+
'GRCh37'
|
130
|
+
else
|
131
|
+
'GRCh38'
|
132
|
+
end
|
133
|
+
when "Mmu"
|
134
|
+
"GRCm38"
|
135
|
+
when "Rno"
|
136
|
+
"Rnor_6.0"
|
137
|
+
else
|
138
|
+
raise "Only organism 'Hsa' (Homo sapiens) and Mmu (Mus musculus) supported"
|
139
|
+
end
|
117
140
|
|
118
|
-
|
119
|
-
if release_number <= 54
|
120
|
-
'GRCh36'
|
121
|
-
elsif release_number <= 75
|
122
|
-
'GRCh37'
|
123
|
-
else
|
124
|
-
'GRCh38'
|
125
|
-
end
|
126
|
-
when "Mmu"
|
127
|
-
"GRCm38"
|
128
|
-
when "Rno"
|
129
|
-
"Rnor_6.0"
|
130
|
-
else
|
131
|
-
raise "Only organism 'Hsa' (Homo sapiens) and Mmu (Mus musculus) supported"
|
132
|
-
end
|
141
|
+
(release_number && with_release) ? build + "." + release_number.to_s : build
|
133
142
|
end
|
134
143
|
|
135
144
|
def self.organism_for_build(build)
|
@@ -332,4 +341,5 @@ module Organism
|
|
332
341
|
|
333
342
|
chromosome_sizes
|
334
343
|
end
|
344
|
+
|
335
345
|
end
|
data/lib/rbbt/sources/pubmed.rb
CHANGED
@@ -1,5 +1,4 @@
|
|
1
1
|
require 'rbbt-util'
|
2
|
-
require 'libxml'
|
3
2
|
require 'rbbt/sources/gscholar'
|
4
3
|
require 'rbbt/util/filecache'
|
5
4
|
|
@@ -53,32 +52,38 @@ module PubMed
|
|
53
52
|
[lastname.gsub(/\s/,'_'), year || "NOYEAR", abrev] * ""
|
54
53
|
end
|
55
54
|
def self.parse_xml(xml)
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
55
|
+
require 'nokogiri'
|
56
|
+
|
57
|
+
#parser = LibXML::XML::Parser.string(xml)
|
58
|
+
#pubmed = parser.parse.find("/PubmedArticle").first
|
59
|
+
#medline = parser.find("MedlineCitation").first
|
60
|
+
#article = medline.find("Article").first
|
61
|
+
|
62
|
+
parser = Nokogiri.XML(xml)
|
63
|
+
medline = parser.search("MedlineCitation").first
|
64
|
+
article = medline.search("Article").first
|
60
65
|
|
61
66
|
info = {}
|
62
67
|
|
63
|
-
info[:pmid] = medline.
|
68
|
+
info[:pmid] = medline.search("PMID").first.content
|
64
69
|
|
65
70
|
XML_KEYS.each do |p|
|
66
71
|
name, key = p
|
67
|
-
|
72
|
+
nodes = article.search(key)
|
68
73
|
|
69
|
-
next if
|
74
|
+
next if nodes.nil? || nodes.empty?
|
70
75
|
|
71
|
-
info[name] =
|
76
|
+
info[name] = nodes.collect{|n| n.content } * "\n\n"
|
72
77
|
end
|
73
78
|
|
74
79
|
bibentry = nil
|
75
|
-
info[:author] = article.
|
80
|
+
info[:author] = article.search("AuthorList/Author").collect do |author|
|
76
81
|
begin
|
77
|
-
lastname = author.
|
78
|
-
if author.
|
82
|
+
lastname = author.search("LastName").first.content
|
83
|
+
if author.search("ForeName").first.nil?
|
79
84
|
forename = nil
|
80
85
|
else
|
81
|
-
forename = author.
|
86
|
+
forename = author.search("ForeName").first.content.split(/\s/).collect{|word| if word.length == 1; then word + '.'; else word; end} * " "
|
82
87
|
end
|
83
88
|
bibentry ||= make_bibentry lastname, info[:year], info[:title]
|
84
89
|
rescue
|
@@ -88,7 +93,7 @@ module PubMed
|
|
88
93
|
|
89
94
|
info[:bibentry] = bibentry.downcase if bibentry
|
90
95
|
|
91
|
-
info[:pmc_pdf] =
|
96
|
+
info[:pmc_pdf] = parser.search("PubmedData/ArticleIdList/ArticleId").select{|id| id[:IdType] == "pmc"}.first
|
92
97
|
|
93
98
|
if info[:pmc_pdf]
|
94
99
|
info[:pmc_pdf] = PMC_PDF_URL.sub(/PMCID/, info[:pmc_pdf].content)
|
@@ -270,7 +275,7 @@ module PubMed
|
|
270
275
|
result[pmid] = xml
|
271
276
|
end
|
272
277
|
|
273
|
-
ids.each{|id| next if id.nil? or result[id]; fid = id.sub(/^0+/,''); next unless result[fid]; result[id] = result[fid]}
|
278
|
+
ids.each{|id| next if id.nil? or result[id]; fid = String === id ? id.sub(/^0+/,'') : id; next unless result[fid]; result[id] = result[fid]}
|
274
279
|
ids.each{|id| next if id.nil? or result[id]; result[id] = ""}
|
275
280
|
|
276
281
|
result
|
data/lib/rbbt/sources/uniprot.rb
CHANGED
@@ -33,6 +33,8 @@ module UniProt
|
|
33
33
|
"Ensembl Transcript ID"
|
34
34
|
when "Ensembl_PRO"
|
35
35
|
"Ensembl Protein ID"
|
36
|
+
when "GeneID"
|
37
|
+
"Entrez Gene ID"
|
36
38
|
else
|
37
39
|
field
|
38
40
|
end
|
@@ -64,6 +66,11 @@ module UniProt
|
|
64
66
|
tsv.to_s
|
65
67
|
end
|
66
68
|
|
69
|
+
UniProt.claim UniProt.identifiers.Rno, :proc do
|
70
|
+
url = "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/RAT_10116_idmapping.dat.gz"
|
71
|
+
tsv = UniProt.get_organism_ids(url, "Rno")
|
72
|
+
tsv.to_s
|
73
|
+
end
|
67
74
|
|
68
75
|
UniProt.claim UniProt.annotated_variants, :proc do
|
69
76
|
url = "https://www.uniprot.org/docs/humsavar.txt"
|
data/share/Ensembl/release_dates
CHANGED
@@ -169,11 +169,11 @@ end
|
|
169
169
|
file 'lexicon' => 'identifiers' do |t|
|
170
170
|
tsv = TSV.open(t.prerequisites.first).slice(["Associated Gene Name", "Entrez Gene Name Synonyms"])
|
171
171
|
|
172
|
-
entrez_description = Rbbt.share.databases.entrez.gene_info.tsv :grep => $taxs.collect{|tax| "^#{tax}"}, :key_field => 1, :fields => 8
|
172
|
+
entrez_description = Rbbt.share.databases.entrez.gene_info.tsv :grep => $taxs.collect{|tax| "^#{tax}"}, :fixed_grep => false, :key_field => 1, :fields => 8
|
173
173
|
entrez_description.key_field = "Entrez Gene ID"
|
174
174
|
entrez_description.fields = ["Entrez Gene Description"]
|
175
175
|
|
176
|
-
tsv.attach entrez_description
|
176
|
+
tsv = tsv.attach entrez_description
|
177
177
|
Misc.sensiblewrite(t.name, tsv.to_s)
|
178
178
|
end
|
179
179
|
|
@@ -220,7 +220,7 @@ file 'transcript_cds' do |t|
|
|
220
220
|
end
|
221
221
|
|
222
222
|
file 'gene_positions' do |t|
|
223
|
-
sequences = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_gene_positions, [])
|
223
|
+
sequences = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_gene_positions, [], nil, :type => :list, :namespace => Thread.current['namespace'])
|
224
224
|
|
225
225
|
Misc.sensiblewrite(t.name, sequences.to_s)
|
226
226
|
end
|
@@ -12,6 +12,12 @@ class TestPubMed < Test::Unit::TestCase
|
|
12
12
|
pmids = ['16438716', 17204154]
|
13
13
|
assert(PubMed.get_article(pmids)[pmid].title == "Discovering semantic features in the literature: a foundation for building functional associations.")
|
14
14
|
end
|
15
|
+
|
16
|
+
def test_get_multi_abstract
|
17
|
+
pmid = "32141403"
|
18
|
+
|
19
|
+
assert PubMed.get_article(pmid).abstract.include?("This study shows PCOS patients are at increased risk of incident schizophrenia, and the metformin treatment has a protective effect against incident schizophrenia.")
|
20
|
+
end
|
15
21
|
|
16
22
|
def test_full_text
|
17
23
|
pmid = '16438716'
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt-sources
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.1
|
4
|
+
version: 3.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Miguel Vazquez
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-12-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rbbt-util
|
@@ -39,7 +39,7 @@ dependencies:
|
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
42
|
+
name: nokogiri
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
45
|
- - ">="
|
@@ -70,8 +70,10 @@ description: Data sources like PubMed, Entrez Gene, or Gene Ontology
|
|
70
70
|
email: miguel.vazquez@fdi.ucm.es
|
71
71
|
executables: []
|
72
72
|
extensions: []
|
73
|
-
extra_rdoc_files:
|
73
|
+
extra_rdoc_files:
|
74
|
+
- LICENSE
|
74
75
|
files:
|
76
|
+
- LICENSE
|
75
77
|
- etc/allowed_biomart_archives
|
76
78
|
- etc/biomart/missing_in_archive
|
77
79
|
- etc/build_organism
|