rbbt-sources 2.1.5 → 2.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,15 +1,7 @@
1
1
  ---
2
- !binary "U0hBMQ==":
3
- metadata.gz: !binary |-
4
- MTZiNDcxMzVmMzdkZjM3MzdiNzE5YTFmN2JiMjlkMmVhZWMzODliOA==
5
- data.tar.gz: !binary |-
6
- YWJlYTE4Y2M2YWM0ZjIxYTAxZTE4ZjExZmExNjQwYTJjNTg3NGVmZg==
2
+ SHA1:
3
+ metadata.gz: daf367338fb6e78d2cb7b76440e67712d27f34ab
4
+ data.tar.gz: 5b7a7308779ec4441fa2eb997d6f9b7f0dd37e3a
7
5
  SHA512:
8
- metadata.gz: !binary |-
9
- NDVjY2ViNTE4ZDliY2FiMzMwYjk1YzUxMWI4ODk2NWI2OTdiNmU1YTQ0MzMx
10
- ZTM0MjA5ZWFmZjlkMzAzMmRjYTBhOGVhMjM4Y2JhMmM2OThjMjQ1MDRkY2Vi
11
- YTFiOWYyNmEwMGZmMzg5MDFiNjQwMWNlNDVhODEwM2VjNTg0MTc=
12
- data.tar.gz: !binary |-
13
- ZTY3ZGFjM2E3ZmY0OThmZjZiNzI2OTAwNWNmZWZlYmI5ODRkMTEyY2IzODNm
14
- YmZkNjY3NTI2MjQzNjMzMTc4YjgzYjVkM2IwZjc0OTA0NWM0YzM1ZDUzMjU5
15
- ZTliYTNjZWY4YWMwMjUxMDFkMTRiMGRmNWRkNWQyNjBjYjgwYzE=
6
+ metadata.gz: bb568b0d788284e82d0ac0d9cdbd14db7c0e59b4977ddce57e2701f25ca18bbef93d43424179a188f73daaacc87963d039a17aaf0916872945f2d384e6441552
7
+ data.tar.gz: b24f422176f10f518f692a7878c2389df0276df27a074feb5918bae1993f860fae4558d330f95de66a7857da712b5c811e487c0b117f553106215d1065f856af
@@ -1,6 +1,7 @@
1
- require 'rbbt'
1
+ require 'rbbt-util'
2
2
  require 'rbbt/tsv'
3
3
  require 'rbbt/resource'
4
+ require 'rbbt/util/filecache'
4
5
  require 'rbbt/bow/bow'
5
6
  require 'set'
6
7
 
@@ -70,85 +71,44 @@ module Entrez
70
71
 
71
72
  private
72
73
 
73
- def self.get_online(geneids)
74
74
 
75
- genes_complete = geneids.is_a?(Array) ? geneids : [geneids]
75
+ def self.get_gene(geneids)
76
+ _array = Array === geneids
76
77
 
77
- genes = []
78
- Misc.divide(genes_complete, (genes_complete.length / 100) + 1).each do |geneids_list|
79
- begin
80
- Misc.try3times do
81
- url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&retmode=xml&id=#{geneids_list * ","}"
78
+ geneids = [geneids] unless Array === geneids
79
+ geneids = geneids.compact.collect{|id| id}
82
80
 
83
- xml = Open.read(url, :wget_options => {:quiet => true}, :nocache => true)
81
+ result_files = FileCache.cache_online_elements(geneids, 'gene-{ID}.xml') do |ids|
82
+ result = {}
83
+ values = []
84
+ Misc.divide(ids, (ids.length / 100) + 1).each do |list|
85
+ begin
86
+ Misc.try3times do
87
+ url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&retmode=xml&id=#{list * ","}"
84
88
 
85
- genes += xml.scan(/(<Entrezgene>.*?<\/Entrezgene>)/sm).flatten
89
+ xml = Open.read(url, :wget_options => {:quiet => true}, :nocache => true)
90
+
91
+ values += xml.scan(/(<Entrezgene>.*?<\/Entrezgene>)/sm).flatten
92
+ end
93
+ rescue
94
+ Log.error $!.message
86
95
  end
87
- rescue
88
- puts $!.message
89
- genes += geneids_list.collect{|g| nil}
90
96
  end
91
- end
92
97
 
93
- if geneids.is_a? Array
94
- list = Hash[*genes_complete.zip([nil]).flatten]
95
- genes.each{|gene|
98
+ values.each do |xml|
96
99
  geneid = gene.match(/<Gene-track_geneid>(\d+)/)[1]
97
- geneid = geneid.to_i unless list.include? geneid
98
- list[geneid] = gene
99
- }
100
- return list
101
- else
102
- return genes.first
100
+
101
+ result[geneid] = xml
102
+ end
103
103
  end
104
- end
105
104
 
106
- public
105
+ genes = {}
106
+ geneids.each{|id| genes[id] = Gene.new(Open.read(result_files[id])) }
107
107
 
108
- def self.gene_filename(id)
109
- 'gene-' + id.to_s + '.xml'
110
- end
111
-
112
- def self.get_gene(geneid)
113
- return nil if geneid.nil?
114
-
115
- if Array === geneid
116
- missing = []
117
- list = {}
118
-
119
- geneid.each{|p|
120
- next if p.nil?
121
- if FileCache.found(gene_filename p)
122
- list[p] = Gene.new(Open.read(FileCache.path(gene_filename p)))
123
- else
124
- missing << p
125
- end
126
- }
127
-
128
-
129
- return list unless missing.any?
130
- genes = get_online(missing)
131
-
132
- genes.each{|p, xml|
133
- filename = gene_filename p
134
- FileCache.add(filename,xml) unless FileCache.found(filename)
135
- list[p] = Gene.new(xml)
136
- }
137
-
138
- return list
108
+ if _array
109
+ genes
139
110
  else
140
- filename = gene_filename geneid
141
-
142
-
143
- if FileCache.found(filename)
144
- return Gene.new(Open.read(FileCache.path(filename)))
145
- else
146
- xml = get_online(geneid)
147
-
148
- FileCache.add(filename, xml) unless FileCache.found(filename)
149
-
150
- return Gene.new(xml)
151
- end
111
+ genes.values.first
152
112
  end
153
113
  end
154
114
 
@@ -144,6 +144,12 @@ module Organism
144
144
  }.first
145
145
  end
146
146
 
147
+ def self.organism_code(name)
148
+ organisms.select{|organism|
149
+ organism == name or Organism.scientific_name(organism) =~ /#{ name }/i
150
+ }.first
151
+ end
152
+
147
153
  def self.known_ids(name)
148
154
  TSV::Parser.new(Organism.identifiers(name).open).all_fields
149
155
  end
@@ -1,5 +1,6 @@
1
- require 'rbbt'
1
+ require 'rbbt-util'
2
2
  require 'rbbt/util/open'
3
+ require 'rbbt/util/filecache'
3
4
  require 'rbbt/resource'
4
5
  require 'rbbt/sources/cath'
5
6
  require 'rbbt/sources/uniprot'
@@ -32,12 +33,78 @@ module UniProt
32
33
  tsv.to_s
33
34
  end
34
35
 
35
-
36
36
  UNIPROT_TEXT="http://www.uniprot.org/uniprot/[PROTEIN].txt"
37
37
  UNIPROT_FASTA="http://www.uniprot.org/uniprot/[PROTEIN].fasta"
38
+
39
+ def self.get_uniprot_entry(uniprotids)
40
+ _array = Array === uniprotids
41
+
42
+ uniprotids = [uniprotids] unless Array === uniprotids
43
+ uniprotids = uniprotids.compact.collect{|id| id}
44
+
45
+ result_files = FileCache.cache_online_elements(uniprotids, 'uniprot-{ID}.xml') do |ids|
46
+ result = {}
47
+ ids.each do |id|
48
+ begin
49
+ Misc.try3times do
50
+
51
+ content = Open.read(UNIPROT_TEXT.sub("[PROTEIN]", id), :wget_options => {:quiet => true}, :nocache => true)
52
+
53
+ result[id] = content
54
+ end
55
+ rescue
56
+ Log.error $!.message
57
+ end
58
+ end
59
+ result
60
+ end
61
+
62
+ uniprots = {}
63
+ uniprotids.each{|id| uniprots[id] = Open.read(result_files[id]) }
64
+
65
+ if _array
66
+ uniprots
67
+ else
68
+ uniprots.values.first
69
+ end
70
+ end
71
+
72
+ def self.get_uniprot_sequence(uniprotids)
73
+ _array = Array === uniprotids
74
+
75
+ uniprotids = [uniprotids] unless Array === uniprotids
76
+ uniprotids = uniprotids.compact.collect{|id| id}
77
+
78
+ result_files = FileCache.cache_online_elements(uniprotids, 'uniprot-sequence-{ID}') do |ids|
79
+ result = {}
80
+ ids.each do |id|
81
+ begin
82
+ Misc.try3times do
83
+
84
+ url = UNIPROT_FASTA.sub "[PROTEIN]", id
85
+ text = Open.read(url, :nocache => true)
86
+
87
+ result[id] = text.split(/\n/).select{|line| line !~ /^>/} * ""
88
+ end
89
+ rescue
90
+ Log.error $!.message
91
+ end
92
+ end
93
+ result
94
+ end
95
+
96
+ uniprots = {}
97
+ uniprotids.each{|id| uniprots[id] = Open.read(result_files[id]) }
98
+
99
+ if _array
100
+ uniprots
101
+ else
102
+ uniprots.values.first
103
+ end
104
+ end
105
+
38
106
  def self.pdbs(protein)
39
- url = UNIPROT_TEXT.sub "[PROTEIN]", protein
40
- text = Open.read(url)
107
+ text = get_uniprot_entry(protein)
41
108
 
42
109
  pdb = {}
43
110
 
@@ -59,15 +126,11 @@ module UniProt
59
126
  end
60
127
 
61
128
  def self.sequence(protein)
62
- url = UNIPROT_FASTA.sub "[PROTEIN]", protein
63
- text = Open.read(url)
64
-
65
- text.split(/\n/).select{|line| line !~ /^>/} * ""
129
+ get_uniprot_sequence(protein)
66
130
  end
67
131
 
68
132
  def self.features(protein)
69
- url = UNIPROT_TEXT.sub "[PROTEIN]", protein
70
- text = Open.read(url)
133
+ text = get_uniprot_entry(protein)
71
134
 
72
135
  text = text.split(/\n/).select{|line| line =~ /^FT/} * "\n"
73
136
 
@@ -78,7 +141,6 @@ module UniProt
78
141
 
79
142
  type = nil
80
143
  parts.each do |part|
81
- parts
82
144
  if part[0..1] == "FT"
83
145
  type = part.gsub(/FT\s+/,'')
84
146
  next
@@ -111,8 +173,7 @@ module UniProt
111
173
 
112
174
 
113
175
  def self.variants(protein)
114
- url = UNIPROT_TEXT.sub "[PROTEIN]", protein
115
- text = Open.read(url)
176
+ text = get_uniprot_entry(protein)
116
177
 
117
178
  text = text.split(/\n/).select{|line| line =~ /^FT/} * "\n"
118
179
 
@@ -157,8 +218,7 @@ module UniProt
157
218
  end
158
219
 
159
220
  def self.cath(protein)
160
- url = UNIPROT_TEXT.sub "[PROTEIN]", protein
161
- text = Open.read(url)
221
+ text = get_uniprot_entry(protein)
162
222
 
163
223
  cath = {}
164
224
  text.split(/\n/).each{|l|
@@ -21,21 +21,12 @@ class TestEntrez < Test::Unit::TestCase
21
21
  assert(data['850320'].include? '1574125')
22
22
  end
23
23
 
24
- def test_getonline
25
- geneids = 9129
26
-
27
- assert_match(/PRP3 pre-mRNA processing factor/s, Entrez.get_online(geneids))
28
-
29
- geneids = [9129,9]
30
- assert_match(/PRP3 pre-mRNA processing factor/s, Entrez.get_online(geneids)[9129])
31
- end
32
-
33
24
  def test_getgene
34
25
  geneids = 9129
35
- assert_equal([["PRP3 pre-mRNA processing factor 3 homolog (S. cerevisiae)"]], Entrez.get_gene(geneids).description)
26
+ assert_equal([["pre-mRNA processing factor 3"]], Entrez.get_gene(geneids).description)
36
27
 
37
28
  geneids = [9129, 728049]
38
- assert_equal([["PRP3 pre-mRNA processing factor 3 homolog (S. cerevisiae)"]], Entrez.get_gene(geneids)[9129].description)
29
+ assert_equal([["pre-mRNA processing factor 3"]], Entrez.get_gene(geneids)[9129].description)
39
30
  end
40
31
 
41
32
  def test_similarity
@@ -5,37 +5,37 @@ require 'rbbt/sources/ensembl_ftp'
5
5
 
6
6
  class TestOrganism < Test::Unit::TestCase
7
7
 
8
- def test_known_ids
8
+ def _test_known_ids
9
9
  assert Organism.known_ids("Hsa").include?("Associated Gene Name")
10
10
  end
11
11
 
12
- def test_location
12
+ def _test_location
13
13
  assert_equal "share/organisms/Sce/identifiers", Organism.identifiers('Sce')
14
14
  end
15
15
 
16
- def test_identifiers
16
+ def _test_identifiers
17
17
  assert Organism.identifiers('Hsa').tsv(:key_field => "Entrez Gene ID", :persist => true)['1020']["Associated Gene Name"].include?('CDK5')
18
18
  assert Organism.identifiers('Sce').tsv(:persist => true)['S000006120']["Ensembl Gene ID"].include?('YPL199C')
19
19
  assert Organism.identifiers("Sce").tsv(:persist => true)['S000006120']["Ensembl Gene ID"].include?('YPL199C')
20
20
  end
21
21
 
22
- def test_lexicon
22
+ def _test_lexicon
23
23
  assert TSV.open(Organism.lexicon('Sce'))['S000006120'].flatten.include?('YPL199C')
24
24
  end
25
25
 
26
- def test_guess_id
26
+ def _test_guess_id
27
27
  ensembl = %w(YOL044W YDR289C YAL034C YGR246C ARS519 tH(GUG)E2 YDR218C YLR002C YGL224C)
28
28
  gene_name = %w(SNR64 MIP1 MRPS18 TFB2 JEN1 IVY1 TRS33 GAS3)
29
29
  assert_equal "Associated Gene Name", Organism.guess_id("Sce", gene_name).first
30
30
  assert_equal "Ensembl Gene ID", Organism.guess_id("Sce", ensembl).first
31
31
  end
32
32
 
33
- def test_organisms
33
+ def _test_organisms
34
34
  assert Organism.organisms.include? "Hsa"
35
35
  assert_equal "Hsa", Organism.organism("Homo sapiens")
36
36
  end
37
37
 
38
- def test_attach_translations
38
+ def _test_attach_translations
39
39
  tsv = TSV.setup({"1020" => []}, :type => :list)
40
40
  tsv.key_field = "Entrez Gene ID"
41
41
  tsv.fields = []
@@ -47,7 +47,7 @@ class TestOrganism < Test::Unit::TestCase
47
47
  assert_equal "CDK5", tsv["1020"]["Associated Gene Name"]
48
48
  end
49
49
 
50
- def test_entrez_taxids
50
+ def _test_entrez_taxids
51
51
  assert_equal "Hsa", Organism.entrez_taxid_organism('9606')
52
52
  end
53
53
 
@@ -61,22 +61,22 @@ class TestOrganism < Test::Unit::TestCase
61
61
  assert_equal mutation_19, Organism.liftOver([mutation_18], target_build, source_build).first
62
62
  end
63
63
 
64
- def test_orhtolog
64
+ def _test_orhtolog
65
65
  require 'rbbt/entity/gene'
66
66
  assert_equal ["ENSG00000133703"], Gene.setup("Kras", "Associated Gene Name", "Mmu/jun2011").ensembl.ortholog("Hsa/jun2011")
67
67
  end
68
68
 
69
- #def test_genes_at_chromosome
69
+ #def _test_genes_at_chromosome
70
70
  # pos = [12, 117799500]
71
71
  # assert_equal "ENSG00000089250", Organism::Hsa.genes_at_chromosome_positions(pos.first, pos.last)
72
72
  #end
73
73
 
74
- #def test_genes_at_chromosome_array
74
+ #def _test_genes_at_chromosome_array
75
75
  # pos = [12, [117799500, 106903900]]
76
76
  # assert_equal ["ENSG00000089250", "ENSG00000013503"], Organism::Hsa.genes_at_chromosome_positions(pos.first, pos.last)
77
77
  #end
78
78
 
79
- #def test_genes_at_genomic_positions
79
+ #def _test_genes_at_genomic_positions
80
80
  # pos = [[12, 117799500], [12, 106903900], [1, 115259500]]
81
81
  # assert_equal ["ENSG00000089250", "ENSG00000013503", "ENSG00000213281"], Organism::Hsa.genes_at_genomic_positions(pos)
82
82
  #end
metadata CHANGED
@@ -1,83 +1,83 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-sources
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.1.5
4
+ version: 2.1.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Miguel Vazquez
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-11-25 00:00:00.000000000 Z
11
+ date: 2014-02-21 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rbbt-util
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - ! '>='
17
+ - - ">="
18
18
  - !ruby/object:Gem::Version
19
19
  version: 4.0.0
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - ! '>='
24
+ - - ">="
25
25
  - !ruby/object:Gem::Version
26
26
  version: 4.0.0
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: rbbt-text
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - ! '>='
31
+ - - ">="
32
32
  - !ruby/object:Gem::Version
33
33
  version: '0'
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - ! '>='
38
+ - - ">="
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
41
  - !ruby/object:Gem::Dependency
42
- name: libxml-ruby
42
+ name: mechanize
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
- - - ! '>='
45
+ - - ">="
46
46
  - !ruby/object:Gem::Version
47
47
  version: '0'
48
48
  type: :runtime
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
- - - ! '>='
52
+ - - ">="
53
53
  - !ruby/object:Gem::Version
54
54
  version: '0'
55
55
  - !ruby/object:Gem::Dependency
56
- name: bio
56
+ name: libxml-ruby
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
- - - ! '>='
59
+ - - ">="
60
60
  - !ruby/object:Gem::Version
61
61
  version: '0'
62
62
  type: :runtime
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
- - - ! '>='
66
+ - - ">="
67
67
  - !ruby/object:Gem::Version
68
68
  version: '0'
69
69
  - !ruby/object:Gem::Dependency
70
- name: mechanize
70
+ name: bio
71
71
  requirement: !ruby/object:Gem::Requirement
72
72
  requirements:
73
- - - ! '>='
73
+ - - ">="
74
74
  - !ruby/object:Gem::Version
75
75
  version: '0'
76
76
  type: :runtime
77
77
  prerelease: false
78
78
  version_requirements: !ruby/object:Gem::Requirement
79
79
  requirements:
80
- - - ! '>='
80
+ - - ">="
81
81
  - !ruby/object:Gem::Version
82
82
  version: '0'
83
83
  description: Data sources like PubMed, Entrez Gene, or Gene Ontology
@@ -88,7 +88,6 @@ extra_rdoc_files: []
88
88
  files:
89
89
  - etc/allowed_biomart_archives
90
90
  - etc/biomart/missing_in_archive
91
- - lib/rbbt/sources/COSMIC.rb
92
91
  - lib/rbbt/sources/COSTART.rb
93
92
  - lib/rbbt/sources/CTCAE.rb
94
93
  - lib/rbbt/sources/HPRD.rb
@@ -100,11 +99,9 @@ files:
100
99
  - lib/rbbt/sources/bibtex.rb
101
100
  - lib/rbbt/sources/biomart.rb
102
101
  - lib/rbbt/sources/cath.rb
103
- - lib/rbbt/sources/dbSNP.rb
104
102
  - lib/rbbt/sources/ensembl.rb
105
103
  - lib/rbbt/sources/ensembl_ftp.rb
106
104
  - lib/rbbt/sources/entrez.rb
107
- - lib/rbbt/sources/genomes1000.rb
108
105
  - lib/rbbt/sources/go.rb
109
106
  - lib/rbbt/sources/gscholar.rb
110
107
  - lib/rbbt/sources/jochem.rb
@@ -143,25 +140,25 @@ require_paths:
143
140
  - lib
144
141
  required_ruby_version: !ruby/object:Gem::Requirement
145
142
  requirements:
146
- - - ! '>='
143
+ - - ">="
147
144
  - !ruby/object:Gem::Version
148
145
  version: '0'
149
146
  required_rubygems_version: !ruby/object:Gem::Requirement
150
147
  requirements:
151
- - - ! '>='
148
+ - - ">="
152
149
  - !ruby/object:Gem::Version
153
150
  version: '0'
154
151
  requirements: []
155
152
  rubyforge_project:
156
- rubygems_version: 2.2.0
153
+ rubygems_version: 2.2.1
157
154
  signing_key:
158
155
  specification_version: 4
159
156
  summary: Data sources for the Ruby Bioinformatics Toolkit (rbbt)
160
157
  test_files:
161
- - test/rbbt/sources/test_go.rb
162
- - test/rbbt/sources/test_entrez.rb
158
+ - test/rbbt/sources/test_pubmed.rb
163
159
  - test/rbbt/sources/test_biomart.rb
164
160
  - test/rbbt/sources/test_gscholar.rb
161
+ - test/rbbt/sources/test_entrez.rb
162
+ - test/rbbt/sources/test_go.rb
165
163
  - test/rbbt/sources/test_organism.rb
166
- - test/rbbt/sources/test_pubmed.rb
167
164
  - test/test_helper.rb
@@ -1,153 +0,0 @@
1
- require 'rbbt'
2
- require 'rbbt/resource'
3
-
4
- module COSMIC
5
- extend Resource
6
- self.subdir = "share/databases/COSMIC"
7
-
8
- COSMIC.claim COSMIC.mutations, :proc do
9
- url = "ftp://ftp.sanger.ac.uk/pub/CGP/cosmic/data_export/CosmicCompleteExport_v67_241013.tsv.gz"
10
-
11
- stream = CMD.cmd('awk \'BEGIN{FS="\t"} { if ($12 != "" && $12 != "Mutation ID") { sub($12, "COSM" $12 ":" $4)}; print}\'', :in => Open.open(url), :pipe => true)
12
- tsv = TSV.open(stream, :type => :list, :header_hash => "", :key_field => "Mutation ID", :namespace => "Hsa/jun2011")
13
- tsv.fields = tsv.fields.collect{|f| f == "Gene name" ? "Associated Gene Name" : f}
14
- tsv.add_field "Genomic Mutation" do |mid, values|
15
- position = values["Mutation GRCh37 genome position"]
16
- cds = values["Mutation CDS"]
17
-
18
- if position.nil? or position.empty?
19
- nil
20
- else
21
- position = position.split("-").first
22
-
23
- chr, pos = position.split(":")
24
- chr = "X" if chr == "23"
25
- chr = "Y" if chr == "24"
26
- chr = "M" if chr == "25"
27
- position = [chr, pos ] * ":"
28
-
29
- if cds.nil?
30
- position
31
- else
32
- change = case
33
- when cds =~ />/
34
- cds.split(">").last
35
- when cds =~ /del/
36
- deletion = cds.split("del").last
37
- case
38
- when deletion =~ /^\d+$/
39
- "-" * deletion.to_i
40
- when deletion =~ /^[ACTG]+$/i
41
- "-" * deletion.length
42
- else
43
- Log.debug "Unknown deletion: #{ deletion }"
44
- deletion
45
- end
46
- when cds =~ /ins/
47
- insertion = cds.split("ins").last
48
- case
49
- when insertion =~ /^\d+$/
50
- "+" + "N" * insertion.to_i
51
- when insertion =~ /^[NACTG]+$/i
52
- "+" + insertion
53
- else
54
- Log.debug "Unknown insertion: #{insertion }"
55
- insertion
56
- end
57
- else
58
- Log.debug "Unknown change: #{cds}"
59
- "?(" << cds << ")"
60
- end
61
- position + ":" + change
62
- end
63
- end
64
- end
65
-
66
- tsv.to_s.gsub(/(\d)-(\d)/,'\1:\2')
67
- end
68
-
69
- COSMIC.claim COSMIC.mutations_hg18, :proc do |filename|
70
- require 'rbbt/sources/organism'
71
- file = COSMIC.mutations.open
72
- begin
73
-
74
- while (line = file.gets) !~ /Genomic Mutation/; end
75
- fields = line[1..-2].split("\t")
76
- mutation_pos = fields.index "Genomic Mutation"
77
-
78
- mutations = CMD.cmd("grep -v '^#'|cut -f #{mutation_pos + 1}|sort -u", :in => COSMIC.mutations.open).read.split("\n").select{|m| m.include? ":" }
79
-
80
- translations = Misc.process_to_hash(mutations){|mutations| Organism.liftOver(mutations, "Hsa/jun2011", "Hsa/may2009")}
81
-
82
- File.open(filename, 'w') do |f|
83
- f.puts "#: :type=:list#:namespace=Hsa/may2009"
84
- f.puts "#" + fields * "\t"
85
- while line = file.gets do
86
- next if line[0] == "#"[0]
87
- line.strip!
88
- parts = line.split("\t")
89
- parts[mutation_pos] = translations[parts[mutation_pos]]
90
- f.puts parts * "\t"
91
- end
92
- end
93
- rescue Exception
94
- FileUtils.rm filename if File.exists? filename
95
- raise $!
96
- ensure
97
- file.close
98
- end
99
-
100
- nil
101
- end
102
-
103
-
104
- def self.rsid_index(organism, chromosome = nil)
105
- build = Organism.hg_build(organism)
106
-
107
- tag = [build, chromosome] * ":"
108
- fwt = nil
109
- Persist.persist("StaticPosIndex for COSMIC [#{ tag }]", :fwt, :persist => true) do
110
- value_size = 0
111
- file = COSMIC[build == "hg19" ? "mutations" : "mutations_hg18"]
112
- chr_positions = []
113
- begin
114
- Open.read(CMD.cmd("grep '\t#{chromosome}:'", :in => file.open, :pipe => true)) do |line|
115
- next if line[0] == "#"[0]
116
- rsid, mutation = line.split("\t").values_at 0, 25
117
- next if mutation.nil? or mutation.empty?
118
- chr, pos = mutation.split(":")
119
- next if chr != chromosome or pos.nil? or pos.empty?
120
- chr_positions << [rsid, pos.to_i]
121
- value_size = rsid.length if rsid.length > value_size
122
- end
123
- rescue
124
- end
125
- fwt = FixWidthTable.new :memory, value_size
126
- fwt.add_point(chr_positions)
127
- fwt
128
- end
129
- end
130
-
131
- def self.mutation_index(organism)
132
- build = Organism.hg_build(organism)
133
- file = COSMIC[build == "hg19" ? "mutations" : "mutations_hg18"]
134
- @mutation_index ||= {}
135
- @mutation_index[build] ||= file.tsv :persist => true, :fields => ["Genomic Mutation"], :type => :single, :persist => true
136
- end
137
-
138
-
139
- end
140
-
141
- if defined? Entity
142
- if defined? Gene and Entity === Gene
143
- module Gene
144
- property :COSMIC_rsids => :single2array do
145
- COSMIC.rsid_index(organism, chromosome)[self.chr_range]
146
- end
147
-
148
- property :COSMIC_mutations => :single2array do
149
- GenomicMutation.setup(COSMIC.mutation_index(organism).values_at(*self.COSMIC_rsids).uniq, "COSMIC mutations over #{self.name || self}", organism, false)
150
- end
151
- end
152
- end
153
- end
@@ -1,194 +0,0 @@
1
- require 'rbbt'
2
- require 'rbbt/util/open'
3
- require 'rbbt/resource'
4
- require 'net/ftp'
5
-
6
- module DbSNP
7
- extend Resource
8
- self.subdir = "share/databases/dbSNP"
9
-
10
- URL = "ftp://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606/VCF/common_all.vcf.gz"
11
-
12
- DbSNP.claim DbSNP.mutations_ncbi, :proc do
13
- tsv = TSV.setup({}, :key_field => "RS ID", :fields => ["Genomic Mutation"], :type => :flat)
14
- file = Open.open(URL, :nocache => true)
15
- while line = file.gets do
16
- next if line[0] == "#"[0]
17
- chr, position, id, ref, alt = line.split "\t"
18
-
19
- mutations = alt.split(",").collect do |a|
20
- if alt[0] == ref[0]
21
- alt[0] = '+'[0]
22
- end
23
- [chr, position, alt] * ":"
24
- end
25
-
26
- tsv.namespace = "Hsa/may2012"
27
- tsv[id] = mutations
28
- end
29
-
30
- tsv.to_s
31
- end
32
-
33
- DbSNP.claim DbSNP.rsids, :proc do |filename|
34
- ftp = Net::FTP.new('ftp.broadinstitute.org')
35
- ftp.passive = true
36
- ftp.login('gsapubftp-anonymous', 'devnull@nomail.org')
37
- ftp.chdir('/bundle/2.3/hg19')
38
-
39
- tmpfile = TmpFile.tmp_file + '.gz'
40
- ftp.getbinaryfile('dbsnp_137.hg19.vcf.gz', tmpfile, 1024)
41
-
42
- file = Open.open(tmpfile, :nocache => true)
43
- begin
44
- File.open(filename, 'w') do |f|
45
- f.puts "#: :type=:list#:namespace=Hsa/may2012"
46
- f.puts "#" + ["RS ID", "GMAF", "G5", "G5A", "dbSNP Build ID"] * "\t"
47
- while line = file.gets do
48
- next if line[0] == "#"[0]
49
-
50
- chr, position, id, ref, muts, qual, filter, info = line.split "\t"
51
-
52
- g5 = g5a = dbsnp_build_id = gmaf = nil
53
-
54
- gmaf = $1 if info =~ /GMAF=([0-9.]+)/
55
- g5 = true if info =~ /\bG5\b/
56
- g5a = true if info =~ /\bG5A\b/
57
- dbsnp_build_id = $1 if info =~ /dbSNPBuildID=(\d+)/
58
-
59
- f.puts [id, gmaf, g5, g5a, dbsnp_build_id] * "\t"
60
- end
61
- end
62
- rescue Exception
63
- FileUtils.rm filename if File.exists? filename
64
- raise $!
65
- ensure
66
- file.close
67
- FileUtils.rm tmpfile
68
- end
69
-
70
- nil
71
- end
72
-
73
- DbSNP.claim DbSNP.mutations, :proc do |filename|
74
- ftp = Net::FTP.new('ftp.broadinstitute.org')
75
- ftp.passive = true
76
- ftp.login('gsapubftp-anonymous', 'devnull@nomail.org')
77
- ftp.chdir('/bundle/2.3/hg19')
78
-
79
- tmpfile = TmpFile.tmp_file + '.gz'
80
- ftp.getbinaryfile('dbsnp_137.hg19.vcf.gz', tmpfile, 1024)
81
-
82
- file = Open.open(tmpfile, :nocache => true)
83
- begin
84
- File.open(filename, 'w') do |f|
85
- f.puts "#: :type=:flat#:namespace=Hsa/may2012"
86
- f.puts "#" + ["RS ID", "Genomic Mutation"] * "\t"
87
- while line = file.gets do
88
- next if line[0] == "#"[0]
89
-
90
- chr, position, id, ref, muts, qual, filter, info = line.split "\t"
91
-
92
- chr.sub!('chr', '')
93
-
94
- position, muts = Misc.correct_vcf_mutation(position.to_i, ref, muts)
95
-
96
- mutations = muts.collect{|mut| [chr, position, mut] * ":" }
97
-
98
- f.puts ([id] + mutations) * "\t"
99
- end
100
- end
101
- rescue Exception
102
- FileUtils.rm filename if File.exists? filename
103
- raise $!
104
- ensure
105
- file.close
106
- FileUtils.rm tmpfile
107
- end
108
-
109
- nil
110
- end
111
-
112
- DbSNP.claim DbSNP.mutations_hg18, :proc do |filename|
113
- require 'rbbt/sources/organism'
114
-
115
- mutations = CMD.cmd("grep -v '^#'|cut -f 2|sort -u", :in => DbSNP.mutations.open).read.split("\n").collect{|l| l.split("|")}.flatten
116
-
117
- translations = Misc.process_to_hash(mutations){|mutations| Organism.liftOver(mutations, "Hsa/jun2011", "Hsa/may2009")}
118
- begin
119
- file = Open.open(DbSNP.mutations.find, :nocache => true)
120
- File.open(filename, 'w') do |f|
121
- f.puts "#: :type=:flat#:namespace=Hsa/may2009"
122
- f.puts "#" + ["RS ID", "Genomic Mutation"] * "\t"
123
- while line = file.gets do
124
- next if line[0] == "#"[0]
125
- parts = line.split("\t")
126
- parts[1..-1] = parts[1..-1].collect{|p| translations[p]} * "|"
127
- f.puts parts * "\t"
128
- end
129
- end
130
- rescue Exception
131
- FileUtils.rm filename if File.exists? filename
132
- raise $!
133
- ensure
134
- file.close
135
- end
136
-
137
- nil
138
- end
139
-
140
- def self.rsid_index(organism, chromosome = nil)
141
- build = Organism.hg_build(organism)
142
-
143
- tag = [build, chromosome] * ":"
144
- Persist.persist("StaticPosIndex for dbSNP [#{ tag }]", :fwt, :persist => true) do
145
- value_size = 0
146
- file = DbSNP[build == "hg19" ? "mutations" : "mutations_hg18"]
147
- chr_positions = []
148
- Open.read(CMD.cmd("grep '\t#{chromosome}:'", :in => file.open, :pipe => true)) do |line|
149
- next if line[0] == "#"[0]
150
- rsid, mutation = line.split("\t")
151
- next if mutation.nil? or mutation.empty?
152
- chr, pos = mutation.split(":")
153
- next if chr != chromosome or pos.nil? or pos.empty?
154
- chr_positions << [rsid, pos.to_i]
155
- value_size = rsid.length if rsid.length > value_size
156
- end
157
- fwt = FixWidthTable.new :memory, value_size
158
- fwt.add_point(chr_positions)
159
- fwt
160
- end
161
- end
162
-
163
- def self.mutation_index(organism)
164
- build = Organism.hg_build(organism)
165
- file = DbSNP[build == "hg19" ? "mutations" : "mutations_hg18"]
166
- @mutation_index ||= {}
167
- @mutation_index[build] ||= file.tsv :persist => true, :fields => ["Genomic Mutation"], :type => :single, :persist => true
168
- end
169
-
170
- end
171
-
172
- if defined? Entity
173
- if defined? Gene and Entity === Gene
174
- module Gene
175
- property :dbSNP_rsids => :single2array do
176
- DbSNP.rsid_index(organism, chromosome)[self.chr_range]
177
- end
178
-
179
- property :dbSNP_mutations => :single2array do
180
- GenomicMutation.setup(DbSNP.mutation_index(organism).values_at(*self.dbSNP_rsids).compact.flatten.uniq, "dbSNP mutations over #{self.name || self}", organism, true)
181
- end
182
- end
183
- end
184
-
185
- if defined? GenomicMutation and Entity === GenomicMutation
186
- module GenomicMutation
187
- property :dbSNP => :array2single do
188
- dbSNP.mutations.tsv(:persist => true, :key_field => "Genomic Mutation", :fields => ["RS ID"], :type => :single).values_at *self
189
- end
190
- end
191
-
192
- end
193
- end
194
-
@@ -1,109 +0,0 @@
1
- require 'rbbt'
2
- require 'rbbt/util/open'
3
- require 'rbbt/resource'
4
- require 'rbbt/entity/gene'
5
-
6
- module Genomes1000
7
- extend Resource
8
- self.subdir = "share/databases/genomes_1000"
9
-
10
- RELEASE_URL = "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20110521/ALL.wgs.phase1_release_v3.20101123.snps_indels_sv.sites.vcf.gz"
11
-
12
- Genomes1000.claim Genomes1000.mutations, :proc do |filename|
13
-
14
- begin
15
- Open.write(filename) do |file|
16
- file.puts "#: :type=:single#:namespace=Hsa"
17
- file.puts "#Variant ID\tGenomic Mutation"
18
-
19
- Open.read(RELEASE_URL) do |line|
20
- next if line[0] == "#"[0]
21
-
22
- chromosome, position, id, references, alternative, quality, filter, info = line.split("\t")
23
-
24
- file.puts [id, [chromosome, position, alternative] * ":"] * "\t"
25
- end
26
- end
27
- rescue
28
- FileUtils.rm filename if File.exists? filename
29
- raise $!
30
- end
31
- nil
32
- end
33
-
34
-
35
- Genomes1000.claim Genomes1000.mutations_hg18, :proc do
36
- require 'rbbt/sources/organism'
37
-
38
- hg19_tsv = Genomes1000.mutations.tsv :unnamed => true
39
-
40
- mutations = hg19_tsv.values
41
-
42
- translations = Misc.process_to_hash(mutations){|mutations| Organism.liftOver(mutations, "Hsa/jun2011", "Hsa/may2009")}
43
-
44
- tsv = hg19_tsv.process "Genomic Mutation" do |mutation|
45
- translations[mutation]
46
- end
47
-
48
- tsv.namespace = "Hsa/may2009"
49
-
50
- tsv.to_s
51
- end
52
-
53
- def self.rsid_index(organism, chromosome = nil)
54
- build = Organism.hg_build(organism)
55
-
56
- tag = [build, chromosome] * ":"
57
- Persist.persist("StaticPosIndex for Genomes1000 [#{ tag }]", :fwt, :persist => true) do
58
- value_size = 0
59
- file = Genomes1000[build == "hg19" ? "mutations" : "mutations_hg18"]
60
- chr_positions = []
61
- Open.read(CMD.cmd("grep '\t#{chromosome}:'", :in => file.open, :pipe => true)) do |line|
62
- next if line[0] == "#"[0]
63
- rsid, mutation = line.split("\t")
64
- next if mutation.nil? or mutation.empty?
65
- chr, pos = mutation.split(":")
66
- next if chr != chromosome or pos.nil? or pos.empty?
67
- chr_positions << [rsid, pos.to_i]
68
- value_size = rsid.length if rsid.length > value_size
69
- end
70
- fwt = FixWidthTable.new :memory, value_size
71
- fwt.add_point(chr_positions)
72
- fwt
73
- end
74
- end
75
-
76
- def self.mutation_index(organism)
77
- build = Organism.hg_build(organism)
78
- file = Genomes1000[build == "hg19" ? "mutations" : "mutations_hg18"]
79
- @mutation_index ||= {}
80
- @mutation_index[build] ||= file.tsv :persist => true, :fields => ["Genomic Mutation"], :type => :single, :persist => true
81
- end
82
-
83
-
84
- end
85
-
86
-
87
- if defined? Entity
88
- if defined? Gene and Entity === Gene
89
- module Gene
90
- property :genomes_1000_rsids => :single2array do
91
- Genomes1000.rsid_index(organism, chromosome)[self.chr_range]
92
- end
93
-
94
- property :genomes_1000_mutations => :single2array do
95
- GenomicMutation.setup(Genomes1000.mutation_index(organism).values_at(*self.genomes_1000_rsids).uniq, "1000 Genomes mutations over #{self.name || self}", organism, true)
96
- end
97
- end
98
- end
99
-
100
- if defined? GenomicMutation and Entity === GenomicMutation
101
- module GenomicMutation
102
- property :genomes_1000 => :array2single do
103
- Genomes1000.mutations.tsv(:persist => true, :key_field => "Genomic Mutation", :fields => ["Variant ID"], :type => :single).values_at *self
104
- end
105
- end
106
- end
107
- end
108
-
109
-