rbbt-sources 2.1.5 → 2.1.7

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,15 +1,7 @@
1
1
  ---
2
- !binary "U0hBMQ==":
3
- metadata.gz: !binary |-
4
- MTZiNDcxMzVmMzdkZjM3MzdiNzE5YTFmN2JiMjlkMmVhZWMzODliOA==
5
- data.tar.gz: !binary |-
6
- YWJlYTE4Y2M2YWM0ZjIxYTAxZTE4ZjExZmExNjQwYTJjNTg3NGVmZg==
2
+ SHA1:
3
+ metadata.gz: daf367338fb6e78d2cb7b76440e67712d27f34ab
4
+ data.tar.gz: 5b7a7308779ec4441fa2eb997d6f9b7f0dd37e3a
7
5
  SHA512:
8
- metadata.gz: !binary |-
9
- NDVjY2ViNTE4ZDliY2FiMzMwYjk1YzUxMWI4ODk2NWI2OTdiNmU1YTQ0MzMx
10
- ZTM0MjA5ZWFmZjlkMzAzMmRjYTBhOGVhMjM4Y2JhMmM2OThjMjQ1MDRkY2Vi
11
- YTFiOWYyNmEwMGZmMzg5MDFiNjQwMWNlNDVhODEwM2VjNTg0MTc=
12
- data.tar.gz: !binary |-
13
- ZTY3ZGFjM2E3ZmY0OThmZjZiNzI2OTAwNWNmZWZlYmI5ODRkMTEyY2IzODNm
14
- YmZkNjY3NTI2MjQzNjMzMTc4YjgzYjVkM2IwZjc0OTA0NWM0YzM1ZDUzMjU5
15
- ZTliYTNjZWY4YWMwMjUxMDFkMTRiMGRmNWRkNWQyNjBjYjgwYzE=
6
+ metadata.gz: bb568b0d788284e82d0ac0d9cdbd14db7c0e59b4977ddce57e2701f25ca18bbef93d43424179a188f73daaacc87963d039a17aaf0916872945f2d384e6441552
7
+ data.tar.gz: b24f422176f10f518f692a7878c2389df0276df27a074feb5918bae1993f860fae4558d330f95de66a7857da712b5c811e487c0b117f553106215d1065f856af
@@ -1,6 +1,7 @@
1
- require 'rbbt'
1
+ require 'rbbt-util'
2
2
  require 'rbbt/tsv'
3
3
  require 'rbbt/resource'
4
+ require 'rbbt/util/filecache'
4
5
  require 'rbbt/bow/bow'
5
6
  require 'set'
6
7
 
@@ -70,85 +71,44 @@ module Entrez
70
71
 
71
72
  private
72
73
 
73
- def self.get_online(geneids)
74
74
 
75
- genes_complete = geneids.is_a?(Array) ? geneids : [geneids]
75
+ def self.get_gene(geneids)
76
+ _array = Array === geneids
76
77
 
77
- genes = []
78
- Misc.divide(genes_complete, (genes_complete.length / 100) + 1).each do |geneids_list|
79
- begin
80
- Misc.try3times do
81
- url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&retmode=xml&id=#{geneids_list * ","}"
78
+ geneids = [geneids] unless Array === geneids
79
+ geneids = geneids.compact.collect{|id| id}
82
80
 
83
- xml = Open.read(url, :wget_options => {:quiet => true}, :nocache => true)
81
+ result_files = FileCache.cache_online_elements(geneids, 'gene-{ID}.xml') do |ids|
82
+ result = {}
83
+ values = []
84
+ Misc.divide(ids, (ids.length / 100) + 1).each do |list|
85
+ begin
86
+ Misc.try3times do
87
+ url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&retmode=xml&id=#{list * ","}"
84
88
 
85
- genes += xml.scan(/(<Entrezgene>.*?<\/Entrezgene>)/sm).flatten
89
+ xml = Open.read(url, :wget_options => {:quiet => true}, :nocache => true)
90
+
91
+ values += xml.scan(/(<Entrezgene>.*?<\/Entrezgene>)/sm).flatten
92
+ end
93
+ rescue
94
+ Log.error $!.message
86
95
  end
87
- rescue
88
- puts $!.message
89
- genes += geneids_list.collect{|g| nil}
90
96
  end
91
- end
92
97
 
93
- if geneids.is_a? Array
94
- list = Hash[*genes_complete.zip([nil]).flatten]
95
- genes.each{|gene|
98
+ values.each do |xml|
96
99
  geneid = gene.match(/<Gene-track_geneid>(\d+)/)[1]
97
- geneid = geneid.to_i unless list.include? geneid
98
- list[geneid] = gene
99
- }
100
- return list
101
- else
102
- return genes.first
100
+
101
+ result[geneid] = xml
102
+ end
103
103
  end
104
- end
105
104
 
106
- public
105
+ genes = {}
106
+ geneids.each{|id| genes[id] = Gene.new(Open.read(result_files[id])) }
107
107
 
108
- def self.gene_filename(id)
109
- 'gene-' + id.to_s + '.xml'
110
- end
111
-
112
- def self.get_gene(geneid)
113
- return nil if geneid.nil?
114
-
115
- if Array === geneid
116
- missing = []
117
- list = {}
118
-
119
- geneid.each{|p|
120
- next if p.nil?
121
- if FileCache.found(gene_filename p)
122
- list[p] = Gene.new(Open.read(FileCache.path(gene_filename p)))
123
- else
124
- missing << p
125
- end
126
- }
127
-
128
-
129
- return list unless missing.any?
130
- genes = get_online(missing)
131
-
132
- genes.each{|p, xml|
133
- filename = gene_filename p
134
- FileCache.add(filename,xml) unless FileCache.found(filename)
135
- list[p] = Gene.new(xml)
136
- }
137
-
138
- return list
108
+ if _array
109
+ genes
139
110
  else
140
- filename = gene_filename geneid
141
-
142
-
143
- if FileCache.found(filename)
144
- return Gene.new(Open.read(FileCache.path(filename)))
145
- else
146
- xml = get_online(geneid)
147
-
148
- FileCache.add(filename, xml) unless FileCache.found(filename)
149
-
150
- return Gene.new(xml)
151
- end
111
+ genes.values.first
152
112
  end
153
113
  end
154
114
 
@@ -144,6 +144,12 @@ module Organism
144
144
  }.first
145
145
  end
146
146
 
147
+ def self.organism_code(name)
148
+ organisms.select{|organism|
149
+ organism == name or Organism.scientific_name(organism) =~ /#{ name }/i
150
+ }.first
151
+ end
152
+
147
153
  def self.known_ids(name)
148
154
  TSV::Parser.new(Organism.identifiers(name).open).all_fields
149
155
  end
@@ -1,5 +1,6 @@
1
- require 'rbbt'
1
+ require 'rbbt-util'
2
2
  require 'rbbt/util/open'
3
+ require 'rbbt/util/filecache'
3
4
  require 'rbbt/resource'
4
5
  require 'rbbt/sources/cath'
5
6
  require 'rbbt/sources/uniprot'
@@ -32,12 +33,78 @@ module UniProt
32
33
  tsv.to_s
33
34
  end
34
35
 
35
-
36
36
  UNIPROT_TEXT="http://www.uniprot.org/uniprot/[PROTEIN].txt"
37
37
  UNIPROT_FASTA="http://www.uniprot.org/uniprot/[PROTEIN].fasta"
38
+
39
+ def self.get_uniprot_entry(uniprotids)
40
+ _array = Array === uniprotids
41
+
42
+ uniprotids = [uniprotids] unless Array === uniprotids
43
+ uniprotids = uniprotids.compact.collect{|id| id}
44
+
45
+ result_files = FileCache.cache_online_elements(uniprotids, 'uniprot-{ID}.xml') do |ids|
46
+ result = {}
47
+ ids.each do |id|
48
+ begin
49
+ Misc.try3times do
50
+
51
+ content = Open.read(UNIPROT_TEXT.sub("[PROTEIN]", id), :wget_options => {:quiet => true}, :nocache => true)
52
+
53
+ result[id] = content
54
+ end
55
+ rescue
56
+ Log.error $!.message
57
+ end
58
+ end
59
+ result
60
+ end
61
+
62
+ uniprots = {}
63
+ uniprotids.each{|id| uniprots[id] = Open.read(result_files[id]) }
64
+
65
+ if _array
66
+ uniprots
67
+ else
68
+ uniprots.values.first
69
+ end
70
+ end
71
+
72
+ def self.get_uniprot_sequence(uniprotids)
73
+ _array = Array === uniprotids
74
+
75
+ uniprotids = [uniprotids] unless Array === uniprotids
76
+ uniprotids = uniprotids.compact.collect{|id| id}
77
+
78
+ result_files = FileCache.cache_online_elements(uniprotids, 'uniprot-sequence-{ID}') do |ids|
79
+ result = {}
80
+ ids.each do |id|
81
+ begin
82
+ Misc.try3times do
83
+
84
+ url = UNIPROT_FASTA.sub "[PROTEIN]", id
85
+ text = Open.read(url, :nocache => true)
86
+
87
+ result[id] = text.split(/\n/).select{|line| line !~ /^>/} * ""
88
+ end
89
+ rescue
90
+ Log.error $!.message
91
+ end
92
+ end
93
+ result
94
+ end
95
+
96
+ uniprots = {}
97
+ uniprotids.each{|id| uniprots[id] = Open.read(result_files[id]) }
98
+
99
+ if _array
100
+ uniprots
101
+ else
102
+ uniprots.values.first
103
+ end
104
+ end
105
+
38
106
  def self.pdbs(protein)
39
- url = UNIPROT_TEXT.sub "[PROTEIN]", protein
40
- text = Open.read(url)
107
+ text = get_uniprot_entry(protein)
41
108
 
42
109
  pdb = {}
43
110
 
@@ -59,15 +126,11 @@ module UniProt
59
126
  end
60
127
 
61
128
  def self.sequence(protein)
62
- url = UNIPROT_FASTA.sub "[PROTEIN]", protein
63
- text = Open.read(url)
64
-
65
- text.split(/\n/).select{|line| line !~ /^>/} * ""
129
+ get_uniprot_sequence(protein)
66
130
  end
67
131
 
68
132
  def self.features(protein)
69
- url = UNIPROT_TEXT.sub "[PROTEIN]", protein
70
- text = Open.read(url)
133
+ text = get_uniprot_entry(protein)
71
134
 
72
135
  text = text.split(/\n/).select{|line| line =~ /^FT/} * "\n"
73
136
 
@@ -78,7 +141,6 @@ module UniProt
78
141
 
79
142
  type = nil
80
143
  parts.each do |part|
81
- parts
82
144
  if part[0..1] == "FT"
83
145
  type = part.gsub(/FT\s+/,'')
84
146
  next
@@ -111,8 +173,7 @@ module UniProt
111
173
 
112
174
 
113
175
  def self.variants(protein)
114
- url = UNIPROT_TEXT.sub "[PROTEIN]", protein
115
- text = Open.read(url)
176
+ text = get_uniprot_entry(protein)
116
177
 
117
178
  text = text.split(/\n/).select{|line| line =~ /^FT/} * "\n"
118
179
 
@@ -157,8 +218,7 @@ module UniProt
157
218
  end
158
219
 
159
220
  def self.cath(protein)
160
- url = UNIPROT_TEXT.sub "[PROTEIN]", protein
161
- text = Open.read(url)
221
+ text = get_uniprot_entry(protein)
162
222
 
163
223
  cath = {}
164
224
  text.split(/\n/).each{|l|
@@ -21,21 +21,12 @@ class TestEntrez < Test::Unit::TestCase
21
21
  assert(data['850320'].include? '1574125')
22
22
  end
23
23
 
24
- def test_getonline
25
- geneids = 9129
26
-
27
- assert_match(/PRP3 pre-mRNA processing factor/s, Entrez.get_online(geneids))
28
-
29
- geneids = [9129,9]
30
- assert_match(/PRP3 pre-mRNA processing factor/s, Entrez.get_online(geneids)[9129])
31
- end
32
-
33
24
  def test_getgene
34
25
  geneids = 9129
35
- assert_equal([["PRP3 pre-mRNA processing factor 3 homolog (S. cerevisiae)"]], Entrez.get_gene(geneids).description)
26
+ assert_equal([["pre-mRNA processing factor 3"]], Entrez.get_gene(geneids).description)
36
27
 
37
28
  geneids = [9129, 728049]
38
- assert_equal([["PRP3 pre-mRNA processing factor 3 homolog (S. cerevisiae)"]], Entrez.get_gene(geneids)[9129].description)
29
+ assert_equal([["pre-mRNA processing factor 3"]], Entrez.get_gene(geneids)[9129].description)
39
30
  end
40
31
 
41
32
  def test_similarity
@@ -5,37 +5,37 @@ require 'rbbt/sources/ensembl_ftp'
5
5
 
6
6
  class TestOrganism < Test::Unit::TestCase
7
7
 
8
- def test_known_ids
8
+ def _test_known_ids
9
9
  assert Organism.known_ids("Hsa").include?("Associated Gene Name")
10
10
  end
11
11
 
12
- def test_location
12
+ def _test_location
13
13
  assert_equal "share/organisms/Sce/identifiers", Organism.identifiers('Sce')
14
14
  end
15
15
 
16
- def test_identifiers
16
+ def _test_identifiers
17
17
  assert Organism.identifiers('Hsa').tsv(:key_field => "Entrez Gene ID", :persist => true)['1020']["Associated Gene Name"].include?('CDK5')
18
18
  assert Organism.identifiers('Sce').tsv(:persist => true)['S000006120']["Ensembl Gene ID"].include?('YPL199C')
19
19
  assert Organism.identifiers("Sce").tsv(:persist => true)['S000006120']["Ensembl Gene ID"].include?('YPL199C')
20
20
  end
21
21
 
22
- def test_lexicon
22
+ def _test_lexicon
23
23
  assert TSV.open(Organism.lexicon('Sce'))['S000006120'].flatten.include?('YPL199C')
24
24
  end
25
25
 
26
- def test_guess_id
26
+ def _test_guess_id
27
27
  ensembl = %w(YOL044W YDR289C YAL034C YGR246C ARS519 tH(GUG)E2 YDR218C YLR002C YGL224C)
28
28
  gene_name = %w(SNR64 MIP1 MRPS18 TFB2 JEN1 IVY1 TRS33 GAS3)
29
29
  assert_equal "Associated Gene Name", Organism.guess_id("Sce", gene_name).first
30
30
  assert_equal "Ensembl Gene ID", Organism.guess_id("Sce", ensembl).first
31
31
  end
32
32
 
33
- def test_organisms
33
+ def _test_organisms
34
34
  assert Organism.organisms.include? "Hsa"
35
35
  assert_equal "Hsa", Organism.organism("Homo sapiens")
36
36
  end
37
37
 
38
- def test_attach_translations
38
+ def _test_attach_translations
39
39
  tsv = TSV.setup({"1020" => []}, :type => :list)
40
40
  tsv.key_field = "Entrez Gene ID"
41
41
  tsv.fields = []
@@ -47,7 +47,7 @@ class TestOrganism < Test::Unit::TestCase
47
47
  assert_equal "CDK5", tsv["1020"]["Associated Gene Name"]
48
48
  end
49
49
 
50
- def test_entrez_taxids
50
+ def _test_entrez_taxids
51
51
  assert_equal "Hsa", Organism.entrez_taxid_organism('9606')
52
52
  end
53
53
 
@@ -61,22 +61,22 @@ class TestOrganism < Test::Unit::TestCase
61
61
  assert_equal mutation_19, Organism.liftOver([mutation_18], target_build, source_build).first
62
62
  end
63
63
 
64
- def test_orhtolog
64
+ def _test_orhtolog
65
65
  require 'rbbt/entity/gene'
66
66
  assert_equal ["ENSG00000133703"], Gene.setup("Kras", "Associated Gene Name", "Mmu/jun2011").ensembl.ortholog("Hsa/jun2011")
67
67
  end
68
68
 
69
- #def test_genes_at_chromosome
69
+ #def _test_genes_at_chromosome
70
70
  # pos = [12, 117799500]
71
71
  # assert_equal "ENSG00000089250", Organism::Hsa.genes_at_chromosome_positions(pos.first, pos.last)
72
72
  #end
73
73
 
74
- #def test_genes_at_chromosome_array
74
+ #def _test_genes_at_chromosome_array
75
75
  # pos = [12, [117799500, 106903900]]
76
76
  # assert_equal ["ENSG00000089250", "ENSG00000013503"], Organism::Hsa.genes_at_chromosome_positions(pos.first, pos.last)
77
77
  #end
78
78
 
79
- #def test_genes_at_genomic_positions
79
+ #def _test_genes_at_genomic_positions
80
80
  # pos = [[12, 117799500], [12, 106903900], [1, 115259500]]
81
81
  # assert_equal ["ENSG00000089250", "ENSG00000013503", "ENSG00000213281"], Organism::Hsa.genes_at_genomic_positions(pos)
82
82
  #end
metadata CHANGED
@@ -1,83 +1,83 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-sources
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.1.5
4
+ version: 2.1.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Miguel Vazquez
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-11-25 00:00:00.000000000 Z
11
+ date: 2014-02-21 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rbbt-util
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - ! '>='
17
+ - - ">="
18
18
  - !ruby/object:Gem::Version
19
19
  version: 4.0.0
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - ! '>='
24
+ - - ">="
25
25
  - !ruby/object:Gem::Version
26
26
  version: 4.0.0
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: rbbt-text
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - ! '>='
31
+ - - ">="
32
32
  - !ruby/object:Gem::Version
33
33
  version: '0'
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - ! '>='
38
+ - - ">="
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
41
  - !ruby/object:Gem::Dependency
42
- name: libxml-ruby
42
+ name: mechanize
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
- - - ! '>='
45
+ - - ">="
46
46
  - !ruby/object:Gem::Version
47
47
  version: '0'
48
48
  type: :runtime
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
- - - ! '>='
52
+ - - ">="
53
53
  - !ruby/object:Gem::Version
54
54
  version: '0'
55
55
  - !ruby/object:Gem::Dependency
56
- name: bio
56
+ name: libxml-ruby
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
- - - ! '>='
59
+ - - ">="
60
60
  - !ruby/object:Gem::Version
61
61
  version: '0'
62
62
  type: :runtime
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
- - - ! '>='
66
+ - - ">="
67
67
  - !ruby/object:Gem::Version
68
68
  version: '0'
69
69
  - !ruby/object:Gem::Dependency
70
- name: mechanize
70
+ name: bio
71
71
  requirement: !ruby/object:Gem::Requirement
72
72
  requirements:
73
- - - ! '>='
73
+ - - ">="
74
74
  - !ruby/object:Gem::Version
75
75
  version: '0'
76
76
  type: :runtime
77
77
  prerelease: false
78
78
  version_requirements: !ruby/object:Gem::Requirement
79
79
  requirements:
80
- - - ! '>='
80
+ - - ">="
81
81
  - !ruby/object:Gem::Version
82
82
  version: '0'
83
83
  description: Data sources like PubMed, Entrez Gene, or Gene Ontology
@@ -88,7 +88,6 @@ extra_rdoc_files: []
88
88
  files:
89
89
  - etc/allowed_biomart_archives
90
90
  - etc/biomart/missing_in_archive
91
- - lib/rbbt/sources/COSMIC.rb
92
91
  - lib/rbbt/sources/COSTART.rb
93
92
  - lib/rbbt/sources/CTCAE.rb
94
93
  - lib/rbbt/sources/HPRD.rb
@@ -100,11 +99,9 @@ files:
100
99
  - lib/rbbt/sources/bibtex.rb
101
100
  - lib/rbbt/sources/biomart.rb
102
101
  - lib/rbbt/sources/cath.rb
103
- - lib/rbbt/sources/dbSNP.rb
104
102
  - lib/rbbt/sources/ensembl.rb
105
103
  - lib/rbbt/sources/ensembl_ftp.rb
106
104
  - lib/rbbt/sources/entrez.rb
107
- - lib/rbbt/sources/genomes1000.rb
108
105
  - lib/rbbt/sources/go.rb
109
106
  - lib/rbbt/sources/gscholar.rb
110
107
  - lib/rbbt/sources/jochem.rb
@@ -143,25 +140,25 @@ require_paths:
143
140
  - lib
144
141
  required_ruby_version: !ruby/object:Gem::Requirement
145
142
  requirements:
146
- - - ! '>='
143
+ - - ">="
147
144
  - !ruby/object:Gem::Version
148
145
  version: '0'
149
146
  required_rubygems_version: !ruby/object:Gem::Requirement
150
147
  requirements:
151
- - - ! '>='
148
+ - - ">="
152
149
  - !ruby/object:Gem::Version
153
150
  version: '0'
154
151
  requirements: []
155
152
  rubyforge_project:
156
- rubygems_version: 2.2.0
153
+ rubygems_version: 2.2.1
157
154
  signing_key:
158
155
  specification_version: 4
159
156
  summary: Data sources for the Ruby Bioinformatics Toolkit (rbbt)
160
157
  test_files:
161
- - test/rbbt/sources/test_go.rb
162
- - test/rbbt/sources/test_entrez.rb
158
+ - test/rbbt/sources/test_pubmed.rb
163
159
  - test/rbbt/sources/test_biomart.rb
164
160
  - test/rbbt/sources/test_gscholar.rb
161
+ - test/rbbt/sources/test_entrez.rb
162
+ - test/rbbt/sources/test_go.rb
165
163
  - test/rbbt/sources/test_organism.rb
166
- - test/rbbt/sources/test_pubmed.rb
167
164
  - test/test_helper.rb
@@ -1,153 +0,0 @@
1
- require 'rbbt'
2
- require 'rbbt/resource'
3
-
4
- module COSMIC
5
- extend Resource
6
- self.subdir = "share/databases/COSMIC"
7
-
8
- COSMIC.claim COSMIC.mutations, :proc do
9
- url = "ftp://ftp.sanger.ac.uk/pub/CGP/cosmic/data_export/CosmicCompleteExport_v67_241013.tsv.gz"
10
-
11
- stream = CMD.cmd('awk \'BEGIN{FS="\t"} { if ($12 != "" && $12 != "Mutation ID") { sub($12, "COSM" $12 ":" $4)}; print}\'', :in => Open.open(url), :pipe => true)
12
- tsv = TSV.open(stream, :type => :list, :header_hash => "", :key_field => "Mutation ID", :namespace => "Hsa/jun2011")
13
- tsv.fields = tsv.fields.collect{|f| f == "Gene name" ? "Associated Gene Name" : f}
14
- tsv.add_field "Genomic Mutation" do |mid, values|
15
- position = values["Mutation GRCh37 genome position"]
16
- cds = values["Mutation CDS"]
17
-
18
- if position.nil? or position.empty?
19
- nil
20
- else
21
- position = position.split("-").first
22
-
23
- chr, pos = position.split(":")
24
- chr = "X" if chr == "23"
25
- chr = "Y" if chr == "24"
26
- chr = "M" if chr == "25"
27
- position = [chr, pos ] * ":"
28
-
29
- if cds.nil?
30
- position
31
- else
32
- change = case
33
- when cds =~ />/
34
- cds.split(">").last
35
- when cds =~ /del/
36
- deletion = cds.split("del").last
37
- case
38
- when deletion =~ /^\d+$/
39
- "-" * deletion.to_i
40
- when deletion =~ /^[ACTG]+$/i
41
- "-" * deletion.length
42
- else
43
- Log.debug "Unknown deletion: #{ deletion }"
44
- deletion
45
- end
46
- when cds =~ /ins/
47
- insertion = cds.split("ins").last
48
- case
49
- when insertion =~ /^\d+$/
50
- "+" + "N" * insertion.to_i
51
- when insertion =~ /^[NACTG]+$/i
52
- "+" + insertion
53
- else
54
- Log.debug "Unknown insertion: #{insertion }"
55
- insertion
56
- end
57
- else
58
- Log.debug "Unknown change: #{cds}"
59
- "?(" << cds << ")"
60
- end
61
- position + ":" + change
62
- end
63
- end
64
- end
65
-
66
- tsv.to_s.gsub(/(\d)-(\d)/,'\1:\2')
67
- end
68
-
69
- COSMIC.claim COSMIC.mutations_hg18, :proc do |filename|
70
- require 'rbbt/sources/organism'
71
- file = COSMIC.mutations.open
72
- begin
73
-
74
- while (line = file.gets) !~ /Genomic Mutation/; end
75
- fields = line[1..-2].split("\t")
76
- mutation_pos = fields.index "Genomic Mutation"
77
-
78
- mutations = CMD.cmd("grep -v '^#'|cut -f #{mutation_pos + 1}|sort -u", :in => COSMIC.mutations.open).read.split("\n").select{|m| m.include? ":" }
79
-
80
- translations = Misc.process_to_hash(mutations){|mutations| Organism.liftOver(mutations, "Hsa/jun2011", "Hsa/may2009")}
81
-
82
- File.open(filename, 'w') do |f|
83
- f.puts "#: :type=:list#:namespace=Hsa/may2009"
84
- f.puts "#" + fields * "\t"
85
- while line = file.gets do
86
- next if line[0] == "#"[0]
87
- line.strip!
88
- parts = line.split("\t")
89
- parts[mutation_pos] = translations[parts[mutation_pos]]
90
- f.puts parts * "\t"
91
- end
92
- end
93
- rescue Exception
94
- FileUtils.rm filename if File.exists? filename
95
- raise $!
96
- ensure
97
- file.close
98
- end
99
-
100
- nil
101
- end
102
-
103
-
104
- def self.rsid_index(organism, chromosome = nil)
105
- build = Organism.hg_build(organism)
106
-
107
- tag = [build, chromosome] * ":"
108
- fwt = nil
109
- Persist.persist("StaticPosIndex for COSMIC [#{ tag }]", :fwt, :persist => true) do
110
- value_size = 0
111
- file = COSMIC[build == "hg19" ? "mutations" : "mutations_hg18"]
112
- chr_positions = []
113
- begin
114
- Open.read(CMD.cmd("grep '\t#{chromosome}:'", :in => file.open, :pipe => true)) do |line|
115
- next if line[0] == "#"[0]
116
- rsid, mutation = line.split("\t").values_at 0, 25
117
- next if mutation.nil? or mutation.empty?
118
- chr, pos = mutation.split(":")
119
- next if chr != chromosome or pos.nil? or pos.empty?
120
- chr_positions << [rsid, pos.to_i]
121
- value_size = rsid.length if rsid.length > value_size
122
- end
123
- rescue
124
- end
125
- fwt = FixWidthTable.new :memory, value_size
126
- fwt.add_point(chr_positions)
127
- fwt
128
- end
129
- end
130
-
131
- def self.mutation_index(organism)
132
- build = Organism.hg_build(organism)
133
- file = COSMIC[build == "hg19" ? "mutations" : "mutations_hg18"]
134
- @mutation_index ||= {}
135
- @mutation_index[build] ||= file.tsv :persist => true, :fields => ["Genomic Mutation"], :type => :single, :persist => true
136
- end
137
-
138
-
139
- end
140
-
141
- if defined? Entity
142
- if defined? Gene and Entity === Gene
143
- module Gene
144
- property :COSMIC_rsids => :single2array do
145
- COSMIC.rsid_index(organism, chromosome)[self.chr_range]
146
- end
147
-
148
- property :COSMIC_mutations => :single2array do
149
- GenomicMutation.setup(COSMIC.mutation_index(organism).values_at(*self.COSMIC_rsids).uniq, "COSMIC mutations over #{self.name || self}", organism, false)
150
- end
151
- end
152
- end
153
- end
@@ -1,194 +0,0 @@
1
- require 'rbbt'
2
- require 'rbbt/util/open'
3
- require 'rbbt/resource'
4
- require 'net/ftp'
5
-
6
- module DbSNP
7
- extend Resource
8
- self.subdir = "share/databases/dbSNP"
9
-
10
- URL = "ftp://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606/VCF/common_all.vcf.gz"
11
-
12
- DbSNP.claim DbSNP.mutations_ncbi, :proc do
13
- tsv = TSV.setup({}, :key_field => "RS ID", :fields => ["Genomic Mutation"], :type => :flat)
14
- file = Open.open(URL, :nocache => true)
15
- while line = file.gets do
16
- next if line[0] == "#"[0]
17
- chr, position, id, ref, alt = line.split "\t"
18
-
19
- mutations = alt.split(",").collect do |a|
20
- if alt[0] == ref[0]
21
- alt[0] = '+'[0]
22
- end
23
- [chr, position, alt] * ":"
24
- end
25
-
26
- tsv.namespace = "Hsa/may2012"
27
- tsv[id] = mutations
28
- end
29
-
30
- tsv.to_s
31
- end
32
-
33
- DbSNP.claim DbSNP.rsids, :proc do |filename|
34
- ftp = Net::FTP.new('ftp.broadinstitute.org')
35
- ftp.passive = true
36
- ftp.login('gsapubftp-anonymous', 'devnull@nomail.org')
37
- ftp.chdir('/bundle/2.3/hg19')
38
-
39
- tmpfile = TmpFile.tmp_file + '.gz'
40
- ftp.getbinaryfile('dbsnp_137.hg19.vcf.gz', tmpfile, 1024)
41
-
42
- file = Open.open(tmpfile, :nocache => true)
43
- begin
44
- File.open(filename, 'w') do |f|
45
- f.puts "#: :type=:list#:namespace=Hsa/may2012"
46
- f.puts "#" + ["RS ID", "GMAF", "G5", "G5A", "dbSNP Build ID"] * "\t"
47
- while line = file.gets do
48
- next if line[0] == "#"[0]
49
-
50
- chr, position, id, ref, muts, qual, filter, info = line.split "\t"
51
-
52
- g5 = g5a = dbsnp_build_id = gmaf = nil
53
-
54
- gmaf = $1 if info =~ /GMAF=([0-9.]+)/
55
- g5 = true if info =~ /\bG5\b/
56
- g5a = true if info =~ /\bG5A\b/
57
- dbsnp_build_id = $1 if info =~ /dbSNPBuildID=(\d+)/
58
-
59
- f.puts [id, gmaf, g5, g5a, dbsnp_build_id] * "\t"
60
- end
61
- end
62
- rescue Exception
63
- FileUtils.rm filename if File.exists? filename
64
- raise $!
65
- ensure
66
- file.close
67
- FileUtils.rm tmpfile
68
- end
69
-
70
- nil
71
- end
72
-
73
- DbSNP.claim DbSNP.mutations, :proc do |filename|
74
- ftp = Net::FTP.new('ftp.broadinstitute.org')
75
- ftp.passive = true
76
- ftp.login('gsapubftp-anonymous', 'devnull@nomail.org')
77
- ftp.chdir('/bundle/2.3/hg19')
78
-
79
- tmpfile = TmpFile.tmp_file + '.gz'
80
- ftp.getbinaryfile('dbsnp_137.hg19.vcf.gz', tmpfile, 1024)
81
-
82
- file = Open.open(tmpfile, :nocache => true)
83
- begin
84
- File.open(filename, 'w') do |f|
85
- f.puts "#: :type=:flat#:namespace=Hsa/may2012"
86
- f.puts "#" + ["RS ID", "Genomic Mutation"] * "\t"
87
- while line = file.gets do
88
- next if line[0] == "#"[0]
89
-
90
- chr, position, id, ref, muts, qual, filter, info = line.split "\t"
91
-
92
- chr.sub!('chr', '')
93
-
94
- position, muts = Misc.correct_vcf_mutation(position.to_i, ref, muts)
95
-
96
- mutations = muts.collect{|mut| [chr, position, mut] * ":" }
97
-
98
- f.puts ([id] + mutations) * "\t"
99
- end
100
- end
101
- rescue Exception
102
- FileUtils.rm filename if File.exists? filename
103
- raise $!
104
- ensure
105
- file.close
106
- FileUtils.rm tmpfile
107
- end
108
-
109
- nil
110
- end
111
-
112
- DbSNP.claim DbSNP.mutations_hg18, :proc do |filename|
113
- require 'rbbt/sources/organism'
114
-
115
- mutations = CMD.cmd("grep -v '^#'|cut -f 2|sort -u", :in => DbSNP.mutations.open).read.split("\n").collect{|l| l.split("|")}.flatten
116
-
117
- translations = Misc.process_to_hash(mutations){|mutations| Organism.liftOver(mutations, "Hsa/jun2011", "Hsa/may2009")}
118
- begin
119
- file = Open.open(DbSNP.mutations.find, :nocache => true)
120
- File.open(filename, 'w') do |f|
121
- f.puts "#: :type=:flat#:namespace=Hsa/may2009"
122
- f.puts "#" + ["RS ID", "Genomic Mutation"] * "\t"
123
- while line = file.gets do
124
- next if line[0] == "#"[0]
125
- parts = line.split("\t")
126
- parts[1..-1] = parts[1..-1].collect{|p| translations[p]} * "|"
127
- f.puts parts * "\t"
128
- end
129
- end
130
- rescue Exception
131
- FileUtils.rm filename if File.exists? filename
132
- raise $!
133
- ensure
134
- file.close
135
- end
136
-
137
- nil
138
- end
139
-
140
- def self.rsid_index(organism, chromosome = nil)
141
- build = Organism.hg_build(organism)
142
-
143
- tag = [build, chromosome] * ":"
144
- Persist.persist("StaticPosIndex for dbSNP [#{ tag }]", :fwt, :persist => true) do
145
- value_size = 0
146
- file = DbSNP[build == "hg19" ? "mutations" : "mutations_hg18"]
147
- chr_positions = []
148
- Open.read(CMD.cmd("grep '\t#{chromosome}:'", :in => file.open, :pipe => true)) do |line|
149
- next if line[0] == "#"[0]
150
- rsid, mutation = line.split("\t")
151
- next if mutation.nil? or mutation.empty?
152
- chr, pos = mutation.split(":")
153
- next if chr != chromosome or pos.nil? or pos.empty?
154
- chr_positions << [rsid, pos.to_i]
155
- value_size = rsid.length if rsid.length > value_size
156
- end
157
- fwt = FixWidthTable.new :memory, value_size
158
- fwt.add_point(chr_positions)
159
- fwt
160
- end
161
- end
162
-
163
- def self.mutation_index(organism)
164
- build = Organism.hg_build(organism)
165
- file = DbSNP[build == "hg19" ? "mutations" : "mutations_hg18"]
166
- @mutation_index ||= {}
167
- @mutation_index[build] ||= file.tsv :persist => true, :fields => ["Genomic Mutation"], :type => :single, :persist => true
168
- end
169
-
170
- end
171
-
172
- if defined? Entity
173
- if defined? Gene and Entity === Gene
174
- module Gene
175
- property :dbSNP_rsids => :single2array do
176
- DbSNP.rsid_index(organism, chromosome)[self.chr_range]
177
- end
178
-
179
- property :dbSNP_mutations => :single2array do
180
- GenomicMutation.setup(DbSNP.mutation_index(organism).values_at(*self.dbSNP_rsids).compact.flatten.uniq, "dbSNP mutations over #{self.name || self}", organism, true)
181
- end
182
- end
183
- end
184
-
185
- if defined? GenomicMutation and Entity === GenomicMutation
186
- module GenomicMutation
187
- property :dbSNP => :array2single do
188
- dbSNP.mutations.tsv(:persist => true, :key_field => "Genomic Mutation", :fields => ["RS ID"], :type => :single).values_at *self
189
- end
190
- end
191
-
192
- end
193
- end
194
-
@@ -1,109 +0,0 @@
1
- require 'rbbt'
2
- require 'rbbt/util/open'
3
- require 'rbbt/resource'
4
- require 'rbbt/entity/gene'
5
-
6
- module Genomes1000
7
- extend Resource
8
- self.subdir = "share/databases/genomes_1000"
9
-
10
- RELEASE_URL = "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20110521/ALL.wgs.phase1_release_v3.20101123.snps_indels_sv.sites.vcf.gz"
11
-
12
- Genomes1000.claim Genomes1000.mutations, :proc do |filename|
13
-
14
- begin
15
- Open.write(filename) do |file|
16
- file.puts "#: :type=:single#:namespace=Hsa"
17
- file.puts "#Variant ID\tGenomic Mutation"
18
-
19
- Open.read(RELEASE_URL) do |line|
20
- next if line[0] == "#"[0]
21
-
22
- chromosome, position, id, references, alternative, quality, filter, info = line.split("\t")
23
-
24
- file.puts [id, [chromosome, position, alternative] * ":"] * "\t"
25
- end
26
- end
27
- rescue
28
- FileUtils.rm filename if File.exists? filename
29
- raise $!
30
- end
31
- nil
32
- end
33
-
34
-
35
- Genomes1000.claim Genomes1000.mutations_hg18, :proc do
36
- require 'rbbt/sources/organism'
37
-
38
- hg19_tsv = Genomes1000.mutations.tsv :unnamed => true
39
-
40
- mutations = hg19_tsv.values
41
-
42
- translations = Misc.process_to_hash(mutations){|mutations| Organism.liftOver(mutations, "Hsa/jun2011", "Hsa/may2009")}
43
-
44
- tsv = hg19_tsv.process "Genomic Mutation" do |mutation|
45
- translations[mutation]
46
- end
47
-
48
- tsv.namespace = "Hsa/may2009"
49
-
50
- tsv.to_s
51
- end
52
-
53
- def self.rsid_index(organism, chromosome = nil)
54
- build = Organism.hg_build(organism)
55
-
56
- tag = [build, chromosome] * ":"
57
- Persist.persist("StaticPosIndex for Genomes1000 [#{ tag }]", :fwt, :persist => true) do
58
- value_size = 0
59
- file = Genomes1000[build == "hg19" ? "mutations" : "mutations_hg18"]
60
- chr_positions = []
61
- Open.read(CMD.cmd("grep '\t#{chromosome}:'", :in => file.open, :pipe => true)) do |line|
62
- next if line[0] == "#"[0]
63
- rsid, mutation = line.split("\t")
64
- next if mutation.nil? or mutation.empty?
65
- chr, pos = mutation.split(":")
66
- next if chr != chromosome or pos.nil? or pos.empty?
67
- chr_positions << [rsid, pos.to_i]
68
- value_size = rsid.length if rsid.length > value_size
69
- end
70
- fwt = FixWidthTable.new :memory, value_size
71
- fwt.add_point(chr_positions)
72
- fwt
73
- end
74
- end
75
-
76
- def self.mutation_index(organism)
77
- build = Organism.hg_build(organism)
78
- file = Genomes1000[build == "hg19" ? "mutations" : "mutations_hg18"]
79
- @mutation_index ||= {}
80
- @mutation_index[build] ||= file.tsv :persist => true, :fields => ["Genomic Mutation"], :type => :single, :persist => true
81
- end
82
-
83
-
84
- end
85
-
86
-
87
- if defined? Entity
88
- if defined? Gene and Entity === Gene
89
- module Gene
90
- property :genomes_1000_rsids => :single2array do
91
- Genomes1000.rsid_index(organism, chromosome)[self.chr_range]
92
- end
93
-
94
- property :genomes_1000_mutations => :single2array do
95
- GenomicMutation.setup(Genomes1000.mutation_index(organism).values_at(*self.genomes_1000_rsids).uniq, "1000 Genomes mutations over #{self.name || self}", organism, true)
96
- end
97
- end
98
- end
99
-
100
- if defined? GenomicMutation and Entity === GenomicMutation
101
- module GenomicMutation
102
- property :genomes_1000 => :array2single do
103
- Genomes1000.mutations.tsv(:persist => true, :key_field => "Genomic Mutation", :fields => ["Variant ID"], :type => :single).values_at *self
104
- end
105
- end
106
- end
107
- end
108
-
109
-