publisci 0.1.3 → 0.1.4
- checksums.yaml +4 -4
- data/Gemfile +2 -2
- data/LICENSE.txt +19 -17
- data/README.md +41 -8
- data/README.rdoc +3 -5
- data/Rakefile +2 -2
- data/bin/publisci +9 -7
- data/examples/visualization/prov_viz.rb +1 -1
- data/lib/publisci.rb +19 -11
- data/lib/publisci/datacube_model.rb +2 -2
- data/lib/publisci/dataset/ORM/data_cube_orm.rb +2 -2
- data/lib/publisci/dataset/data_cube.rb +1 -1
- data/lib/publisci/dataset/dataset_for.rb +6 -1
- data/lib/publisci/dataset/interactive.rb +1 -46
- data/lib/publisci/generators/base.rb +22 -0
- data/lib/publisci/generators/maf.rb +172 -0
- data/lib/publisci/metadata/generator.rb +1 -1
- data/lib/publisci/parser.rb +62 -62
- data/lib/publisci/parsers/base.rb +29 -0
- data/lib/publisci/parsers/maf.rb +20 -0
- data/lib/publisci/readers/arff.rb +43 -43
- data/lib/publisci/readers/base.rb +2 -2
- data/lib/publisci/readers/csv.rb +2 -1
- data/lib/publisci/readers/maf.rb +15 -181
- data/lib/publisci/readers/r_matrix.rb +143 -143
- data/lib/publisci/writers/arff.rb +1 -1
- data/lib/publisci/writers/base.rb +1 -1
- data/resources/maf_rdf.ttl +98 -22
- data/spec/ORM/data_cube_orm_spec.rb +1 -1
- data/spec/ORM/prov_model_spec.rb +3 -3
- data/spec/dataset_for_spec.rb +1 -1
- data/spec/generators/maf_spec.rb +2 -1
- data/spec/maf_query_spec.rb +1 -1
- metadata +25 -23
- data/lib/r2rdf.rb +0 -226
- data/lib/template_bak.rb +0 -12
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b2b51e26fd60c38dfa2f7b4d24b7d49f7d874d7c
+  data.tar.gz: 930163fafca5f08a3a0bef75a1defc325fcae8d6
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 77c57742c7300988650337353c4401d652cd2c4e3a2e34daffc0e00216d501b9624b0696a74c4ce79d1cb13a06bf62dfc570ec6580fb5dd6faea4afb33ff32aa
+  data.tar.gz: af000fd327fcd04f422fccd40c3fcf3a0a1478fff2aa5c7b447c207d130d62893bdd1963123e80b3acb070526b4842e8524be97eaf51e531ddb4c8abac1f33da
data/Gemfile
CHANGED
@@ -7,12 +7,12 @@ source "http://rubygems.org"
 # Include everything needed to run rake, tests, features, etc.
 group :development do
   gem "rspec", "~> 2.8.0"
-  gem "rdoc", "~> 3.12"
   gem "cucumber", ">= 0"
   gem "jeweler", "~> 1.8.4", :git => "https://github.com/technicalpickles/jeweler.git"
   gem "bundler", ">= 1.0.21"
   gem "bio", ">= 1.4.2"
-  gem "rdoc"
+  gem "rdoc"
+  gem "pry"
   gem "spoon"
 end

data/LICENSE.txt
CHANGED
@@ -1,20 +1,22 @@
-Copyright (c) 2013
+Copyright (c) 2013, Will Strinz
+All rights reserved.

-
-
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:

-
-
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.

-
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-MERCHANTABILITY
-
-
-
-
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
data/README.md
CHANGED
@@ -2,9 +2,7 @@

 [![Build Status](https://travis-ci.org/wstrinz/publisci.png?branch=master)](https://travis-ci.org/wstrinz/publisci)

-
-
-Note: this software is under active development!
+Note: this software is under active development! Until it hits v1.0.0, the overall API and usage pattern is subject to change.

 ## Installation

@@ -14,6 +12,10 @@ gem install publisci

 ## Usage

+#### DSL
+
+Most of the gem's functions can be accessed through its DSL
+
 ```ruby
 require 'publisci'
 include PubliSci::DSL
@@ -21,18 +23,18 @@ include PubliSci::DSL
 # Specify input data
 data do
   # use local or remote paths
-  source 'https://github.com/wstrinz/publisci/raw/master/spec/csv/bacon.csv'
+  source 'https://github.com/wstrinz/publisci/raw/master/spec/csv/bacon.csv'

   # specify datacube properties
-  dimension 'producer', 'pricerange'
+  dimension 'producer', 'pricerange'
   measure 'chunkiness'

   # set parser specific options
-  option 'label_column', 'producer'
+  option 'label_column', 'producer'
 end

 # Describe dataset
-metadata do
+metadata do
   dataset 'bacon'
   title 'Bacon dataset'
   creator 'Will Strinz'
@@ -48,14 +50,45 @@ repo = to_repository
 PubliSci::QueryHelper.execute('select * where {?s ?p ?o} limit 5', repo)

 # export in other formats
-PubliSci::Writers::ARFF.new.from_store(repo)
+PubliSci::Writers::ARFF.new.from_store(repo)
 ```


+#### Gem executable
+
+Running the gem using the `publisci` executable will attempt to find and run
+a triplifier for your input.
+
+For example, the following
+
+```sh
+publisci https://github.com/wstrinz/publisci/raw/master/spec/csv/bacon.csv
+```
+
+is equivalent to the DSL code
+
+```ruby
+require 'publisci'
+include PubliSci::DSL
+
+data do
+  source 'https://github.com/wstrinz/publisci/raw/master/spec/csv/bacon.csv'
+end
+
+generate_n3
+```

 The API doc is online. For more code examples see the test files in
 the source tree.

+### Custom Parsers
+
+Building a parser simply requires you to implement a `generate_n3` method, either at the class or instance level. Then register it using `PubliSci::Dataset.register_reader(extension, class)` with your reader's preferred file extension and its class. This way, if you call the `Dataset.for` method on a file with the given extension it will use your reader class.
+
+Including or extending `PubliSci::Readers::Base` will give you access to many helpful methods for creating and triplifying your data. There is a post on the [project blog](http://gsocsemantic.wordpress.com/2013/08/31/parsing-with-publisci-how-to-get-your-data-into-the-semantic-web/) with further details about how to design and implement a parser.
+
+The interface is in the process of being more rigidly defined to separate parsing, generation, and output, and it is advisable that you make your parsing code as stateless as possible for better handling of large inputs. Pull requests with parsers for new formats are greatly appreciated, however!
+
 ## Project home page

 Information on the source tree, documentation, examples, issues and
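The Custom Parsers notes above are abstract, so here is a minimal sketch of what a registered reader could look like. Everything specific in it — the class name, the `.tiny` extension, and the single triple it emits — is invented for illustration; only the `generate_n3` method and the `PubliSci::Dataset.register_reader(extension, class)` call come from the README text, and the exact extension format `Dataset.for` expects is an assumption.

```ruby
require 'publisci'

# Hypothetical reader: only the generate_n3 / register_reader contract
# is taken from the README above; names and output are made up.
class TinyTripleReader
  # Return an N3/turtle string for the given input file.
  def generate_n3(input_file, options = {})
    subject = "<http://example.org/dataset/#{File.basename(input_file, '.tiny')}>"
    "#{subject} <http://purl.org/dc/terms/source> \"#{input_file}\" .\n"
  end
end

# Register the reader for its preferred extension so Dataset.for can find it
# (whether the key includes the leading dot is an assumption).
PubliSci::Dataset.register_reader('.tiny', TinyTripleReader)
```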
data/README.rdoc
CHANGED
@@ -4,8 +4,6 @@
 src="https://secure.travis-ci.org/wstrinz/publisci.png"
 />}[http://travis-ci.org/#!/wstrinz/publisci]

-Full description goes here
-
 Note: this software is under active development!

 == Installation
@@ -16,13 +14,13 @@ Note: this software is under active development!

 == Developers

-To use the library
+To use the library

   require 'publisci'

 The API doc is online. For more code examples see also the test files in
 the source tree.
-
+
 == Project home page

 Information on the source tree, documentation, issues and how to contribute, see
@@ -34,7 +32,7 @@ The BioRuby community is on IRC server: irc.freenode.org, channel: #bioruby.
 == Cite

 If you use this software, please cite one of
-
+
 * [BioRuby: bioinformatics software for the Ruby programming language](http://dx.doi.org/10.1093/bioinformatics/btq475)
 * [Biogem: an effective tool-based approach for scaling up open source software development in bioinformatics](http://dx.doi.org/10.1093/bioinformatics/bts080)

data/Rakefile
CHANGED
@@ -16,12 +16,12 @@ Jeweler::Tasks.new do |gem|
   # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
   gem.name = "publisci"
   gem.homepage = "http://github.com/wstrinz/publisci"
-  gem.license = "
+  gem.license = "BSD 2-Clause"
   gem.summary = %Q{Publish scientific results to the semantic web}
   gem.description = %Q{A toolkit for publishing scientific results and datasets using RDF, OWL, and related technologies }
   gem.email = "wstrinz@gmail.com"
   gem.authors = ["Will Strinz"]
-  gem.version = "0.1.
+  gem.version = "0.1.4"

   # dependencies defined in Gemfile
 end
data/bin/publisci
CHANGED
@@ -12,7 +12,7 @@ USAGE = <<-EOF
 Usage:

   publisci file
-    (
+    (triplify file using best available reader)
 EOF

 gempath = File.dirname(File.dirname(__FILE__))
@@ -27,12 +27,14 @@ if ARGV.size == 0
   print USAGE
 elsif ARGV.size == 1
   #assume file, run DSL (prov for now)
-  if File.exist? ARGV[0]
-    puts "#{PubliSci::Prov.prefixes}\n#{PubliSci::Prov.run(ARGV[0])}"
-  else
-
-
-  end
+  # if File.exist? ARGV[0]
+  #   puts "#{PubliSci::Prov.prefixes}\n#{PubliSci::Prov.run(ARGV[0])}"
+  # else
+  #   puts "(no file #{ARGV[0]})"
+  #   print USAGE
+  # end
+  puts PubliSci::Dataset.for(ARGV[0])
+  # PubliSci::Dataset.for(ARGV[0])
 else
   if ARGV.size % 2 == 0
     opts=Hash[*ARGV]
@@ -58,7 +58,7 @@ infile = ARGV[0] || 'primer.prov'
 runner = PubliSci::Prov::DSL::Instance.new
 runner.instance_eval(IO.read(infile),infile)
 repo = runner.to_repository
-Spira.
+Spira.repository = repo

 include PubliSci::Prov::Model

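The executable now delegates everything to `PubliSci::Dataset.for`, so a rough Ruby equivalent of running `publisci somefile` is the snippet below; the sample path and the assumption that the chosen reader returns a printable turtle/N3 string (as the `puts` in the script implies) are mine.

```ruby
require 'publisci'

# Roughly what `bin/publisci <file>` now does: pick the best available
# reader for the input and print whatever it generates.
input  = ARGV[0] || 'spec/csv/bacon.csv'   # example path
output = PubliSci::Dataset.for(input)
puts output
```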
data/lib/publisci.rb
CHANGED
@@ -4,6 +4,8 @@ require 'tempfile'
 require 'fileutils'
 require 'csv'

+require 'spira'
+
 require 'rdf'
 require 'sparql'
 require 'sparql/client'
@@ -14,11 +16,11 @@ require 'json/ld'
 require 'rserve'
 require 'rest-client'

-begin
-
-rescue LoadError
-
-end
+# begin
+#   require 'spira'
+# rescue LoadError
+#   puts "can't load spira; orm unavailable"
+# end

 def load_folder(folder)
   Dir.foreach(File.dirname(__FILE__) + "/#{folder}") do |file|
@@ -30,17 +32,23 @@ def load_folder(folder)
 end

 load_folder('publisci/mixins')
-
-load File.dirname(__FILE__) + '/publisci/dataset/interactive.rb'
-load File.dirname(__FILE__) + '/publisci/query/query_helper.rb'
 load File.dirname(__FILE__) + '/publisci/parser.rb'
-load File.dirname(__FILE__) + '/publisci/
-
-load File.dirname(__FILE__) + '/publisci/store.rb'
+load File.dirname(__FILE__) + '/publisci/dataset/interactive.rb'
+
 load File.dirname(__FILE__) + '/publisci/dataset/data_cube.rb'
 load File.dirname(__FILE__) + '/publisci/dataset/dataset_for.rb'
 load File.dirname(__FILE__) + '/publisci/dataset/configuration.rb'
 load File.dirname(__FILE__) + '/publisci/dataset/dataset.rb'
+
+load File.dirname(__FILE__) + '/publisci/generators/base.rb'
+load File.dirname(__FILE__) + '/publisci/parsers/base.rb'
+load_folder('publisci/parsers')
+load_folder('publisci/generators')
+
+load File.dirname(__FILE__) + '/publisci/query/query_helper.rb'
+load File.dirname(__FILE__) + '/publisci/post_processor.rb'
+load File.dirname(__FILE__) + '/publisci/analyzer.rb'
+load File.dirname(__FILE__) + '/publisci/store.rb'
 load File.dirname(__FILE__) + '/publisci/datacube_model.rb'
 load File.dirname(__FILE__) + '/publisci/output.rb'
 load File.dirname(__FILE__) + '/publisci/metadata/prov/element.rb'
data/lib/publisci/datacube_model.rb
CHANGED
@@ -37,7 +37,7 @@ begin

   def load_repo(repo)
     raise "Not an RDF::Repository - #{repo}" unless repo.is_a? RDF::Repository
-    Spira.
+    Spira.repository = repo
   end

   class Observation < Spira::Base
@@ -60,7 +60,7 @@ begin
       uri[-1] = '' if uri[-1] == '>'
       uri.to_s.split('/').last.split('#').last
     end
-
+
   end

   def reload_observation
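Both hunks above replace the old truncated `Spira.` call with `Spira.repository = repo`, which matches the newer Spira API where the single global repository is set by assignment. A minimal sketch of how that global repository ends up being used — the file name is an example, not something from this diff:

```ruby
require 'rdf'
require 'spira'

# Load previously generated triples and hand them to Spira so that
# Spira::Base models (such as the gem's Observation class) can query them.
repo = RDF::Repository.load('bacon.ttl')   # example file
Spira.repository = repo
```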
data/lib/publisci/dataset/ORM/data_cube_orm.rb
CHANGED
@@ -5,13 +5,13 @@ module PubliSci
     extend PubliSci::Dataset::DataCube
     extend PubliSci::Analyzer
     extend PubliSci::Query
-    extend PubliSci::
+    extend PubliSci::RDFParser

     include PubliSci::Dataset::DataCube
     include PubliSci::Analyzer
     include PubliSci::Metadata::Generator
     include PubliSci::Query
-    include PubliSci::
+    include PubliSci::RDFParser

     attr_accessor :labels
     attr_accessor :dimensions
data/lib/publisci/dataset/dataset_for.rb
CHANGED
@@ -28,7 +28,12 @@ module PubliSci
     end

     if reader_registry.keys.include? extension
-      reader_registry[extension]
+      k = reader_registry[extension]
+      if k.respond_to? "automatic"
+        reader_registry[extension].automatic(object,options,ask_on_ambiguous)
+      else
+        reader_registry[extension].new.automatic(object,options,ask_on_ambiguous)
+      end
     else
       case extension
       when ".RData"
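The new branch above lets `Dataset.for` work with both styles of reader: those that expose `automatic` at the class or module level, and those that must be instantiated first. A condensed restatement of that dispatch, with a hypothetical registry hash standing in for the gem's internal one:

```ruby
# Sketch of the dispatch added above; the registry lookup and method
# name are taken from the hunk, everything else is illustrative.
def run_reader(reader_registry, extension, object, options = {}, ask_on_ambiguous = true)
  k = reader_registry[extension]
  if k.respond_to?('automatic')
    k.automatic(object, options, ask_on_ambiguous)       # class-level reader
  else
    k.new.automatic(object, options, ask_on_ambiguous)   # instance-level reader
  end
end
```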
data/lib/publisci/dataset/interactive.rb
CHANGED
@@ -23,50 +23,5 @@
     default
   end
 end
-
-# def interactive(options={})
-#   options = defaults.merge(options)
-#   qb = {}
-
-#   puts "load config from file? [y/N]"
-#   if gets.chomp == "y"
-#     #use yaml or DSL file to configure
-#   else
-#     qb[:dimensions] = dimensions()
-#     qb[:measures] = measures()
-#   end
-
-#   puts "load data from file? [y/N]"
-#   if gets.chomp == "y"
-#     #attempt to load dataset from file, ask user to resolve problems or ambiguity
-#   else
-#   end
-#   qb
-# end
-
-# def dimensions
-#   puts "Enter a list of dimensions, separated by commas"
-#   arr = gets.chomp.split(",")
-#   dims = {}
-
-#   arr.map{|dim|
-#     puts "What is the range of #{dim.chomp.strip}? [:coded]"
-#     type = gets.chomp
-#     type = :coded if type == ":coded" || type == ""
-#     dims[dim.chomp.strip] = {type: type}
-#   }
-
-#   dims
-# end
-
-# def measures
-#   puts "Enter a list of measures, separated by commas"
-#   arr = gets.chomp.split(",")
-#   meas = []
-
-#   arr.map{|m| meas << m.chomp.strip}
-
-#   meas
-# end
 end
-end
+end
data/lib/publisci/generators/base.rb
ADDED
@@ -0,0 +1,22 @@
+module PubliSci
+  module Generators
+    module Base
+      include PubliSci::Dataset::DataCube
+
+      def write(*args)
+        raise "Should be overriden"
+      end
+      alias_method :generate_n3, :write
+
+      def write_to(out, string)
+        out.write string
+      end
+
+      def close_output(out)
+        if out.is_a? File
+          out.close
+        end
+      end
+    end
+  end
+end
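A generator built on the new `Generators::Base` module is expected to override `write` (aliased to `generate_n3`) and can lean on `write_to`/`close_output` for its output handling, as the MAF generator below does. A bare-bones sketch with an invented generator, assuming the gem is already loaded:

```ruby
module PubliSci
  module Generators
    # Hypothetical generator showing the Base contract; it just echoes
    # the record as a comment line rather than producing real turtle.
    class Echo
      extend Base

      def self.write(record, out, label, options = {})
        write_to(out, "# observation #{label}: #{record.inspect}\n")
      end
    end
  end
end

out = File.open('echo.ttl', 'w')                        # example output
PubliSci::Generators::Echo.write(%w{a b c}, out, 'obs1')
PubliSci::Generators::Echo.close_output(out)            # closes File handles
```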
data/lib/publisci/generators/maf.rb
ADDED
@@ -0,0 +1,172 @@
+module PubliSci
+  module Generators
+    class MAF
+      extend Base
+
+      COLUMN_NAMES = %w{ Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_Position End_Position Strand Variant_Classification Variant_Type Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 dbSNP_RS dbSNP_Val_Status Tumor_Sample_Barcode Matched_Norm_Sample_Barcode Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2 Tumor_Validation_Allele1 Tumor_Validation_Allele2 Match_Norm_Validation_Allele1 Match_Norm_Validation_Allele2 Verification_Status Validation_Status Mutation_Status Sequencing_Phase Sequence_Source Validation_Method Score BAM_File Sequencer Tumor_Sample_UUID Matched_Norm_Sample_UUID patient_id sample_id}
+
+      COMPONENT_RANGES = { "Tumor_Sample_Barcode" => "xsd:string", "Start_position" => "xsd:int", "Center" => "xsd:string", "NCBI_Build" => "xsd:int", "Chromosome" => "xsd:int" }
+
+      TCGA_CODES =
+      {
+        "Variant_Classification" => %w{Frame_Shift_Del Frame_Shift_Ins In_Frame_Del In_Frame_Ins Missense_Mutation Nonsense_Mutation Silent Splice_Site Translation_Start_Site Nonstop_Mutation 3'UTR 3'Flank 5'UTR 5'Flank IGR1 Intron RNA Targeted_Region},
+        "Variant_Type" => %w{SNP DNP TNP ONP INS DEL Consolidated},
+        "dbSNP_Val_Status" => %w{by1000genomes by2Hit2Allele byCluster byFrequency byHapMap byOtherPop bySubmitter alternate_allele},
+        "Verification_Status" => %w{Verified, Unknown},
+        "Validation_Status" => %w{Untested Inconclusive Valid Invalid},
+        "Mutation_Status" => %w{None Germline Somatic LOH Post-transcriptional modification Unknown},
+        "Sequence_Source" => %w{WGS WGA WXS RNA-Seq miRNA-Seq Bisulfite-Seq VALIDATION Other ncRNA-Seq WCS CLONE POOLCLONE AMPLICON CLONEEND FINISHING ChIP-Seq MNase-Seq DNase-Hypersensitivity EST FL-cDNA CTS MRE-Seq MeDIP-Seq MBD-Seq Tn-Seq FAIRE-seq SELEX RIP-Seq ChIA-PET},
+        "Sequencer" => ["Illumina GAIIx", "Illumina HiSeq", "SOLID", "454", "ABI 3730xl", "Ion Torrent PGM", "Ion Torrent Proton", "PacBio RS", "Illumina MiSeq", "Illumina HiSeq 2500", "454 GS FLX Titanium", "AB SOLiD 4 System" ]
+      }
+
+      BARCODE_INDEX = COLUMN_NAMES.index('Tumor_Sample_Barcode')
+
+      class << self
+        def write(record, out, label, options={})
+
+          options = process_options(options)
+
+          options[:no_labels] ||= true
+          options[:lookup_hugo] ||= false
+          options[:complex_objects] ||= false
+          options[:ranges] ||= COMPONENT_RANGES
+
+          write_to(out, process_line(record, label, options))
+        end
+
+        def write_structure(input, output, options)
+          write_to(output, structure(options))
+        end
+
+        def process_options(options)
+          options[:dimensions] = dimensions = %w{Variant_Classification Variant_Type dbSNP_Val_Status Verification_Status Validation_Status Mutation_Status Sequence_Source Sequencer}
+          options[:codes] = codes = dimensions
+          options[:measures] = (COLUMN_NAMES - dimensions - codes)
+          options[:dataset_name] ||= "MAF_#{Time.now.nsec.to_s(32)}"
+
+          options
+        end
+
+        def process_line(entry,label,options)
+          entry = (entry.fill(nil,entry.length...COLUMN_NAMES.length-2) + parse_barcode(entry[BARCODE_INDEX])).flatten
+
+          entry[0] = "http://identifiers.org/hgnc.symbol/#{entry[0]}" if entry[0]
+
+          # A 0 in the entrez-id column appears to mean null
+          col=1
+          entry[col] = nil if entry[col] == '0'
+          entry[col] = "http://identifiers.org/ncbigene/#{entry[col]}" if entry[col]
+
+          # Only link non-novel dbSNP entries
+          col = COLUMN_NAMES.index('dbSNP_RS')
+          if entry[col] && entry[col][0..1] == "rs"
+            entry[col] = "http://identifiers.org/dbsnp/#{entry[col].gsub('rs','')}"
+          end
+
+          # optionally create typed objects using sio nodes
+          if options[:complex_objects]
+            entry = sio_values(entry)
+          end
+
+          data = {}
+          COLUMN_NAMES.each_with_index{|col,i|
+            data[col] = [entry[i]]
+          }
+
+          observations(options[:measures],options[:dimensions],options[:codes],data,[label],options[:dataset_name],options).first
+        end
+
+        def sio_values(entry)
+          entry[0] = sio_value('http://edamontology.org/data_1791',entry[0]) if entry[0]
+
+          # Link entrez genes
+          col=1
+          entry[col] = sio_value("http://identifiers.org/ncbigene",entry[col]) if entry[col]
+
+          col = COLUMN_NAMES.index('dbSNP_RS')
+          entry[col] = sio_value("http://identifiers.org/dbsnp", entry[col])
+
+          # test SIO attributes for chromosome
+          col = COLUMN_NAMES.index('Chromosome')
+          entry[col] = sio_value("http://purl.org/obo/owl/SO#SO_0000340",entry[col])
+
+          # More SIO attributes for alleles
+          %w{Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2}.each{|name|
+            col = COLUMN_NAMES.index(name)
+            entry[col] = sio_value("http://purl.org/obo/owl/SO#SO_0001023",entry[col])
+          }
+
+          col = COLUMN_NAMES.index("Strand")
+          entry[col] = sio_attribute("http://edamontology.org/data_0853",entry[col])
+
+          col = COLUMN_NAMES.index("Center")
+          entry[col] = sio_attribute("foaf:homepage",entry[col])
+
+          # Use faldo for locations End_Position
+          col = COLUMN_NAMES.index("Start_Position")
+          entry[col] = sio_attribute("http://biohackathon.org/resource/faldo#begin", entry[col],"http://biohackathon.org/resource/faldo#Position")
+
+          col = COLUMN_NAMES.index("End_Position")
+          entry[col] = sio_attribute("http://biohackathon.org/resource/faldo#end", entry[col],"http://biohackathon.org/resource/faldo#Position")
+
+          entry
+        end
+
+        def structure(options={})
+
+          options = process_options(options)
+
+          str = prefixes(options[:dataset_name],options)
+          str << data_structure_definition(options[:measures],options[:dimensions],options[:codes],options[:dataset_name],options)
+          str << dataset(options[:dataset_name],options)
+          component_specifications(options[:measures], options[:dimensions], options[:codes], options[:dataset_name], options).map{ |c| str << c }
+          measure_properties(options[:measures],options[:dataset_name],options).map{|m| str << m}
+          dimension_properties(options[:dimensions],options[:codes], options[:dataset_name],options).map{|d| str << d}
+          code_lists(options[:codes],TCGA_CODES,options[:dataset_name],options).map{|c| str << c}
+          concept_codes(options[:codes],TCGA_CODES,options[:dataset_name],options).map{|c| str << c}
+
+          str
+        end
+
+        def post_process(file)
+          reg = %r{http://identifiers.org/hgnc.symbol/(\w+)}
+          hugo_cache ||= {}
+          PubliSci::PostProcessor.process(file,file,reg){|g|
+            hugo_cache[g] ||= official_symbol(g)
+            'http://identifiers.org/hgnc.symbol/' + hugo_cache[g]
+          }
+        end
+
+        def column_replace(entry,column,prefix,value=nil)
+          if value
+            entry[COLUMN_NAMES.index(column)] = prefix + value
+          else
+            entry[COLUMN_NAMES.index(column)] += prefix
+          end
+        end
+
+        def official_symbol(hugo_symbol)
+          qry = <<-EOF
+
+          SELECT distinct ?official where {
+            {?hgnc <http://bio2rdf.org/hgnc_vocabulary:approved_symbol> "#{hugo_symbol}"}
+            UNION
+            {?hgnc <http://bio2rdf.org/hgnc_vocabulary:synonym> "#{hugo_symbol}"}
+
+            ?hgnc <http://bio2rdf.org/hgnc_vocabulary:approved_symbol> ?official
+          }
+
+          EOF
+
+          sparql = SPARQL::Client.new("http://cu.hgnc.bio2rdf.org/sparql")
+          sparql.query(qry).map(&:official).first.to_s
+        end
+
+        def parse_barcode(code)
+          #TCGA-E9-A22B-01A-11D-A159-09
+          [code[5..11], code[13..-1]]
+        end
+      end
+    end
+  end
+end
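The `parse_barcode` helper at the end of the new generator slices a TCGA sample barcode into the patient and sample parts that populate the extra `patient_id` and `sample_id` columns. For the barcode given in its comment, the fixed-index slices work out as follows (a worked example, not code from the diff):

```ruby
code = 'TCGA-E9-A22B-01A-11D-A159-09'

patient_id = code[5..11]    # => "E9-A22B"
sample_id  = code[13..-1]   # => "01A-11D-A159-09"

# parse_barcode(code) therefore returns ["E9-A22B", "01A-11D-A159-09"],
# which process_line appends to the row after padding it to the fixed
# COLUMN_NAMES width.
```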