publisci 0.1.3 → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 992e9282183a5b68f85d5835b91c761a53dd53f2
4
- data.tar.gz: 5764fee6d82a2a8fc6b8835cc3fc0383ab2ad6eb
3
+ metadata.gz: b2b51e26fd60c38dfa2f7b4d24b7d49f7d874d7c
4
+ data.tar.gz: 930163fafca5f08a3a0bef75a1defc325fcae8d6
5
5
  SHA512:
6
- metadata.gz: 186f61e059fbb8d60038c2181829dc8f231af53dd87aaa3556a1ecb59a7e1289d62dc2d682a385e0d0d700b8533f8f4c50235fb6e4f35879407d2703e5a6fdcd
7
- data.tar.gz: 341714efc592cbd634f2b427f37a72e6151902184e455028217d3c04d1cf3ee1b589cebb1da9cfde0055f612eb42743a7718f16e09a2d556feef430700fe1e1f
6
+ metadata.gz: 77c57742c7300988650337353c4401d652cd2c4e3a2e34daffc0e00216d501b9624b0696a74c4ce79d1cb13a06bf62dfc570ec6580fb5dd6faea4afb33ff32aa
7
+ data.tar.gz: af000fd327fcd04f422fccd40c3fcf3a0a1478fff2aa5c7b447c207d130d62893bdd1963123e80b3acb070526b4842e8524be97eaf51e531ddb4c8abac1f33da
data/Gemfile CHANGED
@@ -7,12 +7,12 @@ source "http://rubygems.org"
7
7
  # Include everything needed to run rake, tests, features, etc.
8
8
  group :development do
9
9
  gem "rspec", "~> 2.8.0"
10
- gem "rdoc", "~> 3.12"
11
10
  gem "cucumber", ">= 0"
12
11
  gem "jeweler", "~> 1.8.4", :git => "https://github.com/technicalpickles/jeweler.git"
13
12
  gem "bundler", ">= 1.0.21"
14
13
  gem "bio", ">= 1.4.2"
15
- gem "rdoc", "~> 3.12"
14
+ gem "rdoc"
15
+ gem "pry"
16
16
  gem "spoon"
17
17
  end
18
18
 
@@ -1,20 +1,22 @@
1
- Copyright (c) 2013 wstrinz
1
+ Copyright (c) 2013, Will Strinz
2
+ All rights reserved.
2
3
 
3
- Permission is hereby granted, free of charge, to any person obtaining
4
- a copy of this software and associated documentation files (the
5
- "Software"), to deal in the Software without restriction, including
6
- without limitation the rights to use, copy, modify, merge, publish,
7
- distribute, sublicense, and/or sell copies of the Software, and to
8
- permit persons to whom the Software is furnished to do so, subject to
9
- the following conditions:
4
+ Redistribution and use in source and binary forms, with or without
5
+ modification, are permitted provided that the following conditions are met:
10
6
 
11
- The above copyright notice and this permission notice shall be
12
- included in all copies or substantial portions of the Software.
7
+ 1. Redistributions of source code must retain the above copyright notice, this
8
+ list of conditions and the following disclaimer.
9
+ 2. Redistributions in binary form must reproduce the above copyright notice,
10
+ this list of conditions and the following disclaimer in the documentation
11
+ and/or other materials provided with the distribution.
13
12
 
14
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
13
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
14
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
15
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
16
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
17
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
18
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
19
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
20
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
21
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
22
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
data/README.md CHANGED
@@ -2,9 +2,7 @@
2
2
 
3
3
  [![Build Status](https://travis-ci.org/wstrinz/publisci.png?branch=master)](https://travis-ci.org/wstrinz/publisci)
4
4
 
5
- Full description goes here
6
-
7
- Note: this software is under active development!
5
+ Note: this software is under active development! Until it hits v 1.0.0, the overall API and usage pattern is subject to change.
8
6
 
9
7
  ## Installation
10
8
 
@@ -14,6 +12,10 @@ gem install publisci
14
12
 
15
13
  ## Usage
16
14
 
15
+ #### DSL
16
+
17
+ Most of the gem's functions can be accessed through its DSL
18
+
17
19
  ```ruby
18
20
  require 'publisci'
19
21
  include PubliSci::DSL
@@ -21,18 +23,18 @@ include PubliSci::DSL
21
23
  # Specify input data
22
24
  data do
23
25
  # use local or remote paths
24
- source 'https://github.com/wstrinz/publisci/raw/master/spec/csv/bacon.csv'
26
+ source 'https://github.com/wstrinz/publisci/raw/master/spec/csv/bacon.csv'
25
27
 
26
28
  # specify datacube properties
27
- dimension 'producer', 'pricerange'
29
+ dimension 'producer', 'pricerange'
28
30
  measure 'chunkiness'
29
31
 
30
32
  # set parser specific options
31
- option 'label_column', 'producer'
33
+ option 'label_column', 'producer'
32
34
  end
33
35
 
34
36
  # Describe dataset
35
- metadata do
37
+ metadata do
36
38
  dataset 'bacon'
37
39
  title 'Bacon dataset'
38
40
  creator 'Will Strinz'
@@ -48,14 +50,45 @@ repo = to_repository
48
50
  PubliSci::QueryHelper.execute('select * where {?s ?p ?o} limit 5', repo)
49
51
 
50
52
  # export in other formats
51
- PubliSci::Writers::ARFF.new.from_store(repo)
53
+ PubliSci::Writers::ARFF.new.from_store(repo)
52
54
  ```
53
55
 
54
56
 
57
+ #### Gem executable
58
+
59
+ Running the gem using the `publisci` executable will attempt to find and run
60
+ a triplifier for your input.
61
+
62
+ For example, the following
63
+
64
+ ```sh
65
+ publisci https://github.com/wstrinz/publisci/raw/master/spec/csv/bacon.csv
66
+ ```
67
+
68
+ Is equivalent to the DSL code
69
+
70
+ ```ruby
71
+ require 'publisci'
72
+ include PubliSci::DSL
73
+
74
+ data do
75
+ source 'https://github.com/wstrinz/publisci/raw/master/spec/csv/bacon.csv'
76
+ end
77
+
78
+ generate_n3
79
+ ```
55
80
 
56
81
  The API doc is online. For more code examples see the test files in
57
82
  the source tree.
58
83
 
84
+ ### Custom Parsers
85
+
86
+ Building a parser simply requires you to implement a `generate_n3` method, either at the class or instance level. Then register it using `PubliSci::Dataset.register_reader(extension, class)` using your reader's preferred file extension and its class. This way, if you call the `Dataset.for` method on a file with the given extension it will use your reader class.
87
+
88
+ Including or extending the `PubliSci::Readers::Base` will give you access to many helpful methods for creating a parser and triplifying your data. There is a post on the [project blog](http://gsocsemantic.wordpress.com/2013/08/31/parsing-with-publisci-how-to-get-your-data-into-the-semantic-web/) with further details about how to design and implement a parser.
89
+
90
+ The interface is in the process of being more rigidly defined to separate parsing, generation, and output, and it is advisable that you make your parsing code as stateless as possible for better handling of large inputs. Pull requests with parsers for new formats are greatly appreciated however!
91
+
59
92
  ## Project home page
60
93
 
61
94
  Information on the source tree, documentation, examples, issues and
@@ -4,8 +4,6 @@
4
4
  src="https://secure.travis-ci.org/wstrinz/publisci.png"
5
5
  />}[http://travis-ci.org/#!/wstrinz/publisci]
6
6
 
7
- Full description goes here
8
-
9
7
  Note: this software is under active development!
10
8
 
11
9
  == Installation
@@ -16,13 +14,13 @@ Note: this software is under active development!
16
14
 
17
15
  == Developers
18
16
 
19
- To use the library
17
+ To use the library
20
18
 
21
19
  require 'publisci'
22
20
 
23
21
  The API doc is online. For more code examples see also the test files in
24
22
  the source tree.
25
-
23
+
26
24
  == Project home page
27
25
 
28
26
  Information on the source tree, documentation, issues and how to contribute, see
@@ -34,7 +32,7 @@ The BioRuby community is on IRC server: irc.freenode.org, channel: #bioruby.
34
32
  == Cite
35
33
 
36
34
  If you use this software, please cite one of
37
-
35
+
38
36
  * [BioRuby: bioinformatics software for the Ruby programming language](http://dx.doi.org/10.1093/bioinformatics/btq475)
39
37
  * [Biogem: an effective tool-based approach for scaling up open source software development in bioinformatics](http://dx.doi.org/10.1093/bioinformatics/bts080)
40
38
 
data/Rakefile CHANGED
@@ -16,12 +16,12 @@ Jeweler::Tasks.new do |gem|
16
16
  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
17
17
  gem.name = "publisci"
18
18
  gem.homepage = "http://github.com/wstrinz/publisci"
19
- gem.license = "MIT"
19
+ gem.license = "BSD 2-Clause"
20
20
  gem.summary = %Q{Publish scientific results to the semantic web}
21
21
  gem.description = %Q{A toolkit for publishing scientific results and datasets using RDF, OWL, and related technologies }
22
22
  gem.email = "wstrinz@gmail.com"
23
23
  gem.authors = ["Will Strinz"]
24
- gem.version = "0.1.3"
24
+ gem.version = "0.1.4"
25
25
 
26
26
  # dependencies defined in Gemfile
27
27
  end
@@ -12,7 +12,7 @@ USAGE = <<-EOF
12
12
  Usage:
13
13
 
14
14
  publisci file
15
- (execute PROV dsl file)
15
+ (triplify file using best available reader)
16
16
  EOF
17
17
 
18
18
  gempath = File.dirname(File.dirname(__FILE__))
@@ -27,12 +27,14 @@ if ARGV.size == 0
27
27
  print USAGE
28
28
  elsif ARGV.size == 1
29
29
  #assume file, run DSL (prov for now)
30
- if File.exist? ARGV[0]
31
- puts "#{PubliSci::Prov.prefixes}\n#{PubliSci::Prov.run(ARGV[0])}"
32
- else
33
- puts "(no file #{ARGV[0]})"
34
- print USAGE
35
- end
30
+ # if File.exist? ARGV[0]
31
+ # puts "#{PubliSci::Prov.prefixes}\n#{PubliSci::Prov.run(ARGV[0])}"
32
+ # else
33
+ # puts "(no file #{ARGV[0]})"
34
+ # print USAGE
35
+ # end
36
+ puts PubliSci::Dataset.for(ARGV[0])
37
+ # PubliSci::Dataset.for(ARGV[0])
36
38
  else
37
39
  if ARGV.size % 2 == 0
38
40
  opts=Hash[*ARGV]
@@ -58,7 +58,7 @@ infile = ARGV[0] || 'primer.prov'
58
58
  runner = PubliSci::Prov::DSL::Instance.new
59
59
  runner.instance_eval(IO.read(infile),infile)
60
60
  repo = runner.to_repository
61
- Spira.add_repository :default, repo
61
+ Spira.repository = repo
62
62
 
63
63
  include PubliSci::Prov::Model
64
64
 
@@ -4,6 +4,8 @@ require 'tempfile'
4
4
  require 'fileutils'
5
5
  require 'csv'
6
6
 
7
+ require 'spira'
8
+
7
9
  require 'rdf'
8
10
  require 'sparql'
9
11
  require 'sparql/client'
@@ -14,11 +16,11 @@ require 'json/ld'
14
16
  require 'rserve'
15
17
  require 'rest-client'
16
18
 
17
- begin
18
- require 'spira'
19
- rescue LoadError
20
- puts "can't load spira; orm unavailable"
21
- end
19
+ # begin
20
+ # require 'spira'
21
+ # rescue LoadError
22
+ # puts "can't load spira; orm unavailable"
23
+ # end
22
24
 
23
25
  def load_folder(folder)
24
26
  Dir.foreach(File.dirname(__FILE__) + "/#{folder}") do |file|
@@ -30,17 +32,23 @@ def load_folder(folder)
30
32
  end
31
33
 
32
34
  load_folder('publisci/mixins')
33
-
34
- load File.dirname(__FILE__) + '/publisci/dataset/interactive.rb'
35
- load File.dirname(__FILE__) + '/publisci/query/query_helper.rb'
36
35
  load File.dirname(__FILE__) + '/publisci/parser.rb'
37
- load File.dirname(__FILE__) + '/publisci/post_processor.rb'
38
- load File.dirname(__FILE__) + '/publisci/analyzer.rb'
39
- load File.dirname(__FILE__) + '/publisci/store.rb'
36
+ load File.dirname(__FILE__) + '/publisci/dataset/interactive.rb'
37
+
40
38
  load File.dirname(__FILE__) + '/publisci/dataset/data_cube.rb'
41
39
  load File.dirname(__FILE__) + '/publisci/dataset/dataset_for.rb'
42
40
  load File.dirname(__FILE__) + '/publisci/dataset/configuration.rb'
43
41
  load File.dirname(__FILE__) + '/publisci/dataset/dataset.rb'
42
+
43
+ load File.dirname(__FILE__) + '/publisci/generators/base.rb'
44
+ load File.dirname(__FILE__) + '/publisci/parsers/base.rb'
45
+ load_folder('publisci/parsers')
46
+ load_folder('publisci/generators')
47
+
48
+ load File.dirname(__FILE__) + '/publisci/query/query_helper.rb'
49
+ load File.dirname(__FILE__) + '/publisci/post_processor.rb'
50
+ load File.dirname(__FILE__) + '/publisci/analyzer.rb'
51
+ load File.dirname(__FILE__) + '/publisci/store.rb'
44
52
  load File.dirname(__FILE__) + '/publisci/datacube_model.rb'
45
53
  load File.dirname(__FILE__) + '/publisci/output.rb'
46
54
  load File.dirname(__FILE__) + '/publisci/metadata/prov/element.rb'
@@ -37,7 +37,7 @@ begin
37
37
 
38
38
  def load_repo(repo)
39
39
  raise "Not an RDF::Repository - #{repo}" unless repo.is_a? RDF::Repository
40
- Spira.add_repository :default, repo
40
+ Spira.repository = repo
41
41
  end
42
42
 
43
43
  class Observation < Spira::Base
@@ -60,7 +60,7 @@ begin
60
60
  uri[-1] = '' if uri[-1] == '>'
61
61
  uri.to_s.split('/').last.split('#').last
62
62
  end
63
-
63
+
64
64
  end
65
65
 
66
66
  def reload_observation
@@ -5,13 +5,13 @@ module PubliSci
5
5
  extend PubliSci::Dataset::DataCube
6
6
  extend PubliSci::Analyzer
7
7
  extend PubliSci::Query
8
- extend PubliSci::Parser
8
+ extend PubliSci::RDFParser
9
9
 
10
10
  include PubliSci::Dataset::DataCube
11
11
  include PubliSci::Analyzer
12
12
  include PubliSci::Metadata::Generator
13
13
  include PubliSci::Query
14
- include PubliSci::Parser
14
+ include PubliSci::RDFParser
15
15
 
16
16
  attr_accessor :labels
17
17
  attr_accessor :dimensions
@@ -8,7 +8,7 @@ end
8
8
  module PubliSci
9
9
  class Dataset
10
10
  module DataCube
11
- include PubliSci::Parser
11
+ include PubliSci::RDFParser
12
12
  def defaults
13
13
  {
14
14
  type: :dataframe,
@@ -28,7 +28,12 @@ module PubliSci
28
28
  end
29
29
 
30
30
  if reader_registry.keys.include? extension
31
- reader_registry[extension].new.automatic(object,options,ask_on_ambiguous)
31
+ k = reader_registry[extension]
32
+ if k.respond_to? "automatic"
33
+ reader_registry[extension].automatic(object,options,ask_on_ambiguous)
34
+ else
35
+ reader_registry[extension].new.automatic(object,options,ask_on_ambiguous)
36
+ end
32
37
  else
33
38
  case extension
34
39
  when ".RData"
@@ -23,50 +23,5 @@ module PubliSci
23
23
  default
24
24
  end
25
25
  end
26
-
27
- # def interactive(options={})
28
- # options = defaults.merge(options)
29
- # qb = {}
30
-
31
- # puts "load config from file? [y/N]"
32
- # if gets.chomp == "y"
33
- # #use yaml or DSL file to configure
34
- # else
35
- # qb[:dimensions] = dimensions()
36
- # qb[:measures] = measures()
37
- # end
38
-
39
- # puts "load data from file? [y/N]"
40
- # if gets.chomp == "y"
41
- # #attempt to load dataset from file, ask user to resolve problems or ambiguity
42
- # else
43
- # end
44
- # qb
45
- # end
46
-
47
- # def dimensions
48
- # puts "Enter a list of dimensions, separated by commas"
49
- # arr = gets.chomp.split(",")
50
- # dims = {}
51
-
52
- # arr.map{|dim|
53
- # puts "What is the range of #{dim.chomp.strip}? [:coded]"
54
- # type = gets.chomp
55
- # type = :coded if type == ":coded" || type == ""
56
- # dims[dim.chomp.strip] = {type: type}
57
- # }
58
-
59
- # dims
60
- # end
61
-
62
- # def measures
63
- # puts "Enter a list of measures, separated by commas"
64
- # arr = gets.chomp.split(",")
65
- # meas = []
66
-
67
- # arr.map{|m| meas << m.chomp.strip}
68
-
69
- # meas
70
- # end
71
26
  end
72
- end
27
+ end
@@ -0,0 +1,22 @@
1
+ module PubliSci
2
+ module Generators
3
+ module Base
4
+ include PubliSci::Dataset::DataCube
5
+
6
+ def write(*args)
7
+ raise "Should be overriden"
8
+ end
9
+ alias_method :generate_n3, :write
10
+
11
+ def write_to(out, string)
12
+ out.write string
13
+ end
14
+
15
+ def close_output(out)
16
+ if out.is_a? File
17
+ out.close
18
+ end
19
+ end
20
+ end
21
+ end
22
+ end
@@ -0,0 +1,172 @@
1
+ module PubliSci
2
+ module Generators
3
+ class MAF
4
+ extend Base
5
+
6
+ COLUMN_NAMES = %w{ Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_Position End_Position Strand Variant_Classification Variant_Type Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 dbSNP_RS dbSNP_Val_Status Tumor_Sample_Barcode Matched_Norm_Sample_Barcode Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2 Tumor_Validation_Allele1 Tumor_Validation_Allele2 Match_Norm_Validation_Allele1 Match_Norm_Validation_Allele2 Verification_Status Validation_Status Mutation_Status Sequencing_Phase Sequence_Source Validation_Method Score BAM_File Sequencer Tumor_Sample_UUID Matched_Norm_Sample_UUID patient_id sample_id}
7
+
8
+ COMPONENT_RANGES = { "Tumor_Sample_Barcode" => "xsd:string", "Start_position" => "xsd:int", "Center" => "xsd:string", "NCBI_Build" => "xsd:int", "Chromosome" => "xsd:int" }
9
+
10
+ TCGA_CODES =
11
+ {
12
+ "Variant_Classification" => %w{Frame_Shift_Del Frame_Shift_Ins In_Frame_Del In_Frame_Ins Missense_Mutation Nonsense_Mutation Silent Splice_Site Translation_Start_Site Nonstop_Mutation 3'UTR 3'Flank 5'UTR 5'Flank IGR1 Intron RNA Targeted_Region},
13
+ "Variant_Type" => %w{SNP DNP TNP ONP INS DEL Consolidated},
14
+ "dbSNP_Val_Status" => %w{by1000genomes by2Hit2Allele byCluster byFrequency byHapMap byOtherPop bySubmitter alternate_allele},
15
+ "Verification_Status" => %w{Verified, Unknown},
16
+ "Validation_Status" => %w{Untested Inconclusive Valid Invalid},
17
+ "Mutation_Status" => %w{None Germline Somatic LOH Post-transcriptional modification Unknown},
18
+ "Sequence_Source" => %w{WGS WGA WXS RNA-Seq miRNA-Seq Bisulfite-Seq VALIDATION Other ncRNA-Seq WCS CLONE POOLCLONE AMPLICON CLONEEND FINISHING ChIP-Seq MNase-Seq DNase-Hypersensitivity EST FL-cDNA CTS MRE-Seq MeDIP-Seq MBD-Seq Tn-Seq FAIRE-seq SELEX RIP-Seq ChIA-PET},
19
+ "Sequencer" => ["Illumina GAIIx", "Illumina HiSeq", "SOLID", "454", "ABI 3730xl", "Ion Torrent PGM", "Ion Torrent Proton", "PacBio RS", "Illumina MiSeq", "Illumina HiSeq 2500", "454 GS FLX Titanium", "AB SOLiD 4 System" ]
20
+ }
21
+
22
+ BARCODE_INDEX = COLUMN_NAMES.index('Tumor_Sample_Barcode')
23
+
24
+ class << self
25
+ def write(record, out, label, options={})
26
+
27
+ options = process_options(options)
28
+
29
+ options[:no_labels] ||= true
30
+ options[:lookup_hugo] ||= false
31
+ options[:complex_objects] ||= false
32
+ options[:ranges] ||= COMPONENT_RANGES
33
+
34
+ write_to(out, process_line(record, label, options))
35
+ end
36
+
37
+ def write_structure(input, output, options)
38
+ write_to(output, structure(options))
39
+ end
40
+
41
+ def process_options(options)
42
+ options[:dimensions] = dimensions = %w{Variant_Classification Variant_Type dbSNP_Val_Status Verification_Status Validation_Status Mutation_Status Sequence_Source Sequencer}
43
+ options[:codes] = codes = dimensions
44
+ options[:measures] = (COLUMN_NAMES - dimensions - codes)
45
+ options[:dataset_name] ||= "MAF_#{Time.now.nsec.to_s(32)}"
46
+
47
+ options
48
+ end
49
+
50
+ def process_line(entry,label,options)
51
+ entry = (entry.fill(nil,entry.length...COLUMN_NAMES.length-2) + parse_barcode(entry[BARCODE_INDEX])).flatten
52
+
53
+ entry[0] = "http://identifiers.org/hgnc.symbol/#{entry[0]}" if entry[0]
54
+
55
+ # A 0 in the entrez-id column appears to mean null
56
+ col=1
57
+ entry[col] = nil if entry[col] == '0'
58
+ entry[col] = "http://identifiers.org/ncbigene/#{entry[col]}" if entry[col]
59
+
60
+ # Only link non-novel dbSNP entries
61
+ col = COLUMN_NAMES.index('dbSNP_RS')
62
+ if entry[col] && entry[col][0..1] == "rs"
63
+ entry[col] = "http://identifiers.org/dbsnp/#{entry[col].gsub('rs','')}"
64
+ end
65
+
66
+ # optionally create typed objects using sio nodes
67
+ if options[:complex_objects]
68
+ entry = sio_values(entry)
69
+ end
70
+
71
+ data = {}
72
+ COLUMN_NAMES.each_with_index{|col,i|
73
+ data[col] = [entry[i]]
74
+ }
75
+
76
+ observations(options[:measures],options[:dimensions],options[:codes],data,[label],options[:dataset_name],options).first
77
+ end
78
+
79
+ def sio_values(entry)
80
+ entry[0] = sio_value('http://edamontology.org/data_1791',entry[0]) if entry[0]
81
+
82
+ # Link entrez genes
83
+ col=1
84
+ entry[col] = sio_value("http://identifiers.org/ncbigene",entry[col]) if entry[col]
85
+
86
+ col = COLUMN_NAMES.index('dbSNP_RS')
87
+ entry[col] = sio_value("http://identifiers.org/dbsnp", entry[col])
88
+
89
+ # test SIO attributes for chromosome
90
+ col = COLUMN_NAMES.index('Chromosome')
91
+ entry[col] = sio_value("http://purl.org/obo/owl/SO#SO_0000340",entry[col])
92
+
93
+ # More SIO attributes for alleles
94
+ %w{Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2}.each{|name|
95
+ col = COLUMN_NAMES.index(name)
96
+ entry[col] = sio_value("http://purl.org/obo/owl/SO#SO_0001023",entry[col])
97
+ }
98
+
99
+ col = COLUMN_NAMES.index("Strand")
100
+ entry[col] = sio_attribute("http://edamontology.org/data_0853",entry[col])
101
+
102
+ col = COLUMN_NAMES.index("Center")
103
+ entry[col] = sio_attribute("foaf:homepage",entry[col])
104
+
105
+ # Use faldo for locations End_Position
106
+ col = COLUMN_NAMES.index("Start_Position")
107
+ entry[col] = sio_attribute("http://biohackathon.org/resource/faldo#begin", entry[col],"http://biohackathon.org/resource/faldo#Position")
108
+
109
+ col = COLUMN_NAMES.index("End_Position")
110
+ entry[col] = sio_attribute("http://biohackathon.org/resource/faldo#end", entry[col],"http://biohackathon.org/resource/faldo#Position")
111
+
112
+ entry
113
+ end
114
+
115
+ def structure(options={})
116
+
117
+ options = process_options(options)
118
+
119
+ str = prefixes(options[:dataset_name],options)
120
+ str << data_structure_definition(options[:measures],options[:dimensions],options[:codes],options[:dataset_name],options)
121
+ str << dataset(options[:dataset_name],options)
122
+ component_specifications(options[:measures], options[:dimensions], options[:codes], options[:dataset_name], options).map{ |c| str << c }
123
+ measure_properties(options[:measures],options[:dataset_name],options).map{|m| str << m}
124
+ dimension_properties(options[:dimensions],options[:codes], options[:dataset_name],options).map{|d| str << d}
125
+ code_lists(options[:codes],TCGA_CODES,options[:dataset_name],options).map{|c| str << c}
126
+ concept_codes(options[:codes],TCGA_CODES,options[:dataset_name],options).map{|c| str << c}
127
+
128
+ str
129
+ end
130
+
131
+ def post_process(file)
132
+ reg = %r{http://identifiers.org/hgnc.symbol/(\w+)}
133
+ hugo_cache ||= {}
134
+ PubliSci::PostProcessor.process(file,file,reg){|g|
135
+ hugo_cache[g] ||= official_symbol(g)
136
+ 'http://identifiers.org/hgnc.symbol/' + cache[g]
137
+ }
138
+ end
139
+
140
+ def column_replace(entry,column,prefix,value=nil)
141
+ if value
142
+ entry[COLUMN_NAMES.index(column)] = prefix + value
143
+ else
144
+ entry[COLUMN_NAMES.index(column)] += prefix
145
+ end
146
+ end
147
+
148
+ def official_symbol(hugo_symbol)
149
+ qry = <<-EOF
150
+
151
+ SELECT distinct ?official where {
152
+ {?hgnc <http://bio2rdf.org/hgnc_vocabulary:approved_symbol> "#{hugo_symbol}"}
153
+ UNION
154
+ {?hgnc <http://bio2rdf.org/hgnc_vocabulary:synonym> "#{hugo_symbol}"}
155
+
156
+ ?hgnc <http://bio2rdf.org/hgnc_vocabulary:approved_symbol> ?official
157
+ }
158
+
159
+ EOF
160
+
161
+ sparql = SPARQL::Client.new("http://cu.hgnc.bio2rdf.org/sparql")
162
+ sparql.query(qry).map(&:official).first.to_s
163
+ end
164
+
165
+ def parse_barcode(code)
166
+ #TCGA-E9-A22B-01A-11D-A159-09
167
+ [code[5..11], code[13..-1]]
168
+ end
169
+ end
170
+ end
171
+ end
172
+ end