publisci 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,7 +7,7 @@ end
  module PubliSci
  class Metadata
  module Generator
- include PubliSci::Parser
+ include PubliSci::RDFParser

  def defaults
  {
@@ -1,5 +1,5 @@
  module PubliSci
- module Parser
+ module RDFParser

  def is_uri?(obj)
  RDF::Resource(obj).valid?
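
PubliSci::Parser is renamed to PubliSci::RDFParser in this release, so downstream code that mixed in the old constant needs the new name. A minimal sketch of the consumer-side change (the TripleHelper class is hypothetical, not part of the gem):

```ruby
require 'publisci'

class TripleHelper
  # 0.1.3: include PubliSci::Parser
  include PubliSci::RDFParser  # 0.1.4: module renamed
end
```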
@@ -38,61 +38,61 @@ module PubliSci
  h
  end

- def load_string(string,repo=RDF::Repository.new)
- f = Tempfile.new('repo')
- f.write(string)
- f.close
- repo.load(f.path, :format => :ttl)
- f.unlink
- repo
- end
+ def load_string(string,repo=RDF::Repository.new)
+ f = Tempfile.new('repo')
+ f.write(string)
+ f.close
+ repo.load(f.path, :format => :ttl)
+ f.unlink
+ repo
+ end

- def get_ary(query_results,method='to_s')
+ def get_ary(query_results,method='to_s')
  query_results.map{|solution|
  solution.to_a.map{|entry|
  if entry.last.respond_to? method
- entry.last.send(method)
- else
- entry.last.to_s
- end
+ entry.last.send(method)
+ else
+ entry.last.to_s
+ end
  }
  }
  end

  def get_hashes(query_results,method=nil)
- arr=[]
- query_results.map{|solution|
- h={}
- solution.map{|element|
- if method && element[1].respond_to?(method)
- h[element[0]] = element[1].send(method)
- else
- h[element[0]] = element[1]
- end
- }
- arr << h
- }
- arr
+ arr=[]
+ query_results.map{|solution|
+ h={}
+ solution.map{|element|
+ if method && element[1].respond_to?(method)
+ h[element[0]] = element[1].send(method)
+ else
+ h[element[0]] = element[1]
+ end
+ }
+ arr << h
+ }
+ arr
  end

  def observation_hash(query_results,shorten_uris=false,method='to_s')
- h={}
- query_results.map{|sol|
- (h[sol[:observation].to_s] ||= {})[sol[:property].to_s] = sol[:value].to_s
- }
+ h={}
+ query_results.map{|sol|
+ (h[sol[:observation].to_s] ||= {})[sol[:property].to_s] = sol[:value].to_s
+ }

- if shorten_uris
- newh= {}
- h.map{|k,v|
- newh[strip_uri(k)] ||= {}
- v.map{|kk,vv|
- newh[strip_uri(k)][strip_uri(kk)] = strip_uri(vv)
- }
- }
- newh
- else
- h
- end
+ if shorten_uris
+ newh= {}
+ h.map{|k,v|
+ newh[strip_uri(k)] ||= {}
+ v.map{|kk,vv|
+ newh[strip_uri(k)][strip_uri(kk)] = strip_uri(vv)
+ }
+ }
+ newh
+ else
+ h
+ end
  end

  def to_resource(obj, options={})
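
Beyond the rename and re-indentation, the helper methods themselves are unchanged. For orientation, a rough sketch of the two simplest ones; the Turtle string is invented, and this assumes the RDF/Turtle support publisci already loads for load_string:

```ruby
require 'publisci'

class Helper
  include PubliSci::RDFParser
end

helper = Helper.new
helper.is_uri?('http://example.org/s')   # => true

# load_string round-trips a Turtle string through a Tempfile into a repository
repo = helper.load_string('<http://example.org/s> <http://example.org/p> "o" .')
repo.count                               # => 1
```

get_ary, get_hashes and observation_hash then flatten SPARQL solution sets from such a repository into arrays and hashes.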
@@ -162,7 +162,7 @@ module PubliSci
  to_resource(obj,options)
  elsif obj && obj.is_a?(String) && (obj[0]=="<" && obj[-1] = ">")
  obj
- elsif obj.is_a?(Array)
+ elsif obj.is_a?(Array)
  node_str = add_node(node_index,node_str)
  ["#{node_str}" ] + [bnode_value(obj, node_index, node_str, options)]
  else
@@ -179,7 +179,7 @@ module PubliSci
  if obj.size == 2
  if obj[0].is_a?(String)
  if is_complex?(obj[1])
- str << "#{to_resource(obj[0])} #{add_node(node_index,node_str)} . \n"
+ str << "#{to_resource(obj[0])} #{add_node(node_index,node_str)} . \n"
  subnodes << encode_value(obj[1], options, node_index, node_str)
  else
  str << "#{to_resource(obj[0])} #{encode_value(obj[1], options, node_index, node_str)} "
@@ -220,7 +220,7 @@ module PubliSci
  raise "Invalid Structured value: #{obj}"
  end

- if subnodes.size > 0
+ if subnodes.size > 0
  [str, subnodes.flatten].flatten
  else
  str
@@ -231,22 +231,22 @@ module PubliSci
  tabs = 0
  turtle_str.split("\n").map{|str|
  case str[-1]
- when "."
- last_tabs = tabs
- tabs = 0
- (" " * last_tabs) + str
- when ";"
- last_tabs = tabs
- tabs = 1 if tabs == 0
- (" " * last_tabs) + str
- else
- last_tabs = tabs
- if str.size < 2
+ when "."
+ last_tabs = tabs
  tabs = 0
+ (" " * last_tabs) + str
+ when ";"
+ last_tabs = tabs
+ tabs = 1 if tabs == 0
+ (" " * last_tabs) + str
  else
- tabs += 1
- end
- (" " * last_tabs) + str
+ last_tabs = tabs
+ if str.size < 2
+ tabs = 0
+ else
+ tabs += 1
+ end
+ (" " * last_tabs) + str
  end
  }.join("\n")

@@ -262,5 +262,5 @@ module PubliSci
  string.to_s.split(':').last
  end

- end
- end
+ end
+ end
@@ -0,0 +1,29 @@
+ module PubliSci
+ module Parsers
+ module Base
+ include Enumerable
+ # attr_accessor :dataset_name, :measures, :dimensions, :codes
+
+ def valid?(rec)
+ true
+ end
+
+ def enum_method
+ :each
+ end
+
+ def process_record(rec)
+ rec
+ end
+
+ def each(input)
+ input.send(enum_method).each_with_index do |rec, i|
+ yield process_record(rec), i if valid? rec
+ end
+ end
+ alias_method :each_rec, :each
+ alias_method :each_record, :each
+
+ end
+ end
+ end
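
Parsers::Base is a small template-method mixin: a concrete parser overrides valid?, enum_method and process_record, and each_record walks the input, yielding each processed record together with its index. A sketch of a hypothetical line-oriented parser built on it (TSV is not part of the gem):

```ruby
module PubliSci
  module Parsers
    # Hypothetical parser showing the extension points of Base.
    class TSV
      extend Base

      def self.valid?(line)
        !line.start_with?('#')    # skip comment lines
      end

      def self.process_record(line)
        line.chomp.split("\t")    # yield an array of fields
      end
    end
  end
end

PubliSci::Parsers::TSV.each_record(File.open('table.tsv')) do |fields, i|
  puts "#{i}: #{fields.inspect}"
end
```

Here enum_method is left at its default (:each), which for a File object iterates line by line.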
@@ -0,0 +1,20 @@
+ module PubliSci
+ module Parsers
+ class MAF
+ extend Base
+ COLUMN_NAMES = %w{ Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_Position End_Position Strand Variant_Classification Variant_Type Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 dbSNP_RS dbSNP_Val_Status Tumor_Sample_Barcode Matched_Norm_Sample_Barcode Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2 Tumor_Validation_Allele1 Tumor_Validation_Allele2 Match_Norm_Validation_Allele1 Match_Norm_Validation_Allele2 Verification_Status Validation_Status Mutation_Status Sequencing_Phase Sequence_Source Validation_Method Score BAM_File Sequencer Tumor_Sample_UUID Matched_Norm_Sample_UUID patient_id sample_id}
+
+ def self.valid?(line)
+ not (line[0] == "#" || line[0..3] == "Hugo")
+ end
+
+ def enum_method
+ :each_line
+ end
+
+ def self.process_record(rec)
+ ::CSV.parse(rec, {col_sep: "\t"}).flatten[0..(COLUMN_NAMES.length-3)]
+ end
+ end
+ end
+ end
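
With parsing extracted into this class, reading a MAF file is plain iteration: comment lines and the Hugo_Symbol header are skipped by valid?, and each yielded record is the tab-split row truncated to the known columns. The file name below is illustrative:

```ruby
File.open('example.maf') do |maf|
  PubliSci::Parsers::MAF.each_record(maf) do |fields, i|
    hugo, entrez = fields[0], fields[1]   # Hugo_Symbol, Entrez_Gene_Id
    puts "row #{i}: #{hugo} (Entrez #{entrez})"
  end
end
```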
@@ -1,49 +1,49 @@
  module PubliSci
- module Readers
- class ARFF
- include PubliSci::Dataset::DataCube
+ module Readers
+ class ARFF
+ include PubliSci::Dataset::DataCube

- def generate_n3(arff, options={})
- arff = IO.read(arff) if File.exist? arff
- options[:no_labels] = true # unless options[:no_labels] == nil
- @options = options
- comps = components(arff)
- obs = data(arff, comps.keys)
- generate(comps.reject{|c| comps[c][:codes]}.keys, comps.select{|c| comps[c][:codes]}.keys, comps.select{|c| comps[c][:codes]}.keys, obs, (1..obs.first[1].size).to_a, relation(arff), options)
- end
+ def generate_n3(arff, options={})
+ arff = IO.read(arff) if File.exist? arff
+ options[:no_labels] = true
+ @options = options
+ comps = components(arff)
+ obs = data(arff, comps.keys)
+ generate(comps.reject{|c| comps[c][:codes]}.keys, comps.select{|c| comps[c][:codes]}.keys, comps.select{|c| comps[c][:codes]}.keys, obs, (1..obs.first[1].size).to_a, relation(arff), options)
+ end

- def relation(arff)
- arff.match(/@relation.+/i).to_a.first.split.last
- end
+ def relation(arff)
+ arff.match(/@relation.+/i).to_a.first.split.last
+ end

- def components(arff)
- #still needs support for quoted strings with whitespace
- h ={}
- arff.split("\n").select{|lin| lin =~ /^@ATTRIBUTE/i}.map{|line|
- if line =~ /\{.*}/
- name = line.match(/\s.*/).to_a.first.strip.split.first
- type = :coded
- codes = line.match(/\{.*}/).to_a.first[1..-2].split(',')
- h[name] = {type: type, codes: codes}
- else
- name = line.split[1]
- type = line.split[2]
- h[name] = {type: type}
- end
- }
- h
- end
+ def components(arff)
+ #still needs support for quoted strings with whitespace
+ h ={}
+ arff.split("\n").select{|lin| lin =~ /^@ATTRIBUTE/i}.map{|line|
+ if line =~ /\{.*}/
+ name = line.match(/\s.*/).to_a.first.strip.split.first
+ type = :coded
+ codes = line.match(/\{.*}/).to_a.first[1..-2].split(',')
+ h[name] = {type: type, codes: codes}
+ else
+ name = line.split[1]
+ type = line.split[2]
+ h[name] = {type: type}
+ end
+ }
+ h
+ end

- def data(arff, attributes)
- lines = arff.split("\n")
- data_lines = lines[lines.index(lines.select{|line| line =~ /^@DATA/i}.first)+1..-1]
- h=attributes.inject({}){|ha,attrib| ha[attrib] = []; ha}
- data_lines.map{|line|
- line = line.split ','
- attributes.each_with_index{|a,i| h[a] << line[i]}
- }
- h
- end
- end
- end
+ def data(arff, attributes)
+ lines = arff.split("\n")
+ data_lines = lines[lines.index(lines.select{|line| line =~ /^@DATA/i}.first)+1..-1]
+ h=attributes.inject({}){|ha,attrib| ha[attrib] = []; ha}
+ data_lines.map{|line|
+ line = line.split ','
+ attributes.each_with_index{|a,i| h[a] << line[i]}
+ }
+ h
+ end
+ end
+ end
  end
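
The ARFF reader itself only changes whitespace here (plus dropping the commented-out alternative for the no_labels option); its entry point still accepts either a path or the raw ARFF text. A hedged sketch, with the actual output depending on the DataCube generator options:

```ruby
reader = PubliSci::Readers::ARFF.new
reader.generate_n3('weather.arff')   # or pass the ARFF string directly
```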
@@ -1,8 +1,8 @@
  module PubliSci
  module Readers
- class Base
+ module Base
  include PubliSci::Query
- include PubliSci::Parser
+ include PubliSci::RDFParser
  include PubliSci::Analyzer
  include PubliSci::Interactive
  include PubliSci::Dataset::DataCube
@@ -1,6 +1,7 @@
  module PubliSci
  module Readers
- class CSV < Base
+ class CSV
+ include Base
  def automatic(file=nil,dataset_name=nil,options={},interactive=true)
  #to do
  # puts "f #{file} \n ds #{dataset_name} opts #{options}"
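
Because Readers::Base is now a module, readers mix it in (or extend it for a class-level interface, as the MAF reader below does) instead of subclassing. A hypothetical out-of-gem reader would change like this:

```ruby
module PubliSci
  module Readers
    class MyReader      # hypothetical reader, not part of the gem
      # 0.1.3: class MyReader < Base
      include Base      # 0.1.4: Base is a mixin
    end
  end
end
```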
@@ -1,199 +1,33 @@
  module PubliSci
  module Readers
- class MAF < Base
- COLUMN_NAMES = %w{ Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_Position End_Position Strand Variant_Classification Variant_Type Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 dbSNP_RS dbSNP_Val_Status Tumor_Sample_Barcode Matched_Norm_Sample_Barcode Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2 Tumor_Validation_Allele1 Tumor_Validation_Allele2 Match_Norm_Validation_Allele1 Match_Norm_Validation_Allele2 Verification_Status Validation_Status Mutation_Status Sequencing_Phase Sequence_Source Validation_Method Score BAM_File Sequencer Tumor_Sample_UUID Matched_Norm_Sample_UUID patient_id sample_id}
+ class MAF
+ extend PubliSci::Readers::Base

- COMPONENT_RANGES = { "Tumor_Sample_Barcode" => "xsd:string", "Start_position" => "xsd:int", "Center" => "xsd:string", "NCBI_Build" => "xsd:int", "Chromosome" => "xsd:int" }
+ def self.generate_n3(input_file, options={})
+ input_file = open(input_file,'r')

- TCGA_CODES =
- {
- "Variant_Classification" => %w{Frame_Shift_Del Frame_Shift_Ins In_Frame_Del In_Frame_Ins Missense_Mutation Nonsense_Mutation Silent Splice_Site Translation_Start_Site Nonstop_Mutation 3'UTR 3'Flank 5'UTR 5'Flank IGR1 Intron RNA Targeted_Region},
- "Variant_Type" => %w{SNP DNP TNP ONP INS DEL Consolidated},
- "dbSNP_Val_Status" => %w{by1000genomes by2Hit2Allele byCluster byFrequency byHapMap byOtherPop bySubmitter alternate_allele},
- "Verification_Status" => %w{Verified, Unknown},
- "Validation_Status" => %w{Untested Inconclusive Valid Invalid},
- "Mutation_Status" => %w{None Germline Somatic LOH Post-transcriptional modification Unknown},
- "Sequence_Source" => %w{WGS WGA WXS RNA-Seq miRNA-Seq Bisulfite-Seq VALIDATION Other ncRNA-Seq WCS CLONE POOLCLONE AMPLICON CLONEEND FINISHING ChIP-Seq MNase-Seq DNase-Hypersensitivity EST FL-cDNA CTS MRE-Seq MeDIP-Seq MBD-Seq Tn-Seq FAIRE-seq SELEX RIP-Seq ChIA-PET},
- "Sequencer" => ["Illumina GAIIx", "Illumina HiSeq", "SOLID", "454", "ABI 3730xl", "Ion Torrent PGM", "Ion Torrent Proton", "PacBio RS", "Illumina MiSeq", "Illumina HiSeq 2500", "454 GS FLX Titanium", "AB SOLiD 4 System" ]
- }
+ out_base = options[:output_base] || File.basename(input_file,'.*')

- def generate_n3(input_file, options={})
-
- dataset_name = options[:dataset_name] || nil
- output = options[:output] || :file
- output_base = options[:output_base] || nil
-
- @dimensions = %w{Variant_Classification Variant_Type dbSNP_Val_Status Verification_Status Validation_Status Mutation_Status Sequence_Source Sequencer}
- # @codes = %w{Variant_Classification Variant_Type}
- @codes = @dimensions
- @measures = (COLUMN_NAMES - @dimensions - @codes)
- @dataset_name ||= File.basename(input_file,'.*')
- @barcode_index = COLUMN_NAMES.index('Tumor_Sample_Barcode')
-
- options[:no_labels] ||= true
- options[:lookup_hugo] ||= false
- options[:complex_objects] ||= false
- options[:ranges] ||= COMPONENT_RANGES
-
-
- if output == :print
- str = structure(options)
- f = open(input_file)
- n = 0
- f.each_line{|line|
- processed = process_line(line,n.to_s,options)
- str << processed.first if processed
- n +=1
- }
- str
+ if options[:output] == :print
+ output = StringIO.new("")
  else
- # TODO - allow multi file / separate structure output for very large datasets
- # open("#{file_base}_structure.ttl",'w'){|f| f.write structure(options)}
- file_base = output_base || @dataset_name
-
- out = open("#{file_base}.ttl",'w')
- out.write(structure(options))
- f = open(input_file)
- n = 0
- f.each_line{|line|
- processed = process_line(line,n.to_s,options)
- out.write(processed.first) if processed
- n += 1
- }
- if options[:lookup_hugo]
- post_process(out)
- else
- out
- end
+ output = open "#{out_base}.ttl",'w'
  end
- end
-
- def process_line(line,label,options)
- unless line[0] == "#" || line[0..3] == "Hugo"
- entry = ::CSV.parse(line, {col_sep: "\t"}).flatten[0..(COLUMN_NAMES.length-3)]
-
- entry = (entry.fill(nil,entry.length...COLUMN_NAMES.length-2) + parse_barcode(entry[@barcode_index])).flatten
-
- entry[0] = "http://identifiers.org/hgnc.symbol/#{entry[0]}" if entry[0]

- # A 0 in the entrez-id column appears to mean null
- col=1
- entry[col] = nil if entry[col] == '0'
- entry[col] = "http://identifiers.org/ncbigene/#{entry[col]}" if entry[col]
+ PubliSci::Generators::MAF.write_structure(input_file, output, options)

- # Only link non-novel dbSNP entries
- col = COLUMN_NAMES.index('dbSNP_RS')
- if entry[col] && entry[col][0..1] == "rs"
- entry[col] = "http://identifiers.org/dbsnp/#{entry[col].gsub('rs','')}"
- end
-
- # optionally create typed objects using sio nodes
- if options[:complex_objects]
- entry = sio_values(entry)
- end
-
- data = {}
- COLUMN_NAMES.each_with_index{|col,i|
- data[col] = [entry[i]]
- }
-
- observations(@measures,@dimensions,@codes,data,[label],@dataset_name,options)
+ PubliSci::Parsers::MAF.each_record(input_file) do |rec, label|
+ PubliSci::Generators::MAF.write(rec, output, label, options)
  end
- end
-
- def sio_values(entry)
- entry[0] = sio_value('http://edamontology.org/data_1791',entry[0]) if entry[0]
-
- # Link entrez genes
- col=1
- entry[col] = sio_value("http://identifiers.org/ncbigene",entry[col]) if entry[col]
-
- col = COLUMN_NAMES.index('dbSNP_RS')
- entry[col] = sio_value("http://identifiers.org/dbsnp", entry[col])
-
- # test SIO attributes for chromosome
- col = COLUMN_NAMES.index('Chromosome')
- entry[col] = sio_value("http://purl.org/obo/owl/SO#SO_0000340",entry[col])
-
-

- # More SIO attrtibutes for alleles
- %w{Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2}.each{|name|
- col = COLUMN_NAMES.index(name)
- entry[col] = sio_value("http://purl.org/obo/owl/SO#SO_0001023",entry[col])
- }
+ output.close

- col = COLUMN_NAMES.index("Strand")
- entry[col] = sio_attribute("http://edamontology.org/data_0853",entry[col])
-
- col = COLUMN_NAMES.index("Center")
- entry[col] = sio_attribute("foaf:homepage",entry[col])
- # entry[col] = [
- # ["a", "foaf:Organization"],
- # ["foaf:homepage", entry[col]],
- # ]
-
- # Use faldo for locations End_Position
- col = COLUMN_NAMES.index("Start_Position")
- entry[col] = sio_attribute("http://biohackathon.org/resource/faldo#begin", entry[col],"http://biohackathon.org/resource/faldo#Position")
-
- col = COLUMN_NAMES.index("End_Position")
- entry[col] = sio_attribute("http://biohackathon.org/resource/faldo#end", entry[col],"http://biohackathon.org/resource/faldo#Position")
-
- entry
- end
-
- def column_replace(entry,column,prefix,value=nil)
- if value
- entry[COLUMN_NAMES.index(column)] = prefix + value
+ if options[:output] == :print
+ output.string
  else
- entry[COLUMN_NAMES.index(column)] += prefix
+ output.path
  end
  end
-
- def official_symbol(hugo_symbol)
- qry = <<-EOF
-
- SELECT distinct ?official where {
- {?hgnc <http://bio2rdf.org/hgnc_vocabulary:approved_symbol> "#{hugo_symbol}"}
- UNION
- {?hgnc <http://bio2rdf.org/hgnc_vocabulary:synonym> "#{hugo_symbol}"}
-
- ?hgnc <http://bio2rdf.org/hgnc_vocabulary:approved_symbol> ?official
- }
-
- EOF
-
- sparql = SPARQL::Client.new("http://cu.hgnc.bio2rdf.org/sparql")
- sparql.query(qry).map(&:official).first.to_s
- end
-
- def parse_barcode(code)
- #TCGA-E9-A22B-01A-11D-A159-09
- [code[5..11], code[13..-1]]
- end
-
- def structure(options={})
-
- str = prefixes(@dataset_name,options)
- str << data_structure_definition(@measures,@dimensions,@codes,@dataset_name,options)
- str << dataset(@dataset_name,options)
- component_specifications(@measures, @dimensions, @codes, @dataset_name, options).map{ |c| str << c }
- measure_properties(@measures,@dataset_name,options).map{|m| str << m}
- dimension_properties(@dimensions,@codes, @dataset_name,options).map{|d| str << d}
- code_lists(@codes,TCGA_CODES,@dataset_name,options).map{|c| str << c}
- concept_codes(@codes,TCGA_CODES,@dataset_name,options).map{|c| str << c}
- str
- end
-
- def post_process(file)
- reg = %r{http://identifiers.org/hgnc.symbol/(\w+)}
- @@hugo_cache ||= {}
- PubliSci::PostProcessor.process(file,file,reg){|g|
- @@hugo_cache[g] ||= official_symbol(g)
- 'http://identifiers.org/hgnc.symbol/' + cache[g]
- }
- end
  end
  end
  end
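
The MAF reader is now a thin class-level driver: it writes the dataset structure through PubliSci::Generators::MAF, streams each record from PubliSci::Parsers::MAF into the generator, and returns either the Turtle text or the path of the .ttl file it wrote. Illustrative calls (file names invented):

```ruby
# Writes example.ttl in the current directory and returns its path
path = PubliSci::Readers::MAF.generate_n3('example.maf')

# Keeps the output in memory instead
ttl = PubliSci::Readers::MAF.generate_n3('example.maf', output: :print)
```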