publisci 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,7 +7,7 @@ end
  module PubliSci
  class Metadata
  module Generator
- include PubliSci::Parser
+ include PubliSci::RDFParser

  def defaults
  {
@@ -1,5 +1,5 @@
  module PubliSci
- module Parser
+ module RDFParser

  def is_uri?(obj)
  RDF::Resource(obj).valid?
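
PubliSci::Parser is renamed to PubliSci::RDFParser in this release, so downstream code that mixed in the old constant needs the new name. A minimal sketch of the consumer-side change (the TripleHelper class is hypothetical, not part of the gem):

```ruby
require 'publisci'

class TripleHelper
  # 0.1.3: include PubliSci::Parser
  include PubliSci::RDFParser  # 0.1.4: module renamed
end
```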
@@ -38,61 +38,61 @@ module PubliSci
  h
  end

- def load_string(string,repo=RDF::Repository.new)
- f = Tempfile.new('repo')
- f.write(string)
- f.close
- repo.load(f.path, :format => :ttl)
- f.unlink
- repo
- end
+ def load_string(string,repo=RDF::Repository.new)
+ f = Tempfile.new('repo')
+ f.write(string)
+ f.close
+ repo.load(f.path, :format => :ttl)
+ f.unlink
+ repo
+ end

- def get_ary(query_results,method='to_s')
+ def get_ary(query_results,method='to_s')
  query_results.map{|solution|
  solution.to_a.map{|entry|
  if entry.last.respond_to? method
- entry.last.send(method)
- else
- entry.last.to_s
- end
+ entry.last.send(method)
+ else
+ entry.last.to_s
+ end
  }
  }
  end

  def get_hashes(query_results,method=nil)
- arr=[]
- query_results.map{|solution|
- h={}
- solution.map{|element|
- if method && element[1].respond_to?(method)
- h[element[0]] = element[1].send(method)
- else
- h[element[0]] = element[1]
- end
- }
- arr << h
- }
- arr
+ arr=[]
+ query_results.map{|solution|
+ h={}
+ solution.map{|element|
+ if method && element[1].respond_to?(method)
+ h[element[0]] = element[1].send(method)
+ else
+ h[element[0]] = element[1]
+ end
+ }
+ arr << h
+ }
+ arr
  end

  def observation_hash(query_results,shorten_uris=false,method='to_s')
- h={}
- query_results.map{|sol|
- (h[sol[:observation].to_s] ||= {})[sol[:property].to_s] = sol[:value].to_s
- }
+ h={}
+ query_results.map{|sol|
+ (h[sol[:observation].to_s] ||= {})[sol[:property].to_s] = sol[:value].to_s
+ }

- if shorten_uris
- newh= {}
- h.map{|k,v|
- newh[strip_uri(k)] ||= {}
- v.map{|kk,vv|
- newh[strip_uri(k)][strip_uri(kk)] = strip_uri(vv)
- }
- }
- newh
- else
- h
- end
+ if shorten_uris
+ newh= {}
+ h.map{|k,v|
+ newh[strip_uri(k)] ||= {}
+ v.map{|kk,vv|
+ newh[strip_uri(k)][strip_uri(kk)] = strip_uri(vv)
+ }
+ }
+ newh
+ else
+ h
+ end
  end

  def to_resource(obj, options={})
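
Beyond the rename and re-indentation, the helper methods themselves are unchanged. For orientation, a rough sketch of the two simplest ones; the Turtle string is invented, and this assumes the RDF/Turtle support publisci already loads for load_string:

```ruby
require 'publisci'

class Helper
  include PubliSci::RDFParser
end

helper = Helper.new
helper.is_uri?('http://example.org/s')   # => true

# load_string round-trips a Turtle string through a Tempfile into a repository
repo = helper.load_string('<http://example.org/s> <http://example.org/p> "o" .')
repo.count                               # => 1
```

get_ary, get_hashes and observation_hash then flatten SPARQL solution sets from such a repository into arrays and hashes.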
@@ -162,7 +162,7 @@ module PubliSci
  to_resource(obj,options)
  elsif obj && obj.is_a?(String) && (obj[0]=="<" && obj[-1] = ">")
  obj
- elsif obj.is_a?(Array)
+ elsif obj.is_a?(Array)
  node_str = add_node(node_index,node_str)
  ["#{node_str}" ] + [bnode_value(obj, node_index, node_str, options)]
  else
@@ -179,7 +179,7 @@ module PubliSci
  if obj.size == 2
  if obj[0].is_a?(String)
  if is_complex?(obj[1])
- str << "#{to_resource(obj[0])} #{add_node(node_index,node_str)} . \n"
+ str << "#{to_resource(obj[0])} #{add_node(node_index,node_str)} . \n"
  subnodes << encode_value(obj[1], options, node_index, node_str)
  else
  str << "#{to_resource(obj[0])} #{encode_value(obj[1], options, node_index, node_str)} "
@@ -220,7 +220,7 @@ module PubliSci
  raise "Invalid Structured value: #{obj}"
  end

- if subnodes.size > 0
+ if subnodes.size > 0
  [str, subnodes.flatten].flatten
  else
  str
@@ -231,22 +231,22 @@ module PubliSci
  tabs = 0
  turtle_str.split("\n").map{|str|
  case str[-1]
- when "."
- last_tabs = tabs
- tabs = 0
- (" " * last_tabs) + str
- when ";"
- last_tabs = tabs
- tabs = 1 if tabs == 0
- (" " * last_tabs) + str
- else
- last_tabs = tabs
- if str.size < 2
+ when "."
+ last_tabs = tabs
  tabs = 0
+ (" " * last_tabs) + str
+ when ";"
+ last_tabs = tabs
+ tabs = 1 if tabs == 0
+ (" " * last_tabs) + str
  else
- tabs += 1
- end
- (" " * last_tabs) + str
+ last_tabs = tabs
+ if str.size < 2
+ tabs = 0
+ else
+ tabs += 1
+ end
+ (" " * last_tabs) + str
  end
  }.join("\n")

@@ -262,5 +262,5 @@ module PubliSci
  string.to_s.split(':').last
  end

- end
- end
+ end
+ end
@@ -0,0 +1,29 @@
+ module PubliSci
+ module Parsers
+ module Base
+ include Enumerable
+ # attr_accessor :dataset_name, :measures, :dimensions, :codes
+
+ def valid?(rec)
+ true
+ end
+
+ def enum_method
+ :each
+ end
+
+ def process_record(rec)
+ rec
+ end
+
+ def each(input)
+ input.send(enum_method).each_with_index do |rec, i|
+ yield process_record(rec), i if valid? rec
+ end
+ end
+ alias_method :each_rec, :each
+ alias_method :each_record, :each
+
+ end
+ end
+ end
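
Parsers::Base is a small template-method mixin: a concrete parser overrides valid?, enum_method and process_record, and each_record walks the input, yielding each processed record together with its index. A sketch of a hypothetical line-oriented parser built on it (TSV is not part of the gem):

```ruby
module PubliSci
  module Parsers
    # Hypothetical parser showing the extension points of Base.
    class TSV
      extend Base

      def self.valid?(line)
        !line.start_with?('#')    # skip comment lines
      end

      def self.process_record(line)
        line.chomp.split("\t")    # yield an array of fields
      end
    end
  end
end

PubliSci::Parsers::TSV.each_record(File.open('table.tsv')) do |fields, i|
  puts "#{i}: #{fields.inspect}"
end
```

Here enum_method is left at its default (:each), which for a File object iterates line by line.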
@@ -0,0 +1,20 @@
+ module PubliSci
+ module Parsers
+ class MAF
+ extend Base
+ COLUMN_NAMES = %w{ Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_Position End_Position Strand Variant_Classification Variant_Type Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 dbSNP_RS dbSNP_Val_Status Tumor_Sample_Barcode Matched_Norm_Sample_Barcode Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2 Tumor_Validation_Allele1 Tumor_Validation_Allele2 Match_Norm_Validation_Allele1 Match_Norm_Validation_Allele2 Verification_Status Validation_Status Mutation_Status Sequencing_Phase Sequence_Source Validation_Method Score BAM_File Sequencer Tumor_Sample_UUID Matched_Norm_Sample_UUID patient_id sample_id}
+
+ def self.valid?(line)
+ not (line[0] == "#" || line[0..3] == "Hugo")
+ end
+
+ def enum_method
+ :each_line
+ end
+
+ def self.process_record(rec)
+ ::CSV.parse(rec, {col_sep: "\t"}).flatten[0..(COLUMN_NAMES.length-3)]
+ end
+ end
+ end
+ end
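
With parsing extracted into this class, reading a MAF file is plain iteration: comment lines and the Hugo_Symbol header are skipped by valid?, and each yielded record is the tab-split row truncated to the known columns. The file name below is illustrative:

```ruby
File.open('example.maf') do |maf|
  PubliSci::Parsers::MAF.each_record(maf) do |fields, i|
    hugo, entrez = fields[0], fields[1]   # Hugo_Symbol, Entrez_Gene_Id
    puts "row #{i}: #{hugo} (Entrez #{entrez})"
  end
end
```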
@@ -1,49 +1,49 @@
  module PubliSci
- module Readers
- class ARFF
- include PubliSci::Dataset::DataCube
+ module Readers
+ class ARFF
+ include PubliSci::Dataset::DataCube

- def generate_n3(arff, options={})
- arff = IO.read(arff) if File.exist? arff
- options[:no_labels] = true # unless options[:no_labels] == nil
- @options = options
- comps = components(arff)
- obs = data(arff, comps.keys)
- generate(comps.reject{|c| comps[c][:codes]}.keys, comps.select{|c| comps[c][:codes]}.keys, comps.select{|c| comps[c][:codes]}.keys, obs, (1..obs.first[1].size).to_a, relation(arff), options)
- end
+ def generate_n3(arff, options={})
+ arff = IO.read(arff) if File.exist? arff
+ options[:no_labels] = true
+ @options = options
+ comps = components(arff)
+ obs = data(arff, comps.keys)
+ generate(comps.reject{|c| comps[c][:codes]}.keys, comps.select{|c| comps[c][:codes]}.keys, comps.select{|c| comps[c][:codes]}.keys, obs, (1..obs.first[1].size).to_a, relation(arff), options)
+ end

- def relation(arff)
- arff.match(/@relation.+/i).to_a.first.split.last
- end
+ def relation(arff)
+ arff.match(/@relation.+/i).to_a.first.split.last
+ end

- def components(arff)
- #still needs support for quoted strings with whitespace
- h ={}
- arff.split("\n").select{|lin| lin =~ /^@ATTRIBUTE/i}.map{|line|
- if line =~ /\{.*}/
- name = line.match(/\s.*/).to_a.first.strip.split.first
- type = :coded
- codes = line.match(/\{.*}/).to_a.first[1..-2].split(',')
- h[name] = {type: type, codes: codes}
- else
- name = line.split[1]
- type = line.split[2]
- h[name] = {type: type}
- end
- }
- h
- end
+ def components(arff)
+ #still needs support for quoted strings with whitespace
+ h ={}
+ arff.split("\n").select{|lin| lin =~ /^@ATTRIBUTE/i}.map{|line|
+ if line =~ /\{.*}/
+ name = line.match(/\s.*/).to_a.first.strip.split.first
+ type = :coded
+ codes = line.match(/\{.*}/).to_a.first[1..-2].split(',')
+ h[name] = {type: type, codes: codes}
+ else
+ name = line.split[1]
+ type = line.split[2]
+ h[name] = {type: type}
+ end
+ }
+ h
+ end

- def data(arff, attributes)
- lines = arff.split("\n")
- data_lines = lines[lines.index(lines.select{|line| line =~ /^@DATA/i}.first)+1..-1]
- h=attributes.inject({}){|ha,attrib| ha[attrib] = []; ha}
- data_lines.map{|line|
- line = line.split ','
- attributes.each_with_index{|a,i| h[a] << line[i]}
- }
- h
- end
- end
- end
+ def data(arff, attributes)
+ lines = arff.split("\n")
+ data_lines = lines[lines.index(lines.select{|line| line =~ /^@DATA/i}.first)+1..-1]
+ h=attributes.inject({}){|ha,attrib| ha[attrib] = []; ha}
+ data_lines.map{|line|
+ line = line.split ','
+ attributes.each_with_index{|a,i| h[a] << line[i]}
+ }
+ h
+ end
+ end
+ end
  end
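
The ARFF reader itself only changes whitespace here (plus dropping the commented-out alternative for the no_labels option); its entry point still accepts either a path or the raw ARFF text. A hedged sketch, with the actual output depending on the DataCube generator options:

```ruby
reader = PubliSci::Readers::ARFF.new
reader.generate_n3('weather.arff')   # or pass the ARFF string directly
```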
@@ -1,8 +1,8 @@
  module PubliSci
  module Readers
- class Base
+ module Base
  include PubliSci::Query
- include PubliSci::Parser
+ include PubliSci::RDFParser
  include PubliSci::Analyzer
  include PubliSci::Interactive
  include PubliSci::Dataset::DataCube
@@ -1,6 +1,7 @@
  module PubliSci
  module Readers
- class CSV < Base
+ class CSV
+ include Base
  def automatic(file=nil,dataset_name=nil,options={},interactive=true)
  #to do
  # puts "f #{file} \n ds #{dataset_name} opts #{options}"
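
Because Readers::Base is now a module, readers mix it in (or extend it for a class-level interface, as the MAF reader below does) instead of subclassing. A hypothetical out-of-gem reader would change like this:

```ruby
module PubliSci
  module Readers
    class MyReader      # hypothetical reader, not part of the gem
      # 0.1.3: class MyReader < Base
      include Base      # 0.1.4: Base is a mixin
    end
  end
end
```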
@@ -1,199 +1,33 @@
  module PubliSci
  module Readers
- class MAF < Base
- COLUMN_NAMES = %w{ Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_Position End_Position Strand Variant_Classification Variant_Type Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 dbSNP_RS dbSNP_Val_Status Tumor_Sample_Barcode Matched_Norm_Sample_Barcode Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2 Tumor_Validation_Allele1 Tumor_Validation_Allele2 Match_Norm_Validation_Allele1 Match_Norm_Validation_Allele2 Verification_Status Validation_Status Mutation_Status Sequencing_Phase Sequence_Source Validation_Method Score BAM_File Sequencer Tumor_Sample_UUID Matched_Norm_Sample_UUID patient_id sample_id}
+ class MAF
+ extend PubliSci::Readers::Base

- COMPONENT_RANGES = { "Tumor_Sample_Barcode" => "xsd:string", "Start_position" => "xsd:int", "Center" => "xsd:string", "NCBI_Build" => "xsd:int", "Chromosome" => "xsd:int" }
+ def self.generate_n3(input_file, options={})
+ input_file = open(input_file,'r')

- TCGA_CODES =
- {
- "Variant_Classification" => %w{Frame_Shift_Del Frame_Shift_Ins In_Frame_Del In_Frame_Ins Missense_Mutation Nonsense_Mutation Silent Splice_Site Translation_Start_Site Nonstop_Mutation 3'UTR 3'Flank 5'UTR 5'Flank IGR1 Intron RNA Targeted_Region},
- "Variant_Type" => %w{SNP DNP TNP ONP INS DEL Consolidated},
- "dbSNP_Val_Status" => %w{by1000genomes by2Hit2Allele byCluster byFrequency byHapMap byOtherPop bySubmitter alternate_allele},
- "Verification_Status" => %w{Verified, Unknown},
- "Validation_Status" => %w{Untested Inconclusive Valid Invalid},
- "Mutation_Status" => %w{None Germline Somatic LOH Post-transcriptional modification Unknown},
- "Sequence_Source" => %w{WGS WGA WXS RNA-Seq miRNA-Seq Bisulfite-Seq VALIDATION Other ncRNA-Seq WCS CLONE POOLCLONE AMPLICON CLONEEND FINISHING ChIP-Seq MNase-Seq DNase-Hypersensitivity EST FL-cDNA CTS MRE-Seq MeDIP-Seq MBD-Seq Tn-Seq FAIRE-seq SELEX RIP-Seq ChIA-PET},
- "Sequencer" => ["Illumina GAIIx", "Illumina HiSeq", "SOLID", "454", "ABI 3730xl", "Ion Torrent PGM", "Ion Torrent Proton", "PacBio RS", "Illumina MiSeq", "Illumina HiSeq 2500", "454 GS FLX Titanium", "AB SOLiD 4 System" ]
- }
+ out_base = options[:output_base] || File.basename(input_file,'.*')

- def generate_n3(input_file, options={})
-
- dataset_name = options[:dataset_name] || nil
- output = options[:output] || :file
- output_base = options[:output_base] || nil
-
- @dimensions = %w{Variant_Classification Variant_Type dbSNP_Val_Status Verification_Status Validation_Status Mutation_Status Sequence_Source Sequencer}
- # @codes = %w{Variant_Classification Variant_Type}
- @codes = @dimensions
- @measures = (COLUMN_NAMES - @dimensions - @codes)
- @dataset_name ||= File.basename(input_file,'.*')
- @barcode_index = COLUMN_NAMES.index('Tumor_Sample_Barcode')
-
- options[:no_labels] ||= true
- options[:lookup_hugo] ||= false
- options[:complex_objects] ||= false
- options[:ranges] ||= COMPONENT_RANGES
-
-
- if output == :print
- str = structure(options)
- f = open(input_file)
- n = 0
- f.each_line{|line|
- processed = process_line(line,n.to_s,options)
- str << processed.first if processed
- n +=1
- }
- str
+ if options[:output] == :print
+ output = StringIO.new("")
  else
- # TODO - allow multi file / separate structure output for very large datasets
- # open("#{file_base}_structure.ttl",'w'){|f| f.write structure(options)}
- file_base = output_base || @dataset_name
-
- out = open("#{file_base}.ttl",'w')
- out.write(structure(options))
- f = open(input_file)
- n = 0
- f.each_line{|line|
- processed = process_line(line,n.to_s,options)
- out.write(processed.first) if processed
- n += 1
- }
- if options[:lookup_hugo]
- post_process(out)
- else
- out
- end
+ output = open "#{out_base}.ttl",'w'
  end
- end
-
- def process_line(line,label,options)
- unless line[0] == "#" || line[0..3] == "Hugo"
- entry = ::CSV.parse(line, {col_sep: "\t"}).flatten[0..(COLUMN_NAMES.length-3)]
-
- entry = (entry.fill(nil,entry.length...COLUMN_NAMES.length-2) + parse_barcode(entry[@barcode_index])).flatten
-
- entry[0] = "http://identifiers.org/hgnc.symbol/#{entry[0]}" if entry[0]

- # A 0 in the entrez-id column appears to mean null
- col=1
- entry[col] = nil if entry[col] == '0'
- entry[col] = "http://identifiers.org/ncbigene/#{entry[col]}" if entry[col]
+ PubliSci::Generators::MAF.write_structure(input_file, output, options)

- # Only link non-novel dbSNP entries
- col = COLUMN_NAMES.index('dbSNP_RS')
- if entry[col] && entry[col][0..1] == "rs"
- entry[col] = "http://identifiers.org/dbsnp/#{entry[col].gsub('rs','')}"
- end
-
- # optionally create typed objects using sio nodes
- if options[:complex_objects]
- entry = sio_values(entry)
- end
-
- data = {}
- COLUMN_NAMES.each_with_index{|col,i|
- data[col] = [entry[i]]
- }
-
- observations(@measures,@dimensions,@codes,data,[label],@dataset_name,options)
+ PubliSci::Parsers::MAF.each_record(input_file) do |rec, label|
+ PubliSci::Generators::MAF.write(rec, output, label, options)
  end
- end
-
- def sio_values(entry)
- entry[0] = sio_value('http://edamontology.org/data_1791',entry[0]) if entry[0]
-
- # Link entrez genes
- col=1
- entry[col] = sio_value("http://identifiers.org/ncbigene",entry[col]) if entry[col]
-
- col = COLUMN_NAMES.index('dbSNP_RS')
- entry[col] = sio_value("http://identifiers.org/dbsnp", entry[col])
-
- # test SIO attributes for chromosome
- col = COLUMN_NAMES.index('Chromosome')
- entry[col] = sio_value("http://purl.org/obo/owl/SO#SO_0000340",entry[col])
-
-

- # More SIO attrtibutes for alleles
- %w{Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2}.each{|name|
- col = COLUMN_NAMES.index(name)
- entry[col] = sio_value("http://purl.org/obo/owl/SO#SO_0001023",entry[col])
- }
+ output.close

- col = COLUMN_NAMES.index("Strand")
- entry[col] = sio_attribute("http://edamontology.org/data_0853",entry[col])
-
- col = COLUMN_NAMES.index("Center")
- entry[col] = sio_attribute("foaf:homepage",entry[col])
- # entry[col] = [
- # ["a", "foaf:Organization"],
- # ["foaf:homepage", entry[col]],
- # ]
-
- # Use faldo for locations End_Position
- col = COLUMN_NAMES.index("Start_Position")
- entry[col] = sio_attribute("http://biohackathon.org/resource/faldo#begin", entry[col],"http://biohackathon.org/resource/faldo#Position")
-
- col = COLUMN_NAMES.index("End_Position")
- entry[col] = sio_attribute("http://biohackathon.org/resource/faldo#end", entry[col],"http://biohackathon.org/resource/faldo#Position")
-
- entry
- end
-
- def column_replace(entry,column,prefix,value=nil)
- if value
- entry[COLUMN_NAMES.index(column)] = prefix + value
+ if options[:output] == :print
+ output.string
  else
- entry[COLUMN_NAMES.index(column)] += prefix
+ output.path
  end
  end
-
- def official_symbol(hugo_symbol)
- qry = <<-EOF
-
- SELECT distinct ?official where {
- {?hgnc <http://bio2rdf.org/hgnc_vocabulary:approved_symbol> "#{hugo_symbol}"}
- UNION
- {?hgnc <http://bio2rdf.org/hgnc_vocabulary:synonym> "#{hugo_symbol}"}
-
- ?hgnc <http://bio2rdf.org/hgnc_vocabulary:approved_symbol> ?official
- }
-
- EOF
-
- sparql = SPARQL::Client.new("http://cu.hgnc.bio2rdf.org/sparql")
- sparql.query(qry).map(&:official).first.to_s
- end
-
- def parse_barcode(code)
- #TCGA-E9-A22B-01A-11D-A159-09
- [code[5..11], code[13..-1]]
- end
-
- def structure(options={})
-
- str = prefixes(@dataset_name,options)
- str << data_structure_definition(@measures,@dimensions,@codes,@dataset_name,options)
- str << dataset(@dataset_name,options)
- component_specifications(@measures, @dimensions, @codes, @dataset_name, options).map{ |c| str << c }
- measure_properties(@measures,@dataset_name,options).map{|m| str << m}
- dimension_properties(@dimensions,@codes, @dataset_name,options).map{|d| str << d}
- code_lists(@codes,TCGA_CODES,@dataset_name,options).map{|c| str << c}
- concept_codes(@codes,TCGA_CODES,@dataset_name,options).map{|c| str << c}
- str
- end
-
- def post_process(file)
- reg = %r{http://identifiers.org/hgnc.symbol/(\w+)}
- @@hugo_cache ||= {}
- PubliSci::PostProcessor.process(file,file,reg){|g|
- @@hugo_cache[g] ||= official_symbol(g)
- 'http://identifiers.org/hgnc.symbol/' + cache[g]
- }
- end
  end
  end
  end
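
The MAF reader is now a thin class-level driver: it writes the dataset structure through PubliSci::Generators::MAF, streams each record from PubliSci::Parsers::MAF into the generator, and returns either the Turtle text or the path of the .ttl file it wrote. Illustrative calls (file names invented):

```ruby
# Writes example.ttl in the current directory and returns its path
path = PubliSci::Readers::MAF.generate_n3('example.maf')

# Keeps the output in memory instead
ttl = PubliSci::Readers::MAF.generate_n3('example.maf', output: :print)
```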