publisci 0.1.3 → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -7,7 +7,7 @@ end
7
7
  module PubliSci
8
8
  class Metadata
9
9
  module Generator
10
- include PubliSci::Parser
10
+ include PubliSci::RDFParser
11
11
 
12
12
  def defaults
13
13
  {
@@ -1,5 +1,5 @@
1
1
  module PubliSci
2
- module Parser
2
+ module RDFParser
3
3
 
4
4
  def is_uri?(obj)
5
5
  RDF::Resource(obj).valid?
@@ -38,61 +38,61 @@ module PubliSci
38
38
  h
39
39
  end
40
40
 
41
- def load_string(string,repo=RDF::Repository.new)
42
- f = Tempfile.new('repo')
43
- f.write(string)
44
- f.close
45
- repo.load(f.path, :format => :ttl)
46
- f.unlink
47
- repo
48
- end
41
+ def load_string(string,repo=RDF::Repository.new)
42
+ f = Tempfile.new('repo')
43
+ f.write(string)
44
+ f.close
45
+ repo.load(f.path, :format => :ttl)
46
+ f.unlink
47
+ repo
48
+ end
49
49
 
50
- def get_ary(query_results,method='to_s')
50
+ def get_ary(query_results,method='to_s')
51
51
  query_results.map{|solution|
52
52
  solution.to_a.map{|entry|
53
53
  if entry.last.respond_to? method
54
- entry.last.send(method)
55
- else
56
- entry.last.to_s
57
- end
54
+ entry.last.send(method)
55
+ else
56
+ entry.last.to_s
57
+ end
58
58
  }
59
59
  }
60
60
  end
61
61
 
62
62
  def get_hashes(query_results,method=nil)
63
- arr=[]
64
- query_results.map{|solution|
65
- h={}
66
- solution.map{|element|
67
- if method && element[1].respond_to?(method)
68
- h[element[0]] = element[1].send(method)
69
- else
70
- h[element[0]] = element[1]
71
- end
72
- }
73
- arr << h
74
- }
75
- arr
63
+ arr=[]
64
+ query_results.map{|solution|
65
+ h={}
66
+ solution.map{|element|
67
+ if method && element[1].respond_to?(method)
68
+ h[element[0]] = element[1].send(method)
69
+ else
70
+ h[element[0]] = element[1]
71
+ end
72
+ }
73
+ arr << h
74
+ }
75
+ arr
76
76
  end
77
77
 
78
78
  def observation_hash(query_results,shorten_uris=false,method='to_s')
79
- h={}
80
- query_results.map{|sol|
81
- (h[sol[:observation].to_s] ||= {})[sol[:property].to_s] = sol[:value].to_s
82
- }
79
+ h={}
80
+ query_results.map{|sol|
81
+ (h[sol[:observation].to_s] ||= {})[sol[:property].to_s] = sol[:value].to_s
82
+ }
83
83
 
84
- if shorten_uris
85
- newh= {}
86
- h.map{|k,v|
87
- newh[strip_uri(k)] ||= {}
88
- v.map{|kk,vv|
89
- newh[strip_uri(k)][strip_uri(kk)] = strip_uri(vv)
90
- }
91
- }
92
- newh
93
- else
94
- h
95
- end
84
+ if shorten_uris
85
+ newh= {}
86
+ h.map{|k,v|
87
+ newh[strip_uri(k)] ||= {}
88
+ v.map{|kk,vv|
89
+ newh[strip_uri(k)][strip_uri(kk)] = strip_uri(vv)
90
+ }
91
+ }
92
+ newh
93
+ else
94
+ h
95
+ end
96
96
  end
97
97
 
98
98
  def to_resource(obj, options={})
@@ -162,7 +162,7 @@ module PubliSci
162
162
  to_resource(obj,options)
163
163
  elsif obj && obj.is_a?(String) && (obj[0]=="<" && obj[-1] = ">")
164
164
  obj
165
- elsif obj.is_a?(Array)
165
+ elsif obj.is_a?(Array)
166
166
  node_str = add_node(node_index,node_str)
167
167
  ["#{node_str}" ] + [bnode_value(obj, node_index, node_str, options)]
168
168
  else
@@ -179,7 +179,7 @@ module PubliSci
179
179
  if obj.size == 2
180
180
  if obj[0].is_a?(String)
181
181
  if is_complex?(obj[1])
182
- str << "#{to_resource(obj[0])} #{add_node(node_index,node_str)} . \n"
182
+ str << "#{to_resource(obj[0])} #{add_node(node_index,node_str)} . \n"
183
183
  subnodes << encode_value(obj[1], options, node_index, node_str)
184
184
  else
185
185
  str << "#{to_resource(obj[0])} #{encode_value(obj[1], options, node_index, node_str)} "
@@ -220,7 +220,7 @@ module PubliSci
220
220
  raise "Invalid Structured value: #{obj}"
221
221
  end
222
222
 
223
- if subnodes.size > 0
223
+ if subnodes.size > 0
224
224
  [str, subnodes.flatten].flatten
225
225
  else
226
226
  str
@@ -231,22 +231,22 @@ module PubliSci
231
231
  tabs = 0
232
232
  turtle_str.split("\n").map{|str|
233
233
  case str[-1]
234
- when "."
235
- last_tabs = tabs
236
- tabs = 0
237
- (" " * last_tabs) + str
238
- when ";"
239
- last_tabs = tabs
240
- tabs = 1 if tabs == 0
241
- (" " * last_tabs) + str
242
- else
243
- last_tabs = tabs
244
- if str.size < 2
234
+ when "."
235
+ last_tabs = tabs
245
236
  tabs = 0
237
+ (" " * last_tabs) + str
238
+ when ";"
239
+ last_tabs = tabs
240
+ tabs = 1 if tabs == 0
241
+ (" " * last_tabs) + str
246
242
  else
247
- tabs += 1
248
- end
249
- (" " * last_tabs) + str
243
+ last_tabs = tabs
244
+ if str.size < 2
245
+ tabs = 0
246
+ else
247
+ tabs += 1
248
+ end
249
+ (" " * last_tabs) + str
250
250
  end
251
251
  }.join("\n")
252
252
 
@@ -262,5 +262,5 @@ module PubliSci
262
262
  string.to_s.split(':').last
263
263
  end
264
264
 
265
- end
266
- end
265
+ end
266
+ end
@@ -0,0 +1,29 @@
1
+ module PubliSci
2
+ module Parsers
3
+ module Base
4
+ include Enumerable
5
+ # attr_accessor :dataset_name, :measures, :dimensions, :codes
6
+
7
+ def valid?(rec)
8
+ true
9
+ end
10
+
11
+ def enum_method
12
+ :each
13
+ end
14
+
15
+ def process_record(rec)
16
+ rec
17
+ end
18
+
19
+ def each(input)
20
+ input.send(enum_method).each_with_index do |rec, i|
21
+ yield process_record(rec), i if valid? rec
22
+ end
23
+ end
24
+ alias_method :each_rec, :each
25
+ alias_method :each_record, :each
26
+
27
+ end
28
+ end
29
+ end
@@ -0,0 +1,20 @@
1
+ module PubliSci
2
+ module Parsers
3
+ class MAF
4
+ extend Base
5
+ COLUMN_NAMES = %w{ Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_Position End_Position Strand Variant_Classification Variant_Type Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 dbSNP_RS dbSNP_Val_Status Tumor_Sample_Barcode Matched_Norm_Sample_Barcode Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2 Tumor_Validation_Allele1 Tumor_Validation_Allele2 Match_Norm_Validation_Allele1 Match_Norm_Validation_Allele2 Verification_Status Validation_Status Mutation_Status Sequencing_Phase Sequence_Source Validation_Method Score BAM_File Sequencer Tumor_Sample_UUID Matched_Norm_Sample_UUID patient_id sample_id}
6
+
7
+ def self.valid?(line)
8
+ not (line[0] == "#" || line[0..3] == "Hugo")
9
+ end
10
+
11
+ def enum_method
12
+ :each_line
13
+ end
14
+
15
+ def self.process_record(rec)
16
+ ::CSV.parse(rec, {col_sep: "\t"}).flatten[0..(COLUMN_NAMES.length-3)]
17
+ end
18
+ end
19
+ end
20
+ end
@@ -1,49 +1,49 @@
1
1
  module PubliSci
2
- module Readers
3
- class ARFF
4
- include PubliSci::Dataset::DataCube
2
+ module Readers
3
+ class ARFF
4
+ include PubliSci::Dataset::DataCube
5
5
 
6
- def generate_n3(arff, options={})
7
- arff = IO.read(arff) if File.exist? arff
8
- options[:no_labels] = true # unless options[:no_labels] == nil
9
- @options = options
10
- comps = components(arff)
11
- obs = data(arff, comps.keys)
12
- generate(comps.reject{|c| comps[c][:codes]}.keys, comps.select{|c| comps[c][:codes]}.keys, comps.select{|c| comps[c][:codes]}.keys, obs, (1..obs.first[1].size).to_a, relation(arff), options)
13
- end
6
+ def generate_n3(arff, options={})
7
+ arff = IO.read(arff) if File.exist? arff
8
+ options[:no_labels] = true
9
+ @options = options
10
+ comps = components(arff)
11
+ obs = data(arff, comps.keys)
12
+ generate(comps.reject{|c| comps[c][:codes]}.keys, comps.select{|c| comps[c][:codes]}.keys, comps.select{|c| comps[c][:codes]}.keys, obs, (1..obs.first[1].size).to_a, relation(arff), options)
13
+ end
14
14
 
15
- def relation(arff)
16
- arff.match(/@relation.+/i).to_a.first.split.last
17
- end
15
+ def relation(arff)
16
+ arff.match(/@relation.+/i).to_a.first.split.last
17
+ end
18
18
 
19
- def components(arff)
20
- #still needs support for quoted strings with whitespace
21
- h ={}
22
- arff.split("\n").select{|lin| lin =~ /^@ATTRIBUTE/i}.map{|line|
23
- if line =~ /\{.*}/
24
- name = line.match(/\s.*/).to_a.first.strip.split.first
25
- type = :coded
26
- codes = line.match(/\{.*}/).to_a.first[1..-2].split(',')
27
- h[name] = {type: type, codes: codes}
28
- else
29
- name = line.split[1]
30
- type = line.split[2]
31
- h[name] = {type: type}
32
- end
33
- }
34
- h
35
- end
19
+ def components(arff)
20
+ #still needs support for quoted strings with whitespace
21
+ h ={}
22
+ arff.split("\n").select{|lin| lin =~ /^@ATTRIBUTE/i}.map{|line|
23
+ if line =~ /\{.*}/
24
+ name = line.match(/\s.*/).to_a.first.strip.split.first
25
+ type = :coded
26
+ codes = line.match(/\{.*}/).to_a.first[1..-2].split(',')
27
+ h[name] = {type: type, codes: codes}
28
+ else
29
+ name = line.split[1]
30
+ type = line.split[2]
31
+ h[name] = {type: type}
32
+ end
33
+ }
34
+ h
35
+ end
36
36
 
37
- def data(arff, attributes)
38
- lines = arff.split("\n")
39
- data_lines = lines[lines.index(lines.select{|line| line =~ /^@DATA/i}.first)+1..-1]
40
- h=attributes.inject({}){|ha,attrib| ha[attrib] = []; ha}
41
- data_lines.map{|line|
42
- line = line.split ','
43
- attributes.each_with_index{|a,i| h[a] << line[i]}
44
- }
45
- h
46
- end
47
- end
48
- end
37
+ def data(arff, attributes)
38
+ lines = arff.split("\n")
39
+ data_lines = lines[lines.index(lines.select{|line| line =~ /^@DATA/i}.first)+1..-1]
40
+ h=attributes.inject({}){|ha,attrib| ha[attrib] = []; ha}
41
+ data_lines.map{|line|
42
+ line = line.split ','
43
+ attributes.each_with_index{|a,i| h[a] << line[i]}
44
+ }
45
+ h
46
+ end
47
+ end
48
+ end
49
49
  end
@@ -1,8 +1,8 @@
1
1
  module PubliSci
2
2
  module Readers
3
- class Base
3
+ module Base
4
4
  include PubliSci::Query
5
- include PubliSci::Parser
5
+ include PubliSci::RDFParser
6
6
  include PubliSci::Analyzer
7
7
  include PubliSci::Interactive
8
8
  include PubliSci::Dataset::DataCube
@@ -1,6 +1,7 @@
1
1
  module PubliSci
2
2
  module Readers
3
- class CSV < Base
3
+ class CSV
4
+ include Base
4
5
  def automatic(file=nil,dataset_name=nil,options={},interactive=true)
5
6
  #to do
6
7
  # puts "f #{file} \n ds #{dataset_name} opts #{options}"
@@ -1,199 +1,33 @@
1
1
  module PubliSci
2
2
  module Readers
3
- class MAF < Base
4
- COLUMN_NAMES = %w{ Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_Position End_Position Strand Variant_Classification Variant_Type Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 dbSNP_RS dbSNP_Val_Status Tumor_Sample_Barcode Matched_Norm_Sample_Barcode Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2 Tumor_Validation_Allele1 Tumor_Validation_Allele2 Match_Norm_Validation_Allele1 Match_Norm_Validation_Allele2 Verification_Status Validation_Status Mutation_Status Sequencing_Phase Sequence_Source Validation_Method Score BAM_File Sequencer Tumor_Sample_UUID Matched_Norm_Sample_UUID patient_id sample_id}
3
+ class MAF
4
+ extend PubliSci::Readers::Base
5
5
 
6
- COMPONENT_RANGES = { "Tumor_Sample_Barcode" => "xsd:string", "Start_position" => "xsd:int", "Center" => "xsd:string", "NCBI_Build" => "xsd:int", "Chromosome" => "xsd:int" }
6
+ def self.generate_n3(input_file, options={})
7
+ input_file = open(input_file,'r')
7
8
 
8
- TCGA_CODES =
9
- {
10
- "Variant_Classification" => %w{Frame_Shift_Del Frame_Shift_Ins In_Frame_Del In_Frame_Ins Missense_Mutation Nonsense_Mutation Silent Splice_Site Translation_Start_Site Nonstop_Mutation 3'UTR 3'Flank 5'UTR 5'Flank IGR1 Intron RNA Targeted_Region},
11
- "Variant_Type" => %w{SNP DNP TNP ONP INS DEL Consolidated},
12
- "dbSNP_Val_Status" => %w{by1000genomes by2Hit2Allele byCluster byFrequency byHapMap byOtherPop bySubmitter alternate_allele},
13
- "Verification_Status" => %w{Verified, Unknown},
14
- "Validation_Status" => %w{Untested Inconclusive Valid Invalid},
15
- "Mutation_Status" => %w{None Germline Somatic LOH Post-transcriptional modification Unknown},
16
- "Sequence_Source" => %w{WGS WGA WXS RNA-Seq miRNA-Seq Bisulfite-Seq VALIDATION Other ncRNA-Seq WCS CLONE POOLCLONE AMPLICON CLONEEND FINISHING ChIP-Seq MNase-Seq DNase-Hypersensitivity EST FL-cDNA CTS MRE-Seq MeDIP-Seq MBD-Seq Tn-Seq FAIRE-seq SELEX RIP-Seq ChIA-PET},
17
- "Sequencer" => ["Illumina GAIIx", "Illumina HiSeq", "SOLID", "454", "ABI 3730xl", "Ion Torrent PGM", "Ion Torrent Proton", "PacBio RS", "Illumina MiSeq", "Illumina HiSeq 2500", "454 GS FLX Titanium", "AB SOLiD 4 System" ]
18
- }
9
+ out_base = options[:output_base] || File.basename(input_file,'.*')
19
10
 
20
- def generate_n3(input_file, options={})
21
-
22
- dataset_name = options[:dataset_name] || nil
23
- output = options[:output] || :file
24
- output_base = options[:output_base] || nil
25
-
26
- @dimensions = %w{Variant_Classification Variant_Type dbSNP_Val_Status Verification_Status Validation_Status Mutation_Status Sequence_Source Sequencer}
27
- # @codes = %w{Variant_Classification Variant_Type}
28
- @codes = @dimensions
29
- @measures = (COLUMN_NAMES - @dimensions - @codes)
30
- @dataset_name ||= File.basename(input_file,'.*')
31
- @barcode_index = COLUMN_NAMES.index('Tumor_Sample_Barcode')
32
-
33
- options[:no_labels] ||= true
34
- options[:lookup_hugo] ||= false
35
- options[:complex_objects] ||= false
36
- options[:ranges] ||= COMPONENT_RANGES
37
-
38
-
39
- if output == :print
40
- str = structure(options)
41
- f = open(input_file)
42
- n = 0
43
- f.each_line{|line|
44
- processed = process_line(line,n.to_s,options)
45
- str << processed.first if processed
46
- n +=1
47
- }
48
- str
11
+ if options[:output] == :print
12
+ output = StringIO.new("")
49
13
  else
50
- # TODO - allow multi file / separate structure output for very large datasets
51
- # open("#{file_base}_structure.ttl",'w'){|f| f.write structure(options)}
52
- file_base = output_base || @dataset_name
53
-
54
- out = open("#{file_base}.ttl",'w')
55
- out.write(structure(options))
56
- f = open(input_file)
57
- n = 0
58
- f.each_line{|line|
59
- processed = process_line(line,n.to_s,options)
60
- out.write(processed.first) if processed
61
- n += 1
62
- }
63
- if options[:lookup_hugo]
64
- post_process(out)
65
- else
66
- out
67
- end
14
+ output = open "#{out_base}.ttl",'w'
68
15
  end
69
- end
70
-
71
- def process_line(line,label,options)
72
- unless line[0] == "#" || line[0..3] == "Hugo"
73
- entry = ::CSV.parse(line, {col_sep: "\t"}).flatten[0..(COLUMN_NAMES.length-3)]
74
-
75
- entry = (entry.fill(nil,entry.length...COLUMN_NAMES.length-2) + parse_barcode(entry[@barcode_index])).flatten
76
-
77
- entry[0] = "http://identifiers.org/hgnc.symbol/#{entry[0]}" if entry[0]
78
16
 
79
- # A 0 in the entrez-id column appears to mean null
80
- col=1
81
- entry[col] = nil if entry[col] == '0'
82
- entry[col] = "http://identifiers.org/ncbigene/#{entry[col]}" if entry[col]
17
+ PubliSci::Generators::MAF.write_structure(input_file, output, options)
83
18
 
84
- # Only link non-novel dbSNP entries
85
- col = COLUMN_NAMES.index('dbSNP_RS')
86
- if entry[col] && entry[col][0..1] == "rs"
87
- entry[col] = "http://identifiers.org/dbsnp/#{entry[col].gsub('rs','')}"
88
- end
89
-
90
- # optionally create typed objects using sio nodes
91
- if options[:complex_objects]
92
- entry = sio_values(entry)
93
- end
94
-
95
- data = {}
96
- COLUMN_NAMES.each_with_index{|col,i|
97
- data[col] = [entry[i]]
98
- }
99
-
100
- observations(@measures,@dimensions,@codes,data,[label],@dataset_name,options)
19
+ PubliSci::Parsers::MAF.each_record(input_file) do |rec, label|
20
+ PubliSci::Generators::MAF.write(rec, output, label, options)
101
21
  end
102
- end
103
-
104
- def sio_values(entry)
105
- entry[0] = sio_value('http://edamontology.org/data_1791',entry[0]) if entry[0]
106
-
107
- # Link entrez genes
108
- col=1
109
- entry[col] = sio_value("http://identifiers.org/ncbigene",entry[col]) if entry[col]
110
-
111
- col = COLUMN_NAMES.index('dbSNP_RS')
112
- entry[col] = sio_value("http://identifiers.org/dbsnp", entry[col])
113
-
114
- # test SIO attributes for chromosome
115
- col = COLUMN_NAMES.index('Chromosome')
116
- entry[col] = sio_value("http://purl.org/obo/owl/SO#SO_0000340",entry[col])
117
-
118
-
119
22
 
120
- # More SIO attrtibutes for alleles
121
- %w{Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2}.each{|name|
122
- col = COLUMN_NAMES.index(name)
123
- entry[col] = sio_value("http://purl.org/obo/owl/SO#SO_0001023",entry[col])
124
- }
23
+ output.close
125
24
 
126
- col = COLUMN_NAMES.index("Strand")
127
- entry[col] = sio_attribute("http://edamontology.org/data_0853",entry[col])
128
-
129
- col = COLUMN_NAMES.index("Center")
130
- entry[col] = sio_attribute("foaf:homepage",entry[col])
131
- # entry[col] = [
132
- # ["a", "foaf:Organization"],
133
- # ["foaf:homepage", entry[col]],
134
- # ]
135
-
136
- # Use faldo for locations End_Position
137
- col = COLUMN_NAMES.index("Start_Position")
138
- entry[col] = sio_attribute("http://biohackathon.org/resource/faldo#begin", entry[col],"http://biohackathon.org/resource/faldo#Position")
139
-
140
- col = COLUMN_NAMES.index("End_Position")
141
- entry[col] = sio_attribute("http://biohackathon.org/resource/faldo#end", entry[col],"http://biohackathon.org/resource/faldo#Position")
142
-
143
- entry
144
- end
145
-
146
- def column_replace(entry,column,prefix,value=nil)
147
- if value
148
- entry[COLUMN_NAMES.index(column)] = prefix + value
25
+ if options[:output] == :print
26
+ output.string
149
27
  else
150
- entry[COLUMN_NAMES.index(column)] += prefix
28
+ output.path
151
29
  end
152
30
  end
153
-
154
- def official_symbol(hugo_symbol)
155
- qry = <<-EOF
156
-
157
- SELECT distinct ?official where {
158
- {?hgnc <http://bio2rdf.org/hgnc_vocabulary:approved_symbol> "#{hugo_symbol}"}
159
- UNION
160
- {?hgnc <http://bio2rdf.org/hgnc_vocabulary:synonym> "#{hugo_symbol}"}
161
-
162
- ?hgnc <http://bio2rdf.org/hgnc_vocabulary:approved_symbol> ?official
163
- }
164
-
165
- EOF
166
-
167
- sparql = SPARQL::Client.new("http://cu.hgnc.bio2rdf.org/sparql")
168
- sparql.query(qry).map(&:official).first.to_s
169
- end
170
-
171
- def parse_barcode(code)
172
- #TCGA-E9-A22B-01A-11D-A159-09
173
- [code[5..11], code[13..-1]]
174
- end
175
-
176
- def structure(options={})
177
-
178
- str = prefixes(@dataset_name,options)
179
- str << data_structure_definition(@measures,@dimensions,@codes,@dataset_name,options)
180
- str << dataset(@dataset_name,options)
181
- component_specifications(@measures, @dimensions, @codes, @dataset_name, options).map{ |c| str << c }
182
- measure_properties(@measures,@dataset_name,options).map{|m| str << m}
183
- dimension_properties(@dimensions,@codes, @dataset_name,options).map{|d| str << d}
184
- code_lists(@codes,TCGA_CODES,@dataset_name,options).map{|c| str << c}
185
- concept_codes(@codes,TCGA_CODES,@dataset_name,options).map{|c| str << c}
186
- str
187
- end
188
-
189
- def post_process(file)
190
- reg = %r{http://identifiers.org/hgnc.symbol/(\w+)}
191
- @@hugo_cache ||= {}
192
- PubliSci::PostProcessor.process(file,file,reg){|g|
193
- @@hugo_cache[g] ||= official_symbol(g)
194
- 'http://identifiers.org/hgnc.symbol/' + cache[g]
195
- }
196
- end
197
31
  end
198
32
  end
199
33
  end