publisci 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.document +5 -0
- data/.rspec +1 -0
- data/.travis.yml +13 -0
- data/Gemfile +36 -0
- data/LICENSE.txt +20 -0
- data/README.md +51 -0
- data/README.rdoc +48 -0
- data/Rakefile +68 -0
- data/bin/bio-publisci +106 -0
- data/bin/bio-publisci-server +50 -0
- data/examples/bio-band_integration.rb +9 -0
- data/examples/no_magic.prov +58 -0
- data/examples/no_magic.rb +58 -0
- data/examples/orm.prov +48 -0
- data/examples/primer-full.prov +120 -0
- data/examples/primer.prov +66 -0
- data/examples/prov_dsl.prov +85 -0
- data/examples/safe_gen.rb +7 -0
- data/examples/visualization/primer.prov +66 -0
- data/examples/visualization/prov_viz.rb +140 -0
- data/examples/visualization/viz.rb +35 -0
- data/features/create_generator.feature +21 -0
- data/features/integration.feature +12 -0
- data/features/integration_steps.rb +10 -0
- data/features/metadata.feature +37 -0
- data/features/metadata_steps.rb +40 -0
- data/features/orm.feature +60 -0
- data/features/orm_steps.rb +74 -0
- data/features/prov_dsl.feature +14 -0
- data/features/prov_dsl_steps.rb +11 -0
- data/features/reader.feature +25 -0
- data/features/reader_steps.rb +61 -0
- data/features/step_definitions/bio-publisci_steps.rb +0 -0
- data/features/store.feature +27 -0
- data/features/store_steps.rb +42 -0
- data/features/support/env.rb +13 -0
- data/features/writer.feature +14 -0
- data/features/writer_steps.rb +24 -0
- data/lib/bio-publisci.rb +64 -0
- data/lib/bio-publisci/analyzer.rb +57 -0
- data/lib/bio-publisci/datacube_model.rb +111 -0
- data/lib/bio-publisci/dataset/ORM/data_cube_orm.rb +240 -0
- data/lib/bio-publisci/dataset/ORM/observation.rb +20 -0
- data/lib/bio-publisci/dataset/configuration.rb +31 -0
- data/lib/bio-publisci/dataset/data_cube.rb +418 -0
- data/lib/bio-publisci/dataset/dataset.rb +11 -0
- data/lib/bio-publisci/dataset/dataset_for.rb +186 -0
- data/lib/bio-publisci/dataset/interactive.rb +72 -0
- data/lib/bio-publisci/dsl/config.rb +34 -0
- data/lib/bio-publisci/dsl/dataset_dsl.rb +93 -0
- data/lib/bio-publisci/dsl/dsl.rb +72 -0
- data/lib/bio-publisci/dsl/metadata_dsl.rb +85 -0
- data/lib/bio-publisci/dsl/prov_dsl.rb +143 -0
- data/lib/bio-publisci/metadata/generator.rb +323 -0
- data/lib/bio-publisci/metadata/metadata.rb +5 -0
- data/lib/bio-publisci/metadata/metadata_model.rb +25 -0
- data/lib/bio-publisci/metadata/prov/activity.rb +88 -0
- data/lib/bio-publisci/metadata/prov/agent.rb +100 -0
- data/lib/bio-publisci/metadata/prov/association.rb +107 -0
- data/lib/bio-publisci/metadata/prov/config.rb +34 -0
- data/lib/bio-publisci/metadata/prov/derivation.rb +60 -0
- data/lib/bio-publisci/metadata/prov/element.rb +120 -0
- data/lib/bio-publisci/metadata/prov/entity.rb +64 -0
- data/lib/bio-publisci/metadata/prov/model/prov_models.rb +109 -0
- data/lib/bio-publisci/metadata/prov/plan.rb +32 -0
- data/lib/bio-publisci/metadata/prov/prov.rb +78 -0
- data/lib/bio-publisci/metadata/prov/role.rb +40 -0
- data/lib/bio-publisci/metadata/prov/usage.rb +64 -0
- data/lib/bio-publisci/metadata/publisher.rb +25 -0
- data/lib/bio-publisci/mixins/custom_predicate.rb +38 -0
- data/lib/bio-publisci/mixins/dereferencable.rb +34 -0
- data/lib/bio-publisci/mixins/registry.rb +27 -0
- data/lib/bio-publisci/mixins/vocabulary.rb +8 -0
- data/lib/bio-publisci/output.rb +27 -0
- data/lib/bio-publisci/parser.rb +266 -0
- data/lib/bio-publisci/post_processor.rb +95 -0
- data/lib/bio-publisci/query/query_helper.rb +123 -0
- data/lib/bio-publisci/r_client.rb +54 -0
- data/lib/bio-publisci/readers/arff.rb +49 -0
- data/lib/bio-publisci/readers/base.rb +57 -0
- data/lib/bio-publisci/readers/csv.rb +88 -0
- data/lib/bio-publisci/readers/dataframe.rb +67 -0
- data/lib/bio-publisci/readers/maf.rb +199 -0
- data/lib/bio-publisci/readers/r_cross.rb +112 -0
- data/lib/bio-publisci/readers/r_matrix.rb +176 -0
- data/lib/bio-publisci/store.rb +56 -0
- data/lib/bio-publisci/writers/arff.rb +91 -0
- data/lib/bio-publisci/writers/base.rb +93 -0
- data/lib/bio-publisci/writers/csv.rb +31 -0
- data/lib/bio-publisci/writers/dataframe.rb +81 -0
- data/lib/bio-publisci/writers/json.rb +18 -0
- data/lib/r2rdf.rb +226 -0
- data/lib/template_bak.rb +12 -0
- data/lib/template_bak/publisci.rb +3 -0
- data/lib/vocabs/cc.rb +18 -0
- data/lib/vocabs/cert.rb +13 -0
- data/lib/vocabs/dc.rb +63 -0
- data/lib/vocabs/dc11.rb +23 -0
- data/lib/vocabs/doap.rb +45 -0
- data/lib/vocabs/exif.rb +168 -0
- data/lib/vocabs/foaf.rb +69 -0
- data/lib/vocabs/geo.rb +13 -0
- data/lib/vocabs/http.rb +26 -0
- data/lib/vocabs/ma.rb +78 -0
- data/lib/vocabs/owl.rb +59 -0
- data/lib/vocabs/rdfs.rb +17 -0
- data/lib/vocabs/rsa.rb +12 -0
- data/lib/vocabs/rss.rb +14 -0
- data/lib/vocabs/sioc.rb +93 -0
- data/lib/vocabs/skos.rb +36 -0
- data/lib/vocabs/wot.rb +21 -0
- data/lib/vocabs/xhtml.rb +9 -0
- data/lib/vocabs/xsd.rb +58 -0
- data/resources/maf_example.maf +10 -0
- data/resources/maf_rdf.ttl +1173 -0
- data/resources/primer.ttl +38 -0
- data/resources/queries/code_resources.rq +10 -0
- data/resources/queries/codes.rq +18 -0
- data/resources/queries/dataset.rq +7 -0
- data/resources/queries/dimension_ranges.rq +8 -0
- data/resources/queries/dimensions.rq +12 -0
- data/resources/queries/gene.rq +16 -0
- data/resources/queries/hugo_to_ensembl.rq +7 -0
- data/resources/queries/maf_column.rq +26 -0
- data/resources/queries/measures.rq +12 -0
- data/resources/queries/observation_labels.rq +8 -0
- data/resources/queries/observations.rq +13 -0
- data/resources/queries/patient.rq +11 -0
- data/resources/queries/patient_list.rq +11 -0
- data/resources/queries/patients_with_mutation.rq +18 -0
- data/resources/queries/properties.rq +8 -0
- data/resources/queries/test.rq +3 -0
- data/resources/weather.numeric.arff +28 -0
- data/scripts/get_gene_lengths.rb +50 -0
- data/scripts/islet_mlratio.rb +6 -0
- data/scripts/scan_islet.rb +6 -0
- data/scripts/update_reference.rb +25 -0
- data/server/helpers.rb +215 -0
- data/server/public/src-min-noconflict/LICENSE +24 -0
- data/server/public/src-min-noconflict/ace.js +11 -0
- data/server/public/src-min-noconflict/ext-chromevox.js +1 -0
- data/server/public/src-min-noconflict/ext-elastic_tabstops_lite.js +1 -0
- data/server/public/src-min-noconflict/ext-emmet.js +1 -0
- data/server/public/src-min-noconflict/ext-keybinding_menu.js +1 -0
- data/server/public/src-min-noconflict/ext-language_tools.js +1 -0
- data/server/public/src-min-noconflict/ext-modelist.js +1 -0
- data/server/public/src-min-noconflict/ext-old_ie.js +1 -0
- data/server/public/src-min-noconflict/ext-searchbox.js +1 -0
- data/server/public/src-min-noconflict/ext-settings_menu.js +1 -0
- data/server/public/src-min-noconflict/ext-spellcheck.js +1 -0
- data/server/public/src-min-noconflict/ext-split.js +1 -0
- data/server/public/src-min-noconflict/ext-static_highlight.js +1 -0
- data/server/public/src-min-noconflict/ext-statusbar.js +1 -0
- data/server/public/src-min-noconflict/ext-textarea.js +1 -0
- data/server/public/src-min-noconflict/ext-themelist.js +1 -0
- data/server/public/src-min-noconflict/ext-whitespace.js +1 -0
- data/server/public/src-min-noconflict/keybinding-emacs.js +1 -0
- data/server/public/src-min-noconflict/keybinding-vim.js +1 -0
- data/server/public/src-min-noconflict/mode-ruby.js +1 -0
- data/server/public/src-min-noconflict/snippets/ruby.js +1 -0
- data/server/public/src-min-noconflict/theme-twilight.js +1 -0
- data/server/public/src-min-noconflict/worker-coffee.js +1 -0
- data/server/public/src-min-noconflict/worker-css.js +1 -0
- data/server/public/src-min-noconflict/worker-javascript.js +1 -0
- data/server/public/src-min-noconflict/worker-json.js +1 -0
- data/server/public/src-min-noconflict/worker-lua.js +1 -0
- data/server/public/src-min-noconflict/worker-php.js +1 -0
- data/server/public/src-min-noconflict/worker-xquery.js +1 -0
- data/server/routes.rb +123 -0
- data/server/views/dsl.haml +65 -0
- data/server/views/dump.haml +3 -0
- data/server/views/import.haml +35 -0
- data/server/views/new_repository.haml +25 -0
- data/server/views/query.haml +28 -0
- data/server/views/repository.haml +25 -0
- data/spec/ORM/data_cube_orm_spec.rb +33 -0
- data/spec/ORM/prov_model_spec.rb +72 -0
- data/spec/analyzer_spec.rb +36 -0
- data/spec/bnode_spec.rb +66 -0
- data/spec/csv/bacon.csv +4 -0
- data/spec/csv/moar_bacon.csv +11 -0
- data/spec/data_cube_spec.rb +169 -0
- data/spec/dataset_for_spec.rb +77 -0
- data/spec/dsl_spec.rb +134 -0
- data/spec/generators/csv_spec.rb +44 -0
- data/spec/generators/dataframe_spec.rb +44 -0
- data/spec/generators/maf_spec.rb +40 -0
- data/spec/generators/r_cross_spec.rb +51 -0
- data/spec/generators/r_matrix_spec.rb +44 -0
- data/spec/length_lookup_spec.rb +0 -0
- data/spec/maf_query_spec.rb +343 -0
- data/spec/metadata/metadata_dsl_spec.rb +68 -0
- data/spec/prov/activity_spec.rb +74 -0
- data/spec/prov/agent_spec.rb +54 -0
- data/spec/prov/association_spec.rb +55 -0
- data/spec/prov/config_spec.rb +28 -0
- data/spec/prov/derivation_spec.rb +30 -0
- data/spec/prov/entity_spec.rb +52 -0
- data/spec/prov/role_spec.rb +94 -0
- data/spec/prov/usage_spec.rb +98 -0
- data/spec/queries/integrity/1.rq +21 -0
- data/spec/queries/integrity/11.rq +29 -0
- data/spec/queries/integrity/12.rq +37 -0
- data/spec/queries/integrity/14.rq +25 -0
- data/spec/queries/integrity/19_1.rq +21 -0
- data/spec/queries/integrity/19_2.rq +15 -0
- data/spec/queries/integrity/2.rq +22 -0
- data/spec/queries/integrity/3.rq +19 -0
- data/spec/queries/integrity/4.rq +13 -0
- data/spec/queries/integrity/5.rq +14 -0
- data/spec/r_builder_spec.rb +33 -0
- data/spec/resource/.RData +0 -0
- data/spec/resource/example.Rhistory +3 -0
- data/spec/spec_helper.rb +17 -0
- data/spec/turtle/bacon +147 -0
- data/spec/turtle/reference +2064 -0
- data/spec/turtle/weather +275 -0
- data/spec/writer_spec.rb +75 -0
- metadata +589 -0
@@ -0,0 +1,123 @@
|
|
1
|
+
module RDF
  class Query
    class Solutions
      # Converts the solution set into a plain Array of Hashes.
      #
      # Every solution is enumerated as pairs; the first member of each pair
      # becomes a key and the second member its value in that solution's Hash.
      #
      # @return [Array<Hash>] one Hash per solution, in iteration order
      def to_h
        each_with_object([]) do |solution, rows|
          row = {}
          solution.each { |pair| row[pair[0]] = pair[1] }
          rows << row
        end
      end
    end
  end
end
|
18
|
+
|
19
|
+
module PubliSci
|
20
|
+
#.gsub(/^\s+/,'')
|
21
|
+
module Query
|
22
|
+
# Builds the RDF vocabularies used when composing queries.
#
# @return [Hash{Symbol => RDF::Vocabulary}] vocabularies keyed by prefix name
def vocabulary
  {
    # The namespace is a bare IRI. It used to be wrapped in SPARQL-style
    # angle brackets, which would leak "<...>" into every term built from it.
    base: RDF::Vocabulary.new('http://www.rqtl.org/ns/#'),
    qb:   RDF::Vocabulary.new("http://purl.org/linked-data/cube#"),
    rdf:  RDF::Vocabulary.new('http://www.w3.org/1999/02/22-rdf-syntax-ns#'),
    rdfs: RDF::Vocabulary.new('http://www.w3.org/2000/01/rdf-schema#'),
    prop: RDF::Vocabulary.new('http://www.rqtl.org/dc/properties/'),
    # NOTE(review): unlike the other namespaces this one has no trailing
    # separator; left unchanged - confirm whether "/dc/cs/" was intended.
    cs:   RDF::Vocabulary.new('http://www.rqtl.org/dc/cs')
  }
end
|
32
|
+
|
33
|
+
|
34
|
+
|
35
|
+
# def execute_internal(query,repo)
|
36
|
+
# SPARQL.execute(query,repo)
|
37
|
+
# end
|
38
|
+
|
39
|
+
# Runs a SPARQL query against a store.
#
# @param string [String] the SPARQL query text
# @param store [PubliSci::Store, RDF::FourStore, RDF::Graph, RDF::Repository, String]
#   a store object, an in-memory graph/repository, or the base URL of a
#   4store endpoint ("/sparql/" is appended to URLs)
# @param type [Symbol] :fourstore (default) or :graph; only consulted when
#   +store+ is not a PubliSci::Store or RDF::FourStore
# @return the result of SPARQL::Client#query
# @raise [ArgumentError] when neither +store+ nor +type+ identifies a
#   supported target (this used to surface as a NoMethodError on nil)
def execute(string, store, type = :fourstore)
  target =
    if store.is_a?(PubliSci::Store) || store.is_a?(RDF::FourStore)
      store.url + "/sparql/"
    elsif type == :graph || store.is_a?(RDF::Graph) || store.is_a?(RDF::Repository)
      store
    elsif type == :fourstore
      store + "/sparql/"
    else
      raise ArgumentError, "don't know how to query #{store.class} with type #{type.inspect}"
    end

  SPARQL::Client.new(target).query(string)
end
|
49
|
+
|
50
|
+
# Loads a SPARQL query from disk, applies textual substitutions and runs it.
#
# The query is looked up, in order, as: +file+ itself, +file+ inside the
# bundled queries directory, and +file+ plus ".rq" inside that directory.
# The queries directory comes from the installed bio-publisci gem when one
# is found, otherwise from a path relative to this source file.
#
# @param file [String] path or bundled query name
# @param store target passed through to #execute
# @param type [Symbol] store type passed through to #execute
# @param substitutions [Hash] pattern => replacement pairs applied with gsub
# @return the result of #execute
# @raise [RuntimeError] when no matching query file exists
def execute_from_file(file, store, type = :fourstore, substitutions = {})
  queries_dir =
    if Gem::Dependency.new('bio-publisci').matching_specs.size > 0
      Gem::Specification.find_by_name("bio-publisci").gem_dir + "/resources/queries/"
    else
      File.dirname(__FILE__) + '/../../../resources/queries/'
    end

  path = file if File.exist?(file)
  path ||= [queries_dir + file, queries_dir + file + '.rq'].find { |candidate| File.exist?(candidate) }
  raise "couldn't find query for #{file}" unless path

  query = substitutions.reduce(IO.read(path)) do |text, (pattern, replacement)|
    text.gsub(pattern, replacement)
  end
  execute(query, store, type)
end
|
71
|
+
|
72
|
+
# def prefixes
|
73
|
+
# <<-EOF
|
74
|
+
# PREFIX ns: <http://www.rqtl.org/ns/#>
|
75
|
+
# PREFIX qb: <http://purl.org/linked-data/cube#>
|
76
|
+
# PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
|
77
|
+
# PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
|
78
|
+
# PREFIX prop: <http://www.rqtl.org/dc/properties/>
|
79
|
+
# PREFIX cs: <http://www.rqtl.org/dc/cs/>
|
80
|
+
# PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
|
81
|
+
|
82
|
+
# EOF
|
83
|
+
# end
|
84
|
+
|
85
|
+
# Builds a SPARQL query selecting every value of one property across the
# observations of a dataset.
#
# NOTE(review): relies on a zero-argument +prefixes+ method; only a
# commented-out version is visible in this module - confirm where it comes from.
#
# @param var name of the dataset (interpolated as ns:dataset-<var>)
# @param property name of the property (interpolated as prop:<property>)
# @return [String] the prefix block followed by the SELECT query
def property_values(var, property)
  prefixes << <<-EOS
SELECT ?val WHERE {
?obs qb:dataSet ns:dataset-#{var} ;
prop:#{property} ?val ;
}
  EOS
end
|
95
|
+
|
96
|
+
# Builds a SPARQL query selecting the preferred label of every row
# referenced by the observations of a dataset.
#
# NOTE(review): relies on a zero-argument +prefixes+ method that is not
# defined in the visible part of this module - confirm where it comes from.
#
# @param var name of the dataset (interpolated as ns:dataset-<var>)
# @return [String] the prefix block followed by the SELECT query
def row_names(var)
  query = prefixes
  select = <<-EOS
SELECT ?label WHERE {
?obs qb:dataSet ns:dataset-#{var} ;
prop:refRow ?row .
?row skos:prefLabel ?label .
}
  EOS
  query << select
end
|
106
|
+
|
107
|
+
# Currently will say "___ Component", needs further parsing
|
108
|
+
# Builds a SPARQL query selecting the distinct labels of the components
# attached to a dataset's data structure definition.
#
# NOTE(review): relies on a zero-argument +prefixes+ method that is not
# defined in the visible part of this module - confirm where it comes from.
#
# @param var name of the dataset (interpolated as ns:dsd-<var>)
# @return [String] the prefix block followed by the SELECT query
def property_names(var)
  query = prefixes
  select = <<-EOS
SELECT DISTINCT ?label WHERE {
ns:dsd-#{var} qb:component ?c .
?c rdfs:label ?label
}
  EOS
  query << select
end
|
117
|
+
|
118
|
+
end
|
119
|
+
|
120
|
+
# Exposes every PubliSci::Query method as a class-level method, so queries
# can be built and executed without mixing the module into another object.
class QueryHelper
  extend(PubliSci::Query)
end
|
123
|
+
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
module PubliSci
  # Helper methods wrapping an Rserve connection.
  module Rconnect
    # Opens an Rserve connection.
    #
    # @param address optional argument forwarded to Rserve::Connection.new;
    #   when nil the connection is created with no arguments
    # @return [Rserve::Connection]
    def connect(address = nil)
      return Rserve::Connection.new unless address

      Rserve::Connection.new(address)
    end

    # Evaluates an R +load(...)+ call for the workspace file +file+ in +loc+.
    #
    # @param connection object responding to #eval
    # @param loc [String] directory containing the workspace
    # @param file [String] workspace file name
    def load_workspace(connection, loc = Dir.home, file = ".RData")
      workspace = File.join(loc, file)
      connection.eval "load(\"#{workspace}\")"
    end

    # Evaluates +instruction+ on the connection and returns the result.
    def get(connection, instruction)
      connection.eval(instruction)
    end

    # Evaluates R's +ls()+ on the connection and returns the result.
    def get_vars(connection)
      connection.eval("ls()")
    end
  end

  # Convenience object that owns one connection and a workspace directory.
  class Client
    include PubliSci::Rconnect
    attr_reader :R

    # @param auto [Boolean] when truthy, load the workspace immediately and
    #   print the payload of the variable listing
    # @param loc [String] directory holding the workspace file
    def initialize(auto = true, loc = Dir.home)
      @R = connect
      @loc = loc
      return unless auto

      load_ws
      puts "vars: #{vars.payload}"
    end

    # Loads the workspace found in the configured directory.
    def load_ws
      load_workspace(@R, @loc)
    end

    # Evaluates +var+ on the connection and returns the result.
    def get_var(var)
      get(@R, var)
    end

    # @return [String] path of the default workspace file
    def get_ws
      "#{@loc}/.RData"
    end

    # @return the result of listing the variables on the connection
    def vars
      get_vars(@R)
    end
  end
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
module PubliSci
|
2
|
+
module Readers
|
3
|
+
class ARFF
|
4
|
+
include PubliSci::Dataset::DataCube
|
5
|
+
|
6
|
+
# Converts ARFF input into data cube output via #generate.
#
# Attributes declared with a nominal code list become dimensions and
# codes; all other attributes become measures. Observations are labelled
# 1..N. The :no_labels option is always forced to true.
#
# @param arff [String] ARFF text, or the path of an existing ARFF file
# @param options [Hash] generator options; the caller's hash is no longer
#   modified - a copy with :no_labels set is stored and passed on
# @return the result of #generate
def generate_n3(arff, options = {})
  arff = IO.read(arff) if File.exist? arff
  options = options.merge(no_labels: true)
  @options = options

  comps = components(arff)
  obs = data(arff, comps.keys)
  coded, uncoded = comps.keys.partition { |name| comps[name][:codes] }

  # With no declared attributes there is no column to measure; treat that
  # as zero observations instead of failing on nil.
  row_count = obs.empty? ? 0 : obs.first[1].size

  generate(uncoded, coded, coded.dup, obs, (1..row_count).to_a, relation(arff), options)
end
|
14
|
+
|
15
|
+
# Extracts the relation name from ARFF text: the last whitespace-separated
# token on the first line containing "@relation" (case-insensitive).
#
# @param arff [String] ARFF text
# @return [String] the relation name
def relation(arff)
  declaration = arff[/@relation.+/i]
  declaration.split.last
end
|
18
|
+
|
19
|
+
# Parses the @ATTRIBUTE declarations of ARFF text.
#
# Nominal attributes (declared with a "{a, b, c}" list) are reported as
# { type: :coded, codes: [...] }; every other attribute as { type: <token> }
# where the token is the third whitespace-separated word of the line.
#
# Codes are stripped of surrounding whitespace, so "{sunny, overcast}"
# yields "overcast" rather than " overcast", and the attribute name is read
# correctly even when no space separates it from the opening brace.
#
# Still needs support for quoted strings containing whitespace.
#
# @param arff [String] ARFF text
# @return [Hash{String => Hash}] attribute name => description, in file order
def components(arff)
  declarations = arff.split("\n").grep(/^@ATTRIBUTE/i)

  declarations.each_with_object({}) do |line, comps|
    nominal = line[/\{.*}/]
    if nominal
      name = line.sub(/\{.*}/, '').split[1]
      codes = nominal[1..-2].split(',').map(&:strip)
      comps[name] = { type: :coded, codes: codes }
    else
      _keyword, name, type = line.split
      comps[name] = { type: type }
    end
  end
end
|
36
|
+
|
37
|
+
# Reads the rows following the @DATA marker into per-attribute columns.
#
# Values are split on commas and assigned to +attributes+ by position.
# Blank lines and "%" comment lines are skipped, and a trailing carriage
# return is removed, so they no longer create bogus observations or values.
#
# @param arff [String] ARFF text
# @param attributes [Array<String>] attribute names in declaration order
# @return [Hash{String => Array}] attribute name => column of raw values
# @raise [ArgumentError] when the text has no @DATA section
def data(arff, attributes)
  lines = arff.split("\n")
  marker = lines.index { |line| line =~ /^@DATA/i }
  raise ArgumentError, "ARFF input has no @DATA section" unless marker

  columns = attributes.each_with_object({}) { |attrib, cols| cols[attrib] = [] }

  lines[(marker + 1)..-1].each do |raw|
    line = raw.chomp
    next if line.strip.empty? || line.lstrip.start_with?('%')

    values = line.split(',')
    attributes.each_with_index { |attrib, i| columns[attrib] << values[i] }
  end

  columns
end
|
47
|
+
end
|
48
|
+
end
|
49
|
+
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
module PubliSci
|
2
|
+
module Readers
|
3
|
+
class Base
|
4
|
+
include PubliSci::Query
|
5
|
+
include PubliSci::Parser
|
6
|
+
include PubliSci::Analyzer
|
7
|
+
include PubliSci::Interactive
|
8
|
+
include PubliSci::Dataset::DataCube
|
9
|
+
|
10
|
+
#should be overridden if extra processing/input is required
|
11
|
+
# Default non-interactive entry point: forwards to #generate_n3.
#
# The first argument is passed through unchanged; the arguments after it,
# excluding the final one, are treated as alternating keys and values and
# turned into the options Hash.
#
# NOTE(review): the last argument is dropped (args[1..-2]) - confirm that
# callers always append a trailing value that is meant to be ignored.
def automatic(*args)
  source = args[0]
  option_pairs = args[1..-2]
  generate_n3(source, Hash[*option_pairs])
end
|
14
|
+
|
15
|
+
# Placeholder that always raises; a reader must supply its own version.
#
# @raise [RuntimeError] always
def generate_n3(*args)
  message = "#{self} does not implement a generate_n3 method!"
  raise message
end
|
18
|
+
|
19
|
+
# Describes a typed value as two predicate/object pairs: an "a" pair
# carrying +type+ and a SIO_000300 pair carrying +value+.
#
# @return [Array<Array>] [["a", type], [SIO_000300, value]]
def sio_value(type, value)
  type_pair = ["a", type]
  value_pair = ["http://semanticscience.org/resource/SIO_000300", value]
  [type_pair, value_pair]
end
|
25
|
+
|
26
|
+
# Wraps +value+ in a SIO_000008 structure.
#
# The innermost node is the pair [SIO_000300, value]; when +data_type+ is
# given it is preceded by an ["a", data_type] pair. That node becomes the
# object of a SIO_000008 pair, which in turn is preceded by an
# ["a", attribute_type] pair when +attribute_type+ is given.
#
# @param attribute_type type of the outer node, or nil/false for none
# @param value the wrapped value
# @param data_type type of the inner node, or nil for none
# @return [Array] the nested array structure
def sio_attribute(attribute_type, value, data_type = nil)
  value_node = ["http://semanticscience.org/resource/SIO_000300", value]
  value_node = [["a", data_type], value_node] if data_type

  attribute_node = ["http://semanticscience.org/resource/SIO_000008", value_node]
  return attribute_node unless attribute_type

  [["a", attribute_type], attribute_node]
end
|
47
|
+
|
48
|
+
# Returns the next value of a per-object counter: 0 on the first call,
# then one more on each subsequent call.
#
# @return [Integer]
def next_label
  @__current_label = @__current_label ? @__current_label + 1 : 0
end
|
55
|
+
end
|
56
|
+
end
|
57
|
+
end
|
@@ -0,0 +1,88 @@
|
|
1
|
+
module PubliSci
|
2
|
+
module Readers
|
3
|
+
class CSV < Base
|
4
|
+
# Entry point that fills in missing settings, prompting when interactive.
#
# In interactive mode a missing file, dataset name, :dimensions and
# :measures are requested from the user (via gets / #interact). In
# non-interactive mode the dataset name defaults to the file's base name up
# to its first dot and the options are passed on untouched.
#
# @param file [String, nil] path of the CSV file
# @param dataset_name [String, nil] name for the generated dataset
# @param options [Hash] generator options; may gain :dimensions / :measures
# @param interactive [Boolean] whether prompting is allowed
# @return the result of #generate_n3
# @raise [RuntimeError] when no input file is available
def automatic(file = nil, dataset_name = nil, options = {}, interactive = true)
  if interactive && !file
    puts "Input file?"
    file = gets.chomp
  end

  raise "CSV reader needs an input file" unless file && file.size > 0

  unless dataset_name
    default_name = File.basename(file).split('.').first
    dataset_name =
      if interactive
        interact("Dataset name?", "#{default_name}") { |sel| default_name }
      else
        default_name
      end
  end

  categories = ::CSV.read(file)[0]

  if interactive && !options[:dimensions]
    options[:dimensions] = Array(interact("Dimensions?", categories[0], categories))
  end

  if interactive && !options[:measures]
    candidates = categories - (options[:dimensions] || [categories[0]])
    selection = interact("Measures?", candidates, candidates) { |s| nil }
    options[:measures] = Array(selection) unless selection.nil?
  end

  generate_n3(file, dataset_name, options)
end
|
40
|
+
|
41
|
+
# Reads the CSV file and hands its measures, dimensions, codes and
# observations to #generate.
#
# @param file [String] path of the CSV file
# @param dataset_name [String] name for the generated dataset
# @param options [Hash] generator options (also stored for the accessors)
# @return the result of #generate
def generate_n3(file, dataset_name, options = {})
  @data = ::CSV.read(file)
  @options = options
  cube_parts = [measures, dimensions, codes, observation_data, observation_labels]
  generate(*cube_parts, dataset_name, options)
end
|
46
|
+
|
47
|
+
# @return [Array] the :dimensions option when set, otherwise a one-element
#   array holding the first header cell
def dimensions
  explicit = @options[:dimensions]
  return explicit if explicit

  [@data[0][0]]
end
|
50
|
+
|
51
|
+
# @return [Array] the :codes option when set, otherwise the dimensions
def codes
  configured = @options[:codes]
  configured ? configured : dimensions
end
|
54
|
+
|
55
|
+
# @return [Array] the :measures option when set, otherwise every header
#   cell that is not a dimension
def measures
  return @options[:measures] if @options[:measures]

  @data[0] - dimensions
end
|
58
|
+
|
59
|
+
# Labels for the data rows (every row after the header).
#
# @return [Array] the cell at :label_column of each data row when that
#   option is set, otherwise the row numbers 1..N
def observation_labels
  return (1..@data.size - 1).to_a unless @options[:label_column]

  @data.drop(1).map { |row| row[@options[:label_column]] }
end
|
70
|
+
|
71
|
+
# Pivots the data rows into columns keyed by the header cells.
#
# @return [Hash{String => Array}] header cell => values of that column,
#   in row order
def observation_data
  header = @data[0]
  columns = header.each_with_object({}) { |label, cols| cols[label] = [] }

  @data.drop(1).each do |row|
    row.each_with_index { |cell, i| columns[header[i]] << cell }
  end

  columns
end
|
86
|
+
end
|
87
|
+
end
|
88
|
+
end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
module PubliSci
|
2
|
+
module Readers
|
3
|
+
class Dataframe
|
4
|
+
include PubliSci::Dataset::DataCube
|
5
|
+
include PubliSci::Readers::Output
|
6
|
+
|
7
|
+
# def initialize(var)
|
8
|
+
# @var = var
|
9
|
+
# end
|
10
|
+
|
11
|
+
# Generates data cube output for an R data frame and passes it to #output.
#
# @param rexp the R expression holding the data frame
# @param var name used for the generated dataset
# @param options [Hash] generator options; :type defaults to :string and
#   is written back into the given hash
# @return the result of #output
def generate_n3(rexp, var, options = {})
  @rexp = rexp
  options[:type] ||= :string
  @options = options

  n3 = generate(measures, dimensions, codes, observation_data, observation_labels, var, options)
  output(n3, options)
end
|
17
|
+
|
18
|
+
# @return [Array] the :dimensions option when set; otherwise a one-element
#   array holding the :row_label option, or "refRow" when that is unset too
def dimensions
  @options[:dimensions] || [@options[:row_label] || "refRow"]
end
|
27
|
+
|
28
|
+
# @return [Array] the :codes option when set; otherwise a one-element
#   array holding the :row_label option, or "refRow" when that is unset too
def codes
  @options[:codes] || [@options[:row_label] || "refRow"]
end
|
37
|
+
|
38
|
+
# Measures are the :measures option (or, when unset, every column name of
# the data frame), minus the :dimensions option when one is given.
#
# @return [Array]
def measures
  excluded = @options[:dimensions]
  candidates = @options[:measures] || @rexp.payload.names
  excluded ? candidates - excluded : candidates
end
|
49
|
+
|
50
|
+
# Row labels of the data frame: the "row.names" attribute converted to
# Ruby, or 1..N (N being the size of the first column) when the first
# converted row name is nil or false.
#
# @return [Array]
def observation_labels
  labels = @rexp.attr.payload["row.names"].to_ruby
  return labels if labels.first

  (1..@rexp.payload.first.to_ruby.size).to_a
end
|
55
|
+
|
56
|
+
# Collects every named column of the data frame, converted to Ruby, and
# adds the observation labels under the :row_label option (or "refRow").
#
# @return [Hash] column name => values
def observation_data
  columns = @rexp.payload.names.each_with_object({}) do |name, cols|
    cols[name] = @rexp.payload[name].to_ruby
  end
  columns[@options[:row_label] || "refRow"] = observation_labels
  columns
end
|
65
|
+
end
|
66
|
+
end
|
67
|
+
end
|
@@ -0,0 +1,199 @@
|
|
1
|
+
module PubliSci
|
2
|
+
module Readers
|
3
|
+
class MAF < Base
|
4
|
+
# Column names of a MAF row in file order, followed by two extra columns
# (patient_id, sample_id) that are filled from the tumor sample barcode
# rather than read from the file.
COLUMN_NAMES = %w{ Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_Position End_Position Strand Variant_Classification Variant_Type Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 dbSNP_RS dbSNP_Val_Status Tumor_Sample_Barcode Matched_Norm_Sample_Barcode Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2 Tumor_Validation_Allele1 Tumor_Validation_Allele2 Match_Norm_Validation_Allele1 Match_Norm_Validation_Allele2 Verification_Status Validation_Status Mutation_Status Sequencing_Phase Sequence_Source Validation_Method Score BAM_File Sequencer Tumor_Sample_UUID Matched_Norm_Sample_UUID patient_id sample_id}

# Default datatype ranges, keyed by column name. The start position key is
# spelled exactly like its COLUMN_NAMES entry ("Start_Position"); the former
# "Start_position" spelling could never match a column.
# NOTE(review): "Chromosome" is typed xsd:int although MAF files may carry
# non-numeric chromosomes (e.g. X, Y) - left unchanged, confirm.
COMPONENT_RANGES = { "Tumor_Sample_Barcode" => "xsd:string", "Start_Position" => "xsd:int", "Center" => "xsd:string", "NCBI_Build" => "xsd:int", "Chromosome" => "xsd:int" }

# Permitted values for the coded columns.
#
# Fixes relative to the previous table:
# * Verification_Status held "Verified," (stray comma inside %w).
# * Mutation_Status split "Post-transcriptional modification" into two codes.
# * Variant_Classification held "IGR1", which appears to be "IGR" fused with
#   a footnote marker from the specification - confirm against the MAF spec.
TCGA_CODES =
  {
    "Variant_Classification" => %w{Frame_Shift_Del Frame_Shift_Ins In_Frame_Del In_Frame_Ins Missense_Mutation Nonsense_Mutation Silent Splice_Site Translation_Start_Site Nonstop_Mutation 3'UTR 3'Flank 5'UTR 5'Flank IGR Intron RNA Targeted_Region},
    "Variant_Type" => %w{SNP DNP TNP ONP INS DEL Consolidated},
    "dbSNP_Val_Status" => %w{by1000genomes by2Hit2Allele byCluster byFrequency byHapMap byOtherPop bySubmitter alternate_allele},
    "Verification_Status" => %w{Verified Unknown},
    "Validation_Status" => %w{Untested Inconclusive Valid Invalid},
    "Mutation_Status" => ["None", "Germline", "Somatic", "LOH", "Post-transcriptional modification", "Unknown"],
    "Sequence_Source" => %w{WGS WGA WXS RNA-Seq miRNA-Seq Bisulfite-Seq VALIDATION Other ncRNA-Seq WCS CLONE POOLCLONE AMPLICON CLONEEND FINISHING ChIP-Seq MNase-Seq DNase-Hypersensitivity EST FL-cDNA CTS MRE-Seq MeDIP-Seq MBD-Seq Tn-Seq FAIRE-seq SELEX RIP-Seq ChIA-PET},
    "Sequencer" => ["Illumina GAIIx", "Illumina HiSeq", "SOLID", "454", "ABI 3730xl", "Ion Torrent PGM", "Ion Torrent Proton", "PacBio RS", "Illumina MiSeq", "Illumina HiSeq 2500", "454 GS FLX Titanium", "AB SOLiD 4 System" ]
  }
|
19
|
+
|
20
|
+
# Converts a MAF file into data cube output.
#
# With options[:output] == :print the complete output is returned as a
# String. Otherwise it is written to "<output_base or dataset name>.ttl"
# and the output File is returned (flushed, still open), or the result of
# #post_process when options[:lookup_hugo] is set.
#
# Changes from the previous behaviour:
# * options[:dataset_name] is honoured (it used to be read and discarded);
#   without it the name is resolved exactly as before.
# * options[:no_labels] defaults to true but an explicit false is kept
#   ("||= true" made it impossible to turn off).
# * the input is read with File.foreach, which closes the handle and never
#   interprets the path as a command the way Kernel#open can.
# * the output file is flushed before it is handed on.
#
# @param input_file [String] path of the MAF file
# @param options [Hash] :dataset_name, :output, :output_base, :no_labels,
#   :lookup_hugo, :complex_objects, :ranges; defaults are written back
# @return [String, File, Object] see above
def generate_n3(input_file, options={})
  output = options[:output] || :file
  output_base = options[:output_base] || nil

  @dimensions = %w{Variant_Classification Variant_Type dbSNP_Val_Status Verification_Status Validation_Status Mutation_Status Sequence_Source Sequencer}
  @codes = @dimensions
  @measures = (COLUMN_NAMES - @dimensions - @codes)
  @dataset_name = options[:dataset_name] || @dataset_name || File.basename(input_file,'.*')
  @barcode_index = COLUMN_NAMES.index('Tumor_Sample_Barcode')

  options[:no_labels] = true if options[:no_labels].nil?
  options[:lookup_hugo] ||= false
  options[:complex_objects] ||= false
  options[:ranges] ||= COMPONENT_RANGES

  if output == :print
    str = structure(options)
    each_maf_observation(input_file, options) { |ttl| str << ttl }
    str
  else
    # TODO - allow multi file / separate structure output for very large datasets
    file_base = output_base || @dataset_name

    out = File.open("#{file_base}.ttl",'w')
    out.write(structure(options))
    each_maf_observation(input_file, options) { |ttl| out.write(ttl) }
    # Make sure everything is on disk before anyone re-reads the file.
    out.flush

    options[:lookup_hugo] ? post_process(out) : out
  end
end

# Yields the first element of #process_line's result for every line of
# +input_file+ that produces one. Lines are labelled with their zero-based
# position in the file, counting skipped lines as well.
def each_maf_observation(input_file, options)
  File.foreach(input_file).with_index do |line, n|
    processed = process_line(line, n.to_s, options)
    yield processed.first if processed
  end
end
private :each_maf_observation
|
70
|
+
|
71
|
+
# Converts one line of a MAF file into observation output.
#
# Comment lines ("#..."), the header line ("Hugo...") and blank lines are
# skipped. For every other line the tab-separated values are truncated or
# nil-padded to the number of file columns, the two barcode-derived columns
# are appended, and identifiers are rewritten as identifiers.org URIs:
# the Hugo symbol, the Entrez id (where "0" means "none") and dbSNP ids
# that start with "rs".
#
# Changes from the previous behaviour:
# * CSV options are passed as keywords; the positional Hash form is
#   rejected by Ruby 3.
# * blank lines return nil instead of failing while parsing the barcode.
# * only the leading "rs" of a dbSNP id is removed.
#
# @param line [String] raw line from the file
# @param label [String] label for the observation
# @param options [Hash] generator options; :complex_objects enables SIO nodes
# @return the result of #observations, or nil for skipped lines
def process_line(line,label,options)
  return nil if line.strip.empty? || line[0] == "#" || line[0..3] == "Hugo"

  file_columns = COLUMN_NAMES.length - 2
  entry = ::CSV.parse(line, col_sep: "\t").flatten[0...file_columns]
  entry.fill(nil, entry.length...file_columns)
  entry = (entry + parse_barcode(entry[@barcode_index])).flatten

  entry[0] = "http://identifiers.org/hgnc.symbol/#{entry[0]}" if entry[0]

  # A 0 in the entrez-id column appears to mean null
  col = 1
  entry[col] = nil if entry[col] == '0'
  entry[col] = "http://identifiers.org/ncbigene/#{entry[col]}" if entry[col]

  # Only link non-novel dbSNP entries
  col = COLUMN_NAMES.index('dbSNP_RS')
  if entry[col] && entry[col][0..1] == "rs"
    entry[col] = "http://identifiers.org/dbsnp/#{entry[col].sub(/\Ars/, '')}"
  end

  # optionally create typed objects using sio nodes
  entry = sio_values(entry) if options[:complex_objects]

  data = COLUMN_NAMES.each_with_index.each_with_object({}) do |(name, i), columns|
    columns[name] = [entry[i]]
  end

  observations(@measures,@dimensions,@codes,data,[label],@dataset_name,options)
end
|
103
|
+
|
104
|
+
# Replaces selected cells of a MAF row, in place, with typed SIO structures.
#
# The first two columns are wrapped only when they hold a value; every
# other listed column is wrapped unconditionally.
#
# @param entry [Array] row values ordered like COLUMN_NAMES
# @return [Array] the same +entry+ array
def sio_values(entry)
  entry[0] = sio_value('http://edamontology.org/data_1791', entry[0]) if entry[0]

  # Link entrez genes
  entry[1] = sio_value("http://identifiers.org/ncbigene", entry[1]) if entry[1]

  # Columns wrapped with sio_value: dbSNP ids, the chromosome (a test of
  # SIO attributes) and the allele columns.
  typed_values = [
    ['dbSNP_RS', "http://identifiers.org/dbsnp"],
    ['Chromosome', "http://purl.org/obo/owl/SO#SO_0000340"]
  ]
  %w{Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2}.each do |name|
    typed_values << [name, "http://purl.org/obo/owl/SO#SO_0001023"]
  end
  typed_values.each do |name, type|
    index = COLUMN_NAMES.index(name)
    entry[index] = sio_value(type, entry[index])
  end

  # Columns wrapped with sio_attribute; positions use faldo.
  typed_attributes = [
    ["Strand", "http://edamontology.org/data_0853", nil],
    ["Center", "foaf:homepage", nil],
    ["Start_Position", "http://biohackathon.org/resource/faldo#begin", "http://biohackathon.org/resource/faldo#Position"],
    ["End_Position", "http://biohackathon.org/resource/faldo#end", "http://biohackathon.org/resource/faldo#Position"]
  ]
  typed_attributes.each do |name, attribute_type, data_type|
    index = COLUMN_NAMES.index(name)
    entry[index] = sio_attribute(attribute_type, entry[index], data_type)
  end

  entry
end
|
145
|
+
|
146
|
+
# Rewrites one cell of a MAF row in place.
#
# With +value+ the cell becomes prefix + value; without it +prefix+ is
# appended to the cell's current content.
#
# NOTE(review): in the no-value case the "prefix" ends up after the
# existing content - confirm whether prepending was intended.
#
# @param entry [Array] row values ordered like COLUMN_NAMES
# @param column [String] name of the column to rewrite
# @param prefix [String]
# @param value [String, nil]
# @return [String] the new cell content
def column_replace(entry, column, prefix, value = nil)
  index = COLUMN_NAMES.index(column)
  entry[index] = value ? prefix + value : entry[index] + prefix
end
|
153
|
+
|
154
|
+
# Looks up the approved HGNC symbol for a gene symbol or synonym by
# querying the bio2rdf HGNC SPARQL endpoint.
#
# The symbol is escaped before being embedded in the query, so quotes,
# backslashes or line breaks in the input can neither break the string
# literal nor inject additional query text.
#
# @param hugo_symbol [#to_s] approved symbol or synonym
# @return [String] the first approved symbol found, or "" when none is
def official_symbol(hugo_symbol)
  escapes = { "\n" => "\\n", "\r" => "\\r" }
  literal = hugo_symbol.to_s.gsub(/["\\\n\r]/) { |ch| escapes.fetch(ch, "\\#{ch}") }

  qry = <<-EOF

SELECT distinct ?official where {
{?hgnc <http://bio2rdf.org/hgnc_vocabulary:approved_symbol> "#{literal}"}
UNION
{?hgnc <http://bio2rdf.org/hgnc_vocabulary:synonym> "#{literal}"}

?hgnc <http://bio2rdf.org/hgnc_vocabulary:approved_symbol> ?official
}

  EOF

  sparql = SPARQL::Client.new("http://cu.hgnc.bio2rdf.org/sparql")
  sparql.query(qry).map(&:official).first.to_s
end
|
170
|
+
|
171
|
+
# Splits a tumor sample barcode into two parts by character position.
#
# For "TCGA-E9-A22B-01A-11D-A159-09" this gives "E9-A22B" (characters
# 5..11) and "01A-11D-A159-09" (character 13 onwards).
#
# @param code [String, nil] the barcode; nil when the row has none
# @return [Array] two elements; both nil when +code+ is nil, so rows with
#   a missing barcode no longer raise
def parse_barcode(code)
  return [nil, nil] unless code

  [code[5..11], code[13..-1]]
end
|
175
|
+
|
176
|
+
# Assembles everything that precedes the observations: prefixes, data
# structure definition, dataset, component specifications, measure and
# dimension properties, code lists and concept codes, in that order.
#
# @param options [Hash] generator options passed to every helper
# @return [String] the string returned by #prefixes with all parts appended
def structure(options = {})
  ttl = prefixes(@dataset_name, options)
  ttl << data_structure_definition(@measures, @dimensions, @codes, @dataset_name, options)
  ttl << dataset(@dataset_name, options)

  component_specifications(@measures, @dimensions, @codes, @dataset_name, options).each { |spec| ttl << spec }
  measure_properties(@measures, @dataset_name, options).each { |measure| ttl << measure }
  dimension_properties(@dimensions, @codes, @dataset_name, options).each { |dimension| ttl << dimension }
  code_lists(@codes, TCGA_CODES, @dataset_name, options).each { |list| ttl << list }
  concept_codes(@codes, TCGA_CODES, @dataset_name, options).each { |concept| ttl << concept }

  ttl
end
|
188
|
+
|
189
|
+
# Rewrites every HGNC symbol URI in +file+ to use the approved symbol.
#
# Lookups go through #official_symbol and are cached per class, so each
# symbol is resolved at most once.
#
# Changes from the previous behaviour:
# * the replacement read an undefined local (+cache+), raising NameError
#   on the first match; it now reads the cache that was just filled.
# * when the lookup finds nothing (empty result) the original symbol is
#   kept instead of being replaced by an empty one.
# * the cache lives in a class-level instance variable instead of a class
#   variable; it is still shared by all instances of the class.
#
# @param file the output to rewrite; passed to PubliSci::PostProcessor.process
#   as both input and output
# @return the result of PubliSci::PostProcessor.process
def post_process(file)
  reg = %r{http://identifiers.org/hgnc.symbol/(\w+)}
  cache = self.class.instance_variable_get(:@hugo_cache) ||
          self.class.instance_variable_set(:@hugo_cache, {})

  PubliSci::PostProcessor.process(file, file, reg) do |g|
    cache[g] ||= begin
      official = official_symbol(g)
      official.empty? ? g : official
    end
    'http://identifiers.org/hgnc.symbol/' + cache[g]
  end
end
|
197
|
+
end
|
198
|
+
end
|
199
|
+
end
|