publisci 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.document +5 -0
- data/.rspec +1 -0
- data/.travis.yml +13 -0
- data/Gemfile +36 -0
- data/LICENSE.txt +20 -0
- data/README.md +51 -0
- data/README.rdoc +48 -0
- data/Rakefile +68 -0
- data/bin/bio-publisci +106 -0
- data/bin/bio-publisci-server +50 -0
- data/examples/bio-band_integration.rb +9 -0
- data/examples/no_magic.prov +58 -0
- data/examples/no_magic.rb +58 -0
- data/examples/orm.prov +48 -0
- data/examples/primer-full.prov +120 -0
- data/examples/primer.prov +66 -0
- data/examples/prov_dsl.prov +85 -0
- data/examples/safe_gen.rb +7 -0
- data/examples/visualization/primer.prov +66 -0
- data/examples/visualization/prov_viz.rb +140 -0
- data/examples/visualization/viz.rb +35 -0
- data/features/create_generator.feature +21 -0
- data/features/integration.feature +12 -0
- data/features/integration_steps.rb +10 -0
- data/features/metadata.feature +37 -0
- data/features/metadata_steps.rb +40 -0
- data/features/orm.feature +60 -0
- data/features/orm_steps.rb +74 -0
- data/features/prov_dsl.feature +14 -0
- data/features/prov_dsl_steps.rb +11 -0
- data/features/reader.feature +25 -0
- data/features/reader_steps.rb +61 -0
- data/features/step_definitions/bio-publisci_steps.rb +0 -0
- data/features/store.feature +27 -0
- data/features/store_steps.rb +42 -0
- data/features/support/env.rb +13 -0
- data/features/writer.feature +14 -0
- data/features/writer_steps.rb +24 -0
- data/lib/bio-publisci.rb +64 -0
- data/lib/bio-publisci/analyzer.rb +57 -0
- data/lib/bio-publisci/datacube_model.rb +111 -0
- data/lib/bio-publisci/dataset/ORM/data_cube_orm.rb +240 -0
- data/lib/bio-publisci/dataset/ORM/observation.rb +20 -0
- data/lib/bio-publisci/dataset/configuration.rb +31 -0
- data/lib/bio-publisci/dataset/data_cube.rb +418 -0
- data/lib/bio-publisci/dataset/dataset.rb +11 -0
- data/lib/bio-publisci/dataset/dataset_for.rb +186 -0
- data/lib/bio-publisci/dataset/interactive.rb +72 -0
- data/lib/bio-publisci/dsl/config.rb +34 -0
- data/lib/bio-publisci/dsl/dataset_dsl.rb +93 -0
- data/lib/bio-publisci/dsl/dsl.rb +72 -0
- data/lib/bio-publisci/dsl/metadata_dsl.rb +85 -0
- data/lib/bio-publisci/dsl/prov_dsl.rb +143 -0
- data/lib/bio-publisci/metadata/generator.rb +323 -0
- data/lib/bio-publisci/metadata/metadata.rb +5 -0
- data/lib/bio-publisci/metadata/metadata_model.rb +25 -0
- data/lib/bio-publisci/metadata/prov/activity.rb +88 -0
- data/lib/bio-publisci/metadata/prov/agent.rb +100 -0
- data/lib/bio-publisci/metadata/prov/association.rb +107 -0
- data/lib/bio-publisci/metadata/prov/config.rb +34 -0
- data/lib/bio-publisci/metadata/prov/derivation.rb +60 -0
- data/lib/bio-publisci/metadata/prov/element.rb +120 -0
- data/lib/bio-publisci/metadata/prov/entity.rb +64 -0
- data/lib/bio-publisci/metadata/prov/model/prov_models.rb +109 -0
- data/lib/bio-publisci/metadata/prov/plan.rb +32 -0
- data/lib/bio-publisci/metadata/prov/prov.rb +78 -0
- data/lib/bio-publisci/metadata/prov/role.rb +40 -0
- data/lib/bio-publisci/metadata/prov/usage.rb +64 -0
- data/lib/bio-publisci/metadata/publisher.rb +25 -0
- data/lib/bio-publisci/mixins/custom_predicate.rb +38 -0
- data/lib/bio-publisci/mixins/dereferencable.rb +34 -0
- data/lib/bio-publisci/mixins/registry.rb +27 -0
- data/lib/bio-publisci/mixins/vocabulary.rb +8 -0
- data/lib/bio-publisci/output.rb +27 -0
- data/lib/bio-publisci/parser.rb +266 -0
- data/lib/bio-publisci/post_processor.rb +95 -0
- data/lib/bio-publisci/query/query_helper.rb +123 -0
- data/lib/bio-publisci/r_client.rb +54 -0
- data/lib/bio-publisci/readers/arff.rb +49 -0
- data/lib/bio-publisci/readers/base.rb +57 -0
- data/lib/bio-publisci/readers/csv.rb +88 -0
- data/lib/bio-publisci/readers/dataframe.rb +67 -0
- data/lib/bio-publisci/readers/maf.rb +199 -0
- data/lib/bio-publisci/readers/r_cross.rb +112 -0
- data/lib/bio-publisci/readers/r_matrix.rb +176 -0
- data/lib/bio-publisci/store.rb +56 -0
- data/lib/bio-publisci/writers/arff.rb +91 -0
- data/lib/bio-publisci/writers/base.rb +93 -0
- data/lib/bio-publisci/writers/csv.rb +31 -0
- data/lib/bio-publisci/writers/dataframe.rb +81 -0
- data/lib/bio-publisci/writers/json.rb +18 -0
- data/lib/r2rdf.rb +226 -0
- data/lib/template_bak.rb +12 -0
- data/lib/template_bak/publisci.rb +3 -0
- data/lib/vocabs/cc.rb +18 -0
- data/lib/vocabs/cert.rb +13 -0
- data/lib/vocabs/dc.rb +63 -0
- data/lib/vocabs/dc11.rb +23 -0
- data/lib/vocabs/doap.rb +45 -0
- data/lib/vocabs/exif.rb +168 -0
- data/lib/vocabs/foaf.rb +69 -0
- data/lib/vocabs/geo.rb +13 -0
- data/lib/vocabs/http.rb +26 -0
- data/lib/vocabs/ma.rb +78 -0
- data/lib/vocabs/owl.rb +59 -0
- data/lib/vocabs/rdfs.rb +17 -0
- data/lib/vocabs/rsa.rb +12 -0
- data/lib/vocabs/rss.rb +14 -0
- data/lib/vocabs/sioc.rb +93 -0
- data/lib/vocabs/skos.rb +36 -0
- data/lib/vocabs/wot.rb +21 -0
- data/lib/vocabs/xhtml.rb +9 -0
- data/lib/vocabs/xsd.rb +58 -0
- data/resources/maf_example.maf +10 -0
- data/resources/maf_rdf.ttl +1173 -0
- data/resources/primer.ttl +38 -0
- data/resources/queries/code_resources.rq +10 -0
- data/resources/queries/codes.rq +18 -0
- data/resources/queries/dataset.rq +7 -0
- data/resources/queries/dimension_ranges.rq +8 -0
- data/resources/queries/dimensions.rq +12 -0
- data/resources/queries/gene.rq +16 -0
- data/resources/queries/hugo_to_ensembl.rq +7 -0
- data/resources/queries/maf_column.rq +26 -0
- data/resources/queries/measures.rq +12 -0
- data/resources/queries/observation_labels.rq +8 -0
- data/resources/queries/observations.rq +13 -0
- data/resources/queries/patient.rq +11 -0
- data/resources/queries/patient_list.rq +11 -0
- data/resources/queries/patients_with_mutation.rq +18 -0
- data/resources/queries/properties.rq +8 -0
- data/resources/queries/test.rq +3 -0
- data/resources/weather.numeric.arff +28 -0
- data/scripts/get_gene_lengths.rb +50 -0
- data/scripts/islet_mlratio.rb +6 -0
- data/scripts/scan_islet.rb +6 -0
- data/scripts/update_reference.rb +25 -0
- data/server/helpers.rb +215 -0
- data/server/public/src-min-noconflict/LICENSE +24 -0
- data/server/public/src-min-noconflict/ace.js +11 -0
- data/server/public/src-min-noconflict/ext-chromevox.js +1 -0
- data/server/public/src-min-noconflict/ext-elastic_tabstops_lite.js +1 -0
- data/server/public/src-min-noconflict/ext-emmet.js +1 -0
- data/server/public/src-min-noconflict/ext-keybinding_menu.js +1 -0
- data/server/public/src-min-noconflict/ext-language_tools.js +1 -0
- data/server/public/src-min-noconflict/ext-modelist.js +1 -0
- data/server/public/src-min-noconflict/ext-old_ie.js +1 -0
- data/server/public/src-min-noconflict/ext-searchbox.js +1 -0
- data/server/public/src-min-noconflict/ext-settings_menu.js +1 -0
- data/server/public/src-min-noconflict/ext-spellcheck.js +1 -0
- data/server/public/src-min-noconflict/ext-split.js +1 -0
- data/server/public/src-min-noconflict/ext-static_highlight.js +1 -0
- data/server/public/src-min-noconflict/ext-statusbar.js +1 -0
- data/server/public/src-min-noconflict/ext-textarea.js +1 -0
- data/server/public/src-min-noconflict/ext-themelist.js +1 -0
- data/server/public/src-min-noconflict/ext-whitespace.js +1 -0
- data/server/public/src-min-noconflict/keybinding-emacs.js +1 -0
- data/server/public/src-min-noconflict/keybinding-vim.js +1 -0
- data/server/public/src-min-noconflict/mode-ruby.js +1 -0
- data/server/public/src-min-noconflict/snippets/ruby.js +1 -0
- data/server/public/src-min-noconflict/theme-twilight.js +1 -0
- data/server/public/src-min-noconflict/worker-coffee.js +1 -0
- data/server/public/src-min-noconflict/worker-css.js +1 -0
- data/server/public/src-min-noconflict/worker-javascript.js +1 -0
- data/server/public/src-min-noconflict/worker-json.js +1 -0
- data/server/public/src-min-noconflict/worker-lua.js +1 -0
- data/server/public/src-min-noconflict/worker-php.js +1 -0
- data/server/public/src-min-noconflict/worker-xquery.js +1 -0
- data/server/routes.rb +123 -0
- data/server/views/dsl.haml +65 -0
- data/server/views/dump.haml +3 -0
- data/server/views/import.haml +35 -0
- data/server/views/new_repository.haml +25 -0
- data/server/views/query.haml +28 -0
- data/server/views/repository.haml +25 -0
- data/spec/ORM/data_cube_orm_spec.rb +33 -0
- data/spec/ORM/prov_model_spec.rb +72 -0
- data/spec/analyzer_spec.rb +36 -0
- data/spec/bnode_spec.rb +66 -0
- data/spec/csv/bacon.csv +4 -0
- data/spec/csv/moar_bacon.csv +11 -0
- data/spec/data_cube_spec.rb +169 -0
- data/spec/dataset_for_spec.rb +77 -0
- data/spec/dsl_spec.rb +134 -0
- data/spec/generators/csv_spec.rb +44 -0
- data/spec/generators/dataframe_spec.rb +44 -0
- data/spec/generators/maf_spec.rb +40 -0
- data/spec/generators/r_cross_spec.rb +51 -0
- data/spec/generators/r_matrix_spec.rb +44 -0
- data/spec/length_lookup_spec.rb +0 -0
- data/spec/maf_query_spec.rb +343 -0
- data/spec/metadata/metadata_dsl_spec.rb +68 -0
- data/spec/prov/activity_spec.rb +74 -0
- data/spec/prov/agent_spec.rb +54 -0
- data/spec/prov/association_spec.rb +55 -0
- data/spec/prov/config_spec.rb +28 -0
- data/spec/prov/derivation_spec.rb +30 -0
- data/spec/prov/entity_spec.rb +52 -0
- data/spec/prov/role_spec.rb +94 -0
- data/spec/prov/usage_spec.rb +98 -0
- data/spec/queries/integrity/1.rq +21 -0
- data/spec/queries/integrity/11.rq +29 -0
- data/spec/queries/integrity/12.rq +37 -0
- data/spec/queries/integrity/14.rq +25 -0
- data/spec/queries/integrity/19_1.rq +21 -0
- data/spec/queries/integrity/19_2.rq +15 -0
- data/spec/queries/integrity/2.rq +22 -0
- data/spec/queries/integrity/3.rq +19 -0
- data/spec/queries/integrity/4.rq +13 -0
- data/spec/queries/integrity/5.rq +14 -0
- data/spec/r_builder_spec.rb +33 -0
- data/spec/resource/.RData +0 -0
- data/spec/resource/example.Rhistory +3 -0
- data/spec/spec_helper.rb +17 -0
- data/spec/turtle/bacon +147 -0
- data/spec/turtle/reference +2064 -0
- data/spec/turtle/weather +275 -0
- data/spec/writer_spec.rb +75 -0
- metadata +589 -0
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
module RDF
  class Query
    class Solutions
      # Converts every solution in this collection into a plain Hash.
      #
      # Each solution is enumerated as pairs; the first item of a pair
      # becomes the key and the second item becomes its value.
      #
      # @return [Array<Hash>] one Hash per solution, in iteration order
      def to_h
        map do |solution|
          row = {}
          solution.each { |pair| row[pair[0]] = pair[1] }
          row
        end
      end
    end
  end
end
|
|
18
|
+
|
|
19
|
+
module PubliSci
|
|
20
|
+
#.gsub(/^\s+/,'')
|
|
21
|
+
module Query
|
|
22
|
+
# Builds the RDF vocabularies used by this module, keyed by prefix symbol.
#
# NOTE(review): the :base URI is wrapped in angle brackets and the :cs URI
# has no trailing slash, unlike the other entries -- confirm both are
# intentional before relying on them to mint terms.
#
# @return [Hash{Symbol => RDF::Vocabulary}] a freshly built hash on each call
def vocabulary
  namespaces = [
    [:base, '<http://www.rqtl.org/ns/#>'],
    [:qb,   "http://purl.org/linked-data/cube#"],
    [:rdf,  'http://www.w3.org/1999/02/22-rdf-syntax-ns#'],
    [:rdfs, 'http://www.w3.org/2000/01/rdf-schema#'],
    [:prop, 'http://www.rqtl.org/dc/properties/'],
    [:cs,   'http://www.rqtl.org/dc/cs']
  ]

  namespaces.each_with_object({}) do |(prefix, uri), vocabularies|
    vocabularies[prefix] = RDF::Vocabulary.new(uri)
  end
end
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# def execute_internal(query,repo)
|
|
36
|
+
# SPARQL.execute(query,repo)
|
|
37
|
+
# end
|
|
38
|
+
|
|
39
|
+
# Runs a SPARQL query string against a store.
#
# The client is chosen as follows, in this order:
# * a PubliSci::Store or RDF::FourStore object: its +url+ plus "/sparql/"
# * +type+ of :graph, or an RDF::Graph / RDF::Repository: the object itself
# * +type+ of :fourstore: +store+ is treated as a base URL string
#
# @param string [String] the SPARQL query text
# @param store the store object, graph/repository, or endpoint base URL
# @param type [Symbol] :fourstore (default) or :graph
# @return whatever SPARQL::Client#query returns for the query
# @raise [ArgumentError] if no client can be selected for store/type
#   (previously this surfaced as a NoMethodError on nil)
def execute(string,store,type=:fourstore)
  sparql =
    if store.is_a?(PubliSci::Store) || store.is_a?(RDF::FourStore)
      SPARQL::Client.new(store.url+"/sparql/")
    elsif type == :graph || store.is_a?(RDF::Graph) || store.is_a?(RDF::Repository)
      SPARQL::Client.new(store)
    elsif type == :fourstore
      SPARQL::Client.new(store+"/sparql/")
    else
      raise ArgumentError, "don't know how to query a #{store.class} with type #{type.inspect}"
    end

  sparql.query(string)
end
|
|
49
|
+
|
|
50
|
+
# Loads a SPARQL query from disk, applies textual substitutions and runs it.
#
# The query is looked up, in order, as +file+ itself, then inside the
# bundled queries directory, then inside that directory with ".rq" appended.
# The queries directory comes from the installed "bio-publisci" gem when one
# is found, otherwise from a path relative to this source file.
#
# @param file [String] path or bare name of the query
# @param store passed through to #execute
# @param type [Symbol] passed through to #execute
# @param substitutions [Hash] pattern => replacement pairs applied with gsub
# @return the result of #execute
# @raise [RuntimeError] when no matching query file exists
def execute_from_file(file,store,type=:fourstore,substitutions={})
  queries_dir =
    if Gem::Dependency.new('bio-publisci').matching_specs.size > 0
      Gem::Specification.find_by_name("bio-publisci").gem_dir + "/resources/queries/"
    else
      File.dirname(__FILE__) + '/../../../resources/queries/'
    end

  path =
    if File.exist?(file)
      file
    elsif File.exist?(queries_dir + file)
      queries_dir + file
    elsif File.exist?(queries_dir + file + '.rq')
      queries_dir + file + '.rq'
    else
      raise "couldn't find query for #{file}"
    end

  query = substitutions.inject(IO.read(path)) do |text, (pattern, replacement)|
    text.gsub(pattern, replacement)
  end

  execute(query, store, type)
end
|
|
71
|
+
|
|
72
|
+
# def prefixes
|
|
73
|
+
# <<-EOF
|
|
74
|
+
# PREFIX ns: <http://www.rqtl.org/ns/#>
|
|
75
|
+
# PREFIX qb: <http://purl.org/linked-data/cube#>
|
|
76
|
+
# PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
|
|
77
|
+
# PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
|
|
78
|
+
# PREFIX prop: <http://www.rqtl.org/dc/properties/>
|
|
79
|
+
# PREFIX cs: <http://www.rqtl.org/dc/cs/>
|
|
80
|
+
# PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
|
|
81
|
+
|
|
82
|
+
# EOF
|
|
83
|
+
# end
|
|
84
|
+
|
|
85
|
+
# Builds a SPARQL query selecting the values of one property across the
# observations of a dataset.
#
# NOTE(review): relies on a zero-argument +prefixes+ method; the only such
# definition visible in this module is commented out -- confirm it is
# provided by the including class.
#
# @param var interpolated into the dataset name as "ns:dataset-<var>"
# @param property interpolated as "prop:<property>"
# @return the value returned by +prefixes+ with the SELECT appended to it
def property_values(var, property)
  query = prefixes
  select = <<-EOS
    SELECT ?val WHERE {
      ?obs qb:dataSet ns:dataset-#{var} ;
        prop:#{property} ?val ;
    }
  EOS
  query << select
end
|
|
95
|
+
|
|
96
|
+
# Builds a SPARQL query selecting the preferred labels of the rows that the
# observations of a dataset refer to.
#
# NOTE(review): relies on a zero-argument +prefixes+ method that is not
# defined (only commented out) in this module -- confirm where it comes from.
#
# @param var interpolated into the dataset name as "ns:dataset-<var>"
# @return the value returned by +prefixes+ with the SELECT appended to it
def row_names(var)
  query = prefixes
  select = <<-EOS
    SELECT ?label WHERE {
      ?obs qb:dataSet ns:dataset-#{var} ;
        prop:refRow ?row .
      ?row skos:prefLabel ?label .
    }
  EOS
  query << select
end
|
|
106
|
+
|
|
107
|
+
# Currently will say "___ Component", needs further parsing
|
|
108
|
+
# Builds a SPARQL query selecting the distinct labels of the components
# attached to a data structure definition.
#
# NOTE(review): relies on a zero-argument +prefixes+ method that is not
# defined (only commented out) in this module -- confirm where it comes from.
#
# @param var interpolated into the structure name as "ns:dsd-<var>"
# @return the value returned by +prefixes+ with the SELECT appended to it
def property_names(var)
  query = prefixes
  select = <<-EOS
    SELECT DISTINCT ?label WHERE {
      ns:dsd-#{var} qb:component ?c .
      ?c rdfs:label ?label
    }
  EOS
  query << select
end
|
|
117
|
+
|
|
118
|
+
end
|
|
119
|
+
|
|
120
|
+
# Exposes every PubliSci::Query method as a class-level method, so callers
# can use them without mixing the module into their own class.
class QueryHelper
  extend PubliSci::Query
end
|
|
123
|
+
end
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
module PubliSci
|
|
2
|
+
# Thin helpers around an Rserve connection.
module Rconnect
  # Opens an Rserve connection.
  #
  # @param address passed to Rserve::Connection.new when given; otherwise
  #   the connection is created with no arguments
  # @return [Rserve::Connection]
  def connect(address=nil)
    address ? Rserve::Connection.new(address) : Rserve::Connection.new
  end

  # Asks R to load a saved workspace file.
  #
  # Backslashes and double quotes in the path are escaped so the path stays
  # a single valid R string literal (e.g. Windows-style paths).
  #
  # @param connection an object responding to +eval+
  # @param loc [String] directory containing the workspace
  # @param file [String] workspace file name
  # @return the result of evaluating R's load() call
  def load_workspace(connection,loc=Dir.home,file=".RData")
    path = File.join(loc, file)
    escaped = path.gsub(/["\\]/) { |char| "\\#{char}" }
    connection.eval "load(\"#{escaped}\")"
  end

  # Evaluates an arbitrary R instruction on the connection.
  #
  # @param connection an object responding to +eval+
  # @param instruction [String] R code
  # @return the result of the evaluation
  def get(connection, instruction)
    connection.eval instruction
  end

  # Lists the variables of the R session by evaluating "ls()".
  #
  # @param connection an object responding to +eval+
  # @return the result of the evaluation
  def get_vars(connection)
    connection.eval("ls()")
  end
end
|
|
26
|
+
|
|
27
|
+
# Convenience wrapper that owns one R connection and a workspace directory.
class Client
  include PubliSci::Rconnect
  attr :R

  # Connects to R and, when +auto+ is truthy, loads the workspace found in
  # +loc+ and prints the names of the variables now present.
  #
  # @param auto [Boolean] load the workspace and print its variables
  # @param loc [String] directory holding the ".RData" file
  def initialize(auto=true, loc=Dir.home)
    @R = connect
    @loc = loc
    return unless auto

    load_ws
    puts "vars: #{vars.payload}"
  end

  # Loads the workspace from the configured directory into the connection.
  def load_ws
    load_workspace(@R, @loc)
  end

  # Evaluates +var+ on the connection and returns the result.
  def get_var(var)
    get(@R, var)
  end

  # @return [String] the configured directory followed by "/.RData"
  def get_ws
    "#{@loc}/.RData"
  end

  # @return the result of listing the session variables on the connection
  def vars
    get_vars(@R)
  end
end
|
|
54
|
+
end
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
module PubliSci
|
|
2
|
+
module Readers
|
|
3
|
+
class ARFF
|
|
4
|
+
include PubliSci::Dataset::DataCube
|
|
5
|
+
|
|
6
|
+
# Converts ARFF content (or the file it names) by handing its parts to
# +generate+.
#
# Attributes declared with a code list are passed as both dimensions and
# codes; all other attributes are passed as measures. Observation labels
# are 1..n, where n is the number of values of the first attribute.
#
# @param arff [String] ARFF text, or a path to an existing ARFF file
# @param options [Hash] forwarded to +generate+; :no_labels is forced to
#   true on the hash that was passed in
# @return whatever +generate+ returns
def generate_n3(arff, options={})
  arff = IO.read(arff) if File.exist? arff
  options[:no_labels] = true # unless options[:no_labels] == nil
  @options = options

  comps = components(arff)
  obs = data(arff, comps.keys)

  coded, plain = comps.keys.partition { |name| comps[name][:codes] }
  labels = (1..obs.first[1].size).to_a

  generate(plain, coded, coded.dup, obs, labels, relation(arff), options)
end
|
|
14
|
+
|
|
15
|
+
# Extracts the relation name: the last whitespace-separated token on the
# first "@relation" line (matched case-insensitively).
#
# @param arff [String] ARFF text
# @return [String] the relation name
def relation(arff)
  declaration = arff[/@relation.+/i]
  declaration.split.last
end
|
|
18
|
+
|
|
19
|
+
# Parses the "@ATTRIBUTE" declarations of an ARFF document.
#
# Attributes declared with a brace-delimited code list become
# {type: :coded, codes: [...]}; all others become {type: <declared type>}.
# Codes are stripped of surrounding whitespace, so "{sunny, rainy}" yields
# "sunny" and "rainy" rather than "sunny" and " rainy".
#
# Still needs support for quoted strings with whitespace.
#
# @param arff [String] ARFF text
# @return [Hash{String => Hash}] attribute name => description, in file order
def components(arff)
  declarations = arff.split("\n").select { |line| line =~ /^@ATTRIBUTE/i }

  declarations.each_with_object({}) do |line, comps|
    # The attribute name is always the second whitespace-separated token.
    name = line.split[1]
    code_list = line[/\{.*}/]

    if code_list
      codes = code_list[1..-2].split(',').map(&:strip)
      comps[name] = {type: :coded, codes: codes}
    else
      comps[name] = {type: line.split[2]}
    end
  end
end
|
|
36
|
+
|
|
37
|
+
# Collects the values of the "@DATA" section column by column.
#
# Blank lines and "%" comment lines inside the data section are skipped,
# and each field is stripped of surrounding whitespace.
#
# @param arff [String] ARFF text
# @param attributes [Array<String>] attribute names, in column order
# @return [Hash{String => Array}] attribute name => values in row order
# @raise [ArgumentError] when the document has no "@DATA" line
def data(arff, attributes)
  lines = arff.split("\n")
  marker = lines.index { |line| line =~ /^@DATA/i }
  raise ArgumentError, "ARFF input has no @DATA section" unless marker

  columns = attributes.each_with_object({}) { |attrib, cols| cols[attrib] = [] }

  lines[(marker + 1)..-1].each do |line|
    next if line.strip.empty? || line.lstrip.start_with?('%')

    fields = line.split(',').map(&:strip)
    attributes.each_with_index { |attrib, i| columns[attrib] << fields[i] }
  end

  columns
end
|
|
47
|
+
end
|
|
48
|
+
end
|
|
49
|
+
end
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
module PubliSci
|
|
2
|
+
module Readers
|
|
3
|
+
class Base
|
|
4
|
+
include PubliSci::Query
|
|
5
|
+
include PubliSci::Parser
|
|
6
|
+
include PubliSci::Analyzer
|
|
7
|
+
include PubliSci::Interactive
|
|
8
|
+
include PubliSci::Dataset::DataCube
|
|
9
|
+
|
|
10
|
+
#should be overridden if extra processing/input is required
|
|
11
|
+
# Default entry point: forwards to #generate_n3.
#
# The first argument is passed through unchanged. The arguments after it,
# except the last one, are turned into a Hash with Hash[] and passed as the
# second argument.
# NOTE(review): the final argument is always dropped -- confirm that this
# matches how callers invoke it.
#
# @return whatever #generate_n3 returns
def automatic(*args)
  source, *rest = args
  option_items = rest[0..-2]
  generate_n3(source, Hash[*option_items])
end
|
|
14
|
+
|
|
15
|
+
# Placeholder that concrete readers override.
#
# @raise [RuntimeError] always, naming the receiver
def generate_n3(*args)
  message = "#{self} does not implement a generate_n3 method!"
  raise message
end
|
|
18
|
+
|
|
19
|
+
# Builds a two-pair structure: a type pair ("a", type) followed by a value
# pair keyed by the SIO_000300 URI.
#
# @param type the object placed in the "a" pair
# @param value the object placed in the SIO_000300 pair
# @return [Array<Array>] [["a", type], [<SIO_000300 URI>, value]]
def sio_value(type,value)
  type_pair = ["a", type]
  value_pair = ["http://semanticscience.org/resource/SIO_000300", value]
  [type_pair, value_pair]
end
|
|
25
|
+
|
|
26
|
+
# Builds a nested structure keyed by the SIO_000008 URI whose inner part
# carries the value under the SIO_000300 URI.
#
# When +data_type+ is given, the inner part is prefixed with an
# ("a", data_type) pair; when +attribute_type+ is given, the outer part is
# prefixed with an ("a", attribute_type) pair.
#
# @param attribute_type optional type for the outer part
# @param value the value stored under SIO_000300
# @param data_type optional type for the inner part
# @return [Array] the nested array structure
def sio_attribute(attribute_type,value,data_type=nil)
  inner = ["http://semanticscience.org/resource/SIO_000300", value]
  inner = [["a", data_type], inner] if data_type

  outer = ["http://semanticscience.org/resource/SIO_000008", inner]
  attribute_type ? [["a", attribute_type], outer] : outer
end
|
|
47
|
+
|
|
48
|
+
# Returns a running counter kept on the instance: 0 on the first call, then
# one more on every later call.
#
# @return [Integer]
def next_label
  @__current_label = @__current_label ? @__current_label + 1 : 0
end
|
|
55
|
+
end
|
|
56
|
+
end
|
|
57
|
+
end
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
module PubliSci
|
|
2
|
+
module Readers
|
|
3
|
+
class CSV < Base
|
|
4
|
+
# Gathers whatever is missing (input file, dataset name, dimensions,
# measures) and then calls #generate_n3.
#
# In interactive mode missing pieces are requested through +gets+ and
# +interact+; otherwise the dataset name defaults to the part of the file's
# base name before the first ".", and dimensions/measures are left to
# #generate_n3.
#
# @param file [String, nil] path of the CSV file
# @param dataset_name [String, nil] name for the generated dataset
# @param options [Hash] may carry :dimensions and :measures; it is updated
#   in place with interactive selections
# @param interactive [Boolean] whether to prompt for missing values
# @return whatever #generate_n3 returns
# @raise [RuntimeError] when no (non-empty) input file is available
def automatic(file=nil,dataset_name=nil,options={},interactive=true)
  if !file && interactive
    puts "Input file?"
    file = gets.chomp
  end

  raise "CSV reader needs an input file" unless file && file.size > 0

  unless dataset_name
    default_name = File.basename(file).split('.').first
    dataset_name =
      if interactive
        interact("Dataset name?", "#{default_name}") { |_sel| default_name }
      else
        default_name
      end
  end

  # Only the header row is used here.
  categories = ::CSV.read(file)[0]

  if !options[:dimensions] && interactive
    options[:dimensions] = Array(interact("Dimensions?", categories[0], categories))
  end

  if !options[:measures] && interactive
    meas = categories - (options[:dimensions] || [categories[0]])
    selection = interact("Measures?", meas, meas) { |_s| nil }
    options[:measures] = Array(selection) unless selection.nil?
  end

  generate_n3(file, dataset_name, options)
end
|
|
40
|
+
|
|
41
|
+
# Reads the CSV file into memory and hands its parts to +generate+.
#
# @param file [String] path of the CSV file
# @param dataset_name [String] name for the generated dataset
# @param options [Hash] stored on the instance and forwarded to +generate+
# @return whatever +generate+ returns
def generate_n3(file, dataset_name, options={})
  @data = ::CSV.read(file)
  @options = options

  cube_parts = [measures, dimensions, codes, observation_data, observation_labels]
  generate(*cube_parts, dataset_name, options)
end
|
|
46
|
+
|
|
47
|
+
# @return [Array] the :dimensions option, or by default a one-element list
#   holding the first header of the loaded CSV data
def dimensions
  chosen = @options[:dimensions]
  chosen ? chosen : [@data[0][0]]
end

# @return [Array] the :codes option, or by default the same as #dimensions
def codes
  chosen = @options[:codes]
  chosen ? chosen : dimensions
end

# @return [Array] the :measures option, or by default every header that is
#   not a dimension
def measures
  chosen = @options[:measures]
  chosen ? chosen : @data[0] - dimensions
end
|
|
58
|
+
|
|
59
|
+
# Labels for the data rows (every row after the header).
#
# @return [Array] the values of the :label_column index for each data row
#   when that option is set; otherwise the integers 1..(number of data rows)
def observation_labels
  column = @options[:label_column]
  return (1..@data.size - 1).to_a unless column

  @data.drop(1).map { |row| row[column] }
end
|
|
70
|
+
|
|
71
|
+
# Pivots the loaded CSV rows into columns.
#
# @return [Hash{String => Array}] header => values of that column, in row
#   order; the header row itself is not included in the values
def observation_data
  header = @data[0]
  columns = {}
  header.each { |label| columns[label] = [] }

  @data.drop(1).each do |row|
    row.each_with_index { |entry, i| columns[header[i]] << entry }
  end

  columns
end
|
|
86
|
+
end
|
|
87
|
+
end
|
|
88
|
+
end
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
module PubliSci
|
|
2
|
+
module Readers
|
|
3
|
+
class Dataframe
|
|
4
|
+
include PubliSci::Dataset::DataCube
|
|
5
|
+
include PubliSci::Readers::Output
|
|
6
|
+
|
|
7
|
+
# def initialize(var)
|
|
8
|
+
# @var = var
|
|
9
|
+
# end
|
|
10
|
+
|
|
11
|
+
# Converts an R dataframe expression by handing its parts to +generate+ and
# passing the result to +output+.
#
# @param rexp the R expression object; stored on the instance
# @param var name for the generated dataset
# @param options [Hash] :type defaults to :string (set on the passed hash);
#   stored on the instance and forwarded to +generate+ and +output+
# @return whatever +output+ returns
def generate_n3(rexp, var, options={})
  @rexp = rexp
  options[:type] = :string unless options[:type]
  @options = options

  cube_parts = [measures, dimensions, codes, observation_data, observation_labels]
  generated = generate(*cube_parts, var, options)
  output(generated, options)
end
|
|
17
|
+
|
|
18
|
+
# @return [Array] the :dimensions option; otherwise a one-element list with
#   the :row_label option, or "refRow" when that is not set either
def dimensions
  @options[:dimensions] || [@options[:row_label] || "refRow"]
end

# @return [Array] the :codes option; otherwise a one-element list with the
#   :row_label option, or "refRow" when that is not set either
def codes
  @options[:codes] || [@options[:row_label] || "refRow"]
end

# @return [Array] the :measures option, or the names of the dataframe
#   payload when it is not set; when a :dimensions option exists, those
#   dimensions are removed from the result
def measures
  candidates = @options[:measures] || @rexp.payload.names
  dims = @options[:dimensions]
  dims ? candidates - dims : candidates
end
|
|
49
|
+
|
|
50
|
+
# Row labels of the dataframe.
#
# @return [Array] the "row.names" attribute converted with +to_ruby+; when
#   its first entry is nil/false, the integers 1..(size of the first payload
#   column) are returned instead
def observation_labels
  names = @rexp.attr.payload["row.names"].to_ruby
  return names if names.first

  (1..@rexp.payload.first.to_ruby.size).to_a
end
|
|
55
|
+
|
|
56
|
+
# Collects the dataframe columns, plus one extra column with the row labels.
#
# @return [Hash] payload name => that column converted with +to_ruby+, and
#   the row labels stored under the :row_label option (or "refRow")
def observation_data
  payload = @rexp.payload
  columns = payload.names.each_with_object({}) do |name, cols|
    cols[name] = @rexp.payload[name].to_ruby
  end

  columns[@options[:row_label] || "refRow"] = observation_labels
  columns
end
|
|
65
|
+
end
|
|
66
|
+
end
|
|
67
|
+
end
|
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
module PubliSci
|
|
2
|
+
module Readers
|
|
3
|
+
class MAF < Base
|
|
4
|
+
# Column order of a MAF row as consumed by this reader. The last two
# entries (patient_id, sample_id) are not read from the file; they are
# filled in from the parsed tumor sample barcode.
COLUMN_NAMES = %w{
  Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_Position End_Position Strand
  Variant_Classification Variant_Type Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2
  dbSNP_RS dbSNP_Val_Status Tumor_Sample_Barcode Matched_Norm_Sample_Barcode
  Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2 Tumor_Validation_Allele1 Tumor_Validation_Allele2
  Match_Norm_Validation_Allele1 Match_Norm_Validation_Allele2 Verification_Status Validation_Status
  Mutation_Status Sequencing_Phase Sequence_Source Validation_Method Score BAM_File Sequencer
  Tumor_Sample_UUID Matched_Norm_Sample_UUID patient_id sample_id
}

# Default ranges for selected components. Keys must match COLUMN_NAMES
# exactly ("Start_Position" was previously misspelled "Start_position").
# NOTE(review): "Chromosome" is typed xsd:int although non-numeric
# chromosome names exist -- confirm this is intended.
COMPONENT_RANGES = {
  "Tumor_Sample_Barcode" => "xsd:string",
  "Start_Position" => "xsd:int",
  "Center" => "xsd:string",
  "NCBI_Build" => "xsd:int",
  "Chromosome" => "xsd:int"
}

# Enumerated values for the coded columns.
# Fixes relative to the earlier list: "Verified" no longer carries a
# trailing comma, "Post-transcriptional modification" is one code rather
# than two, and "IGR" replaces "IGR1" (the "1" looks like a footnote marker
# copied from the specification).
TCGA_CODES =
  {
    "Variant_Classification" => %w{
      Frame_Shift_Del Frame_Shift_Ins In_Frame_Del In_Frame_Ins Missense_Mutation Nonsense_Mutation
      Silent Splice_Site Translation_Start_Site Nonstop_Mutation 3'UTR 3'Flank 5'UTR 5'Flank
      IGR Intron RNA Targeted_Region
    },
    "Variant_Type" => %w{SNP DNP TNP ONP INS DEL Consolidated},
    "dbSNP_Val_Status" => %w{by1000genomes by2Hit2Allele byCluster byFrequency byHapMap byOtherPop bySubmitter alternate_allele},
    "Verification_Status" => %w{Verified Unknown},
    "Validation_Status" => %w{Untested Inconclusive Valid Invalid},
    "Mutation_Status" => ["None", "Germline", "Somatic", "LOH", "Post-transcriptional modification", "Unknown"],
    "Sequence_Source" => %w{
      WGS WGA WXS RNA-Seq miRNA-Seq Bisulfite-Seq VALIDATION Other ncRNA-Seq WCS CLONE POOLCLONE
      AMPLICON CLONEEND FINISHING ChIP-Seq MNase-Seq DNase-Hypersensitivity EST FL-cDNA CTS MRE-Seq
      MeDIP-Seq MBD-Seq Tn-Seq FAIRE-seq SELEX RIP-Seq ChIA-PET
    },
    "Sequencer" => ["Illumina GAIIx", "Illumina HiSeq", "SOLID", "454", "ABI 3730xl", "Ion Torrent PGM", "Ion Torrent Proton", "PacBio RS", "Illumina MiSeq", "Illumina HiSeq 2500", "454 GS FLX Titanium", "AB SOLiD 4 System" ]
  }
|
|
19
|
+
|
|
20
|
+
# Converts a MAF file line by line.
#
# Recognised options (all optional):
# * :dataset_name -- name of the dataset; defaults to the input file's base
#   name without extension (this option used to be read but ignored)
# * :output -- :print returns the generated text as a String; anything else
#   (default :file) writes "<output_base>.ttl" and returns the open File
# * :output_base -- base name of the output file; defaults to the dataset name
# * :lookup_hugo -- when truthy (file output only), the written file is
#   passed through #post_process and that result is returned
# * :complex_objects, :ranges, :no_labels -- forwarded to the generators
#
# NOTE(review): ":no_labels ||= true" means a caller cannot switch labels on
# by passing false -- kept as is; confirm whether that is intended.
#
# @param input_file [String] path of the MAF file
# @param options [Hash] see above; defaults are written into this hash
# @return [String, File, Object] see :output / :lookup_hugo above
def generate_n3(input_file, options={})
  dataset_name = options[:dataset_name]
  output = options[:output] || :file
  output_base = options[:output_base]

  @dimensions = %w{Variant_Classification Variant_Type dbSNP_Val_Status Verification_Status Validation_Status Mutation_Status Sequence_Source Sequencer}
  # @codes = %w{Variant_Classification Variant_Type}
  @codes = @dimensions
  @measures = (COLUMN_NAMES - @dimensions - @codes)
  # Recomputed on every call so a reader reused for a second file does not
  # keep the first file's dataset name.
  @dataset_name = dataset_name || File.basename(input_file,'.*')
  @barcode_index = COLUMN_NAMES.index('Tumor_Sample_Barcode')

  options[:no_labels] ||= true
  options[:lookup_hugo] ||= false
  options[:complex_objects] ||= false
  options[:ranges] ||= COMPONENT_RANGES

  if output == :print
    str = structure(options)
    # File.foreach closes the input when done and never spawns a subprocess
    # for paths starting with "|", unlike Kernel#open.
    File.foreach(input_file).with_index do |line, n|
      processed = process_line(line, n.to_s, options)
      str << processed.first if processed
    end
    str
  else
    # TODO - allow multi file / separate structure output for very large datasets
    # open("#{file_base}_structure.ttl",'w'){|f| f.write structure(options)}
    file_base = output_base || @dataset_name

    out = File.open("#{file_base}.ttl",'w')
    out.write(structure(options))
    File.foreach(input_file).with_index do |line, n|
      processed = process_line(line, n.to_s, options)
      out.write(processed.first) if processed
    end
    # The handle is returned (or post-processed) while still open, so push
    # buffered output to disk first.
    out.flush

    if options[:lookup_hugo]
      post_process(out)
    else
      out
    end
  end
end
|
|
70
|
+
|
|
71
|
+
# Converts one line of a MAF file into observation output.
#
# Comment lines (starting with "#"), the header line (starting with "Hugo")
# and blank lines are skipped. For data lines the tab-separated fields are
# padded to the expected width, the patient and sample ids parsed from the
# tumor barcode are appended, and the gene symbol, Entrez id and dbSNP id
# are rewritten as identifiers.org URIs.
#
# @param line [String] one raw line of the MAF file
# @param label [String] label used for the observation
# @param options [Hash] :complex_objects switches on SIO-typed values; the
#   hash is also forwarded to +observations+
# @return the result of +observations+, or nil for skipped lines
def process_line(line,label,options)
  return if line[0] == "#" || line[0..3] == "Hugo" || line.strip.empty?

  # Options go in as keywords: a positional Hash is rejected by CSV.parse
  # on Ruby 3.
  entry = ::CSV.parse(line, col_sep: "\t").flatten[0..(COLUMN_NAMES.length-3)]

  entry = (entry.fill(nil,entry.length...COLUMN_NAMES.length-2) + parse_barcode(entry[@barcode_index])).flatten

  entry[0] = "http://identifiers.org/hgnc.symbol/#{entry[0]}" if entry[0]

  # A 0 in the entrez-id column appears to mean null
  col=1
  entry[col] = nil if entry[col] == '0'
  entry[col] = "http://identifiers.org/ncbigene/#{entry[col]}" if entry[col]

  # Only link non-novel dbSNP entries
  col = COLUMN_NAMES.index('dbSNP_RS')
  if entry[col] && entry[col][0..1] == "rs"
    entry[col] = "http://identifiers.org/dbsnp/#{entry[col].gsub('rs','')}"
  end

  # optionally create typed objects using sio nodes
  entry = sio_values(entry) if options[:complex_objects]

  data = {}
  COLUMN_NAMES.each_with_index { |name, i| data[name] = [entry[i]] }

  observations(@measures,@dimensions,@codes,data,[label],@dataset_name,options)
end
|
|
103
|
+
|
|
104
|
+
# Replaces selected columns of a parsed MAF row with typed SIO structures.
#
# The gene symbol (column 0) and Entrez id (column 1) are wrapped only when
# present; the dbSNP, chromosome and allele columns are always wrapped with
# #sio_value; strand, center and start/end position are wrapped with
# #sio_attribute.
#
# @param entry [Array] row values in COLUMN_NAMES order; modified in place
# @return [Array] the same +entry+ array
def sio_values(entry)
  entry[0] = sio_value('http://edamontology.org/data_1791', entry[0]) if entry[0]

  # Link entrez genes
  entry[1] = sio_value("http://identifiers.org/ncbigene", entry[1]) if entry[1]

  allele_columns = %w{Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2}

  # [column, type] pairs wrapped unconditionally; chromosome and allele
  # entries use sequence-ontology types.
  typed_columns = [
    ['dbSNP_RS', "http://identifiers.org/dbsnp"],
    ['Chromosome', "http://purl.org/obo/owl/SO#SO_0000340"]
  ] + allele_columns.map { |name| [name, "http://purl.org/obo/owl/SO#SO_0001023"] }

  typed_columns.each do |name, type|
    idx = COLUMN_NAMES.index(name)
    entry[idx] = sio_value(type, entry[idx])
  end

  # [column, attribute type, data type]; start/end positions use faldo
  attribute_columns = [
    ["Strand", "http://edamontology.org/data_0853", nil],
    ["Center", "foaf:homepage", nil],
    ["Start_Position", "http://biohackathon.org/resource/faldo#begin", "http://biohackathon.org/resource/faldo#Position"],
    ["End_Position", "http://biohackathon.org/resource/faldo#end", "http://biohackathon.org/resource/faldo#Position"]
  ]

  attribute_columns.each do |name, attribute_type, data_type|
    idx = COLUMN_NAMES.index(name)
    entry[idx] =
      if data_type
        sio_attribute(attribute_type, entry[idx], data_type)
      else
        sio_attribute(attribute_type, entry[idx])
      end
  end

  entry
end
|
|
145
|
+
|
|
146
|
+
# Rewrites one named column of a row.
#
# With +value+, the column becomes prefix + value; without it, +prefix+ is
# appended to the column's current content.
#
# @param entry [Array] row values in COLUMN_NAMES order; modified in place
# @param column [String] a name from COLUMN_NAMES
# @param prefix [String] text to combine with the value
# @param value [String, nil] replacement value
# @return the new content of the column
def column_replace(entry,column,prefix,value=nil)
  idx = COLUMN_NAMES.index(column)
  entry[idx] = value ? prefix + value : entry[idx] + prefix
end
|
|
153
|
+
|
|
154
|
+
# Looks up the approved HGNC symbol for a gene symbol or synonym by querying
# the SPARQL endpoint at http://cu.hgnc.bio2rdf.org/sparql.
#
# The symbol is interpolated into the query text as-is.
#
# @param hugo_symbol [String] an approved symbol or a synonym
# @return [String] the first approved symbol returned, or "" when the query
#   yields no solutions
def official_symbol(hugo_symbol)
  qry = <<-EOF

    SELECT distinct ?official where {
      {?hgnc <http://bio2rdf.org/hgnc_vocabulary:approved_symbol> "#{hugo_symbol}"}
      UNION
      {?hgnc <http://bio2rdf.org/hgnc_vocabulary:synonym> "#{hugo_symbol}"}

      ?hgnc <http://bio2rdf.org/hgnc_vocabulary:approved_symbol> ?official
    }

  EOF

  endpoint = SPARQL::Client.new("http://cu.hgnc.bio2rdf.org/sparql")
  solutions = endpoint.query(qry)
  solutions.map { |solution| solution.official }.first.to_s
end
|
|
170
|
+
|
|
171
|
+
# Splits a tumor sample barcode into its patient and sample parts by fixed
# character positions, e.g. "TCGA-E9-A22B-01A-11D-A159-09" becomes
# ["E9-A22B", "01A-11D-A159-09"].
#
# @param code [String, nil] the barcode; nil (a missing barcode column)
#   yields two nils instead of raising
# @return [Array] [characters 5..11, characters 13..end]
def parse_barcode(code)
  return [nil, nil] unless code

  #TCGA-E9-A22B-01A-11D-A159-09
  [code[5..11], code[13..-1]]
end
|
|
175
|
+
|
|
176
|
+
# Assembles the structural part of the output for the current dataset: the
# result of +prefixes+, followed by the data structure definition, the
# dataset declaration, and every entry returned by the component, measure,
# dimension, code-list and concept-code generators, in that order.
#
# @param options [Hash] forwarded to every generator
# @return the object returned by +prefixes+, with all parts appended
def structure(options={})
  str = prefixes(@dataset_name,options)
  append_each = ->(parts) { parts.each { |part| str << part } }

  str << data_structure_definition(@measures,@dimensions,@codes,@dataset_name,options)
  str << dataset(@dataset_name,options)

  append_each.call(component_specifications(@measures, @dimensions, @codes, @dataset_name, options))
  append_each.call(measure_properties(@measures,@dataset_name,options))
  append_each.call(dimension_properties(@dimensions,@codes, @dataset_name,options))
  append_each.call(code_lists(@codes,TCGA_CODES,@dataset_name,options))
  append_each.call(concept_codes(@codes,TCGA_CODES,@dataset_name,options))

  str
end
|
|
188
|
+
|
|
189
|
+
# Rewrites every HGNC symbol URI in +file+ so that it uses the symbol
# returned by #official_symbol, via PubliSci::PostProcessor.process.
#
# Lookups are memoised in a class-level cache, so each distinct symbol is
# resolved once. The block previously read an undefined local +cache+ and
# raised NameError on the first match; it now reads the memoised value.
#
# NOTE(review): #official_symbol returns "" when nothing is found, which
# would yield a URI with an empty symbol -- confirm the desired fallback.
#
# @param file passed to PostProcessor.process as both of its first two
#   arguments
# @return whatever PubliSci::PostProcessor.process returns
def post_process(file)
  reg = %r{http://identifiers.org/hgnc.symbol/(\w+)}
  @@hugo_cache ||= {}
  PubliSci::PostProcessor.process(file,file,reg){|g|
    @@hugo_cache[g] ||= official_symbol(g)
    'http://identifiers.org/hgnc.symbol/' + @@hugo_cache[g]
  }
end
|
|
197
|
+
end
|
|
198
|
+
end
|
|
199
|
+
end
|