publisci 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (220) hide show
  1. checksums.yaml +7 -0
  2. data/.document +5 -0
  3. data/.rspec +1 -0
  4. data/.travis.yml +13 -0
  5. data/Gemfile +36 -0
  6. data/LICENSE.txt +20 -0
  7. data/README.md +51 -0
  8. data/README.rdoc +48 -0
  9. data/Rakefile +68 -0
  10. data/bin/bio-publisci +106 -0
  11. data/bin/bio-publisci-server +50 -0
  12. data/examples/bio-band_integration.rb +9 -0
  13. data/examples/no_magic.prov +58 -0
  14. data/examples/no_magic.rb +58 -0
  15. data/examples/orm.prov +48 -0
  16. data/examples/primer-full.prov +120 -0
  17. data/examples/primer.prov +66 -0
  18. data/examples/prov_dsl.prov +85 -0
  19. data/examples/safe_gen.rb +7 -0
  20. data/examples/visualization/primer.prov +66 -0
  21. data/examples/visualization/prov_viz.rb +140 -0
  22. data/examples/visualization/viz.rb +35 -0
  23. data/features/create_generator.feature +21 -0
  24. data/features/integration.feature +12 -0
  25. data/features/integration_steps.rb +10 -0
  26. data/features/metadata.feature +37 -0
  27. data/features/metadata_steps.rb +40 -0
  28. data/features/orm.feature +60 -0
  29. data/features/orm_steps.rb +74 -0
  30. data/features/prov_dsl.feature +14 -0
  31. data/features/prov_dsl_steps.rb +11 -0
  32. data/features/reader.feature +25 -0
  33. data/features/reader_steps.rb +61 -0
  34. data/features/step_definitions/bio-publisci_steps.rb +0 -0
  35. data/features/store.feature +27 -0
  36. data/features/store_steps.rb +42 -0
  37. data/features/support/env.rb +13 -0
  38. data/features/writer.feature +14 -0
  39. data/features/writer_steps.rb +24 -0
  40. data/lib/bio-publisci.rb +64 -0
  41. data/lib/bio-publisci/analyzer.rb +57 -0
  42. data/lib/bio-publisci/datacube_model.rb +111 -0
  43. data/lib/bio-publisci/dataset/ORM/data_cube_orm.rb +240 -0
  44. data/lib/bio-publisci/dataset/ORM/observation.rb +20 -0
  45. data/lib/bio-publisci/dataset/configuration.rb +31 -0
  46. data/lib/bio-publisci/dataset/data_cube.rb +418 -0
  47. data/lib/bio-publisci/dataset/dataset.rb +11 -0
  48. data/lib/bio-publisci/dataset/dataset_for.rb +186 -0
  49. data/lib/bio-publisci/dataset/interactive.rb +72 -0
  50. data/lib/bio-publisci/dsl/config.rb +34 -0
  51. data/lib/bio-publisci/dsl/dataset_dsl.rb +93 -0
  52. data/lib/bio-publisci/dsl/dsl.rb +72 -0
  53. data/lib/bio-publisci/dsl/metadata_dsl.rb +85 -0
  54. data/lib/bio-publisci/dsl/prov_dsl.rb +143 -0
  55. data/lib/bio-publisci/metadata/generator.rb +323 -0
  56. data/lib/bio-publisci/metadata/metadata.rb +5 -0
  57. data/lib/bio-publisci/metadata/metadata_model.rb +25 -0
  58. data/lib/bio-publisci/metadata/prov/activity.rb +88 -0
  59. data/lib/bio-publisci/metadata/prov/agent.rb +100 -0
  60. data/lib/bio-publisci/metadata/prov/association.rb +107 -0
  61. data/lib/bio-publisci/metadata/prov/config.rb +34 -0
  62. data/lib/bio-publisci/metadata/prov/derivation.rb +60 -0
  63. data/lib/bio-publisci/metadata/prov/element.rb +120 -0
  64. data/lib/bio-publisci/metadata/prov/entity.rb +64 -0
  65. data/lib/bio-publisci/metadata/prov/model/prov_models.rb +109 -0
  66. data/lib/bio-publisci/metadata/prov/plan.rb +32 -0
  67. data/lib/bio-publisci/metadata/prov/prov.rb +78 -0
  68. data/lib/bio-publisci/metadata/prov/role.rb +40 -0
  69. data/lib/bio-publisci/metadata/prov/usage.rb +64 -0
  70. data/lib/bio-publisci/metadata/publisher.rb +25 -0
  71. data/lib/bio-publisci/mixins/custom_predicate.rb +38 -0
  72. data/lib/bio-publisci/mixins/dereferencable.rb +34 -0
  73. data/lib/bio-publisci/mixins/registry.rb +27 -0
  74. data/lib/bio-publisci/mixins/vocabulary.rb +8 -0
  75. data/lib/bio-publisci/output.rb +27 -0
  76. data/lib/bio-publisci/parser.rb +266 -0
  77. data/lib/bio-publisci/post_processor.rb +95 -0
  78. data/lib/bio-publisci/query/query_helper.rb +123 -0
  79. data/lib/bio-publisci/r_client.rb +54 -0
  80. data/lib/bio-publisci/readers/arff.rb +49 -0
  81. data/lib/bio-publisci/readers/base.rb +57 -0
  82. data/lib/bio-publisci/readers/csv.rb +88 -0
  83. data/lib/bio-publisci/readers/dataframe.rb +67 -0
  84. data/lib/bio-publisci/readers/maf.rb +199 -0
  85. data/lib/bio-publisci/readers/r_cross.rb +112 -0
  86. data/lib/bio-publisci/readers/r_matrix.rb +176 -0
  87. data/lib/bio-publisci/store.rb +56 -0
  88. data/lib/bio-publisci/writers/arff.rb +91 -0
  89. data/lib/bio-publisci/writers/base.rb +93 -0
  90. data/lib/bio-publisci/writers/csv.rb +31 -0
  91. data/lib/bio-publisci/writers/dataframe.rb +81 -0
  92. data/lib/bio-publisci/writers/json.rb +18 -0
  93. data/lib/r2rdf.rb +226 -0
  94. data/lib/template_bak.rb +12 -0
  95. data/lib/template_bak/publisci.rb +3 -0
  96. data/lib/vocabs/cc.rb +18 -0
  97. data/lib/vocabs/cert.rb +13 -0
  98. data/lib/vocabs/dc.rb +63 -0
  99. data/lib/vocabs/dc11.rb +23 -0
  100. data/lib/vocabs/doap.rb +45 -0
  101. data/lib/vocabs/exif.rb +168 -0
  102. data/lib/vocabs/foaf.rb +69 -0
  103. data/lib/vocabs/geo.rb +13 -0
  104. data/lib/vocabs/http.rb +26 -0
  105. data/lib/vocabs/ma.rb +78 -0
  106. data/lib/vocabs/owl.rb +59 -0
  107. data/lib/vocabs/rdfs.rb +17 -0
  108. data/lib/vocabs/rsa.rb +12 -0
  109. data/lib/vocabs/rss.rb +14 -0
  110. data/lib/vocabs/sioc.rb +93 -0
  111. data/lib/vocabs/skos.rb +36 -0
  112. data/lib/vocabs/wot.rb +21 -0
  113. data/lib/vocabs/xhtml.rb +9 -0
  114. data/lib/vocabs/xsd.rb +58 -0
  115. data/resources/maf_example.maf +10 -0
  116. data/resources/maf_rdf.ttl +1173 -0
  117. data/resources/primer.ttl +38 -0
  118. data/resources/queries/code_resources.rq +10 -0
  119. data/resources/queries/codes.rq +18 -0
  120. data/resources/queries/dataset.rq +7 -0
  121. data/resources/queries/dimension_ranges.rq +8 -0
  122. data/resources/queries/dimensions.rq +12 -0
  123. data/resources/queries/gene.rq +16 -0
  124. data/resources/queries/hugo_to_ensembl.rq +7 -0
  125. data/resources/queries/maf_column.rq +26 -0
  126. data/resources/queries/measures.rq +12 -0
  127. data/resources/queries/observation_labels.rq +8 -0
  128. data/resources/queries/observations.rq +13 -0
  129. data/resources/queries/patient.rq +11 -0
  130. data/resources/queries/patient_list.rq +11 -0
  131. data/resources/queries/patients_with_mutation.rq +18 -0
  132. data/resources/queries/properties.rq +8 -0
  133. data/resources/queries/test.rq +3 -0
  134. data/resources/weather.numeric.arff +28 -0
  135. data/scripts/get_gene_lengths.rb +50 -0
  136. data/scripts/islet_mlratio.rb +6 -0
  137. data/scripts/scan_islet.rb +6 -0
  138. data/scripts/update_reference.rb +25 -0
  139. data/server/helpers.rb +215 -0
  140. data/server/public/src-min-noconflict/LICENSE +24 -0
  141. data/server/public/src-min-noconflict/ace.js +11 -0
  142. data/server/public/src-min-noconflict/ext-chromevox.js +1 -0
  143. data/server/public/src-min-noconflict/ext-elastic_tabstops_lite.js +1 -0
  144. data/server/public/src-min-noconflict/ext-emmet.js +1 -0
  145. data/server/public/src-min-noconflict/ext-keybinding_menu.js +1 -0
  146. data/server/public/src-min-noconflict/ext-language_tools.js +1 -0
  147. data/server/public/src-min-noconflict/ext-modelist.js +1 -0
  148. data/server/public/src-min-noconflict/ext-old_ie.js +1 -0
  149. data/server/public/src-min-noconflict/ext-searchbox.js +1 -0
  150. data/server/public/src-min-noconflict/ext-settings_menu.js +1 -0
  151. data/server/public/src-min-noconflict/ext-spellcheck.js +1 -0
  152. data/server/public/src-min-noconflict/ext-split.js +1 -0
  153. data/server/public/src-min-noconflict/ext-static_highlight.js +1 -0
  154. data/server/public/src-min-noconflict/ext-statusbar.js +1 -0
  155. data/server/public/src-min-noconflict/ext-textarea.js +1 -0
  156. data/server/public/src-min-noconflict/ext-themelist.js +1 -0
  157. data/server/public/src-min-noconflict/ext-whitespace.js +1 -0
  158. data/server/public/src-min-noconflict/keybinding-emacs.js +1 -0
  159. data/server/public/src-min-noconflict/keybinding-vim.js +1 -0
  160. data/server/public/src-min-noconflict/mode-ruby.js +1 -0
  161. data/server/public/src-min-noconflict/snippets/ruby.js +1 -0
  162. data/server/public/src-min-noconflict/theme-twilight.js +1 -0
  163. data/server/public/src-min-noconflict/worker-coffee.js +1 -0
  164. data/server/public/src-min-noconflict/worker-css.js +1 -0
  165. data/server/public/src-min-noconflict/worker-javascript.js +1 -0
  166. data/server/public/src-min-noconflict/worker-json.js +1 -0
  167. data/server/public/src-min-noconflict/worker-lua.js +1 -0
  168. data/server/public/src-min-noconflict/worker-php.js +1 -0
  169. data/server/public/src-min-noconflict/worker-xquery.js +1 -0
  170. data/server/routes.rb +123 -0
  171. data/server/views/dsl.haml +65 -0
  172. data/server/views/dump.haml +3 -0
  173. data/server/views/import.haml +35 -0
  174. data/server/views/new_repository.haml +25 -0
  175. data/server/views/query.haml +28 -0
  176. data/server/views/repository.haml +25 -0
  177. data/spec/ORM/data_cube_orm_spec.rb +33 -0
  178. data/spec/ORM/prov_model_spec.rb +72 -0
  179. data/spec/analyzer_spec.rb +36 -0
  180. data/spec/bnode_spec.rb +66 -0
  181. data/spec/csv/bacon.csv +4 -0
  182. data/spec/csv/moar_bacon.csv +11 -0
  183. data/spec/data_cube_spec.rb +169 -0
  184. data/spec/dataset_for_spec.rb +77 -0
  185. data/spec/dsl_spec.rb +134 -0
  186. data/spec/generators/csv_spec.rb +44 -0
  187. data/spec/generators/dataframe_spec.rb +44 -0
  188. data/spec/generators/maf_spec.rb +40 -0
  189. data/spec/generators/r_cross_spec.rb +51 -0
  190. data/spec/generators/r_matrix_spec.rb +44 -0
  191. data/spec/length_lookup_spec.rb +0 -0
  192. data/spec/maf_query_spec.rb +343 -0
  193. data/spec/metadata/metadata_dsl_spec.rb +68 -0
  194. data/spec/prov/activity_spec.rb +74 -0
  195. data/spec/prov/agent_spec.rb +54 -0
  196. data/spec/prov/association_spec.rb +55 -0
  197. data/spec/prov/config_spec.rb +28 -0
  198. data/spec/prov/derivation_spec.rb +30 -0
  199. data/spec/prov/entity_spec.rb +52 -0
  200. data/spec/prov/role_spec.rb +94 -0
  201. data/spec/prov/usage_spec.rb +98 -0
  202. data/spec/queries/integrity/1.rq +21 -0
  203. data/spec/queries/integrity/11.rq +29 -0
  204. data/spec/queries/integrity/12.rq +37 -0
  205. data/spec/queries/integrity/14.rq +25 -0
  206. data/spec/queries/integrity/19_1.rq +21 -0
  207. data/spec/queries/integrity/19_2.rq +15 -0
  208. data/spec/queries/integrity/2.rq +22 -0
  209. data/spec/queries/integrity/3.rq +19 -0
  210. data/spec/queries/integrity/4.rq +13 -0
  211. data/spec/queries/integrity/5.rq +14 -0
  212. data/spec/r_builder_spec.rb +33 -0
  213. data/spec/resource/.RData +0 -0
  214. data/spec/resource/example.Rhistory +3 -0
  215. data/spec/spec_helper.rb +17 -0
  216. data/spec/turtle/bacon +147 -0
  217. data/spec/turtle/reference +2064 -0
  218. data/spec/turtle/weather +275 -0
  219. data/spec/writer_spec.rb +75 -0
  220. metadata +589 -0
@@ -0,0 +1,123 @@
1
module RDF
  class Query
    class Solutions
      # Flattens the solution set into plain Ruby data.
      #
      # Each solution is iterated as a sequence of two-element entries; the
      # first element becomes the key and the second the value.
      #
      # @return [Array<Hash>] one Hash per solution, in iteration order
      def to_h
        map do |solution|
          solution.each_with_object({}) do |pair, row|
            row[pair[0]] = pair[1]
          end
        end
      end
    end
  end
end
18
+
19
+ module PubliSci
20
+ #.gsub(/^\s+/,'')
21
+ module Query
22
# Namespace vocabularies used by the query helpers.
#
# @return [Hash{Symbol => RDF::Vocabulary}] a freshly built Hash on every call
def vocabulary
  {
    # Bare IRI: angle brackets are SPARQL/Turtle delimiters and must not be
    # part of the namespace string itself.
    base: RDF::Vocabulary.new('http://www.rqtl.org/ns/#'),
    qb:   RDF::Vocabulary.new("http://purl.org/linked-data/cube#"),
    rdf:  RDF::Vocabulary.new('http://www.w3.org/1999/02/22-rdf-syntax-ns#'),
    rdfs: RDF::Vocabulary.new('http://www.w3.org/2000/01/rdf-schema#'),
    prop: RDF::Vocabulary.new('http://www.rqtl.org/dc/properties/'),
    # NOTE(review): no trailing slash here, whereas the commented-out
    # "PREFIX cs:" declaration in this file ends in "/cs/" - confirm which is intended.
    cs:   RDF::Vocabulary.new('http://www.rqtl.org/dc/cs')
  }
end
32
+
33
+
34
+
35
+ # def execute_internal(query,repo)
36
+ # SPARQL.execute(query,repo)
37
+ # end
38
+
39
# Runs a SPARQL query string against a store.
#
# @param string [String] the SPARQL query
# @param store  a PubliSci::Store / RDF::FourStore (queried at "<url>/sparql/"),
#   an RDF::Graph / RDF::Repository (queried directly), or a base URL String
# @param type [Symbol] :fourstore (URL String store) or :graph (query +store+ directly)
# @return whatever SPARQL::Client#query returns
# @raise [ArgumentError] when the store/type combination is not supported
def execute(string, store, type=:fourstore)
  sparql =
    if store.is_a?(PubliSci::Store) || store.is_a?(RDF::FourStore)
      SPARQL::Client.new(store.url+"/sparql/")
    elsif type == :graph || store.is_a?(RDF::Graph) || store.is_a?(RDF::Repository)
      SPARQL::Client.new(store)
    elsif type == :fourstore
      SPARQL::Client.new(store+"/sparql/")
    else
      # Previously fell through with a nil client and failed with NoMethodError.
      raise ArgumentError, "unsupported store #{store.inspect} for type #{type.inspect}"
    end
  sparql.query(string)
end
49
+
50
# Loads a SPARQL query from disk, applies textual substitutions and runs it.
#
# The query is looked up as +file+ itself, then inside the bundled queries
# directory, then inside that directory with a ".rq" suffix.
#
# @param file [String] path or bundled query name
# @param store passed through to #execute
# @param type [Symbol] passed through to #execute
# @param substitutions [Hash] pattern => replacement pairs applied with String#gsub
# @return the result of #execute
# @raise [RuntimeError] when no matching query file exists
def execute_from_file(file, store, type=:fourstore, substitutions={})
  gem_present = Gem::Dependency.new('bio-publisci').matching_specs.size > 0
  queries_dir =
    if gem_present
      Gem::Specification.find_by_name("bio-publisci").gem_dir + "/resources/queries/"
    else
      File.dirname(__FILE__) + '/../../../resources/queries/'
    end

  path =
    if File.exist?(file) then file
    elsif File.exist?(bundled = queries_dir + file) then bundled
    elsif File.exist?(suffixed = queries_dir + file + '.rq') then suffixed
    end
  raise "couldn't find query for #{file}" unless path

  string = IO.read(path)
  substitutions.each { |pattern, replacement| string = string.gsub(pattern, replacement) }
  execute(string, store, type)
end
71
+
72
+ # def prefixes
73
+ # <<-EOF
74
+ # PREFIX ns: <http://www.rqtl.org/ns/#>
75
+ # PREFIX qb: <http://purl.org/linked-data/cube#>
76
+ # PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
77
+ # PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
78
+ # PREFIX prop: <http://www.rqtl.org/dc/properties/>
79
+ # PREFIX cs: <http://www.rqtl.org/dc/cs/>
80
+ # PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
81
+
82
+ # EOF
83
+ # end
84
+
85
# Builds a SPARQL SELECT returning every value of one property across the
# observations of a dataset.
#
# NOTE(review): relies on a zero-argument +prefixes+ method; the one in this
# module is commented out, so it must come from elsewhere - confirm.
#
# @param var the dataset name fragment (interpolated after "ns:dataset-")
# @param property the property local name (interpolated after "prop:")
# @return [String] the string returned by +prefixes+ with the query appended to it
def property_values(var, property)
  query = prefixes
  select = <<-EOS
SELECT ?val WHERE {
?obs qb:dataSet ns:dataset-#{var} ;
prop:#{property} ?val ;
}
  EOS
  query << select
  query
end
95
+
96
# Builds a SPARQL SELECT returning the skos:prefLabel of every row referenced
# (via prop:refRow) by the observations of a dataset.
#
# NOTE(review): relies on a zero-argument +prefixes+ method; the one in this
# module is commented out, so it must come from elsewhere - confirm.
#
# @param var the dataset name fragment (interpolated after "ns:dataset-")
# @return [String] the string returned by +prefixes+ with the query appended to it
def row_names(var)
  select = <<-EOS
SELECT ?label WHERE {
?obs qb:dataSet ns:dataset-#{var} ;
prop:refRow ?row .
?row skos:prefLabel ?label .
}
  EOS
  prefixes << select
end
106
+
107
# Builds a SPARQL SELECT returning the distinct rdfs:label of every component
# attached to a dataset's data structure definition.
#
# Currently will say "___ Component", needs further parsing
#
# NOTE(review): relies on a zero-argument +prefixes+ method; the one in this
# module is commented out, so it must come from elsewhere - confirm.
#
# @param var the dataset name fragment (interpolated after "ns:dsd-")
# @return [String] the string returned by +prefixes+ with the query appended to it
def property_names(var)
  select = <<-EOS
SELECT DISTINCT ?label WHERE {
ns:dsd-#{var} qb:component ?c .
?c rdfs:label ?label
}
  EOS
  prefixes << select
end
117
+
118
+ end
119
+
120
# Exposes every PubliSci::Query method as a class-level method, e.g.
# QueryHelper.execute(query, store), so no object needs to be instantiated.
class QueryHelper
  extend PubliSci::Query
end
123
+ end
@@ -0,0 +1,54 @@
1
module PubliSci
  # Thin helpers around an Rserve connection.
  module Rconnect

    # Opens an Rserve connection.
    #
    # @param address passed to Rserve::Connection.new when given; otherwise
    #   the connection is created with its defaults
    # @return [Rserve::Connection]
    def connect(address=nil)
      if address
        Rserve::Connection.new(address)
      else
        Rserve::Connection.new
      end
    end

    # Loads an R workspace file into the session behind +connection+.
    #
    # @param connection an object responding to #eval with R source
    # @param loc [String] directory containing the workspace
    # @param file [String] workspace file name
    # @return the result of evaluating R's load()
    def load_workspace(connection,loc=Dir.home,file=".RData")
      path = File.join(loc,file)
      # Escape backslashes and double quotes so the path survives being
      # embedded in an R string literal (e.g. Windows-style paths).
      escaped = path.gsub(/["\\]/) { |char| "\\#{char}" }
      connection.eval "load(\"#{escaped}\")"
    end

    # Evaluates an arbitrary R instruction and returns the result.
    def get(connection, instruction)
      connection.eval instruction
    end

    # Lists the variables of the R session by evaluating ls().
    def get_vars(connection)
      connection.eval("ls()")
    end

  end

  # Stateful wrapper holding one R connection and a workspace location.
  class Client
    include PubliSci::Rconnect
    # Reader for the underlying connection; call it with an explicit receiver
    # (client.R) because a bare +R+ is parsed as a constant.
    attr :R

    # @param auto [Boolean] when true, load the workspace and print its variables
    # @param loc [String] directory holding the ".RData" workspace
    def initialize(auto=true, loc=Dir.home)
      @R = connect
      @loc = loc
      if auto
        load_ws
        puts "vars: #{vars.payload}"
      end
    end

    # Loads the workspace found in the configured location.
    def load_ws
      load_workspace(@R, @loc)
    end

    # Evaluates +var+ (any R expression) in the session.
    def get_var(var)
      get(@R,var)
    end

    # @return [String] path of the workspace file for the configured location
    def get_ws
      "#{@loc}/.RData"
    end

    # Lists the variables currently defined in the R session.
    def vars
      get_vars(@R)
    end
  end
end
@@ -0,0 +1,49 @@
1
+ module PubliSci
2
+ module Readers
3
+ class ARFF
4
+ include PubliSci::Dataset::DataCube
5
+
6
# Converts ARFF input into the output of the mixed-in +generate+ method.
#
# Attributes that declare a code list are passed as both dimensions and
# codes; every other attribute is passed as a measure. Observations are
# labelled 1..n and the dataset is named after the @relation header.
#
# @param arff [String] ARFF text, or the path of an existing file to read
# @param options [Hash] forwarded to +generate+; :no_labels is always set to
#   true on the passed Hash
# @return the value returned by +generate+
def generate_n3(arff, options={})
  arff = IO.read(arff) if File.exist?(arff)
  options[:no_labels] = true # unless options[:no_labels] == nil
  @options = options

  comps = components(arff)
  obs = data(arff, comps.keys)
  coded, uncoded = comps.keys.partition { |name| comps[name][:codes] }
  row_labels = (1..obs.first[1].size).to_a

  # coded.dup keeps dimensions and codes as two distinct Array objects.
  generate(uncoded, coded, coded.dup, obs, row_labels, relation(arff), options)
end
14
+
15
# Extracts the relation (dataset) name from ARFF text.
#
# Returns the last whitespace-separated token of the first "@relation" line,
# so quoted names containing whitespace are not supported.
#
# @param arff [String] ARFF text
# @return [String] the relation name
# @raise [ArgumentError] when the text has no @relation declaration
def relation(arff)
  header = arff[/@relation.+/i]
  raise ArgumentError, "ARFF input has no @relation declaration" unless header
  header.split.last
end
18
+
19
# Parses the @ATTRIBUTE declarations of ARFF text.
#
# Nominal attributes ("name {a, b, c}") map to {type: :coded, codes: [...]};
# all others map to {type: "<declared type>"}.
#
# still needs support for quoted strings with whitespace
#
# @param arff [String] ARFF text
# @return [Hash{String => Hash}] attribute name => description, in file order
def components(arff)
  arff.split("\n").grep(/^@ATTRIBUTE/i).each_with_object({}) do |line, comps|
    nominal = line[/\{.*}/]
    if nominal
      name = line[/\s.*/].strip.split.first
      # Strip each code: "{sunny, overcast}" must yield "overcast", not " overcast".
      codes = nominal[1..-2].split(',').map(&:strip)
      comps[name] = {type: :coded, codes: codes}
    else
      _keyword, name, type = line.split
      comps[name] = {type: type}
    end
  end
end
36
+
37
# Reads the @DATA section of ARFF text into per-attribute columns.
#
# Blank lines and "%" comment lines inside the data section are skipped, and
# surrounding whitespace is removed from each value. Values are split on
# commas only (quoted values containing commas are not supported).
#
# @param arff [String] ARFF text
# @param attributes [Array<String>] attribute names in column order
# @return [Hash{String => Array}] attribute name => values, one per data row
# @raise [ArgumentError] when the text has no @data section
def data(arff, attributes)
  lines = arff.split("\n")
  marker = lines.index { |line| line =~ /^@DATA/i }
  raise ArgumentError, "ARFF input has no @data section" unless marker

  columns = attributes.each_with_object({}) { |attrib, acc| acc[attrib] = [] }
  lines[(marker + 1)..-1].each do |line|
    row = line.strip
    next if row.empty? || row.start_with?('%')

    values = row.split(',').map(&:strip)
    attributes.each_with_index { |attrib, i| columns[attrib] << values[i] }
  end
  columns
end
47
+ end
48
+ end
49
+ end
@@ -0,0 +1,57 @@
1
+ module PubliSci
2
+ module Readers
3
+ class Base
4
+ include PubliSci::Query
5
+ include PubliSci::Parser
6
+ include PubliSci::Analyzer
7
+ include PubliSci::Interactive
8
+ include PubliSci::Dataset::DataCube
9
+
10
# Default non-interactive entry point; should be overridden if extra
# processing/input is required.
#
# The first argument is the input; the following arguments, minus the very
# last one, are folded pairwise into an options Hash (same as args[1..-2]).
# NOTE(review): unclear from here why the final argument is dropped - confirm.
#
# @return the value returned by #generate_n3
def automatic(*args)
  source, *pairs = args
  pairs.pop
  generate_n3(source, Hash[*pairs])
end
14
+
15
# Placeholder for the reader-specific conversion; always raises.
#
# @raise [RuntimeError] naming the receiver that lacks an implementation
def generate_n3(*_args)
  message = "#{self} does not implement a generate_n3 method!"
  raise message
end
18
+
19
# Builds the predicate/object pairs for a typed node carrying a value via
# SIO_000300.
#
# @param type the object of the "a" pair
# @param value the object of the SIO_000300 pair
# @return [Array<Array>] [["a", type], [SIO_000300 IRI, value]]
def sio_value(type,value)
  type_pair = ["a", type]
  value_pair = ["http://semanticscience.org/resource/SIO_000300", value]
  [type_pair, value_pair]
end
25
+
26
# Builds a nested structure attaching +value+ through SIO_000008 and
# SIO_000300.
#
# @param attribute_type when truthy, an ["a", attribute_type] pair is
#   wrapped around the SIO_000008 pair
# @param value the object of the innermost SIO_000300 pair
# @param data_type when truthy, an ["a", data_type] pair is wrapped around
#   the SIO_000300 pair
# @return [Array] the nested array structure
def sio_attribute(attribute_type,value,data_type=nil)
  value_node = ["http://semanticscience.org/resource/SIO_000300", value]
  value_node = [["a", data_type], value_node] if data_type

  attribute_node = ["http://semanticscience.org/resource/SIO_000008", value_node]
  attribute_type ? [["a", attribute_type], attribute_node] : attribute_node
end
47
+
48
# Returns the next value of a per-object counter: 0 on the first call, then
# 1, 2, ... on subsequent calls.
#
# @return [Integer]
def next_label
  @__current_label = @__current_label ? @__current_label + 1 : 0
end
55
+ end
56
+ end
57
+ end
@@ -0,0 +1,88 @@
1
+ module PubliSci
2
+ module Readers
3
+ class CSV < Base
4
# Gathers any missing settings (prompting when +interactive+) and converts a
# CSV file.
#
# @param file [String, nil] input path; prompted for when nil and interactive
# @param dataset_name [String, nil] defaults to the file's base name up to
#   its first "."
# @param options [Hash] may carry :dimensions / :measures; filled in from
#   prompts when interactive and missing (the passed Hash is modified)
# @param interactive [Boolean] whether to prompt for missing settings
# @return the value returned by #generate_n3
# @raise [RuntimeError] when no input file is available
def automatic(file=nil,dataset_name=nil,options={},interactive=true)
  if interactive && !file
    puts "Input file?"
    file = gets.chomp
  end

  raise "CSV reader needs an input file" unless file && file.size > 0

  unless dataset_name
    default_name = File.basename(file).split('.').first
    dataset_name =
      if interactive
        interact("Dataset name?","#{default_name}"){|_sel| default_name }
      else
        default_name
      end
  end

  # Header row of the file, offered as the choices in the prompts below.
  categories = ::CSV.read(file)[0]

  if interactive
    options[:dimensions] ||= Array(interact("Dimensions?",categories[0],categories))

    unless options[:measures]
      meas = categories - (options[:dimensions] || [categories[0]])
      selection = interact("Measures?",meas,meas){|_s| nil}
      options[:measures] = Array(selection) unless selection.nil?
    end
  end

  generate_n3(file,dataset_name,options)
end
40
+
41
# Reads a CSV file and hands its measures, dimensions, codes and
# observations to the mixed-in +generate+ method.
#
# @param file [String] path of the CSV file (first row is the header)
# @param dataset_name name passed through to +generate+
# @param options [Hash] stored as @options and forwarded to +generate+
# @return the value returned by +generate+
def generate_n3(file, dataset_name, options={})
  @data = ::CSV.read(file)
  @options = options
  cube_parts = [measures, dimensions, codes, observation_data, observation_labels]
  generate(*cube_parts, dataset_name, options)
end
46
+
47
# @return [Array] the :dimensions option when set, otherwise a one-element
#   Array holding the first header cell
def dimensions
  chosen = @options[:dimensions]
  return chosen if chosen

  [@data[0][0]]
end
50
+
51
# @return [Array] the :codes option when set, otherwise the dimensions
def codes
  explicit = @options[:codes]
  explicit ? explicit : dimensions
end
54
+
55
# @return [Array] the :measures option when set, otherwise every header
#   cell that is not a dimension
def measures
  explicit = @options[:measures]
  return explicit if explicit

  @data[0] - dimensions
end
58
+
59
# Labels for the data rows (every row after the header).
#
# @return [Array] the cell at index @options[:label_column] of each data row
#   when that option is set, otherwise the integers 1..(number of data rows)
def observation_labels
  label_column = @options[:label_column]
  return (1..@data.size - 1).to_a unless label_column

  @data.drop(1).map { |row| row[label_column] }
end
70
+
71
# Pivots the parsed CSV rows into columns keyed by header cell.
#
# @return [Hash{Object => Array}] header cell => that column's values, one
#   per data row, in row order
def observation_data
  header = @data[0]
  columns = header.each_with_object({}) { |label, acc| acc[label] = [] }

  @data.drop(1).each do |row|
    row.each_with_index { |entry, i| columns[header[i]] << entry }
  end
  columns
end
86
+ end
87
+ end
88
+ end
@@ -0,0 +1,67 @@
1
+ module PubliSci
2
+ module Readers
3
+ class Dataframe
4
+ include PubliSci::Dataset::DataCube
5
+ include PubliSci::Readers::Output
6
+
7
+ # def initialize(var)
8
+ # @var = var
9
+ # end
10
+
11
# Converts an R data frame expression and passes the generated result to
# +output+.
#
# @param rexp the R expression object, stored as @rexp for the helper methods
# @param var name passed through to +generate+
# @param options [Hash] :type defaults to :string (set on the passed Hash)
# @return the value returned by +output+
def generate_n3(rexp, var, options={})
  @rexp = rexp
  options[:type] = :string unless options[:type]
  @options = options
  cube = generate(measures, dimensions, codes, observation_data, observation_labels, var, options)
  output(cube, options)
end
17
+
18
# @return [Array] the :dimensions option when set; otherwise a one-element
#   Array holding the :row_label option, or "refRow" when that is unset too
def dimensions
  @options[:dimensions] || [@options[:row_label] || "refRow"]
end
27
+
28
# @return [Array] the :codes option when set; otherwise a one-element Array
#   holding the :row_label option, or "refRow" when that is unset too
def codes
  @options[:codes] || [@options[:row_label] || "refRow"]
end
37
+
38
# @return [Array] the :measures option, or the payload's column names when
#   it is unset, with any explicitly configured :dimensions removed
def measures
  dims = @options[:dimensions]
  candidates = @options[:measures] || @rexp.payload.names
  dims ? candidates - dims : candidates
end
49
+
50
# Row labels for the data frame.
#
# @return [Array] the "row.names" attribute converted with to_ruby; when its
#   first element is falsy, the integers 1..(length of the first column)
def observation_labels
  labels = @rexp.attr.payload["row.names"].to_ruby
  return labels if labels.first

  (1..@rexp.payload.first.to_ruby.size).to_a
end
55
+
56
# Collects every payload column (converted with to_ruby) plus a row-label
# column stored under the :row_label option, or "refRow" when unset.
#
# @return [Hash] column name => values
def observation_data
  payload = @rexp.payload
  columns = payload.names.each_with_object({}) do |name, acc|
    acc[name] = payload[name].to_ruby
  end
  columns[@options[:row_label] || "refRow"] = observation_labels
  columns
end
65
+ end
66
+ end
67
+ end
@@ -0,0 +1,199 @@
1
+ module PubliSci
2
+ module Readers
3
+ class MAF < Base
4
# Column order used for every parsed entry: the MAF file columns followed by
# the two derived columns patient_id and sample_id (filled from the tumor
# sample barcode).
COLUMN_NAMES = %w{ Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_Position End_Position Strand Variant_Classification Variant_Type Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 dbSNP_RS dbSNP_Val_Status Tumor_Sample_Barcode Matched_Norm_Sample_Barcode Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2 Tumor_Validation_Allele1 Tumor_Validation_Allele2 Match_Norm_Validation_Allele1 Match_Norm_Validation_Allele2 Verification_Status Validation_Status Mutation_Status Sequencing_Phase Sequence_Source Validation_Method Score BAM_File Sequencer Tumor_Sample_UUID Matched_Norm_Sample_UUID patient_id sample_id}

# Default :ranges option. Keys must be spelled exactly as in COLUMN_NAMES
# ("Start_Position", capital P) or they can never match a column.
# NOTE(review): "Chromosome" is declared xsd:int although values such as X or
# Y are not integers - confirm.
COMPONENT_RANGES = { "Tumor_Sample_Barcode" => "xsd:string", "Start_Position" => "xsd:int", "Center" => "xsd:string", "NCBI_Build" => "xsd:int", "Chromosome" => "xsd:int" }

# Allowed codes for each coded column.
TCGA_CODES =
{
  # "IGR" (intergenic region); the former "IGR1" looked like a footnote
  # marker fused into the code - TODO confirm against the MAF specification.
  "Variant_Classification" => %w{Frame_Shift_Del Frame_Shift_Ins In_Frame_Del In_Frame_Ins Missense_Mutation Nonsense_Mutation Silent Splice_Site Translation_Start_Site Nonstop_Mutation 3'UTR 3'Flank 5'UTR 5'Flank IGR Intron RNA Targeted_Region},
  "Variant_Type" => %w{SNP DNP TNP ONP INS DEL Consolidated},
  "dbSNP_Val_Status" => %w{by1000genomes by2Hit2Allele byCluster byFrequency byHapMap byOtherPop bySubmitter alternate_allele},
  # No comma inside %w{}: it would become part of the code ("Verified,").
  "Verification_Status" => %w{Verified Unknown},
  "Validation_Status" => %w{Untested Inconclusive Valid Invalid},
  # Explicit Array because "Post-transcriptional modification" contains a space.
  "Mutation_Status" => ["None", "Germline", "Somatic", "LOH", "Post-transcriptional modification", "Unknown"],
  "Sequence_Source" => %w{WGS WGA WXS RNA-Seq miRNA-Seq Bisulfite-Seq VALIDATION Other ncRNA-Seq WCS CLONE POOLCLONE AMPLICON CLONEEND FINISHING ChIP-Seq MNase-Seq DNase-Hypersensitivity EST FL-cDNA CTS MRE-Seq MeDIP-Seq MBD-Seq Tn-Seq FAIRE-seq SELEX RIP-Seq ChIA-PET},
  "Sequencer" => ["Illumina GAIIx", "Illumina HiSeq", "SOLID", "454", "ABI 3730xl", "Ion Torrent PGM", "Ion Torrent Proton", "PacBio RS", "Illumina MiSeq", "Illumina HiSeq 2500", "454 GS FLX Titanium", "AB SOLiD 4 System" ]
}
19
+
20
# Converts a MAF file into turtle text, streaming it line by line.
#
# @param input_file [String] path of the MAF file
# @param options [Hash] recognised keys (defaults are written back into the
#   passed Hash):
#   :dataset_name    - dataset name; falls back to a previously set
#                      @dataset_name, then to the input file's base name
#   :output          - :print returns the text; anything else writes a file
#   :output_base     - output path without ".ttl" (defaults to the dataset name)
#   :no_labels       - defaults to true; an explicit false is respected
#   :lookup_hugo     - run #post_process on the written file
#   :complex_objects - forwarded to #process_line
#   :ranges          - defaults to COMPONENT_RANGES
# @return [String] the generated text when :output is :print; otherwise the
#   still-open, flushed output File, or the result of #post_process
def generate_n3(input_file, options={})
  output = options[:output] || :file
  output_base = options[:output_base] || nil

  @dimensions = %w{Variant_Classification Variant_Type dbSNP_Val_Status Verification_Status Validation_Status Mutation_Status Sequence_Source Sequencer}
  # @codes = %w{Variant_Classification Variant_Type}
  @codes = @dimensions
  @measures = (COLUMN_NAMES - @dimensions - @codes)
  # The :dataset_name option used to be read and then ignored.
  @dataset_name = options[:dataset_name] || @dataset_name || File.basename(input_file,'.*')
  @barcode_index = COLUMN_NAMES.index('Tumor_Sample_Barcode')

  # "||= true" would also overwrite an explicit false.
  options[:no_labels] = true if options[:no_labels].nil?
  options[:lookup_hugo] ||= false
  options[:complex_objects] ||= false
  options[:ranges] ||= COMPONENT_RANGES

  # Appends the first element of each processed line to +sink+ (a String or
  # an IO). File.foreach closes the input when done and, unlike Kernel#open,
  # never treats a path beginning with "|" as a command.
  append_observations = lambda do |sink|
    File.foreach(input_file).with_index do |line, n|
      processed = process_line(line, n.to_s, options)
      sink << processed.first if processed
    end
  end

  if output == :print
    str = structure(options)
    append_observations.call(str)
    str
  else
    # TODO - allow multi file / separate structure output for very large datasets
    file_base = output_base || @dataset_name

    out = File.open("#{file_base}.ttl",'w')
    out.write(structure(options))
    append_observations.call(out)
    # The handle is returned open, as before; flush so the file content is
    # complete for #post_process and for callers reading the file.
    out.flush
    options[:lookup_hugo] ? post_process(out) : out
  end
end
70
+
71
# Converts one line of a MAF file into observations.
#
# @param line [String] a raw, tab-separated line
# @param label [String] label for the resulting observation
# @param options [Hash] :complex_objects wraps values via #sio_values; the
#   Hash is also forwarded to +observations+
# @return [nil] for comment lines ("#"), the header line ("Hugo...") and
#   blank lines; otherwise the value returned by +observations+
def process_line(line,label,options)
  return if line[0] == "#" || line[0..3] == "Hugo"
  # A blank line has no barcode to parse; skip it instead of crashing.
  return if line.strip.empty?

  # The last two COLUMN_NAMES are derived from the barcode, not read from the file.
  file_columns = COLUMN_NAMES.length - 2
  # Keyword argument: a positional options Hash is rejected by CSV.parse on Ruby 3.
  entry = ::CSV.parse(line, col_sep: "\t").flatten[0...file_columns]
  entry.fill(nil, entry.length...file_columns)

  barcode = entry[@barcode_index]
  entry += barcode ? parse_barcode(barcode) : [nil, nil]

  entry[0] = "http://identifiers.org/hgnc.symbol/#{entry[0]}" if entry[0]

  # A 0 in the entrez-id column appears to mean null
  col = 1
  entry[col] = nil if entry[col] == '0'
  entry[col] = "http://identifiers.org/ncbigene/#{entry[col]}" if entry[col]

  # Only link non-novel dbSNP entries
  col = COLUMN_NAMES.index('dbSNP_RS')
  if entry[col] && entry[col][0..1] == "rs"
    # Drop only the leading "rs", not every occurrence of it.
    entry[col] = "http://identifiers.org/dbsnp/#{entry[col][2..-1]}"
  end

  # optionally create typed objects using sio nodes
  entry = sio_values(entry) if options[:complex_objects]

  data = {}
  COLUMN_NAMES.each_with_index { |name, i| data[name] = [entry[i]] }

  observations(@measures,@dimensions,@codes,data,[label],@dataset_name,options)
end
103
+
104
# Replaces selected cells of a parsed entry, in place, with typed structures
# built by #sio_value / #sio_attribute.
#
# @param entry [Array] one parsed row, indexed like COLUMN_NAMES
# @return [Array] the same +entry+ object
def sio_values(entry)
  # Gene symbol and entrez gene are only wrapped when present.
  entry[0] = sio_value('http://edamontology.org/data_1791',entry[0]) if entry[0]
  entry[1] = sio_value("http://identifiers.org/ncbigene",entry[1]) if entry[1]

  # Columns wrapped with sio_value (regardless of content), keyed by name.
  typed_columns = {
    'dbSNP_RS' => "http://identifiers.org/dbsnp",
    # test SIO attributes for chromosome
    'Chromosome' => "http://purl.org/obo/owl/SO#SO_0000340"
  }
  # More SIO attributes for alleles
  %w{Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2}.each do |name|
    typed_columns[name] = "http://purl.org/obo/owl/SO#SO_0001023"
  end
  typed_columns.each do |name, type|
    idx = COLUMN_NAMES.index(name)
    entry[idx] = sio_value(type, entry[idx])
  end

  # Columns wrapped with sio_attribute: [column, attribute type, data type].
  # Start/end positions use faldo.
  attribute_columns = [
    ["Strand", "http://edamontology.org/data_0853", nil],
    ["Center", "foaf:homepage", nil],
    ["Start_Position", "http://biohackathon.org/resource/faldo#begin", "http://biohackathon.org/resource/faldo#Position"],
    ["End_Position", "http://biohackathon.org/resource/faldo#end", "http://biohackathon.org/resource/faldo#Position"]
  ]
  attribute_columns.each do |name, attribute_type, data_type|
    idx = COLUMN_NAMES.index(name)
    entry[idx] = sio_attribute(attribute_type, entry[idx], data_type)
  end

  entry
end
145
+
146
# Rewrites one named column of +entry+ in place.
#
# With +value+ the cell becomes prefix + value; without it, +prefix+ is
# appended to the cell's current content.
#
# @param entry [Array] one parsed row, indexed like COLUMN_NAMES
# @param column [String] a name from COLUMN_NAMES
# @param prefix [String]
# @param value [String, nil]
# @return the new cell content
def column_replace(entry,column,prefix,value=nil)
  idx = COLUMN_NAMES.index(column)
  entry[idx] = value ? prefix + value : entry[idx] + prefix
end
153
+
154
# Asks the bio2rdf HGNC SPARQL endpoint for the approved symbol of a gene,
# matching +hugo_symbol+ as either an approved symbol or a synonym.
#
# @param hugo_symbol [String] interpolated verbatim into the query
# @return [String] the first ?official binding converted with to_s (an
#   empty String when there are no results)
def official_symbol(hugo_symbol)
  endpoint = "http://cu.hgnc.bio2rdf.org/sparql"
  qry = <<-EOF

SELECT distinct ?official where {
{?hgnc <http://bio2rdf.org/hgnc_vocabulary:approved_symbol> "#{hugo_symbol}"}
UNION
{?hgnc <http://bio2rdf.org/hgnc_vocabulary:synonym> "#{hugo_symbol}"}

?hgnc <http://bio2rdf.org/hgnc_vocabulary:approved_symbol> ?official
}

  EOF

  solutions = SPARQL::Client.new(endpoint).query(qry)
  solutions.map { |solution| solution.official }.first.to_s
end
170
+
171
# Splits a sample barcode by fixed character positions.
#
# For "TCGA-E9-A22B-01A-11D-A159-09" this yields "E9-A22B" (characters
# 5..11) and "01A-11D-A159-09" (character 13 onwards).
#
# @param code [String, nil] the barcode
# @return [Array(String, String)] the two slices; a slice is nil when the
#   barcode is nil or too short to contain it
def parse_barcode(code)
  code = code.to_s # tolerate a missing barcode instead of raising on nil
  [code[5..11], code[13..-1]]
end
175
+
176
# Assembles the non-observation part of the output: prefixes, data structure
# definition, dataset, then component specifications, measure properties,
# dimension properties, code lists and concept codes, in that order.
#
# @param options [Hash] forwarded to every helper
# @return [String] the object returned by +prefixes+ with everything
#   appended to it
def structure(options={})
  name = @dataset_name

  str = prefixes(name,options)
  str << data_structure_definition(@measures,@dimensions,@codes,name,options)
  str << dataset(name,options)

  component_specifications(@measures, @dimensions, @codes, name, options).each { |spec| str << spec }
  measure_properties(@measures,name,options).each { |measure| str << measure }
  dimension_properties(@dimensions,@codes,name,options).each { |dimension| str << dimension }
  code_lists(@codes,TCGA_CODES,name,options).each { |list| str << list }
  concept_codes(@codes,TCGA_CODES,name,options).each { |concept| str << concept }

  str
end
188
+
189
# Rewrites every hgnc.symbol URI in +file+ so it uses the symbol returned by
# #official_symbol, caching lookups in a class-level Hash.
#
# NOTE(review): the capture group is \w+, so a symbol containing "-" would
# only be matched up to the hyphen - confirm against real data.
#
# @param file passed to PubliSci::PostProcessor.process as both of its
#   first two arguments
# @return the value returned by PubliSci::PostProcessor.process
def post_process(file)
  reg = %r{http://identifiers.org/hgnc.symbol/(\w+)}
  @@hugo_cache ||= {}
  PubliSci::PostProcessor.process(file,file,reg){|g|
    # Was "cache[g]", an undefined local that raised NameError.
    symbol = (@@hugo_cache[g] ||= official_symbol(g))
    # Keep the original symbol when the lookup found nothing, rather than
    # emitting a URI with an empty symbol.
    symbol = g if symbol.to_s.empty?
    'http://identifiers.org/hgnc.symbol/' + symbol
  }
end
197
+ end
198
+ end
199
+ end