publisci 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220) hide show
  1. checksums.yaml +7 -0
  2. data/.document +5 -0
  3. data/.rspec +1 -0
  4. data/.travis.yml +13 -0
  5. data/Gemfile +36 -0
  6. data/LICENSE.txt +20 -0
  7. data/README.md +51 -0
  8. data/README.rdoc +48 -0
  9. data/Rakefile +68 -0
  10. data/bin/bio-publisci +106 -0
  11. data/bin/bio-publisci-server +50 -0
  12. data/examples/bio-band_integration.rb +9 -0
  13. data/examples/no_magic.prov +58 -0
  14. data/examples/no_magic.rb +58 -0
  15. data/examples/orm.prov +48 -0
  16. data/examples/primer-full.prov +120 -0
  17. data/examples/primer.prov +66 -0
  18. data/examples/prov_dsl.prov +85 -0
  19. data/examples/safe_gen.rb +7 -0
  20. data/examples/visualization/primer.prov +66 -0
  21. data/examples/visualization/prov_viz.rb +140 -0
  22. data/examples/visualization/viz.rb +35 -0
  23. data/features/create_generator.feature +21 -0
  24. data/features/integration.feature +12 -0
  25. data/features/integration_steps.rb +10 -0
  26. data/features/metadata.feature +37 -0
  27. data/features/metadata_steps.rb +40 -0
  28. data/features/orm.feature +60 -0
  29. data/features/orm_steps.rb +74 -0
  30. data/features/prov_dsl.feature +14 -0
  31. data/features/prov_dsl_steps.rb +11 -0
  32. data/features/reader.feature +25 -0
  33. data/features/reader_steps.rb +61 -0
  34. data/features/step_definitions/bio-publisci_steps.rb +0 -0
  35. data/features/store.feature +27 -0
  36. data/features/store_steps.rb +42 -0
  37. data/features/support/env.rb +13 -0
  38. data/features/writer.feature +14 -0
  39. data/features/writer_steps.rb +24 -0
  40. data/lib/bio-publisci.rb +64 -0
  41. data/lib/bio-publisci/analyzer.rb +57 -0
  42. data/lib/bio-publisci/datacube_model.rb +111 -0
  43. data/lib/bio-publisci/dataset/ORM/data_cube_orm.rb +240 -0
  44. data/lib/bio-publisci/dataset/ORM/observation.rb +20 -0
  45. data/lib/bio-publisci/dataset/configuration.rb +31 -0
  46. data/lib/bio-publisci/dataset/data_cube.rb +418 -0
  47. data/lib/bio-publisci/dataset/dataset.rb +11 -0
  48. data/lib/bio-publisci/dataset/dataset_for.rb +186 -0
  49. data/lib/bio-publisci/dataset/interactive.rb +72 -0
  50. data/lib/bio-publisci/dsl/config.rb +34 -0
  51. data/lib/bio-publisci/dsl/dataset_dsl.rb +93 -0
  52. data/lib/bio-publisci/dsl/dsl.rb +72 -0
  53. data/lib/bio-publisci/dsl/metadata_dsl.rb +85 -0
  54. data/lib/bio-publisci/dsl/prov_dsl.rb +143 -0
  55. data/lib/bio-publisci/metadata/generator.rb +323 -0
  56. data/lib/bio-publisci/metadata/metadata.rb +5 -0
  57. data/lib/bio-publisci/metadata/metadata_model.rb +25 -0
  58. data/lib/bio-publisci/metadata/prov/activity.rb +88 -0
  59. data/lib/bio-publisci/metadata/prov/agent.rb +100 -0
  60. data/lib/bio-publisci/metadata/prov/association.rb +107 -0
  61. data/lib/bio-publisci/metadata/prov/config.rb +34 -0
  62. data/lib/bio-publisci/metadata/prov/derivation.rb +60 -0
  63. data/lib/bio-publisci/metadata/prov/element.rb +120 -0
  64. data/lib/bio-publisci/metadata/prov/entity.rb +64 -0
  65. data/lib/bio-publisci/metadata/prov/model/prov_models.rb +109 -0
  66. data/lib/bio-publisci/metadata/prov/plan.rb +32 -0
  67. data/lib/bio-publisci/metadata/prov/prov.rb +78 -0
  68. data/lib/bio-publisci/metadata/prov/role.rb +40 -0
  69. data/lib/bio-publisci/metadata/prov/usage.rb +64 -0
  70. data/lib/bio-publisci/metadata/publisher.rb +25 -0
  71. data/lib/bio-publisci/mixins/custom_predicate.rb +38 -0
  72. data/lib/bio-publisci/mixins/dereferencable.rb +34 -0
  73. data/lib/bio-publisci/mixins/registry.rb +27 -0
  74. data/lib/bio-publisci/mixins/vocabulary.rb +8 -0
  75. data/lib/bio-publisci/output.rb +27 -0
  76. data/lib/bio-publisci/parser.rb +266 -0
  77. data/lib/bio-publisci/post_processor.rb +95 -0
  78. data/lib/bio-publisci/query/query_helper.rb +123 -0
  79. data/lib/bio-publisci/r_client.rb +54 -0
  80. data/lib/bio-publisci/readers/arff.rb +49 -0
  81. data/lib/bio-publisci/readers/base.rb +57 -0
  82. data/lib/bio-publisci/readers/csv.rb +88 -0
  83. data/lib/bio-publisci/readers/dataframe.rb +67 -0
  84. data/lib/bio-publisci/readers/maf.rb +199 -0
  85. data/lib/bio-publisci/readers/r_cross.rb +112 -0
  86. data/lib/bio-publisci/readers/r_matrix.rb +176 -0
  87. data/lib/bio-publisci/store.rb +56 -0
  88. data/lib/bio-publisci/writers/arff.rb +91 -0
  89. data/lib/bio-publisci/writers/base.rb +93 -0
  90. data/lib/bio-publisci/writers/csv.rb +31 -0
  91. data/lib/bio-publisci/writers/dataframe.rb +81 -0
  92. data/lib/bio-publisci/writers/json.rb +18 -0
  93. data/lib/r2rdf.rb +226 -0
  94. data/lib/template_bak.rb +12 -0
  95. data/lib/template_bak/publisci.rb +3 -0
  96. data/lib/vocabs/cc.rb +18 -0
  97. data/lib/vocabs/cert.rb +13 -0
  98. data/lib/vocabs/dc.rb +63 -0
  99. data/lib/vocabs/dc11.rb +23 -0
  100. data/lib/vocabs/doap.rb +45 -0
  101. data/lib/vocabs/exif.rb +168 -0
  102. data/lib/vocabs/foaf.rb +69 -0
  103. data/lib/vocabs/geo.rb +13 -0
  104. data/lib/vocabs/http.rb +26 -0
  105. data/lib/vocabs/ma.rb +78 -0
  106. data/lib/vocabs/owl.rb +59 -0
  107. data/lib/vocabs/rdfs.rb +17 -0
  108. data/lib/vocabs/rsa.rb +12 -0
  109. data/lib/vocabs/rss.rb +14 -0
  110. data/lib/vocabs/sioc.rb +93 -0
  111. data/lib/vocabs/skos.rb +36 -0
  112. data/lib/vocabs/wot.rb +21 -0
  113. data/lib/vocabs/xhtml.rb +9 -0
  114. data/lib/vocabs/xsd.rb +58 -0
  115. data/resources/maf_example.maf +10 -0
  116. data/resources/maf_rdf.ttl +1173 -0
  117. data/resources/primer.ttl +38 -0
  118. data/resources/queries/code_resources.rq +10 -0
  119. data/resources/queries/codes.rq +18 -0
  120. data/resources/queries/dataset.rq +7 -0
  121. data/resources/queries/dimension_ranges.rq +8 -0
  122. data/resources/queries/dimensions.rq +12 -0
  123. data/resources/queries/gene.rq +16 -0
  124. data/resources/queries/hugo_to_ensembl.rq +7 -0
  125. data/resources/queries/maf_column.rq +26 -0
  126. data/resources/queries/measures.rq +12 -0
  127. data/resources/queries/observation_labels.rq +8 -0
  128. data/resources/queries/observations.rq +13 -0
  129. data/resources/queries/patient.rq +11 -0
  130. data/resources/queries/patient_list.rq +11 -0
  131. data/resources/queries/patients_with_mutation.rq +18 -0
  132. data/resources/queries/properties.rq +8 -0
  133. data/resources/queries/test.rq +3 -0
  134. data/resources/weather.numeric.arff +28 -0
  135. data/scripts/get_gene_lengths.rb +50 -0
  136. data/scripts/islet_mlratio.rb +6 -0
  137. data/scripts/scan_islet.rb +6 -0
  138. data/scripts/update_reference.rb +25 -0
  139. data/server/helpers.rb +215 -0
  140. data/server/public/src-min-noconflict/LICENSE +24 -0
  141. data/server/public/src-min-noconflict/ace.js +11 -0
  142. data/server/public/src-min-noconflict/ext-chromevox.js +1 -0
  143. data/server/public/src-min-noconflict/ext-elastic_tabstops_lite.js +1 -0
  144. data/server/public/src-min-noconflict/ext-emmet.js +1 -0
  145. data/server/public/src-min-noconflict/ext-keybinding_menu.js +1 -0
  146. data/server/public/src-min-noconflict/ext-language_tools.js +1 -0
  147. data/server/public/src-min-noconflict/ext-modelist.js +1 -0
  148. data/server/public/src-min-noconflict/ext-old_ie.js +1 -0
  149. data/server/public/src-min-noconflict/ext-searchbox.js +1 -0
  150. data/server/public/src-min-noconflict/ext-settings_menu.js +1 -0
  151. data/server/public/src-min-noconflict/ext-spellcheck.js +1 -0
  152. data/server/public/src-min-noconflict/ext-split.js +1 -0
  153. data/server/public/src-min-noconflict/ext-static_highlight.js +1 -0
  154. data/server/public/src-min-noconflict/ext-statusbar.js +1 -0
  155. data/server/public/src-min-noconflict/ext-textarea.js +1 -0
  156. data/server/public/src-min-noconflict/ext-themelist.js +1 -0
  157. data/server/public/src-min-noconflict/ext-whitespace.js +1 -0
  158. data/server/public/src-min-noconflict/keybinding-emacs.js +1 -0
  159. data/server/public/src-min-noconflict/keybinding-vim.js +1 -0
  160. data/server/public/src-min-noconflict/mode-ruby.js +1 -0
  161. data/server/public/src-min-noconflict/snippets/ruby.js +1 -0
  162. data/server/public/src-min-noconflict/theme-twilight.js +1 -0
  163. data/server/public/src-min-noconflict/worker-coffee.js +1 -0
  164. data/server/public/src-min-noconflict/worker-css.js +1 -0
  165. data/server/public/src-min-noconflict/worker-javascript.js +1 -0
  166. data/server/public/src-min-noconflict/worker-json.js +1 -0
  167. data/server/public/src-min-noconflict/worker-lua.js +1 -0
  168. data/server/public/src-min-noconflict/worker-php.js +1 -0
  169. data/server/public/src-min-noconflict/worker-xquery.js +1 -0
  170. data/server/routes.rb +123 -0
  171. data/server/views/dsl.haml +65 -0
  172. data/server/views/dump.haml +3 -0
  173. data/server/views/import.haml +35 -0
  174. data/server/views/new_repository.haml +25 -0
  175. data/server/views/query.haml +28 -0
  176. data/server/views/repository.haml +25 -0
  177. data/spec/ORM/data_cube_orm_spec.rb +33 -0
  178. data/spec/ORM/prov_model_spec.rb +72 -0
  179. data/spec/analyzer_spec.rb +36 -0
  180. data/spec/bnode_spec.rb +66 -0
  181. data/spec/csv/bacon.csv +4 -0
  182. data/spec/csv/moar_bacon.csv +11 -0
  183. data/spec/data_cube_spec.rb +169 -0
  184. data/spec/dataset_for_spec.rb +77 -0
  185. data/spec/dsl_spec.rb +134 -0
  186. data/spec/generators/csv_spec.rb +44 -0
  187. data/spec/generators/dataframe_spec.rb +44 -0
  188. data/spec/generators/maf_spec.rb +40 -0
  189. data/spec/generators/r_cross_spec.rb +51 -0
  190. data/spec/generators/r_matrix_spec.rb +44 -0
  191. data/spec/length_lookup_spec.rb +0 -0
  192. data/spec/maf_query_spec.rb +343 -0
  193. data/spec/metadata/metadata_dsl_spec.rb +68 -0
  194. data/spec/prov/activity_spec.rb +74 -0
  195. data/spec/prov/agent_spec.rb +54 -0
  196. data/spec/prov/association_spec.rb +55 -0
  197. data/spec/prov/config_spec.rb +28 -0
  198. data/spec/prov/derivation_spec.rb +30 -0
  199. data/spec/prov/entity_spec.rb +52 -0
  200. data/spec/prov/role_spec.rb +94 -0
  201. data/spec/prov/usage_spec.rb +98 -0
  202. data/spec/queries/integrity/1.rq +21 -0
  203. data/spec/queries/integrity/11.rq +29 -0
  204. data/spec/queries/integrity/12.rq +37 -0
  205. data/spec/queries/integrity/14.rq +25 -0
  206. data/spec/queries/integrity/19_1.rq +21 -0
  207. data/spec/queries/integrity/19_2.rq +15 -0
  208. data/spec/queries/integrity/2.rq +22 -0
  209. data/spec/queries/integrity/3.rq +19 -0
  210. data/spec/queries/integrity/4.rq +13 -0
  211. data/spec/queries/integrity/5.rq +14 -0
  212. data/spec/r_builder_spec.rb +33 -0
  213. data/spec/resource/.RData +0 -0
  214. data/spec/resource/example.Rhistory +3 -0
  215. data/spec/spec_helper.rb +17 -0
  216. data/spec/turtle/bacon +147 -0
  217. data/spec/turtle/reference +2064 -0
  218. data/spec/turtle/weather +275 -0
  219. data/spec/writer_spec.rb +75 -0
  220. metadata +589 -0
@@ -0,0 +1,123 @@
1
module RDF
  class Query
    # Reopens RDF::Query::Solutions to add a plain-Ruby view of a result set.
    class Solutions
      # Converts the solution sequence into one Hash per solution.
      #
      # Each solution is enumerated as two-element entries; the first item of
      # an entry becomes the key and the second the value.
      #
      # @return [Array<Hash>] one hash per solution, in iteration order
      def to_h
        map do |solution|
          solution.each_with_object({}) do |element, row|
            row[element[0]] = element[1]
          end
        end
      end
    end
  end
end
18
+
19
+ module PubliSci
20
+ #.gsub(/^\s+/,'')
21
+ module Query
22
# Vocabularies used when building and querying data cubes, keyed by prefix.
#
# @return [Hash{Symbol => RDF::Vocabulary}]
def vocabulary
  {
    # Bare IRI: SPARQL-style angle brackets here would be embedded verbatim
    # in every term built from this vocabulary.
    base: RDF::Vocabulary.new('http://www.rqtl.org/ns/#'),
    qb:   RDF::Vocabulary.new("http://purl.org/linked-data/cube#"),
    rdf:  RDF::Vocabulary.new('http://www.w3.org/1999/02/22-rdf-syntax-ns#'),
    rdfs: RDF::Vocabulary.new('http://www.w3.org/2000/01/rdf-schema#'),
    prop: RDF::Vocabulary.new('http://www.rqtl.org/dc/properties/'),
    # NOTE(review): no trailing slash, unlike the commented-out `cs:` PREFIX
    # further down this module -- confirm which form is intended.
    cs:   RDF::Vocabulary.new('http://www.rqtl.org/dc/cs')
  }
end
32
+
33
+
34
+
35
+ # def execute_internal(query,repo)
36
+ # SPARQL.execute(query,repo)
37
+ # end
38
+
39
# Runs a SPARQL query against a store.
#
# @param string [String] the SPARQL query text
# @param store  a PubliSci::Store / RDF::FourStore (queried at "<url>/sparql/"),
#   an RDF::Graph / RDF::Repository (queried directly), or a String base URL
# @param type [Symbol] :fourstore (default) treats +store+ as a base URL and
#   appends "/sparql/"; :graph hands +store+ to the client unchanged
# @return the result of SPARQL::Client#query
# @raise [ArgumentError] when +store+/+type+ match none of the supported forms
def execute(string, store, type = :fourstore)
  # RDF::FourStore comes from an optional library; only test against it when loaded.
  four_store = defined?(RDF::FourStore) && store.is_a?(RDF::FourStore)

  target =
    if store.is_a?(PubliSci::Store) || four_store
      store.url + "/sparql/"
    elsif type == :graph || store.is_a?(RDF::Graph) || store.is_a?(RDF::Repository)
      store
    elsif type == :fourstore
      store + "/sparql/"
    else
      # Previously this fell through and failed with NoMethodError on nil.
      raise ArgumentError, "unsupported store type #{type.inspect} for #{store.class}"
    end

  SPARQL::Client.new(target).query(string)
end
49
+
50
# Loads a SPARQL query from disk, applies textual substitutions and runs it.
#
# The query is looked up, in order, as: +file+ itself, +file+ inside the
# bundled queries directory, then +file+ + ".rq" inside that directory.
# Only regular files match, so a directory that happens to share the query's
# name (e.g. a local "test/" folder) no longer shadows the bundled query.
#
# @param file [String] path or bundled query name
# @param store see #execute
# @param type [Symbol] see #execute
# @param substitutions [Hash] pattern => replacement pairs passed to String#gsub
# @return the result of #execute
# @raise [RuntimeError] when no query file can be located
def execute_from_file(file, store, type = :fourstore, substitutions = {})
  queries_dir =
    if Gem::Dependency.new('bio-publisci').matching_specs.size > 0
      Gem::Specification.find_by_name("bio-publisci").gem_dir + "/resources/queries/"
    else
      File.dirname(__FILE__) + '/../../../resources/queries/'
    end

  candidates = [file, queries_dir + file, queries_dir + file + '.rq']
  path = candidates.find { |candidate| File.file?(candidate) }
  raise "couldn't find query for #{file}" unless path

  query = substitutions.reduce(IO.read(path)) do |text, (pattern, replacement)|
    text.gsub(pattern, replacement)
  end
  execute(query, store, type)
end
71
+
72
+ # def prefixes
73
+ # <<-EOF
74
+ # PREFIX ns: <http://www.rqtl.org/ns/#>
75
+ # PREFIX qb: <http://purl.org/linked-data/cube#>
76
+ # PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
77
+ # PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
78
+ # PREFIX prop: <http://www.rqtl.org/dc/properties/>
79
+ # PREFIX cs: <http://www.rqtl.org/dc/cs/>
80
+ # PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
81
+
82
+ # EOF
83
+ # end
84
+
85
# Builds a SPARQL query selecting every value of +property+ in dataset +var+.
#
# Appends to (and returns) whatever +prefixes+ returns. That method is not
# defined in this module (only a commented-out version is visible), so the
# including object must supply it.
#
# @param var [#to_s] dataset name, interpolated after "ns:dataset-"
# @param property [#to_s] property name, interpolated after "prop:"
# @return [String] the query text
def property_values(var, property)
  prefixes() << <<-EOS
SELECT ?val WHERE {
?obs qb:dataSet ns:dataset-#{var} ;
prop:#{property} ?val ;
}
  EOS
end
95
+
96
# Builds a SPARQL query selecting the skos:prefLabel of every row referenced
# by observations of dataset +var+.
#
# Appends to (and returns) the string produced by +prefixes+, which the
# including object must supply.
#
# @param var [#to_s] dataset name, interpolated after "ns:dataset-"
# @return [String] the query text
def row_names(var)
  query = prefixes
  query << <<-EOS
SELECT ?label WHERE {
?obs qb:dataSet ns:dataset-#{var} ;
prop:refRow ?row .
?row skos:prefLabel ?label .
}
  EOS
  query
end
106
+
107
+ # Currently will say "___ Component", needs further parsing
108
# Builds a SPARQL query selecting the distinct rdfs:label of every component
# attached to the data structure definition "ns:dsd-<var>".
#
# Appends to (and returns) the string produced by +prefixes+, which the
# including object must supply.
#
# @param var [#to_s] dataset name, interpolated after "ns:dsd-"
# @return [String] the query text
def property_names(var)
  query = prefixes
  query << <<-EOS
SELECT DISTINCT ?label WHERE {
ns:dsd-#{var} qb:component ?c .
?c rdfs:label ?label
}
  EOS
  query
end
117
+
118
+ end
119
+
120
# Exposes every PubliSci::Query method as a class-level method, e.g.
# QueryHelper.execute(...).
class QueryHelper
  class << self
    include PubliSci::Query
  end
end
123
+ end
@@ -0,0 +1,54 @@
1
+ module PubliSci
2
# Thin helpers around an Rserve connection.
module Rconnect
  # Opens an Rserve connection.
  #
  # @param address optional argument forwarded to Rserve::Connection.new;
  #   when nil the connection is created with no arguments
  # @return [Rserve::Connection]
  def connect(address = nil)
    address ? Rserve::Connection.new(address) : Rserve::Connection.new
  end

  # Loads an R workspace file into the session behind +connection+.
  #
  # The path is embedded in an R string literal, so backslashes and double
  # quotes are escaped first; otherwise such paths would produce broken
  # (or unintended) R code.
  #
  # @param connection [#eval] object that evaluates R source
  # @param loc [String] directory containing the workspace
  # @param file [String] workspace file name
  # @return whatever +connection.eval+ returns
  def load_workspace(connection, loc = Dir.home, file = ".RData")
    path = File.join(loc, file)
    escaped = path.gsub(/["\\]/) { |char| "\\#{char}" }
    connection.eval "load(\"#{escaped}\")"
  end

  # Evaluates an arbitrary R instruction.
  #
  # @param connection [#eval]
  # @param instruction [String] R source
  def get(connection, instruction)
    connection.eval instruction
  end

  # Evaluates R's ls() on the connection.
  #
  # @param connection [#eval]
  def get_vars(connection)
    connection.eval("ls()")
  end
end
26
+
27
# Convenience wrapper that owns one R connection (see PubliSci::Rconnect)
# and a workspace directory.
class Client
  include PubliSci::Rconnect
  attr :R

  # @param auto [Boolean] when truthy, load the workspace immediately and
  #   print the payload of the variable listing
  # @param loc [String] directory holding the workspace file
  def initialize(auto=true, loc=Dir.home)
    @R = connect
    @loc = loc
    return unless auto

    load_ws
    puts "vars: #{vars.payload}"
  end

  # Loads the workspace found in the configured directory.
  def load_ws
    load_workspace(@R, @loc)
  end

  # Evaluates +var+ on the connection and returns the result.
  def get_var(var)
    get(@R, var)
  end

  # @return [String] path of the workspace file inside the configured directory
  def get_ws
    "#{@loc}/.RData"
  end

  # Lists the variables of the R session.
  def vars
    get_vars(@R)
  end
end
54
+ end
@@ -0,0 +1,49 @@
1
+ module PubliSci
2
+ module Readers
3
+ class ARFF
4
+ include PubliSci::Dataset::DataCube
5
+
6
# Converts ARFF input into a data cube via +generate+.
#
# Attributes that declare a code list are passed as both dimensions and
# codes; all other attributes are passed as measures. Observation labels
# are 1..N, where N is the number of values in the first attribute.
#
# @param arff [String] ARFF text, or the path of an existing ARFF file
# @param options [Hash] forwarded to +generate+; :no_labels is always forced
#   to true. The caller's hash is left untouched (a copy is modified).
# @return whatever +generate+ returns
def generate_n3(arff, options={})
  arff = IO.read(arff) if File.exist? arff
  options = options.merge(no_labels: true)
  @options = options

  comps = components(arff)
  obs = data(arff, comps.keys)
  coded, plain = comps.keys.partition { |name| comps[name][:codes] }
  labels = (1..obs.first[1].size).to_a

  generate(plain, coded, coded.dup, obs, labels, relation(arff), options)
end
14
+
15
# Extracts the relation name from ARFF text.
#
# Takes the first "@relation ..." occurrence (case-insensitive, up to the
# end of its line) and returns its last whitespace-separated token.
#
# @param arff [String] ARFF text
# @return [String] the relation name
def relation(arff)
  declaration = arff[/@relation.+/i]
  declaration.split.last
end
18
+
19
# Parses the @ATTRIBUTE declarations of ARFF text.
#
# Nominal attributes ("@attribute name {a, b, c}") yield
# {type: :coded, codes: [...]}; whitespace around each code is removed so
# the codes match the values found in the data section. All other attributes
# yield {type: "<declared type>"}.
#
# Quoted attribute names containing whitespace are still not supported.
#
# @param arff [String] ARFF text
# @return [Hash{String => Hash}] attribute name => description, in file order
def components(arff)
  declarations = arff.split("\n").select { |line| line =~ /^@ATTRIBUTE/i }

  declarations.each_with_object({}) do |line, found|
    code_list = line[/\{.*}/]
    if code_list
      name = line.match(/\s.*/).to_a.first.strip.split.first
      codes = code_list[1..-2].split(',').map(&:strip)
      found[name] = {type: :coded, codes: codes}
    else
      tokens = line.split
      found[tokens[1]] = {type: tokens[2]}
    end
  end
end
36
+
37
# Reads the @DATA section of ARFF text into per-attribute columns.
#
# Blank lines and "%" comment lines inside the data section are skipped
# rather than turned into rows of nils, and whitespace around each value is
# removed.
#
# @param arff [String] ARFF text
# @param attributes [Array<String>] attribute names, in column order
# @return [Hash{String => Array}] attribute name => values, one per data row
# @raise [ArgumentError] when the text has no @DATA marker
def data(arff, attributes)
  lines = arff.split("\n")
  marker = lines.index { |line| line =~ /^@DATA/i }
  raise ArgumentError, "ARFF input has no @DATA section" unless marker

  columns = attributes.each_with_object({}) { |attrib, cols| cols[attrib] = [] }

  lines[(marker + 1)..-1].each do |line|
    stripped = line.strip
    next if stripped.empty? || stripped.start_with?('%')

    values = stripped.split(',').map(&:strip)
    attributes.each_with_index { |attrib, i| columns[attrib] << values[i] }
  end

  columns
end
47
+ end
48
+ end
49
+ end
@@ -0,0 +1,57 @@
1
+ module PubliSci
2
+ module Readers
3
+ class Base
4
+ include PubliSci::Query
5
+ include PubliSci::Parser
6
+ include PubliSci::Analyzer
7
+ include PubliSci::Interactive
8
+ include PubliSci::Dataset::DataCube
9
+
10
+ #should be overridden if extra processing/input is required
11
# Default entry point: forwards to #generate_n3.
#
# The first argument is passed through as-is; the arguments between the
# first and the last are turned into a Hash with Hash[]. The final argument
# is not forwarded.
def automatic(*args)
  source = args[0]
  middle = args[1..-2]
  generate_n3(source, Hash[*middle])
end
14
+
15
# Placeholder that always raises.
#
# @raise [RuntimeError] always, naming the receiver
def generate_n3(*args)
  message = "#{self} does not implement a generate_n3 method!"
  raise message
end
18
+
19
# Builds a two-pair node description: an "a" (type) pair followed by a
# SIO_000300 pair carrying +value+.
#
# @param type the node's type
# @param value the wrapped value
# @return [Array<Array>] [["a", type], [SIO_000300 IRI, value]]
def sio_value(type,value)
  type_pair = ["a", type]
  value_pair = ["http://semanticscience.org/resource/SIO_000300", value]
  [type_pair, value_pair]
end
25
+
26
# Builds a nested description: a SIO_000008 entry whose object is a
# SIO_000300 entry carrying +value+.
#
# @param attribute_type when truthy, the result is wrapped as
#   [["a", attribute_type], <SIO_000008 entry>]
# @param value the innermost value
# @param data_type when truthy, the inner entry is wrapped as
#   [["a", data_type], <SIO_000300 entry>]
# @return [Array]
def sio_attribute(attribute_type,value,data_type=nil)
  value_node = ["http://semanticscience.org/resource/SIO_000300", value]
  value_node = [["a", data_type], value_node] if data_type

  attribute_node = ["http://semanticscience.org/resource/SIO_000008", value_node]
  return attribute_node unless attribute_type

  [["a", attribute_type], attribute_node]
end
47
+
48
# Returns the next value of a per-object counter: 0 on the first call, then
# 1, 2, ... on subsequent calls.
#
# @return [Integer]
def next_label
  @__current_label = @__current_label ? @__current_label + 1 : 0
end
55
+ end
56
+ end
57
+ end
@@ -0,0 +1,88 @@
1
+ module PubliSci
2
+ module Readers
3
+ class CSV < Base
4
# Generates a data cube from a CSV file, optionally prompting for anything
# not supplied.
#
# @param file [String, nil] CSV path; prompted for when nil and interactive
# @param dataset_name [String, nil] defaults to the file's base name up to
#   its first "."
# @param options [Hash] :dimensions and :measures are filled in from prompts
#   when interactive and not already set
# @param interactive [Boolean] when false nothing is prompted for
# @return whatever #generate_n3 returns
# @raise [RuntimeError] when no input file is given
def automatic(file=nil,dataset_name=nil,options={},interactive=true)
  if interactive && !file
    puts "Input file?"
    # Read the terminal explicitly: a bare `gets` reads from ARGF, i.e. from
    # any files named on the command line. `to_s` covers EOF (nil).
    file = $stdin.gets.to_s.chomp
  end

  raise "CSV reader needs an input file" unless file && file.size > 0

  unless dataset_name
    default_name = File.basename(file).split('.').first
    dataset_name =
      if interactive
        interact("Dataset name?", "#{default_name}") { |sel| default_name }
      else
        default_name
      end
  end

  categories = ::CSV.read(file)[0]

  if interactive
    unless options[:dimensions]
      options[:dimensions] = Array(interact("Dimensions?", categories[0], categories))
    end

    unless options[:measures]
      meas = categories - (options[:dimensions] || [categories[0]])
      selection = interact("Measures?", meas, meas) { |s| nil }
      options[:measures] = Array(selection) unless selection.nil?
    end
  end

  generate_n3(file, dataset_name, options)
end
40
+
41
# Reads +file+ as CSV and turns it into a data cube via +generate+.
#
# The parsed rows and the options are stored on the instance because the
# #measures, #dimensions, #codes, #observation_data and #observation_labels
# readers work from them.
#
# @param file [String] CSV path
# @param dataset_name [String]
# @param options [Hash]
# @return whatever +generate+ returns
def generate_n3(file, dataset_name, options={})
  @data = ::CSV.read(file)
  @options = options

  meas = measures
  dims = dimensions
  code_list = codes
  obs = observation_data
  labels = observation_labels
  generate(meas, dims, code_list, obs, labels, dataset_name, options)
end
46
+
47
# @return [Array] the :dimensions option when set, otherwise a one-element
#   array holding the first header cell
def dimensions
  chosen = @options[:dimensions]
  return chosen if chosen

  [@data[0][0]]
end
50
+
51
# @return [Array] the :codes option when set, otherwise the dimensions
def codes
  chosen = @options[:codes]
  return chosen if chosen

  dimensions()
end
54
+
55
# @return [Array] the :measures option when set, otherwise every header cell
#   that is not a dimension
def measures
  chosen = @options[:measures]
  return chosen if chosen

  @data[0] - dimensions()
end
58
+
59
# Labels for each data row (the header row is excluded).
#
# @return [Array] the cell at index :label_column of every data row when
#   that option is set, otherwise the integers 1..(number of data rows)
def observation_labels
  label_column = @options[:label_column]
  return (1..@data.size - 1).to_a unless label_column

  @data.drop(1).map { |row| row[label_column] }
end
70
+
71
# Pivots the parsed CSV rows into columns.
#
# @return [Hash{String => Array}] header cell => that column's values, one
#   per data row, in row order
def observation_data
  header, *rows = @data
  columns = header.each_with_object({}) { |label, acc| acc[label] = [] }

  rows.each do |row|
    row.each_with_index { |entry, i| columns[header[i]] << entry }
  end

  columns
end
86
+ end
87
+ end
88
+ end
@@ -0,0 +1,67 @@
1
+ module PubliSci
2
+ module Readers
3
+ class Dataframe
4
+ include PubliSci::Dataset::DataCube
5
+ include PubliSci::Readers::Output
6
+
7
+ # def initialize(var)
8
+ # @var = var
9
+ # end
10
+
11
# Turns an R data frame expression into a data cube and hands the result to
# +output+.
#
# @param rexp the R expression; stored for the reader methods below
# @param var dataset name passed to +generate+
# @param options [Hash] :type defaults to :string (set on the given hash)
# @return whatever +output+ returns
def generate_n3(rexp, var, options={})
  @rexp = rexp
  options[:type] = :string unless options[:type]
  @options = options

  cube = generate(measures, dimensions, codes, observation_data, observation_labels, var, options)
  output(cube, options)
end
17
+
18
# @return [Array] the :dimensions option when set; otherwise a one-element
#   array holding the :row_label option, or "refRow" when that is unset too
def dimensions
  return @options[:dimensions] if @options[:dimensions]

  [@options[:row_label] || "refRow"]
end
27
+
28
# @return [Array] the :codes option when set; otherwise a one-element array
#   holding the :row_label option, or "refRow" when that is unset too
def codes
  return @options[:codes] if @options[:codes]

  [@options[:row_label] || "refRow"]
end
37
+
38
# @return [Array] the :measures option, or the data frame's column names when
#   it is unset; any entries listed in the :dimensions option are removed
def measures
  candidates = @options[:measures] || @rexp.payload.names
  excluded = @options[:dimensions]
  return candidates unless excluded

  candidates - excluded
end
49
+
50
# Row labels of the data frame.
#
# @return [Array] the "row.names" attribute converted with to_ruby; when its
#   first element is nil or false, 1..N instead, where N is the size of the
#   first payload column
def observation_labels
  labels = @rexp.attr.payload["row.names"].to_ruby
  return labels if labels.first

  (1..@rexp.payload.first.to_ruby.size).to_a
end
55
+
56
# Collects every data frame column, converted with to_ruby, plus a column of
# row labels stored under the :row_label option (or "refRow").
#
# @return [Hash{String => Object}] column name => values
def observation_data
  columns = @rexp.payload.names.each_with_object({}) do |name, acc|
    acc[name] = @rexp.payload[name].to_ruby
  end
  label_key = @options[:row_label] || "refRow"
  columns[label_key] = observation_labels()
  columns
end
65
+ end
66
+ end
67
+ end
@@ -0,0 +1,199 @@
1
+ module PubliSci
2
+ module Readers
3
+ class MAF < Base
4
# Column order used for every data row: the MAF columns followed by two
# derived columns (patient_id, sample_id) filled in from the tumor barcode.
COLUMN_NAMES = %w{ Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_Position End_Position Strand Variant_Classification Variant_Type Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 dbSNP_RS dbSNP_Val_Status Tumor_Sample_Barcode Matched_Norm_Sample_Barcode Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2 Tumor_Validation_Allele1 Tumor_Validation_Allele2 Match_Norm_Validation_Allele1 Match_Norm_Validation_Allele2 Verification_Status Validation_Status Mutation_Status Sequencing_Phase Sequence_Source Validation_Method Score BAM_File Sequencer Tumor_Sample_UUID Matched_Norm_Sample_UUID patient_id sample_id}

# Default value for options[:ranges]; keys are spelled exactly as in
# COLUMN_NAMES ("Start_Position", not "Start_position").
# NOTE(review): "Chromosome" is typed xsd:int although X/Y/MT values exist -- confirm.
COMPONENT_RANGES = { "Tumor_Sample_Barcode" => "xsd:string", "Start_Position" => "xsd:int", "Center" => "xsd:string", "NCBI_Build" => "xsd:int", "Chromosome" => "xsd:int" }

# Allowed codes for each coded column.
# NOTE(review): "IGR1" may be "IGR" plus a footnote marker copied from the
# specification -- confirm against the TCGA MAF spec.
TCGA_CODES =
  {
    "Variant_Classification" => %w{Frame_Shift_Del Frame_Shift_Ins In_Frame_Del In_Frame_Ins Missense_Mutation Nonsense_Mutation Silent Splice_Site Translation_Start_Site Nonstop_Mutation 3'UTR 3'Flank 5'UTR 5'Flank IGR1 Intron RNA Targeted_Region},
    "Variant_Type" => %w{SNP DNP TNP ONP INS DEL Consolidated},
    "dbSNP_Val_Status" => %w{by1000genomes by2Hit2Allele byCluster byFrequency byHapMap byOtherPop bySubmitter alternate_allele},
    # %w takes no commas: "%w{Verified, Unknown}" produced the code "Verified,".
    "Verification_Status" => %w{Verified Unknown},
    "Validation_Status" => %w{Untested Inconclusive Valid Invalid},
    # Written as a plain array because one code contains a space.
    "Mutation_Status" => ["None", "Germline", "Somatic", "LOH", "Post-transcriptional modification", "Unknown"],
    "Sequence_Source" => %w{WGS WGA WXS RNA-Seq miRNA-Seq Bisulfite-Seq VALIDATION Other ncRNA-Seq WCS CLONE POOLCLONE AMPLICON CLONEEND FINISHING ChIP-Seq MNase-Seq DNase-Hypersensitivity EST FL-cDNA CTS MRE-Seq MeDIP-Seq MBD-Seq Tn-Seq FAIRE-seq SELEX RIP-Seq ChIA-PET},
    "Sequencer" => ["Illumina GAIIx", "Illumina HiSeq", "SOLID", "454", "ABI 3730xl", "Ion Torrent PGM", "Ion Torrent Proton", "PacBio RS", "Illumina MiSeq", "Illumina HiSeq 2500", "454 GS FLX Titanium", "AB SOLiD 4 System" ]
  }
19
+
20
# Converts a MAF file to Turtle, one observation per data line.
#
# @param input_file [String] path of the MAF file
# @param options [Hash]
#   :dataset_name  name of the dataset (default: base name of +input_file+)
#   :output        :print returns the Turtle as a String; anything else
#                  (default :file) writes "<output_base>.ttl"
#   :output_base   path prefix of the output file (default: the dataset name)
#   :lookup_hugo   when truthy the written file is passed to #post_process
#   :complex_objects, :ranges, :no_labels are read by the helpers called here.
#   Missing keys are filled in on the given hash.
#   NOTE(review): `||=` turns an explicit `no_labels: false` into true --
#   kept as-is; confirm whether labels are meant to be switchable.
# @return [String] in :print mode; otherwise the closed output File (its
#   #path stays usable), or the result of #post_process when :lookup_hugo is set
def generate_n3(input_file, options={})
  output = options[:output] || :file
  output_base = options[:output_base] || nil

  @dimensions = %w{Variant_Classification Variant_Type dbSNP_Val_Status Verification_Status Validation_Status Mutation_Status Sequence_Source Sequencer}
  # @codes = %w{Variant_Classification Variant_Type}
  @codes = @dimensions
  @measures = (COLUMN_NAMES - @dimensions - @codes)
  # Recomputed on every call so a reused reader does not keep the name of the
  # previous file, and so the :dataset_name option is honoured.
  @dataset_name = options[:dataset_name] || File.basename(input_file,'.*')
  @barcode_index = COLUMN_NAMES.index('Tumor_Sample_Barcode')

  options[:no_labels] ||= true
  options[:lookup_hugo] ||= false
  options[:complex_objects] ||= false
  options[:ranges] ||= COMPONENT_RANGES

  if output == :print
    str = structure(options)
    # File.foreach closes the input once iteration finishes.
    File.foreach(input_file).with_index do |line, n|
      processed = process_line(line, n.to_s, options)
      str << processed.first if processed
    end
    str
  else
    # TODO - allow multi file / separate structure output for very large datasets
    file_base = output_base || @dataset_name

    out = File.open("#{file_base}.ttl",'w')
    begin
      out.write(structure(options))
      File.foreach(input_file).with_index do |line, n|
        processed = process_line(line, n.to_s, options)
        out.write(processed.first) if processed
      end
    ensure
      # Flush and release the handle before anything re-reads the file.
      out.close
    end

    options[:lookup_hugo] ? post_process(out) : out
  end
end
70
+
71
# Converts one line of a MAF file into observations.
#
# Comment lines ("#..."), the header line ("Hugo...") and blank lines are
# skipped. The line is split on tabs, truncated or padded to the MAF columns,
# and extended with the two values derived from the tumor barcode. Gene
# symbol, Entrez id and dbSNP "rs" ids are rewritten as identifiers.org IRIs.
#
# @param line [String] raw input line
# @param label [String] observation label
# @param options [Hash] :complex_objects wraps values via #sio_values
# @return [nil] for skipped lines, otherwise whatever +observations+ returns
def process_line(line,label,options)
  return if line[0] == "#" || line[0..3] == "Hugo"
  return if line.strip.empty?

  maf_width = COLUMN_NAMES.length - 2
  # Keyword option: a positional options hash is rejected by CSV.parse on Ruby 3.
  entry = ::CSV.parse(line, col_sep: "\t").flatten[0...maf_width]
  entry.fill(nil, entry.length...maf_width)

  barcode = entry[@barcode_index]
  # A row without a barcode gets empty derived columns instead of crashing.
  entry = (entry + (barcode ? parse_barcode(barcode) : [nil, nil])).flatten

  entry[0] = "http://identifiers.org/hgnc.symbol/#{entry[0]}" if entry[0]

  # A 0 in the entrez-id column appears to mean null
  col = 1
  entry[col] = nil if entry[col] == '0'
  entry[col] = "http://identifiers.org/ncbigene/#{entry[col]}" if entry[col]

  # Only link non-novel dbSNP entries
  col = COLUMN_NAMES.index('dbSNP_RS')
  if entry[col] && entry[col][0..1] == "rs"
    entry[col] = "http://identifiers.org/dbsnp/#{entry[col].sub(/\Ars/, '')}"
  end

  # optionally create typed objects using sio nodes
  entry = sio_values(entry) if options[:complex_objects]

  data = {}
  COLUMN_NAMES.each_with_index { |name, i| data[name] = [entry[i]] }

  observations(@measures,@dimensions,@codes,data,[label],@dataset_name,options)
end
103
+
104
# Wraps selected columns of +entry+ in typed node descriptions, in place.
#
# Gene symbol (column 0) and Entrez id (column 1) are wrapped only when
# present; every other listed column is wrapped unconditionally.
#
# @param entry [Array] one value per COLUMN_NAMES entry
# @return [Array] the same +entry+ array
def sio_values(entry)
  entry[0] = sio_value('http://edamontology.org/data_1791', entry[0]) if entry[0]

  # Link entrez genes
  entry[1] = sio_value("http://identifiers.org/ncbigene", entry[1]) if entry[1]

  typed_values = [
    ['dbSNP_RS', "http://identifiers.org/dbsnp"],
    # test SIO attributes for chromosome
    ['Chromosome', "http://purl.org/obo/owl/SO#SO_0000340"]
  ]
  # More SIO attributes for alleles
  %w{Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2}.each do |name|
    typed_values << [name, "http://purl.org/obo/owl/SO#SO_0001023"]
  end
  typed_values.each do |name, type|
    col = COLUMN_NAMES.index(name)
    entry[col] = sio_value(type, entry[col])
  end

  typed_attributes = [
    ["Strand", "http://edamontology.org/data_0853"],
    ["Center", "foaf:homepage"],
    # Use faldo for locations
    ["Start_Position", "http://biohackathon.org/resource/faldo#begin", "http://biohackathon.org/resource/faldo#Position"],
    ["End_Position", "http://biohackathon.org/resource/faldo#end", "http://biohackathon.org/resource/faldo#Position"]
  ]
  typed_attributes.each do |name, attribute_type, *data_type|
    col = COLUMN_NAMES.index(name)
    entry[col] = sio_attribute(attribute_type, entry[col], *data_type)
  end

  entry
end
145
+
146
# Rewrites the cell of +entry+ belonging to +column+.
#
# With +value+, the cell becomes prefix + value. Without it, +prefix+ is
# appended to the cell's current content.
#
# @param entry [Array] one value per COLUMN_NAMES entry; modified in place
# @param column [String] a name from COLUMN_NAMES
# @param prefix [String]
# @param value [String, nil]
# @return [String] the new cell content
def column_replace(entry,column,prefix,value=nil)
  index = COLUMN_NAMES.index(column)
  entry[index] = value ? prefix + value : entry[index] + prefix
end
153
+
154
# Looks up the approved HGNC symbol for +hugo_symbol+ (matched as either an
# approved symbol or a synonym) on the Bio2RDF HGNC SPARQL endpoint.
#
# @param hugo_symbol [#to_s] interpolated verbatim into the query text
# @return [String] the first "official" binding, or "" when there is none
def official_symbol(hugo_symbol)
  endpoint = SPARQL::Client.new("http://cu.hgnc.bio2rdf.org/sparql")

  query = <<-EOF

SELECT distinct ?official where {
{?hgnc <http://bio2rdf.org/hgnc_vocabulary:approved_symbol> "#{hugo_symbol}"}
UNION
{?hgnc <http://bio2rdf.org/hgnc_vocabulary:synonym> "#{hugo_symbol}"}

?hgnc <http://bio2rdf.org/hgnc_vocabulary:approved_symbol> ?official
}

  EOF

  officials = endpoint.query(query).map(&:official)
  officials.first.to_s
end
170
+
171
# Splits a barcode such as "TCGA-E9-A22B-01A-11D-A159-09" by fixed position.
#
# @param code [String]
# @return [Array(String, String)] characters 5..11 ("E9-A22B") and
#   everything from character 13 on ("01A-11D-A159-09")
def parse_barcode(code)
  patient_part = code[5..11]
  sample_part = code[13..-1]
  [patient_part, sample_part]
end
175
+
176
# Assembles the dataset-level Turtle that precedes the observations:
# prefixes, data structure definition, dataset, component specifications,
# measure and dimension properties, code lists and concept codes.
#
# Uses the @measures, @dimensions, @codes and @dataset_name set by
# #generate_n3.
#
# @param options [Hash] forwarded to every helper
# @return [String]
def structure(options={})
  turtle = prefixes(@dataset_name,options)
  turtle << data_structure_definition(@measures,@dimensions,@codes,@dataset_name,options)
  turtle << dataset(@dataset_name,options)

  component_specifications(@measures, @dimensions, @codes, @dataset_name, options).each { |spec| turtle << spec }
  measure_properties(@measures,@dataset_name,options).each { |measure| turtle << measure }
  dimension_properties(@dimensions,@codes, @dataset_name,options).each { |dimension| turtle << dimension }
  code_lists(@codes,TCGA_CODES,@dataset_name,options).each { |list| turtle << list }
  concept_codes(@codes,TCGA_CODES,@dataset_name,options).each { |concept| turtle << concept }

  turtle
end
188
+
189
# Rewrites every identifiers.org HGNC symbol IRI in +file+ so that it uses
# the official symbol returned by #official_symbol.
#
# Lookups are cached in a class variable, so each symbol is resolved once.
# When a lookup returns nothing, the original symbol is kept rather than
# being replaced by an empty string.
#
# @param file passed to PubliSci::PostProcessor.process as both of its
#   first two arguments
# @return whatever PubliSci::PostProcessor.process returns
def post_process(file)
  reg = %r{http://identifiers.org/hgnc.symbol/(\w+)}
  @@hugo_cache ||= {}
  PubliSci::PostProcessor.process(file,file,reg){|g|
    @@hugo_cache[g] ||= begin
      official = official_symbol(g)
      official.empty? ? g : official
    end
    # The original referenced an undefined local `cache` here (NameError).
    'http://identifiers.org/hgnc.symbol/' + @@hugo_cache[g]
  }
end
197
+ end
198
+ end
199
+ end