biointerchange 1.0.0 → 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,36 @@
1
+ module BioInterchange::Genomics
2
+
3
+ # Represents a sequence of a genomic feature of a GFF3 file.
4
+ class GFF3FeatureSequence
5
+
6
+ # Creates a new feature sequence representation. A feature sequence is described by two or more
7
+ # lines in a GFF3 file that follow a '##FASTA' pragma statement.
8
+ #
9
+ # +feature_id+:: ID of the feature whose sequence is stored
10
+ # +sequence+:: sequence of the feature
11
+ def initialize(feature_id, sequence, comment = nil)
12
+ @feature_id = feature_id
13
+ @sequence = sequence
14
+ @comment = comment
15
+ end
16
+
17
+ # Returns the ID of the feature whose sequence is represented by the object.
18
+ def feature_id
19
+ @feature_id
20
+ end
21
+
22
+ # Returns the sequence of the feature.
23
+ def sequence
24
+ @sequence
25
+ end
26
+
27
+ # Returns additional comments -- if provided -- that are associated with the feature ID.
28
+ # If no comment was provided, then `nil` is returned.
29
+ def comment
30
+ @comment
31
+ end
32
+
33
+ end
34
+
35
+ end
36
+
@@ -11,24 +11,23 @@ module BioInterchange::Genomics
11
11
  # - biointerchange.gvf
12
12
  #
13
13
  # Outputs:
14
- # - rdf.biointerchange.gff3
15
- # - rdf.biointerchange.gvf
14
+ # - rdf.biointerchange.gfvo
16
15
  class RDFWriter < BioInterchange::Writer
17
16
 
18
17
  # Register writers:
19
18
  BioInterchange::Registry.register_writer(
20
- 'rdf.biointerchange.gff3',
19
+ 'rdf.biointerchange.gfvo',
21
20
  BioInterchange::Genomics::RDFWriter,
22
21
  [ 'biointerchange.gff3' ],
23
22
  true,
24
- 'Generic Feature Format Version 3 Ontology (GFF3O) based RDFization'
23
+ 'Genomic Feature and Variation Ontology (GFVO) based RDFization'
25
24
  )
26
25
  BioInterchange::Registry.register_writer(
27
- 'rdf.biointerchange.gvf',
26
+ 'rdf.biointerchange.gfvo',
28
27
  BioInterchange::Genomics::RDFWriter,
29
28
  [ 'biointerchange.gvf' ],
30
29
  true,
31
- 'Genome Variation Format Version 1 Ontology (GVF1O) based RDFization'
30
+ 'Genomic Feature and Variation Ontology (GFVO) based RDFization'
32
31
  )
33
32
 
34
33
  # Creates a new instance of a RDFWriter that will use the provided output stream to serialize RDF.
@@ -42,18 +41,19 @@ class RDFWriter < BioInterchange::Writer
42
41
  # Serialize a model as RDF.
43
42
  #
44
43
  # +model+:: a generic representation of input data that is derived from BioInterchange::Genomics::GFF3FeatureSet
45
- def serialize(model)
44
+ # +uri_prefix+:: optional URI prefix that replaces the default URI prefix for all set/feature/annotation URIs
45
+ def serialize(model, uri_prefix = nil)
46
46
  if model.instance_of?(BioInterchange::Genomics::GFF3FeatureSet) then
47
- @base = BioInterchange::GFF3O
48
- serialize_model(model)
47
+ @format = :gff3
49
48
  elsif model.instance_of?(BioInterchange::Genomics::GVFFeatureSet) then
50
- @base = BioInterchange::GVF1O
51
- serialize_model(model)
49
+ @format = :gvf
52
50
  else
53
51
  raise BioInterchange::Exceptions::ImplementationWriterError, 'The provided model cannot be serialized. ' +
54
52
  'This writer supports serialization for BioInterchange::Genomics::GFF3FeatureSet and '
55
53
  'BioInterchange::Genomics::GVFFeatureSet.'
56
54
  end
55
+ @base = BioInterchange::GFVO
56
+ serialize_model(model, uri_prefix)
57
57
  end
58
58
 
59
59
  protected
@@ -61,7 +61,8 @@ protected
61
61
  # Serializes RDF for a feature set representation.
62
62
  #
63
63
  # +model+:: an instance of +BioInterchange::Genomics::GFF3FeatureSet+
64
- def serialize_model(model)
64
+ # +set_uri+:: optional URI prefix that should be used for a set instance (and hence, all its dependents -- features, annotations, etc.)
65
+ def serialize_model(model, set_uri)
65
66
  # We record landmarks, because they can either be written when their "##sequence-region"
66
67
  # pragma statement appears, or otherwise, when the first feature with said landmark is
67
68
  # being serialized.
@@ -70,17 +71,25 @@ protected
70
71
  # Record written variants in order to avoid writing out RDF.type multiple times.
71
72
  @variants = {}
72
73
 
73
- graph = RDF::Graph.new
74
- graph.fast_ostream(@ostream) if BioInterchange::skip_rdf_graph
75
- set_uri = RDF::URI.new(model.uri)
76
- graph.insert(RDF::Statement.new(set_uri, RDF.type, @base.Set))
74
+ # Create a URI prefix that applies to the set, all features in the set, and all the features' annotations.
75
+ # Then register the prefix with the writer to have a concise Turtle output.
76
+ set_uri = set_uri[0..-2] if set_uri and set_uri.end_with?('/')
77
+ set_uri = RDF::URI.new(model.uri) unless set_uri
78
+ set_base(set_uri + '/')
79
+
80
+ create_triple(set_uri, RDF.type, @base.Set)
77
81
  model.pragmas.each { |pragma_name|
78
- serialize_pragma(graph, set_uri, model.pragma(pragma_name))
82
+ serialize_pragma(set_uri, model.pragma(pragma_name))
79
83
  }
80
84
  model.contents.each { |feature|
81
- serialize_feature(graph, set_uri, feature)
85
+ if feature.instance_of?(BioInterchange::Genomics::GFF3FeatureSequence) then
86
+ serialize_feature_sequence(set_uri, feature)
87
+ else
88
+ serialize_feature(set_uri, feature)
89
+ end
82
90
  }
83
- RDF::NTriples::Writer.dump(graph, @ostream)
91
+ close
92
+ #RDF::NTriples::Writer.dump(graph, @ostream)
84
93
  # TODO Figure out why the following is very slow. Use with 'rdf-raptor'.
85
94
  # Having said that, Jena's rdfcat is very good for converting formats
86
95
  # anyway, so perhaps it is not worth investigating the following.
@@ -89,25 +98,22 @@ protected
89
98
 
90
99
  # Serializes pragmas for a given feature set URI.
91
100
  #
92
- # +graph+:: RDF graph to which the pragmas are added
93
101
  # +set_uri+:: the feature set URI to which the pragmas belong
94
102
  # +pragma+:: an object representing a pragma statement
95
- def serialize_pragma(graph, set_uri, pragma)
103
+ def serialize_pragma(set_uri, pragma)
96
104
  if pragma.kind_of?(Hash) then
97
- if (pragma.has_key?('attribute-method') or pragma.has_key?('data-source') or pragma.has_key?('score-method') or pragma.has_key?('source-method') or pragma.has_key?('technology-platform')) and @base == BioInterchange::GVF1O then
98
- serialize_structured_attribute(graph, set_uri, pragma)
99
- elsif pragma.has_key?('gff-version') and @base == BioInterchange::GFF3O then
100
- graph.insert(RDF::Statement.new(set_uri, @base.version, RDF::Literal.new(pragma['gff-version'], :datatype => RDF::XSD.float )))
101
- elsif pragma.has_key?('gff-version') and @base == BioInterchange::GVF1O then
102
- graph.insert(RDF::Statement.new(set_uri, @base.gff_version, RDF::Literal.new(pragma['gff-version'], :datatype => RDF::XSD.float )))
103
- elsif pragma.has_key?('gvf-version') and @base == BioInterchange::GVF1O then
104
- graph.insert(RDF::Statement.new(set_uri, @base.gvf_version, RDF::Literal.new(pragma['gvf-version'], :datatype => RDF::XSD.float )))
105
+ if (pragma.has_key?('attribute-method') or pragma.has_key?('data-source') or pragma.has_key?('score-method') or pragma.has_key?('source-method') or pragma.has_key?('technology-platform')) then
106
+ serialize_structured_attribute(set_uri, pragma)
107
+ elsif pragma.has_key?('gff-version') then
108
+ create_triple(set_uri, @base.gff_version, pragma['gff-version'], RDF::XSD.float)
109
+ elsif pragma.has_key?('gvf-version') then
110
+ create_triple(set_uri, @base.gvf_version, pragma['gvf-version'], RDF::XSD.float)
105
111
  elsif pragma.has_key?('sequence-region') then
106
112
  pragma['sequence-region'].keys.each { |seqid|
107
- serialize_landmark(graph, set_uri, pragma['sequence-region'][seqid])
113
+ serialize_landmark(set_uri, pragma['sequence-region'][seqid])
108
114
  }
109
115
  elsif pragma.has_key?('species') then
110
- graph.insert(RDF::Statement.new(set_uri, @base.species, RDF::URI.new(pragma['species'])))
116
+ create_triple(set_uri, @base.species, RDF::URI.new(pragma['species']))
111
117
  end
112
118
  else
113
119
  # TODO
@@ -116,77 +122,97 @@ protected
116
122
 
117
123
  # Serializes a +GFF3Feature+ object for a given feature set URI.
118
124
  #
119
- # +graph+:: RDF graph to which the feature is added
120
125
  # +set_uri+:: the feature set URI to which the feature belongs
121
126
  # +feature+:: a +GFF3Feature+ instance
122
- def serialize_feature(graph, set_uri, feature)
127
+ def serialize_feature(set_uri, feature)
123
128
  # TODO Make sure there is only one value in the 'ID' list.
124
129
  feature_uri = RDF::URI.new("#{set_uri.to_s}/feature/#{feature.sequence_id},#{feature.source},#{feature.type.to_s.sub(/^[^:]+:\/\//, '')},#{feature.start_coordinate},#{feature.end_coordinate},#{feature.strand},#{feature.phase}") unless feature.attributes.has_key?('ID')
125
130
  feature_uri = RDF::URI.new("#{set_uri.to_s}/feature/#{feature.attributes['ID'][0]}") if feature.attributes.has_key?('ID')
126
- feature_datatype_properties = @base.feature_properties.select { |uri| @base.is_datatype_property?(uri) }[0]
127
- feature_object_properties = @base.feature_properties.select { |uri| @base.is_object_property?(uri) }[0]
128
- graph.insert(RDF::Statement.new(set_uri, @base.contains, feature_uri))
129
- graph.insert(RDF::Statement.new(feature_uri, RDF.type, @base.Feature))
130
- serialize_landmark(graph, set_uri, GFF3Landmark.new(feature.sequence_id)) unless @landmarks.has_key?(feature.sequence_id)
131
- graph.insert(RDF::Statement.new(feature_uri, @base.with_parent([ @base.seqid ].flatten, feature_object_properties)[0], @landmarks[feature.sequence_id]))
132
- graph.insert(RDF::Statement.new(feature_uri, @base.source, RDF::Literal.new(feature.source)))
133
- graph.insert(RDF::Statement.new(feature_uri, @base.type, feature.type))
134
- graph.insert(RDF::Statement.new(feature_uri, @base.with_parent(@base.start, feature_datatype_properties)[0], RDF::Literal.new(feature.start_coordinate)))
135
- graph.insert(RDF::Statement.new(feature_uri, @base.with_parent(@base.end, feature_datatype_properties)[0], RDF::Literal.new(feature.end_coordinate)))
136
- graph.insert(RDF::Statement.new(feature_uri, @base.score, RDF::Literal.new(feature.score))) if feature.score
137
- strand_uri = @base.with_parent(@base.strand, feature_object_properties)[0]
131
+ create_triple(set_uri, @base.contains, feature_uri)
132
+ create_triple(feature_uri, RDF.type, @base.Feature)
133
+ serialize_landmark(set_uri, GFF3Landmark.new(feature.sequence_id)) unless @landmarks.has_key?(feature.sequence_id)
134
+ create_triple(feature_uri, @base.seqid, @landmarks[feature.sequence_id])
135
+ create_triple(feature_uri, @base.source, feature.source)
136
+ create_triple(feature_uri, @base.type, feature.type)
137
+ create_triple(feature_uri, @base.phase, feature.phase) if feature.phase
138
+
139
+ serialize_coordinate(set_uri, feature_uri, feature)
140
+ serialize_attributes(set_uri, feature_uri, feature.attributes) unless feature.attributes.keys.empty?
141
+ end
142
+
143
+ def serialize_coordinate(set_uri, feature_uri, feature)
144
+ region_uri = RDF::URI.new("#{feature_uri.to_s}/region")
145
+ start_position_uri = RDF::URI.new("#{feature_uri.to_s}/region/start")
146
+ end_position_uri = RDF::URI.new("#{feature_uri.to_s}/region/end")
147
+ #feature_object_properties = @base.feature_properties.select { |uri| @base.is_object_property?(uri) }[0]
148
+ ##graph.insert(RDF::Statement.new(feature_uri, @base.with_parent([ @base.region ].flatten, feature_object_properties), region_uri))
149
+ create_triple(feature_uri, @base.locus, region_uri)
150
+ create_triple(region_uri, RDF.type, BioInterchange::FALDO.Region)
151
+ # BIN STUFF
152
+ if false then
153
+ bin_uri = RDF::URI.new("bin://#{feature.sequence_id}/#{BioInterchange::Genomics::Locations.reg2bin(feature.start_coordinate, feature.end_coordinate)}")
154
+ create_triple(bin_uri, RDF::URI.new('bin://contains'), feature_uri)
155
+ end
156
+ create_triple(region_uri, BioInterchange::FALDO.begin, start_position_uri)
157
+ create_triple(region_uri, BioInterchange::FALDO.end, end_position_uri)
138
158
  case feature.strand
139
159
  when BioInterchange::Genomics::GFF3Feature::NOT_STRANDED
140
- graph.insert(RDF::Statement.new(feature_uri, strand_uri, @base.NotStranded))
160
+ create_triple(start_position_uri, RDF.type, BioInterchange::FALDO.Position)
161
+ create_triple(end_position_uri, RDF.type, BioInterchange::FALDO.Position)
141
162
  when BioInterchange::Genomics::GFF3Feature::UNKNOWN
142
- graph.insert(RDF::Statement.new(feature_uri, strand_uri, @base.UnknownStrand))
163
+ create_triple(start_position_uri, RDF.type, BioInterchange::FALDO.Position)
164
+ create_triple(end_position_uri, RDF.type, BioInterchange::FALDO.Position)
143
165
  when BioInterchange::Genomics::GFF3Feature::POSITIVE
144
- graph.insert(RDF::Statement.new(feature_uri, strand_uri, @base.Positive))
166
+ create_triple(start_position_uri, RDF.type, BioInterchange::FALDO.Positive_strand)
167
+ create_triple(end_position_uri, RDF.type, BioInterchange::FALDO.Positive_strand)
145
168
  when BioInterchange::Genomics::GFF3Feature::NEGATIVE
146
- graph.insert(RDF::Statement.new(feature_uri, strand_uri, @base.Negative))
169
+ create_triple(start_position_uri, RDF.type, BioInterchange::FALDO.Negative_strand)
170
+ create_triple(end_position_uri, RDF.type, BioInterchange::FALDO.Negative_strand)
147
171
  else
148
172
  raise BioInterchange::Exceptions::InputFormatError, 'Strand of feature is set to an unknown constant.'
149
173
  end
150
- graph.insert(RDF::Statement.new(feature_uri, @base.phase, RDF::Literal.new(feature.phase))) if feature.phase
151
-
152
- serialize_attributes(graph, set_uri, feature_uri, feature.attributes) unless feature.attributes.keys.empty?
174
+ create_triple(start_position_uri, BioInterchange::FALDO.position, feature.start_coordinate)
175
+ create_triple(end_position_uri, BioInterchange::FALDO.position, feature.end_coordinate)
176
+ create_triple(feature_uri, @base.score, feature.score) if feature.score
153
177
  end
154
178
 
155
179
  # Serializes a genomic feature landmark ("seqid").
156
180
  #
157
- # +graph+:: RDF graph to which the landmark is added
158
181
  # +set_uri+:: the feature set URI to which the landmark belongs
159
182
  # +landmark+:: encapsulated landmark data
160
- def serialize_landmark(graph, set_uri, landmark)
183
+ def serialize_landmark(set_uri, landmark)
161
184
  return if @landmarks.has_key?(landmark.seqid)
162
185
  landmark_uri = RDF::URI.new("#{set_uri.to_s}/landmark/#{landmark.seqid}")
186
+ region_uri = RDF::URI.new("#{landmark_uri.to_s}/region")
163
187
  @landmarks[landmark.seqid] = landmark_uri
164
- graph.insert(RDF::Statement.new(landmark_uri, RDF.type, @base.Landmark))
165
- graph.insert(RDF::Statement.new(landmark_uri, @base.with_parent([ @base.id ].flatten, @base.landmark_properties)[0], RDF::Literal.new(landmark.seqid)))
166
- graph.insert(RDF::Statement.new(landmark_uri, @base.with_parent([ @base.start ].flatten, @base.landmark_properties)[0], RDF::Literal.new(landmark.start_coordinate))) if landmark.start_coordinate
167
- graph.insert(RDF::Statement.new(landmark_uri, @base.with_parent([ @base.end ].flatten, @base.landmark_properties)[0], RDF::Literal.new(landmark.end_coordinate))) if landmark.end_coordinate
188
+ create_triple(landmark_uri, RDF.type, @base.Landmark)
189
+ create_triple(landmark_uri, @base.id, landmark.seqid)
190
+ create_triple(landmark_uri, @base.locus, region_uri)
191
+ if landmark.start_coordinate then
192
+ start_position_uri = RDF::URI.new("#{landmark_uri.to_s}/region/start")
193
+ create_triple(region_uri, BioInterchange::FALDO.begin, start_position_uri)
194
+ create_triple(start_position_uri, BioInterchange::FALDO.position, landmark.start_coordinate)
195
+ end
196
+ if landmark.start_coordinate then
197
+ end_position_uri = RDF::URI.new("#{landmark_uri.to_s}/region/end")
198
+ create_triple(region_uri, BioInterchange::FALDO.end, end_position_uri)
199
+ create_triple(end_position_uri, BioInterchange::FALDO.position, landmark.end_coordinate)
200
+ end
168
201
  end
169
202
 
170
203
  # Serializes the attributes of a feature.
171
204
  #
172
- # +graph+:: RDF graph to which the feature is added
173
205
  # +set_uri+:: URI of the set these attributes belong to (implicit due to feature)
174
206
  # +feature_uri+:: the feature URI to which the attributes belong
175
207
  # +attributes+:: a map of tag/value pairs
176
- def serialize_attributes(graph, set_uri, feature_uri, attributes)
208
+ def serialize_attributes(set_uri, feature_uri, attributes)
177
209
  attributes.each_pair { |tag, list|
178
210
  # Check for defined tags (in alphabetical order), if not matched, serialize as generic Attribute:
179
211
  if tag == 'Alias' then
180
212
  list.each { |value|
181
- graph.insert(RDF::Statement.new(feature_uri, @base.alias, RDF::Literal.new(value)))
213
+ create_triple(feature_uri, @base.alias, value)
182
214
  }
183
215
  elsif tag == 'Dbxref' then
184
- feature_properties = nil
185
- if @base == BioInterchange::GFF3O then
186
- feature_properties = @base.feature_properties.select { |uri| @base.is_datatype_property?(uri) }[0]
187
- else
188
- feature_properties = @base.feature_properties.select { |uri| @base.is_object_property?(uri) }[0]
189
- end
190
216
  list.each { |value|
191
217
  begin
192
218
  linkout = nil
@@ -200,79 +226,143 @@ protected
200
226
  linkout = BioInterchange::GOXRef.send(BioInterchange.make_safe_label(abbreviation)).to_s + id
201
227
  end
202
228
  # Second, and finally: add a triple to the graph in the right representative format depending on the ontology used
203
- if @base == BioInterchange::GFF3O then
204
- graph.insert(RDF::Statement.new(feature_uri, @base.with_parent([ @base.dbxref ].flatten, feature_properties)[0], RDF::Literal.new(linkout, :datatype => RDF::XSD.anyURI )))
205
- else
206
- graph.insert(RDF::Statement.new(feature_uri, @base.with_parent([ @base.dbxref ].flatten, feature_properties)[0], RDF::URI.new(linkout))) if @base == BioInterchange::GVF1O
207
- end
229
+ create_triple(feature_uri, @base.dbxref, linkout)
208
230
  rescue NoMethodError
209
- raise BioInterchange::Exceptions::InputFormatError, 'Attribute Dbxref link-out is not resolvable, i.e. the name cannot be turned into an URL.'
231
+ # Preserve the Dbxref as a Literal:
232
+ @dbxref = 0 if @dbxref == nil
233
+ literal_uri = RDF::URI.new("#{feature_uri.to_s}/attribute/dbxref/#{@dbxref}")
234
+ @dbxref += 1
235
+ create_triple(feature_uri, @base.dbxref, literal_uri)
236
+ create_triple(literal_uri, RDF.type, RDF::RDFS.Literal)
237
+ create_triple(literal_uri, RDF.value, value)
210
238
  end
211
239
  }
212
- elsif tag == 'Derives_from' and @base == BioInterchange::GFF3O then
240
+ elsif tag == 'Derives_from' then
213
241
  list.each { |value|
214
- graph.insert(RDF::Statement.new(feature_uri, @base.derives_from, RDF::URI.new("#{set_uri.to_s}/feature/#{value}")))
242
+ create_triple(feature_uri, @base.derivesFrom, RDF::URI.new("#{set_uri.to_s}/feature/#{value}"))
215
243
  }
216
- elsif tag == 'Gap' and @base == BioInterchange::GFF3O then
217
- graph.insert(RDF::Statement.new(feature_uri, @base.gap, RDF::Literal.new(list.join(','))))
244
+ elsif tag == 'Gap' then
245
+ # Handled by 'Target', because 'Gap' requires 'Target' to be present.
218
246
  elsif tag == 'ID' then
219
- # Do nothing. The feature ID is the URI of the feature. It is not relevant as information anymore.
220
- elsif tag == 'Is_circular' and @base == BioInterchange::GFF3O then
247
+ list.each { |value|
248
+ create_triple(feature_uri, @base.id, value)
249
+ }
250
+ elsif tag == 'Is_circular' then
221
251
  value = list.join(',')
222
- graph.insert(RDF::Statement.new(feature_uri, @base.is_circular, true)) if value == 'true'
223
- graph.insert(RDF::Statement.new(feature_uri, @base.is_circular, false)) if value == 'false'
224
- # TODO Report invalid value.
225
- elsif tag == 'Name' and @base == BioInterchange::GFF3O then
226
- graph.insert(RDF::Statement.new(feature_uri, @base.name, RDF::Literal.new(list.join(','))))
227
- elsif tag == 'Note' and @base == BioInterchange::GFF3O then
252
+ if value == 'true' then
253
+ create_triple(feature_uri, @base.isCircular, true) if value == 'true'
254
+ elsif value == 'false' then
255
+ create_triple(feature_uri, @base.isCircular, false) if value == 'false'
256
+ else
257
+ create_triple(feature_uri, RDF::RDFS.comment, "Is_circular non-truth value: #{value}")
258
+ end
259
+ elsif tag == 'Name' then
260
+ list.each { |value|
261
+ create_triple(feature_uri, @base.name, value)
262
+ }
263
+ elsif tag == 'Note' then
228
264
  list.each { |value|
229
- graph.insert(RDF::Statement.new(feature_uri, @base.note, RDF::Literal.new(value)))
265
+ create_triple(feature_uri, RDF::RDFS.comment, value)
230
266
  }
231
267
  elsif tag == 'Ontology_term' then
232
268
  list.each { |value|
233
269
  # TODO Sanitize values that are either not in GO xrf_abbs or need conversion to match
234
270
  # match their associated Ruby method.
235
271
  namespace, accession = value.split(/:/, 2)
236
- graph.insert(RDF::Statement.new(feature_uri, @base.ontology_term, RDF::Literal.new("#{BioInterchange::GOXRef.send(namespace).to_s}#{accession}", :datatype => RDF::XSD.anyURI )))
272
+ create_triple(feature_uri, @base.ontology_term, "#{BioInterchange::GOXRef.send(namespace).to_s}#{accession}")
237
273
  }
238
274
  elsif tag == 'Parent' then
239
275
  list.each { |parent_id|
240
- graph.insert(RDF::Statement.new(feature_uri, @base.parent, RDF::URI.new("#{set_uri.to_s}/feature/#{parent_id}")))
276
+ create_triple(feature_uri, @base.parent, RDF::URI.new("#{set_uri.to_s}/feature/#{parent_id}"))
241
277
  }
242
278
  elsif tag == 'Reference_seq' then
243
279
  list.each { |value|
244
- graph.insert(RDF::Statement.new(feature_uri, @base.reference_seq, RDF::Literal.new(value)))
280
+ reference_uri = RDF::URI.new("#{feature_uri.to_s}/reference/#{value}")
281
+ create_triple(feature_uri, @base.sequence_annotation, reference_uri)
282
+ create_triple(reference_uri, RDF.type, @base.Reference)
283
+ create_triple(reference_uri, @base.sequence, value)
245
284
  }
246
285
  elsif tag == 'Target' then
247
286
  target_id, start_coordinate, end_coordinate, strand = list.join(',').split(/\s+/, 4)
248
- target_datatype_properties = @base.target_properties.select { |uri| @base.is_datatype_property?(uri) }[0]
249
- target_object_properties = @base.target_properties.select { |uri| @base.is_object_property?(uri) }[0]
250
287
  target_uri = RDF::URI.new("#{feature_uri.to_s}/target/#{target_id}")
251
- graph.insert(RDF::Statement.new(target_uri, RDF.type, @base.Target))
252
- graph.insert(RDF::Statement.new(target_uri, @base.target_id, RDF::Literal.new(target_id)))
253
- graph.insert(RDF::Statement.new(target_uri, @base.with_parent([ @base.start ].flatten, target_datatype_properties)[0], RDF::Literal.new(start_coordinate.to_i)))
254
- graph.insert(RDF::Statement.new(target_uri, @base.with_parent([ @base.end ].flatten, target_datatype_properties)[0], RDF::Literal.new(end_coordinate.to_i)))
255
- graph.insert(RDF::Statement.new(target_uri, @base.with_parent([ @base.end ].flatten, target_object_properties)[0], @base.Positive)) if strand and strand == '+'
256
- graph.insert(RDF::Statement.new(target_uri, @base.with_parent([ @base.end ].flatten, target_object_properties)[0], @base.Negative)) if strand and strand == '-'
257
- graph.insert(RDF::Statement.new(feature_uri, @base.target, target_uri))
258
- elsif tag == 'Variant_effect' and @base == BioInterchange::GVF1O then
259
- serialize_variant_effects(graph, set_uri, feature_uri, list)
260
- elsif tag == 'Variant_seq' and @base == BioInterchange::GVF1O then
261
- serialize_variant_seqs(graph, set_uri, feature_uri, list)
288
+ create_triple(feature_uri, @base.target, target_uri)
289
+ create_triple(target_uri, RDF.type, @base.Target)
290
+ create_triple(target_uri, @base.id, target_id)
291
+ region_uri = RDF::URI.new("#{target_uri.to_s}/region")
292
+ start_position_uri = RDF::URI.new("#{region_uri.to_s}/start")
293
+ end_position_uri = RDF::URI.new("#{region_uri.to_s}/end")
294
+ create_triple(target_uri, @base.locus, region_uri)
295
+ create_triple(region_uri, @base.locus, start_position_uri)
296
+ create_triple(region_uri, @base.locus, end_position_uri)
297
+ if strand == '+' then
298
+ create_triple(start_position_uri, RDF.type, BioInterchange::FALDO.Positive_strand)
299
+ create_triple(end_position_uri, RDF.type, BioInterchange::FALDO.Positive_strand)
300
+ create_triple(start_position_uri, BioInterchange::FALDO.position, start_coordinate)
301
+ create_triple(end_position_uri, BioInterchange::FALDO.position, end_coordinate)
302
+ elsif strand == '-' then
303
+ create_triple(start_position_uri, RDF.type, BioInterchange::FALDO.Negative_strand)
304
+ create_triple(end_position_uri, RDF.type, BioInterchange::FALDO.Negative_strand)
305
+ # Reverse start/end coordinates on the negative strand; FALDO requirement:
306
+ create_triple(start_position_uri, BioInterchange::FALDO.position, end_coordinate)
307
+ create_triple(end_position_uri, BioInterchange::FALDO.position, start_coordinate)
308
+ else
309
+ create_triple(start_position_uri, RDF.type, BioInterchange::FALDO.Position)
310
+ create_triple(end_position_uri, RDF.type, BioInterchange::FALDO.Position)
311
+ create_triple(start_position_uri, BioInterchange::FALDO.position, start_coordinate)
312
+ create_triple(end_position_uri, BioInterchange::FALDO.position, end_coordinate)
313
+ end
314
+
315
+ # Describe a possible alignment between the feature and target:
316
+ if attributes.has_key?('Gap') then
317
+ attributes['Gap'].each_index { |gap_no|
318
+ cigar_line = attributes['Gap'][gap_no].split(/\s+/)
319
+ cigar_line.each_index { |alignment_no|
320
+ alignment_uri = RDF::URI.new("#{feature_uri.to_s}/alignment/#{gap_no}/#{alignment_no}")
321
+ create_triple(feature_uri, @base.alignment, alignment_uri) if alignment_no == 0
322
+ operation = cigar_line[alignment_no].gsub(/[^MIDFR]/, '')
323
+ operation = nil unless operation.length == 1
324
+ span = cigar_line[alignment_no].gsub(/[^0-9]/, '')
325
+ span = nil unless span.length > 0
326
+ if operation == 'M' then
327
+ create_triple(alignment_uri, RDF.type, @base.Match)
328
+ elsif operation == 'I' then
329
+ create_triple(alignment_uri, RDF.type, @base.Reference_Sequence_Gap)
330
+ elsif operation == 'D' then
331
+ create_triple(alignment_uri, RDF.type, @base.Target_Sequence_Gap)
332
+ elsif operation == 'F' then
333
+ create_triple(alignment_uri, RDF.type, @base.Forward_Reference_Sequence_Frameshift)
334
+ elsif operation == 'R' then
335
+ create_triple(alignment_uri, RDF.type, @base.Reverse_Reference_Sequence_Frameshift)
336
+ else
337
+ # Fallback: operation is outside of the specification
338
+ create_triple(alignment_uri, RDF.type, @base.Alignment_Operation)
339
+ create_triple(alignment_uri, RDF::RDFS.comment, "Alignment operation: #{operation}") if operation and not operation.empty?
340
+ end
341
+ create_triple(alignment_uri, @base.span, span.to_i) if span
342
+ create_triple(alignment_uri, RDF.first, alignment_uri)
343
+ if alignment_no + 1 < cigar_line.length then
344
+ create_triple(alignment_uri, RDF.rest, RDF::URI.new("#{feature_uri.to_s}/alignment/#{gap_no}/#{alignment_no + 1}"))
345
+ else
346
+ create_triple(alignment_uri, RDF.rest, RDF.nil)
347
+ end
348
+ }
349
+ }
350
+ end
351
+ elsif tag == 'Variant_effect' then
352
+ serialize_variant_effects(set_uri, feature_uri, list)
353
+ elsif tag == 'Variant_seq' then
354
+ serialize_variant_seqs(set_uri, feature_uri, list)
262
355
  else
263
356
  # TODO Report unknown upper case letters here? That would be a spec. validation...
264
357
  # Well, or it would show that this implementation is incomplete. Could be either.
265
- attribute_properties = @base.attribute_properties
266
- attribute_properties = attribute_properties.select { |uri| @base.is_datatype_property?(uri) }[0] if attribute_properties.kind_of?(Array)
267
- feature_properties = @base.feature_properties.select { |uri| @base.is_object_property?(uri) }[0]
268
358
  list.each_index { |index|
269
359
  value = list[index]
270
360
  attribute_uri = RDF::URI.new("#{feature_uri.to_s}/attribute/#{tag}") if list.size == 1
271
361
  attribute_uri = RDF::URI.new("#{feature_uri.to_s}/attribute/#{tag}-#{index + 1}") unless list.size == 1
272
- graph.insert(RDF::Statement.new(feature_uri, @base.with_parent([ @base.attributes ].flatten, feature_properties)[0], attribute_uri))
273
- graph.insert(RDF::Statement.new(attribute_uri, RDF.type, @base.Attribute))
274
- graph.insert(RDF::Statement.new(attribute_uri, @base.with_parent([ @base.tag ].flatten, attribute_properties)[0], RDF::Literal.new("#{tag}")))
275
- graph.insert(RDF::Statement.new(attribute_uri, RDF.value, RDF::Literal.new(value)))
362
+ create_triple(feature_uri, @base.attribute, attribute_uri)
363
+ create_triple(attribute_uri, RDF.type, @base.Attribute)
364
+ create_triple(attribute_uri, @base.tag, "#{tag}")
365
+ create_triple(attribute_uri, RDF.value, value)
276
366
  }
277
367
  end
278
368
  }
@@ -281,10 +371,9 @@ protected
281
371
  # Serializes a structured attribute (given as a pragma statement), which later
282
372
  # can be referred to from feature instances.
283
373
  #
284
- # +graph+:: RDF graph to which the structured attribute is added
285
374
  # +set_uri+:: the feature set URI to which the structured attribute belongs
286
375
  # +pragma+:: a map that encapsulates the structured attribute data
287
- def serialize_structured_attribute(graph, set_uri, pragma)
376
+ def serialize_structured_attribute(set_uri, pragma)
288
377
  attribute_uri = RDF::URI.new("#{set_uri.to_s}/structured_attribute/#{pragma.object_id}")
289
378
  attributes = nil
290
379
  class_type = nil
@@ -306,114 +395,132 @@ protected
306
395
  else
307
396
  # TODO Error.
308
397
  end
309
- graph.insert(RDF::Statement.new(attribute_uri, RDF.type, class_type))
310
398
  if class_type == @base.DataSource and attributes.has_key?('Data_type') then
311
- data_type_individual = nil
312
399
  attributes['Data_type'] = attributes['Data_type'][0] # TODO Make sure array is of length 1.
313
400
  if attributes['Data_type'] == 'Array_CGH' then
314
- data_type_individual = @base.ArrayComparativeGenomicHybridization
401
+ class_type = @base.ArrayComparativeGenomicHybridization
315
402
  elsif attributes['Data_type'] == 'DNA_microarray' then
316
- data_type_individual = @base.DNAMicroarray
403
+ class_type = @base.DNAMicroarray
317
404
  elsif attributes['Data_type'] == 'DNA_sequence' then
318
- data_type_individual = @base.DNASequence
405
+ class_type = @base.DNASequence
319
406
  elsif attributes['Data_type'] == 'RNA_sequence' then
320
- data_type_individual = @base.RNASequence
407
+ class_type = @base.RNASequence
321
408
  else
322
409
  # TODO Error.
323
410
  end
324
- graph.insert(RDF::Statement.new(attribute_uri, @base.data_type, data_type_individual))
325
411
  elsif class_type == @base.TechnologyPlatform then
326
412
  if attributes.has_key?('Average_coverage') then
327
- graph.insert(RDF::Statement.new(attribute_uri, @base.average_coverage, RDF::Literal.new(attributes['Average_coverage'][0].to_i)))
413
+ create_triple(attribute_uri, @base.averageCoverage, attributes['Average_coverage'][0].to_i)
328
414
  end
329
415
  if attributes.has_key?('Platform_class') then
330
- graph.insert(RDF::Statement.new(attribute_uri, @base.platform_class, RDF::Literal.new(attributes['Platform_class'][0])))
416
+ create_triple(attribute_uri, @base.platformClass, attributes['Platform_class'][0])
331
417
  end
332
418
  if attributes.has_key?('Platform_name') then
333
- graph.insert(RDF::Statement.new(attribute_uri, @base.platform_name, RDF::Literal.new(attributes['Platform_name'][0])))
419
+ create_triple(attribute_uri, @base.platformName, attributes['Platform_name'][0])
334
420
  end
335
421
  if attributes.has_key?('Read_length') then
336
- graph.insert(RDF::Statement.new(attribute_uri, @base.read_length, RDF::Literal.new(attributes['Read_length'][0].to_i)))
422
+ create_triple(attribute_uri, @base.readLength, attributes['Read_length'][0].to_i)
337
423
  end
338
424
  if attributes.has_key?('Read_pair_span') then
339
- graph.insert(RDF::Statement.new(attribute_uri, @base.read_pair_span, RDF::Literal.new(attributes['Read_pair_span'][0].to_i)))
425
+ create_triple(attribute_uri, @base.readPairSpan, attributes['Read_pair_span'][0].to_i)
340
426
  end
341
427
  if attributes.has_key?('Read_type') then
342
- read_type_individual = nil
343
428
  attributes['Read_type'] = attributes['Read_type'][0] # TODO Make sure array is of length 1.
344
429
  if attributes['Read_type'] == 'fragment' then
345
- read_type_individual = @base.Fragment
430
+ class_type = @base.FragmentReadPlatform
346
431
  elsif attributes['Read_type'] == 'pair' then
347
- read_type_individual = @base.Pair
432
+ class_type = @base.PairedEndReadPlatform
348
433
  else
349
434
  # TODO Error.
350
435
  end
351
- graph.insert(RDF::Statement.new(attribute_uri, @base.read_type, read_type_individual))
352
436
  end
353
437
  end
354
- structuredpragma_properties = @base.structuredpragma_properties.select { |uri| @base.is_object_property?(uri) }[0]
438
+ create_triple(attribute_uri, RDF.type, class_type)
355
439
  attributes.keys.each { |tag|
356
440
  if tag.match(/^[a-z]/) then
357
441
  custom_attribute_uri = RDF::URI.new("#{attribute_uri.to_s}/attribute/#{tag}")
358
- graph.insert(RDF::Statement.new(custom_attribute_uri, RDF.type, @base.StructuredAttribute))
359
- graph.insert(RDF::Statement.new(custom_attribute_uri, @base.with_parent([ @base.tag ].flatten, @base.structuredattribute_properties)[0], tag))
360
- graph.insert(RDF::Statement.new(custom_attribute_uri, RDF.value, RDF::Literal.new(attributes[tag].join(','))))
361
- graph.insert(RDF::Statement.new(attribute_uri, @base.with_parent([ @base.attributes ].flatten, structuredpragma_properties)[0], custom_attribute_uri))
442
+ create_triple(custom_attribute_uri, RDF.type, @base.StructuredAttribute)
443
+ create_triple(custom_attribute_uri, @base.tag, tag)
444
+ attributes[tag].each { |value|
445
+ create_triple(custom_attribute_uri, RDF.value, value)
446
+ }
447
+ create_triple(attribute_uri, @base.attribute, custom_attribute_uri)
448
+ else
449
+ # TODO
362
450
  end
363
451
  }
364
452
  end
365
453
 
366
454
  # Serializes a list of variant effects: one Effect resource is created per list entry and attached to the variant it refers to.
367
455
  #
368
- # +graph+:: RDF graph to which the structured attribute is added
369
456
  # +set_uri+:: the feature set URI to which the feature belongs (not used by this method)
370
457
  # +feature_uri+:: the URI of the feature that is annotated with variant data
371
458
  # +list+:: list of effect strings, each space-separated as: sequence variant, variant index, feature type, then the affected feature ID(s)
372
- def serialize_variant_effects(graph, set_uri, feature_uri, list)
459
+ def serialize_variant_effects(set_uri, feature_uri, list)
373
460
  list.each_index { |index|
374
461
  effect = list[index]
375
462
  sequence_variant, variant_index, feature_type, feature_ids = effect.split(' ', 4)
376
463
  feature_ids = feature_ids.split(' ') # NOTE(review): feature_ids is nil (and this call raises) when an effect has fewer than four space-separated fields
377
464
  effect_uri = RDF::URI.new("#{feature_uri.to_s}/variant/#{variant_index}/effect/#{index}")
378
- serialize_variant_triple(graph, feature_uri, RDF::URI.new("#{feature_uri.to_s}/variant/#{variant_index}"), @base.effect, effect_uri)
379
- graph.insert(RDF::Statement.new(effect_uri, RDF.type, @base.Effect))
380
- graph.insert(RDF::Statement.new(effect_uri, @base.sequence_variant, BioInterchange::SO.send(BioInterchange.make_safe_label(sequence_variant))))
381
- graph.insert(RDF::Statement.new(effect_uri, @base.feature_type, BioInterchange::SO.send(BioInterchange.make_safe_label(feature_type))))
465
+ serialize_variant_triple(feature_uri, RDF::URI.new("#{feature_uri.to_s}/variant/#{variant_index}"), @base.effect, effect_uri)
466
+ create_triple(effect_uri, RDF.type, @base.Effect)
467
+ create_triple(effect_uri, @base.sequenceVariant, BioInterchange::SO.send(BioInterchange.make_safe_label(sequence_variant)))
468
+ create_triple(effect_uri, @base.featureType, BioInterchange::SO.send(BioInterchange.make_safe_label(feature_type)))
382
469
  feature_ids.each { |feature_id|
383
- graph.insert(RDF::Statement.new(effect_uri, @base.feature, RDF::Literal.new(feature_id)))
470
+ create_triple(effect_uri, @base.feature, feature_id)
384
471
  }
385
472
  }
386
473
  end
387
474
 
388
475
  # Serializes a list of variant sequences.
389
476
  #
390
- # +graph+:: RDF graph to which the structured attribute is added
391
477
  # +set_uri+:: the feature set URI to which the feature belongs to
392
478
  # +feature_uri+:: the feature URI to the feature that is annotated with variant data
393
479
  # +list+:: list of variant values
394
- def serialize_variant_seqs(graph, set_uri, feature_uri, list)
480
+ def serialize_variant_seqs(set_uri, feature_uri, list)
395
481
  list.each_index { |index|
396
482
  value = list[index]
397
483
  variant_uri = RDF::URI.new("#{feature_uri.to_s}/variant/#{index}")
398
- serialize_variant_triple(graph, feature_uri, variant_uri, @base.variant_seq, RDF::Literal.new(value))
484
+ serialize_variant_triple(feature_uri, variant_uri, @base.sequence, RDF::Literal.new(value))
399
485
  }
486
+
487
+ # Return the variant type based on the present sequence(s):
488
+ return @base.Variant if list.length != 2
489
+ if list[0].match(/a-zA-Z/) and list[1].match(/a-zA-Z/) then
490
+ return @base.HomozygousVariant if list[0] == list[1]
491
+ return @base.HeterozygousVariant
492
+ end
493
+ return @base.Variant
400
494
  end
401
495
 
402
496
  # Emits a triple about a variant; tracks the variant's URI in @variants so that its link from the feature and its RDF.type are only written out once.
403
497
  #
404
- # +graph+:: RDF graph to which the variant is added
405
498
  # +feature_uri+:: the URI of the feature that is annotated with variant data
406
499
  # +variant_uri+:: URI that identifies the variant in question (the subject of the emitted triple)
407
500
  # +predicate+:: predicate that describes the data being serialized
408
501
  # +object+:: data to be serialized
409
- def serialize_variant_triple(graph, feature_uri, variant_uri, predicate, object)
502
+ def serialize_variant_triple(feature_uri, variant_uri, predicate, object)
410
503
  unless @variants.has_key?(variant_uri.to_s) then
411
- graph.insert(RDF::Statement.new(feature_uri, @base.variant, variant_uri))
412
- graph.insert(RDF::Statement.new(variant_uri, RDF.type, @base.Variant))
504
+ create_triple(feature_uri, @base.sequence_annotation, variant_uri)
505
+ create_triple(variant_uri, RDF.type, @base.Variant)
413
506
  end
414
507
  @variants[variant_uri.to_s] = true
415
- graph.insert(RDF::Statement.new(variant_uri, predicate, object))
508
+ create_triple(variant_uri, predicate, object)
509
+ end
510
+
511
+ # Serializes a +GFF3FeatureSequence+ object: the sequence is attached to its feature through a separate annotation resource at "<feature URI>/sequence".
512
+ #
513
+ # +set_uri+:: the feature set URI to which the feature belongs; used as the prefix of the feature URI
514
+ # +feature_sequence+:: a +GFF3FeatureSequence+ instance (its feature ID, sequence and optional comment are read)
515
+ def serialize_feature_sequence(set_uri, feature_sequence)
516
+ feature_uri = RDF::URI.new("#{set_uri.to_s}/feature/#{feature_sequence.feature_id}")
517
+ annotation_uri = RDF::URI.new("#{feature_uri.to_s}/sequence")
518
+ create_triple(feature_uri, @base.sequence_annotation, annotation_uri)
519
+ create_triple(annotation_uri, RDF.type, @base.Sequence_Annotation)
520
+ create_triple(annotation_uri, RDF::RDFS.comment, feature_sequence.comment) if feature_sequence.comment
521
+ create_triple(annotation_uri, @base.sequence, feature_sequence.sequence)
416
522
  end
523
+
417
524
  end
418
525
 
419
526
  end