biointerchange 1.0.0 → 1.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.travis.yml +2 -2
- data/README.md +7 -6
- data/VERSION +1 -1
- data/examples/ReaderModelWriterSequenceDiagram.graffle +3073 -0
- data/examples/ReaderModelWriterSequenceDiagram.png +0 -0
- data/lib/biointerchange/core.rb +63 -58
- data/lib/biointerchange/faldo.rb +125 -35
- data/lib/biointerchange/genomics/gff3_feature_sequence.rb +36 -0
- data/lib/biointerchange/genomics/gff3_rdf_ntriples.rb +254 -147
- data/lib/biointerchange/genomics/gff3_reader.rb +22 -3
- data/lib/biointerchange/genomics/locations.rb +30 -0
- data/lib/biointerchange/gfvo.rb +1072 -0
- data/lib/biointerchange/phylogenetics/cdao_rdf_ntriples.rb +6 -5
- data/lib/biointerchange/textmining/text_mining_rdf_ntriples.rb +7 -4
- data/lib/biointerchange/writer.rb +138 -2
- data/spec/gff3_rdfwriter_spec.rb +5 -6
- data/spec/gvf_rdfwriter_spec.rb +5 -6
- data/spec/phylogenetics_spec.rb +1 -1
- data/web/images/BioInterchange120.png +0 -0
- data/web/images/BioInterchange120inv.png +0 -0
- data/web/images/BioInterchange160.png +0 -0
- data/web/images/BioInterchange160inv.png +0 -0
- data/web/images/BioInterchange300inv.png +0 -0
- data/web/index.html +5 -1
- data/web/ontologies.html +1538 -21
- metadata +31 -46
@@ -0,0 +1,36 @@
|
|
1
|
+
module BioInterchange::Genomics

# Represents a sequence of a genomic feature of a GFF3 file.
class GFF3FeatureSequence

  # ID of the feature whose sequence is represented by the object.
  attr_reader :feature_id

  # Sequence of the feature.
  attr_reader :sequence

  # Additional comments -- if provided -- that are associated with the feature ID.
  # If no comment was provided, then `nil` is returned.
  attr_reader :comment

  # Creates a new feature sequence representation. A feature sequence is described by two or more
  # lines in a GFF3 file that are succeeding a '##FASTA' pragma statement.
  #
  # +feature_id+:: ID of the feature whose sequence is stored
  # +sequence+:: sequence of the feature
  # +comment+:: optional comment that is associated with the feature ID (defaults to `nil`)
  def initialize(feature_id, sequence, comment = nil)
    @feature_id = feature_id
    @sequence = sequence
    @comment = comment
  end

end

end
|
36
|
+
|
@@ -11,24 +11,23 @@ module BioInterchange::Genomics
|
|
11
11
|
# - biointerchange.gvf
|
12
12
|
#
|
13
13
|
# Outputs:
|
14
|
-
# - rdf.biointerchange.
|
15
|
-
# - rdf.biointerchange.gvf
|
14
|
+
# - rdf.biointerchange.gfvo
|
16
15
|
class RDFWriter < BioInterchange::Writer
|
17
16
|
|
18
17
|
# Register writers:
|
19
18
|
BioInterchange::Registry.register_writer(
|
20
|
-
'rdf.biointerchange.
|
19
|
+
'rdf.biointerchange.gfvo',
|
21
20
|
BioInterchange::Genomics::RDFWriter,
|
22
21
|
[ 'biointerchange.gff3' ],
|
23
22
|
true,
|
24
|
-
'
|
23
|
+
'Genomic Feature and Variation Ontology (GFVO) based RDFization'
|
25
24
|
)
|
26
25
|
BioInterchange::Registry.register_writer(
|
27
|
-
'rdf.biointerchange.
|
26
|
+
'rdf.biointerchange.gfvo',
|
28
27
|
BioInterchange::Genomics::RDFWriter,
|
29
28
|
[ 'biointerchange.gvf' ],
|
30
29
|
true,
|
31
|
-
'
|
30
|
+
'Genomic Feature and Variation Ontology (GFVO) based RDFization'
|
32
31
|
)
|
33
32
|
|
34
33
|
# Creates a new instance of a RDFWriter that will use the provided output stream to serialize RDF.
|
@@ -42,18 +41,19 @@ class RDFWriter < BioInterchange::Writer
|
|
42
41
|
# Serialize a model as RDF.
|
43
42
|
#
|
44
43
|
# +model+:: a generic representation of input data that is derived from BioInterchange::Genomics::GFF3FeatureSet
|
45
|
-
|
44
|
+
# +uri_prefix+:: optional URI prefix that replaces the default URI prefix for all set/feature/annotation URIs
|
45
|
+
def serialize(model, uri_prefix = nil)
  # Remember which input flavour is being written; any other model class is
  # rejected with an ImplementationWriterError.
  if model.instance_of?(BioInterchange::Genomics::GFF3FeatureSet)
    @format = :gff3
  elsif model.instance_of?(BioInterchange::Genomics::GVFFeatureSet)
    @format = :gvf
  else
    # All three fragments are concatenated explicitly. Without the trailing '+'
    # on the second fragment, the last literal is a separate, unused expression
    # and never becomes part of the error message.
    raise BioInterchange::Exceptions::ImplementationWriterError, 'The provided model cannot be serialized. ' +
          'This writer supports serialization for BioInterchange::Genomics::GFF3FeatureSet and ' +
          'BioInterchange::Genomics::GVFFeatureSet.'
  end
  # Both supported formats are serialized using the GFVO vocabulary.
  @base = BioInterchange::GFVO
  serialize_model(model, uri_prefix)
end
|
58
58
|
|
59
59
|
protected
|
@@ -61,7 +61,8 @@ protected
|
|
61
61
|
# Serializes RDF for a feature set representation.
|
62
62
|
#
|
63
63
|
# +model+:: an instance of +BioInterchange::Genomics::GFF3FeatureSet+
|
64
|
-
|
64
|
+
# +set_uri+:: optional URI prefix that should be used for a set instance (and hence, all its dependents -- features, annotations, etc.)
|
65
|
+
def serialize_model(model, set_uri)
|
65
66
|
# We record landmarks, because they can either be written when their "##sequence-region"
|
66
67
|
# pragma statement appears, or otherwise, when the first feature with said landmark is
|
67
68
|
# being serialized.
|
@@ -70,17 +71,25 @@ protected
|
|
70
71
|
# Record written variants in order to avoid writing out RDF.type multiple times.
|
71
72
|
@variants = {}
|
72
73
|
|
73
|
-
|
74
|
-
|
75
|
-
set_uri =
|
76
|
-
|
74
|
+
# Create a URI prefix that applies to the set, all features in the set, and all the features' annotations.
|
75
|
+
# Then register the prefix with the writer to have a concise Turtle output.
|
76
|
+
set_uri = set_uri[0..-2] if set_uri and set_uri.end_with?('/')
|
77
|
+
set_uri = RDF::URI.new(model.uri) unless set_uri
|
78
|
+
set_base(set_uri + '/')
|
79
|
+
|
80
|
+
create_triple(set_uri, RDF.type, @base.Set)
|
77
81
|
model.pragmas.each { |pragma_name|
|
78
|
-
serialize_pragma(
|
82
|
+
serialize_pragma(set_uri, model.pragma(pragma_name))
|
79
83
|
}
|
80
84
|
model.contents.each { |feature|
|
81
|
-
|
85
|
+
if feature.instance_of?(BioInterchange::Genomics::GFF3FeatureSequence) then
|
86
|
+
serialize_feature_sequence(set_uri, feature)
|
87
|
+
else
|
88
|
+
serialize_feature(set_uri, feature)
|
89
|
+
end
|
82
90
|
}
|
83
|
-
|
91
|
+
close
|
92
|
+
#RDF::NTriples::Writer.dump(graph, @ostream)
|
84
93
|
# TODO Figure out why the following is very slow. Use with 'rdf-raptor'.
|
85
94
|
# Having said that, Jena's rdfcat is very good for converting formats
|
86
95
|
# anyway, so perhaps it is not worth investigating the following.
|
@@ -89,25 +98,22 @@ protected
|
|
89
98
|
|
90
99
|
# Serializes pragmas for a given feature set URI.
|
91
100
|
#
|
92
|
-
# +graph+:: RDF graph to which the pragmas are added
|
93
101
|
# +set_uri+:: the feature set URI to which the pragmas belong to
|
94
102
|
# +pragma+:: an object representing a pragma statement
|
95
|
-
def serialize_pragma(
|
103
|
+
def serialize_pragma(set_uri, pragma)
|
96
104
|
if pragma.kind_of?(Hash) then
|
97
|
-
if (pragma.has_key?('attribute-method') or pragma.has_key?('data-source') or pragma.has_key?('score-method') or pragma.has_key?('source-method') or pragma.has_key?('technology-platform'))
|
98
|
-
serialize_structured_attribute(
|
99
|
-
elsif pragma.has_key?('gff-version')
|
100
|
-
|
101
|
-
elsif pragma.has_key?('
|
102
|
-
|
103
|
-
elsif pragma.has_key?('gvf-version') and @base == BioInterchange::GVF1O then
|
104
|
-
graph.insert(RDF::Statement.new(set_uri, @base.gvf_version, RDF::Literal.new(pragma['gvf-version'], :datatype => RDF::XSD.float )))
|
105
|
+
if (pragma.has_key?('attribute-method') or pragma.has_key?('data-source') or pragma.has_key?('score-method') or pragma.has_key?('source-method') or pragma.has_key?('technology-platform')) then
|
106
|
+
serialize_structured_attribute(set_uri, pragma)
|
107
|
+
elsif pragma.has_key?('gff-version') then
|
108
|
+
create_triple(set_uri, @base.gff_version, pragma['gff-version'], RDF::XSD.float)
|
109
|
+
elsif pragma.has_key?('gvf-version') then
|
110
|
+
create_triple(set_uri, @base.gvf_version, pragma['gvf-version'], RDF::XSD.float)
|
105
111
|
elsif pragma.has_key?('sequence-region') then
|
106
112
|
pragma['sequence-region'].keys.each { |seqid|
|
107
|
-
serialize_landmark(
|
113
|
+
serialize_landmark(set_uri, pragma['sequence-region'][seqid])
|
108
114
|
}
|
109
115
|
elsif pragma.has_key?('species') then
|
110
|
-
|
116
|
+
create_triple(set_uri, @base.species, RDF::URI.new(pragma['species']))
|
111
117
|
end
|
112
118
|
else
|
113
119
|
# TODO
|
@@ -116,77 +122,97 @@ protected
|
|
116
122
|
|
117
123
|
# Serializes a +GFF3Feature+ object for a given feature set URI.
|
118
124
|
#
|
119
|
-
# +graph+:: RDF graph to which the feature is added
|
120
125
|
# +set_uri+:: the feature set URI to which the feature belongs to
|
121
126
|
# +feature+:: a +GFF3Feature+ instance
|
122
|
-
def serialize_feature(
|
127
|
+
def serialize_feature(set_uri, feature)
  # TODO Make sure there is only one value in the 'ID' list.
  # A feature that carries an 'ID' attribute is addressed by that ID; any other
  # feature gets a URI that is composed of its column values.
  feature_uri = if feature.attributes.has_key?('ID')
    RDF::URI.new("#{set_uri.to_s}/feature/#{feature.attributes['ID'][0]}")
  else
    RDF::URI.new("#{set_uri.to_s}/feature/#{feature.sequence_id},#{feature.source},#{feature.type.to_s.sub(/^[^:]+:\/\//, '')},#{feature.start_coordinate},#{feature.end_coordinate},#{feature.strand},#{feature.phase}")
  end

  # Link the feature to its set and type it.
  create_triple(set_uri, @base.contains, feature_uri)
  create_triple(feature_uri, RDF.type, @base.Feature)

  # The landmark may not have been announced by a pragma; write it on first use.
  unless @landmarks.has_key?(feature.sequence_id)
    serialize_landmark(set_uri, GFF3Landmark.new(feature.sequence_id))
  end
  create_triple(feature_uri, @base.seqid, @landmarks[feature.sequence_id])
  create_triple(feature_uri, @base.source, feature.source)
  create_triple(feature_uri, @base.type, feature.type)
  if feature.phase
    create_triple(feature_uri, @base.phase, feature.phase)
  end

  serialize_coordinate(set_uri, feature_uri, feature)
  unless feature.attributes.keys.empty?
    serialize_attributes(set_uri, feature_uri, feature.attributes)
  end
end
|
142
|
+
|
143
|
+
# Serializes the location of a feature as a FALDO region with a start and an end
# position, and writes the feature's score if one is set.
#
# +set_uri+:: the feature set URI to which the feature belongs to (not used by this method)
# +feature_uri+:: URI of the feature whose location is serialized
# +feature+:: the feature instance (its strand is compared against the +GFF3Feature+ strand constants)
#
# Raises +BioInterchange::Exceptions::InputFormatError+ if the strand of the feature
# is none of the known strand constants.
def serialize_coordinate(set_uri, feature_uri, feature)
  region_uri = RDF::URI.new("#{feature_uri.to_s}/region")
  start_position_uri = RDF::URI.new("#{feature_uri.to_s}/region/start")
  end_position_uri = RDF::URI.new("#{feature_uri.to_s}/region/end")

  create_triple(feature_uri, @base.locus, region_uri)
  create_triple(region_uri, RDF.type, BioInterchange::FALDO.Region)
  # NOTE: the experimental "bin://" indexing that used to sit here was guarded by
  # `if false` and therefore never executed; it has been removed.
  create_triple(region_uri, BioInterchange::FALDO.begin, start_position_uri)
  create_triple(region_uri, BioInterchange::FALDO.end, end_position_uri)

  # Both positions of a region share the same type, which is determined by the strand.
  position_type = case feature.strand
                  when BioInterchange::Genomics::GFF3Feature::NOT_STRANDED,
                       BioInterchange::Genomics::GFF3Feature::UNKNOWN
                    BioInterchange::FALDO.Position
                  when BioInterchange::Genomics::GFF3Feature::POSITIVE
                    BioInterchange::FALDO.Positive_strand
                  when BioInterchange::Genomics::GFF3Feature::NEGATIVE
                    BioInterchange::FALDO.Negative_strand
                  else
                    raise BioInterchange::Exceptions::InputFormatError, 'Strand of feature is set to an unknown constant.'
                  end
  create_triple(start_position_uri, RDF.type, position_type)
  create_triple(end_position_uri, RDF.type, position_type)

  create_triple(start_position_uri, BioInterchange::FALDO.position, feature.start_coordinate)
  create_triple(end_position_uri, BioInterchange::FALDO.position, feature.end_coordinate)
  create_triple(feature_uri, @base.score, feature.score) if feature.score
end
|
154
178
|
|
155
179
|
# Serializes a genomic feature landmark ("seqid").
|
156
180
|
#
|
157
|
-
# +graph+:: RDF graph to which the landmark is added
|
158
181
|
# +set_uri+:: the feature set URI to which the landmark belongs to
|
159
182
|
# +landmark+:: encapsulated landmark data
|
160
|
-
def serialize_landmark(
|
183
|
+
def serialize_landmark(set_uri, landmark)
  # Each landmark is written at most once; the URI is recorded before any triple
  # is created so that later features can look it up by seqid.
  return if @landmarks.has_key?(landmark.seqid)
  landmark_uri = RDF::URI.new("#{set_uri.to_s}/landmark/#{landmark.seqid}")
  region_uri = RDF::URI.new("#{landmark_uri.to_s}/region")
  @landmarks[landmark.seqid] = landmark_uri
  create_triple(landmark_uri, RDF.type, @base.Landmark)
  create_triple(landmark_uri, @base.id, landmark.seqid)
  create_triple(landmark_uri, @base.locus, region_uri)
  # Start and end are optional and independent of each other.
  if landmark.start_coordinate
    start_position_uri = RDF::URI.new("#{landmark_uri.to_s}/region/start")
    create_triple(region_uri, BioInterchange::FALDO.begin, start_position_uri)
    create_triple(start_position_uri, BioInterchange::FALDO.position, landmark.start_coordinate)
  end
  # Guard on the end coordinate itself: checking the start coordinate here would
  # write a nil end position, or drop a valid end position when the start is missing.
  if landmark.end_coordinate
    end_position_uri = RDF::URI.new("#{landmark_uri.to_s}/region/end")
    create_triple(region_uri, BioInterchange::FALDO.end, end_position_uri)
    create_triple(end_position_uri, BioInterchange::FALDO.position, landmark.end_coordinate)
  end
end
|
169
202
|
|
170
203
|
# Serializes the attributes of a feature.
|
171
204
|
#
|
172
|
-
# +graph+:: RDF graph to which the feature is added
|
173
205
|
# +set_uri+:: URI of the set these attributes belong to (implicit due to feature)
|
174
206
|
# +feature_uri+:: the feature URI to which the attributes belong to
|
175
207
|
# +attributes+:: a map of tag/value pairs
|
176
|
-
def serialize_attributes(
|
208
|
+
def serialize_attributes(set_uri, feature_uri, attributes)
|
177
209
|
attributes.each_pair { |tag, list|
|
178
210
|
# Check for defined tags (in alphabetical order), if not matched, serialize as generic Attribute:
|
179
211
|
if tag == 'Alias' then
|
180
212
|
list.each { |value|
|
181
|
-
|
213
|
+
create_triple(feature_uri, @base.alias, value)
|
182
214
|
}
|
183
215
|
elsif tag == 'Dbxref' then
|
184
|
-
feature_properties = nil
|
185
|
-
if @base == BioInterchange::GFF3O then
|
186
|
-
feature_properties = @base.feature_properties.select { |uri| @base.is_datatype_property?(uri) }[0]
|
187
|
-
else
|
188
|
-
feature_properties = @base.feature_properties.select { |uri| @base.is_object_property?(uri) }[0]
|
189
|
-
end
|
190
216
|
list.each { |value|
|
191
217
|
begin
|
192
218
|
linkout = nil
|
@@ -200,79 +226,143 @@ protected
|
|
200
226
|
linkout = BioInterchange::GOXRef.send(BioInterchange.make_safe_label(abbreviation)).to_s + id
|
201
227
|
end
|
202
228
|
# Second, and finally: add a triple to the graph in the right representative format depending on the ontology used
|
203
|
-
|
204
|
-
graph.insert(RDF::Statement.new(feature_uri, @base.with_parent([ @base.dbxref ].flatten, feature_properties)[0], RDF::Literal.new(linkout, :datatype => RDF::XSD.anyURI )))
|
205
|
-
else
|
206
|
-
graph.insert(RDF::Statement.new(feature_uri, @base.with_parent([ @base.dbxref ].flatten, feature_properties)[0], RDF::URI.new(linkout))) if @base == BioInterchange::GVF1O
|
207
|
-
end
|
229
|
+
create_triple(feature_uri, @base.dbxref, linkout)
|
208
230
|
rescue NoMethodError
|
209
|
-
|
231
|
+
# Preserve the Dbxref as a Literal:
|
232
|
+
@dbxref = 0 if @dbxref == nil
|
233
|
+
literal_uri = RDF::URI.new("#{feature_uri.to_s}/attribute/dbxref/#{@dbxref}")
|
234
|
+
@dbxref += 1
|
235
|
+
create_triple(feature_uri, @base.dbxref, literal_uri)
|
236
|
+
create_triple(literal_uri, RDF.type, RDF::RDFS.Literal)
|
237
|
+
create_triple(literal_uri, RDF.value, value)
|
210
238
|
end
|
211
239
|
}
|
212
|
-
elsif tag == 'Derives_from'
|
240
|
+
elsif tag == 'Derives_from' then
|
213
241
|
list.each { |value|
|
214
|
-
|
242
|
+
create_triple(feature_uri, @base.derivesFrom, RDF::URI.new("#{set_uri.to_s}/feature/#{value}"))
|
215
243
|
}
|
216
|
-
elsif tag == 'Gap'
|
217
|
-
|
244
|
+
elsif tag == 'Gap' then
|
245
|
+
# Handled by 'Target', because 'Gap' requires 'Target' to be present.
|
218
246
|
elsif tag == 'ID' then
|
219
|
-
|
220
|
-
|
247
|
+
list.each { |value|
|
248
|
+
create_triple(feature_uri, @base.id, value)
|
249
|
+
}
|
250
|
+
elsif tag == 'Is_circular' then
|
221
251
|
value = list.join(',')
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
252
|
+
if value == 'true' then
|
253
|
+
create_triple(feature_uri, @base.isCircular, true) if value == 'true'
|
254
|
+
elsif value == 'false' then
|
255
|
+
create_triple(feature_uri, @base.isCircular, false) if value == 'false'
|
256
|
+
else
|
257
|
+
create_triple(feature_uri, RDF::RDFS.comment, "Is_circular non-truth value: #{value}")
|
258
|
+
end
|
259
|
+
elsif tag == 'Name' then
|
260
|
+
list.each { |value|
|
261
|
+
create_triple(feature_uri, @base.name, value)
|
262
|
+
}
|
263
|
+
elsif tag == 'Note' then
|
228
264
|
list.each { |value|
|
229
|
-
|
265
|
+
create_triple(feature_uri, RDF::RDFS.comment, value)
|
230
266
|
}
|
231
267
|
elsif tag == 'Ontology_term' then
|
232
268
|
list.each { |value|
|
233
269
|
# TODO Sanitize values that are either not in GO xrf_abbs or need conversion to match
|
234
270
|
# match their associated Ruby method.
|
235
271
|
namespace, accession = value.split(/:/, 2)
|
236
|
-
|
272
|
+
create_triple(feature_uri, @base.ontology_term, "#{BioInterchange::GOXRef.send(namespace).to_s}#{accession}")
|
237
273
|
}
|
238
274
|
elsif tag == 'Parent' then
|
239
275
|
list.each { |parent_id|
|
240
|
-
|
276
|
+
create_triple(feature_uri, @base.parent, RDF::URI.new("#{set_uri.to_s}/feature/#{parent_id}"))
|
241
277
|
}
|
242
278
|
elsif tag == 'Reference_seq' then
|
243
279
|
list.each { |value|
|
244
|
-
|
280
|
+
reference_uri = RDF::URI.new("#{feature_uri.to_s}/reference/#{value}")
|
281
|
+
create_triple(feature_uri, @base.sequence_annotation, reference_uri)
|
282
|
+
create_triple(reference_uri, RDF.type, @base.Reference)
|
283
|
+
create_triple(reference_uri, @base.sequence, value)
|
245
284
|
}
|
246
285
|
elsif tag == 'Target' then
|
247
286
|
target_id, start_coordinate, end_coordinate, strand = list.join(',').split(/\s+/, 4)
|
248
|
-
target_datatype_properties = @base.target_properties.select { |uri| @base.is_datatype_property?(uri) }[0]
|
249
|
-
target_object_properties = @base.target_properties.select { |uri| @base.is_object_property?(uri) }[0]
|
250
287
|
target_uri = RDF::URI.new("#{feature_uri.to_s}/target/#{target_id}")
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
288
|
+
create_triple(feature_uri, @base.target, target_uri)
|
289
|
+
create_triple(target_uri, RDF.type, @base.Target)
|
290
|
+
create_triple(target_uri, @base.id, target_id)
|
291
|
+
region_uri = RDF::URI.new("#{target_uri.to_s}/region")
|
292
|
+
start_position_uri = RDF::URI.new("#{region_uri.to_s}/start")
|
293
|
+
end_position_uri = RDF::URI.new("#{region_uri.to_s}/end")
|
294
|
+
create_triple(target_uri, @base.locus, region_uri)
|
295
|
+
create_triple(region_uri, @base.locus, start_position_uri)
|
296
|
+
create_triple(region_uri, @base.locus, end_position_uri)
|
297
|
+
if strand == '+' then
|
298
|
+
create_triple(start_position_uri, RDF.type, BioInterchange::FALDO.Positive_strand)
|
299
|
+
create_triple(end_position_uri, RDF.type, BioInterchange::FALDO.Positive_strand)
|
300
|
+
create_triple(start_position_uri, BioInterchange::FALDO.position, start_coordinate)
|
301
|
+
create_triple(end_position_uri, BioInterchange::FALDO.position, end_coordinate)
|
302
|
+
elsif strand == '-' then
|
303
|
+
create_triple(start_position_uri, RDF.type, BioInterchange::FALDO.Negative_strand)
|
304
|
+
create_triple(end_position_uri, RDF.type, BioInterchange::FALDO.Negative_strand)
|
305
|
+
# Reverse start/end coordinates on the negative strand; FALDO requirement:
|
306
|
+
create_triple(start_position_uri, BioInterchange::FALDO.position, end_coordinate)
|
307
|
+
create_triple(end_position_uri, BioInterchange::FALDO.position, start_coordinate)
|
308
|
+
else
|
309
|
+
create_triple(start_position_uri, RDF.type, BioInterchange::FALDO.Position)
|
310
|
+
create_triple(end_position_uri, RDF.type, BioInterchange::FALDO.Position)
|
311
|
+
create_triple(start_position_uri, BioInterchange::FALDO.position, start_coordinate)
|
312
|
+
create_triple(end_position_uri, BioInterchange::FALDO.position, end_coordinate)
|
313
|
+
end
|
314
|
+
|
315
|
+
# Describe a possible alignment between the feature and target:
|
316
|
+
if attributes.has_key?('Gap') then
|
317
|
+
attributes['Gap'].each_index { |gap_no|
|
318
|
+
cigar_line = attributes['Gap'][gap_no].split(/\s+/)
|
319
|
+
cigar_line.each_index { |alignment_no|
|
320
|
+
alignment_uri = RDF::URI.new("#{feature_uri.to_s}/alignment/#{gap_no}/#{alignment_no}")
|
321
|
+
create_triple(feature_uri, @base.alignment, alignment_uri) if alignment_no == 0
|
322
|
+
operation = cigar_line[alignment_no].gsub(/[^MIDFR]/, '')
|
323
|
+
operation = nil unless operation.length == 1
|
324
|
+
span = cigar_line[alignment_no].gsub(/[^0-9]/, '')
|
325
|
+
span = nil unless span.length > 0
|
326
|
+
if operation == 'M' then
|
327
|
+
create_triple(alignment_uri, RDF.type, @base.Match)
|
328
|
+
elsif operation == 'I' then
|
329
|
+
create_triple(alignment_uri, RDF.type, @base.Reference_Sequence_Gap)
|
330
|
+
elsif operation == 'D' then
|
331
|
+
create_triple(alignment_uri, RDF.type, @base.Target_Sequence_Gap)
|
332
|
+
elsif operation == 'F' then
|
333
|
+
create_triple(alignment_uri, RDF.type, @base.Forward_Reference_Sequence_Frameshift)
|
334
|
+
elsif operation == 'R' then
|
335
|
+
create_triple(alignment_uri, RDF.type, @base.Reverse_Reference_Sequence_Frameshift)
|
336
|
+
else
|
337
|
+
# Fallback: operation is outside of the specification
|
338
|
+
create_triple(alignment_uri, RDF.type, @base.Alignment_Operation)
|
339
|
+
create_triple(alignment_uri, RDF::RDFS.comment, "Alignment operation: #{operation}") if operation and not operation.empty?
|
340
|
+
end
|
341
|
+
create_triple(alignment_uri, @base.span, span.to_i) if span
|
342
|
+
create_triple(alignment_uri, RDF.first, alignment_uri)
|
343
|
+
if alignment_no + 1 < cigar_line.length then
|
344
|
+
create_triple(alignment_uri, RDF.rest, RDF::URI.new("#{feature_uri.to_s}/alignment/#{gap_no}/#{alignment_no + 1}"))
|
345
|
+
else
|
346
|
+
create_triple(alignment_uri, RDF.rest, RDF.nil)
|
347
|
+
end
|
348
|
+
}
|
349
|
+
}
|
350
|
+
end
|
351
|
+
elsif tag == 'Variant_effect' then
|
352
|
+
serialize_variant_effects(set_uri, feature_uri, list)
|
353
|
+
elsif tag == 'Variant_seq' then
|
354
|
+
serialize_variant_seqs(set_uri, feature_uri, list)
|
262
355
|
else
|
263
356
|
# TODO Report unknown upper case letters here? That would be a spec. validation...
|
264
357
|
# Well, or it would show that this implementation is incomplete. Could be either.
|
265
|
-
attribute_properties = @base.attribute_properties
|
266
|
-
attribute_properties = attribute_properties.select { |uri| @base.is_datatype_property?(uri) }[0] if attribute_properties.kind_of?(Array)
|
267
|
-
feature_properties = @base.feature_properties.select { |uri| @base.is_object_property?(uri) }[0]
|
268
358
|
list.each_index { |index|
|
269
359
|
value = list[index]
|
270
360
|
attribute_uri = RDF::URI.new("#{feature_uri.to_s}/attribute/#{tag}") if list.size == 1
|
271
361
|
attribute_uri = RDF::URI.new("#{feature_uri.to_s}/attribute/#{tag}-#{index + 1}") unless list.size == 1
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
362
|
+
create_triple(feature_uri, @base.attribute, attribute_uri)
|
363
|
+
create_triple(attribute_uri, RDF.type, @base.Attribute)
|
364
|
+
create_triple(attribute_uri, @base.tag, "#{tag}")
|
365
|
+
create_triple(attribute_uri, RDF.value, value)
|
276
366
|
}
|
277
367
|
end
|
278
368
|
}
|
@@ -281,10 +371,9 @@ protected
|
|
281
371
|
# Serializes a structured attribute (given as a pragma statement), which later
|
282
372
|
# can be referred to from feature instances.
|
283
373
|
#
|
284
|
-
# +graph+:: RDF graph to which the structured attribute is added
|
285
374
|
# +set_uri+:: the feature set URI to which the structured attribute belongs to
|
286
375
|
# +pragma+:: a map that encapsulates the structured attribute data
|
287
|
-
def serialize_structured_attribute(
|
376
|
+
def serialize_structured_attribute(set_uri, pragma)
|
288
377
|
attribute_uri = RDF::URI.new("#{set_uri.to_s}/structured_attribute/#{pragma.object_id}")
|
289
378
|
attributes = nil
|
290
379
|
class_type = nil
|
@@ -306,114 +395,132 @@ protected
|
|
306
395
|
else
|
307
396
|
# TODO Error.
|
308
397
|
end
|
309
|
-
graph.insert(RDF::Statement.new(attribute_uri, RDF.type, class_type))
|
310
398
|
if class_type == @base.DataSource and attributes.has_key?('Data_type') then
|
311
|
-
data_type_individual = nil
|
312
399
|
attributes['Data_type'] = attributes['Data_type'][0] # TODO Make sure array is of length 1.
|
313
400
|
if attributes['Data_type'] == 'Array_CGH' then
|
314
|
-
|
401
|
+
class_type = @base.ArrayComparativeGenomicHybridization
|
315
402
|
elsif attributes['Data_type'] == 'DNA_microarray' then
|
316
|
-
|
403
|
+
class_type = @base.DNAMicroarray
|
317
404
|
elsif attributes['Data_type'] == 'DNA_sequence' then
|
318
|
-
|
405
|
+
class_type = @base.DNASequence
|
319
406
|
elsif attributes['Data_type'] == 'RNA_sequence' then
|
320
|
-
|
407
|
+
class_type = @base.RNASequence
|
321
408
|
else
|
322
409
|
# TODO Error.
|
323
410
|
end
|
324
|
-
graph.insert(RDF::Statement.new(attribute_uri, @base.data_type, data_type_individual))
|
325
411
|
elsif class_type == @base.TechnologyPlatform then
|
326
412
|
if attributes.has_key?('Average_coverage') then
|
327
|
-
|
413
|
+
create_triple(attribute_uri, @base.averageCoverage, attributes['Average_coverage'][0].to_i)
|
328
414
|
end
|
329
415
|
if attributes.has_key?('Platform_class') then
|
330
|
-
|
416
|
+
create_triple(attribute_uri, @base.platformClass, attributes['Platform_class'][0])
|
331
417
|
end
|
332
418
|
if attributes.has_key?('Platform_name') then
|
333
|
-
|
419
|
+
create_triple(attribute_uri, @base.platformName, attributes['Platform_name'][0])
|
334
420
|
end
|
335
421
|
if attributes.has_key?('Read_length') then
|
336
|
-
|
422
|
+
create_triple(attribute_uri, @base.readLength, attributes['Read_length'][0].to_i)
|
337
423
|
end
|
338
424
|
if attributes.has_key?('Read_pair_span') then
|
339
|
-
|
425
|
+
create_triple(attribute_uri, @base.readPairSpan, attributes['Read_pair_span'][0].to_i)
|
340
426
|
end
|
341
427
|
if attributes.has_key?('Read_type') then
|
342
|
-
read_type_individual = nil
|
343
428
|
attributes['Read_type'] = attributes['Read_type'][0] # TODO Make sure array is of length 1.
|
344
429
|
if attributes['Read_type'] == 'fragment' then
|
345
|
-
|
430
|
+
class_type = @base.FragmentReadPlatform
|
346
431
|
elsif attributes['Read_type'] == 'pair' then
|
347
|
-
|
432
|
+
class_type = @base.PairedEndReadPlatform
|
348
433
|
else
|
349
434
|
# TODO Error.
|
350
435
|
end
|
351
|
-
graph.insert(RDF::Statement.new(attribute_uri, @base.read_type, read_type_individual))
|
352
436
|
end
|
353
437
|
end
|
354
|
-
|
438
|
+
create_triple(attribute_uri, RDF.type, class_type)
|
355
439
|
attributes.keys.each { |tag|
|
356
440
|
if tag.match(/^[a-z]/) then
|
357
441
|
custom_attribute_uri = RDF::URI.new("#{attribute_uri.to_s}/attribute/#{tag}")
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
442
|
+
create_triple(custom_attribute_uri, RDF.type, @base.StructuredAttribute)
|
443
|
+
create_triple(custom_attribute_uri, @base.tag, tag)
|
444
|
+
attributes[tag].each { |value|
|
445
|
+
create_triple(custom_attribute_uri, RDF.value, value)
|
446
|
+
}
|
447
|
+
create_triple(attribute_uri, @base.attribute, custom_attribute_uri)
|
448
|
+
else
|
449
|
+
# TODO
|
362
450
|
end
|
363
451
|
}
|
364
452
|
end
|
365
453
|
|
366
454
|
# Serializes a list of variant effects.
|
367
455
|
#
|
368
|
-
# +graph+:: RDF graph to which the structured attribute is added
|
369
456
|
# +set_uri+:: the feature set URI to which the feature belongs to
|
370
457
|
# +feature_uri+:: the feature URI to the feature that is annotated with variant data
|
371
458
|
# +list+:: list of variant values
|
372
|
-
def serialize_variant_effects(
|
459
|
+
def serialize_variant_effects(set_uri, feature_uri, list)
  list.each_with_index { |effect, index|
    # An effect consists of: sequence variant, variant index, feature type, and
    # a space separated list of the affected feature IDs.
    sequence_variant, variant_index, feature_type, feature_ids = effect.split(' ', 4)
    affected_ids = feature_ids.split(' ')

    variant_uri = RDF::URI.new("#{feature_uri.to_s}/variant/#{variant_index}")
    effect_uri = RDF::URI.new("#{feature_uri.to_s}/variant/#{variant_index}/effect/#{index}")

    # Attach the effect to its variant, then describe the effect itself.
    serialize_variant_triple(feature_uri, variant_uri, @base.effect, effect_uri)
    create_triple(effect_uri, RDF.type, @base.Effect)
    create_triple(effect_uri, @base.sequenceVariant, BioInterchange::SO.send(BioInterchange.make_safe_label(sequence_variant)))
    create_triple(effect_uri, @base.featureType, BioInterchange::SO.send(BioInterchange.make_safe_label(feature_type)))
    affected_ids.each { |affected_id|
      create_triple(effect_uri, @base.feature, affected_id)
    }
  }
end
|
387
474
|
|
388
475
|
# Serializes a list of variant sequences.
|
389
476
|
#
|
390
|
-
# +graph+:: RDF graph to which the structured attribute is added
|
391
477
|
# +set_uri+:: the feature set URI to which the feature belongs to
|
392
478
|
# +feature_uri+:: the feature URI to the feature that is annotated with variant data
|
393
479
|
# +list+:: list of variant values
|
394
|
-
def serialize_variant_seqs(
|
480
|
+
def serialize_variant_seqs(set_uri, feature_uri, list)
  # Every sequence becomes its own variant, addressed by its position in the list.
  list.each_with_index { |value, index|
    variant_uri = RDF::URI.new("#{feature_uri.to_s}/variant/#{index}")
    serialize_variant_triple(feature_uri, variant_uri, @base.sequence, RDF::Literal.new(value))
  }

  # Return the variant type based on the present sequence(s). Only a pair of
  # sequences can be classified as homozygous/heterozygous.
  return @base.Variant if list.length != 2
  # The character class is required here: /a-zA-Z/ would only match the literal
  # text "a-zA-Z", which makes the zygosity branches unreachable.
  if list[0].match(/[a-zA-Z]/) && list[1].match(/[a-zA-Z]/)
    return @base.HomozygousVariant if list[0] == list[1]
    return @base.HeterozygousVariant
  end
  @base.Variant
end
|
401
495
|
|
402
496
|
# Adds a variant to the graph; tracks the variant's URI so that RDF.type is only written out once.
|
403
497
|
#
|
404
|
-
# +graph+:: RDF graph to which the variant is added
|
405
498
|
# +feature_uri+:: the feature URI to the feature that is annotated with variant data
|
406
499
|
# +variant_uri+:: URI that identifies the feature in question ("subject", if you like)
|
407
500
|
# +predicate+:: predicate that describes the data being serialized
|
408
501
|
# +object+:: data to be serialized
|
409
|
-
def serialize_variant_triple(
|
502
|
+
def serialize_variant_triple(feature_uri, variant_uri, predicate, object)
  variant_key = variant_uri.to_s
  # First sighting of this variant: link it to the feature and type it, then
  # remember it so that these two triples are not repeated.
  if not @variants.has_key?(variant_key)
    create_triple(feature_uri, @base.sequence_annotation, variant_uri)
    create_triple(variant_uri, RDF.type, @base.Variant)
  end
  @variants[variant_key] = true
  # The actual statement about the variant is always written.
  create_triple(variant_uri, predicate, object)
end
|
510
|
+
|
511
|
+
# Serializes a +GFF3FeatureSequence+ object that contains the sequence for a feature object.
|
512
|
+
#
|
513
|
+
# +set_uri+:: the feature set URI to which the feature belongs to
|
514
|
+
# +feature_sequence+:: a +GFF3FeatureSequence+ instance
|
515
|
+
def serialize_feature_sequence(set_uri, feature_sequence)
  # The sequence is attached as an annotation below the URI of the feature it belongs to.
  feature_uri = RDF::URI.new("#{set_uri.to_s}/feature/#{feature_sequence.feature_id}")
  annotation_uri = RDF::URI.new("#{feature_uri.to_s}/sequence")

  create_triple(feature_uri, @base.sequence_annotation, annotation_uri)
  create_triple(annotation_uri, RDF.type, @base.Sequence_Annotation)
  # Comments are optional; only write one if it was provided.
  if feature_sequence.comment
    create_triple(annotation_uri, RDF::RDFS.comment, feature_sequence.comment)
  end
  create_triple(annotation_uri, @base.sequence, feature_sequence.sequence)
end
|
523
|
+
|
417
524
|
end
|
418
525
|
|
419
526
|
end
|