biointerchange 1.0.0 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.travis.yml +2 -2
- data/README.md +7 -6
- data/VERSION +1 -1
- data/examples/ReaderModelWriterSequenceDiagram.graffle +3073 -0
- data/examples/ReaderModelWriterSequenceDiagram.png +0 -0
- data/lib/biointerchange/core.rb +63 -58
- data/lib/biointerchange/faldo.rb +125 -35
- data/lib/biointerchange/genomics/gff3_feature_sequence.rb +36 -0
- data/lib/biointerchange/genomics/gff3_rdf_ntriples.rb +254 -147
- data/lib/biointerchange/genomics/gff3_reader.rb +22 -3
- data/lib/biointerchange/genomics/locations.rb +30 -0
- data/lib/biointerchange/gfvo.rb +1072 -0
- data/lib/biointerchange/phylogenetics/cdao_rdf_ntriples.rb +6 -5
- data/lib/biointerchange/textmining/text_mining_rdf_ntriples.rb +7 -4
- data/lib/biointerchange/writer.rb +138 -2
- data/spec/gff3_rdfwriter_spec.rb +5 -6
- data/spec/gvf_rdfwriter_spec.rb +5 -6
- data/spec/phylogenetics_spec.rb +1 -1
- data/web/images/BioInterchange120.png +0 -0
- data/web/images/BioInterchange120inv.png +0 -0
- data/web/images/BioInterchange160.png +0 -0
- data/web/images/BioInterchange160inv.png +0 -0
- data/web/images/BioInterchange300inv.png +0 -0
- data/web/index.html +5 -1
- data/web/ontologies.html +1538 -21
- metadata +31 -46
@@ -0,0 +1,36 @@
|
|
1
|
+
module BioInterchange::Genomics
|
2
|
+
|
3
|
+
# Represents a sequence of a genomic feature of a GFF3 file.
|
4
|
+
class GFF3FeatureSequence
|
5
|
+
|
6
|
+
# Creates a new feature sequence representation. A feature sequence is described by two or more
|
7
|
+
# lines in a GFF3 file that follow a '##FASTA' pragma statement.
|
8
|
+
#
|
9
|
+
# +feature_id+:: ID of the feature whose sequence is stored
|
10
|
+
# +sequence+:: sequence of the feature
|
11
|
+
def initialize(feature_id, sequence, comment = nil)
|
12
|
+
@feature_id = feature_id
|
13
|
+
@sequence = sequence
|
14
|
+
@comment = comment
|
15
|
+
end
|
16
|
+
|
17
|
+
# Returns the ID of the feature whose sequence is represented by the object.
|
18
|
+
def feature_id
|
19
|
+
@feature_id
|
20
|
+
end
|
21
|
+
|
22
|
+
# Returns the sequence of the feature.
|
23
|
+
def sequence
|
24
|
+
@sequence
|
25
|
+
end
|
26
|
+
|
27
|
+
# Returns additional comments -- if provided -- that are associated with the feature ID.
|
28
|
+
# If no comment was provided, then `nil` is returned.
|
29
|
+
def comment
|
30
|
+
@comment
|
31
|
+
end
|
32
|
+
|
33
|
+
end
|
34
|
+
|
35
|
+
end
|
36
|
+
|
@@ -11,24 +11,23 @@ module BioInterchange::Genomics
|
|
11
11
|
# - biointerchange.gvf
|
12
12
|
#
|
13
13
|
# Outputs:
|
14
|
-
# - rdf.biointerchange.
|
15
|
-
# - rdf.biointerchange.gvf
|
14
|
+
# - rdf.biointerchange.gfvo
|
16
15
|
class RDFWriter < BioInterchange::Writer
|
17
16
|
|
18
17
|
# Register writers:
|
19
18
|
BioInterchange::Registry.register_writer(
|
20
|
-
'rdf.biointerchange.
|
19
|
+
'rdf.biointerchange.gfvo',
|
21
20
|
BioInterchange::Genomics::RDFWriter,
|
22
21
|
[ 'biointerchange.gff3' ],
|
23
22
|
true,
|
24
|
-
'
|
23
|
+
'Genomic Feature and Variation Ontology (GFVO) based RDFization'
|
25
24
|
)
|
26
25
|
BioInterchange::Registry.register_writer(
|
27
|
-
'rdf.biointerchange.
|
26
|
+
'rdf.biointerchange.gfvo',
|
28
27
|
BioInterchange::Genomics::RDFWriter,
|
29
28
|
[ 'biointerchange.gvf' ],
|
30
29
|
true,
|
31
|
-
'
|
30
|
+
'Genomic Feature and Variation Ontology (GFVO) based RDFization'
|
32
31
|
)
|
33
32
|
|
34
33
|
# Creates a new instance of a RDFWriter that will use the provided output stream to serialize RDF.
|
@@ -42,18 +41,19 @@ class RDFWriter < BioInterchange::Writer
|
|
42
41
|
# Serialize a model as RDF.
|
43
42
|
#
|
44
43
|
# +model+:: a generic representation of input data that is derived from BioInterchange::Genomics::GFF3FeatureSet
|
45
|
-
|
44
|
+
# +uri_prefix+:: optional URI prefix that replaces the default URI prefix for all set/feature/annotation URIs
|
45
|
+
def serialize(model, uri_prefix = nil)
|
46
46
|
if model.instance_of?(BioInterchange::Genomics::GFF3FeatureSet) then
|
47
|
-
@
|
48
|
-
serialize_model(model)
|
47
|
+
@format = :gff3
|
49
48
|
elsif model.instance_of?(BioInterchange::Genomics::GVFFeatureSet) then
|
50
|
-
@
|
51
|
-
serialize_model(model)
|
49
|
+
@format = :gvf
|
52
50
|
else
|
53
51
|
raise BioInterchange::Exceptions::ImplementationWriterError, 'The provided model cannot be serialized. ' +
|
54
52
|
'This writer supports serialization for BioInterchange::Genomics::GFF3FeatureSet and '
|
55
53
|
'BioInterchange::Genomics::GVFFeatureSet.'
|
56
54
|
end
|
55
|
+
@base = BioInterchange::GFVO
|
56
|
+
serialize_model(model, uri_prefix)
|
57
57
|
end
|
58
58
|
|
59
59
|
protected
|
@@ -61,7 +61,8 @@ protected
|
|
61
61
|
# Serializes RDF for a feature set representation.
|
62
62
|
#
|
63
63
|
# +model+:: an instance of +BioInterchange::Genomics::GFF3FeatureSet+
|
64
|
-
|
64
|
+
# +set_uri+:: optional URI prefix that should be used for a set instance (and hence, all its dependents -- features, annotations, etc.)
|
65
|
+
def serialize_model(model, set_uri)
|
65
66
|
# We record landmarks, because they can either be written when their "##sequence-region"
|
66
67
|
# pragma statement appears, or otherwise, when the first feature with said landmark is
|
67
68
|
# being serialized.
|
@@ -70,17 +71,25 @@ protected
|
|
70
71
|
# Record written variants in order to avoid writing out RDF.type multiple times.
|
71
72
|
@variants = {}
|
72
73
|
|
73
|
-
|
74
|
-
|
75
|
-
set_uri =
|
76
|
-
|
74
|
+
# Create a URI prefix that applies to the set, all features in the set, and all the features' annotations.
|
75
|
+
# Then register the prefix with the writer to have a concise Turtle output.
|
76
|
+
set_uri = set_uri[0..-2] if set_uri and set_uri.end_with?('/')
|
77
|
+
set_uri = RDF::URI.new(model.uri) unless set_uri
|
78
|
+
set_base(set_uri + '/')
|
79
|
+
|
80
|
+
create_triple(set_uri, RDF.type, @base.Set)
|
77
81
|
model.pragmas.each { |pragma_name|
|
78
|
-
serialize_pragma(
|
82
|
+
serialize_pragma(set_uri, model.pragma(pragma_name))
|
79
83
|
}
|
80
84
|
model.contents.each { |feature|
|
81
|
-
|
85
|
+
if feature.instance_of?(BioInterchange::Genomics::GFF3FeatureSequence) then
|
86
|
+
serialize_feature_sequence(set_uri, feature)
|
87
|
+
else
|
88
|
+
serialize_feature(set_uri, feature)
|
89
|
+
end
|
82
90
|
}
|
83
|
-
|
91
|
+
close
|
92
|
+
#RDF::NTriples::Writer.dump(graph, @ostream)
|
84
93
|
# TODO Figure out why the following is very slow. Use with 'rdf-raptor'.
|
85
94
|
# Having said that, Jena's rdfcat is very good for converting formats
|
86
95
|
# anyway, so perhaps it is not worth investigating the following.
|
@@ -89,25 +98,22 @@ protected
|
|
89
98
|
|
90
99
|
# Serializes pragmas for a given feature set URI.
|
91
100
|
#
|
92
|
-
# +graph+:: RDF graph to which the pragmas are added
|
93
101
|
# +set_uri+:: the feature set URI to which the pragmas belong
|
94
102
|
# +pragma+:: an object representing a pragma statement
|
95
|
-
def serialize_pragma(
|
103
|
+
def serialize_pragma(set_uri, pragma)
|
96
104
|
if pragma.kind_of?(Hash) then
|
97
|
-
if (pragma.has_key?('attribute-method') or pragma.has_key?('data-source') or pragma.has_key?('score-method') or pragma.has_key?('source-method') or pragma.has_key?('technology-platform'))
|
98
|
-
serialize_structured_attribute(
|
99
|
-
elsif pragma.has_key?('gff-version')
|
100
|
-
|
101
|
-
elsif pragma.has_key?('
|
102
|
-
|
103
|
-
elsif pragma.has_key?('gvf-version') and @base == BioInterchange::GVF1O then
|
104
|
-
graph.insert(RDF::Statement.new(set_uri, @base.gvf_version, RDF::Literal.new(pragma['gvf-version'], :datatype => RDF::XSD.float )))
|
105
|
+
if (pragma.has_key?('attribute-method') or pragma.has_key?('data-source') or pragma.has_key?('score-method') or pragma.has_key?('source-method') or pragma.has_key?('technology-platform')) then
|
106
|
+
serialize_structured_attribute(set_uri, pragma)
|
107
|
+
elsif pragma.has_key?('gff-version') then
|
108
|
+
create_triple(set_uri, @base.gff_version, pragma['gff-version'], RDF::XSD.float)
|
109
|
+
elsif pragma.has_key?('gvf-version') then
|
110
|
+
create_triple(set_uri, @base.gvf_version, pragma['gvf-version'], RDF::XSD.float)
|
105
111
|
elsif pragma.has_key?('sequence-region') then
|
106
112
|
pragma['sequence-region'].keys.each { |seqid|
|
107
|
-
serialize_landmark(
|
113
|
+
serialize_landmark(set_uri, pragma['sequence-region'][seqid])
|
108
114
|
}
|
109
115
|
elsif pragma.has_key?('species') then
|
110
|
-
|
116
|
+
create_triple(set_uri, @base.species, RDF::URI.new(pragma['species']))
|
111
117
|
end
|
112
118
|
else
|
113
119
|
# TODO
|
@@ -116,77 +122,97 @@ protected
|
|
116
122
|
|
117
123
|
# Serializes a +GFF3Feature+ object for a given feature set URI.
|
118
124
|
#
|
119
|
-
# +graph+:: RDF graph to which the feature is added
|
120
125
|
# +set_uri+:: the feature set URI to which the feature belongs
|
121
126
|
# +feature+:: a +GFF3Feature+ instance
|
122
|
-
def serialize_feature(
|
127
|
+
def serialize_feature(set_uri, feature)
|
123
128
|
# TODO Make sure there is only one value in the 'ID' list.
|
124
129
|
feature_uri = RDF::URI.new("#{set_uri.to_s}/feature/#{feature.sequence_id},#{feature.source},#{feature.type.to_s.sub(/^[^:]+:\/\//, '')},#{feature.start_coordinate},#{feature.end_coordinate},#{feature.strand},#{feature.phase}") unless feature.attributes.has_key?('ID')
|
125
130
|
feature_uri = RDF::URI.new("#{set_uri.to_s}/feature/#{feature.attributes['ID'][0]}") if feature.attributes.has_key?('ID')
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
131
|
+
create_triple(set_uri, @base.contains, feature_uri)
|
132
|
+
create_triple(feature_uri, RDF.type, @base.Feature)
|
133
|
+
serialize_landmark(set_uri, GFF3Landmark.new(feature.sequence_id)) unless @landmarks.has_key?(feature.sequence_id)
|
134
|
+
create_triple(feature_uri, @base.seqid, @landmarks[feature.sequence_id])
|
135
|
+
create_triple(feature_uri, @base.source, feature.source)
|
136
|
+
create_triple(feature_uri, @base.type, feature.type)
|
137
|
+
create_triple(feature_uri, @base.phase, feature.phase) if feature.phase
|
138
|
+
|
139
|
+
serialize_coordinate(set_uri, feature_uri, feature)
|
140
|
+
serialize_attributes(set_uri, feature_uri, feature.attributes) unless feature.attributes.keys.empty?
|
141
|
+
end
|
142
|
+
|
143
|
+
def serialize_coordinate(set_uri, feature_uri, feature)
|
144
|
+
region_uri = RDF::URI.new("#{feature_uri.to_s}/region")
|
145
|
+
start_position_uri = RDF::URI.new("#{feature_uri.to_s}/region/start")
|
146
|
+
end_position_uri = RDF::URI.new("#{feature_uri.to_s}/region/end")
|
147
|
+
#feature_object_properties = @base.feature_properties.select { |uri| @base.is_object_property?(uri) }[0]
|
148
|
+
##graph.insert(RDF::Statement.new(feature_uri, @base.with_parent([ @base.region ].flatten, feature_object_properties), region_uri))
|
149
|
+
create_triple(feature_uri, @base.locus, region_uri)
|
150
|
+
create_triple(region_uri, RDF.type, BioInterchange::FALDO.Region)
|
151
|
+
# BIN STUFF
|
152
|
+
if false then
|
153
|
+
bin_uri = RDF::URI.new("bin://#{feature.sequence_id}/#{BioInterchange::Genomics::Locations.reg2bin(feature.start_coordinate, feature.end_coordinate)}")
|
154
|
+
create_triple(bin_uri, RDF::URI.new('bin://contains'), feature_uri)
|
155
|
+
end
|
156
|
+
create_triple(region_uri, BioInterchange::FALDO.begin, start_position_uri)
|
157
|
+
create_triple(region_uri, BioInterchange::FALDO.end, end_position_uri)
|
138
158
|
case feature.strand
|
139
159
|
when BioInterchange::Genomics::GFF3Feature::NOT_STRANDED
|
140
|
-
|
160
|
+
create_triple(start_position_uri, RDF.type, BioInterchange::FALDO.Position)
|
161
|
+
create_triple(end_position_uri, RDF.type, BioInterchange::FALDO.Position)
|
141
162
|
when BioInterchange::Genomics::GFF3Feature::UNKNOWN
|
142
|
-
|
163
|
+
create_triple(start_position_uri, RDF.type, BioInterchange::FALDO.Position)
|
164
|
+
create_triple(end_position_uri, RDF.type, BioInterchange::FALDO.Position)
|
143
165
|
when BioInterchange::Genomics::GFF3Feature::POSITIVE
|
144
|
-
|
166
|
+
create_triple(start_position_uri, RDF.type, BioInterchange::FALDO.Positive_strand)
|
167
|
+
create_triple(end_position_uri, RDF.type, BioInterchange::FALDO.Positive_strand)
|
145
168
|
when BioInterchange::Genomics::GFF3Feature::NEGATIVE
|
146
|
-
|
169
|
+
create_triple(start_position_uri, RDF.type, BioInterchange::FALDO.Negative_strand)
|
170
|
+
create_triple(end_position_uri, RDF.type, BioInterchange::FALDO.Negative_strand)
|
147
171
|
else
|
148
172
|
raise BioInterchange::Exceptions::InputFormatError, 'Strand of feature is set to an unknown constant.'
|
149
173
|
end
|
150
|
-
|
151
|
-
|
152
|
-
|
174
|
+
create_triple(start_position_uri, BioInterchange::FALDO.position, feature.start_coordinate)
|
175
|
+
create_triple(end_position_uri, BioInterchange::FALDO.position, feature.end_coordinate)
|
176
|
+
create_triple(feature_uri, @base.score, feature.score) if feature.score
|
153
177
|
end
|
154
178
|
|
155
179
|
# Serializes a genomic feature landmark ("seqid").
|
156
180
|
#
|
157
|
-
# +graph+:: RDF graph to which the landmark is added
|
158
181
|
# +set_uri+:: the feature set URI to which the landmark belongs
|
159
182
|
# +landmark+:: encapsulated landmark data
|
160
|
-
def serialize_landmark(
|
183
|
+
def serialize_landmark(set_uri, landmark)
|
161
184
|
return if @landmarks.has_key?(landmark.seqid)
|
162
185
|
landmark_uri = RDF::URI.new("#{set_uri.to_s}/landmark/#{landmark.seqid}")
|
186
|
+
region_uri = RDF::URI.new("#{landmark_uri.to_s}/region")
|
163
187
|
@landmarks[landmark.seqid] = landmark_uri
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
188
|
+
create_triple(landmark_uri, RDF.type, @base.Landmark)
|
189
|
+
create_triple(landmark_uri, @base.id, landmark.seqid)
|
190
|
+
create_triple(landmark_uri, @base.locus, region_uri)
|
191
|
+
if landmark.start_coordinate then
|
192
|
+
start_position_uri = RDF::URI.new("#{landmark_uri.to_s}/region/start")
|
193
|
+
create_triple(region_uri, BioInterchange::FALDO.begin, start_position_uri)
|
194
|
+
create_triple(start_position_uri, BioInterchange::FALDO.position, landmark.start_coordinate)
|
195
|
+
end
|
196
|
+
if landmark.start_coordinate then
|
197
|
+
end_position_uri = RDF::URI.new("#{landmark_uri.to_s}/region/end")
|
198
|
+
create_triple(region_uri, BioInterchange::FALDO.end, end_position_uri)
|
199
|
+
create_triple(end_position_uri, BioInterchange::FALDO.position, landmark.end_coordinate)
|
200
|
+
end
|
168
201
|
end
|
169
202
|
|
170
203
|
# Serializes the attributes of a feature.
|
171
204
|
#
|
172
|
-
# +graph+:: RDF graph to which the feature is added
|
173
205
|
# +set_uri+:: URI of the set these attributes belong to (implicit due to feature)
|
174
206
|
# +feature_uri+:: the feature URI to which the attributes belong
|
175
207
|
# +attributes+:: a map of tag/value pairs
|
176
|
-
def serialize_attributes(
|
208
|
+
def serialize_attributes(set_uri, feature_uri, attributes)
|
177
209
|
attributes.each_pair { |tag, list|
|
178
210
|
# Check for defined tags (in alphabetical order), if not matched, serialize as generic Attribute:
|
179
211
|
if tag == 'Alias' then
|
180
212
|
list.each { |value|
|
181
|
-
|
213
|
+
create_triple(feature_uri, @base.alias, value)
|
182
214
|
}
|
183
215
|
elsif tag == 'Dbxref' then
|
184
|
-
feature_properties = nil
|
185
|
-
if @base == BioInterchange::GFF3O then
|
186
|
-
feature_properties = @base.feature_properties.select { |uri| @base.is_datatype_property?(uri) }[0]
|
187
|
-
else
|
188
|
-
feature_properties = @base.feature_properties.select { |uri| @base.is_object_property?(uri) }[0]
|
189
|
-
end
|
190
216
|
list.each { |value|
|
191
217
|
begin
|
192
218
|
linkout = nil
|
@@ -200,79 +226,143 @@ protected
|
|
200
226
|
linkout = BioInterchange::GOXRef.send(BioInterchange.make_safe_label(abbreviation)).to_s + id
|
201
227
|
end
|
202
228
|
# Second, and finally: add a triple to the graph in the right representative format depending on the ontology used
|
203
|
-
|
204
|
-
graph.insert(RDF::Statement.new(feature_uri, @base.with_parent([ @base.dbxref ].flatten, feature_properties)[0], RDF::Literal.new(linkout, :datatype => RDF::XSD.anyURI )))
|
205
|
-
else
|
206
|
-
graph.insert(RDF::Statement.new(feature_uri, @base.with_parent([ @base.dbxref ].flatten, feature_properties)[0], RDF::URI.new(linkout))) if @base == BioInterchange::GVF1O
|
207
|
-
end
|
229
|
+
create_triple(feature_uri, @base.dbxref, linkout)
|
208
230
|
rescue NoMethodError
|
209
|
-
|
231
|
+
# Preserve the Dbxref as a Literal:
|
232
|
+
@dbxref = 0 if @dbxref == nil
|
233
|
+
literal_uri = RDF::URI.new("#{feature_uri.to_s}/attribute/dbxref/#{@dbxref}")
|
234
|
+
@dbxref += 1
|
235
|
+
create_triple(feature_uri, @base.dbxref, literal_uri)
|
236
|
+
create_triple(literal_uri, RDF.type, RDF::RDFS.Literal)
|
237
|
+
create_triple(literal_uri, RDF.value, value)
|
210
238
|
end
|
211
239
|
}
|
212
|
-
elsif tag == 'Derives_from'
|
240
|
+
elsif tag == 'Derives_from' then
|
213
241
|
list.each { |value|
|
214
|
-
|
242
|
+
create_triple(feature_uri, @base.derivesFrom, RDF::URI.new("#{set_uri.to_s}/feature/#{value}"))
|
215
243
|
}
|
216
|
-
elsif tag == 'Gap'
|
217
|
-
|
244
|
+
elsif tag == 'Gap' then
|
245
|
+
# Handled by 'Target', because 'Gap' requires 'Target' to be present.
|
218
246
|
elsif tag == 'ID' then
|
219
|
-
|
220
|
-
|
247
|
+
list.each { |value|
|
248
|
+
create_triple(feature_uri, @base.id, value)
|
249
|
+
}
|
250
|
+
elsif tag == 'Is_circular' then
|
221
251
|
value = list.join(',')
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
252
|
+
if value == 'true' then
|
253
|
+
create_triple(feature_uri, @base.isCircular, true) if value == 'true'
|
254
|
+
elsif value == 'false' then
|
255
|
+
create_triple(feature_uri, @base.isCircular, false) if value == 'false'
|
256
|
+
else
|
257
|
+
create_triple(feature_uri, RDF::RDFS.comment, "Is_circular non-truth value: #{value}")
|
258
|
+
end
|
259
|
+
elsif tag == 'Name' then
|
260
|
+
list.each { |value|
|
261
|
+
create_triple(feature_uri, @base.name, value)
|
262
|
+
}
|
263
|
+
elsif tag == 'Note' then
|
228
264
|
list.each { |value|
|
229
|
-
|
265
|
+
create_triple(feature_uri, RDF::RDFS.comment, value)
|
230
266
|
}
|
231
267
|
elsif tag == 'Ontology_term' then
|
232
268
|
list.each { |value|
|
233
269
|
# TODO Sanitize values that are either not in GO xrf_abbs or need conversion to match
|
234
270
|
# their associated Ruby method.
|
235
271
|
namespace, accession = value.split(/:/, 2)
|
236
|
-
|
272
|
+
create_triple(feature_uri, @base.ontology_term, "#{BioInterchange::GOXRef.send(namespace).to_s}#{accession}")
|
237
273
|
}
|
238
274
|
elsif tag == 'Parent' then
|
239
275
|
list.each { |parent_id|
|
240
|
-
|
276
|
+
create_triple(feature_uri, @base.parent, RDF::URI.new("#{set_uri.to_s}/feature/#{parent_id}"))
|
241
277
|
}
|
242
278
|
elsif tag == 'Reference_seq' then
|
243
279
|
list.each { |value|
|
244
|
-
|
280
|
+
reference_uri = RDF::URI.new("#{feature_uri.to_s}/reference/#{value}")
|
281
|
+
create_triple(feature_uri, @base.sequence_annotation, reference_uri)
|
282
|
+
create_triple(reference_uri, RDF.type, @base.Reference)
|
283
|
+
create_triple(reference_uri, @base.sequence, value)
|
245
284
|
}
|
246
285
|
elsif tag == 'Target' then
|
247
286
|
target_id, start_coordinate, end_coordinate, strand = list.join(',').split(/\s+/, 4)
|
248
|
-
target_datatype_properties = @base.target_properties.select { |uri| @base.is_datatype_property?(uri) }[0]
|
249
|
-
target_object_properties = @base.target_properties.select { |uri| @base.is_object_property?(uri) }[0]
|
250
287
|
target_uri = RDF::URI.new("#{feature_uri.to_s}/target/#{target_id}")
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
288
|
+
create_triple(feature_uri, @base.target, target_uri)
|
289
|
+
create_triple(target_uri, RDF.type, @base.Target)
|
290
|
+
create_triple(target_uri, @base.id, target_id)
|
291
|
+
region_uri = RDF::URI.new("#{target_uri.to_s}/region")
|
292
|
+
start_position_uri = RDF::URI.new("#{region_uri.to_s}/start")
|
293
|
+
end_position_uri = RDF::URI.new("#{region_uri.to_s}/end")
|
294
|
+
create_triple(target_uri, @base.locus, region_uri)
|
295
|
+
create_triple(region_uri, @base.locus, start_position_uri)
|
296
|
+
create_triple(region_uri, @base.locus, end_position_uri)
|
297
|
+
if strand == '+' then
|
298
|
+
create_triple(start_position_uri, RDF.type, BioInterchange::FALDO.Positive_strand)
|
299
|
+
create_triple(end_position_uri, RDF.type, BioInterchange::FALDO.Positive_strand)
|
300
|
+
create_triple(start_position_uri, BioInterchange::FALDO.position, start_coordinate)
|
301
|
+
create_triple(end_position_uri, BioInterchange::FALDO.position, end_coordinate)
|
302
|
+
elsif strand == '-' then
|
303
|
+
create_triple(start_position_uri, RDF.type, BioInterchange::FALDO.Negative_strand)
|
304
|
+
create_triple(end_position_uri, RDF.type, BioInterchange::FALDO.Negative_strand)
|
305
|
+
# Reverse start/end coordinates on the negative strand; FALDO requirement:
|
306
|
+
create_triple(start_position_uri, BioInterchange::FALDO.position, end_coordinate)
|
307
|
+
create_triple(end_position_uri, BioInterchange::FALDO.position, start_coordinate)
|
308
|
+
else
|
309
|
+
create_triple(start_position_uri, RDF.type, BioInterchange::FALDO.Position)
|
310
|
+
create_triple(end_position_uri, RDF.type, BioInterchange::FALDO.Position)
|
311
|
+
create_triple(start_position_uri, BioInterchange::FALDO.position, start_coordinate)
|
312
|
+
create_triple(end_position_uri, BioInterchange::FALDO.position, end_coordinate)
|
313
|
+
end
|
314
|
+
|
315
|
+
# Describe a possible alignment between the feature and target:
|
316
|
+
if attributes.has_key?('Gap') then
|
317
|
+
attributes['Gap'].each_index { |gap_no|
|
318
|
+
cigar_line = attributes['Gap'][gap_no].split(/\s+/)
|
319
|
+
cigar_line.each_index { |alignment_no|
|
320
|
+
alignment_uri = RDF::URI.new("#{feature_uri.to_s}/alignment/#{gap_no}/#{alignment_no}")
|
321
|
+
create_triple(feature_uri, @base.alignment, alignment_uri) if alignment_no == 0
|
322
|
+
operation = cigar_line[alignment_no].gsub(/[^MIDFR]/, '')
|
323
|
+
operation = nil unless operation.length == 1
|
324
|
+
span = cigar_line[alignment_no].gsub(/[^0-9]/, '')
|
325
|
+
span = nil unless span.length > 0
|
326
|
+
if operation == 'M' then
|
327
|
+
create_triple(alignment_uri, RDF.type, @base.Match)
|
328
|
+
elsif operation == 'I' then
|
329
|
+
create_triple(alignment_uri, RDF.type, @base.Reference_Sequence_Gap)
|
330
|
+
elsif operation == 'D' then
|
331
|
+
create_triple(alignment_uri, RDF.type, @base.Target_Sequence_Gap)
|
332
|
+
elsif operation == 'F' then
|
333
|
+
create_triple(alignment_uri, RDF.type, @base.Forward_Reference_Sequence_Frameshift)
|
334
|
+
elsif operation == 'R' then
|
335
|
+
create_triple(alignment_uri, RDF.type, @base.Reverse_Reference_Sequence_Frameshift)
|
336
|
+
else
|
337
|
+
# Fallback: operation is outside of the specification
|
338
|
+
create_triple(alignment_uri, RDF.type, @base.Alignment_Operation)
|
339
|
+
create_triple(alignment_uri, RDF::RDFS.comment, "Alignment operation: #{operation}") if operation and not operation.empty?
|
340
|
+
end
|
341
|
+
create_triple(alignment_uri, @base.span, span.to_i) if span
|
342
|
+
create_triple(alignment_uri, RDF.first, alignment_uri)
|
343
|
+
if alignment_no + 1 < cigar_line.length then
|
344
|
+
create_triple(alignment_uri, RDF.rest, RDF::URI.new("#{feature_uri.to_s}/alignment/#{gap_no}/#{alignment_no + 1}"))
|
345
|
+
else
|
346
|
+
create_triple(alignment_uri, RDF.rest, RDF.nil)
|
347
|
+
end
|
348
|
+
}
|
349
|
+
}
|
350
|
+
end
|
351
|
+
elsif tag == 'Variant_effect' then
|
352
|
+
serialize_variant_effects(set_uri, feature_uri, list)
|
353
|
+
elsif tag == 'Variant_seq' then
|
354
|
+
serialize_variant_seqs(set_uri, feature_uri, list)
|
262
355
|
else
|
263
356
|
# TODO Report unknown upper case letters here? That would be a spec. validation...
|
264
357
|
# Well, or it would show that this implementation is incomplete. Could be either.
|
265
|
-
attribute_properties = @base.attribute_properties
|
266
|
-
attribute_properties = attribute_properties.select { |uri| @base.is_datatype_property?(uri) }[0] if attribute_properties.kind_of?(Array)
|
267
|
-
feature_properties = @base.feature_properties.select { |uri| @base.is_object_property?(uri) }[0]
|
268
358
|
list.each_index { |index|
|
269
359
|
value = list[index]
|
270
360
|
attribute_uri = RDF::URI.new("#{feature_uri.to_s}/attribute/#{tag}") if list.size == 1
|
271
361
|
attribute_uri = RDF::URI.new("#{feature_uri.to_s}/attribute/#{tag}-#{index + 1}") unless list.size == 1
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
362
|
+
create_triple(feature_uri, @base.attribute, attribute_uri)
|
363
|
+
create_triple(attribute_uri, RDF.type, @base.Attribute)
|
364
|
+
create_triple(attribute_uri, @base.tag, "#{tag}")
|
365
|
+
create_triple(attribute_uri, RDF.value, value)
|
276
366
|
}
|
277
367
|
end
|
278
368
|
}
|
@@ -281,10 +371,9 @@ protected
|
|
281
371
|
# Serializes a structured attribute (given as a pragma statement), which later
|
282
372
|
# can be referred to from feature instances.
|
283
373
|
#
|
284
|
-
# +graph+:: RDF graph to which the structured attribute is added
|
285
374
|
# +set_uri+:: the feature set URI to which the structured attribute belongs
|
286
375
|
# +pragma+:: a map that encapsulates the structured attribute data
|
287
|
-
def serialize_structured_attribute(
|
376
|
+
def serialize_structured_attribute(set_uri, pragma)
|
288
377
|
attribute_uri = RDF::URI.new("#{set_uri.to_s}/structured_attribute/#{pragma.object_id}")
|
289
378
|
attributes = nil
|
290
379
|
class_type = nil
|
@@ -306,114 +395,132 @@ protected
|
|
306
395
|
else
|
307
396
|
# TODO Error.
|
308
397
|
end
|
309
|
-
graph.insert(RDF::Statement.new(attribute_uri, RDF.type, class_type))
|
310
398
|
if class_type == @base.DataSource and attributes.has_key?('Data_type') then
|
311
|
-
data_type_individual = nil
|
312
399
|
attributes['Data_type'] = attributes['Data_type'][0] # TODO Make sure array is of length 1.
|
313
400
|
if attributes['Data_type'] == 'Array_CGH' then
|
314
|
-
|
401
|
+
class_type = @base.ArrayComparativeGenomicHybridization
|
315
402
|
elsif attributes['Data_type'] == 'DNA_microarray' then
|
316
|
-
|
403
|
+
class_type = @base.DNAMicroarray
|
317
404
|
elsif attributes['Data_type'] == 'DNA_sequence' then
|
318
|
-
|
405
|
+
class_type = @base.DNASequence
|
319
406
|
elsif attributes['Data_type'] == 'RNA_sequence' then
|
320
|
-
|
407
|
+
class_type = @base.RNASequence
|
321
408
|
else
|
322
409
|
# TODO Error.
|
323
410
|
end
|
324
|
-
graph.insert(RDF::Statement.new(attribute_uri, @base.data_type, data_type_individual))
|
325
411
|
elsif class_type == @base.TechnologyPlatform then
|
326
412
|
if attributes.has_key?('Average_coverage') then
|
327
|
-
|
413
|
+
create_triple(attribute_uri, @base.averageCoverage, attributes['Average_coverage'][0].to_i)
|
328
414
|
end
|
329
415
|
if attributes.has_key?('Platform_class') then
|
330
|
-
|
416
|
+
create_triple(attribute_uri, @base.platformClass, attributes['Platform_class'][0])
|
331
417
|
end
|
332
418
|
if attributes.has_key?('Platform_name') then
|
333
|
-
|
419
|
+
create_triple(attribute_uri, @base.platformName, attributes['Platform_name'][0])
|
334
420
|
end
|
335
421
|
if attributes.has_key?('Read_length') then
|
336
|
-
|
422
|
+
create_triple(attribute_uri, @base.readLength, attributes['Read_length'][0].to_i)
|
337
423
|
end
|
338
424
|
if attributes.has_key?('Read_pair_span') then
|
339
|
-
|
425
|
+
create_triple(attribute_uri, @base.readPairSpan, attributes['Read_pair_span'][0].to_i)
|
340
426
|
end
|
341
427
|
if attributes.has_key?('Read_type') then
|
342
|
-
read_type_individual = nil
|
343
428
|
attributes['Read_type'] = attributes['Read_type'][0] # TODO Make sure array is of length 1.
|
344
429
|
if attributes['Read_type'] == 'fragment' then
|
345
|
-
|
430
|
+
class_type = @base.FragmentReadPlatform
|
346
431
|
elsif attributes['Read_type'] == 'pair' then
|
347
|
-
|
432
|
+
class_type = @base.PairedEndReadPlatform
|
348
433
|
else
|
349
434
|
# TODO Error.
|
350
435
|
end
|
351
|
-
graph.insert(RDF::Statement.new(attribute_uri, @base.read_type, read_type_individual))
|
352
436
|
end
|
353
437
|
end
|
354
|
-
|
438
|
+
create_triple(attribute_uri, RDF.type, class_type)
|
355
439
|
attributes.keys.each { |tag|
|
356
440
|
if tag.match(/^[a-z]/) then
|
357
441
|
custom_attribute_uri = RDF::URI.new("#{attribute_uri.to_s}/attribute/#{tag}")
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
442
|
+
create_triple(custom_attribute_uri, RDF.type, @base.StructuredAttribute)
|
443
|
+
create_triple(custom_attribute_uri, @base.tag, tag)
|
444
|
+
attributes[tag].each { |value|
|
445
|
+
create_triple(custom_attribute_uri, RDF.value, value)
|
446
|
+
}
|
447
|
+
create_triple(attribute_uri, @base.attribute, custom_attribute_uri)
|
448
|
+
else
|
449
|
+
# TODO
|
362
450
|
end
|
363
451
|
}
|
364
452
|
end
|
365
453
|
|
366
454
|
# Serializes a list of variant effects.
|
367
455
|
#
|
368
|
-
# +graph+:: RDF graph to which the structured attribute is added
|
369
456
|
# +set_uri+:: the feature set URI to which the feature belongs
|
370
457
|
# +feature_uri+:: the feature URI to the feature that is annotated with variant data
|
371
458
|
# +list+:: list of variant values
|
372
|
-
def serialize_variant_effects(
|
459
|
+
def serialize_variant_effects(set_uri, feature_uri, list)
|
373
460
|
list.each_index { |index|
|
374
461
|
effect = list[index]
|
375
462
|
sequence_variant, variant_index, feature_type, feature_ids = effect.split(' ', 4)
|
376
463
|
feature_ids = feature_ids.split(' ')
|
377
464
|
effect_uri = RDF::URI.new("#{feature_uri.to_s}/variant/#{variant_index}/effect/#{index}")
|
378
|
-
serialize_variant_triple(
|
379
|
-
|
380
|
-
|
381
|
-
|
465
|
+
serialize_variant_triple(feature_uri, RDF::URI.new("#{feature_uri.to_s}/variant/#{variant_index}"), @base.effect, effect_uri)
|
466
|
+
create_triple(effect_uri, RDF.type, @base.Effect)
|
467
|
+
create_triple(effect_uri, @base.sequenceVariant, BioInterchange::SO.send(BioInterchange.make_safe_label(sequence_variant)))
|
468
|
+
create_triple(effect_uri, @base.featureType, BioInterchange::SO.send(BioInterchange.make_safe_label(feature_type)))
|
382
469
|
feature_ids.each { |feature_id|
|
383
|
-
|
470
|
+
create_triple(effect_uri, @base.feature, feature_id)
|
384
471
|
}
|
385
472
|
}
|
386
473
|
end
|
387
474
|
|
388
475
|
# Serializes a list of variant sequences.
|
389
476
|
#
|
390
|
-
# +graph+:: RDF graph to which the structured attribute is added
|
391
477
|
# +set_uri+:: the feature set URI to which the feature belongs
|
392
478
|
# +feature_uri+:: the feature URI to the feature that is annotated with variant data
|
393
479
|
# +list+:: list of variant values
|
394
|
-
def serialize_variant_seqs(
|
480
|
+
def serialize_variant_seqs(set_uri, feature_uri, list)
|
395
481
|
list.each_index { |index|
|
396
482
|
value = list[index]
|
397
483
|
variant_uri = RDF::URI.new("#{feature_uri.to_s}/variant/#{index}")
|
398
|
-
serialize_variant_triple(
|
484
|
+
serialize_variant_triple(feature_uri, variant_uri, @base.sequence, RDF::Literal.new(value))
|
399
485
|
}
|
486
|
+
|
487
|
+
# Return the variant type based on the present sequence(s):
|
488
|
+
return @base.Variant if list.length != 2
|
489
|
+
if list[0].match(/a-zA-Z/) and list[1].match(/a-zA-Z/) then
|
490
|
+
return @base.HomozygousVariant if list[0] == list[1]
|
491
|
+
return @base.HeterozygousVariant
|
492
|
+
end
|
493
|
+
return @base.Variant
|
400
494
|
end
|
401
495
|
|
402
496
|
# Adds a variant to the graph; tracks the variant's URI so that RDF.type is only written out once.
|
403
497
|
#
|
404
|
-
# +graph+:: RDF graph to which the variant is added
|
405
498
|
# +feature_uri+:: the feature URI to the feature that is annotated with variant data
|
406
499
|
# +variant_uri+:: URI that identifies the variant in question ("subject", if you like)
|
407
500
|
# +predicate+:: predicate that describes the data being serialized
|
408
501
|
# +object+:: data to be serialized
|
409
|
-
def serialize_variant_triple(
|
502
|
+
def serialize_variant_triple(feature_uri, variant_uri, predicate, object)
|
410
503
|
unless @variants.has_key?(variant_uri.to_s) then
|
411
|
-
|
412
|
-
|
504
|
+
create_triple(feature_uri, @base.sequence_annotation, variant_uri)
|
505
|
+
create_triple(variant_uri, RDF.type, @base.Variant)
|
413
506
|
end
|
414
507
|
@variants[variant_uri.to_s] = true
|
415
|
-
|
508
|
+
create_triple(variant_uri, predicate, object)
|
509
|
+
end
|
510
|
+
|
511
|
+
# Serializes a +GFF3FeatureSequence+ object that contains the sequence for a feature object.
|
512
|
+
#
|
513
|
+
# +set_uri+:: the feature set URI to which the feature belongs
|
514
|
+
# +feature_sequence+:: a +GFF3FeatureSequence+ instance
|
515
|
+
def serialize_feature_sequence(set_uri, feature_sequence)
|
516
|
+
feature_uri = RDF::URI.new("#{set_uri.to_s}/feature/#{feature_sequence.feature_id}")
|
517
|
+
annotation_uri = RDF::URI.new("#{feature_uri.to_s}/sequence")
|
518
|
+
create_triple(feature_uri, @base.sequence_annotation, annotation_uri)
|
519
|
+
create_triple(annotation_uri, RDF.type, @base.Sequence_Annotation)
|
520
|
+
create_triple(annotation_uri, RDF::RDFS.comment, feature_sequence.comment) if feature_sequence.comment
|
521
|
+
create_triple(annotation_uri, @base.sequence, feature_sequence.sequence)
|
416
522
|
end
|
523
|
+
|
417
524
|
end
|
418
525
|
|
419
526
|
end
|