biointerchange 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.rspec +1 -0
- data/.travis.yml +12 -0
- data/Gemfile +17 -0
- data/LICENSE.txt +8 -0
- data/README.md +166 -0
- data/Rakefile +50 -0
- data/VERSION +1 -0
- data/bin/biointerchange +6 -0
- data/docs/exceptions_readme.txt +13 -0
- data/examples/BovineGenomeChrX.gff3.gz +0 -0
- data/examples/gb-2007-8-3-R40.xml +243 -0
- data/examples/pubannotation.json +1 -0
- data/generators/rdfxml.rb +104 -0
- data/lib/biointerchange/core.rb +195 -0
- data/lib/biointerchange/exceptions.rb +38 -0
- data/lib/biointerchange/genomics/gff3_feature.rb +82 -0
- data/lib/biointerchange/genomics/gff3_feature_set.rb +37 -0
- data/lib/biointerchange/genomics/gff3_rdf_ntriples.rb +107 -0
- data/lib/biointerchange/genomics/gff3_reader.rb +86 -0
- data/lib/biointerchange/gff3.rb +135 -0
- data/lib/biointerchange/reader.rb +25 -0
- data/lib/biointerchange/registry.rb +29 -0
- data/lib/biointerchange/sio.rb +7124 -0
- data/lib/biointerchange/sofa.rb +1566 -0
- data/lib/biointerchange/textmining/content.rb +69 -0
- data/lib/biointerchange/textmining/document.rb +36 -0
- data/lib/biointerchange/textmining/pdfx_xml_reader.rb +161 -0
- data/lib/biointerchange/textmining/process.rb +57 -0
- data/lib/biointerchange/textmining/pubannos_json_reader.rb +72 -0
- data/lib/biointerchange/textmining/text_mining_rdf_ntriples.rb +197 -0
- data/lib/biointerchange/textmining/text_mining_reader.rb +41 -0
- data/lib/biointerchange/writer.rb +23 -0
- data/lib/biointerchange.rb +3 -0
- data/spec/exceptions_spec.rb +27 -0
- data/spec/gff3_rdfwriter_spec.rb +67 -0
- data/spec/text_mining_pdfx_xml_reader_spec.rb +89 -0
- data/spec/text_mining_pubannos_json_reader_spec.rb +71 -0
- data/spec/text_mining_rdfwriter_spec.rb +57 -0
- data/web/about.html +89 -0
- data/web/biointerchange.js +133 -0
- data/web/bootstrap/css/bootstrap-responsive.css +1040 -0
- data/web/bootstrap/css/bootstrap-responsive.min.css +9 -0
- data/web/bootstrap/css/bootstrap.css +5624 -0
- data/web/bootstrap/css/bootstrap.min.css +9 -0
- data/web/bootstrap/img/glyphicons-halflings-white.png +0 -0
- data/web/bootstrap/img/glyphicons-halflings.png +0 -0
- data/web/bootstrap/js/bootstrap.js +2027 -0
- data/web/bootstrap/js/bootstrap.min.js +6 -0
- data/web/bootstrap/js/jquery-1.8.1.min.js +2 -0
- data/web/css/rdoc-style.css +5786 -0
- data/web/css/rdoc.css +716 -0
- data/web/images/BioInterchange300.png +0 -0
- data/web/index.html +109 -0
- data/web/service/rdfizer.fcgi +68 -0
- data/web/webservices.html +123 -0
- metadata +240 -0
@@ -0,0 +1,195 @@
|
|
1
|
+
module BioInterchange
|
2
|
+
|
3
|
+
# Custom Exceptions and Errors
|
4
|
+
require 'biointerchange/exceptions'
|
5
|
+
|
6
|
+
# Ontologies (besides the ones from the 'rdf' gem)
|
7
|
+
require 'biointerchange/gff3'
|
8
|
+
require 'biointerchange/sio'
|
9
|
+
require 'biointerchange/sofa'
|
10
|
+
|
11
|
+
# Reader/writer interfaces
|
12
|
+
require 'biointerchange/reader'
|
13
|
+
require 'biointerchange/writer'
|
14
|
+
|
15
|
+
#
|
16
|
+
# TEXT MINING
|
17
|
+
#
|
18
|
+
|
19
|
+
# Text mining readers
|
20
|
+
require 'biointerchange/textmining/text_mining_reader'
|
21
|
+
require 'biointerchange/textmining/pubannos_json_reader'
|
22
|
+
require 'biointerchange/textmining/pdfx_xml_reader'
|
23
|
+
|
24
|
+
# Text mining model
|
25
|
+
require 'biointerchange/textmining/document'
|
26
|
+
require 'biointerchange/textmining/content'
|
27
|
+
require 'biointerchange/textmining/process'
|
28
|
+
|
29
|
+
# Text mining writers
|
30
|
+
require 'biointerchange/textmining/text_mining_rdf_ntriples'
|
31
|
+
|
32
|
+
#
|
33
|
+
# GENOMICS
|
34
|
+
#
|
35
|
+
|
36
|
+
# GFF3 reader
|
37
|
+
require 'biointerchange/genomics/gff3_reader'
|
38
|
+
|
39
|
+
# Feature base model
|
40
|
+
require 'biointerchange/genomics/gff3_feature_set'
|
41
|
+
require 'biointerchange/genomics/gff3_feature'
|
42
|
+
|
43
|
+
# GFF3 writer
|
44
|
+
require 'biointerchange/genomics/gff3_rdf_ntriples'
|
45
|
+
|
46
|
+
#
|
47
|
+
# ACTUAL COMMAND LINE IMPLEMENTATION
|
48
|
+
#
|
49
|
+
|
50
|
+
# Option parsing
|
51
|
+
require 'getopt/long'
|
52
|
+
|
53
|
+
def self.cli
  # Command line entry point: parses the options, validates the requested
  # input/output format combination, deserializes the input into a model and
  # serializes that model as RDF.
  #
  # Exit codes: 1 for usage, argument and file access errors, 2 for input
  # format errors.

  # File handles opened here; they are closed again in the ensure clause.
  opened_files = []
  opt = nil

  opt = Getopt::Long.getopts(
    ["--help", "-h", Getopt::BOOLEAN],
    ["--debug", "-d", Getopt::BOOLEAN],  # set debug mode => print stack traces
    ["--input", "-i", Getopt::REQUIRED], # input file format
    ["--rdf", "-r", Getopt::REQUIRED],   # output file format
    ["--name", Getopt::OPTIONAL],        # name of resource/tool/person
    ["--name_id", Getopt::OPTIONAL],     # uri of resource/tool/person
    ["--date", "-t", Getopt::OPTIONAL],  # date of processing/annotation
    ["--version", "-v", Getopt::OPTIONAL], # version number of resource
    ["--file", "-f", Getopt::OPTIONAL],  # file to read, will read from STDIN if not supplied
    ["--out", "-o", Getopt::OPTIONAL]    # output file, will out to STDOUT if not supplied
  )

  if opt['help'] || !opt['input'] || !opt['rdf']
    puts "Usage: ruby #{$0} -i <format> -r <format> [options]"
    puts 'Supported input formats (--input <format>/-i <format>):'
    puts ' biointerchange.gff3 : GFF3'
    puts ' dbcls.catanns.json : PubAnnotation JSON'
    puts ' uk.ac.man.pdfx : PDFx XML'
    puts 'Supported output formats (--rdf <format>/-r <format>)'
    puts ' rdf.biointerchange.gff3 : RDF N-Triples for input'
    puts ' biointerchange.gff3'
    puts ' rdf.bh12.sio : RDF N-Triples for inputs'
    puts ' dbcls.catanns.json'
    puts ' uk.ac.man.pdfx'
    puts 'I/O options:'
    puts ' -f <file>/--file <file> : file to read; STDIN used if not supplied'
    puts ' -o <file>/--out <file> : output file; STDOUT used if not supplied'
    puts 'Input-/RDF-format specific options:'
    puts ' Input: dbcls.catanns.json, uk.ac.man.pdfx'
    puts ' Output: rdf.bh12.sio'
    puts ' Options:'
    puts ' -t <date>/--date <date> : date of processing/annotation (optional)'
    puts ' -v <version>/--version <version> : version number of resource (optional)'
    puts ' --name <name> : name of resource/tool/person (required)'
    puts ' --name_id <id> : URI of resource/tool/person (required)'
    puts 'Input-/RDF-format specific options:'
    puts ' Input: biointerchange.gff3'
    puts ' Output: rdf.biointerchange.gff3'
    puts ' Options:'
    puts ' -t <date>/--date <date> : date when the GFF3 file was created (optional)'
    puts ' --name <name> : name of the GFF3 file creator (optional)'
    puts ' --name_id <id> : email address of the GFF3 file creator (optional)'
    puts 'Other options:'
    puts ' -d / --debug : turn on debugging output (for stacktraces)'
    puts ' -h --help : this message'

    exit 1
  end

  # Validate the input/output format combination before touching any file.
  if opt['input'] == 'dbcls.catanns.json' || opt['input'] == 'uk.ac.man.pdfx'
    if opt['rdf'] == 'rdf.bh12.sio'
      raise ArgumentError, 'Require --name and --name_id options to specify source of annotations (e.g., a manual annotators name, or software tool name) and their associated URI (e.g., email address, or webaddress).' unless opt['name'] && opt['name_id']
    else
      unsupported_combination
    end
  elsif opt['input'] == 'biointerchange.gff3'
    # No further arguments are required for the GFF3 combination.
    unsupported_combination unless opt['rdf'] == 'rdf.biointerchange.gff3'
  else
    unsupported_combination
  end

  opt['date'] = nil unless opt['date']
  opt['version'] = nil unless opt['version']

  # generate model from file (deserialise)
  reader = nil
  if opt['input'] == 'dbcls.catanns.json'
    reader = BioInterchange::TextMining::PubannosJsonReader.new(opt['name'], opt['name_id'], opt['date'], BioInterchange::TextMining::Process::UNSPECIFIED, opt['version'])
  elsif opt['input'] == 'uk.ac.man.pdfx'
    reader = BioInterchange::TextMining::PdfxXmlReader.new(opt['name'], opt['name_id'], opt['date'], BioInterchange::TextMining::Process::UNSPECIFIED, opt['version'])
  elsif opt['input'] == 'biointerchange.gff3'
    reader = BioInterchange::Genomics::GFF3Reader.new(opt['name'], opt['name_id'], opt['date'])
  end

  input = STDIN
  if opt['file']
    input = File.new(opt['file'], 'r')
    opened_files << input
  end
  model = reader.deserialize(input)

  # generate rdf from model (serialise); the output file is only opened
  # after the input was read successfully, so a failed parse does not
  # truncate an existing output file.
  writer_class = nil
  writer_class = BioInterchange::TextMining::RDFWriter if opt['rdf'] == 'rdf.bh12.sio'
  writer_class = BioInterchange::Genomics::RDFWriter if opt['rdf'] == 'rdf.biointerchange.gff3'

  output = STDOUT
  if opt['out']
    output = File.new(opt['out'], 'w')
    opened_files << output
  end
  writer = writer_class.new(output)

  writer.serialize(model)
rescue ArgumentError => e
  $stderr.puts e.message
  # opt is still nil when the failure happened during option parsing.
  $stderr.puts e.backtrace if opt && opt['debug']
  exit 1
rescue Getopt::Long::Error => e
  # Option parsing failed, so no --debug flag is available here.
  $stderr.puts e.message
  exit 1
rescue BioInterchange::Exceptions::InputFormatError => e
  $stderr.puts e.message
  $stderr.puts e.backtrace if opt && opt['debug']
  exit 2
rescue SystemCallError => e
  # Missing or unreadable input file, unwritable output file, etc.
  $stderr.puts e.message
  $stderr.puts e.backtrace if opt && opt['debug']
  exit 1
ensure
  opened_files.each { |file| file.close unless file.closed? }
end
|
169
|
+
|
170
|
+
#
|
171
|
+
# Helper functions
|
172
|
+
#
|
173
|
+
|
174
|
+
# Returns the values of several named parameters.
|
175
|
+
#
|
176
|
+
# +map+:: a map of named parameters and their values
|
177
|
+
# +parameters+:: the names of the parameter values we are interested in
|
178
|
+
def self.get_parameters(map, parameters)
  # Plain entries are looked up in +map+. An Array entry has the form
  # [callable, name, ...]: the names are resolved recursively and the
  # resulting values are passed to the callable, whose result is used.
  resolved = []
  parameters.each do |entry|
    if entry.instance_of?(Array)
      nested_values = BioInterchange.get_parameters(map, entry[1..-1])
      resolved << entry[0].call(*nested_values)
    else
      resolved << map[entry]
    end
  end
  resolved
end
|
187
|
+
|
188
|
+
class << self

  private

  # Signals that the requested input/output format combination is not
  # supported. Defined inside the singleton class because a bare +private+
  # does not change the visibility of methods defined with +def self.+.
  def unsupported_combination
    raise ArgumentError, 'This input/output format combination is not supported.'
  end

end
|
193
|
+
|
194
|
+
end
|
195
|
+
|
@@ -0,0 +1,38 @@
|
|
1
|
+
module BioInterchange::Exceptions

  # Root of the framework's own error hierarchy.
  class BioInterchangeError < StandardError; end

  # Raised for problems with the format of an input file.
  # This error is caught at the highest level, where only its message is
  # shown to the end user (not the backtrace). That keeps the framework
  # user friendly, so meaningful error messages are strongly encouraged.
  class InputFormatError < BioInterchangeError; end

  # Base class for errors that should only occur while the framework is
  # being implemented or extended, e.g. a method receiving something it
  # cannot handle. Once such an implementation/extension is complete these
  # errors should no longer be possible. The framework does not rescue
  # them, so the backtrace stays available for debugging.
  class ImplementationError < BioInterchangeError; end

  # Implementation errors originating from readers.
  class ImplementationReaderError < ImplementationError; end

  # Implementation errors originating from models.
  class ImplementationModelError < ImplementationError; end

  # Implementation errors originating from writers.
  class ImplementationWriterError < ImplementationError; end

end
|
@@ -0,0 +1,82 @@
|
|
1
|
+
module BioInterchange::Genomics
|
2
|
+
|
3
|
+
class GFF3Feature

  # Constants determining the strand of the feature.
  NOT_STRANDED = 0
  UNKNOWN = 1
  POSITIVE = 2
  NEGATIVE = 3

  # Plain readers for the values handed to the constructor:
  #
  # +sequence_id+:: identifier that determines the coordinate system for the feature
  # +source+:: textual description of the origin of this feature
  # +type+:: feature type, either a SOFA URI or a textual description
  # +start_coordinate+:: start coordinate (equal to or smaller than the end coordinate)
  # +end_coordinate+:: end coordinate (equal to or larger than the start coordinate)
  # +score+:: floating point score, ideally an E-value or P-value
  # +strand+:: one of the strand constants above
  # +phase+:: phase of the feature, if it has one
  attr_reader :sequence_id, :source, :type,
              :start_coordinate, :end_coordinate,
              :score, :strand, :phase

  # Builds the representation of one feature, i.e. of one line of a GFF3 file.
  #
  # +sequence_id+:: an identifier that determines the coordinate system for the feature
  # +source+:: a text description of the origin of this feature description
  # +type+:: either a SOFA accession, SOFA term, or textual description (the former are URIs, the latter is a string)
  # +start_coordinate+:: an integer denoting the start coordinate of the feature
  # +end_coordinate+:: an integer denoting the end coordinate of the feature, which is equal or larger than the start coordinate
  # +score+:: a floating point score
  # +strand+:: NOT_STRANDED, UNKNOWN, POSITIVE or NEGATIVE
  # +phase+:: an integer determining the phase of the feature, if the feature has a phase
  # +attributes+:: a map of additional attributes associated with the feature
  def initialize(sequence_id, source, type, start_coordinate, end_coordinate, score = nil, strand = NOT_STRANDED, phase = nil, attributes = {})
    @sequence_id, @source, @type = sequence_id, source, type
    @start_coordinate, @end_coordinate = start_coordinate, end_coordinate
    @score, @strand, @phase = score, strand, phase
    @attributes = attributes
  end

  # Returns the map of additional attributes for this feature. The map is
  # frozen (in place) before it is handed out.
  def attributes
    @attributes.freeze
  end

end
|
80
|
+
|
81
|
+
end
|
82
|
+
|
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'digest/sha1'
|
2
|
+
|
3
|
+
module BioInterchange::Genomics
|
4
|
+
|
5
|
+
class GFF3FeatureSet

  # Sets up an empty Generic Feature Format Version 3 (GFF3) feature set,
  # which can hold multiple GFF3 features.
  def initialize
    # Features are kept as hash keys, so adding the same feature twice
    # stores it only once.
    @set = {}
  end

  # Returns the features of this set as an array.
  def contents
    @set.keys
  end

  # Returns a URI for this feature set. The URI ends in the SHA1 hex digest
  # of one tab-separated, newline-terminated record per feature.
  def uri
    records = contents.map { |feature|
      "#{feature.sequence_id}\t#{feature.source}\t#{feature.type}\t#{feature.start_coordinate}\t#{feature.end_coordinate}\t#{feature.score}\t#{feature.strand}\t#{feature.phase}\t#{feature.attributes.keys.map { |tag| "#{tag}=#{feature.attributes[tag]}" }.join(';')}\n"
    }
    checksum = Digest::SHA1.hexdigest(records.join)
    "biointerchange://gff3/featureset/self/#{checksum}"
  end

  # Adds a feature to the feature set.
  #
  # +feature+:: feature instance that becomes part of the contents of this set
  def add(feature)
    @set[feature] = true
  end

end
|
36
|
+
|
37
|
+
end
|
@@ -0,0 +1,107 @@
|
|
1
|
+
require 'rdf'
|
2
|
+
require 'rdf/ntriples'
|
3
|
+
require 'date'
|
4
|
+
|
5
|
+
module BioInterchange::Genomics
|
6
|
+
|
7
|
+
class RDFWriter

  # Creates a new instance of a RDFWriter that will use the provided output stream to serialize RDF.
  #
  # +ostream+:: instance of an IO class or derivative that is used for RDF serialization
  #
  # Raises BioInterchange::Exceptions::ImplementationWriterError if +ostream+ is not an IO.
  def initialize(ostream)
    raise BioInterchange::Exceptions::ImplementationWriterError, 'The output stream is not an instance of IO or its subclasses.' unless ostream.kind_of?(IO)
    @ostream = ostream
  end

  # Serialize a model as RDF N-Triples on the output stream.
  #
  # +model+:: an instance of BioInterchange::Genomics::GFF3FeatureSet
  #
  # Raises BioInterchange::Exceptions::ImplementationWriterError for any other model class.
  def serialize(model)
    if model.instance_of?(BioInterchange::Genomics::GFF3FeatureSet)
      serialize_model(model)
    else
      raise BioInterchange::Exceptions::ImplementationWriterError, 'The provided model cannot be serialized. ' +
                                                                   'This writer supports serialization for BioInterchange::Genomics::GFF3FeatureSet.'
    end
  end

  private

  # Serializes RDF for a feature set representation.
  #
  # +model+:: an instance of +BioInterchange::Genomics::GFF3FeatureSet+
  def serialize_model(model)
    graph = RDF::Graph.new
    set_uri = RDF::URI.new(model.uri)
    graph.insert(RDF::Statement.new(set_uri, RDF.type, BioInterchange::GFF3.Set))
    model.contents.each { |feature| serialize_feature(graph, set_uri, feature) }
    RDF::NTriples::Writer.dump(graph, @ostream)
  end

  # Serializes a +GFF3Feature+ object for a given feature set URI.
  #
  # +graph+:: RDF graph to which the feature is added
  # +set_uri+:: the feature set URI to which the feature belongs to
  # +feature+:: a +GFF3Feature+ instance
  #
  # Raises ArgumentError if the strand of the feature is none of the
  # GFF3Feature strand constants.
  def serialize_feature(graph, set_uri, feature)
    # TODO Make sure there is only one value in the 'ID' list.
    if feature.attributes.has_key?('ID')
      feature_uri = RDF::URI.new("#{set_uri.to_s}/feature/#{feature.attributes['ID'][0]}")
    else
      # Without an ID the URI is built from the feature's own columns.
      feature_uri = RDF::URI.new("#{set_uri.to_s}/feature/#{feature.sequence_id},#{feature.source},#{feature.type.to_s.sub(/^[^:]+:\/\//, '')},#{feature.start_coordinate},#{feature.end_coordinate},#{feature.strand},#{feature.phase}")
    end

    graph.insert(RDF::Statement.new(set_uri, BioInterchange::GFF3.contains, feature_uri))
    graph.insert(RDF::Statement.new(feature_uri, RDF.type, BioInterchange::GFF3.Feature))
    graph.insert(RDF::Statement.new(feature_uri, BioInterchange::GFF3.seqid, RDF::Literal.new(feature.sequence_id)))
    graph.insert(RDF::Statement.new(feature_uri, BioInterchange::GFF3.source, RDF::Literal.new(feature.source)))
    graph.insert(RDF::Statement.new(feature_uri, BioInterchange::GFF3.type, RDF::Literal.new(feature.type)))
    graph.insert(RDF::Statement.new(feature_uri, BioInterchange::GFF3.start, RDF::Literal.new(feature.start_coordinate)))
    graph.insert(RDF::Statement.new(feature_uri, BioInterchange::GFF3.end, RDF::Literal.new(feature.end_coordinate)))
    graph.insert(RDF::Statement.new(feature_uri, BioInterchange::GFF3.score, RDF::Literal.new(feature.score))) if feature.score

    case feature.strand
    when BioInterchange::Genomics::GFF3Feature::NOT_STRANDED
      graph.insert(RDF::Statement.new(feature_uri, BioInterchange::GFF3.strand, BioInterchange::GFF3.NotStranded))
    when BioInterchange::Genomics::GFF3Feature::UNKNOWN
      graph.insert(RDF::Statement.new(feature_uri, BioInterchange::GFF3.strand, BioInterchange::GFF3.UnknownStrand))
    when BioInterchange::Genomics::GFF3Feature::POSITIVE
      graph.insert(RDF::Statement.new(feature_uri, BioInterchange::GFF3.strand, BioInterchange::GFF3.Positive))
    when BioInterchange::Genomics::GFF3Feature::NEGATIVE
      graph.insert(RDF::Statement.new(feature_uri, BioInterchange::GFF3.strand, BioInterchange::GFF3.Negative))
    else
      # ArgumentError is the Ruby core class; ArgumentException does not exist.
      raise ArgumentError, 'Strand of feature is set to an unknown constant.'
    end

    graph.insert(RDF::Statement.new(feature_uri, BioInterchange::GFF3.phase, RDF::Literal.new(feature.phase))) if feature.phase

    serialize_attributes(graph, set_uri, feature_uri, feature.attributes) unless feature.attributes.empty?
  end

  # Serializes the attributes of a feature.
  #
  # +graph+:: RDF graph to which the feature is added
  # +set_uri+:: URI of the set these attributes belong to (implicit due to feature)
  # +feature_uri+:: the feature URI to which the attributes belong to
  # +attributes+:: a map of tag/value-list pairs
  def serialize_attributes(graph, set_uri, feature_uri, attributes)
    attributes.each_pair { |tag, list|
      if tag == 'Parent'
        # Parents are linked as feature URIs within the same set.
        list.each { |parent_id|
          graph.insert(RDF::Statement.new(feature_uri, BioInterchange::GFF3.parent, RDF::URI.new("#{set_uri.to_s}/feature/#{parent_id}")))
        }
      else
        list.each_with_index { |value, index|
          # Multi-valued tags get a 1-based numeric suffix per value.
          if list.size == 1
            attribute_uri = RDF::URI.new("#{feature_uri.to_s}/attribute/#{tag}")
          else
            attribute_uri = RDF::URI.new("#{feature_uri.to_s}/attribute/#{tag}-#{index + 1}")
          end
          graph.insert(RDF::Statement.new(feature_uri, BioInterchange::GFF3.attributes, attribute_uri))
          graph.insert(RDF::Statement.new(attribute_uri, RDF.type, BioInterchange::GFF3.Attribute))
          graph.insert(RDF::Statement.new(attribute_uri, BioInterchange::GFF3.tag, RDF::Literal.new("#{tag}")))
          graph.insert(RDF::Statement.new(attribute_uri, RDF.value, RDF::Literal.new(value)))
        }
      end
    }
  end

end
|
105
|
+
|
106
|
+
end
|
107
|
+
|
@@ -0,0 +1,86 @@
|
|
1
|
+
module BioInterchange::Genomics
|
2
|
+
|
3
|
+
class GFF3Reader

  # Creates a new instance of a Generic Feature Format Version 3 (GFF3) reader.
  #
  # +name+:: Optional name of the person who generated the GFF3 file.
  # +name_uri+:: Optional e-mail address of the person who generated the GFF3 file.
  # +date+:: Optional date of when the GFF3 file was produced.
  def initialize(name = nil, name_uri = nil, date = nil)
    @name = name
    @name_uri = name_uri
    @date = date
  end

  # Reads a GFF3 file from the input stream and returns an associated model
  # (a BioInterchange::Genomics::GFF3FeatureSet).
  #
  # +inputstream+:: an instance of class IO or String that holds the contents of a GFF3 file
  #
  # Raises BioInterchange::Exceptions::ImplementationReaderError for any other
  # kind of input and BioInterchange::Exceptions::InputFormatError for
  # malformed feature lines.
  def deserialize(inputstream)
    if inputstream.kind_of?(IO)
      create_model(inputstream.read)
    elsif inputstream.kind_of?(String)
      create_model(inputstream)
    else
      raise BioInterchange::Exceptions::ImplementationReaderError, 'The provided input stream needs to be either of type IO or String.'
    end
  end

  private

  # Builds the feature set from the text of a GFF3 file.
  #
  # +gff3+:: contents of a GFF3 file as a String
  def create_model(gff3)
    feature_set = BioInterchange::Genomics::GFF3FeatureSet.new
    gff3.each_line { |line|
      # The ##FASTA directive ends the annotation part of the file; what
      # follows is sequence data, not feature lines.
      break if line.start_with?('##FASTA')
      next if line.start_with?('#')

      line = line.chomp
      next if line.strip.empty?

      feature_set.add(parse_feature(line))
    }

    feature_set
  end

  # Turns one tab-separated feature line into a GFF3Feature.
  #
  # +line+:: a feature line without its line terminator
  def parse_feature(line)
    # Limit -1 keeps trailing empty columns.
    columns = line.split("\t", -1)
    raise BioInterchange::Exceptions::InputFormatError, "Malformed GFF3 feature line (expected at least 8 tab-separated columns): #{line}" if columns.size < 8
    seqid, source, type, start_coordinate, end_coordinate, score, strand, phase, attributes = columns

    # String to numeric value conversions:
    start_coordinate = start_coordinate.to_i
    end_coordinate = end_coordinate.to_i
    score = score == '.' ? nil : score.to_f

    # Set phase, if it lies in the permissible range of values:
    phase = ['0', '1', '2'].include?(phase) ? phase.to_i : nil

    BioInterchange::Genomics::GFF3Feature.new(seqid, source, resolve_type(type), start_coordinate, end_coordinate, score, resolve_strand(strand), phase, parse_attributes(attributes))
  end

  # Resolves the type column: a SO/SOFA accession becomes an OBO PURL URI, a
  # SOFA term becomes whatever BioInterchange::SOFA returns for it, and
  # anything else stays a string.
  #
  # +type+:: contents of the type column
  def resolve_type(type)
    if type.match(/SO:\d+/)
      RDF::URI.new("http://purl.obolibrary.org/obo/#{type.sub(':', '_')}")
    elsif BioInterchange::SOFA.methods.include?(type.gsub(' ', '_').to_sym)
      # NOTE(review): +methods+ also lists inherited public methods (e.g.
      # +name+, +hash+), so such type strings are dispatched as well --
      # consider restricting this to the SOFA term methods.
      BioInterchange::SOFA.send(type.gsub(' ', '_'))
    else
      type
    end
  end

  # Maps the strand column onto the GFF3Feature strand constants; anything
  # other than '?', '+' and '-' counts as not stranded.
  #
  # +strand+:: contents of the strand column
  def resolve_strand(strand)
    case strand
    when '?' then BioInterchange::Genomics::GFF3Feature::UNKNOWN
    when '+' then BioInterchange::Genomics::GFF3Feature::POSITIVE
    when '-' then BioInterchange::Genomics::GFF3Feature::NEGATIVE
    else BioInterchange::Genomics::GFF3Feature::NOT_STRANDED
    end
  end

  # Parses the attributes column ("tag=value,value;tag=value") into a map
  # from tag to list of values. A missing, empty or '.' column yields an
  # empty map.
  #
  # +column+:: contents of the attributes column, or nil
  def parse_attributes(column)
    attributes = {}
    return attributes if column.nil? || column.strip.empty? || column.strip == '.'

    column.split(';').each { |assignment|
      next if assignment.strip.empty?

      match = assignment.match(/([^=]+)=(.+)/)
      raise BioInterchange::Exceptions::InputFormatError, "Malformed GFF3 attribute (expected tag=value): #{assignment}" unless match
      attributes[match[1].strip] = match[2].split(',').map { |value| value.strip }
    }
    attributes
  end

end
|
84
|
+
|
85
|
+
end
|
86
|
+
|
@@ -0,0 +1,135 @@
|
|
1
|
+
module BioInterchange
|
2
|
+
|
3
|
+
class GFF3

  # Each term accessor below returns a fresh RDF::URI for one term of the
  # http://www.sequenceontology.org/gff3# vocabulary.
  class << self

    def strand
      RDF::URI.new('http://www.sequenceontology.org/gff3#GFF3_0010')
    end

    def attributes
      RDF::URI.new('http://www.sequenceontology.org/gff3#GFF3_0012')
    end

    def parent
      RDF::URI.new('http://www.sequenceontology.org/gff3#GFF3_0014')
    end

    def contains
      RDF::URI.new('http://www.sequenceontology.org/gff3#GFF3_0015')
    end

    def seqid
      RDF::URI.new('http://www.sequenceontology.org/gff3#GFF3_0004')
    end

    def source
      RDF::URI.new('http://www.sequenceontology.org/gff3#GFF3_0005')
    end

    def type
      RDF::URI.new('http://www.sequenceontology.org/gff3#GFF3_0006')
    end

    def start
      RDF::URI.new('http://www.sequenceontology.org/gff3#GFF3_0007')
    end

    def end
      RDF::URI.new('http://www.sequenceontology.org/gff3#GFF3_0008')
    end

    def score
      RDF::URI.new('http://www.sequenceontology.org/gff3#GFF3_0009')
    end

    def phase
      RDF::URI.new('http://www.sequenceontology.org/gff3#GFF3_0011')
    end

    def tag
      RDF::URI.new('http://www.sequenceontology.org/gff3#GFF3_0013')
    end

    def Set
      RDF::URI.new('http://www.sequenceontology.org/gff3#GFF3_0001')
    end

    def Feature
      RDF::URI.new('http://www.sequenceontology.org/gff3#GFF3_0002')
    end

    def Attribute
      RDF::URI.new('http://www.sequenceontology.org/gff3#GFF3_0003')
    end

    def Strand
      RDF::URI.new('http://www.sequenceontology.org/gff3#GFF3_0016')
    end

    def Positive
      RDF::URI.new('http://www.sequenceontology.org/gff3#GFF3_0017')
    end

    def Negative
      RDF::URI.new('http://www.sequenceontology.org/gff3#GFF3_0018')
    end

    def UnknownStrand
      RDF::URI.new('http://www.sequenceontology.org/gff3#GFF3_0019')
    end

    def NotStranded
      RDF::URI.new('http://www.sequenceontology.org/gff3#GFF3_0020')
    end

    # Tells whether the given URI is one of the object properties.
    #
    # +uri+:: URI that is tested for being an object property
    def is_object_property?(uri)
      [
        'http://www.sequenceontology.org/gff3#GFF3_0010',
        'http://www.sequenceontology.org/gff3#GFF3_0012',
        'http://www.sequenceontology.org/gff3#GFF3_0014',
        'http://www.sequenceontology.org/gff3#GFF3_0015'
      ].any? { |term| uri == RDF::URI.new(term) }
    end

    # Tells whether the given URI is one of the datatype properties.
    #
    # +uri+:: URI that is tested for being a datatype property
    def is_datatype_property?(uri)
      [
        'http://www.sequenceontology.org/gff3#GFF3_0004',
        'http://www.sequenceontology.org/gff3#GFF3_0005',
        'http://www.sequenceontology.org/gff3#GFF3_0006',
        'http://www.sequenceontology.org/gff3#GFF3_0007',
        'http://www.sequenceontology.org/gff3#GFF3_0008',
        'http://www.sequenceontology.org/gff3#GFF3_0009',
        'http://www.sequenceontology.org/gff3#GFF3_0011',
        'http://www.sequenceontology.org/gff3#GFF3_0013'
      ].any? { |term| uri == RDF::URI.new(term) }
    end

    # Tells whether the given URI is one of the classes.
    #
    # +uri+:: URI that is tested for being a class
    def is_class?(uri)
      [
        'http://www.sequenceontology.org/gff3#GFF3_0001',
        'http://www.sequenceontology.org/gff3#GFF3_0002',
        'http://www.sequenceontology.org/gff3#GFF3_0003',
        'http://www.sequenceontology.org/gff3#GFF3_0016'
      ].any? { |term| uri == RDF::URI.new(term) }
    end

    # Tells whether the given URI is one of the named individuals.
    #
    # +uri+:: URI that is tested for being a named individual
    def is_named_individual?(uri)
      [
        'http://www.sequenceontology.org/gff3#GFF3_0017',
        'http://www.sequenceontology.org/gff3#GFF3_0018',
        'http://www.sequenceontology.org/gff3#GFF3_0019',
        'http://www.sequenceontology.org/gff3#GFF3_0020'
      ].any? { |term| uri == RDF::URI.new(term) }
    end

  end

end
|
134
|
+
|
135
|
+
end
|