biointerchange 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. data/.document +5 -0
  2. data/.rspec +1 -0
  3. data/.travis.yml +12 -0
  4. data/Gemfile +17 -0
  5. data/LICENSE.txt +8 -0
  6. data/README.md +166 -0
  7. data/Rakefile +50 -0
  8. data/VERSION +1 -0
  9. data/bin/biointerchange +6 -0
  10. data/docs/exceptions_readme.txt +13 -0
  11. data/examples/BovineGenomeChrX.gff3.gz +0 -0
  12. data/examples/gb-2007-8-3-R40.xml +243 -0
  13. data/examples/pubannotation.json +1 -0
  14. data/generators/rdfxml.rb +104 -0
  15. data/lib/biointerchange/core.rb +195 -0
  16. data/lib/biointerchange/exceptions.rb +38 -0
  17. data/lib/biointerchange/genomics/gff3_feature.rb +82 -0
  18. data/lib/biointerchange/genomics/gff3_feature_set.rb +37 -0
  19. data/lib/biointerchange/genomics/gff3_rdf_ntriples.rb +107 -0
  20. data/lib/biointerchange/genomics/gff3_reader.rb +86 -0
  21. data/lib/biointerchange/gff3.rb +135 -0
  22. data/lib/biointerchange/reader.rb +25 -0
  23. data/lib/biointerchange/registry.rb +29 -0
  24. data/lib/biointerchange/sio.rb +7124 -0
  25. data/lib/biointerchange/sofa.rb +1566 -0
  26. data/lib/biointerchange/textmining/content.rb +69 -0
  27. data/lib/biointerchange/textmining/document.rb +36 -0
  28. data/lib/biointerchange/textmining/pdfx_xml_reader.rb +161 -0
  29. data/lib/biointerchange/textmining/process.rb +57 -0
  30. data/lib/biointerchange/textmining/pubannos_json_reader.rb +72 -0
  31. data/lib/biointerchange/textmining/text_mining_rdf_ntriples.rb +197 -0
  32. data/lib/biointerchange/textmining/text_mining_reader.rb +41 -0
  33. data/lib/biointerchange/writer.rb +23 -0
  34. data/lib/biointerchange.rb +3 -0
  35. data/spec/exceptions_spec.rb +27 -0
  36. data/spec/gff3_rdfwriter_spec.rb +67 -0
  37. data/spec/text_mining_pdfx_xml_reader_spec.rb +89 -0
  38. data/spec/text_mining_pubannos_json_reader_spec.rb +71 -0
  39. data/spec/text_mining_rdfwriter_spec.rb +57 -0
  40. data/web/about.html +89 -0
  41. data/web/biointerchange.js +133 -0
  42. data/web/bootstrap/css/bootstrap-responsive.css +1040 -0
  43. data/web/bootstrap/css/bootstrap-responsive.min.css +9 -0
  44. data/web/bootstrap/css/bootstrap.css +5624 -0
  45. data/web/bootstrap/css/bootstrap.min.css +9 -0
  46. data/web/bootstrap/img/glyphicons-halflings-white.png +0 -0
  47. data/web/bootstrap/img/glyphicons-halflings.png +0 -0
  48. data/web/bootstrap/js/bootstrap.js +2027 -0
  49. data/web/bootstrap/js/bootstrap.min.js +6 -0
  50. data/web/bootstrap/js/jquery-1.8.1.min.js +2 -0
  51. data/web/css/rdoc-style.css +5786 -0
  52. data/web/css/rdoc.css +716 -0
  53. data/web/images/BioInterchange300.png +0 -0
  54. data/web/index.html +109 -0
  55. data/web/service/rdfizer.fcgi +68 -0
  56. data/web/webservices.html +123 -0
  57. metadata +240 -0
@@ -0,0 +1,195 @@
1
+ module BioInterchange
2
+
3
+ # Custom Exceptions and Errors
4
+ require 'biointerchange/exceptions'
5
+
6
+ # Ontologies (besides the ones from the 'rdf' gem)
7
+ require 'biointerchange/gff3'
8
+ require 'biointerchange/sio'
9
+ require 'biointerchange/sofa'
10
+
11
+ # Reader/writer interfaces
12
+ require 'biointerchange/reader'
13
+ require 'biointerchange/writer'
14
+
15
+ #
16
+ # TEXT MINING
17
+ #
18
+
19
+ # Text mining readers
20
+ require 'biointerchange/textmining/text_mining_reader'
21
+ require 'biointerchange/textmining/pubannos_json_reader'
22
+ require 'biointerchange/textmining/pdfx_xml_reader'
23
+
24
+ # Text mining model
25
+ require 'biointerchange/textmining/document'
26
+ require 'biointerchange/textmining/content'
27
+ require 'biointerchange/textmining/process'
28
+
29
+ # Text mining writers
30
+ require 'biointerchange/textmining/text_mining_rdf_ntriples'
31
+
32
+ #
33
+ # GENOMICS
34
+ #
35
+
36
+ # GFF3 reader
37
+ require 'biointerchange/genomics/gff3_reader'
38
+
39
+ # Feature base model
40
+ require 'biointerchange/genomics/gff3_feature_set'
41
+ require 'biointerchange/genomics/gff3_feature'
42
+
43
+ # GFF3 writer
44
+ require 'biointerchange/genomics/gff3_rdf_ntriples'
45
+
46
+ #
47
+ # ACTUAL COMMAND LINE IMPLEMENTATION
48
+ #
49
+
50
+ # Option parsing
51
+ require 'getopt/long'
52
+
53
# Command line entry point of BioInterchange.
#
# Parses the command line options, selects the reader that matches --input
# and the writer that matches --rdf, deserializes the input (from --file, or
# STDIN if no file is given) into a model and serializes that model as RDF
# (to --out, or STDOUT if no output file is given).
#
# The process is terminated with exit status 1 after printing the usage text
# and on option/argument errors, and with exit status 2 when the input data
# triggers a BioInterchange::Exceptions::InputFormatError. Files opened here
# are closed again before the method returns or the process exits.
def self.cli
  opt = nil
  input = nil
  output = nil
  begin
    opt = Getopt::Long.getopts(
      ["--help", "-h", Getopt::BOOLEAN],
      ["--debug", "-d", Getopt::BOOLEAN],  # set debug mode => print stack traces
      ["--input", "-i", Getopt::REQUIRED], # input file format
      ["--rdf", "-r", Getopt::REQUIRED], # output file format
      ["--name", Getopt::OPTIONAL], # name of resource/tool/person
      ["--name_id", Getopt::OPTIONAL], # uri of resource/tool/person
      ["--date", "-t", Getopt::OPTIONAL], # date of processing/annotation
      ["--version", "-v", Getopt::OPTIONAL], # version number of resource
      ["--file", "-f", Getopt::OPTIONAL], # file to read, will read from STDIN if not supplied
      ["--out", "-o", Getopt::OPTIONAL] # output file, will out to STDOUT if not supplied
    )

    if opt['help'] or not opt['input'] or not opt['rdf'] then
      puts "Usage: ruby #{$0} -i <format> -r <format> [options]"
      puts 'Supported input formats (--input <format>/-i <format>):'
      puts ' biointerchange.gff3 : GFF3'
      puts ' dbcls.catanns.json : PubAnnotation JSON'
      puts ' uk.ac.man.pdfx : PDFx XML'
      puts 'Supported output formats (--rdf <format>/-r <format>)'
      puts ' rdf.biointerchange.gff3 : RDF N-Triples for input'
      puts ' biointerchange.gff3'
      puts ' rdf.bh12.sio : RDF N-Triples for inputs'
      puts ' dbcls.catanns.json'
      puts ' uk.ac.man.pdfx'
      puts 'I/O options:'
      puts ' -f <file>/--file <file> : file to read; STDIN used if not supplied'
      puts ' -o <file>/--out <file> : output file; STDOUT used if not supplied'
      puts 'Input-/RDF-format specific options:'
      puts ' Input: dbcls.catanns.json, uk.ac.man.pdfx'
      puts ' Output: rdf.bh12.sio'
      puts ' Options:'
      puts ' -t <date>/--date <date> : date of processing/annotation (optional)'
      puts ' -v <version>/--version <version> : version number of resource (optional)'
      puts ' --name <name> : name of resource/tool/person (required)'
      puts ' --name_id <id> : URI of resource/tool/person (required)'
      puts 'Input-/RDF-format specific options:'
      puts ' Input: biointerchange.gff3'
      puts ' Output: rdf.biointerchange.gff3'
      puts ' Options:'
      puts ' -t <date>/--date <date> : date when the GFF3 file was created (optional)'
      puts ' --name <name> : name of the GFF3 file creator (optional)'
      puts ' --name_id <id> : email address of the GFF3 file creator (optional)'
      puts 'Other options:'
      puts ' -d / --debug : turn on debugging output (for stacktraces)'
      puts ' -h --help : this message'

      exit 1
    end

    # Validate the input/output format combination before touching any file.
    case opt['input']
    when 'dbcls.catanns.json', 'uk.ac.man.pdfx'
      unsupported_combination unless opt['rdf'] == 'rdf.bh12.sio'
      raise ArgumentError, 'Require --name and --name_id options to specify source of annotations (e.g., a manual annotators name, or software tool name) and their associated URI (e.g., email address, or webaddress).' unless opt['name'] and opt['name_id']
    when 'biointerchange.gff3'
      # No further arguments are required for GFF3.
      unsupported_combination unless opt['rdf'] == 'rdf.biointerchange.gff3'
    else
      unsupported_combination
    end

    # generate model from file (deserialise)
    reader = nil
    case opt['input']
    when 'dbcls.catanns.json'
      reader = BioInterchange::TextMining::PubannosJsonReader.new(opt['name'], opt['name_id'], opt['date'], BioInterchange::TextMining::Process::UNSPECIFIED, opt['version'])
    when 'uk.ac.man.pdfx'
      reader = BioInterchange::TextMining::PdfxXmlReader.new(opt['name'], opt['name_id'], opt['date'], BioInterchange::TextMining::Process::UNSPECIFIED, opt['version'])
    when 'biointerchange.gff3'
      reader = BioInterchange::Genomics::GFF3Reader.new(opt['name'], opt['name_id'], opt['date'])
    end

    input = opt['file'] ? File.new(opt['file'], 'r') : STDIN
    model = reader.deserialize(input)

    # generate rdf from model (serialise); the output file is only created
    # once the input has been read successfully
    output = opt['out'] ? File.new(opt['out'], 'w') : STDOUT
    writer = nil
    case opt['rdf']
    when 'rdf.bh12.sio'
      writer = BioInterchange::TextMining::RDFWriter.new(output)
    when 'rdf.biointerchange.gff3'
      writer = BioInterchange::Genomics::RDFWriter.new(output)
    end

    writer.serialize(model)

  rescue ArgumentError => e
    $stderr.puts e.message
    # opt is still nil when option parsing itself failed
    $stderr.puts e.backtrace if opt and opt['debug']
    exit 1
  rescue Getopt::Long::Error => e
    $stderr.puts e.message
    exit 1
  rescue BioInterchange::Exceptions::InputFormatError => e
    $stderr.puts e.message
    $stderr.puts e.backtrace if opt and opt['debug']
    exit 2
  ensure
    # Close only the streams that were opened here; STDIN/STDOUT stay open.
    input.close if input and not input.equal?(STDIN) and not input.closed?
    output.close if output and not output.equal?(STDOUT) and not output.closed?
  end
end
169
+
170
+ #
171
+ # Helper functions
172
+ #
173
+
174
# Looks up the values of several named parameters.
#
# Each entry of +parameters+ is either a plain key, which is resolved via
# +map+, or an Array whose first element responds to +call+ and whose
# remaining elements are entries that are resolved recursively and passed to
# that callable as arguments; the callable's result takes the entry's place.
#
# +map+:: a map of named parameters and their values
# +parameters+:: the names of the parameter values we are interested in
def self.get_parameters(map, parameters)
  parameters.map do |entry|
    next map[entry] unless entry.instance_of?(Array)

    callable = entry[0]
    callable.call(*BioInterchange::get_parameters(map, entry[1..-1]))
  end
end
187
+
188
+ private
189
+
190
# Signals that the requested input/output format pair is not handled by any
# reader/writer combination. Always raises ArgumentError.
#
# NOTE(review): a bare +private+ keyword does not apply to methods defined
# with +def self.+, so this helper is still publicly callable; use
# +private_class_method+ if it is meant to be hidden.
def self.unsupported_combination
  raise ArgumentError.new('This input/output format combination is not supported.')
end
193
+
194
+ end
195
+
@@ -0,0 +1,38 @@
1
module BioInterchange::Exceptions

  # Top level error class for all framework errors.
  class BioInterchangeError < StandardError; end

  # Error class for issues regarding input file formats.
  # This error class is caught at the highest level and only the error
  # message is returned to the end user (not the backtrace, unless debugging
  # is turned on). This helps to keep the framework user friendly. Note that
  # this means meaningful error messages are strongly encouraged.
  class InputFormatError < BioInterchangeError; end

  # Top class for errors that are raised only during framework
  # implementation and extension. Once such an implementation/extension is
  # completed, these errors should no longer be possible. Examples include a
  # method receiving something it cannot handle. The framework does not
  # rescue these errors, leaving the backtraces as a help for debugging.
  class ImplementationError < BioInterchangeError; end

  # Implementation errors originating from readers.
  class ImplementationReaderError < ImplementationError; end

  # Implementation errors originating from models.
  class ImplementationModelError < ImplementationError; end

  # Implementation errors originating from writers.
  class ImplementationWriterError < ImplementationError; end

end
@@ -0,0 +1,82 @@
1
+ module BioInterchange::Genomics
2
+
3
class GFF3Feature

  # Constants determining the strand of the feature.
  NOT_STRANDED = 0
  UNKNOWN = 1
  POSITIVE = 2
  NEGATIVE = 3

  # The sequence ID that determines the coordinate system for the feature.
  attr_reader :sequence_id

  # A textual description that determines the origin of this feature.
  attr_reader :source

  # The feature type, which can either be a SOFA URI or a textual description otherwise.
  attr_reader :type

  # The start coordinate of the feature. The start coordinate is equal or smaller than the end coordinate.
  attr_reader :start_coordinate

  # The end coordinate of the feature. The end coordinate is equal or larger than the start coordinate.
  attr_reader :end_coordinate

  # The score of the feature. The score is a floating point number, which ideally is an E-value or P-value.
  attr_reader :score

  # The strand the feature is located on (one of the strand constants above).
  attr_reader :strand

  # The phase, if existing, for this feature.
  attr_reader :phase

  # Creates a new feature representation. A feature is described on one line of the GFF3 file.
  #
  # +sequence_id+:: an identifier that determines the coordinate system for the feature
  # +source+:: a text description of the origin of this feature description
  # +type+:: either a SOFA accession, SOFA term, or textual description (the former are URIs, the latter is a string)
  # +start_coordinate+:: an integer denoting the start coordinate of the feature
  # +end_coordinate+:: an integer denoting the end coordinate of the feature, which is equal or larger than the start coordinate
  # +score+:: a floating point score
  # +strand+:: a constant determining whether the feature is NOT_STRANDED, the strand is UNKNOWN, or the feature is on the POSITIVE or NEGATIVE strand
  # +phase+:: an integer determining the phase of the feature, if the feature has a phase
  # +attributes+:: a map of additional attributes associated with the feature
  def initialize(sequence_id, source, type, start_coordinate, end_coordinate, score = nil, strand = NOT_STRANDED, phase = nil, attributes = {})
    @sequence_id, @source, @type = sequence_id, source, type
    @start_coordinate, @end_coordinate = start_coordinate, end_coordinate
    @score, @strand, @phase = score, strand, phase
    @attributes = attributes
  end

  # Returns a map of additional attributes for this feature. The map is
  # frozen the first time it is requested, so it cannot be modified afterwards.
  def attributes
    @attributes.freeze
  end

end
80
+
81
+ end
82
+
@@ -0,0 +1,37 @@
1
+ require 'digest/sha1'
2
+
3
+ module BioInterchange::Genomics
4
+
5
class GFF3FeatureSet

  # Create a new instance of a Generic Feature Format Version 3 (GFF3) feature set. A feature
  # set can contain multiple GFF3 features.
  def initialize
    # Features are kept as the keys of a hash map (values are not used):
    @set = {}
  end

  # Returns the features of this set as an array, in insertion order.
  def contents
    @set.keys
  end

  # Returns an URI for this particular feature set. The URI ends in a SHA1
  # hash that is computed over one tab-separated line per feature, made up of
  # the feature's properties and its 'tag=value' attribute pairs.
  def uri
    lines = contents.map do |feature|
      attribute_pairs = feature.attributes.keys.map { |tag| "#{tag}=#{feature.attributes[tag]}" }
      columns = [
        feature.sequence_id, feature.source, feature.type,
        feature.start_coordinate, feature.end_coordinate,
        feature.score, feature.strand, feature.phase,
        attribute_pairs.join(';')
      ]
      "#{columns.join("\t")}\n"
    end
    "biointerchange://gff3/featureset/self/#{Digest::SHA1.hexdigest(lines.join)}"
  end

  # Adds a feature to the feature set.
  #
  # +feature+:: feature instance that is added to the contents of this feature set
  def add(feature)
    @set.store(feature, true)
  end

end
36
+
37
+ end
@@ -0,0 +1,107 @@
1
+ require 'rdf'
2
+ require 'rdf/ntriples'
3
+ require 'date'
4
+
5
+ module BioInterchange::Genomics
6
+
7
class RDFWriter

  # Creates a new instance of a RDFWriter that will use the provided output stream to serialize RDF.
  #
  # +ostream+:: instance of an IO class or derivative that is used for RDF serialization
  #
  # Raises BioInterchange::Exceptions::ImplementationWriterError if +ostream+ is not an IO.
  def initialize(ostream)
    raise BioInterchange::Exceptions::ImplementationWriterError, 'The output stream is not an instance of IO or its subclasses.' unless ostream.kind_of?(IO)
    @ostream = ostream
  end

  # Serialize a model as RDF.
  #
  # +model+:: a generic representation of input data that is an instance of BioInterchange::Genomics::GFF3FeatureSet
  #
  # Raises BioInterchange::Exceptions::ImplementationWriterError for any other kind of model.
  def serialize(model)
    if model.instance_of?(BioInterchange::Genomics::GFF3FeatureSet) then
      serialize_model(model)
    else
      raise BioInterchange::Exceptions::ImplementationWriterError, 'The provided model cannot be serialized. ' +
                                                                   'This writer supports serialization for BioInterchange::Genomics::GFF3FeatureSet.'
    end
  end

  private

  # Serializes RDF for a feature set representation: the set itself is typed,
  # every feature is added to the graph, and the graph is finally dumped as
  # N-Triples to the output stream.
  #
  # +model+:: an instance of +BioInterchange::Genomics::GFF3FeatureSet+
  def serialize_model(model)
    graph = RDF::Graph.new
    set_uri = RDF::URI.new(model.uri)
    graph.insert(RDF::Statement.new(set_uri, RDF.type, BioInterchange::GFF3.Set))
    model.contents.each { |feature|
      serialize_feature(graph, set_uri, feature)
    }
    RDF::NTriples::Writer.dump(graph, @ostream)
  end

  # Serializes a +GFF3Feature+ object for a given feature set URI.
  #
  # +graph+:: RDF graph to which the feature is added
  # +set_uri+:: the feature set URI to which the feature belongs to
  # +feature+:: a +GFF3Feature+ instance
  #
  # Raises ArgumentError if the feature's strand is none of the
  # +GFF3Feature+ strand constants.
  def serialize_feature(graph, set_uri, feature)
    # The feature URI is based on the first 'ID' attribute value if there is
    # one, and on the feature's own properties otherwise.
    # TODO Make sure there is only one value in the 'ID' list.
    if feature.attributes.has_key?('ID') then
      feature_uri = RDF::URI.new("#{set_uri.to_s}/feature/#{feature.attributes['ID'][0]}")
    else
      feature_uri = RDF::URI.new("#{set_uri.to_s}/feature/#{feature.sequence_id},#{feature.source},#{feature.type.to_s.sub(/^[^:]+:\/\//, '')},#{feature.start_coordinate},#{feature.end_coordinate},#{feature.strand},#{feature.phase}")
    end
    graph.insert(RDF::Statement.new(set_uri, BioInterchange::GFF3.contains, feature_uri))
    graph.insert(RDF::Statement.new(feature_uri, RDF.type, BioInterchange::GFF3.Feature))
    graph.insert(RDF::Statement.new(feature_uri, BioInterchange::GFF3.seqid, RDF::Literal.new(feature.sequence_id)))
    graph.insert(RDF::Statement.new(feature_uri, BioInterchange::GFF3.source, RDF::Literal.new(feature.source)))
    graph.insert(RDF::Statement.new(feature_uri, BioInterchange::GFF3.type, RDF::Literal.new(feature.type)))
    graph.insert(RDF::Statement.new(feature_uri, BioInterchange::GFF3.start, RDF::Literal.new(feature.start_coordinate)))
    graph.insert(RDF::Statement.new(feature_uri, BioInterchange::GFF3.end, RDF::Literal.new(feature.end_coordinate)))
    graph.insert(RDF::Statement.new(feature_uri, BioInterchange::GFF3.score, RDF::Literal.new(feature.score))) if feature.score
    case feature.strand
    when BioInterchange::Genomics::GFF3Feature::NOT_STRANDED
      graph.insert(RDF::Statement.new(feature_uri, BioInterchange::GFF3.strand, BioInterchange::GFF3.NotStranded))
    when BioInterchange::Genomics::GFF3Feature::UNKNOWN
      graph.insert(RDF::Statement.new(feature_uri, BioInterchange::GFF3.strand, BioInterchange::GFF3.UnknownStrand))
    when BioInterchange::Genomics::GFF3Feature::POSITIVE
      graph.insert(RDF::Statement.new(feature_uri, BioInterchange::GFF3.strand, BioInterchange::GFF3.Positive))
    when BioInterchange::Genomics::GFF3Feature::NEGATIVE
      graph.insert(RDF::Statement.new(feature_uri, BioInterchange::GFF3.strand, BioInterchange::GFF3.Negative))
    else
      # Ruby has no ArgumentException class; ArgumentError is the matching built-in.
      raise ArgumentError, 'Strand of feature is set to an unknown constant.'
    end
    graph.insert(RDF::Statement.new(feature_uri, BioInterchange::GFF3.phase, RDF::Literal.new(feature.phase))) if feature.phase

    serialize_attributes(graph, set_uri, feature_uri, feature.attributes) unless feature.attributes.keys.empty?
  end

  # Serializes the attributes of a feature. 'Parent' values become links to
  # the parent features within the same set; every other value becomes an
  # attribute resource that carries the tag and the value.
  #
  # +graph+:: RDF graph to which the feature is added
  # +set_uri+:: URI of the set these attributes belong to (implicit due to feature)
  # +feature_uri+:: the feature URI to which the attributes belong to
  # +attributes+:: a map of tag/value pairs
  def serialize_attributes(graph, set_uri, feature_uri, attributes)
    attributes.each_pair { |tag, list|
      if tag == 'Parent' then
        list.each { |parent_id|
          graph.insert(RDF::Statement.new(feature_uri, BioInterchange::GFF3.parent, RDF::URI.new("#{set_uri.to_s}/feature/#{parent_id}")))
        }
      else
        list.each_with_index { |value, index|
          # Multi-valued tags get a running number (starting at 1) appended to the URI.
          if list.size == 1 then
            attribute_uri = RDF::URI.new("#{feature_uri.to_s}/attribute/#{tag}")
          else
            attribute_uri = RDF::URI.new("#{feature_uri.to_s}/attribute/#{tag}-#{index + 1}")
          end
          graph.insert(RDF::Statement.new(feature_uri, BioInterchange::GFF3.attributes, attribute_uri))
          graph.insert(RDF::Statement.new(attribute_uri, RDF.type, BioInterchange::GFF3.Attribute))
          graph.insert(RDF::Statement.new(attribute_uri, BioInterchange::GFF3.tag, RDF::Literal.new("#{tag}")))
          graph.insert(RDF::Statement.new(attribute_uri, RDF.value, RDF::Literal.new(value)))
        }
      end
    }
  end

end
105
+
106
+ end
107
+
@@ -0,0 +1,86 @@
1
+ module BioInterchange::Genomics
2
+
3
class GFF3Reader

  # Creates a new instance of a Generic Feature Format Version 3 (GFF3) reader.
  #
  # +name+:: Optional name of the person who generated the GFF3 file.
  # +name_uri+:: Optional e-mail address of the person who generated the GFF3 file.
  # +date+:: Optional date of when the GFF3 file was produced.
  def initialize(name = nil, name_uri = nil, date = nil)
    @name = name
    @name_uri = name_uri
    @date = date
  end

  # Reads a GFF3 file from the input stream and returns an associated model.
  #
  # +inputstream+:: an instance of class IO or String that holds the contents of a GFF3 file
  #
  # Returns a BioInterchange::Genomics::GFF3FeatureSet with one feature per
  # feature line. Raises BioInterchange::Exceptions::ImplementationReaderError
  # if +inputstream+ is neither an IO nor a String, and
  # BioInterchange::Exceptions::InputFormatError for malformed feature lines.
  def deserialize(inputstream)
    if inputstream.kind_of?(IO)
      create_model(inputstream.read)
    elsif inputstream.kind_of?(String) then
      create_model(inputstream)
    else
      raise BioInterchange::Exceptions::ImplementationReaderError, 'The provided input stream needs to be either of type IO or String.'
    end
  end

  private

  # Builds the feature set from the textual contents of a GFF3 file.
  #
  # +gff3+:: contents of a GFF3 file as a String
  def create_model(gff3)
    feature_set = BioInterchange::Genomics::GFF3FeatureSet.new()
    gff3.each_line { |line|
      line = line.chomp

      # Everything after the ##FASTA directive is sequence data, not features:
      break if line.start_with?('##FASTA')
      # Skip directives, comments and blank lines:
      next if line.start_with?('#') or line.strip.empty?

      seqid, source, type, start_coordinate, end_coordinate, score, strand, phase, attributes = line.split("\t")
      # A missing phase means that the line has fewer than eight columns:
      raise BioInterchange::Exceptions::InputFormatError, "Malformed GFF3 feature line (expected 9 tab-separated columns): #{line}" unless phase

      # The type might be a SO/SOFA term, SO/SOFA accession, or other term (it stays a string then):
      if type.match(/SO:\d+/) then
        type = RDF::URI.new("http://purl.obolibrary.org/obo/#{type.sub(':', '_')}")
      elsif BioInterchange::SOFA.methods.include?(type.gsub(' ', '_').to_sym)
        # NOTE(review): the method name is derived from input data, so a type
        # that happens to match a generic method of the SOFA object is called
        # as well -- consider restricting this to the ontology term accessors.
        type = BioInterchange::SOFA.send(type.gsub(' ', '_'))
      end

      # String to numeric value conversions:
      start_coordinate = start_coordinate.to_i
      end_coordinate = end_coordinate.to_i
      if score == '.' then
        score = nil
      else
        score = score.to_f
      end

      # Determine strandedness:
      if strand == '?' then
        strand = BioInterchange::Genomics::GFF3Feature::UNKNOWN
      elsif strand == '+' then
        strand = BioInterchange::Genomics::GFF3Feature::POSITIVE
      elsif strand == '-' then
        strand = BioInterchange::Genomics::GFF3Feature::NEGATIVE
      else
        strand = BioInterchange::Genomics::GFF3Feature::NOT_STRANDED
      end

      # Set phase, if it lies in the permissible range of values:
      if phase == '0' or phase == '1' or phase == '2' then
        phase = phase.to_i
      else
        phase = nil
      end

      feature_set.add(BioInterchange::Genomics::GFF3Feature.new(seqid, source, type, start_coordinate, end_coordinate, score, strand, phase, parse_attributes(attributes, line)))
    }

    feature_set
  end

  # Turns the attributes column of a feature line into a map from tag to the
  # list of its comma-separated values. A missing, empty or '.' column yields
  # an empty map; if a tag occurs more than once, its last occurrence wins.
  #
  # +column+:: contents of the attributes column (may be nil)
  # +line+:: the complete feature line, used for error messages only
  def parse_attributes(column, line)
    attributes = {}
    return attributes if column.nil? or column.strip.empty? or column.strip == '.'

    column.split(';').each { |assignment|
      next if assignment.strip.empty?
      match = assignment.match(/([^=]+)=(.+)/)
      raise BioInterchange::Exceptions::InputFormatError, "Malformed GFF3 attribute '#{assignment}' (expected tag=value) in line: #{line}" unless match
      attributes[match[1].strip] = match[2].split(',').map { |value| value.strip }
    }
    attributes
  end

end
84
+
85
+ end
86
+
@@ -0,0 +1,135 @@
1
+ module BioInterchange
2
+
3
class GFF3

  # Object properties of the GFF3 ontology: accessor name => URI.
  OBJECT_PROPERTIES = {
    :strand => 'http://www.sequenceontology.org/gff3#GFF3_0010',
    :attributes => 'http://www.sequenceontology.org/gff3#GFF3_0012',
    :parent => 'http://www.sequenceontology.org/gff3#GFF3_0014',
    :contains => 'http://www.sequenceontology.org/gff3#GFF3_0015'
  }.freeze

  # Datatype properties of the GFF3 ontology: accessor name => URI.
  DATATYPE_PROPERTIES = {
    :seqid => 'http://www.sequenceontology.org/gff3#GFF3_0004',
    :source => 'http://www.sequenceontology.org/gff3#GFF3_0005',
    :type => 'http://www.sequenceontology.org/gff3#GFF3_0006',
    :start => 'http://www.sequenceontology.org/gff3#GFF3_0007',
    :end => 'http://www.sequenceontology.org/gff3#GFF3_0008',
    :score => 'http://www.sequenceontology.org/gff3#GFF3_0009',
    :phase => 'http://www.sequenceontology.org/gff3#GFF3_0011',
    :tag => 'http://www.sequenceontology.org/gff3#GFF3_0013'
  }.freeze

  # Classes of the GFF3 ontology: accessor name => URI.
  CLASSES = {
    :Set => 'http://www.sequenceontology.org/gff3#GFF3_0001',
    :Feature => 'http://www.sequenceontology.org/gff3#GFF3_0002',
    :Attribute => 'http://www.sequenceontology.org/gff3#GFF3_0003',
    :Strand => 'http://www.sequenceontology.org/gff3#GFF3_0016'
  }.freeze

  # Named individuals of the GFF3 ontology: accessor name => URI.
  NAMED_INDIVIDUALS = {
    :Positive => 'http://www.sequenceontology.org/gff3#GFF3_0017',
    :Negative => 'http://www.sequenceontology.org/gff3#GFF3_0018',
    :UnknownStrand => 'http://www.sequenceontology.org/gff3#GFF3_0019',
    :NotStranded => 'http://www.sequenceontology.org/gff3#GFF3_0020'
  }.freeze

  # Defines one class method per ontology term (e.g. GFF3.strand, GFF3.Set);
  # every call returns a newly created RDF::URI for that term.
  [OBJECT_PROPERTIES, DATATYPE_PROPERTIES, CLASSES, NAMED_INDIVIDUALS].each do |terms|
    terms.each do |term, address|
      define_singleton_method(term) { RDF::URI.new(address) }
    end
  end

  # Determines whether the given URI is an object property.
  #
  # +uri+:: URI that is tested for being an object property
  def self.is_object_property?(uri)
    OBJECT_PROPERTIES.values.any? { |address| uri == RDF::URI.new(address) }
  end

  # Determines whether the given URI is a datatype property.
  #
  # +uri+:: URI that is tested for being a datatype property
  def self.is_datatype_property?(uri)
    DATATYPE_PROPERTIES.values.any? { |address| uri == RDF::URI.new(address) }
  end

  # Determines whether the given URI is a class.
  #
  # +uri+:: URI that is tested for being a class
  def self.is_class?(uri)
    CLASSES.values.any? { |address| uri == RDF::URI.new(address) }
  end

  # Determines whether the given URI is a named individual.
  #
  # +uri+:: URI that is tested for being a named individual
  def self.is_named_individual?(uri)
    NAMED_INDIVIDUALS.values.any? { |address| uri == RDF::URI.new(address) }
  end

end
134
+
135
+ end