biointerchange 1.0.4 → 1.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/VERSION +1 -1
- data/lib/biointerchange/genomics/gff3_rdf_ntriples.rb +18 -20
- data/lib/biointerchange/genomics/gff3_reader.rb +17 -2
- data/spec/gff3_reader_spec.rb +43 -0
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 740da15b29a3a3da4b012e17b16e4ce3c5cef989
|
4
|
+
data.tar.gz: fb7508cdacae1aa18d5d142cebed3717b4c66e91
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9cb7e6845df394b0a7d5e54310456841471baa54c2a081c18d6a7fb3c5a6a929074acc42690101c083d6f16f8d02e1f995a12806017a45c5b5e3ead24e75c5d7
|
7
|
+
data.tar.gz: 7921f0f48abbb8bdc8e0c2d0c47251b40b70468cde3210d9f39edb98bc94e21c778c49d79a5740101df1bae1dbeedf9e88cdac7bb08e95d642409d4915e78f12
|
data/README.md
CHANGED
@@ -107,7 +107,7 @@ The following list provides information on the origin of the example-data files
|
|
107
107
|
|
108
108
|
The following lists data that has been used in testing BioInterchange's implementation, but was not included in the GitHub repository due to their size:
|
109
109
|
|
110
|
-
* `mgp.
|
110
|
+
* `mgp.v4.indels.dbSNP.vcf.gz`: Gzipped VCF file of M. musculus indels by the Sanger Institute. Downloaded from [ftp://ftp-mouse.sanger.ac.uk/REL-1410-SNPs_Indels/mgp.v4.indels.dbSNP.vcf.gz](ftp://ftp-mouse.sanger.ac.uk/REL-1410-SNPs_Indels/mgp.v4.indels.dbSNP.vcf.gz)
|
111
111
|
|
112
112
|
### Application Programming Interface
|
113
113
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
1.0.4
|
1
|
+
1.0.5
|
@@ -255,26 +255,24 @@ protected
|
|
255
255
|
bin_uri = RDF::URI.new("bin://#{feature.sequence_id}/#{BioInterchange::Genomics::Locations.reg2bin(feature.start_coordinate, feature.end_coordinate)}")
|
256
256
|
create_triple(bin_uri, RDF::URI.new('bin://contains'), feature_uri)
|
257
257
|
end
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
create_triple(start_position_uri, BioInterchange::FALDO.position, feature.start_coordinate)
|
277
|
-
create_triple(end_position_uri, BioInterchange::FALDO.position, feature.end_coordinate)
|
258
|
+
[ [ start_position_uri, BioInterchange::FALDO.begin, feature.start_coordinate ],
|
259
|
+
[ end_position_uri, BioInterchange::FALDO.end, feature.end_coordinate ] ].each { |uri_relation_coordinate|
|
260
|
+
position_uri, faldo_relation, coordinate = uri_relation_coordinate
|
261
|
+
create_triple(region_uri, faldo_relation, position_uri)
|
262
|
+
case feature.strand
|
263
|
+
when BioInterchange::Genomics::GFF3Feature::NOT_STRANDED
|
264
|
+
create_triple(position_uri, RDF.type, BioInterchange::FALDO.Position)
|
265
|
+
when BioInterchange::Genomics::GFF3Feature::UNKNOWN
|
266
|
+
create_triple(position_uri, RDF.type, BioInterchange::FALDO.Position)
|
267
|
+
when BioInterchange::Genomics::GFF3Feature::POSITIVE
|
268
|
+
create_triple(position_uri, RDF.type, BioInterchange::FALDO.Positive_strand)
|
269
|
+
when BioInterchange::Genomics::GFF3Feature::NEGATIVE
|
270
|
+
create_triple(position_uri, RDF.type, BioInterchange::FALDO.Negative_strand)
|
271
|
+
else
|
272
|
+
raise BioInterchange::Exceptions::InputFormatError, 'Strand of feature is set to an unknown constant.'
|
273
|
+
end
|
274
|
+
create_triple(start_position_uri, BioInterchange::FALDO.position, coordinate)
|
275
|
+
}
|
278
276
|
if feature.score then
|
279
277
|
create_triple(feature_uri, @base.has_attribute, RDF::URI.new("#{feature_uri}/score"))
|
280
278
|
if @format == :gvf or @format == :vcf then
|
@@ -32,6 +32,7 @@ class GFF3Reader < BioInterchange::Reader
|
|
32
32
|
@name_uri = name_uri
|
33
33
|
@date = date
|
34
34
|
@batch_size = batch_size
|
35
|
+
@linenumber = 0
|
35
36
|
end
|
36
37
|
|
37
38
|
# Reads a GFF3 file from the input stream and returns an associated model.
|
@@ -79,6 +80,7 @@ protected
|
|
79
80
|
begin
|
80
81
|
line = gff3.readline
|
81
82
|
line.chomp!
|
83
|
+
@linenumber += 1
|
82
84
|
|
83
85
|
if line.start_with?('#') and not line.start_with?('##') then
|
84
86
|
add_comment(@feature_set, line[1..-1].strip)
|
@@ -90,6 +92,9 @@ protected
|
|
90
92
|
next
|
91
93
|
end
|
92
94
|
|
95
|
+
# Empty lines. They happen. Mostly at the end of files.
|
96
|
+
next if line.strip.empty?
|
97
|
+
|
93
98
|
if fasta_block then
|
94
99
|
if line.start_with?('>') and line.length > 1 then
|
95
100
|
@feature_set.add(BioInterchange::Genomics::GFF3FeatureSequence.new(fasta_id, fasta_sequence, fasta_comment)) if fasta_id and not fasta_sequence.empty?()
|
@@ -139,7 +144,9 @@ protected
|
|
139
144
|
type = BioInterchange::SO.send(BioInterchange.make_safe_label(type))
|
140
145
|
end
|
141
146
|
rescue NoMethodError
|
142
|
-
|
147
|
+
# If neither a SO accession or a SO/SOFA term, then raise an error. The GFF3 specification
|
148
|
+
# is very clear about type being either one of the two.
|
149
|
+
raise BioInterchange::Exceptions::InputFormatError, "Line #{@linenumber}. Type of feature is set to an unknown SO/SOFA term: \"#{type}\""
|
143
150
|
end
|
144
151
|
|
145
152
|
# String to numeric value conversions:
|
@@ -218,7 +225,15 @@ protected
|
|
218
225
|
# +attribute_string+:: key/value string (column 9) as seen in a GFF3/GVF file
|
219
226
|
def split_attributes(attribute_string)
|
220
227
|
attributes = {}
|
221
|
-
attribute_string.split(';').map { |assignment|
|
228
|
+
hashes = attribute_string.split(';').map { |assignment|
|
229
|
+
match = assignment.match(/([^=]+)=(.+)/) ;
|
230
|
+
{ match[1].strip => match[2].split(',').map { |value| URI.decode(value.strip) } }
|
231
|
+
}
|
232
|
+
hashes.map { |hash|
|
233
|
+
hash.each_pair { |tag,list|
|
234
|
+
attributes[tag] = list
|
235
|
+
}
|
236
|
+
}
|
222
237
|
attributes
|
223
238
|
end
|
224
239
|
|
@@ -0,0 +1,43 @@
|
|
1
|
+
|
2
|
+
require 'rubygems'
|
3
|
+
require 'rspec'
|
4
|
+
|
5
|
+
require 'biointerchange'
|
6
|
+
|
7
|
+
describe BioInterchange::Genomics::GFF3Reader do
|
8
|
+
describe 'reading GFF3 data' do
|
9
|
+
|
10
|
+
describe 'reading GFF3 into a model' do
|
11
|
+
before :all do
|
12
|
+
@test_gff3 = """##gff-version 3
|
13
|
+
##sequence-region chr1 1001 400200
|
14
|
+
##species http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=9606
|
15
|
+
# A comment line.
|
16
|
+
chr1 Test Center gene 2030 3100 . + . ID=Gene1;Name=firstgene
|
17
|
+
chr1 . gene 5010 10029 . - . ID=Gene2;Name=secondgene
|
18
|
+
chr1 . exon 5090 6000 . - . ID=Exon2.1;Parent=Gene2
|
19
|
+
|
20
|
+
"""
|
21
|
+
# Initialize with the most basic form of a GFF3 reader, i.e.
|
22
|
+
# no name, name_uri, date, etc., information is provided.
|
23
|
+
@reader = BioInterchange::Genomics::GFF3Reader.new()
|
24
|
+
end
|
25
|
+
it 'creates correct GFF3 model instance' do
|
26
|
+
model = @reader.deserialize(@test_gff3)
|
27
|
+
|
28
|
+
model.should be_an_instance_of BioInterchange::Genomics::GFF3FeatureSet
|
29
|
+
end
|
30
|
+
it 'creates a model with the right number of genomic features' do
|
31
|
+
model = @reader.deserialize(@test_gff3)
|
32
|
+
|
33
|
+
model.contents.length.should eql 3
|
34
|
+
end
|
35
|
+
it 'creates a model with the right number of pragma statements' do
|
36
|
+
model = @reader.deserialize(@test_gff3)
|
37
|
+
|
38
|
+
model.pragmas.length.should eql 3
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: biointerchange
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.4
|
4
|
+
version: 1.0.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Joachim Baran
|
@@ -12,7 +12,7 @@ authors:
|
|
12
12
|
autorequire:
|
13
13
|
bindir: bin
|
14
14
|
cert_chain: []
|
15
|
-
date: 2014-11-
|
15
|
+
date: 2014-11-25 00:00:00.000000000 Z
|
16
16
|
dependencies:
|
17
17
|
- !ruby/object:Gem::Dependency
|
18
18
|
name: rdf
|
@@ -226,6 +226,7 @@ files:
|
|
226
226
|
- make.sh
|
227
227
|
- spec/exceptions_spec.rb
|
228
228
|
- spec/gff3_rdfwriter_spec.rb
|
229
|
+
- spec/gff3_reader_spec.rb
|
229
230
|
- spec/gvf_rdfwriter_spec.rb
|
230
231
|
- spec/phylogenetics_spec.rb
|
231
232
|
- spec/text_mining_pdfx_xml_reader_spec.rb
|
@@ -304,7 +305,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
304
305
|
version: '0'
|
305
306
|
requirements: []
|
306
307
|
rubyforge_project:
|
307
|
-
rubygems_version: 2.0.
|
308
|
+
rubygems_version: 2.0.3
|
308
309
|
signing_key:
|
309
310
|
specification_version: 4
|
310
311
|
summary: An open source framework for transforming heterogeneous data formats into
|