biointerchange 1.0.4 → 1.0.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/VERSION +1 -1
- data/lib/biointerchange/genomics/gff3_rdf_ntriples.rb +18 -20
- data/lib/biointerchange/genomics/gff3_reader.rb +17 -2
- data/spec/gff3_reader_spec.rb +43 -0
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 740da15b29a3a3da4b012e17b16e4ce3c5cef989
|
4
|
+
data.tar.gz: fb7508cdacae1aa18d5d142cebed3717b4c66e91
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9cb7e6845df394b0a7d5e54310456841471baa54c2a081c18d6a7fb3c5a6a929074acc42690101c083d6f16f8d02e1f995a12806017a45c5b5e3ead24e75c5d7
|
7
|
+
data.tar.gz: 7921f0f48abbb8bdc8e0c2d0c47251b40b70468cde3210d9f39edb98bc94e21c778c49d79a5740101df1bae1dbeedf9e88cdac7bb08e95d642409d4915e78f12
|
data/README.md
CHANGED
@@ -107,7 +107,7 @@ The following list provides information on the origin of the example-data files
|
|
107
107
|
|
108
108
|
The following lists data that has been used in testing BioInterchange's implementation, but was not included in the GitHub repository due to their size:
|
109
109
|
|
110
|
-
* `mgp.
|
110
|
+
* `mgp.v4.indels.dbSNP.vcf.gz`: Gzipped VCF file of M. musculus indels by the Sanger Institute. Downloaded from [ftp://ftp-mouse.sanger.ac.uk/REL-1410-SNPs_Indels/mgp.v4.indels.dbSNP.vcf.gz](ftp://ftp-mouse.sanger.ac.uk/REL-1410-SNPs_Indels/mgp.v4.indels.dbSNP.vcf.gz)
|
111
111
|
|
112
112
|
### Application Programming Interface
|
113
113
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
1.0.4
|
1
|
+
1.0.5
|
@@ -255,26 +255,24 @@ protected
|
|
255
255
|
bin_uri = RDF::URI.new("bin://#{feature.sequence_id}/#{BioInterchange::Genomics::Locations.reg2bin(feature.start_coordinate, feature.end_coordinate)}")
|
256
256
|
create_triple(bin_uri, RDF::URI.new('bin://contains'), feature_uri)
|
257
257
|
end
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
create_triple(start_position_uri, BioInterchange::FALDO.position, feature.start_coordinate)
|
277
|
-
create_triple(end_position_uri, BioInterchange::FALDO.position, feature.end_coordinate)
|
258
|
+
[ [ start_position_uri, BioInterchange::FALDO.begin, feature.start_coordinate ],
|
259
|
+
[ end_position_uri, BioInterchange::FALDO.end, feature.end_coordinate ] ].each { |uri_relation_coordinate|
|
260
|
+
position_uri, faldo_relation, coordinate = uri_relation_coordinate
|
261
|
+
create_triple(region_uri, faldo_relation, position_uri)
|
262
|
+
case feature.strand
|
263
|
+
when BioInterchange::Genomics::GFF3Feature::NOT_STRANDED
|
264
|
+
create_triple(position_uri, RDF.type, BioInterchange::FALDO.Position)
|
265
|
+
when BioInterchange::Genomics::GFF3Feature::UNKNOWN
|
266
|
+
create_triple(position_uri, RDF.type, BioInterchange::FALDO.Position)
|
267
|
+
when BioInterchange::Genomics::GFF3Feature::POSITIVE
|
268
|
+
create_triple(position_uri, RDF.type, BioInterchange::FALDO.Positive_strand)
|
269
|
+
when BioInterchange::Genomics::GFF3Feature::NEGATIVE
|
270
|
+
create_triple(position_uri, RDF.type, BioInterchange::FALDO.Negative_strand)
|
271
|
+
else
|
272
|
+
raise BioInterchange::Exceptions::InputFormatError, 'Strand of feature is set to an unknown constant.'
|
273
|
+
end
|
274
|
+
create_triple(start_position_uri, BioInterchange::FALDO.position, coordinate)
|
275
|
+
}
|
278
276
|
if feature.score then
|
279
277
|
create_triple(feature_uri, @base.has_attribute, RDF::URI.new("#{feature_uri}/score"))
|
280
278
|
if @format == :gvf or @format == :vcf then
|
@@ -32,6 +32,7 @@ class GFF3Reader < BioInterchange::Reader
|
|
32
32
|
@name_uri = name_uri
|
33
33
|
@date = date
|
34
34
|
@batch_size = batch_size
|
35
|
+
@linenumber = 0
|
35
36
|
end
|
36
37
|
|
37
38
|
# Reads a GFF3 file from the input stream and returns an associated model.
|
@@ -79,6 +80,7 @@ protected
|
|
79
80
|
begin
|
80
81
|
line = gff3.readline
|
81
82
|
line.chomp!
|
83
|
+
@linenumber += 1
|
82
84
|
|
83
85
|
if line.start_with?('#') and not line.start_with?('##') then
|
84
86
|
add_comment(@feature_set, line[1..-1].strip)
|
@@ -90,6 +92,9 @@ protected
|
|
90
92
|
next
|
91
93
|
end
|
92
94
|
|
95
|
+
# Empty lines. They happen. Mostly at the end of files.
|
96
|
+
next if line.strip.empty?
|
97
|
+
|
93
98
|
if fasta_block then
|
94
99
|
if line.start_with?('>') and line.length > 1 then
|
95
100
|
@feature_set.add(BioInterchange::Genomics::GFF3FeatureSequence.new(fasta_id, fasta_sequence, fasta_comment)) if fasta_id and not fasta_sequence.empty?()
|
@@ -139,7 +144,9 @@ protected
|
|
139
144
|
type = BioInterchange::SO.send(BioInterchange.make_safe_label(type))
|
140
145
|
end
|
141
146
|
rescue NoMethodError
|
142
|
-
|
147
|
+
# If neither a SO accession or a SO/SOFA term, then raise an error. The GFF3 specification
|
148
|
+
# is very clear about type being either one of the two.
|
149
|
+
raise BioInterchange::Exceptions::InputFormatError, "Line #{@linenumber}. Type of feature is set to an unknown SO/SOFA term: \"#{type}\""
|
143
150
|
end
|
144
151
|
|
145
152
|
# String to numeric value conversions:
|
@@ -218,7 +225,15 @@ protected
|
|
218
225
|
# +attribute_string+:: key/value string (column 9) as seen in a GFF3/GVF file
|
219
226
|
def split_attributes(attribute_string)
|
220
227
|
attributes = {}
|
221
|
-
attribute_string.split(';').map { |assignment|
|
228
|
+
hashes = attribute_string.split(';').map { |assignment|
|
229
|
+
match = assignment.match(/([^=]+)=(.+)/) ;
|
230
|
+
{ match[1].strip => match[2].split(',').map { |value| URI.decode(value.strip) } }
|
231
|
+
}
|
232
|
+
hashes.map { |hash|
|
233
|
+
hash.each_pair { |tag,list|
|
234
|
+
attributes[tag] = list
|
235
|
+
}
|
236
|
+
}
|
222
237
|
attributes
|
223
238
|
end
|
224
239
|
|
@@ -0,0 +1,43 @@
|
|
1
|
+
|
2
|
+
require 'rubygems'
|
3
|
+
require 'rspec'
|
4
|
+
|
5
|
+
require 'biointerchange'
|
6
|
+
|
7
|
+
describe BioInterchange::Genomics::GFF3Reader do
|
8
|
+
describe 'reading GFF3 data' do
|
9
|
+
|
10
|
+
describe 'reading GFF3 into a model' do
|
11
|
+
before :all do
|
12
|
+
@test_gff3 = """##gff-version 3
|
13
|
+
##sequence-region chr1 1001 400200
|
14
|
+
##species http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=9606
|
15
|
+
# A comment line.
|
16
|
+
chr1 Test Center gene 2030 3100 . + . ID=Gene1;Name=firstgene
|
17
|
+
chr1 . gene 5010 10029 . - . ID=Gene2;Name=secondgene
|
18
|
+
chr1 . exon 5090 6000 . - . ID=Exon2.1;Parent=Gene2
|
19
|
+
|
20
|
+
"""
|
21
|
+
# Initialize with the most basic form of a GFF3 reader, i.e.
|
22
|
+
# no name, name_uri, date, etc., information is provided.
|
23
|
+
@reader = BioInterchange::Genomics::GFF3Reader.new()
|
24
|
+
end
|
25
|
+
it 'creates correct GFF3 model instance' do
|
26
|
+
model = @reader.deserialize(@test_gff3)
|
27
|
+
|
28
|
+
model.should be_an_instance_of BioInterchange::Genomics::GFF3FeatureSet
|
29
|
+
end
|
30
|
+
it 'creates a model with the right number of genomic features' do
|
31
|
+
model = @reader.deserialize(@test_gff3)
|
32
|
+
|
33
|
+
model.contents.length.should eql 3
|
34
|
+
end
|
35
|
+
it 'creates a model with the right number of pragma statements' do
|
36
|
+
model = @reader.deserialize(@test_gff3)
|
37
|
+
|
38
|
+
model.pragmas.length.should eql 3
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: biointerchange
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.4
|
4
|
+
version: 1.0.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Joachim Baran
|
@@ -12,7 +12,7 @@ authors:
|
|
12
12
|
autorequire:
|
13
13
|
bindir: bin
|
14
14
|
cert_chain: []
|
15
|
-
date: 2014-11-
|
15
|
+
date: 2014-11-25 00:00:00.000000000 Z
|
16
16
|
dependencies:
|
17
17
|
- !ruby/object:Gem::Dependency
|
18
18
|
name: rdf
|
@@ -226,6 +226,7 @@ files:
|
|
226
226
|
- make.sh
|
227
227
|
- spec/exceptions_spec.rb
|
228
228
|
- spec/gff3_rdfwriter_spec.rb
|
229
|
+
- spec/gff3_reader_spec.rb
|
229
230
|
- spec/gvf_rdfwriter_spec.rb
|
230
231
|
- spec/phylogenetics_spec.rb
|
231
232
|
- spec/text_mining_pdfx_xml_reader_spec.rb
|
@@ -304,7 +305,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
304
305
|
version: '0'
|
305
306
|
requirements: []
|
306
307
|
rubyforge_project:
|
307
|
-
rubygems_version: 2.0.
|
308
|
+
rubygems_version: 2.0.3
|
308
309
|
signing_key:
|
309
310
|
specification_version: 4
|
310
311
|
summary: An open source framework for transforming heterogeneous data formats into
|