biointerchange 1.0.4 → 1.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ac318ac0a65f19b3d411a9e478227dbfe80503d3
4
- data.tar.gz: f821af2f98f214f44f9053227e46cf06a187383f
3
+ metadata.gz: 740da15b29a3a3da4b012e17b16e4ce3c5cef989
4
+ data.tar.gz: fb7508cdacae1aa18d5d142cebed3717b4c66e91
5
5
  SHA512:
6
- metadata.gz: b30d9a4bbf5684f5fcd131934b2ae06233c7bab5c680be35ce554996b27bb7a2b43a6a2d6c79093acdca4dadcce20e7c653c48e615aecb4d68fd2f015ce2f93e
7
- data.tar.gz: cd3308acbed7bcaaa763a8f1abd65cd597b9622e8cb36d8dc0fc5bc40d71a47f808082fdcc5b9a5a70c1cd5fd753121353c9ac2585e8616daebaf2aee4218e50
6
+ metadata.gz: 9cb7e6845df394b0a7d5e54310456841471baa54c2a081c18d6a7fb3c5a6a929074acc42690101c083d6f16f8d02e1f995a12806017a45c5b5e3ead24e75c5d7
7
+ data.tar.gz: 7921f0f48abbb8bdc8e0c2d0c47251b40b70468cde3210d9f39edb98bc94e21c778c49d79a5740101df1bae1dbeedf9e88cdac7bb08e95d642409d4915e78f12
data/README.md CHANGED
@@ -107,7 +107,7 @@ The following list provides information on the origin of the example-data files
107
107
 
108
108
  The following lists data that has been used in testing BioInterchange's implementation, but was not included in the GitHub repository due to its size:
109
109
 
110
- * `mgp.v3.indels.rsIDdbSNPv137.vcf.gz`: Gzipped VCF file of M. musculus indels by the Sanger Institute. Downloaded from [ftp://ftp-mouse.sanger.ac.uk/current_snps/mgp.v3.indels.rsIDdbSNPv137.vcf.gz](ftp://ftp-mouse.sanger.ac.uk/current_snps/mgp.v3.indels.rsIDdbSNPv137.vcf.gz)
110
+ * `mgp.v4.indels.dbSNP.vcf.gz`: Gzipped VCF file of M. musculus indels by the Sanger Institute. Downloaded from [ftp://ftp-mouse.sanger.ac.uk/REL-1410-SNPs_Indels/mgp.v4.indels.dbSNP.vcf.gz](ftp://ftp-mouse.sanger.ac.uk/REL-1410-SNPs_Indels/mgp.v4.indels.dbSNP.vcf.gz)
111
111
 
112
112
  ### Application Programming Interface
113
113
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 1.0.4
1
+ 1.0.5
@@ -255,26 +255,24 @@ protected
255
255
  bin_uri = RDF::URI.new("bin://#{feature.sequence_id}/#{BioInterchange::Genomics::Locations.reg2bin(feature.start_coordinate, feature.end_coordinate)}")
256
256
  create_triple(bin_uri, RDF::URI.new('bin://contains'), feature_uri)
257
257
  end
258
- create_triple(region_uri, BioInterchange::FALDO.begin, start_position_uri)
259
- create_triple(region_uri, BioInterchange::FALDO.end, end_position_uri)
260
- case feature.strand
261
- when BioInterchange::Genomics::GFF3Feature::NOT_STRANDED
262
- create_triple(start_position_uri, RDF.type, BioInterchange::FALDO.Position)
263
- create_triple(end_position_uri, RDF.type, BioInterchange::FALDO.Position)
264
- when BioInterchange::Genomics::GFF3Feature::UNKNOWN
265
- create_triple(start_position_uri, RDF.type, BioInterchange::FALDO.Position)
266
- create_triple(end_position_uri, RDF.type, BioInterchange::FALDO.Position)
267
- when BioInterchange::Genomics::GFF3Feature::POSITIVE
268
- create_triple(start_position_uri, RDF.type, BioInterchange::FALDO.Positive_strand)
269
- create_triple(end_position_uri, RDF.type, BioInterchange::FALDO.Positive_strand)
270
- when BioInterchange::Genomics::GFF3Feature::NEGATIVE
271
- create_triple(start_position_uri, RDF.type, BioInterchange::FALDO.Negative_strand)
272
- create_triple(end_position_uri, RDF.type, BioInterchange::FALDO.Negative_strand)
273
- else
274
- raise BioInterchange::Exceptions::InputFormatError, 'Strand of feature is set to an unknown constant.'
275
- end
276
- create_triple(start_position_uri, BioInterchange::FALDO.position, feature.start_coordinate)
277
- create_triple(end_position_uri, BioInterchange::FALDO.position, feature.end_coordinate)
258
+ [ [ start_position_uri, BioInterchange::FALDO.begin, feature.start_coordinate ],
259
+ [ end_position_uri, BioInterchange::FALDO.end, feature.end_coordinate ] ].each { |uri_relation_coordinate|
260
+ position_uri, faldo_relation, coordinate = uri_relation_coordinate
261
+ create_triple(region_uri, faldo_relation, position_uri)
262
+ case feature.strand
263
+ when BioInterchange::Genomics::GFF3Feature::NOT_STRANDED
264
+ create_triple(position_uri, RDF.type, BioInterchange::FALDO.Position)
265
+ when BioInterchange::Genomics::GFF3Feature::UNKNOWN
266
+ create_triple(position_uri, RDF.type, BioInterchange::FALDO.Position)
267
+ when BioInterchange::Genomics::GFF3Feature::POSITIVE
268
+ create_triple(position_uri, RDF.type, BioInterchange::FALDO.Positive_strand)
269
+ when BioInterchange::Genomics::GFF3Feature::NEGATIVE
270
+ create_triple(position_uri, RDF.type, BioInterchange::FALDO.Negative_strand)
271
+ else
272
+ raise BioInterchange::Exceptions::InputFormatError, 'Strand of feature is set to an unknown constant.'
273
+ end
274
+ create_triple(position_uri, BioInterchange::FALDO.position, coordinate) # attach the coordinate to this iteration's position (start or end), not always the start
275
+ }
278
276
  if feature.score then
279
277
  create_triple(feature_uri, @base.has_attribute, RDF::URI.new("#{feature_uri}/score"))
280
278
  if @format == :gvf or @format == :vcf then
@@ -32,6 +32,7 @@ class GFF3Reader < BioInterchange::Reader
32
32
  @name_uri = name_uri
33
33
  @date = date
34
34
  @batch_size = batch_size
35
+ @linenumber = 0
35
36
  end
36
37
 
37
38
  # Reads a GFF3 file from the input stream and returns an associated model.
@@ -79,6 +80,7 @@ protected
79
80
  begin
80
81
  line = gff3.readline
81
82
  line.chomp!
83
+ @linenumber += 1
82
84
 
83
85
  if line.start_with?('#') and not line.start_with?('##') then
84
86
  add_comment(@feature_set, line[1..-1].strip)
@@ -90,6 +92,9 @@ protected
90
92
  next
91
93
  end
92
94
 
95
+ # Empty lines. They happen. Mostly at the end of files.
96
+ next if line.strip.empty?
97
+
93
98
  if fasta_block then
94
99
  if line.start_with?('>') and line.length > 1 then
95
100
  @feature_set.add(BioInterchange::Genomics::GFF3FeatureSequence.new(fasta_id, fasta_sequence, fasta_comment)) if fasta_id and not fasta_sequence.empty?()
@@ -139,7 +144,9 @@ protected
139
144
  type = BioInterchange::SO.send(BioInterchange.make_safe_label(type))
140
145
  end
141
146
  rescue NoMethodError
142
- raise BioInterchange::Exceptions::InputFormatError, "Type of feature is set to an unknown SOFA term: \"#{type}\""
147
+ # If the type is neither a SO accession nor a SO/SOFA term, then raise an error. The GFF3 specification
148
+ # is very clear about type being either one of the two.
149
+ raise BioInterchange::Exceptions::InputFormatError, "Line #{@linenumber}. Type of feature is set to an unknown SO/SOFA term: \"#{type}\""
143
150
  end
144
151
 
145
152
  # String to numeric value conversions:
@@ -218,7 +225,15 @@ protected
218
225
  # +attribute_string+:: key/value string (column 9) as seen in a GFF3/GVF file
219
226
  def split_attributes(attribute_string)
220
227
  attributes = {}
221
- attribute_string.split(';').map { |assignment| match = assignment.match(/([^=]+)=(.+)/) ; { match[1].strip => match[2].split(',').map { |value| URI.decode(value.strip) } } }.map { |hash| hash.each_pair { |tag,list| attributes[tag] = list } }
228
+ hashes = attribute_string.split(';').map { |assignment|
229
+ match = assignment.match(/([^=]+)=(.+)/) ;
230
+ { match[1].strip => match[2].split(',').map { |value| URI.decode(value.strip) } }
231
+ }
232
+ hashes.map { |hash|
233
+ hash.each_pair { |tag,list|
234
+ attributes[tag] = list
235
+ }
236
+ }
222
237
  attributes
223
238
  end
224
239
 
@@ -0,0 +1,43 @@
1
+
2
+ require 'rubygems'
3
+ require 'rspec'
4
+
5
+ require 'biointerchange'
6
+
7
+ describe BioInterchange::Genomics::GFF3Reader do
8
+ describe 'reading GFF3 data' do
9
+
10
+ describe 'reading GFF3 into a model' do
11
+ before :all do
12
+ @test_gff3 = """##gff-version 3
13
+ ##sequence-region chr1 1001 400200
14
+ ##species http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=9606
15
+ # A comment line.
16
+ chr1 Test Center gene 2030 3100 . + . ID=Gene1;Name=firstgene
17
+ chr1 . gene 5010 10029 . - . ID=Gene2;Name=secondgene
18
+ chr1 . exon 5090 6000 . - . ID=Exon2.1;Parent=Gene2
19
+
20
+ """
21
+ # Initialize with the most basic form of a GFF3 reader, i.e.
22
+ # no name, name_uri, date, etc., information is provided.
23
+ @reader = BioInterchange::Genomics::GFF3Reader.new()
24
+ end
25
+ it 'creates correct GFF3 model instance' do
26
+ model = @reader.deserialize(@test_gff3)
27
+
28
+ model.should be_an_instance_of BioInterchange::Genomics::GFF3FeatureSet
29
+ end
30
+ it 'creates a model with the right number of genomic features' do
31
+ model = @reader.deserialize(@test_gff3)
32
+
33
+ model.contents.length.should eql 3
34
+ end
35
+ it 'creates a model with the right number of pragma statements' do
36
+ model = @reader.deserialize(@test_gff3)
37
+
38
+ model.pragmas.length.should eql 3
39
+ end
40
+ end
41
+ end
42
+ end
43
+
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: biointerchange
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.4
4
+ version: 1.0.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Joachim Baran
@@ -12,7 +12,7 @@ authors:
12
12
  autorequire:
13
13
  bindir: bin
14
14
  cert_chain: []
15
- date: 2014-11-13 00:00:00.000000000 Z
15
+ date: 2014-11-25 00:00:00.000000000 Z
16
16
  dependencies:
17
17
  - !ruby/object:Gem::Dependency
18
18
  name: rdf
@@ -226,6 +226,7 @@ files:
226
226
  - make.sh
227
227
  - spec/exceptions_spec.rb
228
228
  - spec/gff3_rdfwriter_spec.rb
229
+ - spec/gff3_reader_spec.rb
229
230
  - spec/gvf_rdfwriter_spec.rb
230
231
  - spec/phylogenetics_spec.rb
231
232
  - spec/text_mining_pdfx_xml_reader_spec.rb
@@ -304,7 +305,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
304
305
  version: '0'
305
306
  requirements: []
306
307
  rubyforge_project:
307
- rubygems_version: 2.0.14
308
+ rubygems_version: 2.0.3
308
309
  signing_key:
309
310
  specification_version: 4
310
311
  summary: An open source framework for transforming heterogeneous data formats into