RubyGems - biointerchange - Versions diffs - 1.0.4 → 1.0.5 - Mend

biointerchange 1.0.4 → 1.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

checksums.yaml +4 -4
data/README.md +1 -1
data/VERSION +1 -1
data/lib/biointerchange/genomics/gff3_rdf_ntriples.rb +18 -20
data/lib/biointerchange/genomics/gff3_reader.rb +17 -2
data/spec/gff3_reader_spec.rb +43 -0
metadata +4 -3

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: ac318ac0a65f19b3d411a9e478227dbfe80503d3
-  data.tar.gz: f821af2f98f214f44f9053227e46cf06a187383f
+  metadata.gz: 740da15b29a3a3da4b012e17b16e4ce3c5cef989
+  data.tar.gz: fb7508cdacae1aa18d5d142cebed3717b4c66e91
 SHA512:
-  metadata.gz: b30d9a4bbf5684f5fcd131934b2ae06233c7bab5c680be35ce554996b27bb7a2b43a6a2d6c79093acdca4dadcce20e7c653c48e615aecb4d68fd2f015ce2f93e
-  data.tar.gz: cd3308acbed7bcaaa763a8f1abd65cd597b9622e8cb36d8dc0fc5bc40d71a47f808082fdcc5b9a5a70c1cd5fd753121353c9ac2585e8616daebaf2aee4218e50
+  metadata.gz: 9cb7e6845df394b0a7d5e54310456841471baa54c2a081c18d6a7fb3c5a6a929074acc42690101c083d6f16f8d02e1f995a12806017a45c5b5e3ead24e75c5d7
+  data.tar.gz: 7921f0f48abbb8bdc8e0c2d0c47251b40b70468cde3210d9f39edb98bc94e21c778c49d79a5740101df1bae1dbeedf9e88cdac7bb08e95d642409d4915e78f12

data/README.md CHANGED

@@ -107,7 +107,7 @@ The following list provides information on the origin of the example-data files
 The following lists data that has been used in testing BioInterchange's implementation, but was not included in the GitHub repository due to their size:
-*  `mgp.v3.indels.rsIDdbSNPv137.vcf.gz`: Gzipped VCF file of M. musculus indels by the Sanger Institute. Downloaded from [ftp://ftp-mouse.sanger.ac.uk/current_snps/mgp.v3.indels.rsIDdbSNPv137.vcf.gz](ftp://ftp-mouse.sanger.ac.uk/current_snps/mgp.v3.indels.rsIDdbSNPv137.vcf.gz)
+*  `mgp.v4.indels.dbSNP.vcf.gz`: Gzipped VCF file of M. musculus indels by the Sanger Institute. Downloaded from [ftp://ftp-mouse.sanger.ac.uk/REL-1410-SNPs_Indels/mgp.v4.indels.dbSNP.vcf.gz](ftp://ftp-mouse.sanger.ac.uk/REL-1410-SNPs_Indels/mgp.v4.indels.dbSNP.vcf.gz)
 ### Application Programming Interface

data/VERSION CHANGED

@@ -1 +1 @@
-1.0.4
+1.0.5

data/lib/biointerchange/genomics/gff3_rdf_ntriples.rb CHANGED

@@ -255,26 +255,24 @@ protected
       bin_uri = RDF::URI.new("bin://#{feature.sequence_id}/#{BioInterchange::Genomics::Locations.reg2bin(feature.start_coordinate, feature.end_coordinate)}")
       create_triple(bin_uri, RDF::URI.new('bin://contains'), feature_uri)
     end
-    create_triple(region_uri, BioInterchange::FALDO.begin, start_position_uri)
-    create_triple(region_uri, BioInterchange::FALDO.end, end_position_uri)
-    case feature.strand
-    when BioInterchange::Genomics::GFF3Feature::NOT_STRANDED
-      create_triple(start_position_uri, RDF.type, BioInterchange::FALDO.Position)
-      create_triple(end_position_uri, RDF.type, BioInterchange::FALDO.Position)
-    when BioInterchange::Genomics::GFF3Feature::UNKNOWN
-      create_triple(start_position_uri, RDF.type, BioInterchange::FALDO.Position)
-      create_triple(end_position_uri, RDF.type, BioInterchange::FALDO.Position)
-    when BioInterchange::Genomics::GFF3Feature::POSITIVE
-      create_triple(start_position_uri, RDF.type, BioInterchange::FALDO.Positive_strand)
-      create_triple(end_position_uri, RDF.type, BioInterchange::FALDO.Positive_strand)
-    when BioInterchange::Genomics::GFF3Feature::NEGATIVE
-      create_triple(start_position_uri, RDF.type, BioInterchange::FALDO.Negative_strand)
-      create_triple(end_position_uri, RDF.type, BioInterchange::FALDO.Negative_strand)
-    else
-      raise BioInterchange::Exceptions::InputFormatError, 'Strand of feature is set to an unknown constant.'
-    end
-    create_triple(start_position_uri, BioInterchange::FALDO.position, feature.start_coordinate)
-    create_triple(end_position_uri, BioInterchange::FALDO.position, feature.end_coordinate)
+    [ [ start_position_uri, BioInterchange::FALDO.begin, feature.start_coordinate ],
+      [ end_position_uri, BioInterchange::FALDO.end, feature.end_coordinate ] ].each { |uri_relation_coordinate|
+      position_uri, faldo_relation, coordinate = uri_relation_coordinate
+      create_triple(region_uri, faldo_relation, position_uri)
+      case feature.strand
+      when BioInterchange::Genomics::GFF3Feature::NOT_STRANDED
+        create_triple(position_uri, RDF.type, BioInterchange::FALDO.Position)
+      when BioInterchange::Genomics::GFF3Feature::UNKNOWN
+        create_triple(position_uri, RDF.type, BioInterchange::FALDO.Position)
+      when BioInterchange::Genomics::GFF3Feature::POSITIVE
+        create_triple(position_uri, RDF.type, BioInterchange::FALDO.Positive_strand)
+      when BioInterchange::Genomics::GFF3Feature::NEGATIVE
+        create_triple(position_uri, RDF.type, BioInterchange::FALDO.Negative_strand)
+      else
+        raise BioInterchange::Exceptions::InputFormatError, 'Strand of feature is set to an unknown constant.'
+      end
+      create_triple(start_position_uri, BioInterchange::FALDO.position, coordinate)
+    }
     if feature.score then
       create_triple(feature_uri, @base.has_attribute, RDF::URI.new("#{feature_uri}/score"))
       if @format == :gvf or @format == :vcf then

data/lib/biointerchange/genomics/gff3_reader.rb CHANGED

@@ -32,6 +32,7 @@ class GFF3Reader < BioInterchange::Reader
     @name_uri = name_uri
     @date = date
     @batch_size = batch_size
+    @linenumber = 0
   end
   # Reads a GFF3 file from the input stream and returns an associated model.
@@ -79,6 +80,7 @@ protected
     begin
       line = gff3.readline
       line.chomp!
+      @linenumber += 1
       if line.start_with?('#') and not line.start_with?('##') then
         add_comment(@feature_set, line[1..-1].strip)
@@ -90,6 +92,9 @@ protected
         next
       end
+      # Empty lines. They happen. Mostly at the end of files.
+      next if line.strip.empty?
       if fasta_block then
         if line.start_with?('>') and line.length > 1 then
           @feature_set.add(BioInterchange::Genomics::GFF3FeatureSequence.new(fasta_id, fasta_sequence, fasta_comment)) if fasta_id and not fasta_sequence.empty?()
@@ -139,7 +144,9 @@ protected
         type = BioInterchange::SO.send(BioInterchange.make_safe_label(type))
       end
     rescue NoMethodError
-      raise BioInterchange::Exceptions::InputFormatError, "Type of feature is set to an unknown SOFA term: \"#{type}\""
+      # If neither a SO accession or a SO/SOFA term, then raise an error. The GFF3 specification
+      # is very clear about type being either one of the two.
+      raise BioInterchange::Exceptions::InputFormatError, "Line #{@linenumber}. Type of feature is set to an unknown SO/SOFA term: \"#{type}\""
     end
     # String to numeric value conversions:
@@ -218,7 +225,15 @@ protected
   # +attribute_string+:: key/value string (column 9) as seen in a GFF3/GVF file
   def split_attributes(attribute_string)
     attributes = {}
-    attribute_string.split(';').map { |assignment| match = assignment.match(/([^=]+)=(.+)/) ; { match[1].strip => match[2].split(',').map { |value| URI.decode(value.strip) } } }.map { |hash| hash.each_pair { |tag,list| attributes[tag] = list } }
+    hashes = attribute_string.split(';').map { |assignment|
+      match = assignment.match(/([^=]+)=(.+)/) ;
+      { match[1].strip => match[2].split(',').map { |value| URI.decode(value.strip) } }
+    }
+    hashes.map { |hash|
+      hash.each_pair { |tag,list|
+        attributes[tag] = list
+      }
+    }
     attributes
   end

data/spec/gff3_reader_spec.rb ADDED

@@ -0,0 +1,43 @@
+require 'rubygems'
+require 'rspec'
+require 'biointerchange'
+describe BioInterchange::Genomics::GFF3Reader do
+  describe 'reading GFF3 data' do
+    describe 'reading GFF3 into a model' do
+      before :all do
+        @test_gff3 = """##gff-version 3
+##sequence-region chr1 1001 400200
+##species http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=9606
+# A comment line.
+chr1	Test Center	gene	2030	3100	.	+	.	ID=Gene1;Name=firstgene
+chr1	.	gene	5010	10029	.	-	.	ID=Gene2;Name=secondgene
+chr1	.	exon	5090	6000	.	-	.	ID=Exon2.1;Parent=Gene2
+"""
+        # Initialize with the most basic form of a GFF3 reader, i.e.
+        # no name, name_uri, date, etc., information is provided.
+        @reader = BioInterchange::Genomics::GFF3Reader.new()
+      end
+      it 'creates correct GFF3 model instance' do
+        model = @reader.deserialize(@test_gff3)
+        model.should be_an_instance_of BioInterchange::Genomics::GFF3FeatureSet
+      end
+      it 'creates a model with the right number of genomic features' do
+        model = @reader.deserialize(@test_gff3)
+        model.contents.length.should eql 3
+      end
+      it 'creates a model with the right number of pragma statements' do
+        model = @reader.deserialize(@test_gff3)
+        model.pragmas.length.should eql 3
+      end
+    end
+  end
+end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: biointerchange
 version: !ruby/object:Gem::Version
-  version: 1.0.4
+  version: 1.0.5
 platform: ruby
 authors:
 - Joachim Baran
@@ -12,7 +12,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-11-13 00:00:00.000000000 Z
+date: 2014-11-25 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rdf
@@ -226,6 +226,7 @@ files:
 - make.sh
 - spec/exceptions_spec.rb
 - spec/gff3_rdfwriter_spec.rb
+- spec/gff3_reader_spec.rb
 - spec/gvf_rdfwriter_spec.rb
 - spec/phylogenetics_spec.rb
 - spec/text_mining_pdfx_xml_reader_spec.rb
@@ -304,7 +305,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.0.14
+rubygems_version: 2.0.3
 signing_key:
 specification_version: 4
 summary: An open source framework for transforming heterogeneous data formats into