RubyGems - biointerchange - Versions diffs - 1.0.4 → 1.0.5 - Mend

biointerchange 1.0.4 → 1.0.5

Files changed (7) hide show

checksums.yaml +4 -4
data/README.md +1 -1
data/VERSION +1 -1
data/lib/biointerchange/genomics/gff3_rdf_ntriples.rb +18 -20
data/lib/biointerchange/genomics/gff3_reader.rb +17 -2
data/spec/gff3_reader_spec.rb +43 -0
metadata +4 -3

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: ac318ac0a65f19b3d411a9e478227dbfe80503d3
-  data.tar.gz: f821af2f98f214f44f9053227e46cf06a187383f
+  metadata.gz: 740da15b29a3a3da4b012e17b16e4ce3c5cef989
+  data.tar.gz: fb7508cdacae1aa18d5d142cebed3717b4c66e91
 SHA512:
-  metadata.gz: b30d9a4bbf5684f5fcd131934b2ae06233c7bab5c680be35ce554996b27bb7a2b43a6a2d6c79093acdca4dadcce20e7c653c48e615aecb4d68fd2f015ce2f93e
-  data.tar.gz: cd3308acbed7bcaaa763a8f1abd65cd597b9622e8cb36d8dc0fc5bc40d71a47f808082fdcc5b9a5a70c1cd5fd753121353c9ac2585e8616daebaf2aee4218e50
+  metadata.gz: 9cb7e6845df394b0a7d5e54310456841471baa54c2a081c18d6a7fb3c5a6a929074acc42690101c083d6f16f8d02e1f995a12806017a45c5b5e3ead24e75c5d7
+  data.tar.gz: 7921f0f48abbb8bdc8e0c2d0c47251b40b70468cde3210d9f39edb98bc94e21c778c49d79a5740101df1bae1dbeedf9e88cdac7bb08e95d642409d4915e78f12

data/README.md CHANGED

@@ -107,7 +107,7 @@ The following list provides information on the origin of the example-data files
 The following lists data that has been used in testing BioInterchange's implementation, but was not included in the GitHub repository due to their size:
-*  `mgp.v3.indels.rsIDdbSNPv137.vcf.gz`: Gzipped VCF file of M. musculus indels by the Sanger Institute. Downloaded from [ftp://ftp-mouse.sanger.ac.uk/current_snps/mgp.v3.indels.rsIDdbSNPv137.vcf.gz](ftp://ftp-mouse.sanger.ac.uk/current_snps/mgp.v3.indels.rsIDdbSNPv137.vcf.gz)
+*  `mgp.v4.indels.dbSNP.vcf.gz`: Gzipped VCF file of M. musculus indels by the Sanger Institute. Downloaded from [ftp://ftp-mouse.sanger.ac.uk/REL-1410-SNPs_Indels/mgp.v4.indels.dbSNP.vcf.gz](ftp://ftp-mouse.sanger.ac.uk/REL-1410-SNPs_Indels/mgp.v4.indels.dbSNP.vcf.gz)
 ### Application Programming Interface

data/VERSION CHANGED

@@ -1 +1 @@
-1.0.4
+1.0.5

data/lib/biointerchange/genomics/gff3_rdf_ntriples.rb CHANGED

@@ -255,26 +255,24 @@ protected
       bin_uri = RDF::URI.new("bin://#{feature.sequence_id}/#{BioInterchange::Genomics::Locations.reg2bin(feature.start_coordinate, feature.end_coordinate)}")
       create_triple(bin_uri, RDF::URI.new('bin://contains'), feature_uri)
     end
-    create_triple(region_uri, BioInterchange::FALDO.begin, start_position_uri)
-    create_triple(region_uri, BioInterchange::FALDO.end, end_position_uri)
-    case feature.strand
-    when BioInterchange::Genomics::GFF3Feature::NOT_STRANDED
-      create_triple(start_position_uri, RDF.type, BioInterchange::FALDO.Position)
-      create_triple(end_position_uri, RDF.type, BioInterchange::FALDO.Position)
-    when BioInterchange::Genomics::GFF3Feature::UNKNOWN
-      create_triple(start_position_uri, RDF.type, BioInterchange::FALDO.Position)
-      create_triple(end_position_uri, RDF.type, BioInterchange::FALDO.Position)
-    when BioInterchange::Genomics::GFF3Feature::POSITIVE
-      create_triple(start_position_uri, RDF.type, BioInterchange::FALDO.Positive_strand)
-      create_triple(end_position_uri, RDF.type, BioInterchange::FALDO.Positive_strand)
-    when BioInterchange::Genomics::GFF3Feature::NEGATIVE
-      create_triple(start_position_uri, RDF.type, BioInterchange::FALDO.Negative_strand)
-      create_triple(end_position_uri, RDF.type, BioInterchange::FALDO.Negative_strand)
-    else
-      raise BioInterchange::Exceptions::InputFormatError, 'Strand of feature is set to an unknown constant.'
-    end
-    create_triple(start_position_uri, BioInterchange::FALDO.position, feature.start_coordinate)
-    create_triple(end_position_uri, BioInterchange::FALDO.position, feature.end_coordinate)
+    [ [ start_position_uri, BioInterchange::FALDO.begin, feature.start_coordinate ],
+      [ end_position_uri, BioInterchange::FALDO.end, feature.end_coordinate ] ].each { |uri_relation_coordinate|
+      position_uri, faldo_relation, coordinate = uri_relation_coordinate
+      create_triple(region_uri, faldo_relation, position_uri)
+      case feature.strand
+      when BioInterchange::Genomics::GFF3Feature::NOT_STRANDED
+        create_triple(position_uri, RDF.type, BioInterchange::FALDO.Position)
+      when BioInterchange::Genomics::GFF3Feature::UNKNOWN
+        create_triple(position_uri, RDF.type, BioInterchange::FALDO.Position)
+      when BioInterchange::Genomics::GFF3Feature::POSITIVE
+        create_triple(position_uri, RDF.type, BioInterchange::FALDO.Positive_strand)
+      when BioInterchange::Genomics::GFF3Feature::NEGATIVE
+        create_triple(position_uri, RDF.type, BioInterchange::FALDO.Negative_strand)
+      else
+        raise BioInterchange::Exceptions::InputFormatError, 'Strand of feature is set to an unknown constant.'
+      end
+      create_triple(start_position_uri, BioInterchange::FALDO.position, coordinate)
+    }
     if feature.score then
       create_triple(feature_uri, @base.has_attribute, RDF::URI.new("#{feature_uri}/score"))
       if @format == :gvf or @format == :vcf then

data/lib/biointerchange/genomics/gff3_reader.rb CHANGED

@@ -32,6 +32,7 @@ class GFF3Reader < BioInterchange::Reader
     @name_uri = name_uri
     @date = date
     @batch_size = batch_size
+    @linenumber = 0
   end
   # Reads a GFF3 file from the input stream and returns an associated model.
@@ -79,6 +80,7 @@ protected
     begin
       line = gff3.readline
       line.chomp!
+      @linenumber += 1
       if line.start_with?('#') and not line.start_with?('##') then
         add_comment(@feature_set, line[1..-1].strip)
@@ -90,6 +92,9 @@ protected
         next
       end
+      # Empty lines. They happen. Mostly at the end of files.
+      next if line.strip.empty?
       if fasta_block then
         if line.start_with?('>') and line.length > 1 then
           @feature_set.add(BioInterchange::Genomics::GFF3FeatureSequence.new(fasta_id, fasta_sequence, fasta_comment)) if fasta_id and not fasta_sequence.empty?()
@@ -139,7 +144,9 @@ protected
         type = BioInterchange::SO.send(BioInterchange.make_safe_label(type))
       end
     rescue NoMethodError
-      raise BioInterchange::Exceptions::InputFormatError, "Type of feature is set to an unknown SOFA term: \"#{type}\""
+      # If the type is neither an SO accession nor an SO/SOFA term, then raise an error. The GFF3
+      # specification is very clear about type being one of the two.
+      raise BioInterchange::Exceptions::InputFormatError, "Line #{@linenumber}. Type of feature is set to an unknown SO/SOFA term: \"#{type}\""
     end
     # String to numeric value conversions:
@@ -218,7 +225,15 @@ protected
   # +attribute_string+:: key/value string (column 9) as seen in a GFF3/GVF file
   def split_attributes(attribute_string)
     attributes = {}
-    attribute_string.split(';').map { |assignment| match = assignment.match(/([^=]+)=(.+)/) ; { match[1].strip => match[2].split(',').map { |value| URI.decode(value.strip) } } }.map { |hash| hash.each_pair { |tag,list| attributes[tag] = list } }
+    hashes = attribute_string.split(';').map { |assignment|
+      match = assignment.match(/([^=]+)=(.+)/) ;
+      { match[1].strip => match[2].split(',').map { |value| URI.decode(value.strip) } }
+    }
+    hashes.map { |hash|
+      hash.each_pair { |tag,list|
+        attributes[tag] = list
+      }
+    }
     attributes
   end

data/spec/gff3_reader_spec.rb ADDED

@@ -0,0 +1,43 @@
+require 'rubygems'
+require 'rspec'
+require 'biointerchange'
+describe BioInterchange::Genomics::GFF3Reader do
+  describe 'reading GFF3 data' do
+    describe 'reading GFF3 into a model' do
+      before :all do
+        @test_gff3 = """##gff-version 3
+##sequence-region chr1 1001 400200
+##species http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=9606
+# A comment line.
+chr1	Test Center	gene	2030	3100	.	+	.	ID=Gene1;Name=firstgene
+chr1	.	gene	5010	10029	.	-	.	ID=Gene2;Name=secondgene
+chr1	.	exon	5090	6000	.	-	.	ID=Exon2.1;Parent=Gene2
+"""
+        # Initialize with the most basic form of a GFF3 reader, i.e.
+        # no name, name_uri, date, etc., information is provided.
+        @reader = BioInterchange::Genomics::GFF3Reader.new()
+      end
+      it 'creates correct GFF3 model instance' do
+        model = @reader.deserialize(@test_gff3)
+        model.should be_an_instance_of BioInterchange::Genomics::GFF3FeatureSet
+      end
+      it 'creates a model with the right number of genomic features' do
+        model = @reader.deserialize(@test_gff3)
+        model.contents.length.should eql 3
+      end
+      it 'creates a model with the right number of pragma statements' do
+        model = @reader.deserialize(@test_gff3)
+        model.pragmas.length.should eql 3
+      end
+    end
+  end
+end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: biointerchange
 version: !ruby/object:Gem::Version
-  version: 1.0.4
+  version: 1.0.5
 platform: ruby
 authors:
 - Joachim Baran
@@ -12,7 +12,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-11-13 00:00:00.000000000 Z
+date: 2014-11-25 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rdf
@@ -226,6 +226,7 @@ files:
 - make.sh
 - spec/exceptions_spec.rb
 - spec/gff3_rdfwriter_spec.rb
+- spec/gff3_reader_spec.rb
 - spec/gvf_rdfwriter_spec.rb
 - spec/phylogenetics_spec.rb
 - spec/text_mining_pdfx_xml_reader_spec.rb
@@ -304,7 +305,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.0.14
+rubygems_version: 2.0.3
 signing_key:
 specification_version: 4
 summary: An open source framework for transforming heterogeneous data formats into