RubyGems - biointerchange - Versions diffs - 0.1.2 → 0.1.3 - Mend

biointerchange 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30) hide show

data/README.md +49 -4
data/VERSION +1 -1
data/examples/chromosome_BF.gff +1701 -0
data/examples/estd176_Banerjee_et_al_2011.2012-11-29.NCBI36.gvf +4326 -0
data/examples/pubannotation.10096561.json +1 -0
data/examples/{pubannotation.json → pubannotation.10096561.json.old} +0 -0
data/examples/pubannotation.2626671.json +1 -0
data/lib/biointerchange/core.rb +58 -16
data/lib/biointerchange/genomics/gff3_feature.rb +1 -0
data/lib/biointerchange/genomics/gff3_feature_set.rb +31 -1
data/lib/biointerchange/genomics/gff3_pragmas.rb +35 -0
data/lib/biointerchange/genomics/gff3_rdf_ntriples.rb +60 -23
data/lib/biointerchange/genomics/gff3_reader.rb +74 -40
data/lib/biointerchange/genomics/gvf_feature.rb +24 -0
data/lib/biointerchange/genomics/gvf_feature_set.rb +14 -0
data/lib/biointerchange/genomics/gvf_pragmas.rb +6 -0
data/lib/biointerchange/genomics/gvf_reader.rb +37 -0
data/lib/biointerchange/gff3o.rb +1 -1
data/lib/biointerchange/gvf1o.rb +145 -17
data/lib/biointerchange/textmining/content.rb +1 -0
data/lib/biointerchange/textmining/content_connection.rb +74 -0
data/lib/biointerchange/textmining/document.rb +3 -1
data/lib/biointerchange/textmining/pubannos_json_reader.rb +87 -9
data/lib/biointerchange/textmining/text_mining_rdf_ntriples.rb +58 -2
data/spec/gff3_rdfwriter_spec.rb +9 -1
data/spec/gvf_rdfwriter_spec.rb +81 -0
data/spec/text_mining_pubannos_json_reader_spec.rb +82 -10
data/spec/text_mining_rdfwriter_spec.rb +11 -0
data/web/api.html +30 -23
metadata +156 -138

data/examples/pubannotation.10096561.json ADDED

@@ -0,0 +1 @@

+ {"name": "Peter Smith", "name_id": "<peter.smith@example.json>", "date": "2012-12-08", "version": "4", "docurl":"http://www.ncbi.nlm.nih.gov/pubmed/10096561","text":"Stimulation of CD40 on immunogenic human malignant melanomas augments their cytotoxic T lymphocyte-mediated lysis and induces apoptosis.\nHere, we report the functional expression of CD40 on human malignant melanomas (MMs). Comparison of tumor specimen from MM precursor lesions, primary tumors, and metastases revealed that CD40 surface expression is down-regulated during tumor progression. CD40 expression was confirmed in 7 human MM cell lines established from immunogenic primary tumors or metastases, whereas 11 cell lines established from advanced stages were CD40 negative. CD40 expression could be enhanced in CD40-positive MM by stimulation with IFN-gamma and tumor necrosis factor-alpha but not by interleukin (IL)-1beta or CD40 triggering. CD40 ligation on MM by CD40L-transfected murine L-cells or by a soluble CD40L fusion protein up-regulated their expression of intercellular adhesion molecule-1 and MHC class I and class II molecules and their secretion of IL-6, IL-8, tumor necrosis factor-a, and granulocyte macrophage colony-stimulating factor and also induced a rapid activation of the transcription factor nuclear factor kappaB. Furthermore, CD40 ligation of a HLA-A2+, MelanA/MART1+ MM cell line enhanced its susceptibility to specific lysis by a HLA-A2-restricted, MelanA/MART-1-specific CTL clone. Finally, CD40 ligation induced growth inhibition and apoptosis in MM. 
These results indicate that CD40-CD40L interactions may play an important role in augmenting antitumor immunity and inducing apoptosis in some CD40-positive immunogenic human MMs.","catanns":[{"id":"T1","span":{"begin":15,"end":19},"category":"NP"},{"id":"T2","span":{"begin":23,"end":60},"category":"NP"},{"id":"T3","span":{"begin":70,"end":75},"category":"PR"},{"id":"T4","span":{"begin":126,"end":135},"category":"NP"},{"id":"T5","span":{"begin":182,"end":186},"category":"NP"},{"id":"T6","span":{"begin":190,"end":221},"category":"NP"},{"id":"T7","span":{"begin":299,"end":309},"category":"NP"},{"id":"T8","span":{"begin":392,"end":407},"category":"NP"},{"id":"T9","span":{"begin":494,"end":504},"category":"NP"},{"id":"T10","span":{"begin":581,"end":596},"category":"NP"},{"id":"T11","span":{"begin":768,"end":770},"category":"NP"},{"id":"T12","span":{"begin":857,"end":862},"category":"PR"},{"id":"T13","span":{"begin":954,"end":959},"category":"PR"},{"id":"T14","span":{"begin":1163,"end":1217},"category":"NP"},{"id":"T15","span":{"begin":1227,"end":1230},"category":"PR"},{"id":"T16","span":{"begin":1375,"end":1384},"category":"NP"},{"id":"T17","span":{"begin":1388,"end":1390},"category":"NP"},{"id":"T18","span":{"begin":1517,"end":1526},"category":"NP"}],"insanns":[],"relanns":[{"id":"R1","type":"coreferenceOf","subject":"T3","object":"T2"},{"id":"R2","type":"coreferenceOf","subject":"T5","object":"T1"},{"id":"R3","type":"coreferenceOf","subject":"T6","object":"T2"},{"id":"R4","type":"coreferenceOf","subject":"T9","object":"T7"},{"id":"R5","type":"coreferenceOf","subject":"T10","object":"T8"},{"id":"R6","type":"coreferenceOf","subject":"T11","object":"T6"},{"id":"R7","type":"coreferenceOf","subject":"T12","object":"T11"},{"id":"R8","type":"coreferenceOf","subject":"T13","object":"T11"},{"id":"R9","type":"coreferenceOf","subject":"T15","object":"T14"},{"id":"R10","type":"coreferenceOf","subject":"T16","object":"T4"},{"id":"R11","type":"coreferenceOf","subject":"T17","object
":"T11"},{"id":"R12","type":"coreferenceOf","subject":"T18","object":"T16"}],"modanns":[]}

data/examples/{pubannotation.json → pubannotation.10096561.json.old} RENAMED

File without changes

data/examples/pubannotation.2626671.json ADDED

@@ -0,0 +1 @@

+ {"name": "Peter Smith", "name_id": "<peter.smith@example.json>", "date": "2012-12-08", "version": "3", "docurl":"http://www.ncbi.nlm.nih.gov/pubmed/2626671","pmcdoc_id":"2626671","div_id":"3","text":"Distinct expression kinetics of perforin and granzyme B during CTL development in culture\nOur experiments revealed clear differences in the kinetics of perforin, granzyme B, and cytokine expression during CD8+ T cell activation (Fig. 1). Naive T cells showed detectable expression of perforin mRNA as well as perforin protein (Fig. 1, A\u2013D). Relative to its expression in naive T cells, perforin (Prf1) mRNA expression did not increase appreciably at day 2 but showed a reproducible decrease at day 4, followed by robust reexpression between days 4 and 8 (Fig. 1, A\u2013D). In contrast, granzyme B (Gzmb) mRNA was low or undetectable in naive T cells but was strongly up-regulated by day 2 after stimulation and increased progressively until day 6 (Fig. 1, A and B); similarly, granzyme B protein was expressed by day 4 and remained high until day 6 (Fig. 1 E). As expected, a small fraction of naive T cells expressed the cytokines IFN-\u03b3 and TNF in response to stimulation, and this capacity increased significantly in differentiated cells (Fig. 1 E; see also Fig. 2 A). \nWe evaluated antigen-dependent cytolytic function in a short-term assay in which target cell death was measured within 2 h (Fig. 1 F). By limiting the duration of TCR stimulation, this strategy minimizes cytolysis secondary to new gene expression during the period of the assay. Naive T cells did not display significant cytolytic function in this short-term assay (unpublished data), most likely because they express immature (unprocessed) forms of perforin and lack the capacity to degranulate (18, 19). Even after activation for 2 or 4 d, the cells showed poor cytolytic activity (Fig. 1 F), in striking contrast to their capacity for efficient cytokine production (Fig. 1 E). 
Only cells cultured until day 6 displayed robust cytotoxicity, as judged by their ability to induce apoptosis in a large number of target cells (Fig. 1 F). \nThese results show that after a strong priming stimulus through TCRs and co-stimulatory receptors in vitro, granzyme B expression and the ability to produce effector cytokines are programmed early, whereas perforin expression and cytolytic function are induced later, during the phase of clonal expansion in IL-2. Therefore, the two major effector functions of CTL, cytokine production and cytolytic activity, are not intrinsically coregulated. \n","catanns":[{"id":"T21","span":{"begin":9,"end":19},"category":"Gene_expression"},{"id":"T1","span":{"begin":32,"end":40},"category":"Protein"},{"id":"T2","span":{"begin":45,"end":55},"category":"Protein"},{"id":"T3","span":{"begin":152,"end":160},"category":"Protein"},{"id":"T4","span":{"begin":162,"end":172},"category":"Protein"},{"id":"T22","span":{"begin":187,"end":197},"category":"Gene_expression"},{"id":"T5","span":{"begin":205,"end":208},"category":"Protein"},{"id":"T23","span":{"begin":270,"end":280},"category":"Gene_expression"},{"id":"T6","span":{"begin":284,"end":292},"category":"Protein"},{"id":"T7","span":{"begin":309,"end":317},"category":"Protein"},{"id":"T19","span":{"begin":353,"end":356},"category":"Anaphora"},{"id":"T24","span":{"begin":357,"end":367},"category":"Gene_expression"},{"id":"T8","span":{"begin":386,"end":394},"category":"Protein"},{"id":"T9","span":{"begin":396,"end":400},"category":"Protein"},{"id":"T25","span":{"begin":402,"end":417},"category":"Transcription"},{"id":"T26","span":{"begin":426,"end":434},"category":"Positive_regulation"},{"id":"T27","span":{"begin":482,"end":490},"category":"Negative_regulation"},{"id":"T28","span":{"begin":520,"end":532},"category":"Positive_regulation"},{"id":"T10","span":{"begin":582,"end":592},"category":"Protein"},{"id":"T11","span":{"begin":594,"end":598},"category":"Protein"},{"id":"T29","span":{"be
gin":609,"end":628},"category":"Transcription"},{"id":"T30","span":{"begin":663,"end":675},"category":"Positive_regulation"},{"id":"T31","span":{"begin":707,"end":716},"category":"Positive_regulation"},{"id":"T12","span":{"begin":773,"end":783},"category":"Protein"},{"id":"T32","span":{"begin":819,"end":832},"category":"Positive_regulation"},{"id":"T33","span":{"begin":904,"end":913},"category":"Gene_expression"},{"id":"T13","span":{"begin":928,"end":933},"category":"Protein"},{"id":"T14","span":{"begin":938,"end":941},"category":"Protein"},{"id":"T20","span":{"begin":974,"end":987},"category":"Anaphora"},{"id":"T34","span":{"begin":988,"end":997},"category":"Positive_regulation"},{"id":"T35","span":{"begin":1478,"end":1485},"category":"Gene_expression"},{"id":"T15","span":{"begin":1518,"end":1526},"category":"Protein"},{"id":"T16","span":{"begin":2013,"end":2023},"category":"Protein"},{"id":"T36","span":{"begin":2024,"end":2034},"category":"Gene_expression"},{"id":"T17","span":{"begin":2111,"end":2119},"category":"Protein"},{"id":"T37","span":{"begin":2120,"end":2130},"category":"Gene_expression"},{"id":"T38","span":{"begin":2158,"end":2165},"category":"Positive_regulation"},{"id":"T18","span":{"begin":2213,"end":2217},"category":"Protein"}],"insanns":[{"id":"E1","type":"subClassOf","object":"T21"},{"id":"E2","type":"subClassOf","object":"T21"},{"id":"E3","type":"subClassOf","object":"T22"},{"id":"E4","type":"subClassOf","object":"T22"},{"id":"E5","type":"subClassOf","object":"T23"},{"id":"E6","type":"subClassOf","object":"T23"},{"id":"E7","type":"subClassOf","object":"T24"},{"id":"E8","type":"subClassOf","object":"T25"},{"id":"E9","type":"subClassOf","object":"T26"},{"id":"E10","type":"subClassOf","object":"T27"},{"id":"E11","type":"subClassOf","object":"T28"},{"id":"E12","type":"subClassOf","object":"T29"},{"id":"E13","type":"subClassOf","object":"T30"},{"id":"E14","type":"subClassOf","object":"T31"},{"id":"E15","type":"subClassOf","object":"T32"},{"id":"E16","ty
pe":"subClassOf","object":"T33"},{"id":"E17","type":"subClassOf","object":"T33"},{"id":"E18","type":"subClassOf","object":"T34"},{"id":"E19","type":"subClassOf","object":"T34"},{"id":"E20","type":"subClassOf","object":"T35"},{"id":"E21","type":"subClassOf","object":"T36"},{"id":"E22","type":"subClassOf","object":"T37"},{"id":"E23","type":"subClassOf","object":"T38"}],"relanns":[{"id":"R4","type":"equivalentTo","subject":"T9","object":"T8"},{"id":"R5","type":"equivalentTo","subject":"T11","object":"T10"},{"id":"R6","type":"themeOf","subject":"T1","object":"E1"},{"id":"R7","type":"themeOf","subject":"T2","object":"E2"},{"id":"R8","type":"themeOf","subject":"T3","object":"E3"},{"id":"R9","type":"themeOf","subject":"T4","object":"E4"},{"id":"R10","type":"themeOf","subject":"T6","object":"E5"},{"id":"R11","type":"themeOf","subject":"T7","object":"E6"},{"id":"R12","type":"themeOf","subject":"T8","object":"E7"},{"id":"R13","type":"themeOf","subject":"T8","object":"E8"},{"id":"R14","type":"themeOf","subject":"E8","object":"E9"},{"id":"R15","type":"themeOf","subject":"E8","object":"E10"},{"id":"R16","type":"themeOf","subject":"E8","object":"E11"},{"id":"R17","type":"themeOf","subject":"T10","object":"E12"},{"id":"R18","type":"themeOf","subject":"T10","object":"E13"},{"id":"R19","type":"themeOf","subject":"T10","object":"E14"},{"id":"R20","type":"themeOf","subject":"T12","object":"E15"},{"id":"R21","type":"themeOf","subject":"T13","object":"E16"},{"id":"R22","type":"themeOf","subject":"T14","object":"E17"},{"id":"R23","type":"themeOf","subject":"E16","object":"E18"},{"id":"R24","type":"themeOf","subject":"E17","object":"E19"},{"id":"R25","type":"themeOf","subject":"T15","object":"E20"},{"id":"R26","type":"themeOf","subject":"T16","object":"E21"},{"id":"R27","type":"themeOf","subject":"T17","object":"E22"},{"id":"R28","type":"themeOf","subject":"E22","object":"E23"}],"modanns":[{"id":"M1","type":"Speculation","object":"E1"},{"id":"M2","type":"Speculation","object":"E2"},{"id":
"M3","type":"Negation","object":"E9"},{"id":"M4","type":"Negation","object":"E12"}]}

data/lib/biointerchange/core.rb CHANGED

@@ -1,3 +1,8 @@
+# BioInterchange converts non-RDF data formats into RDF.
+#
+# Convert TSV, XML, GFF3, GVF and other files into RDF triples using
+# BioInterchange's command-line tool or its web services, or use
+# it as a gem in your own Ruby implementation.
 module BioInterchange
   # Custom Exceptions and Errors
@@ -25,6 +30,7 @@ module BioInterchange
   # Text mining model
   require 'biointerchange/textmining/document'
   require 'biointerchange/textmining/content'
+  require 'biointerchange/textmining/content_connection'
   require 'biointerchange/textmining/process'
   # Text mining writers
@@ -34,16 +40,32 @@ module BioInterchange
   # GENOMICS
   #
-  # GFF3 reader
+  ### GFF3 ###
+  # Reader
   require 'biointerchange/genomics/gff3_reader'
   # Feature base model
+  require 'biointerchange/genomics/gff3_pragmas'
   require 'biointerchange/genomics/gff3_feature_set'
   require 'biointerchange/genomics/gff3_feature'
-  # GFF3 writer
+  # Writer
   require 'biointerchange/genomics/gff3_rdf_ntriples'
+  ### GVF ###
+  # Reader
+  require 'biointerchange/genomics/gvf_reader'
+  # Feature base model
+  require 'biointerchange/genomics/gvf_pragmas'
+  require 'biointerchange/genomics/gvf_feature_set'
+  require 'biointerchange/genomics/gvf_feature'
+  # Writer
+  # ...same GFF3 writer
   #
   # ACTUAL COMMAND LINE IMPLEMENTATION
   #
@@ -68,19 +90,27 @@ module BioInterchange
       if opt['help'] or not opt['input'] or not opt['rdf'] then
         puts "Usage: ruby #{$0} -i <format> -r <format> [options]"
+        puts ''
         puts 'Supported input formats (--input <format>/-i <format>):'
         puts '  biointerchange.gff3                : GFF3'
+        puts '  biointerchange.gvf                 : GVF'
         puts '  dbcls.catanns.json                 : PubAnnotation JSON'
         puts '  uk.ac.man.pdfx                     : PDFx XML'
+        puts ''
         puts 'Supported output formats (--rdf <format>/-r <format>)'
-        puts '  rdf.biointerchange.gff3            : RDF N-Triples for input'
+        puts '  rdf.biointerchange.gff3            : RDF N-Triples for the following input'
         puts '    biointerchange.gff3'
-        puts '  rdf.bh12.sio                       : RDF N-Triples for inputs'
+        puts '  rdf.biointerchange.gvf             : RDF N-Triples for the following input'
+        puts '    biointerchange.gff3'
+        puts '    biointerchange.gvf'
+        puts '  rdf.bh12.sio                       : RDF N-Triples for the following inputs'
         puts '    dbcls.catanns.json'
         puts '    uk.ac.man.pdfx'
+        puts ''
         puts 'I/O options:'
         puts '  -f <file>/--file <file>            : file to read; STDIN used if not supplied'
         puts '  -o <file>/--out <file>             : output file; STDOUT used if not supplied'
+        puts ''
         puts 'Input-/RDF-format specific options:'
         puts '  Input: dbcls.catanns.json, uk.ac.man.pdfx'
         puts '  Output: rdf.bh12.sio'
@@ -89,13 +119,15 @@ module BioInterchange
         puts '    -v <version>/--version <version> : version number of resource (optional)'
         puts '    --name <name>                    : name of resource/tool/person (required)'
         puts '    --name_id <id>                   : URI of resource/tool/person (required)'
+        puts ''
         puts 'Input-/RDF-format specific options:'
-        puts '  Input: biointerchange.gff3'
-        puts '  Output: rdf.biointerchange.gff3'
+        puts '  Input: biointerchange.gff3 or biointerchange.gvf'
+        puts '  Output: rdf.biointerchange.gff3 or rdf.biointerchange.gvf'
         puts '  Options:'
-        puts '    -t <date>/--date <date>          : date when the GFF3 file was created (optional)'
-        puts '    --name <name>                    : name of the GFF3 file creator (optional)'
-        puts '    --name_id <id>                   : email address of the GFF3 file creator (optional)'
+        puts '    -t <date>/--date <date>          : date when the GFF3/GVF file was created (optional)'
+        puts '    --name <name>                    : name of the GFF3/GVF file creator (optional)'
+        puts '    --name_id <id>                   : email address of the GFF3/GVF file creator (optional)'
+        puts ''
         puts 'Other options:'
         puts '  -d / --debug                       : turn on debugging output (for stacktraces)'
         puts '  -h  --help                         : this message'
@@ -103,6 +135,7 @@ module BioInterchange
         exit 1
       end
+      # Check if the input/rdf options are supported:
       if opt['input'] == 'dbcls.catanns.json' or opt['input'] == 'uk.ac.man.pdfx' then
         if opt['rdf'] == 'rdf.bh12.sio' then
           raise ArgumentError, 'Require --name and --name_id options to specify source of annotations (e.g., a manual annotators name, or software tool name) and their associated URI (e.g., email address, or webaddress).' unless opt['name'] and opt['name_id']
@@ -115,22 +148,30 @@ module BioInterchange
         else
           unsupported_combination
         end
+      elsif opt['input'] == 'biointerchange.gvf' then
+        if opt['rdf'] == 'rdf.biointerchange.gff3' or opt['rdf'] == 'rdf.biointerchange.gvf' then
+          # Okay. No further arguments required.
+        else
+          unsupported_combination
+        end
       else
         unsupported_combination
       end
       opt['date'] = nil unless opt['date']
       opt['version'] = nil unless opt['version']
-      # generate model from file (deserialise)
+      # Generate model from file (deserialization).
+      # Note: if-clauses are lexicographically ordered.
       reader = nil
-      if opt['input'] == 'dbcls.catanns.json' then
+      if opt['input'] == 'biointerchange.gff3' then
+        reader = BioInterchange::Genomics::GFF3Reader.new(opt['name'], opt['name_id'], opt['date'])
+      elsif opt['input'] == 'biointerchange.gvf' then
+        reader = BioInterchange::Genomics::GVFReader.new(opt['name'], opt['name_id'], opt['date'])
+      elsif opt['input'] == 'dbcls.catanns.json' then
         reader = BioInterchange::TextMining::PubannosJsonReader.new(opt['name'], opt['name_id'], opt['date'], BioInterchange::TextMining::Process::UNSPECIFIED, opt['version'])
       elsif opt['input'] == 'uk.ac.man.pdfx' then
         reader = BioInterchange::TextMining::PdfxXmlReader.new(opt['name'], opt['name_id'], opt['date'], BioInterchange::TextMining::Process::UNSPECIFIED, opt['version'])
-      elsif opt['input'] == 'biointerchange.gff3' then
-        reader = BioInterchange::Genomics::GFF3Reader.new(opt['name'], opt['name_id'], opt['date'])
       end
       model = nil
@@ -140,13 +181,14 @@ module BioInterchange
         model = reader.deserialize(STDIN)
       end
-      # generate rdf from model (serialise)
+      # Generate rdf from model (serialization).
+      # Note: if-clauses are lexicographically ordered.
       writer = nil
       if opt['rdf'] == 'rdf.bh12.sio' then
         writer = BioInterchange::TextMining::RDFWriter.new(File.new(opt['out'], 'w')) if opt['out']
         writer = BioInterchange::TextMining::RDFWriter.new(STDOUT) unless opt['out']
       end
-      if opt['rdf'] == 'rdf.biointerchange.gff3' then
+      if opt['rdf'] == 'rdf.biointerchange.gff3' or opt['rdf'] == 'rdf.biointerchange.gvf' then
         writer = BioInterchange::Genomics::RDFWriter.new(File.new(opt['out'], 'w')) if opt['out']
         writer = BioInterchange::Genomics::RDFWriter.new(STDOUT) unless opt['out']
       end

data/lib/biointerchange/genomics/gff3_feature.rb CHANGED

@@ -1,5 +1,6 @@
 module BioInterchange::Genomics
+# Represents a single genomic feature of a GFF3 file.
 class GFF3Feature
   # Constants determining the strand of the feature.

data/lib/biointerchange/genomics/gff3_feature_set.rb CHANGED

@@ -2,6 +2,7 @@ require 'digest/sha1'
 module BioInterchange::Genomics
+# A GFF3 feature set, which encapsulates the information of a single GFF3 file.
 class GFF3FeatureSet
   # Create a new instance of a Generic Feature Format Version 3 (GFF3) feature set. A feature
@@ -9,13 +10,34 @@ class GFF3FeatureSet
   def initialize
     # Features are stored as the keys of a hash map to increase performance:
     @set = {}
+    # Pragmas, i.e. feature meta-information, are stored as named mappings. Many
+    # pragmas are simple key/value assignments, but others permit multiple values
+    # whose ordering does matter. In that case, an array is used to store the
+    # various values.
+    @pragmas = {}
   end
-  # Returns the contents of the feature set.
+  # Returns the contents of the feature set -- excluding pragma meta-data.
   def contents
     @set.keys
   end
+  # Returns information stored for a named pragma, or nil if there is no information
+  # stored for it.
+  #
+  # +name+:: a string representing the name of the pragma whose value we are interested in
+  def pragma(name)
+    return nil unless name
+    # TODO Should throw exception if name is not a string.
+    return nil unless name.kind_of?(String)
+    @pragmas[name]
+  end
+  # Returns the names of all the pragmas for which some information has been recorded.
+  def pragmas
+    @pragmas.keys
+  end
   # Returns an URI for this particular feature set, which is a SHA1 hash over the content's concatenated properties.
   def uri
     clob = ''
@@ -32,6 +54,14 @@ class GFF3FeatureSet
     @set[feature] = true
   end
+   # Sets the value for named pragma meta-data.
+   #
+   # +name+:: a string representing the unique name of the pragma
+   # +value+:: an object representing the value of the pragma assignment
+   def set_pragma(name, value)
+     # TODO Should throw exception if name is not a string.
+     @pragmas[name] = value
+   end
 end
 end

data/lib/biointerchange/genomics/gff3_pragmas.rb ADDED

@@ -0,0 +1,35 @@
+module BioInterchange::Genomics
+# Represents a named region, which is defined by the pragma statement 'sequence-region'.
+class GFF3NamedRegion
+  # Create a new instance of a named region.
+  #
+  # +seqid+:: unique identifier (in the GFF3 file context) that identifies this region
+  # +start_coordinate+:: genomic start coordinate of the region
+  # +end_coordinate+:: genomic end coordinate of the region
+  def initialize(seqid, start_coordinate, end_coordinate)
+    @seqid = seqid
+    @start_coordinate = start_coordinate
+    @end_coordinate = end_coordinate
+  end
+  # Returns the unique identifier (based on a GFF3 file context) of the region.
+  def seqid
+    @seqid
+  end
+  # Returns the start coordinate of the region.
+  def start_coordinate
+    @start_coordinate
+  end
+  # Returns the end coordinate of the region.
+  def end_coordinate
+    @end_coordinate
+  end
+end
+end

data/lib/biointerchange/genomics/gff3_rdf_ntriples.rb CHANGED

@@ -4,6 +4,15 @@ require 'date'
 module BioInterchange::Genomics
+# Serializes GFF3 and GVF models.
+#
+# Inputs:
+# - biointerchange.gff3
+# - biointerchange.gvf
+#
+# Outputs:
+# - rdf.biointerchange.gff3
+# - rdf.biointerchange.gvf
 class RDFWriter
   # Creates a new instance of a RDFWriter that will use the provided output stream to serialize RDF.
@@ -19,14 +28,19 @@ class RDFWriter
   # +model+:: a generic representation of input data that is derived from BioInterchange::Genomics::GFF3FeatureSet
   def serialize(model)
     if model.instance_of?(BioInterchange::Genomics::GFF3FeatureSet) then
+      @base = BioInterchange::GFF3O
+      serialize_model(model)
+    elsif model.instance_of?(BioInterchange::Genomics::GVFFeatureSet) then
+      @base = BioInterchange::GVF1O
       serialize_model(model)
     else
       raise BioInterchange::Exceptions::ImplementationWriterError, 'The provided model cannot be serialized. ' +
-                           'This writer supports serialization for BioInterchange::Genomics::GFF3FeatureSet.'
+                           'This writer supports serialization for BioInterchange::Genomics::GFF3FeatureSet and '
+                           'BioInterchange::Genomics::GVFFeatureSet.'
     end
   end
-private
+protected
   # Serializes RDF for a feature set representation.
   #
@@ -34,11 +48,34 @@ private
   def serialize_model(model)
     graph = RDF::Graph.new
     set_uri = RDF::URI.new(model.uri)
-    graph.insert(RDF::Statement.new(set_uri, RDF.type, BioInterchange::GFF3O.Set))
+    graph.insert(RDF::Statement.new(set_uri, RDF.type, @base.Set))
+    model.pragmas.each { |pragma_name|
+      serialize_pragma(graph, set_uri, model.pragma(pragma_name))
+    }
     model.contents.each { |feature|
       serialize_feature(graph, set_uri, feature)
     }
     RDF::NTriples::Writer.dump(graph, @ostream)
+    # TODO Figure out why the following is very slow. Use with 'rdf-raptor'.
+    # RDF::RDFXML::Writer.dump(graph, @ostream)
+  end
+  # Serializes pragmas for a given feature set URI.
+  # +graph+:: RDF graph to which the pragmas are added
+  # +set_uri+:: the feature set URI to which the pragmas belong
+  # +pragma+:: an object representing a pragma statement
+  def serialize_pragma(graph, set_uri, pragma)
+    if pragma.kind_of?(Hash) then
+      if pragma.has_key?('gff-version') and @base == BioInterchange::GFF3O then
+        graph.insert(RDF::Statement.new(set_uri, @base.version, RDF::Literal.new(pragma['gff-version'], :datatype => RDF::XSD.float )))
+      elsif pragma.has_key?('gff-version') and @base == BioInterchange::GVF1O then
+        graph.insert(RDF::Statement.new(set_uri, @base.gff_version, RDF::Literal.new(pragma['gff-version'], :datatype => RDF::XSD.float )))
+      elsif pragma.has_key?('gvf-version') and @base == BioInterchange::GVF1O then
+        graph.insert(RDF::Statement.new(set_uri, @base.gvf_version, RDF::Literal.new(pragma['gvf-version'], :datatype => RDF::XSD.float )))
+      end
+    else
+      # TODO
+    end
   end
   # Serializes a +GFF3Feature+ object for a given feature set URI.
@@ -50,30 +87,30 @@ private
     # TODO Make sure there is only one value in the 'ID' list.
     feature_uri = RDF::URI.new("#{set_uri.to_s}/feature/#{feature.sequence_id},#{feature.source},#{feature.type.to_s.sub(/^[^:]+:\/\//, '')},#{feature.start_coordinate},#{feature.end_coordinate},#{feature.strand},#{feature.phase}") unless feature.attributes.has_key?('ID')
     feature_uri = RDF::URI.new("#{set_uri.to_s}/feature/#{feature.attributes['ID'][0]}") if feature.attributes.has_key?('ID')
-    feature_properties = BioInterchange::GFF3O.feature_properties.select { |uri| BioInterchange::GFF3O.is_datatype_property?(uri) }[0]
-    graph.insert(RDF::Statement.new(set_uri, BioInterchange::GFF3O.contains, feature_uri))
-    graph.insert(RDF::Statement.new(feature_uri, RDF.type, BioInterchange::GFF3O.Feature))
-    graph.insert(RDF::Statement.new(feature_uri, BioInterchange::GFF3O.seqid, RDF::Literal.new(feature.sequence_id)))
-    graph.insert(RDF::Statement.new(feature_uri, BioInterchange::GFF3O.source, RDF::Literal.new(feature.source)))
-    graph.insert(RDF::Statement.new(feature_uri, BioInterchange::GFF3O.type, RDF::Literal.new(feature.type)))
-    graph.insert(RDF::Statement.new(feature_uri, BioInterchange::GFF3O.with_parent(BioInterchange::GFF3O.start, feature_properties)[0], RDF::Literal.new(feature.start_coordinate)))
-    graph.insert(RDF::Statement.new(feature_uri, BioInterchange::GFF3O.with_parent(BioInterchange::GFF3O.end, feature_properties)[0], RDF::Literal.new(feature.end_coordinate)))
-    graph.insert(RDF::Statement.new(feature_uri, BioInterchange::GFF3O.score, RDF::Literal.new(feature.score))) if feature.score
-    feature_properties = BioInterchange::GFF3O.feature_properties.select { |uri| BioInterchange::GFF3O.is_object_property?(uri) }[0]
-    strand_uri = BioInterchange::GFF3O.with_parent(BioInterchange::GFF3O.strand, feature_properties)[0]
+    feature_properties = @base.feature_properties.select { |uri| @base.is_datatype_property?(uri) }[0]
+    graph.insert(RDF::Statement.new(set_uri, @base.contains, feature_uri))
+    graph.insert(RDF::Statement.new(feature_uri, RDF.type, @base.Feature))
+    graph.insert(RDF::Statement.new(feature_uri, @base.with_parent([ @base.seqid ].flatten, feature_properties)[0], RDF::Literal.new(feature.sequence_id)))
+    graph.insert(RDF::Statement.new(feature_uri, @base.source, RDF::Literal.new(feature.source)))
+    graph.insert(RDF::Statement.new(feature_uri, @base.type, RDF::Literal.new(feature.type)))
+    graph.insert(RDF::Statement.new(feature_uri, @base.with_parent(@base.start, feature_properties)[0], RDF::Literal.new(feature.start_coordinate)))
+    graph.insert(RDF::Statement.new(feature_uri, @base.with_parent(@base.end, feature_properties)[0], RDF::Literal.new(feature.end_coordinate)))
+    graph.insert(RDF::Statement.new(feature_uri, @base.score, RDF::Literal.new(feature.score))) if feature.score
+    feature_properties = @base.feature_properties.select { |uri| @base.is_object_property?(uri) }[0]
+    strand_uri = @base.with_parent(@base.strand, feature_properties)[0]
     case feature.strand
     when BioInterchange::Genomics::GFF3Feature::NOT_STRANDED
-      graph.insert(RDF::Statement.new(feature_uri, strand_uri, BioInterchange::GFF3O.NotStranded))
+      graph.insert(RDF::Statement.new(feature_uri, strand_uri, @base.NotStranded))
     when BioInterchange::Genomics::GFF3Feature::UNKNOWN
-      graph.insert(RDF::Statement.new(feature_uri, strand_uri, BioInterchange::GFF3O.UnknownStrand))
+      graph.insert(RDF::Statement.new(feature_uri, strand_uri, @base.UnknownStrand))
     when BioInterchange::Genomics::GFF3Feature::POSITIVE
-      graph.insert(RDF::Statement.new(feature_uri, strand_uri, BioInterchange::GFF3O.Positive))
+      graph.insert(RDF::Statement.new(feature_uri, strand_uri, @base.Positive))
     when BioInterchange::Genomics::GFF3Feature::NEGATIVE
-      graph.insert(RDF::Statement.new(feature_uri, strand_uri, BioInterchange::GFF3O.Negative))
+      graph.insert(RDF::Statement.new(feature_uri, strand_uri, @base.Negative))
     else
       raise ArgumentException, 'Strand of feature is set to an unknown constant.'
     end
-    graph.insert(RDF::Statement.new(feature_uri, BioInterchange::GFF3O.phase, RDF::Literal.new(feature.phase))) if feature.phase
+    graph.insert(RDF::Statement.new(feature_uri, @base.phase, RDF::Literal.new(feature.phase))) if feature.phase
     serialize_attributes(graph, set_uri, feature_uri, feature.attributes) unless feature.attributes.keys.empty?
   end
@@ -88,16 +125,16 @@ private
     attributes.each_pair { |tag, list|
       if tag == 'Parent' then
         list.each { |parent_id|
-          graph.insert(RDF::Statement.new(feature_uri, BioInterchange::GFF3O.parent, RDF::URI.new("#{set_uri.to_s}/feature/#{parent_id}")))
+          graph.insert(RDF::Statement.new(feature_uri, @base.parent, RDF::URI.new("#{set_uri.to_s}/feature/#{parent_id}")))
         }
       else
         list.each_index { |index|
           value = list[index]
           attribute_uri = RDF::URI.new("#{feature_uri.to_s}/attribute/#{tag}") if list.size == 1
           attribute_uri = RDF::URI.new("#{feature_uri.to_s}/attribute/#{tag}-#{index + 1}") unless list.size == 1
-          graph.insert(RDF::Statement.new(feature_uri, BioInterchange::GFF3O.attributes, attribute_uri))
-          graph.insert(RDF::Statement.new(attribute_uri, RDF.type, BioInterchange::GFF3O.Attribute))
-          graph.insert(RDF::Statement.new(attribute_uri, BioInterchange::GFF3O.tag, RDF::Literal.new("#{tag}")))
+          graph.insert(RDF::Statement.new(feature_uri, @base.attributes, attribute_uri))
+          graph.insert(RDF::Statement.new(attribute_uri, RDF.type, @base.Attribute))
+          graph.insert(RDF::Statement.new(attribute_uri, @base.tag, RDF::Literal.new("#{tag}")))
           graph.insert(RDF::Statement.new(attribute_uri, RDF.value, RDF::Literal.new(value)))
         }
       end

data/lib/biointerchange/genomics/gff3_reader.rb CHANGED

@@ -26,60 +26,94 @@ class GFF3Reader
     end
   end
-private
+protected
+  def create_feature_set
+    BioInterchange::Genomics::GFF3FeatureSet.new()
+  end
   def create_model(gff3)
-    feature_set = BioInterchange::Genomics::GFF3FeatureSet.new()
+    feature_set = create_feature_set
     gff3.each_line { |line|
-      next if line.start_with?('#')
+      next if line.start_with?('#') and not line.start_with?('##')
-      line.chomp!
-      seqid, source, type, start_coordinate, end_coordinate, score, strand, phase, attributes = line.split("\t")
-      # The type might be a SO/SOFA term, SO/SOFA accession, or other term (it stays a string then):
-      if type.match(/SO:\d+/) then
-        type = RDF::URI.new("http://purl.obolibrary.org/obo/#{type.sub(':', '_')}")
-      elsif BioInterchange::SOFA.methods.include?(type.gsub(' ', '_').to_sym)
-        type = BioInterchange::SOFA.send(type.gsub(' ', '_'))
-      end
+      # Ignore sequences for now.
+      break if line.start_with?('##FASTA')
-      # String to numeric value conversions:
-      start_coordinate = start_coordinate.to_i
-      stop_coordinate = stop_coordinate.to_i
-      if score == '.' then
-        score = nil
+      unless line.start_with?('##') then
+        add_feature(feature_set, line)
       else
-        score = score.to_f
+        add_pragma(feature_set, line)
       end
+    }
-      # Determine strandedness:
-      if strand == '?' then
-        strand = BioInterchange::Genomics::GFF3Feature::UNKNOWN
-      elsif strand == '+' then
-        strand = BioInterchange::Genomics::GFF3Feature::POSITIVE
-      elsif strand == '-' then
-        strand = BioInterchange::Genomics::GFF3Feature::NEGATIVE
-      else
-        strand = BioInterchange::Genomics::GFF3Feature::NOT_STRANDED
-      end
+    feature_set
+  end
-      # Set phase, if it lies in the permissable range of values:
-      if phase == '0' or phase == '1' or phase == '2' then
-        phase = phase.to_i
-      else
-        phase = nil
-      end
+  def add_feature(feature_set, line)
+    line.chomp!
+    seqid, source, type, start_coordinate, end_coordinate, score, strand, phase, attributes = line.split("\t")
-      temp = {}
-      attributes.split(';').map { |assignment| match = assignment.match(/([^=]+)=(.+)/) ; { match[1].strip => match[2].split(',').map { |value| value.strip } } }.map { |hash| hash.each_pair { |tag,list| temp[tag] = list } }
-      attributes = temp
+    # The type might be a SO/SOFA term, SO/SOFA accession, or other term (it stays a string then):
+    if type.match(/SO:\d+/) then
+      type = RDF::URI.new("http://purl.obolibrary.org/obo/#{type.sub(':', '_')}")
+    elsif BioInterchange::SOFA.methods.include?(type.gsub(' ', '_').to_sym)
+      type = BioInterchange::SOFA.send(type.gsub(' ', '_'))
+    end
-      feature_set.add(BioInterchange::Genomics::GFF3Feature.new(seqid, source, type, start_coordinate, end_coordinate, score, strand, phase, attributes))
-    }
+    # String to numeric value conversions:
+    start_coordinate = start_coordinate.to_i
+    end_coordinate = end_coordinate.to_i
+    if score == '.' then
+      score = nil
+    else
+      score = score.to_f
+    end
-    feature_set
+    # Determine strandedness:
+    if strand == '?' then
+      strand = BioInterchange::Genomics::GFF3Feature::UNKNOWN
+    elsif strand == '+' then
+      strand = BioInterchange::Genomics::GFF3Feature::POSITIVE
+    elsif strand == '-' then
+      strand = BioInterchange::Genomics::GFF3Feature::NEGATIVE
+    else
+      strand = BioInterchange::Genomics::GFF3Feature::NOT_STRANDED
+    end
+    # Set phase, if it lies in the permissable range of values:
+    if phase == '0' or phase == '1' or phase == '2' then
+      phase = phase.to_i
+    else
+      phase = nil
+    end
+    temp = {}
+    attributes.split(';').map { |assignment| match = assignment.match(/([^=]+)=(.+)/) ; { match[1].strip => match[2].split(',').map { |value| value.strip } } }.map { |hash| hash.each_pair { |tag,list| temp[tag] = list } }
+    attributes = temp
+    feature_set.add(BioInterchange::Genomics::GFF3Feature.new(seqid, source, type, start_coordinate, end_coordinate, score, strand, phase, attributes))
   end
+  def add_pragma(feature_set, line)
+    line.chomp!
+    name, value = line[2..-1].split(/\s/, 2)
+    value.strip!
+    # Interpret pragmas depending on their definition:
+    if name == 'gff-version' then
+      feature_set.set_pragma(name, { name => value.to_f })
+    elsif name == 'sequence-region' then
+      regions = feature_set.pragma(name)
+      regions = {} unless regions
+      seqid, start_coordinate, end_coordinate = value.split(/\s+/, 3)
+      regions[seqid] = BioInterchange::Genomics::GFF3NamedRegion.new(seqid, start_coordinate.to_i, end_coordinate.to_i)
+      feature_set.set_pragma(name, regions)
+    else
+      # Unhandled pragma. Just save the value in its string form.
+      feature_set.set_pragma(name, value)
+    end
+  end
 end
 end