biointerchange 1.0.1 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. checksums.yaml +4 -4
  2. data/.travis.yml +2 -4
  3. data/Gemfile +2 -3
  4. data/README.md +36 -22
  5. data/VERSION +1 -1
  6. data/examples/Felis_catus.gvf.gz +0 -0
  7. data/examples/Felis_catus_incl_consequences.vcf.gz +0 -0
  8. data/generators/rdfxml.rb +1 -1
  9. data/generators/tsv2rubyclass.rb +31 -0
  10. data/lib/biointerchange/core.rb +17 -5
  11. data/lib/biointerchange/genomics/gff3_rdf_ntriples.rb +591 -137
  12. data/lib/biointerchange/genomics/gff3_reader.rb +16 -3
  13. data/lib/biointerchange/genomics/gvf_reader.rb +1 -1
  14. data/lib/biointerchange/genomics/vcf_feature.rb +46 -0
  15. data/lib/biointerchange/genomics/vcf_feature_set.rb +14 -0
  16. data/lib/biointerchange/genomics/vcf_reader.rb +238 -0
  17. data/lib/biointerchange/gfvo.rb +689 -553
  18. data/lib/biointerchange/life_science_registry.rb +3595 -0
  19. data/lib/biointerchange/textmining/text_mining_rdf_ntriples.rb +33 -35
  20. data/lib/biointerchange/writer.rb +11 -16
  21. data/make.sh +4 -0
  22. data/spec/exceptions_spec.rb +1 -7
  23. data/spec/gff3_rdfwriter_spec.rb +2 -16
  24. data/spec/gvf_rdfwriter_spec.rb +2 -19
  25. data/spec/phylogenetics_spec.rb +1 -13
  26. data/spec/text_mining_pdfx_xml_reader_spec.rb +1 -13
  27. data/spec/text_mining_pubannos_json_reader_spec.rb +1 -14
  28. data/spec/text_mining_rdfwriter_spec.rb +8 -19
  29. data/test.sh +4 -0
  30. data/web/about.html +10 -14
  31. data/web/api.html +11 -13
  32. data/web/bootstrap/css/bootstrap-theme.css +347 -0
  33. data/web/bootstrap/css/bootstrap-theme.css.map +1 -0
  34. data/web/bootstrap/css/bootstrap-theme.min.css +7 -0
  35. data/web/bootstrap/css/bootstrap.css +4764 -4603
  36. data/web/bootstrap/css/bootstrap.css.map +1 -0
  37. data/web/bootstrap/css/bootstrap.min.css +6 -8
  38. data/web/bootstrap/fonts/glyphicons-halflings-regular.eot +0 -0
  39. data/web/bootstrap/fonts/glyphicons-halflings-regular.svg +229 -0
  40. data/web/bootstrap/fonts/glyphicons-halflings-regular.ttf +0 -0
  41. data/web/bootstrap/fonts/glyphicons-halflings-regular.woff +0 -0
  42. data/web/bootstrap/js/bootstrap.js +1372 -1448
  43. data/web/bootstrap/js/bootstrap.min.js +5 -5
  44. data/web/cli.html +14 -28
  45. data/web/index.html +15 -33
  46. data/web/ontologies.html +1089 -945
  47. data/web/webservices.html +12 -14
  48. metadata +24 -27
  49. data/lib/biointerchange/gff3o.rb +0 -525
  50. data/lib/biointerchange/gvf1o.rb +0 -1354
  51. data/web/bootstrap/css/bootstrap-responsive.css +0 -1040
  52. data/web/bootstrap/css/bootstrap-responsive.min.css +0 -9
  53. data/web/bootstrap/img/glyphicons-halflings-white.png +0 -0
  54. data/web/bootstrap/img/glyphicons-halflings.png +0 -0
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 24fa6ef275de7c6f218b9807eae4049d63ee6ac8
4
- data.tar.gz: f55cd8a7d9fc2448b7a674cad4ea37c549858cd3
3
+ metadata.gz: 0b66ff1fc16121ae1bd423d5f06bf212b00cdd5c
4
+ data.tar.gz: 93bdd0303b40580eb5f70f17db8aad93228fc11d
5
5
  SHA512:
6
- metadata.gz: 612461c1a22264afa5a743fed78116908fed72eda191aee63e6788e50a5d3c81273f5447ad909d46bfed1ee72d1761d8dabbe4e0c3cd9de05f565a8e9336380b
7
- data.tar.gz: d3aaa588d0f3550fd0dbe0d31045b30021a53a2d6212d72aa98168a483ae8dcc32d8bb4c5f1d934c73065642235f7170387d5b483a06c98c74494215bfa3f335
6
+ metadata.gz: 2b8adeeb8584d3d09729faeb6475dfd9be4adc3f568fdb8a1e4e654c605126384574344e6235f831609a9fa6a61f24ef87dbfff579153dab0f6e406887208584
7
+ data.tar.gz: f709f45089d6cc469884f9e182752fda0be6235762e65857fa1f8b2f0bca72e392618a8b7da142c0ed763c24526195d126a07a709c55a30ce152caf44b5c31b8
data/.travis.yml CHANGED
@@ -2,11 +2,9 @@ language: ruby
2
2
  rvm:
3
3
  - 1.9.2
4
4
  - 1.9.3
5
+ - 2.0.0
6
+ - 2.1.1
5
7
  - jruby-19mode # JRuby in 1.9 mode
6
- # - rbx-19mode
7
- - 1.8.7
8
- - jruby-18mode # JRuby in 1.8 mode
9
- # - rbx-18mode
10
8
 
11
9
  # uncomment this line if your project needs to run something other than `rake`:
12
10
  # script: bundle exec rspec spec
data/Gemfile CHANGED
@@ -2,10 +2,10 @@ source "http://rubygems.org"
2
2
  # Add dependencies required to use your gem here.
3
3
  # Example:
4
4
  # gem "activesupport", ">= 2.3.5"
5
- gem "rdf", ">= 0.3.4.1"
5
+ gem "rdf", ">= 1.1.4.3"
6
6
  gem "json", ">= 1.6.4"
7
7
  gem "getopt", ">= 1.4.1"
8
- gem "addressable", ">= 2.3.2"
8
+ gem "addressable", ">= 2.3.6"
9
9
  gem "bio", ">= 1.4.2"
10
10
 
11
11
  # Add dependencies to develop your gem here.
@@ -14,6 +14,5 @@ group :development do
14
14
  gem "rspec", "~> 2.8.0"
15
15
  gem "bundler", ">= 1.1.5"
16
16
  gem "jeweler", "~> 1.8.4"
17
- gem "bio", ">= 1.4.2"
18
17
  gem "rdoc", "~> 3.12"
19
18
  end
data/README.md CHANGED
@@ -21,14 +21,12 @@ Ontologies used in the RDF output:
21
21
 
22
22
  * [Comparative Data Analysis Ontology](http://sourceforge.net/apps/mediawiki/cdao/index.php?title=Main_Page) (CDAO)
23
23
  * [Friend of a Friend](http://xmlns.com/foaf/spec) (FOAF)
24
- * [Generic Feature Format Version 3 Ontology](http://www.biointerchange.org/ontologies.html) (GFF3O)
25
- * [Genome Variation Format Version 1 Ontology](http://www.biointerchange.org/ontologies.html) (GVF1O)
26
24
  * [Genomic Feature and Variation Ontology](http://www.biointerchange.org/ontologies.html) (GFVO)
27
25
  * [Semanticscience Integrated Ontology](http://code.google.com/p/semanticscience/wiki/SIO) (SIO)
28
26
  * [Sequence Ontology](http://www.sequenceontology.org/index.html) (SO)
29
27
  * [Sequence Ontology Feature Annotation](http://www.sequenceontology.org/index.html) (SOFA)
30
28
 
31
- *Note:* GFF3O and GVF1O will be replaced by GFVO with the next release of BioInterchange.
29
+ *Note:* GFVO replaces the [Generic Feature Format Version 3 Ontology](http://www.biointerchange.org/ontologies.html) (GFF3O) and [Genome Variation Format Version 1 Ontology](http://www.biointerchange.org/ontologies.html) (GVF1O).
32
30
 
33
31
  #### Contributing
34
32
 
@@ -54,7 +52,7 @@ BioInterchange's command-line tool `biointerchange` can be installed as a comman
54
52
 
55
53
  Examples:
56
54
 
57
- biointerchange --input biointerchange.gvf --rdf rdf.biointerchange.gvf --batchsize 100 --file examples/estd176_Banerjee_et_al_2011.2012-11-29.NCBI36.gvf
55
+ biointerchange --input biointerchange.gvf --rdf rdf.biointerchange.gfvo --file examples/estd176_Banerjee_et_al_2011.2012-11-29.NCBI36.gvf
58
56
  biointerchange --input dbcls.catanns.json --rdf rdf.bh12.sio --file examples/pubannotation.10096561.json --annotate_name 'Peter Smith' --annotate_name_id 'peter.smith@example.com'
59
57
  biointerchange --input uk.ac.man.pdfx --rdf rdf.bh12.sio --file examples/gb-2007-8-3-R40.xml --annotate_name 'Peter Smith' --annotate_name_id 'peter.smith@example.com'
60
58
  biointerchange --input phylotastic.newick --rdf rdf.phylotastic.newick --file examples/tree2.new --annotate_date '1 June 2006'
@@ -69,8 +67,7 @@ Input formats:
69
67
 
70
68
  Output formats:
71
69
 
72
- * `rdf.biointerchange.gff3`
73
- * `rdf.biointerchange.gvf`
70
+ * `rdf.biointerchange.gfvo`
74
71
  * `rdf.bh12.sio`
75
72
  * `rdf.phylotastic.newick`
76
73
 
@@ -93,15 +90,6 @@ To list all `seqid` entries from a GVF-file conversion in the store, the followi
93
90
 
94
91
  testrepo> sparql select * where { ?s <http://www.biointerchange.org/gvf1o#GVF1_0004> ?o } .
95
92
 
96
- #### Data Consistency Verification
97
-
98
- Data consistency is verifyable for the output formats `rdf.biointerchange.gff3` and `rdf.biointerchange.gvf` using the [BioInterchange ontologies](http://www.biointerchange.org/ontologies.html) GFF3O and GVF1O. The following is an example of how [Jena](http://jena.apache.org)'s command line tools and the [HermiT reasoner](http://hermit-reasoner.com) can be used for conistency verification:
99
-
100
- rdfcat <path-to-gff3o/gvf1o> <yourdata.n3> > merged.xml
101
- java -d64 -Xmx4G -jar HermiT.jar -k -v merged.xml
102
-
103
- Another approach is to load the data and its related GFF3O/GVF1O ontology into [Protege](http://protege.stanford.edu), merge them, and then use the "Explain inconsistent ontology" menu item to inspect possible data inconsistencies.
104
-
105
93
  #### Example Data Provenance
106
94
 
107
95
  The following list provides information on the origin of the example-data files in the `examples` directory:
@@ -110,8 +98,16 @@ The following list provides information on the origin of the example-data files
110
98
  * `BovineGenomeChrX.gff3.gz`: Gzipped GFF3 file describing a Bos taurus chromosome X. Downloaded from [http://bovinegenome.org/?q=download_chromosome_gff3](http://bovinegenome.org/?q=download_chromosome_gff3)
111
99
  * `chromosome_BF.gff`: GFF3 file of floating contigs from the Baylor Sequencing Centre. Downloaded from [http://dictybase.org/Downloads](http://dictybase.org/Downloads)
112
100
  * `estd176_Banerjee_et_al_2011.2012-11-29.NCBI36.gvf`: GVF file of EBI's [DGVa](http://www.ebi.ac.uk/dgva/database-genomic-variants-archive). Downloaded from [ftp://ftp.ebi.ac.uk/pub/databases/dgva/estd176_Banerjee_et_al_2011/gvf/estd176_Banerjee_et_al_2011.2012-11-29.NCBI36.gvf](ftp://ftp.ebi.ac.uk/pub/databases/dgva/estd176_Banerjee_et_al_2011/gvf/estd176_Banerjee_et_al_2011.2012-11-29.NCBI36.gvf)
101
+ * `Felis_catus.gvf.gz`: Gzipped GVF file of F. catus genomic variations provided by the Ensembl project. Downloaded from [ftp://ftp.ensembl.org/pub/release-71/variation/gvf/felis_catus/Felis_catus.gvf.gz](ftp://ftp.ensembl.org/pub/release-71/variation/gvf/felis_catus/Felis_catus.gvf.gz)
102
+ * `Felis_catus_incl_consequences.vcf.gz`: Gzipped VCF file of F. catus genomic variations provided by the Ensembl project. Downloaded from [ftp://ftp.ensembl.org/pub/current_variation/vcf/felis_catus/Felis_catus_incl_consequences.vcf.gz](ftp://ftp.ensembl.org/pub/current_variation/vcf/felis_catus/Felis_catus_incl_consequences.vcf.gz)
113
103
  * `gb-2007-8-3-R40.xml`: Generated by [PDFx](http://pdfx.cs.man.ac.uk) from open-access source PDF [Sense-antisense pairs in mammals: functional and evolutionary considerations](http://genomebiology.com/content/pdf/gb-2007-8-3-r40.pdf)
114
- * `Saccharomyces_cerevisiae_incl_consequences.gvf.gz`: Downloaded from [ftp://ftp.ensembl.org/pub/release-71/variation/gvf/saccharomyces_cerevisiae/Saccharomyces_cerevisiae_incl_consequences.gvf.gz](ftp://ftp.ensembl.org/pub/release-71/variation/gvf/saccharomyces_cerevisiae/Saccharomyces_cerevisiae_incl_consequences.gvf.gz)
104
+ * `Saccharomyces_cerevisiae_incl_consequences.gvf.gz`: Gzipped GVF file of S. cerevisiae genomic variations provided by the Ensembl project. Downloaded from [ftp://ftp.ensembl.org/pub/release-71/variation/gvf/saccharomyces_cerevisiae/Saccharomyces_cerevisiae_incl_consequences.gvf.gz](ftp://ftp.ensembl.org/pub/release-71/variation/gvf/saccharomyces_cerevisiae/Saccharomyces_cerevisiae_incl_consequences.gvf.gz)
105
+
106
+ #### Additional Example Data
107
+
108
+ The following lists data that has been used in testing BioInterchange's implementation, but was not included in the GitHub repository due to its size:
109
+
110
+ * `mgp.v3.indels.rsIDdbSNPv137.vcf.gz`: Gzipped VCF file of M. musculus indels by the Sanger Institute. Downloaded from [ftp://ftp-mouse.sanger.ac.uk/current_snps/mgp.v3.indels.rsIDdbSNPv137.vcf.gz](ftp://ftp-mouse.sanger.ac.uk/current_snps/mgp.v3.indels.rsIDdbSNPv137.vcf.gz)
115
111
 
116
112
  ### Application Programming Interface
117
113
 
@@ -322,9 +318,10 @@ The writer takes an object model and serializes it via the `BioInterchange::Writ
322
318
  # Serialize a model as RDF.
323
319
  #
324
320
  # +model+:: a generic representation of input data that is an instance of BioInterchange::Phylogenetics::TreeSet
325
- def serialize(model)
321
+ # +uri_prefix+:: optional URI prefix that should be used in the RDFization of individuals/class instances
322
+ def serialize(model, uri_prefix = nil)
326
323
  model.contents.each { |tree|
327
- serialize_model(model, tree)
324
+ serialize_model(model, tree, uri_prefix)
328
325
  }
329
326
  end
330
327
 
@@ -501,8 +498,7 @@ RDFization parameters and data are send as a single HTTP POST requests containin
501
498
  * `phylotastic.newick`: [Newick](http://evolution.genetics.washington.edu/phylip/newicktree.html)
502
499
  * `uk.ac.man.pdfx`: [PDFx](http://pdfx.cs.man.ac.uk) XML
503
500
  * `OUTPUT_METHOD`: determines the RDFization method that should be used, output will always be RDF N-Triples; available output formats are
504
- * `rdf.biointerchange.gff3`: RDFization of `biointerchange.gff3`
505
- * `rdf.biointerchange.gvf`: RDFization of `biointerchange.gvf`
501
+ * `rdf.biointerchange.gfvo`: RDFization of `biointerchange.gff3` or `biointerchange.gvf`
506
502
  * `rdf.bh12.sio`: RDFization of `dbcls.catanns.json` or `uk.ac.man.pdfx`
507
503
  * `rdf.phylotastic.newick`: RDFization of `phylotastic.newick`
508
504
  * `URL_ENCODED_DATA`: data for RDFization as [URL encoded](http://en.wikipedia.org/wiki/Percent-encoding) string
@@ -590,6 +586,13 @@ A Geno Ontology external reference (GOxref) vocabulary can be created by directl
590
586
  curl ftp://ftp.geneontology.org/pub/go/doc/GO.xrf_abbs | ruby generators/GOxrefify.rb
591
587
  echo -e "\nend" >> lib/biointerchange/goxref.rb
592
588
 
589
+ Building an external reference vocabulary based on Life Science Registry external database abbreviations (based on download of the
590
+ Life Science registry spreadsheet as TSV file):
591
+
592
+ echo -e "module BioInterchange\n" > lib/biointerchange/life_science_registry.rb
593
+ cut -f 1,25 <path-to-registry-tsv-file> | grep -E 'https?://.*\$id' | ruby generators/tsv2rubyclass.rb LifeScienceRegistry >> lib/biointerchange/life_science_registry.rb
594
+ echo -e "\nend" >> lib/biointerchange/life_science_registry.rb
595
+
593
596
  #### Python Vocabulary Classes
594
597
 
595
598
  The source-code generation can be skipped, if none of the ontologies that are used by BioInterchange have been changed. Otherwise, the existing Python vocabulary class wrappers can be generated as follows:
@@ -630,14 +633,25 @@ The following Java packages will automatically install alongside BioInterchange'
630
633
 
631
634
  ### Gem Bundling/Installing
632
635
 
636
+ Mac OS X prerequisites and `bundle install` difference:
637
+
638
+ sudo port install libxml2 libxslt
639
+ sudo ARCHFLAGS=-Wno-error=unused-command-line-argument-hard-error-in-future bundle install
640
+
641
+ Actual gem bundling:
642
+
633
643
  bundle exec rake gemspec
634
644
  bundle exec gem build biointerchange.gemspec
635
- sudo bundle exec gem install biointerchange
645
+ sudo bundle exec gem install biointerchange-`cat VERSION`.gem
636
646
 
637
647
  If you encounter problems with gem dependencies, then you can try to explicitly use Ruby 1.9:
638
648
 
639
649
  bundle exec gem1.9 build biointerchange.gemspec
640
- sudo bundle exec gem1.9 install biointerchange
650
+ sudo bundle exec gem1.9 install biointerchange-`cat VERSION`.gem
651
+
652
+ Alternative build script, `make.sh`, which installs the gem without RDocs and ri pages (quicker when testing):
653
+
654
+ ./make.sh
641
655
 
642
656
  ### Unit Testing
643
657
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 1.0.1
1
+ 1.0.2
Binary file
data/generators/rdfxml.rb CHANGED
@@ -19,7 +19,7 @@ SIO_SYN = RDF::URI.new('http://semanticscience.org/resource/synonym')
19
19
  # This label conversion also appears in:
20
20
  # +lib/biointerchange/core.rb+
21
21
  def make_safe_label(label)
22
- label.gsub(/[ '-.<>\/]/, '_').gsub(/\([^\)]*?\)/, '').sub(/^(\d+)/, "a_#{$1}").gsub(/^_+|_+$/, '').gsub(/_+/, '_')
22
+ label.gsub(/[ '-.<>\/]/, '_').gsub(/\([^\)]*?\)/, '').sub(/^(\d+)/){ "a_#{$1}" }.gsub(/^_+|_+$/, '').gsub(/_+/, '_').gsub(/_([A-Z])+/x){ "#{$1}" }
23
23
  end
24
24
 
25
25
  reader = RDF::RDFXML::Reader.open(ARGV[0])
@@ -0,0 +1,31 @@
1
+ #!/usr/bin/ruby
2
+
3
+ require 'biointerchange'
4
+
5
+ if ARGV.length != 1 then
6
+ puts 'Usage: tsv2rubyclass rubyclassname'
7
+ puts ''
8
+ puts 'Reads a TSV file from STDIN, where the first column values become'
9
+ puts 'method names (sanitized for spaces, etc.) in the class and the'
10
+ puts 'second column values are returned as a string.'
11
+ puts ''
12
+ puts 'The generated Ruby class is output on STDOUT.'
13
+ exit 1
14
+ end
15
+
16
+ classname = ARGV[0]
17
+
18
+ puts "class #{classname}"
19
+ puts ''
20
+
21
+ STDIN.each { |line|
22
+ key, value = line.chomp.split("\t")
23
+
24
+ puts " def self.#{BioInterchange.make_safe_label(key)}"
25
+ puts " \"#{value}\""
26
+ puts ' end'
27
+ puts ''
28
+ }
29
+
30
+ puts 'end'
31
+
@@ -34,13 +34,13 @@ module BioInterchange
34
34
  # Custom Exceptions and Errors
35
35
  require 'biointerchange/exceptions'
36
36
 
37
- # Ontologies (besides the ones from the 'rdf' gem)
37
+ # Ontologies (besides the ones from the 'rdf' gem), vocabularies and
38
+ # other mappings (e.g., database abbreviations to URIs):
39
+ require 'biointerchange/life_science_registry'
38
40
  require 'biointerchange/cdao'
39
41
  require 'biointerchange/faldo'
40
- require 'biointerchange/gff3o'
41
42
  require 'biointerchange/gfvo'
42
43
  require 'biointerchange/goxref'
43
- require 'biointerchange/gvf1o'
44
44
  require 'biointerchange/sio'
45
45
  require 'biointerchange/so'
46
46
  require 'biointerchange/sofa'
@@ -105,6 +105,18 @@ module BioInterchange
105
105
  # Writer
106
106
  # ...same GFF3 writer
107
107
 
108
+ ### VCF ###
109
+
110
+ # Reader
111
+ require 'biointerchange/genomics/vcf_reader'
112
+
113
+ # Feature base model
114
+ require 'biointerchange/genomics/vcf_feature_set'
115
+ require 'biointerchange/genomics/vcf_feature'
116
+
117
+ # Writer
118
+ # ...same GFF3 writer
119
+
108
120
  #
109
121
  # PHYLOGENETICS
110
122
  #
@@ -225,7 +237,7 @@ module BioInterchange
225
237
  'input' => opt['input'],
226
238
  'output' => opt['output']
227
239
  }
228
- map['batchsize'] = opt['batchsize'].to_i if opt['batchsize']
240
+ map['batch_size'] = opt['batchsize'].to_i if opt['batchsize']
229
241
  opt.each_key { |key|
230
242
  map[key.sub(/^annotate_/, '')] = opt[key] if key.start_with?('annotate_')
231
243
  }
@@ -296,7 +308,7 @@ module BioInterchange
296
308
  #
297
309
  # +label+:: string that should be converted into a "safe" string that can be used as a Ruby method name
298
310
  def self.make_safe_label(label)
299
- label.gsub(/[ '-.<>\/]/, '_').gsub(/\([^\)]*?\)/, '').sub(/^(\d+)/, "a_#{$1}").gsub(/^_+|_+$/, '').gsub(/_+/, '_')
311
+ label.gsub(/[ '-.<>\/]/, '_').gsub(/\([^\)]*?\)/, '').sub(/^(\d+)/){ "a_#{$1}" }.gsub(/^_+|_+$/, '').gsub(/_+/, '_').gsub(/_([A-Z])+/x){ "#{$1}" }
300
312
  end
301
313
 
302
314
  private
@@ -4,11 +4,12 @@ require 'date'
4
4
 
5
5
  module BioInterchange::Genomics
6
6
 
7
- # Serializes GFF3 and GVF models.
7
+ # Serializes GFF3, GVF and VCF models.
8
8
  #
9
9
  # Inputs:
10
10
  # - biointerchange.gff3
11
11
  # - biointerchange.gvf
12
+ # - biointerchange.vcf
12
13
  #
13
14
  # Outputs:
14
15
  # - rdf.biointerchange.gfvo
@@ -18,14 +19,7 @@ class RDFWriter < BioInterchange::Writer
18
19
  BioInterchange::Registry.register_writer(
19
20
  'rdf.biointerchange.gfvo',
20
21
  BioInterchange::Genomics::RDFWriter,
21
- [ 'biointerchange.gff3' ],
22
- true,
23
- 'Genomic Feature and Variation Ontology (GFVO) based RDFization'
24
- )
25
- BioInterchange::Registry.register_writer(
26
- 'rdf.biointerchange.gfvo',
27
- BioInterchange::Genomics::RDFWriter,
28
- [ 'biointerchange.gvf' ],
22
+ [ 'biointerchange.gff3', 'biointerchange.gvf', 'biointerchange.vcf' ],
29
23
  true,
30
24
  'Genomic Feature and Variation Ontology (GFVO) based RDFization'
31
25
  )
@@ -34,8 +28,7 @@ class RDFWriter < BioInterchange::Writer
34
28
  #
35
29
  # +ostream+:: instance of an IO class or derivative that is used for RDF serialization
36
30
  def initialize(ostream)
37
- raise BioInterchange::Exceptions::ImplementationWriterError, 'The output stream is not an instance of IO or its subclasses.' unless ostream.kind_of?(IO)
38
- @ostream = ostream
31
+ super(ostream)
39
32
  end
40
33
 
41
34
  # Serialize a model as RDF.
@@ -47,10 +40,12 @@ class RDFWriter < BioInterchange::Writer
47
40
  @format = :gff3
48
41
  elsif model.instance_of?(BioInterchange::Genomics::GVFFeatureSet) then
49
42
  @format = :gvf
43
+ elsif model.instance_of?(BioInterchange::Genomics::VCFFeatureSet) then
44
+ @format = :vcf
50
45
  else
51
46
  raise BioInterchange::Exceptions::ImplementationWriterError, 'The provided model cannot be serialized. ' +
52
- 'This writer supports serialization for BioInterchange::Genomics::GFF3FeatureSet and '
53
- 'BioInterchange::Genomics::GVFFeatureSet.'
47
+ 'This writer supports serialization for BioInterchange::Genomics::GFF3FeatureSet, ' +
48
+ 'BioInterchange::Genomics::GVFFeatureSet and BioInterchange::Genomics::VCFFeatureSet.'
54
49
  end
55
50
  @base = BioInterchange::GFVO
56
51
  serialize_model(model, uri_prefix)
@@ -71,15 +66,24 @@ protected
71
66
  # Record written variants in order to avoid writing out RDF.type multiple times.
72
67
  @variants = {}
73
68
 
69
+ # Set up "matchers" that can be used to match a number of attributes of a feature, and then,
70
+ # link out to an entity that says something about that combination of attributes. Used for
71
+ # GVF #data-source, etc., pragmas and VCF filters.
72
+ @matchers = []
73
+
74
74
  # Create a URI prefix that applies to the set, all features in the set, and all the features' annotations.
75
75
  # Then register the prefix with the writer to have a concise Turtle output.
76
76
  set_uri = set_uri[0..-2] if set_uri and set_uri.end_with?('/')
77
77
  set_uri = RDF::URI.new(model.uri) unless set_uri
78
78
  set_base(set_uri + '/')
79
79
 
80
- create_triple(set_uri, RDF.type, @base.Set)
80
+ add_prefix('http://biohackathon.org/resource/faldo#', 'faldo')
81
+ add_prefix('http://www.biointerchange.org/gfvo#', 'gfvo')
82
+ add_prefix('http://semanticscience.org/resource/', 'sio')
83
+
84
+ create_triple(set_uri, RDF.type, @base.File)
81
85
  model.pragmas.each { |pragma_name|
82
- serialize_pragma(set_uri, model.pragma(pragma_name))
86
+ serialize_pragma(set_uri, pragma_name, model.pragma(pragma_name))
83
87
  }
84
88
  model.contents.each { |feature|
85
89
  if feature.instance_of?(BioInterchange::Genomics::GFF3FeatureSequence) then
@@ -89,64 +93,162 @@ protected
89
93
  end
90
94
  }
91
95
  close
92
- #RDF::NTriples::Writer.dump(graph, @ostream)
93
- # TODO Figure out why the following is very slow. Use with 'rdf-raptor'.
94
- # Having said that, Jena's rdfcat is very good for converting formats
95
- # anyway, so perhaps it is not worth investigating the following.
96
- # RDF::RDFXML::Writer.dump(graph, @ostream)
97
96
  end
98
97
 
99
98
  # Serializes pragmas for a given feature set URI.
100
99
  #
101
100
  # +set_uri+:: the feature set URI to which the pragmas belong
101
+ # +name+:: name of the pragma statement
102
102
  # +pragma+:: an object representing a pragma statement
103
- def serialize_pragma(set_uri, pragma)
103
+ def serialize_pragma(set_uri, name, pragma)
104
104
  if pragma.kind_of?(Hash) then
105
105
  if (pragma.has_key?('attribute-method') or pragma.has_key?('data-source') or pragma.has_key?('score-method') or pragma.has_key?('source-method') or pragma.has_key?('technology-platform')) then
106
106
  serialize_structured_attribute(set_uri, pragma)
107
107
  elsif pragma.has_key?('gff-version') then
108
- create_triple(set_uri, @base.gff_version, pragma['gff-version'], RDF::XSD.float)
108
+ create_triple(set_uri.to_s, @base.has_identifier, RDF::URI.new("#{set_uri}/version"))
109
+ create_triple("#{set_uri}/version", RDF.type, @base.Version)
110
+ create_triple("#{set_uri}/version", @base.has_value, "gff-version #{pragma['gff-version']}")
109
111
  elsif pragma.has_key?('gvf-version') then
110
- create_triple(set_uri, @base.gvf_version, pragma['gvf-version'], RDF::XSD.float)
112
+ create_triple("#{set_uri}/version", RDF.type, @base.Version)
113
+ create_triple("#{set_uri}/version", @base.has_value, "gvf-version #{pragma['gvf-version']}")
114
+ elsif pragma.has_key?('fileformat') then
115
+ create_triple("#{set_uri}/version", RDF.type, @base.Version)
116
+ create_triple("#{set_uri}/version", @base.has_value, "fileformat #{pragma['fileformat']}")
111
117
  elsif pragma.has_key?('sequence-region') then
112
118
  pragma['sequence-region'].keys.each { |seqid|
113
119
  serialize_landmark(set_uri, pragma['sequence-region'][seqid])
114
120
  }
115
121
  elsif pragma.has_key?('species') then
116
- create_triple(set_uri, @base.species, RDF::URI.new(pragma['species']))
122
+ create_triple(set_uri, @base.is_about, RDF::URI.new(pragma['species']))
123
+ # VCF section:
124
+ # ...TODO
125
+ # Everything else:
126
+ else
127
+ end
128
+ elsif pragma.kind_of?(Array) then
129
+ # VCF section:
130
+ basic_vcf_mappings = {
131
+ 'ID' => @base.Identifier,
132
+ 'Description' => @base.Comment,
133
+ 'Number' => @base.InformationContentEntity, # Note: not just an integer; can be also 'A', 'G', and '.'
134
+ 'Type' => @base.InformationContentEntity # Can be 'Integer', 'Float', 'Character', 'String'
135
+ }
136
+ if name == 'FILTER' then
137
+ pragma.each { |assignment|
138
+ pragma_uri = serialize_vcf_pragma(set_uri, "filter/#{assignment['ID']}", @base.VariantCalling, basic_vcf_mappings, assignment)
139
+ create_triple(set_uri, @base.is_participant_in, pragma_uri)
140
+ }
141
+ elsif name == 'FORMAT' then
142
+ pragma.each { |assignment|
143
+ pragma_uri = serialize_vcf_pragma(set_uri, "format/#{assignment['ID']}", @base.InformationContentEntity, basic_vcf_mappings, assignment)
144
+ create_triple(set_uri, @base.references, pragma_uri)
145
+ }
146
+ elsif name == 'INFO' then
147
+ pragma.each { |assignment|
148
+ pragma_uri = serialize_vcf_pragma(set_uri, "info/#{assignment['ID']}", @base.InformationContentEntity, basic_vcf_mappings, assignment)
149
+ create_triple(set_uri, @base.references, pragma_uri)
150
+ }
151
+ else
152
+ # TODO
117
153
  end
118
154
  else
119
155
  # TODO
120
156
  end
121
157
  end
122
158
 
159
# Links a feature to every "matcher" whose constraints the feature satisfies.
#
# Each entry of @matchers is a pair of +constraints+ and +linkout+. The
# constraints map a key to a value that responds to +include?+ (presumably
# a list of accepted values -- confirm where @matchers is populated):
#
# - 'Seqid', 'Source' and 'Type' are checked against the feature's
#   sequence ID, source and type, respectively
# - any other key is looked up in the feature's attributes; the attribute
#   has to be present and at least one of its values has to be accepted
#
# Only if all constraints hold (trivially so when there are none), a
# has_source triple from the feature URI to the linkout URI is written.
#
# +feature+:: the feature that provides the values and attributes for matching
# +feature_uri+:: URI of the feature; subject of the triple that is written on a match
def match_feature(feature, feature_uri)
  @matchers.each { |match_constraints|
    constraints, linkout = match_constraints

    # No constraints means that *everything* matches.
    matches = constraints.all? { |key, accepted|
      case key
      when 'Seqid'
        accepted.include?(feature.sequence_id)
      when 'Source'
        accepted.include?(feature.source)
      when 'Type'
        accepted.include?(feature.type)
      else
        feature.attributes.has_key?(key) and
          feature.attributes[key].any? { |attribute_value| accepted.include?(attribute_value) }
      end
    }

    # Write the linkout only on a match; it used to be written unconditionally,
    # which linked every feature to every matcher.
    create_triple(feature_uri, @base.has_source, RDF::URI.new(linkout)) if matches
  }
end
197
+
123
198
  # Serializes a +GFF3Feature+ object for a given feature set URI.
124
199
  #
125
200
  # +set_uri+:: the feature set URI to which the feature belongs
126
201
  # +feature+:: a +GFF3Feature+ instance
127
202
  def serialize_feature(set_uri, feature)
128
203
  # TODO Make sure there is only one value in the 'ID' list.
129
- feature_uri = RDF::URI.new("#{set_uri.to_s}/feature/#{feature.sequence_id},#{feature.source},#{feature.type.to_s.sub(/^[^:]+:\/\//, '')},#{feature.start_coordinate},#{feature.end_coordinate},#{feature.strand},#{feature.phase}") unless feature.attributes.has_key?('ID')
130
- feature_uri = RDF::URI.new("#{set_uri.to_s}/feature/#{feature.attributes['ID'][0]}") if feature.attributes.has_key?('ID')
131
- create_triple(set_uri, @base.contains, feature_uri)
204
+ # TODO Ponder about whether it would be possible to get the same URI for two distinct features (bad thing).
205
+ source = ''
206
+ source = "#{feature.source}," if feature.source
207
+ type = ''
208
+ type = "#{feature.type.to_s.sub(/^[^:]+:\/\//, '')}," if feature.type
209
+ phase = ",#{feature.phase}" if feature.phase
210
+ if feature.attributes.has_key?('ID') or feature.attributes.has_key?(' id') then
211
+ feature_id = 'ID'
212
+ feature_id = ' id' if feature.attributes.has_key?(' id')
213
+ feature_uri = RDF::URI.new("#{set_uri.to_s}/feature/#{feature.attributes[feature_id][0]}")
214
+ else
215
+ feature_uri = RDF::URI.new("#{set_uri.to_s}/feature/#{feature.sequence_id},#{source}#{type}#{feature.start_coordinate},#{feature.end_coordinate},#{feature.strand}#{phase}")
216
+ end
217
+
218
+ create_triple(set_uri, @base.has_member, feature_uri)
132
219
  create_triple(feature_uri, RDF.type, @base.Feature)
220
+ create_triple(feature_uri, RDF.type, feature.type) if feature.type
221
+ match_feature(feature, feature_uri)
133
222
  serialize_landmark(set_uri, GFF3Landmark.new(feature.sequence_id)) unless @landmarks.has_key?(feature.sequence_id)
134
- create_triple(feature_uri, @base.seqid, @landmarks[feature.sequence_id])
135
- create_triple(feature_uri, @base.source, feature.source)
136
- create_triple(feature_uri, @base.type, feature.type)
137
- create_triple(feature_uri, @base.phase, feature.phase) if feature.phase
223
+ create_triple(feature_uri, @base.is_located_on, RDF::URI.new(@landmarks[feature.sequence_id]))
224
+ create_triple(feature_uri, @base.is_created_by, RDF::URI.new("#{feature_uri}/source"))
225
+ create_triple("#{feature_uri}/source", RDF.type, @base.ExperimentalMethod)
226
+ create_triple("#{feature_uri}/source", @base.has_value, feature.source) if feature.source
227
+ if feature.phase then
228
+ create_triple(feature_uri, @base.has_quality, RDF::URI.new("#{feature_uri}/phase"))
229
+ create_triple("#{feature_uri}/phase", RDF.type, @base.CodingFrameOffset)
230
+ create_triple("#{feature_uri}/phase", @base.has_value, feature.phase)
231
+ end
138
232
 
139
- serialize_coordinate(set_uri, feature_uri, feature)
233
+ create_triple(feature_uri, @base.has_part, RDF::URI.new("#{feature_uri}/locus"))
234
+ create_triple("#{feature_uri}/locus", RDF.type, @base.Locus)
235
+ create_triple("#{feature_uri}/locus", @base.has_attribute, RDF::URI.new("#{feature_uri}/locus/region"))
236
+ serialize_coordinate(set_uri, "#{feature_uri}/locus", feature)
140
237
  serialize_attributes(set_uri, feature_uri, feature.attributes) unless feature.attributes.keys.empty?
141
238
  end
142
239
 
240
+ # Serialize a feature's coordinates using FALDO.
241
+ #
242
+ # +set_uri+:: URI of the feature set that the feature belongs to
243
+ # +feature_uri+:: URI prefix of the feature
244
+ # +feature+:: object representation of the feature, which contains the locus that is described by this method
143
245
  def serialize_coordinate(set_uri, feature_uri, feature)
144
246
  region_uri = RDF::URI.new("#{feature_uri.to_s}/region")
145
247
  start_position_uri = RDF::URI.new("#{feature_uri.to_s}/region/start")
146
248
  end_position_uri = RDF::URI.new("#{feature_uri.to_s}/region/end")
147
249
  #feature_object_properties = @base.feature_properties.select { |uri| @base.is_object_property?(uri) }[0]
148
- ##graph.insert(RDF::Statement.new(feature_uri, @base.with_parent([ @base.region ].flatten, feature_object_properties), region_uri))
149
- create_triple(feature_uri, @base.locus, region_uri)
250
+ ##graph.insert(BioInterchange::Statement.new(feature_uri, @base.with_parent([ @base.region ].flatten, feature_object_properties), region_uri))
251
+ create_triple(feature_uri, @base.is_located_on, region_uri)
150
252
  create_triple(region_uri, RDF.type, BioInterchange::FALDO.Region)
151
253
  # BIN STUFF
152
254
  if false then
@@ -173,7 +275,23 @@ protected
173
275
  end
174
276
  create_triple(start_position_uri, BioInterchange::FALDO.position, feature.start_coordinate)
175
277
  create_triple(end_position_uri, BioInterchange::FALDO.position, feature.end_coordinate)
176
- create_triple(feature_uri, @base.score, feature.score) if feature.score
278
+ if feature.score then
279
+ create_triple(feature_uri, @base.has_attribute, RDF::URI.new("#{feature_uri}/score"))
280
+ if @format == :gvf or @format == :vcf then
281
+ create_triple("#{feature_uri}/score", RDF.type, @base.PhredScore)
282
+ else
283
+ create_triple("#{feature_uri}/score", RDF.type, @base.Score)
284
+ end
285
+ create_triple("#{feature_uri}/score", @base.has_value, feature.score)
286
+ end
287
+ end
288
+
289
+ # Constructs a landmark URI based on set URI and the landmark's ID.
290
+ #
291
+ # +set_uri+:: the set URI to which the landmark belongs
292
+ # +id+:: ID of the landmark
293
+ def landmark_uri(set_uri, id)
294
+ "#{set_uri.to_s}/landmark/#{id}"
177
295
  end
178
296
 
179
297
  # Serializes a genomic feature landmark ("seqid").
@@ -182,12 +300,13 @@ protected
182
300
  # +landmark+:: encapsulated landmark data
183
301
  def serialize_landmark(set_uri, landmark)
184
302
  return if @landmarks.has_key?(landmark.seqid)
185
- landmark_uri = RDF::URI.new("#{set_uri.to_s}/landmark/#{landmark.seqid}")
303
+ landmark_uri = landmark_uri(set_uri, landmark.seqid)
186
304
  region_uri = RDF::URI.new("#{landmark_uri.to_s}/region")
187
305
  @landmarks[landmark.seqid] = landmark_uri
188
306
  create_triple(landmark_uri, RDF.type, @base.Landmark)
189
- create_triple(landmark_uri, @base.id, landmark.seqid)
190
- create_triple(landmark_uri, @base.locus, region_uri)
307
+ create_triple(landmark_uri, @base.has_identifier, RDF::URI.new("#{landmark_uri}/id"))
308
+ create_triple("#{landmark_uri}/id", @base.has_value, landmark.seqid)
309
+ create_triple(landmark_uri, @base.has_attribute, region_uri)
191
310
  if landmark.start_coordinate then
192
311
  start_position_uri = RDF::URI.new("#{landmark_uri.to_s}/region/start")
193
312
  create_triple(region_uri, BioInterchange::FALDO.begin, start_position_uri)
@@ -209,91 +328,92 @@ protected
209
328
  attributes.each_pair { |tag, list|
210
329
  # Check for defined tags (in alphabetical order), if not matched, serialize as generic Attribute:
211
330
  if tag == 'Alias' then
212
- list.each { |value|
213
- create_triple(feature_uri, @base.alias, value)
331
+ list.each_index { |index|
332
+ create_triple(feature_uri, @base.has_attribute, RDF::URI.new("#{feature_uri}/alias/#{index}"))
333
+ create_triple("#{feature_uri}/alias/#{index}", RDF.type, @base.Alias)
334
+ create_triple("#{feature_uri}/alias/#{index}", @base.has_value, list[index])
214
335
  }
215
336
  elsif tag == 'Dbxref' then
216
337
  list.each { |value|
217
338
  begin
218
- linkout = nil
219
- # First: determine the Dbxref linkout URI as string
220
- if value.match(/^dbSNP(_\d+)?:rs\d+$/) then
221
- linkout = "http://www.ncbi.nlm.nih.gov/projects/SNP/snp_ref.cgi?rs=#{value.split(/:/)[1].sub(/^rs/, '')}"
222
- elsif value.match(/^COSMIC(_\d+)?:COSM\d+$/) then
223
- linkout = "http://cancer.sanger.ac.uk/cosmic/mutation/overview?id=#{value.split(/:/)[1].sub(/^COSM/, '')}"
224
- else
225
- abbreviation, id = value.split(':', 2)
226
- linkout = BioInterchange::GOXRef.send(BioInterchange.make_safe_label(abbreviation)).to_s + id
227
- end
228
- # Second, and finally: add a triple to the graph in the right representative format depending on the ontology used
229
- create_triple(feature_uri, @base.dbxref, linkout)
339
+ # Try to link the external database reference to a well-established URI:
340
+ serialize_dbxref(feature_uri, value)
230
341
  rescue NoMethodError
231
- # Preserve the Dbxref as a Literal:
342
+ # Not clear where to link to? Preserve the Dbxref as a Literal:
232
343
  @dbxref = 0 if @dbxref == nil
233
344
  literal_uri = RDF::URI.new("#{feature_uri.to_s}/attribute/dbxref/#{@dbxref}")
234
345
  @dbxref += 1
235
- create_triple(feature_uri, @base.dbxref, literal_uri)
236
- create_triple(literal_uri, RDF.type, RDF::RDFS.Literal)
237
- create_triple(literal_uri, RDF.value, value)
346
+ create_triple(feature_uri, @base.references, literal_uri)
347
+ create_triple(literal_uri, RDF.type, @base.ExternalReference)
348
+ create_triple(literal_uri, @base.has_value, value)
238
349
  end
239
350
  }
240
351
  elsif tag == 'Derives_from' then
241
352
  list.each { |value|
242
- create_triple(feature_uri, @base.derivesFrom, RDF::URI.new("#{set_uri.to_s}/feature/#{value}"))
353
+ create_triple(feature_uri, @base.is_temporarily_part_of, RDF::URI.new("#{set_uri.to_s}/feature/#{value}"))
243
354
  }
244
355
  elsif tag == 'Gap' then
245
356
  # Handled by 'Target', because 'Gap' requires 'Target' to be present.
246
357
  elsif tag == 'ID' then
247
358
  list.each { |value|
248
- create_triple(feature_uri, @base.id, value)
359
+ create_triple(feature_uri, @base.has_identifier, RDF::URI.new("#{feature_uri}/id"))
360
+ create_triple("#{feature_uri}/id", RDF.type, @base.Identifier)
361
+ create_triple("#{feature_uri}/id", @base.has_value, value)
249
362
  }
250
363
  elsif tag == 'Is_circular' then
251
364
  value = list.join(',')
252
365
  if value == 'true' then
253
- create_triple(feature_uri, @base.isCircular, true) if value == 'true'
366
+ create_triple(feature_uri, @base.has_quality, @base.CircularHelix) if value == 'true'
254
367
  elsif value == 'false' then
255
- create_triple(feature_uri, @base.isCircular, false) if value == 'false'
368
+ create_triple(feature_uri, @base.is_circular, @base.WatsonCrickHelix) if value == 'false'
256
369
  else
257
- create_triple(feature_uri, RDF::RDFS.comment, "Is_circular non-truth value: #{value}")
370
+ create_triple(feature_uri, BioInterchange::RDFS.comment, "Is_circular non-truth value: #{value}")
258
371
  end
259
372
  elsif tag == 'Name' then
260
373
  list.each { |value|
261
- create_triple(feature_uri, @base.name, value)
374
+ create_triple(feature_uri, @base.has_attribute, RDF::URI.new("#{feature_uri}/name"))
375
+ create_triple("#{feature_uri}/name", RDF.type, @base.Name)
376
+ create_triple("#{feature_uri}/name", @base.has_value, value)
262
377
  }
263
378
  elsif tag == 'Note' then
264
- list.each { |value|
265
- create_triple(feature_uri, RDF::RDFS.comment, value)
379
+ list.each_index { |index|
380
+ create_triple(feature_uri, @base.has_annotation, RDF::URI.new("#{feature_uri}/note/#{index}"))
381
+ create_triple("#{feature_uri}/note/#{index}", RDF.type, @base.Note)
382
+ create_triple("#{feature_uri}/note/#{index}", @base.has_value, list[index])
266
383
  }
267
384
  elsif tag == 'Ontology_term' then
268
385
  list.each { |value|
269
386
  # TODO Sanitize values that are either not in GO xrf_abbs or need conversion to match
270
387
  # match their associated Ruby method.
271
388
  namespace, accession = value.split(/:/, 2)
272
- create_triple(feature_uri, @base.ontology_term, "#{BioInterchange::GOXRef.send(namespace).to_s}#{accession}")
389
+ create_triple(feature_uri, @base.references, "#{BioInterchange::GOXRef.send(namespace).to_s}#{accession}")
273
390
  }
274
391
  elsif tag == 'Parent' then
275
392
  list.each { |parent_id|
276
- create_triple(feature_uri, @base.parent, RDF::URI.new("#{set_uri.to_s}/feature/#{parent_id}"))
393
+ create_triple(feature_uri, @base.has_source, RDF::URI.new("#{set_uri.to_s}/feature/#{parent_id}"))
277
394
  }
278
395
  elsif tag == 'Reference_seq' then
279
396
  list.each { |value|
280
397
  reference_uri = RDF::URI.new("#{feature_uri.to_s}/reference/#{value}")
281
- create_triple(feature_uri, @base.sequence_annotation, reference_uri)
282
- create_triple(reference_uri, RDF.type, @base.Reference)
283
- create_triple(reference_uri, @base.sequence, value)
398
+ create_triple(feature_uri, @base.has_attribute, reference_uri)
399
+ create_triple(reference_uri, RDF.type, @base.ReferenceSequence)
400
+ create_triple(reference_uri, @base.has_value, value)
284
401
  }
285
402
  elsif tag == 'Target' then
403
+ # GFF3 spec is unclear on this point, but I assume that a target ID
404
+ # is referencing a feature ID within the same file.
286
405
  target_id, start_coordinate, end_coordinate, strand = list.join(',').split(/\s+/, 4)
287
406
  target_uri = RDF::URI.new("#{feature_uri.to_s}/target/#{target_id}")
288
- create_triple(feature_uri, @base.target, target_uri)
289
- create_triple(target_uri, RDF.type, @base.Target)
290
- create_triple(target_uri, @base.id, target_id)
407
+ create_triple(target_uri, RDF.type, @base.SequenceAlignment)
408
+ create_triple(target_uri, @base.has_source, feature_uri)
409
+ create_triple(target_uri, @base.has_input, target_id)
291
410
  region_uri = RDF::URI.new("#{target_uri.to_s}/region")
292
411
  start_position_uri = RDF::URI.new("#{region_uri.to_s}/start")
293
412
  end_position_uri = RDF::URI.new("#{region_uri.to_s}/end")
294
- create_triple(target_uri, @base.locus, region_uri)
295
- create_triple(region_uri, @base.locus, start_position_uri)
296
- create_triple(region_uri, @base.locus, end_position_uri)
413
+ create_triple(target_uri, @base.has_attribute, region_uri)
414
+ create_triple(region_uri, RDF.type, BioInterchange::FALDO.Region)
415
+ create_triple(region_uri, BioInterchange::FALDO.begin, start_position_uri)
416
+ create_triple(region_uri, BioInterchange::FALDO.end, end_position_uri)
297
417
  if strand == '+' then
298
418
  create_triple(start_position_uri, RDF.type, BioInterchange::FALDO.Positive_strand)
299
419
  create_triple(end_position_uri, RDF.type, BioInterchange::FALDO.Positive_strand)
@@ -312,13 +432,20 @@ protected
312
432
  create_triple(end_position_uri, BioInterchange::FALDO.position, end_coordinate)
313
433
  end
314
434
 
315
- # Describe a possible alignment between the feature and target:
435
+ # Describe a possible alignment with gaps between the feature and target:
316
436
  if attributes.has_key?('Gap') then
317
437
  attributes['Gap'].each_index { |gap_no|
318
438
  cigar_line = attributes['Gap'][gap_no].split(/\s+/)
319
439
  cigar_line.each_index { |alignment_no|
320
440
  alignment_uri = RDF::URI.new("#{feature_uri.to_s}/alignment/#{gap_no}/#{alignment_no}")
321
- create_triple(feature_uri, @base.alignment, alignment_uri) if alignment_no == 0
441
+ if alignment_no == 0 then
442
+ create_triple(target_uri, @base.has_first_part, alignment_uri)
443
+ else
444
+ create_triple(target_uri, @base.has_ordered_part, alignment_uri)
445
+ end
446
+ if alignment_no == cigar_line.length then
447
+ create_triple(target_uri, @base.has_last_part, alignment_uri)
448
+ end
322
449
  operation = cigar_line[alignment_no].gsub(/[^MIDFR]/, '')
323
450
  operation = nil unless operation.length == 1
324
451
  span = cigar_line[alignment_no].gsub(/[^0-9]/, '')
@@ -326,24 +453,25 @@ protected
326
453
  if operation == 'M' then
327
454
  create_triple(alignment_uri, RDF.type, @base.Match)
328
455
  elsif operation == 'I' then
329
- create_triple(alignment_uri, RDF.type, @base.Reference_Sequence_Gap)
456
+ create_triple(alignment_uri, RDF.type, @base.ReferenceSequenceGap)
330
457
  elsif operation == 'D' then
331
- create_triple(alignment_uri, RDF.type, @base.Target_Sequence_Gap)
458
+ create_triple(alignment_uri, RDF.type, @base.TargetSequenceGap)
332
459
  elsif operation == 'F' then
333
- create_triple(alignment_uri, RDF.type, @base.Forward_Reference_Sequence_Frameshift)
460
+ create_triple(alignment_uri, RDF.type, @base.ForwardReferenceSequenceFrameshift)
334
461
  elsif operation == 'R' then
335
- create_triple(alignment_uri, RDF.type, @base.Reverse_Reference_Sequence_Frameshift)
462
+ create_triple(alignment_uri, RDF.type, @base.ReverseReferenceSequenceFrameshift)
336
463
  else
337
464
  # Fallback: operation is outside of the specification
338
- create_triple(alignment_uri, RDF.type, @base.Alignment_Operation)
339
- create_triple(alignment_uri, RDF::RDFS.comment, "Alignment operation: #{operation}") if operation and not operation.empty?
465
+ create_triple(alignment_uri, RDF.type, @base.SequenceAlignmentOperation)
466
+ create_triple(alignment_uri, BioInterchange::RDFS.comment, "Alignment operation: #{operation}") if operation and not operation.empty?
340
467
  end
341
- create_triple(alignment_uri, @base.span, span.to_i) if span
342
- create_triple(alignment_uri, RDF.first, alignment_uri)
343
468
  if alignment_no + 1 < cigar_line.length then
344
- create_triple(alignment_uri, RDF.rest, RDF::URI.new("#{feature_uri.to_s}/alignment/#{gap_no}/#{alignment_no + 1}"))
345
- else
346
- create_triple(alignment_uri, RDF.rest, RDF.nil)
469
+ create_triple(alignment_uri, @base.is_before, RDF::URI.new("#{feature_uri.to_s}/alignment/#{gap_no}/#{alignment_no + 1}"))
470
+ end
471
+ if span then
472
+ create_triple(alignment_uri, @base.has_attribute, RDF::URI.new("#{alignment_uri}/span"))
473
+ create_triple("#{alignment_uri}/span", RDF.type, @base.Span)
474
+ create_triple("#{alignment_uri}/span", @base.has_value, span.to_i)
347
475
  end
348
476
  }
349
477
  }
@@ -352,6 +480,29 @@ protected
352
480
  serialize_variant_effects(set_uri, feature_uri, list)
353
481
  elsif tag == 'Variant_seq' then
354
482
  serialize_variant_seqs(set_uri, feature_uri, list)
483
+ # VCF related attributes:
484
+ elsif tag == ' alternative_alleles' then
485
+ # TODO
486
+ elsif tag == ' filters' then
487
+ # Example: "Qual;MinAB;MinDP" -- comes here as split list (split by ";")
488
+ list.each { |id|
489
+ create_triple(feature_uri, @base.is_refuted_by, RDF::URI.new("#{set_uri}/filter/#{id}"))
490
+ }
491
+ elsif tag == ' samples' then
492
+ list.each_index { |sample|
493
+ list[sample].each_pair { |key, values|
494
+ serialize_vcf_sample(feature_uri, sample, key, values, attributes)
495
+ }
496
+ }
497
+ # Everything else:
498
+ elsif list == true then
499
+ # Attribute is a flag. Tag itself carries meaning and has no value associated with it.
500
+ attribute_uri = RDF::URI.new("#{feature_uri.to_s}/attribute/#{tag}")
501
+ create_triple(feature_uri, @base.has_attribute, attribute_uri)
502
+ create_triple(attribute_uri, RDF.type, @base.InformationContentEntity)
503
+ create_triple(attribute_uri, @base.has_attribute, RDF::URI.new("#{attribute_uri}/tag/#{tag}"))
504
+ create_triple(RDF::URI.new("#{attribute_uri}/tag/#{tag}"), RDF.type, @base.Label)
505
+ create_triple(RDF::URI.new("#{attribute_uri}/tag/#{tag}"), @base.has_value, tag)
355
506
  else
356
507
  # TODO Report unknown upper case letters here? That would be a spec. validation...
357
508
  # Well, or it would show that this implementation is incomplete. Could be either.
@@ -359,71 +510,299 @@ protected
359
510
  value = list[index]
360
511
  attribute_uri = RDF::URI.new("#{feature_uri.to_s}/attribute/#{tag}") if list.size == 1
361
512
  attribute_uri = RDF::URI.new("#{feature_uri.to_s}/attribute/#{tag}-#{index + 1}") unless list.size == 1
362
- create_triple(feature_uri, @base.attribute, attribute_uri)
363
- create_triple(attribute_uri, RDF.type, @base.Attribute)
364
- create_triple(attribute_uri, @base.tag, "#{tag}")
365
- create_triple(attribute_uri, RDF.value, value)
513
+ create_triple(feature_uri, @base.has_attribute, attribute_uri)
514
+ create_triple(attribute_uri, RDF.type, @base.InformationContentEntity)
515
+ # TODO Figure out why the following line was there. Seems wrong.
516
+ #create_triple(attribute_uri, @base.has_attribute, RDF::URI.new("#{attribute_uri}/tag/#{tag}"))
517
+ create_triple(attribute_uri, @base.has_value, value)
518
+ create_triple(RDF::URI.new("#{attribute_uri}/tag/#{tag}"), RDF.type, @base.Label)
519
+ create_triple(RDF::URI.new("#{attribute_uri}/tag/#{tag}"), @base.has_value, tag)
366
520
  }
367
521
  end
368
522
  }
369
523
  end
370
524
 
525
+ # Serializes VCF sample data (VCF columns 9 and above).
526
+ #
527
+ # See also genotype serialization of non-VCF data in `serialize_variant_seqs`.
528
+ #
529
+ # +feature_uri+:: URI of the feature that the sample data relates to
530
+ # +sample+:: number of the sample that is being addressed (sample column number)
531
+ # +key+:: key of the described sample values
532
+ # +values+:: values of the sample (possibly a composite type, e.g. a comma-separated list)
533
+ # +attributes+:: a map of tag/value pairs associated with the feature
534
+ def serialize_vcf_sample(feature_uri, sample, key, values, attributes)
535
+ if key == 'DP' then
536
+ # Depth across samples. An integer.
537
+ values = values.split(',')
538
+ values.each_index { |index|
539
+ serialize_vcf_sample_attribute(feature_uri, sample, key, true, values[index].to_i, index, values.size > 1, @base.Number_ofReads, RDF::XSD.integer)
540
+ }
541
+ elsif key == 'GT' then
542
+ # Genotype
543
+ list_uri = RDF::URI.new("#{feature_uri}/attribute/#{key}")
544
+ serialize_attribute_with_label(feature_uri, list_uri, @base.Genotype, key)
545
+ phased = values.index('/') == nil
546
+ if phased then
547
+ create_triple(list_uri, @base.has_attribute, "#{list_uri}/phase")
548
+ create_triple("#{list_uri}/phase", RDF.type, @base.GameticPhase)
549
+ end
550
+ value_uris = []
551
+ values = values.split(/\/|\|/)
552
+ # Only say something about zygosity if we deal with single bases and a diploid genome!
553
+ if values.length == 2 and values.map { |sequence| sequence.length }.uniq[0] == 1 then
554
+ if values.uniq.length == 1 then
555
+ create_triple(list_uri, @base.has_quality, @base.Homozygous)
556
+ else
557
+ create_triple(list_uri, @base.has_quality, @base.Heterozygous)
558
+ end
559
+ end
560
+ values.each_index { |index|
561
+ sequence = vcf_allele(values[index].to_i, attributes)
562
+ sequence_type = @base.SequenceVariant
563
+ sequence_type = @base.ReferenceSequence if values[index].to_i == 0
564
+ value_uris << value_uri = serialize_vcf_sample_attribute(feature_uri, sample, key, true, sequence, index, values.size > 1, sequence_type)
565
+ }
566
+ serialize_list_array(list_uri, value_uris)
567
+ elsif key == 'FT' then
568
+ # Filter: passed does nothing; applied filter uses isRefutedBy.
569
+ # TODO How to code using GFVO?
570
+ values = values.split(';')
571
+ values.each_index { |index|
572
+ serialize_vcf_sample_attribute(feature_uri, sample, key, true, values[index], index, values.size > 1, @base.InformationContentEntity)
573
+ }
574
+ elsif key == 'GL' then
575
+ # Genotype likelihoods.
576
+ list_uri = "#{feature_uri}/attribute/#{key}"
577
+ serialize_attribute_with_label(feature_uri, list_uri, @base.Score, key)
578
+ values = values.split(',')
579
+ value_uris = []
580
+ values.each_index { |index|
581
+ value_uris << serialize_vcf_sample_attribute(feature_uri, sample, key, false, values[index], index, values.size > 1, @base.InformationContentEntity)
582
+ }
583
+ serialize_list_array(list_uri, value_uris)
584
+ elsif key == 'GLE' then
585
+ # Genotype likelihoods of heterogeneous ploidy.
586
+ # Example: 0:-75.22,1:-223.42,0/0:-323.03,1/0:-99.29,1/1:-802.53
587
+ values = values.split(',')
588
+ values.each_index { |index|
589
+ genotype, likelihood = values[index].split(':')
590
+ genotype = genotype.split('/').map { |allele| vcf_allele(allele.to_i, attributes) }
591
+ serialize_vcf_sample_attribute(feature_uri, sample, key, true, genotype, index, values.size > 1, @base.InformationContentEntity)
592
+ }
593
+ elsif key == 'PL' then
594
+ # Phred scaled genotype likelihoods.
595
+ values = values.split(',')
596
+ values.each_index { |index|
597
+ serialize_vcf_sample_attribute(feature_uri, sample, key, true, values[index], index, values.size > 1, @base.InformationContentEntity)
598
+ }
599
+ elsif key == 'GP' then
600
+ # Phred scaled genotype posterior probabilities.
601
+ values = values.split(',')
602
+ values.each_index { |index|
603
+ serialize_vcf_sample_attribute(feature_uri, sample, key, true, values[index], index, values.size > 1, @base.InformationContentEntity)
604
+ }
605
+ elsif key == 'GQ' then
606
+ # Conditional genotype quality.
607
+ values = values.split(',')
608
+ values.each_index { |index|
609
+ serialize_vcf_sample_attribute(feature_uri, sample, key, true, values[index], index, values.size > 1, @base.InformationContentEntity)
610
+ }
611
+ elsif key == 'HQ' then
612
+ # Haplotype qualities -- presumably Phred scaled.
613
+ values = values.split(',')
614
+ values.each_index { |index|
615
+ serialize_vcf_sample_attribute(feature_uri, sample, key, true, values[index], index, values.size > 1, @base.InformationContentEntity)
616
+ }
617
+ elsif key == 'PS' then
618
+ # Phase set. It's complicated. See the VCF specification for details.
619
+ values = values.split(',')
620
+ values.each_index { |index|
621
+ serialize_vcf_sample_attribute(feature_uri, sample, key, true, values[index], index, values.size > 1, @base.InformationContentEntity)
622
+ }
623
+ elsif key == 'PQ' then
624
+ # Phasing quality in Phred scale.
625
+ values = values.split(',')
626
+ values.each_index { |index|
627
+ serialize_vcf_sample_attribute(feature_uri, sample, key, true, values[index], index, values.size > 1, @base.InformationContentEntity)
628
+ }
629
+ elsif key == 'EC' then
630
+ # Expected alternate allele counts.
631
+ values = values.split(',')
632
+ values.each_index { |index|
633
+ serialize_vcf_sample_attribute(feature_uri, sample, key, true, values[index], index, values.size > 1, @base.InformationContentEntity)
634
+ }
635
+ elsif key == 'MQ' then
636
+ # RMS mapping quality. An integer.
637
+ values = values.split(',')
638
+ values.each_index { |index|
639
+ serialize_vcf_sample_attribute(feature_uri, sample, key, true, values[index], index, values.size > 1, @base.InformationContentEntity)
640
+ }
641
+ else
642
+ # Unknown keys. Should that be possible at all?
643
+ serialize_vcf_sample_attribute(feature_uri, sample, key, true, values, 0, false, @base.InformationContentEntity)
644
+ end
645
+ end
646
+
647
+ # Returns the allele based on VCF's genotype indexing specification.
648
+ # The reference allele is index zero; alternative alleles are designated by indices of one or above.
649
+ #
650
+ # +genotype_index+:: VCF genotype index
651
+ # +attributes+:: feature attribute hash that contains reference/alternative allele bases
652
+ def vcf_allele(genotype_index, attributes)
653
+ if genotype_index == 0 then
654
+ genotype = attributes[' reference_bases'][0]
655
+ else
656
+ genotype = attributes[' alternative_alleles'][genotype_index - 1]
657
+ end
658
+ end
659
+
660
+ # Serializes an ordered list; the list's URIs are given as an array.
661
+ #
662
+ # +list_uri+:: URI of the list object (ordered list items will be part of this instance via "has_first_part", "has_ordered_part", "has_last_part")
663
+ # +uris+:: URIs of the list to be serialized
664
+ def serialize_list_array(list_uri, uris)
665
+ uris.each_index { |index|
666
+ next_uri = nil
667
+ next_uri = uris[index + 1] if index + 1 < uris.length
668
+ serialize_list(uris, index, uris[index], next_uri, list_uri)
669
+ }
670
+ end
671
+
672
+ # Create an ordered list of things; this method serializes one item only and
673
+ # repeated calls to this method create the list.
674
+ #
675
+ # +values+:: array of the things that appear in the ordered list
676
+ # +index+:: index of the thing that is serialized by this method call
677
+ # +value_uri+:: URI that represents the current value that is being linked to
678
+ # +next_value_uri+:: URI of the next serialized value (ignored, if last index)
679
+ # +list_uri+:: URI of the list that contains the items
680
+ def serialize_list(values, index, value_uri, next_value_uri, list_uri)
681
+ if index == 0 then
682
+ create_triple(list_uri, @base.has_first_part, value_uri)
683
+ elsif index + 1 == values.length then
684
+ create_triple(list_uri, @base.has_last_part, value_uri)
685
+ else
686
+ create_triple(list_uri, @base.has_ordered_part, value_uri)
687
+ end
688
+ if index + 1 < values.length then
689
+ create_triple(value_uri, @base.is_before, next_value_uri)
690
+ end
691
+ end
692
+
693
+ # Serializes basic information for an object (a feature's attribute) with label.
694
+ #
695
+ # Links the object to a feature, sets the object's type, assigns it a label.
696
+ #
697
+ # +feature_uri+:: URI of the feature that has the object as an attribute
698
+ # +object_uri+:: URI that represents the object
699
+ # +object_type+:: type of the object
700
+ # +label+:: label text to use
701
+ def serialize_attribute_with_label(feature_uri, object_uri, object_type, label)
702
+ create_triple(feature_uri, @base.has_attribute, object_uri)
703
+ create_triple(object_uri, RDF.type, object_type)
704
+ label_uri = RDF::URI.new("#{object_uri}/label")
705
+ create_triple(object_uri, @base.has_attribute, label_uri)
706
+ create_triple(label_uri, RDF.type, @base.Label)
707
+ create_triple(label_uri, @base.has_value, label)
708
+ end
709
+
710
+ # Serializes VCF meta-data (pragma equivalent) key/value pairs. Used by serialize_pragma.
711
+ #
712
+ # Returns URI of the serialized meta-data.
713
+ #
714
+ # +set_uri+:: URI of the set that the meta-data belongs to
715
+ # +uri_suffix+:: suffix that is appended to set_uri, which uniquely defines the meta-data (within the set_uri)
716
+ # +meta_type+:: type of the meta-data
717
+ # +key_type_mappings+:: mappings of keys to known types, everything else is considered an Object
718
+ # +attributes+:: key/value pairs that are the actual meta-data being described
719
+ def serialize_vcf_pragma(set_uri, uri_suffix, meta_type, key_type_mappings, attributes)
720
+ pragma_uri = RDF::URI.new("#{set_uri}/#{uri_suffix}")
721
+ create_triple(pragma_uri, RDF.type, meta_type)
722
+ attributes.each_pair { |key, value|
723
+ attribute_uri = RDF::URI.new("#{pragma_uri}/#{key}")
724
+ create_triple(pragma_uri, @base.has_attribute, attribute_uri)
725
+ if key_type_mappings.has_key?(key) then
726
+ create_triple(attribute_uri, RDF.type, key_type_mappings[key])
727
+ # TODO Check if type is integer/double here, then convert value accordingly.
728
+ else
729
+ create_triple(attribute_uri, RDF.type, @base.Object)
730
+ end
731
+ create_triple(attribute_uri, @base.has_value, value)
732
+ }
733
+ pragma_uri
734
+ end
735
+
736
+ # Serializes a VCF sample attribute/value pair. Used by serialize_vcf_sample.
737
+ #
738
+ # Returns URI of the serialized attribute/value pair.
739
+ #
740
+ # +base_uri+:: URI of the "thing" that the sample data relates to
741
+ # +sample+:: number of the sample that is being addressed (sample column number)
742
+ # +key+:: key of the described sample values
743
+ # +has_label+:: if true, then serialize label (value taken from key)
744
+ # +value+:: value that is associated with the key/sample
745
+ # +index+:: index of the value (in case value is part of an array of size > 1)
746
+ # +multivalue+:: true if this value is taken from an array of values of size > 1
747
+ # +attribute_type+:: type of the attribute entity that represents the value
748
+ # +value_type+:: type of the actual value
749
+ def serialize_vcf_sample_attribute(base_uri, sample, key, has_label, value, index, multivalue, attribute_type, value_type = nil)
750
+ value_uri = RDF::URI.new("#{base_uri.to_s}/sample/#{sample}/#{key}") unless multivalue
751
+ value_uri = RDF::URI.new("#{base_uri.to_s}/sample/#{sample}/#{key}-#{index + 1}") if multivalue
752
+ create_triple(base_uri, @base.has_attribute, value_uri)
753
+ create_triple(value_uri, RDF.type, attribute_type)
754
+ create_triple(value_uri, @base.has_value, value, value_type)
755
+ if has_label then
756
+ label_uri = RDF::URI.new("#{value_uri}/label")
757
+ create_triple(value_uri, @base.has_attribute, label_uri)
758
+ create_triple(label_uri, RDF.type, @base.Label)
759
+ create_triple(label_uri, @base.has_value, key)
760
+ end
761
+ value_uri
762
+ end
763
+
371
764
  # Serializes a structured attribute (given as a pragma statement), which later
372
765
  # can be referred to from feature instances.
373
766
  #
374
767
  # +set_uri+:: the feature set URI to which the structured attribute belongs
375
768
  # +pragma+:: a map that encapsulates the structured attribute data
376
769
  def serialize_structured_attribute(set_uri, pragma)
770
+ # TODO Triple from set_uri to attribute_uri missing; should be isParticipantIn
377
771
  attribute_uri = RDF::URI.new("#{set_uri.to_s}/structured_attribute/#{pragma.object_id}")
378
772
  attributes = nil
379
773
  class_type = nil
380
774
  if pragma.has_key?('attribute-method') then
381
775
  attributes = pragma['attribute-method'][0]
382
- class_type = @base.Method
776
+ class_type = @base.ExperimentalMethod
383
777
  elsif pragma.has_key?('data-source') then
384
778
  attributes = pragma['data-source'][0]
385
- class_type = @base.DataSource
779
+ class_type = @base.GenomicAscertainingMethod
386
780
  elsif pragma.has_key?('score-method') then
387
781
  attributes = pragma['score-method'][0]
388
- class_type = @base.Method
782
+ class_type = @base.ExperimentalMethod
389
783
  elsif pragma.has_key?('source-method') then
390
784
  attributes = pragma['source-method'][0]
391
- class_type = @base.Method
785
+ class_type = @base.ExperimentalMethod
392
786
  elsif pragma.has_key?('technology-platform') then
393
787
  attributes = pragma['technology-platform'][0]
394
- class_type = @base.TechnologyPlatform
788
+ class_type = @base.SequencingTechnologyPlatform
395
789
  else
396
790
  # TODO Error.
397
791
  end
398
- if class_type == @base.DataSource and attributes.has_key?('Data_type') then
792
+ if class_type == @base.GenomicAscertainingMethod and attributes.has_key?('Data_type') then
399
793
  attributes['Data_type'] = attributes['Data_type'][0] # TODO Make sure array is of length 1.
400
794
  if attributes['Data_type'] == 'Array_CGH' then
401
795
  class_type = @base.ArrayComparativeGenomicHybridization
402
796
  elsif attributes['Data_type'] == 'DNA_microarray' then
403
797
  class_type = @base.DNAMicroarray
404
798
  elsif attributes['Data_type'] == 'DNA_sequence' then
405
- class_type = @base.DNASequence
799
+ class_type = @base.DNASequencing
406
800
  elsif attributes['Data_type'] == 'RNA_sequence' then
407
- class_type = @base.RNASequence
801
+ class_type = @base.RNASequencing
408
802
  else
409
803
  # TODO Error.
410
804
  end
411
- elsif class_type == @base.TechnologyPlatform then
412
- if attributes.has_key?('Average_coverage') then
413
- create_triple(attribute_uri, @base.averageCoverage, attributes['Average_coverage'][0].to_i)
414
- end
415
- if attributes.has_key?('Platform_class') then
416
- create_triple(attribute_uri, @base.platformClass, attributes['Platform_class'][0])
417
- end
418
- if attributes.has_key?('Platform_name') then
419
- create_triple(attribute_uri, @base.platformName, attributes['Platform_name'][0])
420
- end
421
- if attributes.has_key?('Read_length') then
422
- create_triple(attribute_uri, @base.readLength, attributes['Read_length'][0].to_i)
423
- end
424
- if attributes.has_key?('Read_pair_span') then
425
- create_triple(attribute_uri, @base.readPairSpan, attributes['Read_pair_span'][0].to_i)
426
- end
805
+ elsif class_type == @base.SequencingTechnologyPlatform then
427
806
  if attributes.has_key?('Read_type') then
428
807
  attributes['Read_type'] = attributes['Read_type'][0] # TODO Make sure array is of length 1.
429
808
  if attributes['Read_type'] == 'fragment' then
@@ -436,17 +815,54 @@ protected
436
815
  end
437
816
  end
438
817
  create_triple(attribute_uri, RDF.type, class_type)
818
+ if class_type == @base.SequencingTechnologyPlatform or
819
+ class_type == @base.FragmentReadPlatform or
820
+ class_type == @base.PairedEndReadPlatform then
821
+ if attributes.has_key?('Average_coverage') then
822
+ coverage_uri = RDF::URI.new("#{attribute_uri}/averagecoverage")
823
+ create_triple(attribute_uri, @base.has_attribute, coverage_uri)
824
+ create_triple(coverage_uri, RDF.type, @base.AverageCoverage)
825
+ create_triple(coverage_uri, @base.has_value, attributes['Average_coverage'][0].to_i)
826
+ end
827
+ if attributes.has_key?('Platform_class') then
828
+ create_triple(attribute_uri, @base.has_attribute, RDF::URI.new("#{attribute_uri}/platformclass"))
829
+ #create_triple(attribute_uri, @base.platformClass, attributes['Platform_class'][0])
830
+ end
831
+ if attributes.has_key?('Platform_name') then
832
+ #create_triple(attribute_uri, @base.platformName, attributes['Platform_name'][0])
833
+ end
834
+ if attributes.has_key?('Read_length') then
835
+ #create_triple(attribute_uri, @base.readLength, attributes['Read_length'][0].to_i)
836
+ end
837
+ if attributes.has_key?('Read_pair_span') then
838
+ #create_triple(attribute_uri, @base.readPairSpan, attributes['Read_pair_span'][0].to_i)
839
+ end
840
+ end
439
841
  attributes.keys.each { |tag|
440
842
  if tag.match(/^[a-z]/) then
843
+ tag.strip!
441
844
  custom_attribute_uri = RDF::URI.new("#{attribute_uri.to_s}/attribute/#{tag}")
442
- create_triple(custom_attribute_uri, RDF.type, @base.StructuredAttribute)
443
- create_triple(custom_attribute_uri, @base.tag, tag)
845
+ create_triple(attribute_uri, @base.has_attribute, custom_attribute_uri)
846
+ create_triple(custom_attribute_uri, RDF.type, @base.InformationContentEntity)
847
+ create_triple(custom_attribute_uri, @base.has_attribute, RDF::URI.new("#{attribute_uri.to_s}/attribute/#{tag}/name"))
444
848
  attributes[tag].each { |value|
445
- create_triple(custom_attribute_uri, RDF.value, value)
849
+ create_triple(custom_attribute_uri, @base.has_value, value)
446
850
  }
447
- create_triple(attribute_uri, @base.attribute, custom_attribute_uri)
851
+ create_triple(RDF::URI.new("#{attribute_uri.to_s}/attribute/#{tag}/name"), RDF.type, @base.Name)
852
+ create_triple(RDF::URI.new("#{attribute_uri.to_s}/attribute/#{tag}/name"), @base.has_value, tag)
448
853
  else
449
854
  # TODO
855
+ match_constraints = {}
856
+ attributes[tag].each { |value|
857
+ if tag == 'Seqid' or tag == 'Type' or tag == 'Source' then
858
+ match_constraints[tag] = value.split(',')
859
+ else
860
+ # Not a recognized match. Might be a Dbxref or Comment.
861
+ end
862
+ }
863
+ unless match_constraints.keys.empty? then
864
+ @matches
865
+ end
450
866
  end
451
867
  }
452
868
  end
@@ -462,35 +878,46 @@ protected
462
878
  sequence_variant, variant_index, feature_type, feature_ids = effect.split(' ', 4)
463
879
  feature_ids = feature_ids.split(' ')
464
880
  effect_uri = RDF::URI.new("#{feature_uri.to_s}/variant/#{variant_index}/effect/#{index}")
465
- serialize_variant_triple(feature_uri, RDF::URI.new("#{feature_uri.to_s}/variant/#{variant_index}"), @base.effect, effect_uri)
466
- create_triple(effect_uri, RDF.type, @base.Effect)
467
- create_triple(effect_uri, @base.sequenceVariant, BioInterchange::SO.send(BioInterchange.make_safe_label(sequence_variant)))
468
- create_triple(effect_uri, @base.featureType, BioInterchange::SO.send(BioInterchange.make_safe_label(feature_type)))
881
+ # TODO
882
+ #serialize_variant_triple(feature_uri, RDF::URI.new("#{feature_uri.to_s}/variant/#{variant_index}"), @base.effect, effect_uri)
883
+ # Type is a SO sequence_variant or descendent:
884
+ create_triple(effect_uri, RDF.type, BioInterchange::SO.send(BioInterchange.make_safe_label(sequence_variant)))
885
+ # The feature type should be already apparent from the targeted feature. Do no sanity
886
+ # check here (if they match) and just skip over it.
887
+ # create_triple(effect_uri, @base.feature_type, BioInterchange::SO.send(BioInterchange.make_safe_label(feature_type)))
469
888
  feature_ids.each { |feature_id|
470
- create_triple(effect_uri, @base.feature, feature_id)
889
+ create_triple(feature_id, @base.is_affected_by, effect_uri)
471
890
  }
472
891
  }
473
892
  end
474
893
 
475
894
  # Serializes a list of variant sequences.
476
895
  #
896
+ # See also VCF genotype serialization ('GT' attribute) in `serialize_vcf_sample`.
897
+ #
477
898
  # +set_uri+:: the feature set URI to which the feature belongs to
478
899
  # +feature_uri+:: the feature URI to the feature that is annotated with variant data
479
900
  # +list+:: list of variant values
480
901
  def serialize_variant_seqs(set_uri, feature_uri, list)
902
+ variant_uri = nil
903
+
481
904
  list.each_index { |index|
482
905
  value = list[index]
483
906
  variant_uri = RDF::URI.new("#{feature_uri.to_s}/variant/#{index}")
484
- serialize_variant_triple(feature_uri, variant_uri, @base.sequence, RDF::Literal.new(value))
907
+ sequence_uri = RDF::URI.new("#{variant_uri}/sequence")
908
+ serialize_variant_triple(feature_uri, variant_uri, @base.has_attribute, sequence_uri)
909
+ create_triple(sequence_uri, @base.has_value, value)
485
910
  }
486
911
 
487
- # Return the variant type based on the present sequence(s):
488
- return @base.Variant if list.length != 2
489
912
  if list[0].match(/a-zA-Z/) and list[1].match(/a-zA-Z/) then
490
- return @base.HomozygousVariant if list[0] == list[1]
491
- return @base.HeterozygousVariant
913
+ if list[0] == list[1] then
914
+ create_triple(variant_uri, @base.has_quality, @base.Homozygous)
915
+ else
916
+ create_triple(variant_uri, @base.has_quality, @base.Heterozygous)
917
+ end
492
918
  end
493
- return @base.Variant
919
+
920
+ return @base.SequenceVariant
494
921
  end
495
922
 
496
923
  # Adds a variant to the graph; tracks the variant's URI that RDF.type is only written out once.
@@ -501,8 +928,8 @@ protected
501
928
  # +object+:: data to be serialized
502
929
  def serialize_variant_triple(feature_uri, variant_uri, predicate, object)
503
930
  unless @variants.has_key?(variant_uri.to_s) then
504
- create_triple(feature_uri, @base.sequence_annotation, variant_uri)
505
- create_triple(variant_uri, RDF.type, @base.Variant)
931
+ create_triple(feature_uri, @base.is_affected_by, variant_uri)
932
+ create_triple(variant_uri, RDF.type, @base.VariantCalling)
506
933
  end
507
934
  @variants[variant_uri.to_s] = true
508
935
  create_triple(variant_uri, predicate, object)
@@ -515,12 +942,39 @@ protected
515
942
  def serialize_feature_sequence(set_uri, feature_sequence)
516
943
  feature_uri = RDF::URI.new("#{set_uri.to_s}/feature/#{feature_sequence.feature_id}")
517
944
  annotation_uri = RDF::URI.new("#{feature_uri.to_s}/sequence")
518
- create_triple(feature_uri, @base.sequence_annotation, annotation_uri)
519
- create_triple(annotation_uri, RDF.type, @base.Sequence_Annotation)
520
- create_triple(annotation_uri, RDF::RDFS.comment, feature_sequence.comment) if feature_sequence.comment
521
- create_triple(annotation_uri, @base.sequence, feature_sequence.sequence)
945
+ create_triple(feature_uri, @base.has_attribute, annotation_uri)
946
+ create_triple(annotation_uri, RDF.type, @base.Sequence)
947
+ create_triple(annotation_uri, BioInterchange::RDFS.comment, feature_sequence.comment) if feature_sequence.comment
948
+ create_triple(annotation_uri, @base.has_value, feature_sequence.sequence)
522
949
  end
523
950
 
951
+ # Serializes an external database reference.
952
+ #
953
+ # +feature_uri+:: URI of the feature that the external database references is referring to
954
+ # +dbxref_composite+:: composite term of the external database reference (e.g. ""dbSNP_127:rs123456)
955
+ def serialize_dbxref(feature_uri, dbxref_composite)
956
+ abbreviation, accession = dbxref_composite.split(':', 2)
957
+ dbxref_uri = RDF::URI.new("#{feature_uri.to_s}/dbxref/#{BioInterchange.make_safe_label(abbreviation)}")
958
+ create_triple(feature_uri, @base.references, dbxref_uri)
959
+
960
+ create_triple(dbxref_uri, RDF.type, @base.ExternalReference)
961
+ create_triple(dbxref_uri, @base.refers_to, BioInterchange::LifeScienceRegistry.send(dbxref_composite.split('_', 2)[0].downcase).sub('$id', accession))
962
+ if dbxref_composite.match(/^.+_.+:.+$/) then
963
+ # Entry with version information.
964
+ version_uri = RDF::URI.new("#{dbxref_uri}/version")
965
+ create_triple(dbxref_uri, @base.has_identifier, version_uri)
966
+ create_triple(version_uri, @base.has_value, abbreviation[6..-1])
967
+ end
968
+
969
+ #if dbxref_composite.match(/^dbSNP(_\d+)?:rs\d+$/) then
970
+ # # linkout = "http://www.ncbi.nlm.nih.gov/projects/SNP/snp_ref.cgi?rs=#{dbxref_composite.split(/:/)[1].sub(/^rs/, '')}"
971
+ #elsif dbxref_composite.match(/^COSMIC(_\d+)?:COSM\d+$/) then
972
+ # linkout = "http://cancer.sanger.ac.uk/cosmic/mutation/overview?id=#{accession.sub(/^COSM/, '')}"
973
+ #else
974
+ # BioInterchange::GOXRef.send(BioInterchange.make_safe_label(abbreviation)).to_s + accession
975
+ #end
976
+ end
977
+
524
978
  end
525
979
 
526
980
  end