biointerchange 1.0.1 → 1.0.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (54) hide show
  1. checksums.yaml +4 -4
  2. data/.travis.yml +2 -4
  3. data/Gemfile +2 -3
  4. data/README.md +36 -22
  5. data/VERSION +1 -1
  6. data/examples/Felis_catus.gvf.gz +0 -0
  7. data/examples/Felis_catus_incl_consequences.vcf.gz +0 -0
  8. data/generators/rdfxml.rb +1 -1
  9. data/generators/tsv2rubyclass.rb +31 -0
  10. data/lib/biointerchange/core.rb +17 -5
  11. data/lib/biointerchange/genomics/gff3_rdf_ntriples.rb +591 -137
  12. data/lib/biointerchange/genomics/gff3_reader.rb +16 -3
  13. data/lib/biointerchange/genomics/gvf_reader.rb +1 -1
  14. data/lib/biointerchange/genomics/vcf_feature.rb +46 -0
  15. data/lib/biointerchange/genomics/vcf_feature_set.rb +14 -0
  16. data/lib/biointerchange/genomics/vcf_reader.rb +238 -0
  17. data/lib/biointerchange/gfvo.rb +689 -553
  18. data/lib/biointerchange/life_science_registry.rb +3595 -0
  19. data/lib/biointerchange/textmining/text_mining_rdf_ntriples.rb +33 -35
  20. data/lib/biointerchange/writer.rb +11 -16
  21. data/make.sh +4 -0
  22. data/spec/exceptions_spec.rb +1 -7
  23. data/spec/gff3_rdfwriter_spec.rb +2 -16
  24. data/spec/gvf_rdfwriter_spec.rb +2 -19
  25. data/spec/phylogenetics_spec.rb +1 -13
  26. data/spec/text_mining_pdfx_xml_reader_spec.rb +1 -13
  27. data/spec/text_mining_pubannos_json_reader_spec.rb +1 -14
  28. data/spec/text_mining_rdfwriter_spec.rb +8 -19
  29. data/test.sh +4 -0
  30. data/web/about.html +10 -14
  31. data/web/api.html +11 -13
  32. data/web/bootstrap/css/bootstrap-theme.css +347 -0
  33. data/web/bootstrap/css/bootstrap-theme.css.map +1 -0
  34. data/web/bootstrap/css/bootstrap-theme.min.css +7 -0
  35. data/web/bootstrap/css/bootstrap.css +4764 -4603
  36. data/web/bootstrap/css/bootstrap.css.map +1 -0
  37. data/web/bootstrap/css/bootstrap.min.css +6 -8
  38. data/web/bootstrap/fonts/glyphicons-halflings-regular.eot +0 -0
  39. data/web/bootstrap/fonts/glyphicons-halflings-regular.svg +229 -0
  40. data/web/bootstrap/fonts/glyphicons-halflings-regular.ttf +0 -0
  41. data/web/bootstrap/fonts/glyphicons-halflings-regular.woff +0 -0
  42. data/web/bootstrap/js/bootstrap.js +1372 -1448
  43. data/web/bootstrap/js/bootstrap.min.js +5 -5
  44. data/web/cli.html +14 -28
  45. data/web/index.html +15 -33
  46. data/web/ontologies.html +1089 -945
  47. data/web/webservices.html +12 -14
  48. metadata +24 -27
  49. data/lib/biointerchange/gff3o.rb +0 -525
  50. data/lib/biointerchange/gvf1o.rb +0 -1354
  51. data/web/bootstrap/css/bootstrap-responsive.css +0 -1040
  52. data/web/bootstrap/css/bootstrap-responsive.min.css +0 -9
  53. data/web/bootstrap/img/glyphicons-halflings-white.png +0 -0
  54. data/web/bootstrap/img/glyphicons-halflings.png +0 -0
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 24fa6ef275de7c6f218b9807eae4049d63ee6ac8
4
- data.tar.gz: f55cd8a7d9fc2448b7a674cad4ea37c549858cd3
3
+ metadata.gz: 0b66ff1fc16121ae1bd423d5f06bf212b00cdd5c
4
+ data.tar.gz: 93bdd0303b40580eb5f70f17db8aad93228fc11d
5
5
  SHA512:
6
- metadata.gz: 612461c1a22264afa5a743fed78116908fed72eda191aee63e6788e50a5d3c81273f5447ad909d46bfed1ee72d1761d8dabbe4e0c3cd9de05f565a8e9336380b
7
- data.tar.gz: d3aaa588d0f3550fd0dbe0d31045b30021a53a2d6212d72aa98168a483ae8dcc32d8bb4c5f1d934c73065642235f7170387d5b483a06c98c74494215bfa3f335
6
+ metadata.gz: 2b8adeeb8584d3d09729faeb6475dfd9be4adc3f568fdb8a1e4e654c605126384574344e6235f831609a9fa6a61f24ef87dbfff579153dab0f6e406887208584
7
+ data.tar.gz: f709f45089d6cc469884f9e182752fda0be6235762e65857fa1f8b2f0bca72e392618a8b7da142c0ed763c24526195d126a07a709c55a30ce152caf44b5c31b8
data/.travis.yml CHANGED
@@ -2,11 +2,9 @@ language: ruby
2
2
  rvm:
3
3
  - 1.9.2
4
4
  - 1.9.3
5
+ - 2.0.0
6
+ - 2.1.1
5
7
  - jruby-19mode # JRuby in 1.9 mode
6
- # - rbx-19mode
7
- - 1.8.7
8
- - jruby-18mode # JRuby in 1.8 mode
9
- # - rbx-18mode
10
8
 
11
9
  # uncomment this line if your project needs to run something other than `rake`:
12
10
  # script: bundle exec rspec spec
data/Gemfile CHANGED
@@ -2,10 +2,10 @@ source "http://rubygems.org"
2
2
  # Add dependencies required to use your gem here.
3
3
  # Example:
4
4
  # gem "activesupport", ">= 2.3.5"
5
- gem "rdf", ">= 0.3.4.1"
5
+ gem "rdf", ">= 1.1.4.3"
6
6
  gem "json", ">= 1.6.4"
7
7
  gem "getopt", ">= 1.4.1"
8
- gem "addressable", ">= 2.3.2"
8
+ gem "addressable", ">= 2.3.6"
9
9
  gem "bio", ">= 1.4.2"
10
10
 
11
11
  # Add dependencies to develop your gem here.
@@ -14,6 +14,5 @@ group :development do
14
14
  gem "rspec", "~> 2.8.0"
15
15
  gem "bundler", ">= 1.1.5"
16
16
  gem "jeweler", "~> 1.8.4"
17
- gem "bio", ">= 1.4.2"
18
17
  gem "rdoc", "~> 3.12"
19
18
  end
data/README.md CHANGED
@@ -21,14 +21,12 @@ Ontologies used in the RDF output:
21
21
 
22
22
  * [Comparative Data Analysis Ontology](http://sourceforge.net/apps/mediawiki/cdao/index.php?title=Main_Page) (CDAO)
23
23
  * [Friend of a Friend](http://xmlns.com/foaf/spec) (FOAF)
24
- * [Generic Feature Format Version 3 Ontology](http://www.biointerchange.org/ontologies.html) (GFF3O)
25
- * [Genome Variation Format Version 1 Ontology](http://www.biointerchange.org/ontologies.html) (GVF1O)
26
24
  * [Genomic Feature and Variation Ontology](http://www.biointerchange.org/ontologies.html) (GFVO)
27
25
  * [Semanticscience Integrated Ontology](http://code.google.com/p/semanticscience/wiki/SIO) (SIO)
28
26
  * [Sequence Ontology](http://www.sequenceontology.org/index.html) (SO)
29
27
  * [Sequence Ontology Feature Annotation](http://www.sequenceontology.org/index.html) (SOFA)
30
28
 
31
- *Note:* GFF3O and GVF1O will be replaced by GFVO with the next release of BioInterchange.
29
+ *Note:* GFVO replaces the [Generic Feature Format Version 3 Ontology](http://www.biointerchange.org/ontologies.html) (GFF3O) and [Genome Variation Format Version 1 Ontology](http://www.biointerchange.org/ontologies.html) (GVF1O).
32
30
 
33
31
  #### Contributing
34
32
 
@@ -54,7 +52,7 @@ BioInterchange's command-line tool `biointerchange` can be installed as a comman
54
52
 
55
53
  Examples:
56
54
 
57
- biointerchange --input biointerchange.gvf --rdf rdf.biointerchange.gvf --batchsize 100 --file examples/estd176_Banerjee_et_al_2011.2012-11-29.NCBI36.gvf
55
+ biointerchange --input biointerchange.gvf --rdf rdf.biointerchange.gfvo --file examples/estd176_Banerjee_et_al_2011.2012-11-29.NCBI36.gvf
58
56
  biointerchange --input dbcls.catanns.json --rdf rdf.bh12.sio --file examples/pubannotation.10096561.json --annotate_name 'Peter Smith' --annotate_name_id 'peter.smith@example.com'
59
57
  biointerchange --input uk.ac.man.pdfx --rdf rdf.bh12.sio --file examples/gb-2007-8-3-R40.xml --annotate_name 'Peter Smith' --annotate_name_id 'peter.smith@example.com'
60
58
  biointerchange --input phylotastic.newick --rdf rdf.phylotastic.newick --file examples/tree2.new --annotate_date '1 June 2006'
@@ -69,8 +67,7 @@ Input formats:
69
67
 
70
68
  Output formats:
71
69
 
72
- * `rdf.biointerchange.gff3`
73
- * `rdf.biointerchange.gvf`
70
+ * `rdf.biointerchange.gfvo`
74
71
  * `rdf.bh12.sio`
75
72
  * `rdf.phylotastic.newick`
76
73
 
@@ -93,15 +90,6 @@ To list all `seqid` entries from a GVF-file conversion in the store, the followi
93
90
 
94
91
  testrepo> sparql select * where { ?s <http://www.biointerchange.org/gvf1o#GVF1_0004> ?o } .
95
92
 
96
- #### Data Consistency Verification
97
-
98
- Data consistency is verifyable for the output formats `rdf.biointerchange.gff3` and `rdf.biointerchange.gvf` using the [BioInterchange ontologies](http://www.biointerchange.org/ontologies.html) GFF3O and GVF1O. The following is an example of how [Jena](http://jena.apache.org)'s command line tools and the [HermiT reasoner](http://hermit-reasoner.com) can be used for conistency verification:
99
-
100
- rdfcat <path-to-gff3o/gvf1o> <yourdata.n3> > merged.xml
101
- java -d64 -Xmx4G -jar HermiT.jar -k -v merged.xml
102
-
103
- Another approach is to load the data and its related GFF3O/GVF1O ontology into [Protege](http://protege.stanford.edu), merge them, and then use the "Explain inconsistent ontology" menu item to inspect possible data inconsistencies.
104
-
105
93
  #### Example Data Provenance
106
94
 
107
95
  The following list provides information on the origin of the example-data files in the `examples` directory:
@@ -110,8 +98,16 @@ The following list provides information on the origin of the example-data files
110
98
  * `BovineGenomeChrX.gff3.gz`: Gzipped GFF3 file describing a Bos taurus chromosome X. Downloaded from [http://bovinegenome.org/?q=download_chromosome_gff3](http://bovinegenome.org/?q=download_chromosome_gff3)
111
99
  * `chromosome_BF.gff`: GFF3 file of floating contigs from the Baylor Sequencing Centre. Downloaded from [http://dictybase.org/Downloads](http://dictybase.org/Downloads)
112
100
  * `estd176_Banerjee_et_al_2011.2012-11-29.NCBI36.gvf`: GVF file of EBI's [DGVa](http://www.ebi.ac.uk/dgva/database-genomic-variants-archive). Downloaded from [ftp://ftp.ebi.ac.uk/pub/databases/dgva/estd176_Banerjee_et_al_2011/gvf/estd176_Banerjee_et_al_2011.2012-11-29.NCBI36.gvf](ftp://ftp.ebi.ac.uk/pub/databases/dgva/estd176_Banerjee_et_al_2011/gvf/estd176_Banerjee_et_al_2011.2012-11-29.NCBI36.gvf)
101
+ * `Felis_catus.gvf.gz`: Gzipped GVF file of F. catus genomic variations provided by the Ensembl project. Downloaded from [ftp://ftp.ensembl.org/pub/release-71/variation/gvf/felis_catus/Felis_catus.gvf.gz](ftp://ftp.ensembl.org/pub/release-71/variation/gvf/felis_catus/Felis_catus.gvf.gz)
102
+ * `Felis_catus_incl_consequences.vcf.gz`: Gzipped VCF file of F. catus genomic variations provided by the Ensembl project. Downloaded from [ftp://ftp.ensembl.org/pub/current_variation/vcf/felis_catus/Felis_catus_incl_consequences.vcf.gz](ftp://ftp.ensembl.org/pub/current_variation/vcf/felis_catus/Felis_catus_incl_consequences.vcf.gz)
113
103
  * `gb-2007-8-3-R40.xml`: Generated by [PDFx](http://pdfx.cs.man.ac.uk) from open-access source PDF [Sense-antisense pairs in mammals: functional and evolutionary considerations](http://genomebiology.com/content/pdf/gb-2007-8-3-r40.pdf)
114
- * `Saccharomyces_cerevisiae_incl_consequences.gvf.gz`: Downloaded from [ftp://ftp.ensembl.org/pub/release-71/variation/gvf/saccharomyces_cerevisiae/Saccharomyces_cerevisiae_incl_consequences.gvf.gz](ftp://ftp.ensembl.org/pub/release-71/variation/gvf/saccharomyces_cerevisiae/Saccharomyces_cerevisiae_incl_consequences.gvf.gz)
104
+ * `Saccharomyces_cerevisiae_incl_consequences.gvf.gz`: Gzipped GVF file of S. cerevisiae genomic variations provided by the Ensembl project. Downloaded from [ftp://ftp.ensembl.org/pub/release-71/variation/gvf/saccharomyces_cerevisiae/Saccharomyces_cerevisiae_incl_consequences.gvf.gz](ftp://ftp.ensembl.org/pub/release-71/variation/gvf/saccharomyces_cerevisiae/Saccharomyces_cerevisiae_incl_consequences.gvf.gz)
105
+
106
+ #### Additional Example Data
107
+
108
+ The following list contains data sets that were used in testing BioInterchange's implementation, but were not included in the GitHub repository due to their size:
109
+
110
+ * `mgp.v3.indels.rsIDdbSNPv137.vcf.gz`: Gzipped VCF file of M. musculus indels by the Sanger Institute. Downloaded from [ftp://ftp-mouse.sanger.ac.uk/current_snps/mgp.v3.indels.rsIDdbSNPv137.vcf.gz](ftp://ftp-mouse.sanger.ac.uk/current_snps/mgp.v3.indels.rsIDdbSNPv137.vcf.gz)
115
111
 
116
112
  ### Application Programming Interface
117
113
 
@@ -322,9 +318,10 @@ The writer takes an object model and serializes it via the `BioInterchange::Writ
322
318
  # Serialize a model as RDF.
323
319
  #
324
320
  # +model+:: a generic representation of input data that is an instance of BioInterchange::Phylogenetics::TreeSet
325
- def serialize(model)
321
+ # +uri_prefix+:: optional URI prefix that should be used in the RDFization of individuals/class instances
322
+ def serialize(model, uri_prefix = nil)
326
323
  model.contents.each { |tree|
327
- serialize_model(model, tree)
324
+ serialize_model(model, tree, uri_prefix)
328
325
  }
329
326
  end
330
327
 
@@ -501,8 +498,7 @@ RDFization parameters and data are send as a single HTTP POST requests containin
501
498
  * `phylotastic.newick`: [Newick](http://evolution.genetics.washington.edu/phylip/newicktree.html)
502
499
  * `uk.ac.man.pdfx`: [PDFx](http://pdfx.cs.man.ac.uk) XML
503
500
  * `OUTPUT_METHOD`: determines the RDFization method that should be used, output will always be RDF N-Triples; available output formats are
504
- * `rdf.biointerchange.gff3`: RDFization of `biointerchange.gff3`
505
- * `rdf.biointerchange.gvf`: RDFization of `biointerchange.gvf`
501
+ * `rdf.biointerchange.gfvo`: RDFization of `biointerchange.gff3` or `biointerchange.gvf`
506
502
  * `rdf.bh12.sio`: RDFization of `dbcls.catanns.json` or `uk.ac.man.pdfx`
507
503
  * `rdf.phylotastic.newick`: RDFization of `phylotastic.newick`
508
504
  * `URL_ENCODED_DATA`: data for RDFization as [URL encoded](http://en.wikipedia.org/wiki/Percent-encoding) string
@@ -590,6 +586,13 @@ A Geno Ontology external reference (GOxref) vocabulary can be created by directl
590
586
  curl ftp://ftp.geneontology.org/pub/go/doc/GO.xrf_abbs | ruby generators/GOxrefify.rb
591
587
  echo -e "\nend" >> lib/biointerchange/goxref.rb
592
588
 
589
+ Building an external reference vocabulary based on Life Science Registry external database abbreviations (based on download of the
590
+ Life Science Registry spreadsheet as a TSV file):
591
+
592
+ echo -e "module BioInterchange\n" > lib/biointerchange/life_science_registry.rb
593
+ cut -f 1,25 <path-to-registry-tsv-file> | grep -E 'https?://.*\$id' | ruby generators/tsv2rubyclass.rb LifeScienceRegistry >> lib/biointerchange/life_science_registry.rb
594
+ echo -e "\nend" >> lib/biointerchange/life_science_registry.rb
595
+
593
596
  #### Python Vocabulary Classes
594
597
 
595
598
  The source-code generation can be skipped, if none of the ontologies that are used by BioInterchange have been changed. Otherwise, the existing Python vocabulary class wrappers can be generated as follows:
@@ -630,14 +633,25 @@ The following Java packages will automatically install alongside BioInterchange'
630
633
 
631
634
  ### Gem Bundling/Installing
632
635
 
636
+ Mac OS X prerequisites and `bundle install` difference:
637
+
638
+ sudo port install libxml2 libxslt
639
+ sudo ARCHFLAGS=-Wno-error=unused-command-line-argument-hard-error-in-future bundle install
640
+
641
+ Actual gem bundling:
642
+
633
643
  bundle exec rake gemspec
634
644
  bundle exec gem build biointerchange.gemspec
635
- sudo bundle exec gem install biointerchange
645
+ sudo bundle exec gem install biointerchange-`cat VERSION`.gem
636
646
 
637
647
  If you encounter problems with gem dependencies, then you can try to explicitly use Ruby 1.9:
638
648
 
639
649
  bundle exec gem1.9 build biointerchange.gemspec
640
- sudo bundle exec gem1.9 install biointerchange
650
+ sudo bundle exec gem1.9 install biointerchange-`cat VERSION`.gem
651
+
652
+ Alternative build script, `make.sh`, which installs the gem without RDocs and ri pages (quicker when testing):
653
+
654
+ ./make.sh
641
655
 
642
656
  ### Unit Testing
643
657
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 1.0.1
1
+ 1.0.2
Binary file
data/generators/rdfxml.rb CHANGED
@@ -19,7 +19,7 @@ SIO_SYN = RDF::URI.new('http://semanticscience.org/resource/synonym')
19
19
  # This label conversion also appears in:
20
20
  # +lib/biointerchange/core.rb+
21
21
  def make_safe_label(label)
22
- label.gsub(/[ '-.<>\/]/, '_').gsub(/\([^\)]*?\)/, '').sub(/^(\d+)/, "a_#{$1}").gsub(/^_+|_+$/, '').gsub(/_+/, '_')
22
+ label.gsub(/[ '-.<>\/]/, '_').gsub(/\([^\)]*?\)/, '').sub(/^(\d+)/){ "a_#{$1}" }.gsub(/^_+|_+$/, '').gsub(/_+/, '_').gsub(/_([A-Z])+/x){ "#{$1}" }
23
23
  end
24
24
 
25
25
  reader = RDF::RDFXML::Reader.open(ARGV[0])
@@ -0,0 +1,31 @@
1
+ #!/usr/bin/ruby
2
+
3
+ require 'biointerchange'
4
+
5
+ if ARGV.length != 1 then
6
+ puts 'Usage: tsv2rubyclass rubyclassname'
7
+ puts ''
8
+ puts 'Reads a TSV file from STDIN, where the first column values become'
9
+ puts 'method names (sanitized for spaces, etc.) in the class and the'
10
+ puts 'second column values are returned as a string.'
11
+ puts ''
12
+ puts 'The generated Ruby class is output on STDOUT.'
13
+ exit 1
14
+ end
15
+
16
+ classname = ARGV[0]
17
+
18
+ puts "class #{classname}"
19
+ puts ''
20
+
21
+ STDIN.each { |line|
22
+ key, value = line.chomp.split("\t")
23
+
24
+ puts " def self.#{BioInterchange.make_safe_label(key)}"
25
+ puts " \"#{value}\""
26
+ puts ' end'
27
+ puts ''
28
+ }
29
+
30
+ puts 'end'
31
+
@@ -34,13 +34,13 @@ module BioInterchange
34
34
  # Custom Exceptions and Errors
35
35
  require 'biointerchange/exceptions'
36
36
 
37
- # Ontologies (besides the ones from the 'rdf' gem)
37
+ # Ontologies (besides the ones from the 'rdf' gem), vocabularies and
38
+ # other mappings (e.g., database abbreviations to URIs):
39
+ require 'biointerchange/life_science_registry'
38
40
  require 'biointerchange/cdao'
39
41
  require 'biointerchange/faldo'
40
- require 'biointerchange/gff3o'
41
42
  require 'biointerchange/gfvo'
42
43
  require 'biointerchange/goxref'
43
- require 'biointerchange/gvf1o'
44
44
  require 'biointerchange/sio'
45
45
  require 'biointerchange/so'
46
46
  require 'biointerchange/sofa'
@@ -105,6 +105,18 @@ module BioInterchange
105
105
  # Writer
106
106
  # ...same GFF3 writer
107
107
 
108
+ ### VCF ###
109
+
110
+ # Reader
111
+ require 'biointerchange/genomics/vcf_reader'
112
+
113
+ # Feature base model
114
+ require 'biointerchange/genomics/vcf_feature_set'
115
+ require 'biointerchange/genomics/vcf_feature'
116
+
117
+ # Writer
118
+ # ...same GFF3 writer
119
+
108
120
  #
109
121
  # PHYLOGENETICS
110
122
  #
@@ -225,7 +237,7 @@ module BioInterchange
225
237
  'input' => opt['input'],
226
238
  'output' => opt['output']
227
239
  }
228
- map['batchsize'] = opt['batchsize'].to_i if opt['batchsize']
240
+ map['batch_size'] = opt['batchsize'].to_i if opt['batchsize']
229
241
  opt.each_key { |key|
230
242
  map[key.sub(/^annotate_/, '')] = opt[key] if key.start_with?('annotate_')
231
243
  }
@@ -296,7 +308,7 @@ module BioInterchange
296
308
  #
297
309
  # +label+:: string that should be converted into a "safe" string that can be used as a Ruby method name
298
310
  def self.make_safe_label(label)
299
- label.gsub(/[ '-.<>\/]/, '_').gsub(/\([^\)]*?\)/, '').sub(/^(\d+)/, "a_#{$1}").gsub(/^_+|_+$/, '').gsub(/_+/, '_')
311
+ label.gsub(/[ '-.<>\/]/, '_').gsub(/\([^\)]*?\)/, '').sub(/^(\d+)/){ "a_#{$1}" }.gsub(/^_+|_+$/, '').gsub(/_+/, '_').gsub(/_([A-Z])+/x){ "#{$1}" }
300
312
  end
301
313
 
302
314
  private
@@ -4,11 +4,12 @@ require 'date'
4
4
 
5
5
  module BioInterchange::Genomics
6
6
 
7
- # Serializes GFF3 and GVF models.
7
+ # Serializes GFF3, GVF and VCF models.
8
8
  #
9
9
  # Inputs:
10
10
  # - biointerchange.gff3
11
11
  # - biointerchange.gvf
12
+ # - biointerchange.vcf
12
13
  #
13
14
  # Outputs:
14
15
  # - rdf.biointerchange.gfvo
@@ -18,14 +19,7 @@ class RDFWriter < BioInterchange::Writer
18
19
  BioInterchange::Registry.register_writer(
19
20
  'rdf.biointerchange.gfvo',
20
21
  BioInterchange::Genomics::RDFWriter,
21
- [ 'biointerchange.gff3' ],
22
- true,
23
- 'Genomic Feature and Variation Ontology (GFVO) based RDFization'
24
- )
25
- BioInterchange::Registry.register_writer(
26
- 'rdf.biointerchange.gfvo',
27
- BioInterchange::Genomics::RDFWriter,
28
- [ 'biointerchange.gvf' ],
22
+ [ 'biointerchange.gff3', 'biointerchange.gvf', 'biointerchange.vcf' ],
29
23
  true,
30
24
  'Genomic Feature and Variation Ontology (GFVO) based RDFization'
31
25
  )
@@ -34,8 +28,7 @@ class RDFWriter < BioInterchange::Writer
34
28
  #
35
29
  # +ostream+:: instance of an IO class or derivative that is used for RDF serialization
36
30
  def initialize(ostream)
37
- raise BioInterchange::Exceptions::ImplementationWriterError, 'The output stream is not an instance of IO or its subclasses.' unless ostream.kind_of?(IO)
38
- @ostream = ostream
31
+ super(ostream)
39
32
  end
40
33
 
41
34
  # Serialize a model as RDF.
@@ -47,10 +40,12 @@ class RDFWriter < BioInterchange::Writer
47
40
  @format = :gff3
48
41
  elsif model.instance_of?(BioInterchange::Genomics::GVFFeatureSet) then
49
42
  @format = :gvf
43
+ elsif model.instance_of?(BioInterchange::Genomics::VCFFeatureSet) then
44
+ @format = :vcf
50
45
  else
51
46
  raise BioInterchange::Exceptions::ImplementationWriterError, 'The provided model cannot be serialized. ' +
52
- 'This writer supports serialization for BioInterchange::Genomics::GFF3FeatureSet and '
53
- 'BioInterchange::Genomics::GVFFeatureSet.'
47
+ 'This writer supports serialization for BioInterchange::Genomics::GFF3FeatureSet, ' +
48
+ 'BioInterchange::Genomics::GVFFeatureSet and BioInterchange::Genomics::VCFFeatureSet.'
54
49
  end
55
50
  @base = BioInterchange::GFVO
56
51
  serialize_model(model, uri_prefix)
@@ -71,15 +66,24 @@ protected
71
66
  # Record written variants in order to avoid writing out RDF.type multiple times.
72
67
  @variants = {}
73
68
 
69
+ # Set up "matchers" that can be used to match a number of attributes of a feature, and then,
70
+ # link out to an entity that says something about that combination of attributes. Used for
71
+ # GVF #data-source, etc., pragmas and VCF filters.
72
+ @matchers = []
73
+
74
74
  # Create a URI prefix that applies to the set, all features in the set, and all the features' annotations.
75
75
  # Then register the prefix with the writer to have a concise Turtle output.
76
76
  set_uri = set_uri[0..-2] if set_uri and set_uri.end_with?('/')
77
77
  set_uri = RDF::URI.new(model.uri) unless set_uri
78
78
  set_base(set_uri + '/')
79
79
 
80
- create_triple(set_uri, RDF.type, @base.Set)
80
+ add_prefix('http://biohackathon.org/resource/faldo#', 'faldo')
81
+ add_prefix('http://www.biointerchange.org/gfvo#', 'gfvo')
82
+ add_prefix('http://semanticscience.org/resource/', 'sio')
83
+
84
+ create_triple(set_uri, RDF.type, @base.File)
81
85
  model.pragmas.each { |pragma_name|
82
- serialize_pragma(set_uri, model.pragma(pragma_name))
86
+ serialize_pragma(set_uri, pragma_name, model.pragma(pragma_name))
83
87
  }
84
88
  model.contents.each { |feature|
85
89
  if feature.instance_of?(BioInterchange::Genomics::GFF3FeatureSequence) then
@@ -89,64 +93,162 @@ protected
89
93
  end
90
94
  }
91
95
  close
92
- #RDF::NTriples::Writer.dump(graph, @ostream)
93
- # TODO Figure out why the following is very slow. Use with 'rdf-raptor'.
94
- # Having said that, Jena's rdfcat is very good for converting formats
95
- # anyway, so perhaps it is not worth investigating the following.
96
- # RDF::RDFXML::Writer.dump(graph, @ostream)
97
96
  end
98
97
 
99
98
  # Serializes pragmas for a given feature set URI.
100
99
  #
101
100
  # +set_uri+:: the feature set URI to which the pragmas belong to
101
+ # +name+:: name of the pragma statement
102
102
  # +pragma+:: an object representing a pragma statement
103
- def serialize_pragma(set_uri, pragma)
103
+ def serialize_pragma(set_uri, name, pragma)
104
104
  if pragma.kind_of?(Hash) then
105
105
  if (pragma.has_key?('attribute-method') or pragma.has_key?('data-source') or pragma.has_key?('score-method') or pragma.has_key?('source-method') or pragma.has_key?('technology-platform')) then
106
106
  serialize_structured_attribute(set_uri, pragma)
107
107
  elsif pragma.has_key?('gff-version') then
108
- create_triple(set_uri, @base.gff_version, pragma['gff-version'], RDF::XSD.float)
108
+ create_triple(set_uri.to_s, @base.has_identifier, RDF::URI.new("#{set_uri}/version"))
109
+ create_triple("#{set_uri}/version", RDF.type, @base.Version)
110
+ create_triple("#{set_uri}/version", @base.has_value, "gff-version #{pragma['gff-version']}")
109
111
  elsif pragma.has_key?('gvf-version') then
110
- create_triple(set_uri, @base.gvf_version, pragma['gvf-version'], RDF::XSD.float)
112
+ create_triple("#{set_uri}/version", RDF.type, @base.Version)
113
+ create_triple("#{set_uri}/version", @base.has_value, "gvf-version #{pragma['gvf-version']}")
114
+ elsif pragma.has_key?('fileformat') then
115
+ create_triple("#{set_uri}/version", RDF.type, @base.Version)
116
+ create_triple("#{set_uri}/version", @base.has_value, "fileformat #{pragma['fileformat']}")
111
117
  elsif pragma.has_key?('sequence-region') then
112
118
  pragma['sequence-region'].keys.each { |seqid|
113
119
  serialize_landmark(set_uri, pragma['sequence-region'][seqid])
114
120
  }
115
121
  elsif pragma.has_key?('species') then
116
- create_triple(set_uri, @base.species, RDF::URI.new(pragma['species']))
122
+ create_triple(set_uri, @base.is_about, RDF::URI.new(pragma['species']))
123
+ # VCF section:
124
+ # ...TODO
125
+ # Everything else:
126
+ else
127
+ end
128
+ elsif pragma.kind_of?(Array) then
129
+ # VCF section:
130
+ basic_vcf_mappings = {
131
+ 'ID' => @base.Identifier,
132
+ 'Description' => @base.Comment,
133
+ 'Number' => @base.InformationContentEntity, # Note: not just an integer; can be also 'A', 'G', and '.'
134
+ 'Type' => @base.InformationContentEntity # Can be 'Integer', 'Float', 'Character', 'String'
135
+ }
136
+ if name == 'FILTER' then
137
+ pragma.each { |assignment|
138
+ pragma_uri = serialize_vcf_pragma(set_uri, "filter/#{assignment['ID']}", @base.VariantCalling, basic_vcf_mappings, assignment)
139
+ create_triple(set_uri, @base.is_participant_in, pragma_uri)
140
+ }
141
+ elsif name == 'FORMAT' then
142
+ pragma.each { |assignment|
143
+ pragma_uri = serialize_vcf_pragma(set_uri, "format/#{assignment['ID']}", @base.InformationContentEntity, basic_vcf_mappings, assignment)
144
+ create_triple(set_uri, @base.references, pragma_uri)
145
+ }
146
+ elsif name == 'INFO' then
147
+ pragma.each { |assignment|
148
+ pragma_uri = serialize_vcf_pragma(set_uri, "info/#{assignment['ID']}", @base.InformationContentEntity, basic_vcf_mappings, assignment)
149
+ create_triple(set_uri, @base.references, pragma_uri)
150
+ }
151
+ else
152
+ # TODO
117
153
  end
118
154
  else
119
155
  # TODO
120
156
  end
121
157
  end
122
158
 
159
+ # Goes through "matchers" and links the feature if its attributes are present
160
+ # and equal to a "matcher's" data.
161
+ #
162
+ # (TODO: Update description of this method, because it is absolutely unclear
163
+ # what it actually does right now. Sorry.)
164
+ #
165
+ # +feature+:: the feature that provides attributes for matching
166
+ # +feature_uri+:: URI of the feature that is linked out to, if the feature's attributes match
167
+ def match_feature(feature, feature_uri)
168
+ @matchers.each { |match_constraints|
169
+ constraints, linkout = match_constraints
170
+
171
+ # No constraints means that *everything* matches.
172
+ matches = true
173
+ constraints.each_pair { |key, value|
174
+ if key == 'Seqid' then
175
+ matches = false unless value.include?(feature.sequence_id)
176
+ elsif key == 'Source' then
177
+ matches = false unless value.include?(feature.source)
178
+ elsif key == 'Type' then
179
+ matches = false unless value.include?(feature.type)
180
+ else
181
+ if feature.attributes.has_key?(key) then
182
+ attributes_have_a_match = false
183
+ feature.attributes[key].each { |attribute_value|
184
+ attributes_have_a_match = true if value.include?(attribute_value)
185
+ }
186
+ matches = false unless attributes_have_a_match
187
+ else
188
+ matches = false
189
+ end
190
+ end
191
+ }
192
+
193
+ # If there is a match, then add linkout.
194
+ create_triple(feature_uri, @base.has_source, RDF::URI.new(linkout))
195
+ }
196
+ end
197
+
123
198
  # Serializes a +GFF3Feature+ object for a given feature set URI.
124
199
  #
125
200
  # +set_uri+:: the feature set URI to which the feature belongs to
126
201
  # +feature+:: a +GFF3Feature+ instance
127
202
  def serialize_feature(set_uri, feature)
128
203
  # TODO Make sure there is only one value in the 'ID' list.
129
- feature_uri = RDF::URI.new("#{set_uri.to_s}/feature/#{feature.sequence_id},#{feature.source},#{feature.type.to_s.sub(/^[^:]+:\/\//, '')},#{feature.start_coordinate},#{feature.end_coordinate},#{feature.strand},#{feature.phase}") unless feature.attributes.has_key?('ID')
130
- feature_uri = RDF::URI.new("#{set_uri.to_s}/feature/#{feature.attributes['ID'][0]}") if feature.attributes.has_key?('ID')
131
- create_triple(set_uri, @base.contains, feature_uri)
204
+ # TODO Ponder about whether it would be possible to get the same URI for two distinct features (bad thing).
205
+ source = ''
206
+ source = "#{feature.source}," if feature.source
207
+ type = ''
208
+ type = "#{feature.type.to_s.sub(/^[^:]+:\/\//, '')}," if feature.type
209
+ phase = ",#{feature.phase}" if feature.phase
210
+ if feature.attributes.has_key?('ID') or feature.attributes.has_key?(' id') then
211
+ feature_id = 'ID'
212
+ feature_id = ' id' if feature.attributes.has_key?(' id')
213
+ feature_uri = RDF::URI.new("#{set_uri.to_s}/feature/#{feature.attributes[feature_id][0]}")
214
+ else
215
+ feature_uri = RDF::URI.new("#{set_uri.to_s}/feature/#{feature.sequence_id},#{source}#{type}#{feature.start_coordinate},#{feature.end_coordinate},#{feature.strand}#{phase}")
216
+ end
217
+
218
+ create_triple(set_uri, @base.has_member, feature_uri)
132
219
  create_triple(feature_uri, RDF.type, @base.Feature)
220
+ create_triple(feature_uri, RDF.type, feature.type) if feature.type
221
+ match_feature(feature, feature_uri)
133
222
  serialize_landmark(set_uri, GFF3Landmark.new(feature.sequence_id)) unless @landmarks.has_key?(feature.sequence_id)
134
- create_triple(feature_uri, @base.seqid, @landmarks[feature.sequence_id])
135
- create_triple(feature_uri, @base.source, feature.source)
136
- create_triple(feature_uri, @base.type, feature.type)
137
- create_triple(feature_uri, @base.phase, feature.phase) if feature.phase
223
+ create_triple(feature_uri, @base.is_located_on, RDF::URI.new(@landmarks[feature.sequence_id]))
224
+ create_triple(feature_uri, @base.is_created_by, RDF::URI.new("#{feature_uri}/source"))
225
+ create_triple("#{feature_uri}/source", RDF.type, @base.ExperimentalMethod)
226
+ create_triple("#{feature_uri}/source", @base.has_value, feature.source) if feature.source
227
+ if feature.phase then
228
+ create_triple(feature_uri, @base.has_quality, RDF::URI.new("#{feature_uri}/phase"))
229
+ create_triple("#{feature_uri}/phase", RDF.type, @base.CodingFrameOffset)
230
+ create_triple("#{feature_uri}/phase", @base.has_value, feature.phase)
231
+ end
138
232
 
139
- serialize_coordinate(set_uri, feature_uri, feature)
233
+ create_triple(feature_uri, @base.has_part, RDF::URI.new("#{feature_uri}/locus"))
234
+ create_triple("#{feature_uri}/locus", RDF.type, @base.Locus)
235
+ create_triple("#{feature_uri}/locus", @base.has_attribute, RDF::URI.new("#{feature_uri}/locus/region"))
236
+ serialize_coordinate(set_uri, "#{feature_uri}/locus", feature)
140
237
  serialize_attributes(set_uri, feature_uri, feature.attributes) unless feature.attributes.keys.empty?
141
238
  end
142
239
 
240
+ # Serialize a feature's coordinates using FALDO.
241
+ #
242
+ # +set_uri+:: URI of the feature set that the feature belongs to
243
+ # +feature_uri+:: URI prefix of the feature
244
+ # +feature+:: object representation of the feature, which contains the locus that is described by this method
143
245
  def serialize_coordinate(set_uri, feature_uri, feature)
144
246
  region_uri = RDF::URI.new("#{feature_uri.to_s}/region")
145
247
  start_position_uri = RDF::URI.new("#{feature_uri.to_s}/region/start")
146
248
  end_position_uri = RDF::URI.new("#{feature_uri.to_s}/region/end")
147
249
  #feature_object_properties = @base.feature_properties.select { |uri| @base.is_object_property?(uri) }[0]
148
- ##graph.insert(RDF::Statement.new(feature_uri, @base.with_parent([ @base.region ].flatten, feature_object_properties), region_uri))
149
- create_triple(feature_uri, @base.locus, region_uri)
250
+ ##graph.insert(BioInterchange::Statement.new(feature_uri, @base.with_parent([ @base.region ].flatten, feature_object_properties), region_uri))
251
+ create_triple(feature_uri, @base.is_located_on, region_uri)
150
252
  create_triple(region_uri, RDF.type, BioInterchange::FALDO.Region)
151
253
  # BIN STUFF
152
254
  if false then
@@ -173,7 +275,23 @@ protected
173
275
  end
174
276
  create_triple(start_position_uri, BioInterchange::FALDO.position, feature.start_coordinate)
175
277
  create_triple(end_position_uri, BioInterchange::FALDO.position, feature.end_coordinate)
176
- create_triple(feature_uri, @base.score, feature.score) if feature.score
278
+ if feature.score then
279
+ create_triple(feature_uri, @base.has_attribute, RDF::URI.new("#{feature_uri}/score"))
280
+ if @format == :gvf or @format == :vcf then
281
+ create_triple("#{feature_uri}/score", RDF.type, @base.PhredScore)
282
+ else
283
+ create_triple("#{feature_uri}/score", RDF.type, @base.Score)
284
+ end
285
+ create_triple("#{feature_uri}/score", @base.has_value, feature.score)
286
+ end
287
+ end
288
+
# Constructs a landmark URI based on the set URI and the landmark's ID.
#
# +set_uri+:: URI of the set that the landmark belongs to
# +id+:: ID of the landmark
#
# Returns the landmark URI as a String of the form "<set_uri>/landmark/<id>".
def landmark_uri(set_uri, id)
  # String interpolation already converts its operands via to_s, so no
  # explicit coercion of set_uri is required.
  "#{set_uri}/landmark/#{id}"
end
178
296
 
179
297
  # Serializes a genomic feature landmark ("seqid").
@@ -182,12 +300,13 @@ protected
182
300
  # +landmark+:: encapsuled landmark data
183
301
  def serialize_landmark(set_uri, landmark)
184
302
  return if @landmarks.has_key?(landmark.seqid)
185
- landmark_uri = RDF::URI.new("#{set_uri.to_s}/landmark/#{landmark.seqid}")
303
+ landmark_uri = landmark_uri(set_uri, landmark.seqid)
186
304
  region_uri = RDF::URI.new("#{landmark_uri.to_s}/region")
187
305
  @landmarks[landmark.seqid] = landmark_uri
188
306
  create_triple(landmark_uri, RDF.type, @base.Landmark)
189
- create_triple(landmark_uri, @base.id, landmark.seqid)
190
- create_triple(landmark_uri, @base.locus, region_uri)
307
+ create_triple(landmark_uri, @base.has_identifier, RDF::URI.new("#{landmark_uri}/id"))
308
+ create_triple("#{landmark_uri}/id", @base.has_value, landmark.seqid)
309
+ create_triple(landmark_uri, @base.has_attribute, region_uri)
191
310
  if landmark.start_coordinate then
192
311
  start_position_uri = RDF::URI.new("#{landmark_uri.to_s}/region/start")
193
312
  create_triple(region_uri, BioInterchange::FALDO.begin, start_position_uri)
@@ -209,91 +328,92 @@ protected
209
328
  attributes.each_pair { |tag, list|
210
329
  # Check for defined tags (in alphabetical order), if not matched, serialize as generic Attribute:
211
330
  if tag == 'Alias' then
212
- list.each { |value|
213
- create_triple(feature_uri, @base.alias, value)
331
+ list.each_index { |index|
332
+ create_triple(feature_uri, @base.has_attribute, RDF::URI.new("#{feature_uri}/alias/#{index}"))
333
+ create_triple("#{feature_uri}/alias/#{index}", RDF.type, @base.Alias)
334
+ create_triple("#{feature_uri}/alias/#{index}", @base.has_value, list[index])
214
335
  }
215
336
  elsif tag == 'Dbxref' then
216
337
  list.each { |value|
217
338
  begin
218
- linkout = nil
219
- # First: determine the Dbxref linkout URI as string
220
- if value.match(/^dbSNP(_\d+)?:rs\d+$/) then
221
- linkout = "http://www.ncbi.nlm.nih.gov/projects/SNP/snp_ref.cgi?rs=#{value.split(/:/)[1].sub(/^rs/, '')}"
222
- elsif value.match(/^COSMIC(_\d+)?:COSM\d+$/) then
223
- linkout = "http://cancer.sanger.ac.uk/cosmic/mutation/overview?id=#{value.split(/:/)[1].sub(/^COSM/, '')}"
224
- else
225
- abbreviation, id = value.split(':', 2)
226
- linkout = BioInterchange::GOXRef.send(BioInterchange.make_safe_label(abbreviation)).to_s + id
227
- end
228
- # Second, and finally: add a triple to the graph in the right representative format depending on the ontology used
229
- create_triple(feature_uri, @base.dbxref, linkout)
339
+ # Try to link the external database reference to a well-established URI:
340
+ serialize_dbxref(feature_uri, value)
230
341
  rescue NoMethodError
231
- # Preserve the Dbxref as a Literal:
342
+ # Not clear where to link to? Preserve the Dbxref as a Literal:
232
343
  @dbxref = 0 if @dbxref == nil
233
344
  literal_uri = RDF::URI.new("#{feature_uri.to_s}/attribute/dbxref/#{@dbxref}")
234
345
  @dbxref += 1
235
- create_triple(feature_uri, @base.dbxref, literal_uri)
236
- create_triple(literal_uri, RDF.type, RDF::RDFS.Literal)
237
- create_triple(literal_uri, RDF.value, value)
346
+ create_triple(feature_uri, @base.references, literal_uri)
347
+ create_triple(literal_uri, RDF.type, @base.ExternalReference)
348
+ create_triple(literal_uri, @base.has_value, value)
238
349
  end
239
350
  }
240
351
  elsif tag == 'Derives_from' then
241
352
  list.each { |value|
242
- create_triple(feature_uri, @base.derivesFrom, RDF::URI.new("#{set_uri.to_s}/feature/#{value}"))
353
+ create_triple(feature_uri, @base.is_temporarily_part_of, RDF::URI.new("#{set_uri.to_s}/feature/#{value}"))
243
354
  }
244
355
  elsif tag == 'Gap' then
245
356
  # Handled by 'Target', because 'Gap' requires 'Target' to be present.
246
357
  elsif tag == 'ID' then
247
358
  list.each { |value|
248
- create_triple(feature_uri, @base.id, value)
359
+ create_triple(feature_uri, @base.has_identifier, RDF::URI.new("#{feature_uri}/id"))
360
+ create_triple("#{feature_uri}/id", RDF.type, @base.Identifier)
361
+ create_triple("#{feature_uri}/id", @base.has_value, value)
249
362
  }
250
363
  elsif tag == 'Is_circular' then
251
364
  value = list.join(',')
252
365
  if value == 'true' then
253
- create_triple(feature_uri, @base.isCircular, true) if value == 'true'
366
+ create_triple(feature_uri, @base.has_quality, @base.CircularHelix) if value == 'true'
254
367
  elsif value == 'false' then
255
- create_triple(feature_uri, @base.isCircular, false) if value == 'false'
368
+ create_triple(feature_uri, @base.is_circular, @base.WatsonCrickHelix) if value == 'false'
256
369
  else
257
- create_triple(feature_uri, RDF::RDFS.comment, "Is_circular non-truth value: #{value}")
370
+ create_triple(feature_uri, BioInterchange::RDFS.comment, "Is_circular non-truth value: #{value}")
258
371
  end
259
372
  elsif tag == 'Name' then
260
373
  list.each { |value|
261
- create_triple(feature_uri, @base.name, value)
374
+ create_triple(feature_uri, @base.has_attribute, RDF::URI.new("#{feature_uri}/name"))
375
+ create_triple("#{feature_uri}/name", RDF.type, @base.Name)
376
+ create_triple("#{feature_uri}/name", @base.has_value, value)
262
377
  }
263
378
  elsif tag == 'Note' then
264
- list.each { |value|
265
- create_triple(feature_uri, RDF::RDFS.comment, value)
379
+ list.each_index { |index|
380
+ create_triple(feature_uri, @base.has_annotation, RDF::URI.new("#{feature_uri}/note/#{index}"))
381
+ create_triple("#{feature_uri}/note/#{index}", RDF.type, @base.Note)
382
+ create_triple("#{feature_uri}/note/#{index}", @base.has_value, list[index])
266
383
  }
267
384
  elsif tag == 'Ontology_term' then
268
385
  list.each { |value|
269
386
  # TODO Sanitize values that are either not in GO xrf_abbs or need conversion to match
270
387
  # match their associated Ruby method.
271
388
  namespace, accession = value.split(/:/, 2)
272
- create_triple(feature_uri, @base.ontology_term, "#{BioInterchange::GOXRef.send(namespace).to_s}#{accession}")
389
+ create_triple(feature_uri, @base.references, "#{BioInterchange::GOXRef.send(namespace).to_s}#{accession}")
273
390
  }
274
391
  elsif tag == 'Parent' then
275
392
  list.each { |parent_id|
276
- create_triple(feature_uri, @base.parent, RDF::URI.new("#{set_uri.to_s}/feature/#{parent_id}"))
393
+ create_triple(feature_uri, @base.has_source, RDF::URI.new("#{set_uri.to_s}/feature/#{parent_id}"))
277
394
  }
278
395
  elsif tag == 'Reference_seq' then
279
396
  list.each { |value|
280
397
  reference_uri = RDF::URI.new("#{feature_uri.to_s}/reference/#{value}")
281
- create_triple(feature_uri, @base.sequence_annotation, reference_uri)
282
- create_triple(reference_uri, RDF.type, @base.Reference)
283
- create_triple(reference_uri, @base.sequence, value)
398
+ create_triple(feature_uri, @base.has_attribute, reference_uri)
399
+ create_triple(reference_uri, RDF.type, @base.ReferenceSequence)
400
+ create_triple(reference_uri, @base.has_value, value)
284
401
  }
285
402
  elsif tag == 'Target' then
403
+ # GFF3 spec is unclear on this point, but I assume that a target ID
404
+ # is referencing a feature ID within the same file.
286
405
  target_id, start_coordinate, end_coordinate, strand = list.join(',').split(/\s+/, 4)
287
406
  target_uri = RDF::URI.new("#{feature_uri.to_s}/target/#{target_id}")
288
- create_triple(feature_uri, @base.target, target_uri)
289
- create_triple(target_uri, RDF.type, @base.Target)
290
- create_triple(target_uri, @base.id, target_id)
407
+ create_triple(target_uri, RDF.type, @base.SequenceAlignment)
408
+ create_triple(target_uri, @base.has_source, feature_uri)
409
+ create_triple(target_uri, @base.has_input, target_id)
291
410
  region_uri = RDF::URI.new("#{target_uri.to_s}/region")
292
411
  start_position_uri = RDF::URI.new("#{region_uri.to_s}/start")
293
412
  end_position_uri = RDF::URI.new("#{region_uri.to_s}/end")
294
- create_triple(target_uri, @base.locus, region_uri)
295
- create_triple(region_uri, @base.locus, start_position_uri)
296
- create_triple(region_uri, @base.locus, end_position_uri)
413
+ create_triple(target_uri, @base.has_attribute, region_uri)
414
+ create_triple(region_uri, RDF.type, BioInterchange::FALDO.Region)
415
+ create_triple(region_uri, BioInterchange::FALDO.begin, start_position_uri)
416
+ create_triple(region_uri, BioInterchange::FALDO.end, end_position_uri)
297
417
  if strand == '+' then
298
418
  create_triple(start_position_uri, RDF.type, BioInterchange::FALDO.Positive_strand)
299
419
  create_triple(end_position_uri, RDF.type, BioInterchange::FALDO.Positive_strand)
@@ -312,13 +432,20 @@ protected
312
432
  create_triple(end_position_uri, BioInterchange::FALDO.position, end_coordinate)
313
433
  end
314
434
 
315
- # Describe a possible alignment between the feature and target:
435
+ # Describe a possible alignment with gaps between the feature and target:
316
436
  if attributes.has_key?('Gap') then
317
437
  attributes['Gap'].each_index { |gap_no|
318
438
  cigar_line = attributes['Gap'][gap_no].split(/\s+/)
319
439
  cigar_line.each_index { |alignment_no|
320
440
  alignment_uri = RDF::URI.new("#{feature_uri.to_s}/alignment/#{gap_no}/#{alignment_no}")
321
- create_triple(feature_uri, @base.alignment, alignment_uri) if alignment_no == 0
441
+ if alignment_no == 0 then
442
+ create_triple(target_uri, @base.has_first_part, alignment_uri)
443
+ else
444
+ create_triple(target_uri, @base.has_ordered_part, alignment_uri)
445
+ end
446
+ if alignment_no == cigar_line.length then
447
+ create_triple(target_uri, @base.has_last_part, alignment_uri)
448
+ end
322
449
  operation = cigar_line[alignment_no].gsub(/[^MIDFR]/, '')
323
450
  operation = nil unless operation.length == 1
324
451
  span = cigar_line[alignment_no].gsub(/[^0-9]/, '')
@@ -326,24 +453,25 @@ protected
326
453
  if operation == 'M' then
327
454
  create_triple(alignment_uri, RDF.type, @base.Match)
328
455
  elsif operation == 'I' then
329
- create_triple(alignment_uri, RDF.type, @base.Reference_Sequence_Gap)
456
+ create_triple(alignment_uri, RDF.type, @base.ReferenceSequenceGap)
330
457
  elsif operation == 'D' then
331
- create_triple(alignment_uri, RDF.type, @base.Target_Sequence_Gap)
458
+ create_triple(alignment_uri, RDF.type, @base.TargetSequenceGap)
332
459
  elsif operation == 'F' then
333
- create_triple(alignment_uri, RDF.type, @base.Forward_Reference_Sequence_Frameshift)
460
+ create_triple(alignment_uri, RDF.type, @base.ForwardReferenceSequenceFrameshift)
334
461
  elsif operation == 'R' then
335
- create_triple(alignment_uri, RDF.type, @base.Reverse_Reference_Sequence_Frameshift)
462
+ create_triple(alignment_uri, RDF.type, @base.ReverseReferenceSequenceFrameshift)
336
463
  else
337
464
  # Fallback: operation is outside of the specification
338
- create_triple(alignment_uri, RDF.type, @base.Alignment_Operation)
339
- create_triple(alignment_uri, RDF::RDFS.comment, "Alignment operation: #{operation}") if operation and not operation.empty?
465
+ create_triple(alignment_uri, RDF.type, @base.SequenceAlignmentOperation)
466
+ create_triple(alignment_uri, BioInterchange::RDFS.comment, "Alignment operation: #{operation}") if operation and not operation.empty?
340
467
  end
341
- create_triple(alignment_uri, @base.span, span.to_i) if span
342
- create_triple(alignment_uri, RDF.first, alignment_uri)
343
468
  if alignment_no + 1 < cigar_line.length then
344
- create_triple(alignment_uri, RDF.rest, RDF::URI.new("#{feature_uri.to_s}/alignment/#{gap_no}/#{alignment_no + 1}"))
345
- else
346
- create_triple(alignment_uri, RDF.rest, RDF.nil)
469
+ create_triple(alignment_uri, @base.is_before, RDF::URI.new("#{feature_uri.to_s}/alignment/#{gap_no}/#{alignment_no + 1}"))
470
+ end
471
+ if span then
472
+ create_triple(alignment_uri, @base.has_attribute, RDF::URI.new("#{alignment_uri}/span"))
473
+ create_triple("#{alignment_uri}/span", RDF.type, @base.Span)
474
+ create_triple("#{alignment_uri}/span", @base.has_value, span.to_i)
347
475
  end
348
476
  }
349
477
  }
@@ -352,6 +480,29 @@ protected
352
480
  serialize_variant_effects(set_uri, feature_uri, list)
353
481
  elsif tag == 'Variant_seq' then
354
482
  serialize_variant_seqs(set_uri, feature_uri, list)
483
+ # VCF related attributes:
484
+ elsif tag == ' alternative_alleles' then
485
+ # TODO
486
+ elsif tag == ' filters' then
487
+ # Example: "Qual;MinAB;MinDP" -- comes here as split list (split by ";")
488
+ list.each { |id|
489
+ create_triple(feature_uri, @base.is_refuted_by, RDF::URI.new("#{set_uri}/filter/#{id}"))
490
+ }
491
+ elsif tag == ' samples' then
492
+ list.each_index { |sample|
493
+ list[sample].each_pair { |key, values|
494
+ serialize_vcf_sample(feature_uri, sample, key, values, attributes)
495
+ }
496
+ }
497
+ # Everything else:
498
+ elsif list == true then
499
+ # Attribute is a flag. Tag itself carries meaning and has no value associated with it.
500
+ attribute_uri = RDF::URI.new("#{feature_uri.to_s}/attribute/#{tag}")
501
+ create_triple(feature_uri, @base.has_attribute, attribute_uri)
502
+ create_triple(attribute_uri, RDF.type, @base.InformationContentEntity)
503
+ create_triple(attribute_uri, @base.has_attribute, RDF::URI.new("#{attribute_uri}/tag/#{tag}"))
504
+ create_triple(RDF::URI.new("#{attribute_uri}/tag/#{tag}"), RDF.type, @base.Label)
505
+ create_triple(RDF::URI.new("#{attribute_uri}/tag/#{tag}"), @base.has_value, tag)
355
506
  else
356
507
  # TODO Report unknown upper case letters here? That would be a spec. validation...
357
508
  # Well, or it would show that this implementation is incomplete. Could be either.
@@ -359,71 +510,299 @@ protected
359
510
  value = list[index]
360
511
  attribute_uri = RDF::URI.new("#{feature_uri.to_s}/attribute/#{tag}") if list.size == 1
361
512
  attribute_uri = RDF::URI.new("#{feature_uri.to_s}/attribute/#{tag}-#{index + 1}") unless list.size == 1
362
- create_triple(feature_uri, @base.attribute, attribute_uri)
363
- create_triple(attribute_uri, RDF.type, @base.Attribute)
364
- create_triple(attribute_uri, @base.tag, "#{tag}")
365
- create_triple(attribute_uri, RDF.value, value)
513
+ create_triple(feature_uri, @base.has_attribute, attribute_uri)
514
+ create_triple(attribute_uri, RDF.type, @base.InformationContentEntity)
515
+ # TODO Figure out why the following line was there. Seems wrong.
516
+ #create_triple(attribute_uri, @base.has_attribute, RDF::URI.new("#{attribute_uri}/tag/#{tag}"))
517
+ create_triple(attribute_uri, @base.has_value, value)
518
+ create_triple(RDF::URI.new("#{attribute_uri}/tag/#{tag}"), RDF.type, @base.Label)
519
+ create_triple(RDF::URI.new("#{attribute_uri}/tag/#{tag}"), @base.has_value, tag)
366
520
  }
367
521
  end
368
522
  }
369
523
  end
370
524
 
# Serializes VCF sample data (VCF columns 9 and above).
#
# See also genotype serialization of non-VCF data in `serialize_variant_seqs`.
#
# +feature_uri+:: URI of the feature that the sample data relates to
# +sample+:: number of the sample that is being addressed (sample column number)
# +key+:: key of the described sample values
# +values+:: values of the sample (possibly a composite type, e.g. a comma separated list)
# +attributes+:: a map of tag/value pairs associated with the feature
def serialize_vcf_sample(feature_uri, sample, key, values, attributes)
  case key
  when 'DP'
    # Depth across samples. An integer.
    depths = values.split(',')
    depths.each_index { |index|
      serialize_vcf_sample_attribute(feature_uri, sample, key, true, depths[index].to_i, index, depths.size > 1, @base.Number_ofReads, RDF::XSD.integer)
    }
  when 'GT'
    # Genotype.
    serialize_vcf_sample_genotype(feature_uri, sample, key, values, attributes)
  when 'FT'
    # Filter: passed does nothing; applied filter uses isRefutedBy.
    # TODO How to code using GFVO?
    serialize_vcf_sample_values(feature_uri, sample, key, values.split(';'))
  when 'GL'
    # Genotype likelihoods.
    serialize_vcf_sample_likelihoods(feature_uri, sample, key, values)
  when 'GLE'
    # Genotype likelihoods of heterogenous ploidy.
    # Example: 0:-75.22,1:-223.42,0/0:-323.03,1/0:-99.29,1/1:-802.53
    entries = values.split(',')
    entries.each_index { |index|
      # NOTE(review): the likelihood that follows the ':' is not serialized;
      # only the resolved alleles of the genotype are passed on as the value.
      genotype = entries[index].split(':').first
      alleles = genotype.split('/').map { |allele| vcf_allele(allele.to_i, attributes) }
      serialize_vcf_sample_attribute(feature_uri, sample, key, true, alleles, index, entries.size > 1, @base.InformationContentEntity)
    }
  when 'PL', 'GP', 'GQ', 'HQ', 'PS', 'PQ', 'EC', 'MQ'
    # PL: Phred scaled genotype likelihoods.
    # GP: Phred scaled genotype posterior probabilities.
    # GQ: Conditional genotype quality.
    # HQ: Haplotype qualities -- presumably Phred scaled.
    # PS: Phase set. It's complicated. See the VCF specification for details.
    # PQ: Phasing quality in Phred scale.
    # EC: Expected alternate allele counts.
    # MQ: RMS mapping quality. An integer.
    serialize_vcf_sample_values(feature_uri, sample, key, values.split(','))
  else
    # Unknown keys. Should that be possible at all?
    serialize_vcf_sample_attribute(feature_uri, sample, key, true, values, 0, false, @base.InformationContentEntity)
  end
end

# Serializes each entry of an already split sample value as a labelled
# InformationContentEntity attribute. Helper of serialize_vcf_sample.
#
# +feature_uri+:: URI of the feature that the sample data relates to
# +sample+:: number of the sample that is being addressed
# +key+:: key of the described sample values
# +entries+:: array of the individual values
def serialize_vcf_sample_values(feature_uri, sample, key, entries)
  entries.each_index { |index|
    serialize_vcf_sample_attribute(feature_uri, sample, key, true, entries[index], index, entries.size > 1, @base.InformationContentEntity)
  }
end

# Serializes a genotype ("GT") as an ordered list of alleles.
# Helper of serialize_vcf_sample.
#
# +feature_uri+:: URI of the feature that the sample data relates to
# +sample+:: number of the sample that is being addressed
# +key+:: key of the described sample values
# +values+:: genotype string whose allele indexes are separated by '/' or '|'
# +attributes+:: a map of tag/value pairs associated with the feature
def serialize_vcf_sample_genotype(feature_uri, sample, key, values, attributes)
  list_uri = RDF::URI.new("#{feature_uri}/attribute/#{key}")
  serialize_attribute_with_label(feature_uri, list_uri, @base.Genotype, key)
  # A genotype string without a '/' separator is treated as phased.
  unless values.include?('/')
    create_triple(list_uri, @base.has_attribute, "#{list_uri}/phase")
    create_triple("#{list_uri}/phase", RDF.type, @base.GameticPhase)
  end
  allele_indexes = values.split(/\/|\|/)
  # Only say something about zygosity if we deal with single bases and a diploid genome!
  # NOTE(review): the length that is inspected is the length of the first
  # allele index string, not the length of the allele's sequence -- confirm intent.
  if allele_indexes.length == 2 && allele_indexes.first.length == 1
    zygosity = allele_indexes.uniq.length == 1 ? @base.Homozygous : @base.Heterozygous
    create_triple(list_uri, @base.has_quality, zygosity)
  end
  value_uris = allele_indexes.each_with_index.map { |allele_index, index|
    sequence = vcf_allele(allele_index.to_i, attributes)
    # Index zero designates the reference allele; everything else is a variant.
    sequence_type = allele_index.to_i == 0 ? @base.ReferenceSequence : @base.SequenceVariant
    serialize_vcf_sample_attribute(feature_uri, sample, key, true, sequence, index, allele_indexes.size > 1, sequence_type)
  }
  serialize_list_array(list_uri, value_uris)
end

# Serializes genotype likelihoods ("GL") as an ordered list of unlabelled
# values. Helper of serialize_vcf_sample.
#
# +feature_uri+:: URI of the feature that the sample data relates to
# +sample+:: number of the sample that is being addressed
# +key+:: key of the described sample values
# +values+:: comma separated likelihoods
def serialize_vcf_sample_likelihoods(feature_uri, sample, key, values)
  # NOTE(review): this list URI is a plain String, whereas the genotype list
  # URI is wrapped in RDF::URI -- confirm that create_triple treats both alike.
  list_uri = "#{feature_uri}/attribute/#{key}"
  serialize_attribute_with_label(feature_uri, list_uri, @base.Score, key)
  likelihoods = values.split(',')
  value_uris = likelihoods.each_with_index.map { |likelihood, index|
    serialize_vcf_sample_attribute(feature_uri, sample, key, false, likelihood, index, likelihoods.size > 1, @base.InformationContentEntity)
  }
  serialize_list_array(list_uri, value_uris)
end
646
+
# Returns the allele based on VCF's genotype indexing specification.
# The reference allele has index zero, alternative alleles are designated by
# indexes of one or above.
#
# +genotype_index+:: VCF genotype index
# +attributes+:: feature attribute hash that contains reference/alternative allele bases
def vcf_allele(genotype_index, attributes)
  # Both hash keys carry a leading space, exactly as they are looked up here.
  # The value of the selected branch is the return value.
  if genotype_index == 0 then
    attributes[' reference_bases'][0]
  else
    attributes[' alternative_alleles'][genotype_index - 1]
  end
end
659
+
# Serializes an ordered list; the list's URIs are given as an array.
#
# +list_uri+:: URI of the list object (ordered list items will be part of this instance via "has_first_part", "has_ordered_part", "has_last_part")
# +uris+:: URIs of the list to be serialized
def serialize_list_array(list_uri, uris)
  uris.each_with_index { |uri, index|
    # Indexing past the end of the array yields nil, so the last item is
    # handed over without a successor.
    serialize_list(uris, index, uri, uris[index + 1], list_uri)
  }
end
671
+
# Create an ordered list of things; this method serializes one item only and
# repeated calls to this method create the list.
#
# The first item is linked via "has_first_part", the last item via
# "has_last_part" and every item in between via "has_ordered_part". The only
# item of a single-element list is both the first and the last part. Every
# item except the last one is linked to its successor via "is_before".
#
# +values+:: array of the things that appear in the ordered list
# +index+:: index of the thing that is serialized by this method call
# +value_uri+:: URI that represents the current value that is being linked to
# +next_value_uri+:: URI of the next serialized value (ignored, if last index)
# +list_uri+:: URI of the list that contains the items
def serialize_list(values, index, value_uri, next_value_uri, list_uri)
  last_index = values.length - 1
  if index == 0 then
    create_triple(list_uri, @base.has_first_part, value_uri)
    # A list of length one would otherwise never receive a last part.
    create_triple(list_uri, @base.has_last_part, value_uri) if last_index == 0
  elsif index == last_index then
    create_triple(list_uri, @base.has_last_part, value_uri)
  else
    create_triple(list_uri, @base.has_ordered_part, value_uri)
  end
  create_triple(value_uri, @base.is_before, next_value_uri) if index < last_index
end
692
+
# Serializes the basic information of an object (a feature's attribute)
# together with a label.
#
# The object is attached to the feature, typed, and given a label node
# (at "<object_uri>/label") that carries the label text as its value.
#
# +feature_uri+:: URI of the feature that has the object as an attribute
# +object_uri+:: URI that represents the object
# +object_type+:: type of the object
# +label+:: label text to use
def serialize_attribute_with_label(feature_uri, object_uri, object_type, label)
  label_uri = RDF::URI.new("#{object_uri}/label")
  statements = [
    [ feature_uri, @base.has_attribute, object_uri ],
    [ object_uri, RDF.type, object_type ],
    [ object_uri, @base.has_attribute, label_uri ],
    [ label_uri, RDF.type, @base.Label ],
    [ label_uri, @base.has_value, label ]
  ]
  # Emit the statements in order; the result of the final create_triple call
  # is the method's value.
  statements.map { |subject, predicate, object| create_triple(subject, predicate, object) }.last
end
709
+
# Serializes the key/value pairs of VCF meta-data (the pragma equivalent).
# Used by serialize_pragma.
#
# Returns the URI of the serialized meta-data.
#
# +set_uri+:: URI of the set that the meta-data belongs to
# +uri_suffix+:: suffix that is appended to set_uri, which uniquely defines the meta-data (within the set_uri)
# +meta_type+:: type of the meta-data
# +key_type_mappings+:: mappings of keys to known types, everything else is considered an Object
# +attributes+:: key/value pairs that are the actual meta-data being described
def serialize_vcf_pragma(set_uri, uri_suffix, meta_type, key_type_mappings, attributes)
  pragma_uri = RDF::URI.new("#{set_uri}/#{uri_suffix}")
  create_triple(pragma_uri, RDF.type, meta_type)
  attributes.each_pair do |key, value|
    attribute_uri = RDF::URI.new("#{pragma_uri}/#{key}")
    create_triple(pragma_uri, @base.has_attribute, attribute_uri)
    # Keys without a known type mapping are typed as a generic Object.
    # TODO Check if type is integer/double here, then convert value accordingly.
    attribute_type = key_type_mappings.has_key?(key) ? key_type_mappings[key] : @base.Object
    create_triple(attribute_uri, RDF.type, attribute_type)
    create_triple(attribute_uri, @base.has_value, value)
  end
  pragma_uri
end
735
+
# Serializes a VCF sample attribute/value pair. Used by serialize_vcf_sample.
#
# Returns URI of the serialized attribute/value pair.
#
# +base_uri+:: URI of the "thing" that the sample data relates to
# +sample+:: number of the sample that is being addressed (sample column number)
# +key+:: key of the described sample values
# +has_label+:: if true, then serialize label (value taken from key)
# +value+:: value that is associated with the key/sample
# +index+:: index of the value (in case value is part of an array of size > 1)
# +multivalue+:: true if this value is taken from an array of values of size > 1
# +attribute_type+:: type of the attribute entity that represents the value
# +value_type+:: type of the actual value
def serialize_vcf_sample_attribute(base_uri, sample, key, has_label, value, index, multivalue, attribute_type, value_type = nil)
  # Values that stem from a multi-value array get a one-based "-<n>" suffix
  # so that each of them is addressed by its own URI.
  uri_text = "#{base_uri}/sample/#{sample}/#{key}"
  uri_text = "#{uri_text}-#{index + 1}" if multivalue
  value_uri = RDF::URI.new(uri_text)
  create_triple(base_uri, @base.has_attribute, value_uri)
  create_triple(value_uri, RDF.type, attribute_type)
  create_triple(value_uri, @base.has_value, value, value_type)
  return value_uri unless has_label

  # The label node carries the key as its value.
  label_uri = RDF::URI.new("#{value_uri}/label")
  create_triple(value_uri, @base.has_attribute, label_uri)
  create_triple(label_uri, RDF.type, @base.Label)
  create_triple(label_uri, @base.has_value, key)
  value_uri
end
763
+
371
764
  # Serializes a structured attribute (given as a pragma statement), which later
372
765
  # can be referred to from feature instances.
373
766
  #
374
767
  # +set_uri+:: URI of the feature set that the structured attribute belongs to
375
768
  # +pragma+:: a map that encapsulates the structured attribute data
376
769
  def serialize_structured_attribute(set_uri, pragma)
770
+ # TODO Triple from set_uri to attribute_uri missing; should be isParticipantIn
377
771
  attribute_uri = RDF::URI.new("#{set_uri.to_s}/structured_attribute/#{pragma.object_id}")
378
772
  attributes = nil
379
773
  class_type = nil
380
774
  if pragma.has_key?('attribute-method') then
381
775
  attributes = pragma['attribute-method'][0]
382
- class_type = @base.Method
776
+ class_type = @base.ExperimentalMethod
383
777
  elsif pragma.has_key?('data-source') then
384
778
  attributes = pragma['data-source'][0]
385
- class_type = @base.DataSource
779
+ class_type = @base.GenomicAscertainingMethod
386
780
  elsif pragma.has_key?('score-method') then
387
781
  attributes = pragma['score-method'][0]
388
- class_type = @base.Method
782
+ class_type = @base.ExperimentalMethod
389
783
  elsif pragma.has_key?('source-method') then
390
784
  attributes = pragma['source-method'][0]
391
- class_type = @base.Method
785
+ class_type = @base.ExperimentalMethod
392
786
  elsif pragma.has_key?('technology-platform') then
393
787
  attributes = pragma['technology-platform'][0]
394
- class_type = @base.TechnologyPlatform
788
+ class_type = @base.SequencingTechnologyPlatform
395
789
  else
396
790
  # TODO Error.
397
791
  end
398
- if class_type == @base.DataSource and attributes.has_key?('Data_type') then
792
+ if class_type == @base.GenomicAscertainingMethod and attributes.has_key?('Data_type') then
399
793
  attributes['Data_type'] = attributes['Data_type'][0] # TODO Make sure array is of length 1.
400
794
  if attributes['Data_type'] == 'Array_CGH' then
401
795
  class_type = @base.ArrayComparativeGenomicHybridization
402
796
  elsif attributes['Data_type'] == 'DNA_microarray' then
403
797
  class_type = @base.DNAMicroarray
404
798
  elsif attributes['Data_type'] == 'DNA_sequence' then
405
- class_type = @base.DNASequence
799
+ class_type = @base.DNASequencing
406
800
  elsif attributes['Data_type'] == 'RNA_sequence' then
407
- class_type = @base.RNASequence
801
+ class_type = @base.RNASequencing
408
802
  else
409
803
  # TODO Error.
410
804
  end
411
- elsif class_type == @base.TechnologyPlatform then
412
- if attributes.has_key?('Average_coverage') then
413
- create_triple(attribute_uri, @base.averageCoverage, attributes['Average_coverage'][0].to_i)
414
- end
415
- if attributes.has_key?('Platform_class') then
416
- create_triple(attribute_uri, @base.platformClass, attributes['Platform_class'][0])
417
- end
418
- if attributes.has_key?('Platform_name') then
419
- create_triple(attribute_uri, @base.platformName, attributes['Platform_name'][0])
420
- end
421
- if attributes.has_key?('Read_length') then
422
- create_triple(attribute_uri, @base.readLength, attributes['Read_length'][0].to_i)
423
- end
424
- if attributes.has_key?('Read_pair_span') then
425
- create_triple(attribute_uri, @base.readPairSpan, attributes['Read_pair_span'][0].to_i)
426
- end
805
+ elsif class_type == @base.SequencingTechnologyPlatform then
427
806
  if attributes.has_key?('Read_type') then
428
807
  attributes['Read_type'] = attributes['Read_type'][0] # TODO Make sure array is of length 1.
429
808
  if attributes['Read_type'] == 'fragment' then
@@ -436,17 +815,54 @@ protected
436
815
  end
437
816
  end
438
817
  create_triple(attribute_uri, RDF.type, class_type)
818
+ if class_type == @base.SequencingTechnologyPlatform or
819
+ class_type == @base.FragmentReadPlatform or
820
+ class_type == @base.PairedEndReadPlatform then
821
+ if attributes.has_key?('Average_coverage') then
822
+ coverage_uri = RDF::URI.new("#{attribute_uri}/averagecoverage")
823
+ create_triple(attribute_uri, @base.has_attribute, coverage_uri)
824
+ create_triple(coverage_uri, RDF.type, @base.AverageCoverage)
825
+ create_triple(coverage_uri, @base.has_value, attributes['Average_coverage'][0].to_i)
826
+ end
827
+ if attributes.has_key?('Platform_class') then
828
+ create_triple(attribute_uri, @base.has_attribute, RDF::URI.new("#{attribute_uri}/platformclass"))
829
+ #create_triple(attribute_uri, @base.platformClass, attributes['Platform_class'][0])
830
+ end
831
+ if attributes.has_key?('Platform_name') then
832
+ #create_triple(attribute_uri, @base.platformName, attributes['Platform_name'][0])
833
+ end
834
+ if attributes.has_key?('Read_length') then
835
+ #create_triple(attribute_uri, @base.readLength, attributes['Read_length'][0].to_i)
836
+ end
837
+ if attributes.has_key?('Read_pair_span') then
838
+ #create_triple(attribute_uri, @base.readPairSpan, attributes['Read_pair_span'][0].to_i)
839
+ end
840
+ end
439
841
  attributes.keys.each { |tag|
440
842
  if tag.match(/^[a-z]/) then
843
+ tag.strip!
441
844
  custom_attribute_uri = RDF::URI.new("#{attribute_uri.to_s}/attribute/#{tag}")
442
- create_triple(custom_attribute_uri, RDF.type, @base.StructuredAttribute)
443
- create_triple(custom_attribute_uri, @base.tag, tag)
845
+ create_triple(attribute_uri, @base.has_attribute, custom_attribute_uri)
846
+ create_triple(custom_attribute_uri, RDF.type, @base.InformationContentEntity)
847
+ create_triple(custom_attribute_uri, @base.has_attribute, RDF::URI.new("#{attribute_uri.to_s}/attribute/#{tag}/name"))
444
848
  attributes[tag].each { |value|
445
- create_triple(custom_attribute_uri, RDF.value, value)
849
+ create_triple(custom_attribute_uri, @base.has_value, value)
446
850
  }
447
- create_triple(attribute_uri, @base.attribute, custom_attribute_uri)
851
+ create_triple(RDF::URI.new("#{attribute_uri.to_s}/attribute/#{tag}/name"), RDF.type, @base.Name)
852
+ create_triple(RDF::URI.new("#{attribute_uri.to_s}/attribute/#{tag}/name"), @base.has_value, tag)
448
853
  else
449
854
  # TODO
855
+ match_constraints = {}
856
+ attributes[tag].each { |value|
857
+ if tag == 'Seqid' or tag == 'Type' or tag == 'Source' then
858
+ match_constraints[tag] = value.split(',')
859
+ else
860
+ # Not a recognized match. Might be a Dbxref or Comment.
861
+ end
862
+ }
863
+ unless match_constraints.keys.empty? then
864
+ @matches
865
+ end
450
866
  end
451
867
  }
452
868
  end
@@ -462,35 +878,46 @@ protected
462
878
  sequence_variant, variant_index, feature_type, feature_ids = effect.split(' ', 4)
463
879
  feature_ids = feature_ids.split(' ')
464
880
  effect_uri = RDF::URI.new("#{feature_uri.to_s}/variant/#{variant_index}/effect/#{index}")
465
- serialize_variant_triple(feature_uri, RDF::URI.new("#{feature_uri.to_s}/variant/#{variant_index}"), @base.effect, effect_uri)
466
- create_triple(effect_uri, RDF.type, @base.Effect)
467
- create_triple(effect_uri, @base.sequenceVariant, BioInterchange::SO.send(BioInterchange.make_safe_label(sequence_variant)))
468
- create_triple(effect_uri, @base.featureType, BioInterchange::SO.send(BioInterchange.make_safe_label(feature_type)))
881
+ # TODO
882
+ #serialize_variant_triple(feature_uri, RDF::URI.new("#{feature_uri.to_s}/variant/#{variant_index}"), @base.effect, effect_uri)
883
+ # Type is a SO sequence_variant or descendent:
884
+ create_triple(effect_uri, RDF.type, BioInterchange::SO.send(BioInterchange.make_safe_label(sequence_variant)))
885
+ # The feature type should be already apparent from the targeted feature. Do no sanity
886
+ # check here (if they match) and just skip over it.
887
+ # create_triple(effect_uri, @base.feature_type, BioInterchange::SO.send(BioInterchange.make_safe_label(feature_type)))
469
888
  feature_ids.each { |feature_id|
470
- create_triple(effect_uri, @base.feature, feature_id)
889
+ create_triple(feature_id, @base.is_affected_by, effect_uri)
471
890
  }
472
891
  }
473
892
  end
474
893
 
475
894
  # Serializes a list of variant sequences.
476
895
  #
896
+ # See also VCF genotype serialization ('GT' attribute) in `serialize_vcf_sample`.
897
+ #
477
898
  # +set_uri+:: the feature set URI to which the feature belongs
478
899
  # +feature_uri+:: the feature URI to the feature that is annotated with variant data
479
900
  # +list+:: list of variant values
480
901
  def serialize_variant_seqs(set_uri, feature_uri, list)
902
+ variant_uri = nil
903
+
481
904
  list.each_index { |index|
482
905
  value = list[index]
483
906
  variant_uri = RDF::URI.new("#{feature_uri.to_s}/variant/#{index}")
484
- serialize_variant_triple(feature_uri, variant_uri, @base.sequence, RDF::Literal.new(value))
907
+ sequence_uri = RDF::URI.new("#{variant_uri}/sequence")
908
+ serialize_variant_triple(feature_uri, variant_uri, @base.has_attribute, sequence_uri)
909
+ create_triple(sequence_uri, @base.has_value, value)
485
910
  }
486
911
 
487
- # Return the variant type based on the present sequence(s):
488
- return @base.Variant if list.length != 2
489
912
  if list[0].match(/a-zA-Z/) and list[1].match(/a-zA-Z/) then
490
- return @base.HomozygousVariant if list[0] == list[1]
491
- return @base.HeterozygousVariant
913
+ if list[0] == list[1] then
914
+ create_triple(variant_uri, @base.has_quality, @base.Homozygous)
915
+ else
916
+ create_triple(variant_uri, @base.has_quality, @base.Heterozygous)
917
+ end
492
918
  end
493
- return @base.Variant
919
+
920
+ return @base.SequenceVariant
494
921
  end
495
922
 
496
923
  # Adds a variant to the graph; tracks the variant's URI so that RDF.type is only written out once.
@@ -501,8 +928,8 @@ protected
501
928
  # +object+:: data to be serialized
502
929
  def serialize_variant_triple(feature_uri, variant_uri, predicate, object)
503
930
  unless @variants.has_key?(variant_uri.to_s) then
504
- create_triple(feature_uri, @base.sequence_annotation, variant_uri)
505
- create_triple(variant_uri, RDF.type, @base.Variant)
931
+ create_triple(feature_uri, @base.is_affected_by, variant_uri)
932
+ create_triple(variant_uri, RDF.type, @base.VariantCalling)
506
933
  end
507
934
  @variants[variant_uri.to_s] = true
508
935
  create_triple(variant_uri, predicate, object)
@@ -515,12 +942,39 @@ protected
515
942
  def serialize_feature_sequence(set_uri, feature_sequence)
516
943
  feature_uri = RDF::URI.new("#{set_uri.to_s}/feature/#{feature_sequence.feature_id}")
517
944
  annotation_uri = RDF::URI.new("#{feature_uri.to_s}/sequence")
518
- create_triple(feature_uri, @base.sequence_annotation, annotation_uri)
519
- create_triple(annotation_uri, RDF.type, @base.Sequence_Annotation)
520
- create_triple(annotation_uri, RDF::RDFS.comment, feature_sequence.comment) if feature_sequence.comment
521
- create_triple(annotation_uri, @base.sequence, feature_sequence.sequence)
945
+ create_triple(feature_uri, @base.has_attribute, annotation_uri)
946
+ create_triple(annotation_uri, RDF.type, @base.Sequence)
947
+ create_triple(annotation_uri, BioInterchange::RDFS.comment, feature_sequence.comment) if feature_sequence.comment
948
+ create_triple(annotation_uri, @base.has_value, feature_sequence.sequence)
522
949
  end
523
950
 
951
+ # Serializes an external database reference.
952
+ #
953
+ # +feature_uri+:: URI of the feature that the external database reference refers to
954
+ # +dbxref_composite+:: composite term of the external database reference (e.g. "dbSNP_127:rs123456")
955
+ def serialize_dbxref(feature_uri, dbxref_composite)
956
+ abbreviation, accession = dbxref_composite.split(':', 2)
957
+ dbxref_uri = RDF::URI.new("#{feature_uri.to_s}/dbxref/#{BioInterchange.make_safe_label(abbreviation)}")
958
+ create_triple(feature_uri, @base.references, dbxref_uri)
959
+
960
+ create_triple(dbxref_uri, RDF.type, @base.ExternalReference)
961
+ create_triple(dbxref_uri, @base.refers_to, BioInterchange::LifeScienceRegistry.send(dbxref_composite.split('_', 2)[0].downcase).sub('$id', accession))
962
+ if dbxref_composite.match(/^.+_.+:.+$/) then
963
+ # Entry with version information.
964
+ version_uri = RDF::URI.new("#{dbxref_uri}/version")
965
+ create_triple(dbxref_uri, @base.has_identifier, version_uri)
966
+ create_triple(version_uri, @base.has_value, abbreviation[6..-1])
967
+ end
968
+
969
+ #if dbxref_composite.match(/^dbSNP(_\d+)?:rs\d+$/) then
970
+ # # linkout = "http://www.ncbi.nlm.nih.gov/projects/SNP/snp_ref.cgi?rs=#{dbxref_composite.split(/:/)[1].sub(/^rs/, '')}"
971
+ #elsif dbxref_composite.match(/^COSMIC(_\d+)?:COSM\d+$/) then
972
+ # linkout = "http://cancer.sanger.ac.uk/cosmic/mutation/overview?id=#{accession.sub(/^COSM/, '')}"
973
+ #else
974
+ # BioInterchange::GOXRef.send(BioInterchange.make_safe_label(abbreviation)).to_s + accession
975
+ #end
976
+ end
977
+
524
978
  end
525
979
 
526
980
  end