biointerchange 1.0.1 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +2 -4
- data/Gemfile +2 -3
- data/README.md +36 -22
- data/VERSION +1 -1
- data/examples/Felis_catus.gvf.gz +0 -0
- data/examples/Felis_catus_incl_consequences.vcf.gz +0 -0
- data/generators/rdfxml.rb +1 -1
- data/generators/tsv2rubyclass.rb +31 -0
- data/lib/biointerchange/core.rb +17 -5
- data/lib/biointerchange/genomics/gff3_rdf_ntriples.rb +591 -137
- data/lib/biointerchange/genomics/gff3_reader.rb +16 -3
- data/lib/biointerchange/genomics/gvf_reader.rb +1 -1
- data/lib/biointerchange/genomics/vcf_feature.rb +46 -0
- data/lib/biointerchange/genomics/vcf_feature_set.rb +14 -0
- data/lib/biointerchange/genomics/vcf_reader.rb +238 -0
- data/lib/biointerchange/gfvo.rb +689 -553
- data/lib/biointerchange/life_science_registry.rb +3595 -0
- data/lib/biointerchange/textmining/text_mining_rdf_ntriples.rb +33 -35
- data/lib/biointerchange/writer.rb +11 -16
- data/make.sh +4 -0
- data/spec/exceptions_spec.rb +1 -7
- data/spec/gff3_rdfwriter_spec.rb +2 -16
- data/spec/gvf_rdfwriter_spec.rb +2 -19
- data/spec/phylogenetics_spec.rb +1 -13
- data/spec/text_mining_pdfx_xml_reader_spec.rb +1 -13
- data/spec/text_mining_pubannos_json_reader_spec.rb +1 -14
- data/spec/text_mining_rdfwriter_spec.rb +8 -19
- data/test.sh +4 -0
- data/web/about.html +10 -14
- data/web/api.html +11 -13
- data/web/bootstrap/css/bootstrap-theme.css +347 -0
- data/web/bootstrap/css/bootstrap-theme.css.map +1 -0
- data/web/bootstrap/css/bootstrap-theme.min.css +7 -0
- data/web/bootstrap/css/bootstrap.css +4764 -4603
- data/web/bootstrap/css/bootstrap.css.map +1 -0
- data/web/bootstrap/css/bootstrap.min.css +6 -8
- data/web/bootstrap/fonts/glyphicons-halflings-regular.eot +0 -0
- data/web/bootstrap/fonts/glyphicons-halflings-regular.svg +229 -0
- data/web/bootstrap/fonts/glyphicons-halflings-regular.ttf +0 -0
- data/web/bootstrap/fonts/glyphicons-halflings-regular.woff +0 -0
- data/web/bootstrap/js/bootstrap.js +1372 -1448
- data/web/bootstrap/js/bootstrap.min.js +5 -5
- data/web/cli.html +14 -28
- data/web/index.html +15 -33
- data/web/ontologies.html +1089 -945
- data/web/webservices.html +12 -14
- metadata +24 -27
- data/lib/biointerchange/gff3o.rb +0 -525
- data/lib/biointerchange/gvf1o.rb +0 -1354
- data/web/bootstrap/css/bootstrap-responsive.css +0 -1040
- data/web/bootstrap/css/bootstrap-responsive.min.css +0 -9
- data/web/bootstrap/img/glyphicons-halflings-white.png +0 -0
- data/web/bootstrap/img/glyphicons-halflings.png +0 -0
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 0b66ff1fc16121ae1bd423d5f06bf212b00cdd5c
|
|
4
|
+
data.tar.gz: 93bdd0303b40580eb5f70f17db8aad93228fc11d
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 2b8adeeb8584d3d09729faeb6475dfd9be4adc3f568fdb8a1e4e654c605126384574344e6235f831609a9fa6a61f24ef87dbfff579153dab0f6e406887208584
|
|
7
|
+
data.tar.gz: f709f45089d6cc469884f9e182752fda0be6235762e65857fa1f8b2f0bca72e392618a8b7da142c0ed763c24526195d126a07a709c55a30ce152caf44b5c31b8
|
data/.travis.yml
CHANGED
|
@@ -2,11 +2,9 @@ language: ruby
|
|
|
2
2
|
rvm:
|
|
3
3
|
- 1.9.2
|
|
4
4
|
- 1.9.3
|
|
5
|
+
- 2.0.0
|
|
6
|
+
- 2.1.1
|
|
5
7
|
- jruby-19mode # JRuby in 1.9 mode
|
|
6
|
-
# - rbx-19mode
|
|
7
|
-
- 1.8.7
|
|
8
|
-
- jruby-18mode # JRuby in 1.8 mode
|
|
9
|
-
# - rbx-18mode
|
|
10
8
|
|
|
11
9
|
# uncomment this line if your project needs to run something other than `rake`:
|
|
12
10
|
# script: bundle exec rspec spec
|
data/Gemfile
CHANGED
|
@@ -2,10 +2,10 @@ source "http://rubygems.org"
|
|
|
2
2
|
# Add dependencies required to use your gem here.
|
|
3
3
|
# Example:
|
|
4
4
|
# gem "activesupport", ">= 2.3.5"
|
|
5
|
-
gem "rdf", ">=
|
|
5
|
+
gem "rdf", ">= 1.1.4.3"
|
|
6
6
|
gem "json", ">= 1.6.4"
|
|
7
7
|
gem "getopt", ">= 1.4.1"
|
|
8
|
-
gem "addressable", ">= 2.3.
|
|
8
|
+
gem "addressable", ">= 2.3.6"
|
|
9
9
|
gem "bio", ">= 1.4.2"
|
|
10
10
|
|
|
11
11
|
# Add dependencies to develop your gem here.
|
|
@@ -14,6 +14,5 @@ group :development do
|
|
|
14
14
|
gem "rspec", "~> 2.8.0"
|
|
15
15
|
gem "bundler", ">= 1.1.5"
|
|
16
16
|
gem "jeweler", "~> 1.8.4"
|
|
17
|
-
gem "bio", ">= 1.4.2"
|
|
18
17
|
gem "rdoc", "~> 3.12"
|
|
19
18
|
end
|
data/README.md
CHANGED
|
@@ -21,14 +21,12 @@ Ontologies used in the RDF output:
|
|
|
21
21
|
|
|
22
22
|
* [Comparative Data Analysis Ontology](http://sourceforge.net/apps/mediawiki/cdao/index.php?title=Main_Page) (CDAO)
|
|
23
23
|
* [Friend of a Friend](http://xmlns.com/foaf/spec) (FOAF)
|
|
24
|
-
* [Generic Feature Format Version 3 Ontology](http://www.biointerchange.org/ontologies.html) (GFF3O)
|
|
25
|
-
* [Genome Variation Format Version 1 Ontology](http://www.biointerchange.org/ontologies.html) (GVF1O)
|
|
26
24
|
* [Genomic Feature and Variation Ontology](http://www.biointerchange.org/ontologies.html) (GFVO)
|
|
27
25
|
* [Semanticscience Integrated Ontology](http://code.google.com/p/semanticscience/wiki/SIO) (SIO)
|
|
28
26
|
* [Sequence Ontology](http://www.sequenceontology.org/index.html) (SO)
|
|
29
27
|
* [Sequence Ontology Feature Annotation](http://www.sequenceontology.org/index.html) (SOFA)
|
|
30
28
|
|
|
31
|
-
*Note:*
|
|
29
|
+
*Note:* GFVO replaces the [Generic Feature Format Version 3 Ontology](http://www.biointerchange.org/ontologies.html) (GFF3O) and [Genome Variation Format Version 1 Ontology](http://www.biointerchange.org/ontologies.html) (GVF1O).
|
|
32
30
|
|
|
33
31
|
#### Contributing
|
|
34
32
|
|
|
@@ -54,7 +52,7 @@ BioInterchange's command-line tool `biointerchange` can be installed as a comman
|
|
|
54
52
|
|
|
55
53
|
Examples:
|
|
56
54
|
|
|
57
|
-
biointerchange --input biointerchange.gvf --rdf rdf.biointerchange.
|
|
55
|
+
biointerchange --input biointerchange.gvf --rdf rdf.biointerchange.gfvo --file examples/estd176_Banerjee_et_al_2011.2012-11-29.NCBI36.gvf
|
|
58
56
|
biointerchange --input dbcls.catanns.json --rdf rdf.bh12.sio --file examples/pubannotation.10096561.json --annotate_name 'Peter Smith' --annotate_name_id 'peter.smith@example.com'
|
|
59
57
|
biointerchange --input uk.ac.man.pdfx --rdf rdf.bh12.sio --file examples/gb-2007-8-3-R40.xml --annotate_name 'Peter Smith' --annotate_name_id 'peter.smith@example.com'
|
|
60
58
|
biointerchange --input phylotastic.newick --rdf rdf.phylotastic.newick --file examples/tree2.new --annotate_date '1 June 2006'
|
|
@@ -69,8 +67,7 @@ Input formats:
|
|
|
69
67
|
|
|
70
68
|
Output formats:
|
|
71
69
|
|
|
72
|
-
* `rdf.biointerchange.
|
|
73
|
-
* `rdf.biointerchange.gvf`
|
|
70
|
+
* `rdf.biointerchange.gfvo`
|
|
74
71
|
* `rdf.bh12.sio`
|
|
75
72
|
* `rdf.phylotastic.newick`
|
|
76
73
|
|
|
@@ -93,15 +90,6 @@ To list all `seqid` entries from a GVF-file conversion in the store, the followi
|
|
|
93
90
|
|
|
94
91
|
testrepo> sparql select * where { ?s <http://www.biointerchange.org/gvf1o#GVF1_0004> ?o } .
|
|
95
92
|
|
|
96
|
-
#### Data Consistency Verification
|
|
97
|
-
|
|
98
|
-
Data consistency is verifyable for the output formats `rdf.biointerchange.gff3` and `rdf.biointerchange.gvf` using the [BioInterchange ontologies](http://www.biointerchange.org/ontologies.html) GFF3O and GVF1O. The following is an example of how [Jena](http://jena.apache.org)'s command line tools and the [HermiT reasoner](http://hermit-reasoner.com) can be used for conistency verification:
|
|
99
|
-
|
|
100
|
-
rdfcat <path-to-gff3o/gvf1o> <yourdata.n3> > merged.xml
|
|
101
|
-
java -d64 -Xmx4G -jar HermiT.jar -k -v merged.xml
|
|
102
|
-
|
|
103
|
-
Another approach is to load the data and its related GFF3O/GVF1O ontology into [Protege](http://protege.stanford.edu), merge them, and then use the "Explain inconsistent ontology" menu item to inspect possible data inconsistencies.
|
|
104
|
-
|
|
105
93
|
#### Example Data Provenance
|
|
106
94
|
|
|
107
95
|
The following list provides information on the origin of the example-data files in the `examples` directory:
|
|
@@ -110,8 +98,16 @@ The following list provides information on the origin of the example-data files
|
|
|
110
98
|
* `BovineGenomeChrX.gff3.gz`: Gzipped GFF3 file describing a Bos taurus chromosome X. Downloaded from [http://bovinegenome.org/?q=download_chromosome_gff3](http://bovinegenome.org/?q=download_chromosome_gff3)
|
|
111
99
|
* `chromosome_BF.gff`: GFF3 file of floating contigs from the Baylor Sequencing Centre. Downloaded from [http://dictybase.org/Downloads](http://dictybase.org/Downloads)
|
|
112
100
|
* `estd176_Banerjee_et_al_2011.2012-11-29.NCBI36.gvf`: GVF file of EBI's [DGVa](http://www.ebi.ac.uk/dgva/database-genomic-variants-archive). Downloaded from [ftp://ftp.ebi.ac.uk/pub/databases/dgva/estd176_Banerjee_et_al_2011/gvf/estd176_Banerjee_et_al_2011.2012-11-29.NCBI36.gvf](ftp://ftp.ebi.ac.uk/pub/databases/dgva/estd176_Banerjee_et_al_2011/gvf/estd176_Banerjee_et_al_2011.2012-11-29.NCBI36.gvf)
|
|
101
|
+
* `Felis_catus.gvf.gz`: Gzipped GVF file of F. catus genomic variations provided by the Ensembl project. Downloaded from [ftp://ftp.ensembl.org/pub/release-71/variation/gvf/felis_catus/Felis_catus.gvf.gz](ftp://ftp.ensembl.org/pub/release-71/variation/gvf/felis_catus/Felis_catus.gvf.gz)
|
|
102
|
+
* `Felis_catus_incl_consequences.vcf.gz`: Gzipped VCF file of F. catus genomic variations provided by the Ensembl project. Downloaded from [ftp://ftp.ensembl.org/pub/current_variation/vcf/felis_catus/Felis_catus_incl_consequences.vcf.gz](ftp://ftp.ensembl.org/pub/current_variation/vcf/felis_catus/Felis_catus_incl_consequences.vcf.gz)
|
|
113
103
|
* `gb-2007-8-3-R40.xml`: Generated by [PDFx](http://pdfx.cs.man.ac.uk) from open-access source PDF [Sense-antisense pairs in mammals: functional and evolutionary considerations](http://genomebiology.com/content/pdf/gb-2007-8-3-r40.pdf)
|
|
114
|
-
* `Saccharomyces_cerevisiae_incl_consequences.gvf.gz`: Downloaded from [ftp://ftp.ensembl.org/pub/release-71/variation/gvf/saccharomyces_cerevisiae/Saccharomyces_cerevisiae_incl_consequences.gvf.gz](ftp://ftp.ensembl.org/pub/release-71/variation/gvf/saccharomyces_cerevisiae/Saccharomyces_cerevisiae_incl_consequences.gvf.gz)
|
|
104
|
+
* `Saccharomyces_cerevisiae_incl_consequences.gvf.gz`: Gzipped GVF file of S. cerevisiae genomic variations provided by the Ensembl project. Downloaded from [ftp://ftp.ensembl.org/pub/release-71/variation/gvf/saccharomyces_cerevisiae/Saccharomyces_cerevisiae_incl_consequences.gvf.gz](ftp://ftp.ensembl.org/pub/release-71/variation/gvf/saccharomyces_cerevisiae/Saccharomyces_cerevisiae_incl_consequences.gvf.gz)
|
|
105
|
+
|
|
106
|
+
#### Additional Example Data
|
|
107
|
+
|
|
108
|
+
The following list describes data that has been used in testing BioInterchange's implementation, but was not included in the GitHub repository due to its size:
|
|
109
|
+
|
|
110
|
+
* `mgp.v3.indels.rsIDdbSNPv137.vcf.gz`: Gzipped VCF file of M. musculus indels by the Sanger Institute. Downloaded from [ftp://ftp-mouse.sanger.ac.uk/current_snps/mgp.v3.indels.rsIDdbSNPv137.vcf.gz](ftp://ftp-mouse.sanger.ac.uk/current_snps/mgp.v3.indels.rsIDdbSNPv137.vcf.gz)
|
|
115
111
|
|
|
116
112
|
### Application Programming Interface
|
|
117
113
|
|
|
@@ -322,9 +318,10 @@ The writer takes an object model and serializes it via the `BioInterchange::Writ
|
|
|
322
318
|
# Serialize a model as RDF.
|
|
323
319
|
#
|
|
324
320
|
# +model+:: a generic representation of input data that is an instance of BioInterchange::Phylogenetics::TreeSet
|
|
325
|
-
|
|
321
|
+
# +uri_prefix+:: optional URI prefix that should be used in the RDFization of individuals/class instances
|
|
322
|
+
def serialize(model, uri_prefix = nil)
|
|
326
323
|
model.contents.each { |tree|
|
|
327
|
-
serialize_model(model, tree)
|
|
324
|
+
serialize_model(model, tree, uri_prefix)
|
|
328
325
|
}
|
|
329
326
|
end
|
|
330
327
|
|
|
@@ -501,8 +498,7 @@ RDFization parameters and data are send as a single HTTP POST requests containin
|
|
|
501
498
|
* `phylotastic.newick`: [Newick](http://evolution.genetics.washington.edu/phylip/newicktree.html)
|
|
502
499
|
* `uk.ac.man.pdfx`: [PDFx](http://pdfx.cs.man.ac.uk) XML
|
|
503
500
|
* `OUTPUT_METHOD`: determines the RDFization method that should be used, output will always be RDF N-Triples; available output formats are
|
|
504
|
-
* `rdf.biointerchange.
|
|
505
|
-
* `rdf.biointerchange.gvf`: RDFization of `biointerchange.gvf`
|
|
501
|
+
* `rdf.biointerchange.gfvo`: RDFization of `biointerchange.gff3` or `biointerchange.gvf`
|
|
506
502
|
* `rdf.bh12.sio`: RDFization of `dbcls.catanns.json` or `uk.ac.man.pdfx`
|
|
507
503
|
* `rdf.phylotastic.newick`: RDFization of `phylotastic.newick`
|
|
508
504
|
* `URL_ENCODED_DATA`: data for RDFization as [URL encoded](http://en.wikipedia.org/wiki/Percent-encoding) string
|
|
@@ -590,6 +586,13 @@ A Geno Ontology external reference (GOxref) vocabulary can be created by directl
|
|
|
590
586
|
curl ftp://ftp.geneontology.org/pub/go/doc/GO.xrf_abbs | ruby generators/GOxrefify.rb
|
|
591
587
|
echo -e "\nend" >> lib/biointerchange/goxref.rb
|
|
592
588
|
|
|
589
|
+
Building an external reference vocabulary based on Life Science Registry external database abbreviations (based on download of the
|
|
590
|
+
Life Science Registry spreadsheet as a TSV file):
|
|
591
|
+
|
|
592
|
+
echo -e "module BioInterchange\n" > lib/biointerchange/life_science_registry.rb
|
|
593
|
+
cut -f 1,25 <path-to-registry-tsv-file> | grep -E 'https?://.*\$id' | ruby generators/tsv2rubyclass.rb LifeScienceRegistry >> lib/biointerchange/life_science_registry.rb
|
|
594
|
+
echo -e "\nend" >> lib/biointerchange/life_science_registry.rb
|
|
595
|
+
|
|
593
596
|
#### Python Vocabulary Classes
|
|
594
597
|
|
|
595
598
|
The source-code generation can be skipped, if none of the ontologies that are used by BioInterchange have been changed. Otherwise, the existing Python vocabulary class wrappers can be generated as follows:
|
|
@@ -630,14 +633,25 @@ The following Java packages will automatically install alongside BioInterchange'
|
|
|
630
633
|
|
|
631
634
|
### Gem Bundling/Installing
|
|
632
635
|
|
|
636
|
+
Mac OS X prerequisites and `bundle install` difference:
|
|
637
|
+
|
|
638
|
+
sudo port install libxml2 libxslt
|
|
639
|
+
sudo ARCHFLAGS=-Wno-error=unused-command-line-argument-hard-error-in-future bundle install
|
|
640
|
+
|
|
641
|
+
Actual gem bundling:
|
|
642
|
+
|
|
633
643
|
bundle exec rake gemspec
|
|
634
644
|
bundle exec gem build biointerchange.gemspec
|
|
635
|
-
sudo bundle exec gem install biointerchange
|
|
645
|
+
sudo bundle exec gem install biointerchange-`cat VERSION`.gem
|
|
636
646
|
|
|
637
647
|
If you encounter problems with gem dependencies, then you can try to explicitly use Ruby 1.9:
|
|
638
648
|
|
|
639
649
|
bundle exec gem1.9 build biointerchange.gemspec
|
|
640
|
-
sudo bundle exec gem1.9 install biointerchange
|
|
650
|
+
sudo bundle exec gem1.9 install biointerchange-`cat VERSION`.gem
|
|
651
|
+
|
|
652
|
+
Alternative build script, `make.sh`, which installs the gem without RDocs and ri pages (quicker when testing):
|
|
653
|
+
|
|
654
|
+
./make.sh
|
|
641
655
|
|
|
642
656
|
### Unit Testing
|
|
643
657
|
|
data/VERSION
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
1.0.
|
|
1
|
+
1.0.2
|
|
Binary file
|
|
Binary file
|
data/generators/rdfxml.rb
CHANGED
|
@@ -19,7 +19,7 @@ SIO_SYN = RDF::URI.new('http://semanticscience.org/resource/synonym')
|
|
|
19
19
|
# This label conversion also appears in:
|
|
20
20
|
# +lib/biointerchange/core.rb+
|
|
21
21
|
def make_safe_label(label)
|
|
22
|
-
label.gsub(/[ '-.<>\/]/, '_').gsub(/\([^\)]*?\)/, '').sub(/^(\d+)
|
|
22
|
+
label.gsub(/[ '-.<>\/]/, '_').gsub(/\([^\)]*?\)/, '').sub(/^(\d+)/){ "a_#{$1}" }.gsub(/^_+|_+$/, '').gsub(/_+/, '_').gsub(/_([A-Z])+/x){ "#{$1}" }
|
|
23
23
|
end
|
|
24
24
|
|
|
25
25
|
reader = RDF::RDFXML::Reader.open(ARGV[0])
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
#!/usr/bin/ruby
|
|
2
|
+
|
|
3
|
+
require 'biointerchange'
|
|
4
|
+
|
|
5
|
+
if ARGV.length != 1 then
|
|
6
|
+
puts 'Usage: tsv2rubyclass rubyclassname'
|
|
7
|
+
puts ''
|
|
8
|
+
puts 'Reads a TSV file from STDIN, where the first column values become'
|
|
9
|
+
puts 'method names (sanitized for spaces, etc.) in the class and the'
|
|
10
|
+
puts 'second column values are returned as a string.'
|
|
11
|
+
puts ''
|
|
12
|
+
puts 'The generated Ruby class is output on STDOUT.'
|
|
13
|
+
exit 1
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
classname = ARGV[0]
|
|
17
|
+
|
|
18
|
+
puts "class #{classname}"
|
|
19
|
+
puts ''
|
|
20
|
+
|
|
21
|
+
STDIN.each { |line|
|
|
22
|
+
key, value = line.chomp.split("\t")
|
|
23
|
+
|
|
24
|
+
puts " def self.#{BioInterchange.make_safe_label(key)}"
|
|
25
|
+
puts " \"#{value}\""
|
|
26
|
+
puts ' end'
|
|
27
|
+
puts ''
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
puts 'end'
|
|
31
|
+
|
data/lib/biointerchange/core.rb
CHANGED
|
@@ -34,13 +34,13 @@ module BioInterchange
|
|
|
34
34
|
# Custom Exceptions and Errors
|
|
35
35
|
require 'biointerchange/exceptions'
|
|
36
36
|
|
|
37
|
-
# Ontologies (besides the ones from the 'rdf' gem)
|
|
37
|
+
# Ontologies (besides the ones from the 'rdf' gem), vocabularies and
|
|
38
|
+
# other mappings (e.g., database abbreviations to URIs):
|
|
39
|
+
require 'biointerchange/life_science_registry'
|
|
38
40
|
require 'biointerchange/cdao'
|
|
39
41
|
require 'biointerchange/faldo'
|
|
40
|
-
require 'biointerchange/gff3o'
|
|
41
42
|
require 'biointerchange/gfvo'
|
|
42
43
|
require 'biointerchange/goxref'
|
|
43
|
-
require 'biointerchange/gvf1o'
|
|
44
44
|
require 'biointerchange/sio'
|
|
45
45
|
require 'biointerchange/so'
|
|
46
46
|
require 'biointerchange/sofa'
|
|
@@ -105,6 +105,18 @@ module BioInterchange
|
|
|
105
105
|
# Writer
|
|
106
106
|
# ...same GFF3 writer
|
|
107
107
|
|
|
108
|
+
### VCF ###
|
|
109
|
+
|
|
110
|
+
# Reader
|
|
111
|
+
require 'biointerchange/genomics/vcf_reader'
|
|
112
|
+
|
|
113
|
+
# Feature base model
|
|
114
|
+
require 'biointerchange/genomics/vcf_feature_set'
|
|
115
|
+
require 'biointerchange/genomics/vcf_feature'
|
|
116
|
+
|
|
117
|
+
# Writer
|
|
118
|
+
# ...same GFF3 writer
|
|
119
|
+
|
|
108
120
|
#
|
|
109
121
|
# PHYLOGENETICS
|
|
110
122
|
#
|
|
@@ -225,7 +237,7 @@ module BioInterchange
|
|
|
225
237
|
'input' => opt['input'],
|
|
226
238
|
'output' => opt['output']
|
|
227
239
|
}
|
|
228
|
-
map['
|
|
240
|
+
map['batch_size'] = opt['batchsize'].to_i if opt['batchsize']
|
|
229
241
|
opt.each_key { |key|
|
|
230
242
|
map[key.sub(/^annotate_/, '')] = opt[key] if key.start_with?('annotate_')
|
|
231
243
|
}
|
|
@@ -296,7 +308,7 @@ module BioInterchange
|
|
|
296
308
|
#
|
|
297
309
|
# +label+:: string that should be converted into a "safe" string that can be used as a Ruby method name
|
|
298
310
|
def self.make_safe_label(label)
|
|
299
|
-
label.gsub(/[ '-.<>\/]/, '_').gsub(/\([^\)]*?\)/, '').sub(/^(\d+)
|
|
311
|
+
label.gsub(/[ '-.<>\/]/, '_').gsub(/\([^\)]*?\)/, '').sub(/^(\d+)/){ "a_#{$1}" }.gsub(/^_+|_+$/, '').gsub(/_+/, '_').gsub(/_([A-Z])+/x){ "#{$1}" }
|
|
300
312
|
end
|
|
301
313
|
|
|
302
314
|
private
|
|
@@ -4,11 +4,12 @@ require 'date'
|
|
|
4
4
|
|
|
5
5
|
module BioInterchange::Genomics
|
|
6
6
|
|
|
7
|
-
# Serializes GFF3 and
|
|
7
|
+
# Serializes GFF3, GVF and VCF models.
|
|
8
8
|
#
|
|
9
9
|
# Inputs:
|
|
10
10
|
# - biointerchange.gff3
|
|
11
11
|
# - biointerchange.gvf
|
|
12
|
+
# - biointerchange.vcf
|
|
12
13
|
#
|
|
13
14
|
# Outputs:
|
|
14
15
|
# - rdf.biointerchange.gfvo
|
|
@@ -18,14 +19,7 @@ class RDFWriter < BioInterchange::Writer
|
|
|
18
19
|
BioInterchange::Registry.register_writer(
|
|
19
20
|
'rdf.biointerchange.gfvo',
|
|
20
21
|
BioInterchange::Genomics::RDFWriter,
|
|
21
|
-
[ 'biointerchange.gff3' ],
|
|
22
|
-
true,
|
|
23
|
-
'Genomic Feature and Variation Ontology (GFVO) based RDFization'
|
|
24
|
-
)
|
|
25
|
-
BioInterchange::Registry.register_writer(
|
|
26
|
-
'rdf.biointerchange.gfvo',
|
|
27
|
-
BioInterchange::Genomics::RDFWriter,
|
|
28
|
-
[ 'biointerchange.gvf' ],
|
|
22
|
+
[ 'biointerchange.gff3', 'biointerchange.gvf', 'biointerchange.vcf' ],
|
|
29
23
|
true,
|
|
30
24
|
'Genomic Feature and Variation Ontology (GFVO) based RDFization'
|
|
31
25
|
)
|
|
@@ -34,8 +28,7 @@ class RDFWriter < BioInterchange::Writer
|
|
|
34
28
|
#
|
|
35
29
|
# +ostream+:: instance of an IO class or derivative that is used for RDF serialization
|
|
36
30
|
def initialize(ostream)
|
|
37
|
-
|
|
38
|
-
@ostream = ostream
|
|
31
|
+
super(ostream)
|
|
39
32
|
end
|
|
40
33
|
|
|
41
34
|
# Serialize a model as RDF.
|
|
@@ -47,10 +40,12 @@ class RDFWriter < BioInterchange::Writer
|
|
|
47
40
|
@format = :gff3
|
|
48
41
|
elsif model.instance_of?(BioInterchange::Genomics::GVFFeatureSet) then
|
|
49
42
|
@format = :gvf
|
|
43
|
+
elsif model.instance_of?(BioInterchange::Genomics::VCFFeatureSet) then
|
|
44
|
+
@format = :vcf
|
|
50
45
|
else
|
|
51
46
|
raise BioInterchange::Exceptions::ImplementationWriterError, 'The provided model cannot be serialized. ' +
|
|
52
|
-
'This writer supports serialization for BioInterchange::Genomics::GFF3FeatureSet
|
|
53
|
-
'BioInterchange::Genomics::GVFFeatureSet.'
|
|
47
|
+
'This writer supports serialization for BioInterchange::Genomics::GFF3FeatureSet, ' +
|
|
48
|
+
'BioInterchange::Genomics::GVFFeatureSet and BioInterchange::Genomics::VCFFeatureSet.'
|
|
54
49
|
end
|
|
55
50
|
@base = BioInterchange::GFVO
|
|
56
51
|
serialize_model(model, uri_prefix)
|
|
@@ -71,15 +66,24 @@ protected
|
|
|
71
66
|
# Record written variants in order to avoid writing out RDF.type multiple times.
|
|
72
67
|
@variants = {}
|
|
73
68
|
|
|
69
|
+
# Set up "matchers" that can be used to match a number of attributes of a feature, and then,
|
|
70
|
+
# link out to an entity that says something about that combination of attributes. Used for
|
|
71
|
+
# GVF #data-source, etc., pragmas and VCF filters.
|
|
72
|
+
@matchers = []
|
|
73
|
+
|
|
74
74
|
# Create a URI prefix that applies to the set, all features in the set, and all the features' annotations.
|
|
75
75
|
# Then register the prefix with the writer to have a concise Turtle output.
|
|
76
76
|
set_uri = set_uri[0..-2] if set_uri and set_uri.end_with?('/')
|
|
77
77
|
set_uri = RDF::URI.new(model.uri) unless set_uri
|
|
78
78
|
set_base(set_uri + '/')
|
|
79
79
|
|
|
80
|
-
|
|
80
|
+
add_prefix('http://biohackathon.org/resource/faldo#', 'faldo')
|
|
81
|
+
add_prefix('http://www.biointerchange.org/gfvo#', 'gfvo')
|
|
82
|
+
add_prefix('http://semanticscience.org/resource/', 'sio')
|
|
83
|
+
|
|
84
|
+
create_triple(set_uri, RDF.type, @base.File)
|
|
81
85
|
model.pragmas.each { |pragma_name|
|
|
82
|
-
serialize_pragma(set_uri, model.pragma(pragma_name))
|
|
86
|
+
serialize_pragma(set_uri, pragma_name, model.pragma(pragma_name))
|
|
83
87
|
}
|
|
84
88
|
model.contents.each { |feature|
|
|
85
89
|
if feature.instance_of?(BioInterchange::Genomics::GFF3FeatureSequence) then
|
|
@@ -89,64 +93,162 @@ protected
|
|
|
89
93
|
end
|
|
90
94
|
}
|
|
91
95
|
close
|
|
92
|
-
#RDF::NTriples::Writer.dump(graph, @ostream)
|
|
93
|
-
# TODO Figure out why the following is very slow. Use with 'rdf-raptor'.
|
|
94
|
-
# Having said that, Jena's rdfcat is very good for converting formats
|
|
95
|
-
# anyway, so perhaps it is not worth investigating the following.
|
|
96
|
-
# RDF::RDFXML::Writer.dump(graph, @ostream)
|
|
97
96
|
end
|
|
98
97
|
|
|
99
98
|
# Serializes pragmas for a given feature set URI.
|
|
100
99
|
#
|
|
101
100
|
# +set_uri+:: the feature set URI to which the pragmas belong
|
|
101
|
+
# +name+:: name of the pragma statement
|
|
102
102
|
# +pragma+:: an object representing a pragma statement
|
|
103
|
-
def serialize_pragma(set_uri, pragma)
|
|
103
|
+
def serialize_pragma(set_uri, name, pragma)
|
|
104
104
|
if pragma.kind_of?(Hash) then
|
|
105
105
|
if (pragma.has_key?('attribute-method') or pragma.has_key?('data-source') or pragma.has_key?('score-method') or pragma.has_key?('source-method') or pragma.has_key?('technology-platform')) then
|
|
106
106
|
serialize_structured_attribute(set_uri, pragma)
|
|
107
107
|
elsif pragma.has_key?('gff-version') then
|
|
108
|
-
create_triple(set_uri, @base.
|
|
108
|
+
create_triple(set_uri.to_s, @base.has_identifier, RDF::URI.new("#{set_uri}/version"))
|
|
109
|
+
create_triple("#{set_uri}/version", RDF.type, @base.Version)
|
|
110
|
+
create_triple("#{set_uri}/version", @base.has_value, "gff-version #{pragma['gff-version']}")
|
|
109
111
|
elsif pragma.has_key?('gvf-version') then
|
|
110
|
-
create_triple(set_uri,
|
|
112
|
+
create_triple("#{set_uri}/version", RDF.type, @base.Version)
|
|
113
|
+
create_triple("#{set_uri}/version", @base.has_value, "gvf-version #{pragma['gvf-version']}")
|
|
114
|
+
elsif pragma.has_key?('fileformat') then
|
|
115
|
+
create_triple("#{set_uri}/version", RDF.type, @base.Version)
|
|
116
|
+
create_triple("#{set_uri}/version", @base.has_value, "fileformat #{pragma['fileformat']}")
|
|
111
117
|
elsif pragma.has_key?('sequence-region') then
|
|
112
118
|
pragma['sequence-region'].keys.each { |seqid|
|
|
113
119
|
serialize_landmark(set_uri, pragma['sequence-region'][seqid])
|
|
114
120
|
}
|
|
115
121
|
elsif pragma.has_key?('species') then
|
|
116
|
-
create_triple(set_uri, @base.
|
|
122
|
+
create_triple(set_uri, @base.is_about, RDF::URI.new(pragma['species']))
|
|
123
|
+
# VCF section:
|
|
124
|
+
# ...TODO
|
|
125
|
+
# Everything else:
|
|
126
|
+
else
|
|
127
|
+
end
|
|
128
|
+
elsif pragma.kind_of?(Array) then
|
|
129
|
+
# VCF section:
|
|
130
|
+
basic_vcf_mappings = {
|
|
131
|
+
'ID' => @base.Identifier,
|
|
132
|
+
'Description' => @base.Comment,
|
|
133
|
+
'Number' => @base.InformationContentEntity, # Note: not just an integer; can be also 'A', 'G', and '.'
|
|
134
|
+
'Type' => @base.InformationContentEntity # Can be 'Integer', 'Float', 'Character', 'String'
|
|
135
|
+
}
|
|
136
|
+
if name == 'FILTER' then
|
|
137
|
+
pragma.each { |assignment|
|
|
138
|
+
pragma_uri = serialize_vcf_pragma(set_uri, "filter/#{assignment['ID']}", @base.VariantCalling, basic_vcf_mappings, assignment)
|
|
139
|
+
create_triple(set_uri, @base.is_participant_in, pragma_uri)
|
|
140
|
+
}
|
|
141
|
+
elsif name == 'FORMAT' then
|
|
142
|
+
pragma.each { |assignment|
|
|
143
|
+
pragma_uri = serialize_vcf_pragma(set_uri, "format/#{assignment['ID']}", @base.InformationContentEntity, basic_vcf_mappings, assignment)
|
|
144
|
+
create_triple(set_uri, @base.references, pragma_uri)
|
|
145
|
+
}
|
|
146
|
+
elsif name == 'INFO' then
|
|
147
|
+
pragma.each { |assignment|
|
|
148
|
+
pragma_uri = serialize_vcf_pragma(set_uri, "info/#{assignment['ID']}", @base.InformationContentEntity, basic_vcf_mappings, assignment)
|
|
149
|
+
create_triple(set_uri, @base.references, pragma_uri)
|
|
150
|
+
}
|
|
151
|
+
else
|
|
152
|
+
# TODO
|
|
117
153
|
end
|
|
118
154
|
else
|
|
119
155
|
# TODO
|
|
120
156
|
end
|
|
121
157
|
end
|
|
122
158
|
|
|
159
|
+
# Goes through "matchers" and links the feature if its attributes are present
|
|
160
|
+
# and equal to a "matcher's" data.
|
|
161
|
+
#
|
|
162
|
+
# (TODO: Update description of this method, because it is absolutely unclear
|
|
163
|
+
# what it actually does right now. Sorry.)
|
|
164
|
+
#
|
|
165
|
+
# +feature+:: the feature that provides attributes for matching
|
|
166
|
+
# +feature_uri+:: URI of the feature that is linked out to, if the feature's attributes match
|
|
167
|
+
def match_feature(feature, feature_uri)
|
|
168
|
+
@matchers.each { |match_constraints|
|
|
169
|
+
constraints, linkout = match_constraints
|
|
170
|
+
|
|
171
|
+
# No constraints means that *everything* matches.
|
|
172
|
+
matches = true
|
|
173
|
+
constraints.each_pair { |key, value|
|
|
174
|
+
if key == 'Seqid' then
|
|
175
|
+
matches = false unless value.include?(feature.sequence_id)
|
|
176
|
+
elsif key == 'Source' then
|
|
177
|
+
matches = false unless value.include?(feature.source)
|
|
178
|
+
elsif key == 'Type' then
|
|
179
|
+
matches = false unless value.include?(feature.type)
|
|
180
|
+
else
|
|
181
|
+
if feature.attributes.has_key?(key) then
|
|
182
|
+
attributes_have_a_match = false
|
|
183
|
+
feature.attributes[key].each { |attribute_value|
|
|
184
|
+
attributes_have_a_match = true if value.include?(attribute_value)
|
|
185
|
+
}
|
|
186
|
+
matches = false unless attributes_have_a_match
|
|
187
|
+
else
|
|
188
|
+
matches = false
|
|
189
|
+
end
|
|
190
|
+
end
|
|
191
|
+
}
|
|
192
|
+
|
|
193
|
+
# If there is a match, then add linkout.
|
|
194
|
+
create_triple(feature_uri, @base.has_source, RDF::URI.new(linkout))
|
|
195
|
+
}
|
|
196
|
+
end
|
|
197
|
+
|
|
123
198
|
# Serializes a +GFF3Feature+ object for a given feature set URI.
|
|
124
199
|
#
|
|
125
200
|
# +set_uri+:: the feature set URI to which the feature belongs
|
|
126
201
|
# +feature+:: a +GFF3Feature+ instance
|
|
127
202
|
def serialize_feature(set_uri, feature)
|
|
128
203
|
# TODO Make sure there is only one value in the 'ID' list.
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
204
|
+
# TODO Ponder about whether it would be possible to get the same URI for two distinct features (bad thing).
|
|
205
|
+
source = ''
|
|
206
|
+
source = "#{feature.source}," if feature.source
|
|
207
|
+
type = ''
|
|
208
|
+
type = "#{feature.type.to_s.sub(/^[^:]+:\/\//, '')}," if feature.type
|
|
209
|
+
phase = ",#{feature.phase}" if feature.phase
|
|
210
|
+
if feature.attributes.has_key?('ID') or feature.attributes.has_key?(' id') then
|
|
211
|
+
feature_id = 'ID'
|
|
212
|
+
feature_id = ' id' if feature.attributes.has_key?(' id')
|
|
213
|
+
feature_uri = RDF::URI.new("#{set_uri.to_s}/feature/#{feature.attributes[feature_id][0]}")
|
|
214
|
+
else
|
|
215
|
+
feature_uri = RDF::URI.new("#{set_uri.to_s}/feature/#{feature.sequence_id},#{source}#{type}#{feature.start_coordinate},#{feature.end_coordinate},#{feature.strand}#{phase}")
|
|
216
|
+
end
|
|
217
|
+
|
|
218
|
+
create_triple(set_uri, @base.has_member, feature_uri)
|
|
132
219
|
create_triple(feature_uri, RDF.type, @base.Feature)
|
|
220
|
+
create_triple(feature_uri, RDF.type, feature.type) if feature.type
|
|
221
|
+
match_feature(feature, feature_uri)
|
|
133
222
|
serialize_landmark(set_uri, GFF3Landmark.new(feature.sequence_id)) unless @landmarks.has_key?(feature.sequence_id)
|
|
134
|
-
create_triple(feature_uri, @base.
|
|
135
|
-
create_triple(feature_uri, @base.
|
|
136
|
-
create_triple(feature_uri,
|
|
137
|
-
create_triple(feature_uri, @base.
|
|
223
|
+
create_triple(feature_uri, @base.is_located_on, RDF::URI.new(@landmarks[feature.sequence_id]))
|
|
224
|
+
create_triple(feature_uri, @base.is_created_by, RDF::URI.new("#{feature_uri}/source"))
|
|
225
|
+
create_triple("#{feature_uri}/source", RDF.type, @base.ExperimentalMethod)
|
|
226
|
+
create_triple("#{feature_uri}/source", @base.has_value, feature.source) if feature.source
|
|
227
|
+
if feature.phase then
|
|
228
|
+
create_triple(feature_uri, @base.has_quality, RDF::URI.new("#{feature_uri}/phase"))
|
|
229
|
+
create_triple("#{feature_uri}/phase", RDF.type, @base.CodingFrameOffset)
|
|
230
|
+
create_triple("#{feature_uri}/phase", @base.has_value, feature.phase)
|
|
231
|
+
end
|
|
138
232
|
|
|
139
|
-
|
|
233
|
+
create_triple(feature_uri, @base.has_part, RDF::URI.new("#{feature_uri}/locus"))
|
|
234
|
+
create_triple("#{feature_uri}/locus", RDF.type, @base.Locus)
|
|
235
|
+
create_triple("#{feature_uri}/locus", @base.has_attribute, RDF::URI.new("#{feature_uri}/locus/region"))
|
|
236
|
+
serialize_coordinate(set_uri, "#{feature_uri}/locus", feature)
|
|
140
237
|
serialize_attributes(set_uri, feature_uri, feature.attributes) unless feature.attributes.keys.empty?
|
|
141
238
|
end
|
|
142
239
|
|
|
240
|
+
# Serialize a feature's coordinates using FALDO.
|
|
241
|
+
#
|
|
242
|
+
# +set_uri+:: URI of the feature set that the feature belongs to
|
|
243
|
+
# +feature_uri+:: URI prefix of the feature
|
|
244
|
+
# +feature+:: object representation of the feature, which contains the locus that is described by this method
|
|
143
245
|
def serialize_coordinate(set_uri, feature_uri, feature)
|
|
144
246
|
region_uri = RDF::URI.new("#{feature_uri.to_s}/region")
|
|
145
247
|
start_position_uri = RDF::URI.new("#{feature_uri.to_s}/region/start")
|
|
146
248
|
end_position_uri = RDF::URI.new("#{feature_uri.to_s}/region/end")
|
|
147
249
|
#feature_object_properties = @base.feature_properties.select { |uri| @base.is_object_property?(uri) }[0]
|
|
148
|
-
##graph.insert(
|
|
149
|
-
create_triple(feature_uri, @base.
|
|
250
|
+
##graph.insert(BioInterchange::Statement.new(feature_uri, @base.with_parent([ @base.region ].flatten, feature_object_properties), region_uri))
|
|
251
|
+
create_triple(feature_uri, @base.is_located_on, region_uri)
|
|
150
252
|
create_triple(region_uri, RDF.type, BioInterchange::FALDO.Region)
|
|
151
253
|
# BIN STUFF
|
|
152
254
|
if false then
|
|
@@ -173,7 +275,23 @@ protected
|
|
|
173
275
|
end
|
|
174
276
|
create_triple(start_position_uri, BioInterchange::FALDO.position, feature.start_coordinate)
|
|
175
277
|
create_triple(end_position_uri, BioInterchange::FALDO.position, feature.end_coordinate)
|
|
176
|
-
|
|
278
|
+
if feature.score then
|
|
279
|
+
create_triple(feature_uri, @base.has_attribute, RDF::URI.new("#{feature_uri}/score"))
|
|
280
|
+
if @format == :gvf or @format == :vcf then
|
|
281
|
+
create_triple("#{feature_uri}/score", RDF.type, @base.PhredScore)
|
|
282
|
+
else
|
|
283
|
+
create_triple("#{feature_uri}/score", RDF.type, @base.Score)
|
|
284
|
+
end
|
|
285
|
+
create_triple("#{feature_uri}/score", @base.has_value, feature.score)
|
|
286
|
+
end
|
|
287
|
+
end
|
|
288
|
+
|
|
289
|
+
# Constructs a landmark URI based on set URI and the landmark's ID.
|
|
290
|
+
#
|
|
291
|
+
# +set_uri+:: the set URI to which the landmark belongs
|
|
292
|
+
# +id+:: ID of the landmark
|
|
293
|
+
def landmark_uri(set_uri, id)
  # Builds the landmark's URI as a plain string: "<set URI>/landmark/<id>".
  # Both arguments are converted with #to_s, so URI objects and strings are
  # accepted alike.
  prefix = set_uri.to_s
  prefix + '/landmark/' + id.to_s
end
|
|
178
296
|
|
|
179
297
|
# Serializes a genomic feature landmark ("seqid").
|
|
@@ -182,12 +300,13 @@ protected
|
|
|
182
300
|
# +landmark+:: encapsulated landmark data
|
|
183
301
|
def serialize_landmark(set_uri, landmark)
|
|
184
302
|
return if @landmarks.has_key?(landmark.seqid)
|
|
185
|
-
landmark_uri =
|
|
303
|
+
landmark_uri = landmark_uri(set_uri, landmark.seqid)
|
|
186
304
|
region_uri = RDF::URI.new("#{landmark_uri.to_s}/region")
|
|
187
305
|
@landmarks[landmark.seqid] = landmark_uri
|
|
188
306
|
create_triple(landmark_uri, RDF.type, @base.Landmark)
|
|
189
|
-
create_triple(landmark_uri, @base.
|
|
190
|
-
create_triple(landmark_uri, @base.
|
|
307
|
+
create_triple(landmark_uri, @base.has_identifier, RDF::URI.new("#{landmark_uri}/id"))
|
|
308
|
+
create_triple("#{landmark_uri}/id", @base.has_value, landmark.seqid)
|
|
309
|
+
create_triple(landmark_uri, @base.has_attribute, region_uri)
|
|
191
310
|
if landmark.start_coordinate then
|
|
192
311
|
start_position_uri = RDF::URI.new("#{landmark_uri.to_s}/region/start")
|
|
193
312
|
create_triple(region_uri, BioInterchange::FALDO.begin, start_position_uri)
|
|
@@ -209,91 +328,92 @@ protected
|
|
|
209
328
|
attributes.each_pair { |tag, list|
|
|
210
329
|
# Check for defined tags (in alphabetical order), if not matched, serialize as generic Attribute:
|
|
211
330
|
if tag == 'Alias' then
|
|
212
|
-
list.
|
|
213
|
-
create_triple(feature_uri, @base.
|
|
331
|
+
list.each_index { |index|
|
|
332
|
+
create_triple(feature_uri, @base.has_attribute, RDF::URI.new("#{feature_uri}/alias/#{index}"))
|
|
333
|
+
create_triple("#{feature_uri}/alias/#{index}", RDF.type, @base.Alias)
|
|
334
|
+
create_triple("#{feature_uri}/alias/#{index}", @base.has_value, list[index])
|
|
214
335
|
}
|
|
215
336
|
elsif tag == 'Dbxref' then
|
|
216
337
|
list.each { |value|
|
|
217
338
|
begin
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
if value.match(/^dbSNP(_\d+)?:rs\d+$/) then
|
|
221
|
-
linkout = "http://www.ncbi.nlm.nih.gov/projects/SNP/snp_ref.cgi?rs=#{value.split(/:/)[1].sub(/^rs/, '')}"
|
|
222
|
-
elsif value.match(/^COSMIC(_\d+)?:COSM\d+$/) then
|
|
223
|
-
linkout = "http://cancer.sanger.ac.uk/cosmic/mutation/overview?id=#{value.split(/:/)[1].sub(/^COSM/, '')}"
|
|
224
|
-
else
|
|
225
|
-
abbreviation, id = value.split(':', 2)
|
|
226
|
-
linkout = BioInterchange::GOXRef.send(BioInterchange.make_safe_label(abbreviation)).to_s + id
|
|
227
|
-
end
|
|
228
|
-
# Second, and finally: add a triple to the graph in the right representative format depending on the ontology used
|
|
229
|
-
create_triple(feature_uri, @base.dbxref, linkout)
|
|
339
|
+
# Try to link the external database reference to a well-established URI:
|
|
340
|
+
serialize_dbxref(feature_uri, value)
|
|
230
341
|
rescue NoMethodError
|
|
231
|
-
# Preserve the Dbxref as a Literal:
|
|
342
|
+
# Not clear where to link to? Preserve the Dbxref as a Literal:
|
|
232
343
|
@dbxref = 0 if @dbxref == nil
|
|
233
344
|
literal_uri = RDF::URI.new("#{feature_uri.to_s}/attribute/dbxref/#{@dbxref}")
|
|
234
345
|
@dbxref += 1
|
|
235
|
-
create_triple(feature_uri, @base.
|
|
236
|
-
create_triple(literal_uri, RDF.type,
|
|
237
|
-
create_triple(literal_uri,
|
|
346
|
+
create_triple(feature_uri, @base.references, literal_uri)
|
|
347
|
+
create_triple(literal_uri, RDF.type, @base.ExternalReference)
|
|
348
|
+
create_triple(literal_uri, @base.has_value, value)
|
|
238
349
|
end
|
|
239
350
|
}
|
|
240
351
|
elsif tag == 'Derives_from' then
|
|
241
352
|
list.each { |value|
|
|
242
|
-
create_triple(feature_uri, @base.
|
|
353
|
+
create_triple(feature_uri, @base.is_temporarily_part_of, RDF::URI.new("#{set_uri.to_s}/feature/#{value}"))
|
|
243
354
|
}
|
|
244
355
|
elsif tag == 'Gap' then
|
|
245
356
|
# Handled by 'Target', because 'Gap' requires 'Target' to be present.
|
|
246
357
|
elsif tag == 'ID' then
|
|
247
358
|
list.each { |value|
|
|
248
|
-
create_triple(feature_uri, @base.
|
|
359
|
+
create_triple(feature_uri, @base.has_identifier, RDF::URI.new("#{feature_uri}/id"))
|
|
360
|
+
create_triple("#{feature_uri}/id", RDF.type, @base.Identifier)
|
|
361
|
+
create_triple("#{feature_uri}/id", @base.has_value, value)
|
|
249
362
|
}
|
|
250
363
|
elsif tag == 'Is_circular' then
|
|
251
364
|
value = list.join(',')
|
|
252
365
|
if value == 'true' then
|
|
253
|
-
create_triple(feature_uri, @base.
|
|
366
|
+
create_triple(feature_uri, @base.has_quality, @base.CircularHelix) if value == 'true'
|
|
254
367
|
elsif value == 'false' then
|
|
255
|
-
create_triple(feature_uri, @base.
|
|
368
|
+
create_triple(feature_uri, @base.is_circular, @base.WatsonCrickHelix) if value == 'false'
|
|
256
369
|
else
|
|
257
|
-
create_triple(feature_uri,
|
|
370
|
+
create_triple(feature_uri, BioInterchange::RDFS.comment, "Is_circular non-truth value: #{value}")
|
|
258
371
|
end
|
|
259
372
|
elsif tag == 'Name' then
|
|
260
373
|
list.each { |value|
|
|
261
|
-
create_triple(feature_uri, @base.
|
|
374
|
+
create_triple(feature_uri, @base.has_attribute, RDF::URI.new("#{feature_uri}/name"))
|
|
375
|
+
create_triple("#{feature_uri}/name", RDF.type, @base.Name)
|
|
376
|
+
create_triple("#{feature_uri}/name", @base.has_value, value)
|
|
262
377
|
}
|
|
263
378
|
elsif tag == 'Note' then
|
|
264
|
-
list.
|
|
265
|
-
create_triple(feature_uri, RDF::
|
|
379
|
+
list.each_index { |index|
|
|
380
|
+
create_triple(feature_uri, @base.has_annotation, RDF::URI.new("#{feature_uri}/note/#{index}"))
|
|
381
|
+
create_triple("#{feature_uri}/note/#{index}", RDF.type, @base.Note)
|
|
382
|
+
create_triple("#{feature_uri}/note/#{index}", @base.has_value, list[index])
|
|
266
383
|
}
|
|
267
384
|
elsif tag == 'Ontology_term' then
|
|
268
385
|
list.each { |value|
|
|
269
386
|
# TODO Sanitize values that are either not in GO xrf_abbs or need conversion to match
|
|
270
387
|
# match their associated Ruby method.
|
|
271
388
|
namespace, accession = value.split(/:/, 2)
|
|
272
|
-
create_triple(feature_uri, @base.
|
|
389
|
+
create_triple(feature_uri, @base.references, "#{BioInterchange::GOXRef.send(namespace).to_s}#{accession}")
|
|
273
390
|
}
|
|
274
391
|
elsif tag == 'Parent' then
|
|
275
392
|
list.each { |parent_id|
|
|
276
|
-
create_triple(feature_uri, @base.
|
|
393
|
+
create_triple(feature_uri, @base.has_source, RDF::URI.new("#{set_uri.to_s}/feature/#{parent_id}"))
|
|
277
394
|
}
|
|
278
395
|
elsif tag == 'Reference_seq' then
|
|
279
396
|
list.each { |value|
|
|
280
397
|
reference_uri = RDF::URI.new("#{feature_uri.to_s}/reference/#{value}")
|
|
281
|
-
create_triple(feature_uri, @base.
|
|
282
|
-
create_triple(reference_uri, RDF.type, @base.
|
|
283
|
-
create_triple(reference_uri, @base.
|
|
398
|
+
create_triple(feature_uri, @base.has_attribute, reference_uri)
|
|
399
|
+
create_triple(reference_uri, RDF.type, @base.ReferenceSequence)
|
|
400
|
+
create_triple(reference_uri, @base.has_value, value)
|
|
284
401
|
}
|
|
285
402
|
elsif tag == 'Target' then
|
|
403
|
+
# GFF3 spec is unclear on this point, but I assume that a target ID
|
|
404
|
+
# is referencing a feature ID within the same file.
|
|
286
405
|
target_id, start_coordinate, end_coordinate, strand = list.join(',').split(/\s+/, 4)
|
|
287
406
|
target_uri = RDF::URI.new("#{feature_uri.to_s}/target/#{target_id}")
|
|
288
|
-
create_triple(
|
|
289
|
-
create_triple(target_uri,
|
|
290
|
-
create_triple(target_uri, @base.
|
|
407
|
+
create_triple(target_uri, RDF.type, @base.SequenceAlignment)
|
|
408
|
+
create_triple(target_uri, @base.has_source, feature_uri)
|
|
409
|
+
create_triple(target_uri, @base.has_input, target_id)
|
|
291
410
|
region_uri = RDF::URI.new("#{target_uri.to_s}/region")
|
|
292
411
|
start_position_uri = RDF::URI.new("#{region_uri.to_s}/start")
|
|
293
412
|
end_position_uri = RDF::URI.new("#{region_uri.to_s}/end")
|
|
294
|
-
create_triple(target_uri, @base.
|
|
295
|
-
create_triple(region_uri,
|
|
296
|
-
create_triple(region_uri,
|
|
413
|
+
create_triple(target_uri, @base.has_attribute, region_uri)
|
|
414
|
+
create_triple(region_uri, RDF.type, BioInterchange::FALDO.Region)
|
|
415
|
+
create_triple(region_uri, BioInterchange::FALDO.begin, start_position_uri)
|
|
416
|
+
create_triple(region_uri, BioInterchange::FALDO.end, end_position_uri)
|
|
297
417
|
if strand == '+' then
|
|
298
418
|
create_triple(start_position_uri, RDF.type, BioInterchange::FALDO.Positive_strand)
|
|
299
419
|
create_triple(end_position_uri, RDF.type, BioInterchange::FALDO.Positive_strand)
|
|
@@ -312,13 +432,20 @@ protected
|
|
|
312
432
|
create_triple(end_position_uri, BioInterchange::FALDO.position, end_coordinate)
|
|
313
433
|
end
|
|
314
434
|
|
|
315
|
-
# Describe a possible alignment between the feature and target:
|
|
435
|
+
# Describe a possible alignment with gaps between the feature and target:
|
|
316
436
|
if attributes.has_key?('Gap') then
|
|
317
437
|
attributes['Gap'].each_index { |gap_no|
|
|
318
438
|
cigar_line = attributes['Gap'][gap_no].split(/\s+/)
|
|
319
439
|
cigar_line.each_index { |alignment_no|
|
|
320
440
|
alignment_uri = RDF::URI.new("#{feature_uri.to_s}/alignment/#{gap_no}/#{alignment_no}")
|
|
321
|
-
|
|
441
|
+
if alignment_no == 0 then
|
|
442
|
+
create_triple(target_uri, @base.has_first_part, alignment_uri)
|
|
443
|
+
else
|
|
444
|
+
create_triple(target_uri, @base.has_ordered_part, alignment_uri)
|
|
445
|
+
end
|
|
446
|
+
if alignment_no == cigar_line.length then
|
|
447
|
+
create_triple(target_uri, @base.has_last_part, alignment_uri)
|
|
448
|
+
end
|
|
322
449
|
operation = cigar_line[alignment_no].gsub(/[^MIDFR]/, '')
|
|
323
450
|
operation = nil unless operation.length == 1
|
|
324
451
|
span = cigar_line[alignment_no].gsub(/[^0-9]/, '')
|
|
@@ -326,24 +453,25 @@ protected
|
|
|
326
453
|
if operation == 'M' then
|
|
327
454
|
create_triple(alignment_uri, RDF.type, @base.Match)
|
|
328
455
|
elsif operation == 'I' then
|
|
329
|
-
create_triple(alignment_uri, RDF.type, @base.
|
|
456
|
+
create_triple(alignment_uri, RDF.type, @base.ReferenceSequenceGap)
|
|
330
457
|
elsif operation == 'D' then
|
|
331
|
-
create_triple(alignment_uri, RDF.type, @base.
|
|
458
|
+
create_triple(alignment_uri, RDF.type, @base.TargetSequenceGap)
|
|
332
459
|
elsif operation == 'F' then
|
|
333
|
-
create_triple(alignment_uri, RDF.type, @base.
|
|
460
|
+
create_triple(alignment_uri, RDF.type, @base.ForwardReferenceSequenceFrameshift)
|
|
334
461
|
elsif operation == 'R' then
|
|
335
|
-
create_triple(alignment_uri, RDF.type, @base.
|
|
462
|
+
create_triple(alignment_uri, RDF.type, @base.ReverseReferenceSequenceFrameshift)
|
|
336
463
|
else
|
|
337
464
|
# Fallback: operation is outside of the specification
|
|
338
|
-
create_triple(alignment_uri, RDF.type, @base.
|
|
339
|
-
create_triple(alignment_uri,
|
|
465
|
+
create_triple(alignment_uri, RDF.type, @base.SequenceAlignmentOperation)
|
|
466
|
+
create_triple(alignment_uri, BioInterchange::RDFS.comment, "Alignment operation: #{operation}") if operation and not operation.empty?
|
|
340
467
|
end
|
|
341
|
-
create_triple(alignment_uri, @base.span, span.to_i) if span
|
|
342
|
-
create_triple(alignment_uri, RDF.first, alignment_uri)
|
|
343
468
|
if alignment_no + 1 < cigar_line.length then
|
|
344
|
-
create_triple(alignment_uri,
|
|
345
|
-
|
|
346
|
-
|
|
469
|
+
create_triple(alignment_uri, @base.is_before, RDF::URI.new("#{feature_uri.to_s}/alignment/#{gap_no}/#{alignment_no + 1}"))
|
|
470
|
+
end
|
|
471
|
+
if span then
|
|
472
|
+
create_triple(alignment_uri, @base.has_attribute, RDF::URI.new("#{alignment_uri}/span"))
|
|
473
|
+
create_triple("#{alignment_uri}/span", RDF.type, @base.Span)
|
|
474
|
+
create_triple("#{alignment_uri}/span", @base.has_value, span.to_i)
|
|
347
475
|
end
|
|
348
476
|
}
|
|
349
477
|
}
|
|
@@ -352,6 +480,29 @@ protected
|
|
|
352
480
|
serialize_variant_effects(set_uri, feature_uri, list)
|
|
353
481
|
elsif tag == 'Variant_seq' then
|
|
354
482
|
serialize_variant_seqs(set_uri, feature_uri, list)
|
|
483
|
+
# VCF related attributes:
|
|
484
|
+
elsif tag == ' alternative_alleles' then
|
|
485
|
+
# TODO
|
|
486
|
+
elsif tag == ' filters' then
|
|
487
|
+
# Example: "Qual;MinAB;MinDP" -- comes here as split list (split by ";")
|
|
488
|
+
list.each { |id|
|
|
489
|
+
create_triple(feature_uri, @base.is_refuted_by, RDF::URI.new("#{set_uri}/filter/#{id}"))
|
|
490
|
+
}
|
|
491
|
+
elsif tag == ' samples' then
|
|
492
|
+
list.each_index { |sample|
|
|
493
|
+
list[sample].each_pair { |key, values|
|
|
494
|
+
serialize_vcf_sample(feature_uri, sample, key, values, attributes)
|
|
495
|
+
}
|
|
496
|
+
}
|
|
497
|
+
# Everything else:
|
|
498
|
+
elsif list == true then
|
|
499
|
+
# Attribute is a flag. Tag itself carries meaning and has no value associated with it.
|
|
500
|
+
attribute_uri = RDF::URI.new("#{feature_uri.to_s}/attribute/#{tag}")
|
|
501
|
+
create_triple(feature_uri, @base.has_attribute, attribute_uri)
|
|
502
|
+
create_triple(attribute_uri, RDF.type, @base.InformationContentEntity)
|
|
503
|
+
create_triple(attribute_uri, @base.has_attribute, RDF::URI.new("#{attribute_uri}/tag/#{tag}"))
|
|
504
|
+
create_triple(RDF::URI.new("#{attribute_uri}/tag/#{tag}"), RDF.type, @base.Label)
|
|
505
|
+
create_triple(RDF::URI.new("#{attribute_uri}/tag/#{tag}"), @base.has_value, tag)
|
|
355
506
|
else
|
|
356
507
|
# TODO Report unknown upper case letters here? That would be a spec. validation...
|
|
357
508
|
# Well, or it would show that this implementation is incomplete. Could be either.
|
|
@@ -359,71 +510,299 @@ protected
|
|
|
359
510
|
value = list[index]
|
|
360
511
|
attribute_uri = RDF::URI.new("#{feature_uri.to_s}/attribute/#{tag}") if list.size == 1
|
|
361
512
|
attribute_uri = RDF::URI.new("#{feature_uri.to_s}/attribute/#{tag}-#{index + 1}") unless list.size == 1
|
|
362
|
-
create_triple(feature_uri, @base.
|
|
363
|
-
create_triple(attribute_uri, RDF.type, @base.
|
|
364
|
-
|
|
365
|
-
create_triple(attribute_uri,
|
|
513
|
+
create_triple(feature_uri, @base.has_attribute, attribute_uri)
|
|
514
|
+
create_triple(attribute_uri, RDF.type, @base.InformationContentEntity)
|
|
515
|
+
# TODO Figure out why the following line was there. Seems wrong.
|
|
516
|
+
#create_triple(attribute_uri, @base.has_attribute, RDF::URI.new("#{attribute_uri}/tag/#{tag}"))
|
|
517
|
+
create_triple(attribute_uri, @base.has_value, value)
|
|
518
|
+
create_triple(RDF::URI.new("#{attribute_uri}/tag/#{tag}"), RDF.type, @base.Label)
|
|
519
|
+
create_triple(RDF::URI.new("#{attribute_uri}/tag/#{tag}"), @base.has_value, tag)
|
|
366
520
|
}
|
|
367
521
|
end
|
|
368
522
|
}
|
|
369
523
|
end
|
|
370
524
|
|
|
525
|
+
# Serializes VCF sample data (VCF columns 9 and above).
|
|
526
|
+
#
|
|
527
|
+
# See also genotype serialization of non-VCF data in `serialize_variant_seqs`.
|
|
528
|
+
#
|
|
529
|
+
# +feature_uri+:: URI of the feature that the sample data relates to
|
|
530
|
+
# +sample+:: number of the sample that is being addressed (sample column number)
|
|
531
|
+
# +key+:: key of the described sample values
|
|
532
|
+
# +values+:: values of the sample (possible composite type, e.g. comma separated list)
|
|
533
|
+
# +attributes+:: a map of tag/value pairs associated with the feature
|
|
534
|
+
def serialize_vcf_sample(feature_uri, sample, key, values, attributes)
  # Dispatches on the VCF genotype field key (DP, GT, GL, ...) and serializes
  # the raw, still unsplit field value `values` of one sample column.
  #
  # Nodes that appear in the object position of a triple are wrapped in
  # RDF::URI, as done for the other resource links of this writer.
  # NOTE(review): create_triple is defined outside this method; it presumably
  # writes non-RDF::URI objects as literals -- confirm.
  case key
  when 'DP'
    # Depth across samples. An integer.
    depths = values.split(',')
    depths.each_with_index { |depth, index|
      serialize_vcf_sample_attribute(feature_uri, sample, key, true, depth.to_i, index, depths.size > 1, @base.Number_ofReads, RDF::XSD.integer)
    }
  when 'GT'
    # Genotype
    list_uri = RDF::URI.new("#{feature_uri}/attribute/#{key}")
    serialize_attribute_with_label(feature_uri, list_uri, @base.Genotype, key)
    # A genotype without the unphased separator "/" is treated as phased.
    unless values.include?('/')
      phase_uri = RDF::URI.new("#{list_uri}/phase")
      create_triple(list_uri, @base.has_attribute, phase_uri)
      create_triple(phase_uri, RDF.type, @base.GameticPhase)
    end
    alleles = values.split(/\/|\|/)
    # Only say something about zygosity if we deal with two alleles.
    # NOTE(review): only the length of the first allele token is compared
    # with one here -- confirm that this is the intended check.
    if alleles.length == 2 && alleles.map { |allele| allele.length }.uniq[0] == 1
      zygosity = alleles.uniq.length == 1 ? @base.Homozygous : @base.Heterozygous
      create_triple(list_uri, @base.has_quality, zygosity)
    end
    value_uris = alleles.each_with_index.map { |allele, index|
      allele_index = allele.to_i
      # Allele index zero designates the reference sequence.
      sequence_type = allele_index == 0 ? @base.ReferenceSequence : @base.SequenceVariant
      serialize_vcf_sample_attribute(feature_uri, sample, key, true, vcf_allele(allele_index, attributes), index, alleles.size > 1, sequence_type)
    }
    serialize_list_array(list_uri, value_uris)
  when 'FT'
    # Filter: passed does nothing; applied filter uses isRefutedBy.
    # TODO How to code using GFVO?
    serialize_vcf_sample_values(feature_uri, sample, key, values.split(';'))
  when 'GL'
    # Genotype likelihoods (an ordered list of unlabelled values).
    list_uri = RDF::URI.new("#{feature_uri}/attribute/#{key}")
    serialize_attribute_with_label(feature_uri, list_uri, @base.Score, key)
    likelihoods = values.split(',')
    value_uris = likelihoods.each_with_index.map { |likelihood, index|
      serialize_vcf_sample_attribute(feature_uri, sample, key, false, likelihood, index, likelihoods.size > 1, @base.InformationContentEntity)
    }
    serialize_list_array(list_uri, value_uris)
  when 'GLE'
    # Genotype likelihoods of heterogenous ploidy.
    # Example: 0:-75.22,1:-223.42,0/0:-323.03,1/0:-99.29,1/1:-802.53
    entries = values.split(',')
    entries.each_with_index { |entry, index|
      # NOTE(review): the likelihood is parsed, but not serialized (unchanged behaviour).
      genotype, _likelihood = entry.split(':')
      genotype = genotype.split('/').map { |allele| vcf_allele(allele.to_i, attributes) }
      serialize_vcf_sample_attribute(feature_uri, sample, key, true, genotype, index, entries.size > 1, @base.InformationContentEntity)
    }
  when 'PL', 'GP', 'GQ', 'HQ', 'PS', 'PQ', 'EC', 'MQ'
    # PL: Phred scaled genotype likelihoods.
    # GP: Phred scaled genotype posterior probabilities.
    # GQ: Conditional genotype quality.
    # HQ: Haplotype qualities -- presumably Phred scaled.
    # PS: Phase set. It's complicated. See the VCF specification for details.
    # PQ: Phasing quality in Phred scale.
    # EC: Expected alternate allele counts.
    # MQ: RMS mapping quality. An integer.
    serialize_vcf_sample_values(feature_uri, sample, key, values.split(','))
  else
    # Unknown keys. Should that be possible at all?
    serialize_vcf_sample_attribute(feature_uri, sample, key, true, values, 0, false, @base.InformationContentEntity)
  end
end

# Serializes each item of an already split sample value as a labelled
# attribute of the generic type "InformationContentEntity".
# Helper of serialize_vcf_sample.
#
# +feature_uri+:: URI of the feature that the sample data relates to
# +sample+:: number of the sample that is being addressed (sample column number)
# +key+:: key of the described sample values
# +items+:: array of the individual values
def serialize_vcf_sample_values(feature_uri, sample, key, items)
  items.each_with_index { |item, index|
    serialize_vcf_sample_attribute(feature_uri, sample, key, true, item, index, items.size > 1, @base.InformationContentEntity)
  }
end
|
|
646
|
+
|
|
647
|
+
# Returns the allele based on VCF's genotype indexing specification.
|
|
648
|
+
# The reference allele has index zero; alternative alleles are designated by indexes of one or above.
|
|
649
|
+
#
|
|
650
|
+
# +genotype_index+:: VCF genotype index
|
|
651
|
+
# +attributes+:: feature attribute hash that contains reference/alternative allele bases
|
|
652
|
+
def vcf_allele(genotype_index, attributes)
  # Index zero selects the first entry stored under ' reference_bases'.
  return attributes[' reference_bases'][0] if genotype_index == 0

  # Any other index is one-based, while ' alternative_alleles' is indexed from zero.
  attributes[' alternative_alleles'][genotype_index - 1]
end
|
|
659
|
+
|
|
660
|
+
# Serializes an ordered list; the list's URIs are given as an array.
|
|
661
|
+
#
|
|
662
|
+
# +list_uri+:: URI of the list object (ordered list items will be part of this instance via "has_first_part", "has_ordered_part", "has_last_part")
|
|
663
|
+
# +uris+:: URIs of the list to be serialized
|
|
664
|
+
def serialize_list_array(list_uri, uris)
  # Hands every URI, together with its successor (nil for the last item),
  # to serialize_list, which writes the triples for that one list item.
  uris.each_with_index do |uri, position|
    successor = position + 1 < uris.length ? uris[position + 1] : nil
    serialize_list(uris, position, uri, successor, list_uri)
  end
end
|
|
671
|
+
|
|
672
|
+
# Create an ordered list of things; this method serializes one item only and
|
|
673
|
+
# repeated calls to this method create the list.
|
|
674
|
+
#
|
|
675
|
+
# +values+:: array of the things that appear in the ordered list
|
|
676
|
+
# +index+:: index of the thing that is serialized by this method call
|
|
677
|
+
# +value_uri+:: URI that represents the current value that is being linked to
|
|
678
|
+
# +next_value_uri+:: URI of the next serialized value (ignored, if last index)
|
|
679
|
+
# +list_uri+:: URI of the list that contains the items
|
|
680
|
+
def serialize_list(values, index, value_uri, next_value_uri, list_uri)
  # Links one item into the ordered list "list_uri":
  #   - the first item via "has_first_part",
  #   - the last item via "has_last_part",
  #   - items in between via "has_ordered_part".
  # The only item of a one-element list is both the first and the last part,
  # so it receives both links; previously it was linked as first part only.
  is_first = index == 0
  is_last = index + 1 == values.length
  create_triple(list_uri, @base.has_first_part, value_uri) if is_first
  create_triple(list_uri, @base.has_ordered_part, value_uri) unless is_first || is_last
  create_triple(list_uri, @base.has_last_part, value_uri) if is_last
  # Every item but the last one points to its successor.
  create_triple(value_uri, @base.is_before, next_value_uri) if index + 1 < values.length
end
|
|
692
|
+
|
|
693
|
+
# Serializes basic information for an object (a feature's attribute) with label.
|
|
694
|
+
#
|
|
695
|
+
# Links the object to a feature, sets the object's type, assigns it a label.
|
|
696
|
+
#
|
|
697
|
+
# +feature_uri+:: URI of the feature that has the object as an attribute
|
|
698
|
+
# +object_uri+:: URI that represents the object
|
|
699
|
+
# +object_type+:: type of the object
|
|
700
|
+
# +label+:: label text to use
|
|
701
|
+
def serialize_attribute_with_label(feature_uri, object_uri, object_type, label)
  # Emits five triples: feature -> object, the object's type, object -> label
  # node ("<object URI>/label"), the label node's type and the label text.
  attribute_link = @base.has_attribute
  create_triple(feature_uri, attribute_link, object_uri)
  create_triple(object_uri, RDF.type, object_type)
  label_node = RDF::URI.new("#{object_uri}/label")
  create_triple(object_uri, attribute_link, label_node)
  create_triple(label_node, RDF.type, @base.Label)
  create_triple(label_node, @base.has_value, label)
end
|
|
709
|
+
|
|
710
|
+
# Serializes VCF meta-data (pragma equivalent) key/value pairs. Used by serialize_pragma.
|
|
711
|
+
#
|
|
712
|
+
# Returns URI of the serialized meta-data.
|
|
713
|
+
#
|
|
714
|
+
# +set_uri+:: URI of the set that the meta-data belongs to
|
|
715
|
+
# +uri_suffix+:: suffix that is appended to set_uri, which uniquely defines the meta-data (within the set_uri)
|
|
716
|
+
# +meta_type+:: type of the meta-data
|
|
717
|
+
# +key_type_mappings+:: mappings of keys to known types, everything else is considered an Object
|
|
718
|
+
# +attributes+:: key/value pairs that are the actual meta-data being described
|
|
719
|
+
def serialize_vcf_pragma(set_uri, uri_suffix, meta_type, key_type_mappings, attributes)
  # The meta-data node lives at "<set URI>/<suffix>"; each key/value pair
  # becomes an attribute node at "<meta-data URI>/<key>".
  pragma_uri = RDF::URI.new("#{set_uri}/#{uri_suffix}")
  create_triple(pragma_uri, RDF.type, meta_type)
  attributes.each_pair do |key, value|
    attribute_uri = RDF::URI.new("#{pragma_uri}/#{key}")
    create_triple(pragma_uri, @base.has_attribute, attribute_uri)
    # Keys without a known type mapping are typed as generic "Object".
    # TODO Check if type is integer/double here, then convert value accordingly.
    attribute_type = key_type_mappings.has_key?(key) ? key_type_mappings[key] : @base.Object
    create_triple(attribute_uri, RDF.type, attribute_type)
    create_triple(attribute_uri, @base.has_value, value)
  end
  pragma_uri
end
|
|
735
|
+
|
|
736
|
+
# Serializes a VCF sample attribute/value pair. Used by serialize_vcf_sample.
|
|
737
|
+
#
|
|
738
|
+
# Returns URI of the serialized attribute/value pair.
|
|
739
|
+
#
|
|
740
|
+
# +base_uri+:: URI of the "thing" that the sample data relates to
|
|
741
|
+
# +sample+:: number of the sample that is being addressed (sample column number)
|
|
742
|
+
# +key+:: key of the described sample values
|
|
743
|
+
# +has_label+:: if true, then serialize label (value taken from key)
|
|
744
|
+
# +value+:: value that is associated with the key/sample
|
|
745
|
+
# +index+:: index of the value (in case value is part of an array of size > 1)
|
|
746
|
+
# +multivalue+:: true if this value is taken from an array of values of size > 1
|
|
747
|
+
# +attribute_type+:: type of the attribute entity that represents the value
|
|
748
|
+
# +value_type+:: type of the actual value
|
|
749
|
+
def serialize_vcf_sample_attribute(base_uri, sample, key, has_label, value, index, multivalue, attribute_type, value_type = nil)
  # Values that stem from a list of several values get a one-based
  # "-<position>" suffix, so that every value has a URI of its own.
  path = "#{base_uri.to_s}/sample/#{sample}/#{key}"
  path = "#{path}-#{index + 1}" if multivalue
  value_uri = RDF::URI.new(path)

  create_triple(base_uri, @base.has_attribute, value_uri)
  create_triple(value_uri, RDF.type, attribute_type)
  create_triple(value_uri, @base.has_value, value, value_type)

  # Optionally attach the key as a label node at "<value URI>/label".
  if has_label
    label_uri = RDF::URI.new("#{value_uri}/label")
    create_triple(value_uri, @base.has_attribute, label_uri)
    create_triple(label_uri, RDF.type, @base.Label)
    create_triple(label_uri, @base.has_value, key)
  end

  value_uri
end
|
|
763
|
+
|
|
371
764
|
# Serializes a structured attribute (given as a pragma statement), which later
|
|
372
765
|
# can be referred to from feature instances.
|
|
373
766
|
#
|
|
374
767
|
# +set_uri+:: the feature set URI to which the structured attribute belongs
|
|
375
768
|
# +pragma+:: a map that encapsulates the structured attribute data
|
|
376
769
|
def serialize_structured_attribute(set_uri, pragma)
|
|
770
|
+
# TODO Triple from set_uri to attribute_uri missing; should be isParticipantIn
|
|
377
771
|
attribute_uri = RDF::URI.new("#{set_uri.to_s}/structured_attribute/#{pragma.object_id}")
|
|
378
772
|
attributes = nil
|
|
379
773
|
class_type = nil
|
|
380
774
|
if pragma.has_key?('attribute-method') then
|
|
381
775
|
attributes = pragma['attribute-method'][0]
|
|
382
|
-
class_type = @base.
|
|
776
|
+
class_type = @base.ExperimentalMethod
|
|
383
777
|
elsif pragma.has_key?('data-source') then
|
|
384
778
|
attributes = pragma['data-source'][0]
|
|
385
|
-
class_type = @base.
|
|
779
|
+
class_type = @base.GenomicAscertainingMethod
|
|
386
780
|
elsif pragma.has_key?('score-method') then
|
|
387
781
|
attributes = pragma['score-method'][0]
|
|
388
|
-
class_type = @base.
|
|
782
|
+
class_type = @base.ExperimentalMethod
|
|
389
783
|
elsif pragma.has_key?('source-method') then
|
|
390
784
|
attributes = pragma['source-method'][0]
|
|
391
|
-
class_type = @base.
|
|
785
|
+
class_type = @base.ExperimentalMethod
|
|
392
786
|
elsif pragma.has_key?('technology-platform') then
|
|
393
787
|
attributes = pragma['technology-platform'][0]
|
|
394
|
-
class_type = @base.
|
|
788
|
+
class_type = @base.SequencingTechnologyPlatform
|
|
395
789
|
else
|
|
396
790
|
# TODO Error.
|
|
397
791
|
end
|
|
398
|
-
if class_type == @base.
|
|
792
|
+
if class_type == @base.GenomicAscertainingMethod and attributes.has_key?('Data_type') then
|
|
399
793
|
attributes['Data_type'] = attributes['Data_type'][0] # TODO Make sure array is of length 1.
|
|
400
794
|
if attributes['Data_type'] == 'Array_CGH' then
|
|
401
795
|
class_type = @base.ArrayComparativeGenomicHybridization
|
|
402
796
|
elsif attributes['Data_type'] == 'DNA_microarray' then
|
|
403
797
|
class_type = @base.DNAMicroarray
|
|
404
798
|
elsif attributes['Data_type'] == 'DNA_sequence' then
|
|
405
|
-
class_type = @base.
|
|
799
|
+
class_type = @base.DNASequencing
|
|
406
800
|
elsif attributes['Data_type'] == 'RNA_sequence' then
|
|
407
|
-
class_type = @base.
|
|
801
|
+
class_type = @base.RNASequencing
|
|
408
802
|
else
|
|
409
803
|
# TODO Error.
|
|
410
804
|
end
|
|
411
|
-
elsif class_type == @base.
|
|
412
|
-
if attributes.has_key?('Average_coverage') then
|
|
413
|
-
create_triple(attribute_uri, @base.averageCoverage, attributes['Average_coverage'][0].to_i)
|
|
414
|
-
end
|
|
415
|
-
if attributes.has_key?('Platform_class') then
|
|
416
|
-
create_triple(attribute_uri, @base.platformClass, attributes['Platform_class'][0])
|
|
417
|
-
end
|
|
418
|
-
if attributes.has_key?('Platform_name') then
|
|
419
|
-
create_triple(attribute_uri, @base.platformName, attributes['Platform_name'][0])
|
|
420
|
-
end
|
|
421
|
-
if attributes.has_key?('Read_length') then
|
|
422
|
-
create_triple(attribute_uri, @base.readLength, attributes['Read_length'][0].to_i)
|
|
423
|
-
end
|
|
424
|
-
if attributes.has_key?('Read_pair_span') then
|
|
425
|
-
create_triple(attribute_uri, @base.readPairSpan, attributes['Read_pair_span'][0].to_i)
|
|
426
|
-
end
|
|
805
|
+
elsif class_type == @base.SequencingTechnologyPlatform then
|
|
427
806
|
if attributes.has_key?('Read_type') then
|
|
428
807
|
attributes['Read_type'] = attributes['Read_type'][0] # TODO Make sure array is of length 1.
|
|
429
808
|
if attributes['Read_type'] == 'fragment' then
|
|
@@ -436,17 +815,54 @@ protected
|
|
|
436
815
|
end
|
|
437
816
|
end
|
|
438
817
|
create_triple(attribute_uri, RDF.type, class_type)
|
|
818
|
+
if class_type == @base.SequencingTechnologyPlatform or
|
|
819
|
+
class_type == @base.FragmentReadPlatform or
|
|
820
|
+
class_type == @base.PairedEndReadPlatform then
|
|
821
|
+
if attributes.has_key?('Average_coverage') then
|
|
822
|
+
coverage_uri = RDF::URI.new("#{attribute_uri}/averagecoverage")
|
|
823
|
+
create_triple(attribute_uri, @base.has_attribute, coverage_uri)
|
|
824
|
+
create_triple(coverage_uri, RDF.type, @base.AverageCoverage)
|
|
825
|
+
create_triple(coverage_uri, @base.has_value, attributes['Average_coverage'][0].to_i)
|
|
826
|
+
end
|
|
827
|
+
if attributes.has_key?('Platform_class') then
|
|
828
|
+
create_triple(attribute_uri, @base.has_attribute, RDF::URI.new("#{attribute_uri}/platformclass"))
|
|
829
|
+
#create_triple(attribute_uri, @base.platformClass, attributes['Platform_class'][0])
|
|
830
|
+
end
|
|
831
|
+
if attributes.has_key?('Platform_name') then
|
|
832
|
+
#create_triple(attribute_uri, @base.platformName, attributes['Platform_name'][0])
|
|
833
|
+
end
|
|
834
|
+
if attributes.has_key?('Read_length') then
|
|
835
|
+
#create_triple(attribute_uri, @base.readLength, attributes['Read_length'][0].to_i)
|
|
836
|
+
end
|
|
837
|
+
if attributes.has_key?('Read_pair_span') then
|
|
838
|
+
#create_triple(attribute_uri, @base.readPairSpan, attributes['Read_pair_span'][0].to_i)
|
|
839
|
+
end
|
|
840
|
+
end
|
|
439
841
|
attributes.keys.each { |tag|
|
|
440
842
|
if tag.match(/^[a-z]/) then
|
|
843
|
+
tag.strip!
|
|
441
844
|
custom_attribute_uri = RDF::URI.new("#{attribute_uri.to_s}/attribute/#{tag}")
|
|
442
|
-
create_triple(
|
|
443
|
-
create_triple(custom_attribute_uri, @base.
|
|
845
|
+
create_triple(attribute_uri, @base.has_attribute, custom_attribute_uri)
|
|
846
|
+
create_triple(custom_attribute_uri, RDF.type, @base.InformationContentEntity)
|
|
847
|
+
create_triple(custom_attribute_uri, @base.has_attribute, RDF::URI.new("#{attribute_uri.to_s}/attribute/#{tag}/name"))
|
|
444
848
|
attributes[tag].each { |value|
|
|
445
|
-
create_triple(custom_attribute_uri,
|
|
849
|
+
create_triple(custom_attribute_uri, @base.has_value, value)
|
|
446
850
|
}
|
|
447
|
-
create_triple(attribute_uri, @base.
|
|
851
|
+
create_triple(RDF::URI.new("#{attribute_uri.to_s}/attribute/#{tag}/name"), RDF.type, @base.Name)
|
|
852
|
+
create_triple(RDF::URI.new("#{attribute_uri.to_s}/attribute/#{tag}/name"), @base.has_value, tag)
|
|
448
853
|
else
|
|
449
854
|
# TODO
|
|
855
|
+
match_constraints = {}
|
|
856
|
+
attributes[tag].each { |value|
|
|
857
|
+
if tag == 'Seqid' or tag == 'Type' or tag == 'Source' then
|
|
858
|
+
match_constraints[tag] = value.split(',')
|
|
859
|
+
else
|
|
860
|
+
# Not a recognized match. Might be a Dbxref or Comment.
|
|
861
|
+
end
|
|
862
|
+
}
|
|
863
|
+
unless match_constraints.keys.empty? then
|
|
864
|
+
@matches
|
|
865
|
+
end
|
|
450
866
|
end
|
|
451
867
|
}
|
|
452
868
|
end
|
|
@@ -462,35 +878,46 @@ protected
|
|
|
462
878
|
sequence_variant, variant_index, feature_type, feature_ids = effect.split(' ', 4)
|
|
463
879
|
feature_ids = feature_ids.split(' ')
|
|
464
880
|
effect_uri = RDF::URI.new("#{feature_uri.to_s}/variant/#{variant_index}/effect/#{index}")
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
create_triple(effect_uri,
|
|
881
|
+
# TODO
|
|
882
|
+
#serialize_variant_triple(feature_uri, RDF::URI.new("#{feature_uri.to_s}/variant/#{variant_index}"), @base.effect, effect_uri)
|
|
883
|
+
# Type is a SO sequence_variant or descendent:
|
|
884
|
+
create_triple(effect_uri, RDF.type, BioInterchange::SO.send(BioInterchange.make_safe_label(sequence_variant)))
|
|
885
|
+
# The feature type should be already apparent from the targeted feature. Do no sanity
|
|
886
|
+
# check here (if they match) and just skip over it.
|
|
887
|
+
# create_triple(effect_uri, @base.feature_type, BioInterchange::SO.send(BioInterchange.make_safe_label(feature_type)))
|
|
469
888
|
feature_ids.each { |feature_id|
|
|
470
|
-
create_triple(
|
|
889
|
+
create_triple(feature_id, @base.is_affected_by, effect_uri)
|
|
471
890
|
}
|
|
472
891
|
}
|
|
473
892
|
end
|
|
474
893
|
|
|
475
894
|
# Serializes a list of variant sequences.
|
|
476
895
|
#
|
|
896
|
+
# See also VCF genotype serialization ('GT' attribute) in `serialize_vcf_sample`.
|
|
897
|
+
#
|
|
477
898
|
# +set_uri+:: the feature set URI to which the feature belongs
|
|
478
899
|
# +feature_uri+:: the URI of the feature that is annotated with variant data
|
|
479
900
|
# +list+:: list of variant values
|
|
480
901
|
def serialize_variant_seqs(set_uri, feature_uri, list)
|
|
902
|
+
variant_uri = nil
|
|
903
|
+
|
|
481
904
|
list.each_index { |index|
|
|
482
905
|
value = list[index]
|
|
483
906
|
variant_uri = RDF::URI.new("#{feature_uri.to_s}/variant/#{index}")
|
|
484
|
-
|
|
907
|
+
sequence_uri = RDF::URI.new("#{variant_uri}/sequence")
|
|
908
|
+
serialize_variant_triple(feature_uri, variant_uri, @base.has_attribute, sequence_uri)
|
|
909
|
+
create_triple(sequence_uri, @base.has_value, value)
|
|
485
910
|
}
|
|
486
911
|
|
|
487
|
-
# Return the variant type based on the present sequence(s):
|
|
488
|
-
return @base.Variant if list.length != 2
|
|
489
912
|
if list[0].match(/a-zA-Z/) and list[1].match(/a-zA-Z/) then
|
|
490
|
-
|
|
491
|
-
|
|
913
|
+
if list[0] == list[1] then
|
|
914
|
+
create_triple(variant_uri, @base.has_quality, @base.Homozygous)
|
|
915
|
+
else
|
|
916
|
+
create_triple(variant_uri, @base.has_quality, @base.Heterozygous)
|
|
917
|
+
end
|
|
492
918
|
end
|
|
493
|
-
|
|
919
|
+
|
|
920
|
+
return @base.SequenceVariant
|
|
494
921
|
end
|
|
495
922
|
|
|
496
923
|
# Adds a variant to the graph; tracks the variant's URI so that RDF.type is only written out once.
|
|
@@ -501,8 +928,8 @@ protected
|
|
|
501
928
|
# +object+:: data to be serialized
|
|
502
929
|
def serialize_variant_triple(feature_uri, variant_uri, predicate, object)
|
|
503
930
|
unless @variants.has_key?(variant_uri.to_s) then
|
|
504
|
-
create_triple(feature_uri, @base.
|
|
505
|
-
create_triple(variant_uri, RDF.type, @base.
|
|
931
|
+
create_triple(feature_uri, @base.is_affected_by, variant_uri)
|
|
932
|
+
create_triple(variant_uri, RDF.type, @base.VariantCalling)
|
|
506
933
|
end
|
|
507
934
|
@variants[variant_uri.to_s] = true
|
|
508
935
|
create_triple(variant_uri, predicate, object)
|
|
@@ -515,12 +942,39 @@ protected
|
|
|
515
942
|
def serialize_feature_sequence(set_uri, feature_sequence)
|
|
516
943
|
feature_uri = RDF::URI.new("#{set_uri.to_s}/feature/#{feature_sequence.feature_id}")
|
|
517
944
|
annotation_uri = RDF::URI.new("#{feature_uri.to_s}/sequence")
|
|
518
|
-
create_triple(feature_uri, @base.
|
|
519
|
-
create_triple(annotation_uri, RDF.type, @base.
|
|
520
|
-
create_triple(annotation_uri,
|
|
521
|
-
create_triple(annotation_uri, @base.
|
|
945
|
+
create_triple(feature_uri, @base.has_attribute, annotation_uri)
|
|
946
|
+
create_triple(annotation_uri, RDF.type, @base.Sequence)
|
|
947
|
+
create_triple(annotation_uri, BioInterchange::RDFS.comment, feature_sequence.comment) if feature_sequence.comment
|
|
948
|
+
create_triple(annotation_uri, @base.has_value, feature_sequence.sequence)
|
|
522
949
|
end
|
|
523
950
|
|
|
951
|
+
# Serializes an external database reference.
|
|
952
|
+
#
|
|
953
|
+
# +feature_uri+:: URI of the feature that the external database reference refers to
|
|
954
|
+
# +dbxref_composite+:: composite term of the external database reference (e.g. "dbSNP_127:rs123456")
|
|
955
|
+
def serialize_dbxref(feature_uri, dbxref_composite)
|
|
956
|
+
abbreviation, accession = dbxref_composite.split(':', 2)
|
|
957
|
+
dbxref_uri = RDF::URI.new("#{feature_uri.to_s}/dbxref/#{BioInterchange.make_safe_label(abbreviation)}")
|
|
958
|
+
create_triple(feature_uri, @base.references, dbxref_uri)
|
|
959
|
+
|
|
960
|
+
create_triple(dbxref_uri, RDF.type, @base.ExternalReference)
|
|
961
|
+
create_triple(dbxref_uri, @base.refers_to, BioInterchange::LifeScienceRegistry.send(dbxref_composite.split('_', 2)[0].downcase).sub('$id', accession))
|
|
962
|
+
if dbxref_composite.match(/^.+_.+:.+$/) then
|
|
963
|
+
# Entry with version information.
|
|
964
|
+
version_uri = RDF::URI.new("#{dbxref_uri}/version")
|
|
965
|
+
create_triple(dbxref_uri, @base.has_identifier, version_uri)
|
|
966
|
+
create_triple(version_uri, @base.has_value, abbreviation[6..-1])
|
|
967
|
+
end
|
|
968
|
+
|
|
969
|
+
#if dbxref_composite.match(/^dbSNP(_\d+)?:rs\d+$/) then
|
|
970
|
+
# # linkout = "http://www.ncbi.nlm.nih.gov/projects/SNP/snp_ref.cgi?rs=#{dbxref_composite.split(/:/)[1].sub(/^rs/, '')}"
|
|
971
|
+
#elsif dbxref_composite.match(/^COSMIC(_\d+)?:COSM\d+$/) then
|
|
972
|
+
# linkout = "http://cancer.sanger.ac.uk/cosmic/mutation/overview?id=#{accession.sub(/^COSM/, '')}"
|
|
973
|
+
#else
|
|
974
|
+
# BioInterchange::GOXRef.send(BioInterchange.make_safe_label(abbreviation)).to_s + accession
|
|
975
|
+
#end
|
|
976
|
+
end
|
|
977
|
+
|
|
524
978
|
end
|
|
525
979
|
|
|
526
980
|
end
|