biointerchange 1.0.1 → 1.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +2 -4
- data/Gemfile +2 -3
- data/README.md +36 -22
- data/VERSION +1 -1
- data/examples/Felis_catus.gvf.gz +0 -0
- data/examples/Felis_catus_incl_consequences.vcf.gz +0 -0
- data/generators/rdfxml.rb +1 -1
- data/generators/tsv2rubyclass.rb +31 -0
- data/lib/biointerchange/core.rb +17 -5
- data/lib/biointerchange/genomics/gff3_rdf_ntriples.rb +591 -137
- data/lib/biointerchange/genomics/gff3_reader.rb +16 -3
- data/lib/biointerchange/genomics/gvf_reader.rb +1 -1
- data/lib/biointerchange/genomics/vcf_feature.rb +46 -0
- data/lib/biointerchange/genomics/vcf_feature_set.rb +14 -0
- data/lib/biointerchange/genomics/vcf_reader.rb +238 -0
- data/lib/biointerchange/gfvo.rb +689 -553
- data/lib/biointerchange/life_science_registry.rb +3595 -0
- data/lib/biointerchange/textmining/text_mining_rdf_ntriples.rb +33 -35
- data/lib/biointerchange/writer.rb +11 -16
- data/make.sh +4 -0
- data/spec/exceptions_spec.rb +1 -7
- data/spec/gff3_rdfwriter_spec.rb +2 -16
- data/spec/gvf_rdfwriter_spec.rb +2 -19
- data/spec/phylogenetics_spec.rb +1 -13
- data/spec/text_mining_pdfx_xml_reader_spec.rb +1 -13
- data/spec/text_mining_pubannos_json_reader_spec.rb +1 -14
- data/spec/text_mining_rdfwriter_spec.rb +8 -19
- data/test.sh +4 -0
- data/web/about.html +10 -14
- data/web/api.html +11 -13
- data/web/bootstrap/css/bootstrap-theme.css +347 -0
- data/web/bootstrap/css/bootstrap-theme.css.map +1 -0
- data/web/bootstrap/css/bootstrap-theme.min.css +7 -0
- data/web/bootstrap/css/bootstrap.css +4764 -4603
- data/web/bootstrap/css/bootstrap.css.map +1 -0
- data/web/bootstrap/css/bootstrap.min.css +6 -8
- data/web/bootstrap/fonts/glyphicons-halflings-regular.eot +0 -0
- data/web/bootstrap/fonts/glyphicons-halflings-regular.svg +229 -0
- data/web/bootstrap/fonts/glyphicons-halflings-regular.ttf +0 -0
- data/web/bootstrap/fonts/glyphicons-halflings-regular.woff +0 -0
- data/web/bootstrap/js/bootstrap.js +1372 -1448
- data/web/bootstrap/js/bootstrap.min.js +5 -5
- data/web/cli.html +14 -28
- data/web/index.html +15 -33
- data/web/ontologies.html +1089 -945
- data/web/webservices.html +12 -14
- metadata +24 -27
- data/lib/biointerchange/gff3o.rb +0 -525
- data/lib/biointerchange/gvf1o.rb +0 -1354
- data/web/bootstrap/css/bootstrap-responsive.css +0 -1040
- data/web/bootstrap/css/bootstrap-responsive.min.css +0 -9
- data/web/bootstrap/img/glyphicons-halflings-white.png +0 -0
- data/web/bootstrap/img/glyphicons-halflings.png +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0b66ff1fc16121ae1bd423d5f06bf212b00cdd5c
|
4
|
+
data.tar.gz: 93bdd0303b40580eb5f70f17db8aad93228fc11d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2b8adeeb8584d3d09729faeb6475dfd9be4adc3f568fdb8a1e4e654c605126384574344e6235f831609a9fa6a61f24ef87dbfff579153dab0f6e406887208584
|
7
|
+
data.tar.gz: f709f45089d6cc469884f9e182752fda0be6235762e65857fa1f8b2f0bca72e392618a8b7da142c0ed763c24526195d126a07a709c55a30ce152caf44b5c31b8
|
data/.travis.yml
CHANGED
@@ -2,11 +2,9 @@ language: ruby
|
|
2
2
|
rvm:
|
3
3
|
- 1.9.2
|
4
4
|
- 1.9.3
|
5
|
+
- 2.0.0
|
6
|
+
- 2.1.1
|
5
7
|
- jruby-19mode # JRuby in 1.9 mode
|
6
|
-
# - rbx-19mode
|
7
|
-
- 1.8.7
|
8
|
-
- jruby-18mode # JRuby in 1.8 mode
|
9
|
-
# - rbx-18mode
|
10
8
|
|
11
9
|
# uncomment this line if your project needs to run something other than `rake`:
|
12
10
|
# script: bundle exec rspec spec
|
data/Gemfile
CHANGED
@@ -2,10 +2,10 @@ source "http://rubygems.org"
|
|
2
2
|
# Add dependencies required to use your gem here.
|
3
3
|
# Example:
|
4
4
|
# gem "activesupport", ">= 2.3.5"
|
5
|
-
gem "rdf", ">=
|
5
|
+
gem "rdf", ">= 1.1.4.3"
|
6
6
|
gem "json", ">= 1.6.4"
|
7
7
|
gem "getopt", ">= 1.4.1"
|
8
|
-
gem "addressable", ">= 2.3.
|
8
|
+
gem "addressable", ">= 2.3.6"
|
9
9
|
gem "bio", ">= 1.4.2"
|
10
10
|
|
11
11
|
# Add dependencies to develop your gem here.
|
@@ -14,6 +14,5 @@ group :development do
|
|
14
14
|
gem "rspec", "~> 2.8.0"
|
15
15
|
gem "bundler", ">= 1.1.5"
|
16
16
|
gem "jeweler", "~> 1.8.4"
|
17
|
-
gem "bio", ">= 1.4.2"
|
18
17
|
gem "rdoc", "~> 3.12"
|
19
18
|
end
|
data/README.md
CHANGED
@@ -21,14 +21,12 @@ Ontologies used in the RDF output:
|
|
21
21
|
|
22
22
|
* [Comparative Data Analysis Ontology](http://sourceforge.net/apps/mediawiki/cdao/index.php?title=Main_Page) (CDAO)
|
23
23
|
* [Friend of a Friend](http://xmlns.com/foaf/spec) (FOAF)
|
24
|
-
* [Generic Feature Format Version 3 Ontology](http://www.biointerchange.org/ontologies.html) (GFF3O)
|
25
|
-
* [Genome Variation Format Version 1 Ontology](http://www.biointerchange.org/ontologies.html) (GVF1O)
|
26
24
|
* [Genomic Feature and Variation Ontology](http://www.biointerchange.org/ontologies.html) (GFVO)
|
27
25
|
* [Semanticscience Integrated Ontology](http://code.google.com/p/semanticscience/wiki/SIO) (SIO)
|
28
26
|
* [Sequence Ontology](http://www.sequenceontology.org/index.html) (SO)
|
29
27
|
* [Sequence Ontology Feature Annotation](http://www.sequenceontology.org/index.html) (SOFA)
|
30
28
|
|
31
|
-
*Note:*
|
29
|
+
*Note:* GFVO replaces the [Generic Feature Format Version 3 Ontology](http://www.biointerchange.org/ontologies.html) (GFF3O) and [Genome Variation Format Version 1 Ontology](http://www.biointerchange.org/ontologies.html) (GVF1O).
|
32
30
|
|
33
31
|
#### Contributing
|
34
32
|
|
@@ -54,7 +52,7 @@ BioInterchange's command-line tool `biointerchange` can be installed as a comman
|
|
54
52
|
|
55
53
|
Examples:
|
56
54
|
|
57
|
-
biointerchange --input biointerchange.gvf --rdf rdf.biointerchange.
|
55
|
+
biointerchange --input biointerchange.gvf --rdf rdf.biointerchange.gfvo --file examples/estd176_Banerjee_et_al_2011.2012-11-29.NCBI36.gvf
|
58
56
|
biointerchange --input dbcls.catanns.json --rdf rdf.bh12.sio --file examples/pubannotation.10096561.json --annotate_name 'Peter Smith' --annotate_name_id 'peter.smith@example.com'
|
59
57
|
biointerchange --input uk.ac.man.pdfx --rdf rdf.bh12.sio --file examples/gb-2007-8-3-R40.xml --annotate_name 'Peter Smith' --annotate_name_id 'peter.smith@example.com'
|
60
58
|
biointerchange --input phylotastic.newick --rdf rdf.phylotastic.newick --file examples/tree2.new --annotate_date '1 June 2006'
|
@@ -69,8 +67,7 @@ Input formats:
|
|
69
67
|
|
70
68
|
Output formats:
|
71
69
|
|
72
|
-
* `rdf.biointerchange.
|
73
|
-
* `rdf.biointerchange.gvf`
|
70
|
+
* `rdf.biointerchange.gfvo`
|
74
71
|
* `rdf.bh12.sio`
|
75
72
|
* `rdf.phylotastic.newick`
|
76
73
|
|
@@ -93,15 +90,6 @@ To list all `seqid` entries from a GVF-file conversion in the store, the followi
|
|
93
90
|
|
94
91
|
testrepo> sparql select * where { ?s <http://www.biointerchange.org/gvf1o#GVF1_0004> ?o } .
|
95
92
|
|
96
|
-
#### Data Consistency Verification
|
97
|
-
|
98
|
-
Data consistency is verifyable for the output formats `rdf.biointerchange.gff3` and `rdf.biointerchange.gvf` using the [BioInterchange ontologies](http://www.biointerchange.org/ontologies.html) GFF3O and GVF1O. The following is an example of how [Jena](http://jena.apache.org)'s command line tools and the [HermiT reasoner](http://hermit-reasoner.com) can be used for conistency verification:
|
99
|
-
|
100
|
-
rdfcat <path-to-gff3o/gvf1o> <yourdata.n3> > merged.xml
|
101
|
-
java -d64 -Xmx4G -jar HermiT.jar -k -v merged.xml
|
102
|
-
|
103
|
-
Another approach is to load the data and its related GFF3O/GVF1O ontology into [Protege](http://protege.stanford.edu), merge them, and then use the "Explain inconsistent ontology" menu item to inspect possible data inconsistencies.
|
104
|
-
|
105
93
|
#### Example Data Provenance
|
106
94
|
|
107
95
|
The following list provides information on the origin of the example-data files in the `examples` directory:
|
@@ -110,8 +98,16 @@ The following list provides information on the origin of the example-data files
|
|
110
98
|
* `BovineGenomeChrX.gff3.gz`: Gzipped GFF3 file describing a Bos taurus chromosome X. Downloaded from [http://bovinegenome.org/?q=download_chromosome_gff3](http://bovinegenome.org/?q=download_chromosome_gff3)
|
111
99
|
* `chromosome_BF.gff`: GFF3 file of floating contigs from the Baylor Sequencing Centre. Downloaded from [http://dictybase.org/Downloads](http://dictybase.org/Downloads)
|
112
100
|
* `estd176_Banerjee_et_al_2011.2012-11-29.NCBI36.gvf`: GVF file of EBI's [DGVa](http://www.ebi.ac.uk/dgva/database-genomic-variants-archive). Downloaded from [ftp://ftp.ebi.ac.uk/pub/databases/dgva/estd176_Banerjee_et_al_2011/gvf/estd176_Banerjee_et_al_2011.2012-11-29.NCBI36.gvf](ftp://ftp.ebi.ac.uk/pub/databases/dgva/estd176_Banerjee_et_al_2011/gvf/estd176_Banerjee_et_al_2011.2012-11-29.NCBI36.gvf)
|
101
|
+
* `Felis_catus.gvf.gz`: Gzipped GVF file of F. catus genomic variations provided by the Ensembl project. Downloaded from [ftp://ftp.ensembl.org/pub/release-71/variation/gvf/felis_catus/Felis_catus.gvf.gz](ftp://ftp.ensembl.org/pub/release-71/variation/gvf/felis_catus/Felis_catus.gvf.gz)
|
102
|
+
* `Felis_catus_incl_consequences.vcf.gz`: Gzipped VCF file of F. catus genomic variations provided by the Ensembl project. Downloaded from [ftp://ftp.ensembl.org/pub/current_variation/vcf/felis_catus/Felis_catus_incl_consequences.vcf.gz](ftp://ftp.ensembl.org/pub/current_variation/vcf/felis_catus/Felis_catus_incl_consequences.vcf.gz)
|
113
103
|
* `gb-2007-8-3-R40.xml`: Generated by [PDFx](http://pdfx.cs.man.ac.uk) from open-access source PDF [Sense-antisense pairs in mammals: functional and evolutionary considerations](http://genomebiology.com/content/pdf/gb-2007-8-3-r40.pdf)
|
114
|
-
* `Saccharomyces_cerevisiae_incl_consequences.gvf.gz`: Downloaded from [ftp://ftp.ensembl.org/pub/release-71/variation/gvf/saccharomyces_cerevisiae/Saccharomyces_cerevisiae_incl_consequences.gvf.gz](ftp://ftp.ensembl.org/pub/release-71/variation/gvf/saccharomyces_cerevisiae/Saccharomyces_cerevisiae_incl_consequences.gvf.gz)
|
104
|
+
* `Saccharomyces_cerevisiae_incl_consequences.gvf.gz`: Gzipped GVF file of S. cerevisiae genomic variations provided by the Ensembl project. Downloaded from [ftp://ftp.ensembl.org/pub/release-71/variation/gvf/saccharomyces_cerevisiae/Saccharomyces_cerevisiae_incl_consequences.gvf.gz](ftp://ftp.ensembl.org/pub/release-71/variation/gvf/saccharomyces_cerevisiae/Saccharomyces_cerevisiae_incl_consequences.gvf.gz)
|
105
|
+
|
106
|
+
#### Additional Example Data
|
107
|
+
|
108
|
+
The following lists data that has been used in testing BioInterchange's implementation, but was not included in the GitHub repository due to its size:
|
109
|
+
|
110
|
+
* `mgp.v3.indels.rsIDdbSNPv137.vcf.gz`: Gzipped VCF file of M. musculus indels by the Sanger Institute. Downloaded from [ftp://ftp-mouse.sanger.ac.uk/current_snps/mgp.v3.indels.rsIDdbSNPv137.vcf.gz](ftp://ftp-mouse.sanger.ac.uk/current_snps/mgp.v3.indels.rsIDdbSNPv137.vcf.gz)
|
115
111
|
|
116
112
|
### Application Programming Interface
|
117
113
|
|
@@ -322,9 +318,10 @@ The writer takes an object model and serializes it via the `BioInterchange::Writ
|
|
322
318
|
# Serialize a model as RDF.
|
323
319
|
#
|
324
320
|
# +model+:: a generic representation of input data that is an instance of BioInterchange::Phylogenetics::TreeSet
|
325
|
-
|
321
|
+
# +uri_prefix+:: optional URI prefix that should be used in the RDFization of individuals/class instances
|
322
|
+
def serialize(model, uri_prefix = nil)
|
326
323
|
model.contents.each { |tree|
|
327
|
-
serialize_model(model, tree)
|
324
|
+
serialize_model(model, tree, uri_prefix)
|
328
325
|
}
|
329
326
|
end
|
330
327
|
|
@@ -501,8 +498,7 @@ RDFization parameters and data are send as a single HTTP POST requests containin
|
|
501
498
|
* `phylotastic.newick`: [Newick](http://evolution.genetics.washington.edu/phylip/newicktree.html)
|
502
499
|
* `uk.ac.man.pdfx`: [PDFx](http://pdfx.cs.man.ac.uk) XML
|
503
500
|
* `OUTPUT_METHOD`: determines the RDFization method that should be used, output will always be RDF N-Triples; available output formats are
|
504
|
-
* `rdf.biointerchange.
|
505
|
-
* `rdf.biointerchange.gvf`: RDFization of `biointerchange.gvf`
|
501
|
+
* `rdf.biointerchange.gfvo`: RDFization of `biointerchange.gff3` or `biointerchange.gvf`
|
506
502
|
* `rdf.bh12.sio`: RDFization of `dbcls.catanns.json` or `uk.ac.man.pdfx`
|
507
503
|
* `rdf.phylotastic.newick`: RDFization of `phylotastic.newick`
|
508
504
|
* `URL_ENCODED_DATA`: data for RDFization as [URL encoded](http://en.wikipedia.org/wiki/Percent-encoding) string
|
@@ -590,6 +586,13 @@ A Geno Ontology external reference (GOxref) vocabulary can be created by directl
|
|
590
586
|
curl ftp://ftp.geneontology.org/pub/go/doc/GO.xrf_abbs | ruby generators/GOxrefify.rb
|
591
587
|
echo -e "\nend" >> lib/biointerchange/goxref.rb
|
592
588
|
|
589
|
+
Building an external reference vocabulary based on Life Science Registry external database abbreviations (based on download of the
|
590
|
+
Life Science registry spreadsheet as TSV file):
|
591
|
+
|
592
|
+
echo -e "module BioInterchange\n" > lib/biointerchange/life_science_registry.rb
|
593
|
+
cut -f 1,25 <path-to-registry-tsv-file> | grep -E 'https?://.*\$id' | ruby generators/tsv2rubyclass.rb LifeScienceRegistry >> lib/biointerchange/life_science_registry.rb
|
594
|
+
echo -e "\nend" >> lib/biointerchange/life_science_registry.rb
|
595
|
+
|
593
596
|
#### Python Vocabulary Classes
|
594
597
|
|
595
598
|
The source-code generation can be skipped, if none of the ontologies that are used by BioInterchange have been changed. Otherwise, the existing Python vocabulary class wrappers can be generated as follows:
|
@@ -630,14 +633,25 @@ The following Java packages will automatically install alongside BioInterchange'
|
|
630
633
|
|
631
634
|
### Gem Bundling/Installing
|
632
635
|
|
636
|
+
Mac OS X prerequisites and `bundle install` difference:
|
637
|
+
|
638
|
+
sudo port install libxml2 libxslt
|
639
|
+
sudo ARCHFLAGS=-Wno-error=unused-command-line-argument-hard-error-in-future bundle install
|
640
|
+
|
641
|
+
Actual gem bundling:
|
642
|
+
|
633
643
|
bundle exec rake gemspec
|
634
644
|
bundle exec gem build biointerchange.gemspec
|
635
|
-
sudo bundle exec gem install biointerchange
|
645
|
+
sudo bundle exec gem install biointerchange-`cat VERSION`.gem
|
636
646
|
|
637
647
|
If you encounter problems with gem dependencies, then you can try to explicitly use Ruby 1.9:
|
638
648
|
|
639
649
|
bundle exec gem1.9 build biointerchange.gemspec
|
640
|
-
sudo bundle exec gem1.9 install biointerchange
|
650
|
+
sudo bundle exec gem1.9 install biointerchange-`cat VERSION`.gem
|
651
|
+
|
652
|
+
Alternative build script, `make.sh`, which installs the gem without RDocs and ri pages (quicker when testing):
|
653
|
+
|
654
|
+
./make.sh
|
641
655
|
|
642
656
|
### Unit Testing
|
643
657
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
1.0.
|
1
|
+
1.0.2
|
Binary file
|
Binary file
|
data/generators/rdfxml.rb
CHANGED
@@ -19,7 +19,7 @@ SIO_SYN = RDF::URI.new('http://semanticscience.org/resource/synonym')
|
|
19
19
|
# This label conversion also appears in:
|
20
20
|
# +lib/biointerchange/core.rb+
|
21
21
|
def make_safe_label(label)
|
22
|
-
label.gsub(/[ '-.<>\/]/, '_').gsub(/\([^\)]*?\)/, '').sub(/^(\d+)
|
22
|
+
label.gsub(/[ '-.<>\/]/, '_').gsub(/\([^\)]*?\)/, '').sub(/^(\d+)/){ "a_#{$1}" }.gsub(/^_+|_+$/, '').gsub(/_+/, '_').gsub(/_([A-Z])+/x){ "#{$1}" }
|
23
23
|
end
|
24
24
|
|
25
25
|
reader = RDF::RDFXML::Reader.open(ARGV[0])
|
@@ -0,0 +1,31 @@
|
|
1
|
+
#!/usr/bin/ruby

# Generates a Ruby class from a two-column TSV file read from STDIN: every row
# becomes a class method that is named after the (sanitized) first column and
# that returns the second column as a string. The class is written to STDOUT.

require 'biointerchange'

unless ARGV.length == 1
  puts 'Usage: tsv2rubyclass rubyclassname'
  puts ''
  puts 'Reads a TSV file from STDIN, where the first column values become'
  puts 'method names (sanitized for spaces, etc.) in the class and the'
  puts 'second column values are returned as a string.'
  puts ''
  puts 'The generated Ruby class is output on STDOUT.'
  exit 1
end

classname = ARGV[0]

puts "class #{classname}"
puts ''

STDIN.each_line { |line|
  key, value = line.chomp.split("\t")

  # Blank lines and rows without a first column cannot be turned into a
  # method name; skip them instead of failing inside make_safe_label.
  next if key.nil? || key.empty?

  puts " def self.#{BioInterchange.make_safe_label(key)}"
  # String#inspect produces a properly escaped double-quoted literal, so that
  # quotes, backslashes or '#{' in the TSV value cannot break (or inject code
  # into) the generated class. A missing second column yields "".
  puts " #{value.to_s.inspect}"
  puts ' end'
  puts ''
}

puts 'end'
|
31
|
+
|
data/lib/biointerchange/core.rb
CHANGED
@@ -34,13 +34,13 @@ module BioInterchange
|
|
34
34
|
# Custom Exceptions and Errors
|
35
35
|
require 'biointerchange/exceptions'
|
36
36
|
|
37
|
-
# Ontologies (besides the ones from the 'rdf' gem)
|
37
|
+
# Ontologies (besides the ones from the 'rdf' gem), vocabularies and
|
38
|
+
# other mappings (e.g., database abbreviations to URIs):
|
39
|
+
require 'biointerchange/life_science_registry'
|
38
40
|
require 'biointerchange/cdao'
|
39
41
|
require 'biointerchange/faldo'
|
40
|
-
require 'biointerchange/gff3o'
|
41
42
|
require 'biointerchange/gfvo'
|
42
43
|
require 'biointerchange/goxref'
|
43
|
-
require 'biointerchange/gvf1o'
|
44
44
|
require 'biointerchange/sio'
|
45
45
|
require 'biointerchange/so'
|
46
46
|
require 'biointerchange/sofa'
|
@@ -105,6 +105,18 @@ module BioInterchange
|
|
105
105
|
# Writer
|
106
106
|
# ...same GFF3 writer
|
107
107
|
|
108
|
+
### VCF ###
|
109
|
+
|
110
|
+
# Reader
|
111
|
+
require 'biointerchange/genomics/vcf_reader'
|
112
|
+
|
113
|
+
# Feature base model
|
114
|
+
require 'biointerchange/genomics/vcf_feature_set'
|
115
|
+
require 'biointerchange/genomics/vcf_feature'
|
116
|
+
|
117
|
+
# Writer
|
118
|
+
# ...same GFF3 writer
|
119
|
+
|
108
120
|
#
|
109
121
|
# PHYLOGENETICS
|
110
122
|
#
|
@@ -225,7 +237,7 @@ module BioInterchange
|
|
225
237
|
'input' => opt['input'],
|
226
238
|
'output' => opt['output']
|
227
239
|
}
|
228
|
-
map['
|
240
|
+
map['batch_size'] = opt['batchsize'].to_i if opt['batchsize']
|
229
241
|
opt.each_key { |key|
|
230
242
|
map[key.sub(/^annotate_/, '')] = opt[key] if key.start_with?('annotate_')
|
231
243
|
}
|
@@ -296,7 +308,7 @@ module BioInterchange
|
|
296
308
|
#
|
297
309
|
# +label+:: string that should be converted into a "safe" string that can be used as a Ruby method name
|
298
310
|
def self.make_safe_label(label)
|
299
|
-
label.gsub(/[ '-.<>\/]/, '_').gsub(/\([^\)]*?\)/, '').sub(/^(\d+)
|
311
|
+
label.gsub(/[ '-.<>\/]/, '_').gsub(/\([^\)]*?\)/, '').sub(/^(\d+)/){ "a_#{$1}" }.gsub(/^_+|_+$/, '').gsub(/_+/, '_').gsub(/_([A-Z])+/x){ "#{$1}" }
|
300
312
|
end
|
301
313
|
|
302
314
|
private
|
@@ -4,11 +4,12 @@ require 'date'
|
|
4
4
|
|
5
5
|
module BioInterchange::Genomics
|
6
6
|
|
7
|
-
# Serializes GFF3 and
|
7
|
+
# Serializes GFF3, GVF and VCF models.
|
8
8
|
#
|
9
9
|
# Inputs:
|
10
10
|
# - biointerchange.gff3
|
11
11
|
# - biointerchange.gvf
|
12
|
+
# - biointerchange.vcf
|
12
13
|
#
|
13
14
|
# Outputs:
|
14
15
|
# - rdf.biointerchange.gfvo
|
@@ -18,14 +19,7 @@ class RDFWriter < BioInterchange::Writer
|
|
18
19
|
BioInterchange::Registry.register_writer(
|
19
20
|
'rdf.biointerchange.gfvo',
|
20
21
|
BioInterchange::Genomics::RDFWriter,
|
21
|
-
[ 'biointerchange.gff3' ],
|
22
|
-
true,
|
23
|
-
'Genomic Feature and Variation Ontology (GFVO) based RDFization'
|
24
|
-
)
|
25
|
-
BioInterchange::Registry.register_writer(
|
26
|
-
'rdf.biointerchange.gfvo',
|
27
|
-
BioInterchange::Genomics::RDFWriter,
|
28
|
-
[ 'biointerchange.gvf' ],
|
22
|
+
[ 'biointerchange.gff3', 'biointerchange.gvf', 'biointerchange.vcf' ],
|
29
23
|
true,
|
30
24
|
'Genomic Feature and Variation Ontology (GFVO) based RDFization'
|
31
25
|
)
|
@@ -34,8 +28,7 @@ class RDFWriter < BioInterchange::Writer
|
|
34
28
|
#
|
35
29
|
# +ostream+:: instance of an IO class or derivative that is used for RDF serialization
|
36
30
|
def initialize(ostream)
|
37
|
-
|
38
|
-
@ostream = ostream
|
31
|
+
super(ostream)
|
39
32
|
end
|
40
33
|
|
41
34
|
# Serialize a model as RDF.
|
@@ -47,10 +40,12 @@ class RDFWriter < BioInterchange::Writer
|
|
47
40
|
@format = :gff3
|
48
41
|
elsif model.instance_of?(BioInterchange::Genomics::GVFFeatureSet) then
|
49
42
|
@format = :gvf
|
43
|
+
elsif model.instance_of?(BioInterchange::Genomics::VCFFeatureSet) then
|
44
|
+
@format = :vcf
|
50
45
|
else
|
51
46
|
raise BioInterchange::Exceptions::ImplementationWriterError, 'The provided model cannot be serialized. ' +
|
52
|
-
'This writer supports serialization for BioInterchange::Genomics::GFF3FeatureSet
|
53
|
-
'BioInterchange::Genomics::GVFFeatureSet.'
|
47
|
+
'This writer supports serialization for BioInterchange::Genomics::GFF3FeatureSet, ' +
|
48
|
+
'BioInterchange::Genomics::GVFFeatureSet and BioInterchange::Genomics::VCFFeatureSet.'
|
54
49
|
end
|
55
50
|
@base = BioInterchange::GFVO
|
56
51
|
serialize_model(model, uri_prefix)
|
@@ -71,15 +66,24 @@ protected
|
|
71
66
|
# Record written variants in order to avoid writing out RDF.type multiple times.
|
72
67
|
@variants = {}
|
73
68
|
|
69
|
+
# Set up "matchers" that can be used to match a number of attributes of a feature, and then,
|
70
|
+
# link out to an entity that says something about that combination of attributes. Used for
|
71
|
+
# GVF #data-source, etc., pragmas and VCF filters.
|
72
|
+
@matchers = []
|
73
|
+
|
74
74
|
# Create a URI prefix that applies to the set, all features in the set, and all the features' annotations.
|
75
75
|
# Then register the prefix with the writer to have a concise Turtle output.
|
76
76
|
set_uri = set_uri[0..-2] if set_uri and set_uri.end_with?('/')
|
77
77
|
set_uri = RDF::URI.new(model.uri) unless set_uri
|
78
78
|
set_base(set_uri + '/')
|
79
79
|
|
80
|
-
|
80
|
+
add_prefix('http://biohackathon.org/resource/faldo#', 'faldo')
|
81
|
+
add_prefix('http://www.biointerchange.org/gfvo#', 'gfvo')
|
82
|
+
add_prefix('http://semanticscience.org/resource/', 'sio')
|
83
|
+
|
84
|
+
create_triple(set_uri, RDF.type, @base.File)
|
81
85
|
model.pragmas.each { |pragma_name|
|
82
|
-
serialize_pragma(set_uri, model.pragma(pragma_name))
|
86
|
+
serialize_pragma(set_uri, pragma_name, model.pragma(pragma_name))
|
83
87
|
}
|
84
88
|
model.contents.each { |feature|
|
85
89
|
if feature.instance_of?(BioInterchange::Genomics::GFF3FeatureSequence) then
|
@@ -89,64 +93,162 @@ protected
|
|
89
93
|
end
|
90
94
|
}
|
91
95
|
close
|
92
|
-
#RDF::NTriples::Writer.dump(graph, @ostream)
|
93
|
-
# TODO Figure out why the following is very slow. Use with 'rdf-raptor'.
|
94
|
-
# Having said that, Jena's rdfcat is very good for converting formats
|
95
|
-
# anyway, so perhaps it is not worth investigating the following.
|
96
|
-
# RDF::RDFXML::Writer.dump(graph, @ostream)
|
97
96
|
end
|
98
97
|
|
99
98
|
# Serializes pragmas for a given feature set URI.
|
100
99
|
#
|
101
100
|
# +set_uri+:: the feature set URI to which the pragmas belong
|
101
|
+
# +name+:: name of the pragma statement
|
102
102
|
# +pragma+:: an object representing a pragma statement
|
103
|
-
def serialize_pragma(set_uri, pragma)
|
103
|
+
def serialize_pragma(set_uri, name, pragma)
|
104
104
|
if pragma.kind_of?(Hash) then
|
105
105
|
if (pragma.has_key?('attribute-method') or pragma.has_key?('data-source') or pragma.has_key?('score-method') or pragma.has_key?('source-method') or pragma.has_key?('technology-platform')) then
|
106
106
|
serialize_structured_attribute(set_uri, pragma)
|
107
107
|
elsif pragma.has_key?('gff-version') then
|
108
|
-
create_triple(set_uri, @base.
|
108
|
+
create_triple(set_uri.to_s, @base.has_identifier, RDF::URI.new("#{set_uri}/version"))
|
109
|
+
create_triple("#{set_uri}/version", RDF.type, @base.Version)
|
110
|
+
create_triple("#{set_uri}/version", @base.has_value, "gff-version #{pragma['gff-version']}")
|
109
111
|
elsif pragma.has_key?('gvf-version') then
|
110
|
-
create_triple(set_uri,
|
112
|
+
create_triple("#{set_uri}/version", RDF.type, @base.Version)
|
113
|
+
create_triple("#{set_uri}/version", @base.has_value, "gvf-version #{pragma['gvf-version']}")
|
114
|
+
elsif pragma.has_key?('fileformat') then
|
115
|
+
create_triple("#{set_uri}/version", RDF.type, @base.Version)
|
116
|
+
create_triple("#{set_uri}/version", @base.has_value, "fileformat #{pragma['fileformat']}")
|
111
117
|
elsif pragma.has_key?('sequence-region') then
|
112
118
|
pragma['sequence-region'].keys.each { |seqid|
|
113
119
|
serialize_landmark(set_uri, pragma['sequence-region'][seqid])
|
114
120
|
}
|
115
121
|
elsif pragma.has_key?('species') then
|
116
|
-
create_triple(set_uri, @base.
|
122
|
+
create_triple(set_uri, @base.is_about, RDF::URI.new(pragma['species']))
|
123
|
+
# VCF section:
|
124
|
+
# ...TODO
|
125
|
+
# Everything else:
|
126
|
+
else
|
127
|
+
end
|
128
|
+
elsif pragma.kind_of?(Array) then
|
129
|
+
# VCF section:
|
130
|
+
basic_vcf_mappings = {
|
131
|
+
'ID' => @base.Identifier,
|
132
|
+
'Description' => @base.Comment,
|
133
|
+
'Number' => @base.InformationContentEntity, # Note: not just an integer; can be also 'A', 'G', and '.'
|
134
|
+
'Type' => @base.InformationContentEntity # Can be 'Integer', 'Float', 'Character', 'String'
|
135
|
+
}
|
136
|
+
if name == 'FILTER' then
|
137
|
+
pragma.each { |assignment|
|
138
|
+
pragma_uri = serialize_vcf_pragma(set_uri, "filter/#{assignment['ID']}", @base.VariantCalling, basic_vcf_mappings, assignment)
|
139
|
+
create_triple(set_uri, @base.is_participant_in, pragma_uri)
|
140
|
+
}
|
141
|
+
elsif name == 'FORMAT' then
|
142
|
+
pragma.each { |assignment|
|
143
|
+
pragma_uri = serialize_vcf_pragma(set_uri, "format/#{assignment['ID']}", @base.InformationContentEntity, basic_vcf_mappings, assignment)
|
144
|
+
create_triple(set_uri, @base.references, pragma_uri)
|
145
|
+
}
|
146
|
+
elsif name == 'INFO' then
|
147
|
+
pragma.each { |assignment|
|
148
|
+
pragma_uri = serialize_vcf_pragma(set_uri, "info/#{assignment['ID']}", @base.InformationContentEntity, basic_vcf_mappings, assignment)
|
149
|
+
create_triple(set_uri, @base.references, pragma_uri)
|
150
|
+
}
|
151
|
+
else
|
152
|
+
# TODO
|
117
153
|
end
|
118
154
|
else
|
119
155
|
# TODO
|
120
156
|
end
|
121
157
|
end
|
122
158
|
|
159
|
+
# Goes through "matchers" and links the feature if its attributes are present
|
160
|
+
# and equal to a "matcher's" data.
|
161
|
+
#
|
162
|
+
# (TODO: Update description of this method, because it is absolutely unclear
|
163
|
+
# what it actually does right now. Sorry.)
|
164
|
+
#
|
165
|
+
# +feature+:: the feature that provides attributes for matching
|
166
|
+
# +feature_uri+:: URI of the feature that is linked out to, if the feature's attributes match
|
167
|
+
def match_feature(feature, feature_uri)
  # Every entry of @matchers is a pair: a hash of constraints and the URI
  # ("linkout") that the feature is linked to when all constraints hold.
  # Constraint values only need to respond to include? (presumably lists of
  # accepted values -- confirm against the code that fills @matchers).
  @matchers.each { |match_constraints|
    constraints, linkout = match_constraints

    # No constraints means that *everything* matches (all? is true for an
    # empty hash).
    matches = constraints.all? { |key, value|
      case key
      when 'Seqid'
        value.include?(feature.sequence_id)
      when 'Source'
        value.include?(feature.source)
      when 'Type'
        value.include?(feature.type)
      else
        # Any other key is looked up among the feature's attributes: a single
        # accepted attribute value suffices, a missing attribute never matches.
        feature.attributes.has_key?(key) &&
          feature.attributes[key].any? { |attribute_value| value.include?(attribute_value) }
      end
    }

    # Add the linkout only if there is a match. The match result used to be
    # ignored, which linked every feature to every matcher's linkout.
    create_triple(feature_uri, @base.has_source, RDF::URI.new(linkout)) if matches
  }
end
|
197
|
+
|
123
198
|
# Serializes a +GFF3Feature+ object for a given feature set URI.
|
124
199
|
#
|
125
200
|
# +set_uri+:: the feature set URI to which the feature belongs
|
126
201
|
# +feature+:: a +GFF3Feature+ instance
|
127
202
|
def serialize_feature(set_uri, feature)
|
128
203
|
# TODO Make sure there is only one value in the 'ID' list.
|
129
|
-
|
130
|
-
|
131
|
-
|
204
|
+
# TODO Ponder about whether it would be possible to get the same URI for two distinct features (bad thing).
|
205
|
+
source = ''
|
206
|
+
source = "#{feature.source}," if feature.source
|
207
|
+
type = ''
|
208
|
+
type = "#{feature.type.to_s.sub(/^[^:]+:\/\//, '')}," if feature.type
|
209
|
+
phase = ",#{feature.phase}" if feature.phase
|
210
|
+
if feature.attributes.has_key?('ID') or feature.attributes.has_key?(' id') then
|
211
|
+
feature_id = 'ID'
|
212
|
+
feature_id = ' id' if feature.attributes.has_key?(' id')
|
213
|
+
feature_uri = RDF::URI.new("#{set_uri.to_s}/feature/#{feature.attributes[feature_id][0]}")
|
214
|
+
else
|
215
|
+
feature_uri = RDF::URI.new("#{set_uri.to_s}/feature/#{feature.sequence_id},#{source}#{type}#{feature.start_coordinate},#{feature.end_coordinate},#{feature.strand}#{phase}")
|
216
|
+
end
|
217
|
+
|
218
|
+
create_triple(set_uri, @base.has_member, feature_uri)
|
132
219
|
create_triple(feature_uri, RDF.type, @base.Feature)
|
220
|
+
create_triple(feature_uri, RDF.type, feature.type) if feature.type
|
221
|
+
match_feature(feature, feature_uri)
|
133
222
|
serialize_landmark(set_uri, GFF3Landmark.new(feature.sequence_id)) unless @landmarks.has_key?(feature.sequence_id)
|
134
|
-
create_triple(feature_uri, @base.
|
135
|
-
create_triple(feature_uri, @base.
|
136
|
-
create_triple(feature_uri,
|
137
|
-
create_triple(feature_uri, @base.
|
223
|
+
create_triple(feature_uri, @base.is_located_on, RDF::URI.new(@landmarks[feature.sequence_id]))
|
224
|
+
create_triple(feature_uri, @base.is_created_by, RDF::URI.new("#{feature_uri}/source"))
|
225
|
+
create_triple("#{feature_uri}/source", RDF.type, @base.ExperimentalMethod)
|
226
|
+
create_triple("#{feature_uri}/source", @base.has_value, feature.source) if feature.source
|
227
|
+
if feature.phase then
|
228
|
+
create_triple(feature_uri, @base.has_quality, RDF::URI.new("#{feature_uri}/phase"))
|
229
|
+
create_triple("#{feature_uri}/phase", RDF.type, @base.CodingFrameOffset)
|
230
|
+
create_triple("#{feature_uri}/phase", @base.has_value, feature.phase)
|
231
|
+
end
|
138
232
|
|
139
|
-
|
233
|
+
create_triple(feature_uri, @base.has_part, RDF::URI.new("#{feature_uri}/locus"))
|
234
|
+
create_triple("#{feature_uri}/locus", RDF.type, @base.Locus)
|
235
|
+
create_triple("#{feature_uri}/locus", @base.has_attribute, RDF::URI.new("#{feature_uri}/locus/region"))
|
236
|
+
serialize_coordinate(set_uri, "#{feature_uri}/locus", feature)
|
140
237
|
serialize_attributes(set_uri, feature_uri, feature.attributes) unless feature.attributes.keys.empty?
|
141
238
|
end
|
142
239
|
|
240
|
+
# Serialize a feature's coordinates using FALDO.
|
241
|
+
#
|
242
|
+
# +set_uri+:: URI of the feature set that the feature belongs to
|
243
|
+
# +feature_uri+:: URI prefix of the feature
|
244
|
+
# +feature+:: object representation of the feature, which contains the locus that is described by this method
|
143
245
|
def serialize_coordinate(set_uri, feature_uri, feature)
|
144
246
|
region_uri = RDF::URI.new("#{feature_uri.to_s}/region")
|
145
247
|
start_position_uri = RDF::URI.new("#{feature_uri.to_s}/region/start")
|
146
248
|
end_position_uri = RDF::URI.new("#{feature_uri.to_s}/region/end")
|
147
249
|
#feature_object_properties = @base.feature_properties.select { |uri| @base.is_object_property?(uri) }[0]
|
148
|
-
##graph.insert(
|
149
|
-
create_triple(feature_uri, @base.
|
250
|
+
##graph.insert(BioInterchange::Statement.new(feature_uri, @base.with_parent([ @base.region ].flatten, feature_object_properties), region_uri))
|
251
|
+
create_triple(feature_uri, @base.is_located_on, region_uri)
|
150
252
|
create_triple(region_uri, RDF.type, BioInterchange::FALDO.Region)
|
151
253
|
# BIN STUFF
|
152
254
|
if false then
|
@@ -173,7 +275,23 @@ protected
|
|
173
275
|
end
|
174
276
|
create_triple(start_position_uri, BioInterchange::FALDO.position, feature.start_coordinate)
|
175
277
|
create_triple(end_position_uri, BioInterchange::FALDO.position, feature.end_coordinate)
|
176
|
-
|
278
|
+
if feature.score then
|
279
|
+
create_triple(feature_uri, @base.has_attribute, RDF::URI.new("#{feature_uri}/score"))
|
280
|
+
if @format == :gvf or @format == :vcf then
|
281
|
+
create_triple("#{feature_uri}/score", RDF.type, @base.PhredScore)
|
282
|
+
else
|
283
|
+
create_triple("#{feature_uri}/score", RDF.type, @base.Score)
|
284
|
+
end
|
285
|
+
create_triple("#{feature_uri}/score", @base.has_value, feature.score)
|
286
|
+
end
|
287
|
+
end
|
288
|
+
|
289
|
+
# Constructs a landmark URI based on set URI and the landmark's ID.
|
290
|
+
#
|
291
|
+
# +set_uri+:: the set URI to which the landmark belongs to
|
292
|
+
# +id+:: ID of the landmark
|
293
|
+
def landmark_uri(set_uri, id)
  # A landmark lives below its set: "<set URI>/landmark/<landmark ID>".
  # The result is a plain String; both arguments are converted via to_s.
  format('%s/landmark/%s', set_uri, id)
end
|
178
296
|
|
179
297
|
# Serializes a genomic feature landmark ("seqid").
|
@@ -182,12 +300,13 @@ protected
|
|
182
300
|
# +landmark+:: encapsulated landmark data
|
183
301
|
def serialize_landmark(set_uri, landmark)
|
184
302
|
return if @landmarks.has_key?(landmark.seqid)
|
185
|
-
landmark_uri =
|
303
|
+
landmark_uri = landmark_uri(set_uri, landmark.seqid)
|
186
304
|
region_uri = RDF::URI.new("#{landmark_uri.to_s}/region")
|
187
305
|
@landmarks[landmark.seqid] = landmark_uri
|
188
306
|
create_triple(landmark_uri, RDF.type, @base.Landmark)
|
189
|
-
create_triple(landmark_uri, @base.
|
190
|
-
create_triple(landmark_uri, @base.
|
307
|
+
create_triple(landmark_uri, @base.has_identifier, RDF::URI.new("#{landmark_uri}/id"))
|
308
|
+
create_triple("#{landmark_uri}/id", @base.has_value, landmark.seqid)
|
309
|
+
create_triple(landmark_uri, @base.has_attribute, region_uri)
|
191
310
|
if landmark.start_coordinate then
|
192
311
|
start_position_uri = RDF::URI.new("#{landmark_uri.to_s}/region/start")
|
193
312
|
create_triple(region_uri, BioInterchange::FALDO.begin, start_position_uri)
|
@@ -209,91 +328,92 @@ protected
|
|
209
328
|
attributes.each_pair { |tag, list|
|
210
329
|
# Check for defined tags (in alphabetical order), if not matched, serialize as generic Attribute:
|
211
330
|
if tag == 'Alias' then
|
212
|
-
list.
|
213
|
-
create_triple(feature_uri, @base.
|
331
|
+
list.each_index { |index|
|
332
|
+
create_triple(feature_uri, @base.has_attribute, RDF::URI.new("#{feature_uri}/alias/#{index}"))
|
333
|
+
create_triple("#{feature_uri}/alias/#{index}", RDF.type, @base.Alias)
|
334
|
+
create_triple("#{feature_uri}/alias/#{index}", @base.has_value, list[index])
|
214
335
|
}
|
215
336
|
elsif tag == 'Dbxref' then
|
216
337
|
list.each { |value|
|
217
338
|
begin
|
218
|
-
|
219
|
-
|
220
|
-
if value.match(/^dbSNP(_\d+)?:rs\d+$/) then
|
221
|
-
linkout = "http://www.ncbi.nlm.nih.gov/projects/SNP/snp_ref.cgi?rs=#{value.split(/:/)[1].sub(/^rs/, '')}"
|
222
|
-
elsif value.match(/^COSMIC(_\d+)?:COSM\d+$/) then
|
223
|
-
linkout = "http://cancer.sanger.ac.uk/cosmic/mutation/overview?id=#{value.split(/:/)[1].sub(/^COSM/, '')}"
|
224
|
-
else
|
225
|
-
abbreviation, id = value.split(':', 2)
|
226
|
-
linkout = BioInterchange::GOXRef.send(BioInterchange.make_safe_label(abbreviation)).to_s + id
|
227
|
-
end
|
228
|
-
# Second, and finally: add a triple to the graph in the right representative format depending on the ontology used
|
229
|
-
create_triple(feature_uri, @base.dbxref, linkout)
|
339
|
+
# Try to link the external database reference to a well-established URI:
|
340
|
+
serialize_dbxref(feature_uri, value)
|
230
341
|
rescue NoMethodError
|
231
|
-
# Preserve the Dbxref as a Literal:
|
342
|
+
# Not clear where to link to? Preserve the Dbxref as a Literal:
|
232
343
|
@dbxref = 0 if @dbxref == nil
|
233
344
|
literal_uri = RDF::URI.new("#{feature_uri.to_s}/attribute/dbxref/#{@dbxref}")
|
234
345
|
@dbxref += 1
|
235
|
-
create_triple(feature_uri, @base.
|
236
|
-
create_triple(literal_uri, RDF.type,
|
237
|
-
create_triple(literal_uri,
|
346
|
+
create_triple(feature_uri, @base.references, literal_uri)
|
347
|
+
create_triple(literal_uri, RDF.type, @base.ExternalReference)
|
348
|
+
create_triple(literal_uri, @base.has_value, value)
|
238
349
|
end
|
239
350
|
}
|
240
351
|
elsif tag == 'Derives_from' then
|
241
352
|
list.each { |value|
|
242
|
-
create_triple(feature_uri, @base.
|
353
|
+
create_triple(feature_uri, @base.is_temporarily_part_of, RDF::URI.new("#{set_uri.to_s}/feature/#{value}"))
|
243
354
|
}
|
244
355
|
elsif tag == 'Gap' then
|
245
356
|
# Handled by 'Target', because 'Gap' requires 'Target' to be present.
|
246
357
|
elsif tag == 'ID' then
|
247
358
|
list.each { |value|
|
248
|
-
create_triple(feature_uri, @base.
|
359
|
+
create_triple(feature_uri, @base.has_identifier, RDF::URI.new("#{feature_uri}/id"))
|
360
|
+
create_triple("#{feature_uri}/id", RDF.type, @base.Identifier)
|
361
|
+
create_triple("#{feature_uri}/id", @base.has_value, value)
|
249
362
|
}
|
250
363
|
elsif tag == 'Is_circular' then
|
251
364
|
value = list.join(',')
|
252
365
|
if value == 'true' then
|
253
|
-
create_triple(feature_uri, @base.
|
366
|
+
create_triple(feature_uri, @base.has_quality, @base.CircularHelix) if value == 'true'
|
254
367
|
elsif value == 'false' then
|
255
|
-
create_triple(feature_uri, @base.
|
368
|
+
create_triple(feature_uri, @base.is_circular, @base.WatsonCrickHelix) if value == 'false'
|
256
369
|
else
|
257
|
-
create_triple(feature_uri,
|
370
|
+
create_triple(feature_uri, BioInterchange::RDFS.comment, "Is_circular non-truth value: #{value}")
|
258
371
|
end
|
259
372
|
elsif tag == 'Name' then
|
260
373
|
list.each { |value|
|
261
|
-
create_triple(feature_uri, @base.
|
374
|
+
create_triple(feature_uri, @base.has_attribute, RDF::URI.new("#{feature_uri}/name"))
|
375
|
+
create_triple("#{feature_uri}/name", RDF.type, @base.Name)
|
376
|
+
create_triple("#{feature_uri}/name", @base.has_value, value)
|
262
377
|
}
|
263
378
|
elsif tag == 'Note' then
|
264
|
-
list.
|
265
|
-
create_triple(feature_uri, RDF::
|
379
|
+
list.each_index { |index|
|
380
|
+
create_triple(feature_uri, @base.has_annotation, RDF::URI.new("#{feature_uri}/note/#{index}"))
|
381
|
+
create_triple("#{feature_uri}/note/#{index}", RDF.type, @base.Note)
|
382
|
+
create_triple("#{feature_uri}/note/#{index}", @base.has_value, list[index])
|
266
383
|
}
|
267
384
|
elsif tag == 'Ontology_term' then
|
268
385
|
list.each { |value|
|
269
386
|
# TODO Sanitize values that are either not in GO xrf_abbs or need conversion to match
|
270
387
|
# match their associated Ruby method.
|
271
388
|
namespace, accession = value.split(/:/, 2)
|
272
|
-
create_triple(feature_uri, @base.
|
389
|
+
create_triple(feature_uri, @base.references, "#{BioInterchange::GOXRef.send(namespace).to_s}#{accession}")
|
273
390
|
}
|
274
391
|
elsif tag == 'Parent' then
|
275
392
|
list.each { |parent_id|
|
276
|
-
create_triple(feature_uri, @base.
|
393
|
+
create_triple(feature_uri, @base.has_source, RDF::URI.new("#{set_uri.to_s}/feature/#{parent_id}"))
|
277
394
|
}
|
278
395
|
elsif tag == 'Reference_seq' then
|
279
396
|
list.each { |value|
|
280
397
|
reference_uri = RDF::URI.new("#{feature_uri.to_s}/reference/#{value}")
|
281
|
-
create_triple(feature_uri, @base.
|
282
|
-
create_triple(reference_uri, RDF.type, @base.
|
283
|
-
create_triple(reference_uri, @base.
|
398
|
+
create_triple(feature_uri, @base.has_attribute, reference_uri)
|
399
|
+
create_triple(reference_uri, RDF.type, @base.ReferenceSequence)
|
400
|
+
create_triple(reference_uri, @base.has_value, value)
|
284
401
|
}
|
285
402
|
elsif tag == 'Target' then
|
403
|
+
# GFF3 spec is unclear on this point, but I assume that a target ID
|
404
|
+
# is referencing a feature ID within the same file.
|
286
405
|
target_id, start_coordinate, end_coordinate, strand = list.join(',').split(/\s+/, 4)
|
287
406
|
target_uri = RDF::URI.new("#{feature_uri.to_s}/target/#{target_id}")
|
288
|
-
create_triple(
|
289
|
-
create_triple(target_uri,
|
290
|
-
create_triple(target_uri, @base.
|
407
|
+
create_triple(target_uri, RDF.type, @base.SequenceAlignment)
|
408
|
+
create_triple(target_uri, @base.has_source, feature_uri)
|
409
|
+
create_triple(target_uri, @base.has_input, target_id)
|
291
410
|
region_uri = RDF::URI.new("#{target_uri.to_s}/region")
|
292
411
|
start_position_uri = RDF::URI.new("#{region_uri.to_s}/start")
|
293
412
|
end_position_uri = RDF::URI.new("#{region_uri.to_s}/end")
|
294
|
-
create_triple(target_uri, @base.
|
295
|
-
create_triple(region_uri,
|
296
|
-
create_triple(region_uri,
|
413
|
+
create_triple(target_uri, @base.has_attribute, region_uri)
|
414
|
+
create_triple(region_uri, RDF.type, BioInterchange::FALDO.Region)
|
415
|
+
create_triple(region_uri, BioInterchange::FALDO.begin, start_position_uri)
|
416
|
+
create_triple(region_uri, BioInterchange::FALDO.end, end_position_uri)
|
297
417
|
if strand == '+' then
|
298
418
|
create_triple(start_position_uri, RDF.type, BioInterchange::FALDO.Positive_strand)
|
299
419
|
create_triple(end_position_uri, RDF.type, BioInterchange::FALDO.Positive_strand)
|
@@ -312,13 +432,20 @@ protected
|
|
312
432
|
create_triple(end_position_uri, BioInterchange::FALDO.position, end_coordinate)
|
313
433
|
end
|
314
434
|
|
315
|
-
# Describe a possible alignment between the feature and target:
|
435
|
+
# Describe a possible alignment with gaps between the feature and target:
|
316
436
|
if attributes.has_key?('Gap') then
|
317
437
|
attributes['Gap'].each_index { |gap_no|
|
318
438
|
cigar_line = attributes['Gap'][gap_no].split(/\s+/)
|
319
439
|
cigar_line.each_index { |alignment_no|
|
320
440
|
alignment_uri = RDF::URI.new("#{feature_uri.to_s}/alignment/#{gap_no}/#{alignment_no}")
|
321
|
-
|
441
|
+
if alignment_no == 0 then
|
442
|
+
create_triple(target_uri, @base.has_first_part, alignment_uri)
|
443
|
+
else
|
444
|
+
create_triple(target_uri, @base.has_ordered_part, alignment_uri)
|
445
|
+
end
|
446
|
+
if alignment_no == cigar_line.length then
|
447
|
+
create_triple(target_uri, @base.has_last_part, alignment_uri)
|
448
|
+
end
|
322
449
|
operation = cigar_line[alignment_no].gsub(/[^MIDFR]/, '')
|
323
450
|
operation = nil unless operation.length == 1
|
324
451
|
span = cigar_line[alignment_no].gsub(/[^0-9]/, '')
|
@@ -326,24 +453,25 @@ protected
|
|
326
453
|
if operation == 'M' then
|
327
454
|
create_triple(alignment_uri, RDF.type, @base.Match)
|
328
455
|
elsif operation == 'I' then
|
329
|
-
create_triple(alignment_uri, RDF.type, @base.
|
456
|
+
create_triple(alignment_uri, RDF.type, @base.ReferenceSequenceGap)
|
330
457
|
elsif operation == 'D' then
|
331
|
-
create_triple(alignment_uri, RDF.type, @base.
|
458
|
+
create_triple(alignment_uri, RDF.type, @base.TargetSequenceGap)
|
332
459
|
elsif operation == 'F' then
|
333
|
-
create_triple(alignment_uri, RDF.type, @base.
|
460
|
+
create_triple(alignment_uri, RDF.type, @base.ForwardReferenceSequenceFrameshift)
|
334
461
|
elsif operation == 'R' then
|
335
|
-
create_triple(alignment_uri, RDF.type, @base.
|
462
|
+
create_triple(alignment_uri, RDF.type, @base.ReverseReferenceSequenceFrameshift)
|
336
463
|
else
|
337
464
|
# Fallback: operation is outside of the specification
|
338
|
-
create_triple(alignment_uri, RDF.type, @base.
|
339
|
-
create_triple(alignment_uri,
|
465
|
+
create_triple(alignment_uri, RDF.type, @base.SequenceAlignmentOperation)
|
466
|
+
create_triple(alignment_uri, BioInterchange::RDFS.comment, "Alignment operation: #{operation}") if operation and not operation.empty?
|
340
467
|
end
|
341
|
-
create_triple(alignment_uri, @base.span, span.to_i) if span
|
342
|
-
create_triple(alignment_uri, RDF.first, alignment_uri)
|
343
468
|
if alignment_no + 1 < cigar_line.length then
|
344
|
-
create_triple(alignment_uri,
|
345
|
-
|
346
|
-
|
469
|
+
create_triple(alignment_uri, @base.is_before, RDF::URI.new("#{feature_uri.to_s}/alignment/#{gap_no}/#{alignment_no + 1}"))
|
470
|
+
end
|
471
|
+
if span then
|
472
|
+
create_triple(alignment_uri, @base.has_attribute, RDF::URI.new("#{alignment_uri}/span"))
|
473
|
+
create_triple("#{alignment_uri}/span", RDF.type, @base.Span)
|
474
|
+
create_triple("#{alignment_uri}/span", @base.has_value, span.to_i)
|
347
475
|
end
|
348
476
|
}
|
349
477
|
}
|
@@ -352,6 +480,29 @@ protected
|
|
352
480
|
serialize_variant_effects(set_uri, feature_uri, list)
|
353
481
|
elsif tag == 'Variant_seq' then
|
354
482
|
serialize_variant_seqs(set_uri, feature_uri, list)
|
483
|
+
# VCF related attributes:
|
484
|
+
elsif tag == ' alternative_alleles' then
|
485
|
+
# TODO
|
486
|
+
elsif tag == ' filters' then
|
487
|
+
# Example: "Qual;MinAB;MinDP" -- comes here as split list (split by ";")
|
488
|
+
list.each { |id|
|
489
|
+
create_triple(feature_uri, @base.is_refuted_by, RDF::URI.new("#{set_uri}/filter/#{id}"))
|
490
|
+
}
|
491
|
+
elsif tag == ' samples' then
|
492
|
+
list.each_index { |sample|
|
493
|
+
list[sample].each_pair { |key, values|
|
494
|
+
serialize_vcf_sample(feature_uri, sample, key, values, attributes)
|
495
|
+
}
|
496
|
+
}
|
497
|
+
# Everything else:
|
498
|
+
elsif list == true then
|
499
|
+
# Attribute is a flag. Tag itself carries meaning and has no value associated with it.
|
500
|
+
attribute_uri = RDF::URI.new("#{feature_uri.to_s}/attribute/#{tag}")
|
501
|
+
create_triple(feature_uri, @base.has_attribute, attribute_uri)
|
502
|
+
create_triple(attribute_uri, RDF.type, @base.InformationContentEntity)
|
503
|
+
create_triple(attribute_uri, @base.has_attribute, RDF::URI.new("#{attribute_uri}/tag/#{tag}"))
|
504
|
+
create_triple(RDF::URI.new("#{attribute_uri}/tag/#{tag}"), RDF.type, @base.Label)
|
505
|
+
create_triple(RDF::URI.new("#{attribute_uri}/tag/#{tag}"), @base.has_value, tag)
|
355
506
|
else
|
356
507
|
# TODO Report unknown upper case letters here? That would be a spec. validation...
|
357
508
|
# Well, or it would show that this implementation is incomplete. Could be either.
|
@@ -359,71 +510,299 @@ protected
|
|
359
510
|
value = list[index]
|
360
511
|
attribute_uri = RDF::URI.new("#{feature_uri.to_s}/attribute/#{tag}") if list.size == 1
|
361
512
|
attribute_uri = RDF::URI.new("#{feature_uri.to_s}/attribute/#{tag}-#{index + 1}") unless list.size == 1
|
362
|
-
create_triple(feature_uri, @base.
|
363
|
-
create_triple(attribute_uri, RDF.type, @base.
|
364
|
-
|
365
|
-
create_triple(attribute_uri,
|
513
|
+
create_triple(feature_uri, @base.has_attribute, attribute_uri)
|
514
|
+
create_triple(attribute_uri, RDF.type, @base.InformationContentEntity)
|
515
|
+
# TODO Figure out why the following line was there. Seems wrong.
|
516
|
+
#create_triple(attribute_uri, @base.has_attribute, RDF::URI.new("#{attribute_uri}/tag/#{tag}"))
|
517
|
+
create_triple(attribute_uri, @base.has_value, value)
|
518
|
+
create_triple(RDF::URI.new("#{attribute_uri}/tag/#{tag}"), RDF.type, @base.Label)
|
519
|
+
create_triple(RDF::URI.new("#{attribute_uri}/tag/#{tag}"), @base.has_value, tag)
|
366
520
|
}
|
367
521
|
end
|
368
522
|
}
|
369
523
|
end
|
370
524
|
|
525
|
+
# Serializes VCF sample data (VCF columns 9 and above).
|
526
|
+
#
|
527
|
+
# See also genotype serialization of non-VCF data in `serialize_variant_seqs`.
|
528
|
+
#
|
529
|
+
# +feature_uri+:: URI of the feature that the sample data relates to
|
530
|
+
# +sample+:: number of the sample that is being addressed (sample column number)
|
531
|
+
# +key+:: key of the described sample values
|
532
|
+
# +values+:: values of the sample (possible composite type, e.g. comma separated list)
|
533
|
+
# +attributes+:: a map of tag/value pairs associated with the feature
|
534
|
+
def serialize_vcf_sample(feature_uri, sample, key, values, attributes)
  # Dispatches on the VCF sample key ("FORMAT" column identifier). The raw
  # +values+ string is split according to the key and every resulting value is
  # handed to serialize_vcf_sample_attribute.
  generic_type = @base.InformationContentEntity
  case key
  when 'DP'
    # Depth across samples. An integer.
    depths = values.split(',').map { |depth| depth.to_i }
    serialize_vcf_sample_values(feature_uri, sample, key, depths, true, @base.Number_ofReads, RDF::XSD.integer)
  when 'GT'
    # Genotype.
    serialize_vcf_sample_genotype(feature_uri, sample, key, values, attributes)
  when 'FT'
    # Filter: passed does nothing; applied filter uses isRefutedBy.
    # TODO How to code using GFVO?
    serialize_vcf_sample_values(feature_uri, sample, key, values.split(';'), true, generic_type)
  when 'GL'
    # Genotype likelihoods, serialized as an ordered list without labels.
    # The list node is wrapped in RDF::URI (as done for 'GT'), because it is
    # used in object position by serialize_attribute_with_label.
    list_uri = RDF::URI.new("#{feature_uri}/attribute/#{key}")
    serialize_attribute_with_label(feature_uri, list_uri, @base.Score, key)
    value_uris = serialize_vcf_sample_values(feature_uri, sample, key, values.split(','), false, generic_type)
    serialize_list_array(list_uri, value_uris)
  when 'GLE'
    # Genotype likelihoods of heterogenous ploidy.
    # Example: 0:-75.22,1:-223.42,0/0:-323.03,1/0:-99.29,1/1:-802.53
    # NOTE(review): only the genotype (as an array of alleles) is serialized;
    # the likelihood part is parsed but not written -- confirm intent.
    genotypes = values.split(',').map { |entry|
      genotype, _likelihood = entry.split(':')
      # Accept both unphased ("/") and phased ("|") separators, as for 'GT'.
      genotype.split(/\/|\|/).map { |allele| vcf_allele(allele.to_i, attributes) }
    }
    serialize_vcf_sample_values(feature_uri, sample, key, genotypes, true, generic_type)
  when 'PL', 'GP', 'GQ', 'HQ', 'PS', 'PQ', 'EC', 'MQ'
    # PL: Phred scaled genotype likelihoods.
    # GP: Phred scaled genotype posterior probabilities.
    # GQ: Conditional genotype quality.
    # HQ: Haplotype qualities -- presumably Phred scaled.
    # PS: Phase set. It's complicated. See the VCF specification for details.
    # PQ: Phasing quality in Phred scale.
    # EC: Expected alternate allele counts.
    # MQ: RMS mapping quality. An integer.
    serialize_vcf_sample_values(feature_uri, sample, key, values.split(','), true, generic_type)
  else
    # Unknown keys. Should that be possible at all?
    serialize_vcf_sample_attribute(feature_uri, sample, key, true, values, 0, false, generic_type)
  end
end

# Serializes every entry of an already split sample value list and returns
# the URIs that serialize_vcf_sample_attribute handed back (in list order).
#
# +feature_uri+:: URI of the feature that the sample data relates to
# +sample+:: number of the sample that is being addressed
# +key+:: key of the described sample values
# +values+:: array of values to serialize
# +has_label+:: if true, each value is labelled with the key
# +attribute_type+:: type of the entity that represents each value
# +value_type+:: optional datatype of the values themselves
def serialize_vcf_sample_values(feature_uri, sample, key, values, has_label, attribute_type, value_type = nil)
  multivalue = values.size > 1
  values.each_with_index.map { |value, index|
    serialize_vcf_sample_attribute(feature_uri, sample, key, has_label, value, index, multivalue, attribute_type, value_type)
  }
end

# Serializes a 'GT' (genotype) sample value: the genotype node with label,
# an optional gametic phase marker, zygosity, and the ordered allele list.
#
# Parameters are the same as for serialize_vcf_sample.
def serialize_vcf_sample_genotype(feature_uri, sample, key, values, attributes)
  list_uri = RDF::URI.new("#{feature_uri}/attribute/#{key}")
  serialize_attribute_with_label(feature_uri, list_uri, @base.Genotype, key)
  # NOTE(review): list_uri does not contain the sample number, so genotypes of
  # different samples of one feature share this node -- confirm that is intended.

  # A genotype without the unphased separator "/" is treated as phased.
  unless values.include?('/')
    # The phase node appears in object position, so it has to be an RDF::URI.
    phase_uri = RDF::URI.new("#{list_uri}/phase")
    create_triple(list_uri, @base.has_attribute, phase_uri)
    create_triple(phase_uri, RDF.type, @base.GameticPhase)
  end

  alleles = values.split(/\/|\|/)
  # Zygosity is only stated for diploid genotypes whose alleles are both called
  # single-digit indexes. Every allele is checked (not just the first distinct
  # length), and uncalled alleles (".") never count as the reference allele.
  # NOTE(review): the original comment speaks of "single bases", but the length
  # test is applied to the allele index strings -- confirm intent.
  if alleles.length == 2 && alleles.all? { |allele| allele =~ /\A\d\z/ }
    zygosity = alleles.uniq.length == 1 ? @base.Homozygous : @base.Heterozygous
    create_triple(list_uri, @base.has_quality, zygosity)
  end

  multivalue = alleles.size > 1
  value_uris = alleles.each_with_index.map { |allele, index|
    # NOTE(review): "." converts to 0 here and is therefore still serialized
    # as the reference allele -- confirm how uncalled alleles should be encoded.
    allele_index = allele.to_i
    sequence_type = allele_index == 0 ? @base.ReferenceSequence : @base.SequenceVariant
    serialize_vcf_sample_attribute(feature_uri, sample, key, true, vcf_allele(allele_index, attributes), index, multivalue, sequence_type)
  }
  serialize_list_array(list_uri, value_uris)
end
|
646
|
+
|
647
|
+
# Returns the allele based on VCF's genotype indexing specification.
|
648
|
+
# Reference allele is index zero, alternatives alleles are designated by one or above.
|
649
|
+
#
|
650
|
+
# +genotype_index+:: VCF genotype index
|
651
|
+
# +attributes+:: feature attribute hash that contains reference/alternative allele bases
|
652
|
+
def vcf_allele(genotype_index, attributes)
  # Index zero designates the reference allele (first entry of the
  # ' reference_bases' attribute).
  return attributes[' reference_bases'][0] if genotype_index == 0
  # Ruby arrays wrap negative indexes around to the end of the array, which
  # would silently return an unrelated alternative allele; report "no allele".
  return nil if genotype_index < 0
  # Alternative alleles are one-based; nil is returned when the index points
  # beyond the listed alternatives.
  attributes[' alternative_alleles'][genotype_index - 1]
end
|
659
|
+
|
660
|
+
# Serializes an ordered list; the list's URIs are given as an array.
|
661
|
+
#
|
662
|
+
# +list_uri+:: URI of the list object (ordered list items will be part of this instance via "has_first_part", "has_ordered_part", "has_last_part")
|
663
|
+
# +uris+:: URIs of the list to be serialized
|
664
|
+
def serialize_list_array(list_uri, uris)
  # Walk the URIs in order and delegate each position to serialize_list,
  # passing along the URI that follows it (nil for the final item).
  uris.each_with_index { |current_uri, position|
    successor_uri = uris[position + 1]
    serialize_list(uris, position, current_uri, successor_uri, list_uri)
  }
end
|
671
|
+
|
672
|
+
# Create an ordered list of things; this method serializes one item only and
|
673
|
+
# repeated calls to this method create the list.
|
674
|
+
#
|
675
|
+
# +values+:: array of the things that appear in the ordered list
|
676
|
+
# +index+:: index of the thing that is serialized by this method call
|
677
|
+
# +value_uri+:: URI that represents the current value that is being linked to
|
678
|
+
# +next_value_uri+:: URI of the next serialized value (ignored, if last index)
|
679
|
+
# +list_uri+:: URI of the list that contains the items
|
680
|
+
def serialize_list(values, index, value_uri, next_value_uri, list_uri)
  is_first = index == 0
  is_last = index + 1 == values.length
  # The first and last markers are independent of each other, so that the only
  # item of a one-element list is recorded as both first and last part.
  create_triple(list_uri, @base.has_first_part, value_uri) if is_first
  create_triple(list_uri, @base.has_last_part, value_uri) if is_last
  # Everything strictly between the two ends is an ordinary ordered part.
  create_triple(list_uri, @base.has_ordered_part, value_uri) unless is_first || is_last
  # Chain the item to its successor; the last item has none.
  if index + 1 < values.length then
    create_triple(value_uri, @base.is_before, next_value_uri)
  end
end
|
692
|
+
|
693
|
+
# Serializes basic information for an object (a feature's attribute) with label.
|
694
|
+
#
|
695
|
+
# Links the object to a feature, sets the objects type, assigns it a label.
|
696
|
+
#
|
697
|
+
# +feature_uri+:: URI of the feature that has the object as an attribute
|
698
|
+
# +object_uri+:: URI that represents the object
|
699
|
+
# +object_type+:: type of the object
|
700
|
+
# +label+:: label text to use
|
701
|
+
def serialize_attribute_with_label(feature_uri, object_uri, object_type, label)
  # The label is modelled as its own node below the object: "<object>/label".
  label_node = RDF::URI.new("#{object_uri}/label")
  [
    [ feature_uri, @base.has_attribute, object_uri ],  # feature -> object
    [ object_uri, RDF.type, object_type ],             # object's type
    [ object_uri, @base.has_attribute, label_node ],   # object -> label node
    [ label_node, RDF.type, @base.Label ]              # label node's type
  ].each { |subject, predicate, object| create_triple(subject, predicate, object) }
  # Finally, the label text itself.
  create_triple(label_node, @base.has_value, label)
end
|
709
|
+
|
710
|
+
# Serializes VCF meta-data (pragma equivalent) key/value pairs. Used by serialize_pragma.
|
711
|
+
#
|
712
|
+
# Returns URI of the serialized meta-data.
|
713
|
+
#
|
714
|
+
# +set_uri+:: URI of the set that the meta-data belongs to
|
715
|
+
# +uri_suffix+:: suffix that is appended to set_uri, which uniquely defines the meta-data (within the set_uri)
|
716
|
+
# +meta_type+:: type of the meta-data
|
717
|
+
# +key_type_mappings+:: mappings of keys to known types, everything else is considered an Object
|
718
|
+
# +attributes+:: key/value pairs that are the actual meta-data being described
|
719
|
+
def serialize_vcf_pragma(set_uri, uri_suffix, meta_type, key_type_mappings, attributes)
  # The meta-data node is addressed as "<set URI>/<suffix>" and typed first.
  pragma_uri = RDF::URI.new("#{set_uri}/#{uri_suffix}")
  create_triple(pragma_uri, RDF.type, meta_type)

  attributes.each_pair do |key, value|
    # Each key becomes its own node below the meta-data node.
    attribute_uri = RDF::URI.new("#{pragma_uri}/#{key}")
    create_triple(pragma_uri, @base.has_attribute, attribute_uri)
    # Keys with a known mapping get that type; everything else is an Object.
    # TODO Check if type is integer/double here, then convert value accordingly.
    attribute_type = key_type_mappings.has_key?(key) ? key_type_mappings[key] : @base.Object
    create_triple(attribute_uri, RDF.type, attribute_type)
    create_triple(attribute_uri, @base.has_value, value)
  end

  pragma_uri
end
|
735
|
+
|
736
|
+
# Serializes a VCF sample attribute/value pair. Used by serialize_vcf_sample.
|
737
|
+
#
|
738
|
+
# Returns URI of the serialized attribute/value pair.
|
739
|
+
#
|
740
|
+
# +base_uri+:: URI of the "thing" that the sample data relates to
|
741
|
+
# +sample+:: number of the sample that is being addressed (sample column number)
|
742
|
+
# +key+:: key of the described sample values
|
743
|
+
# +has_label+:: if true, then serialize label (value taken from key)
|
744
|
+
# +value+:: value that is associated with the key/sample
|
745
|
+
# +index+:: index of the value (in case value is part of an array of size > 1)
|
746
|
+
# +multivalue+:: true if this value is taken from an array of values of size > 1
|
747
|
+
# +attribute_type+:: type of the attribute entity that represents the value
|
748
|
+
# +value_type+:: type of the actual value
|
749
|
+
def serialize_vcf_sample_attribute(base_uri, sample, key, has_label, value, index, multivalue, attribute_type, value_type = nil)
  # Values taken from a multi-valued list get a one-based "-<n>" suffix so
  # that each of them is addressed by its own URI.
  suffix = multivalue ? "-#{index + 1}" : ''
  value_uri = RDF::URI.new("#{base_uri}/sample/#{sample}/#{key}#{suffix}")

  create_triple(base_uri, @base.has_attribute, value_uri)
  create_triple(value_uri, RDF.type, attribute_type)
  create_triple(value_uri, @base.has_value, value, value_type)

  # Without a label there is nothing more to write.
  return value_uri unless has_label

  # The label node carries the key as its value.
  label_uri = RDF::URI.new("#{value_uri}/label")
  create_triple(value_uri, @base.has_attribute, label_uri)
  create_triple(label_uri, RDF.type, @base.Label)
  create_triple(label_uri, @base.has_value, key)
  value_uri
end
|
763
|
+
|
371
764
|
# Serializes a structured attribute (given as a pragma statement), which later
|
372
765
|
# can be referred to from feature instances.
|
373
766
|
#
|
374
767
|
# +set_uri+:: the feature set URI to which the structured attribute belongs to
|
375
768
|
# +pragma+:: a map that encapsulates the structured attribute data
|
376
769
|
def serialize_structured_attribute(set_uri, pragma)
|
770
|
+
# TODO Triple from set_uri to attribute_uri missing; should be isParticipantIn
|
377
771
|
attribute_uri = RDF::URI.new("#{set_uri.to_s}/structured_attribute/#{pragma.object_id}")
|
378
772
|
attributes = nil
|
379
773
|
class_type = nil
|
380
774
|
if pragma.has_key?('attribute-method') then
|
381
775
|
attributes = pragma['attribute-method'][0]
|
382
|
-
class_type = @base.
|
776
|
+
class_type = @base.ExperimentalMethod
|
383
777
|
elsif pragma.has_key?('data-source') then
|
384
778
|
attributes = pragma['data-source'][0]
|
385
|
-
class_type = @base.
|
779
|
+
class_type = @base.GenomicAscertainingMethod
|
386
780
|
elsif pragma.has_key?('score-method') then
|
387
781
|
attributes = pragma['score-method'][0]
|
388
|
-
class_type = @base.
|
782
|
+
class_type = @base.ExperimentalMethod
|
389
783
|
elsif pragma.has_key?('source-method') then
|
390
784
|
attributes = pragma['source-method'][0]
|
391
|
-
class_type = @base.
|
785
|
+
class_type = @base.ExperimentalMethod
|
392
786
|
elsif pragma.has_key?('technology-platform') then
|
393
787
|
attributes = pragma['technology-platform'][0]
|
394
|
-
class_type = @base.
|
788
|
+
class_type = @base.SequencingTechnologyPlatform
|
395
789
|
else
|
396
790
|
# TODO Error.
|
397
791
|
end
|
398
|
-
if class_type == @base.
|
792
|
+
if class_type == @base.GenomicAscertainingMethod and attributes.has_key?('Data_type') then
|
399
793
|
attributes['Data_type'] = attributes['Data_type'][0] # TODO Make sure array is of length 1.
|
400
794
|
if attributes['Data_type'] == 'Array_CGH' then
|
401
795
|
class_type = @base.ArrayComparativeGenomicHybridization
|
402
796
|
elsif attributes['Data_type'] == 'DNA_microarray' then
|
403
797
|
class_type = @base.DNAMicroarray
|
404
798
|
elsif attributes['Data_type'] == 'DNA_sequence' then
|
405
|
-
class_type = @base.
|
799
|
+
class_type = @base.DNASequencing
|
406
800
|
elsif attributes['Data_type'] == 'RNA_sequence' then
|
407
|
-
class_type = @base.
|
801
|
+
class_type = @base.RNASequencing
|
408
802
|
else
|
409
803
|
# TODO Error.
|
410
804
|
end
|
411
|
-
elsif class_type == @base.
|
412
|
-
if attributes.has_key?('Average_coverage') then
|
413
|
-
create_triple(attribute_uri, @base.averageCoverage, attributes['Average_coverage'][0].to_i)
|
414
|
-
end
|
415
|
-
if attributes.has_key?('Platform_class') then
|
416
|
-
create_triple(attribute_uri, @base.platformClass, attributes['Platform_class'][0])
|
417
|
-
end
|
418
|
-
if attributes.has_key?('Platform_name') then
|
419
|
-
create_triple(attribute_uri, @base.platformName, attributes['Platform_name'][0])
|
420
|
-
end
|
421
|
-
if attributes.has_key?('Read_length') then
|
422
|
-
create_triple(attribute_uri, @base.readLength, attributes['Read_length'][0].to_i)
|
423
|
-
end
|
424
|
-
if attributes.has_key?('Read_pair_span') then
|
425
|
-
create_triple(attribute_uri, @base.readPairSpan, attributes['Read_pair_span'][0].to_i)
|
426
|
-
end
|
805
|
+
elsif class_type == @base.SequencingTechnologyPlatform then
|
427
806
|
if attributes.has_key?('Read_type') then
|
428
807
|
attributes['Read_type'] = attributes['Read_type'][0] # TODO Make sure array is of length 1.
|
429
808
|
if attributes['Read_type'] == 'fragment' then
|
@@ -436,17 +815,54 @@ protected
|
|
436
815
|
end
|
437
816
|
end
|
438
817
|
create_triple(attribute_uri, RDF.type, class_type)
|
818
|
+
if class_type == @base.SequencingTechnologyPlatform or
|
819
|
+
class_type == @base.FragmentReadPlatform or
|
820
|
+
class_type == @base.PairedEndReadPlatform then
|
821
|
+
if attributes.has_key?('Average_coverage') then
|
822
|
+
coverage_uri = RDF::URI.new("#{attribute_uri}/averagecoverage")
|
823
|
+
create_triple(attribute_uri, @base.has_attribute, coverage_uri)
|
824
|
+
create_triple(coverage_uri, RDF.type, @base.AverageCoverage)
|
825
|
+
create_triple(coverage_uri, @base.has_value, attributes['Average_coverage'][0].to_i)
|
826
|
+
end
|
827
|
+
if attributes.has_key?('Platform_class') then
|
828
|
+
create_triple(attribute_uri, @base.has_attribute, RDF::URI.new("#{attribute_uri}/platformclass"))
|
829
|
+
#create_triple(attribute_uri, @base.platformClass, attributes['Platform_class'][0])
|
830
|
+
end
|
831
|
+
if attributes.has_key?('Platform_name') then
|
832
|
+
#create_triple(attribute_uri, @base.platformName, attributes['Platform_name'][0])
|
833
|
+
end
|
834
|
+
if attributes.has_key?('Read_length') then
|
835
|
+
#create_triple(attribute_uri, @base.readLength, attributes['Read_length'][0].to_i)
|
836
|
+
end
|
837
|
+
if attributes.has_key?('Read_pair_span') then
|
838
|
+
#create_triple(attribute_uri, @base.readPairSpan, attributes['Read_pair_span'][0].to_i)
|
839
|
+
end
|
840
|
+
end
|
439
841
|
attributes.keys.each { |tag|
|
440
842
|
if tag.match(/^[a-z]/) then
|
843
|
+
tag.strip!
|
441
844
|
custom_attribute_uri = RDF::URI.new("#{attribute_uri.to_s}/attribute/#{tag}")
|
442
|
-
create_triple(
|
443
|
-
create_triple(custom_attribute_uri, @base.
|
845
|
+
create_triple(attribute_uri, @base.has_attribute, custom_attribute_uri)
|
846
|
+
create_triple(custom_attribute_uri, RDF.type, @base.InformationContentEntity)
|
847
|
+
create_triple(custom_attribute_uri, @base.has_attribute, RDF::URI.new("#{attribute_uri.to_s}/attribute/#{tag}/name"))
|
444
848
|
attributes[tag].each { |value|
|
445
|
-
create_triple(custom_attribute_uri,
|
849
|
+
create_triple(custom_attribute_uri, @base.has_value, value)
|
446
850
|
}
|
447
|
-
create_triple(attribute_uri, @base.
|
851
|
+
create_triple(RDF::URI.new("#{attribute_uri.to_s}/attribute/#{tag}/name"), RDF.type, @base.Name)
|
852
|
+
create_triple(RDF::URI.new("#{attribute_uri.to_s}/attribute/#{tag}/name"), @base.has_value, tag)
|
448
853
|
else
|
449
854
|
# TODO
|
855
|
+
match_constraints = {}
|
856
|
+
attributes[tag].each { |value|
|
857
|
+
if tag == 'Seqid' or tag == 'Type' or tag == 'Source' then
|
858
|
+
match_constraints[tag] = value.split(',')
|
859
|
+
else
|
860
|
+
# Not a recognized match. Might be a Dbxref or Comment.
|
861
|
+
end
|
862
|
+
}
|
863
|
+
unless match_constraints.keys.empty? then
|
864
|
+
@matches
|
865
|
+
end
|
450
866
|
end
|
451
867
|
}
|
452
868
|
end
|
@@ -462,35 +878,46 @@ protected
|
|
462
878
|
sequence_variant, variant_index, feature_type, feature_ids = effect.split(' ', 4)
|
463
879
|
feature_ids = feature_ids.split(' ')
|
464
880
|
effect_uri = RDF::URI.new("#{feature_uri.to_s}/variant/#{variant_index}/effect/#{index}")
|
465
|
-
|
466
|
-
|
467
|
-
|
468
|
-
create_triple(effect_uri,
|
881
|
+
# TODO
|
882
|
+
#serialize_variant_triple(feature_uri, RDF::URI.new("#{feature_uri.to_s}/variant/#{variant_index}"), @base.effect, effect_uri)
|
883
|
+
# Type is an SO sequence_variant term or one of its descendants:
|
884
|
+
create_triple(effect_uri, RDF.type, BioInterchange::SO.send(BioInterchange.make_safe_label(sequence_variant)))
|
885
|
+
# The feature type should be already apparent from the targeted feature. Do no sanity
|
886
|
+
# check here (if they match) and just skip over it.
|
887
|
+
# create_triple(effect_uri, @base.feature_type, BioInterchange::SO.send(BioInterchange.make_safe_label(feature_type)))
|
469
888
|
feature_ids.each { |feature_id|
|
470
|
-
create_triple(
|
889
|
+
create_triple(feature_id, @base.is_affected_by, effect_uri)
|
471
890
|
}
|
472
891
|
}
|
473
892
|
end
|
474
893
|
|
475
894
|
# Serializes a list of variant sequences.
|
476
895
|
#
|
896
|
+
# See also VCF genotype serialization ('GT' attribute) in `serialize_vcf_sample`.
|
897
|
+
#
|
477
898
|
# +set_uri+:: the URI of the feature set to which the feature belongs
|
478
899
|
# +feature_uri+:: the feature URI to the feature that is annotated with variant data
|
479
900
|
# +list+:: list of variant values
|
480
901
|
def serialize_variant_seqs(set_uri, feature_uri, list)
|
902
|
+
variant_uri = nil
|
903
|
+
|
481
904
|
list.each_index { |index|
|
482
905
|
value = list[index]
|
483
906
|
variant_uri = RDF::URI.new("#{feature_uri.to_s}/variant/#{index}")
|
484
|
-
|
907
|
+
sequence_uri = RDF::URI.new("#{variant_uri}/sequence")
|
908
|
+
serialize_variant_triple(feature_uri, variant_uri, @base.has_attribute, sequence_uri)
|
909
|
+
create_triple(sequence_uri, @base.has_value, value)
|
485
910
|
}
|
486
911
|
|
487
|
-
# Return the variant type based on the present sequence(s):
|
488
|
-
return @base.Variant if list.length != 2
|
489
912
|
if list[0].match(/a-zA-Z/) and list[1].match(/a-zA-Z/) then
|
490
|
-
|
491
|
-
|
913
|
+
if list[0] == list[1] then
|
914
|
+
create_triple(variant_uri, @base.has_quality, @base.Homozygous)
|
915
|
+
else
|
916
|
+
create_triple(variant_uri, @base.has_quality, @base.Heterozygous)
|
917
|
+
end
|
492
918
|
end
|
493
|
-
|
919
|
+
|
920
|
+
return @base.SequenceVariant
|
494
921
|
end
|
495
922
|
|
496
923
|
# Adds a variant to the graph; tracks the variant's URI so that RDF.type is only written out once.
|
@@ -501,8 +928,8 @@ protected
|
|
501
928
|
# +object+:: data to be serialized
|
502
929
|
def serialize_variant_triple(feature_uri, variant_uri, predicate, object)
|
503
930
|
unless @variants.has_key?(variant_uri.to_s) then
|
504
|
-
create_triple(feature_uri, @base.
|
505
|
-
create_triple(variant_uri, RDF.type, @base.
|
931
|
+
create_triple(feature_uri, @base.is_affected_by, variant_uri)
|
932
|
+
create_triple(variant_uri, RDF.type, @base.VariantCalling)
|
506
933
|
end
|
507
934
|
@variants[variant_uri.to_s] = true
|
508
935
|
create_triple(variant_uri, predicate, object)
|
@@ -515,12 +942,39 @@ protected
|
|
515
942
|
def serialize_feature_sequence(set_uri, feature_sequence)
|
516
943
|
feature_uri = RDF::URI.new("#{set_uri.to_s}/feature/#{feature_sequence.feature_id}")
|
517
944
|
annotation_uri = RDF::URI.new("#{feature_uri.to_s}/sequence")
|
518
|
-
create_triple(feature_uri, @base.
|
519
|
-
create_triple(annotation_uri, RDF.type, @base.
|
520
|
-
create_triple(annotation_uri,
|
521
|
-
create_triple(annotation_uri, @base.
|
945
|
+
create_triple(feature_uri, @base.has_attribute, annotation_uri)
|
946
|
+
create_triple(annotation_uri, RDF.type, @base.Sequence)
|
947
|
+
create_triple(annotation_uri, BioInterchange::RDFS.comment, feature_sequence.comment) if feature_sequence.comment
|
948
|
+
create_triple(annotation_uri, @base.has_value, feature_sequence.sequence)
|
522
949
|
end
|
523
950
|
|
951
|
+
# Serializes an external database reference.
|
952
|
+
#
|
953
|
+
# +feature_uri+:: URI of the feature that the external database reference refers to
|
954
|
+
# +dbxref_composite+:: composite term of the external database reference (e.g. "dbSNP_127:rs123456")
|
955
|
+
def serialize_dbxref(feature_uri, dbxref_composite)
|
956
|
+
abbreviation, accession = dbxref_composite.split(':', 2)
|
957
|
+
dbxref_uri = RDF::URI.new("#{feature_uri.to_s}/dbxref/#{BioInterchange.make_safe_label(abbreviation)}")
|
958
|
+
create_triple(feature_uri, @base.references, dbxref_uri)
|
959
|
+
|
960
|
+
create_triple(dbxref_uri, RDF.type, @base.ExternalReference)
|
961
|
+
create_triple(dbxref_uri, @base.refers_to, BioInterchange::LifeScienceRegistry.send(dbxref_composite.split('_', 2)[0].downcase).sub('$id', accession))
|
962
|
+
if dbxref_composite.match(/^.+_.+:.+$/) then
|
963
|
+
# Entry with version information.
|
964
|
+
version_uri = RDF::URI.new("#{dbxref_uri}/version")
|
965
|
+
create_triple(dbxref_uri, @base.has_identifier, version_uri)
|
966
|
+
create_triple(version_uri, @base.has_value, abbreviation[6..-1])
|
967
|
+
end
|
968
|
+
|
969
|
+
#if dbxref_composite.match(/^dbSNP(_\d+)?:rs\d+$/) then
|
970
|
+
# # linkout = "http://www.ncbi.nlm.nih.gov/projects/SNP/snp_ref.cgi?rs=#{dbxref_composite.split(/:/)[1].sub(/^rs/, '')}"
|
971
|
+
#elsif dbxref_composite.match(/^COSMIC(_\d+)?:COSM\d+$/) then
|
972
|
+
# linkout = "http://cancer.sanger.ac.uk/cosmic/mutation/overview?id=#{accession.sub(/^COSM/, '')}"
|
973
|
+
#else
|
974
|
+
# BioInterchange::GOXRef.send(BioInterchange.make_safe_label(abbreviation)).to_s + accession
|
975
|
+
#end
|
976
|
+
end
|
977
|
+
|
524
978
|
end
|
525
979
|
|
526
980
|
end
|