biointerchange 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +49 -4
- data/VERSION +1 -1
- data/examples/chromosome_BF.gff +1701 -0
- data/examples/estd176_Banerjee_et_al_2011.2012-11-29.NCBI36.gvf +4326 -0
- data/examples/pubannotation.10096561.json +1 -0
- data/examples/{pubannotation.json → pubannotation.10096561.json.old} +0 -0
- data/examples/pubannotation.2626671.json +1 -0
- data/lib/biointerchange/core.rb +58 -16
- data/lib/biointerchange/genomics/gff3_feature.rb +1 -0
- data/lib/biointerchange/genomics/gff3_feature_set.rb +31 -1
- data/lib/biointerchange/genomics/gff3_pragmas.rb +35 -0
- data/lib/biointerchange/genomics/gff3_rdf_ntriples.rb +60 -23
- data/lib/biointerchange/genomics/gff3_reader.rb +74 -40
- data/lib/biointerchange/genomics/gvf_feature.rb +24 -0
- data/lib/biointerchange/genomics/gvf_feature_set.rb +14 -0
- data/lib/biointerchange/genomics/gvf_pragmas.rb +6 -0
- data/lib/biointerchange/genomics/gvf_reader.rb +37 -0
- data/lib/biointerchange/gff3o.rb +1 -1
- data/lib/biointerchange/gvf1o.rb +145 -17
- data/lib/biointerchange/textmining/content.rb +1 -0
- data/lib/biointerchange/textmining/content_connection.rb +74 -0
- data/lib/biointerchange/textmining/document.rb +3 -1
- data/lib/biointerchange/textmining/pubannos_json_reader.rb +87 -9
- data/lib/biointerchange/textmining/text_mining_rdf_ntriples.rb +58 -2
- data/spec/gff3_rdfwriter_spec.rb +9 -1
- data/spec/gvf_rdfwriter_spec.rb +81 -0
- data/spec/text_mining_pubannos_json_reader_spec.rb +82 -10
- data/spec/text_mining_rdfwriter_spec.rb +11 -0
- data/web/api.html +30 -23
- metadata +156 -138
@@ -0,0 +1 @@
|
|
1
|
+
{"name": "Peter Smith", "name_id": "<peter.smith@example.json>", "date": "2012-12-08", "version": "4", "docurl":"http://www.ncbi.nlm.nih.gov/pubmed/10096561","text":"Stimulation of CD40 on immunogenic human malignant melanomas augments their cytotoxic T lymphocyte-mediated lysis and induces apoptosis.\nHere, we report the functional expression of CD40 on human malignant melanomas (MMs). Comparison of tumor specimen from MM precursor lesions, primary tumors, and metastases revealed that CD40 surface expression is down-regulated during tumor progression. CD40 expression was confirmed in 7 human MM cell lines established from immunogenic primary tumors or metastases, whereas 11 cell lines established from advanced stages were CD40 negative. CD40 expression could be enhanced in CD40-positive MM by stimulation with IFN-gamma and tumor necrosis factor-alpha but not by interleukin (IL)-1beta or CD40 triggering. CD40 ligation on MM by CD40L-transfected murine L-cells or by a soluble CD40L fusion protein up-regulated their expression of intercellular adhesion molecule-1 and MHC class I and class II molecules and their secretion of IL-6, IL-8, tumor necrosis factor-a, and granulocyte macrophage colony-stimulating factor and also induced a rapid activation of the transcription factor nuclear factor kappaB. Furthermore, CD40 ligation of a HLA-A2+, MelanA/MART1+ MM cell line enhanced its susceptibility to specific lysis by a HLA-A2-restricted, MelanA/MART-1-specific CTL clone. Finally, CD40 ligation induced growth inhibition and apoptosis in MM. 
These results indicate that CD40-CD40L interactions may play an important role in augmenting antitumor immunity and inducing apoptosis in some CD40-positive immunogenic human MMs.","catanns":[{"id":"T1","span":{"begin":15,"end":19},"category":"NP"},{"id":"T2","span":{"begin":23,"end":60},"category":"NP"},{"id":"T3","span":{"begin":70,"end":75},"category":"PR"},{"id":"T4","span":{"begin":126,"end":135},"category":"NP"},{"id":"T5","span":{"begin":182,"end":186},"category":"NP"},{"id":"T6","span":{"begin":190,"end":221},"category":"NP"},{"id":"T7","span":{"begin":299,"end":309},"category":"NP"},{"id":"T8","span":{"begin":392,"end":407},"category":"NP"},{"id":"T9","span":{"begin":494,"end":504},"category":"NP"},{"id":"T10","span":{"begin":581,"end":596},"category":"NP"},{"id":"T11","span":{"begin":768,"end":770},"category":"NP"},{"id":"T12","span":{"begin":857,"end":862},"category":"PR"},{"id":"T13","span":{"begin":954,"end":959},"category":"PR"},{"id":"T14","span":{"begin":1163,"end":1217},"category":"NP"},{"id":"T15","span":{"begin":1227,"end":1230},"category":"PR"},{"id":"T16","span":{"begin":1375,"end":1384},"category":"NP"},{"id":"T17","span":{"begin":1388,"end":1390},"category":"NP"},{"id":"T18","span":{"begin":1517,"end":1526},"category":"NP"}],"insanns":[],"relanns":[{"id":"R1","type":"coreferenceOf","subject":"T3","object":"T2"},{"id":"R2","type":"coreferenceOf","subject":"T5","object":"T1"},{"id":"R3","type":"coreferenceOf","subject":"T6","object":"T2"},{"id":"R4","type":"coreferenceOf","subject":"T9","object":"T7"},{"id":"R5","type":"coreferenceOf","subject":"T10","object":"T8"},{"id":"R6","type":"coreferenceOf","subject":"T11","object":"T6"},{"id":"R7","type":"coreferenceOf","subject":"T12","object":"T11"},{"id":"R8","type":"coreferenceOf","subject":"T13","object":"T11"},{"id":"R9","type":"coreferenceOf","subject":"T15","object":"T14"},{"id":"R10","type":"coreferenceOf","subject":"T16","object":"T4"},{"id":"R11","type":"coreferenceOf","subject":"T17","object
":"T11"},{"id":"R12","type":"coreferenceOf","subject":"T18","object":"T16"}],"modanns":[]}
|
File without changes
|
@@ -0,0 +1 @@
|
|
1
|
+
{"name": "Peter Smith", "name_id": "<peter.smith@example.json>", "date": "2012-12-08", "version": "3", "docurl":"http://www.ncbi.nlm.nih.gov/pubmed/2626671","pmcdoc_id":"2626671","div_id":"3","text":"Distinct expression kinetics of perforin and granzyme B during CTL development in culture\nOur experiments revealed clear differences in the kinetics of perforin, granzyme B, and cytokine expression during CD8+ T cell activation (Fig. 1). Naive T cells showed detectable expression of perforin mRNA as well as perforin protein (Fig. 1, A\u2013D). Relative to its expression in naive T cells, perforin (Prf1) mRNA expression did not increase appreciably at day 2 but showed a reproducible decrease at day 4, followed by robust reexpression between days 4 and 8 (Fig. 1, A\u2013D). In contrast, granzyme B (Gzmb) mRNA was low or undetectable in naive T cells but was strongly up-regulated by day 2 after stimulation and increased progressively until day 6 (Fig. 1, A and B); similarly, granzyme B protein was expressed by day 4 and remained high until day 6 (Fig. 1 E). As expected, a small fraction of naive T cells expressed the cytokines IFN-\u03b3 and TNF in response to stimulation, and this capacity increased significantly in differentiated cells (Fig. 1 E; see also Fig. 2 A). \nWe evaluated antigen-dependent cytolytic function in a short-term assay in which target cell death was measured within 2 h (Fig. 1 F). By limiting the duration of TCR stimulation, this strategy minimizes cytolysis secondary to new gene expression during the period of the assay. Naive T cells did not display significant cytolytic function in this short-term assay (unpublished data), most likely because they express immature (unprocessed) forms of perforin and lack the capacity to degranulate (18, 19). Even after activation for 2 or 4 d, the cells showed poor cytolytic activity (Fig. 1 F), in striking contrast to their capacity for efficient cytokine production (Fig. 1 E). 
Only cells cultured until day 6 displayed robust cytotoxicity, as judged by their ability to induce apoptosis in a large number of target cells (Fig. 1 F). \nThese results show that after a strong priming stimulus through TCRs and co-stimulatory receptors in vitro, granzyme B expression and the ability to produce effector cytokines are programmed early, whereas perforin expression and cytolytic function are induced later, during the phase of clonal expansion in IL-2. Therefore, the two major effector functions of CTL, cytokine production and cytolytic activity, are not intrinsically coregulated. \n","catanns":[{"id":"T21","span":{"begin":9,"end":19},"category":"Gene_expression"},{"id":"T1","span":{"begin":32,"end":40},"category":"Protein"},{"id":"T2","span":{"begin":45,"end":55},"category":"Protein"},{"id":"T3","span":{"begin":152,"end":160},"category":"Protein"},{"id":"T4","span":{"begin":162,"end":172},"category":"Protein"},{"id":"T22","span":{"begin":187,"end":197},"category":"Gene_expression"},{"id":"T5","span":{"begin":205,"end":208},"category":"Protein"},{"id":"T23","span":{"begin":270,"end":280},"category":"Gene_expression"},{"id":"T6","span":{"begin":284,"end":292},"category":"Protein"},{"id":"T7","span":{"begin":309,"end":317},"category":"Protein"},{"id":"T19","span":{"begin":353,"end":356},"category":"Anaphora"},{"id":"T24","span":{"begin":357,"end":367},"category":"Gene_expression"},{"id":"T8","span":{"begin":386,"end":394},"category":"Protein"},{"id":"T9","span":{"begin":396,"end":400},"category":"Protein"},{"id":"T25","span":{"begin":402,"end":417},"category":"Transcription"},{"id":"T26","span":{"begin":426,"end":434},"category":"Positive_regulation"},{"id":"T27","span":{"begin":482,"end":490},"category":"Negative_regulation"},{"id":"T28","span":{"begin":520,"end":532},"category":"Positive_regulation"},{"id":"T10","span":{"begin":582,"end":592},"category":"Protein"},{"id":"T11","span":{"begin":594,"end":598},"category":"Protein"},{"id":"T29","span":{"be
gin":609,"end":628},"category":"Transcription"},{"id":"T30","span":{"begin":663,"end":675},"category":"Positive_regulation"},{"id":"T31","span":{"begin":707,"end":716},"category":"Positive_regulation"},{"id":"T12","span":{"begin":773,"end":783},"category":"Protein"},{"id":"T32","span":{"begin":819,"end":832},"category":"Positive_regulation"},{"id":"T33","span":{"begin":904,"end":913},"category":"Gene_expression"},{"id":"T13","span":{"begin":928,"end":933},"category":"Protein"},{"id":"T14","span":{"begin":938,"end":941},"category":"Protein"},{"id":"T20","span":{"begin":974,"end":987},"category":"Anaphora"},{"id":"T34","span":{"begin":988,"end":997},"category":"Positive_regulation"},{"id":"T35","span":{"begin":1478,"end":1485},"category":"Gene_expression"},{"id":"T15","span":{"begin":1518,"end":1526},"category":"Protein"},{"id":"T16","span":{"begin":2013,"end":2023},"category":"Protein"},{"id":"T36","span":{"begin":2024,"end":2034},"category":"Gene_expression"},{"id":"T17","span":{"begin":2111,"end":2119},"category":"Protein"},{"id":"T37","span":{"begin":2120,"end":2130},"category":"Gene_expression"},{"id":"T38","span":{"begin":2158,"end":2165},"category":"Positive_regulation"},{"id":"T18","span":{"begin":2213,"end":2217},"category":"Protein"}],"insanns":[{"id":"E1","type":"subClassOf","object":"T21"},{"id":"E2","type":"subClassOf","object":"T21"},{"id":"E3","type":"subClassOf","object":"T22"},{"id":"E4","type":"subClassOf","object":"T22"},{"id":"E5","type":"subClassOf","object":"T23"},{"id":"E6","type":"subClassOf","object":"T23"},{"id":"E7","type":"subClassOf","object":"T24"},{"id":"E8","type":"subClassOf","object":"T25"},{"id":"E9","type":"subClassOf","object":"T26"},{"id":"E10","type":"subClassOf","object":"T27"},{"id":"E11","type":"subClassOf","object":"T28"},{"id":"E12","type":"subClassOf","object":"T29"},{"id":"E13","type":"subClassOf","object":"T30"},{"id":"E14","type":"subClassOf","object":"T31"},{"id":"E15","type":"subClassOf","object":"T32"},{"id":"E16","ty
pe":"subClassOf","object":"T33"},{"id":"E17","type":"subClassOf","object":"T33"},{"id":"E18","type":"subClassOf","object":"T34"},{"id":"E19","type":"subClassOf","object":"T34"},{"id":"E20","type":"subClassOf","object":"T35"},{"id":"E21","type":"subClassOf","object":"T36"},{"id":"E22","type":"subClassOf","object":"T37"},{"id":"E23","type":"subClassOf","object":"T38"}],"relanns":[{"id":"R4","type":"equivalentTo","subject":"T9","object":"T8"},{"id":"R5","type":"equivalentTo","subject":"T11","object":"T10"},{"id":"R6","type":"themeOf","subject":"T1","object":"E1"},{"id":"R7","type":"themeOf","subject":"T2","object":"E2"},{"id":"R8","type":"themeOf","subject":"T3","object":"E3"},{"id":"R9","type":"themeOf","subject":"T4","object":"E4"},{"id":"R10","type":"themeOf","subject":"T6","object":"E5"},{"id":"R11","type":"themeOf","subject":"T7","object":"E6"},{"id":"R12","type":"themeOf","subject":"T8","object":"E7"},{"id":"R13","type":"themeOf","subject":"T8","object":"E8"},{"id":"R14","type":"themeOf","subject":"E8","object":"E9"},{"id":"R15","type":"themeOf","subject":"E8","object":"E10"},{"id":"R16","type":"themeOf","subject":"E8","object":"E11"},{"id":"R17","type":"themeOf","subject":"T10","object":"E12"},{"id":"R18","type":"themeOf","subject":"T10","object":"E13"},{"id":"R19","type":"themeOf","subject":"T10","object":"E14"},{"id":"R20","type":"themeOf","subject":"T12","object":"E15"},{"id":"R21","type":"themeOf","subject":"T13","object":"E16"},{"id":"R22","type":"themeOf","subject":"T14","object":"E17"},{"id":"R23","type":"themeOf","subject":"E16","object":"E18"},{"id":"R24","type":"themeOf","subject":"E17","object":"E19"},{"id":"R25","type":"themeOf","subject":"T15","object":"E20"},{"id":"R26","type":"themeOf","subject":"T16","object":"E21"},{"id":"R27","type":"themeOf","subject":"T17","object":"E22"},{"id":"R28","type":"themeOf","subject":"E22","object":"E23"}],"modanns":[{"id":"M1","type":"Speculation","object":"E1"},{"id":"M2","type":"Speculation","object":"E2"},{"id":
"M3","type":"Negation","object":"E9"},{"id":"M4","type":"Negation","object":"E12"}]}
|
data/lib/biointerchange/core.rb
CHANGED
@@ -1,3 +1,8 @@
|
|
1
|
+
# BioInterchange converts non-RDF data formats into RDF.
|
2
|
+
#
|
3
|
+
# Convert TSV, XML, GFF3, GVF and other files into RDF triples using
|
4
|
+
# BioInterchange's command-line tool or its web services, or make use
|
5
|
+
# of it as a gem in your own Ruby implementation.
|
1
6
|
module BioInterchange
|
2
7
|
|
3
8
|
# Custom Exceptions and Errors
|
@@ -25,6 +30,7 @@ module BioInterchange
|
|
25
30
|
# Text mining model
|
26
31
|
require 'biointerchange/textmining/document'
|
27
32
|
require 'biointerchange/textmining/content'
|
33
|
+
require 'biointerchange/textmining/content_connection'
|
28
34
|
require 'biointerchange/textmining/process'
|
29
35
|
|
30
36
|
# Text mining writers
|
@@ -34,16 +40,32 @@ module BioInterchange
|
|
34
40
|
# GENOMICS
|
35
41
|
#
|
36
42
|
|
37
|
-
|
43
|
+
### GFF3 ###
|
44
|
+
|
45
|
+
# Reader
|
38
46
|
require 'biointerchange/genomics/gff3_reader'
|
39
47
|
|
40
48
|
# Feature base model
|
49
|
+
require 'biointerchange/genomics/gff3_pragmas'
|
41
50
|
require 'biointerchange/genomics/gff3_feature_set'
|
42
51
|
require 'biointerchange/genomics/gff3_feature'
|
43
52
|
|
44
|
-
#
|
53
|
+
# Writer
|
45
54
|
require 'biointerchange/genomics/gff3_rdf_ntriples'
|
46
55
|
|
56
|
+
### GVF ###
|
57
|
+
|
58
|
+
# Reader
|
59
|
+
require 'biointerchange/genomics/gvf_reader'
|
60
|
+
|
61
|
+
# Feature base model
|
62
|
+
require 'biointerchange/genomics/gvf_pragmas'
|
63
|
+
require 'biointerchange/genomics/gvf_feature_set'
|
64
|
+
require 'biointerchange/genomics/gvf_feature'
|
65
|
+
|
66
|
+
# Writer
|
67
|
+
# ...same GFF3 writer
|
68
|
+
|
47
69
|
#
|
48
70
|
# ACTUAL COMMAND LINE IMPLEMENTATION
|
49
71
|
#
|
@@ -68,19 +90,27 @@ module BioInterchange
|
|
68
90
|
|
69
91
|
if opt['help'] or not opt['input'] or not opt['rdf'] then
|
70
92
|
puts "Usage: ruby #{$0} -i <format> -r <format> [options]"
|
93
|
+
puts ''
|
71
94
|
puts 'Supported input formats (--input <format>/-i <format>):'
|
72
95
|
puts ' biointerchange.gff3 : GFF3'
|
96
|
+
puts ' biointerchange.gvf : GVF'
|
73
97
|
puts ' dbcls.catanns.json : PubAnnotation JSON'
|
74
98
|
puts ' uk.ac.man.pdfx : PDFx XML'
|
99
|
+
puts ''
|
75
100
|
puts 'Supported output formats (--rdf <format>/-r <format>)'
|
76
|
-
puts ' rdf.biointerchange.gff3 : RDF N-Triples for input'
|
101
|
+
puts ' rdf.biointerchange.gff3 : RDF N-Triples for the following input'
|
77
102
|
puts ' biointerchange.gff3'
|
78
|
-
puts ' rdf.
|
103
|
+
puts ' rdf.biointerchange.gvf : RDF N-Triples for the following input'
|
104
|
+
puts ' biointerchange.gff3'
|
105
|
+
puts ' biointerchange.gvf'
|
106
|
+
puts ' rdf.bh12.sio : RDF N-Triples for the following inputs'
|
79
107
|
puts ' dbcls.catanns.json'
|
80
108
|
puts ' uk.ac.man.pdfx'
|
109
|
+
puts ''
|
81
110
|
puts 'I/O options:'
|
82
111
|
puts ' -f <file>/--file <file> : file to read; STDIN used if not supplied'
|
83
112
|
puts ' -o <file>/--out <file> : output file; STDOUT used if not supplied'
|
113
|
+
puts ''
|
84
114
|
puts 'Input-/RDF-format specific options:'
|
85
115
|
puts ' Input: dbcls.catanns.json, uk.ac.man.pdfx'
|
86
116
|
puts ' Output: rdf.bh12.sio'
|
@@ -89,13 +119,15 @@ module BioInterchange
|
|
89
119
|
puts ' -v <version>/--version <version> : version number of resource (optional)'
|
90
120
|
puts ' --name <name> : name of resource/tool/person (required)'
|
91
121
|
puts ' --name_id <id> : URI of resource/tool/person (required)'
|
122
|
+
puts ''
|
92
123
|
puts 'Input-/RDF-format specific options:'
|
93
|
-
puts ' Input: biointerchange.gff3'
|
94
|
-
puts ' Output: rdf.biointerchange.gff3'
|
124
|
+
puts ' Input: biointerchange.gff3 or biointerchange.gvf'
|
125
|
+
puts ' Output: rdf.biointerchange.gff3 or rdf.biointerchange.gvf'
|
95
126
|
puts ' Options:'
|
96
|
-
puts ' -t <date>/--date <date> : date when the GFF3 file was created (optional)'
|
97
|
-
puts ' --name <name> : name of the GFF3 file creator (optional)'
|
98
|
-
puts ' --name_id <id> : email address of the GFF3 file creator (optional)'
|
127
|
+
puts ' -t <date>/--date <date> : date when the GFF3/GVF file was created (optional)'
|
128
|
+
puts ' --name <name> : name of the GFF3/GVF file creator (optional)'
|
129
|
+
puts ' --name_id <id> : email address of the GFF3/GVF file creator (optional)'
|
130
|
+
puts ''
|
99
131
|
puts 'Other options:'
|
100
132
|
puts ' -d / --debug : turn on debugging output (for stacktraces)'
|
101
133
|
puts ' -h --help : this message'
|
@@ -103,6 +135,7 @@ module BioInterchange
|
|
103
135
|
exit 1
|
104
136
|
end
|
105
137
|
|
138
|
+
# Check if the input/rdf options are supported:
|
106
139
|
if opt['input'] == 'dbcls.catanns.json' or opt['input'] == 'uk.ac.man.pdfx' then
|
107
140
|
if opt['rdf'] == 'rdf.bh12.sio' then
|
108
141
|
raise ArgumentError, 'Require --name and --name_id options to specify source of annotations (e.g., a manual annotators name, or software tool name) and their associated URI (e.g., email address, or webaddress).' unless opt['name'] and opt['name_id']
|
@@ -115,22 +148,30 @@ module BioInterchange
|
|
115
148
|
else
|
116
149
|
unsupported_combination
|
117
150
|
end
|
151
|
+
elsif opt['input'] == 'biointerchange.gvf' then
|
152
|
+
if opt['rdf'] == 'rdf.biointerchange.gff3' or opt['rdf'] == 'rdf.biointerchange.gvf' then
|
153
|
+
# Okay. No further arguments required.
|
154
|
+
else
|
155
|
+
unsupported_combination
|
156
|
+
end
|
118
157
|
else
|
119
158
|
unsupported_combination
|
120
159
|
end
|
121
|
-
|
122
160
|
|
123
161
|
opt['date'] = nil unless opt['date']
|
124
162
|
opt['version'] = nil unless opt['version']
|
125
163
|
|
126
|
-
#
|
164
|
+
# Generate model from file (deserialization).
|
165
|
+
# Note: if-clauses are lexicographically ordered.
|
127
166
|
reader = nil
|
128
|
-
if opt['input'] == '
|
167
|
+
if opt['input'] == 'biointerchange.gff3' then
|
168
|
+
reader = BioInterchange::Genomics::GFF3Reader.new(opt['name'], opt['name_id'], opt['date'])
|
169
|
+
elsif opt['input'] == 'biointerchange.gvf' then
|
170
|
+
reader = BioInterchange::Genomics::GVFReader.new(opt['name'], opt['name_id'], opt['date'])
|
171
|
+
elsif opt['input'] == 'dbcls.catanns.json' then
|
129
172
|
reader = BioInterchange::TextMining::PubannosJsonReader.new(opt['name'], opt['name_id'], opt['date'], BioInterchange::TextMining::Process::UNSPECIFIED, opt['version'])
|
130
173
|
elsif opt['input'] == 'uk.ac.man.pdfx' then
|
131
174
|
reader = BioInterchange::TextMining::PdfxXmlReader.new(opt['name'], opt['name_id'], opt['date'], BioInterchange::TextMining::Process::UNSPECIFIED, opt['version'])
|
132
|
-
elsif opt['input'] == 'biointerchange.gff3' then
|
133
|
-
reader = BioInterchange::Genomics::GFF3Reader.new(opt['name'], opt['name_id'], opt['date'])
|
134
175
|
end
|
135
176
|
|
136
177
|
model = nil
|
@@ -140,13 +181,14 @@ module BioInterchange
|
|
140
181
|
model = reader.deserialize(STDIN)
|
141
182
|
end
|
142
183
|
|
143
|
-
#
|
184
|
+
# Generate rdf from model (serialization).
|
185
|
+
# Note: if-clauses are lexicographically ordered.
|
144
186
|
writer = nil
|
145
187
|
if opt['rdf'] == 'rdf.bh12.sio' then
|
146
188
|
writer = BioInterchange::TextMining::RDFWriter.new(File.new(opt['out'], 'w')) if opt['out']
|
147
189
|
writer = BioInterchange::TextMining::RDFWriter.new(STDOUT) unless opt['out']
|
148
190
|
end
|
149
|
-
if opt['rdf'] == 'rdf.biointerchange.gff3' then
|
191
|
+
if opt['rdf'] == 'rdf.biointerchange.gff3' or opt['rdf'] == 'rdf.biointerchange.gvf' then
|
150
192
|
writer = BioInterchange::Genomics::RDFWriter.new(File.new(opt['out'], 'w')) if opt['out']
|
151
193
|
writer = BioInterchange::Genomics::RDFWriter.new(STDOUT) unless opt['out']
|
152
194
|
end
|
@@ -2,6 +2,7 @@ require 'digest/sha1'
|
|
2
2
|
|
3
3
|
module BioInterchange::Genomics
|
4
4
|
|
5
|
+
# A GFF3 feature set, which encapsulates the information of a single GFF3 file.
|
5
6
|
class GFF3FeatureSet
|
6
7
|
|
7
8
|
# Create a new instance of a Generic Feature Format Version 3 (GFF3) feature set. A feature
|
@@ -9,13 +10,34 @@ class GFF3FeatureSet
|
|
9
10
|
def initialize
|
10
11
|
# Features are stored as the keys of a hash map to increase performance:
|
11
12
|
@set = {}
|
13
|
+
# Pragmas, i.e. feature meta-information, are stored as named mappings. Many
|
14
|
+
# pragmas are simple key/value assignments, but others permit multiple values
|
15
|
+
# whose ordering does matter. In that case, an array is used to store the
|
16
|
+
# various values.
|
17
|
+
@pragmas = {}
|
12
18
|
end
|
13
19
|
|
14
|
-
# Returns the contents of the feature set.
|
20
|
+
# Returns the contents of the feature set -- excluding pragma meta-data.
|
15
21
|
def contents
|
16
22
|
@set.keys
|
17
23
|
end
|
18
24
|
|
25
|
+
# Returns information stored for a named pragma, or nil if there is no information
|
26
|
+
# stored for it.
|
27
|
+
#
|
28
|
+
# +name+:: a string representing the name of the pragma whose value we are interested in
|
29
|
+
def pragma(name)
|
30
|
+
return nil unless name
|
31
|
+
# TODO Should throw exception if name is not a string.
|
32
|
+
return nil unless name.kind_of?(String)
|
33
|
+
@pragmas[name]
|
34
|
+
end
|
35
|
+
|
36
|
+
# Returns the names of all the pragmas for which some information has been recorded.
|
37
|
+
def pragmas
|
38
|
+
@pragmas.keys
|
39
|
+
end
|
40
|
+
|
19
41
|
# Returns a URI for this particular feature set, which is a SHA1 hash over the content's concatenated properties.
|
20
42
|
def uri
|
21
43
|
clob = ''
|
@@ -32,6 +54,14 @@ class GFF3FeatureSet
|
|
32
54
|
@set[feature] = true
|
33
55
|
end
|
34
56
|
|
57
|
+
# Sets the value for named pragma meta-data.
|
58
|
+
#
|
59
|
+
# +name+:: a string representing the unique name of the pragma
|
60
|
+
# +value+:: an object representing the value of the pragma assignment
|
61
|
+
def set_pragma(name, value)
|
62
|
+
# TODO Should throw exception if name is not a string.
|
63
|
+
@pragmas[name] = value
|
64
|
+
end
|
35
65
|
end
|
36
66
|
|
37
67
|
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
|
2
|
+
module BioInterchange::Genomics
|
3
|
+
|
4
|
+
# Represents a named region, which is defined by the pragma statement 'sequence-region'.
|
5
|
+
class GFF3NamedRegion
|
6
|
+
|
7
|
+
# Create a new instance of a named region.
|
8
|
+
#
|
9
|
+
# +seqid+:: unique identifier (in the GFF3 file context) that identifies this region
|
10
|
+
# +start_coordinate+:: genomic start coordinate of the region
|
11
|
+
# +end_coordinate+:: genomic end coordinate of the region
|
12
|
+
def initialize(seqid, start_coordinate, end_coordinate)
|
13
|
+
@seqid = seqid
|
14
|
+
@start_coordinate = start_coordinate
|
15
|
+
@end_coordinate = end_coordinate
|
16
|
+
end
|
17
|
+
|
18
|
+
# Returns the unique identifier (based on a GFF3 file context) of the region.
|
19
|
+
def seqid
|
20
|
+
@seqid
|
21
|
+
end
|
22
|
+
|
23
|
+
# Returns the start coordinate of the region.
|
24
|
+
def start_coordinate
|
25
|
+
@start_coordinate
|
26
|
+
end
|
27
|
+
|
28
|
+
# Returns the end coordinate of the region.
|
29
|
+
def end_coordinate
|
30
|
+
@end_coordinate
|
31
|
+
end
|
32
|
+
|
33
|
+
end
|
34
|
+
|
35
|
+
end
|
@@ -4,6 +4,15 @@ require 'date'
|
|
4
4
|
|
5
5
|
module BioInterchange::Genomics
|
6
6
|
|
7
|
+
# Serializes GFF3 and GVF models.
|
8
|
+
#
|
9
|
+
# Inputs:
|
10
|
+
# - biointerchange.gff3
|
11
|
+
# - biointerchange.gvf
|
12
|
+
#
|
13
|
+
# Outputs:
|
14
|
+
# - rdf.biointerchange.gff3
|
15
|
+
# - rdf.biointerchange.gvf
|
7
16
|
class RDFWriter
|
8
17
|
|
9
18
|
# Creates a new instance of an RDFWriter that will use the provided output stream to serialize RDF.
|
@@ -19,14 +28,19 @@ class RDFWriter
|
|
19
28
|
# +model+:: a generic representation of input data that is derived from BioInterchange::Genomics::GFF3FeatureSet
|
20
29
|
def serialize(model)
|
21
30
|
if model.instance_of?(BioInterchange::Genomics::GFF3FeatureSet) then
|
31
|
+
@base = BioInterchange::GFF3O
|
32
|
+
serialize_model(model)
|
33
|
+
elsif model.instance_of?(BioInterchange::Genomics::GVFFeatureSet) then
|
34
|
+
@base = BioInterchange::GVF1O
|
22
35
|
serialize_model(model)
|
23
36
|
else
|
24
37
|
raise BioInterchange::Exceptions::ImplementationWriterError, 'The provided model cannot be serialized. ' +
|
25
|
-
'This writer supports serialization for BioInterchange::Genomics::GFF3FeatureSet
|
38
|
+
'This writer supports serialization for BioInterchange::Genomics::GFF3FeatureSet and '
|
39
|
+
'BioInterchange::Genomics::GVFFeatureSet.'
|
26
40
|
end
|
27
41
|
end
|
28
42
|
|
29
|
-
|
43
|
+
protected
|
30
44
|
|
31
45
|
# Serializes RDF for a feature set representation.
|
32
46
|
#
|
@@ -34,11 +48,34 @@ private
|
|
34
48
|
def serialize_model(model)
|
35
49
|
graph = RDF::Graph.new
|
36
50
|
set_uri = RDF::URI.new(model.uri)
|
37
|
-
graph.insert(RDF::Statement.new(set_uri, RDF.type,
|
51
|
+
graph.insert(RDF::Statement.new(set_uri, RDF.type, @base.Set))
|
52
|
+
model.pragmas.each { |pragma_name|
|
53
|
+
serialize_pragma(graph, set_uri, model.pragma(pragma_name))
|
54
|
+
}
|
38
55
|
model.contents.each { |feature|
|
39
56
|
serialize_feature(graph, set_uri, feature)
|
40
57
|
}
|
41
58
|
RDF::NTriples::Writer.dump(graph, @ostream)
|
59
|
+
# TODO Figure out why the following is very slow. Use with 'rdf-raptor'.
|
60
|
+
# RDF::RDFXML::Writer.dump(graph, @ostream)
|
61
|
+
end
|
62
|
+
|
63
|
+
# Serializes pragmas for a given feature set URI.
|
64
|
+
# +graph+:: RDF graph to which the pragmas are added
|
65
|
+
# +set_uri+:: the feature set URI to which the pragmas belong
|
66
|
+
# +pragma+:: an object representing a pragma statement
|
67
|
+
def serialize_pragma(graph, set_uri, pragma)
|
68
|
+
if pragma.kind_of?(Hash) then
|
69
|
+
if pragma.has_key?('gff-version') and @base == BioInterchange::GFF3O then
|
70
|
+
graph.insert(RDF::Statement.new(set_uri, @base.version, RDF::Literal.new(pragma['gff-version'], :datatype => RDF::XSD.float )))
|
71
|
+
elsif pragma.has_key?('gff-version') and @base == BioInterchange::GVF1O then
|
72
|
+
graph.insert(RDF::Statement.new(set_uri, @base.gff_version, RDF::Literal.new(pragma['gff-version'], :datatype => RDF::XSD.float )))
|
73
|
+
elsif pragma.has_key?('gvf-version') and @base == BioInterchange::GVF1O then
|
74
|
+
graph.insert(RDF::Statement.new(set_uri, @base.gvf_version, RDF::Literal.new(pragma['gvf-version'], :datatype => RDF::XSD.float )))
|
75
|
+
end
|
76
|
+
else
|
77
|
+
# TODO
|
78
|
+
end
|
42
79
|
end
|
43
80
|
|
44
81
|
# Serializes a +GFF3Feature+ object for a given feature set URI.
|
@@ -50,30 +87,30 @@ private
|
|
50
87
|
# TODO Make sure there is only one value in the 'ID' list.
|
51
88
|
feature_uri = RDF::URI.new("#{set_uri.to_s}/feature/#{feature.sequence_id},#{feature.source},#{feature.type.to_s.sub(/^[^:]+:\/\//, '')},#{feature.start_coordinate},#{feature.end_coordinate},#{feature.strand},#{feature.phase}") unless feature.attributes.has_key?('ID')
|
52
89
|
feature_uri = RDF::URI.new("#{set_uri.to_s}/feature/#{feature.attributes['ID'][0]}") if feature.attributes.has_key?('ID')
|
53
|
-
feature_properties =
|
54
|
-
graph.insert(RDF::Statement.new(set_uri,
|
55
|
-
graph.insert(RDF::Statement.new(feature_uri, RDF.type,
|
56
|
-
graph.insert(RDF::Statement.new(feature_uri,
|
57
|
-
graph.insert(RDF::Statement.new(feature_uri,
|
58
|
-
graph.insert(RDF::Statement.new(feature_uri,
|
59
|
-
graph.insert(RDF::Statement.new(feature_uri,
|
60
|
-
graph.insert(RDF::Statement.new(feature_uri,
|
61
|
-
graph.insert(RDF::Statement.new(feature_uri,
|
62
|
-
feature_properties =
|
63
|
-
strand_uri =
|
90
|
+
feature_properties = @base.feature_properties.select { |uri| @base.is_datatype_property?(uri) }[0]
|
91
|
+
graph.insert(RDF::Statement.new(set_uri, @base.contains, feature_uri))
|
92
|
+
graph.insert(RDF::Statement.new(feature_uri, RDF.type, @base.Feature))
|
93
|
+
graph.insert(RDF::Statement.new(feature_uri, @base.with_parent([ @base.seqid ].flatten, feature_properties)[0], RDF::Literal.new(feature.sequence_id)))
|
94
|
+
graph.insert(RDF::Statement.new(feature_uri, @base.source, RDF::Literal.new(feature.source)))
|
95
|
+
graph.insert(RDF::Statement.new(feature_uri, @base.type, RDF::Literal.new(feature.type)))
|
96
|
+
graph.insert(RDF::Statement.new(feature_uri, @base.with_parent(@base.start, feature_properties)[0], RDF::Literal.new(feature.start_coordinate)))
|
97
|
+
graph.insert(RDF::Statement.new(feature_uri, @base.with_parent(@base.end, feature_properties)[0], RDF::Literal.new(feature.end_coordinate)))
|
98
|
+
graph.insert(RDF::Statement.new(feature_uri, @base.score, RDF::Literal.new(feature.score))) if feature.score
|
99
|
+
feature_properties = @base.feature_properties.select { |uri| @base.is_object_property?(uri) }[0]
|
100
|
+
strand_uri = @base.with_parent(@base.strand, feature_properties)[0]
|
64
101
|
case feature.strand
|
65
102
|
when BioInterchange::Genomics::GFF3Feature::NOT_STRANDED
|
66
|
-
graph.insert(RDF::Statement.new(feature_uri, strand_uri,
|
103
|
+
graph.insert(RDF::Statement.new(feature_uri, strand_uri, @base.NotStranded))
|
67
104
|
when BioInterchange::Genomics::GFF3Feature::UNKNOWN
|
68
|
-
graph.insert(RDF::Statement.new(feature_uri, strand_uri,
|
105
|
+
graph.insert(RDF::Statement.new(feature_uri, strand_uri, @base.UnknownStrand))
|
69
106
|
when BioInterchange::Genomics::GFF3Feature::POSITIVE
|
70
|
-
graph.insert(RDF::Statement.new(feature_uri, strand_uri,
|
107
|
+
graph.insert(RDF::Statement.new(feature_uri, strand_uri, @base.Positive))
|
71
108
|
when BioInterchange::Genomics::GFF3Feature::NEGATIVE
|
72
|
-
graph.insert(RDF::Statement.new(feature_uri, strand_uri,
|
109
|
+
graph.insert(RDF::Statement.new(feature_uri, strand_uri, @base.Negative))
|
73
110
|
else
|
74
111
|
raise ArgumentException, 'Strand of feature is set to an unknown constant.'
|
75
112
|
end
|
76
|
-
graph.insert(RDF::Statement.new(feature_uri,
|
113
|
+
graph.insert(RDF::Statement.new(feature_uri, @base.phase, RDF::Literal.new(feature.phase))) if feature.phase
|
77
114
|
|
78
115
|
serialize_attributes(graph, set_uri, feature_uri, feature.attributes) unless feature.attributes.keys.empty?
|
79
116
|
end
|
@@ -88,16 +125,16 @@ private
|
|
88
125
|
attributes.each_pair { |tag, list|
|
89
126
|
if tag == 'Parent' then
|
90
127
|
list.each { |parent_id|
|
91
|
-
graph.insert(RDF::Statement.new(feature_uri,
|
128
|
+
graph.insert(RDF::Statement.new(feature_uri, @base.parent, RDF::URI.new("#{set_uri.to_s}/feature/#{parent_id}")))
|
92
129
|
}
|
93
130
|
else
|
94
131
|
list.each_index { |index|
|
95
132
|
value = list[index]
|
96
133
|
attribute_uri = RDF::URI.new("#{feature_uri.to_s}/attribute/#{tag}") if list.size == 1
|
97
134
|
attribute_uri = RDF::URI.new("#{feature_uri.to_s}/attribute/#{tag}-#{index + 1}") unless list.size == 1
|
98
|
-
graph.insert(RDF::Statement.new(feature_uri,
|
99
|
-
graph.insert(RDF::Statement.new(attribute_uri, RDF.type,
|
100
|
-
graph.insert(RDF::Statement.new(attribute_uri,
|
135
|
+
graph.insert(RDF::Statement.new(feature_uri, @base.attributes, attribute_uri))
|
136
|
+
graph.insert(RDF::Statement.new(attribute_uri, RDF.type, @base.Attribute))
|
137
|
+
graph.insert(RDF::Statement.new(attribute_uri, @base.tag, RDF::Literal.new("#{tag}")))
|
101
138
|
graph.insert(RDF::Statement.new(attribute_uri, RDF.value, RDF::Literal.new(value)))
|
102
139
|
}
|
103
140
|
end
|
@@ -26,60 +26,94 @@ class GFF3Reader
|
|
26
26
|
end
|
27
27
|
end
|
28
28
|
|
29
|
-
|
29
|
+
protected
|
30
|
+
|
31
|
+
# Factory hook for the container that +create_model+ fills.
#
# Returns a new, empty BioInterchange::Genomics::GFF3FeatureSet.
def create_feature_set
  BioInterchange::Genomics::GFF3FeatureSet.new
end
|
30
34
|
|
31
35
|
# Builds a feature set from GFF3 input by handing every relevant line to
# either +add_pragma+ or +add_feature+.
#
# Comment lines (a single leading '#') and blank lines are skipped. Reading
# stops at the '##FASTA' directive, because sequences are ignored for now.
#
# +gff3+:: object whose +each_line+ yields the lines of the GFF3 data
#
# Returns the feature set that was obtained from +create_feature_set+.
def create_model(gff3)
  feature_set = create_feature_set
  gff3.each_line do |line|
    # Blank lines carry no columns; handing them to add_feature would fail
    # there when the (missing) type column is inspected.
    next if line.strip.empty?

    # Ignore sequences for now.
    break if line.start_with?('##FASTA')

    if line.start_with?('##')
      add_pragma(feature_set, line)
    elsif !line.start_with?('#')
      add_feature(feature_set, line)
    end
    # Remaining case: a plain comment line, which is dropped.
  end

  feature_set
end
|
65
52
|
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
else
|
70
|
-
phase = nil
|
71
|
-
end
|
53
|
+
# Parses one tab-separated GFF3 feature line and adds the resulting
# BioInterchange::Genomics::GFF3Feature to the given feature set.
#
# +feature_set+:: container that receives the new feature via +add+
# +line+:: feature line with up to nine tab-separated columns; its trailing
#          line break is removed in place
def add_feature(feature_set, line)
  line.chomp!
  seqid, source, type, start_coordinate, end_coordinate, score, strand, phase, attributes = line.split("\t")

  # The type might be a SO/SOFA term, SO/SOFA accession, or other term (it stays a string then):
  # NOTE(review): the SOFA lookup dispatches on text taken from the input, and
  # +methods+ also lists methods every object inherits (e.g. 'class', 'hash'),
  # so such a type value would be dispatched as well. Confirm whether the
  # lookup should be limited to the ontology terms.
  sofa_name = type.gsub(' ', '_')
  if type.match(/SO:\d+/)
    type = RDF::URI.new("http://purl.obolibrary.org/obo/#{type.sub(':', '_')}")
  elsif BioInterchange::SOFA.methods.include?(sofa_name.to_sym)
    type = BioInterchange::SOFA.send(sofa_name)
  end

  # String to numeric value conversions:
  start_coordinate = start_coordinate.to_i
  end_coordinate = end_coordinate.to_i
  score = (score == '.') ? nil : score.to_f

  # Determine strandedness:
  strand = case strand
           when '?' then BioInterchange::Genomics::GFF3Feature::UNKNOWN
           when '+' then BioInterchange::Genomics::GFF3Feature::POSITIVE
           when '-' then BioInterchange::Genomics::GFF3Feature::NEGATIVE
           else BioInterchange::Genomics::GFF3Feature::NOT_STRANDED
           end

  # Set phase, if it lies in the permissible range of values:
  phase = %w[0 1 2].include?(phase) ? phase.to_i : nil

  # Turn "tag=value1,value2;..." into { tag => [ value1, value2 ] }. A tag
  # that occurs more than once keeps the values of its last occurrence.
  tags = {}
  attributes.to_s.split(';').each do |assignment|
    match = assignment.match(/([^=]+)=(.+)/)
    # Segments without a tag=value pair (the '.' placeholder, or whitespace
    # after a trailing ';') hold no attribute and are skipped.
    next unless match
    tags[match[1].strip] = match[2].split(',').map { |value| value.strip }
  end

  feature_set.add(BioInterchange::Genomics::GFF3Feature.new(seqid, source, type, start_coordinate, end_coordinate, score, strand, phase, tags))
end
|
82
97
|
|
98
|
+
# Interprets one pragma line ('##name value') and records it in the feature set.
#
# 'gff-version' is stored as a hash of the name to the version as a float,
# 'sequence-region' entries are accumulated in a hash of seqid to
# BioInterchange::Genomics::GFF3NamedRegion, and every other pragma is stored
# with its value as a string.
#
# +feature_set+:: container that is read via +pragma+ and updated via +set_pragma+
# +line+:: line starting with '##'; its trailing line break is removed in place
def add_pragma(feature_set, line)
  line.chomp!
  name, value = line[2..-1].to_s.split(/\s/, 2)

  # A bare '##' has no name, and directives made up of '#' characters only
  # (such as '###') carry no name/value pair, so there is nothing to record.
  return if name.nil? || name.start_with?('#')

  # A pragma without a value leaves +value+ nil; normalise it to an empty
  # string so that the conversions below cannot fail.
  value = value.to_s.strip

  # Interpret pragmas depending on their definition:
  case name
  when 'gff-version'
    feature_set.set_pragma(name, { name => value.to_f })
  when 'sequence-region'
    regions = feature_set.pragma(name) || {}
    seqid, start_coordinate, end_coordinate = value.split(/\s+/, 3)
    regions[seqid] = BioInterchange::Genomics::GFF3NamedRegion.new(seqid, start_coordinate.to_i, end_coordinate.to_i)
    feature_set.set_pragma(name, regions)
  else
    # Unhandled pragma. Just save the value in its string form.
    feature_set.set_pragma(name, value)
  end
end
|
83
117
|
end
|
84
118
|
|
85
119
|
end
|