biointerchange 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30) hide show
  1. data/README.md +49 -4
  2. data/VERSION +1 -1
  3. data/examples/chromosome_BF.gff +1701 -0
  4. data/examples/estd176_Banerjee_et_al_2011.2012-11-29.NCBI36.gvf +4326 -0
  5. data/examples/pubannotation.10096561.json +1 -0
  6. data/examples/{pubannotation.json → pubannotation.10096561.json.old} +0 -0
  7. data/examples/pubannotation.2626671.json +1 -0
  8. data/lib/biointerchange/core.rb +58 -16
  9. data/lib/biointerchange/genomics/gff3_feature.rb +1 -0
  10. data/lib/biointerchange/genomics/gff3_feature_set.rb +31 -1
  11. data/lib/biointerchange/genomics/gff3_pragmas.rb +35 -0
  12. data/lib/biointerchange/genomics/gff3_rdf_ntriples.rb +60 -23
  13. data/lib/biointerchange/genomics/gff3_reader.rb +74 -40
  14. data/lib/biointerchange/genomics/gvf_feature.rb +24 -0
  15. data/lib/biointerchange/genomics/gvf_feature_set.rb +14 -0
  16. data/lib/biointerchange/genomics/gvf_pragmas.rb +6 -0
  17. data/lib/biointerchange/genomics/gvf_reader.rb +37 -0
  18. data/lib/biointerchange/gff3o.rb +1 -1
  19. data/lib/biointerchange/gvf1o.rb +145 -17
  20. data/lib/biointerchange/textmining/content.rb +1 -0
  21. data/lib/biointerchange/textmining/content_connection.rb +74 -0
  22. data/lib/biointerchange/textmining/document.rb +3 -1
  23. data/lib/biointerchange/textmining/pubannos_json_reader.rb +87 -9
  24. data/lib/biointerchange/textmining/text_mining_rdf_ntriples.rb +58 -2
  25. data/spec/gff3_rdfwriter_spec.rb +9 -1
  26. data/spec/gvf_rdfwriter_spec.rb +81 -0
  27. data/spec/text_mining_pubannos_json_reader_spec.rb +82 -10
  28. data/spec/text_mining_rdfwriter_spec.rb +11 -0
  29. data/web/api.html +30 -23
  30. metadata +156 -138
@@ -0,0 +1 @@
1
+ {"name": "Peter Smith", "name_id": "<peter.smith@example.json>", "date": "2012-12-08", "version": "4", "docurl":"http://www.ncbi.nlm.nih.gov/pubmed/10096561","text":"Stimulation of CD40 on immunogenic human malignant melanomas augments their cytotoxic T lymphocyte-mediated lysis and induces apoptosis.\nHere, we report the functional expression of CD40 on human malignant melanomas (MMs). Comparison of tumor specimen from MM precursor lesions, primary tumors, and metastases revealed that CD40 surface expression is down-regulated during tumor progression. CD40 expression was confirmed in 7 human MM cell lines established from immunogenic primary tumors or metastases, whereas 11 cell lines established from advanced stages were CD40 negative. CD40 expression could be enhanced in CD40-positive MM by stimulation with IFN-gamma and tumor necrosis factor-alpha but not by interleukin (IL)-1beta or CD40 triggering. CD40 ligation on MM by CD40L-transfected murine L-cells or by a soluble CD40L fusion protein up-regulated their expression of intercellular adhesion molecule-1 and MHC class I and class II molecules and their secretion of IL-6, IL-8, tumor necrosis factor-a, and granulocyte macrophage colony-stimulating factor and also induced a rapid activation of the transcription factor nuclear factor kappaB. Furthermore, CD40 ligation of a HLA-A2+, MelanA/MART1+ MM cell line enhanced its susceptibility to specific lysis by a HLA-A2-restricted, MelanA/MART-1-specific CTL clone. Finally, CD40 ligation induced growth inhibition and apoptosis in MM. 
These results indicate that CD40-CD40L interactions may play an important role in augmenting antitumor immunity and inducing apoptosis in some CD40-positive immunogenic human MMs.","catanns":[{"id":"T1","span":{"begin":15,"end":19},"category":"NP"},{"id":"T2","span":{"begin":23,"end":60},"category":"NP"},{"id":"T3","span":{"begin":70,"end":75},"category":"PR"},{"id":"T4","span":{"begin":126,"end":135},"category":"NP"},{"id":"T5","span":{"begin":182,"end":186},"category":"NP"},{"id":"T6","span":{"begin":190,"end":221},"category":"NP"},{"id":"T7","span":{"begin":299,"end":309},"category":"NP"},{"id":"T8","span":{"begin":392,"end":407},"category":"NP"},{"id":"T9","span":{"begin":494,"end":504},"category":"NP"},{"id":"T10","span":{"begin":581,"end":596},"category":"NP"},{"id":"T11","span":{"begin":768,"end":770},"category":"NP"},{"id":"T12","span":{"begin":857,"end":862},"category":"PR"},{"id":"T13","span":{"begin":954,"end":959},"category":"PR"},{"id":"T14","span":{"begin":1163,"end":1217},"category":"NP"},{"id":"T15","span":{"begin":1227,"end":1230},"category":"PR"},{"id":"T16","span":{"begin":1375,"end":1384},"category":"NP"},{"id":"T17","span":{"begin":1388,"end":1390},"category":"NP"},{"id":"T18","span":{"begin":1517,"end":1526},"category":"NP"}],"insanns":[],"relanns":[{"id":"R1","type":"coreferenceOf","subject":"T3","object":"T2"},{"id":"R2","type":"coreferenceOf","subject":"T5","object":"T1"},{"id":"R3","type":"coreferenceOf","subject":"T6","object":"T2"},{"id":"R4","type":"coreferenceOf","subject":"T9","object":"T7"},{"id":"R5","type":"coreferenceOf","subject":"T10","object":"T8"},{"id":"R6","type":"coreferenceOf","subject":"T11","object":"T6"},{"id":"R7","type":"coreferenceOf","subject":"T12","object":"T11"},{"id":"R8","type":"coreferenceOf","subject":"T13","object":"T11"},{"id":"R9","type":"coreferenceOf","subject":"T15","object":"T14"},{"id":"R10","type":"coreferenceOf","subject":"T16","object":"T4"},{"id":"R11","type":"coreferenceOf","subject":"T17","object
":"T11"},{"id":"R12","type":"coreferenceOf","subject":"T18","object":"T16"}],"modanns":[]}
@@ -0,0 +1 @@
1
+ {"name": "Peter Smith", "name_id": "<peter.smith@example.json>", "date": "2012-12-08", "version": "3", "docurl":"http://www.ncbi.nlm.nih.gov/pubmed/2626671","pmcdoc_id":"2626671","div_id":"3","text":"Distinct expression kinetics of perforin and granzyme B during CTL development in culture\nOur experiments revealed clear differences in the kinetics of perforin, granzyme B, and cytokine expression during CD8+ T cell activation (Fig. 1). Naive T cells showed detectable expression of perforin mRNA as well as perforin protein (Fig. 1, A\u2013D). Relative to its expression in naive T cells, perforin (Prf1) mRNA expression did not increase appreciably at day 2 but showed a reproducible decrease at day 4, followed by robust reexpression between days 4 and 8 (Fig. 1, A\u2013D). In contrast, granzyme B (Gzmb) mRNA was low or undetectable in naive T cells but was strongly up-regulated by day 2 after stimulation and increased progressively until day 6 (Fig. 1, A and B); similarly, granzyme B protein was expressed by day 4 and remained high until day 6 (Fig. 1 E). As expected, a small fraction of naive T cells expressed the cytokines IFN-\u03b3 and TNF in response to stimulation, and this capacity increased significantly in differentiated cells (Fig. 1 E; see also Fig. 2 A). \nWe evaluated antigen-dependent cytolytic function in a short-term assay in which target cell death was measured within 2 h (Fig. 1 F). By limiting the duration of TCR stimulation, this strategy minimizes cytolysis secondary to new gene expression during the period of the assay. Naive T cells did not display significant cytolytic function in this short-term assay (unpublished data), most likely because they express immature (unprocessed) forms of perforin and lack the capacity to degranulate (18, 19). Even after activation for 2 or 4 d, the cells showed poor cytolytic activity (Fig. 1 F), in striking contrast to their capacity for efficient cytokine production (Fig. 1 E). 
Only cells cultured until day 6 displayed robust cytotoxicity, as judged by their ability to induce apoptosis in a large number of target cells (Fig. 1 F). \nThese results show that after a strong priming stimulus through TCRs and co-stimulatory receptors in vitro, granzyme B expression and the ability to produce effector cytokines are programmed early, whereas perforin expression and cytolytic function are induced later, during the phase of clonal expansion in IL-2. Therefore, the two major effector functions of CTL, cytokine production and cytolytic activity, are not intrinsically coregulated. \n","catanns":[{"id":"T21","span":{"begin":9,"end":19},"category":"Gene_expression"},{"id":"T1","span":{"begin":32,"end":40},"category":"Protein"},{"id":"T2","span":{"begin":45,"end":55},"category":"Protein"},{"id":"T3","span":{"begin":152,"end":160},"category":"Protein"},{"id":"T4","span":{"begin":162,"end":172},"category":"Protein"},{"id":"T22","span":{"begin":187,"end":197},"category":"Gene_expression"},{"id":"T5","span":{"begin":205,"end":208},"category":"Protein"},{"id":"T23","span":{"begin":270,"end":280},"category":"Gene_expression"},{"id":"T6","span":{"begin":284,"end":292},"category":"Protein"},{"id":"T7","span":{"begin":309,"end":317},"category":"Protein"},{"id":"T19","span":{"begin":353,"end":356},"category":"Anaphora"},{"id":"T24","span":{"begin":357,"end":367},"category":"Gene_expression"},{"id":"T8","span":{"begin":386,"end":394},"category":"Protein"},{"id":"T9","span":{"begin":396,"end":400},"category":"Protein"},{"id":"T25","span":{"begin":402,"end":417},"category":"Transcription"},{"id":"T26","span":{"begin":426,"end":434},"category":"Positive_regulation"},{"id":"T27","span":{"begin":482,"end":490},"category":"Negative_regulation"},{"id":"T28","span":{"begin":520,"end":532},"category":"Positive_regulation"},{"id":"T10","span":{"begin":582,"end":592},"category":"Protein"},{"id":"T11","span":{"begin":594,"end":598},"category":"Protein"},{"id":"T29","span":{"be
gin":609,"end":628},"category":"Transcription"},{"id":"T30","span":{"begin":663,"end":675},"category":"Positive_regulation"},{"id":"T31","span":{"begin":707,"end":716},"category":"Positive_regulation"},{"id":"T12","span":{"begin":773,"end":783},"category":"Protein"},{"id":"T32","span":{"begin":819,"end":832},"category":"Positive_regulation"},{"id":"T33","span":{"begin":904,"end":913},"category":"Gene_expression"},{"id":"T13","span":{"begin":928,"end":933},"category":"Protein"},{"id":"T14","span":{"begin":938,"end":941},"category":"Protein"},{"id":"T20","span":{"begin":974,"end":987},"category":"Anaphora"},{"id":"T34","span":{"begin":988,"end":997},"category":"Positive_regulation"},{"id":"T35","span":{"begin":1478,"end":1485},"category":"Gene_expression"},{"id":"T15","span":{"begin":1518,"end":1526},"category":"Protein"},{"id":"T16","span":{"begin":2013,"end":2023},"category":"Protein"},{"id":"T36","span":{"begin":2024,"end":2034},"category":"Gene_expression"},{"id":"T17","span":{"begin":2111,"end":2119},"category":"Protein"},{"id":"T37","span":{"begin":2120,"end":2130},"category":"Gene_expression"},{"id":"T38","span":{"begin":2158,"end":2165},"category":"Positive_regulation"},{"id":"T18","span":{"begin":2213,"end":2217},"category":"Protein"}],"insanns":[{"id":"E1","type":"subClassOf","object":"T21"},{"id":"E2","type":"subClassOf","object":"T21"},{"id":"E3","type":"subClassOf","object":"T22"},{"id":"E4","type":"subClassOf","object":"T22"},{"id":"E5","type":"subClassOf","object":"T23"},{"id":"E6","type":"subClassOf","object":"T23"},{"id":"E7","type":"subClassOf","object":"T24"},{"id":"E8","type":"subClassOf","object":"T25"},{"id":"E9","type":"subClassOf","object":"T26"},{"id":"E10","type":"subClassOf","object":"T27"},{"id":"E11","type":"subClassOf","object":"T28"},{"id":"E12","type":"subClassOf","object":"T29"},{"id":"E13","type":"subClassOf","object":"T30"},{"id":"E14","type":"subClassOf","object":"T31"},{"id":"E15","type":"subClassOf","object":"T32"},{"id":"E16","ty
pe":"subClassOf","object":"T33"},{"id":"E17","type":"subClassOf","object":"T33"},{"id":"E18","type":"subClassOf","object":"T34"},{"id":"E19","type":"subClassOf","object":"T34"},{"id":"E20","type":"subClassOf","object":"T35"},{"id":"E21","type":"subClassOf","object":"T36"},{"id":"E22","type":"subClassOf","object":"T37"},{"id":"E23","type":"subClassOf","object":"T38"}],"relanns":[{"id":"R4","type":"equivalentTo","subject":"T9","object":"T8"},{"id":"R5","type":"equivalentTo","subject":"T11","object":"T10"},{"id":"R6","type":"themeOf","subject":"T1","object":"E1"},{"id":"R7","type":"themeOf","subject":"T2","object":"E2"},{"id":"R8","type":"themeOf","subject":"T3","object":"E3"},{"id":"R9","type":"themeOf","subject":"T4","object":"E4"},{"id":"R10","type":"themeOf","subject":"T6","object":"E5"},{"id":"R11","type":"themeOf","subject":"T7","object":"E6"},{"id":"R12","type":"themeOf","subject":"T8","object":"E7"},{"id":"R13","type":"themeOf","subject":"T8","object":"E8"},{"id":"R14","type":"themeOf","subject":"E8","object":"E9"},{"id":"R15","type":"themeOf","subject":"E8","object":"E10"},{"id":"R16","type":"themeOf","subject":"E8","object":"E11"},{"id":"R17","type":"themeOf","subject":"T10","object":"E12"},{"id":"R18","type":"themeOf","subject":"T10","object":"E13"},{"id":"R19","type":"themeOf","subject":"T10","object":"E14"},{"id":"R20","type":"themeOf","subject":"T12","object":"E15"},{"id":"R21","type":"themeOf","subject":"T13","object":"E16"},{"id":"R22","type":"themeOf","subject":"T14","object":"E17"},{"id":"R23","type":"themeOf","subject":"E16","object":"E18"},{"id":"R24","type":"themeOf","subject":"E17","object":"E19"},{"id":"R25","type":"themeOf","subject":"T15","object":"E20"},{"id":"R26","type":"themeOf","subject":"T16","object":"E21"},{"id":"R27","type":"themeOf","subject":"T17","object":"E22"},{"id":"R28","type":"themeOf","subject":"E22","object":"E23"}],"modanns":[{"id":"M1","type":"Speculation","object":"E1"},{"id":"M2","type":"Speculation","object":"E2"},{"id":
"M3","type":"Negation","object":"E9"},{"id":"M4","type":"Negation","object":"E12"}]}
@@ -1,3 +1,8 @@
1
+ # BioInterchange converts non-RDF data formats into RDF.
2
+ #
3
+ # Convert TSV, XML, GFF3, GVF and other files into RDF triples using
4
+ # BioInterchange's command-line tool or its web-services, or use it
5
+ # as a gem in your own Ruby implementation.
1
6
  module BioInterchange
2
7
 
3
8
  # Custom Exceptions and Errors
@@ -25,6 +30,7 @@ module BioInterchange
25
30
  # Text mining model
26
31
  require 'biointerchange/textmining/document'
27
32
  require 'biointerchange/textmining/content'
33
+ require 'biointerchange/textmining/content_connection'
28
34
  require 'biointerchange/textmining/process'
29
35
 
30
36
  # Text mining writers
@@ -34,16 +40,32 @@ module BioInterchange
34
40
  # GENOMICS
35
41
  #
36
42
 
37
- # GFF3 reader
43
+ ### GFF3 ###
44
+
45
+ # Reader
38
46
  require 'biointerchange/genomics/gff3_reader'
39
47
 
40
48
  # Feature base model
49
+ require 'biointerchange/genomics/gff3_pragmas'
41
50
  require 'biointerchange/genomics/gff3_feature_set'
42
51
  require 'biointerchange/genomics/gff3_feature'
43
52
 
44
- # GFF3 writer
53
+ # Writer
45
54
  require 'biointerchange/genomics/gff3_rdf_ntriples'
46
55
 
56
+ ### GVF ###
57
+
58
+ # Reader
59
+ require 'biointerchange/genomics/gvf_reader'
60
+
61
+ # Feature base model
62
+ require 'biointerchange/genomics/gvf_pragmas'
63
+ require 'biointerchange/genomics/gvf_feature_set'
64
+ require 'biointerchange/genomics/gvf_feature'
65
+
66
+ # Writer
67
+ # ...same as for GFF3: the GFF3 RDF writer also serializes GVF.
68
+
47
69
  #
48
70
  # ACTUAL COMMAND LINE IMPLEMENTATION
49
71
  #
@@ -68,19 +90,27 @@ module BioInterchange
68
90
 
69
91
  if opt['help'] or not opt['input'] or not opt['rdf'] then
70
92
  puts "Usage: ruby #{$0} -i <format> -r <format> [options]"
93
+ puts ''
71
94
  puts 'Supported input formats (--input <format>/-i <format>):'
72
95
  puts ' biointerchange.gff3 : GFF3'
96
+ puts ' biointerchange.gvf : GVF'
73
97
  puts ' dbcls.catanns.json : PubAnnotation JSON'
74
98
  puts ' uk.ac.man.pdfx : PDFx XML'
99
+ puts ''
75
100
  puts 'Supported output formats (--rdf <format>/-r <format>)'
76
- puts ' rdf.biointerchange.gff3 : RDF N-Triples for input'
101
+ puts ' rdf.biointerchange.gff3 : RDF N-Triples for the following input'
77
102
  puts ' biointerchange.gff3'
78
- puts ' rdf.bh12.sio : RDF N-Triples for inputs'
103
+ puts ' rdf.biointerchange.gvf : RDF N-Triples for the following input'
104
+ puts ' biointerchange.gff3'
105
+ puts ' biointerchange.gvf'
106
+ puts ' rdf.bh12.sio : RDF N-Triples for the following inputs'
79
107
  puts ' dbcls.catanns.json'
80
108
  puts ' uk.ac.man.pdfx'
109
+ puts ''
81
110
  puts 'I/O options:'
82
111
  puts ' -f <file>/--file <file> : file to read; STDIN used if not supplied'
83
112
  puts ' -o <file>/--out <file> : output file; STDOUT used if not supplied'
113
+ puts ''
84
114
  puts 'Input-/RDF-format specific options:'
85
115
  puts ' Input: dbcls.catanns.json, uk.ac.man.pdfx'
86
116
  puts ' Output: rdf.bh12.sio'
@@ -89,13 +119,15 @@ module BioInterchange
89
119
  puts ' -v <version>/--version <version> : version number of resource (optional)'
90
120
  puts ' --name <name> : name of resource/tool/person (required)'
91
121
  puts ' --name_id <id> : URI of resource/tool/person (required)'
122
+ puts ''
92
123
  puts 'Input-/RDF-format specific options:'
93
- puts ' Input: biointerchange.gff3'
94
- puts ' Output: rdf.biointerchange.gff3'
124
+ puts ' Input: biointerchange.gff3 or biointerchange.gvf'
125
+ puts ' Output: rdf.biointerchange.gff3 or rdf.biointerchange.gvf'
95
126
  puts ' Options:'
96
- puts ' -t <date>/--date <date> : date when the GFF3 file was created (optional)'
97
- puts ' --name <name> : name of the GFF3 file creator (optional)'
98
- puts ' --name_id <id> : email address of the GFF3 file creator (optional)'
127
+ puts ' -t <date>/--date <date> : date when the GFF3/GVF file was created (optional)'
128
+ puts ' --name <name> : name of the GFF3/GVF file creator (optional)'
129
+ puts ' --name_id <id> : email address of the GFF3/GVF file creator (optional)'
130
+ puts ''
99
131
  puts 'Other options:'
100
132
  puts ' -d / --debug : turn on debugging output (for stacktraces)'
101
133
  puts ' -h --help : this message'
@@ -103,6 +135,7 @@ module BioInterchange
103
135
  exit 1
104
136
  end
105
137
 
138
+ # Check if the input/rdf options are supported:
106
139
  if opt['input'] == 'dbcls.catanns.json' or opt['input'] == 'uk.ac.man.pdfx' then
107
140
  if opt['rdf'] == 'rdf.bh12.sio' then
108
141
  raise ArgumentError, 'Require --name and --name_id options to specify source of annotations (e.g., a manual annotators name, or software tool name) and their associated URI (e.g., email address, or webaddress).' unless opt['name'] and opt['name_id']
@@ -115,22 +148,30 @@ module BioInterchange
115
148
  else
116
149
  unsupported_combination
117
150
  end
151
+ elsif opt['input'] == 'biointerchange.gvf' then
152
+ if opt['rdf'] == 'rdf.biointerchange.gff3' or opt['rdf'] == 'rdf.biointerchange.gvf' then
153
+ # Okay. No further arguments required.
154
+ else
155
+ unsupported_combination
156
+ end
118
157
  else
119
158
  unsupported_combination
120
159
  end
121
-
122
160
 
123
161
  opt['date'] = nil unless opt['date']
124
162
  opt['version'] = nil unless opt['version']
125
163
 
126
- # generate model from file (deserialise)
164
+ # Generate model from file (deserialization).
165
+ # Note: if-clauses are lexicographically ordered.
127
166
  reader = nil
128
- if opt['input'] == 'dbcls.catanns.json' then
167
+ if opt['input'] == 'biointerchange.gff3' then
168
+ reader = BioInterchange::Genomics::GFF3Reader.new(opt['name'], opt['name_id'], opt['date'])
169
+ elsif opt['input'] == 'biointerchange.gvf' then
170
+ reader = BioInterchange::Genomics::GVFReader.new(opt['name'], opt['name_id'], opt['date'])
171
+ elsif opt['input'] == 'dbcls.catanns.json' then
129
172
  reader = BioInterchange::TextMining::PubannosJsonReader.new(opt['name'], opt['name_id'], opt['date'], BioInterchange::TextMining::Process::UNSPECIFIED, opt['version'])
130
173
  elsif opt['input'] == 'uk.ac.man.pdfx' then
131
174
  reader = BioInterchange::TextMining::PdfxXmlReader.new(opt['name'], opt['name_id'], opt['date'], BioInterchange::TextMining::Process::UNSPECIFIED, opt['version'])
132
- elsif opt['input'] == 'biointerchange.gff3' then
133
- reader = BioInterchange::Genomics::GFF3Reader.new(opt['name'], opt['name_id'], opt['date'])
134
175
  end
135
176
 
136
177
  model = nil
@@ -140,13 +181,14 @@ module BioInterchange
140
181
  model = reader.deserialize(STDIN)
141
182
  end
142
183
 
143
- # generate rdf from model (serialise)
184
+ # Generate rdf from model (serialization).
185
+ # Note: if-clauses are lexicographically ordered.
144
186
  writer = nil
145
187
  if opt['rdf'] == 'rdf.bh12.sio' then
146
188
  writer = BioInterchange::TextMining::RDFWriter.new(File.new(opt['out'], 'w')) if opt['out']
147
189
  writer = BioInterchange::TextMining::RDFWriter.new(STDOUT) unless opt['out']
148
190
  end
149
- if opt['rdf'] == 'rdf.biointerchange.gff3' then
191
+ if opt['rdf'] == 'rdf.biointerchange.gff3' or opt['rdf'] == 'rdf.biointerchange.gvf' then
150
192
  writer = BioInterchange::Genomics::RDFWriter.new(File.new(opt['out'], 'w')) if opt['out']
151
193
  writer = BioInterchange::Genomics::RDFWriter.new(STDOUT) unless opt['out']
152
194
  end
@@ -1,5 +1,6 @@
1
1
  module BioInterchange::Genomics
2
2
 
3
+ # Represents a single genomic feature of a GFF3 file.
3
4
  class GFF3Feature
4
5
 
5
6
  # Constants determining the strand of the feature.
@@ -2,6 +2,7 @@ require 'digest/sha1'
2
2
 
3
3
  module BioInterchange::Genomics
4
4
 
5
+ # A GFF3 feature set, which encapsulates the information of a single GFF3 file.
5
6
  class GFF3FeatureSet
6
7
 
7
8
  # Create a new instance of a Generic Feature Format Version 3 (GFF3) feature set. A feature
@@ -9,13 +10,34 @@ class GFF3FeatureSet
9
10
  def initialize
10
11
  # Features are stored as the keys of a hash map to increase performance:
11
12
  @set = {}
13
+ # Pragmas, i.e. feature meta-information, are stored as named mappings. Many
14
+ # pragmas are simple key/value assignments, but others permit multiple values
15
+ # whose ordering does matter. In that case, an array is used to store the
16
+ # various values.
17
+ @pragmas = {}
12
18
  end
13
19
 
14
- # Returns the contents of the feature set.
20
+ # Returns the contents of the feature set -- excluding pragma meta-data.
15
21
  def contents
16
22
  @set.keys
17
23
  end
18
24
 
25
+ # Returns information stored for a named pragma, or nil if there is no information
26
+ # stored for it.
27
+ #
28
+ # +name+:: a string representing the name of the pragma whose value we are interested in
29
+ def pragma(name)
30
+ return nil unless name
31
+ # TODO Should throw exception if name is not a string.
32
+ return nil unless name.kind_of?(String)
33
+ @pragmas[name]
34
+ end
35
+
36
+ # Returns the names of all the pragmas for which some information has been recorded.
37
+ def pragmas
38
+ @pragmas.keys
39
+ end
40
+
19
41
  # Returns a URI for this particular feature set, which is a SHA1 hash over the content's concatenated properties.
20
42
  def uri
21
43
  clob = ''
@@ -32,6 +54,14 @@ class GFF3FeatureSet
32
54
  @set[feature] = true
33
55
  end
34
56
 
57
+ # Sets the value for named pragma meta-data.
58
+ #
59
+ # +name+:: a string representing the unique name of the pragma
60
+ # +value+:: an object representing the value of the pragma assignment
61
+ def set_pragma(name, value)
62
+ # TODO Should throw exception if name is not a string.
63
+ @pragmas[name] = value
64
+ end
35
65
  end
36
66
 
37
67
  end
@@ -0,0 +1,35 @@
1
+
2
+ module BioInterchange::Genomics
3
+
4
+ # Represents a named region, which is defined by the pragma statement 'sequence-region'.
5
+ class GFF3NamedRegion
6
+
7
+ # Create a new instance of a named region.
8
+ #
9
+ # +seqid+:: unique identifier (in the GFF3 file context) that identifies this region
10
+ # +start_coordinate+:: genomic start coordinate of the region
11
+ # +end_coordinate+:: genomic end coordinate of the region
12
+ def initialize(seqid, start_coordinate, end_coordinate)
13
+ @seqid = seqid
14
+ @start_coordinate = start_coordinate
15
+ @end_coordinate = end_coordinate
16
+ end
17
+
18
+ # Returns the unique identifier (based on a GFF3 file context) of the region.
19
+ def seqid
20
+ @seqid
21
+ end
22
+
23
+ # Returns the start coordinate of the region.
24
+ def start_coordinate
25
+ @start_coordinate
26
+ end
27
+
28
+ # Returns the end coordinate of the region.
29
+ def end_coordinate
30
+ @end_coordinate
31
+ end
32
+
33
+ end
34
+
35
+ end
@@ -4,6 +4,15 @@ require 'date'
4
4
 
5
5
  module BioInterchange::Genomics
6
6
 
7
+ # Serializes GFF3 and GVF models.
8
+ #
9
+ # Inputs:
10
+ # - biointerchange.gff3
11
+ # - biointerchange.gvf
12
+ #
13
+ # Outputs:
14
+ # - rdf.biointerchange.gff3
15
+ # - rdf.biointerchange.gvf
7
16
  class RDFWriter
8
17
 
9
18
  # Creates a new instance of a RDFWriter that will use the provided output stream to serialize RDF.
@@ -19,14 +28,19 @@ class RDFWriter
19
28
  # +model+:: a generic representation of input data that is derived from BioInterchange::Genomics::GFF3FeatureSet
20
29
  def serialize(model)
21
30
  if model.instance_of?(BioInterchange::Genomics::GFF3FeatureSet) then
31
+ @base = BioInterchange::GFF3O
32
+ serialize_model(model)
33
+ elsif model.instance_of?(BioInterchange::Genomics::GVFFeatureSet) then
34
+ @base = BioInterchange::GVF1O
22
35
  serialize_model(model)
23
36
  else
24
37
  raise BioInterchange::Exceptions::ImplementationWriterError, 'The provided model cannot be serialized. ' +
25
- 'This writer supports serialization for BioInterchange::Genomics::GFF3FeatureSet.'
38
+ 'This writer supports serialization for BioInterchange::Genomics::GFF3FeatureSet and '
39
+ 'BioInterchange::Genomics::GVFFeatureSet.'
26
40
  end
27
41
  end
28
42
 
29
- private
43
+ protected
30
44
 
31
45
  # Serializes RDF for a feature set representation.
32
46
  #
@@ -34,11 +48,34 @@ private
34
48
  def serialize_model(model)
35
49
  graph = RDF::Graph.new
36
50
  set_uri = RDF::URI.new(model.uri)
37
- graph.insert(RDF::Statement.new(set_uri, RDF.type, BioInterchange::GFF3O.Set))
51
+ graph.insert(RDF::Statement.new(set_uri, RDF.type, @base.Set))
52
+ model.pragmas.each { |pragma_name|
53
+ serialize_pragma(graph, set_uri, model.pragma(pragma_name))
54
+ }
38
55
  model.contents.each { |feature|
39
56
  serialize_feature(graph, set_uri, feature)
40
57
  }
41
58
  RDF::NTriples::Writer.dump(graph, @ostream)
59
+ # TODO Figure out why the following is very slow. Use with 'rdf-raptor'.
60
+ # RDF::RDFXML::Writer.dump(graph, @ostream)
61
+ end
62
+
63
+ # Serializes pragmas for a given feature set URI.
64
+ # +graph+:: RDF graph to which the pragmas are added
65
+ # +set_uri+:: the feature set URI to which the pragmas belong
66
+ # +pragma+:: an object representing a pragma statement
67
+ def serialize_pragma(graph, set_uri, pragma)
68
+ if pragma.kind_of?(Hash) then
69
+ if pragma.has_key?('gff-version') and @base == BioInterchange::GFF3O then
70
+ graph.insert(RDF::Statement.new(set_uri, @base.version, RDF::Literal.new(pragma['gff-version'], :datatype => RDF::XSD.float )))
71
+ elsif pragma.has_key?('gff-version') and @base == BioInterchange::GVF1O then
72
+ graph.insert(RDF::Statement.new(set_uri, @base.gff_version, RDF::Literal.new(pragma['gff-version'], :datatype => RDF::XSD.float )))
73
+ elsif pragma.has_key?('gvf-version') and @base == BioInterchange::GVF1O then
74
+ graph.insert(RDF::Statement.new(set_uri, @base.gvf_version, RDF::Literal.new(pragma['gvf-version'], :datatype => RDF::XSD.float )))
75
+ end
76
+ else
77
+ # TODO
78
+ end
42
79
  end
43
80
 
44
81
  # Serializes a +GFF3Feature+ object for a given feature set URI.
@@ -50,30 +87,30 @@ private
50
87
  # TODO Make sure there is only one value in the 'ID' list.
51
88
  feature_uri = RDF::URI.new("#{set_uri.to_s}/feature/#{feature.sequence_id},#{feature.source},#{feature.type.to_s.sub(/^[^:]+:\/\//, '')},#{feature.start_coordinate},#{feature.end_coordinate},#{feature.strand},#{feature.phase}") unless feature.attributes.has_key?('ID')
52
89
  feature_uri = RDF::URI.new("#{set_uri.to_s}/feature/#{feature.attributes['ID'][0]}") if feature.attributes.has_key?('ID')
53
- feature_properties = BioInterchange::GFF3O.feature_properties.select { |uri| BioInterchange::GFF3O.is_datatype_property?(uri) }[0]
54
- graph.insert(RDF::Statement.new(set_uri, BioInterchange::GFF3O.contains, feature_uri))
55
- graph.insert(RDF::Statement.new(feature_uri, RDF.type, BioInterchange::GFF3O.Feature))
56
- graph.insert(RDF::Statement.new(feature_uri, BioInterchange::GFF3O.seqid, RDF::Literal.new(feature.sequence_id)))
57
- graph.insert(RDF::Statement.new(feature_uri, BioInterchange::GFF3O.source, RDF::Literal.new(feature.source)))
58
- graph.insert(RDF::Statement.new(feature_uri, BioInterchange::GFF3O.type, RDF::Literal.new(feature.type)))
59
- graph.insert(RDF::Statement.new(feature_uri, BioInterchange::GFF3O.with_parent(BioInterchange::GFF3O.start, feature_properties)[0], RDF::Literal.new(feature.start_coordinate)))
60
- graph.insert(RDF::Statement.new(feature_uri, BioInterchange::GFF3O.with_parent(BioInterchange::GFF3O.end, feature_properties)[0], RDF::Literal.new(feature.end_coordinate)))
61
- graph.insert(RDF::Statement.new(feature_uri, BioInterchange::GFF3O.score, RDF::Literal.new(feature.score))) if feature.score
62
- feature_properties = BioInterchange::GFF3O.feature_properties.select { |uri| BioInterchange::GFF3O.is_object_property?(uri) }[0]
63
- strand_uri = BioInterchange::GFF3O.with_parent(BioInterchange::GFF3O.strand, feature_properties)[0]
90
+ feature_properties = @base.feature_properties.select { |uri| @base.is_datatype_property?(uri) }[0]
91
+ graph.insert(RDF::Statement.new(set_uri, @base.contains, feature_uri))
92
+ graph.insert(RDF::Statement.new(feature_uri, RDF.type, @base.Feature))
93
+ graph.insert(RDF::Statement.new(feature_uri, @base.with_parent([ @base.seqid ].flatten, feature_properties)[0], RDF::Literal.new(feature.sequence_id)))
94
+ graph.insert(RDF::Statement.new(feature_uri, @base.source, RDF::Literal.new(feature.source)))
95
+ graph.insert(RDF::Statement.new(feature_uri, @base.type, RDF::Literal.new(feature.type)))
96
+ graph.insert(RDF::Statement.new(feature_uri, @base.with_parent(@base.start, feature_properties)[0], RDF::Literal.new(feature.start_coordinate)))
97
+ graph.insert(RDF::Statement.new(feature_uri, @base.with_parent(@base.end, feature_properties)[0], RDF::Literal.new(feature.end_coordinate)))
98
+ graph.insert(RDF::Statement.new(feature_uri, @base.score, RDF::Literal.new(feature.score))) if feature.score
99
+ feature_properties = @base.feature_properties.select { |uri| @base.is_object_property?(uri) }[0]
100
+ strand_uri = @base.with_parent(@base.strand, feature_properties)[0]
64
101
  case feature.strand
65
102
  when BioInterchange::Genomics::GFF3Feature::NOT_STRANDED
66
- graph.insert(RDF::Statement.new(feature_uri, strand_uri, BioInterchange::GFF3O.NotStranded))
103
+ graph.insert(RDF::Statement.new(feature_uri, strand_uri, @base.NotStranded))
67
104
  when BioInterchange::Genomics::GFF3Feature::UNKNOWN
68
- graph.insert(RDF::Statement.new(feature_uri, strand_uri, BioInterchange::GFF3O.UnknownStrand))
105
+ graph.insert(RDF::Statement.new(feature_uri, strand_uri, @base.UnknownStrand))
69
106
  when BioInterchange::Genomics::GFF3Feature::POSITIVE
70
- graph.insert(RDF::Statement.new(feature_uri, strand_uri, BioInterchange::GFF3O.Positive))
107
+ graph.insert(RDF::Statement.new(feature_uri, strand_uri, @base.Positive))
71
108
  when BioInterchange::Genomics::GFF3Feature::NEGATIVE
72
- graph.insert(RDF::Statement.new(feature_uri, strand_uri, BioInterchange::GFF3O.Negative))
109
+ graph.insert(RDF::Statement.new(feature_uri, strand_uri, @base.Negative))
73
110
  else
74
111
  raise ArgumentException, 'Strand of feature is set to an unknown constant.'
75
112
  end
76
- graph.insert(RDF::Statement.new(feature_uri, BioInterchange::GFF3O.phase, RDF::Literal.new(feature.phase))) if feature.phase
113
+ graph.insert(RDF::Statement.new(feature_uri, @base.phase, RDF::Literal.new(feature.phase))) if feature.phase
77
114
 
78
115
  serialize_attributes(graph, set_uri, feature_uri, feature.attributes) unless feature.attributes.keys.empty?
79
116
  end
@@ -88,16 +125,16 @@ private
88
125
  attributes.each_pair { |tag, list|
89
126
  if tag == 'Parent' then
90
127
  list.each { |parent_id|
91
- graph.insert(RDF::Statement.new(feature_uri, BioInterchange::GFF3O.parent, RDF::URI.new("#{set_uri.to_s}/feature/#{parent_id}")))
128
+ graph.insert(RDF::Statement.new(feature_uri, @base.parent, RDF::URI.new("#{set_uri.to_s}/feature/#{parent_id}")))
92
129
  }
93
130
  else
94
131
  list.each_index { |index|
95
132
  value = list[index]
96
133
  attribute_uri = RDF::URI.new("#{feature_uri.to_s}/attribute/#{tag}") if list.size == 1
97
134
  attribute_uri = RDF::URI.new("#{feature_uri.to_s}/attribute/#{tag}-#{index + 1}") unless list.size == 1
98
- graph.insert(RDF::Statement.new(feature_uri, BioInterchange::GFF3O.attributes, attribute_uri))
99
- graph.insert(RDF::Statement.new(attribute_uri, RDF.type, BioInterchange::GFF3O.Attribute))
100
- graph.insert(RDF::Statement.new(attribute_uri, BioInterchange::GFF3O.tag, RDF::Literal.new("#{tag}")))
135
+ graph.insert(RDF::Statement.new(feature_uri, @base.attributes, attribute_uri))
136
+ graph.insert(RDF::Statement.new(attribute_uri, RDF.type, @base.Attribute))
137
+ graph.insert(RDF::Statement.new(attribute_uri, @base.tag, RDF::Literal.new("#{tag}")))
101
138
  graph.insert(RDF::Statement.new(attribute_uri, RDF.value, RDF::Literal.new(value)))
102
139
  }
103
140
  end
@@ -26,60 +26,94 @@ class GFF3Reader
26
26
  end
27
27
  end
28
28
 
29
- private
29
+ protected
30
+
31
+ def create_feature_set
32
+ BioInterchange::Genomics::GFF3FeatureSet.new()
33
+ end
30
34
 
31
35
  def create_model(gff3)
32
- feature_set = BioInterchange::Genomics::GFF3FeatureSet.new()
36
+ feature_set = create_feature_set
33
37
  gff3.each_line { |line|
34
- next if line.start_with?('#')
38
+ next if line.start_with?('#') and not line.start_with?('##')
35
39
 
36
- line.chomp!
37
- seqid, source, type, start_coordinate, end_coordinate, score, strand, phase, attributes = line.split("\t")
38
-
39
- # The type might be a SO/SOFA term, SO/SOFA accession, or other term (it stays a string then):
40
- if type.match(/SO:\d+/) then
41
- type = RDF::URI.new("http://purl.obolibrary.org/obo/#{type.sub(':', '_')}")
42
- elsif BioInterchange::SOFA.methods.include?(type.gsub(' ', '_').to_sym)
43
- type = BioInterchange::SOFA.send(type.gsub(' ', '_'))
44
- end
40
+ # Ignore sequences for now.
41
+ break if line.start_with?('##FASTA')
45
42
 
46
- # String to numeric value conversions:
47
- start_coordinate = start_coordinate.to_i
48
- stop_coordinate = stop_coordinate.to_i
49
- if score == '.' then
50
- score = nil
43
+ unless line.start_with?('##') then
44
+ add_feature(feature_set, line)
51
45
  else
52
- score = score.to_f
46
+ add_pragma(feature_set, line)
53
47
  end
48
+ }
54
49
 
55
- # Determine strandedness:
56
- if strand == '?' then
57
- strand = BioInterchange::Genomics::GFF3Feature::UNKNOWN
58
- elsif strand == '+' then
59
- strand = BioInterchange::Genomics::GFF3Feature::POSITIVE
60
- elsif strand == '-' then
61
- strand = BioInterchange::Genomics::GFF3Feature::NEGATIVE
62
- else
63
- strand = BioInterchange::Genomics::GFF3Feature::NOT_STRANDED
64
- end
50
+ feature_set
51
+ end
65
52
 
66
- # Set phase, if it lies in the permissable range of values:
67
- if phase == '0' or phase == '1' or phase == '2' then
68
- phase = phase.to_i
69
- else
70
- phase = nil
71
- end
53
+ def add_feature(feature_set, line)
54
+ line.chomp!
55
+ seqid, source, type, start_coordinate, end_coordinate, score, strand, phase, attributes = line.split("\t")
72
56
 
73
- temp = {}
74
- attributes.split(';').map { |assignment| match = assignment.match(/([^=]+)=(.+)/) ; { match[1].strip => match[2].split(',').map { |value| value.strip } } }.map { |hash| hash.each_pair { |tag,list| temp[tag] = list } }
75
- attributes = temp
57
+ # The type might be a SO/SOFA term, SO/SOFA accession, or other term (it stays a string then):
58
+ if type.match(/SO:\d+/) then
59
+ type = RDF::URI.new("http://purl.obolibrary.org/obo/#{type.sub(':', '_')}")
60
+ elsif BioInterchange::SOFA.methods.include?(type.gsub(' ', '_').to_sym)
61
+ type = BioInterchange::SOFA.send(type.gsub(' ', '_'))
62
+ end
76
63
 
77
- feature_set.add(BioInterchange::Genomics::GFF3Feature.new(seqid, source, type, start_coordinate, end_coordinate, score, strand, phase, attributes))
78
- }
64
+ # String to numeric value conversions:
65
+ start_coordinate = start_coordinate.to_i
66
+ end_coordinate = end_coordinate.to_i
67
+ if score == '.' then
68
+ score = nil
69
+ else
70
+ score = score.to_f
71
+ end
79
72
 
80
- feature_set
73
+ # Determine strandedness:
74
+ if strand == '?' then
75
+ strand = BioInterchange::Genomics::GFF3Feature::UNKNOWN
76
+ elsif strand == '+' then
77
+ strand = BioInterchange::Genomics::GFF3Feature::POSITIVE
78
+ elsif strand == '-' then
79
+ strand = BioInterchange::Genomics::GFF3Feature::NEGATIVE
80
+ else
81
+ strand = BioInterchange::Genomics::GFF3Feature::NOT_STRANDED
82
+ end
83
+
84
+ # Set phase, if it lies in the permissable range of values:
85
+ if phase == '0' or phase == '1' or phase == '2' then
86
+ phase = phase.to_i
87
+ else
88
+ phase = nil
89
+ end
90
+
91
+ temp = {}
92
+ attributes.split(';').map { |assignment| match = assignment.match(/([^=]+)=(.+)/) ; { match[1].strip => match[2].split(',').map { |value| value.strip } } }.map { |hash| hash.each_pair { |tag,list| temp[tag] = list } }
93
+ attributes = temp
94
+
95
+ feature_set.add(BioInterchange::Genomics::GFF3Feature.new(seqid, source, type, start_coordinate, end_coordinate, score, strand, phase, attributes))
81
96
  end
82
97
 
98
+ def add_pragma(feature_set, line)
99
+ line.chomp!
100
+ name, value = line[2..-1].split(/\s/, 2)
101
+ value.strip!
102
+
103
+ # Interpret pragmas depending on their definition:
104
+ if name == 'gff-version' then
105
+ feature_set.set_pragma(name, { name => value.to_f })
106
+ elsif name == 'sequence-region' then
107
+ regions = feature_set.pragma(name)
108
+ regions = {} unless regions
109
+ seqid, start_coordinate, end_coordinate = value.split(/\s+/, 3)
110
+ regions[seqid] = BioInterchange::Genomics::GFF3NamedRegion.new(seqid, start_coordinate.to_i, end_coordinate.to_i)
111
+ feature_set.set_pragma(name, regions)
112
+ else
113
+ # Unhandled pragma. Just save the value in its string form.
114
+ feature_set.set_pragma(name, value)
115
+ end
116
+ end
83
117
  end
84
118
 
85
119
  end