bio-affy 0.1.0.alpha.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.rspec +1 -0
- data/Gemfile +15 -0
- data/Gemfile.lock +32 -0
- data/LICENSE.txt +20 -0
- data/README.rdoc +33 -0
- data/Rakefile +77 -0
- data/VERSION +1 -0
- data/bin/bio-affy +80 -0
- data/bio-affy.gemspec +128 -0
- data/ext/DESCRIPTION +11 -0
- data/ext/HISTORY +3 -0
- data/ext/LICENSE +456 -0
- data/ext/NAMESPACE +2 -0
- data/ext/R/check.cdf.type.R +18 -0
- data/ext/R/read.cdffile.list.R +23 -0
- data/ext/R/read.celfile.R +11 -0
- data/ext/R/read.celfile.header.R +37 -0
- data/ext/R/read.probematrices.R +29 -0
- data/ext/README_BIOLIB +36 -0
- data/ext/aclocal.m4 +32 -0
- data/ext/configure +4898 -0
- data/ext/configure.in +51 -0
- data/ext/man/check.cdf.type.Rd +22 -0
- data/ext/man/read.cdffile.list.Rd +20 -0
- data/ext/man/read.celfile.Rd +23 -0
- data/ext/man/read.celfile.header.Rd +22 -0
- data/ext/man/read.celfile.probeintensity.matrices.Rd +31 -0
- data/ext/src/CMakeLists.txt +39 -0
- data/ext/src/Makevars.in +3 -0
- data/ext/src/Makevars.win +2 -0
- data/ext/src/Rakefile +43 -0
- data/ext/src/biolib_affyio.c +416 -0
- data/ext/src/biolib_affyio.h +132 -0
- data/ext/src/biolib_affyio.o +0 -0
- data/ext/src/fread_functions.c +871 -0
- data/ext/src/fread_functions.h +60 -0
- data/ext/src/fread_functions.o +0 -0
- data/ext/src/libaffyext.so +0 -0
- data/ext/src/mkrf.log +11 -0
- data/ext/src/mkrf_conf.rb +6 -0
- data/ext/src/read_abatch.c +5484 -0
- data/ext/src/read_abatch.h +63 -0
- data/ext/src/read_abatch.o +0 -0
- data/ext/src/read_bpmap.c +888 -0
- data/ext/src/read_bpmap.o +0 -0
- data/ext/src/read_cdf.h +347 -0
- data/ext/src/read_cdf_xda.c +1342 -0
- data/ext/src/read_cdf_xda.o +0 -0
- data/ext/src/read_cdffile2.c +1576 -0
- data/ext/src/read_cdffile2.o +0 -0
- data/ext/src/read_celfile_generic.c +2061 -0
- data/ext/src/read_celfile_generic.h +33 -0
- data/ext/src/read_celfile_generic.o +0 -0
- data/ext/src/read_clf.c +870 -0
- data/ext/src/read_clf.o +0 -0
- data/ext/src/read_generic.c +1446 -0
- data/ext/src/read_generic.h +144 -0
- data/ext/src/read_generic.o +0 -0
- data/ext/src/read_pgf.c +1337 -0
- data/ext/src/read_pgf.o +0 -0
- data/lib/bio-affy.rb +5 -0
- data/lib/bio/affy.rb +7 -0
- data/lib/bio/affyext.rb +23 -0
- data/lib/bio/libaffyext.so +0 -0
- data/spec/bio-affy_spec.rb +22 -0
- data/spec/spec_helper.rb +13 -0
- data/test/data/affy/GSM103328.CEL.gz +0 -0
- data/test/data/affy/GSM103329.CEL.gz +0 -0
- data/test/data/affy/GSM103330.CEL.gz +0 -0
- data/test/data/affy/MG_U74Av2.CDF.gz +0 -0
- metadata +190 -0
@@ -0,0 +1,144 @@
|
|
1
|
+
|
2
|
+
#ifdef BIOLIB
|
3
|
+
#include <biolib_R_map.h>
|
4
|
+
#endif
|
5
|
+
|
6
|
+
#include <zlib.h>
|
7
|
+
|
8
|
+
/******
|
9
|
+
******
|
10
|
+
****** Data Structures
|
11
|
+
******
|
12
|
+
******/
|
13
|
+
|
14
|
+
|
15
|
+
|
16
|
+
/*
 * File header.
 *
 * In-memory copy of the leading fields of a generic file.  The field
 * meanings below are read off the member names; confirm against the
 * reader implementation before relying on them.
 */
typedef struct {
    uint8_t  magic_number;          /* leading identification byte */
    uint8_t  version;               /* format version byte */
    int32_t  n_data_groups;         /* number of data groups (per name) */
    uint32_t first_group_file_pos;  /* presumably file offset of the first data group */
} generic_file_header;
|
24
|
+
|
25
|
+
|
26
|
+
|
27
|
+
|
28
|
+
/*
 * An affy generic STRING: a length paired with a char buffer.
 */
typedef struct {
    int32_t  len;    /* length associated with value */
    char    *value;  /* character data */
} ASTRING;
|
34
|
+
|
35
|
+
/*
 * An affy generic WSTRING: a length paired with a wide-character buffer.
 */
typedef struct {
    int32_t  len;    /* length associated with value */
    wchar_t *value;  /* wide-character data */
} AWSTRING;
|
41
|
+
|
42
|
+
|
43
|
+
/* Name Value Type Triplet */
|
44
|
+
|
45
|
+
typedef struct{
|
46
|
+
AWSTRING name;
|
47
|
+
ASTRING value;
|
48
|
+
AWSTRING type;
|
49
|
+
} nvt_triplet;
|
50
|
+
|
51
|
+
|
52
|
+
|
53
|
+
|
54
|
+
/* Data Header */
|
55
|
+
|
56
|
+
typedef struct generic_data_header *generic_data_header_pointer;
|
57
|
+
|
58
|
+
typedef struct{
|
59
|
+
ASTRING data_type_id; /*Stored in file as INT followed by CHAR array */
|
60
|
+
ASTRING unique_file_id; /*See above */
|
61
|
+
AWSTRING Date_time; /*Stored in file as INT followed by WCHAR array */
|
62
|
+
AWSTRING locale;
|
63
|
+
int32_t n_name_type_value;
|
64
|
+
nvt_triplet *name_type_value;
|
65
|
+
int32_t n_parent_headers;
|
66
|
+
void **parent_headers;
|
67
|
+
} generic_data_header;
|
68
|
+
|
69
|
+
|
70
|
+
/* Data Group */
|
71
|
+
|
72
|
+
typedef struct {
|
73
|
+
|
74
|
+
uint32_t file_position_nextgroup;
|
75
|
+
uint32_t file_position_first_data;
|
76
|
+
int32_t n_data_sets;
|
77
|
+
AWSTRING data_group_name;
|
78
|
+
} generic_data_group;
|
79
|
+
|
80
|
+
|
81
|
+
/* Dataset */
|
82
|
+
|
83
|
+
typedef struct {
|
84
|
+
AWSTRING name;
|
85
|
+
uint8_t type;
|
86
|
+
int32_t size;
|
87
|
+
} col_nvts_triplet;
|
88
|
+
|
89
|
+
|
90
|
+
|
91
|
+
typedef struct {
|
92
|
+
uint32_t file_pos_first;
|
93
|
+
uint32_t file_pos_last;
|
94
|
+
AWSTRING data_set_name;
|
95
|
+
int32_t n_name_type_value;
|
96
|
+
nvt_triplet *name_type_value;
|
97
|
+
uint32_t ncols;
|
98
|
+
col_nvts_triplet* col_name_type_value;
|
99
|
+
uint32_t nrows;
|
100
|
+
void **Data; /* in the docs this is rows */
|
101
|
+
} generic_data_set;
|
102
|
+
|
103
|
+
|
104
|
+
|
105
|
+
|
106
|
+
/*
 * Value kinds that a name/value/type triplet may carry.  The numeric
 * values are spelled out explicitly and must not be renumbered.
 */
typedef enum {
    ASCIITEXT = 1,
    PLAINTEXT = 2,
    UINT8     = 3,
    INT8      = 4,
    UINT16    = 5,
    INT16     = 6,
    UINT32    = 7,
    INT32     = 8,
    FLOAT32   = 9
} AffyMIMEtypes;
|
119
|
+
|
120
|
+
|
121
|
+
|
122
|
+
AffyMIMEtypes determine_MIMETYPE(nvt_triplet triplet);
|
123
|
+
void *decode_MIME_value(nvt_triplet triplet, AffyMIMEtypes mimetype, void *result, int *size);
|
124
|
+
char *decode_MIME_value_toASCII(nvt_triplet triplet, AffyMIMEtypes mimetype, void *result, int *size);
|
125
|
+
|
126
|
+
nvt_triplet* find_nvt(generic_data_header *data_header,char *name);
|
127
|
+
|
128
|
+
int read_generic_file_header(generic_file_header* file_header, FILE *instream);
|
129
|
+
int read_generic_data_header(generic_data_header *data_header, FILE *instream);
|
130
|
+
int read_generic_data_group(generic_data_group *data_group, FILE *instream);
|
131
|
+
int read_generic_data_set(generic_data_set *data_set, FILE *instream);
|
132
|
+
int read_generic_data_set_rows(generic_data_set *data_set, FILE *instream);
|
133
|
+
|
134
|
+
|
135
|
+
void Free_generic_data_header(generic_data_header *header);
|
136
|
+
void Free_generic_data_group(generic_data_group *data_group);
|
137
|
+
void Free_generic_data_set(generic_data_set *data_set);
|
138
|
+
|
139
|
+
|
140
|
+
int gzread_generic_file_header(generic_file_header* file_header, gzFile *instream);
|
141
|
+
int gzread_generic_data_header(generic_data_header *data_header, gzFile *instream);
|
142
|
+
int gzread_generic_data_group(generic_data_group *data_group,gzFile *instream);
|
143
|
+
int gzread_generic_data_set(generic_data_set *data_set, gzFile *instream);
|
144
|
+
int gzread_generic_data_set_rows(generic_data_set *data_set, gzFile *instream);
|
Binary file
|
data/ext/src/read_pgf.c
ADDED
@@ -0,0 +1,1337 @@
|
|
1
|
+
/******************************************************************
|
2
|
+
**
|
3
|
+
** file: read_pgf.c
|
4
|
+
**
|
5
|
+
** Aim: implement parsing of PGF format files
|
6
|
+
**
|
7
|
+
** Copyright (C) 2007 B. M. Bolstad
|
8
|
+
**
|
9
|
+
** Created on Nov 4, 2007
|
10
|
+
**
|
11
|
+
** History
|
12
|
+
** Nov 4, 2007 - Initial version
|
13
|
+
** Dec 17, 2007 - add function for counting number of each type of probeset
|
14
|
+
** Dec 31, 2007 - add function which checks that all required fields are present
|
15
|
+
** Mar 18, 2008 - fix error in read_pgf_header function
|
16
|
+
**
|
17
|
+
**
|
18
|
+
**
|
19
|
+
******************************************************************/
|
20
|
+
|
21
|
+
#include <R.h>
|
22
|
+
|
23
|
+
#include <stdio.h>
|
24
|
+
#include <stdlib.h>
|
25
|
+
|
26
|
+
|
27
|
+
#define BUFFERSIZE 1024
|
28
|
+
|
29
|
+
|
30
|
+
/*******************************************************************
|
31
|
+
*******************************************************************
|
32
|
+
**
|
33
|
+
** Structures for dealing with pgf file information
|
34
|
+
**
|
35
|
+
**
|
36
|
+
**
|
37
|
+
*******************************************************************
|
38
|
+
******************************************************************/
|
39
|
+
|
40
|
+
/*******************************************************************
|
41
|
+
*******************************************************************
|
42
|
+
**
|
43
|
+
** Starting off with the headers
|
44
|
+
**
|
45
|
+
*******************************************************************
|
46
|
+
******************************************************************/
|
47
|
+
|
48
|
+
/*
 * Column positions for the header0 field list.  Each member is the
 * zero-based position (0 to n-1) of the named column, or -1 when that
 * column is not present.
 */
typedef struct {
    int probeset_id;
    int type;
    int probeset_name;
} header_0;
|
55
|
+
|
56
|
+
/*
 * Column positions for the header1 field list.  Each member is the
 * zero-based position (0 to n-1) of the named column, or -1 when that
 * column is not present.
 */
typedef struct {
    int atom_id;
    int type;
    int exon_position;
} header_1;
|
63
|
+
|
64
|
+
/*
 * Column positions for the header2 field list.  Each member is the
 * zero-based position (0 to n-1) of the named column, or -1 when that
 * column is not present.
 */
typedef struct {
    int probe_id;
    int type;
    int gc_count;
    int probe_length;
    int interrogation_position;
    int probe_sequence;
} header_2;
|
74
|
+
|
75
|
+
|
76
|
+
|
77
|
+
typedef struct{
|
78
|
+
char **chip_type;
|
79
|
+
int n_chip_type;
|
80
|
+
char *lib_set_name;
|
81
|
+
char *lib_set_version;
|
82
|
+
char *pgf_format_version;
|
83
|
+
char *header0_str;
|
84
|
+
header_0 *header0;
|
85
|
+
char *header1_str;
|
86
|
+
header_1 *header1;
|
87
|
+
char *header2_str;
|
88
|
+
header_2 *header2;
|
89
|
+
char *create_date;
|
90
|
+
char *guid;
|
91
|
+
char **other_headers_keys;
|
92
|
+
char **other_headers_values;
|
93
|
+
int n_other_headers;
|
94
|
+
} pgf_headers;
|
95
|
+
|
96
|
+
|
97
|
+
/********************************************************************
|
98
|
+
*******************************************************************
|
99
|
+
**
|
100
|
+
** Structures for dealing with data stored at the probelevel
|
101
|
+
**
|
102
|
+
**
|
103
|
+
*******************************************************************
|
104
|
+
*******************************************************************/
|
105
|
+
|
106
|
+
/*
 * One probe in a singly linked list of probes.
 *
 * The struct carries the tag `probe_list_node` so that `next` really is
 * a pointer to this type.  It used to be an anonymous struct, which left
 * `struct probe_list_node` an undefined, incomplete type and forced a
 * cast at every traversal.  Existing casts remain valid.
 */
typedef struct probe_list_node {
    int   probe_id;
    char *type;
    int   gc_count;
    int   probe_length;
    int   interrogation_position;
    char *probe_sequence;
    struct probe_list_node *next;   /* following probe, NULL at the tail */
} probe_list_node;
|
116
|
+
|
117
|
+
|
118
|
+
typedef struct{
|
119
|
+
int n_probes;
|
120
|
+
probe_list_node *first;
|
121
|
+
} probe_list_header;
|
122
|
+
|
123
|
+
|
124
|
+
|
125
|
+
/********************************************************************
|
126
|
+
*******************************************************************
|
127
|
+
**
|
128
|
+
** Structures for dealing with data stored at the atom level
|
129
|
+
**
|
130
|
+
**
|
131
|
+
*******************************************************************
|
132
|
+
*******************************************************************/
|
133
|
+
|
134
|
+
typedef struct{
|
135
|
+
int atom_id;
|
136
|
+
char *type;
|
137
|
+
char *exon_position;
|
138
|
+
probe_list_header *probes;
|
139
|
+
struct atom_list_node *next;
|
140
|
+
} atom_list_node;
|
141
|
+
|
142
|
+
|
143
|
+
typedef struct{
|
144
|
+
int n_atoms;
|
145
|
+
atom_list_node *first;
|
146
|
+
} atom_list_header;
|
147
|
+
|
148
|
+
|
149
|
+
|
150
|
+
/*******************************************************************
|
151
|
+
*******************************************************************
|
152
|
+
**
|
153
|
+
** Structures for dealing with data as stored at the probeset level
|
154
|
+
**
|
155
|
+
**
|
156
|
+
**
|
157
|
+
*******************************************************************
|
158
|
+
*******************************************************************/
|
159
|
+
|
160
|
+
typedef struct probeset_list_node *node_pointer;
|
161
|
+
|
162
|
+
|
163
|
+
typedef struct{
|
164
|
+
int probeset_id;
|
165
|
+
char *type;
|
166
|
+
char *probeset_name;
|
167
|
+
|
168
|
+
atom_list_header *atoms;
|
169
|
+
|
170
|
+
struct probeset_list_node *next;
|
171
|
+
} probeset_list_node;
|
172
|
+
|
173
|
+
|
174
|
+
|
175
|
+
typedef struct{
|
176
|
+
|
177
|
+
int n_probesets;
|
178
|
+
|
179
|
+
probeset_list_node *first;
|
180
|
+
probeset_list_node *current;
|
181
|
+
probeset_list_node *last;
|
182
|
+
} probeset_list_header;
|
183
|
+
|
184
|
+
|
185
|
+
|
186
|
+
/*******************************************************************
|
187
|
+
*******************************************************************
|
188
|
+
**
|
189
|
+
** Structure for storing pgf file (after it is read from file)
|
190
|
+
**
|
191
|
+
*******************************************************************
|
192
|
+
******************************************************************/
|
193
|
+
|
194
|
+
|
195
|
+
typedef struct{
|
196
|
+
pgf_headers *headers;
|
197
|
+
probeset_list_header *probesets;
|
198
|
+
} pgf_file;
|
199
|
+
|
200
|
+
|
201
|
+
/*******************************************************************
|
202
|
+
*******************************************************************
|
203
|
+
**
|
204
|
+
**
|
205
|
+
** Code for splitting a string into a series of tokens
|
206
|
+
**
|
207
|
+
**
|
208
|
+
*******************************************************************
|
209
|
+
*******************************************************************/
|
210
|
+
|
211
|
+
|
212
|
+
/***************************************************************
 **
 ** tokenset
 **
 ** char **tokens  - an array of token strings
 ** int n          - number of tokens in the array
 **
 ** Holds a set of tokens, typically produced by breaking a
 ** character string apart on a set of delimiters.
 **
 **************************************************************/

typedef struct {
    char **tokens;
    int    n;
} tokenset;
|
230
|
+
|
231
|
+
|
232
|
+
|
233
|
+
/******************************************************************
|
234
|
+
**
|
235
|
+
** tokenset *tokenize(char *str, char *delimiters)
|
236
|
+
**
|
237
|
+
** char *str - a string to break into tokens
|
238
|
+
** char *delimiters - delimiters to use in breaking up the line
|
239
|
+
**
|
240
|
+
**
|
241
|
+
** RETURNS a new tokenset
|
242
|
+
**
|
243
|
+
** Given a string, split into tokens based on a set of delimitors
|
244
|
+
**
|
245
|
+
*****************************************************************/
|
246
|
+
|
247
|
+
static tokenset *tokenize(char *str, char *delimiters){
|
248
|
+
|
249
|
+
#if USE_PTHREADS
|
250
|
+
char *tmp_pointer;
|
251
|
+
#endif
|
252
|
+
int i=0;
|
253
|
+
|
254
|
+
char *current_token;
|
255
|
+
tokenset *my_tokenset = Calloc(1,tokenset);
|
256
|
+
my_tokenset->n=0;
|
257
|
+
|
258
|
+
my_tokenset->tokens = NULL;
|
259
|
+
#if USE_PTHREADS
|
260
|
+
current_token = strtok_r(str,delimiters,&tmp_pointer);
|
261
|
+
#else
|
262
|
+
current_token = strtok(str,delimiters);
|
263
|
+
#endif
|
264
|
+
while (current_token != NULL){
|
265
|
+
my_tokenset->n++;
|
266
|
+
my_tokenset->tokens = Realloc(my_tokenset->tokens,my_tokenset->n,char*);
|
267
|
+
my_tokenset->tokens[i] = Calloc(strlen(current_token)+1,char);
|
268
|
+
strcpy(my_tokenset->tokens[i],current_token);
|
269
|
+
my_tokenset->tokens[i][(strlen(current_token))] = '\0';
|
270
|
+
i++;
|
271
|
+
#if USE_PTHREADS
|
272
|
+
current_token = strtok_r(NULL,delimiters,&tmp_pointer);
|
273
|
+
#else
|
274
|
+
current_token = strtok(NULL,delimiters);
|
275
|
+
#endif
|
276
|
+
}
|
277
|
+
return my_tokenset;
|
278
|
+
}
|
279
|
+
|
280
|
+
|
281
|
+
/******************************************************************
|
282
|
+
**
|
283
|
+
** int tokenset_size(tokenset *x)
|
284
|
+
**
|
285
|
+
** tokenset *x - a tokenset
|
286
|
+
**
|
287
|
+
** RETURNS the number of tokens in the tokenset
|
288
|
+
**
|
289
|
+
******************************************************************/
|
290
|
+
|
291
|
+
static int tokenset_size(tokenset *x){
|
292
|
+
return x->n;
|
293
|
+
}
|
294
|
+
|
295
|
+
|
296
|
+
/******************************************************************
|
297
|
+
**
|
298
|
+
** char *get_token(tokenset *x, int i)
|
299
|
+
**
|
300
|
+
** tokenset *x - a tokenset
|
301
|
+
** int i - index of the token to return
|
302
|
+
**
|
303
|
+
** RETURNS pointer to the i'th token
|
304
|
+
**
|
305
|
+
******************************************************************/
|
306
|
+
|
307
|
+
static char *get_token(tokenset *x,int i){
|
308
|
+
return x->tokens[i];
|
309
|
+
}
|
310
|
+
|
311
|
+
/******************************************************************
|
312
|
+
**
|
313
|
+
** void delete_tokens(tokenset *x)
|
314
|
+
**
|
315
|
+
** tokenset *x - a tokenset
|
316
|
+
**
|
317
|
+
** Deallocates all the space allocated for a tokenset
|
318
|
+
**
|
319
|
+
******************************************************************/
|
320
|
+
|
321
|
+
static void delete_tokens(tokenset *x){
|
322
|
+
|
323
|
+
int i;
|
324
|
+
|
325
|
+
for (i=0; i < x->n; i++){
|
326
|
+
Free(x->tokens[i]);
|
327
|
+
}
|
328
|
+
Free(x->tokens);
|
329
|
+
Free(x);
|
330
|
+
}
|
331
|
+
|
332
|
+
/*******************************************************************
 **
 ** static int token_ends_with(char *token, char *ends_in)
 **
 ** char *token   - string to check
 ** char *ends_in - suffix looked for at the end of token
 **
 ** RETURNS 0 when token does not end with ends_in; otherwise the
 ** index in token at which the matching suffix starts.
 **
 ** token must be strictly longer than ends_in for a match, so the
 ** returned index of a match is never 0:
 **
 **   token = "TestStr", ends_in = "TestStr"  -> 0 (no match)
 **   token = "TestStr", ends_in = "estStr"   -> 1
 **   token = "TestStr", ends_in = "stStr"    -> 2
 **
 ******************************************************************/

static int token_ends_with(char *token, char *ends_in){

  int token_len = strlen(token);
  int suffix_len = strlen(ends_in);
  int offset = token_len - suffix_len;

  /* token is not strictly longer than the suffix: cannot match */
  if (offset <= 0){
    return 0;
  }

  return (strcmp(token + offset,ends_in) == 0) ? offset : 0;
}
|
385
|
+
|
386
|
+
|
387
|
+
/*******************************************************************
|
388
|
+
*******************************************************************
|
389
|
+
**
|
390
|
+
** Code for Reading from file
|
391
|
+
**
|
392
|
+
*******************************************************************
|
393
|
+
*******************************************************************/
|
394
|
+
|
395
|
+
|
396
|
+
|
397
|
+
/****************************************************************
 **
 ** static int ReadFileLine(char *buffer, int buffersize, FILE *currentFile)
 **
 ** char *buffer      - place to store the contents of the line
 ** int buffersize    - size of the buffer
 ** FILE *currentFile - stream to read from
 **
 ** Reads one line with fgets into buffer.
 **
 ** RETURNS 1 when a line was read, 0 when fgets returned NULL
 ** (end of file or read error).  No error is raised here; the
 ** caller decides what a 0 means.
 **
 ***************************************************************/

static int ReadFileLine(char *buffer, int buffersize, FILE *currentFile){
  char *got = fgets(buffer, buffersize, currentFile);

  return (got != NULL) ? 1 : 0;
}
|
417
|
+
|
418
|
+
|
419
|
+
|
420
|
+
|
421
|
+
/****************************************************************
|
422
|
+
**
|
423
|
+
** Code for identifying what type of information is stored in
|
424
|
+
** the current line
|
425
|
+
**
|
426
|
+
***************************************************************/
|
427
|
+
|
428
|
+
/****************************************************************
 **
 ** static int IsHeaderLine(char *buffer)
 **
 ** char *buffer - contains line to evaluate
 **
 ** RETURNS 1 when the line starts with the two characters "#%"
 ** (a header line), 0 otherwise.
 **
 ***************************************************************/

static int IsHeaderLine(char *buffer){
  /* buffer[1] is only inspected once buffer[0] is known to be '#' */
  return (buffer[0] == '#' && buffer[1] == '%') ? 1 : 0;
}
|
448
|
+
|
449
|
+
/****************************************************************
 **
 ** static int IsCommentLine(char *buffer)
 **
 ** char *buffer - contains line to evaluate
 **
 ** RETURNS 1 when the line starts with '#', 0 otherwise.  Lines
 ** starting with "#%" also start with '#', so they return 1 too.
 **
 ***************************************************************/

static int IsCommentLine(char *buffer){
  return (buffer[0] == '#') ? 1 : 0;
}
|
466
|
+
|
467
|
+
|
468
|
+
/*****************************************************************
 **
 ** static int IsLevel2(char *buffer)
 **
 ** char *buffer - contains line to evaluate
 **
 ** RETURNS 1 when the line begins with two tab characters
 ** (i.e. "\t\t"), 0 otherwise.
 **
 ***************************************************************/

static int IsLevel2(char *buffer){
  /* buffer[1] is only inspected once buffer[0] is known to be a tab */
  return (buffer[0] == '\t' && buffer[1] == '\t') ? 1 : 0;
}
|
486
|
+
|
487
|
+
|
488
|
+
|
489
|
+
/*****************************************************************
 **
 ** static int IsLevel1(char *buffer)
 **
 ** char *buffer - contains line to evaluate
 **
 ** RETURNS 1 when the line begins with exactly one tab character
 ** (a tab that is not followed by a second tab), 0 otherwise.
 **
 ***************************************************************/

static int IsLevel1(char *buffer){
  if (buffer[0] != '\t'){
    return 0;
  }
  /* a second leading tab makes this a level 2 line instead */
  return (buffer[1] != '\t') ? 1 : 0;
}
|
510
|
+
|
511
|
+
|
512
|
+
|
513
|
+
/****************************************************************
|
514
|
+
****************************************************************
|
515
|
+
**
|
516
|
+
** Code for deallocating or initializing header data structures
|
517
|
+
**
|
518
|
+
****************************************************************
|
519
|
+
****************************************************************/
|
520
|
+
|
521
|
+
/*
 * Release everything owned by a pgf_headers structure.
 *
 * The structure itself is not freed; that is left to the caller.  Each
 * parsed column order (headerN) is released together with its raw
 * string, and only when that string is non-NULL.
 */
void dealloc_pgf_headers(pgf_headers *header){
  int idx;

  /* chip types: every string, then the array holding them */
  if (header->n_chip_type > 0){
    idx = 0;
    while (idx < header->n_chip_type){
      Free(header->chip_type[idx]);
      idx++;
    }
    Free(header->chip_type);
  }

  /* single-valued string headers */
  if (header->lib_set_name != NULL){
    Free(header->lib_set_name);
  }
  if (header->lib_set_version != NULL){
    Free(header->lib_set_version);
  }
  if (header->pgf_format_version != NULL){
    Free(header->pgf_format_version);
  }

  /* raw column lists and their parsed orderings */
  if (header->header0_str != NULL){
    Free(header->header0_str);
    Free(header->header0);
  }
  if (header->header1_str != NULL){
    Free(header->header1_str);
    Free(header->header1);
  }
  if (header->header2_str != NULL){
    Free(header->header2_str);
    Free(header->header2);
  }

  if (header->create_date != NULL){
    Free(header->create_date);
  }
  if (header->guid != NULL){
    Free(header->guid);
  }

  /* remaining key/value headers: every pair, then both arrays */
  if (header->n_other_headers > 0){
    idx = 0;
    while (idx < header->n_other_headers){
      Free(header->other_headers_keys[idx]);
      Free(header->other_headers_values[idx]);
      idx++;
    }
    Free(header->other_headers_keys);
    Free(header->other_headers_values);
  }
}
|
573
|
+
|
574
|
+
|
575
|
+
/*
 * Release every node of a probe list together with the strings it owns.
 *
 * The list header itself is not freed.  probes->first is advanced as the
 * nodes are removed and is NULL once the list is empty.
 */
void dealloc_probes(probe_list_header *probes){

  probe_list_node *doomed;

  while (probes->first != NULL){
    /* unlink the head before releasing it */
    doomed = probes->first;
    probes->first = (probe_list_node *)doomed->next;

    if (doomed->type != NULL){
      Free(doomed->type);
    }
    if (doomed->probe_sequence != NULL){
      Free(doomed->probe_sequence);
    }
    Free(doomed);
  }
}
|
596
|
+
|
597
|
+
|
598
|
+
|
599
|
+
/*
 * Release every node of an atom list, including each atom's strings and
 * its probe list (nodes and header).
 *
 * The atom list header itself is not freed.  atoms->first is advanced as
 * the nodes are removed and is NULL once the list is empty.
 */
void dealloc_atoms(atom_list_header *atoms){

  atom_list_node *doomed;

  while (atoms->first != NULL){
    /* unlink the head before releasing it */
    doomed = atoms->first;
    atoms->first = (atom_list_node *)doomed->next;

    if (doomed->type != NULL){
      Free(doomed->type);
    }
    if (doomed->exon_position != NULL){
      Free(doomed->exon_position);
    }
    if (doomed->probes != NULL){
      /* empty the probe list first, then drop its header */
      dealloc_probes(doomed->probes);
      Free(doomed->probes);
    }

    Free(doomed);
  }
}
|
628
|
+
|
629
|
+
|
630
|
+
/*
 * Release every node of a probeset list, including each probeset's
 * strings and its atom list (nodes and header).
 *
 * The probeset list header itself is not freed.  probesets->first is
 * advanced as the nodes are removed and is NULL once the list is empty;
 * the current and last members are not touched here.
 */
void dealloc_pgf_probesets(probeset_list_header *probesets){

  probeset_list_node *doomed;

  while (probesets->first != NULL){
    /* unlink the head before releasing it */
    doomed = probesets->first;
    probesets->first = (probeset_list_node *)doomed->next;

    if (doomed->type != NULL){
      Free(doomed->type);
    }
    if (doomed->probeset_name != NULL){
      Free(doomed->probeset_name);
    }
    if (doomed->atoms != NULL){
      /* empty the atom list first, then drop its header */
      dealloc_atoms(doomed->atoms);
      Free(doomed->atoms);
    }

    Free(doomed);
  }
}
|
657
|
+
|
658
|
+
|
659
|
+
|
660
|
+
/*
 * Release the headers and the probeset list held by a pgf_file.
 *
 * Each member is skipped when NULL.  The pgf_file structure itself is
 * not freed.
 */
void dealloc_pgf_file(pgf_file* my_pgf){

  pgf_headers *hdrs = my_pgf->headers;
  probeset_list_header *psets = my_pgf->probesets;

  if (hdrs != NULL){
    dealloc_pgf_headers(hdrs);
    Free(my_pgf->headers);
  }

  if (psets != NULL){
    dealloc_pgf_probesets(psets);
    Free(my_pgf->probesets);
  }
}
|
676
|
+
|
677
|
+
|
678
|
+
/*
 * Put a pgf_headers structure into its empty state: both counts are 0
 * and every pointer member is NULL.
 */
void initialize_pgf_header(pgf_headers *header){

  /* counted arrays */
  header->n_chip_type = 0;
  header->chip_type = NULL;
  header->n_other_headers = 0;
  header->other_headers_keys = NULL;
  header->other_headers_values = NULL;

  /* single-valued string headers */
  header->lib_set_name = NULL;
  header->lib_set_version = NULL;
  header->pgf_format_version = NULL;
  header->create_date = NULL;
  header->guid = NULL;

  /* raw column lists and their parsed orderings */
  header->header0_str = NULL;
  header->header0 = NULL;
  header->header1_str = NULL;
  header->header1 = NULL;
  header->header2_str = NULL;
  header->header2 = NULL;
}
|
698
|
+
|
699
|
+
/****************************************************************
|
700
|
+
****************************************************************
|
701
|
+
**
|
702
|
+
** Code for figuring out column ordering
|
703
|
+
**
|
704
|
+
****************************************************************
|
705
|
+
***************************************************************/
|
706
|
+
|
707
|
+
|
708
|
+
/*
 * Work out the column order described by a header0 field list.
 *
 * header_str - tab-separated list of column names; it is not modified
 * header0    - receives, for each known column, its zero-based position
 *              in header_str, or -1 when the column is not listed
 *
 * Recognised column names are "probeset_id", "type" and "probeset_name".
 *
 * The last branch used to test "type" a second time, which made it
 * unreachable, so probeset_name always stayed -1.  It now compares
 * against "probeset_name".
 */
static void determine_order_header0(char *header_str, header_0 *header0){

  tokenset *cur_tokenset;
  int i;
  char *cur_name;
  /* tokenize() splits its argument in place, so work on a private copy */
  char *temp_str = Calloc(strlen(header_str) +1, char);

  strcpy(temp_str,header_str);

  header0->probeset_id = -1;
  header0->type = -1;
  header0->probeset_name = -1;

  cur_tokenset = tokenize(temp_str,"\t\r\n");

  for (i=0; i < tokenset_size(cur_tokenset); i++){
    cur_name = get_token(cur_tokenset,i);
    if (strcmp(cur_name,"probeset_id")==0){
      header0->probeset_id = i;
    } else if (strcmp(cur_name,"type")==0){
      header0->type = i;
    } else if (strcmp(cur_name,"probeset_name")==0){
      header0->probeset_name = i;
    }
  }
  delete_tokens(cur_tokenset);

  Free(temp_str);
}
|
737
|
+
|
738
|
+
/*
 * Work out the column order described by a header1 field list.
 *
 * header_str - tab-separated list of column names; it is not modified
 * header1    - receives, for each known column, its zero-based position
 *              in header_str, or -1 when the column is not listed
 *
 * Recognised column names are "atom_id", "type" and "exon_position".
 */
static void determine_order_header1(char *header_str, header_1 *header1){

  tokenset *columns;
  char *column_name;
  int pos;
  /* tokenize() splits its argument in place, so work on a private copy */
  char *scratch = Calloc(strlen(header_str) +1, char);

  strcpy(scratch,header_str);

  /* every column starts out as "not present" */
  header1->atom_id = -1;
  header1->type = -1;
  header1->exon_position = -1;

  columns = tokenize(scratch,"\t\r\n");

  pos = 0;
  while (pos < tokenset_size(columns)){
    column_name = get_token(columns,pos);
    if (strcmp(column_name,"atom_id")==0){
      header1->atom_id = pos;
    } else if (strcmp(column_name,"type")==0){
      header1->type = pos;
    } else if (strcmp(column_name,"exon_position")==0){
      header1->exon_position = pos;
    }
    pos++;
  }

  delete_tokens(columns);
  Free(scratch);
}
|
767
|
+
|
768
|
+
/*
 * Work out the column order described by a header2 field list.
 *
 * header_str - tab-separated list of column names; it is not modified
 * header2    - receives, for each known column, its zero-based position
 *              in header_str, or -1 when the column is not listed
 *
 * Recognised column names are "probe_id", "type", "gc_count",
 * "probe_length", "interrogation_position" and "probe_sequence".
 */
static void determine_order_header2(char *header_str, header_2 *header2){

  /* known column names, each paired with the member that records its position */
  struct {
    const char *name;
    int *slot;
  } known[] = {
    { "probe_id",               &header2->probe_id },
    { "type",                   &header2->type },
    { "gc_count",               &header2->gc_count },
    { "probe_length",           &header2->probe_length },
    { "interrogation_position", &header2->interrogation_position },
    { "probe_sequence",         &header2->probe_sequence }
  };
  int n_known = (int)(sizeof(known) / sizeof(known[0]));
  tokenset *columns;
  int pos;
  int k;
  /* tokenize() splits its argument in place, so work on a private copy */
  char *scratch = Calloc(strlen(header_str) +1, char);

  strcpy(scratch,header_str);

  /* every column starts out as "not present" */
  for (k = 0; k < n_known; k++){
    *known[k].slot = -1;
  }

  columns = tokenize(scratch,"\t\r\n");

  for (pos = 0; pos < tokenset_size(columns); pos++){
    for (k = 0; k < n_known; k++){
      if (strcmp(get_token(columns,pos),known[k].name)==0){
        *known[k].slot = pos;
        break;   /* the names are distinct, so at most one can match */
      }
    }
  }

  delete_tokens(columns);
  Free(scratch);
}
|
807
|
+
|
808
|
+
|
809
|
+
/****************************************************************
|
810
|
+
**
|
811
|
+
** Validate that required headers are present in file.
|
812
|
+
**
|
813
|
+
** Return 0 if an expected header is not present.
|
814
|
+
** Returns 1 otherwise (ie everything looks fine)
|
815
|
+
**
|
816
|
+
***************************************************************/
|
817
|
+
|
818
|
+
static int validate_pgf_header(pgf_headers *header){
|
819
|
+
|
820
|
+
|
821
|
+
/* check that required headers are all there (have been read) */
|
822
|
+
if (header->chip_type == NULL)
|
823
|
+
return 0;
|
824
|
+
|
825
|
+
if (header->lib_set_name == NULL)
|
826
|
+
return 0;
|
827
|
+
|
828
|
+
if (header->lib_set_version == NULL)
|
829
|
+
return 0;
|
830
|
+
|
831
|
+
if (header->pgf_format_version == NULL)
|
832
|
+
return 0;
|
833
|
+
|
834
|
+
if (header->header0_str == NULL)
|
835
|
+
return 0;
|
836
|
+
|
837
|
+
if (header->header1_str == NULL)
|
838
|
+
return 0;
|
839
|
+
|
840
|
+
if (header->header2_str == NULL)
|
841
|
+
return 0;
|
842
|
+
|
843
|
+
|
844
|
+
/* Check that format version is 1.0 (only supported version) */
|
845
|
+
|
846
|
+
if (strcmp( header->pgf_format_version,"1.0") != 0){
|
847
|
+
return 0;
|
848
|
+
}
|
849
|
+
|
850
|
+
/* check that header0, header1, header2 (ie the three levels of headers) have required fields */
|
851
|
+
|
852
|
+
if (header->header0->probeset_id == -1)
|
853
|
+
return 0;
|
854
|
+
|
855
|
+
if (header->header1->atom_id == -1)
|
856
|
+
return 0;
|
857
|
+
|
858
|
+
if (header->header2->probe_id == -1)
|
859
|
+
return 0;
|
860
|
+
|
861
|
+
if (header->header2->type == -1)
|
862
|
+
return 0;
|
863
|
+
|
864
|
+
return 1;
|
865
|
+
}
|
866
|
+
|
867
|
+
|
868
|
+
|
869
|
+
|
870
|
+
/****************************************************************
|
871
|
+
****************************************************************
|
872
|
+
**
|
873
|
+
** Code for actually reading from the file
|
874
|
+
**
|
875
|
+
****************************************************************
|
876
|
+
***************************************************************/
|
877
|
+
|
878
|
+
/*
 * Open filename for reading in text mode ("r").
 *
 * error() is called with the file name when fopen() fails; the FILE pointer
 * obtained from fopen() is what gets returned.
 */
static FILE *open_pgf_file(const char *filename){

  FILE *handle = fopen(filename, "r");

  if (!handle){
    error("Could not open file %s", filename);
  }

  return handle;
}
|
890
|
+
|
891
|
+
/****************************************************************
|
892
|
+
**
|
893
|
+
** Reading the header
|
894
|
+
**
|
895
|
+
***************************************************************/
|
896
|
+
|
897
|
+
void read_pgf_header(FILE *cur_file, char *buffer, pgf_headers *header){
|
898
|
+
|
899
|
+
|
900
|
+
tokenset *cur_tokenset;
|
901
|
+
int i;
|
902
|
+
char *temp_str;
|
903
|
+
|
904
|
+
|
905
|
+
initialize_pgf_header(header);
|
906
|
+
do {
|
907
|
+
ReadFileLine(buffer, 1024, cur_file);
|
908
|
+
/* Rprintf("%s\n",buffer); */
|
909
|
+
if (IsHeaderLine(buffer)){
|
910
|
+
cur_tokenset = tokenize(&buffer[2],"=\r\n");
|
911
|
+
/* hopefully token 0 is Key
|
912
|
+
and token 1 is Value */
|
913
|
+
/* Rprintf("Key is: %s\n",get_token(cur_tokenset,0));
|
914
|
+
Rprintf("Value is: %s\n",get_token(cur_tokenset,1)); */
|
915
|
+
/* Decode the Key/Value pair */
|
916
|
+
if (strcmp(get_token(cur_tokenset,0),"chip_type") == 0){
|
917
|
+
if (header->n_chip_type == 0){
|
918
|
+
header->chip_type = Calloc(1, char *);
|
919
|
+
} else {
|
920
|
+
header->chip_type = Realloc(header->chip_type, header->n_chip_type+1, char *);
|
921
|
+
}
|
922
|
+
temp_str = Calloc(strlen(get_token(cur_tokenset,1))+1,char);
|
923
|
+
strcpy(temp_str,get_token(cur_tokenset,1));
|
924
|
+
header->chip_type[header->n_chip_type] = temp_str;
|
925
|
+
header->n_chip_type++;
|
926
|
+
} else if (strcmp(get_token(cur_tokenset,0), "lib_set_name") == 0){
|
927
|
+
temp_str = Calloc(strlen(get_token(cur_tokenset,1)) + 1,char);
|
928
|
+
strcpy(temp_str,get_token(cur_tokenset,1));
|
929
|
+
header->lib_set_name = temp_str;
|
930
|
+
} else if (strcmp(get_token(cur_tokenset,0), "lib_set_version") == 0){
|
931
|
+
temp_str = Calloc(strlen(get_token(cur_tokenset,1)) + 1,char);
|
932
|
+
strcpy(temp_str,get_token(cur_tokenset,1));
|
933
|
+
header->lib_set_version = temp_str;
|
934
|
+
} else if (strcmp(get_token(cur_tokenset,0), "pgf_format_version") == 0) {
|
935
|
+
temp_str = Calloc(strlen(get_token(cur_tokenset,1)) + 1,char);
|
936
|
+
strcpy(temp_str,get_token(cur_tokenset,1));
|
937
|
+
header->pgf_format_version = temp_str;
|
938
|
+
} else if (strcmp(get_token(cur_tokenset,0), "header0") == 0) {
|
939
|
+
temp_str = Calloc(strlen(get_token(cur_tokenset,1)) + 1,char);
|
940
|
+
strcpy(temp_str,get_token(cur_tokenset,1));
|
941
|
+
header->header0_str = temp_str;
|
942
|
+
header->header0 = Calloc(1,header_0);
|
943
|
+
determine_order_header0(header->header0_str,header->header0);
|
944
|
+
} else if (strcmp(get_token(cur_tokenset,0), "header1") == 0) {
|
945
|
+
temp_str = Calloc(strlen(get_token(cur_tokenset,1)) + 1,char);
|
946
|
+
strcpy(temp_str,get_token(cur_tokenset,1));
|
947
|
+
header->header1_str = temp_str;
|
948
|
+
header->header1 = Calloc(1,header_1);
|
949
|
+
determine_order_header1(header->header1_str,header->header1);
|
950
|
+
} else if (strcmp(get_token(cur_tokenset,0), "header2") == 0) {
|
951
|
+
temp_str = Calloc(strlen(get_token(cur_tokenset,1)) + 1,char);
|
952
|
+
strcpy(temp_str,get_token(cur_tokenset,1));
|
953
|
+
header->header2_str = temp_str;
|
954
|
+
header->header2 = Calloc(1,header_2);
|
955
|
+
determine_order_header2(header->header2_str,header->header2);
|
956
|
+
} else if (strcmp(get_token(cur_tokenset,0), "create_date") == 0) {
|
957
|
+
temp_str = Calloc(strlen(get_token(cur_tokenset,1)) + 1,char);
|
958
|
+
strcpy(temp_str,get_token(cur_tokenset,1));
|
959
|
+
header->create_date = temp_str;
|
960
|
+
} else if (strcmp(get_token(cur_tokenset,0), "guid") == 0) {
|
961
|
+
temp_str = Calloc(strlen(get_token(cur_tokenset,1)) + 1,char);
|
962
|
+
strcpy(temp_str,get_token(cur_tokenset,1));
|
963
|
+
header->guid = temp_str;
|
964
|
+
} else {
|
965
|
+
/* not one of the recognised header types */
|
966
|
+
if ( header->n_other_headers == 0){
|
967
|
+
header->other_headers_keys = Calloc(1, char *);
|
968
|
+
header->other_headers_values = Calloc(1, char *);
|
969
|
+
} else {
|
970
|
+
header->other_headers_keys = Realloc(header->other_headers_keys,header->n_other_headers+1, char *);
|
971
|
+
header->other_headers_values = Realloc(header->other_headers_values,header->n_other_headers+1, char *);
|
972
|
+
header->chip_type = Realloc(header->chip_type, header->n_chip_type+1, char *);
|
973
|
+
}
|
974
|
+
temp_str = Calloc(strlen(get_token(cur_tokenset,1)) + 1,char);
|
975
|
+
strcpy(temp_str,get_token(cur_tokenset,1));
|
976
|
+
header->other_headers_values[header->n_other_headers] = temp_str;
|
977
|
+
temp_str = Calloc(strlen(get_token(cur_tokenset,0)) + 1,char);
|
978
|
+
strcpy(temp_str,get_token(cur_tokenset,0));
|
979
|
+
header->other_headers_keys[header->n_other_headers] = temp_str;
|
980
|
+
header->n_other_headers++;
|
981
|
+
|
982
|
+
}
|
983
|
+
|
984
|
+
delete_tokens(cur_tokenset);
|
985
|
+
}
|
986
|
+
} while (IsHeaderLine(buffer));
|
987
|
+
|
988
|
+
}
|
989
|
+
|
990
|
+
|
991
|
+
/****************************************************************
|
992
|
+
**
|
993
|
+
** Reading the probesets/body of the file
|
994
|
+
**
|
995
|
+
***************************************************************/
|
996
|
+
|
997
|
+
/*
 * Put a probeset list header into its empty state: a count of zero and no
 * first, current or last node.
 */
void initialize_probeset_list(probeset_list_header *probeset_list){

  probeset_list->first = probeset_list->current = probeset_list->last = NULL;
  probeset_list->n_probesets = 0;
}
|
1004
|
+
|
1005
|
+
|
1006
|
+
|
1007
|
+
/*
 * Parse one level 2 line and append the resulting probe to probe_list.
 *
 * buffer      the line, split at tab / CR / LF.
 * probe_list  list that receives the new node at its end.
 * header2     column positions; a position of -1 means the column is absent
 *             and the matching member keeps the zero value Calloc gave it.
 *             probe_id is read unconditionally.
 */
void insert_probe(char *buffer, probe_list_header *probe_list, header_2 *header2){

  probe_list_node *node = Calloc(1,probe_list_node);
  tokenset *fields = tokenize(buffer,"\t\r\n");
  probe_list_node *tail;
  const char *text;

  node->probe_id = atoi(get_token(fields, header2->probe_id));

  if (header2->type != -1){
    text = get_token(fields, header2->type);
    node->type = Calloc(strlen(text) + 1,char);
    strcpy(node->type, text);
  }
  if (header2->gc_count != -1){
    node->gc_count = atoi(get_token(fields, header2->gc_count));
  }
  if (header2->probe_length != -1){
    node->probe_length = atoi(get_token(fields, header2->probe_length));
  }
  if (header2->interrogation_position != -1){
    node->interrogation_position = atoi(get_token(fields, header2->interrogation_position));
  }
  if (header2->probe_sequence != -1){
    text = get_token(fields, header2->probe_sequence);
    node->probe_sequence = Calloc(strlen(text) + 1,char);
    strcpy(node->probe_sequence, text);
  }

  node->next = NULL;

  if (probe_list->n_probes == 0){
    probe_list->first = node;
    probe_list->n_probes = 1;
  } else {
    /* walk to the last node and hang the new one behind it */
    for (tail = probe_list->first; tail->next != NULL; tail = (probe_list_node*)tail->next){
    }
    tail->next = (struct probe_list_node*)node;
    probe_list->n_probes++;
  }

  delete_tokens(fields);
}
|
1056
|
+
|
1057
|
+
|
1058
|
+
/*
 * Attach the probe described by a level 2 line to the last atom of the
 * current probeset.
 *
 * error() is raised when there is no current probeset, or when the current
 * probeset has no atom list yet.  The probe list of the target atom is
 * created on first use.
 */
void insert_level2(char *buffer, probeset_list_header *probeset_list, header_2 *header2){

  atom_list_node *atom;

  if (probeset_list->current == NULL){
    /* no level 0 object to insert into */
    error("Can not read a level 2 line before seeing a level 0 line. File corrupted?");
  }

  if (probeset_list->current->atoms == NULL){
    /* no level 1 object to insert into */
    error("Can not read a level 2 line before seeing a level 1 line. File corrupted?");
  }

  /* the most recently added atom is the last one in the list */
  for (atom = probeset_list->current->atoms->first;
       atom->next != NULL;
       atom = (atom_list_node *)atom->next){
  }

  if (atom->probes == NULL){
    atom->probes = Calloc(1,probe_list_header);
  }

  insert_probe(buffer, atom->probes, header2);
}
|
1087
|
+
|
1088
|
+
|
1089
|
+
|
1090
|
+
|
1091
|
+
|
1092
|
+
/*
 * Parse one level 1 line and append the resulting atom to atoms_list.
 *
 * buffer      the line, split at tab / CR / LF.
 * atoms_list  list that receives the new node at its end.
 * header1     column positions; type and exon_position are only copied when
 *             their position is not -1.  atom_id is read unconditionally.
 *
 * The new atom starts without a probe list.
 */
void insert_atom(char *buffer, atom_list_header *atoms_list, header_1 *header1){

  atom_list_node *node = Calloc(1,atom_list_node);
  tokenset *fields = tokenize(buffer,"\t\r\n");
  atom_list_node *tail;
  const char *text;

  node->atom_id = atoi(get_token(fields, header1->atom_id));

  if (header1->type != -1){
    text = get_token(fields, header1->type);
    node->type = Calloc(strlen(text) + 1,char);
    strcpy(node->type, text);
  }
  if (header1->exon_position != -1){
    text = get_token(fields, header1->exon_position);
    node->exon_position = Calloc(strlen(text) + 1,char);
    strcpy(node->exon_position, text);
  }
  node->probes = NULL;
  node->next = NULL;

  if (atoms_list->n_atoms == 0){
    atoms_list->first = node;
    atoms_list->n_atoms = 1;
  } else {
    /* walk to the last node and hang the new one behind it */
    for (tail = (atom_list_node*)atoms_list->first;
         tail->next != NULL;
         tail = (atom_list_node*)tail->next){
    }
    tail->next = (struct atom_list_node*)node;
    atoms_list->n_atoms++;
  }

  delete_tokens(fields);
}
|
1132
|
+
|
1133
|
+
|
1134
|
+
/*
 * Add the atom described by a level 1 line to the current probeset.
 *
 * error() is raised when there is no current probeset.  The atom list of the
 * probeset is created on first use.
 */
void insert_level1(char *buffer, probeset_list_header *probeset_list, header_1 *header1){

  probeset_list_node *owner;

  if (probeset_list->current == NULL){
    /* no level 0 object to insert into */
    error("Can not read a level 1 line before seeing a level 0 line. File corrupted?");
  }

  owner = probeset_list->current;
  if (owner->atoms == NULL){
    owner->atoms = Calloc(1,atom_list_header);
  }

  insert_atom(buffer, owner->atoms, header1);
}
|
1159
|
+
|
1160
|
+
|
1161
|
+
|
1162
|
+
|
1163
|
+
/*
 * Parse one level 0 line and append the resulting probeset to probeset_list.
 *
 * buffer         the line, split at tab / CR / LF.
 * probeset_list  list that receives the node; the new node becomes both the
 *                last and the current node.
 * header0        column positions; type and probeset_name are only copied
 *                when their position is not -1.  probeset_id is read
 *                unconditionally.
 *
 * The new probeset starts without an atom list.
 */
void insert_level0(char *buffer, probeset_list_header *probeset_list, header_0 *header0){

  probeset_list_node *node = Calloc(1,probeset_list_node);
  tokenset *fields = tokenize(buffer,"\t\r\n");
  const char *text;

  node->probeset_id = atoi(get_token(fields, header0->probeset_id));

  if (header0->type != -1){
    text = get_token(fields, header0->type);
    node->type = Calloc(strlen(text) + 1,char);
    strcpy(node->type, text);
  }
  if (header0->probeset_name != -1){
    text = get_token(fields, header0->probeset_name);
    node->probeset_name = Calloc(strlen(text) + 1,char);
    strcpy(node->probeset_name, text);
  }
  node->atoms = NULL;
  node->next = NULL;

  if (probeset_list->first != NULL){
    /* link behind the present tail before the tail pointer moves on */
    probeset_list->last->next = (struct probeset_list_node *)node;
    probeset_list->n_probesets++;
  } else {
    probeset_list->first = node;
    probeset_list->n_probesets = 1;
  }
  probeset_list->last = node;
  probeset_list->current = node;

  delete_tokens(fields);
}
|
1201
|
+
|
1202
|
+
|
1203
|
+
/*
 * Read the body of a PGF file into probeset_list.
 *
 * The list is reset first.  The line already sitting in `buffer` on entry is
 * inserted as a level 0 line; after that lines are read until ReadFileLine()
 * reports that there are no more.  Level 2 and level 1 lines go to the
 * matching insert routine, comment lines are skipped and every other line is
 * treated as level 0.
 */
void read_pgf_probesets(FILE *cur_file, char *buffer, probeset_list_header *probeset_list, pgf_headers *header){

  initialize_probeset_list(probeset_list);

  /* the caller has already read the first body line into buffer */
  insert_level0(buffer, probeset_list, header->header0);

  while (ReadFileLine(buffer, 1024, cur_file)){
    if (IsLevel2(buffer)){
      insert_level2(buffer, probeset_list, header->header2);
      continue;
    }
    if (IsLevel1(buffer)){
      insert_level1(buffer, probeset_list, header->header1);
      continue;
    }
    if (!IsCommentLine(buffer)){
      insert_level0(buffer, probeset_list, header->header0);
    }
  }
}
|
1221
|
+
|
1222
|
+
/****************************************************************
|
1223
|
+
****************************************************************
|
1224
|
+
**
|
1225
|
+
** Functionality for counting probeset types
|
1226
|
+
**
|
1227
|
+
****************************************************************
|
1228
|
+
****************************************************************/
|
1229
|
+
|
1230
|
+
/* One entry of a probeset type tally: a type label and its number of uses. */
typedef struct{
  char *type;   /* type label */
  int count;    /* number of probesets counted under this label */
} probeset_type_list;
|
1234
|
+
|
1235
|
+
|
1236
|
+
|
1237
|
+
/*
 * Count how many probesets there are of each type.
 *
 * my_pgf  parsed file whose probeset list is walked from its first node.
 *         The list's `current` pointer is used as the cursor and is left on
 *         the last node.
 * number  receives the number of distinct types found.  It is set to 0 when
 *         there is no probeset list or the list is empty (it used to be left
 *         unwritten in that case, so callers read an uninitialized value).
 *
 * Probesets whose type is NULL are counted under the label "none".
 *
 * Returns an array with *number filled entries.  The array is always
 * allocated, even when *number is 0; release it with
 * dealloc_probeset_type_list().
 */
probeset_type_list *pgf_count_probeset_types(pgf_file *my_pgf, int *number){

  probeset_type_list *my_type_list = Calloc(1,probeset_type_list);
  probeset_list_header *probesets = my_pgf->probesets;
  const char *cur_type;
  int n;

  *number = 0;

  if (probesets == NULL || probesets->first == NULL){
    return my_type_list;
  }

  /* traverse the probesets, each time examining the probeset type */
  probesets->current = probesets->first;
  for (;;){
    if (probesets->current->type == NULL){
      cur_type = "none";
    } else {
      cur_type = probesets->current->type;
    }

    /* look for an existing entry with this label */
    for (n = 0; n < *number; n++){
      if (strcmp(cur_type,my_type_list[n].type) == 0){
        break;
      }
    }

    if (n == *number){
      /* first time this label is seen: add an entry for it */
      my_type_list = Realloc(my_type_list,(n+1),probeset_type_list);
      my_type_list[n].type = Calloc(strlen(cur_type) + 1,char);
      strcpy(my_type_list[n].type,cur_type);
      my_type_list[n].count = 1;
      *number = *number + 1;
    } else {
      my_type_list[n].count++;
    }

    if (probesets->current->next == NULL){
      break;
    }
    probesets->current = probesets->current->next;
  }

  return my_type_list;
}
|
1292
|
+
|
1293
|
+
|
1294
|
+
/*
 * Release a type tally: the label string of each of the first `length`
 * entries, then the array itself.
 */
void dealloc_probeset_type_list(probeset_type_list *my_type_list, int length){

  while (length-- > 0){
    Free(my_type_list[length].type);
  }

  Free(my_type_list);
}
|
1305
|
+
|
1306
|
+
/****************************************************************
|
1307
|
+
****************************************************************
|
1308
|
+
**
|
1309
|
+
** Functionality for testing the parsers (from R .C interface)
|
1310
|
+
**
|
1311
|
+
****************************************************************
|
1312
|
+
****************************************************************/
|
1313
|
+
|
1314
|
+
void read_pgf_file(char **filename){
|
1315
|
+
|
1316
|
+
FILE *cur_file;
|
1317
|
+
pgf_file my_pgf;
|
1318
|
+
char *buffer = Calloc(1024, char);
|
1319
|
+
probeset_type_list *my_probeset_types;
|
1320
|
+
int ntypes;
|
1321
|
+
|
1322
|
+
cur_file = open_pgf_file(filename[0]);
|
1323
|
+
|
1324
|
+
my_pgf.headers = Calloc(1, pgf_headers);
|
1325
|
+
my_pgf.probesets = Calloc(1, probeset_list_header);
|
1326
|
+
|
1327
|
+
read_pgf_header(cur_file,buffer,my_pgf.headers);
|
1328
|
+
if (validate_pgf_header(my_pgf.headers)){
|
1329
|
+
read_pgf_probesets(cur_file, buffer, my_pgf.probesets, my_pgf.headers);
|
1330
|
+
my_probeset_types = pgf_count_probeset_types(&my_pgf, &ntypes);
|
1331
|
+
dealloc_probeset_type_list(my_probeset_types, ntypes);
|
1332
|
+
}
|
1333
|
+
Free(buffer);
|
1334
|
+
dealloc_pgf_file(&my_pgf);
|
1335
|
+
fclose(cur_file);
|
1336
|
+
|
1337
|
+
}
|