bio-affy 0.1.0.alpha.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.rspec +1 -0
- data/Gemfile +15 -0
- data/Gemfile.lock +32 -0
- data/LICENSE.txt +20 -0
- data/README.rdoc +33 -0
- data/Rakefile +77 -0
- data/VERSION +1 -0
- data/bin/bio-affy +80 -0
- data/bio-affy.gemspec +128 -0
- data/ext/DESCRIPTION +11 -0
- data/ext/HISTORY +3 -0
- data/ext/LICENSE +456 -0
- data/ext/NAMESPACE +2 -0
- data/ext/R/check.cdf.type.R +18 -0
- data/ext/R/read.cdffile.list.R +23 -0
- data/ext/R/read.celfile.R +11 -0
- data/ext/R/read.celfile.header.R +37 -0
- data/ext/R/read.probematrices.R +29 -0
- data/ext/README_BIOLIB +36 -0
- data/ext/aclocal.m4 +32 -0
- data/ext/configure +4898 -0
- data/ext/configure.in +51 -0
- data/ext/man/check.cdf.type.Rd +22 -0
- data/ext/man/read.cdffile.list.Rd +20 -0
- data/ext/man/read.celfile.Rd +23 -0
- data/ext/man/read.celfile.header.Rd +22 -0
- data/ext/man/read.celfile.probeintensity.matrices.Rd +31 -0
- data/ext/src/CMakeLists.txt +39 -0
- data/ext/src/Makevars.in +3 -0
- data/ext/src/Makevars.win +2 -0
- data/ext/src/Rakefile +43 -0
- data/ext/src/biolib_affyio.c +416 -0
- data/ext/src/biolib_affyio.h +132 -0
- data/ext/src/biolib_affyio.o +0 -0
- data/ext/src/fread_functions.c +871 -0
- data/ext/src/fread_functions.h +60 -0
- data/ext/src/fread_functions.o +0 -0
- data/ext/src/libaffyext.so +0 -0
- data/ext/src/mkrf.log +11 -0
- data/ext/src/mkrf_conf.rb +6 -0
- data/ext/src/read_abatch.c +5484 -0
- data/ext/src/read_abatch.h +63 -0
- data/ext/src/read_abatch.o +0 -0
- data/ext/src/read_bpmap.c +888 -0
- data/ext/src/read_bpmap.o +0 -0
- data/ext/src/read_cdf.h +347 -0
- data/ext/src/read_cdf_xda.c +1342 -0
- data/ext/src/read_cdf_xda.o +0 -0
- data/ext/src/read_cdffile2.c +1576 -0
- data/ext/src/read_cdffile2.o +0 -0
- data/ext/src/read_celfile_generic.c +2061 -0
- data/ext/src/read_celfile_generic.h +33 -0
- data/ext/src/read_celfile_generic.o +0 -0
- data/ext/src/read_clf.c +870 -0
- data/ext/src/read_clf.o +0 -0
- data/ext/src/read_generic.c +1446 -0
- data/ext/src/read_generic.h +144 -0
- data/ext/src/read_generic.o +0 -0
- data/ext/src/read_pgf.c +1337 -0
- data/ext/src/read_pgf.o +0 -0
- data/lib/bio-affy.rb +5 -0
- data/lib/bio/affy.rb +7 -0
- data/lib/bio/affyext.rb +23 -0
- data/lib/bio/libaffyext.so +0 -0
- data/spec/bio-affy_spec.rb +22 -0
- data/spec/spec_helper.rb +13 -0
- data/test/data/affy/GSM103328.CEL.gz +0 -0
- data/test/data/affy/GSM103329.CEL.gz +0 -0
- data/test/data/affy/GSM103330.CEL.gz +0 -0
- data/test/data/affy/MG_U74Av2.CDF.gz +0 -0
- metadata +190 -0
@@ -0,0 +1,144 @@
|
|
1
|
+
|
2
|
+
#ifdef BIOLIB
|
3
|
+
#include <biolib_R_map.h>
|
4
|
+
#endif
|
5
|
+
|
6
|
+
#include <zlib.h>
|
7
|
+
|
8
|
+
/******
|
9
|
+
******
|
10
|
+
****** Data Structures
|
11
|
+
******
|
12
|
+
******/
|
13
|
+
|
14
|
+
|
15
|
+
|
16
|
+
/* File header */
|
17
|
+
|
18
|
+
/* In-memory form of the file header: two single-byte fields followed by
   the data group count and the position of the first data group. */
typedef struct{
  uint8_t magic_number;
  uint8_t version;
  int32_t n_data_groups;           /* number of data groups */
  uint32_t first_group_file_pos;   /* file position of the first data group */
} generic_file_header;
|
24
|
+
|
25
|
+
|
26
|
+
|
27
|
+
|
28
|
+
/* An affy generic STRING */
|
29
|
+
|
30
|
+
/* Length-prefixed narrow character string: a 32 bit length together with
   a pointer to the characters. */
typedef struct{
  int32_t len;    /* length that accompanies value */
  char *value;    /* the characters */
} ASTRING;
|
34
|
+
|
35
|
+
/* An affy generic WSTRING */
|
36
|
+
|
37
|
+
/* Length-prefixed wide character string: a 32 bit length together with
   a pointer to the wchar_t characters. */
typedef struct{
  int32_t len;       /* length that accompanies value */
  wchar_t *value;    /* the wide characters */
} AWSTRING;
|
41
|
+
|
42
|
+
|
43
|
+
/* Name Value Type Triplet */
|
44
|
+
|
45
|
+
typedef struct{
|
46
|
+
AWSTRING name;
|
47
|
+
ASTRING value;
|
48
|
+
AWSTRING type;
|
49
|
+
} nvt_triplet;
|
50
|
+
|
51
|
+
|
52
|
+
|
53
|
+
|
54
|
+
/* Data Header */
|
55
|
+
|
56
|
+
typedef struct generic_data_header *generic_data_header_pointer;
|
57
|
+
|
58
|
+
typedef struct{
|
59
|
+
ASTRING data_type_id; /*Stored in file as INT followed by CHAR array */
|
60
|
+
ASTRING unique_file_id; /*See above */
|
61
|
+
AWSTRING Date_time; /*Stored in file as INT followed by WCHAR array */
|
62
|
+
AWSTRING locale;
|
63
|
+
int32_t n_name_type_value;
|
64
|
+
nvt_triplet *name_type_value;
|
65
|
+
int32_t n_parent_headers;
|
66
|
+
void **parent_headers;
|
67
|
+
} generic_data_header;
|
68
|
+
|
69
|
+
|
70
|
+
/* Data Group */
|
71
|
+
|
72
|
+
typedef struct {
|
73
|
+
|
74
|
+
uint32_t file_position_nextgroup;
|
75
|
+
uint32_t file_position_first_data;
|
76
|
+
int32_t n_data_sets;
|
77
|
+
AWSTRING data_group_name;
|
78
|
+
} generic_data_group;
|
79
|
+
|
80
|
+
|
81
|
+
/* Dataset */
|
82
|
+
|
83
|
+
typedef struct {
|
84
|
+
AWSTRING name;
|
85
|
+
uint8_t type;
|
86
|
+
int32_t size;
|
87
|
+
} col_nvts_triplet;
|
88
|
+
|
89
|
+
|
90
|
+
|
91
|
+
typedef struct {
|
92
|
+
uint32_t file_pos_first;
|
93
|
+
uint32_t file_pos_last;
|
94
|
+
AWSTRING data_set_name;
|
95
|
+
int32_t n_name_type_value;
|
96
|
+
nvt_triplet *name_type_value;
|
97
|
+
uint32_t ncols;
|
98
|
+
col_nvts_triplet* col_name_type_value;
|
99
|
+
uint32_t nrows;
|
100
|
+
void **Data; /* in the docs this is rows */
|
101
|
+
} generic_data_set;
|
102
|
+
|
103
|
+
|
104
|
+
|
105
|
+
|
106
|
+
/* Codes for the value types recognised in a name/value/type triplet.
   Numbering starts at 1. */
typedef enum{
  ASCIITEXT = 1,
  PLAINTEXT = 2,
  UINT8     = 3,
  INT8      = 4,
  UINT16    = 5,
  INT16     = 6,
  UINT32    = 7,
  INT32     = 8,
  FLOAT32   = 9
} AffyMIMEtypes;
|
119
|
+
|
120
|
+
|
121
|
+
|
122
|
+
AffyMIMEtypes determine_MIMETYPE(nvt_triplet triplet);
|
123
|
+
void *decode_MIME_value(nvt_triplet triplet, AffyMIMEtypes mimetype, void *result, int *size);
|
124
|
+
char *decode_MIME_value_toASCII(nvt_triplet triplet, AffyMIMEtypes mimetype, void *result, int *size);
|
125
|
+
|
126
|
+
nvt_triplet* find_nvt(generic_data_header *data_header,char *name);
|
127
|
+
|
128
|
+
int read_generic_file_header(generic_file_header* file_header, FILE *instream);
|
129
|
+
int read_generic_data_header(generic_data_header *data_header, FILE *instream);
|
130
|
+
int read_generic_data_group(generic_data_group *data_group, FILE *instream);
|
131
|
+
int read_generic_data_set(generic_data_set *data_set, FILE *instream);
|
132
|
+
int read_generic_data_set_rows(generic_data_set *data_set, FILE *instream);
|
133
|
+
|
134
|
+
|
135
|
+
void Free_generic_data_header(generic_data_header *header);
|
136
|
+
void Free_generic_data_group(generic_data_group *data_group);
|
137
|
+
void Free_generic_data_set(generic_data_set *data_set);
|
138
|
+
|
139
|
+
|
140
|
+
int gzread_generic_file_header(generic_file_header* file_header, gzFile *instream);
|
141
|
+
int gzread_generic_data_header(generic_data_header *data_header, gzFile *instream);
|
142
|
+
int gzread_generic_data_group(generic_data_group *data_group,gzFile *instream);
|
143
|
+
int gzread_generic_data_set(generic_data_set *data_set, gzFile *instream);
|
144
|
+
int gzread_generic_data_set_rows(generic_data_set *data_set, gzFile *instream);
|
Binary file
|
data/ext/src/read_pgf.c
ADDED
@@ -0,0 +1,1337 @@
|
|
1
|
+
/******************************************************************
|
2
|
+
**
|
3
|
+
** file: read_pgf.c
|
4
|
+
**
|
5
|
+
** Aim: implement parsing of PGF format files
|
6
|
+
**
|
7
|
+
** Copyright (C) 2007 B. M. Bolstad
|
8
|
+
**
|
9
|
+
** Created on Nov 4, 2007
|
10
|
+
**
|
11
|
+
** History
|
12
|
+
** Nov 4, 2007 - Initial version
|
13
|
+
** Dec 17, 2007 - add function for counting number of each type of probeset
|
14
|
+
** Dec 31, 2007 - add function which checks that all required fields are present
|
15
|
+
** Mar 18, 2008 - fix error in read_pgf_header function
|
16
|
+
**
|
17
|
+
**
|
18
|
+
**
|
19
|
+
******************************************************************/
|
20
|
+
|
21
|
+
#include <R.h>
|
22
|
+
|
23
|
+
#include <stdio.h>
|
24
|
+
#include <stdlib.h>
|
25
|
+
|
26
|
+
|
27
|
+
#define BUFFERSIZE 1024
|
28
|
+
|
29
|
+
|
30
|
+
/*******************************************************************
|
31
|
+
*******************************************************************
|
32
|
+
**
|
33
|
+
** Structures for dealing with pgf file information
|
34
|
+
**
|
35
|
+
**
|
36
|
+
**
|
37
|
+
*******************************************************************
|
38
|
+
******************************************************************/
|
39
|
+
|
40
|
+
/*******************************************************************
|
41
|
+
*******************************************************************
|
42
|
+
**
|
43
|
+
** Starting off with the headers
|
44
|
+
**
|
45
|
+
*******************************************************************
|
46
|
+
******************************************************************/
|
47
|
+
|
48
|
+
/* integer (from 0 to n-1) indicates position of header (-1 means header is not present) */
|
49
|
+
|
50
|
+
/* Column positions for the probeset level (level 0) header.
   Each field is the position (0 to n-1) of the named column, or -1
   when that column is not present. */
typedef struct{
  int probeset_id;
  int type;
  int probeset_name;
} header_0;
|
55
|
+
|
56
|
+
/* integer (from 0 to n-1) indicates position of header (-1 means header is not present) */
|
57
|
+
|
58
|
+
/* Column positions for the atom level (level 1) header.
   Each field is the position (0 to n-1) of the named column, or -1
   when that column is not present. */
typedef struct{
  int atom_id;
  int type;
  int exon_position;
} header_1;
|
63
|
+
|
64
|
+
/* integer (from 0 to n-1) indicates position of header (-1 means header is not present) */
|
65
|
+
|
66
|
+
/* Column positions for the probe level (level 2) header.
   Each field is the position (0 to n-1) of the named column, or -1
   when that column is not present. */
typedef struct{
  int probe_id;
  int type;
  int gc_count;
  int probe_length;
  int interrogation_position;
  int probe_sequence;
} header_2;
|
74
|
+
|
75
|
+
|
76
|
+
|
77
|
+
typedef struct{
|
78
|
+
char **chip_type;
|
79
|
+
int n_chip_type;
|
80
|
+
char *lib_set_name;
|
81
|
+
char *lib_set_version;
|
82
|
+
char *pgf_format_version;
|
83
|
+
char *header0_str;
|
84
|
+
header_0 *header0;
|
85
|
+
char *header1_str;
|
86
|
+
header_1 *header1;
|
87
|
+
char *header2_str;
|
88
|
+
header_2 *header2;
|
89
|
+
char *create_date;
|
90
|
+
char *guid;
|
91
|
+
char **other_headers_keys;
|
92
|
+
char **other_headers_values;
|
93
|
+
int n_other_headers;
|
94
|
+
} pgf_headers;
|
95
|
+
|
96
|
+
|
97
|
+
/********************************************************************
|
98
|
+
*******************************************************************
|
99
|
+
**
|
100
|
+
** Structures for dealing with data stored at the probelevel
|
101
|
+
**
|
102
|
+
**
|
103
|
+
*******************************************************************
|
104
|
+
*******************************************************************/
|
105
|
+
|
106
|
+
/* One probe, stored as a node of a singly linked list.
   The strings (type, probe_sequence) belong to the node and are released
   together with it by dealloc_probes.

   The struct carries the tag probe_list_node so that "next" really is a
   pointer to this same type. Without the tag, "struct probe_list_node"
   named a separate type that was never defined, which is why list
   traversal needed a cast. Existing casts to (probe_list_node *) remain
   valid. */
typedef struct probe_list_node{
  int probe_id;
  char *type;
  int gc_count;
  int probe_length;
  int interrogation_position;
  char *probe_sequence;
  struct probe_list_node *next;   /* following probe, NULL at the end of the list */
} probe_list_node;
|
116
|
+
|
117
|
+
|
118
|
+
typedef struct{
|
119
|
+
int n_probes;
|
120
|
+
probe_list_node *first;
|
121
|
+
} probe_list_header;
|
122
|
+
|
123
|
+
|
124
|
+
|
125
|
+
/********************************************************************
|
126
|
+
*******************************************************************
|
127
|
+
**
|
128
|
+
** Structures for dealing with data stored at the atom level
|
129
|
+
**
|
130
|
+
**
|
131
|
+
*******************************************************************
|
132
|
+
*******************************************************************/
|
133
|
+
|
134
|
+
typedef struct{
|
135
|
+
int atom_id;
|
136
|
+
char *type;
|
137
|
+
char *exon_position;
|
138
|
+
probe_list_header *probes;
|
139
|
+
struct atom_list_node *next;
|
140
|
+
} atom_list_node;
|
141
|
+
|
142
|
+
|
143
|
+
typedef struct{
|
144
|
+
int n_atoms;
|
145
|
+
atom_list_node *first;
|
146
|
+
} atom_list_header;
|
147
|
+
|
148
|
+
|
149
|
+
|
150
|
+
/*******************************************************************
|
151
|
+
*******************************************************************
|
152
|
+
**
|
153
|
+
** Structures for dealing with data as stored at the probeset level
|
154
|
+
**
|
155
|
+
**
|
156
|
+
**
|
157
|
+
*******************************************************************
|
158
|
+
*******************************************************************/
|
159
|
+
|
160
|
+
typedef struct probeset_list_node *node_pointer;
|
161
|
+
|
162
|
+
|
163
|
+
typedef struct{
|
164
|
+
int probeset_id;
|
165
|
+
char *type;
|
166
|
+
char *probeset_name;
|
167
|
+
|
168
|
+
atom_list_header *atoms;
|
169
|
+
|
170
|
+
struct probeset_list_node *next;
|
171
|
+
} probeset_list_node;
|
172
|
+
|
173
|
+
|
174
|
+
|
175
|
+
typedef struct{
|
176
|
+
|
177
|
+
int n_probesets;
|
178
|
+
|
179
|
+
probeset_list_node *first;
|
180
|
+
probeset_list_node *current;
|
181
|
+
probeset_list_node *last;
|
182
|
+
} probeset_list_header;
|
183
|
+
|
184
|
+
|
185
|
+
|
186
|
+
/*******************************************************************
|
187
|
+
*******************************************************************
|
188
|
+
**
|
189
|
+
** Structure for storing pgf file (after it is read from file)
|
190
|
+
**
|
191
|
+
*******************************************************************
|
192
|
+
******************************************************************/
|
193
|
+
|
194
|
+
|
195
|
+
typedef struct{
|
196
|
+
pgf_headers *headers;
|
197
|
+
probeset_list_header *probesets;
|
198
|
+
} pgf_file;
|
199
|
+
|
200
|
+
|
201
|
+
/*******************************************************************
|
202
|
+
*******************************************************************
|
203
|
+
**
|
204
|
+
**
|
205
|
+
** Code for splitting a string into a series of tokens
|
206
|
+
**
|
207
|
+
**
|
208
|
+
*******************************************************************
|
209
|
+
*******************************************************************/
|
210
|
+
|
211
|
+
|
212
|
+
/***************************************************************
|
213
|
+
**
|
214
|
+
** tokenset
|
215
|
+
**
|
216
|
+
** char **tokens - a array of token strings
|
217
|
+
** int n - number of tokens in this set.
|
218
|
+
**
|
219
|
+
** a structure to hold a set of tokens. Typically a tokenset is
|
220
|
+
** created by breaking a character string based upon a set of
|
221
|
+
** delimiters.
|
222
|
+
**
|
223
|
+
**
|
224
|
+
**************************************************************/
|
225
|
+
|
226
|
+
/* Result of splitting a string: n separately allocated token strings. */
typedef struct{
  char **tokens;   /* the token strings */
  int n;           /* how many tokens there are */
} tokenset;
|
230
|
+
|
231
|
+
|
232
|
+
|
233
|
+
/******************************************************************
|
234
|
+
**
|
235
|
+
** tokenset *tokenize(char *str, char *delimiters)
|
236
|
+
**
|
237
|
+
** char *str - a string to break into tokens
|
238
|
+
** char *delimiters - delimiters to use in breaking up the line
|
239
|
+
**
|
240
|
+
**
|
241
|
+
** RETURNS a new tokenset
|
242
|
+
**
|
243
|
+
** Given a string, split into tokens based on a set of delimiters
|
244
|
+
**
|
245
|
+
*****************************************************************/
|
246
|
+
|
247
|
+
static tokenset *tokenize(char *str, char *delimiters){
|
248
|
+
|
249
|
+
#if USE_PTHREADS
|
250
|
+
char *tmp_pointer;
|
251
|
+
#endif
|
252
|
+
int i=0;
|
253
|
+
|
254
|
+
char *current_token;
|
255
|
+
tokenset *my_tokenset = Calloc(1,tokenset);
|
256
|
+
my_tokenset->n=0;
|
257
|
+
|
258
|
+
my_tokenset->tokens = NULL;
|
259
|
+
#if USE_PTHREADS
|
260
|
+
current_token = strtok_r(str,delimiters,&tmp_pointer);
|
261
|
+
#else
|
262
|
+
current_token = strtok(str,delimiters);
|
263
|
+
#endif
|
264
|
+
while (current_token != NULL){
|
265
|
+
my_tokenset->n++;
|
266
|
+
my_tokenset->tokens = Realloc(my_tokenset->tokens,my_tokenset->n,char*);
|
267
|
+
my_tokenset->tokens[i] = Calloc(strlen(current_token)+1,char);
|
268
|
+
strcpy(my_tokenset->tokens[i],current_token);
|
269
|
+
my_tokenset->tokens[i][(strlen(current_token))] = '\0';
|
270
|
+
i++;
|
271
|
+
#if USE_PTHREADS
|
272
|
+
current_token = strtok_r(NULL,delimiters,&tmp_pointer);
|
273
|
+
#else
|
274
|
+
current_token = strtok(NULL,delimiters);
|
275
|
+
#endif
|
276
|
+
}
|
277
|
+
return my_tokenset;
|
278
|
+
}
|
279
|
+
|
280
|
+
|
281
|
+
/******************************************************************
|
282
|
+
**
|
283
|
+
** int tokenset_size(tokenset *x)
|
284
|
+
**
|
285
|
+
** tokenset *x - a tokenset
|
286
|
+
**
|
287
|
+
** RETURNS the number of tokens in the tokenset
|
288
|
+
**
|
289
|
+
******************************************************************/
|
290
|
+
|
291
|
+
/* Number of tokens held in x. */
static int tokenset_size(tokenset *x){
  int count = x->n;

  return count;
}
|
294
|
+
|
295
|
+
|
296
|
+
/******************************************************************
|
297
|
+
**
|
298
|
+
** char *get_token(tokenset *x, int i)
|
299
|
+
**
|
300
|
+
** tokenset *x - a tokenset
|
301
|
+
** int i - index of the token to return
|
302
|
+
**
|
303
|
+
** RETURNS pointer to the i'th token
|
304
|
+
**
|
305
|
+
******************************************************************/
|
306
|
+
|
307
|
+
/* Pointer to the i'th token of x (zero based, no bounds check).
   The string stays owned by the tokenset. */
static char *get_token(tokenset *x,int i){
  char **all_tokens = x->tokens;

  return all_tokens[i];
}
|
310
|
+
|
311
|
+
/******************************************************************
|
312
|
+
**
|
313
|
+
** void delete_tokens(tokenset *x)
|
314
|
+
**
|
315
|
+
** tokenset *x - a tokenset
|
316
|
+
**
|
317
|
+
** Deallocates all the space allocated for a tokenset
|
318
|
+
**
|
319
|
+
******************************************************************/
|
320
|
+
|
321
|
+
/* Release a tokenset: each token string, then the array of pointers,
   then the tokenset structure itself. */
static void delete_tokens(tokenset *x){

  int k = 0;

  while (k < x->n){
    Free(x->tokens[k]);
    k++;
  }
  Free(x->tokens);
  Free(x);
}
|
331
|
+
|
332
|
+
/*******************************************************************
|
333
|
+
**
|
334
|
+
** int token_ends_with(char *token, char *ends)
|
335
|
+
**
|
336
|
+
** char *token - a string to check
|
337
|
+
** char *ends_in - we are looking for this string at the end of token
|
338
|
+
**
|
339
|
+
**
|
340
|
+
** returns 0 if no match, otherwise it returns the index of the first character
|
341
|
+
** which matches the start of *ends.
|
342
|
+
**
|
343
|
+
** Note that there must be one additional character in "token" beyond
|
344
|
+
** the characters in "ends". So
|
345
|
+
**
|
346
|
+
** *token = "TestStr"
|
347
|
+
** *ends = "TestStr"
|
348
|
+
**
|
349
|
+
** would return 0 but if
|
350
|
+
**
|
351
|
+
** ends = "estStr"
|
352
|
+
**
|
353
|
+
** we would return 1.
|
354
|
+
**
|
355
|
+
** and if
|
356
|
+
**
|
357
|
+
** ends= "stStr"
|
358
|
+
** we would return 2 .....etc
|
359
|
+
**
|
360
|
+
**
|
361
|
+
******************************************************************/
|
362
|
+
|
363
|
+
/* Does token end with ends_in?
   Returns the index in token at which the suffix starts, or 0 when there
   is no match. token has to be strictly longer than ends_in, so a match
   can never start at index 0 and 0 always means "no match". */
static int token_ends_with(char *token, char *ends_in){

  int token_len = strlen(token);
  int suffix_len = strlen(ends_in);
  int offset = token_len - suffix_len;   /* where the suffix would have to begin */

  if (offset <= 0){
    /* token is not longer than ends_in, so it cannot qualify */
    return 0;
  }

  return (strcmp(token + offset,ends_in) == 0) ? offset : 0;
}
|
385
|
+
|
386
|
+
|
387
|
+
/*******************************************************************
|
388
|
+
*******************************************************************
|
389
|
+
**
|
390
|
+
** Code for Reading from file
|
391
|
+
**
|
392
|
+
*******************************************************************
|
393
|
+
*******************************************************************/
|
394
|
+
|
395
|
+
|
396
|
+
|
397
|
+
/****************************************************************
|
398
|
+
**
|
399
|
+
** static int ReadFileLine(char *buffer, int buffersize, FILE *currentFile)
|
400
|
+
**
|
401
|
+
** char *buffer - place to store contents of the line
|
402
|
+
** int buffersize - size of the buffer
|
403
|
+
** FILE *currentFile - FILE pointer to an opened file.
|
404
|
+
**
|
405
|
+
** Read a line from a file, into a buffer of specified size.
|
406
|
+
** Returns 1 if a line was read, 0 otherwise (end of file or read error).
|
407
|
+
**
|
408
|
+
***************************************************************/
|
409
|
+
|
410
|
+
/* Read the next line of currentFile into buffer (at most buffersize-1
   characters, as with fgets). Returns 1 when something was read and 0
   when fgets reported end of file or an error. */
static int ReadFileLine(char *buffer, int buffersize, FILE *currentFile){
  char *got = fgets(buffer, buffersize, currentFile);

  return (got != NULL) ? 1 : 0;
}
|
417
|
+
|
418
|
+
|
419
|
+
|
420
|
+
|
421
|
+
/****************************************************************
|
422
|
+
**
|
423
|
+
** Code for identifying what type of information is stored in
|
424
|
+
** the current line
|
425
|
+
**
|
426
|
+
***************************************************************/
|
427
|
+
|
428
|
+
/****************************************************************
|
429
|
+
**
|
430
|
+
** static int IsHeaderLine(char *buffer)
|
431
|
+
**
|
432
|
+
** char *buffer - contains line to evaluate
|
433
|
+
**
|
434
|
+
** Checks whether supplied line is a header line (ie starts with #%)
|
435
|
+
**
|
436
|
+
** return 1 (ie true) if header line. 0 otherwise
|
437
|
+
**
|
438
|
+
***************************************************************/
|
439
|
+
|
440
|
+
|
441
|
+
/* 1 when buffer starts with the two characters "#%", 0 otherwise.
   The second character is inspected only when the first one matched,
   so an empty string is handled safely. */
static int IsHeaderLine(char *buffer){
  return (buffer[0] == '#' && buffer[1] == '%') ? 1 : 0;
}
|
448
|
+
|
449
|
+
/****************************************************************
|
450
|
+
**
|
451
|
+
** static int IsCommentLine(char *buffer)
|
452
|
+
**
|
453
|
+
** char *buffer - contains line to evaluate
|
454
|
+
**
|
455
|
+
** Checks whether supplied line is a comment line (ie starts with #)
|
456
|
+
**
|
457
|
+
**
|
458
|
+
***************************************************************/
|
459
|
+
|
460
|
+
/* 1 when buffer starts with '#', 0 otherwise. Header lines ("#%...")
   also start with '#', so they are reported as comment lines as well. */
static int IsCommentLine(char *buffer){
  return (buffer[0] == '#') ? 1 : 0;
}
|
466
|
+
|
467
|
+
|
468
|
+
/*****************************************************************
|
469
|
+
**
|
470
|
+
** static int IsLevel2(char *buffer)
|
471
|
+
**
|
472
|
+
** char *buffer - contains line to evaluate
|
473
|
+
**
|
474
|
+
** checks whether supplied line begins with two tab characters ie \t\t
|
475
|
+
**
|
476
|
+
** Return 1 if true, 0 otherwise
|
477
|
+
**
|
478
|
+
***************************************************************/
|
479
|
+
|
480
|
+
/* 1 when buffer starts with two tab characters, 0 otherwise.
   The second character is inspected only when the first one is a tab. */
static int IsLevel2(char *buffer){
  return (buffer[0] == '\t' && buffer[1] == '\t') ? 1 : 0;
}
|
486
|
+
|
487
|
+
|
488
|
+
|
489
|
+
/*****************************************************************
|
490
|
+
**
|
491
|
+
** static int IsLevel1(char *buffer)
|
492
|
+
**
|
493
|
+
** char *buffer - contains line to evaluate
|
494
|
+
**
|
495
|
+
** checks whether supplied line begins with exactly one tab character ie \t
|
496
|
+
**
|
497
|
+
** Return 1 if true, 0 otherwise
|
498
|
+
**
|
499
|
+
***************************************************************/
|
500
|
+
|
501
|
+
/* 1 when buffer starts with exactly one tab character (a tab that is not
   followed by a second tab), 0 otherwise. */
static int IsLevel1(char *buffer){
  if (buffer[0] != '\t'){
    return 0;
  }
  /* a second tab makes this a level 2 line */
  return (buffer[1] != '\t') ? 1 : 0;
}
|
510
|
+
|
511
|
+
|
512
|
+
|
513
|
+
/****************************************************************
|
514
|
+
****************************************************************
|
515
|
+
**
|
516
|
+
** Code for deallocating or initializing header data structures
|
517
|
+
**
|
518
|
+
****************************************************************
|
519
|
+
****************************************************************/
|
520
|
+
|
521
|
+
/* Release everything that hangs off a pgf_headers structure.
   The structure itself is not released; that is left to the caller. */
void dealloc_pgf_headers(pgf_headers *header){
  int k;

  /* chip types: every string, then the array holding them */
  if (header->n_chip_type > 0){
    k = 0;
    while (k < header->n_chip_type){
      Free(header->chip_type[k]);
      k++;
    }
    Free(header->chip_type);
  }

  /* single valued string headers */
  if (header->lib_set_name != NULL){
    Free(header->lib_set_name);
  }
  if (header->lib_set_version != NULL){
    Free(header->lib_set_version);
  }
  if (header->pgf_format_version != NULL){
    Free(header->pgf_format_version);
  }

  /* each parsed column order is released only together with its header string */
  if (header->header0_str != NULL){
    Free(header->header0_str);
    Free(header->header0);
  }
  if (header->header1_str != NULL){
    Free(header->header1_str);
    Free(header->header1);
  }
  if (header->header2_str != NULL){
    Free(header->header2_str);
    Free(header->header2);
  }

  if (header->create_date != NULL){
    Free(header->create_date);
  }
  if (header->guid != NULL){
    Free(header->guid);
  }

  /* remaining headers: matching key and value strings, then both arrays */
  if (header->n_other_headers > 0){
    k = 0;
    while (k < header->n_other_headers){
      Free(header->other_headers_keys[k]);
      Free(header->other_headers_values[k]);
      k++;
    }
    Free(header->other_headers_keys);
    Free(header->other_headers_values);
  }
}
|
573
|
+
|
574
|
+
|
575
|
+
/* Release every node of a probe list together with the strings each node
   owns. probes->first ends up NULL; the list header itself is left for
   the caller to release. */
void dealloc_probes(probe_list_header *probes){

  probe_list_node *doomed = probes->first;

  while (doomed != NULL){
    /* unhook the node before it is destroyed */
    probes->first = (probe_list_node *)doomed->next;

    if (doomed->type != NULL){
      Free(doomed->type);
    }
    if (doomed->probe_sequence != NULL){
      Free(doomed->probe_sequence);
    }
    Free(doomed);

    doomed = probes->first;
  }
}
|
596
|
+
|
597
|
+
|
598
|
+
|
599
|
+
/* Release every node of an atom list together with the strings and the
   probe list each node owns. atoms->first ends up NULL; the list header
   itself is left for the caller to release. */
void dealloc_atoms(atom_list_header *atoms){

  atom_list_node *doomed = atoms->first;

  while (doomed != NULL){
    /* unhook the node before it is destroyed */
    atoms->first = (atom_list_node *)doomed->next;

    if (doomed->type != NULL){
      Free(doomed->type);
    }
    if (doomed->exon_position != NULL){
      Free(doomed->exon_position);
    }
    if (doomed->probes != NULL){
      /* nodes of the probe list first, then the probe list header */
      dealloc_probes(doomed->probes);
      Free(doomed->probes);
    }
    Free(doomed);

    doomed = atoms->first;
  }
}
|
628
|
+
|
629
|
+
|
630
|
+
/* Release every node of the probeset list together with the strings and
   the atom list each node owns. probesets->first ends up NULL; the
   current and last members are not touched, and the list header itself
   is left for the caller to release. */
void dealloc_pgf_probesets(probeset_list_header *probesets){

  probeset_list_node *doomed = probesets->first;

  while (doomed != NULL){
    /* unhook the node before it is destroyed */
    probesets->first = (probeset_list_node *)doomed->next;

    if (doomed->type != NULL){
      Free(doomed->type);
    }
    if (doomed->probeset_name != NULL){
      Free(doomed->probeset_name);
    }
    if (doomed->atoms != NULL){
      /* nodes of the atom list first, then the atom list header */
      dealloc_atoms(doomed->atoms);
      Free(doomed->atoms);
    }
    Free(doomed);

    doomed = probesets->first;
  }
}
|
657
|
+
|
658
|
+
|
659
|
+
|
660
|
+
/* Release the two parts of a pgf_file (headers and probesets), each only
   if it is present. The pgf_file structure itself is left for the caller
   to release. */
void dealloc_pgf_file(pgf_file* my_pgf){

  /* header information: contents first, then the structure */
  if (my_pgf->headers != NULL){
    dealloc_pgf_headers(my_pgf->headers);
    Free(my_pgf->headers);
  }

  /* probeset list: nodes first, then the list header */
  if (my_pgf->probesets != NULL){
    dealloc_pgf_probesets(my_pgf->probesets);
    Free(my_pgf->probesets);
  }
}
|
676
|
+
|
677
|
+
|
678
|
+
/* Put a pgf_headers structure into its empty state: every pointer NULL
   and both counts 0. Nothing is released, so this is meant for a
   structure that does not own anything yet. */
void initialize_pgf_header(pgf_headers *header){

  /* counted arrays */
  header->n_chip_type = 0;
  header->chip_type = NULL;
  header->n_other_headers = 0;
  header->other_headers_keys = NULL;
  header->other_headers_values = NULL;

  /* single valued string headers */
  header->lib_set_name = NULL;
  header->lib_set_version = NULL;
  header->pgf_format_version = NULL;
  header->create_date = NULL;
  header->guid = NULL;

  /* header strings of the three levels and their parsed column order */
  header->header0_str = NULL;
  header->header0 = NULL;
  header->header1_str = NULL;
  header->header1 = NULL;
  header->header2_str = NULL;
  header->header2 = NULL;
}
|
698
|
+
|
699
|
+
/****************************************************************
|
700
|
+
****************************************************************
|
701
|
+
**
|
702
|
+
** Code for figuring out column ordering
|
703
|
+
**
|
704
|
+
****************************************************************
|
705
|
+
***************************************************************/
|
706
|
+
|
707
|
+
|
708
|
+
/* Work out in which position each known level 0 (probeset) column appears.

   header_str - the column names, separated by tabs (a trailing CR/LF
                is tolerated)
   header0    - receives, for "probeset_id", "type" and "probeset_name",
                the zero based position of that name in header_str, or -1
                when the name does not occur

   header_str itself is left untouched: the split is done on a private
   copy because tokenize modifies the string it is given. */
static void determine_order_header0(char *header_str, header_0 *header0){

  tokenset *cur_tokenset;
  int i;
  char *cur_name;
  char *temp_str = Calloc(strlen(header_str) +1, char);

  strcpy(temp_str,header_str);

  header0->probeset_id = -1;
  header0->type = -1;
  header0->probeset_name = -1;

  cur_tokenset = tokenize(temp_str,"\t\r\n");

  for (i=0; i < tokenset_size(cur_tokenset); i++){
    cur_name = get_token(cur_tokenset,i);
    if (strcmp(cur_name,"probeset_id")==0){
      header0->probeset_id = i;
    } else if (strcmp(cur_name,"type")==0){
      header0->type = i;
    } else if (strcmp(cur_name,"probeset_name")==0){
      /* this branch used to test "type" a second time, so it could never
         be taken and probeset_name always stayed at -1 */
      header0->probeset_name = i;
    }
  }
  delete_tokens(cur_tokenset);

  Free(temp_str);

}
|
737
|
+
|
738
|
+
/* Work out in which position each known level 1 (atom) column appears.

   header_str - the column names, separated by tabs (a trailing CR/LF
                is tolerated)
   header1    - receives, for "atom_id", "type" and "exon_position", the
                zero based position of that name in header_str, or -1
                when the name does not occur

   header_str itself is left untouched: the split is done on a private
   copy because tokenize modifies the string it is given. */
static void determine_order_header1(char *header_str, header_1 *header1){

  char *scratch = Calloc(strlen(header_str) +1, char);
  tokenset *columns;
  char *col_name;
  int col;

  strcpy(scratch,header_str);

  /* start from "not present" for every column */
  header1->atom_id = -1;
  header1->type = -1;
  header1->exon_position = -1;

  columns = tokenize(scratch,"\t\r\n");

  col = 0;
  while (col < tokenset_size(columns)){
    col_name = get_token(columns,col);
    if (strcmp(col_name,"atom_id")==0){
      header1->atom_id = col;
    } else if (strcmp(col_name,"type")==0){
      header1->type = col;
    } else if (strcmp(col_name,"exon_position")==0){
      header1->exon_position = col;
    }
    col++;
  }

  delete_tokens(columns);
  Free(scratch);
}
|
767
|
+
|
768
|
+
/* Work out in which position each known level 2 (probe) column appears.

   header_str - the column names, separated by tabs (a trailing CR/LF
                is tolerated)
   header2    - receives, for "probe_id", "type", "gc_count",
                "probe_length", "interrogation_position" and
                "probe_sequence", the zero based position of that name in
                header_str, or -1 when the name does not occur

   header_str itself is left untouched: the split is done on a private
   copy because tokenize modifies the string it is given. */
static void determine_order_header2(char *header_str, header_2 *header2){

  char *scratch = Calloc(strlen(header_str) +1, char);
  tokenset *columns;
  char *col_name;
  int *target;   /* field of header2 matching the current column name */
  int col;

  strcpy(scratch,header_str);

  /* start from "not present" for every column */
  header2->probe_id = -1;
  header2->type = -1;
  header2->gc_count = -1;
  header2->probe_length = -1;
  header2->interrogation_position = -1;
  header2->probe_sequence = -1;

  columns = tokenize(scratch,"\t\r\n");

  for (col = 0; col < tokenset_size(columns); col++){
    col_name = get_token(columns,col);
    target = NULL;

    if (strcmp(col_name,"probe_id")==0){
      target = &header2->probe_id;
    } else if (strcmp(col_name,"type")==0){
      target = &header2->type;
    } else if (strcmp(col_name,"gc_count")==0){
      target = &header2->gc_count;
    } else if (strcmp(col_name,"probe_length")==0){
      target = &header2->probe_length;
    } else if (strcmp(col_name,"interrogation_position")==0){
      target = &header2->interrogation_position;
    } else if (strcmp(col_name,"probe_sequence")==0){
      target = &header2->probe_sequence;
    }

    /* unrecognised column names are skipped */
    if (target != NULL){
      *target = col;
    }
  }

  delete_tokens(columns);
  Free(scratch);
}
|
807
|
+
|
808
|
+
|
809
|
+
/****************************************************************
|
810
|
+
**
|
811
|
+
** Validate that required headers are present in file.
|
812
|
+
**
|
813
|
+
** Return 0 if an expected header is not present.
|
814
|
+
** Returns 1 otherwise (ie everything looks fine)
|
815
|
+
**
|
816
|
+
***************************************************************/
|
817
|
+
|
818
|
+
/*
 * Check a parsed PGF header for everything the reader depends on.
 *
 * Returns 0 when a required header line was not read, when the format
 * version is not "1.0", or when a required column is missing from the
 * header0/header1/header2 descriptions.  Returns 1 otherwise.
 */
static int validate_pgf_header(pgf_headers *header){

  /* every required header line must have been read */
  if (header->chip_type == NULL
      || header->lib_set_name == NULL
      || header->lib_set_version == NULL
      || header->pgf_format_version == NULL
      || header->header0_str == NULL
      || header->header1_str == NULL
      || header->header2_str == NULL){
    return 0;
  }

  /* 1.0 is the only format version that is accepted */
  if (strcmp(header->pgf_format_version,"1.0") != 0){
    return 0;
  }

  /* each of the three levels must name its id column; level 2 must
     also name its type column */
  if (header->header0->probeset_id == -1
      || header->header1->atom_id == -1
      || header->header2->probe_id == -1
      || header->header2->type == -1){
    return 0;
  }

  return 1;
}
|
866
|
+
|
867
|
+
|
868
|
+
|
869
|
+
|
870
|
+
/****************************************************************
|
871
|
+
****************************************************************
|
872
|
+
**
|
873
|
+
** Code for actually reading from the file
|
874
|
+
**
|
875
|
+
****************************************************************
|
876
|
+
***************************************************************/
|
877
|
+
|
878
|
+
/*
 * Open filename for reading in text mode.
 *
 * Calls error() with a message naming the file when it cannot be opened;
 * otherwise returns the open stream.
 */
static FILE *open_pgf_file(const char *filename){

  FILE *fp = fopen(filename,"r");

  if (!fp){
    error("Could not open file %s", filename);
  }

  return fp;
}
|
890
|
+
|
891
|
+
/****************************************************************
|
892
|
+
**
|
893
|
+
** Reading the header
|
894
|
+
**
|
895
|
+
***************************************************************/
|
896
|
+
|
897
|
+
void read_pgf_header(FILE *cur_file, char *buffer, pgf_headers *header){
|
898
|
+
|
899
|
+
|
900
|
+
tokenset *cur_tokenset;
|
901
|
+
int i;
|
902
|
+
char *temp_str;
|
903
|
+
|
904
|
+
|
905
|
+
initialize_pgf_header(header);
|
906
|
+
do {
|
907
|
+
ReadFileLine(buffer, 1024, cur_file);
|
908
|
+
/* Rprintf("%s\n",buffer); */
|
909
|
+
if (IsHeaderLine(buffer)){
|
910
|
+
cur_tokenset = tokenize(&buffer[2],"=\r\n");
|
911
|
+
/* hopefully token 0 is Key
|
912
|
+
and token 1 is Value */
|
913
|
+
/* Rprintf("Key is: %s\n",get_token(cur_tokenset,0));
|
914
|
+
Rprintf("Value is: %s\n",get_token(cur_tokenset,1)); */
|
915
|
+
/* Decode the Key/Value pair */
|
916
|
+
if (strcmp(get_token(cur_tokenset,0),"chip_type") == 0){
|
917
|
+
if (header->n_chip_type == 0){
|
918
|
+
header->chip_type = Calloc(1, char *);
|
919
|
+
} else {
|
920
|
+
header->chip_type = Realloc(header->chip_type, header->n_chip_type+1, char *);
|
921
|
+
}
|
922
|
+
temp_str = Calloc(strlen(get_token(cur_tokenset,1))+1,char);
|
923
|
+
strcpy(temp_str,get_token(cur_tokenset,1));
|
924
|
+
header->chip_type[header->n_chip_type] = temp_str;
|
925
|
+
header->n_chip_type++;
|
926
|
+
} else if (strcmp(get_token(cur_tokenset,0), "lib_set_name") == 0){
|
927
|
+
temp_str = Calloc(strlen(get_token(cur_tokenset,1)) + 1,char);
|
928
|
+
strcpy(temp_str,get_token(cur_tokenset,1));
|
929
|
+
header->lib_set_name = temp_str;
|
930
|
+
} else if (strcmp(get_token(cur_tokenset,0), "lib_set_version") == 0){
|
931
|
+
temp_str = Calloc(strlen(get_token(cur_tokenset,1)) + 1,char);
|
932
|
+
strcpy(temp_str,get_token(cur_tokenset,1));
|
933
|
+
header->lib_set_version = temp_str;
|
934
|
+
} else if (strcmp(get_token(cur_tokenset,0), "pgf_format_version") == 0) {
|
935
|
+
temp_str = Calloc(strlen(get_token(cur_tokenset,1)) + 1,char);
|
936
|
+
strcpy(temp_str,get_token(cur_tokenset,1));
|
937
|
+
header->pgf_format_version = temp_str;
|
938
|
+
} else if (strcmp(get_token(cur_tokenset,0), "header0") == 0) {
|
939
|
+
temp_str = Calloc(strlen(get_token(cur_tokenset,1)) + 1,char);
|
940
|
+
strcpy(temp_str,get_token(cur_tokenset,1));
|
941
|
+
header->header0_str = temp_str;
|
942
|
+
header->header0 = Calloc(1,header_0);
|
943
|
+
determine_order_header0(header->header0_str,header->header0);
|
944
|
+
} else if (strcmp(get_token(cur_tokenset,0), "header1") == 0) {
|
945
|
+
temp_str = Calloc(strlen(get_token(cur_tokenset,1)) + 1,char);
|
946
|
+
strcpy(temp_str,get_token(cur_tokenset,1));
|
947
|
+
header->header1_str = temp_str;
|
948
|
+
header->header1 = Calloc(1,header_1);
|
949
|
+
determine_order_header1(header->header1_str,header->header1);
|
950
|
+
} else if (strcmp(get_token(cur_tokenset,0), "header2") == 0) {
|
951
|
+
temp_str = Calloc(strlen(get_token(cur_tokenset,1)) + 1,char);
|
952
|
+
strcpy(temp_str,get_token(cur_tokenset,1));
|
953
|
+
header->header2_str = temp_str;
|
954
|
+
header->header2 = Calloc(1,header_2);
|
955
|
+
determine_order_header2(header->header2_str,header->header2);
|
956
|
+
} else if (strcmp(get_token(cur_tokenset,0), "create_date") == 0) {
|
957
|
+
temp_str = Calloc(strlen(get_token(cur_tokenset,1)) + 1,char);
|
958
|
+
strcpy(temp_str,get_token(cur_tokenset,1));
|
959
|
+
header->create_date = temp_str;
|
960
|
+
} else if (strcmp(get_token(cur_tokenset,0), "guid") == 0) {
|
961
|
+
temp_str = Calloc(strlen(get_token(cur_tokenset,1)) + 1,char);
|
962
|
+
strcpy(temp_str,get_token(cur_tokenset,1));
|
963
|
+
header->guid = temp_str;
|
964
|
+
} else {
|
965
|
+
/* not one of the recognised header types */
|
966
|
+
if ( header->n_other_headers == 0){
|
967
|
+
header->other_headers_keys = Calloc(1, char *);
|
968
|
+
header->other_headers_values = Calloc(1, char *);
|
969
|
+
} else {
|
970
|
+
header->other_headers_keys = Realloc(header->other_headers_keys,header->n_other_headers+1, char *);
|
971
|
+
header->other_headers_values = Realloc(header->other_headers_values,header->n_other_headers+1, char *);
|
972
|
+
header->chip_type = Realloc(header->chip_type, header->n_chip_type+1, char *);
|
973
|
+
}
|
974
|
+
temp_str = Calloc(strlen(get_token(cur_tokenset,1)) + 1,char);
|
975
|
+
strcpy(temp_str,get_token(cur_tokenset,1));
|
976
|
+
header->other_headers_values[header->n_other_headers] = temp_str;
|
977
|
+
temp_str = Calloc(strlen(get_token(cur_tokenset,0)) + 1,char);
|
978
|
+
strcpy(temp_str,get_token(cur_tokenset,0));
|
979
|
+
header->other_headers_keys[header->n_other_headers] = temp_str;
|
980
|
+
header->n_other_headers++;
|
981
|
+
|
982
|
+
}
|
983
|
+
|
984
|
+
delete_tokens(cur_tokenset);
|
985
|
+
}
|
986
|
+
} while (IsHeaderLine(buffer));
|
987
|
+
|
988
|
+
}
|
989
|
+
|
990
|
+
|
991
|
+
/****************************************************************
|
992
|
+
**
|
993
|
+
** Reading the probesets/body of the file
|
994
|
+
**
|
995
|
+
***************************************************************/
|
996
|
+
|
997
|
+
/* Reset probeset_list to the empty state: zero probesets and no first,
   current or last node. */
void initialize_probeset_list(probeset_list_header *probeset_list){

  probeset_list->first = NULL;
  probeset_list->current = NULL;
  probeset_list->last = NULL;
  probeset_list->n_probesets = 0;
}
|
1004
|
+
|
1005
|
+
|
1006
|
+
|
1007
|
+
/*
 * Parse one level 2 (probe) line and append it to probe_list.
 *
 * buffer     : the tab separated line.
 * probe_list : list that receives the new node at its end.
 * header2    : column positions; probe_id is always read, the other
 *              fields only when their position is not -1.
 *
 * String fields are stored as newly allocated copies.
 */
void insert_probe(char *buffer, probe_list_header *probe_list, header_2 *header2){

  probe_list_node *node = Calloc(1,probe_list_node);
  tokenset *fields = tokenize(buffer,"\t\r\n");
  probe_list_node *tail;
  const char *text;
  char *copy;

  node->probe_id = atoi(get_token(fields,header2->probe_id));

  if (header2->type != -1){
    text = get_token(fields,header2->type);
    copy = Calloc(strlen(text) + 1,char);
    strcpy(copy,text);
    node->type = copy;
  }
  if (header2->gc_count != -1){
    node->gc_count = atoi(get_token(fields,header2->gc_count));
  }
  if (header2->probe_length != -1){
    node->probe_length = atoi(get_token(fields,header2->probe_length));
  }
  if (header2->interrogation_position != -1){
    node->interrogation_position = atoi(get_token(fields,header2->interrogation_position));
  }
  if (header2->probe_sequence != -1){
    text = get_token(fields,header2->probe_sequence);
    copy = Calloc(strlen(text) + 1,char);
    strcpy(copy,text);
    node->probe_sequence = copy;
  }

  node->next = NULL;

  if (probe_list->n_probes == 0){
    /* first probe of this atom */
    probe_list->first = node;
    probe_list->n_probes = 1;
  } else {
    /* walk to the last node and hang the new one behind it */
    for (tail = probe_list->first; tail->next != NULL; tail = (probe_list_node*)tail->next){
    }
    tail->next = (struct probe_list_node*)node;
    probe_list->n_probes++;
  }

  delete_tokens(fields);
}
|
1056
|
+
|
1057
|
+
|
1058
|
+
/*
 * Handle one level 2 (probe) line: attach the probe to the last atom of
 * the current probeset.
 *
 * Calls error() when no probeset has been read yet, or when the current
 * probeset has no atom list yet.
 */
void insert_level2(char *buffer, probeset_list_header *probeset_list, header_2 *header2){

  atom_list_node *last_atom;

  if (probeset_list->current == NULL){
    /* no level 0 object exists to insert into */
    error("Can not read a level 2 line before seeing a level 0 line. File corrupted?");
  }

  if (probeset_list->current->atoms == NULL){
    /* no level 1 object exists to insert into */
    error("Can not read a level 2 line before seeing a level 1 line. File corrupted?");
  }

  /* find the atom at the end of the current probeset's atom list */
  for (last_atom = probeset_list->current->atoms->first;
       last_atom->next != NULL;
       last_atom = (atom_list_node *)last_atom->next){
  }

  /* the probe list of an atom is created when its first probe arrives */
  if (last_atom->probes == NULL){
    last_atom->probes = Calloc(1,probe_list_header);
  }

  insert_probe(buffer, last_atom->probes, header2);
}
|
1087
|
+
|
1088
|
+
|
1089
|
+
|
1090
|
+
|
1091
|
+
|
1092
|
+
/*
 * Parse one level 1 (atom) line and append it to atoms_list.
 *
 * buffer     : the tab separated line.
 * atoms_list : list that receives the new node at its end.
 * header1    : column positions; atom_id is always read, type and
 *              exon_position only when their position is not -1.
 *
 * String fields are stored as newly allocated copies.  The new atom
 * starts without a probe list.
 */
void insert_atom(char *buffer, atom_list_header *atoms_list, header_1 *header1){

  atom_list_node *node = Calloc(1,atom_list_node);
  tokenset *fields = tokenize(buffer,"\t\r\n");
  atom_list_node *tail;
  const char *text;
  char *copy;

  node->atom_id = atoi(get_token(fields,header1->atom_id));

  if (header1->type != -1){
    text = get_token(fields,header1->type);
    copy = Calloc(strlen(text) + 1,char);
    strcpy(copy,text);
    node->type = copy;
  }
  if (header1->exon_position != -1){
    text = get_token(fields,header1->exon_position);
    copy = Calloc(strlen(text) + 1,char);
    strcpy(copy,text);
    node->exon_position = copy;
  }
  node->probes = NULL;
  node->next = NULL;

  if (atoms_list->n_atoms == 0){
    /* first atom of this probeset */
    atoms_list->first = node;
    atoms_list->n_atoms = 1;
  } else {
    /* walk to the last node and hang the new one behind it */
    for (tail = (atom_list_node*)atoms_list->first; tail->next != NULL; tail = (atom_list_node*)tail->next){
    }
    tail->next = (struct atom_list_node*)node;
    atoms_list->n_atoms++;
  }

  delete_tokens(fields);
}
|
1132
|
+
|
1133
|
+
|
1134
|
+
/*
 * Handle one level 1 (atom) line: add the atom to the current probeset,
 * creating that probeset's atom list on first use.
 *
 * Calls error() when no probeset has been read yet.
 */
void insert_level1(char *buffer, probeset_list_header *probeset_list, header_1 *header1){

  probeset_list_node *owner = probeset_list->current;

  if (owner == NULL){
    /* no level 0 object exists to insert into */
    error("Can not read a level 1 line before seeing a level 0 line. File corrupted?");
  }

  if (!owner->atoms){
    owner->atoms = Calloc(1,atom_list_header);
  }

  insert_atom(buffer, owner->atoms, header1);
}
|
1159
|
+
|
1160
|
+
|
1161
|
+
|
1162
|
+
|
1163
|
+
/*
 * Parse one level 0 (probeset) line and append it to probeset_list.
 *
 * buffer        : the tab separated line.
 * probeset_list : list that receives the new node; the node becomes both
 *                 the last and the current element.
 * header0       : column positions; probeset_id is always read, type and
 *                 probeset_name only when their position is not -1.
 *
 * String fields are stored as newly allocated copies.  The new probeset
 * starts without an atom list.
 */
void insert_level0(char *buffer, probeset_list_header *probeset_list, header_0 *header0){

  probeset_list_node *node = Calloc(1,probeset_list_node);
  tokenset *fields = tokenize(buffer,"\t\r\n");
  const char *text;
  char *copy;

  node->probeset_id = atoi(get_token(fields,header0->probeset_id));

  if (header0->type != -1){
    text = get_token(fields,header0->type);
    copy = Calloc(strlen(text) + 1,char);
    strcpy(copy,text);
    node->type = copy;
  }
  if (header0->probeset_name != -1){
    text = get_token(fields,header0->probeset_name);
    copy = Calloc(strlen(text) + 1,char);
    strcpy(copy,text);
    node->probeset_name = copy;
  }
  node->atoms = NULL;
  node->next = NULL;

  if (probeset_list->first == NULL){
    /* list was empty */
    probeset_list->first = node;
    probeset_list->n_probesets = 1;
  } else {
    /* link behind the present last element */
    probeset_list->last->next = (struct probeset_list_node *)node;
    probeset_list->n_probesets++;
  }
  probeset_list->last = node;
  probeset_list->current = node;

  delete_tokens(fields);
}
|
1201
|
+
|
1202
|
+
|
1203
|
+
/*
 * Read the body of a PGF file into probeset_list.
 *
 * On entry buffer is treated as already holding the first probeset
 * (level 0) line; every following line is read from cur_file and
 * dispatched by its level.  Comment lines are ignored.
 */
void read_pgf_probesets(FILE *cur_file, char *buffer, probeset_list_header *probeset_list, pgf_headers *header){

  initialize_probeset_list(probeset_list);

  /* the line that is already in buffer is taken as a probeset line */
  insert_level0(buffer, probeset_list, header->header0);

  while (ReadFileLine(buffer, 1024, cur_file)){
    /* level 2 is tested before level 1, as in the original ordering */
    if (IsLevel2(buffer)){
      insert_level2(buffer, probeset_list, header->header2);
      continue;
    }
    if (IsLevel1(buffer)){
      insert_level1(buffer, probeset_list, header->header1);
      continue;
    }
    if (IsCommentLine(buffer)){
      continue;
    }
    insert_level0(buffer, probeset_list, header->header0);
  }
}
|
1221
|
+
|
1222
|
+
/****************************************************************
|
1223
|
+
****************************************************************
|
1224
|
+
**
|
1225
|
+
** Functionality for counting probeset types
|
1226
|
+
**
|
1227
|
+
****************************************************************
|
1228
|
+
****************************************************************/
|
1229
|
+
|
1230
|
+
/* One entry of the probeset type tally: a type name and how many
   probesets were seen with it. */
typedef struct {
  char *type;   /* type name */
  int count;    /* number of probesets counted for this type */
} probeset_type_list;
|
1234
|
+
|
1235
|
+
|
1236
|
+
|
1237
|
+
/*
 * Tally the probesets of my_pgf by their type string.
 *
 * my_pgf : parsed PGF file.  Its probeset list's "current" pointer is
 *          used as the traversal cursor and is left on the last probeset.
 * number : receives the number of distinct types found; 0 when there are
 *          no probesets (previously it was left unwritten in that case).
 *
 * Probesets without a type are counted under the name "none".
 *
 * Returns a Calloc'ed array with *number filled entries, in order of first
 * appearance.  An array is returned even when *number is 0.  The caller
 * releases it with dealloc_probeset_type_list().
 */
probeset_type_list *pgf_count_probeset_types(pgf_file *my_pgf, int *number){

  probeset_type_list *my_type_list = Calloc(1,probeset_type_list);
  const char *cur_type;
  int n;

  /* always hand a defined count back to the caller */
  *number = 0;

  if (my_pgf->probesets == NULL || my_pgf->probesets->first == NULL){
    return my_type_list;
  }

  /* traverse the probesets, each time examining the probeset type */
  my_pgf->probesets->current = my_pgf->probesets->first;
  for (;;){
    if (my_pgf->probesets->current->type == NULL){
      cur_type = "none";
    } else {
      cur_type = my_pgf->probesets->current->type;
    }

    /* look for this type among the ones seen so far */
    n = 0;
    while (n < *number){
      if (strcmp(cur_type,my_type_list[n].type) == 0){
        break;
      }
      n++;
    }

    if (n == *number){
      /* new type: extend the array and start its count at 1 */
      my_type_list = Realloc(my_type_list,(n+1),probeset_type_list);
      my_type_list[n].type = Calloc(strlen(cur_type) + 1,char);
      strcpy(my_type_list[n].type,cur_type);
      my_type_list[n].count = 1;
      *number = *number + 1;
    } else {
      my_type_list[n].count++;
    }

    if (my_pgf->probesets->current->next == NULL){
      break;
    }
    my_pgf->probesets->current= my_pgf->probesets->current->next;
  }

  return my_type_list;
}
|
1292
|
+
|
1293
|
+
|
1294
|
+
/* Release a type tally: the type string of each of the first length
   entries, then the array itself. */
void dealloc_probeset_type_list(probeset_type_list *my_type_list, int length){

  int idx = 0;

  while (idx < length){
    Free(my_type_list[idx].type);
    idx++;
  }

  Free(my_type_list);
}
|
1305
|
+
|
1306
|
+
/****************************************************************
|
1307
|
+
****************************************************************
|
1308
|
+
**
|
1309
|
+
** Functionality for testing the parsers (from R .C interface)
|
1310
|
+
**
|
1311
|
+
****************************************************************
|
1312
|
+
****************************************************************/
|
1313
|
+
|
1314
|
+
void read_pgf_file(char **filename){
|
1315
|
+
|
1316
|
+
FILE *cur_file;
|
1317
|
+
pgf_file my_pgf;
|
1318
|
+
char *buffer = Calloc(1024, char);
|
1319
|
+
probeset_type_list *my_probeset_types;
|
1320
|
+
int ntypes;
|
1321
|
+
|
1322
|
+
cur_file = open_pgf_file(filename[0]);
|
1323
|
+
|
1324
|
+
my_pgf.headers = Calloc(1, pgf_headers);
|
1325
|
+
my_pgf.probesets = Calloc(1, probeset_list_header);
|
1326
|
+
|
1327
|
+
read_pgf_header(cur_file,buffer,my_pgf.headers);
|
1328
|
+
if (validate_pgf_header(my_pgf.headers)){
|
1329
|
+
read_pgf_probesets(cur_file, buffer, my_pgf.probesets, my_pgf.headers);
|
1330
|
+
my_probeset_types = pgf_count_probeset_types(&my_pgf, &ntypes);
|
1331
|
+
dealloc_probeset_type_list(my_probeset_types, ntypes);
|
1332
|
+
}
|
1333
|
+
Free(buffer);
|
1334
|
+
dealloc_pgf_file(&my_pgf);
|
1335
|
+
fclose(cur_file);
|
1336
|
+
|
1337
|
+
}
|