bio-affy 0.1.0.alpha.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.rspec +1 -0
- data/Gemfile +15 -0
- data/Gemfile.lock +32 -0
- data/LICENSE.txt +20 -0
- data/README.rdoc +33 -0
- data/Rakefile +77 -0
- data/VERSION +1 -0
- data/bin/bio-affy +80 -0
- data/bio-affy.gemspec +128 -0
- data/ext/DESCRIPTION +11 -0
- data/ext/HISTORY +3 -0
- data/ext/LICENSE +456 -0
- data/ext/NAMESPACE +2 -0
- data/ext/R/check.cdf.type.R +18 -0
- data/ext/R/read.cdffile.list.R +23 -0
- data/ext/R/read.celfile.R +11 -0
- data/ext/R/read.celfile.header.R +37 -0
- data/ext/R/read.probematrices.R +29 -0
- data/ext/README_BIOLIB +36 -0
- data/ext/aclocal.m4 +32 -0
- data/ext/configure +4898 -0
- data/ext/configure.in +51 -0
- data/ext/man/check.cdf.type.Rd +22 -0
- data/ext/man/read.cdffile.list.Rd +20 -0
- data/ext/man/read.celfile.Rd +23 -0
- data/ext/man/read.celfile.header.Rd +22 -0
- data/ext/man/read.celfile.probeintensity.matrices.Rd +31 -0
- data/ext/src/CMakeLists.txt +39 -0
- data/ext/src/Makevars.in +3 -0
- data/ext/src/Makevars.win +2 -0
- data/ext/src/Rakefile +43 -0
- data/ext/src/biolib_affyio.c +416 -0
- data/ext/src/biolib_affyio.h +132 -0
- data/ext/src/biolib_affyio.o +0 -0
- data/ext/src/fread_functions.c +871 -0
- data/ext/src/fread_functions.h +60 -0
- data/ext/src/fread_functions.o +0 -0
- data/ext/src/libaffyext.so +0 -0
- data/ext/src/mkrf.log +11 -0
- data/ext/src/mkrf_conf.rb +6 -0
- data/ext/src/read_abatch.c +5484 -0
- data/ext/src/read_abatch.h +63 -0
- data/ext/src/read_abatch.o +0 -0
- data/ext/src/read_bpmap.c +888 -0
- data/ext/src/read_bpmap.o +0 -0
- data/ext/src/read_cdf.h +347 -0
- data/ext/src/read_cdf_xda.c +1342 -0
- data/ext/src/read_cdf_xda.o +0 -0
- data/ext/src/read_cdffile2.c +1576 -0
- data/ext/src/read_cdffile2.o +0 -0
- data/ext/src/read_celfile_generic.c +2061 -0
- data/ext/src/read_celfile_generic.h +33 -0
- data/ext/src/read_celfile_generic.o +0 -0
- data/ext/src/read_clf.c +870 -0
- data/ext/src/read_clf.o +0 -0
- data/ext/src/read_generic.c +1446 -0
- data/ext/src/read_generic.h +144 -0
- data/ext/src/read_generic.o +0 -0
- data/ext/src/read_pgf.c +1337 -0
- data/ext/src/read_pgf.o +0 -0
- data/lib/bio-affy.rb +5 -0
- data/lib/bio/affy.rb +7 -0
- data/lib/bio/affyext.rb +23 -0
- data/lib/bio/libaffyext.so +0 -0
- data/spec/bio-affy_spec.rb +22 -0
- data/spec/spec_helper.rb +13 -0
- data/test/data/affy/GSM103328.CEL.gz +0 -0
- data/test/data/affy/GSM103329.CEL.gz +0 -0
- data/test/data/affy/GSM103330.CEL.gz +0 -0
- data/test/data/affy/MG_U74Av2.CDF.gz +0 -0
- metadata +190 -0
@@ -0,0 +1,33 @@
|
|
1
|
+
#ifndef READ_CELFILE_GENERIC_H
#define READ_CELFILE_GENERIC_H

#ifdef BIOLIB
#include <biolib_R_map.h>
#endif

/* Supplies detailed_header_info, used by the header query prototypes below. */
#include "read_abatch.h"

/*
 * Entry points for "generic" CEL files.
 * NOTE(review): semantics are inferred from the names only; the definitions
 * live in read_celfile_generic.c and are not visible from this header.
 */
int isGenericCelFile(const char *filename);
char *generic_get_header_info(const char *filename, int *dim1, int *dim2);
void generic_get_detailed_header_info(const char *filename,
                                      detailed_header_info *header_info);
int read_genericcel_file_intensities(const char *filename, double *intensity,
                                     int chip_num, int rows, int cols,
                                     int chip_dim_rows);
int check_generic_cel_file(const char *filename, const char *ref_cdfName,
                           int ref_dim_1, int ref_dim_2);
int read_genericcel_file_stddev(const char *filename, double *intensity,
                                int chip_num, int rows, int cols,
                                int chip_dim_rows);
int read_genericcel_file_npixels(const char *filename, double *intensity,
                                 int chip_num, int rows, int cols,
                                 int chip_dim_rows);
void generic_get_masks_outliers(const char *filename,
                                int *nmasks, short **masks_x, short **masks_y,
                                int *noutliers, short **outliers_x,
                                short **outliers_y);
void generic_apply_masks(const char *filename, double *intensity,
                         int chip_num, int rows, int cols, int chip_dim_rows,
                         int rm_mask, int rm_outliers);

/*
 * "gz"-prefixed counterparts with parallel signatures.
 * NOTE(review): presumably these read gzip-compressed files - confirm against
 * the implementation.
 */
int isgzGenericCelFile(const char *filename);
char *gzgeneric_get_header_info(const char *filename, int *dim1, int *dim2);
void gzgeneric_get_detailed_header_info(const char *filename,
                                        detailed_header_info *header_info);
int gzread_genericcel_file_intensities(const char *filename, double *intensity,
                                       int chip_num, int rows, int cols,
                                       int chip_dim_rows);
int check_gzgeneric_cel_file(const char *filename, const char *ref_cdfName,
                             int ref_dim_1, int ref_dim_2);
int gzread_genericcel_file_stddev(const char *filename, double *intensity,
                                  int chip_num, int rows, int cols,
                                  int chip_dim_rows);
int gzread_genericcel_file_npixels(const char *filename, double *intensity,
                                   int chip_num, int rows, int cols,
                                   int chip_dim_rows);
void gzgeneric_get_masks_outliers(const char *filename,
                                  int *nmasks, short **masks_x, short **masks_y,
                                  int *noutliers, short **outliers_x,
                                  short **outliers_y);
void gzgeneric_apply_masks(const char *filename, double *intensity,
                           int chip_num, int rows, int cols, int chip_dim_rows,
                           int rm_mask, int rm_outliers);

#endif
|
Binary file
|
data/ext/src/read_clf.c
ADDED
@@ -0,0 +1,870 @@
|
|
1
|
+
/******************************************************************
|
2
|
+
**
|
3
|
+
** file: read_clf.c
|
4
|
+
**
|
5
|
+
** Aim: implement parsing of CLF format files
|
6
|
+
**
|
7
|
+
** Copyright (C) 2007-2008 B. M. Bolstad
|
8
|
+
**
|
9
|
+
** Created on Nov 4, 2007
|
10
|
+
**
|
11
|
+
** History
|
12
|
+
** Dec 14, 2007 - Initial version
|
13
|
+
** Dec 31, 2007 - Add function for checking that required headers were found
|
14
|
+
** Jan 2, 2008 - port x,y to probe_id and probe_id to x,y functions from RMAExpress parsers
|
15
|
+
** Mar 18, 2008 - fix error in read_clf_header function
|
16
|
+
**
|
17
|
+
**
|
18
|
+
**
|
19
|
+
******************************************************************/
|
20
|
+
|
21
|
+
#include <R.h>
|
22
|
+
|
23
|
+
#include <stdio.h>
|
24
|
+
#include <stdlib.h>
|
25
|
+
|
26
|
+
|
27
|
+
#define BUFFERSIZE 1024
|
28
|
+
|
29
|
+
/*******************************************************************
|
30
|
+
*******************************************************************
|
31
|
+
**
|
32
|
+
** Structures for dealing with clf file information
|
33
|
+
**
|
34
|
+
**
|
35
|
+
**
|
36
|
+
*******************************************************************
|
37
|
+
******************************************************************/
|
38
|
+
|
39
|
+
/*******************************************************************
|
40
|
+
*******************************************************************
|
41
|
+
**
|
42
|
+
** Starting off with the headers
|
43
|
+
**
|
44
|
+
*******************************************************************
|
45
|
+
******************************************************************/
|
46
|
+
|
47
|
+
/* integer (from 0 to n-1) indicates position of header (-1 means header is not present) */
|
48
|
+
|
49
|
+
/* Column layout of the data section, decoded from the "header0" header line.
   Each member is the zero-based column position of that field, or -1 when
   the field was not listed. */
typedef struct{
  int probe_id;   /* column holding the probe id */
  int x;          /* column holding the x coordinate */
  int y;          /* column holding the y coordinate */
} header_0;
|
54
|
+
|
55
|
+
/*******************************************************************
|
56
|
+
**
|
57
|
+
** These are all the headers that appear in CLF files
|
58
|
+
**
|
59
|
+
** Note that some are required (chip_type, lib_set_name, lib_set_version, clf_format_version
|
60
|
+
** rows, cols, header0)
|
61
|
+
** While others are optional (sequential, order, create_date, guid and others)
|
62
|
+
**
|
63
|
+
**
|
64
|
+
*******************************************************************/
|
65
|
+
|
66
|
+
typedef struct{
|
67
|
+
char **chip_type;
|
68
|
+
int n_chip_type;
|
69
|
+
char *lib_set_name;
|
70
|
+
char *lib_set_version;
|
71
|
+
char *clf_format_version;
|
72
|
+
int rows;
|
73
|
+
int cols;
|
74
|
+
char *header0_str;
|
75
|
+
header_0 *header0;
|
76
|
+
int sequential;
|
77
|
+
char *order;
|
78
|
+
char *create_date;
|
79
|
+
char *guid;
|
80
|
+
char **other_headers_keys;
|
81
|
+
char **other_headers_values;
|
82
|
+
int n_other_headers;
|
83
|
+
} clf_headers;
|
84
|
+
|
85
|
+
/*******************************************************************
|
86
|
+
*******************************************************************
|
87
|
+
**
|
88
|
+
** Now the actual data
|
89
|
+
**
|
90
|
+
** (only store the probeset ids to save space)
|
91
|
+
**
|
92
|
+
** length of probe_id is rows*cols.
|
93
|
+
**
|
94
|
+
** Given an x, y it maps to probe_id[index]
|
95
|
+
**
|
96
|
+
** index = y*cols + x
|
97
|
+
**
|
98
|
+
** Which means that given an index, it maps to
|
99
|
+
**
|
100
|
+
** x = index % cols where % means modulo (ie remainder)
|
101
|
+
** y = index / cols
|
102
|
+
**
|
103
|
+
**
|
104
|
+
**
|
105
|
+
*******************************************************************
|
106
|
+
******************************************************************/
|
107
|
+
|
108
|
+
|
109
|
+
/* Data section of a CLF file: only the probe ids are kept.
   probe_id has rows*cols entries and the id for (x, y) sits at index
   y*cols + x. It is NULL when the file is flagged sequential and the table
   was therefore not read. */
typedef struct{
  int *probe_id;
} clf_data;
|
112
|
+
|
113
|
+
|
114
|
+
/*******************************************************************
|
115
|
+
*******************************************************************
|
116
|
+
**
|
117
|
+
** Structure for storing clf file (after it is read from file)
|
118
|
+
**
|
119
|
+
*******************************************************************
|
120
|
+
******************************************************************/
|
121
|
+
|
122
|
+
|
123
|
+
typedef struct{
|
124
|
+
clf_headers *headers;
|
125
|
+
clf_data *data;
|
126
|
+
} clf_file;
|
127
|
+
|
128
|
+
|
129
|
+
|
130
|
+
/*******************************************************************
|
131
|
+
*******************************************************************
|
132
|
+
**
|
133
|
+
**
|
134
|
+
** Code for splitting a string into a series of tokens
|
135
|
+
**
|
136
|
+
**
|
137
|
+
*******************************************************************
|
138
|
+
*******************************************************************/
|
139
|
+
|
140
|
+
|
141
|
+
/***************************************************************
|
142
|
+
**
|
143
|
+
** tokenset
|
144
|
+
**
|
145
|
+
** char **tokens - a array of token strings
|
146
|
+
** int n - number of tokens in this set.
|
147
|
+
**
|
148
|
+
** a structure to hold a set of tokens. Typically a tokenset is
|
149
|
+
** created by breaking a character string based upon a set of
|
150
|
+
** delimiters.
|
151
|
+
**
|
152
|
+
**
|
153
|
+
**************************************************************/
|
154
|
+
|
155
|
+
/* Result of splitting a string: n individually allocated token strings. */
typedef struct{
  char **tokens;   /* array of n NUL-terminated token copies */
  int n;           /* number of tokens held */
} tokenset;
|
159
|
+
|
160
|
+
|
161
|
+
|
162
|
+
/******************************************************************
|
163
|
+
**
|
164
|
+
** tokenset *tokenize(char *str, char *delimiters)
|
165
|
+
**
|
166
|
+
** char *str - a string to break into tokens
|
167
|
+
** char *delimiters - delimiters to use in breaking up the line
|
168
|
+
**
|
169
|
+
**
|
170
|
+
** RETURNS a new tokenset
|
171
|
+
**
|
172
|
+
** Given a string, split into tokens based on a set of delimiters
|
173
|
+
**
|
174
|
+
*****************************************************************/
|
175
|
+
|
176
|
+
static tokenset *tokenize(char *str, char *delimiters){
|
177
|
+
|
178
|
+
#if USE_PTHREADS
|
179
|
+
char *tmp_pointer;
|
180
|
+
#endif
|
181
|
+
int i=0;
|
182
|
+
|
183
|
+
char *current_token;
|
184
|
+
tokenset *my_tokenset = Calloc(1,tokenset);
|
185
|
+
my_tokenset->n=0;
|
186
|
+
|
187
|
+
my_tokenset->tokens = NULL;
|
188
|
+
#if USE_PTHREADS
|
189
|
+
current_token = strtok_r(str,delimiters,&tmp_pointer);
|
190
|
+
#else
|
191
|
+
current_token = strtok(str,delimiters);
|
192
|
+
#endif
|
193
|
+
while (current_token != NULL){
|
194
|
+
my_tokenset->n++;
|
195
|
+
my_tokenset->tokens = Realloc(my_tokenset->tokens,my_tokenset->n,char*);
|
196
|
+
my_tokenset->tokens[i] = Calloc(strlen(current_token)+1,char);
|
197
|
+
strcpy(my_tokenset->tokens[i],current_token);
|
198
|
+
my_tokenset->tokens[i][(strlen(current_token))] = '\0';
|
199
|
+
i++;
|
200
|
+
#if USE_PTHREADS
|
201
|
+
current_token = strtok_r(NULL,delimiters,&tmp_pointer);
|
202
|
+
#else
|
203
|
+
current_token = strtok(NULL,delimiters);
|
204
|
+
#endif
|
205
|
+
}
|
206
|
+
return my_tokenset;
|
207
|
+
}
|
208
|
+
|
209
|
+
|
210
|
+
/******************************************************************
|
211
|
+
**
|
212
|
+
** int tokenset_size(tokenset *x)
|
213
|
+
**
|
214
|
+
** tokenset *x - a tokenset
|
215
|
+
**
|
216
|
+
** RETURNS the number of tokens in the tokenset
|
217
|
+
**
|
218
|
+
******************************************************************/
|
219
|
+
|
220
|
+
/* Number of tokens stored in the set x. */
static int tokenset_size(tokenset *x){
  const tokenset *set = x;

  return set->n;
}
|
223
|
+
|
224
|
+
|
225
|
+
/******************************************************************
|
226
|
+
**
|
227
|
+
** char *get_token(tokenset *x, int i)
|
228
|
+
**
|
229
|
+
** tokenset *x - a tokenset
|
230
|
+
** int i - index of the token to return
|
231
|
+
**
|
232
|
+
** RETURNS pointer to the i'th token
|
233
|
+
**
|
234
|
+
******************************************************************/
|
235
|
+
|
236
|
+
/* Pointer to the i'th token of x (zero based). No range check is made; the
   string remains owned by the token set. */
static char *get_token(tokenset *x,int i){
  char **entry = x->tokens + i;

  return *entry;
}
|
239
|
+
|
240
|
+
/******************************************************************
|
241
|
+
**
|
242
|
+
** void delete_tokens(tokenset *x)
|
243
|
+
**
|
244
|
+
** tokenset *x - a tokenset
|
245
|
+
**
|
246
|
+
** Deallocates all the space allocated for a tokenset
|
247
|
+
**
|
248
|
+
******************************************************************/
|
249
|
+
|
250
|
+
/* Release a token set: every token string, the pointer array, and the set
   structure itself. */
static void delete_tokens(tokenset *x){

  int remaining = x->n;
  char **slot = x->tokens;

  while (remaining > 0){
    Free(*slot);
    slot++;
    remaining--;
  }

  Free(x->tokens);
  Free(x);
}
|
260
|
+
|
261
|
+
/*******************************************************************
|
262
|
+
**
|
263
|
+
** int token_ends_with(char *token, char *ends_in)
|
264
|
+
**
|
265
|
+
** char *token - a string to check
|
266
|
+
** char *ends_in - we are looking for this string at the end of token
|
267
|
+
**
|
268
|
+
**
|
269
|
+
** returns 0 if no match, otherwise it returns the index of the first character
|
270
|
+
** which matches the start of *ends_in.
|
271
|
+
**
|
272
|
+
** Note that there must be one additional character in "token" beyond
|
273
|
+
** the characters in "ends". So
|
274
|
+
**
|
275
|
+
** *token = "TestStr"
|
276
|
+
** *ends = "TestStr"
|
277
|
+
**
|
278
|
+
** would return 0 but if
|
279
|
+
**
|
280
|
+
** ends = "estStr"
|
281
|
+
**
|
282
|
+
** we would return 1.
|
283
|
+
**
|
284
|
+
** and if
|
285
|
+
**
|
286
|
+
** ends= "stStr"
|
287
|
+
** we would return 2 .....etc
|
288
|
+
**
|
289
|
+
**
|
290
|
+
******************************************************************/
|
291
|
+
|
292
|
+
/* Test whether token finishes with the text ends_in.
   Returns the offset in token at which the suffix begins, or 0 when there is
   no match. The token must be strictly longer than the suffix, so a token
   equal to ends_in yields 0. */
static int token_ends_with(char *token, char *ends_in){

  int token_len = strlen(token);
  int suffix_len = strlen(ends_in);
  int offset = token_len - suffix_len;

  /* nothing precedes the suffix: token is too short to qualify */
  if (offset <= 0){
    return 0;
  }

  return (strcmp(token + offset, ends_in) == 0) ? offset : 0;
}
|
314
|
+
|
315
|
+
|
316
|
+
/*******************************************************************
|
317
|
+
*******************************************************************
|
318
|
+
**
|
319
|
+
** Code for Reading from file
|
320
|
+
**
|
321
|
+
*******************************************************************
|
322
|
+
*******************************************************************/
|
323
|
+
|
324
|
+
|
325
|
+
|
326
|
+
/****************************************************************
|
327
|
+
**
|
328
|
+
** void ReadFileLine(char *buffer, int buffersize, FILE *currentFile)
|
329
|
+
**
|
330
|
+
** char *buffer - place to store contents of the line
|
331
|
+
** int buffersize - size of the buffer
|
332
|
+
** FILE *currentFile - FILE pointer to an opened CEL file.
|
333
|
+
**
|
334
|
+
** Read a line from a file, into a buffer of specified size.
|
335
|
+
** otherwise die.
|
336
|
+
**
|
337
|
+
***************************************************************/
|
338
|
+
|
339
|
+
/* Read the next line of currentFile into buffer (at most buffersize-1
   characters, via fgets).
   Returns 1 when something was read and 0 when fgets returned NULL (end of
   file or read error); on 0 the buffer is left as fgets left it. */
static int ReadFileLine(char *buffer, int buffersize, FILE *currentFile){
  char *got = fgets(buffer, buffersize, currentFile);

  return (got != NULL) ? 1 : 0;
}
|
346
|
+
|
347
|
+
|
348
|
+
/****************************************************************
|
349
|
+
****************************************************************
|
350
|
+
**
|
351
|
+
** Code for identifying what type of information is stored in
|
352
|
+
** the current line
|
353
|
+
**
|
354
|
+
****************************************************************
|
355
|
+
***************************************************************/
|
356
|
+
|
357
|
+
/****************************************************************
|
358
|
+
**
|
359
|
+
** static int IsHeaderLine(char *buffer)
|
360
|
+
**
|
361
|
+
** char *buffer - contains line to evaluate
|
362
|
+
**
|
363
|
+
** Checks whether supplied line is a header line (ie starts with #%)
|
364
|
+
**
|
365
|
+
** return 1 (ie true) if header line. 0 otherwise
|
366
|
+
**
|
367
|
+
***************************************************************/
|
368
|
+
|
369
|
+
|
370
|
+
/* Returns 1 when buffer begins with the two characters "#%" (a CLF header
   line) and 0 otherwise. */
static int IsHeaderLine(char *buffer){

  /* && short-circuits, so buffer[1] is only inspected when buffer[0] is '#' */
  return (buffer[0] == '#' && buffer[1] == '%') ? 1 : 0;
}
|
377
|
+
|
378
|
+
/****************************************************************
|
379
|
+
**
|
380
|
+
** static int IsCommentLine(char *buffer)
|
381
|
+
**
|
382
|
+
** char *buffer - contains line to evaluate
|
383
|
+
**
|
384
|
+
** Checks whether supplied line is a comment line (ie starts with #)
|
385
|
+
**
|
386
|
+
**
|
387
|
+
***************************************************************/
|
388
|
+
|
389
|
+
/* Returns 1 when buffer begins with '#' and 0 otherwise. Header lines
   ("#%...") also start with '#', so they are reported as comments too. */
static int IsCommentLine(char *buffer){
  return (buffer[0] == '#') ? 1 : 0;
}
|
395
|
+
|
396
|
+
/****************************************************************
|
397
|
+
**
|
398
|
+
** void initialize_clf_header(clf_headers *header)
|
399
|
+
**
|
400
|
+
** Initialize all the header values
|
401
|
+
**
|
402
|
+
**
|
403
|
+
**
|
404
|
+
***************************************************************/
|
405
|
+
|
406
|
+
/* Put every member of header into its "nothing read yet" state:
   pointers are NULL, counters are 0, and the numeric headers rows, cols and
   sequential are -1 (the value the readers and validators test for).

   Fix: the final statement used to reset n_other_headers to -1, which made
   read_clf_header store the first unrecognised header at index -1, and it
   left sequential uninitialised even though the readers test
   "sequential > -1". The -1 sentinel belongs to sequential. */
void initialize_clf_header(clf_headers *header){

  header->chip_type = NULL;
  header->n_chip_type = 0;

  header->lib_set_name = NULL;
  header->lib_set_version = NULL;
  header->clf_format_version = NULL;
  header->header0_str = NULL;
  header->header0 = NULL;
  header->order = NULL;
  header->create_date = NULL;
  header->guid = NULL;

  header->other_headers_keys = NULL;
  header->other_headers_values = NULL;
  header->n_other_headers = 0;

  header->rows = -1;
  header->cols = -1;
  header->sequential = -1;   /* -1 means no "sequential" header was seen */

}
|
428
|
+
|
429
|
+
|
430
|
+
/****************************************************************
|
431
|
+
****************************************************************
|
432
|
+
**
|
433
|
+
** Code for reading in clf header
|
434
|
+
**
|
435
|
+
****************************************************************
|
436
|
+
***************************************************************/
|
437
|
+
|
438
|
+
/* Decode the header0 text (tab separated column names) into column
   positions. Each of probe_id, x and y receives the zero-based position of
   the column with that name, or stays -1 when no such column is listed.
   header_str itself is not modified: tokenizing works on a private copy. */
static void determine_order_header0(char *header_str, header_0 *header0){

  tokenset *columns;
  char *name;
  int col, n_columns;
  char *scratch = Calloc(strlen(header_str) + 1, char);

  strcpy(scratch, header_str);

  header0->probe_id = -1;
  header0->x = -1;
  header0->y = -1;

  columns = tokenize(scratch, "\t\r\n");
  n_columns = tokenset_size(columns);

  for (col = 0; col < n_columns; col++){
    name = get_token(columns, col);
    if (!strcmp(name, "probe_id")){
      header0->probe_id = col;
    } else if (!strcmp(name, "x")){
      header0->x = col;
    } else if (!strcmp(name, "y")){
      header0->y = col;
    }
  }

  delete_tokens(columns);
  Free(scratch);
}
|
467
|
+
|
468
|
+
/****************************************************************
|
469
|
+
**
|
470
|
+
** Validate that required headers are present in file.
|
471
|
+
**
|
472
|
+
** Return 0 if an expected header is not present.
|
473
|
+
** Returns 1 otherwise (ie everything looks fine)
|
474
|
+
**
|
475
|
+
***************************************************************/
|
476
|
+
|
477
|
+
/* Check that a header block read by read_clf_header is usable.
   Returns 1 when all required headers are present and sane, 0 otherwise.

   Required: chip_type, lib_set_name, lib_set_version, clf_format_version
   (must be exactly "1.0"), rows, cols and header0 with probe_id, x and y
   columns.

   rows and cols must be strictly positive. Testing only for the -1 "unset"
   marker let 0 or negative values through (for instance atoi on a
   non-numeric value), which later sizes the probe id table and is used as a
   divisor. */
static int validate_clf_header(clf_headers *header){

  /* required string headers */
  if (header->chip_type == NULL
      || header->lib_set_name == NULL
      || header->lib_set_version == NULL
      || header->clf_format_version == NULL
      || header->header0_str == NULL){
    return 0;
  }

  /* grid dimensions: unset (-1), zero and negative are all unusable */
  if (header->rows <= 0 || header->cols <= 0){
    return 0;
  }

  /* Check that format version is 1.0 (only supported version) */
  if (strcmp(header->clf_format_version, "1.0") != 0){
    return 0;
  }

  /* header0 must have been decoded and must name all three columns */
  if (header->header0 == NULL){
    return 0;
  }
  if (header->header0->probe_id < 0
      || header->header0->x < 0
      || header->header0->y < 0){
    return 0;
  }

  return 1;
}
|
522
|
+
|
523
|
+
/****************************************************************
|
524
|
+
**
|
525
|
+
** static FILE *open_clf_file(const char *filename)
|
526
|
+
**
|
527
|
+
** Open the CLF to begin reading from it.
|
528
|
+
**
|
529
|
+
***************************************************************/
|
530
|
+
|
531
|
+
/* Open the CLF file called filename for reading in text mode.
   Raises an R error (via error()) when the file cannot be opened; otherwise
   returns the open stream, which the caller must fclose. */
static FILE *open_clf_file(const char *filename){

  FILE *handle = fopen(filename, "r");

  if (!handle){
    error("Could not open file %s", filename);
  }

  return handle;
}
|
543
|
+
|
544
|
+
/****************************************************************
|
545
|
+
**
|
546
|
+
** void read_clf_header(FILE *cur_file, char *buffer, clf_headers *header)
|
547
|
+
**
|
548
|
+
** read the CLF header section
|
549
|
+
**
|
550
|
+
**
|
551
|
+
***************************************************************/
|
552
|
+
|
553
|
+
void read_clf_header(FILE *cur_file, char *buffer, clf_headers *header){
|
554
|
+
|
555
|
+
|
556
|
+
tokenset *cur_tokenset;
|
557
|
+
int i;
|
558
|
+
char *temp_str;
|
559
|
+
|
560
|
+
|
561
|
+
initialize_clf_header(header);
|
562
|
+
do {
|
563
|
+
ReadFileLine(buffer, 1024, cur_file);
|
564
|
+
/* Rprintf("%s\n",buffer); */
|
565
|
+
if (IsHeaderLine(buffer)){
|
566
|
+
cur_tokenset = tokenize(&buffer[2],"=\r\n");
|
567
|
+
/* hopefully token 0 is Key
|
568
|
+
and token 1 is Value */
|
569
|
+
/* Rprintf("Key is: %s\n",get_token(cur_tokenset,0));
|
570
|
+
Rprintf("Value is: %s\n",get_token(cur_tokenset,1)); */
|
571
|
+
/* Decode the Key/Value pair */
|
572
|
+
if (strcmp(get_token(cur_tokenset,0),"chip_type") == 0){
|
573
|
+
if (header->n_chip_type == 0){
|
574
|
+
header->chip_type = Calloc(1, char *);
|
575
|
+
} else {
|
576
|
+
header->chip_type = Realloc(header->chip_type, header->n_chip_type+1, char *);
|
577
|
+
}
|
578
|
+
temp_str = Calloc(strlen(get_token(cur_tokenset,1))+1,char);
|
579
|
+
strcpy(temp_str,get_token(cur_tokenset,1));
|
580
|
+
header->chip_type[header->n_chip_type] = temp_str;
|
581
|
+
header->n_chip_type++;
|
582
|
+
} else if (strcmp(get_token(cur_tokenset,0), "lib_set_name") == 0){
|
583
|
+
temp_str = Calloc(strlen(get_token(cur_tokenset,1)) + 1,char);
|
584
|
+
strcpy(temp_str,get_token(cur_tokenset,1));
|
585
|
+
header->lib_set_name = temp_str;
|
586
|
+
} else if (strcmp(get_token(cur_tokenset,0), "lib_set_version") == 0){
|
587
|
+
temp_str = Calloc(strlen(get_token(cur_tokenset,1)) + 1,char);
|
588
|
+
strcpy(temp_str,get_token(cur_tokenset,1));
|
589
|
+
header->lib_set_version = temp_str;
|
590
|
+
} else if (strcmp(get_token(cur_tokenset,0), "clf_format_version") == 0) {
|
591
|
+
temp_str = Calloc(strlen(get_token(cur_tokenset,1)) + 1,char);
|
592
|
+
strcpy(temp_str,get_token(cur_tokenset,1));
|
593
|
+
header->clf_format_version = temp_str;
|
594
|
+
} else if (strcmp(get_token(cur_tokenset,0), "rows") == 0) {
|
595
|
+
header->rows = atoi(get_token(cur_tokenset,1));
|
596
|
+
} else if (strcmp(get_token(cur_tokenset,0), "cols") == 0) {
|
597
|
+
header->cols = atoi(get_token(cur_tokenset,1));
|
598
|
+
} else if (strcmp(get_token(cur_tokenset,0), "header0") == 0) {
|
599
|
+
temp_str = Calloc(strlen(get_token(cur_tokenset,1)) + 1,char);
|
600
|
+
strcpy(temp_str,get_token(cur_tokenset,1));
|
601
|
+
header->header0_str = temp_str;
|
602
|
+
header->header0 = Calloc(1,header_0);
|
603
|
+
determine_order_header0(header->header0_str,header->header0);
|
604
|
+
} else if (strcmp(get_token(cur_tokenset,0), "create_date") == 0) {
|
605
|
+
temp_str = Calloc(strlen(get_token(cur_tokenset,1)) + 1,char);
|
606
|
+
strcpy(temp_str,get_token(cur_tokenset,1));
|
607
|
+
header->create_date = temp_str;
|
608
|
+
} else if (strcmp(get_token(cur_tokenset,0), "order") == 0) {
|
609
|
+
temp_str = Calloc(strlen(get_token(cur_tokenset,1)) + 1,char);
|
610
|
+
strcpy(temp_str,get_token(cur_tokenset,1));
|
611
|
+
header->order = temp_str;
|
612
|
+
} else if (strcmp(get_token(cur_tokenset,0), "sequential") == 0) {
|
613
|
+
header->sequential = atoi(get_token(cur_tokenset,1));
|
614
|
+
} else if (strcmp(get_token(cur_tokenset,0), "guid") == 0) {
|
615
|
+
temp_str = Calloc(strlen(get_token(cur_tokenset,1)) + 1,char);
|
616
|
+
strcpy(temp_str,get_token(cur_tokenset,1));
|
617
|
+
header->guid = temp_str;
|
618
|
+
} else {
|
619
|
+
/* not one of the recognised header types */
|
620
|
+
if ( header->n_other_headers == 0){
|
621
|
+
header->other_headers_keys = Calloc(1, char *);
|
622
|
+
header->other_headers_values = Calloc(1, char *);
|
623
|
+
} else {
|
624
|
+
header->other_headers_keys = Realloc(header->other_headers_keys,header->n_other_headers+1, char *);
|
625
|
+
header->other_headers_values = Realloc(header->other_headers_values,header->n_other_headers+1, char *);
|
626
|
+
header->chip_type = Realloc(header->chip_type, header->n_chip_type+1, char *);
|
627
|
+
}
|
628
|
+
temp_str = Calloc(strlen(get_token(cur_tokenset,1)) + 1,char);
|
629
|
+
strcpy(temp_str,get_token(cur_tokenset,1));
|
630
|
+
header->other_headers_values[header->n_other_headers] = temp_str;
|
631
|
+
temp_str = Calloc(strlen(get_token(cur_tokenset,0)) + 1,char);
|
632
|
+
strcpy(temp_str,get_token(cur_tokenset,0));
|
633
|
+
header->other_headers_keys[header->n_other_headers] = temp_str;
|
634
|
+
header->n_other_headers++;
|
635
|
+
|
636
|
+
}
|
637
|
+
|
638
|
+
delete_tokens(cur_tokenset);
|
639
|
+
}
|
640
|
+
} while (IsHeaderLine(buffer));
|
641
|
+
|
642
|
+
}
|
643
|
+
|
644
|
+
/****************************************************************
|
645
|
+
**
|
646
|
+
** void read_clf_data(FILE *cur_file, char *buffer, clf_data *data, clf_headers *header)
|
647
|
+
**
|
648
|
+
** Read in the data part of the file. Specifically, the x,y, probe_id section.
|
649
|
+
** Note to save space only the probe_id are stored.
|
650
|
+
**
|
651
|
+
****************************************************************/
|
652
|
+
|
653
|
+
/* Parse one tab separated data line and record its probe id in the table.
   The line is ignored when it is a comment, when it has too few columns
   for the positions named by header0, or when its x/y fall outside the
   cols x rows grid (which would otherwise write outside the table). */
static void clf_store_data_line(char *line, clf_data *data, clf_headers *header){

  tokenset *fields;
  header_0 *layout = header->header0;
  int n_fields, cur_id, x, y;

  if (IsCommentLine(line)){
    return;
  }

  /* tokenize modifies line in place */
  fields = tokenize(line,"\t\r\n");
  n_fields = tokenset_size(fields);

  if (layout->probe_id >= 0 && layout->probe_id < n_fields
      && layout->x >= 0 && layout->x < n_fields
      && layout->y >= 0 && layout->y < n_fields){

    cur_id = atoi(get_token(fields,layout->probe_id));
    x = atoi(get_token(fields,layout->x));
    y = atoi(get_token(fields,layout->y));

    if (x >= 0 && x < header->cols && y >= 0 && y < header->rows){
      data->probe_id[y*header->cols + x] = cur_id;
    }
  }

  delete_tokens(fields);
}

/* Read the data section (probe_id, x, y rows) of a CLF file.

   cur_file - stream positioned just after the first data line
   buffer   - line buffer of at least BUFFERSIZE characters; on entry it
              already holds the first data line (left there by
              read_clf_header)
   data     - receives the probe id table
   header   - headers read earlier; supplies rows, cols, sequential and the
              column layout

   To save space only the probe ids are stored, at index y*cols + x of a
   zero-initialised rows*cols table. When the header marks the ids as
   sequential (sequential > -1) the ids are deterministic, nothing is read
   and data->probe_id is NULL. It is also left NULL when the header lacks a
   usable layout or grid size.

   Changes from the previous version: malformed rows can no longer read past
   the token array or write outside the table, and comment lines are
   skipped. */
void read_clf_data(FILE *cur_file, char *buffer, clf_data *data, clf_headers *header){

  data->probe_id = NULL;

  /* sequential ids can be computed, so the rest of the file is not needed */
  if (header->sequential > -1){
    return;
  }

  if (header->header0 == NULL || header->rows <= 0 || header->cols <= 0){
    return;
  }

  data->probe_id = Calloc((size_t)header->rows * (size_t)header->cols, int);

  /* the first data line is already in buffer, hence do/while */
  do {
    clf_store_data_line(buffer, data, header);
  } while (ReadFileLine(buffer, BUFFERSIZE, cur_file));
}
|
684
|
+
|
685
|
+
|
686
|
+
|
687
|
+
|
688
|
+
|
689
|
+
|
690
|
+
/****************************************************************
|
691
|
+
****************************************************************
|
692
|
+
**
|
693
|
+
** Code for deallocating or initializing header data structures
|
694
|
+
**
|
695
|
+
****************************************************************
|
696
|
+
****************************************************************/
|
697
|
+
|
698
|
+
/* Release everything a clf_headers structure owns: the chip_type list, all
   single-valued string headers, the header0 text with its decoded layout,
   and the key/value lists of unrecognised headers. The structure itself is
   not freed. */
void dealloc_clf_headers(clf_headers *header){

  /* single-valued string headers, handled uniformly below */
  char **text_fields[] = {
    &header->lib_set_name,
    &header->lib_set_version,
    &header->clf_format_version,
    &header->order,
    &header->create_date,
    &header->guid
  };
  const int n_text_fields = (int)(sizeof(text_fields)/sizeof(text_fields[0]));
  int k;

  if (header->n_chip_type > 0){
    for (k = 0; k < header->n_chip_type; k++){
      Free(header->chip_type[k]);
    }
    Free(header->chip_type);
  }

  for (k = 0; k < n_text_fields; k++){
    if (*text_fields[k] != NULL){
      Free(*text_fields[k]);
    }
  }

  /* the decoded layout is only released together with its header0 text */
  if (header->header0_str != NULL){
    Free(header->header0_str);
    Free(header->header0);
  }

  if (header->n_other_headers > 0){
    for (k = 0; k < header->n_other_headers; k++){
      Free(header->other_headers_keys[k]);
      Free(header->other_headers_values[k]);
    }
    Free(header->other_headers_keys);
    Free(header->other_headers_values);
  }
}
|
746
|
+
|
747
|
+
|
748
|
+
/* Release the probe id table held by data, if there is one. The structure
   itself is not freed. */
void dealloc_clf_data(clf_data *data){
  if (!data->probe_id){
    return;
  }
  Free(data->probe_id);
}
|
753
|
+
|
754
|
+
|
755
|
+
/* Release the header block and data table of a parsed CLF file, including
   the two structures themselves. my_clf itself is not freed (the caller may
   keep it on the stack). */
void dealloc_clf_file(clf_file* my_clf){

  clf_headers *hdr = my_clf->headers;
  clf_data *tbl = my_clf->data;

  if (hdr != NULL){
    dealloc_clf_headers(hdr);
    Free(my_clf->headers);
  }

  if (tbl != NULL){
    dealloc_clf_data(tbl);
    Free(my_clf->data);
  }
}
|
771
|
+
|
772
|
+
/**********************************************************************
|
773
|
+
***
|
774
|
+
*** A function for getting the probe_id for a given x,y
|
775
|
+
***
|
776
|
+
***
|
777
|
+
*********************************************************************/
|
778
|
+
|
779
|
+
/* Look up the probe id at grid position (x, y).

   clf      - parsed CLF file
   probe_id - receives the id, or -1 when it cannot be determined
   x, y     - grid coordinates

   Sequential files: the id is computed from the coordinates, the grid size
   and the starting id according to the "order" header; an unknown or
   missing order gives -1.
   Other files: the id is read from the table at index y*cols + x, the same
   layout read_clf_data fills. The previous code indexed with y*rows + x,
   which returns the wrong entry on any non-square grid and can read past
   the table. Coordinates outside the grid, or a missing table, give -1. */
void clf_get_probe_id(clf_file *clf, int *probe_id, int x, int y){

  clf_headers *hdr = clf->headers;

  if (hdr->sequential > -1){
    /* Check if order is "col_major" or "row_major" */
    if (hdr->order == NULL){
      *probe_id = -1;   /* ie missing */
    } else if (strcmp(hdr->order,"col_major") == 0){
      *probe_id = y*hdr->cols + x + hdr->sequential;
    } else if (strcmp(hdr->order,"row_major") == 0){
      *probe_id = x*hdr->rows + y + hdr->sequential;
    } else {
      *probe_id = -1;   /* ie missing */
    }
    return;
  }

  if (clf->data == NULL || clf->data->probe_id == NULL
      || x < 0 || x >= hdr->cols
      || y < 0 || y >= hdr->rows){
    *probe_id = -1;
    return;
  }

  *probe_id = clf->data->probe_id[y*hdr->cols + x];
}
|
797
|
+
|
798
|
+
/**********************************************************************
|
799
|
+
***
|
800
|
+
*** A function for getting the x , y for a given probe_id
|
801
|
+
***
|
802
|
+
***
|
803
|
+
*********************************************************************/
|
804
|
+
|
805
|
+
/* Find the grid position of a probe id.

   clf      - parsed CLF file
   probe_id - id to locate
   x, y     - receive the coordinates, or -1/-1 when the id is not found

   Sequential files: the position is computed from the offset of probe_id
   from the starting id, according to the "order" header; an unknown or
   missing order, or a non-positive grid size, gives -1/-1.
   Other files: the table is searched linearly and the matching index is
   decoded with x = index % cols, y = index / cols, which is the inverse of
   the y*cols + x layout used when the table is filled. The previous code
   decoded with rows, swapping and corrupting the coordinates. */
void clf_get_x_y(clf_file *clf, int probe_id, int *x, int *y){

  clf_headers *hdr = clf->headers;
  int ind;
  int n_cells;

  /* default answer: missing */
  *x = -1;
  *y = -1;

  if (hdr->sequential > -1){
    /* Check if order is "col_major" or "row_major" */
    if (hdr->order == NULL){
      return;
    }
    ind = probe_id - hdr->sequential;
    if (strcmp(hdr->order,"col_major") == 0){
      if (hdr->cols > 0){
        *x = ind%hdr->cols;
        *y = ind/hdr->cols;
      }
    } else if (strcmp(hdr->order,"row_major") == 0){
      if (hdr->rows > 0){
        *x = ind/hdr->rows;
        *y = ind%hdr->rows;
      }
    }
    return;
  }

  if (clf->data == NULL || clf->data->probe_id == NULL
      || hdr->cols <= 0 || hdr->rows <= 0){
    return;
  }

  /* Linear Search (this should be improved for routine use) */
  n_cells = hdr->cols*hdr->rows;
  for (ind = 0; ind < n_cells; ind++){
    if (clf->data->probe_id[ind] == probe_id){
      *x = ind%hdr->cols;
      *y = ind/hdr->cols;
      return;
    }
  }
}
|
842
|
+
|
843
|
+
/*
|
844
|
+
* Note this function is only for testing purposes. It provides no methodology for accessing anything
|
845
|
+
* stored in the CLF file in R.
|
846
|
+
*
|
847
|
+
*/
|
848
|
+
|
849
|
+
void read_clf_file(char **filename){
|
850
|
+
|
851
|
+
FILE *cur_file;
|
852
|
+
clf_file my_clf;
|
853
|
+
char *buffer = Calloc(1024, char);
|
854
|
+
|
855
|
+
|
856
|
+
|
857
|
+
cur_file = open_clf_file(filename[0]);
|
858
|
+
|
859
|
+
my_clf.headers = Calloc(1, clf_headers);
|
860
|
+
my_clf.data = Calloc(1, clf_data);
|
861
|
+
|
862
|
+
read_clf_header(cur_file,buffer,my_clf.headers);
|
863
|
+
if (validate_clf_header(my_clf.headers))
|
864
|
+
read_clf_data(cur_file, buffer, my_clf.data, my_clf.headers);
|
865
|
+
|
866
|
+
Free(buffer);
|
867
|
+
dealloc_clf_file(&my_clf);
|
868
|
+
fclose(cur_file);
|
869
|
+
|
870
|
+
}
|