bio-affy 0.1.0.alpha.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.rspec +1 -0
- data/Gemfile +15 -0
- data/Gemfile.lock +32 -0
- data/LICENSE.txt +20 -0
- data/README.rdoc +33 -0
- data/Rakefile +77 -0
- data/VERSION +1 -0
- data/bin/bio-affy +80 -0
- data/bio-affy.gemspec +128 -0
- data/ext/DESCRIPTION +11 -0
- data/ext/HISTORY +3 -0
- data/ext/LICENSE +456 -0
- data/ext/NAMESPACE +2 -0
- data/ext/R/check.cdf.type.R +18 -0
- data/ext/R/read.cdffile.list.R +23 -0
- data/ext/R/read.celfile.R +11 -0
- data/ext/R/read.celfile.header.R +37 -0
- data/ext/R/read.probematrices.R +29 -0
- data/ext/README_BIOLIB +36 -0
- data/ext/aclocal.m4 +32 -0
- data/ext/configure +4898 -0
- data/ext/configure.in +51 -0
- data/ext/man/check.cdf.type.Rd +22 -0
- data/ext/man/read.cdffile.list.Rd +20 -0
- data/ext/man/read.celfile.Rd +23 -0
- data/ext/man/read.celfile.header.Rd +22 -0
- data/ext/man/read.celfile.probeintensity.matrices.Rd +31 -0
- data/ext/src/CMakeLists.txt +39 -0
- data/ext/src/Makevars.in +3 -0
- data/ext/src/Makevars.win +2 -0
- data/ext/src/Rakefile +43 -0
- data/ext/src/biolib_affyio.c +416 -0
- data/ext/src/biolib_affyio.h +132 -0
- data/ext/src/biolib_affyio.o +0 -0
- data/ext/src/fread_functions.c +871 -0
- data/ext/src/fread_functions.h +60 -0
- data/ext/src/fread_functions.o +0 -0
- data/ext/src/libaffyext.so +0 -0
- data/ext/src/mkrf.log +11 -0
- data/ext/src/mkrf_conf.rb +6 -0
- data/ext/src/read_abatch.c +5484 -0
- data/ext/src/read_abatch.h +63 -0
- data/ext/src/read_abatch.o +0 -0
- data/ext/src/read_bpmap.c +888 -0
- data/ext/src/read_bpmap.o +0 -0
- data/ext/src/read_cdf.h +347 -0
- data/ext/src/read_cdf_xda.c +1342 -0
- data/ext/src/read_cdf_xda.o +0 -0
- data/ext/src/read_cdffile2.c +1576 -0
- data/ext/src/read_cdffile2.o +0 -0
- data/ext/src/read_celfile_generic.c +2061 -0
- data/ext/src/read_celfile_generic.h +33 -0
- data/ext/src/read_celfile_generic.o +0 -0
- data/ext/src/read_clf.c +870 -0
- data/ext/src/read_clf.o +0 -0
- data/ext/src/read_generic.c +1446 -0
- data/ext/src/read_generic.h +144 -0
- data/ext/src/read_generic.o +0 -0
- data/ext/src/read_pgf.c +1337 -0
- data/ext/src/read_pgf.o +0 -0
- data/lib/bio-affy.rb +5 -0
- data/lib/bio/affy.rb +7 -0
- data/lib/bio/affyext.rb +23 -0
- data/lib/bio/libaffyext.so +0 -0
- data/spec/bio-affy_spec.rb +22 -0
- data/spec/spec_helper.rb +13 -0
- data/test/data/affy/GSM103328.CEL.gz +0 -0
- data/test/data/affy/GSM103329.CEL.gz +0 -0
- data/test/data/affy/GSM103330.CEL.gz +0 -0
- data/test/data/affy/MG_U74Av2.CDF.gz +0 -0
- metadata +190 -0
@@ -0,0 +1,33 @@
|
|
1
|
+
#ifndef READ_CELFILE_GENERIC_H
|
2
|
+
#define READ_CELFILE_GENERIC_H
|
3
|
+
|
4
|
+
#ifdef BIOLIB
|
5
|
+
#include <biolib_R_map.h>
|
6
|
+
#endif
|
7
|
+
|
8
|
+
#include "read_abatch.h"
|
9
|
+
|
10
|
+
int isGenericCelFile(const char *filename);
|
11
|
+
char *generic_get_header_info(const char *filename, int *dim1, int *dim2);
|
12
|
+
void generic_get_detailed_header_info(const char *filename, detailed_header_info *header_info);
|
13
|
+
int read_genericcel_file_intensities(const char *filename, double *intensity, int chip_num, int rows, int cols,int chip_dim_rows);
|
14
|
+
int check_generic_cel_file(const char *filename, const char *ref_cdfName, int ref_dim_1, int ref_dim_2);
|
15
|
+
int read_genericcel_file_stddev(const char *filename, double *intensity, int chip_num, int rows, int cols,int chip_dim_rows);
|
16
|
+
int read_genericcel_file_npixels(const char *filename, double *intensity, int chip_num, int rows, int cols,int chip_dim_rows);
|
17
|
+
void generic_get_masks_outliers(const char *filename, int *nmasks, short **masks_x, short **masks_y, int *noutliers, short **outliers_x, short **outliers_y);
|
18
|
+
void generic_apply_masks(const char *filename, double *intensity, int chip_num, int rows, int cols,int chip_dim_rows, int rm_mask, int rm_outliers);
|
19
|
+
|
20
|
+
|
21
|
+
|
22
|
+
int isgzGenericCelFile(const char *filename);
|
23
|
+
char *gzgeneric_get_header_info(const char *filename, int *dim1, int *dim2);
|
24
|
+
void gzgeneric_get_detailed_header_info(const char *filename, detailed_header_info *header_info);
|
25
|
+
int gzread_genericcel_file_intensities(const char *filename, double *intensity, int chip_num, int rows, int cols,int chip_dim_rows);
|
26
|
+
int check_gzgeneric_cel_file(const char *filename, const char *ref_cdfName, int ref_dim_1, int ref_dim_2);
|
27
|
+
int gzread_genericcel_file_stddev(const char *filename, double *intensity, int chip_num, int rows, int cols,int chip_dim_rows);
|
28
|
+
int gzread_genericcel_file_npixels(const char *filename, double *intensity, int chip_num, int rows, int cols,int chip_dim_rows);
|
29
|
+
void gzgeneric_get_masks_outliers(const char *filename, int *nmasks, short **masks_x, short **masks_y, int *noutliers, short **outliers_x, short **outliers_y);
|
30
|
+
void gzgeneric_apply_masks(const char *filename, double *intensity, int chip_num, int rows, int cols,int chip_dim_rows, int rm_mask, int rm_outliers);
|
31
|
+
|
32
|
+
|
33
|
+
#endif
|
Binary file
|
data/ext/src/read_clf.c
ADDED
@@ -0,0 +1,870 @@
|
|
1
|
+
/******************************************************************
|
2
|
+
**
|
3
|
+
** file: read_clf.c
|
4
|
+
**
|
5
|
+
** Aim: implement parsing of CLF format files
|
6
|
+
**
|
7
|
+
** Copyright (C) 2007-2008 B. M. Bolstad
|
8
|
+
**
|
9
|
+
** Created on Nov 4, 2007
|
10
|
+
**
|
11
|
+
** History
|
12
|
+
** Dec 14, 2007 - Initial version
|
13
|
+
** Dec 31, 2007 - Add function for checking that required headers were found
|
14
|
+
** Jan 2, 2008 - port x,y to probe_id and probe_id to x,y functions from RMAExpress parsers
|
15
|
+
** Mar 18, 2008 - fix error in read_clf_header function
|
16
|
+
**
|
17
|
+
**
|
18
|
+
**
|
19
|
+
******************************************************************/
|
20
|
+
|
21
|
+
#include <R.h>
|
22
|
+
|
23
|
+
#include <stdio.h>
|
24
|
+
#include <stdlib.h>
|
25
|
+
|
26
|
+
|
27
|
+
#define BUFFERSIZE 1024
|
28
|
+
|
29
|
+
/*******************************************************************
|
30
|
+
*******************************************************************
|
31
|
+
**
|
32
|
+
** Structures for dealing with clf file information
|
33
|
+
**
|
34
|
+
**
|
35
|
+
**
|
36
|
+
*******************************************************************
|
37
|
+
******************************************************************/
|
38
|
+
|
39
|
+
/*******************************************************************
|
40
|
+
*******************************************************************
|
41
|
+
**
|
42
|
+
** Starting off with the headers
|
43
|
+
**
|
44
|
+
*******************************************************************
|
45
|
+
******************************************************************/
|
46
|
+
|
47
|
+
/* integer (from 0 to n-1) indicates position of header (-1 means header is not present) */
|
48
|
+
|
49
|
+
/*
 * Column layout of the CLF data section.
 *
 * Each member is the 0-based position of the named column within a
 * tab-delimited data row; -1 means the column was not listed in header0.
 */
typedef struct{
  int probe_id;   /* position of the "probe_id" column */
  int x;          /* position of the "x" column */
  int y;          /* position of the "y" column */
} header_0;
|
54
|
+
|
55
|
+
/*******************************************************************
|
56
|
+
**
|
57
|
+
** These are all the headers that appear in CLF files
|
58
|
+
**
|
59
|
+
** Note that some are required (chip_type, lib_set_name, lib_set_version, clf_format_version
|
60
|
+
** rows, cols, header0)
|
61
|
+
** While others are optional (sequential, order, create_date, guid and others)
|
62
|
+
**
|
63
|
+
**
|
64
|
+
*******************************************************************/
|
65
|
+
|
66
|
+
typedef struct{
|
67
|
+
char **chip_type;
|
68
|
+
int n_chip_type;
|
69
|
+
char *lib_set_name;
|
70
|
+
char *lib_set_version;
|
71
|
+
char *clf_format_version;
|
72
|
+
int rows;
|
73
|
+
int cols;
|
74
|
+
char *header0_str;
|
75
|
+
header_0 *header0;
|
76
|
+
int sequential;
|
77
|
+
char *order;
|
78
|
+
char *create_date;
|
79
|
+
char *guid;
|
80
|
+
char **other_headers_keys;
|
81
|
+
char **other_headers_values;
|
82
|
+
int n_other_headers;
|
83
|
+
} clf_headers;
|
84
|
+
|
85
|
+
/*******************************************************************
|
86
|
+
*******************************************************************
|
87
|
+
**
|
88
|
+
** Now the actual data
|
89
|
+
**
|
90
|
+
** (only store the probeset ids to save space)
|
91
|
+
**
|
92
|
+
** length of probe_id is rows*cols.
|
93
|
+
**
|
94
|
+
** Given an x, y it maps to probe_id[index]
|
95
|
+
**
|
96
|
+
** index = y*cols + x
|
97
|
+
**
|
98
|
+
** Which means that given an index, it maps to
|
99
|
+
**
|
100
|
+
** x = index % cols where % means modulo (ie remainder)
|
101
|
+
** y = index / cols
|
102
|
+
**
|
103
|
+
**
|
104
|
+
**
|
105
|
+
*******************************************************************
|
106
|
+
******************************************************************/
|
107
|
+
|
108
|
+
|
109
|
+
/*
 * Data section of a CLF file.  Only the probe ids are kept; the id for
 * position (x, y) is stored at probe_id[y*cols + x].
 */
typedef struct{
  int *probe_id;   /* rows*cols entries, or NULL when no table was read */
} clf_data;
|
112
|
+
|
113
|
+
|
114
|
+
/*******************************************************************
|
115
|
+
*******************************************************************
|
116
|
+
**
|
117
|
+
** Structure for storing clf file (after it is read from file)
|
118
|
+
**
|
119
|
+
*******************************************************************
|
120
|
+
******************************************************************/
|
121
|
+
|
122
|
+
|
123
|
+
typedef struct{
|
124
|
+
clf_headers *headers;
|
125
|
+
clf_data *data;
|
126
|
+
} clf_file;
|
127
|
+
|
128
|
+
|
129
|
+
|
130
|
+
/*******************************************************************
|
131
|
+
*******************************************************************
|
132
|
+
**
|
133
|
+
**
|
134
|
+
** Code for splitting a string into a series of tokens
|
135
|
+
**
|
136
|
+
**
|
137
|
+
*******************************************************************
|
138
|
+
*******************************************************************/
|
139
|
+
|
140
|
+
|
141
|
+
/***************************************************************
|
142
|
+
**
|
143
|
+
** tokenset
|
144
|
+
**
|
145
|
+
** char **tokens - a array of token strings
|
146
|
+
** int n - number of tokens in this set.
|
147
|
+
**
|
148
|
+
** a structure to hold a set of tokens. Typically a tokenset is
|
149
|
+
** created by breaking a character string based upon a set of
|
150
|
+
** delimiters.
|
151
|
+
**
|
152
|
+
**
|
153
|
+
**************************************************************/
|
154
|
+
|
155
|
+
/*
 * A list of token strings, typically produced by splitting one line of
 * text on a set of delimiter characters.
 */
typedef struct{
  char **tokens;   /* array of n NUL-terminated token strings */
  int n;           /* number of tokens in the array */
} tokenset;
|
159
|
+
|
160
|
+
|
161
|
+
|
162
|
+
/******************************************************************
|
163
|
+
**
|
164
|
+
** tokenset *tokenize(char *str, char *delimiters)
|
165
|
+
**
|
166
|
+
** char *str - a string to break into tokens
|
167
|
+
** char *delimiters - delimiters to use in breaking up the line
|
168
|
+
**
|
169
|
+
**
|
170
|
+
** RETURNS a new tokenset
|
171
|
+
**
|
172
|
+
** Given a string, split it into tokens based on a set of delimiters
|
173
|
+
**
|
174
|
+
*****************************************************************/
|
175
|
+
|
176
|
+
static tokenset *tokenize(char *str, char *delimiters){
|
177
|
+
|
178
|
+
#if USE_PTHREADS
|
179
|
+
char *tmp_pointer;
|
180
|
+
#endif
|
181
|
+
int i=0;
|
182
|
+
|
183
|
+
char *current_token;
|
184
|
+
tokenset *my_tokenset = Calloc(1,tokenset);
|
185
|
+
my_tokenset->n=0;
|
186
|
+
|
187
|
+
my_tokenset->tokens = NULL;
|
188
|
+
#if USE_PTHREADS
|
189
|
+
current_token = strtok_r(str,delimiters,&tmp_pointer);
|
190
|
+
#else
|
191
|
+
current_token = strtok(str,delimiters);
|
192
|
+
#endif
|
193
|
+
while (current_token != NULL){
|
194
|
+
my_tokenset->n++;
|
195
|
+
my_tokenset->tokens = Realloc(my_tokenset->tokens,my_tokenset->n,char*);
|
196
|
+
my_tokenset->tokens[i] = Calloc(strlen(current_token)+1,char);
|
197
|
+
strcpy(my_tokenset->tokens[i],current_token);
|
198
|
+
my_tokenset->tokens[i][(strlen(current_token))] = '\0';
|
199
|
+
i++;
|
200
|
+
#if USE_PTHREADS
|
201
|
+
current_token = strtok_r(NULL,delimiters,&tmp_pointer);
|
202
|
+
#else
|
203
|
+
current_token = strtok(NULL,delimiters);
|
204
|
+
#endif
|
205
|
+
}
|
206
|
+
return my_tokenset;
|
207
|
+
}
|
208
|
+
|
209
|
+
|
210
|
+
/******************************************************************
|
211
|
+
**
|
212
|
+
** int tokenset_size(tokenset *x)
|
213
|
+
**
|
214
|
+
** tokenset *x - a tokenset
|
215
|
+
**
|
216
|
+
** RETURNS the number of tokens in the tokenset
|
217
|
+
**
|
218
|
+
******************************************************************/
|
219
|
+
|
220
|
+
/* Report how many tokens the tokenset x holds. */
static int tokenset_size(tokenset *x){
  int count = x->n;

  return count;
}
|
223
|
+
|
224
|
+
|
225
|
+
/******************************************************************
|
226
|
+
**
|
227
|
+
** char *get_token(tokenset *x, int i)
|
228
|
+
**
|
229
|
+
** tokenset *x - a tokenset
|
230
|
+
** int i - index of the token to return
|
231
|
+
**
|
232
|
+
** RETURNS pointer to the i'th token
|
233
|
+
**
|
234
|
+
******************************************************************/
|
235
|
+
|
236
|
+
/*
 * Return a pointer to token number i (0-based) of the tokenset x.
 * No range check is made; the caller must ensure 0 <= i < x->n.
 * The string remains owned by the tokenset.
 */
static char *get_token(tokenset *x,int i){
  char **list = x->tokens;

  return list[i];
}
|
239
|
+
|
240
|
+
/******************************************************************
|
241
|
+
**
|
242
|
+
** void delete_tokens(tokenset *x)
|
243
|
+
**
|
244
|
+
** tokenset *x - a tokenset
|
245
|
+
**
|
246
|
+
** Deallocates all the space allocated for a tokenset
|
247
|
+
**
|
248
|
+
******************************************************************/
|
249
|
+
|
250
|
+
/*
 * Release a tokenset: every token string, then the pointer array,
 * then the tokenset structure itself.
 */
static void delete_tokens(tokenset *x){
  int pos = 0;

  while (pos < x->n){
    Free(x->tokens[pos]);
    pos++;
  }

  Free(x->tokens);
  Free(x);
}
|
260
|
+
|
261
|
+
/*******************************************************************
|
262
|
+
**
|
263
|
+
** int token_ends_with(char *token, char *ends_in)
|
264
|
+
**
|
265
|
+
** char *token - a string to check
|
266
|
+
** char *ends_in - we are looking for this string at the end of token
|
267
|
+
**
|
268
|
+
**
|
269
|
+
** returns 0 if no match, otherwise it returns the index of the first character
|
270
|
+
** which matches the start of *ends_in.
|
271
|
+
**
|
272
|
+
** Note that there must be one additional character in "token" beyond
|
273
|
+
** the characters in "ends". So
|
274
|
+
**
|
275
|
+
** *token = "TestStr"
|
276
|
+
** *ends = "TestStr"
|
277
|
+
**
|
278
|
+
** would return 0 but if
|
279
|
+
**
|
280
|
+
** ends = "estStr"
|
281
|
+
**
|
282
|
+
** we would return 1.
|
283
|
+
**
|
284
|
+
** and if
|
285
|
+
**
|
286
|
+
** ends= "stStr"
|
287
|
+
** we would return 2 .....etc
|
288
|
+
**
|
289
|
+
**
|
290
|
+
******************************************************************/
|
291
|
+
|
292
|
+
/*
 * Test whether token ends with the string ends_in.
 *
 * Returns the index in token at which the matching suffix starts, or 0
 * when there is no match.  token must be strictly longer than ends_in,
 * so a token equal to ends_in also yields 0.
 */
static int token_ends_with(char *token, char *ends_in){
  int token_len = strlen(token);
  int suffix_len = strlen(ends_in);
  int offset = token_len - suffix_len;

  /* token is not longer than the suffix: cannot match */
  if (offset <= 0){
    return 0;
  }

  return (strcmp(token + offset,ends_in) == 0) ? offset : 0;
}
|
314
|
+
|
315
|
+
|
316
|
+
/*******************************************************************
|
317
|
+
*******************************************************************
|
318
|
+
**
|
319
|
+
** Code for Reading from file
|
320
|
+
**
|
321
|
+
*******************************************************************
|
322
|
+
*******************************************************************/
|
323
|
+
|
324
|
+
|
325
|
+
|
326
|
+
/****************************************************************
|
327
|
+
**
|
328
|
+
** int ReadFileLine(char *buffer, int buffersize, FILE *currentFile)
|
329
|
+
**
|
330
|
+
** char *buffer - place to store contents of the line
|
331
|
+
** int buffersize - size of the buffer
|
332
|
+
** FILE *currentFile - FILE pointer to an opened CLF file.
|
333
|
+
**
|
334
|
+
** Read a line from a file, into a buffer of specified size.
|
335
|
+
** Returns 1 if a line was read, 0 at end of file or on a read error.
|
336
|
+
**
|
337
|
+
***************************************************************/
|
338
|
+
|
339
|
+
/*
 * Read the next line of currentFile into buffer (at most buffersize-1
 * characters, as for fgets()).
 *
 * Returns 1 when something was read and 0 when fgets() reports end of
 * file or an error.
 */
static int ReadFileLine(char *buffer, int buffersize, FILE *currentFile){
  char *got = fgets(buffer, buffersize, currentFile);

  return (got != NULL) ? 1 : 0;
}
|
346
|
+
|
347
|
+
|
348
|
+
/****************************************************************
|
349
|
+
****************************************************************
|
350
|
+
**
|
351
|
+
** Code for identifying what type of information is stored in
|
352
|
+
** the current line
|
353
|
+
**
|
354
|
+
****************************************************************
|
355
|
+
***************************************************************/
|
356
|
+
|
357
|
+
/****************************************************************
|
358
|
+
**
|
359
|
+
** static int IsHeaderLine(char *buffer)
|
360
|
+
**
|
361
|
+
** char *buffer - contains line to evaluate
|
362
|
+
**
|
363
|
+
** Checks whether supplied line is a header line (ie starts with #%)
|
364
|
+
**
|
365
|
+
** return 1 (ie true) if header line. 0 otherwise
|
366
|
+
**
|
367
|
+
***************************************************************/
|
368
|
+
|
369
|
+
|
370
|
+
/*
 * Decide whether the line in buffer is a CLF header line, i.e. whether
 * it starts with the two characters "#%".
 *
 * Returns 1 for a header line and 0 otherwise.
 */
static int IsHeaderLine(char *buffer){
  /* the second test is only reached when buffer[0] is '#', so it never
     reads past the terminating NUL */
  return (buffer[0] == '#' && buffer[1] == '%') ? 1 : 0;
}
|
377
|
+
|
378
|
+
/****************************************************************
|
379
|
+
**
|
380
|
+
** static int IsCommentLine(char *buffer)
|
381
|
+
**
|
382
|
+
** char *buffer - contains line to evaluate
|
383
|
+
**
|
384
|
+
** Checks whether supplied line is a comment line (ie starts with #)
|
385
|
+
**
|
386
|
+
**
|
387
|
+
***************************************************************/
|
388
|
+
|
389
|
+
/*
 * Decide whether the line in buffer is a comment line, i.e. whether its
 * first character is '#'.  Header lines ("#%...") also satisfy this test.
 *
 * Returns 1 for a comment line and 0 otherwise.
 */
static int IsCommentLine(char *buffer){
  return (buffer[0] == '#') ? 1 : 0;
}
|
395
|
+
|
396
|
+
/****************************************************************
|
397
|
+
**
|
398
|
+
** void initialize_clf_header(clf_headers *header)
|
399
|
+
**
|
400
|
+
** Initialize all the header values
|
401
|
+
**
|
402
|
+
**
|
403
|
+
**
|
404
|
+
***************************************************************/
|
405
|
+
|
406
|
+
/*
 * Put a clf_headers structure into its "nothing read yet" state.
 *
 * Pointers are set to NULL and counters to 0.  rows, cols and sequential
 * are set to -1, the value the rest of this file tests for to mean
 * "header not present".
 *
 * Fix: the counter n_other_headers used to be overwritten with -1 after
 * being set to 0, which made the first unrecognised header be stored at
 * index -1, while sequential was left uninitialised even though callers
 * test "sequential > -1" to decide whether the header was supplied.
 */
void initialize_clf_header(clf_headers *header){

  header->chip_type = NULL;
  header->n_chip_type = 0;

  header->lib_set_name = NULL;
  header->lib_set_version = NULL;
  header->clf_format_version = NULL;
  header->header0_str = NULL;
  header->header0 = NULL;
  header->order = NULL;
  header->create_date = NULL;
  header->guid = NULL;

  header->other_headers_keys = NULL;
  header->other_headers_values = NULL;
  header->n_other_headers = 0;

  /* -1 marks a numeric header as absent */
  header->rows = -1;
  header->cols = -1;
  header->sequential = -1;
}
|
428
|
+
|
429
|
+
|
430
|
+
/****************************************************************
|
431
|
+
****************************************************************
|
432
|
+
**
|
433
|
+
** Code for reading in clf header
|
434
|
+
**
|
435
|
+
****************************************************************
|
436
|
+
***************************************************************/
|
437
|
+
|
438
|
+
/*
 * Work out in which column of a data row the probe_id, x and y values
 * appear, from the tab-delimited column names in header_str.
 *
 * Each member of header0 receives the 0-based position of the matching
 * column name, or -1 if that name does not occur.  header_str itself is
 * left untouched: tokenising is done on a scratch copy.
 */
static void determine_order_header0(char *header_str, header_0 *header0){
  char *scratch = Calloc(strlen(header_str) +1, char);
  tokenset *columns;
  int col;

  strcpy(scratch,header_str);

  header0->probe_id = -1;
  header0->x = -1;
  header0->y = -1;

  columns = tokenize(scratch,"\t\r\n");

  for (col = 0; col < tokenset_size(columns); col++){
    char *name = get_token(columns,col);

    if (strcmp(name,"probe_id")==0){
      header0->probe_id = col;
    } else if (strcmp(name,"x")==0){
      header0->x = col;
    } else if (strcmp(name,"y")==0){
      header0->y = col;
    }
  }

  delete_tokens(columns);
  Free(scratch);
}
|
467
|
+
|
468
|
+
/****************************************************************
|
469
|
+
**
|
470
|
+
** Validate that required headers are present in file.
|
471
|
+
**
|
472
|
+
** Return 0 if an expected header is not present.
|
473
|
+
** Returns 1 otherwise (ie everything looks fine)
|
474
|
+
**
|
475
|
+
***************************************************************/
|
476
|
+
|
477
|
+
/*
 * Check that a parsed CLF header is usable.
 *
 * Returns 1 when every required header (chip_type, lib_set_name,
 * lib_set_version, clf_format_version, header0, rows, cols) was read,
 * the format version is "1.0", and header0 names the probe_id, x and y
 * columns.  Returns 0 otherwise.
 */
static int validate_clf_header(clf_headers *header){
  /* every required header must have been seen */
  int all_present =
    header->chip_type != NULL &&
    header->lib_set_name != NULL &&
    header->lib_set_version != NULL &&
    header->clf_format_version != NULL &&
    header->header0_str != NULL &&
    header->rows != -1 &&
    header->cols != -1;

  if (!all_present){
    return 0;
  }

  /* 1.0 is the only supported format version */
  if (strcmp(header->clf_format_version,"1.0") != 0){
    return 0;
  }

  /* header0 must supply all three columns */
  if (header->header0->probe_id == -1 ||
      header->header0->x == -1 ||
      header->header0->y == -1){
    return 0;
  }

  return 1;
}
|
522
|
+
|
523
|
+
/****************************************************************
|
524
|
+
**
|
525
|
+
** static FILE *open_clf_file(const char *filename)
|
526
|
+
**
|
527
|
+
** Open the CLF to begin reading from it.
|
528
|
+
**
|
529
|
+
***************************************************************/
|
530
|
+
|
531
|
+
/*
 * Open the CLF file called filename for reading in text mode.
 *
 * Returns the open stream.  If the file cannot be opened, error() is
 * called with a message naming the file.
 */
static FILE *open_clf_file(const char *filename){
  FILE *stream = fopen(filename,"r");

  if (stream == NULL){
    error("Could not open file %s", filename);
  }

  return stream;
}
|
543
|
+
|
544
|
+
/****************************************************************
|
545
|
+
**
|
546
|
+
** void read_clf_header(FILE *cur_file, char *buffer, clf_headers *header)
|
547
|
+
**
|
548
|
+
** read the CLF header section
|
549
|
+
**
|
550
|
+
**
|
551
|
+
***************************************************************/
|
552
|
+
|
553
|
+
void read_clf_header(FILE *cur_file, char *buffer, clf_headers *header){
|
554
|
+
|
555
|
+
|
556
|
+
tokenset *cur_tokenset;
|
557
|
+
int i;
|
558
|
+
char *temp_str;
|
559
|
+
|
560
|
+
|
561
|
+
initialize_clf_header(header);
|
562
|
+
do {
|
563
|
+
ReadFileLine(buffer, 1024, cur_file);
|
564
|
+
/* Rprintf("%s\n",buffer); */
|
565
|
+
if (IsHeaderLine(buffer)){
|
566
|
+
cur_tokenset = tokenize(&buffer[2],"=\r\n");
|
567
|
+
/* hopefully token 0 is Key
|
568
|
+
and token 1 is Value */
|
569
|
+
/* Rprintf("Key is: %s\n",get_token(cur_tokenset,0));
|
570
|
+
Rprintf("Value is: %s\n",get_token(cur_tokenset,1)); */
|
571
|
+
/* Decode the Key/Value pair */
|
572
|
+
if (strcmp(get_token(cur_tokenset,0),"chip_type") == 0){
|
573
|
+
if (header->n_chip_type == 0){
|
574
|
+
header->chip_type = Calloc(1, char *);
|
575
|
+
} else {
|
576
|
+
header->chip_type = Realloc(header->chip_type, header->n_chip_type+1, char *);
|
577
|
+
}
|
578
|
+
temp_str = Calloc(strlen(get_token(cur_tokenset,1))+1,char);
|
579
|
+
strcpy(temp_str,get_token(cur_tokenset,1));
|
580
|
+
header->chip_type[header->n_chip_type] = temp_str;
|
581
|
+
header->n_chip_type++;
|
582
|
+
} else if (strcmp(get_token(cur_tokenset,0), "lib_set_name") == 0){
|
583
|
+
temp_str = Calloc(strlen(get_token(cur_tokenset,1)) + 1,char);
|
584
|
+
strcpy(temp_str,get_token(cur_tokenset,1));
|
585
|
+
header->lib_set_name = temp_str;
|
586
|
+
} else if (strcmp(get_token(cur_tokenset,0), "lib_set_version") == 0){
|
587
|
+
temp_str = Calloc(strlen(get_token(cur_tokenset,1)) + 1,char);
|
588
|
+
strcpy(temp_str,get_token(cur_tokenset,1));
|
589
|
+
header->lib_set_version = temp_str;
|
590
|
+
} else if (strcmp(get_token(cur_tokenset,0), "clf_format_version") == 0) {
|
591
|
+
temp_str = Calloc(strlen(get_token(cur_tokenset,1)) + 1,char);
|
592
|
+
strcpy(temp_str,get_token(cur_tokenset,1));
|
593
|
+
header->clf_format_version = temp_str;
|
594
|
+
} else if (strcmp(get_token(cur_tokenset,0), "rows") == 0) {
|
595
|
+
header->rows = atoi(get_token(cur_tokenset,1));
|
596
|
+
} else if (strcmp(get_token(cur_tokenset,0), "cols") == 0) {
|
597
|
+
header->cols = atoi(get_token(cur_tokenset,1));
|
598
|
+
} else if (strcmp(get_token(cur_tokenset,0), "header0") == 0) {
|
599
|
+
temp_str = Calloc(strlen(get_token(cur_tokenset,1)) + 1,char);
|
600
|
+
strcpy(temp_str,get_token(cur_tokenset,1));
|
601
|
+
header->header0_str = temp_str;
|
602
|
+
header->header0 = Calloc(1,header_0);
|
603
|
+
determine_order_header0(header->header0_str,header->header0);
|
604
|
+
} else if (strcmp(get_token(cur_tokenset,0), "create_date") == 0) {
|
605
|
+
temp_str = Calloc(strlen(get_token(cur_tokenset,1)) + 1,char);
|
606
|
+
strcpy(temp_str,get_token(cur_tokenset,1));
|
607
|
+
header->create_date = temp_str;
|
608
|
+
} else if (strcmp(get_token(cur_tokenset,0), "order") == 0) {
|
609
|
+
temp_str = Calloc(strlen(get_token(cur_tokenset,1)) + 1,char);
|
610
|
+
strcpy(temp_str,get_token(cur_tokenset,1));
|
611
|
+
header->order = temp_str;
|
612
|
+
} else if (strcmp(get_token(cur_tokenset,0), "sequential") == 0) {
|
613
|
+
header->sequential = atoi(get_token(cur_tokenset,1));
|
614
|
+
} else if (strcmp(get_token(cur_tokenset,0), "guid") == 0) {
|
615
|
+
temp_str = Calloc(strlen(get_token(cur_tokenset,1)) + 1,char);
|
616
|
+
strcpy(temp_str,get_token(cur_tokenset,1));
|
617
|
+
header->guid = temp_str;
|
618
|
+
} else {
|
619
|
+
/* not one of the recognised header types */
|
620
|
+
if ( header->n_other_headers == 0){
|
621
|
+
header->other_headers_keys = Calloc(1, char *);
|
622
|
+
header->other_headers_values = Calloc(1, char *);
|
623
|
+
} else {
|
624
|
+
header->other_headers_keys = Realloc(header->other_headers_keys,header->n_other_headers+1, char *);
|
625
|
+
header->other_headers_values = Realloc(header->other_headers_values,header->n_other_headers+1, char *);
|
626
|
+
header->chip_type = Realloc(header->chip_type, header->n_chip_type+1, char *);
|
627
|
+
}
|
628
|
+
temp_str = Calloc(strlen(get_token(cur_tokenset,1)) + 1,char);
|
629
|
+
strcpy(temp_str,get_token(cur_tokenset,1));
|
630
|
+
header->other_headers_values[header->n_other_headers] = temp_str;
|
631
|
+
temp_str = Calloc(strlen(get_token(cur_tokenset,0)) + 1,char);
|
632
|
+
strcpy(temp_str,get_token(cur_tokenset,0));
|
633
|
+
header->other_headers_keys[header->n_other_headers] = temp_str;
|
634
|
+
header->n_other_headers++;
|
635
|
+
|
636
|
+
}
|
637
|
+
|
638
|
+
delete_tokens(cur_tokenset);
|
639
|
+
}
|
640
|
+
} while (IsHeaderLine(buffer));
|
641
|
+
|
642
|
+
}
|
643
|
+
|
644
|
+
/****************************************************************
|
645
|
+
**
|
646
|
+
** void read_clf_data(FILE *cur_file, char *buffer, clf_data *data, clf_headers *header)
|
647
|
+
**
|
648
|
+
** Read in the data part of the file. Specifically, the x,y, probe_id section.
|
649
|
+
** Note to save space only the probe_id are stored.
|
650
|
+
**
|
651
|
+
****************************************************************/
|
652
|
+
|
653
|
+
/*
 * Parse one line of the data section and record its probe id.
 *
 * line is tokenised in place on tabs and line ends.  Comment lines
 * (starting with '#'), lines that do not contain all three columns named
 * by header0, and lines whose x,y fall outside the rows x cols grid are
 * ignored, so a malformed file cannot cause reads or writes outside the
 * allocated arrays.
 */
static void clf_store_data_line(char *line, clf_data *data, clf_headers *header){
  tokenset *cur_tokenset;
  header_0 *pos = header->header0;
  int n_tokens, x, y;

  if (line[0] == '#'){
    return;
  }

  cur_tokenset = tokenize(line,"\t\r\n");
  n_tokens = cur_tokenset->n;

  if (pos->probe_id >= 0 && pos->probe_id < n_tokens &&
      pos->x >= 0 && pos->x < n_tokens &&
      pos->y >= 0 && pos->y < n_tokens){
    x = atoi(get_token(cur_tokenset,pos->x));
    y = atoi(get_token(cur_tokenset,pos->y));
    if (x >= 0 && x < header->cols && y >= 0 && y < header->rows){
      data->probe_id[y*header->cols + x] = atoi(get_token(cur_tokenset,pos->probe_id));
    }
  }

  delete_tokens(cur_tokenset);
}

/****************************************************************
 **
 ** Read the data (probe_id, x, y) section of a CLF file.
 **
 ** FILE *cur_file      - stream positioned just after the first
 **                       non-header line
 ** char *buffer        - line buffer of at least 1024 bytes, already
 **                       holding that first non-header line
 ** clf_data *data      - receives the probe id table
 ** clf_headers *header - previously read header information
 **
 ** To save space only the probe ids are stored, at index y*cols + x.
 **
 ** If the header supplied a "sequential" value (sequential > -1) the
 ** probe ids are deterministic, so nothing is read and data->probe_id
 ** is set to NULL.  data->probe_id is also set to NULL when the header
 ** carries no column layout or no positive rows/cols.
 **
 ** Changes from the previous version: token positions and x,y values
 ** are range checked before use, the first buffered line is no longer
 ** parsed when it is a comment, and the duplicated parsing code has
 ** been moved into one helper.
 **
 ****************************************************************/

void read_clf_data(FILE *cur_file, char *buffer, clf_data *data, clf_headers *header){

  /* if the probe ids are deterministic there is no need to read the rest of the file */
  if (header->sequential > -1){
    data->probe_id = NULL;
    return;
  }

  if (header->header0 == NULL || header->rows <= 0 || header->cols <= 0){
    data->probe_id = NULL;
    return;
  }

  data->probe_id = Calloc((size_t)header->rows*(size_t)header->cols, int);

  /* 1024 must match the size of the buffer allocated by the caller */
  do {
    clf_store_data_line(buffer, data, header);
  } while (ReadFileLine(buffer, 1024, cur_file));
}
|
684
|
+
|
685
|
+
|
686
|
+
|
687
|
+
|
688
|
+
|
689
|
+
|
690
|
+
/****************************************************************
|
691
|
+
****************************************************************
|
692
|
+
**
|
693
|
+
** Code for deallocating or initializing header data structures
|
694
|
+
**
|
695
|
+
****************************************************************
|
696
|
+
****************************************************************/
|
697
|
+
|
698
|
+
/*
 * Release every allocation owned by a clf_headers structure.
 * The structure itself is not freed.
 */
void dealloc_clf_headers(clf_headers *header){
  int k;

  /* list of chip types */
  if (header->n_chip_type > 0){
    k = 0;
    while (k < header->n_chip_type){
      Free(header->chip_type[k]);
      k++;
    }
    Free(header->chip_type);
  }

  /* single-valued string headers */
  if (header->lib_set_name) Free(header->lib_set_name);
  if (header->lib_set_version) Free(header->lib_set_version);
  if (header->clf_format_version) Free(header->clf_format_version);

  /* header0 text and the column layout decoded from it */
  if (header->header0_str){
    Free(header->header0_str);
    Free(header->header0);
  }

  if (header->order) Free(header->order);
  if (header->create_date) Free(header->create_date);
  if (header->guid) Free(header->guid);

  /* unrecognised key/value pairs */
  if (header->n_other_headers > 0){
    k = 0;
    while (k < header->n_other_headers){
      Free(header->other_headers_keys[k]);
      Free(header->other_headers_values[k]);
      k++;
    }
    Free(header->other_headers_keys);
    Free(header->other_headers_values);
  }
}
|
746
|
+
|
747
|
+
|
748
|
+
/*
 * Release the probe id table owned by data, if there is one.
 * The clf_data structure itself is not freed.
 */
void dealloc_clf_data(clf_data *data){
  if (data->probe_id == NULL){
    return;
  }

  Free(data->probe_id);
}
|
753
|
+
|
754
|
+
|
755
|
+
/*
 * Release the header and data parts of a clf_file, together with
 * everything they own.  The clf_file structure itself is not freed.
 */
void dealloc_clf_file(clf_file* my_clf){

  if (my_clf->headers){
    dealloc_clf_headers(my_clf->headers);
    Free(my_clf->headers);
  }

  if (my_clf->data){
    dealloc_clf_data(my_clf->data);
    Free(my_clf->data);
  }
}
|
771
|
+
|
772
|
+
/**********************************************************************
|
773
|
+
***
|
774
|
+
*** A function for getting the probe_id for a given x,y
|
775
|
+
***
|
776
|
+
***
|
777
|
+
*********************************************************************/
|
778
|
+
|
779
|
+
/**********************************************************************
 ***
 *** Look up the probe_id for position (x, y).
 ***
 *** clf_file *clf  - a CLF file that has been read
 *** int *probe_id  - receives the id, or -1 when it cannot be determined
 *** int x, int y   - position on the array
 ***
 *** When the header supplied a "sequential" value the id is computed
 *** from x, y and the "order" header ("col_major" or "row_major");
 *** -1 is returned for a missing or unrecognised order.
 ***
 *** Otherwise the id is taken from the stored table at index
 *** y*cols + x, the same index used when the table is filled in.  The
 *** previous version indexed with y*rows + x, which returned the wrong
 *** probe on non-square arrays.  -1 is returned when there is no table
 *** or (x, y) lies outside the rows x cols grid.
 ***
 *********************************************************************/

void clf_get_probe_id(clf_file *clf, int *probe_id, int x, int y){
  clf_headers *hdr = clf->headers;

  if (hdr->sequential > -1){
    /* ids are deterministic: derive them from the position */
    if (hdr->order == NULL){
      *probe_id = -1;   /* no order header, so the layout is unknown */
    } else if (strcmp(hdr->order,"col_major") == 0){
      *probe_id = y*hdr->cols + x + hdr->sequential;
    } else if (strcmp(hdr->order,"row_major") == 0){
      *probe_id = x*hdr->rows + y + hdr->sequential;
    } else {
      *probe_id = -1;   /* ie missing */
    }
    return;
  }

  if (clf->data == NULL || clf->data->probe_id == NULL ||
      x < 0 || x >= hdr->cols || y < 0 || y >= hdr->rows){
    *probe_id = -1;
    return;
  }

  *probe_id = clf->data->probe_id[y*hdr->cols + x];
}
|
797
|
+
|
798
|
+
/**********************************************************************
|
799
|
+
***
|
800
|
+
*** A function for getting the x , y for a given probe_id
|
801
|
+
***
|
802
|
+
***
|
803
|
+
*********************************************************************/
|
804
|
+
|
805
|
+
/**********************************************************************
 ***
 *** Find the position (x, y) of a given probe_id.
 ***
 *** clf_file *clf   - a CLF file that has been read
 *** int probe_id    - id to look for
 *** int *x, int *y  - receive the position, or -1, -1 when it cannot
 ***                   be determined
 ***
 *** When the header supplied a "sequential" value the position is
 *** computed from the id and the "order" header ("col_major" or
 *** "row_major").
 ***
 *** Otherwise the stored table is searched linearly and the matching
 *** index is decoded as x = index % cols, y = index / cols, the
 *** inverse of the y*cols + x index used when the table is filled in.
 *** The previous version decoded with rows, which gave the wrong
 *** position on non-square arrays.
 ***
 *********************************************************************/

void clf_get_x_y(clf_file *clf, int probe_id, int *x, int *y){
  clf_headers *hdr = clf->headers;
  int ind, total;

  /* assume "missing" until a position has been found */
  *x = -1;
  *y = -1;

  if (hdr->sequential > -1){
    ind = probe_id - hdr->sequential;

    if (hdr->order == NULL){
      return;   /* no order header, so the layout is unknown */
    }

    /* the size checks keep the divisions below well defined */
    if (strcmp(hdr->order,"col_major") == 0){
      if (hdr->cols > 0){
        *x = ind%hdr->cols;
        *y = ind/hdr->cols;
      }
    } else if (strcmp(hdr->order,"row_major") == 0){
      if (hdr->rows > 0){
        *x = ind/hdr->rows;
        *y = ind%hdr->rows;
      }
    }
    return;
  }

  if (clf->data == NULL || clf->data->probe_id == NULL || hdr->cols <= 0){
    return;
  }

  /* Linear search (this should be improved for routine use) */
  total = hdr->cols*hdr->rows;
  for (ind = 0; ind < total; ind++){
    if (clf->data->probe_id[ind] == probe_id){
      *x = ind%hdr->cols;
      *y = ind/hdr->cols;
      return;
    }
  }
}
|
842
|
+
|
843
|
+
/*
|
844
|
+
* Note this function is only for testing purposes. It provides no methodology for accessing anything
|
845
|
+
* stored in the CLF file in R.
|
846
|
+
*
|
847
|
+
*/
|
848
|
+
|
849
|
+
void read_clf_file(char **filename){
|
850
|
+
|
851
|
+
FILE *cur_file;
|
852
|
+
clf_file my_clf;
|
853
|
+
char *buffer = Calloc(1024, char);
|
854
|
+
|
855
|
+
|
856
|
+
|
857
|
+
cur_file = open_clf_file(filename[0]);
|
858
|
+
|
859
|
+
my_clf.headers = Calloc(1, clf_headers);
|
860
|
+
my_clf.data = Calloc(1, clf_data);
|
861
|
+
|
862
|
+
read_clf_header(cur_file,buffer,my_clf.headers);
|
863
|
+
if (validate_clf_header(my_clf.headers))
|
864
|
+
read_clf_data(cur_file, buffer, my_clf.data, my_clf.headers);
|
865
|
+
|
866
|
+
Free(buffer);
|
867
|
+
dealloc_clf_file(&my_clf);
|
868
|
+
fclose(cur_file);
|
869
|
+
|
870
|
+
}
|