bio-affy 0.1.0.alpha.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (72) hide show
  1. data/.document +5 -0
  2. data/.rspec +1 -0
  3. data/Gemfile +15 -0
  4. data/Gemfile.lock +32 -0
  5. data/LICENSE.txt +20 -0
  6. data/README.rdoc +33 -0
  7. data/Rakefile +77 -0
  8. data/VERSION +1 -0
  9. data/bin/bio-affy +80 -0
  10. data/bio-affy.gemspec +128 -0
  11. data/ext/DESCRIPTION +11 -0
  12. data/ext/HISTORY +3 -0
  13. data/ext/LICENSE +456 -0
  14. data/ext/NAMESPACE +2 -0
  15. data/ext/R/check.cdf.type.R +18 -0
  16. data/ext/R/read.cdffile.list.R +23 -0
  17. data/ext/R/read.celfile.R +11 -0
  18. data/ext/R/read.celfile.header.R +37 -0
  19. data/ext/R/read.probematrices.R +29 -0
  20. data/ext/README_BIOLIB +36 -0
  21. data/ext/aclocal.m4 +32 -0
  22. data/ext/configure +4898 -0
  23. data/ext/configure.in +51 -0
  24. data/ext/man/check.cdf.type.Rd +22 -0
  25. data/ext/man/read.cdffile.list.Rd +20 -0
  26. data/ext/man/read.celfile.Rd +23 -0
  27. data/ext/man/read.celfile.header.Rd +22 -0
  28. data/ext/man/read.celfile.probeintensity.matrices.Rd +31 -0
  29. data/ext/src/CMakeLists.txt +39 -0
  30. data/ext/src/Makevars.in +3 -0
  31. data/ext/src/Makevars.win +2 -0
  32. data/ext/src/Rakefile +43 -0
  33. data/ext/src/biolib_affyio.c +416 -0
  34. data/ext/src/biolib_affyio.h +132 -0
  35. data/ext/src/biolib_affyio.o +0 -0
  36. data/ext/src/fread_functions.c +871 -0
  37. data/ext/src/fread_functions.h +60 -0
  38. data/ext/src/fread_functions.o +0 -0
  39. data/ext/src/libaffyext.so +0 -0
  40. data/ext/src/mkrf.log +11 -0
  41. data/ext/src/mkrf_conf.rb +6 -0
  42. data/ext/src/read_abatch.c +5484 -0
  43. data/ext/src/read_abatch.h +63 -0
  44. data/ext/src/read_abatch.o +0 -0
  45. data/ext/src/read_bpmap.c +888 -0
  46. data/ext/src/read_bpmap.o +0 -0
  47. data/ext/src/read_cdf.h +347 -0
  48. data/ext/src/read_cdf_xda.c +1342 -0
  49. data/ext/src/read_cdf_xda.o +0 -0
  50. data/ext/src/read_cdffile2.c +1576 -0
  51. data/ext/src/read_cdffile2.o +0 -0
  52. data/ext/src/read_celfile_generic.c +2061 -0
  53. data/ext/src/read_celfile_generic.h +33 -0
  54. data/ext/src/read_celfile_generic.o +0 -0
  55. data/ext/src/read_clf.c +870 -0
  56. data/ext/src/read_clf.o +0 -0
  57. data/ext/src/read_generic.c +1446 -0
  58. data/ext/src/read_generic.h +144 -0
  59. data/ext/src/read_generic.o +0 -0
  60. data/ext/src/read_pgf.c +1337 -0
  61. data/ext/src/read_pgf.o +0 -0
  62. data/lib/bio-affy.rb +5 -0
  63. data/lib/bio/affy.rb +7 -0
  64. data/lib/bio/affyext.rb +23 -0
  65. data/lib/bio/libaffyext.so +0 -0
  66. data/spec/bio-affy_spec.rb +22 -0
  67. data/spec/spec_helper.rb +13 -0
  68. data/test/data/affy/GSM103328.CEL.gz +0 -0
  69. data/test/data/affy/GSM103329.CEL.gz +0 -0
  70. data/test/data/affy/GSM103330.CEL.gz +0 -0
  71. data/test/data/affy/MG_U74Av2.CDF.gz +0 -0
  72. metadata +190 -0
@@ -0,0 +1,63 @@
1
#ifndef READ_ABATCH_H
#define READ_ABATCH_H

/*
 * detailed_header_info
 *
 * Holds the full header information of a CEL file.  The GridCorner*
 * members are XY coordinates, in pixel coordinates, of the four grid
 * corners (UL = upper left, UR = upper right, LR = lower right,
 * LL = lower left).
 */
typedef struct {
  char *cdfName;
  int cols;
  int rows;

  /* upper left grid corner */
  int GridCornerULx;
  int GridCornerULy;

  /* upper right grid corner */
  int GridCornerURx;
  int GridCornerURy;

  /* lower right grid corner */
  int GridCornerLRx;
  int GridCornerLRy;

  /* lower left grid corner */
  int GridCornerLLx;
  int GridCornerLLy;

  char *DatHeader;
  char *Algorithm;
  char *AlgorithmParameters;
  char *ScanDate;
} detailed_header_info;

/*
 * CEL
 *
 * C level object holding the information of a single CEL file.
 * Instances should be created with read_cel_file().
 */
typedef struct {
  detailed_header_info header;

  /* intensities, standard deviations and number of pixels */
  double *intensities;
  double *stddev;
  double *npixels;

  /* information from the masks and outliers sections */
  int nmasks;
  int noutliers;

  short *masks_x;
  short *masks_y;
  short *outliers_x;
  short *outliers_y;
} CEL;

extern CEL *read_cel_file(const char *filename, int read_intensities_only);

#endif
Binary file
@@ -0,0 +1,888 @@
1
+ /****************************************************************
2
+ **
3
+ ** File: read_bpmap.c
4
+ **
5
+ ** Implementation by: B. M. Bolstad
6
+ **
7
+ ** Copyright (C) B. M. Bolstad 2006-2007
8
+ **
9
+ ** A parser designed to read bpmap files into an R List structure
10
+ **
11
+ ** History
12
+ ** Mar 11, 2006 - Initial version
13
+ ** Mar 12, 2006 - add additional support for versions 2 and 3
14
+ ** May 31, 2006 - Fix some compiler warnings
15
+ ** June 12, 2006 - fix naming vector length issue.
16
+ ** June 12, 2007 - much wailing and grinding of teeth, but finally a fix for reading version number right.
17
+ ** Aug 25, 2007 - Move file reading functions to centralized location
18
+ ** Mar 14, 2008 - Fix reading of version number for big endian platforms
19
+ ** Jan 15, 2008 - Fix VECTOR_ELT/STRING_ELT issues
20
+ **
21
+ *******************************************************************/
22
+
23
+ #include <R.h>
24
+ #include <Rdefines.h>
25
+
26
+ #include "stdlib.h"
27
+ #include "stdio.h"
28
+
29
+ #include "fread_functions.h"
30
+
31
+
32
+
33
+ /****************************************************************
34
+ **
35
+ **
36
+ **
37
+ **
38
+ ** Note BPMAP files are stored in big endian format
39
+ **
40
+ *******************************************************************/
41
+
42
+
43
+
44
+ /*************************************************************************
45
+ **
46
+ ** Code for reading from the big endian binary files, doing bit flipping if
47
+ ** necessary (on little-endian machines)
48
+ **
49
+ **
50
+ ************************************************************************/
51
+
52
+
53
+
54
/*
 * swap_float_4
 *
 * Converts the float VALUE pointed to by tnf4 to an integer, reverses the
 * order of the four bytes of that integer, and stores the result converted
 * back to float.  Note that this is a value conversion, not a
 * reinterpretation of the float's bit pattern; callers in this file rely on
 * exactly this behaviour when decoding the version number and match scores.
 *
 * The byte shuffling is done on an unsigned quantity: left shifting a signed
 * int into its sign bit is undefined and right shifting a negative int is
 * implementation-defined.
 *
 * tnf4 : in/out, the value to transform in place.
 */
static void swap_float_4(float *tnf4)
{
  unsigned int bits = (unsigned int)(int)(*tnf4);

  bits = (((bits >> 24) & 0xffu)  | ((bits & 0xffu) << 24) |
          ((bits >> 8) & 0xff00u) | ((bits & 0xff00u) << 8));

  /* back through int so values with the top bit set come out negative,
     as they did with the signed arithmetic used previously */
  *tnf4 = (float)(int)bits;
}
64
+
65
+
66
+
67
+
68
+
69
+ static SEXP ReadBPMAPHeader(FILE *infile){
70
+
71
+
72
+ SEXP Header;
73
+ SEXP tmpSXP;
74
+
75
+
76
+ char *Magicnumber = R_alloc(8,sizeof(char));
77
+ float version_number = 0.0;
78
+ int version_number_int;
79
+ unsigned int unsigned_version_number_int;
80
+
81
+
82
+ unsigned int n_seq;
83
+ static double new_version_number;
84
+
85
+
86
+
87
+ fread_be_char(Magicnumber,8,infile);
88
+
89
+ if (strncmp(Magicnumber,"PHT7",4) !=0){
90
+ error("Based on the magic number which was %s, this does not appear to be a BPMAP file",Magicnumber);
91
+ }
92
+
93
+
94
+ /* version number is a little bit funky
95
+ need to do some funny things to coax it
96
+ into the right format
97
+ */
98
+
99
+
100
+ /* cast to integer, swap bytes, cast to float */
101
+ /* fread_be_float32(&version_number,1,infile); */
102
+ fread_float32(&version_number,1,infile);
103
+ swap_float_4(&version_number);
104
+
105
+ new_version_number = (double)version_number;
106
+ /* // Rprintf("A %f\n",version_number);*/
107
+
108
+ if ((version_number <=0.5) || (version_number > 3.5)){
109
+ /* // Rprintf("Rereading\n"); */
110
+ fseek(infile,-sizeof(float),SEEK_CUR);
111
+ fread_be_uint32(&unsigned_version_number_int,1,infile);
112
+ memcpy(&version_number,&unsigned_version_number_int, sizeof(float));
113
+ new_version_number = (double)version_number;
114
+ }
115
+
116
+ fread_be_uint32(&n_seq,1,infile);
117
+
118
+ PROTECT(Header=allocVector(VECSXP,3));
119
+
120
+ PROTECT(tmpSXP=allocVector(STRSXP,1));
121
+ SET_STRING_ELT(tmpSXP,0,mkChar(Magicnumber));
122
+ SET_VECTOR_ELT(Header,0,tmpSXP);
123
+ UNPROTECT(1);
124
+
125
+
126
+ PROTECT(tmpSXP=allocVector(REALSXP,1));
127
+ REAL(tmpSXP)[0] = (double)new_version_number;
128
+ SET_VECTOR_ELT(Header,1,tmpSXP);
129
+ UNPROTECT(1);
130
+
131
+
132
+ PROTECT(tmpSXP=allocVector(INTSXP,1));
133
+ INTEGER(tmpSXP)[0] = (int)n_seq;
134
+ SET_VECTOR_ELT(Header,2,tmpSXP);
135
+ UNPROTECT(1);
136
+
137
+ PROTECT(tmpSXP=allocVector(STRSXP,3));
138
+ SET_STRING_ELT(tmpSXP,0,mkChar("magic.number"));
139
+ SET_STRING_ELT(tmpSXP,1,mkChar("version"));
140
+ SET_STRING_ELT(tmpSXP,2,mkChar("n.seq"));
141
+ setAttrib(Header,R_NamesSymbol,tmpSXP);
142
+ UNPROTECT(2);
143
+
144
+ /* Rprintf("D %f %f\n",version_number,new_version_number); */
145
+ return Header;
146
+
147
+ }
148
+
149
+
150
+
151
+ static SEXP ReadBPMAPSeqDescription(FILE *infile, float version, int nseq){
152
+
153
+
154
+ SEXP SequenceDescriptionList;
155
+
156
+ SEXP CurSequenceDescription = R_NilValue;
157
+ SEXP tmpSXP,tmpSXP2;
158
+
159
+
160
+
161
+ int i,j;
162
+
163
+ unsigned int seq_name_length;
164
+
165
+ char *seq_name;
166
+
167
+ unsigned int probe_mapping_type;
168
+ unsigned int seq_file_offset;
169
+
170
+ unsigned int n_probes;
171
+
172
+ unsigned int group_name_length;
173
+ char *group_name;
174
+
175
+ unsigned int version_number_length;
176
+ char *version_number;
177
+
178
+ unsigned int number_parameters;
179
+
180
+ unsigned int param_length;
181
+ char *param_name;
182
+
183
+ /* Rprintf("%f %d\n",version,nseq); */
184
+
185
+ PROTECT(SequenceDescriptionList=allocVector(VECSXP,(int)nseq));
186
+
187
+ for (i=0; i < nseq; i++){
188
+ fread_be_uint32(&seq_name_length,1,infile);
189
+ seq_name = (char *)Calloc(seq_name_length+1,char);
190
+ fread_be_char(seq_name,seq_name_length,infile);
191
+
192
+
193
+
194
+ if (version == 3.00){
195
+ PROTECT(CurSequenceDescription=allocVector(VECSXP,8));
196
+ PROTECT(tmpSXP=allocVector(STRSXP,7));
197
+ SET_STRING_ELT(tmpSXP,0,mkChar("Name"));
198
+ SET_STRING_ELT(tmpSXP,1,mkChar("ProbeMappingType"));
199
+ SET_STRING_ELT(tmpSXP,2,mkChar("SequenceFileOffset"));
200
+ SET_STRING_ELT(tmpSXP,3,mkChar("n.probepairs"));
201
+ SET_STRING_ELT(tmpSXP,4,mkChar("GroupName"));
202
+ SET_STRING_ELT(tmpSXP,5,mkChar("VersionNumber"));
203
+ SET_STRING_ELT(tmpSXP,6,mkChar("NumberOfParameters"));
204
+ SET_STRING_ELT(tmpSXP,7,mkChar("Parameters"));
205
+ setAttrib(CurSequenceDescription,R_NamesSymbol,tmpSXP);
206
+ UNPROTECT(1);
207
+ } else if (version == 2.00){
208
+ PROTECT(CurSequenceDescription=allocVector(VECSXP,6));
209
+ PROTECT(tmpSXP=allocVector(STRSXP,6));
210
+ SET_STRING_ELT(tmpSXP,0,mkChar("Name"));
211
+ SET_STRING_ELT(tmpSXP,1,mkChar("n.probepairs"));
212
+ SET_STRING_ELT(tmpSXP,2,mkChar("GroupName"));
213
+ SET_STRING_ELT(tmpSXP,3,mkChar("VersionNumber"));
214
+ SET_STRING_ELT(tmpSXP,4,mkChar("NumberOfParameters"));
215
+ SET_STRING_ELT(tmpSXP,5,mkChar("Parameters"));
216
+ setAttrib(CurSequenceDescription,R_NamesSymbol,tmpSXP);
217
+ UNPROTECT(1);
218
+ } else if (version == 1.00){
219
+ PROTECT(CurSequenceDescription=allocVector(VECSXP,2));
220
+ PROTECT(tmpSXP=allocVector(STRSXP,2));
221
+ SET_STRING_ELT(tmpSXP,0,mkChar("Name"));
222
+ SET_STRING_ELT(tmpSXP,1,mkChar("n.probepairs"));
223
+ setAttrib(CurSequenceDescription,R_NamesSymbol,tmpSXP);
224
+ UNPROTECT(1);
225
+
226
+ }
227
+
228
+ PROTECT(tmpSXP=allocVector(STRSXP,1));
229
+ SET_STRING_ELT(tmpSXP,0,mkChar(seq_name));
230
+ SET_VECTOR_ELT(CurSequenceDescription,0,tmpSXP);
231
+ UNPROTECT(1);
232
+ Free(seq_name);
233
+
234
+
235
+ if (version == 1.0){
236
+ fread_be_uint32(&n_probes,1,infile);
237
+ PROTECT(tmpSXP=allocVector(INTSXP,1));
238
+ INTEGER(tmpSXP)[0] = n_probes;
239
+ SET_VECTOR_ELT(CurSequenceDescription,1,tmpSXP);
240
+ UNPROTECT(1);
241
+ } else if (version ==2.0){
242
+ fread_be_uint32(&n_probes,1,infile);
243
+ PROTECT(tmpSXP=allocVector(INTSXP,1));
244
+ INTEGER(tmpSXP)[0] = n_probes;
245
+ SET_VECTOR_ELT(CurSequenceDescription,1,tmpSXP);
246
+ UNPROTECT(1);
247
+
248
+
249
+
250
+
251
+ fread_be_uint32(&group_name_length,1,infile);
252
+ group_name = (char *)Calloc(group_name_length+1,char);
253
+ fread_be_char(group_name,group_name_length,infile);
254
+
255
+ PROTECT(tmpSXP=allocVector(STRSXP,1));
256
+ SET_STRING_ELT(tmpSXP,0,mkChar(group_name));
257
+ SET_VECTOR_ELT(CurSequenceDescription,2,tmpSXP);
258
+ UNPROTECT(1);
259
+ Free(group_name);
260
+
261
+
262
+ fread_be_uint32(&version_number_length,1,infile);
263
+ version_number = (char *)Calloc(version_number_length+1,char);
264
+ fread_be_char(version_number,version_number_length,infile);
265
+
266
+ PROTECT(tmpSXP=allocVector(STRSXP,1));
267
+ SET_STRING_ELT(tmpSXP,0,mkChar(version_number));
268
+ SET_VECTOR_ELT(CurSequenceDescription,3,tmpSXP);
269
+ UNPROTECT(1);
270
+ Free(version_number);
271
+
272
+
273
+ fread_be_uint32(&number_parameters,1,infile);
274
+ PROTECT(tmpSXP=allocVector(INTSXP,1));
275
+ INTEGER(tmpSXP)[0] = number_parameters;
276
+ SET_VECTOR_ELT(CurSequenceDescription,4,tmpSXP);
277
+ UNPROTECT(1);
278
+
279
+ PROTECT(tmpSXP=allocVector(VECSXP,number_parameters));
280
+
281
+
282
+ for (j=0; j < number_parameters; j++){
283
+ PROTECT(tmpSXP2 = allocVector(STRSXP,2));
284
+ fread_be_uint32(&param_length,1,infile);
285
+ param_name = (char *)Calloc(param_length+1,char);
286
+ fread_be_char(param_name,param_length,infile);
287
+ SET_STRING_ELT(tmpSXP2,0,mkChar(param_name));
288
+ Free(param_name);
289
+ fread_be_uint32(&param_length,1,infile);
290
+ param_name = (char *)Calloc(param_length+1,char);
291
+ fread_be_char(param_name,param_length,infile);
292
+ SET_STRING_ELT(tmpSXP2,1,mkChar(param_name));
293
+ Free(param_name);
294
+
295
+ SET_VECTOR_ELT(tmpSXP,j,tmpSXP2);
296
+ UNPROTECT(1);
297
+ }
298
+ SET_VECTOR_ELT(CurSequenceDescription,5,tmpSXP);
299
+ UNPROTECT(1);
300
+
301
+
302
+
303
+ } else if (version ==3.0){
304
+ fread_be_uint32(&probe_mapping_type,1,infile);
305
+ PROTECT(tmpSXP=allocVector(INTSXP,1));
306
+ INTEGER(tmpSXP)[0] = probe_mapping_type;
307
+ SET_VECTOR_ELT(CurSequenceDescription,1,tmpSXP);
308
+ UNPROTECT(1);
309
+
310
+ fread_be_uint32(&seq_file_offset,1,infile);
311
+ PROTECT(tmpSXP=allocVector(INTSXP,1));
312
+ INTEGER(tmpSXP)[0] = seq_file_offset;
313
+ SET_VECTOR_ELT(CurSequenceDescription,2,tmpSXP);
314
+ UNPROTECT(1);
315
+
316
+ fread_be_uint32(&n_probes,1,infile);
317
+ PROTECT(tmpSXP=allocVector(INTSXP,1));
318
+ INTEGER(tmpSXP)[0] = n_probes;
319
+ SET_VECTOR_ELT(CurSequenceDescription,3,tmpSXP);
320
+ UNPROTECT(1);
321
+
322
+ fread_be_uint32(&group_name_length,1,infile);
323
+ group_name = (char *)Calloc(group_name_length+1,char);
324
+ fread_be_char(group_name,group_name_length,infile);
325
+
326
+ PROTECT(tmpSXP=allocVector(STRSXP,1));
327
+ SET_STRING_ELT(tmpSXP,0,mkChar(group_name));
328
+ SET_VECTOR_ELT(CurSequenceDescription,4,tmpSXP);
329
+ UNPROTECT(1);
330
+ Free(group_name);
331
+
332
+ fread_be_uint32(&version_number_length,1,infile);
333
+ version_number = (char *)Calloc(version_number_length+1,char);
334
+ fread_be_char(version_number,version_number_length,infile);
335
+
336
+ PROTECT(tmpSXP=allocVector(STRSXP,1));
337
+ SET_STRING_ELT(tmpSXP,0,mkChar(version_number));
338
+ SET_VECTOR_ELT(CurSequenceDescription,5,tmpSXP);
339
+ UNPROTECT(1);
340
+ Free(version_number);
341
+
342
+ fread_be_uint32(&number_parameters,1,infile);
343
+ PROTECT(tmpSXP=allocVector(INTSXP,1));
344
+ INTEGER(tmpSXP)[0] = number_parameters;
345
+ SET_VECTOR_ELT(CurSequenceDescription,6,tmpSXP);
346
+ UNPROTECT(1);
347
+
348
+
349
+
350
+ PROTECT(tmpSXP=allocVector(VECSXP,number_parameters));
351
+
352
+
353
+ for (j=0; j < number_parameters; j++){
354
+ PROTECT(tmpSXP2 = allocVector(STRSXP,2));
355
+ fread_be_uint32(&param_length,1,infile);
356
+ param_name = (char *)Calloc(param_length+1,char);
357
+ fread_be_char(param_name,param_length,infile);
358
+ SET_STRING_ELT(tmpSXP2,0,mkChar(param_name));
359
+ Free(param_name);
360
+ fread_be_uint32(&param_length,1,infile);
361
+ param_name = (char *)Calloc(param_length+1,char);
362
+ fread_be_char(param_name,param_length,infile);
363
+ SET_STRING_ELT(tmpSXP2,1,mkChar(param_name));
364
+ Free(param_name);
365
+
366
+ SET_VECTOR_ELT(tmpSXP,j,tmpSXP2);
367
+ UNPROTECT(1);
368
+ }
369
+ SET_VECTOR_ELT(CurSequenceDescription,7,tmpSXP);
370
+ UNPROTECT(1);
371
+ }
372
+
373
+ SET_VECTOR_ELT(SequenceDescriptionList,i,CurSequenceDescription);
374
+ UNPROTECT(1);
375
+
376
+ }
377
+
378
+ UNPROTECT(1);
379
+ return SequenceDescriptionList;
380
+
381
+ }
382
+
383
+
384
+
385
/*
 * packedSeqTobaseStr
 *
 * Expands a packed probe sequence into base characters.  Every byte holds
 * four bases of two bits each, most significant pair first, with
 * 0 -> 'A', 1 -> 'C', 2 -> 'G', 3 -> 'T'.  The first six bytes give
 * 24 bases; only the top two bits of the seventh byte are used, giving
 * base number 25.
 *
 * probeseq : the 7 packed bytes.
 * dest     : receives exactly 25 characters in dest[0..24]; no terminator
 *            is written.
 */
static void packedSeqTobaseStr(unsigned char probeseq[7], char *dest){

  static const char base_of[4] = { 'A', 'C', 'G', 'T' };

  int byte_index;
  int slot;
  unsigned char packed;

  for (byte_index = 0; byte_index < 6; byte_index++){
    packed = probeseq[byte_index];

    /* slot 0 sits in bits 7-6, slot 3 in bits 1-0 */
    for (slot = 0; slot < 4; slot++){
      dest[4*byte_index + slot] = base_of[(packed >> (6 - 2*slot)) & 3];
    }
  }

  /* last base: the two highest bits of the seventh byte */
  dest[24] = base_of[(probeseq[6] >> 6) & 3];
}
494
+
495
+
496
+
497
+
498
+
499
+
500
+
501
+
502
+ static SEXP readBPMAPSeqIdPositionInfo(FILE *infile, float version, int nseq, SEXP seqDesc){
503
+
504
+
505
+ SEXP SeqIdPositionInfoList;
506
+ SEXP curSeqIdPositionInfo;
507
+ SEXP PositionInfo= R_NilValue;
508
+ SEXP PositionInfoRowNames;
509
+
510
+
511
+ SEXP tmpSEXP;
512
+
513
+ SEXP xPM= R_NilValue,yPM= R_NilValue,xMM= R_NilValue,yMM= R_NilValue;
514
+ SEXP PMprobeLength= R_NilValue;
515
+ SEXP probeSeqString= R_NilValue;
516
+ SEXP MatchScore= R_NilValue;
517
+ SEXP PMposition= R_NilValue;
518
+ SEXP Strand= R_NilValue;
519
+
520
+ char buf[10];
521
+
522
+ char *dest;
523
+
524
+
525
+ int nprobes=0;
526
+ int probe_mapping_type=0;
527
+ int i,j;
528
+
529
+
530
+ unsigned int SeqId;
531
+
532
+ unsigned int x;
533
+ unsigned int y;
534
+
535
+ unsigned int x_mm;
536
+ unsigned int y_mm;
537
+
538
+ unsigned char probelength;
539
+
540
+ unsigned char probeseq[7];
541
+
542
+ float matchScore;
543
+ int matchScore_int;
544
+
545
+ unsigned int positionPM;
546
+ unsigned char strand;
547
+
548
+
549
+ PROTECT(SeqIdPositionInfoList = allocVector(VECSXP,nseq));
550
+
551
+ for (i =0; i < nseq; i++){
552
+ fread_be_uint32(&SeqId,1,infile);
553
+ /*Rprintf("Seq id:%u\n",SeqId);*/
554
+
555
+ PROTECT(curSeqIdPositionInfo = allocVector(VECSXP,2));
556
+
557
+
558
+ PROTECT(tmpSEXP=allocVector(INTSXP,1));
559
+ INTEGER(tmpSEXP)[0] = (int)SeqId;
560
+ SET_VECTOR_ELT(curSeqIdPositionInfo,0,tmpSEXP);
561
+ UNPROTECT(1);
562
+
563
+
564
+ PROTECT(tmpSEXP=allocVector(STRSXP,2));
565
+ SET_STRING_ELT(tmpSEXP,0,mkChar("Header"));
566
+ SET_STRING_ELT(tmpSEXP,1,mkChar("PositionInformation"));
567
+ setAttrib(curSeqIdPositionInfo,R_NamesSymbol,tmpSEXP);
568
+ UNPROTECT(1);
569
+
570
+
571
+
572
+ if ((version == 1.0) || (version == 2.0)){
573
+ nprobes = INTEGER(VECTOR_ELT(VECTOR_ELT(seqDesc,i),1))[0];
574
+ /* Rprintf("nprobes: %d\n",nprobes); */
575
+ probe_mapping_type = 0; /* PM/MM tiling */
576
+
577
+ PROTECT(PositionInfo = allocVector(VECSXP,9));
578
+ PROTECT(xPM = allocVector(INTSXP,nprobes));
579
+ PROTECT(yPM = allocVector(INTSXP,nprobes));
580
+ PROTECT(xMM = allocVector(INTSXP,nprobes));
581
+ PROTECT(yMM = allocVector(INTSXP,nprobes));
582
+ PROTECT(PMprobeLength = allocVector(INTSXP,nprobes));
583
+ PROTECT(probeSeqString = allocVector(STRSXP,nprobes));
584
+ PROTECT(MatchScore = allocVector(REALSXP,nprobes));
585
+ PROTECT(PMposition = allocVector(INTSXP,nprobes));
586
+ PROTECT(Strand = allocVector(STRSXP,nprobes));
587
+
588
+ SET_VECTOR_ELT(PositionInfo,0,xPM);
589
+ SET_VECTOR_ELT(PositionInfo,1,yPM);
590
+ SET_VECTOR_ELT(PositionInfo,2,xMM);
591
+ SET_VECTOR_ELT(PositionInfo,3,yMM);
592
+ SET_VECTOR_ELT(PositionInfo,4,PMprobeLength);
593
+ SET_VECTOR_ELT(PositionInfo,5,probeSeqString);
594
+ SET_VECTOR_ELT(PositionInfo,6,MatchScore);
595
+ SET_VECTOR_ELT(PositionInfo,7,PMposition);
596
+ SET_VECTOR_ELT(PositionInfo,8,Strand);
597
+ UNPROTECT(9);
598
+
599
+ setAttrib(PositionInfo,R_ClassSymbol,mkString("data.frame"));
600
+
601
+ PROTECT(PositionInfoRowNames = allocVector(STRSXP,nprobes));
602
+ for (j=0; j < nprobes; j++){
603
+ sprintf(buf, "%d", j+1);
604
+ SET_STRING_ELT(PositionInfoRowNames,j,mkChar(buf));
605
+ }
606
+ setAttrib(PositionInfo, R_RowNamesSymbol, PositionInfoRowNames);
607
+ UNPROTECT(1);
608
+
609
+ PROTECT(tmpSEXP = allocVector(STRSXP,9));
610
+ SET_STRING_ELT(tmpSEXP,0,mkChar("x"));
611
+ SET_STRING_ELT(tmpSEXP,1,mkChar("y"));
612
+ SET_STRING_ELT(tmpSEXP,2,mkChar("x.mm"));
613
+ SET_STRING_ELT(tmpSEXP,3,mkChar("y.mm"));
614
+ SET_STRING_ELT(tmpSEXP,4,mkChar("PMLength"));
615
+ SET_STRING_ELT(tmpSEXP,5,mkChar("ProbeSeq"));
616
+ SET_STRING_ELT(tmpSEXP,6,mkChar("MatchScore"));
617
+ SET_STRING_ELT(tmpSEXP,7,mkChar("PMPosition"));
618
+ SET_STRING_ELT(tmpSEXP,8,mkChar("TargetStrand"));
619
+
620
+ setAttrib(PositionInfo,R_NamesSymbol,tmpSEXP);
621
+ UNPROTECT(1);
622
+
623
+ } else if (version == 3.0){
624
+ nprobes = INTEGER(VECTOR_ELT(VECTOR_ELT(seqDesc,i),3))[0];
625
+ probe_mapping_type = INTEGER(VECTOR_ELT(VECTOR_ELT(seqDesc,i),1))[0];
626
+
627
+
628
+ if (probe_mapping_type == 0){
629
+ PROTECT(PositionInfo = allocVector(VECSXP,9));
630
+ PROTECT(xPM = allocVector(INTSXP,nprobes));
631
+ PROTECT(yPM = allocVector(INTSXP,nprobes));
632
+ PROTECT(xMM = allocVector(INTSXP,nprobes));
633
+ PROTECT(yMM = allocVector(INTSXP,nprobes));
634
+ PROTECT(PMprobeLength = allocVector(INTSXP,nprobes));
635
+ PROTECT(probeSeqString = allocVector(STRSXP,nprobes));
636
+ PROTECT(MatchScore = allocVector(REALSXP,nprobes));
637
+ PROTECT(PMposition = allocVector(INTSXP,nprobes));
638
+ PROTECT(Strand = allocVector(STRSXP,nprobes));
639
+
640
+ SET_VECTOR_ELT(PositionInfo,0,xPM);
641
+ SET_VECTOR_ELT(PositionInfo,1,yPM);
642
+ SET_VECTOR_ELT(PositionInfo,2,xMM);
643
+ SET_VECTOR_ELT(PositionInfo,3,yMM);
644
+ SET_VECTOR_ELT(PositionInfo,4,PMprobeLength);
645
+ SET_VECTOR_ELT(PositionInfo,5,probeSeqString);
646
+ SET_VECTOR_ELT(PositionInfo,6,MatchScore);
647
+ SET_VECTOR_ELT(PositionInfo,7,PMposition);
648
+ SET_VECTOR_ELT(PositionInfo,8,Strand);
649
+ UNPROTECT(9);
650
+
651
+ setAttrib(PositionInfo,R_ClassSymbol,mkString("data.frame"));
652
+
653
+ PROTECT(PositionInfoRowNames = allocVector(STRSXP,nprobes));
654
+ for (j=0; j < nprobes; j++){
655
+ sprintf(buf, "%d", j+1);
656
+ SET_VECTOR_ELT(PositionInfoRowNames,j,mkChar(buf));
657
+ }
658
+ setAttrib(PositionInfo, R_RowNamesSymbol, PositionInfoRowNames);
659
+ UNPROTECT(1);
660
+
661
+ PROTECT(tmpSEXP = allocVector(STRSXP,9));
662
+ SET_STRING_ELT(tmpSEXP,0,mkChar("x"));
663
+ SET_STRING_ELT(tmpSEXP,1,mkChar("y"));
664
+ SET_STRING_ELT(tmpSEXP,2,mkChar("x.mm"));
665
+ SET_STRING_ELT(tmpSEXP,3,mkChar("y.mm"));
666
+ SET_STRING_ELT(tmpSEXP,4,mkChar("PMLength"));
667
+ SET_STRING_ELT(tmpSEXP,5,mkChar("ProbeSeq"));
668
+ SET_STRING_ELT(tmpSEXP,6,mkChar("MatchScore"));
669
+ SET_STRING_ELT(tmpSEXP,7,mkChar("PMPosition"));
670
+ SET_STRING_ELT(tmpSEXP,8,mkChar("TargetStrand"));
671
+
672
+ setAttrib(PositionInfo,R_NamesSymbol,tmpSEXP);
673
+ UNPROTECT(1);
674
+ } else {
675
+
676
+ PROTECT(PositionInfo = allocVector(VECSXP,7));
677
+ PROTECT(xPM = allocVector(INTSXP,nprobes));
678
+ PROTECT(yPM = allocVector(INTSXP,nprobes));
679
+ PROTECT(PMprobeLength = allocVector(INTSXP,nprobes));
680
+ PROTECT(probeSeqString = allocVector(STRSXP,nprobes));
681
+ PROTECT(MatchScore = allocVector(REALSXP,nprobes));
682
+ PROTECT(PMposition = allocVector(INTSXP,nprobes));
683
+ PROTECT(Strand = allocVector(STRSXP,nprobes));
684
+
685
+ SET_VECTOR_ELT(PositionInfo,0,xPM);
686
+ SET_VECTOR_ELT(PositionInfo,1,yPM);
687
+ SET_VECTOR_ELT(PositionInfo,2,PMprobeLength);
688
+ SET_VECTOR_ELT(PositionInfo,3,probeSeqString);
689
+ SET_VECTOR_ELT(PositionInfo,4,MatchScore);
690
+ SET_VECTOR_ELT(PositionInfo,5,PMposition);
691
+ SET_VECTOR_ELT(PositionInfo,6,Strand);
692
+ UNPROTECT(7);
693
+
694
+ setAttrib(PositionInfo,R_ClassSymbol,mkString("data.frame"));
695
+
696
+ PROTECT(PositionInfoRowNames = allocVector(STRSXP,nprobes));
697
+ for (j=0; j < nprobes; j++){
698
+ sprintf(buf, "%d", j+1);
699
+ SET_STRING_ELT(PositionInfoRowNames,j,mkChar(buf));
700
+ }
701
+ setAttrib(PositionInfo, R_RowNamesSymbol, PositionInfoRowNames);
702
+ UNPROTECT(1);
703
+
704
+ PROTECT(tmpSEXP = allocVector(STRSXP,7));
705
+ SET_STRING_ELT(tmpSEXP,0,mkChar("x"));
706
+ SET_STRING_ELT(tmpSEXP,1,mkChar("y"));
707
+ SET_STRING_ELT(tmpSEXP,2,mkChar("PMLength"));
708
+ SET_STRING_ELT(tmpSEXP,3,mkChar("ProbeSeq"));
709
+ SET_STRING_ELT(tmpSEXP,4,mkChar("MatchScore"));
710
+ SET_STRING_ELT(tmpSEXP,5,mkChar("PMPosition"));
711
+ SET_STRING_ELT(tmpSEXP,6,mkChar("TargetStrand"));
712
+
713
+ setAttrib(PositionInfo,R_NamesSymbol,tmpSEXP);
714
+ UNPROTECT(1);
715
+ }
716
+
717
+
718
+ }
719
+
720
+
721
+
722
+
723
+
724
+ for (j=0; j < nprobes; j++){
725
+ fread_be_uint32(&x,1,infile);
726
+ fread_be_uint32(&y,1,infile);
727
+ /* Rprintf("x y :%u %u\n",x,y); */
728
+
729
+ if (probe_mapping_type == 0){
730
+ fread_be_uint32(&x_mm,1,infile);
731
+ fread_be_uint32(&y_mm,1,infile);
732
+ }
733
+
734
+ /* Rprintf("mm x y :%u %u\n",x_mm,y_mm); */
735
+
736
+ INTEGER(xPM)[j] = x;
737
+ INTEGER(yPM)[j] = y;
738
+
739
+ if (probe_mapping_type == 0){
740
+ INTEGER(xMM)[j] = x_mm;
741
+ INTEGER(yMM)[j] = y_mm;
742
+ }
743
+ fread_be_uchar(&probelength,1,infile);
744
+ /* Rprintf("probelength : %d\n",(int)probelength);*/
745
+
746
+ INTEGER(PMprobeLength)[j] = probelength;
747
+
748
+
749
+ fread_be_uchar(probeseq,7,infile);
750
+ /* Rprintf("probeseq : %s\n",probeseq); */
751
+
752
+
753
+
754
+ dest = (char *)Calloc(25+1,char);
755
+ packedSeqTobaseStr(probeseq,dest);
756
+
757
+ SET_STRING_ELT(probeSeqString,j,mkChar(dest));
758
+ Free(dest);
759
+
760
+
761
+
762
+
763
+ /* matchScore is treated same as version number in header */
764
+ #ifdef WORDS_BIGENDIAN
765
+ /* swap, cast to integer, swap bytes and cast back to float */
766
+ fread_be_float32(&matchScore,1,infile);
767
+ swap_float_4(&matchScore);
768
+ matchScore_int = (int)matchScore;
769
+
770
+
771
+ matchScore_int=(((matchScore_int>>24)&0xff) | ((matchScore_int&0xff)<<24) |
772
+ ((matchScore_int>>8)&0xff00) | ((matchScore_int&0xff00)<<8));
773
+ matchScore = (float)matchScore_int;
774
+
775
+ #else
776
+ /* cast to integer, swap bytes, cast to float */
777
+ fread_float32(&matchScore,1,infile);
778
+ matchScore_int = (int)matchScore;
779
+ matchScore_int=(((matchScore_int>>24)&0xff) | ((matchScore_int&0xff)<<24) |
780
+ ((matchScore_int>>8)&0xff00) | ((matchScore_int&0xff00)<<8));
781
+ matchScore = (float)matchScore_int;
782
+ #endif
783
+ /* Rprintf("matchScore : %f\n",matchScore); */
784
+
785
+ REAL(MatchScore)[j] = matchScore;
786
+
787
+
788
+
789
+ fread_be_uint32(&positionPM,1,infile);
790
+ /* Rprintf("positionPM : %u\n",positionPM);*/
791
+ INTEGER(PMposition)[j] = positionPM;
792
+
793
+
794
+ fread_be_uchar(&strand,1,infile);
795
+ /* Rprintf("strand: %d\n",(int)strand);*/
796
+
797
+ if ((int)strand ==1){
798
+ SET_STRING_ELT(Strand,j,mkChar("F"));
799
+ } else {
800
+ SET_STRING_ELT(Strand,j,mkChar("R"));
801
+ }
802
+
803
+
804
+ }
805
+
806
+ SET_VECTOR_ELT(curSeqIdPositionInfo,1,PositionInfo);
807
+ UNPROTECT(1);
808
+
809
+ SET_VECTOR_ELT(SeqIdPositionInfoList,i,curSeqIdPositionInfo);
810
+ UNPROTECT(1);
811
+ }
812
+
813
+
814
+ UNPROTECT(1);
815
+ return SeqIdPositionInfoList;
816
+
817
+ }
818
+
819
+
820
+
821
+
822
+
823
+
824
+ SEXP ReadBPMAPFileIntoRList(SEXP filename){
825
+
826
+
827
+
828
+ SEXP bpmapRlist;
829
+
830
+ SEXP bpmapHeader;
831
+ SEXP bpmapSeqDesc;
832
+
833
+ SEXP tmpSXP;
834
+
835
+ FILE *infile;
836
+
837
+
838
+ int n_seq;
839
+ float version;
840
+
841
+
842
+ const char *cur_file_name;
843
+ cur_file_name = CHAR(STRING_ELT(filename,0));
844
+
845
+
846
+
847
+ if ((infile = fopen(cur_file_name, "rb")) == NULL)
848
+ {
849
+ error("Unable to open the file %s",filename);
850
+ }
851
+
852
+
853
+
854
+ /*
855
+ first element is header, second item is sequence descriptions
856
+ third item is sequence header/position information
857
+
858
+ */
859
+ PROTECT(bpmapRlist = allocVector(VECSXP,3));
860
+
861
+
862
+ PROTECT(bpmapHeader = ReadBPMAPHeader(infile));
863
+ SET_VECTOR_ELT(bpmapRlist,0,bpmapHeader);
864
+ version = REAL(VECTOR_ELT(bpmapHeader,1))[0];
865
+ n_seq = INTEGER(VECTOR_ELT(bpmapHeader,2))[0];
866
+ UNPROTECT(1);
867
+
868
+ /* Rprintf("version nseq: %f %d\n", version, n_seq); */
869
+
870
+
871
+ PROTECT(bpmapSeqDesc = ReadBPMAPSeqDescription(infile,version,n_seq));
872
+ SET_VECTOR_ELT(bpmapRlist,1,bpmapSeqDesc);
873
+ SET_VECTOR_ELT(bpmapRlist,2,readBPMAPSeqIdPositionInfo(infile,version,n_seq,bpmapSeqDesc));
874
+ UNPROTECT(1);
875
+
876
+ PROTECT(tmpSXP=allocVector(STRSXP,3));
877
+ SET_STRING_ELT(tmpSXP,0,mkChar("Header"));
878
+ SET_STRING_ELT(tmpSXP,1,mkChar("SequenceDescription"));
879
+ SET_STRING_ELT(tmpSXP,2,mkChar("SeqHead.PosInfo"));
880
+ setAttrib(bpmapRlist,R_NamesSymbol,tmpSXP);
881
+ UNPROTECT(1);
882
+
883
+ UNPROTECT(1);
884
+ return bpmapRlist;
885
+
886
+
887
+ }
888
+