bio-affy 0.1.0.alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. data/.document +5 -0
  2. data/.rspec +1 -0
  3. data/Gemfile +15 -0
  4. data/Gemfile.lock +32 -0
  5. data/LICENSE.txt +20 -0
  6. data/README.rdoc +33 -0
  7. data/Rakefile +77 -0
  8. data/VERSION +1 -0
  9. data/bin/bio-affy +80 -0
  10. data/bio-affy.gemspec +128 -0
  11. data/ext/DESCRIPTION +11 -0
  12. data/ext/HISTORY +3 -0
  13. data/ext/LICENSE +456 -0
  14. data/ext/NAMESPACE +2 -0
  15. data/ext/R/check.cdf.type.R +18 -0
  16. data/ext/R/read.cdffile.list.R +23 -0
  17. data/ext/R/read.celfile.R +11 -0
  18. data/ext/R/read.celfile.header.R +37 -0
  19. data/ext/R/read.probematrices.R +29 -0
  20. data/ext/README_BIOLIB +36 -0
  21. data/ext/aclocal.m4 +32 -0
  22. data/ext/configure +4898 -0
  23. data/ext/configure.in +51 -0
  24. data/ext/man/check.cdf.type.Rd +22 -0
  25. data/ext/man/read.cdffile.list.Rd +20 -0
  26. data/ext/man/read.celfile.Rd +23 -0
  27. data/ext/man/read.celfile.header.Rd +22 -0
  28. data/ext/man/read.celfile.probeintensity.matrices.Rd +31 -0
  29. data/ext/src/CMakeLists.txt +39 -0
  30. data/ext/src/Makevars.in +3 -0
  31. data/ext/src/Makevars.win +2 -0
  32. data/ext/src/Rakefile +43 -0
  33. data/ext/src/biolib_affyio.c +416 -0
  34. data/ext/src/biolib_affyio.h +132 -0
  35. data/ext/src/biolib_affyio.o +0 -0
  36. data/ext/src/fread_functions.c +871 -0
  37. data/ext/src/fread_functions.h +60 -0
  38. data/ext/src/fread_functions.o +0 -0
  39. data/ext/src/libaffyext.so +0 -0
  40. data/ext/src/mkrf.log +11 -0
  41. data/ext/src/mkrf_conf.rb +6 -0
  42. data/ext/src/read_abatch.c +5484 -0
  43. data/ext/src/read_abatch.h +63 -0
  44. data/ext/src/read_abatch.o +0 -0
  45. data/ext/src/read_bpmap.c +888 -0
  46. data/ext/src/read_bpmap.o +0 -0
  47. data/ext/src/read_cdf.h +347 -0
  48. data/ext/src/read_cdf_xda.c +1342 -0
  49. data/ext/src/read_cdf_xda.o +0 -0
  50. data/ext/src/read_cdffile2.c +1576 -0
  51. data/ext/src/read_cdffile2.o +0 -0
  52. data/ext/src/read_celfile_generic.c +2061 -0
  53. data/ext/src/read_celfile_generic.h +33 -0
  54. data/ext/src/read_celfile_generic.o +0 -0
  55. data/ext/src/read_clf.c +870 -0
  56. data/ext/src/read_clf.o +0 -0
  57. data/ext/src/read_generic.c +1446 -0
  58. data/ext/src/read_generic.h +144 -0
  59. data/ext/src/read_generic.o +0 -0
  60. data/ext/src/read_pgf.c +1337 -0
  61. data/ext/src/read_pgf.o +0 -0
  62. data/lib/bio-affy.rb +5 -0
  63. data/lib/bio/affy.rb +7 -0
  64. data/lib/bio/affyext.rb +23 -0
  65. data/lib/bio/libaffyext.so +0 -0
  66. data/spec/bio-affy_spec.rb +22 -0
  67. data/spec/spec_helper.rb +13 -0
  68. data/test/data/affy/GSM103328.CEL.gz +0 -0
  69. data/test/data/affy/GSM103329.CEL.gz +0 -0
  70. data/test/data/affy/GSM103330.CEL.gz +0 -0
  71. data/test/data/affy/MG_U74Av2.CDF.gz +0 -0
  72. metadata +190 -0
@@ -0,0 +1,63 @@
1
+ #ifndef READ_ABATCH_H
2
+ #define READ_ABATCH_H
3
+
4
+
5
+
6
+ /****************************************************************
7
+ **
8
+ ** A structure for holding full header information
9
+ **
10
+ **
11
+ **
12
+ ***************************************************************/
13
+
14
/*
 * Full header information.  An instance of this structure is embedded
 * as the `header` member of the CEL structure.
 */
typedef struct {
  char *cdfName;
  int cols;
  int rows;

  /* XY coordinates of the upper left grid corner in pixel coordinates. */
  int GridCornerULx;
  int GridCornerULy;

  /* XY coordinates of the upper right grid corner in pixel coordinates. */
  int GridCornerURx;
  int GridCornerURy;

  /* XY coordinates of the lower right grid corner in pixel coordinates. */
  int GridCornerLRx;
  int GridCornerLRy;

  /* XY coordinates of the lower left grid corner in pixel coordinates. */
  int GridCornerLLx;
  int GridCornerLLy;

  char *DatHeader;
  char *Algorithm;
  char *AlgorithmParameters;
  char *ScanDate;
} detailed_header_info;
27
+
28
+ /******************************************************************
29
+ **
30
+ ** A "C" level object designed to hold information for a
31
+ ** single CEL file
32
+ **
33
+ ** These should be created using the function
34
+ **
35
+ ** read_cel_file()
36
+ **
37
+ **
38
+ **
39
+ *****************************************************************/
40
+
41
+ typedef struct{
42
+ detailed_header_info header;
43
+
44
+ /** these are for storing the intensities, the sds and the number of pixels **/
45
+ double *intensities;
46
+ double *stddev;
47
+ double *npixels;
48
+
49
+ /** these are for storing information in the masks and outliers section **/
50
+
51
+ int nmasks;
52
+ int noutliers;
53
+
54
+ short *masks_x, *masks_y;
55
+ short *outliers_x, *outliers_y;
56
+
57
+ } CEL;
58
+
59
+ extern CEL *read_cel_file(const char *filename, int read_intensities_only);
60
+
61
+
62
+
63
+ #endif
Binary file
@@ -0,0 +1,888 @@
1
+ /****************************************************************
2
+ **
3
+ ** File: read_bpmap.c
4
+ **
5
+ ** Implementation by: B. M. Bolstad
6
+ **
7
+ ** Copyright (C) B. M. Bolstad 2006-2007
8
+ **
9
+ ** A parser designed to read bpmap files into an R List structure
10
+ **
11
+ ** History
12
+ ** Mar 11, 2006 - Initial version
13
+ ** Mar 12, 2006 - add additional support for versions 2 and 3
14
+ ** May 31, 2006 - Fix some compiler warnings
15
+ ** June 12, 2006 - fix naming vector length issue.
16
+ ** June 12, 2007 - much wailing and grinding of teeth, but finally a fix for reading version number right.
17
+ ** Aug 25, 2007 - Move file reading functions to centralized location
18
+ ** Mar 14, 2008 - Fix reading of version number for big endian platforms
19
+ ** Jan 15, 2008 - Fix VECTOR_ELT/STRING_ELT issues
20
+ **
21
+ *******************************************************************/
22
+
23
+ #include <R.h>
24
+ #include <Rdefines.h>
25
+
26
+ #include "stdlib.h"
27
+ #include "stdio.h"
28
+
29
+ #include "fread_functions.h"
30
+
31
+
32
+
33
+ /****************************************************************
34
+ **
35
+ **
36
+ **
37
+ **
38
+ ** Note BPMAP files are stored in big endian format
39
+ **
40
+ *******************************************************************/
41
+
42
+
43
+
44
+ /*************************************************************************
45
+ **
46
+ ** Code for reading from the big endian binary files, doing byte swapping if
47
+ ** necessary (on little-endian machines)
48
+ **
49
+ **
50
+ ************************************************************************/
51
+
52
+
53
+
54
/*
 * Byte-reverse the integer VALUE of a 4 byte floating point number.
 *
 * The float is converted (not reinterpreted) to an int, the four bytes
 * of that int are put in reverse order, and the resulting integer is
 * converted back to float and stored through tnf4.
 */
static void swap_float_4(float *tnf4)
{
  const int as_int = (int)(*tnf4);

  /* each term moves one byte of as_int to its mirrored position */
  const int top_to_bottom = (as_int >> 24) & 0xff;
  const int bottom_to_top = (as_int & 0xff) << 24;
  const int upper_to_lower = (as_int >> 8) & 0xff00;
  const int lower_to_upper = (as_int & 0xff00) << 8;

  *tnf4 = (float)(top_to_bottom | bottom_to_top | upper_to_lower | lower_to_upper);
}
64
+
65
+
66
+
67
+
68
+
69
+ static SEXP ReadBPMAPHeader(FILE *infile){
70
+
71
+
72
+ SEXP Header;
73
+ SEXP tmpSXP;
74
+
75
+
76
+ char *Magicnumber = R_alloc(8,sizeof(char));
77
+ float version_number = 0.0;
78
+ int version_number_int;
79
+ unsigned int unsigned_version_number_int;
80
+
81
+
82
+ unsigned int n_seq;
83
+ static double new_version_number;
84
+
85
+
86
+
87
+ fread_be_char(Magicnumber,8,infile);
88
+
89
+ if (strncmp(Magicnumber,"PHT7",4) !=0){
90
+ error("Based on the magic number which was %s, this does not appear to be a BPMAP file",Magicnumber);
91
+ }
92
+
93
+
94
+ /* version number is a little bit funky
95
+ need to do some funny things to coax it
96
+ into the right format
97
+ */
98
+
99
+
100
+ /* cast to integer, swap bytes, cast to float */
101
+ /* fread_be_float32(&version_number,1,infile); */
102
+ fread_float32(&version_number,1,infile);
103
+ swap_float_4(&version_number);
104
+
105
+ new_version_number = (double)version_number;
106
+ /* // Rprintf("A %f\n",version_number);*/
107
+
108
+ if ((version_number <=0.5) || (version_number > 3.5)){
109
+ /* // Rprintf("Rereading\n"); */
110
+ fseek(infile,-sizeof(float),SEEK_CUR);
111
+ fread_be_uint32(&unsigned_version_number_int,1,infile);
112
+ memcpy(&version_number,&unsigned_version_number_int, sizeof(float));
113
+ new_version_number = (double)version_number;
114
+ }
115
+
116
+ fread_be_uint32(&n_seq,1,infile);
117
+
118
+ PROTECT(Header=allocVector(VECSXP,3));
119
+
120
+ PROTECT(tmpSXP=allocVector(STRSXP,1));
121
+ SET_STRING_ELT(tmpSXP,0,mkChar(Magicnumber));
122
+ SET_VECTOR_ELT(Header,0,tmpSXP);
123
+ UNPROTECT(1);
124
+
125
+
126
+ PROTECT(tmpSXP=allocVector(REALSXP,1));
127
+ REAL(tmpSXP)[0] = (double)new_version_number;
128
+ SET_VECTOR_ELT(Header,1,tmpSXP);
129
+ UNPROTECT(1);
130
+
131
+
132
+ PROTECT(tmpSXP=allocVector(INTSXP,1));
133
+ INTEGER(tmpSXP)[0] = (int)n_seq;
134
+ SET_VECTOR_ELT(Header,2,tmpSXP);
135
+ UNPROTECT(1);
136
+
137
+ PROTECT(tmpSXP=allocVector(STRSXP,3));
138
+ SET_STRING_ELT(tmpSXP,0,mkChar("magic.number"));
139
+ SET_STRING_ELT(tmpSXP,1,mkChar("version"));
140
+ SET_STRING_ELT(tmpSXP,2,mkChar("n.seq"));
141
+ setAttrib(Header,R_NamesSymbol,tmpSXP);
142
+ UNPROTECT(2);
143
+
144
+ /* Rprintf("D %f %f\n",version_number,new_version_number); */
145
+ return Header;
146
+
147
+ }
148
+
149
+
150
+
151
+ static SEXP ReadBPMAPSeqDescription(FILE *infile, float version, int nseq){
152
+
153
+
154
+ SEXP SequenceDescriptionList;
155
+
156
+ SEXP CurSequenceDescription = R_NilValue;
157
+ SEXP tmpSXP,tmpSXP2;
158
+
159
+
160
+
161
+ int i,j;
162
+
163
+ unsigned int seq_name_length;
164
+
165
+ char *seq_name;
166
+
167
+ unsigned int probe_mapping_type;
168
+ unsigned int seq_file_offset;
169
+
170
+ unsigned int n_probes;
171
+
172
+ unsigned int group_name_length;
173
+ char *group_name;
174
+
175
+ unsigned int version_number_length;
176
+ char *version_number;
177
+
178
+ unsigned int number_parameters;
179
+
180
+ unsigned int param_length;
181
+ char *param_name;
182
+
183
+ /* Rprintf("%f %d\n",version,nseq); */
184
+
185
+ PROTECT(SequenceDescriptionList=allocVector(VECSXP,(int)nseq));
186
+
187
+ for (i=0; i < nseq; i++){
188
+ fread_be_uint32(&seq_name_length,1,infile);
189
+ seq_name = (char *)Calloc(seq_name_length+1,char);
190
+ fread_be_char(seq_name,seq_name_length,infile);
191
+
192
+
193
+
194
+ if (version == 3.00){
195
+ PROTECT(CurSequenceDescription=allocVector(VECSXP,8));
196
+ PROTECT(tmpSXP=allocVector(STRSXP,7));
197
+ SET_STRING_ELT(tmpSXP,0,mkChar("Name"));
198
+ SET_STRING_ELT(tmpSXP,1,mkChar("ProbeMappingType"));
199
+ SET_STRING_ELT(tmpSXP,2,mkChar("SequenceFileOffset"));
200
+ SET_STRING_ELT(tmpSXP,3,mkChar("n.probepairs"));
201
+ SET_STRING_ELT(tmpSXP,4,mkChar("GroupName"));
202
+ SET_STRING_ELT(tmpSXP,5,mkChar("VersionNumber"));
203
+ SET_STRING_ELT(tmpSXP,6,mkChar("NumberOfParameters"));
204
+ SET_STRING_ELT(tmpSXP,7,mkChar("Parameters"));
205
+ setAttrib(CurSequenceDescription,R_NamesSymbol,tmpSXP);
206
+ UNPROTECT(1);
207
+ } else if (version == 2.00){
208
+ PROTECT(CurSequenceDescription=allocVector(VECSXP,6));
209
+ PROTECT(tmpSXP=allocVector(STRSXP,6));
210
+ SET_STRING_ELT(tmpSXP,0,mkChar("Name"));
211
+ SET_STRING_ELT(tmpSXP,1,mkChar("n.probepairs"));
212
+ SET_STRING_ELT(tmpSXP,2,mkChar("GroupName"));
213
+ SET_STRING_ELT(tmpSXP,3,mkChar("VersionNumber"));
214
+ SET_STRING_ELT(tmpSXP,4,mkChar("NumberOfParameters"));
215
+ SET_STRING_ELT(tmpSXP,5,mkChar("Parameters"));
216
+ setAttrib(CurSequenceDescription,R_NamesSymbol,tmpSXP);
217
+ UNPROTECT(1);
218
+ } else if (version == 1.00){
219
+ PROTECT(CurSequenceDescription=allocVector(VECSXP,2));
220
+ PROTECT(tmpSXP=allocVector(STRSXP,2));
221
+ SET_STRING_ELT(tmpSXP,0,mkChar("Name"));
222
+ SET_STRING_ELT(tmpSXP,1,mkChar("n.probepairs"));
223
+ setAttrib(CurSequenceDescription,R_NamesSymbol,tmpSXP);
224
+ UNPROTECT(1);
225
+
226
+ }
227
+
228
+ PROTECT(tmpSXP=allocVector(STRSXP,1));
229
+ SET_STRING_ELT(tmpSXP,0,mkChar(seq_name));
230
+ SET_VECTOR_ELT(CurSequenceDescription,0,tmpSXP);
231
+ UNPROTECT(1);
232
+ Free(seq_name);
233
+
234
+
235
+ if (version == 1.0){
236
+ fread_be_uint32(&n_probes,1,infile);
237
+ PROTECT(tmpSXP=allocVector(INTSXP,1));
238
+ INTEGER(tmpSXP)[0] = n_probes;
239
+ SET_VECTOR_ELT(CurSequenceDescription,1,tmpSXP);
240
+ UNPROTECT(1);
241
+ } else if (version ==2.0){
242
+ fread_be_uint32(&n_probes,1,infile);
243
+ PROTECT(tmpSXP=allocVector(INTSXP,1));
244
+ INTEGER(tmpSXP)[0] = n_probes;
245
+ SET_VECTOR_ELT(CurSequenceDescription,1,tmpSXP);
246
+ UNPROTECT(1);
247
+
248
+
249
+
250
+
251
+ fread_be_uint32(&group_name_length,1,infile);
252
+ group_name = (char *)Calloc(group_name_length+1,char);
253
+ fread_be_char(group_name,group_name_length,infile);
254
+
255
+ PROTECT(tmpSXP=allocVector(STRSXP,1));
256
+ SET_STRING_ELT(tmpSXP,0,mkChar(group_name));
257
+ SET_VECTOR_ELT(CurSequenceDescription,2,tmpSXP);
258
+ UNPROTECT(1);
259
+ Free(group_name);
260
+
261
+
262
+ fread_be_uint32(&version_number_length,1,infile);
263
+ version_number = (char *)Calloc(version_number_length+1,char);
264
+ fread_be_char(version_number,version_number_length,infile);
265
+
266
+ PROTECT(tmpSXP=allocVector(STRSXP,1));
267
+ SET_STRING_ELT(tmpSXP,0,mkChar(version_number));
268
+ SET_VECTOR_ELT(CurSequenceDescription,3,tmpSXP);
269
+ UNPROTECT(1);
270
+ Free(version_number);
271
+
272
+
273
+ fread_be_uint32(&number_parameters,1,infile);
274
+ PROTECT(tmpSXP=allocVector(INTSXP,1));
275
+ INTEGER(tmpSXP)[0] = number_parameters;
276
+ SET_VECTOR_ELT(CurSequenceDescription,4,tmpSXP);
277
+ UNPROTECT(1);
278
+
279
+ PROTECT(tmpSXP=allocVector(VECSXP,number_parameters));
280
+
281
+
282
+ for (j=0; j < number_parameters; j++){
283
+ PROTECT(tmpSXP2 = allocVector(STRSXP,2));
284
+ fread_be_uint32(&param_length,1,infile);
285
+ param_name = (char *)Calloc(param_length+1,char);
286
+ fread_be_char(param_name,param_length,infile);
287
+ SET_STRING_ELT(tmpSXP2,0,mkChar(param_name));
288
+ Free(param_name);
289
+ fread_be_uint32(&param_length,1,infile);
290
+ param_name = (char *)Calloc(param_length+1,char);
291
+ fread_be_char(param_name,param_length,infile);
292
+ SET_STRING_ELT(tmpSXP2,1,mkChar(param_name));
293
+ Free(param_name);
294
+
295
+ SET_VECTOR_ELT(tmpSXP,j,tmpSXP2);
296
+ UNPROTECT(1);
297
+ }
298
+ SET_VECTOR_ELT(CurSequenceDescription,5,tmpSXP);
299
+ UNPROTECT(1);
300
+
301
+
302
+
303
+ } else if (version ==3.0){
304
+ fread_be_uint32(&probe_mapping_type,1,infile);
305
+ PROTECT(tmpSXP=allocVector(INTSXP,1));
306
+ INTEGER(tmpSXP)[0] = probe_mapping_type;
307
+ SET_VECTOR_ELT(CurSequenceDescription,1,tmpSXP);
308
+ UNPROTECT(1);
309
+
310
+ fread_be_uint32(&seq_file_offset,1,infile);
311
+ PROTECT(tmpSXP=allocVector(INTSXP,1));
312
+ INTEGER(tmpSXP)[0] = seq_file_offset;
313
+ SET_VECTOR_ELT(CurSequenceDescription,2,tmpSXP);
314
+ UNPROTECT(1);
315
+
316
+ fread_be_uint32(&n_probes,1,infile);
317
+ PROTECT(tmpSXP=allocVector(INTSXP,1));
318
+ INTEGER(tmpSXP)[0] = n_probes;
319
+ SET_VECTOR_ELT(CurSequenceDescription,3,tmpSXP);
320
+ UNPROTECT(1);
321
+
322
+ fread_be_uint32(&group_name_length,1,infile);
323
+ group_name = (char *)Calloc(group_name_length+1,char);
324
+ fread_be_char(group_name,group_name_length,infile);
325
+
326
+ PROTECT(tmpSXP=allocVector(STRSXP,1));
327
+ SET_STRING_ELT(tmpSXP,0,mkChar(group_name));
328
+ SET_VECTOR_ELT(CurSequenceDescription,4,tmpSXP);
329
+ UNPROTECT(1);
330
+ Free(group_name);
331
+
332
+ fread_be_uint32(&version_number_length,1,infile);
333
+ version_number = (char *)Calloc(version_number_length+1,char);
334
+ fread_be_char(version_number,version_number_length,infile);
335
+
336
+ PROTECT(tmpSXP=allocVector(STRSXP,1));
337
+ SET_STRING_ELT(tmpSXP,0,mkChar(version_number));
338
+ SET_VECTOR_ELT(CurSequenceDescription,5,tmpSXP);
339
+ UNPROTECT(1);
340
+ Free(version_number);
341
+
342
+ fread_be_uint32(&number_parameters,1,infile);
343
+ PROTECT(tmpSXP=allocVector(INTSXP,1));
344
+ INTEGER(tmpSXP)[0] = number_parameters;
345
+ SET_VECTOR_ELT(CurSequenceDescription,6,tmpSXP);
346
+ UNPROTECT(1);
347
+
348
+
349
+
350
+ PROTECT(tmpSXP=allocVector(VECSXP,number_parameters));
351
+
352
+
353
+ for (j=0; j < number_parameters; j++){
354
+ PROTECT(tmpSXP2 = allocVector(STRSXP,2));
355
+ fread_be_uint32(&param_length,1,infile);
356
+ param_name = (char *)Calloc(param_length+1,char);
357
+ fread_be_char(param_name,param_length,infile);
358
+ SET_STRING_ELT(tmpSXP2,0,mkChar(param_name));
359
+ Free(param_name);
360
+ fread_be_uint32(&param_length,1,infile);
361
+ param_name = (char *)Calloc(param_length+1,char);
362
+ fread_be_char(param_name,param_length,infile);
363
+ SET_STRING_ELT(tmpSXP2,1,mkChar(param_name));
364
+ Free(param_name);
365
+
366
+ SET_VECTOR_ELT(tmpSXP,j,tmpSXP2);
367
+ UNPROTECT(1);
368
+ }
369
+ SET_VECTOR_ELT(CurSequenceDescription,7,tmpSXP);
370
+ UNPROTECT(1);
371
+ }
372
+
373
+ SET_VECTOR_ELT(SequenceDescriptionList,i,CurSequenceDescription);
374
+ UNPROTECT(1);
375
+
376
+ }
377
+
378
+ UNPROTECT(1);
379
+ return SequenceDescriptionList;
380
+
381
+ }
382
+
383
+
384
+
385
/*
 * Expand a packed probe sequence into base letters.
 *
 * Every byte of probeseq holds four 2-bit codes, most significant pair
 * first, with 0 = 'A', 1 = 'C', 2 = 'G' and 3 = 'T'.  The first six
 * bytes give 24 letters and the top two bits of the seventh byte give
 * the 25th.  Exactly dest[0] .. dest[24] are written; no terminating
 * NUL is added, so the caller must provide one.
 */
static void packedSeqTobaseStr(unsigned char probeseq[7], char *dest){

  static const char base_letter[4] = {'A','C','G','T'};

  int byte_index;
  int slot;

  for (byte_index = 0; byte_index < 6; byte_index++){
    for (slot = 0; slot < 4; slot++){
      /* slot 0 is bits 7-6, slot 1 bits 5-4, slot 2 bits 3-2, slot 3 bits 1-0 */
      const int shift = 6 - 2*slot;
      dest[4*byte_index + slot] = base_letter[(probeseq[byte_index] >> shift) & 3];
    }
  }

  /* only the first pair of the last byte is used */
  dest[24] = base_letter[(probeseq[6] >> 6) & 3];
}
494
+
495
+
496
+
497
+
498
+
499
+
500
+
501
+
502
+ static SEXP readBPMAPSeqIdPositionInfo(FILE *infile, float version, int nseq, SEXP seqDesc){
503
+
504
+
505
+ SEXP SeqIdPositionInfoList;
506
+ SEXP curSeqIdPositionInfo;
507
+ SEXP PositionInfo= R_NilValue;
508
+ SEXP PositionInfoRowNames;
509
+
510
+
511
+ SEXP tmpSEXP;
512
+
513
+ SEXP xPM= R_NilValue,yPM= R_NilValue,xMM= R_NilValue,yMM= R_NilValue;
514
+ SEXP PMprobeLength= R_NilValue;
515
+ SEXP probeSeqString= R_NilValue;
516
+ SEXP MatchScore= R_NilValue;
517
+ SEXP PMposition= R_NilValue;
518
+ SEXP Strand= R_NilValue;
519
+
520
+ char buf[10];
521
+
522
+ char *dest;
523
+
524
+
525
+ int nprobes=0;
526
+ int probe_mapping_type=0;
527
+ int i,j;
528
+
529
+
530
+ unsigned int SeqId;
531
+
532
+ unsigned int x;
533
+ unsigned int y;
534
+
535
+ unsigned int x_mm;
536
+ unsigned int y_mm;
537
+
538
+ unsigned char probelength;
539
+
540
+ unsigned char probeseq[7];
541
+
542
+ float matchScore;
543
+ int matchScore_int;
544
+
545
+ unsigned int positionPM;
546
+ unsigned char strand;
547
+
548
+
549
+ PROTECT(SeqIdPositionInfoList = allocVector(VECSXP,nseq));
550
+
551
+ for (i =0; i < nseq; i++){
552
+ fread_be_uint32(&SeqId,1,infile);
553
+ /*Rprintf("Seq id:%u\n",SeqId);*/
554
+
555
+ PROTECT(curSeqIdPositionInfo = allocVector(VECSXP,2));
556
+
557
+
558
+ PROTECT(tmpSEXP=allocVector(INTSXP,1));
559
+ INTEGER(tmpSEXP)[0] = (int)SeqId;
560
+ SET_VECTOR_ELT(curSeqIdPositionInfo,0,tmpSEXP);
561
+ UNPROTECT(1);
562
+
563
+
564
+ PROTECT(tmpSEXP=allocVector(STRSXP,2));
565
+ SET_STRING_ELT(tmpSEXP,0,mkChar("Header"));
566
+ SET_STRING_ELT(tmpSEXP,1,mkChar("PositionInformation"));
567
+ setAttrib(curSeqIdPositionInfo,R_NamesSymbol,tmpSEXP);
568
+ UNPROTECT(1);
569
+
570
+
571
+
572
+ if ((version == 1.0) || (version == 2.0)){
573
+ nprobes = INTEGER(VECTOR_ELT(VECTOR_ELT(seqDesc,i),1))[0];
574
+ /* Rprintf("nprobes: %d\n",nprobes); */
575
+ probe_mapping_type = 0; /* PM/MM tiling */
576
+
577
+ PROTECT(PositionInfo = allocVector(VECSXP,9));
578
+ PROTECT(xPM = allocVector(INTSXP,nprobes));
579
+ PROTECT(yPM = allocVector(INTSXP,nprobes));
580
+ PROTECT(xMM = allocVector(INTSXP,nprobes));
581
+ PROTECT(yMM = allocVector(INTSXP,nprobes));
582
+ PROTECT(PMprobeLength = allocVector(INTSXP,nprobes));
583
+ PROTECT(probeSeqString = allocVector(STRSXP,nprobes));
584
+ PROTECT(MatchScore = allocVector(REALSXP,nprobes));
585
+ PROTECT(PMposition = allocVector(INTSXP,nprobes));
586
+ PROTECT(Strand = allocVector(STRSXP,nprobes));
587
+
588
+ SET_VECTOR_ELT(PositionInfo,0,xPM);
589
+ SET_VECTOR_ELT(PositionInfo,1,yPM);
590
+ SET_VECTOR_ELT(PositionInfo,2,xMM);
591
+ SET_VECTOR_ELT(PositionInfo,3,yMM);
592
+ SET_VECTOR_ELT(PositionInfo,4,PMprobeLength);
593
+ SET_VECTOR_ELT(PositionInfo,5,probeSeqString);
594
+ SET_VECTOR_ELT(PositionInfo,6,MatchScore);
595
+ SET_VECTOR_ELT(PositionInfo,7,PMposition);
596
+ SET_VECTOR_ELT(PositionInfo,8,Strand);
597
+ UNPROTECT(9);
598
+
599
+ setAttrib(PositionInfo,R_ClassSymbol,mkString("data.frame"));
600
+
601
+ PROTECT(PositionInfoRowNames = allocVector(STRSXP,nprobes));
602
+ for (j=0; j < nprobes; j++){
603
+ sprintf(buf, "%d", j+1);
604
+ SET_STRING_ELT(PositionInfoRowNames,j,mkChar(buf));
605
+ }
606
+ setAttrib(PositionInfo, R_RowNamesSymbol, PositionInfoRowNames);
607
+ UNPROTECT(1);
608
+
609
+ PROTECT(tmpSEXP = allocVector(STRSXP,9));
610
+ SET_STRING_ELT(tmpSEXP,0,mkChar("x"));
611
+ SET_STRING_ELT(tmpSEXP,1,mkChar("y"));
612
+ SET_STRING_ELT(tmpSEXP,2,mkChar("x.mm"));
613
+ SET_STRING_ELT(tmpSEXP,3,mkChar("y.mm"));
614
+ SET_STRING_ELT(tmpSEXP,4,mkChar("PMLength"));
615
+ SET_STRING_ELT(tmpSEXP,5,mkChar("ProbeSeq"));
616
+ SET_STRING_ELT(tmpSEXP,6,mkChar("MatchScore"));
617
+ SET_STRING_ELT(tmpSEXP,7,mkChar("PMPosition"));
618
+ SET_STRING_ELT(tmpSEXP,8,mkChar("TargetStrand"));
619
+
620
+ setAttrib(PositionInfo,R_NamesSymbol,tmpSEXP);
621
+ UNPROTECT(1);
622
+
623
+ } else if (version == 3.0){
624
+ nprobes = INTEGER(VECTOR_ELT(VECTOR_ELT(seqDesc,i),3))[0];
625
+ probe_mapping_type = INTEGER(VECTOR_ELT(VECTOR_ELT(seqDesc,i),1))[0];
626
+
627
+
628
+ if (probe_mapping_type == 0){
629
+ PROTECT(PositionInfo = allocVector(VECSXP,9));
630
+ PROTECT(xPM = allocVector(INTSXP,nprobes));
631
+ PROTECT(yPM = allocVector(INTSXP,nprobes));
632
+ PROTECT(xMM = allocVector(INTSXP,nprobes));
633
+ PROTECT(yMM = allocVector(INTSXP,nprobes));
634
+ PROTECT(PMprobeLength = allocVector(INTSXP,nprobes));
635
+ PROTECT(probeSeqString = allocVector(STRSXP,nprobes));
636
+ PROTECT(MatchScore = allocVector(REALSXP,nprobes));
637
+ PROTECT(PMposition = allocVector(INTSXP,nprobes));
638
+ PROTECT(Strand = allocVector(STRSXP,nprobes));
639
+
640
+ SET_VECTOR_ELT(PositionInfo,0,xPM);
641
+ SET_VECTOR_ELT(PositionInfo,1,yPM);
642
+ SET_VECTOR_ELT(PositionInfo,2,xMM);
643
+ SET_VECTOR_ELT(PositionInfo,3,yMM);
644
+ SET_VECTOR_ELT(PositionInfo,4,PMprobeLength);
645
+ SET_VECTOR_ELT(PositionInfo,5,probeSeqString);
646
+ SET_VECTOR_ELT(PositionInfo,6,MatchScore);
647
+ SET_VECTOR_ELT(PositionInfo,7,PMposition);
648
+ SET_VECTOR_ELT(PositionInfo,8,Strand);
649
+ UNPROTECT(9);
650
+
651
+ setAttrib(PositionInfo,R_ClassSymbol,mkString("data.frame"));
652
+
653
+ PROTECT(PositionInfoRowNames = allocVector(STRSXP,nprobes));
654
+ for (j=0; j < nprobes; j++){
655
+ sprintf(buf, "%d", j+1);
656
+ SET_VECTOR_ELT(PositionInfoRowNames,j,mkChar(buf));
657
+ }
658
+ setAttrib(PositionInfo, R_RowNamesSymbol, PositionInfoRowNames);
659
+ UNPROTECT(1);
660
+
661
+ PROTECT(tmpSEXP = allocVector(STRSXP,9));
662
+ SET_STRING_ELT(tmpSEXP,0,mkChar("x"));
663
+ SET_STRING_ELT(tmpSEXP,1,mkChar("y"));
664
+ SET_STRING_ELT(tmpSEXP,2,mkChar("x.mm"));
665
+ SET_STRING_ELT(tmpSEXP,3,mkChar("y.mm"));
666
+ SET_STRING_ELT(tmpSEXP,4,mkChar("PMLength"));
667
+ SET_STRING_ELT(tmpSEXP,5,mkChar("ProbeSeq"));
668
+ SET_STRING_ELT(tmpSEXP,6,mkChar("MatchScore"));
669
+ SET_STRING_ELT(tmpSEXP,7,mkChar("PMPosition"));
670
+ SET_STRING_ELT(tmpSEXP,8,mkChar("TargetStrand"));
671
+
672
+ setAttrib(PositionInfo,R_NamesSymbol,tmpSEXP);
673
+ UNPROTECT(1);
674
+ } else {
675
+
676
+ PROTECT(PositionInfo = allocVector(VECSXP,7));
677
+ PROTECT(xPM = allocVector(INTSXP,nprobes));
678
+ PROTECT(yPM = allocVector(INTSXP,nprobes));
679
+ PROTECT(PMprobeLength = allocVector(INTSXP,nprobes));
680
+ PROTECT(probeSeqString = allocVector(STRSXP,nprobes));
681
+ PROTECT(MatchScore = allocVector(REALSXP,nprobes));
682
+ PROTECT(PMposition = allocVector(INTSXP,nprobes));
683
+ PROTECT(Strand = allocVector(STRSXP,nprobes));
684
+
685
+ SET_VECTOR_ELT(PositionInfo,0,xPM);
686
+ SET_VECTOR_ELT(PositionInfo,1,yPM);
687
+ SET_VECTOR_ELT(PositionInfo,2,PMprobeLength);
688
+ SET_VECTOR_ELT(PositionInfo,3,probeSeqString);
689
+ SET_VECTOR_ELT(PositionInfo,4,MatchScore);
690
+ SET_VECTOR_ELT(PositionInfo,5,PMposition);
691
+ SET_VECTOR_ELT(PositionInfo,6,Strand);
692
+ UNPROTECT(7);
693
+
694
+ setAttrib(PositionInfo,R_ClassSymbol,mkString("data.frame"));
695
+
696
+ PROTECT(PositionInfoRowNames = allocVector(STRSXP,nprobes));
697
+ for (j=0; j < nprobes; j++){
698
+ sprintf(buf, "%d", j+1);
699
+ SET_STRING_ELT(PositionInfoRowNames,j,mkChar(buf));
700
+ }
701
+ setAttrib(PositionInfo, R_RowNamesSymbol, PositionInfoRowNames);
702
+ UNPROTECT(1);
703
+
704
+ PROTECT(tmpSEXP = allocVector(STRSXP,7));
705
+ SET_STRING_ELT(tmpSEXP,0,mkChar("x"));
706
+ SET_STRING_ELT(tmpSEXP,1,mkChar("y"));
707
+ SET_STRING_ELT(tmpSEXP,2,mkChar("PMLength"));
708
+ SET_STRING_ELT(tmpSEXP,3,mkChar("ProbeSeq"));
709
+ SET_STRING_ELT(tmpSEXP,4,mkChar("MatchScore"));
710
+ SET_STRING_ELT(tmpSEXP,5,mkChar("PMPosition"));
711
+ SET_STRING_ELT(tmpSEXP,6,mkChar("TargetStrand"));
712
+
713
+ setAttrib(PositionInfo,R_NamesSymbol,tmpSEXP);
714
+ UNPROTECT(1);
715
+ }
716
+
717
+
718
+ }
719
+
720
+
721
+
722
+
723
+
724
+ for (j=0; j < nprobes; j++){
725
+ fread_be_uint32(&x,1,infile);
726
+ fread_be_uint32(&y,1,infile);
727
+ /* Rprintf("x y :%u %u\n",x,y); */
728
+
729
+ if (probe_mapping_type == 0){
730
+ fread_be_uint32(&x_mm,1,infile);
731
+ fread_be_uint32(&y_mm,1,infile);
732
+ }
733
+
734
+ /* Rprintf("mm x y :%u %u\n",x_mm,y_mm); */
735
+
736
+ INTEGER(xPM)[j] = x;
737
+ INTEGER(yPM)[j] = y;
738
+
739
+ if (probe_mapping_type == 0){
740
+ INTEGER(xMM)[j] = x_mm;
741
+ INTEGER(yMM)[j] = y_mm;
742
+ }
743
+ fread_be_uchar(&probelength,1,infile);
744
+ /* Rprintf("probelength : %d\n",(int)probelength);*/
745
+
746
+ INTEGER(PMprobeLength)[j] = probelength;
747
+
748
+
749
+ fread_be_uchar(probeseq,7,infile);
750
+ /* Rprintf("probeseq : %s\n",probeseq); */
751
+
752
+
753
+
754
+ dest = (char *)Calloc(25+1,char);
755
+ packedSeqTobaseStr(probeseq,dest);
756
+
757
+ SET_STRING_ELT(probeSeqString,j,mkChar(dest));
758
+ Free(dest);
759
+
760
+
761
+
762
+
763
+ /* matchScore is treated same as version number in header */
764
+ #ifdef WORDS_BIGENDIAN
765
+ /* swap, cast to integer, swap bytes and cast back to float */
766
+ fread_be_float32(&matchScore,1,infile);
767
+ swap_float_4(&matchScore);
768
+ matchScore_int = (int)matchScore;
769
+
770
+
771
+ matchScore_int=(((matchScore_int>>24)&0xff) | ((matchScore_int&0xff)<<24) |
772
+ ((matchScore_int>>8)&0xff00) | ((matchScore_int&0xff00)<<8));
773
+ matchScore = (float)matchScore_int;
774
+
775
+ #else
776
+ /* cast to integer, swap bytes, cast to float */
777
+ fread_float32(&matchScore,1,infile);
778
+ matchScore_int = (int)matchScore;
779
+ matchScore_int=(((matchScore_int>>24)&0xff) | ((matchScore_int&0xff)<<24) |
780
+ ((matchScore_int>>8)&0xff00) | ((matchScore_int&0xff00)<<8));
781
+ matchScore = (float)matchScore_int;
782
+ #endif
783
+ /* Rprintf("matchScore : %f\n",matchScore); */
784
+
785
+ REAL(MatchScore)[j] = matchScore;
786
+
787
+
788
+
789
+ fread_be_uint32(&positionPM,1,infile);
790
+ /* Rprintf("positionPM : %u\n",positionPM);*/
791
+ INTEGER(PMposition)[j] = positionPM;
792
+
793
+
794
+ fread_be_uchar(&strand,1,infile);
795
+ /* Rprintf("strand: %d\n",(int)strand);*/
796
+
797
+ if ((int)strand ==1){
798
+ SET_STRING_ELT(Strand,j,mkChar("F"));
799
+ } else {
800
+ SET_STRING_ELT(Strand,j,mkChar("R"));
801
+ }
802
+
803
+
804
+ }
805
+
806
+ SET_VECTOR_ELT(curSeqIdPositionInfo,1,PositionInfo);
807
+ UNPROTECT(1);
808
+
809
+ SET_VECTOR_ELT(SeqIdPositionInfoList,i,curSeqIdPositionInfo);
810
+ UNPROTECT(1);
811
+ }
812
+
813
+
814
+ UNPROTECT(1);
815
+ return SeqIdPositionInfoList;
816
+
817
+ }
818
+
819
+
820
+
821
+
822
+
823
+
824
+ SEXP ReadBPMAPFileIntoRList(SEXP filename){
825
+
826
+
827
+
828
+ SEXP bpmapRlist;
829
+
830
+ SEXP bpmapHeader;
831
+ SEXP bpmapSeqDesc;
832
+
833
+ SEXP tmpSXP;
834
+
835
+ FILE *infile;
836
+
837
+
838
+ int n_seq;
839
+ float version;
840
+
841
+
842
+ const char *cur_file_name;
843
+ cur_file_name = CHAR(STRING_ELT(filename,0));
844
+
845
+
846
+
847
+ if ((infile = fopen(cur_file_name, "rb")) == NULL)
848
+ {
849
+ error("Unable to open the file %s",filename);
850
+ }
851
+
852
+
853
+
854
+ /*
855
+ first element is header, second item is sequence descriptions
856
+ third item is sequence header/position information
857
+
858
+ */
859
+ PROTECT(bpmapRlist = allocVector(VECSXP,3));
860
+
861
+
862
+ PROTECT(bpmapHeader = ReadBPMAPHeader(infile));
863
+ SET_VECTOR_ELT(bpmapRlist,0,bpmapHeader);
864
+ version = REAL(VECTOR_ELT(bpmapHeader,1))[0];
865
+ n_seq = INTEGER(VECTOR_ELT(bpmapHeader,2))[0];
866
+ UNPROTECT(1);
867
+
868
+ /* Rprintf("version nseq: %f %d\n", version, n_seq); */
869
+
870
+
871
+ PROTECT(bpmapSeqDesc = ReadBPMAPSeqDescription(infile,version,n_seq));
872
+ SET_VECTOR_ELT(bpmapRlist,1,bpmapSeqDesc);
873
+ SET_VECTOR_ELT(bpmapRlist,2,readBPMAPSeqIdPositionInfo(infile,version,n_seq,bpmapSeqDesc));
874
+ UNPROTECT(1);
875
+
876
+ PROTECT(tmpSXP=allocVector(STRSXP,3));
877
+ SET_STRING_ELT(tmpSXP,0,mkChar("Header"));
878
+ SET_STRING_ELT(tmpSXP,1,mkChar("SequenceDescription"));
879
+ SET_STRING_ELT(tmpSXP,2,mkChar("SeqHead.PosInfo"));
880
+ setAttrib(bpmapRlist,R_NamesSymbol,tmpSXP);
881
+ UNPROTECT(1);
882
+
883
+ UNPROTECT(1);
884
+ return bpmapRlist;
885
+
886
+
887
+ }
888
+