scbi_fqbin 0.2.2 → 0.2.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (116) hide show
  1. checksums.yaml +7 -0
  2. data/.DS_Store +0 -0
  3. data/.gitignore +14 -0
  4. data/Gemfile +4 -0
  5. data/LICENSE.txt +22 -0
  6. data/{README.rdoc → README.md} +0 -0
  7. data/Rakefile +8 -28
  8. data/lib/scbi_fqbin.rb +3 -5
  9. data/lib/scbi_fqbin/fastabin.rb +411 -0
  10. data/lib/scbi_fqbin/fastq_file_c.rb +373 -0
  11. data/lib/scbi_fqbin/fbin_file.rb +1 -1
  12. data/lib/scbi_fqbin/t.rb +9 -0
  13. data/lib/scbi_fqbin/t2.rb +12 -0
  14. data/lib/scbi_fqbin/version.rb +3 -0
  15. data/lib_fqbin_src.zip +0 -0
  16. data/lib_fqbin_src/Makefile +66 -0
  17. data/lib_fqbin_src/fq +0 -0
  18. data/lib_fqbin_src/fq.c +165 -0
  19. data/lib_fqbin_src/hash_fqbin +0 -0
  20. data/lib_fqbin_src/hash_fqbin.c +212 -0
  21. data/lib_fqbin_src/idx_fqbin +21 -0
  22. data/lib_fqbin_src/iterate_fqbin +0 -0
  23. data/lib_fqbin_src/iterate_fqbin.c +136 -0
  24. data/lib_fqbin_src/lib_fqbin.c +1748 -0
  25. data/lib_fqbin_src/lib_fqbin.h +194 -0
  26. data/lib_fqbin_src/mk_fqbin +0 -0
  27. data/lib_fqbin_src/mk_fqbin.c +138 -0
  28. data/lib_fqbin_src/other/bwxform.c +915 -0
  29. data/lib_fqbin_src/other/bwxform.h +74 -0
  30. data/lib_fqbin_src/other/find_in_index.c +130 -0
  31. data/lib_fqbin_src/other/hash_fbin_nogzchunks.c +164 -0
  32. data/lib_fqbin_src/other/idx_fqbin +0 -0
  33. data/lib_fqbin_src/other/idx_fqbin.c +67 -0
  34. data/lib_fqbin_src/other/make_hsh.sh +14 -0
  35. data/lib_fqbin_src/other/rd_extras_fbin.c +45 -0
  36. data/lib_fqbin_src/read_fq +0 -0
  37. data/lib_fqbin_src/read_fq.c +143 -0
  38. data/lib_fqbin_src/read_fqbin +0 -0
  39. data/lib_fqbin_src/read_fqbin.c +101 -0
  40. data/lib_fqbin_src/sort_index +9 -0
  41. data/lib_fqbin_src/test.rb +13 -0
  42. data/scbi_fqbin.gemspec +25 -0
  43. data/test/build.rake +15 -0
  44. data/test/fbinfile +0 -0
  45. data/test/fbinfile.index +0 -0
  46. data/test/no_test_fill_file.rb +66 -0
  47. data/test/old/app.rb +43 -0
  48. data/test/old/bin/iterate_fastabin.rb +54 -0
  49. data/test/old/bin/mk_fastabin.rb +22 -0
  50. data/test/old/bin/rd_fastabin.rb +36 -0
  51. data/test/old/bin/rd_fq.rb +20 -0
  52. data/test/old/bioruby.rb +27 -0
  53. data/test/old/c/Makefile +34 -0
  54. data/test/old/c/fbin_lib.zip +0 -0
  55. data/test/old/c/iterate_fbin.c +54 -0
  56. data/test/old/c/libreria_gz.c +707 -0
  57. data/test/old/c/libreria_gz.h +127 -0
  58. data/test/old/c/main.c +86 -0
  59. data/test/old/c/mk_fbin.c +24 -0
  60. data/test/old/c/rd_seq_fbin.c +44 -0
  61. data/test/old/c/test_ffi/a.out +0 -0
  62. data/test/old/c/test_ffi/app.c +26 -0
  63. data/test/old/c/test_ffi/app.rb +19 -0
  64. data/test/old/c/test_ffi/liblibreria_gz.dylib +0 -0
  65. data/test/old/c/test_ffi/libmylibrary.dylib +0 -0
  66. data/test/old/c/test_ffi/my_library.rb +23 -0
  67. data/test/old/c/test_ffi/mylibrary.c +22 -0
  68. data/test/old/c/test_ffi/mylibrary.h +6 -0
  69. data/test/old/c/usage_instructions.txt +62 -0
  70. data/test/old/ext/Makefile +187 -0
  71. data/test/old/ext/Makefile.dario +34 -0
  72. data/test/old/ext/extconf.rb +8 -0
  73. data/test/old/ext/mk_fbin.c +24 -0
  74. data/test/old/ext/sample/extras.txt +4 -0
  75. data/{.gemtest → test/old/ext/sample/extras2.txt} +0 -0
  76. data/test/old/ext/sample/f1.fasta +10 -0
  77. data/test/old/ext/sample/f1.fasta.qual +10 -0
  78. data/test/old/ext/sample/f1.fbin +0 -0
  79. data/test/old/ext/sample/f1.fbin.index +0 -0
  80. data/test/old/ext/sample/main.c +86 -0
  81. data/test/old/ext/usage_instructions.txt +62 -0
  82. data/test/old/t_scbi_fastabin.rb +140 -0
  83. data/test/read_tests/10-original_sizes.sh +16 -0
  84. data/test/read_tests/20-fq_time.sh +23 -0
  85. data/test/read_tests/30-fbin_read_time.sh +23 -0
  86. data/test/read_tests/40-bsc_read_time.sh +21 -0
  87. data/test/read_tests/50-fq_time_x4.sh +25 -0
  88. data/test/read_tests/60-fbin_read_time_x4.sh +24 -0
  89. data/test/read_tests/70-bsc_read_time_x4.sh +32 -0
  90. data/test/results_bio_scbi_fasta.txt +11 -0
  91. data/test/{test_scbi_fbin_file.rb → scbi_fbin_file_test.rb} +0 -0
  92. data/test/speed.txt +81 -0
  93. data/test/t_scbi_fasta.rb +12 -0
  94. data/test/write_tests/10-original_sizes.sh +16 -0
  95. data/test/write_tests/20-zip_time.sh +17 -0
  96. data/test/write_tests/30-mk_fbin_time.sh +23 -0
  97. data/test/write_tests/31-mk_fbin_time_f30.sh +21 -0
  98. data/test/write_tests/40-gzip_time.sh +16 -0
  99. data/test/write_tests/41-bsc_time.sh +16 -0
  100. data/test/write_tests/50-zip_sizes.sh +16 -0
  101. data/test/write_tests/60-fbin_sizes.sh +17 -0
  102. data/test/write_tests/61-fbin_sizes_f30.sh +16 -0
  103. data/test/write_tests/70-gzip_sizes.sh +17 -0
  104. data/test/write_tests/80-bsc_sizes.sh +17 -0
  105. data/website/index.html +87 -0
  106. data/website/index.txt +81 -0
  107. data/website/javascripts/rounded_corners_lite.inc.js +285 -0
  108. data/website/stylesheets/screen.css +159 -0
  109. data/website/template.html.erb +50 -0
  110. metadata +208 -95
  111. data/History.txt +0 -19
  112. data/Manifest.txt +0 -12
  113. data/PostInstall.txt +0 -7
  114. data/script/console +0 -10
  115. data/script/destroy +0 -14
  116. data/script/generate +0 -14
@@ -0,0 +1,1748 @@
1
+ #include "lib_fqbin.h"
2
+
3
+
4
/*
 * Report a failed check on stderr.
 *
 * error_condition : non-zero when the checked operation failed.
 * message         : caller-supplied description printed with the errno text.
 * return_value    : value handed back to the caller when the check failed.
 *
 * Returns 0 when error_condition is zero, return_value otherwise.
 */
int check_error(int error_condition, char *message, int return_value){
    /* Capture errno first: fprintf/strerror are allowed to modify it. */
    int saved_errno = errno;

    if (!error_condition) {
        /* The original fell off the end here, leaving the result undefined. */
        return 0;
    }

    fprintf(stderr,"Error %d; %s\nMSG:%s\n", saved_errno, message, strerror(saved_errno));
    return return_value;
}
10
+
11
+
12
+ int write_seq(struct file_data *file, char *seq_name, char *fasta, char *qual, char *extras)
13
+ {
14
+ // compress data
15
+ char metainfo[SEQ_METADATA];
16
+ int error=0;
17
+
18
+ // Convert to fastq
19
+
20
+ // char qual[150000];
21
+ // sprintf(qual,"");
22
+ // char *sep = " ";
23
+ // char *word, *brkt;
24
+ //
25
+ // for (word = strtok_r(in_qual, sep, &brkt);
26
+ // word;
27
+ // word = strtok_r(NULL, sep, &brkt))
28
+ // {
29
+ // sprintf(qual,"%s%c",qual,atoi(word)+33);
30
+ // // strcat(qual2,".");
31
+ // // printf("%s,%s,%c\n",word,qual2, atoi(word)+33);
32
+ //
33
+ // }
34
+
35
+ // printf("write seq\n");
36
+
37
+
38
+ if (file->gzf_bin==NULL) {fprintf(stderr,"error with gzfile_bin, is NULL :%s\n",gzerror(file->gzf_bin,&error));return -2;}
39
+
40
+ long fasta_len=strlen(fasta);
41
+
42
+ // preprocess fasta string
43
+ // if (fasta_len>0)
44
+ // {
45
+ // // printf("IN:%s, %ld\n",fasta,strlen(fasta));
46
+ // // BWXform(fasta, fasta,1);
47
+ // // printf("\nOUT:\n");
48
+ // // write(1,fasta,fasta_len);
49
+ // // printf("\n");
50
+ // // fflush(1);
51
+ // }
52
+
53
+
54
+ // preprocess quality string
55
+ if (strlen(qual)>0){
56
+
57
+ // printf("========================\nQ:%s\n",qual);
58
+ // process qual
59
+ int i;
60
+
61
+ // annotate first qual to compare with rest of quals
62
+ char old=qual[0];
63
+
64
+ // if qual is going to be flattened, take the flatten limit as old
65
+ if ((file->flatten_qual>0) & (old>=file->flatten_qual))
66
+ {
67
+ old=file->flatten_qual;
68
+ }
69
+
70
+ int same_qual=1;
71
+
72
+ // int repeated_count=0;
73
+ // int repeated_start=0;
74
+ // int max_repeated_count=0;
75
+ // int max_repeated_start=0;
76
+
77
+ // process qual string
78
+ for( i = 0; i < strlen(qual); ++i)
79
+ {
80
+
81
+ //discretize up by discretize_qual value
82
+ if (file->discretize_qual>1){
83
+ // qual[i]=qual[i]-(qual[i] % 2)+2-1;
84
+ qual[i]=(qual[i] / file->discretize_qual)*file->discretize_qual;
85
+ }
86
+
87
+ // printf("FL:%c>=%c\n",qual[i],file->flatten_qual);
88
+ // trim high qualitys
89
+ if ((file->flatten_qual>0) & (qual[i]>file->flatten_qual)){
90
+ qual[i]=file->flatten_qual;
91
+ }
92
+
93
+ // if (qual[i]!=old) {
94
+ // if (repeated_count>=max_repeated_count) {
95
+ // max_repeated_start=repeated_start;
96
+ // max_repeated_count=repeated_count;
97
+ // }
98
+ //
99
+ // repeated_start=i;
100
+ // repeated_count=0;
101
+ // old=qual[i]
102
+ //
103
+ // same_qual=0;
104
+ // }
105
+
106
+ // if (qual[i]!=old) {same_qual=0;}
107
+ }
108
+
109
+ if (same_qual)
110
+ {
111
+ // trim qual string
112
+ sprintf(qual,"%c",old);
113
+ qual[1]=0;
114
+ // printf("EQUAL: %s,%s,%ld\n",seq_name, qual,strlen(qual));
115
+ }
116
+
117
+
118
+ // sino hacer RLE
119
+
120
+
121
+ // printf("\nQ:%s\n",qual);
122
+
123
+ }
124
+ // printf("Calc fasta_len\n");
125
+ // printf("Calculated fasta_len\n");
126
+
127
+ snprintf(metainfo,SEQ_METADATA-1,"9999%s %ld %ld %ld", seq_name, fasta_len, strlen(qual), strlen(extras));
128
+ snprintf(metainfo,SEQ_METADATA-1,"%4ld%s %ld %ld %ld", strlen(metainfo)-4, seq_name, fasta_len, strlen(qual), strlen(extras));
129
+ // snprintf(metainfo,SEQ_METADATA-1,"9999%s %ld %ld", seq_name, strlen(fasta), strlen(extras));
130
+ // snprintf(metainfo,SEQ_METADATA-1,"%4ld%s %ld %ld", strlen(metainfo)-4, seq_name, strlen(fasta), strlen(extras));
131
+
132
+ // get begin pos of header
133
+ long beginH=gztell(file->gzf_bin);
134
+
135
+ // TODO check gztell
136
+ if (beginH==-1) {fprintf(stderr,"error with pos of beginH of gzfile_bin :%s\n", gzerror(file->gzf_bin,&error)); return -2;}
137
+
138
+ // write seq to bin file
139
+ gzwrite(file->gzf_bin, metainfo, strlen(metainfo));
140
+
141
+ // TODO check gzwrite
142
+ long beginI=gztell(file->gzf_bin);
143
+
144
+ if (beginI==-1) {fprintf(stderr,"error with pos of beginI of gzfile :%s\n",gzerror(file->gzf_bin,&error));return -2;}
145
+
146
+ // printf("Before write fasta\n");
147
+
148
+ int res=1;
149
+ if (fasta_len>0) res=gzwrite(file->gzf_bin,fasta,fasta_len); //Z_FILTERED);
150
+
151
+ if ( res==0 ) { fprintf(stderr,"Error when writting fasta\n");return -8;}
152
+ long fastaS=gztell(file->gzf_bin)-beginI;
153
+
154
+ // printf("After write fasta\n");
155
+
156
+
157
+ if (strlen(qual)>0){
158
+ res=gzwrite(file->gzf_bin,qual,strlen(qual)); //Z_FILTERED);
159
+ }
160
+
161
+ if ( res==0 ) { fprintf(stderr,"Error when writting qual\n");return -8;}
162
+ long qualS=gztell(file->gzf_bin)-fastaS-beginI;
163
+
164
+ if (strlen(extras)>0) res=gzwrite(file->gzf_bin,extras,strlen(extras)); //Z_FILTERED);
165
+
166
+ if ( res==0 ) { fprintf(stderr,"Error when writting extras\n");return -8;}
167
+ long extrasS=gztell(file->gzf_bin)-qualS-fastaS-beginI;
168
+
169
+ // add_sequence(&seql,seq_name,pos_chunk_gz,beginI,fastaS,qualS,extrasS);
170
+
171
+ // Write index file
172
+ if((file)->create_index)
173
+ {
174
+ char tmp[SEQ_METADATA];
175
+ sprintf(tmp,"%s %lld %ld\n",seq_name,file->pos_chunk_gz,beginH);
176
+ gzwrite(file->gzf_index,tmp,strlen(tmp));
177
+ }
178
+
179
+ (file->counter)++;
180
+ // if (counter > 2) fprintf(stderr,"Probando static counter para llamadas desde ruby, valor %d\n",counter);
181
+
182
+ // create new chunk
183
+ if (((file->counter)%10000)==0) {
184
+ // curr_time=time(NULL);
185
+ // printf("10k seqs in:%f secs\n",difftime(curr_time,prev_time));
186
+ // prev_time=curr_time;
187
+
188
+ // close current chunk
189
+ gzclose(file->gzf_bin);
190
+
191
+ // open file again to annotate chunk
192
+ int file_bin=open(file->name,O_APPEND);
193
+
194
+ //goto end of file
195
+ long long pos=lseek(file_bin,0,SEEK_END);
196
+ if (pos==-1) {fprintf(stderr,"error %d seeking file :%s\n",errno,strerror(errno));return -1;}
197
+
198
+ // annotate chunk pos
199
+ file->pos_chunk_gz=pos;
200
+
201
+ close(file_bin);
202
+
203
+ // open new gzfile
204
+ file->gzf_bin=gzopen(file->name,"ab");
205
+ if (file->gzf_bin==NULL) {fprintf(stderr,"error opening gzfile :%s\n",gzerror(file->gzf_bin,&error));return -2;}
206
+ }
207
+
208
+ return 0;
209
+ }
210
+
211
+
212
+
213
+ /* Reads the metadata from the main file
214
+ It initializes the version variable
215
+ */
216
+ int read_bin_file_metadata(struct file_data *filed)
217
+ {
218
+ char header[SEQ_METADATA];
219
+ int fastaS,qualS,extrasS=0;
220
+ int ver,subver;
221
+
222
+ // printf("pos1b %ld\n",gztell(filed->gzf_bin));//lseek ((*filed), 0, SEEK_CUR))
223
+
224
+ int res=read_seq_header(filed->gzf_bin, header, &fastaS, &qualS, &extrasS);
225
+ // printf("pos2b %ld\n",gztell(filed->gzf_bin));//lseek ((*filed), 0, SEEK_CUR))
226
+
227
+
228
+ if ( res!=0 ) {return -1;}
229
+ if ( strlen(header)<20 ) {fprintf(stderr,"Too short sequence header:%s. lenght:%ld\n",header,strlen(header));return -1;}
230
+
231
+ // 28UMACOMPRESSEDFORMAT_1_0 0 0 0
232
+ // header[strlen(header)-2]=0;
233
+
234
+ if (strncmp(header,"UMACOMPRESSEDFORMAT_",19)!=0) {fprintf(stderr,"Incorrect header in file, header:%s\n",header);return -1;}
235
+ // TODO fill the file_data structure with the header data
236
+ if (sscanf(header,"UMACOMPRESSEDFORMAT_%d_%d",&ver,&subver)!=2) return -1;
237
+ //if (sscanf(header,"UMACOMPRESSEDFORMAT_%d_%d",&(filed->version),&(filed->subversion))!=2) return -1;
238
+ filed->version=11;//ver;
239
+ filed->subversion=subver;
240
+ // fprintf(stderr,"file version:%d,%d\n",filed->version,filed->subversion);
241
+ return 0;
242
+ }
243
+
244
+ /* Reads the metadata from the index file
245
+ It initializes the version and binary_search variable
246
+ */
247
+ int read_index_file_metadata(struct file_data *filed)
248
+ {
249
+ char header[SEQ_METADATA];
250
+ int fastaS,qualS,extrasS=0;
251
+
252
+
253
+ int res=read_seq_header(filed->gzf_bin, header, &fastaS, &qualS, &extrasS);
254
+
255
+ if ( strlen(header)<19 ) {return -1;}
256
+
257
+ // 28UMACOMPRESSEDFORMAT 1 0 0 0 0
258
+ header[strlen(header)-2]=0;
259
+
260
+ if (strncmp(header,"UMACOMPRESSEDFORMAT",19)!=0) return -1;
261
+ // TODO fill the file_data structure with the header data
262
+ if (sscanf(header,"UMACOMPRESSEDFORMAT %d %d",&(filed->version),&(filed->subversion))!=2) {
263
+ fprintf(stderr,"SEQ READ:Header incorrect when reading versions:%s.\n",header);
264
+ return -1;
265
+ }
266
+
267
+ return 0;
268
+ }
269
+
270
+ /* reads the header of a sequence in the main file.
271
+ the pointer to the file points to the fasta data after calling read_seq_header
272
+ returns 0 if ok
273
+ -1 if there is an error
274
+ -2 if EOF
275
+ */
276
+
277
+ int read_seq_header(gzFile *gzf_bin, char *seq_name,int *fastaS, int *qualS, int *extrasS)
278
+ {
279
+ int header_size=4;
280
+ char hsize[40];
281
+ char tmp[1000];
282
+ char sname[SEQ_METADATA];
283
+
284
+ long pos=gzread(gzf_bin,hsize,header_size);
285
+
286
+ // EOF found
287
+ if ( pos==0 ) return -2;
288
+
289
+ // Error reading file
290
+ if ( pos==-1 ) {fprintf(stderr,"Incorrect sequence header. File may be corrupted\n");return -1;}
291
+
292
+ hsize[pos]=0;
293
+ sscanf(hsize,"%d",&header_size);
294
+ pos=gzread(gzf_bin,tmp,header_size);
295
+
296
+ if ( pos==0 ) return -2;
297
+
298
+ if ( pos==-1 ) {fprintf(stderr,"Incorrect sequence header. File may be corrupted\n");return -1;}
299
+
300
+ tmp[header_size]=0;
301
+ int reads=sscanf(tmp,"%s %d %d %d",sname,fastaS,qualS,extrasS);
302
+
303
+ if (reads!=4) {return -1;};
304
+
305
+ if (seq_name!=NULL) strncpy(seq_name,sname,SEQ_METADATA);
306
+
307
+ return 0;
308
+ }
309
+
310
/*
 * Placeholder for the pre-read consistency check of the bin and index files.
 *
 * Intended result codes (not implemented yet - the function always
 * reports 0):
 *   0 : both the bin and index files exist and are from the current version
 *   1 : both files exist but are from another version
 *   2 : both files are missing
 *   3 : the bin file is missing
 *   4 : the index file is missing
 *
 * The parameter list is left unspecified on purpose: the file calls this
 * function with an argument.
 */
int check_files()
{
    /* TODO: open the files, read and check the header */
    const int files_ok = 0;

    return files_ok;
}
324
+
325
+ // returns the version of the opened file
326
+ int version(struct file_data *filed)
327
+ {
328
+ if (filed->gzf_bin==NULL) return -1;
329
+ return filed->version;
330
+ }
331
+
332
+ // returns the version of the opened file
333
+ int subversion(struct file_data *filed)
334
+ {
335
+ if (filed->gzf_bin==NULL) return -1;
336
+ return filed->subversion;
337
+ }
338
+
339
+ /*
340
+ mode can be:
341
+ 1 - random, for each read it begins to read from the beggining of index
342
+ 2 - sequential, it keeps the position inside the index and main files.
343
+ */
344
+ int initialize_sequential_reads(struct file_data ** filed, char *filename)
345
+ {
346
+ char header[SEQ_METADATA];
347
+ int fastaS,qualS,extrasS=0;
348
+
349
+ if ( *filed == NULL ) {*filed=malloc(sizeof(struct file_data));}
350
+
351
+
352
+ (*filed)->gzf_bin=gzopen(filename,"r");
353
+
354
+ int res=check_error((*filed)->gzf_bin==NULL,"Unable to open file",-1);
355
+
356
+ strncpy((*filed)->name,filename,MAXFNAME);
357
+ (*filed)->error=0;
358
+
359
+
360
+
361
+ // reads the metadata
362
+ /*
363
+ int res=read_seq_header(filed->gzf_bin, header, &fastaS, &qualS, &extrasS);
364
+
365
+ if ( strlen(header)<19 ) {fprintf(stderr,"SEQ READ:Header incorrect:%s.\n",header);return -1;}
366
+
367
+ // 28UMACOMPRESSEDFORMAT_1 0 0 0
368
+ header[strlen(header)-2]=0;
369
+
370
+ if (strncmp(header,"UMACOMPRESSEDFORMAT",19)!=0) return -1;
371
+ // TODO fill the file_data structure with the header data
372
+ */
373
+ // printf("pos1 %ld\n",gztell((*filed)->gzf_bin));
374
+ res= read_bin_file_metadata(*filed);
375
+ // printf("pos2 %ld\n",gztell((*filed)->gzf_bin));
376
+
377
+ // inspect_file_data_struct(filed);
378
+
379
+ return res;
380
+ }
381
+
382
+
383
+ int read_data_sequential(struct file_data *filed,char **seq_name, char **fasta, char **qual, char **extras)
384
+ {
385
+ int res=0;
386
+ int error=0;
387
+ int fastaS,qualS,extrasS=0;
388
+
389
+ if ( *seq_name == NULL ) {*seq_name=(char *)malloc(SEQ_METADATA);strncpy(*seq_name,"",4);}
390
+
391
+ res=read_seq_header(filed->gzf_bin, *seq_name, &fastaS, &qualS, &extrasS);
392
+ // printf("FS:%d,QS:%d\n",fastaS,qualS);
393
+ if (res==-2) // EOF
394
+ return -9;
395
+
396
+ if ( *fasta == NULL ) {*fasta=(char *)malloc(fastaS+1);strncpy(*fasta,"",fastaS);}
397
+ if ( *qual == NULL) {*qual=(char *)malloc(qualS+1);strncpy(*qual,"",qualS);}
398
+ if (( *extras == NULL )&&(extrasS>0)) {*extras=(char *)malloc(extrasS+1);strncpy(*extras,"",extrasS);}
399
+
400
+ long pos=gzread(filed->gzf_bin,*fasta,fastaS);
401
+
402
+ // BWReverseXform(*fasta, *fasta, 1, fastaS);
403
+
404
+ (*fasta)[fastaS]=0;
405
+ pos=gzread(filed->gzf_bin,*qual,qualS);
406
+
407
+ // if only one qual read, repeat it
408
+ if((qualS==1) & (qualS!=fastaS))
409
+ {
410
+ char q=*qual[0];
411
+ free(*qual);
412
+ *qual=NULL;
413
+ qualS=fastaS;
414
+ if ( *qual == NULL) {*qual=(char *)malloc(qualS+1);strncpy(*qual,"",qualS);}
415
+ memset(*qual,q,qualS);
416
+ }
417
+
418
+ (*qual)[qualS]=0;
419
+ // printf("LLEGA\n:");
420
+ if (extrasS>0) {pos=gzread(filed->gzf_bin,*extras,extrasS);(*extras)[extrasS]=0;}
421
+ return 0;
422
+
423
+ }
424
+ int close_sequential_reads(struct file_data *file_d)
425
+ {
426
+ gzclose(file_d->gzf_bin);
427
+ if ( (file_d)!= NULL ) { free(file_d);}
428
+ }
429
+
430
/*
 * Disabled: always returns -1 without touching `filename`.
 *
 * Rationale (translated from the original note): this function cannot be
 * used because, when the file is read, the read position does not have to
 * match the write position at the time the fbin file was created, so while
 * generating the index we do not know which chunk to read from. zlib's
 * gzread reads a piece of the file of whatever size it decides, which does
 * not line up with the blocks that were written.
 *
 * The original kept an experimental implementation after the unconditional
 * return (walk all headers with read_seq_header, skip the payload with
 * gzseek and print index lines); it was unreachable and has been removed.
 */
int regenerate_index(char * filename){
    (void)filename;
    return -1;
}
590
+
591
+ long long find_seq_in_hash(char *filename,char *sname)
592
+ {
593
+
594
+ char hash_file_name[MAXFNAME];
595
+ // char indexname[MAXFNAME];
596
+ int error;
597
+ char sname1[MAXSEQNAME];// sequence name
598
+ char sname2[MAXSEQNAME];// sequence name
599
+ long long gz_chunk=0;
600
+ char tmp[SEQ_METADATA];
601
+ long long res=-1;
602
+
603
+ // to save min, max sequences and current chunk
604
+ char min_name[MAXSEQNAME];
605
+ char max_name[MAXSEQNAME];
606
+ long long current_chunk=0;
607
+
608
+
609
+ strcpy(min_name,"");
610
+ strcpy(max_name,"");
611
+
612
+ // calc index and hash name
613
+ // snprintf(indexname,MAXFNAME,"%s.index",filename);
614
+ snprintf(hash_file_name,MAXFNAME,"%s.index.hash",filename);
615
+
616
+
617
+ // open index and hash file
618
+ gzFile gzhash_file=gzopen(hash_file_name,"r");
619
+
620
+ if (gzhash_file==NULL) {
621
+ // fprintf(stderr,"error opening gzhash_file :%s\n",gzerror(gzhash_file,&error));
622
+ // no hash file found
623
+ return -2;
624
+ }
625
+
626
+ // repeat until EOF
627
+ while ( gzgets(gzhash_file,tmp,sizeof(tmp))!=Z_NULL ) {
628
+
629
+ // printf("%s\n",tmp);
630
+ // parse string
631
+ int reads=sscanf(tmp,"%s %s %lld",sname1,sname2,&gz_chunk);
632
+
633
+ if(reads==3) // valid index line
634
+ {
635
+ //
636
+ if((strcmp(sname,sname1)>=0) && (strcmp(sname,sname2)<=0))
637
+ {
638
+ #if DEBUG
639
+ printf("%s in [%s,%s]\n",sname,sname1,sname2);
640
+ #endif
641
+ res = gz_chunk;
642
+ break;
643
+ }else{
644
+ // printf("%s NOT IN [%s,%s]\n",sname,sname1,sname2);
645
+ }
646
+
647
+ }
648
+
649
+ }
650
+
651
+ // close files
652
+ gzclose(gzhash_file);
653
+
654
+ return res;
655
+ }
656
+
657
+
658
+ long long find_seq_in_index(char *filename,char *sname, long long index_chunk, long long *gz_chunk, long long *beginH){
659
+
660
+ long long chunk=-1;
661
+
662
+ char file_name[MAXFNAME];
663
+ // char indexname[MAXFNAME];
664
+ int error;
665
+ char sname1[MAXSEQNAME];// sequence name
666
+ char sname2[MAXSEQNAME];// sequence name
667
+ long long aux_beginH=-1,aux_gz_chunk=-1;
668
+ char tmp[SEQ_METADATA];
669
+ long long res=-1;
670
+
671
+ *gz_chunk = 0;
672
+ *beginH=0;
673
+
674
+ // to save min, max sequences and current chunk
675
+ char min_name[MAXSEQNAME];
676
+ char max_name[MAXSEQNAME];
677
+ long long current_chunk=0;
678
+
679
+
680
+ strcpy(min_name,"");
681
+ strcpy(max_name,"");
682
+
683
+ // calc index and hash name
684
+ // snprintf(indexname,MAXFNAME,"%s.index",filename);
685
+ snprintf(file_name,MAXFNAME,"%s.index",filename);
686
+
687
+
688
+
689
+ // open index and hash file
690
+ // gzFile filegz=gzopen(file_name,"r");
691
+ int file=open(file_name, O_RDONLY);
692
+
693
+ if (file<0)
694
+ {
695
+ return -2;
696
+ }
697
+
698
+ if(index_chunk>0)
699
+ {
700
+ #if DEBUG
701
+ printf("Seek to %lld\n",index_chunk);
702
+ #endif
703
+ // res=gzseek(filegz,index_chunk,SEEK_SET);
704
+ res=lseek(file,index_chunk,SEEK_SET);
705
+
706
+ // printf("Seeked\n");
707
+ }
708
+
709
+ gzFile filegz=gzdopen(file,"r");
710
+
711
+ if (filegz==NULL) {
712
+ // fprintf(stderr,"error opening gzhash_file :%s\n",gzerror(filegz,&error));
713
+ return -2;
714
+ }
715
+
716
+ // repeat until EOF
717
+ while ( gzgets(filegz,tmp,sizeof(tmp))!=Z_NULL ) {
718
+
719
+ // printf("%s\n",tmp);
720
+ // parse string
721
+ // int reads=sscanf(tmp,"%s %s %lld",sname1,sname2,&gz_chunk);
722
+ int reads=sscanf(tmp,"%s %lld %lld",sname1,&aux_gz_chunk,&aux_beginH);
723
+
724
+
725
+ if(reads==3) // valid index line
726
+ {
727
+ //
728
+ if(strcmp(sname,sname1)==0)
729
+ {
730
+ #if DEBUG
731
+ printf("%s IN %s\n",sname, tmp);
732
+ #endif
733
+
734
+ chunk = aux_gz_chunk;
735
+ *gz_chunk = aux_gz_chunk;
736
+ *beginH=aux_beginH;
737
+
738
+ // beginH=gz_beginH;
739
+ break;
740
+ }else{
741
+ #if DEBUG
742
+ printf("NOT IN %s",tmp);
743
+ #endif
744
+ // break;
745
+ }
746
+
747
+ }
748
+
749
+ }
750
+
751
+ // close files
752
+ gzclose(filegz);
753
+
754
+ return chunk;
755
+ }
756
+
757
+
758
+
759
+ /*
760
+ read_seq reads from filename the sequence named seq_name and returns its
761
+ fasta, qual and extras in those variables.
762
+ It returns 0 if there are no errors, otherwise it returns:
763
+ -2 : error opening index file (it doesn't exists)
764
+ -3 : error reading index file
765
+ -4 : error sequence not found in index file
766
+ -5 : error opening file (it doesn't exists)
767
+ -6 : error reading file
768
+ -7 : error sequence not found
769
+ -8 : error uncompressing sequence
770
+ -9 : EOF
771
+
772
+ */
773
+
774
+ int read_seq(char *filename, char *seq_name, char **fasta, char **qual, char **extras)
775
+ {
776
+ /* Hacer grep en filename.index de seq_name */
777
+ /* Una vez encontrado leer su info (indice y offsets) */
778
+ /* leer de filename en sus offests el fasta qual y extras */
779
+ /* Descomprimirlo y devolverlo */
780
+
781
+ char indexname[MAXFNAME];
782
+ char sname[MAXSEQNAME];// sequence name
783
+ // char *fasta_comp; // compressed fasta
784
+ // char *qual_comp; // compressed qual
785
+ // char *extras_comp; // compressed extras
786
+ long long beginH, gz_chunk=0;
787
+ int fastaS, qualS, extrasS=0;
788
+ char tmp[SEQ_METADATA];
789
+ int res=0;
790
+ int error=0;
791
+
792
+ // int bufsize=MAXSEQLENGTH;
793
+ //
794
+ // // allocate memory for return data if necessary
795
+ // if ( *fasta == NULL ) {*fasta=(char *)malloc(bufsize);strncpy(*fasta,"",bufsize);}
796
+ // if ( *qual == NULL) {*qual=(char *)malloc(bufsize);strncpy(*qual,"",bufsize);}
797
+ // if ( *extras == NULL ) {*extras=(char *)malloc(bufsize);strncpy(*extras,"",bufsize);}
798
+
799
+ // calc index name
800
+ // snprintf(indexname,MAXFNAME,"%s.index",filename);
801
+
802
+ long long chunk=find_seq_in_hash(filename,seq_name);
803
+ // printf("Chunk: %lld\n",chunk);
804
+
805
+ if (chunk<0){
806
+ chunk=0;
807
+ }
808
+
809
+ if ((res=find_seq_in_index(filename,seq_name,chunk,&gz_chunk,&beginH))<0){
810
+ return res;
811
+ };
812
+
813
+
814
+ // open index file
815
+ // gzFile gzfile_index=gzopen(indexname,"r");
816
+ // if (gzfile_index==NULL) {
817
+ // fprintf(stderr,"error opening gzfile_index :%s\n",gzerror(gzfile_index,&error));
818
+ // return -2;
819
+ // }
820
+ //
821
+ // // Reads the index to this info, and the offset to its data
822
+ // int reads=3;
823
+ // while ( reads == 3 ) {
824
+ //
825
+ // // read a chunk of data from index with the size of tmp
826
+ // gzgets(gzfile_index,tmp,sizeof(tmp));
827
+ // reads=sscanf(tmp,"%s %lld %lld",sname,&gz_chunk,&beginH);
828
+ //
829
+ //
830
+ //
831
+ //
832
+ // if (( reads != 3 ) && ( reads!=EOF )) {
833
+ // fprintf(stderr,"Error scanning index: %d\n",reads);
834
+ // gzclose(gzfile_index);
835
+ // return -3;
836
+ // }
837
+ //
838
+ // // sequence was finally found, exit loop
839
+ // if ( strncmp(sname, seq_name,MAXSEQNAME)==0) reads=999; // to get out, seq found
840
+ // }
841
+ //
842
+ // // close index file
843
+ // gzclose(gzfile_index);
844
+ //
845
+ // maybe sequence was not found
846
+ // fprintf(stderr,"Sequence not found\n");
847
+ // if (reads==EOF) {return -4;}
848
+
849
+ // We get here if sequence was found
850
+
851
+ #if DEBUG
852
+ printf("Index found %lld. Seeking\n",gz_chunk);
853
+ #endif
854
+ // open bin file to extract data
855
+ int dataf=open(filename, O_RDONLY);
856
+
857
+ // seek to chunk pos
858
+ // TODO- ¿como se salta el chunk?
859
+ // res=lseek(dataf,gz_chunk,SEEK_SET);
860
+ res=lseek(dataf,gz_chunk,SEEK_SET);
861
+
862
+ // TODO check res
863
+ gzFile gzfile_bin=gzdopen(dataf,"r");
864
+
865
+ // seek to seq inside chunk
866
+ res=gzseek(gzfile_bin,beginH,SEEK_SET);
867
+ // TODO check res
868
+
869
+ // printf("Seeked\n");
870
+
871
+ // read sequence header
872
+ res=read_seq_header(gzfile_bin,NULL, &fastaS, &qualS, &extrasS);
873
+
874
+ // int bufsize=MAXSEQLENGTH;
875
+
876
+ // memset(*qual,q,qualS);
877
+ // allocate memory for return data if necessary
878
+ if ( *fasta == NULL ) {*fasta=(char *)malloc(fastaS+1);(*fasta)[0]=0;}
879
+ if ( *qual == NULL) {*qual=(char *)malloc(qualS+1);(*qual)[0]=0;}
880
+ if ( *extras == NULL ) {*extras=(char *)malloc(extrasS+1);(*extras)[0]=0;}
881
+
882
+ long pos=gzread(gzfile_bin,*fasta,fastaS);
883
+ // printf("LEIDO:%ld\n",pos);
884
+ // BWReverseXform(*fasta, *fasta, 1, fastaS);
885
+
886
+ (*fasta)[fastaS]=0;
887
+ pos=gzread(gzfile_bin,*qual,qualS);
888
+
889
+ // if only one qual read, repeat it
890
+ if((qualS==1) & (qualS!=fastaS))
891
+ {
892
+ char q=*qual[0];
893
+ free(*qual);
894
+ *qual=NULL;
895
+ qualS=fastaS;
896
+ if ( *qual == NULL) {*qual=(char *)malloc(qualS+1);strncpy(*qual,"",qualS);}
897
+ memset(*qual,q,qualS);
898
+ }
899
+
900
+ (*qual)[qualS]=0;
901
+
902
+ if (extrasS>0) {pos=gzread(gzfile_bin,*extras,extrasS); (*extras)[extrasS]=0;}
903
+ gzclose(gzfile_bin);
904
+
905
+ return 0;
906
+ }
907
+
908
+ void inspect_file_data_struct(struct file_data *file){
909
+
910
+ printf("file name:%s\n",file->name);
911
+ printf("file index_name:%s\n",file->index_name);
912
+ printf("file version:%d\n",file->version);
913
+ printf("file subversion:%d\n",file->subversion);
914
+ printf("error:%d\n",file->error);
915
+ /*
916
+ if (file->bin_search==TRUE) printf("file binary search is possible\n");
917
+ else printf("file binary search is not possible\n");
918
+ */
919
+
920
+ }
921
+
922
+ // initialize the state for doing writes
923
+ // two modes:
924
+ // 1 .- new files
925
+ // 2 .- add data to files, if they don't exist they are created
926
+ int initialize_writes(struct file_data ** file, char *output_name, int mode, int discretize_qual, int flatten_qual, int create_index)
927
+ {
928
+
929
+ // check if the files exists, in case it exists check if it has the
930
+ // correct metadata and if it is of the correct version
931
+ // in other case exits with an error
932
+ // struct file_data *file = malloc(sizeof(struct write_file));
933
+ if ( *file == NULL ) {*file=malloc(sizeof(struct file_data));}
934
+
935
+ (*file)->pos_chunk_gz=0;
936
+ (*file)->discretize_qual=discretize_qual;
937
+ (*file)->flatten_qual=flatten_qual;
938
+ (*file)->create_index=create_index;
939
+
940
+ int state=check_files(output_name);
941
+ if (state==1) {
942
+ fprintf(stderr,"File is from a different version\n");
943
+ return -1;
944
+ }
945
+ if ((state!=2)&&(state!=0)) {
946
+ fprintf(stderr,"Error %d when checking files\n",state);
947
+ return -1;
948
+ }
949
+
950
+ // copy the name of the file
951
+ strncpy((*file)->name,output_name,MAXFNAME);
952
+
953
+ // open the compressed files
954
+ int error=0;
955
+ int flags=O_WRONLY|O_CREAT|O_TRUNC;
956
+ if (mode==2) flags=O_RDWR;
957
+ // printf("mode:%d\n",mode);
958
+
959
+ //set index name
960
+ snprintf((*file)->index_name,MAXFNAME,"%s.index",(*file)->name);
961
+
962
+ int file_index=-1;
963
+
964
+ if ((*file)->create_index){
965
+ //open index file
966
+ file_index=open((*file)->index_name,flags,0644);
967
+
968
+ if (file_index==-1) return -2;
969
+ }
970
+
971
+ // open bin file
972
+ int file_bin=open((*file)->name,flags,0644);
973
+ // printf("fd:%d\n",file_bin);
974
+ if (file_bin==-1) {fprintf(stderr,"error opening file_bin for writting:%s\n",strerror(errno));return -2;}
975
+ if (mode==2) {
976
+ long long pos=lseek(file_index,0,SEEK_END);
977
+ if (pos==-1) {fprintf(stderr,"error going to end of index file %s\n",strerror(errno)); return -2;}
978
+ pos=lseek(file_bin,0,SEEK_END);
979
+ if (pos==-1) {fprintf(stderr,"error going to end of bin file %s\n",strerror(errno)); return -2;}
980
+ (*file)->pos_chunk_gz=pos;
981
+ }
982
+
983
+ if ((*file)->create_index){
984
+ // open zlib index file
985
+ (*file)->gzf_index=gzdopen(file_index,"wb");
986
+ if ((*file)->gzf_index==NULL) {
987
+ fprintf(stderr,"error opening gzfile_index for writting:%s\n",gzerror((*file)->gzf_index,&error));
988
+ return -2;
989
+ }
990
+ }
991
+
992
+ // open zlib bin file
993
+ (*file)->gzf_bin=gzdopen(file_bin,"wb");
994
+ if ((*file)->gzf_bin==NULL) {
995
+ fprintf(stderr,"error opening gzfile for writting:%s\n",gzerror((*file)->gzf_bin,&error));
996
+ return -2;
997
+ }
998
+
999
+ // initializes the files, writting the metadata
1000
+ if (mode==1) {
1001
+ char header[SEQ_METADATA];
1002
+ (*file)->version=VERSION;
1003
+ (*file)->subversion=SUBVERSION;
1004
+ (*file)->error=0;
1005
+ // TODO put correct size
1006
+ snprintf(header,SEQ_METADATA-1,"9999UMACOMPRESSEDFORMAT_%d_%d %d %d %d", (*file)->version,(*file)->subversion, 0, 0, 0);
1007
+ snprintf(header,SEQ_METADATA-1,"%4ldUMACOMPRESSEDFORMAT_%d_%d %d %d %d", strlen(header)-4,(*file)->version,(*file)->subversion, 0, 0, 0);
1008
+ // snprintf(header,SEQ_METADATA-1,"%4ld%s %ld %ld %ld", strlen(metainfo)-4, (*file)->version,(*file)->subversion, 0, 0, 0);
1009
+
1010
+ // sprintf(header," 29UMACOMPRESSEDFORMAT_%d_%d 0 0 0\n",(*file)->version,(*file)->subversion);
1011
+ int res=gzwrite((*file)->gzf_bin,header,strlen(header));
1012
+
1013
+ if((*file)->create_index)
1014
+ {
1015
+ sprintf(header,"UMACOMPRESSEDFORMAT 1 0 0 999999999999 999999999999\n");
1016
+ res=gzwrite((*file)->gzf_index,header,strlen(header));
1017
+ }
1018
+ }
1019
+ (*file)->counter=0;
1020
+
1021
+ // prev_time=time(NULL);
1022
+
1023
+ // printf("Init writes done\n");
1024
+ return 0;
1025
+ }
1026
+
1027
+
1028
+
1029
+ int close_writes(struct file_data *file)
1030
+ {
1031
+ gzclose(file->gzf_bin);
1032
+ if((file)->create_index)
1033
+ {
1034
+ gzclose(file->gzf_index);
1035
+ }
1036
+
1037
+ if ( file != NULL ) { free(file);}
1038
+ }
1039
+
1040
// Open a file for reading and report failures on stderr.
//
//   fname  path of the file to open
//   file   receives the opened stream, or NULL when opening failed
//
// Returns 0 on success, -1 on failure (including NULL arguments).
int open_file(char *fname, FILE **file){

    if (file == NULL) {
        return -1;
    }

    *file = NULL;

    if (fname == NULL) {
        fprintf(stderr,"Error opening fasta file: no file name given\n");
        return -1;
    }

    *file = fopen(fname,"r");

    if (*file == NULL) {
        int saved_errno = errno;    // keep the fopen error code stable while reporting
        fprintf(stderr,"Error opening fasta file %s, result %d %s\n",fname,saved_errno,strerror(saved_errno));
        return -1;
    }

    return 0;
}
1051
+
1052
// Close a stream previously opened for reading.
//
// Returns 0 when the stream was closed, -1 when file is NULL or fclose()
// reported an error.
int close_file(FILE *file){

    if (file == NULL) {
        return -1;    // fclose(NULL) is undefined behaviour
    }

    return (fclose(file) == 0) ? 0 : -1;
}
1058
+
1059
+
1060
// Removes the last \n from str, if any.
//
// Empty strings and NULL are left untouched (the length is checked before
// indexing the last character, so there is no read before the buffer).
//
// Returns 1 when a trailing newline was removed, 0 otherwise.
int chomp(char *str){

    if (str == NULL) {
        return 0;
    }

    size_t len = strlen(str);

    if (len > 0 && str[len-1] == '\n') {
        str[len-1] = '\0';
        return 1;
    }

    return 0;
}
1070
+
1071
// Split a header line into name and comments, removing the leading marker
// character (@, >, etc).
//
//   fname     header line; modified in place (its first character is dropped)
//   name      receives the text up to the first space (leading spaces skipped)
//   comments  receives what follows that space, up to the end of the line;
//             empty when there is nothing after the name
//
// name and comments must be large enough to hold the corresponding parts of
// fname; no sizes are passed, so this function cannot check that.
//
// The line is scanned with strspn/strcspn instead of strtok, so no hidden
// static state is touched and a tokenization in progress elsewhere is not
// disturbed.
//
// Always returns 1.
int split_name(char *fname, char *name, char *comments){

    // remove first char (@, >, etc); moving strlen() bytes from fname+1 also
    // moves the terminator
    size_t full_len = strlen(fname);
    if (full_len > 0) {
        memmove(fname, fname+1, full_len);
    }

    // name: first run of non-space characters
    const char *name_part = fname + strspn(fname, " ");
    size_t name_len = strcspn(name_part, " ");

    memcpy(name, name_part, name_len);
    name[name_len] = '\0';

    // comments: everything after the single separating space, up to end of line
    comments[0] = '\0';
    if (name_part[name_len] == ' ') {
        const char *comment_part = name_part + name_len + 1;
        comment_part += strspn(comment_part, "\n");
        size_t comment_len = strcspn(comment_part, "\n");

        memcpy(comments, comment_part, comment_len);
        comments[comment_len] = '\0';
    }

    return 1;
}
1103
+
1104
/*
 * Make sure *var points to a usable buffer.
 *
 * When *var is NULL a zero-filled buffer of `size` bytes is allocated and
 * stored in *var; when *var is already set it is left untouched.
 *
 * Returns 0 when *var is usable on return, -1 when var is NULL, size is not
 * positive, or the allocation failed (in which case *var stays NULL).
 */
int check_mem(char **var,int size)
{
    if (var == NULL) {
        return -1;
    }

    if (*var != NULL) {
        return 0;
    }

    if (size <= 0) {
        return -1;
    }

    // calloc gives the same all-zero buffer the malloc+strncpy pair produced
    *var = calloc((size_t)size, 1);
    if (*var == NULL) {
        fprintf(stderr,"Memory allocation error!\n");
        return -1;
    }

    return 0;
}
1108
+
1109
+ // read next seq from fastq file
1110
+ int get_next_seq_fastq(FILE *file, char **name, char **fasta, char **qual, char **comments){
1111
+
1112
+ check_mem(name,MAXSEQNAME);
1113
+ check_mem(fasta,MAXSEQLENGTH);
1114
+ check_mem(qual,MAXSEQLENGTH);
1115
+ check_mem(comments,MAXSEQLENGTH);
1116
+
1117
+
1118
+
1119
+
1120
+ char fname[MAXSEQNAME];// sequence name
1121
+ char qname[MAXSEQNAME];// sequence name
1122
+
1123
+ strcpy(*name,"");
1124
+ strcpy(*fasta,"");
1125
+ strcpy(*qual,"");
1126
+ strcpy(*comments,"");
1127
+
1128
+ errno = 0;
1129
+
1130
+ // read sequence name line ---------------------------------
1131
+ if (fgets( fname, MAXSEQNAME, file )==NULL) {return 0; };
1132
+
1133
+ if (errno) {fprintf(stderr,"Error reading fastq, result %d %s\n",errno,strerror(errno));return INVALID_FASTQ_FORMAT;}
1134
+ chomp(fname);
1135
+
1136
+ // check for @ at beginning
1137
+ if (fname[0]!='@'){
1138
+ fprintf(stderr,"ERROR: Invalid FASTQ format %s. Missing @ in name line.\n", fname);
1139
+ return INVALID_FASTQ_FORMAT;
1140
+ }
1141
+
1142
+ // split name by space in name and comments
1143
+ split_name(fname,*name,*comments);
1144
+
1145
+ // read fasta line ---------------------------------
1146
+ if (fgets( *fasta, MAXSEQLENGTH, file )==NULL) { return 0; };
1147
+
1148
+ if (errno) {fprintf(stderr,"Error reading fastq, result %d %s\n",errno,strerror(errno));return INVALID_FASTQ_FORMAT;}
1149
+ chomp(*fasta);
1150
+
1151
+
1152
+ // read qual name line ---------------------------------
1153
+ if (fgets( qname, MAXSEQLENGTH, file )==NULL) { return 0; };
1154
+
1155
+ if (errno) {fprintf(stderr,"Error reading fastq, result %d %s\n",errno,strerror(errno));return INVALID_FASTQ_FORMAT;}
1156
+ chomp(qname);
1157
+
1158
+ // check for + sign at beginning of qual name line
1159
+ if (qname[0]!='+'){
1160
+ fprintf(stderr,"ERROR: Invalid FASTQ format. Missing + in qual line. %s.\n", *name);
1161
+ return INVALID_FASTQ_FORMAT;
1162
+ }
1163
+
1164
+ // read qual line ---------------------------------
1165
+ if (fgets( *qual, MAXSEQLENGTH, file )==NULL) { return 0; };
1166
+ if (errno) {fprintf(stderr,"Error reading fastq, result %d %s\n",errno,strerror(errno));return INVALID_FASTQ_FORMAT;}
1167
+
1168
+ chomp(*qual);
1169
+
1170
+ return 1;
1171
+ }
1172
+
1173
+ // read next seq from fasta file
1174
+ int get_next_seq_fasta(FILE *file, char *name, char *fasta, char *comments){
1175
+
1176
+ char fname[MAXSEQNAME];// sequence name
1177
+ char *line;
1178
+ if ((line = malloc(MAXSEQLENGTH)) == NULL) {
1179
+ puts("Memory allocation error!");
1180
+ return EXIT_FAILURE;
1181
+ }
1182
+
1183
+ // init vars
1184
+ strcpy(name,"");
1185
+ strcpy(fasta,"");
1186
+ strcpy(comments,"");
1187
+
1188
+ // read sequence name line ---------------------------------
1189
+ if (fgets( fname, MAXSEQNAME, file )==NULL) { return 0; };
1190
+ if (errno) {fprintf(stderr,"Error reading fasta, result %d %s\n",errno,strerror(errno));return INVALID_FASTA_FORMAT;}
1191
+ chomp(fname);
1192
+
1193
+ // check for @ at beginning
1194
+ if (fname[0]!='>'){
1195
+ fprintf(stderr,"ERROR: Invalid FASTA format %s. Missing @ in name line.\n", fname);
1196
+ return INVALID_FASTA_FORMAT;
1197
+ }
1198
+
1199
+ // split name by space in name and comments
1200
+ split_name(fname,name,comments);
1201
+
1202
+ // get current pos in file
1203
+ //fpos_t pos;
1204
+
1205
+ // read fasta line ---------------------------------
1206
+ char c=0;
1207
+ int len=0;
1208
+ int num_lines=0;
1209
+ while (1) {
1210
+ // fgetpos( file, &pos );
1211
+
1212
+ // inspect first char
1213
+ c=fgetc(file);
1214
+ ungetc(c,file);
1215
+
1216
+ if (c!='>'){
1217
+ // get following line
1218
+ if (fgets( line, MAXSEQLENGTH, file )==NULL) { break; };
1219
+ if (errno) {fprintf(stderr,"Error reading fasta, result %d %s\n",errno,strerror(errno));return INVALID_FASTA_FORMAT;}
1220
+ chomp(line);
1221
+
1222
+ // append to fasta
1223
+ // strcat(fasta,line);
1224
+
1225
+ len = len + sprintf(fasta+len,"%s",line);
1226
+
1227
+ if (len>=MAXSEQLENGTH)
1228
+ {
1229
+ fprintf(stderr,"Error, maximun sequence size error (%d). You can recompile lib with a bigger MAXSEQLENGTH\n",MAXSEQLENGTH);return MAX_SEQ_SIZE_ERROR;
1230
+ }
1231
+ // fasta[strlen(fasta)]=0;
1232
+ // num_lines++;
1233
+ // printf("%d\n",num_lines);
1234
+
1235
+ // if((num_lines%1000)==0)
1236
+ // {
1237
+ // printf("%ld\n,",strlen(fasta));
1238
+ // }
1239
+ }else{ // name line
1240
+ // rewind file and exit
1241
+ // fsetpos(file, &pos);
1242
+ break;
1243
+ }
1244
+ }
1245
+
1246
+ // printf("%ld\n,",strlen(fasta));
1247
+
1248
+ free(line);
1249
+
1250
+ return 1;
1251
+ }
1252
+
1253
+ // process a fastq file adding it to fbin file
1254
+ int process_fastq(char *fname, char *efname, char *outname, int discretize_qual, int flatten_qual, int create_index)
1255
+ {
1256
+
1257
+ // allocate strings
1258
+ char *name;
1259
+ if ((name = malloc(MAXSEQNAME)) == NULL) {
1260
+ puts("Memory allocation error!");
1261
+ return EXIT_FAILURE;
1262
+ }
1263
+
1264
+ char *fasta;
1265
+ if ((fasta = malloc(MAXSEQLENGTH)) == NULL) {
1266
+ puts("Memory allocation error!");
1267
+ return EXIT_FAILURE;
1268
+ }
1269
+
1270
+ char *qual;
1271
+ if ((qual = malloc(MAXSEQLENGTH)) == NULL) {
1272
+ puts("Memory allocation error!");
1273
+ return EXIT_FAILURE;
1274
+ }
1275
+
1276
+ char *comments;
1277
+ if ((comments = malloc(MAXSEQLENGTH)) == NULL) {
1278
+ puts("Memory allocation error!");
1279
+ return EXIT_FAILURE;
1280
+ }
1281
+
1282
+
1283
+ char *extras_name;
1284
+ if ((extras_name = malloc(MAXSEQLENGTH)) == NULL) {
1285
+ puts("Memory allocation error!");
1286
+ return EXIT_FAILURE;
1287
+ }
1288
+
1289
+
1290
+ char *extras;
1291
+ if ((extras = malloc(MAXSEQLENGTH)) == NULL) {
1292
+ puts("Memory allocation error!");
1293
+ return EXIT_FAILURE;
1294
+ }
1295
+
1296
+ char *final_extras;
1297
+ if ((final_extras = malloc(MAXSEQLENGTH)) == NULL) {
1298
+ puts("Memory allocation error!");
1299
+ return EXIT_FAILURE;
1300
+ }
1301
+
1302
+
1303
+ // char name[MAXSEQLENGTH];
1304
+ // char fasta[MAXSEQLENGTH];
1305
+ // char qual[MAXSEQLENGTH];
1306
+ // char extras[MAXSEQLENGTH];
1307
+ static time_t curr_time=0;
1308
+ static time_t prev_time=0;
1309
+
1310
+ prev_time=time(NULL);
1311
+
1312
+
1313
+ FILE *fastq_file=NULL;
1314
+ FILE *extras_file=NULL;
1315
+
1316
+ int valid=0;
1317
+ int res=0;
1318
+ int r=0;
1319
+
1320
+ // Open fasta and qual files
1321
+ if (strcmp(fname,"-")==0){
1322
+ fastq_file=stdin;
1323
+ }else{
1324
+ open_file(fname,&fastq_file);
1325
+ }
1326
+
1327
+ if(efname!=NULL)
1328
+ {
1329
+
1330
+ open_file(efname,&extras_file);
1331
+ }
1332
+
1333
+ // open output file
1334
+ struct file_data *file=NULL;
1335
+ int error2=initialize_writes(&file, outname,1, discretize_qual, flatten_qual,create_index);
1336
+
1337
+ // printf("Init writes\n");
1338
+ // read first extra entry
1339
+ if(extras_file!=NULL)
1340
+ {
1341
+ get_next_seq_fasta(extras_file,extras_name,extras,comments);
1342
+ }
1343
+
1344
+
1345
+ // for each sequence on fastq file
1346
+ while (valid=get_next_seq_fastq(fastq_file,&name,&fasta,&qual,&comments)){
1347
+ if(valid==1)
1348
+ {
1349
+ r++;
1350
+
1351
+ // printf("======================\nNAME:%s\nSEQ :%s\nQUAL:%s\n", name,fasta,qual);
1352
+ // if(strlen(comments)>0)
1353
+ // {
1354
+ // printf("COM :%s\n", comments);
1355
+ // }
1356
+
1357
+ strcpy(final_extras,comments);
1358
+
1359
+
1360
+ // check if there are extras available
1361
+ if (strcmp(name,extras_name)==0){
1362
+ strcat(final_extras,extras);
1363
+
1364
+ // read next extras
1365
+ if(extras_file!=NULL)
1366
+ {
1367
+ get_next_seq_fasta(extras_file,extras_name,extras,comments);
1368
+ }
1369
+ }
1370
+
1371
+ int error_wr=write_seq(file,name,fasta,qual,final_extras);
1372
+ if (error_wr!=0) {res=error_wr; break;};
1373
+ // if (error_wr==0) cnt++;
1374
+
1375
+ }else{
1376
+ fprintf(stderr,"Invalid sequence found. Aborting import.");
1377
+ res=-1;
1378
+ break;
1379
+ }
1380
+
1381
+ if ((r%10000)==0) {
1382
+ printf(".");
1383
+ fflush(stdout);
1384
+ // curr_time=time(NULL);
1385
+ // printf("10k seqs in:%8.0f secs\n",difftime(curr_time,prev_time));
1386
+ // prev_time=curr_time;
1387
+ }
1388
+
1389
+ }
1390
+
1391
+ curr_time=time(NULL);
1392
+ printf("\nEnd fastq processing. %d seqs in %.0f s. Rate: %8.2f seqs/s\n",r,difftime(curr_time,prev_time),r/difftime(curr_time,prev_time));
1393
+
1394
+ // free mem
1395
+ free(name);
1396
+ free(fasta);
1397
+ free(qual);
1398
+ free(comments);
1399
+ free(extras_name);
1400
+ free(extras);
1401
+ free(final_extras);
1402
+
1403
+ // close files
1404
+ fclose(fastq_file);
1405
+ if(extras_file!=NULL)
1406
+ {
1407
+ fclose(extras_file);
1408
+ }
1409
+ close_writes(file);
1410
+
1411
+ return res;
1412
+ }
1413
+
1414
+
1415
+ // process a fastq file adding it to fbin file
1416
+ int process_fasta(char *fname, char *efname, char *outname, int discretize_qual, int flatten_qual, int create_index)
1417
+ {
1418
+
1419
+ // allocate strings
1420
+ char *name;
1421
+ if ((name = malloc(MAXSEQNAME)) == NULL) {
1422
+ puts("Memory allocation error!");
1423
+ return EXIT_FAILURE;
1424
+ }
1425
+
1426
+ char *fasta;
1427
+ if ((fasta = malloc(MAXSEQLENGTH)) == NULL) {
1428
+ puts("Memory allocation error!");
1429
+ return EXIT_FAILURE;
1430
+ }
1431
+
1432
+ char *qual;
1433
+ if ((qual = malloc(MAXSEQLENGTH)) == NULL) {
1434
+ puts("Memory allocation error!");
1435
+ return EXIT_FAILURE;
1436
+ }
1437
+
1438
+ char *comments;
1439
+ if ((comments = malloc(MAXSEQLENGTH)) == NULL) {
1440
+ puts("Memory allocation error!");
1441
+ return EXIT_FAILURE;
1442
+ }
1443
+
1444
+
1445
+ char *extras_name;
1446
+ if ((extras_name = malloc(MAXSEQLENGTH)) == NULL) {
1447
+ puts("Memory allocation error!");
1448
+ return EXIT_FAILURE;
1449
+ }
1450
+
1451
+
1452
+ char *extras;
1453
+ if ((extras = malloc(MAXSEQLENGTH)) == NULL) {
1454
+ puts("Memory allocation error!");
1455
+ return EXIT_FAILURE;
1456
+ }
1457
+
1458
+ char *final_extras;
1459
+ if ((final_extras = malloc(MAXSEQLENGTH)) == NULL) {
1460
+ puts("Memory allocation error!");
1461
+ return EXIT_FAILURE;
1462
+ }
1463
+
1464
+
1465
+ static time_t curr_time=0;
1466
+ static time_t prev_time=0;
1467
+
1468
+ prev_time=time(NULL);
1469
+
1470
+
1471
+ FILE *fastq_file=NULL;
1472
+ FILE *extras_file=NULL;
1473
+
1474
+ int valid=0;
1475
+ int res=0;
1476
+ int r=0;
1477
+
1478
+ // Open fasta and qual files
1479
+ if (strcmp(fname,"-")==0){
1480
+ fastq_file=stdin;
1481
+ }else{
1482
+ open_file(fname,&fastq_file);
1483
+ }
1484
+
1485
+ if(efname!=NULL)
1486
+ {
1487
+
1488
+ open_file(efname,&extras_file);
1489
+ }
1490
+
1491
+ // open output file
1492
+ struct file_data *file=NULL;
1493
+ int error2=initialize_writes(&file, outname,1, discretize_qual, flatten_qual,create_index);
1494
+
1495
+ // printf("Init writes\n");
1496
+ // read first extra entry
1497
+ if(extras_file!=NULL)
1498
+ {
1499
+ get_next_seq_fasta(extras_file,extras_name,extras,comments);
1500
+ }
1501
+
1502
+ strcpy(qual,"");
1503
+ qual[0]=0;
1504
+
1505
+ // for each sequence on fastq file
1506
+ while (valid=get_next_seq_fasta(fastq_file,name,fasta,comments)){
1507
+ if(valid==1)
1508
+ {
1509
+ r++;
1510
+
1511
+ // printf("======================\nNAME:%s\nSEQ :%s\nQUAL:%s\n", name,fasta,qual);
1512
+ // if(strlen(comments)>0)
1513
+ // {
1514
+ // printf("COM :%s\n", comments);
1515
+ // }
1516
+
1517
+ strcpy(final_extras,comments);
1518
+
1519
+
1520
+ // check if there are extras available
1521
+ if (strcmp(name,extras_name)==0){
1522
+ strcat(final_extras,extras);
1523
+
1524
+ // read next extras
1525
+ if(extras_file!=NULL)
1526
+ {
1527
+ get_next_seq_fasta(extras_file,extras_name,extras,comments);
1528
+ }
1529
+ }
1530
+
1531
+ int error_wr=write_seq(file,name,fasta,qual,final_extras);
1532
+ if (error_wr!=0) {res=error_wr; break;};
1533
+ // if (error_wr==0) cnt++;
1534
+
1535
+ }else{
1536
+ fprintf(stderr,"Invalid sequence found. Aborting import.");
1537
+ res=-1;
1538
+ break;
1539
+ }
1540
+
1541
+ if ((r%10000)==0) {
1542
+ printf(".");
1543
+ fflush(stdout);
1544
+ // curr_time=time(NULL);
1545
+ // printf("10k seqs in:%8.0f secs\n",difftime(curr_time,prev_time));
1546
+ // prev_time=curr_time;
1547
+ }
1548
+
1549
+ }
1550
+
1551
+ curr_time=time(NULL);
1552
+ printf("\nEnd fastq processing. %d seqs in %.0f s. Rate: %8.2f seqs/s\n",r,difftime(curr_time,prev_time),r/difftime(curr_time,prev_time));
1553
+
1554
+ // free mem
1555
+ free(name);
1556
+ free(fasta);
1557
+ free(qual);
1558
+ free(comments);
1559
+ free(extras_name);
1560
+ free(extras);
1561
+ free(final_extras);
1562
+
1563
+ // close files
1564
+ fclose(fastq_file);
1565
+ if(extras_file!=NULL)
1566
+ {
1567
+ fclose(extras_file);
1568
+ }
1569
+ close_writes(file);
1570
+
1571
+ return res;
1572
+ }
1573
+
1574
+ // int process_biofile(char *fname, char *qfname, char *efname, char *outname)
1575
+ // {
1576
+ //
1577
+ // char sname[MAXSEQNAME];// sequence name
1578
+ // char qname[MAXSEQNAME];// sequence name
1579
+ // char ename[MAXSEQNAME];// sequence name
1580
+ // char next_sname[MAXSEQNAME];// sequence name
1581
+ // char next_qname[MAXSEQNAME];// sequence name
1582
+ // char next_ename[MAXSEQNAME];// sequence name
1583
+ //
1584
+ // char fasta[150000];
1585
+ // char qual[150000];
1586
+ // char extras[150000];
1587
+ // char extras_used[150000];
1588
+ // char next_fcomment[150000];
1589
+ // char next_qcomment[150000];
1590
+ // char next_ecomment[150000];
1591
+ // char tmp[150000];
1592
+ // int extras_bool=TRUE;
1593
+ //
1594
+ // int cnt=1;
1595
+ //
1596
+ // sprintf(extras_used,"INITIALIZED");
1597
+ //
1598
+ // // Open fasta and qual files
1599
+ // FILE *file_fasta=fopen(fname,"r");
1600
+ //
1601
+ // if (file_fasta==NULL) { fprintf(stderr,"error opening fasta file %s, result %d %s\n",fname,errno,strerror(errno));return -2;};
1602
+ // // setvbuf(file_fasta,NULL,_IONBF,0);
1603
+ // FILE *file_qual=fopen(qfname,"r");
1604
+ // if (file_qual==NULL) { fprintf(stderr,"error opening qual file %s, result %d %s\n",qfname,errno,strerror(errno));return -2;};
1605
+ // FILE *file_extras=fopen(efname,"r");
1606
+ // if (file_extras==NULL) {fprintf(stderr,"error opening extras file %s, result %d %s\n",efname,errno,strerror(errno)); extras_bool=FALSE;sprintf(extras,"");};
1607
+ //
1608
+ // // setvbuf(file_qual,NULL,_IONBF,0);
1609
+ // int error=0;
1610
+ // int end=0; //0 is false
1611
+ // char *res;
1612
+ //
1613
+ // // reads the name of the sequence from both
1614
+ //
1615
+ // // fscanf(file_qual,">%9000s",qname);
1616
+ // // fscanf(file_fasta,">%9000s",sname);
1617
+ //
1618
+ //
1619
+ // res=fgets(tmp,150000,file_fasta);
1620
+ // if (res!=NULL) {
1621
+ // sscanf(tmp,">%9000s",sname);
1622
+ // strncpy(next_fcomment,tmp+strlen(sname)+2,150000);
1623
+ // }
1624
+ //
1625
+ // res=fgets(tmp,150000,file_qual);
1626
+ // if (res!=NULL) {
1627
+ // sscanf(tmp,">%9000s",qname);
1628
+ // strncpy(next_qcomment,tmp+strlen(qname)+2,150000);
1629
+ // }
1630
+ //
1631
+ // if ( extras_bool ) {
1632
+ // res=fgets(tmp,150000,file_extras);
1633
+ // if (res!=NULL) {
1634
+ // sscanf(tmp,">%9000s",ename);
1635
+ // strncpy(next_ecomment,tmp+strlen(ename)+2,150000);
1636
+ // } else sprintf(ename,"");
1637
+ // }
1638
+ // printf("extras seq:%s\n",ename);
1639
+ //
1640
+ // printf("file:%s q:%s seqname:%s qseqname%s efname:%s extras:%s\n",fname, qfname,sname,qname,efname,extras);
1641
+ // printf("next_fcomment:%s next_qcomment:%s\n",next_fcomment,next_qcomment);
1642
+ //
1643
+ // struct file_data *file=NULL;
1644
+ // int error2=initialize_writes(&file, outname,1);
1645
+ //
1646
+ // // sprintf(next_fcomment,"");
1647
+ // // sprintf(next_qcomment,"");
1648
+ //
1649
+ // while (!end) {
1650
+ // if ( strcmp(sname,qname)!=0 ) {error = -9; goto end;}
1651
+ // /*
1652
+ // if (extras_bool)
1653
+ // if ( strcmp(sname,ename)!=0 ) {error = -9; goto end;}
1654
+ // */
1655
+ // // load the qual and fasta
1656
+ //
1657
+ // sprintf(fasta,"");
1658
+ // sprintf(fasta,"%s",next_fcomment);
1659
+ // sprintf(next_fcomment,"");
1660
+ // sprintf(tmp,"");
1661
+ // res=fasta;
1662
+ // while (( res!=NULL ) && (tmp[0]!='>' )) {
1663
+ // res=fgets(tmp,150000,file_fasta);
1664
+ // if ((tmp[0]!='>')&&(res!=NULL)) sprintf (fasta,"%s%s",fasta,tmp);
1665
+ // else if (res!=NULL) {sscanf(tmp,">%9000s",next_sname); strncpy(next_fcomment,tmp+strlen(next_sname)+2,sizeof(next_fcomment));}
1666
+ // }
1667
+ // if (res==NULL) end=1;
1668
+ //
1669
+ // sprintf(qual,"");
1670
+ // sprintf(qual,"%s",next_qcomment);
1671
+ // sprintf(next_qcomment,"");
1672
+ // res=qual;
1673
+ // sprintf(tmp,"");
1674
+ // while (( res!=NULL ) && (tmp[0]!='>' )) {
1675
+ // res=fgets(tmp,150000,file_qual);
1676
+ // if ((tmp[0]!='>')&&(res!=NULL)) sprintf (qual,"%s%s",qual,tmp);
1677
+ // else if (res!=NULL) {sscanf(tmp,">%9000s",next_qname); strncpy(next_qcomment,tmp+strlen(next_qname)+2,sizeof(next_qcomment));}
1678
+ // }
1679
+ // if (res==NULL) end=1;
1680
+ //
1681
+ // // If extra_used!=NULL then it means that it has been used and a new one must be read
1682
+ // if (extras_bool && (strcmp(extras_used,"")!=0)) {
1683
+ // sprintf(extras,"");
1684
+ // sprintf(extras,"%s",next_ecomment);
1685
+ // sprintf(next_ecomment,"");
1686
+ // res=extras;
1687
+ // sprintf(tmp,"");
1688
+ // while (( res!=NULL ) && (tmp[0]!='>' )) {
1689
+ // res=fgets(tmp,150000,file_extras);
1690
+ // if ((tmp[0]!='>')&&(res!=NULL)) sprintf (extras,"%s%s",extras,tmp);
1691
+ // else if (res!=NULL) {sscanf(tmp,">%9000s",next_ename); strncpy(next_ecomment,tmp+strlen(next_ename)+2,sizeof(next_ecomment));}
1692
+ // }
1693
+ // //if (res==NULL) end=1; Extras file can be finished and processing will continue
1694
+ // }
1695
+ //
1696
+ // /* If the name of the name is equal to the name of the actual sequence then it will be used for writting */
1697
+ // if ( strcmp(sname,ename)==0 ) {
1698
+ // strcpy(extras_used,extras);
1699
+ // strcpy(ename,next_ename);
1700
+ // } else sprintf(extras_used,"");
1701
+ //
1702
+ //
1703
+ // int error_wr=write_seq(file,sname, fasta,qual,extras_used);
1704
+ // if (error_wr!=0) { end=1;error=error_wr; };
1705
+ // if (error_wr==0) cnt++;
1706
+ // strcpy(sname,next_sname);
1707
+ // strcpy(qname,next_qname);
1708
+ //
1709
+ // }
1710
+ //
1711
+ // // repeat until EOF or error
1712
+ // end:
1713
+ // fclose(file_fasta);
1714
+ // fclose(file_qual);
1715
+ //
1716
+ // close_writes(file);
1717
+ // //fclose(file_index);
1718
+ // // print_seqs(seql);
1719
+ // return error;
1720
+ // }
1721
+ //
1722
+
1723
+
1724
+
1725
/*
 * Load the dictionaries "fasta.dic" and "qual.dic" from the current working
 * directory.
 *
 *   d_fasta  receives up to `size` bytes read from fasta.dic
 *   d_qual   receives up to `size` bytes read from qual.dic
 *   size     capacity, in bytes, of each destination buffer
 *
 * A dictionary file shorter than `size` is accepted; the remaining bytes of
 * the destination are left untouched.
 *
 * Returns 0 on success, -1 on invalid arguments, -2 when a dictionary file
 * cannot be opened, -3 when reading one of them fails.
 */
int init_dicts(char *d_fasta,char *d_qual,int size)
{
    const char *dict_f = "fasta.dic";
    const char *dict_q = "qual.dic";

    if (d_fasta == NULL || d_qual == NULL || size < 0) {
        return -1;
    }

    FILE *f_d_fasta = fopen(dict_f,"rb");
    if (f_d_fasta == NULL) {
        fprintf(stderr,"error opening fasta file %s, result %d %s\n",dict_f,errno,strerror(errno));
        return -2;
    }
    fread(d_fasta,1,(size_t)size,f_d_fasta);
    int fasta_failed = ferror(f_d_fasta);
    fclose(f_d_fasta);
    if (fasta_failed) {
        fprintf(stderr,"error reading fasta file %s\n",dict_f);
        return -3;
    }

    FILE *f_d_qual = fopen(dict_q,"rb");
    if (f_d_qual == NULL) {
        fprintf(stderr,"error opening qual file %s, result %d %s\n",dict_q,errno,strerror(errno));
        return -2;
    }
    // read from the qual stream; the fasta stream is already closed here
    fread(d_qual,1,(size_t)size,f_d_qual);
    int qual_failed = ferror(f_d_qual);
    fclose(f_d_qual);
    if (qual_failed) {
        fprintf(stderr,"error reading qual file %s\n",dict_q);
        return -3;
    }

    return 0;
}
1739
+
1740
+
1741
+
1742
/*
 * Release the string *string points to and reset the caller's pointer to
 * NULL so it cannot be freed or used again by accident.
 *
 * string may be NULL, and *string may be NULL; both are no-ops.
 *
 * Always returns 0.
 */
int free_string(char **string){
    if (string != NULL) {
        free(*string);
        *string = NULL;
    }
    return 0;
}
1748
+