scbi_fqbin 0.2.2 → 0.2.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.DS_Store +0 -0
- data/.gitignore +14 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/{README.rdoc → README.md} +0 -0
- data/Rakefile +8 -28
- data/lib/scbi_fqbin.rb +3 -5
- data/lib/scbi_fqbin/fastabin.rb +411 -0
- data/lib/scbi_fqbin/fastq_file_c.rb +373 -0
- data/lib/scbi_fqbin/fbin_file.rb +1 -1
- data/lib/scbi_fqbin/t.rb +9 -0
- data/lib/scbi_fqbin/t2.rb +12 -0
- data/lib/scbi_fqbin/version.rb +3 -0
- data/lib_fqbin_src.zip +0 -0
- data/lib_fqbin_src/Makefile +66 -0
- data/lib_fqbin_src/fq +0 -0
- data/lib_fqbin_src/fq.c +165 -0
- data/lib_fqbin_src/hash_fqbin +0 -0
- data/lib_fqbin_src/hash_fqbin.c +212 -0
- data/lib_fqbin_src/idx_fqbin +21 -0
- data/lib_fqbin_src/iterate_fqbin +0 -0
- data/lib_fqbin_src/iterate_fqbin.c +136 -0
- data/lib_fqbin_src/lib_fqbin.c +1748 -0
- data/lib_fqbin_src/lib_fqbin.h +194 -0
- data/lib_fqbin_src/mk_fqbin +0 -0
- data/lib_fqbin_src/mk_fqbin.c +138 -0
- data/lib_fqbin_src/other/bwxform.c +915 -0
- data/lib_fqbin_src/other/bwxform.h +74 -0
- data/lib_fqbin_src/other/find_in_index.c +130 -0
- data/lib_fqbin_src/other/hash_fbin_nogzchunks.c +164 -0
- data/lib_fqbin_src/other/idx_fqbin +0 -0
- data/lib_fqbin_src/other/idx_fqbin.c +67 -0
- data/lib_fqbin_src/other/make_hsh.sh +14 -0
- data/lib_fqbin_src/other/rd_extras_fbin.c +45 -0
- data/lib_fqbin_src/read_fq +0 -0
- data/lib_fqbin_src/read_fq.c +143 -0
- data/lib_fqbin_src/read_fqbin +0 -0
- data/lib_fqbin_src/read_fqbin.c +101 -0
- data/lib_fqbin_src/sort_index +9 -0
- data/lib_fqbin_src/test.rb +13 -0
- data/scbi_fqbin.gemspec +25 -0
- data/test/build.rake +15 -0
- data/test/fbinfile +0 -0
- data/test/fbinfile.index +0 -0
- data/test/no_test_fill_file.rb +66 -0
- data/test/old/app.rb +43 -0
- data/test/old/bin/iterate_fastabin.rb +54 -0
- data/test/old/bin/mk_fastabin.rb +22 -0
- data/test/old/bin/rd_fastabin.rb +36 -0
- data/test/old/bin/rd_fq.rb +20 -0
- data/test/old/bioruby.rb +27 -0
- data/test/old/c/Makefile +34 -0
- data/test/old/c/fbin_lib.zip +0 -0
- data/test/old/c/iterate_fbin.c +54 -0
- data/test/old/c/libreria_gz.c +707 -0
- data/test/old/c/libreria_gz.h +127 -0
- data/test/old/c/main.c +86 -0
- data/test/old/c/mk_fbin.c +24 -0
- data/test/old/c/rd_seq_fbin.c +44 -0
- data/test/old/c/test_ffi/a.out +0 -0
- data/test/old/c/test_ffi/app.c +26 -0
- data/test/old/c/test_ffi/app.rb +19 -0
- data/test/old/c/test_ffi/liblibreria_gz.dylib +0 -0
- data/test/old/c/test_ffi/libmylibrary.dylib +0 -0
- data/test/old/c/test_ffi/my_library.rb +23 -0
- data/test/old/c/test_ffi/mylibrary.c +22 -0
- data/test/old/c/test_ffi/mylibrary.h +6 -0
- data/test/old/c/usage_instructions.txt +62 -0
- data/test/old/ext/Makefile +187 -0
- data/test/old/ext/Makefile.dario +34 -0
- data/test/old/ext/extconf.rb +8 -0
- data/test/old/ext/mk_fbin.c +24 -0
- data/test/old/ext/sample/extras.txt +4 -0
- data/{.gemtest → test/old/ext/sample/extras2.txt} +0 -0
- data/test/old/ext/sample/f1.fasta +10 -0
- data/test/old/ext/sample/f1.fasta.qual +10 -0
- data/test/old/ext/sample/f1.fbin +0 -0
- data/test/old/ext/sample/f1.fbin.index +0 -0
- data/test/old/ext/sample/main.c +86 -0
- data/test/old/ext/usage_instructions.txt +62 -0
- data/test/old/t_scbi_fastabin.rb +140 -0
- data/test/read_tests/10-original_sizes.sh +16 -0
- data/test/read_tests/20-fq_time.sh +23 -0
- data/test/read_tests/30-fbin_read_time.sh +23 -0
- data/test/read_tests/40-bsc_read_time.sh +21 -0
- data/test/read_tests/50-fq_time_x4.sh +25 -0
- data/test/read_tests/60-fbin_read_time_x4.sh +24 -0
- data/test/read_tests/70-bsc_read_time_x4.sh +32 -0
- data/test/results_bio_scbi_fasta.txt +11 -0
- data/test/{test_scbi_fbin_file.rb → scbi_fbin_file_test.rb} +0 -0
- data/test/speed.txt +81 -0
- data/test/t_scbi_fasta.rb +12 -0
- data/test/write_tests/10-original_sizes.sh +16 -0
- data/test/write_tests/20-zip_time.sh +17 -0
- data/test/write_tests/30-mk_fbin_time.sh +23 -0
- data/test/write_tests/31-mk_fbin_time_f30.sh +21 -0
- data/test/write_tests/40-gzip_time.sh +16 -0
- data/test/write_tests/41-bsc_time.sh +16 -0
- data/test/write_tests/50-zip_sizes.sh +16 -0
- data/test/write_tests/60-fbin_sizes.sh +17 -0
- data/test/write_tests/61-fbin_sizes_f30.sh +16 -0
- data/test/write_tests/70-gzip_sizes.sh +17 -0
- data/test/write_tests/80-bsc_sizes.sh +17 -0
- data/website/index.html +87 -0
- data/website/index.txt +81 -0
- data/website/javascripts/rounded_corners_lite.inc.js +285 -0
- data/website/stylesheets/screen.css +159 -0
- data/website/template.html.erb +50 -0
- metadata +208 -95
- data/History.txt +0 -19
- data/Manifest.txt +0 -12
- data/PostInstall.txt +0 -7
- data/script/console +0 -10
- data/script/destroy +0 -14
- data/script/generate +0 -14
@@ -0,0 +1,1748 @@
|
|
1
|
+
#include "lib_fqbin.h"
|
2
|
+
|
3
|
+
|
4
|
+
/*
 * Reports an error on stderr when error_condition is non-zero.
 *
 * error_condition: non-zero when an error has to be reported.
 * message:         text printed next to the current errno description.
 * return_value:    value handed back to the caller when an error is reported.
 *
 * Returns return_value when error_condition is non-zero and 0 otherwise.
 * (The original fell off the end of the function in the no-error case, so
 * callers storing the result received an indeterminate value.)
 */
int check_error(int error_condition, char *message, int return_value){
    if (error_condition) {
        /* keep a copy: errno may be overwritten by the calls below */
        int saved_errno = errno;

        fprintf(stderr,"Error %d; %s\nMSG:%s\n", saved_errno, message, strerror(saved_errno));
        return return_value;
    }

    return 0;
}
|
10
|
+
|
11
|
+
|
12
|
+
int write_seq(struct file_data *file, char *seq_name, char *fasta, char *qual, char *extras)
|
13
|
+
{
|
14
|
+
// compress data
|
15
|
+
char metainfo[SEQ_METADATA];
|
16
|
+
int error=0;
|
17
|
+
|
18
|
+
// Convert to fastq
|
19
|
+
|
20
|
+
// char qual[150000];
|
21
|
+
// sprintf(qual,"");
|
22
|
+
// char *sep = " ";
|
23
|
+
// char *word, *brkt;
|
24
|
+
//
|
25
|
+
// for (word = strtok_r(in_qual, sep, &brkt);
|
26
|
+
// word;
|
27
|
+
// word = strtok_r(NULL, sep, &brkt))
|
28
|
+
// {
|
29
|
+
// sprintf(qual,"%s%c",qual,atoi(word)+33);
|
30
|
+
// // strcat(qual2,".");
|
31
|
+
// // printf("%s,%s,%c\n",word,qual2, atoi(word)+33);
|
32
|
+
//
|
33
|
+
// }
|
34
|
+
|
35
|
+
// printf("write seq\n");
|
36
|
+
|
37
|
+
|
38
|
+
if (file->gzf_bin==NULL) {fprintf(stderr,"error with gzfile_bin, is NULL :%s\n",gzerror(file->gzf_bin,&error));return -2;}
|
39
|
+
|
40
|
+
long fasta_len=strlen(fasta);
|
41
|
+
|
42
|
+
// preprocess fasta string
|
43
|
+
// if (fasta_len>0)
|
44
|
+
// {
|
45
|
+
// // printf("IN:%s, %ld\n",fasta,strlen(fasta));
|
46
|
+
// // BWXform(fasta, fasta,1);
|
47
|
+
// // printf("\nOUT:\n");
|
48
|
+
// // write(1,fasta,fasta_len);
|
49
|
+
// // printf("\n");
|
50
|
+
// // fflush(1);
|
51
|
+
// }
|
52
|
+
|
53
|
+
|
54
|
+
// preprocess quality string
|
55
|
+
if (strlen(qual)>0){
|
56
|
+
|
57
|
+
// printf("========================\nQ:%s\n",qual);
|
58
|
+
// process qual
|
59
|
+
int i;
|
60
|
+
|
61
|
+
// annotate first qual to compare with rest of quals
|
62
|
+
char old=qual[0];
|
63
|
+
|
64
|
+
// if qual is going to be flattened, take the flatten limit as old
|
65
|
+
if ((file->flatten_qual>0) & (old>=file->flatten_qual))
|
66
|
+
{
|
67
|
+
old=file->flatten_qual;
|
68
|
+
}
|
69
|
+
|
70
|
+
int same_qual=1;
|
71
|
+
|
72
|
+
// int repeated_count=0;
|
73
|
+
// int repeated_start=0;
|
74
|
+
// int max_repeated_count=0;
|
75
|
+
// int max_repeated_start=0;
|
76
|
+
|
77
|
+
// process qual string
|
78
|
+
for( i = 0; i < strlen(qual); ++i)
|
79
|
+
{
|
80
|
+
|
81
|
+
//discretize up by discretize_qual value
|
82
|
+
if (file->discretize_qual>1){
|
83
|
+
// qual[i]=qual[i]-(qual[i] % 2)+2-1;
|
84
|
+
qual[i]=(qual[i] / file->discretize_qual)*file->discretize_qual;
|
85
|
+
}
|
86
|
+
|
87
|
+
// printf("FL:%c>=%c\n",qual[i],file->flatten_qual);
|
88
|
+
// trim high qualitys
|
89
|
+
if ((file->flatten_qual>0) & (qual[i]>file->flatten_qual)){
|
90
|
+
qual[i]=file->flatten_qual;
|
91
|
+
}
|
92
|
+
|
93
|
+
// if (qual[i]!=old) {
|
94
|
+
// if (repeated_count>=max_repeated_count) {
|
95
|
+
// max_repeated_start=repeated_start;
|
96
|
+
// max_repeated_count=repeated_count;
|
97
|
+
// }
|
98
|
+
//
|
99
|
+
// repeated_start=i;
|
100
|
+
// repeated_count=0;
|
101
|
+
// old=qual[i]
|
102
|
+
//
|
103
|
+
// same_qual=0;
|
104
|
+
// }
|
105
|
+
|
106
|
+
// if (qual[i]!=old) {same_qual=0;}
|
107
|
+
}
|
108
|
+
|
109
|
+
if (same_qual)
|
110
|
+
{
|
111
|
+
// trim qual string
|
112
|
+
sprintf(qual,"%c",old);
|
113
|
+
qual[1]=0;
|
114
|
+
// printf("EQUAL: %s,%s,%ld\n",seq_name, qual,strlen(qual));
|
115
|
+
}
|
116
|
+
|
117
|
+
|
118
|
+
// sino hacer RLE
|
119
|
+
|
120
|
+
|
121
|
+
// printf("\nQ:%s\n",qual);
|
122
|
+
|
123
|
+
}
|
124
|
+
// printf("Calc fasta_len\n");
|
125
|
+
// printf("Calculated fasta_len\n");
|
126
|
+
|
127
|
+
snprintf(metainfo,SEQ_METADATA-1,"9999%s %ld %ld %ld", seq_name, fasta_len, strlen(qual), strlen(extras));
|
128
|
+
snprintf(metainfo,SEQ_METADATA-1,"%4ld%s %ld %ld %ld", strlen(metainfo)-4, seq_name, fasta_len, strlen(qual), strlen(extras));
|
129
|
+
// snprintf(metainfo,SEQ_METADATA-1,"9999%s %ld %ld", seq_name, strlen(fasta), strlen(extras));
|
130
|
+
// snprintf(metainfo,SEQ_METADATA-1,"%4ld%s %ld %ld", strlen(metainfo)-4, seq_name, strlen(fasta), strlen(extras));
|
131
|
+
|
132
|
+
// get begin pos of header
|
133
|
+
long beginH=gztell(file->gzf_bin);
|
134
|
+
|
135
|
+
// TODO check gztell
|
136
|
+
if (beginH==-1) {fprintf(stderr,"error with pos of beginH of gzfile_bin :%s\n", gzerror(file->gzf_bin,&error)); return -2;}
|
137
|
+
|
138
|
+
// write seq to bin file
|
139
|
+
gzwrite(file->gzf_bin, metainfo, strlen(metainfo));
|
140
|
+
|
141
|
+
// TODO check gzwrite
|
142
|
+
long beginI=gztell(file->gzf_bin);
|
143
|
+
|
144
|
+
if (beginI==-1) {fprintf(stderr,"error with pos of beginI of gzfile :%s\n",gzerror(file->gzf_bin,&error));return -2;}
|
145
|
+
|
146
|
+
// printf("Before write fasta\n");
|
147
|
+
|
148
|
+
int res=1;
|
149
|
+
if (fasta_len>0) res=gzwrite(file->gzf_bin,fasta,fasta_len); //Z_FILTERED);
|
150
|
+
|
151
|
+
if ( res==0 ) { fprintf(stderr,"Error when writting fasta\n");return -8;}
|
152
|
+
long fastaS=gztell(file->gzf_bin)-beginI;
|
153
|
+
|
154
|
+
// printf("After write fasta\n");
|
155
|
+
|
156
|
+
|
157
|
+
if (strlen(qual)>0){
|
158
|
+
res=gzwrite(file->gzf_bin,qual,strlen(qual)); //Z_FILTERED);
|
159
|
+
}
|
160
|
+
|
161
|
+
if ( res==0 ) { fprintf(stderr,"Error when writting qual\n");return -8;}
|
162
|
+
long qualS=gztell(file->gzf_bin)-fastaS-beginI;
|
163
|
+
|
164
|
+
if (strlen(extras)>0) res=gzwrite(file->gzf_bin,extras,strlen(extras)); //Z_FILTERED);
|
165
|
+
|
166
|
+
if ( res==0 ) { fprintf(stderr,"Error when writting extras\n");return -8;}
|
167
|
+
long extrasS=gztell(file->gzf_bin)-qualS-fastaS-beginI;
|
168
|
+
|
169
|
+
// add_sequence(&seql,seq_name,pos_chunk_gz,beginI,fastaS,qualS,extrasS);
|
170
|
+
|
171
|
+
// Write index file
|
172
|
+
if((file)->create_index)
|
173
|
+
{
|
174
|
+
char tmp[SEQ_METADATA];
|
175
|
+
sprintf(tmp,"%s %lld %ld\n",seq_name,file->pos_chunk_gz,beginH);
|
176
|
+
gzwrite(file->gzf_index,tmp,strlen(tmp));
|
177
|
+
}
|
178
|
+
|
179
|
+
(file->counter)++;
|
180
|
+
// if (counter > 2) fprintf(stderr,"Probando static counter para llamadas desde ruby, valor %d\n",counter);
|
181
|
+
|
182
|
+
// create new chunk
|
183
|
+
if (((file->counter)%10000)==0) {
|
184
|
+
// curr_time=time(NULL);
|
185
|
+
// printf("10k seqs in:%f secs\n",difftime(curr_time,prev_time));
|
186
|
+
// prev_time=curr_time;
|
187
|
+
|
188
|
+
// close current chunk
|
189
|
+
gzclose(file->gzf_bin);
|
190
|
+
|
191
|
+
// open file again to annotate chunk
|
192
|
+
int file_bin=open(file->name,O_APPEND);
|
193
|
+
|
194
|
+
//goto end of file
|
195
|
+
long long pos=lseek(file_bin,0,SEEK_END);
|
196
|
+
if (pos==-1) {fprintf(stderr,"error %d seeking file :%s\n",errno,strerror(errno));return -1;}
|
197
|
+
|
198
|
+
// annotate chunk pos
|
199
|
+
file->pos_chunk_gz=pos;
|
200
|
+
|
201
|
+
close(file_bin);
|
202
|
+
|
203
|
+
// open new gzfile
|
204
|
+
file->gzf_bin=gzopen(file->name,"ab");
|
205
|
+
if (file->gzf_bin==NULL) {fprintf(stderr,"error opening gzfile :%s\n",gzerror(file->gzf_bin,&error));return -2;}
|
206
|
+
}
|
207
|
+
|
208
|
+
return 0;
|
209
|
+
}
|
210
|
+
|
211
|
+
|
212
|
+
|
213
|
+
/* Reads the metadata from the main file
|
214
|
+
It initializes the version variable
|
215
|
+
*/
|
216
|
+
int read_bin_file_metadata(struct file_data *filed)
|
217
|
+
{
|
218
|
+
char header[SEQ_METADATA];
|
219
|
+
int fastaS,qualS,extrasS=0;
|
220
|
+
int ver,subver;
|
221
|
+
|
222
|
+
// printf("pos1b %ld\n",gztell(filed->gzf_bin));//lseek ((*filed), 0, SEEK_CUR))
|
223
|
+
|
224
|
+
int res=read_seq_header(filed->gzf_bin, header, &fastaS, &qualS, &extrasS);
|
225
|
+
// printf("pos2b %ld\n",gztell(filed->gzf_bin));//lseek ((*filed), 0, SEEK_CUR))
|
226
|
+
|
227
|
+
|
228
|
+
if ( res!=0 ) {return -1;}
|
229
|
+
if ( strlen(header)<20 ) {fprintf(stderr,"Too short sequence header:%s. lenght:%ld\n",header,strlen(header));return -1;}
|
230
|
+
|
231
|
+
// 28UMACOMPRESSEDFORMAT_1_0 0 0 0
|
232
|
+
// header[strlen(header)-2]=0;
|
233
|
+
|
234
|
+
if (strncmp(header,"UMACOMPRESSEDFORMAT_",19)!=0) {fprintf(stderr,"Incorrect header in file, header:%s\n",header);return -1;}
|
235
|
+
// TODO fill the file_data structure with the header data
|
236
|
+
if (sscanf(header,"UMACOMPRESSEDFORMAT_%d_%d",&ver,&subver)!=2) return -1;
|
237
|
+
//if (sscanf(header,"UMACOMPRESSEDFORMAT_%d_%d",&(filed->version),&(filed->subversion))!=2) return -1;
|
238
|
+
filed->version=11;//ver;
|
239
|
+
filed->subversion=subver;
|
240
|
+
// fprintf(stderr,"file version:%d,%d\n",filed->version,filed->subversion);
|
241
|
+
return 0;
|
242
|
+
}
|
243
|
+
|
244
|
+
/* Reads the metadata from the index file
|
245
|
+
It initializes the version and binary_search variable
|
246
|
+
*/
|
247
|
+
int read_index_file_metadata(struct file_data *filed)
|
248
|
+
{
|
249
|
+
char header[SEQ_METADATA];
|
250
|
+
int fastaS,qualS,extrasS=0;
|
251
|
+
|
252
|
+
|
253
|
+
int res=read_seq_header(filed->gzf_bin, header, &fastaS, &qualS, &extrasS);
|
254
|
+
|
255
|
+
if ( strlen(header)<19 ) {return -1;}
|
256
|
+
|
257
|
+
// 28UMACOMPRESSEDFORMAT 1 0 0 0 0
|
258
|
+
header[strlen(header)-2]=0;
|
259
|
+
|
260
|
+
if (strncmp(header,"UMACOMPRESSEDFORMAT",19)!=0) return -1;
|
261
|
+
// TODO fill the file_data structure with the header data
|
262
|
+
if (sscanf(header,"UMACOMPRESSEDFORMAT %d %d",&(filed->version),&(filed->subversion))!=2) {
|
263
|
+
fprintf(stderr,"SEQ READ:Header incorrect when reading versions:%s.\n",header);
|
264
|
+
return -1;
|
265
|
+
}
|
266
|
+
|
267
|
+
return 0;
|
268
|
+
}
|
269
|
+
|
270
|
+
/* reads the header of a sequence in the main file.
|
271
|
+
the pointer to the file points to the fasta data after calling read_seq_header
|
272
|
+
returns 0 if ok
|
273
|
+
-1 if there is an error
|
274
|
+
-2 if EOF
|
275
|
+
*/
|
276
|
+
|
277
|
+
int read_seq_header(gzFile *gzf_bin, char *seq_name,int *fastaS, int *qualS, int *extrasS)
|
278
|
+
{
|
279
|
+
int header_size=4;
|
280
|
+
char hsize[40];
|
281
|
+
char tmp[1000];
|
282
|
+
char sname[SEQ_METADATA];
|
283
|
+
|
284
|
+
long pos=gzread(gzf_bin,hsize,header_size);
|
285
|
+
|
286
|
+
// EOF found
|
287
|
+
if ( pos==0 ) return -2;
|
288
|
+
|
289
|
+
// Error reading file
|
290
|
+
if ( pos==-1 ) {fprintf(stderr,"Incorrect sequence header. File may be corrupted\n");return -1;}
|
291
|
+
|
292
|
+
hsize[pos]=0;
|
293
|
+
sscanf(hsize,"%d",&header_size);
|
294
|
+
pos=gzread(gzf_bin,tmp,header_size);
|
295
|
+
|
296
|
+
if ( pos==0 ) return -2;
|
297
|
+
|
298
|
+
if ( pos==-1 ) {fprintf(stderr,"Incorrect sequence header. File may be corrupted\n");return -1;}
|
299
|
+
|
300
|
+
tmp[header_size]=0;
|
301
|
+
int reads=sscanf(tmp,"%s %d %d %d",sname,fastaS,qualS,extrasS);
|
302
|
+
|
303
|
+
if (reads!=4) {return -1;};
|
304
|
+
|
305
|
+
if (seq_name!=NULL) strncpy(seq_name,sname,SEQ_METADATA);
|
306
|
+
|
307
|
+
return 0;
|
308
|
+
}
|
309
|
+
|
310
|
+
// Placeholder for the check that has to run before reading.
// Result codes it is meant to produce:
//   0 : both the bin and index files exist and are from the current version
//   1 : both the bin and index files exist but are from another version
//   2 : both files are missing
//   3 : the bin file is missing
//   4 : the index file is missing
// Nothing is opened or inspected yet: the result is always 0.
int check_files()
{
    const int files_ok = 0;

    return files_ok;
}
|
324
|
+
|
325
|
+
// returns the version of the opened file
|
326
|
+
int version(struct file_data *filed)
|
327
|
+
{
|
328
|
+
if (filed->gzf_bin==NULL) return -1;
|
329
|
+
return filed->version;
|
330
|
+
}
|
331
|
+
|
332
|
+
// returns the version of the opened file
|
333
|
+
int subversion(struct file_data *filed)
|
334
|
+
{
|
335
|
+
if (filed->gzf_bin==NULL) return -1;
|
336
|
+
return filed->subversion;
|
337
|
+
}
|
338
|
+
|
339
|
+
/*
|
340
|
+
mode can be:
|
341
|
+
1 - random, for each read it begins to read from the beggining of index
|
342
|
+
2 - sequential, it keeps the position inside the index and main files.
|
343
|
+
*/
|
344
|
+
int initialize_sequential_reads(struct file_data ** filed, char *filename)
|
345
|
+
{
|
346
|
+
char header[SEQ_METADATA];
|
347
|
+
int fastaS,qualS,extrasS=0;
|
348
|
+
|
349
|
+
if ( *filed == NULL ) {*filed=malloc(sizeof(struct file_data));}
|
350
|
+
|
351
|
+
|
352
|
+
(*filed)->gzf_bin=gzopen(filename,"r");
|
353
|
+
|
354
|
+
int res=check_error((*filed)->gzf_bin==NULL,"Unable to open file",-1);
|
355
|
+
|
356
|
+
strncpy((*filed)->name,filename,MAXFNAME);
|
357
|
+
(*filed)->error=0;
|
358
|
+
|
359
|
+
|
360
|
+
|
361
|
+
// reads the metadata
|
362
|
+
/*
|
363
|
+
int res=read_seq_header(filed->gzf_bin, header, &fastaS, &qualS, &extrasS);
|
364
|
+
|
365
|
+
if ( strlen(header)<19 ) {fprintf(stderr,"SEQ READ:Header incorrect:%s.\n",header);return -1;}
|
366
|
+
|
367
|
+
// 28UMACOMPRESSEDFORMAT_1 0 0 0
|
368
|
+
header[strlen(header)-2]=0;
|
369
|
+
|
370
|
+
if (strncmp(header,"UMACOMPRESSEDFORMAT",19)!=0) return -1;
|
371
|
+
// TODO fill the file_data structure with the header data
|
372
|
+
*/
|
373
|
+
// printf("pos1 %ld\n",gztell((*filed)->gzf_bin));
|
374
|
+
res= read_bin_file_metadata(*filed);
|
375
|
+
// printf("pos2 %ld\n",gztell((*filed)->gzf_bin));
|
376
|
+
|
377
|
+
// inspect_file_data_struct(filed);
|
378
|
+
|
379
|
+
return res;
|
380
|
+
}
|
381
|
+
|
382
|
+
|
383
|
+
int read_data_sequential(struct file_data *filed,char **seq_name, char **fasta, char **qual, char **extras)
|
384
|
+
{
|
385
|
+
int res=0;
|
386
|
+
int error=0;
|
387
|
+
int fastaS,qualS,extrasS=0;
|
388
|
+
|
389
|
+
if ( *seq_name == NULL ) {*seq_name=(char *)malloc(SEQ_METADATA);strncpy(*seq_name,"",4);}
|
390
|
+
|
391
|
+
res=read_seq_header(filed->gzf_bin, *seq_name, &fastaS, &qualS, &extrasS);
|
392
|
+
// printf("FS:%d,QS:%d\n",fastaS,qualS);
|
393
|
+
if (res==-2) // EOF
|
394
|
+
return -9;
|
395
|
+
|
396
|
+
if ( *fasta == NULL ) {*fasta=(char *)malloc(fastaS+1);strncpy(*fasta,"",fastaS);}
|
397
|
+
if ( *qual == NULL) {*qual=(char *)malloc(qualS+1);strncpy(*qual,"",qualS);}
|
398
|
+
if (( *extras == NULL )&&(extrasS>0)) {*extras=(char *)malloc(extrasS+1);strncpy(*extras,"",extrasS);}
|
399
|
+
|
400
|
+
long pos=gzread(filed->gzf_bin,*fasta,fastaS);
|
401
|
+
|
402
|
+
// BWReverseXform(*fasta, *fasta, 1, fastaS);
|
403
|
+
|
404
|
+
(*fasta)[fastaS]=0;
|
405
|
+
pos=gzread(filed->gzf_bin,*qual,qualS);
|
406
|
+
|
407
|
+
// if only one qual read, repeat it
|
408
|
+
if((qualS==1) & (qualS!=fastaS))
|
409
|
+
{
|
410
|
+
char q=*qual[0];
|
411
|
+
free(*qual);
|
412
|
+
*qual=NULL;
|
413
|
+
qualS=fastaS;
|
414
|
+
if ( *qual == NULL) {*qual=(char *)malloc(qualS+1);strncpy(*qual,"",qualS);}
|
415
|
+
memset(*qual,q,qualS);
|
416
|
+
}
|
417
|
+
|
418
|
+
(*qual)[qualS]=0;
|
419
|
+
// printf("LLEGA\n:");
|
420
|
+
if (extrasS>0) {pos=gzread(filed->gzf_bin,*extras,extrasS);(*extras)[extrasS]=0;}
|
421
|
+
return 0;
|
422
|
+
|
423
|
+
}
|
424
|
+
int close_sequential_reads(struct file_data *file_d)
|
425
|
+
{
|
426
|
+
gzclose(file_d->gzf_bin);
|
427
|
+
if ( (file_d)!= NULL ) { free(file_d);}
|
428
|
+
}
|
429
|
+
|
430
|
+
/*
 * Index regeneration is disabled: this function always returns -1 and does
 * not touch filename.
 *
 * Reason given by the original author (translated): this function cannot be
 * used because, when the file is read, the read position does not have to
 * match the write position used when the fbin was created, so while
 * generating the index it is not known which chunk is being read. zlib's
 * gzread reads a piece of the file of whatever size it decides, which then
 * does not match the blocks that were written.
 *
 * The draft implementation that followed the unconditional return was
 * unreachable and has been removed.
 */
int regenerate_index(char * filename){
    (void)filename;

    return -1;
}
|
590
|
+
|
591
|
+
long long find_seq_in_hash(char *filename,char *sname)
|
592
|
+
{
|
593
|
+
|
594
|
+
char hash_file_name[MAXFNAME];
|
595
|
+
// char indexname[MAXFNAME];
|
596
|
+
int error;
|
597
|
+
char sname1[MAXSEQNAME];// sequence name
|
598
|
+
char sname2[MAXSEQNAME];// sequence name
|
599
|
+
long long gz_chunk=0;
|
600
|
+
char tmp[SEQ_METADATA];
|
601
|
+
long long res=-1;
|
602
|
+
|
603
|
+
// to save min, max sequences and current chunk
|
604
|
+
char min_name[MAXSEQNAME];
|
605
|
+
char max_name[MAXSEQNAME];
|
606
|
+
long long current_chunk=0;
|
607
|
+
|
608
|
+
|
609
|
+
strcpy(min_name,"");
|
610
|
+
strcpy(max_name,"");
|
611
|
+
|
612
|
+
// calc index and hash name
|
613
|
+
// snprintf(indexname,MAXFNAME,"%s.index",filename);
|
614
|
+
snprintf(hash_file_name,MAXFNAME,"%s.index.hash",filename);
|
615
|
+
|
616
|
+
|
617
|
+
// open index and hash file
|
618
|
+
gzFile gzhash_file=gzopen(hash_file_name,"r");
|
619
|
+
|
620
|
+
if (gzhash_file==NULL) {
|
621
|
+
// fprintf(stderr,"error opening gzhash_file :%s\n",gzerror(gzhash_file,&error));
|
622
|
+
// no hash file found
|
623
|
+
return -2;
|
624
|
+
}
|
625
|
+
|
626
|
+
// repeat until EOF
|
627
|
+
while ( gzgets(gzhash_file,tmp,sizeof(tmp))!=Z_NULL ) {
|
628
|
+
|
629
|
+
// printf("%s\n",tmp);
|
630
|
+
// parse string
|
631
|
+
int reads=sscanf(tmp,"%s %s %lld",sname1,sname2,&gz_chunk);
|
632
|
+
|
633
|
+
if(reads==3) // valid index line
|
634
|
+
{
|
635
|
+
//
|
636
|
+
if((strcmp(sname,sname1)>=0) && (strcmp(sname,sname2)<=0))
|
637
|
+
{
|
638
|
+
#if DEBUG
|
639
|
+
printf("%s in [%s,%s]\n",sname,sname1,sname2);
|
640
|
+
#endif
|
641
|
+
res = gz_chunk;
|
642
|
+
break;
|
643
|
+
}else{
|
644
|
+
// printf("%s NOT IN [%s,%s]\n",sname,sname1,sname2);
|
645
|
+
}
|
646
|
+
|
647
|
+
}
|
648
|
+
|
649
|
+
}
|
650
|
+
|
651
|
+
// close files
|
652
|
+
gzclose(gzhash_file);
|
653
|
+
|
654
|
+
return res;
|
655
|
+
}
|
656
|
+
|
657
|
+
|
658
|
+
long long find_seq_in_index(char *filename,char *sname, long long index_chunk, long long *gz_chunk, long long *beginH){
|
659
|
+
|
660
|
+
long long chunk=-1;
|
661
|
+
|
662
|
+
char file_name[MAXFNAME];
|
663
|
+
// char indexname[MAXFNAME];
|
664
|
+
int error;
|
665
|
+
char sname1[MAXSEQNAME];// sequence name
|
666
|
+
char sname2[MAXSEQNAME];// sequence name
|
667
|
+
long long aux_beginH=-1,aux_gz_chunk=-1;
|
668
|
+
char tmp[SEQ_METADATA];
|
669
|
+
long long res=-1;
|
670
|
+
|
671
|
+
*gz_chunk = 0;
|
672
|
+
*beginH=0;
|
673
|
+
|
674
|
+
// to save min, max sequences and current chunk
|
675
|
+
char min_name[MAXSEQNAME];
|
676
|
+
char max_name[MAXSEQNAME];
|
677
|
+
long long current_chunk=0;
|
678
|
+
|
679
|
+
|
680
|
+
strcpy(min_name,"");
|
681
|
+
strcpy(max_name,"");
|
682
|
+
|
683
|
+
// calc index and hash name
|
684
|
+
// snprintf(indexname,MAXFNAME,"%s.index",filename);
|
685
|
+
snprintf(file_name,MAXFNAME,"%s.index",filename);
|
686
|
+
|
687
|
+
|
688
|
+
|
689
|
+
// open index and hash file
|
690
|
+
// gzFile filegz=gzopen(file_name,"r");
|
691
|
+
int file=open(file_name, O_RDONLY);
|
692
|
+
|
693
|
+
if (file<0)
|
694
|
+
{
|
695
|
+
return -2;
|
696
|
+
}
|
697
|
+
|
698
|
+
if(index_chunk>0)
|
699
|
+
{
|
700
|
+
#if DEBUG
|
701
|
+
printf("Seek to %lld\n",index_chunk);
|
702
|
+
#endif
|
703
|
+
// res=gzseek(filegz,index_chunk,SEEK_SET);
|
704
|
+
res=lseek(file,index_chunk,SEEK_SET);
|
705
|
+
|
706
|
+
// printf("Seeked\n");
|
707
|
+
}
|
708
|
+
|
709
|
+
gzFile filegz=gzdopen(file,"r");
|
710
|
+
|
711
|
+
if (filegz==NULL) {
|
712
|
+
// fprintf(stderr,"error opening gzhash_file :%s\n",gzerror(filegz,&error));
|
713
|
+
return -2;
|
714
|
+
}
|
715
|
+
|
716
|
+
// repeat until EOF
|
717
|
+
while ( gzgets(filegz,tmp,sizeof(tmp))!=Z_NULL ) {
|
718
|
+
|
719
|
+
// printf("%s\n",tmp);
|
720
|
+
// parse string
|
721
|
+
// int reads=sscanf(tmp,"%s %s %lld",sname1,sname2,&gz_chunk);
|
722
|
+
int reads=sscanf(tmp,"%s %lld %lld",sname1,&aux_gz_chunk,&aux_beginH);
|
723
|
+
|
724
|
+
|
725
|
+
if(reads==3) // valid index line
|
726
|
+
{
|
727
|
+
//
|
728
|
+
if(strcmp(sname,sname1)==0)
|
729
|
+
{
|
730
|
+
#if DEBUG
|
731
|
+
printf("%s IN %s\n",sname, tmp);
|
732
|
+
#endif
|
733
|
+
|
734
|
+
chunk = aux_gz_chunk;
|
735
|
+
*gz_chunk = aux_gz_chunk;
|
736
|
+
*beginH=aux_beginH;
|
737
|
+
|
738
|
+
// beginH=gz_beginH;
|
739
|
+
break;
|
740
|
+
}else{
|
741
|
+
#if DEBUG
|
742
|
+
printf("NOT IN %s",tmp);
|
743
|
+
#endif
|
744
|
+
// break;
|
745
|
+
}
|
746
|
+
|
747
|
+
}
|
748
|
+
|
749
|
+
}
|
750
|
+
|
751
|
+
// close files
|
752
|
+
gzclose(filegz);
|
753
|
+
|
754
|
+
return chunk;
|
755
|
+
}
|
756
|
+
|
757
|
+
|
758
|
+
|
759
|
+
/*
|
760
|
+
read_seq reads from filename the sequence named seq_name and returns its
|
761
|
+
fasta, qual and extras in those variables.
|
762
|
+
It returns 0 if there are no errors, otherwise it returns:
|
763
|
+
-2 : error opening index file (it doesn't exists)
|
764
|
+
-3 : error reading index file
|
765
|
+
-4 : error sequence not found in index file
|
766
|
+
-5 : error opening file (it doesn't exists)
|
767
|
+
-6 : error reading file
|
768
|
+
-7 : error sequence not found
|
769
|
+
-8 : error uncompressing sequence
|
770
|
+
-9 : EOF
|
771
|
+
|
772
|
+
*/
|
773
|
+
|
774
|
+
int read_seq(char *filename, char *seq_name, char **fasta, char **qual, char **extras)
|
775
|
+
{
|
776
|
+
/* Hacer grep en filename.index de seq_name */
|
777
|
+
/* Una vez encontrado leer su info (indice y offsets) */
|
778
|
+
/* leer de filename en sus offests el fasta qual y extras */
|
779
|
+
/* Descomprimirlo y devolverlo */
|
780
|
+
|
781
|
+
char indexname[MAXFNAME];
|
782
|
+
char sname[MAXSEQNAME];// sequence name
|
783
|
+
// char *fasta_comp; // compressed fasta
|
784
|
+
// char *qual_comp; // compressed qual
|
785
|
+
// char *extras_comp; // compressed extras
|
786
|
+
long long beginH, gz_chunk=0;
|
787
|
+
int fastaS, qualS, extrasS=0;
|
788
|
+
char tmp[SEQ_METADATA];
|
789
|
+
int res=0;
|
790
|
+
int error=0;
|
791
|
+
|
792
|
+
// int bufsize=MAXSEQLENGTH;
|
793
|
+
//
|
794
|
+
// // allocate memory for return data if necessary
|
795
|
+
// if ( *fasta == NULL ) {*fasta=(char *)malloc(bufsize);strncpy(*fasta,"",bufsize);}
|
796
|
+
// if ( *qual == NULL) {*qual=(char *)malloc(bufsize);strncpy(*qual,"",bufsize);}
|
797
|
+
// if ( *extras == NULL ) {*extras=(char *)malloc(bufsize);strncpy(*extras,"",bufsize);}
|
798
|
+
|
799
|
+
// calc index name
|
800
|
+
// snprintf(indexname,MAXFNAME,"%s.index",filename);
|
801
|
+
|
802
|
+
long long chunk=find_seq_in_hash(filename,seq_name);
|
803
|
+
// printf("Chunk: %lld\n",chunk);
|
804
|
+
|
805
|
+
if (chunk<0){
|
806
|
+
chunk=0;
|
807
|
+
}
|
808
|
+
|
809
|
+
if ((res=find_seq_in_index(filename,seq_name,chunk,&gz_chunk,&beginH))<0){
|
810
|
+
return res;
|
811
|
+
};
|
812
|
+
|
813
|
+
|
814
|
+
// open index file
|
815
|
+
// gzFile gzfile_index=gzopen(indexname,"r");
|
816
|
+
// if (gzfile_index==NULL) {
|
817
|
+
// fprintf(stderr,"error opening gzfile_index :%s\n",gzerror(gzfile_index,&error));
|
818
|
+
// return -2;
|
819
|
+
// }
|
820
|
+
//
|
821
|
+
// // Reads the index to this info, and the offset to its data
|
822
|
+
// int reads=3;
|
823
|
+
// while ( reads == 3 ) {
|
824
|
+
//
|
825
|
+
// // read a chunk of data from index with the size of tmp
|
826
|
+
// gzgets(gzfile_index,tmp,sizeof(tmp));
|
827
|
+
// reads=sscanf(tmp,"%s %lld %lld",sname,&gz_chunk,&beginH);
|
828
|
+
//
|
829
|
+
//
|
830
|
+
//
|
831
|
+
//
|
832
|
+
// if (( reads != 3 ) && ( reads!=EOF )) {
|
833
|
+
// fprintf(stderr,"Error scanning index: %d\n",reads);
|
834
|
+
// gzclose(gzfile_index);
|
835
|
+
// return -3;
|
836
|
+
// }
|
837
|
+
//
|
838
|
+
// // sequence was finally found, exit loop
|
839
|
+
// if ( strncmp(sname, seq_name,MAXSEQNAME)==0) reads=999; // to get out, seq found
|
840
|
+
// }
|
841
|
+
//
|
842
|
+
// // close index file
|
843
|
+
// gzclose(gzfile_index);
|
844
|
+
//
|
845
|
+
// maybe sequence was not found
|
846
|
+
// fprintf(stderr,"Sequence not found\n");
|
847
|
+
// if (reads==EOF) {return -4;}
|
848
|
+
|
849
|
+
// We get here if sequence was found
|
850
|
+
|
851
|
+
#if DEBUG
|
852
|
+
printf("Index found %lld. Seeking\n",gz_chunk);
|
853
|
+
#endif
|
854
|
+
// open bin file to extract data
|
855
|
+
int dataf=open(filename, O_RDONLY);
|
856
|
+
|
857
|
+
// seek to chunk pos
|
858
|
+
// TODO- ¿como se salta el chunk?
|
859
|
+
// res=lseek(dataf,gz_chunk,SEEK_SET);
|
860
|
+
res=lseek(dataf,gz_chunk,SEEK_SET);
|
861
|
+
|
862
|
+
// TODO check res
|
863
|
+
gzFile gzfile_bin=gzdopen(dataf,"r");
|
864
|
+
|
865
|
+
// seek to seq inside chunk
|
866
|
+
res=gzseek(gzfile_bin,beginH,SEEK_SET);
|
867
|
+
// TODO check res
|
868
|
+
|
869
|
+
// printf("Seeked\n");
|
870
|
+
|
871
|
+
// read sequence header
|
872
|
+
res=read_seq_header(gzfile_bin,NULL, &fastaS, &qualS, &extrasS);
|
873
|
+
|
874
|
+
// int bufsize=MAXSEQLENGTH;
|
875
|
+
|
876
|
+
// memset(*qual,q,qualS);
|
877
|
+
// allocate memory for return data if necessary
|
878
|
+
if ( *fasta == NULL ) {*fasta=(char *)malloc(fastaS+1);(*fasta)[0]=0;}
|
879
|
+
if ( *qual == NULL) {*qual=(char *)malloc(qualS+1);(*qual)[0]=0;}
|
880
|
+
if ( *extras == NULL ) {*extras=(char *)malloc(extrasS+1);(*extras)[0]=0;}
|
881
|
+
|
882
|
+
long pos=gzread(gzfile_bin,*fasta,fastaS);
|
883
|
+
// printf("LEIDO:%ld\n",pos);
|
884
|
+
// BWReverseXform(*fasta, *fasta, 1, fastaS);
|
885
|
+
|
886
|
+
(*fasta)[fastaS]=0;
|
887
|
+
pos=gzread(gzfile_bin,*qual,qualS);
|
888
|
+
|
889
|
+
// if only one qual read, repeat it
|
890
|
+
if((qualS==1) & (qualS!=fastaS))
|
891
|
+
{
|
892
|
+
char q=*qual[0];
|
893
|
+
free(*qual);
|
894
|
+
*qual=NULL;
|
895
|
+
qualS=fastaS;
|
896
|
+
if ( *qual == NULL) {*qual=(char *)malloc(qualS+1);strncpy(*qual,"",qualS);}
|
897
|
+
memset(*qual,q,qualS);
|
898
|
+
}
|
899
|
+
|
900
|
+
(*qual)[qualS]=0;
|
901
|
+
|
902
|
+
if (extrasS>0) {pos=gzread(gzfile_bin,*extras,extrasS); (*extras)[extrasS]=0;}
|
903
|
+
gzclose(gzfile_bin);
|
904
|
+
|
905
|
+
return 0;
|
906
|
+
}
|
907
|
+
|
908
|
+
void inspect_file_data_struct(struct file_data *file){
|
909
|
+
|
910
|
+
printf("file name:%s\n",file->name);
|
911
|
+
printf("file index_name:%s\n",file->index_name);
|
912
|
+
printf("file version:%d\n",file->version);
|
913
|
+
printf("file subversion:%d\n",file->subversion);
|
914
|
+
printf("error:%d\n",file->error);
|
915
|
+
/*
|
916
|
+
if (file->bin_search==TRUE) printf("file binary search is possible\n");
|
917
|
+
else printf("file binary search is not possible\n");
|
918
|
+
*/
|
919
|
+
|
920
|
+
}
|
921
|
+
|
922
|
+
// initialize the state for doing writes
|
923
|
+
// two modes:
|
924
|
+
// 1 .- new files
|
925
|
+
// 2 .- add data to files, if they don't exist they are created
|
926
|
+
int initialize_writes(struct file_data ** file, char *output_name, int mode, int discretize_qual, int flatten_qual, int create_index)
|
927
|
+
{
|
928
|
+
|
929
|
+
// check if the files exists, in case it exists check if it has the
|
930
|
+
// correct metadata and if it is of the correct version
|
931
|
+
// in other case exits with an error
|
932
|
+
// struct file_data *file = malloc(sizeof(struct write_file));
|
933
|
+
if ( *file == NULL ) {*file=malloc(sizeof(struct file_data));}
|
934
|
+
|
935
|
+
(*file)->pos_chunk_gz=0;
|
936
|
+
(*file)->discretize_qual=discretize_qual;
|
937
|
+
(*file)->flatten_qual=flatten_qual;
|
938
|
+
(*file)->create_index=create_index;
|
939
|
+
|
940
|
+
int state=check_files(output_name);
|
941
|
+
if (state==1) {
|
942
|
+
fprintf(stderr,"File is from a different version\n");
|
943
|
+
return -1;
|
944
|
+
}
|
945
|
+
if ((state!=2)&&(state!=0)) {
|
946
|
+
fprintf(stderr,"Error %d when checking files\n",state);
|
947
|
+
return -1;
|
948
|
+
}
|
949
|
+
|
950
|
+
// copy the name of the file
|
951
|
+
strncpy((*file)->name,output_name,MAXFNAME);
|
952
|
+
|
953
|
+
// open the compressed files
|
954
|
+
int error=0;
|
955
|
+
int flags=O_WRONLY|O_CREAT|O_TRUNC;
|
956
|
+
if (mode==2) flags=O_RDWR;
|
957
|
+
// printf("mode:%d\n",mode);
|
958
|
+
|
959
|
+
//set index name
|
960
|
+
snprintf((*file)->index_name,MAXFNAME,"%s.index",(*file)->name);
|
961
|
+
|
962
|
+
int file_index=-1;
|
963
|
+
|
964
|
+
if ((*file)->create_index){
|
965
|
+
//open index file
|
966
|
+
file_index=open((*file)->index_name,flags,0644);
|
967
|
+
|
968
|
+
if (file_index==-1) return -2;
|
969
|
+
}
|
970
|
+
|
971
|
+
// open bin file
|
972
|
+
int file_bin=open((*file)->name,flags,0644);
|
973
|
+
// printf("fd:%d\n",file_bin);
|
974
|
+
if (file_bin==-1) {fprintf(stderr,"error opening file_bin for writting:%s\n",strerror(errno));return -2;}
|
975
|
+
if (mode==2) {
|
976
|
+
long long pos=lseek(file_index,0,SEEK_END);
|
977
|
+
if (pos==-1) {fprintf(stderr,"error going to end of index file %s\n",strerror(errno)); return -2;}
|
978
|
+
pos=lseek(file_bin,0,SEEK_END);
|
979
|
+
if (pos==-1) {fprintf(stderr,"error going to end of bin file %s\n",strerror(errno)); return -2;}
|
980
|
+
(*file)->pos_chunk_gz=pos;
|
981
|
+
}
|
982
|
+
|
983
|
+
if ((*file)->create_index){
|
984
|
+
// open zlib index file
|
985
|
+
(*file)->gzf_index=gzdopen(file_index,"wb");
|
986
|
+
if ((*file)->gzf_index==NULL) {
|
987
|
+
fprintf(stderr,"error opening gzfile_index for writting:%s\n",gzerror((*file)->gzf_index,&error));
|
988
|
+
return -2;
|
989
|
+
}
|
990
|
+
}
|
991
|
+
|
992
|
+
// open zlib bin file
|
993
|
+
(*file)->gzf_bin=gzdopen(file_bin,"wb");
|
994
|
+
if ((*file)->gzf_bin==NULL) {
|
995
|
+
fprintf(stderr,"error opening gzfile for writting:%s\n",gzerror((*file)->gzf_bin,&error));
|
996
|
+
return -2;
|
997
|
+
}
|
998
|
+
|
999
|
+
// initializes the files, writting the metadata
|
1000
|
+
if (mode==1) {
|
1001
|
+
char header[SEQ_METADATA];
|
1002
|
+
(*file)->version=VERSION;
|
1003
|
+
(*file)->subversion=SUBVERSION;
|
1004
|
+
(*file)->error=0;
|
1005
|
+
// TODO put correct size
|
1006
|
+
snprintf(header,SEQ_METADATA-1,"9999UMACOMPRESSEDFORMAT_%d_%d %d %d %d", (*file)->version,(*file)->subversion, 0, 0, 0);
|
1007
|
+
snprintf(header,SEQ_METADATA-1,"%4ldUMACOMPRESSEDFORMAT_%d_%d %d %d %d", strlen(header)-4,(*file)->version,(*file)->subversion, 0, 0, 0);
|
1008
|
+
// snprintf(header,SEQ_METADATA-1,"%4ld%s %ld %ld %ld", strlen(metainfo)-4, (*file)->version,(*file)->subversion, 0, 0, 0);
|
1009
|
+
|
1010
|
+
// sprintf(header," 29UMACOMPRESSEDFORMAT_%d_%d 0 0 0\n",(*file)->version,(*file)->subversion);
|
1011
|
+
int res=gzwrite((*file)->gzf_bin,header,strlen(header));
|
1012
|
+
|
1013
|
+
if((*file)->create_index)
|
1014
|
+
{
|
1015
|
+
sprintf(header,"UMACOMPRESSEDFORMAT 1 0 0 999999999999 999999999999\n");
|
1016
|
+
res=gzwrite((*file)->gzf_index,header,strlen(header));
|
1017
|
+
}
|
1018
|
+
}
|
1019
|
+
(*file)->counter=0;
|
1020
|
+
|
1021
|
+
// prev_time=time(NULL);
|
1022
|
+
|
1023
|
+
// printf("Init writes done\n");
|
1024
|
+
return 0;
|
1025
|
+
}
|
1026
|
+
|
1027
|
+
|
1028
|
+
|
1029
|
+
int close_writes(struct file_data *file)
|
1030
|
+
{
|
1031
|
+
gzclose(file->gzf_bin);
|
1032
|
+
if((file)->create_index)
|
1033
|
+
{
|
1034
|
+
gzclose(file->gzf_index);
|
1035
|
+
}
|
1036
|
+
|
1037
|
+
if ( file != NULL ) { free(file);}
|
1038
|
+
}
|
1039
|
+
|
1040
|
+
/*
 Opens fname in text read mode and stores the stream in *file.
 Returns 0 on success. On failure *file is NULL, a message with errno and
 its description is written to stderr, and -1 is returned.
*/
int open_file(char *fname, FILE **file){
  FILE *stream = fopen(fname, "r");

  *file = stream;
  if (stream != NULL) {
    return 0;
  }

  fprintf(stderr,"Error opening fasta file %s, result %d %s\n",fname,errno,strerror(errno));
  return -1;
}
|
1051
|
+
|
1052
|
+
/*
 Closes a stream previously opened with open_file().
 Returns 0 on success, -1 when file is NULL or fclose() reports an error.
*/
int close_file(FILE *file){
  if (file == NULL) {
    return -1;
  }

  return (fclose(file) == 0) ? 0 : -1;
}
|
1058
|
+
|
1059
|
+
|
1060
|
+
/*
 Removes one trailing '\n' from str, in place, if there is one.
 NULL and empty strings are left alone.
 Returns 1 when a newline was removed, 0 otherwise.
*/
int chomp(char *str){
  if (str == NULL) {
    return 0;
  }

  size_t len = strlen(str);

  // an empty string has no last character to inspect
  if (len == 0 || str[len - 1] != '\n') {
    return 0;
  }

  str[len - 1] = '\0';
  return 1;
}
|
1070
|
+
|
1071
|
+
/*
 Splits a FASTA/FASTQ header line into its name and its comments.

 fname     header line; MODIFIED in place: its first character (@, >, ...)
           is dropped and separators are overwritten with NUL.
 name      receives the first space-delimited word ("" when there is none).
 comments  receives everything after the first space that follows the name,
           up to the first '\n' ("" when there is nothing).

 name and comments are filled with strcpy(), so the caller must provide
 buffers at least as large as fname. Always returns 1.

 The line is scanned with strspn()/strcspn() rather than strtok(), so a
 tokenization the caller has in progress with strtok() is not disturbed.
*/
int split_name(char *fname, char *name, char *comments){

  char *name_part = NULL;
  char *comment_part = NULL;
  char *p = fname;

  // remove first char (@, >, etc); the terminating NUL moves along with the text
  size_t n = strlen(fname);
  if (n > 0) {
    memmove(fname, fname + 1, n);
  }

  // name: first run of non-space characters, leading spaces ignored
  p += strspn(p, " ");
  if (*p != '\0') {
    name_part = p;
    p += strcspn(p, " ");

    if (*p != '\0') {
      *p++ = '\0';

      // comments: remaining text up to the end of the line
      p += strspn(p, "\n");
      if (*p != '\0') {
        comment_part = p;
        p[strcspn(p, "\n")] = '\0';
      }
    }
  }

  // assign name and comments
  strcpy(name, name_part ? name_part : "");
  strcpy(comments, comment_part ? comment_part : "");

  return 1;
}
|
1103
|
+
|
1104
|
+
/*
 Makes sure *var points to a buffer: when *var is NULL a zero-filled buffer
 of `size` bytes is allocated and stored in it; a non-NULL *var is left
 untouched (its capacity is not checked).

 Returns 0 on success, -1 when size is not positive or the allocation fails
 (in which case *var stays NULL).
*/
int check_mem(char **var,int size)
{
  if (*var != NULL) {
    return 0;
  }

  if (size <= 0) {
    return -1;
  }

  // calloc gives the same all-zero buffer the old malloc + strncpy produced
  *var = calloc((size_t)size, 1);

  return (*var != NULL) ? 0 : -1;
}
|
1108
|
+
|
1109
|
+
// read next seq from fastq file
|
1110
|
+
int get_next_seq_fastq(FILE *file, char **name, char **fasta, char **qual, char **comments){
|
1111
|
+
|
1112
|
+
check_mem(name,MAXSEQNAME);
|
1113
|
+
check_mem(fasta,MAXSEQLENGTH);
|
1114
|
+
check_mem(qual,MAXSEQLENGTH);
|
1115
|
+
check_mem(comments,MAXSEQLENGTH);
|
1116
|
+
|
1117
|
+
|
1118
|
+
|
1119
|
+
|
1120
|
+
char fname[MAXSEQNAME];// sequence name
|
1121
|
+
char qname[MAXSEQNAME];// sequence name
|
1122
|
+
|
1123
|
+
strcpy(*name,"");
|
1124
|
+
strcpy(*fasta,"");
|
1125
|
+
strcpy(*qual,"");
|
1126
|
+
strcpy(*comments,"");
|
1127
|
+
|
1128
|
+
errno = 0;
|
1129
|
+
|
1130
|
+
// read sequence name line ---------------------------------
|
1131
|
+
if (fgets( fname, MAXSEQNAME, file )==NULL) {return 0; };
|
1132
|
+
|
1133
|
+
if (errno) {fprintf(stderr,"Error reading fastq, result %d %s\n",errno,strerror(errno));return INVALID_FASTQ_FORMAT;}
|
1134
|
+
chomp(fname);
|
1135
|
+
|
1136
|
+
// check for @ at beginning
|
1137
|
+
if (fname[0]!='@'){
|
1138
|
+
fprintf(stderr,"ERROR: Invalid FASTQ format %s. Missing @ in name line.\n", fname);
|
1139
|
+
return INVALID_FASTQ_FORMAT;
|
1140
|
+
}
|
1141
|
+
|
1142
|
+
// split name by space in name and comments
|
1143
|
+
split_name(fname,*name,*comments);
|
1144
|
+
|
1145
|
+
// read fasta line ---------------------------------
|
1146
|
+
if (fgets( *fasta, MAXSEQLENGTH, file )==NULL) { return 0; };
|
1147
|
+
|
1148
|
+
if (errno) {fprintf(stderr,"Error reading fastq, result %d %s\n",errno,strerror(errno));return INVALID_FASTQ_FORMAT;}
|
1149
|
+
chomp(*fasta);
|
1150
|
+
|
1151
|
+
|
1152
|
+
// read qual name line ---------------------------------
|
1153
|
+
if (fgets( qname, MAXSEQLENGTH, file )==NULL) { return 0; };
|
1154
|
+
|
1155
|
+
if (errno) {fprintf(stderr,"Error reading fastq, result %d %s\n",errno,strerror(errno));return INVALID_FASTQ_FORMAT;}
|
1156
|
+
chomp(qname);
|
1157
|
+
|
1158
|
+
// check for + sign at beginning of qual name line
|
1159
|
+
if (qname[0]!='+'){
|
1160
|
+
fprintf(stderr,"ERROR: Invalid FASTQ format. Missing + in qual line. %s.\n", *name);
|
1161
|
+
return INVALID_FASTQ_FORMAT;
|
1162
|
+
}
|
1163
|
+
|
1164
|
+
// read qual line ---------------------------------
|
1165
|
+
if (fgets( *qual, MAXSEQLENGTH, file )==NULL) { return 0; };
|
1166
|
+
if (errno) {fprintf(stderr,"Error reading fastq, result %d %s\n",errno,strerror(errno));return INVALID_FASTQ_FORMAT;}
|
1167
|
+
|
1168
|
+
chomp(*qual);
|
1169
|
+
|
1170
|
+
return 1;
|
1171
|
+
}
|
1172
|
+
|
1173
|
+
// read next seq from fasta file
|
1174
|
+
int get_next_seq_fasta(FILE *file, char *name, char *fasta, char *comments){
|
1175
|
+
|
1176
|
+
char fname[MAXSEQNAME];// sequence name
|
1177
|
+
char *line;
|
1178
|
+
if ((line = malloc(MAXSEQLENGTH)) == NULL) {
|
1179
|
+
puts("Memory allocation error!");
|
1180
|
+
return EXIT_FAILURE;
|
1181
|
+
}
|
1182
|
+
|
1183
|
+
// init vars
|
1184
|
+
strcpy(name,"");
|
1185
|
+
strcpy(fasta,"");
|
1186
|
+
strcpy(comments,"");
|
1187
|
+
|
1188
|
+
// read sequence name line ---------------------------------
|
1189
|
+
if (fgets( fname, MAXSEQNAME, file )==NULL) { return 0; };
|
1190
|
+
if (errno) {fprintf(stderr,"Error reading fasta, result %d %s\n",errno,strerror(errno));return INVALID_FASTA_FORMAT;}
|
1191
|
+
chomp(fname);
|
1192
|
+
|
1193
|
+
// check for @ at beginning
|
1194
|
+
if (fname[0]!='>'){
|
1195
|
+
fprintf(stderr,"ERROR: Invalid FASTA format %s. Missing @ in name line.\n", fname);
|
1196
|
+
return INVALID_FASTA_FORMAT;
|
1197
|
+
}
|
1198
|
+
|
1199
|
+
// split name by space in name and comments
|
1200
|
+
split_name(fname,name,comments);
|
1201
|
+
|
1202
|
+
// get current pos in file
|
1203
|
+
//fpos_t pos;
|
1204
|
+
|
1205
|
+
// read fasta line ---------------------------------
|
1206
|
+
char c=0;
|
1207
|
+
int len=0;
|
1208
|
+
int num_lines=0;
|
1209
|
+
while (1) {
|
1210
|
+
// fgetpos( file, &pos );
|
1211
|
+
|
1212
|
+
// inspect first char
|
1213
|
+
c=fgetc(file);
|
1214
|
+
ungetc(c,file);
|
1215
|
+
|
1216
|
+
if (c!='>'){
|
1217
|
+
// get following line
|
1218
|
+
if (fgets( line, MAXSEQLENGTH, file )==NULL) { break; };
|
1219
|
+
if (errno) {fprintf(stderr,"Error reading fasta, result %d %s\n",errno,strerror(errno));return INVALID_FASTA_FORMAT;}
|
1220
|
+
chomp(line);
|
1221
|
+
|
1222
|
+
// append to fasta
|
1223
|
+
// strcat(fasta,line);
|
1224
|
+
|
1225
|
+
len = len + sprintf(fasta+len,"%s",line);
|
1226
|
+
|
1227
|
+
if (len>=MAXSEQLENGTH)
|
1228
|
+
{
|
1229
|
+
fprintf(stderr,"Error, maximun sequence size error (%d). You can recompile lib with a bigger MAXSEQLENGTH\n",MAXSEQLENGTH);return MAX_SEQ_SIZE_ERROR;
|
1230
|
+
}
|
1231
|
+
// fasta[strlen(fasta)]=0;
|
1232
|
+
// num_lines++;
|
1233
|
+
// printf("%d\n",num_lines);
|
1234
|
+
|
1235
|
+
// if((num_lines%1000)==0)
|
1236
|
+
// {
|
1237
|
+
// printf("%ld\n,",strlen(fasta));
|
1238
|
+
// }
|
1239
|
+
}else{ // name line
|
1240
|
+
// rewind file and exit
|
1241
|
+
// fsetpos(file, &pos);
|
1242
|
+
break;
|
1243
|
+
}
|
1244
|
+
}
|
1245
|
+
|
1246
|
+
// printf("%ld\n,",strlen(fasta));
|
1247
|
+
|
1248
|
+
free(line);
|
1249
|
+
|
1250
|
+
return 1;
|
1251
|
+
}
|
1252
|
+
|
1253
|
+
// process a fastq file adding it to fbin file
|
1254
|
+
int process_fastq(char *fname, char *efname, char *outname, int discretize_qual, int flatten_qual, int create_index)
|
1255
|
+
{
|
1256
|
+
|
1257
|
+
// allocate strings
|
1258
|
+
char *name;
|
1259
|
+
if ((name = malloc(MAXSEQNAME)) == NULL) {
|
1260
|
+
puts("Memory allocation error!");
|
1261
|
+
return EXIT_FAILURE;
|
1262
|
+
}
|
1263
|
+
|
1264
|
+
char *fasta;
|
1265
|
+
if ((fasta = malloc(MAXSEQLENGTH)) == NULL) {
|
1266
|
+
puts("Memory allocation error!");
|
1267
|
+
return EXIT_FAILURE;
|
1268
|
+
}
|
1269
|
+
|
1270
|
+
char *qual;
|
1271
|
+
if ((qual = malloc(MAXSEQLENGTH)) == NULL) {
|
1272
|
+
puts("Memory allocation error!");
|
1273
|
+
return EXIT_FAILURE;
|
1274
|
+
}
|
1275
|
+
|
1276
|
+
char *comments;
|
1277
|
+
if ((comments = malloc(MAXSEQLENGTH)) == NULL) {
|
1278
|
+
puts("Memory allocation error!");
|
1279
|
+
return EXIT_FAILURE;
|
1280
|
+
}
|
1281
|
+
|
1282
|
+
|
1283
|
+
char *extras_name;
|
1284
|
+
if ((extras_name = malloc(MAXSEQLENGTH)) == NULL) {
|
1285
|
+
puts("Memory allocation error!");
|
1286
|
+
return EXIT_FAILURE;
|
1287
|
+
}
|
1288
|
+
|
1289
|
+
|
1290
|
+
char *extras;
|
1291
|
+
if ((extras = malloc(MAXSEQLENGTH)) == NULL) {
|
1292
|
+
puts("Memory allocation error!");
|
1293
|
+
return EXIT_FAILURE;
|
1294
|
+
}
|
1295
|
+
|
1296
|
+
char *final_extras;
|
1297
|
+
if ((final_extras = malloc(MAXSEQLENGTH)) == NULL) {
|
1298
|
+
puts("Memory allocation error!");
|
1299
|
+
return EXIT_FAILURE;
|
1300
|
+
}
|
1301
|
+
|
1302
|
+
|
1303
|
+
// char name[MAXSEQLENGTH];
|
1304
|
+
// char fasta[MAXSEQLENGTH];
|
1305
|
+
// char qual[MAXSEQLENGTH];
|
1306
|
+
// char extras[MAXSEQLENGTH];
|
1307
|
+
static time_t curr_time=0;
|
1308
|
+
static time_t prev_time=0;
|
1309
|
+
|
1310
|
+
prev_time=time(NULL);
|
1311
|
+
|
1312
|
+
|
1313
|
+
FILE *fastq_file=NULL;
|
1314
|
+
FILE *extras_file=NULL;
|
1315
|
+
|
1316
|
+
int valid=0;
|
1317
|
+
int res=0;
|
1318
|
+
int r=0;
|
1319
|
+
|
1320
|
+
// Open fasta and qual files
|
1321
|
+
if (strcmp(fname,"-")==0){
|
1322
|
+
fastq_file=stdin;
|
1323
|
+
}else{
|
1324
|
+
open_file(fname,&fastq_file);
|
1325
|
+
}
|
1326
|
+
|
1327
|
+
if(efname!=NULL)
|
1328
|
+
{
|
1329
|
+
|
1330
|
+
open_file(efname,&extras_file);
|
1331
|
+
}
|
1332
|
+
|
1333
|
+
// open output file
|
1334
|
+
struct file_data *file=NULL;
|
1335
|
+
int error2=initialize_writes(&file, outname,1, discretize_qual, flatten_qual,create_index);
|
1336
|
+
|
1337
|
+
// printf("Init writes\n");
|
1338
|
+
// read first extra entry
|
1339
|
+
if(extras_file!=NULL)
|
1340
|
+
{
|
1341
|
+
get_next_seq_fasta(extras_file,extras_name,extras,comments);
|
1342
|
+
}
|
1343
|
+
|
1344
|
+
|
1345
|
+
// for each sequence on fastq file
|
1346
|
+
while (valid=get_next_seq_fastq(fastq_file,&name,&fasta,&qual,&comments)){
|
1347
|
+
if(valid==1)
|
1348
|
+
{
|
1349
|
+
r++;
|
1350
|
+
|
1351
|
+
// printf("======================\nNAME:%s\nSEQ :%s\nQUAL:%s\n", name,fasta,qual);
|
1352
|
+
// if(strlen(comments)>0)
|
1353
|
+
// {
|
1354
|
+
// printf("COM :%s\n", comments);
|
1355
|
+
// }
|
1356
|
+
|
1357
|
+
strcpy(final_extras,comments);
|
1358
|
+
|
1359
|
+
|
1360
|
+
// check if there are extras available
|
1361
|
+
if (strcmp(name,extras_name)==0){
|
1362
|
+
strcat(final_extras,extras);
|
1363
|
+
|
1364
|
+
// read next extras
|
1365
|
+
if(extras_file!=NULL)
|
1366
|
+
{
|
1367
|
+
get_next_seq_fasta(extras_file,extras_name,extras,comments);
|
1368
|
+
}
|
1369
|
+
}
|
1370
|
+
|
1371
|
+
int error_wr=write_seq(file,name,fasta,qual,final_extras);
|
1372
|
+
if (error_wr!=0) {res=error_wr; break;};
|
1373
|
+
// if (error_wr==0) cnt++;
|
1374
|
+
|
1375
|
+
}else{
|
1376
|
+
fprintf(stderr,"Invalid sequence found. Aborting import.");
|
1377
|
+
res=-1;
|
1378
|
+
break;
|
1379
|
+
}
|
1380
|
+
|
1381
|
+
if ((r%10000)==0) {
|
1382
|
+
printf(".");
|
1383
|
+
fflush(stdout);
|
1384
|
+
// curr_time=time(NULL);
|
1385
|
+
// printf("10k seqs in:%8.0f secs\n",difftime(curr_time,prev_time));
|
1386
|
+
// prev_time=curr_time;
|
1387
|
+
}
|
1388
|
+
|
1389
|
+
}
|
1390
|
+
|
1391
|
+
curr_time=time(NULL);
|
1392
|
+
printf("\nEnd fastq processing. %d seqs in %.0f s. Rate: %8.2f seqs/s\n",r,difftime(curr_time,prev_time),r/difftime(curr_time,prev_time));
|
1393
|
+
|
1394
|
+
// free mem
|
1395
|
+
free(name);
|
1396
|
+
free(fasta);
|
1397
|
+
free(qual);
|
1398
|
+
free(comments);
|
1399
|
+
free(extras_name);
|
1400
|
+
free(extras);
|
1401
|
+
free(final_extras);
|
1402
|
+
|
1403
|
+
// close files
|
1404
|
+
fclose(fastq_file);
|
1405
|
+
if(extras_file!=NULL)
|
1406
|
+
{
|
1407
|
+
fclose(extras_file);
|
1408
|
+
}
|
1409
|
+
close_writes(file);
|
1410
|
+
|
1411
|
+
return res;
|
1412
|
+
}
|
1413
|
+
|
1414
|
+
|
1415
|
+
// process a fastq file adding it to fbin file
|
1416
|
+
int process_fasta(char *fname, char *efname, char *outname, int discretize_qual, int flatten_qual, int create_index)
|
1417
|
+
{
|
1418
|
+
|
1419
|
+
// allocate strings
|
1420
|
+
char *name;
|
1421
|
+
if ((name = malloc(MAXSEQNAME)) == NULL) {
|
1422
|
+
puts("Memory allocation error!");
|
1423
|
+
return EXIT_FAILURE;
|
1424
|
+
}
|
1425
|
+
|
1426
|
+
char *fasta;
|
1427
|
+
if ((fasta = malloc(MAXSEQLENGTH)) == NULL) {
|
1428
|
+
puts("Memory allocation error!");
|
1429
|
+
return EXIT_FAILURE;
|
1430
|
+
}
|
1431
|
+
|
1432
|
+
char *qual;
|
1433
|
+
if ((qual = malloc(MAXSEQLENGTH)) == NULL) {
|
1434
|
+
puts("Memory allocation error!");
|
1435
|
+
return EXIT_FAILURE;
|
1436
|
+
}
|
1437
|
+
|
1438
|
+
char *comments;
|
1439
|
+
if ((comments = malloc(MAXSEQLENGTH)) == NULL) {
|
1440
|
+
puts("Memory allocation error!");
|
1441
|
+
return EXIT_FAILURE;
|
1442
|
+
}
|
1443
|
+
|
1444
|
+
|
1445
|
+
char *extras_name;
|
1446
|
+
if ((extras_name = malloc(MAXSEQLENGTH)) == NULL) {
|
1447
|
+
puts("Memory allocation error!");
|
1448
|
+
return EXIT_FAILURE;
|
1449
|
+
}
|
1450
|
+
|
1451
|
+
|
1452
|
+
char *extras;
|
1453
|
+
if ((extras = malloc(MAXSEQLENGTH)) == NULL) {
|
1454
|
+
puts("Memory allocation error!");
|
1455
|
+
return EXIT_FAILURE;
|
1456
|
+
}
|
1457
|
+
|
1458
|
+
char *final_extras;
|
1459
|
+
if ((final_extras = malloc(MAXSEQLENGTH)) == NULL) {
|
1460
|
+
puts("Memory allocation error!");
|
1461
|
+
return EXIT_FAILURE;
|
1462
|
+
}
|
1463
|
+
|
1464
|
+
|
1465
|
+
static time_t curr_time=0;
|
1466
|
+
static time_t prev_time=0;
|
1467
|
+
|
1468
|
+
prev_time=time(NULL);
|
1469
|
+
|
1470
|
+
|
1471
|
+
FILE *fastq_file=NULL;
|
1472
|
+
FILE *extras_file=NULL;
|
1473
|
+
|
1474
|
+
int valid=0;
|
1475
|
+
int res=0;
|
1476
|
+
int r=0;
|
1477
|
+
|
1478
|
+
// Open fasta and qual files
|
1479
|
+
if (strcmp(fname,"-")==0){
|
1480
|
+
fastq_file=stdin;
|
1481
|
+
}else{
|
1482
|
+
open_file(fname,&fastq_file);
|
1483
|
+
}
|
1484
|
+
|
1485
|
+
if(efname!=NULL)
|
1486
|
+
{
|
1487
|
+
|
1488
|
+
open_file(efname,&extras_file);
|
1489
|
+
}
|
1490
|
+
|
1491
|
+
// open output file
|
1492
|
+
struct file_data *file=NULL;
|
1493
|
+
int error2=initialize_writes(&file, outname,1, discretize_qual, flatten_qual,create_index);
|
1494
|
+
|
1495
|
+
// printf("Init writes\n");
|
1496
|
+
// read first extra entry
|
1497
|
+
if(extras_file!=NULL)
|
1498
|
+
{
|
1499
|
+
get_next_seq_fasta(extras_file,extras_name,extras,comments);
|
1500
|
+
}
|
1501
|
+
|
1502
|
+
strcpy(qual,"");
|
1503
|
+
qual[0]=0;
|
1504
|
+
|
1505
|
+
// for each sequence on fastq file
|
1506
|
+
while (valid=get_next_seq_fasta(fastq_file,name,fasta,comments)){
|
1507
|
+
if(valid==1)
|
1508
|
+
{
|
1509
|
+
r++;
|
1510
|
+
|
1511
|
+
// printf("======================\nNAME:%s\nSEQ :%s\nQUAL:%s\n", name,fasta,qual);
|
1512
|
+
// if(strlen(comments)>0)
|
1513
|
+
// {
|
1514
|
+
// printf("COM :%s\n", comments);
|
1515
|
+
// }
|
1516
|
+
|
1517
|
+
strcpy(final_extras,comments);
|
1518
|
+
|
1519
|
+
|
1520
|
+
// check if there are extras available
|
1521
|
+
if (strcmp(name,extras_name)==0){
|
1522
|
+
strcat(final_extras,extras);
|
1523
|
+
|
1524
|
+
// read next extras
|
1525
|
+
if(extras_file!=NULL)
|
1526
|
+
{
|
1527
|
+
get_next_seq_fasta(extras_file,extras_name,extras,comments);
|
1528
|
+
}
|
1529
|
+
}
|
1530
|
+
|
1531
|
+
int error_wr=write_seq(file,name,fasta,qual,final_extras);
|
1532
|
+
if (error_wr!=0) {res=error_wr; break;};
|
1533
|
+
// if (error_wr==0) cnt++;
|
1534
|
+
|
1535
|
+
}else{
|
1536
|
+
fprintf(stderr,"Invalid sequence found. Aborting import.");
|
1537
|
+
res=-1;
|
1538
|
+
break;
|
1539
|
+
}
|
1540
|
+
|
1541
|
+
if ((r%10000)==0) {
|
1542
|
+
printf(".");
|
1543
|
+
fflush(stdout);
|
1544
|
+
// curr_time=time(NULL);
|
1545
|
+
// printf("10k seqs in:%8.0f secs\n",difftime(curr_time,prev_time));
|
1546
|
+
// prev_time=curr_time;
|
1547
|
+
}
|
1548
|
+
|
1549
|
+
}
|
1550
|
+
|
1551
|
+
curr_time=time(NULL);
|
1552
|
+
printf("\nEnd fastq processing. %d seqs in %.0f s. Rate: %8.2f seqs/s\n",r,difftime(curr_time,prev_time),r/difftime(curr_time,prev_time));
|
1553
|
+
|
1554
|
+
// free mem
|
1555
|
+
free(name);
|
1556
|
+
free(fasta);
|
1557
|
+
free(qual);
|
1558
|
+
free(comments);
|
1559
|
+
free(extras_name);
|
1560
|
+
free(extras);
|
1561
|
+
free(final_extras);
|
1562
|
+
|
1563
|
+
// close files
|
1564
|
+
fclose(fastq_file);
|
1565
|
+
if(extras_file!=NULL)
|
1566
|
+
{
|
1567
|
+
fclose(extras_file);
|
1568
|
+
}
|
1569
|
+
close_writes(file);
|
1570
|
+
|
1571
|
+
return res;
|
1572
|
+
}
|
1573
|
+
|
1574
|
+
// int process_biofile(char *fname, char *qfname, char *efname, char *outname)
|
1575
|
+
// {
|
1576
|
+
//
|
1577
|
+
// char sname[MAXSEQNAME];// sequence name
|
1578
|
+
// char qname[MAXSEQNAME];// sequence name
|
1579
|
+
// char ename[MAXSEQNAME];// sequence name
|
1580
|
+
// char next_sname[MAXSEQNAME];// sequence name
|
1581
|
+
// char next_qname[MAXSEQNAME];// sequence name
|
1582
|
+
// char next_ename[MAXSEQNAME];// sequence name
|
1583
|
+
//
|
1584
|
+
// char fasta[150000];
|
1585
|
+
// char qual[150000];
|
1586
|
+
// char extras[150000];
|
1587
|
+
// char extras_used[150000];
|
1588
|
+
// char next_fcomment[150000];
|
1589
|
+
// char next_qcomment[150000];
|
1590
|
+
// char next_ecomment[150000];
|
1591
|
+
// char tmp[150000];
|
1592
|
+
// int extras_bool=TRUE;
|
1593
|
+
//
|
1594
|
+
// int cnt=1;
|
1595
|
+
//
|
1596
|
+
// sprintf(extras_used,"INITIALIZED");
|
1597
|
+
//
|
1598
|
+
// // Open fasta and qual files
|
1599
|
+
// FILE *file_fasta=fopen(fname,"r");
|
1600
|
+
//
|
1601
|
+
// if (file_fasta==NULL) { fprintf(stderr,"error opening fasta file %s, result %d %s\n",fname,errno,strerror(errno));return -2;};
|
1602
|
+
// // setvbuf(file_fasta,NULL,_IONBF,0);
|
1603
|
+
// FILE *file_qual=fopen(qfname,"r");
|
1604
|
+
// if (file_qual==NULL) { fprintf(stderr,"error opening qual file %s, result %d %s\n",qfname,errno,strerror(errno));return -2;};
|
1605
|
+
// FILE *file_extras=fopen(efname,"r");
|
1606
|
+
// if (file_extras==NULL) {fprintf(stderr,"error opening extras file %s, result %d %s\n",efname,errno,strerror(errno)); extras_bool=FALSE;sprintf(extras,"");};
|
1607
|
+
//
|
1608
|
+
// // setvbuf(file_qual,NULL,_IONBF,0);
|
1609
|
+
// int error=0;
|
1610
|
+
// int end=0; //0 is false
|
1611
|
+
// char *res;
|
1612
|
+
//
|
1613
|
+
// // reads the name of the sequence from both
|
1614
|
+
//
|
1615
|
+
// // fscanf(file_qual,">%9000s",qname);
|
1616
|
+
// // fscanf(file_fasta,">%9000s",sname);
|
1617
|
+
//
|
1618
|
+
//
|
1619
|
+
// res=fgets(tmp,150000,file_fasta);
|
1620
|
+
// if (res!=NULL) {
|
1621
|
+
// sscanf(tmp,">%9000s",sname);
|
1622
|
+
// strncpy(next_fcomment,tmp+strlen(sname)+2,150000);
|
1623
|
+
// }
|
1624
|
+
//
|
1625
|
+
// res=fgets(tmp,150000,file_qual);
|
1626
|
+
// if (res!=NULL) {
|
1627
|
+
// sscanf(tmp,">%9000s",qname);
|
1628
|
+
// strncpy(next_qcomment,tmp+strlen(qname)+2,150000);
|
1629
|
+
// }
|
1630
|
+
//
|
1631
|
+
// if ( extras_bool ) {
|
1632
|
+
// res=fgets(tmp,150000,file_extras);
|
1633
|
+
// if (res!=NULL) {
|
1634
|
+
// sscanf(tmp,">%9000s",ename);
|
1635
|
+
// strncpy(next_ecomment,tmp+strlen(ename)+2,150000);
|
1636
|
+
// } else sprintf(ename,"");
|
1637
|
+
// }
|
1638
|
+
// printf("extras seq:%s\n",ename);
|
1639
|
+
//
|
1640
|
+
// printf("file:%s q:%s seqname:%s qseqname%s efname:%s extras:%s\n",fname, qfname,sname,qname,efname,extras);
|
1641
|
+
// printf("next_fcomment:%s next_qcomment:%s\n",next_fcomment,next_qcomment);
|
1642
|
+
//
|
1643
|
+
// struct file_data *file=NULL;
|
1644
|
+
// int error2=initialize_writes(&file, outname,1);
|
1645
|
+
//
|
1646
|
+
// // sprintf(next_fcomment,"");
|
1647
|
+
// // sprintf(next_qcomment,"");
|
1648
|
+
//
|
1649
|
+
// while (!end) {
|
1650
|
+
// if ( strcmp(sname,qname)!=0 ) {error = -9; goto end;}
|
1651
|
+
// /*
|
1652
|
+
// if (extras_bool)
|
1653
|
+
// if ( strcmp(sname,ename)!=0 ) {error = -9; goto end;}
|
1654
|
+
// */
|
1655
|
+
// // load the qual and fasta
|
1656
|
+
//
|
1657
|
+
// sprintf(fasta,"");
|
1658
|
+
// sprintf(fasta,"%s",next_fcomment);
|
1659
|
+
// sprintf(next_fcomment,"");
|
1660
|
+
// sprintf(tmp,"");
|
1661
|
+
// res=fasta;
|
1662
|
+
// while (( res!=NULL ) && (tmp[0]!='>' )) {
|
1663
|
+
// res=fgets(tmp,150000,file_fasta);
|
1664
|
+
// if ((tmp[0]!='>')&&(res!=NULL)) sprintf (fasta,"%s%s",fasta,tmp);
|
1665
|
+
// else if (res!=NULL) {sscanf(tmp,">%9000s",next_sname); strncpy(next_fcomment,tmp+strlen(next_sname)+2,sizeof(next_fcomment));}
|
1666
|
+
// }
|
1667
|
+
// if (res==NULL) end=1;
|
1668
|
+
//
|
1669
|
+
// sprintf(qual,"");
|
1670
|
+
// sprintf(qual,"%s",next_qcomment);
|
1671
|
+
// sprintf(next_qcomment,"");
|
1672
|
+
// res=qual;
|
1673
|
+
// sprintf(tmp,"");
|
1674
|
+
// while (( res!=NULL ) && (tmp[0]!='>' )) {
|
1675
|
+
// res=fgets(tmp,150000,file_qual);
|
1676
|
+
// if ((tmp[0]!='>')&&(res!=NULL)) sprintf (qual,"%s%s",qual,tmp);
|
1677
|
+
// else if (res!=NULL) {sscanf(tmp,">%9000s",next_qname); strncpy(next_qcomment,tmp+strlen(next_qname)+2,sizeof(next_qcomment));}
|
1678
|
+
// }
|
1679
|
+
// if (res==NULL) end=1;
|
1680
|
+
//
|
1681
|
+
// // If extra_used!=NULL then it means that it has been used and a new one must be read
|
1682
|
+
// if (extras_bool && (strcmp(extras_used,"")!=0)) {
|
1683
|
+
// sprintf(extras,"");
|
1684
|
+
// sprintf(extras,"%s",next_ecomment);
|
1685
|
+
// sprintf(next_ecomment,"");
|
1686
|
+
// res=extras;
|
1687
|
+
// sprintf(tmp,"");
|
1688
|
+
// while (( res!=NULL ) && (tmp[0]!='>' )) {
|
1689
|
+
// res=fgets(tmp,150000,file_extras);
|
1690
|
+
// if ((tmp[0]!='>')&&(res!=NULL)) sprintf (extras,"%s%s",extras,tmp);
|
1691
|
+
// else if (res!=NULL) {sscanf(tmp,">%9000s",next_ename); strncpy(next_ecomment,tmp+strlen(next_ename)+2,sizeof(next_ecomment));}
|
1692
|
+
// }
|
1693
|
+
// //if (res==NULL) end=1; Extras file can be finished and processing will continue
|
1694
|
+
// }
|
1695
|
+
//
|
1696
|
+
// /* If the name of the name is equal to the name of the actual sequence then it will be used for writting */
|
1697
|
+
// if ( strcmp(sname,ename)==0 ) {
|
1698
|
+
// strcpy(extras_used,extras);
|
1699
|
+
// strcpy(ename,next_ename);
|
1700
|
+
// } else sprintf(extras_used,"");
|
1701
|
+
//
|
1702
|
+
//
|
1703
|
+
// int error_wr=write_seq(file,sname, fasta,qual,extras_used);
|
1704
|
+
// if (error_wr!=0) { end=1;error=error_wr; };
|
1705
|
+
// if (error_wr==0) cnt++;
|
1706
|
+
// strcpy(sname,next_sname);
|
1707
|
+
// strcpy(qname,next_qname);
|
1708
|
+
//
|
1709
|
+
// }
|
1710
|
+
//
|
1711
|
+
// // repeat until EOF or error
|
1712
|
+
// end:
|
1713
|
+
// fclose(file_fasta);
|
1714
|
+
// fclose(file_qual);
|
1715
|
+
//
|
1716
|
+
// close_writes(file);
|
1717
|
+
// //fclose(file_index);
|
1718
|
+
// // print_seqs(seql);
|
1719
|
+
// return error;
|
1720
|
+
// }
|
1721
|
+
//
|
1722
|
+
|
1723
|
+
|
1724
|
+
|
1725
|
+
/*
 Loads up to `size` bytes of "fasta.dic" into d_fasta and of "qual.dic" into
 d_qual; both files are looked up in the current working directory.
 A file shorter than `size` is not an error: the remaining bytes of the
 destination buffer are left as they were.

 Returns 0 on success, -2 when size is negative, a file cannot be opened or
 a read error occurs.
*/
int init_dicts(char *d_fasta,char *d_qual,int size)
{
  const char *dict_f = "fasta.dic";
  const char *dict_q = "qual.dic";
  int read_failed;

  if (size < 0) {
    return -2;
  }

  FILE *f_d_fasta = fopen(dict_f,"r");
  if (f_d_fasta == NULL) {
    fprintf(stderr,"error opening fasta file %s, result %d %s\n",dict_f,errno,strerror(errno));
    return -2;
  }
  fread(d_fasta, 1, (size_t)size, f_d_fasta);
  read_failed = ferror(f_d_fasta);
  fclose(f_d_fasta);
  if (read_failed) {
    fprintf(stderr,"error reading fasta file %s\n",dict_f);
    return -2;
  }

  FILE *f_d_qual = fopen(dict_q,"r");
  if (f_d_qual == NULL) {
    fprintf(stderr,"error opening qual file %s, result %d %s\n",dict_q,errno,strerror(errno));
    return -2;
  }
  // read from the qual stream (the fasta stream is already closed)
  fread(d_qual, 1, (size_t)size, f_d_qual);
  read_failed = ferror(f_d_qual);
  fclose(f_d_qual);
  if (read_failed) {
    fprintf(stderr,"error reading qual file %s\n",dict_q);
    return -2;
  }

  return 0;
}
|
1739
|
+
|
1740
|
+
|
1741
|
+
|
1742
|
+
/*
 Frees the string *string points to and resets the pointer to NULL, so that
 a second call (or a later NULL check) is harmless.
 A NULL `string` is accepted and ignored. Always returns 0.
*/
int free_string(char **string){
  if (string != NULL) {
    free(*string);
    *string = NULL;
  }

  return 0;
}
|
1748
|
+
|