scbi_fqbin 0.2.2 → 0.2.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (116) hide show
  1. checksums.yaml +7 -0
  2. data/.DS_Store +0 -0
  3. data/.gitignore +14 -0
  4. data/Gemfile +4 -0
  5. data/LICENSE.txt +22 -0
  6. data/{README.rdoc → README.md} +0 -0
  7. data/Rakefile +8 -28
  8. data/lib/scbi_fqbin.rb +3 -5
  9. data/lib/scbi_fqbin/fastabin.rb +411 -0
  10. data/lib/scbi_fqbin/fastq_file_c.rb +373 -0
  11. data/lib/scbi_fqbin/fbin_file.rb +1 -1
  12. data/lib/scbi_fqbin/t.rb +9 -0
  13. data/lib/scbi_fqbin/t2.rb +12 -0
  14. data/lib/scbi_fqbin/version.rb +3 -0
  15. data/lib_fqbin_src.zip +0 -0
  16. data/lib_fqbin_src/Makefile +66 -0
  17. data/lib_fqbin_src/fq +0 -0
  18. data/lib_fqbin_src/fq.c +165 -0
  19. data/lib_fqbin_src/hash_fqbin +0 -0
  20. data/lib_fqbin_src/hash_fqbin.c +212 -0
  21. data/lib_fqbin_src/idx_fqbin +21 -0
  22. data/lib_fqbin_src/iterate_fqbin +0 -0
  23. data/lib_fqbin_src/iterate_fqbin.c +136 -0
  24. data/lib_fqbin_src/lib_fqbin.c +1748 -0
  25. data/lib_fqbin_src/lib_fqbin.h +194 -0
  26. data/lib_fqbin_src/mk_fqbin +0 -0
  27. data/lib_fqbin_src/mk_fqbin.c +138 -0
  28. data/lib_fqbin_src/other/bwxform.c +915 -0
  29. data/lib_fqbin_src/other/bwxform.h +74 -0
  30. data/lib_fqbin_src/other/find_in_index.c +130 -0
  31. data/lib_fqbin_src/other/hash_fbin_nogzchunks.c +164 -0
  32. data/lib_fqbin_src/other/idx_fqbin +0 -0
  33. data/lib_fqbin_src/other/idx_fqbin.c +67 -0
  34. data/lib_fqbin_src/other/make_hsh.sh +14 -0
  35. data/lib_fqbin_src/other/rd_extras_fbin.c +45 -0
  36. data/lib_fqbin_src/read_fq +0 -0
  37. data/lib_fqbin_src/read_fq.c +143 -0
  38. data/lib_fqbin_src/read_fqbin +0 -0
  39. data/lib_fqbin_src/read_fqbin.c +101 -0
  40. data/lib_fqbin_src/sort_index +9 -0
  41. data/lib_fqbin_src/test.rb +13 -0
  42. data/scbi_fqbin.gemspec +25 -0
  43. data/test/build.rake +15 -0
  44. data/test/fbinfile +0 -0
  45. data/test/fbinfile.index +0 -0
  46. data/test/no_test_fill_file.rb +66 -0
  47. data/test/old/app.rb +43 -0
  48. data/test/old/bin/iterate_fastabin.rb +54 -0
  49. data/test/old/bin/mk_fastabin.rb +22 -0
  50. data/test/old/bin/rd_fastabin.rb +36 -0
  51. data/test/old/bin/rd_fq.rb +20 -0
  52. data/test/old/bioruby.rb +27 -0
  53. data/test/old/c/Makefile +34 -0
  54. data/test/old/c/fbin_lib.zip +0 -0
  55. data/test/old/c/iterate_fbin.c +54 -0
  56. data/test/old/c/libreria_gz.c +707 -0
  57. data/test/old/c/libreria_gz.h +127 -0
  58. data/test/old/c/main.c +86 -0
  59. data/test/old/c/mk_fbin.c +24 -0
  60. data/test/old/c/rd_seq_fbin.c +44 -0
  61. data/test/old/c/test_ffi/a.out +0 -0
  62. data/test/old/c/test_ffi/app.c +26 -0
  63. data/test/old/c/test_ffi/app.rb +19 -0
  64. data/test/old/c/test_ffi/liblibreria_gz.dylib +0 -0
  65. data/test/old/c/test_ffi/libmylibrary.dylib +0 -0
  66. data/test/old/c/test_ffi/my_library.rb +23 -0
  67. data/test/old/c/test_ffi/mylibrary.c +22 -0
  68. data/test/old/c/test_ffi/mylibrary.h +6 -0
  69. data/test/old/c/usage_instructions.txt +62 -0
  70. data/test/old/ext/Makefile +187 -0
  71. data/test/old/ext/Makefile.dario +34 -0
  72. data/test/old/ext/extconf.rb +8 -0
  73. data/test/old/ext/mk_fbin.c +24 -0
  74. data/test/old/ext/sample/extras.txt +4 -0
  75. data/{.gemtest → test/old/ext/sample/extras2.txt} +0 -0
  76. data/test/old/ext/sample/f1.fasta +10 -0
  77. data/test/old/ext/sample/f1.fasta.qual +10 -0
  78. data/test/old/ext/sample/f1.fbin +0 -0
  79. data/test/old/ext/sample/f1.fbin.index +0 -0
  80. data/test/old/ext/sample/main.c +86 -0
  81. data/test/old/ext/usage_instructions.txt +62 -0
  82. data/test/old/t_scbi_fastabin.rb +140 -0
  83. data/test/read_tests/10-original_sizes.sh +16 -0
  84. data/test/read_tests/20-fq_time.sh +23 -0
  85. data/test/read_tests/30-fbin_read_time.sh +23 -0
  86. data/test/read_tests/40-bsc_read_time.sh +21 -0
  87. data/test/read_tests/50-fq_time_x4.sh +25 -0
  88. data/test/read_tests/60-fbin_read_time_x4.sh +24 -0
  89. data/test/read_tests/70-bsc_read_time_x4.sh +32 -0
  90. data/test/results_bio_scbi_fasta.txt +11 -0
  91. data/test/{test_scbi_fbin_file.rb → scbi_fbin_file_test.rb} +0 -0
  92. data/test/speed.txt +81 -0
  93. data/test/t_scbi_fasta.rb +12 -0
  94. data/test/write_tests/10-original_sizes.sh +16 -0
  95. data/test/write_tests/20-zip_time.sh +17 -0
  96. data/test/write_tests/30-mk_fbin_time.sh +23 -0
  97. data/test/write_tests/31-mk_fbin_time_f30.sh +21 -0
  98. data/test/write_tests/40-gzip_time.sh +16 -0
  99. data/test/write_tests/41-bsc_time.sh +16 -0
  100. data/test/write_tests/50-zip_sizes.sh +16 -0
  101. data/test/write_tests/60-fbin_sizes.sh +17 -0
  102. data/test/write_tests/61-fbin_sizes_f30.sh +16 -0
  103. data/test/write_tests/70-gzip_sizes.sh +17 -0
  104. data/test/write_tests/80-bsc_sizes.sh +17 -0
  105. data/website/index.html +87 -0
  106. data/website/index.txt +81 -0
  107. data/website/javascripts/rounded_corners_lite.inc.js +285 -0
  108. data/website/stylesheets/screen.css +159 -0
  109. data/website/template.html.erb +50 -0
  110. metadata +208 -95
  111. data/History.txt +0 -19
  112. data/Manifest.txt +0 -12
  113. data/PostInstall.txt +0 -7
  114. data/script/console +0 -10
  115. data/script/destroy +0 -14
  116. data/script/generate +0 -14
@@ -0,0 +1,54 @@
1
#!/usr/bin/env ruby

# Prints the fasta, qual and/or extras of every sequence stored in a
# fastabin file.
#
# Usage: iterate_fastabin.rb fbin_file -f|-q|-e
# Several switches may be given; each selected section is printed after a
# ">name" line.

require File.expand_path(
  File.join(File.dirname(__FILE__), %w[.. lib scbi_fqbin]))

# check args
if ARGV.count < 2
  # "$0" inside a plain string is not interpolated, so name the script explicitly
  puts "#{$PROGRAM_NAME} fbin_file -f|-q|-e"
  puts
  puts "-f => Get fasta"
  puts "-q => Get qual"
  puts "-e => Get extras"
  exit
end

bin_file = ARGV.shift

# All remaining arguments are merged into one upper-case string of switch
# letters, so "-f -q" and "-fq" select the same sections.
mode = ARGV.join.gsub('-', '').upcase

get_fasta = mode.include?('F')
get_qual  = mode.include?('Q')
get_extra = mode.include?('E')

# File.exists? was removed in Ruby 3.2; File.exist? works on every version
unless File.exist?(bin_file)
  puts "File \"#{bin_file}\" doesn't exist"
  exit
end

# open fastabin file
fb = Fastabin.new(bin_file, 'r')

begin
  # iterate over all sequences
  fb.each do |n, f, q, e|
    if get_fasta
      puts ">" + n
      puts f
    end

    if get_qual
      puts ">" + n
      puts q
    end

    if get_extra
      puts ">" + n
      puts e
    end
  end
ensure
  # release the file even when iteration raises
  fb.close
end
54
+
@@ -0,0 +1,22 @@
1
#!/usr/bin/env ruby

# Builds a fastabin file from a fasta file and its qual file.
#
# Usage: mk_fastabin.rb fasta_file qual_file [out_file]
# When out_file is omitted the output is named after the fasta file, with
# its extension replaced by ".fbin".

require File.expand_path(File.join(File.dirname(__FILE__), %w[.. lib scbi_fqbin]))

require 'zlib'

# The output name is optional: the original check demanded exactly three
# arguments, which made the default below unreachable.
unless [2, 3].include?(ARGV.count)
  puts "#{$PROGRAM_NAME} fasta_file qual_file [out_file]"
  exit
end

fasta_file  = ARGV[0]
qual_file   = ARGV[1]
output_name = ARGV[2] || File.basename(fasta_file, File.extname(fasta_file)) + '.fbin'

fb = Fastabin.new(output_name, 'wb')

begin
  fb.add_fasta_qual(fasta_file, qual_file)
ensure
  # close the output even when the conversion raises
  fb.close
end
21
+
22
+
@@ -0,0 +1,36 @@
1
#!/usr/bin/env ruby

# Prints the fasta and qual of one named sequence stored in a fastabin file.
#
# Usage: rd_fastabin.rb fbin_file sequence_name

require File.expand_path(
  File.join(File.dirname(__FILE__), %w[.. lib scbi_fqbin]))

# check args
if ARGV.count != 2
  # "$0" inside a plain string is not interpolated, so name the script explicitly
  puts "#{$PROGRAM_NAME} fbin_file sequence_name"
  exit
end

bin_file = ARGV[0]
seq_name = ARGV[1]

# File.exists? was removed in Ruby 3.2; File.exist? works on every version
unless File.exist?(bin_file)
  puts "Binary file \"#{bin_file}\" doesn't exist"
  exit
end

fb = Fastabin.new(bin_file, 'r')

begin
  n, f, q = fb.read_seq(seq_name)

  if n.nil?
    puts "Sequence not found"
  else
    puts ">" + n
    puts f
    puts
    puts ">" + n
    puts q
  end
ensure
  # release the file even when the lookup raises
  fb.close
end
36
+
@@ -0,0 +1,20 @@
1
#!/usr/bin/env ruby

# Prints every sequence of a fasta/qual file pair: a "> name" line followed
# by the fasta, then a second "> name" line followed by the qual.

require 'scbi_fasta'

# FastaQualFile walks the fasta (first argument) and qual (second argument)
# files together.
reader = FastaQualFile.new(ARGV[0], ARGV[1])

reader.each do |seq_name, seq_fasta, seq_qual|
  header = "> #{seq_name}"
  puts header, seq_fasta, header, seq_qual
end

reader.close
20
+
@@ -0,0 +1,27 @@
1
#!/usr/local/bin/ruby
require 'bio'

# Reads a fasta file (first argument) and its qual file (second argument)
# entry by entry with BioRuby, checks that both describe the same sequence,
# and prints id, sequence, id and quality data for each pair.

ff = Bio::FlatFile.open(Bio::FastaFormat, ARGV[0])
qf = Bio::FlatFile.open(Bio::FastaFormat, ARGV[1])

begin
  while (f_seq = ff.next_entry) && (q_seq = qf.next_entry)

    if f_seq.entry_id != q_seq.entry_id
      raise "ERROR in name"
    end

    # Quality values are separated by single spaces, so count + 1 is the
    # number of values.
    qual_count = q_seq.data.count(' ') + 1

    if f_seq.seq.size != qual_count
      # report the same two numbers that were compared
      raise "ERROR in sizes #{f_seq.seq.size}, #{qual_count}"
    end

    puts f_seq.entry_id
    puts f_seq.seq
    puts q_seq.entry_id
    puts q_seq.data
  end
ensure
  ff.close
  qf.close
end
26
+
27
+
@@ -0,0 +1,34 @@
1
# Builds the fbin command line tools and the libreria_gz shared library.

OS := $(shell uname)

CFLAGS=

ifeq ($(OS), Linux)
CFLAGS=-fPIC -O3
endif

CC=gcc
DEPFILE=.depend
PROGS=mk_fbin rd_seq_fbin iterate_fbin
OBJS=libreria_gz.o
LIBS=-lz
LIB_NAME=liblibreria_gz

.PHONY: all lib clean

all: $(OBJS) $(PROGS) lib

# Each program is built from its own source plus the shared objects; listing
# them as prerequisites makes the program rebuild when either changes.
# Libraries go last on the link line so the linker can resolve the symbols
# referenced by the objects before them.
$(PROGS): %: %.c $(OBJS)
	$(CC) $(CFLAGS) $(OBJS) $< $(LIBS) -o $@

libreria_gz.o: libreria_gz.h

.c.o:
	$(CC) $(CFLAGS) -c -o $@ $<

lib: $(OBJS)
ifeq ($(OS), Linux)
	$(CC) -shared -Wl,-soname,$(LIB_NAME).so.1 -o $(LIB_NAME).so $(OBJS) $(LIBS)
endif

ifeq ($(OS), Darwin)
	$(CC) -dynamiclib -o $(LIB_NAME).dylib -dylib $(OBJS) $(LIBS)
endif

clean:
	rm -f *.o $(PROGS) *.dylib *.so*
Binary file
@@ -0,0 +1,54 @@
1
+ #include "libreria_gz.h"
2
+ #include <stdio.h>
3
+ #include <ctype.h>
4
+ #include <string.h>
5
+ #include <stdlib.h>
6
+
7
+
8
/*
 * Prints every record of an fbin file to stdout: a ">name" line followed by
 * the fasta data, then the qual data and, when the record has them, its
 * extras.
 *
 * Usage: iterate_fbin fbin_file
 *
 * Returns 0 when the whole file was read, -1 on a usage or open error, or
 * the error code returned by read_data_sequential.
 */
int main(int argc, char *argv[])
{
    struct file_data *filed = NULL;
    char *sname = NULL;
    char *fasta = NULL;
    char *qual = NULL;
    char *extras = NULL;
    int res = 0;

    if (argc != 2) {
        printf("Usage %s fbin_file\n\n", argv[0]);
        return -1;
    }

    if (initialize_sequential_reads(&filed, argv[1]) != 0) {
        fprintf(stderr, "Cannot read fbin file %s\n", argv[1]);
        if (filed != NULL) {
            close_sequential_reads(filed);
            free(filed);
        }
        return -1;
    }

    while ((res = read_data_sequential(filed, &sname, &fasta, &qual, &extras)) == 0) {
        printf(">%s\n%s\n", sname, fasta);
        printf("%s\n", qual);
        if (extras != NULL) printf("extras:%s\n", extras);

        /* The data buffers are sized for the record just read; release them
           so the next call allocates buffers that fit the next record. */
        free(fasta);
        fasta = NULL;
        free(qual);
        qual = NULL;
        free(extras);
        extras = NULL;
    }

    /* A failed read may leave buffers allocated; the name buffer is reused
       across records and is only released here. */
    free(sname);
    free(fasta);
    free(qual);
    free(extras);

    close_sequential_reads(filed);
    free(filed);

    /* read_data_sequential reports end of file as -9, which is the normal
       way for the loop to finish. */
    return (res == -9) ? 0 : res;
}
54
+
@@ -0,0 +1,707 @@
1
+
2
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

#include <sys/stat.h>
#include <sys/types.h>

#include <zlib.h>
#include "libreria_gz.h"
15
+
16
+ #define CHUNK 262144
17
+
18
+ // Maximum file name (including .idx)
19
+ #define MAXFNAME 512
20
+
21
+ // Maximum length of the name of a sequence
22
+ #define MAXSEQNAME 1024
23
+ #define DEBUG 0
24
+ #define FALSE 0
25
+ #define TRUE 1
26
+
27
+ char dict_fasta[65536];
28
+ char dict_qual[65536];
29
+
30
+ // Maximum size of the metadata of a sequence, including the name and the lengths of fasta, qual and extras.
31
+ // It should be a maximum of 10000
32
+ #define SEQ_METADATA 10000
33
+
34
+ static time_t curr_time=0;
35
+ static time_t prev_time=0;
36
+
37
+ int write_seq(struct file_data *file, char *seq_name, char *fasta, char *qual, char *extras)
38
+ {
39
+ // compress data
40
+ char metainfo[SEQ_METADATA];
41
+ int error=0;
42
+
43
+
44
+ if (file->gzf_bin==NULL) {fprintf(stderr,"error with gzfile_bin, is NULL :%s\n",gzerror(file->gzf_bin,&error));return -2;}
45
+
46
+ snprintf(metainfo,SEQ_METADATA-1,"9999%s %ld %ld %ld", seq_name, strlen(fasta), strlen(qual), strlen(extras));
47
+ snprintf(metainfo,SEQ_METADATA-1,"%4ld%s %ld %ld %ld", strlen(metainfo)-4, seq_name, strlen(fasta), strlen(qual), strlen(extras));
48
+
49
+ // get begin pos of header
50
+ long beginH=gztell(file->gzf_bin);
51
+
52
+ // TODO check gztell
53
+ if (beginH==-1) {fprintf(stderr,"error with pos of beginH of gzfile_bin :%s\n", gzerror(file->gzf_bin,&error)); return -2;}
54
+
55
+ // write seq to bin file
56
+ gzwrite(file->gzf_bin, metainfo, strlen(metainfo));
57
+
58
+ // TODO check gzwrite
59
+ long beginI=gztell(file->gzf_bin);
60
+
61
+ if (beginI==-1) {fprintf(stderr,"error with pos of beginI of gzfile :%s\n",gzerror(file->gzf_bin,&error));return -2;}
62
+
63
+
64
+ int res=1;
65
+ if (strlen(fasta)>0) res=gzwrite(file->gzf_bin,fasta,strlen(fasta)); //Z_FILTERED);
66
+
67
+ if ( res==0 ) { fprintf(stderr,"Error when writting fasta\n");return -8;}
68
+ long fastaS=gztell(file->gzf_bin)-beginI;
69
+
70
+ if (strlen(qual)>0) res=gzwrite(file->gzf_bin,qual,strlen(qual)); //Z_FILTERED);
71
+
72
+ if ( res==0 ) { fprintf(stderr,"Error when writting qual\n");return -8;}
73
+ long qualS=gztell(file->gzf_bin)-fastaS-beginI;
74
+
75
+ if (strlen(extras)>0) res=gzwrite(file->gzf_bin,extras,strlen(extras)); //Z_FILTERED);
76
+
77
+ if ( res==0 ) { fprintf(stderr,"Error when writting extras\n");return -8;}
78
+ long extrasS=gztell(file->gzf_bin)-qualS-fastaS-beginI;
79
+
80
+
81
+ // add_sequence(&seql,seq_name,pos_chunk_gz,beginI,fastaS,qualS,extrasS);
82
+
83
+ // Write index file
84
+ char tmp[SEQ_METADATA];
85
+ sprintf(tmp,"%s %lld %ld\n",seq_name,file->pos_chunk_gz,beginH);
86
+
87
+ gzwrite(file->gzf_index,tmp,strlen(tmp));
88
+
89
+ (file->counter)++;
90
+ // if (counter > 2) fprintf(stderr,"Probando static counter para llamadas desde ruby, valor %d\n",counter);
91
+
92
+ // create new chunk
93
+ if (((file->counter)%10000)==0) {
94
+ curr_time=time(NULL);
95
+ printf("time passed:%ld\n",curr_time-prev_time);
96
+ prev_time=curr_time;
97
+
98
+ // close current chunk
99
+ gzclose(file->gzf_bin);
100
+
101
+ // open file again
102
+ int file_bin=open(file->name,O_APPEND);
103
+
104
+ //goto end of file
105
+ long long pos=lseek(file_bin,0,SEEK_END);
106
+ if (pos==-1) {fprintf(stderr,"error %d seeking file :%s\n",errno,strerror(errno));return -1;}
107
+
108
+ // annotate chunk pos
109
+ file->pos_chunk_gz=pos;
110
+
111
+ close(file_bin);
112
+
113
+ // open new gzfile
114
+ file->gzf_bin=gzopen(file->name,"ab");
115
+ if (file->gzf_bin==NULL) {fprintf(stderr,"error opening gzfile :%s\n",gzerror(file->gzf_bin,&error));return -2;}
116
+ }
117
+
118
+ return 0;
119
+ }
120
+
121
+
122
+
123
+ /* Reads the metadata from the main file
124
+ It initializes the version variable
125
+ */
126
+ int read_bin_file_metadata(struct file_data *filed)
127
+ {
128
+ char header[SEQ_METADATA];
129
+ int fastaS,qualS,extrasS=0;
130
+ int ver,subver;
131
+
132
+ int res=read_seq_header(filed->gzf_bin, header, &fastaS, &qualS, &extrasS);
133
+
134
+ if ( res!=0 ) {fprintf(stderr,"SEQ READ incorrect:%d\n",res);return -1;}
135
+ if ( strlen(header)<20 ) {fprintf(stderr,"SEQ READ:Header incorrect:%s. lenght:%ld\n",header,strlen(header));return -1;}
136
+
137
+ // 28UMACOMPRESSEDFORMAT_1_0 0 0 0
138
+ // header[strlen(header)-2]=0;
139
+
140
+ if (strncmp(header,"UMACOMPRESSEDFORMAT_",19)!=0) {fprintf(stderr,"Incorrect header in file, header:%s\n",header);return -1;}
141
+ // TODO fill the file_data structure with the header data
142
+ if (sscanf(header,"UMACOMPRESSEDFORMAT_%d_%d",&ver,&subver)!=2) return -1;
143
+ //if (sscanf(header,"UMACOMPRESSEDFORMAT_%d_%d",&(filed->version),&(filed->subversion))!=2) return -1;
144
+ filed->version=11;//ver;
145
+ filed->subversion=subver;
146
+ // fprintf(stderr,"file version:%d,%d\n",filed->version,filed->subversion);
147
+ return 0;
148
+ }
149
+
150
+ /* Reads the metadata from the index file
151
+ It initializes the version and binary_search variable
152
+ */
153
+ int read_index_file_metadata(struct file_data *filed)
154
+ {
155
+ char header[SEQ_METADATA];
156
+ int fastaS,qualS,extrasS=0;
157
+
158
+
159
+ int res=read_seq_header(filed->gzf_bin, header, &fastaS, &qualS, &extrasS);
160
+
161
+ if ( strlen(header)<19 ) {fprintf(stderr,"SEQ READ:Header incorrect:%s.\n",header);return -1;}
162
+
163
+ // 28UMACOMPRESSEDFORMAT 1 0 0 0 0
164
+ header[strlen(header)-2]=0;
165
+
166
+ if (strncmp(header,"UMACOMPRESSEDFORMAT",19)!=0) return -1;
167
+ // TODO fill the file_data structure with the header data
168
+ if (sscanf(header,"UMACOMPRESSEDFORMAT %d %d",&(filed->version),&(filed->subversion))!=2) {
169
+ fprintf(stderr,"SEQ READ:Header incorrect when reading versions:%s.\n",header);
170
+ return -1;
171
+ }
172
+
173
+ return 0;
174
+ }
175
+
176
+ /* reads the header of a sequence in the main file.
177
+ the pointer to the file points to the fasta data after calling read_seq_header
178
+ returns 0 if ok
179
+ -1 if there is an error
180
+ -2 if EOF
181
+ */
182
+
183
+ int read_seq_header(gzFile *gzf_bin, char *seq_name,int *fastaS, int *qualS, int*extrasS)
184
+ {
185
+ int header_size=4;
186
+ char hsize[40];
187
+ char tmp[1000];
188
+ char sname[SEQ_METADATA];
189
+
190
+ long pos=gzread(gzf_bin,hsize,header_size);
191
+
192
+ // EOF found
193
+ if ( pos==0 ) return -2;
194
+
195
+ // Error reading file
196
+ if ( pos==-1 ) {fprintf(stderr,"error reading header\n");return -1;}
197
+
198
+ hsize[pos]=0;
199
+ sscanf(hsize,"%d",&header_size);
200
+ pos=gzread(gzf_bin,tmp,header_size);
201
+
202
+ if ( pos==0 ) return -2;
203
+
204
+ if ( pos==-1 ) {fprintf(stderr,"error reading header\n");return -1;}
205
+
206
+ tmp[header_size]=0;
207
+ int reads=sscanf(tmp,"%s %d %d %d",sname,fastaS,qualS,extrasS);
208
+
209
+ if (reads!=4) {return -1;};
210
+
211
+ if (seq_name!=NULL) strncpy(seq_name,sname,SEQ_METADATA);
212
+
213
+ return 0;
214
+ }
215
+
216
/*
 * Placeholder for the check of the bin and index files before they are
 * used. The intended result codes are:
 *   0 : both files exist and are from the current version
 *   1 : both files exist but are from another version
 *   2 : both files are missing
 *   3 : the bin file is missing
 *   4 : the index file is missing
 * No check is implemented yet, so the result is always 0.
 */
int check_files()
{
    enum { FILES_OK = 0 };

    return FILES_OK;
}
230
+
231
+ // returns the version of the opened file
232
+ int version(struct file_data *filed)
233
+ {
234
+ if (filed->gzf_bin==NULL) return -1;
235
+ return filed->version;
236
+ }
237
+
238
+ // returns the version of the opened file
239
+ int subversion(struct file_data *filed)
240
+ {
241
+ if (filed->gzf_bin==NULL) return -1;
242
+ return filed->subversion;
243
+ }
244
+
245
+ /*
246
+ mode can be:
247
+ 1 - random, for each read it begins to read from the beggining of index
248
+ 2 - sequential, it keeps the position inside the index and main files.
249
+ */
250
+ int initialize_sequential_reads(struct file_data ** filed, char *filename)
251
+ {
252
+ char header[SEQ_METADATA];
253
+ int fastaS,qualS,extrasS=0;
254
+
255
+ if ( *filed == NULL ) {*filed=malloc(sizeof(struct file_data));}
256
+
257
+
258
+ (*filed)->gzf_bin=gzopen(filename,"r");
259
+ strncpy((*filed)->name,filename,MAXFNAME);
260
+ (*filed)->error=0;
261
+
262
+
263
+
264
+ // reads the metadata
265
+ /*
266
+ int res=read_seq_header(filed->gzf_bin, header, &fastaS, &qualS, &extrasS);
267
+
268
+ if ( strlen(header)<19 ) {fprintf(stderr,"SEQ READ:Header incorrect:%s.\n",header);return -1;}
269
+
270
+ // 28UMACOMPRESSEDFORMAT_1 0 0 0
271
+ header[strlen(header)-2]=0;
272
+
273
+ if (strncmp(header,"UMACOMPRESSEDFORMAT",19)!=0) return -1;
274
+ // TODO fill the file_data structure with the header data
275
+ */
276
+ int res= read_bin_file_metadata(*filed);
277
+ // inspect_file_data_struct(filed);
278
+
279
+ return res;
280
+ }
281
+
282
+ int read_data_sequential(struct file_data *filed,char **seq_name, char **fasta, char **qual, char **extras)
283
+ {
284
+ int res=0;
285
+ int error=0;
286
+ int fastaS,qualS,extrasS=0;
287
+
288
+ if ( *seq_name == NULL ) {*seq_name=(char *)malloc(SEQ_METADATA);strncpy(*seq_name,"",4);}
289
+
290
+ res=read_seq_header(filed->gzf_bin, *seq_name, &fastaS, &qualS, &extrasS);
291
+ if (res==-2) // EOF
292
+ return -9;
293
+
294
+ if ( *fasta == NULL ) {*fasta=(char *)malloc(fastaS+1);strncpy(*fasta,"",fastaS);}
295
+ if ( *qual == NULL) {*qual=(char *)malloc(qualS+1);strncpy(*qual,"",qualS);}
296
+ if (( *extras == NULL )&&(extrasS>0)) {*extras=(char *)malloc(extrasS+1);strncpy(*extras,"",extrasS);}
297
+
298
+ long pos=gzread(filed->gzf_bin,*fasta,fastaS);
299
+ (*fasta)[fastaS]=0;
300
+ pos=gzread(filed->gzf_bin,*qual,qualS);
301
+ (*qual)[qualS]=0;
302
+ if (extrasS>0) {pos=gzread(filed->gzf_bin,*extras,extrasS);(*extras)[extrasS]=0;}
303
+ return 0;
304
+
305
+ }
306
+ int close_sequential_reads(struct file_data *file_d)
307
+ {
308
+ gzclose(file_d->gzf_bin);
309
+ }
310
+
311
+ /*
312
+ read_seq reads from filename the sequence named seq_name and returns its
313
+ fasta, qual and extras in those variables.
314
+ It returns 0 if there are no errors, otherwise it returns:
315
+ -2 : error opening index file (it doesn't exists)
316
+ -3 : error reading index file
317
+ -4 : error sequence not found in index file
318
+ -5 : error opening file (it doesn't exists)
319
+ -6 : error reading file
320
+ -7 : error sequence not found
321
+ -8 : error uncompressing sequence
322
+ -9 : EOF
323
+
324
+ */
325
+
326
+ int read_seq(char *filename, char *seq_name, char **fasta, char **qual, char **extras)
327
+ {
328
+ /* Hacer grep en filename.index de seq_name */
329
+ /* Una vez encontrado leer su info (indice y offsets) */
330
+ /* leer de filename en sus offests el fasta qual y extras */
331
+ /* Descomprimirlo y devolverlo */
332
+
333
+ char indexname[MAXFNAME];
334
+ char sname[MAXSEQNAME];// sequence name
335
+ // char *fasta_comp; // compressed fasta
336
+ // char *qual_comp; // compressed qual
337
+ // char *extras_comp; // compressed extras
338
+ long long beginH, gz_chunk=0;
339
+ int fastaS, qualS, extrasS=0;
340
+ char tmp[SEQ_METADATA];
341
+ int res=0;
342
+ int error=0;
343
+
344
+ int bufsize=150000;
345
+
346
+ // allocate memory for return data if necessary
347
+ if ( *fasta == NULL ) {*fasta=(char *)malloc(bufsize);strncpy(*fasta,"",bufsize);}
348
+ if ( *qual == NULL) {*qual=(char *)malloc(bufsize);strncpy(*qual,"",bufsize);}
349
+ if ( *extras == NULL ) {*extras=(char *)malloc(bufsize);strncpy(*extras,"",bufsize);}
350
+
351
+ // calc index name
352
+ snprintf(indexname,MAXFNAME,"%s.index",filename);
353
+ //FILE * filein=fopen(indexname,"r");
354
+
355
+ // open index file
356
+ gzFile gzfile_index=gzopen(indexname,"r");
357
+ if (gzfile_index==NULL) {
358
+ fprintf(stderr,"error opening gzfile_index :%s\n",gzerror(gzfile_index,&error));
359
+ return -2;
360
+ }
361
+
362
+ // Reads the index to this info, and the offset to its data
363
+ int reads=3;
364
+ while ( reads == 3 ) {
365
+
366
+ // read a chunk of data from index with the size of tmp
367
+ gzgets(gzfile_index,tmp,sizeof(tmp));
368
+ reads=sscanf(tmp,"%s %lld %lld",sname,&gz_chunk,&beginH);
369
+
370
+
371
+
372
+
373
+ if (( reads != 3 ) && ( reads!=EOF )) {
374
+ fprintf(stderr,"Error scanning index: %d\n",reads);
375
+ gzclose(gzfile_index);
376
+ return -3;
377
+ }
378
+
379
+ // sequence was finally found, exit loop
380
+ if ( strncmp(sname, seq_name,MAXSEQNAME)==0) reads=999; // to get out, seq found
381
+ }
382
+
383
+ // close index file
384
+ gzclose(gzfile_index);
385
+
386
+ // maybe sequence was not found
387
+ // fprintf(stderr,"Sequence not found\n");
388
+ if (reads==EOF) {return -4;}
389
+
390
+ // We get here if sequence was found
391
+
392
+ // open bin file to extract data
393
+ int dataf=open(filename, O_RDONLY);
394
+
395
+ // seek to chunk pos
396
+ // TODO- ¿como se salta el chunk?
397
+ res=lseek(dataf,gz_chunk,SEEK_SET);
398
+
399
+ // TODO check res
400
+ gzFile gzfile_bin=gzdopen(dataf,"r");
401
+
402
+ // seek to seq inside chunk
403
+ res=gzseek(gzfile_bin,beginH,SEEK_SET);
404
+ // TODO check res
405
+
406
+ // fasta=malloc(fastaO+1);
407
+ // qual=malloc(qualO+1);
408
+ // extras=malloc(extrasO+1);
409
+ // long pos=gzread(gzfile_bin,header,4);
410
+ // read sequence header
411
+
412
+ res=read_seq_header(gzfile_bin,NULL, &fastaS, &qualS, &extrasS);
413
+
414
+
415
+
416
+ long pos=gzread(gzfile_bin,*fasta,fastaS);
417
+
418
+ (*fasta)[fastaS]=0;
419
+
420
+ pos=gzread(gzfile_bin,*qual,qualS);
421
+ (*qual)[qualS]=0;
422
+
423
+ if (extrasS>0) {pos=gzread(gzfile_bin,*extras,extrasS); (*extras)[extrasS]=0;}
424
+ gzclose(gzfile_bin);
425
+
426
+ return 0;
427
+ }
428
+
429
+ void inspect_file_data_struct(struct file_data *file){
430
+
431
+ printf("file name:%s\n",file->name);
432
+ printf("file index_name:%s\n",file->index_name);
433
+ printf("file version:%d\n",file->version);
434
+ printf("file subversion:%d\n",file->subversion);
435
+ printf("error:%d\n",file->error);
436
+ /*
437
+ if (file->bin_search==TRUE) printf("file binary search is possible\n");
438
+ else printf("file binary search is not possible\n");
439
+ */
440
+
441
+ }
442
+
443
+ // initialize the state for doing writes
444
+ // two modes:
445
+ // 1 .- new files
446
+ // 2 .- add data to files, if they don't exist they are created
447
+ int initialize_writes(struct file_data ** file, char *output_name, int mode)
448
+ {
449
+
450
+ // check if the files exists, in case it exists check if it has the
451
+ // correct metadata and if it is of the correct version
452
+ // in other case exits with an error
453
+ // struct file_data *file = malloc(sizeof(struct write_file));
454
+ if ( *file == NULL ) {*file=malloc(sizeof(struct file_data));}
455
+
456
+ (*file)->pos_chunk_gz=0;
457
+
458
+ int state=check_files(output_name);
459
+ if (state==1) {
460
+ fprintf(stderr,"File is from a different version\n");
461
+ return -1;
462
+ }
463
+ if ((state!=2)&&(state!=0)) {
464
+ fprintf(stderr,"Error %d when checking files\n",state);
465
+ return -1;
466
+ }
467
+
468
+ // copy the name of the file
469
+ strncpy((*file)->name,output_name,MAXFNAME);
470
+
471
+ // open the compressed files
472
+ int error=0;
473
+ int flags=O_WRONLY|O_CREAT|O_TRUNC;
474
+ if (mode==2) flags=O_RDWR;
475
+ // printf("mode:%d\n",mode);
476
+
477
+ //set index name
478
+ snprintf((*file)->index_name,MAXFNAME,"%s.index",(*file)->name);
479
+
480
+ //open index file
481
+ int file_index=open((*file)->index_name,flags,0644);
482
+
483
+ if (file_index==-1) return -2;
484
+
485
+ // open bin file
486
+ int file_bin=open((*file)->name,flags,0644);
487
+ // printf("fd:%d\n",file_bin);
488
+ if (file_bin==-1) {fprintf(stderr,"error opening file_bin for writting:%s\n",strerror(errno));return -2;}
489
+ if (mode==2) {
490
+ long long pos=lseek(file_index,0,SEEK_END);
491
+ if (pos==-1) {fprintf(stderr,"error going to end of index file %s\n",strerror(errno)); return -2;}
492
+ pos=lseek(file_bin,0,SEEK_END);
493
+ if (pos==-1) {fprintf(stderr,"error going to end of bin file %s\n",strerror(errno)); return -2;}
494
+ (*file)->pos_chunk_gz=pos;
495
+ }
496
+
497
+ // open zlib index file
498
+ (*file)->gzf_index=gzdopen(file_index,"w");
499
+ if ((*file)->gzf_index==NULL) {
500
+ fprintf(stderr,"error opening gzfile_index for writting:%s\n",gzerror((*file)->gzf_index,&error));
501
+ return -2;
502
+ }
503
+
504
+ // open zlib bin file
505
+ (*file)->gzf_bin=gzdopen(file_bin,"w");
506
+ if ((*file)->gzf_bin==NULL) {
507
+ fprintf(stderr,"error opening gzfile for writting:%s\n",gzerror((*file)->gzf_bin,&error));
508
+ return -2;
509
+ }
510
+
511
+ // initializes the files, writting the metadata
512
+ if (mode==1) {
513
+ char header[SEQ_METADATA];
514
+ (*file)->version=VERSION;
515
+ (*file)->subversion=SUBVERSION;
516
+ (*file)->error=0;
517
+ // TODO put correct size
518
+ snprintf(header,SEQ_METADATA-1,"9999UMACOMPRESSEDFORMAT_%d_%d %d %d %d", (*file)->version,(*file)->subversion, 0, 0, 0);
519
+ snprintf(header,SEQ_METADATA-1,"%4ldUMACOMPRESSEDFORMAT_%d_%d %d %d %d", strlen(header)-4,(*file)->version,(*file)->subversion, 0, 0, 0);
520
+ // snprintf(header,SEQ_METADATA-1,"%4ld%s %ld %ld %ld", strlen(metainfo)-4, (*file)->version,(*file)->subversion, 0, 0, 0);
521
+
522
+ // sprintf(header," 29UMACOMPRESSEDFORMAT_%d_%d 0 0 0\n",(*file)->version,(*file)->subversion);
523
+ int res=gzwrite((*file)->gzf_bin,header,strlen(header));
524
+
525
+ sprintf(header,"UMACOMPRESSEDFORMAT 1 0 0 999999999999 999999999999\n");
526
+ res=gzwrite((*file)->gzf_index,header,strlen(header));
527
+ }
528
+ (*file)->counter=0;
529
+
530
+ // printf("Init writes done\n");
531
+ return 0;
532
+ }
533
+
534
+
535
+
536
+ int close_writes(struct file_data *file)
537
+ {
538
+ gzclose(file->gzf_bin);
539
+ gzclose(file->gzf_index);
540
+ }
541
+
542
+
543
+ int process_biofile(char *fname, char *qfname, char *efname, char *outname)
544
+ {
545
+
546
+ char sname[MAXSEQNAME];// sequence name
547
+ char qname[MAXSEQNAME];// sequence name
548
+ char ename[MAXSEQNAME];// sequence name
549
+ char next_sname[MAXSEQNAME];// sequence name
550
+ char next_qname[MAXSEQNAME];// sequence name
551
+ char next_ename[MAXSEQNAME];// sequence name
552
+
553
+ char fasta[150000];
554
+ char qual[150000];
555
+ char extras[150000];
556
+ char extras_used[150000];
557
+ char next_fcomment[150000];
558
+ char next_qcomment[150000];
559
+ char next_ecomment[150000];
560
+ char tmp[150000];
561
+ int extras_bool=TRUE;
562
+
563
+ int cnt=1;
564
+
565
+ sprintf(extras_used,"INITIALIZED");
566
+
567
+ // Open fasta and qual files
568
+ FILE *file_fasta=fopen(fname,"r");
569
+
570
+ if (file_fasta==NULL) { fprintf(stderr,"error opening fasta file %s, result %d %s\n",fname,errno,strerror(errno));return -2;};
571
+ // setvbuf(file_fasta,NULL,_IONBF,0);
572
+ FILE *file_qual=fopen(qfname,"r");
573
+ if (file_qual==NULL) { fprintf(stderr,"error opening qual file %s, result %d %s\n",qfname,errno,strerror(errno));return -2;};
574
+ FILE *file_extras=fopen(efname,"r");
575
+ if (file_extras==NULL) {fprintf(stderr,"error opening extras file %s, result %d %s\n",efname,errno,strerror(errno)); extras_bool=FALSE;sprintf(extras,"");};
576
+
577
+ // setvbuf(file_qual,NULL,_IONBF,0);
578
+ int error=0;
579
+ int end=0; //0 is false
580
+ char *res;
581
+
582
+ // reads the name of the sequence from both
583
+
584
+ // fscanf(file_qual,">%9000s",qname);
585
+ // fscanf(file_fasta,">%9000s",sname);
586
+
587
+
588
+ res=fgets(tmp,150000,file_fasta);
589
+ if (res!=NULL) {
590
+ sscanf(tmp,">%9000s",sname);
591
+ strncpy(next_fcomment,tmp+strlen(sname)+2,150000);
592
+ }
593
+
594
+ res=fgets(tmp,150000,file_qual);
595
+ if (res!=NULL) {
596
+ sscanf(tmp,">%9000s",qname);
597
+ strncpy(next_qcomment,tmp+strlen(qname)+2,150000);
598
+ }
599
+
600
+ if ( extras_bool ) {
601
+ res=fgets(tmp,150000,file_extras);
602
+ if (res!=NULL) {
603
+ sscanf(tmp,">%9000s",ename);
604
+ strncpy(next_ecomment,tmp+strlen(ename)+2,150000);
605
+ } else sprintf(ename,"");
606
+ }
607
+ printf("extras seq:%s\n",ename);
608
+
609
+ printf("file:%s q:%s seqname:%s qseqname%s efname:%s extras:%s\n",fname, qfname,sname,qname,efname,extras);
610
+ printf("next_fcomment:%s next_qcomment:%s\n",next_fcomment,next_qcomment);
611
+
612
+ struct file_data *file=NULL;
613
+ int error2=initialize_writes(&file, outname,1);
614
+
615
+ // sprintf(next_fcomment,"");
616
+ // sprintf(next_qcomment,"");
617
+
618
+ while (!end) {
619
+ if ( strcmp(sname,qname)!=0 ) {error = -9; goto end;}
620
+ /*
621
+ if (extras_bool)
622
+ if ( strcmp(sname,ename)!=0 ) {error = -9; goto end;}
623
+ */
624
+ // load the qual and fasta
625
+
626
+ sprintf(fasta,"");
627
+ sprintf(fasta,"%s",next_fcomment);
628
+ sprintf(next_fcomment,"");
629
+ sprintf(tmp,"");
630
+ res=fasta;
631
+ while (( res!=NULL ) && (tmp[0]!='>' )) {
632
+ res=fgets(tmp,150000,file_fasta);
633
+ if ((tmp[0]!='>')&&(res!=NULL)) sprintf (fasta,"%s%s",fasta,tmp);
634
+ else if (res!=NULL) {sscanf(tmp,">%9000s",next_sname); strncpy(next_fcomment,tmp+strlen(next_sname)+2,sizeof(next_fcomment));}
635
+ }
636
+ if (res==NULL) end=1;
637
+
638
+ sprintf(qual,"");
639
+ sprintf(qual,"%s",next_qcomment);
640
+ sprintf(next_qcomment,"");
641
+ res=qual;
642
+ sprintf(tmp,"");
643
+ while (( res!=NULL ) && (tmp[0]!='>' )) {
644
+ res=fgets(tmp,150000,file_qual);
645
+ if ((tmp[0]!='>')&&(res!=NULL)) sprintf (qual,"%s%s",qual,tmp);
646
+ else if (res!=NULL) {sscanf(tmp,">%9000s",next_qname); strncpy(next_qcomment,tmp+strlen(next_qname)+2,sizeof(next_qcomment));}
647
+ }
648
+ if (res==NULL) end=1;
649
+
650
+ // If extra_used!=NULL then it means that it has been used and a new one must be read
651
+ if (extras_bool && (strcmp(extras_used,"")!=0)) {
652
+ sprintf(extras,"");
653
+ sprintf(extras,"%s",next_ecomment);
654
+ sprintf(next_ecomment,"");
655
+ res=extras;
656
+ sprintf(tmp,"");
657
+ while (( res!=NULL ) && (tmp[0]!='>' )) {
658
+ res=fgets(tmp,150000,file_extras);
659
+ if ((tmp[0]!='>')&&(res!=NULL)) sprintf (extras,"%s%s",extras,tmp);
660
+ else if (res!=NULL) {sscanf(tmp,">%9000s",next_ename); strncpy(next_ecomment,tmp+strlen(next_ename)+2,sizeof(next_ecomment));}
661
+ }
662
+ //if (res==NULL) end=1; Extras file can be finished and processing will continue
663
+ }
664
+
665
+ /* If the name of the name is equal to the name of the actual sequence then it will be used for writting */
666
+ if ( strcmp(sname,ename)==0 ) {
667
+ strcpy(extras_used,extras);
668
+ strcpy(ename,next_ename);
669
+ } else sprintf(extras_used,"");
670
+
671
+ int error_wr=write_seq(file,sname, fasta,qual,extras_used);
672
+ if (error_wr!=0) { end=1;error=error_wr; };
673
+ if (error_wr==0) cnt++;
674
+ strcpy(sname,next_sname);
675
+ strcpy(qname,next_qname);
676
+
677
+ }
678
+
679
+ // repeat until EOF or error
680
+ end:
681
+ fclose(file_fasta);
682
+ fclose(file_qual);
683
+ close_writes(file);
684
+ //fclose(file_index);
685
+ // print_seqs(seql);
686
+ return error;
687
+ }
688
+
689
+
690
/* Reads up to size bytes of the file at path into dest.
   Returns 0 on success, -2 when the file cannot be opened and -6 on a read
   error. A file shorter than size is not an error. */
static int init_dicts_load(const char *path, char *dest, int size)
{
    FILE *in = fopen(path, "rb");
    if (in == NULL) {
        fprintf(stderr, "error opening dict file %s, result %d %s\n", path, errno, strerror(errno));
        return -2;
    }

    fread(dest, 1, (size_t)size, in);

    int failed = ferror(in);
    fclose(in);
    return failed ? -6 : 0;
}

/*
 * Loads the dictionaries "fasta.dic" and "qual.dic" from the current
 * directory into d_fasta and d_qual, reading at most size bytes of each.
 * Both destinations must have room for size bytes; nothing is appended to
 * the data read.
 *
 * Returns 0 on success,
 *        -1 when an argument is invalid,
 *        -2 when a dictionary cannot be opened,
 *        -6 when a dictionary cannot be read.
 */
int init_dicts(char *d_fasta,char *d_qual,int size)
{
    if (d_fasta == NULL || d_qual == NULL || size < 0) return -1;

    int res = init_dicts_load("fasta.dic", d_fasta, size);

    /* the qual dictionary is read from its own stream; it used to be read
       from the fasta stream after that stream had been closed */
    if (res == 0) res = init_dicts_load("qual.dic", d_qual, size);

    return res;
}
704
+
705
+
706
+
707
+