scbi_fqbin 0.2.2 → 0.2.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.DS_Store +0 -0
- data/.gitignore +14 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/{README.rdoc → README.md} +0 -0
- data/Rakefile +8 -28
- data/lib/scbi_fqbin.rb +3 -5
- data/lib/scbi_fqbin/fastabin.rb +411 -0
- data/lib/scbi_fqbin/fastq_file_c.rb +373 -0
- data/lib/scbi_fqbin/fbin_file.rb +1 -1
- data/lib/scbi_fqbin/t.rb +9 -0
- data/lib/scbi_fqbin/t2.rb +12 -0
- data/lib/scbi_fqbin/version.rb +3 -0
- data/lib_fqbin_src.zip +0 -0
- data/lib_fqbin_src/Makefile +66 -0
- data/lib_fqbin_src/fq +0 -0
- data/lib_fqbin_src/fq.c +165 -0
- data/lib_fqbin_src/hash_fqbin +0 -0
- data/lib_fqbin_src/hash_fqbin.c +212 -0
- data/lib_fqbin_src/idx_fqbin +21 -0
- data/lib_fqbin_src/iterate_fqbin +0 -0
- data/lib_fqbin_src/iterate_fqbin.c +136 -0
- data/lib_fqbin_src/lib_fqbin.c +1748 -0
- data/lib_fqbin_src/lib_fqbin.h +194 -0
- data/lib_fqbin_src/mk_fqbin +0 -0
- data/lib_fqbin_src/mk_fqbin.c +138 -0
- data/lib_fqbin_src/other/bwxform.c +915 -0
- data/lib_fqbin_src/other/bwxform.h +74 -0
- data/lib_fqbin_src/other/find_in_index.c +130 -0
- data/lib_fqbin_src/other/hash_fbin_nogzchunks.c +164 -0
- data/lib_fqbin_src/other/idx_fqbin +0 -0
- data/lib_fqbin_src/other/idx_fqbin.c +67 -0
- data/lib_fqbin_src/other/make_hsh.sh +14 -0
- data/lib_fqbin_src/other/rd_extras_fbin.c +45 -0
- data/lib_fqbin_src/read_fq +0 -0
- data/lib_fqbin_src/read_fq.c +143 -0
- data/lib_fqbin_src/read_fqbin +0 -0
- data/lib_fqbin_src/read_fqbin.c +101 -0
- data/lib_fqbin_src/sort_index +9 -0
- data/lib_fqbin_src/test.rb +13 -0
- data/scbi_fqbin.gemspec +25 -0
- data/test/build.rake +15 -0
- data/test/fbinfile +0 -0
- data/test/fbinfile.index +0 -0
- data/test/no_test_fill_file.rb +66 -0
- data/test/old/app.rb +43 -0
- data/test/old/bin/iterate_fastabin.rb +54 -0
- data/test/old/bin/mk_fastabin.rb +22 -0
- data/test/old/bin/rd_fastabin.rb +36 -0
- data/test/old/bin/rd_fq.rb +20 -0
- data/test/old/bioruby.rb +27 -0
- data/test/old/c/Makefile +34 -0
- data/test/old/c/fbin_lib.zip +0 -0
- data/test/old/c/iterate_fbin.c +54 -0
- data/test/old/c/libreria_gz.c +707 -0
- data/test/old/c/libreria_gz.h +127 -0
- data/test/old/c/main.c +86 -0
- data/test/old/c/mk_fbin.c +24 -0
- data/test/old/c/rd_seq_fbin.c +44 -0
- data/test/old/c/test_ffi/a.out +0 -0
- data/test/old/c/test_ffi/app.c +26 -0
- data/test/old/c/test_ffi/app.rb +19 -0
- data/test/old/c/test_ffi/liblibreria_gz.dylib +0 -0
- data/test/old/c/test_ffi/libmylibrary.dylib +0 -0
- data/test/old/c/test_ffi/my_library.rb +23 -0
- data/test/old/c/test_ffi/mylibrary.c +22 -0
- data/test/old/c/test_ffi/mylibrary.h +6 -0
- data/test/old/c/usage_instructions.txt +62 -0
- data/test/old/ext/Makefile +187 -0
- data/test/old/ext/Makefile.dario +34 -0
- data/test/old/ext/extconf.rb +8 -0
- data/test/old/ext/mk_fbin.c +24 -0
- data/test/old/ext/sample/extras.txt +4 -0
- data/{.gemtest → test/old/ext/sample/extras2.txt} +0 -0
- data/test/old/ext/sample/f1.fasta +10 -0
- data/test/old/ext/sample/f1.fasta.qual +10 -0
- data/test/old/ext/sample/f1.fbin +0 -0
- data/test/old/ext/sample/f1.fbin.index +0 -0
- data/test/old/ext/sample/main.c +86 -0
- data/test/old/ext/usage_instructions.txt +62 -0
- data/test/old/t_scbi_fastabin.rb +140 -0
- data/test/read_tests/10-original_sizes.sh +16 -0
- data/test/read_tests/20-fq_time.sh +23 -0
- data/test/read_tests/30-fbin_read_time.sh +23 -0
- data/test/read_tests/40-bsc_read_time.sh +21 -0
- data/test/read_tests/50-fq_time_x4.sh +25 -0
- data/test/read_tests/60-fbin_read_time_x4.sh +24 -0
- data/test/read_tests/70-bsc_read_time_x4.sh +32 -0
- data/test/results_bio_scbi_fasta.txt +11 -0
- data/test/{test_scbi_fbin_file.rb → scbi_fbin_file_test.rb} +0 -0
- data/test/speed.txt +81 -0
- data/test/t_scbi_fasta.rb +12 -0
- data/test/write_tests/10-original_sizes.sh +16 -0
- data/test/write_tests/20-zip_time.sh +17 -0
- data/test/write_tests/30-mk_fbin_time.sh +23 -0
- data/test/write_tests/31-mk_fbin_time_f30.sh +21 -0
- data/test/write_tests/40-gzip_time.sh +16 -0
- data/test/write_tests/41-bsc_time.sh +16 -0
- data/test/write_tests/50-zip_sizes.sh +16 -0
- data/test/write_tests/60-fbin_sizes.sh +17 -0
- data/test/write_tests/61-fbin_sizes_f30.sh +16 -0
- data/test/write_tests/70-gzip_sizes.sh +17 -0
- data/test/write_tests/80-bsc_sizes.sh +17 -0
- data/website/index.html +87 -0
- data/website/index.txt +81 -0
- data/website/javascripts/rounded_corners_lite.inc.js +285 -0
- data/website/stylesheets/screen.css +159 -0
- data/website/template.html.erb +50 -0
- metadata +208 -95
- data/History.txt +0 -19
- data/Manifest.txt +0 -12
- data/PostInstall.txt +0 -7
- data/script/console +0 -10
- data/script/destroy +0 -14
- data/script/generate +0 -14
@@ -0,0 +1,54 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Prints every sequence stored in a fastabin file to stdout.
#
# Usage: <script> fbin_file -f|-q|-e
#   -f  print the fasta of each sequence
#   -q  print the qual of each sequence
#   -e  print the extras of each sequence
# All arguments after the file name are merged, so "-f -q" and "-fq" are
# equivalent and several kinds of data can be requested at once.

require File.expand_path(
  File.join(File.dirname(__FILE__), %w[.. lib scbi_fqbin]))

# check args
if ARGV.count < 2
  # $0 has to be interpolated explicitly; a bare "$0" prints the two characters
  puts "#{$0} fbin_file -f|-q|-e"
  puts
  puts "-f => Get fasta"
  puts "-q => Get qual"
  puts "-e => Get extras"
  exit
end

bin_file = ARGV.shift
mode = ARGV.join.gsub('-','').upcase

get_fasta = mode.include?('F')
get_qual = mode.include?('Q')
get_extra = mode.include?('E')

# File.exists? is a deprecated alias of File.exist?
unless File.exist?(bin_file)
  puts "File \"#{bin_file}\" doesn't exist"
  exit
end

# open fastabin file
fb = Fastabin.new(bin_file,'r')

begin
  # iterate over all sequences
  fb.each do |n,f,q,e|
    if get_fasta
      puts ">"+n
      puts f
    end

    if get_qual
      puts ">"+n
      puts q
    end

    if get_extra
      puts ">"+n
      puts e
    end
  end
ensure
  # release the file even if the iteration raises
  fb.close
end
|
54
|
+
|
@@ -0,0 +1,22 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Builds a fastabin file from a fasta file and its qual file.
#
# Usage: <script> fasta_file qual_file [out_file]
# When out_file is omitted the output is named after the fasta file, with
# its extension replaced by ".fbin".

require File.expand_path(File.join(File.dirname(__FILE__), %w[.. lib scbi_fqbin]))

require 'zlib'

# The default output name below was unreachable while exactly three
# arguments were required, so the third one is optional now.
unless [2, 3].include?(ARGV.count)
  # $0 has to be interpolated explicitly; a bare "$0" prints the two characters
  puts "#{$0} fasta_file qual_file [out_file]"
  exit
end

fasta_file = ARGV[0]
qual_file = ARGV[1]
output_name = ARGV[2] || File.basename(fasta_file,File.extname(fasta_file))+'.fbin'

fb = Fastabin.new(output_name,'wb')

begin
  fb.add_fasta_qual(fasta_file,qual_file)
ensure
  # close the output even if adding the sequences raises
  fb.close
end
|
21
|
+
|
22
|
+
|
@@ -0,0 +1,36 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Looks up one sequence by name in a fastabin file and prints its fasta
# and qual, each preceded by a ">name" line.
#
# Usage: <script> fbin_file sequence_name

require File.expand_path(
  File.join(File.dirname(__FILE__), %w[.. lib scbi_fqbin]))

# check args
if ARGV.count != 2
  # $0 has to be interpolated explicitly; a bare "$0" prints the two characters
  puts "#{$0} fbin_file sequence_name"
  exit
end

bin_file = ARGV[0]
seq_name = ARGV[1]

# File.exists? is a deprecated alias of File.exist?
unless File.exist?(bin_file)
  puts "Binary file \"#{bin_file}\" doesn't exist"
  exit
end

fb = Fastabin.new(bin_file,'r')

begin
  n,f,q = fb.read_seq(seq_name)

  if n.nil?
    puts "Sequence not found"
  else
    puts ">"+n
    puts f
    puts
    puts ">"+n
    puts q
  end
ensure
  # release the file even if the lookup raises
  fb.close
end
|
36
|
+
|
@@ -0,0 +1,20 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Walks a fasta file and its qual file in parallel with FastaQualFile and
# prints, for every entry, the name line followed by the fasta and then the
# name line followed by the qual.
#
# Arguments: fasta file, qual file.

require 'scbi_fasta'

fasta_path, qual_path = ARGV

# use FastaQualFile to read fasta
reader = FastaQualFile.new(fasta_path, qual_path)

# iterate over sequences
reader.each do |seq_name, seq_fasta, seq_qual|
  name_line = "> #{seq_name}"

  puts name_line
  puts seq_fasta
  puts name_line
  puts seq_qual
end

reader.close
|
20
|
+
|
data/test/old/bioruby.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
#!/usr/local/bin/ruby
# Reads a fasta file (ARGV[0]) and its qual file (ARGV[1]) in lock step with
# BioRuby, checks that every pair of entries has the same id and the same
# number of elements, and prints id and data of both.
require 'bio'

ff = Bio::FlatFile.open(Bio::FastaFormat, ARGV[0])
qf = Bio::FlatFile.open(Bio::FastaFormat, ARGV[1])

begin
  while (f_seq = ff.next_entry) && (q_seq = qf.next_entry)

    if f_seq.entry_id != q_seq.entry_id
      raise "ERROR in name"
    end

    seq_size = f_seq.seq.size
    # NOTE(review): quality values are counted by the spaces between them;
    # presumably values separated only by a line break are not counted - confirm
    qual_count = q_seq.data.count(' ') + 1

    if seq_size != qual_count
      # report the same two numbers that were compared
      raise "ERROR in sizes #{seq_size}, #{qual_count}"
    end

    puts f_seq.entry_id
    puts f_seq.seq
    puts q_seq.entry_id
    puts q_seq.data
  end
ensure
  ff.close
  qf.close
end
|
26
|
+
|
27
|
+
|
data/test/old/c/Makefile
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
# Builds the fbin command line tools (PROGS) and the shared library that
# wraps libreria_gz.o. Linux and Darwin need different flags.
OS := $(shell uname)

CFLAGS=

ifeq ($(OS), Linux)
CFLAGS=-fPIC -O3
endif

CC=gcc
DEPFILE=.depend
PROGS=mk_fbin rd_seq_fbin iterate_fbin
OBJS=libreria_gz.o
LIBS=-lz
LIB_NAME=liblibreria_gz

# these targets are commands, not files
.PHONY: all lib clean

all: $(OBJS) $(PROGS) lib

# Each tool is rebuilt when its own source or the shared objects change.
# Libraries go last so the linker sees them after the code that uses them.
$(PROGS): %: %.c $(OBJS)
	$(CC) $(CFLAGS) $< $(OBJS) $(LIBS) -o $@

.c.o:
	$(CC) $(CFLAGS) -c -o $@ $<

lib: $(OBJS)
ifeq ($(OS), Linux)
	$(CC) -shared -Wl,-soname,$(LIB_NAME).so.1 -o $(LIB_NAME).so $(OBJS) $(LIBS)
endif

ifeq ($(OS), Darwin)
	$(CC) -dynamiclib -o $(LIB_NAME).dylib -dylib $(OBJS) $(LIBS)
endif

clean:
	rm -f *.o $(PROGS) *.dylib *.so*
|
Binary file
|
@@ -0,0 +1,54 @@
|
|
1
|
+
#include "libreria_gz.h"
|
2
|
+
#include <stdio.h>
|
3
|
+
#include <ctype.h>
|
4
|
+
#include <string.h>
|
5
|
+
#include <stdlib.h>
|
6
|
+
|
7
|
+
|
8
|
+
/*******************************************************/
/* main                                                */
/* Prints every sequence of an fbin file to stdout:    */
/* ">name", fasta, qual and, when present, the extras. */
/* Exit status: 0 when the whole file was read,        */
/* -1 on bad usage, otherwise the library error code.  */
/*******************************************************/
int main(int argc, char *argv[])
{
    char *sname=NULL;
    char *fasta=NULL;
    char *qual=NULL;
    char *extras=NULL;
    struct file_data *filed=NULL;
    int res=0;

    if (argc!=2)
    {
        printf("Usage %s fbin_file\n\n",argv[0]);
        return -1;
    }

    res=initialize_sequential_reads(&filed, argv[1]);
    if (res!=0)
    {
        fprintf(stderr,"Could not start reading %s (error %d)\n",argv[1],res);
        if (filed!=NULL) {close_sequential_reads(filed);free(filed);}
        return res;
    }

    while ((res=read_data_sequential(filed, &sname, &fasta, &qual, &extras))==0)
    {
        printf(">%s\n%s\n", sname, fasta);
        printf("%s\n",qual);
        if (extras!=NULL) printf ("extras:%s\n",extras);

        // released on every round so the next read allocates buffers
        // sized for the next sequence
        free(fasta);fasta=NULL;
        free(qual);qual=NULL;
        free(extras);extras=NULL;
    }

    // whatever was still allocated when the loop ended
    free(sname);
    free(fasta);
    free(qual);
    free(extras);

    close_sequential_reads(filed);
    free(filed);

    // -9 is how read_data_sequential reports end of file: a normal end here
    return (res==-9) ? 0 : res;
}
|
54
|
+
|
@@ -0,0 +1,707 @@
|
|
1
|
+
|
2
|
+
#include <stdio.h>
#include <string.h>
#include <time.h>

#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <errno.h>
#include <unistd.h>

#include <zlib.h>
#include <stdlib.h>
#include "libreria_gz.h"
|
15
|
+
|
16
|
+
#define CHUNK 262144
|
17
|
+
|
18
|
+
// Maximum file name (including .idx)
|
19
|
+
#define MAXFNAME 512
|
20
|
+
|
21
|
+
// Maximum length of the name of a sequence
|
22
|
+
#define MAXSEQNAME 1024
|
23
|
+
#define DEBUG 0
|
24
|
+
#define FALSE 0
|
25
|
+
#define TRUE 1
|
26
|
+
|
27
|
+
char dict_fasta[65536];
|
28
|
+
char dict_qual[65536];
|
29
|
+
|
30
|
+
// Maximum size of the metadata of a sequence, including name, length of fasta, qual and extras.
|
31
|
+
// It should be a maximum of 10000
|
32
|
+
#define SEQ_METADATA 10000
|
33
|
+
|
34
|
+
static time_t curr_time=0;
|
35
|
+
static time_t prev_time=0;
|
36
|
+
|
37
|
+
int write_seq(struct file_data *file, char *seq_name, char *fasta, char *qual, char *extras)
|
38
|
+
{
|
39
|
+
// compress data
|
40
|
+
char metainfo[SEQ_METADATA];
|
41
|
+
int error=0;
|
42
|
+
|
43
|
+
|
44
|
+
if (file->gzf_bin==NULL) {fprintf(stderr,"error with gzfile_bin, is NULL :%s\n",gzerror(file->gzf_bin,&error));return -2;}
|
45
|
+
|
46
|
+
snprintf(metainfo,SEQ_METADATA-1,"9999%s %ld %ld %ld", seq_name, strlen(fasta), strlen(qual), strlen(extras));
|
47
|
+
snprintf(metainfo,SEQ_METADATA-1,"%4ld%s %ld %ld %ld", strlen(metainfo)-4, seq_name, strlen(fasta), strlen(qual), strlen(extras));
|
48
|
+
|
49
|
+
// get begin pos of header
|
50
|
+
long beginH=gztell(file->gzf_bin);
|
51
|
+
|
52
|
+
// TODO check gztell
|
53
|
+
if (beginH==-1) {fprintf(stderr,"error with pos of beginH of gzfile_bin :%s\n", gzerror(file->gzf_bin,&error)); return -2;}
|
54
|
+
|
55
|
+
// write seq to bin file
|
56
|
+
gzwrite(file->gzf_bin, metainfo, strlen(metainfo));
|
57
|
+
|
58
|
+
// TODO check gzwrite
|
59
|
+
long beginI=gztell(file->gzf_bin);
|
60
|
+
|
61
|
+
if (beginI==-1) {fprintf(stderr,"error with pos of beginI of gzfile :%s\n",gzerror(file->gzf_bin,&error));return -2;}
|
62
|
+
|
63
|
+
|
64
|
+
int res=1;
|
65
|
+
if (strlen(fasta)>0) res=gzwrite(file->gzf_bin,fasta,strlen(fasta)); //Z_FILTERED);
|
66
|
+
|
67
|
+
if ( res==0 ) { fprintf(stderr,"Error when writting fasta\n");return -8;}
|
68
|
+
long fastaS=gztell(file->gzf_bin)-beginI;
|
69
|
+
|
70
|
+
if (strlen(qual)>0) res=gzwrite(file->gzf_bin,qual,strlen(qual)); //Z_FILTERED);
|
71
|
+
|
72
|
+
if ( res==0 ) { fprintf(stderr,"Error when writting qual\n");return -8;}
|
73
|
+
long qualS=gztell(file->gzf_bin)-fastaS-beginI;
|
74
|
+
|
75
|
+
if (strlen(extras)>0) res=gzwrite(file->gzf_bin,extras,strlen(extras)); //Z_FILTERED);
|
76
|
+
|
77
|
+
if ( res==0 ) { fprintf(stderr,"Error when writting extras\n");return -8;}
|
78
|
+
long extrasS=gztell(file->gzf_bin)-qualS-fastaS-beginI;
|
79
|
+
|
80
|
+
|
81
|
+
// add_sequence(&seql,seq_name,pos_chunk_gz,beginI,fastaS,qualS,extrasS);
|
82
|
+
|
83
|
+
// Write index file
|
84
|
+
char tmp[SEQ_METADATA];
|
85
|
+
sprintf(tmp,"%s %lld %ld\n",seq_name,file->pos_chunk_gz,beginH);
|
86
|
+
|
87
|
+
gzwrite(file->gzf_index,tmp,strlen(tmp));
|
88
|
+
|
89
|
+
(file->counter)++;
|
90
|
+
// if (counter > 2) fprintf(stderr,"Probando static counter para llamadas desde ruby, valor %d\n",counter);
|
91
|
+
|
92
|
+
// create new chunk
|
93
|
+
if (((file->counter)%10000)==0) {
|
94
|
+
curr_time=time(NULL);
|
95
|
+
printf("time passed:%ld\n",curr_time-prev_time);
|
96
|
+
prev_time=curr_time;
|
97
|
+
|
98
|
+
// close current chunk
|
99
|
+
gzclose(file->gzf_bin);
|
100
|
+
|
101
|
+
// open file again
|
102
|
+
int file_bin=open(file->name,O_APPEND);
|
103
|
+
|
104
|
+
//goto end of file
|
105
|
+
long long pos=lseek(file_bin,0,SEEK_END);
|
106
|
+
if (pos==-1) {fprintf(stderr,"error %d seeking file :%s\n",errno,strerror(errno));return -1;}
|
107
|
+
|
108
|
+
// annotate chunk pos
|
109
|
+
file->pos_chunk_gz=pos;
|
110
|
+
|
111
|
+
close(file_bin);
|
112
|
+
|
113
|
+
// open new gzfile
|
114
|
+
file->gzf_bin=gzopen(file->name,"ab");
|
115
|
+
if (file->gzf_bin==NULL) {fprintf(stderr,"error opening gzfile :%s\n",gzerror(file->gzf_bin,&error));return -2;}
|
116
|
+
}
|
117
|
+
|
118
|
+
return 0;
|
119
|
+
}
|
120
|
+
|
121
|
+
|
122
|
+
|
123
|
+
/* Reads the metadata from the main file
|
124
|
+
It initializes the version variable
|
125
|
+
*/
|
126
|
+
int read_bin_file_metadata(struct file_data *filed)
|
127
|
+
{
|
128
|
+
char header[SEQ_METADATA];
|
129
|
+
int fastaS,qualS,extrasS=0;
|
130
|
+
int ver,subver;
|
131
|
+
|
132
|
+
int res=read_seq_header(filed->gzf_bin, header, &fastaS, &qualS, &extrasS);
|
133
|
+
|
134
|
+
if ( res!=0 ) {fprintf(stderr,"SEQ READ incorrect:%d\n",res);return -1;}
|
135
|
+
if ( strlen(header)<20 ) {fprintf(stderr,"SEQ READ:Header incorrect:%s. lenght:%ld\n",header,strlen(header));return -1;}
|
136
|
+
|
137
|
+
// 28UMACOMPRESSEDFORMAT_1_0 0 0 0
|
138
|
+
// header[strlen(header)-2]=0;
|
139
|
+
|
140
|
+
if (strncmp(header,"UMACOMPRESSEDFORMAT_",19)!=0) {fprintf(stderr,"Incorrect header in file, header:%s\n",header);return -1;}
|
141
|
+
// TODO fill the file_data structure with the header data
|
142
|
+
if (sscanf(header,"UMACOMPRESSEDFORMAT_%d_%d",&ver,&subver)!=2) return -1;
|
143
|
+
//if (sscanf(header,"UMACOMPRESSEDFORMAT_%d_%d",&(filed->version),&(filed->subversion))!=2) return -1;
|
144
|
+
filed->version=11;//ver;
|
145
|
+
filed->subversion=subver;
|
146
|
+
// fprintf(stderr,"file version:%d,%d\n",filed->version,filed->subversion);
|
147
|
+
return 0;
|
148
|
+
}
|
149
|
+
|
150
|
+
/* Reads the metadata from the index file
|
151
|
+
It initializes the version and binary_search variable
|
152
|
+
*/
|
153
|
+
int read_index_file_metadata(struct file_data *filed)
|
154
|
+
{
|
155
|
+
char header[SEQ_METADATA];
|
156
|
+
int fastaS,qualS,extrasS=0;
|
157
|
+
|
158
|
+
|
159
|
+
int res=read_seq_header(filed->gzf_bin, header, &fastaS, &qualS, &extrasS);
|
160
|
+
|
161
|
+
if ( strlen(header)<19 ) {fprintf(stderr,"SEQ READ:Header incorrect:%s.\n",header);return -1;}
|
162
|
+
|
163
|
+
// 28UMACOMPRESSEDFORMAT 1 0 0 0 0
|
164
|
+
header[strlen(header)-2]=0;
|
165
|
+
|
166
|
+
if (strncmp(header,"UMACOMPRESSEDFORMAT",19)!=0) return -1;
|
167
|
+
// TODO fill the file_data structure with the header data
|
168
|
+
if (sscanf(header,"UMACOMPRESSEDFORMAT %d %d",&(filed->version),&(filed->subversion))!=2) {
|
169
|
+
fprintf(stderr,"SEQ READ:Header incorrect when reading versions:%s.\n",header);
|
170
|
+
return -1;
|
171
|
+
}
|
172
|
+
|
173
|
+
return 0;
|
174
|
+
}
|
175
|
+
|
176
|
+
/* reads the header of a sequence in the main file.
|
177
|
+
the pointer to the file points to the fasta data after calling read_seq_header
|
178
|
+
returns 0 if ok
|
179
|
+
-1 if there is an error
|
180
|
+
-2 if EOF
|
181
|
+
*/
|
182
|
+
|
183
|
+
int read_seq_header(gzFile *gzf_bin, char *seq_name,int *fastaS, int *qualS, int*extrasS)
|
184
|
+
{
|
185
|
+
int header_size=4;
|
186
|
+
char hsize[40];
|
187
|
+
char tmp[1000];
|
188
|
+
char sname[SEQ_METADATA];
|
189
|
+
|
190
|
+
long pos=gzread(gzf_bin,hsize,header_size);
|
191
|
+
|
192
|
+
// EOF found
|
193
|
+
if ( pos==0 ) return -2;
|
194
|
+
|
195
|
+
// Error reading file
|
196
|
+
if ( pos==-1 ) {fprintf(stderr,"error reading header\n");return -1;}
|
197
|
+
|
198
|
+
hsize[pos]=0;
|
199
|
+
sscanf(hsize,"%d",&header_size);
|
200
|
+
pos=gzread(gzf_bin,tmp,header_size);
|
201
|
+
|
202
|
+
if ( pos==0 ) return -2;
|
203
|
+
|
204
|
+
if ( pos==-1 ) {fprintf(stderr,"error reading header\n");return -1;}
|
205
|
+
|
206
|
+
tmp[header_size]=0;
|
207
|
+
int reads=sscanf(tmp,"%s %d %d %d",sname,fastaS,qualS,extrasS);
|
208
|
+
|
209
|
+
if (reads!=4) {return -1;};
|
210
|
+
|
211
|
+
if (seq_name!=NULL) strncpy(seq_name,sname,SEQ_METADATA);
|
212
|
+
|
213
|
+
return 0;
|
214
|
+
}
|
215
|
+
|
216
|
+
// Checks the bin and index files before they are used.
// The intended results are:
//  0 : both the bin and index files exist and are from the current version
//  1 : both the bin and index files exist but are from another version
//  2 : both files are missing
//  3 : the bin file is missing
//  4 : the index file is missing
// For now this is a placeholder: no file is opened and 0 is always returned.
int check_files()
{
    int status=0;

    return status;
}
|
230
|
+
|
231
|
+
// returns the version of the opened file
|
232
|
+
int version(struct file_data *filed)
|
233
|
+
{
|
234
|
+
if (filed->gzf_bin==NULL) return -1;
|
235
|
+
return filed->version;
|
236
|
+
}
|
237
|
+
|
238
|
+
// returns the version of the opened file
|
239
|
+
int subversion(struct file_data *filed)
|
240
|
+
{
|
241
|
+
if (filed->gzf_bin==NULL) return -1;
|
242
|
+
return filed->subversion;
|
243
|
+
}
|
244
|
+
|
245
|
+
/*
|
246
|
+
mode can be:
|
247
|
+
1 - random, for each read it begins to read from the beggining of index
|
248
|
+
2 - sequential, it keeps the position inside the index and main files.
|
249
|
+
*/
|
250
|
+
int initialize_sequential_reads(struct file_data ** filed, char *filename)
|
251
|
+
{
|
252
|
+
char header[SEQ_METADATA];
|
253
|
+
int fastaS,qualS,extrasS=0;
|
254
|
+
|
255
|
+
if ( *filed == NULL ) {*filed=malloc(sizeof(struct file_data));}
|
256
|
+
|
257
|
+
|
258
|
+
(*filed)->gzf_bin=gzopen(filename,"r");
|
259
|
+
strncpy((*filed)->name,filename,MAXFNAME);
|
260
|
+
(*filed)->error=0;
|
261
|
+
|
262
|
+
|
263
|
+
|
264
|
+
// reads the metadata
|
265
|
+
/*
|
266
|
+
int res=read_seq_header(filed->gzf_bin, header, &fastaS, &qualS, &extrasS);
|
267
|
+
|
268
|
+
if ( strlen(header)<19 ) {fprintf(stderr,"SEQ READ:Header incorrect:%s.\n",header);return -1;}
|
269
|
+
|
270
|
+
// 28UMACOMPRESSEDFORMAT_1 0 0 0
|
271
|
+
header[strlen(header)-2]=0;
|
272
|
+
|
273
|
+
if (strncmp(header,"UMACOMPRESSEDFORMAT",19)!=0) return -1;
|
274
|
+
// TODO fill the file_data structure with the header data
|
275
|
+
*/
|
276
|
+
int res= read_bin_file_metadata(*filed);
|
277
|
+
// inspect_file_data_struct(filed);
|
278
|
+
|
279
|
+
return res;
|
280
|
+
}
|
281
|
+
|
282
|
+
int read_data_sequential(struct file_data *filed,char **seq_name, char **fasta, char **qual, char **extras)
|
283
|
+
{
|
284
|
+
int res=0;
|
285
|
+
int error=0;
|
286
|
+
int fastaS,qualS,extrasS=0;
|
287
|
+
|
288
|
+
if ( *seq_name == NULL ) {*seq_name=(char *)malloc(SEQ_METADATA);strncpy(*seq_name,"",4);}
|
289
|
+
|
290
|
+
res=read_seq_header(filed->gzf_bin, *seq_name, &fastaS, &qualS, &extrasS);
|
291
|
+
if (res==-2) // EOF
|
292
|
+
return -9;
|
293
|
+
|
294
|
+
if ( *fasta == NULL ) {*fasta=(char *)malloc(fastaS+1);strncpy(*fasta,"",fastaS);}
|
295
|
+
if ( *qual == NULL) {*qual=(char *)malloc(qualS+1);strncpy(*qual,"",qualS);}
|
296
|
+
if (( *extras == NULL )&&(extrasS>0)) {*extras=(char *)malloc(extrasS+1);strncpy(*extras,"",extrasS);}
|
297
|
+
|
298
|
+
long pos=gzread(filed->gzf_bin,*fasta,fastaS);
|
299
|
+
(*fasta)[fastaS]=0;
|
300
|
+
pos=gzread(filed->gzf_bin,*qual,qualS);
|
301
|
+
(*qual)[qualS]=0;
|
302
|
+
if (extrasS>0) {pos=gzread(filed->gzf_bin,*extras,extrasS);(*extras)[extrasS]=0;}
|
303
|
+
return 0;
|
304
|
+
|
305
|
+
}
|
306
|
+
int close_sequential_reads(struct file_data *file_d)
|
307
|
+
{
|
308
|
+
gzclose(file_d->gzf_bin);
|
309
|
+
}
|
310
|
+
|
311
|
+
/*
|
312
|
+
read_seq reads from filename the sequence named seq_name and returns its
|
313
|
+
fasta, qual and extras in those variables.
|
314
|
+
It returns 0 if there are no errors, otherwise it returns:
|
315
|
+
-2 : error opening index file (it doesn't exists)
|
316
|
+
-3 : error reading index file
|
317
|
+
-4 : error sequence not found in index file
|
318
|
+
-5 : error opening file (it doesn't exists)
|
319
|
+
-6 : error reading file
|
320
|
+
-7 : error sequence not found
|
321
|
+
-8 : error uncompressing sequence
|
322
|
+
-9 : EOF
|
323
|
+
|
324
|
+
*/
|
325
|
+
|
326
|
+
int read_seq(char *filename, char *seq_name, char **fasta, char **qual, char **extras)
|
327
|
+
{
|
328
|
+
/* Hacer grep en filename.index de seq_name */
|
329
|
+
/* Una vez encontrado leer su info (indice y offsets) */
|
330
|
+
/* leer de filename en sus offests el fasta qual y extras */
|
331
|
+
/* Descomprimirlo y devolverlo */
|
332
|
+
|
333
|
+
char indexname[MAXFNAME];
|
334
|
+
char sname[MAXSEQNAME];// sequence name
|
335
|
+
// char *fasta_comp; // compressed fasta
|
336
|
+
// char *qual_comp; // compressed qual
|
337
|
+
// char *extras_comp; // compressed extras
|
338
|
+
long long beginH, gz_chunk=0;
|
339
|
+
int fastaS, qualS, extrasS=0;
|
340
|
+
char tmp[SEQ_METADATA];
|
341
|
+
int res=0;
|
342
|
+
int error=0;
|
343
|
+
|
344
|
+
int bufsize=150000;
|
345
|
+
|
346
|
+
// allocate memory for return data if necessary
|
347
|
+
if ( *fasta == NULL ) {*fasta=(char *)malloc(bufsize);strncpy(*fasta,"",bufsize);}
|
348
|
+
if ( *qual == NULL) {*qual=(char *)malloc(bufsize);strncpy(*qual,"",bufsize);}
|
349
|
+
if ( *extras == NULL ) {*extras=(char *)malloc(bufsize);strncpy(*extras,"",bufsize);}
|
350
|
+
|
351
|
+
// calc index name
|
352
|
+
snprintf(indexname,MAXFNAME,"%s.index",filename);
|
353
|
+
//FILE * filein=fopen(indexname,"r");
|
354
|
+
|
355
|
+
// open index file
|
356
|
+
gzFile gzfile_index=gzopen(indexname,"r");
|
357
|
+
if (gzfile_index==NULL) {
|
358
|
+
fprintf(stderr,"error opening gzfile_index :%s\n",gzerror(gzfile_index,&error));
|
359
|
+
return -2;
|
360
|
+
}
|
361
|
+
|
362
|
+
// Reads the index to this info, and the offset to its data
|
363
|
+
int reads=3;
|
364
|
+
while ( reads == 3 ) {
|
365
|
+
|
366
|
+
// read a chunk of data from index with the size of tmp
|
367
|
+
gzgets(gzfile_index,tmp,sizeof(tmp));
|
368
|
+
reads=sscanf(tmp,"%s %lld %lld",sname,&gz_chunk,&beginH);
|
369
|
+
|
370
|
+
|
371
|
+
|
372
|
+
|
373
|
+
if (( reads != 3 ) && ( reads!=EOF )) {
|
374
|
+
fprintf(stderr,"Error scanning index: %d\n",reads);
|
375
|
+
gzclose(gzfile_index);
|
376
|
+
return -3;
|
377
|
+
}
|
378
|
+
|
379
|
+
// sequence was finally found, exit loop
|
380
|
+
if ( strncmp(sname, seq_name,MAXSEQNAME)==0) reads=999; // to get out, seq found
|
381
|
+
}
|
382
|
+
|
383
|
+
// close index file
|
384
|
+
gzclose(gzfile_index);
|
385
|
+
|
386
|
+
// maybe sequence was not found
|
387
|
+
// fprintf(stderr,"Sequence not found\n");
|
388
|
+
if (reads==EOF) {return -4;}
|
389
|
+
|
390
|
+
// We get here if sequence was found
|
391
|
+
|
392
|
+
// open bin file to extract data
|
393
|
+
int dataf=open(filename, O_RDONLY);
|
394
|
+
|
395
|
+
// seek to chunk pos
|
396
|
+
// TODO- ¿como se salta el chunk?
|
397
|
+
res=lseek(dataf,gz_chunk,SEEK_SET);
|
398
|
+
|
399
|
+
// TODO check res
|
400
|
+
gzFile gzfile_bin=gzdopen(dataf,"r");
|
401
|
+
|
402
|
+
// seek to seq inside chunk
|
403
|
+
res=gzseek(gzfile_bin,beginH,SEEK_SET);
|
404
|
+
// TODO check res
|
405
|
+
|
406
|
+
// fasta=malloc(fastaO+1);
|
407
|
+
// qual=malloc(qualO+1);
|
408
|
+
// extras=malloc(extrasO+1);
|
409
|
+
// long pos=gzread(gzfile_bin,header,4);
|
410
|
+
// read sequence header
|
411
|
+
|
412
|
+
res=read_seq_header(gzfile_bin,NULL, &fastaS, &qualS, &extrasS);
|
413
|
+
|
414
|
+
|
415
|
+
|
416
|
+
long pos=gzread(gzfile_bin,*fasta,fastaS);
|
417
|
+
|
418
|
+
(*fasta)[fastaS]=0;
|
419
|
+
|
420
|
+
pos=gzread(gzfile_bin,*qual,qualS);
|
421
|
+
(*qual)[qualS]=0;
|
422
|
+
|
423
|
+
if (extrasS>0) {pos=gzread(gzfile_bin,*extras,extrasS); (*extras)[extrasS]=0;}
|
424
|
+
gzclose(gzfile_bin);
|
425
|
+
|
426
|
+
return 0;
|
427
|
+
}
|
428
|
+
|
429
|
+
void inspect_file_data_struct(struct file_data *file){
|
430
|
+
|
431
|
+
printf("file name:%s\n",file->name);
|
432
|
+
printf("file index_name:%s\n",file->index_name);
|
433
|
+
printf("file version:%d\n",file->version);
|
434
|
+
printf("file subversion:%d\n",file->subversion);
|
435
|
+
printf("error:%d\n",file->error);
|
436
|
+
/*
|
437
|
+
if (file->bin_search==TRUE) printf("file binary search is possible\n");
|
438
|
+
else printf("file binary search is not possible\n");
|
439
|
+
*/
|
440
|
+
|
441
|
+
}
|
442
|
+
|
443
|
+
// initialize the state for doing writes
|
444
|
+
// two modes:
|
445
|
+
// 1 .- new files
|
446
|
+
// 2 .- add data to files, if they don't exist they are created
|
447
|
+
int initialize_writes(struct file_data ** file, char *output_name, int mode)
|
448
|
+
{
|
449
|
+
|
450
|
+
// check if the files exists, in case it exists check if it has the
|
451
|
+
// correct metadata and if it is of the correct version
|
452
|
+
// in other case exits with an error
|
453
|
+
// struct file_data *file = malloc(sizeof(struct write_file));
|
454
|
+
if ( *file == NULL ) {*file=malloc(sizeof(struct file_data));}
|
455
|
+
|
456
|
+
(*file)->pos_chunk_gz=0;
|
457
|
+
|
458
|
+
int state=check_files(output_name);
|
459
|
+
if (state==1) {
|
460
|
+
fprintf(stderr,"File is from a different version\n");
|
461
|
+
return -1;
|
462
|
+
}
|
463
|
+
if ((state!=2)&&(state!=0)) {
|
464
|
+
fprintf(stderr,"Error %d when checking files\n",state);
|
465
|
+
return -1;
|
466
|
+
}
|
467
|
+
|
468
|
+
// copy the name of the file
|
469
|
+
strncpy((*file)->name,output_name,MAXFNAME);
|
470
|
+
|
471
|
+
// open the compressed files
|
472
|
+
int error=0;
|
473
|
+
int flags=O_WRONLY|O_CREAT|O_TRUNC;
|
474
|
+
if (mode==2) flags=O_RDWR;
|
475
|
+
// printf("mode:%d\n",mode);
|
476
|
+
|
477
|
+
//set index name
|
478
|
+
snprintf((*file)->index_name,MAXFNAME,"%s.index",(*file)->name);
|
479
|
+
|
480
|
+
//open index file
|
481
|
+
int file_index=open((*file)->index_name,flags,0644);
|
482
|
+
|
483
|
+
if (file_index==-1) return -2;
|
484
|
+
|
485
|
+
// open bin file
|
486
|
+
int file_bin=open((*file)->name,flags,0644);
|
487
|
+
// printf("fd:%d\n",file_bin);
|
488
|
+
if (file_bin==-1) {fprintf(stderr,"error opening file_bin for writting:%s\n",strerror(errno));return -2;}
|
489
|
+
if (mode==2) {
|
490
|
+
long long pos=lseek(file_index,0,SEEK_END);
|
491
|
+
if (pos==-1) {fprintf(stderr,"error going to end of index file %s\n",strerror(errno)); return -2;}
|
492
|
+
pos=lseek(file_bin,0,SEEK_END);
|
493
|
+
if (pos==-1) {fprintf(stderr,"error going to end of bin file %s\n",strerror(errno)); return -2;}
|
494
|
+
(*file)->pos_chunk_gz=pos;
|
495
|
+
}
|
496
|
+
|
497
|
+
// open zlib index file
|
498
|
+
(*file)->gzf_index=gzdopen(file_index,"w");
|
499
|
+
if ((*file)->gzf_index==NULL) {
|
500
|
+
fprintf(stderr,"error opening gzfile_index for writting:%s\n",gzerror((*file)->gzf_index,&error));
|
501
|
+
return -2;
|
502
|
+
}
|
503
|
+
|
504
|
+
// open zlib bin file
|
505
|
+
(*file)->gzf_bin=gzdopen(file_bin,"w");
|
506
|
+
if ((*file)->gzf_bin==NULL) {
|
507
|
+
fprintf(stderr,"error opening gzfile for writting:%s\n",gzerror((*file)->gzf_bin,&error));
|
508
|
+
return -2;
|
509
|
+
}
|
510
|
+
|
511
|
+
// initializes the files, writting the metadata
|
512
|
+
if (mode==1) {
|
513
|
+
char header[SEQ_METADATA];
|
514
|
+
(*file)->version=VERSION;
|
515
|
+
(*file)->subversion=SUBVERSION;
|
516
|
+
(*file)->error=0;
|
517
|
+
// TODO put correct size
|
518
|
+
snprintf(header,SEQ_METADATA-1,"9999UMACOMPRESSEDFORMAT_%d_%d %d %d %d", (*file)->version,(*file)->subversion, 0, 0, 0);
|
519
|
+
snprintf(header,SEQ_METADATA-1,"%4ldUMACOMPRESSEDFORMAT_%d_%d %d %d %d", strlen(header)-4,(*file)->version,(*file)->subversion, 0, 0, 0);
|
520
|
+
// snprintf(header,SEQ_METADATA-1,"%4ld%s %ld %ld %ld", strlen(metainfo)-4, (*file)->version,(*file)->subversion, 0, 0, 0);
|
521
|
+
|
522
|
+
// sprintf(header," 29UMACOMPRESSEDFORMAT_%d_%d 0 0 0\n",(*file)->version,(*file)->subversion);
|
523
|
+
int res=gzwrite((*file)->gzf_bin,header,strlen(header));
|
524
|
+
|
525
|
+
sprintf(header,"UMACOMPRESSEDFORMAT 1 0 0 999999999999 999999999999\n");
|
526
|
+
res=gzwrite((*file)->gzf_index,header,strlen(header));
|
527
|
+
}
|
528
|
+
(*file)->counter=0;
|
529
|
+
|
530
|
+
// printf("Init writes done\n");
|
531
|
+
return 0;
|
532
|
+
}
|
533
|
+
|
534
|
+
|
535
|
+
|
536
|
+
int close_writes(struct file_data *file)
|
537
|
+
{
|
538
|
+
gzclose(file->gzf_bin);
|
539
|
+
gzclose(file->gzf_index);
|
540
|
+
}
|
541
|
+
|
542
|
+
|
543
|
+
int process_biofile(char *fname, char *qfname, char *efname, char *outname)
|
544
|
+
{
|
545
|
+
|
546
|
+
char sname[MAXSEQNAME];// sequence name
|
547
|
+
char qname[MAXSEQNAME];// sequence name
|
548
|
+
char ename[MAXSEQNAME];// sequence name
|
549
|
+
char next_sname[MAXSEQNAME];// sequence name
|
550
|
+
char next_qname[MAXSEQNAME];// sequence name
|
551
|
+
char next_ename[MAXSEQNAME];// sequence name
|
552
|
+
|
553
|
+
char fasta[150000];
|
554
|
+
char qual[150000];
|
555
|
+
char extras[150000];
|
556
|
+
char extras_used[150000];
|
557
|
+
char next_fcomment[150000];
|
558
|
+
char next_qcomment[150000];
|
559
|
+
char next_ecomment[150000];
|
560
|
+
char tmp[150000];
|
561
|
+
int extras_bool=TRUE;
|
562
|
+
|
563
|
+
int cnt=1;
|
564
|
+
|
565
|
+
sprintf(extras_used,"INITIALIZED");
|
566
|
+
|
567
|
+
// Open fasta and qual files
|
568
|
+
FILE *file_fasta=fopen(fname,"r");
|
569
|
+
|
570
|
+
if (file_fasta==NULL) { fprintf(stderr,"error opening fasta file %s, result %d %s\n",fname,errno,strerror(errno));return -2;};
|
571
|
+
// setvbuf(file_fasta,NULL,_IONBF,0);
|
572
|
+
FILE *file_qual=fopen(qfname,"r");
|
573
|
+
if (file_qual==NULL) { fprintf(stderr,"error opening qual file %s, result %d %s\n",qfname,errno,strerror(errno));return -2;};
|
574
|
+
FILE *file_extras=fopen(efname,"r");
|
575
|
+
if (file_extras==NULL) {fprintf(stderr,"error opening extras file %s, result %d %s\n",efname,errno,strerror(errno)); extras_bool=FALSE;sprintf(extras,"");};
|
576
|
+
|
577
|
+
// setvbuf(file_qual,NULL,_IONBF,0);
|
578
|
+
int error=0;
|
579
|
+
int end=0; //0 is false
|
580
|
+
char *res;
|
581
|
+
|
582
|
+
// reads the name of the sequence from both
|
583
|
+
|
584
|
+
// fscanf(file_qual,">%9000s",qname);
|
585
|
+
// fscanf(file_fasta,">%9000s",sname);
|
586
|
+
|
587
|
+
|
588
|
+
res=fgets(tmp,150000,file_fasta);
|
589
|
+
if (res!=NULL) {
|
590
|
+
sscanf(tmp,">%9000s",sname);
|
591
|
+
strncpy(next_fcomment,tmp+strlen(sname)+2,150000);
|
592
|
+
}
|
593
|
+
|
594
|
+
res=fgets(tmp,150000,file_qual);
|
595
|
+
if (res!=NULL) {
|
596
|
+
sscanf(tmp,">%9000s",qname);
|
597
|
+
strncpy(next_qcomment,tmp+strlen(qname)+2,150000);
|
598
|
+
}
|
599
|
+
|
600
|
+
if ( extras_bool ) {
|
601
|
+
res=fgets(tmp,150000,file_extras);
|
602
|
+
if (res!=NULL) {
|
603
|
+
sscanf(tmp,">%9000s",ename);
|
604
|
+
strncpy(next_ecomment,tmp+strlen(ename)+2,150000);
|
605
|
+
} else sprintf(ename,"");
|
606
|
+
}
|
607
|
+
printf("extras seq:%s\n",ename);
|
608
|
+
|
609
|
+
printf("file:%s q:%s seqname:%s qseqname%s efname:%s extras:%s\n",fname, qfname,sname,qname,efname,extras);
|
610
|
+
printf("next_fcomment:%s next_qcomment:%s\n",next_fcomment,next_qcomment);
|
611
|
+
|
612
|
+
struct file_data *file=NULL;
|
613
|
+
int error2=initialize_writes(&file, outname,1);
|
614
|
+
|
615
|
+
// sprintf(next_fcomment,"");
|
616
|
+
// sprintf(next_qcomment,"");
|
617
|
+
|
618
|
+
while (!end) {
|
619
|
+
if ( strcmp(sname,qname)!=0 ) {error = -9; goto end;}
|
620
|
+
/*
|
621
|
+
if (extras_bool)
|
622
|
+
if ( strcmp(sname,ename)!=0 ) {error = -9; goto end;}
|
623
|
+
*/
|
624
|
+
// load the qual and fasta
|
625
|
+
|
626
|
+
sprintf(fasta,"");
|
627
|
+
sprintf(fasta,"%s",next_fcomment);
|
628
|
+
sprintf(next_fcomment,"");
|
629
|
+
sprintf(tmp,"");
|
630
|
+
res=fasta;
|
631
|
+
while (( res!=NULL ) && (tmp[0]!='>' )) {
|
632
|
+
res=fgets(tmp,150000,file_fasta);
|
633
|
+
if ((tmp[0]!='>')&&(res!=NULL)) sprintf (fasta,"%s%s",fasta,tmp);
|
634
|
+
else if (res!=NULL) {sscanf(tmp,">%9000s",next_sname); strncpy(next_fcomment,tmp+strlen(next_sname)+2,sizeof(next_fcomment));}
|
635
|
+
}
|
636
|
+
if (res==NULL) end=1;
|
637
|
+
|
638
|
+
sprintf(qual,"");
|
639
|
+
sprintf(qual,"%s",next_qcomment);
|
640
|
+
sprintf(next_qcomment,"");
|
641
|
+
res=qual;
|
642
|
+
sprintf(tmp,"");
|
643
|
+
while (( res!=NULL ) && (tmp[0]!='>' )) {
|
644
|
+
res=fgets(tmp,150000,file_qual);
|
645
|
+
if ((tmp[0]!='>')&&(res!=NULL)) sprintf (qual,"%s%s",qual,tmp);
|
646
|
+
else if (res!=NULL) {sscanf(tmp,">%9000s",next_qname); strncpy(next_qcomment,tmp+strlen(next_qname)+2,sizeof(next_qcomment));}
|
647
|
+
}
|
648
|
+
if (res==NULL) end=1;
|
649
|
+
|
650
|
+
// If extra_used!=NULL then it means that it has been used and a new one must be read
|
651
|
+
if (extras_bool && (strcmp(extras_used,"")!=0)) {
|
652
|
+
sprintf(extras,"");
|
653
|
+
sprintf(extras,"%s",next_ecomment);
|
654
|
+
sprintf(next_ecomment,"");
|
655
|
+
res=extras;
|
656
|
+
sprintf(tmp,"");
|
657
|
+
while (( res!=NULL ) && (tmp[0]!='>' )) {
|
658
|
+
res=fgets(tmp,150000,file_extras);
|
659
|
+
if ((tmp[0]!='>')&&(res!=NULL)) sprintf (extras,"%s%s",extras,tmp);
|
660
|
+
else if (res!=NULL) {sscanf(tmp,">%9000s",next_ename); strncpy(next_ecomment,tmp+strlen(next_ename)+2,sizeof(next_ecomment));}
|
661
|
+
}
|
662
|
+
//if (res==NULL) end=1; Extras file can be finished and processing will continue
|
663
|
+
}
|
664
|
+
|
665
|
+
/* If the name of the name is equal to the name of the actual sequence then it will be used for writting */
|
666
|
+
if ( strcmp(sname,ename)==0 ) {
|
667
|
+
strcpy(extras_used,extras);
|
668
|
+
strcpy(ename,next_ename);
|
669
|
+
} else sprintf(extras_used,"");
|
670
|
+
|
671
|
+
int error_wr=write_seq(file,sname, fasta,qual,extras_used);
|
672
|
+
if (error_wr!=0) { end=1;error=error_wr; };
|
673
|
+
if (error_wr==0) cnt++;
|
674
|
+
strcpy(sname,next_sname);
|
675
|
+
strcpy(qname,next_qname);
|
676
|
+
|
677
|
+
}
|
678
|
+
|
679
|
+
// repeat until EOF or error
|
680
|
+
end:
|
681
|
+
fclose(file_fasta);
|
682
|
+
fclose(file_qual);
|
683
|
+
close_writes(file);
|
684
|
+
//fclose(file_index);
|
685
|
+
// print_seqs(seql);
|
686
|
+
return error;
|
687
|
+
}
|
688
|
+
|
689
|
+
|
690
|
+
/*
 * Loads the two dictionary files "fasta.dic" and "qual.dic" from the current
 * working directory into caller-supplied buffers.
 *
 * d_fasta: destination for up to `size` bytes of fasta.dic.
 * d_qual:  destination for up to `size` bytes of qual.dic.
 * size:    number of bytes to read from each file; values <= 0 read nothing.
 *
 * A file shorter than `size` is not treated as an error; the remaining bytes
 * of the destination are left untouched.
 *
 * Returns 0 on success, -2 when either file cannot be opened or read.
 */
int init_dicts(char *d_fasta,char *d_qual,int size)
{
    const char *dict_f = "fasta.dic";
    const char *dict_q = "qual.dic";
    size_t want = size > 0 ? (size_t)size : 0;
    size_t got;

    FILE *f_d_fasta = fopen(dict_f, "r");
    if (f_d_fasta == NULL) {
        fprintf(stderr, "error opening fasta file %s, result %d %s\n", dict_f, errno, strerror(errno));
        return -2;
    }
    got = fread(d_fasta, 1, want, f_d_fasta);
    if (got < want && ferror(f_d_fasta)) {
        fprintf(stderr, "error reading fasta file %s\n", dict_f);
        fclose(f_d_fasta);
        return -2;
    }
    fclose(f_d_fasta);

    FILE *f_d_qual = fopen(dict_q, "r");
    if (f_d_qual == NULL) {
        fprintf(stderr, "error opening qual file %s, result %d %s\n", dict_q, errno, strerror(errno));
        return -2;
    }
    /* Read from the qual stream itself; the fasta stream is already closed. */
    got = fread(d_qual, 1, want, f_d_qual);
    if (got < want && ferror(f_d_qual)) {
        fprintf(stderr, "error reading qual file %s\n", dict_q);
        fclose(f_d_qual);
        return -2;
    }
    fclose(f_d_qual);

    return 0;
}
|
704
|
+
|
705
|
+
|
706
|
+
|
707
|
+
|