scbi_fqbin 0.2.2 → 0.2.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (116) hide show
  1. checksums.yaml +7 -0
  2. data/.DS_Store +0 -0
  3. data/.gitignore +14 -0
  4. data/Gemfile +4 -0
  5. data/LICENSE.txt +22 -0
  6. data/{README.rdoc → README.md} +0 -0
  7. data/Rakefile +8 -28
  8. data/lib/scbi_fqbin.rb +3 -5
  9. data/lib/scbi_fqbin/fastabin.rb +411 -0
  10. data/lib/scbi_fqbin/fastq_file_c.rb +373 -0
  11. data/lib/scbi_fqbin/fbin_file.rb +1 -1
  12. data/lib/scbi_fqbin/t.rb +9 -0
  13. data/lib/scbi_fqbin/t2.rb +12 -0
  14. data/lib/scbi_fqbin/version.rb +3 -0
  15. data/lib_fqbin_src.zip +0 -0
  16. data/lib_fqbin_src/Makefile +66 -0
  17. data/lib_fqbin_src/fq +0 -0
  18. data/lib_fqbin_src/fq.c +165 -0
  19. data/lib_fqbin_src/hash_fqbin +0 -0
  20. data/lib_fqbin_src/hash_fqbin.c +212 -0
  21. data/lib_fqbin_src/idx_fqbin +21 -0
  22. data/lib_fqbin_src/iterate_fqbin +0 -0
  23. data/lib_fqbin_src/iterate_fqbin.c +136 -0
  24. data/lib_fqbin_src/lib_fqbin.c +1748 -0
  25. data/lib_fqbin_src/lib_fqbin.h +194 -0
  26. data/lib_fqbin_src/mk_fqbin +0 -0
  27. data/lib_fqbin_src/mk_fqbin.c +138 -0
  28. data/lib_fqbin_src/other/bwxform.c +915 -0
  29. data/lib_fqbin_src/other/bwxform.h +74 -0
  30. data/lib_fqbin_src/other/find_in_index.c +130 -0
  31. data/lib_fqbin_src/other/hash_fbin_nogzchunks.c +164 -0
  32. data/lib_fqbin_src/other/idx_fqbin +0 -0
  33. data/lib_fqbin_src/other/idx_fqbin.c +67 -0
  34. data/lib_fqbin_src/other/make_hsh.sh +14 -0
  35. data/lib_fqbin_src/other/rd_extras_fbin.c +45 -0
  36. data/lib_fqbin_src/read_fq +0 -0
  37. data/lib_fqbin_src/read_fq.c +143 -0
  38. data/lib_fqbin_src/read_fqbin +0 -0
  39. data/lib_fqbin_src/read_fqbin.c +101 -0
  40. data/lib_fqbin_src/sort_index +9 -0
  41. data/lib_fqbin_src/test.rb +13 -0
  42. data/scbi_fqbin.gemspec +25 -0
  43. data/test/build.rake +15 -0
  44. data/test/fbinfile +0 -0
  45. data/test/fbinfile.index +0 -0
  46. data/test/no_test_fill_file.rb +66 -0
  47. data/test/old/app.rb +43 -0
  48. data/test/old/bin/iterate_fastabin.rb +54 -0
  49. data/test/old/bin/mk_fastabin.rb +22 -0
  50. data/test/old/bin/rd_fastabin.rb +36 -0
  51. data/test/old/bin/rd_fq.rb +20 -0
  52. data/test/old/bioruby.rb +27 -0
  53. data/test/old/c/Makefile +34 -0
  54. data/test/old/c/fbin_lib.zip +0 -0
  55. data/test/old/c/iterate_fbin.c +54 -0
  56. data/test/old/c/libreria_gz.c +707 -0
  57. data/test/old/c/libreria_gz.h +127 -0
  58. data/test/old/c/main.c +86 -0
  59. data/test/old/c/mk_fbin.c +24 -0
  60. data/test/old/c/rd_seq_fbin.c +44 -0
  61. data/test/old/c/test_ffi/a.out +0 -0
  62. data/test/old/c/test_ffi/app.c +26 -0
  63. data/test/old/c/test_ffi/app.rb +19 -0
  64. data/test/old/c/test_ffi/liblibreria_gz.dylib +0 -0
  65. data/test/old/c/test_ffi/libmylibrary.dylib +0 -0
  66. data/test/old/c/test_ffi/my_library.rb +23 -0
  67. data/test/old/c/test_ffi/mylibrary.c +22 -0
  68. data/test/old/c/test_ffi/mylibrary.h +6 -0
  69. data/test/old/c/usage_instructions.txt +62 -0
  70. data/test/old/ext/Makefile +187 -0
  71. data/test/old/ext/Makefile.dario +34 -0
  72. data/test/old/ext/extconf.rb +8 -0
  73. data/test/old/ext/mk_fbin.c +24 -0
  74. data/test/old/ext/sample/extras.txt +4 -0
  75. data/{.gemtest → test/old/ext/sample/extras2.txt} +0 -0
  76. data/test/old/ext/sample/f1.fasta +10 -0
  77. data/test/old/ext/sample/f1.fasta.qual +10 -0
  78. data/test/old/ext/sample/f1.fbin +0 -0
  79. data/test/old/ext/sample/f1.fbin.index +0 -0
  80. data/test/old/ext/sample/main.c +86 -0
  81. data/test/old/ext/usage_instructions.txt +62 -0
  82. data/test/old/t_scbi_fastabin.rb +140 -0
  83. data/test/read_tests/10-original_sizes.sh +16 -0
  84. data/test/read_tests/20-fq_time.sh +23 -0
  85. data/test/read_tests/30-fbin_read_time.sh +23 -0
  86. data/test/read_tests/40-bsc_read_time.sh +21 -0
  87. data/test/read_tests/50-fq_time_x4.sh +25 -0
  88. data/test/read_tests/60-fbin_read_time_x4.sh +24 -0
  89. data/test/read_tests/70-bsc_read_time_x4.sh +32 -0
  90. data/test/results_bio_scbi_fasta.txt +11 -0
  91. data/test/{test_scbi_fbin_file.rb → scbi_fbin_file_test.rb} +0 -0
  92. data/test/speed.txt +81 -0
  93. data/test/t_scbi_fasta.rb +12 -0
  94. data/test/write_tests/10-original_sizes.sh +16 -0
  95. data/test/write_tests/20-zip_time.sh +17 -0
  96. data/test/write_tests/30-mk_fbin_time.sh +23 -0
  97. data/test/write_tests/31-mk_fbin_time_f30.sh +21 -0
  98. data/test/write_tests/40-gzip_time.sh +16 -0
  99. data/test/write_tests/41-bsc_time.sh +16 -0
  100. data/test/write_tests/50-zip_sizes.sh +16 -0
  101. data/test/write_tests/60-fbin_sizes.sh +17 -0
  102. data/test/write_tests/61-fbin_sizes_f30.sh +16 -0
  103. data/test/write_tests/70-gzip_sizes.sh +17 -0
  104. data/test/write_tests/80-bsc_sizes.sh +17 -0
  105. data/website/index.html +87 -0
  106. data/website/index.txt +81 -0
  107. data/website/javascripts/rounded_corners_lite.inc.js +285 -0
  108. data/website/stylesheets/screen.css +159 -0
  109. data/website/template.html.erb +50 -0
  110. metadata +208 -95
  111. data/History.txt +0 -19
  112. data/Manifest.txt +0 -12
  113. data/PostInstall.txt +0 -7
  114. data/script/console +0 -10
  115. data/script/destroy +0 -14
  116. data/script/generate +0 -14
data/lib_fqbin_src/fq ADDED
Binary file
@@ -0,0 +1,165 @@
1
+
2
+ #include "lib_fqbin.h"
3
+ #include <stdio.h>
4
+ #include <ctype.h>
5
+
6
+ #include <unistd.h>
7
+
8
+
9
+ // process a fastq file adding it to fbin file
10
+ int iterate_fastq(char *fname, int only_extras, int output_fasta, int output_extras)
11
+ {
12
+
13
+ // allocate strings
14
+ char *name;
15
+ if ((name = malloc(MAXSEQNAME)) == NULL) {
16
+ puts("Memory allocation error!");
17
+ return EXIT_FAILURE;
18
+ }
19
+
20
+ char *fasta;
21
+ if ((fasta = malloc(MAXSEQLENGTH)) == NULL) {
22
+ puts("Memory allocation error!");
23
+ return EXIT_FAILURE;
24
+ }
25
+
26
+ char *qual;
27
+ if ((qual = malloc(MAXSEQLENGTH)) == NULL) {
28
+ puts("Memory allocation error!");
29
+ return EXIT_FAILURE;
30
+ }
31
+
32
+ char *comments;
33
+ if ((comments = malloc(MAXSEQLENGTH)) == NULL) {
34
+ puts("Memory allocation error!");
35
+ return EXIT_FAILURE;
36
+ }
37
+
38
+ static time_t curr_time=0;
39
+ static time_t prev_time=0;
40
+
41
+ prev_time=time(NULL);
42
+
43
+ FILE *fastq_file=NULL;
44
+
45
+
46
+ int valid=0;
47
+ int res=0;
48
+ int r=0;
49
+ // Open fasta and qual files
50
+ if (strcmp(fname,"-")==0){
51
+ fastq_file=stdin;
52
+ }else{
53
+
54
+ open_file(fname,&fastq_file);
55
+ }
56
+
57
+ if (fastq_file==NULL){
58
+ printf("TRESb\n");
59
+ }
60
+
61
+ // for each sequence on fastq file
62
+ while (valid=get_next_seq_fastq(fastq_file,&name,&fasta,&qual,&comments)){
63
+ if(valid==1)
64
+ {
65
+ r++;
66
+
67
+ if (!only_extras){
68
+ if (output_fasta){
69
+ printf(">%s %s\n%s\n", name, comments, fasta);
70
+ }else{
71
+ printf("@%s %s\n%s\n", name,comments, fasta);
72
+ printf("+%s\n%s\n",name,qual);
73
+ }
74
+ }
75
+
76
+ // if ((extras!=NULL) && (output_extras)) printf ("EXTRAS:%s\n",extras);
77
+
78
+ }else{
79
+ fprintf(stderr,"Invalid sequence found %s. Aborting import.\n",name);
80
+ res=-1;
81
+ break;
82
+ }
83
+
84
+ // if ((r%10000)==0) {
85
+ // }
86
+
87
+ }
88
+
89
+ curr_time=time(NULL);
90
+ printf("\nEnd fastq processing. %d seqs in %.0f s. Rate: %8.2f seqs/s\n",r,difftime(curr_time,prev_time),r/difftime(curr_time,prev_time));
91
+
92
+ // free mem
93
+ free(name);
94
+ free(fasta);
95
+ free(qual);
96
+ free(comments);
97
+
98
+ // close files
99
+ fclose(fastq_file);
100
+
101
+ return res;
102
+ }
103
+
104
// Print the command-line help of fq to stdout and terminate the process with
// exit status -1. The program takes a single FASTQ file name ("-" selects
// standard input), which is what the usage line now documents.
void usage(void){
    printf("Usage: fq [-f][-e][-E] fastq_file\n\n");
    printf(" fastq_file  FASTQ file to read, or - to read standard input\n\n");
    printf(" -f Output sequence in fasta format\n");
    printf(" -e Output extras for sequence\n");
    printf(" -E Output only extras for sequence\n");

    exit(-1);
}
113
+
114
+
115
+
116
+ /*******************************************************/
117
+ /* main */
118
+ /*******************************************************/
119
+ int main(int argc, char *argv[])
120
+ {
121
+
122
+ int ch;
123
+
124
+ int output_fasta = 0;
125
+ int output_extras = 0;
126
+ int only_extras = 0;
127
+
128
+ while ((ch = getopt(argc, argv, "feEh")) != -1) {
129
+ switch (ch) {
130
+ case 'f':
131
+ output_fasta = 1;
132
+ break;
133
+ case 'e':
134
+ output_extras = 1;
135
+ break;
136
+ case 'E':
137
+ output_extras = 1;
138
+ only_extras = 1;
139
+ break;
140
+
141
+ case 'h':
142
+ usage();
143
+ break;
144
+ case '?':
145
+ default:
146
+ usage();
147
+ }
148
+ }
149
+
150
+ argc -= optind;
151
+ argv += optind;
152
+
153
+
154
+ // check params
155
+ if (argc<1)
156
+ {
157
+ usage();
158
+ return -1;
159
+ }
160
+
161
+ int res=iterate_fastq(argv[0],only_extras, output_fasta, output_extras);
162
+
163
+ return res;
164
+ }
165
+
Binary file
@@ -0,0 +1,212 @@
1
+
2
+ #include <stdio.h>
3
+ #include <string.h>
4
+ #include <time.h>
5
+
6
+
7
+ #include <sys/types.h>
8
+ #include <sys/stat.h>
9
+ #include <fcntl.h>
10
+ #include <errno.h>
11
+
12
+ #include <zlib.h>
13
+ #include <zlib.h>
14
+ #include <stdlib.h>
15
+
16
+ // Maximum file name (including .idx)
17
+ #define MAXFNAME 512
18
+
19
+ // Maximum length of the name of a sequence
20
+ #define MAXSEQNAME 1024
21
+ #define MAXSEQLENGTH 150000000
22
+ #define DEBUG 1
23
+ #define FALSE 0
24
+ #define TRUE 1
25
+
26
+ #define INVALID_FASTQ_FORMAT -5
27
+ #define INVALID_FASTA_FORMAT -6
28
+
29
+ #define SEQ_METADATA 10000
30
+
31
+
32
// Return the offset of the end of `filename` (its current size in bytes),
// i.e. the position at which the next appended chunk will start.
// Returns -1 and prints a diagnostic to stderr if the file cannot be opened
// or the seek fails.
long long last_chunk_file(char *filename){

    // the file is only inspected, never written, so open it read-only
    int file = open(filename, O_RDONLY);
    if (file == -1) {
        fprintf(stderr,"error %d opening file %s :%s\n",errno,filename,strerror(errno));
        return -1;
    }

    //goto end of file
    long long pos = (long long)lseek(file, 0, SEEK_END);
    if (pos == -1) {
        int saved_errno = errno; // close() below may overwrite errno
        close(file);
        fprintf(stderr,"error %d seeking file %s :%s\n",saved_errno,filename,strerror(saved_errno));
        return -1;
    }

    close(file);

    return pos;
}
45
+
46
+ // creates a hash from an index file with the desired chunk size. Chunk size can be
47
+ // adjusted to fit a good compromise between access speed and used space.
48
+ int hash_index_file(char *filename, int chunk_size, int skip_sort)
49
+ {
50
+
51
+ char hash_file_name[MAXFNAME];
52
+ char indexname[MAXFNAME];
53
+ char sorted_indexname[MAXFNAME];
54
+
55
+ char sname[MAXSEQNAME];// sequence name
56
+ long long beginH, gz_chunk=0;
57
+ char tmp[SEQ_METADATA];
58
+ int res=0;
59
+ int error;
60
+
61
+ // to save min, max sequences and current chunk
62
+ char min_name[MAXSEQNAME];
63
+ char max_name[MAXSEQNAME];
64
+ long long current_chunk=0;
65
+ long long count=0;
66
+
67
+
68
+ strcpy(min_name,"");
69
+ strcpy(max_name,"");
70
+
71
+ // calc index and hash name
72
+ snprintf(indexname,MAXFNAME,"%s.index",filename);
73
+ snprintf(hash_file_name,MAXFNAME,"%s.index.hash",filename);
74
+
75
+ // sort index file by external command
76
+ if(skip_sort==0)
77
+ {
78
+ char cmd[10000];
79
+ snprintf(cmd,10000,"sort_index %s",indexname);
80
+ system(cmd);
81
+ }
82
+
83
+ // use sorted index
84
+ snprintf(sorted_indexname,MAXFNAME,"%s.index.sorted",filename);
85
+
86
+ // open hash file
87
+ gzFile gzhash_file=gzopen(hash_file_name,"wb");
88
+
89
+ // open sorted index file
90
+ gzFile gzsorted_file_index=gzopen(sorted_indexname,"r");
91
+
92
+ // open output index file
93
+ // int file_index=open(indexname,flags,0644);
94
+ gzFile gzfile_index=gzopen(indexname,"w");
95
+
96
+ // write header
97
+ gzprintf(gzfile_index,"UMACOMPRESSEDFORMAT 1 0 0 999999999999 999999999999\n");
98
+
99
+ //reopen
100
+ gzclose(gzfile_index);
101
+ gzfile_index=gzopen(indexname,"ab");
102
+
103
+
104
+ if (gzsorted_file_index==NULL) {
105
+ fprintf(stderr,"error opening gzsorted_file_index :%s\n",gzerror(gzsorted_file_index,&error));
106
+ return -2;
107
+ }
108
+
109
+ if (gzfile_index==NULL) {
110
+ fprintf(stderr,"error opening gzfile_index :%s\n",gzerror(gzfile_index,&error));
111
+ return -2;
112
+ }
113
+
114
+ if (gzhash_file==NULL) {
115
+ fprintf(stderr,"error opening gzhash_file :%s\n",gzerror(gzhash_file,&error));
116
+ return -2;
117
+ }
118
+
119
+ // podria leerse saltando linea 1, y luego leyendo 10000 lineas sin sscanf
120
+
121
+ // repeat until EOF
122
+ while ( gzgets(gzsorted_file_index,tmp,sizeof(tmp))!=Z_NULL ) {
123
+
124
+ // parse string
125
+ sscanf(tmp,"%s %lld %lld",sname,&gz_chunk,&beginH);
126
+
127
+ if(strcmp(sname,"UMACOMPRESSEDFORMAT")!=0) // valid index line
128
+ {
129
+
130
+ // clear chunk_data if any
131
+ // if (gz_chunk!=current_chunk){
132
+ if ((count%chunk_size)==0){
133
+ if (strcmp(min_name,"")!=0){
134
+ // there is data to write
135
+ res=gzprintf(gzhash_file,"%s %s %lld\n",min_name,max_name,current_chunk);
136
+ }
137
+
138
+ strcpy(min_name,"");
139
+ strcpy(max_name,"");
140
+ // current_chunk=gz_chunk;
141
+ current_chunk = gztell(gzfile_index);
142
+
143
+ //reopen new gzchunk
144
+ gzclose(gzfile_index);
145
+ current_chunk = last_chunk_file(indexname);
146
+ gzfile_index=gzopen(indexname,"ab");
147
+
148
+ }
149
+
150
+ // write line to current gzchunk in index
151
+ gzprintf(gzfile_index,tmp);
152
+
153
+ // save min_name
154
+ if((strcmp(min_name,"")==0) || (strcmp(sname,min_name)<0))
155
+ {
156
+ // replace min_name
157
+ strcpy(min_name,sname);
158
+ }
159
+
160
+ //save max_name
161
+ if((strcmp(max_name,"")==0) || (strcmp(sname,max_name)>0))
162
+ {
163
+ strcpy(max_name,sname);
164
+ }
165
+
166
+ count++;
167
+ }
168
+
169
+ }
170
+
171
+ if (strcmp(min_name,"")!=0){
172
+ // there are data to write
173
+ res=gzprintf(gzhash_file,"%s %s %lld\n",min_name,max_name,current_chunk);
174
+ }
175
+
176
+ // close files
177
+ gzclose(gzhash_file);
178
+ gzclose(gzfile_index);
179
+ gzclose(gzsorted_file_index);
180
+
181
+ return 0;
182
+ }
183
+
184
+
185
/*
 * hash_fqbin entry point.
 *
 * Usage: hash_fqbin fqbin_file [chunk_size [--skip_sort]]
 *
 * chunk_size defaults to 10000 and must be a positive integer. A fourth
 * argument disables the external sort step. Returns -1 on bad arguments,
 * otherwise the result of hash_index_file().
 */
int main(int argc, char *argv[])
{
    // check params
    if (argc<2)
    {
        printf("Usage %s fqbin_file [chunk_size [--skip_sort]]\n\n",argv[0]);
        return -1;
    }

    int chunk_size=10000;
    int skip_sort=0;

    // chunk_size is honoured whenever it is present, also when --skip_sort
    // follows it
    if (argc>=3){
        char *end=NULL;
        errno=0;
        long value=strtol(argv[2],&end,10);
        chunk_size=(int)value;

        // reject empty/non-numeric text, trailing junk, overflow and values
        // that are not strictly positive (0 would later divide by zero)
        if (end==argv[2] || *end!='\0' || errno==ERANGE || value<=0 || chunk_size!=value){
            fprintf(stderr,"Invalid chunk size: %s\n",argv[2]);
            return -1;
        }
    }

    // any fourth argument turns the sort step off
    if (argc>=4){
        skip_sort=1;
    }

    int res=hash_index_file(argv[1],chunk_size, skip_sort);

    return res;
}
212
+
@@ -0,0 +1,21 @@
1
#!/usr/bin/env bash
# Regenerate the index of an fqbin file that has none.
#
# Usage: idx_fqbin fqbin_file.fqbin
#
# The data file is moved aside as "<file>.old", streamed through
# iterate_fqbin into "mk_fqbin -i -o <file>", and then the original data file
# is moved back over the regenerated one.
# NOTE(review): presumably mk_fqbin leaves "<file>.index" behind - confirm.

if [ ! -e "$1" ]; then
  echo "File $1 does not exists"
  echo "Usage: $0 fqbin_file.fqbin"
  exit 1
fi


if [ -e "$1.index" ]; then
  # nothing to do
  echo "Index file $1.index already exists"
  exit 0
fi

echo "Backing up file as $1.old"
# Stop if the backup fails: the steps below delete "$1", which would then
# still be the only copy of the data.
mv -- "$1" "$1.old" || exit 1

echo "Regenerating index"
iterate_fqbin "$1.old" | mk_fqbin -i -o "$1"
# exit status of mk_fqbin (last command of the pipeline)
status=$?

# Always put the original data file back, whatever happened above.
rm -f -- "$1"
mv -- "$1.old" "$1" || exit 1

exit "$status"
Binary file
@@ -0,0 +1,136 @@
1
+ #include "lib_fqbin.h"
2
+ #include <stdio.h>
3
+ #include <ctype.h>
4
+ #include <string.h>
5
+ #include <stdlib.h>
6
+ #include <unistd.h>
7
+
8
+
9
// Write the iterate_fqbin help text to stdout, then end the process with
// exit status -1.
void usage(){
    static const char *const help_lines[] = {
        "Usage: iterate_fqbin [-F|-q] fqbin_file\n\n",
        "By default outputs in fastq format\n\n",
        " -F Output only sequence in fasta format\n",
        " -q Output only qualities in phred format\n\n",
        " -e Output only extras\n\n",
    };
    size_t line_count = sizeof(help_lines) / sizeof(help_lines[0]);

    for (size_t k = 0; k < line_count; k++) {
        fputs(help_lines[k], stdout);
    }

    exit(-1);
}
20
+
21
// Print every record returned by read_data_sequential() for `filed`.
//
// Exactly one output mode is used per record, checked in this order:
//   only_fasta   ">name extras" followed by the sequence in 70-column lines
//   only_qual    ">name extras" followed by the qualities as two-digit
//                numbers (character code minus 33), 30 per line
//   only_extras  ">name extras" followed by the extras on their own line
//   otherwise    a four-line FASTQ record with a bare "+" separator
//
// Fields that come back as NULL are printed as empty text instead of being
// passed to strlen()/printf("%s"), which is undefined behaviour.
//
// Returns the first non-zero value returned by read_data_sequential(), which
// is what ends the loop.
int print_file(struct file_data *filed, int only_fasta, int only_qual, int only_extras){
    char *sname=NULL;
    char *fasta=NULL;
    char *qual=NULL;
    char *extras=NULL;

    int res=0;

    while ((res=read_data_sequential(filed, &sname, &fasta, &qual, &extras))==0)
    {
        // NULL-safe views of the fields, used only for printing
        const char *name_txt = (sname!=NULL) ? sname : "";
        const char *fasta_txt = (fasta!=NULL) ? fasta : "";
        const char *qual_txt = (qual!=NULL) ? qual : "";
        const char *extras_txt = (extras!=NULL) ? extras : "";

        if (only_fasta){
            printf(">%s %s\n", name_txt, extras_txt);

            // wrap the sequence at 70 characters per line
            size_t len=strlen(fasta_txt);
            for(size_t i = 0; i < len; i+=70)
            {
                printf("%.70s\n", fasta_txt+i);
            }
        }else if (only_qual){
            printf(">%s %s\n", name_txt, extras_txt);

            // 30 values per line, with a final newline after the last value
            size_t len=strlen(qual_txt);
            for(size_t i = 0; i < len; i++)
            {
                printf("%02d ", qual_txt[i]-33);
                if (((i+1)%30 == 0) || (i==len-1)) printf("\n");
            }
        }else if (only_extras){
            printf(">%s %s\n", name_txt, extras_txt);
            if (extras!=NULL) printf ("%s\n",extras);
        }else{
            printf("@%s %s\n%s\n", name_txt, extras_txt, fasta_txt);
            printf("+\n%s\n",qual_txt);
        }

        // release the per-record buffers before the next read
        // NOTE(review): sname is not freed here, as in the original code -
        // confirm whether read_data_sequential() keeps ownership of it.
        free(fasta); fasta=NULL;
        free(qual); qual=NULL;
        free(extras); extras=NULL;
    }

    return res;
}
74
+
75
+
76
+
77
+ /*******************************************************/
78
+ /* main */
79
+ /*******************************************************/
80
+ int main(int argc, char *argv[])
81
+ {
82
+
83
+ //gzFile gzf_bin;
84
+ // struct file_data filed;
85
+
86
+ struct file_data *filed=NULL;
87
+
88
+ int ch;
89
+
90
+ int output_fasta = 0;
91
+ int output_qual = 0;
92
+ int output_extras = 0;
93
+
94
+ while ((ch = getopt(argc, argv, "Fqeh")) != -1) {
95
+ switch (ch) {
96
+ case 'F':
97
+ output_fasta = 1;
98
+ break;
99
+ case 'q':
100
+ output_qual=1;
101
+ break;
102
+ case 'e':
103
+ output_extras=1;
104
+ break;
105
+ case 'h':
106
+ usage();
107
+ break;
108
+ case '?':
109
+ default:
110
+ usage();
111
+ }
112
+ }
113
+
114
+ argc -= optind;
115
+ argv += optind;
116
+ // printf("argc: %d", argc);
117
+ // printf("argv: %s", argv[0]);
118
+
119
+ if (argc!=1)
120
+ {
121
+ usage();
122
+ }
123
+
124
+ // initialize reads
125
+ if (initialize_sequential_reads(&filed, argv[0])==-1){
126
+ printf("File %s does not exists",argv[0]);
127
+ exit(-1);
128
+ }
129
+
130
+ int res=print_file(filed,output_fasta,output_qual,output_extras);
131
+
132
+ close_sequential_reads(filed);
133
+
134
+ return res;
135
+ }
136
+