scbi_fqbin 0.2.2 → 0.2.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (116) hide show
  1. checksums.yaml +7 -0
  2. data/.DS_Store +0 -0
  3. data/.gitignore +14 -0
  4. data/Gemfile +4 -0
  5. data/LICENSE.txt +22 -0
  6. data/{README.rdoc → README.md} +0 -0
  7. data/Rakefile +8 -28
  8. data/lib/scbi_fqbin.rb +3 -5
  9. data/lib/scbi_fqbin/fastabin.rb +411 -0
  10. data/lib/scbi_fqbin/fastq_file_c.rb +373 -0
  11. data/lib/scbi_fqbin/fbin_file.rb +1 -1
  12. data/lib/scbi_fqbin/t.rb +9 -0
  13. data/lib/scbi_fqbin/t2.rb +12 -0
  14. data/lib/scbi_fqbin/version.rb +3 -0
  15. data/lib_fqbin_src.zip +0 -0
  16. data/lib_fqbin_src/Makefile +66 -0
  17. data/lib_fqbin_src/fq +0 -0
  18. data/lib_fqbin_src/fq.c +165 -0
  19. data/lib_fqbin_src/hash_fqbin +0 -0
  20. data/lib_fqbin_src/hash_fqbin.c +212 -0
  21. data/lib_fqbin_src/idx_fqbin +21 -0
  22. data/lib_fqbin_src/iterate_fqbin +0 -0
  23. data/lib_fqbin_src/iterate_fqbin.c +136 -0
  24. data/lib_fqbin_src/lib_fqbin.c +1748 -0
  25. data/lib_fqbin_src/lib_fqbin.h +194 -0
  26. data/lib_fqbin_src/mk_fqbin +0 -0
  27. data/lib_fqbin_src/mk_fqbin.c +138 -0
  28. data/lib_fqbin_src/other/bwxform.c +915 -0
  29. data/lib_fqbin_src/other/bwxform.h +74 -0
  30. data/lib_fqbin_src/other/find_in_index.c +130 -0
  31. data/lib_fqbin_src/other/hash_fbin_nogzchunks.c +164 -0
  32. data/lib_fqbin_src/other/idx_fqbin +0 -0
  33. data/lib_fqbin_src/other/idx_fqbin.c +67 -0
  34. data/lib_fqbin_src/other/make_hsh.sh +14 -0
  35. data/lib_fqbin_src/other/rd_extras_fbin.c +45 -0
  36. data/lib_fqbin_src/read_fq +0 -0
  37. data/lib_fqbin_src/read_fq.c +143 -0
  38. data/lib_fqbin_src/read_fqbin +0 -0
  39. data/lib_fqbin_src/read_fqbin.c +101 -0
  40. data/lib_fqbin_src/sort_index +9 -0
  41. data/lib_fqbin_src/test.rb +13 -0
  42. data/scbi_fqbin.gemspec +25 -0
  43. data/test/build.rake +15 -0
  44. data/test/fbinfile +0 -0
  45. data/test/fbinfile.index +0 -0
  46. data/test/no_test_fill_file.rb +66 -0
  47. data/test/old/app.rb +43 -0
  48. data/test/old/bin/iterate_fastabin.rb +54 -0
  49. data/test/old/bin/mk_fastabin.rb +22 -0
  50. data/test/old/bin/rd_fastabin.rb +36 -0
  51. data/test/old/bin/rd_fq.rb +20 -0
  52. data/test/old/bioruby.rb +27 -0
  53. data/test/old/c/Makefile +34 -0
  54. data/test/old/c/fbin_lib.zip +0 -0
  55. data/test/old/c/iterate_fbin.c +54 -0
  56. data/test/old/c/libreria_gz.c +707 -0
  57. data/test/old/c/libreria_gz.h +127 -0
  58. data/test/old/c/main.c +86 -0
  59. data/test/old/c/mk_fbin.c +24 -0
  60. data/test/old/c/rd_seq_fbin.c +44 -0
  61. data/test/old/c/test_ffi/a.out +0 -0
  62. data/test/old/c/test_ffi/app.c +26 -0
  63. data/test/old/c/test_ffi/app.rb +19 -0
  64. data/test/old/c/test_ffi/liblibreria_gz.dylib +0 -0
  65. data/test/old/c/test_ffi/libmylibrary.dylib +0 -0
  66. data/test/old/c/test_ffi/my_library.rb +23 -0
  67. data/test/old/c/test_ffi/mylibrary.c +22 -0
  68. data/test/old/c/test_ffi/mylibrary.h +6 -0
  69. data/test/old/c/usage_instructions.txt +62 -0
  70. data/test/old/ext/Makefile +187 -0
  71. data/test/old/ext/Makefile.dario +34 -0
  72. data/test/old/ext/extconf.rb +8 -0
  73. data/test/old/ext/mk_fbin.c +24 -0
  74. data/test/old/ext/sample/extras.txt +4 -0
  75. data/{.gemtest → test/old/ext/sample/extras2.txt} +0 -0
  76. data/test/old/ext/sample/f1.fasta +10 -0
  77. data/test/old/ext/sample/f1.fasta.qual +10 -0
  78. data/test/old/ext/sample/f1.fbin +0 -0
  79. data/test/old/ext/sample/f1.fbin.index +0 -0
  80. data/test/old/ext/sample/main.c +86 -0
  81. data/test/old/ext/usage_instructions.txt +62 -0
  82. data/test/old/t_scbi_fastabin.rb +140 -0
  83. data/test/read_tests/10-original_sizes.sh +16 -0
  84. data/test/read_tests/20-fq_time.sh +23 -0
  85. data/test/read_tests/30-fbin_read_time.sh +23 -0
  86. data/test/read_tests/40-bsc_read_time.sh +21 -0
  87. data/test/read_tests/50-fq_time_x4.sh +25 -0
  88. data/test/read_tests/60-fbin_read_time_x4.sh +24 -0
  89. data/test/read_tests/70-bsc_read_time_x4.sh +32 -0
  90. data/test/results_bio_scbi_fasta.txt +11 -0
  91. data/test/{test_scbi_fbin_file.rb → scbi_fbin_file_test.rb} +0 -0
  92. data/test/speed.txt +81 -0
  93. data/test/t_scbi_fasta.rb +12 -0
  94. data/test/write_tests/10-original_sizes.sh +16 -0
  95. data/test/write_tests/20-zip_time.sh +17 -0
  96. data/test/write_tests/30-mk_fbin_time.sh +23 -0
  97. data/test/write_tests/31-mk_fbin_time_f30.sh +21 -0
  98. data/test/write_tests/40-gzip_time.sh +16 -0
  99. data/test/write_tests/41-bsc_time.sh +16 -0
  100. data/test/write_tests/50-zip_sizes.sh +16 -0
  101. data/test/write_tests/60-fbin_sizes.sh +17 -0
  102. data/test/write_tests/61-fbin_sizes_f30.sh +16 -0
  103. data/test/write_tests/70-gzip_sizes.sh +17 -0
  104. data/test/write_tests/80-bsc_sizes.sh +17 -0
  105. data/website/index.html +87 -0
  106. data/website/index.txt +81 -0
  107. data/website/javascripts/rounded_corners_lite.inc.js +285 -0
  108. data/website/stylesheets/screen.css +159 -0
  109. data/website/template.html.erb +50 -0
  110. metadata +208 -95
  111. data/History.txt +0 -19
  112. data/Manifest.txt +0 -12
  113. data/PostInstall.txt +0 -7
  114. data/script/console +0 -10
  115. data/script/destroy +0 -14
  116. data/script/generate +0 -14
@@ -0,0 +1,74 @@
1
+ /***************************************************************************
2
+ * Header for Burrows-Wheeler Transform Library
3
+ *
4
+ * File : bwxform.h
5
+ * Purpose : Provides functions that apply and reverse the Burrows-Wheeler
6
+ * transform (with or without move-to-front coding/decoding
7
+ * transformation).
8
+ * Author : Michael Dipperstein
9
+ * Date : August 20, 2004
10
+ *
11
+ ****************************************************************************
12
+ * UPDATES
13
+ *
14
+ * $Id: bwxform.h,v 1.3 2007/09/17 13:21:19 michael Exp $
15
+ * $Log: bwxform.h,v $
16
+ * Revision 1.3 2007/09/17 13:21:19 michael
17
+ * Changes required for LGPL v3.
18
+ *
19
+ * Revision 1.2 2005/05/02 13:35:49 michael
20
+ * Update e-mail address.
21
+ *
22
+ * Revision 1.1.1.1 2004/08/23 04:34:18 michael
23
+ * Burrows-Wheeler Transform
24
+ *
25
+ ****************************************************************************
26
+ *
27
+ * bwxform: An ANSI C Burrows-Wheeler Transform/Reverse Transform Routines
28
+ * Copyright (C) 2004-2005, 2007 by
29
+ * Michael Dipperstein (mdipper@alumni.engr.ucsb.edu)
30
+ *
31
+ * This file is part of the BWT library.
32
+ *
33
+ * The BWT library is free software; you can redistribute it and/or modify
34
+ * it under the terms of the GNU Lesser General Public License as published
35
+ * by the Free Software Foundation; either version 3 of the License, or (at
36
+ * your option) any later version.
37
+ *
38
+ * The BWT library is distributed in the hope that it will be useful, but
39
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
40
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
41
+ * General Public License for more details.
42
+ *
43
+ * You should have received a copy of the GNU Lesser General Public License
44
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
45
+ *
46
+ ***************************************************************************/
47
+
48
+ #ifndef _BWXFORM_H_
49
+ #define _BWXFORM_H_
50
+
51
+ /***************************************************************************
52
+ * CONSTANTS
53
+ ***************************************************************************/
54
+ #ifndef FALSE
55
+ #define FALSE 0
56
+ #endif
57
+
58
+ #ifndef TRUE
59
+ #define TRUE 1
60
+ #endif
61
+
62
+ /***************************************************************************
63
+ * PROTOTYPES
64
+ ***************************************************************************/
65
+ /* Transform inFile. NOTE(review): presumably the result goes to outFile and mtf selects move-to-front coding (see the Purpose note in the banner) -- confirm in bwxform.c */
66
+ int BWXformFile(char *inFile, char *outFile, char mtf);
67
+
68
+ /* Reverse Transform inFile. NOTE(review): presumably the inverse of BWXformFile with the same parameter meaning -- confirm in bwxform.c */
69
+ int BWReverseXformFile(char *inFile, char *outFile, char mtf);
70
+
71
+ int BWXform(char *inString, char *outString, int mtf); /* NOTE(review): presumably the in-memory counterpart of BWXformFile -- confirm */
72
+ int BWReverseXform(char *inString, char *outString, int mtf, long size); /* NOTE(review): presumably reverses BWXform; meaning of size not visible here -- confirm */
73
+
74
+ #endif /* ndef _BWXFORM_H_ */
@@ -0,0 +1,130 @@
1
+
2
+ #include <stdio.h>
3
+ #include <string.h>
4
+ #include <time.h>
5
+
6
+
7
+ #include <sys/types.h>
8
+ #include <sys/stat.h>
9
+ #include <fcntl.h>
10
+ #include <errno.h>
11
+
12
+ #include <zlib.h>
13
+ #include <zlib.h>
14
+ #include <stdlib.h>
15
+
16
+ // Maximum file name (including .idx)
17
+ #define MAXFNAME 512
18
+
19
+ // Maximum length of the name of a sequence
20
+ #define MAXSEQNAME 1024
21
+ #define MAXSEQLENGTH 150000000
22
+ #define DEBUG 1
23
+ #define FALSE 0
24
+ #define TRUE 1
25
+
26
+ #define INVALID_FASTQ_FORMAT -5
27
+ #define INVALID_FASTA_FORMAT -6
28
+
29
+ #define SEQ_METADATA 10000
30
+
31
+
32
+ // int mystrcmp(const char *a,const char *b)
33
+ // {
34
+ // return strlen(a)-strlen(b)?strlen(a)-strlen(b):strcmp(a,b);
35
+ // }
36
+
37
+
38
+ int first_line(gzFile file, line){ // NOTE(review): unfinished stub -- `line` has no declared type, the body is empty and nothing is returned
39
+
40
+
41
+
42
+ }
43
+
44
+
45
+
46
+ long long find_seq_in_hash(char *filename,char *sname)
47
+ {
48
+
49
+ char file_name[MAXFNAME];
50
+ // char indexname[MAXFNAME];
51
+ int error;
52
+ char sname1[MAXSEQNAME];// sequence name
53
+ char sname2[MAXSEQNAME];// sequence name
54
+ long long gz_chunk=0;
55
+ char tmp[SEQ_METADATA];
56
+ long long res=-1;
57
+
58
+ // to save min, max sequences and current chunk
59
+ char min_name[MAXSEQNAME];
60
+ char max_name[MAXSEQNAME];
61
+ long long current_chunk=0;
62
+
63
+
64
+ strcpy(min_name,"");
65
+ strcpy(max_name,"");
66
+
67
+ // calc index and hash name
68
+ // snprintf(indexname,MAXFNAME,"%s.index",filename);
69
+ snprintf(file_name,MAXFNAME,"%s.index",filename);
70
+
71
+ // open index and hash file
72
+ gzFile gzhash_file=gzopen(file_name,"r");
73
+
74
+ if (gzhash_file==NULL) {
75
+ fprintf(stderr,"error opening gzhash_file :%s\n",gzerror(gzhash_file,&error));
76
+ return -2;
77
+ }
78
+
79
+ // repeat until EOF
80
+ while ( gzgets(gzhash_file,tmp,sizeof(tmp))!=Z_NULL ) {
81
+
82
+ // printf("%s\n",tmp);
83
+ // parse string
84
+ int reads=sscanf(tmp,"%s %s %lld",sname1,sname2,&gz_chunk);
85
+
86
+ if(reads==3) // valid index line
87
+ {
88
+ //
89
+ if((mystrcmp(sname,sname1)>=0) && (mystrcmp(sname,sname2)<=0))
90
+ {
91
+ printf("%s in [%s,%s]\n",sname,sname1,sname2);
92
+ res = gz_chunk;
93
+ break;
94
+ }else{
95
+ printf("%s NOT IN [%s,%s]\n",sname,sname1,sname2);
96
+ }
97
+
98
+ }
99
+
100
+ }
101
+
102
+ // close files
103
+ gzclose(gzhash_file);
104
+
105
+ return res;
106
+ }
107
+
108
+
109
+ /*******************************************************/
110
+ /* main */
111
+ /*******************************************************/
112
// Command-line entry point: looks up argv[2] (a sequence name) with
// find_seq_in_hash() using argv[1] as the base file name, and prints the
// resulting chunk value.
//
// Exit status: 0 when a chunk was found, 1 when the lookup returned a negative
// value (not found or file not readable), -1 on wrong usage.
int main(int argc, char *argv[])
{
    // check params
    if (argc != 3) {
        printf("Usage %s fbin_index_file seq_name\n\n", argv[0]);
        return -1;
    }

    long long chunk = find_seq_in_hash(argv[1], argv[2]);

    printf("Chunk: %lld\n", chunk);

    // Negative values are the error codes of find_seq_in_hash(); report them
    // to the caller instead of always exiting successfully.
    return (chunk < 0) ? 1 : 0;
}
130
+
@@ -0,0 +1,164 @@
1
+
2
+ #include <stdio.h>
3
+ #include <string.h>
4
+ #include <time.h>
5
+
6
+
7
+ #include <sys/types.h>
8
+ #include <sys/stat.h>
9
+ #include <fcntl.h>
10
+ #include <errno.h>
11
+
12
+ #include <zlib.h>
13
+ #include <zlib.h>
14
+ #include <stdlib.h>
15
+
16
+ // Maximum file name (including .idx)
17
+ #define MAXFNAME 512
18
+
19
+ // Maximum length of the name of a sequence
20
+ #define MAXSEQNAME 1024
21
+ #define MAXSEQLENGTH 150000000
22
+ #define DEBUG 1
23
+ #define FALSE 0
24
+ #define TRUE 1
25
+
26
+ #define INVALID_FASTQ_FORMAT -5
27
+ #define INVALID_FASTA_FORMAT -6
28
+
29
+ #define SEQ_METADATA 10000
30
+
31
+ // creates a hash from an index file with the desired chunk size. Chunk size can be
32
+ // adjusted to fit a good compromise between access speed and used space.
33
+ int hash_index_file(char *filename, int chunk_size, int skip_sort)
34
+ {
35
+
36
+ char hash_file_name[MAXFNAME];
37
+ char indexname[MAXFNAME];
38
+
39
+ char sname[MAXSEQNAME];// sequence name
40
+ long long beginH, gz_chunk=0;
41
+ char tmp[SEQ_METADATA];
42
+ int res=0;
43
+ int error;
44
+
45
+ // to save min, max sequences and current chunk
46
+ char min_name[MAXSEQNAME];
47
+ char max_name[MAXSEQNAME];
48
+ long long current_chunk=0;
49
+ long long count=0;
50
+
51
+
52
+ strcpy(min_name,"");
53
+ strcpy(max_name,"");
54
+
55
+ // calc index and hash name
56
+ snprintf(indexname,MAXFNAME,"%s.index",filename);
57
+ snprintf(hash_file_name,MAXFNAME,"%s.index.hash",filename);
58
+
59
+ // sort index file by external command
60
+ if(skip_sort==0)
61
+ {
62
+ char cmd[10000];
63
+ snprintf(cmd,10000,"sort_index.sh %s",indexname);
64
+ system(cmd);
65
+ }
66
+
67
+ // use sorted index
68
+ // snprintf(indexname,MAXFNAME,"%s.index.sort",filename);
69
+
70
+ // open index and hash file
71
+ gzFile gzhash_file=gzopen(hash_file_name,"wb");
72
+ gzFile gzfile_index=gzopen(indexname,"r");
73
+
74
+ if (gzfile_index==NULL) {
75
+ fprintf(stderr,"error opening gzfile_index :%s\n",gzerror(gzfile_index,&error));
76
+ return -2;
77
+ }
78
+
79
+ if (gzhash_file==NULL) {
80
+ fprintf(stderr,"error opening gzhash_file :%s\n",gzerror(gzhash_file,&error));
81
+ return -2;
82
+ }
83
+
84
+ // repeat until EOF
85
+ while ( gzgets(gzfile_index,tmp,sizeof(tmp))!=Z_NULL ) {
86
+
87
+ // parse string
88
+ sscanf(tmp,"%s %lld %lld",sname,&gz_chunk,&beginH);
89
+
90
+ if(strcmp(sname,"UMACOMPRESSEDFORMAT")!=0) // valid index line
91
+ {
92
+
93
+ // clear chunk_data if any
94
+ // if (gz_chunk!=current_chunk){
95
+ if ((count%chunk_size)==0){
96
+ if (strcmp(min_name,"")!=0){
97
+ // there are data to write
98
+ res=gzprintf(gzhash_file,"%s %s %lld\n",min_name,max_name,current_chunk);
99
+ }
100
+ strcpy(min_name,"");
101
+ strcpy(max_name,"");
102
+ // current_chunk=gz_chunk;
103
+ current_chunk = gztell(gzfile_index);
104
+ }
105
+
106
+ // save min_name
107
+ if((strcmp(min_name,"")==0) || (strcmp(sname,min_name)<0))
108
+ {
109
+ // replace min_name
110
+ strcpy(min_name,sname);
111
+ }
112
+
113
+ //save max_name
114
+ if((strcmp(max_name,"")==0) || (strcmp(sname,max_name)>0))
115
+ {
116
+ strcpy(max_name,sname);
117
+ }
118
+
119
+ count++;
120
+ }
121
+
122
+ }
123
+
124
+ if (strcmp(min_name,"")!=0){
125
+ // there are data to write
126
+ res=gzprintf(gzhash_file,"%s %s %lld\n",min_name,max_name,current_chunk);
127
+ }
128
+
129
+ // close files
130
+ gzclose(gzhash_file);
131
+ gzclose(gzfile_index);
132
+
133
+ return 0;
134
+ }
135
+
136
+
137
+ /*******************************************************/
138
+ /* main */
139
+ /*******************************************************/
140
// Command-line entry point:
//     <prog> fbin_file [chunk_size [--skip_sort]]
//
// chunk_size defaults to 10000 and must be a positive decimal integer.
// --skip_sort is only accepted after chunk_size and is passed on as
// skip_sort=1.
//
// Returns the result of hash_index_file(), or -1 on wrong usage.
int main(int argc, char *argv[])
{
    int chunk_size = 10000;
    int skip_sort = 0;

    // check params
    if (argc < 2 || argc > 4) {
        goto bad_usage;
    }

    // chunk_size is present whenever there is a second argument, including
    // the case where --skip_sort follows it
    if (argc >= 3) {
        char *end = NULL;
        errno = 0;
        long parsed = strtol(argv[2], &end, 10);

        if (end == argv[2] || *end != '\0' || errno == ERANGE) {
            goto bad_usage; // not a number
        }
        // must be positive and survive the narrowing to int
        if (parsed <= 0 || parsed != (long)(int)parsed) {
            goto bad_usage;
        }
        chunk_size = (int)parsed;
    }

    if (argc == 4) {
        if (strcmp(argv[3], "--skip_sort") != 0) {
            goto bad_usage;
        }
        skip_sort = 1;
    }

    return hash_index_file(argv[1], chunk_size, skip_sort);

bad_usage:
    printf("Usage %s fbin_file [chunk_size [--skip_sort]]\n\n", argv[0]);
    return -1;
}
164
+
Binary file
@@ -0,0 +1,67 @@
1
+ #include "lib_fqbin.h"
2
+ #include <stdio.h>
3
+ #include <ctype.h>
4
+ #include <string.h>
5
+ #include <stdlib.h>
6
+
7
+
8
+
9
+ #include <unistd.h>
10
+
11
+
12
void usage(){
    // Show the command synopsis on stdout, then terminate the process with
    // status -1. This function never returns to its caller.
    fputs("Usage: idx_fqbin fqbin_file\n\n", stdout);
    exit(-1);
}
19
+
20
+ /*******************************************************/
21
+ /* main */
22
+ /*******************************************************/
23
+ int main(int argc, char *argv[])
24
+ {
25
+
26
+ char *fasta=NULL;
27
+ char *qual=NULL;
28
+ char *extras=NULL;
29
+ int size=5000;
30
+ int res=0;
31
+
32
+ int ch;
33
+
34
+ int output_fasta = 0;
35
+ int output_qual = 0;
36
+
37
+ while ((ch = getopt(argc, argv, "h")) != -1) {
38
+ switch (ch) {
39
+ case 'h':
40
+ usage();
41
+ break;
42
+ case '?':
43
+ default:
44
+ usage();
45
+ }
46
+ }
47
+
48
+ argc -= optind;
49
+ argv += optind;
50
+ // printf("argc: %d", argc);
51
+ // printf("argv: %s", argv[0]);
52
+
53
+ if (argc!=1)
54
+ {
55
+ usage();
56
+ }
57
+
58
+
59
+ if (regenerate_index(argv[0])==-1){
60
+ printf("File %s does not exists",argv[0]);
61
+ exit(-1);
62
+ }
63
+
64
+ exit(0);
65
+
66
+ }
67
+
@@ -0,0 +1,14 @@
1
# The input argument is the file name without its extension.
#
# Builds "$1.hsh" from the (optionally gzip-compressed) index "$1.index":
# one line per block with the smallest name, the largest name and the block
# number, ordered numerically by block number.
#
# The index is read once. Its first line is skipped, as are lines with fewer
# than two fields. Field 1 is the name and field 2 the block number.
gzip -cdfq < "$1.index" | awk '
    FNR == 1 || NF < 2 { next }
    {
        name = $1 ""      # force string (not numeric) comparison
        block = $2
        # The first name seen for a block initialises both bounds, so no
        # sentinel value is needed for the minimum.
        if (!(block in lo) || name < lo[block]) lo[block] = name
        if (!(block in hi) || name > hi[block]) hi[block] = name
    }
    END { for (block in lo) print lo[block], hi[block], block }
' | sort -k3,3n > "$1.hsh"