scbi_fqbin 0.2.2 → 0.2.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (116) hide show
  1. checksums.yaml +7 -0
  2. data/.DS_Store +0 -0
  3. data/.gitignore +14 -0
  4. data/Gemfile +4 -0
  5. data/LICENSE.txt +22 -0
  6. data/{README.rdoc → README.md} +0 -0
  7. data/Rakefile +8 -28
  8. data/lib/scbi_fqbin.rb +3 -5
  9. data/lib/scbi_fqbin/fastabin.rb +411 -0
  10. data/lib/scbi_fqbin/fastq_file_c.rb +373 -0
  11. data/lib/scbi_fqbin/fbin_file.rb +1 -1
  12. data/lib/scbi_fqbin/t.rb +9 -0
  13. data/lib/scbi_fqbin/t2.rb +12 -0
  14. data/lib/scbi_fqbin/version.rb +3 -0
  15. data/lib_fqbin_src.zip +0 -0
  16. data/lib_fqbin_src/Makefile +66 -0
  17. data/lib_fqbin_src/fq +0 -0
  18. data/lib_fqbin_src/fq.c +165 -0
  19. data/lib_fqbin_src/hash_fqbin +0 -0
  20. data/lib_fqbin_src/hash_fqbin.c +212 -0
  21. data/lib_fqbin_src/idx_fqbin +21 -0
  22. data/lib_fqbin_src/iterate_fqbin +0 -0
  23. data/lib_fqbin_src/iterate_fqbin.c +136 -0
  24. data/lib_fqbin_src/lib_fqbin.c +1748 -0
  25. data/lib_fqbin_src/lib_fqbin.h +194 -0
  26. data/lib_fqbin_src/mk_fqbin +0 -0
  27. data/lib_fqbin_src/mk_fqbin.c +138 -0
  28. data/lib_fqbin_src/other/bwxform.c +915 -0
  29. data/lib_fqbin_src/other/bwxform.h +74 -0
  30. data/lib_fqbin_src/other/find_in_index.c +130 -0
  31. data/lib_fqbin_src/other/hash_fbin_nogzchunks.c +164 -0
  32. data/lib_fqbin_src/other/idx_fqbin +0 -0
  33. data/lib_fqbin_src/other/idx_fqbin.c +67 -0
  34. data/lib_fqbin_src/other/make_hsh.sh +14 -0
  35. data/lib_fqbin_src/other/rd_extras_fbin.c +45 -0
  36. data/lib_fqbin_src/read_fq +0 -0
  37. data/lib_fqbin_src/read_fq.c +143 -0
  38. data/lib_fqbin_src/read_fqbin +0 -0
  39. data/lib_fqbin_src/read_fqbin.c +101 -0
  40. data/lib_fqbin_src/sort_index +9 -0
  41. data/lib_fqbin_src/test.rb +13 -0
  42. data/scbi_fqbin.gemspec +25 -0
  43. data/test/build.rake +15 -0
  44. data/test/fbinfile +0 -0
  45. data/test/fbinfile.index +0 -0
  46. data/test/no_test_fill_file.rb +66 -0
  47. data/test/old/app.rb +43 -0
  48. data/test/old/bin/iterate_fastabin.rb +54 -0
  49. data/test/old/bin/mk_fastabin.rb +22 -0
  50. data/test/old/bin/rd_fastabin.rb +36 -0
  51. data/test/old/bin/rd_fq.rb +20 -0
  52. data/test/old/bioruby.rb +27 -0
  53. data/test/old/c/Makefile +34 -0
  54. data/test/old/c/fbin_lib.zip +0 -0
  55. data/test/old/c/iterate_fbin.c +54 -0
  56. data/test/old/c/libreria_gz.c +707 -0
  57. data/test/old/c/libreria_gz.h +127 -0
  58. data/test/old/c/main.c +86 -0
  59. data/test/old/c/mk_fbin.c +24 -0
  60. data/test/old/c/rd_seq_fbin.c +44 -0
  61. data/test/old/c/test_ffi/a.out +0 -0
  62. data/test/old/c/test_ffi/app.c +26 -0
  63. data/test/old/c/test_ffi/app.rb +19 -0
  64. data/test/old/c/test_ffi/liblibreria_gz.dylib +0 -0
  65. data/test/old/c/test_ffi/libmylibrary.dylib +0 -0
  66. data/test/old/c/test_ffi/my_library.rb +23 -0
  67. data/test/old/c/test_ffi/mylibrary.c +22 -0
  68. data/test/old/c/test_ffi/mylibrary.h +6 -0
  69. data/test/old/c/usage_instructions.txt +62 -0
  70. data/test/old/ext/Makefile +187 -0
  71. data/test/old/ext/Makefile.dario +34 -0
  72. data/test/old/ext/extconf.rb +8 -0
  73. data/test/old/ext/mk_fbin.c +24 -0
  74. data/test/old/ext/sample/extras.txt +4 -0
  75. data/{.gemtest → test/old/ext/sample/extras2.txt} +0 -0
  76. data/test/old/ext/sample/f1.fasta +10 -0
  77. data/test/old/ext/sample/f1.fasta.qual +10 -0
  78. data/test/old/ext/sample/f1.fbin +0 -0
  79. data/test/old/ext/sample/f1.fbin.index +0 -0
  80. data/test/old/ext/sample/main.c +86 -0
  81. data/test/old/ext/usage_instructions.txt +62 -0
  82. data/test/old/t_scbi_fastabin.rb +140 -0
  83. data/test/read_tests/10-original_sizes.sh +16 -0
  84. data/test/read_tests/20-fq_time.sh +23 -0
  85. data/test/read_tests/30-fbin_read_time.sh +23 -0
  86. data/test/read_tests/40-bsc_read_time.sh +21 -0
  87. data/test/read_tests/50-fq_time_x4.sh +25 -0
  88. data/test/read_tests/60-fbin_read_time_x4.sh +24 -0
  89. data/test/read_tests/70-bsc_read_time_x4.sh +32 -0
  90. data/test/results_bio_scbi_fasta.txt +11 -0
  91. data/test/{test_scbi_fbin_file.rb → scbi_fbin_file_test.rb} +0 -0
  92. data/test/speed.txt +81 -0
  93. data/test/t_scbi_fasta.rb +12 -0
  94. data/test/write_tests/10-original_sizes.sh +16 -0
  95. data/test/write_tests/20-zip_time.sh +17 -0
  96. data/test/write_tests/30-mk_fbin_time.sh +23 -0
  97. data/test/write_tests/31-mk_fbin_time_f30.sh +21 -0
  98. data/test/write_tests/40-gzip_time.sh +16 -0
  99. data/test/write_tests/41-bsc_time.sh +16 -0
  100. data/test/write_tests/50-zip_sizes.sh +16 -0
  101. data/test/write_tests/60-fbin_sizes.sh +17 -0
  102. data/test/write_tests/61-fbin_sizes_f30.sh +16 -0
  103. data/test/write_tests/70-gzip_sizes.sh +17 -0
  104. data/test/write_tests/80-bsc_sizes.sh +17 -0
  105. data/website/index.html +87 -0
  106. data/website/index.txt +81 -0
  107. data/website/javascripts/rounded_corners_lite.inc.js +285 -0
  108. data/website/stylesheets/screen.css +159 -0
  109. data/website/template.html.erb +50 -0
  110. metadata +208 -95
  111. data/History.txt +0 -19
  112. data/Manifest.txt +0 -12
  113. data/PostInstall.txt +0 -7
  114. data/script/console +0 -10
  115. data/script/destroy +0 -14
  116. data/script/generate +0 -14
@@ -0,0 +1,127 @@
1
+ #include <zlib.h>
2
+
3
+ #define VERSION 1
4
+ #define SUBVERSION 0
5
+
6
+ struct file_data {
7
+ char name[10000];
8
+ char index_name[10000];
9
+ gzFile gzf_bin;
10
+ // int file_bin;
11
+ gzFile gzf_index;
12
+ // int file_index;
13
+ // char file_outname[10000];
14
+ long long pos_chunk_gz;
15
+ // Contains the version and subversion of this file
16
+ int version;
17
+ int subversion;
18
+ // bin_search is true when a binary search can be used.
19
+ int bin_search;
20
+ // Counts the number of sequences written to the bin file, so it can
21
+ // decide where to create a new gz chunk
22
+ long long counter;
23
+ // If there is an error it is stored here so it can be retrieved.
24
+ int error;
25
+ };
26
+
27
+ // two modes:
28
+ // 1 .- new files
29
+ // 2 .- add data to files, if they don't exist they are created
30
+ int initialize_writes(struct file_data ** file, char *output_name, int mode);
31
+
32
+ /* write_seq writes a sequence to the files f_bin and its index to f_index
33
+ pos_chunk_gz is the offset of the beginning of the current gz chunk inside the file
34
+ seq_name is a pointer to the name of the sequence
35
+ fasta, quanta and extras are pointers to strings, must be zero terminated.
36
+ Returns 0 if all goes fine.
37
+ */
38
+
39
+ void inspect_file_data_struct(struct file_data *file);
40
+
41
+ // int write_seq(gzFile *f_bin, FILE *f_index, long pos_chunk_gz, char *seq_name, char *fasta, char *quanta, char *extras);
42
+ int write_seq(struct file_data *file, char *seq_name, char *fasta, char *qual, char *extras);
43
+
44
+ int close_writes(struct file_data *file);
45
+
46
+ /*
47
+ read_seq reads from filename the sequence named seq_name and returns its
48
+ fasta, quanta and extras in those variables.
49
+ It returns 0 if there are no errors, otherwise it returns:
50
+ -2 : error opening index file (it doesn't exist)
51
+ -3 : error reading index file
52
+ -4 : error sequence not found in index file
53
+ -5 : error opening file (it doesn't exist)
54
+ -6 : error reading file
55
+ -7 : error sequence not found
56
+ -8 : error uncompressing sequence
57
+ -9 : EOF
58
+ */
59
+ int read_seq(char *filename, char *seq_name, char **fasta, char **quanta, char **extras);
60
+
61
+ // For doing sequential reads of the whole file:
62
+ // int initialize_sequential_reads(struct file_data *filed, char *filename);
63
+ int initialize_sequential_reads(struct file_data ** filed, char *filename);
64
+
65
+ // return -9 on EOF
66
+ int read_data_sequential(struct file_data *filed, char **seq_name, char **fasta, char **qual, char **extras);
67
+ int close_sequential_reads(struct file_data *filed);
68
+
69
+
70
+ /* process_biofile reads from fname (and fname.quanta) and writes to outname
71
+ (and outname.index) with the binary format
72
+ Returns 0 if all goes fine */
73
+ int process_biofile(char *fname,char *qfname, char *efname, char *outname);
74
+
75
+ /*
76
+ Format definition
77
+
78
+ Main file that contains chunks compressed in gz
79
+ For each sequence the information of that sequence is written with the format:
80
+ 28F143CJN01EBIJN 105 312 0
81
+
82
+ That is:
83
+ 4 chars for the size of this header, excluding itself, that is, it is the size of
84
+ the rest of the header
85
+ sequence name
86
+ fasta size
87
+ qual size
88
+ extras size
89
+
90
+ The First sequence can be a special sequence with metainfo for this file:
91
+ 30UMACOMPRESSEDFORMAT_version 0 0 0
92
+ 27UMACOMPRESSEDFORMAT_1 0 0 0
93
+
94
+
95
+
96
+ Index file
97
+
98
+ Compressed using chunks
99
+
100
+ At the beginning a special sequence can be used to store metadata
101
+ like the number of fields, if a binary search can be used, etc.
102
+
103
+ That sequence will be:
104
+ UMACOMPRESSEDFORMAT version binary_search begin_of_sequential_index 0 0
105
+
106
+ If binary_search is yes then a metaindex follows to do a fast access to the
107
+ index data.
108
+ That will be the first sequence of each chunk and its offset inside the file.
109
+ (Or perhaps it can be put in another file....)
110
+
111
+
112
+ The rest of the index file will be indexes to the stored sequences, with
113
+ the following fields separated by spaces:
114
+
115
+ F143CJN01ETK00 0 471
116
+
117
+ Sequence name
118
+ begin of the compressed chunk
119
+ offset inside the chunk of the header of that sequence.
120
+
121
+ */
122
+
123
+
124
+
125
+
126
+
127
+
data/test/old/c/main.c ADDED
@@ -0,0 +1,86 @@
1
+ #include "libreria_gz.h"
2
+ #include <stdio.h>
3
+ #include <ctype.h>
4
+
5
+
6
+ /*******************************************************/
7
+ /* main */
8
+ /*******************************************************/
9
+ int main(int argc, char *argv[])
10
+ {
11
+
12
+ char *fasta=NULL;
13
+ char *qual=NULL;
14
+ char *extras=NULL;
15
+ int size=5000;
16
+
17
+ if (argc!=4)
18
+ {
19
+ printf("Usage %s fasta_file qual_file output_file\n\n",argv[0]);
20
+ exit(-1);
21
+ }
22
+
23
+
24
+
25
+ // prueba de lectura
26
+ printf ("Comienzo\n");
27
+
28
+ //int res=read_seq("data/F143CJN01.fbin","F143CJN01D96I9", &fasta, &qual, &extras);
29
+ //int res=read_seq("fasta_filt_w_dict.fbin","F143CJN01D96I9", &fasta, &qual, &extras);
30
+
31
+ // prueba de escritura
32
+ /*int res=write_seq("borrar","S143CJN01D96I9", "primero","QUANTAQUANTAQUANTAQUANTAQUANTA","");
33
+ res=write_seq("borrar","3CJN01D96I9", "el de enmedio","QUANTAQUANTAQUANTAQUANTAQUANTA","");
34
+ res=write_seq("borrar","3CJsdasdaN01D96I9", "ultimo","QUANTAQUANTAQUANTAQUANTAQUANTA","");
35
+ */
36
+ //init_dicts(dict_fasta,dict_qual,32767);
37
+ //int res=process_biofile("data/F143CJN01.fasta","/tmp/prueba2gz.fbin");
38
+ int res=process_biofile(argv[1],argv[3]);
39
+
40
+ //res=read_seq("fasta_filt_w_dict.fbin","F143CJN01D96I9", &fasta, &qual, &extras);
41
+ /*
42
+ int res=read_seq("prueba2.fbin","F143CJN01DI5MZ", &fasta, &qual, &extras);
43
+ printf ("-------------------------------------------------------------\n");
44
+ printf ("RES of read_seq1 call is :%d\n",res);
45
+ if ( res==0 ) printf ("fasta:%s\n size:%d\n",fasta,sizeof(fasta));
46
+ if ( res==0 ) printf ("qual:%s\n",qual);
47
+ if (( res==0 )&& (extras!=NULL)) printf ("extras:%s\n",extras);
48
+ if ( fasta!=NULL ) {free(fasta);fasta=NULL;}
49
+ if ( qual!=NULL ) {free(qual);qual=NULL;}
50
+ if ( extras!=NULL ) {free(extras);extras=NULL;}
51
+ */
52
+ //int res=read_seq("/tmp/prueba2gz.fbin","F143CJN01BO14N", &fasta, &qual, &extras);
53
+ //int res=read_seq("/tmp/prueba2gz.fbin","F143CJN01D2X26", &fasta, &qual, &extras);
54
+ //res=read_seq("prueba2gz.fbin","F143CJN01EBIJN", &fasta, &qual, &extras);
55
+ res=read_seq(argv[3],"F143CJN01DZW7L", &fasta, &qual, &extras);
56
+ //res=read_seq("prueba2gz.fbin","F143CJN01EN6AH", &fasta, &qual, &extras);
57
+ printf ("-------------------------------------------------------------\n");
58
+ printf ("RES of read_seq2 call is :%d\n",res);
59
+ if ( res==0 ) printf ("fasta:%s\n fasta size:%d\n",fasta,strlen(fasta));
60
+ if ( res==0 ) printf ("qual:%s\n",qual);
61
+ if (( res==0 )&& (extras!=NULL)) printf ("extras:%s\n",extras);
62
+ if ( fasta!=NULL ) {free(fasta);fasta=NULL;}
63
+ if ( qual!=NULL ) {free(qual);qual=NULL;}
64
+ if ( extras!=NULL ) {free(extras);extras=NULL;}
65
+
66
+ printf ("***************************\n");
67
+ printf ("Sequential reads\n");
68
+
69
+ initialize_sequential_reads(argv[3]);
70
+ char *sname=NULL;
71
+ while (read_data_sequential(&sname, &fasta, &qual, &extras)==0)
72
+ {
73
+ printf ("***************************\n");
74
+ printf ("RES of read_seq2 call is :%d, sname:%s\n",res,sname);
75
+ if ( res==0 ) printf ("fasta:%s fasta size:%d\n",fasta,strlen(fasta));
76
+ if ( res==0 ) printf ("qual:%s",qual);
77
+ if (( res==0 )&& (extras!=NULL)) printf ("extras:%s\n",extras);
78
+ if ( fasta!=NULL ) {free(fasta);fasta=NULL;}
79
+ if ( qual!=NULL ) {free(qual);qual=NULL;}
80
+ if ( extras!=NULL ) {free(extras);extras=NULL;}
81
+ }
82
+ close_sequential_reads();
83
+
84
+ return res;
85
+ }
86
+
@@ -0,0 +1,24 @@
1
+ #include "libreria_gz.h"
2
+ #include <stdio.h>
3
+ #include <ctype.h>
4
+
5
+
6
+ /*******************************************************/
7
+ /* main */
8
+ /*******************************************************/
9
+ int main(int argc, char *argv[])
10
+ {
11
+ // check params
12
+ if (argc!=5)
13
+ {
14
+ printf("Usage %s fasta_file qual_file extras_file output_file\n\n",argv[0]);
15
+ return -1;
16
+ }
17
+
18
+ // process file
19
+ // int res=process_biofile(argv[1],argv[2],argv[3],argv[4]);
20
+ int res=process_biofile(argv[1],argv[2],argv[3] ,argv[4]);
21
+
22
+ return res;
23
+ }
24
+
@@ -0,0 +1,44 @@
1
+ #include "libreria_gz.h"
2
+ #include <stdio.h>
3
+ #include <ctype.h>
4
+ #include <string.h>
5
+ #include <stdlib.h>
6
+
7
+
8
+ /*******************************************************/
9
+ /* main */
10
+ /*******************************************************/
11
+ int main(int argc, char *argv[])
12
+ {
13
+
14
+ char *fasta=NULL;
15
+ char *qual=NULL;
16
+ char *extras=NULL;
17
+ int size=5000;
18
+ int res=0;
19
+
20
+ if (argc!=3)
21
+ {
22
+ printf("Usage %s fbin_file seq_name\n\n",argv[0]);
23
+ return -1;
24
+ }
25
+
26
+
27
+
28
+ res=read_seq(argv[1],argv[2], &fasta, &qual, &extras);
29
+
30
+ if (res==0){
31
+
32
+ printf(">%s\n%s\n", argv[2], fasta);
33
+ printf("%s\n",qual);
34
+ if (extras!=NULL) printf ("extras:%s\n",extras);
35
+ }
36
+
37
+ //res=read_seq("prueba2gz.fbin","F143CJN01EN6AH", &fasta, &qual, &extras);
38
+ if ( fasta!=NULL ) {free(fasta);fasta=NULL;}
39
+ if ( qual!=NULL ) {free(qual);qual=NULL;}
40
+ if ( extras!=NULL ) {free(extras);extras=NULL;}
41
+
42
+ return res;
43
+ }
44
+
Binary file
@@ -0,0 +1,26 @@
1
+
2
+ #include <stdio.h>
3
+ #include <stdlib.h>
4
+
5
+ #include "mylibrary.h"
6
+
7
+ int main()
8
+ {
9
+ double c, d ;
10
+ int errcode ;
11
+ struct SomeObject *objptr ;
12
+
13
+ c = calculate_something(42, 98.6);
14
+
15
+ if ((errcode = error_code()) != 0) {
16
+ fprintf(stderr, "error calculating something: %d\n", errcode);
17
+ exit(1);
18
+ }
19
+
20
+ objptr = create_object("my object") ;
21
+ d = calculate_something_else(c, objptr) ;
22
+ free_object(objptr) ;
23
+
24
+ fprintf(stdout, "calculated %f\n", d);
25
+ exit(0) ;
26
+ }
@@ -0,0 +1,19 @@
1
+ require 'my_library'
2
+
3
+ o = MyLibrary.new
4
+
5
+ r=o.calculate_something(42,98.6)
6
+ puts r
7
+
8
+ # c = MyLibrary.calculate_something(42, 98.6) # note FFI handles literals just fine
9
+ #
10
+ # if ( (errcode = MyLibrary.error_code()) != 0)
11
+ # puts "error calculating something: #{errcode}"
12
+ # exit 1
13
+ # end
14
+ #
15
+ # objptr = MyLibrary.create_object("my object") # note FFI handles string literals as well
16
+ # d = MyLibrary.calculate_something_else(c, objptr)
17
+ # MyLibrary.free_object(objptr)
18
+ #
19
+ # puts "calculated #{d}"
@@ -0,0 +1,23 @@
1
+ require 'ffi'
2
+
3
+ module MyModule
4
+ extend FFI::Library
5
+ ffi_lib "mylibrary"
6
+
7
+ attach_function :calculate_something, [:int, :float], :double
8
+ attach_function :error_code, [], :int # note empty array for functions taking zero arguments
9
+ attach_function :create_object, [:string], :pointer
10
+ attach_function :calculate_something_else, [:double, :pointer], :double
11
+ attach_function :free_object, [:pointer], :void
12
+
13
+ end
14
+
15
+ class MyLibrary
16
+
17
+ include MyModule
18
+
19
+ def initialize
20
+ puts calculate_something(1,2.3)
21
+ end
22
+
23
+ end
@@ -0,0 +1,22 @@
1
+ #include "mylibrary.h"
2
+
3
+ double calculate_something(int a, float b){
4
+ return a+b;
5
+ }
6
+
7
+ int error_code(void){
8
+ return 0;
9
+ }
10
+
11
+ struct SomeObject* create_object(char* name){
12
+ // SomeObject e;
13
+ return 0;
14
+ }
15
+
16
+ double calculate_something_else(double c, struct SomeObject* obj){
17
+ return 1.1;
18
+ }
19
+
20
+ void free_object(void* pointer_to_memory){
21
+
22
+ }
@@ -0,0 +1,6 @@
1
+ /* mylibrary.h */
2
+ double calculate_something(int a, float b);
3
+ int error_code(void);
4
+ struct SomeObject* create_object(char* name);
5
+ double calculate_something_else(double c, struct SomeObject* obj);
6
+ void free_object(void* pointer_to_memory);
@@ -0,0 +1,62 @@
1
+
2
+ Manual of the YAGFF (Yet Another Genetic File Format)
3
+ OGF (Optimized Genetic Format)
4
+
5
+
6
+ First a call to :
7
+ error=initialize_writes(outname,1);
8
+
9
+ Is needed to initialize the files.
10
+
11
+ Its arguments are:
12
+
13
+ - the name of the main output file and to this the suffix .index will be
14
+ appended for the index file.
15
+
16
+ - The mode of the write:
17
+ 1 : create a new file
18
+ 2 : append data to already existing files
19
+
20
+ It returns
21
+ 0 if everything goes fine,
22
+ -1 if there is a problem with already existing files, like an incorrect
23
+ version number
24
+ -2 if there is a problem creating or opening files
25
+
26
+ In case of error it writes it on stderr
27
+
28
+
29
+ Then write requests can be made:
30
+
31
+ int error_wr=write_seq(sname, fasta,qual,extras);
32
+
33
+ Where sname is a char* with the name of the sequence, and fasta, qual and
34
+ extras are char* with the fasta, qual and extras data.
35
+
36
+ At the end a call to : close_writes();
37
+ is needed to close the opened files
38
+
39
+
40
+
41
+ The reads are even easier:
42
+
43
+ int read_seq(char *filename, char *seq_name, char **fasta, char **qual, char **extras)
44
+ The arguments are:
45
+ filename: a char* with the name of the main file; .index is appended to it for the index file
46
+ seq_name : char* that contains the name of the searched sequence
47
+
48
+ And the results are returned in :
49
+ fasta, qual and extras, if they point to NULL they are malloced to bufsize=150000 bytes (defined inside the function)
50
+ else they are used without checking their size
51
+
52
+
53
+
54
+ For doing sequential reads an untested procedure is implemented:
55
+
56
+ int initialize_sequential_reads(char *filename);
57
+
58
+ // return -9 on EOF
59
+ int read_data_sequential(char **seq_name, char **fasta, char **qual, char **extras)
60
+ int close_sequential_reads();
61
+
62
+