scbi_fqbin 0.2.2 → 0.2.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.DS_Store +0 -0
- data/.gitignore +14 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/{README.rdoc → README.md} +0 -0
- data/Rakefile +8 -28
- data/lib/scbi_fqbin.rb +3 -5
- data/lib/scbi_fqbin/fastabin.rb +411 -0
- data/lib/scbi_fqbin/fastq_file_c.rb +373 -0
- data/lib/scbi_fqbin/fbin_file.rb +1 -1
- data/lib/scbi_fqbin/t.rb +9 -0
- data/lib/scbi_fqbin/t2.rb +12 -0
- data/lib/scbi_fqbin/version.rb +3 -0
- data/lib_fqbin_src.zip +0 -0
- data/lib_fqbin_src/Makefile +66 -0
- data/lib_fqbin_src/fq +0 -0
- data/lib_fqbin_src/fq.c +165 -0
- data/lib_fqbin_src/hash_fqbin +0 -0
- data/lib_fqbin_src/hash_fqbin.c +212 -0
- data/lib_fqbin_src/idx_fqbin +21 -0
- data/lib_fqbin_src/iterate_fqbin +0 -0
- data/lib_fqbin_src/iterate_fqbin.c +136 -0
- data/lib_fqbin_src/lib_fqbin.c +1748 -0
- data/lib_fqbin_src/lib_fqbin.h +194 -0
- data/lib_fqbin_src/mk_fqbin +0 -0
- data/lib_fqbin_src/mk_fqbin.c +138 -0
- data/lib_fqbin_src/other/bwxform.c +915 -0
- data/lib_fqbin_src/other/bwxform.h +74 -0
- data/lib_fqbin_src/other/find_in_index.c +130 -0
- data/lib_fqbin_src/other/hash_fbin_nogzchunks.c +164 -0
- data/lib_fqbin_src/other/idx_fqbin +0 -0
- data/lib_fqbin_src/other/idx_fqbin.c +67 -0
- data/lib_fqbin_src/other/make_hsh.sh +14 -0
- data/lib_fqbin_src/other/rd_extras_fbin.c +45 -0
- data/lib_fqbin_src/read_fq +0 -0
- data/lib_fqbin_src/read_fq.c +143 -0
- data/lib_fqbin_src/read_fqbin +0 -0
- data/lib_fqbin_src/read_fqbin.c +101 -0
- data/lib_fqbin_src/sort_index +9 -0
- data/lib_fqbin_src/test.rb +13 -0
- data/scbi_fqbin.gemspec +25 -0
- data/test/build.rake +15 -0
- data/test/fbinfile +0 -0
- data/test/fbinfile.index +0 -0
- data/test/no_test_fill_file.rb +66 -0
- data/test/old/app.rb +43 -0
- data/test/old/bin/iterate_fastabin.rb +54 -0
- data/test/old/bin/mk_fastabin.rb +22 -0
- data/test/old/bin/rd_fastabin.rb +36 -0
- data/test/old/bin/rd_fq.rb +20 -0
- data/test/old/bioruby.rb +27 -0
- data/test/old/c/Makefile +34 -0
- data/test/old/c/fbin_lib.zip +0 -0
- data/test/old/c/iterate_fbin.c +54 -0
- data/test/old/c/libreria_gz.c +707 -0
- data/test/old/c/libreria_gz.h +127 -0
- data/test/old/c/main.c +86 -0
- data/test/old/c/mk_fbin.c +24 -0
- data/test/old/c/rd_seq_fbin.c +44 -0
- data/test/old/c/test_ffi/a.out +0 -0
- data/test/old/c/test_ffi/app.c +26 -0
- data/test/old/c/test_ffi/app.rb +19 -0
- data/test/old/c/test_ffi/liblibreria_gz.dylib +0 -0
- data/test/old/c/test_ffi/libmylibrary.dylib +0 -0
- data/test/old/c/test_ffi/my_library.rb +23 -0
- data/test/old/c/test_ffi/mylibrary.c +22 -0
- data/test/old/c/test_ffi/mylibrary.h +6 -0
- data/test/old/c/usage_instructions.txt +62 -0
- data/test/old/ext/Makefile +187 -0
- data/test/old/ext/Makefile.dario +34 -0
- data/test/old/ext/extconf.rb +8 -0
- data/test/old/ext/mk_fbin.c +24 -0
- data/test/old/ext/sample/extras.txt +4 -0
- data/{.gemtest → test/old/ext/sample/extras2.txt} +0 -0
- data/test/old/ext/sample/f1.fasta +10 -0
- data/test/old/ext/sample/f1.fasta.qual +10 -0
- data/test/old/ext/sample/f1.fbin +0 -0
- data/test/old/ext/sample/f1.fbin.index +0 -0
- data/test/old/ext/sample/main.c +86 -0
- data/test/old/ext/usage_instructions.txt +62 -0
- data/test/old/t_scbi_fastabin.rb +140 -0
- data/test/read_tests/10-original_sizes.sh +16 -0
- data/test/read_tests/20-fq_time.sh +23 -0
- data/test/read_tests/30-fbin_read_time.sh +23 -0
- data/test/read_tests/40-bsc_read_time.sh +21 -0
- data/test/read_tests/50-fq_time_x4.sh +25 -0
- data/test/read_tests/60-fbin_read_time_x4.sh +24 -0
- data/test/read_tests/70-bsc_read_time_x4.sh +32 -0
- data/test/results_bio_scbi_fasta.txt +11 -0
- data/test/{test_scbi_fbin_file.rb → scbi_fbin_file_test.rb} +0 -0
- data/test/speed.txt +81 -0
- data/test/t_scbi_fasta.rb +12 -0
- data/test/write_tests/10-original_sizes.sh +16 -0
- data/test/write_tests/20-zip_time.sh +17 -0
- data/test/write_tests/30-mk_fbin_time.sh +23 -0
- data/test/write_tests/31-mk_fbin_time_f30.sh +21 -0
- data/test/write_tests/40-gzip_time.sh +16 -0
- data/test/write_tests/41-bsc_time.sh +16 -0
- data/test/write_tests/50-zip_sizes.sh +16 -0
- data/test/write_tests/60-fbin_sizes.sh +17 -0
- data/test/write_tests/61-fbin_sizes_f30.sh +16 -0
- data/test/write_tests/70-gzip_sizes.sh +17 -0
- data/test/write_tests/80-bsc_sizes.sh +17 -0
- data/website/index.html +87 -0
- data/website/index.txt +81 -0
- data/website/javascripts/rounded_corners_lite.inc.js +285 -0
- data/website/stylesheets/screen.css +159 -0
- data/website/template.html.erb +50 -0
- metadata +208 -95
- data/History.txt +0 -19
- data/Manifest.txt +0 -12
- data/PostInstall.txt +0 -7
- data/script/console +0 -10
- data/script/destroy +0 -14
- data/script/generate +0 -14
@@ -0,0 +1,194 @@
|
|
1
|
+
#include <stdio.h>
|
2
|
+
#include <string.h>
|
3
|
+
#include <time.h>
|
4
|
+
|
5
|
+
|
6
|
+
#include <sys/types.h>
|
7
|
+
#include <sys/stat.h>
|
8
|
+
#include <fcntl.h>
|
9
|
+
#include <errno.h>
|
10
|
+
|
11
|
+
#include <zlib.h>
|
12
|
+
#include <stdlib.h>
|
13
|
+
|
14
|
+
#include <zlib.h>
|
15
|
+
|
16
|
+
// Version and subversion numbers defined by this header.
#define VERSION 1
#define SUBVERSION 0

#define CHUNK 262144

// Maximum file name length (including .idx)
#define MAXFNAME 512

// Maximum length of the name of a sequence
#define MAXSEQNAME 1024
#define ONEMB 1000000
// Parenthesized so the macro behaves as a single value in any expression
// (e.g. `x / MAXSEQLENGTH` must not expand to `x / 500 * ONEMB`).
#define MAXSEQLENGTH (500*ONEMB)

// Error codes. Negative literals are parenthesized for macro hygiene.
#define INVALID_FASTQ_FORMAT (-5)
#define INVALID_FASTA_FORMAT (-6)
#define MAX_SEQ_SIZE_ERROR (-7)

#define DEBUG 0
#define FALSE 0
#define TRUE 1

// Maximum size of the metadata of a sequence, including name, length of fasta, qual and extras.
// It should be a maximum of 10000
#define SEQ_METADATA 10000
|
40
|
+
|
41
|
+
|
42
|
+
struct file_data {
|
43
|
+
char name[10000];
|
44
|
+
char index_name[10000];
|
45
|
+
gzFile gzf_bin;
|
46
|
+
// int file_bin;
|
47
|
+
gzFile gzf_index;
|
48
|
+
// int file_index;
|
49
|
+
// char file_outname[10000];
|
50
|
+
long long pos_chunk_gz;
|
51
|
+
// Contains the version and subversion of this file
|
52
|
+
int version;
|
53
|
+
int subversion;
|
54
|
+
// bin_search is true when a binary search can be used.
|
55
|
+
int bin_search;
|
56
|
+
// Counts the number of sequences written to the bin file, so it can
|
57
|
+
// decide where to create a new gz chunk
|
58
|
+
long long counter;
|
59
|
+
// If there is an error it is stored here so it can be retrieved.
|
60
|
+
int error;
|
61
|
+
|
62
|
+
int discretize_qual;
|
63
|
+
int flatten_qual;
|
64
|
+
int create_index;
|
65
|
+
};
|
66
|
+
|
67
|
+
// two modes:
|
68
|
+
// 1 .- new files
|
69
|
+
// 2 .- add data to files, if they don't exist they are created
|
70
|
+
int initialize_writes(struct file_data ** file, char *output_name, int mode, int discretize_qual, int flatten_qual, int create_index);
|
71
|
+
|
72
|
+
/*
|
73
|
+
|
74
|
+
write_seq writes a sequence to the files f_bin and its index to f_index
|
75
|
+
pos_chunk_gz is the offset of the beginning of the current gz chunk inside the file
|
76
|
+
seq_name is a pointer to the name of the sequence
|
77
|
+
fasta, quanta and extras are pointers to strings, must be zero terminated.
|
78
|
+
Returns 0 if all goes fine.
|
79
|
+
|
80
|
+
*/
|
81
|
+
|
82
|
+
void inspect_file_data_struct(struct file_data *file);
|
83
|
+
|
84
|
+
// int write_seq(gzFile *f_bin, FILE *f_index, long pos_chunk_gz, char *seq_name, char *fasta, char *quanta, char *extras);
|
85
|
+
int write_seq(struct file_data *file, char *seq_name, char *fasta, char *qual, char *extras);
|
86
|
+
|
87
|
+
int close_writes(struct file_data *file);
|
88
|
+
|
89
|
+
/*
|
90
|
+
read_seq reads from filename the sequence named seq_name and returns its
|
91
|
+
fasta, quanta and extras in those variables.
|
92
|
+
It returns 0 if there are no errors, otherwise it returns:
|
93
|
+
-2 : error opening index file (it doesn't exist)
|
94
|
+
-3 : error reading index file
|
95
|
+
-4 : error sequence not found in index file
|
96
|
+
-5 : error opening file (it doesn't exist)
|
97
|
+
-6 : error reading file
|
98
|
+
-7 : error sequence not found
|
99
|
+
-8 : error uncompressing sequence
|
100
|
+
-9 : EOF
|
101
|
+
*/
|
102
|
+
int read_seq(char *filename, char *seq_name, char **fasta, char **quanta, char **extras);
|
103
|
+
|
104
|
+
// For doing sequential reads of the whole file:
|
105
|
+
// int initialize_sequential_reads(struct file_data *filed, char *filename);
|
106
|
+
int initialize_sequential_reads(struct file_data ** filed, char *filename);
|
107
|
+
|
108
|
+
// return -9 on EOF
|
109
|
+
int read_data_sequential(struct file_data *filed, char **seq_name, char **fasta, char **qual, char **extras);
|
110
|
+
int close_sequential_reads(struct file_data *filed);
|
111
|
+
|
112
|
+
|
113
|
+
/* process_biofile reads from fname (and fname.quanta) and writes to outname
|
114
|
+
(and outname.index) with the binary format
|
115
|
+
Returns 0 if all goes fine */
|
116
|
+
// int process_biofile(char *fname,char *qfname, char *efname, char *outname);
|
117
|
+
|
118
|
+
|
119
|
+
int open_file(char *fname, FILE **file);
|
120
|
+
int close_file(FILE *file);
|
121
|
+
int chomp(char *str);
|
122
|
+
int split_name(char *fname, char *name, char *comments);
|
123
|
+
int get_next_seq_fastq(FILE *file, char **name, char **fasta, char **qual, char **comments);
|
124
|
+
int get_next_seq_fasta(FILE *file, char *name, char *fasta, char *comments);
|
125
|
+
int process_fastq(char *fname, char *efname, char *outname, int discretize_qual, int flatten_qual, int create_index);
|
126
|
+
int process_fasta(char *fname, char *efname, char *outname, int discretize_qual, int flatten_qual, int create_index);
|
127
|
+
|
128
|
+
|
129
|
+
/*
|
130
|
+
Format definition
|
131
|
+
|
132
|
+
Main file that contains chunks compressed in gz
|
133
|
+
For each sequence the information of that sequence is written with the format:
|
134
|
+
28F143CJN01EBIJN 105 312 0
|
135
|
+
|
136
|
+
That is:
|
137
|
+
4 chars for the size of this header, excluding itself, that is, it is the size of
|
138
|
+
the rest of the header
|
139
|
+
sequence name
|
140
|
+
fasta size
|
141
|
+
qual size
|
142
|
+
extras size
|
143
|
+
|
144
|
+
The First sequence can be a special sequence with metainfo for this file:
|
145
|
+
30UMACOMPRESSEDFORMAT_version 0 0 0
|
146
|
+
27UMACOMPRESSEDFORMAT_1 0 0 0
|
147
|
+
|
148
|
+
|
149
|
+
|
150
|
+
Index file
|
151
|
+
|
152
|
+
Compressed using chunks
|
153
|
+
|
154
|
+
At the beginning a special sequence can be used to store metadata
|
155
|
+
like the number of fields, if a binary search can be used, etc.
|
156
|
+
|
157
|
+
That sequence will be:
|
158
|
+
UMACOMPRESSEDFORMAT version binary_search begin_of_sequential_index 0 0
|
159
|
+
|
160
|
+
If binary_search is yes then a metaindex follows to do a fast access to the
|
161
|
+
index data.
|
162
|
+
That will be the first sequence of each chunk and its offset inside the file.
|
163
|
+
(Or perhaps it can be put in another file....)
|
164
|
+
|
165
|
+
|
166
|
+
The rest of the index file will be indexes to the stored sequences, with
|
167
|
+
the following fields separated by spaces:
|
168
|
+
|
169
|
+
F143CJN01ETK00 0 471
|
170
|
+
|
171
|
+
Sequence name
|
172
|
+
begin of the compressed chunk
|
173
|
+
offset inside the chunk of the header of that sequence.
|
174
|
+
|
175
|
+
*/
|
176
|
+
|
177
|
+
int check_error(int error_condition,char *message, int return_value);
|
178
|
+
|
179
|
+
// #ifndef _libfbin
|
180
|
+
//
|
181
|
+
// int check_error(int error_condition,char *message, int return_value){
|
182
|
+
// if (error_condition) {
|
183
|
+
// fprintf(stderr,"Error %d; %s\nMSG:%s\n",errno ,message, strerror(errno));
|
184
|
+
// return return_value;
|
185
|
+
// }
|
186
|
+
// }
|
187
|
+
//
|
188
|
+
// #define _libfbin
|
189
|
+
// #endif
|
190
|
+
|
191
|
+
int free_string(char **string);
|
192
|
+
|
193
|
+
int regenerate_index(char * filename);
|
194
|
+
|
Binary file
|
@@ -0,0 +1,138 @@
|
|
1
|
+
#include "lib_fqbin.h"
|
2
|
+
#include <stdio.h>
|
3
|
+
#include <ctype.h>
|
4
|
+
|
5
|
+
#include <unistd.h>
|
6
|
+
|
7
|
+
|
8
|
+
/*
 * usage: prints the mk_fqbin command-line help to stdout and terminates the
 * process with exit status -1. This function never returns.
 *
 * -F is a plain optional flag in main's getopt string, so it is listed under
 * "Options" rather than under "Mandatory parameters"; -h (also accepted by
 * getopt) is documented as well.
 */
void usage(void)
{
    static const char *const help_lines[] = {
        "mk_fqbin converts a fastQ input file or STDIN stream (use no filename or '-') to compressed fqbin format.\n\n",
        "Usage: mk_fqbin [OPTIONS] -o output_file [fastq_file_input_file]\n\n",
        "Options:\n",
        " -i create random access index\n",
        " -d discretize_qual: quality values are discretized in groups of size discretize_qual. This way less quality values are used and a better compression is obtained\n",
        " -f flatten_qual: quality values over flatten_qual (use phred scale) will be set to flatten_qual value in order to achieve a better compression\n",
        " -e extras_file: a file with extra metadata for each sequence if standard FASTA format\n",
        " -F input is in fasta format (will look for filename.qual for qualities) \n",
        " -h show this help\n",
        "Mandatory parameters:\n",
        " -o output_file: output fqbin file\n",
        "\nSCBI - Supercomputación y Bioinformática. University of Malaga. http://www.scbi.uma.es. Copyright 2011\n\n",
    };
    size_t line;

    /* fputs is used because none of the lines is a format string. */
    for (line = 0; line < sizeof help_lines / sizeof help_lines[0]; line++) {
        fputs(help_lines[line], stdout);
    }

    exit(-1);
}
|
31
|
+
|
32
|
+
/*******************************************************/
|
33
|
+
/* main */
|
34
|
+
/*******************************************************/
|
35
|
+
int main(int argc, char *argv[])
|
36
|
+
{
|
37
|
+
|
38
|
+
int res = 0;
|
39
|
+
|
40
|
+
int ch;
|
41
|
+
|
42
|
+
int output_fasta = 0;
|
43
|
+
int output_qual = 0;
|
44
|
+
|
45
|
+
int flatten_qual=0;
|
46
|
+
int discretize_qual=0;
|
47
|
+
|
48
|
+
char *extras_file=NULL;
|
49
|
+
char *output_file=NULL;
|
50
|
+
int create_index=0;
|
51
|
+
int input_in_fasta=0;
|
52
|
+
|
53
|
+
while ((ch = getopt(argc, argv, "o:e:d:f:iFh")) != -1) {
|
54
|
+
// printf("opt %d\n",ch);
|
55
|
+
switch (ch) {
|
56
|
+
case 'e':
|
57
|
+
// strcopy(extras_file,optarg);
|
58
|
+
extras_file=optarg;
|
59
|
+
break;
|
60
|
+
case 'f':
|
61
|
+
flatten_qual = atoi(optarg)+33;
|
62
|
+
break;
|
63
|
+
case 'd':
|
64
|
+
discretize_qual = atoi(optarg);
|
65
|
+
|
66
|
+
if(discretize_qual<2)
|
67
|
+
{
|
68
|
+
discretize_qual=0;
|
69
|
+
}
|
70
|
+
break;
|
71
|
+
case 'o':
|
72
|
+
output_file=optarg;
|
73
|
+
break;
|
74
|
+
case 'i':
|
75
|
+
create_index=1;
|
76
|
+
break;
|
77
|
+
case 'F':
|
78
|
+
input_in_fasta=1;
|
79
|
+
break;
|
80
|
+
case 'h':
|
81
|
+
usage();
|
82
|
+
break;
|
83
|
+
case '?':
|
84
|
+
default:
|
85
|
+
usage();
|
86
|
+
}
|
87
|
+
}
|
88
|
+
|
89
|
+
argc -= optind;
|
90
|
+
argv += optind;
|
91
|
+
// printf("argc: %d", argc);
|
92
|
+
// printf("argv: %s", argv[0]);
|
93
|
+
|
94
|
+
if(output_file==NULL)
|
95
|
+
{
|
96
|
+
printf("Output file is a mandatory option. Provide one with -o filename\n");
|
97
|
+
usage();
|
98
|
+
exit -1;
|
99
|
+
}
|
100
|
+
|
101
|
+
printf("Extra metadata file: %s\nFlattenning qual over:%d (%c char)\nDistretizing qual in groups of:%d\n",extras_file,flatten_qual,flatten_qual,discretize_qual);
|
102
|
+
|
103
|
+
if (create_index){printf("Creating random access index\n");}
|
104
|
+
|
105
|
+
|
106
|
+
if(input_in_fasta)
|
107
|
+
{
|
108
|
+
|
109
|
+
// check remaining params
|
110
|
+
if (argc==1){
|
111
|
+
res=process_fasta(argv[0],extras_file,output_file,discretize_qual,flatten_qual,create_index);
|
112
|
+
}
|
113
|
+
else if(argc==0)
|
114
|
+
{
|
115
|
+
res=process_fasta("-",extras_file,output_file,discretize_qual,flatten_qual,create_index);
|
116
|
+
}
|
117
|
+
else{
|
118
|
+
usage();
|
119
|
+
}
|
120
|
+
|
121
|
+
}else{
|
122
|
+
// check remaining params
|
123
|
+
if (argc==1){
|
124
|
+
res=process_fastq(argv[0],extras_file,output_file,discretize_qual,flatten_qual,create_index);
|
125
|
+
}
|
126
|
+
else if(argc==0)
|
127
|
+
{
|
128
|
+
res=process_fastq("-",extras_file,output_file,discretize_qual,flatten_qual,create_index);
|
129
|
+
}
|
130
|
+
else{
|
131
|
+
usage();
|
132
|
+
}
|
133
|
+
|
134
|
+
}
|
135
|
+
|
136
|
+
return res;
|
137
|
+
}
|
138
|
+
|
@@ -0,0 +1,915 @@
|
|
1
|
+
/***************************************************************************
|
2
|
+
* Burrows-Wheeler Transform Library
|
3
|
+
*
|
4
|
+
* File : bwxform.c
|
5
|
+
* Purpose : Provides prototypes for functions that apply and reverse the
|
6
|
+
* Burrows-Wheeler transform (with or without move to front
|
7
|
+
* coding/decoding). The algorithms implemented are based upon
|
8
|
+
* those described in "A Block-sorting Lossless Data Compression
|
9
|
+
* Algorithm" by M. Burrows and D.J. Wheeler.
|
10
|
+
* Author : Michael Dipperstein
|
11
|
+
* Date : August 20, 2004
|
12
|
+
*
|
13
|
+
****************************************************************************
|
14
|
+
* UPDATES
|
15
|
+
*
|
16
|
+
* $Id: bwxform.c,v 1.6 2007/09/17 13:21:19 michael Exp $
|
17
|
+
* $Log: bwxform.c,v $
|
18
|
+
* Revision 1.6 2007/09/17 13:21:19 michael
|
19
|
+
* Changes required for LGPL v3.
|
20
|
+
*
|
21
|
+
* Revision 1.5 2005/11/03 15:01:46 michael
|
22
|
+
* Speed up block sorting using the algorithm suggested by the
|
23
|
+
* Burrows-Wheeler paper. Radix sort all rotations by the first
|
24
|
+
* two charcters before employing quicksort.
|
25
|
+
*
|
26
|
+
* Revision 1.4 2005/05/02 13:33:41 michael
|
27
|
+
* Allocate large arrays on heap instead of stack so that gcc builds code
|
28
|
+
* that can handle larger blocks.
|
29
|
+
*
|
30
|
+
* Update e-mail address
|
31
|
+
*
|
32
|
+
* Revision 1.3 2004/08/27 01:24:16 michael
|
33
|
+
* Write S[0] index (I) before transformed block to aviod having to
|
34
|
+
* find I in a partial block.
|
35
|
+
*
|
36
|
+
* Revision 1.2 2004/08/26 06:16:08 michael
|
37
|
+
* Handle partial blocks without need to store block size. Use size
|
38
|
+
* returned by fread() to indicate smaller than standard block.
|
39
|
+
*
|
40
|
+
* Revision 1.1.1.1 2004/08/23 04:34:18 michael
|
41
|
+
* Burrows-Wheeler Transform
|
42
|
+
*
|
43
|
+
****************************************************************************
|
44
|
+
*
|
45
|
+
* bwxform: An ANSI C Burrows-Wheeler Transform/Reverse Transform Routines
|
46
|
+
* Copyright (C) 2004-2005, 2007 by
|
47
|
+
* Michael Dipperstein (mdipper@alumni.engr.ucsb.edu)
|
48
|
+
*
|
49
|
+
* This file is part of the BWT library.
|
50
|
+
*
|
51
|
+
* The BWT library is free software; you can redistribute it and/or modify
|
52
|
+
* it under the terms of the GNU Lesser General Public License as published
|
53
|
+
* by the Free Software Foundation; either version 3 of the License, or (at
|
54
|
+
* your option) any later version.
|
55
|
+
*
|
56
|
+
* The BWT library is distributed in the hope that it will be useful, but
|
57
|
+
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
58
|
+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
|
59
|
+
* General Public License for more details.
|
60
|
+
*
|
61
|
+
* You should have received a copy of the GNU Lesser General Public License
|
62
|
+
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
63
|
+
*
|
64
|
+
***************************************************************************/
|
65
|
+
|
66
|
+
/***************************************************************************
|
67
|
+
* INCLUDED FILES
|
68
|
+
***************************************************************************/
|
69
|
+
#include <stdio.h>
|
70
|
+
#include <stdlib.h>
|
71
|
+
#include <limits.h>
|
72
|
+
#include <string.h>
|
73
|
+
#include "bwxform.h"
|
74
|
+
|
75
|
+
/***************************************************************************
|
76
|
+
* CONSTANTS
|
77
|
+
***************************************************************************/
|
78
|
+
#define BLOCK_SIZE 4096 /* size of blocks */
|
79
|
+
|
80
|
+
#if BLOCK_SIZE > INT_MAX
|
81
|
+
#error BLOCK_SIZE must be <= INT_MAX and maximum size_t
|
82
|
+
#endif
|
83
|
+
|
84
|
+
/* NOTE: Need to find a way to check for maximum size_t */
|
85
|
+
|
86
|
+
/***************************************************************************
|
87
|
+
* TYPE DEFINITIONS
|
88
|
+
***************************************************************************/
|
89
|
+
unsigned char block[BLOCK_SIZE]; /* block being (un)transformed */
|
90
|
+
size_t blockSize; /* actual size of block */
|
91
|
+
|
92
|
+
/* counters and offsets used for radix sorting with characters */
|
93
|
+
unsigned int counters[256];
|
94
|
+
unsigned int offsetTable[256];
|
95
|
+
|
96
|
+
/***************************************************************************
|
97
|
+
* MACROS
|
98
|
+
***************************************************************************/
|
99
|
+
/* wraps array index within array bounds (assumes value < 2 * limit) */
|
100
|
+
#define Wrap(value, limit) (((value) < (limit)) ? (value) : ((value) - (limit)))
|
101
|
+
|
102
|
+
/***************************************************************************
|
103
|
+
* PROTOTYPES
|
104
|
+
***************************************************************************/
|
105
|
+
/* move to front functions */
|
106
|
+
|
107
|
+
void DoMTF(unsigned char *last, int length);
|
108
|
+
void UndoMTF(unsigned char *last, int length);
|
109
|
+
|
110
|
+
/***************************************************************************
|
111
|
+
* FUNCTIONS
|
112
|
+
***************************************************************************/
|
113
|
+
|
114
|
+
/***************************************************************************
|
115
|
+
* Function : ComparePresorted
|
116
|
+
* Description: This comparison function is designed for use with qsort
|
117
|
+
* and "block", a global array of "blockSize" unsigned chars.
|
118
|
+
* It compares two strings in "block" starting at indices
|
119
|
+
* s1 and s2 and ending at indices s1 - 1 and s2 - 1.
|
120
|
+
* The strings are assumed to be presorted so that first two
|
121
|
+
* characters are known to be matching.
|
122
|
+
* Parameters : s1 - The starting index of a string in block
|
123
|
+
* s2 - The starting index of a string in block
|
124
|
+
* Effects : NONE
|
125
|
+
* Returned : > 0 if string s1 > string s2
|
126
|
+
* 0 if string s1 == string s2
|
127
|
+
* < 0 if string s1 < string s2
|
128
|
+
***************************************************************************/
|
129
|
+
int ComparePresorted(const void *s1, const void *s2)
|
130
|
+
{
|
131
|
+
int offset1, offset2;
|
132
|
+
int i;
|
133
|
+
int result;
|
134
|
+
|
135
|
+
offset1 = *((int *)s1);
|
136
|
+
offset2 = *((int *)s2);
|
137
|
+
|
138
|
+
/***********************************************************************
|
139
|
+
* Compare 1 character at a time until there's difference or the end of
|
140
|
+
* the block is reached. Since we're only sorting strings that already
|
141
|
+
* match at the first two characters, start with the third character.
|
142
|
+
***********************************************************************/
|
143
|
+
for(i = 2; i < blockSize; i++)
|
144
|
+
{
|
145
|
+
result = (int)block[Wrap((offset1 + i), blockSize)] -
|
146
|
+
(int)block[Wrap((offset2 + i), blockSize)];
|
147
|
+
|
148
|
+
if (result != 0)
|
149
|
+
{
|
150
|
+
return result;
|
151
|
+
}
|
152
|
+
}
|
153
|
+
|
154
|
+
/* strings are identical */
|
155
|
+
return 0;
|
156
|
+
}
|
157
|
+
|
158
|
+
/***************************************************************************
|
159
|
+
* Function : BWXformFile
|
160
|
+
* Description: This function performs a Burrows-Wheeler transformation
|
161
|
+
* on a file (with optional move to front) and writes the
|
162
|
+
* resulting data to the specified output file. Comments in
|
163
|
+
* this function indicate corresponding variables, labels,
|
164
|
+
* and sections in "A Block-sorting Lossless Data Compression
|
165
|
+
* Algorithm" by M. Burrows and D.J. Wheeler.
|
166
|
+
* Parameters : inFile - Name of file to transform
|
167
|
+
* outFile - Name of file to write transformed output to
|
168
|
+
* mtf - Set to TRUE if move to front coding should be
|
169
|
+
* applied.
|
170
|
+
* Effects : A Burrows-Wheeler transformation (and possibly move to
|
171
|
+
* front encoding) is applied to inFile. The results of
|
172
|
+
* the transformation are written to outFile.
|
173
|
+
* Returned : TRUE for success, otherwise FALSE.
|
174
|
+
***************************************************************************/
|
175
|
+
int BWXformFile(char *inFile, char *outFile, char mtf)
|
176
|
+
{
|
177
|
+
int i, j, k;
|
178
|
+
FILE *fpIn, *fpOut;
|
179
|
+
unsigned int *rotationIdx; /* index of first char in rotation */
|
180
|
+
unsigned int *v; /* index of radix sorted charaters */
|
181
|
+
int s0Idx; /* index of S0 in rotations (I) */
|
182
|
+
unsigned char *last; /* last characters from sorted rotations */
|
183
|
+
|
184
|
+
/***********************************************************************
|
185
|
+
* BLOCK_SIZE arrays are allocated on the heap, because gcc generates
|
186
|
+
* code that throws a Segmentation fault when the large arrays are
|
187
|
+
* allocated on the stack.
|
188
|
+
***********************************************************************/
|
189
|
+
rotationIdx = (unsigned int *)malloc(BLOCK_SIZE * sizeof(unsigned int));
|
190
|
+
|
191
|
+
if (NULL == rotationIdx)
|
192
|
+
{
|
193
|
+
perror("Allocating array of rotation indices");
|
194
|
+
return FALSE;
|
195
|
+
}
|
196
|
+
|
197
|
+
v = (unsigned int *)malloc(BLOCK_SIZE * sizeof(unsigned int));
|
198
|
+
|
199
|
+
if (v == rotationIdx)
|
200
|
+
{
|
201
|
+
perror("Allocating array of sort indices");
|
202
|
+
free(rotationIdx);
|
203
|
+
return FALSE;
|
204
|
+
}
|
205
|
+
|
206
|
+
last = (unsigned char *)malloc(BLOCK_SIZE * sizeof(unsigned char));
|
207
|
+
|
208
|
+
if (NULL == last)
|
209
|
+
{
|
210
|
+
perror("Allocating array of last characters");
|
211
|
+
free(rotationIdx);
|
212
|
+
free(v);
|
213
|
+
return FALSE;
|
214
|
+
}
|
215
|
+
|
216
|
+
/* open input and output files */
|
217
|
+
if ((fpIn = fopen(inFile, "rb")) == NULL)
|
218
|
+
{
|
219
|
+
perror(inFile);
|
220
|
+
return FALSE;
|
221
|
+
}
|
222
|
+
|
223
|
+
if (outFile == NULL)
|
224
|
+
{
|
225
|
+
fpOut = stdout;
|
226
|
+
}
|
227
|
+
else
|
228
|
+
{
|
229
|
+
if ((fpOut = fopen(outFile, "wb")) == NULL)
|
230
|
+
{
|
231
|
+
fclose(fpIn);
|
232
|
+
perror(outFile);
|
233
|
+
return FALSE;
|
234
|
+
}
|
235
|
+
}
|
236
|
+
|
237
|
+
while((blockSize = fread(block, sizeof(unsigned char), BLOCK_SIZE, fpIn))
|
238
|
+
!= 0)
|
239
|
+
{
|
240
|
+
/*******************************************************************
|
241
|
+
* Sort the rotated strings in the block. A radix sort is performed
|
242
|
+
* on the first to characters of all the rotated strings (2nd
|
243
|
+
* character then 1st). All rotated strings with matching initial
|
244
|
+
* characters are then quicksorted. - Q4..Q7
|
245
|
+
*******************************************************************/
|
246
|
+
|
247
|
+
/*** radix sort on second character in rotation ***/
|
248
|
+
|
249
|
+
/* count number of characters for radix sort */
|
250
|
+
memset(counters, 0, 256 * sizeof(int));
|
251
|
+
for (i = 0; i < blockSize; i++)
|
252
|
+
{
|
253
|
+
counters[block[i]]++;
|
254
|
+
}
|
255
|
+
|
256
|
+
offsetTable[0] = 0;
|
257
|
+
|
258
|
+
for(i = 1; i < 256; i++)
|
259
|
+
{
|
260
|
+
/* determine number of values before those sorted under i */
|
261
|
+
offsetTable[i] = offsetTable[i - 1] + counters[i - 1];
|
262
|
+
}
|
263
|
+
|
264
|
+
/* sort on 2nd character */
|
265
|
+
for (i = 0; i < blockSize - 1; i++)
|
266
|
+
{
|
267
|
+
j = block[i + 1];
|
268
|
+
v[offsetTable[j]] = i;
|
269
|
+
offsetTable[j] = offsetTable[j] + 1;
|
270
|
+
}
|
271
|
+
|
272
|
+
/* handle wrap around for string starting at end of block */
|
273
|
+
j = block[0];
|
274
|
+
v[offsetTable[j]] = i;
|
275
|
+
offsetTable[0] = 0;
|
276
|
+
|
277
|
+
/*** radix sort on first character in rotation ***/
|
278
|
+
|
279
|
+
for(i = 1; i < 256; i++)
|
280
|
+
{
|
281
|
+
/* determine number of values before those sorted under i */
|
282
|
+
offsetTable[i] = offsetTable[i - 1] + counters[i - 1];
|
283
|
+
}
|
284
|
+
|
285
|
+
for (i = 0; i < blockSize; i++)
|
286
|
+
{
|
287
|
+
j = v[i];
|
288
|
+
j = block[j];
|
289
|
+
rotationIdx[offsetTable[j]] = v[i];
|
290
|
+
offsetTable[j] = offsetTable[j] + 1;
|
291
|
+
}
|
292
|
+
|
293
|
+
/*******************************************************************
|
294
|
+
* now rotationIdx contains the sort order of all strings sorted
|
295
|
+
* by their first 2 characters. Use qsort to sort the strings
|
296
|
+
* that have their first two characters matching.
|
297
|
+
*******************************************************************/
|
298
|
+
for (i = 0, k = 0; (i <= UCHAR_MAX) && (k < (blockSize - 1)); i++)
|
299
|
+
{
|
300
|
+
for (j = 0; (j <= UCHAR_MAX) && (k < (blockSize - 1)); j++)
|
301
|
+
{
|
302
|
+
int first = k;
|
303
|
+
|
304
|
+
/* count strings starting with ij */
|
305
|
+
while ((i == block[rotationIdx[k]]) &&
|
306
|
+
(j == block[Wrap(rotationIdx[k] + 1, blockSize)]))
|
307
|
+
{
|
308
|
+
k++;
|
309
|
+
|
310
|
+
if (k == blockSize)
|
311
|
+
{
|
312
|
+
/* we've searched the whole block */
|
313
|
+
break;
|
314
|
+
}
|
315
|
+
}
|
316
|
+
|
317
|
+
if (k - first > 1)
|
318
|
+
{
|
319
|
+
/* there are at least 2 strings staring with ij, sort them */
|
320
|
+
qsort(&rotationIdx[first], k - first, sizeof(int),
|
321
|
+
ComparePresorted);
|
322
|
+
}
|
323
|
+
}
|
324
|
+
}
|
325
|
+
|
326
|
+
/* find last characters of rotations (L) - C2 */
|
327
|
+
s0Idx = 0;
|
328
|
+
for (i = 0; i < blockSize; i++)
|
329
|
+
{
|
330
|
+
if (rotationIdx[i] != 0)
|
331
|
+
{
|
332
|
+
last[i] = block[rotationIdx[i] - 1];
|
333
|
+
}
|
334
|
+
else
|
335
|
+
{
|
336
|
+
/* unrotated string 1st character is end of string */
|
337
|
+
s0Idx = i;
|
338
|
+
last[i] = block[blockSize - 1];
|
339
|
+
}
|
340
|
+
}
|
341
|
+
|
342
|
+
if (mtf)
|
343
|
+
{
|
344
|
+
DoMTF(last, blockSize);
|
345
|
+
}
|
346
|
+
|
347
|
+
/* write index of end of unrotated string (I) */
|
348
|
+
fwrite(&s0Idx, sizeof(int), 1, fpOut);
|
349
|
+
|
350
|
+
/* write out last characters of rotations (L) */
|
351
|
+
fwrite(last, sizeof(unsigned char), blockSize, fpOut);
|
352
|
+
}
|
353
|
+
|
354
|
+
/* clean up */
|
355
|
+
free(rotationIdx);
|
356
|
+
free(v);
|
357
|
+
free(last);
|
358
|
+
fclose(fpIn);
|
359
|
+
fclose(fpOut);
|
360
|
+
return TRUE;
|
361
|
+
}
|
362
|
+
|
363
|
+
|
364
|
+
|
365
|
+
int BWXform(char *inString, char *outString, int mtf)
|
366
|
+
{
|
367
|
+
int i, j, k;
|
368
|
+
FILE *fpIn, *fpOut;
|
369
|
+
unsigned int *rotationIdx; /* index of first char in rotation */
|
370
|
+
unsigned int *v; /* index of radix sorted charaters */
|
371
|
+
int s0Idx; /* index of S0 in rotations (I) */
|
372
|
+
unsigned char *last; /* last characters from sorted rotations */
|
373
|
+
|
374
|
+
/***********************************************************************
|
375
|
+
* BLOCK_SIZE arrays are allocated on the heap, because gcc generates
|
376
|
+
* code that throws a Segmentation fault when the large arrays are
|
377
|
+
* allocated on the stack.
|
378
|
+
***********************************************************************/
|
379
|
+
rotationIdx = (unsigned int *)malloc(BLOCK_SIZE * sizeof(unsigned int));
|
380
|
+
|
381
|
+
if (NULL == rotationIdx)
|
382
|
+
{
|
383
|
+
perror("Allocating array of rotation indices");
|
384
|
+
return FALSE;
|
385
|
+
}
|
386
|
+
|
387
|
+
v = (unsigned int *)malloc(BLOCK_SIZE * sizeof(unsigned int));
|
388
|
+
|
389
|
+
if (v == rotationIdx)
|
390
|
+
{
|
391
|
+
perror("Allocating array of sort indices");
|
392
|
+
free(rotationIdx);
|
393
|
+
return FALSE;
|
394
|
+
}
|
395
|
+
|
396
|
+
last = (unsigned char *)malloc(BLOCK_SIZE * sizeof(unsigned char));
|
397
|
+
|
398
|
+
if (NULL == last)
|
399
|
+
{
|
400
|
+
perror("Allocating array of last characters");
|
401
|
+
free(rotationIdx);
|
402
|
+
free(v);
|
403
|
+
return FALSE;
|
404
|
+
}
|
405
|
+
|
406
|
+
strcpy(block,inString);
|
407
|
+
blockSize=strlen(inString);
|
408
|
+
block[blockSize]='\0';
|
409
|
+
// printf("block:%s, SIZE: %ld\n",block,blockSize);
|
410
|
+
|
411
|
+
// while((blockSize = fread(block, sizeof(unsigned char), BLOCK_SIZE, fpIn))
|
412
|
+
// != 0)
|
413
|
+
// {
|
414
|
+
/*******************************************************************
|
415
|
+
* Sort the rotated strings in the block. A radix sort is performed
|
416
|
+
* on the first to characters of all the rotated strings (2nd
|
417
|
+
* character then 1st). All rotated strings with matching initial
|
418
|
+
* characters are then quicksorted. - Q4..Q7
|
419
|
+
*******************************************************************/
|
420
|
+
|
421
|
+
/*** radix sort on second character in rotation ***/
|
422
|
+
|
423
|
+
/* count number of characters for radix sort */
|
424
|
+
memset(counters, 0, 256 * sizeof(int));
|
425
|
+
for (i = 0; i < blockSize; i++)
|
426
|
+
{
|
427
|
+
counters[block[i]]++;
|
428
|
+
}
|
429
|
+
|
430
|
+
offsetTable[0] = 0;
|
431
|
+
|
432
|
+
for(i = 1; i < 256; i++)
|
433
|
+
{
|
434
|
+
/* determine number of values before those sorted under i */
|
435
|
+
offsetTable[i] = offsetTable[i - 1] + counters[i - 1];
|
436
|
+
}
|
437
|
+
|
438
|
+
/* sort on 2nd character */
|
439
|
+
for (i = 0; i < blockSize - 1; i++)
|
440
|
+
{
|
441
|
+
j = block[i + 1];
|
442
|
+
v[offsetTable[j]] = i;
|
443
|
+
offsetTable[j] = offsetTable[j] + 1;
|
444
|
+
}
|
445
|
+
|
446
|
+
/* handle wrap around for string starting at end of block */
|
447
|
+
j = block[0];
|
448
|
+
v[offsetTable[j]] = i;
|
449
|
+
offsetTable[0] = 0;
|
450
|
+
|
451
|
+
/*** radix sort on first character in rotation ***/
|
452
|
+
|
453
|
+
for(i = 1; i < 256; i++)
|
454
|
+
{
|
455
|
+
/* determine number of values before those sorted under i */
|
456
|
+
offsetTable[i] = offsetTable[i - 1] + counters[i - 1];
|
457
|
+
}
|
458
|
+
|
459
|
+
for (i = 0; i < blockSize; i++)
|
460
|
+
{
|
461
|
+
j = v[i];
|
462
|
+
j = block[j];
|
463
|
+
rotationIdx[offsetTable[j]] = v[i];
|
464
|
+
offsetTable[j] = offsetTable[j] + 1;
|
465
|
+
}
|
466
|
+
|
467
|
+
/*******************************************************************
|
468
|
+
* now rotationIdx contains the sort order of all strings sorted
|
469
|
+
* by their first 2 characters. Use qsort to sort the strings
|
470
|
+
* that have their first two characters matching.
|
471
|
+
*******************************************************************/
|
472
|
+
for (i = 0, k = 0; (i <= UCHAR_MAX) && (k < (blockSize - 1)); i++)
|
473
|
+
{
|
474
|
+
for (j = 0; (j <= UCHAR_MAX) && (k < (blockSize - 1)); j++)
|
475
|
+
{
|
476
|
+
int first = k;
|
477
|
+
|
478
|
+
/* count strings starting with ij */
|
479
|
+
while ((i == block[rotationIdx[k]]) &&
|
480
|
+
(j == block[Wrap(rotationIdx[k] + 1, blockSize)]))
|
481
|
+
{
|
482
|
+
k++;
|
483
|
+
|
484
|
+
if (k == blockSize)
|
485
|
+
{
|
486
|
+
/* we've searched the whole block */
|
487
|
+
break;
|
488
|
+
}
|
489
|
+
}
|
490
|
+
|
491
|
+
if (k - first > 1)
|
492
|
+
{
|
493
|
+
/* there are at least 2 strings staring with ij, sort them */
|
494
|
+
qsort(&rotationIdx[first], k - first, sizeof(int),
|
495
|
+
ComparePresorted);
|
496
|
+
}
|
497
|
+
}
|
498
|
+
}
|
499
|
+
|
500
|
+
/* find last characters of rotations (L) - C2 */
|
501
|
+
s0Idx = 0;
|
502
|
+
for (i = 0; i < blockSize; i++)
|
503
|
+
{
|
504
|
+
if (rotationIdx[i] != 0)
|
505
|
+
{
|
506
|
+
last[i] = block[rotationIdx[i] - 1];
|
507
|
+
}
|
508
|
+
else
|
509
|
+
{
|
510
|
+
/* unrotated string 1st character is end of string */
|
511
|
+
s0Idx = i;
|
512
|
+
last[i] = block[blockSize - 1];
|
513
|
+
}
|
514
|
+
}
|
515
|
+
// printf("ANTES:\n");
|
516
|
+
// for (i = 0; i < blockSize; i++)
|
517
|
+
// {
|
518
|
+
// printf("%2d,",last[i]);
|
519
|
+
// }
|
520
|
+
// printf("\nDESPUES:\n");
|
521
|
+
|
522
|
+
if (mtf)
|
523
|
+
{
|
524
|
+
DoMTF(last, blockSize);
|
525
|
+
}
|
526
|
+
|
527
|
+
// for (i = 0; i < blockSize; i++)
|
528
|
+
// {
|
529
|
+
// printf("%2d,",last[i]);
|
530
|
+
// }
|
531
|
+
// printf("\n");
|
532
|
+
|
533
|
+
// /* write index of end of unrotated string (I) */
|
534
|
+
// fwrite(&s0Idx, sizeof(int), 1, fpOut);
|
535
|
+
//
|
536
|
+
// /* write out last characters of rotations (L) */
|
537
|
+
// fwrite(last, sizeof(unsigned char), blockSize, fpOut);
|
538
|
+
// } //FIN WHILE
|
539
|
+
|
540
|
+
|
541
|
+
memcpy((char *)outString, (void *)last, sizeof(unsigned char) * blockSize);
|
542
|
+
// strncpy(outString,last,blockSize+1);
|
543
|
+
// blockSize=strlen(last);
|
544
|
+
// printf("block2:%s, SIZE: %ld\n",last,strlen(last));
|
545
|
+
|
546
|
+
|
547
|
+
/* clean up */
|
548
|
+
free(rotationIdx);
|
549
|
+
free(v);
|
550
|
+
free(last);
|
551
|
+
// fclose(fpIn);
|
552
|
+
// fclose(fpOut);
|
553
|
+
return TRUE;
|
554
|
+
}
|
555
|
+
|
556
|
+
/***************************************************************************
*   Function   : DoMTF
*   Description: This function performs move to front encoding on a block
*                of data that has already had the Burrows-Wheeler
*                transformation applied to it.  Comments in this function
*                indicate corresponding variables, labels, and sections in
*                "A Block-sorting Lossless Data Compression Algorithm" by
*                M. Burrows and D.J. Wheeler.
*   Parameters : last - pointer to an array of "last" characters from
*                       Burrows-Wheeler rotations (L)
*                length - the number of unsigned chars contained in last.
*   Effects    : Move to front encoding is applied to the array of last
*                characters.  Each character is replaced, in place, by its
*                current position in the move to front list (R), so no
*                scratch buffer is needed and the encoding cannot fail.
*                Nothing happens when last is NULL or length is not
*                positive.
*   Returned   : NONE
***************************************************************************/
void DoMTF(unsigned char *last, int length)
{
    unsigned char list[UCHAR_MAX + 1];      /* list of characters (Y) */
    int i, j;

    if ((NULL == last) || (length <= 0))
    {
        return;
    }

    /* start with alphabetically sorted list of characters */
    for (i = 0; i <= UCHAR_MAX; i++)
    {
        list[i] = (unsigned char)i;
    }

    /* move-to-front coding - M1 */
    for (i = 0; i < length; i++)
    {
        const unsigned char current = last[i];

        /*******************************************************************
        * Find the character in the list of characters.  A sequential
        * search is used because move to front keeps common characters
        * near the front of the list.  The list always holds every
        * character value exactly once, so the search must succeed.
        *******************************************************************/
        for (j = 0; list[j] != current; j++)
        {
            /* keep looking */
        }

        /* the position in the list is the encoded value */
        last[i] = (unsigned char)j;

        /* now move the current character to the front of the list */
        for (; j > 0; j--)
        {
            list[j] = list[j - 1];
        }

        list[0] = current;
    }

    return;
}
|
629
|
+
|
630
|
+
/***************************************************************************
|
631
|
+
* Function : BWReverseXformFile
|
632
|
+
* Description: This function reverses a Burrows-Wheeler transformation
|
633
|
+
* on a file (with optional move to front) and writes the
|
634
|
+
* resulting data to the specified output file. Comments in
|
635
|
+
* this function indicate corresponding variables, labels,
|
636
|
+
* and sections in "A Block-sorting Lossless Data Compression
|
637
|
+
* Algorithm" by M. Burrows and D.J. Wheeler.
|
638
|
+
* Parameters : inFile - Name of file to reverse transform
|
639
|
+
* outFile - Name of file to write reverse transformed
|
640
|
+
* output to
|
641
|
+
* mtf - Set to TRUE if move to front decoding should be
|
642
|
+
* applied
|
643
|
+
* Effects : A Burrows-Wheeler reverse transformation (and possibly
|
644
|
+
* move to front encoding) is applied to inFile. The results
|
645
|
+
* of the reverse transformation are written to outFile.
|
646
|
+
* Returned : TRUE for success, otherwise FALSE.
|
647
|
+
***************************************************************************/
|
648
|
+
int BWReverseXformFile(char *inFile, char *outFile, char mtf)
|
649
|
+
{
|
650
|
+
FILE *fpIn, *fpOut;
|
651
|
+
int i, j, sum;
|
652
|
+
int count[UCHAR_MAX + 1]; /* count[i] = # of chars in block <= i */
|
653
|
+
int *pred; /* pred[i] = # of times block[i] appears in
|
654
|
+
block[0 .. i - 1] */
|
655
|
+
unsigned char *unrotated; /* original block */
|
656
|
+
int s0Idx; /* index of S0 in rotations (I) */
|
657
|
+
|
658
|
+
/***********************************************************************
|
659
|
+
* BLOCK_SIZE arrays are allocated on the heap, because gcc generates
|
660
|
+
* code that throws a Segmentation fault when the large arrays are
|
661
|
+
* allocated on the stack.
|
662
|
+
***********************************************************************/
|
663
|
+
pred = (int *)malloc(BLOCK_SIZE * sizeof(int));
|
664
|
+
|
665
|
+
if (NULL == pred)
|
666
|
+
{
|
667
|
+
perror("Allocating array of matching predicessors");
|
668
|
+
return FALSE;
|
669
|
+
}
|
670
|
+
|
671
|
+
unrotated = (unsigned char *)malloc(BLOCK_SIZE * sizeof(unsigned char));
|
672
|
+
|
673
|
+
if (NULL == unrotated)
|
674
|
+
{
|
675
|
+
perror("Allocating array to store unrotated block");
|
676
|
+
free(pred);
|
677
|
+
return FALSE;
|
678
|
+
}
|
679
|
+
|
680
|
+
/* open input and output files */
|
681
|
+
if ((fpIn = fopen(inFile, "rb")) == NULL)
|
682
|
+
{
|
683
|
+
perror(inFile);
|
684
|
+
return FALSE;
|
685
|
+
}
|
686
|
+
|
687
|
+
if (outFile == NULL)
|
688
|
+
{
|
689
|
+
fpOut = stdout;
|
690
|
+
}
|
691
|
+
else
|
692
|
+
{
|
693
|
+
if ((fpOut = fopen(outFile, "wb")) == NULL)
|
694
|
+
{
|
695
|
+
fclose(fpIn);
|
696
|
+
perror(outFile);
|
697
|
+
return FALSE;
|
698
|
+
}
|
699
|
+
}
|
700
|
+
|
701
|
+
while(fread(&s0Idx, sizeof(int), 1, fpIn) != 0)
|
702
|
+
{
|
703
|
+
blockSize = fread(block, sizeof(unsigned char), BLOCK_SIZE, fpIn);
|
704
|
+
|
705
|
+
if(mtf)
|
706
|
+
{
|
707
|
+
UndoMTF(block, blockSize);
|
708
|
+
}
|
709
|
+
|
710
|
+
/* code based on pseudo code from section 4.2 (D1 and D2) follows */
|
711
|
+
for(i = 0; i <= UCHAR_MAX; i++)
|
712
|
+
{
|
713
|
+
count[i] = 0;
|
714
|
+
}
|
715
|
+
|
716
|
+
/*******************************************************************
|
717
|
+
* Set pred[i] to the number of times block[i] appears in the
|
718
|
+
* substring block[0 .. i - 1]. As a useful side effect count[i]
|
719
|
+
* will be the number of times character i appears in block.
|
720
|
+
*******************************************************************/
|
721
|
+
for (i = 0; i < blockSize; i++)
|
722
|
+
{
|
723
|
+
pred[i] = count[block[i]];
|
724
|
+
count[block[i]]++;
|
725
|
+
}
|
726
|
+
|
727
|
+
/*******************************************************************
|
728
|
+
* Finally, set count[i] to the number of characters in block
|
729
|
+
* lexicographically less than i.
|
730
|
+
*******************************************************************/
|
731
|
+
sum = 0;
|
732
|
+
for(i = 0; i <= UCHAR_MAX; i++)
|
733
|
+
{
|
734
|
+
j = count[i];
|
735
|
+
count[i] = sum;
|
736
|
+
sum += j;
|
737
|
+
}
|
738
|
+
|
739
|
+
/* construct the initial unrotated string (S[0]) */
|
740
|
+
i = s0Idx;
|
741
|
+
for(j = blockSize - 1; j >= 0; j--)
|
742
|
+
{
|
743
|
+
unrotated[j] = block[i];
|
744
|
+
i = pred[i] + count[block[i]];
|
745
|
+
}
|
746
|
+
|
747
|
+
fwrite(unrotated, sizeof(unsigned char), blockSize, fpOut);
|
748
|
+
}
|
749
|
+
|
750
|
+
/* clean up */
|
751
|
+
free(pred);
|
752
|
+
free(unrotated);
|
753
|
+
fclose(fpIn);
|
754
|
+
fclose(fpOut);
|
755
|
+
return TRUE;
|
756
|
+
}
|
757
|
+
|
758
|
+
|
759
|
+
int BWReverseXform(char *inString, char *outString, int mtf, long size)
|
760
|
+
{
|
761
|
+
// FILE *fpIn, *fpOut;
|
762
|
+
int i, j, sum;
|
763
|
+
int count[UCHAR_MAX + 1]; /* count[i] = # of chars in block <= i */
|
764
|
+
int *pred; /* pred[i] = # of times block[i] appears in
|
765
|
+
block[0 .. i - 1] */
|
766
|
+
unsigned char *unrotated; /* original block */
|
767
|
+
int s0Idx; /* index of S0 in rotations (I) */
|
768
|
+
|
769
|
+
/***********************************************************************
|
770
|
+
* BLOCK_SIZE arrays are allocated on the heap, because gcc generates
|
771
|
+
* code that throws a Segmentation fault when the large arrays are
|
772
|
+
* allocated on the stack.
|
773
|
+
***********************************************************************/
|
774
|
+
pred = (int *)malloc(BLOCK_SIZE * sizeof(int));
|
775
|
+
|
776
|
+
if (NULL == pred)
|
777
|
+
{
|
778
|
+
perror("Allocating array of matching predicessors");
|
779
|
+
return FALSE;
|
780
|
+
}
|
781
|
+
|
782
|
+
unrotated = (unsigned char *)malloc(BLOCK_SIZE * sizeof(unsigned char));
|
783
|
+
|
784
|
+
if (NULL == unrotated)
|
785
|
+
{
|
786
|
+
perror("Allocating array to store unrotated block");
|
787
|
+
free(pred);
|
788
|
+
return FALSE;
|
789
|
+
}
|
790
|
+
|
791
|
+
// while(fread(&s0Idx, sizeof(int), 1, fpIn) != 0)
|
792
|
+
// {
|
793
|
+
// blockSize = fread(block, sizeof(unsigned char), BLOCK_SIZE, fpIn);
|
794
|
+
|
795
|
+
blockSize=size;
|
796
|
+
strncpy(block,inString,blockSize);
|
797
|
+
|
798
|
+
if(mtf)
|
799
|
+
{
|
800
|
+
UndoMTF(block, blockSize);
|
801
|
+
}
|
802
|
+
|
803
|
+
/* code based on pseudo code from section 4.2 (D1 and D2) follows */
|
804
|
+
for(i = 0; i <= UCHAR_MAX; i++)
|
805
|
+
{
|
806
|
+
count[i] = 0;
|
807
|
+
}
|
808
|
+
|
809
|
+
/*******************************************************************
|
810
|
+
* Set pred[i] to the number of times block[i] appears in the
|
811
|
+
* substring block[0 .. i - 1]. As a useful side effect count[i]
|
812
|
+
* will be the number of times character i appears in block.
|
813
|
+
*******************************************************************/
|
814
|
+
for (i = 0; i < blockSize; i++)
|
815
|
+
{
|
816
|
+
pred[i] = count[block[i]];
|
817
|
+
count[block[i]]++;
|
818
|
+
}
|
819
|
+
|
820
|
+
/*******************************************************************
|
821
|
+
* Finally, set count[i] to the number of characters in block
|
822
|
+
* lexicographically less than i.
|
823
|
+
*******************************************************************/
|
824
|
+
sum = 0;
|
825
|
+
for(i = 0; i <= UCHAR_MAX; i++)
|
826
|
+
{
|
827
|
+
j = count[i];
|
828
|
+
count[i] = sum;
|
829
|
+
sum += j;
|
830
|
+
}
|
831
|
+
|
832
|
+
/* construct the initial unrotated string (S[0]) */
|
833
|
+
i = s0Idx;
|
834
|
+
for(j = blockSize - 1; j >= 0; j--)
|
835
|
+
{
|
836
|
+
unrotated[j] = block[i];
|
837
|
+
i = pred[i] + count[block[i]];
|
838
|
+
}
|
839
|
+
|
840
|
+
// fwrite(unrotated, sizeof(unsigned char), blockSize, fpOut);
|
841
|
+
// }
|
842
|
+
|
843
|
+
strncpy(outString,unrotated,blockSize);
|
844
|
+
|
845
|
+
/* clean up */
|
846
|
+
free(pred);
|
847
|
+
free(unrotated);
|
848
|
+
// fclose(fpIn);
|
849
|
+
// fclose(fpOut);
|
850
|
+
//
|
851
|
+
return TRUE;
|
852
|
+
}
|
853
|
+
|
854
|
+
/***************************************************************************
*   Function   : UndoMTF
*   Description: This function reverses move to front encoding on a block
*                of data that has already had the Burrows-Wheeler
*                transformation applied to it.  Comments in this function
*                indicate corresponding variables, labels, and sections in
*                "A Block-sorting Lossless Data Compression Algorithm" by
*                M. Burrows and D.J. Wheeler.
*   Parameters : last - pointer to an array of mtf encoded characters from
*                       Burrows-Wheeler rotations.
*                length - the number of unsigned chars contained in last.
*   Effects    : Move to front encoding is reversed on the array.  Each
*                encoded value is replaced, in place, by the character it
*                stands for, leaving the array of last characters of the
*                sorted rotations (L).  No scratch buffer is needed, so
*                the decoding cannot fail.  Nothing happens when last is
*                NULL or length is not positive.
*   Returned   : NONE
***************************************************************************/
void UndoMTF(unsigned char *last, int length)
{
    unsigned char list[UCHAR_MAX + 1];      /* list of characters (Y) */
    int i, j;

    if ((NULL == last) || (length <= 0))
    {
        return;
    }

    /* start with alphabetically sorted list of characters */
    for (i = 0; i <= UCHAR_MAX; i++)
    {
        list[i] = (unsigned char)i;
    }

    /* move-to-front decoding - W2 */
    for (i = 0; i < length; i++)
    {
        unsigned char decoded;

        /* the encoded value is the character's position in the list */
        j = last[i];
        decoded = list[j];

        /* now move the current character to the front of the list */
        for (; j > 0; j--)
        {
            list[j] = list[j - 1];
        }

        list[0] = decoded;

        /* the encoded value has been consumed, so it can be replaced */
        last[i] = decoded;
    }

    return;
}
|