scbi_fqbin 0.2.2 → 0.2.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (116) hide show
  1. checksums.yaml +7 -0
  2. data/.DS_Store +0 -0
  3. data/.gitignore +14 -0
  4. data/Gemfile +4 -0
  5. data/LICENSE.txt +22 -0
  6. data/{README.rdoc → README.md} +0 -0
  7. data/Rakefile +8 -28
  8. data/lib/scbi_fqbin.rb +3 -5
  9. data/lib/scbi_fqbin/fastabin.rb +411 -0
  10. data/lib/scbi_fqbin/fastq_file_c.rb +373 -0
  11. data/lib/scbi_fqbin/fbin_file.rb +1 -1
  12. data/lib/scbi_fqbin/t.rb +9 -0
  13. data/lib/scbi_fqbin/t2.rb +12 -0
  14. data/lib/scbi_fqbin/version.rb +3 -0
  15. data/lib_fqbin_src.zip +0 -0
  16. data/lib_fqbin_src/Makefile +66 -0
  17. data/lib_fqbin_src/fq +0 -0
  18. data/lib_fqbin_src/fq.c +165 -0
  19. data/lib_fqbin_src/hash_fqbin +0 -0
  20. data/lib_fqbin_src/hash_fqbin.c +212 -0
  21. data/lib_fqbin_src/idx_fqbin +21 -0
  22. data/lib_fqbin_src/iterate_fqbin +0 -0
  23. data/lib_fqbin_src/iterate_fqbin.c +136 -0
  24. data/lib_fqbin_src/lib_fqbin.c +1748 -0
  25. data/lib_fqbin_src/lib_fqbin.h +194 -0
  26. data/lib_fqbin_src/mk_fqbin +0 -0
  27. data/lib_fqbin_src/mk_fqbin.c +138 -0
  28. data/lib_fqbin_src/other/bwxform.c +915 -0
  29. data/lib_fqbin_src/other/bwxform.h +74 -0
  30. data/lib_fqbin_src/other/find_in_index.c +130 -0
  31. data/lib_fqbin_src/other/hash_fbin_nogzchunks.c +164 -0
  32. data/lib_fqbin_src/other/idx_fqbin +0 -0
  33. data/lib_fqbin_src/other/idx_fqbin.c +67 -0
  34. data/lib_fqbin_src/other/make_hsh.sh +14 -0
  35. data/lib_fqbin_src/other/rd_extras_fbin.c +45 -0
  36. data/lib_fqbin_src/read_fq +0 -0
  37. data/lib_fqbin_src/read_fq.c +143 -0
  38. data/lib_fqbin_src/read_fqbin +0 -0
  39. data/lib_fqbin_src/read_fqbin.c +101 -0
  40. data/lib_fqbin_src/sort_index +9 -0
  41. data/lib_fqbin_src/test.rb +13 -0
  42. data/scbi_fqbin.gemspec +25 -0
  43. data/test/build.rake +15 -0
  44. data/test/fbinfile +0 -0
  45. data/test/fbinfile.index +0 -0
  46. data/test/no_test_fill_file.rb +66 -0
  47. data/test/old/app.rb +43 -0
  48. data/test/old/bin/iterate_fastabin.rb +54 -0
  49. data/test/old/bin/mk_fastabin.rb +22 -0
  50. data/test/old/bin/rd_fastabin.rb +36 -0
  51. data/test/old/bin/rd_fq.rb +20 -0
  52. data/test/old/bioruby.rb +27 -0
  53. data/test/old/c/Makefile +34 -0
  54. data/test/old/c/fbin_lib.zip +0 -0
  55. data/test/old/c/iterate_fbin.c +54 -0
  56. data/test/old/c/libreria_gz.c +707 -0
  57. data/test/old/c/libreria_gz.h +127 -0
  58. data/test/old/c/main.c +86 -0
  59. data/test/old/c/mk_fbin.c +24 -0
  60. data/test/old/c/rd_seq_fbin.c +44 -0
  61. data/test/old/c/test_ffi/a.out +0 -0
  62. data/test/old/c/test_ffi/app.c +26 -0
  63. data/test/old/c/test_ffi/app.rb +19 -0
  64. data/test/old/c/test_ffi/liblibreria_gz.dylib +0 -0
  65. data/test/old/c/test_ffi/libmylibrary.dylib +0 -0
  66. data/test/old/c/test_ffi/my_library.rb +23 -0
  67. data/test/old/c/test_ffi/mylibrary.c +22 -0
  68. data/test/old/c/test_ffi/mylibrary.h +6 -0
  69. data/test/old/c/usage_instructions.txt +62 -0
  70. data/test/old/ext/Makefile +187 -0
  71. data/test/old/ext/Makefile.dario +34 -0
  72. data/test/old/ext/extconf.rb +8 -0
  73. data/test/old/ext/mk_fbin.c +24 -0
  74. data/test/old/ext/sample/extras.txt +4 -0
  75. data/{.gemtest → test/old/ext/sample/extras2.txt} +0 -0
  76. data/test/old/ext/sample/f1.fasta +10 -0
  77. data/test/old/ext/sample/f1.fasta.qual +10 -0
  78. data/test/old/ext/sample/f1.fbin +0 -0
  79. data/test/old/ext/sample/f1.fbin.index +0 -0
  80. data/test/old/ext/sample/main.c +86 -0
  81. data/test/old/ext/usage_instructions.txt +62 -0
  82. data/test/old/t_scbi_fastabin.rb +140 -0
  83. data/test/read_tests/10-original_sizes.sh +16 -0
  84. data/test/read_tests/20-fq_time.sh +23 -0
  85. data/test/read_tests/30-fbin_read_time.sh +23 -0
  86. data/test/read_tests/40-bsc_read_time.sh +21 -0
  87. data/test/read_tests/50-fq_time_x4.sh +25 -0
  88. data/test/read_tests/60-fbin_read_time_x4.sh +24 -0
  89. data/test/read_tests/70-bsc_read_time_x4.sh +32 -0
  90. data/test/results_bio_scbi_fasta.txt +11 -0
  91. data/test/{test_scbi_fbin_file.rb → scbi_fbin_file_test.rb} +0 -0
  92. data/test/speed.txt +81 -0
  93. data/test/t_scbi_fasta.rb +12 -0
  94. data/test/write_tests/10-original_sizes.sh +16 -0
  95. data/test/write_tests/20-zip_time.sh +17 -0
  96. data/test/write_tests/30-mk_fbin_time.sh +23 -0
  97. data/test/write_tests/31-mk_fbin_time_f30.sh +21 -0
  98. data/test/write_tests/40-gzip_time.sh +16 -0
  99. data/test/write_tests/41-bsc_time.sh +16 -0
  100. data/test/write_tests/50-zip_sizes.sh +16 -0
  101. data/test/write_tests/60-fbin_sizes.sh +17 -0
  102. data/test/write_tests/61-fbin_sizes_f30.sh +16 -0
  103. data/test/write_tests/70-gzip_sizes.sh +17 -0
  104. data/test/write_tests/80-bsc_sizes.sh +17 -0
  105. data/website/index.html +87 -0
  106. data/website/index.txt +81 -0
  107. data/website/javascripts/rounded_corners_lite.inc.js +285 -0
  108. data/website/stylesheets/screen.css +159 -0
  109. data/website/template.html.erb +50 -0
  110. metadata +208 -95
  111. data/History.txt +0 -19
  112. data/Manifest.txt +0 -12
  113. data/PostInstall.txt +0 -7
  114. data/script/console +0 -10
  115. data/script/destroy +0 -14
  116. data/script/generate +0 -14
@@ -0,0 +1,194 @@
1
+ #include <stdio.h>
2
+ #include <string.h>
3
+ #include <time.h>
4
+
5
+
6
+ #include <sys/types.h>
7
+ #include <sys/stat.h>
8
+ #include <fcntl.h>
9
+ #include <errno.h>
10
+
11
+ #include <zlib.h>
12
+ #include <stdlib.h>
13
+
14
+ #include <zlib.h>
15
+
16
/* Version / subversion numbers (struct file_data carries matching
 * "version" and "subversion" fields for an opened file). */
#define VERSION 1
#define SUBVERSION 0

/* NOTE(review): presumably the size in bytes of one gz chunk / I/O buffer;
 * confirm against lib_fqbin.c. */
#define CHUNK 262144

// Maximum file name (including .idx)
#define MAXFNAME 512

// Maximum length of the name of a sequence
#define MAXSEQNAME 1024
#define ONEMB 1000000
/* Parenthesized so the macro behaves as a single value inside larger
 * expressions (e.g. x / MAXSEQLENGTH or x % MAXSEQLENGTH). */
#define MAXSEQLENGTH (500 * ONEMB)

/* Negative error codes; parenthesized so they stay one operand when the
 * macro is used inside an expression. */
#define INVALID_FASTQ_FORMAT (-5)
#define INVALID_FASTA_FORMAT (-6)
#define MAX_SEQ_SIZE_ERROR (-7)

#define DEBUG 0
#define FALSE 0
#define TRUE 1

// Maximum size of the metadata of a sequence, including name, length of fasta, qual and extras.
// It should be a maximum of 10000
#define SEQ_METADATA 10000
40
+
41
+
42
+ struct file_data { // Per-file state handed to the write_* / read_* / close_* prototypes declared in this header.
43
+ char name[10000]; // NOTE(review): presumably the path of the main bin file - confirm in lib_fqbin.c
44
+ char index_name[10000]; // NOTE(review): presumably the path of the companion index file - confirm in lib_fqbin.c
45
+ gzFile gzf_bin; // zlib stream handle; going by its name, the main bin file
46
+ // int file_bin;
47
+ gzFile gzf_index; // zlib stream handle; going by its name, the index file
48
+ // int file_index;
49
+ // char file_outname[10000];
50
+ long long pos_chunk_gz; // Offset of the beginning of the current gz chunk inside the file (as described in the write_seq notes of this header)
51
+ // Contains the version and subversion of this file
52
+ int version;
53
+ int subversion;
54
+ // bin_search is true when a binary search can be used.
55
+ int bin_search;
56
+ // Counts the number of sequences written to the bin file, so it can
57
+ // decide where to create a new gz chunk
58
+ long long counter;
59
+ // If there is an error it is stored here so it can be retrieved.
60
+ int error;
61
+
62
+ int discretize_qual; // Same name as the initialize_writes() parameter; NOTE(review): presumably the quality discretization group size (0 = off) - confirm
63
+ int flatten_qual; // Same name as the initialize_writes() parameter; NOTE(review): presumably the quality value above which qualities are flattened - confirm
64
+ int create_index; // Same name as the initialize_writes() parameter; NOTE(review): presumably non-zero when a random access index is written - confirm
65
+ };
66
+
67
+ // two modes:
68
+ // 1 .- new files
69
+ // 2 .- add data to files, if they don't exist they are created
70
+ int initialize_writes(struct file_data ** file, char *output_name, int mode, int discretize_qual, int flatten_qual, int create_index);
71
+
72
+ /*
73
+
74
+ write_seq writes a sequence to the files f_bin and its index to f_index
75
+ pos_chunk_gz is the offset of the beginning of the current gz chunk inside the file
76
+ seq_name is a pointer to the name of the sequence
77
+ fasta, quanta and extras are pointers to strings, must be zero terminated.
78
+ Returns 0 if all goes fine.
79
+
80
+ */
81
+
82
+ void inspect_file_data_struct(struct file_data *file);
83
+
84
+ // int write_seq(gzFile *f_bin, FILE *f_index, long pos_chunk_gz, char *seq_name, char *fasta, char *quanta, char *extras);
85
+ int write_seq(struct file_data *file, char *seq_name, char *fasta, char *qual, char *extras);
86
+
87
+ int close_writes(struct file_data *file);
88
+
89
+ /*
90
+ read_seq reads from filename the sequence named seq_name and returns its
91
+ fasta, quanta and extras in those variables.
92
+ It returns 0 if there are no errors, otherwise it returns:
93
+ -2 : error opening index file (it doesn't exist)
94
+ -3 : error reading index file
95
+ -4 : error sequence not found in index file
96
+ -5 : error opening file (it doesn't exist)
97
+ -6 : error reading file
98
+ -7 : error sequence not found
99
+ -8 : error uncompressing sequence
100
+ -9 : EOF
101
+ */
102
+ int read_seq(char *filename, char *seq_name, char **fasta, char **quanta, char **extras);
103
+
104
+ // For doing sequential reads of the whole file:
105
+ // int initialize_sequential_reads(struct file_data *filed, char *filename);
106
+ int initialize_sequential_reads(struct file_data ** filed, char *filename);
107
+
108
+ // return -9 on EOF
109
+ int read_data_sequential(struct file_data *filed, char **seq_name, char **fasta, char **qual, char **extras);
110
+ int close_sequential_reads(struct file_data *filed);
111
+
112
+
113
+ /* process_biofile reads from fname (and fname.quanta) and writes to outname
114
+ (and outname.index) with the binary format
115
+ Returns 0 if all goes fine */
116
+ // int process_biofile(char *fname,char *qfname, char *efname, char *outname);
117
+
118
+
119
+ int open_file(char *fname, FILE **file);
120
+ int close_file(FILE *file);
121
+ int chomp(char *str);
122
+ int split_name(char *fname, char *name, char *comments);
123
+ int get_next_seq_fastq(FILE *file, char **name, char **fasta, char **qual, char **comments);
124
+ int get_next_seq_fasta(FILE *file, char *name, char *fasta, char *comments);
125
+ int process_fastq(char *fname, char *efname, char *outname, int discretize_qual, int flatten_qual, int create_index);
126
+ int process_fasta(char *fname, char *efname, char *outname, int discretize_qual, int flatten_qual, int create_index);
127
+
128
+
129
+ /*
130
+ Format definition
131
+
132
+ Main file that contains chunks compressed in gz
133
+ For each sequence the information of that sequence is written with the format:
134
+ 28F143CJN01EBIJN 105 312 0
135
+
136
+ That is:
137
+ 4 chars for the size of this header, excluding itself, that is, it is the size of
138
+ the rest of the header
139
+ sequence name
140
+ fasta size
141
+ qual size
142
+ extras size
143
+
144
+ The First sequence can be a special sequence with metainfo for this file:
145
+ 30UMACOMPRESSEDFORMAT_version 0 0 0
146
+ 27UMACOMPRESSEDFORMAT_1 0 0 0
147
+
148
+
149
+
150
+ Index file
151
+
152
+ Compressed using chunks
153
+
154
+ At the beginning a special sequence can be used to store metadata
155
+ like the number of fields, if a binary search can be used, etc.
156
+
157
+ That sequence will be:
158
+ UMACOMPRESSEDFORMAT version binary_search begin_of_sequential_index 0 0
159
+
160
+ If binary_search is yes then a metaindex follows to do a fast access to the
161
+ index data.
162
+ That will be the first sequence of each chunk and its offset inside the file.
163
+ (Or perhaps it can be put in another file....)
164
+
165
+
166
+ The rest of the index file will be indexes to the stored sequences, with
167
+ the following fields separated by spaces:
168
+
169
+ F143CJN01ETK00 0 471
170
+
171
+ Sequence name
172
+ begin of the compressed chunk
173
+ offset inside the chunk of the header of that sequence.
174
+
175
+ */
176
+
177
+ int check_error(int error_condition,char *message, int return_value);
178
+
179
+ // #ifndef _libfbin
180
+ //
181
+ // int check_error(int error_condition,char *message, int return_value){
182
+ // if (error_condition) {
183
+ // fprintf(stderr,"Error %d; %s\nMSG:%s\n",errno ,message, strerror(errno));
184
+ // return return_value;
185
+ // }
186
+ // }
187
+ //
188
+ // #define _libfbin
189
+ // #endif
190
+
191
+ int free_string(char **string);
192
+
193
+ int regenerate_index(char * filename);
194
+
Binary file
@@ -0,0 +1,138 @@
1
+ #include "lib_fqbin.h"
2
+ #include <stdio.h>
3
+ #include <ctype.h>
4
+
5
+ #include <unistd.h>
6
+
7
+
8
/*
 * Prints the mk_fqbin help text to stdout and then terminates the process
 * with exit(-1). This function never returns to its caller.
 */
void usage(){
    /* Help text, emitted piece by piece in this exact order. */
    static const char *const help_text[] = {
        "mk_fqbin converts a fastQ input file or STDIN stream (use no filename or '-') to compressed fqbin format.\n\n",
        "Usage: mk_fqbin [OPTIONS] -o output_file [fastq_file_input_file]\n\n",
        "Options:\n",
        " -i create random access index\n",
        " -d discretize_qual: quality values are discretized in groups of size discretize_qual. This way less quality values are used and a better compression is obtained\n",
        " -f flatten_qual: quality values over flatten_qual (use phred scale) will be set to flatten_qual value in order to achieve a better compression\n",
        " -e extras_file: a file with extra metadata for each sequence if standard FASTA format\n",
        "Mandatory parameters:\n",
        " -o output_file: output fqbin file\n",
        " -F input is in fasta format (will look for filename.qual for qualities) \n",
        "\nSCBI - Supercomputación y Bioinformática. University of Malaga. http://www.scbi.uma.es. Copyright 2011\n\n"
    };
    const size_t pieces = sizeof help_text / sizeof help_text[0];
    size_t idx;

    for (idx = 0; idx < pieces; idx++) {
        fputs(help_text[idx], stdout);
    }

    exit(-1);
}
31
+
32
+ /*******************************************************/
33
+ /* main */
34
+ /*******************************************************/
35
+ int main(int argc, char *argv[])
36
+ {
37
+
38
+ int res = 0;
39
+
40
+ int ch;
41
+
42
+ int output_fasta = 0;
43
+ int output_qual = 0;
44
+
45
+ int flatten_qual=0;
46
+ int discretize_qual=0;
47
+
48
+ char *extras_file=NULL;
49
+ char *output_file=NULL;
50
+ int create_index=0;
51
+ int input_in_fasta=0;
52
+
53
+ while ((ch = getopt(argc, argv, "o:e:d:f:iFh")) != -1) {
54
+ // printf("opt %d\n",ch);
55
+ switch (ch) {
56
+ case 'e':
57
+ // strcopy(extras_file,optarg);
58
+ extras_file=optarg;
59
+ break;
60
+ case 'f':
61
+ flatten_qual = atoi(optarg)+33;
62
+ break;
63
+ case 'd':
64
+ discretize_qual = atoi(optarg);
65
+
66
+ if(discretize_qual<2)
67
+ {
68
+ discretize_qual=0;
69
+ }
70
+ break;
71
+ case 'o':
72
+ output_file=optarg;
73
+ break;
74
+ case 'i':
75
+ create_index=1;
76
+ break;
77
+ case 'F':
78
+ input_in_fasta=1;
79
+ break;
80
+ case 'h':
81
+ usage();
82
+ break;
83
+ case '?':
84
+ default:
85
+ usage();
86
+ }
87
+ }
88
+
89
+ argc -= optind;
90
+ argv += optind;
91
+ // printf("argc: %d", argc);
92
+ // printf("argv: %s", argv[0]);
93
+
94
+ if(output_file==NULL)
95
+ {
96
+ printf("Output file is a mandatory option. Provide one with -o filename\n");
97
+ usage();
98
+ exit -1;
99
+ }
100
+
101
+ printf("Extra metadata file: %s\nFlattenning qual over:%d (%c char)\nDistretizing qual in groups of:%d\n",extras_file,flatten_qual,flatten_qual,discretize_qual);
102
+
103
+ if (create_index){printf("Creating random access index\n");}
104
+
105
+
106
+ if(input_in_fasta)
107
+ {
108
+
109
+ // check remaining params
110
+ if (argc==1){
111
+ res=process_fasta(argv[0],extras_file,output_file,discretize_qual,flatten_qual,create_index);
112
+ }
113
+ else if(argc==0)
114
+ {
115
+ res=process_fasta("-",extras_file,output_file,discretize_qual,flatten_qual,create_index);
116
+ }
117
+ else{
118
+ usage();
119
+ }
120
+
121
+ }else{
122
+ // check remaining params
123
+ if (argc==1){
124
+ res=process_fastq(argv[0],extras_file,output_file,discretize_qual,flatten_qual,create_index);
125
+ }
126
+ else if(argc==0)
127
+ {
128
+ res=process_fastq("-",extras_file,output_file,discretize_qual,flatten_qual,create_index);
129
+ }
130
+ else{
131
+ usage();
132
+ }
133
+
134
+ }
135
+
136
+ return res;
137
+ }
138
+
@@ -0,0 +1,915 @@
1
+ /***************************************************************************
2
+ * Burrows-Wheeler Transform Library
3
+ *
4
+ * File : bwxform.c
5
+ * Purpose : Provides prototypes for functions that apply and reverse the
6
+ * Burrows-Wheeler transform (with or without move to front
7
+ * coding/decoding). The algorithms implemented are based upon
8
+ * those described in "A Block-sorting Lossless Data Compression
9
+ * Algorithm" by M. Burrows and D.J. Wheeler.
10
+ * Author : Michael Dipperstein
11
+ * Date : August 20, 2004
12
+ *
13
+ ****************************************************************************
14
+ * UPDATES
15
+ *
16
+ * $Id: bwxform.c,v 1.6 2007/09/17 13:21:19 michael Exp $
17
+ * $Log: bwxform.c,v $
18
+ * Revision 1.6 2007/09/17 13:21:19 michael
19
+ * Changes required for LGPL v3.
20
+ *
21
+ * Revision 1.5 2005/11/03 15:01:46 michael
22
+ * Speed up block sorting using the algorithm suggested by the
23
+ * Burrows-Wheeler paper. Radix sort all rotations by the first
24
+ * two characters before employing quicksort.
25
+ *
26
+ * Revision 1.4 2005/05/02 13:33:41 michael
27
+ * Allocate large arrays on heap instead of stack so that gcc builds code
28
+ * that can handle larger blocks.
29
+ *
30
+ * Update e-mail address
31
+ *
32
+ * Revision 1.3 2004/08/27 01:24:16 michael
33
+ * Write S[0] index (I) before transformed block to avoid having to
34
+ * find I in a partial block.
35
+ *
36
+ * Revision 1.2 2004/08/26 06:16:08 michael
37
+ * Handle partial blocks without need to store block size. Use size
38
+ * returned by fread() to indicate smaller than standard block.
39
+ *
40
+ * Revision 1.1.1.1 2004/08/23 04:34:18 michael
41
+ * Burrows-Wheeler Transform
42
+ *
43
+ ****************************************************************************
44
+ *
45
+ * bwxform: An ANSI C Burrows-Wheeler Transform/Reverse Transform Routines
46
+ * Copyright (C) 2004-2005, 2007 by
47
+ * Michael Dipperstein (mdipper@alumni.engr.ucsb.edu)
48
+ *
49
+ * This file is part of the BWT library.
50
+ *
51
+ * The BWT library is free software; you can redistribute it and/or modify
52
+ * it under the terms of the GNU Lesser General Public License as published
53
+ * by the Free Software Foundation; either version 3 of the License, or (at
54
+ * your option) any later version.
55
+ *
56
+ * The BWT library is distributed in the hope that it will be useful, but
57
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
58
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
59
+ * General Public License for more details.
60
+ *
61
+ * You should have received a copy of the GNU Lesser General Public License
62
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
63
+ *
64
+ ***************************************************************************/
65
+
66
+ /***************************************************************************
67
+ * INCLUDED FILES
68
+ ***************************************************************************/
69
+ #include <stdio.h>
70
+ #include <stdlib.h>
71
+ #include <limits.h>
72
+ #include <string.h>
73
+ #include "bwxform.h"
74
+
75
+ /***************************************************************************
76
+ * CONSTANTS
77
+ ***************************************************************************/
78
+ #define BLOCK_SIZE 4096 /* size of blocks */
79
+
80
+ #if BLOCK_SIZE > INT_MAX
81
+ #error BLOCK_SIZE must be <= INT_MAX and maximum size_t
82
+ #endif
83
+
84
+ /* NOTE: Need to find a way to check for maximum size_t */
85
+
86
+ /***************************************************************************
87
+ * TYPE DEFINITIONS
88
+ ***************************************************************************/
89
+ unsigned char block[BLOCK_SIZE]; /* block being (un)transformed */
90
+ size_t blockSize; /* actual size of block */
91
+
92
+ /* counters and offsets used for radix sorting with characters */
93
+ unsigned int counters[256];
94
+ unsigned int offsetTable[256];
95
+
96
+ /***************************************************************************
97
+ * MACROS
98
+ ***************************************************************************/
99
+ /* wraps array index within array bounds (assumes value < 2 * limit) */
100
+ #define Wrap(value, limit) (((value) < (limit)) ? (value) : ((value) - (limit)))
101
+
102
+ /***************************************************************************
103
+ * PROTOTYPES
104
+ ***************************************************************************/
105
+ /* move to front functions */
106
+
107
+ void DoMTF(unsigned char *last, int length);
108
+ void UndoMTF(unsigned char *last, int length);
109
+
110
+ /***************************************************************************
111
+ * FUNCTIONS
112
+ ***************************************************************************/
113
+
114
+ /***************************************************************************
115
+ * Function : ComparePresorted
116
+ * Description: This comparison function is designed for use with qsort
117
+ * and "block", a global array of "blockSize" unsigned chars.
118
+ * It compares two strings in "block" starting at indices
119
+ * s1 and s2 and ending at indices s1 - 1 and s2 - 1.
120
+ * The strings are assumed to be presorted so that first two
121
+ * characters are known to be matching.
122
+ * Parameters : s1 - The starting index of a string in block
123
+ * s2 - The starting index of a string in block
124
+ * Effects : NONE
125
+ * Returned : > 0 if string s1 > string s2
126
+ * 0 if string s1 == string s2
127
+ * < 0 if string s1 < string s2
128
+ ***************************************************************************/
129
+ int ComparePresorted(const void *s1, const void *s2)
130
+ {
131
+ int offset1, offset2;
132
+ int i;
133
+ int result;
134
+
135
+ offset1 = *((int *)s1);
136
+ offset2 = *((int *)s2);
137
+
138
+ /***********************************************************************
139
+ * Compare 1 character at a time until there's difference or the end of
140
+ * the block is reached. Since we're only sorting strings that already
141
+ * match at the first two characters, start with the third character.
142
+ ***********************************************************************/
143
+ for(i = 2; i < blockSize; i++)
144
+ {
145
+ result = (int)block[Wrap((offset1 + i), blockSize)] -
146
+ (int)block[Wrap((offset2 + i), blockSize)];
147
+
148
+ if (result != 0)
149
+ {
150
+ return result;
151
+ }
152
+ }
153
+
154
+ /* strings are identical */
155
+ return 0;
156
+ }
157
+
158
+ /***************************************************************************
159
+ * Function : BWXformFile
160
+ * Description: This function performs a Burrows-Wheeler transformation
161
+ * on a file (with optional move to front) and writes the
162
+ * resulting data to the specified output file. Comments in
163
+ * this function indicate corresponding variables, labels,
164
+ * and sections in "A Block-sorting Lossless Data Compression
165
+ * Algorithm" by M. Burrows and D.J. Wheeler.
166
+ * Parameters : inFile - Name of file to transform
167
+ * outFile - Name of file to write transformed output to
168
+ * mtf - Set to TRUE if move to front coding should be
169
+ * applied.
170
+ * Effects : A Burrows-Wheeler transformation (and possibly move to
171
+ * front encoding) is applied to inFile. The results of
172
+ * the transformation are written to outFile.
173
+ * Returned : TRUE for success, otherwise FALSE.
174
+ ***************************************************************************/
175
+ int BWXformFile(char *inFile, char *outFile, char mtf)
176
+ {
177
+ int i, j, k;
178
+ FILE *fpIn, *fpOut;
179
+ unsigned int *rotationIdx; /* index of first char in rotation */
180
+ unsigned int *v; /* index of radix sorted charaters */
181
+ int s0Idx; /* index of S0 in rotations (I) */
182
+ unsigned char *last; /* last characters from sorted rotations */
183
+
184
+ /***********************************************************************
185
+ * BLOCK_SIZE arrays are allocated on the heap, because gcc generates
186
+ * code that throws a Segmentation fault when the large arrays are
187
+ * allocated on the stack.
188
+ ***********************************************************************/
189
+ rotationIdx = (unsigned int *)malloc(BLOCK_SIZE * sizeof(unsigned int));
190
+
191
+ if (NULL == rotationIdx)
192
+ {
193
+ perror("Allocating array of rotation indices");
194
+ return FALSE;
195
+ }
196
+
197
+ v = (unsigned int *)malloc(BLOCK_SIZE * sizeof(unsigned int));
198
+
199
+ if (v == rotationIdx)
200
+ {
201
+ perror("Allocating array of sort indices");
202
+ free(rotationIdx);
203
+ return FALSE;
204
+ }
205
+
206
+ last = (unsigned char *)malloc(BLOCK_SIZE * sizeof(unsigned char));
207
+
208
+ if (NULL == last)
209
+ {
210
+ perror("Allocating array of last characters");
211
+ free(rotationIdx);
212
+ free(v);
213
+ return FALSE;
214
+ }
215
+
216
+ /* open input and output files */
217
+ if ((fpIn = fopen(inFile, "rb")) == NULL)
218
+ {
219
+ perror(inFile);
220
+ return FALSE;
221
+ }
222
+
223
+ if (outFile == NULL)
224
+ {
225
+ fpOut = stdout;
226
+ }
227
+ else
228
+ {
229
+ if ((fpOut = fopen(outFile, "wb")) == NULL)
230
+ {
231
+ fclose(fpIn);
232
+ perror(outFile);
233
+ return FALSE;
234
+ }
235
+ }
236
+
237
+ while((blockSize = fread(block, sizeof(unsigned char), BLOCK_SIZE, fpIn))
238
+ != 0)
239
+ {
240
+ /*******************************************************************
241
+ * Sort the rotated strings in the block. A radix sort is performed
242
+ * on the first to characters of all the rotated strings (2nd
243
+ * character then 1st). All rotated strings with matching initial
244
+ * characters are then quicksorted. - Q4..Q7
245
+ *******************************************************************/
246
+
247
+ /*** radix sort on second character in rotation ***/
248
+
249
+ /* count number of characters for radix sort */
250
+ memset(counters, 0, 256 * sizeof(int));
251
+ for (i = 0; i < blockSize; i++)
252
+ {
253
+ counters[block[i]]++;
254
+ }
255
+
256
+ offsetTable[0] = 0;
257
+
258
+ for(i = 1; i < 256; i++)
259
+ {
260
+ /* determine number of values before those sorted under i */
261
+ offsetTable[i] = offsetTable[i - 1] + counters[i - 1];
262
+ }
263
+
264
+ /* sort on 2nd character */
265
+ for (i = 0; i < blockSize - 1; i++)
266
+ {
267
+ j = block[i + 1];
268
+ v[offsetTable[j]] = i;
269
+ offsetTable[j] = offsetTable[j] + 1;
270
+ }
271
+
272
+ /* handle wrap around for string starting at end of block */
273
+ j = block[0];
274
+ v[offsetTable[j]] = i;
275
+ offsetTable[0] = 0;
276
+
277
+ /*** radix sort on first character in rotation ***/
278
+
279
+ for(i = 1; i < 256; i++)
280
+ {
281
+ /* determine number of values before those sorted under i */
282
+ offsetTable[i] = offsetTable[i - 1] + counters[i - 1];
283
+ }
284
+
285
+ for (i = 0; i < blockSize; i++)
286
+ {
287
+ j = v[i];
288
+ j = block[j];
289
+ rotationIdx[offsetTable[j]] = v[i];
290
+ offsetTable[j] = offsetTable[j] + 1;
291
+ }
292
+
293
+ /*******************************************************************
294
+ * now rotationIdx contains the sort order of all strings sorted
295
+ * by their first 2 characters. Use qsort to sort the strings
296
+ * that have their first two characters matching.
297
+ *******************************************************************/
298
+ for (i = 0, k = 0; (i <= UCHAR_MAX) && (k < (blockSize - 1)); i++)
299
+ {
300
+ for (j = 0; (j <= UCHAR_MAX) && (k < (blockSize - 1)); j++)
301
+ {
302
+ int first = k;
303
+
304
+ /* count strings starting with ij */
305
+ while ((i == block[rotationIdx[k]]) &&
306
+ (j == block[Wrap(rotationIdx[k] + 1, blockSize)]))
307
+ {
308
+ k++;
309
+
310
+ if (k == blockSize)
311
+ {
312
+ /* we've searched the whole block */
313
+ break;
314
+ }
315
+ }
316
+
317
+ if (k - first > 1)
318
+ {
319
+ /* there are at least 2 strings staring with ij, sort them */
320
+ qsort(&rotationIdx[first], k - first, sizeof(int),
321
+ ComparePresorted);
322
+ }
323
+ }
324
+ }
325
+
326
+ /* find last characters of rotations (L) - C2 */
327
+ s0Idx = 0;
328
+ for (i = 0; i < blockSize; i++)
329
+ {
330
+ if (rotationIdx[i] != 0)
331
+ {
332
+ last[i] = block[rotationIdx[i] - 1];
333
+ }
334
+ else
335
+ {
336
+ /* unrotated string 1st character is end of string */
337
+ s0Idx = i;
338
+ last[i] = block[blockSize - 1];
339
+ }
340
+ }
341
+
342
+ if (mtf)
343
+ {
344
+ DoMTF(last, blockSize);
345
+ }
346
+
347
+ /* write index of end of unrotated string (I) */
348
+ fwrite(&s0Idx, sizeof(int), 1, fpOut);
349
+
350
+ /* write out last characters of rotations (L) */
351
+ fwrite(last, sizeof(unsigned char), blockSize, fpOut);
352
+ }
353
+
354
+ /* clean up */
355
+ free(rotationIdx);
356
+ free(v);
357
+ free(last);
358
+ fclose(fpIn);
359
+ fclose(fpOut);
360
+ return TRUE;
361
+ }
362
+
363
+
364
+
365
+ int BWXform(char *inString, char *outString, int mtf)
366
+ {
367
+ int i, j, k;
368
+ FILE *fpIn, *fpOut;
369
+ unsigned int *rotationIdx; /* index of first char in rotation */
370
+ unsigned int *v; /* index of radix sorted charaters */
371
+ int s0Idx; /* index of S0 in rotations (I) */
372
+ unsigned char *last; /* last characters from sorted rotations */
373
+
374
+ /***********************************************************************
375
+ * BLOCK_SIZE arrays are allocated on the heap, because gcc generates
376
+ * code that throws a Segmentation fault when the large arrays are
377
+ * allocated on the stack.
378
+ ***********************************************************************/
379
+ rotationIdx = (unsigned int *)malloc(BLOCK_SIZE * sizeof(unsigned int));
380
+
381
+ if (NULL == rotationIdx)
382
+ {
383
+ perror("Allocating array of rotation indices");
384
+ return FALSE;
385
+ }
386
+
387
+ v = (unsigned int *)malloc(BLOCK_SIZE * sizeof(unsigned int));
388
+
389
+ if (v == rotationIdx)
390
+ {
391
+ perror("Allocating array of sort indices");
392
+ free(rotationIdx);
393
+ return FALSE;
394
+ }
395
+
396
+ last = (unsigned char *)malloc(BLOCK_SIZE * sizeof(unsigned char));
397
+
398
+ if (NULL == last)
399
+ {
400
+ perror("Allocating array of last characters");
401
+ free(rotationIdx);
402
+ free(v);
403
+ return FALSE;
404
+ }
405
+
406
+ strcpy(block,inString);
407
+ blockSize=strlen(inString);
408
+ block[blockSize]='\0';
409
+ // printf("block:%s, SIZE: %ld\n",block,blockSize);
410
+
411
+ // while((blockSize = fread(block, sizeof(unsigned char), BLOCK_SIZE, fpIn))
412
+ // != 0)
413
+ // {
414
+ /*******************************************************************
415
+ * Sort the rotated strings in the block. A radix sort is performed
416
+ * on the first to characters of all the rotated strings (2nd
417
+ * character then 1st). All rotated strings with matching initial
418
+ * characters are then quicksorted. - Q4..Q7
419
+ *******************************************************************/
420
+
421
+ /*** radix sort on second character in rotation ***/
422
+
423
+ /* count number of characters for radix sort */
424
+ memset(counters, 0, 256 * sizeof(int));
425
+ for (i = 0; i < blockSize; i++)
426
+ {
427
+ counters[block[i]]++;
428
+ }
429
+
430
+ offsetTable[0] = 0;
431
+
432
+ for(i = 1; i < 256; i++)
433
+ {
434
+ /* determine number of values before those sorted under i */
435
+ offsetTable[i] = offsetTable[i - 1] + counters[i - 1];
436
+ }
437
+
438
+ /* sort on 2nd character */
439
+ for (i = 0; i < blockSize - 1; i++)
440
+ {
441
+ j = block[i + 1];
442
+ v[offsetTable[j]] = i;
443
+ offsetTable[j] = offsetTable[j] + 1;
444
+ }
445
+
446
+ /* handle wrap around for string starting at end of block */
447
+ j = block[0];
448
+ v[offsetTable[j]] = i;
449
+ offsetTable[0] = 0;
450
+
451
+ /*** radix sort on first character in rotation ***/
452
+
453
+ for(i = 1; i < 256; i++)
454
+ {
455
+ /* determine number of values before those sorted under i */
456
+ offsetTable[i] = offsetTable[i - 1] + counters[i - 1];
457
+ }
458
+
459
+ for (i = 0; i < blockSize; i++)
460
+ {
461
+ j = v[i];
462
+ j = block[j];
463
+ rotationIdx[offsetTable[j]] = v[i];
464
+ offsetTable[j] = offsetTable[j] + 1;
465
+ }
466
+
467
+ /*******************************************************************
468
+ * now rotationIdx contains the sort order of all strings sorted
469
+ * by their first 2 characters. Use qsort to sort the strings
470
+ * that have their first two characters matching.
471
+ *******************************************************************/
472
+ for (i = 0, k = 0; (i <= UCHAR_MAX) && (k < (blockSize - 1)); i++)
473
+ {
474
+ for (j = 0; (j <= UCHAR_MAX) && (k < (blockSize - 1)); j++)
475
+ {
476
+ int first = k;
477
+
478
+ /* count strings starting with ij */
479
+ while ((i == block[rotationIdx[k]]) &&
480
+ (j == block[Wrap(rotationIdx[k] + 1, blockSize)]))
481
+ {
482
+ k++;
483
+
484
+ if (k == blockSize)
485
+ {
486
+ /* we've searched the whole block */
487
+ break;
488
+ }
489
+ }
490
+
491
+ if (k - first > 1)
492
+ {
493
+ /* there are at least 2 strings staring with ij, sort them */
494
+ qsort(&rotationIdx[first], k - first, sizeof(int),
495
+ ComparePresorted);
496
+ }
497
+ }
498
+ }
499
+
500
+ /* find last characters of rotations (L) - C2 */
501
+ s0Idx = 0;
502
+ for (i = 0; i < blockSize; i++)
503
+ {
504
+ if (rotationIdx[i] != 0)
505
+ {
506
+ last[i] = block[rotationIdx[i] - 1];
507
+ }
508
+ else
509
+ {
510
+ /* unrotated string 1st character is end of string */
511
+ s0Idx = i;
512
+ last[i] = block[blockSize - 1];
513
+ }
514
+ }
515
+ // printf("ANTES:\n");
516
+ // for (i = 0; i < blockSize; i++)
517
+ // {
518
+ // printf("%2d,",last[i]);
519
+ // }
520
+ // printf("\nDESPUES:\n");
521
+
522
+ if (mtf)
523
+ {
524
+ DoMTF(last, blockSize);
525
+ }
526
+
527
+ // for (i = 0; i < blockSize; i++)
528
+ // {
529
+ // printf("%2d,",last[i]);
530
+ // }
531
+ // printf("\n");
532
+
533
+ // /* write index of end of unrotated string (I) */
534
+ // fwrite(&s0Idx, sizeof(int), 1, fpOut);
535
+ //
536
+ // /* write out last characters of rotations (L) */
537
+ // fwrite(last, sizeof(unsigned char), blockSize, fpOut);
538
+ // } //FIN WHILE
539
+
540
+
541
+ memcpy((char *)outString, (void *)last, sizeof(unsigned char) * blockSize);
542
+ // strncpy(outString,last,blockSize+1);
543
+ // blockSize=strlen(last);
544
+ // printf("block2:%s, SIZE: %ld\n",last,strlen(last));
545
+
546
+
547
+ /* clean up */
548
+ free(rotationIdx);
549
+ free(v);
550
+ free(last);
551
+ // fclose(fpIn);
552
+ // fclose(fpOut);
553
+ return TRUE;
554
+ }
555
+
556
/***************************************************************************
*   Function   : DoMTF
*   Description: This function performs move-to-front encoding on a block
*                of data that has already had the Burrows-Wheeler
*                transformation applied to it.  Comments in this function
*                indicate corresponding variables, labels, and sections in
*                "A Block-sorting Lossless Data Compression Algorithm" by
*                M. Burrows and D.J. Wheeler.
*   Parameters : last - pointer to an array of "last" characters from
*                       Burrows-Wheeler rotations (L)
*                length - the number of unsigned chars contained in last.
*   Effects    : Move-to-front encoding is applied to the array of last
*                characters.  Each byte of last is overwritten with its
*                encoded value.  The encoding is done in place, so no
*                scratch buffer is allocated: the function cannot fail on
*                memory exhaustion and works for any length.  A NULL
*                pointer or a non-positive length leaves the data
*                untouched.
*   Returned   : NONE
***************************************************************************/
void DoMTF(unsigned char *last, int length)
{
    unsigned char list[UCHAR_MAX + 1];      /* list of characters (Y) */
    int i, j;

    if ((NULL == last) || (length <= 0))
    {
        /* nothing to encode */
        return;
    }

    /* start with alphabetically sorted list of characters */
    for (i = 0; i <= UCHAR_MAX; i++)
    {
        list[i] = (unsigned char)i;
    }

    /* move-to-front coding - M1 */
    for (i = 0; i < length; i++)
    {
        const unsigned char symbol = last[i];

        /*******************************************************************
        * Find the character in the list of characters.  A sequential
        * search is used because move to front keeps common characters
        * near the front of the list.  The list always holds every byte
        * value exactly once, so the search is guaranteed to stop.
        *******************************************************************/
        j = 0;
        while (list[j] != symbol)
        {
            j++;
        }

        /* the position of the character is its encoded value (R) */
        last[i] = (unsigned char)j;

        /* now move the current character to the front of the list */
        for (; j > 0; j--)
        {
            list[j] = list[j - 1];
        }
        list[0] = symbol;
    }
}
629
+
630
+ /***************************************************************************
631
+ * Function : BWReverseXformFile
632
+ * Description: This function reverses a Burrows-Wheeler transformation
633
+ * on a file (with optional move to front) and writes the
634
+ * resulting data to the specified output file. Comments in
635
+ * this function indicate corresponding variables, labels,
636
+ * and sections in "A Block-sorting Lossless Data Compression
637
+ * Algorithm" by M. Burrows and D.J. Wheeler.
638
+ * Parameters : inFile - Name of file to reverse transform
639
+ * outFile - Name of file to write reverse transformed
640
+ * output to
641
+ * mtf - Set to TRUE if move to front decoding should be
642
+ * applied
643
+ * Effects : A Burrows-Wheeler reverse transformation (and possibly
644
+ * move to front encoding) is applied to inFile. The results
645
+ * of the reverse transformation are written to outFile.
646
+ * Returned : TRUE for success, otherwise FALSE.
647
+ ***************************************************************************/
648
+ int BWReverseXformFile(char *inFile, char *outFile, char mtf)
649
+ {
650
+ FILE *fpIn, *fpOut;
651
+ int i, j, sum;
652
+ int count[UCHAR_MAX + 1]; /* count[i] = # of chars in block <= i */
653
+ int *pred; /* pred[i] = # of times block[i] appears in
654
+ block[0 .. i - 1] */
655
+ unsigned char *unrotated; /* original block */
656
+ int s0Idx; /* index of S0 in rotations (I) */
657
+
658
+ /***********************************************************************
659
+ * BLOCK_SIZE arrays are allocated on the heap, because gcc generates
660
+ * code that throws a Segmentation fault when the large arrays are
661
+ * allocated on the stack.
662
+ ***********************************************************************/
663
+ pred = (int *)malloc(BLOCK_SIZE * sizeof(int));
664
+
665
+ if (NULL == pred)
666
+ {
667
+ perror("Allocating array of matching predicessors");
668
+ return FALSE;
669
+ }
670
+
671
+ unrotated = (unsigned char *)malloc(BLOCK_SIZE * sizeof(unsigned char));
672
+
673
+ if (NULL == unrotated)
674
+ {
675
+ perror("Allocating array to store unrotated block");
676
+ free(pred);
677
+ return FALSE;
678
+ }
679
+
680
+ /* open input and output files */
681
+ if ((fpIn = fopen(inFile, "rb")) == NULL)
682
+ {
683
+ perror(inFile);
684
+ return FALSE;
685
+ }
686
+
687
+ if (outFile == NULL)
688
+ {
689
+ fpOut = stdout;
690
+ }
691
+ else
692
+ {
693
+ if ((fpOut = fopen(outFile, "wb")) == NULL)
694
+ {
695
+ fclose(fpIn);
696
+ perror(outFile);
697
+ return FALSE;
698
+ }
699
+ }
700
+
701
+ while(fread(&s0Idx, sizeof(int), 1, fpIn) != 0)
702
+ {
703
+ blockSize = fread(block, sizeof(unsigned char), BLOCK_SIZE, fpIn);
704
+
705
+ if(mtf)
706
+ {
707
+ UndoMTF(block, blockSize);
708
+ }
709
+
710
+ /* code based on pseudo code from section 4.2 (D1 and D2) follows */
711
+ for(i = 0; i <= UCHAR_MAX; i++)
712
+ {
713
+ count[i] = 0;
714
+ }
715
+
716
+ /*******************************************************************
717
+ * Set pred[i] to the number of times block[i] appears in the
718
+ * substring block[0 .. i - 1]. As a useful side effect count[i]
719
+ * will be the number of times character i appears in block.
720
+ *******************************************************************/
721
+ for (i = 0; i < blockSize; i++)
722
+ {
723
+ pred[i] = count[block[i]];
724
+ count[block[i]]++;
725
+ }
726
+
727
+ /*******************************************************************
728
+ * Finally, set count[i] to the number of characters in block
729
+ * lexicographically less than i.
730
+ *******************************************************************/
731
+ sum = 0;
732
+ for(i = 0; i <= UCHAR_MAX; i++)
733
+ {
734
+ j = count[i];
735
+ count[i] = sum;
736
+ sum += j;
737
+ }
738
+
739
+ /* construct the initial unrotated string (S[0]) */
740
+ i = s0Idx;
741
+ for(j = blockSize - 1; j >= 0; j--)
742
+ {
743
+ unrotated[j] = block[i];
744
+ i = pred[i] + count[block[i]];
745
+ }
746
+
747
+ fwrite(unrotated, sizeof(unsigned char), blockSize, fpOut);
748
+ }
749
+
750
+ /* clean up */
751
+ free(pred);
752
+ free(unrotated);
753
+ fclose(fpIn);
754
+ fclose(fpOut);
755
+ return TRUE;
756
+ }
757
+
758
+
759
+ int BWReverseXform(char *inString, char *outString, int mtf, long size)
760
+ {
761
+ // FILE *fpIn, *fpOut;
762
+ int i, j, sum;
763
+ int count[UCHAR_MAX + 1]; /* count[i] = # of chars in block <= i */
764
+ int *pred; /* pred[i] = # of times block[i] appears in
765
+ block[0 .. i - 1] */
766
+ unsigned char *unrotated; /* original block */
767
+ int s0Idx; /* index of S0 in rotations (I) */
768
+
769
+ /***********************************************************************
770
+ * BLOCK_SIZE arrays are allocated on the heap, because gcc generates
771
+ * code that throws a Segmentation fault when the large arrays are
772
+ * allocated on the stack.
773
+ ***********************************************************************/
774
+ pred = (int *)malloc(BLOCK_SIZE * sizeof(int));
775
+
776
+ if (NULL == pred)
777
+ {
778
+ perror("Allocating array of matching predicessors");
779
+ return FALSE;
780
+ }
781
+
782
+ unrotated = (unsigned char *)malloc(BLOCK_SIZE * sizeof(unsigned char));
783
+
784
+ if (NULL == unrotated)
785
+ {
786
+ perror("Allocating array to store unrotated block");
787
+ free(pred);
788
+ return FALSE;
789
+ }
790
+
791
+ // while(fread(&s0Idx, sizeof(int), 1, fpIn) != 0)
792
+ // {
793
+ // blockSize = fread(block, sizeof(unsigned char), BLOCK_SIZE, fpIn);
794
+
795
+ blockSize=size;
796
+ strncpy(block,inString,blockSize);
797
+
798
+ if(mtf)
799
+ {
800
+ UndoMTF(block, blockSize);
801
+ }
802
+
803
+ /* code based on pseudo code from section 4.2 (D1 and D2) follows */
804
+ for(i = 0; i <= UCHAR_MAX; i++)
805
+ {
806
+ count[i] = 0;
807
+ }
808
+
809
+ /*******************************************************************
810
+ * Set pred[i] to the number of times block[i] appears in the
811
+ * substring block[0 .. i - 1]. As a useful side effect count[i]
812
+ * will be the number of times character i appears in block.
813
+ *******************************************************************/
814
+ for (i = 0; i < blockSize; i++)
815
+ {
816
+ pred[i] = count[block[i]];
817
+ count[block[i]]++;
818
+ }
819
+
820
+ /*******************************************************************
821
+ * Finally, set count[i] to the number of characters in block
822
+ * lexicographically less than i.
823
+ *******************************************************************/
824
+ sum = 0;
825
+ for(i = 0; i <= UCHAR_MAX; i++)
826
+ {
827
+ j = count[i];
828
+ count[i] = sum;
829
+ sum += j;
830
+ }
831
+
832
+ /* construct the initial unrotated string (S[0]) */
833
+ i = s0Idx;
834
+ for(j = blockSize - 1; j >= 0; j--)
835
+ {
836
+ unrotated[j] = block[i];
837
+ i = pred[i] + count[block[i]];
838
+ }
839
+
840
+ // fwrite(unrotated, sizeof(unsigned char), blockSize, fpOut);
841
+ // }
842
+
843
+ strncpy(outString,unrotated,blockSize);
844
+
845
+ /* clean up */
846
+ free(pred);
847
+ free(unrotated);
848
+ // fclose(fpIn);
849
+ // fclose(fpOut);
850
+ //
851
+ return TRUE;
852
+ }
853
+
854
/***************************************************************************
*   Function   : UndoMTF
*   Description: This function reverses move-to-front encoding on a block
*                of data that has already had the Burrows-Wheeler
*                transformation applied to it.  Comments in this function
*                indicate corresponding variables, labels, and sections in
*                "A Block-sorting Lossless Data Compression Algorithm" by
*                M. Burrows and D.J. Wheeler.
*   Parameters : last - pointer to an array of mtf encoded characters from
*                       Burrows-Wheeler rotations.
*                length - the number of unsigned chars contained in last.
*   Effects    : Move-to-front encoding is reversed on the array.  Each
*                byte of last is overwritten with its decoded value,
*                giving the array of last characters of sorted rotations
*                (L).  The decoding is done in place, so no scratch buffer
*                is allocated: the function cannot fail on memory
*                exhaustion and works for any length.  A NULL pointer or a
*                non-positive length leaves the data untouched.
*   Returned   : NONE
***************************************************************************/
void UndoMTF(unsigned char *last, int length)
{
    unsigned char list[UCHAR_MAX + 1];      /* list of characters (Y) */
    int i, j;

    if ((NULL == last) || (length <= 0))
    {
        /* nothing to decode */
        return;
    }

    /* start with alphabetically sorted list of characters */
    for (i = 0; i <= UCHAR_MAX; i++)
    {
        list[i] = (unsigned char)i;
    }

    /* move-to-front decoding - W2 */
    for (i = 0; i < length; i++)
    {
        /* the encoded byte is the position of the character in the list */
        const int position = last[i];
        const unsigned char symbol = list[position];

        /* decode the character */
        last[i] = symbol;

        /* now move the current character to the front of the list */
        for (j = position; j > 0; j--)
        {
            list[j] = list[j - 1];
        }
        list[0] = symbol;
    }
}