RubyGems - scbi_fqbin - Versions diffs - 0.2.2 → 0.2.3 - Mend

scbi_fqbin 0.2.2 → 0.2.3

Files changed (116) hide show

checksums.yaml +7 -0
data/.DS_Store +0 -0
data/.gitignore +14 -0
data/Gemfile +4 -0
data/LICENSE.txt +22 -0
data/{README.rdoc → README.md} +0 -0
data/Rakefile +8 -28
data/lib/scbi_fqbin.rb +3 -5
data/lib/scbi_fqbin/fastabin.rb +411 -0
data/lib/scbi_fqbin/fastq_file_c.rb +373 -0
data/lib/scbi_fqbin/fbin_file.rb +1 -1
data/lib/scbi_fqbin/t.rb +9 -0
data/lib/scbi_fqbin/t2.rb +12 -0
data/lib/scbi_fqbin/version.rb +3 -0
data/lib_fqbin_src.zip +0 -0
data/lib_fqbin_src/Makefile +66 -0
data/lib_fqbin_src/fq +0 -0
data/lib_fqbin_src/fq.c +165 -0
data/lib_fqbin_src/hash_fqbin +0 -0
data/lib_fqbin_src/hash_fqbin.c +212 -0
data/lib_fqbin_src/idx_fqbin +21 -0
data/lib_fqbin_src/iterate_fqbin +0 -0
data/lib_fqbin_src/iterate_fqbin.c +136 -0
data/lib_fqbin_src/lib_fqbin.c +1748 -0
data/lib_fqbin_src/lib_fqbin.h +194 -0
data/lib_fqbin_src/mk_fqbin +0 -0
data/lib_fqbin_src/mk_fqbin.c +138 -0
data/lib_fqbin_src/other/bwxform.c +915 -0
data/lib_fqbin_src/other/bwxform.h +74 -0
data/lib_fqbin_src/other/find_in_index.c +130 -0
data/lib_fqbin_src/other/hash_fbin_nogzchunks.c +164 -0
data/lib_fqbin_src/other/idx_fqbin +0 -0
data/lib_fqbin_src/other/idx_fqbin.c +67 -0
data/lib_fqbin_src/other/make_hsh.sh +14 -0
data/lib_fqbin_src/other/rd_extras_fbin.c +45 -0
data/lib_fqbin_src/read_fq +0 -0
data/lib_fqbin_src/read_fq.c +143 -0
data/lib_fqbin_src/read_fqbin +0 -0
data/lib_fqbin_src/read_fqbin.c +101 -0
data/lib_fqbin_src/sort_index +9 -0
data/lib_fqbin_src/test.rb +13 -0
data/scbi_fqbin.gemspec +25 -0
data/test/build.rake +15 -0
data/test/fbinfile +0 -0
data/test/fbinfile.index +0 -0
data/test/no_test_fill_file.rb +66 -0
data/test/old/app.rb +43 -0
data/test/old/bin/iterate_fastabin.rb +54 -0
data/test/old/bin/mk_fastabin.rb +22 -0
data/test/old/bin/rd_fastabin.rb +36 -0
data/test/old/bin/rd_fq.rb +20 -0
data/test/old/bioruby.rb +27 -0
data/test/old/c/Makefile +34 -0
data/test/old/c/fbin_lib.zip +0 -0
data/test/old/c/iterate_fbin.c +54 -0
data/test/old/c/libreria_gz.c +707 -0
data/test/old/c/libreria_gz.h +127 -0
data/test/old/c/main.c +86 -0
data/test/old/c/mk_fbin.c +24 -0
data/test/old/c/rd_seq_fbin.c +44 -0
data/test/old/c/test_ffi/a.out +0 -0
data/test/old/c/test_ffi/app.c +26 -0
data/test/old/c/test_ffi/app.rb +19 -0
data/test/old/c/test_ffi/liblibreria_gz.dylib +0 -0
data/test/old/c/test_ffi/libmylibrary.dylib +0 -0
data/test/old/c/test_ffi/my_library.rb +23 -0
data/test/old/c/test_ffi/mylibrary.c +22 -0
data/test/old/c/test_ffi/mylibrary.h +6 -0
data/test/old/c/usage_instructions.txt +62 -0
data/test/old/ext/Makefile +187 -0
data/test/old/ext/Makefile.dario +34 -0
data/test/old/ext/extconf.rb +8 -0
data/test/old/ext/mk_fbin.c +24 -0
data/test/old/ext/sample/extras.txt +4 -0
data/{.gemtest → test/old/ext/sample/extras2.txt} +0 -0
data/test/old/ext/sample/f1.fasta +10 -0
data/test/old/ext/sample/f1.fasta.qual +10 -0
data/test/old/ext/sample/f1.fbin +0 -0
data/test/old/ext/sample/f1.fbin.index +0 -0
data/test/old/ext/sample/main.c +86 -0
data/test/old/ext/usage_instructions.txt +62 -0
data/test/old/t_scbi_fastabin.rb +140 -0
data/test/read_tests/10-original_sizes.sh +16 -0
data/test/read_tests/20-fq_time.sh +23 -0
data/test/read_tests/30-fbin_read_time.sh +23 -0
data/test/read_tests/40-bsc_read_time.sh +21 -0
data/test/read_tests/50-fq_time_x4.sh +25 -0
data/test/read_tests/60-fbin_read_time_x4.sh +24 -0
data/test/read_tests/70-bsc_read_time_x4.sh +32 -0
data/test/results_bio_scbi_fasta.txt +11 -0
data/test/{test_scbi_fbin_file.rb → scbi_fbin_file_test.rb} +0 -0
data/test/speed.txt +81 -0
data/test/t_scbi_fasta.rb +12 -0
data/test/write_tests/10-original_sizes.sh +16 -0
data/test/write_tests/20-zip_time.sh +17 -0
data/test/write_tests/30-mk_fbin_time.sh +23 -0
data/test/write_tests/31-mk_fbin_time_f30.sh +21 -0
data/test/write_tests/40-gzip_time.sh +16 -0
data/test/write_tests/41-bsc_time.sh +16 -0
data/test/write_tests/50-zip_sizes.sh +16 -0
data/test/write_tests/60-fbin_sizes.sh +17 -0
data/test/write_tests/61-fbin_sizes_f30.sh +16 -0
data/test/write_tests/70-gzip_sizes.sh +17 -0
data/test/write_tests/80-bsc_sizes.sh +17 -0
data/website/index.html +87 -0
data/website/index.txt +81 -0
data/website/javascripts/rounded_corners_lite.inc.js +285 -0
data/website/stylesheets/screen.css +159 -0
data/website/template.html.erb +50 -0
metadata +208 -95
data/History.txt +0 -19
data/Manifest.txt +0 -12
data/PostInstall.txt +0 -7
data/script/console +0 -10
data/script/destroy +0 -14
data/script/generate +0 -14

data/lib_fqbin_src/lib_fqbin.h ADDED Viewed

@@ -0,0 +1,194 @@
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <zlib.h>
+#include <stdlib.h>
+#include <zlib.h>
+#define VERSION 1
+#define SUBVERSION 0
+#define CHUNK 262144
+// Maximum file name (including .idx)
+#define MAXFNAME 512
+// Maximum length of the name of a sequence
+#define MAXSEQNAME 1024
+#define ONEMB 1000000
+// Upper bound for a sequence length (500 * ONEMB). The replacement list is parenthesized so the product stays a single operand wherever the macro is expanded (e.g. x / MAXSEQLENGTH).
+#define MAXSEQLENGTH (500*ONEMB)
+#define INVALID_FASTQ_FORMAT -5
+#define INVALID_FASTA_FORMAT -6
+#define MAX_SEQ_SIZE_ERROR -7
+#define DEBUG 0
+#define FALSE 0
+#define TRUE 1
+// Maximum size of the metadata of a sequence, including name, length of fasta, qual and extras.
+// It should be a maximum of 10000
+#define SEQ_METADATA 10000
+struct file_data { // Per-file state handed to the write and sequential-read functions declared in this header
+	char name[10000]; // NOTE(review): presumably the path of the binary data file - confirm in lib_fqbin.c
+	char index_name[10000]; // NOTE(review): presumably the path of the companion index file - confirm in lib_fqbin.c
+	gzFile gzf_bin; // zlib handle; judging by the name, for the binary data file
+	// int file_bin;
+	gzFile gzf_index; // zlib handle; judging by the name, for the index file
+	// int file_index;
+	// char file_outname[10000];
+	long long pos_chunk_gz; // NOTE(review): likely the offset of the current gz chunk inside the file - confirm
+	// Contains the version and subversion of this file
+	int version;
+	int subversion;
+	// bin_search is true when a binary search can be used.
+	int bin_search;
+	// Counts the number of sequences written to the bin file, so it can
+	// decide where to create a new gz chunk
+	long long counter;
+	// If there is an error it is stored here so it can be retrieved.
+	int error;
+    int discretize_qual; // NOTE(review): presumably the initialize_writes() argument of the same name - confirm
+    int flatten_qual; // NOTE(review): presumably the initialize_writes() argument of the same name - confirm
+    int create_index; // NOTE(review): presumably the initialize_writes() argument of the same name - confirm
+};
+// two modes:
+// 1 .- new files
+// 2 .- add data to files, if they don't exist they are created
+int initialize_writes(struct file_data ** file, char *output_name, int mode, int discretize_qual, int flatten_qual, int create_index);
+/*
+   write_seq writes a sequence to the files f_bin and its index to f_index
+   pos_chunk_gz is the offset of the beginning of the current gz chunk inside the file
+   seq_name is a pointer to the name of the sequence
+   fasta, quanta and extras are pointers to strings, must be zero terminated.
+   Returns 0 if all goes fine.
+ */
+void inspect_file_data_struct(struct file_data *file);
+// int write_seq(gzFile *f_bin, FILE *f_index, long pos_chunk_gz, char *seq_name, char *fasta, char *quanta, char *extras);
+int write_seq(struct file_data *file, char *seq_name, char *fasta, char *qual, char *extras);
+int close_writes(struct file_data *file);
+/*
+   read_seq reads from filename the sequence named seq_name and returns its
+   fasta, quanta and extras in those variables.
+   It returns 0 if there are no errors, otherwise it returns:
+   -2 : error opening index file (it doesn't exist)
+   -3 : error reading index file
+   -4 : error sequence not found in index file
+   -5 : error opening file (it doesn't exist)
+   -6 : error reading file
+   -7 : error sequence not found
+   -8 : error uncompressing sequence
+   -9 : EOF
+ */
+int read_seq(char *filename, char *seq_name, char **fasta, char **quanta, char **extras);
+// For doing sequential reads of the whole file:
+// int initialize_sequential_reads(struct file_data *filed, char *filename);
+int initialize_sequential_reads(struct file_data ** filed, char *filename);
+// return -9 on EOF
+int read_data_sequential(struct file_data *filed, char **seq_name, char **fasta, char **qual, char **extras);
+int close_sequential_reads(struct file_data *filed);
+/* process_biofile reads from fname (and fname.quanta) and writes to outname
+   (and outname.index) with the binary format
+   Returns 0 if all goes fine */
+// int process_biofile(char *fname,char *qfname, char *efname, char *outname);
+int open_file(char *fname, FILE **file);
+int close_file(FILE *file);
+int chomp(char *str);
+int split_name(char *fname, char *name, char *comments);
+int get_next_seq_fastq(FILE *file, char **name, char **fasta, char **qual, char **comments);
+int get_next_seq_fasta(FILE *file, char *name, char *fasta, char *comments);
+int process_fastq(char *fname, char *efname, char *outname, int discretize_qual, int flatten_qual, int create_index);
+int process_fasta(char *fname, char *efname, char *outname, int discretize_qual, int flatten_qual, int create_index);
+/*
+Format definition
+Main file that contains chunks compressed in gz
+For each sequence the information of that sequence is written with the format:
+  28F143CJN01EBIJN 105 312 0
+That is:
+4 chars for the size of this header, excluding itself, that is, it is the size of
+	the rest of the header
+sequence name
+fasta size
+qual size
+extras size
+The First sequence can be a special sequence with metainfo for this file:
+  30UMACOMPRESSEDFORMAT_version 0 0 0
+  27UMACOMPRESSEDFORMAT_1 0 0 0
+Index file
+Compressed using chunks
+At the beginning a special sequence can be used to store metadata
+like the number of fields, if a binary search can be used, etc.
+That sequence will be:
+UMACOMPRESSEDFORMAT version binary_search begin_of_sequential_index  0 0
+If binary_search is yes then a metaindex follows to do a fast access to the
+index data.
+That will be the first sequence of each chunk and its offset inside the file.
+(Or perhaps it can be put in another file....)
+The rest of the index file will be indexes to the stored sequences, with
+the following fields separated by spaces:
+F143CJN01ETK00 0 471
+Sequence name
+begin of the compressed chunk
+offset inside the chunk of the header of that sequence.
+*/
+int check_error(int error_condition,char *message, int return_value);
+// #ifndef _libfbin
+//
+//     int check_error(int error_condition,char *message, int return_value){
+//         if (error_condition) {
+//             fprintf(stderr,"Error %d; %s\nMSG:%s\n",errno ,message, strerror(errno));
+//             return return_value;
+//         }
+//     }
+//
+//     #define _libfbin
+// #endif
+int free_string(char **string);
+int regenerate_index(char * filename);

data/lib_fqbin_src/mk_fqbin ADDED Viewed

Binary file

data/lib_fqbin_src/mk_fqbin.c ADDED Viewed

@@ -0,0 +1,138 @@
+#include "lib_fqbin.h"
+#include <stdio.h>
+#include <ctype.h>
+#include <unistd.h>
+/* Prints the mk_fqbin help text to stdout and terminates the program with exit(-1). */
+void usage(){
+    /* Help text, one entry per output fragment, emitted in order. */
+    static const char *const help_lines[] = {
+        "mk_fqbin converts a fastQ input file or STDIN stream (use no filename or '-') to compressed fqbin format.\n\n",
+        "Usage: mk_fqbin [OPTIONS] -o output_file [fastq_file_input_file]\n\n",
+        "Options:\n",
+        "    -i create random access index\n",
+        "    -d discretize_qual: quality values are discretized in groups of size discretize_qual. This way less quality values are used and a better compression is obtained\n",
+        "    -f flatten_qual: quality values over flatten_qual (use phred scale) will be set to flatten_qual value in order to achieve a better compression\n",
+        "    -e extras_file: a file with extra metadata for each sequence if standard FASTA format\n",
+        "Mandatory parameters:\n",
+        "    -o output_file: output fqbin file\n",
+        "    -F input is in fasta format (will look for filename.qual for qualities) \n",
+        "\nSCBI - Supercomputación y Bioinformática. University of Malaga. http://www.scbi.uma.es. Copyright 2011\n\n"
+    };
+    size_t line_no;
+    for (line_no = 0; line_no < sizeof help_lines / sizeof help_lines[0]; line_no++) {
+        fputs(help_lines[line_no], stdout);
+    }
+    /* Never returns: callers rely on usage() ending the process. */
+    exit(-1);
+}
+/*******************************************************/
+/* main                                                */
+/*******************************************************/
+/*
+ * Entry point of mk_fqbin: parses the command line and converts a FASTQ
+ * input (default) or a FASTA input (-F) through process_fastq() or
+ * process_fasta().
+ *
+ * Options: -o output file (required), -e extras file, -d discretize group
+ * size (values below 2 disable it), -f flatten threshold (33 is added to the
+ * given number), -i create index, -F input is FASTA, -h help.
+ * At most one positional argument (the input file) is accepted; when none is
+ * given, "-" is passed as the input name.
+ *
+ * Returns the value returned by process_fasta()/process_fastq(). Usage
+ * errors end the program through usage().
+ */
+int main(int argc, char *argv[])
+{
+  int res = 0;
+  int ch;
+  int flatten_qual=0;
+  int discretize_qual=0;
+  char *extras_file=NULL;
+  char *output_file=NULL;
+  char *input_name="-";
+  int create_index=0;
+  int input_in_fasta=0;
+  while ((ch = getopt(argc, argv, "o:e:d:f:iFh")) != -1) {
+          switch (ch) {
+          case 'e':
+                  extras_file=optarg;
+                  break;
+          case 'f':
+                  // the threshold is given as a plain number; 33 is added before it is stored
+                  flatten_qual = atoi(optarg)+33;
+                  break;
+          case 'd':
+                  discretize_qual = atoi(optarg);
+                  // groups smaller than 2 would not merge anything, so treat them as "disabled"
+                  if(discretize_qual<2)
+                  {
+                    discretize_qual=0;
+                  }
+                  break;
+          case 'o':
+                  output_file=optarg;
+                  break;
+          case 'i':
+                  create_index=1;
+                  break;
+          case 'F':
+                  input_in_fasta=1;
+                  break;
+          case 'h':
+                  usage();
+                  break;
+          case '?':
+          default:
+                  usage();
+          }
+  }
+  // skip the options so that argv[0] is the first positional argument
+  argc -= optind;
+  argv += optind;
+  if(output_file==NULL)
+  {
+      printf("Output file is a mandatory option. Provide one with -o filename\n");
+      usage();
+      // usage() already terminates; this is now a real call (it used to be the no-op expression "exit -1")
+      exit(-1);
+  }
+  // Passing NULL to %s is undefined behavior, so print an explicit placeholder when no extras file was given
+  printf("Extra metadata file: %s\nFlattenning qual over:%d (%c char)\nDistretizing qual in groups of:%d\n",(extras_file!=NULL) ? extras_file : "(null)",flatten_qual,flatten_qual,discretize_qual);
+  if (create_index){printf("Creating random access index\n");}
+  // check remaining params: zero or one input file is allowed
+  if (argc>1)
+  {
+      usage();
+  }
+  if (argc==1)
+  {
+      input_name=argv[0];
+  }
+  if(input_in_fasta)
+  {
+      res=process_fasta(input_name,extras_file,output_file,discretize_qual,flatten_qual,create_index);
+  }else{
+      res=process_fastq(input_name,extras_file,output_file,discretize_qual,flatten_qual,create_index);
+  }
+  return res;
+}

data/lib_fqbin_src/other/bwxform.c ADDED Viewed

@@ -0,0 +1,915 @@
+/***************************************************************************
+*                    Burrows-Wheeler Transform Library
+*
+*   File    : bwxform.c
+*   Purpose : Provides prototypes for functions that apply and reverse the
+*             Burrows-Wheeler transform (with or without move to front
+*             coding/decoding).  The algorithms implemented are based upon
+*             those described in "A Block-sorting Lossless Data Compression
+*             Algorithm" by M. Burrows and D.J. Wheeler.
+*   Author  : Michael Dipperstein
+*   Date    : August 20, 2004
+*
+****************************************************************************
+*   UPDATES
+*
+*   $Id: bwxform.c,v 1.6 2007/09/17 13:21:19 michael Exp $
+*   $Log: bwxform.c,v $
+*   Revision 1.6  2007/09/17 13:21:19  michael
+*   Changes required for LGPL v3.
+*
+*   Revision 1.5  2005/11/03 15:01:46  michael
+*   Speed up block sorting using the algorithm suggested by the
+*   Burrows-Wheeler paper.  Radix sort all rotations by the first
+*   two characters before employing quicksort.
+*
+*   Revision 1.4  2005/05/02 13:33:41  michael
+*   Allocate large arrays on heap instead of stack so that gcc builds code
+*   that can handle larger blocks.
+*
+*   Update e-mail address
+*
+*   Revision 1.3  2004/08/27 01:24:16  michael
+*   Write S[0] index (I) before transformed block to avoid having to
+*   find I in a partial block.
+*
+*   Revision 1.2  2004/08/26 06:16:08  michael
+*   Handle partial blocks without need to store block size.  Use size
+*   returned by fread() to indicate smaller than standard block.
+*
+*   Revision 1.1.1.1  2004/08/23 04:34:18  michael
+*   Burrows-Wheeler Transform
+*
+****************************************************************************
+*
+* bwxform: An ANSI C Burrows-Wheeler Transform/Reverse Transform Routines
+* Copyright (C) 2004-2005, 2007 by
+* Michael Dipperstein (mdipper@alumni.engr.ucsb.edu)
+*
+* This file is part of the BWT library.
+*
+* The BWT library is free software; you can redistribute it and/or modify
+* it under the terms of the GNU Lesser General Public License as published
+* by the Free Software Foundation; either version 3 of the License, or (at
+* your option) any later version.
+*
+* The BWT library is distributed in the hope that it will be useful, but
+* WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser
+* General Public License for more details.
+*
+* You should have received a copy of the GNU Lesser General Public License
+* along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*
+***************************************************************************/
+/***************************************************************************
+*                             INCLUDED FILES
+***************************************************************************/
+#include <stdio.h>
+#include <stdlib.h>
+#include <limits.h>
+#include <string.h>
+#include "bwxform.h"
+/***************************************************************************
+*                                CONSTANTS
+***************************************************************************/
+#define BLOCK_SIZE  4096        /* size of blocks */
+#if BLOCK_SIZE > INT_MAX
+#error BLOCK_SIZE must be <= INT_MAX and maximum size_t
+#endif
+/* NOTE: Need to find a way to check for maximum size_t */
+/***************************************************************************
+*                            TYPE DEFINITIONS
+***************************************************************************/
+unsigned char block[BLOCK_SIZE];        /* block being (un)transformed */
+size_t blockSize;                       /* actual size of block */
+/* counters and offsets used for radix sorting with characters */
+unsigned int counters[256];
+unsigned int offsetTable[256];
+/***************************************************************************
+*                                 MACROS
+***************************************************************************/
+/* wraps array index within array bounds (assumes value < 2 * limit) */
+#define Wrap(value, limit)      (((value) < (limit)) ? (value) : ((value) - (limit)))
+/***************************************************************************
+*                               PROTOTYPES
+***************************************************************************/
+/* move to front functions */
+void DoMTF(unsigned char *last, int length);
+void UndoMTF(unsigned char *last, int length);
+/***************************************************************************
+*                                FUNCTIONS
+***************************************************************************/
+/***************************************************************************
+*   Function   : ComparePresorted
+*   Description: This comparison function is designed for use with qsort
+*                and "block", a global array of "blockSize" unsigned chars.
+*                It compares two strings in "block" starting at indices
+*                s1 and s2 and ending at indices s1 - 1 and s2 - 1.
+*                The strings are assumed to be presorted so that first two
+*                characters are known to be matching.
+*   Parameters : s1 - The starting index of a string in block
+*                s2 - The starting index of a string in block
+*   Effects    : NONE
+*   Returned   : > 0 if string s1 > string s2
+*                0 if string s1 == string s2
+*                < 0 if string s1 < string s2
+***************************************************************************/
+int ComparePresorted(const void *s1, const void *s2)
+{
+    int startA, startB;     /* rotation start offsets read through the qsort arguments */
+    int pos;                /* distance from the start of each rotation */
+    int diff;
+    startA = *((int *)s1);
+    startB = *((int *)s2);
+    /***********************************************************************
+    * The caller guarantees both rotations agree on their first two
+    * characters, so the scan begins at the third one and walks forward,
+    * wrapping around the block, until the characters differ or the whole
+    * block has been visited.
+    ***********************************************************************/
+    pos = 2;
+    while (pos < blockSize)
+    {
+        diff = (int)block[Wrap((startA + pos), blockSize)] -
+            (int)block[Wrap((startB + pos), blockSize)];
+        if (diff != 0)
+        {
+            /* first mismatch decides the ordering */
+            return diff;
+        }
+        pos++;
+    }
+    /* no mismatch found: the two rotations are equal */
+    return 0;
+}
+/***************************************************************************
+*   Function   : BWXformFile
+*   Description: This function performs a Burrows-Wheeler transformation
+*                on a file (with optional move to front) and writes the
+*                resulting data to the specified output file.  Comments in
+*                this function indicate corresponding variables, labels,
+*                and sections in "A Block-sorting Lossless Data Compression
+*                Algorithm" by M. Burrows and D.J. Wheeler.
+*   Parameters : inFile - Name of file to transform
+*                outFile - Name of file to write transformed output to
+*                mtf - Set to TRUE if move to front coding should be
+*                      applied.
+*   Effects    : A Burrows-Wheeler transformation (and possibly move to
+*                front encoding) is applied to inFile.   The results of
+*                the transformation are written to outFile.
+*   Returned   : TRUE for success, otherwise FALSE.
+***************************************************************************/
+int BWXformFile(char *inFile, char *outFile, char mtf)
+{
+    int i, j, k;
+    int status = FALSE;                 /* becomes TRUE only after every block was written */
+    FILE *fpIn = NULL;
+    FILE *fpOut = NULL;
+    unsigned int *rotationIdx = NULL;   /* index of first char in rotation */
+    unsigned int *v = NULL;             /* index of radix sorted characters */
+    int s0Idx;                          /* index of S0 in rotations (I) */
+    unsigned char *last = NULL;         /* last characters from sorted rotations */
+    /***********************************************************************
+    * BLOCK_SIZE arrays are allocated on the heap, because gcc generates
+    * code that throws a Segmentation fault when the large arrays are
+    * allocated on the stack.  Every failure below jumps to the single
+    * cleanup section, so nothing acquired so far is leaked.
+    ***********************************************************************/
+    rotationIdx = malloc(BLOCK_SIZE * sizeof *rotationIdx);
+    if (NULL == rotationIdx)
+    {
+        perror("Allocating array of rotation indices");
+        goto cleanup;
+    }
+    v = malloc(BLOCK_SIZE * sizeof *v);
+    if (NULL == v)      /* was compared against rotationIdx, which never detects a failed malloc */
+    {
+        perror("Allocating array of sort indices");
+        goto cleanup;
+    }
+    last = malloc(BLOCK_SIZE * sizeof *last);
+    if (NULL == last)
+    {
+        perror("Allocating array of last characters");
+        goto cleanup;
+    }
+    /* open input and output files */
+    if ((fpIn = fopen(inFile, "rb")) == NULL)
+    {
+        perror(inFile);
+        goto cleanup;
+    }
+    if (outFile == NULL)
+    {
+        fpOut = stdout;
+    }
+    else
+    {
+        if ((fpOut = fopen(outFile, "wb")) == NULL)
+        {
+            /* report before any fclose() so errno still belongs to fopen() */
+            perror(outFile);
+            goto cleanup;
+        }
+    }
+    while((blockSize = fread(block, sizeof(unsigned char), BLOCK_SIZE, fpIn))
+        != 0)
+    {
+        /*******************************************************************
+        * Sort the rotated strings in the block.  A radix sort is performed
+        * on the first two characters of all the rotated strings (2nd
+        * character then 1st).  All rotated strings with matching initial
+        * characters are then quicksorted. - Q4..Q7
+        *******************************************************************/
+        /*** radix sort on second character in rotation ***/
+        /* count number of characters for radix sort */
+        memset(counters, 0, 256 * sizeof(int));
+        for (i = 0; i < blockSize; i++)
+        {
+            counters[block[i]]++;
+        }
+        offsetTable[0] = 0;
+        for(i = 1; i < 256; i++)
+        {
+            /* determine number of values before those sorted under i */
+            offsetTable[i] = offsetTable[i - 1] + counters[i - 1];
+        }
+        /* sort on 2nd character */
+        for (i = 0; i < blockSize - 1; i++)
+        {
+            j = block[i + 1];
+            v[offsetTable[j]] = i;
+            offsetTable[j] = offsetTable[j] + 1;
+        }
+        /* handle wrap around for string starting at end of block */
+        j = block[0];
+        v[offsetTable[j]] = i;
+        offsetTable[0] = 0;
+        /*** radix sort on first character in rotation ***/
+        for(i = 1; i < 256; i++)
+        {
+            /* determine number of values before those sorted under i */
+            offsetTable[i] = offsetTable[i - 1] + counters[i - 1];
+        }
+        for (i = 0; i < blockSize; i++)
+        {
+            j = v[i];
+            j = block[j];
+            rotationIdx[offsetTable[j]] = v[i];
+            offsetTable[j] = offsetTable[j] + 1;
+        }
+        /*******************************************************************
+        * now rotationIdx contains the sort order of all strings sorted
+        * by their first 2 characters.  Use qsort to sort the strings
+        * that have their first two characters matching.
+        *******************************************************************/
+        for (i = 0, k = 0; (i <= UCHAR_MAX) && (k < (blockSize - 1)); i++)
+        {
+            for (j = 0; (j <= UCHAR_MAX) && (k < (blockSize - 1)); j++)
+            {
+                int first = k;
+                /* count strings starting with ij */
+                while ((i == block[rotationIdx[k]]) &&
+                    (j == block[Wrap(rotationIdx[k] + 1,  blockSize)]))
+                {
+                    k++;
+                    if (k == blockSize)
+                    {
+                        /* we've searched the whole block */
+                        break;
+                    }
+                }
+                if (k - first > 1)
+                {
+                    /* there are at least 2 strings starting with ij, sort them */
+                    qsort(&rotationIdx[first], k - first, sizeof(int),
+                        ComparePresorted);
+                }
+            }
+        }
+        /* find last characters of rotations (L) - C2 */
+        s0Idx = 0;
+        for (i = 0; i < blockSize; i++)
+        {
+            if (rotationIdx[i] != 0)
+            {
+                last[i] = block[rotationIdx[i] - 1];
+            }
+            else
+            {
+                /* unrotated string 1st character is end of string */
+                s0Idx = i;
+                last[i] = block[blockSize - 1];
+            }
+        }
+        if (mtf)
+        {
+            DoMTF(last, blockSize);
+        }
+        /* write index of end of unrotated string (I), then the last */
+        /* characters of rotations (L); a short write is a failure */
+        if ((fwrite(&s0Idx, sizeof(int), 1, fpOut) != 1) ||
+            (fwrite(last, sizeof(unsigned char), blockSize, fpOut) != blockSize))
+        {
+            perror("Writing transformed block");
+            goto cleanup;
+        }
+    }
+    status = TRUE;
+cleanup:
+    /* free(NULL) is a no-op, so the arrays need no individual guards */
+    free(rotationIdx);
+    free(v);
+    free(last);
+    if (fpIn != NULL)
+    {
+        fclose(fpIn);
+    }
+    if (fpOut != NULL)
+    {
+        /* as before, the output stream is closed even when it is stdout */
+        fclose(fpOut);
+    }
+    return status;
+}
+int BWXform(char *inString, char *outString, int mtf)
+{
+    int i, j, k;
+    FILE *fpIn, *fpOut;
+    unsigned int *rotationIdx;      /* index of first char in rotation */
+    unsigned int *v;                /* index of radix sorted charaters */
+    int s0Idx;                      /* index of S0 in rotations (I) */
+    unsigned char *last;            /* last characters from sorted rotations */
+    /***********************************************************************
+    * BLOCK_SIZE arrays are allocated on the heap, because gcc generates
+    * code that throws a Segmentation fault when the large arrays are
+    * allocated on the stack.
+    ***********************************************************************/
+    rotationIdx = (unsigned int *)malloc(BLOCK_SIZE * sizeof(unsigned int));
+    if (NULL == rotationIdx)
+    {
+        perror("Allocating array of rotation indices");
+        return FALSE;
+    }
+    v = (unsigned int *)malloc(BLOCK_SIZE * sizeof(unsigned int));
+    if (v == rotationIdx)
+    {
+        perror("Allocating array of sort indices");
+        free(rotationIdx);
+        return FALSE;
+    }
+    last = (unsigned char *)malloc(BLOCK_SIZE * sizeof(unsigned char));
+    if (NULL == last)
+    {
+        perror("Allocating array of last characters");
+        free(rotationIdx);
+        free(v);
+        return FALSE;
+    }
+    strcpy(block,inString);
+    blockSize=strlen(inString);
+    block[blockSize]='\0';
+    // printf("block:%s, SIZE: %ld\n",block,blockSize);
+    // while((blockSize = fread(block, sizeof(unsigned char), BLOCK_SIZE, fpIn))
+    //     != 0)
+    // {
+        /*******************************************************************
+        * Sort the rotated strings in the block.  A radix sort is performed
+        * on the first to characters of all the rotated strings (2nd
+        * character then 1st).  All rotated strings with matching initial
+        * characters are then quicksorted. - Q4..Q7
+        *******************************************************************/
+        /*** radix sort on second character in rotation ***/
+        /* count number of characters for radix sort */
+        memset(counters, 0, 256 * sizeof(int));
+        for (i = 0; i < blockSize; i++)
+        {
+            counters[block[i]]++;
+        }
+        offsetTable[0] = 0;
+        for(i = 1; i < 256; i++)
+        {
+            /* determine number of values before those sorted under i */
+            offsetTable[i] = offsetTable[i - 1] + counters[i - 1];
+        }
+        /* sort on 2nd character */
+        for (i = 0; i < blockSize - 1; i++)
+        {
+            j = block[i + 1];
+            v[offsetTable[j]] = i;
+            offsetTable[j] = offsetTable[j] + 1;
+        }
+        /* handle wrap around for string starting at end of block */
+        j = block[0];
+        v[offsetTable[j]] = i;
+        offsetTable[0] = 0;
+        /*** radix sort on first character in rotation ***/
+        for(i = 1; i < 256; i++)
+        {
+            /* determine number of values before those sorted under i */
+            offsetTable[i] = offsetTable[i - 1] + counters[i - 1];
+        }
+        for (i = 0; i < blockSize; i++)
+        {
+            j = v[i];
+            j = block[j];
+            rotationIdx[offsetTable[j]] = v[i];
+            offsetTable[j] = offsetTable[j] + 1;
+        }
+        /*******************************************************************
+        * now rotationIdx contains the sort order of all strings sorted
+        * by their first 2 characters.  Use qsort to sort the strings
+        * that have their first two characters matching.
+        *******************************************************************/
+        for (i = 0, k = 0; (i <= UCHAR_MAX) && (k < (blockSize - 1)); i++)
+        {
+            for (j = 0; (j <= UCHAR_MAX) && (k < (blockSize - 1)); j++)
+            {
+                int first = k;
+                /* count strings starting with ij */
+                while ((i == block[rotationIdx[k]]) &&
+                    (j == block[Wrap(rotationIdx[k] + 1,  blockSize)]))
+                {
+                    k++;
+                    if (k == blockSize)
+                    {
+                        /* we've searched the whole block */
+                        break;
+                    }
+                }
+                if (k - first > 1)
+                {
+                    /* there are at least 2 strings starting with ij, sort them */
+                    qsort(&rotationIdx[first], k - first, sizeof(int),
+                        ComparePresorted);
+                }
+            }
+        }
+        /* find last characters of rotations (L) - C2 */
+        s0Idx = 0;
+        for (i = 0; i < blockSize; i++)
+        {
+            if (rotationIdx[i] != 0)
+            {
+                last[i] = block[rotationIdx[i] - 1];
+            }
+            else
+            {
+                /* unrotated string 1st character is end of string */
+                s0Idx = i;
+                last[i] = block[blockSize - 1];
+            }
+        }
+        // printf("ANTES:\n");
+        // for (i = 0; i < blockSize; i++)
+        // {
+        //     printf("%2d,",last[i]);
+        // }
+        // printf("\nDESPUES:\n");
+        if (mtf)
+        {
+            DoMTF(last, blockSize);
+        }
+        // for (i = 0; i < blockSize; i++)
+        // {
+        //     printf("%2d,",last[i]);
+        // }
+        // printf("\n");
+        // /* write index of end of unrotated string (I) */
+        // fwrite(&s0Idx, sizeof(int), 1, fpOut);
+        //
+        // /* write out last characters of rotations (L) */
+        // fwrite(last, sizeof(unsigned char), blockSize, fpOut);
+    // } //FIN WHILE
+        memcpy((char *)outString, (void *)last, sizeof(unsigned char) * blockSize);
+        // strncpy(outString,last,blockSize+1);
+        // blockSize=strlen(last);
+        // printf("block2:%s, SIZE: %ld\n",last,strlen(last));
+    /* clean up */
+    free(rotationIdx);
+    free(v);
+    free(last);
+    // fclose(fpIn);
+    // fclose(fpOut);
+    return TRUE;
+}
+/***************************************************************************
+*   Function   : DoMTF
+*   Description: This function performs move to front encoding on a block
+*                of data that has already had the Burrows-Wheeler
+*                transformation applied to it.  Comments in this function
+*                indicate corresponding variables, labels, and sections in
+*                "A Block-sorting Lossless Data Compression Algorithm" by
+*                M. Burrows and D.J. Wheeler.
+*   Parameters : last - pointer to an array of "last" characters from
+*                       Burrows-Wheeler rotations (L)
+*                length - the number of unsigned chars contained in last.
+*   Effects    : Move to front encoding is applied in place.  Every element
+*                of last is replaced by the position its value occupied in
+*                the recency list at the moment it was encoded.  No scratch
+*                buffer is allocated, so the function cannot fail and works
+*                for any length.  A NULL pointer or a length <= 0 leaves
+*                everything untouched.
+*   Returned   : NONE
+***************************************************************************/
+void DoMTF(unsigned char *last, int length)
+{
+    unsigned char list[UCHAR_MAX + 1];      /* list of characters (Y) */
+    unsigned char symbol;                   /* character being encoded */
+    int i, j;
+    if ((NULL == last) || (length <= 0))
+    {
+        /* nothing to encode */
+        return;
+    }
+    /* start with alphabetically sorted list of characters */
+    for (i = 0; i <= UCHAR_MAX; i++)
+    {
+        list[i] = (unsigned char)i;
+    }
+    /* move-to-front coding - M1 */
+    for (i = 0; i < length; i++)
+    {
+        /*******************************************************************
+        * last[i] is read before it is overwritten and is never needed
+        * again afterwards, which is what makes in-place encoding safe.
+        *******************************************************************/
+        symbol = last[i];
+        /*******************************************************************
+        * Sequential search: move to front keeps common characters near
+        * the front of the list.  The list always holds every byte value
+        * exactly once, so the search is guaranteed to stop within it.
+        *******************************************************************/
+        j = 0;
+        while (list[j] != symbol)
+        {
+            j++;
+        }
+        last[i] = (unsigned char)j;
+        /* now move the current character to the front of the list */
+        for (; j > 0; j--)
+        {
+            list[j] = list[j - 1];
+        }
+        list[0] = symbol;
+    }
+}
+/***************************************************************************
+*   Function   : BWReverseXformFile
+*   Description: This function reverses a Burrows-Wheeler transformation
+*                on a file (with optional move to front) and writes the
+*                resulting data to the specified output file.  Comments in
+*                this function indicate corresponding variables, labels,
+*                and sections in "A Block-sorting Lossless Data Compression
+*                Algorithm" by M. Burrows and D.J. Wheeler.
+*   Parameters : inFile - Name of file to reverse transform
+*                outFile - Name of file to write reverse transformed
+*                          output to (NULL selects stdout)
+*                mtf - Set to TRUE if move to front decoding should be
+*                      applied
+*   Effects    : A Burrows-Wheeler reverse transformation (and possibly
+*                move to front decoding) is applied to inFile.   The results
+*                of the reverse transformation are written to outFile.
+*                The file scope variables block and blockSize are
+*                overwritten.  When output goes to stdout the stream is
+*                flushed, not closed, so the caller may keep using it.
+*   Returned   : TRUE for success, otherwise FALSE (allocation failure,
+*                a file that cannot be opened, a block whose stored index
+*                lies outside the block, or a failed write).
+***************************************************************************/
+int BWReverseXformFile(char *inFile, char *outFile, char mtf)
+{
+    FILE *fpIn = NULL;
+    FILE *fpOut = NULL;
+    int i, j, sum;
+    int count[UCHAR_MAX + 1];   /* after D1: # of chars in block < i */
+    int *pred = NULL;           /* pred[i] = # of times block[i] appears in
+                                   block[0 .. i - 1] */
+    unsigned char *unrotated = NULL;    /* original block */
+    int s0Idx;                  /* index of S0 in rotations (I) */
+    int result = FALSE;         /* only set to TRUE once all blocks are done */
+    /***********************************************************************
+    * BLOCK_SIZE arrays are allocated on the heap, because gcc generates
+    * code that throws a Segmentation fault when the large arrays are
+    * allocated on the stack.
+    ***********************************************************************/
+    pred = (int *)malloc(BLOCK_SIZE * sizeof(int));
+    if (NULL == pred)
+    {
+        perror("Allocating array of matching predicessors");
+        goto cleanup;
+    }
+    unrotated = (unsigned char *)malloc(BLOCK_SIZE * sizeof(unsigned char));
+    if (NULL == unrotated)
+    {
+        perror("Allocating array to store unrotated block");
+        goto cleanup;
+    }
+    /* open input and output files */
+    if ((fpIn = fopen(inFile, "rb")) == NULL)
+    {
+        perror(inFile);
+        goto cleanup;
+    }
+    if (outFile == NULL)
+    {
+        fpOut = stdout;
+    }
+    else if ((fpOut = fopen(outFile, "wb")) == NULL)
+    {
+        /* report before any other library call can disturb errno */
+        perror(outFile);
+        goto cleanup;
+    }
+    while (fread(&s0Idx, sizeof(int), 1, fpIn) != 0)
+    {
+        blockSize = fread(block, sizeof(unsigned char), BLOCK_SIZE, fpIn);
+        /*******************************************************************
+        * The index comes straight from the file.  Reject anything that
+        * does not point inside the block just read (this also catches a
+        * file truncated right after an index), otherwise the decoding
+        * loop below would read outside block and pred.
+        *******************************************************************/
+        if ((s0Idx < 0) || ((long)s0Idx >= (long)blockSize))
+        {
+            fprintf(stderr,
+                "%s: corrupt input, block index %d is outside a block of %ld bytes\n",
+                inFile, s0Idx, (long)blockSize);
+            goto cleanup;
+        }
+        if (mtf)
+        {
+            UndoMTF(block, blockSize);
+        }
+        /* code based on pseudo code from section 4.2 (D1 and D2) follows */
+        for (i = 0; i <= UCHAR_MAX; i++)
+        {
+            count[i] = 0;
+        }
+        /*******************************************************************
+        * Set pred[i] to the number of times block[i] appears in the
+        * substring block[0 .. i - 1].  As a useful side effect count[i]
+        * will be the number of times character i appears in block.
+        *******************************************************************/
+        for (i = 0; i < (int)blockSize; i++)
+        {
+            pred[i] = count[block[i]];
+            count[block[i]]++;
+        }
+        /*******************************************************************
+        * Finally, set count[i] to the number of characters in block
+        * lexicographically less than i.
+        *******************************************************************/
+        sum = 0;
+        for (i = 0; i <= UCHAR_MAX; i++)
+        {
+            j = count[i];
+            count[i] = sum;
+            sum += j;
+        }
+        /* construct the initial unrotated string (S[0]), back to front */
+        i = s0Idx;
+        for (j = (int)blockSize - 1; j >= 0; j--)
+        {
+            unrotated[j] = block[i];
+            i = pred[i] + count[block[i]];
+        }
+        if (fwrite(unrotated, sizeof(unsigned char), blockSize, fpOut) !=
+            (size_t)blockSize)
+        {
+            perror("Writing reverse transformed block");
+            goto cleanup;
+        }
+    }
+    result = TRUE;
+cleanup:
+    /* single exit: every resource acquired so far is released exactly once */
+    free(pred);
+    free(unrotated);
+    if (fpIn != NULL)
+    {
+        fclose(fpIn);
+    }
+    if (fpOut == stdout)
+    {
+        /* stdout belongs to the caller: flush it, do not close it */
+        fflush(fpOut);
+    }
+    else if (fpOut != NULL)
+    {
+        /* a failing close means buffered output never reached the file */
+        if (fclose(fpOut) != 0)
+        {
+            perror(outFile);
+            result = FALSE;
+        }
+    }
+    return result;
+}
+/***************************************************************************
+*   Function   : BWReverseXform
+*   Description: In-memory variant of BWReverseXformFile.  It reverses a
+*                Burrows-Wheeler transformation (with optional move to
+*                front decoding) on a single block held in a buffer.
+*   Parameters : inString - buffer holding the transformed block.  It is
+*                           treated as raw bytes, not as a C string.
+*                outString - buffer receiving the reverse transformed
+*                            block; must have room for size bytes.  No
+*                            terminating NUL is written.
+*                mtf - non-zero if move to front decoding should be
+*                      applied first
+*                size - number of bytes in inString (0 .. BLOCK_SIZE)
+*   Effects    : The file scope variables block and blockSize are
+*                overwritten and size bytes are written to outString.
+*   Returned   : TRUE for success, otherwise FALSE (NULL buffer, size
+*                outside 0 .. BLOCK_SIZE, or allocation failure).
+*   NOTE(review): the index of the unrotated string (I) is not part of
+*                this interface, and the variable that should hold it used
+*                to be read uninitialized.  Decoding now always starts at
+*                row 0 of the sorted rotations, so the output is the
+*                rotation stored in that row.  That equals the original
+*                block only if the caller's data guarantees the original
+*                sorts first - TODO confirm with callers, or extend the
+*                interface to carry the index.
+***************************************************************************/
+int BWReverseXform(char *inString, char *outString, int mtf, long size)
+{
+    int i, j, sum;
+    int n;                      /* block length as an int loop bound */
+    int count[UCHAR_MAX + 1];   /* after D1: # of chars in block < i */
+    int *pred;                  /* pred[i] = # of times block[i] appears in
+                                   block[0 .. i - 1] */
+    unsigned char *unrotated;   /* original block, built in outString */
+    int s0Idx = 0;              /* row to decode from - see NOTE(review) */
+    /* block holds BLOCK_SIZE bytes, so larger inputs cannot be processed */
+    if ((NULL == inString) || (NULL == outString) ||
+        (size < 0) || (size > (long)BLOCK_SIZE))
+    {
+        return FALSE;
+    }
+    /***********************************************************************
+    * BLOCK_SIZE arrays are allocated on the heap, because gcc generates
+    * code that throws a Segmentation fault when the large arrays are
+    * allocated on the stack.
+    ***********************************************************************/
+    pred = (int *)malloc(BLOCK_SIZE * sizeof(int));
+    if (NULL == pred)
+    {
+        perror("Allocating array of matching predicessors");
+        return FALSE;
+    }
+    n = (int)size;
+    blockSize = size;
+    /***********************************************************************
+    * memcpy, not strncpy: the data is binary.  strncpy stops copying at
+    * the first 0 byte and zero fills the rest, which destroys move to
+    * front encoded data (it consists largely of 0 bytes).
+    ***********************************************************************/
+    memcpy((void *)block, (void *)inString, (size_t)n);
+    /***********************************************************************
+    * The input now lives in block, so the result can be assembled
+    * directly in the output buffer even if it is the same buffer as
+    * inString.
+    ***********************************************************************/
+    unrotated = (unsigned char *)outString;
+    if (mtf)
+    {
+        UndoMTF(block, blockSize);
+    }
+    /* code based on pseudo code from section 4.2 (D1 and D2) follows */
+    for (i = 0; i <= UCHAR_MAX; i++)
+    {
+        count[i] = 0;
+    }
+    /***********************************************************************
+    * Set pred[i] to the number of times block[i] appears in the
+    * substring block[0 .. i - 1].  As a useful side effect count[i]
+    * will be the number of times character i appears in block.
+    ***********************************************************************/
+    for (i = 0; i < n; i++)
+    {
+        pred[i] = count[block[i]];
+        count[block[i]]++;
+    }
+    /***********************************************************************
+    * Finally, set count[i] to the number of characters in block
+    * lexicographically less than i.
+    ***********************************************************************/
+    sum = 0;
+    for (i = 0; i <= UCHAR_MAX; i++)
+    {
+        j = count[i];
+        count[i] = sum;
+        sum += j;
+    }
+    /* construct the unrotated string back to front */
+    i = s0Idx;
+    for (j = n - 1; j >= 0; j--)
+    {
+        unrotated[j] = block[i];
+        i = pred[i] + count[block[i]];
+    }
+    /* clean up */
+    free(pred);
+    return TRUE;
+}
+/***************************************************************************
+*   Function   : UndoMTF
+*   Description: This function reverses move to front encoding on a block
+*                of data that has already had the Burrows-Wheeler
+*                transformation applied to it.  Comments in this function
+*                indicate corresponding variables, labels, and sections in
+*                "A Block-sorting Lossless Data Compression Algorithm" by
+*                M. Burrows and D.J. Wheeler.
+*   Parameters : last - pointer to an array of mtf encoded characters from
+*                       Burrows-Wheeler rotations.
+*                length - the number of unsigned chars contained in last.
+*   Effects    : Move to front encoding is reversed in place.  After the
+*                call the array last (L) holds the last characters of the
+*                sorted rotations.  No scratch buffer is allocated, so the
+*                function cannot fail and works for any length.  A NULL
+*                pointer or a length <= 0 leaves everything untouched.
+*   Returned   : NONE
+***************************************************************************/
+void UndoMTF(unsigned char *last, int length)
+{
+    unsigned char list[UCHAR_MAX + 1];      /* list of characters (Y) */
+    unsigned char decoded;                  /* character just decoded */
+    int i, j;
+    if ((NULL == last) || (length <= 0))
+    {
+        /* nothing to decode */
+        return;
+    }
+    /* start with alphabetically sorted list of characters */
+    for (i = 0; i <= UCHAR_MAX; i++)
+    {
+        list[i] = (unsigned char)i;
+    }
+    /* move-to-front decoding - W2 */
+    for (i = 0; i < length; i++)
+    {
+        /*******************************************************************
+        * last[i] still holds the encoded position (R) here.  It is read
+        * once and then replaced by the decoded character, which is what
+        * makes in-place decoding safe.
+        *******************************************************************/
+        j = last[i];
+        decoded = list[j];
+        last[i] = decoded;
+        /* now move the current character to the front of the list */
+        for (; j > 0; j--)
+        {
+            list[j] = list[j - 1];
+        }
+        list[0] = decoded;
+    }
+}