RubyGems - jk-ferret - Versions diffs - 0.11.8.2 - Mend

jk-ferret 0.11.8.2

Files changed (228) hide show

data/CHANGELOG +24 -0
data/MIT-LICENSE +20 -0
data/README +90 -0
data/RELEASE_CHANGES +137 -0
data/RELEASE_NOTES +60 -0
data/Rakefile +443 -0
data/TODO +109 -0
data/TUTORIAL +231 -0
data/bin/ferret-browser +79 -0
data/ext/BZLIB_blocksort.c +1094 -0
data/ext/BZLIB_bzlib.c +1578 -0
data/ext/BZLIB_compress.c +672 -0
data/ext/BZLIB_crctable.c +104 -0
data/ext/BZLIB_decompress.c +626 -0
data/ext/BZLIB_huffman.c +205 -0
data/ext/BZLIB_randtable.c +84 -0
data/ext/STEMMER_api.c +66 -0
data/ext/STEMMER_libstemmer.c +93 -0
data/ext/STEMMER_stem_ISO_8859_1_danish.c +337 -0
data/ext/STEMMER_stem_ISO_8859_1_dutch.c +624 -0
data/ext/STEMMER_stem_ISO_8859_1_english.c +1117 -0
data/ext/STEMMER_stem_ISO_8859_1_finnish.c +762 -0
data/ext/STEMMER_stem_ISO_8859_1_french.c +1246 -0
data/ext/STEMMER_stem_ISO_8859_1_german.c +503 -0
data/ext/STEMMER_stem_ISO_8859_1_hungarian.c +1230 -0
data/ext/STEMMER_stem_ISO_8859_1_italian.c +1065 -0
data/ext/STEMMER_stem_ISO_8859_1_norwegian.c +297 -0
data/ext/STEMMER_stem_ISO_8859_1_porter.c +749 -0
data/ext/STEMMER_stem_ISO_8859_1_portuguese.c +1017 -0
data/ext/STEMMER_stem_ISO_8859_1_spanish.c +1093 -0
data/ext/STEMMER_stem_ISO_8859_1_swedish.c +307 -0
data/ext/STEMMER_stem_ISO_8859_2_romanian.c +998 -0
data/ext/STEMMER_stem_KOI8_R_russian.c +700 -0
data/ext/STEMMER_stem_UTF_8_danish.c +339 -0
data/ext/STEMMER_stem_UTF_8_dutch.c +634 -0
data/ext/STEMMER_stem_UTF_8_english.c +1125 -0
data/ext/STEMMER_stem_UTF_8_finnish.c +768 -0
data/ext/STEMMER_stem_UTF_8_french.c +1256 -0
data/ext/STEMMER_stem_UTF_8_german.c +509 -0
data/ext/STEMMER_stem_UTF_8_hungarian.c +1234 -0
data/ext/STEMMER_stem_UTF_8_italian.c +1073 -0
data/ext/STEMMER_stem_UTF_8_norwegian.c +299 -0
data/ext/STEMMER_stem_UTF_8_porter.c +755 -0
data/ext/STEMMER_stem_UTF_8_portuguese.c +1023 -0
data/ext/STEMMER_stem_UTF_8_romanian.c +1004 -0
data/ext/STEMMER_stem_UTF_8_russian.c +694 -0
data/ext/STEMMER_stem_UTF_8_spanish.c +1097 -0
data/ext/STEMMER_stem_UTF_8_swedish.c +309 -0
data/ext/STEMMER_stem_UTF_8_turkish.c +2205 -0
data/ext/STEMMER_utilities.c +478 -0
data/ext/analysis.c +1710 -0
data/ext/analysis.h +266 -0
data/ext/api.h +26 -0
data/ext/array.c +125 -0
data/ext/array.h +62 -0
data/ext/bitvector.c +96 -0
data/ext/bitvector.h +594 -0
data/ext/bzlib.h +282 -0
data/ext/bzlib_private.h +503 -0
data/ext/compound_io.c +384 -0
data/ext/config.h +52 -0
data/ext/document.c +159 -0
data/ext/document.h +63 -0
data/ext/except.c +102 -0
data/ext/except.h +176 -0
data/ext/extconf.rb +15 -0
data/ext/ferret.c +416 -0
data/ext/ferret.h +94 -0
data/ext/field_index.c +262 -0
data/ext/field_index.h +52 -0
data/ext/filter.c +157 -0
data/ext/fs_store.c +493 -0
data/ext/global.c +458 -0
data/ext/global.h +302 -0
data/ext/hash.c +524 -0
data/ext/hash.h +515 -0
data/ext/hashset.c +192 -0
data/ext/hashset.h +215 -0
data/ext/header.h +58 -0
data/ext/helper.c +63 -0
data/ext/helper.h +21 -0
data/ext/index.c +6804 -0
data/ext/index.h +935 -0
data/ext/internal.h +1019 -0
data/ext/lang.c +10 -0
data/ext/lang.h +68 -0
data/ext/libstemmer.h +79 -0
data/ext/mempool.c +88 -0
data/ext/mempool.h +43 -0
data/ext/modules.h +190 -0
data/ext/multimapper.c +351 -0
data/ext/multimapper.h +60 -0
data/ext/posh.c +1006 -0
data/ext/posh.h +973 -0
data/ext/priorityqueue.c +149 -0
data/ext/priorityqueue.h +155 -0
data/ext/q_boolean.c +1621 -0
data/ext/q_const_score.c +162 -0
data/ext/q_filtered_query.c +212 -0
data/ext/q_fuzzy.c +280 -0
data/ext/q_match_all.c +149 -0
data/ext/q_multi_term.c +673 -0
data/ext/q_parser.c +3103 -0
data/ext/q_phrase.c +1206 -0
data/ext/q_prefix.c +98 -0
data/ext/q_range.c +682 -0
data/ext/q_span.c +2390 -0
data/ext/q_term.c +337 -0
data/ext/q_wildcard.c +167 -0
data/ext/r_analysis.c +2626 -0
data/ext/r_index.c +3468 -0
data/ext/r_qparser.c +635 -0
data/ext/r_search.c +4490 -0
data/ext/r_store.c +513 -0
data/ext/r_utils.c +1131 -0
data/ext/ram_store.c +476 -0
data/ext/scanner.c +895 -0
data/ext/scanner.h +36 -0
data/ext/scanner_mb.c +6701 -0
data/ext/scanner_utf8.c +4415 -0
data/ext/search.c +1864 -0
data/ext/search.h +953 -0
data/ext/similarity.c +151 -0
data/ext/similarity.h +89 -0
data/ext/sort.c +786 -0
data/ext/stem_ISO_8859_1_danish.h +16 -0
data/ext/stem_ISO_8859_1_dutch.h +16 -0
data/ext/stem_ISO_8859_1_english.h +16 -0
data/ext/stem_ISO_8859_1_finnish.h +16 -0
data/ext/stem_ISO_8859_1_french.h +16 -0
data/ext/stem_ISO_8859_1_german.h +16 -0
data/ext/stem_ISO_8859_1_hungarian.h +16 -0
data/ext/stem_ISO_8859_1_italian.h +16 -0
data/ext/stem_ISO_8859_1_norwegian.h +16 -0
data/ext/stem_ISO_8859_1_porter.h +16 -0
data/ext/stem_ISO_8859_1_portuguese.h +16 -0
data/ext/stem_ISO_8859_1_spanish.h +16 -0
data/ext/stem_ISO_8859_1_swedish.h +16 -0
data/ext/stem_ISO_8859_2_romanian.h +16 -0
data/ext/stem_KOI8_R_russian.h +16 -0
data/ext/stem_UTF_8_danish.h +16 -0
data/ext/stem_UTF_8_dutch.h +16 -0
data/ext/stem_UTF_8_english.h +16 -0
data/ext/stem_UTF_8_finnish.h +16 -0
data/ext/stem_UTF_8_french.h +16 -0
data/ext/stem_UTF_8_german.h +16 -0
data/ext/stem_UTF_8_hungarian.h +16 -0
data/ext/stem_UTF_8_italian.h +16 -0
data/ext/stem_UTF_8_norwegian.h +16 -0
data/ext/stem_UTF_8_porter.h +16 -0
data/ext/stem_UTF_8_portuguese.h +16 -0
data/ext/stem_UTF_8_romanian.h +16 -0
data/ext/stem_UTF_8_russian.h +16 -0
data/ext/stem_UTF_8_spanish.h +16 -0
data/ext/stem_UTF_8_swedish.h +16 -0
data/ext/stem_UTF_8_turkish.h +16 -0
data/ext/stopwords.c +410 -0
data/ext/store.c +698 -0
data/ext/store.h +799 -0
data/ext/symbol.c +10 -0
data/ext/symbol.h +23 -0
data/ext/term_vectors.c +73 -0
data/ext/threading.h +31 -0
data/ext/win32.h +62 -0
data/lib/ferret.rb +30 -0
data/lib/ferret/browser.rb +246 -0
data/lib/ferret/browser/s/global.js +192 -0
data/lib/ferret/browser/s/style.css +148 -0
data/lib/ferret/browser/views/document/list.rhtml +49 -0
data/lib/ferret/browser/views/document/show.rhtml +27 -0
data/lib/ferret/browser/views/error/index.rhtml +7 -0
data/lib/ferret/browser/views/help/index.rhtml +8 -0
data/lib/ferret/browser/views/home/index.rhtml +29 -0
data/lib/ferret/browser/views/layout.rhtml +22 -0
data/lib/ferret/browser/views/term-vector/index.rhtml +4 -0
data/lib/ferret/browser/views/term/index.rhtml +199 -0
data/lib/ferret/browser/views/term/termdocs.rhtml +1 -0
data/lib/ferret/browser/webrick.rb +14 -0
data/lib/ferret/document.rb +130 -0
data/lib/ferret/field_infos.rb +44 -0
data/lib/ferret/field_symbol.rb +87 -0
data/lib/ferret/index.rb +973 -0
data/lib/ferret/number_tools.rb +157 -0
data/lib/ferret/version.rb +3 -0
data/setup.rb +1555 -0
data/test/long_running/largefile/tc_largefile.rb +46 -0
data/test/test_all.rb +5 -0
data/test/test_helper.rb +29 -0
data/test/test_installed.rb +1 -0
data/test/threading/number_to_spoken.rb +132 -0
data/test/threading/thread_safety_index_test.rb +88 -0
data/test/threading/thread_safety_read_write_test.rb +73 -0
data/test/threading/thread_safety_test.rb +133 -0
data/test/unit/analysis/tc_analyzer.rb +550 -0
data/test/unit/analysis/tc_token_stream.rb +653 -0
data/test/unit/index/tc_index.rb +867 -0
data/test/unit/index/tc_index_reader.rb +699 -0
data/test/unit/index/tc_index_writer.rb +447 -0
data/test/unit/index/th_doc.rb +332 -0
data/test/unit/query_parser/tc_query_parser.rb +238 -0
data/test/unit/search/tc_filter.rb +156 -0
data/test/unit/search/tc_fuzzy_query.rb +147 -0
data/test/unit/search/tc_index_searcher.rb +67 -0
data/test/unit/search/tc_multi_searcher.rb +128 -0
data/test/unit/search/tc_multiple_search_requests.rb +58 -0
data/test/unit/search/tc_search_and_sort.rb +179 -0
data/test/unit/search/tc_sort.rb +49 -0
data/test/unit/search/tc_sort_field.rb +27 -0
data/test/unit/search/tc_spans.rb +190 -0
data/test/unit/search/tm_searcher.rb +436 -0
data/test/unit/store/tc_fs_store.rb +115 -0
data/test/unit/store/tc_ram_store.rb +35 -0
data/test/unit/store/tm_store.rb +34 -0
data/test/unit/store/tm_store_lock.rb +68 -0
data/test/unit/tc_document.rb +81 -0
data/test/unit/tc_field_symbol.rb +26 -0
data/test/unit/ts_analysis.rb +2 -0
data/test/unit/ts_index.rb +2 -0
data/test/unit/ts_largefile.rb +4 -0
data/test/unit/ts_query_parser.rb +2 -0
data/test/unit/ts_search.rb +2 -0
data/test/unit/ts_store.rb +2 -0
data/test/unit/ts_utils.rb +2 -0
data/test/unit/utils/tc_bit_vector.rb +295 -0
data/test/unit/utils/tc_number_tools.rb +117 -0
data/test/unit/utils/tc_priority_queue.rb +106 -0
data/test/utils/content_generator.rb +226 -0
metadata +319 -0

data/TODO ADDED Viewed

@@ -0,0 +1,109 @@
+TODO
+====
+* C
+  - IMPORTANT:
+    + FIX file descriptor overflow. See Tickets #341 and #343
+  - add .. operator to query parser. For example, [100 200] could be written as
+    100..200 or 100...201 like in Ruby Ranges
+  - remove exception handling from C code. All errors to be handled by return
+    values.
+  - Move to sqlite's locking model. Ferret should work fine in a multi-process
+    environment.
+  - Add optional logging. To be enabled at compilation time, perhaps?
+  - Add support for changing zlib and bzlib compression parameters
+  - Improve unit test coverage to 100%
+  - Add benchmark suite
+  - Add Rakefile for development purposes
+    + task to publish gcov and benchmark results to ferret wiki
+  - Index rebuilding of old versioned indexes.
+  - Add a globally accessible, threadsafe symbol table. This will be very
+    useful for storing field names so that no objects need to strdup the
+    field-names but can just store the symbol representative instead.
+    + this has been done but it can be improved using actual Symbol structs
+      instead of plain char*
+  - Make threading optional at compile time
+  - to_json should limit output to prevent memory overflow on large indexes.
+    Perhaps we could use some type of buffered read for this.
+  - Make BitVector run as fast as bitset from C++ STL. See;
+      c/benchmark/bm_bitvector.c
+  - Add a symbol table for field names. This will mean that we won't need to
+    worry about mallocing and freeing field names which happens all over the
+    place.
+  - Divide the headers into public and private (the private headers to be
+    stored in the src directory).
+  - Group-by search. ie you should be able to pass a field to group search
+    results by
+  - Auto-loading of documents during search. ie actual documents get returned
+    instead of document numbers.
+* Ruby bindings
+  - argument checking for every method. We need a new api for argument checking
+    so that the arguments get checked at the start of each method that could
+    cause a segfault.
+  - improve memory management. It is way too complex at the moment. I also need
+    to document how it works so that other developers understand what is going
+    on.
+  - Replace Data_Wrap_Struct with ferret alternative which handles rewrapping
+    of structs automatically and also knows when to release a struct by using
+    refcounting.
+* Ruby
+  - integrate rcov
+  - improve unit test coverage to 100%
+* Documentation.
+  - generate Ruby binding documentation with custom build template similar
+    to jaxdoc http://rubyforge.org/projects/jaxdoc
+  - all documentation should meet DOCUMENTATION_STANDARDS
+  - documentation in C code to be generated by doxygen
+Someday Maybe
+=============
+* apply for Google Summer of Code 2009
+* optimize read and write vint
+  - test the following outside of ferret before implementing
+  - perform a binary scan using bit-wise or to find out how many bytes need
+    to be written
+  - if the write/read will overflow the buffer, split it into two, refreshing
+    the buffer in between
+  - use Duff's device to write bytes now that we know how many we need
+* add a super fast language based dictionary compression
+* add portable stacktrace function. Perhaps implement as an external library.
+  - See http://www.nongnu.org/libunwind/
+  - See http://www.tlug.org.za/wiki/index.php/Obtaining_a_stack_trace_in_C_upon_SIGSEGV
+* investigate unscored searching
+* user defined sorting
+* Fix highlighting to work for external fields
+* investigate faster string hashing method
+Done
+====
+* add rake install task
+* FIX :create parameter so that it only deletes the files owned by Ferret.
+* fix compression. Currently nothing is happening if you set a field to
+  :compress. I guess we'll just assume zlib is installed, as I think it has to
+  be for Ruby to be installed.
+* add bzlib support
+* integrate gcov
+* add a field cache to IndexReader
+* setup email alerts for svn commits
+* Ranged, unordered searching. Ie search through the index until you have the
+  required number of documents and then break. This will require the ability to
+  start searches from a particular doc-num.
+  + See searcher_search_unordered in the C code and Searcher#scan in Ruby
+* improve unit test code. I'd like to implement some way to print out a stack
+  trace when a test fails so that it is easy to find the source of the error.
+* catch segfaults and print stack trace so users can post helpful bug tickets.
+  again, see the same links for adding stacktrace to unit tests.
+* Add string Sort descriptor
+* fix memory bug
+* add MultiReader interface
+* add lexicographical sort (byte sort)
+* Add highlighting
+* add field compression
+* Fix highlighting to work for compressed fields
+* Add Ferret::Index::Index
+* Fix:
+  + Working Query:  field1:value1 AND NOT field2:value2
+  + Failing Query:    field1:value1 AND ( NOT field2:value2 )
+* update benchmark suite to use getrusage

data/TUTORIAL ADDED Viewed

@@ -0,0 +1,231 @@
+= Quick Introduction to Ferret
+The simplest way to use Ferret is through the Ferret::Index::Index class.
+This is now aliased by Ferret::I for quick and easy access. Start by including
+the Ferret module.
+  require 'ferret'
+  include Ferret
+=== Creating an index
+To create an in memory index is very simple;
+  index = Index::Index.new()
+To create a persistent index;
+  index = Index::Index.new(:path => '/path/to/index')
+Both of these methods create new Indexes with the StandardAnalyzer. An
+analyzer is what you use to divide the input data up into tokens which you can
+search for later. If you'd like to use a different analyzer you can specify it
+here, eg;
+  index = Index::Index.new(:path => '/path/to/index',
+                           :analyzer => Analysis::WhiteSpaceAnalyzer.new)
+For more options when creating an Index refer to Ferret::Index::Index.
+=== Adding Documents
+To add a document you can simply add a string or an array of strings. This will
+store all the strings in the "" (ie empty string) field (unless you specify the
+default field when you create the index).
+  index << "This is a new document to be indexed"
+  index << ["And here", "is another", "new document", "to be indexed"]
+But these are pretty simple documents. If this is all you want to index you
+could probably just use SimpleSearch. So let's give our documents some fields;
+  index << {:title => "Programming Ruby", :content => "blah blah blah"}
+  index << {:title => "Programming Ruby", :content => "yada yada yada"}
+Note the way that all field-names are Symbols. Although Strings will work,
+this is a best-practice in Ferret. Or if you are indexing data stored in a
+database, you'll probably want to store the id;
+  index << {:id => row.id, :title => row.title, :date => row.date}
+So far we have been storing and tokenizing all of the input data along with
+term vectors. If we want to change this we need to change the way we setup the
+index. You must create a FieldInfos object describing the index:
+  field_infos = FieldInfos.new(:store => :no,
+                               :index => :untokenized_omit_norms,
+                               :term_vector => :no)
+The values that you set FieldInfos to have will be used by default by all
+fields. If you want to change the properties for specific fields, you need to
+add a FieldInfo to field_infos.
+  field_infos.add_field(:title, :store => :yes, :index => :yes, :boost => 10.0)
+  field_infos.add_field(:content, :store => :yes,
+                                  :index => :yes,
+                                  :term_vector => :with_positions_offsets)
+If you need to add a field to an already open index you do so like this:
+  index.field_infos.add_field(:new_field, :store => :yes)
+=== Searching
+Now that we have data in our index, how do we actually use this index to
+search the data? The Index offers two search methods, Index#search and
+Index#search_each. The first method returns a Ferret::Index::TopDocs object.
+The second we'll show here. Let's say we wanted to find all documents with the
+phrase "quick brown fox" in the content field. We'd write;
+  index.search_each('content:"quick brown fox"') do |id, score|
+    puts "Document #{id} found with a score of #{score}"
+  end
+But "fast" has a pretty similar meaning to "quick" and we don't mind if the
+fox is a little red. Also, the phrase could be in the title so we'll search
+there as well. So we could expand our search like this;
+  index.search_each('title|content:"quick|fast brown|red fox"') do |id, score|
+    puts "Document #{id} found with a score of #{score}"
+  end
+What if we want to find all documents entered on or after 5th of September,
+2005 with the words "ruby" or "rails" in any field? We could type something like;
+  index.search_each('date:( >= 20050905) *:(ruby OR rails)') do |id, score|
+    puts "Document #{index[id][:title]} found with a score of #{score}"
+  end
+Ferret has quite a complex query language. To find out more about Ferret's
+query language, see Ferret::QueryParser. You can also construct even more
+complex queries like Ferret::Search::Spans by hand. See Ferret::Search::Query
+for more information.
+=== Highlighting
+Ferret now has a super-fast highlighting method. See
+Ferret::Index::Index#highlight. Here is an example of how you would use it
+when printing to the console:
+  index.search_each('date:( >= 20050905) content:(ruby OR rails)') do |id, score|
+    puts "Document #{index[id][:title]} found with a score of #{score}"
+    highlights = index.highlight("content:(ruby OR rails)", 0,
+                                 :field => :content,
+                                 :pre_tag => "\033[36m",
+                                 :post_tag => "\033[m")
+    puts highlights
+  end
+And if you want to highlight a whole document, set :excerpt_length to :all:
+  puts index.highlight(query, doc_id,
+                       :field => :content,
+                       :pre_tag => "\033[36m",
+                       :post_tag => "\033[m",
+                       :excerpt_length => :all)
+=== Accessing Documents
+You may have noticed that when we run a search we only get the document id
+back. By itself this isn't much use to us. Getting the data from the index is
+very straightforward. For example if we want the :title field from the 3rd
+document, type;
+  index[2][:title]
+Documents are lazy loading so if you try this:
+  puts index[2]
+You will always get an empty Hash. To load all fields, call the load method:
+  puts index[2].load
+NOTE: documents are indexed from 0. You can also use array-like index
+parameters to access index. For example
+  index[1..4]
+  index[10, 10]
+  index[-5]
+The default field is :id (although you can change this with index's
+:default_create_field parameter);
+  index << "This is a document"
+  index[0][:id]
+Let's go back to the database example above. If we store all of our documents
+with an id then we can access that field using the id. As long as we called
+our id field :id we can do this
+  index["89721347"]["title"]
+Pretty simple huh? You should note though that if there is more than one
+document with the same *id* or *key* then only the first one will be returned
+so it is probably better that you ensure the key is unique somehow. By setting
+Index's :key attribute to :id, Ferret will do this automatically for you. It
+can even handle multiple field primary keys. For example, you could set
+:key to [:id, :model] and Ferret would keep the documents unique for that pair
+of fields.
+=== Modifying and Deleting Documents
+What if we want to change the data in the index? Ferret doesn't actually let
+you change the data once it is in the index. But you can delete documents so
+the standard way to modify data is to delete it and re-add it again with the
+modifications made. It is important to note that when doing this the documents
+will get a new document number so you should be careful not to use a document
+number after the document has been deleted. Here is an example of modifying a
+document;
+  index << {:title => "Programing Rbuy", :content => "blah blah blah"}
+  doc_id = nil
+  index.search_each('title:"Programing Rbuy"') {|id, score| doc_id = id}
+  return unless doc_id
+  doc = index[doc_id]
+  index.delete(doc_id)
+  # modify doc. It is just a Hash after all
+  doc[:title] = "Programming Ruby"
+  index << doc
+If you set the :key parameter as described in the last section there is no
+need to delete the document. It will be automatically deleted when you add
+another document with the same key.
+Also, we can use the id field, as above, to delete documents. This time though
+every document that matches the id will be deleted. Again, it is probably a
+good idea if you somehow ensure that your *ids* are kept unique.
+  id = "23453422"
+  index.delete(id)
+=== Onwards
+This is just a small sampling of what Ferret allows you to do.  Ferret, like
+Lucene, is designed to be extended, and allows you to construct your own query
+types, analyzers, and so on. Going onwards you should check out the following
+documentation:
+* Ferret::Analysis: for more information on how the data is processed when it
+  is tokenized. There are a number of things you can do with your data such as
+  adding stop lists or perhaps a porter stemmer. There are also a number of
+  analyzers already available and it is almost trivial to create a new one
+  with a simple regular expression.
+* Ferret::Search: for more information on querying the index. There are a
+  number of already available queries and it's unlikely you'll need to create
+  your own. You may however want to take advantage of the sorting or filtering
+  abilities of Ferret to present your data the best way you see fit.
+* Ferret::QueryParser: if you want to find out more about what you can do with
+  Ferret's Query Parser, this is the place to look. The query parser is one
+  area that could use a bit of work so please send your suggestions.
+* Ferret::Index: for more advanced access to the index you'll probably want to
+  use the Ferret::Index::IndexWriter and Ferret::Index::IndexReader. This is
+  the place to look for more information on them.
+* Ferret::Store: This is the module used to access the actual index storage
+  and won't be of much interest to most people.

data/bin/ferret-browser ADDED Viewed

@@ -0,0 +1,79 @@
+#!/usr/bin/env ruby
+$: << File.expand_path(File.join(File.basename(__FILE__), '../lib'))
+require 'ferret'
+require 'ferret/browser'
+require 'optparse'
+require 'ostruct'
+SERVER_OPTIONS = ['webrick']
+conf = OpenStruct.new(:host => '0.0.0.0', :port => 3301)
+opts = OptionParser.new do |opts|
+  opts.banner = "Usage: #{File.basename($0)} /path/to/index"
+  opts.separator ""
+  opts.separator "Specific Options:"
+  opts.on("-h", "--host HOSTNAME",
+          "Host for web server to bind to (default is all IPs)") { |conf.host| }
+  opts.on("-p", "--port NUM",
+          "Port for web server (defaults to #{conf.port})") { |conf.port| }
+  opts.on("-s", "--server NAME",
+          "Server to force (#{SERVER_OPTIONS.join(', ')}).") { |s| conf.server = s.to_sym }
+  opts.separator ""
+  opts.separator "Common options:"
+  opts.on_tail("-?", "--help", "Show this message") do
+    puts opts
+    exit
+  end
+  opts.on_tail("-v", "--version", "Show version") do
+    puts Ferret::VERSION
+    exit
+  end
+end
+opts.parse! ARGV
+if ARGV.length != 1
+  puts opts
+  exit
+end
+@path = ARGV[0]
+# Load the Ferret index
+begin
+  @reader = Ferret::Index::IndexReader.new(@path)
+rescue Ferret::FileNotFoundError => e
+  puts "\033[31mCannot start Ferret. No index exists at \"\033[m" +
+    "\033[33m#{@path}\033[m\033[31m\".\033[m"
+  exit
+rescue Exception => e
+  puts "\033[31mCannot start Ferret.\n\033[m\033[33m#{e.to_s}\031[m"
+  exit
+end
+unless conf.server
+  conf.server = :webrick
+end
+case conf.server.to_s
+when 'webrick'
+  require 'webrick/httpserver'
+  require 'ferret/browser/webrick'
+  # Mount the root
+  s = WEBrick::HTTPServer.new(:BindAddress => conf.host, :Port => conf.port)
+  s.mount "/s", WEBrick::HTTPServlet::FileHandler, Ferret::Browser::Controller::STATIC_DIR, true
+  s.mount "/", WEBrick::FerretBrowserHandler, @reader, @path
+  # Server up
+  trap(:INT) do
+    s.shutdown
+  end
+  s.start
+else
+  raise "server #{conf.server} not known. Must be one of [#{SERVER_OPTIONS.join(', ')}]"
+end

data/ext/BZLIB_blocksort.c ADDED Viewed

@@ -0,0 +1,1094 @@
+/*-------------------------------------------------------------*/
+/*--- Block sorting machinery                               ---*/
+/*---                                           blocksort.c ---*/
+/*-------------------------------------------------------------*/
+/* ------------------------------------------------------------------
+   This file is part of bzip2/libbzip2, a program and library for
+   lossless, block-sorting data compression.
+   bzip2/libbzip2 version 1.0.4 of 20 December 2006
+   Copyright (C) 1996-2006 Julian Seward <jseward@bzip.org>
+   Please read the WARNING, DISCLAIMER and PATENTS sections in the
+   README file.
+   This program is released under the terms of the license contained
+   in the file LICENSE.
+   ------------------------------------------------------------------ */
+#include "bzlib_private.h"
+/*---------------------------------------------*/
+/*--- Fallback O(N log(N)^2) sorting        ---*/
+/*--- algorithm, for repetitive blocks      ---*/
+/*---------------------------------------------*/
+/*---------------------------------------------*/
+/*
+   Insertion-sort fmap[lo .. hi] (both ends inclusive) in place, ordering
+   the entries by eclass[fmap[i]].  When the range holds more than four
+   entries a stride-4 pass runs first, followed by the ordinary stride-1
+   pass.  A single-element range returns immediately.
+*/
+static
+__inline__
+void fallbackSimpleSort ( UInt32* fmap,
+                          UInt32* eclass,
+                          Int32   lo,
+                          Int32   hi )
+{
+   Int32  pos, slot, held;
+   UInt32 heldKey;
+   if (lo == hi) return;
+   /*-- coarse pass: compare entries four apart --*/
+   if (hi - lo > 3) {
+      pos = hi - 4;
+      while (pos >= lo) {
+         held    = fmap[pos];
+         heldKey = eclass[held];
+         slot    = pos + 4;
+         while (slot <= hi && heldKey > eclass[fmap[slot]]) {
+            fmap[slot-4] = fmap[slot];
+            slot += 4;
+         }
+         fmap[slot-4] = held;
+         pos--;
+      }
+   }
+   /*-- fine pass: plain insertion sort, walking down from hi-1 --*/
+   pos = hi - 1;
+   while (pos >= lo) {
+      held    = fmap[pos];
+      heldKey = eclass[held];
+      slot    = pos + 1;
+      while (slot <= hi && heldKey > eclass[fmap[slot]]) {
+         fmap[slot-1] = fmap[slot];
+         slot++;
+      }
+      fmap[slot-1] = held;
+      pos--;
+   }
+}
+/*---------------------------------------------*/
+#define fswap(zz1, zz2) \
+   { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; }
+#define fvswap(zzp1, zzp2, zzn)       \
+{                                     \
+   Int32 yyp1 = (zzp1);               \
+   Int32 yyp2 = (zzp2);               \
+   Int32 yyn  = (zzn);                \
+   while (yyn > 0) {                  \
+      fswap(fmap[yyp1], fmap[yyp2]);  \
+      yyp1++; yyp2++; yyn--;          \
+   }                                  \
+}
+#define fmin(a,b) ((a) < (b)) ? (a) : (b)
+#define fpush(lz,hz) { stackLo[sp] = lz; \
+                       stackHi[sp] = hz; \
+                       sp++; }
+#define fpop(lz,hz) { sp--;              \
+                      lz = stackLo[sp];  \
+                      hz = stackHi[sp]; }
+#define FALLBACK_QSORT_SMALL_THRESH 10
+#define FALLBACK_QSORT_STACK_SIZE   100
+/*
+   Sort fmap[loSt .. hiSt] (both ends inclusive) by eclass[fmap[i]] with a
+   non-recursive quicksort that partitions three ways (less than, equal to,
+   greater than the pivot).  Pending ranges are kept on an explicit stack;
+   ranges with hi - lo below FALLBACK_QSORT_SMALL_THRESH are handed to
+   fallbackSimpleSort.
+   The fswap / fvswap / fpush / fpop macros used here refer to the names
+   fmap, stackLo, stackHi and sp, so those names must not be changed.
+*/
+static
+void fallbackQSort3 ( UInt32* fmap,
+                      UInt32* eclass,
+                      Int32   loSt,
+                      Int32   hiSt )
+{
+   Int32 scanLo, scanHi, eqLo, eqHi;
+   Int32 diff, leftCnt, rightCnt, leftEnd, rightBeg;
+   Int32 sp, lo, hi;
+   UInt32 pivot, seed;
+   Int32 stackLo[FALLBACK_QSORT_STACK_SIZE];
+   Int32 stackHi[FALLBACK_QSORT_STACK_SIZE];
+   seed = 0;
+   sp = 0;
+   fpush ( loSt, hiSt );
+   while (sp > 0) {
+      AssertH ( sp < FALLBACK_QSORT_STACK_SIZE - 1, 1004 );
+      fpop ( lo, hi );
+      if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) {
+         fallbackSimpleSort ( fmap, eclass, lo, hi );
+         continue;
+      }
+      /* Random partitioning.  Median of 3 sometimes fails to
+         avoid bad cases.  Median of 9 seems to help but
+         looks rather expensive.  This too seems to work but
+         is cheaper.  Guidance for the magic constants
+         7621 and 32768 is taken from Sedgewick's algorithms
+         book, chapter 35.
+      */
+      seed = ((seed * 7621) + 1) % 32768;
+      switch (seed % 3) {
+         case 0:  pivot = eclass[fmap[lo]];         break;
+         case 1:  pivot = eclass[fmap[(lo+hi)>>1]]; break;
+         default: pivot = eclass[fmap[hi]];         break;
+      }
+      scanLo = eqLo = lo;
+      scanHi = eqHi = hi;
+      for (;;) {
+         /*-- advance from the left; pivot-equal keys are parked at eqLo --*/
+         while (scanLo <= scanHi) {
+            diff = (Int32)eclass[fmap[scanLo]] - (Int32)pivot;
+            if (diff > 0) break;
+            if (diff == 0) {
+               fswap(fmap[scanLo], fmap[eqLo]);
+               eqLo++;
+            }
+            scanLo++;
+         }
+         /*-- advance from the right; pivot-equal keys are parked at eqHi --*/
+         while (scanLo <= scanHi) {
+            diff = (Int32)eclass[fmap[scanHi]] - (Int32)pivot;
+            if (diff < 0) break;
+            if (diff == 0) {
+               fswap(fmap[scanHi], fmap[eqHi]);
+               eqHi--;
+            }
+            scanHi--;
+         }
+         if (scanLo > scanHi) break;
+         fswap(fmap[scanLo], fmap[scanHi]);
+         scanLo++; scanHi--;
+      }
+      AssertD ( scanHi == scanLo-1, "fallbackQSort3(2)" );
+      /*-- nothing left to sort in this range --*/
+      if (eqHi < eqLo) continue;
+      /*-- move the parked pivot-equal runs from both ends to the middle --*/
+      leftCnt = fmin(eqLo-lo, scanLo-eqLo);
+      fvswap(lo, scanLo-leftCnt, leftCnt);
+      rightCnt = fmin(hi-eqHi, eqHi-scanHi);
+      fvswap(scanLo, hi-rightCnt+1, rightCnt);
+      leftEnd  = lo + scanLo - eqLo - 1;
+      rightBeg = hi - (eqHi - scanHi) + 1;
+      /*-- push the larger side first so the smaller side is popped next --*/
+      if (leftEnd - lo > hi - rightBeg) {
+         fpush ( lo, leftEnd );
+         fpush ( rightBeg, hi );
+      } else {
+         fpush ( rightBeg, hi );
+         fpush ( lo, leftEnd );
+      }
+   }
+}
+#undef fmin
+#undef fpush
+#undef fpop
+#undef fswap
+#undef fvswap
+#undef FALLBACK_QSORT_SMALL_THRESH
+#undef FALLBACK_QSORT_STACK_SIZE
+/*---------------------------------------------*/
+/* Pre:
+      nblock > 0
+      eclass exists for [0 .. nblock-1]
+      ((UChar*)eclass) [0 .. nblock-1] holds block
+      ptr exists for [0 .. nblock-1]
+   Post:
+      ((UChar*)eclass) [0 .. nblock-1] holds block
+      All other areas of eclass destroyed
+      fmap [0 .. nblock-1] holds sorted order
+      bhtab [ 0 .. 2+(nblock/32) ] destroyed
+*/
+/*-- Bit-vector helpers over the array named bhtab: bit zz is stored in word
+     zz >> 5 at position zz & 31.  The mask is built from an unsigned one so
+     that selecting position 31 never shifts a 1 into the sign bit of a
+     signed int (which would be undefined behaviour). --*/
+#define       SET_BH(zz)  bhtab[(zz) >> 5] |= ((UInt32)1 << ((zz) & 31))
+#define     CLEAR_BH(zz)  bhtab[(zz) >> 5] &= ~((UInt32)1 << ((zz) & 31))
+#define     ISSET_BH(zz)  (bhtab[(zz) >> 5] & ((UInt32)1 << ((zz) & 31)))
+/*-- the whole word containing bit zz --*/
+#define      WORD_BH(zz)  bhtab[(zz) >> 5]
+/*-- non-zero when zz is not the first bit of its word --*/
+#define UNALIGNED_BH(zz)  ((zz) & 0x01f)
+/*--
+   Fallback sorter.  Orders the strings of the block held in
+   ((UChar*)eclass)[0 .. nblock-1], treating the block as cyclic
+   (offsets wrap at nblock), and leaves the sorted order in fmap.
+   bhtab is used as a bit array in which a set bit at position i
+   marks fmap[i] as the first entry of a bucket.  Progress messages
+   are printed when verb >= 4.
+--*/
+static
+void fallbackSort ( UInt32* fmap,
+                    UInt32* eclass,
+                    UInt32* bhtab,
+                    Int32   nblock,
+                    Int32   verb )
+{
+   Int32 ftab[257];
+   Int32 ftabCopy[256];
+   Int32 H, i, j, k, l, r, cc, cc1;
+   Int32 nNotDone;
+   Int32 nBhtab;
+   UChar* eclass8 = (UChar*)eclass;
+   /*--
+      Initial 1-char radix sort to generate
+      initial fmap and initial BH bits.
+   --*/
+   if (verb >= 4)
+      VPrintf0 ( "        bucket sorting ...\n" );
+   /*-- ftab: per-byte counts, then running totals; ftabCopy keeps the raw counts --*/
+   for (i = 0; i < 257;    i++) ftab[i] = 0;
+   for (i = 0; i < nblock; i++) ftab[eclass8[i]]++;
+   for (i = 0; i < 256;    i++) ftabCopy[i] = ftab[i];
+   for (i = 1; i < 257;    i++) ftab[i] += ftab[i-1];
+   /*-- drop each position into its byte bucket; afterwards ftab[c] is the start of bucket c --*/
+   for (i = 0; i < nblock; i++) {
+      j = eclass8[i];
+      k = ftab[j] - 1;
+      ftab[j] = k;
+      fmap[k] = i;
+   }
+   /*-- clear all header bits, then mark the start of every byte bucket --*/
+   nBhtab = 2 + (nblock / 32);
+   for (i = 0; i < nBhtab; i++) bhtab[i] = 0;
+   for (i = 0; i < 256; i++) SET_BH(ftab[i]);
+   /*--
+      Inductively refine the buckets.  Kind-of an
+      "exponential radix sort" (!), inspired by the
+      Manber-Myers suffix array construction algorithm.
+   --*/
+   /*-- set sentinel bits for block-end detection --*/
+   for (i = 0; i < 32; i++) {
+      SET_BH(nblock + 2*i);
+      CLEAR_BH(nblock + 2*i + 1);
+   }
+   /*-- the log(N) loop --*/
+   H = 1;
+   while (1) {
+      if (verb >= 4)
+         VPrintf1 ( "        depth %6d has ", H );
+      /*-- eclass[k] = start index of the bucket holding the string H places after k (wrapping) --*/
+      j = 0;
+      for (i = 0; i < nblock; i++) {
+         if (ISSET_BH(i)) j = i;
+         k = fmap[i] - H; if (k < 0) k += nblock;
+         eclass[k] = j;
+      }
+      nNotDone = 0;
+      r = -1;
+      while (1) {
+	 /*-- find the next non-singleton bucket --*/
+         /*-- skip a run of set bits: bit by bit to a word boundary, then word by word --*/
+         k = r + 1;
+         while (ISSET_BH(k) && UNALIGNED_BH(k)) k++;
+         if (ISSET_BH(k)) {
+            while (WORD_BH(k) == 0xffffffff) k += 32;
+            while (ISSET_BH(k)) k++;
+         }
+         l = k - 1;
+         if (l >= nblock) break;
+         /*-- skip the following run of clear bits the same way --*/
+         while (!ISSET_BH(k) && UNALIGNED_BH(k)) k++;
+         if (!ISSET_BH(k)) {
+            while (WORD_BH(k) == 0x00000000) k += 32;
+            while (!ISSET_BH(k)) k++;
+         }
+         r = k - 1;
+         if (r >= nblock) break;
+         /*-- now [l, r] bracket current bucket --*/
+         if (r > l) {
+            nNotDone += (r - l + 1);
+            fallbackQSort3 ( fmap, eclass, l, r );
+            /*-- scan bucket and generate header bits-- */
+            cc = -1;
+            for (i = l; i <= r; i++) {
+               cc1 = eclass[fmap[i]];
+               if (cc != cc1) { SET_BH(i); cc = cc1; };
+            }
+         }
+      }
+      if (verb >= 4)
+         VPrintf1 ( "%6d unresolved strings\n", nNotDone );
+      /*-- double the depth; stop once no multi-entry bucket was seen or H exceeds nblock --*/
+      H *= 2;
+      if (H > nblock || nNotDone == 0) break;
+   }
+   /*--
+      Reconstruct the original block in
+      eclass8 [0 .. nblock-1], since the
+      previous phase destroyed it.
+   --*/
+   if (verb >= 4)
+      VPrintf0 ( "        reconstructing block ...\n" );
+   /*-- ftabCopy[c] is the count of byte c; walk fmap in sorted order handing out byte values --*/
+   j = 0;
+   for (i = 0; i < nblock; i++) {
+      while (ftabCopy[j] == 0) j++;
+      ftabCopy[j]--;
+      eclass8[fmap[i]] = (UChar)j;
+   }
+   AssertH ( j < 256, 1005 );
+}
+#undef       SET_BH
+#undef     CLEAR_BH
+#undef     ISSET_BH
+#undef      WORD_BH
+#undef UNALIGNED_BH
+/*---------------------------------------------*/
+/*--- The main, O(N^2 log(N)) sorting       ---*/
+/*--- algorithm.  Faster for "normal"       ---*/
+/*--- non-repetitive blocks.                ---*/
+/*---------------------------------------------*/
+/*---------------------------------------------*/
+/*--
+   Returns True iff the string starting at block[i1] sorts after the
+   string starting at block[i2].  The first 12 positions are compared
+   on block bytes alone; after that each position is compared on the
+   block byte and then on its quadrant value, in groups of 8.  Both
+   indices wrap at nblock after each group, and *budget is decremented
+   once per group.  Returns False if no difference is found.
+--*/
+static
+__inline__
+Bool mainGtU ( UInt32  i1,
+               UInt32  i2,
+               UChar*  block,
+               UInt16* quadrant,
+               UInt32  nblock,
+               Int32*  budget )
+{
+   Int32  remaining;
+   Int32  step;
+   UChar  b1, b2;
+   UInt16 q1, q2;
+   AssertD ( i1 != i2, "mainGtU" );
+   /*-- leading 12 positions: bytes only --*/
+   for (step = 0; step < 12; step++) {
+      b1 = block[i1]; b2 = block[i2];
+      if (b1 != b2) return (b1 > b2);
+      i1++; i2++;
+   }
+   remaining = nblock + 8;
+   do {
+      /*-- one group of 8 positions: byte first, then quadrant --*/
+      for (step = 0; step < 8; step++) {
+         b1 = block[i1]; b2 = block[i2];
+         if (b1 != b2) return (b1 > b2);
+         q1 = quadrant[i1]; q2 = quadrant[i2];
+         if (q1 != q2) return (q1 > q2);
+         i1++; i2++;
+      }
+      if (i1 >= nblock) i1 -= nblock;
+      if (i2 >= nblock) i2 -= nblock;
+      remaining -= 8;
+      (*budget)--;
+   } while (remaining >= 0);
+   return False;
+}
+/*---------------------------------------------*/
+/*--
+   Gap sequence for the shell sort in mainSimpleSort.
+   Knuth's increments seem to work better
+   than Incerpi-Sedgewick here.  Possibly
+   because the number of elems to sort is
+   usually small, typically <= 20.
+   The table is only ever read, so it is declared const.
+--*/
+static
+const Int32 incs[14] = { 1, 4, 13, 40, 121, 364, 1093, 3280,
+                         9841, 29524, 88573, 265720,
+                         797161, 2391484 };
+/*--
+   Shell-sorts ptr[lo .. hi] using the gaps in incs[], comparing
+   entries with mainGtU at offset d.  Returns immediately for ranges
+   of fewer than two entries.  *budget is checked after every third
+   insertion; the sort is abandoned as soon as it is negative.
+--*/
+static
+void mainSimpleSort ( UInt32* ptr,
+                      UChar*  block,
+                      UInt16* quadrant,
+                      Int32   nblock,
+                      Int32   lo,
+                      Int32   hi,
+                      Int32   d,
+                      Int32*  budget )
+{
+   Int32  count, gapIx, gap, pos, slot, rep;
+   UInt32 moving;
+   count = hi - lo + 1;
+   if (count < 2) return;
+   /*-- start from the largest gap that is smaller than the range --*/
+   gapIx = 0;
+   while (incs[gapIx] < count) gapIx++;
+   for (gapIx--; gapIx >= 0; gapIx--) {
+      gap = incs[gapIx];
+      pos = lo + gap;
+      while (True) {
+         /*-- up to three gapped insertions between budget checks --*/
+         for (rep = 0; rep < 3 && pos <= hi; rep++) {
+            moving = ptr[pos];
+            slot   = pos;
+            while ( mainGtU (
+                       ptr[slot-gap]+d, moving+d,
+                       block, quadrant, nblock, budget
+                    ) ) {
+               ptr[slot] = ptr[slot-gap];
+               slot -= gap;
+               if (slot <= (lo + gap - 1)) break;
+            }
+            ptr[slot] = moving;
+            pos++;
+         }
+         /*-- fewer than three done means pos ran past hi: this gap is finished --*/
+         if (rep < 3) break;
+         if (*budget < 0) return;
+      }
+   }
+}
+/*---------------------------------------------*/
+/*--
+   The following is an implementation of
+   an elegant 3-way quicksort for strings,
+   described in a paper "Fast Algorithms for
+   Sorting and Searching Strings", by Robert
+   Sedgewick and Jon L. Bentley.
+--*/
+/*-- mswap: exchange two lvalues through an Int32 temporary --*/
+#define mswap(zz1, zz2) \
+   { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; }
+/*-- mvswap: exchange the zzn entries of ptr starting at index zzp1
+     with the zzn entries starting at index zzp2; the arguments are
+     copied into locals first so each is evaluated once --*/
+#define mvswap(zzp1, zzp2, zzn)       \
+{                                     \
+   Int32 yyp1 = (zzp1);               \
+   Int32 yyp2 = (zzp2);               \
+   Int32 yyn  = (zzn);                \
+   while (yyn > 0) {                  \
+      mswap(ptr[yyp1], ptr[yyp2]);    \
+      yyp1++; yyp2++; yyn--;          \
+   }                                  \
+}
+/*-- Returns the median of the three byte values a, b and c. --*/
+static
+__inline__
+UChar mmed3 ( UChar a, UChar b, UChar c )
+{
+   UChar small = a;
+   UChar large = b;
+   if (small > large) { small = b; large = a; }
+   /*-- small <= large here; the median depends on where c falls --*/
+   if (large <= c) return large;
+   return (small > c) ? small : c;
+}
+/*-- mmin: smaller of two values.  The whole expansion is parenthesised
+     so the conditional cannot bind to neighbouring operators. --*/
+#define mmin(a,b) (((a) < (b)) ? (a) : (b))
+/*-- mpush / mpop: push or pop one (lo, hi, depth) triple on the
+     explicit stack kept in stackLo / stackHi / stackD, indexed by sp --*/
+#define mpush(lz,hz,dz) { stackLo[sp] = lz; \
+                          stackHi[sp] = hz; \
+                          stackD [sp] = dz; \
+                          sp++; }
+#define mpop(lz,hz,dz) { sp--;             \
+                         lz = stackLo[sp]; \
+                         hz = stackHi[sp]; \
+                         dz = stackD [sp]; }
+/*-- mnextsize: nextHi - nextLo for pending sub-range az;
+     mnextswap: exchange pending sub-ranges az and bz --*/
+#define mnextsize(az) (nextHi[az]-nextLo[az])
+#define mnextswap(az,bz)                                        \
+   { Int32 tz;                                                  \
+     tz = nextLo[az]; nextLo[az] = nextLo[bz]; nextLo[bz] = tz; \
+     tz = nextHi[az]; nextHi[az] = nextHi[bz]; nextHi[bz] = tz; \
+     tz = nextD [az]; nextD [az] = nextD [bz]; nextD [bz] = tz; }
+/*-- ranges with hi - lo below SMALL_THRESH, or a depth above
+     DEPTH_THRESH, are handed to mainSimpleSort --*/
+#define MAIN_QSORT_SMALL_THRESH 20
+#define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT)
+#define MAIN_QSORT_STACK_SIZE 100
+/*--
+   Sorts ptr[loSt .. hiSt] with a 3-way partitioning quicksort keyed
+   on the block byte at offset d from each pointer, starting at depth
+   dSt.  Pending sub-ranges are kept on an explicit stack rather than
+   recursing.  Small or deep sub-ranges are passed to mainSimpleSort.
+   Returns early, leaving the range partly sorted, once *budget < 0.
+--*/
+static
+void mainQSort3 ( UInt32* ptr,
+                  UChar*  block,
+                  UInt16* quadrant,
+                  Int32   nblock,
+                  Int32   loSt,
+                  Int32   hiSt,
+                  Int32   dSt,
+                  Int32*  budget )
+{
+   Int32 unLo, unHi, ltLo, gtHi, n, m, med;
+   Int32 sp, lo, hi, d;
+   Int32 stackLo[MAIN_QSORT_STACK_SIZE];
+   Int32 stackHi[MAIN_QSORT_STACK_SIZE];
+   Int32 stackD [MAIN_QSORT_STACK_SIZE];
+   Int32 nextLo[3];
+   Int32 nextHi[3];
+   Int32 nextD [3];
+   sp = 0;
+   mpush ( loSt, hiSt, dSt );
+   while (sp > 0) {
+      /*-- one pop and up to three pushes follow, so keep headroom on the stack --*/
+      AssertH ( sp < MAIN_QSORT_STACK_SIZE - 2, 1001 );
+      mpop ( lo, hi, d );
+      if (hi - lo < MAIN_QSORT_SMALL_THRESH ||
+          d > MAIN_QSORT_DEPTH_THRESH) {
+         mainSimpleSort ( ptr, block, quadrant, nblock, lo, hi, d, budget );
+         if (*budget < 0) return;
+         continue;
+      }
+      /*-- pivot: median of the bytes at depth d for the first, last and middle entries --*/
+      med = (Int32)
+            mmed3 ( block[ptr[ lo         ]+d],
+                    block[ptr[ hi         ]+d],
+                    block[ptr[ (lo+hi)>>1 ]+d] );
+      unLo = ltLo = lo;
+      unHi = gtHi = hi;
+      /*-- partition: entries equal to the pivot are parked at both ends of the range --*/
+      while (True) {
+         while (True) {
+            if (unLo > unHi) break;
+            n = ((Int32)block[ptr[unLo]+d]) - med;
+            if (n == 0) {
+               mswap(ptr[unLo], ptr[ltLo]);
+               ltLo++; unLo++; continue;
+            };
+            if (n >  0) break;
+            unLo++;
+         }
+         while (True) {
+            if (unLo > unHi) break;
+            n = ((Int32)block[ptr[unHi]+d]) - med;
+            if (n == 0) {
+               mswap(ptr[unHi], ptr[gtHi]);
+               gtHi--; unHi--; continue;
+            };
+            if (n <  0) break;
+            unHi--;
+         }
+         if (unLo > unHi) break;
+         mswap(ptr[unLo], ptr[unHi]); unLo++; unHi--;
+      }
+      AssertD ( unHi == unLo-1, "mainQSort3(2)" );
+      /*-- every entry matched the pivot: retry the same range one byte deeper --*/
+      if (gtHi < ltLo) {
+         mpush(lo, hi, d+1 );
+         continue;
+      }
+      /*-- move the parked equal entries from the ends into the middle --*/
+      n = mmin(ltLo-lo, unLo-ltLo); mvswap(lo, unLo-n, n);
+      m = mmin(hi-gtHi, gtHi-unHi); mvswap(unLo, hi-m+1, m);
+      n = lo + unLo - ltLo - 1;
+      m = hi - (gtHi - unHi) + 1;
+      /*-- sub-ranges: less-than [lo,n], greater-than [m,hi], equal [n+1,m-1] at depth d+1 --*/
+      nextLo[0] = lo;  nextHi[0] = n;   nextD[0] = d;
+      nextLo[1] = m;   nextHi[1] = hi;  nextD[1] = d;
+      nextLo[2] = n+1; nextHi[2] = m-1; nextD[2] = d+1;
+      /*-- order largest first, so the smallest is pushed last and popped first --*/
+      if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
+      if (mnextsize(1) < mnextsize(2)) mnextswap(1,2);
+      if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
+      AssertD (mnextsize(0) >= mnextsize(1), "mainQSort3(8)" );
+      AssertD (mnextsize(1) >= mnextsize(2), "mainQSort3(9)" );
+      mpush (nextLo[0], nextHi[0], nextD[0]);
+      mpush (nextLo[1], nextHi[1], nextD[1]);
+      mpush (nextLo[2], nextHi[2], nextD[2]);
+   }
+}
+#undef mswap
+#undef mvswap
+#undef mpush
+#undef mpop
+#undef mmin
+#undef mnextsize
+#undef mnextswap
+#undef MAIN_QSORT_SMALL_THRESH
+#undef MAIN_QSORT_DEPTH_THRESH
+#undef MAIN_QSORT_STACK_SIZE
+/*---------------------------------------------*/
+/* Pre:
+      nblock > N_OVERSHOOT
+      block32 exists for [0 .. nblock-1 +N_OVERSHOOT]
+      ((UChar*)block32) [0 .. nblock-1] holds block
+      ptr exists for [0 .. nblock-1]
+   Post:
+      ((UChar*)block32) [0 .. nblock-1] holds block
+      All other areas of block32 destroyed
+      ftab [0 .. 65536 ] destroyed
+      ptr [0 .. nblock-1] holds sorted order
+      if (*budget < 0), sorting was abandoned
+*/
+/*-- BIGFREQ(b): number of entries in big bucket b, taken from the
+     cumulative 2-byte table ftab.
+     SETMASK: flag bit stored in an ftab entry to mark its small
+     bucket as done; CLEARMASK strips the flag to recover the index. --*/
+#define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8])
+#define SETMASK (1 << 21)
+#define CLEARMASK (~(SETMASK))
+/*--
+   Main sorter.  Radix-sorts the block positions on their first two
+   bytes into ptr, then completes the big buckets one at a time,
+   smallest first, calling mainQSort3 on small buckets that are not
+   yet marked done.  Returns early when *budget < 0.
+--*/
+static
+void mainSort ( UInt32* ptr,
+                UChar*  block,
+                UInt16* quadrant,
+                UInt32* ftab,
+                Int32   nblock,
+                Int32   verb,
+                Int32*  budget )
+{
+   Int32  i, j, k, ss, sb;
+   Int32  runningOrder[256];
+   Bool   bigDone[256];
+   Int32  copyStart[256];
+   Int32  copyEnd  [256];
+   UChar  c1;
+   Int32  numQSorted;
+   UInt16 s;
+   if (verb >= 4) VPrintf0 ( "        main sort initialise ...\n" );
+   /*-- set up the 2-byte frequency table --*/
+   for (i = 65536; i >= 0; i--) ftab[i] = 0;
+   /*-- walk the block backwards, four positions per iteration; j holds the
+        2-byte key (this byte in the high half, the following byte in the low half) --*/
+   j = block[0] << 8;
+   i = nblock-1;
+   for (; i >= 3; i -= 4) {
+      quadrant[i] = 0;
+      j = (j >> 8) | ( ((UInt16)block[i]) << 8);
+      ftab[j]++;
+      quadrant[i-1] = 0;
+      j = (j >> 8) | ( ((UInt16)block[i-1]) << 8);
+      ftab[j]++;
+      quadrant[i-2] = 0;
+      j = (j >> 8) | ( ((UInt16)block[i-2]) << 8);
+      ftab[j]++;
+      quadrant[i-3] = 0;
+      j = (j >> 8) | ( ((UInt16)block[i-3]) << 8);
+      ftab[j]++;
+   }
+   for (; i >= 0; i--) {
+      quadrant[i] = 0;
+      j = (j >> 8) | ( ((UInt16)block[i]) << 8);
+      ftab[j]++;
+   }
+   /*-- (emphasises close relationship of block & quadrant) --*/
+   for (i = 0; i < BZ_N_OVERSHOOT; i++) {
+      block   [nblock+i] = block[i];
+      quadrant[nblock+i] = 0;
+   }
+   if (verb >= 4) VPrintf0 ( "        bucket sorting ...\n" );
+   /*-- Complete the initial radix sort --*/
+   for (i = 1; i <= 65536; i++) ftab[i] += ftab[i-1];
+   /*-- second backwards pass: place each position in its 2-byte bucket --*/
+   s = block[0] << 8;
+   i = nblock-1;
+   for (; i >= 3; i -= 4) {
+      s = (s >> 8) | (block[i] << 8);
+      j = ftab[s] -1;
+      ftab[s] = j;
+      ptr[j] = i;
+      s = (s >> 8) | (block[i-1] << 8);
+      j = ftab[s] -1;
+      ftab[s] = j;
+      ptr[j] = i-1;
+      s = (s >> 8) | (block[i-2] << 8);
+      j = ftab[s] -1;
+      ftab[s] = j;
+      ptr[j] = i-2;
+      s = (s >> 8) | (block[i-3] << 8);
+      j = ftab[s] -1;
+      ftab[s] = j;
+      ptr[j] = i-3;
+   }
+   for (; i >= 0; i--) {
+      s = (s >> 8) | (block[i] << 8);
+      j = ftab[s] -1;
+      ftab[s] = j;
+      ptr[j] = i;
+   }
+   /*--
+      Now ftab contains the first loc of every small bucket.
+      Calculate the running order, from smallest to largest
+      big bucket.
+   --*/
+   for (i = 0; i <= 255; i++) {
+      bigDone     [i] = False;
+      runningOrder[i] = i;
+   }
+   /*-- shell sort of runningOrder by big-bucket size, gaps h = ..., 13, 4, 1 --*/
+   {
+      Int32 vv;
+      Int32 h = 1;
+      do h = 3 * h + 1; while (h <= 256);
+      do {
+         h = h / 3;
+         for (i = h; i <= 255; i++) {
+            vv = runningOrder[i];
+            j = i;
+            while ( BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv) ) {
+               runningOrder[j] = runningOrder[j-h];
+               j = j - h;
+               if (j <= (h - 1)) goto zero;
+            }
+            zero:
+            runningOrder[j] = vv;
+         }
+      } while (h != 1);
+   }
+   /*--
+      The main sorting loop.
+   --*/
+   numQSorted = 0;
+   for (i = 0; i <= 255; i++) {
+      /*--
+         Process big buckets, starting with the least full.
+         Basically this is a 3-step process in which we call
+         mainQSort3 to sort the small buckets [ss, j], but
+         also make a big effort to avoid the calls if we can.
+      --*/
+      ss = runningOrder[i];
+      /*--
+         Step 1:
+         Complete the big bucket [ss] by quicksorting
+         any unsorted small buckets [ss, j], for j != ss.
+         Hopefully previous pointer-scanning phases have already
+         completed many of the small buckets [ss, j], so
+         we don't have to sort them at all.
+      --*/
+      for (j = 0; j <= 255; j++) {
+         if (j != ss) {
+            sb = (ss << 8) + j;
+            if ( ! (ftab[sb] & SETMASK) ) {
+               Int32 lo = ftab[sb]   & CLEARMASK;
+               Int32 hi = (ftab[sb+1] & CLEARMASK) - 1;
+               if (hi > lo) {
+                  if (verb >= 4)
+                     VPrintf4 ( "        qsort [0x%x, 0x%x]   "
+                                "done %d   this %d\n",
+                                ss, j, numQSorted, hi - lo + 1 );
+                  mainQSort3 (
+                     ptr, block, quadrant, nblock,
+                     lo, hi, BZ_N_RADIX, budget
+                  );
+                  numQSorted += (hi - lo + 1);
+                  if (*budget < 0) return;
+               }
+            }
+            ftab[sb] |= SETMASK;
+         }
+      }
+      AssertH ( !bigDone[ss], 1006 );
+      /*--
+         Step 2:
+         Now scan this big bucket [ss] so as to synthesise the
+         sorted order for small buckets [t, ss] for all t,
+         including, magically, the bucket [ss,ss] too.
+         This will avoid doing Real Work in subsequent Step 1's.
+      --*/
+      {
+         for (j = 0; j <= 255; j++) {
+            copyStart[j] =  ftab[(j << 8) + ss]     & CLEARMASK;
+            copyEnd  [j] = (ftab[(j << 8) + ss + 1] & CLEARMASK) - 1;
+         }
+         /*-- forward scan: fill each [c1, ss] bucket from its front --*/
+         for (j = ftab[ss << 8] & CLEARMASK; j < copyStart[ss]; j++) {
+            k = ptr[j]-1; if (k < 0) k += nblock;
+            c1 = block[k];
+            if (!bigDone[c1])
+               ptr[ copyStart[c1]++ ] = k;
+         }
+         /*-- backward scan: fill each [c1, ss] bucket from its back --*/
+         for (j = (ftab[(ss+1) << 8] & CLEARMASK) - 1; j > copyEnd[ss]; j--) {
+            k = ptr[j]-1; if (k < 0) k += nblock;
+            c1 = block[k];
+            if (!bigDone[c1])
+               ptr[ copyEnd[c1]-- ] = k;
+         }
+      }
+      AssertH ( (copyStart[ss]-1 == copyEnd[ss])
+                ||
+                /* Extremely rare case missing in bzip2-1.0.0 and 1.0.1.
+                   Necessity for this case is demonstrated by compressing
+                   a sequence of approximately 48.5 million of character
+                   251; 1.0.0/1.0.1 will then die here. */
+                (copyStart[ss] == 0 && copyEnd[ss] == nblock-1),
+                1007 )
+      for (j = 0; j <= 255; j++) ftab[(j << 8) + ss] |= SETMASK;
+      /*--
+         Step 3:
+         The [ss] big bucket is now done.  Record this fact,
+         and update the quadrant descriptors.  Remember to
+         update quadrants in the overshoot area too, if
+         necessary.  The "if (i < 255)" test merely skips
+         this updating for the last bucket processed, since
+         updating for the last bucket is pointless.
+         The quadrant array provides a way to incrementally
+         cache sort orderings, as they appear, so as to
+         make subsequent comparisons in mainGtU() complete
+         faster.  For repetitive blocks this makes a big
+         difference (but not big enough to be able to avoid
+         the fallback sorting mechanism, exponential radix sort).
+         The precise meaning is: at all times:
+            for 0 <= i < nblock and 0 <= j <= nblock
+            if block[i] != block[j],
+               then the relative values of quadrant[i] and
+                    quadrant[j] are meaningless.
+               else {
+                  if quadrant[i] < quadrant[j]
+                     then the string starting at i lexicographically
+                     precedes the string starting at j
+                  else if quadrant[i] > quadrant[j]
+                     then the string starting at j lexicographically
+                     precedes the string starting at i
+                  else
+                     the relative ordering of the strings starting
+                     at i and j has not yet been determined.
+               }
+      --*/
+      bigDone[ss] = True;
+      if (i < 255) {
+         Int32 bbStart  = ftab[ss << 8] & CLEARMASK;
+         Int32 bbSize   = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart;
+         Int32 shifts   = 0;
+         /*-- scale ranks down so that the largest fits in a UInt16 quadrant entry --*/
+         while ((bbSize >> shifts) > 65534) shifts++;
+         for (j = bbSize-1; j >= 0; j--) {
+            Int32 a2update     = ptr[bbStart + j];
+            UInt16 qVal        = (UInt16)(j >> shifts);
+            quadrant[a2update] = qVal;
+            if (a2update < BZ_N_OVERSHOOT)
+               quadrant[a2update + nblock] = qVal;
+         }
+         AssertH ( ((bbSize-1) >> shifts) <= 65535, 1002 );
+      }
+   }
+   if (verb >= 4)
+      VPrintf3 ( "        %d pointers, %d sorted, %d scanned\n",
+                 nblock, numQSorted, nblock - numQSorted );
+}
+#undef BIGFREQ
+#undef SETMASK
+#undef CLEARMASK
+/*---------------------------------------------*/
+/* Pre:
+      nblock > 0
+      arr2 exists for [0 .. nblock-1 +N_OVERSHOOT]
+      ((UChar*)arr2)  [0 .. nblock-1] holds block
+      arr1 exists for [0 .. nblock-1]
+   Post:
+      ((UChar*)arr2) [0 .. nblock-1] holds block
+      All other areas of block destroyed
+      ftab [ 0 .. 65536 ] destroyed
+      arr1 [0 .. nblock-1] holds sorted order
+*/
+/*--
+   Entry point for block sorting.  Blocks of fewer than 10000 bytes go
+   straight to fallbackSort.  Larger blocks are tried with mainSort
+   under a work budget derived from s->workFactor (clamped to 1..100);
+   if the budget goes negative, the block is re-sorted with
+   fallbackSort.  Finally s->origPtr is set to the index in ptr of
+   the entry whose value is 0.
+--*/
+void BZ2_blockSort ( EState* s )
+{
+   UInt32* ptr    = s->ptr;
+   UChar*  block  = s->block;
+   UInt32* ftab   = s->ftab;
+   Int32   nblock = s->nblock;
+   Int32   verb   = s->verbosity;
+   Int32   wfact  = s->workFactor;
+   UInt16* quadrant;
+   Int32   budget;
+   Int32   budgetInit;
+   Int32   pos;
+   if (nblock >= 10000) {
+      /* quadrant starts just past block plus its overshoot area,
+         rounded up to an even byte offset.  This relies on
+         &(block[0]) itself being at least 2-byte aligned.
+      */
+      pos = nblock+BZ_N_OVERSHOOT;
+      pos += (pos & 1);
+      quadrant = (UInt16*)(&(block[pos]));
+      /* clamp the work factor, then allow (wfact-1)/3 units of
+         work per byte of the block
+      */
+      if (wfact < 1) {
+         wfact = 1;
+      } else if (wfact > 100) {
+         wfact = 100;
+      }
+      budgetInit = nblock * ((wfact-1) / 3);
+      budget = budgetInit;
+      mainSort ( ptr, block, quadrant, ftab, nblock, verb, &budget );
+      if (verb >= 3)
+         VPrintf3 ( "      %d work, %d block, ratio %5.2f\n",
+                    budgetInit - budget,
+                    nblock,
+                    (float)(budgetInit - budget) /
+                    (float)(nblock==0 ? 1 : nblock) );
+      if (budget < 0) {
+         /* mainSort gave up: start again with the fallback sorter */
+         if (verb >= 2)
+            VPrintf0 ( "    too repetitive; using fallback"
+                       " sorting algorithm\n" );
+         fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
+      }
+   } else {
+      fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
+   }
+   /* locate the entry of ptr whose value is 0 */
+   s->origPtr = -1;
+   pos = 0;
+   while (pos < s->nblock && ptr[pos] != 0) pos++;
+   if (pos < s->nblock) s->origPtr = pos;
+   AssertH( s->origPtr != -1, 1003 );
+}
+/*-------------------------------------------------------------*/
+/*--- end                                       blocksort.c ---*/
+/*-------------------------------------------------------------*/