jk-ferret 0.11.8.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (228) hide show
  1. data/CHANGELOG +24 -0
  2. data/MIT-LICENSE +20 -0
  3. data/README +90 -0
  4. data/RELEASE_CHANGES +137 -0
  5. data/RELEASE_NOTES +60 -0
  6. data/Rakefile +443 -0
  7. data/TODO +109 -0
  8. data/TUTORIAL +231 -0
  9. data/bin/ferret-browser +79 -0
  10. data/ext/BZLIB_blocksort.c +1094 -0
  11. data/ext/BZLIB_bzlib.c +1578 -0
  12. data/ext/BZLIB_compress.c +672 -0
  13. data/ext/BZLIB_crctable.c +104 -0
  14. data/ext/BZLIB_decompress.c +626 -0
  15. data/ext/BZLIB_huffman.c +205 -0
  16. data/ext/BZLIB_randtable.c +84 -0
  17. data/ext/STEMMER_api.c +66 -0
  18. data/ext/STEMMER_libstemmer.c +93 -0
  19. data/ext/STEMMER_stem_ISO_8859_1_danish.c +337 -0
  20. data/ext/STEMMER_stem_ISO_8859_1_dutch.c +624 -0
  21. data/ext/STEMMER_stem_ISO_8859_1_english.c +1117 -0
  22. data/ext/STEMMER_stem_ISO_8859_1_finnish.c +762 -0
  23. data/ext/STEMMER_stem_ISO_8859_1_french.c +1246 -0
  24. data/ext/STEMMER_stem_ISO_8859_1_german.c +503 -0
  25. data/ext/STEMMER_stem_ISO_8859_1_hungarian.c +1230 -0
  26. data/ext/STEMMER_stem_ISO_8859_1_italian.c +1065 -0
  27. data/ext/STEMMER_stem_ISO_8859_1_norwegian.c +297 -0
  28. data/ext/STEMMER_stem_ISO_8859_1_porter.c +749 -0
  29. data/ext/STEMMER_stem_ISO_8859_1_portuguese.c +1017 -0
  30. data/ext/STEMMER_stem_ISO_8859_1_spanish.c +1093 -0
  31. data/ext/STEMMER_stem_ISO_8859_1_swedish.c +307 -0
  32. data/ext/STEMMER_stem_ISO_8859_2_romanian.c +998 -0
  33. data/ext/STEMMER_stem_KOI8_R_russian.c +700 -0
  34. data/ext/STEMMER_stem_UTF_8_danish.c +339 -0
  35. data/ext/STEMMER_stem_UTF_8_dutch.c +634 -0
  36. data/ext/STEMMER_stem_UTF_8_english.c +1125 -0
  37. data/ext/STEMMER_stem_UTF_8_finnish.c +768 -0
  38. data/ext/STEMMER_stem_UTF_8_french.c +1256 -0
  39. data/ext/STEMMER_stem_UTF_8_german.c +509 -0
  40. data/ext/STEMMER_stem_UTF_8_hungarian.c +1234 -0
  41. data/ext/STEMMER_stem_UTF_8_italian.c +1073 -0
  42. data/ext/STEMMER_stem_UTF_8_norwegian.c +299 -0
  43. data/ext/STEMMER_stem_UTF_8_porter.c +755 -0
  44. data/ext/STEMMER_stem_UTF_8_portuguese.c +1023 -0
  45. data/ext/STEMMER_stem_UTF_8_romanian.c +1004 -0
  46. data/ext/STEMMER_stem_UTF_8_russian.c +694 -0
  47. data/ext/STEMMER_stem_UTF_8_spanish.c +1097 -0
  48. data/ext/STEMMER_stem_UTF_8_swedish.c +309 -0
  49. data/ext/STEMMER_stem_UTF_8_turkish.c +2205 -0
  50. data/ext/STEMMER_utilities.c +478 -0
  51. data/ext/analysis.c +1710 -0
  52. data/ext/analysis.h +266 -0
  53. data/ext/api.h +26 -0
  54. data/ext/array.c +125 -0
  55. data/ext/array.h +62 -0
  56. data/ext/bitvector.c +96 -0
  57. data/ext/bitvector.h +594 -0
  58. data/ext/bzlib.h +282 -0
  59. data/ext/bzlib_private.h +503 -0
  60. data/ext/compound_io.c +384 -0
  61. data/ext/config.h +52 -0
  62. data/ext/document.c +159 -0
  63. data/ext/document.h +63 -0
  64. data/ext/except.c +102 -0
  65. data/ext/except.h +176 -0
  66. data/ext/extconf.rb +15 -0
  67. data/ext/ferret.c +416 -0
  68. data/ext/ferret.h +94 -0
  69. data/ext/field_index.c +262 -0
  70. data/ext/field_index.h +52 -0
  71. data/ext/filter.c +157 -0
  72. data/ext/fs_store.c +493 -0
  73. data/ext/global.c +458 -0
  74. data/ext/global.h +302 -0
  75. data/ext/hash.c +524 -0
  76. data/ext/hash.h +515 -0
  77. data/ext/hashset.c +192 -0
  78. data/ext/hashset.h +215 -0
  79. data/ext/header.h +58 -0
  80. data/ext/helper.c +63 -0
  81. data/ext/helper.h +21 -0
  82. data/ext/index.c +6804 -0
  83. data/ext/index.h +935 -0
  84. data/ext/internal.h +1019 -0
  85. data/ext/lang.c +10 -0
  86. data/ext/lang.h +68 -0
  87. data/ext/libstemmer.h +79 -0
  88. data/ext/mempool.c +88 -0
  89. data/ext/mempool.h +43 -0
  90. data/ext/modules.h +190 -0
  91. data/ext/multimapper.c +351 -0
  92. data/ext/multimapper.h +60 -0
  93. data/ext/posh.c +1006 -0
  94. data/ext/posh.h +973 -0
  95. data/ext/priorityqueue.c +149 -0
  96. data/ext/priorityqueue.h +155 -0
  97. data/ext/q_boolean.c +1621 -0
  98. data/ext/q_const_score.c +162 -0
  99. data/ext/q_filtered_query.c +212 -0
  100. data/ext/q_fuzzy.c +280 -0
  101. data/ext/q_match_all.c +149 -0
  102. data/ext/q_multi_term.c +673 -0
  103. data/ext/q_parser.c +3103 -0
  104. data/ext/q_phrase.c +1206 -0
  105. data/ext/q_prefix.c +98 -0
  106. data/ext/q_range.c +682 -0
  107. data/ext/q_span.c +2390 -0
  108. data/ext/q_term.c +337 -0
  109. data/ext/q_wildcard.c +167 -0
  110. data/ext/r_analysis.c +2626 -0
  111. data/ext/r_index.c +3468 -0
  112. data/ext/r_qparser.c +635 -0
  113. data/ext/r_search.c +4490 -0
  114. data/ext/r_store.c +513 -0
  115. data/ext/r_utils.c +1131 -0
  116. data/ext/ram_store.c +476 -0
  117. data/ext/scanner.c +895 -0
  118. data/ext/scanner.h +36 -0
  119. data/ext/scanner_mb.c +6701 -0
  120. data/ext/scanner_utf8.c +4415 -0
  121. data/ext/search.c +1864 -0
  122. data/ext/search.h +953 -0
  123. data/ext/similarity.c +151 -0
  124. data/ext/similarity.h +89 -0
  125. data/ext/sort.c +786 -0
  126. data/ext/stem_ISO_8859_1_danish.h +16 -0
  127. data/ext/stem_ISO_8859_1_dutch.h +16 -0
  128. data/ext/stem_ISO_8859_1_english.h +16 -0
  129. data/ext/stem_ISO_8859_1_finnish.h +16 -0
  130. data/ext/stem_ISO_8859_1_french.h +16 -0
  131. data/ext/stem_ISO_8859_1_german.h +16 -0
  132. data/ext/stem_ISO_8859_1_hungarian.h +16 -0
  133. data/ext/stem_ISO_8859_1_italian.h +16 -0
  134. data/ext/stem_ISO_8859_1_norwegian.h +16 -0
  135. data/ext/stem_ISO_8859_1_porter.h +16 -0
  136. data/ext/stem_ISO_8859_1_portuguese.h +16 -0
  137. data/ext/stem_ISO_8859_1_spanish.h +16 -0
  138. data/ext/stem_ISO_8859_1_swedish.h +16 -0
  139. data/ext/stem_ISO_8859_2_romanian.h +16 -0
  140. data/ext/stem_KOI8_R_russian.h +16 -0
  141. data/ext/stem_UTF_8_danish.h +16 -0
  142. data/ext/stem_UTF_8_dutch.h +16 -0
  143. data/ext/stem_UTF_8_english.h +16 -0
  144. data/ext/stem_UTF_8_finnish.h +16 -0
  145. data/ext/stem_UTF_8_french.h +16 -0
  146. data/ext/stem_UTF_8_german.h +16 -0
  147. data/ext/stem_UTF_8_hungarian.h +16 -0
  148. data/ext/stem_UTF_8_italian.h +16 -0
  149. data/ext/stem_UTF_8_norwegian.h +16 -0
  150. data/ext/stem_UTF_8_porter.h +16 -0
  151. data/ext/stem_UTF_8_portuguese.h +16 -0
  152. data/ext/stem_UTF_8_romanian.h +16 -0
  153. data/ext/stem_UTF_8_russian.h +16 -0
  154. data/ext/stem_UTF_8_spanish.h +16 -0
  155. data/ext/stem_UTF_8_swedish.h +16 -0
  156. data/ext/stem_UTF_8_turkish.h +16 -0
  157. data/ext/stopwords.c +410 -0
  158. data/ext/store.c +698 -0
  159. data/ext/store.h +799 -0
  160. data/ext/symbol.c +10 -0
  161. data/ext/symbol.h +23 -0
  162. data/ext/term_vectors.c +73 -0
  163. data/ext/threading.h +31 -0
  164. data/ext/win32.h +62 -0
  165. data/lib/ferret.rb +30 -0
  166. data/lib/ferret/browser.rb +246 -0
  167. data/lib/ferret/browser/s/global.js +192 -0
  168. data/lib/ferret/browser/s/style.css +148 -0
  169. data/lib/ferret/browser/views/document/list.rhtml +49 -0
  170. data/lib/ferret/browser/views/document/show.rhtml +27 -0
  171. data/lib/ferret/browser/views/error/index.rhtml +7 -0
  172. data/lib/ferret/browser/views/help/index.rhtml +8 -0
  173. data/lib/ferret/browser/views/home/index.rhtml +29 -0
  174. data/lib/ferret/browser/views/layout.rhtml +22 -0
  175. data/lib/ferret/browser/views/term-vector/index.rhtml +4 -0
  176. data/lib/ferret/browser/views/term/index.rhtml +199 -0
  177. data/lib/ferret/browser/views/term/termdocs.rhtml +1 -0
  178. data/lib/ferret/browser/webrick.rb +14 -0
  179. data/lib/ferret/document.rb +130 -0
  180. data/lib/ferret/field_infos.rb +44 -0
  181. data/lib/ferret/field_symbol.rb +87 -0
  182. data/lib/ferret/index.rb +973 -0
  183. data/lib/ferret/number_tools.rb +157 -0
  184. data/lib/ferret/version.rb +3 -0
  185. data/setup.rb +1555 -0
  186. data/test/long_running/largefile/tc_largefile.rb +46 -0
  187. data/test/test_all.rb +5 -0
  188. data/test/test_helper.rb +29 -0
  189. data/test/test_installed.rb +1 -0
  190. data/test/threading/number_to_spoken.rb +132 -0
  191. data/test/threading/thread_safety_index_test.rb +88 -0
  192. data/test/threading/thread_safety_read_write_test.rb +73 -0
  193. data/test/threading/thread_safety_test.rb +133 -0
  194. data/test/unit/analysis/tc_analyzer.rb +550 -0
  195. data/test/unit/analysis/tc_token_stream.rb +653 -0
  196. data/test/unit/index/tc_index.rb +867 -0
  197. data/test/unit/index/tc_index_reader.rb +699 -0
  198. data/test/unit/index/tc_index_writer.rb +447 -0
  199. data/test/unit/index/th_doc.rb +332 -0
  200. data/test/unit/query_parser/tc_query_parser.rb +238 -0
  201. data/test/unit/search/tc_filter.rb +156 -0
  202. data/test/unit/search/tc_fuzzy_query.rb +147 -0
  203. data/test/unit/search/tc_index_searcher.rb +67 -0
  204. data/test/unit/search/tc_multi_searcher.rb +128 -0
  205. data/test/unit/search/tc_multiple_search_requests.rb +58 -0
  206. data/test/unit/search/tc_search_and_sort.rb +179 -0
  207. data/test/unit/search/tc_sort.rb +49 -0
  208. data/test/unit/search/tc_sort_field.rb +27 -0
  209. data/test/unit/search/tc_spans.rb +190 -0
  210. data/test/unit/search/tm_searcher.rb +436 -0
  211. data/test/unit/store/tc_fs_store.rb +115 -0
  212. data/test/unit/store/tc_ram_store.rb +35 -0
  213. data/test/unit/store/tm_store.rb +34 -0
  214. data/test/unit/store/tm_store_lock.rb +68 -0
  215. data/test/unit/tc_document.rb +81 -0
  216. data/test/unit/tc_field_symbol.rb +26 -0
  217. data/test/unit/ts_analysis.rb +2 -0
  218. data/test/unit/ts_index.rb +2 -0
  219. data/test/unit/ts_largefile.rb +4 -0
  220. data/test/unit/ts_query_parser.rb +2 -0
  221. data/test/unit/ts_search.rb +2 -0
  222. data/test/unit/ts_store.rb +2 -0
  223. data/test/unit/ts_utils.rb +2 -0
  224. data/test/unit/utils/tc_bit_vector.rb +295 -0
  225. data/test/unit/utils/tc_number_tools.rb +117 -0
  226. data/test/unit/utils/tc_priority_queue.rb +106 -0
  227. data/test/utils/content_generator.rb +226 -0
  228. metadata +319 -0
data/TODO ADDED
@@ -0,0 +1,109 @@
1
+ TODO
2
+ ====
3
+ * C
4
+ - IMPORTANT:
5
+ + FIX file descriptor overflow. See Tickets #341 and #343
6
+ - add .. operator to query parser. For example, [100 200] could be written as
7
+ 100..200 or 100...201 like in Ruby Ranges
8
+ - remove exception handling from C code. All errors to be handled by return
9
+ values.
10
+ - Move to sqlite's locking model. Ferret should work fine in a multi-process
11
+ environment.
12
+ - Add optional logging. To be enabled at compilation time, perhaps?
13
+ - Add support for changing zlib and bzlib compression parameters
14
+ - Improve unit test coverage to 100%
15
+ - Add benchmark suite
16
+ - Add Rakefile for development purposes
17
+ + task to publish gcov and benchmark results to ferret wiki
18
+ - Index rebuilding of old versioned indexes.
19
+ - Add a globally accessible, thread-safe symbol table. This will be very
20
+ useful for storing field names so that no objects need to strdup the
21
+ field-names but can just store the symbol representative instead.
22
+ + this has been done but it can be improved using actual Symbol structs
23
+ instead of plain char*
24
+ - Make threading optional at compile time
25
+ - to_json should limit output to prevent memory overflow on large indexes.
26
+ Perhaps we could use some type of buffered read for this.
27
+ - Make BitVector run as fast as bitset from C++ STL. See;
28
+ c/benchmark/bm_bitvector.c
29
+ - Add a symbol table for field names. This will mean that we won't need to
30
+ worry about mallocing and freeing field names which happens all over the
31
+ place.
32
+ - Divide the headers into public and private (the private headers to be
33
+ stored in the src directory).
34
+ - Group-by search. ie you should be able to pass a field to group search
35
+ results by
36
+ - Auto-loading of documents during search. ie actual documents get returned
37
+ instead of document numbers.
38
+
39
+ * Ruby bindings
40
+ - argument checking for every method. We need a new api for argument checking
41
+ so that the arguments get checked at the start of each method that could
42
+ cause a segfault.
43
+ - improve memory management. It is way too complex at the moment. I also need
44
+ to document how it works so that other developers understand what is going
45
+ on.
46
+ - Replace Data_Wrap_Struct with ferret alternative which handles rewrapping
47
+ of structs automatically and also knows when to release a struct by using
48
+ refcounting.
49
+
50
+ * Ruby
51
+ - integrate rcov
52
+ - improve unit test coverage to 100%
53
+
54
+ * Documentation.
55
+ - generate Ruby binding documentation with custom build template similar to
56
+ jaxdoc http://rubyforge.org/projects/jaxdoc
57
+ - all documentation should meet DOCUMENTATION_STANDARDS
58
+ - documentation in C code to be generated by doxygen
59
+
60
+ Someday Maybe
61
+ =============
62
+ * apply for Google Summer of Code 2009
63
+ * optimize read and write vint
64
+ - test the following outside of ferret before implementing
65
+ - perform a binary scan using bit-wise or to find out how many bytes need
66
+ to be written
67
+ - if the write/read will overflow the buffer, split it into two, refreshing
68
+ the buffer in between
69
+ - use Duff's device to write bytes now that we know how many we need
70
+ * add a super fast language based dictionary compression
71
+ * add portable stacktrace function. Perhaps implement as an external library.
72
+ - See http://www.nongnu.org/libunwind/
73
+ - See http://www.tlug.org.za/wiki/index.php/Obtaining_a_stack_trace_in_C_upon_SIGSEGV
74
+ * investigate unscored searching
75
+ * user defined sorting
76
+ * Fix highlighting to work for external fields
77
+ * investigate faster string hashing method
78
+
79
+ Done
80
+ ====
81
+ * add rake install task
82
+ * FIX :create parameter so that it only deletes the files owned by Ferret.
83
+ * fix compression. Currently nothing is happening if you set a field to
84
+ :compress. I guess we'll just assume zlib is installed, as I think it has to
85
+ be for Ruby to be installed.
86
+ * add bzlib support
87
+ * integrate gcov
88
+ * add a field cache to IndexReader
89
+ * setup email alerts for svn commits
90
+ * Ranged, unordered searching. Ie search through the index until you have the
91
+ required number of documents and then break. This will require the ability to
92
+ start searches from a particular doc-num.
93
+ + See searcher_search_unordered in the C code and Searcher#scan in Ruby
94
+ * improve unit test code. I'd like to implement some way to print out a stack
95
+ trace when a test fails so that it is easy to find the source of the error.
96
+ * catch segfaults and print stack trace so users can post helpful bug tickets.
97
+ again, see the same links for adding stacktrace to unit tests.
98
+ * Add string Sort descriptor
99
+ * fix memory bug
100
+ * add MultiReader interface
101
+ * add lexicographical sort (byte sort)
102
+ * Add highlighting
103
+ * add field compression
104
+ * Fix highlighting to work for compressed fields
105
+ * Add Ferret::Index::Index
106
+ * Fix:
107
+ + Working Query: field1:value1 AND NOT field2:value2
108
+ + Failing Query: field1:value1 AND ( NOT field2:value2 )
109
+ * update benchmark suite to use getrusage
data/TUTORIAL ADDED
@@ -0,0 +1,231 @@
1
+ = Quick Introduction to Ferret
2
+
3
+ The simplest way to use Ferret is through the Ferret::Index::Index class.
4
+ This is now aliased by Ferret::I for quick and easy access. Start by including
5
+ the Ferret module.
6
+
7
+ require 'ferret'
8
+ include Ferret
9
+
10
+ === Creating an index
11
+
12
+ To create an in memory index is very simple;
13
+
14
+ index = Index::Index.new()
15
+
16
+ To create a persistent index;
17
+
18
+ index = Index::Index.new(:path => '/path/to/index')
19
+
20
+ Both of these methods create new Indexes with the StandardAnalyzer. An
21
+ analyzer is what you use to divide the input data up into tokens which you can
22
+ search for later. If you'd like to use a different analyzer you can specify it
23
+ here, eg;
24
+
25
+ index = Index::Index.new(:path => '/path/to/index',
26
+ :analyzer => Analysis::WhiteSpaceAnalyzer.new)
27
+
28
+ For more options when creating an Index refer to Ferret::Index::Index.
29
+
30
+ === Adding Documents
31
+
32
+ To add a document you can simply add a string or an array of strings. This will
33
+ store all the strings in the "" (ie empty string) field (unless you specify the
34
+ default field when you create the index).
35
+
36
+ index << "This is a new document to be indexed"
37
+ index << ["And here", "is another", "new document", "to be indexed"]
38
+
39
+ But these are pretty simple documents. If this is all you want to index you
40
+ could probably just use SimpleSearch. So let's give our documents some fields;
41
+
42
+ index << {:title => "Programming Ruby", :content => "blah blah blah"}
43
+ index << {:title => "Programming Ruby", :content => "yada yada yada"}
44
+
45
+ Note the way that all field-names are Symbols. Although Strings will work,
46
+ this is a best-practice in Ferret. Or if you are indexing data stored in a
47
+ database, you'll probably want to store the id;
48
+
49
+ index << {:id => row.id, :title => row.title, :date => row.date}
50
+
51
+ So far we have been storing and tokenizing all of the input data along with
52
+ term vectors. If we want to change this we need to change the way we setup the
53
+ index. You must create a FieldInfos object describing the index:
54
+
55
+ field_infos = FieldInfos.new(:store => :no,
56
+ :index => :untokenized_omit_norms,
57
+ :term_vector => :no)
58
+
59
+ The values that you set FieldInfos to have will be used by default by all
60
+ fields. If you want to change the properties for specific fields, you need to
61
+ add a FieldInfo to field_infos.
62
+
63
+ field_infos.add_field(:title, :store => :yes, :index => :yes, :boost => 10.0)
64
+ field_infos.add_field(:content, :store => :yes,
65
+ :index => :yes,
66
+ :term_vector => :with_positions_offsets)
67
+
68
+ If you need to add a field to an already open index you do so like this:
69
+
70
+ index.field_infos.add_field(:new_field, :store => :yes)
71
+
72
+ === Searching
73
+
74
+ Now that we have data in our index, how do we actually use this index to
75
+ search the data? The Index offers two search methods, Index#search and
76
+ Index#search_each. The first method returns a Ferret::Index::TopDocs object.
77
+ The second we'll show here. Let's say we wanted to find all documents with the
78
+ phrase "quick brown fox" in the content field. We'd write;
79
+
80
+ index.search_each('content:"quick brown fox"') do |id, score|
81
+ puts "Document #{id} found with a score of #{score}"
82
+ end
83
+
84
+ But "fast" has a pretty similar meaning to "quick" and we don't mind if the
85
+ fox is a little red. Also, the phrase could be in the title so we'll search
86
+ there as well. So we could expand our search like this;
87
+
88
+ index.search_each('title|content:"quick|fast brown|red fox"') do |id, score|
89
+ puts "Document #{id} found with a score of #{score}"
90
+ end
91
+
92
+ What if we want to find all documents entered on or after 5th of September,
93
+ 2005 with the words "ruby" or "rails" in any field? We could type something like;
94
+
95
+ index.search_each('date:( >= 20050905) *:(ruby OR rails)') do |id, score|
96
+ puts "Document #{index[id][:title]} found with a score of #{score}"
97
+ end
98
+
99
+ Ferret has quite a complex query language. To find out more about Ferret's
100
+ query language, see Ferret::QueryParser. You can also construct even more
101
+ complex queries like Ferret::Search::Spans by hand. See Ferret::Search::Query
102
+ for more information.
103
+
104
+ === Highlighting
105
+
106
+ Ferret now has a super-fast highlighting method. See
107
+ Ferret::Index::Index#highlight. Here is an example of how you would use it
108
+ when printing to the console:
109
+
110
+ index.search_each('date:( >= 20050905) content:(ruby OR rails)') do |id, score|
111
+ puts "Document #{index[id][:title]} found with a score of #{score}"
112
+ highlights = index.highlight("content:(ruby OR rails)", id,
113
+ :field => :content,
114
+ :pre_tag => "\033[36m",
115
+ :post_tag => "\033[m")
116
+ puts highlights
117
+ end
118
+
119
+ And if you want to highlight a whole document, set :excerpt_length to :all:
120
+
121
+ puts index.highlight(query, doc_id,
122
+ :field => :content,
123
+ :pre_tag => "\033[36m",
124
+ :post_tag => "\033[m",
125
+ :excerpt_length => :all)
126
+
127
+ === Accessing Documents
128
+
129
+ You may have noticed that when we run a search we only get the document id
130
+ back. By itself this isn't much use to us. Getting the data from the index is
131
+ very straightforward. For example if we want the :title field from the 3rd
132
+ document, type;
133
+
134
+ index[2][:title]
135
+
136
+ Documents are lazy loading so if you try this:
137
+
138
+ puts index[2]
139
+
140
+ You will always get an empty Hash. To load all fields, call the load method:
141
+
142
+ puts index[2].load
143
+
144
+ NOTE: documents are indexed from 0. You can also use array-like index
145
+ parameters to access index. For example
146
+
147
+ index[1..4]
148
+ index[10, 10]
149
+ index[-5]
150
+
151
+ The default field is :id (although you can change this with index's
152
+ :default_create_field parameter);
153
+
154
+ index << "This is a document"
155
+ index[0][:id]
156
+
157
+ Let's go back to the database example above. If we store all of our documents
158
+ with an id then we can access that field using the id. As long as we called
159
+ our id field :id we can do this
160
+
161
+ index["89721347"]["title"]
162
+
163
+ Pretty simple huh? You should note though that if there is more than one
164
+ document with the same *id* or *key* then only the first one will be returned
165
+ so it is probably better that you ensure the key is unique somehow. By setting
166
+ Index's :key attribute to :id, Ferret will do this automatically for you. It
167
+ can even handle multiple field primary keys. For example, you could set
168
+ :key to [:id, :model] and Ferret would keep the documents unique for that pair
169
+ of fields.
170
+
171
+ === Modifying and Deleting Documents
172
+
173
+ What if we want to change the data in the index? Ferret doesn't actually let
174
+ you change the data once it is in the index. But you can delete documents so
175
+ the standard way to modify data is to delete it and re-add it again with the
176
+ modifications made. It is important to note that when doing this the documents
177
+ will get a new document number so you should be careful not to use a document
178
+ number after the document has been deleted. Here is an example of modifying a
179
+ document;
180
+
181
+ index << {:title => "Programing Rbuy", :content => "blah blah blah"}
182
+ doc_id = nil
183
+ index.search_each('title:"Programing Rbuy"') {|id, score| doc_id = id}
184
+ return unless doc_id
185
+ doc = index[doc_id]
186
+ index.delete(doc_id)
187
+
188
+ # modify doc. It is just a Hash after all
189
+ doc[:title] = "Programming Ruby"
190
+
191
+ index << doc
192
+
193
+ If you set the :key parameter as described in the last section there is no
194
+ need to delete the document. It will be automatically deleted when you add
195
+ another document with the same key.
196
+
197
+ Also, we can use the id field, as above, to delete documents. This time though
198
+ every document that matches the id will be deleted. Again, it is probably a
199
+ good idea if you somehow ensure that your *ids* are kept unique.
200
+
201
+ id = "23453422"
202
+ index.delete(id)
203
+
204
+ === Onwards
205
+
206
+ This is just a small sampling of what Ferret allows you to do. Ferret, like
207
+ Lucene, is designed to be extended, and allows you to construct your own query
208
+ types, analyzers, and so on. Going onwards you should check out the following
209
+ documentation:
210
+
211
+ * Ferret::Analysis: for more information on how the data is processed when it
212
+ is tokenized. There are a number of things you can do with your data such as
213
+ adding stop lists or perhaps a porter stemmer. There are also a number of
214
+ analyzers already available and it is almost trivial to create a new one
215
+ with a simple regular expression.
216
+
217
+ * Ferret::Search: for more information on querying the index. There are a
218
+ number of already available queries and it's unlikely you'll need to create
219
+ your own. You may however want to take advantage of the sorting or filtering
220
+ abilities of Ferret to present your data the best way you see fit.
221
+
222
+ * Ferret::QueryParser: if you want to find out more about what you can do with
223
+ Ferret's Query Parser, this is the place to look. The query parser is one
224
+ area that could use a bit of work so please send your suggestions.
225
+
226
+ * Ferret::Index: for more advanced access to the index you'll probably want to
227
+ use the Ferret::Index::IndexWriter and Ferret::Index::IndexReader. This is
228
+ the place to look for more information on them.
229
+
230
+ * Ferret::Store: This is the module used to access the actual index storage
231
+ and won't be of much interest to most people.
@@ -0,0 +1,79 @@
1
#!/usr/bin/env ruby
#
# ferret-browser: starts a small web server for browsing the Ferret index
# found at the path given on the command line.
#
# Usage: ferret-browser [options] /path/to/index

# Make the bundled library loadable when running from a source checkout. The
# lib directory sits next to the directory holding this script, so the path
# has to be built from File.dirname (File.basename would yield the script's
# own name and resolve to ./lib under the current working directory).
$: << File.expand_path(File.join(File.dirname(__FILE__), '../lib'))
require 'ferret'
require 'ferret/browser'

require 'optparse'
require 'ostruct'

SERVER_OPTIONS = ['webrick']
conf = OpenStruct.new(:host => '0.0.0.0', :port => 3301)

parser = OptionParser.new do |opts|
  opts.banner = "Usage: #{File.basename($0)} /path/to/index"
  opts.separator ""
  opts.separator "Specific Options:"

  # Explicit assignments are used instead of the Ruby 1.8-only
  # `{ |conf.host| }` form, which no longer parses on Ruby 1.9 and later.
  opts.on("-h", "--host HOSTNAME",
          "Host for web server to bind to (default is all IPs)") { |host| conf.host = host }
  opts.on("-p", "--port NUM",
          "Port for web server (defaults to #{conf.port})") { |port| conf.port = port }
  opts.on("-s", "--server NAME",
          "Server to force (#{SERVER_OPTIONS.join(', ')}).") { |s| conf.server = s.to_sym }

  opts.separator ""
  opts.separator "Common options:"

  opts.on_tail("-?", "--help", "Show this message") do
    puts opts
    exit
  end

  opts.on_tail("-v", "--version", "Show version") do
    puts Ferret::VERSION
    exit
  end
end

# After option parsing exactly one positional argument must remain: the index path.
parser.parse! ARGV
if ARGV.length != 1
  puts parser
  exit
end
@path = ARGV[0]

# Load the Ferret index
begin
  @reader = Ferret::Index::IndexReader.new(@path)
rescue Ferret::FileNotFoundError
  puts "\033[31mCannot start Ferret. No index exists at \"\033[m" +
       "\033[33m#{@path}\033[m\033[31m\".\033[m"
  exit
rescue Exception => e
  # The message ends with the ANSI reset sequence "\033[m" so the terminal
  # colour is restored (the original emitted "\031[m", which is not an escape).
  puts "\033[31mCannot start Ferret.\n\033[m\033[33m#{e.to_s}\033[m"
  exit
end

# Fall back to WEBrick when no server was requested with --server.
unless conf.server
  conf.server = :webrick
end

case conf.server.to_s
when 'webrick'
  require 'webrick/httpserver'
  require 'ferret/browser/webrick'

  # Mount the root
  s = WEBrick::HTTPServer.new(:BindAddress => conf.host, :Port => conf.port)
  s.mount "/s", WEBrick::HTTPServlet::FileHandler, Ferret::Browser::Controller::STATIC_DIR, true
  s.mount "/", WEBrick::FerretBrowserHandler, @reader, @path

  # Server up; Ctrl-C asks the server to shut down
  trap(:INT) do
    s.shutdown
  end
  s.start
else
  raise "server #{conf.server} not known. Must be one of [#{SERVER_OPTIONS.join(', ')}]"
end
@@ -0,0 +1,1094 @@
1
+
2
+ /*-------------------------------------------------------------*/
3
+ /*--- Block sorting machinery ---*/
4
+ /*--- blocksort.c ---*/
5
+ /*-------------------------------------------------------------*/
6
+
7
+ /* ------------------------------------------------------------------
8
+ This file is part of bzip2/libbzip2, a program and library for
9
+ lossless, block-sorting data compression.
10
+
11
+ bzip2/libbzip2 version 1.0.4 of 20 December 2006
12
+ Copyright (C) 1996-2006 Julian Seward <jseward@bzip.org>
13
+
14
+ Please read the WARNING, DISCLAIMER and PATENTS sections in the
15
+ README file.
16
+
17
+ This program is released under the terms of the license contained
18
+ in the file LICENSE.
19
+ ------------------------------------------------------------------ */
20
+
21
+
22
+ #include "bzlib_private.h"
23
+
24
+ /*---------------------------------------------*/
25
+ /*--- Fallback O(N log(N)^2) sorting ---*/
26
+ /*--- algorithm, for repetitive blocks ---*/
27
+ /*---------------------------------------------*/
28
+
29
+ /*---------------------------------------------*/
30
+ static
31
+ __inline__
32
+ void fallbackSimpleSort ( UInt32* fmap,
33
+ UInt32* eclass,
34
+ Int32 lo,
35
+ Int32 hi )
36
+ {
37
+ Int32 i, j, tmp;
38
+ UInt32 ec_tmp;
39
+
40
+ if (lo == hi) return;
41
+
42
+ if (hi - lo > 3) {
43
+ for ( i = hi-4; i >= lo; i-- ) {
44
+ tmp = fmap[i];
45
+ ec_tmp = eclass[tmp];
46
+ for ( j = i+4; j <= hi && ec_tmp > eclass[fmap[j]]; j += 4 )
47
+ fmap[j-4] = fmap[j];
48
+ fmap[j-4] = tmp;
49
+ }
50
+ }
51
+
52
+ for ( i = hi-1; i >= lo; i-- ) {
53
+ tmp = fmap[i];
54
+ ec_tmp = eclass[tmp];
55
+ for ( j = i+1; j <= hi && ec_tmp > eclass[fmap[j]]; j++ )
56
+ fmap[j-1] = fmap[j];
57
+ fmap[j-1] = tmp;
58
+ }
59
+ }
60
+
61
+
62
+ /*---------------------------------------------*/
63
+ #define fswap(zz1, zz2) \
64
+ { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; }
65
+
66
+ #define fvswap(zzp1, zzp2, zzn) \
67
+ { \
68
+ Int32 yyp1 = (zzp1); \
69
+ Int32 yyp2 = (zzp2); \
70
+ Int32 yyn = (zzn); \
71
+ while (yyn > 0) { \
72
+ fswap(fmap[yyp1], fmap[yyp2]); \
73
+ yyp1++; yyp2++; yyn--; \
74
+ } \
75
+ }
76
+
77
+
78
+ #define fmin(a,b) ((a) < (b)) ? (a) : (b)
79
+
80
+ #define fpush(lz,hz) { stackLo[sp] = lz; \
81
+ stackHi[sp] = hz; \
82
+ sp++; }
83
+
84
+ #define fpop(lz,hz) { sp--; \
85
+ lz = stackLo[sp]; \
86
+ hz = stackHi[sp]; }
87
+
88
+ #define FALLBACK_QSORT_SMALL_THRESH 10
89
+ #define FALLBACK_QSORT_STACK_SIZE 100
90
+
91
+
92
+ static
93
+ void fallbackQSort3 ( UInt32* fmap,
94
+ UInt32* eclass,
95
+ Int32 loSt,
96
+ Int32 hiSt )
97
+ {
98
+ Int32 unLo, unHi, ltLo, gtHi, n, m;
99
+ Int32 sp, lo, hi;
100
+ UInt32 med, r, r3;
101
+ Int32 stackLo[FALLBACK_QSORT_STACK_SIZE];
102
+ Int32 stackHi[FALLBACK_QSORT_STACK_SIZE];
103
+
104
+ r = 0;
105
+
106
+ sp = 0;
107
+ fpush ( loSt, hiSt );
108
+
109
+ while (sp > 0) {
110
+
111
+ AssertH ( sp < FALLBACK_QSORT_STACK_SIZE - 1, 1004 );
112
+
113
+ fpop ( lo, hi );
114
+ if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) {
115
+ fallbackSimpleSort ( fmap, eclass, lo, hi );
116
+ continue;
117
+ }
118
+
119
+ /* Random partitioning. Median of 3 sometimes fails to
120
+ avoid bad cases. Median of 9 seems to help but
121
+ looks rather expensive. This too seems to work but
122
+ is cheaper. Guidance for the magic constants
123
+ 7621 and 32768 is taken from Sedgewick's algorithms
124
+ book, chapter 35.
125
+ */
126
+ r = ((r * 7621) + 1) % 32768;
127
+ r3 = r % 3;
128
+ if (r3 == 0) med = eclass[fmap[lo]]; else
129
+ if (r3 == 1) med = eclass[fmap[(lo+hi)>>1]]; else
130
+ med = eclass[fmap[hi]];
131
+
132
+ unLo = ltLo = lo;
133
+ unHi = gtHi = hi;
134
+
135
+ while (1) {
136
+ while (1) {
137
+ if (unLo > unHi) break;
138
+ n = (Int32)eclass[fmap[unLo]] - (Int32)med;
139
+ if (n == 0) {
140
+ fswap(fmap[unLo], fmap[ltLo]);
141
+ ltLo++; unLo++;
142
+ continue;
143
+ };
144
+ if (n > 0) break;
145
+ unLo++;
146
+ }
147
+ while (1) {
148
+ if (unLo > unHi) break;
149
+ n = (Int32)eclass[fmap[unHi]] - (Int32)med;
150
+ if (n == 0) {
151
+ fswap(fmap[unHi], fmap[gtHi]);
152
+ gtHi--; unHi--;
153
+ continue;
154
+ };
155
+ if (n < 0) break;
156
+ unHi--;
157
+ }
158
+ if (unLo > unHi) break;
159
+ fswap(fmap[unLo], fmap[unHi]); unLo++; unHi--;
160
+ }
161
+
162
+ AssertD ( unHi == unLo-1, "fallbackQSort3(2)" );
163
+
164
+ if (gtHi < ltLo) continue;
165
+
166
+ n = fmin(ltLo-lo, unLo-ltLo); fvswap(lo, unLo-n, n);
167
+ m = fmin(hi-gtHi, gtHi-unHi); fvswap(unLo, hi-m+1, m);
168
+
169
+ n = lo + unLo - ltLo - 1;
170
+ m = hi - (gtHi - unHi) + 1;
171
+
172
+ if (n - lo > hi - m) {
173
+ fpush ( lo, n );
174
+ fpush ( m, hi );
175
+ } else {
176
+ fpush ( m, hi );
177
+ fpush ( lo, n );
178
+ }
179
+ }
180
+ }
181
+
182
+ #undef fmin
183
+ #undef fpush
184
+ #undef fpop
185
+ #undef fswap
186
+ #undef fvswap
187
+ #undef FALLBACK_QSORT_SMALL_THRESH
188
+ #undef FALLBACK_QSORT_STACK_SIZE
189
+
190
+
191
+ /*---------------------------------------------*/
192
+ /* Pre:
193
+ nblock > 0
194
+ eclass exists for [0 .. nblock-1]
195
+ ((UChar*)eclass) [0 .. nblock-1] holds block
196
+ ptr exists for [0 .. nblock-1]
197
+
198
+ Post:
199
+ ((UChar*)eclass) [0 .. nblock-1] holds block
200
+ All other areas of eclass destroyed
201
+ fmap [0 .. nblock-1] holds sorted order
202
+ bhtab [ 0 .. 2+(nblock/32) ] destroyed
203
+ */
204
+
205
+ #define SET_BH(zz) bhtab[(zz) >> 5] |= (1 << ((zz) & 31))
206
+ #define CLEAR_BH(zz) bhtab[(zz) >> 5] &= ~(1 << ((zz) & 31))
207
+ #define ISSET_BH(zz) (bhtab[(zz) >> 5] & (1 << ((zz) & 31)))
208
+ #define WORD_BH(zz) bhtab[(zz) >> 5]
209
+ #define UNALIGNED_BH(zz) ((zz) & 0x01f)
210
+
211
+ static
212
+ void fallbackSort ( UInt32* fmap,
213
+ UInt32* eclass,
214
+ UInt32* bhtab,
215
+ Int32 nblock,
216
+ Int32 verb )
217
+ {
218
+ Int32 ftab[257];
219
+ Int32 ftabCopy[256];
220
+ Int32 H, i, j, k, l, r, cc, cc1;
221
+ Int32 nNotDone;
222
+ Int32 nBhtab;
223
+ UChar* eclass8 = (UChar*)eclass;
224
+
225
+ /*--
226
+ Initial 1-char radix sort to generate
227
+ initial fmap and initial BH bits.
228
+ --*/
229
+ if (verb >= 4)
230
+ VPrintf0 ( " bucket sorting ...\n" );
231
+ for (i = 0; i < 257; i++) ftab[i] = 0;
232
+ for (i = 0; i < nblock; i++) ftab[eclass8[i]]++;
233
+ for (i = 0; i < 256; i++) ftabCopy[i] = ftab[i];
234
+ for (i = 1; i < 257; i++) ftab[i] += ftab[i-1];
235
+
236
+ for (i = 0; i < nblock; i++) {
237
+ j = eclass8[i];
238
+ k = ftab[j] - 1;
239
+ ftab[j] = k;
240
+ fmap[k] = i;
241
+ }
242
+
243
+ nBhtab = 2 + (nblock / 32);
244
+ for (i = 0; i < nBhtab; i++) bhtab[i] = 0;
245
+ for (i = 0; i < 256; i++) SET_BH(ftab[i]);
246
+
247
+ /*--
248
+ Inductively refine the buckets. Kind-of an
249
+ "exponential radix sort" (!), inspired by the
250
+ Manber-Myers suffix array construction algorithm.
251
+ --*/
252
+
253
+ /*-- set sentinel bits for block-end detection --*/
254
+ for (i = 0; i < 32; i++) {
255
+ SET_BH(nblock + 2*i);
256
+ CLEAR_BH(nblock + 2*i + 1);
257
+ }
258
+
259
+ /*-- the log(N) loop --*/
260
+ H = 1;
261
+ while (1) {
262
+
263
+ if (verb >= 4)
264
+ VPrintf1 ( " depth %6d has ", H );
265
+
266
+ j = 0;
267
+ for (i = 0; i < nblock; i++) {
268
+ if (ISSET_BH(i)) j = i;
269
+ k = fmap[i] - H; if (k < 0) k += nblock;
270
+ eclass[k] = j;
271
+ }
272
+
273
+ nNotDone = 0;
274
+ r = -1;
275
+ while (1) {
276
+
277
+ /*-- find the next non-singleton bucket --*/
278
+ k = r + 1;
279
+ while (ISSET_BH(k) && UNALIGNED_BH(k)) k++;
280
+ if (ISSET_BH(k)) {
281
+ while (WORD_BH(k) == 0xffffffff) k += 32;
282
+ while (ISSET_BH(k)) k++;
283
+ }
284
+ l = k - 1;
285
+ if (l >= nblock) break;
286
+ while (!ISSET_BH(k) && UNALIGNED_BH(k)) k++;
287
+ if (!ISSET_BH(k)) {
288
+ while (WORD_BH(k) == 0x00000000) k += 32;
289
+ while (!ISSET_BH(k)) k++;
290
+ }
291
+ r = k - 1;
292
+ if (r >= nblock) break;
293
+
294
+ /*-- now [l, r] bracket current bucket --*/
295
+ if (r > l) {
296
+ nNotDone += (r - l + 1);
297
+ fallbackQSort3 ( fmap, eclass, l, r );
298
+
299
+ /*-- scan bucket and generate header bits-- */
300
+ cc = -1;
301
+ for (i = l; i <= r; i++) {
302
+ cc1 = eclass[fmap[i]];
303
+ if (cc != cc1) { SET_BH(i); cc = cc1; };
304
+ }
305
+ }
306
+ }
307
+
308
+ if (verb >= 4)
309
+ VPrintf1 ( "%6d unresolved strings\n", nNotDone );
310
+
311
+ H *= 2;
312
+ if (H > nblock || nNotDone == 0) break;
313
+ }
314
+
315
+ /*--
316
+ Reconstruct the original block in
317
+ eclass8 [0 .. nblock-1], since the
318
+ previous phase destroyed it.
319
+ --*/
320
+ if (verb >= 4)
321
+ VPrintf0 ( " reconstructing block ...\n" );
322
+ j = 0;
323
+ for (i = 0; i < nblock; i++) {
324
+ while (ftabCopy[j] == 0) j++;
325
+ ftabCopy[j]--;
326
+ eclass8[fmap[i]] = (UChar)j;
327
+ }
328
+ AssertH ( j < 256, 1005 );
329
+ }
330
+
331
+ #undef SET_BH
332
+ #undef CLEAR_BH
333
+ #undef ISSET_BH
334
+ #undef WORD_BH
335
+ #undef UNALIGNED_BH
336
+
337
+
338
+ /*---------------------------------------------*/
339
+ /*--- The main, O(N^2 log(N)) sorting ---*/
340
+ /*--- algorithm. Faster for "normal" ---*/
341
+ /*--- non-repetitive blocks. ---*/
342
+ /*---------------------------------------------*/
343
+
344
+ /*---------------------------------------------*/
345
+ static
346
+ __inline__
347
+ Bool mainGtU ( UInt32 i1,
348
+ UInt32 i2,
349
+ UChar* block,
350
+ UInt16* quadrant,
351
+ UInt32 nblock,
352
+ Int32* budget )
353
+ {
354
+ Int32 k;
355
+ UChar c1, c2;
356
+ UInt16 s1, s2;
357
+
358
+ AssertD ( i1 != i2, "mainGtU" );
359
+ /* 1 */
360
+ c1 = block[i1]; c2 = block[i2];
361
+ if (c1 != c2) return (c1 > c2);
362
+ i1++; i2++;
363
+ /* 2 */
364
+ c1 = block[i1]; c2 = block[i2];
365
+ if (c1 != c2) return (c1 > c2);
366
+ i1++; i2++;
367
+ /* 3 */
368
+ c1 = block[i1]; c2 = block[i2];
369
+ if (c1 != c2) return (c1 > c2);
370
+ i1++; i2++;
371
+ /* 4 */
372
+ c1 = block[i1]; c2 = block[i2];
373
+ if (c1 != c2) return (c1 > c2);
374
+ i1++; i2++;
375
+ /* 5 */
376
+ c1 = block[i1]; c2 = block[i2];
377
+ if (c1 != c2) return (c1 > c2);
378
+ i1++; i2++;
379
+ /* 6 */
380
+ c1 = block[i1]; c2 = block[i2];
381
+ if (c1 != c2) return (c1 > c2);
382
+ i1++; i2++;
383
+ /* 7 */
384
+ c1 = block[i1]; c2 = block[i2];
385
+ if (c1 != c2) return (c1 > c2);
386
+ i1++; i2++;
387
+ /* 8 */
388
+ c1 = block[i1]; c2 = block[i2];
389
+ if (c1 != c2) return (c1 > c2);
390
+ i1++; i2++;
391
+ /* 9 */
392
+ c1 = block[i1]; c2 = block[i2];
393
+ if (c1 != c2) return (c1 > c2);
394
+ i1++; i2++;
395
+ /* 10 */
396
+ c1 = block[i1]; c2 = block[i2];
397
+ if (c1 != c2) return (c1 > c2);
398
+ i1++; i2++;
399
+ /* 11 */
400
+ c1 = block[i1]; c2 = block[i2];
401
+ if (c1 != c2) return (c1 > c2);
402
+ i1++; i2++;
403
+ /* 12 */
404
+ c1 = block[i1]; c2 = block[i2];
405
+ if (c1 != c2) return (c1 > c2);
406
+ i1++; i2++;
407
+
408
+ k = nblock + 8;
409
+
410
+ do {
411
+ /* 1 */
412
+ c1 = block[i1]; c2 = block[i2];
413
+ if (c1 != c2) return (c1 > c2);
414
+ s1 = quadrant[i1]; s2 = quadrant[i2];
415
+ if (s1 != s2) return (s1 > s2);
416
+ i1++; i2++;
417
+ /* 2 */
418
+ c1 = block[i1]; c2 = block[i2];
419
+ if (c1 != c2) return (c1 > c2);
420
+ s1 = quadrant[i1]; s2 = quadrant[i2];
421
+ if (s1 != s2) return (s1 > s2);
422
+ i1++; i2++;
423
+ /* 3 */
424
+ c1 = block[i1]; c2 = block[i2];
425
+ if (c1 != c2) return (c1 > c2);
426
+ s1 = quadrant[i1]; s2 = quadrant[i2];
427
+ if (s1 != s2) return (s1 > s2);
428
+ i1++; i2++;
429
+ /* 4 */
430
+ c1 = block[i1]; c2 = block[i2];
431
+ if (c1 != c2) return (c1 > c2);
432
+ s1 = quadrant[i1]; s2 = quadrant[i2];
433
+ if (s1 != s2) return (s1 > s2);
434
+ i1++; i2++;
435
+ /* 5 */
436
+ c1 = block[i1]; c2 = block[i2];
437
+ if (c1 != c2) return (c1 > c2);
438
+ s1 = quadrant[i1]; s2 = quadrant[i2];
439
+ if (s1 != s2) return (s1 > s2);
440
+ i1++; i2++;
441
+ /* 6 */
442
+ c1 = block[i1]; c2 = block[i2];
443
+ if (c1 != c2) return (c1 > c2);
444
+ s1 = quadrant[i1]; s2 = quadrant[i2];
445
+ if (s1 != s2) return (s1 > s2);
446
+ i1++; i2++;
447
+ /* 7 */
448
+ c1 = block[i1]; c2 = block[i2];
449
+ if (c1 != c2) return (c1 > c2);
450
+ s1 = quadrant[i1]; s2 = quadrant[i2];
451
+ if (s1 != s2) return (s1 > s2);
452
+ i1++; i2++;
453
+ /* 8 */
454
+ c1 = block[i1]; c2 = block[i2];
455
+ if (c1 != c2) return (c1 > c2);
456
+ s1 = quadrant[i1]; s2 = quadrant[i2];
457
+ if (s1 != s2) return (s1 > s2);
458
+ i1++; i2++;
459
+
460
+ if (i1 >= nblock) i1 -= nblock;
461
+ if (i2 >= nblock) i2 -= nblock;
462
+
463
+ k -= 8;
464
+ (*budget)--;
465
+ }
466
+ while (k >= 0);
467
+
468
+ return False;
469
+ }
470
+
471
+
472
+ /*---------------------------------------------*/
473
+ /*--
474
+ Knuth's increments seem to work better
475
+ than Incerpi-Sedgewick here. Possibly
476
+ because the number of elems to sort is
477
+ usually small, typically <= 20.
478
+ --*/
479
+ static
480
+ Int32 incs[14] = { 1, 4, 13, 40, 121, 364, 1093, 3280,
481
+ 9841, 29524, 88573, 265720,
482
+ 797161, 2391484 };
483
+
484
+ static
485
+ void mainSimpleSort ( UInt32* ptr,
486
+ UChar* block,
487
+ UInt16* quadrant,
488
+ Int32 nblock,
489
+ Int32 lo,
490
+ Int32 hi,
491
+ Int32 d,
492
+ Int32* budget )
493
+ {
494
+ Int32 i, j, h, bigN, hp;
495
+ UInt32 v;
496
+
497
+ bigN = hi - lo + 1;
498
+ if (bigN < 2) return;
499
+
500
+ hp = 0;
501
+ while (incs[hp] < bigN) hp++;
502
+ hp--;
503
+
504
+ for (; hp >= 0; hp--) {
505
+ h = incs[hp];
506
+
507
+ i = lo + h;
508
+ while (True) {
509
+
510
+ /*-- copy 1 --*/
511
+ if (i > hi) break;
512
+ v = ptr[i];
513
+ j = i;
514
+ while ( mainGtU (
515
+ ptr[j-h]+d, v+d, block, quadrant, nblock, budget
516
+ ) ) {
517
+ ptr[j] = ptr[j-h];
518
+ j = j - h;
519
+ if (j <= (lo + h - 1)) break;
520
+ }
521
+ ptr[j] = v;
522
+ i++;
523
+
524
+ /*-- copy 2 --*/
525
+ if (i > hi) break;
526
+ v = ptr[i];
527
+ j = i;
528
+ while ( mainGtU (
529
+ ptr[j-h]+d, v+d, block, quadrant, nblock, budget
530
+ ) ) {
531
+ ptr[j] = ptr[j-h];
532
+ j = j - h;
533
+ if (j <= (lo + h - 1)) break;
534
+ }
535
+ ptr[j] = v;
536
+ i++;
537
+
538
+ /*-- copy 3 --*/
539
+ if (i > hi) break;
540
+ v = ptr[i];
541
+ j = i;
542
+ while ( mainGtU (
543
+ ptr[j-h]+d, v+d, block, quadrant, nblock, budget
544
+ ) ) {
545
+ ptr[j] = ptr[j-h];
546
+ j = j - h;
547
+ if (j <= (lo + h - 1)) break;
548
+ }
549
+ ptr[j] = v;
550
+ i++;
551
+
552
+ if (*budget < 0) return;
553
+ }
554
+ }
555
+ }
556
+
557
+
558
+ /*---------------------------------------------*/
559
+ /*--
560
+ The following is an implementation of
561
+ an elegant 3-way quicksort for strings,
562
+ described in a paper "Fast Algorithms for
563
+ Sorting and Searching Strings", by Robert
564
+ Sedgewick and Jon L. Bentley.
565
+ --*/
566
+
/*
   Swap helpers for the main quicksort.  Both are written as
   do { } while (0) so that a use followed by ';' is exactly one
   statement, and macro arguments are parenthesized.

   mswap  exchanges two lvalues through an Int32 temporary.
   mvswap exchanges the zzn entries of ptr starting at index zzp1
          with the zzn entries starting at index zzp2; it refers to
          the caller's `ptr` by name.
*/
#define mswap(zz1, zz2) \
   do { Int32 zztmp = (zz1); (zz1) = (zz2); (zz2) = zztmp; } while (0)

#define mvswap(zzp1, zzp2, zzn)       \
do {                                  \
   Int32 yyp1 = (zzp1);               \
   Int32 yyp2 = (zzp2);               \
   Int32 yyn  = (zzn);                \
   while (yyn > 0) {                  \
      mswap(ptr[yyp1], ptr[yyp2]);    \
      yyp1++; yyp2++; yyn--;          \
   }                                  \
} while (0)
580
+
581
+ static
582
+ __inline__
583
+ UChar mmed3 ( UChar a, UChar b, UChar c )
584
+ {
585
+ UChar t;
586
+ if (a > b) { t = a; a = b; b = t; };
587
+ if (b > c) {
588
+ b = c;
589
+ if (a > b) b = a;
590
+ }
591
+ return b;
592
+ }
593
+
594
+ #define mmin(a,b) ((a) < (b)) ? (a) : (b)
595
+
596
+ #define mpush(lz,hz,dz) { stackLo[sp] = lz; \
597
+ stackHi[sp] = hz; \
598
+ stackD [sp] = dz; \
599
+ sp++; }
600
+
601
+ #define mpop(lz,hz,dz) { sp--; \
602
+ lz = stackLo[sp]; \
603
+ hz = stackHi[sp]; \
604
+ dz = stackD [sp]; }
605
+
606
+
607
+ #define mnextsize(az) (nextHi[az]-nextLo[az])
608
+
609
+ #define mnextswap(az,bz) \
610
+ { Int32 tz; \
611
+ tz = nextLo[az]; nextLo[az] = nextLo[bz]; nextLo[bz] = tz; \
612
+ tz = nextHi[az]; nextHi[az] = nextHi[bz]; nextHi[bz] = tz; \
613
+ tz = nextD [az]; nextD [az] = nextD [bz]; nextD [bz] = tz; }
614
+
615
+
616
+ #define MAIN_QSORT_SMALL_THRESH 20
617
+ #define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT)
618
+ #define MAIN_QSORT_STACK_SIZE 100
619
+
620
+ static
621
+ void mainQSort3 ( UInt32* ptr,
622
+ UChar* block,
623
+ UInt16* quadrant,
624
+ Int32 nblock,
625
+ Int32 loSt,
626
+ Int32 hiSt,
627
+ Int32 dSt,
628
+ Int32* budget )
629
+ {
630
+ Int32 unLo, unHi, ltLo, gtHi, n, m, med;
631
+ Int32 sp, lo, hi, d;
632
+
633
+ Int32 stackLo[MAIN_QSORT_STACK_SIZE];
634
+ Int32 stackHi[MAIN_QSORT_STACK_SIZE];
635
+ Int32 stackD [MAIN_QSORT_STACK_SIZE];
636
+
637
+ Int32 nextLo[3];
638
+ Int32 nextHi[3];
639
+ Int32 nextD [3];
640
+
641
+ sp = 0;
642
+ mpush ( loSt, hiSt, dSt );
643
+
644
+ while (sp > 0) {
645
+
646
+ AssertH ( sp < MAIN_QSORT_STACK_SIZE - 2, 1001 );
647
+
648
+ mpop ( lo, hi, d );
649
+ if (hi - lo < MAIN_QSORT_SMALL_THRESH ||
650
+ d > MAIN_QSORT_DEPTH_THRESH) {
651
+ mainSimpleSort ( ptr, block, quadrant, nblock, lo, hi, d, budget );
652
+ if (*budget < 0) return;
653
+ continue;
654
+ }
655
+
656
+ med = (Int32)
657
+ mmed3 ( block[ptr[ lo ]+d],
658
+ block[ptr[ hi ]+d],
659
+ block[ptr[ (lo+hi)>>1 ]+d] );
660
+
661
+ unLo = ltLo = lo;
662
+ unHi = gtHi = hi;
663
+
664
+ while (True) {
665
+ while (True) {
666
+ if (unLo > unHi) break;
667
+ n = ((Int32)block[ptr[unLo]+d]) - med;
668
+ if (n == 0) {
669
+ mswap(ptr[unLo], ptr[ltLo]);
670
+ ltLo++; unLo++; continue;
671
+ };
672
+ if (n > 0) break;
673
+ unLo++;
674
+ }
675
+ while (True) {
676
+ if (unLo > unHi) break;
677
+ n = ((Int32)block[ptr[unHi]+d]) - med;
678
+ if (n == 0) {
679
+ mswap(ptr[unHi], ptr[gtHi]);
680
+ gtHi--; unHi--; continue;
681
+ };
682
+ if (n < 0) break;
683
+ unHi--;
684
+ }
685
+ if (unLo > unHi) break;
686
+ mswap(ptr[unLo], ptr[unHi]); unLo++; unHi--;
687
+ }
688
+
689
+ AssertD ( unHi == unLo-1, "mainQSort3(2)" );
690
+
691
+ if (gtHi < ltLo) {
692
+ mpush(lo, hi, d+1 );
693
+ continue;
694
+ }
695
+
696
+ n = mmin(ltLo-lo, unLo-ltLo); mvswap(lo, unLo-n, n);
697
+ m = mmin(hi-gtHi, gtHi-unHi); mvswap(unLo, hi-m+1, m);
698
+
699
+ n = lo + unLo - ltLo - 1;
700
+ m = hi - (gtHi - unHi) + 1;
701
+
702
+ nextLo[0] = lo; nextHi[0] = n; nextD[0] = d;
703
+ nextLo[1] = m; nextHi[1] = hi; nextD[1] = d;
704
+ nextLo[2] = n+1; nextHi[2] = m-1; nextD[2] = d+1;
705
+
706
+ if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
707
+ if (mnextsize(1) < mnextsize(2)) mnextswap(1,2);
708
+ if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
709
+
710
+ AssertD (mnextsize(0) >= mnextsize(1), "mainQSort3(8)" );
711
+ AssertD (mnextsize(1) >= mnextsize(2), "mainQSort3(9)" );
712
+
713
+ mpush (nextLo[0], nextHi[0], nextD[0]);
714
+ mpush (nextLo[1], nextHi[1], nextD[1]);
715
+ mpush (nextLo[2], nextHi[2], nextD[2]);
716
+ }
717
+ }
718
+
719
+ #undef mswap
720
+ #undef mvswap
721
+ #undef mpush
722
+ #undef mpop
723
+ #undef mmin
724
+ #undef mnextsize
725
+ #undef mnextswap
726
+ #undef MAIN_QSORT_SMALL_THRESH
727
+ #undef MAIN_QSORT_DEPTH_THRESH
728
+ #undef MAIN_QSORT_STACK_SIZE
729
+
730
+
731
+ /*---------------------------------------------*/
732
+ /* Pre:
733
+ nblock > N_OVERSHOOT
734
+ block32 exists for [0 .. nblock-1 +N_OVERSHOOT]
735
+ ((UChar*)block32) [0 .. nblock-1] holds block
736
+ ptr exists for [0 .. nblock-1]
737
+
738
+ Post:
739
+ ((UChar*)block32) [0 .. nblock-1] holds block
740
+ All other areas of block32 destroyed
741
+ ftab [0 .. 65536 ] destroyed
742
+ ptr [0 .. nblock-1] holds sorted order
743
+ if (*budget < 0), sorting was abandoned
744
+ */
745
+
746
+ #define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8])
747
+ #define SETMASK (1 << 21)
748
+ #define CLEARMASK (~(SETMASK))
749
+
750
+ static
751
+ void mainSort ( UInt32* ptr,
752
+ UChar* block,
753
+ UInt16* quadrant,
754
+ UInt32* ftab,
755
+ Int32 nblock,
756
+ Int32 verb,
757
+ Int32* budget )
758
+ {
759
+ Int32 i, j, k, ss, sb;
760
+ Int32 runningOrder[256];
761
+ Bool bigDone[256];
762
+ Int32 copyStart[256];
763
+ Int32 copyEnd [256];
764
+ UChar c1;
765
+ Int32 numQSorted;
766
+ UInt16 s;
767
+ if (verb >= 4) VPrintf0 ( " main sort initialise ...\n" );
768
+
769
+ /*-- set up the 2-byte frequency table --*/
770
+ for (i = 65536; i >= 0; i--) ftab[i] = 0;
771
+
772
+ j = block[0] << 8;
773
+ i = nblock-1;
774
+ for (; i >= 3; i -= 4) {
775
+ quadrant[i] = 0;
776
+ j = (j >> 8) | ( ((UInt16)block[i]) << 8);
777
+ ftab[j]++;
778
+ quadrant[i-1] = 0;
779
+ j = (j >> 8) | ( ((UInt16)block[i-1]) << 8);
780
+ ftab[j]++;
781
+ quadrant[i-2] = 0;
782
+ j = (j >> 8) | ( ((UInt16)block[i-2]) << 8);
783
+ ftab[j]++;
784
+ quadrant[i-3] = 0;
785
+ j = (j >> 8) | ( ((UInt16)block[i-3]) << 8);
786
+ ftab[j]++;
787
+ }
788
+ for (; i >= 0; i--) {
789
+ quadrant[i] = 0;
790
+ j = (j >> 8) | ( ((UInt16)block[i]) << 8);
791
+ ftab[j]++;
792
+ }
793
+
794
+ /*-- (emphasises close relationship of block & quadrant) --*/
795
+ for (i = 0; i < BZ_N_OVERSHOOT; i++) {
796
+ block [nblock+i] = block[i];
797
+ quadrant[nblock+i] = 0;
798
+ }
799
+
800
+ if (verb >= 4) VPrintf0 ( " bucket sorting ...\n" );
801
+
802
+ /*-- Complete the initial radix sort --*/
803
+ for (i = 1; i <= 65536; i++) ftab[i] += ftab[i-1];
804
+
805
+ s = block[0] << 8;
806
+ i = nblock-1;
807
+ for (; i >= 3; i -= 4) {
808
+ s = (s >> 8) | (block[i] << 8);
809
+ j = ftab[s] -1;
810
+ ftab[s] = j;
811
+ ptr[j] = i;
812
+ s = (s >> 8) | (block[i-1] << 8);
813
+ j = ftab[s] -1;
814
+ ftab[s] = j;
815
+ ptr[j] = i-1;
816
+ s = (s >> 8) | (block[i-2] << 8);
817
+ j = ftab[s] -1;
818
+ ftab[s] = j;
819
+ ptr[j] = i-2;
820
+ s = (s >> 8) | (block[i-3] << 8);
821
+ j = ftab[s] -1;
822
+ ftab[s] = j;
823
+ ptr[j] = i-3;
824
+ }
825
+ for (; i >= 0; i--) {
826
+ s = (s >> 8) | (block[i] << 8);
827
+ j = ftab[s] -1;
828
+ ftab[s] = j;
829
+ ptr[j] = i;
830
+ }
831
+
832
+ /*--
833
+ Now ftab contains the first loc of every small bucket.
834
+ Calculate the running order, from smallest to largest
835
+ big bucket.
836
+ --*/
837
+ for (i = 0; i <= 255; i++) {
838
+ bigDone [i] = False;
839
+ runningOrder[i] = i;
840
+ }
841
+
842
+ {
843
+ Int32 vv;
844
+ Int32 h = 1;
845
+ do h = 3 * h + 1; while (h <= 256);
846
+ do {
847
+ h = h / 3;
848
+ for (i = h; i <= 255; i++) {
849
+ vv = runningOrder[i];
850
+ j = i;
851
+ while ( BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv) ) {
852
+ runningOrder[j] = runningOrder[j-h];
853
+ j = j - h;
854
+ if (j <= (h - 1)) goto zero;
855
+ }
856
+ zero:
857
+ runningOrder[j] = vv;
858
+ }
859
+ } while (h != 1);
860
+ }
861
+
862
+ /*--
863
+ The main sorting loop.
864
+ --*/
865
+
866
+ numQSorted = 0;
867
+
868
+ for (i = 0; i <= 255; i++) {
869
+
870
+ /*--
871
+ Process big buckets, starting with the least full.
872
+ Basically this is a 3-step process in which we call
873
+ mainQSort3 to sort the small buckets [ss, j], but
874
+ also make a big effort to avoid the calls if we can.
875
+ --*/
876
+ ss = runningOrder[i];
877
+
878
+ /*--
879
+ Step 1:
880
+ Complete the big bucket [ss] by quicksorting
881
+ any unsorted small buckets [ss, j], for j != ss.
882
+ Hopefully previous pointer-scanning phases have already
883
+ completed many of the small buckets [ss, j], so
884
+ we don't have to sort them at all.
885
+ --*/
886
+ for (j = 0; j <= 255; j++) {
887
+ if (j != ss) {
888
+ sb = (ss << 8) + j;
889
+ if ( ! (ftab[sb] & SETMASK) ) {
890
+ Int32 lo = ftab[sb] & CLEARMASK;
891
+ Int32 hi = (ftab[sb+1] & CLEARMASK) - 1;
892
+ if (hi > lo) {
893
+ if (verb >= 4)
894
+ VPrintf4 ( " qsort [0x%x, 0x%x] "
895
+ "done %d this %d\n",
896
+ ss, j, numQSorted, hi - lo + 1 );
897
+ mainQSort3 (
898
+ ptr, block, quadrant, nblock,
899
+ lo, hi, BZ_N_RADIX, budget
900
+ );
901
+ numQSorted += (hi - lo + 1);
902
+ if (*budget < 0) return;
903
+ }
904
+ }
905
+ ftab[sb] |= SETMASK;
906
+ }
907
+ }
908
+
909
+ AssertH ( !bigDone[ss], 1006 );
910
+
911
+ /*--
912
+ Step 2:
913
+ Now scan this big bucket [ss] so as to synthesise the
914
+ sorted order for small buckets [t, ss] for all t,
915
+ including, magically, the bucket [ss,ss] too.
916
+ This will avoid doing Real Work in subsequent Step 1's.
917
+ --*/
918
+ {
919
+ for (j = 0; j <= 255; j++) {
920
+ copyStart[j] = ftab[(j << 8) + ss] & CLEARMASK;
921
+ copyEnd [j] = (ftab[(j << 8) + ss + 1] & CLEARMASK) - 1;
922
+ }
923
+ for (j = ftab[ss << 8] & CLEARMASK; j < copyStart[ss]; j++) {
924
+ k = ptr[j]-1; if (k < 0) k += nblock;
925
+ c1 = block[k];
926
+ if (!bigDone[c1])
927
+ ptr[ copyStart[c1]++ ] = k;
928
+ }
929
+ for (j = (ftab[(ss+1) << 8] & CLEARMASK) - 1; j > copyEnd[ss]; j--) {
930
+ k = ptr[j]-1; if (k < 0) k += nblock;
931
+ c1 = block[k];
932
+ if (!bigDone[c1])
933
+ ptr[ copyEnd[c1]-- ] = k;
934
+ }
935
+ }
936
+
937
+ AssertH ( (copyStart[ss]-1 == copyEnd[ss])
938
+ ||
939
+ /* Extremely rare case missing in bzip2-1.0.0 and 1.0.1.
940
+ Necessity for this case is demonstrated by compressing
941
+ a sequence of approximately 48.5 million of character
942
+ 251; 1.0.0/1.0.1 will then die here. */
943
+ (copyStart[ss] == 0 && copyEnd[ss] == nblock-1),
944
+ 1007 )
945
+
946
+ for (j = 0; j <= 255; j++) ftab[(j << 8) + ss] |= SETMASK;
947
+
948
+ /*--
949
+ Step 3:
950
+ The [ss] big bucket is now done. Record this fact,
951
+ and update the quadrant descriptors. Remember to
952
+ update quadrants in the overshoot area too, if
953
+ necessary. The "if (i < 255)" test merely skips
954
+ this updating for the last bucket processed, since
955
+ updating for the last bucket is pointless.
956
+
957
+ The quadrant array provides a way to incrementally
958
+ cache sort orderings, as they appear, so as to
959
+ make subsequent comparisons in fullGtU() complete
960
+ faster. For repetitive blocks this makes a big
961
+ difference (but not big enough to be able to avoid
962
+ the fallback sorting mechanism, exponential radix sort).
963
+
964
+ The precise meaning is: at all times:
965
+
966
+ for 0 <= i < nblock and 0 <= j <= nblock
967
+
968
+ if block[i] != block[j],
969
+
970
+ then the relative values of quadrant[i] and
971
+ quadrant[j] are meaningless.
972
+
973
+ else {
974
+ if quadrant[i] < quadrant[j]
975
+ then the string starting at i lexicographically
976
+ precedes the string starting at j
977
+
978
+ else if quadrant[i] > quadrant[j]
979
+ then the string starting at j lexicographically
980
+ precedes the string starting at i
981
+
982
+ else
983
+ the relative ordering of the strings starting
984
+ at i and j has not yet been determined.
985
+ }
986
+ --*/
987
+ bigDone[ss] = True;
988
+
989
+ if (i < 255) {
990
+ Int32 bbStart = ftab[ss << 8] & CLEARMASK;
991
+ Int32 bbSize = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart;
992
+ Int32 shifts = 0;
993
+
994
+ while ((bbSize >> shifts) > 65534) shifts++;
995
+
996
+ for (j = bbSize-1; j >= 0; j--) {
997
+ Int32 a2update = ptr[bbStart + j];
998
+ UInt16 qVal = (UInt16)(j >> shifts);
999
+ quadrant[a2update] = qVal;
1000
+ if (a2update < BZ_N_OVERSHOOT)
1001
+ quadrant[a2update + nblock] = qVal;
1002
+ }
1003
+ AssertH ( ((bbSize-1) >> shifts) <= 65535, 1002 );
1004
+ }
1005
+
1006
+ }
1007
+
1008
+ if (verb >= 4)
1009
+ VPrintf3 ( " %d pointers, %d sorted, %d scanned\n",
1010
+ nblock, numQSorted, nblock - numQSorted );
1011
+ }
1012
+
1013
+ #undef BIGFREQ
1014
+ #undef SETMASK
1015
+ #undef CLEARMASK
1016
+
1017
+
1018
+ /*---------------------------------------------*/
1019
+ /* Pre:
1020
+ nblock > 0
1021
+ arr2 exists for [0 .. nblock-1 +N_OVERSHOOT]
1022
+ ((UChar*)arr2) [0 .. nblock-1] holds block
1023
+ arr1 exists for [0 .. nblock-1]
1024
+
1025
+ Post:
1026
+ ((UChar*)arr2) [0 .. nblock-1] holds block
1027
+ All other areas of block destroyed
1028
+ ftab [ 0 .. 65536 ] destroyed
1029
+ arr1 [0 .. nblock-1] holds sorted order
1030
+ */
1031
+ void BZ2_blockSort ( EState* s )
1032
+ {
1033
+ UInt32* ptr = s->ptr;
1034
+ UChar* block = s->block;
1035
+ UInt32* ftab = s->ftab;
1036
+ Int32 nblock = s->nblock;
1037
+ Int32 verb = s->verbosity;
1038
+ Int32 wfact = s->workFactor;
1039
+ UInt16* quadrant;
1040
+ Int32 budget;
1041
+ Int32 budgetInit;
1042
+ Int32 i;
1043
+
1044
+ if (nblock < 10000) {
1045
+ fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
1046
+ } else {
1047
+ /* Calculate the location for quadrant, remembering to get
1048
+ the alignment right. Assumes that &(block[0]) is at least
1049
+ 2-byte aligned -- this should be ok since block is really
1050
+ the first section of arr2.
1051
+ */
1052
+ i = nblock+BZ_N_OVERSHOOT;
1053
+ if (i & 1) i++;
1054
+ quadrant = (UInt16*)(&(block[i]));
1055
+
1056
+ /* (wfact-1) / 3 puts the default-factor-30
1057
+ transition point at very roughly the same place as
1058
+ with v0.1 and v0.9.0.
1059
+ Not that it particularly matters any more, since the
1060
+ resulting compressed stream is now the same regardless
1061
+ of whether or not we use the main sort or fallback sort.
1062
+ */
1063
+ if (wfact < 1 ) wfact = 1;
1064
+ if (wfact > 100) wfact = 100;
1065
+ budgetInit = nblock * ((wfact-1) / 3);
1066
+ budget = budgetInit;
1067
+
1068
+ mainSort ( ptr, block, quadrant, ftab, nblock, verb, &budget );
1069
+ if (verb >= 3)
1070
+ VPrintf3 ( " %d work, %d block, ratio %5.2f\n",
1071
+ budgetInit - budget,
1072
+ nblock,
1073
+ (float)(budgetInit - budget) /
1074
+ (float)(nblock==0 ? 1 : nblock) );
1075
+ if (budget < 0) {
1076
+ if (verb >= 2)
1077
+ VPrintf0 ( " too repetitive; using fallback"
1078
+ " sorting algorithm\n" );
1079
+ fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
1080
+ }
1081
+ }
1082
+
1083
+ s->origPtr = -1;
1084
+ for (i = 0; i < s->nblock; i++)
1085
+ if (ptr[i] == 0)
1086
+ { s->origPtr = i; break; };
1087
+
1088
+ AssertH( s->origPtr != -1, 1003 );
1089
+ }
1090
+
1091
+
1092
+ /*-------------------------------------------------------------*/
1093
+ /*--- end blocksort.c ---*/
1094
+ /*-------------------------------------------------------------*/