isomorfeus-ferret 0.12.7 → 0.13.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (164) hide show
  1. checksums.yaml +4 -4
  2. data/LICENSE +101 -19
  3. data/README.md +54 -1
  4. data/ext/isomorfeus_ferret_ext/bm_bitvector.c +22 -30
  5. data/ext/isomorfeus_ferret_ext/bm_hash.c +6 -12
  6. data/ext/isomorfeus_ferret_ext/bm_micro_string.c +3 -6
  7. data/ext/isomorfeus_ferret_ext/bm_store.c +11 -22
  8. data/ext/isomorfeus_ferret_ext/brotli_common_dictionary.c +1 -1
  9. data/ext/isomorfeus_ferret_ext/brotli_dec_decode.c +1 -1
  10. data/ext/isomorfeus_ferret_ext/bzip_blocksort.c +1094 -0
  11. data/ext/isomorfeus_ferret_ext/bzip_huffman.c +205 -0
  12. data/ext/isomorfeus_ferret_ext/bzlib.c +1572 -0
  13. data/ext/isomorfeus_ferret_ext/bzlib.h +282 -0
  14. data/ext/isomorfeus_ferret_ext/bzlib_compress.c +672 -0
  15. data/ext/isomorfeus_ferret_ext/bzlib_crctable.c +104 -0
  16. data/ext/isomorfeus_ferret_ext/bzlib_decompress.c +652 -0
  17. data/ext/isomorfeus_ferret_ext/bzlib_private.h +509 -0
  18. data/ext/isomorfeus_ferret_ext/bzlib_randtable.c +84 -0
  19. data/ext/isomorfeus_ferret_ext/fio_tmpfile.h +53 -53
  20. data/ext/isomorfeus_ferret_ext/frb_analysis.c +785 -1192
  21. data/ext/isomorfeus_ferret_ext/frb_index.c +492 -474
  22. data/ext/isomorfeus_ferret_ext/frb_qparser.c +48 -60
  23. data/ext/isomorfeus_ferret_ext/frb_search.c +1520 -1002
  24. data/ext/isomorfeus_ferret_ext/frb_store.c +96 -96
  25. data/ext/isomorfeus_ferret_ext/frb_threading.h +0 -1
  26. data/ext/isomorfeus_ferret_ext/frb_utils.c +147 -196
  27. data/ext/isomorfeus_ferret_ext/frt_analysis.c +695 -1090
  28. data/ext/isomorfeus_ferret_ext/frt_analysis.h +174 -170
  29. data/ext/isomorfeus_ferret_ext/frt_array.c +2 -4
  30. data/ext/isomorfeus_ferret_ext/frt_bitvector.c +9 -16
  31. data/ext/isomorfeus_ferret_ext/frt_bitvector.h +32 -81
  32. data/ext/isomorfeus_ferret_ext/frt_document.c +15 -20
  33. data/ext/isomorfeus_ferret_ext/frt_document.h +10 -10
  34. data/ext/isomorfeus_ferret_ext/frt_except.c +5 -12
  35. data/ext/isomorfeus_ferret_ext/frt_field_index.c +3 -3
  36. data/ext/isomorfeus_ferret_ext/frt_field_index.h +6 -7
  37. data/ext/isomorfeus_ferret_ext/frt_filter.c +35 -46
  38. data/ext/isomorfeus_ferret_ext/frt_fs_store.c +1 -0
  39. data/ext/isomorfeus_ferret_ext/frt_global.c +105 -63
  40. data/ext/isomorfeus_ferret_ext/frt_global.h +7 -3
  41. data/ext/isomorfeus_ferret_ext/frt_hash.c +1 -2
  42. data/ext/isomorfeus_ferret_ext/frt_ind.c +32 -35
  43. data/ext/isomorfeus_ferret_ext/frt_ind.h +9 -9
  44. data/ext/isomorfeus_ferret_ext/frt_index.c +580 -399
  45. data/ext/isomorfeus_ferret_ext/frt_index.h +272 -291
  46. data/ext/isomorfeus_ferret_ext/frt_mempool.c +1 -2
  47. data/ext/isomorfeus_ferret_ext/frt_multimapper.c +4 -7
  48. data/ext/isomorfeus_ferret_ext/frt_q_boolean.c +67 -91
  49. data/ext/isomorfeus_ferret_ext/frt_q_const_score.c +35 -38
  50. data/ext/isomorfeus_ferret_ext/frt_q_filtered_query.c +53 -72
  51. data/ext/isomorfeus_ferret_ext/frt_q_fuzzy.c +25 -32
  52. data/ext/isomorfeus_ferret_ext/frt_q_match_all.c +21 -23
  53. data/ext/isomorfeus_ferret_ext/frt_q_multi_term.c +66 -103
  54. data/ext/isomorfeus_ferret_ext/frt_q_parser.c +207 -195
  55. data/ext/isomorfeus_ferret_ext/frt_q_phrase.c +20 -16
  56. data/ext/isomorfeus_ferret_ext/frt_q_prefix.c +17 -14
  57. data/ext/isomorfeus_ferret_ext/frt_q_range.c +102 -131
  58. data/ext/isomorfeus_ferret_ext/frt_q_span.c +179 -178
  59. data/ext/isomorfeus_ferret_ext/frt_q_term.c +47 -60
  60. data/ext/isomorfeus_ferret_ext/frt_q_wildcard.c +18 -16
  61. data/ext/isomorfeus_ferret_ext/frt_ram_store.c +45 -84
  62. data/ext/isomorfeus_ferret_ext/frt_search.c +105 -146
  63. data/ext/isomorfeus_ferret_ext/frt_search.h +331 -320
  64. data/ext/isomorfeus_ferret_ext/frt_similarity.c +5 -13
  65. data/ext/isomorfeus_ferret_ext/frt_similarity.h +7 -12
  66. data/ext/isomorfeus_ferret_ext/frt_sort.c +105 -149
  67. data/ext/isomorfeus_ferret_ext/frt_store.c +13 -7
  68. data/ext/isomorfeus_ferret_ext/frt_store.h +10 -2
  69. data/ext/isomorfeus_ferret_ext/frt_threading.h +0 -1
  70. data/ext/isomorfeus_ferret_ext/isomorfeus_ferret.c +21 -109
  71. data/ext/isomorfeus_ferret_ext/isomorfeus_ferret.h +2 -32
  72. data/ext/isomorfeus_ferret_ext/lz4.c +2495 -0
  73. data/ext/isomorfeus_ferret_ext/lz4.h +774 -0
  74. data/ext/isomorfeus_ferret_ext/lz4frame.c +1899 -0
  75. data/ext/isomorfeus_ferret_ext/lz4frame.h +623 -0
  76. data/ext/isomorfeus_ferret_ext/lz4hc.c +1615 -0
  77. data/ext/isomorfeus_ferret_ext/lz4hc.h +413 -0
  78. data/ext/isomorfeus_ferret_ext/lz4xxhash.c +1030 -0
  79. data/ext/isomorfeus_ferret_ext/lz4xxhash.h +328 -0
  80. data/ext/isomorfeus_ferret_ext/stem_modules.h +0 -86
  81. data/ext/isomorfeus_ferret_ext/test.c +1 -2
  82. data/ext/isomorfeus_ferret_ext/test_1710.c +11 -12
  83. data/ext/isomorfeus_ferret_ext/test_analysis.c +590 -583
  84. data/ext/isomorfeus_ferret_ext/test_compound_io.c +1 -1
  85. data/ext/isomorfeus_ferret_ext/test_document.c +19 -15
  86. data/ext/isomorfeus_ferret_ext/test_except.c +1 -2
  87. data/ext/isomorfeus_ferret_ext/test_fields.c +59 -60
  88. data/ext/isomorfeus_ferret_ext/test_file_deleter.c +10 -27
  89. data/ext/isomorfeus_ferret_ext/test_filter.c +11 -8
  90. data/ext/isomorfeus_ferret_ext/test_hash.c +2 -2
  91. data/ext/isomorfeus_ferret_ext/test_hashset.c +1 -1
  92. data/ext/isomorfeus_ferret_ext/test_highlighter.c +15 -11
  93. data/ext/isomorfeus_ferret_ext/test_index.c +372 -365
  94. data/ext/isomorfeus_ferret_ext/test_q_const_score.c +5 -3
  95. data/ext/isomorfeus_ferret_ext/test_q_filtered.c +5 -3
  96. data/ext/isomorfeus_ferret_ext/test_q_fuzzy.c +13 -10
  97. data/ext/isomorfeus_ferret_ext/test_q_parser.c +45 -7
  98. data/ext/isomorfeus_ferret_ext/test_q_span.c +15 -12
  99. data/ext/isomorfeus_ferret_ext/test_ram_store.c +3 -3
  100. data/ext/isomorfeus_ferret_ext/test_search.c +60 -62
  101. data/ext/isomorfeus_ferret_ext/test_segments.c +5 -4
  102. data/ext/isomorfeus_ferret_ext/test_sort.c +17 -14
  103. data/ext/isomorfeus_ferret_ext/test_store.c +2 -0
  104. data/ext/isomorfeus_ferret_ext/test_term.c +3 -1
  105. data/ext/isomorfeus_ferret_ext/test_term_vectors.c +9 -10
  106. data/ext/isomorfeus_ferret_ext/test_test.c +1 -2
  107. data/ext/isomorfeus_ferret_ext/test_threading.c +9 -10
  108. data/ext/isomorfeus_ferret_ext/testhelper.c +1 -2
  109. data/lib/isomorfeus/ferret/version.rb +1 -1
  110. metadata +27 -57
  111. data/ext/isomorfeus_ferret_ext/email.rl +0 -21
  112. data/ext/isomorfeus_ferret_ext/frt_scanner.c +0 -900
  113. data/ext/isomorfeus_ferret_ext/frt_scanner.h +0 -28
  114. data/ext/isomorfeus_ferret_ext/frt_scanner_mb.c +0 -6706
  115. data/ext/isomorfeus_ferret_ext/frt_scanner_utf8.c +0 -4420
  116. data/ext/isomorfeus_ferret_ext/scanner.h +0 -28
  117. data/ext/isomorfeus_ferret_ext/scanner.in +0 -43
  118. data/ext/isomorfeus_ferret_ext/scanner.rl +0 -84
  119. data/ext/isomorfeus_ferret_ext/scanner_mb.rl +0 -200
  120. data/ext/isomorfeus_ferret_ext/scanner_utf8.rl +0 -85
  121. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_basque.c +0 -1167
  122. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_basque.h +0 -6
  123. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_catalan.c +0 -1433
  124. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_catalan.h +0 -6
  125. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.c +0 -301
  126. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.h +0 -6
  127. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.c +0 -590
  128. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.h +0 -6
  129. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.c +0 -1049
  130. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.h +0 -6
  131. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.c +0 -705
  132. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.h +0 -6
  133. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.c +0 -1239
  134. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.h +0 -6
  135. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.c +0 -477
  136. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.h +0 -6
  137. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.c +0 -1217
  138. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.h +0 -7
  139. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_indonesian.c +0 -394
  140. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_indonesian.h +0 -6
  141. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_irish.c +0 -457
  142. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_irish.h +0 -6
  143. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.c +0 -1009
  144. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.h +0 -6
  145. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.c +0 -259
  146. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.h +0 -6
  147. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.c +0 -704
  148. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.h +0 -6
  149. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.c +0 -948
  150. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.h +0 -6
  151. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.c +0 -1028
  152. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.h +0 -6
  153. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.c +0 -275
  154. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.h +0 -6
  155. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_hungarian.c +0 -849
  156. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_hungarian.h +0 -6
  157. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.c +0 -952
  158. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.h +0 -6
  159. data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.c +0 -669
  160. data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.h +0 -6
  161. data/ext/isomorfeus_ferret_ext/stem_modules.txt +0 -63
  162. data/ext/isomorfeus_ferret_ext/uchar-ucs4.rl +0 -1854
  163. data/ext/isomorfeus_ferret_ext/uchar-utf8.rl +0 -1999
  164. data/ext/isomorfeus_ferret_ext/url.rl +0 -27
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 438b0c625088191ac9dd620da3612a34c72c6b314da3ff87722272aeff39e2fa
4
- data.tar.gz: a984ec0934467dad13687f84c4e9ff06c7ee1338d7928408772e423bf54b32b2
3
+ metadata.gz: be4c84d556459a8ed5d2585068378c156bdb2d68507ca2786844b2c69a4e7f35
4
+ data.tar.gz: '0096ef29b274ea39567e876d95d8441ff80589a8d3403c5aed3801c62377cffd'
5
5
  SHA512:
6
- metadata.gz: 3667dddba793f9b62215a6c7e40cc55f395d586f886853cc11f7b35a341a90ec63c542a176277720130d269ae6d86dc38cb235ea4a5ac9a0414ceaf07ff7d295
7
- data.tar.gz: ea56a056a43c7041c7bb4c8729557343b9c23d7ac7d011792ed31ec152bfcc1e06327cddc8ec5953b28d8774caaba01ede0262bea13a59e57a4459f3143040f3
6
+ metadata.gz: 55aa9f39fd4971e1a80bbcdeba14906928140fd023a7ee3e581ef2bafc63a8a664f5b0fbaf86dec1119ac62fd441703d33a4b6ee3731f811736a1b2eedcef9a3
7
+ data.tar.gz: 3843bd29450fb925069733913aee9f66958dfe41aba052d7d14a18cb7f2d4df376e0756d7c97bcd01527120a6a77776d9bee2a1c0a488648fc61f500dfe6e98e
data/LICENSE CHANGED
@@ -28,7 +28,86 @@ Copyright (c) 2005-2006 David Balmain
28
28
  MIT License as above
29
29
 
30
30
 
31
- stemmer files in ext/isomorfeus_ferret originally taken from https://snowballstem.org/:
31
+ brotli_* files in ext/isomorfeus_ferret originally taken from https://github.com/google/brotli:
32
+
33
+ Copyright (c) 2009, 2010, 2013-2016 by the Brotli Authors.
34
+
35
+ MIT License as above
36
+
37
+
38
+ bzlib* files in ext/isomorfeus_ferret originally taken from git://sourceware.org/git/bzip2.git:
39
+
40
+ This program, "bzip2", the associated library "libbzip2", and all
41
+ documentation, are copyright (C) 1996-2019 Julian R Seward. All
42
+ rights reserved.
43
+
44
+ Redistribution and use in source and binary forms, with or without
45
+ modification, are permitted provided that the following conditions
46
+ are met:
47
+
48
+ 1. Redistributions of source code must retain the above copyright
49
+ notice, this list of conditions and the following disclaimer.
50
+
51
+ 2. The origin of this software must not be misrepresented; you must
52
+ not claim that you wrote the original software. If you use this
53
+ software in a product, an acknowledgment in the product
54
+ documentation would be appreciated but is not required.
55
+
56
+ 3. Altered source versions must be plainly marked as such, and must
57
+ not be misrepresented as being the original software.
58
+
59
+ 4. The name of the author may not be used to endorse or promote
60
+ products derived from this software without specific prior written
61
+ permission.
62
+
63
+ THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
64
+ OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
65
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
66
+ ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
67
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
68
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
69
+ GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
70
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
71
+ WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
72
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
73
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
74
+
75
+ Julian Seward, jseward@acm.org
76
+ bzip2/libbzip2 version 1.0.8 of 13 July 2019
77
+
78
+
79
+ lz4* files in ext/isomorfeus_ferret originally taken from https://github.com/lz4/lz4/tree/dev/lib:
80
+
81
+ Copyright (C) 2011-2020, Yann Collet.
82
+
83
+ Redistribution and use in source and binary forms, with or without
84
+ modification, are permitted provided that the following conditions are met:
85
+
86
+ * Redistributions of source code must retain the above copyright
87
+ notice, this list of conditions and the following disclaimer.
88
+ * Redistributions in binary form must reproduce the above
89
+ copyright notice, this list of conditions and the following disclaimer
90
+ in the documentation and/or other materials provided with the
91
+ distribution.
92
+
93
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
94
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
95
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
96
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
97
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
98
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
99
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
100
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
101
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
102
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
103
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
104
+
105
+ You can contact the author at:
106
+ - LZ4 homepage : http://www.lz4.org
107
+ - LZ4 source repository : https://github.com/lz4/lz4
108
+
109
+
110
+ stem* and libstemmer* files in ext/isomorfeus_ferret originally taken from https://snowballstem.org/:
32
111
 
33
112
  Copyright (c) 2001, Dr Martin Porter
34
113
  Copyright (c) 2004,2005, Richard Boulton
@@ -37,8 +116,7 @@ Copyright (c) 2006,2007,2009,2010,2011,2014-2019, Olly Betts
37
116
  All rights reserved.
38
117
 
39
118
  Redistribution and use in source and binary forms, with or without
40
- modification, are permitted provided that the following conditions
41
- are met:
119
+ modification, are permitted provided that the following conditions are met:
42
120
 
43
121
  1. Redistributions of source code must retain the above copyright notice,
44
122
  this list of conditions and the following disclaimer.
@@ -68,23 +146,27 @@ Copyright (C) 1993-2013 Yukihiro Matsumoto. All rights reserved.
68
146
  Redistribution and use in source and binary forms, with or without
69
147
  modification, are permitted provided that the following conditions
70
148
  are met:
71
- 1. Redistributions of source code must retain the above copyright
72
- notice, this list of conditions and the following disclaimer.
73
- 2. Redistributions in binary form must reproduce the above copyright
74
- notice, this list of conditions and the following disclaimer in the
75
- documentation and/or other materials provided with the distribution.
76
149
 
77
- THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
78
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
79
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
80
- ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
81
- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
82
- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
83
- OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
84
- HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
85
- LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
86
- OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
87
- SUCH DAMAGE.
150
+ 1. Redistributions of source code must retain the above copyright notice,
151
+ this list of conditions and the following disclaimer.
152
+ 2. Redistributions in binary form must reproduce the above copyright notice,
153
+ this list of conditions and the following disclaimer in the documentation
154
+ and/or other materials provided with the distribution.
155
+
156
+ THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
157
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
158
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
159
+ DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
160
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
161
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
162
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
163
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
164
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
165
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
166
+
167
+
168
+ The following licenses apply to files, which are distributed within the repo
169
+ but not distributed with the gem and not used at runtime:
88
170
 
89
171
 
90
172
  For the Reuter-21578 files in the misc/ferret_vs_lucene directory (corpus, etc.),
data/README.md CHANGED
@@ -16,7 +16,59 @@ During revival many things havbe been fixed, now all tests pass, no crashes and
16
16
  successfully compiles and runs with rubys >3. Its no longer a goal to have
17
17
  a c library available, but instead the usage is meant as ruby gem with a c extension only.
18
18
 
19
- It should work on *nixes and *nuxes and also works on Windows.
19
+ It should work on *nixes, *nuxes, *BSDs and also works on Windows.
20
+
21
+ ## Improvements and Changes in Version 0.13
22
+
23
+ ### Breaking
24
+
25
+ - For version 0.13 the index file format has changed and is no longer compatible with previous versions. Indexes of older versions must be recreated with 0.13 (export all data from and with the previous version, import all data with 0.13)
26
+ - The :store option no longer accepts :compress; compression must now be specified by the separate :compression option (see below).
27
+ - The ASCII-specific Tokenizers and Analyzers have been removed
28
+
29
+ ### String Encoding support
30
+
31
+ #### Input strings and stored fields
32
+
33
+ In versions prior to 0.13 the string encoding had to match the locale string encoding.
34
+ In 0.13 the dependency on the locale setting has been resolved, input strings are now correctly tokenized
35
+ according to their source encoding, with positions correctly matching the input string.
36
+ All Ruby string encodings are supported.
37
+ When fields are stored, they are now stored with the encoding, so that when they are retrieved again, they
38
+ retain the original encoding with positions matching the string in its original encoding.
39
+
40
+ #### Tokens and Filters
41
+
42
+ Tokens are internally converted to UTF-8, which may change their length compared to their original encoding,
43
+ yet they retain position information according to the source in its original encoding.
44
+ The benefit is, that Filters, Stemmers or anything else working with Tokens only needs to support UTF-8 encoding,
45
+ greatly simplifying things and ensuring consistent query results.
46
+
47
+ ### Compression
48
+
49
+ Compression semantics have changed, now Brotli, BZip2 and LZ4 compression codecs are supported.
50
+ - BZip2: slow compression, slow decompression, high compression ratio
51
+ - Brotli: slow compression, fast decompression, high compression ratio, recommended for general purpose.
52
+ - LZ4: fast compression, fast decompression, low compression ratio
53
+ To see performance and compression ratios `rake ferret_compression_bench` can be run from the cloned repo.
54
+ It uses data and code within the misc/ferret_vs_lucene directory.
55
+
56
+ To compress a stored field the :compression option can be used with one of: :no, :brotli, :bz2 or :lz4.
57
+ Example:
58
+ ```ruby
59
+ fis.add_field(:compressed_field, :store => :yes, :compression => :brotli, :term_vector => :yes)
60
+ ```
61
+
62
+ ### Performance
63
+
64
+ The encoding support demands its toll: indexing performance dropped a bit in comparison to 0.12, but it is still thousands of docs per second, depending on machine/docs.
65
+ On Windows the indexing performance is still terrible, but that may be resolved in a future project.
66
+
67
+ Search performance is still excellent and multiple times faster than Lucene.
68
+
69
+ Lucene achieves roughly double the indexing performance. This seems to be because of the different way strings and
70
+ encodings are handled in Java. For example, the Java WhitespaceTokenizer code requires only one method call per character (check for whitespace), but for Ruby, to support all the different encodings, several method calls are required per character (retrieve character according to encoding, check character for whitespace).
71
+ Ferret is internally using the standard Ruby string encoding methods.
20
72
 
21
73
  ## Documentation
22
74
 
@@ -69,6 +121,7 @@ JVM 17.0.1 (Private Build)
69
121
  ## Future
70
122
 
71
123
  Lots of things to do:
124
+ - Improve indexing performance on Windows (WriteFile is terribly slow, maybe use mapping, see libuv)
72
125
  - Bring documentation in order in a docs directory
73
126
  - Review code (especially for memory/stack issues, typical c issues)
74
127
  - Take care of ruby GVL and threading
@@ -9,55 +9,51 @@
9
9
 
10
10
  static FrtBitVector *bv;
11
11
 
12
- static void setup()
13
- {
12
+ static void setup(void) {
14
13
  bv = frt_bv_new_capa(SCAN_SIZE);
15
14
  }
16
15
 
17
- static void teardown()
18
- {
16
+ static void teardown(void) {
19
17
  frt_bv_destroy(bv);
20
18
  }
21
19
 
22
- static void ferret_bv_and_sparse()
23
- {
20
+ static void ferret_bv_and_sparse(void) {
24
21
  FrtBitVector * _bv = frt_bv_and(bv, bv);
25
22
  free(_bv);
26
23
  }
27
- static void ferret_bv_or_sparse()
28
- {
24
+
25
+ static void ferret_bv_or_sparse(void) {
29
26
  FrtBitVector * _bv = frt_bv_or(bv, bv);
30
27
  free(_bv);
31
28
  }
32
- static void ferret_bv_xor_sparse()
33
- {
29
+
30
+ static void ferret_bv_xor_sparse(void) {
34
31
  FrtBitVector * _bv = frt_bv_xor(bv, bv);
35
32
  free(_bv);
36
33
  }
37
- static void ferret_bv_not_sparse()
38
- {
34
+
35
+ static void ferret_bv_not_sparse(void) {
39
36
  FrtBitVector * _bv = frt_bv_not(bv);
40
37
  free(_bv);
41
38
  }
42
- static void ferret_bv_and_dense()
43
- {
39
+
40
+ static void ferret_bv_and_dense(void) {
44
41
  ferret_bv_and_sparse();
45
42
  }
46
- static void ferret_bv_or_dense()
47
- {
43
+
44
+ static void ferret_bv_or_dense(void) {
48
45
  ferret_bv_or_sparse();
49
46
  }
50
- static void ferret_bv_xor_dense()
51
- {
47
+
48
+ static void ferret_bv_xor_dense(void) {
52
49
  ferret_bv_xor_sparse();
53
50
  }
54
- static void ferret_bv_not_dense()
55
- {
51
+
52
+ static void ferret_bv_not_dense(void) {
56
53
  ferret_bv_not_sparse();
57
54
  }
58
55
 
59
- static void ferret_bv_set_sparse()
60
- {
56
+ static void ferret_bv_set_sparse(void) {
61
57
  int i;
62
58
 
63
59
  for (i = SCAN_INC; i < SCAN_SIZE; i += SCAN_INC) {
@@ -67,8 +63,7 @@ static void ferret_bv_set_sparse()
67
63
  }
68
64
  }
69
65
 
70
- static void ferret_bv_scan_sparse()
71
- {
66
+ static void ferret_bv_scan_sparse(void) {
72
67
  int i, j;
73
68
 
74
69
  for (i = 0; i < N; i++) {
@@ -80,8 +75,7 @@ static void ferret_bv_scan_sparse()
80
75
  }
81
76
  }
82
77
 
83
- static void ferret_bv_set_dense()
84
- {
78
+ static void ferret_bv_set_dense(void) {
85
79
  int i;
86
80
  frt_bv_clear(bv);
87
81
  for (i = 0; i < DENSE_SCAN_SIZE; i++) {
@@ -89,8 +83,7 @@ static void ferret_bv_set_dense()
89
83
  }
90
84
  }
91
85
 
92
- static void ferret_bv_scan_dense()
93
- {
86
+ static void ferret_bv_scan_dense(void) {
94
87
  int i, j;
95
88
 
96
89
  for (i = 0; i < N; i++) {
@@ -102,8 +95,7 @@ static void ferret_bv_scan_dense()
102
95
  }
103
96
  }
104
97
 
105
- BENCH(bitvector_implementations)
106
- {
98
+ BENCH(bitvector_implementations) {
107
99
  BM_SETUP(setup);
108
100
 
109
101
  BM_ADD(ferret_bv_set_sparse);
@@ -4,8 +4,7 @@
4
4
 
5
5
  #define N 20
6
6
 
7
- static void ferret_hash()
8
- {
7
+ static void ferret_hash(void) {
9
8
  int i;
10
9
  void *res = NULL;
11
10
  for (i = 0; i < N; i++) {
@@ -24,13 +23,11 @@ static void ferret_hash()
24
23
  (void)res;
25
24
  }
26
25
 
27
- BENCH(hash_implementations)
28
- {
26
+ BENCH(hash_implementations) {
29
27
  BM_ADD(ferret_hash);
30
28
  }
31
29
 
32
- static void standard_hash()
33
- {
30
+ static void standard_hash(void) {
34
31
  int i;
35
32
  void *res = NULL;
36
33
  for (i = 0; i < N; i++) {
@@ -49,8 +46,7 @@ static void standard_hash()
49
46
 
50
47
  #define PERTURB_SHIFT 5
51
48
  static const char *dummy_key = "";
52
- static FrtHashEntry *h_lookup_str(FrtHash *ht, register const void *key)
53
- {
49
+ static FrtHashEntry *h_lookup_str(FrtHash *ht, register const void *key) {
54
50
  register const unsigned long hash = frt_str_hash((const char *)key);
55
51
  register unsigned int perturb;
56
52
  register int mask = ht->mask;
@@ -95,8 +91,7 @@ static FrtHashEntry *h_lookup_str(FrtHash *ht, register const void *key)
95
91
  }
96
92
  }
97
93
 
98
- static void string_hash()
99
- {
94
+ static void string_hash(void) {
100
95
  int i;
101
96
  void *res = NULL;
102
97
  for (i = 0; i < N; i++) {
@@ -114,8 +109,7 @@ static void string_hash()
114
109
  (void)res;
115
110
  }
116
111
 
117
- BENCH(specialized_string_hash)
118
- {
112
+ BENCH(specialized_string_hash) {
119
113
  BM_ADD(standard_hash);
120
114
  BM_ADD(string_hash);
121
115
  }
@@ -3,8 +3,7 @@
3
3
 
4
4
  #define N 10
5
5
 
6
- static void do_strcmp()
7
- {
6
+ static void do_strcmp(void) {
8
7
  const char **word;
9
8
  char buf[100];
10
9
  int res, i;
@@ -18,8 +17,7 @@ static void do_strcmp()
18
17
  (void)res;
19
18
  }
20
19
 
21
- static void do_strncmp()
22
- {
20
+ static void do_strncmp(void) {
23
21
  const char **word;
24
22
  char buf[100];
25
23
  int res, i;
@@ -33,8 +31,7 @@ static void do_strncmp()
33
31
  (void)res;
34
32
  }
35
33
 
36
- BENCH(strcmp_when_length_is_known)
37
- {
34
+ BENCH(strcmp_when_length_is_known) {
38
35
  BM_COUNT(6);
39
36
  BM_DISCARD(1);
40
37
  BM_ADD(do_strcmp);
@@ -6,46 +6,38 @@
6
6
  #define N 10
7
7
  #define write_byte(os, b) os->buf.buf[os->buf.pos++] = (frt_uchar)b
8
8
 
9
- void my_os_write_voff_t(FrtOutStream *os, register off_t num)
10
- {
9
+ void my_os_write_voff_t(FrtOutStream *os, register off_t num) {
11
10
  if (!(num&0x7f)) {
12
11
  if (os->buf.pos >= FRT_BUFFER_SIZE) {
13
12
  frt_os_write_byte(os, (frt_uchar)num);
14
- }
15
- else {
13
+ } else {
16
14
  write_byte(os, (frt_uchar)num);
17
15
  }
18
- }
19
- else if (!(num&0x3fff)) {
16
+ } else if (!(num&0x3fff)) {
20
17
  if (os->buf.pos >= FRT_BUFFER_SIZE - 1) {
21
18
  frt_os_write_byte(os, (frt_uchar)(0x80 | (0x3f & num))); num >>= 6;
22
19
  frt_os_write_byte(os, (frt_uchar)num);
23
- }
24
- else {
20
+ } else {
25
21
  write_byte(os, (frt_uchar)(0x80 | (0x3f & num))); num >>= 6;
26
22
  write_byte(os, (frt_uchar)num);
27
23
  }
28
- }
29
- else if (!(num&0x1fffff)) {
24
+ } else if (!(num&0x1fffff)) {
30
25
  if (os->buf.pos >= FRT_BUFFER_SIZE - 2) {
31
26
  frt_os_write_byte(os, (frt_uchar)(0xc0 | (0x1f & num))); num >>= 5;
32
27
  frt_os_write_byte(os, (frt_uchar)(0xff| num)); num >>= 8;
33
28
  frt_os_write_byte(os, (frt_uchar)num);
34
- }
35
- else {
29
+ } else {
36
30
  write_byte(os, (frt_uchar)(0xc0 | (0x1f & num))); num >>= 5;
37
31
  write_byte(os, (frt_uchar)(0xff| num)); num >>= 8;
38
32
  write_byte(os, (frt_uchar)num);
39
33
  }
40
- }
41
- else if (!(num&0xfffff)) {
34
+ } else if (!(num&0xfffff)) {
42
35
  if (os->buf.pos >= FRT_BUFFER_SIZE - 3) {
43
36
  frt_os_write_byte(os, (frt_uchar)(0xe0 | (0x0f & num))); num >>= 4;
44
37
  frt_os_write_byte(os, (frt_uchar)(0xff | num)); num >>= 8;
45
38
  frt_os_write_byte(os, (frt_uchar)(0xff | num)); num >>= 8;
46
39
  frt_os_write_byte(os, (frt_uchar)num);
47
- }
48
- else {
40
+ } else {
49
41
  write_byte(os, (frt_uchar)(0xe0 | (0x0f & num))); num >>= 4;
50
42
  write_byte(os, (frt_uchar)(0xff | num)); num >>= 8;
51
43
  write_byte(os, (frt_uchar)(0xff | num)); num >>= 8;
@@ -54,8 +46,7 @@ void my_os_write_voff_t(FrtOutStream *os, register off_t num)
54
46
  }
55
47
  }
56
48
 
57
- static void vint_out()
58
- {
49
+ static void vint_out(void) {
59
50
  int n;
60
51
  off_t i;
61
52
  FrtOutStream *os;
@@ -70,8 +61,7 @@ static void vint_out()
70
61
 
71
62
  }
72
63
 
73
- static void unrolled_vint_out()
74
- {
64
+ static void unrolled_vint_out(void) {
75
65
  int n;
76
66
  off_t i;
77
67
  FrtOutStream *os;
@@ -86,8 +76,7 @@ static void unrolled_vint_out()
86
76
 
87
77
  }
88
78
 
89
- BENCH(vint_io)
90
- {
79
+ BENCH(vint_io) {
91
80
  BM_ADD(vint_out);
92
81
  BM_ADD(unrolled_vint_out);
93
82
  }
@@ -5895,7 +5895,7 @@ static BrotliDictionary kBrotliDictionary = {
5895
5895
  #endif
5896
5896
  };
5897
5897
 
5898
- const BrotliDictionary* BrotliGetDictionary() {
5898
+ const BrotliDictionary* BrotliGetDictionary(void) {
5899
5899
  return &kBrotliDictionary;
5900
5900
  }
5901
5901
 
@@ -2599,7 +2599,7 @@ const char* BrotliDecoderErrorString(BrotliDecoderErrorCode c) {
2599
2599
  }
2600
2600
  }
2601
2601
 
2602
- uint32_t BrotliDecoderVersion() {
2602
+ uint32_t BrotliDecoderVersion(void) {
2603
2603
  return BROTLI_VERSION;
2604
2604
  }
2605
2605