isomorfeus-ferret 0.12.7 → 0.13.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/LICENSE +101 -19
- data/README.md +85 -13
- data/ext/isomorfeus_ferret_ext/bm_bitvector.c +22 -30
- data/ext/isomorfeus_ferret_ext/bm_hash.c +6 -12
- data/ext/isomorfeus_ferret_ext/bm_micro_string.c +3 -6
- data/ext/isomorfeus_ferret_ext/bm_store.c +11 -22
- data/ext/isomorfeus_ferret_ext/brotli_common_dictionary.c +1 -1
- data/ext/isomorfeus_ferret_ext/brotli_dec_decode.c +1 -1
- data/ext/isomorfeus_ferret_ext/bzlib.c +1572 -0
- data/ext/isomorfeus_ferret_ext/bzlib.h +282 -0
- data/ext/isomorfeus_ferret_ext/bzlib_blocksort.c +1094 -0
- data/ext/isomorfeus_ferret_ext/bzlib_compress.c +672 -0
- data/ext/isomorfeus_ferret_ext/bzlib_crctable.c +104 -0
- data/ext/isomorfeus_ferret_ext/bzlib_decompress.c +652 -0
- data/ext/isomorfeus_ferret_ext/bzlib_huffman.c +205 -0
- data/ext/isomorfeus_ferret_ext/bzlib_private.h +509 -0
- data/ext/isomorfeus_ferret_ext/bzlib_randtable.c +84 -0
- data/ext/isomorfeus_ferret_ext/fio_tmpfile.h +53 -53
- data/ext/isomorfeus_ferret_ext/frb_analysis.c +785 -1192
- data/ext/isomorfeus_ferret_ext/frb_index.c +497 -495
- data/ext/isomorfeus_ferret_ext/frb_qparser.c +48 -60
- data/ext/isomorfeus_ferret_ext/frb_search.c +1520 -1002
- data/ext/isomorfeus_ferret_ext/frb_store.c +96 -96
- data/ext/isomorfeus_ferret_ext/frb_threading.h +0 -1
- data/ext/isomorfeus_ferret_ext/frb_utils.c +147 -196
- data/ext/isomorfeus_ferret_ext/frt_analysis.c +695 -1090
- data/ext/isomorfeus_ferret_ext/frt_analysis.h +174 -170
- data/ext/isomorfeus_ferret_ext/frt_array.c +2 -4
- data/ext/isomorfeus_ferret_ext/frt_bitvector.c +9 -16
- data/ext/isomorfeus_ferret_ext/frt_bitvector.h +32 -81
- data/ext/isomorfeus_ferret_ext/frt_document.c +15 -20
- data/ext/isomorfeus_ferret_ext/frt_document.h +10 -10
- data/ext/isomorfeus_ferret_ext/frt_except.c +5 -12
- data/ext/isomorfeus_ferret_ext/frt_field_index.c +3 -3
- data/ext/isomorfeus_ferret_ext/frt_field_index.h +6 -7
- data/ext/isomorfeus_ferret_ext/frt_filter.c +35 -46
- data/ext/isomorfeus_ferret_ext/frt_fs_store.c +1 -0
- data/ext/isomorfeus_ferret_ext/frt_global.c +91 -200
- data/ext/isomorfeus_ferret_ext/frt_global.h +7 -18
- data/ext/isomorfeus_ferret_ext/frt_hash.c +1 -2
- data/ext/isomorfeus_ferret_ext/frt_ind.c +32 -35
- data/ext/isomorfeus_ferret_ext/frt_ind.h +9 -9
- data/ext/isomorfeus_ferret_ext/frt_index.c +603 -410
- data/ext/isomorfeus_ferret_ext/frt_index.h +272 -291
- data/ext/isomorfeus_ferret_ext/frt_lang.c +0 -2
- data/ext/isomorfeus_ferret_ext/frt_mempool.c +1 -2
- data/ext/isomorfeus_ferret_ext/frt_multimapper.c +4 -7
- data/ext/isomorfeus_ferret_ext/frt_q_boolean.c +68 -91
- data/ext/isomorfeus_ferret_ext/frt_q_const_score.c +35 -38
- data/ext/isomorfeus_ferret_ext/frt_q_filtered_query.c +53 -72
- data/ext/isomorfeus_ferret_ext/frt_q_fuzzy.c +25 -32
- data/ext/isomorfeus_ferret_ext/frt_q_match_all.c +21 -23
- data/ext/isomorfeus_ferret_ext/frt_q_multi_term.c +66 -103
- data/ext/isomorfeus_ferret_ext/frt_q_parser.c +207 -195
- data/ext/isomorfeus_ferret_ext/frt_q_phrase.c +20 -16
- data/ext/isomorfeus_ferret_ext/frt_q_prefix.c +17 -14
- data/ext/isomorfeus_ferret_ext/frt_q_range.c +102 -131
- data/ext/isomorfeus_ferret_ext/frt_q_span.c +179 -178
- data/ext/isomorfeus_ferret_ext/frt_q_term.c +47 -60
- data/ext/isomorfeus_ferret_ext/frt_q_wildcard.c +18 -16
- data/ext/isomorfeus_ferret_ext/frt_ram_store.c +45 -84
- data/ext/isomorfeus_ferret_ext/frt_search.c +105 -146
- data/ext/isomorfeus_ferret_ext/frt_search.h +331 -320
- data/ext/isomorfeus_ferret_ext/frt_similarity.c +5 -13
- data/ext/isomorfeus_ferret_ext/frt_similarity.h +7 -12
- data/ext/isomorfeus_ferret_ext/frt_sort.c +105 -149
- data/ext/isomorfeus_ferret_ext/frt_store.c +13 -7
- data/ext/isomorfeus_ferret_ext/frt_store.h +10 -2
- data/ext/isomorfeus_ferret_ext/frt_threading.h +0 -1
- data/ext/isomorfeus_ferret_ext/isomorfeus_ferret.c +22 -112
- data/ext/isomorfeus_ferret_ext/isomorfeus_ferret.h +2 -32
- data/ext/isomorfeus_ferret_ext/lz4.c +2495 -0
- data/ext/isomorfeus_ferret_ext/lz4.h +774 -0
- data/ext/isomorfeus_ferret_ext/lz4frame.c +1899 -0
- data/ext/isomorfeus_ferret_ext/lz4frame.h +623 -0
- data/ext/isomorfeus_ferret_ext/lz4hc.c +1615 -0
- data/ext/isomorfeus_ferret_ext/lz4hc.h +413 -0
- data/ext/isomorfeus_ferret_ext/lz4xxhash.c +1030 -0
- data/ext/isomorfeus_ferret_ext/lz4xxhash.h +328 -0
- data/ext/isomorfeus_ferret_ext/stem_modules.h +0 -86
- data/ext/isomorfeus_ferret_ext/test.c +0 -17
- data/ext/isomorfeus_ferret_ext/test_1710.c +11 -12
- data/ext/isomorfeus_ferret_ext/test_analysis.c +590 -583
- data/ext/isomorfeus_ferret_ext/test_compound_io.c +1 -1
- data/ext/isomorfeus_ferret_ext/test_document.c +19 -15
- data/ext/isomorfeus_ferret_ext/test_except.c +1 -2
- data/ext/isomorfeus_ferret_ext/test_fields.c +59 -60
- data/ext/isomorfeus_ferret_ext/test_file_deleter.c +10 -27
- data/ext/isomorfeus_ferret_ext/test_filter.c +11 -8
- data/ext/isomorfeus_ferret_ext/test_global.c +0 -46
- data/ext/isomorfeus_ferret_ext/test_hash.c +2 -2
- data/ext/isomorfeus_ferret_ext/test_hashset.c +1 -1
- data/ext/isomorfeus_ferret_ext/test_highlighter.c +15 -11
- data/ext/isomorfeus_ferret_ext/test_index.c +372 -365
- data/ext/isomorfeus_ferret_ext/test_q_const_score.c +5 -3
- data/ext/isomorfeus_ferret_ext/test_q_filtered.c +5 -3
- data/ext/isomorfeus_ferret_ext/test_q_fuzzy.c +13 -10
- data/ext/isomorfeus_ferret_ext/test_q_parser.c +45 -7
- data/ext/isomorfeus_ferret_ext/test_q_span.c +15 -12
- data/ext/isomorfeus_ferret_ext/test_ram_store.c +3 -3
- data/ext/isomorfeus_ferret_ext/test_search.c +60 -64
- data/ext/isomorfeus_ferret_ext/test_segments.c +5 -4
- data/ext/isomorfeus_ferret_ext/test_sort.c +17 -14
- data/ext/isomorfeus_ferret_ext/test_store.c +2 -0
- data/ext/isomorfeus_ferret_ext/test_term.c +3 -1
- data/ext/isomorfeus_ferret_ext/test_term_vectors.c +9 -10
- data/ext/isomorfeus_ferret_ext/test_test.c +1 -2
- data/ext/isomorfeus_ferret_ext/test_threading.c +9 -10
- data/ext/isomorfeus_ferret_ext/testhelper.c +1 -2
- data/lib/isomorfeus/ferret/version.rb +1 -1
- metadata +27 -57
- data/ext/isomorfeus_ferret_ext/email.rl +0 -21
- data/ext/isomorfeus_ferret_ext/frt_scanner.c +0 -900
- data/ext/isomorfeus_ferret_ext/frt_scanner.h +0 -28
- data/ext/isomorfeus_ferret_ext/frt_scanner_mb.c +0 -6706
- data/ext/isomorfeus_ferret_ext/frt_scanner_utf8.c +0 -4420
- data/ext/isomorfeus_ferret_ext/scanner.h +0 -28
- data/ext/isomorfeus_ferret_ext/scanner.in +0 -43
- data/ext/isomorfeus_ferret_ext/scanner.rl +0 -84
- data/ext/isomorfeus_ferret_ext/scanner_mb.rl +0 -200
- data/ext/isomorfeus_ferret_ext/scanner_utf8.rl +0 -85
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_basque.c +0 -1167
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_basque.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_catalan.c +0 -1433
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_catalan.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.c +0 -301
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.c +0 -590
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.c +0 -1049
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.c +0 -705
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.c +0 -1239
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.c +0 -477
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.c +0 -1217
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.h +0 -7
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_indonesian.c +0 -394
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_indonesian.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_irish.c +0 -457
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_irish.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.c +0 -1009
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.c +0 -259
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.c +0 -704
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.c +0 -948
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.c +0 -1028
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.c +0 -275
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_hungarian.c +0 -849
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_hungarian.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.c +0 -952
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.c +0 -669
- data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_modules.txt +0 -63
- data/ext/isomorfeus_ferret_ext/uchar-ucs4.rl +0 -1854
- data/ext/isomorfeus_ferret_ext/uchar-utf8.rl +0 -1999
- data/ext/isomorfeus_ferret_ext/url.rl +0 -27
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a1a8509c12f5d180d38944adb53291958a9f8327a8b1706def9fecd7f9c60e73
|
4
|
+
data.tar.gz: fbbd38e08dd1992cd93663b1b04cc4b666b6a6dc59104414b75515bc2ec4d54d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7158cdc3f7a0624d35b668b31287d489ee3d389f17a2ab1b58235a7c2be639b6a7bbb3a8c0efee6d168804128fa5c0d7955c00f860b653a76f23d6421fc84c5d
|
7
|
+
data.tar.gz: 60c62db42081291a52e66be4b7e1b4c46eb12a458b0838f3d8793ffaeb013aff27a37aca06ab4831157d1dd65330395bfdaa2fa7924a278261f1187295592042
|
data/LICENSE
CHANGED
@@ -28,7 +28,86 @@ Copyright (c) 2005-2006 David Balmain
|
|
28
28
|
MIT License as above
|
29
29
|
|
30
30
|
|
31
|
-
|
31
|
+
brotli_* files in ext/isomorfeus_ferret originally taken from https://github.com/google/brotli:
|
32
|
+
|
33
|
+
Copyright (c) 2009, 2010, 2013-2016 by the Brotli Authors.
|
34
|
+
|
35
|
+
MIT License as above
|
36
|
+
|
37
|
+
|
38
|
+
bzlib* files in ext/isomorfeus_ferret originally taken from git://sourceware.org/git/bzip2.git:
|
39
|
+
|
40
|
+
This program, "bzip2", the associated library "libbzip2", and all
|
41
|
+
documentation, are copyright (C) 1996-2019 Julian R Seward. All
|
42
|
+
rights reserved.
|
43
|
+
|
44
|
+
Redistribution and use in source and binary forms, with or without
|
45
|
+
modification, are permitted provided that the following conditions
|
46
|
+
are met:
|
47
|
+
|
48
|
+
1. Redistributions of source code must retain the above copyright
|
49
|
+
notice, this list of conditions and the following disclaimer.
|
50
|
+
|
51
|
+
2. The origin of this software must not be misrepresented; you must
|
52
|
+
not claim that you wrote the original software. If you use this
|
53
|
+
software in a product, an acknowledgment in the product
|
54
|
+
documentation would be appreciated but is not required.
|
55
|
+
|
56
|
+
3. Altered source versions must be plainly marked as such, and must
|
57
|
+
not be misrepresented as being the original software.
|
58
|
+
|
59
|
+
4. The name of the author may not be used to endorse or promote
|
60
|
+
products derived from this software without specific prior written
|
61
|
+
permission.
|
62
|
+
|
63
|
+
THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
|
64
|
+
OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
65
|
+
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
66
|
+
ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
67
|
+
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
68
|
+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
69
|
+
GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
70
|
+
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
71
|
+
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
72
|
+
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
73
|
+
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
74
|
+
|
75
|
+
Julian Seward, jseward@acm.org
|
76
|
+
bzip2/libbzip2 version 1.0.8 of 13 July 2019
|
77
|
+
|
78
|
+
|
79
|
+
lz4* files in ext/isomorfeus_ferret originally taken from https://github.com/lz4/lz4/tree/dev/lib:
|
80
|
+
|
81
|
+
Copyright (C) 2011-2020, Yann Collet.
|
82
|
+
|
83
|
+
Redistribution and use in source and binary forms, with or without
|
84
|
+
modification, are permitted provided that the following conditions are met:
|
85
|
+
|
86
|
+
* Redistributions of source code must retain the above copyright
|
87
|
+
notice, this list of conditions and the following disclaimer.
|
88
|
+
* Redistributions in binary form must reproduce the above
|
89
|
+
copyright notice, this list of conditions and the following disclaimer
|
90
|
+
in the documentation and/or other materials provided with the
|
91
|
+
distribution.
|
92
|
+
|
93
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
94
|
+
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
95
|
+
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
96
|
+
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
97
|
+
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
98
|
+
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
99
|
+
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
100
|
+
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
101
|
+
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
102
|
+
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
103
|
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
104
|
+
|
105
|
+
You can contact the author at:
|
106
|
+
- LZ4 homepage : http://www.lz4.org
|
107
|
+
- LZ4 source repository : https://github.com/lz4/lz4
|
108
|
+
|
109
|
+
|
110
|
+
stem* and libstemmer* files in ext/isomorfeus_ferret originally taken from https://snowballstem.org/:
|
32
111
|
|
33
112
|
Copyright (c) 2001, Dr Martin Porter
|
34
113
|
Copyright (c) 2004,2005, Richard Boulton
|
@@ -37,8 +116,7 @@ Copyright (c) 2006,2007,2009,2010,2011,2014-2019, Olly Betts
|
|
37
116
|
All rights reserved.
|
38
117
|
|
39
118
|
Redistribution and use in source and binary forms, with or without
|
40
|
-
modification, are permitted provided that the following conditions
|
41
|
-
are met:
|
119
|
+
modification, are permitted provided that the following conditions are met:
|
42
120
|
|
43
121
|
1. Redistributions of source code must retain the above copyright notice,
|
44
122
|
this list of conditions and the following disclaimer.
|
@@ -68,23 +146,27 @@ Copyright (C) 1993-2013 Yukihiro Matsumoto. All rights reserved.
|
|
68
146
|
Redistribution and use in source and binary forms, with or without
|
69
147
|
modification, are permitted provided that the following conditions
|
70
148
|
are met:
|
71
|
-
1. Redistributions of source code must retain the above copyright
|
72
|
-
notice, this list of conditions and the following disclaimer.
|
73
|
-
2. Redistributions in binary form must reproduce the above copyright
|
74
|
-
notice, this list of conditions and the following disclaimer in the
|
75
|
-
documentation and/or other materials provided with the distribution.
|
76
149
|
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
150
|
+
1. Redistributions of source code must retain the above copyright notice,
|
151
|
+
this list of conditions and the following disclaimer.
|
152
|
+
2. Redistributions in binary form must reproduce the above copyright notice,
|
153
|
+
this list of conditions and the following disclaimer in the documentation
|
154
|
+
and/or other materials provided with the distribution.
|
155
|
+
|
156
|
+
THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
|
157
|
+
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
158
|
+
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
159
|
+
DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
|
160
|
+
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
161
|
+
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
162
|
+
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
163
|
+
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
164
|
+
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
165
|
+
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
166
|
+
|
167
|
+
|
168
|
+
The following licenses apply to files, which are distributed within the repo
|
169
|
+
but not distributed with the gem and not used at runtime:
|
88
170
|
|
89
171
|
|
90
172
|
For the Reuter-21578 files in the misc/ferret_vs_lucene directory (corpus, etc.),
|
data/README.md
CHANGED
@@ -16,7 +16,60 @@ During revival many things havbe been fixed, now all tests pass, no crashes and
|
|
16
16
|
successfully compiles and runs with rubys >3. Its no longer a goal to have
|
17
17
|
a c library available, but instead the usage is meant as ruby gem with a c extension only.
|
18
18
|
|
19
|
-
It should work on *nixes
|
19
|
+
It should work on *nixes, *nuxes, *BSDs and also works on Windows.
|
20
|
+
|
21
|
+
## Improvements and Changes in Version 0.13
|
22
|
+
|
23
|
+
### Breaking
|
24
|
+
|
25
|
+
- For version 0.13 the index file format has changed and is no longer compatible with previous versions. Indexes of older versions must be recreated with 0.13 (export all data from and with the previous version, import all data with 0.13)
|
26
|
+
- The :store option no longer accepts :compress, compression must now be specified by the separate :compression option (see below).
|
27
|
+
- The ASCII-specific Tokenizers and Analyzers have been removed
|
28
|
+
|
29
|
+
### String Encoding support
|
30
|
+
|
31
|
+
#### Input strings and stored fields
|
32
|
+
|
33
|
+
In versions prior to 0.13 the string encoding had to match the locale string encoding.
|
34
|
+
In 0.13 the dependency on the locale setting has been resolved, input strings are now correctly tokenized
|
35
|
+
according to their source encoding, with positions correctly matching the input string.
|
36
|
+
All Ruby string encodings are supported.
|
37
|
+
When fields are stored, they are now stored with the encoding, so that when they are retrieved again, they
|
38
|
+
retain the original encoding with positions matching the string in its original encoding.
|
39
|
+
|
40
|
+
#### Tokens and Filters
|
41
|
+
|
42
|
+
Tokens are internally converted to UTF-8, which may change their length compared to their original encoding,
|
43
|
+
yet they retain position information according to the source in its original encoding.
|
44
|
+
The benefit is, that Filters, Stemmers or anything else working with Tokens only needs to support UTF-8 encoding,
|
45
|
+
greatly simplifying things and ensuring consistent query results.
|
46
|
+
|
47
|
+
### Compression
|
48
|
+
|
49
|
+
Compression semantics have changed, now Brotli, BZip2 and LZ4 compression codecs are supported.
|
50
|
+
- BZip2: slow compression, slow decompression, high compression ratio
|
51
|
+
- Brotli: slow compression, fast decompression, high compression ratio, recommended for general purpose.
|
52
|
+
- LZ4: fast compression, fast decompression, low compression ratio
|
53
|
+
|
54
|
+
To see performance and compression ratios `rake ferret_compression_bench` can be run from the cloned repo.
|
55
|
+
It uses data and code within the misc/ferret_vs_lucene directory.
|
56
|
+
|
57
|
+
To compress a stored field the :compression option can be used with one of: :no, :brotli, :bz2 or :lz4.
|
58
|
+
Example:
|
59
|
+
```ruby
|
60
|
+
fis.add_field(:compressed_field, :store => :yes, :compression => :brotli, :term_vector => :yes)
|
61
|
+
```
|
62
|
+
|
63
|
+
### Performance
|
64
|
+
|
65
|
+
The encoding support takes its toll, indexing performance dropped a bit in comparison to 0.12, but still thousands of docs per second, depending on machine/docs.
|
66
|
+
On Windows the indexing performance is still terrible, but that may be resolved in a future project.
|
67
|
+
|
68
|
+
Search performance is still excellent and multiple times faster than Lucene.
|
69
|
+
|
70
|
+
Lucene achieves roughly double the indexing performance. This seems to be because of the different way strings and
|
71
|
+
encodings are handled in Java. For example, the Java WhitespaceTokenizer code requires only one method call per character (check for whitespace), but for Ruby, to support all the different encodings, several method calls are required per character (retrieve character according to encoding, check character for whitespace).
|
72
|
+
Ferret is internally using the standard Ruby string encoding methods.
|
20
73
|
|
21
74
|
## Documentation
|
22
75
|
|
@@ -44,6 +97,7 @@ Ensure your locale is set to C.UTF-8, because the internal c tests don't know ho
|
|
44
97
|
|
45
98
|
## Benchmarks
|
46
99
|
|
100
|
+
### Indexing and Searching
|
47
101
|
- clone repo
|
48
102
|
- bundle install
|
49
103
|
- rake ferret_vs_lucene
|
@@ -52,23 +106,41 @@ A recent Java JDK must be installed to compile and run lucene benchmarks.
|
|
52
106
|
|
53
107
|
Results on Linux:
|
54
108
|
```
|
55
|
-
Ferret:
|
56
|
-
Indexing
|
57
|
-
Searching took: 0.
|
58
|
-
thats
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
109
|
+
Ferret 0.13.0:
|
110
|
+
Indexing: 9.35 secs, Docs: 19043, 2035 docs/s
|
111
|
+
Searching took: 0.3133133s for 8000 queries
|
112
|
+
thats 25533 q/s
|
113
|
+
Total found: 42000
|
114
|
+
Index size: 28Mb
|
115
|
+
|
116
|
+
Lucene 9.1.0:
|
117
|
+
Indexing: 4.20 secs, Docs: 19043, 4538 docs/s
|
118
|
+
Searching took: 1.64s for 8000 queries
|
119
|
+
thats 4875 q/s
|
120
|
+
Total found: 41000
|
121
|
+
index size: 35Mb
|
122
|
+
|
123
|
+
JVM 11.0.14.1 (Ubuntu)
|
67
124
|
```
|
68
125
|
|
126
|
+
### Storing Fields with Compression, Indexing and Retrieval
|
127
|
+
- clone repo
|
128
|
+
- bundle install
|
129
|
+
- rake ferret_compression_benchmark
|
130
|
+
|
131
|
+
Results on Linux, 0.13.0:
|
132
|
+
|
133
|
+
| Compression | Index & Store | Retrieve | Index size |
|
134
|
+
|-------------|---------------|---------------|------------|
|
135
|
+
| none | 2008 docs/s | 153853 docs/s | 43 MB |
|
136
|
+
| brotli | 1726 docs/s | 58315 docs/s | 36 MB |
|
137
|
+
| bzip2 | 1438 docs/s | 15382 docs/s | 38 MB |
|
138
|
+
| lz4 | 1932 docs/s | 127100 docs/s | 41 MB |
|
139
|
+
|
69
140
|
## Future
|
70
141
|
|
71
142
|
Lots of things to do:
|
143
|
+
- Improve indexing performance on Windows (WriteFile is terribly slow, maybe use mapping, see libuv)
|
72
144
|
- Bring documentation in order in a docs directory
|
73
145
|
- Review code (especially for memory/stack issues, typical c issues)
|
74
146
|
- Take care of ruby GVL and threading
|
@@ -9,55 +9,51 @@
|
|
9
9
|
|
10
10
|
static FrtBitVector *bv;
|
11
11
|
|
12
|
-
static void setup()
|
13
|
-
{
|
12
|
+
static void setup(void) {
|
14
13
|
bv = frt_bv_new_capa(SCAN_SIZE);
|
15
14
|
}
|
16
15
|
|
17
|
-
static void teardown()
|
18
|
-
{
|
16
|
+
static void teardown(void) {
|
19
17
|
frt_bv_destroy(bv);
|
20
18
|
}
|
21
19
|
|
22
|
-
static void ferret_bv_and_sparse()
|
23
|
-
{
|
20
|
+
static void ferret_bv_and_sparse(void) {
|
24
21
|
FrtBitVector * _bv = frt_bv_and(bv, bv);
|
25
22
|
free(_bv);
|
26
23
|
}
|
27
|
-
|
28
|
-
{
|
24
|
+
|
25
|
+
static void ferret_bv_or_sparse(void) {
|
29
26
|
FrtBitVector * _bv = frt_bv_or(bv, bv);
|
30
27
|
free(_bv);
|
31
28
|
}
|
32
|
-
|
33
|
-
{
|
29
|
+
|
30
|
+
static void ferret_bv_xor_sparse(void) {
|
34
31
|
FrtBitVector * _bv = frt_bv_xor(bv, bv);
|
35
32
|
free(_bv);
|
36
33
|
}
|
37
|
-
|
38
|
-
{
|
34
|
+
|
35
|
+
static void ferret_bv_not_sparse(void) {
|
39
36
|
FrtBitVector * _bv = frt_bv_not(bv);
|
40
37
|
free(_bv);
|
41
38
|
}
|
42
|
-
|
43
|
-
{
|
39
|
+
|
40
|
+
static void ferret_bv_and_dense(void) {
|
44
41
|
ferret_bv_and_sparse();
|
45
42
|
}
|
46
|
-
|
47
|
-
{
|
43
|
+
|
44
|
+
static void ferret_bv_or_dense(void) {
|
48
45
|
ferret_bv_or_sparse();
|
49
46
|
}
|
50
|
-
|
51
|
-
{
|
47
|
+
|
48
|
+
static void ferret_bv_xor_dense(void) {
|
52
49
|
ferret_bv_xor_sparse();
|
53
50
|
}
|
54
|
-
|
55
|
-
{
|
51
|
+
|
52
|
+
static void ferret_bv_not_dense(void) {
|
56
53
|
ferret_bv_not_sparse();
|
57
54
|
}
|
58
55
|
|
59
|
-
static void ferret_bv_set_sparse()
|
60
|
-
{
|
56
|
+
static void ferret_bv_set_sparse(void) {
|
61
57
|
int i;
|
62
58
|
|
63
59
|
for (i = SCAN_INC; i < SCAN_SIZE; i += SCAN_INC) {
|
@@ -67,8 +63,7 @@ static void ferret_bv_set_sparse()
|
|
67
63
|
}
|
68
64
|
}
|
69
65
|
|
70
|
-
static void ferret_bv_scan_sparse()
|
71
|
-
{
|
66
|
+
static void ferret_bv_scan_sparse(void) {
|
72
67
|
int i, j;
|
73
68
|
|
74
69
|
for (i = 0; i < N; i++) {
|
@@ -80,8 +75,7 @@ static void ferret_bv_scan_sparse()
|
|
80
75
|
}
|
81
76
|
}
|
82
77
|
|
83
|
-
static void ferret_bv_set_dense()
|
84
|
-
{
|
78
|
+
static void ferret_bv_set_dense(void) {
|
85
79
|
int i;
|
86
80
|
frt_bv_clear(bv);
|
87
81
|
for (i = 0; i < DENSE_SCAN_SIZE; i++) {
|
@@ -89,8 +83,7 @@ static void ferret_bv_set_dense()
|
|
89
83
|
}
|
90
84
|
}
|
91
85
|
|
92
|
-
static void ferret_bv_scan_dense()
|
93
|
-
{
|
86
|
+
static void ferret_bv_scan_dense(void) {
|
94
87
|
int i, j;
|
95
88
|
|
96
89
|
for (i = 0; i < N; i++) {
|
@@ -102,8 +95,7 @@ static void ferret_bv_scan_dense()
|
|
102
95
|
}
|
103
96
|
}
|
104
97
|
|
105
|
-
BENCH(bitvector_implementations)
|
106
|
-
{
|
98
|
+
BENCH(bitvector_implementations) {
|
107
99
|
BM_SETUP(setup);
|
108
100
|
|
109
101
|
BM_ADD(ferret_bv_set_sparse);
|
@@ -4,8 +4,7 @@
|
|
4
4
|
|
5
5
|
#define N 20
|
6
6
|
|
7
|
-
static void ferret_hash()
|
8
|
-
{
|
7
|
+
static void ferret_hash(void) {
|
9
8
|
int i;
|
10
9
|
void *res = NULL;
|
11
10
|
for (i = 0; i < N; i++) {
|
@@ -24,13 +23,11 @@ static void ferret_hash()
|
|
24
23
|
(void)res;
|
25
24
|
}
|
26
25
|
|
27
|
-
BENCH(hash_implementations)
|
28
|
-
{
|
26
|
+
BENCH(hash_implementations) {
|
29
27
|
BM_ADD(ferret_hash);
|
30
28
|
}
|
31
29
|
|
32
|
-
static void standard_hash()
|
33
|
-
{
|
30
|
+
static void standard_hash(void) {
|
34
31
|
int i;
|
35
32
|
void *res = NULL;
|
36
33
|
for (i = 0; i < N; i++) {
|
@@ -49,8 +46,7 @@ static void standard_hash()
|
|
49
46
|
|
50
47
|
#define PERTURB_SHIFT 5
|
51
48
|
static const char *dummy_key = "";
|
52
|
-
static FrtHashEntry *h_lookup_str(FrtHash *ht, register const void *key)
|
53
|
-
{
|
49
|
+
static FrtHashEntry *h_lookup_str(FrtHash *ht, register const void *key) {
|
54
50
|
register const unsigned long hash = frt_str_hash((const char *)key);
|
55
51
|
register unsigned int perturb;
|
56
52
|
register int mask = ht->mask;
|
@@ -95,8 +91,7 @@ static FrtHashEntry *h_lookup_str(FrtHash *ht, register const void *key)
|
|
95
91
|
}
|
96
92
|
}
|
97
93
|
|
98
|
-
static void string_hash()
|
99
|
-
{
|
94
|
+
static void string_hash(void) {
|
100
95
|
int i;
|
101
96
|
void *res = NULL;
|
102
97
|
for (i = 0; i < N; i++) {
|
@@ -114,8 +109,7 @@ static void string_hash()
|
|
114
109
|
(void)res;
|
115
110
|
}
|
116
111
|
|
117
|
-
BENCH(specialized_string_hash)
|
118
|
-
{
|
112
|
+
BENCH(specialized_string_hash) {
|
119
113
|
BM_ADD(standard_hash);
|
120
114
|
BM_ADD(string_hash);
|
121
115
|
}
|
@@ -3,8 +3,7 @@
|
|
3
3
|
|
4
4
|
#define N 10
|
5
5
|
|
6
|
-
static void do_strcmp()
|
7
|
-
{
|
6
|
+
static void do_strcmp(void) {
|
8
7
|
const char **word;
|
9
8
|
char buf[100];
|
10
9
|
int res, i;
|
@@ -18,8 +17,7 @@ static void do_strcmp()
|
|
18
17
|
(void)res;
|
19
18
|
}
|
20
19
|
|
21
|
-
static void do_strncmp()
|
22
|
-
{
|
20
|
+
static void do_strncmp(void) {
|
23
21
|
const char **word;
|
24
22
|
char buf[100];
|
25
23
|
int res, i;
|
@@ -33,8 +31,7 @@ static void do_strncmp()
|
|
33
31
|
(void)res;
|
34
32
|
}
|
35
33
|
|
36
|
-
BENCH(strcmp_when_length_is_known)
|
37
|
-
{
|
34
|
+
BENCH(strcmp_when_length_is_known) {
|
38
35
|
BM_COUNT(6);
|
39
36
|
BM_DISCARD(1);
|
40
37
|
BM_ADD(do_strcmp);
|
@@ -6,46 +6,38 @@
|
|
6
6
|
#define N 10
|
7
7
|
#define write_byte(os, b) os->buf.buf[os->buf.pos++] = (frt_uchar)b
|
8
8
|
|
9
|
-
void my_os_write_voff_t(FrtOutStream *os, register off_t num)
|
10
|
-
{
|
9
|
+
void my_os_write_voff_t(FrtOutStream *os, register off_t num) {
|
11
10
|
if (!(num&0x7f)) {
|
12
11
|
if (os->buf.pos >= FRT_BUFFER_SIZE) {
|
13
12
|
frt_os_write_byte(os, (frt_uchar)num);
|
14
|
-
}
|
15
|
-
else {
|
13
|
+
} else {
|
16
14
|
write_byte(os, (frt_uchar)num);
|
17
15
|
}
|
18
|
-
}
|
19
|
-
else if (!(num&0x3fff)) {
|
16
|
+
} else if (!(num&0x3fff)) {
|
20
17
|
if (os->buf.pos >= FRT_BUFFER_SIZE - 1) {
|
21
18
|
frt_os_write_byte(os, (frt_uchar)(0x80 | (0x3f & num))); num >>= 6;
|
22
19
|
frt_os_write_byte(os, (frt_uchar)num);
|
23
|
-
}
|
24
|
-
else {
|
20
|
+
} else {
|
25
21
|
write_byte(os, (frt_uchar)(0x80 | (0x3f & num))); num >>= 6;
|
26
22
|
write_byte(os, (frt_uchar)num);
|
27
23
|
}
|
28
|
-
}
|
29
|
-
else if (!(num&0x1fffff)) {
|
24
|
+
} else if (!(num&0x1fffff)) {
|
30
25
|
if (os->buf.pos >= FRT_BUFFER_SIZE - 2) {
|
31
26
|
frt_os_write_byte(os, (frt_uchar)(0xc0 | (0x1f & num))); num >>= 5;
|
32
27
|
frt_os_write_byte(os, (frt_uchar)(0xff| num)); num >>= 8;
|
33
28
|
frt_os_write_byte(os, (frt_uchar)num);
|
34
|
-
}
|
35
|
-
else {
|
29
|
+
} else {
|
36
30
|
write_byte(os, (frt_uchar)(0xc0 | (0x1f & num))); num >>= 5;
|
37
31
|
write_byte(os, (frt_uchar)(0xff| num)); num >>= 8;
|
38
32
|
write_byte(os, (frt_uchar)num);
|
39
33
|
}
|
40
|
-
}
|
41
|
-
else if (!(num&0xfffff)) {
|
34
|
+
} else if (!(num&0xfffff)) {
|
42
35
|
if (os->buf.pos >= FRT_BUFFER_SIZE - 3) {
|
43
36
|
frt_os_write_byte(os, (frt_uchar)(0xe0 | (0x0f & num))); num >>= 4;
|
44
37
|
frt_os_write_byte(os, (frt_uchar)(0xff | num)); num >>= 8;
|
45
38
|
frt_os_write_byte(os, (frt_uchar)(0xff | num)); num >>= 8;
|
46
39
|
frt_os_write_byte(os, (frt_uchar)num);
|
47
|
-
}
|
48
|
-
else {
|
40
|
+
} else {
|
49
41
|
write_byte(os, (frt_uchar)(0xe0 | (0x0f & num))); num >>= 4;
|
50
42
|
write_byte(os, (frt_uchar)(0xff | num)); num >>= 8;
|
51
43
|
write_byte(os, (frt_uchar)(0xff | num)); num >>= 8;
|
@@ -54,8 +46,7 @@ void my_os_write_voff_t(FrtOutStream *os, register off_t num)
|
|
54
46
|
}
|
55
47
|
}
|
56
48
|
|
57
|
-
static void vint_out()
|
58
|
-
{
|
49
|
+
static void vint_out(void) {
|
59
50
|
int n;
|
60
51
|
off_t i;
|
61
52
|
FrtOutStream *os;
|
@@ -70,8 +61,7 @@ static void vint_out()
|
|
70
61
|
|
71
62
|
}
|
72
63
|
|
73
|
-
static void unrolled_vint_out()
|
74
|
-
{
|
64
|
+
static void unrolled_vint_out(void) {
|
75
65
|
int n;
|
76
66
|
off_t i;
|
77
67
|
FrtOutStream *os;
|
@@ -86,8 +76,7 @@ static void unrolled_vint_out()
|
|
86
76
|
|
87
77
|
}
|
88
78
|
|
89
|
-
BENCH(vint_io)
|
90
|
-
{
|
79
|
+
BENCH(vint_io) {
|
91
80
|
BM_ADD(vint_out);
|
92
81
|
BM_ADD(unrolled_vint_out);
|
93
82
|
}
|