isomorfeus-ferret 0.12.7 → 0.13.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/LICENSE +101 -19
- data/README.md +54 -1
- data/ext/isomorfeus_ferret_ext/bm_bitvector.c +22 -30
- data/ext/isomorfeus_ferret_ext/bm_hash.c +6 -12
- data/ext/isomorfeus_ferret_ext/bm_micro_string.c +3 -6
- data/ext/isomorfeus_ferret_ext/bm_store.c +11 -22
- data/ext/isomorfeus_ferret_ext/brotli_common_dictionary.c +1 -1
- data/ext/isomorfeus_ferret_ext/brotli_dec_decode.c +1 -1
- data/ext/isomorfeus_ferret_ext/bzip_blocksort.c +1094 -0
- data/ext/isomorfeus_ferret_ext/bzip_huffman.c +205 -0
- data/ext/isomorfeus_ferret_ext/bzlib.c +1572 -0
- data/ext/isomorfeus_ferret_ext/bzlib.h +282 -0
- data/ext/isomorfeus_ferret_ext/bzlib_compress.c +672 -0
- data/ext/isomorfeus_ferret_ext/bzlib_crctable.c +104 -0
- data/ext/isomorfeus_ferret_ext/bzlib_decompress.c +652 -0
- data/ext/isomorfeus_ferret_ext/bzlib_private.h +509 -0
- data/ext/isomorfeus_ferret_ext/bzlib_randtable.c +84 -0
- data/ext/isomorfeus_ferret_ext/fio_tmpfile.h +53 -53
- data/ext/isomorfeus_ferret_ext/frb_analysis.c +785 -1192
- data/ext/isomorfeus_ferret_ext/frb_index.c +492 -474
- data/ext/isomorfeus_ferret_ext/frb_qparser.c +48 -60
- data/ext/isomorfeus_ferret_ext/frb_search.c +1520 -1002
- data/ext/isomorfeus_ferret_ext/frb_store.c +96 -96
- data/ext/isomorfeus_ferret_ext/frb_threading.h +0 -1
- data/ext/isomorfeus_ferret_ext/frb_utils.c +147 -196
- data/ext/isomorfeus_ferret_ext/frt_analysis.c +695 -1090
- data/ext/isomorfeus_ferret_ext/frt_analysis.h +174 -170
- data/ext/isomorfeus_ferret_ext/frt_array.c +2 -4
- data/ext/isomorfeus_ferret_ext/frt_bitvector.c +9 -16
- data/ext/isomorfeus_ferret_ext/frt_bitvector.h +32 -81
- data/ext/isomorfeus_ferret_ext/frt_document.c +15 -20
- data/ext/isomorfeus_ferret_ext/frt_document.h +10 -10
- data/ext/isomorfeus_ferret_ext/frt_except.c +5 -12
- data/ext/isomorfeus_ferret_ext/frt_field_index.c +3 -3
- data/ext/isomorfeus_ferret_ext/frt_field_index.h +6 -7
- data/ext/isomorfeus_ferret_ext/frt_filter.c +35 -46
- data/ext/isomorfeus_ferret_ext/frt_fs_store.c +1 -0
- data/ext/isomorfeus_ferret_ext/frt_global.c +105 -63
- data/ext/isomorfeus_ferret_ext/frt_global.h +7 -3
- data/ext/isomorfeus_ferret_ext/frt_hash.c +1 -2
- data/ext/isomorfeus_ferret_ext/frt_ind.c +32 -35
- data/ext/isomorfeus_ferret_ext/frt_ind.h +9 -9
- data/ext/isomorfeus_ferret_ext/frt_index.c +580 -399
- data/ext/isomorfeus_ferret_ext/frt_index.h +272 -291
- data/ext/isomorfeus_ferret_ext/frt_mempool.c +1 -2
- data/ext/isomorfeus_ferret_ext/frt_multimapper.c +4 -7
- data/ext/isomorfeus_ferret_ext/frt_q_boolean.c +67 -91
- data/ext/isomorfeus_ferret_ext/frt_q_const_score.c +35 -38
- data/ext/isomorfeus_ferret_ext/frt_q_filtered_query.c +53 -72
- data/ext/isomorfeus_ferret_ext/frt_q_fuzzy.c +25 -32
- data/ext/isomorfeus_ferret_ext/frt_q_match_all.c +21 -23
- data/ext/isomorfeus_ferret_ext/frt_q_multi_term.c +66 -103
- data/ext/isomorfeus_ferret_ext/frt_q_parser.c +207 -195
- data/ext/isomorfeus_ferret_ext/frt_q_phrase.c +20 -16
- data/ext/isomorfeus_ferret_ext/frt_q_prefix.c +17 -14
- data/ext/isomorfeus_ferret_ext/frt_q_range.c +102 -131
- data/ext/isomorfeus_ferret_ext/frt_q_span.c +179 -178
- data/ext/isomorfeus_ferret_ext/frt_q_term.c +47 -60
- data/ext/isomorfeus_ferret_ext/frt_q_wildcard.c +18 -16
- data/ext/isomorfeus_ferret_ext/frt_ram_store.c +45 -84
- data/ext/isomorfeus_ferret_ext/frt_search.c +105 -146
- data/ext/isomorfeus_ferret_ext/frt_search.h +331 -320
- data/ext/isomorfeus_ferret_ext/frt_similarity.c +5 -13
- data/ext/isomorfeus_ferret_ext/frt_similarity.h +7 -12
- data/ext/isomorfeus_ferret_ext/frt_sort.c +105 -149
- data/ext/isomorfeus_ferret_ext/frt_store.c +13 -7
- data/ext/isomorfeus_ferret_ext/frt_store.h +10 -2
- data/ext/isomorfeus_ferret_ext/frt_threading.h +0 -1
- data/ext/isomorfeus_ferret_ext/isomorfeus_ferret.c +21 -109
- data/ext/isomorfeus_ferret_ext/isomorfeus_ferret.h +2 -32
- data/ext/isomorfeus_ferret_ext/lz4.c +2495 -0
- data/ext/isomorfeus_ferret_ext/lz4.h +774 -0
- data/ext/isomorfeus_ferret_ext/lz4frame.c +1899 -0
- data/ext/isomorfeus_ferret_ext/lz4frame.h +623 -0
- data/ext/isomorfeus_ferret_ext/lz4hc.c +1615 -0
- data/ext/isomorfeus_ferret_ext/lz4hc.h +413 -0
- data/ext/isomorfeus_ferret_ext/lz4xxhash.c +1030 -0
- data/ext/isomorfeus_ferret_ext/lz4xxhash.h +328 -0
- data/ext/isomorfeus_ferret_ext/stem_modules.h +0 -86
- data/ext/isomorfeus_ferret_ext/test.c +1 -2
- data/ext/isomorfeus_ferret_ext/test_1710.c +11 -12
- data/ext/isomorfeus_ferret_ext/test_analysis.c +590 -583
- data/ext/isomorfeus_ferret_ext/test_compound_io.c +1 -1
- data/ext/isomorfeus_ferret_ext/test_document.c +19 -15
- data/ext/isomorfeus_ferret_ext/test_except.c +1 -2
- data/ext/isomorfeus_ferret_ext/test_fields.c +59 -60
- data/ext/isomorfeus_ferret_ext/test_file_deleter.c +10 -27
- data/ext/isomorfeus_ferret_ext/test_filter.c +11 -8
- data/ext/isomorfeus_ferret_ext/test_hash.c +2 -2
- data/ext/isomorfeus_ferret_ext/test_hashset.c +1 -1
- data/ext/isomorfeus_ferret_ext/test_highlighter.c +15 -11
- data/ext/isomorfeus_ferret_ext/test_index.c +372 -365
- data/ext/isomorfeus_ferret_ext/test_q_const_score.c +5 -3
- data/ext/isomorfeus_ferret_ext/test_q_filtered.c +5 -3
- data/ext/isomorfeus_ferret_ext/test_q_fuzzy.c +13 -10
- data/ext/isomorfeus_ferret_ext/test_q_parser.c +45 -7
- data/ext/isomorfeus_ferret_ext/test_q_span.c +15 -12
- data/ext/isomorfeus_ferret_ext/test_ram_store.c +3 -3
- data/ext/isomorfeus_ferret_ext/test_search.c +60 -62
- data/ext/isomorfeus_ferret_ext/test_segments.c +5 -4
- data/ext/isomorfeus_ferret_ext/test_sort.c +17 -14
- data/ext/isomorfeus_ferret_ext/test_store.c +2 -0
- data/ext/isomorfeus_ferret_ext/test_term.c +3 -1
- data/ext/isomorfeus_ferret_ext/test_term_vectors.c +9 -10
- data/ext/isomorfeus_ferret_ext/test_test.c +1 -2
- data/ext/isomorfeus_ferret_ext/test_threading.c +9 -10
- data/ext/isomorfeus_ferret_ext/testhelper.c +1 -2
- data/lib/isomorfeus/ferret/version.rb +1 -1
- metadata +27 -57
- data/ext/isomorfeus_ferret_ext/email.rl +0 -21
- data/ext/isomorfeus_ferret_ext/frt_scanner.c +0 -900
- data/ext/isomorfeus_ferret_ext/frt_scanner.h +0 -28
- data/ext/isomorfeus_ferret_ext/frt_scanner_mb.c +0 -6706
- data/ext/isomorfeus_ferret_ext/frt_scanner_utf8.c +0 -4420
- data/ext/isomorfeus_ferret_ext/scanner.h +0 -28
- data/ext/isomorfeus_ferret_ext/scanner.in +0 -43
- data/ext/isomorfeus_ferret_ext/scanner.rl +0 -84
- data/ext/isomorfeus_ferret_ext/scanner_mb.rl +0 -200
- data/ext/isomorfeus_ferret_ext/scanner_utf8.rl +0 -85
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_basque.c +0 -1167
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_basque.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_catalan.c +0 -1433
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_catalan.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.c +0 -301
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.c +0 -590
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.c +0 -1049
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.c +0 -705
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.c +0 -1239
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.c +0 -477
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.c +0 -1217
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.h +0 -7
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_indonesian.c +0 -394
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_indonesian.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_irish.c +0 -457
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_irish.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.c +0 -1009
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.c +0 -259
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.c +0 -704
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.c +0 -948
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.c +0 -1028
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.c +0 -275
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_hungarian.c +0 -849
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_hungarian.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.c +0 -952
- data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.c +0 -669
- data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.h +0 -6
- data/ext/isomorfeus_ferret_ext/stem_modules.txt +0 -63
- data/ext/isomorfeus_ferret_ext/uchar-ucs4.rl +0 -1854
- data/ext/isomorfeus_ferret_ext/uchar-utf8.rl +0 -1999
- data/ext/isomorfeus_ferret_ext/url.rl +0 -27
@@ -0,0 +1,328 @@
|
|
1
|
+
/*
|
2
|
+
xxHash - Extremely Fast Hash algorithm
|
3
|
+
Header File
|
4
|
+
Copyright (C) 2012-2016, Yann Collet.
|
5
|
+
|
6
|
+
BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
|
7
|
+
|
8
|
+
Redistribution and use in source and binary forms, with or without
|
9
|
+
modification, are permitted provided that the following conditions are
|
10
|
+
met:
|
11
|
+
|
12
|
+
* Redistributions of source code must retain the above copyright
|
13
|
+
notice, this list of conditions and the following disclaimer.
|
14
|
+
* Redistributions in binary form must reproduce the above
|
15
|
+
copyright notice, this list of conditions and the following disclaimer
|
16
|
+
in the documentation and/or other materials provided with the
|
17
|
+
distribution.
|
18
|
+
|
19
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
20
|
+
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
21
|
+
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
22
|
+
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
23
|
+
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
24
|
+
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
25
|
+
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
26
|
+
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
27
|
+
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
28
|
+
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
29
|
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
30
|
+
|
31
|
+
You can contact the author at :
|
32
|
+
- xxHash source repository : https://github.com/Cyan4973/xxHash
|
33
|
+
*/
|
34
|
+
|
35
|
+
/* Notice extracted from xxHash homepage :
|
36
|
+
|
37
|
+
xxHash is an extremely fast Hash algorithm, running at RAM speed limits.
|
38
|
+
It also successfully passes all tests from the SMHasher suite.
|
39
|
+
|
40
|
+
Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz)
|
41
|
+
|
42
|
+
Name Speed Q.Score Author
|
43
|
+
xxHash 5.4 GB/s 10
|
44
|
+
CrapWow 3.2 GB/s 2 Andrew
|
45
|
+
MurmurHash 3a 2.7 GB/s 10 Austin Appleby
|
46
|
+
SpookyHash 2.0 GB/s 10 Bob Jenkins
|
47
|
+
SBox 1.4 GB/s 9 Bret Mulvey
|
48
|
+
Lookup3 1.2 GB/s 9 Bob Jenkins
|
49
|
+
SuperFastHash 1.2 GB/s 1 Paul Hsieh
|
50
|
+
CityHash64 1.05 GB/s 10 Pike & Alakuijala
|
51
|
+
FNV 0.55 GB/s 5 Fowler, Noll, Vo
|
52
|
+
CRC32 0.43 GB/s 9
|
53
|
+
MD5-32 0.33 GB/s 10 Ronald L. Rivest
|
54
|
+
SHA1-32 0.28 GB/s 10
|
55
|
+
|
56
|
+
Q.Score is a measure of quality of the hash function.
|
57
|
+
It depends on successfully passing SMHasher test set.
|
58
|
+
10 is a perfect score.
|
59
|
+
|
60
|
+
A 64-bit version, named XXH64, is available since r35.
|
61
|
+
It offers much better speed, but for 64-bit applications only.
|
62
|
+
Name Speed on 64 bits Speed on 32 bits
|
63
|
+
XXH64 13.8 GB/s 1.9 GB/s
|
64
|
+
XXH32 6.8 GB/s 6.0 GB/s
|
65
|
+
*/
|
66
|
+
|
67
|
+
#ifndef XXHASH_H_5627135585666179
|
68
|
+
#define XXHASH_H_5627135585666179 1
|
69
|
+
|
70
|
+
#if defined (__cplusplus)
|
71
|
+
extern "C" {
|
72
|
+
#endif
|
73
|
+
|
74
|
+
|
75
|
+
/* ****************************
|
76
|
+
* Definitions
|
77
|
+
******************************/
|
78
|
+
#include <stddef.h> /* size_t */
|
79
|
+
typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
|
80
|
+
|
81
|
+
|
82
|
+
/* ****************************
|
83
|
+
* API modifier
|
84
|
+
******************************/
|
85
|
+
/** XXH_INLINE_ALL (and XXH_PRIVATE_API)
|
86
|
+
* This is useful to include xxhash functions in `static` mode
|
87
|
+
* in order to inline them, and remove their symbol from the public list.
|
88
|
+
* Inlining can offer dramatic performance improvement on small keys.
|
89
|
+
* Methodology :
|
90
|
+
* #define XXH_INLINE_ALL
|
91
|
+
* #include "lz4xxhash.h"
|
92
|
+
* `xxhash.c` is automatically included.
|
93
|
+
* It's not useful to compile and link it as a separate module.
|
94
|
+
*/
|
95
|
+
#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
|
96
|
+
# ifndef XXH_STATIC_LINKING_ONLY
|
97
|
+
# define XXH_STATIC_LINKING_ONLY
|
98
|
+
# endif
|
99
|
+
# if defined(__GNUC__)
|
100
|
+
# define XXH_PUBLIC_API static __inline __attribute__((unused))
|
101
|
+
# elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
|
102
|
+
# define XXH_PUBLIC_API static inline
|
103
|
+
# elif defined(_MSC_VER)
|
104
|
+
# define XXH_PUBLIC_API static __inline
|
105
|
+
# else
|
106
|
+
/* this version may generate warnings for unused static functions */
|
107
|
+
# define XXH_PUBLIC_API static
|
108
|
+
# endif
|
109
|
+
#else
|
110
|
+
# define XXH_PUBLIC_API /* do nothing */
|
111
|
+
#endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */
|
112
|
+
|
113
|
+
/*! XXH_NAMESPACE, aka Namespace Emulation :
|
114
|
+
*
|
115
|
+
* If you want to include _and expose_ xxHash functions from within your own library,
|
116
|
+
* but also want to avoid symbol collisions with other libraries which may also include xxHash,
|
117
|
+
*
|
118
|
+
* you can use XXH_NAMESPACE, to automatically prefix any public symbol from xxhash library
|
119
|
+
* with the value of XXH_NAMESPACE (therefore, avoid NULL and numeric values).
|
120
|
+
*
|
121
|
+
* Note that no change is required within the calling program as long as it includes `lz4xxhash.h` :
|
122
|
+
* regular symbol name will be automatically translated by this header.
|
123
|
+
*/
|
124
|
+
#ifdef XXH_NAMESPACE
|
125
|
+
# define XXH_CAT(A,B) A##B
|
126
|
+
# define XXH_NAME2(A,B) XXH_CAT(A,B)
|
127
|
+
# define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber)
|
128
|
+
# define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32)
|
129
|
+
# define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState)
|
130
|
+
# define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState)
|
131
|
+
# define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset)
|
132
|
+
# define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update)
|
133
|
+
# define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest)
|
134
|
+
# define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState)
|
135
|
+
# define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash)
|
136
|
+
# define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical)
|
137
|
+
# define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64)
|
138
|
+
# define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState)
|
139
|
+
# define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState)
|
140
|
+
# define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset)
|
141
|
+
# define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update)
|
142
|
+
# define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest)
|
143
|
+
# define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState)
|
144
|
+
# define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash)
|
145
|
+
# define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical)
|
146
|
+
#endif
|
147
|
+
|
148
|
+
|
149
|
+
/* *************************************
|
150
|
+
* Version
|
151
|
+
***************************************/
|
152
|
+
#define XXH_VERSION_MAJOR 0
|
153
|
+
#define XXH_VERSION_MINOR 6
|
154
|
+
#define XXH_VERSION_RELEASE 5
|
155
|
+
#define XXH_VERSION_NUMBER (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE)
|
156
|
+
XXH_PUBLIC_API unsigned XXH_versionNumber (void);
|
157
|
+
|
158
|
+
|
159
|
+
/*-**********************************************************************
|
160
|
+
* 32-bit hash
|
161
|
+
************************************************************************/
|
162
|
+
typedef unsigned int XXH32_hash_t;
|
163
|
+
|
164
|
+
/*! XXH32() :
|
165
|
+
Calculate the 32-bit hash of a sequence of "length" bytes stored at memory address "input".
|
166
|
+
The memory between input & input+length must be valid (allocated and read-accessible).
|
167
|
+
"seed" can be used to alter the result predictably.
|
168
|
+
Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s */
|
169
|
+
XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t length, unsigned int seed);
|
170
|
+
|
171
|
+
/*====== Streaming ======*/
|
172
|
+
typedef struct XXH32_state_s XXH32_state_t; /* incomplete type */
|
173
|
+
XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void);
|
174
|
+
XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr);
|
175
|
+
XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state);
|
176
|
+
|
177
|
+
XXH_PUBLIC_API XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, unsigned int seed);
|
178
|
+
XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);
|
179
|
+
XXH_PUBLIC_API XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr);
|
180
|
+
|
181
|
+
/*
|
182
|
+
* Streaming functions generate the xxHash of an input provided in multiple segments.
|
183
|
+
* Note that, for small input, they are slower than single-call functions, due to state management.
|
184
|
+
* For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized.
|
185
|
+
*
|
186
|
+
* XXH state must first be allocated, using XXH*_createState() .
|
187
|
+
*
|
188
|
+
* Start a new hash by initializing state with a seed, using XXH*_reset().
|
189
|
+
*
|
190
|
+
* Then, feed the hash state by calling XXH*_update() as many times as necessary.
|
191
|
+
* The function returns an error code, with 0 meaning OK, and any other value meaning there is an error.
|
192
|
+
*
|
193
|
+
* Finally, a hash value can be produced anytime, by using XXH*_digest().
|
194
|
+
* This function returns the nn-bit hash as an unsigned int (XXH32) or unsigned long long (XXH64).
|
195
|
+
*
|
196
|
+
* It's still possible to continue inserting input into the hash state after a digest,
|
197
|
+
* and generate some new hashes later on, by calling XXH*_digest() again.
|
198
|
+
*
|
199
|
+
* When done, free XXH state space if it was allocated dynamically.
|
200
|
+
*/
|
201
|
+
|
202
|
+
/*====== Canonical representation ======*/
|
203
|
+
|
204
|
+
typedef struct { unsigned char digest[4]; } XXH32_canonical_t;
|
205
|
+
XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash);
|
206
|
+
XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src);
|
207
|
+
|
208
|
+
/* The default result types for XXH functions are primitive unsigned 32-bit and 64-bit integers.
|
209
|
+
* The canonical representation uses human-readable write convention, aka big-endian (large digits first).
|
210
|
+
* These functions allow transformation of hash result into and from its canonical format.
|
211
|
+
* This way, hash values can be written into a file / memory, and remain comparable on different systems and programs.
|
212
|
+
*/
|
213
|
+
|
214
|
+
|
215
|
+
#ifndef XXH_NO_LONG_LONG
|
216
|
+
/*-**********************************************************************
|
217
|
+
* 64-bit hash
|
218
|
+
************************************************************************/
|
219
|
+
typedef unsigned long long XXH64_hash_t;
|
220
|
+
|
221
|
+
/*! XXH64() :
|
222
|
+
Calculate the 64-bit hash of a sequence of "length" bytes stored at memory address "input".
|
223
|
+
"seed" can be used to alter the result predictably.
|
224
|
+
This function runs faster on 64-bit systems, but slower on 32-bit systems (see benchmark).
|
225
|
+
*/
|
226
|
+
XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t length, unsigned long long seed);
|
227
|
+
|
228
|
+
/*====== Streaming ======*/
|
229
|
+
typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */
|
230
|
+
XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void);
|
231
|
+
XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr);
|
232
|
+
XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dst_state, const XXH64_state_t* src_state);
|
233
|
+
|
234
|
+
XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH64_state_t* statePtr, unsigned long long seed);
|
235
|
+
XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length);
|
236
|
+
XXH_PUBLIC_API XXH64_hash_t XXH64_digest (const XXH64_state_t* statePtr);
|
237
|
+
|
238
|
+
/*====== Canonical representation ======*/
|
239
|
+
typedef struct { unsigned char digest[8]; } XXH64_canonical_t;
|
240
|
+
XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash);
|
241
|
+
XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src);
|
242
|
+
#endif /* XXH_NO_LONG_LONG */
|
243
|
+
|
244
|
+
|
245
|
+
|
246
|
+
#ifdef XXH_STATIC_LINKING_ONLY
|
247
|
+
|
248
|
+
/* ================================================================================================
|
249
|
+
This section contains declarations which are not guaranteed to remain stable.
|
250
|
+
They may change in future versions, becoming incompatible with a different version of the library.
|
251
|
+
These declarations should only be used with static linking.
|
252
|
+
Never use them in association with dynamic linking !
|
253
|
+
=================================================================================================== */
|
254
|
+
|
255
|
+
/* These definitions are only present to allow
|
256
|
+
* static allocation of XXH state, on stack or in a struct for example.
|
257
|
+
* Never **ever** use members directly. */
|
258
|
+
|
259
|
+
#if !defined (__VMS) \
|
260
|
+
&& (defined (__cplusplus) \
|
261
|
+
|| (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
|
262
|
+
# include <stdint.h>
|
263
|
+
|
264
|
+
struct XXH32_state_s {
|
265
|
+
uint32_t total_len_32;
|
266
|
+
uint32_t large_len;
|
267
|
+
uint32_t v1;
|
268
|
+
uint32_t v2;
|
269
|
+
uint32_t v3;
|
270
|
+
uint32_t v4;
|
271
|
+
uint32_t mem32[4];
|
272
|
+
uint32_t memsize;
|
273
|
+
uint32_t reserved; /* never read nor write, might be removed in a future version */
|
274
|
+
}; /* typedef'd to XXH32_state_t */
|
275
|
+
|
276
|
+
struct XXH64_state_s {
|
277
|
+
uint64_t total_len;
|
278
|
+
uint64_t v1;
|
279
|
+
uint64_t v2;
|
280
|
+
uint64_t v3;
|
281
|
+
uint64_t v4;
|
282
|
+
uint64_t mem64[4];
|
283
|
+
uint32_t memsize;
|
284
|
+
uint32_t reserved[2]; /* never read nor write, might be removed in a future version */
|
285
|
+
}; /* typedef'd to XXH64_state_t */
|
286
|
+
|
287
|
+
# else
|
288
|
+
|
289
|
+
struct XXH32_state_s {
|
290
|
+
unsigned total_len_32;
|
291
|
+
unsigned large_len;
|
292
|
+
unsigned v1;
|
293
|
+
unsigned v2;
|
294
|
+
unsigned v3;
|
295
|
+
unsigned v4;
|
296
|
+
unsigned mem32[4];
|
297
|
+
unsigned memsize;
|
298
|
+
unsigned reserved; /* never read nor write, might be removed in a future version */
|
299
|
+
}; /* typedef'd to XXH32_state_t */
|
300
|
+
|
301
|
+
# ifndef XXH_NO_LONG_LONG /* remove 64-bit support */
|
302
|
+
struct XXH64_state_s {
|
303
|
+
unsigned long long total_len;
|
304
|
+
unsigned long long v1;
|
305
|
+
unsigned long long v2;
|
306
|
+
unsigned long long v3;
|
307
|
+
unsigned long long v4;
|
308
|
+
unsigned long long mem64[4];
|
309
|
+
unsigned memsize;
|
310
|
+
unsigned reserved[2]; /* never read nor write, might be removed in a future version */
|
311
|
+
}; /* typedef'd to XXH64_state_t */
|
312
|
+
# endif
|
313
|
+
|
314
|
+
# endif
|
315
|
+
|
316
|
+
|
317
|
+
#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
|
318
|
+
# include "xxhash.c" /* include xxhash function bodies as `static`, for inlining */
|
319
|
+
#endif
|
320
|
+
|
321
|
+
#endif /* XXH_STATIC_LINKING_ONLY */
|
322
|
+
|
323
|
+
|
324
|
+
#if defined (__cplusplus)
|
325
|
+
}
|
326
|
+
#endif
|
327
|
+
|
328
|
+
#endif /* XXHASH_H_5627135585666179 */
|
@@ -12,48 +12,29 @@
|
|
12
12
|
|
13
13
|
#include "stem_UTF_8_arabic.h"
|
14
14
|
#include "stem_UTF_8_armenian.h"
|
15
|
-
#include "stem_ISO_8859_1_basque.h"
|
16
15
|
#include "stem_UTF_8_basque.h"
|
17
|
-
#include "stem_ISO_8859_1_catalan.h"
|
18
16
|
#include "stem_UTF_8_catalan.h"
|
19
|
-
#include "stem_ISO_8859_1_danish.h"
|
20
17
|
#include "stem_UTF_8_danish.h"
|
21
|
-
#include "stem_ISO_8859_1_dutch.h"
|
22
18
|
#include "stem_UTF_8_dutch.h"
|
23
|
-
#include "stem_ISO_8859_1_english.h"
|
24
19
|
#include "stem_UTF_8_english.h"
|
25
|
-
#include "stem_ISO_8859_1_finnish.h"
|
26
20
|
#include "stem_UTF_8_finnish.h"
|
27
|
-
#include "stem_ISO_8859_1_french.h"
|
28
21
|
#include "stem_UTF_8_french.h"
|
29
|
-
#include "stem_ISO_8859_1_german.h"
|
30
22
|
#include "stem_UTF_8_german.h"
|
31
23
|
#include "stem_UTF_8_greek.h"
|
32
24
|
#include "stem_UTF_8_hindi.h"
|
33
|
-
#include "stem_ISO_8859_2_hungarian.h"
|
34
25
|
#include "stem_UTF_8_hungarian.h"
|
35
|
-
#include "stem_ISO_8859_1_indonesian.h"
|
36
26
|
#include "stem_UTF_8_indonesian.h"
|
37
|
-
#include "stem_ISO_8859_1_irish.h"
|
38
27
|
#include "stem_UTF_8_irish.h"
|
39
|
-
#include "stem_ISO_8859_1_italian.h"
|
40
28
|
#include "stem_UTF_8_italian.h"
|
41
29
|
#include "stem_UTF_8_lithuanian.h"
|
42
30
|
#include "stem_UTF_8_nepali.h"
|
43
|
-
#include "stem_ISO_8859_1_norwegian.h"
|
44
31
|
#include "stem_UTF_8_norwegian.h"
|
45
|
-
#include "stem_ISO_8859_1_porter.h"
|
46
32
|
#include "stem_UTF_8_porter.h"
|
47
|
-
#include "stem_ISO_8859_1_portuguese.h"
|
48
33
|
#include "stem_UTF_8_portuguese.h"
|
49
|
-
#include "stem_ISO_8859_2_romanian.h"
|
50
34
|
#include "stem_UTF_8_romanian.h"
|
51
|
-
#include "stem_KOI8_R_russian.h"
|
52
35
|
#include "stem_UTF_8_russian.h"
|
53
36
|
#include "stem_UTF_8_serbian.h"
|
54
|
-
#include "stem_ISO_8859_1_spanish.h"
|
55
37
|
#include "stem_UTF_8_spanish.h"
|
56
|
-
#include "stem_ISO_8859_1_swedish.h"
|
57
38
|
#include "stem_UTF_8_swedish.h"
|
58
39
|
#include "stem_UTF_8_tamil.h"
|
59
40
|
#include "stem_UTF_8_turkish.h"
|
@@ -61,9 +42,6 @@
|
|
61
42
|
|
62
43
|
typedef enum {
|
63
44
|
ENC_UNKNOWN=0,
|
64
|
-
ENC_ISO_8859_1,
|
65
|
-
ENC_ISO_8859_2,
|
66
|
-
ENC_KOI8_R,
|
67
45
|
ENC_UTF_8
|
68
46
|
} stemmer_encoding_t;
|
69
47
|
|
@@ -72,9 +50,6 @@ struct stemmer_encoding {
|
|
72
50
|
stemmer_encoding_t enc;
|
73
51
|
};
|
74
52
|
static const struct stemmer_encoding encodings[] = {
|
75
|
-
{"ISO_8859_1", ENC_ISO_8859_1},
|
76
|
-
{"ISO_8859_2", ENC_ISO_8859_2},
|
77
|
-
{"KOI8_R", ENC_KOI8_R},
|
78
53
|
{"UTF_8", ENC_UTF_8},
|
79
54
|
{0,ENC_UNKNOWN}
|
80
55
|
};
|
@@ -92,94 +67,54 @@ static const struct stemmer_modules modules[] = {
|
|
92
67
|
{"arabic", ENC_UTF_8, arabic_UTF_8_create_env, arabic_UTF_8_close_env, arabic_UTF_8_stem},
|
93
68
|
{"arm", ENC_UTF_8, armenian_UTF_8_create_env, armenian_UTF_8_close_env, armenian_UTF_8_stem},
|
94
69
|
{"armenian", ENC_UTF_8, armenian_UTF_8_create_env, armenian_UTF_8_close_env, armenian_UTF_8_stem},
|
95
|
-
{"baq", ENC_ISO_8859_1, basque_ISO_8859_1_create_env, basque_ISO_8859_1_close_env, basque_ISO_8859_1_stem},
|
96
70
|
{"baq", ENC_UTF_8, basque_UTF_8_create_env, basque_UTF_8_close_env, basque_UTF_8_stem},
|
97
|
-
{"basque", ENC_ISO_8859_1, basque_ISO_8859_1_create_env, basque_ISO_8859_1_close_env, basque_ISO_8859_1_stem},
|
98
71
|
{"basque", ENC_UTF_8, basque_UTF_8_create_env, basque_UTF_8_close_env, basque_UTF_8_stem},
|
99
|
-
{"ca", ENC_ISO_8859_1, catalan_ISO_8859_1_create_env, catalan_ISO_8859_1_close_env, catalan_ISO_8859_1_stem},
|
100
72
|
{"ca", ENC_UTF_8, catalan_UTF_8_create_env, catalan_UTF_8_close_env, catalan_UTF_8_stem},
|
101
|
-
{"cat", ENC_ISO_8859_1, catalan_ISO_8859_1_create_env, catalan_ISO_8859_1_close_env, catalan_ISO_8859_1_stem},
|
102
73
|
{"cat", ENC_UTF_8, catalan_UTF_8_create_env, catalan_UTF_8_close_env, catalan_UTF_8_stem},
|
103
|
-
{"catalan", ENC_ISO_8859_1, catalan_ISO_8859_1_create_env, catalan_ISO_8859_1_close_env, catalan_ISO_8859_1_stem},
|
104
74
|
{"catalan", ENC_UTF_8, catalan_UTF_8_create_env, catalan_UTF_8_close_env, catalan_UTF_8_stem},
|
105
|
-
{"da", ENC_ISO_8859_1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem},
|
106
75
|
{"da", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
|
107
|
-
{"dan", ENC_ISO_8859_1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem},
|
108
76
|
{"dan", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
|
109
|
-
{"danish", ENC_ISO_8859_1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem},
|
110
77
|
{"danish", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
|
111
|
-
{"de", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
|
112
78
|
{"de", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
|
113
|
-
{"deu", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
|
114
79
|
{"deu", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
|
115
|
-
{"dut", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
|
116
80
|
{"dut", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
|
117
|
-
{"dutch", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
|
118
81
|
{"dutch", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
|
119
82
|
{"el", ENC_UTF_8, greek_UTF_8_create_env, greek_UTF_8_close_env, greek_UTF_8_stem},
|
120
83
|
{"ell", ENC_UTF_8, greek_UTF_8_create_env, greek_UTF_8_close_env, greek_UTF_8_stem},
|
121
|
-
{"en", ENC_ISO_8859_1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
|
122
84
|
{"en", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
|
123
|
-
{"eng", ENC_ISO_8859_1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
|
124
85
|
{"eng", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
|
125
|
-
{"english", ENC_ISO_8859_1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
|
126
86
|
{"english", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
|
127
|
-
{"es", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
|
128
87
|
{"es", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
|
129
|
-
{"esl", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
|
130
88
|
{"esl", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
|
131
|
-
{"eu", ENC_ISO_8859_1, basque_ISO_8859_1_create_env, basque_ISO_8859_1_close_env, basque_ISO_8859_1_stem},
|
132
89
|
{"eu", ENC_UTF_8, basque_UTF_8_create_env, basque_UTF_8_close_env, basque_UTF_8_stem},
|
133
|
-
{"eus", ENC_ISO_8859_1, basque_ISO_8859_1_create_env, basque_ISO_8859_1_close_env, basque_ISO_8859_1_stem},
|
134
90
|
{"eus", ENC_UTF_8, basque_UTF_8_create_env, basque_UTF_8_close_env, basque_UTF_8_stem},
|
135
|
-
{"fi", ENC_ISO_8859_1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem},
|
136
91
|
{"fi", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
|
137
|
-
{"fin", ENC_ISO_8859_1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem},
|
138
92
|
{"fin", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
|
139
|
-
{"finnish", ENC_ISO_8859_1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem},
|
140
93
|
{"finnish", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
|
141
|
-
{"fr", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem},
|
142
94
|
{"fr", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
|
143
|
-
{"fra", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem},
|
144
95
|
{"fra", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
|
145
|
-
{"fre", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem},
|
146
96
|
{"fre", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
|
147
|
-
{"french", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem},
|
148
97
|
{"french", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
|
149
|
-
{"ga", ENC_ISO_8859_1, irish_ISO_8859_1_create_env, irish_ISO_8859_1_close_env, irish_ISO_8859_1_stem},
|
150
98
|
{"ga", ENC_UTF_8, irish_UTF_8_create_env, irish_UTF_8_close_env, irish_UTF_8_stem},
|
151
|
-
{"ger", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
|
152
99
|
{"ger", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
|
153
|
-
{"german", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
|
154
100
|
{"german", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
|
155
|
-
{"gle", ENC_ISO_8859_1, irish_ISO_8859_1_create_env, irish_ISO_8859_1_close_env, irish_ISO_8859_1_stem},
|
156
101
|
{"gle", ENC_UTF_8, irish_UTF_8_create_env, irish_UTF_8_close_env, irish_UTF_8_stem},
|
157
102
|
{"gre", ENC_UTF_8, greek_UTF_8_create_env, greek_UTF_8_close_env, greek_UTF_8_stem},
|
158
103
|
{"greek", ENC_UTF_8, greek_UTF_8_create_env, greek_UTF_8_close_env, greek_UTF_8_stem},
|
159
104
|
{"hi", ENC_UTF_8, hindi_UTF_8_create_env, hindi_UTF_8_close_env, hindi_UTF_8_stem},
|
160
105
|
{"hin", ENC_UTF_8, hindi_UTF_8_create_env, hindi_UTF_8_close_env, hindi_UTF_8_stem},
|
161
106
|
{"hindi", ENC_UTF_8, hindi_UTF_8_create_env, hindi_UTF_8_close_env, hindi_UTF_8_stem},
|
162
|
-
{"hu", ENC_ISO_8859_2, hungarian_ISO_8859_2_create_env, hungarian_ISO_8859_2_close_env, hungarian_ISO_8859_2_stem},
|
163
107
|
{"hu", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
|
164
|
-
{"hun", ENC_ISO_8859_2, hungarian_ISO_8859_2_create_env, hungarian_ISO_8859_2_close_env, hungarian_ISO_8859_2_stem},
|
165
108
|
{"hun", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
|
166
|
-
{"hungarian", ENC_ISO_8859_2, hungarian_ISO_8859_2_create_env, hungarian_ISO_8859_2_close_env, hungarian_ISO_8859_2_stem},
|
167
109
|
{"hungarian", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
|
168
110
|
{"hy", ENC_UTF_8, armenian_UTF_8_create_env, armenian_UTF_8_close_env, armenian_UTF_8_stem},
|
169
111
|
{"hye", ENC_UTF_8, armenian_UTF_8_create_env, armenian_UTF_8_close_env, armenian_UTF_8_stem},
|
170
|
-
{"id", ENC_ISO_8859_1, indonesian_ISO_8859_1_create_env, indonesian_ISO_8859_1_close_env, indonesian_ISO_8859_1_stem},
|
171
112
|
{"id", ENC_UTF_8, indonesian_UTF_8_create_env, indonesian_UTF_8_close_env, indonesian_UTF_8_stem},
|
172
|
-
{"ind", ENC_ISO_8859_1, indonesian_ISO_8859_1_create_env, indonesian_ISO_8859_1_close_env, indonesian_ISO_8859_1_stem},
|
173
113
|
{"ind", ENC_UTF_8, indonesian_UTF_8_create_env, indonesian_UTF_8_close_env, indonesian_UTF_8_stem},
|
174
|
-
{"indonesian", ENC_ISO_8859_1, indonesian_ISO_8859_1_create_env, indonesian_ISO_8859_1_close_env, indonesian_ISO_8859_1_stem},
|
175
114
|
{"indonesian", ENC_UTF_8, indonesian_UTF_8_create_env, indonesian_UTF_8_close_env, indonesian_UTF_8_stem},
|
176
|
-
{"irish", ENC_ISO_8859_1, irish_ISO_8859_1_create_env, irish_ISO_8859_1_close_env, irish_ISO_8859_1_stem},
|
177
115
|
{"irish", ENC_UTF_8, irish_UTF_8_create_env, irish_UTF_8_close_env, irish_UTF_8_stem},
|
178
|
-
{"it", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
|
179
116
|
{"it", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
|
180
|
-
{"ita", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
|
181
117
|
{"ita", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
|
182
|
-
{"italian", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
|
183
118
|
{"italian", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
|
184
119
|
{"lit", ENC_UTF_8, lithuanian_UTF_8_create_env, lithuanian_UTF_8_close_env, lithuanian_UTF_8_stem},
|
185
120
|
{"lithuanian", ENC_UTF_8, lithuanian_UTF_8_create_env, lithuanian_UTF_8_close_env, lithuanian_UTF_8_stem},
|
@@ -187,50 +122,29 @@ static const struct stemmer_modules modules[] = {
|
|
187
122
|
{"ne", ENC_UTF_8, nepali_UTF_8_create_env, nepali_UTF_8_close_env, nepali_UTF_8_stem},
|
188
123
|
{"nep", ENC_UTF_8, nepali_UTF_8_create_env, nepali_UTF_8_close_env, nepali_UTF_8_stem},
|
189
124
|
{"nepali", ENC_UTF_8, nepali_UTF_8_create_env, nepali_UTF_8_close_env, nepali_UTF_8_stem},
|
190
|
-
{"nl", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
|
191
125
|
{"nl", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
|
192
|
-
{"nld", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
|
193
126
|
{"nld", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
|
194
|
-
{"no", ENC_ISO_8859_1, norwegian_ISO_8859_1_create_env, norwegian_ISO_8859_1_close_env, norwegian_ISO_8859_1_stem},
|
195
127
|
{"no", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
|
196
|
-
{"nor", ENC_ISO_8859_1, norwegian_ISO_8859_1_create_env, norwegian_ISO_8859_1_close_env, norwegian_ISO_8859_1_stem},
|
197
128
|
{"nor", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
|
198
|
-
{"norwegian", ENC_ISO_8859_1, norwegian_ISO_8859_1_create_env, norwegian_ISO_8859_1_close_env, norwegian_ISO_8859_1_stem},
|
199
129
|
{"norwegian", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
|
200
|
-
{"por", ENC_ISO_8859_1, portuguese_ISO_8859_1_create_env, portuguese_ISO_8859_1_close_env, portuguese_ISO_8859_1_stem},
|
201
130
|
{"por", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
|
202
|
-
{"porter", ENC_ISO_8859_1, porter_ISO_8859_1_create_env, porter_ISO_8859_1_close_env, porter_ISO_8859_1_stem},
|
203
131
|
{"porter", ENC_UTF_8, porter_UTF_8_create_env, porter_UTF_8_close_env, porter_UTF_8_stem},
|
204
|
-
{"portuguese", ENC_ISO_8859_1, portuguese_ISO_8859_1_create_env, portuguese_ISO_8859_1_close_env, portuguese_ISO_8859_1_stem},
|
205
132
|
{"portuguese", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
|
206
|
-
{"pt", ENC_ISO_8859_1, portuguese_ISO_8859_1_create_env, portuguese_ISO_8859_1_close_env, portuguese_ISO_8859_1_stem},
|
207
133
|
{"pt", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
|
208
|
-
{"ro", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
|
209
134
|
{"ro", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
|
210
|
-
{"romanian", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
|
211
135
|
{"romanian", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
|
212
|
-
{"ron", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
|
213
136
|
{"ron", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
|
214
|
-
{"ru", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
|
215
137
|
{"ru", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
|
216
|
-
{"rum", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
|
217
138
|
{"rum", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
|
218
|
-
{"rus", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
|
219
139
|
{"rus", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
|
220
|
-
{"russian", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
|
221
140
|
{"russian", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
|
222
141
|
{"serbian", ENC_UTF_8, serbian_UTF_8_create_env, serbian_UTF_8_close_env, serbian_UTF_8_stem},
|
223
|
-
{"spa", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
|
224
142
|
{"spa", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
|
225
|
-
{"spanish", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
|
226
143
|
{"spanish", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
|
227
144
|
{"sr", ENC_UTF_8, serbian_UTF_8_create_env, serbian_UTF_8_close_env, serbian_UTF_8_stem},
|
228
145
|
{"srp", ENC_UTF_8, serbian_UTF_8_create_env, serbian_UTF_8_close_env, serbian_UTF_8_stem},
|
229
|
-
{"sv", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
|
230
146
|
{"sv", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
|
231
|
-
{"swe", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
|
232
147
|
{"swe", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
|
233
|
-
{"swedish", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
|
234
148
|
{"swedish", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
|
235
149
|
{"ta", ENC_UTF_8, tamil_UTF_8_create_env, tamil_UTF_8_close_env, tamil_UTF_8_stem},
|
236
150
|
{"tam", ENC_UTF_8, tamil_UTF_8_create_env, tamil_UTF_8_close_env, tamil_UTF_8_stem},
|
@@ -3,10 +3,10 @@
|
|
3
3
|
#include "testhelper.h"
|
4
4
|
#include <stdio.h>
|
5
5
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
6
|
+
extern rb_encoding *utf8_encoding;
|
7
|
+
|
8
|
+
static FrtFieldInfos *create_fis(void) {
|
9
|
+
FrtFieldInfos *fis = frt_fis_new(FRT_STORE_YES, FRT_COMPRESSION_NONE, FRT_INDEX_YES, FRT_TERM_VECTOR_WITH_POSITIONS_OFFSETS);
|
10
10
|
return fis;
|
11
11
|
}
|
12
12
|
|
@@ -15,21 +15,21 @@ static FrtIndexWriter *create_iw(FrtStore *store)
|
|
15
15
|
FrtFieldInfos *fis = create_fis();
|
16
16
|
frt_index_create(store, fis);
|
17
17
|
frt_fis_deref(fis);
|
18
|
-
return frt_iw_open(store, frt_standard_analyzer_new(true), &frt_default_config);
|
18
|
+
return frt_iw_open(NULL, store, frt_standard_analyzer_new(true), &frt_default_config);
|
19
19
|
}
|
20
20
|
|
21
|
-
static FrtDocument *prep_doc()
|
22
|
-
{
|
21
|
+
static FrtDocument *prep_doc(void) {
|
23
22
|
FrtDocument *doc = frt_doc_new();
|
23
|
+
rb_encoding *enc = utf8_encoding;
|
24
24
|
frt_doc_add_field(
|
25
25
|
doc,
|
26
26
|
frt_df_add_data(
|
27
27
|
frt_df_new(rb_intern("content")),
|
28
|
-
frt_estrdup("http://_____________________________________________________")
|
28
|
+
frt_estrdup("http://_____________________________________________________"),
|
29
|
+
enc
|
29
30
|
)
|
30
31
|
)->destroy_data = true;
|
31
32
|
return doc;
|
32
|
-
|
33
33
|
}
|
34
34
|
|
35
35
|
static void test_problem_text(TestCase *tc, void *data)
|
@@ -40,8 +40,7 @@ static void test_problem_text(TestCase *tc, void *data)
|
|
40
40
|
|
41
41
|
frt_iw_add_doc(iw, problem_text);
|
42
42
|
Aiequal(1, frt_iw_doc_count(iw));
|
43
|
-
Assert(!store->exists(store, "_0.cfs"),
|
44
|
-
"data shouldn't have been written yet");
|
43
|
+
Assert(!store->exists(store, "_0.cfs"), "data shouldn't have been written yet");
|
45
44
|
frt_iw_commit(iw);
|
46
45
|
Assert(store->exists(store, "_0.cfs"), "data should now be written");
|
47
46
|
frt_iw_close(iw);
|
@@ -50,7 +49,7 @@ static void test_problem_text(TestCase *tc, void *data)
|
|
50
49
|
|
51
50
|
TestSuite *ts_1710(TestSuite *suite)
|
52
51
|
{
|
53
|
-
FrtStore *store = frt_open_ram_store();
|
52
|
+
FrtStore *store = frt_open_ram_store(NULL);
|
54
53
|
|
55
54
|
suite = ADD_SUITE(suite);
|
56
55
|
|