isomorfeus-ferret 0.13.11 → 0.14.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/LICENSE +2 -2
- data/README.md +43 -22
- data/ext/isomorfeus_ferret_ext/frb_field_info.c +539 -0
- data/ext/isomorfeus_ferret_ext/frb_index.c +59 -687
- data/ext/isomorfeus_ferret_ext/frb_lazy_doc.c +705 -0
- data/ext/isomorfeus_ferret_ext/frb_qparser.c +1 -1
- data/ext/isomorfeus_ferret_ext/frb_search.c +0 -10
- data/ext/isomorfeus_ferret_ext/frb_store.c +1 -1
- data/ext/isomorfeus_ferret_ext/frt_hash.h +6 -8
- data/ext/isomorfeus_ferret_ext/frt_index.c +5 -2
- data/ext/isomorfeus_ferret_ext/frt_index.h +3 -1
- data/ext/isomorfeus_ferret_ext/isomorfeus_ferret.c +8 -6
- data/ext/isomorfeus_ferret_ext/isomorfeus_ferret.h +6 -6
- data/lib/isomorfeus/ferret/index/index.rb +13 -13
- data/lib/isomorfeus/ferret/version.rb +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5818fce6d84b9bd4814be3bbed270127e05297dcf85adeebc495c8f334430d88
|
4
|
+
data.tar.gz: 77c9c3246c7777947084b47620d3aeeeb9eb76d7b0a17a4d30a37a38547a54da
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 59632a0b46b9bd247da0f8b3908654a8027fbcef2aadc897f7681d25b03d4404191d037be323f666ef9bae679c72b135318aa853158e6bf0205b754ec3b2b18f
|
7
|
+
data.tar.gz: 2a037003347c6bca0900bf80410e83f43d397400f37e22f112e6ef6893a568dba29561b12594f803f3b28baee9f5f1ae67595c244d91b7dffa9d06e4e493c891
|
data/LICENSE
CHANGED
@@ -143,7 +143,7 @@ The following licenses apply to files, which are distributed within the repo
|
|
143
143
|
but not distributed with the gem and not used at runtime:
|
144
144
|
|
145
145
|
|
146
|
-
For the Reuter-21578 files in the misc/
|
146
|
+
For the Reuter-21578 files in the misc/ferret_vs_others directory (corpus, etc.),
|
147
147
|
used for research for developing search engine technology:
|
148
148
|
|
149
149
|
The copyright for the text of newswire articles and Reuters
|
@@ -156,7 +156,7 @@ Distribution 1.0", and inform your readers of the current location of
|
|
156
156
|
the data set (see "Availability & Questions").
|
157
157
|
|
158
158
|
|
159
|
-
Apache Lucene jars in the misc/
|
159
|
+
Apache Lucene jars in the misc/ferret_vs_others directory:
|
160
160
|
|
161
161
|
|
162
162
|
Apache License
|
data/README.md
CHANGED
@@ -11,13 +11,29 @@ At the [Isomorfeus Framework Project](https://isomorfeus.com)
|
|
11
11
|
|
12
12
|
## About this project
|
13
13
|
|
14
|
-
Isomorfeus-Ferret is a revived version of the original ferret gem created by Dave Balmain,
|
14
|
+
Isomorfeus-Ferret is a revived version of the original ferret gem created by Dave Balmain,
|
15
|
+
[https://github.com/dbalmain/ferret](https://github.com/dbalmain/ferret).
|
15
16
|
During revival many things have been fixed: all tests now pass, there are no crashes, and it
|
16
17
|
successfully compiles and runs with Ruby versions >3. It's no longer a goal to have
|
17
18
|
a C library available; instead, it is meant to be used as a Ruby gem with a C extension only.
|
18
19
|
|
19
20
|
It works on *nixes, *nuxes, *BSDs and also works on Windows and RaspberryPi.
|
20
21
|
|
22
|
+
## Improvements and Changes in Version 0.14
|
23
|
+
|
24
|
+
### Breaking
|
25
|
+
|
26
|
+
- The API for LazyDocs has changed: they are now read-only. LazyDoc#to_h may be used to create a hash that can be modified and reindexed as a document.
|
27
|
+
|
28
|
+
### Performance
|
29
|
+
|
30
|
+
- LazyDoc is now truly lazy, fields are automatically retrieved. LazyDoc#load is no longer required, but may be used to preload all fields.
|
31
|
+
- Index#each is now multiple times faster, depending on use case.
|
32
|
+
|
33
|
+
### Other
|
34
|
+
|
35
|
+
- The Index class now includes Enumerable
|
36
|
+
|
21
37
|
## Improvements and Changes in Version 0.13
|
22
38
|
|
23
39
|
### Breaking
|
@@ -53,7 +69,7 @@ Compression semantics have changed, now Brotli, BZip2 and LZ4 compression codecs
|
|
53
69
|
- LZ4: fast compression, fast decompression, low compression ratio
|
54
70
|
|
55
71
|
To see performance and compression ratios `rake ferret_compression_bench` can be run from the cloned repo.
|
56
|
-
It uses data and code within the misc/
|
72
|
+
It uses data and code within the misc/ferret_vs_others directory.
|
57
73
|
|
58
74
|
To compress a stored field the :compression option can be used with one of: :no, :brotli, :bz2 or :lz4.
|
59
75
|
Example:
|
@@ -80,7 +96,7 @@ https://github.com/isomorfeus/isomorfeus-ferret/blob/master/lib/isomorfeus/ferre
|
|
80
96
|
The query language and parser are documented here:
|
81
97
|
https://github.com/isomorfeus/isomorfeus-ferret/blob/master/ext/isomorfeus_ferret_ext/frb_qparser.c
|
82
98
|
|
83
|
-
Examples can be found in the 'test' directory or in 'misc/
|
99
|
+
Examples can be found in the 'test' directory or in 'misc/ferret_vs_others'.
|
84
100
|
|
85
101
|
## Running Specs
|
86
102
|
|
@@ -95,41 +111,46 @@ Ensure your locale is set to C.UTF-8, because the internal c tests don't know ho
|
|
95
111
|
### Indexing and Searching
|
96
112
|
- clone repo
|
97
113
|
- bundle install
|
98
|
-
- rake
|
114
|
+
- rake ferret_vs_others
|
99
115
|
|
100
116
|
A recent Java JDK must be installed to compile and run lucene benchmarks.
|
101
117
|
|
102
|
-
Results, Ferret 0.
|
103
|
-
Linux Ubuntu
|
118
|
+
Results, Ferret 0.14.0 vs. Lucene 9.2.0, WhitespaceAnalyzer,
|
119
|
+
Linux Ubuntu 22.04, FreeBSD 13.1 and Windows 10 on old Intel Core i5 from 2015,
|
104
120
|
LinuxPi on RaspberryPi 400:
|
105
121
|
|
106
122
|
| OS | Task | Ferret | Lucene* |
|
107
123
|
|---------|------------|-----------------|----------------|
|
108
|
-
| Linux | Indexing |
|
109
|
-
| FreeBSD | Indexing |
|
110
|
-
| Windows | Indexing |
|
111
|
-
| LinuxPi | Indexing |
|
112
|
-
| Linux | Searching |
|
113
|
-
| FreeBSD | Searching |
|
114
|
-
| Windows | Searching |
|
115
|
-
| LinuxPi | Searching |
|
124
|
+
| Linux | Indexing | 5125 docs/s | 4959 docs/s |
|
125
|
+
| FreeBSD | Indexing | 4537 docs/s | 3831 docs/s |
|
126
|
+
| Windows | Indexing | 2488 docs/s | 2588 docs/s |
|
127
|
+
| LinuxPi | Indexing | 1200 docs/s | 755 docs/s |
|
128
|
+
| Linux | Searching | 26610 queries/s | 7165 queries/s |
|
129
|
+
| FreeBSD | Searching | 24167 queries/s | 4288 queries/s |
|
130
|
+
| Windows | Searching | 3901 queries/s | 1033 queries/s |
|
131
|
+
| LinuxPi | Searching | 6194 queries/s | 785 queries/s |
|
116
132
|
| | Index Size | 28 MB | 35 MB |
|
117
133
|
|
118
|
-
*
|
134
|
+
* JVM Versions:
|
135
|
+
OpenJDK Runtime Environment (build 18-ea+36-Ubuntu-1) (Linux)
|
136
|
+
OpenJDK Runtime Environment (build 17.0.3+7-Raspbian-1deb11u1rpt1) (LinuxPi)
|
137
|
+
OpenJDK Runtime Environment Temurin-18.0.1+10 (build 18.0.1+10) (Windows)
|
138
|
+
OpenJDK Runtime Environment (build 17.0.2+8-1) (FreeBSD)
|
119
139
|
|
120
140
|
### Storing Fields with Compression, Indexing and Retrieval
|
141
|
+
|
121
142
|
- clone repo
|
122
143
|
- bundle install
|
123
144
|
- rake ferret_compression_benchmark
|
124
145
|
|
125
|
-
Results on Linux, 0.
|
146
|
+
Results on Linux, 0.14.0, on old Intel Core i5 from 2015:
|
126
147
|
|
127
|
-
| Compression | Index & Store | Retrieve
|
128
|
-
|
129
|
-
| none |
|
130
|
-
| brotli |
|
131
|
-
| bzip2 |
|
132
|
-
| lz4 |
|
148
|
+
| Compression | Index & Store | Retrieve Title | Index size |
|
149
|
+
|-------------|---------------|----------------|------------|
|
150
|
+
| none | 4862 docs/s | 278827 docs/s | 43 MB |
|
151
|
+
| brotli | 3559 docs/s | 178170 docs/s | 36 MB |
|
152
|
+
| bzip2 | 2628 docs/s | 81877 docs/s | 38 MB |
|
153
|
+
| lz4 | 4648 docs/s | 232236 docs/s | 41 MB |
|
133
154
|
|
134
155
|
## Future
|
135
156
|
|
@@ -0,0 +1,539 @@
|
|
1
|
+
#include "frt_index.h"
|
2
|
+
#include "isomorfeus_ferret.h"
|
3
|
+
|
4
|
+
VALUE cFieldInfo;
|
5
|
+
|
6
|
+
static VALUE sym_store;
|
7
|
+
static VALUE sym_index;
|
8
|
+
static VALUE sym_compression;
|
9
|
+
static VALUE sym_brotli;
|
10
|
+
static VALUE sym_bz2;
|
11
|
+
static VALUE sym_lz4;
|
12
|
+
static VALUE sym_term_vector;
|
13
|
+
static VALUE sym_omit_norms;
|
14
|
+
static VALUE sym_untokenized;
|
15
|
+
static VALUE sym_untokenized_omit_norms;
|
16
|
+
static VALUE sym_with_offsets;
|
17
|
+
static VALUE sym_with_positions;
|
18
|
+
static VALUE sym_with_positions_offsets;
|
19
|
+
|
20
|
+
extern VALUE sym_boost;
|
21
|
+
|
22
|
+
/*
 * Type-check a single option value: nil, true, false and Symbols are
 * accepted, anything else raises TypeError through Check_Type.
 *
 * Booleans must be let through here because the option parsing below
 * explicitly compares against Qtrue/Qfalse; checking for T_SYMBOL
 * unconditionally would make those comparisons unreachable.
 */
static void frb_fi_check_option_type(VALUE v) {
    if (v != Qnil && v != Qtrue && v != Qfalse) {
        Check_Type(v, T_SYMBOL);
    }
}

/*
 * Read the FieldInfo properties out of the Ruby options hash +roptions+.
 *
 * roptions    - must be a Hash, otherwise TypeError is raised.
 * store       - set from :store       (:no/:false/false, :yes/:true/true).
 * compression - set from :compression (:no/:false/false, :yes/:true/true
 *               and :brotli select Brotli, :bz2, :lz4).
 * index       - set from :index       (:no, :yes, :untokenized,
 *               :omit_norms, :untokenized_omit_norms).
 * term_vector - set from :term_vector (:no, :yes, :with_positions,
 *               :with_offsets, :with_positions_offsets).
 * boost       - always written: the numeric value of :boost, or 1.0f when
 *               the key is absent.
 *
 * For every property other than boost, a missing (nil) key leaves the
 * caller's value untouched, so callers pre-load their defaults. The one
 * exception: when :term_vector is absent and *index is FRT_INDEX_NO, the
 * term vector is forced to FRT_TERM_VECTOR_NO.
 *
 * Raises ArgumentError for a Symbol that is not a valid choice and
 * TypeError for a value that is neither nil, a boolean nor a Symbol.
 */
void frb_fi_get_params(VALUE roptions, FrtStoreValue *store, FrtCompressionType *compression, FrtIndexValue *index, FrtTermVectorValue *term_vector, float *boost) {
    VALUE v;
    Check_Type(roptions, T_HASH);

    v = rb_hash_aref(roptions, sym_boost);
    if (Qnil != v) {
        *boost = (float)NUM2DBL(v);
    } else {
        *boost = 1.0f;
    }

    v = rb_hash_aref(roptions, sym_store);
    frb_fi_check_option_type(v);
    if (v == sym_no || v == sym_false || v == Qfalse) {
        *store = FRT_STORE_NO;
    } else if (v == sym_yes || v == sym_true || v == Qtrue) {
        *store = FRT_STORE_YES;
    } else if (v == Qnil) {
        /* leave as default */
    } else {
        /* only Symbols can reach this branch, so SYM2ID is safe */
        rb_raise(rb_eArgError, ":%s isn't a valid argument for :store. Please choose from [:yes, :no]",
                 rb_id2name(SYM2ID(v)));
    }

    v = rb_hash_aref(roptions, sym_compression);
    frb_fi_check_option_type(v);
    if (v == sym_no || v == sym_false || v == Qfalse) {
        *compression = FRT_COMPRESSION_NONE;
    } else if (v == sym_yes || v == sym_true || v == Qtrue || v == sym_brotli) {
        *compression = FRT_COMPRESSION_BROTLI;
    } else if (v == sym_bz2) {
        *compression = FRT_COMPRESSION_BZ2;
    } else if (v == sym_lz4) {
        *compression = FRT_COMPRESSION_LZ4;
    } else if (v == Qnil) {
        /* leave as default */
    } else {
        rb_raise(rb_eArgError, ":%s isn't a valid argument for :compression. Please choose from [:yes, :no, :brotli, :bz2, :lz4]",
                 rb_id2name(SYM2ID(v)));
    }

    v = rb_hash_aref(roptions, sym_index);
    frb_fi_check_option_type(v);
    if (v == sym_no || v == sym_false || v == Qfalse) {
        *index = FRT_INDEX_NO;
    } else if (v == sym_yes || v == sym_true || v == Qtrue) {
        *index = FRT_INDEX_YES;
    } else if (v == sym_untokenized) {
        *index = FRT_INDEX_UNTOKENIZED;
    } else if (v == sym_omit_norms) {
        *index = FRT_INDEX_YES_OMIT_NORMS;
    } else if (v == sym_untokenized_omit_norms) {
        *index = FRT_INDEX_UNTOKENIZED_OMIT_NORMS;
    } else if (v == Qnil) {
        /* leave as default */
    } else {
        rb_raise(rb_eArgError, ":%s isn't a valid argument for :index. Please choose from [:no, :yes, :untokenized, "
                 ":omit_norms, :untokenized_omit_norms]", rb_id2name(SYM2ID(v)));
    }

    v = rb_hash_aref(roptions, sym_term_vector);
    frb_fi_check_option_type(v);
    if (v == sym_no || v == sym_false || v == Qfalse) {
        *term_vector = FRT_TERM_VECTOR_NO;
    } else if (v == sym_yes || v == sym_true || v == Qtrue) {
        *term_vector = FRT_TERM_VECTOR_YES;
    } else if (v == sym_with_positions) {
        *term_vector = FRT_TERM_VECTOR_WITH_POSITIONS;
    } else if (v == sym_with_offsets) {
        *term_vector = FRT_TERM_VECTOR_WITH_OFFSETS;
    } else if (v == sym_with_positions_offsets) {
        *term_vector = FRT_TERM_VECTOR_WITH_POSITIONS_OFFSETS;
    } else if (v == Qnil) {
        /* leave as default, except that an unindexed field gets no term vector */
        if (*index == FRT_INDEX_NO) *term_vector = FRT_TERM_VECTOR_NO;
    } else {
        rb_raise(rb_eArgError, ":%s isn't a valid argument for :term_vector. Please choose from [:no, :yes, "
                 ":with_positions, :with_offsets, :with_positions_offsets]", rb_id2name(SYM2ID(v)));
    }
}
|
100
|
+
|
101
|
+
static void frb_fi_free(void *p) {
|
102
|
+
frt_fi_deref((FrtFieldInfo *)p);
|
103
|
+
}
|
104
|
+
|
105
|
+
static size_t frb_fi_size(const void *p) {
|
106
|
+
return sizeof(FrtFieldInfo);
|
107
|
+
(void)p;
|
108
|
+
}
|
109
|
+
|
110
|
+
const rb_data_type_t frb_field_info_t = {
|
111
|
+
.wrap_struct_name = "FrbFieldInfo",
|
112
|
+
.function = {
|
113
|
+
.dmark = NULL,
|
114
|
+
.dfree = frb_fi_free,
|
115
|
+
.dsize = frb_fi_size,
|
116
|
+
.dcompact = NULL,
|
117
|
+
.reserved = {0},
|
118
|
+
},
|
119
|
+
.parent = NULL,
|
120
|
+
.data = NULL,
|
121
|
+
.flags = RUBY_TYPED_FREE_IMMEDIATELY
|
122
|
+
};
|
123
|
+
|
124
|
+
/*
 * Return the Ruby FieldInfo object for +fi+, or nil when +fi+ is NULL.
 *
 * The wrapper is created on first use and cached in fi->rfi; FRT_REF is
 * applied once at that point (presumably taking the reference that
 * frb_fi_free later drops via frt_fi_deref -- confirm against FRT_REF).
 */
VALUE frb_get_field_info(FrtFieldInfo *fi) {
    if (!fi) {
        return Qnil;
    }

    /* rfi is treated as "not wrapped yet" when it is 0 or nil */
    if (fi->rfi == 0 || fi->rfi == Qnil) {
        fi->rfi = TypedData_Wrap_Struct(cFieldInfo, &frb_field_info_t, fi);
        FRT_REF(fi);
    }
    return fi->rfi;
}
|
134
|
+
|
135
|
+
/*
|
136
|
+
* call-seq:
|
137
|
+
* FieldInfo.new(name, options = {}) -> field_info
|
138
|
+
*
|
139
|
+
* Create a new FieldInfo object with the name +name+ and the properties
|
140
|
+
* specified in +options+. The available options are [:store, :compression,
|
141
|
+
* :index, :term_vector, :boost]. See the description of FieldInfo for more
|
142
|
+
* information on these properties.
|
143
|
+
*/
|
144
|
+
/*
 * Allocator for FieldInfo: obtains a FrtFieldInfo from frt_fi_alloc and
 * wraps it in an instance of +rclass+. The struct is filled in later by
 * frb_fi_init.
 */
static VALUE frb_fi_alloc(VALUE rclass) {
    FrtFieldInfo *fresh = frt_fi_alloc();
    VALUE wrapper = TypedData_Wrap_Struct(rclass, &frb_field_info_t, fresh);
    return wrapper;
}
|
148
|
+
|
149
|
+
/*
 * FieldInfo#initialize(name, options = {}).
 *
 * Starts from the defaults below, lets frb_fi_get_params override them when
 * an options argument was passed, then initializes the wrapped FrtFieldInfo
 * and records +self+ as its Ruby wrapper.
 */
static VALUE frb_fi_init(int argc, VALUE *argv, VALUE self) {
    FrtFieldInfo *fi;
    VALUE rname, roptions;

    /* defaults for options the caller does not supply */
    FrtStoreValue store = FRT_STORE_YES;
    FrtCompressionType compression = FRT_COMPRESSION_NONE;
    FrtIndexValue index = FRT_INDEX_YES;
    FrtTermVectorValue term_vector = FRT_TERM_VECTOR_WITH_POSITIONS_OFFSETS;
    float boost = 1.0f;

    TypedData_Get_Struct(self, FrtFieldInfo, &frb_field_info_t, fi);

    /* one required argument (name), one optional (options) */
    rb_scan_args(argc, argv, "11", &rname, &roptions);
    if (argc >= 2) {
        frb_fi_get_params(roptions, &store, &compression, &index, &term_vector, &boost);
    }

    fi = frt_fi_init(fi, frb_field(rname), store, compression, index, term_vector);
    fi->boost = boost;
    fi->rfi = self;
    return self;
}
|
168
|
+
|
169
|
+
/*
|
170
|
+
* call-seq:
|
171
|
+
* fi.name -> symbol
|
172
|
+
*
|
173
|
+
* Return the name of the field
|
174
|
+
*/
|
175
|
+
static VALUE frb_fi_name(VALUE self) {
|
176
|
+
FrtFieldInfo *fi = (FrtFieldInfo *)DATA_PTR(self);
|
177
|
+
return ID2SYM(fi->name);
|
178
|
+
}
|
179
|
+
|
180
|
+
/*
|
181
|
+
* call-seq:
|
182
|
+
* fi.stored? -> bool
|
183
|
+
*
|
184
|
+
* Return true if the field is stored in the index.
|
185
|
+
*/
|
186
|
+
static VALUE frb_fi_is_stored(VALUE self) {
|
187
|
+
FrtFieldInfo *fi = (FrtFieldInfo *)DATA_PTR(self);
|
188
|
+
return fi_is_stored(fi) ? Qtrue : Qfalse;
|
189
|
+
}
|
190
|
+
|
191
|
+
/*
|
192
|
+
* call-seq:
|
193
|
+
* fi.compressed? -> bool
|
194
|
+
*
|
195
|
+
* Return true if the field is stored in the index in compressed format.
|
196
|
+
*/
|
197
|
+
static VALUE frb_fi_is_compressed(VALUE self) {
|
198
|
+
FrtFieldInfo *fi = (FrtFieldInfo *)DATA_PTR(self);
|
199
|
+
return fi_is_compressed(fi) ? Qtrue : Qfalse;
|
200
|
+
}
|
201
|
+
|
202
|
+
/*
|
203
|
+
* call-seq:
|
204
|
+
* fi.indexed? -> bool
|
205
|
+
*
|
206
|
+
* Return true if the field is indexed, ie searchable in the index.
|
207
|
+
*/
|
208
|
+
static VALUE frb_fi_is_indexed(VALUE self) {
|
209
|
+
FrtFieldInfo *fi = (FrtFieldInfo *)DATA_PTR(self);
|
210
|
+
return fi_is_indexed(fi) ? Qtrue : Qfalse;
|
211
|
+
}
|
212
|
+
|
213
|
+
/*
|
214
|
+
* call-seq:
|
215
|
+
* fi.tokenized? -> bool
|
216
|
+
*
|
217
|
+
* Return true if the field is tokenized. Tokenizing is the process of
|
218
|
+
* breaking the field up into tokens. That is "the quick brown fox" becomes:
|
219
|
+
*
|
220
|
+
* ["the", "quick", "brown", "fox"]
|
221
|
+
*
|
222
|
+
* A field can only be tokenized if it is indexed.
|
223
|
+
*/
|
224
|
+
static VALUE frb_fi_is_tokenized(VALUE self) {
|
225
|
+
FrtFieldInfo *fi = (FrtFieldInfo *)DATA_PTR(self);
|
226
|
+
return fi_is_tokenized(fi) ? Qtrue : Qfalse;
|
227
|
+
}
|
228
|
+
|
229
|
+
/*
|
230
|
+
* call-seq:
|
231
|
+
* fi.omit_norms? -> bool
|
232
|
+
*
|
233
|
+
* Return true if the field omits the norm file. The norm file is the file
|
234
|
+
* used to store the field boosts for an indexed field. If you do not boost
|
235
|
+
* any fields, and you can live without scoring based on field length then
|
236
|
+
* you can omit the norms file. This will give the index a slight performance
|
237
|
+
* boost and it will use less memory, especially for indexes which have a
|
238
|
+
* large number of documents.
|
239
|
+
*/
|
240
|
+
static VALUE frb_fi_omit_norms(VALUE self) {
|
241
|
+
FrtFieldInfo *fi = (FrtFieldInfo *)DATA_PTR(self);
|
242
|
+
return fi_omit_norms(fi) ? Qtrue : Qfalse;
|
243
|
+
}
|
244
|
+
|
245
|
+
/*
|
246
|
+
* call-seq:
|
247
|
+
* fi.store_term_vector? -> bool
|
248
|
+
*
|
249
|
+
* Return true if the term-vectors are stored for this field.
|
250
|
+
*/
|
251
|
+
static VALUE frb_fi_store_term_vector(VALUE self) {
|
252
|
+
FrtFieldInfo *fi = (FrtFieldInfo *)DATA_PTR(self);
|
253
|
+
return fi_store_term_vector(fi) ? Qtrue : Qfalse;
|
254
|
+
}
|
255
|
+
|
256
|
+
/*
|
257
|
+
* call-seq:
|
258
|
+
* fi.store_positions? -> bool
|
259
|
+
*
|
260
|
+
* Return true if positions are stored with the term-vectors for this field.
|
261
|
+
*/
|
262
|
+
static VALUE frb_fi_store_positions(VALUE self) {
|
263
|
+
FrtFieldInfo *fi = (FrtFieldInfo *)DATA_PTR(self);
|
264
|
+
return fi_store_positions(fi) ? Qtrue : Qfalse;
|
265
|
+
}
|
266
|
+
|
267
|
+
/*
|
268
|
+
* call-seq:
|
269
|
+
* fi.store_offsets? -> bool
|
270
|
+
*
|
271
|
+
* Return true if offsets are stored with the term-vectors for this field.
|
272
|
+
*/
|
273
|
+
static VALUE frb_fi_store_offsets(VALUE self) {
|
274
|
+
FrtFieldInfo *fi = (FrtFieldInfo *)DATA_PTR(self);
|
275
|
+
return fi_store_offsets(fi) ? Qtrue : Qfalse;
|
276
|
+
}
|
277
|
+
|
278
|
+
/*
|
279
|
+
* call-seq:
|
280
|
+
* fi.has_norms? -> bool
|
281
|
+
*
|
282
|
+
* Return true if this field has a norms file. This is the same as calling;
|
283
|
+
*
|
284
|
+
* fi.indexed? and not fi.omit_norms?
|
285
|
+
*/
|
286
|
+
static VALUE frb_fi_has_norms(VALUE self) {
|
287
|
+
FrtFieldInfo *fi = (FrtFieldInfo *)DATA_PTR(self);
|
288
|
+
return fi_has_norms(fi) ? Qtrue : Qfalse;
|
289
|
+
}
|
290
|
+
|
291
|
+
/*
|
292
|
+
* call-seq:
|
293
|
+
* fi.boost -> boost
|
294
|
+
*
|
295
|
+
* Return the default boost for this field
|
296
|
+
*/
|
297
|
+
static VALUE frb_fi_boost(VALUE self) {
|
298
|
+
FrtFieldInfo *fi = (FrtFieldInfo *)DATA_PTR(self);
|
299
|
+
return rb_float_new((double)fi->boost);
|
300
|
+
}
|
301
|
+
|
302
|
+
/*
|
303
|
+
* call-seq:
|
304
|
+
* fi.to_s -> string
|
305
|
+
*
|
306
|
+
* Return a string representation of the FieldInfo object.
|
307
|
+
*/
|
308
|
+
static VALUE frb_fi_to_s(VALUE self) {
|
309
|
+
FrtFieldInfo *fi = (FrtFieldInfo *)DATA_PTR(self);
|
310
|
+
char *fi_s = frt_fi_to_s(fi);
|
311
|
+
VALUE rfi_s = rb_str_new2(fi_s);
|
312
|
+
free(fi_s);
|
313
|
+
return rfi_s;
|
314
|
+
}
|
315
|
+
|
316
|
+
/*
|
317
|
+
* call-seq:
|
318
|
+
* fi.to_h -> Hssh
|
319
|
+
*
|
320
|
+
* Return a Hash representation of the FieldInfo object.
|
321
|
+
*/
|
322
|
+
static VALUE frb_fi_to_h(VALUE self) {
|
323
|
+
FrtFieldInfo *fi = (FrtFieldInfo *)DATA_PTR(self);
|
324
|
+
VALUE hash = rb_hash_new();
|
325
|
+
VALUE val;
|
326
|
+
bool o;
|
327
|
+
|
328
|
+
// :index
|
329
|
+
if (!fi_is_indexed(fi)) val = sym_no;
|
330
|
+
else {
|
331
|
+
bool t = fi_is_tokenized(fi);
|
332
|
+
o = fi_omit_norms(fi);
|
333
|
+
if (!t && o) val = sym_untokenized_omit_norms;
|
334
|
+
else if (t && o) val = sym_omit_norms;
|
335
|
+
else if (!t && !o) val = sym_untokenized;
|
336
|
+
else val = sym_yes;
|
337
|
+
}
|
338
|
+
rb_hash_aset(hash, sym_index, val);
|
339
|
+
|
340
|
+
// :store
|
341
|
+
rb_hash_aset(hash, sym_store, fi_is_stored(fi) ? sym_yes : sym_no);
|
342
|
+
|
343
|
+
// :compress
|
344
|
+
if (!fi_is_compressed(fi)) val = sym_no;
|
345
|
+
else {
|
346
|
+
if (fi_is_compressed_brotli(fi)) val = sym_brotli;
|
347
|
+
else if (fi_is_compressed_bz2(fi)) val = sym_bz2;
|
348
|
+
else if (fi_is_compressed_lz4(fi)) val = sym_lz4;
|
349
|
+
else val = sym_yes;
|
350
|
+
}
|
351
|
+
rb_hash_aset(hash, sym_compression, val);
|
352
|
+
|
353
|
+
// :term_vector
|
354
|
+
if (!fi_store_term_vector(fi)) val = sym_no;
|
355
|
+
else {
|
356
|
+
bool p = fi_store_positions(fi);
|
357
|
+
o = fi_store_offsets(fi);
|
358
|
+
if (p && o) val = sym_with_positions_offsets;
|
359
|
+
else if (o) val = sym_with_offsets;
|
360
|
+
else if (p) val = sym_with_positions;
|
361
|
+
else val = sym_yes;
|
362
|
+
}
|
363
|
+
rb_hash_aset(hash, sym_term_vector, val);
|
364
|
+
|
365
|
+
// :boost
|
366
|
+
rb_hash_aset(hash, sym_boost, rb_float_new((double)fi->boost));
|
367
|
+
|
368
|
+
return hash;
|
369
|
+
}
|
370
|
+
|
371
|
+
/*
|
372
|
+
* Document-class: Ferret::Index::FieldInfo
|
373
|
+
*
|
374
|
+
* == Summary
|
375
|
+
*
|
376
|
+
* The FieldInfo class is the field descriptor for the index. It specifies
|
377
|
+
* whether a field is compressed or not or whether it should be indexed and
|
378
|
+
* tokenized. Every field has a name which must be a symbol. There are four
* properties that you can set, +:store+, +:compression+, +:index+ and
* +:term_vector+. You can also set the default +:boost+ for a field.
|
381
|
+
*
|
382
|
+
* == Properties
|
383
|
+
*
|
384
|
+
* === :store
|
385
|
+
*
|
386
|
+
* The +:store+ property allows you to specify whether a field is stored. You
* can leave a field unstored (+:no+) or store it (+:yes+); whether a stored
* field is compressed is chosen separately with the +:compression+ property.
* By default the document
|
389
|
+
* is stored in its original format. If the field is large and it is stored
|
390
|
+
* elsewhere where it is easily accessible you might want to leave it
|
391
|
+
* unstored. This will keep the index size a lot smaller and make the
|
392
|
+
* indexing process a lot faster. For example, you should probably leave the
|
393
|
+
* +:content+ field unstored when indexing all the documents in your
|
394
|
+
* file-system.
|
395
|
+
*
|
396
|
+
* === :index
|
397
|
+
*
|
398
|
+
* The +:index+ property allows you to specify how a field is indexed. A
|
399
|
+
* field must be indexed to be searchable. However, a field doesn't need to
|
400
|
+
* be indexed to be store in the Ferret index. You may want to use the index
|
401
|
+
* as a simple database and store things like images or MP3s in the index. By
|
402
|
+
* default each field is indexed and tokenized (split into tokens) (+:yes+).
|
403
|
+
* If you don't want to index the field use +:no+. If you want the field
|
404
|
+
* indexed but not tokenized, use +:untokenized+. Do this for the fields you
|
405
|
+
* wish to sort by. There are two other values for +:index+; +:omit_norms+
|
406
|
+
* and +:untokenized_omit_norms+. These values correspond to +:yes+ and
|
407
|
+
* +:untokenized+ respectively and are useful if you are not boosting any
|
408
|
+
* fields and you'd like to speed up the index. The norms file is the file
|
409
|
+
* which contains the boost values for each document for a particular field.
|
410
|
+
*
|
411
|
+
* === :term_vector
|
412
|
+
*
|
413
|
+
* See TermVector for a description of term-vectors. You can specify whether
|
414
|
+
* or not you would like to store term-vectors. The available options are
|
415
|
+
* +:no+, +:yes+, +:with_positions+, +:with_offsets+ and
|
416
|
+
* +:with_positions_offsets+. Note that you need to store the positions to
|
417
|
+
* associate offsets with individual terms in the term_vector.
|
418
|
+
*
|
419
|
+
* == Property Table
|
420
|
+
*
|
421
|
+
* Property Value Description
|
422
|
+
* ------------------------------------------------------------------------
|
423
|
+
* :store | :no | Don't store field
|
424
|
+
* | |
|
425
|
+
* | :yes (default) | Store field in its original
|
426
|
+
* | | format. Use this value if you
|
427
|
+
* | | want to highlight matches.
|
428
|
+
* | | or print match excerpts a la
|
429
|
+
* | | Google search.
|
430
|
+
* -------------|-------------------------|------------------------------
|
431
|
+
* :compression | :no (default) | Don't compress stored field
|
432
|
+
* | |
|
433
|
+
* | :brotli | Compress field using Brotli
|
434
|
+
* | |
|
435
|
+
* | :bz2 | Compress field using BZip2
|
436
|
+
* | |
|
437
|
+
* | :lz4 | Compress field using LZ4
|
438
|
+
* -------------|-------------------------|------------------------------
|
439
|
+
* :index | :no | Do not make this field
|
440
|
+
* | | searchable.
|
441
|
+
* | |
|
442
|
+
* | :yes (default) | Make this field searchable and
|
443
|
+
* | | tokenize its contents.
|
444
|
+
* | |
|
445
|
+
* | :untokenized | Make this field searchable but
|
446
|
+
* | | do not tokenize its contents.
|
447
|
+
* | | use this value for fields you
|
448
|
+
* | | wish to sort by.
|
449
|
+
* | |
|
450
|
+
* | :omit_norms | Same as :yes except omit the
|
451
|
+
* | | norms file. The norms file can
|
452
|
+
* | | be omitted if you don't boost
|
453
|
+
* | | any fields and you don't need
|
454
|
+
* | | scoring based on field length.
|
455
|
+
* | |
|
456
|
+
* | :untokenized_omit_norms | Same as :untokenized except omit
|
457
|
+
* | | the norms file. Norms files can
|
458
|
+
* | | be omitted if you don't boost
|
459
|
+
* | | any fields and you don't need
|
460
|
+
* | | scoring based on field length.
|
461
|
+
* | |
|
462
|
+
* -------------|-------------------------|------------------------------
|
463
|
+
* :term_vector | :no | Don't store term-vectors
|
464
|
+
* | |
|
465
|
+
* | :yes | Store term-vectors without
|
466
|
+
* | | storing positions or offsets.
|
467
|
+
* | |
|
468
|
+
* | :with_positions | Store term-vectors with
|
469
|
+
* | | positions.
|
470
|
+
* | |
|
471
|
+
* | :with_offsets | Store term-vectors with
|
472
|
+
* | | offsets.
|
473
|
+
* | |
|
474
|
+
* | :with_positions_offsets | Store term-vectors with
|
475
|
+
* | (default) | positions and offsets.
|
476
|
+
* -------------|-------------------------|------------------------------
|
477
|
+
* :boost | Float | The boost property is used to
|
478
|
+
* | | set the default boost for a
|
479
|
+
* | | field. This boost value will be
|
480
|
+
* | | used for all instances of the
|
481
|
+
* | | field in the index unless
|
482
|
+
* | | otherwise specified when you
|
483
|
+
* | | create the field. All values
|
484
|
+
* | | should be positive.
|
485
|
+
* | |
|
486
|
+
*
|
487
|
+
* == Examples
|
488
|
+
*
|
489
|
+
* fi = FieldInfo.new(:title, :index => :untokenized, :term_vector => :no,
|
490
|
+
* :boost => 10.0)
|
491
|
+
*
|
492
|
+
* fi = FieldInfo.new(:content)
|
493
|
+
*
|
494
|
+
* fi = FieldInfo.new(:created_on, :index => :untokenized_omit_norms,
|
495
|
+
* :term_vector => :no)
|
496
|
+
*
|
497
|
+
* fi = FieldInfo.new(:image, :store => :yes, :compression => :brotli, :index => :no,
|
498
|
+
* :term_vector => :no)
|
499
|
+
*/
|
500
|
+
void Init_FieldInfo(void) {
|
501
|
+
sym_store = ID2SYM(rb_intern("store"));
|
502
|
+
sym_index = ID2SYM(rb_intern("index"));
|
503
|
+
sym_term_vector = ID2SYM(rb_intern("term_vector"));
|
504
|
+
|
505
|
+
sym_brotli = ID2SYM(rb_intern("brotli"));
|
506
|
+
sym_bz2 = ID2SYM(rb_intern("bz2"));
|
507
|
+
sym_lz4 = ID2SYM(rb_intern("lz4"));
|
508
|
+
// sym_level = ID2SYM(rb_intern("level"));
|
509
|
+
sym_compression = ID2SYM(rb_intern("compression"));
|
510
|
+
|
511
|
+
sym_untokenized = ID2SYM(rb_intern("untokenized"));
|
512
|
+
sym_omit_norms = ID2SYM(rb_intern("omit_norms"));
|
513
|
+
sym_untokenized_omit_norms = ID2SYM(rb_intern("untokenized_omit_norms"));
|
514
|
+
|
515
|
+
sym_with_positions = ID2SYM(rb_intern("with_positions"));
|
516
|
+
sym_with_offsets = ID2SYM(rb_intern("with_offsets"));
|
517
|
+
sym_with_positions_offsets = ID2SYM(rb_intern("with_positions_offsets"));
|
518
|
+
|
519
|
+
cFieldInfo = rb_define_class_under(mIndex, "FieldInfo", rb_cObject);
|
520
|
+
rb_define_alloc_func(cFieldInfo, frb_fi_alloc);
|
521
|
+
|
522
|
+
rb_define_method(cFieldInfo, "initialize", frb_fi_init, -1);
|
523
|
+
rb_define_method(cFieldInfo, "name", frb_fi_name, 0);
|
524
|
+
rb_define_method(cFieldInfo, "stored?", frb_fi_is_stored, 0);
|
525
|
+
rb_define_method(cFieldInfo, "compressed?", frb_fi_is_compressed, 0);
|
526
|
+
rb_define_method(cFieldInfo, "indexed?", frb_fi_is_indexed, 0);
|
527
|
+
rb_define_method(cFieldInfo, "tokenized?", frb_fi_is_tokenized, 0);
|
528
|
+
rb_define_method(cFieldInfo, "omit_norms?", frb_fi_omit_norms, 0);
|
529
|
+
rb_define_method(cFieldInfo, "store_term_vector?",
|
530
|
+
frb_fi_store_term_vector, 0);
|
531
|
+
rb_define_method(cFieldInfo, "store_positions?",
|
532
|
+
frb_fi_store_positions, 0);
|
533
|
+
rb_define_method(cFieldInfo, "store_offsets?",
|
534
|
+
frb_fi_store_offsets, 0);
|
535
|
+
rb_define_method(cFieldInfo, "has_norms?", frb_fi_has_norms, 0);
|
536
|
+
rb_define_method(cFieldInfo, "boost", frb_fi_boost, 0);
|
537
|
+
rb_define_method(cFieldInfo, "to_s", frb_fi_to_s, 0);
|
538
|
+
rb_define_method(cFieldInfo, "to_h", frb_fi_to_h, 0);
|
539
|
+
}
|