ferret 0.10.2 → 0.10.3

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
data/Rakefile CHANGED
@@ -121,7 +121,17 @@ file "ext/#{EXT}" => ["ext/Makefile"] do
121
121
  cp "ext/inc/threading.h", "ext/threading.h"
122
122
  cd "ext"
123
123
  if (/mswin/ =~ RUBY_PLATFORM) and ENV['make'].nil?
124
- sh "nmake"
124
+ begin
125
+ sh "nmake"
126
+ rescue Exception => e
127
+ puts
128
+ puts "**********************************************************************"
129
+ puts "You may need to call VCVARS32.BAT to set the environment variables."
130
+ puts ' c:\Program Files\Microsoft Visual Studio\VC98\Bin\VCVARS32.BAT'
131
+ puts "**********************************************************************"
132
+ puts
133
+ raise e
134
+ end
125
135
  else
126
136
  sh "make"
127
137
  end
@@ -132,6 +142,7 @@ file "ext/lang.h" => ["ext/inc/lang.h"] do
132
142
  rm_f "ext/lang.h"
133
143
  cp "ext/inc/lang.h", "ext/lang.h"
134
144
  end
145
+
135
146
  file "ext/threading.h" => ["ext/inc/threading.h"] do
136
147
  rm_f "ext/threading.h"
137
148
  cp "ext/inc/threading.h", "ext/threading.h"
@@ -158,7 +169,7 @@ end
158
169
  PKG_FILES = FileList[
159
170
  'setup.rb',
160
171
  '[-A-Z]*',
161
- 'ext/**/*',
172
+ 'ext/**/*.[ch]',
162
173
  'lib/**/*.rb',
163
174
  'test/**/*.rb',
164
175
  'test/**/wordfile',
@@ -176,7 +187,6 @@ else
176
187
  spec = Gem::Specification.new do |s|
177
188
 
178
189
  #### Basic information.
179
-
180
190
  s.name = 'ferret'
181
191
  s.version = PKG_VERSION
182
192
  s.summary = "Ruby indexing library."
@@ -186,29 +196,17 @@ else
186
196
  EOF
187
197
 
188
198
  #### Dependencies and requirements.
189
-
190
- #s.add_dependency('log4r', '> 1.0.4')
191
- #s.requirements << ""
192
-
193
- #### Which files are to be included in this gem? Everything! (Except CVS directories.)
194
-
199
+ s.add_dependency('rake')
195
200
  s.files = PKG_FILES.to_a
196
-
197
- #### C code extensions.
198
-
199
201
  s.extensions << "ext/extconf.rb"
200
-
201
- #### Load-time details: library and application (you will need one or both).
202
-
203
- s.require_path = 'lib' # Use these for libraries.
202
+ s.require_path = 'lib'
204
203
  s.autorequire = 'ferret'
205
204
 
206
-
207
- #s.bindir = "bin" # Use these for applications.
208
- #s.executables = ["rake"]
209
- #s.default_executable = "rake"
210
-
211
- #### Documentation and testing.
205
+ #### Author and project details.
206
+ s.author = "David Balmain"
207
+ s.email = "dbalmain@gmail.com"
208
+ s.homepage = "http://ferret.davebalmain.com/trac"
209
+ s.rubyforge_project = "ferret"
212
210
 
213
211
  s.has_rdoc = true
214
212
  s.extra_rdoc_files = rd.rdoc_files.reject { |fn| fn =~ /\.rb$/ }.to_a
@@ -217,21 +215,18 @@ else
217
215
  '--main' << 'README' << '--line-numbers' <<
218
216
  'TUTORIAL' << 'TODO'
219
217
 
220
- #### Author and project details.
221
-
222
- s.author = "David Balmain"
223
- s.email = "dbalmain@gmail.com"
224
- s.homepage = "http://ferret.davebalmain.com/trac"
225
- s.rubyforge_project = "ferret"
226
- # if ENV['CERT_DIR']
227
- # s.signing_key = File.join(ENV['CERT_DIR'], 'gem-private_key.pem')
228
- # s.cert_chain = [File.join(ENV['CERT_DIR'], 'gem-public_cert.pem')]
229
- # end
218
+ if RUBY_PLATFORM =~ /mswin/
219
+ s.files = PKG_FILES.to_a + ["ext/#{EXT}"]
220
+ s.extensions.clear
221
+ s.platform = Gem::Platform::WIN32
222
+ end
230
223
  end
231
224
 
232
225
  package_task = Rake::GemPackageTask.new(spec) do |pkg|
233
- pkg.need_zip = true
234
- pkg.need_tar = true
226
+ unless RUBY_PLATFORM =~ /mswin/
227
+ pkg.need_zip = true
228
+ pkg.need_tar = true
229
+ end
235
230
  end
236
231
  end
237
232
 
@@ -309,11 +304,11 @@ task :update_version => [:prerelease] do
309
304
  announce "No version change ... skipping version update"
310
305
  else
311
306
  announce "Updating Ferret version to #{PKG_VERSION}"
312
- reversion("lib/ferret.rb")
307
+ reversion("lib/ferret_version.rb")
313
308
  if ENV['RELTEST']
314
309
  announce "Release Task Testing, skipping commiting of new version"
315
310
  else
316
- sh %{svn ci -m "Updated to version #{PKG_VERSION}" lib/ferret.rb}
311
+ sh %{svn ci -m "Updated to version #{PKG_VERSION}" lib/ferret_version.rb}
317
312
  end
318
313
  end
319
314
  end
@@ -55,7 +55,8 @@ __inline Token *w_tk_set(Token *tk, wchar_t *text, int start, int end,
55
55
  int tk_eq(Token *tk1, Token *tk2)
56
56
  {
57
57
  return (strcmp((char *)tk1->text, (char *)tk2->text) == 0 &&
58
- tk1->start == tk2->start && tk1->end == tk2->end);
58
+ tk1->start == tk2->start && tk1->end == tk2->end &&
59
+ tk1->pos_inc == tk2->pos_inc);
59
60
  }
60
61
 
61
62
  int tk_cmp(Token *tk1, Token *tk2)
@@ -724,7 +725,7 @@ static int std_get_url(char *input, char *token, int i)
724
725
  {
725
726
  while (isurlc(input[i])) {
726
727
  if (isurlpunc(input[i]) && isurlpunc(input[i - 1])) {
727
- break; /* can't have to puncs in a row */
728
+ break; /* can't have two puncs in a row */
728
729
  }
729
730
  if (i < MAX_WORD_SIZE) {
730
731
  token[i] = input[i];
@@ -1061,18 +1062,18 @@ static TokenStream *sf_clone_i(TokenStream *orig_ts)
1061
1062
 
1062
1063
  static Token *sf_next(TokenStream *ts)
1063
1064
  {
1064
- int pos_inc = 1;
1065
+ int pos_inc = 0;
1065
1066
  HashTable *words = StopFilt(ts)->words;
1066
1067
  TokenFilter *tf = TkFilt(ts);
1067
1068
  Token *tk = tf->sub_ts->next(tf->sub_ts);
1068
1069
 
1069
1070
  while ((tk != NULL) && (h_get(words, tk->text) != NULL)) {
1071
+ pos_inc += tk->pos_inc;
1070
1072
  tk = tf->sub_ts->next(tf->sub_ts);
1071
- pos_inc++;
1072
1073
  }
1073
1074
 
1074
1075
  if (tk != NULL) {
1075
- tk->pos_inc = pos_inc;
1076
+ tk->pos_inc += pos_inc;
1076
1077
  }
1077
1078
 
1078
1079
  return tk;
@@ -1122,6 +1123,85 @@ TokenStream *stop_filter_new(TokenStream *ts)
1122
1123
  return stop_filter_new_with_words(ts, FULL_ENGLISH_STOP_WORDS);
1123
1124
  }
1124
1125
 
1126
+ /****************************************************************************
1127
+ * HyphenFilter
1128
+ ****************************************************************************/
1129
+
1130
+ #define HyphenFilt(filter) ((HyphenFilter *)(filter))
1131
+
1132
+ static TokenStream *hf_clone_i(TokenStream *orig_ts)
1133
+ {
1134
+ TokenStream *new_ts = filter_clone_size(orig_ts, sizeof(HyphenFilter));
1135
+ return new_ts;
1136
+ }
1137
+
1138
+ static Token *hf_next(TokenStream *ts)
1139
+ {
1140
+ HyphenFilter *hf = HyphenFilt(ts);
1141
+ TokenFilter *tf = TkFilt(ts);
1142
+ Token *tk = hf->tk;
1143
+
1144
+ if (hf->pos < hf->len) {
1145
+ const int pos = hf->pos;
1146
+ const int text_len = strlen(hf->text + pos);
1147
+ strcpy(tk->text, hf->text + pos);
1148
+ tk->pos_inc = ((pos != 0) ? 1 : 0);
1149
+ tk->start = hf->start + pos;
1150
+ tk->end = tk->start + text_len;
1151
+ hf->pos += text_len + 1;
1152
+ tk->len = text_len;
1153
+ return tk;
1154
+ }
1155
+ else {
1156
+ char *p;
1157
+ bool seen_hyphen = false;
1158
+ bool seen_other_punc = false;
1159
+ hf->tk = tk = tf->sub_ts->next(tf->sub_ts);
1160
+ if (NULL == tk) return NULL;
1161
+ p = tk->text + 1;
1162
+ while (*p) {
1163
+ if (*p == '-') {
1164
+ seen_hyphen = true;
1165
+ }
1166
+ else if (!isalpha(*p)) {
1167
+ seen_other_punc = true;
1168
+ break;
1169
+ }
1170
+ p++;
1171
+ }
1172
+ if (seen_hyphen && !seen_other_punc) {
1173
+ char *q = hf->text;
1174
+ char *r = tk->text;
1175
+ p = tk->text;
1176
+ while (*p) {
1177
+ if (*p == '-') {
1178
+ *q = '\0';
1179
+ }
1180
+ else {
1181
+ *r = *q = *p;
1182
+ r++;
1183
+ }
1184
+ q++;
1185
+ p++;
1186
+ }
1187
+ *r = *q = '\0';
1188
+ hf->start = tk->start;
1189
+ hf->pos = 0;
1190
+ hf->len = q - hf->text;
1191
+ tk->len = r - tk->text;
1192
+ }
1193
+ }
1194
+ return tk;
1195
+ }
1196
+
1197
+ TokenStream *hyphen_filter_new(TokenStream *sub_ts)
1198
+ {
1199
+ TokenStream *ts = tf_new(HyphenFilter, sub_ts);
1200
+ ts->next = &hf_next;
1201
+ ts->clone_i = &hf_clone_i;
1202
+ return ts;
1203
+ }
1204
+
1125
1205
  /****************************************************************************
1126
1206
  * LowerCaseFilter
1127
1207
  ****************************************************************************/
@@ -1257,64 +1337,44 @@ TokenStream *stem_filter_new(TokenStream *ts, const char *algorithm,
1257
1337
  Analyzer *standard_analyzer_new_with_words_len(const char **words, int len,
1258
1338
  bool lowercase)
1259
1339
  {
1260
- TokenStream *ts;
1340
+ TokenStream *ts = standard_tokenizer_new();
1261
1341
  if (lowercase) {
1262
- ts = stop_filter_new_with_words_len(lowercase_filter_new
1263
- (standard_tokenizer_new()),
1264
- words, len);
1265
- }
1266
- else {
1267
- ts = stop_filter_new_with_words_len(standard_tokenizer_new(),
1268
- words, len);
1342
+ ts = lowercase_filter_new(ts);
1269
1343
  }
1344
+ ts = hyphen_filter_new(stop_filter_new_with_words_len(ts, words, len));
1270
1345
  return analyzer_new(ts, NULL, NULL);
1271
1346
  }
1272
1347
 
1273
1348
  Analyzer *standard_analyzer_new_with_words(const char **words,
1274
1349
  bool lowercase)
1275
1350
  {
1276
- TokenStream *ts;
1351
+ TokenStream *ts = standard_tokenizer_new();
1277
1352
  if (lowercase) {
1278
- ts = stop_filter_new_with_words(lowercase_filter_new
1279
- (standard_tokenizer_new()),
1280
- words);
1281
- }
1282
- else {
1283
- ts = stop_filter_new_with_words(standard_tokenizer_new(),
1284
- words);
1353
+ ts = lowercase_filter_new(ts);
1285
1354
  }
1355
+ ts = hyphen_filter_new(stop_filter_new_with_words(ts, words));
1286
1356
  return analyzer_new(ts, NULL, NULL);
1287
1357
  }
1288
1358
 
1289
1359
  Analyzer *mb_standard_analyzer_new_with_words_len(const char **words,
1290
1360
  int len, bool lowercase)
1291
1361
  {
1292
- TokenStream *ts;
1362
+ TokenStream *ts = mb_standard_tokenizer_new();
1293
1363
  if (lowercase) {
1294
- ts = stop_filter_new_with_words_len(mb_lowercase_filter_new
1295
- (mb_standard_tokenizer_new
1296
- ()), words, len);
1297
- }
1298
- else {
1299
- ts = stop_filter_new_with_words_len(mb_standard_tokenizer_new(),
1300
- words, len);
1364
+ ts = mb_lowercase_filter_new(ts);
1301
1365
  }
1366
+ ts = hyphen_filter_new(stop_filter_new_with_words_len(ts, words, len));
1302
1367
  return analyzer_new(ts, NULL, NULL);
1303
1368
  }
1304
1369
 
1305
1370
  Analyzer *mb_standard_analyzer_new_with_words(const char **words,
1306
1371
  bool lowercase)
1307
1372
  {
1308
- TokenStream *ts;
1373
+ TokenStream *ts = mb_standard_tokenizer_new();
1309
1374
  if (lowercase) {
1310
- ts = stop_filter_new_with_words(mb_lowercase_filter_new
1311
- (mb_standard_tokenizer_new()),
1312
- words);
1313
- }
1314
- else {
1315
- ts = stop_filter_new_with_words(mb_standard_tokenizer_new(),
1316
- words);
1375
+ ts = mb_lowercase_filter_new(ts);
1317
1376
  }
1377
+ ts = hyphen_filter_new(stop_filter_new_with_words(ts, words));
1318
1378
  return analyzer_new(ts, NULL, NULL);
1319
1379
  }
1320
1380
 
@@ -89,6 +89,16 @@ typedef struct StopFilter
89
89
  HashTable *words;
90
90
  } StopFilter;
91
91
 
92
+ typedef struct HyphenFilter
93
+ {
94
+ TokenFilter super;
95
+ char text[MAX_WORD_SIZE];
96
+ int start;
97
+ int pos;
98
+ int len;
99
+ Token *tk;
100
+ } HyphenFilter;
101
+
92
102
  typedef struct StemFilter
93
103
  {
94
104
  TokenFilter super;
@@ -111,6 +121,7 @@ extern TokenStream *mb_letter_tokenizer_new(bool lowercase);
111
121
  extern TokenStream *standard_tokenizer_new();
112
122
  extern TokenStream *mb_standard_tokenizer_new();
113
123
 
124
+ extern TokenStream *hyphen_filter_new(TokenStream *ts);
114
125
  extern TokenStream *lowercase_filter_new(TokenStream *ts);
115
126
  extern TokenStream *mb_lowercase_filter_new(TokenStream *ts);
116
127
 
@@ -16,6 +16,7 @@ ID id_lt;
16
16
  ID id_call;
17
17
  ID id_is_directory;
18
18
  ID id_close;
19
+ ID id_cclass;
19
20
  ID id_data;
20
21
 
21
22
  static ID id_mkdir_p;
@@ -97,6 +98,13 @@ VALUE frt_data_alloc(VALUE klass)
97
98
  return Frt_Make_Struct(klass);
98
99
  }
99
100
 
101
+ VALUE frt_define_class_under(VALUE module, char *name, VALUE super)
102
+ {
103
+ VALUE klass = rb_define_class_under(module, name, super);
104
+ rb_ivar_set(klass, id_cclass, Qtrue);
105
+ return klass;
106
+ }
107
+
100
108
  void frt_deref_free(void *p)
101
109
  {
102
110
  object_del(p);
@@ -255,6 +263,8 @@ void Init_ferret_ext(void)
255
263
  id_is_directory = rb_intern("directory?");
256
264
  id_close = rb_intern("close");
257
265
 
266
+ id_cclass = rb_intern("cclass");
267
+
258
268
  id_data = rb_intern("@data");
259
269
 
260
270
  /* Symbols */
@@ -13,6 +13,7 @@ extern ID id_lt;
13
13
  extern ID id_call;
14
14
  extern ID id_is_directory;
15
15
  extern ID id_close;
16
+ extern ID id_cclass;
16
17
  extern ID id_data;
17
18
 
18
19
  /* Symbols */
@@ -60,6 +61,7 @@ extern void frt_create_dir(VALUE rpath);
60
61
  extern VALUE frt_hs_to_rb_ary(HashSet *hs);
61
62
  extern void *frt_rb_data_ptr(VALUE val);
62
63
  extern char * frt_field(VALUE rfield);
64
+ extern VALUE frt_define_class_under(VALUE module, char *name, VALUE super);
63
65
 
64
66
  #define Frt_Make_Struct(klass)\
65
67
  rb_data_object_alloc(klass,NULL,(RUBY_DATA_FUNC)NULL,(RUBY_DATA_FUNC)NULL)
@@ -8,6 +8,7 @@
8
8
 
9
9
  #undef close
10
10
  #undef rename
11
+ #undef read
11
12
 
12
13
  #define frt_malloc xmalloc
13
14
  #define frt_calloc(n) xcalloc(n, 1)
@@ -722,8 +722,8 @@ void lazy_df_get_bytes(LazyDocField *self, char *buf, int start, int len)
722
722
  RAISE(IO_ERROR, "start out of range in LazyDocField#get_bytes. %d "
723
723
  "is not between 0 and %d", start, self->len);
724
724
  }
725
- if (len < 0) {
726
- RAISE(IO_ERROR, "len %d should be greater than 0", len);
725
+ if (len <= 0) {
726
+ RAISE(IO_ERROR, "len = %d, but should be greater than 0", len);
727
727
  }
728
728
  if (start + len > self->len) {
729
729
  RAISE(IO_ERROR, "Tried to read past end of field. Field is only %d "
data/ext/lang.h CHANGED
@@ -8,6 +8,7 @@
8
8
 
9
9
  #undef close
10
10
  #undef rename
11
+ #undef read
11
12
 
12
13
  #define frt_malloc xmalloc
13
14
  #define frt_calloc(n) xcalloc(n, 1)
@@ -1984,7 +1984,14 @@ static Query *get_term_q(QParser *qp, char *field, char *word)
1984
1984
  q->destroy_i(q);
1985
1985
  q = phq;
1986
1986
  do {
1987
- phq_add_term(q, token->text, token->pos_inc);
1987
+ if (token->pos_inc) {
1988
+ phq_add_term(q, token->text, token->pos_inc);
1989
+ /* add some slop since single term was expected */
1990
+ ((PhraseQuery *)q)->slop++;
1991
+ }
1992
+ else {
1993
+ phq_append_multi_term(q, token->text);
1994
+ }
1988
1995
  } while ((token = ts_next(stream)) != NULL);
1989
1996
  }
1990
1997
  }
@@ -2157,7 +2164,7 @@ static Phrase *ph_add_multi_word(Phrase *self, char *word)
2157
2164
  }
2158
2165
 
2159
2166
  static Query *get_phrase_query(QParser *qp, char *field,
2160
- Phrase *phrase, char *slop_str)
2167
+ Phrase *phrase, char *slop_str)
2161
2168
  {
2162
2169
  const int pos_cnt = phrase->size;
2163
2170
  Query *q = NULL;
@@ -2180,6 +2187,7 @@ static Query *get_phrase_query(QParser *qp, char *field,
2180
2187
  Token *token;
2181
2188
  TokenStream *stream;
2182
2189
  int i, j;
2190
+ int pos_inc = 0;
2183
2191
  q = phq_new(field);
2184
2192
  if (slop_str) {
2185
2193
  int slop;
@@ -2188,14 +2196,24 @@ static Query *get_phrase_query(QParser *qp, char *field,
2188
2196
  }
2189
2197
 
2190
2198
  for (i = 0; i < pos_cnt; i++) {
2191
- int pos_inc = phrase->positions[i].pos; /* Actually holds pos_inc */
2192
2199
  char **words = phrase->positions[i].terms;
2193
2200
  const int word_count = ary_size(words);
2201
+ if (pos_inc) {
2202
+ ((PhraseQuery *)q)->slop++;
2203
+ }
2204
+ pos_inc += phrase->positions[i].pos + 1; /* Actually holds pos_inc*/
2194
2205
 
2195
2206
  if (word_count == 1) {
2196
2207
  stream = get_cached_ts(qp, field, words[0]);
2197
2208
  while ((token = ts_next(stream))) {
2198
- phq_add_term(q, token->text, token->pos_inc + pos_inc);
2209
+ if (token->pos_inc) {
2210
+ phq_add_term(q, token->text,
2211
+ pos_inc ? pos_inc : token->pos_inc);
2212
+ }
2213
+ else {
2214
+ phq_append_multi_term(q, token->text);
2215
+ ((PhraseQuery *)q)->slop++;
2216
+ }
2199
2217
  pos_inc = 0;
2200
2218
  }
2201
2219
  }
@@ -2206,8 +2224,10 @@ static Query *get_phrase_query(QParser *qp, char *field,
2206
2224
  stream = get_cached_ts(qp, field, words[j]);
2207
2225
  if ((token = ts_next(stream))) {
2208
2226
  if (!added_position) {
2209
- phq_add_term(q, token->text, token->pos_inc + pos_inc);
2227
+ phq_add_term(q, token->text,
2228
+ pos_inc ? pos_inc : token->pos_inc);
2210
2229
  added_position = true;
2230
+ pos_inc = 0;
2211
2231
  }
2212
2232
  else {
2213
2233
  phq_append_multi_term(q, token->text);