ferret 0.10.2 → 0.10.3

Sign up to get free protection for your applications and to get access to all the features.
data/Rakefile CHANGED
@@ -121,7 +121,17 @@ file "ext/#{EXT}" => ["ext/Makefile"] do
121
121
  cp "ext/inc/threading.h", "ext/threading.h"
122
122
  cd "ext"
123
123
  if (/mswin/ =~ RUBY_PLATFORM) and ENV['make'].nil?
124
- sh "nmake"
124
+ begin
125
+ sh "nmake"
126
+ rescue Exception => e
127
+ puts
128
+ puts "**********************************************************************"
129
+ puts "You may need to call VCVARS32.BAT to set the environment variables."
130
+ puts ' c:\Program Files\Microsoft Visual Studio\VC98\Bin\VCVARS32.BAT'
131
+ puts "**********************************************************************"
132
+ puts
133
+ raise e
134
+ end
125
135
  else
126
136
  sh "make"
127
137
  end
@@ -132,6 +142,7 @@ file "ext/lang.h" => ["ext/inc/lang.h"] do
132
142
  rm_f "ext/lang.h"
133
143
  cp "ext/inc/lang.h", "ext/lang.h"
134
144
  end
145
+
135
146
  file "ext/threading.h" => ["ext/inc/threading.h"] do
136
147
  rm_f "ext/threading.h"
137
148
  cp "ext/inc/threading.h", "ext/threading.h"
@@ -158,7 +169,7 @@ end
158
169
  PKG_FILES = FileList[
159
170
  'setup.rb',
160
171
  '[-A-Z]*',
161
- 'ext/**/*',
172
+ 'ext/**/*.[ch]',
162
173
  'lib/**/*.rb',
163
174
  'test/**/*.rb',
164
175
  'test/**/wordfile',
@@ -176,7 +187,6 @@ else
176
187
  spec = Gem::Specification.new do |s|
177
188
 
178
189
  #### Basic information.
179
-
180
190
  s.name = 'ferret'
181
191
  s.version = PKG_VERSION
182
192
  s.summary = "Ruby indexing library."
@@ -186,29 +196,17 @@ else
186
196
  EOF
187
197
 
188
198
  #### Dependencies and requirements.
189
-
190
- #s.add_dependency('log4r', '> 1.0.4')
191
- #s.requirements << ""
192
-
193
- #### Which files are to be included in this gem? Everything! (Except CVS directories.)
194
-
199
+ s.add_dependency('rake')
195
200
  s.files = PKG_FILES.to_a
196
-
197
- #### C code extensions.
198
-
199
201
  s.extensions << "ext/extconf.rb"
200
-
201
- #### Load-time details: library and application (you will need one or both).
202
-
203
- s.require_path = 'lib' # Use these for libraries.
202
+ s.require_path = 'lib'
204
203
  s.autorequire = 'ferret'
205
204
 
206
-
207
- #s.bindir = "bin" # Use these for applications.
208
- #s.executables = ["rake"]
209
- #s.default_executable = "rake"
210
-
211
- #### Documentation and testing.
205
+ #### Author and project details.
206
+ s.author = "David Balmain"
207
+ s.email = "dbalmain@gmail.com"
208
+ s.homepage = "http://ferret.davebalmain.com/trac"
209
+ s.rubyforge_project = "ferret"
212
210
 
213
211
  s.has_rdoc = true
214
212
  s.extra_rdoc_files = rd.rdoc_files.reject { |fn| fn =~ /\.rb$/ }.to_a
@@ -217,21 +215,18 @@ else
217
215
  '--main' << 'README' << '--line-numbers' <<
218
216
  'TUTORIAL' << 'TODO'
219
217
 
220
- #### Author and project details.
221
-
222
- s.author = "David Balmain"
223
- s.email = "dbalmain@gmail.com"
224
- s.homepage = "http://ferret.davebalmain.com/trac"
225
- s.rubyforge_project = "ferret"
226
- # if ENV['CERT_DIR']
227
- # s.signing_key = File.join(ENV['CERT_DIR'], 'gem-private_key.pem')
228
- # s.cert_chain = [File.join(ENV['CERT_DIR'], 'gem-public_cert.pem')]
229
- # end
218
+ if RUBY_PLATFORM =~ /mswin/
219
+ s.files = PKG_FILES.to_a + ["ext/#{EXT}"]
220
+ s.extensions.clear
221
+ s.platform = Gem::Platform::WIN32
222
+ end
230
223
  end
231
224
 
232
225
  package_task = Rake::GemPackageTask.new(spec) do |pkg|
233
- pkg.need_zip = true
234
- pkg.need_tar = true
226
+ unless RUBY_PLATFORM =~ /mswin/
227
+ pkg.need_zip = true
228
+ pkg.need_tar = true
229
+ end
235
230
  end
236
231
  end
237
232
 
@@ -309,11 +304,11 @@ task :update_version => [:prerelease] do
309
304
  announce "No version change ... skipping version update"
310
305
  else
311
306
  announce "Updating Ferret version to #{PKG_VERSION}"
312
- reversion("lib/ferret.rb")
307
+ reversion("lib/ferret_version.rb")
313
308
  if ENV['RELTEST']
314
309
  announce "Release Task Testing, skipping commiting of new version"
315
310
  else
316
- sh %{svn ci -m "Updated to version #{PKG_VERSION}" lib/ferret.rb}
311
+ sh %{svn ci -m "Updated to version #{PKG_VERSION}" lib/ferret_version.rb}
317
312
  end
318
313
  end
319
314
  end
@@ -55,7 +55,8 @@ __inline Token *w_tk_set(Token *tk, wchar_t *text, int start, int end,
55
55
  int tk_eq(Token *tk1, Token *tk2)
56
56
  {
57
57
  return (strcmp((char *)tk1->text, (char *)tk2->text) == 0 &&
58
- tk1->start == tk2->start && tk1->end == tk2->end);
58
+ tk1->start == tk2->start && tk1->end == tk2->end &&
59
+ tk1->pos_inc == tk2->pos_inc);
59
60
  }
60
61
 
61
62
  int tk_cmp(Token *tk1, Token *tk2)
@@ -724,7 +725,7 @@ static int std_get_url(char *input, char *token, int i)
724
725
  {
725
726
  while (isurlc(input[i])) {
726
727
  if (isurlpunc(input[i]) && isurlpunc(input[i - 1])) {
727
- break; /* can't have to puncs in a row */
728
+ break; /* can't have two puncs in a row */
728
729
  }
729
730
  if (i < MAX_WORD_SIZE) {
730
731
  token[i] = input[i];
@@ -1061,18 +1062,18 @@ static TokenStream *sf_clone_i(TokenStream *orig_ts)
1061
1062
 
1062
1063
  static Token *sf_next(TokenStream *ts)
1063
1064
  {
1064
- int pos_inc = 1;
1065
+ int pos_inc = 0;
1065
1066
  HashTable *words = StopFilt(ts)->words;
1066
1067
  TokenFilter *tf = TkFilt(ts);
1067
1068
  Token *tk = tf->sub_ts->next(tf->sub_ts);
1068
1069
 
1069
1070
  while ((tk != NULL) && (h_get(words, tk->text) != NULL)) {
1071
+ pos_inc += tk->pos_inc;
1070
1072
  tk = tf->sub_ts->next(tf->sub_ts);
1071
- pos_inc++;
1072
1073
  }
1073
1074
 
1074
1075
  if (tk != NULL) {
1075
- tk->pos_inc = pos_inc;
1076
+ tk->pos_inc += pos_inc;
1076
1077
  }
1077
1078
 
1078
1079
  return tk;
@@ -1122,6 +1123,85 @@ TokenStream *stop_filter_new(TokenStream *ts)
1122
1123
  return stop_filter_new_with_words(ts, FULL_ENGLISH_STOP_WORDS);
1123
1124
  }
1124
1125
 
1126
+ /****************************************************************************
1127
+ * HyphenFilter
1128
+ ****************************************************************************/
1129
+
1130
+ #define HyphenFilt(filter) ((HyphenFilter *)(filter))
1131
+
1132
+ static TokenStream *hf_clone_i(TokenStream *orig_ts)
1133
+ {
1134
+ TokenStream *new_ts = filter_clone_size(orig_ts, sizeof(HyphenFilter));
1135
+ return new_ts;
1136
+ }
1137
+
1138
+ static Token *hf_next(TokenStream *ts)
1139
+ {
1140
+ HyphenFilter *hf = HyphenFilt(ts);
1141
+ TokenFilter *tf = TkFilt(ts);
1142
+ Token *tk = hf->tk;
1143
+
1144
+ if (hf->pos < hf->len) {
1145
+ const int pos = hf->pos;
1146
+ const int text_len = strlen(hf->text + pos);
1147
+ strcpy(tk->text, hf->text + pos);
1148
+ tk->pos_inc = ((pos != 0) ? 1 : 0);
1149
+ tk->start = hf->start + pos;
1150
+ tk->end = tk->start + text_len;
1151
+ hf->pos += text_len + 1;
1152
+ tk->len = text_len;
1153
+ return tk;
1154
+ }
1155
+ else {
1156
+ char *p;
1157
+ bool seen_hyphen = false;
1158
+ bool seen_other_punc = false;
1159
+ hf->tk = tk = tf->sub_ts->next(tf->sub_ts);
1160
+ if (NULL == tk) return NULL;
1161
+ p = tk->text + 1;
1162
+ while (*p) {
1163
+ if (*p == '-') {
1164
+ seen_hyphen = true;
1165
+ }
1166
+ else if (!isalpha(*p)) {
1167
+ seen_other_punc = true;
1168
+ break;
1169
+ }
1170
+ p++;
1171
+ }
1172
+ if (seen_hyphen && !seen_other_punc) {
1173
+ char *q = hf->text;
1174
+ char *r = tk->text;
1175
+ p = tk->text;
1176
+ while (*p) {
1177
+ if (*p == '-') {
1178
+ *q = '\0';
1179
+ }
1180
+ else {
1181
+ *r = *q = *p;
1182
+ r++;
1183
+ }
1184
+ q++;
1185
+ p++;
1186
+ }
1187
+ *r = *q = '\0';
1188
+ hf->start = tk->start;
1189
+ hf->pos = 0;
1190
+ hf->len = q - hf->text;
1191
+ tk->len = r - tk->text;
1192
+ }
1193
+ }
1194
+ return tk;
1195
+ }
1196
+
1197
+ TokenStream *hyphen_filter_new(TokenStream *sub_ts)
1198
+ {
1199
+ TokenStream *ts = tf_new(HyphenFilter, sub_ts);
1200
+ ts->next = &hf_next;
1201
+ ts->clone_i = &hf_clone_i;
1202
+ return ts;
1203
+ }
1204
+
1125
1205
  /****************************************************************************
1126
1206
  * LowerCaseFilter
1127
1207
  ****************************************************************************/
@@ -1257,64 +1337,44 @@ TokenStream *stem_filter_new(TokenStream *ts, const char *algorithm,
1257
1337
  Analyzer *standard_analyzer_new_with_words_len(const char **words, int len,
1258
1338
  bool lowercase)
1259
1339
  {
1260
- TokenStream *ts;
1340
+ TokenStream *ts = standard_tokenizer_new();
1261
1341
  if (lowercase) {
1262
- ts = stop_filter_new_with_words_len(lowercase_filter_new
1263
- (standard_tokenizer_new()),
1264
- words, len);
1265
- }
1266
- else {
1267
- ts = stop_filter_new_with_words_len(standard_tokenizer_new(),
1268
- words, len);
1342
+ ts = lowercase_filter_new(ts);
1269
1343
  }
1344
+ ts = hyphen_filter_new(stop_filter_new_with_words_len(ts, words, len));
1270
1345
  return analyzer_new(ts, NULL, NULL);
1271
1346
  }
1272
1347
 
1273
1348
  Analyzer *standard_analyzer_new_with_words(const char **words,
1274
1349
  bool lowercase)
1275
1350
  {
1276
- TokenStream *ts;
1351
+ TokenStream *ts = standard_tokenizer_new();
1277
1352
  if (lowercase) {
1278
- ts = stop_filter_new_with_words(lowercase_filter_new
1279
- (standard_tokenizer_new()),
1280
- words);
1281
- }
1282
- else {
1283
- ts = stop_filter_new_with_words(standard_tokenizer_new(),
1284
- words);
1353
+ ts = lowercase_filter_new(ts);
1285
1354
  }
1355
+ ts = hyphen_filter_new(stop_filter_new_with_words(ts, words));
1286
1356
  return analyzer_new(ts, NULL, NULL);
1287
1357
  }
1288
1358
 
1289
1359
  Analyzer *mb_standard_analyzer_new_with_words_len(const char **words,
1290
1360
  int len, bool lowercase)
1291
1361
  {
1292
- TokenStream *ts;
1362
+ TokenStream *ts = mb_standard_tokenizer_new();
1293
1363
  if (lowercase) {
1294
- ts = stop_filter_new_with_words_len(mb_lowercase_filter_new
1295
- (mb_standard_tokenizer_new
1296
- ()), words, len);
1297
- }
1298
- else {
1299
- ts = stop_filter_new_with_words_len(mb_standard_tokenizer_new(),
1300
- words, len);
1364
+ ts = mb_lowercase_filter_new(ts);
1301
1365
  }
1366
+ ts = hyphen_filter_new(stop_filter_new_with_words_len(ts, words, len));
1302
1367
  return analyzer_new(ts, NULL, NULL);
1303
1368
  }
1304
1369
 
1305
1370
  Analyzer *mb_standard_analyzer_new_with_words(const char **words,
1306
1371
  bool lowercase)
1307
1372
  {
1308
- TokenStream *ts;
1373
+ TokenStream *ts = mb_standard_tokenizer_new();
1309
1374
  if (lowercase) {
1310
- ts = stop_filter_new_with_words(mb_lowercase_filter_new
1311
- (mb_standard_tokenizer_new()),
1312
- words);
1313
- }
1314
- else {
1315
- ts = stop_filter_new_with_words(mb_standard_tokenizer_new(),
1316
- words);
1375
+ ts = mb_lowercase_filter_new(ts);
1317
1376
  }
1377
+ ts = hyphen_filter_new(stop_filter_new_with_words(ts, words));
1318
1378
  return analyzer_new(ts, NULL, NULL);
1319
1379
  }
1320
1380
 
@@ -89,6 +89,16 @@ typedef struct StopFilter
89
89
  HashTable *words;
90
90
  } StopFilter;
91
91
 
92
+ typedef struct HyphenFilter /* per-stream state used by hf_next() to split hyphenated tokens */
93
+ {
94
+ TokenFilter super; /* base filter; hf_next() reaches the wrapped sub-stream through it */
95
+ char text[MAX_WORD_SIZE]; /* parts of the last hyphenated token, each NUL-terminated, stored back to back */
96
+ int start; /* start offset of the hyphenated token the parts were taken from */
97
+ int pos; /* offset into text of the next part to emit */
98
+ int len; /* bytes of text in use; parts remain to be emitted while pos < len */
99
+ Token *tk; /* token most recently fetched from the sub-stream; rewritten in place for each part */
100
+ } HyphenFilter;
101
+
92
102
  typedef struct StemFilter
93
103
  {
94
104
  TokenFilter super;
@@ -111,6 +121,7 @@ extern TokenStream *mb_letter_tokenizer_new(bool lowercase);
111
121
  extern TokenStream *standard_tokenizer_new();
112
122
  extern TokenStream *mb_standard_tokenizer_new();
113
123
 
124
+ extern TokenStream *hyphen_filter_new(TokenStream *ts);
114
125
  extern TokenStream *lowercase_filter_new(TokenStream *ts);
115
126
  extern TokenStream *mb_lowercase_filter_new(TokenStream *ts);
116
127
 
@@ -16,6 +16,7 @@ ID id_lt;
16
16
  ID id_call;
17
17
  ID id_is_directory;
18
18
  ID id_close;
19
+ ID id_cclass;
19
20
  ID id_data;
20
21
 
21
22
  static ID id_mkdir_p;
@@ -97,6 +98,13 @@ VALUE frt_data_alloc(VALUE klass)
97
98
  return Frt_Make_Struct(klass);
98
99
  }
99
100
 
101
+ VALUE frt_define_class_under(VALUE module, char *name, VALUE super)
102
+ {
103
+ VALUE klass = rb_define_class_under(module, name, super);
104
+ rb_ivar_set(klass, id_cclass, Qtrue);
105
+ return klass;
106
+ }
107
+
100
108
  void frt_deref_free(void *p)
101
109
  {
102
110
  object_del(p);
@@ -255,6 +263,8 @@ void Init_ferret_ext(void)
255
263
  id_is_directory = rb_intern("directory?");
256
264
  id_close = rb_intern("close");
257
265
 
266
+ id_cclass = rb_intern("cclass");
267
+
258
268
  id_data = rb_intern("@data");
259
269
 
260
270
  /* Symbols */
@@ -13,6 +13,7 @@ extern ID id_lt;
13
13
  extern ID id_call;
14
14
  extern ID id_is_directory;
15
15
  extern ID id_close;
16
+ extern ID id_cclass;
16
17
  extern ID id_data;
17
18
 
18
19
  /* Symbols */
@@ -60,6 +61,7 @@ extern void frt_create_dir(VALUE rpath);
60
61
  extern VALUE frt_hs_to_rb_ary(HashSet *hs);
61
62
  extern void *frt_rb_data_ptr(VALUE val);
62
63
  extern char * frt_field(VALUE rfield);
64
+ extern VALUE frt_define_class_under(VALUE module, char *name, VALUE super);
63
65
 
64
66
  #define Frt_Make_Struct(klass)\
65
67
  rb_data_object_alloc(klass,NULL,(RUBY_DATA_FUNC)NULL,(RUBY_DATA_FUNC)NULL)
@@ -8,6 +8,7 @@
8
8
 
9
9
  #undef close
10
10
  #undef rename
11
+ #undef read
11
12
 
12
13
  #define frt_malloc xmalloc
13
14
  #define frt_calloc(n) xcalloc(n, 1)
@@ -722,8 +722,8 @@ void lazy_df_get_bytes(LazyDocField *self, char *buf, int start, int len)
722
722
  RAISE(IO_ERROR, "start out of range in LazyDocField#get_bytes. %d "
723
723
  "is not between 0 and %d", start, self->len);
724
724
  }
725
- if (len < 0) {
726
- RAISE(IO_ERROR, "len %d should be greater than 0", len);
725
+ if (len <= 0) {
726
+ RAISE(IO_ERROR, "len = %d, but should be greater than 0", len);
727
727
  }
728
728
  if (start + len > self->len) {
729
729
  RAISE(IO_ERROR, "Tried to read past end of field. Field is only %d "
data/ext/lang.h CHANGED
@@ -8,6 +8,7 @@
8
8
 
9
9
  #undef close
10
10
  #undef rename
11
+ #undef read
11
12
 
12
13
  #define frt_malloc xmalloc
13
14
  #define frt_calloc(n) xcalloc(n, 1)
@@ -1984,7 +1984,14 @@ static Query *get_term_q(QParser *qp, char *field, char *word)
1984
1984
  q->destroy_i(q);
1985
1985
  q = phq;
1986
1986
  do {
1987
- phq_add_term(q, token->text, token->pos_inc);
1987
+ if (token->pos_inc) {
1988
+ phq_add_term(q, token->text, token->pos_inc);
1989
+ /* add some slop since single term was expected */
1990
+ ((PhraseQuery *)q)->slop++;
1991
+ }
1992
+ else {
1993
+ phq_append_multi_term(q, token->text);
1994
+ }
1988
1995
  } while ((token = ts_next(stream)) != NULL);
1989
1996
  }
1990
1997
  }
@@ -2157,7 +2164,7 @@ static Phrase *ph_add_multi_word(Phrase *self, char *word)
2157
2164
  }
2158
2165
 
2159
2166
  static Query *get_phrase_query(QParser *qp, char *field,
2160
- Phrase *phrase, char *slop_str)
2167
+ Phrase *phrase, char *slop_str)
2161
2168
  {
2162
2169
  const int pos_cnt = phrase->size;
2163
2170
  Query *q = NULL;
@@ -2180,6 +2187,7 @@ static Query *get_phrase_query(QParser *qp, char *field,
2180
2187
  Token *token;
2181
2188
  TokenStream *stream;
2182
2189
  int i, j;
2190
+ int pos_inc = 0;
2183
2191
  q = phq_new(field);
2184
2192
  if (slop_str) {
2185
2193
  int slop;
@@ -2188,14 +2196,24 @@ static Query *get_phrase_query(QParser *qp, char *field,
2188
2196
  }
2189
2197
 
2190
2198
  for (i = 0; i < pos_cnt; i++) {
2191
- int pos_inc = phrase->positions[i].pos; /* Actually holds pos_inc */
2192
2199
  char **words = phrase->positions[i].terms;
2193
2200
  const int word_count = ary_size(words);
2201
+ if (pos_inc) {
2202
+ ((PhraseQuery *)q)->slop++;
2203
+ }
2204
+ pos_inc += phrase->positions[i].pos + 1; /* Actually holds pos_inc*/
2194
2205
 
2195
2206
  if (word_count == 1) {
2196
2207
  stream = get_cached_ts(qp, field, words[0]);
2197
2208
  while ((token = ts_next(stream))) {
2198
- phq_add_term(q, token->text, token->pos_inc + pos_inc);
2209
+ if (token->pos_inc) {
2210
+ phq_add_term(q, token->text,
2211
+ pos_inc ? pos_inc : token->pos_inc);
2212
+ }
2213
+ else {
2214
+ phq_append_multi_term(q, token->text);
2215
+ ((PhraseQuery *)q)->slop++;
2216
+ }
2199
2217
  pos_inc = 0;
2200
2218
  }
2201
2219
  }
@@ -2206,8 +2224,10 @@ static Query *get_phrase_query(QParser *qp, char *field,
2206
2224
  stream = get_cached_ts(qp, field, words[j]);
2207
2225
  if ((token = ts_next(stream))) {
2208
2226
  if (!added_position) {
2209
- phq_add_term(q, token->text, token->pos_inc + pos_inc);
2227
+ phq_add_term(q, token->text,
2228
+ pos_inc ? pos_inc : token->pos_inc);
2210
2229
  added_position = true;
2230
+ pos_inc = 0;
2211
2231
  }
2212
2232
  else {
2213
2233
  phq_append_multi_term(q, token->text);