ferret 0.1.1 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
data/README CHANGED
@@ -107,7 +107,7 @@ be true about Ferret. Apart from the bits about it being in Java.
107
107
 
108
108
  [<b>David Balmain</b>] Port to Ruby
109
109
 
110
- [<b>Doug Cutting and friends</b>] Original Java Lucene
110
+ [The Apache Software Foundation (Doug Cutting and friends)] Original Apache Lucene
111
111
 
112
112
  == License
113
113
 
data/TODO CHANGED
@@ -5,5 +5,8 @@ Send suggestions for this list to mailto:dbalmain@gmail.com
5
5
  === To Do
6
6
 
7
7
  * Add the ability to persist an in memory index to Ferret::Index::Index
8
- * Add unicode support
9
8
 
9
+ === Done
10
+
11
+ * Add UTF-8 support
12
+ * Multi Field Query
data/TUTORIAL CHANGED
@@ -28,7 +28,9 @@ For more options when creating an Index refer to Ferret::Index::Index.
28
28
 
29
29
  === Adding Documents
30
30
 
31
- To add a document you can simply add a string or an array of strings.
31
+ To add a document you can simply add a string or an array of strings. This will
32
+ store all the strings in the "" (i.e. empty string) field (unless you specify the
33
+ default field when you create the index).
32
34
 
33
35
  index << "This is a new document to be indexed"
34
36
  index << ["And here", "is another", "new document", "to be indexed"]
@@ -108,6 +110,12 @@ document type;
108
110
 
109
111
  NOTE: documents are indexed from 0.
110
112
 
113
+ The default field is an empty string when you use the simple string document so
114
+ to access those strings you'll have to type;
115
+
116
+ index << "This is a document"
117
+ index[0][""]
118
+
111
119
  Let's go back to the database example above. If we store all of our documents
112
120
  with an id then we can access that field using the id. As long as we called
113
121
  our id field "id" we can do this
@@ -1,18 +1,43 @@
1
1
  #include "ferret.h"
2
2
 
3
+ /* IDs */
4
+ ID frt_newobj;
5
+
6
+ /* Modules */
7
+ VALUE mFerret;
8
+ VALUE mStore;
9
+ VALUE mIndex;
10
+ VALUE mUtils;
11
+ VALUE mStringHelper;
12
+
13
+ /* Classes */
14
+ VALUE cRAMDirectory;
15
+ VALUE cIndexIn;
16
+ VALUE cBufferedIndexIn;
17
+ VALUE cFSIndexIn;
18
+ VALUE cIndexOut;
19
+ VALUE cBufferedIndexOut;
20
+ VALUE cFSIndexOut;
21
+ VALUE cRAMIndexOut;
22
+ VALUE cRAMIndexIn;
23
+ VALUE cTerm;
24
+ VALUE cTermBuffer;
25
+ VALUE cPriorityQueue;
26
+ VALUE cSegmentMergeQueue;
27
+
3
28
  void
4
29
  Init_ferret_ext(void)
5
30
  {
6
- // IDs
31
+ /* IDs */
7
32
  frt_newobj = rb_intern("new");
8
33
 
9
- // Modules
34
+ /* Modules */
10
35
  mFerret = rb_define_module("Ferret");
11
36
  mStore = rb_define_module_under(mFerret, "Store");
12
37
  mIndex = rb_define_module_under(mFerret, "Index");
13
38
  mUtils = rb_define_module_under(mFerret, "Utils");
14
39
 
15
- // Inits
40
+ /* Inits */
16
41
  Init_indexio();
17
42
  Init_term();
18
43
  Init_term_buffer();
@@ -42,32 +42,32 @@ typedef struct RAMFile {
42
42
  int length;
43
43
  } RAMFile;
44
44
 
45
- // IDs
46
- ID frt_newobj;
45
+ /* IDs */
46
+ extern ID frt_newobj;
47
47
 
48
- // Modules
49
- VALUE mFerret;
50
- VALUE mStore;
51
- VALUE mIndex;
52
- VALUE mUtils;
53
- VALUE mStringHelper;
48
+ /* Modules */
49
+ extern VALUE mFerret;
50
+ extern VALUE mStore;
51
+ extern VALUE mIndex;
52
+ extern VALUE mUtils;
53
+ extern VALUE mStringHelper;
54
54
 
55
- // Classes
56
- VALUE cRAMDirectory;
57
- VALUE cIndexIn;
58
- VALUE cBufferedIndexIn;
59
- VALUE cFSIndexIn;
60
- VALUE cIndexOut;
61
- VALUE cBufferedIndexOut;
62
- VALUE cFSIndexOut;
63
- VALUE cRAMIndexOut;
64
- VALUE cRAMIndexIn;
65
- VALUE cTerm;
66
- VALUE cTermBuffer;
67
- VALUE cPriorityQueue;
68
- VALUE cSegmentMergeQueue;
55
+ /* Classes */
56
+ extern VALUE cRAMDirectory;
57
+ extern VALUE cIndexIn;
58
+ extern VALUE cBufferedIndexIn;
59
+ extern VALUE cFSIndexIn;
60
+ extern VALUE cIndexOut;
61
+ extern VALUE cBufferedIndexOut;
62
+ extern VALUE cFSIndexOut;
63
+ extern VALUE cRAMIndexOut;
64
+ extern VALUE cRAMIndexIn;
65
+ extern VALUE cTerm;
66
+ extern VALUE cTermBuffer;
67
+ extern VALUE cPriorityQueue;
68
+ extern VALUE cSegmentMergeQueue;
69
69
 
70
- // Ferret Inits
70
+ /* Ferret Inits */
71
71
  extern void Init_indexio();
72
72
  extern void Init_term();
73
73
  extern void Init_priority_queue();
@@ -76,7 +76,7 @@ extern void Init_segment_merge_queue();
76
76
  extern void Init_ram_directory();
77
77
  extern void Init_string_helper();
78
78
 
79
- // External functions
79
+ /* External functions */
80
80
  extern int frt_hash(register char *p, register int len);
81
81
  extern unsigned long long frt_read_vint(VALUE self);
82
82
  extern void frt_read_chars(VALUE self, char *buf, int offset, int len);
@@ -82,7 +82,7 @@ frt_indexin_refill(VALUE self)
82
82
  rStr, INT2FIX(0), INT2FIX(len_to_read));
83
83
 
84
84
  memcpy(my_buf->buffer, RSTRING(rStr)->ptr, BUFFER_SIZE);
85
- //my_buf->buffer = StringValuePtr(rStr);
85
+ /* my_buf->buffer = StringValuePtr(rStr); */
86
86
 
87
87
  my_buf->len = len_to_read;
88
88
  my_buf->start = start;
@@ -143,7 +143,7 @@ frt_read_bytes(VALUE self, VALUE rbuffer, int offset, int len)
143
143
 
144
144
  my_buf->start = my_buf->start + len;
145
145
  my_buf->pos = 0;
146
- my_buf->len = 0; // trigger refill() on read()
146
+ my_buf->len = 0; /* trigger refill() on read() */
147
147
  }
148
148
 
149
149
  return rbuf;
@@ -168,11 +168,11 @@ frt_indexin_seek(VALUE self, VALUE rpos)
168
168
  Data_Get_Struct(self, IndexBuffer, my_buf);
169
169
 
170
170
  if ((pos >= my_buf->start) && (pos < (my_buf->start + my_buf->len))) {
171
- my_buf->pos = pos - my_buf->start; // seek within buffer
171
+ my_buf->pos = pos - my_buf->start; /* seek within buffer */
172
172
  } else {
173
173
  my_buf->start = pos;
174
174
  my_buf->pos = 0;
175
- my_buf->len = 0; // trigger refill() on read()
175
+ my_buf->len = 0; /* trigger refill() on read() */
176
176
  rb_funcall(self, frt_seek_internal, 1, rpos);
177
177
  }
178
178
  return Qnil;
@@ -229,9 +229,9 @@ frt_read_vint(VALUE self)
229
229
  register int shift = 7;
230
230
 
231
231
  b = frt_read_byte(self);
232
- i = b & 0x7F; // 0x7F = 0b01111111
232
+ i = b & 0x7F; /* 0x7F = 0b01111111 */
233
233
 
234
- while ((b & 0x80) != 0) {// 0x80 = 0b10000000
234
+ while ((b & 0x80) != 0) {/* 0x80 = 0b10000000 */
235
235
  b = frt_read_byte(self);
236
236
  i |= (b & 0x7F) << shift;
237
237
  shift += 7;
@@ -249,7 +249,7 @@ frt_indexin_read_vint(VALUE self)
249
249
  void
250
250
  frt_read_chars(VALUE self, char* buffer, int off, int len)
251
251
  {
252
- //byte_t b, b1, b2;
252
+ /* byte_t b, b1, b2; */
253
253
  int end, i;
254
254
 
255
255
  end = off + len;
@@ -257,21 +257,6 @@ frt_read_chars(VALUE self, char* buffer, int off, int len)
257
257
  for(i = off; i < end; i++) {
258
258
  buffer[i] = frt_read_byte(self);
259
259
  }
260
- // for(i = off; i < end; i++){
261
- // b = frt_read_byte(self);
262
- // if((b & 0x80) == 0){
263
- // buffer[i] = (char)(b & 0x7F);
264
- // } else {
265
- // if((b & 0xE0) != 0xE0){
266
- // b1 = frt_read_byte(self);
267
- // buffer[i] = (char)(((b & 0x1F) << 6) | (b1 & 0x3F));
268
- // } else{
269
- // b1 = frt_read_byte(self);
270
- // b2 = frt_read_byte(self);
271
- // buffer[i] = (char)(((b & 0x0F) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F));
272
- // }
273
- // }
274
- // }
275
260
  }
276
261
 
277
262
  static VALUE
@@ -412,7 +397,7 @@ static VALUE
412
397
  frt_indexout_write_ulong(VALUE self, VALUE rulong)
413
398
  {
414
399
  unsigned long long l;
415
- l = rb_num2ull(rulong); // ruby 1.8 doesn't have NUM2ULL. Added in 1.9
400
+ l = rb_num2ull(rulong); /* ruby 1.8 doesn't have NUM2ULL. Added in 1.9 */
416
401
  frt_write_byte(self, (l >> 56) & 0xFF);
417
402
  frt_write_byte(self, (l >> 48) & 0xFF);
418
403
  frt_write_byte(self, (l >> 40) & 0xFF);
@@ -492,13 +477,13 @@ frt_indexout_write_string(VALUE self, VALUE rstr)
492
477
  void
493
478
  Init_indexio(void)
494
479
  {
495
- // IDs
480
+ /* IDs */
496
481
  frt_length = rb_intern("length");
497
482
  frt_flush_buffer = rb_intern("flush_buffer");
498
483
  frt_read_internal = rb_intern("read_internal");
499
484
  frt_seek_internal = rb_intern("seek_internal");
500
485
 
501
- // IndexInput
486
+ /* IndexInput */
502
487
  cIndexIn = rb_define_class_under(mStore, "IndexInput", rb_cObject);
503
488
  cBufferedIndexIn = rb_define_class_under(mStore, "BufferedIndexInput", cIndexIn);
504
489
  rb_define_alloc_func(cBufferedIndexIn, frt_indexbuffer_alloc);
@@ -518,7 +503,7 @@ Init_indexio(void)
518
503
  rb_define_method(cBufferedIndexIn, "read_string", frt_indexin_read_string, 0);
519
504
  rb_define_method(cBufferedIndexIn, "read_chars", frt_indexin_read_bytes, 3);
520
505
 
521
- // IndexOutput
506
+ /* IndexOutput */
522
507
  cIndexOut = rb_define_class_under(mStore, "IndexOutput", rb_cObject);
523
508
  cBufferedIndexOut = rb_define_class_under(mStore, "BufferedIndexOutput", cIndexOut);
524
509
  rb_define_alloc_func(cBufferedIndexOut, frt_indexbuffer_alloc);
@@ -538,6 +523,6 @@ Init_indexio(void)
538
523
  rb_define_method(cBufferedIndexOut, "write_chars", frt_indexout_write_chars, 3);
539
524
  rb_define_method(cBufferedIndexOut, "write_string", frt_indexout_write_string, 1);
540
525
 
541
- // FSIndexInput
542
- //cFSIndexIn = rb_define_class_under(mStore, "FSIndexInput", cBufferedIndexIn);
526
+ /* FSIndexInput */
527
+ /*cFSIndexIn = rb_define_class_under(mStore, "FSIndexInput", cBufferedIndexIn); */
543
528
  }
@@ -101,7 +101,7 @@ frt_rio_flush_buffer(VALUE self, VALUE rsrc, VALUE rlen)
101
101
  int buffer_number, buffer_offset, bytes_in_buffer, bytes_to_copy;
102
102
  int src_offset;
103
103
  int len = FIX2INT(rlen);
104
- //char *src = StringValuePtr(rsrc);
104
+ /* char *src = StringValuePtr(rsrc); */
105
105
  int pointer = FIX2INT(rb_iv_get(self, "pointer"));
106
106
 
107
107
  VALUE file = rb_iv_get(self, "file");
@@ -275,26 +275,26 @@ frt_rii_close(VALUE self)
275
275
  void
276
276
  Init_ram_directory(void)
277
277
  {
278
- // IDs
278
+ /* IDs */
279
279
  flush = rb_intern("flush");
280
280
  seek = rb_intern("seek");
281
281
 
282
- // RAMDirectory
282
+ /* RAMDirectory */
283
283
  VALUE cDirectory = rb_define_class_under(mStore, "Directory", rb_cObject);
284
284
  cRAMDirectory = rb_define_class_under(mStore, "RAMDirectory", cDirectory);
285
285
 
286
- // RAMFile
286
+ /* RAMFile */
287
287
  VALUE cRAMFile = rb_define_class_under(cRAMDirectory, "RAMFile", rb_cObject);
288
288
  rb_define_alloc_func(cRAMFile, frt_rf_alloc);
289
289
 
290
- // Methods
290
+ /* Methods */
291
291
  rb_define_method(cRAMFile, "length", frt_rf_length, 0);
292
292
 
293
- // RAMIndexOutput
293
+ /* RAMIndexOutput */
294
294
  cRAMIndexOut = rb_define_class_under(cRAMDirectory, "RAMIndexOutput", cBufferedIndexOut);
295
- //rb_define_alloc_func(cRAMIndexOut, frt_ramio_alloc);
295
+ /*rb_define_alloc_func(cRAMIndexOut, frt_ramio_alloc); */
296
296
 
297
- // Methods
297
+ /* Methods */
298
298
  rb_define_method(cRAMIndexOut, "initialize", frt_rio_init, 1);
299
299
  rb_define_method(cRAMIndexOut, "length", frt_rio_length, 0);
300
300
  rb_define_method(cRAMIndexOut, "flush_buffer", frt_rio_flush_buffer, 2);
@@ -303,11 +303,11 @@ Init_ram_directory(void)
303
303
  rb_define_method(cRAMIndexOut, "close", frt_rio_close, 0);
304
304
  rb_define_method(cRAMIndexOut, "write_to", frt_rio_write_to, 1);
305
305
 
306
- // RAMIndexInput
306
+ /* RAMIndexInput */
307
307
  cRAMIndexIn = rb_define_class_under(cRAMDirectory, "RAMIndexInput", cBufferedIndexIn);
308
- //rb_define_alloc_func(cRAMIndexIn, frt_ramio_alloc);
308
+ /*rb_define_alloc_func(cRAMIndexIn, frt_ramio_alloc); */
309
309
 
310
- // Methods
310
+ /* Methods */
311
311
  rb_define_method(cRAMIndexIn, "initialize", frt_rii_init, 1);
312
312
  rb_define_method(cRAMIndexIn, "length", frt_rii_length, 0);
313
313
  rb_define_method(cRAMIndexIn, "read_internal", frt_rii_read_internal, 3);
@@ -30,11 +30,11 @@ frt_smq_less_than(VALUE self, VALUE rsti1, VALUE rsti2)
30
30
  void
31
31
  Init_segment_merge_queue(void)
32
32
  {
33
- // IDs
33
+ /* IDs */
34
34
  eq = rb_intern("==");
35
35
  lt = rb_intern("<");
36
36
 
37
- // SegmentMergeQueue
37
+ /* SegmentMergeQueue */
38
38
  cSegmentMergeQueue = rb_define_class_under(mIndex, "SegmentMergeQueue", cPriorityQueue);
39
39
 
40
40
  rb_define_method(cSegmentMergeQueue, "less_than", frt_smq_less_than, 2);
@@ -35,7 +35,7 @@ frt_sh_string_difference(VALUE self, VALUE rstr1, VALUE rstr2)
35
35
  void
36
36
  Init_string_helper(void)
37
37
  {
38
- // StringHelper
38
+ /* StringHelper */
39
39
  mStringHelper = rb_define_module_under(mUtils, "StringHelper");
40
40
 
41
41
  rb_define_method(mStringHelper, "string_difference", frt_sh_string_difference, 2);
data/ext/term.c CHANGED
@@ -145,13 +145,15 @@ frt_term_compare_to(VALUE self, VALUE rother)
145
145
  } else
146
146
  comp = mylen > olen ? 1 : -1;
147
147
  }
148
- //comp = strcmp(term->field, other->field);
149
- //if(comp == 0)
150
- // comp = strcmp(term->text, other->text);
148
+ /*
149
+ comp = strcmp(term->field, other->field);
150
+ if(comp == 0)
151
+ comp = strcmp(term->text, other->text);
152
+ */
151
153
  return INT2FIX(comp);
152
154
  }
153
155
 
154
- // keep in synch with fuction above
156
+ /* keep in synch with the function above */
155
157
  int
156
158
  frt_term_compare_to_int(VALUE self, VALUE rother)
157
159
  {
@@ -175,9 +177,11 @@ frt_term_compare_to_int(VALUE self, VALUE rother)
175
177
  } else
176
178
  comp = mylen > olen ? 1 : -1;
177
179
  }
178
- //comp = strcmp(term->field, other->field);
179
- //if(comp == 0)
180
- // comp = strcmp(term->text, other->text);
180
+ /*
181
+ comp = strcmp(term->field, other->field);
182
+ if(comp == 0)
183
+ comp = strcmp(term->text, other->text);
184
+ */
181
185
  return comp;
182
186
  }
183
187
 
@@ -214,11 +218,13 @@ frt_term_eq(VALUE self, VALUE rother)
214
218
  }
215
219
 
216
220
 
217
- //static VALUE
218
- //frt_term_compare_to(VALUE self, VALUE other)
219
- //{
220
- // return INT2FIX(frt_term_compare_to_int(self, other));
221
- //}
221
+ /*
222
+ static VALUE
223
+ frt_term_compare_to(VALUE self, VALUE other)
224
+ {
225
+ return INT2FIX(frt_term_compare_to_int(self, other));
226
+ }
227
+ */
222
228
 
223
229
  static VALUE
224
230
  frt_term_hash(VALUE self)
@@ -238,7 +244,7 @@ frt_term_hash(VALUE self)
238
244
  void
239
245
  Init_term(void)
240
246
  {
241
- //Term
247
+ /* Term */
242
248
  cTerm = rb_define_class_under(mIndex, "Term", rb_cObject);
243
249
  rb_define_alloc_func(cTerm, frt_term_alloc);
244
250
  rb_include_module(cTerm, rb_mComparable);
@@ -267,15 +267,15 @@ frt_termbuffer_hash(VALUE self)
267
267
 
268
268
  void
269
269
  Init_term_buffer(void) {
270
- // IDs
270
+ /* IDs */
271
271
  field_name = rb_intern("name");
272
272
 
273
- // TermBuffer
273
+ /* TermBuffer */
274
274
  cTermBuffer = rb_define_class_under(mIndex, "TermBuffer", rb_cObject);
275
275
  rb_define_alloc_func(cTermBuffer, frt_termbuffer_alloc);
276
276
  rb_include_module(cTermBuffer, rb_mComparable);
277
277
 
278
- // Methods
278
+ /* Methods */
279
279
  rb_define_method(cTermBuffer, "initialize", frt_termbuffer_init, 0);
280
280
  rb_define_method(cTermBuffer, "initialize_copy", frt_termbuffer_init_copy, 1);
281
281
  rb_define_method(cTermBuffer, "text", frt_termbuffer_get_text, 0);
@@ -22,7 +22,7 @@
22
22
  #++
23
23
  # :include: ../TUTORIAL
24
24
  module Ferret
25
- VERSION = '0.1.1'
25
+ VERSION = '0.1.2'
26
26
  end
27
27
 
28
28
  require 'ferret/utils'
@@ -12,7 +12,7 @@ module Ferret::Analysis
12
12
  # words correctly as well as tokenizing things like email addresses, web
13
13
  # addresses, phone numbers, etc.
14
14
 
15
- class StandardTokenizer < RETokenizer
15
+ class StandardTokenizer < RegExpTokenizer
16
16
  ALPHA = /[[:alpha:]]+/
17
17
  APOSTROPHE = /#{ALPHA}('#{ALPHA})+/
18
18
  ACRONYM = /#{ALPHA}\.(#{ALPHA}\.)+/