ferret 0.11.5 → 0.11.6

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -13,7 +13,7 @@
13
13
  ****************************************************************************/
14
14
 
15
15
  INLINE Token *tk_set(Token *tk,
16
- char *text, int tlen, int start, int end, int pos_inc)
16
+ char *text, int tlen, off_t start, off_t end, int pos_inc)
17
17
  {
18
18
  if (tlen >= MAX_WORD_SIZE) {
19
19
  tlen = MAX_WORD_SIZE - 1;
@@ -31,16 +31,16 @@ INLINE Token *tk_set_ts(Token *tk,
31
31
  char *start, char *end, char *text, int pos_inc)
32
32
  {
33
33
  return tk_set(tk, start, (int)(end - start),
34
- (int)(start - text), (int)(end - text), pos_inc);
34
+ (off_t)(start - text), (off_t)(end - text), pos_inc);
35
35
  }
36
36
 
37
37
  INLINE Token *tk_set_no_len(Token *tk,
38
- char *text, int start, int end, int pos_inc)
38
+ char *text, off_t start, off_t end, int pos_inc)
39
39
  {
40
40
  return tk_set(tk, text, (int)strlen(text), start, end, pos_inc);
41
41
  }
42
42
 
43
- INLINE Token *w_tk_set(Token *tk, wchar_t *text, int start, int end,
43
+ INLINE Token *w_tk_set(Token *tk, wchar_t *text, off_t start, off_t end,
44
44
  int pos_inc)
45
45
  {
46
46
  int len = wcstombs(tk->text, text, MAX_WORD_SIZE - 1);
@@ -374,8 +374,8 @@ static Token *mb_wst_next_lc(TokenStream *ts)
374
374
  }
375
375
  *w = 0;
376
376
  ts->t = t;
377
- return w_tk_set(&(CTS(ts)->token), wbuf, (int)(start - ts->text),
378
- (int)(t - ts->text), 1);
377
+ return w_tk_set(&(CTS(ts)->token), wbuf, (off_t)(start - ts->text),
378
+ (off_t)(t - ts->text), 1);
379
379
  }
380
380
 
381
381
  TokenStream *mb_whitespace_tokenizer_new(bool lowercase)
@@ -512,8 +512,8 @@ Token *mb_lt_next_lc(TokenStream *ts)
512
512
  }
513
513
  *w = 0;
514
514
  ts->t = t;
515
- return w_tk_set(&(CTS(ts)->token), wbuf, (int)(start - ts->text),
516
- (int)(t - ts->text), 1);
515
+ return w_tk_set(&(CTS(ts)->token), wbuf, (off_t)(start - ts->text),
516
+ (off_t)(t - ts->text), 1);
517
517
  }
518
518
 
519
519
  TokenStream *mb_letter_tokenizer_new(bool lowercase)
@@ -926,8 +926,8 @@ static Token *std_next(TokenStream *ts)
926
926
  }
927
927
  ts->t = t + len;
928
928
  token[len] = 0;
929
- return tk_set(&(CTS(ts)->token), token, len, (int)(start - ts->text),
930
- (int)(ts->t - ts->text), 1);
929
+ return tk_set(&(CTS(ts)->token), token, len, (off_t)(start - ts->text),
930
+ (off_t)(ts->t - ts->text), 1);
931
931
  }
932
932
 
933
933
  /* now see how long a url we can find. */
@@ -974,8 +974,8 @@ static Token *std_next(TokenStream *ts)
974
974
  }
975
975
  }
976
976
  tk_set(&(CTS(ts)->token), token, token_i,
977
- (int)(start - ts->text),
978
- (int)(t - ts->text), 1);
977
+ (off_t)(start - ts->text),
978
+ (off_t)(t - ts->text), 1);
979
979
  }
980
980
  else { /* just return the url as is */
981
981
  tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
@@ -16,16 +16,16 @@ typedef struct Token
16
16
  {
17
17
  char text[MAX_WORD_SIZE];
18
18
  int len;
19
- int start;
20
- int end;
19
+ off_t start;
20
+ off_t end;
21
21
  int pos_inc;
22
22
  } Token;
23
23
 
24
24
  extern Token *tk_new();
25
25
  extern void tk_destroy(void *p);
26
- extern Token *tk_set(Token *tk, char *text, int tlen, int start, int end,
26
+ extern Token *tk_set(Token *tk, char *text, int tlen, off_t start, off_t end,
27
27
  int pos_inc);
28
- extern Token *tk_set_no_len(Token *tk, char *text, int start, int end,
28
+ extern Token *tk_set_no_len(Token *tk, char *text, off_t start, off_t end,
29
29
  int pos_inc);
30
30
  extern int tk_eq(Token *tk1, Token *tk2);
31
31
  extern int tk_cmp(Token *tk1, Token *tk2);
@@ -1450,10 +1450,12 @@ TermVector *fr_read_term_vector(FieldsReader *fr, int field_num)
1450
1450
  if (store_offsets) {
1451
1451
  int num_positions = tv->offset_cnt = is_read_vint(fdt_in);
1452
1452
  Offset *offsets = tv->offsets = ALLOC_N(Offset, num_positions);
1453
- off_t offset = 0;
1453
+ long long offset = 0;
1454
1454
  for (i = 0; i < num_positions; i++) {
1455
- offsets[i].start = offset += is_read_vint(fdt_in);
1456
- offsets[i].end = offset += is_read_vint(fdt_in);
1455
+ offsets[i].start =
1456
+ (off_t)(offset += (long long)is_read_vll(fdt_in));
1457
+ offsets[i].end =
1458
+ (off_t)(offset += (long long)is_read_vll(fdt_in));
1457
1459
  }
1458
1460
  }
1459
1461
  }
@@ -1681,13 +1683,13 @@ void fw_add_postings(FieldsWriter *fw,
1681
1683
 
1682
1684
  if (fi_store_offsets(fi)) {
1683
1685
  /* use delta encoding for offsets */
1684
- int last_end = 0;
1686
+ long long last_end = 0;
1685
1687
  os_write_vint(fdt_out, offset_count); /* write shared prefix length */
1686
1688
  for (i = 0; i < offset_count; i++) {
1687
- off_t start = offsets[i].start;
1688
- off_t end = offsets[i].end;
1689
- os_write_vint(fdt_out, start - last_end);
1690
- os_write_vint(fdt_out, end - start);
1689
+ long long start = (long long)offsets[i].start;
1690
+ long long end = (long long)offsets[i].end;
1691
+ os_write_vll(fdt_out, (unsigned long long)(start - last_end));
1692
+ os_write_vll(fdt_out, (unsigned long long)(end - start));
1691
1693
  last_end = end;
1692
1694
  }
1693
1695
  }
@@ -5197,7 +5199,7 @@ static void dw_add_posting(MemoryPool *mp,
5197
5199
  }
5198
5200
  }
5199
5201
 
5200
- static INLINE void dw_add_offsets(DocWriter *dw, int pos, int start, int end)
5202
+ static INLINE void dw_add_offsets(DocWriter *dw, int pos, off_t start, off_t end)
5201
5203
  {
5202
5204
  if (pos >= dw->offsets_capa) {
5203
5205
  int old_capa = dw->offsets_capa;
@@ -1192,8 +1192,8 @@ static VALUE
1192
1192
  frt_get_tv_offsets(Offset *offset)
1193
1193
  {
1194
1194
  return rb_struct_new(cTVOffsets,
1195
- INT2FIX(offset->start),
1196
- INT2FIX(offset->end),
1195
+ ULL2NUM((unsigned long long)offset->start),
1196
+ ULL2NUM((unsigned long long)offset->end),
1197
1197
  NULL);
1198
1198
  }
1199
1199
 
@@ -403,6 +403,36 @@ INLINE off_t is_read_voff_t(InStream *is)
403
403
  return res;
404
404
  }
405
405
 
406
+ /* optimized to use unchecked read_byte if there is definitely space */
407
+ INLINE unsigned long long is_read_vll(InStream *is)
408
+ {
409
+ register unsigned long long res, b;
410
+ register int shift = 7;
411
+
412
+ if (is->buf.pos > (is->buf.len - VINT_MAX_LEN)) {
413
+ b = is_read_byte(is);
414
+ res = b & 0x7F; /* 0x7F = 0b01111111 */
415
+
416
+ while ((b & 0x80) != 0) { /* 0x80 = 0b10000000 */
417
+ b = is_read_byte(is);
418
+ res |= (b & 0x7F) << shift;
419
+ shift += 7;
420
+ }
421
+ }
422
+ else { /* unchecked optimization */
423
+ b = read_byte(is);
424
+ res = b & 0x7F; /* 0x7F = 0b01111111 */
425
+
426
+ while ((b & 0x80) != 0) { /* 0x80 = 0b10000000 */
427
+ b = read_byte(is);
428
+ res |= (b & 0x7F) << shift;
429
+ shift += 7;
430
+ }
431
+ }
432
+
433
+ return res;
434
+ }
435
+
406
436
  INLINE void is_skip_vints(InStream *is, register int cnt)
407
437
  {
408
438
  for (; cnt > 0; cnt--) {
@@ -545,6 +575,25 @@ INLINE void os_write_voff_t(OutStream *os, register off_t num)
545
575
  }
546
576
  }
547
577
 
578
+ /* optimized to use an unchecked write if there is space */
579
+ INLINE void os_write_vll(OutStream *os, register unsigned long long num)
580
+ {
581
+ if (os->buf.pos > VINT_END) {
582
+ while (num > 127) {
583
+ os_write_byte(os, (uchar)((num & 0x7f) | 0x80));
584
+ num >>= 7;
585
+ }
586
+ os_write_byte(os, (uchar)num);
587
+ }
588
+ else {
589
+ while (num > 127) {
590
+ write_byte(os, (uchar)((num & 0x7f) | 0x80));
591
+ num >>= 7;
592
+ }
593
+ write_byte(os, (uchar)num);
594
+ }
595
+ }
596
+
548
597
  void os_write_string(OutStream *os, char *str)
549
598
  {
550
599
  int len = (int)strlen(str);
@@ -176,7 +176,11 @@ struct Store
176
176
  CompoundStore *cmpd; /* for compound_store only */
177
177
  } dir;
178
178
 
179
+ #ifdef POSH_OS_WIN32
180
+ int file_mode;
181
+ #else
179
182
  mode_t file_mode;
183
+ #endif
180
184
  HashSet *locks;
181
185
 
182
186
  /**
@@ -554,6 +558,16 @@ extern void os_write_vint(OutStream *os, register unsigned int num);
554
558
  */
555
559
  extern void os_write_voff_t(OutStream *os, register off_t num);
556
560
 
561
+ /**
562
+ * Write an unsigned long long to OutStream in compressed VINT format.
563
+ * TODO: describe VINT format
564
+ *
565
+ * @param os OutStream to write to
566
+ * @param num the long long to write
567
+ * @raise IO_ERROR if there is an error writing to the file-system
568
+ */
569
+ extern void os_write_vll(OutStream *os, register unsigned long long num);
570
+
557
571
  /**
558
572
  * Write a string to the OutStream. A string is an integer +length+ in VINT
559
573
  * format (see os_write_vint) followed by +length+ bytes. The string can then
@@ -694,6 +708,17 @@ extern INLINE void is_skip_vints(InStream *is, register int cnt);
694
708
  */
695
709
  extern INLINE off_t is_read_voff_t(InStream *is);
696
710
 
711
+ /**
712
+ * Read a compressed (VINT) unsigned long long from the InStream.
713
+ * TODO: describe VINT format
714
+ *
715
+ * @param is the InStream to read from
716
+ * @return a long long
717
+ * @raise IO_ERROR if there is a error reading from the file-system
718
+ * @raise EOF_ERROR if there is an attempt to read past the end of the file
719
+ */
720
+ extern INLINE unsigned long long is_read_vll(InStream *is);
721
+
697
722
  /**
698
723
  * Read a string from the InStream. A string is an integer +length+ in vint
699
724
  * format (see is_read_vint) followed by +length+ bytes. This is the format
@@ -1,3 +1,3 @@
1
1
  module Ferret
2
- VERSION = '0.11.5'
2
+ VERSION = '0.11.6'
3
3
  end
metadata CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.2
3
3
  specification_version: 1
4
4
  name: ferret
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.11.5
7
- date: 2007-11-17 00:00:00 +11:00
6
+ version: 0.11.6
7
+ date: 2007-11-29 00:00:00 +11:00
8
8
  summary: Ruby indexing library.
9
9
  require_paths:
10
10
  - lib