ferret 0.10.9 → 0.10.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ext/index.c CHANGED
@@ -1552,7 +1552,7 @@ TermEnum *ste_new(InStream *is, SegmentFieldIndex *sfi)
1552
1552
 
1553
1553
  typedef struct TermEnumWrapper
1554
1554
  {
1555
- int base;
1555
+ int index;
1556
1556
  TermEnum *te;
1557
1557
  int *doc_map;
1558
1558
  IndexReader *ir;
@@ -1567,13 +1567,16 @@ typedef struct MultiTermEnum
1567
1567
  TermEnumWrapper *tews;
1568
1568
  int size;
1569
1569
  int **field_num_map;
1570
+ int ti_cnt;
1571
+ TermInfo *tis;
1572
+ int *ti_indexes;
1570
1573
  } MultiTermEnum;
1571
1574
 
1572
1575
  static bool tew_lt(const TermEnumWrapper *tew1, const TermEnumWrapper *tew2)
1573
1576
  {
1574
1577
  int cmpres = strcmp(tew1->term, tew2->term);
1575
1578
  if (cmpres == 0) {
1576
- return tew1->base < tew2->base;
1579
+ return tew1->index < tew2->index;
1577
1580
  }
1578
1581
  else {
1579
1582
  return cmpres < 0;
@@ -1617,10 +1620,10 @@ static void tew_destroy(TermEnumWrapper *tew)
1617
1620
  tew->te->close(tew->te);
1618
1621
  }
1619
1622
 
1620
- TermEnumWrapper *tew_setup(TermEnumWrapper *tew, int base, TermEnum *te,
1623
+ TermEnumWrapper *tew_setup(TermEnumWrapper *tew, int index, TermEnum *te,
1621
1624
  IndexReader *ir)
1622
1625
  {
1623
- tew->base = base;
1626
+ tew->index = index;
1624
1627
  tew->ir = ir;
1625
1628
  tew->te = te;
1626
1629
  tew->term = te->curr_term;
@@ -1646,9 +1649,12 @@ static char *mte_next(TermEnum *te)
1646
1649
 
1647
1650
  te->curr_ti.doc_freq = 0;
1648
1651
 
1652
+ MTE(te)->ti_cnt = 0;
1649
1653
  while ((top != NULL) && (strcmp(te->curr_term, top->term) == 0)) {
1650
1654
  pq_pop(MTE(te)->tew_queue);
1651
1655
  te->curr_ti.doc_freq += top->te->curr_ti.doc_freq;/* increment freq */
1656
+ MTE(te)->ti_indexes[MTE(te)->ti_cnt] = top->index;
1657
+ MTE(te)->tis[MTE(te)->ti_cnt++] = top->te->curr_ti;
1652
1658
  if (tew_next(top)) {
1653
1659
  pq_push(MTE(te)->tew_queue, top); /* restore queue */
1654
1660
  }
@@ -1711,6 +1717,8 @@ static void mte_close(TermEnum *te)
1711
1717
  tew_destroy(&(MTE(te)->tews[i]));
1712
1718
  }
1713
1719
  free(MTE(te)->tews);
1720
+ free(MTE(te)->tis);
1721
+ free(MTE(te)->ti_indexes);
1714
1722
  pq_destroy(MTE(te)->tew_queue);
1715
1723
  free(te);
1716
1724
  }
@@ -1718,7 +1726,6 @@ static void mte_close(TermEnum *te)
1718
1726
  TermEnum *mte_new(MultiReader *mr, int field_num, const char *term)
1719
1727
  {
1720
1728
  IndexReader **readers = mr->sub_readers;
1721
- int *starts = mr->starts;
1722
1729
  int r_cnt = mr->r_cnt;
1723
1730
  int i;
1724
1731
  IndexReader *reader;
@@ -1731,6 +1738,8 @@ TermEnum *mte_new(MultiReader *mr, int field_num, const char *term)
1731
1738
  TE(mte)->close = &mte_close;
1732
1739
 
1733
1740
  mte->size = r_cnt;
1741
+ mte->tis = ALLOC_AND_ZERO_N(TermInfo, r_cnt);
1742
+ mte->ti_indexes = ALLOC_AND_ZERO_N(int, r_cnt);
1734
1743
  mte->tews = ALLOC_AND_ZERO_N(TermEnumWrapper, r_cnt);
1735
1744
  mte->tew_queue = pq_new(r_cnt, (lt_ft)&tew_lt, (free_ft)NULL);
1736
1745
  mte->field_num_map = mr->field_num_map;
@@ -1750,7 +1759,7 @@ TermEnum *mte_new(MultiReader *mr, int field_num, const char *term)
1750
1759
  sub_te = reader->terms(reader, fnum);
1751
1760
  }
1752
1761
 
1753
- tew = tew_setup(&(mte->tews[i]), starts[i], sub_te, reader);
1762
+ tew = tew_setup(&(mte->tews[i]), i, sub_te, reader);
1754
1763
  if (((term == NULL) && tew_next(tew))
1755
1764
  || (tew->term && (tew->term[0] != '\0'))) {
1756
1765
  pq_push(mte->tew_queue, tew); /* initialize queue */
@@ -1759,7 +1768,7 @@ TermEnum *mte_new(MultiReader *mr, int field_num, const char *term)
1759
1768
  /* add the term_enum_wrapper just in case */
1760
1769
  sub_te = reader->terms(reader, 0);
1761
1770
  sub_te->field_num = -1;
1762
- tew_setup(&(mte->tews[i]), starts[i], sub_te, reader);
1771
+ tew_setup(&(mte->tews[i]), i, sub_te, reader);
1763
1772
  }
1764
1773
  }
1765
1774
 
@@ -2386,45 +2395,29 @@ typedef struct MultiTermDocEnum
2386
2395
  {
2387
2396
  TermDocEnum tde;
2388
2397
  int *starts;
2389
- char *term;
2390
- int field_num;
2391
2398
  int base;
2392
2399
  int ptr;
2393
2400
  int ir_cnt;
2394
- int **field_num_map;
2401
+ char *state;
2402
+ TermEnum *te;
2395
2403
  IndexReader **irs;
2396
2404
  TermDocEnum **irs_tde;
2397
2405
  TermDocEnum *curr_tde;
2398
- TermDocEnum *(*reader_tde_i)(IndexReader *ir);
2399
2406
  } MultiTermDocEnum;
2400
2407
 
2401
- static TermDocEnum *mtde_reader_tde_i(IndexReader *ir)
2402
- {
2403
- return ir->term_docs(ir);
2404
- }
2405
-
2406
- static TermDocEnum *mtde_get_tde_i(MultiTermDocEnum *mtde, int i)
2408
+ static TermDocEnum *mtde_next_tde(MultiTermDocEnum *mtde)
2407
2409
  {
2408
- if (mtde->term == NULL) {
2409
- return NULL;
2410
+ mtde->ptr++;
2411
+ while (mtde->ptr < mtde->ir_cnt && !mtde->state[mtde->ptr]) {
2412
+ mtde->ptr++;
2413
+ }
2414
+ if (mtde->ptr >= mtde->ir_cnt) {
2415
+ return mtde->curr_tde = NULL;
2410
2416
  }
2411
2417
  else {
2412
- int fnum = mtde->field_num_map
2413
- ? mtde->field_num_map[i][mtde->field_num]
2414
- : mtde->field_num;
2415
-
2416
- if (fnum >= 0) {
2417
- TermDocEnum *tde = mtde->irs_tde[i];
2418
- if (tde == NULL) {
2419
- tde = mtde->irs_tde[i] = mtde->reader_tde_i(mtde->irs[i]);
2420
- }
2421
-
2422
- tde->seek(tde, fnum, mtde->term);
2423
- return tde;
2424
- }
2425
- else {
2426
- return NULL;
2427
- }
2418
+ TermDocEnum *tde = mtde->curr_tde = mtde->irs_tde[mtde->ptr];
2419
+ mtde->base = mtde->starts[mtde->ptr];
2420
+ return tde;
2428
2421
  }
2429
2422
  }
2430
2423
 
@@ -2435,30 +2428,35 @@ static TermDocEnum *mtde_get_tde_i(MultiTermDocEnum *mtde, int i)
2435
2428
  }\
2436
2429
  } while (0)
2437
2430
 
2438
- static void mtde_seek(TermDocEnum *tde, int field_num, const char *term)
2431
+ static void mtde_seek_te(TermDocEnum *tde, TermEnum *te)
2439
2432
  {
2433
+ int i;
2440
2434
  MultiTermDocEnum *mtde = MTDE(tde);
2441
- if (mtde->term != NULL) {
2442
- free(mtde->term);
2435
+ memset(mtde->state, 0, mtde->ir_cnt);
2436
+ for (i = MTE(te)->ti_cnt - 1; i >= 0; i--) {
2437
+ int index = MTE(te)->ti_indexes[i];
2438
+ TermDocEnum *tde = mtde->irs_tde[index];
2439
+ mtde->state[index] = 1;
2440
+ if (tde->close == stde_close) {
2441
+ stde_seek_ti(STDE(tde), MTE(te)->tis + i);
2442
+ } else if (tde->close == stpe_close) {
2443
+ stpe_seek_ti(STDE(tde), MTE(te)->tis + i);
2444
+ } else {
2445
+ tde->seek(tde, MTE(te)->tews[index].te->field_num, te->curr_term);
2446
+ }
2443
2447
  }
2444
- mtde->term = estrdup(term);
2445
- mtde->field_num = field_num;
2446
2448
  mtde->base = 0;
2447
- mtde->ptr = 0;
2448
- mtde->curr_tde = NULL;
2449
+ mtde->ptr = -1;
2450
+ mtde_next_tde(mtde);
2449
2451
  }
2450
2452
 
2451
- static void mtde_seek_te(TermDocEnum *tde, TermEnum *te)
2453
+ static void mtde_seek(TermDocEnum *tde, int field_num, const char *term)
2452
2454
  {
2453
2455
  MultiTermDocEnum *mtde = MTDE(tde);
2454
- if (mtde->term != NULL) {
2455
- free(mtde->term);
2456
- }
2457
- mtde->term = estrdup(te->curr_term);
2458
- mtde->field_num = te->field_num;
2459
- mtde->base = 0;
2460
- mtde->ptr = 0;
2461
- mtde->curr_tde = NULL;
2456
+ TermEnum *te = mtde->te;
2457
+ te->set_field(te, field_num);
2458
+ te->skip_to(te, term);
2459
+ mtde_seek_te(tde, te);
2462
2460
  }
2463
2461
 
2464
2462
  static int mtde_doc_num(TermDocEnum *tde)
@@ -2479,10 +2477,7 @@ static bool mtde_next(TermDocEnum *tde)
2479
2477
  if (mtde->curr_tde != NULL && mtde->curr_tde->next(mtde->curr_tde)) {
2480
2478
  return true;
2481
2479
  }
2482
- else if (mtde->ptr < mtde->ir_cnt) {
2483
- mtde->base = mtde->starts[mtde->ptr];
2484
- mtde->curr_tde = mtde_get_tde_i(mtde, mtde->ptr);
2485
- mtde->ptr++;
2480
+ else if (mtde_next_tde(mtde)) {
2486
2481
  return mtde_next(tde);
2487
2482
  }
2488
2483
  else {
@@ -2495,19 +2490,11 @@ static int mtde_read(TermDocEnum *tde, int *docs, int *freqs, int req_num)
2495
2490
  int i, end = 0, last_end = 0, b;
2496
2491
  MultiTermDocEnum *mtde = MTDE(tde);
2497
2492
  while (true) {
2498
- while (mtde->curr_tde == NULL) {
2499
- if (mtde->ptr < mtde->ir_cnt) { /* try next segment */
2500
- mtde->base = mtde->starts[mtde->ptr];
2501
- mtde->curr_tde = mtde_get_tde_i(mtde, mtde->ptr++);
2502
- }
2503
- else {
2504
- return end;
2505
- }
2506
- }
2493
+ if (mtde->curr_tde == NULL) return end;
2507
2494
  end += mtde->curr_tde->read(mtde->curr_tde, docs + last_end,
2508
2495
  freqs + last_end, req_num - last_end);
2509
2496
  if (end == last_end) { /* none left in segment */
2510
- mtde->curr_tde = NULL;
2497
+ if (!mtde_next_tde(mtde)) return end;
2511
2498
  }
2512
2499
  else { /* got some */
2513
2500
  b = mtde->base; /* adjust doc numbers */
@@ -2528,19 +2515,15 @@ static bool mtde_skip_to(TermDocEnum *tde, int target_doc_num)
2528
2515
  {
2529
2516
  MultiTermDocEnum *mtde = MTDE(tde);
2530
2517
  TermDocEnum *curr_tde;
2531
- while (mtde->ptr < mtde->ir_cnt) {
2532
- curr_tde = mtde->curr_tde;
2533
- if (curr_tde && (target_doc_num < mtde->starts[mtde->ptr]) &&
2518
+ while (NULL != (curr_tde = mtde->curr_tde)) {
2519
+ if (target_doc_num < mtde->starts[mtde->ptr + 1] &&
2534
2520
  (curr_tde->skip_to(curr_tde, target_doc_num - mtde->base))) {
2535
2521
  return true;
2536
2522
  }
2537
2523
 
2538
- mtde->base = mtde->starts[mtde->ptr];
2539
- mtde->curr_tde = mtde_get_tde_i(mtde, mtde->ptr);
2540
- mtde->ptr++;
2524
+ mtde_next_tde(mtde);
2541
2525
  }
2542
2526
 
2543
- curr_tde = mtde->curr_tde;
2544
2527
  if (curr_tde) {
2545
2528
  return curr_tde->skip_to(curr_tde, target_doc_num - mtde->base);
2546
2529
  }
@@ -2554,20 +2537,18 @@ static void mtde_close(TermDocEnum *tde)
2554
2537
  MultiTermDocEnum *mtde = MTDE(tde);
2555
2538
  TermDocEnum *tmp_tde;
2556
2539
  int i = mtde->ir_cnt;
2540
+ mtde->te->close(mtde->te);
2557
2541
  while (i > 0) {
2558
2542
  i--;
2559
- if ((tmp_tde = mtde->irs_tde[i]) != NULL) {
2560
- tmp_tde->close(tmp_tde);
2561
- }
2562
- }
2563
- if (mtde->term != NULL) {
2564
- free(mtde->term);
2543
+ tmp_tde = mtde->irs_tde[i];
2544
+ tmp_tde->close(tmp_tde);
2565
2545
  }
2566
2546
  free(mtde->irs_tde);
2547
+ free(mtde->state);
2567
2548
  free(tde);
2568
2549
  }
2569
2550
 
2570
- TermDocEnum *mtde_new(MultiReader *mr)
2551
+ TermDocEnum *mtxe_new(MultiReader *mr)
2571
2552
  {
2572
2553
  MultiTermDocEnum *mtde = ALLOC_AND_ZERO(MultiTermDocEnum);
2573
2554
  TermDocEnum *tde = TDE(mtde);
@@ -2578,28 +2559,34 @@ TermDocEnum *mtde_new(MultiReader *mr)
2578
2559
  tde->next = &mtde_next;
2579
2560
  tde->read = &mtde_read;
2580
2561
  tde->skip_to = &mtde_skip_to;
2581
- tde->next_position = NULL;
2582
2562
  tde->close = &mtde_close;
2583
2563
 
2564
+ mtde->state = ALLOC_AND_ZERO_N(char, mr->r_cnt);
2565
+ mtde->te = ((IndexReader *)mr)->terms((IndexReader *)mr, 0);
2584
2566
  mtde->starts = mr->starts;
2585
2567
  mtde->ir_cnt = mr->r_cnt;
2586
2568
  mtde->irs = mr->sub_readers;
2587
- mtde->field_num_map = mr->field_num_map;
2588
2569
  mtde->irs_tde = ALLOC_AND_ZERO_N(TermDocEnum *, mr->r_cnt);
2589
- mtde->reader_tde_i = &mtde_reader_tde_i;
2590
2570
 
2591
2571
  return tde;
2592
2572
  }
2593
2573
 
2574
+ TermDocEnum *mtde_new(MultiReader *mr)
2575
+ {
2576
+ int i;
2577
+ TermDocEnum *tde = mtxe_new(mr);
2578
+ tde->next_position = NULL;
2579
+ for (i = mr->r_cnt - 1; i >= 0; i--) {
2580
+ IndexReader *ir = mr->sub_readers[i];
2581
+ MTDE(tde)->irs_tde[i] = ir->term_docs(ir);
2582
+ }
2583
+ return tde;
2584
+ }
2585
+
2594
2586
  /****************************************************************************
2595
2587
  * MultiTermPosEnum
2596
2588
  ****************************************************************************/
2597
2589
 
2598
- TermDocEnum *mtpe_reader_tde_i(IndexReader *ir)
2599
- {
2600
- return ir->term_positions(ir);
2601
- }
2602
-
2603
2590
  int mtpe_next_position(TermDocEnum *tde)
2604
2591
  {
2605
2592
  CHECK_CURR_TDE("next_position");
@@ -2608,9 +2595,13 @@ int mtpe_next_position(TermDocEnum *tde)
2608
2595
 
2609
2596
  TermDocEnum *mtpe_new(MultiReader *mr)
2610
2597
  {
2611
- TermDocEnum *tde = mtde_new(mr);
2598
+ int i;
2599
+ TermDocEnum *tde = mtxe_new(mr);
2612
2600
  tde->next_position = &mtpe_next_position;
2613
- MTDE(tde)->reader_tde_i = &mtpe_reader_tde_i;
2601
+ for (i = mr->r_cnt - 1; i >= 0; i--) {
2602
+ IndexReader *ir = mr->sub_readers[i];
2603
+ MTDE(tde)->irs_tde[i] = ir->term_positions(ir);
2604
+ }
2614
2605
  return tde;
2615
2606
  }
2616
2607
 
data/ext/index.h CHANGED
@@ -378,6 +378,7 @@ struct TermDocEnum
378
378
  {
379
379
  void (*seek)(TermDocEnum *tde, int field_num, const char *term);
380
380
  void (*seek_te)(TermDocEnum *tde, TermEnum *te);
381
+ void (*seek_ti)(TermDocEnum *tde, TermInfo *ti);
381
382
  int (*doc_num)(TermDocEnum *tde);
382
383
  int (*freq)(TermDocEnum *tde);
383
384
  bool (*next)(TermDocEnum *tde);
data/ext/q_fuzzy.c CHANGED
@@ -264,5 +264,5 @@ Query *fuzq_new_conf(const char *field, const char *term,
264
264
 
265
265
  Query *fuzq_new(const char *field, const char *term)
266
266
  {
267
- return fuzq_new_conf(term, field, 0.0f, 0, 0);
267
+ return fuzq_new_conf(field, term, 0.0f, 0, 0);
268
268
  }
data/ext/r_index.c CHANGED
@@ -564,6 +564,19 @@ frt_fis_to_s(VALUE self)
564
564
  free(fis_s);
565
565
  return rfis_s;
566
566
  }
567
+
568
+ /*
569
+ * call-seq:
570
+ * fis.size -> int
571
+ *
572
+ * Return the number of fields in the FieldInfos object.
573
+ */
574
+ static VALUE
575
+ frt_fis_size(VALUE self)
576
+ {
577
+ FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
578
+ return INT2FIX(fis->size);
579
+ }
567
580
 
568
581
  /*
569
582
  * call-seq:
@@ -2225,7 +2238,7 @@ frt_ir_get_doc(int argc, VALUE *argv, VALUE self)
2225
2238
  pos = (pos < 0) ? (max + pos) : pos;
2226
2239
  if (pos < 0 || pos >= max) {
2227
2240
  rb_raise(rb_eArgError, ":%d is out of range [%d..%d] for "
2228
- "IndexWriter#[]", pos, 0, max,
2241
+ "IndexReader#[]", pos, 0, max,
2229
2242
  rb_id2name(SYM2ID(argv)));
2230
2243
  }
2231
2244
  return frt_get_lazy_doc(ir->get_lazy_doc(ir, pos));
@@ -2425,6 +2438,25 @@ frt_ir_terms_from(VALUE self, VALUE rfield, VALUE rterm)
2425
2438
  StringValuePtr(rterm)));
2426
2439
  }
2427
2440
 
2441
+ /*
2442
+ * call-seq:
2443
+ * index_reader.term_count(field) -> int
2444
+ *
2445
+ * Returns a count of the number of terms in the field
2446
+ */
2447
+ static VALUE
2448
+ frt_ir_term_count(VALUE self, VALUE rfield)
2449
+ {
2450
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2451
+ TermEnum *te = ir_terms(ir, frt_field(rfield));
2452
+ int count = 0;
2453
+ while (te->next(te)) {
2454
+ count++;
2455
+ }
2456
+ te->close(te);
2457
+ return INT2FIX(count);
2458
+ }
2459
+
2428
2460
  /*
2429
2461
  * call-seq:
2430
2462
  * index_reader.fields -> array of field-names
@@ -2483,6 +2515,19 @@ frt_ir_tk_fields(VALUE self)
2483
2515
  return rfield_names;
2484
2516
  }
2485
2517
 
2518
+ /*
2519
+ * call-seq:
2520
+ * index_reader.version -> int
2521
+ *
2522
+ * Returns the current version of the index reader.
2523
+ */
2524
+ static VALUE
2525
+ frt_ir_version(VALUE self)
2526
+ {
2527
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2528
+ return INT2FIX(ir->sis->version);
2529
+ }
2530
+
2486
2531
  /****************************************************************************
2487
2532
  *
2488
2533
  * Init Functions
@@ -2708,6 +2753,7 @@ Init_FieldInfos(void)
2708
2753
  rb_define_method(cFieldInfos, "add_field", frt_fis_add_field, -1);
2709
2754
  rb_define_method(cFieldInfos, "each", frt_fis_each, 0);
2710
2755
  rb_define_method(cFieldInfos, "to_s", frt_fis_to_s, 0);
2756
+ rb_define_method(cFieldInfos, "size", frt_fis_size, 0);
2711
2757
  rb_define_method(cFieldInfos, "create_index",
2712
2758
  frt_fis_create_index, 1);
2713
2759
  rb_define_method(cFieldInfos, "fields", frt_fis_get_fields, 0);
@@ -3188,6 +3234,7 @@ Init_IndexReader(void)
3188
3234
  {
3189
3235
  cIndexReader = rb_define_class_under(mIndex, "IndexReader", rb_cObject);
3190
3236
  rb_define_alloc_func(cIndexReader, frt_data_alloc);
3237
+ /*rb_define_singleton_method(cIndexReader, "version", frt_class_ir_version, 0); */
3191
3238
  rb_define_method(cIndexReader, "initialize", frt_ir_init, 1);
3192
3239
  rb_define_method(cIndexReader, "set_norm", frt_ir_set_norm, 3);
3193
3240
  rb_define_method(cIndexReader, "norms", frt_ir_norms, 1);
@@ -3212,10 +3259,12 @@ Init_IndexReader(void)
3212
3259
  rb_define_method(cIndexReader, "doc_freq", frt_ir_doc_freq, 2);
3213
3260
  rb_define_method(cIndexReader, "terms", frt_ir_terms, 1);
3214
3261
  rb_define_method(cIndexReader, "terms_from", frt_ir_terms_from, 2);
3262
+ rb_define_method(cIndexReader, "term_count", frt_ir_term_count, 1);
3215
3263
  rb_define_method(cIndexReader, "fields", frt_ir_fields, 0);
3216
3264
  rb_define_method(cIndexReader, "field_names", frt_ir_fields, 0);
3217
3265
  rb_define_method(cIndexReader, "field_infos", frt_ir_field_infos, 0);
3218
3266
  rb_define_method(cIndexReader, "tokenized_fields", frt_ir_tk_fields, 0);
3267
+ rb_define_method(cIndexReader, "version", frt_ir_version, 0);
3219
3268
  }
3220
3269
 
3221
3270
  /* rdoc hack
data/ext/r_search.c CHANGED
@@ -104,6 +104,7 @@ static ID id_score;
104
104
  static ID id_hits;
105
105
  static ID id_total_hits;
106
106
  static ID id_max_score;
107
+ static ID id_searcher;
107
108
 
108
109
  /* Search */
109
110
  static VALUE sym_offset;
@@ -152,7 +153,7 @@ frt_get_hit(Hit *hit)
152
153
  ****************************************************************************/
153
154
 
154
155
  static VALUE
155
- frt_get_td(TopDocs *td)
156
+ frt_get_td(TopDocs *td, VALUE rsearcher)
156
157
  {
157
158
  int i;
158
159
  VALUE rtop_docs;
@@ -167,6 +168,7 @@ frt_get_td(TopDocs *td)
167
168
  INT2FIX(td->total_hits),
168
169
  hit_ary,
169
170
  rb_float_new((double)td->max_score),
171
+ rsearcher,
170
172
  NULL);
171
173
  td_destroy(td);
172
174
  return rtop_docs;
@@ -174,20 +176,26 @@ frt_get_td(TopDocs *td)
174
176
 
175
177
  /*
176
178
  * call-seq:
177
- * top_doc.to_s -> string
179
+ * top_doc.to_s(field = :id) -> string
178
180
  *
179
181
 * Returns a string representation of the top_doc in readable format.
180
182
  */
181
183
  static VALUE
182
- frt_td_to_s(VALUE self)
184
+ frt_td_to_s(int argc, VALUE *argv, VALUE self)
183
185
  {
184
186
  int i;
185
187
  VALUE rhits = rb_funcall(self, id_hits, 0);
188
+ Searcher *sea = (Searcher *)DATA_PTR(rb_funcall(self, id_searcher, 0));
186
189
  const int len = RARRAY(rhits)->len;
187
190
  char *str = ALLOC_N(char, len * 64 + 100);
188
191
  char *s = str;
192
+ char *field = "id";
189
193
  VALUE rstr;
190
194
 
195
+ if (argc) {
196
+ field = frt_field(argv[0]);
197
+ }
198
+
191
199
  sprintf(s, "TopDocs: total_hits = %d, max_score = %f [\n",
192
200
  FIX2INT(rb_funcall(self, id_total_hits, 0)),
193
201
  NUM2DBL(rb_funcall(self, id_max_score, 0)));
@@ -195,10 +203,18 @@ frt_td_to_s(VALUE self)
195
203
 
196
204
  for (i = 0; i < len; i++) {
197
205
  VALUE rhit = RARRAY(rhits)->ptr[i];
198
- sprintf(s, "\t%d: %f\n",
199
- FIX2INT(rb_funcall(rhit, id_doc, 0)),
206
+ int doc_id = FIX2INT(rb_funcall(rhit, id_doc, 0));
207
+ char *value = "";
208
+ LazyDoc *lzd = sea->get_lazy_doc(sea, doc_id);
209
+ LazyDocField *lzdf = h_get(lzd->field_dict, field);
210
+ if (NULL != lzdf) {
211
+ value = lazy_df_get_data(lzdf, 0);
212
+ }
213
+
214
+ sprintf(s, "\t%d \"%s\": %f\n", doc_id, value,
200
215
  NUM2DBL(rb_funcall(rhit, id_score, 0)));
201
216
  s += strlen(s);
217
+ lazy_doc_close(lzd);
202
218
  }
203
219
 
204
220
  sprintf(s, "]\n");
@@ -2388,7 +2404,7 @@ frt_sea_search(int argc, VALUE *argv, VALUE self)
2388
2404
  Query *query;
2389
2405
  rb_scan_args(argc, argv, "11", &rquery, &roptions);
2390
2406
  Data_Get_Struct(rquery, Query, query);
2391
- return frt_get_td(frt_sea_search_internal(query, roptions, sea));
2407
+ return frt_get_td(frt_sea_search_internal(query, roptions, sea), self);
2392
2408
  }
2393
2409
 
2394
2410
  /*
@@ -2760,13 +2776,15 @@ Init_TopDocs(void)
2760
2776
  "total_hits",
2761
2777
  "hits",
2762
2778
  "max_score",
2779
+ "searcher",
2763
2780
  NULL);
2764
2781
  rb_set_class_path(cTopDocs, mSearch, td_class);
2765
2782
  rb_const_set(mSearch, rb_intern(td_class), cTopDocs);
2766
- rb_define_method(cTopDocs, "to_s", frt_td_to_s, 0);
2783
+ rb_define_method(cTopDocs, "to_s", frt_td_to_s, -1);
2767
2784
  id_hits = rb_intern("hits");
2768
2785
  id_total_hits = rb_intern("total_hits");
2769
2786
  id_max_score = rb_intern("max_score");
2787
+ id_searcher = rb_intern("searcher");
2770
2788
  }
2771
2789
 
2772
2790
  /*
data/ext/search.c CHANGED
@@ -122,11 +122,12 @@ static void hit_pq_down(PriorityQueue *pq)
122
122
  static Hit *hit_pq_pop(PriorityQueue *pq)
123
123
  {
124
124
  if (pq->size > 0) {
125
- Hit *result = (Hit *)pq->heap[1]; /* save first value */
126
- pq->heap[1] = pq->heap[pq->size]; /* move last to first */
127
- pq->heap[pq->size] = NULL;
125
+ Hit **heap = (Hit **)pq->heap;
126
+ Hit *result = heap[1]; /* save first value */
127
+ heap[1] = heap[pq->size]; /* move last to first */
128
+ heap[pq->size] = NULL;
128
129
  pq->size--;
129
- hit_pq_down(pq); /* adjust heap */
130
+ hit_pq_down(pq); /* adjust heap */
130
131
  return result;
131
132
  }
132
133
  else {
@@ -1079,8 +1080,8 @@ static TopDocs *isea_search_w(Searcher *self,
1079
1080
  for (i = num_docs - 1; i >= 0; i--) {
1080
1081
  score_docs[i] = hq_pop(hq);
1081
1082
  /*
1082
- hit = score_docs[i] = pq_pop(hq);
1083
- printf("hit = %d-->%f\n", hit->doc, hit->score);
1083
+ printf("score_docs[i][%d] = [%ld] => %d-->%f\n", i,
1084
+ score_docs[i], score_docs[i]->doc, score_docs[i]->score);
1084
1085
  */
1085
1086
  }
1086
1087
  }
data/ext/sort.c CHANGED
@@ -426,8 +426,8 @@ int sf_string_compare(void *index, Hit *hit1, Hit *hit2)
426
426
  char *s2 = ((StringIndex *)index)->values[
427
427
  ((StringIndex *)index)->index[hit2->doc]];
428
428
 
429
- if (s1 == NULL) return s1 ? -1 : 0;
430
- if (s2 == NULL) return 1;
429
+ if (s1 == NULL) return s2 ? 1 : 0;
430
+ if (s2 == NULL) return -1;
431
431
 
432
432
  #ifdef POSH_OS_WIN32
433
433
  return strcmp(s1, s2);
@@ -874,8 +874,8 @@ bool fdshq_lt(FieldDoc *fd1, FieldDoc *fd2)
874
874
  do {
875
875
  char *s1 = cmps1[i].val.s;
876
876
  char *s2 = cmps2[i].val.s;
877
- if (s1 == NULL) c = s2 ? -1 : 0;
878
- else if (s2 == NULL) c = 1;
877
+ if (s1 == NULL) c = s2 ? 1 : 0;
878
+ else if (s2 == NULL) c = -1;
879
879
  #ifdef POSH_OS_WIN32
880
880
  else c = strcmp(s1, s2);
881
881
  #else
data/lib/ferret/index.rb CHANGED
@@ -179,11 +179,13 @@ module Ferret::Index
179
179
  # Alternatively you may want to use the HTML entity
180
180
  # &#8230; or the UTF-8 string "\342\200\246".
181
181
  def highlight(query, doc_id, options = {})
182
- ensure_searcher_open()
183
- @searcher.highlight(do_process_query(query),
184
- doc_id,
185
- options[:field]||@options[:default_field],
186
- options)
182
+ @dir.synchronize do
183
+ ensure_searcher_open()
184
+ @searcher.highlight(do_process_query(query),
185
+ doc_id,
186
+ options[:field]||@options[:default_field],
187
+ options)
188
+ end
187
189
  end
188
190
 
189
191
  # Closes this index by closing its associated reader and writer objects.
@@ -273,9 +275,14 @@ module Ferret::Index
273
275
  end
274
276
  ensure_writer_open()
275
277
 
276
- old_analyzer = @writer.analyzer if analyzer
277
- @writer.add_document(doc)
278
- @writer.analyzer = old_analyzer if analyzer
278
+ if analyzer
279
+ old_analyzer = @writer.analyzer
280
+ @writer.analyzer = analyzer
281
+ @writer.add_document(doc)
282
+ @writer.analyzer = old_analyzer
283
+ else
284
+ @writer.add_document(doc)
285
+ end
279
286
 
280
287
  flush() if @auto_flush
281
288
  end
@@ -1,3 +1,3 @@
1
1
  module Ferret
2
- VERSION = '0.10.9'
2
+ VERSION = '0.10.10'
3
3
  end
@@ -0,0 +1,132 @@
1
+ # Author: Matthew D Moss
2
+ #
3
+ # Written for ruby quiz #25
4
+ #
5
+ class JapaneseTranslator
6
+ # My knowledge of counting Japanese is limited, so this may not
7
+ # be entirely correct; in particular, I don't know what rules
8
+ # to follow after 'hyaku man' (1,000,000).
9
+ # I also combine a digit with its group, such as 'gohyaku' rather
10
+ # than 'go hyaku'; I just like reading it better that way.
11
+
12
+ DIGITS = %w(zero ichi ni san yon go roku nana hachi kyu)
13
+ GROUPS = %w(nothingtoseeheremovealong ju hyaku sen)
14
+ MAN = 10000
15
+
16
+ def to_spoken(val)
17
+ case val <=> 0
18
+ when -1
19
+ '- ' + to_spoken(-val)
20
+ when 0
21
+ DIGITS[0]
22
+ else
23
+ group(val, 0)
24
+ end
25
+ end
26
+
27
+ private
28
+
29
+ def group(val, level)
30
+ if val >= MAN
31
+ group(val / MAN, 0) + 'man ' + group(val % MAN, 0)
32
+ else
33
+ case val
34
+ when 0
35
+ ''
36
+ when 1
37
+ level == 0 ? DIGITS[val] : GROUPS[level]
38
+ when 2...10
39
+ DIGITS[val] + (GROUPS[level] if level > 0).to_s
40
+ else
41
+ group(val / 10, level+1) + ' ' + group(val % 10, level)
42
+ end
43
+ end
44
+ end
45
+ end
46
+
47
+
48
+ class USEnglishTranslator
49
+ # Formal, US English. Optional 'and'. Will not produce things
50
+ # such as 'twelve hundred' but rather 'one thousand two hundred'.
51
+ # The use of 'and' is incomplete; it is sometimes missed.
52
+
53
+ DIGITS = %w(zero one two three four five six seven eight nine)
54
+ TEENS = %w(ten eleven twelve thirteen fourteen fifteen sixteen
55
+ seventeen eighteen nineteen)
56
+ TENS = %w(hello world twenty thirty forty fifty sixty seventy
57
+ eighty ninety)
58
+ GROUPS = %w(thousand million billion trillion quadrillion
59
+ quintillion sextillion septillion octillion nonillion
60
+ decillion)
61
+ K = 1000
62
+
63
+ def initialize(conjunction = true)
64
+ @conjunction = conjunction
65
+ end
66
+
67
+ def to_spoken(val)
68
+ case val <=> 0
69
+ when -1
70
+ 'negative ' + to_spoken(-val)
71
+ when 0
72
+ DIGITS[0]
73
+ else
74
+ group(val, 0).flatten.join(' ')
75
+ end
76
+ end
77
+
78
+ private
79
+
80
+ def group(val, level)
81
+ x = group(val / K, level + 1) << GROUPS[level] if val >= K
82
+ x.to_a << under_1000(val % K, level)
83
+ end
84
+
85
+ def under_1000(val, level)
86
+ x = [DIGITS[val / 100]] << 'hundred' if val >= 100
87
+ x.to_a << under_100(val % 100, (level == 0 and not x.nil?))
88
+ end
89
+
90
+ def under_100(val, junction)
91
+ x = [('and' if @conjunction and junction)] # wyf?
92
+ case val
93
+ when 0
94
+ []
95
+ when 1...10
96
+ x << DIGITS[val]
97
+ when 10...20
98
+ x << TEENS[val - 10]
99
+ else
100
+ d = val % 10
101
+ x << (TENS[val / 10] + ('-' + DIGITS[d] if d != 0).to_s)
102
+ end
103
+ end
104
+ end
105
+
106
+
107
+ class Integer
108
+ def to_spoken(translator = USEnglishTranslator.new)
109
+ translator.to_spoken(self).squeeze(' ').strip
110
+ end
111
+ end
112
+
113
+ if $0 == __FILE__
114
+ SAMPLES = [ 0, 1, 2, 5, 10, 11, 14, 18, 20, 21, 29, 33, 42, 50, 87, 99,
115
+ 100, 101, 110, 167, 199, 200, 201, 276, 300, 314, 500, 610,
116
+ 1000, 1039, 1347, 2309, 3098, 23501, 32767, 70000, 5480283,
117
+ 2435489238, 234100090000, -42, -2001 ]
118
+
119
+ TRANSLATORS = { 'US English' => USEnglishTranslator.new,
120
+ 'Japanese' => JapaneseTranslator.new }
121
+
122
+
123
+ # main
124
+ TRANSLATORS.each do |lang, translator|
125
+ puts
126
+ puts lang
127
+ puts '-' * lang.length
128
+ SAMPLES.each do |val|
129
+ puts "%12d => %s" % [val, val.to_spoken(translator)]
130
+ end
131
+ end
132
+ end
@@ -0,0 +1,78 @@
1
+ require File.dirname(__FILE__) + "/../test_helper"
2
+ require File.dirname(__FILE__) + "/number_to_spoken.rb"
3
+ require 'thread'
4
+
5
+ class IndexThreadSafetyTest < Test::Unit::TestCase
6
+ include Ferret::Index
7
+
8
+ INDEX_DIR = File.expand_path(File.join(File.dirname(__FILE__), "index"))
9
+ ITERATIONS = 1000
10
+ NUM_THREADS = 2
11
+ ANALYZER = Ferret::Analysis::StandardAnalyzer.new()
12
+
13
+ def setup
14
+ index = Index.new(:path => INDEX_DIR,
15
+ :create => true,
16
+ :analyzer => ANALYZER,
17
+ :default_field => :content)
18
+ index.close
19
+ end
20
+
21
+ def indexing_thread()
22
+ index = Index.new(:path => INDEX_DIR,
23
+ :analyzer => ANALYZER,
24
+ :default_field => :content)
25
+
26
+ ITERATIONS.times do
27
+ choice = rand()
28
+
29
+ if choice > 0.98
30
+ do_optimize(index)
31
+ elsif choice > 0.7
32
+ do_delete_doc(index)
33
+ elsif choice > 0.5
34
+ do_search(index)
35
+ else
36
+ do_add_doc(index)
37
+ end
38
+ end
39
+ end
40
+
41
+ def do_optimize(index)
42
+ puts "Optimizing the index"
43
+ index.optimize
44
+ end
45
+
46
+ def do_delete_doc(index)
47
+ return if index.size == 0
48
+ doc_num = rand(index.size)
49
+ puts "Deleting #{doc_num} from index which has#{index.has_deletions? ? "" : " no"} deletions"
50
+ puts "document was already deleted" if (index.deleted?(doc_num))
51
+ index.delete(doc_num)
52
+ end
53
+
54
+ def do_add_doc(index)
55
+ n = rand(0xFFFFFFFF)
56
+ d = {:id => n, :content => n.to_spoken}
57
+ puts("Adding #{n}")
58
+ index << d
59
+ end
60
+
61
+ def do_search(index)
62
+ n = rand(0xFFFFFFFF)
63
+ puts("Searching for #{n}")
64
+ hits = index.search_each(n.to_spoken, :num_docs => 3) do |d, s|
65
+ puts "Hit for #{n}: #{index[d][:id]} - #{s}"
66
+ end
67
+ puts("Searched for #{n}: total = #{hits}")
68
+ end
69
+
70
+ def test_threading
71
+ threads = []
72
+ NUM_THREADS.times do
73
+ threads << Thread.new { indexing_thread }
74
+ end
75
+
76
+ threads.each {|t| t.join}
77
+ end
78
+ end
@@ -0,0 +1,137 @@
1
+ require File.dirname(__FILE__) + "/../test_helper"
2
+ require File.dirname(__FILE__) + "/../utils/number_to_spoken.rb"
3
+ require 'thread'
4
+
5
+ class ThreadSafetyTest
6
+ include Ferret::Index
7
+ include Ferret::Search
8
+ include Ferret::Store
9
+ include Ferret::Document
10
+
11
+ def initialize(options)
12
+ @options = options
13
+ end
14
+
15
+ INDEX_DIR = File.expand_path(File.join(File.dirname(__FILE__), "index"))
16
+ ANALYZER = Ferret::Analysis::Analyzer.new()
17
+ ITERATIONS = 19
18
+ @@searcher = nil
19
+
20
+ def run_index_thread(writer)
21
+ reopen_interval = 30 + rand(60)
22
+
23
+ use_compound_file = false
24
+
25
+ (400*ITERATIONS).times do |i|
26
+ d = Document.new()
27
+ n = rand(0xFFFFFFFF)
28
+ d << Field.new("id", n.to_s, Field::Store::YES, Field::Index::UNTOKENIZED)
29
+ d << Field.new("contents", n.to_spoken, Field::Store::NO, Field::Index::TOKENIZED)
30
+ puts("Adding #{n}")
31
+
32
+ # Switch between single and multiple file segments
33
+ use_compound_file = (rand < 0.5)
34
+ writer.use_compound_file = use_compound_file
35
+
36
+ writer << d
37
+
38
+ if (i % reopen_interval == 0)
39
+ writer.close()
40
+ writer = IndexWriter.new(INDEX_DIR, :analyzer => ANALYZER)
41
+ end
42
+ end
43
+
44
+ writer.close()
45
+ rescue => e
46
+ puts e
47
+ puts e.backtrace
48
+ raise e
49
+ end
50
+
51
+ def run_search_thread(use_global)
52
+ reopen_interval = 10 + rand(20)
53
+
54
+ unless use_global
55
+ searcher = IndexSearcher.new(INDEX_DIR)
56
+ end
57
+
58
+ (50*ITERATIONS).times do |i|
59
+ search_for(rand(0xFFFFFFFF), (searcher.nil? ? @@searcher : searcher))
60
+ if (i%reopen_interval == 0)
61
+ if (searcher == nil)
62
+ @@searcher = IndexSearcher.new(INDEX_DIR)
63
+ else
64
+ searcher.close()
65
+ searcher = IndexSearcher.new(INDEX_DIR)
66
+ end
67
+ end
68
+ end
69
+ rescue => e
70
+ puts e
71
+ puts e.backtrace
72
+ raise e
73
+ end
74
+
75
+ def search_for(n, searcher)
76
+ puts("Searching for #{n}")
77
+ hits =
78
+ searcher.search(Ferret::QueryParser.parse(n.to_spoken, "contents", :analyzer => ANALYZER),
79
+ :num_docs => 3)
80
+ puts("Search for #{n}: total = #{hits.size}")
81
+ hits.each do |d, s|
82
+ puts "Hit for #{n}: #{searcher.reader.get_document(d)["id"]} - #{s}"
83
+ end
84
+ end
85
+
86
+ def run_test_threads
87
+
88
+ threads = []
89
+ unless @options[:read_only]
90
+ writer = IndexWriter.new(INDEX_DIR, :analyzer => ANALYZER,
91
+ :create => !@options[:add])
92
+
93
+ threads << Thread.new { run_index_thread(writer) }
94
+
95
+ sleep(1)
96
+ end
97
+
98
+ threads << Thread.new { run_search_thread(false)}
99
+
100
+ @@searcher = IndexSearcher.new(INDEX_DIR)
101
+ threads << Thread.new { run_search_thread(true)}
102
+
103
+ threads << Thread.new { run_search_thread(true)}
104
+
105
+ threads.each {|t| t.join}
106
+ end
107
+ end
108
+
109
+
110
+ if $0 == __FILE__
111
+ require 'optparse'
112
+
113
+ OPTIONS = {
114
+ :all => false,
115
+ :read_only => false,
116
+ }
117
+
118
+ ARGV.options do |opts|
119
+ script_name = File.basename($0)
120
+ opts.banner = "Usage: ruby #{script_name} [options]"
121
+
122
+ opts.separator ""
123
+
124
+ opts.on("-r", "--read-only", "Read Only.") { OPTIONS[:all] = true }
125
+ opts.on("-a", "--all", "All.") { OPTIONS[:read_only] = true }
126
+
127
+ opts.separator ""
128
+
129
+ opts.on("-h", "--help",
130
+ "Show this help message.") { puts opts; exit }
131
+
132
+ opts.parse!
133
+ end
134
+
135
+ tst = ThreadSafetyTest.new(OPTIONS)
136
+ tst.run_test_threads
137
+ end
@@ -766,4 +766,12 @@ class IndexTest < Test::Unit::TestCase
766
766
 
767
767
  index.close
768
768
  end
769
+
770
+ def test_changing_analyzer
771
+ index = Ferret::I.new
772
+ a = Ferret::Analysis::WhiteSpaceAnalyzer.new(false)
773
+ index.add_document({:content => "Content With Capitals"}, a)
774
+ tv = index.reader.term_vector(0, :content)
775
+ assert_equal("Capitals", tv.terms[0].text)
776
+ end
769
777
  end
@@ -16,8 +16,8 @@ class SearchAndSortTest < Test::Unit::TestCase
16
16
  {:x => "findall", :string => "c", :int => "5", :float => "0.1"}, # 3 3
17
17
  {:x => "findall", :string => "e", :int => "2", :float => "0.001"}, # 5 1
18
18
  {:x => "findall", :string => "g", :int => "1", :float => "1.0"}, # 3 3
19
- {:x => "findall", :string => "i", :int => "3", :float => "0.0001"}, # 6 2
20
- {:x => "findall", :string => "j", :int => "4", :float => "10.0"}, # 4 0
19
+ {:x => "findall", :string => nil, :int => "3", :float => "0.0001"}, # 6 2
20
+ {:x => "findall", :string => "", :int => "4", :float => "10.0"}, # 4 0
21
21
  {:x => "findall", :string => "h", :int => "5", :float => "0.00001"}, # 7 3
22
22
  {:x => "findall", :string => "f", :int => "2", :float => "100.0"}, # 5 1
23
23
  {:x => "findall", :string => "d", :int => "3", :float => "1000.0"}, # 6 2
@@ -145,7 +145,7 @@ class SearchAndSortTest < Test::Unit::TestCase
145
145
 
146
146
  ## str
147
147
  sf_str = SortField.new(:string, {:type => :string})
148
- do_test_top_docs(is, q, [0,9,1,8,2,7,3,6,4,5], [sf_str, SortField::SCORE])
148
+ do_test_top_docs(is, q, [0,9,1,8,2,7,3,6,5,4], [sf_str, SortField::SCORE])
149
149
  do_test_top_docs(is, q, [0,9,1,8,2,7,3,6,4,5], "string")
150
150
 
151
151
  ## auto
metadata CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.0
3
3
  specification_version: 1
4
4
  name: ferret
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.10.9
7
- date: 2006-09-27 00:00:00 +09:00
6
+ version: 0.10.10
7
+ date: 2006-10-08 00:00:00 +09:00
8
8
  summary: Ruby indexing library.
9
9
  require_paths:
10
10
  - lib
@@ -198,6 +198,9 @@ files:
198
198
  - test/unit/search/tc_search_and_sort.rb
199
199
  - test/unit/search/tm_searcher.rb
200
200
  - test/unit/query_parser/tc_query_parser.rb
201
+ - test/threading/thread_safety_index_test.rb
202
+ - test/threading/thread_safety_test.rb
203
+ - test/threading/number_to_spoken.rb
201
204
  test_files: []
202
205
 
203
206
  rdoc_options: