ferret 0.10.9 → 0.10.10

Sign up to get free protection for your applications and to get access to all the features.
data/ext/index.c CHANGED
@@ -1552,7 +1552,7 @@ TermEnum *ste_new(InStream *is, SegmentFieldIndex *sfi)
1552
1552
 
1553
1553
  typedef struct TermEnumWrapper
1554
1554
  {
1555
- int base;
1555
+ int index;
1556
1556
  TermEnum *te;
1557
1557
  int *doc_map;
1558
1558
  IndexReader *ir;
@@ -1567,13 +1567,16 @@ typedef struct MultiTermEnum
1567
1567
  TermEnumWrapper *tews;
1568
1568
  int size;
1569
1569
  int **field_num_map;
1570
+ int ti_cnt;
1571
+ TermInfo *tis;
1572
+ int *ti_indexes;
1570
1573
  } MultiTermEnum;
1571
1574
 
1572
1575
  static bool tew_lt(const TermEnumWrapper *tew1, const TermEnumWrapper *tew2)
1573
1576
  {
1574
1577
  int cmpres = strcmp(tew1->term, tew2->term);
1575
1578
  if (cmpres == 0) {
1576
- return tew1->base < tew2->base;
1579
+ return tew1->index < tew2->index;
1577
1580
  }
1578
1581
  else {
1579
1582
  return cmpres < 0;
@@ -1617,10 +1620,10 @@ static void tew_destroy(TermEnumWrapper *tew)
1617
1620
  tew->te->close(tew->te);
1618
1621
  }
1619
1622
 
1620
- TermEnumWrapper *tew_setup(TermEnumWrapper *tew, int base, TermEnum *te,
1623
+ TermEnumWrapper *tew_setup(TermEnumWrapper *tew, int index, TermEnum *te,
1621
1624
  IndexReader *ir)
1622
1625
  {
1623
- tew->base = base;
1626
+ tew->index = index;
1624
1627
  tew->ir = ir;
1625
1628
  tew->te = te;
1626
1629
  tew->term = te->curr_term;
@@ -1646,9 +1649,12 @@ static char *mte_next(TermEnum *te)
1646
1649
 
1647
1650
  te->curr_ti.doc_freq = 0;
1648
1651
 
1652
+ MTE(te)->ti_cnt = 0;
1649
1653
  while ((top != NULL) && (strcmp(te->curr_term, top->term) == 0)) {
1650
1654
  pq_pop(MTE(te)->tew_queue);
1651
1655
  te->curr_ti.doc_freq += top->te->curr_ti.doc_freq;/* increment freq */
1656
+ MTE(te)->ti_indexes[MTE(te)->ti_cnt] = top->index;
1657
+ MTE(te)->tis[MTE(te)->ti_cnt++] = top->te->curr_ti;
1652
1658
  if (tew_next(top)) {
1653
1659
  pq_push(MTE(te)->tew_queue, top); /* restore queue */
1654
1660
  }
@@ -1711,6 +1717,8 @@ static void mte_close(TermEnum *te)
1711
1717
  tew_destroy(&(MTE(te)->tews[i]));
1712
1718
  }
1713
1719
  free(MTE(te)->tews);
1720
+ free(MTE(te)->tis);
1721
+ free(MTE(te)->ti_indexes);
1714
1722
  pq_destroy(MTE(te)->tew_queue);
1715
1723
  free(te);
1716
1724
  }
@@ -1718,7 +1726,6 @@ static void mte_close(TermEnum *te)
1718
1726
  TermEnum *mte_new(MultiReader *mr, int field_num, const char *term)
1719
1727
  {
1720
1728
  IndexReader **readers = mr->sub_readers;
1721
- int *starts = mr->starts;
1722
1729
  int r_cnt = mr->r_cnt;
1723
1730
  int i;
1724
1731
  IndexReader *reader;
@@ -1731,6 +1738,8 @@ TermEnum *mte_new(MultiReader *mr, int field_num, const char *term)
1731
1738
  TE(mte)->close = &mte_close;
1732
1739
 
1733
1740
  mte->size = r_cnt;
1741
+ mte->tis = ALLOC_AND_ZERO_N(TermInfo, r_cnt);
1742
+ mte->ti_indexes = ALLOC_AND_ZERO_N(int, r_cnt);
1734
1743
  mte->tews = ALLOC_AND_ZERO_N(TermEnumWrapper, r_cnt);
1735
1744
  mte->tew_queue = pq_new(r_cnt, (lt_ft)&tew_lt, (free_ft)NULL);
1736
1745
  mte->field_num_map = mr->field_num_map;
@@ -1750,7 +1759,7 @@ TermEnum *mte_new(MultiReader *mr, int field_num, const char *term)
1750
1759
  sub_te = reader->terms(reader, fnum);
1751
1760
  }
1752
1761
 
1753
- tew = tew_setup(&(mte->tews[i]), starts[i], sub_te, reader);
1762
+ tew = tew_setup(&(mte->tews[i]), i, sub_te, reader);
1754
1763
  if (((term == NULL) && tew_next(tew))
1755
1764
  || (tew->term && (tew->term[0] != '\0'))) {
1756
1765
  pq_push(mte->tew_queue, tew); /* initialize queue */
@@ -1759,7 +1768,7 @@ TermEnum *mte_new(MultiReader *mr, int field_num, const char *term)
1759
1768
  /* add the term_enum_wrapper just in case */
1760
1769
  sub_te = reader->terms(reader, 0);
1761
1770
  sub_te->field_num = -1;
1762
- tew_setup(&(mte->tews[i]), starts[i], sub_te, reader);
1771
+ tew_setup(&(mte->tews[i]), i, sub_te, reader);
1763
1772
  }
1764
1773
  }
1765
1774
 
@@ -2386,45 +2395,29 @@ typedef struct MultiTermDocEnum
2386
2395
  {
2387
2396
  TermDocEnum tde;
2388
2397
  int *starts;
2389
- char *term;
2390
- int field_num;
2391
2398
  int base;
2392
2399
  int ptr;
2393
2400
  int ir_cnt;
2394
- int **field_num_map;
2401
+ char *state;
2402
+ TermEnum *te;
2395
2403
  IndexReader **irs;
2396
2404
  TermDocEnum **irs_tde;
2397
2405
  TermDocEnum *curr_tde;
2398
- TermDocEnum *(*reader_tde_i)(IndexReader *ir);
2399
2406
  } MultiTermDocEnum;
2400
2407
 
2401
- static TermDocEnum *mtde_reader_tde_i(IndexReader *ir)
2402
- {
2403
- return ir->term_docs(ir);
2404
- }
2405
-
2406
- static TermDocEnum *mtde_get_tde_i(MultiTermDocEnum *mtde, int i)
2408
+ static TermDocEnum *mtde_next_tde(MultiTermDocEnum *mtde)
2407
2409
  {
2408
- if (mtde->term == NULL) {
2409
- return NULL;
2410
+ mtde->ptr++;
2411
+ while (mtde->ptr < mtde->ir_cnt && !mtde->state[mtde->ptr]) {
2412
+ mtde->ptr++;
2413
+ }
2414
+ if (mtde->ptr >= mtde->ir_cnt) {
2415
+ return mtde->curr_tde = NULL;
2410
2416
  }
2411
2417
  else {
2412
- int fnum = mtde->field_num_map
2413
- ? mtde->field_num_map[i][mtde->field_num]
2414
- : mtde->field_num;
2415
-
2416
- if (fnum >= 0) {
2417
- TermDocEnum *tde = mtde->irs_tde[i];
2418
- if (tde == NULL) {
2419
- tde = mtde->irs_tde[i] = mtde->reader_tde_i(mtde->irs[i]);
2420
- }
2421
-
2422
- tde->seek(tde, fnum, mtde->term);
2423
- return tde;
2424
- }
2425
- else {
2426
- return NULL;
2427
- }
2418
+ TermDocEnum *tde = mtde->curr_tde = mtde->irs_tde[mtde->ptr];
2419
+ mtde->base = mtde->starts[mtde->ptr];
2420
+ return tde;
2428
2421
  }
2429
2422
  }
2430
2423
 
@@ -2435,30 +2428,35 @@ static TermDocEnum *mtde_get_tde_i(MultiTermDocEnum *mtde, int i)
2435
2428
  }\
2436
2429
  } while (0)
2437
2430
 
2438
- static void mtde_seek(TermDocEnum *tde, int field_num, const char *term)
2431
+ static void mtde_seek_te(TermDocEnum *tde, TermEnum *te)
2439
2432
  {
2433
+ int i;
2440
2434
  MultiTermDocEnum *mtde = MTDE(tde);
2441
- if (mtde->term != NULL) {
2442
- free(mtde->term);
2435
+ memset(mtde->state, 0, mtde->ir_cnt);
2436
+ for (i = MTE(te)->ti_cnt - 1; i >= 0; i--) {
2437
+ int index = MTE(te)->ti_indexes[i];
2438
+ TermDocEnum *tde = mtde->irs_tde[index];
2439
+ mtde->state[index] = 1;
2440
+ if (tde->close == stde_close) {
2441
+ stde_seek_ti(STDE(tde), MTE(te)->tis + i);
2442
+ } else if (tde->close == stpe_close) {
2443
+ stpe_seek_ti(STDE(tde), MTE(te)->tis + i);
2444
+ } else {
2445
+ tde->seek(tde, MTE(te)->tews[index].te->field_num, te->curr_term);
2446
+ }
2443
2447
  }
2444
- mtde->term = estrdup(term);
2445
- mtde->field_num = field_num;
2446
2448
  mtde->base = 0;
2447
- mtde->ptr = 0;
2448
- mtde->curr_tde = NULL;
2449
+ mtde->ptr = -1;
2450
+ mtde_next_tde(mtde);
2449
2451
  }
2450
2452
 
2451
- static void mtde_seek_te(TermDocEnum *tde, TermEnum *te)
2453
+ static void mtde_seek(TermDocEnum *tde, int field_num, const char *term)
2452
2454
  {
2453
2455
  MultiTermDocEnum *mtde = MTDE(tde);
2454
- if (mtde->term != NULL) {
2455
- free(mtde->term);
2456
- }
2457
- mtde->term = estrdup(te->curr_term);
2458
- mtde->field_num = te->field_num;
2459
- mtde->base = 0;
2460
- mtde->ptr = 0;
2461
- mtde->curr_tde = NULL;
2456
+ TermEnum *te = mtde->te;
2457
+ te->set_field(te, field_num);
2458
+ te->skip_to(te, term);
2459
+ mtde_seek_te(tde, te);
2462
2460
  }
2463
2461
 
2464
2462
  static int mtde_doc_num(TermDocEnum *tde)
@@ -2479,10 +2477,7 @@ static bool mtde_next(TermDocEnum *tde)
2479
2477
  if (mtde->curr_tde != NULL && mtde->curr_tde->next(mtde->curr_tde)) {
2480
2478
  return true;
2481
2479
  }
2482
- else if (mtde->ptr < mtde->ir_cnt) {
2483
- mtde->base = mtde->starts[mtde->ptr];
2484
- mtde->curr_tde = mtde_get_tde_i(mtde, mtde->ptr);
2485
- mtde->ptr++;
2480
+ else if (mtde_next_tde(mtde)) {
2486
2481
  return mtde_next(tde);
2487
2482
  }
2488
2483
  else {
@@ -2495,19 +2490,11 @@ static int mtde_read(TermDocEnum *tde, int *docs, int *freqs, int req_num)
2495
2490
  int i, end = 0, last_end = 0, b;
2496
2491
  MultiTermDocEnum *mtde = MTDE(tde);
2497
2492
  while (true) {
2498
- while (mtde->curr_tde == NULL) {
2499
- if (mtde->ptr < mtde->ir_cnt) { /* try next segment */
2500
- mtde->base = mtde->starts[mtde->ptr];
2501
- mtde->curr_tde = mtde_get_tde_i(mtde, mtde->ptr++);
2502
- }
2503
- else {
2504
- return end;
2505
- }
2506
- }
2493
+ if (mtde->curr_tde == NULL) return end;
2507
2494
  end += mtde->curr_tde->read(mtde->curr_tde, docs + last_end,
2508
2495
  freqs + last_end, req_num - last_end);
2509
2496
  if (end == last_end) { /* none left in segment */
2510
- mtde->curr_tde = NULL;
2497
+ if (!mtde_next_tde(mtde)) return end;
2511
2498
  }
2512
2499
  else { /* got some */
2513
2500
  b = mtde->base; /* adjust doc numbers */
@@ -2528,19 +2515,15 @@ static bool mtde_skip_to(TermDocEnum *tde, int target_doc_num)
2528
2515
  {
2529
2516
  MultiTermDocEnum *mtde = MTDE(tde);
2530
2517
  TermDocEnum *curr_tde;
2531
- while (mtde->ptr < mtde->ir_cnt) {
2532
- curr_tde = mtde->curr_tde;
2533
- if (curr_tde && (target_doc_num < mtde->starts[mtde->ptr]) &&
2518
+ while (NULL != (curr_tde = mtde->curr_tde)) {
2519
+ if (target_doc_num < mtde->starts[mtde->ptr + 1] &&
2534
2520
  (curr_tde->skip_to(curr_tde, target_doc_num - mtde->base))) {
2535
2521
  return true;
2536
2522
  }
2537
2523
 
2538
- mtde->base = mtde->starts[mtde->ptr];
2539
- mtde->curr_tde = mtde_get_tde_i(mtde, mtde->ptr);
2540
- mtde->ptr++;
2524
+ mtde_next_tde(mtde);
2541
2525
  }
2542
2526
 
2543
- curr_tde = mtde->curr_tde;
2544
2527
  if (curr_tde) {
2545
2528
  return curr_tde->skip_to(curr_tde, target_doc_num - mtde->base);
2546
2529
  }
@@ -2554,20 +2537,18 @@ static void mtde_close(TermDocEnum *tde)
2554
2537
  MultiTermDocEnum *mtde = MTDE(tde);
2555
2538
  TermDocEnum *tmp_tde;
2556
2539
  int i = mtde->ir_cnt;
2540
+ mtde->te->close(mtde->te);
2557
2541
  while (i > 0) {
2558
2542
  i--;
2559
- if ((tmp_tde = mtde->irs_tde[i]) != NULL) {
2560
- tmp_tde->close(tmp_tde);
2561
- }
2562
- }
2563
- if (mtde->term != NULL) {
2564
- free(mtde->term);
2543
+ tmp_tde = mtde->irs_tde[i];
2544
+ tmp_tde->close(tmp_tde);
2565
2545
  }
2566
2546
  free(mtde->irs_tde);
2547
+ free(mtde->state);
2567
2548
  free(tde);
2568
2549
  }
2569
2550
 
2570
- TermDocEnum *mtde_new(MultiReader *mr)
2551
+ TermDocEnum *mtxe_new(MultiReader *mr)
2571
2552
  {
2572
2553
  MultiTermDocEnum *mtde = ALLOC_AND_ZERO(MultiTermDocEnum);
2573
2554
  TermDocEnum *tde = TDE(mtde);
@@ -2578,28 +2559,34 @@ TermDocEnum *mtde_new(MultiReader *mr)
2578
2559
  tde->next = &mtde_next;
2579
2560
  tde->read = &mtde_read;
2580
2561
  tde->skip_to = &mtde_skip_to;
2581
- tde->next_position = NULL;
2582
2562
  tde->close = &mtde_close;
2583
2563
 
2564
+ mtde->state = ALLOC_AND_ZERO_N(char, mr->r_cnt);
2565
+ mtde->te = ((IndexReader *)mr)->terms((IndexReader *)mr, 0);
2584
2566
  mtde->starts = mr->starts;
2585
2567
  mtde->ir_cnt = mr->r_cnt;
2586
2568
  mtde->irs = mr->sub_readers;
2587
- mtde->field_num_map = mr->field_num_map;
2588
2569
  mtde->irs_tde = ALLOC_AND_ZERO_N(TermDocEnum *, mr->r_cnt);
2589
- mtde->reader_tde_i = &mtde_reader_tde_i;
2590
2570
 
2591
2571
  return tde;
2592
2572
  }
2593
2573
 
2574
+ TermDocEnum *mtde_new(MultiReader *mr)
2575
+ {
2576
+ int i;
2577
+ TermDocEnum *tde = mtxe_new(mr);
2578
+ tde->next_position = NULL;
2579
+ for (i = mr->r_cnt - 1; i >= 0; i--) {
2580
+ IndexReader *ir = mr->sub_readers[i];
2581
+ MTDE(tde)->irs_tde[i] = ir->term_docs(ir);
2582
+ }
2583
+ return tde;
2584
+ }
2585
+
2594
2586
  /****************************************************************************
2595
2587
  * MultiTermPosEnum
2596
2588
  ****************************************************************************/
2597
2589
 
2598
- TermDocEnum *mtpe_reader_tde_i(IndexReader *ir)
2599
- {
2600
- return ir->term_positions(ir);
2601
- }
2602
-
2603
2590
  int mtpe_next_position(TermDocEnum *tde)
2604
2591
  {
2605
2592
  CHECK_CURR_TDE("next_position");
@@ -2608,9 +2595,13 @@ int mtpe_next_position(TermDocEnum *tde)
2608
2595
 
2609
2596
  TermDocEnum *mtpe_new(MultiReader *mr)
2610
2597
  {
2611
- TermDocEnum *tde = mtde_new(mr);
2598
+ int i;
2599
+ TermDocEnum *tde = mtxe_new(mr);
2612
2600
  tde->next_position = &mtpe_next_position;
2613
- MTDE(tde)->reader_tde_i = &mtpe_reader_tde_i;
2601
+ for (i = mr->r_cnt - 1; i >= 0; i--) {
2602
+ IndexReader *ir = mr->sub_readers[i];
2603
+ MTDE(tde)->irs_tde[i] = ir->term_positions(ir);
2604
+ }
2614
2605
  return tde;
2615
2606
  }
2616
2607
 
data/ext/index.h CHANGED
@@ -378,6 +378,7 @@ struct TermDocEnum
378
378
  {
379
379
  void (*seek)(TermDocEnum *tde, int field_num, const char *term);
380
380
  void (*seek_te)(TermDocEnum *tde, TermEnum *te);
381
+ void (*seek_ti)(TermDocEnum *tde, TermInfo *ti);
381
382
  int (*doc_num)(TermDocEnum *tde);
382
383
  int (*freq)(TermDocEnum *tde);
383
384
  bool (*next)(TermDocEnum *tde);
data/ext/q_fuzzy.c CHANGED
@@ -264,5 +264,5 @@ Query *fuzq_new_conf(const char *field, const char *term,
264
264
 
265
265
  Query *fuzq_new(const char *field, const char *term)
266
266
  {
267
- return fuzq_new_conf(term, field, 0.0f, 0, 0);
267
+ return fuzq_new_conf(field, term, 0.0f, 0, 0);
268
268
  }
data/ext/r_index.c CHANGED
@@ -564,6 +564,19 @@ frt_fis_to_s(VALUE self)
564
564
  free(fis_s);
565
565
  return rfis_s;
566
566
  }
567
+
568
+ /*
569
+ * call-seq:
570
+ * fis.size -> int
571
+ *
572
+ * Return the number of fields in the FieldInfos object.
573
+ */
574
+ static VALUE
575
+ frt_fis_size(VALUE self)
576
+ {
577
+ FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
578
+ return INT2FIX(fis->size);
579
+ }
567
580
 
568
581
  /*
569
582
  * call-seq:
@@ -2225,7 +2238,7 @@ frt_ir_get_doc(int argc, VALUE *argv, VALUE self)
2225
2238
  pos = (pos < 0) ? (max + pos) : pos;
2226
2239
  if (pos < 0 || pos >= max) {
2227
2240
  rb_raise(rb_eArgError, ":%d is out of range [%d..%d] for "
2228
- "IndexWriter#[]", pos, 0, max,
2241
+ "IndexReader#[]", pos, 0, max,
2229
2242
  rb_id2name(SYM2ID(argv)));
2230
2243
  }
2231
2244
  return frt_get_lazy_doc(ir->get_lazy_doc(ir, pos));
@@ -2425,6 +2438,25 @@ frt_ir_terms_from(VALUE self, VALUE rfield, VALUE rterm)
2425
2438
  StringValuePtr(rterm)));
2426
2439
  }
2427
2440
 
2441
+ /*
2442
+ * call-seq:
2443
+ * index_reader.term_count(field) -> int
2444
+ *
2445
+ * Returns a count of the number of terms in the field
2446
+ */
2447
+ static VALUE
2448
+ frt_ir_term_count(VALUE self, VALUE rfield)
2449
+ {
2450
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2451
+ TermEnum *te = ir_terms(ir, frt_field(rfield));
2452
+ int count = 0;
2453
+ while (te->next(te)) {
2454
+ count++;
2455
+ }
2456
+ te->close(te);
2457
+ return INT2FIX(count);
2458
+ }
2459
+
2428
2460
  /*
2429
2461
  * call-seq:
2430
2462
  * index_reader.fields -> array of field-names
@@ -2483,6 +2515,19 @@ frt_ir_tk_fields(VALUE self)
2483
2515
  return rfield_names;
2484
2516
  }
2485
2517
 
2518
+ /*
2519
+ * call-seq:
2520
+ * index_reader.version -> int
2521
+ *
2522
+ * Returns the current version of the index reader.
2523
+ */
2524
+ static VALUE
2525
+ frt_ir_version(VALUE self)
2526
+ {
2527
+ IndexReader *ir = (IndexReader *)DATA_PTR(self);
2528
+ return INT2FIX(ir->sis->version);
2529
+ }
2530
+
2486
2531
  /****************************************************************************
2487
2532
  *
2488
2533
  * Init Functions
@@ -2708,6 +2753,7 @@ Init_FieldInfos(void)
2708
2753
  rb_define_method(cFieldInfos, "add_field", frt_fis_add_field, -1);
2709
2754
  rb_define_method(cFieldInfos, "each", frt_fis_each, 0);
2710
2755
  rb_define_method(cFieldInfos, "to_s", frt_fis_to_s, 0);
2756
+ rb_define_method(cFieldInfos, "size", frt_fis_size, 0);
2711
2757
  rb_define_method(cFieldInfos, "create_index",
2712
2758
  frt_fis_create_index, 1);
2713
2759
  rb_define_method(cFieldInfos, "fields", frt_fis_get_fields, 0);
@@ -3188,6 +3234,7 @@ Init_IndexReader(void)
3188
3234
  {
3189
3235
  cIndexReader = rb_define_class_under(mIndex, "IndexReader", rb_cObject);
3190
3236
  rb_define_alloc_func(cIndexReader, frt_data_alloc);
3237
+ /*rb_define_singleton_method(cIndexReader, "version", frt_class_ir_version, 0); */
3191
3238
  rb_define_method(cIndexReader, "initialize", frt_ir_init, 1);
3192
3239
  rb_define_method(cIndexReader, "set_norm", frt_ir_set_norm, 3);
3193
3240
  rb_define_method(cIndexReader, "norms", frt_ir_norms, 1);
@@ -3212,10 +3259,12 @@ Init_IndexReader(void)
3212
3259
  rb_define_method(cIndexReader, "doc_freq", frt_ir_doc_freq, 2);
3213
3260
  rb_define_method(cIndexReader, "terms", frt_ir_terms, 1);
3214
3261
  rb_define_method(cIndexReader, "terms_from", frt_ir_terms_from, 2);
3262
+ rb_define_method(cIndexReader, "term_count", frt_ir_term_count, 1);
3215
3263
  rb_define_method(cIndexReader, "fields", frt_ir_fields, 0);
3216
3264
  rb_define_method(cIndexReader, "field_names", frt_ir_fields, 0);
3217
3265
  rb_define_method(cIndexReader, "field_infos", frt_ir_field_infos, 0);
3218
3266
  rb_define_method(cIndexReader, "tokenized_fields", frt_ir_tk_fields, 0);
3267
+ rb_define_method(cIndexReader, "version", frt_ir_version, 0);
3219
3268
  }
3220
3269
 
3221
3270
  /* rdoc hack
data/ext/r_search.c CHANGED
@@ -104,6 +104,7 @@ static ID id_score;
104
104
  static ID id_hits;
105
105
  static ID id_total_hits;
106
106
  static ID id_max_score;
107
+ static ID id_searcher;
107
108
 
108
109
  /* Search */
109
110
  static VALUE sym_offset;
@@ -152,7 +153,7 @@ frt_get_hit(Hit *hit)
152
153
  ****************************************************************************/
153
154
 
154
155
  static VALUE
155
- frt_get_td(TopDocs *td)
156
+ frt_get_td(TopDocs *td, VALUE rsearcher)
156
157
  {
157
158
  int i;
158
159
  VALUE rtop_docs;
@@ -167,6 +168,7 @@ frt_get_td(TopDocs *td)
167
168
  INT2FIX(td->total_hits),
168
169
  hit_ary,
169
170
  rb_float_new((double)td->max_score),
171
+ rsearcher,
170
172
  NULL);
171
173
  td_destroy(td);
172
174
  return rtop_docs;
@@ -174,20 +176,26 @@ frt_get_td(TopDocs *td)
174
176
 
175
177
  /*
176
178
  * call-seq:
177
- * top_doc.to_s -> string
179
+ * top_doc.to_s(field = :id) -> string
178
180
  *
179
181
  * Returns a string representation of the top_doc in readable format.
180
182
  */
181
183
  static VALUE
182
- frt_td_to_s(VALUE self)
184
+ frt_td_to_s(int argc, VALUE *argv, VALUE self)
183
185
  {
184
186
  int i;
185
187
  VALUE rhits = rb_funcall(self, id_hits, 0);
188
+ Searcher *sea = (Searcher *)DATA_PTR(rb_funcall(self, id_searcher, 0));
186
189
  const int len = RARRAY(rhits)->len;
187
190
  char *str = ALLOC_N(char, len * 64 + 100);
188
191
  char *s = str;
192
+ char *field = "id";
189
193
  VALUE rstr;
190
194
 
195
+ if (argc) {
196
+ field = frt_field(argv[0]);
197
+ }
198
+
191
199
  sprintf(s, "TopDocs: total_hits = %d, max_score = %f [\n",
192
200
  FIX2INT(rb_funcall(self, id_total_hits, 0)),
193
201
  NUM2DBL(rb_funcall(self, id_max_score, 0)));
@@ -195,10 +203,18 @@ frt_td_to_s(VALUE self)
195
203
 
196
204
  for (i = 0; i < len; i++) {
197
205
  VALUE rhit = RARRAY(rhits)->ptr[i];
198
- sprintf(s, "\t%d: %f\n",
199
- FIX2INT(rb_funcall(rhit, id_doc, 0)),
206
+ int doc_id = FIX2INT(rb_funcall(rhit, id_doc, 0));
207
+ char *value = "";
208
+ LazyDoc *lzd = sea->get_lazy_doc(sea, doc_id);
209
+ LazyDocField *lzdf = h_get(lzd->field_dict, field);
210
+ if (NULL != lzdf) {
211
+ value = lazy_df_get_data(lzdf, 0);
212
+ }
213
+
214
+ sprintf(s, "\t%d \"%s\": %f\n", doc_id, value,
200
215
  NUM2DBL(rb_funcall(rhit, id_score, 0)));
201
216
  s += strlen(s);
217
+ lazy_doc_close(lzd);
202
218
  }
203
219
 
204
220
  sprintf(s, "]\n");
@@ -2388,7 +2404,7 @@ frt_sea_search(int argc, VALUE *argv, VALUE self)
2388
2404
  Query *query;
2389
2405
  rb_scan_args(argc, argv, "11", &rquery, &roptions);
2390
2406
  Data_Get_Struct(rquery, Query, query);
2391
- return frt_get_td(frt_sea_search_internal(query, roptions, sea));
2407
+ return frt_get_td(frt_sea_search_internal(query, roptions, sea), self);
2392
2408
  }
2393
2409
 
2394
2410
  /*
@@ -2760,13 +2776,15 @@ Init_TopDocs(void)
2760
2776
  "total_hits",
2761
2777
  "hits",
2762
2778
  "max_score",
2779
+ "searcher",
2763
2780
  NULL);
2764
2781
  rb_set_class_path(cTopDocs, mSearch, td_class);
2765
2782
  rb_const_set(mSearch, rb_intern(td_class), cTopDocs);
2766
- rb_define_method(cTopDocs, "to_s", frt_td_to_s, 0);
2783
+ rb_define_method(cTopDocs, "to_s", frt_td_to_s, -1);
2767
2784
  id_hits = rb_intern("hits");
2768
2785
  id_total_hits = rb_intern("total_hits");
2769
2786
  id_max_score = rb_intern("max_score");
2787
+ id_searcher = rb_intern("searcher");
2770
2788
  }
2771
2789
 
2772
2790
  /*
data/ext/search.c CHANGED
@@ -122,11 +122,12 @@ static void hit_pq_down(PriorityQueue *pq)
122
122
  static Hit *hit_pq_pop(PriorityQueue *pq)
123
123
  {
124
124
  if (pq->size > 0) {
125
- Hit *result = (Hit *)pq->heap[1]; /* save first value */
126
- pq->heap[1] = pq->heap[pq->size]; /* move last to first */
127
- pq->heap[pq->size] = NULL;
125
+ Hit **heap = (Hit **)pq->heap;
126
+ Hit *result = heap[1]; /* save first value */
127
+ heap[1] = heap[pq->size]; /* move last to first */
128
+ heap[pq->size] = NULL;
128
129
  pq->size--;
129
- hit_pq_down(pq); /* adjust heap */
130
+ hit_pq_down(pq); /* adjust heap */
130
131
  return result;
131
132
  }
132
133
  else {
@@ -1079,8 +1080,8 @@ static TopDocs *isea_search_w(Searcher *self,
1079
1080
  for (i = num_docs - 1; i >= 0; i--) {
1080
1081
  score_docs[i] = hq_pop(hq);
1081
1082
  /*
1082
- hit = score_docs[i] = pq_pop(hq);
1083
- printf("hit = %d-->%f\n", hit->doc, hit->score);
1083
+ printf("score_docs[i][%d] = [%ld] => %d-->%f\n", i,
1084
+ score_docs[i], score_docs[i]->doc, score_docs[i]->score);
1084
1085
  */
1085
1086
  }
1086
1087
  }
data/ext/sort.c CHANGED
@@ -426,8 +426,8 @@ int sf_string_compare(void *index, Hit *hit1, Hit *hit2)
426
426
  char *s2 = ((StringIndex *)index)->values[
427
427
  ((StringIndex *)index)->index[hit2->doc]];
428
428
 
429
- if (s1 == NULL) return s1 ? -1 : 0;
430
- if (s2 == NULL) return 1;
429
+ if (s1 == NULL) return s2 ? 1 : 0;
430
+ if (s2 == NULL) return -1;
431
431
 
432
432
  #ifdef POSH_OS_WIN32
433
433
  return strcmp(s1, s2);
@@ -874,8 +874,8 @@ bool fdshq_lt(FieldDoc *fd1, FieldDoc *fd2)
874
874
  do {
875
875
  char *s1 = cmps1[i].val.s;
876
876
  char *s2 = cmps2[i].val.s;
877
- if (s1 == NULL) c = s2 ? -1 : 0;
878
- else if (s2 == NULL) c = 1;
877
+ if (s1 == NULL) c = s2 ? 1 : 0;
878
+ else if (s2 == NULL) c = -1;
879
879
  #ifdef POSH_OS_WIN32
880
880
  else c = strcmp(s1, s2);
881
881
  #else
data/lib/ferret/index.rb CHANGED
@@ -179,11 +179,13 @@ module Ferret::Index
179
179
  # Alternatively you may want to use the HTML entity
180
180
  # &#8230; or the UTF-8 string "\342\200\246".
181
181
  def highlight(query, doc_id, options = {})
182
- ensure_searcher_open()
183
- @searcher.highlight(do_process_query(query),
184
- doc_id,
185
- options[:field]||@options[:default_field],
186
- options)
182
+ @dir.synchronize do
183
+ ensure_searcher_open()
184
+ @searcher.highlight(do_process_query(query),
185
+ doc_id,
186
+ options[:field]||@options[:default_field],
187
+ options)
188
+ end
187
189
  end
188
190
 
189
191
  # Closes this index by closing its associated reader and writer objects.
@@ -273,9 +275,14 @@ module Ferret::Index
273
275
  end
274
276
  ensure_writer_open()
275
277
 
276
- old_analyzer = @writer.analyzer if analyzer
277
- @writer.add_document(doc)
278
- @writer.analyzer = old_analyzer if analyzer
278
+ if analyzer
279
+ old_analyzer = @writer.analyzer
280
+ @writer.analyzer = analyzer
281
+ @writer.add_document(doc)
282
+ @writer.analyzer = old_analyzer
283
+ else
284
+ @writer.add_document(doc)
285
+ end
279
286
 
280
287
  flush() if @auto_flush
281
288
  end
@@ -1,3 +1,3 @@
1
1
  module Ferret
2
- VERSION = '0.10.9'
2
+ VERSION = '0.10.10'
3
3
  end
@@ -0,0 +1,132 @@
1
+ # Author: Matthew D Moss
2
+ #
3
+ # Written for ruby quiz #25
4
+ #
5
+ class JapaneseTranslator
6
+ # My knowledge of counting Japanese is limited, so this may not
7
+ # be entirely correct; in particular, I don't know what rules
8
+ # to follow after 'hyaku man' (1,000,000).
9
+ # I also combine a digit with its group, such as 'gohyaku' rather
10
+ # than 'go hyaku'; I just like reading it better that way.
11
+
12
+ DIGITS = %w(zero ichi ni san yon go roku nana hachi kyu)
13
+ GROUPS = %w(nothingtoseeheremovealong ju hyaku sen)
14
+ MAN = 10000
15
+
16
+ def to_spoken(val)
17
+ case val <=> 0
18
+ when -1
19
+ '- ' + to_spoken(-val)
20
+ when 0
21
+ DIGITS[0]
22
+ else
23
+ group(val, 0)
24
+ end
25
+ end
26
+
27
+ private
28
+
29
+ def group(val, level)
30
+ if val >= MAN
31
+ group(val / MAN, 0) + 'man ' + group(val % MAN, 0)
32
+ else
33
+ case val
34
+ when 0
35
+ ''
36
+ when 1
37
+ level == 0 ? DIGITS[val] : GROUPS[level]
38
+ when 2...10
39
+ DIGITS[val] + (GROUPS[level] if level > 0).to_s
40
+ else
41
+ group(val / 10, level+1) + ' ' + group(val % 10, level)
42
+ end
43
+ end
44
+ end
45
+ end
46
+
47
+
48
+ class USEnglishTranslator
49
+ # Formal, US English. Optional 'and'. Will not produce things
50
+ # such as 'twelve hundred' but rather 'one thousand two hundred'.
51
+ # The use of 'and' is incomplete; it is sometimes missed.
52
+
53
+ DIGITS = %w(zero one two three four five six seven eight nine)
54
+ TEENS = %w(ten eleven twelve thirteen fourteen fifteen sixteen
55
+ seventeen eighteen nineteen)
56
+ TENS = %w(hello world twenty thirty forty fifty sixty seventy
57
+ eighty ninety)
58
+ GROUPS = %w(thousand million billion trillion quadrillion
59
+ quintillion sextillion septillion octillion nonillion
60
+ decillion)
61
+ K = 1000
62
+
63
+ def initialize(conjunction = true)
64
+ @conjunction = conjunction
65
+ end
66
+
67
+ def to_spoken(val)
68
+ case val <=> 0
69
+ when -1
70
+ 'negative ' + to_spoken(-val)
71
+ when 0
72
+ DIGITS[0]
73
+ else
74
+ group(val, 0).flatten.join(' ')
75
+ end
76
+ end
77
+
78
+ private
79
+
80
+ def group(val, level)
81
+ x = group(val / K, level + 1) << GROUPS[level] if val >= K
82
+ x.to_a << under_1000(val % K, level)
83
+ end
84
+
85
+ def under_1000(val, level)
86
+ x = [DIGITS[val / 100]] << 'hundred' if val >= 100
87
+ x.to_a << under_100(val % 100, (level == 0 and not x.nil?))
88
+ end
89
+
90
+ def under_100(val, junction)
91
+ x = [('and' if @conjunction and junction)] # wyf?
92
+ case val
93
+ when 0
94
+ []
95
+ when 1...10
96
+ x << DIGITS[val]
97
+ when 10...20
98
+ x << TEENS[val - 10]
99
+ else
100
+ d = val % 10
101
+ x << (TENS[val / 10] + ('-' + DIGITS[d] if d != 0).to_s)
102
+ end
103
+ end
104
+ end
105
+
106
+
107
+ class Integer
108
+ def to_spoken(translator = USEnglishTranslator.new)
109
+ translator.to_spoken(self).squeeze(' ').strip
110
+ end
111
+ end
112
+
113
+ if $0 == __FILE__
114
+ SAMPLES = [ 0, 1, 2, 5, 10, 11, 14, 18, 20, 21, 29, 33, 42, 50, 87, 99,
115
+ 100, 101, 110, 167, 199, 200, 201, 276, 300, 314, 500, 610,
116
+ 1000, 1039, 1347, 2309, 3098, 23501, 32767, 70000, 5480283,
117
+ 2435489238, 234100090000, -42, -2001 ]
118
+
119
+ TRANSLATORS = { 'US English' => USEnglishTranslator.new,
120
+ 'Japanese' => JapaneseTranslator.new }
121
+
122
+
123
+ # main
124
+ TRANSLATORS.each do |lang, translator|
125
+ puts
126
+ puts lang
127
+ puts '-' * lang.length
128
+ SAMPLES.each do |val|
129
+ puts "%12d => %s" % [val, val.to_spoken(translator)]
130
+ end
131
+ end
132
+ end
@@ -0,0 +1,78 @@
1
+ require File.dirname(__FILE__) + "/../test_helper"
2
+ require File.dirname(__FILE__) + "/number_to_spoken.rb"
3
+ require 'thread'
4
+
5
+ class IndexThreadSafetyTest < Test::Unit::TestCase
6
+ include Ferret::Index
7
+
8
+ INDEX_DIR = File.expand_path(File.join(File.dirname(__FILE__), "index"))
9
+ ITERATIONS = 1000
10
+ NUM_THREADS = 2
11
+ ANALYZER = Ferret::Analysis::StandardAnalyzer.new()
12
+
13
+ def setup
14
+ index = Index.new(:path => INDEX_DIR,
15
+ :create => true,
16
+ :analyzer => ANALYZER,
17
+ :default_field => :content)
18
+ index.close
19
+ end
20
+
21
+ def indexing_thread()
22
+ index = Index.new(:path => INDEX_DIR,
23
+ :analyzer => ANALYZER,
24
+ :default_field => :content)
25
+
26
+ ITERATIONS.times do
27
+ choice = rand()
28
+
29
+ if choice > 0.98
30
+ do_optimize(index)
31
+ elsif choice > 0.7
32
+ do_delete_doc(index)
33
+ elsif choice > 0.5
34
+ do_search(index)
35
+ else
36
+ do_add_doc(index)
37
+ end
38
+ end
39
+ end
40
+
41
+ def do_optimize(index)
42
+ puts "Optimizing the index"
43
+ index.optimize
44
+ end
45
+
46
+ def do_delete_doc(index)
47
+ return if index.size == 0
48
+ doc_num = rand(index.size)
49
+ puts "Deleting #{doc_num} from index which has#{index.has_deletions? ? "" : " no"} deletions"
50
+ puts "document was already deleted" if (index.deleted?(doc_num))
51
+ index.delete(doc_num)
52
+ end
53
+
54
+ def do_add_doc(index)
55
+ n = rand(0xFFFFFFFF)
56
+ d = {:id => n, :content => n.to_spoken}
57
+ puts("Adding #{n}")
58
+ index << d
59
+ end
60
+
61
+ def do_search(index)
62
+ n = rand(0xFFFFFFFF)
63
+ puts("Searching for #{n}")
64
+ hits = index.search_each(n.to_spoken, :num_docs => 3) do |d, s|
65
+ puts "Hit for #{n}: #{index[d][:id]} - #{s}"
66
+ end
67
+ puts("Searched for #{n}: total = #{hits}")
68
+ end
69
+
70
+ def test_threading
71
+ threads = []
72
+ NUM_THREADS.times do
73
+ threads << Thread.new { indexing_thread }
74
+ end
75
+
76
+ threads.each {|t| t.join}
77
+ end
78
+ end
@@ -0,0 +1,137 @@
1
+ require File.dirname(__FILE__) + "/../test_helper"
2
+ require File.dirname(__FILE__) + "/../utils/number_to_spoken.rb"
3
+ require 'thread'
4
+
5
+ class ThreadSafetyTest
6
+ include Ferret::Index
7
+ include Ferret::Search
8
+ include Ferret::Store
9
+ include Ferret::Document
10
+
11
+ def initialize(options)
12
+ @options = options
13
+ end
14
+
15
+ INDEX_DIR = File.expand_path(File.join(File.dirname(__FILE__), "index"))
16
+ ANALYZER = Ferret::Analysis::Analyzer.new()
17
+ ITERATIONS = 19
18
+ @@searcher = nil
19
+
20
+ def run_index_thread(writer)
21
+ reopen_interval = 30 + rand(60)
22
+
23
+ use_compound_file = false
24
+
25
+ (400*ITERATIONS).times do |i|
26
+ d = Document.new()
27
+ n = rand(0xFFFFFFFF)
28
+ d << Field.new("id", n.to_s, Field::Store::YES, Field::Index::UNTOKENIZED)
29
+ d << Field.new("contents", n.to_spoken, Field::Store::NO, Field::Index::TOKENIZED)
30
+ puts("Adding #{n}")
31
+
32
+ # Switch between single and multiple file segments
33
+ use_compound_file = (rand < 0.5)
34
+ writer.use_compound_file = use_compound_file
35
+
36
+ writer << d
37
+
38
+ if (i % reopen_interval == 0)
39
+ writer.close()
40
+ writer = IndexWriter.new(INDEX_DIR, :analyzer => ANALYZER)
41
+ end
42
+ end
43
+
44
+ writer.close()
45
+ rescue => e
46
+ puts e
47
+ puts e.backtrace
48
+ raise e
49
+ end
50
+
51
+ def run_search_thread(use_global)
52
+ reopen_interval = 10 + rand(20)
53
+
54
+ unless use_global
55
+ searcher = IndexSearcher.new(INDEX_DIR)
56
+ end
57
+
58
+ (50*ITERATIONS).times do |i|
59
+ search_for(rand(0xFFFFFFFF), (searcher.nil? ? @@searcher : searcher))
60
+ if (i%reopen_interval == 0)
61
+ if (searcher == nil)
62
+ @@searcher = IndexSearcher.new(INDEX_DIR)
63
+ else
64
+ searcher.close()
65
+ searcher = IndexSearcher.new(INDEX_DIR)
66
+ end
67
+ end
68
+ end
69
+ rescue => e
70
+ puts e
71
+ puts e.backtrace
72
+ raise e
73
+ end
74
+
75
+ def search_for(n, searcher)
76
+ puts("Searching for #{n}")
77
+ hits =
78
+ searcher.search(Ferret::QueryParser.parse(n.to_spoken, "contents", :analyzer => ANALYZER),
79
+ :num_docs => 3)
80
+ puts("Search for #{n}: total = #{hits.size}")
81
+ hits.each do |d, s|
82
+ puts "Hit for #{n}: #{searcher.reader.get_document(d)["id"]} - #{s}"
83
+ end
84
+ end
85
+
86
+ def run_test_threads
87
+
88
+ threads = []
89
+ unless @options[:read_only]
90
+ writer = IndexWriter.new(INDEX_DIR, :analyzer => ANALYZER,
91
+ :create => !@options[:add])
92
+
93
+ threads << Thread.new { run_index_thread(writer) }
94
+
95
+ sleep(1)
96
+ end
97
+
98
+ threads << Thread.new { run_search_thread(false)}
99
+
100
+ @@searcher = IndexSearcher.new(INDEX_DIR)
101
+ threads << Thread.new { run_search_thread(true)}
102
+
103
+ threads << Thread.new { run_search_thread(true)}
104
+
105
+ threads.each {|t| t.join}
106
+ end
107
+ end
108
+
109
+
110
+ if $0 == __FILE__
111
+ require 'optparse'
112
+
113
+ OPTIONS = {
114
+ :all => false,
115
+ :read_only => false,
116
+ }
117
+
118
+ ARGV.options do |opts|
119
+ script_name = File.basename($0)
120
+ opts.banner = "Usage: ruby #{script_name} [options]"
121
+
122
+ opts.separator ""
123
+
124
+ opts.on("-r", "--read-only", "Read Only.") { OPTIONS[:all] = true }
125
+ opts.on("-a", "--all", "All.") { OPTIONS[:read_only] = true }
126
+
127
+ opts.separator ""
128
+
129
+ opts.on("-h", "--help",
130
+ "Show this help message.") { puts opts; exit }
131
+
132
+ opts.parse!
133
+ end
134
+
135
+ tst = ThreadSafetyTest.new(OPTIONS)
136
+ tst.run_test_threads
137
+ end
@@ -766,4 +766,12 @@ class IndexTest < Test::Unit::TestCase
766
766
 
767
767
  index.close
768
768
  end
769
+
770
+ def test_changing_analyzer
771
+ index = Ferret::I.new
772
+ a = Ferret::Analysis::WhiteSpaceAnalyzer.new(false)
773
+ index.add_document({:content => "Content With Capitals"}, a)
774
+ tv = index.reader.term_vector(0, :content)
775
+ assert_equal("Capitals", tv.terms[0].text)
776
+ end
769
777
  end
@@ -16,8 +16,8 @@ class SearchAndSortTest < Test::Unit::TestCase
16
16
  {:x => "findall", :string => "c", :int => "5", :float => "0.1"}, # 3 3
17
17
  {:x => "findall", :string => "e", :int => "2", :float => "0.001"}, # 5 1
18
18
  {:x => "findall", :string => "g", :int => "1", :float => "1.0"}, # 3 3
19
- {:x => "findall", :string => "i", :int => "3", :float => "0.0001"}, # 6 2
20
- {:x => "findall", :string => "j", :int => "4", :float => "10.0"}, # 4 0
19
+ {:x => "findall", :string => nil, :int => "3", :float => "0.0001"}, # 6 2
20
+ {:x => "findall", :string => "", :int => "4", :float => "10.0"}, # 4 0
21
21
  {:x => "findall", :string => "h", :int => "5", :float => "0.00001"}, # 7 3
22
22
  {:x => "findall", :string => "f", :int => "2", :float => "100.0"}, # 5 1
23
23
  {:x => "findall", :string => "d", :int => "3", :float => "1000.0"}, # 6 2
@@ -145,7 +145,7 @@ class SearchAndSortTest < Test::Unit::TestCase
145
145
 
146
146
  ## str
147
147
  sf_str = SortField.new(:string, {:type => :string})
148
- do_test_top_docs(is, q, [0,9,1,8,2,7,3,6,4,5], [sf_str, SortField::SCORE])
148
+ do_test_top_docs(is, q, [0,9,1,8,2,7,3,6,5,4], [sf_str, SortField::SCORE])
149
149
  do_test_top_docs(is, q, [0,9,1,8,2,7,3,6,4,5], "string")
150
150
 
151
151
  ## auto
metadata CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.0
3
3
  specification_version: 1
4
4
  name: ferret
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.10.9
7
- date: 2006-09-27 00:00:00 +09:00
6
+ version: 0.10.10
7
+ date: 2006-10-08 00:00:00 +09:00
8
8
  summary: Ruby indexing library.
9
9
  require_paths:
10
10
  - lib
@@ -198,6 +198,9 @@ files:
198
198
  - test/unit/search/tc_search_and_sort.rb
199
199
  - test/unit/search/tm_searcher.rb
200
200
  - test/unit/query_parser/tc_query_parser.rb
201
+ - test/threading/thread_safety_index_test.rb
202
+ - test/threading/thread_safety_test.rb
203
+ - test/threading/number_to_spoken.rb
201
204
  test_files: []
202
205
 
203
206
  rdoc_options: