ferret 0.10.2 → 0.10.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -18,6 +18,7 @@ static VALUE cRegExpTokenizer;
 static VALUE cAsciiLowerCaseFilter;
 static VALUE cLowerCaseFilter;
 static VALUE cStopFilter;
+static VALUE cHyphenFilter;
 static VALUE cStemFilter;
 
 static VALUE cAnalyzer;
@@ -568,22 +569,20 @@ static TokenStream *
 frt_get_cwrapped_rts(VALUE rts)
 {
     TokenStream *ts;
-    switch (TYPE(rts)) {
-        case T_DATA:
-            GET_TS(ts, rts);
-            REF(ts);
-            break;
-        default:
-            ts = ts_new(CWrappedTokenStream);
-            CWTS(ts)->rts = rts;
-            ts->next = &cwrts_next;
-            ts->reset = &cwrts_reset;
-            ts->clone_i = &cwrts_clone_i;
-            ts->destroy_i = &cwrts_destroy_i;
-            /* prevent from being garbage collected */
-            rb_hash_aset(object_space, LONG2NUM(rts), rts);
-            ts->ref_cnt = 1;
-            break;
+    if (rb_ivar_get(CLASS_OF(rts), id_cclass) == Qtrue) {
+        GET_TS(ts, rts);
+        REF(ts);
+    }
+    else {
+        ts = ts_new(CWrappedTokenStream);
+        CWTS(ts)->rts = rts;
+        ts->next = &cwrts_next;
+        ts->reset = &cwrts_reset;
+        ts->clone_i = &cwrts_clone_i;
+        ts->destroy_i = &cwrts_destroy_i;
+        /* prevent from being garbage collected */
+        rb_hash_aset(object_space, LONG2NUM(rts), rts);
+        ts->ref_cnt = 1;
     }
     return ts;
 }
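The hunk above changes how the C layer decides whether a token stream handed in from Ruby is one of its own wrapped structs: instead of inspecting the object's internal TYPE(), it now consults a per-class flag (presumably set by the frt_define_class_under helper that replaces rb_define_class_under throughout the Init_* functions below). Anything without the flag goes down the CWrappedTokenStream path, which drives the object through plain Ruby method calls. A minimal sketch of the kind of pure-Ruby stream this branch serves — the filter class here is illustrative, not from the diff:

    require 'ferret'

    # A plain-Ruby token stream that only duck-types #next. The C layer
    # wraps objects like this as a CWrappedTokenStream; the new class-flag
    # test makes that decision explicit rather than relying on TYPE().
    class UpcaseFilter
      def initialize(sub_ts)
        @sub_ts = sub_ts
      end

      def next
        token = @sub_ts.next
        token.text = token.text.upcase if token
        token
      end
    end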
@@ -911,6 +910,28 @@ frt_lowercase_filter_init(VALUE self, VALUE rsub_ts)
     return self;
 }
 
+/*
+ *  call-seq:
+ *     HyphenFilter.new(token_stream) -> token_stream
+ *
+ *  Create a HyphenFilter which filters hyphenated words. The way it works
+ *  is by adding both the word concatenated into a single word and split
+ *  into multiple words, i.e. "e-mail" becomes "email" and "e mail". This
+ *  way a search for "e-mail", "email" and "mail" will all match. This
+ *  filter is used by default by the StandardAnalyzer.
+ */
+static VALUE
+frt_hyphen_filter_init(VALUE self, VALUE rsub_ts)
+{
+    TokenStream *ts = frt_get_cwrapped_rts(rsub_ts);
+    ts = hyphen_filter_new(ts);
+    object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
+
+    Frt_Wrap_Struct(self, &frt_tf_mark, &frt_tf_free, ts);
+    object_add(ts, self);
+    return self;
+}
+
 /*
  *  call-seq:
  *     StopFilter.new(token_stream) -> token_stream
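The hunk above adds the constructor for the new HyphenFilter. A hedged usage sketch from Ruby; the expected token output is taken from the rdoc above, assuming the 0.10.3 StandardTokenizer feeds hyphenated words through whole:

    require 'ferret'
    include Ferret::Analysis

    ts = HyphenFilter.new(StandardTokenizer.new("e-mail set-up"))
    while token = ts.next
      print token.text, " "
    end
    # per the docs above, this should print: email e mail setup set up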
@@ -1021,20 +1042,18 @@ Analyzer *
 frt_get_cwrapped_analyzer(VALUE ranalyzer)
 {
     Analyzer *a = NULL;
-    switch (TYPE(ranalyzer)) {
-        case T_DATA:
-            Data_Get_Struct(ranalyzer, Analyzer, a);
-            REF(a);
-            break;
-        default:
-            a = (Analyzer *)ecalloc(sizeof(CWrappedAnalyzer));
-            a->destroy_i = &cwa_destroy_i;
-            a->get_ts = &cwa_get_ts;
-            a->ref_cnt = 1;
-            ((CWrappedAnalyzer *)a)->ranalyzer = ranalyzer;
-            /* prevent from being garbage collected */
-            rb_hash_aset(object_space, LONG2NUM(ranalyzer), ranalyzer);
-            break;
+    if (rb_ivar_get(CLASS_OF(ranalyzer), id_cclass) == Qtrue) {
+        Data_Get_Struct(ranalyzer, Analyzer, a);
+        REF(a);
+    }
+    else {
+        a = (Analyzer *)ecalloc(sizeof(CWrappedAnalyzer));
+        a->destroy_i = &cwa_destroy_i;
+        a->get_ts = &cwa_get_ts;
+        a->ref_cnt = 1;
+        ((CWrappedAnalyzer *)a)->ranalyzer = ranalyzer;
+        /* prevent from being garbage collected */
+        rb_hash_aset(object_space, LONG2NUM(ranalyzer), ranalyzer);
     }
     return a;
 }
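The same class-flag pattern is applied to analyzers. The else-branch is what lets a plain Ruby object act as an analyzer, so long as it responds to token_stream. A minimal sketch under that assumption — the class name and filter chain are illustrative:

    # Any object with a token_stream(field, input) method can be handed
    # to the C layer; it gets wrapped as a CWrappedAnalyzer.
    class StemmedAnalyzer
      def token_stream(field, input)
        Ferret::Analysis::StemFilter.new(
          Ferret::Analysis::LowerCaseFilter.new(
            Ferret::Analysis::StandardTokenizer.new(input)))
      end
    end

    index = Ferret::Index::Index.new(:analyzer => StemmedAnalyzer.new)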
@@ -1350,11 +1369,14 @@ frt_re_analyzer_init(int argc, VALUE *argv, VALUE self)
 
     ts = rets_new(Qnil, regex, proc);
     rets = Data_Wrap_Struct(cRegExpTokenizer, &frt_rets_mark, &frt_rets_free, ts);
-    REF(ts);
     /* rb_hash_aset(object_space, LONG2NUM((long)rets), rets); */
     object_add(ts, rets);
 
-    if (lower != Qfalse) ts = mb_lowercase_filter_new(ts);
+    if (lower != Qfalse) {
+        rets = frt_lowercase_filter_init(frt_data_alloc(cLowerCaseFilter), rets);
+        ts = DATA_PTR(rets);
+    }
+    REF(ts);
 
     a = analyzer_new(ts, &re_analyzer_destroy_i, NULL);
     Frt_Wrap_Struct(self, &frt_re_analyzer_mark, &frt_analyzer_free, a);
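The hunk above builds RegExpAnalyzer's lowercase stage through the same constructor the Ruby LowerCaseFilter class uses, and takes the REF only after any wrapping, so the analyzer references the stream it actually keeps. User-visible behaviour should be unchanged; a usage sketch, assuming the two-argument RegExpAnalyzer.new(regexp, lower) form from Ferret's documented examples:

    a = Ferret::Analysis::RegExpAnalyzer.new(/[[:alpha:]]+/, true)
    ts = a.token_stream(:content, "Don't Panic")
    while token = ts.next
      puts token.text   # => "don", "t", "panic"
    end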
@@ -1461,7 +1483,7 @@ static void Init_Token(void)
  */
 static void Init_TokenStream(void)
 {
-    cTokenStream = rb_define_class_under(mAnalysis, "TokenStream", rb_cObject);
+    cTokenStream = frt_define_class_under(mAnalysis, "TokenStream", rb_cObject);
     rb_define_method(cTokenStream, "next", frt_ts_next, 0);
     rb_define_method(cTokenStream, "text=", frt_ts_set_text, 1);
     rb_define_method(cTokenStream, "text", frt_ts_get_text, 0);
@@ -1482,7 +1504,7 @@ static void Init_TokenStream(void)
 static void Init_AsciiLetterTokenizer(void)
 {
     cAsciiLetterTokenizer =
-        rb_define_class_under(mAnalysis, "AsciiLetterTokenizer", cTokenStream);
+        frt_define_class_under(mAnalysis, "AsciiLetterTokenizer", cTokenStream);
     rb_define_alloc_func(cAsciiLetterTokenizer, frt_data_alloc);
     rb_define_method(cAsciiLetterTokenizer, "initialize",
                      frt_a_letter_tokenizer_init, 1);
@@ -1504,7 +1526,7 @@ static void Init_AsciiLetterTokenizer(void)
 static void Init_LetterTokenizer(void)
 {
     cLetterTokenizer =
-        rb_define_class_under(mAnalysis, "LetterTokenizer", cTokenStream);
+        frt_define_class_under(mAnalysis, "LetterTokenizer", cTokenStream);
     rb_define_alloc_func(cLetterTokenizer, frt_data_alloc);
     rb_define_method(cLetterTokenizer, "initialize",
                      frt_letter_tokenizer_init, -1);
@@ -1524,7 +1546,7 @@ static void Init_LetterTokenizer(void)
 static void Init_AsciiWhiteSpaceTokenizer(void)
 {
     cAsciiWhiteSpaceTokenizer =
-        rb_define_class_under(mAnalysis, "AsciiWhiteSpaceTokenizer",
+        frt_define_class_under(mAnalysis, "AsciiWhiteSpaceTokenizer",
                               cTokenStream);
     rb_define_alloc_func(cAsciiWhiteSpaceTokenizer, frt_data_alloc);
     rb_define_method(cAsciiWhiteSpaceTokenizer, "initialize",
@@ -1545,7 +1567,7 @@ static void Init_AsciiWhiteSpaceTokenizer(void)
 static void Init_WhiteSpaceTokenizer(void)
 {
     cWhiteSpaceTokenizer =
-        rb_define_class_under(mAnalysis, "WhiteSpaceTokenizer", cTokenStream);
+        frt_define_class_under(mAnalysis, "WhiteSpaceTokenizer", cTokenStream);
     rb_define_alloc_func(cWhiteSpaceTokenizer, frt_data_alloc);
     rb_define_method(cWhiteSpaceTokenizer, "initialize",
                      frt_whitespace_tokenizer_init, -1);
@@ -1566,7 +1588,7 @@ static void Init_WhiteSpaceTokenizer(void)
 static void Init_AsciiStandardTokenizer(void)
 {
     cAsciiStandardTokenizer =
-        rb_define_class_under(mAnalysis, "AsciiStandardTokenizer", cTokenStream);
+        frt_define_class_under(mAnalysis, "AsciiStandardTokenizer", cTokenStream);
     rb_define_alloc_func(cAsciiStandardTokenizer, frt_data_alloc);
     rb_define_method(cAsciiStandardTokenizer, "initialize",
                      frt_a_standard_tokenizer_init, 1);
@@ -1587,7 +1609,7 @@ static void Init_AsciiStandardTokenizer(void)
 static void Init_StandardTokenizer(void)
 {
     cStandardTokenizer =
-        rb_define_class_under(mAnalysis, "StandardTokenizer", cTokenStream);
+        frt_define_class_under(mAnalysis, "StandardTokenizer", cTokenStream);
     rb_define_alloc_func(cStandardTokenizer, frt_data_alloc);
     rb_define_method(cStandardTokenizer, "initialize",
                      frt_standard_tokenizer_init, 1);
@@ -1614,7 +1636,7 @@ static void Init_StandardTokenizer(void)
 static void Init_RegExpTokenizer(void)
 {
     cRegExpTokenizer =
-        rb_define_class_under(mAnalysis, "RegExpTokenizer", cTokenStream);
+        frt_define_class_under(mAnalysis, "RegExpTokenizer", cTokenStream);
     rtoken_re = rb_reg_new(TOKEN_RE, strlen(TOKEN_RE), 0);
     rb_define_const(cRegExpTokenizer, "REGEXP", rtoken_re);
     rb_define_alloc_func(cRegExpTokenizer, frt_data_alloc);
@@ -1642,7 +1664,7 @@ static void Init_RegExpTokenizer(void)
 static void Init_AsciiLowerCaseFilter(void)
 {
     cAsciiLowerCaseFilter =
-        rb_define_class_under(mAnalysis, "AsciiLowerCaseFilter", cTokenStream);
+        frt_define_class_under(mAnalysis, "AsciiLowerCaseFilter", cTokenStream);
     rb_define_alloc_func(cAsciiLowerCaseFilter, frt_data_alloc);
     rb_define_method(cAsciiLowerCaseFilter, "initialize",
                      frt_a_lowercase_filter_init, 1);
@@ -1662,12 +1684,33 @@ static void Init_AsciiLowerCaseFilter(void)
 static void Init_LowerCaseFilter(void)
 {
     cLowerCaseFilter =
-        rb_define_class_under(mAnalysis, "LowerCaseFilter", cTokenStream);
+        frt_define_class_under(mAnalysis, "LowerCaseFilter", cTokenStream);
     rb_define_alloc_func(cLowerCaseFilter, frt_data_alloc);
     rb_define_method(cLowerCaseFilter, "initialize",
                      frt_lowercase_filter_init, 1);
 }
 
+/*
+ *  Document-class: Ferret::Analysis::HyphenFilter
+ *
+ *  HyphenFilter filters hyphenated words by adding both the word
+ *  concatenated into a single word and split into multiple words,
+ *  i.e. "e-mail" becomes "email" and "e mail". This way a search for
+ *  "e-mail", "email" and "mail" will all match. This filter is used by
+ *  default by the StandardAnalyzer.
+ *
+ *  === Example
+ *
+ *    ["e-mail", "set-up"] => ["email", "e", "mail", "setup", "set", "up"]
+ *
+ */
+static void Init_HyphenFilter(void)
+{
+    cHyphenFilter =
+        frt_define_class_under(mAnalysis, "HyphenFilter", cTokenStream);
+    rb_define_alloc_func(cHyphenFilter, frt_data_alloc);
+    rb_define_method(cHyphenFilter, "initialize", frt_hyphen_filter_init, 1);
+}
+
 /*
  *  Document-class: Ferret::Analysis::StopFilter
  *
@@ -1682,7 +1725,7 @@ static void Init_LowerCaseFilter(void)
 static void Init_StopFilter(void)
 {
     cStopFilter =
-        rb_define_class_under(mAnalysis, "StopFilter", cTokenStream);
+        frt_define_class_under(mAnalysis, "StopFilter", cTokenStream);
     rb_define_alloc_func(cStopFilter, frt_data_alloc);
     rb_define_method(cStopFilter, "initialize",
                      frt_stop_filter_init, -1);
@@ -1741,7 +1784,7 @@ static void Init_StopFilter(void)
 static void Init_StemFilter(void)
 {
     cStemFilter =
-        rb_define_class_under(mAnalysis, "StemFilter", cTokenStream);
+        frt_define_class_under(mAnalysis, "StemFilter", cTokenStream);
     rb_define_alloc_func(cStemFilter, frt_data_alloc);
     rb_define_method(cStemFilter, "initialize",
                      frt_stem_filter_init, -1);
@@ -1784,7 +1827,7 @@ static void Init_StemFilter(void)
 static void Init_Analyzer(void)
 {
     cAnalyzer =
-        rb_define_class_under(mAnalysis, "Analyzer", rb_cObject);
+        frt_define_class_under(mAnalysis, "Analyzer", rb_cObject);
     rb_define_alloc_func(cAnalyzer, frt_data_alloc);
     rb_define_method(cAnalyzer, "initialize", frt_letter_analyzer_init, -1);
     rb_define_method(cAnalyzer, "token_stream", frt_analyzer_token_stream, 2);
@@ -1821,7 +1864,7 @@ static void Init_Analyzer(void)
 static void Init_AsciiLetterAnalyzer(void)
 {
     cAsciiLetterAnalyzer =
-        rb_define_class_under(mAnalysis, "AsciiLetterAnalyzer", cAnalyzer);
+        frt_define_class_under(mAnalysis, "AsciiLetterAnalyzer", cAnalyzer);
     rb_define_alloc_func(cAsciiLetterAnalyzer, frt_data_alloc);
     rb_define_method(cAsciiLetterAnalyzer, "initialize",
                      frt_a_letter_analyzer_init, -1);
@@ -1851,7 +1894,7 @@ static void Init_AsciiLetterAnalyzer(void)
 static void Init_LetterAnalyzer(void)
 {
     cLetterAnalyzer =
-        rb_define_class_under(mAnalysis, "LetterAnalyzer", cAnalyzer);
+        frt_define_class_under(mAnalysis, "LetterAnalyzer", cAnalyzer);
     rb_define_alloc_func(cLetterAnalyzer, frt_data_alloc);
     rb_define_method(cLetterAnalyzer, "initialize",
                      frt_letter_analyzer_init, -1);
@@ -1887,7 +1930,7 @@ static void Init_LetterAnalyzer(void)
 static void Init_AsciiWhiteSpaceAnalyzer(void)
 {
     cAsciiWhiteSpaceAnalyzer =
-        rb_define_class_under(mAnalysis, "AsciiWhiteSpaceAnalyzer", cAnalyzer);
+        frt_define_class_under(mAnalysis, "AsciiWhiteSpaceAnalyzer", cAnalyzer);
     rb_define_alloc_func(cAsciiWhiteSpaceAnalyzer, frt_data_alloc);
     rb_define_method(cAsciiWhiteSpaceAnalyzer, "initialize",
                      frt_a_white_space_analyzer_init, -1);
@@ -1917,7 +1960,7 @@ static void Init_AsciiWhiteSpaceAnalyzer(void)
 static void Init_WhiteSpaceAnalyzer(void)
 {
     cWhiteSpaceAnalyzer =
-        rb_define_class_under(mAnalysis, "WhiteSpaceAnalyzer", cAnalyzer);
+        frt_define_class_under(mAnalysis, "WhiteSpaceAnalyzer", cAnalyzer);
     rb_define_alloc_func(cWhiteSpaceAnalyzer, frt_data_alloc);
     rb_define_method(cWhiteSpaceAnalyzer, "initialize",
                      frt_white_space_analyzer_init, -1);
@@ -1955,7 +1998,7 @@ static void Init_WhiteSpaceAnalyzer(void)
 static void Init_AsciiStandardAnalyzer(void)
 {
     cAsciiStandardAnalyzer =
-        rb_define_class_under(mAnalysis, "AsciiStandardAnalyzer", cAnalyzer);
+        frt_define_class_under(mAnalysis, "AsciiStandardAnalyzer", cAnalyzer);
     rb_define_alloc_func(cAsciiStandardAnalyzer, frt_data_alloc);
     rb_define_method(cAsciiStandardAnalyzer, "initialize",
                      frt_a_standard_analyzer_init, -1);
@@ -1986,7 +2029,7 @@ static void Init_AsciiStandardAnalyzer(void)
 static void Init_StandardAnalyzer(void)
 {
     cStandardAnalyzer =
-        rb_define_class_under(mAnalysis, "StandardAnalyzer", cAnalyzer);
+        frt_define_class_under(mAnalysis, "StandardAnalyzer", cAnalyzer);
     rb_define_alloc_func(cStandardAnalyzer, frt_data_alloc);
     rb_define_method(cStandardAnalyzer, "initialize",
                      frt_standard_analyzer_init, -1);
@@ -2015,7 +2058,7 @@ static void Init_StandardAnalyzer(void)
 static void Init_PerFieldAnalyzer(void)
 {
     cPerFieldAnalyzer =
-        rb_define_class_under(mAnalysis, "PerFieldAnalyzer", cAnalyzer);
+        frt_define_class_under(mAnalysis, "PerFieldAnalyzer", cAnalyzer);
     rb_define_alloc_func(cPerFieldAnalyzer, frt_data_alloc);
     rb_define_method(cPerFieldAnalyzer, "initialize",
                      frt_per_field_analyzer_init, 1);
@@ -2055,7 +2098,7 @@ static void Init_PerFieldAnalyzer(void)
 static void Init_RegExpAnalyzer(void)
 {
     cRegExpAnalyzer =
-        rb_define_class_under(mAnalysis, "RegExpAnalyzer", cAnalyzer);
+        frt_define_class_under(mAnalysis, "RegExpAnalyzer", cAnalyzer);
     rb_define_alloc_func(cRegExpAnalyzer, frt_data_alloc);
     rb_define_method(cRegExpAnalyzer, "initialize",
                      frt_re_analyzer_init, -1);
@@ -2171,6 +2214,7 @@ Init_Analysis(void)
 
     Init_AsciiLowerCaseFilter();
     Init_LowerCaseFilter();
+    Init_HyphenFilter();
    Init_StopFilter();
     Init_StemFilter();
 
@@ -1875,7 +1875,6 @@ frt_ir_init(VALUE self, VALUE rdir)
     VALUE rfield_num_map = rb_hash_new();
 
     if (TYPE(rdir) == T_ARRAY) {
-        VALUE rreader;
         VALUE rdirs = rdir;
         const int reader_cnt = RARRAY(rdir)->len;
         IndexReader **sub_readers = ALLOC_N(IndexReader *, reader_cnt);
@@ -2181,7 +2181,7 @@ frt_sea_search_internal(Query *query, VALUE roptions, Searcher *sea)
 *     searcher.search(query, options = {}) -> TopDocs
 *
 *  Run a query through the Searcher on the index. A TopDocs object is
- *  returned with the relevant results. The +query+ is an built in Query
+ *  returned with the relevant results. The +query+ is a built-in Query
 *  object. Here are the options;
 *
 *  === Options
@@ -741,19 +741,23 @@ static char *excerpt_get_str(Excerpt *e, MatchVector *mv,
     for (i = e->start; i <= e->end; i++) {
         MatchRange *mr = mv->matches + i;
         len = mr->start_offset - last_offset;
-        lazy_df_get_bytes(lazy_df, e_ptr, last_offset, len);
+        if (len) lazy_df_get_bytes(lazy_df, e_ptr, last_offset, len);
         e_ptr += len;
         memcpy(e_ptr, pre_tag, pre_tag_len);
         e_ptr += pre_tag_len;
         len = mr->end_offset - mr->start_offset;
-        lazy_df_get_bytes(lazy_df, e_ptr, mr->start_offset, len);
+        if (len) lazy_df_get_bytes(lazy_df, e_ptr, mr->start_offset, len);
         e_ptr += len;
         memcpy(e_ptr, post_tag, post_tag_len);
         e_ptr += post_tag_len;
         last_offset = mr->end_offset;
     }
+    if ((lazy_df->len - e->end_offset) <= ellipsis_len) {
+        /* no point using ellipsis if it takes up more space */
+        e->end_offset = lazy_df->len;
+    }
     len = e->end_offset - last_offset;
-    lazy_df_get_bytes(lazy_df, e_ptr, last_offset, len);
+    if (len) lazy_df_get_bytes(lazy_df, e_ptr, last_offset, len);
     e_ptr += len;
     if (e->end_offset < lazy_df->len) {
         memcpy(e_ptr, ellipsis, ellipsis_len);
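The new guards skip zero-length reads (adjacent or zero-width match ranges would otherwise ask lazy_df_get_bytes for 0 bytes), and the added check extends the excerpt to the end of the field whenever the trailing ellipsis would be at least as long as the text it replaces. With illustrative numbers: for a 100-byte field, an excerpt ending at offset 98 and the 3-byte "..." ellipsis, 100 - 98 = 2 <= 3, so the last two bytes are emitted directly instead of appending the longer ellipsis.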
@@ -45,7 +45,7 @@ int tv_get_tv_term_index(TermVector *tv, const char *term)
             return mid;
         }
     }
-    if (strcmp(term, tv->terms[hi].text) == 0) {
+    if (hi >= 0 && strcmp(term, tv->terms[hi].text) == 0) {
         return hi;
     }
     else {
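The hi >= 0 guard covers the case where the probe term sorts before every term in the vector: the binary search then leaves hi at -1, and the old code read tv->terms[-1].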
@@ -146,25 +146,25 @@ module Ferret::Index
     #
     # === Options
    #
-    # :field::          Default: @options[:default_field]. The default_field
-    #                   is the field that is usually highlighted but you can
-    #                   specify which field you want to highlight here. If
-    #                   you want to highlight multiple fields then you will
-    #                   need to call this method multiple times.
-    # :excerpt_length:: Default: 150. Length of excerpt to show. Highlighted
-    #                   terms will be in the centre of the excerpt.
-    # :num_excerpts::   Default: 2. Number of excerpts to return.
-    # :pre_tag::        Default: "<b>". Tag to place to the left of the
-    #                   match. You'll probably want to change this to a
-    #                   "<span>" tag with a class. Try "\033[7m" for use in
-    #                   a terminal.
-    # :post_tag::       Default: "</b>". This tag should close the
-    #                   +:pre_tag+. Try tag "\033[m" in the terminal.
-    # :ellipsis::       Default: "...". This is the string that is appended
-    #                   at the beginning and end of excerpts (unless the
-    #                   excerpt hits the start or end of the field). You'll
-    #                   probably want to change this to a Unicode ellipsis
-    #                   character.
+    # field::          Default: @options[:default_field]. The default_field
+    #                  is the field that is usually highlighted but you can
+    #                  specify which field you want to highlight here. If
+    #                  you want to highlight multiple fields then you will
+    #                  need to call this method multiple times.
+    # excerpt_length:: Default: 150. Length of excerpt to show. Highlighted
+    #                  terms will be in the centre of the excerpt.
+    # num_excerpts::   Default: 2. Number of excerpts to return.
+    # pre_tag::        Default: "<b>". Tag to place to the left of the
+    #                  match. You'll probably want to change this to a
+    #                  "<span>" tag with a class. Try "\033[7m" for use in
+    #                  a terminal.
+    # post_tag::       Default: "</b>". This tag should close the
+    #                  +:pre_tag+. Try tag "\033[m" in the terminal.
+    # ellipsis::       Default: "...". This is the string that is appended
+    #                  at the beginning and end of excerpts (unless the
+    #                  excerpt hits the start or end of the field). You'll
+    #                  probably want to change this to a Unicode ellipsis
+    #                  character.
     def highlight(query, doc_id, options = {})
       ensure_searcher_open()
       @searcher.highlight(process_query(query),
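A hedged sketch of Index#highlight with the options documented above; the index contents are invented for illustration:

    index = Ferret::Index::Index.new(:default_field => :content)
    index << {:content => "Ferret is a fast, flexible search library for Ruby"}

    excerpts = index.highlight("search", 0,
                               :excerpt_length => 30,
                               :num_excerpts   => 1,
                               :pre_tag        => "\033[7m",
                               :post_tag       => "\033[m")
    puts excerpts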
@@ -270,33 +270,79 @@ module Ferret::Index
     end
     alias :<< :add_document
 
-    # The main search method for the index. You need to create a query to
-    # pass to this method. You can also pass a hash with one or more of the
-    # following; {filter, num_docs, first_doc, sort}
-    #
-    # query::       The query to run on the index
-    # filter::      Filters docs from the search result
-    # first_doc::   The index in the results of the first doc retrieved.
-    #               Default is 0
-    # num_docs::    The number of results returned. Default is 10
-    # sort::        An array of SortFields describing how to sort the results.
-    # filter_proc:: A proc which takes |doc_id, score, searcher| as arguments
-    #               and returns true if the document passes the filter.
+    # Run a query through the Searcher on the index. A TopDocs object is
+    # returned with the relevant results. The +query+ is a built-in Query
+    # object or a query string that can be parsed by the Ferret::QueryParser.
+    # Here are the options;
+    #
+    # === Options
+    #
+    # offset::      Default: 0. The offset of the start of the section of the
+    #               result-set to return. This is used for paging through
+    #               results. Let's say you have a page size of 10. If you
+    #               don't find the result you want among the first 10 results
+    #               then set +:offset+ to 10 and look at the next 10 results,
+    #               then 20 and so on.
+    # limit::       Default: 10. This is the number of results you want
+    #               returned, also called the page size. Set +:limit+ to
+    #               +:all+ to return all results.
+    # sort::        A Sort object or sort string describing how the field
+    #               should be sorted. A sort string is made up of field names
+    #               which cannot contain spaces and the word "DESC" if you
+    #               want the field reversed, all separated by commas. For
+    #               example; "rating DESC, author, title"
+    # filter::      a Filter object to filter the search results with
+    # filter_proc:: a filter Proc is a Proc which takes the doc_id, the score
+    #               and the Searcher object as its parameters and returns a
+    #               Boolean value specifying whether the result should be
+    #               included in the result set.
     def search(query, options = {})
       @dir.synchronize do
         return do_search(query, options)
       end
     end
 
-    # See Index#search
+    # Run a query through the Searcher on the index. A TopDocs object is
+    # returned with the relevant results. The +query+ is a Query object or a
+    # query string that can be validly parsed by the Ferret::QueryParser. The
+    # Searcher#search_each method yields the internal document id (used to
+    # reference documents in the Searcher object like this:
+    # +searcher[doc_id]+) and the search score for that document. It is
+    # possible for the score to be greater than 1.0 for some queries, once
+    # boosts are taken into account. This method will also normalize scores
+    # to the range 0.0..1.0 when the max-score is greater than 1.0. Here are
+    # the options;
+    #
+    # === Options
     #
-    # This method yields the doc and score for each hit.
+    # offset::      Default: 0. The offset of the start of the section of the
+    #               result-set to return. This is used for paging through
+    #               results. Let's say you have a page size of 10. If you
+    #               don't find the result you want among the first 10 results
+    #               then set +:offset+ to 10 and look at the next 10 results,
+    #               then 20 and so on.
+    # limit::       Default: 10. This is the number of results you want
+    #               returned, also called the page size. Set +:limit+ to
+    #               +:all+ to return all results.
+    # sort::        A Sort object or sort string describing how the field
+    #               should be sorted. A sort string is made up of field names
+    #               which cannot contain spaces and the word "DESC" if you
+    #               want the field reversed, all separated by commas. For
+    #               example; "rating DESC, author, title"
+    # filter::      a Filter object to filter the search results with
+    # filter_proc:: a filter Proc is a Proc which takes the doc_id, the score
+    #               and the Searcher object as its parameters and returns a
+    #               Boolean value specifying whether the result should be
+    #               included in the result set.
+    #
+    # returns:: The total number of hits.
+    #
+    # === Example
     # eg.
-    #   index.search_each() do |doc, score|
+    #   index.search_each(query, options = {}) do |doc, score|
     #     puts "hit document number #{doc} with a score of #{score}"
     #   end
     #
-    # returns:: The total number of hits.
     def search_each(query, options = {}) # :yield: doc, score
       @dir.synchronize do
         ensure_searcher_open()
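A hedged usage sketch of the two methods with the options documented above; the documents and field values are invented:

    index = Ferret::Index::Index.new
    index << {:title => "Rails Recipes",  :rating => 5}
    index << {:title => "Rails Cookbook", :rating => 4}

    top_docs = index.search("title:rails",
                            :limit => 10,
                            :sort  => "rating DESC, title")
    puts top_docs.total_hits

    hits = index.search_each("title:rails", :limit => :all) do |doc_id, score|
      puts "hit document number #{doc_id} with a score of #{score}"
    end
    puts "#{hits} hits in total"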
@@ -571,6 +617,19 @@ module Ferret::Index
      end
    end
 
+    # Turn a query string into a Query object with the Index's QueryParser
+    def process_query(query)
+      if query.is_a?(String)
+        if @qp.nil?
+          @qp = Ferret::QueryParser.new(@options)
+        end
+        # we need to set this every time, in case a new field has been added
+        @qp.fields = @reader.field_names
+        query = @qp.parse(query)
+      end
+      return query
+    end
+
     protected
       def ensure_writer_open()
        raise "tried to use a closed index" if not @open
@@ -623,19 +682,6 @@ module Ferret::Index
 
        return @searcher.search(query, options)
      end
-
-      def process_query(query)
-        if query.is_a?(String)
-          if @qp.nil?
-            @qp = Ferret::QueryParser.new(@options)
-          end
-          # we need to set this every time, in case a new field has been added
-          @qp.fields = @reader.field_names
-          query = @qp.parse(query)
-        end
-        return query
-      end
-
   end
 end