ferret 0.10.2 → 0.10.3

@@ -18,6 +18,7 @@ static VALUE cRegExpTokenizer;
  static VALUE cAsciiLowerCaseFilter;
  static VALUE cLowerCaseFilter;
  static VALUE cStopFilter;
+ static VALUE cHyphenFilter;
  static VALUE cStemFilter;
 
  static VALUE cAnalyzer;
@@ -568,22 +569,20 @@ static TokenStream *
  frt_get_cwrapped_rts(VALUE rts)
  {
      TokenStream *ts;
-     switch (TYPE(rts)) {
-     case T_DATA:
-         GET_TS(ts, rts);
-         REF(ts);
-         break;
-     default:
-         ts = ts_new(CWrappedTokenStream);
-         CWTS(ts)->rts = rts;
-         ts->next = &cwrts_next;
-         ts->reset = &cwrts_reset;
-         ts->clone_i = &cwrts_clone_i;
-         ts->destroy_i = &cwrts_destroy_i;
-         /* prevent from being garbage collected */
-         rb_hash_aset(object_space, LONG2NUM(rts), rts);
-         ts->ref_cnt = 1;
-         break;
+     if (rb_ivar_get(CLASS_OF(rts), id_cclass) == Qtrue) {
+         GET_TS(ts, rts);
+         REF(ts);
+     }
+     else {
+         ts = ts_new(CWrappedTokenStream);
+         CWTS(ts)->rts = rts;
+         ts->next = &cwrts_next;
+         ts->reset = &cwrts_reset;
+         ts->clone_i = &cwrts_clone_i;
+         ts->destroy_i = &cwrts_destroy_i;
+         /* prevent from being garbage collected */
+         rb_hash_aset(object_space, LONG2NUM(rts), rts);
+         ts->ref_cnt = 1;
      }
      return ts;
  }
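
The TYPE(rts) == T_DATA test is replaced by a check of the id_cclass flag, which the new frt_define_class_under helper (used throughout the Init_* functions below) presumably sets on each class the C extension registers. Class-level instance variables are not inherited in Ruby, so a pure-Ruby subclass of TokenStream now falls through to the CWrappedTokenStream branch and has its Ruby #next honoured rather than being unwrapped as a raw C struct. A minimal Ruby-side sketch of the pattern this supports; the Capitalizer class is hypothetical:

    require 'ferret'

    # A pure-Ruby token stream: its class carries no id_cclass flag, so
    # frt_get_cwrapped_rts wraps it and dispatches #next through Ruby.
    class Capitalizer < Ferret::Analysis::TokenStream
      def initialize(sub_ts)
        @sub_ts = sub_ts
      end

      def next
        token = @sub_ts.next
        token.text = token.text.capitalize if token
        token
      end
    end
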
@@ -911,6 +910,28 @@ frt_lowercase_filter_init(VALUE self, VALUE rsub_ts)
      return self;
  }
 
+ /*
+  *  call-seq:
+  *     HyphenFilter.new(token_stream) -> token_stream
+  *
+  *  Create a HyphenFilter which filters hyphenated words. It works by
+  *  adding both the word concatenated into a single word and the word
+  *  split into multiple words, i.e. "e-mail" becomes "email" and "e mail",
+  *  so searches for "e-mail", "email" and "mail" will all match. This
+  *  filter is used by default by the StandardAnalyzer.
+  */
+ static VALUE
+ frt_hyphen_filter_init(VALUE self, VALUE rsub_ts)
+ {
+     TokenStream *ts = frt_get_cwrapped_rts(rsub_ts);
+     ts = hyphen_filter_new(ts);
+     object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
+
+     Frt_Wrap_Struct(self, &frt_tf_mark, &frt_tf_free, ts);
+     object_add(ts, self);
+     return self;
+ }
+
  /*
   *  call-seq:
   *     StopFilter.new(token_stream) -> token_stream
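
A usage sketch for the new filter from Ruby; the expected tokens follow the documentation above rather than a verified run:

    require 'ferret'
    include Ferret::Analysis

    ts = HyphenFilter.new(StandardTokenizer.new("e-mail set-up"))
    while token = ts.next
      puts token.text
    end
    # per the docs above: email, e, mail, setup, set, up
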
@@ -1021,20 +1042,18 @@ Analyzer *
  frt_get_cwrapped_analyzer(VALUE ranalyzer)
  {
      Analyzer *a = NULL;
-     switch (TYPE(ranalyzer)) {
-     case T_DATA:
-         Data_Get_Struct(ranalyzer, Analyzer, a);
-         REF(a);
-         break;
-     default:
-         a = (Analyzer *)ecalloc(sizeof(CWrappedAnalyzer));
-         a->destroy_i = &cwa_destroy_i;
-         a->get_ts = &cwa_get_ts;
-         a->ref_cnt = 1;
-         ((CWrappedAnalyzer *)a)->ranalyzer = ranalyzer;
-         /* prevent from being garbage collected */
-         rb_hash_aset(object_space, LONG2NUM(ranalyzer), ranalyzer);
-         break;
+     if (rb_ivar_get(CLASS_OF(ranalyzer), id_cclass) == Qtrue) {
+         Data_Get_Struct(ranalyzer, Analyzer, a);
+         REF(a);
+     }
+     else {
+         a = (Analyzer *)ecalloc(sizeof(CWrappedAnalyzer));
+         a->destroy_i = &cwa_destroy_i;
+         a->get_ts = &cwa_get_ts;
+         a->ref_cnt = 1;
+         ((CWrappedAnalyzer *)a)->ranalyzer = ranalyzer;
+         /* prevent from being garbage collected */
+         rb_hash_aset(object_space, LONG2NUM(ranalyzer), ranalyzer);
      }
      return a;
  }
@@ -1350,11 +1369,14 @@ frt_re_analyzer_init(int argc, VALUE *argv, VALUE self)
 
      ts = rets_new(Qnil, regex, proc);
      rets = Data_Wrap_Struct(cRegExpTokenizer, &frt_rets_mark, &frt_rets_free, ts);
-     REF(ts);
      /* rb_hash_aset(object_space, LONG2NUM((long)rets), rets); */
      object_add(ts, rets);
 
-     if (lower != Qfalse) ts = mb_lowercase_filter_new(ts);
+     if (lower != Qfalse) {
+         rets = frt_lowercase_filter_init(frt_data_alloc(cLowerCaseFilter), rets);
+         ts = DATA_PTR(rets);
+     }
+     REF(ts);
 
      a = analyzer_new(ts, &re_analyzer_destroy_i, NULL);
      Frt_Wrap_Struct(self, &frt_re_analyzer_mark, &frt_analyzer_free, a);
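
RegExpAnalyzer now builds its lowercasing stage through the Ruby-visible LowerCaseFilter wrapper (so the filter participates in the extension's normal GC bookkeeping via object_add) instead of calling the bare C constructor. Usage is unchanged; a sketch with an illustrative regexp:

    require 'ferret'
    include Ferret::Analysis

    # the second argument enables the lowercase filter (the default)
    analyzer = RegExpAnalyzer.new(/[[:alpha:]]+/, true)
    ts = analyzer.token_stream(:title, "Open-Source Search")
    while token = ts.next
      puts token.text   # open, source, search
    end
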
@@ -1461,7 +1483,7 @@ static void Init_Token(void)
   */
  static void Init_TokenStream(void)
  {
-     cTokenStream = rb_define_class_under(mAnalysis, "TokenStream", rb_cObject);
+     cTokenStream = frt_define_class_under(mAnalysis, "TokenStream", rb_cObject);
      rb_define_method(cTokenStream, "next", frt_ts_next, 0);
      rb_define_method(cTokenStream, "text=", frt_ts_set_text, 1);
      rb_define_method(cTokenStream, "text", frt_ts_get_text, 0);
@@ -1482,7 +1504,7 @@ static void Init_TokenStream(void)
  static void Init_AsciiLetterTokenizer(void)
  {
      cAsciiLetterTokenizer =
-         rb_define_class_under(mAnalysis, "AsciiLetterTokenizer", cTokenStream);
+         frt_define_class_under(mAnalysis, "AsciiLetterTokenizer", cTokenStream);
      rb_define_alloc_func(cAsciiLetterTokenizer, frt_data_alloc);
      rb_define_method(cAsciiLetterTokenizer, "initialize",
                       frt_a_letter_tokenizer_init, 1);
@@ -1504,7 +1526,7 @@ static void Init_AsciiLetterTokenizer(void)
  static void Init_LetterTokenizer(void)
  {
      cLetterTokenizer =
-         rb_define_class_under(mAnalysis, "LetterTokenizer", cTokenStream);
+         frt_define_class_under(mAnalysis, "LetterTokenizer", cTokenStream);
      rb_define_alloc_func(cLetterTokenizer, frt_data_alloc);
      rb_define_method(cLetterTokenizer, "initialize",
                       frt_letter_tokenizer_init, -1);
@@ -1524,7 +1546,7 @@ static void Init_LetterTokenizer(void)
  static void Init_AsciiWhiteSpaceTokenizer(void)
  {
      cAsciiWhiteSpaceTokenizer =
-         rb_define_class_under(mAnalysis, "AsciiWhiteSpaceTokenizer",
+         frt_define_class_under(mAnalysis, "AsciiWhiteSpaceTokenizer",
                                cTokenStream);
      rb_define_alloc_func(cAsciiWhiteSpaceTokenizer, frt_data_alloc);
      rb_define_method(cAsciiWhiteSpaceTokenizer, "initialize",
@@ -1545,7 +1567,7 @@ static void Init_AsciiWhiteSpaceTokenizer(void)
  static void Init_WhiteSpaceTokenizer(void)
  {
      cWhiteSpaceTokenizer =
-         rb_define_class_under(mAnalysis, "WhiteSpaceTokenizer", cTokenStream);
+         frt_define_class_under(mAnalysis, "WhiteSpaceTokenizer", cTokenStream);
      rb_define_alloc_func(cWhiteSpaceTokenizer, frt_data_alloc);
      rb_define_method(cWhiteSpaceTokenizer, "initialize",
                       frt_whitespace_tokenizer_init, -1);
@@ -1566,7 +1588,7 @@ static void Init_WhiteSpaceTokenizer(void)
  static void Init_AsciiStandardTokenizer(void)
  {
      cAsciiStandardTokenizer =
-         rb_define_class_under(mAnalysis, "AsciiStandardTokenizer", cTokenStream);
+         frt_define_class_under(mAnalysis, "AsciiStandardTokenizer", cTokenStream);
      rb_define_alloc_func(cAsciiStandardTokenizer, frt_data_alloc);
      rb_define_method(cAsciiStandardTokenizer, "initialize",
                       frt_a_standard_tokenizer_init, 1);
@@ -1587,7 +1609,7 @@ static void Init_AsciiStandardTokenizer(void)
  static void Init_StandardTokenizer(void)
  {
      cStandardTokenizer =
-         rb_define_class_under(mAnalysis, "StandardTokenizer", cTokenStream);
+         frt_define_class_under(mAnalysis, "StandardTokenizer", cTokenStream);
      rb_define_alloc_func(cStandardTokenizer, frt_data_alloc);
      rb_define_method(cStandardTokenizer, "initialize",
                       frt_standard_tokenizer_init, 1);
@@ -1614,7 +1636,7 @@ static void Init_StandardTokenizer(void)
  static void Init_RegExpTokenizer(void)
  {
      cRegExpTokenizer =
-         rb_define_class_under(mAnalysis, "RegExpTokenizer", cTokenStream);
+         frt_define_class_under(mAnalysis, "RegExpTokenizer", cTokenStream);
      rtoken_re = rb_reg_new(TOKEN_RE, strlen(TOKEN_RE), 0);
      rb_define_const(cRegExpTokenizer, "REGEXP", rtoken_re);
      rb_define_alloc_func(cRegExpTokenizer, frt_data_alloc);
@@ -1642,7 +1664,7 @@ static void Init_RegExpTokenizer(void)
  static void Init_AsciiLowerCaseFilter(void)
  {
      cAsciiLowerCaseFilter =
-         rb_define_class_under(mAnalysis, "AsciiLowerCaseFilter", cTokenStream);
+         frt_define_class_under(mAnalysis, "AsciiLowerCaseFilter", cTokenStream);
      rb_define_alloc_func(cAsciiLowerCaseFilter, frt_data_alloc);
      rb_define_method(cAsciiLowerCaseFilter, "initialize",
                       frt_a_lowercase_filter_init, 1);
@@ -1662,12 +1684,33 @@ static void Init_AsciiLowerCaseFilter(void)
  static void Init_LowerCaseFilter(void)
  {
      cLowerCaseFilter =
-         rb_define_class_under(mAnalysis, "LowerCaseFilter", cTokenStream);
+         frt_define_class_under(mAnalysis, "LowerCaseFilter", cTokenStream);
      rb_define_alloc_func(cLowerCaseFilter, frt_data_alloc);
      rb_define_method(cLowerCaseFilter, "initialize",
                       frt_lowercase_filter_init, 1);
  }
 
+ /*
+  *  Document-class: Ferret::Analysis::HyphenFilter
+  *
+  *  HyphenFilter filters hyphenated words by adding both the word
+  *  concatenated into a single word and the word split into multiple
+  *  words, i.e. "e-mail" becomes "email" and "e mail", so searches for
+  *  "e-mail", "email" and "mail" will all match. This filter is used by
+  *  default by the StandardAnalyzer.
+  *
+  *  === Example
+  *
+  *    ["e-mail", "set-up"] => ["email", "e", "mail", "setup", "set", "up"]
+  *
+  */
+ static void Init_HyphenFilter(void)
+ {
+     cHyphenFilter =
+         frt_define_class_under(mAnalysis, "HyphenFilter", cTokenStream);
+     rb_define_alloc_func(cHyphenFilter, frt_data_alloc);
+     rb_define_method(cHyphenFilter, "initialize", frt_hyphen_filter_init, 1);
+ }
+
  /*
   *  Document-class: Ferret::Analysis::StopFilter
   *
@@ -1682,7 +1725,7 @@ static void Init_LowerCaseFilter(void)
  static void Init_StopFilter(void)
  {
      cStopFilter =
-         rb_define_class_under(mAnalysis, "StopFilter", cTokenStream);
+         frt_define_class_under(mAnalysis, "StopFilter", cTokenStream);
      rb_define_alloc_func(cStopFilter, frt_data_alloc);
      rb_define_method(cStopFilter, "initialize",
                       frt_stop_filter_init, -1);
@@ -1741,7 +1784,7 @@ static void Init_StopFilter(void)
  static void Init_StemFilter(void)
  {
      cStemFilter =
-         rb_define_class_under(mAnalysis, "StemFilter", cTokenStream);
+         frt_define_class_under(mAnalysis, "StemFilter", cTokenStream);
      rb_define_alloc_func(cStemFilter, frt_data_alloc);
      rb_define_method(cStemFilter, "initialize",
                       frt_stem_filter_init, -1);
@@ -1784,7 +1827,7 @@ static void Init_StemFilter(void)
  static void Init_Analyzer(void)
  {
      cAnalyzer =
-         rb_define_class_under(mAnalysis, "Analyzer", rb_cObject);
+         frt_define_class_under(mAnalysis, "Analyzer", rb_cObject);
      rb_define_alloc_func(cAnalyzer, frt_data_alloc);
      rb_define_method(cAnalyzer, "initialize", frt_letter_analyzer_init, -1);
      rb_define_method(cAnalyzer, "token_stream", frt_analyzer_token_stream, 2);
@@ -1821,7 +1864,7 @@ static void Init_Analyzer(void)
  static void Init_AsciiLetterAnalyzer(void)
  {
      cAsciiLetterAnalyzer =
-         rb_define_class_under(mAnalysis, "AsciiLetterAnalyzer", cAnalyzer);
+         frt_define_class_under(mAnalysis, "AsciiLetterAnalyzer", cAnalyzer);
      rb_define_alloc_func(cAsciiLetterAnalyzer, frt_data_alloc);
      rb_define_method(cAsciiLetterAnalyzer, "initialize",
                       frt_a_letter_analyzer_init, -1);
@@ -1851,7 +1894,7 @@ static void Init_AsciiLetterAnalyzer(void)
  static void Init_LetterAnalyzer(void)
  {
      cLetterAnalyzer =
-         rb_define_class_under(mAnalysis, "LetterAnalyzer", cAnalyzer);
+         frt_define_class_under(mAnalysis, "LetterAnalyzer", cAnalyzer);
      rb_define_alloc_func(cLetterAnalyzer, frt_data_alloc);
      rb_define_method(cLetterAnalyzer, "initialize",
                       frt_letter_analyzer_init, -1);
@@ -1887,7 +1930,7 @@ static void Init_LetterAnalyzer(void)
  static void Init_AsciiWhiteSpaceAnalyzer(void)
  {
      cAsciiWhiteSpaceAnalyzer =
-         rb_define_class_under(mAnalysis, "AsciiWhiteSpaceAnalyzer", cAnalyzer);
+         frt_define_class_under(mAnalysis, "AsciiWhiteSpaceAnalyzer", cAnalyzer);
      rb_define_alloc_func(cAsciiWhiteSpaceAnalyzer, frt_data_alloc);
      rb_define_method(cAsciiWhiteSpaceAnalyzer, "initialize",
                       frt_a_white_space_analyzer_init, -1);
@@ -1917,7 +1960,7 @@ static void Init_AsciiWhiteSpaceAnalyzer(void)
  static void Init_WhiteSpaceAnalyzer(void)
  {
      cWhiteSpaceAnalyzer =
-         rb_define_class_under(mAnalysis, "WhiteSpaceAnalyzer", cAnalyzer);
+         frt_define_class_under(mAnalysis, "WhiteSpaceAnalyzer", cAnalyzer);
      rb_define_alloc_func(cWhiteSpaceAnalyzer, frt_data_alloc);
      rb_define_method(cWhiteSpaceAnalyzer, "initialize",
                       frt_white_space_analyzer_init, -1);
@@ -1955,7 +1998,7 @@ static void Init_WhiteSpaceAnalyzer(void)
  static void Init_AsciiStandardAnalyzer(void)
  {
      cAsciiStandardAnalyzer =
-         rb_define_class_under(mAnalysis, "AsciiStandardAnalyzer", cAnalyzer);
+         frt_define_class_under(mAnalysis, "AsciiStandardAnalyzer", cAnalyzer);
      rb_define_alloc_func(cAsciiStandardAnalyzer, frt_data_alloc);
      rb_define_method(cAsciiStandardAnalyzer, "initialize",
                       frt_a_standard_analyzer_init, -1);
@@ -1986,7 +2029,7 @@ static void Init_AsciiStandardAnalyzer(void)
  static void Init_StandardAnalyzer(void)
  {
      cStandardAnalyzer =
-         rb_define_class_under(mAnalysis, "StandardAnalyzer", cAnalyzer);
+         frt_define_class_under(mAnalysis, "StandardAnalyzer", cAnalyzer);
      rb_define_alloc_func(cStandardAnalyzer, frt_data_alloc);
      rb_define_method(cStandardAnalyzer, "initialize",
                       frt_standard_analyzer_init, -1);
@@ -2015,7 +2058,7 @@ static void Init_StandardAnalyzer(void)
  static void Init_PerFieldAnalyzer(void)
  {
      cPerFieldAnalyzer =
-         rb_define_class_under(mAnalysis, "PerFieldAnalyzer", cAnalyzer);
+         frt_define_class_under(mAnalysis, "PerFieldAnalyzer", cAnalyzer);
      rb_define_alloc_func(cPerFieldAnalyzer, frt_data_alloc);
      rb_define_method(cPerFieldAnalyzer, "initialize",
                       frt_per_field_analyzer_init, 1);
@@ -2055,7 +2098,7 @@ static void Init_PerFieldAnalyzer(void)
  static void Init_RegExpAnalyzer(void)
  {
      cRegExpAnalyzer =
-         rb_define_class_under(mAnalysis, "RegExpAnalyzer", cAnalyzer);
+         frt_define_class_under(mAnalysis, "RegExpAnalyzer", cAnalyzer);
      rb_define_alloc_func(cRegExpAnalyzer, frt_data_alloc);
      rb_define_method(cRegExpAnalyzer, "initialize",
                       frt_re_analyzer_init, -1);
@@ -2171,6 +2214,7 @@ Init_Analysis(void)
 
      Init_AsciiLowerCaseFilter();
      Init_LowerCaseFilter();
+     Init_HyphenFilter();
      Init_StopFilter();
      Init_StemFilter();
 
@@ -1875,7 +1875,6 @@ frt_ir_init(VALUE self, VALUE rdir)
      VALUE rfield_num_map = rb_hash_new();
 
      if (TYPE(rdir) == T_ARRAY) {
-         VALUE rreader;
          VALUE rdirs = rdir;
          const int reader_cnt = RARRAY(rdir)->len;
          IndexReader **sub_readers = ALLOC_N(IndexReader *, reader_cnt);
@@ -2181,7 +2181,7 @@ frt_sea_search_internal(Query *query, VALUE roptions, Searcher *sea)
   *  searcher.search(query, options = {}) -> TopDocs
   *
   *  Run a query through the Searcher on the index. A TopDocs object is
-  *  returned with the relevant results. The +query+ is an built in Query
+  *  returned with the relevant results. The +query+ is a built-in Query
   *  object. Here are the options:
   *
   *  === Options
@@ -741,19 +741,23 @@ static char *excerpt_get_str(Excerpt *e, MatchVector *mv,
      for (i = e->start; i <= e->end; i++) {
          MatchRange *mr = mv->matches + i;
          len = mr->start_offset - last_offset;
-         lazy_df_get_bytes(lazy_df, e_ptr, last_offset, len);
+         if (len) lazy_df_get_bytes(lazy_df, e_ptr, last_offset, len);
          e_ptr += len;
          memcpy(e_ptr, pre_tag, pre_tag_len);
          e_ptr += pre_tag_len;
          len = mr->end_offset - mr->start_offset;
-         lazy_df_get_bytes(lazy_df, e_ptr, mr->start_offset, len);
+         if (len) lazy_df_get_bytes(lazy_df, e_ptr, mr->start_offset, len);
          e_ptr += len;
          memcpy(e_ptr, post_tag, post_tag_len);
          e_ptr += post_tag_len;
          last_offset = mr->end_offset;
      }
+     if ((lazy_df->len - e->end_offset) <= ellipsis_len) {
+         /* no point using ellipsis if it takes up more space */
+         e->end_offset = lazy_df->len;
+     }
      len = e->end_offset - last_offset;
-     lazy_df_get_bytes(lazy_df, e_ptr, last_offset, len);
+     if (len) lazy_df_get_bytes(lazy_df, e_ptr, last_offset, len);
      e_ptr += len;
      if (e->end_offset < lazy_df->len) {
          memcpy(e_ptr, ellipsis, ellipsis_len);
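
The added length checks make the zero-byte reads explicit no-ops, and the new bounds test drops the trailing ellipsis whenever the text it would stand in for is no longer than the ellipsis itself. The same arithmetic in Ruby, with made-up numbers:

    field_len, end_offset, ellipsis = 100, 98, "..."

    # only 2 bytes of the field remain, but "..." would take 3, so the
    # excerpt is simply extended to the end of the field instead
    end_offset = field_len if (field_len - end_offset) <= ellipsis.length
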
@@ -45,7 +45,7 @@ int tv_get_tv_term_index(TermVector *tv, const char *term)
              return mid;
          }
      }
-     if (strcmp(term, tv->terms[hi].text) == 0) {
+     if (hi >= 0 && strcmp(term, tv->terms[hi].text) == 0) {
          return hi;
      }
      else {
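
The new hi >= 0 guard matters when the sought term sorts before every term in the vector: the bisection loop can then exit with hi == -1, and the old code read tv->terms[-1]. A Ruby rendering of the same shape, assuming the C loop narrows with a lo < hi condition as sketched here:

    # terms is a sorted array of strings; returns the index of term or -1
    def term_index(terms, term)
      lo, hi = 0, terms.length - 1
      while lo < hi
        mid = (lo + hi) / 2
        case term <=> terms[mid]
        when 0  then return mid
        when -1 then hi = mid - 1
        else         lo = mid + 1
        end
      end
      # hi is -1 when term sorts before terms[0]; without the guard the
      # final comparison reads out of bounds (tv->terms[-1] in C)
      (hi >= 0 && terms[hi] == term) ? hi : -1
    end

    term_index(%w[bat cat], "ant")  # => -1, loop exits with hi == -1
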
@@ -146,25 +146,25 @@ module Ferret::Index
  #
  # === Options
  #
- # :field::          Default: @options[:default_field]. The default_field
- #                   is the field that is usually highlighted but you can
- #                   specify which field you want to highlight here. If
- #                   you want to highlight multiple fields then you will
- #                   need to call this method multiple times.
- # :excerpt_length:: Default: 150. Length of excerpt to show. Highlighted
- #                   terms will be in the centre of the excerpt.
- # :num_excerpts::   Default: 2. Number of excerpts to return.
- # :pre_tag::        Default: "<b>". Tag to place to the left of the
- #                   match. You'll probably want to change this to a
- #                   "<span>" tag with a class "\033[7m" for use in a
- #                   terminal.
- # :post_tag::       Default: "</b>". This tag should close the
- #                   +:pre_tag+. Try tag "\033[m" in the terminal.
- # :ellipsis::       Default: "...". This is the string that is appended
- #                   at the beginning and end of excerpts (unless the
- #                   excerpt hits the start or end of the field. You'll
- #                   probably want to change this so a Unicode elipsis
- #                   character.
+ # field::           Default: @options[:default_field]. The default_field
+ #                   is the field that is usually highlighted, but you can
+ #                   specify which field you want to highlight here. If
+ #                   you want to highlight multiple fields then you will
+ #                   need to call this method multiple times.
+ # excerpt_length::  Default: 150. Length of excerpt to show. Highlighted
+ #                   terms will be in the centre of the excerpt.
+ # num_excerpts::    Default: 2. Number of excerpts to return.
+ # pre_tag::         Default: "<b>". Tag to place to the left of the
+ #                   match. You'll probably want to change this to a
+ #                   "<span>" tag with a class. Try "\033[7m" for use in a
+ #                   terminal.
+ # post_tag::        Default: "</b>". This tag should close the
+ #                   +:pre_tag+. Try "\033[m" in the terminal.
+ # ellipsis::        Default: "...". This is the string that is appended
+ #                   at the beginning and end of excerpts (unless the
+ #                   excerpt hits the start or end of the field). You'll
+ #                   probably want to change this to a Unicode ellipsis
+ #                   character.
  def highlight(query, doc_id, options = {})
    ensure_searcher_open()
    @searcher.highlight(process_query(query),
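
The option labels lose their leading colons in the doc markup; the hash keys themselves are still symbols. A hedged example against an index assumed to have a :content field:

    excerpts = index.highlight('ferret', doc_id,
                               :field          => :content,
                               :excerpt_length => 100,
                               :num_excerpts   => 1,
                               :pre_tag        => "\033[7m",
                               :post_tag       => "\033[m")
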
@@ -270,33 +270,79 @@ module Ferret::Index
  end
  alias :<< :add_document
 
- # The main search method for the index. You need to create a query to
- # pass to this method. You can also pass a hash with one or more of the
- # following; {filter, num_docs, first_doc, sort}
- #
- # query::       The query to run on the index
- # filter::      Filters docs from the search result
- # first_doc::   The index in the results of the first doc retrieved.
- #               Default is 0
- # num_docs::    The number of results returned. Default is 10
- # sort::        An array of SortFields describing how to sort the results.
- # filter_proc:: A proc which takes |doc_id, score, searcher| as arguments
- #               and returns true if the document passes the filter.
+ # Run a query through the Searcher on the index. A TopDocs object is
+ # returned with the relevant results. The +query+ is a built-in Query
+ # object or a query string that can be parsed by the Ferret::QueryParser.
+ # Here are the options:
+ #
+ # === Options
+ #
+ # offset::      Default: 0. The offset of the start of the section of the
+ #               result-set to return. This is used for paging through
+ #               results. Let's say you have a page size of 10. If you
+ #               don't find the result you want among the first 10 results
+ #               then set +:offset+ to 10 and look at the next 10 results,
+ #               then 20 and so on.
+ # limit::       Default: 10. This is the number of results you want
+ #               returned, also called the page size. Set +:limit+ to
+ #               +:all+ to return all results.
+ # sort::        A Sort object or sort string describing how the field
+ #               should be sorted. A sort string is made up of field names
+ #               which cannot contain spaces and the word "DESC" if you
+ #               want the field reversed, all separated by commas. For
+ #               example: "rating DESC, author, title"
+ # filter::      A Filter object to filter the search results with.
+ # filter_proc:: A filter Proc is a Proc which takes the doc_id, the score
+ #               and the Searcher object as its parameters and returns a
+ #               Boolean value specifying whether the result should be
+ #               included in the result set.
  def search(query, options = {})
    @dir.synchronize do
      return do_search(query, options)
    end
  end
 
- # See Index#search
+ # Run a query through the Searcher on the index. A TopDocs object is
+ # returned with the relevant results. The +query+ is a Query object or a
+ # query string that can be validly parsed by the Ferret::QueryParser. The
+ # Searcher#search_each method yields the internal document id (used to
+ # reference documents in the Searcher object like this:
+ # +searcher[doc_id]+) and the search score for that document. It is
+ # possible for the score to be greater than 1.0 for some queries when
+ # taking boosts into account. This method will also normalize scores to
+ # the range 0.0..1.0 when the max-score is greater than 1.0. Here are the
+ # options:
+ #
+ # === Options
  #
- # This method yields the doc and score for each hit.
+ # offset::      Default: 0. The offset of the start of the section of the
+ #               result-set to return. This is used for paging through
+ #               results. Let's say you have a page size of 10. If you
+ #               don't find the result you want among the first 10 results
+ #               then set +:offset+ to 10 and look at the next 10 results,
+ #               then 20 and so on.
+ # limit::       Default: 10. This is the number of results you want
+ #               returned, also called the page size. Set +:limit+ to
+ #               +:all+ to return all results.
+ # sort::        A Sort object or sort string describing how the field
+ #               should be sorted. A sort string is made up of field names
+ #               which cannot contain spaces and the word "DESC" if you
+ #               want the field reversed, all separated by commas. For
+ #               example: "rating DESC, author, title"
+ # filter::      A Filter object to filter the search results with.
+ # filter_proc:: A filter Proc is a Proc which takes the doc_id, the score
+ #               and the Searcher object as its parameters and returns a
+ #               Boolean value specifying whether the result should be
+ #               included in the result set.
+ #
+ # returns:: The total number of hits.
+ #
+ # === Example
  # eg.
- #   index.search_each() do |doc, score|
+ #   index.search_each(query, options = {}) do |doc, score|
  #     puts "hit document number #{doc} with a score of #{score}"
  #   end
  #
- # returns:: The total number of hits.
  def search_each(query, options = {}) # :yield: doc, score
    @dir.synchronize do
      ensure_searcher_open()
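
The rewritten docs describe the 0.10.x option names (offset and limit replace the old first_doc and num_docs). A sketch of both calls against an existing index:

    top_docs = index.search('title:ferret',
                            :offset => 20,
                            :limit  => 10,
                            :sort   => "rating DESC, title")

    hits = index.search_each('title:ferret', :limit => :all) do |doc_id, score|
      puts "document #{doc_id} scored #{score}"
    end
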
@@ -571,6 +617,19 @@ module Ferret::Index
    end
  end
 
+ # Turn a query string into a Query object with the Index's QueryParser
+ def process_query(query)
+   if query.is_a?(String)
+     if @qp.nil?
+       @qp = Ferret::QueryParser.new(@options)
+     end
+     # we need to set this every time, in case a new field has been added
+     @qp.fields = @reader.field_names
+     query = @qp.parse(query)
+   end
+   return query
+ end
+
  protected
    def ensure_writer_open()
      raise "tried to use a closed index" if not @open
@@ -623,19 +682,6 @@ module Ferret::Index
 
      return @searcher.search(query, options)
    end
-
-   def process_query(query)
-     if query.is_a?(String)
-       if @qp.nil?
-         @qp = Ferret::QueryParser.new(@options)
-       end
-       # we need to set this ever time, in case a new field has been added
-       @qp.fields = @reader.field_names
-       query = @qp.parse(query)
-     end
-     return query
-   end
-
    end
  end