ferret 0.10.2 → 0.10.3
- data/Rakefile +31 -36
- data/ext/analysis.c +97 -37
- data/ext/analysis.h +11 -0
- data/ext/ferret.c +10 -0
- data/ext/ferret.h +2 -0
- data/ext/inc/lang.h +1 -0
- data/ext/index.c +2 -2
- data/ext/lang.h +1 -0
- data/ext/q_parser.c +25 -5
- data/ext/r_analysis.c +97 -53
- data/ext/r_index.c +0 -1
- data/ext/r_search.c +1 -1
- data/ext/search.c +7 -3
- data/ext/term_vectors.c +1 -1
- data/lib/ferret/index.rb +94 -48
- data/lib/ferret_version.rb +1 -1
- data/test/unit/analysis/tc_analyzer.rb +24 -8
- data/test/unit/analysis/tc_token_stream.rb +7 -0
- data/test/unit/index/tc_index.rb +2 -2
- data/test/unit/query_parser/tc_query_parser.rb +3 -3
- metadata +12 -7
- data/ext/tags +0 -7841
data/ext/r_analysis.c
CHANGED
@@ -18,6 +18,7 @@ static VALUE cRegExpTokenizer;
 static VALUE cAsciiLowerCaseFilter;
 static VALUE cLowerCaseFilter;
 static VALUE cStopFilter;
+static VALUE cHyphenFilter;
 static VALUE cStemFilter;
 
 static VALUE cAnalyzer;
@@ -568,22 +569,20 @@ static TokenStream *
 frt_get_cwrapped_rts(VALUE rts)
 {
     TokenStream *ts;
-    … (14 deleted lines not captured in this rendering)
-    ts->ref_cnt = 1;
-    break;
+    if (rb_ivar_get(CLASS_OF(rts), id_cclass) == Qtrue) {
+        GET_TS(ts, rts);
+        REF(ts);
+    }
+    else {
+        ts = ts_new(CWrappedTokenStream);
+        CWTS(ts)->rts = rts;
+        ts->next = &cwrts_next;
+        ts->reset = &cwrts_reset;
+        ts->clone_i = &cwrts_clone_i;
+        ts->destroy_i = &cwrts_destroy_i;
+        /* prevent from being garbage collected */
+        rb_hash_aset(object_space, LONG2NUM(rts), rts);
+        ts->ref_cnt = 1;
     }
     return ts;
 }
@@ -911,6 +910,28 @@ frt_lowercase_filter_init(VALUE self, VALUE rsub_ts)
     return self;
 }
 
+/*
+ *  call-seq:
+ *     HyphenFilter.new(token_stream) -> token_stream
+ *
+ *  Create an HyphenFilter which filters hyphenated words. The way it works is
+ *  by adding both the word concatenated into a single word and split into
+ *  multiple words. ie "e-mail" becomes "email" and "e mail". This way a
+ *  search for "e-mail", "email" and "mail" will all match. This filter is
+ *  used by default by the StandardAnalyzer.
+ */
+static VALUE
+frt_hyphen_filter_init(VALUE self, VALUE rsub_ts)
+{
+    TokenStream *ts = frt_get_cwrapped_rts(rsub_ts);
+    ts = hyphen_filter_new(ts);
+    object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
+
+    Frt_Wrap_Struct(self, &frt_tf_mark, &frt_tf_free, ts);
+    object_add(ts, self);
+    return self;
+}
+
 /*
  *  call-seq:
  *     StopFilter.new(token_stream) -> token_stream
@@ -1021,20 +1042,18 @@ Analyzer *
 frt_get_cwrapped_analyzer(VALUE ranalyzer)
 {
     Analyzer *a = NULL;
-    … (12 deleted lines not captured in this rendering)
-    rb_hash_aset(object_space, LONG2NUM(ranalyzer), ranalyzer);
-    break;
+    if (rb_ivar_get(CLASS_OF(ranalyzer), id_cclass) == Qtrue) {
+        Data_Get_Struct(ranalyzer, Analyzer, a);
+        REF(a);
+    }
+    else {
+        a = (Analyzer *)ecalloc(sizeof(CWrappedAnalyzer));
+        a->destroy_i = &cwa_destroy_i;
+        a->get_ts = &cwa_get_ts;
+        a->ref_cnt = 1;
+        ((CWrappedAnalyzer *)a)->ranalyzer = ranalyzer;
+        /* prevent from being garbage collected */
+        rb_hash_aset(object_space, LONG2NUM(ranalyzer), ranalyzer);
     }
     return a;
 }
@@ -1350,11 +1369,14 @@ frt_re_analyzer_init(int argc, VALUE *argv, VALUE self)
 
     ts = rets_new(Qnil, regex, proc);
     rets = Data_Wrap_Struct(cRegExpTokenizer, &frt_rets_mark, &frt_rets_free, ts);
-    REF(ts);
     /* rb_hash_aset(object_space, LONG2NUM((long)rets), rets); */
     object_add(ts, rets);
 
-    if (lower != Qfalse)
+    if (lower != Qfalse) {
+        rets = frt_lowercase_filter_init(frt_data_alloc(cLowerCaseFilter), rets);
+        ts = DATA_PTR(rets);
+    }
+    REF(ts);
 
     a = analyzer_new(ts, &re_analyzer_destroy_i, NULL);
     Frt_Wrap_Struct(self, &frt_re_analyzer_mark, &frt_analyzer_free, a);
@@ -1461,7 +1483,7 @@ static void Init_Token(void)
  */
 static void Init_TokenStream(void)
 {
-    cTokenStream =
+    cTokenStream = frt_define_class_under(mAnalysis, "TokenStream", rb_cObject);
     rb_define_method(cTokenStream, "next", frt_ts_next, 0);
     rb_define_method(cTokenStream, "text=", frt_ts_set_text, 1);
     rb_define_method(cTokenStream, "text", frt_ts_get_text, 0);
@@ -1482,7 +1504,7 @@ static void Init_TokenStream(void)
 static void Init_AsciiLetterTokenizer(void)
 {
     cAsciiLetterTokenizer =
-        …
+        frt_define_class_under(mAnalysis, "AsciiLetterTokenizer", cTokenStream);
     rb_define_alloc_func(cAsciiLetterTokenizer, frt_data_alloc);
     rb_define_method(cAsciiLetterTokenizer, "initialize",
                      frt_a_letter_tokenizer_init, 1);
@@ -1504,7 +1526,7 @@ static void Init_AsciiLetterTokenizer(void)
 static void Init_LetterTokenizer(void)
 {
     cLetterTokenizer =
-        …
+        frt_define_class_under(mAnalysis, "LetterTokenizer", cTokenStream);
     rb_define_alloc_func(cLetterTokenizer, frt_data_alloc);
     rb_define_method(cLetterTokenizer, "initialize",
                      frt_letter_tokenizer_init, -1);
@@ -1524,7 +1546,7 @@ static void Init_LetterTokenizer(void)
 static void Init_AsciiWhiteSpaceTokenizer(void)
 {
     cAsciiWhiteSpaceTokenizer =
-        …
+        frt_define_class_under(mAnalysis, "AsciiWhiteSpaceTokenizer",
                                cTokenStream);
     rb_define_alloc_func(cAsciiWhiteSpaceTokenizer, frt_data_alloc);
     rb_define_method(cAsciiWhiteSpaceTokenizer, "initialize",
@@ -1545,7 +1567,7 @@ static void Init_AsciiWhiteSpaceTokenizer(void)
 static void Init_WhiteSpaceTokenizer(void)
 {
     cWhiteSpaceTokenizer =
-        …
+        frt_define_class_under(mAnalysis, "WhiteSpaceTokenizer", cTokenStream);
     rb_define_alloc_func(cWhiteSpaceTokenizer, frt_data_alloc);
     rb_define_method(cWhiteSpaceTokenizer, "initialize",
                      frt_whitespace_tokenizer_init, -1);
@@ -1566,7 +1588,7 @@ static void Init_WhiteSpaceTokenizer(void)
 static void Init_AsciiStandardTokenizer(void)
 {
     cAsciiStandardTokenizer =
-        …
+        frt_define_class_under(mAnalysis, "AsciiStandardTokenizer", cTokenStream);
     rb_define_alloc_func(cAsciiStandardTokenizer, frt_data_alloc);
     rb_define_method(cAsciiStandardTokenizer, "initialize",
                      frt_a_standard_tokenizer_init, 1);
@@ -1587,7 +1609,7 @@ static void Init_AsciiStandardTokenizer(void)
 static void Init_StandardTokenizer(void)
 {
     cStandardTokenizer =
-        …
+        frt_define_class_under(mAnalysis, "StandardTokenizer", cTokenStream);
     rb_define_alloc_func(cStandardTokenizer, frt_data_alloc);
     rb_define_method(cStandardTokenizer, "initialize",
                      frt_standard_tokenizer_init, 1);
@@ -1614,7 +1636,7 @@ static void Init_StandardTokenizer(void)
 static void Init_RegExpTokenizer(void)
 {
     cRegExpTokenizer =
-        …
+        frt_define_class_under(mAnalysis, "RegExpTokenizer", cTokenStream);
     rtoken_re = rb_reg_new(TOKEN_RE, strlen(TOKEN_RE), 0);
     rb_define_const(cRegExpTokenizer, "REGEXP", rtoken_re);
     rb_define_alloc_func(cRegExpTokenizer, frt_data_alloc);
@@ -1642,7 +1664,7 @@ static void Init_RegExpTokenizer(void)
 static void Init_AsciiLowerCaseFilter(void)
 {
     cAsciiLowerCaseFilter =
-        …
+        frt_define_class_under(mAnalysis, "AsciiLowerCaseFilter", cTokenStream);
     rb_define_alloc_func(cAsciiLowerCaseFilter, frt_data_alloc);
     rb_define_method(cAsciiLowerCaseFilter, "initialize",
                      frt_a_lowercase_filter_init, 1);
@@ -1662,12 +1684,33 @@ static void Init_AsciiLowerCaseFilter(void)
 static void Init_LowerCaseFilter(void)
 {
     cLowerCaseFilter =
-        …
+        frt_define_class_under(mAnalysis, "LowerCaseFilter", cTokenStream);
     rb_define_alloc_func(cLowerCaseFilter, frt_data_alloc);
     rb_define_method(cLowerCaseFilter, "initialize",
                      frt_lowercase_filter_init, 1);
 }
 
+/*
+ *  Document-class: Ferret::Analysis::HyphenFilter
+ *
+ *  HyphenFilter filters hyphenated words by adding both the word concatenated
+ *  into a single word and split into multiple words. ie "e-mail" becomes
+ *  "email" and "e mail". This way a search for "e-mail", "email" and "mail"
+ *  will all match. This filter is used by default by the StandardAnalyzer.
+ *
+ *  === Example
+ *
+ *    ["e-mail", "set-up"] => ["email", "e", "mail", "setup", "set", "up"]
+ *
+ */
+static void Init_HyphenFilter(void)
+{
+    cHyphenFilter =
+        frt_define_class_under(mAnalysis, "HyphenFilter", cTokenStream);
+    rb_define_alloc_func(cHyphenFilter, frt_data_alloc);
+    rb_define_method(cHyphenFilter, "initialize", frt_hyphen_filter_init, 1);
+}
+
 /*
  *  Document-class: Ferret::Analysis::StopFilter
  *
@@ -1682,7 +1725,7 @@ static void Init_LowerCaseFilter(void)
 static void Init_StopFilter(void)
 {
     cStopFilter =
-        …
+        frt_define_class_under(mAnalysis, "StopFilter", cTokenStream);
     rb_define_alloc_func(cStopFilter, frt_data_alloc);
     rb_define_method(cStopFilter, "initialize",
                      frt_stop_filter_init, -1);
@@ -1741,7 +1784,7 @@ static void Init_StopFilter(void)
 static void Init_StemFilter(void)
 {
     cStemFilter =
-        …
+        frt_define_class_under(mAnalysis, "StemFilter", cTokenStream);
     rb_define_alloc_func(cStemFilter, frt_data_alloc);
     rb_define_method(cStemFilter, "initialize",
                      frt_stem_filter_init, -1);
@@ -1784,7 +1827,7 @@ static void Init_StemFilter(void)
 static void Init_Analyzer(void)
 {
     cAnalyzer =
-        …
+        frt_define_class_under(mAnalysis, "Analyzer", rb_cObject);
     rb_define_alloc_func(cAnalyzer, frt_data_alloc);
     rb_define_method(cAnalyzer, "initialize", frt_letter_analyzer_init, -1);
     rb_define_method(cAnalyzer, "token_stream", frt_analyzer_token_stream, 2);
@@ -1821,7 +1864,7 @@ static void Init_Analyzer(void)
 static void Init_AsciiLetterAnalyzer(void)
 {
     cAsciiLetterAnalyzer =
-        …
+        frt_define_class_under(mAnalysis, "AsciiLetterAnalyzer", cAnalyzer);
     rb_define_alloc_func(cAsciiLetterAnalyzer, frt_data_alloc);
     rb_define_method(cAsciiLetterAnalyzer, "initialize",
                      frt_a_letter_analyzer_init, -1);
@@ -1851,7 +1894,7 @@ static void Init_AsciiLetterAnalyzer(void)
 static void Init_LetterAnalyzer(void)
 {
     cLetterAnalyzer =
-        …
+        frt_define_class_under(mAnalysis, "LetterAnalyzer", cAnalyzer);
     rb_define_alloc_func(cLetterAnalyzer, frt_data_alloc);
     rb_define_method(cLetterAnalyzer, "initialize",
                      frt_letter_analyzer_init, -1);
@@ -1887,7 +1930,7 @@ static void Init_LetterAnalyzer(void)
 static void Init_AsciiWhiteSpaceAnalyzer(void)
 {
     cAsciiWhiteSpaceAnalyzer =
-        …
+        frt_define_class_under(mAnalysis, "AsciiWhiteSpaceAnalyzer", cAnalyzer);
     rb_define_alloc_func(cAsciiWhiteSpaceAnalyzer, frt_data_alloc);
     rb_define_method(cAsciiWhiteSpaceAnalyzer, "initialize",
                      frt_a_white_space_analyzer_init, -1);
@@ -1917,7 +1960,7 @@ static void Init_AsciiWhiteSpaceAnalyzer(void)
 static void Init_WhiteSpaceAnalyzer(void)
 {
     cWhiteSpaceAnalyzer =
-        …
+        frt_define_class_under(mAnalysis, "WhiteSpaceAnalyzer", cAnalyzer);
     rb_define_alloc_func(cWhiteSpaceAnalyzer, frt_data_alloc);
     rb_define_method(cWhiteSpaceAnalyzer, "initialize",
                      frt_white_space_analyzer_init, -1);
@@ -1955,7 +1998,7 @@ static void Init_WhiteSpaceAnalyzer(void)
 static void Init_AsciiStandardAnalyzer(void)
 {
     cAsciiStandardAnalyzer =
-        …
+        frt_define_class_under(mAnalysis, "AsciiStandardAnalyzer", cAnalyzer);
     rb_define_alloc_func(cAsciiStandardAnalyzer, frt_data_alloc);
     rb_define_method(cAsciiStandardAnalyzer, "initialize",
                      frt_a_standard_analyzer_init, -1);
@@ -1986,7 +2029,7 @@ static void Init_AsciiStandardAnalyzer(void)
 static void Init_StandardAnalyzer(void)
 {
     cStandardAnalyzer =
-        …
+        frt_define_class_under(mAnalysis, "StandardAnalyzer", cAnalyzer);
     rb_define_alloc_func(cStandardAnalyzer, frt_data_alloc);
     rb_define_method(cStandardAnalyzer, "initialize",
                      frt_standard_analyzer_init, -1);
@@ -2015,7 +2058,7 @@ static void Init_StandardAnalyzer(void)
 static void Init_PerFieldAnalyzer(void)
 {
     cPerFieldAnalyzer =
-        …
+        frt_define_class_under(mAnalysis, "PerFieldAnalyzer", cAnalyzer);
     rb_define_alloc_func(cPerFieldAnalyzer, frt_data_alloc);
     rb_define_method(cPerFieldAnalyzer, "initialize",
                      frt_per_field_analyzer_init, 1);
@@ -2055,7 +2098,7 @@ static void Init_PerFieldAnalyzer(void)
 static void Init_RegExpAnalyzer(void)
 {
     cRegExpAnalyzer =
-        …
+        frt_define_class_under(mAnalysis, "RegExpAnalyzer", cAnalyzer);
     rb_define_alloc_func(cRegExpAnalyzer, frt_data_alloc);
     rb_define_method(cRegExpAnalyzer, "initialize",
                      frt_re_analyzer_init, -1);
@@ -2171,6 +2214,7 @@ Init_Analysis(void)
 
     Init_AsciiLowerCaseFilter();
     Init_LowerCaseFilter();
+    Init_HyphenFilter();
     Init_StopFilter();
     Init_StemFilter();
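The net effect of these r_analysis.c changes is a new Ferret::Analysis::HyphenFilter class (wired into Init_Analysis) plus routing all class registrations through frt_define_class_under. A minimal Ruby sketch of the new filter, based on the doc comment above; the exact tokens produced are illustrative:

    require 'ferret'
    include Ferret::Analysis

    # StandardTokenizer should keep "e-mail" together as one token so that
    # HyphenFilter can expand it; a letter tokenizer would already split on "-".
    ts = HyphenFilter.new(StandardTokenizer.new("e-mail set-up"))
    while token = ts.next
      puts token.text
    end
    # Per the doc comment: email, e, mail, setup, set, up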
data/ext/r_index.c
CHANGED
@@ -1875,7 +1875,6 @@ frt_ir_init(VALUE self, VALUE rdir)
     VALUE rfield_num_map = rb_hash_new();
 
     if (TYPE(rdir) == T_ARRAY) {
-        VALUE rreader;
         VALUE rdirs = rdir;
         const int reader_cnt = RARRAY(rdir)->len;
         IndexReader **sub_readers = ALLOC_N(IndexReader *, reader_cnt);
data/ext/r_search.c
CHANGED
@@ -2181,7 +2181,7 @@ frt_sea_search_internal(Query *query, VALUE roptions, Searcher *sea)
  *     searcher.search(query, options = {}) -> TopDocs
  *
  *  Run a query through the Searcher on the index. A TopDocs object is
- *  returned with the relevant results. The +query+ is
+ *  returned with the relevant results. The +query+ is a built in Query
  *  object. Here are the options;
  *
  *  === Options
data/ext/search.c
CHANGED
@@ -741,19 +741,23 @@ static char *excerpt_get_str(Excerpt *e, MatchVector *mv,
     for (i = e->start; i <= e->end; i++) {
         MatchRange *mr = mv->matches + i;
         len = mr->start_offset - last_offset;
-        lazy_df_get_bytes(lazy_df, e_ptr, last_offset, len);
+        if (len) lazy_df_get_bytes(lazy_df, e_ptr, last_offset, len);
         e_ptr += len;
         memcpy(e_ptr, pre_tag, pre_tag_len);
         e_ptr += pre_tag_len;
         len = mr->end_offset - mr->start_offset;
-        lazy_df_get_bytes(lazy_df, e_ptr, mr->start_offset, len);
+        if (len) lazy_df_get_bytes(lazy_df, e_ptr, mr->start_offset, len);
         e_ptr += len;
         memcpy(e_ptr, post_tag, post_tag_len);
         e_ptr += post_tag_len;
         last_offset = mr->end_offset;
     }
+    if ((lazy_df->len - e->end_offset) <= ellipsis_len) {
+        /* no point using ellipsis if it takes up more space */
+        e->end_offset = lazy_df->len;
+    }
     len = e->end_offset - last_offset;
-    lazy_df_get_bytes(lazy_df, e_ptr, last_offset, len);
+    if (len) lazy_df_get_bytes(lazy_df, e_ptr, last_offset, len);
     e_ptr += len;
     if (e->end_offset < lazy_df->len) {
         memcpy(e_ptr, ellipsis, ellipsis_len);
data/ext/term_vectors.c
CHANGED
data/lib/ferret/index.rb
CHANGED
@@ -146,25 +146,25 @@ module Ferret::Index
     #
     # === Options
     #
-    # … (17 lines of the old option descriptions; text not captured in this rendering)
+    # field::            Default: @options[:default_field]. The default_field
+    #                    is the field that is usually highlighted but you can
+    #                    specify which field you want to highlight here. If
+    #                    you want to highlight multiple fields then you will
+    #                    need to call this method multiple times.
+    # excerpt_length::   Default: 150. Length of excerpt to show. Highlighted
+    #                    terms will be in the centre of the excerpt.
+    # num_excerpts::     Default: 2. Number of excerpts to return.
+    # pre_tag::          Default: "<b>". Tag to place to the left of the
+    #                    match. You'll probably want to change this to a
+    #                    "<span>" tag with a class. Try "\033[7m" for use in a
+    #                    terminal.
+    # post_tag::         Default: "</b>". This tag should close the
+    #                    +:pre_tag+. Try tag "\033[m" in the terminal.
+    # ellipsis::         Default: "...". This is the string that is appended
+    #                    at the beginning and end of excerpts (unless the
+    #                    excerpt hits the start or end of the field). You'll
+    #                    probably want to change this to a Unicode ellipsis
+    #                    character.
     def highlight(query, doc_id, options = {})
       ensure_searcher_open()
       @searcher.highlight(process_query(query),
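A usage sketch for the newly documented highlight options (index contents and output are illustrative):

    require 'ferret'

    index = Ferret::Index::Index.new        # in-memory index for illustration
    index << {:content => "Send me an e-mail when the set-up is done"}

    excerpts = index.highlight("email", 0,  # doc_id 0 is the document above
                               :field          => :content,
                               :excerpt_length => 40,
                               :num_excerpts   => 1,
                               :pre_tag        => "\033[7m",  # reverse video
                               :post_tag       => "\033[m")
    puts excerpts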
@@ -270,33 +270,79 @@ module Ferret::Index
     end
     alias :<< :add_document
 
-    # … (12 lines of the old comment; text not captured in this rendering)
+    # Run a query through the Searcher on the index. A TopDocs object is
+    # returned with the relevant results. The +query+ is a built in Query
+    # object or a query string that can be parsed by the Ferret::QueryParser.
+    # Here are the options;
+    #
+    # === Options
+    #
+    # offset::      Default: 0. The offset of the start of the section of the
+    #               result-set to return. This is used for paging through
+    #               results. Let's say you have a page size of 10. If you
+    #               don't find the result you want among the first 10 results
+    #               then set +:offset+ to 10 and look at the next 10 results,
+    #               then 20 and so on.
+    # limit::       Default: 10. This is the number of results you want
+    #               returned, also called the page size. Set +:limit+ to
+    #               +:all+ to return all results
+    # sort::        A Sort object or sort string describing how the field
+    #               should be sorted. A sort string is made up of field names
+    #               which cannot contain spaces and the word "DESC" if you
+    #               want the field reversed, all separated by commas. For
+    #               example; "rating DESC, author, title"
+    # filter::      a Filter object to filter the search results with
+    # filter_proc:: a filter Proc is a Proc which takes the doc_id, the score
+    #               and the Searcher object as its parameters and returns a
+    #               Boolean value specifying whether the result should be
+    #               included in the result set.
     def search(query, options = {})
       @dir.synchronize do
         return do_search(query, options)
       end
     end
 
-    #
+    # Run a query through the Searcher on the index. A TopDocs object is
+    # returned with the relevant results. The +query+ is a Query object or a
+    # query string that can be validly parsed by the Ferret::QueryParser. The
+    # Searcher#search_each method yields the internal document id (used to
+    # reference documents in the Searcher object like this;
+    # +searcher[doc_id]+) and the search score for that document. It is
+    # possible for the score to be greater than 1.0 for some queries and
+    # taking boosts into account. This method will also normalize scores to
+    # the range 0.0..1.0 when the max-score is greater than 1.0. Here are the
+    # options;
+    #
+    # === Options
     #
-    #
+    # offset::      Default: 0. The offset of the start of the section of the
+    #               result-set to return. This is used for paging through
+    #               results. Let's say you have a page size of 10. If you
+    #               don't find the result you want among the first 10 results
+    #               then set +:offset+ to 10 and look at the next 10 results,
+    #               then 20 and so on.
+    # limit::       Default: 10. This is the number of results you want
+    #               returned, also called the page size. Set +:limit+ to
+    #               +:all+ to return all results
+    # sort::        A Sort object or sort string describing how the field
+    #               should be sorted. A sort string is made up of field names
+    #               which cannot contain spaces and the word "DESC" if you
+    #               want the field reversed, all separated by commas. For
+    #               example; "rating DESC, author, title"
+    # filter::      a Filter object to filter the search results with
+    # filter_proc:: a filter Proc is a Proc which takes the doc_id, the score
+    #               and the Searcher object as its parameters and returns a
+    #               Boolean value specifying whether the result should be
+    #               included in the result set.
+    #
+    # returns:: The total number of hits.
+    #
+    # === Example
     # eg.
-    #   index.search_each() do |doc, score|
+    #   index.search_each(query, options = {}) do |doc, score|
     #     puts "hit document number #{doc} with a score of #{score}"
     #   end
     #
-    # returns:: The total number of hits.
     def search_each(query, options = {}) # :yield: doc, score
       @dir.synchronize do
         ensure_searcher_open()
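A sketch of the documented search and search_each options (data is illustrative; fields used for sorting need to be indexed appropriately):

    index = Ferret::Index::Index.new
    index << {:title => "ferret", :rating => "5"}
    index << {:title => "lucene", :rating => "4"}

    # First page of ten results, best rating first
    top_docs = index.search("title:ferret OR title:lucene",
                            :offset => 0, :limit => 10,
                            :sort   => "rating DESC, title")

    # search_each yields each hit's internal doc_id and score,
    # and returns the total number of hits
    total = index.search_each("ferret") do |doc_id, score|
      puts "hit document number #{doc_id} with a score of #{score}"
    end
    puts "#{total} hits"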
@@ -571,6 +617,19 @@ module Ferret::Index
       end
     end
 
+    # Turn a query string into a Query object with the Index's QueryParser
+    def process_query(query)
+      if query.is_a?(String)
+        if @qp.nil?
+          @qp = Ferret::QueryParser.new(@options)
+        end
+        # we need to set this ever time, in case a new field has been added
+        @qp.fields = @reader.field_names
+        query = @qp.parse(query)
+      end
+      return query
+    end
+
     protected
       def ensure_writer_open()
         raise "tried to use a closed index" if not @open
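Note that process_query has moved above the +protected+ keyword, so it is now part of the Index's public interface. A small sketch of what that allows (the exact Query subclass depends on the string; assumes the index already has a reader open, e.g. after a search):

    query = index.process_query("title:ferret")  # String goes through the QueryParser
    query.class                                  # e.g. Ferret::Search::TermQuery

    ready = Ferret::Search::TermQuery.new(:title, "ferret")
    index.process_query(ready).equal?(ready)     # => true: Query objects pass through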
@@ -623,19 +682,6 @@ module Ferret::Index
 
       return @searcher.search(query, options)
     end
-
-    def process_query(query)
-      if query.is_a?(String)
-        if @qp.nil?
-          @qp = Ferret::QueryParser.new(@options)
-        end
-        # we need to set this ever time, in case a new field has been added
-        @qp.fields = @reader.field_names
-        query = @qp.parse(query)
-      end
-      return query
-    end
-
   end
 end