ferret 0.10.2 → 0.10.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +31 -36
- data/ext/analysis.c +97 -37
- data/ext/analysis.h +11 -0
- data/ext/ferret.c +10 -0
- data/ext/ferret.h +2 -0
- data/ext/inc/lang.h +1 -0
- data/ext/index.c +2 -2
- data/ext/lang.h +1 -0
- data/ext/q_parser.c +25 -5
- data/ext/r_analysis.c +97 -53
- data/ext/r_index.c +0 -1
- data/ext/r_search.c +1 -1
- data/ext/search.c +7 -3
- data/ext/term_vectors.c +1 -1
- data/lib/ferret/index.rb +94 -48
- data/lib/ferret_version.rb +1 -1
- data/test/unit/analysis/tc_analyzer.rb +24 -8
- data/test/unit/analysis/tc_token_stream.rb +7 -0
- data/test/unit/index/tc_index.rb +2 -2
- data/test/unit/query_parser/tc_query_parser.rb +3 -3
- metadata +12 -7
- data/ext/tags +0 -7841
data/ext/r_analysis.c
CHANGED
@@ -18,6 +18,7 @@ static VALUE cRegExpTokenizer;
 static VALUE cAsciiLowerCaseFilter;
 static VALUE cLowerCaseFilter;
 static VALUE cStopFilter;
+static VALUE cHyphenFilter;
 static VALUE cStemFilter;
 
 static VALUE cAnalyzer;
@@ -568,22 +569,20 @@ static TokenStream *
 frt_get_cwrapped_rts(VALUE rts)
 {
    TokenStream *ts;
-    switch (TYPE(rts)) {
-        case T_DATA:
-            GET_TS(ts, rts);
-            REF(ts);
-            break;
-        default:
-            ts = ts_new(CWrappedTokenStream);
-            CWTS(ts)->rts = rts;
-            ts->next = &cwrts_next;
-            ts->reset = &cwrts_reset;
-            ts->clone_i = &cwrts_clone_i;
-            ts->destroy_i = &cwrts_destroy_i;
-            /* prevent from being garbage collected */
-            rb_hash_aset(object_space, LONG2NUM(rts), rts);
-            ts->ref_cnt = 1;
-            break;
+    if (rb_ivar_get(CLASS_OF(rts), id_cclass) == Qtrue) {
+        GET_TS(ts, rts);
+        REF(ts);
+    }
+    else {
+        ts = ts_new(CWrappedTokenStream);
+        CWTS(ts)->rts = rts;
+        ts->next = &cwrts_next;
+        ts->reset = &cwrts_reset;
+        ts->clone_i = &cwrts_clone_i;
+        ts->destroy_i = &cwrts_destroy_i;
+        /* prevent from being garbage collected */
+        rb_hash_aset(object_space, LONG2NUM(rts), rts);
+        ts->ref_cnt = 1;
    }
    return ts;
 }
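The else branch is what a token stream written in pure Ruby now goes through: anything whose class was not registered as a C-backed class (the new id_cclass flag) gets wrapped in a CWrappedTokenStream whose callbacks dispatch back into Ruby. A minimal sketch of such a stream; the class name and tokenization are made up, and the callback protocol (a #next returning a Token or nil) is assumed from the wrapper functions above:

    require 'ferret'
    include Ferret::Analysis

    # A hypothetical pure-Ruby token stream. Its class carries no C-class
    # flag, so frt_get_cwrapped_rts takes the else branch, wraps it in a
    # CWrappedTokenStream and pins it in object_space so the GC cannot
    # collect it while C code holds the pointer.
    class EveryWordTokenStream
      def initialize(text)
        @text = text
        @offset = 0
      end

      # Return the next Token, or nil when the stream is exhausted.
      def next
        start = @text.index(/\S/, @offset) or return nil
        stop = @text.index(/\s/, start) || @text.length
        @offset = stop
        Token.new(@text[start...stop], start, stop)
      end
    end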
@@ -911,6 +910,28 @@ frt_lowercase_filter_init(VALUE self, VALUE rsub_ts)
    return self;
 }
 
+/*
+ *  call-seq:
+ *     HyphenFilter.new(token_stream) -> token_stream
+ *
+ *  Create a HyphenFilter which filters hyphenated words. The way it works is
+ *  by adding both the word concatenated into a single word and split into
+ *  multiple words, i.e. "e-mail" becomes "email" and "e mail". This way a
+ *  search for "e-mail", "email" and "mail" will all match. This filter is
+ *  used by default by the StandardAnalyzer.
+ */
+static VALUE
+frt_hyphen_filter_init(VALUE self, VALUE rsub_ts)
+{
+    TokenStream *ts = frt_get_cwrapped_rts(rsub_ts);
+    ts = hyphen_filter_new(ts);
+    object_add(&(TkFilt(ts)->sub_ts), rsub_ts);
+
+    Frt_Wrap_Struct(self, &frt_tf_mark, &frt_tf_free, ts);
+    object_add(ts, self);
+    return self;
+}
+
 /*
  *  call-seq:
  *     StopFilter.new(token_stream) -> token_stream
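At the Ruby level the new filter wraps any other token stream. A small sketch, assuming the usual 0.10 stream API where #next returns Token objects until the stream is exhausted:

    require 'ferret'
    include Ferret::Analysis

    # HyphenFilter adds the concatenated word as well as the parts, so
    # "e-mail" should yield "email", "e" and "mail".
    ts = HyphenFilter.new(StandardTokenizer.new("e-mail set-up"))
    while token = ts.next
      puts token.text
    end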
@@ -1021,20 +1042,18 @@ Analyzer *
 frt_get_cwrapped_analyzer(VALUE ranalyzer)
 {
    Analyzer *a = NULL;
-    switch (TYPE(ranalyzer)) {
-        case T_DATA:
-            Data_Get_Struct(ranalyzer, Analyzer, a);
-            REF(a);
-            break;
-        default:
-            a = (Analyzer *)ecalloc(sizeof(CWrappedAnalyzer));
-            a->destroy_i = &cwa_destroy_i;
-            a->get_ts = &cwa_get_ts;
-            a->ref_cnt = 1;
-            ((CWrappedAnalyzer *)a)->ranalyzer = ranalyzer;
-            /* prevent from being garbage collected */
-            rb_hash_aset(object_space, LONG2NUM(ranalyzer), ranalyzer);
-            break;
+    if (rb_ivar_get(CLASS_OF(ranalyzer), id_cclass) == Qtrue) {
+        Data_Get_Struct(ranalyzer, Analyzer, a);
+        REF(a);
+    }
+    else {
+        a = (Analyzer *)ecalloc(sizeof(CWrappedAnalyzer));
+        a->destroy_i = &cwa_destroy_i;
+        a->get_ts = &cwa_get_ts;
+        a->ref_cnt = 1;
+        ((CWrappedAnalyzer *)a)->ranalyzer = ranalyzer;
+        /* prevent from being garbage collected */
+        rb_hash_aset(object_space, LONG2NUM(ranalyzer), ranalyzer);
    }
    return a;
 }
@@ -1350,11 +1369,14 @@ frt_re_analyzer_init(int argc, VALUE *argv, VALUE self)
 
    ts = rets_new(Qnil, regex, proc);
    rets = Data_Wrap_Struct(cRegExpTokenizer, &frt_rets_mark, &frt_rets_free, ts);
-    REF(ts);
    /* rb_hash_aset(object_space, LONG2NUM((long)rets), rets); */
    object_add(ts, rets);
 
-    if (lower != Qfalse)
+    if (lower != Qfalse) {
+        rets = frt_lowercase_filter_init(frt_data_alloc(cLowerCaseFilter), rets);
+        ts = DATA_PTR(rets);
+    }
+    REF(ts);
 
    a = analyzer_new(ts, &re_analyzer_destroy_i, NULL);
    Frt_Wrap_Struct(self, &frt_re_analyzer_mark, &frt_analyzer_free, a);
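The reordered REF(ts) matters on the lowercasing path: the analyzer now takes its reference on the stream it actually keeps, the LowerCaseFilter wrapper, rather than on the bare tokenizer. A sketch of the construction that exercises this path; the regexp and input are arbitrary:

    require 'ferret'
    include Ferret::Analysis

    # lower = true wraps the RegExpTokenizer in a LowerCaseFilter, so the
    # analyzer's token streams come out lowercased.
    analyzer = RegExpAnalyzer.new(/[[:alpha:]]+/, true)
    ts = analyzer.token_stream(:content, "One-Two THREE")
    while token = ts.next
      puts token.text   # expect "one", "two", "three"
    end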
@@ -1461,7 +1483,7 @@ static void Init_Token(void)
  */
 static void Init_TokenStream(void)
 {
-    cTokenStream = rb_define_class_under(mAnalysis, "TokenStream", rb_cObject);
+    cTokenStream = frt_define_class_under(mAnalysis, "TokenStream", rb_cObject);
    rb_define_method(cTokenStream, "next", frt_ts_next, 0);
    rb_define_method(cTokenStream, "text=", frt_ts_set_text, 1);
    rb_define_method(cTokenStream, "text", frt_ts_get_text, 0);
@@ -1482,7 +1504,7 @@ static void Init_TokenStream(void)
 static void Init_AsciiLetterTokenizer(void)
 {
    cAsciiLetterTokenizer =
-        rb_define_class_under(mAnalysis, "AsciiLetterTokenizer", cTokenStream);
+        frt_define_class_under(mAnalysis, "AsciiLetterTokenizer", cTokenStream);
    rb_define_alloc_func(cAsciiLetterTokenizer, frt_data_alloc);
    rb_define_method(cAsciiLetterTokenizer, "initialize",
                     frt_a_letter_tokenizer_init, 1);
@@ -1504,7 +1526,7 @@ static void Init_AsciiLetterTokenizer(void)
 static void Init_LetterTokenizer(void)
 {
    cLetterTokenizer =
-        rb_define_class_under(mAnalysis, "LetterTokenizer", cTokenStream);
+        frt_define_class_under(mAnalysis, "LetterTokenizer", cTokenStream);
    rb_define_alloc_func(cLetterTokenizer, frt_data_alloc);
    rb_define_method(cLetterTokenizer, "initialize",
                     frt_letter_tokenizer_init, -1);
@@ -1524,7 +1546,7 @@ static void Init_LetterTokenizer(void)
 static void Init_AsciiWhiteSpaceTokenizer(void)
 {
    cAsciiWhiteSpaceTokenizer =
-        rb_define_class_under(mAnalysis, "AsciiWhiteSpaceTokenizer",
+        frt_define_class_under(mAnalysis, "AsciiWhiteSpaceTokenizer",
                               cTokenStream);
    rb_define_alloc_func(cAsciiWhiteSpaceTokenizer, frt_data_alloc);
    rb_define_method(cAsciiWhiteSpaceTokenizer, "initialize",
@@ -1545,7 +1567,7 @@ static void Init_AsciiWhiteSpaceTokenizer(void)
 static void Init_WhiteSpaceTokenizer(void)
 {
    cWhiteSpaceTokenizer =
-        rb_define_class_under(mAnalysis, "WhiteSpaceTokenizer", cTokenStream);
+        frt_define_class_under(mAnalysis, "WhiteSpaceTokenizer", cTokenStream);
    rb_define_alloc_func(cWhiteSpaceTokenizer, frt_data_alloc);
    rb_define_method(cWhiteSpaceTokenizer, "initialize",
                     frt_whitespace_tokenizer_init, -1);
@@ -1566,7 +1588,7 @@ static void Init_WhiteSpaceTokenizer(void)
 static void Init_AsciiStandardTokenizer(void)
 {
    cAsciiStandardTokenizer =
-        rb_define_class_under(mAnalysis, "AsciiStandardTokenizer", cTokenStream);
+        frt_define_class_under(mAnalysis, "AsciiStandardTokenizer", cTokenStream);
    rb_define_alloc_func(cAsciiStandardTokenizer, frt_data_alloc);
    rb_define_method(cAsciiStandardTokenizer, "initialize",
                     frt_a_standard_tokenizer_init, 1);
@@ -1587,7 +1609,7 @@ static void Init_AsciiStandardTokenizer(void)
 static void Init_StandardTokenizer(void)
 {
    cStandardTokenizer =
-        rb_define_class_under(mAnalysis, "StandardTokenizer", cTokenStream);
+        frt_define_class_under(mAnalysis, "StandardTokenizer", cTokenStream);
    rb_define_alloc_func(cStandardTokenizer, frt_data_alloc);
    rb_define_method(cStandardTokenizer, "initialize",
                     frt_standard_tokenizer_init, 1);
@@ -1614,7 +1636,7 @@ static void Init_StandardTokenizer(void)
 static void Init_RegExpTokenizer(void)
 {
    cRegExpTokenizer =
-        rb_define_class_under(mAnalysis, "RegExpTokenizer", cTokenStream);
+        frt_define_class_under(mAnalysis, "RegExpTokenizer", cTokenStream);
    rtoken_re = rb_reg_new(TOKEN_RE, strlen(TOKEN_RE), 0);
    rb_define_const(cRegExpTokenizer, "REGEXP", rtoken_re);
    rb_define_alloc_func(cRegExpTokenizer, frt_data_alloc);
@@ -1642,7 +1664,7 @@ static void Init_RegExpTokenizer(void)
 static void Init_AsciiLowerCaseFilter(void)
 {
    cAsciiLowerCaseFilter =
-        rb_define_class_under(mAnalysis, "AsciiLowerCaseFilter", cTokenStream);
+        frt_define_class_under(mAnalysis, "AsciiLowerCaseFilter", cTokenStream);
    rb_define_alloc_func(cAsciiLowerCaseFilter, frt_data_alloc);
    rb_define_method(cAsciiLowerCaseFilter, "initialize",
                     frt_a_lowercase_filter_init, 1);
@@ -1662,12 +1684,33 @@ static void Init_AsciiLowerCaseFilter(void)
 static void Init_LowerCaseFilter(void)
 {
    cLowerCaseFilter =
-        rb_define_class_under(mAnalysis, "LowerCaseFilter", cTokenStream);
+        frt_define_class_under(mAnalysis, "LowerCaseFilter", cTokenStream);
    rb_define_alloc_func(cLowerCaseFilter, frt_data_alloc);
    rb_define_method(cLowerCaseFilter, "initialize",
                     frt_lowercase_filter_init, 1);
 }
 
+/*
+ *  Document-class: Ferret::Analysis::HyphenFilter
+ *
+ *  HyphenFilter filters hyphenated words by adding both the word concatenated
+ *  into a single word and split into multiple words, i.e. "e-mail" becomes
+ *  "email" and "e mail". This way a search for "e-mail", "email" and "mail"
+ *  will all match. This filter is used by default by the StandardAnalyzer.
+ *
+ *  === Example
+ *
+ *    ["e-mail", "set-up"] => ["email", "e", "mail", "setup", "set", "up"]
+ *
+ */
+static void Init_HyphenFilter(void)
+{
+    cHyphenFilter =
+        frt_define_class_under(mAnalysis, "HyphenFilter", cTokenStream);
+    rb_define_alloc_func(cHyphenFilter, frt_data_alloc);
+    rb_define_method(cHyphenFilter, "initialize", frt_hyphen_filter_init, 1);
+}
+
 /*
  * Document-class: Ferret::Analysis::StopFilter
  *
@@ -1682,7 +1725,7 @@ static void Init_LowerCaseFilter(void)
 static void Init_StopFilter(void)
 {
    cStopFilter =
-        rb_define_class_under(mAnalysis, "StopFilter", cTokenStream);
+        frt_define_class_under(mAnalysis, "StopFilter", cTokenStream);
    rb_define_alloc_func(cStopFilter, frt_data_alloc);
    rb_define_method(cStopFilter, "initialize",
                     frt_stop_filter_init, -1);
@@ -1741,7 +1784,7 @@ static void Init_StopFilter(void)
 static void Init_StemFilter(void)
 {
    cStemFilter =
-        rb_define_class_under(mAnalysis, "StemFilter", cTokenStream);
+        frt_define_class_under(mAnalysis, "StemFilter", cTokenStream);
    rb_define_alloc_func(cStemFilter, frt_data_alloc);
    rb_define_method(cStemFilter, "initialize",
                     frt_stem_filter_init, -1);
@@ -1784,7 +1827,7 @@ static void Init_StemFilter(void)
 static void Init_Analyzer(void)
 {
    cAnalyzer =
-        rb_define_class_under(mAnalysis, "Analyzer", rb_cObject);
+        frt_define_class_under(mAnalysis, "Analyzer", rb_cObject);
    rb_define_alloc_func(cAnalyzer, frt_data_alloc);
    rb_define_method(cAnalyzer, "initialize", frt_letter_analyzer_init, -1);
    rb_define_method(cAnalyzer, "token_stream", frt_analyzer_token_stream, 2);
@@ -1821,7 +1864,7 @@ static void Init_Analyzer(void)
 static void Init_AsciiLetterAnalyzer(void)
 {
    cAsciiLetterAnalyzer =
-        rb_define_class_under(mAnalysis, "AsciiLetterAnalyzer", cAnalyzer);
+        frt_define_class_under(mAnalysis, "AsciiLetterAnalyzer", cAnalyzer);
    rb_define_alloc_func(cAsciiLetterAnalyzer, frt_data_alloc);
    rb_define_method(cAsciiLetterAnalyzer, "initialize",
                     frt_a_letter_analyzer_init, -1);
@@ -1851,7 +1894,7 @@ static void Init_AsciiLetterAnalyzer(void)
 static void Init_LetterAnalyzer(void)
 {
    cLetterAnalyzer =
-        rb_define_class_under(mAnalysis, "LetterAnalyzer", cAnalyzer);
+        frt_define_class_under(mAnalysis, "LetterAnalyzer", cAnalyzer);
    rb_define_alloc_func(cLetterAnalyzer, frt_data_alloc);
    rb_define_method(cLetterAnalyzer, "initialize",
                     frt_letter_analyzer_init, -1);
@@ -1887,7 +1930,7 @@ static void Init_LetterAnalyzer(void)
 static void Init_AsciiWhiteSpaceAnalyzer(void)
 {
    cAsciiWhiteSpaceAnalyzer =
-        rb_define_class_under(mAnalysis, "AsciiWhiteSpaceAnalyzer", cAnalyzer);
+        frt_define_class_under(mAnalysis, "AsciiWhiteSpaceAnalyzer", cAnalyzer);
    rb_define_alloc_func(cAsciiWhiteSpaceAnalyzer, frt_data_alloc);
    rb_define_method(cAsciiWhiteSpaceAnalyzer, "initialize",
                     frt_a_white_space_analyzer_init, -1);
@@ -1917,7 +1960,7 @@ static void Init_AsciiWhiteSpaceAnalyzer(void)
 static void Init_WhiteSpaceAnalyzer(void)
 {
    cWhiteSpaceAnalyzer =
-        rb_define_class_under(mAnalysis, "WhiteSpaceAnalyzer", cAnalyzer);
+        frt_define_class_under(mAnalysis, "WhiteSpaceAnalyzer", cAnalyzer);
    rb_define_alloc_func(cWhiteSpaceAnalyzer, frt_data_alloc);
    rb_define_method(cWhiteSpaceAnalyzer, "initialize",
                     frt_white_space_analyzer_init, -1);
@@ -1955,7 +1998,7 @@ static void Init_WhiteSpaceAnalyzer(void)
 static void Init_AsciiStandardAnalyzer(void)
 {
    cAsciiStandardAnalyzer =
-        rb_define_class_under(mAnalysis, "AsciiStandardAnalyzer", cAnalyzer);
+        frt_define_class_under(mAnalysis, "AsciiStandardAnalyzer", cAnalyzer);
    rb_define_alloc_func(cAsciiStandardAnalyzer, frt_data_alloc);
    rb_define_method(cAsciiStandardAnalyzer, "initialize",
                     frt_a_standard_analyzer_init, -1);
@@ -1986,7 +2029,7 @@ static void Init_AsciiStandardAnalyzer(void)
 static void Init_StandardAnalyzer(void)
 {
    cStandardAnalyzer =
-        rb_define_class_under(mAnalysis, "StandardAnalyzer", cAnalyzer);
+        frt_define_class_under(mAnalysis, "StandardAnalyzer", cAnalyzer);
    rb_define_alloc_func(cStandardAnalyzer, frt_data_alloc);
    rb_define_method(cStandardAnalyzer, "initialize",
                     frt_standard_analyzer_init, -1);
@@ -2015,7 +2058,7 @@ static void Init_StandardAnalyzer(void)
 static void Init_PerFieldAnalyzer(void)
 {
    cPerFieldAnalyzer =
-        rb_define_class_under(mAnalysis, "PerFieldAnalyzer", cAnalyzer);
+        frt_define_class_under(mAnalysis, "PerFieldAnalyzer", cAnalyzer);
    rb_define_alloc_func(cPerFieldAnalyzer, frt_data_alloc);
    rb_define_method(cPerFieldAnalyzer, "initialize",
                     frt_per_field_analyzer_init, 1);
@@ -2055,7 +2098,7 @@ static void Init_PerFieldAnalyzer(void)
 static void Init_RegExpAnalyzer(void)
 {
    cRegExpAnalyzer =
-        rb_define_class_under(mAnalysis, "RegExpAnalyzer", cAnalyzer);
+        frt_define_class_under(mAnalysis, "RegExpAnalyzer", cAnalyzer);
    rb_define_alloc_func(cRegExpAnalyzer, frt_data_alloc);
    rb_define_method(cRegExpAnalyzer, "initialize",
                     frt_re_analyzer_init, -1);
@@ -2171,6 +2214,7 @@ Init_Analysis(void)
 
    Init_AsciiLowerCaseFilter();
    Init_LowerCaseFilter();
+    Init_HyphenFilter();
    Init_StopFilter();
    Init_StemFilter();
 
data/ext/r_index.c
CHANGED
@@ -1875,7 +1875,6 @@ frt_ir_init(VALUE self, VALUE rdir)
    VALUE rfield_num_map = rb_hash_new();
 
    if (TYPE(rdir) == T_ARRAY) {
-        VALUE rreader;
        VALUE rdirs = rdir;
        const int reader_cnt = RARRAY(rdir)->len;
        IndexReader **sub_readers = ALLOC_N(IndexReader *, reader_cnt);
data/ext/r_search.c
CHANGED
@@ -2181,7 +2181,7 @@ frt_sea_search_internal(Query *query, VALUE roptions, Searcher *sea)
 *     searcher.search(query, options = {}) -> TopDocs
 *
 *  Run a query through the Searcher on the index. A TopDocs object is
- *  returned with the relevant results. The +query+ is
+ *  returned with the relevant results. The +query+ is a built in Query
 *  object. Here are the options;
 *
 *  === Options
data/ext/search.c
CHANGED
@@ -741,19 +741,23 @@ static char *excerpt_get_str(Excerpt *e, MatchVector *mv,
    for (i = e->start; i <= e->end; i++) {
        MatchRange *mr = mv->matches + i;
        len = mr->start_offset - last_offset;
-        lazy_df_get_bytes(lazy_df, e_ptr, last_offset, len);
+        if (len) lazy_df_get_bytes(lazy_df, e_ptr, last_offset, len);
        e_ptr += len;
        memcpy(e_ptr, pre_tag, pre_tag_len);
        e_ptr += pre_tag_len;
        len = mr->end_offset - mr->start_offset;
-        lazy_df_get_bytes(lazy_df, e_ptr, mr->start_offset, len);
+        if (len) lazy_df_get_bytes(lazy_df, e_ptr, mr->start_offset, len);
        e_ptr += len;
        memcpy(e_ptr, post_tag, post_tag_len);
        e_ptr += post_tag_len;
        last_offset = mr->end_offset;
    }
+    if ((lazy_df->len - e->end_offset) <= ellipsis_len) {
+        /* no point using ellipsis if it takes up more space */
+        e->end_offset = lazy_df->len;
+    }
    len = e->end_offset - last_offset;
-    lazy_df_get_bytes(lazy_df, e_ptr, last_offset, len);
+    if (len) lazy_df_get_bytes(lazy_df, e_ptr, last_offset, len);
    e_ptr += len;
    if (e->end_offset < lazy_df->len) {
        memcpy(e_ptr, ellipsis, ellipsis_len);
data/ext/term_vectors.c
CHANGED
data/lib/ferret/index.rb
CHANGED
@@ -146,25 +146,25 @@ module Ferret::Index
    #
    # === Options
    #
-    #
-    #
-    #
-    #
-    #
-    #
-    #
-    #
-    #
-    #
-    #
-    #
-    #
-    #
-    #
-    #
-    #
-    #
-    #
+    # field::          Default: @options[:default_field]. The default_field
+    #                  is the field that is usually highlighted but you can
+    #                  specify which field you want to highlight here. If
+    #                  you want to highlight multiple fields then you will
+    #                  need to call this method multiple times.
+    # excerpt_length:: Default: 150. Length of excerpt to show. Highlighted
+    #                  terms will be in the centre of the excerpt.
+    # num_excerpts::   Default: 2. Number of excerpts to return.
+    # pre_tag::        Default: "<b>". Tag to place to the left of the
+    #                  match. You'll probably want to change this to a
+    #                  "<span>" tag with a class "\033[7m" for use in a
+    #                  terminal.
+    # post_tag::       Default: "</b>". This tag should close the
+    #                  +:pre_tag+. Try tag "\033[m" in the terminal.
+    # ellipsis::       Default: "...". This is the string that is appended
+    #                  at the beginning and end of excerpts (unless the
+    #                  excerpt hits the start or end of the field). You'll
+    #                  probably want to change this to a Unicode ellipsis
+    #                  character.
    def highlight(query, doc_id, options = {})
      ensure_searcher_open()
      @searcher.highlight(process_query(query),
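A sketch of these options together; the index contents and query are made up, and highlight is assumed to return an array of excerpt strings:

    require 'ferret'

    index = Ferret::Index::Index.new(:default_field => :content)
    index << {:content => "Send me an e-mail about the set-up meeting."}

    # Reverse-video match markers and a short excerpt, for terminal output.
    index.highlight("email", 0,
                    :excerpt_length => 30,
                    :num_excerpts   => 1,
                    :pre_tag        => "\033[7m",
                    :post_tag       => "\033[m").each do |excerpt|
      puts excerpt
    end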
@@ -270,33 +270,79 @@ module Ferret::Index
    end
    alias :<< :add_document
 
-    #
-    #
-    #
-    #
-    #
-    #
-    #
-    #
-    #
-    #
-    #
-    #
+    # Run a query through the Searcher on the index. A TopDocs object is
+    # returned with the relevant results. The +query+ is a built in Query
+    # object or a query string that can be parsed by the Ferret::QueryParser.
+    # Here are the options;
+    #
+    # === Options
+    #
+    # offset::      Default: 0. The offset of the start of the section of the
+    #               result-set to return. This is used for paging through
+    #               results. Let's say you have a page size of 10. If you
+    #               don't find the result you want among the first 10 results
+    #               then set +:offset+ to 10 and look at the next 10 results,
+    #               then 20 and so on.
+    # limit::       Default: 10. This is the number of results you want
+    #               returned, also called the page size. Set +:limit+ to
+    #               +:all+ to return all results
+    # sort::        A Sort object or sort string describing how the field
+    #               should be sorted. A sort string is made up of field names
+    #               which cannot contain spaces and the word "DESC" if you
+    #               want the field reversed, all separated by commas. For
+    #               example; "rating DESC, author, title"
+    # filter::      a Filter object to filter the search results with
+    # filter_proc:: a filter Proc is a Proc which takes the doc_id, the score
+    #               and the Searcher object as its parameters and returns a
+    #               Boolean value specifying whether the result should be
+    #               included in the result set.
    def search(query, options = {})
      @dir.synchronize do
        return do_search(query, options)
      end
    end
 
-    #
+    # Run a query through the Searcher on the index. A TopDocs object is
+    # returned with the relevant results. The +query+ is a Query object or a
+    # query string that can be validly parsed by the Ferret::QueryParser. The
+    # Searcher#search_each method yields the internal document id (used to
+    # reference documents in the Searcher object like this;
+    # +searcher[doc_id]+) and the search score for that document. It is
+    # possible for the score to be greater than 1.0 for some queries, taking
+    # boosts into account. This method will also normalize scores to
+    # the range 0.0..1.0 when the max-score is greater than 1.0. Here are the
+    # options;
+    #
+    # === Options
    #
-    #
+    # offset::      Default: 0. The offset of the start of the section of the
+    #               result-set to return. This is used for paging through
+    #               results. Let's say you have a page size of 10. If you
+    #               don't find the result you want among the first 10 results
+    #               then set +:offset+ to 10 and look at the next 10 results,
+    #               then 20 and so on.
+    # limit::       Default: 10. This is the number of results you want
+    #               returned, also called the page size. Set +:limit+ to
+    #               +:all+ to return all results
+    # sort::        A Sort object or sort string describing how the field
+    #               should be sorted. A sort string is made up of field names
+    #               which cannot contain spaces and the word "DESC" if you
+    #               want the field reversed, all separated by commas. For
+    #               example; "rating DESC, author, title"
+    # filter::      a Filter object to filter the search results with
+    # filter_proc:: a filter Proc is a Proc which takes the doc_id, the score
+    #               and the Searcher object as its parameters and returns a
+    #               Boolean value specifying whether the result should be
+    #               included in the result set.
+    #
+    # returns:: The total number of hits.
+    #
+    # === Example
    # eg.
-    #   index.search_each() do |doc, score|
+    #   index.search_each(query, options = {}) do |doc, score|
    #     puts "hit document number #{doc} with a score of #{score}"
    #   end
    #
-    # returns:: The total number of hits.
    def search_each(query, options = {}) # :yield: doc, score
      @dir.synchronize do
        ensure_searcher_open()
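A sketch of search_each with paging and sorting; the documents, fields and query are illustrative:

    require 'ferret'

    index = Ferret::Index::Index.new
    index << {:title => "ferret", :rating => "5"}
    index << {:title => "ferret tutorial", :rating => "4"}

    # First page of up to 10 hits, best rating first.
    total = index.search_each("title:ferret",
                              :offset => 0,
                              :limit  => 10,
                              :sort   => "rating DESC, title") do |doc_id, score|
      puts "hit document number #{doc_id} with a score of #{score}"
    end
    puts "#{total} hits"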
@@ -571,6 +617,19 @@ module Ferret::Index
      end
    end
 
+    # Turn a query string into a Query object with the Index's QueryParser
+    def process_query(query)
+      if query.is_a?(String)
+        if @qp.nil?
+          @qp = Ferret::QueryParser.new(@options)
+        end
+        # we need to set this every time, in case a new field has been added
+        @qp.fields = @reader.field_names
+        query = @qp.parse(query)
+      end
+      return query
+    end
+
    protected
      def ensure_writer_open()
        raise "tried to use a closed index" if not @open
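Because search and search_each both pass their argument through process_query, a query string and a Query object are interchangeable. A small sketch; TermQuery and total_hits are the standard 0.10 search API, assumed here:

    require 'ferret'

    index = Ferret::Index::Index.new
    index << {:title => "the pickaxe book"}

    # String form: parsed on demand by the QueryParser that process_query
    # lazily builds, and whose field list it refreshes on every call.
    puts index.search("title:pickaxe").total_hits

    # Equivalent Query object: bypasses the QueryParser entirely.
    query = Ferret::Search::TermQuery.new(:title, "pickaxe")
    puts index.search(query).total_hits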
@@ -623,19 +682,6 @@ module Ferret::Index
 
        return @searcher.search(query, options)
      end
-
-      def process_query(query)
-        if query.is_a?(String)
-          if @qp.nil?
-            @qp = Ferret::QueryParser.new(@options)
-          end
-          # we need to set this every time, in case a new field has been added
-          @qp.fields = @reader.field_names
-          query = @qp.parse(query)
-        end
-        return query
-      end
-
  end
 end
 