ferret 0.10.4 → 0.10.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +1 -1
- data/ext/analysis.c +7 -1
- data/ext/bitvector.c +5 -2
- data/ext/bitvector.h +1 -0
- data/ext/ferret.c +55 -8
- data/ext/ferret.h +8 -2
- data/ext/index.c +34 -43
- data/ext/index.h +1 -1
- data/ext/q_boolean.c +1 -1
- data/ext/q_multi_term.c +13 -1
- data/ext/q_parser.c +33 -18
- data/ext/r_analysis.c +68 -45
- data/ext/r_index.c +64 -10
- data/ext/r_search.c +145 -10
- data/ext/search.c +71 -12
- data/lib/ferret/index.rb +42 -28
- data/lib/ferret_version.rb +1 -1
- data/test/unit/analysis/tc_analyzer.rb +1 -1
- data/test/unit/analysis/tc_token_stream.rb +0 -1
- data/test/unit/index/tc_index.rb +3 -3
- data/test/unit/index/tc_index_reader.rb +5 -0
- data/test/unit/search/tc_filter.rb +15 -0
- data/test/unit/search/tm_searcher.rb +13 -2
- metadata +2 -2
data/ext/r_analysis.c
CHANGED
@@ -569,7 +569,7 @@ static TokenStream *
|
|
569
569
|
frt_get_cwrapped_rts(VALUE rts)
|
570
570
|
{
|
571
571
|
TokenStream *ts;
|
572
|
-
if (
|
572
|
+
if (frt_is_cclass(rts) && DATA_PTR(rts)) {
|
573
573
|
GET_TS(ts, rts);
|
574
574
|
REF(ts);
|
575
575
|
}
|
@@ -972,7 +972,7 @@ frt_stop_filter_init(int argc, VALUE *argv, VALUE self)
|
|
972
972
|
* StemFilter.new(token_stream) -> token_stream
|
973
973
|
* StemFilter.new(token_stream,
|
974
974
|
* algorithm="english",
|
975
|
-
* encoding=
|
975
|
+
* encoding="UTF-8") -> token_stream
|
976
976
|
*
|
977
977
|
* Create a StemFilter which uses a snowball stemmer (thank you Martin
|
978
978
|
* Porter) to stem words. You can optionally specify the algorithm (default:
|
@@ -1034,7 +1034,7 @@ static TokenStream *
|
|
1034
1034
|
cwa_get_ts(Analyzer *a, char *field, char *text)
|
1035
1035
|
{
|
1036
1036
|
VALUE rts = rb_funcall(CWA(a)->ranalyzer, id_token_stream, 2,
|
1037
|
-
|
1037
|
+
rb_intern(field), rb_str_new2(text));
|
1038
1038
|
return frt_get_cwrapped_rts(rts);
|
1039
1039
|
}
|
1040
1040
|
|
@@ -1042,7 +1042,7 @@ Analyzer *
|
|
1042
1042
|
frt_get_cwrapped_analyzer(VALUE ranalyzer)
|
1043
1043
|
{
|
1044
1044
|
Analyzer *a = NULL;
|
1045
|
-
if (
|
1045
|
+
if (frt_is_cclass(ranalyzer) && DATA_PTR(ranalyzer)) {
|
1046
1046
|
Data_Get_Struct(ranalyzer, Analyzer, a);
|
1047
1047
|
REF(a);
|
1048
1048
|
}
|
@@ -1230,7 +1230,7 @@ frt_a_standard_analyzer_init(int argc, VALUE *argv, VALUE self)
|
|
1230
1230
|
bool lower;
|
1231
1231
|
VALUE rlower, rstop_words;
|
1232
1232
|
Analyzer *a;
|
1233
|
-
rb_scan_args(argc, argv, "02", &
|
1233
|
+
rb_scan_args(argc, argv, "02", &rstop_words, &rlower);
|
1234
1234
|
lower = ((rlower == Qnil) ? true : RTEST(rlower));
|
1235
1235
|
if (rstop_words != Qnil) {
|
1236
1236
|
char **stop_words = get_stopwords(rstop_words);
|
@@ -1246,7 +1246,7 @@ frt_a_standard_analyzer_init(int argc, VALUE *argv, VALUE self)
|
|
1246
1246
|
|
1247
1247
|
/*
|
1248
1248
|
* call-seq:
|
1249
|
-
* StandardAnalyzer.new(
|
1249
|
+
* StandardAnalyzer.new(stop_words=ENGLISH_STOP_WORDS, lower=true)
|
1250
1250
|
* -> analyzer
|
1251
1251
|
*
|
1252
1252
|
* Create a new StandardAnalyzer which downcases tokens by default but can
|
@@ -1330,7 +1330,7 @@ frt_per_field_analyzer_add_field(VALUE self, VALUE rfield, VALUE ranalyzer)
|
|
1330
1330
|
Data_Get_Struct(self, Analyzer, pfa);
|
1331
1331
|
a = frt_get_cwrapped_analyzer(ranalyzer);
|
1332
1332
|
|
1333
|
-
pfa_add_field(pfa,
|
1333
|
+
pfa_add_field(pfa, frt_field(rfield), a);
|
1334
1334
|
return self;
|
1335
1335
|
}
|
1336
1336
|
|
@@ -1483,7 +1483,8 @@ static void Init_Token(void)
|
|
1483
1483
|
*/
|
1484
1484
|
static void Init_TokenStream(void)
|
1485
1485
|
{
|
1486
|
-
cTokenStream =
|
1486
|
+
cTokenStream = rb_define_class_under(mAnalysis, "TokenStream", rb_cObject);
|
1487
|
+
frt_mark_cclass(cTokenStream);
|
1487
1488
|
rb_define_method(cTokenStream, "next", frt_ts_next, 0);
|
1488
1489
|
rb_define_method(cTokenStream, "text=", frt_ts_set_text, 1);
|
1489
1490
|
rb_define_method(cTokenStream, "text", frt_ts_get_text, 0);
|
@@ -1504,7 +1505,8 @@ static void Init_TokenStream(void)
|
|
1504
1505
|
static void Init_AsciiLetterTokenizer(void)
|
1505
1506
|
{
|
1506
1507
|
cAsciiLetterTokenizer =
|
1507
|
-
|
1508
|
+
rb_define_class_under(mAnalysis, "AsciiLetterTokenizer", cTokenStream);
|
1509
|
+
frt_mark_cclass(cAsciiLetterTokenizer);
|
1508
1510
|
rb_define_alloc_func(cAsciiLetterTokenizer, frt_data_alloc);
|
1509
1511
|
rb_define_method(cAsciiLetterTokenizer, "initialize",
|
1510
1512
|
frt_a_letter_tokenizer_init, 1);
|
@@ -1526,7 +1528,8 @@ static void Init_AsciiLetterTokenizer(void)
|
|
1526
1528
|
static void Init_LetterTokenizer(void)
|
1527
1529
|
{
|
1528
1530
|
cLetterTokenizer =
|
1529
|
-
|
1531
|
+
rb_define_class_under(mAnalysis, "LetterTokenizer", cTokenStream);
|
1532
|
+
frt_mark_cclass(cLetterTokenizer);
|
1530
1533
|
rb_define_alloc_func(cLetterTokenizer, frt_data_alloc);
|
1531
1534
|
rb_define_method(cLetterTokenizer, "initialize",
|
1532
1535
|
frt_letter_tokenizer_init, -1);
|
@@ -1546,8 +1549,9 @@ static void Init_LetterTokenizer(void)
|
|
1546
1549
|
static void Init_AsciiWhiteSpaceTokenizer(void)
|
1547
1550
|
{
|
1548
1551
|
cAsciiWhiteSpaceTokenizer =
|
1549
|
-
|
1552
|
+
rb_define_class_under(mAnalysis, "AsciiWhiteSpaceTokenizer",
|
1550
1553
|
cTokenStream);
|
1554
|
+
frt_mark_cclass(cAsciiWhiteSpaceTokenizer);
|
1551
1555
|
rb_define_alloc_func(cAsciiWhiteSpaceTokenizer, frt_data_alloc);
|
1552
1556
|
rb_define_method(cAsciiWhiteSpaceTokenizer, "initialize",
|
1553
1557
|
frt_a_whitespace_tokenizer_init, 1);
|
@@ -1567,7 +1571,8 @@ static void Init_AsciiWhiteSpaceTokenizer(void)
|
|
1567
1571
|
static void Init_WhiteSpaceTokenizer(void)
|
1568
1572
|
{
|
1569
1573
|
cWhiteSpaceTokenizer =
|
1570
|
-
|
1574
|
+
rb_define_class_under(mAnalysis, "WhiteSpaceTokenizer", cTokenStream);
|
1575
|
+
frt_mark_cclass(cWhiteSpaceTokenizer);
|
1571
1576
|
rb_define_alloc_func(cWhiteSpaceTokenizer, frt_data_alloc);
|
1572
1577
|
rb_define_method(cWhiteSpaceTokenizer, "initialize",
|
1573
1578
|
frt_whitespace_tokenizer_init, -1);
|
@@ -1588,7 +1593,8 @@ static void Init_WhiteSpaceTokenizer(void)
|
|
1588
1593
|
static void Init_AsciiStandardTokenizer(void)
|
1589
1594
|
{
|
1590
1595
|
cAsciiStandardTokenizer =
|
1591
|
-
|
1596
|
+
rb_define_class_under(mAnalysis, "AsciiStandardTokenizer", cTokenStream);
|
1597
|
+
frt_mark_cclass(cAsciiStandardTokenizer);
|
1592
1598
|
rb_define_alloc_func(cAsciiStandardTokenizer, frt_data_alloc);
|
1593
1599
|
rb_define_method(cAsciiStandardTokenizer, "initialize",
|
1594
1600
|
frt_a_standard_tokenizer_init, 1);
|
@@ -1609,7 +1615,8 @@ static void Init_AsciiStandardTokenizer(void)
|
|
1609
1615
|
static void Init_StandardTokenizer(void)
|
1610
1616
|
{
|
1611
1617
|
cStandardTokenizer =
|
1612
|
-
|
1618
|
+
rb_define_class_under(mAnalysis, "StandardTokenizer", cTokenStream);
|
1619
|
+
frt_mark_cclass(cStandardTokenizer);
|
1613
1620
|
rb_define_alloc_func(cStandardTokenizer, frt_data_alloc);
|
1614
1621
|
rb_define_method(cStandardTokenizer, "initialize",
|
1615
1622
|
frt_standard_tokenizer_init, 1);
|
@@ -1636,7 +1643,8 @@ static void Init_StandardTokenizer(void)
|
|
1636
1643
|
static void Init_RegExpTokenizer(void)
|
1637
1644
|
{
|
1638
1645
|
cRegExpTokenizer =
|
1639
|
-
|
1646
|
+
rb_define_class_under(mAnalysis, "RegExpTokenizer", cTokenStream);
|
1647
|
+
frt_mark_cclass(cRegExpTokenizer);
|
1640
1648
|
rtoken_re = rb_reg_new(TOKEN_RE, strlen(TOKEN_RE), 0);
|
1641
1649
|
rb_define_const(cRegExpTokenizer, "REGEXP", rtoken_re);
|
1642
1650
|
rb_define_alloc_func(cRegExpTokenizer, frt_data_alloc);
|
@@ -1664,7 +1672,8 @@ static void Init_RegExpTokenizer(void)
|
|
1664
1672
|
static void Init_AsciiLowerCaseFilter(void)
|
1665
1673
|
{
|
1666
1674
|
cAsciiLowerCaseFilter =
|
1667
|
-
|
1675
|
+
rb_define_class_under(mAnalysis, "AsciiLowerCaseFilter", cTokenStream);
|
1676
|
+
frt_mark_cclass(cAsciiLowerCaseFilter);
|
1668
1677
|
rb_define_alloc_func(cAsciiLowerCaseFilter, frt_data_alloc);
|
1669
1678
|
rb_define_method(cAsciiLowerCaseFilter, "initialize",
|
1670
1679
|
frt_a_lowercase_filter_init, 1);
|
@@ -1684,7 +1693,8 @@ static void Init_AsciiLowerCaseFilter(void)
|
|
1684
1693
|
static void Init_LowerCaseFilter(void)
|
1685
1694
|
{
|
1686
1695
|
cLowerCaseFilter =
|
1687
|
-
|
1696
|
+
rb_define_class_under(mAnalysis, "LowerCaseFilter", cTokenStream);
|
1697
|
+
frt_mark_cclass(cLowerCaseFilter);
|
1688
1698
|
rb_define_alloc_func(cLowerCaseFilter, frt_data_alloc);
|
1689
1699
|
rb_define_method(cLowerCaseFilter, "initialize",
|
1690
1700
|
frt_lowercase_filter_init, 1);
|
@@ -1706,7 +1716,8 @@ static void Init_LowerCaseFilter(void)
|
|
1706
1716
|
static void Init_HyphenFilter(void)
|
1707
1717
|
{
|
1708
1718
|
cHyphenFilter =
|
1709
|
-
|
1719
|
+
rb_define_class_under(mAnalysis, "HyphenFilter", cTokenStream);
|
1720
|
+
frt_mark_cclass(cHyphenFilter);
|
1710
1721
|
rb_define_alloc_func(cHyphenFilter, frt_data_alloc);
|
1711
1722
|
rb_define_method(cHyphenFilter, "initialize", frt_hyphen_filter_init, 1);
|
1712
1723
|
}
|
@@ -1725,7 +1736,8 @@ static void Init_HyphenFilter(void)
|
|
1725
1736
|
static void Init_StopFilter(void)
|
1726
1737
|
{
|
1727
1738
|
cStopFilter =
|
1728
|
-
|
1739
|
+
rb_define_class_under(mAnalysis, "StopFilter", cTokenStream);
|
1740
|
+
frt_mark_cclass(cStopFilter);
|
1729
1741
|
rb_define_alloc_func(cStopFilter, frt_data_alloc);
|
1730
1742
|
rb_define_method(cStopFilter, "initialize",
|
1731
1743
|
frt_stop_filter_init, -1);
|
@@ -1738,14 +1750,10 @@ static void Init_StopFilter(void)
|
|
1738
1750
|
*
|
1739
1751
|
* A StemFilter takes a term and transforms the term as per the SnowBall
|
1740
1752
|
* stemming algorithm. Note: the input to the stemming filter must already
|
1741
|
-
* be in lower case, so you will need to use LowerCaseFilter or
|
1742
|
-
*
|
1743
|
-
*
|
1753
|
+
* be in lower case, so you will need to use a LowerCaseFilter or a lowercasing
|
1754
|
+
* Tokenizer further down the Tokenizer chain in order for this to work
|
1755
|
+
* properly!
|
1744
1756
|
*
|
1745
|
-
* To use this filter with other analyzers, you'll want to write an Analyzer
|
1746
|
-
* class that sets up the TokenStream chain as you want it. To use this with
|
1747
|
-
* LowerCaseTokenizer, for example, you'd write an analyzer like this:
|
1748
|
-
*
|
1749
1757
|
* === Available algorithms and encodings
|
1750
1758
|
*
|
1751
1759
|
* Algorithm Algorithm Pseudonyms Encoding
|
@@ -1766,6 +1774,10 @@ static void Init_StopFilter(void)
|
|
1766
1774
|
*
|
1767
1775
|
* === Example
|
1768
1776
|
*
|
1777
|
+
* To use this filter with other analyzers, you'll want to write an Analyzer
|
1778
|
+
* class that sets up the TokenStream chain as you want it. To use this with
|
1779
|
+
* a lowercasing Tokenizer, for example, you'd write an analyzer like this:
|
1780
|
+
*
|
1769
1781
|
* class MyAnalyzer < Analyzer
|
1770
1782
|
* def token_stream(field, str)
|
1771
1783
|
* return StemFilter.new(LowerCaseFilter.new(StandardTokenizer.new(str)))
|
@@ -1784,7 +1796,8 @@ static void Init_StopFilter(void)
|
|
1784
1796
|
static void Init_StemFilter(void)
|
1785
1797
|
{
|
1786
1798
|
cStemFilter =
|
1787
|
-
|
1799
|
+
rb_define_class_under(mAnalysis, "StemFilter", cTokenStream);
|
1800
|
+
frt_mark_cclass(cStemFilter);
|
1788
1801
|
rb_define_alloc_func(cStemFilter, frt_data_alloc);
|
1789
1802
|
rb_define_method(cStemFilter, "initialize",
|
1790
1803
|
frt_stem_filter_init, -1);
|
@@ -1827,7 +1840,8 @@ static void Init_StemFilter(void)
|
|
1827
1840
|
static void Init_Analyzer(void)
|
1828
1841
|
{
|
1829
1842
|
cAnalyzer =
|
1830
|
-
|
1843
|
+
rb_define_class_under(mAnalysis, "Analyzer", rb_cObject);
|
1844
|
+
frt_mark_cclass(cAnalyzer);
|
1831
1845
|
rb_define_alloc_func(cAnalyzer, frt_data_alloc);
|
1832
1846
|
rb_define_method(cAnalyzer, "initialize", frt_letter_analyzer_init, -1);
|
1833
1847
|
rb_define_method(cAnalyzer, "token_stream", frt_analyzer_token_stream, 2);
|
@@ -1864,7 +1878,8 @@ static void Init_Analyzer(void)
|
|
1864
1878
|
static void Init_AsciiLetterAnalyzer(void)
|
1865
1879
|
{
|
1866
1880
|
cAsciiLetterAnalyzer =
|
1867
|
-
|
1881
|
+
rb_define_class_under(mAnalysis, "AsciiLetterAnalyzer", cAnalyzer);
|
1882
|
+
frt_mark_cclass(cAsciiLetterAnalyzer);
|
1868
1883
|
rb_define_alloc_func(cAsciiLetterAnalyzer, frt_data_alloc);
|
1869
1884
|
rb_define_method(cAsciiLetterAnalyzer, "initialize",
|
1870
1885
|
frt_a_letter_analyzer_init, -1);
|
@@ -1894,7 +1909,8 @@ static void Init_AsciiLetterAnalyzer(void)
|
|
1894
1909
|
static void Init_LetterAnalyzer(void)
|
1895
1910
|
{
|
1896
1911
|
cLetterAnalyzer =
|
1897
|
-
|
1912
|
+
rb_define_class_under(mAnalysis, "LetterAnalyzer", cAnalyzer);
|
1913
|
+
frt_mark_cclass(cLetterAnalyzer);
|
1898
1914
|
rb_define_alloc_func(cLetterAnalyzer, frt_data_alloc);
|
1899
1915
|
rb_define_method(cLetterAnalyzer, "initialize",
|
1900
1916
|
frt_letter_analyzer_init, -1);
|
@@ -1930,7 +1946,8 @@ static void Init_LetterAnalyzer(void)
|
|
1930
1946
|
static void Init_AsciiWhiteSpaceAnalyzer(void)
|
1931
1947
|
{
|
1932
1948
|
cAsciiWhiteSpaceAnalyzer =
|
1933
|
-
|
1949
|
+
rb_define_class_under(mAnalysis, "AsciiWhiteSpaceAnalyzer", cAnalyzer);
|
1950
|
+
frt_mark_cclass(cAsciiWhiteSpaceAnalyzer);
|
1934
1951
|
rb_define_alloc_func(cAsciiWhiteSpaceAnalyzer, frt_data_alloc);
|
1935
1952
|
rb_define_method(cAsciiWhiteSpaceAnalyzer, "initialize",
|
1936
1953
|
frt_a_white_space_analyzer_init, -1);
|
@@ -1960,7 +1977,8 @@ static void Init_AsciiWhiteSpaceAnalyzer(void)
|
|
1960
1977
|
static void Init_WhiteSpaceAnalyzer(void)
|
1961
1978
|
{
|
1962
1979
|
cWhiteSpaceAnalyzer =
|
1963
|
-
|
1980
|
+
rb_define_class_under(mAnalysis, "WhiteSpaceAnalyzer", cAnalyzer);
|
1981
|
+
frt_mark_cclass(cWhiteSpaceAnalyzer);
|
1964
1982
|
rb_define_alloc_func(cWhiteSpaceAnalyzer, frt_data_alloc);
|
1965
1983
|
rb_define_method(cWhiteSpaceAnalyzer, "initialize",
|
1966
1984
|
frt_white_space_analyzer_init, -1);
|
@@ -1975,18 +1993,16 @@ static void Init_WhiteSpaceAnalyzer(void)
|
|
1975
1993
|
* ascii-analyzers. If it were implemented in Ruby it would look like this:
|
1976
1994
|
*
|
1977
1995
|
* class AsciiStandardAnalyzer
|
1978
|
-
* def initialize(
|
1996
|
+
* def initialize(stop_words = ENGLISH_STOP_WORDS, lower = true)
|
1979
1997
|
* @lower = lower
|
1980
1998
|
* @stop_words = stop_words
|
1981
1999
|
* end
|
1982
2000
|
*
|
1983
2001
|
* def token_stream(field, str)
|
1984
|
-
*
|
1985
|
-
*
|
1986
|
-
*
|
1987
|
-
*
|
1988
|
-
* return StopFilter.new(AsciiStandardTokenizer.new(str), @stop_words)
|
1989
|
-
* end
|
2002
|
+
* ts = AsciiStandardTokenizer.new(str)
|
2003
|
+
* ts = AsciiLowerCaseFilter.new(ts) if @lower
|
2004
|
+
* ts = StopFilter.new(ts, @stop_words)
|
2005
|
+
* ts = HyphenFilter.new(ts)
|
1990
2006
|
* end
|
1991
2007
|
* end
|
1992
2008
|
*
|
@@ -1998,7 +2014,8 @@ static void Init_WhiteSpaceAnalyzer(void)
|
|
1998
2014
|
static void Init_AsciiStandardAnalyzer(void)
|
1999
2015
|
{
|
2000
2016
|
cAsciiStandardAnalyzer =
|
2001
|
-
|
2017
|
+
rb_define_class_under(mAnalysis, "AsciiStandardAnalyzer", cAnalyzer);
|
2018
|
+
frt_mark_cclass(cAsciiStandardAnalyzer);
|
2002
2019
|
rb_define_alloc_func(cAsciiStandardAnalyzer, frt_data_alloc);
|
2003
2020
|
rb_define_method(cAsciiStandardAnalyzer, "initialize",
|
2004
2021
|
frt_a_standard_analyzer_init, -1);
|
@@ -2013,13 +2030,16 @@ static void Init_AsciiStandardAnalyzer(void)
|
|
2013
2030
|
* it were implemented in Ruby it would look like this:
|
2014
2031
|
*
|
2015
2032
|
* class StandardAnalyzer
|
2016
|
-
* def initialize(
|
2033
|
+
* def initialize(stop_words = ENGLISH_STOP_WORDS, lower = true)
|
2017
2034
|
* @lower = lower
|
2018
2035
|
* @stop_words = stop_words
|
2019
2036
|
* end
|
2020
2037
|
*
|
2021
2038
|
* def token_stream(field, str)
|
2022
|
-
*
|
2039
|
+
* ts = StandardTokenizer.new(str)
|
2040
|
+
* ts = LowerCaseFilter.new(ts) if @lower
|
2041
|
+
* ts = StopFilter.new(ts, @stop_words)
|
2042
|
+
* ts = HyphenFilter.new(ts)
|
2023
2043
|
* end
|
2024
2044
|
* end
|
2025
2045
|
*
|
@@ -2029,7 +2049,8 @@ static void Init_AsciiStandardAnalyzer(void)
|
|
2029
2049
|
static void Init_StandardAnalyzer(void)
|
2030
2050
|
{
|
2031
2051
|
cStandardAnalyzer =
|
2032
|
-
|
2052
|
+
rb_define_class_under(mAnalysis, "StandardAnalyzer", cAnalyzer);
|
2053
|
+
frt_mark_cclass(cStandardAnalyzer);
|
2033
2054
|
rb_define_alloc_func(cStandardAnalyzer, frt_data_alloc);
|
2034
2055
|
rb_define_method(cStandardAnalyzer, "initialize",
|
2035
2056
|
frt_standard_analyzer_init, -1);
|
@@ -2058,7 +2079,8 @@ static void Init_StandardAnalyzer(void)
|
|
2058
2079
|
static void Init_PerFieldAnalyzer(void)
|
2059
2080
|
{
|
2060
2081
|
cPerFieldAnalyzer =
|
2061
|
-
|
2082
|
+
rb_define_class_under(mAnalysis, "PerFieldAnalyzer", cAnalyzer);
|
2083
|
+
frt_mark_cclass(cPerFieldAnalyzer);
|
2062
2084
|
rb_define_alloc_func(cPerFieldAnalyzer, frt_data_alloc);
|
2063
2085
|
rb_define_method(cPerFieldAnalyzer, "initialize",
|
2064
2086
|
frt_per_field_analyzer_init, 1);
|
@@ -2098,7 +2120,8 @@ static void Init_PerFieldAnalyzer(void)
|
|
2098
2120
|
static void Init_RegExpAnalyzer(void)
|
2099
2121
|
{
|
2100
2122
|
cRegExpAnalyzer =
|
2101
|
-
|
2123
|
+
rb_define_class_under(mAnalysis, "RegExpAnalyzer", cAnalyzer);
|
2124
|
+
frt_mark_cclass(cRegExpAnalyzer);
|
2102
2125
|
rb_define_alloc_func(cRegExpAnalyzer, frt_data_alloc);
|
2103
2126
|
rb_define_method(cRegExpAnalyzer, "initialize",
|
2104
2127
|
frt_re_analyzer_init, -1);
|
data/ext/r_index.c
CHANGED
@@ -94,10 +94,12 @@ frt_fi_get_params(VALUE roptions,
|
|
94
94
|
if (Qnil != v) Check_Type(v, T_SYMBOL);
|
95
95
|
if (v == sym_no || v == sym_false || v == Qfalse) {
|
96
96
|
*store = STORE_NO;
|
97
|
-
} else if (v == sym_yes || v == sym_true || v == Qtrue
|
97
|
+
} else if (v == sym_yes || v == sym_true || v == Qtrue) {
|
98
98
|
*store = STORE_YES;
|
99
99
|
} else if (v == sym_compress || v == sym_compressed) {
|
100
100
|
*store = STORE_COMPRESS;
|
101
|
+
} else if (v == Qnil) {
|
102
|
+
/* leave as default */
|
101
103
|
} else {
|
102
104
|
rb_raise(rb_eArgError, ":%s isn't a valid argument for :store."
|
103
105
|
" Please choose from [:yes, :no, :compressed]",
|
@@ -108,7 +110,7 @@ frt_fi_get_params(VALUE roptions,
|
|
108
110
|
if (Qnil != v) Check_Type(v, T_SYMBOL);
|
109
111
|
if (v == sym_no || v == sym_false || v == Qfalse) {
|
110
112
|
*index = INDEX_NO;
|
111
|
-
} else if (v == sym_yes || v == sym_true || v == Qtrue
|
113
|
+
} else if (v == sym_yes || v == sym_true || v == Qtrue) {
|
112
114
|
*index = INDEX_YES;
|
113
115
|
} else if (v == sym_untokenized) {
|
114
116
|
*index = INDEX_UNTOKENIZED;
|
@@ -116,6 +118,8 @@ frt_fi_get_params(VALUE roptions,
|
|
116
118
|
*index = INDEX_YES_OMIT_NORMS;
|
117
119
|
} else if (v == sym_untokenized_omit_norms) {
|
118
120
|
*index = INDEX_UNTOKENIZED_OMIT_NORMS;
|
121
|
+
} else if (v == Qnil) {
|
122
|
+
/* leave as default */
|
119
123
|
} else {
|
120
124
|
rb_raise(rb_eArgError, ":%s isn't a valid argument for :index."
|
121
125
|
" Please choose from [:no, :yes, :untokenized, "
|
@@ -133,8 +137,10 @@ frt_fi_get_params(VALUE roptions,
|
|
133
137
|
*term_vector = TERM_VECTOR_WITH_POSITIONS;
|
134
138
|
} else if (v == sym_with_offsets) {
|
135
139
|
*term_vector = TERM_VECTOR_WITH_OFFSETS;
|
136
|
-
} else if (v == sym_with_positions_offsets
|
140
|
+
} else if (v == sym_with_positions_offsets) {
|
137
141
|
*term_vector = TERM_VECTOR_WITH_POSITIONS_OFFSETS;
|
142
|
+
} else if (v == Qnil) {
|
143
|
+
/* leave as default */
|
138
144
|
} else {
|
139
145
|
rb_raise(rb_eArgError, ":%s isn't a valid argument for "
|
140
146
|
":term_vector. Please choose from [:no, :yes, "
|
@@ -507,9 +513,9 @@ frt_fis_add_field(int argc, VALUE *argv, VALUE self)
|
|
507
513
|
{
|
508
514
|
FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
|
509
515
|
FieldInfo *fi;
|
510
|
-
enum StoreValues store =
|
511
|
-
enum IndexValues index =
|
512
|
-
enum TermVectorValues term_vector =
|
516
|
+
enum StoreValues store = fis->store;
|
517
|
+
enum IndexValues index = fis->index;
|
518
|
+
enum TermVectorValues term_vector = fis->term_vector;
|
513
519
|
float boost = 1.0f;
|
514
520
|
VALUE rname, roptions;
|
515
521
|
|
@@ -2134,6 +2140,21 @@ frt_ir_undelete_all(VALUE self)
|
|
2134
2140
|
return self;
|
2135
2141
|
}
|
2136
2142
|
|
2143
|
+
static VALUE
|
2144
|
+
frt_get_doc_range(IndexReader *ir, int pos, int len, int max)
|
2145
|
+
{
|
2146
|
+
VALUE ary;
|
2147
|
+
int i;
|
2148
|
+
max = min2(max, pos+len);
|
2149
|
+
len = max - pos;
|
2150
|
+
ary = rb_ary_new2(len);
|
2151
|
+
for (i = 0; i < len; i++) {
|
2152
|
+
RARRAY(ary)->ptr[i] = frt_get_lazy_doc(ir->get_lazy_doc(ir, i + pos));
|
2153
|
+
}
|
2154
|
+
RARRAY(ary)->len = len;
|
2155
|
+
return ary;
|
2156
|
+
}
|
2157
|
+
|
2137
2158
|
/*
|
2138
2159
|
* call-seq:
|
2139
2160
|
* index_reader.get_document(doc_id) -> LazyDoc
|
@@ -2144,10 +2165,43 @@ frt_ir_undelete_all(VALUE self)
|
|
2144
2165
|
* which are returned by the Searchers search methods.
|
2145
2166
|
*/
|
2146
2167
|
static VALUE
|
2147
|
-
frt_ir_get_doc(VALUE
|
2168
|
+
frt_ir_get_doc(int argc, VALUE *argv, VALUE self)
|
2148
2169
|
{
|
2149
2170
|
IndexReader *ir = (IndexReader *)DATA_PTR(self);
|
2150
|
-
|
2171
|
+
VALUE arg1, arg2;
|
2172
|
+
long pos, len;
|
2173
|
+
long max = ir->max_doc(ir);
|
2174
|
+
rb_scan_args(argc, argv, "11", &arg1, &arg2);
|
2175
|
+
if (argc == 1) {
|
2176
|
+
if (FIXNUM_P(arg1)) {
|
2177
|
+
pos = FIX2INT(arg1);
|
2178
|
+
pos = (pos < 0) ? (max + pos) : pos;
|
2179
|
+
if (pos < 0 || pos >= max) {
|
2180
|
+
rb_raise(rb_eArgError, ":%d is out of range [%d..%d] for "
|
2181
|
+
"IndexWriter#[]", pos, 0, max,
|
2182
|
+
rb_id2name(SYM2ID(argv)));
|
2183
|
+
}
|
2184
|
+
return frt_get_lazy_doc(ir->get_lazy_doc(ir, pos));
|
2185
|
+
}
|
2186
|
+
|
2187
|
+
/* check if idx is Range */
|
2188
|
+
switch (rb_range_beg_len(arg1, &pos, &len, max, 0)) {
|
2189
|
+
case Qfalse:
|
2190
|
+
rb_raise(rb_eArgError, ":%s isn't a valid argument for "
|
2191
|
+
"IndexReader.get_document(index)",
|
2192
|
+
rb_id2name(SYM2ID(argv)));
|
2193
|
+
case Qnil:
|
2194
|
+
return Qnil;
|
2195
|
+
default:
|
2196
|
+
return frt_get_doc_range(ir, pos, len, max);
|
2197
|
+
}
|
2198
|
+
}
|
2199
|
+
else {
|
2200
|
+
pos = FIX2LONG(arg1);
|
2201
|
+
len = FIX2LONG(arg2);
|
2202
|
+
return frt_get_doc_range(ir, pos, len, max);
|
2203
|
+
}
|
2204
|
+
return Qnil;
|
2151
2205
|
}
|
2152
2206
|
|
2153
2207
|
/*
|
@@ -3043,8 +3097,8 @@ Init_IndexReader(void)
|
|
3043
3097
|
rb_define_method(cIndexReader, "num_docs", frt_ir_num_docs, 0);
|
3044
3098
|
rb_define_method(cIndexReader, "undelete_all", frt_ir_undelete_all, 0);
|
3045
3099
|
rb_define_method(cIndexReader, "latest?", frt_ir_is_latest, 0);
|
3046
|
-
rb_define_method(cIndexReader, "get_document", frt_ir_get_doc, 1);
|
3047
|
-
rb_define_method(cIndexReader, "[]", frt_ir_get_doc, 1);
|
3100
|
+
rb_define_method(cIndexReader, "get_document", frt_ir_get_doc, -1);
|
3101
|
+
rb_define_method(cIndexReader, "[]", frt_ir_get_doc, -1);
|
3048
3102
|
rb_define_method(cIndexReader, "term_vector", frt_ir_term_vector, 2);
|
3049
3103
|
rb_define_method(cIndexReader, "term_vectors", frt_ir_term_vectors, 1);
|
3050
3104
|
rb_define_method(cIndexReader, "term_docs", frt_ir_term_docs, 0);
|