ferret 0.10.4 → 0.10.5
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +1 -1
- data/ext/analysis.c +7 -1
- data/ext/bitvector.c +5 -2
- data/ext/bitvector.h +1 -0
- data/ext/ferret.c +55 -8
- data/ext/ferret.h +8 -2
- data/ext/index.c +34 -43
- data/ext/index.h +1 -1
- data/ext/q_boolean.c +1 -1
- data/ext/q_multi_term.c +13 -1
- data/ext/q_parser.c +33 -18
- data/ext/r_analysis.c +68 -45
- data/ext/r_index.c +64 -10
- data/ext/r_search.c +145 -10
- data/ext/search.c +71 -12
- data/lib/ferret/index.rb +42 -28
- data/lib/ferret_version.rb +1 -1
- data/test/unit/analysis/tc_analyzer.rb +1 -1
- data/test/unit/analysis/tc_token_stream.rb +0 -1
- data/test/unit/index/tc_index.rb +3 -3
- data/test/unit/index/tc_index_reader.rb +5 -0
- data/test/unit/search/tc_filter.rb +15 -0
- data/test/unit/search/tm_searcher.rb +13 -2
- metadata +2 -2
data/ext/r_analysis.c
CHANGED
@@ -569,7 +569,7 @@ static TokenStream *
|
|
569
569
|
frt_get_cwrapped_rts(VALUE rts)
|
570
570
|
{
|
571
571
|
TokenStream *ts;
|
572
|
-
if (
|
572
|
+
if (frt_is_cclass(rts) && DATA_PTR(rts)) {
|
573
573
|
GET_TS(ts, rts);
|
574
574
|
REF(ts);
|
575
575
|
}
|
@@ -972,7 +972,7 @@ frt_stop_filter_init(int argc, VALUE *argv, VALUE self)
|
|
972
972
|
* StemFilter.new(token_stream) -> token_stream
|
973
973
|
* StemFilter.new(token_stream,
|
974
974
|
* algorithm="english",
|
975
|
-
* encoding=
|
975
|
+
* encoding="UTF-8") -> token_stream
|
976
976
|
*
|
977
977
|
* Create a StemFilter which uses a snowball stemmer (thank you, Martin
|
978
978
|
* Porter) to stem words. You can optionally specify the algorithm (default:
|
@@ -1034,7 +1034,7 @@ static TokenStream *
|
|
1034
1034
|
cwa_get_ts(Analyzer *a, char *field, char *text)
|
1035
1035
|
{
|
1036
1036
|
VALUE rts = rb_funcall(CWA(a)->ranalyzer, id_token_stream, 2,
|
1037
|
-
|
1037
|
+
rb_intern(field), rb_str_new2(text));
|
1038
1038
|
return frt_get_cwrapped_rts(rts);
|
1039
1039
|
}
|
1040
1040
|
|
@@ -1042,7 +1042,7 @@ Analyzer *
|
|
1042
1042
|
frt_get_cwrapped_analyzer(VALUE ranalyzer)
|
1043
1043
|
{
|
1044
1044
|
Analyzer *a = NULL;
|
1045
|
-
if (
|
1045
|
+
if (frt_is_cclass(ranalyzer) && DATA_PTR(ranalyzer)) {
|
1046
1046
|
Data_Get_Struct(ranalyzer, Analyzer, a);
|
1047
1047
|
REF(a);
|
1048
1048
|
}
|
@@ -1230,7 +1230,7 @@ frt_a_standard_analyzer_init(int argc, VALUE *argv, VALUE self)
|
|
1230
1230
|
bool lower;
|
1231
1231
|
VALUE rlower, rstop_words;
|
1232
1232
|
Analyzer *a;
|
1233
|
-
rb_scan_args(argc, argv, "02", &
|
1233
|
+
rb_scan_args(argc, argv, "02", &rstop_words, &rlower);
|
1234
1234
|
lower = ((rlower == Qnil) ? true : RTEST(rlower));
|
1235
1235
|
if (rstop_words != Qnil) {
|
1236
1236
|
char **stop_words = get_stopwords(rstop_words);
|
@@ -1246,7 +1246,7 @@ frt_a_standard_analyzer_init(int argc, VALUE *argv, VALUE self)
|
|
1246
1246
|
|
1247
1247
|
/*
|
1248
1248
|
* call-seq:
|
1249
|
-
* StandardAnalyzer.new(
|
1249
|
+
* StandardAnalyzer.new(stop_words=ENGLISH_STOP_WORDS, lower=true)
|
1250
1250
|
* -> analyzer
|
1251
1251
|
*
|
1252
1252
|
* Create a new StandardAnalyzer which downcases tokens by default but can
|
@@ -1330,7 +1330,7 @@ frt_per_field_analyzer_add_field(VALUE self, VALUE rfield, VALUE ranalyzer)
|
|
1330
1330
|
Data_Get_Struct(self, Analyzer, pfa);
|
1331
1331
|
a = frt_get_cwrapped_analyzer(ranalyzer);
|
1332
1332
|
|
1333
|
-
pfa_add_field(pfa,
|
1333
|
+
pfa_add_field(pfa, frt_field(rfield), a);
|
1334
1334
|
return self;
|
1335
1335
|
}
|
1336
1336
|
|
@@ -1483,7 +1483,8 @@ static void Init_Token(void)
|
|
1483
1483
|
*/
|
1484
1484
|
static void Init_TokenStream(void)
|
1485
1485
|
{
|
1486
|
-
cTokenStream =
|
1486
|
+
cTokenStream = rb_define_class_under(mAnalysis, "TokenStream", rb_cObject);
|
1487
|
+
frt_mark_cclass(cTokenStream);
|
1487
1488
|
rb_define_method(cTokenStream, "next", frt_ts_next, 0);
|
1488
1489
|
rb_define_method(cTokenStream, "text=", frt_ts_set_text, 1);
|
1489
1490
|
rb_define_method(cTokenStream, "text", frt_ts_get_text, 0);
|
@@ -1504,7 +1505,8 @@ static void Init_TokenStream(void)
|
|
1504
1505
|
static void Init_AsciiLetterTokenizer(void)
|
1505
1506
|
{
|
1506
1507
|
cAsciiLetterTokenizer =
|
1507
|
-
|
1508
|
+
rb_define_class_under(mAnalysis, "AsciiLetterTokenizer", cTokenStream);
|
1509
|
+
frt_mark_cclass(cAsciiLetterTokenizer);
|
1508
1510
|
rb_define_alloc_func(cAsciiLetterTokenizer, frt_data_alloc);
|
1509
1511
|
rb_define_method(cAsciiLetterTokenizer, "initialize",
|
1510
1512
|
frt_a_letter_tokenizer_init, 1);
|
@@ -1526,7 +1528,8 @@ static void Init_AsciiLetterTokenizer(void)
|
|
1526
1528
|
static void Init_LetterTokenizer(void)
|
1527
1529
|
{
|
1528
1530
|
cLetterTokenizer =
|
1529
|
-
|
1531
|
+
rb_define_class_under(mAnalysis, "LetterTokenizer", cTokenStream);
|
1532
|
+
frt_mark_cclass(cLetterTokenizer);
|
1530
1533
|
rb_define_alloc_func(cLetterTokenizer, frt_data_alloc);
|
1531
1534
|
rb_define_method(cLetterTokenizer, "initialize",
|
1532
1535
|
frt_letter_tokenizer_init, -1);
|
@@ -1546,8 +1549,9 @@ static void Init_LetterTokenizer(void)
|
|
1546
1549
|
static void Init_AsciiWhiteSpaceTokenizer(void)
|
1547
1550
|
{
|
1548
1551
|
cAsciiWhiteSpaceTokenizer =
|
1549
|
-
|
1552
|
+
rb_define_class_under(mAnalysis, "AsciiWhiteSpaceTokenizer",
|
1550
1553
|
cTokenStream);
|
1554
|
+
frt_mark_cclass(cAsciiWhiteSpaceTokenizer);
|
1551
1555
|
rb_define_alloc_func(cAsciiWhiteSpaceTokenizer, frt_data_alloc);
|
1552
1556
|
rb_define_method(cAsciiWhiteSpaceTokenizer, "initialize",
|
1553
1557
|
frt_a_whitespace_tokenizer_init, 1);
|
@@ -1567,7 +1571,8 @@ static void Init_AsciiWhiteSpaceTokenizer(void)
|
|
1567
1571
|
static void Init_WhiteSpaceTokenizer(void)
|
1568
1572
|
{
|
1569
1573
|
cWhiteSpaceTokenizer =
|
1570
|
-
|
1574
|
+
rb_define_class_under(mAnalysis, "WhiteSpaceTokenizer", cTokenStream);
|
1575
|
+
frt_mark_cclass(cWhiteSpaceTokenizer);
|
1571
1576
|
rb_define_alloc_func(cWhiteSpaceTokenizer, frt_data_alloc);
|
1572
1577
|
rb_define_method(cWhiteSpaceTokenizer, "initialize",
|
1573
1578
|
frt_whitespace_tokenizer_init, -1);
|
@@ -1588,7 +1593,8 @@ static void Init_WhiteSpaceTokenizer(void)
|
|
1588
1593
|
static void Init_AsciiStandardTokenizer(void)
|
1589
1594
|
{
|
1590
1595
|
cAsciiStandardTokenizer =
|
1591
|
-
|
1596
|
+
rb_define_class_under(mAnalysis, "AsciiStandardTokenizer", cTokenStream);
|
1597
|
+
frt_mark_cclass(cAsciiStandardTokenizer);
|
1592
1598
|
rb_define_alloc_func(cAsciiStandardTokenizer, frt_data_alloc);
|
1593
1599
|
rb_define_method(cAsciiStandardTokenizer, "initialize",
|
1594
1600
|
frt_a_standard_tokenizer_init, 1);
|
@@ -1609,7 +1615,8 @@ static void Init_AsciiStandardTokenizer(void)
|
|
1609
1615
|
static void Init_StandardTokenizer(void)
|
1610
1616
|
{
|
1611
1617
|
cStandardTokenizer =
|
1612
|
-
|
1618
|
+
rb_define_class_under(mAnalysis, "StandardTokenizer", cTokenStream);
|
1619
|
+
frt_mark_cclass(cStandardTokenizer);
|
1613
1620
|
rb_define_alloc_func(cStandardTokenizer, frt_data_alloc);
|
1614
1621
|
rb_define_method(cStandardTokenizer, "initialize",
|
1615
1622
|
frt_standard_tokenizer_init, 1);
|
@@ -1636,7 +1643,8 @@ static void Init_StandardTokenizer(void)
|
|
1636
1643
|
static void Init_RegExpTokenizer(void)
|
1637
1644
|
{
|
1638
1645
|
cRegExpTokenizer =
|
1639
|
-
|
1646
|
+
rb_define_class_under(mAnalysis, "RegExpTokenizer", cTokenStream);
|
1647
|
+
frt_mark_cclass(cRegExpTokenizer);
|
1640
1648
|
rtoken_re = rb_reg_new(TOKEN_RE, strlen(TOKEN_RE), 0);
|
1641
1649
|
rb_define_const(cRegExpTokenizer, "REGEXP", rtoken_re);
|
1642
1650
|
rb_define_alloc_func(cRegExpTokenizer, frt_data_alloc);
|
@@ -1664,7 +1672,8 @@ static void Init_RegExpTokenizer(void)
|
|
1664
1672
|
static void Init_AsciiLowerCaseFilter(void)
|
1665
1673
|
{
|
1666
1674
|
cAsciiLowerCaseFilter =
|
1667
|
-
|
1675
|
+
rb_define_class_under(mAnalysis, "AsciiLowerCaseFilter", cTokenStream);
|
1676
|
+
frt_mark_cclass(cAsciiLowerCaseFilter);
|
1668
1677
|
rb_define_alloc_func(cAsciiLowerCaseFilter, frt_data_alloc);
|
1669
1678
|
rb_define_method(cAsciiLowerCaseFilter, "initialize",
|
1670
1679
|
frt_a_lowercase_filter_init, 1);
|
@@ -1684,7 +1693,8 @@ static void Init_AsciiLowerCaseFilter(void)
|
|
1684
1693
|
static void Init_LowerCaseFilter(void)
|
1685
1694
|
{
|
1686
1695
|
cLowerCaseFilter =
|
1687
|
-
|
1696
|
+
rb_define_class_under(mAnalysis, "LowerCaseFilter", cTokenStream);
|
1697
|
+
frt_mark_cclass(cLowerCaseFilter);
|
1688
1698
|
rb_define_alloc_func(cLowerCaseFilter, frt_data_alloc);
|
1689
1699
|
rb_define_method(cLowerCaseFilter, "initialize",
|
1690
1700
|
frt_lowercase_filter_init, 1);
|
@@ -1706,7 +1716,8 @@ static void Init_LowerCaseFilter(void)
|
|
1706
1716
|
static void Init_HyphenFilter(void)
|
1707
1717
|
{
|
1708
1718
|
cHyphenFilter =
|
1709
|
-
|
1719
|
+
rb_define_class_under(mAnalysis, "HyphenFilter", cTokenStream);
|
1720
|
+
frt_mark_cclass(cHyphenFilter);
|
1710
1721
|
rb_define_alloc_func(cHyphenFilter, frt_data_alloc);
|
1711
1722
|
rb_define_method(cHyphenFilter, "initialize", frt_hyphen_filter_init, 1);
|
1712
1723
|
}
|
@@ -1725,7 +1736,8 @@ static void Init_HyphenFilter(void)
|
|
1725
1736
|
static void Init_StopFilter(void)
|
1726
1737
|
{
|
1727
1738
|
cStopFilter =
|
1728
|
-
|
1739
|
+
rb_define_class_under(mAnalysis, "StopFilter", cTokenStream);
|
1740
|
+
frt_mark_cclass(cStopFilter);
|
1729
1741
|
rb_define_alloc_func(cStopFilter, frt_data_alloc);
|
1730
1742
|
rb_define_method(cStopFilter, "initialize",
|
1731
1743
|
frt_stop_filter_init, -1);
|
@@ -1738,14 +1750,10 @@ static void Init_StopFilter(void)
|
|
1738
1750
|
*
|
1739
1751
|
* A StemFilter takes a term and transforms the term as per the SnowBall
|
1740
1752
|
* stemming algorithm. Note: the input to the stemming filter must already
|
1741
|
-
* be in lower case, so you will need to use LowerCaseFilter or
|
1742
|
-
*
|
1743
|
-
*
|
1753
|
+
* be in lower case, so you will need to use LowerCaseFilter or lowercasing
|
1754
|
+
* Tokenizer further down the Tokenizer chain in order for this to work
|
1755
|
+
* properly!
|
1744
1756
|
*
|
1745
|
-
* To use this filter with other analyzers, you'll want to write an Analyzer
|
1746
|
-
* class that sets up the TokenStream chain as you want it. To use this with
|
1747
|
-
* LowerCaseTokenizer, for example, you'd write an analyzer like this:
|
1748
|
-
*
|
1749
1757
|
* === Available algorithms and encodings
|
1750
1758
|
*
|
1751
1759
|
* Algorithm Algorithm Pseudonyms Encoding
|
@@ -1766,6 +1774,10 @@ static void Init_StopFilter(void)
|
|
1766
1774
|
*
|
1767
1775
|
* === Example
|
1768
1776
|
*
|
1777
|
+
* To use this filter with other analyzers, you'll want to write an Analyzer
|
1778
|
+
* class that sets up the TokenStream chain as you want it. To use this with
|
1779
|
+
* a lowercasing Tokenizer, for example, you'd write an analyzer like this:
|
1780
|
+
*
|
1769
1781
|
* class MyAnalyzer < Analyzer
|
1770
1782
|
* def token_stream(field, str)
|
1771
1783
|
* return StemFilter.new(LowerCaseFilter.new(StandardTokenizer.new(str)))
|
@@ -1784,7 +1796,8 @@ static void Init_StopFilter(void)
|
|
1784
1796
|
static void Init_StemFilter(void)
|
1785
1797
|
{
|
1786
1798
|
cStemFilter =
|
1787
|
-
|
1799
|
+
rb_define_class_under(mAnalysis, "StemFilter", cTokenStream);
|
1800
|
+
frt_mark_cclass(cStemFilter);
|
1788
1801
|
rb_define_alloc_func(cStemFilter, frt_data_alloc);
|
1789
1802
|
rb_define_method(cStemFilter, "initialize",
|
1790
1803
|
frt_stem_filter_init, -1);
|
@@ -1827,7 +1840,8 @@ static void Init_StemFilter(void)
|
|
1827
1840
|
static void Init_Analyzer(void)
|
1828
1841
|
{
|
1829
1842
|
cAnalyzer =
|
1830
|
-
|
1843
|
+
rb_define_class_under(mAnalysis, "Analyzer", rb_cObject);
|
1844
|
+
frt_mark_cclass(cAnalyzer);
|
1831
1845
|
rb_define_alloc_func(cAnalyzer, frt_data_alloc);
|
1832
1846
|
rb_define_method(cAnalyzer, "initialize", frt_letter_analyzer_init, -1);
|
1833
1847
|
rb_define_method(cAnalyzer, "token_stream", frt_analyzer_token_stream, 2);
|
@@ -1864,7 +1878,8 @@ static void Init_Analyzer(void)
|
|
1864
1878
|
static void Init_AsciiLetterAnalyzer(void)
|
1865
1879
|
{
|
1866
1880
|
cAsciiLetterAnalyzer =
|
1867
|
-
|
1881
|
+
rb_define_class_under(mAnalysis, "AsciiLetterAnalyzer", cAnalyzer);
|
1882
|
+
frt_mark_cclass(cAsciiLetterAnalyzer);
|
1868
1883
|
rb_define_alloc_func(cAsciiLetterAnalyzer, frt_data_alloc);
|
1869
1884
|
rb_define_method(cAsciiLetterAnalyzer, "initialize",
|
1870
1885
|
frt_a_letter_analyzer_init, -1);
|
@@ -1894,7 +1909,8 @@ static void Init_AsciiLetterAnalyzer(void)
|
|
1894
1909
|
static void Init_LetterAnalyzer(void)
|
1895
1910
|
{
|
1896
1911
|
cLetterAnalyzer =
|
1897
|
-
|
1912
|
+
rb_define_class_under(mAnalysis, "LetterAnalyzer", cAnalyzer);
|
1913
|
+
frt_mark_cclass(cLetterAnalyzer);
|
1898
1914
|
rb_define_alloc_func(cLetterAnalyzer, frt_data_alloc);
|
1899
1915
|
rb_define_method(cLetterAnalyzer, "initialize",
|
1900
1916
|
frt_letter_analyzer_init, -1);
|
@@ -1930,7 +1946,8 @@ static void Init_LetterAnalyzer(void)
|
|
1930
1946
|
static void Init_AsciiWhiteSpaceAnalyzer(void)
|
1931
1947
|
{
|
1932
1948
|
cAsciiWhiteSpaceAnalyzer =
|
1933
|
-
|
1949
|
+
rb_define_class_under(mAnalysis, "AsciiWhiteSpaceAnalyzer", cAnalyzer);
|
1950
|
+
frt_mark_cclass(cAsciiWhiteSpaceAnalyzer);
|
1934
1951
|
rb_define_alloc_func(cAsciiWhiteSpaceAnalyzer, frt_data_alloc);
|
1935
1952
|
rb_define_method(cAsciiWhiteSpaceAnalyzer, "initialize",
|
1936
1953
|
frt_a_white_space_analyzer_init, -1);
|
@@ -1960,7 +1977,8 @@ static void Init_AsciiWhiteSpaceAnalyzer(void)
|
|
1960
1977
|
static void Init_WhiteSpaceAnalyzer(void)
|
1961
1978
|
{
|
1962
1979
|
cWhiteSpaceAnalyzer =
|
1963
|
-
|
1980
|
+
rb_define_class_under(mAnalysis, "WhiteSpaceAnalyzer", cAnalyzer);
|
1981
|
+
frt_mark_cclass(cWhiteSpaceAnalyzer);
|
1964
1982
|
rb_define_alloc_func(cWhiteSpaceAnalyzer, frt_data_alloc);
|
1965
1983
|
rb_define_method(cWhiteSpaceAnalyzer, "initialize",
|
1966
1984
|
frt_white_space_analyzer_init, -1);
|
@@ -1975,18 +1993,16 @@ static void Init_WhiteSpaceAnalyzer(void)
|
|
1975
1993
|
* ascii-analyzers. If it were implemented in Ruby it would look like this:
|
1976
1994
|
*
|
1977
1995
|
* class AsciiStandardAnalyzer
|
1978
|
-
* def initialize(
|
1996
|
+
* def initialize(stop_words = ENGLISH_STOP_WORDS, lower = true)
|
1979
1997
|
* @lower = lower
|
1980
1998
|
* @stop_words = stop_words
|
1981
1999
|
* end
|
1982
2000
|
*
|
1983
2001
|
* def token_stream(field, str)
|
1984
|
-
*
|
1985
|
-
*
|
1986
|
-
*
|
1987
|
-
*
|
1988
|
-
* return StopFilter.new(AsciiStandardTokenizer.new(str), @stop_words)
|
1989
|
-
* end
|
2002
|
+
* ts = AsciiStandardTokenizer.new(str)
|
2003
|
+
* ts = AsciiLowerCaseFilter.new(ts) if @lower
|
2004
|
+
* ts = StopFilter.new(ts, @stop_words)
|
2005
|
+
* ts = HyphenFilter.new(ts)
|
1990
2006
|
* end
|
1991
2007
|
* end
|
1992
2008
|
*
|
@@ -1998,7 +2014,8 @@ static void Init_WhiteSpaceAnalyzer(void)
|
|
1998
2014
|
static void Init_AsciiStandardAnalyzer(void)
|
1999
2015
|
{
|
2000
2016
|
cAsciiStandardAnalyzer =
|
2001
|
-
|
2017
|
+
rb_define_class_under(mAnalysis, "AsciiStandardAnalyzer", cAnalyzer);
|
2018
|
+
frt_mark_cclass(cAsciiStandardAnalyzer);
|
2002
2019
|
rb_define_alloc_func(cAsciiStandardAnalyzer, frt_data_alloc);
|
2003
2020
|
rb_define_method(cAsciiStandardAnalyzer, "initialize",
|
2004
2021
|
frt_a_standard_analyzer_init, -1);
|
@@ -2013,13 +2030,16 @@ static void Init_AsciiStandardAnalyzer(void)
|
|
2013
2030
|
* it were implemented in Ruby it would look like this:
|
2014
2031
|
*
|
2015
2032
|
* class StandardAnalyzer
|
2016
|
-
* def initialize(
|
2033
|
+
* def initialize(stop_words = ENGLISH_STOP_WORDS, lower = true)
|
2017
2034
|
* @lower = lower
|
2018
2035
|
* @stop_words = stop_words
|
2019
2036
|
* end
|
2020
2037
|
*
|
2021
2038
|
* def token_stream(field, str)
|
2022
|
-
*
|
2039
|
+
* ts = StandardTokenizer.new(str)
|
2040
|
+
* ts = LowerCaseFilter.new(ts) if @lower
|
2041
|
+
* ts = StopFilter.new(ts, @stop_words)
|
2042
|
+
* ts = HyphenFilter.new(ts)
|
2023
2043
|
* end
|
2024
2044
|
* end
|
2025
2045
|
*
|
@@ -2029,7 +2049,8 @@ static void Init_AsciiStandardAnalyzer(void)
|
|
2029
2049
|
static void Init_StandardAnalyzer(void)
|
2030
2050
|
{
|
2031
2051
|
cStandardAnalyzer =
|
2032
|
-
|
2052
|
+
rb_define_class_under(mAnalysis, "StandardAnalyzer", cAnalyzer);
|
2053
|
+
frt_mark_cclass(cStandardAnalyzer);
|
2033
2054
|
rb_define_alloc_func(cStandardAnalyzer, frt_data_alloc);
|
2034
2055
|
rb_define_method(cStandardAnalyzer, "initialize",
|
2035
2056
|
frt_standard_analyzer_init, -1);
|
@@ -2058,7 +2079,8 @@ static void Init_StandardAnalyzer(void)
|
|
2058
2079
|
static void Init_PerFieldAnalyzer(void)
|
2059
2080
|
{
|
2060
2081
|
cPerFieldAnalyzer =
|
2061
|
-
|
2082
|
+
rb_define_class_under(mAnalysis, "PerFieldAnalyzer", cAnalyzer);
|
2083
|
+
frt_mark_cclass(cPerFieldAnalyzer);
|
2062
2084
|
rb_define_alloc_func(cPerFieldAnalyzer, frt_data_alloc);
|
2063
2085
|
rb_define_method(cPerFieldAnalyzer, "initialize",
|
2064
2086
|
frt_per_field_analyzer_init, 1);
|
@@ -2098,7 +2120,8 @@ static void Init_PerFieldAnalyzer(void)
|
|
2098
2120
|
static void Init_RegExpAnalyzer(void)
|
2099
2121
|
{
|
2100
2122
|
cRegExpAnalyzer =
|
2101
|
-
|
2123
|
+
rb_define_class_under(mAnalysis, "RegExpAnalyzer", cAnalyzer);
|
2124
|
+
frt_mark_cclass(cRegExpAnalyzer);
|
2102
2125
|
rb_define_alloc_func(cRegExpAnalyzer, frt_data_alloc);
|
2103
2126
|
rb_define_method(cRegExpAnalyzer, "initialize",
|
2104
2127
|
frt_re_analyzer_init, -1);
|
data/ext/r_index.c
CHANGED
@@ -94,10 +94,12 @@ frt_fi_get_params(VALUE roptions,
|
|
94
94
|
if (Qnil != v) Check_Type(v, T_SYMBOL);
|
95
95
|
if (v == sym_no || v == sym_false || v == Qfalse) {
|
96
96
|
*store = STORE_NO;
|
97
|
-
} else if (v == sym_yes || v == sym_true || v == Qtrue
|
97
|
+
} else if (v == sym_yes || v == sym_true || v == Qtrue) {
|
98
98
|
*store = STORE_YES;
|
99
99
|
} else if (v == sym_compress || v == sym_compressed) {
|
100
100
|
*store = STORE_COMPRESS;
|
101
|
+
} else if (v == Qnil) {
|
102
|
+
/* leave as default */
|
101
103
|
} else {
|
102
104
|
rb_raise(rb_eArgError, ":%s isn't a valid argument for :store."
|
103
105
|
" Please choose from [:yes, :no, :compressed]",
|
@@ -108,7 +110,7 @@ frt_fi_get_params(VALUE roptions,
|
|
108
110
|
if (Qnil != v) Check_Type(v, T_SYMBOL);
|
109
111
|
if (v == sym_no || v == sym_false || v == Qfalse) {
|
110
112
|
*index = INDEX_NO;
|
111
|
-
} else if (v == sym_yes || v == sym_true || v == Qtrue
|
113
|
+
} else if (v == sym_yes || v == sym_true || v == Qtrue) {
|
112
114
|
*index = INDEX_YES;
|
113
115
|
} else if (v == sym_untokenized) {
|
114
116
|
*index = INDEX_UNTOKENIZED;
|
@@ -116,6 +118,8 @@ frt_fi_get_params(VALUE roptions,
|
|
116
118
|
*index = INDEX_YES_OMIT_NORMS;
|
117
119
|
} else if (v == sym_untokenized_omit_norms) {
|
118
120
|
*index = INDEX_UNTOKENIZED_OMIT_NORMS;
|
121
|
+
} else if (v == Qnil) {
|
122
|
+
/* leave as default */
|
119
123
|
} else {
|
120
124
|
rb_raise(rb_eArgError, ":%s isn't a valid argument for :index."
|
121
125
|
" Please choose from [:no, :yes, :untokenized, "
|
@@ -133,8 +137,10 @@ frt_fi_get_params(VALUE roptions,
|
|
133
137
|
*term_vector = TERM_VECTOR_WITH_POSITIONS;
|
134
138
|
} else if (v == sym_with_offsets) {
|
135
139
|
*term_vector = TERM_VECTOR_WITH_OFFSETS;
|
136
|
-
} else if (v == sym_with_positions_offsets
|
140
|
+
} else if (v == sym_with_positions_offsets) {
|
137
141
|
*term_vector = TERM_VECTOR_WITH_POSITIONS_OFFSETS;
|
142
|
+
} else if (v == Qnil) {
|
143
|
+
/* leave as default */
|
138
144
|
} else {
|
139
145
|
rb_raise(rb_eArgError, ":%s isn't a valid argument for "
|
140
146
|
":term_vector. Please choose from [:no, :yes, "
|
@@ -507,9 +513,9 @@ frt_fis_add_field(int argc, VALUE *argv, VALUE self)
|
|
507
513
|
{
|
508
514
|
FieldInfos *fis = (FieldInfos *)DATA_PTR(self);
|
509
515
|
FieldInfo *fi;
|
510
|
-
enum StoreValues store =
|
511
|
-
enum IndexValues index =
|
512
|
-
enum TermVectorValues term_vector =
|
516
|
+
enum StoreValues store = fis->store;
|
517
|
+
enum IndexValues index = fis->index;
|
518
|
+
enum TermVectorValues term_vector = fis->term_vector;
|
513
519
|
float boost = 1.0f;
|
514
520
|
VALUE rname, roptions;
|
515
521
|
|
@@ -2134,6 +2140,21 @@ frt_ir_undelete_all(VALUE self)
|
|
2134
2140
|
return self;
|
2135
2141
|
}
|
2136
2142
|
|
2143
|
+
static VALUE
|
2144
|
+
frt_get_doc_range(IndexReader *ir, int pos, int len, int max)
|
2145
|
+
{
|
2146
|
+
VALUE ary;
|
2147
|
+
int i;
|
2148
|
+
max = min2(max, pos+len);
|
2149
|
+
len = max - pos;
|
2150
|
+
ary = rb_ary_new2(len);
|
2151
|
+
for (i = 0; i < len; i++) {
|
2152
|
+
RARRAY(ary)->ptr[i] = frt_get_lazy_doc(ir->get_lazy_doc(ir, i + pos));
|
2153
|
+
}
|
2154
|
+
RARRAY(ary)->len = len;
|
2155
|
+
return ary;
|
2156
|
+
}
|
2157
|
+
|
2137
2158
|
/*
|
2138
2159
|
* call-seq:
|
2139
2160
|
* index_reader.get_document(doc_id) -> LazyDoc
|
@@ -2144,10 +2165,43 @@ frt_ir_undelete_all(VALUE self)
|
|
2144
2165
|
* which are returned by the Searchers search methods.
|
2145
2166
|
*/
|
2146
2167
|
static VALUE
|
2147
|
-
frt_ir_get_doc(VALUE
|
2168
|
+
frt_ir_get_doc(int argc, VALUE *argv, VALUE self)
|
2148
2169
|
{
|
2149
2170
|
IndexReader *ir = (IndexReader *)DATA_PTR(self);
|
2150
|
-
|
2171
|
+
VALUE arg1, arg2;
|
2172
|
+
long pos, len;
|
2173
|
+
long max = ir->max_doc(ir);
|
2174
|
+
rb_scan_args(argc, argv, "11", &arg1, &arg2);
|
2175
|
+
if (argc == 1) {
|
2176
|
+
if (FIXNUM_P(arg1)) {
|
2177
|
+
pos = FIX2INT(arg1);
|
2178
|
+
pos = (pos < 0) ? (max + pos) : pos;
|
2179
|
+
if (pos < 0 || pos >= max) {
|
2180
|
+
rb_raise(rb_eArgError, ":%d is out of range [%d..%d] for "
|
2181
|
+
"IndexWriter#[]", pos, 0, max,
|
2182
|
+
rb_id2name(SYM2ID(argv)));
|
2183
|
+
}
|
2184
|
+
return frt_get_lazy_doc(ir->get_lazy_doc(ir, pos));
|
2185
|
+
}
|
2186
|
+
|
2187
|
+
/* check if arg1 is a Range */
|
2188
|
+
switch (rb_range_beg_len(arg1, &pos, &len, max, 0)) {
|
2189
|
+
case Qfalse:
|
2190
|
+
rb_raise(rb_eArgError, ":%s isn't a valid argument for "
|
2191
|
+
"IndexReader.get_document(index)",
|
2192
|
+
rb_id2name(SYM2ID(argv)));
|
2193
|
+
case Qnil:
|
2194
|
+
return Qnil;
|
2195
|
+
default:
|
2196
|
+
return frt_get_doc_range(ir, pos, len, max);
|
2197
|
+
}
|
2198
|
+
}
|
2199
|
+
else {
|
2200
|
+
pos = FIX2LONG(arg1);
|
2201
|
+
len = FIX2LONG(arg2);
|
2202
|
+
return frt_get_doc_range(ir, pos, len, max);
|
2203
|
+
}
|
2204
|
+
return Qnil;
|
2151
2205
|
}
|
2152
2206
|
|
2153
2207
|
/*
|
@@ -3043,8 +3097,8 @@ Init_IndexReader(void)
|
|
3043
3097
|
rb_define_method(cIndexReader, "num_docs", frt_ir_num_docs, 0);
|
3044
3098
|
rb_define_method(cIndexReader, "undelete_all", frt_ir_undelete_all, 0);
|
3045
3099
|
rb_define_method(cIndexReader, "latest?", frt_ir_is_latest, 0);
|
3046
|
-
rb_define_method(cIndexReader, "get_document", frt_ir_get_doc, 1);
|
3047
|
-
rb_define_method(cIndexReader, "[]", frt_ir_get_doc, 1);
|
3100
|
+
rb_define_method(cIndexReader, "get_document", frt_ir_get_doc, -1);
|
3101
|
+
rb_define_method(cIndexReader, "[]", frt_ir_get_doc, -1);
|
3048
3102
|
rb_define_method(cIndexReader, "term_vector", frt_ir_term_vector, 2);
|
3049
3103
|
rb_define_method(cIndexReader, "term_vectors", frt_ir_term_vectors, 1);
|
3050
3104
|
rb_define_method(cIndexReader, "term_docs", frt_ir_term_docs, 0);
|