isomorfeus-ferret 0.12.7 → 0.13.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (164) hide show
  1. checksums.yaml +4 -4
  2. data/LICENSE +101 -19
  3. data/README.md +54 -1
  4. data/ext/isomorfeus_ferret_ext/bm_bitvector.c +22 -30
  5. data/ext/isomorfeus_ferret_ext/bm_hash.c +6 -12
  6. data/ext/isomorfeus_ferret_ext/bm_micro_string.c +3 -6
  7. data/ext/isomorfeus_ferret_ext/bm_store.c +11 -22
  8. data/ext/isomorfeus_ferret_ext/brotli_common_dictionary.c +1 -1
  9. data/ext/isomorfeus_ferret_ext/brotli_dec_decode.c +1 -1
  10. data/ext/isomorfeus_ferret_ext/bzip_blocksort.c +1094 -0
  11. data/ext/isomorfeus_ferret_ext/bzip_huffman.c +205 -0
  12. data/ext/isomorfeus_ferret_ext/bzlib.c +1572 -0
  13. data/ext/isomorfeus_ferret_ext/bzlib.h +282 -0
  14. data/ext/isomorfeus_ferret_ext/bzlib_compress.c +672 -0
  15. data/ext/isomorfeus_ferret_ext/bzlib_crctable.c +104 -0
  16. data/ext/isomorfeus_ferret_ext/bzlib_decompress.c +652 -0
  17. data/ext/isomorfeus_ferret_ext/bzlib_private.h +509 -0
  18. data/ext/isomorfeus_ferret_ext/bzlib_randtable.c +84 -0
  19. data/ext/isomorfeus_ferret_ext/fio_tmpfile.h +53 -53
  20. data/ext/isomorfeus_ferret_ext/frb_analysis.c +785 -1192
  21. data/ext/isomorfeus_ferret_ext/frb_index.c +492 -474
  22. data/ext/isomorfeus_ferret_ext/frb_qparser.c +48 -60
  23. data/ext/isomorfeus_ferret_ext/frb_search.c +1520 -1002
  24. data/ext/isomorfeus_ferret_ext/frb_store.c +96 -96
  25. data/ext/isomorfeus_ferret_ext/frb_threading.h +0 -1
  26. data/ext/isomorfeus_ferret_ext/frb_utils.c +147 -196
  27. data/ext/isomorfeus_ferret_ext/frt_analysis.c +695 -1090
  28. data/ext/isomorfeus_ferret_ext/frt_analysis.h +174 -170
  29. data/ext/isomorfeus_ferret_ext/frt_array.c +2 -4
  30. data/ext/isomorfeus_ferret_ext/frt_bitvector.c +9 -16
  31. data/ext/isomorfeus_ferret_ext/frt_bitvector.h +32 -81
  32. data/ext/isomorfeus_ferret_ext/frt_document.c +15 -20
  33. data/ext/isomorfeus_ferret_ext/frt_document.h +10 -10
  34. data/ext/isomorfeus_ferret_ext/frt_except.c +5 -12
  35. data/ext/isomorfeus_ferret_ext/frt_field_index.c +3 -3
  36. data/ext/isomorfeus_ferret_ext/frt_field_index.h +6 -7
  37. data/ext/isomorfeus_ferret_ext/frt_filter.c +35 -46
  38. data/ext/isomorfeus_ferret_ext/frt_fs_store.c +1 -0
  39. data/ext/isomorfeus_ferret_ext/frt_global.c +105 -63
  40. data/ext/isomorfeus_ferret_ext/frt_global.h +7 -3
  41. data/ext/isomorfeus_ferret_ext/frt_hash.c +1 -2
  42. data/ext/isomorfeus_ferret_ext/frt_ind.c +32 -35
  43. data/ext/isomorfeus_ferret_ext/frt_ind.h +9 -9
  44. data/ext/isomorfeus_ferret_ext/frt_index.c +580 -399
  45. data/ext/isomorfeus_ferret_ext/frt_index.h +272 -291
  46. data/ext/isomorfeus_ferret_ext/frt_mempool.c +1 -2
  47. data/ext/isomorfeus_ferret_ext/frt_multimapper.c +4 -7
  48. data/ext/isomorfeus_ferret_ext/frt_q_boolean.c +67 -91
  49. data/ext/isomorfeus_ferret_ext/frt_q_const_score.c +35 -38
  50. data/ext/isomorfeus_ferret_ext/frt_q_filtered_query.c +53 -72
  51. data/ext/isomorfeus_ferret_ext/frt_q_fuzzy.c +25 -32
  52. data/ext/isomorfeus_ferret_ext/frt_q_match_all.c +21 -23
  53. data/ext/isomorfeus_ferret_ext/frt_q_multi_term.c +66 -103
  54. data/ext/isomorfeus_ferret_ext/frt_q_parser.c +207 -195
  55. data/ext/isomorfeus_ferret_ext/frt_q_phrase.c +20 -16
  56. data/ext/isomorfeus_ferret_ext/frt_q_prefix.c +17 -14
  57. data/ext/isomorfeus_ferret_ext/frt_q_range.c +102 -131
  58. data/ext/isomorfeus_ferret_ext/frt_q_span.c +179 -178
  59. data/ext/isomorfeus_ferret_ext/frt_q_term.c +47 -60
  60. data/ext/isomorfeus_ferret_ext/frt_q_wildcard.c +18 -16
  61. data/ext/isomorfeus_ferret_ext/frt_ram_store.c +45 -84
  62. data/ext/isomorfeus_ferret_ext/frt_search.c +105 -146
  63. data/ext/isomorfeus_ferret_ext/frt_search.h +331 -320
  64. data/ext/isomorfeus_ferret_ext/frt_similarity.c +5 -13
  65. data/ext/isomorfeus_ferret_ext/frt_similarity.h +7 -12
  66. data/ext/isomorfeus_ferret_ext/frt_sort.c +105 -149
  67. data/ext/isomorfeus_ferret_ext/frt_store.c +13 -7
  68. data/ext/isomorfeus_ferret_ext/frt_store.h +10 -2
  69. data/ext/isomorfeus_ferret_ext/frt_threading.h +0 -1
  70. data/ext/isomorfeus_ferret_ext/isomorfeus_ferret.c +21 -109
  71. data/ext/isomorfeus_ferret_ext/isomorfeus_ferret.h +2 -32
  72. data/ext/isomorfeus_ferret_ext/lz4.c +2495 -0
  73. data/ext/isomorfeus_ferret_ext/lz4.h +774 -0
  74. data/ext/isomorfeus_ferret_ext/lz4frame.c +1899 -0
  75. data/ext/isomorfeus_ferret_ext/lz4frame.h +623 -0
  76. data/ext/isomorfeus_ferret_ext/lz4hc.c +1615 -0
  77. data/ext/isomorfeus_ferret_ext/lz4hc.h +413 -0
  78. data/ext/isomorfeus_ferret_ext/lz4xxhash.c +1030 -0
  79. data/ext/isomorfeus_ferret_ext/lz4xxhash.h +328 -0
  80. data/ext/isomorfeus_ferret_ext/stem_modules.h +0 -86
  81. data/ext/isomorfeus_ferret_ext/test.c +1 -2
  82. data/ext/isomorfeus_ferret_ext/test_1710.c +11 -12
  83. data/ext/isomorfeus_ferret_ext/test_analysis.c +590 -583
  84. data/ext/isomorfeus_ferret_ext/test_compound_io.c +1 -1
  85. data/ext/isomorfeus_ferret_ext/test_document.c +19 -15
  86. data/ext/isomorfeus_ferret_ext/test_except.c +1 -2
  87. data/ext/isomorfeus_ferret_ext/test_fields.c +59 -60
  88. data/ext/isomorfeus_ferret_ext/test_file_deleter.c +10 -27
  89. data/ext/isomorfeus_ferret_ext/test_filter.c +11 -8
  90. data/ext/isomorfeus_ferret_ext/test_hash.c +2 -2
  91. data/ext/isomorfeus_ferret_ext/test_hashset.c +1 -1
  92. data/ext/isomorfeus_ferret_ext/test_highlighter.c +15 -11
  93. data/ext/isomorfeus_ferret_ext/test_index.c +372 -365
  94. data/ext/isomorfeus_ferret_ext/test_q_const_score.c +5 -3
  95. data/ext/isomorfeus_ferret_ext/test_q_filtered.c +5 -3
  96. data/ext/isomorfeus_ferret_ext/test_q_fuzzy.c +13 -10
  97. data/ext/isomorfeus_ferret_ext/test_q_parser.c +45 -7
  98. data/ext/isomorfeus_ferret_ext/test_q_span.c +15 -12
  99. data/ext/isomorfeus_ferret_ext/test_ram_store.c +3 -3
  100. data/ext/isomorfeus_ferret_ext/test_search.c +60 -62
  101. data/ext/isomorfeus_ferret_ext/test_segments.c +5 -4
  102. data/ext/isomorfeus_ferret_ext/test_sort.c +17 -14
  103. data/ext/isomorfeus_ferret_ext/test_store.c +2 -0
  104. data/ext/isomorfeus_ferret_ext/test_term.c +3 -1
  105. data/ext/isomorfeus_ferret_ext/test_term_vectors.c +9 -10
  106. data/ext/isomorfeus_ferret_ext/test_test.c +1 -2
  107. data/ext/isomorfeus_ferret_ext/test_threading.c +9 -10
  108. data/ext/isomorfeus_ferret_ext/testhelper.c +1 -2
  109. data/lib/isomorfeus/ferret/version.rb +1 -1
  110. metadata +27 -57
  111. data/ext/isomorfeus_ferret_ext/email.rl +0 -21
  112. data/ext/isomorfeus_ferret_ext/frt_scanner.c +0 -900
  113. data/ext/isomorfeus_ferret_ext/frt_scanner.h +0 -28
  114. data/ext/isomorfeus_ferret_ext/frt_scanner_mb.c +0 -6706
  115. data/ext/isomorfeus_ferret_ext/frt_scanner_utf8.c +0 -4420
  116. data/ext/isomorfeus_ferret_ext/scanner.h +0 -28
  117. data/ext/isomorfeus_ferret_ext/scanner.in +0 -43
  118. data/ext/isomorfeus_ferret_ext/scanner.rl +0 -84
  119. data/ext/isomorfeus_ferret_ext/scanner_mb.rl +0 -200
  120. data/ext/isomorfeus_ferret_ext/scanner_utf8.rl +0 -85
  121. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_basque.c +0 -1167
  122. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_basque.h +0 -6
  123. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_catalan.c +0 -1433
  124. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_catalan.h +0 -6
  125. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.c +0 -301
  126. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.h +0 -6
  127. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.c +0 -590
  128. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.h +0 -6
  129. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.c +0 -1049
  130. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.h +0 -6
  131. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.c +0 -705
  132. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.h +0 -6
  133. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.c +0 -1239
  134. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.h +0 -6
  135. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.c +0 -477
  136. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.h +0 -6
  137. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.c +0 -1217
  138. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.h +0 -7
  139. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_indonesian.c +0 -394
  140. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_indonesian.h +0 -6
  141. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_irish.c +0 -457
  142. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_irish.h +0 -6
  143. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.c +0 -1009
  144. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.h +0 -6
  145. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.c +0 -259
  146. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.h +0 -6
  147. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.c +0 -704
  148. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.h +0 -6
  149. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.c +0 -948
  150. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.h +0 -6
  151. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.c +0 -1028
  152. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.h +0 -6
  153. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.c +0 -275
  154. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.h +0 -6
  155. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_hungarian.c +0 -849
  156. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_hungarian.h +0 -6
  157. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.c +0 -952
  158. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.h +0 -6
  159. data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.c +0 -669
  160. data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.h +0 -6
  161. data/ext/isomorfeus_ferret_ext/stem_modules.txt +0 -63
  162. data/ext/isomorfeus_ferret_ext/uchar-ucs4.rl +0 -1854
  163. data/ext/isomorfeus_ferret_ext/uchar-utf8.rl +0 -1999
  164. data/ext/isomorfeus_ferret_ext/url.rl +0 -27
@@ -1,741 +1,346 @@
1
1
  #include <string.h>
2
2
  #include <ctype.h>
3
- #include <wctype.h>
4
- #include <wchar.h>
5
3
  #include "frt_analysis.h"
6
4
  #include "frt_hash.h"
7
5
  #include "libstemmer.h"
8
- #include "frt_scanner.h"
9
6
 
10
- /****************************************************************************
11
- *
12
- * Token
13
- *
14
- ****************************************************************************/
7
+ /*****************************************************************************/
8
+ /*** Helpers *****************************************************************/
9
+ /*****************************************************************************/
15
10
 
16
- FrtToken *frt_tk_set(FrtToken *tk,
17
- char *text, int tlen, off_t start, off_t end, int pos_inc)
18
- {
11
+ /* initialized in frt_global.c */
12
+ extern rb_encoding *utf8_encoding;
13
+ extern OnigCodePoint cp_apostrophe;
14
+ extern OnigCodePoint cp_dot;
15
+ extern OnigCodePoint cp_comma;
16
+ extern OnigCodePoint cp_backslash;
17
+ extern OnigCodePoint cp_slash;
18
+ extern OnigCodePoint cp_underscore;
19
+ extern OnigCodePoint cp_dash;
20
+ extern OnigCodePoint cp_hyphen;
21
+ extern OnigCodePoint cp_at;
22
+ extern OnigCodePoint cp_ampersand;
23
+ extern OnigCodePoint cp_colon;
24
+
25
+ static int cp_isnumpunc(OnigCodePoint cp) {
26
+ return (cp == cp_dot || cp == cp_comma || cp == cp_backslash || cp == cp_slash || cp == cp_underscore || cp == cp_dash);
27
+ }
28
+
29
+ static int cp_isurlpunc(OnigCodePoint cp) {
30
+ return (cp == cp_dot || cp == cp_slash || cp == cp_dash || cp == cp_underscore);
31
+ }
32
+
33
+ static int cp_enc_isurlc(OnigCodePoint cp, rb_encoding *enc) {
34
+ return (cp_isurlpunc(cp) || rb_enc_isalnum(cp, enc));
35
+ }
36
+
37
+ static int cp_isurlxatpunc(OnigCodePoint cp) {
38
+ return (cp == cp_dot || cp == cp_slash || cp == cp_dash || cp == cp_underscore || cp == cp_at);
39
+ }
40
+
41
+ static int cp_enc_isurlxatc(OnigCodePoint cp, rb_encoding *enc){
42
+ return (cp_isurlxatpunc(cp) || rb_enc_isalnum(cp, enc));
43
+ }
44
+
45
+ static bool cp_enc_istok(OnigCodePoint cp, rb_encoding *enc) {
46
+ if (rb_enc_isspace(cp, enc)) /* most common so check first. */
47
+ return false;
48
+ if (rb_enc_isalnum(cp, enc) || cp_isnumpunc(cp) ||
49
+ cp == cp_ampersand || cp == cp_at || cp == cp_apostrophe || cp == cp_colon) {
50
+ return true;
51
+ }
52
+ return false;
53
+ }
54
+
55
+ static inline int get_cp(char *start, char *end, int *cp_len, rb_encoding *enc) {
56
+ if (start >= end) {
57
+ *cp_len = 0;
58
+ return 0;
59
+ }
60
+ return rb_enc_codepoint_len(start, end, cp_len, enc);
61
+ }
62
+
63
+ /*****************************************************************************/
64
+ /*** FrtToken ****************************************************************/
65
+ /*****************************************************************************/
66
+
67
+ FrtToken *frt_tk_set(FrtToken *tk, char *text, int tlen, off_t start, off_t end, int pos_inc, rb_encoding *encoding) {
19
68
  if (tlen >= FRT_MAX_WORD_SIZE) {
20
- tlen = FRT_MAX_WORD_SIZE - 1;
69
+ tlen = FRT_MAX_WORD_SIZE - 1; // TODO: this may invalidate mbc's
70
+ }
71
+
72
+ if (encoding == utf8_encoding) {
73
+ memcpy(tk->text, text, sizeof(char) * tlen);
74
+ } else {
75
+ const unsigned char *sp = (unsigned char *)text;
76
+ unsigned char *dp = (unsigned char *)tk->text;
77
+ rb_econv_t *ec = rb_econv_open(rb_enc_name(encoding), "UTF-8", RUBY_ECONV_INVALID_REPLACE);
78
+ assert(ec != NULL);
79
+ rb_econv_convert(ec, &sp, (unsigned char *)text + tlen, &dp, (unsigned char *)tk->text + FRT_MAX_WORD_SIZE - 1, 0);
80
+ rb_econv_close(ec);
81
+ tlen = dp - (unsigned char *)tk->text;
21
82
  }
22
- memcpy(tk->text, text, sizeof(char) * tlen);
23
83
  tk->text[tlen] = '\0';
24
- tk->len = tlen;
25
- tk->start = start;
26
- tk->end = end;
84
+ tk->len = tlen; // in bytes in utf8_encoding
85
+ tk->start = start; // in original encoding
86
+ tk->end = end; // in original encoding
27
87
  tk->pos_inc = pos_inc;
28
88
  return tk;
29
89
  }
30
90
 
31
- static FrtToken *frt_tk_set_ts(FrtToken *tk, char *start, char *end,
32
- char *text, int pos_inc)
33
- {
34
- return frt_tk_set(tk, start, (int)(end - start),
35
- (off_t)(start - text), (off_t)(end - text), pos_inc);
91
+ static FrtToken *frt_tk_set_ts(FrtToken *tk, char *start, char *end, char *text, int pos_inc, rb_encoding *encoding) {
92
+ return frt_tk_set(tk, start, (int)(end - start), (off_t)(start - text), (off_t)(end - text), pos_inc, encoding);
36
93
  }
37
94
 
38
- FrtToken *frt_tk_set_no_len(FrtToken *tk,
39
- char *text, off_t start, off_t end, int pos_inc)
40
- {
41
- return frt_tk_set(tk, text, (int)strlen(text), start, end, pos_inc);
95
+ FrtToken *frt_tk_set_no_len(FrtToken *tk, char *text, off_t start, off_t end, int pos_inc, rb_encoding *encoding) {
96
+ return frt_tk_set(tk, text, (int)strlen(text), start, end, pos_inc, encoding);
42
97
  }
43
98
 
44
- static FrtToken *w_tk_set(FrtToken *tk, wchar_t *text, off_t start,
45
- off_t end, int pos_inc)
46
- {
47
- int len = wcstombs(tk->text, text, FRT_MAX_WORD_SIZE - 1);
48
- tk->text[len] = '\0';
49
- tk->len = len;
50
- tk->start = start;
51
- tk->end = end;
52
- tk->pos_inc = pos_inc;
53
- return tk;
54
- }
55
-
56
- int frt_tk_eq(FrtToken *tk1, FrtToken *tk2)
57
- {
99
+ int frt_tk_eq(FrtToken *tk1, FrtToken *tk2) {
58
100
  return (strcmp((char *)tk1->text, (char *)tk2->text) == 0 &&
59
101
  tk1->start == tk2->start && tk1->end == tk2->end &&
60
102
  tk1->pos_inc == tk2->pos_inc);
61
103
  }
62
104
 
63
- int frt_tk_cmp(FrtToken *tk1, FrtToken *tk2)
64
- {
105
+ int frt_tk_cmp(FrtToken *tk1, FrtToken *tk2) {
65
106
  int cmp;
66
107
  if (tk1->start > tk2->start) {
67
108
  cmp = 1;
68
- }
69
- else if (tk1->start < tk2->start) {
109
+ } else if (tk1->start < tk2->start) {
70
110
  cmp = -1;
71
- }
72
- else {
111
+ } else {
73
112
  if (tk1->end > tk2->end) {
74
113
  cmp = 1;
75
- }
76
- else if (tk1->end < tk2->end) {
114
+ } else if (tk1->end < tk2->end) {
77
115
  cmp = -1;
78
- }
79
- else {
116
+ } else {
80
117
  cmp = strcmp((char *)tk1->text, (char *)tk2->text);
81
118
  }
82
119
  }
83
120
  return cmp;
84
121
  }
85
122
 
86
- void frt_tk_destroy(void *p)
87
- {
123
+ void frt_tk_destroy(void *p) {
88
124
  free(p);
89
125
  }
90
126
 
91
- FrtToken *frt_tk_new()
92
- {
127
+ FrtToken *frt_tk_new(void) {
93
128
  return FRT_ALLOC(FrtToken);
94
129
  }
95
- /****************************************************************************
96
- *
97
- * TokenStream
98
- *
99
- ****************************************************************************/
100
130
 
101
- void frt_ts_deref(FrtTokenStream *ts)
102
- {
103
- if (--ts->ref_cnt <= 0) {
131
+ /*****************************************************************************/
132
+ /*** FrtTokenStream **********************************************************/
133
+ /*****************************************************************************/
134
+
135
+ void frt_ts_deref(FrtTokenStream *ts) {
136
+ if (--ts->ref_cnt <= 0)
104
137
  ts->destroy_i(ts);
105
- }
106
138
  }
107
139
 
108
- static FrtTokenStream *ts_reset(FrtTokenStream *ts, char *text)
109
- {
140
+ FrtTokenStream *frt_ts_reset(FrtTokenStream *ts, char *text, rb_encoding *encoding) {
110
141
  ts->t = ts->text = text;
142
+ ts->length = strlen(text);
143
+ ts->encoding = encoding;
111
144
  return ts;
112
145
  }
113
146
 
114
- FrtTokenStream *frt_ts_clone_size(FrtTokenStream *orig_ts, size_t size)
115
- {
147
+ FrtTokenStream *frt_ts_clone_size(FrtTokenStream *orig_ts, size_t size) {
116
148
  FrtTokenStream *ts = (FrtTokenStream *)frt_ecalloc(size);
117
149
  memcpy(ts, orig_ts, size);
118
150
  ts->ref_cnt = 1;
151
+ ts->rts = 0;
152
+ ts->rts = Qnil;
119
153
  return ts;
120
154
  }
121
155
 
122
- FrtTokenStream *frt_ts_new_i(size_t size)
123
- {
124
- FrtTokenStream *ts = (FrtTokenStream *)frt_ecalloc(size);
156
+ FrtTokenStream *frt_ts_alloc_i(size_t size) {
157
+ return (FrtTokenStream *)frt_ecalloc(size);
158
+ }
125
159
 
160
+ FrtTokenStream *frt_ts_init(FrtTokenStream *ts) {
126
161
  ts->destroy_i = (void (*)(FrtTokenStream *))&free;
127
- ts->reset = &ts_reset;
162
+ ts->reset = &frt_ts_reset;
128
163
  ts->ref_cnt = 1;
129
-
164
+ ts->rts = Qnil;
130
165
  return ts;
131
166
  }
132
167
 
133
- /****************************************************************************
134
- * CachedTokenStream
135
- ****************************************************************************/
136
-
137
- #define CTS(token_stream) ((FrtCachedTokenStream *)(token_stream))
138
-
139
- static FrtTokenStream *cts_clone_i(FrtTokenStream *orig_ts)
140
- {
141
- return frt_ts_clone_size(orig_ts, sizeof(FrtCachedTokenStream));
168
+ FrtTokenStream *frt_ts_new_i(size_t size) {
169
+ FrtTokenStream *ts = frt_ts_alloc_i(size);
170
+ return frt_ts_init(ts);
142
171
  }
143
172
 
144
- static FrtTokenStream *cts_new()
145
- {
146
- FrtTokenStream *ts = frt_ts_new(FrtCachedTokenStream);
147
- ts->clone_i = &cts_clone_i;
148
- return ts;
149
- }
150
-
151
- /* * Multi-byte TokenStream * */
152
-
153
- #define MBTS(token_stream) ((FrtMultiByteTokenStream *)(token_stream))
154
-
155
- static int mb_next_char(wchar_t *wchr, const char *s, mbstate_t *state)
156
- {
157
- int num_bytes;
158
- if ((num_bytes = (int)mbrtowc(wchr, s, MB_CUR_MAX, state)) < 0) {
159
- const char *t = s;
160
- do {
161
- t++;
162
- FRT_ZEROSET(state, mbstate_t);
163
- num_bytes = (int)mbrtowc(wchr, t, MB_CUR_MAX, state);
164
- } while ((num_bytes < 0) && (*t != 0));
165
- num_bytes = t - s;
166
- if (*t == 0) *wchr = 0;
167
- }
168
- return num_bytes;
169
- }
173
+ /*****************************************************************************/
174
+ /*** FrtCachedTokenStream ****************************************************/
175
+ /*****************************************************************************/
170
176
 
171
- static FrtTokenStream *mb_ts_reset(FrtTokenStream *ts, char *text)
172
- {
173
- FRT_ZEROSET(&(MBTS(ts)->state), mbstate_t);
174
- ts_reset(ts, text);
175
- return ts;
177
+ static FrtTokenStream *cts_clone_i(FrtTokenStream *orig_ts) {
178
+ return frt_ts_clone_size(orig_ts, sizeof(FrtTokenStream));
176
179
  }
177
180
 
178
- static FrtTokenStream *mb_ts_clone_i(FrtTokenStream *orig_ts)
179
- {
180
- return frt_ts_clone_size(orig_ts, sizeof(FrtMultiByteTokenStream));
181
+ static FrtTokenStream *frt_cts_alloc(void) {
182
+ return (FrtTokenStream *)frt_ecalloc(sizeof(FrtTokenStream));
181
183
  }
182
184
 
183
- static FrtTokenStream *mb_ts_new()
184
- {
185
- FrtTokenStream *ts = frt_ts_new(FrtMultiByteTokenStream);
186
- ts->reset = &mb_ts_reset;
187
- ts->clone_i = &mb_ts_clone_i;
185
+ static FrtTokenStream *frt_cts_init(FrtTokenStream *ts) {
186
+ frt_ts_init(ts);
187
+ ts->reset = &frt_ts_reset;
188
+ ts->clone_i = &cts_clone_i;
188
189
  ts->ref_cnt = 1;
189
190
  return ts;
190
191
  }
191
192
 
192
- /****************************************************************************
193
- *
194
- * Analyzer
195
- *
196
- ****************************************************************************/
197
-
198
- void frt_a_deref(FrtAnalyzer *a)
199
- {
200
- if (--a->ref_cnt <= 0) {
201
- a->destroy_i(a);
202
- }
203
- }
204
-
205
- static void frt_a_standard_destroy_i(FrtAnalyzer *a)
206
- {
207
- if (a->current_ts) {
208
- frt_ts_deref(a->current_ts);
209
- }
210
- free(a);
211
- }
212
-
213
- static FrtTokenStream *a_standard_get_ts(FrtAnalyzer *a,
214
- FrtSymbol field,
215
- char *text)
216
- {
217
- FrtTokenStream *ts;
218
- (void)field;
219
- ts = frt_ts_clone(a->current_ts);
220
- return ts->reset(ts, text);
193
+ static FrtTokenStream *frt_cts_new(void) {
194
+ FrtTokenStream *ts = frt_cts_alloc();
195
+ return frt_cts_init(ts);
221
196
  }
222
197
 
223
- FrtAnalyzer *frt_analyzer_new(FrtTokenStream *ts,
224
- void (*destroy_i)(FrtAnalyzer *a),
225
- FrtTokenStream *(*get_ts)(FrtAnalyzer *a,
226
- FrtSymbol field,
227
- char *text))
228
- {
229
- FrtAnalyzer *a = FRT_ALLOC(FrtAnalyzer);
230
- a->current_ts = ts;
231
- a->destroy_i = (destroy_i ? destroy_i : &frt_a_standard_destroy_i);
232
- a->get_ts = (get_ts ? get_ts : &a_standard_get_ts);
233
- a->ref_cnt = 1;
234
- return a;
235
- }
198
+ /*****************************************************************************/
199
+ /*** Tokenizer ***************************************************************/
200
+ /*****************************************************************************/
236
201
 
237
- /****************************************************************************
238
- *
239
- * Non
240
- *
241
- ****************************************************************************/
202
+ /*****************************************************************************/
203
+ /*** FrtNonTokenizer *********************************************************/
204
+ /*****************************************************************************/
242
205
 
243
- /*
244
- * NonTokenizer
245
- */
246
- static FrtToken *nt_next(FrtTokenStream *ts)
247
- {
206
+ static FrtToken *nt_next(FrtTokenStream *ts) {
248
207
  if (ts->t) {
249
208
  size_t len = strlen(ts->t);
250
209
  ts->t = NULL;
251
-
252
- return frt_tk_set(&(CTS(ts)->token), ts->text, len, 0, len, 1);
253
- }
254
- else {
210
+ return frt_tk_set(&(ts->token), ts->text, len, 0, len, 1, ts->encoding);
211
+ } else {
255
212
  return NULL;
256
213
  }
257
214
  }
258
215
 
259
- FrtTokenStream *frt_non_tokenizer_new()
260
- {
261
- FrtTokenStream *ts = cts_new();
216
+ FrtTokenStream *frt_non_tokenizer_new(void) {
217
+ FrtTokenStream *ts = frt_cts_new();
262
218
  ts->next = &nt_next;
263
219
  return ts;
264
220
  }
265
221
 
266
- /*
267
- * NonAnalyzer
268
- */
269
- FrtAnalyzer *frt_non_analyzer_new()
270
- {
271
- return frt_analyzer_new(frt_non_tokenizer_new(), NULL, NULL);
272
- }
273
-
274
- /****************************************************************************
275
- *
276
- * Whitespace
277
- *
278
- ****************************************************************************/
222
+ /*****************************************************************************/
223
+ /*** FrtWhiteSpaceTokenizer **************************************************/
224
+ /*****************************************************************************/
279
225
 
280
- /*
281
- * WhitespaceTokenizer
282
- */
283
226
  static FrtToken *wst_next(FrtTokenStream *ts)
284
227
  {
285
- char *t = ts->t;
286
- char *start;
287
-
288
- while (*t != '\0' && isspace(*t)) {
289
- t++;
290
- }
291
-
292
- if (*t == '\0') {
293
- return NULL;
294
- }
295
-
296
- start = t;
297
- while (*t != '\0' && !isspace(*t)) {
298
- t++;
299
- }
300
-
301
- ts->t = t;
302
- return frt_tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
303
- }
304
-
305
- FrtTokenStream *frt_whitespace_tokenizer_new()
306
- {
307
- FrtTokenStream *ts = cts_new();
308
- ts->next = &wst_next;
309
- return ts;
310
- }
311
-
312
- /*
313
- * Multi-byte WhitespaceTokenizer
314
- */
315
- static FrtToken *mb_wst_next(FrtTokenStream *ts)
316
- {
317
- int i;
228
+ int cp_len = 0;
229
+ OnigCodePoint cp;
230
+ rb_encoding *enc = ts->encoding;
231
+ char *end = ts->text + ts->length;
318
232
  char *start;
319
233
  char *t = ts->t;
320
- wchar_t wchr;
321
- mbstate_t *state = &(MBTS(ts)->state);
322
234
 
323
- i = mb_next_char(&wchr, t, state);
324
- while (wchr != 0 && iswspace(wchr)) {
325
- t += i;
326
- i = mb_next_char(&wchr, t, state);
327
- }
328
- if (wchr == 0) {
235
+ cp = get_cp(t, end, &cp_len, enc);
236
+ if (cp < 1)
329
237
  return NULL;
330
- }
331
238
 
332
- start = t;
333
- t += i;
334
- i = mb_next_char(&wchr, t, state);
335
- while (wchr != 0 && !iswspace(wchr)) {
336
- t += i;
337
- i = mb_next_char(&wchr, t, state);
239
+ while (cp_len > 0 && rb_enc_isspace(cp, enc)) {
240
+ t += cp_len;
241
+ cp = get_cp(t, end, &cp_len, enc);
338
242
  }
339
- ts->t = t;
340
- return frt_tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
341
- }
342
-
343
- /*
344
- * Lowercasing Multi-byte WhitespaceTokenizer
345
- */
346
- static FrtToken *mb_wst_next_lc(FrtTokenStream *ts)
347
- {
348
- int i;
349
- char *start;
350
- char *t = ts->t;
351
- wchar_t wchr;
352
- wchar_t wbuf[FRT_MAX_WORD_SIZE + 1], *w, *w_end;
353
- mbstate_t *state = &(MBTS(ts)->state);
354
-
355
- w = wbuf;
356
- w_end = &wbuf[FRT_MAX_WORD_SIZE];
357
243
 
358
- i = mb_next_char(&wchr, t, state);
359
- while (wchr != 0 && iswspace(wchr)) {
360
- t += i;
361
- i = mb_next_char(&wchr, t, state);
362
- }
363
- if (wchr == 0) {
244
+ start = t;
245
+ if (start >= end)
364
246
  return NULL;
365
- }
366
247
 
367
- start = t;
368
- t += i;
369
- *w++ = towlower(wchr);
370
- i = mb_next_char(&wchr, t, state);
371
- while (wchr != 0 && !iswspace(wchr)) {
372
- if (w < w_end) {
373
- *w++ = towlower(wchr);
374
- }
375
- t += i;
376
- i = mb_next_char(&wchr, t, state);
377
- }
378
- *w = 0;
248
+ do {
249
+ t += cp_len;
250
+ cp = get_cp(t, end, &cp_len, enc);
251
+ } while (cp_len > 0 && !rb_enc_isspace(cp, enc));
252
+
379
253
  ts->t = t;
380
- return w_tk_set(&(CTS(ts)->token), wbuf, (off_t)(start - ts->text),
381
- (off_t)(t - ts->text), 1);
254
+ return frt_tk_set_ts(&(ts->token), start, t, ts->text, 1, enc);
382
255
  }
383
256
 
384
- FrtTokenStream *frt_mb_whitespace_tokenizer_new(bool lowercase)
385
- {
386
- FrtTokenStream *ts = mb_ts_new();
387
- ts->next = lowercase ? &mb_wst_next_lc : &mb_wst_next;
388
- return ts;
257
+ FrtTokenStream *frt_whitespace_tokenizer_alloc(void) {
258
+ return frt_cts_alloc();
389
259
  }
390
260
 
391
- /*
392
- * WhitespaceAnalyzers
393
- */
394
- FrtAnalyzer *frt_whitespace_analyzer_new(bool lowercase)
395
- {
396
- FrtTokenStream *ts;
397
- if (lowercase) {
398
- ts = frt_lowercase_filter_new(frt_whitespace_tokenizer_new());
399
- }
400
- else {
401
- ts = frt_whitespace_tokenizer_new();
402
- }
403
- return frt_analyzer_new(ts, NULL, NULL);
261
+ FrtTokenStream *frt_whitespace_tokenizer_init(FrtTokenStream *ts) {
262
+ ts = frt_cts_init(ts);
263
+ ts->next = &wst_next;
264
+ return ts;
404
265
  }
405
266
 
406
- FrtAnalyzer *frt_mb_whitespace_analyzer_new(bool lowercase)
407
- {
408
- return frt_analyzer_new(frt_mb_whitespace_tokenizer_new(lowercase), NULL, NULL);
267
+ FrtTokenStream *frt_whitespace_tokenizer_new(void) {
268
+ FrtTokenStream *ts = frt_whitespace_tokenizer_alloc();
269
+ return frt_whitespace_tokenizer_init(ts);
409
270
  }
410
271
 
411
- /****************************************************************************
412
- *
413
- * Letter
414
- *
415
- ****************************************************************************/
272
+ /*****************************************************************************/
273
+ /*** FrtLetterTokenizer ******************************************************/
274
+ /*****************************************************************************/
416
275
 
417
- /*
418
- * LetterTokenizer
419
- */
420
- static FrtToken *lt_next(FrtTokenStream *ts)
421
- {
276
+ static FrtToken *lt_next(FrtTokenStream *ts) {
277
+ int cp_len = 0;
278
+ OnigCodePoint cp;
279
+ rb_encoding *enc = ts->encoding;
280
+ char *end = ts->text + ts->length;
422
281
  char *start;
423
282
  char *t = ts->t;
424
283
 
425
- while (*t != '\0' && !isalpha(*t)) {
426
- t++;
427
- }
428
-
429
- if (*t == '\0') {
284
+ cp = get_cp(t, end, &cp_len, enc);
285
+ if (cp < 1)
430
286
  return NULL;
431
- }
432
287
 
433
- start = t;
434
- while (*t != '\0' && isalpha(*t)) {
435
- t++;
436
- }
437
-
438
- ts->t = t;
439
- return frt_tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
440
- }
441
-
442
- FrtTokenStream *frt_letter_tokenizer_new()
443
- {
444
- FrtTokenStream *ts = cts_new();
445
- ts->next = &lt_next;
446
- return ts;
447
- }
448
-
449
- /*
450
- * Multi-byte LetterTokenizer
451
- */
452
- static FrtToken *mb_lt_next(FrtTokenStream *ts)
453
- {
454
- int i;
455
- char *start;
456
- char *t = ts->t;
457
- wchar_t wchr;
458
- mbstate_t *state = &(MBTS(ts)->state);
459
-
460
- i = mb_next_char(&wchr, t, state);
461
- while (wchr != 0 && !iswalpha(wchr)) {
462
- t += i;
463
- i = mb_next_char(&wchr, t, state);
464
- }
465
-
466
- if (wchr == 0) {
467
- return NULL;
288
+ while (cp_len > 0 && !rb_enc_isalpha(cp, enc)) {
289
+ t += cp_len;
290
+ cp = get_cp(t, end, &cp_len, enc);
468
291
  }
469
292
 
470
293
  start = t;
471
- t += i;
472
- i = mb_next_char(&wchr, t, state);
473
- while (wchr != 0 && iswalpha(wchr)) {
474
- t += i;
475
- i = mb_next_char(&wchr, t, state);
476
- }
477
- ts->t = t;
478
- return frt_tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
479
- }
480
-
481
- /*
482
- * Lowercasing Multi-byte LetterTokenizer
483
- */
484
- static FrtToken *mb_lt_next_lc(FrtTokenStream *ts)
485
- {
486
- int i;
487
- char *start;
488
- char *t = ts->t;
489
- wchar_t wchr;
490
- wchar_t wbuf[FRT_MAX_WORD_SIZE + 1], *w, *w_end;
491
- mbstate_t *state = &(MBTS(ts)->state);
492
-
493
- w = wbuf;
494
- w_end = &wbuf[FRT_MAX_WORD_SIZE];
495
-
496
- i = mb_next_char(&wchr, t, state);
497
- while (wchr != 0 && !iswalpha(wchr)) {
498
- t += i;
499
- i = mb_next_char(&wchr, t, state);
500
- }
501
- if (wchr == 0) {
502
- return NULL;
503
- }
504
-
505
- start = t;
506
- t += i;
507
- *w++ = towlower(wchr);
508
- i = mb_next_char(&wchr, t, state);
509
- while (wchr != 0 && iswalpha(wchr)) {
510
- if (w < w_end) {
511
- *w++ = towlower(wchr);
512
- }
513
- t += i;
514
- i = mb_next_char(&wchr, t, state);
515
- }
516
- *w = 0;
517
- ts->t = t;
518
- return w_tk_set(&(CTS(ts)->token), wbuf, (off_t)(start - ts->text),
519
- (off_t)(t - ts->text), 1);
520
- }
521
-
522
- FrtTokenStream *frt_mb_letter_tokenizer_new(bool lowercase)
523
- {
524
- FrtTokenStream *ts = mb_ts_new();
525
- ts->next = lowercase ? &mb_lt_next_lc : &mb_lt_next;
526
- return ts;
527
- }
528
-
529
- /*
530
- * LetterAnalyzers
531
- */
532
- FrtAnalyzer *frt_letter_analyzer_new(bool lowercase)
533
- {
534
- FrtTokenStream *ts;
535
- if (lowercase) {
536
- ts = frt_lowercase_filter_new(frt_letter_tokenizer_new());
537
- }
538
- else {
539
- ts = frt_letter_tokenizer_new();
540
- }
541
- return frt_analyzer_new(ts, NULL, NULL);
542
- }
543
-
544
- FrtAnalyzer *frt_mb_letter_analyzer_new(bool lowercase)
545
- {
546
- return frt_analyzer_new(frt_mb_letter_tokenizer_new(lowercase), NULL, NULL);
547
- }
548
-
549
- /****************************************************************************
550
- *
551
- * Standard
552
- *
553
- ****************************************************************************/
554
-
555
- #define STDTS(token_stream) ((FrtStandardTokenizer *)(token_stream))
556
-
557
- /*
558
- * FrtStandardTokenizer
559
- */
560
- static FrtToken *std_next(FrtTokenStream *ts)
561
- {
562
- FrtStandardTokenizer *std_tz = STDTS(ts);
563
- const char *start = NULL;
564
- const char *end = NULL;
565
- int len;
566
- FrtToken *tk = &(CTS(ts)->token);
567
-
568
- switch (std_tz->type) {
569
- case FRT_STT_ASCII:
570
- frt_std_scan(ts->t, tk->text, sizeof(tk->text) - 1,
571
- &start, &end, &len);
572
- break;
573
- case FRT_STT_MB:
574
- frt_std_scan_mb(ts->t, tk->text, sizeof(tk->text) - 1,
575
- &start, &end, &len);
576
- break;
577
- case FRT_STT_UTF8:
578
- frt_std_scan_utf8(ts->t, tk->text, sizeof(tk->text) - 1,
579
- &start, &end, &len);
580
- break;
581
- }
582
-
583
- if (len == 0)
294
+ if (start >= end)
584
295
  return NULL;
585
296
 
586
- ts->t = (char *)end;
587
- tk->len = len;
588
- tk->start = start - ts->text;
589
- tk->end = end - ts->text;
590
- tk->pos_inc = 1;
591
- return &(CTS(ts)->token);
592
- }
297
+ do {
298
+ t += cp_len;
299
+ cp = get_cp(t, end, &cp_len, enc);
300
+ } while (cp_len > 0 && rb_enc_isalpha(cp, enc));
593
301
 
594
- static FrtTokenStream *std_ts_clone_i(FrtTokenStream *orig_ts)
595
- {
596
- return frt_ts_clone_size(orig_ts, sizeof(FrtStandardTokenizer));
302
+ ts->t = t;
303
+ return frt_tk_set_ts(&(ts->token), start, t, ts->text, 1, enc);
597
304
  }
598
305
 
599
- static FrtTokenStream *std_ts_new()
600
- {
601
- FrtTokenStream *ts = frt_ts_new(FrtStandardTokenizer);
602
-
603
- ts->clone_i = &std_ts_clone_i;
604
- ts->next = &std_next;
605
-
606
- return ts;
306
+ FrtTokenStream *frt_letter_tokenizer_alloc(void) {
307
+ return frt_cts_alloc();
607
308
  }
608
309
 
609
- FrtTokenStream *frt_standard_tokenizer_new()
610
- {
611
- FrtTokenStream *ts = std_ts_new();
612
- STDTS(ts)->type = FRT_STT_ASCII;
613
- return ts;
614
- }
615
-
616
- FrtTokenStream *frt_mb_standard_tokenizer_new()
617
- {
618
- FrtTokenStream *ts = std_ts_new();
619
- STDTS(ts)->type = FRT_STT_MB;
310
+ FrtTokenStream *frt_letter_tokenizer_init(FrtTokenStream *ts) {
311
+ ts = frt_cts_init(ts);
312
+ ts->next = &lt_next;
620
313
  return ts;
621
314
  }
622
315
 
623
- FrtTokenStream *frt_utf8_standard_tokenizer_new()
624
- {
625
- FrtTokenStream *ts = std_ts_new();
626
- STDTS(ts)->type = FRT_STT_UTF8;
627
- return ts;
316
+ FrtTokenStream *frt_letter_tokenizer_new(void) {
317
+ FrtTokenStream *ts = frt_letter_tokenizer_alloc();
318
+ return frt_letter_tokenizer_init(ts);
628
319
  }
629
320
 
630
- /****************************************************************************
631
- *
632
- * LegacyStandard
633
- *
634
- ****************************************************************************/
635
-
636
- #define LSTDTS(token_stream) ((FrtLegacyStandardTokenizer *)(token_stream))
321
+ /*****************************************************************************/
322
+ /*** FrtStandardTokenizer ****************************************************/
323
+ /*****************************************************************************/
637
324
 
638
- /*
639
- * LegacyStandardTokenizer
640
- */
641
- static int legacy_std_get_alpha(FrtTokenStream *ts, char *token)
642
- {
643
- int i = 0;
325
+ static int std_get_alnum(FrtTokenStream *ts, char *token, OnigCodePoint cp, int *cp_len_p, OnigCodePoint *cp_out_p, rb_encoding *enc) {
326
+ char *end = ts->text + ts->length;
644
327
  char *t = ts->t;
645
- while (t[i] != '\0' && isalnum(t[i])) {
646
- if (i < FRT_MAX_WORD_SIZE) {
647
- token[i] = t[i];
648
- }
649
- i++;
650
- }
651
- return i;
652
- }
653
-
654
- static int mb_legacy_std_get_alpha(FrtTokenStream *ts, char *token)
655
- {
656
- char *t = ts->t;
657
- wchar_t wchr;
658
- int i;
659
- mbstate_t state; FRT_ZEROSET(&state, mbstate_t);
328
+ char *tt = ts->t;
329
+ int cp_len = *cp_len_p;
660
330
 
661
- i = mb_next_char(&wchr, t, &state);
662
-
663
- while (wchr != 0 && iswalnum(wchr)) {
664
- t += i;
665
- i = mb_next_char(&wchr, t, &state);
666
- }
667
-
668
- i = (int)(t - ts->t);
669
- if (i > FRT_MAX_WORD_SIZE) {
670
- i = FRT_MAX_WORD_SIZE - 1;
331
+ while (cp > 0 && rb_enc_isalnum(cp, enc)) {
332
+ if ((t - ts->t + cp_len) < FRT_MAX_WORD_SIZE)
333
+ tt += cp_len;
334
+ t += cp_len;
335
+ cp = get_cp(t, end, &cp_len, enc);
671
336
  }
672
- memcpy(token, ts->t, i);
673
- return i;
674
- }
675
-
676
- static int isnumpunc(char c)
677
- {
678
- return (c == '.' || c == ',' || c == '\\' || c == '/' || c == '_'
679
- || c == '-');
680
- }
681
-
682
- static int w_isnumpunc(wchar_t c)
683
- {
684
- return (c == L'.' || c == L',' || c == L'\\' || c == L'/' || c == L'_'
685
- || c == L'-');
686
- }
687
-
688
- static int isurlpunc(char c)
689
- {
690
- return (c == '.' || c == '/' || c == '-' || c == '_');
691
- }
692
-
693
- static int isurlc(char c)
694
- {
695
- return (c == '.' || c == '/' || c == '-' || c == '_' || isalnum(c));
696
- }
697
337
 
698
- static int isurlxatpunc(char c)
699
- {
700
- return (c == '.' || c == '/' || c == '-' || c == '_' || c == '@');
701
- }
702
-
703
- static int isurlxatc(char c)
704
- {
705
- return (c == '.' || c == '/' || c == '-' || c == '_' || c == '@'
706
- || isalnum(c));
707
- }
338
+ memcpy(token, ts->t, tt - ts->t);
339
+ token[tt - ts->t] = '\0';
708
340
 
709
- static bool legacy_std_is_tok_char(char *c)
710
- {
711
- if (isspace(*c)) {
712
- return false; /* most common so check first. */
713
- }
714
- if (isalnum(*c) || isnumpunc(*c) || *c == '&' ||
715
- *c == '@' || *c == '\'' || *c == ':') {
716
- return true;
717
- }
718
- return false;
719
- }
720
-
721
- static bool mb_legacy_std_is_tok_char(char *t)
722
- {
723
- wchar_t c;
724
- mbstate_t state; FRT_ZEROSET(&state, mbstate_t);
725
-
726
- if (((int)mbrtowc(&c, t, MB_CUR_MAX, &state)) < 0) {
727
- /* error which we can handle next time round. For now just return
728
- * false so that we can return a token */
729
- return false;
730
- }
731
- if (iswspace(c)) {
732
- return false; /* most common so check first. */
733
- }
734
- if (iswalnum(c) || w_isnumpunc(c) || c == L'&' || c == L'@' || c == L'\''
735
- || c == L':') {
736
- return true;
737
- }
738
- return false;
341
+ *cp_out_p = cp;
342
+ *cp_len_p = cp_len;
343
+ return t - ts->t;
739
344
  }
740
345
 
741
346
  /* (alnum)((punc)(alnum))+ where every second sequence of alnum must contain at
@@ -743,242 +348,261 @@ static bool mb_legacy_std_is_tok_char(char *t)
743
348
  * (alnum) = [a-zA-Z0-9]
744
349
  * (punc) = [_\/.,-]
745
350
  */
746
- static int legacy_std_get_number(char *input)
747
- {
748
- int i = 0;
749
- int count = 0;
351
+ static int std_get_number(FrtTokenStream *ts, char *start, char *end, OnigCodePoint cp, int cp_len_a, rb_encoding *enc) {
352
+
353
+ OnigCodePoint cp_1 = 0;
354
+ char *t = start;
355
+ int cp_len = cp_len_a;
356
+ int cp_1_len = 0;
750
357
  int last_seen_digit = 2;
751
358
  int seen_digit = false;
752
359
 
753
- while (last_seen_digit >= 0) {
754
- while ((input[i] != '\0') && isalnum(input[i])) {
755
- if ((last_seen_digit < 2) && isdigit(input[i])) {
360
+ while (cp > 0 && last_seen_digit >= 0) {
361
+ while ((cp > 0) && rb_enc_isalnum(cp, enc)) {
362
+ if ((last_seen_digit < 2) && rb_enc_isdigit(cp, enc)) {
756
363
  last_seen_digit = 2;
757
364
  }
758
- if ((seen_digit == false) && isdigit(input[i])) {
365
+ if ((seen_digit == false) && rb_enc_isdigit(cp, enc)) {
759
366
  seen_digit = true;
760
367
  }
761
- i++;
368
+ t += cp_len;
369
+ cp = get_cp(t, end, &cp_len, enc);
762
370
  }
763
371
  last_seen_digit--;
764
- if (!isnumpunc(input[i]) || !isalnum(input[i + 1])) {
765
-
766
- if (last_seen_digit >= 0) {
767
- count = i;
768
- }
372
+ cp_1 = get_cp(t + cp_len, end, &cp_1_len, enc);
373
+ if (!cp_isnumpunc(cp) || !rb_enc_isalnum(cp_1, enc)) {
769
374
  break;
770
375
  }
771
- count = i;
772
- i++;
376
+ t += cp_len;
377
+ cp = cp_1;
378
+ cp_len = cp_1_len;
773
379
  }
774
380
  if (seen_digit) {
775
- return count;
776
- }
777
- else {
381
+ return t - start;
382
+ } else {
778
383
  return 0;
779
384
  }
780
385
  }
781
386
 
782
- static int legacy_std_get_apostrophe(char *input)
783
- {
387
+ static int std_get_apostrophe(FrtTokenStream *ts, char *input, OnigCodePoint cp, int *cp_len_p, rb_encoding *enc) {
388
+ int cp_len = *cp_len_p;
389
+ char *end = ts->text + ts->length;
784
390
  char *t = input;
785
391
 
786
- while (isalpha(*t) || *t == '\'') {
787
- t++;
392
+ while (cp_len > 0 && (rb_enc_isalpha(cp, enc) || cp == cp_apostrophe)) {
393
+ t += cp_len;
394
+ cp = get_cp(t, end, &cp_len, enc);
788
395
  }
789
-
790
396
  return (int)(t - input);
791
397
  }
792
398
 
793
- static int mb_legacy_std_get_apostrophe(char *input)
794
- {
795
- char *t = input;
796
- wchar_t wchr;
797
- int i;
798
- mbstate_t state; FRT_ZEROSET(&state, mbstate_t);
799
-
800
- i = mb_next_char(&wchr, t, &state);
399
+ static char *std_get_url(FrtTokenStream *ts, char *start, char *end, char *token, int *len, int bufred) {
400
+ rb_encoding *enc = ts->encoding;
401
+ OnigCodePoint cp;
402
+ OnigCodePoint prev_cp = 0;
403
+ int cp_len = 0;
404
+ int prev_cp_len = 0;
405
+ char *t = start;
406
+ char *tt = start;
801
407
 
802
- while (iswalpha(wchr) || wchr == L'\'') {
803
- t += i;
804
- i = mb_next_char(&wchr, t, &state);
805
- }
806
- return (int)(t - input);
807
- }
808
-
809
- static char *std_get_url(char *input, char *token, int i, int *len)
810
- {
811
- char *next = NULL;
812
- while (isurlc(input[i])) {
813
- if (isurlpunc(input[i]) && isurlpunc(input[i - 1])) {
408
+ cp = get_cp(t, end, &cp_len, enc);
409
+ while (cp > 0 && cp_enc_isurlc(cp, enc)) {
410
+ if (cp_isurlpunc(cp) && cp_isurlpunc(prev_cp)) {
814
411
  break; /* can't have two puncs in a row */
815
412
  }
816
- if (i < FRT_MAX_WORD_SIZE) {
817
- token[i] = input[i];
818
- }
819
- i++;
413
+ prev_cp = cp;
414
+ prev_cp_len = cp_len;
415
+ t += cp_len;
416
+ if (((t + cp_len) - start) <= (FRT_MAX_WORD_SIZE - bufred))
417
+ tt += cp_len;
418
+ cp = get_cp(t, end, &cp_len, enc);
820
419
  }
821
- next = input + i;
822
420
 
823
- /* We don't want to index past the end of the token capacity) */
824
- if (i >= FRT_MAX_WORD_SIZE) {
825
- i = FRT_MAX_WORD_SIZE - 1;
421
+ /* strip trailing punc */
422
+ if (t == tt && cp_isurlpunc(prev_cp)) {
423
+ tt -= prev_cp_len;
826
424
  }
827
425
 
828
- /* strip trailing puncs */
829
- while (isurlpunc(input[i - 1])) {
830
- i--;
831
- }
832
- *len = i;
833
- token[i] = '\0';
426
+ *len = (tt - start) + bufred;
427
+ memcpy(token, start, tt - start);
428
+ token[tt - start] = '\0';
834
429
 
835
- return next;
430
+ return t;
836
431
  }
837
432
 
838
- /* Company names can contain '@' and '&' like AT&T and Excite@Home. Let's
839
- */
840
- static int legacy_std_get_company_name(char *input)
841
- {
842
- int i = 0;
843
- while (isalpha(input[i]) || input[i] == '@' || input[i] == '&') {
844
- i++;
845
- }
433
+ /* Company names can contain '@' and '&' like AT&T and Excite@Home. */
434
+ static int std_get_company_name(FrtTokenStream *ts, char *start, char* end) {
435
+ rb_encoding *enc = ts->encoding;
436
+ char * t = start;
437
+ OnigCodePoint cp;
438
+ int cp_len = 0;
846
439
 
847
- return i;
848
- }
849
-
850
- static bool legacy_std_advance_to_start(FrtTokenStream *ts)
851
- {
852
- char *t = ts->t;
853
- while (*t != '\0' && !isalnum(*t)) {
854
- if (isnumpunc(*t) && isdigit(t[1])) break;
855
- t++;
440
+ cp = get_cp(t, end, &cp_len, enc);
441
+ while (cp > 0 && (rb_enc_isalpha(cp, enc) || cp == cp_at || cp == cp_ampersand)) {
442
+ t += cp_len;
443
+ cp = get_cp(t, end, &cp_len, enc);
856
444
  }
857
445
 
858
- ts->t = t;
859
-
860
- return (*t != '\0');
446
+ return t - start;
861
447
  }
862
448
 
863
- static bool mb_legacy_std_advance_to_start(FrtTokenStream *ts)
864
- {
865
- int i;
866
- wchar_t wchr;
867
- mbstate_t state; FRT_ZEROSET(&state, mbstate_t);
868
-
869
- i = mb_next_char(&wchr, ts->t, &state);
449
+ static int std_advance_to_start(FrtTokenStream *ts, int *cp_len_p, OnigCodePoint *cp_out_p, rb_encoding *enc) {
450
+ int cp_len = 0;
451
+ int cp_next = 0;
452
+ int cp_len_next = 0;
453
+ OnigCodePoint cp;
454
+ char *end = ts->text + ts->length;
455
+ char *t = ts->t;
870
456
 
871
- while (wchr != 0 && !iswalnum(wchr)) {
872
- if (isnumpunc(*ts->t) && isdigit(ts->t[1])) break;
873
- ts->t += i;
874
- i = mb_next_char(&wchr, ts->t, &state);
457
+ cp = get_cp(t, end, &cp_len, enc);
458
+ while (cp > 0 && !rb_enc_isalnum(cp, enc)) {
459
+ if (cp_isnumpunc(cp)) {
460
+ cp_next = get_cp(t + cp_len, end, &cp_len_next, enc);
461
+ if (cp_next > 0 && rb_enc_isdigit(cp_next, enc))
462
+ break;
463
+ }
464
+ t += cp_len;
465
+ cp = get_cp(t, end, &cp_len, enc);
875
466
  }
876
-
877
- return (wchr != 0);
467
+ ts->t = t;
468
+ *cp_out_p = cp;
469
+ *cp_len_p = cp_len;
470
+ return (t < end);
878
471
  }
879
472
 
880
- static FrtToken *legacy_std_next(FrtTokenStream *ts)
881
- {
882
- FrtLegacyStandardTokenizer *std_tz = LSTDTS(ts);
473
+ static FrtToken *std_next(FrtTokenStream *ts) {
883
474
  char *s;
884
475
  char *t;
885
476
  char *start = NULL;
477
+ char *end;
886
478
  char *num_end = NULL;
887
479
  char token[FRT_MAX_WORD_SIZE + 1];
480
+ OnigCodePoint cp = 0;
481
+ OnigCodePoint cp_1 = 0;
482
+ OnigCodePoint cp_2 = 0;
483
+ OnigCodePoint prev_cp = 0;
484
+ int cp_len = 0;
485
+ int cp_1_len = 0;
486
+ int cp_2_len = 0;
888
487
  int token_i = 0;
889
488
  int len;
890
489
  bool is_acronym;
891
490
  bool seen_at_symbol;
491
+ rb_encoding *enc = ts->encoding;
892
492
 
893
-
894
- if (!std_tz->advance_to_start(ts)) {
493
+ /* advance to start and return first cp and len */
494
+ if (!std_advance_to_start(ts, &cp_len, &cp, enc))
895
495
  return NULL;
896
- }
897
496
 
497
+ end = ts->text + ts->length;
898
498
  start = t = ts->t;
899
- token_i = std_tz->get_alpha(ts, token);
499
+
500
+ /* get all alnums */
501
+ token_i = std_get_alnum(ts, token, cp, &cp_len, &cp, enc);
900
502
  t += token_i;
901
503
 
902
- if (!std_tz->is_tok_char(t)) {
504
+ if (t >= end && token_i > 0) {
505
+ ts->t += token_i;
506
+ return frt_tk_set_ts(&(ts->token), start, t, ts->text, 1, enc);
507
+ }
508
+
509
+ // already got cp and cp_len from get_alnum above
510
+ // cp = get_cp(t, end, &cp_len, enc);
511
+ if (cp < 1)
512
+ return NULL;
513
+
514
+ if (!cp_enc_istok(cp, enc)) {
903
515
  /* very common case, ie a plain word, so check and return */
904
- ts->t = t;
905
- return frt_tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
516
+ ts->t = t + cp_len;
517
+ return frt_tk_set_ts(&(ts->token), start, t, ts->text, 1, enc);
906
518
  }
907
519
 
908
- if (*t == '\'') { /* apostrophe case. */
909
- t += std_tz->get_apostrophe(t);
520
+ if (cp == cp_apostrophe) { /* apostrophe case. */
521
+ t += std_get_apostrophe(ts, t, cp, &cp_len, enc);
910
522
  ts->t = t;
911
523
  len = (int)(t - start);
912
524
  /* strip possesive */
525
+ /* TODO: wont work with multibyte */
913
526
  if ((t[-1] == 's' || t[-1] == 'S') && t[-2] == '\'') {
914
527
  t -= 2;
915
- frt_tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
916
- CTS(ts)->token.end += 2;
528
+ frt_tk_set_ts(&(ts->token), start, t, ts->text, 1, enc);
529
+ ts->token.end += 2;
917
530
  }
918
531
  else if (t[-1] == '\'') {
919
532
  t -= 1;
920
- frt_tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
921
- CTS(ts)->token.end += 1;
533
+ frt_tk_set_ts(&(ts->token), start, t, ts->text, 1, enc);
534
+ ts->token.end += 1;
922
535
  }
923
536
  else {
924
- frt_tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
537
+ frt_tk_set_ts(&(ts->token), start, t, ts->text, 1, enc);
925
538
  }
926
-
927
- return &(CTS(ts)->token);
539
+ return &(ts->token);
928
540
  }
929
541
 
930
- if (*t == '&') { /* apostrophe case. */
931
- t += legacy_std_get_company_name(t);
542
+ // already got cp and cp_len from get_alnum above
543
+ // cp = get_cp(t, end, &cp_len, enc);
544
+ if (cp == cp_ampersand) { /* ampersand case. */
545
+ t += std_get_company_name(ts, t, end);
932
546
  ts->t = t;
933
- return frt_tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
547
+ return frt_tk_set_ts(&(ts->token), start, t, ts->text, 1, enc);
934
548
  }
935
549
 
936
- if ((isdigit(*start) || isnumpunc(*start)) /* possibly a number */
937
- && ((len = legacy_std_get_number(start)) > 0)) {
550
+ // already got cp and cp_len from get_alnum above
551
+ // cp = get_cp(start, end, &cp_len, enc);
552
+ if ((rb_enc_isdigit(cp, enc) || cp_isnumpunc(cp))
553
+ && ((len = std_get_number(ts, start, end, cp, cp_len, enc)) > 0)) { /* possibly a number */
938
554
  num_end = start + len;
939
- if (!std_tz->is_tok_char(num_end)) { /* won't find a longer token */
555
+ cp = get_cp(num_end, end, &cp_len, enc);
556
+ if (cp > 0 && !cp_enc_istok(cp, enc)) { /* won't find a longer token */
940
557
  ts->t = num_end;
941
- return frt_tk_set_ts(&(CTS(ts)->token), start, num_end, ts->text, 1);
558
+ return frt_tk_set_ts(&(ts->token), start, num_end, ts->text, 1, enc);
942
559
  }
943
560
  /* else there may be a longer token so check */
944
561
  }
945
562
 
946
- if (t[0] == ':' && t[1] == '/' && t[2] == '/') {
563
+ // already got cp and cp_len from get_alnum or the last block above
564
+ // cp = get_cp(t, end, &cp_len, enc);
565
+ cp_1 = get_cp(t + cp_len, end, &cp_1_len, enc);
566
+ cp_2 = get_cp(t + cp_len + cp_1_len, end, &cp_2_len, enc);
567
+ if (cp == cp_colon && cp_1 == cp_slash && cp_2 == cp_slash) {
947
568
  /* check for a known url start */
948
569
  token[token_i] = '\0';
949
- t += 3;
950
- token_i += 3;
951
- while (*t == '/') {
952
- t++;
953
- }
954
- if (isalpha(*t) &&
955
- (memcmp(token, "ftp", 3) == 0 ||
956
- memcmp(token, "http", 4) == 0 ||
957
- memcmp(token, "https", 5) == 0 ||
958
- memcmp(token, "file", 4) == 0)) {
959
- ts->t = std_get_url(t, token, 0, &len); /* dispose of first part of the URL */
570
+ t += cp_len + cp_1_len + cp_2_len;
571
+ token_i += cp_len + cp_1_len + cp_2_len;
572
+ cp = get_cp(t, end, &cp_len, enc);
573
+ while (cp > 0 && cp == cp_slash) {
574
+ t += cp_len;
575
+ cp = get_cp(t, end, &cp_len, enc);
960
576
  }
961
- else { /* still treat as url but keep the first part */
577
+ if (rb_enc_isalpha(cp, enc) &&
578
+ (memcmp(token, "ftp", 3) == 0 ||
579
+ memcmp(token, "http", 4) == 0 ||
580
+ memcmp(token, "https", 5) == 0 ||
581
+ memcmp(token, "file", 4) == 0)) {
582
+ ts->t = std_get_url(ts, t, end, token, &len, 0); /* dispose of first part of the URL */
583
+ } else { /* still treat as url but keep the first part */
962
584
  token_i = (int)(t - start);
963
585
  memcpy(token, start, token_i * sizeof(char));
964
- ts->t = std_get_url(start, token, token_i, &len); /* keep start */
586
+ ts->t = std_get_url(ts, t, end, token + token_i, &len, token_i); /* keep start */
965
587
  }
966
- return frt_tk_set(&(CTS(ts)->token), token, len,
588
+ return frt_tk_set(&(ts->token), token, len,
967
589
  (off_t)(start - ts->text),
968
- (off_t)(ts->t - ts->text), 1);
590
+ (off_t)(ts->t - ts->text), 1, enc);
969
591
  }
970
592
 
971
593
  /* now see how long a url we can find. */
972
594
  is_acronym = true;
973
595
  seen_at_symbol = false;
974
- while (isurlxatc(*t)) {
975
- if (is_acronym && !isalpha(*t) && (*t != '.')) {
596
+
597
+ cp = get_cp(t, end, &cp_len, enc);
598
+ while (cp_enc_isurlxatc(cp, enc)) {
599
+ if (is_acronym && !rb_enc_isalpha(cp, enc) && (cp != cp_dot)) {
976
600
  is_acronym = false;
977
601
  }
978
- if (isurlxatpunc(*t) && isurlxatpunc(t[-1])) {
602
+ if (cp_isurlxatpunc(cp) && cp_isurlxatpunc(prev_cp)) {
979
603
  break; /* can't have two punctuation characters in a row */
980
604
  }
981
- if (*t == '@') {
605
+ if (cp == cp_at) {
982
606
  if (seen_at_symbol) {
983
607
  break; /* we can only have one @ symbol */
984
608
  }
@@ -986,10 +610,12 @@ static FrtToken *legacy_std_next(FrtTokenStream *ts)
986
610
  seen_at_symbol = true;
987
611
  }
988
612
  }
989
- t++;
613
+ prev_cp = cp;
614
+ t += cp_len;
615
+ cp = get_cp(t, end, &cp_len, enc);
990
616
  }
991
- while (isurlxatpunc(t[-1]) && t > ts->t) {
992
- t--; /* strip trailing punctuation */
617
+ if (cp_isurlxatpunc(prev_cp) && t > ts->t) {
618
+ t -= cp_len; /* strip trailing punctuation */
993
619
  }
994
620
 
995
621
  if (t < ts->t || (num_end != NULL && num_end < ts->t)) {
@@ -999,140 +625,119 @@ static FrtToken *legacy_std_next(FrtTokenStream *ts)
999
625
  ts->t = t;
1000
626
 
1001
627
  if (is_acronym) { /* check it is one letter followed by one '.' */
1002
- for (s = start; s < t - 1; s++) {
1003
- if (isalpha(*s) && (s[1] != '.'))
628
+ cp_len = 0;
629
+ for (s = start; s < t - 1; s += cp_len) {
630
+ cp = get_cp(s, end, &cp_len, enc);
631
+ cp_1 = get_cp(s + cp_len, end, &cp_1_len, enc);
632
+ if (rb_enc_isalpha(cp, enc) && (cp_1 != cp_dot))
1004
633
  is_acronym = false;
1005
634
  }
1006
635
  }
1007
636
  if (is_acronym) { /* strip '.'s */
1008
- for (s = start + token_i; s < t; s++) {
1009
- if (*s != '.') {
1010
- token[token_i] = *s;
1011
- token_i++;
637
+ cp_len = 0;
638
+ for (s = start + token_i; s < t; s += cp_len) {
639
+ cp = get_cp(s, end, &cp_len, enc);
640
+ if (cp > 0 && cp != cp_dot) {
641
+ memcpy(token + token_i, s, cp_len);
642
+ token_i += cp_len;
1012
643
  }
1013
644
  }
1014
- frt_tk_set(&(CTS(ts)->token), token, token_i,
645
+ token[token_i] = '\0';
646
+ frt_tk_set(&(ts->token), token, token_i,
1015
647
  (off_t)(start - ts->text),
1016
- (off_t)(t - ts->text), 1);
1017
- }
1018
- else { /* just return the url as is */
1019
- frt_tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
648
+ (off_t)(t - ts->text), 1, enc);
649
+ } else { /* just return the url as is */
650
+ frt_tk_set_ts(&(ts->token), start, t, ts->text, 1, enc);
1020
651
  }
1021
- }
1022
- else { /* return the number */
652
+ } else { /* return the number */
1023
653
  ts->t = num_end;
1024
- frt_tk_set_ts(&(CTS(ts)->token), start, num_end, ts->text, 1);
654
+ frt_tk_set_ts(&(ts->token), start, num_end, ts->text, 1, enc);
1025
655
  }
1026
-
1027
- return &(CTS(ts)->token);
656
+ return &(ts->token);
1028
657
  }
1029
658
 
1030
- static FrtTokenStream *legacy_std_ts_clone_i(FrtTokenStream *orig_ts)
1031
- {
1032
- return frt_ts_clone_size(orig_ts, sizeof(FrtLegacyStandardTokenizer));
659
+ static FrtTokenStream *std_ts_clone_i(FrtTokenStream *orig_ts) {
660
+ return frt_ts_clone_size(orig_ts, sizeof(FrtTokenStream));
1033
661
  }
1034
662
 
1035
- static FrtTokenStream *legacy_std_ts_new()
1036
- {
1037
- FrtTokenStream *ts = frt_ts_new(FrtLegacyStandardTokenizer);
1038
-
1039
- ts->clone_i = &legacy_std_ts_clone_i;
1040
- ts->next = &legacy_std_next;
1041
-
1042
- return ts;
663
+ FrtTokenStream *frt_standard_tokenizer_alloc(void) {
664
+ return (FrtTokenStream *)frt_ecalloc(sizeof(FrtTokenStream));
1043
665
  }
1044
666
 
1045
- FrtTokenStream *frt_legacy_standard_tokenizer_new()
1046
- {
1047
- FrtTokenStream *ts = legacy_std_ts_new();
1048
-
1049
- LSTDTS(ts)->advance_to_start = &legacy_std_advance_to_start;
1050
- LSTDTS(ts)->get_alpha = &legacy_std_get_alpha;
1051
- LSTDTS(ts)->is_tok_char = &legacy_std_is_tok_char;
1052
- LSTDTS(ts)->get_apostrophe = &legacy_std_get_apostrophe;
1053
-
667
+ FrtTokenStream *frt_standard_tokenizer_init(FrtTokenStream *ts) {
668
+ ts = frt_ts_init(ts);
669
+ ts->clone_i = &std_ts_clone_i;
670
+ ts->next = &std_next;
1054
671
  return ts;
1055
672
  }
1056
673
 
1057
- FrtTokenStream *frt_mb_legacy_standard_tokenizer_new()
1058
- {
1059
- FrtTokenStream *ts = legacy_std_ts_new();
1060
-
1061
- LSTDTS(ts)->advance_to_start = &mb_legacy_std_advance_to_start;
1062
- LSTDTS(ts)->get_alpha = &mb_legacy_std_get_alpha;
1063
- LSTDTS(ts)->is_tok_char = &mb_legacy_std_is_tok_char;
1064
- LSTDTS(ts)->get_apostrophe = &mb_legacy_std_get_apostrophe;
1065
-
1066
- return ts;
674
+ FrtTokenStream *frt_standard_tokenizer_new(void) {
675
+ FrtTokenStream *ts = frt_standard_tokenizer_alloc();
676
+ return frt_standard_tokenizer_init(ts);
1067
677
  }
1068
678
 
1069
- /****************************************************************************
1070
- *
1071
- * Filters
1072
- *
1073
- ****************************************************************************/
679
+ /*****************************************************************************/
680
+ /*** FrtFilters **************************************************************/
681
+ /*****************************************************************************/
1074
682
 
1075
683
  #define TkFilt(filter) ((FrtTokenFilter *)(filter))
1076
684
 
1077
- FrtTokenStream *frt_filter_clone_size(FrtTokenStream *ts, size_t size)
1078
- {
685
+ FrtTokenStream *frt_filter_clone_size(FrtTokenStream *ts, size_t size) {
1079
686
  FrtTokenStream *ts_new = frt_ts_clone_size(ts, size);
1080
687
  TkFilt(ts_new)->sub_ts = TkFilt(ts)->sub_ts->clone_i(TkFilt(ts)->sub_ts);
1081
688
  return ts_new;
1082
689
  }
1083
690
 
1084
- static FrtTokenStream *filter_clone_i(FrtTokenStream *ts)
1085
- {
691
+ static FrtTokenStream *filter_clone_i(FrtTokenStream *ts) {
1086
692
  return frt_filter_clone_size(ts, sizeof(FrtTokenFilter));
1087
693
  }
1088
694
 
1089
- static FrtTokenStream *filter_reset(FrtTokenStream *ts, char *text)
1090
- {
1091
- TkFilt(ts)->sub_ts->reset(TkFilt(ts)->sub_ts, text);
695
+ static FrtTokenStream *filter_reset(FrtTokenStream *ts, char *text, rb_encoding *encoding) {
696
+ TkFilt(ts)->sub_ts->reset(TkFilt(ts)->sub_ts, text, encoding);
1092
697
  return ts;
1093
698
  }
1094
699
 
1095
- static void filter_destroy_i(FrtTokenStream *ts)
1096
- {
700
+ static void filter_destroy_i(FrtTokenStream *ts) {
1097
701
  frt_ts_deref(TkFilt(ts)->sub_ts);
1098
702
  free(ts);
1099
703
  }
1100
704
 
1101
- FrtTokenStream *frt_tf_new_i(size_t size, FrtTokenStream *sub_ts)
1102
- {
1103
- FrtTokenStream *ts = (FrtTokenStream *)frt_ecalloc(size);
1104
-
1105
- TkFilt(ts)->sub_ts = sub_ts;
1106
-
1107
- ts->clone_i = &filter_clone_i;
1108
- ts->destroy_i = &filter_destroy_i;
1109
- ts->reset = &filter_reset;
1110
- ts->ref_cnt = 1;
705
+ FrtTokenStream *frt_tf_alloc_i(size_t size) {
706
+ return (FrtTokenStream *)frt_ecalloc(size);
707
+ }
1111
708
 
709
+ FrtTokenStream *frt_tf_init(FrtTokenStream *ts, FrtTokenStream *sub_ts) {
710
+ ts->clone_i = &filter_clone_i;
711
+ ts->destroy_i = &filter_destroy_i;
712
+ ts->reset = &filter_reset;
713
+ ts->ref_cnt = 1;
714
+ TkFilt(ts)->sub_ts = sub_ts;
1112
715
  return ts;
1113
716
  }
1114
717
 
1115
- /****************************************************************************
1116
- * FrtStopFilter
1117
- ****************************************************************************/
718
+ FrtTokenStream *frt_tf_new_i(size_t size, FrtTokenStream *sub_ts) {
719
+ FrtTokenStream *ts = frt_tf_alloc_i(size);
720
+ return frt_tf_init(ts, sub_ts);
721
+ }
722
+
723
+ /*****************************************************************************/
724
+ /**** FrtStopFilter **********************************************************/
725
+ /*****************************************************************************/
1118
726
 
1119
727
  #define StopFilt(filter) ((FrtStopFilter *)(filter))
1120
728
 
1121
- static void sf_destroy_i(FrtTokenStream *ts)
1122
- {
729
+ static void sf_destroy_i(FrtTokenStream *ts) {
1123
730
  frt_h_destroy(StopFilt(ts)->words);
1124
731
  filter_destroy_i(ts);
1125
732
  }
1126
733
 
1127
- static FrtTokenStream *sf_clone_i(FrtTokenStream *orig_ts)
1128
- {
734
+ static FrtTokenStream *sf_clone_i(FrtTokenStream *orig_ts) {
1129
735
  FrtTokenStream *new_ts = frt_filter_clone_size(orig_ts, sizeof(FrtMappingFilter));
1130
736
  FRT_REF(StopFilt(new_ts)->words);
1131
737
  return new_ts;
1132
738
  }
1133
739
 
1134
- static FrtToken *sf_next(FrtTokenStream *ts)
1135
- {
740
+ static FrtToken *sf_next(FrtTokenStream *ts) {
1136
741
  int pos_inc = 0;
1137
742
  FrtHash *words = StopFilt(ts)->words;
1138
743
  FrtTokenFilter *tf = TkFilt(ts);
@@ -1150,71 +755,76 @@ static FrtToken *sf_next(FrtTokenStream *ts)
1150
755
  return tk;
1151
756
  }
1152
757
 
1153
- FrtTokenStream *frt_stop_filter_new_with_words_len(FrtTokenStream *sub_ts,
1154
- const char **words, int len)
1155
- {
758
+ FrtTokenStream *frt_stop_filter_alloc(void) {
759
+ return (FrtTokenStream *)frt_ecalloc(sizeof(FrtStopFilter));
760
+ }
761
+
762
+ FrtTokenStream *frt_stop_filter_init(FrtTokenStream *ts, FrtTokenStream *sub_ts) {
763
+ frt_tf_init(ts, sub_ts);
764
+ ts->next = &sf_next;
765
+ ts->destroy_i = &sf_destroy_i;
766
+ ts->clone_i = &sf_clone_i;
767
+ return ts;
768
+ }
769
+
770
+ void frt_stop_filter_set_words_len(FrtTokenStream *ts, const char **words, int len) {
1156
771
  int i;
1157
772
  char *word;
1158
773
  FrtHash *word_table = frt_h_new_str(&free, (frt_free_ft) NULL);
1159
- FrtTokenStream *ts = tf_new(FrtStopFilter, sub_ts);
1160
-
1161
774
  for (i = 0; i < len; i++) {
1162
775
  word = frt_estrdup(words[i]);
1163
776
  frt_h_set(word_table, word, word);
1164
777
  }
1165
778
  StopFilt(ts)->words = word_table;
1166
- ts->next = &sf_next;
1167
- ts->destroy_i = &sf_destroy_i;
1168
- ts->clone_i = &sf_clone_i;
779
+ }
780
+
781
+ FrtTokenStream *frt_stop_filter_new_with_words_len(FrtTokenStream *sub_ts, const char **words, int len) {
782
+ FrtTokenStream *ts = frt_stop_filter_alloc();
783
+ ts = frt_stop_filter_init(ts, sub_ts);
784
+ frt_stop_filter_set_words_len(ts, words, len);
1169
785
  return ts;
1170
786
  }
1171
787
 
1172
- FrtTokenStream *frt_stop_filter_new_with_words(FrtTokenStream *sub_ts,
1173
- const char **words)
1174
- {
788
+ void frt_stop_filter_set_words(FrtTokenStream *ts, const char **words) {
1175
789
  char *word;
1176
790
  FrtHash *word_table = frt_h_new_str(&free, (frt_free_ft) NULL);
1177
- FrtTokenStream *ts = tf_new(FrtStopFilter, sub_ts);
1178
-
1179
791
  while (*words) {
1180
792
  word = frt_estrdup(*words);
1181
793
  frt_h_set(word_table, word, word);
1182
794
  words++;
1183
795
  }
1184
-
1185
796
  StopFilt(ts)->words = word_table;
1186
- ts->next = &sf_next;
1187
- ts->destroy_i = &sf_destroy_i;
1188
- ts->clone_i = &sf_clone_i;
797
+ }
798
+
799
+ FrtTokenStream *frt_stop_filter_new_with_words(FrtTokenStream *sub_ts, const char **words) {
800
+ FrtTokenStream *ts = frt_stop_filter_alloc();
801
+ frt_stop_filter_init(ts, sub_ts);
802
+ frt_stop_filter_set_words(ts, words);
1189
803
  return ts;
1190
804
  }
1191
805
 
1192
- FrtTokenStream *frt_stop_filter_new(FrtTokenStream *ts)
1193
- {
1194
- return frt_stop_filter_new_with_words(ts, FRT_FULL_ENGLISH_STOP_WORDS);
806
+ FrtTokenStream *frt_stop_filter_new(FrtTokenStream *sub_ts) {
807
+ return frt_stop_filter_new_with_words(sub_ts, FRT_FULL_ENGLISH_STOP_WORDS);
1195
808
  }
1196
809
 
1197
- /****************************************************************************
1198
- * MappingFilter
1199
- ****************************************************************************/
810
+ /*****************************************************************************/
811
+ /*** MappingFilter ***********************************************************/
812
+ /*****************************************************************************/
1200
813
 
1201
814
  #define MFilt(filter) ((FrtMappingFilter *)(filter))
1202
815
 
1203
- static void mf_destroy_i(FrtTokenStream *ts)
1204
- {
816
+ static void mf_destroy_i(FrtTokenStream *ts) {
1205
817
  frt_mulmap_destroy(MFilt(ts)->mapper);
1206
818
  filter_destroy_i(ts);
1207
819
  }
1208
820
 
1209
- static FrtTokenStream *mf_clone_i(FrtTokenStream *orig_ts)
1210
- {
821
+ static FrtTokenStream *mf_clone_i(FrtTokenStream *orig_ts) {
1211
822
  FrtTokenStream *new_ts = frt_filter_clone_size(orig_ts, sizeof(FrtMappingFilter));
1212
823
  FRT_REF(MFilt(new_ts)->mapper);
1213
824
  return new_ts;
1214
825
  }
1215
826
 
1216
- static FrtToken *mf_next(FrtTokenStream *ts)
1217
- {
827
+ static FrtToken *mf_next(FrtTokenStream *ts) {
1218
828
  char buf[FRT_MAX_WORD_SIZE + 1];
1219
829
  FrtMultiMapper *mapper = MFilt(ts)->mapper;
1220
830
  FrtTokenFilter *tf = TkFilt(ts);
@@ -1226,48 +836,53 @@ static FrtToken *mf_next(FrtTokenStream *ts)
1226
836
  return tk;
1227
837
  }
1228
838
 
1229
- static FrtTokenStream *mf_reset(FrtTokenStream *ts, char *text)
1230
- {
839
+ static FrtTokenStream *mf_reset(FrtTokenStream *ts, char *text, rb_encoding *encoding) {
1231
840
  FrtMultiMapper *mm = MFilt(ts)->mapper;
1232
- if (mm->d_size == 0) {
841
+ if (mm->d_size == 0)
1233
842
  frt_mulmap_compile(MFilt(ts)->mapper);
1234
- }
1235
- filter_reset(ts, text);
843
+ filter_reset(ts, text, encoding);
1236
844
  return ts;
1237
845
  }
1238
846
 
1239
- FrtTokenStream *frt_mapping_filter_new(FrtTokenStream *sub_ts)
1240
- {
1241
- FrtTokenStream *ts = tf_new(FrtMappingFilter, sub_ts);
1242
- MFilt(ts)->mapper = frt_mulmap_new();
1243
- ts->next = &mf_next;
1244
- ts->destroy_i = &mf_destroy_i;
1245
- ts->clone_i = &mf_clone_i;
1246
- ts->reset = &mf_reset;
847
+ FrtTokenStream *frt_mapping_filter_alloc(void) {
848
+ return (FrtTokenStream *)frt_ecalloc(sizeof(FrtMappingFilter));
849
+ }
850
+
851
+ void frt_mapping_filter_init(FrtTokenStream *ts, FrtTokenStream *sub_ts) {
852
+ frt_tf_init(ts, sub_ts);
853
+ ts->next = &mf_next;
854
+ ts->destroy_i = &mf_destroy_i;
855
+ ts->clone_i = &mf_clone_i;
856
+ ts->reset = &mf_reset;
857
+ MFilt(ts)->mapper = frt_mulmap_new();
858
+ }
859
+
860
+ FrtTokenStream *frt_mapping_filter_new(FrtTokenStream *sub_ts) {
861
+ FrtTokenStream *ts = frt_mapping_filter_alloc();
862
+ frt_mapping_filter_init(ts, sub_ts);
1247
863
  return ts;
1248
864
  }
1249
865
 
1250
- FrtTokenStream *frt_mapping_filter_add(FrtTokenStream *ts, const char *pattern,
1251
- const char *replacement)
1252
- {
866
+ FrtTokenStream *frt_mapping_filter_add(FrtTokenStream *ts, const char *pattern, const char *replacement) {
1253
867
  frt_mulmap_add_mapping(MFilt(ts)->mapper, pattern, replacement);
1254
868
  return ts;
1255
869
  }
1256
870
 
1257
- /****************************************************************************
1258
- * HyphenFilter
1259
- ****************************************************************************/
871
+ /*****************************************************************************/
872
+ /*** FrtHyphenFilter *********************************************************/
873
+ /*****************************************************************************/
1260
874
 
1261
875
  #define HyphenFilt(filter) ((FrtHyphenFilter *)(filter))
1262
876
 
1263
- static FrtTokenStream *hf_clone_i(FrtTokenStream *orig_ts)
1264
- {
877
+ static FrtTokenStream *hf_clone_i(FrtTokenStream *orig_ts) {
1265
878
  FrtTokenStream *new_ts = frt_filter_clone_size(orig_ts, sizeof(FrtHyphenFilter));
1266
879
  return new_ts;
1267
880
  }
1268
881
 
1269
- static FrtToken *hf_next(FrtTokenStream *ts)
1270
- {
882
+ static FrtToken *hf_next(FrtTokenStream *ts) {
883
+ int cp_len = 0;
884
+ OnigCodePoint cp;
885
+ rb_encoding *enc = utf8_encoding;
1271
886
  FrtHyphenFilter *hf = HyphenFilt(ts);
1272
887
  FrtTokenFilter *tf = TkFilt(ts);
1273
888
  FrtToken *tk = hf->tk;
@@ -1282,38 +897,48 @@ static FrtToken *hf_next(FrtTokenStream *ts)
1282
897
  hf->pos += text_len + 1;
1283
898
  tk->len = text_len;
1284
899
  return tk;
1285
- }
1286
- else {
1287
- char *p;
900
+ } else {
901
+ char *t;
902
+ char *end;
903
+
1288
904
  bool seen_hyphen = false;
1289
905
  bool seen_other_punc = false;
1290
906
  hf->tk = tk = tf->sub_ts->next(tf->sub_ts);
1291
907
  if (NULL == tk) return NULL;
1292
- p = tk->text + 1;
1293
- while (*p) {
1294
- if (*p == '-') {
908
+ t = tk->text;
909
+ end = tk->text + tk->len;
910
+ get_cp(t, end, &cp_len, enc);
911
+ t += cp_len; // skip first
912
+ cp = get_cp(t, end, &cp_len, enc);
913
+ while (cp > 0) {
914
+ if (cp == cp_dash || cp == cp_hyphen) {
1295
915
  seen_hyphen = true;
1296
- }
1297
- else if (!isalpha(*p)) {
916
+ } else if (!rb_enc_isalpha(cp, enc)) {
1298
917
  seen_other_punc = true;
1299
918
  break;
1300
919
  }
1301
- p++;
920
+ t += cp_len;
921
+ cp = get_cp(t, end, &cp_len, enc);
1302
922
  }
1303
923
  if (seen_hyphen && !seen_other_punc) {
1304
924
  char *q = hf->text;
1305
925
  char *r = tk->text;
1306
- p = tk->text;
1307
- while (*p) {
1308
- if (*p == '-') {
926
+ t = tk->text;
927
+ end = tk->text + tk->len;
928
+ cp = 0;
929
+ cp = get_cp(t, end, &cp_len, enc);
930
+ while (cp > 0) {
931
+ if (cp == cp_dash || cp == cp_hyphen) {
1309
932
  *q = '\0';
933
+ q++;
934
+ } else {
935
+ memcpy(q, t, cp_len);
936
+ if (r!=t) memcpy(r, t, cp_len);
937
+ r += cp_len;
938
+ q += cp_len;
1310
939
  }
1311
- else {
1312
- *r = *q = *p;
1313
- r++;
1314
- }
1315
- q++;
1316
- p++;
940
+ t += cp_len;
941
+ cp = get_cp(t, end, &cp_len, enc);
1317
942
  }
1318
943
  *r = *q = '\0';
1319
944
  hf->start = tk->start;
@@ -1325,89 +950,76 @@ static FrtToken *hf_next(FrtTokenStream *ts)
1325
950
  return tk;
1326
951
  }
1327
952
 
1328
- FrtTokenStream *frt_hyphen_filter_new(FrtTokenStream *sub_ts)
1329
- {
1330
- FrtTokenStream *ts = tf_new(FrtHyphenFilter, sub_ts);
1331
- ts->next = &hf_next;
1332
- ts->clone_i = &hf_clone_i;
953
+ FrtTokenStream *frt_hyphen_filter_alloc(void) {
954
+ return (FrtTokenStream *)frt_ecalloc(sizeof(FrtHyphenFilter));
955
+ }
956
+
957
+ FrtTokenStream *frt_hyphen_filter_init(FrtTokenStream *ts, FrtTokenStream *sub_ts) {
958
+ frt_tf_init(ts, sub_ts);
959
+ ts->next = &hf_next;
960
+ ts->clone_i = &hf_clone_i;
1333
961
  return ts;
1334
962
  }
1335
963
 
1336
- /****************************************************************************
1337
- * LowerCaseFilter
1338
- ****************************************************************************/
964
+ FrtTokenStream *frt_hyphen_filter_new(FrtTokenStream *sub_ts) {
965
+ FrtTokenStream *ts = frt_hyphen_filter_alloc();
966
+ return frt_hyphen_filter_init(ts, sub_ts);
967
+ }
1339
968
 
969
+ /*****************************************************************************/
970
+ /*** FrtLowercaseFilter ******************************************************/
971
+ /*****************************************************************************/
972
+
973
+ static FrtToken *lcf_next(FrtTokenStream *ts) {
974
+ int len = 0;
975
+ OnigCaseFoldType fold_type = ONIGENC_CASE_DOWNCASE;
976
+ rb_encoding *enc = utf8_encoding; // Token encoding is always UTF-8
977
+ char buf[FRT_MAX_WORD_SIZE + 20]; // CASE_MAPPING_ADDITIONAL_LENGTH
978
+ char *buf_end = buf + FRT_MAX_WORD_SIZE + 19;
1340
979
 
1341
- static FrtToken *mb_lcf_next(FrtTokenStream *ts)
1342
- {
1343
- wchar_t wbuf[FRT_MAX_WORD_SIZE + 1], *wchr;
1344
980
  FrtToken *tk = TkFilt(ts)->sub_ts->next(TkFilt(ts)->sub_ts);
1345
- int x;
1346
- wbuf[FRT_MAX_WORD_SIZE] = 0;
981
+ if (tk == NULL) { return tk; }
982
+ if (tk->len < 1) { return tk; }
1347
983
 
1348
- if (tk == NULL) {
1349
- return tk;
1350
- }
984
+ const OnigUChar *t = (const OnigUChar *)tk->text;
985
+
986
+ len = enc->case_map(&fold_type, &t, (const OnigUChar *)(tk->text + tk->len), (OnigUChar *)buf, (OnigUChar *)buf_end, enc);
987
+ tk->len = len;
988
+ memcpy(tk->text, buf, len);
989
+ tk->text[len] = '\0';
1351
990
 
1352
- if ((x=mbstowcs(wbuf, tk->text, FRT_MAX_WORD_SIZE)) <= 0) return tk;
1353
- wchr = wbuf;
1354
- while (*wchr != 0) {
1355
- *wchr = towlower(*wchr);
1356
- wchr++;
1357
- }
1358
- tk->len = wcstombs(tk->text, wbuf, FRT_MAX_WORD_SIZE);
1359
- if (tk->len <= 0) {
1360
- strcpy(tk->text, "BAD_DATA");
1361
- tk->len = 8;
1362
- }
1363
- tk->text[tk->len] = '\0';
1364
991
  return tk;
1365
992
  }
1366
993
 
1367
- FrtTokenStream *frt_mb_lowercase_filter_new(FrtTokenStream *sub_ts)
1368
- {
1369
- FrtTokenStream *ts = tf_new(FrtTokenFilter, sub_ts);
1370
- ts->next = &mb_lcf_next;
1371
- return ts;
994
+ FrtTokenStream *frt_lowercase_filter_alloc(void) {
995
+ return (FrtTokenStream *)frt_ecalloc(sizeof(FrtTokenFilter));
1372
996
  }
1373
997
 
1374
- static FrtToken *lcf_next(FrtTokenStream *ts)
1375
- {
1376
- int i = 0;
1377
- FrtToken *tk = TkFilt(ts)->sub_ts->next(TkFilt(ts)->sub_ts);
1378
- if (tk == NULL) {
1379
- return tk;
1380
- }
1381
- while (tk->text[i] != '\0') {
1382
- tk->text[i] = tolower(tk->text[i]);
1383
- i++;
1384
- }
1385
- return tk;
998
+ void frt_lowercase_filter_init(FrtTokenStream *ts, FrtTokenStream *sub_ts) {
999
+ frt_tf_init(ts, sub_ts);
1000
+ ts->next = &lcf_next;
1386
1001
  }
1387
1002
 
1388
- FrtTokenStream *frt_lowercase_filter_new(FrtTokenStream *sub_ts)
1389
- {
1390
- FrtTokenStream *ts = tf_new(FrtTokenFilter, sub_ts);
1391
- ts->next = &lcf_next;
1003
+ FrtTokenStream *frt_lowercase_filter_new(FrtTokenStream *sub_ts) {
1004
+ FrtTokenStream *ts = frt_lowercase_filter_alloc();
1005
+ frt_lowercase_filter_init(ts, sub_ts);
1392
1006
  return ts;
1393
1007
  }
1394
1008
 
1395
- /****************************************************************************
1396
- * FrtStemFilter
1397
- ****************************************************************************/
1009
+ /*****************************************************************************/
1010
+ /*** FrtStemFilter ***********************************************************/
1011
+ /*****************************************************************************/
1398
1012
 
1399
1013
  #define StemFilt(filter) ((FrtStemFilter *)(filter))
1400
1014
 
1401
- static void stemf_destroy_i(FrtTokenStream *ts)
1402
- {
1015
+ static void stemf_destroy_i(FrtTokenStream *ts) {
1403
1016
  sb_stemmer_delete(StemFilt(ts)->stemmer);
1404
1017
  free(StemFilt(ts)->algorithm);
1405
1018
  free(StemFilt(ts)->charenc);
1406
1019
  filter_destroy_i(ts);
1407
1020
  }
1408
1021
 
1409
- static FrtToken *stemf_next(FrtTokenStream *ts)
1410
- {
1022
+ static FrtToken *stemf_next(FrtTokenStream *ts) {
1411
1023
  int len;
1412
1024
  const sb_symbol *stemmed;
1413
1025
  struct sb_stemmer *stemmer = StemFilt(ts)->stemmer;
@@ -1428,26 +1040,27 @@ static FrtToken *stemf_next(FrtTokenStream *ts)
1428
1040
  return tk;
1429
1041
  }
1430
1042
 
1431
- static FrtTokenStream *stemf_clone_i(FrtTokenStream *orig_ts)
1432
- {
1433
- FrtTokenStream *new_ts = frt_filter_clone_size(orig_ts, sizeof(FrtStemFilter));
1434
- FrtStemFilter *stemf = StemFilt(new_ts);
1435
- FrtStemFilter *orig_stemf = StemFilt(orig_ts);
1436
- stemf->stemmer =
1437
- sb_stemmer_new(orig_stemf->algorithm, orig_stemf->charenc);
1438
- stemf->algorithm =
1439
- orig_stemf->algorithm ? frt_estrdup(orig_stemf->algorithm) : NULL;
1440
- stemf->charenc =
1441
- orig_stemf->charenc ? frt_estrdup(orig_stemf->charenc) : NULL;
1043
+ static FrtTokenStream *stemf_clone_i(FrtTokenStream *orig_ts) {
1044
+ FrtTokenStream *new_ts = frt_filter_clone_size(orig_ts, sizeof(FrtStemFilter));
1045
+ FrtStemFilter *stemf = StemFilt(new_ts);
1046
+ FrtStemFilter *orig_stemf = StemFilt(orig_ts);
1047
+ stemf->stemmer = sb_stemmer_new(orig_stemf->algorithm, orig_stemf->charenc);
1048
+ stemf->algorithm = orig_stemf->algorithm ? frt_estrdup(orig_stemf->algorithm) : NULL;
1049
+ stemf->charenc = orig_stemf->charenc ? frt_estrdup(orig_stemf->charenc) : NULL;
1442
1050
  return new_ts;
1443
1051
  }
1444
1052
 
1445
- FrtTokenStream *frt_stem_filter_new(FrtTokenStream *ts, const char *algorithm,
1446
- const char *charenc)
1447
- {
1448
- FrtTokenStream *tf = tf_new(FrtStemFilter, ts);
1053
+ FrtTokenStream *frt_stem_filter_alloc(void) {
1054
+ return (FrtTokenStream *)frt_ecalloc(sizeof(FrtStemFilter));
1055
+ }
1056
+
1057
+ void frt_stem_filter_init(FrtTokenStream *ts, FrtTokenStream *sub_ts, const char *algorithm) {
1058
+ frt_tf_init(ts, sub_ts);
1059
+ ts->next = &stemf_next;
1060
+ ts->destroy_i = &stemf_destroy_i;
1061
+ ts->clone_i = &stemf_clone_i;
1062
+
1449
1063
  char *my_algorithm = NULL;
1450
- char *my_charenc = NULL;
1451
1064
  char *s = NULL;
1452
1065
 
1453
1066
  if (algorithm) {
@@ -1459,186 +1072,178 @@ FrtTokenStream *frt_stem_filter_new(FrtTokenStream *ts, const char *algorithm,
1459
1072
  *s = tolower(*s);
1460
1073
  s++;
1461
1074
  }
1462
- StemFilt(tf)->algorithm = my_algorithm;
1075
+ StemFilt(ts)->algorithm = my_algorithm;
1463
1076
  }
1464
1077
 
1465
- if (charenc) {
1466
- my_charenc = frt_estrdup(charenc);
1078
+ StemFilt(ts)->stemmer = sb_stemmer_new(my_algorithm, "UTF_8");
1079
+ }
1467
1080
 
1468
- /* encodings are uppercase and use '_' instead of '-' */
1469
- s = my_charenc;
1470
- while (*s) {
1471
- *s = (*s == '-') ? '_' : toupper(*s);
1472
- s++;
1473
- }
1474
- StemFilt(tf)->charenc = my_charenc;
1475
- }
1081
+ FrtTokenStream *frt_stem_filter_new(FrtTokenStream *sub_ts, const char *algorithm) {
1082
+ FrtTokenStream *ts = frt_stem_filter_alloc();
1083
+ frt_stem_filter_init(ts, sub_ts, algorithm);
1084
+ return ts;
1085
+ }
1476
1086
 
1477
- StemFilt(tf)->stemmer = sb_stemmer_new(my_algorithm, my_charenc);
1087
+ /*****************************************************************************/
1088
+ /*** FrtAnalyzer *************************************************************/
1089
+ /*****************************************************************************/
1478
1090
 
1479
- tf->next = &stemf_next;
1480
- tf->destroy_i = &stemf_destroy_i;
1481
- tf->clone_i = &stemf_clone_i;
1482
- return tf;
1091
+ void frt_a_deref(FrtAnalyzer *a) {
1092
+ if (--a->ref_cnt <= 0)
1093
+ a->destroy_i(a);
1483
1094
  }
1484
1095
 
1485
- /****************************************************************************
1486
- *
1487
- * Analyzers
1488
- *
1489
- ****************************************************************************/
1096
+ static void frt_a_standard_destroy_i(FrtAnalyzer *a) {
1097
+ if (a->current_ts)
1098
+ frt_ts_deref(a->current_ts);
1099
+ free(a);
1100
+ }
1490
1101
 
1491
- /****************************************************************************
1492
- * Standard
1493
- ****************************************************************************/
1102
+ static FrtTokenStream *a_standard_get_ts(FrtAnalyzer *a, ID field, char *text, rb_encoding *encoding) {
1103
+ FrtTokenStream *ts;
1104
+ (void)field;
1105
+ ts = frt_ts_clone(a->current_ts);
1106
+ return ts->reset(ts, text, encoding);
1107
+ }
1494
1108
 
1495
- FrtAnalyzer *frt_standard_analyzer_new_with_words_len(const char **words, int len,
1496
- bool lowercase)
1497
- {
1498
- FrtTokenStream *ts = frt_standard_tokenizer_new();
1499
- if (lowercase) {
1500
- ts = frt_lowercase_filter_new(ts);
1501
- }
1502
- ts = frt_hyphen_filter_new(frt_stop_filter_new_with_words_len(ts, words, len));
1503
- return frt_analyzer_new(ts, NULL, NULL);
1109
+ FrtAnalyzer *frt_analyzer_alloc(void) {
1110
+ return (FrtAnalyzer *) FRT_ALLOC(FrtAnalyzer);
1504
1111
  }
1505
1112
 
1506
- FrtAnalyzer *frt_standard_analyzer_new_with_words(const char **words,
1507
- bool lowercase)
1508
- {
1509
- FrtTokenStream *ts = frt_standard_tokenizer_new();
1510
- if (lowercase) {
1511
- ts = frt_lowercase_filter_new(ts);
1512
- }
1513
- ts = frt_hyphen_filter_new(frt_stop_filter_new_with_words(ts, words));
1514
- return frt_analyzer_new(ts, NULL, NULL);
1113
+ void frt_analyzer_init(FrtAnalyzer *a, FrtTokenStream *ts, void (*destroy_i)(FrtAnalyzer *a),
1114
+ FrtTokenStream *(*get_ts)(FrtAnalyzer *a, ID field, char *text, rb_encoding *encoding)) {
1115
+ a->current_ts = ts;
1116
+ a->destroy_i = (destroy_i ? destroy_i : &frt_a_standard_destroy_i);
1117
+ a->get_ts = (get_ts ? get_ts : &a_standard_get_ts);
1118
+ a->ref_cnt = 1;
1119
+ a->ranalyzer = Qnil;
1515
1120
  }
1516
1121
 
1517
- FrtAnalyzer *frt_mb_standard_analyzer_new_with_words(const char **words,
1518
- bool lowercase)
1519
- {
1520
- FrtTokenStream *ts = frt_mb_standard_tokenizer_new();
1521
- if (lowercase) {
1522
- ts = frt_mb_lowercase_filter_new(ts);
1523
- }
1524
- ts = frt_hyphen_filter_new(frt_stop_filter_new_with_words(ts, words));
1525
- return frt_analyzer_new(ts, NULL, NULL);
1122
+ FrtAnalyzer *frt_analyzer_new(FrtTokenStream *ts, void (*destroy_i)(FrtAnalyzer *a),
1123
+ FrtTokenStream *(*get_ts)(FrtAnalyzer *a, ID field, char *text, rb_encoding *encoding)) {
1124
+ FrtAnalyzer *a = frt_analyzer_alloc();
1125
+ frt_analyzer_init(a, ts, destroy_i, get_ts);
1126
+ return a;
1526
1127
  }
1527
1128
 
1528
- FrtAnalyzer *frt_utf8_standard_analyzer_new_with_words(const char **words,
1529
- bool lowercase)
1530
- {
1531
- FrtTokenStream *ts = frt_utf8_standard_tokenizer_new();
1532
- if (lowercase) {
1533
- ts = frt_mb_lowercase_filter_new(ts);
1534
- }
1535
- ts = frt_hyphen_filter_new(frt_stop_filter_new_with_words(ts, words));
1536
- return frt_analyzer_new(ts, NULL, NULL);
1129
+ /*****************************************************************************/
1130
+ /*** FrtNonAnalyzer **********************************************************/
1131
+ /*****************************************************************************/
1132
+
1133
+ FrtAnalyzer *frt_non_analyzer_new(void) {
1134
+ return frt_analyzer_new(frt_non_tokenizer_new(), NULL, NULL);
1537
1135
  }
1538
1136
 
1539
- FrtAnalyzer *frt_standard_analyzer_new(bool lowercase)
1540
- {
1541
- return frt_standard_analyzer_new_with_words(FRT_FULL_ENGLISH_STOP_WORDS,
1542
- lowercase);
1137
+ /*****************************************************************************/
1138
+ /*** FrtWhiteSpaceAnalyzer ***************************************************/
1139
+ /*****************************************************************************/
1140
+
1141
+ FrtAnalyzer *frt_whitespace_analyzer_alloc(void) {
1142
+ return frt_analyzer_alloc();
1543
1143
  }
1544
1144
 
1545
- FrtAnalyzer *frt_mb_standard_analyzer_new(bool lowercase)
1546
- {
1547
- return frt_mb_standard_analyzer_new_with_words(FRT_FULL_ENGLISH_STOP_WORDS,
1548
- lowercase);
1145
+ void frt_whitespace_analyzer_init(FrtAnalyzer *a, bool lowercase) {
1146
+ FrtTokenStream *ts = frt_whitespace_tokenizer_new();
1147
+ if (lowercase)
1148
+ ts = frt_lowercase_filter_new(ts);
1149
+ frt_analyzer_init(a, ts, NULL, NULL);
1549
1150
  }
1550
1151
 
1551
- FrtAnalyzer *frt_utf8_standard_analyzer_new(bool lowercase)
1552
- {
1553
- return frt_utf8_standard_analyzer_new_with_words(FRT_FULL_ENGLISH_STOP_WORDS,
1554
- lowercase);
1152
+ FrtAnalyzer *frt_whitespace_analyzer_new(bool lowercase) {
1153
+ FrtAnalyzer *a = frt_whitespace_analyzer_alloc();
1154
+ frt_whitespace_analyzer_init(a, lowercase);
1155
+ return a;
1555
1156
  }
1556
1157
 
1557
- /****************************************************************************
1558
- * Legacy
1559
- ****************************************************************************/
1158
+ /*****************************************************************************/
1159
+ /*** FrtLetterAnalyzer *******************************************************/
1160
+ /*****************************************************************************/
1560
1161
 
1561
- FrtAnalyzer *frt_legacy_standard_analyzer_new_with_words(const char **words,
1562
- bool lowercase)
1563
- {
1564
- FrtTokenStream *ts = frt_legacy_standard_tokenizer_new();
1565
- if (lowercase) {
1162
+ FrtAnalyzer *frt_letter_analyzer_alloc(void) {
1163
+ return frt_analyzer_alloc();
1164
+ }
1165
+
1166
+ void frt_letter_analyzer_init(FrtAnalyzer *a, bool lowercase) {
1167
+ FrtTokenStream *ts = frt_letter_tokenizer_new();
1168
+ if (lowercase)
1566
1169
  ts = frt_lowercase_filter_new(ts);
1567
- }
1568
- ts = frt_hyphen_filter_new(frt_stop_filter_new_with_words(ts, words));
1569
- return frt_analyzer_new(ts, NULL, NULL);
1170
+ frt_analyzer_init(a, ts, NULL, NULL);
1570
1171
  }
1571
1172
 
1572
- FrtAnalyzer *frt_mb_legacy_standard_analyzer_new_with_words(const char **words,
1573
- bool lowercase)
1574
- {
1575
- FrtTokenStream *ts = frt_mb_legacy_standard_tokenizer_new();
1576
- if (lowercase) {
1577
- ts = frt_mb_lowercase_filter_new(ts);
1578
- }
1173
+ FrtAnalyzer *frt_letter_analyzer_new(bool lowercase) {
1174
+ FrtAnalyzer *a = frt_letter_analyzer_alloc();
1175
+ frt_letter_analyzer_init(a, lowercase);
1176
+ return a;
1177
+ }
1178
+
1179
+ /*****************************************************************************/
1180
+ /*** FrtStandardAnalyzer *****************************************************/
1181
+ /*****************************************************************************/
1182
+
1183
+ FrtAnalyzer *frt_standard_analyzer_alloc(void) {
1184
+ return frt_analyzer_alloc();
1185
+ }
1186
+
1187
+ void frt_standard_analyzer_init(FrtAnalyzer *a, bool lowercase, const char **words) {
1188
+ FrtTokenStream *ts = frt_standard_tokenizer_new();
1189
+ if (lowercase)
1190
+ ts = frt_lowercase_filter_new(ts);
1579
1191
  ts = frt_hyphen_filter_new(frt_stop_filter_new_with_words(ts, words));
1580
- return frt_analyzer_new(ts, NULL, NULL);
1192
+ frt_analyzer_init(a, ts, NULL, NULL);
1581
1193
  }
1582
1194
 
1583
- FrtAnalyzer *frt_legacy_standard_analyzer_new(bool lowercase)
1584
- {
1585
- return frt_legacy_standard_analyzer_new_with_words(FRT_FULL_ENGLISH_STOP_WORDS,
1586
- lowercase);
1195
+ FrtAnalyzer *frt_standard_analyzer_new_with_words(bool lowercase, const char **words) {
1196
+ FrtAnalyzer *a = frt_standard_analyzer_alloc();
1197
+ frt_standard_analyzer_init(a, lowercase, words);
1198
+ return a;
1587
1199
  }
1588
1200
 
1589
- FrtAnalyzer *frt_mb_legacy_standard_analyzer_new(bool lowercase)
1590
- {
1591
- return frt_mb_legacy_standard_analyzer_new_with_words(FRT_FULL_ENGLISH_STOP_WORDS,
1592
- lowercase);
1201
+ FrtAnalyzer *frt_standard_analyzer_new(bool lowercase) {
1202
+ return frt_standard_analyzer_new_with_words(lowercase, FRT_FULL_ENGLISH_STOP_WORDS);
1593
1203
  }
1594
1204
 
1595
- /****************************************************************************
1596
- *
1597
- * PerFieldAnalyzer
1598
- *
1599
- ****************************************************************************/
1205
+ /*****************************************************************************/
1206
+ /*** FrtPerFieldAnalyzer *****************************************************/
1207
+ /*****************************************************************************/
1600
1208
 
1601
- static void pfa_destroy_i(FrtAnalyzer *self)
1602
- {
1209
+ static void pfa_destroy_i(FrtAnalyzer *self) {
1603
1210
  frt_h_destroy(PFA(self)->dict);
1604
1211
 
1605
1212
  frt_a_deref(PFA(self)->default_a);
1606
1213
  free(self);
1607
1214
  }
1608
1215
 
1609
- static FrtTokenStream *pfa_get_ts(FrtAnalyzer *self,
1610
- FrtSymbol field, char *text)
1611
- {
1216
+ static FrtTokenStream *pfa_get_ts(FrtAnalyzer *self, ID field, char *text, rb_encoding *encoding) {
1612
1217
  FrtAnalyzer *a = (FrtAnalyzer *)frt_h_get(PFA(self)->dict, (void *)field);
1613
- if (a == NULL) {
1218
+ if (a == NULL)
1614
1219
  a = PFA(self)->default_a;
1615
- }
1616
- return frt_a_get_ts(a, field, text);
1220
+ return frt_a_get_ts(a, field, text, encoding);
1617
1221
  }
1618
1222
 
1619
- static void pfa_sub_a_destroy_i(void *p)
1620
- {
1223
+ static void pfa_sub_a_destroy_i(void *p) {
1621
1224
  FrtAnalyzer *a = (FrtAnalyzer *) p;
1622
1225
  frt_a_deref(a);
1623
1226
  }
1624
1227
 
1625
- void frt_pfa_add_field(FrtAnalyzer *self,
1626
- FrtSymbol field,
1627
- FrtAnalyzer *analyzer)
1628
- {
1228
+ void frt_pfa_add_field(FrtAnalyzer *self, ID field, FrtAnalyzer *analyzer) {
1629
1229
  frt_h_set(PFA(self)->dict, (void *)field, analyzer);
1630
1230
  }
1631
1231
 
1632
- FrtAnalyzer *frt_per_field_analyzer_new(FrtAnalyzer *default_a)
1633
- {
1634
- FrtAnalyzer *a = (FrtAnalyzer *)frt_ecalloc(sizeof(FrtPerFieldAnalyzer));
1635
-
1636
- PFA(a)->default_a = default_a;
1637
- PFA(a)->dict = frt_h_new_ptr(&pfa_sub_a_destroy_i);
1232
+ FrtAnalyzer *frt_per_field_analyzer_alloc(void) {
1233
+ return (FrtAnalyzer *)frt_ecalloc(sizeof(FrtPerFieldAnalyzer));
1234
+ }
1638
1235
 
1236
+ void frt_per_field_analyzer_init(FrtAnalyzer *a, FrtAnalyzer *default_a) {
1639
1237
  a->destroy_i = &pfa_destroy_i;
1640
- a->get_ts = pfa_get_ts;
1238
+ a->get_ts = &pfa_get_ts;
1641
1239
  a->ref_cnt = 1;
1642
1240
 
1241
+ PFA(a)->default_a = default_a;
1242
+ PFA(a)->dict = frt_h_new_ptr(&pfa_sub_a_destroy_i);
1243
+ }
1244
+
1245
+ FrtAnalyzer *frt_per_field_analyzer_new(FrtAnalyzer *default_a) {
1246
+ FrtAnalyzer *a = frt_per_field_analyzer_alloc();
1247
+ frt_per_field_analyzer_init(a, default_a);
1643
1248
  return a;
1644
1249
  }