isomorfeus-ferret 0.12.7 → 0.13.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (164) hide show
  1. checksums.yaml +4 -4
  2. data/LICENSE +101 -19
  3. data/README.md +54 -1
  4. data/ext/isomorfeus_ferret_ext/bm_bitvector.c +22 -30
  5. data/ext/isomorfeus_ferret_ext/bm_hash.c +6 -12
  6. data/ext/isomorfeus_ferret_ext/bm_micro_string.c +3 -6
  7. data/ext/isomorfeus_ferret_ext/bm_store.c +11 -22
  8. data/ext/isomorfeus_ferret_ext/brotli_common_dictionary.c +1 -1
  9. data/ext/isomorfeus_ferret_ext/brotli_dec_decode.c +1 -1
  10. data/ext/isomorfeus_ferret_ext/bzip_blocksort.c +1094 -0
  11. data/ext/isomorfeus_ferret_ext/bzip_huffman.c +205 -0
  12. data/ext/isomorfeus_ferret_ext/bzlib.c +1572 -0
  13. data/ext/isomorfeus_ferret_ext/bzlib.h +282 -0
  14. data/ext/isomorfeus_ferret_ext/bzlib_compress.c +672 -0
  15. data/ext/isomorfeus_ferret_ext/bzlib_crctable.c +104 -0
  16. data/ext/isomorfeus_ferret_ext/bzlib_decompress.c +652 -0
  17. data/ext/isomorfeus_ferret_ext/bzlib_private.h +509 -0
  18. data/ext/isomorfeus_ferret_ext/bzlib_randtable.c +84 -0
  19. data/ext/isomorfeus_ferret_ext/fio_tmpfile.h +53 -53
  20. data/ext/isomorfeus_ferret_ext/frb_analysis.c +785 -1192
  21. data/ext/isomorfeus_ferret_ext/frb_index.c +492 -474
  22. data/ext/isomorfeus_ferret_ext/frb_qparser.c +48 -60
  23. data/ext/isomorfeus_ferret_ext/frb_search.c +1520 -1002
  24. data/ext/isomorfeus_ferret_ext/frb_store.c +96 -96
  25. data/ext/isomorfeus_ferret_ext/frb_threading.h +0 -1
  26. data/ext/isomorfeus_ferret_ext/frb_utils.c +147 -196
  27. data/ext/isomorfeus_ferret_ext/frt_analysis.c +695 -1090
  28. data/ext/isomorfeus_ferret_ext/frt_analysis.h +174 -170
  29. data/ext/isomorfeus_ferret_ext/frt_array.c +2 -4
  30. data/ext/isomorfeus_ferret_ext/frt_bitvector.c +9 -16
  31. data/ext/isomorfeus_ferret_ext/frt_bitvector.h +32 -81
  32. data/ext/isomorfeus_ferret_ext/frt_document.c +15 -20
  33. data/ext/isomorfeus_ferret_ext/frt_document.h +10 -10
  34. data/ext/isomorfeus_ferret_ext/frt_except.c +5 -12
  35. data/ext/isomorfeus_ferret_ext/frt_field_index.c +3 -3
  36. data/ext/isomorfeus_ferret_ext/frt_field_index.h +6 -7
  37. data/ext/isomorfeus_ferret_ext/frt_filter.c +35 -46
  38. data/ext/isomorfeus_ferret_ext/frt_fs_store.c +1 -0
  39. data/ext/isomorfeus_ferret_ext/frt_global.c +105 -63
  40. data/ext/isomorfeus_ferret_ext/frt_global.h +7 -3
  41. data/ext/isomorfeus_ferret_ext/frt_hash.c +1 -2
  42. data/ext/isomorfeus_ferret_ext/frt_ind.c +32 -35
  43. data/ext/isomorfeus_ferret_ext/frt_ind.h +9 -9
  44. data/ext/isomorfeus_ferret_ext/frt_index.c +580 -399
  45. data/ext/isomorfeus_ferret_ext/frt_index.h +272 -291
  46. data/ext/isomorfeus_ferret_ext/frt_mempool.c +1 -2
  47. data/ext/isomorfeus_ferret_ext/frt_multimapper.c +4 -7
  48. data/ext/isomorfeus_ferret_ext/frt_q_boolean.c +67 -91
  49. data/ext/isomorfeus_ferret_ext/frt_q_const_score.c +35 -38
  50. data/ext/isomorfeus_ferret_ext/frt_q_filtered_query.c +53 -72
  51. data/ext/isomorfeus_ferret_ext/frt_q_fuzzy.c +25 -32
  52. data/ext/isomorfeus_ferret_ext/frt_q_match_all.c +21 -23
  53. data/ext/isomorfeus_ferret_ext/frt_q_multi_term.c +66 -103
  54. data/ext/isomorfeus_ferret_ext/frt_q_parser.c +207 -195
  55. data/ext/isomorfeus_ferret_ext/frt_q_phrase.c +20 -16
  56. data/ext/isomorfeus_ferret_ext/frt_q_prefix.c +17 -14
  57. data/ext/isomorfeus_ferret_ext/frt_q_range.c +102 -131
  58. data/ext/isomorfeus_ferret_ext/frt_q_span.c +179 -178
  59. data/ext/isomorfeus_ferret_ext/frt_q_term.c +47 -60
  60. data/ext/isomorfeus_ferret_ext/frt_q_wildcard.c +18 -16
  61. data/ext/isomorfeus_ferret_ext/frt_ram_store.c +45 -84
  62. data/ext/isomorfeus_ferret_ext/frt_search.c +105 -146
  63. data/ext/isomorfeus_ferret_ext/frt_search.h +331 -320
  64. data/ext/isomorfeus_ferret_ext/frt_similarity.c +5 -13
  65. data/ext/isomorfeus_ferret_ext/frt_similarity.h +7 -12
  66. data/ext/isomorfeus_ferret_ext/frt_sort.c +105 -149
  67. data/ext/isomorfeus_ferret_ext/frt_store.c +13 -7
  68. data/ext/isomorfeus_ferret_ext/frt_store.h +10 -2
  69. data/ext/isomorfeus_ferret_ext/frt_threading.h +0 -1
  70. data/ext/isomorfeus_ferret_ext/isomorfeus_ferret.c +21 -109
  71. data/ext/isomorfeus_ferret_ext/isomorfeus_ferret.h +2 -32
  72. data/ext/isomorfeus_ferret_ext/lz4.c +2495 -0
  73. data/ext/isomorfeus_ferret_ext/lz4.h +774 -0
  74. data/ext/isomorfeus_ferret_ext/lz4frame.c +1899 -0
  75. data/ext/isomorfeus_ferret_ext/lz4frame.h +623 -0
  76. data/ext/isomorfeus_ferret_ext/lz4hc.c +1615 -0
  77. data/ext/isomorfeus_ferret_ext/lz4hc.h +413 -0
  78. data/ext/isomorfeus_ferret_ext/lz4xxhash.c +1030 -0
  79. data/ext/isomorfeus_ferret_ext/lz4xxhash.h +328 -0
  80. data/ext/isomorfeus_ferret_ext/stem_modules.h +0 -86
  81. data/ext/isomorfeus_ferret_ext/test.c +1 -2
  82. data/ext/isomorfeus_ferret_ext/test_1710.c +11 -12
  83. data/ext/isomorfeus_ferret_ext/test_analysis.c +590 -583
  84. data/ext/isomorfeus_ferret_ext/test_compound_io.c +1 -1
  85. data/ext/isomorfeus_ferret_ext/test_document.c +19 -15
  86. data/ext/isomorfeus_ferret_ext/test_except.c +1 -2
  87. data/ext/isomorfeus_ferret_ext/test_fields.c +59 -60
  88. data/ext/isomorfeus_ferret_ext/test_file_deleter.c +10 -27
  89. data/ext/isomorfeus_ferret_ext/test_filter.c +11 -8
  90. data/ext/isomorfeus_ferret_ext/test_hash.c +2 -2
  91. data/ext/isomorfeus_ferret_ext/test_hashset.c +1 -1
  92. data/ext/isomorfeus_ferret_ext/test_highlighter.c +15 -11
  93. data/ext/isomorfeus_ferret_ext/test_index.c +372 -365
  94. data/ext/isomorfeus_ferret_ext/test_q_const_score.c +5 -3
  95. data/ext/isomorfeus_ferret_ext/test_q_filtered.c +5 -3
  96. data/ext/isomorfeus_ferret_ext/test_q_fuzzy.c +13 -10
  97. data/ext/isomorfeus_ferret_ext/test_q_parser.c +45 -7
  98. data/ext/isomorfeus_ferret_ext/test_q_span.c +15 -12
  99. data/ext/isomorfeus_ferret_ext/test_ram_store.c +3 -3
  100. data/ext/isomorfeus_ferret_ext/test_search.c +60 -62
  101. data/ext/isomorfeus_ferret_ext/test_segments.c +5 -4
  102. data/ext/isomorfeus_ferret_ext/test_sort.c +17 -14
  103. data/ext/isomorfeus_ferret_ext/test_store.c +2 -0
  104. data/ext/isomorfeus_ferret_ext/test_term.c +3 -1
  105. data/ext/isomorfeus_ferret_ext/test_term_vectors.c +9 -10
  106. data/ext/isomorfeus_ferret_ext/test_test.c +1 -2
  107. data/ext/isomorfeus_ferret_ext/test_threading.c +9 -10
  108. data/ext/isomorfeus_ferret_ext/testhelper.c +1 -2
  109. data/lib/isomorfeus/ferret/version.rb +1 -1
  110. metadata +27 -57
  111. data/ext/isomorfeus_ferret_ext/email.rl +0 -21
  112. data/ext/isomorfeus_ferret_ext/frt_scanner.c +0 -900
  113. data/ext/isomorfeus_ferret_ext/frt_scanner.h +0 -28
  114. data/ext/isomorfeus_ferret_ext/frt_scanner_mb.c +0 -6706
  115. data/ext/isomorfeus_ferret_ext/frt_scanner_utf8.c +0 -4420
  116. data/ext/isomorfeus_ferret_ext/scanner.h +0 -28
  117. data/ext/isomorfeus_ferret_ext/scanner.in +0 -43
  118. data/ext/isomorfeus_ferret_ext/scanner.rl +0 -84
  119. data/ext/isomorfeus_ferret_ext/scanner_mb.rl +0 -200
  120. data/ext/isomorfeus_ferret_ext/scanner_utf8.rl +0 -85
  121. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_basque.c +0 -1167
  122. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_basque.h +0 -6
  123. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_catalan.c +0 -1433
  124. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_catalan.h +0 -6
  125. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.c +0 -301
  126. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.h +0 -6
  127. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.c +0 -590
  128. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.h +0 -6
  129. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.c +0 -1049
  130. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.h +0 -6
  131. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.c +0 -705
  132. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.h +0 -6
  133. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.c +0 -1239
  134. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.h +0 -6
  135. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.c +0 -477
  136. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.h +0 -6
  137. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.c +0 -1217
  138. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.h +0 -7
  139. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_indonesian.c +0 -394
  140. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_indonesian.h +0 -6
  141. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_irish.c +0 -457
  142. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_irish.h +0 -6
  143. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.c +0 -1009
  144. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.h +0 -6
  145. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.c +0 -259
  146. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.h +0 -6
  147. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.c +0 -704
  148. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.h +0 -6
  149. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.c +0 -948
  150. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.h +0 -6
  151. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.c +0 -1028
  152. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.h +0 -6
  153. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.c +0 -275
  154. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.h +0 -6
  155. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_hungarian.c +0 -849
  156. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_hungarian.h +0 -6
  157. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.c +0 -952
  158. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.h +0 -6
  159. data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.c +0 -669
  160. data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.h +0 -6
  161. data/ext/isomorfeus_ferret_ext/stem_modules.txt +0 -63
  162. data/ext/isomorfeus_ferret_ext/uchar-ucs4.rl +0 -1854
  163. data/ext/isomorfeus_ferret_ext/uchar-utf8.rl +0 -1999
  164. data/ext/isomorfeus_ferret_ext/url.rl +0 -27
@@ -1,28 +0,0 @@
1
- #ifndef FRT_SCANNER_H
2
- #define FRT_SCANNER_H
3
-
4
- /*
5
- * Scan +in+ and copy the token into +out+, up to +out_size+ bytes.
6
- * The +start+ and +end+ are pointers to the original untouched token
7
- * somewhere inside +in+. This token may not always be copied
8
- * verbatim into +out+. For example, the http://google.com token will
9
- * be truncated down to just google.com during the copy.
10
- * +token_length+ is the size of the resulting token.
11
- */
12
- void frt_std_scan(const char *in,
13
- char *out, size_t out_size,
14
- const char **start, const char **end,
15
- int *token_length);
16
-
17
-
18
- void frt_std_scan_mb(const char *in,
19
- char *out, size_t out_size,
20
- const char **start, const char **end,
21
- int *token_length);
22
-
23
- void frt_std_scan_utf8(const char *in,
24
- char *out, size_t out_size,
25
- const char **start, const char **end,
26
- int *token_length);
27
-
28
- #endif /* FRT_SCANNER_H */
@@ -1,43 +0,0 @@
1
- #// scanner.in -*-C-*-
2
-
3
- %%{
4
- machine StdTok;
5
- include URL "url.rl";
6
- include Email "email.rl";
7
-
8
- token = frt_alpha frt_alnum*;
9
-
10
- frt_tokenizer := |*
11
- #// Email
12
- email { RET; };
13
-
14
- #// Token, or token with possessive
15
- token { RET; };
16
- token [\'] { trunc = 1; RET; };
17
- token [\'][sS] { trunc = 2; RET; };
18
-
19
- #// contractions
20
- frt_alpha+ [\'] frt_alpha+ { RET; };
21
-
22
- #// Token with hyphens
23
- frt_alnum+ ([\-_] frt_alnum+)* { RET; };
24
-
25
- #// Company name
26
- token [\&\@] token* { RET; };
27
-
28
- #// URL
29
- url { RET; };
30
-
31
- #// Acronym
32
- (frt_alpha '.')+ frt_alpha { STRIP('.'); };
33
-
34
- #// Int+float
35
- [\-\+]?frt_digit+ { RET; };
36
- [\-\+]?frt_digit+ '.' frt_digit+ { RET; };
37
-
38
- #// Ignore whitespace and other crap
39
- 0 { return; };
40
- (any - frt_alnum) {};
41
-
42
- *|;
43
- }%%
@@ -1,84 +0,0 @@
1
- /* scanner.rl -*-C-*- */
2
- #include <ctype.h>
3
- #include <stdio.h>
4
- #include <stdlib.h>
5
- #include <string.h>
6
- #include <unistd.h>
7
- #include "frt_global.h"
8
-
9
- #define RET goto ret;
10
-
11
- #define STRIP(c) do { \
12
- strip_char = c; \
13
- goto ret; \
14
- } while(0)
15
-
16
- %%{
17
- machine StdTok;
18
- alphtype unsigned char;
19
-
20
- frt_alpha = alpha;
21
- frt_alnum = alnum;
22
- frt_digit = digit;
23
-
24
- include StdTok "scanner.in";
25
-
26
- main := any @{ fhold; fcall frt_tokenizer; };
27
- }%%
28
-
29
- %% write data nofinal;
30
-
31
- void frt_std_scan(const char *in,
32
- char *out, size_t out_size,
33
- const char **start,
34
- const char **end,
35
- int *token_size)
36
- {
37
- int cs, act, top;
38
- int stack[32];
39
- char *ts = 0, *te = 0;
40
-
41
- %% write init;
42
-
43
- char *p = (char *)in, *pe = 0, *eof = pe;
44
- int skip = 0;
45
- int trunc = 0;
46
- char strip_char = 0;
47
-
48
- *end = 0;
49
- *start = 0;
50
- *token_size = 0;
51
-
52
- %% write exec;
53
-
54
- if ( cs == StdTok_error )
55
- fprintf(stderr, "PARSE ERROR\n" );
56
- else if ( ts ) fprintf(stderr, "STUFF LEFT: '%s'\n", ts);
57
- return;
58
-
59
- ret:
60
- {
61
- size_t __len = te - ts - skip - trunc;
62
- if (__len > out_size)
63
- __len = out_size;
64
-
65
- *start = ts;
66
- *end = te;
67
-
68
- if (strip_char) {
69
- char *__p = ts + skip;
70
- char *__o = out;
71
- for (; __p < (ts + skip + __len); ++__p) {
72
- if (*__p != strip_char)
73
- *__o++ = *__p;
74
- }
75
- *token_size = __o - out;
76
- }
77
- else {
78
- memcpy(out, ts + skip, __len);
79
- *token_size = __len;
80
- }
81
-
82
- out[*token_size] = 0;
83
- }
84
- }
@@ -1,200 +0,0 @@
1
- /* scanner_mb.rl -*-C-*- */
2
- #include <ctype.h>
3
- #include <stdio.h>
4
- #include <stdlib.h>
5
- #include <string.h>
6
- #include <unistd.h>
7
- #include <wchar.h>
8
- #include <wctype.h>
9
- #include "frt_global.h"
10
-
11
- #define RET goto ret;
12
-
13
- #define STRIP(c) do { \
14
- strip_char = c; \
15
- goto ret; \
16
- } while(0)
17
-
18
- %%{
19
- machine StdTok;
20
- alphtype unsigned int;
21
- include WChar "uchar-ucs4.rl";
22
-
23
- frt_alpha = ualpha;
24
- frt_alnum = ualnum;
25
- frt_digit = udigit;
26
-
27
- include StdTok "scanner.in";
28
-
29
- main := any @{ fhold; fcall frt_tokenizer; };
30
- }%%
31
-
32
- %% write data nofinal;
33
-
34
- static const char *position_in_mb( const unsigned int *orig_wc,
35
- const char *orig_mb,
36
- const unsigned int *curr_wc )
37
- {
38
- const char *mb = orig_mb;
39
- const unsigned int *wc = orig_wc;
40
-
41
- while (wc < curr_wc)
42
- {
43
- char buf[MB_LEN_MAX];
44
- mb += wctomb(buf, *wc);
45
- ++wc;
46
- }
47
-
48
- return mb;
49
- }
50
-
51
- static int mb_next_char(unsigned int *wchr, const char *s, mbstate_t *state)
52
- {
53
- int num_bytes;
54
- if ((num_bytes = (int)mbrtowc((wchar_t*)wchr, s, MB_CUR_MAX, state)) < 0) {
55
- const char *t = s;
56
- do {
57
- t++;
58
- FRT_ZEROSET(state, mbstate_t);
59
- num_bytes = (int)mbrtowc((wchar_t*)wchr, t, MB_CUR_MAX, state);
60
- } while ((num_bytes < 0) && (*t != 0));
61
- num_bytes = t - s;
62
- if (*t == 0) *wchr = 0;
63
- }
64
- return num_bytes;
65
- }
66
-
67
- static int wc_next_char(char *s, const unsigned int *wchr, mbstate_t *state)
68
- {
69
- return (int)wcrtomb(s, *wchr, state);
70
- }
71
-
72
- /*
73
- * All input to Ragel must be in a buffer of 32bit unicode codepoints.
74
- * To that end, we require that the input to the scanner must be in a
75
- * codepage that mbtowc will convert to unicode codepoints. The easy
76
- * way to do this is to supply the scanner with UTF8. It will call
77
- * mbtowc on the buffer, pass it to the tokenizer which will extract
78
- * one token out. This token will then be converted back with wctomb.
79
- *
80
- * frt_std_scan_mb takes in a pointer to the mb buffer +inmb+ that has max
81
- * size +in_size+. The resulting token will be stored in +outmb+,
82
- * with at most +out_size+ bytes written.
83
- *
84
- * While tokenizing, part of the token may be stripped out. Eg,
85
- * 'foo!!' will become 'foo', and 'http://www.bar.com' will become
86
- * 'www.bar.com'. So that the caller can track what exactly has been
87
- * tokenized in +inmb+, +startmb+ and +endmb+ are set to point to the
88
- * extremities of the original unmodified token in +inmb+.
89
- *
90
- * The size of the token written out to +outmb+ is stored in
91
- * +token_size+.
92
- */
93
-
94
- static void mb_to_wc(const char *in,
95
- unsigned int *out, size_t out_size)
96
- {
97
- mbstate_t state;
98
- const char *in_p = in;
99
- unsigned int *out_p = out;
100
- FRT_ZEROSET(&state, mbstate_t);
101
-
102
- while (*in_p && out_p < (out + out_size/sizeof(*out)))
103
- {
104
- /* We can break out early here on, say, a space XXX */
105
-
106
- int n = mb_next_char(out_p, in_p, &state);
107
- if (n < 0)
108
- {
109
- ++in_p;
110
- continue;
111
- }
112
-
113
- in_p += n;
114
- ++out_p;
115
- }
116
- }
117
-
118
- static void wc_to_mb(char *out, size_t out_size, int *token_size,
119
- const unsigned int *in_wc, size_t in_wc_size)
120
- {
121
- mbstate_t state;
122
- char *out_p = out;
123
- const unsigned int *in_wc_p = in_wc;
124
- FRT_ZEROSET(&state, mbstate_t);
125
- *token_size = 0;
126
-
127
- while (out_p < (out + out_size) && in_wc_p < (in_wc + in_wc_size))
128
- {
129
- if (!*in_wc_p)
130
- break;
131
-
132
- int n = wc_next_char(out_p, in_wc_p, &state);
133
- if (n < 0)
134
- {
135
- ++in_wc_p;
136
- continue;
137
- }
138
-
139
- out_p += n;
140
- ++in_wc_p;
141
- }
142
-
143
- *token_size = out_p - out;
144
- }
145
-
146
- void frt_std_scan_mb(const char *in_mb,
147
- char *out_mb, size_t out_mb_size,
148
- const char **start_mb,
149
- const char **end_mb,
150
- int *token_size)
151
- {
152
- int cs, act, top;
153
- int stack[32];
154
- unsigned int *ts = 0, *te = 0;
155
-
156
- %% write init;
157
-
158
- unsigned int in_wc[4096] = {0};
159
- mb_to_wc(in_mb, in_wc, sizeof(in_wc));
160
-
161
- unsigned int *p = in_wc, *pe = 0, *eof = pe;
162
- int skip = 0;
163
- int trunc = 0;
164
- unsigned int strip_char = 0;
165
-
166
- *end_mb = 0;
167
- *start_mb = 0;
168
- *token_size = 0;
169
-
170
- %% write exec;
171
-
172
- if ( cs == StdTok_error )
173
- fwprintf(stderr, L"PARSE ERROR\n");
174
- else if ( ts ) fwprintf(stderr, L"STUFF LEFT: '%ls'\n", ts);
175
- return;
176
-
177
- ret:
178
- {
179
- unsigned int out_wc[4096] = {0};
180
- size_t __len = te - ts - skip - trunc;
181
-
182
- *start_mb = position_in_mb(in_wc, in_mb, ts);
183
- *end_mb = position_in_mb(in_wc, in_mb, te);
184
-
185
- if (strip_char) {
186
- unsigned int *__p = ts + skip;
187
- unsigned int *__o = out_wc;
188
- for (; __p < (ts + skip + __len); ++__p) {
189
- if (*__p != strip_char)
190
- *__o++ = *__p;
191
- }
192
- }
193
- else {
194
- memcpy(out_wc, ts + skip, __len*sizeof(unsigned int));
195
- }
196
-
197
- wc_to_mb(out_mb, out_mb_size, token_size, out_wc, sizeof(out_wc));
198
- out_mb[*token_size] = 0;
199
- }
200
- }
@@ -1,85 +0,0 @@
1
- /* scanner_utf8.rl -*-C-*- */
2
- #include <ctype.h>
3
- #include <stdio.h>
4
- #include <stdlib.h>
5
- #include <string.h>
6
- #include <unistd.h>
7
- #include "frt_global.h"
8
-
9
- #define RET goto ret;
10
-
11
- #define STRIP(c) do { \
12
- strip_char = c; \
13
- goto ret; \
14
- } while(0)
15
-
16
- %%{
17
- machine StdTok;
18
- alphtype unsigned char;
19
- include UTF8 "uchar-utf8.rl";
20
-
21
- frt_alpha = ualpha;
22
- frt_alnum = ualnum;
23
- frt_digit = udigit;
24
-
25
- include StdTok "scanner.in";
26
-
27
- main := any @{ fhold; fcall frt_tokenizer; };
28
- }%%
29
-
30
- %% write data nofinal;
31
-
32
- void frt_std_scan_utf8(const char *in,
33
- char *out, size_t out_size,
34
- const char **start,
35
- const char **end,
36
- int *token_size)
37
- {
38
- int cs, act, top;
39
- int stack[32];
40
- char *ts = 0, *te = 0;
41
-
42
- %% write init;
43
-
44
- char *p = (char *)in, *pe = 0, *eof = pe;
45
- int skip = 0;
46
- int trunc = 0;
47
- char strip_char = 0;
48
-
49
- *end = 0;
50
- *start = 0;
51
- *token_size = 0;
52
-
53
- %% write exec;
54
-
55
- if ( cs == StdTok_error )
56
- fprintf(stderr, "PARSE ERROR\n" );
57
- else if ( ts ) fprintf(stderr, "STUFF LEFT: '%s'\n", ts);
58
- return;
59
-
60
- ret:
61
- {
62
- size_t __len = te - ts - skip - trunc;
63
- if (__len > out_size)
64
- __len = out_size;
65
-
66
- *start = ts;
67
- *end = te;
68
-
69
- if (strip_char) {
70
- char *__p = ts + skip;
71
- char *__o = out;
72
- for (; __p < (ts + skip + __len); ++__p) {
73
- if (*__p != strip_char)
74
- *__o++ = *__p;
75
- }
76
- *token_size = __o - out;
77
- }
78
- else {
79
- memcpy(out, ts + skip, __len);
80
- *token_size = __len;
81
- }
82
-
83
- out[*token_size] = 0;
84
- }
85
- }