sdsykes-ferret 0.11.6.19

Sign up to get free protection for your applications and to get access to all the features.
Files changed (195) hide show
  1. data/CHANGELOG +24 -0
  2. data/MIT-LICENSE +20 -0
  3. data/README +102 -0
  4. data/Rakefile +338 -0
  5. data/TODO +17 -0
  6. data/TUTORIAL +231 -0
  7. data/bin/ferret-browser +79 -0
  8. data/ext/analysis.c +1555 -0
  9. data/ext/analysis.h +219 -0
  10. data/ext/api.c +69 -0
  11. data/ext/api.h +27 -0
  12. data/ext/array.c +123 -0
  13. data/ext/array.h +53 -0
  14. data/ext/bitvector.c +540 -0
  15. data/ext/bitvector.h +272 -0
  16. data/ext/compound_io.c +383 -0
  17. data/ext/config.h +42 -0
  18. data/ext/document.c +156 -0
  19. data/ext/document.h +53 -0
  20. data/ext/except.c +120 -0
  21. data/ext/except.h +168 -0
  22. data/ext/extconf.rb +14 -0
  23. data/ext/ferret.c +402 -0
  24. data/ext/ferret.h +91 -0
  25. data/ext/filter.c +156 -0
  26. data/ext/fs_store.c +483 -0
  27. data/ext/global.c +418 -0
  28. data/ext/global.h +117 -0
  29. data/ext/hash.c +567 -0
  30. data/ext/hash.h +473 -0
  31. data/ext/hashset.c +170 -0
  32. data/ext/hashset.h +187 -0
  33. data/ext/header.h +58 -0
  34. data/ext/helper.c +62 -0
  35. data/ext/helper.h +13 -0
  36. data/ext/inc/lang.h +48 -0
  37. data/ext/inc/threading.h +31 -0
  38. data/ext/index.c +6425 -0
  39. data/ext/index.h +961 -0
  40. data/ext/lang.h +66 -0
  41. data/ext/libstemmer.c +92 -0
  42. data/ext/libstemmer.h +79 -0
  43. data/ext/mempool.c +87 -0
  44. data/ext/mempool.h +35 -0
  45. data/ext/modules.h +162 -0
  46. data/ext/multimapper.c +310 -0
  47. data/ext/multimapper.h +51 -0
  48. data/ext/posh.c +1006 -0
  49. data/ext/posh.h +1007 -0
  50. data/ext/priorityqueue.c +151 -0
  51. data/ext/priorityqueue.h +143 -0
  52. data/ext/q_boolean.c +1608 -0
  53. data/ext/q_const_score.c +161 -0
  54. data/ext/q_filtered_query.c +209 -0
  55. data/ext/q_fuzzy.c +268 -0
  56. data/ext/q_match_all.c +148 -0
  57. data/ext/q_multi_term.c +677 -0
  58. data/ext/q_parser.c +2825 -0
  59. data/ext/q_phrase.c +1126 -0
  60. data/ext/q_prefix.c +100 -0
  61. data/ext/q_range.c +350 -0
  62. data/ext/q_span.c +2402 -0
  63. data/ext/q_term.c +337 -0
  64. data/ext/q_wildcard.c +171 -0
  65. data/ext/r_analysis.c +2575 -0
  66. data/ext/r_index.c +3472 -0
  67. data/ext/r_qparser.c +585 -0
  68. data/ext/r_search.c +4105 -0
  69. data/ext/r_store.c +513 -0
  70. data/ext/r_utils.c +963 -0
  71. data/ext/ram_store.c +471 -0
  72. data/ext/search.c +1741 -0
  73. data/ext/search.h +885 -0
  74. data/ext/similarity.c +150 -0
  75. data/ext/similarity.h +82 -0
  76. data/ext/sort.c +983 -0
  77. data/ext/stem_ISO_8859_1_danish.c +338 -0
  78. data/ext/stem_ISO_8859_1_danish.h +16 -0
  79. data/ext/stem_ISO_8859_1_dutch.c +635 -0
  80. data/ext/stem_ISO_8859_1_dutch.h +16 -0
  81. data/ext/stem_ISO_8859_1_english.c +1156 -0
  82. data/ext/stem_ISO_8859_1_english.h +16 -0
  83. data/ext/stem_ISO_8859_1_finnish.c +792 -0
  84. data/ext/stem_ISO_8859_1_finnish.h +16 -0
  85. data/ext/stem_ISO_8859_1_french.c +1276 -0
  86. data/ext/stem_ISO_8859_1_french.h +16 -0
  87. data/ext/stem_ISO_8859_1_german.c +512 -0
  88. data/ext/stem_ISO_8859_1_german.h +16 -0
  89. data/ext/stem_ISO_8859_1_italian.c +1091 -0
  90. data/ext/stem_ISO_8859_1_italian.h +16 -0
  91. data/ext/stem_ISO_8859_1_norwegian.c +296 -0
  92. data/ext/stem_ISO_8859_1_norwegian.h +16 -0
  93. data/ext/stem_ISO_8859_1_porter.c +776 -0
  94. data/ext/stem_ISO_8859_1_porter.h +16 -0
  95. data/ext/stem_ISO_8859_1_portuguese.c +1035 -0
  96. data/ext/stem_ISO_8859_1_portuguese.h +16 -0
  97. data/ext/stem_ISO_8859_1_spanish.c +1119 -0
  98. data/ext/stem_ISO_8859_1_spanish.h +16 -0
  99. data/ext/stem_ISO_8859_1_swedish.c +307 -0
  100. data/ext/stem_ISO_8859_1_swedish.h +16 -0
  101. data/ext/stem_KOI8_R_russian.c +701 -0
  102. data/ext/stem_KOI8_R_russian.h +16 -0
  103. data/ext/stem_UTF_8_danish.c +344 -0
  104. data/ext/stem_UTF_8_danish.h +16 -0
  105. data/ext/stem_UTF_8_dutch.c +653 -0
  106. data/ext/stem_UTF_8_dutch.h +16 -0
  107. data/ext/stem_UTF_8_english.c +1176 -0
  108. data/ext/stem_UTF_8_english.h +16 -0
  109. data/ext/stem_UTF_8_finnish.c +808 -0
  110. data/ext/stem_UTF_8_finnish.h +16 -0
  111. data/ext/stem_UTF_8_french.c +1296 -0
  112. data/ext/stem_UTF_8_french.h +16 -0
  113. data/ext/stem_UTF_8_german.c +526 -0
  114. data/ext/stem_UTF_8_german.h +16 -0
  115. data/ext/stem_UTF_8_italian.c +1113 -0
  116. data/ext/stem_UTF_8_italian.h +16 -0
  117. data/ext/stem_UTF_8_norwegian.c +302 -0
  118. data/ext/stem_UTF_8_norwegian.h +16 -0
  119. data/ext/stem_UTF_8_porter.c +794 -0
  120. data/ext/stem_UTF_8_porter.h +16 -0
  121. data/ext/stem_UTF_8_portuguese.c +1055 -0
  122. data/ext/stem_UTF_8_portuguese.h +16 -0
  123. data/ext/stem_UTF_8_russian.c +709 -0
  124. data/ext/stem_UTF_8_russian.h +16 -0
  125. data/ext/stem_UTF_8_spanish.c +1137 -0
  126. data/ext/stem_UTF_8_spanish.h +16 -0
  127. data/ext/stem_UTF_8_swedish.c +313 -0
  128. data/ext/stem_UTF_8_swedish.h +16 -0
  129. data/ext/stopwords.c +401 -0
  130. data/ext/store.c +692 -0
  131. data/ext/store.h +777 -0
  132. data/ext/term_vectors.c +352 -0
  133. data/ext/threading.h +31 -0
  134. data/ext/utilities.c +446 -0
  135. data/ext/win32.h +54 -0
  136. data/lib/ferret.rb +29 -0
  137. data/lib/ferret/browser.rb +246 -0
  138. data/lib/ferret/browser/s/global.js +192 -0
  139. data/lib/ferret/browser/s/style.css +148 -0
  140. data/lib/ferret/browser/views/document/list.rhtml +49 -0
  141. data/lib/ferret/browser/views/document/show.rhtml +27 -0
  142. data/lib/ferret/browser/views/error/index.rhtml +7 -0
  143. data/lib/ferret/browser/views/help/index.rhtml +8 -0
  144. data/lib/ferret/browser/views/home/index.rhtml +29 -0
  145. data/lib/ferret/browser/views/layout.rhtml +22 -0
  146. data/lib/ferret/browser/views/term-vector/index.rhtml +4 -0
  147. data/lib/ferret/browser/views/term/index.rhtml +199 -0
  148. data/lib/ferret/browser/views/term/termdocs.rhtml +1 -0
  149. data/lib/ferret/browser/webrick.rb +14 -0
  150. data/lib/ferret/document.rb +130 -0
  151. data/lib/ferret/field_infos.rb +44 -0
  152. data/lib/ferret/index.rb +786 -0
  153. data/lib/ferret/number_tools.rb +157 -0
  154. data/lib/ferret_version.rb +3 -0
  155. data/setup.rb +1555 -0
  156. data/test/test_all.rb +5 -0
  157. data/test/test_helper.rb +24 -0
  158. data/test/threading/number_to_spoken.rb +132 -0
  159. data/test/threading/thread_safety_index_test.rb +79 -0
  160. data/test/threading/thread_safety_read_write_test.rb +76 -0
  161. data/test/threading/thread_safety_test.rb +133 -0
  162. data/test/unit/analysis/tc_analyzer.rb +548 -0
  163. data/test/unit/analysis/tc_token_stream.rb +646 -0
  164. data/test/unit/index/tc_index.rb +762 -0
  165. data/test/unit/index/tc_index_reader.rb +699 -0
  166. data/test/unit/index/tc_index_writer.rb +437 -0
  167. data/test/unit/index/th_doc.rb +315 -0
  168. data/test/unit/largefile/tc_largefile.rb +46 -0
  169. data/test/unit/query_parser/tc_query_parser.rb +238 -0
  170. data/test/unit/search/tc_filter.rb +135 -0
  171. data/test/unit/search/tc_fuzzy_query.rb +147 -0
  172. data/test/unit/search/tc_index_searcher.rb +61 -0
  173. data/test/unit/search/tc_multi_searcher.rb +128 -0
  174. data/test/unit/search/tc_multiple_search_requests.rb +58 -0
  175. data/test/unit/search/tc_search_and_sort.rb +179 -0
  176. data/test/unit/search/tc_sort.rb +49 -0
  177. data/test/unit/search/tc_sort_field.rb +27 -0
  178. data/test/unit/search/tc_spans.rb +190 -0
  179. data/test/unit/search/tm_searcher.rb +384 -0
  180. data/test/unit/store/tc_fs_store.rb +77 -0
  181. data/test/unit/store/tc_ram_store.rb +35 -0
  182. data/test/unit/store/tm_store.rb +34 -0
  183. data/test/unit/store/tm_store_lock.rb +68 -0
  184. data/test/unit/tc_document.rb +81 -0
  185. data/test/unit/ts_analysis.rb +2 -0
  186. data/test/unit/ts_index.rb +2 -0
  187. data/test/unit/ts_largefile.rb +4 -0
  188. data/test/unit/ts_query_parser.rb +2 -0
  189. data/test/unit/ts_search.rb +2 -0
  190. data/test/unit/ts_store.rb +2 -0
  191. data/test/unit/ts_utils.rb +2 -0
  192. data/test/unit/utils/tc_bit_vector.rb +295 -0
  193. data/test/unit/utils/tc_number_tools.rb +117 -0
  194. data/test/unit/utils/tc_priority_queue.rb +106 -0
  195. metadata +285 -0
@@ -0,0 +1,352 @@
1
+ #include <string.h>
2
+ #include "index.h"
3
+ #include "array.h"
4
+ #include "helper.h"
5
+
6
+ /****************************************************************************
7
+ *
8
+ * TermVector
9
+ *
10
+ ****************************************************************************/
11
+
12
+ void tv_destroy(TermVector *tv)
13
+ {
14
+ int i = tv->term_cnt;
15
+ while (i > 0) {
16
+ i--;
17
+ free(tv->terms[i].text);
18
+ free(tv->terms[i].positions);
19
+ }
20
+ free(tv->offsets);
21
+ free(tv->field);
22
+ free(tv->terms);
23
+ free(tv);
24
+ }
25
+
26
+ int tv_get_tv_term_index(TermVector *tv, const char *term)
27
+ {
28
+ int lo = 0; /* search starts array */
29
+ int hi = tv->term_cnt - 1; /* for 1st element < n, return its index */
30
+ int mid;
31
+ int cmp;
32
+ char *mid_term;
33
+
34
+ while (hi >= lo) {
35
+ mid = (lo + hi) >> 1;
36
+ mid_term = tv->terms[mid].text;
37
+ cmp = strcmp(term, mid_term);
38
+ if (cmp < 0) {
39
+ hi = mid - 1;
40
+ }
41
+ else if (cmp > 0) {
42
+ lo = mid + 1;
43
+ }
44
+ else { /* found a match */
45
+ return mid;
46
+ }
47
+ }
48
+ if (hi >= 0 && strcmp(term, tv->terms[hi].text) == 0) {
49
+ return hi;
50
+ }
51
+ else {
52
+ return -1;
53
+ }
54
+ return hi;
55
+ }
56
+
57
+ extern TVTerm *tv_get_tv_term(TermVector *tv, const char *term)
58
+ {
59
+ int index = tv_get_tv_term_index(tv, term);
60
+ if (index >= 0) {
61
+ return &(tv->terms[index]);
62
+ }
63
+ else {
64
+ return NULL;
65
+ }
66
+ }
67
+
68
+ /****************************************************************************
69
+ *
70
+ * TermVectorsReader
71
+ *
72
+ ****************************************************************************/
73
+
74
+ TermVectorsReader *tvr_open(Store *store,
75
+ const char *segment,
76
+ FieldInfos *fis)
77
+ {
78
+ TermVectorsReader *tvr = ALLOC(TermVectorsReader);
79
+ char file_name[SEGMENT_NAME_MAX_LENGTH];
80
+
81
+ tvr->fis = fis;
82
+ sprintf(file_name, "%s.tvx", segment);
83
+ tvr->tvx_in = store->open_input(store, file_name);
84
+ tvr->size = is_length(tvr->tvx_in) / 12;
85
+
86
+ sprintf(file_name, "%s.tvd", segment);
87
+ tvr->tvd_in = store->open_input(store, file_name);
88
+ return tvr;
89
+ }
90
+
91
+ TermVectorsReader *tvr_clone(TermVectorsReader *orig)
92
+ {
93
+ TermVectorsReader *tvr = ALLOC(TermVectorsReader);
94
+
95
+ memcpy(tvr, orig, sizeof(TermVectorsReader));
96
+ tvr->tvx_in = is_clone(orig->tvx_in);
97
+ tvr->tvd_in = is_clone(orig->tvd_in);
98
+
99
+ return tvr;
100
+ }
101
+
102
+ void tvr_close(TermVectorsReader *tvr)
103
+ {
104
+ is_close(tvr->tvx_in);
105
+ is_close(tvr->tvd_in);
106
+ free(tvr);
107
+ }
108
+
109
+ TermVector *tvr_read_term_vector(TermVectorsReader *tvr, int field_num)
110
+ {
111
+ TermVector *tv = ALLOC_AND_ZERO(TermVector);
112
+ InStream *tvd_in = tvr->tvd_in;
113
+ FieldInfo *fi = tvr->fis->fields[field_num];
114
+ const int num_terms = is_read_vint(tvd_in);
115
+
116
+ tv->field_num = field_num;
117
+ tv->field = estrdup(fi->name);
118
+
119
+ if (num_terms > 0) {
120
+ int i, j, delta_start, delta_len, total_len, freq;
121
+ int store_positions = fi_store_positions(fi);
122
+ int store_offsets = fi_store_offsets(fi);
123
+ uchar buffer[MAX_WORD_SIZE];
124
+ TVTerm *term;
125
+
126
+ tv->term_cnt = num_terms;
127
+ tv->terms = ALLOC_AND_ZERO_N(TVTerm, num_terms);
128
+
129
+ for (i = 0; i < num_terms; i++) {
130
+ term = &(tv->terms[i]);
131
+ /* read delta encoded term */
132
+ delta_start = is_read_vint(tvd_in);
133
+ delta_len = is_read_vint(tvd_in);
134
+ total_len = delta_start + delta_len;
135
+ is_read_bytes(tvd_in, buffer + delta_start, delta_len);
136
+ buffer[total_len++] = '\0';
137
+ term->text = memcpy(ALLOC_N(char, total_len), buffer, total_len);
138
+
139
+ /* read freq */
140
+ freq = term->freq = is_read_vint(tvd_in);
141
+
142
+ /* read positions if necessary */
143
+ if (store_positions) {
144
+ int *positions = term->positions = ALLOC_N(int, freq);
145
+ int pos = 0;
146
+ for (j = 0; j < freq; j++) {
147
+ positions[j] = pos += is_read_vint(tvd_in);
148
+ }
149
+ }
150
+
151
+ /* read offsets if necessary */
152
+ }
153
+ if (store_offsets) {
154
+ int num_positions = tv->offset_cnt = is_read_vint(tvd_in);
155
+ Offset *offsets = tv->offsets = ALLOC_N(Offset, num_positions);
156
+ int offset = 0;
157
+ for (i = 0; i < num_positions; i++) {
158
+ offsets[i].start = offset += is_read_vint(tvd_in);
159
+ offsets[i].end = offset += is_read_vint(tvd_in);
160
+ }
161
+ }
162
+ }
163
+ return tv;
164
+ }
165
+
166
+ HashTable *tvr_get_tv(TermVectorsReader *tvr, int doc_num)
167
+ {
168
+ HashTable *term_vectors = h_new_str((free_ft)NULL, (free_ft)&tv_destroy);
169
+ int i;
170
+ InStream *tvx_in = tvr->tvx_in;
171
+ InStream *tvd_in = tvr->tvd_in;
172
+ off_t data_ptr, field_index_ptr;
173
+ int field_cnt;
174
+ int *field_nums;
175
+
176
+ if (doc_num >= 0 && doc_num < tvr->size) {
177
+ is_seek(tvx_in, 12 * doc_num);
178
+
179
+ data_ptr = (off_t)is_read_u64(tvx_in);
180
+ field_index_ptr = data_ptr + (off_t)is_read_u32(tvx_in);
181
+
182
+ /* scan fields to get position of field_num's term vector */
183
+ is_seek(tvd_in, field_index_ptr);
184
+
185
+ field_cnt = is_read_vint(tvd_in);
186
+ field_nums = ALLOC_N(int, field_cnt);
187
+
188
+ for (i = 0; i < field_cnt; i++) {
189
+ field_nums[i] = is_read_vint(tvd_in);
190
+ is_read_vint(tvd_in); /* skip space, we don't need it */
191
+ }
192
+ is_seek(tvd_in, data_ptr);
193
+
194
+ for (i = 0; i < field_cnt; i++) {
195
+ TermVector *tv = tvr_read_term_vector(tvr, field_nums[i]);
196
+ h_set(term_vectors, tv->field, tv);
197
+ }
198
+ free(field_nums);
199
+ }
200
+ return term_vectors;
201
+ }
202
+
203
+ TermVector *tvr_get_field_tv(TermVectorsReader *tvr,
204
+ int doc_num,
205
+ int field_num)
206
+ {
207
+ int i;
208
+ InStream *tvx_in = tvr->tvx_in;
209
+ InStream *tvd_in = tvr->tvd_in;
210
+ off_t data_ptr, field_index_ptr;
211
+ int field_cnt;
212
+ int offset = 0;
213
+ TermVector *tv = NULL;
214
+
215
+ if (doc_num >= 0 && doc_num < tvr->size) {
216
+ is_seek(tvx_in, 12 * doc_num);
217
+
218
+ data_ptr = (off_t)is_read_u64(tvx_in);
219
+ field_index_ptr = data_ptr + (off_t)is_read_u32(tvx_in);
220
+
221
+ /* scan fields to get position of field_num's term vector */
222
+ is_seek(tvd_in, field_index_ptr);
223
+
224
+ field_cnt = is_read_vint(tvd_in);
225
+ for (i = 0; i < field_cnt; i++) {
226
+ if ((int)is_read_vint(tvd_in) == field_num) {
227
+ break;
228
+ }
229
+ offset += is_read_vint(tvd_in); /* space taken by field */
230
+ }
231
+ if (i < field_cnt) {
232
+ /* field was found */
233
+ is_seek(tvd_in, data_ptr + offset);
234
+ tv = tvr_read_term_vector(tvr, field_num);
235
+ }
236
+ }
237
+ return tv;
238
+ }
239
+
240
+ /****************************************************************************
241
+ *
242
+ * TermVectorsWriter
243
+ *
244
+ ****************************************************************************/
245
+
246
+ TermVectorsWriter *tvw_open(Store *store, const char *segment, FieldInfos *fis)
247
+ {
248
+ TermVectorsWriter *tvw = ALLOC(TermVectorsWriter);
249
+ char file_name[SEGMENT_NAME_MAX_LENGTH];
250
+ tvw->fis = fis;
251
+ tvw->fields = ary_new_type_capa(TVField, TV_FIELD_INIT_CAPA);
252
+
253
+ snprintf(file_name, SEGMENT_NAME_MAX_LENGTH, "%s.tvx", segment);
254
+ tvw->tvx_out = store->new_output(store, file_name);
255
+
256
+ snprintf(file_name, SEGMENT_NAME_MAX_LENGTH, "%s.tvd", segment);
257
+ tvw->tvd_out = store->new_output(store, file_name);
258
+
259
+ return tvw;
260
+ }
261
+
262
+ void tvw_close(TermVectorsWriter *tvw)
263
+ {
264
+ os_close(tvw->tvx_out);
265
+ os_close(tvw->tvd_out);
266
+ ary_free(tvw->fields);
267
+ free(tvw);
268
+ }
269
+
270
+ void tvw_open_doc(TermVectorsWriter *tvw)
271
+ {
272
+ ary_size(tvw->fields) = 0;
273
+ tvw->tvd_ptr = os_pos(tvw->tvd_out);
274
+ os_write_u64(tvw->tvx_out, tvw->tvd_ptr);
275
+ }
276
+
277
+ void tvw_close_doc(TermVectorsWriter *tvw)
278
+ {
279
+ int i;
280
+ OutStream *tvd_out = tvw->tvd_out;
281
+ os_write_u32(tvw->tvx_out, (f_u32)(os_pos(tvw->tvd_out) - tvw->tvd_ptr));
282
+ os_write_vint(tvd_out, ary_size(tvw->fields));
283
+ for (i = 0; i < ary_size(tvw->fields); i++) {
284
+ os_write_vint(tvd_out, tvw->fields[i].field_num);
285
+ os_write_vint(tvd_out, tvw->fields[i].size);
286
+ }
287
+ }
288
+
289
+ void tvw_add_postings(TermVectorsWriter *tvw,
290
+ int field_num,
291
+ PostingList **plists,
292
+ int posting_count,
293
+ Offset *offsets,
294
+ int offset_count)
295
+ {
296
+ int i, delta_start, delta_length;
297
+ const char *last_term = EMPTY_STRING;
298
+ off_t tvd_start_pos = os_pos(tvw->tvd_out);
299
+ OutStream *tvd_out = tvw->tvd_out;
300
+ PostingList *plist;
301
+ Posting *posting;
302
+ Occurence *occ;
303
+ FieldInfo *fi = tvw->fis->fields[field_num];
304
+ int store_positions = fi_store_positions(fi);
305
+
306
+ ary_grow(tvw->fields);
307
+ ary_last(tvw->fields).field_num = field_num;
308
+
309
+ os_write_vint(tvd_out, posting_count);
310
+ for (i = 0; i < posting_count; i++) {
311
+ plist = plists[i];
312
+ posting = plist->last;
313
+ delta_start = hlp_string_diff(last_term, plist->term);
314
+ delta_length = plist->term_len - delta_start;
315
+
316
+ os_write_vint(tvd_out, delta_start); /* write shared prefix length */
317
+ os_write_vint(tvd_out, delta_length); /* write delta length */
318
+ /* write delta chars */
319
+ os_write_bytes(tvd_out,
320
+ (uchar *)(plist->term + delta_start),
321
+ delta_length);
322
+ os_write_vint(tvd_out, posting->freq);
323
+ last_term = plist->term;
324
+
325
+ if (store_positions) {
326
+ /* use delta encoding for positions */
327
+ int last_pos = 0;
328
+ for (occ = posting->first_occ; occ; occ = occ->next) {
329
+ os_write_vint(tvd_out, occ->pos - last_pos);
330
+ last_pos = occ->pos;
331
+ }
332
+ }
333
+
334
+ }
335
+
336
+ if (fi_store_offsets(fi)) {
337
+ /* use delta encoding for offsets */
338
+ int last_end = 0;
339
+ os_write_vint(tvd_out, offset_count); /* write shared prefix length */
340
+ for (i = 0; i < offset_count; i++) {
341
+ int start = offsets[i].start;
342
+ int end = offsets[i].end;
343
+ os_write_vint(tvd_out, start - last_end);
344
+ os_write_vint(tvd_out, end - start);
345
+ last_end = end;
346
+ }
347
+ }
348
+
349
+ ary_last(tvw->fields).size = os_pos(tvd_out) - tvd_start_pos;
350
+ }
351
+
352
+
@@ -0,0 +1,31 @@
1
+ #ifndef FRT_THREADING_H
2
+ #define FRT_THREADING_H
3
+
4
+ #include "hash.h"
5
+ #define UNTHREADED 1
6
+
7
+ typedef void * mutex_t;
8
+ typedef struct HashTable *thread_key_t;
9
+ typedef int thread_once_t;
10
+ #define MUTEX_INITIALIZER NULL
11
+ #define MUTEX_RECURSIVE_INITIALIZER NULL
12
+ #define THREAD_ONCE_INIT 1;
13
+ #define mutex_init(a, b)
14
+ #define mutex_lock(a)
15
+ #define mutex_trylock(a)
16
+ #define mutex_unlock(a)
17
+ #define mutex_destroy(a)
18
+ #define thread_key_create(a, b) frt_thread_key_create(a, b)
19
+ #define thread_key_delete(a) frt_thread_key_delete(a)
20
+ #define thread_setspecific(a, b) frt_thread_setspecific(a, b)
21
+ #define thread_getspecific(a) frt_thread_getspecific(a)
22
+ #define thread_exit(a)
23
+ #define thread_once(a, b) frt_thread_once(a, b)
24
+
25
+ void frt_thread_once(int *once_control, void (*init_routine)(void));
26
+ void frt_thread_key_create(thread_key_t *key, void (*destr_function)(void *));
27
+ void frt_thread_key_delete(thread_key_t key);
28
+ void frt_thread_setspecific(thread_key_t key, const void *pointer);
29
+ void *frt_thread_getspecific(thread_key_t key);
30
+
31
+ #endif
@@ -0,0 +1,446 @@
1
+
2
+ #include <stdio.h>
3
+ #include <stdlib.h>
4
+ #include <string.h>
5
+
6
+ #include "header.h"
7
+
8
+ #define unless(C) if(!(C))
9
+
10
+ #define CREATE_SIZE 1
11
+
12
+ extern symbol * create_s(void) {
13
+ symbol * p;
14
+ void * mem = malloc(HEAD + (CREATE_SIZE + 1) * sizeof(symbol));
15
+ if (mem == NULL) return NULL;
16
+ p = (symbol *) (HEAD + (char *) mem);
17
+ CAPACITY(p) = CREATE_SIZE;
18
+ SET_SIZE(p, CREATE_SIZE);
19
+ return p;
20
+ }
21
+
22
+ extern void lose_s(symbol * p) {
23
+ if (p == NULL) return;
24
+ free((char *) p - HEAD);
25
+ }
26
+
27
+ /*
28
+ new_p = X_skip_utf8(p, c, lb, l, n); skips n characters forwards from p + c
29
+ if n +ve, or n characters backwards from p +c - 1 if n -ve. new_p is the new
30
+ position, or -1 on failure.
31
+
32
+ -- used to implement hop and next in the utf8 case.
33
+ */
34
+
35
+ extern int skip_utf8(const symbol * p, int c, int lb, int l, int n) {
36
+ int b;
37
+ if (n >= 0) {
38
+ for (; n > 0; n--) {
39
+ if (c >= l) return -1;
40
+ b = p[c++];
41
+ if (b >= 0xC0) { /* 1100 0000 */
42
+ while (c < l) {
43
+ b = p[c];
44
+ if (b >= 0xC0 || b < 0x80) break;
45
+ /* break unless b is 10------ */
46
+ c++;
47
+ }
48
+ }
49
+ }
50
+ } else {
51
+ for (; n < 0; n++) {
52
+ if (c <= lb) return -1;
53
+ b = p[--c];
54
+ if (b >= 0x80) { /* 1000 0000 */
55
+ while (c > lb) {
56
+ b = p[c];
57
+ if (b >= 0xC0) break; /* 1100 0000 */
58
+ c--;
59
+ }
60
+ }
61
+ }
62
+ }
63
+ return c;
64
+ }
65
+
66
+ /* Code for character groupings: utf8 cases */
67
+
68
+ static int get_utf8(const symbol * p, int c, int l, int * slot) {
69
+ int b0, b1;
70
+ if (c >= l) return 0;
71
+ b0 = p[c++];
72
+ if (b0 < 0xC0 || c == l) { /* 1100 0000 */
73
+ * slot = b0; return 1;
74
+ }
75
+ b1 = p[c++];
76
+ if (b0 < 0xE0 || c == l) { /* 1110 0000 */
77
+ * slot = (b0 & 0x1F) << 6 | (b1 & 0x3F); return 2;
78
+ }
79
+ * slot = (b0 & 0xF) << 12 | (b1 & 0x3F) << 6 | (*p & 0x3F); return 3;
80
+ }
81
+
82
+ static int get_b_utf8(const symbol * p, int c, int lb, int * slot) {
83
+ int b0, b1;
84
+ if (c <= lb) return 0;
85
+ b0 = p[--c];
86
+ if (b0 < 0x80 || c == lb) { /* 1000 0000 */
87
+ * slot = b0; return 1;
88
+ }
89
+ b1 = p[--c];
90
+ if (b1 >= 0xC0 || c == lb) { /* 1100 0000 */
91
+ * slot = (b1 & 0x1F) << 6 | (b0 & 0x3F); return 2;
92
+ }
93
+ * slot = (*p & 0xF) << 12 | (b1 & 0x3F) << 6 | (b0 & 0x3F); return 3;
94
+ }
95
+
96
+ extern int in_grouping_U(struct SN_env * z, unsigned char * s, int min, int max) {
97
+ int ch;
98
+ int w = get_utf8(z->p, z->c, z->l, & ch);
99
+ unless (w) return 0;
100
+ if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return 0;
101
+ z->c += w; return 1;
102
+ }
103
+
104
+ extern int in_grouping_b_U(struct SN_env * z, unsigned char * s, int min, int max) {
105
+ int ch;
106
+ int w = get_b_utf8(z->p, z->c, z->lb, & ch);
107
+ unless (w) return 0;
108
+ if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return 0;
109
+ z->c -= w; return 1;
110
+ }
111
+
112
+ extern int out_grouping_U(struct SN_env * z, unsigned char * s, int min, int max) {
113
+ int ch;
114
+ int w = get_utf8(z->p, z->c, z->l, & ch);
115
+ unless (w) return 0;
116
+ unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return 0;
117
+ z->c += w; return 1;
118
+ }
119
+
120
+ extern int out_grouping_b_U(struct SN_env * z, unsigned char * s, int min, int max) {
121
+ int ch;
122
+ int w = get_b_utf8(z->p, z->c, z->lb, & ch);
123
+ unless (w) return 0;
124
+ unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return 0;
125
+ z->c -= w; return 1;
126
+ }
127
+
128
+ /* Code for character groupings: non-utf8 cases */
129
+
130
+ extern int in_grouping(struct SN_env * z, unsigned char * s, int min, int max) {
131
+ int ch;
132
+ if (z->c >= z->l) return 0;
133
+ ch = z->p[z->c];
134
+ if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return 0;
135
+ z->c++; return 1;
136
+ }
137
+
138
+ extern int in_grouping_b(struct SN_env * z, unsigned char * s, int min, int max) {
139
+ int ch;
140
+ if (z->c <= z->lb) return 0;
141
+ ch = z->p[z->c - 1];
142
+ if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return 0;
143
+ z->c--; return 1;
144
+ }
145
+
146
+ extern int out_grouping(struct SN_env * z, unsigned char * s, int min, int max) {
147
+ int ch;
148
+ if (z->c >= z->l) return 0;
149
+ ch = z->p[z->c];
150
+ unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return 0;
151
+ z->c++; return 1;
152
+ }
153
+
154
+ extern int out_grouping_b(struct SN_env * z, unsigned char * s, int min, int max) {
155
+ int ch;
156
+ if (z->c <= z->lb) return 0;
157
+ ch = z->p[z->c - 1];
158
+ unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return 0;
159
+ z->c--; return 1;
160
+ }
161
+
162
+ extern int eq_s(struct SN_env * z, int s_size, symbol * s) {
163
+ if (z->l - z->c < s_size || memcmp(z->p + z->c, s, s_size * sizeof(symbol)) != 0) return 0;
164
+ z->c += s_size; return 1;
165
+ }
166
+
167
+ extern int eq_s_b(struct SN_env * z, int s_size, symbol * s) {
168
+ if (z->c - z->lb < s_size || memcmp(z->p + z->c - s_size, s, s_size * sizeof(symbol)) != 0) return 0;
169
+ z->c -= s_size; return 1;
170
+ }
171
+
172
+ extern int eq_v(struct SN_env * z, symbol * p) {
173
+ return eq_s(z, SIZE(p), p);
174
+ }
175
+
176
+ extern int eq_v_b(struct SN_env * z, symbol * p) {
177
+ return eq_s_b(z, SIZE(p), p);
178
+ }
179
+
180
+ extern int find_among(struct SN_env * z, struct among * v, int v_size) {
181
+
182
+ int i = 0;
183
+ int j = v_size;
184
+
185
+ int c = z->c; int l = z->l;
186
+ symbol * q = z->p + c;
187
+
188
+ struct among * w;
189
+
190
+ int common_i = 0;
191
+ int common_j = 0;
192
+
193
+ int first_key_inspected = 0;
194
+
195
+ while(1) {
196
+ int k = i + ((j - i) >> 1);
197
+ int diff = 0;
198
+ int common = common_i < common_j ? common_i : common_j; /* smaller */
199
+ w = v + k;
200
+ {
201
+ int i; for (i = common; i < w->s_size; i++) {
202
+ if (c + common == l) { diff = -1; break; }
203
+ diff = q[common] - w->s[i];
204
+ if (diff != 0) break;
205
+ common++;
206
+ }
207
+ }
208
+ if (diff < 0) { j = k; common_j = common; }
209
+ else { i = k; common_i = common; }
210
+ if (j - i <= 1) {
211
+ if (i > 0) break; /* v->s has been inspected */
212
+ if (j == i) break; /* only one item in v */
213
+
214
+ /* - but now we need to go round once more to get
215
+ v->s inspected. This looks messy, but is actually
216
+ the optimal approach. */
217
+
218
+ if (first_key_inspected) break;
219
+ first_key_inspected = 1;
220
+ }
221
+ }
222
+ while(1) {
223
+ w = v + i;
224
+ if (common_i >= w->s_size) {
225
+ z->c = c + w->s_size;
226
+ if (w->function == 0) return w->result;
227
+ {
228
+ int res = w->function(z);
229
+ z->c = c + w->s_size;
230
+ if (res) return w->result;
231
+ }
232
+ }
233
+ i = w->substring_i;
234
+ if (i < 0) return 0;
235
+ }
236
+ }
237
+
238
+ /* find_among_b is for backwards processing. Same comments apply */
239
+
240
+ extern int find_among_b(struct SN_env * z, struct among * v, int v_size) {
241
+
242
+ int i = 0;
243
+ int j = v_size;
244
+
245
+ int c = z->c; int lb = z->lb;
246
+ symbol * q = z->p + c - 1;
247
+
248
+ struct among * w;
249
+
250
+ int common_i = 0;
251
+ int common_j = 0;
252
+
253
+ int first_key_inspected = 0;
254
+
255
+ while(1) {
256
+ int k = i + ((j - i) >> 1);
257
+ int diff = 0;
258
+ int common = common_i < common_j ? common_i : common_j;
259
+ w = v + k;
260
+ {
261
+ int i; for (i = w->s_size - 1 - common; i >= 0; i--) {
262
+ if (c - common == lb) { diff = -1; break; }
263
+ diff = q[- common] - w->s[i];
264
+ if (diff != 0) break;
265
+ common++;
266
+ }
267
+ }
268
+ if (diff < 0) { j = k; common_j = common; }
269
+ else { i = k; common_i = common; }
270
+ if (j - i <= 1) {
271
+ if (i > 0) break;
272
+ if (j == i) break;
273
+ if (first_key_inspected) break;
274
+ first_key_inspected = 1;
275
+ }
276
+ }
277
+ while(1) {
278
+ w = v + i;
279
+ if (common_i >= w->s_size) {
280
+ z->c = c - w->s_size;
281
+ if (w->function == 0) return w->result;
282
+ {
283
+ int res = w->function(z);
284
+ z->c = c - w->s_size;
285
+ if (res) return w->result;
286
+ }
287
+ }
288
+ i = w->substring_i;
289
+ if (i < 0) return 0;
290
+ }
291
+ }
292
+
293
+
294
+ /* Increase the size of the buffer pointed to by p to at least n symbols.
295
+ * If insufficient memory, returns NULL and frees the old buffer.
296
+ */
297
+ static symbol * increase_size(symbol * p, int n) {
298
+ symbol * q;
299
+ int new_size = n + 20;
300
+ void * mem = realloc((char *) p - HEAD,
301
+ HEAD + (new_size + 1) * sizeof(symbol));
302
+ if (mem == NULL) {
303
+ lose_s(p);
304
+ return NULL;
305
+ }
306
+ q = (symbol *) (HEAD + (char *)mem);
307
+ CAPACITY(q) = new_size;
308
+ return q;
309
+ }
310
+
311
+ /* to replace symbols between c_bra and c_ket in z->p by the
312
+ s_size symbols at s.
313
+ Returns 0 on success, -1 on error.
314
+ Also, frees z->p (and sets it to NULL) on error.
315
+ */
316
+ extern int replace_s(struct SN_env * z, int c_bra, int c_ket, int s_size, const symbol * s, int * adjptr)
317
+ {
318
+ int adjustment;
319
+ int len;
320
+ if (z->p == NULL) {
321
+ z->p = create_s();
322
+ if (z->p == NULL) return -1;
323
+ }
324
+ adjustment = s_size - (c_ket - c_bra);
325
+ len = SIZE(z->p);
326
+ if (adjustment != 0) {
327
+ if (adjustment + len > CAPACITY(z->p)) {
328
+ z->p = increase_size(z->p, adjustment + len);
329
+ if (z->p == NULL) return -1;
330
+ }
331
+ memmove(z->p + c_ket + adjustment,
332
+ z->p + c_ket,
333
+ (len - c_ket) * sizeof(symbol));
334
+ SET_SIZE(z->p, adjustment + len);
335
+ z->l += adjustment;
336
+ if (z->c >= c_ket)
337
+ z->c += adjustment;
338
+ else
339
+ if (z->c > c_bra)
340
+ z->c = c_bra;
341
+ }
342
+ unless (s_size == 0) memmove(z->p + c_bra, s, s_size * sizeof(symbol));
343
+ if (adjptr != NULL)
344
+ *adjptr = adjustment;
345
+ return 0;
346
+ }
347
+
348
+ static int slice_check(struct SN_env * z) {
349
+
350
+ if (z->bra < 0 ||
351
+ z->bra > z->ket ||
352
+ z->ket > z->l ||
353
+ z->p == NULL ||
354
+ z->l > SIZE(z->p)) /* this line could be removed */
355
+ {
356
+ #if 0
357
+ fprintf(stderr, "faulty slice operation:\n");
358
+ debug(z, -1, 0);
359
+ #endif
360
+ return -1;
361
+ }
362
+ return 0;
363
+ }
364
+
365
+ extern int slice_from_s(struct SN_env * z, int s_size, symbol * s) {
366
+ if (slice_check(z)) return -1;
367
+ return replace_s(z, z->bra, z->ket, s_size, s, NULL);
368
+ }
369
+
370
+ extern int slice_from_v(struct SN_env * z, symbol * p) {
371
+ return slice_from_s(z, SIZE(p), p);
372
+ }
373
+
374
+ extern int slice_del(struct SN_env * z) {
375
+ return slice_from_s(z, 0, 0);
376
+ }
377
+
378
+ extern int insert_s(struct SN_env * z, int bra, int ket, int s_size, symbol * s) {
379
+ int adjustment;
380
+ if (replace_s(z, bra, ket, s_size, s, &adjustment))
381
+ return -1;
382
+ if (bra <= z->bra) z->bra += adjustment;
383
+ if (bra <= z->ket) z->ket += adjustment;
384
+ return 0;
385
+ }
386
+
387
+ extern int insert_v(struct SN_env * z, int bra, int ket, symbol * p) {
388
+ int adjustment;
389
+ if (replace_s(z, bra, ket, SIZE(p), p, &adjustment))
390
+ return -1;
391
+ if (bra <= z->bra) z->bra += adjustment;
392
+ if (bra <= z->ket) z->ket += adjustment;
393
+ return 0;
394
+ }
395
+
396
+ extern symbol * slice_to(struct SN_env * z, symbol * p) {
397
+ if (slice_check(z)) {
398
+ lose_s(p);
399
+ return NULL;
400
+ }
401
+ {
402
+ int len = z->ket - z->bra;
403
+ if (CAPACITY(p) < len) {
404
+ p = increase_size(p, len);
405
+ if (p == NULL)
406
+ return NULL;
407
+ }
408
+ memmove(p, z->p + z->bra, len * sizeof(symbol));
409
+ SET_SIZE(p, len);
410
+ }
411
+ return p;
412
+ }
413
+
414
+ extern symbol * assign_to(struct SN_env * z, symbol * p) {
415
+ int len = z->l;
416
+ if (CAPACITY(p) < len) {
417
+ p = increase_size(p, len);
418
+ if (p == NULL)
419
+ return NULL;
420
+ }
421
+ memmove(p, z->p, len * sizeof(symbol));
422
+ SET_SIZE(p, len);
423
+ return p;
424
+ }
425
+
426
#if 0
/*
 * Disabled debugging aid: prints the buffer of z with markers for the
 * backward limit '{', slice start '[', cursor '|', slice end ']' and limit
 * '}'. Zero symbols are shown as '#'.
 */
extern void debug(struct SN_env * z, int number, int line_count) {
    const int limit = SIZE(z->p);
    int pos;

    /*if (number >= 0) printf("%3d (line %4d): '", number, line_count);*/
    if (number >= 0) printf("%3d (line %4d): [%d]'", number, line_count,limit);

    for (pos = 0; pos <= limit; pos++) {
        if (z->lb == pos) printf("{");
        if (z->bra == pos) printf("[");
        if (z->c == pos) printf("|");
        if (z->ket == pos) printf("]");
        if (z->l == pos) printf("}");
        if (pos < limit) {
            int ch = z->p[pos];
            if (ch == 0) ch = '#';
            printf("%c", ch);
        }
    }
    printf("'\n");
}
#endif