ferret 0.9.0 → 0.9.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (187) hide show
  1. data/Rakefile +23 -5
  2. data/TODO +2 -1
  3. data/ext/analysis.c +838 -177
  4. data/ext/analysis.h +55 -7
  5. data/ext/api.c +69 -0
  6. data/ext/api.h +27 -0
  7. data/ext/array.c +8 -5
  8. data/ext/compound_io.c +132 -96
  9. data/ext/document.c +58 -28
  10. data/ext/except.c +59 -0
  11. data/ext/except.h +88 -0
  12. data/ext/ferret.c +47 -3
  13. data/ext/ferret.h +3 -0
  14. data/ext/field.c +15 -9
  15. data/ext/filter.c +1 -1
  16. data/ext/fs_store.c +215 -34
  17. data/ext/global.c +72 -3
  18. data/ext/global.h +4 -3
  19. data/ext/hash.c +44 -3
  20. data/ext/hash.h +9 -0
  21. data/ext/header.h +58 -0
  22. data/ext/inc/except.h +88 -0
  23. data/ext/inc/lang.h +23 -13
  24. data/ext/ind.c +16 -10
  25. data/ext/index.h +2 -22
  26. data/ext/index_io.c +3 -11
  27. data/ext/index_rw.c +245 -193
  28. data/ext/lang.h +23 -13
  29. data/ext/libstemmer.c +92 -0
  30. data/ext/libstemmer.h +79 -0
  31. data/ext/modules.h +162 -0
  32. data/ext/q_boolean.c +34 -21
  33. data/ext/q_const_score.c +6 -12
  34. data/ext/q_filtered_query.c +206 -0
  35. data/ext/q_fuzzy.c +18 -15
  36. data/ext/q_match_all.c +3 -7
  37. data/ext/q_multi_phrase.c +10 -14
  38. data/ext/q_parser.c +29 -2
  39. data/ext/q_phrase.c +14 -21
  40. data/ext/q_prefix.c +15 -12
  41. data/ext/q_range.c +30 -28
  42. data/ext/q_span.c +13 -21
  43. data/ext/q_term.c +17 -26
  44. data/ext/r_analysis.c +693 -21
  45. data/ext/r_doc.c +11 -12
  46. data/ext/r_index_io.c +4 -1
  47. data/ext/r_qparser.c +21 -2
  48. data/ext/r_search.c +285 -18
  49. data/ext/ram_store.c +5 -2
  50. data/ext/search.c +11 -17
  51. data/ext/search.h +21 -45
  52. data/ext/similarity.h +67 -0
  53. data/ext/sort.c +30 -25
  54. data/ext/stem_ISO_8859_1_danish.c +338 -0
  55. data/ext/stem_ISO_8859_1_danish.h +16 -0
  56. data/ext/stem_ISO_8859_1_dutch.c +635 -0
  57. data/ext/stem_ISO_8859_1_dutch.h +16 -0
  58. data/ext/stem_ISO_8859_1_english.c +1156 -0
  59. data/ext/stem_ISO_8859_1_english.h +16 -0
  60. data/ext/stem_ISO_8859_1_finnish.c +792 -0
  61. data/ext/stem_ISO_8859_1_finnish.h +16 -0
  62. data/ext/stem_ISO_8859_1_french.c +1276 -0
  63. data/ext/stem_ISO_8859_1_french.h +16 -0
  64. data/ext/stem_ISO_8859_1_german.c +512 -0
  65. data/ext/stem_ISO_8859_1_german.h +16 -0
  66. data/ext/stem_ISO_8859_1_italian.c +1091 -0
  67. data/ext/stem_ISO_8859_1_italian.h +16 -0
  68. data/ext/stem_ISO_8859_1_norwegian.c +296 -0
  69. data/ext/stem_ISO_8859_1_norwegian.h +16 -0
  70. data/ext/stem_ISO_8859_1_porter.c +776 -0
  71. data/ext/stem_ISO_8859_1_porter.h +16 -0
  72. data/ext/stem_ISO_8859_1_portuguese.c +1035 -0
  73. data/ext/stem_ISO_8859_1_portuguese.h +16 -0
  74. data/ext/stem_ISO_8859_1_spanish.c +1119 -0
  75. data/ext/stem_ISO_8859_1_spanish.h +16 -0
  76. data/ext/stem_ISO_8859_1_swedish.c +307 -0
  77. data/ext/stem_ISO_8859_1_swedish.h +16 -0
  78. data/ext/stem_KOI8_R_russian.c +701 -0
  79. data/ext/stem_KOI8_R_russian.h +16 -0
  80. data/ext/stem_UTF_8_danish.c +344 -0
  81. data/ext/stem_UTF_8_danish.h +16 -0
  82. data/ext/stem_UTF_8_dutch.c +653 -0
  83. data/ext/stem_UTF_8_dutch.h +16 -0
  84. data/ext/stem_UTF_8_english.c +1176 -0
  85. data/ext/stem_UTF_8_english.h +16 -0
  86. data/ext/stem_UTF_8_finnish.c +808 -0
  87. data/ext/stem_UTF_8_finnish.h +16 -0
  88. data/ext/stem_UTF_8_french.c +1296 -0
  89. data/ext/stem_UTF_8_french.h +16 -0
  90. data/ext/stem_UTF_8_german.c +526 -0
  91. data/ext/stem_UTF_8_german.h +16 -0
  92. data/ext/stem_UTF_8_italian.c +1113 -0
  93. data/ext/stem_UTF_8_italian.h +16 -0
  94. data/ext/stem_UTF_8_norwegian.c +302 -0
  95. data/ext/stem_UTF_8_norwegian.h +16 -0
  96. data/ext/stem_UTF_8_porter.c +794 -0
  97. data/ext/stem_UTF_8_porter.h +16 -0
  98. data/ext/stem_UTF_8_portuguese.c +1055 -0
  99. data/ext/stem_UTF_8_portuguese.h +16 -0
  100. data/ext/stem_UTF_8_russian.c +709 -0
  101. data/ext/stem_UTF_8_russian.h +16 -0
  102. data/ext/stem_UTF_8_spanish.c +1137 -0
  103. data/ext/stem_UTF_8_spanish.h +16 -0
  104. data/ext/stem_UTF_8_swedish.c +313 -0
  105. data/ext/stem_UTF_8_swedish.h +16 -0
  106. data/ext/stopwords.c +325 -0
  107. data/ext/store.c +34 -2
  108. data/ext/tags +2953 -0
  109. data/ext/term.c +21 -15
  110. data/ext/termdocs.c +5 -3
  111. data/ext/utilities.c +446 -0
  112. data/ext/vector.c +27 -13
  113. data/lib/ferret/document/document.rb +1 -1
  114. data/lib/ferret/index/index.rb +44 -6
  115. data/lib/ferret/query_parser/query_parser.tab.rb +7 -3
  116. data/lib/rferret.rb +2 -1
  117. data/test/test_helper.rb +2 -2
  118. data/test/unit/analysis/ctc_analyzer.rb +401 -0
  119. data/test/unit/analysis/ctc_tokenstream.rb +423 -0
  120. data/test/unit/analysis/{tc_letter_tokenizer.rb → rtc_letter_tokenizer.rb} +0 -0
  121. data/test/unit/analysis/{tc_lower_case_filter.rb → rtc_lower_case_filter.rb} +0 -0
  122. data/test/unit/analysis/{tc_lower_case_tokenizer.rb → rtc_lower_case_tokenizer.rb} +0 -0
  123. data/test/unit/analysis/{tc_per_field_analyzer_wrapper.rb → rtc_per_field_analyzer_wrapper.rb} +0 -0
  124. data/test/unit/analysis/{tc_porter_stem_filter.rb → rtc_porter_stem_filter.rb} +0 -0
  125. data/test/unit/analysis/{tc_standard_analyzer.rb → rtc_standard_analyzer.rb} +0 -0
  126. data/test/unit/analysis/{tc_standard_tokenizer.rb → rtc_standard_tokenizer.rb} +0 -0
  127. data/test/unit/analysis/{tc_stop_analyzer.rb → rtc_stop_analyzer.rb} +0 -0
  128. data/test/unit/analysis/{tc_stop_filter.rb → rtc_stop_filter.rb} +0 -0
  129. data/test/unit/analysis/{tc_white_space_analyzer.rb → rtc_white_space_analyzer.rb} +0 -0
  130. data/test/unit/analysis/{tc_white_space_tokenizer.rb → rtc_white_space_tokenizer.rb} +0 -0
  131. data/test/unit/analysis/{tc_word_list_loader.rb → rtc_word_list_loader.rb} +0 -0
  132. data/test/unit/analysis/tc_analyzer.rb +1 -2
  133. data/test/unit/analysis/{c_token.rb → tc_token.rb} +0 -0
  134. data/test/unit/document/rtc_field.rb +28 -0
  135. data/test/unit/document/{c_document.rb → tc_document.rb} +0 -0
  136. data/test/unit/document/tc_field.rb +82 -12
  137. data/test/unit/index/{tc_compound_file_io.rb → rtc_compound_file_io.rb} +0 -0
  138. data/test/unit/index/{tc_field_infos.rb → rtc_field_infos.rb} +0 -0
  139. data/test/unit/index/{tc_fields_io.rb → rtc_fields_io.rb} +0 -0
  140. data/test/unit/index/{tc_multiple_term_doc_pos_enum.rb → rtc_multiple_term_doc_pos_enum.rb} +0 -0
  141. data/test/unit/index/{tc_segment_infos.rb → rtc_segment_infos.rb} +0 -0
  142. data/test/unit/index/{tc_segment_term_docs.rb → rtc_segment_term_docs.rb} +0 -0
  143. data/test/unit/index/{tc_segment_term_enum.rb → rtc_segment_term_enum.rb} +0 -0
  144. data/test/unit/index/{tc_segment_term_vector.rb → rtc_segment_term_vector.rb} +0 -0
  145. data/test/unit/index/{tc_term_buffer.rb → rtc_term_buffer.rb} +0 -0
  146. data/test/unit/index/{tc_term_info.rb → rtc_term_info.rb} +0 -0
  147. data/test/unit/index/{tc_term_infos_io.rb → rtc_term_infos_io.rb} +0 -0
  148. data/test/unit/index/{tc_term_vectors_io.rb → rtc_term_vectors_io.rb} +0 -0
  149. data/test/unit/index/{c_index.rb → tc_index.rb} +26 -6
  150. data/test/unit/index/{c_index_reader.rb → tc_index_reader.rb} +0 -0
  151. data/test/unit/index/{c_index_writer.rb → tc_index_writer.rb} +0 -0
  152. data/test/unit/index/{c_term.rb → tc_term.rb} +0 -0
  153. data/test/unit/index/{c_term_voi.rb → tc_term_voi.rb} +0 -0
  154. data/test/unit/query_parser/{c_query_parser.rb → rtc_query_parser.rb} +14 -14
  155. data/test/unit/query_parser/tc_query_parser.rb +24 -16
  156. data/test/unit/search/{tc_similarity.rb → rtc_similarity.rb} +0 -0
  157. data/test/unit/search/rtc_sort_field.rb +14 -0
  158. data/test/unit/search/{c_filter.rb → tc_filter.rb} +11 -11
  159. data/test/unit/search/{c_fuzzy_query.rb → tc_fuzzy_query.rb} +0 -0
  160. data/test/unit/search/{c_index_searcher.rb → tc_index_searcher.rb} +0 -0
  161. data/test/unit/search/{c_search_and_sort.rb → tc_search_and_sort.rb} +0 -0
  162. data/test/unit/search/{c_sort.rb → tc_sort.rb} +0 -0
  163. data/test/unit/search/tc_sort_field.rb +20 -7
  164. data/test/unit/search/{c_spans.rb → tc_spans.rb} +0 -0
  165. data/test/unit/store/rtc_fs_store.rb +62 -0
  166. data/test/unit/store/rtc_ram_store.rb +15 -0
  167. data/test/unit/store/rtm_store.rb +150 -0
  168. data/test/unit/store/rtm_store_lock.rb +2 -0
  169. data/test/unit/store/tc_fs_store.rb +54 -40
  170. data/test/unit/store/tc_ram_store.rb +20 -0
  171. data/test/unit/store/tm_store.rb +30 -146
  172. data/test/unit/store/tm_store_lock.rb +66 -0
  173. data/test/unit/utils/{tc_bit_vector.rb → rtc_bit_vector.rb} +0 -0
  174. data/test/unit/utils/{tc_date_tools.rb → rtc_date_tools.rb} +0 -0
  175. data/test/unit/utils/{tc_number_tools.rb → rtc_number_tools.rb} +0 -0
  176. data/test/unit/utils/{tc_parameter.rb → rtc_parameter.rb} +0 -0
  177. data/test/unit/utils/{tc_priority_queue.rb → rtc_priority_queue.rb} +0 -0
  178. data/test/unit/utils/{tc_string_helper.rb → rtc_string_helper.rb} +0 -0
  179. data/test/unit/utils/{tc_thread.rb → rtc_thread.rb} +0 -0
  180. data/test/unit/utils/{tc_weak_key_hash.rb → rtc_weak_key_hash.rb} +0 -0
  181. metadata +360 -289
  182. data/test/unit/document/c_field.rb +0 -98
  183. data/test/unit/search/c_sort_field.rb +0 -27
  184. data/test/unit/store/c_fs_store.rb +0 -76
  185. data/test/unit/store/c_ram_store.rb +0 -35
  186. data/test/unit/store/m_store.rb +0 -34
  187. data/test/unit/store/m_store_lock.rb +0 -68
data/ext/term.c CHANGED
@@ -3,6 +3,11 @@
3
3
  #include <helper.h>
4
4
  #include <hash.h>
5
5
 
6
+ static char * const FORMAT_VERSION_ERROR_MSG = "Unknown format version";
7
+ static char * const TERM_ORDER_ERROR_MSG = "term out of order";
8
+ static char * const FP_ORDER_ERROR_MSG = "freq pointer out of order";
9
+ static char * const PP_ORDER_ERROR_MSG = "prox pointer out of order";
10
+
6
11
  /****************************************************************************
7
12
  *
8
13
  * Term
@@ -339,7 +344,7 @@ TermEnum *ste_create(InStream *is, FieldInfos *fis, int is_index)
339
344
  } else {
340
345
  // check that it is a format we can understand
341
346
  if (first_int < TERM_INFO_FORMAT)
342
- eprintf(ERROR, "Unknown format version:%d", first_int);
347
+ RAISE(ERROR, FORMAT_VERSION_ERROR_MSG);
343
348
 
344
349
  // we have a format version number
345
350
  ste->format = first_int;
@@ -443,7 +448,7 @@ void mte_close(TermEnum *te)
443
448
 
444
449
  TermEnum *mte_clone(TermEnum *te)
445
450
  {
446
- eprintf(ERROR, "MultiTermEnum does not support cloning");
451
+ RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
447
452
  return NULL;
448
453
  }
449
454
 
@@ -555,14 +560,13 @@ void tiw_write_term(TermInfosWriter *tiw, OutStream *os, Term *t)
555
560
  void tiw_add(TermInfosWriter *tiw, Term *t, TermInfo *ti)
556
561
  {
557
562
  if (tiw->is_index && term_cmp(tiw->last_term, t) > 0) {
558
- eprintf(STATE_ERROR,
559
- "term out of order %s < %s", t->text, tiw->last_term->text);
563
+ RAISE(STATE_ERROR, TERM_ORDER_ERROR_MSG);
560
564
  }
561
565
  if (ti->freq_pointer < tiw->last_term_info->freq_pointer) {
562
- eprintf(STATE_ERROR, "freq pointer out of order");
566
+ RAISE(STATE_ERROR, FP_ORDER_ERROR_MSG);
563
567
  }
564
568
  if (ti->prox_pointer < tiw->last_term_info->prox_pointer) {
565
- eprintf(STATE_ERROR, "prox pointer out of order");
569
+ RAISE(STATE_ERROR, PP_ORDER_ERROR_MSG);
566
570
  }
567
571
 
568
572
  if (!tiw->is_index && (tiw->size % tiw->index_interval) == 0)
@@ -666,15 +670,17 @@ void tir_ensure_index_is_read(TermInfosReader *tir)
666
670
  TermEnum *index_te = tir->index_te;
667
671
  SegmentTermEnum *ste = index_te->data;
668
672
 
669
- while (ste_next(index_te) != NULL) {
670
- tir->index_terms[i] = te_get_term(index_te);
671
- tir->index_term_infos[i] = te_get_ti(index_te);
672
- tir->index_pointers[i] = ste->index_pointer;
673
- i++;
674
- }
675
-
676
- index_te->close(index_te);
677
- tir->index_te = NULL;
673
+ TRY
674
+ while (ste_next(index_te) != NULL) {
675
+ tir->index_terms[i] = te_get_term(index_te);
676
+ tir->index_term_infos[i] = te_get_ti(index_te);
677
+ tir->index_pointers[i] = ste->index_pointer;
678
+ i++;
679
+ }
680
+ XFINALLY
681
+ index_te->close(index_te);
682
+ tir->index_te = NULL;
683
+ XENDTRY
678
684
  }
679
685
  mutex_unlock(&tir->mutex);
680
686
  }
data/ext/termdocs.c CHANGED
@@ -1,6 +1,8 @@
1
1
  #include <index.h>
2
2
  #include <string.h>
3
3
 
4
+ static char * const TPE_VS_TDE_ERROR_MSG = "TermPosEnum does not handle processing multiple documents in one call. Use TermDocEnum instead.";
5
+
4
6
  /****************************************************************************
5
7
  *
6
8
  * SegmentTermDocEnum
@@ -250,7 +252,7 @@ bool stpe_next(TermDocEnum *tde)
250
252
 
251
253
  int stpe_read(TermDocEnum *tde, int *docs, int *freqs, int req_num)
252
254
  {
253
- eprintf(ARG_ERROR, "TermPosEnum does not handle processing multiple documents in one call. Use TermDocEnum instead.");
255
+ RAISE(ARG_ERROR, TPE_VS_TDE_ERROR_MSG);
254
256
  return -1;
255
257
  }
256
258
 
@@ -482,7 +484,7 @@ void mtdpe_close(TermDocEnum *self)
482
484
  }
483
485
 
484
486
  void mtdpe_seek(TermDocEnum *tde, Term *term)
485
- { eprintf(UNSUPPORTED_ERROR, "Unsupported op seek on MultipleTDPE");}
487
+ { RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);}
486
488
 
487
489
  bool mtdpe_next(TermDocEnum *self)
488
490
  {
@@ -556,7 +558,7 @@ bool mtdpe_skip_to(TermDocEnum *self, int target_doc_num)
556
558
 
557
559
  int mtdpe_read(TermDocEnum *tde, int *docs, int *freqs, int req_num)
558
560
  {
559
- eprintf(UNSUPPORTED_ERROR, "Unsupported op read on MultipleTDPE");
561
+ RAISE(UNSUPPORTED_ERROR, UNSUPPORTED_ERROR_MSG);
560
562
  return -1;
561
563
  }
562
564
 
data/ext/utilities.c ADDED
@@ -0,0 +1,446 @@
1
+
2
+ #include <stdio.h>
3
+ #include <stdlib.h>
4
+ #include <string.h>
5
+
6
+ #include "header.h"
7
+
8
+ #define unless(C) if(!(C))
9
+
10
+ #define CREATE_SIZE 1
11
+
12
+ extern symbol * create_s(void) {
13
+ symbol * p;
14
+ void * mem = malloc(HEAD + (CREATE_SIZE + 1) * sizeof(symbol));
15
+ if (mem == NULL) return NULL;
16
+ p = (symbol *) (HEAD + (char *) mem);
17
+ CAPACITY(p) = CREATE_SIZE;
18
+ SET_SIZE(p, CREATE_SIZE);
19
+ return p;
20
+ }
21
+
22
+ extern void lose_s(symbol * p) {
23
+ if (p == NULL) return;
24
+ free((char *) p - HEAD);
25
+ }
26
+
27
+ /*
28
+ new_p = X_skip_utf8(p, c, lb, l, n); skips n characters forwards from p + c
29
+ if n +ve, or n characters backwards from p +c - 1 if n -ve. new_p is the new
30
+ position, or 0 on failure.
31
+
32
+ -- used to implement hop and next in the utf8 case.
33
+ */
34
+
35
+ extern int skip_utf8(const symbol * p, int c, int lb, int l, int n) {
36
+ int b;
37
+ if (n >= 0) {
38
+ for (; n > 0; n--) {
39
+ if (c >= l) return -1;
40
+ b = p[c++];
41
+ if (b >= 0xC0) { /* 1100 0000 */
42
+ while (c < l) {
43
+ b = p[c];
44
+ if (b >= 0xC0 || b < 0x80) break;
45
+ /* break unless b is 10------ */
46
+ c++;
47
+ }
48
+ }
49
+ }
50
+ } else {
51
+ for (; n < 0; n++) {
52
+ if (c <= lb) return -1;
53
+ b = p[--c];
54
+ if (b >= 0x80) { /* 1000 0000 */
55
+ while (c > lb) {
56
+ b = p[c];
57
+ if (b >= 0xC0) break; /* 1100 0000 */
58
+ c--;
59
+ }
60
+ }
61
+ }
62
+ }
63
+ return c;
64
+ }
65
+
66
+ /* Code for character groupings: utf8 cases */
67
+
68
+ static int get_utf8(const symbol * p, int c, int l, int * slot) {
69
+ int b0, b1;
70
+ if (c >= l) return 0;
71
+ b0 = p[c++];
72
+ if (b0 < 0xC0 || c == l) { /* 1100 0000 */
73
+ * slot = b0; return 1;
74
+ }
75
+ b1 = p[c++];
76
+ if (b0 < 0xE0 || c == l) { /* 1110 0000 */
77
+ * slot = (b0 & 0x1F) << 6 | (b1 & 0x3F); return 2;
78
+ }
79
+ * slot = (b0 & 0xF) << 12 | (b1 & 0x3F) << 6 | (*p & 0x3F); return 3;
80
+ }
81
+
82
+ static int get_b_utf8(const symbol * p, int c, int lb, int * slot) {
83
+ int b0, b1;
84
+ if (c <= lb) return 0;
85
+ b0 = p[--c];
86
+ if (b0 < 0x80 || c == lb) { /* 1000 0000 */
87
+ * slot = b0; return 1;
88
+ }
89
+ b1 = p[--c];
90
+ if (b1 >= 0xC0 || c == lb) { /* 1100 0000 */
91
+ * slot = (b1 & 0x1F) << 6 | (b0 & 0x3F); return 2;
92
+ }
93
+ * slot = (*p & 0xF) << 12 | (b1 & 0x3F) << 6 | (b0 & 0x3F); return 3;
94
+ }
95
+
96
+ extern int in_grouping_U(struct SN_env * z, unsigned char * s, int min, int max) {
97
+ int ch;
98
+ int w = get_utf8(z->p, z->c, z->l, & ch);
99
+ unless (w) return 0;
100
+ if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return 0;
101
+ z->c += w; return 1;
102
+ }
103
+
104
+ extern int in_grouping_b_U(struct SN_env * z, unsigned char * s, int min, int max) {
105
+ int ch;
106
+ int w = get_b_utf8(z->p, z->c, z->lb, & ch);
107
+ unless (w) return 0;
108
+ if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return 0;
109
+ z->c -= w; return 1;
110
+ }
111
+
112
+ extern int out_grouping_U(struct SN_env * z, unsigned char * s, int min, int max) {
113
+ int ch;
114
+ int w = get_utf8(z->p, z->c, z->l, & ch);
115
+ unless (w) return 0;
116
+ unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return 0;
117
+ z->c += w; return 1;
118
+ }
119
+
120
+ extern int out_grouping_b_U(struct SN_env * z, unsigned char * s, int min, int max) {
121
+ int ch;
122
+ int w = get_b_utf8(z->p, z->c, z->lb, & ch);
123
+ unless (w) return 0;
124
+ unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return 0;
125
+ z->c -= w; return 1;
126
+ }
127
+
128
+ /* Code for character groupings: non-utf8 cases */
129
+
130
+ extern int in_grouping(struct SN_env * z, unsigned char * s, int min, int max) {
131
+ int ch;
132
+ if (z->c >= z->l) return 0;
133
+ ch = z->p[z->c];
134
+ if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return 0;
135
+ z->c++; return 1;
136
+ }
137
+
138
+ extern int in_grouping_b(struct SN_env * z, unsigned char * s, int min, int max) {
139
+ int ch;
140
+ if (z->c <= z->lb) return 0;
141
+ ch = z->p[z->c - 1];
142
+ if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return 0;
143
+ z->c--; return 1;
144
+ }
145
+
146
+ extern int out_grouping(struct SN_env * z, unsigned char * s, int min, int max) {
147
+ int ch;
148
+ if (z->c >= z->l) return 0;
149
+ ch = z->p[z->c];
150
+ unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return 0;
151
+ z->c++; return 1;
152
+ }
153
+
154
+ extern int out_grouping_b(struct SN_env * z, unsigned char * s, int min, int max) {
155
+ int ch;
156
+ if (z->c <= z->lb) return 0;
157
+ ch = z->p[z->c - 1];
158
+ unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return 0;
159
+ z->c--; return 1;
160
+ }
161
+
162
+ extern int eq_s(struct SN_env * z, int s_size, symbol * s) {
163
+ if (z->l - z->c < s_size || memcmp(z->p + z->c, s, s_size * sizeof(symbol)) != 0) return 0;
164
+ z->c += s_size; return 1;
165
+ }
166
+
167
+ extern int eq_s_b(struct SN_env * z, int s_size, symbol * s) {
168
+ if (z->c - z->lb < s_size || memcmp(z->p + z->c - s_size, s, s_size * sizeof(symbol)) != 0) return 0;
169
+ z->c -= s_size; return 1;
170
+ }
171
+
172
+ extern int eq_v(struct SN_env * z, symbol * p) {
173
+ return eq_s(z, SIZE(p), p);
174
+ }
175
+
176
+ extern int eq_v_b(struct SN_env * z, symbol * p) {
177
+ return eq_s_b(z, SIZE(p), p);
178
+ }
179
+
180
+ extern int find_among(struct SN_env * z, struct among * v, int v_size) {
181
+
182
+ int i = 0;
183
+ int j = v_size;
184
+
185
+ int c = z->c; int l = z->l;
186
+ symbol * q = z->p + c;
187
+
188
+ struct among * w;
189
+
190
+ int common_i = 0;
191
+ int common_j = 0;
192
+
193
+ int first_key_inspected = 0;
194
+
195
+ while(1) {
196
+ int k = i + ((j - i) >> 1);
197
+ int diff = 0;
198
+ int common = common_i < common_j ? common_i : common_j; /* smaller */
199
+ w = v + k;
200
+ {
201
+ int i; for (i = common; i < w->s_size; i++) {
202
+ if (c + common == l) { diff = -1; break; }
203
+ diff = q[common] - w->s[i];
204
+ if (diff != 0) break;
205
+ common++;
206
+ }
207
+ }
208
+ if (diff < 0) { j = k; common_j = common; }
209
+ else { i = k; common_i = common; }
210
+ if (j - i <= 1) {
211
+ if (i > 0) break; /* v->s has been inspected */
212
+ if (j == i) break; /* only one item in v */
213
+
214
+ /* - but now we need to go round once more to get
215
+ v->s inspected. This looks messy, but is actually
216
+ the optimal approach. */
217
+
218
+ if (first_key_inspected) break;
219
+ first_key_inspected = 1;
220
+ }
221
+ }
222
+ while(1) {
223
+ w = v + i;
224
+ if (common_i >= w->s_size) {
225
+ z->c = c + w->s_size;
226
+ if (w->function == 0) return w->result;
227
+ {
228
+ int res = w->function(z);
229
+ z->c = c + w->s_size;
230
+ if (res) return w->result;
231
+ }
232
+ }
233
+ i = w->substring_i;
234
+ if (i < 0) return 0;
235
+ }
236
+ }
237
+
238
+ /* find_among_b is for backwards processing. Same comments apply */
239
+
240
+ extern int find_among_b(struct SN_env * z, struct among * v, int v_size) {
241
+
242
+ int i = 0;
243
+ int j = v_size;
244
+
245
+ int c = z->c; int lb = z->lb;
246
+ symbol * q = z->p + c - 1;
247
+
248
+ struct among * w;
249
+
250
+ int common_i = 0;
251
+ int common_j = 0;
252
+
253
+ int first_key_inspected = 0;
254
+
255
+ while(1) {
256
+ int k = i + ((j - i) >> 1);
257
+ int diff = 0;
258
+ int common = common_i < common_j ? common_i : common_j;
259
+ w = v + k;
260
+ {
261
+ int i; for (i = w->s_size - 1 - common; i >= 0; i--) {
262
+ if (c - common == lb) { diff = -1; break; }
263
+ diff = q[- common] - w->s[i];
264
+ if (diff != 0) break;
265
+ common++;
266
+ }
267
+ }
268
+ if (diff < 0) { j = k; common_j = common; }
269
+ else { i = k; common_i = common; }
270
+ if (j - i <= 1) {
271
+ if (i > 0) break;
272
+ if (j == i) break;
273
+ if (first_key_inspected) break;
274
+ first_key_inspected = 1;
275
+ }
276
+ }
277
+ while(1) {
278
+ w = v + i;
279
+ if (common_i >= w->s_size) {
280
+ z->c = c - w->s_size;
281
+ if (w->function == 0) return w->result;
282
+ {
283
+ int res = w->function(z);
284
+ z->c = c - w->s_size;
285
+ if (res) return w->result;
286
+ }
287
+ }
288
+ i = w->substring_i;
289
+ if (i < 0) return 0;
290
+ }
291
+ }
292
+
293
+
294
+ /* Increase the size of the buffer pointed to by p to at least n symbols.
295
+ * If insufficient memory, returns NULL and frees the old buffer.
296
+ */
297
+ static symbol * increase_size(symbol * p, int n) {
298
+ symbol * q;
299
+ int new_size = n + 20;
300
+ void * mem = realloc((char *) p - HEAD,
301
+ HEAD + (new_size + 1) * sizeof(symbol));
302
+ if (mem == NULL) {
303
+ lose_s(p);
304
+ return NULL;
305
+ }
306
+ q = (symbol *) (HEAD + (char *)mem);
307
+ CAPACITY(q) = new_size;
308
+ return q;
309
+ }
310
+
311
+ /* to replace symbols between c_bra and c_ket in z->p by the
312
+ s_size symbols at s.
313
+ Returns 0 on success, -1 on error.
314
+ Also, frees z->p (and sets it to NULL) on error.
315
+ */
316
+ extern int replace_s(struct SN_env * z, int c_bra, int c_ket, int s_size, const symbol * s, int * adjptr)
317
+ {
318
+ int adjustment;
319
+ int len;
320
+ if (z->p == NULL) {
321
+ z->p = create_s();
322
+ if (z->p == NULL) return -1;
323
+ }
324
+ adjustment = s_size - (c_ket - c_bra);
325
+ len = SIZE(z->p);
326
+ if (adjustment != 0) {
327
+ if (adjustment + len > CAPACITY(z->p)) {
328
+ z->p = increase_size(z->p, adjustment + len);
329
+ if (z->p == NULL) return -1;
330
+ }
331
+ memmove(z->p + c_ket + adjustment,
332
+ z->p + c_ket,
333
+ (len - c_ket) * sizeof(symbol));
334
+ SET_SIZE(z->p, adjustment + len);
335
+ z->l += adjustment;
336
+ if (z->c >= c_ket)
337
+ z->c += adjustment;
338
+ else
339
+ if (z->c > c_bra)
340
+ z->c = c_bra;
341
+ }
342
+ unless (s_size == 0) memmove(z->p + c_bra, s, s_size * sizeof(symbol));
343
+ if (adjptr != NULL)
344
+ *adjptr = adjustment;
345
+ return 0;
346
+ }
347
+
348
+ static int slice_check(struct SN_env * z) {
349
+
350
+ if (z->bra < 0 ||
351
+ z->bra > z->ket ||
352
+ z->ket > z->l ||
353
+ z->p == NULL ||
354
+ z->l > SIZE(z->p)) /* this line could be removed */
355
+ {
356
+ #if 0
357
+ fprintf(stderr, "faulty slice operation:\n");
358
+ debug(z, -1, 0);
359
+ #endif
360
+ return -1;
361
+ }
362
+ return 0;
363
+ }
364
+
365
+ extern int slice_from_s(struct SN_env * z, int s_size, symbol * s) {
366
+ if (slice_check(z)) return -1;
367
+ return replace_s(z, z->bra, z->ket, s_size, s, NULL);
368
+ }
369
+
370
+ extern int slice_from_v(struct SN_env * z, symbol * p) {
371
+ return slice_from_s(z, SIZE(p), p);
372
+ }
373
+
374
/*
 * Delete the current slice, i.e. replace it by an empty string.
 * Returns whatever slice_from_s returns.
 */
extern int slice_del(struct SN_env * z) {
    return slice_from_s(z, 0, NULL);
}
377
+
378
+ extern int insert_s(struct SN_env * z, int bra, int ket, int s_size, symbol * s) {
379
+ int adjustment;
380
+ if (replace_s(z, bra, ket, s_size, s, &adjustment))
381
+ return -1;
382
+ if (bra <= z->bra) z->bra += adjustment;
383
+ if (bra <= z->ket) z->ket += adjustment;
384
+ return 0;
385
+ }
386
+
387
+ extern int insert_v(struct SN_env * z, int bra, int ket, symbol * p) {
388
+ int adjustment;
389
+ if (replace_s(z, bra, ket, SIZE(p), p, &adjustment))
390
+ return -1;
391
+ if (bra <= z->bra) z->bra += adjustment;
392
+ if (bra <= z->ket) z->ket += adjustment;
393
+ return 0;
394
+ }
395
+
396
+ extern symbol * slice_to(struct SN_env * z, symbol * p) {
397
+ if (slice_check(z)) {
398
+ lose_s(p);
399
+ return NULL;
400
+ }
401
+ {
402
+ int len = z->ket - z->bra;
403
+ if (CAPACITY(p) < len) {
404
+ p = increase_size(p, len);
405
+ if (p == NULL)
406
+ return NULL;
407
+ }
408
+ memmove(p, z->p + z->bra, len * sizeof(symbol));
409
+ SET_SIZE(p, len);
410
+ }
411
+ return p;
412
+ }
413
+
414
+ extern symbol * assign_to(struct SN_env * z, symbol * p) {
415
+ int len = z->l;
416
+ if (CAPACITY(p) < len) {
417
+ p = increase_size(p, len);
418
+ if (p == NULL)
419
+ return NULL;
420
+ }
421
+ memmove(p, z->p, len * sizeof(symbol));
422
+ SET_SIZE(p, len);
423
+ return p;
424
+ }
425
+
426
#if 0
/*
 * Debugging aid (compiled out). Prints the contents of z->p between single
 * quotes, with markers at the positions of the environment's indices:
 * '{' for lb, '[' for bra, '|' for c, ']' for ket and '}' for l. Zero
 * symbols are shown as '#'. When number is non-negative, a prefix with
 * number, line_count and the buffer size is printed first.
 */
extern void debug(struct SN_env * z, int number, int line_count) {
    int limit = SIZE(z->p);
    int pos;

    if (number >= 0) printf("%3d (line %4d): [%d]'", number, line_count,limit);

    /* pos runs one past the end so markers sitting at `limit` still print */
    for (pos = 0; pos <= limit; pos++) {
        if (z->lb == pos) printf("{");
        if (z->bra == pos) printf("[");
        if (z->c == pos) printf("|");
        if (z->ket == pos) printf("]");
        if (z->l == pos) printf("}");
        if (pos < limit) {
            int ch = z->p[pos];
            printf("%c", ch == 0 ? '#' : ch);
        }
    }
    printf("'\n");
}
#endif