ferret 0.9.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (295) hide show
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
data/ext/r_doc.c DELETED
@@ -1,582 +0,0 @@
1
- #include "ferret.h"
2
- #include "document.h"
3
-
4
- VALUE cDocument;
5
- VALUE cField;
6
- VALUE cFieldStore;
7
- VALUE cFieldIndex;
8
- VALUE cFieldTermVector;
9
-
10
- /****************************************************************************
11
- *
12
- * Field Methods
13
- *
14
- ****************************************************************************/
15
-
16
/*
 * Free callback handed to Data_Wrap_Struct for Field objects.
 *
 * p -- the wrapped DocField pointer. It is first removed from the object
 *      registry with object_del() and then released with df_destroy_data().
 */
void
frt_field_free(void *p)
{
  object_del(p);        /* unregister before the memory goes away */
  df_destroy_data(p);
}
22
-
23
- static VALUE
24
- frt_field_alloc(VALUE klass)
25
- {
26
- VALUE self;
27
- DocField *df = ALLOC(DocField);
28
- df->name = NULL;
29
- df->data = NULL;
30
- self = Data_Wrap_Struct(klass, NULL, &frt_field_free, df);
31
- object_add(df, self);
32
- return self;
33
- }
34
-
35
- #define GET_DF DocField *df = (DocField *)DATA_PTR(self)
36
-
37
- static VALUE
38
- frt_field_init(int argc, VALUE *argv, VALUE self)
39
- {
40
- GET_DF;
41
- VALUE rname, rdata, rstored, rindexed, rstore_tv, rbinary, rboost;
42
- char *name;
43
- char *data;
44
- float boost = 1.0;
45
- int stored = 0, indexed = 0, store_tv = 0;
46
- int len;
47
- bool binary = false;
48
- switch (rb_scan_args(argc, argv, "25", &rname, &rdata, &rstored,
49
- &rindexed, &rstore_tv, &rbinary, &rboost)) {
50
- case 7: boost = (float)rb_num2dbl(rboost);
51
- case 6: binary = RTEST(rbinary);
52
- case 5: store_tv = FIX2INT(rstore_tv);
53
- case 4: indexed = FIX2INT(rindexed);
54
- case 3: stored = FIX2INT(rstored);
55
- default:
56
- rname = rb_obj_as_string(rname);
57
- rdata = rb_obj_as_string(rdata);
58
- break;
59
- }
60
- name = RSTRING(rname)->ptr;
61
- len = RSTRING(rdata)->len;
62
- data = ALLOC_N(char, len + 1);
63
- MEMCPY(data, RSTRING(rdata)->ptr, char, len);
64
- data[len] = 0;
65
- df_set(df, name, data, stored, indexed, store_tv);
66
- df->blen = len;
67
- df->is_binary = binary;
68
- df->boost = boost;
69
- return Qnil;
70
- }
71
-
72
- static VALUE
73
- frt_field_get_name(VALUE self)
74
- {
75
- GET_DF;
76
- return rb_str_new2(df->name);
77
- }
78
-
79
- static VALUE
80
- frt_field_set_name(VALUE self, VALUE rname)
81
- {
82
- int len;
83
- GET_DF;
84
- rname = rb_obj_as_string(rname);
85
- len = RSTRING(rname)->len;
86
- REALLOC_N(df->name, char, len);
87
- MEMCPY(df->name, RSTRING(rname)->ptr, char, len);
88
- return Qnil;
89
- }
90
-
91
- static VALUE
92
- frt_field_get_data(VALUE self)
93
- {
94
- GET_DF;
95
- return rb_str_new(df->data, df->blen);
96
- }
97
-
98
- static VALUE
99
- frt_field_set_data(VALUE self, VALUE rdata)
100
- {
101
- int len;
102
- GET_DF;
103
- rdata = rb_obj_as_string(rdata);
104
- len = RSTRING(rdata)->len;
105
- REALLOC_N(df->data, char, len);
106
- MEMCPY(df->data, RSTRING(rdata)->ptr, char, len);
107
- df->blen = len;
108
- return Qnil;
109
- }
110
-
111
- static VALUE
112
- frt_field_get_boost(VALUE self)
113
- {
114
- GET_DF;
115
- return rb_float_new((double)df->boost);
116
- }
117
-
118
- static VALUE
119
- frt_field_set_boost(VALUE self, VALUE rboost)
120
- {
121
- GET_DF;
122
- df->boost = (float)rb_num2dbl(rboost);
123
- return Qnil;
124
- }
125
-
126
- static VALUE
127
- frt_field_is_stored(VALUE self)
128
- {
129
- GET_DF;
130
- return df->is_stored ? Qtrue : Qfalse;
131
- }
132
-
133
- static VALUE
134
- frt_field_is_indexed(VALUE self)
135
- {
136
- GET_DF;
137
- return df->is_indexed ? Qtrue : Qfalse;
138
- }
139
-
140
- static VALUE
141
- frt_field_is_tokenized(VALUE self)
142
- {
143
- GET_DF;
144
- return df->is_tokenized ? Qtrue : Qfalse;
145
- }
146
-
147
- static VALUE
148
- frt_field_is_binary(VALUE self)
149
- {
150
- GET_DF;
151
- return df->is_binary ? Qtrue : Qfalse;
152
- }
153
-
154
- static VALUE
155
- frt_field_is_compressed(VALUE self)
156
- {
157
- GET_DF;
158
- return df->is_compressed ? Qtrue : Qfalse;
159
- }
160
-
161
- static VALUE
162
- frt_field_store_tv(VALUE self)
163
- {
164
- GET_DF;
165
- return df->store_tv ? Qtrue : Qfalse;
166
- }
167
-
168
- static VALUE
169
- frt_field_store_pos(VALUE self)
170
- {
171
- GET_DF;
172
- return df->store_pos ? Qtrue : Qfalse;
173
- }
174
-
175
- static VALUE
176
- frt_field_store_offset(VALUE self)
177
- {
178
- GET_DF;
179
- return df->store_offset ? Qtrue : Qfalse;
180
- }
181
-
182
- static VALUE
183
- frt_field_omit_norms(VALUE self)
184
- {
185
- GET_DF;
186
- return df->omit_norms ? Qtrue : Qfalse;
187
- }
188
-
189
- static VALUE
190
- frt_field_to_s(VALUE self)
191
- {
192
- VALUE rstr;
193
- char *str;
194
- GET_DF;
195
-
196
- str = df_to_s(df);
197
- rstr = rb_str_new2(str);
198
- free(str);
199
- return rstr;
200
- }
201
-
202
- static VALUE
203
- frt_field_new_binary(VALUE klass, VALUE rname, VALUE rdata, VALUE rstore)
204
- {
205
- char *data;
206
- int len;
207
- DocField *df;
208
- int store = FIX2INT(rstore);
209
- rname = rb_obj_as_string(rname);
210
- rdata = rb_obj_as_string(rdata);
211
- len = RSTRING(rdata)->len;
212
- data = ALLOC_N(char, len);
213
- MEMCPY(data, RSTRING(rdata)->ptr, char, len);
214
-
215
- df = df_create_binary(RSTRING(rname)->ptr, data, len, store);
216
- return Data_Wrap_Struct(klass, NULL, &df_destroy_data, df);
217
- }
218
-
219
- static VALUE
220
- frt_field_set_store(VALUE self, VALUE rstore)
221
- {
222
- GET_DF;
223
- int store = FIX2INT(rstore);
224
- df_set_store(df, store);
225
- return Qnil;
226
- }
227
-
228
- static VALUE
229
- frt_field_set_term_vector(VALUE self, VALUE rterm_vector)
230
- {
231
- GET_DF;
232
- int term_vector = FIX2INT(rterm_vector);
233
- df_set_term_vector(df, term_vector);
234
- return Qnil;
235
- }
236
-
237
- static VALUE
238
- frt_field_set_index(VALUE self, VALUE rindex)
239
- {
240
- GET_DF;
241
- int index = FIX2INT(rindex);
242
- df_set_index(df, index);
243
- return Qnil;
244
- }
245
-
246
- /****************************************************************************
247
- *
248
- * Document Methods
249
- *
250
- ****************************************************************************/
251
-
252
/*
 * Free callback handed to Data_Wrap_Struct for Document objects.
 *
 * p -- the wrapped Document pointer. It is removed from the object registry
 *      with object_del() and then released with doc_destroy().
 */
void
frt_doc_free(void *p)
{
  object_del(p);   /* unregister before the memory goes away */
  doc_destroy(p);
}
258
-
259
- void
260
- frt_doc_mark(void *p)
261
- {
262
- int i;
263
- DocField *df;
264
- Document *doc = (Document *)p;
265
- for (i = 0; i < doc->dfcnt; i++) {
266
- df = doc->df_arr[i];
267
- frt_gc_mark(df);
268
- }
269
- }
270
-
271
- static VALUE
272
- frt_doc_alloc(VALUE klass)
273
- {
274
- Document *doc = doc_create();
275
- VALUE self = Data_Wrap_Struct(klass, &frt_doc_mark, &frt_doc_free, doc);
276
- doc->free_data = NULL;
277
- object_add(doc, self);
278
- return self;
279
- }
280
-
281
- VALUE
282
- frt_get_doc(Document *doc)
283
- {
284
- VALUE rfield, self = Qnil;
285
- DocField *df;
286
- int i;
287
-
288
- if (!doc || (self = object_get(doc)) != Qnil) return self;
289
-
290
- doc->free_data = NULL;
291
- self = Data_Wrap_Struct(cDocument, frt_doc_mark, frt_doc_free, doc);
292
-
293
- /* We add all the document's fields to the ruby object space so that they
294
- * can be retrieved in ruby later. This code must come after the above
295
- * wrapper which puts the document in the ruby object space so that there is
296
- * something to mark the doc fields when garbage collection starts. */
297
- for (i = 0; i < doc->dfcnt; i++) {
298
- df = doc->df_arr[i];
299
- rfield = Data_Wrap_Struct(cField, NULL, &frt_field_free, df);
300
- object_add(df, rfield);
301
- }
302
-
303
- object_add(doc, self);
304
- return self;
305
- }
306
-
307
- #define GET_DOC Document *doc = (Document *)DATA_PTR(self)
308
-
309
- static VALUE
310
- frt_doc_init(VALUE self)
311
- {
312
- return self;
313
- }
314
-
315
- static VALUE
316
- frt_doc_all_fields(VALUE self)
317
- {
318
- int i;
319
- GET_DOC;
320
- VALUE values = rb_ary_new2(doc->dfcnt);
321
- for (i = 0; i < doc->dfcnt; i++) {
322
- rb_ary_push(values, object_get(doc->df_arr[i]));
323
- }
324
- return values;
325
- }
326
-
327
- static VALUE
328
- frt_doc_field_count(VALUE self)
329
- {
330
- GET_DOC;
331
- return INT2FIX(doc->fcnt);
332
- }
333
-
334
- static VALUE
335
- frt_doc_entry_count(VALUE self)
336
- {
337
- GET_DOC;
338
- return INT2FIX(doc->dfcnt);
339
- }
340
-
341
- static VALUE
342
- frt_doc_add_field(VALUE self, VALUE rfield)
343
- {
344
- DocField *df;
345
- GET_DOC;
346
- Data_Get_Struct(rfield, DocField, df);
347
- doc_add_field(doc, df);
348
- return Qnil;
349
- }
350
-
351
- /* TODO: return the removed fields as an array */
352
- static VALUE
353
- frt_doc_remove_fields(VALUE self, VALUE rname)
354
- {
355
- Array *fields;
356
- GET_DOC;
357
- rname = rb_obj_as_string(rname);
358
- fields = doc_remove_fields(doc, RSTRING(rname)->ptr);
359
- ary_destroy(fields);
360
- return Qnil;
361
- }
362
-
363
- static VALUE
364
- frt_doc_remove_field(VALUE self, VALUE rname)
365
- {
366
- DocField *df;
367
- GET_DOC;
368
- rname = rb_obj_as_string(rname);
369
- df = doc_remove_field(doc, RSTRING(rname)->ptr);
370
- return object_get(df);
371
- }
372
-
373
- static VALUE
374
- frt_doc_field(VALUE self, VALUE rname)
375
- {
376
- GET_DOC;
377
- DocField *df;
378
- rname = rb_obj_as_string(rname);
379
- df = doc_get_field(doc, RSTRING(rname)->ptr);
380
- return object_get(df);
381
- }
382
-
383
- static VALUE
384
- frt_doc_fields(VALUE self, VALUE rname)
385
- {
386
- int i;
387
- VALUE fields;
388
- GET_DOC;
389
- Array *dfs;
390
- rname = rb_obj_as_string(rname);
391
- dfs = doc_get_fields(doc, RSTRING(rname)->ptr);
392
- if (!dfs) return Qnil;
393
- fields = rb_ary_new2(dfs->size);
394
- for (i = 0; i < dfs->size; i++) {
395
- rb_ary_push(fields, object_get(dfs->elems[i]));
396
- }
397
-
398
- return fields;
399
- }
400
-
401
- static VALUE
402
- frt_doc_values(VALUE self, VALUE rname)
403
- {
404
- int i, len = 0, vindex = 0;
405
- VALUE rvalues;
406
- char *values = NULL;
407
- GET_DOC;
408
- Array *dfs;
409
- DocField *df;
410
- rname = rb_obj_as_string(rname);
411
- dfs = doc_get_fields(doc, RSTRING(rname)->ptr);
412
- if (!dfs) return Qnil;
413
-
414
- for (i = 0; i < dfs->size; i++) {
415
- df = (DocField *)dfs->elems[i];
416
- if (df->is_binary) continue;
417
- len += df->blen + 1;
418
- REALLOC_N(values, char, len);
419
- MEMCPY(values + vindex, df->data, char, df->blen);
420
- vindex = len;
421
- values[vindex-1] = ' ';
422
- }
423
- if (len) {
424
- values[len-1] = '\0';
425
- rvalues = rb_str_new(values, len-1);
426
- free(values);
427
- } else {
428
- rvalues = Qnil;
429
- }
430
-
431
- return rvalues;
432
- }
433
-
434
- static VALUE
435
- frt_doc_binaries(VALUE self, VALUE rname)
436
- {
437
- int i;
438
- VALUE rvalues;
439
- GET_DOC;
440
- Array *dfs;
441
- DocField *df;
442
- rname = rb_obj_as_string(rname);
443
- dfs = doc_get_fields(doc, RSTRING(rname)->ptr);
444
- if (!dfs) return Qnil;
445
-
446
- rvalues = rb_ary_new2(dfs->size);
447
- for (i = 0; i < dfs->size; i++) {
448
- df = (DocField *)dfs->elems[i];
449
- if (!df->is_binary) continue;
450
- rb_ary_push(rvalues, rb_str_new(df->data, df->blen));
451
- }
452
- return rvalues;
453
- }
454
-
455
- static VALUE
456
- frt_doc_set(VALUE self, VALUE rname, VALUE rdata)
457
- {
458
- DocField *df;
459
- GET_DOC;
460
- VALUE rfield;
461
- rname = rb_obj_as_string(rname);
462
- rdata = rb_obj_as_string(rdata);
463
-
464
- df = doc_get_field(doc, RSTRING(rname)->ptr);
465
- if (df) {
466
- free(df->data);
467
- df->data = estrdup(RSTRING(rdata)->ptr);
468
- rfield = object_get(df);
469
- } else {
470
- rfield = rb_funcall(cField, id_new, 2, rname, rdata);
471
- Data_Get_Struct(rfield, DocField, df);
472
- doc_add_field(doc, df);
473
- }
474
- return rfield;
475
- }
476
- static VALUE
477
- frt_doc_to_s(VALUE self)
478
- {
479
- char *str;
480
- VALUE rstr;
481
- GET_DOC;
482
- str = doc_to_s(doc);
483
- rstr = rb_str_new2(str);
484
- free(str);
485
- return rstr;
486
- }
487
-
488
- static VALUE
489
- frt_doc_get_boost(VALUE self)
490
- {
491
- GET_DOC;
492
- return rb_float_new((double)doc->boost);
493
- }
494
-
495
- static VALUE
496
- frt_doc_set_boost(VALUE self, VALUE rboost)
497
- {
498
- GET_DOC;
499
- doc->boost = (float)rb_num2dbl(rboost);
500
- return Qnil;
501
- }
502
-
503
-
504
- /****************************************************************************
505
- *
506
- * Init Function
507
- *
508
- ****************************************************************************/
509
-
510
- void
511
- Init_doc(void)
512
- {
513
- /* Field */
514
- cField = rb_define_class_under(mDocument, "Field", rb_cObject);
515
- rb_define_alloc_func(cField, frt_field_alloc);
516
-
517
- rb_define_method(cField, "initialize", frt_field_init, -1);
518
- rb_define_singleton_method(cField, "new_binary_field",
519
- frt_field_new_binary, 3);
520
- rb_define_method(cField, "name", frt_field_get_name, 0);
521
- rb_define_method(cField, "name=", frt_field_set_name, 1);
522
- rb_define_method(cField, "data", frt_field_get_data, 0);
523
- rb_define_method(cField, "data=", frt_field_set_data, 1);
524
- rb_define_method(cField, "boost", frt_field_get_boost, 0);
525
- rb_define_method(cField, "boost=", frt_field_set_boost, 1);
526
- rb_define_method(cField, "stored?", frt_field_is_stored, 0);
527
- rb_define_method(cField, "indexed?", frt_field_is_indexed, 0);
528
- rb_define_method(cField, "tokenized?", frt_field_is_tokenized, 0);
529
- rb_define_method(cField, "binary?", frt_field_is_binary, 0);
530
- rb_define_method(cField, "compressed?", frt_field_is_compressed, 0);
531
- rb_define_method(cField, "store_term_vector?", frt_field_store_tv, 0);
532
- rb_define_method(cField, "store_positions?", frt_field_store_pos, 0);
533
- rb_define_method(cField, "store_offsets?", frt_field_store_offset, 0);
534
- rb_define_method(cField, "omit_norms?", frt_field_omit_norms, 0);
535
- rb_define_method(cField, "to_s", frt_field_to_s, 0);
536
- rb_define_method(cField, "store=", frt_field_set_store, 1);
537
- rb_define_method(cField, "index=", frt_field_set_index, 1);
538
- rb_define_method(cField, "term_vector=", frt_field_set_term_vector, 1);
539
-
540
- /* Field Constants */
541
- cFieldStore = rb_define_class_under(cField, "Store", rb_cObject);
542
- rb_define_const(cFieldStore, "YES", INT2FIX(DF_STORE_YES));
543
- rb_define_const(cFieldStore, "NO", INT2FIX(DF_STORE_NO));
544
- rb_define_const(cFieldStore, "COMPRESS", INT2FIX(DF_STORE_COMPRESS));
545
- cFieldIndex = rb_define_class_under(cField, "Index", rb_cObject);
546
- rb_define_const(cFieldIndex, "UNTOKENIZED", INT2FIX(DF_INDEX_UNTOKENIZED));
547
- rb_define_const(cFieldIndex, "TOKENIZED", INT2FIX(DF_INDEX_TOKENIZED));
548
- rb_define_const(cFieldIndex, "NO", INT2FIX(DF_INDEX_NO));
549
- rb_define_const(cFieldIndex, "NO_NORMS", INT2FIX(DF_INDEX_NO_NORMS));
550
- cFieldTermVector = rb_define_class_under(cField, "TermVector", rb_cObject);
551
- rb_define_const(cFieldTermVector, "NO", INT2FIX(DF_TERM_VECTOR_NO));
552
- rb_define_const(cFieldTermVector, "YES", INT2FIX(DF_TERM_VECTOR_YES));
553
- rb_define_const(cFieldTermVector, "WITH_POSITIONS",
554
- INT2FIX(DF_TERM_VECTOR_WITH_POSITIONS));
555
- rb_define_const(cFieldTermVector, "WITH_OFFSETS",
556
- INT2FIX(DF_TERM_VECTOR_WITH_OFFSETS));
557
- rb_define_const(cFieldTermVector, "WITH_POSITIONS_OFFSETS",
558
- INT2FIX(DF_TERM_VECTOR_WITH_POSITIONS_OFFSETS));
559
-
560
- /* Document */
561
- cDocument = rb_define_class_under(mDocument, "Document", rb_cObject);
562
- rb_define_alloc_func(cDocument, frt_doc_alloc);
563
-
564
- rb_define_method(cDocument, "initialize", frt_doc_init, 0);
565
- rb_define_method(cDocument, "all_fields", frt_doc_all_fields, 0);
566
- rb_define_method(cDocument, "field_count", frt_doc_field_count, 0);
567
- rb_define_method(cDocument, "entry_count", frt_doc_entry_count, 0);
568
- rb_define_method(cDocument, "add_field", frt_doc_add_field, 1);
569
- rb_define_method(cDocument, "<<", frt_doc_add_field, 1);
570
- rb_define_method(cDocument, "remove_fields", frt_doc_remove_fields, 1);
571
- rb_define_method(cDocument, "remove_field", frt_doc_remove_field, 1);
572
- rb_define_method(cDocument, "field", frt_doc_field, 1);
573
- rb_define_method(cDocument, "fields", frt_doc_fields, 1);
574
- rb_define_method(cDocument, "values", frt_doc_values, 1);
575
- rb_define_method(cDocument, "binaries", frt_doc_binaries, 1);
576
- rb_define_method(cDocument, "[]", frt_doc_values, 1);
577
- rb_define_method(cDocument, "set", frt_doc_set, 2);
578
- rb_define_method(cDocument, "[]=", frt_doc_set, 2);
579
- rb_define_method(cDocument, "to_s", frt_doc_to_s, 0);
580
- rb_define_method(cDocument, "boost", frt_doc_get_boost, 0);
581
- rb_define_method(cDocument, "boost=", frt_doc_set_boost, 1);
582
- }