jk-ferret 0.11.8.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228) hide show
  1. data/CHANGELOG +24 -0
  2. data/MIT-LICENSE +20 -0
  3. data/README +90 -0
  4. data/RELEASE_CHANGES +137 -0
  5. data/RELEASE_NOTES +60 -0
  6. data/Rakefile +443 -0
  7. data/TODO +109 -0
  8. data/TUTORIAL +231 -0
  9. data/bin/ferret-browser +79 -0
  10. data/ext/BZLIB_blocksort.c +1094 -0
  11. data/ext/BZLIB_bzlib.c +1578 -0
  12. data/ext/BZLIB_compress.c +672 -0
  13. data/ext/BZLIB_crctable.c +104 -0
  14. data/ext/BZLIB_decompress.c +626 -0
  15. data/ext/BZLIB_huffman.c +205 -0
  16. data/ext/BZLIB_randtable.c +84 -0
  17. data/ext/STEMMER_api.c +66 -0
  18. data/ext/STEMMER_libstemmer.c +93 -0
  19. data/ext/STEMMER_stem_ISO_8859_1_danish.c +337 -0
  20. data/ext/STEMMER_stem_ISO_8859_1_dutch.c +624 -0
  21. data/ext/STEMMER_stem_ISO_8859_1_english.c +1117 -0
  22. data/ext/STEMMER_stem_ISO_8859_1_finnish.c +762 -0
  23. data/ext/STEMMER_stem_ISO_8859_1_french.c +1246 -0
  24. data/ext/STEMMER_stem_ISO_8859_1_german.c +503 -0
  25. data/ext/STEMMER_stem_ISO_8859_1_hungarian.c +1230 -0
  26. data/ext/STEMMER_stem_ISO_8859_1_italian.c +1065 -0
  27. data/ext/STEMMER_stem_ISO_8859_1_norwegian.c +297 -0
  28. data/ext/STEMMER_stem_ISO_8859_1_porter.c +749 -0
  29. data/ext/STEMMER_stem_ISO_8859_1_portuguese.c +1017 -0
  30. data/ext/STEMMER_stem_ISO_8859_1_spanish.c +1093 -0
  31. data/ext/STEMMER_stem_ISO_8859_1_swedish.c +307 -0
  32. data/ext/STEMMER_stem_ISO_8859_2_romanian.c +998 -0
  33. data/ext/STEMMER_stem_KOI8_R_russian.c +700 -0
  34. data/ext/STEMMER_stem_UTF_8_danish.c +339 -0
  35. data/ext/STEMMER_stem_UTF_8_dutch.c +634 -0
  36. data/ext/STEMMER_stem_UTF_8_english.c +1125 -0
  37. data/ext/STEMMER_stem_UTF_8_finnish.c +768 -0
  38. data/ext/STEMMER_stem_UTF_8_french.c +1256 -0
  39. data/ext/STEMMER_stem_UTF_8_german.c +509 -0
  40. data/ext/STEMMER_stem_UTF_8_hungarian.c +1234 -0
  41. data/ext/STEMMER_stem_UTF_8_italian.c +1073 -0
  42. data/ext/STEMMER_stem_UTF_8_norwegian.c +299 -0
  43. data/ext/STEMMER_stem_UTF_8_porter.c +755 -0
  44. data/ext/STEMMER_stem_UTF_8_portuguese.c +1023 -0
  45. data/ext/STEMMER_stem_UTF_8_romanian.c +1004 -0
  46. data/ext/STEMMER_stem_UTF_8_russian.c +694 -0
  47. data/ext/STEMMER_stem_UTF_8_spanish.c +1097 -0
  48. data/ext/STEMMER_stem_UTF_8_swedish.c +309 -0
  49. data/ext/STEMMER_stem_UTF_8_turkish.c +2205 -0
  50. data/ext/STEMMER_utilities.c +478 -0
  51. data/ext/analysis.c +1710 -0
  52. data/ext/analysis.h +266 -0
  53. data/ext/api.h +26 -0
  54. data/ext/array.c +125 -0
  55. data/ext/array.h +62 -0
  56. data/ext/bitvector.c +96 -0
  57. data/ext/bitvector.h +594 -0
  58. data/ext/bzlib.h +282 -0
  59. data/ext/bzlib_private.h +503 -0
  60. data/ext/compound_io.c +384 -0
  61. data/ext/config.h +52 -0
  62. data/ext/document.c +159 -0
  63. data/ext/document.h +63 -0
  64. data/ext/except.c +102 -0
  65. data/ext/except.h +176 -0
  66. data/ext/extconf.rb +15 -0
  67. data/ext/ferret.c +416 -0
  68. data/ext/ferret.h +94 -0
  69. data/ext/field_index.c +262 -0
  70. data/ext/field_index.h +52 -0
  71. data/ext/filter.c +157 -0
  72. data/ext/fs_store.c +493 -0
  73. data/ext/global.c +458 -0
  74. data/ext/global.h +302 -0
  75. data/ext/hash.c +524 -0
  76. data/ext/hash.h +515 -0
  77. data/ext/hashset.c +192 -0
  78. data/ext/hashset.h +215 -0
  79. data/ext/header.h +58 -0
  80. data/ext/helper.c +63 -0
  81. data/ext/helper.h +21 -0
  82. data/ext/index.c +6804 -0
  83. data/ext/index.h +935 -0
  84. data/ext/internal.h +1019 -0
  85. data/ext/lang.c +10 -0
  86. data/ext/lang.h +68 -0
  87. data/ext/libstemmer.h +79 -0
  88. data/ext/mempool.c +88 -0
  89. data/ext/mempool.h +43 -0
  90. data/ext/modules.h +190 -0
  91. data/ext/multimapper.c +351 -0
  92. data/ext/multimapper.h +60 -0
  93. data/ext/posh.c +1006 -0
  94. data/ext/posh.h +973 -0
  95. data/ext/priorityqueue.c +149 -0
  96. data/ext/priorityqueue.h +155 -0
  97. data/ext/q_boolean.c +1621 -0
  98. data/ext/q_const_score.c +162 -0
  99. data/ext/q_filtered_query.c +212 -0
  100. data/ext/q_fuzzy.c +280 -0
  101. data/ext/q_match_all.c +149 -0
  102. data/ext/q_multi_term.c +673 -0
  103. data/ext/q_parser.c +3103 -0
  104. data/ext/q_phrase.c +1206 -0
  105. data/ext/q_prefix.c +98 -0
  106. data/ext/q_range.c +682 -0
  107. data/ext/q_span.c +2390 -0
  108. data/ext/q_term.c +337 -0
  109. data/ext/q_wildcard.c +167 -0
  110. data/ext/r_analysis.c +2626 -0
  111. data/ext/r_index.c +3468 -0
  112. data/ext/r_qparser.c +635 -0
  113. data/ext/r_search.c +4490 -0
  114. data/ext/r_store.c +513 -0
  115. data/ext/r_utils.c +1131 -0
  116. data/ext/ram_store.c +476 -0
  117. data/ext/scanner.c +895 -0
  118. data/ext/scanner.h +36 -0
  119. data/ext/scanner_mb.c +6701 -0
  120. data/ext/scanner_utf8.c +4415 -0
  121. data/ext/search.c +1864 -0
  122. data/ext/search.h +953 -0
  123. data/ext/similarity.c +151 -0
  124. data/ext/similarity.h +89 -0
  125. data/ext/sort.c +786 -0
  126. data/ext/stem_ISO_8859_1_danish.h +16 -0
  127. data/ext/stem_ISO_8859_1_dutch.h +16 -0
  128. data/ext/stem_ISO_8859_1_english.h +16 -0
  129. data/ext/stem_ISO_8859_1_finnish.h +16 -0
  130. data/ext/stem_ISO_8859_1_french.h +16 -0
  131. data/ext/stem_ISO_8859_1_german.h +16 -0
  132. data/ext/stem_ISO_8859_1_hungarian.h +16 -0
  133. data/ext/stem_ISO_8859_1_italian.h +16 -0
  134. data/ext/stem_ISO_8859_1_norwegian.h +16 -0
  135. data/ext/stem_ISO_8859_1_porter.h +16 -0
  136. data/ext/stem_ISO_8859_1_portuguese.h +16 -0
  137. data/ext/stem_ISO_8859_1_spanish.h +16 -0
  138. data/ext/stem_ISO_8859_1_swedish.h +16 -0
  139. data/ext/stem_ISO_8859_2_romanian.h +16 -0
  140. data/ext/stem_KOI8_R_russian.h +16 -0
  141. data/ext/stem_UTF_8_danish.h +16 -0
  142. data/ext/stem_UTF_8_dutch.h +16 -0
  143. data/ext/stem_UTF_8_english.h +16 -0
  144. data/ext/stem_UTF_8_finnish.h +16 -0
  145. data/ext/stem_UTF_8_french.h +16 -0
  146. data/ext/stem_UTF_8_german.h +16 -0
  147. data/ext/stem_UTF_8_hungarian.h +16 -0
  148. data/ext/stem_UTF_8_italian.h +16 -0
  149. data/ext/stem_UTF_8_norwegian.h +16 -0
  150. data/ext/stem_UTF_8_porter.h +16 -0
  151. data/ext/stem_UTF_8_portuguese.h +16 -0
  152. data/ext/stem_UTF_8_romanian.h +16 -0
  153. data/ext/stem_UTF_8_russian.h +16 -0
  154. data/ext/stem_UTF_8_spanish.h +16 -0
  155. data/ext/stem_UTF_8_swedish.h +16 -0
  156. data/ext/stem_UTF_8_turkish.h +16 -0
  157. data/ext/stopwords.c +410 -0
  158. data/ext/store.c +698 -0
  159. data/ext/store.h +799 -0
  160. data/ext/symbol.c +10 -0
  161. data/ext/symbol.h +23 -0
  162. data/ext/term_vectors.c +73 -0
  163. data/ext/threading.h +31 -0
  164. data/ext/win32.h +62 -0
  165. data/lib/ferret.rb +30 -0
  166. data/lib/ferret/browser.rb +246 -0
  167. data/lib/ferret/browser/s/global.js +192 -0
  168. data/lib/ferret/browser/s/style.css +148 -0
  169. data/lib/ferret/browser/views/document/list.rhtml +49 -0
  170. data/lib/ferret/browser/views/document/show.rhtml +27 -0
  171. data/lib/ferret/browser/views/error/index.rhtml +7 -0
  172. data/lib/ferret/browser/views/help/index.rhtml +8 -0
  173. data/lib/ferret/browser/views/home/index.rhtml +29 -0
  174. data/lib/ferret/browser/views/layout.rhtml +22 -0
  175. data/lib/ferret/browser/views/term-vector/index.rhtml +4 -0
  176. data/lib/ferret/browser/views/term/index.rhtml +199 -0
  177. data/lib/ferret/browser/views/term/termdocs.rhtml +1 -0
  178. data/lib/ferret/browser/webrick.rb +14 -0
  179. data/lib/ferret/document.rb +130 -0
  180. data/lib/ferret/field_infos.rb +44 -0
  181. data/lib/ferret/field_symbol.rb +87 -0
  182. data/lib/ferret/index.rb +973 -0
  183. data/lib/ferret/number_tools.rb +157 -0
  184. data/lib/ferret/version.rb +3 -0
  185. data/setup.rb +1555 -0
  186. data/test/long_running/largefile/tc_largefile.rb +46 -0
  187. data/test/test_all.rb +5 -0
  188. data/test/test_helper.rb +29 -0
  189. data/test/test_installed.rb +1 -0
  190. data/test/threading/number_to_spoken.rb +132 -0
  191. data/test/threading/thread_safety_index_test.rb +88 -0
  192. data/test/threading/thread_safety_read_write_test.rb +73 -0
  193. data/test/threading/thread_safety_test.rb +133 -0
  194. data/test/unit/analysis/tc_analyzer.rb +550 -0
  195. data/test/unit/analysis/tc_token_stream.rb +653 -0
  196. data/test/unit/index/tc_index.rb +867 -0
  197. data/test/unit/index/tc_index_reader.rb +699 -0
  198. data/test/unit/index/tc_index_writer.rb +447 -0
  199. data/test/unit/index/th_doc.rb +332 -0
  200. data/test/unit/query_parser/tc_query_parser.rb +238 -0
  201. data/test/unit/search/tc_filter.rb +156 -0
  202. data/test/unit/search/tc_fuzzy_query.rb +147 -0
  203. data/test/unit/search/tc_index_searcher.rb +67 -0
  204. data/test/unit/search/tc_multi_searcher.rb +128 -0
  205. data/test/unit/search/tc_multiple_search_requests.rb +58 -0
  206. data/test/unit/search/tc_search_and_sort.rb +179 -0
  207. data/test/unit/search/tc_sort.rb +49 -0
  208. data/test/unit/search/tc_sort_field.rb +27 -0
  209. data/test/unit/search/tc_spans.rb +190 -0
  210. data/test/unit/search/tm_searcher.rb +436 -0
  211. data/test/unit/store/tc_fs_store.rb +115 -0
  212. data/test/unit/store/tc_ram_store.rb +35 -0
  213. data/test/unit/store/tm_store.rb +34 -0
  214. data/test/unit/store/tm_store_lock.rb +68 -0
  215. data/test/unit/tc_document.rb +81 -0
  216. data/test/unit/tc_field_symbol.rb +26 -0
  217. data/test/unit/ts_analysis.rb +2 -0
  218. data/test/unit/ts_index.rb +2 -0
  219. data/test/unit/ts_largefile.rb +4 -0
  220. data/test/unit/ts_query_parser.rb +2 -0
  221. data/test/unit/ts_search.rb +2 -0
  222. data/test/unit/ts_store.rb +2 -0
  223. data/test/unit/ts_utils.rb +2 -0
  224. data/test/unit/utils/tc_bit_vector.rb +295 -0
  225. data/test/unit/utils/tc_number_tools.rb +117 -0
  226. data/test/unit/utils/tc_priority_queue.rb +106 -0
  227. data/test/utils/content_generator.rb +226 -0
  228. metadata +319 -0
@@ -0,0 +1,22 @@
1
+ <?xml version="1.0" encoding="utf-8"?>
2
+ <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd" >
3
+ <html xmlns="http://www.w3.org/1999/xhtml">
4
+ <head>
5
+ <title>Ferret-Browser:: <%= @path %></title>
6
+ <link rel="shortcut icon" type="image/x-icon" href="/s/i/favicon.ico" />
7
+ <link rel="stylesheet" type="text/css" href="/s/style.css" />
8
+ <script type="text/javascript" src="/s/global.js"></script>
9
+ </head>
10
+ <body id="<%= options[:controller] %>">
11
+ <ul id="top-menu">
12
+ <li class="home"><a href="/">Home</a></li>
13
+ <li class="document"><a href="/document">Documents</a></li>
14
+ <li class="term"><a href="/term">Terms</a></li>
15
+ <li class="term-vector"><a href="/term-vector">Term Vectors</a></li>
16
+ <li class="help"><a href="/help">Help</a></li>
17
+ </ul>
18
+ <div id="content">
19
+ <%= content %>
20
+ </div>
21
+ </body>
22
+ </html>
@@ -0,0 +1,4 @@
1
+ <h3>Term Vectors</h3>
2
+ <p>
3
+ Nothing to see here yet
4
+ </p>
@@ -0,0 +1,199 @@
1
+ <h3>Terms</h3>
2
+ <form action="" method="get">
3
+ <label for="field">Choose a field:
4
+ <select id="field" name="field" onchange="location.href='/term/show/' + this.value;">
5
+ <option value="">--</option>
6
+ <% @reader.field_infos.each do |fi| next unless fi.indexed? %>
7
+ <option value="<%=fi.name%>" <%= 'selected="selected"' if @field == fi.name %>><%=fi.name%></option>
8
+ <% end %>
9
+ </select>
10
+ </label>
11
+ </form>
12
+ <hr/>
13
+ <% if @terms %>
14
+ <div style="float:left; padding-right:30px;">
15
+ <h4>Field: <%= @field %></h4>
16
+ <form action="" onsubmit="return findTerm(document.getElementById('choose-term').value);">
17
+ <input id="choose-term" type="text"/></form>
18
+ <hr/>
19
+ <div id="terms">
20
+ </div>
21
+ <hr/>
22
+ <form action="">
23
+ <input type="submit" value="Previous" onclick="return prevTerm();"/>
24
+ <input type="submit" value="Next" onclick="return nextTerm();"/><br/>
25
+ </form>
26
+ </div>
27
+ <div style="float:left; padding-right:30px;">
28
+ <h4>Documents
29
+ <input name="show-documents" type="checkbox" id="show-documents" onchange="displayDoc();"/></h4>
30
+ <div id="doc-display">
31
+ <form action="" onsubmit="return findDoc(document.getElementById('choose-doc').value);">
32
+ <input id="choose-doc" type="text"/></form>
33
+ <hr/>
34
+ <div id="documents">
35
+ </div>
36
+ <hr/>
37
+ <form action="">
38
+ <input type="submit" value="Previous" onclick="return prevDoc();"/>
39
+ <input type="submit" value="Next" onclick="return nextDoc();"/><br/>
40
+ </form>
41
+ </div>
42
+ </div>
43
+ <script type="text/javascript"><!--
44
/**
 * Build a two-column table whose left column holds the row headers.
 *
 * Each entry of `content` is a [header, value] pair. The header is rendered
 * as text inside a <th>. The value goes into a <td>: it is appended as-is
 * when it is an object (e.g. a DOM node) and wrapped in a text node
 * otherwise. Returns the new <table> element (class "left-headed").
 */
function createSideHeaderedTable(content) {
  var table = document.createElement("table");
  table.setAttribute('cellpadding', "0");
  table.setAttribute('cellspacing', "0");
  table.className = "left-headed";

  for (var row = 0; row < content.length; row++) {
    var header = document.createElement("th");
    header.appendChild(document.createTextNode(content[row][0]));

    var value = content[row][1];
    var cell = document.createElement("td");
    if (value instanceof Object) {
      cell.appendChild(value);
    } else {
      cell.appendChild(document.createTextNode(value));
    }

    var line = document.createElement("tr");
    line.appendChild(header);
    line.appendChild(cell);
    table.appendChild(line);
  }
  return table;
}
62
+
63
+ var terms = <%= @terms %>;
64
+ var numTerms = terms.length;
65
+ var termI = 0;
66
+ var termDiv = document.getElementById("terms");
67
+ var termChooserTxt = document.getElementById('choose-term');
68
+
69
+ var docs = null;
70
+ var numDocs = 0;
71
+ var docI = 0;
72
+ var docDiv = document.getElementById("documents");
73
+ var showDocs = document.getElementById("show-documents");
74
+ var docChooserTxt = document.getElementById('choose-doc');
75
+ var docDisplayDiv = document.getElementById('doc-display');
76
+
77
/**
 * Asynchronously fetch the documents containing the current term and, once
 * the response has arrived, store them in `docs`, reset the document cursor
 * and redraw the document panel. Always returns false.
 */
function getDocs() {
  var req = new XMLHttpRequest();
  // The term is arbitrary index text, so it must be escaped before being
  // placed in the URL path; otherwise characters such as '?', '#', '%' or
  // spaces produce a request for the wrong resource.
  // NOTE(review): assumes the server unescapes the path segment - confirm.
  req.open('GET', '/term/termdocs/<%=@field%>/' + encodeURIComponent(terms[termI][0]), true);
  req.onreadystatechange = function() {
    if (req.readyState == 4) {
      // NOTE(review): eval() executes whatever the server returns. The
      // response is expected to be a JSON array; consider JSON.parse if all
      // supported browsers provide it.
      docs = eval(req.responseText);
      numDocs = docs.length;
      docI = 0;
      displayDoc();
    }
  };
  req.send(/*no params*/null);
  return false;
}
91
/**
 * Redraw the document panel for the current term.
 *
 * When the "show documents" checkbox is unchecked the panel is emptied and
 * hidden. When it is checked the panel is shown; if the documents have not
 * been loaded yet a fetch is started via getDocs(), otherwise a table
 * describing the document at `docI` replaces the panel content.
 * Always returns false.
 */
function displayDoc() {
  if (!showDocs.checked) {
    docDiv.innerHTML = '';
    docDisplayDiv.style.display = 'none';
    return false;
  }

  docDisplayDiv.style.display = 'block';
  if (docs == null) {
    // Nothing loaded for this term yet; getDocs() calls back into
    // displayDoc() when the data arrives.
    getDocs();
    return false;
  }

  docChooserTxt.value = docs[docI][0];

  var docLink = document.createElement('a');
  docLink.setAttribute('href', "/document/show/" + docs[docI][0]);
  docLink.appendChild(document.createTextNode(docs[docI][0]));

  var rows = [
    ['index', '' + (docI + 1) + ' of ' + numDocs],
    ['document', docLink],
    ['number of occurrences', docs[docI][1]],
    ['positions', docs[docI][2].join(',')]
  ];
  var table = createSideHeaderedTable(rows);

  if (docDiv.firstChild) {
    docDiv.replaceChild(table, docDiv.firstChild);
  } else {
    docDiv.appendChild(table);
  }
  return false;
}
116
// Advance to the next document of the current term and redraw, or tell the
// user that the last document is already shown. Always returns false.
function nextDoc() {
  if (!(docI < numDocs - 1)) {
    alert('No more docs. Already at the end.');
    return false;
  }
  docI++;
  displayDoc();
  return false;
}
123
// Step back to the previous document of the current term and redraw, or tell
// the user that the first document is already shown. Always returns false.
function prevDoc() {
  if (!(docI > 0)) {
    alert('No more docs. Already at the start.');
    return false;
  }
  docI--;
  displayDoc();
  return false;
}
130
// Move the document cursor to the position bsearch() reports for `doc`
// (clamped to the last entry) and redraw. Does nothing when no documents are
// loaded. Always returns false.
function findDoc(doc) {
  if (!(docs && docs.length > 0)) {
    return false;
  }
  var comesBefore = function(a, b) {return a[0] < b};
  docI = bsearch(docs, doc, comesBefore);
  if (docI >= docs.length) {
    docI -= 1;
  }
  displayDoc();
  return false;
}
138
/**
 * Show the term at `termI`: forget the documents loaded for the previous
 * term, copy the term into the chooser text box, replace the term table and
 * refresh the document panel.
 */
function displayTerm() {
  docs = null;
  termChooserTxt.value = terms[termI][0];
  // No trailing comma after the last row: some older script engines turn it
  // into an extra undefined element, which createSideHeaderedTable() would
  // then fail to index.
  var table = createSideHeaderedTable([
    ['index', '' + (termI + 1) + ' of ' + numTerms],
    ['term', terms[termI][0]],
    ['number of documents', terms[termI][1]]
  ]);
  // Same replace-or-append handling as displayDoc(), so an empty container
  // does not make replaceChild() fail.
  if (termDiv.firstChild) termDiv.replaceChild(table, termDiv.firstChild);
  else termDiv.appendChild(table);
  displayDoc();
}
149
// Advance to the next term and redraw, or tell the user that the last term
// is already shown. Always returns false.
function nextTerm() {
  if (!(termI < numTerms - 1)) {
    alert('No more terms. Already at the end.');
    return false;
  }
  termI++;
  displayTerm();
  return false;
}
156
// Step back to the previous term and redraw, or tell the user that the first
// term is already shown. Always returns false.
function prevTerm() {
  if (!(termI > 0)) {
    alert('No more terms. Already at the start.');
    return false;
  }
  termI--;
  displayTerm();
  return false;
}
163
/**
 * Look `term` up in the term list. When it is present, make it the current
 * term and display it; otherwise alert the user and leave the current term
 * untouched. Always returns false.
 */
function findTerm(term) {
  if (terms && terms.length > 0) {
    // Keep the search result in a local: committing it to termI before the
    // match check would leave the cursor on an unrelated (or out-of-range)
    // position after a failed lookup, breaking the index display and
    // later navigation.
    var found = bsearch(terms, term, function(a, b) {return a[0] < b});
    var match = (terms[found]||[])[0];
    if (term != match) {
      alert('Term <%=@field%>:' + term + ' not found in index');
    } else {
      termI = found;
      displayTerm();
    }
  }
  return false;
}
175
+
176
/**
 * Suggestion provider for the term chooser text box. It offers terms from
 * the `terms` list that start with the text typed so far.
 */
function TermSuggestionProvider() {
}

/**
 * Collect up to ten terms that begin with the current text box value and
 * hand them to the control's autosuggest(). Nothing is suggested while the
 * text box is empty.
 */
TermSuggestionProvider.prototype.requestSuggestions = function(oAutoSuggestControl, bTypeAhead) {
  var aSuggestions = new Array();
  var sTextboxValue = oAutoSuggestControl.textbox.value;

  if (sTextboxValue.length > 0) {
    // Declared with `var` so the search position does not leak into (or
    // clobber) a global named `start`.
    var start = bsearch(terms, sTextboxValue, function(a, b) {return a[0] < b});
    for (var i = start; i < terms.length
         && terms[i][0].indexOf(sTextboxValue) == 0
         && aSuggestions.length < 10; i++) {
      aSuggestions.push(terms[i][0]);
    }
    oAutoSuggestControl.autosuggest(aSuggestions, bTypeAhead);
  }
};
192
// Once the page has loaded, attach auto-suggestion to the term chooser.
window.onload = function() {
  new AutoSuggestControl(termChooserTxt, new TermSuggestionProvider());
};

// Render the first term straight away.
displayTerm();
196
+ //-->
197
+ </script>
198
+ <% end %>
199
+ <hr/>
@@ -0,0 +1 @@
1
+ <%= @reader.term_positions_for(@field, @term).to_json(:fast) %>
@@ -0,0 +1,14 @@
1
module WEBrick
  # WEBrick servlet that answers requests by handing them to a
  # Ferret::Browser::Delegator.
  class FerretBrowserHandler < WEBrick::HTTPServlet::AbstractServlet
    # Creates a FerretBrowserHandler. The +reader+ and +path+ arguments are
    # passed straight to Ferret::Browser::Delegator.new; +server+ goes to the
    # servlet superclass.
    def initialize(server, reader, path)
      super(server)
      @delegator = Ferret::Browser::Delegator.new(reader, path)
    end

    # Handler for WEBrick GET requests. The request's meta variables are
    # given to the delegator, whose result is unpacked into the response's
    # status, content type and body, in that order.
    def do_GET(req, res)
      response = @delegator.run(req.meta_vars)
      res.status, res.content_type, res.body = response
    end
  end
end
@@ -0,0 +1,130 @@
1
module Ferret
  # Instead of using documents to add data to an index you can use Hashes and
  # Arrays. The only real benefits of using a Document over a Hash are pretty
  # printing and the boost attribute. You can add the boost attribute to
  # Hashes and Arrays using the BoostMixin. For example:
  #
  #    class Hash
  #      include BoostMixin
  #    end
  #
  #    class Array
  #      include BoostMixin
  #    end
  #
  #    class String
  #      include BoostMixin
  #    end
  module BoostMixin
    attr_accessor :boost
  end

  # Documents are the unit of indexing and search.
  #
  # A Document is a set of fields. Each field has a name and an array of
  # textual values. If you are coming from a Lucene background you should note
  # that Fields don't have any properties except for the boost property. You
  # should use the Ferret::Index::FieldInfos class to set field properties
  # across the whole index instead.
  #
  # === Boost
  #
  # The boost attribute makes a Document more important in the index. That is,
  # you can increase the score of a match for queries that match a particular
  # document, making it more likely to appear at the top of search results.
  # You may, for example, want to boost products that have a higher user
  # rating so that they are more likely to appear in search results.
  #
  # Note that fields which are _not_ stored (see Ferret::Index::FieldInfos)
  # are _not_ available in documents retrieved from the index, e.g.
  # Ferret::Search::Searcher#doc or Ferret::Index::IndexReader#doc.
  #
  # Note that modifying a Document retrieved from the index will not modify
  # the document contained within the index. You need to delete the old
  # version of the document and add the new version of the document.
  class Document < Hash
    include BoostMixin

    # Create a new Document object with a boost. The boost defaults to 1.0.
    def initialize(boost = 1.0)
      super()
      @boost = boost
    end

    # Return true if the documents are equal, i.e. +o+ is a Document with the
    # same boost and the same fields holding equal values. The order in which
    # the fields were added does not matter.
    def eql?(o)
      return false unless o.is_a?(Document) && o.boost == @boost

      # Compare field by field rather than comparing the key and value
      # arrays, which would make equality depend on insertion order.
      size == o.size && all? { |key, value| o.key?(key) && value == o[key] }
    end
    alias :== :eql?

    # Create a string representation of the document. Fields are listed one
    # per line, sorted by field name; a boost other than 1.0 is appended
    # after the closing brace as "^boost".
    def to_s
      buf = ["Document {"]
      keys.sort_by { |key| key.to_s }.each do |key|
        val = self[key]
        val_str = if val.instance_of?(Array) then %{["#{val.join('", "')}"]}
                  elsif val.is_a?(Field) then val.to_s
                  else %{"#{val}"}
                  end
        buf << " :#{key} => #{val_str}"
      end
      buf << "}#{@boost == 1.0 ? "" : "^" + @boost.to_s}"
      buf.join("\n")
    end
  end

  # A Field is a section of a Document. A Field is basically an array with a
  # boost attribute. It also provides pretty printing of the field with the
  # #to_s method.
  #
  # === Boost
  #
  # The boost attribute makes a field more important in the index. That is,
  # you can increase the score of a match for queries that match terms in a
  # boosted field. You may, for example, want to boost a title field so that
  # matches in the :title field score more highly than matches in the
  # :contents field.
  #
  # Note: If you'd like to use boosted fields without having to use
  # the Field class you can just include the BoostMixin in the Array class.
  # See BoostMixin.
  class Field < Array
    include BoostMixin

    # Create a new Field object. You can pass data to the field as either a
    # string;
    #
    #    f = Field.new("This is the fields data")
    #
    # or as an array of strings;
    #
    #    f = Field.new(["this", "is", "an", "array", "of", "field", "data"])
    #
    # Of course Fields can also be boosted;
    #
    #    f = Field.new("field data", 1000.0)
    #
    # Anything that is not an Array is converted with #to_s and stored as the
    # single element of the field.
    def initialize(data = [], boost = 1.0)
      super()
      @boost = boost
      if data.is_a?(Array)
        concat(data)
      else
        self << data.to_s
      end
    end

    # Return true if +o+ is a Field with the same boost whose elements also
    # compare equal according to the Array superclass.
    def eql?(o)
      o.is_a?(Field) && o.boost == @boost && super(o)
    end
    alias :== :eql?

    # Concatenate +o+ onto this field, returning a new Field that keeps this
    # field's boost.
    def +(o)
      Field.new(super(o), boost)
    end

    # Create a string representation of the field: the quoted elements in
    # square brackets, followed by "^boost" when the boost is not 1.0.
    def to_s
      buf = %{["#{join('", "')}"]}
      buf << "^#@boost" if @boost != 1.0
      buf
    end
  end
end
@@ -0,0 +1,44 @@
1
+ require 'yaml'
2
+
3
module Ferret::Index
  class FieldInfos
    # Load FieldInfos from a YAML string. The YAML should look something like
    # this:
    #
    #   default:
    #     store: :yes
    #     index: :yes
    #     term_vector: :no
    #
    #   fields:
    #     id:
    #       index: :untokenized
    #       term_vector: :no
    #
    #     title:
    #       boost: 20.0
    #       term_vector: :no
    #
    #     content:
    #       term_vector: :with_positions_offsets
    #
    # The +default+ section is passed to FieldInfos.new and every entry under
    # +fields+ is added with #add_field. String keys are converted to
    # symbols first. A YAML document that parses to nothing is treated as an
    # empty configuration.
    def self.load(yaml_str)
      info = parse_yaml(yaml_str) || {}
      convert_strings_to_symbols(info)
      fis = FieldInfos.new(info[:default])
      fields = info[:fields]
      fields.each { |name, properties| fis.add_field(name, properties) } if fields
      fis
    end

    # Parse +yaml_str+, allowing the Symbol values (e.g. +:yes+) used by the
    # format documented on ::load. Where the YAML library offers a safe
    # loader with a +permitted_classes+ option, it is used with Symbol
    # permitted and aliases enabled; otherwise this falls back to YAML.load.
    def self.parse_yaml(yaml_str)
      if YAML.respond_to?(:safe_load) &&
         YAML.method(:safe_load).parameters.include?([:key, :permitted_classes])
        YAML.safe_load(yaml_str, :permitted_classes => [Symbol], :aliases => true)
      else
        YAML.load(yaml_str)
      end
    end
    private_class_method :parse_yaml

    # Recursively replace the String keys of +hash+ (and of any nested Hash
    # values) with Symbols, modifying +hash+ in place.
    #
    # Internal helper. It has always been callable from outside the class
    # because a bare +private+ does not apply to methods defined with
    # +def self.+, so it is left public here to avoid breaking callers.
    def self.convert_strings_to_symbols(hash)
      # Iterate over a snapshot of the keys because the hash is modified
      # inside the loop.
      hash.keys.each do |key|
        value = hash[key]
        convert_strings_to_symbols(value) if value.is_a?(Hash)
        next unless key.is_a?(String)

        hash[key.intern] = value
        hash.delete(key)
      end
    end
  end
end
@@ -0,0 +1,87 @@
1
module Ferret
  # The field types that can be selected by calling the method of the same
  # name on a Symbol or FieldSymbol. Frozen so the list cannot be changed by
  # accident after the type methods have been generated from it.
  FIELD_TYPES = %w(integer float string byte).map { |t| t.to_sym }.freeze

  # BlankSlate is a class with no public instance methods except for those
  # whose names start with a double underscore (such as __send__ and __id__).
  # It is useful for creating proxy classes. It is currently used by the
  # FieldSymbol class which is a proxy to the Symbol class.
  class BlankSlate
    instance_methods.each { |m| undef_method m unless m.to_s =~ /^__/ }
  end

  # The FieldSymbolMethods module contains the methods that are added to both
  # the Symbol class and the FieldSymbol class. These methods allow you to
  # easily set the type of a field by calling a method on a symbol.
  #
  # Right now this is only useful for sorting and grouping, but some day
  # Ferret may have typed fields, in which case these methods will come in
  # handy.
  #
  # The available types are specified in Ferret::FIELD_TYPES.
  #
  # == Examples
  #
  #   index.search(query, :sort => :title.string.desc)
  #
  #   index.search(query, :sort => [:price.float, :count.integer.desc])
  #
  #   index.search(query, :group_by => :catalogue.string)
  #
  # == Note
  #
  # If you set the field type multiple times, the last type specified will be
  # the type used. For example;
  #
  #   puts :title.integer.float.byte.string.type.inspect # => :string
  #
  # Each call to #desc toggles the flag, so calling #desc twice sets desc? to
  # false again:
  #
  #   puts :title.desc?            # => false
  #   puts :title.desc.desc?       # => true
  #   puts :title.desc.desc.desc?  # => false
  module FieldSymbolMethods
    # One method per field type. Each returns a FieldSymbol wrapping the
    # receiver, tagged with that type and carrying over the receiver's
    # descending flag.
    FIELD_TYPES.each do |method|
      define_method(method) do
        fsym = FieldSymbol.new(self, respond_to?(:desc?) ? desc? : false)
        fsym.type = method
        fsym
      end
    end

    # Set a field to be a descending field. This only makes sense in sort
    # specifications. Returns a FieldSymbol wrapping the receiver with the
    # descending flag inverted and the receiver's type carried over.
    def desc
      fsym = FieldSymbol.new(self, respond_to?(:desc?) ? !desc? : true)
      fsym.type = type if respond_to? :type
      fsym
    end

    # Return whether or not this field should be a descending field
    def desc?
      @desc == true
    end

    # Return the type of this field, or nil when no type has been set
    def type
      @type || nil
    end
  end

  # Proxy around a symbol that additionally carries a field type and a
  # descending flag. Every method it does not define itself is forwarded to
  # the wrapped symbol. See FieldSymbolMethods.
  class FieldSymbol < BlankSlate
    include FieldSymbolMethods

    attr_writer :type, :desc

    # Wrap +symbol+, optionally marking it as descending.
    def initialize(symbol, desc = false)
      @symbol = symbol
      @desc = desc
    end

    # Forward anything not defined on the proxy to the wrapped symbol.
    def method_missing(method, *args)
      @symbol.__send__(method, *args)
    end
  end
end
83
+
84
# Mix the field type and sort direction helpers into every Symbol.
# See FieldSymbolMethods
Symbol.class_eval do
  include Ferret::FieldSymbolMethods
end