ruby_odeum 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88) hide show
  1. data/COPYING +504 -0
  2. data/LICENSE +504 -0
  3. data/README +50 -0
  4. data/bin/odeum_mgr +106 -0
  5. data/doc/rdoc/classes/Odeum.html +235 -0
  6. data/doc/rdoc/classes/Odeum.src/M000010.html +25 -0
  7. data/doc/rdoc/classes/Odeum.src/M000011.html +22 -0
  8. data/doc/rdoc/classes/Odeum.src/M000012.html +27 -0
  9. data/doc/rdoc/classes/Odeum.src/M000013.html +27 -0
  10. data/doc/rdoc/classes/Odeum.src/M000014.html +28 -0
  11. data/doc/rdoc/classes/Odeum/Document.html +382 -0
  12. data/doc/rdoc/classes/Odeum/Document.src/M000040.html +25 -0
  13. data/doc/rdoc/classes/Odeum/Document.src/M000041.html +22 -0
  14. data/doc/rdoc/classes/Odeum/Document.src/M000042.html +23 -0
  15. data/doc/rdoc/classes/Odeum/Document.src/M000043.html +23 -0
  16. data/doc/rdoc/classes/Odeum/Document.src/M000044.html +24 -0
  17. data/doc/rdoc/classes/Odeum/Document.src/M000045.html +32 -0
  18. data/doc/rdoc/classes/Odeum/Document.src/M000046.html +22 -0
  19. data/doc/rdoc/classes/Odeum/Document.src/M000047.html +22 -0
  20. data/doc/rdoc/classes/Odeum/Document.src/M000048.html +22 -0
  21. data/doc/rdoc/classes/Odeum/Document.src/M000049.html +22 -0
  22. data/doc/rdoc/classes/Odeum/Document.src/M000050.html +24 -0
  23. data/doc/rdoc/classes/Odeum/Document.src/M000051.html +27 -0
  24. data/doc/rdoc/classes/Odeum/Index.html +662 -0
  25. data/doc/rdoc/classes/Odeum/Index.src/M000015.html +46 -0
  26. data/doc/rdoc/classes/Odeum/Index.src/M000016.html +33 -0
  27. data/doc/rdoc/classes/Odeum/Index.src/M000017.html +35 -0
  28. data/doc/rdoc/classes/Odeum/Index.src/M000018.html +23 -0
  29. data/doc/rdoc/classes/Odeum/Index.src/M000019.html +22 -0
  30. data/doc/rdoc/classes/Odeum/Index.src/M000020.html +22 -0
  31. data/doc/rdoc/classes/Odeum/Index.src/M000021.html +22 -0
  32. data/doc/rdoc/classes/Odeum/Index.src/M000022.html +22 -0
  33. data/doc/rdoc/classes/Odeum/Index.src/M000023.html +22 -0
  34. data/doc/rdoc/classes/Odeum/Index.src/M000024.html +29 -0
  35. data/doc/rdoc/classes/Odeum/Index.src/M000025.html +23 -0
  36. data/doc/rdoc/classes/Odeum/Index.src/M000026.html +24 -0
  37. data/doc/rdoc/classes/Odeum/Index.src/M000027.html +23 -0
  38. data/doc/rdoc/classes/Odeum/Index.src/M000028.html +26 -0
  39. data/doc/rdoc/classes/Odeum/Index.src/M000029.html +24 -0
  40. data/doc/rdoc/classes/Odeum/Index.src/M000030.html +20 -0
  41. data/doc/rdoc/classes/Odeum/Index.src/M000031.html +22 -0
  42. data/doc/rdoc/classes/Odeum/Index.src/M000032.html +22 -0
  43. data/doc/rdoc/classes/Odeum/Index.src/M000033.html +22 -0
  44. data/doc/rdoc/classes/Odeum/Index.src/M000034.html +22 -0
  45. data/doc/rdoc/classes/Odeum/Index.src/M000035.html +20 -0
  46. data/doc/rdoc/classes/Odeum/Index.src/M000036.html +20 -0
  47. data/doc/rdoc/classes/Odeum/Index.src/M000037.html +22 -0
  48. data/doc/rdoc/classes/Odeum/Index.src/M000038.html +22 -0
  49. data/doc/rdoc/classes/Odeum/Index.src/M000039.html +22 -0
  50. data/doc/rdoc/classes/OdeumTest.html +257 -0
  51. data/doc/rdoc/classes/OdeumTest.src/M000001.html +18 -0
  52. data/doc/rdoc/classes/OdeumTest.src/M000002.html +19 -0
  53. data/doc/rdoc/classes/OdeumTest.src/M000003.html +27 -0
  54. data/doc/rdoc/classes/OdeumTest.src/M000004.html +25 -0
  55. data/doc/rdoc/classes/OdeumTest.src/M000005.html +44 -0
  56. data/doc/rdoc/classes/OdeumTest.src/M000006.html +20 -0
  57. data/doc/rdoc/classes/OdeumTest.src/M000007.html +39 -0
  58. data/doc/rdoc/classes/OdeumTest.src/M000008.html +59 -0
  59. data/doc/rdoc/classes/OdeumTest.src/M000009.html +41 -0
  60. data/doc/rdoc/created.rid +1 -0
  61. data/doc/rdoc/files/COPYING.html +756 -0
  62. data/doc/rdoc/files/LICENSE.html +756 -0
  63. data/doc/rdoc/files/README.html +175 -0
  64. data/doc/rdoc/files/ext/odeum_index/odeum_index_c.html +101 -0
  65. data/doc/rdoc/files/test/test_odeum_rb.html +109 -0
  66. data/doc/rdoc/fr_class_index.html +30 -0
  67. data/doc/rdoc/fr_file_index.html +31 -0
  68. data/doc/rdoc/fr_method_index.html +77 -0
  69. data/doc/rdoc/index.html +24 -0
  70. data/doc/rdoc/rdoc-style.css +208 -0
  71. data/ext/odeum_index/cabin.c +2735 -0
  72. data/ext/odeum_index/cabin.h +1040 -0
  73. data/ext/odeum_index/curia.c +1114 -0
  74. data/ext/odeum_index/curia.h +430 -0
  75. data/ext/odeum_index/depot.c +1910 -0
  76. data/ext/odeum_index/depot.h +439 -0
  77. data/ext/odeum_index/extconf.rb +10 -0
  78. data/ext/odeum_index/myconf.c +668 -0
  79. data/ext/odeum_index/myconf.h +523 -0
  80. data/ext/odeum_index/odeum.c +1743 -0
  81. data/ext/odeum_index/odeum.h +541 -0
  82. data/ext/odeum_index/odeum_index.c +991 -0
  83. data/ext/odeum_index/villa.c +1923 -0
  84. data/ext/odeum_index/villa.h +470 -0
  85. data/ext/odeum_index/vista.c +159 -0
  86. data/ext/odeum_index/vista.h +111 -0
  87. data/test/test_odeum.rb +174 -0
  88. metadata +138 -0
data/README ADDED
@@ -0,0 +1,50 @@
1
+ == Ruby/Odeum
2
+
3
+ Ruby/Odeum is a simple full text reverse indexer that lets you index a set of
4
+ files and then search through them very quickly. It is similar to Java's Lucene
5
+ but at a lower level since it does not provide a query language or a lexical parser
6
+ for documents. The extension is based on Mikio Hirabayashi's QDBM library and
7
+ includes a full distribution needed to use the extension right out of the box.
8
+
9
+ The library is very simple to use and has full documentation in ruby docs.
10
+ Take a look at the test/test_odeum.rb and the bin/odeum_mgr files for examples
11
+ of using the library in a simple way.
12
+
13
+ == Features
14
+
15
+ Pretty much the same features that you'd get from Odeum and QDBM, but available
16
+ in an idiomatic Ruby package. The big list is:
17
+
18
+ 1. Fast as hell. QDBM is one of the fastest libraries out there for this
19
+ kind of thing.
20
+ 2. Simple interface involving two classes and maybe one small set of module functions.
21
+ 3. Indexes documents of any type, with arbitrary names and unlimited (well, sort of)
22
+ meta-data.
23
+ 4. Searching by normalized words with returned "scores" for weighting.
24
+ 5. Locking at the thread level for the OS.
25
+
26
+ Mikio states that it is probably not suitable for document stores that are larger
27
+ than about 1 million documents in size.
28
+
29
+ == Building
30
+
31
+ Developers who want to work on the project should put Ruby files in lib and
32
+ modifications to the extension in ext/odeum_index. You should then use the
33
+ Rakefile and rake to build the application and run tests (build docs, etc.)
34
+
35
+ == Installing
36
+
37
+ There's a setup.rb you can use to compile and install the extension and odeum_mgr
38
+ script for regular users.
39
+
40
+ Anyone feel like making a gem out of this? :-)
41
+
42
+ == Contact
43
+
44
+ The Ruby extension is entirely my fault. Please do not contact Mikio about it
45
+ unless it's to say thanks for doing such a cool job on QDBM and Odeum. If you
46
+ have problems with the extension then let me know. I'll work with Mikio or
47
+ fix them myself depending on what needs fixing.
48
+
49
+ You can contact me at zedshaw at zedshaw dot com.
50
+
@@ -0,0 +1,106 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'odeum_index'
4
+ require 'find'
5
+
6
+
7
# Does a higher level job of managing an inverted index of documents.
#
# A Manager is bound to one index name and offers two operations:
# #load, which indexes every regular file under a directory, and
# #search, which prints the documents matching each given word.
class Manager
  include Odeum

  # Name of the index this manager opens (passed straight to Index.new).
  attr_reader :name

  # name:: name of the index to open in #load and #search.
  def initialize(name)
    @name = name
  end

  # Builds a new Document for +file+.
  #
  # The path is handed to Document.new, the file contents are added as
  # content, and the file's modification time is stored under the "Date"
  # attribute so that #load can later tell whether the file has changed.
  #
  # Returns the (still open) Document; the caller must close it.
  def setup_new_doc(file)
    doc = Document.new file
    doc.add_content(File.read(file))
    doc["Date"] = File.mtime(file).to_s
    doc
  end

  # Indexes every regular file found under +dir+.
  #
  # A file is (re)indexed when the index has no document for its path or
  # when the stored "Date" attribute differs from the file's current
  # modification time; otherwise it is skipped. Progress is printed for
  # each file.
  #
  # The index is opened before changing directory, so a relative index
  # name is resolved against the caller's working directory, not +dir+.
  def load(dir)
    odeum = Index.new name, OWRITER | OCREAT
    added = 0

    begin
      Dir.chdir(dir) do
        Find.find("./") do |file|
          next unless File.file? file
          print "file: #{file}"

          stamp = File.mtime(file).to_s

          # Fetch the stored document once, compare its timestamp, and
          # always close it again before deciding what to do.
          existing = odeum.get(file)
          begin
            up_to_date = existing && existing["Date"] == stamp
          ensure
            existing.close if existing
          end

          if up_to_date
            puts "...skipped."
            next
          end

          doc = setup_new_doc(file)
          begin
            odeum.put(doc, -1, 1)
          ensure
            doc.close
          end
          puts "...Added."

          # Flush to disk after every 1000 added documents.
          added += 1
          odeum.sync if added % 1000 == 0
        end
      end
    ensure
      # Close the index even when reading or indexing a file raised.
      odeum.close
    end
  end

  # Searches the index for each entry of +words+ and prints, per word,
  # the number of hits followed by one "score: uri" line per document.
  #
  # Each word is normalized with Odeum::normalizeword first. A word that
  # normalizes to nothing is reported and skipped instead of being
  # searched for.
  def search(words)
    odeum = Index.new name, OREADER

    begin
      words.each do |word|
        normal = Odeum::normalizeword(word)
        if normal.nil? or normal.empty?
          puts "(#{word}) Skipped: nothing left to search for after normalization."
          next
        end

        docs = odeum.search(normal, -1)
        puts "(#{word}) Found #{docs.length} docs."
        docs.each do |id, score|
          doc = odeum.get_by_id(id)
          begin
            puts "#{score}: #{doc.uri}"
          ensure
            doc.close
          end
        end
      end

      puts "Searched #{odeum.doc_count} documents."
    ensure
      odeum.close
    end
  end
end
74
+
75
+
76
# Command-line entry point: the first argument selects the command and the
# remaining arguments are consumed by that command.
usage = "Commands: update, search, merge, remove, optimize, stats"

# Takes the next command-line argument, or aborts with a message naming
# the missing piece instead of handing nil to the library.
require_arg = lambda do |what|
  ARGV.shift or abort("odeum_mgr: missing #{what}\n#{usage}")
end

cmd = ARGV.shift
case cmd
when "update"
  # update <index> <directory>
  mgr = Manager.new require_arg.call("index name")
  mgr.load(require_arg.call("directory to index"))
when "search"
  # search <index> <word>...
  mgr = Manager.new require_arg.call("index name")
  mgr.search(ARGV)
  puts "Done."
when "merge"
  # merge <new index> <existing index>...
  target = require_arg.call("name of the merged index")
  # Odeum.merge is documented as returning true/false; report a failure.
  abort("odeum_mgr: merge into #{target} failed") unless Odeum.merge(target, ARGV)
when "remove"
  # remove <index>
  target = require_arg.call("index name")
  # Odeum.remove is documented as returning true/false; report a failure.
  abort("odeum_mgr: could not remove #{target}") unless Odeum.remove(target)
when "optimize"
  # optimize <index>
  odeum = Odeum::Index.new require_arg.call("index name"), Odeum::OWRITER
  begin
    odeum.optimize
  ensure
    odeum.close
  end
when "stats"
  # stats <index>
  odeum = Odeum::Index.new require_arg.call("index name"), Odeum::OREADER
  begin
    count = odeum.doc_count
    total = odeum.size
    # An empty index has no meaningful average; avoid dividing by zero.
    average = count.zero? ? 0 : total / count

    puts "Database stats for #{odeum.name}"
    puts ""
    puts "Number of Stored Documents: #{count}"
    puts "Size of Stored Documents: #{total}"
    puts "Average Document Size: #{average}"
    puts "Modified Time: #{Time.at(odeum.mtime)}"
    puts "Bucket Count: #{odeum.bucket_count}"
    puts "Buckets Used: #{odeum.buckets_used}"
    puts "Database Inode: #{odeum.inode}"
  ensure
    odeum.close
  end
else
  puts usage
end
@@ -0,0 +1,235 @@
1
+ <?xml version="1.0" encoding="iso-8859-1"?>
2
+ <!DOCTYPE html
3
+ PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
+
6
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
7
+ <head>
8
+ <title>Module: Odeum</title>
9
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
+ <meta http-equiv="Content-Script-Type" content="text/javascript" />
11
+ <link rel="stylesheet" href=".././rdoc-style.css" type="text/css" media="screen" />
12
+ <script type="text/javascript">
13
+ // <![CDATA[
14
+
15
/**
 * Opens `url` in a small, resizable popup window named "Code".
 * @param {string} url - address of the page to show in the popup.
 */
function popupCode( url ) {
  window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400");
}

/**
 * Toggles the element with the given id between display "block" and "none".
 * Any display value other than "block" is switched to "block".
 *
 * @param {string} id - id of the element to toggle.
 * @returns {boolean} true when the element was toggled; false when the
 *   document offers no lookup mechanism or no such element exists.
 */
function toggleCode( id ) {
  // Locals are declared so they no longer leak into the global scope.
  var elem;

  if ( document.getElementById ) {
    elem = document.getElementById( id );
  } else if ( document.all ) {
    // Plain property lookup instead of eval() on a built string.
    elem = document.all[ id ];
  } else {
    return false;
  }

  // Nothing to toggle when the id does not match an element.
  if ( !elem ) {
    return false;
  }

  var elemStyle = elem.style;

  if ( elemStyle.display != "block" ) {
    elemStyle.display = "block";
  } else {
    elemStyle.display = "none";
  }

  return true;
}
37
+
38
+ // Make codeblocks hidden by default
39
+ document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
40
+
41
+ // ]]>
42
+ </script>
43
+
44
+ </head>
45
+ <body>
46
+
47
+
48
+
49
+ <div id="classHeader">
50
+ <table class="header-table">
51
+ <tr class="top-aligned-row">
52
+ <td><strong>Module</strong></td>
53
+ <td class="class-name-in-header">Odeum</td>
54
+ </tr>
55
+ <tr class="top-aligned-row">
56
+ <td><strong>In:</strong></td>
57
+ <td>
58
+ <a href="../files/ext/odeum_index/odeum_index_c.html">
59
+ ext/odeum_index/odeum_index.c
60
+ </a>
61
+ <br />
62
+ </td>
63
+ </tr>
64
+
65
+ </table>
66
+ </div>
67
+ <!-- banner header -->
68
+
69
+ <div id="bodyContent">
70
+
71
+
72
+
73
+ <div id="contextContent">
74
+
75
+
76
+
77
+ </div>
78
+
79
+ <div id="method-list">
80
+ <h3 class="section-bar">Methods</h3>
81
+
82
+ <div class="name-list">
83
+ <a href="#M000012">breaktext</a>&nbsp;&nbsp;
84
+ <a href="#M000010">merge</a>&nbsp;&nbsp;
85
+ <a href="#M000013">normalizeword</a>&nbsp;&nbsp;
86
+ <a href="#M000011">remove</a>&nbsp;&nbsp;
87
+ <a href="#M000014">settuning</a>&nbsp;&nbsp;
88
+ </div>
89
+ </div>
90
+
91
+ </div>
92
+
93
+
94
+ <!-- if includes -->
95
+
96
+ <div id="section">
97
+
98
+ <div id="class-list">
99
+ <h3 class="section-bar">Classes and Modules</h3>
100
+
101
+ Class <a href="Odeum/Document.html" class="link">Odeum::Document</a><br />
102
+ Class <a href="Odeum/Index.html" class="link">Odeum::Index</a><br />
103
+
104
+ </div>
105
+
106
+
107
+
108
+
109
+
110
+
111
+
112
+ <!-- if method_list -->
113
+ <div id="methods">
114
+ <h3 class="section-bar">Public Class methods</h3>
115
+
116
+ <div id="method-M000012" class="method-detail">
117
+ <a name="M000012"></a>
118
+
119
+ <div class="method-heading">
120
+ <a href="Odeum.src/M000012.html" target="Code" class="method-signature"
121
+ onclick="popupCode('Odeum.src/M000012.html');return false;">
122
+ <span class="method-name">Odeum::breaktext(text) &rarr; [word1, word2, word3]<br />
123
+ </span>
124
+ </a>
125
+ </div>
126
+
127
+ <div class="method-description">
128
+ <p>
129
+ Breaks a string into an array of words that are separated by space
130
+ characters and such delimiters as period, comma, etc. You should also
131
+ check out StringScanner as a more flexible alternative. This function must
132
+ do a lot of data copying and other things in order to convert from <a
133
+ href="Odeum.html">Odeum</a> internal types to Ruby types.
134
+ </p>
135
+ </div>
136
+ </div>
137
+
138
+ <div id="method-M000010" class="method-detail">
139
+ <a name="M000010"></a>
140
+
141
+ <div class="method-heading">
142
+ <a href="Odeum.src/M000010.html" target="Code" class="method-signature"
143
+ onclick="popupCode('Odeum.src/M000010.html');return false;">
144
+ <span class="method-name">Odeum::merge(new_name, other_databases) &rarr; true/false<br />
145
+ </span>
146
+ </a>
147
+ </div>
148
+
149
+ <div class="method-description">
150
+ <p>
151
+ Merges the databases listed in other_databases (Array of Strings) into the
152
+ new database new_name. If two or more documents have the same URI then the
153
+ first one is adopted and the others are ignored.
154
+ </p>
155
+ </div>
156
+ </div>
157
+
158
+ <div id="method-M000013" class="method-detail">
159
+ <a name="M000013"></a>
160
+
161
+ <div class="method-heading">
162
+ <a href="Odeum.src/M000013.html" target="Code" class="method-signature"
163
+ onclick="popupCode('Odeum.src/M000013.html');return false;">
164
+ <span class="method-name">Odeum::normalizeword(asis) &rarr; normal<br />
165
+ </span>
166
+ </a>
167
+ </div>
168
+
169
+ <div class="method-description">
170
+ <p>
171
+ Given a word from breaktext (which is considered &quot;as-is&quot;) it will
172
+ &quot;normalize&quot; it in a consistent way which is suitable for
173
+ searching. The normalization effectively strips punctuation and spacing, and
174
+ then lowercases the word. If there is nothing but &quot;removed&quot; chars
175
+ in the asis string then the return is empty. Check for this so you
176
+ don&#8217;t try to search for nothing.
177
+ </p>
178
+ </div>
179
+ </div>
180
+
181
+ <div id="method-M000011" class="method-detail">
182
+ <a name="M000011"></a>
183
+
184
+ <div class="method-heading">
185
+ <a href="Odeum.src/M000011.html" target="Code" class="method-signature"
186
+ onclick="popupCode('Odeum.src/M000011.html');return false;">
187
+ <span class="method-name">Odeum::remove(name) &rarr; true/false<br />
188
+ </span>
189
+ </a>
190
+ </div>
191
+
192
+ <div class="method-description">
193
+ <p>
194
+ Removes the database directory and everything in it.
195
+ </p>
196
+ </div>
197
+ </div>
198
+
199
+ <div id="method-M000014" class="method-detail">
200
+ <a name="M000014"></a>
201
+
202
+ <div class="method-heading">
203
+ <a href="Odeum.src/M000014.html" target="Code" class="method-signature"
204
+ onclick="popupCode('Odeum.src/M000014.html');return false;">
205
+ <span class="method-name">Odeum::settuning(ibnum, idnum, cbnum, csiz) &rarr; nil<br />
206
+ </span>
207
+ </a>
208
+ </div>
209
+
210
+ <div class="method-description">
211
+ <p>
212
+ ibnum=32749: Number of buckets for inverted indexes. idnum=7: Division
213
+ number of inverted index. cbnum=262139: Number of buckets for dirty
214
+ buffers. csiz=8388608: Maximum bytes of memory to use for dirty buffers.
215
+ </p>
216
+ <p>
217
+ This is set globally for all Indexes. Not sure what would happen if you
218
+ changed this mid-stream, so don&#8217;t. Make sure everything is closed.
219
+ </p>
220
+ </div>
221
+ </div>
222
+
223
+
224
+ </div>
225
+
226
+
227
+ </div>
228
+
229
+
230
+ <div id="validator-badges">
231
+ <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
232
+ </div>
233
+
234
+ </body>
235
+ </html>
@@ -0,0 +1,25 @@
1
+ <?xml version="1.0" encoding="iso-8859-1"?>
2
+ <!DOCTYPE html
3
+ PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
+
6
+ <html>
7
+ <head>
8
+ <title>merge (Odeum)</title>
9
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
+ <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
11
+ </head>
12
+ <body class="standalone-code">
13
+ <pre>/**
14
+ * call-seq:
15
+ * Odeum::merge(new_name, other_databases) -&gt; true/false
16
+ *
17
+ * Merges the databases listed in other_databases (Array of Strings)
18
+ * into the new database new_name.
19
+ * If two or more documents have the same URI then the first one is
20
+ * adopted and the others are ignored.
21
+ */
22
+ VALUE Odeum_merge(VALUE self, VALUE name, VALUE elemnames) {
23
+ </pre>
24
+ </body>
25
+ </html>
@@ -0,0 +1,22 @@
1
+ <?xml version="1.0" encoding="iso-8859-1"?>
2
+ <!DOCTYPE html
3
+ PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
+
6
+ <html>
7
+ <head>
8
+ <title>remove (Odeum)</title>
9
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
+ <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
11
+ </head>
12
+ <body class="standalone-code">
13
+ <pre>/**
14
+ * call-seq:
15
+ * Odeum::remove(name) -&gt; true/false
16
+ *
17
+ * Removes the database directory and everything in it.
18
+ */
19
+ VALUE Odeum_remove(VALUE self, VALUE name) {
20
+ </pre>
21
+ </body>
22
+ </html>
@@ -0,0 +1,27 @@
1
+ <?xml version="1.0" encoding="iso-8859-1"?>
2
+ <!DOCTYPE html
3
+ PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
+
6
+ <html>
7
+ <head>
8
+ <title>breaktext (Odeum)</title>
9
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
+ <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
11
+ </head>
12
+ <body class="standalone-code">
13
+ <pre>/**
14
+ * call-seq:
15
+ * Odeum::breaktext(text) -&gt; [word1, word2, word3]
16
+ *
17
+ * Breaks a string into an array of words that are separated by
18
+ * space characters and such delimiters as period, comma, etc.
19
+ * You should also check out StringScanner as a more flexible
20
+ * alternative. This function must do a lot of data copying and
21
+ * other things in order to convert from Odeum internal types to Ruby
22
+ * types.
23
+ */
24
+ VALUE Odeum_breaktext(VALUE self, VALUE text) {
25
+ </pre>
26
+ </body>
27
+ </html>