recordsearch 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,27 @@
1
+ desc 'Release the website and new gem version'
2
+ task :deploy => [:check_version, :website, :release] do # prerequisites run before this body
3
+ puts "Remember to create SVN tag:" # the tag is not created here; only the command is printed
4
+ puts "svn copy svn+ssh://#{rubyforge_username}@rubyforge.org/var/svn/#{PATH}/trunk " +
5
+ "svn+ssh://#{rubyforge_username}@rubyforge.org/var/svn/#{PATH}/tags/REL-#{VERS} "
6
+ puts "Suggested comment:"
7
+ puts "Tagging release #{CHANGES}" # rubyforge_username, PATH, VERS and CHANGES are not defined in the visible source
8
+ end
9
+
10
+ desc 'Runs tasks website_generate and install_gem as a local deployment of the gem'
11
+ task :local_deploy => [:website_generate, :install_gem] # no body: the task only chains its prerequisites
12
+
13
+ task :check_version do
14
+ unless ENV['VERSION']
15
+ puts 'Must pass a VERSION=x.y.z release version'
16
+ exit
17
+ end
18
+ unless ENV['VERSION'] == VERS
19
+ puts "Please update your version.rb to match the release version, currently #{VERS}"
20
+ exit
21
+ end
22
+ end
23
+
24
+ desc 'Install the package as a gem, without generating documentation(ri/rdoc)'
25
+ task :install_gem_no_doc => [:clean, :package] do # :clean and :package run first
26
+ sh "#{'sudo ' unless Hoe::WINDOZE }gem install pkg/*.gem --no-rdoc --no-ri" # the sudo prefix is omitted when Hoe::WINDOZE is truthy
27
+ end
@@ -0,0 +1,7 @@
1
+ task :ruby_env do
2
+ RUBY_APP = if RUBY_PLATFORM =~ /java/
3
+ "jruby"
4
+ else
5
+ "ruby"
6
+ end unless defined? RUBY_APP
7
+ end
@@ -0,0 +1,17 @@
1
+ desc 'Generate website files'
2
+ task :website_generate => :ruby_env do
3
+ (Dir['website/**/*.txt'] - Dir['website/version*.txt']).each do |txt|
4
+ sh %{ #{RUBY_APP} script/txt2html #{txt} > #{txt.gsub(/txt$/,'html')} }
5
+ end
6
+ end
7
+
8
+ desc 'Upload website files to rubyforge'
9
+ task :website_upload do
10
+ host = "#{rubyforge_username}@rubyforge.org" # rubyforge_username is not defined in the visible source
11
+ remote_dir = "/var/www/gforge-projects/#{RUBYFORGE_PROJECT}/"
12
+ local_dir = 'website'
13
+ sh %{rsync -aCv #{local_dir}/ #{host}:#{remote_dir}} # trailing slash on the source: rsync copies the directory's contents, not the directory itself
14
+ end
15
+
16
+ desc 'Generate and upload website files'
17
+ task :website => [:website_generate, :website_upload, :publish_docs, :publish_coverage] # aggregate task; :publish_docs and :publish_coverage are not defined in the visible source
@@ -0,0 +1,21 @@
1
+ require 'test/unit'
2
+ require File.dirname(__FILE__) + '/../lib/recordsearch'
3
+
4
+ class TestDataSource < RecordSearch::DataSource
5
+ def initialize(max=100)
6
+ @max = max
7
+ @cur = 0
8
+ end
9
+
10
+ def next
11
+ if @cur < @max
12
+ @cur += 1
13
+ self.class.format(@cur - 1)
14
+ end
15
+ end
16
+
17
+ def self.format(x)
18
+ #TODO @max should have 10 digits or less
19
+ "%010d\n" % x
20
+ end
21
+ end
@@ -0,0 +1,14 @@
1
+ require File.dirname(__FILE__) + '/test_helper.rb'
2
+
3
+ class TestIndex < Test::Unit::TestCase
4
+
5
+ def test_index
6
+ db_fname = 'tmp/list'
7
+ begin
8
+ RecordSearch.index(TestDataSource.new, db_fname)
9
+ ensure
10
+ File.delete(db_fname)
11
+ File.delete(db_fname + '.idx')
12
+ end
13
+ end
14
+ end
@@ -0,0 +1,40 @@
1
+ require File.dirname(__FILE__) + '/test_helper.rb'
2
+
3
+ class TestRecordSearch < Test::Unit::TestCase
4
+
5
+ def test_search
6
+ 100.times { |x| assert_search(x) }
7
+ end
8
+
9
+ private
10
+
11
+ def assert_search(max)
12
+ db_fname = 'tmp/test_search'
13
+ begin
14
+ RecordSearch.index(TestDataSource.new(max), db_fname)
15
+ bs = TestSearch.new(db_fname)
16
+ max.times do |x|
17
+ value = TestDataSource.format(x)
18
+ assert_equal value, bs.search(value)
19
+ end
20
+ ensure
21
+ File.delete(db_fname)
22
+ File.delete(db_fname + '.idx')
23
+ end
24
+ end
25
+ end
26
+
27
+ class TestSearch < RecordSearch::Search # search subclass used by the tests above; compares values unchanged
28
+ def parse(what) # identity: the value is returned as given
29
+ what
30
+ end
31
+
32
+ def gt(a, b) # true when a > b; presumably a is a parsed record and b the search key -- confirm against RecordSearch::Search
33
+ a > b
34
+ end
35
+
36
+ def eq(a, b) # true when a == b
37
+ a == b
38
+ end
39
+
40
+ end
@@ -0,0 +1,174 @@
1
+ <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4
+ <head>
5
+ <link rel="stylesheet" href="stylesheets/screen.css" type="text/css" media="screen" />
6
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
7
+ <title>
8
+ Text Record Search
9
+ </title>
10
+ <script src="javascripts/rounded_corners_lite.inc.js" type="text/javascript"></script>
11
+ <style>
12
+
13
+ </style>
14
+ <script type="text/javascript">
15
+ window.onload = function() {
16
+ settings = {
17
+ tl: { radius: 10 },
18
+ tr: { radius: 10 },
19
+ bl: { radius: 10 },
20
+ br: { radius: 10 },
21
+ antiAlias: true,
22
+ autoPad: true,
23
+ validTags: ["div"]
24
+ }
25
+ var versionBox = new curvyCorners(settings, document.getElementById("version"));
26
+ versionBox.applyCornersToAll();
27
+ }
28
+ </script>
29
+ </head>
30
+ <body>
31
+ <div id="main">
32
+
33
+ <h1>Text Record Search</h1>
34
+ <div id="version" class="clickable" onclick='document.location = "http://rubyforge.org/projects/recordsearch"; return false'>
35
+ <p>Get Version</p>
36
+ <a href="http://rubyforge.org/projects/recordsearch" class="numbers">1.0.0</a>
37
+ </div>
38
+ <h1>&#x2192; &#8216;recordsearch&#8217;</h1>
39
+
40
+
41
+ <h2>What</h2>
42
+
43
+
44
+ <p>With <code>recordsearch</code> you can do binary searches on a text file. I created this library because, a few weeks after participating in the <a href="http://www.rubyquiz.com/quiz139.html">Ruby Quiz #139</a>, I found myself needing something similar in a project I was working on.</p>
45
+
46
+
47
+ <h2>Installing</h2>
48
+
49
+
50
+ <p><pre class='syntax'><span class="ident">sudo</span> <span class="ident">gem</span> <span class="ident">install</span> <span class="ident">recordsearch</span></pre></p>
51
+
52
+
53
+ <p>Or download the gem and install manually.</p>
54
+
55
+
56
+ <h2>The basics</h2>
57
+
58
+
59
+ <p>The approach <code>recordsearch</code> uses is to preprocess the text file, generating an index and storing it in a file, before any searching can begin. It&#8217;s assumed the records are of variable size; it would not make much sense to use an index if you have records of fixed size. After the index is created you can begin to do searches.</p>
60
+
61
+
62
+ <p>Bear in mind I created this library to suit my needs, so, for example, when the index is being created, the contents of the original file are used to create another file, as I needed to transform the original file.</p>
63
+
64
+
65
+ <h2>Demonstration of usage</h2>
66
+
67
+
68
+ <p>Suppose we have a ~40MB file with a list of inflected words and their corresponding <a href="http://en.wikipedia.org/wiki/Lemma_%28linguistics%29">lemma</a> in <i>big_file.txt</i>. Each line of the file has the format <i>&#8220;inflected_word lemma&#8221;</i>.</p>
69
+
70
+
71
+ <p>First we require the needed files:</p>
72
+
73
+
74
+ <p><pre class='syntax'>
75
+ <span class="ident">require</span> <span class="punct">'</span><span class="string">rubygems</span><span class="punct">'</span>
76
+ <span class="ident">require</span> <span class="punct">'</span><span class="string">recordsearch</span><span class="punct">'</span>
77
+ </pre></p>
78
+
79
+
80
+ <h3>Creating the index</h3>
81
+
82
+
83
+ <p>We have to create the index first. To do this, we need a subclass of <a href="http://recordsearch.rubyforge.org/rdoc/classes/RecordSearch/DataSource.html">RecordSearch::DataSource</a> . In this example we are using a <a href="http://recordsearch.rubyforge.org/rdoc/classes/RecordSearch/FileDataSource.html">RecordSearch::FileDataSource</a>, which simply reads each line from the file.</p>
84
+
85
+
86
+ <p><pre class='syntax'>
87
+ <span class="ident">data_source</span> <span class="punct">=</span> <span class="constant">RecordSearch</span><span class="punct">::</span><span class="constant">FileDataSource</span><span class="punct">.</span><span class="ident">new</span><span class="punct">('</span><span class="string">big_file.txt</span><span class="punct">')</span>
88
+ <span class="constant">RecordSearch</span><span class="punct">::</span><span class="ident">index</span><span class="punct">(</span><span class="ident">data_source</span><span class="punct">,</span> <span class="punct">'</span><span class="string">searchable_file</span><span class="punct">')</span>
89
+ </pre></p>
90
+
91
+
92
+ <p>After this, <i>searchable_file</i> and <i>searchable_file.idx</i> have been created. This is one of the quirks the library has: if you don&#8217;t need to change the contents of <i>big_file.txt</i>, you end up with <i>searchable_file</i>, which is an exact duplicate of the original.</p>
93
+
94
+
95
+ <h3>Searching</h3>
96
+
97
+
98
+ <p>To search we need to subclass <a href="http://recordsearch.rubyforge.org/rdoc/classes/RecordSearch/Search.html">RecordSearch::Search</a> and implement the methods <code>parse</code>, <code>gt</code> and <code>eq</code>. In this example <i>parse</i> reads a line and splits it into an array whose first element is the inflected word and whose second element is the lemma.</p>
99
+
100
+
101
+ <p><pre class='syntax'>
102
+ <span class="keyword">class </span><span class="class">MySearch</span> <span class="punct">&lt;</span> <span class="constant">RecordSearch</span><span class="punct">::</span><span class="constant">Search</span>
103
+ <span class="keyword">def </span><span class="method">initialize</span>
104
+ <span class="keyword">super</span><span class="punct">('</span><span class="string">searchable_file</span><span class="punct">')</span>
105
+ <span class="keyword">end</span>
106
+
107
+ <span class="keyword">def </span><span class="method">parse</span><span class="punct">(</span><span class="ident">what</span><span class="punct">)</span>
108
+ <span class="ident">what</span><span class="punct">.</span><span class="ident">chomp</span><span class="punct">.</span><span class="ident">split</span>
109
+ <span class="keyword">end</span>
110
+
111
+ <span class="keyword">def </span><span class="method">gt</span><span class="punct">(</span><span class="ident">a</span><span class="punct">,</span> <span class="ident">b</span><span class="punct">)</span>
112
+ <span class="ident">a</span><span class="punct">[</span><span class="number">0</span><span class="punct">]</span> <span class="punct">&gt;</span> <span class="ident">b</span>
113
+ <span class="keyword">end</span>
114
+
115
+ <span class="keyword">def </span><span class="method">eq</span><span class="punct">(</span><span class="ident">a</span><span class="punct">,</span> <span class="ident">b</span><span class="punct">)</span>
116
+ <span class="ident">a</span><span class="punct">[</span><span class="number">0</span><span class="punct">]</span> <span class="punct">==</span> <span class="ident">b</span>
117
+ <span class="keyword">end</span>
118
+ <span class="keyword">end</span>
119
+ </pre></p>
120
+
121
+
122
+ <p>And we search like this:
123
+ <pre class='syntax'>
124
+ <span class="ident">my_search</span> <span class="punct">=</span> <span class="constant">MySearch</span><span class="punct">.</span><span class="ident">new</span>
125
+ <span class="ident">my_search</span><span class="punct">.</span><span class="ident">search</span><span class="punct">('</span><span class="string">item</span><span class="punct">')</span>
126
+ </pre></p>
127
+
128
+
129
+ <h2>Forum</h2>
130
+
131
+
132
+ There are two forums:
133
+ <ul>
134
+ <li><a href="http://rubyforge.org/forum/forum.php?forum_id=19017">open-discussion</a></li>
135
+ <li><a href="http://rubyforge.org/forum/forum.php?forum_id=19018">help</a></li>
136
+ </ul>
137
+
138
+ <h2>Documentation</h2>
139
+
140
+
141
+ <p>The rdoc documentation is in the page <a href="http://recordsearch.rubyforge.org/rdoc">RecordSearch <span class="caps">API</span></a> and you can also see the <a href="http://recordsearch.rubyforge.org/coverage">test coverage report</a></p>
142
+
143
+
144
+ <h2>Source</h2>
145
+
146
+
147
+ <p>You can get the source with svn using this url <code>svn://rubyforge.org/var/svn/recordsearch/trunk</code> for anonymous access, or go to the <a href="http://rubyforge.org/projects/recordsearch">RecordSearch</a> RubyForge page.</p>
148
+
149
+
150
+ <h2>License</h2>
151
+
152
+
153
+ <p>This code is free to use under the terms of the <span class="caps">GPL</span> license.</p>
154
+
155
+
156
+ <h2>Contact</h2>
157
+
158
+
159
+ <p>Comments are welcome. Send an email to <a href="mailto:lparravi@gmail.com">Luis Parravicini</a> or post a message to one of the available forums.</p>
160
+ <p class="coda">
161
+ <a href="http://ktulu.com.ar">Luis Parravicini</a>, 20th November 2007<br />
162
+ Theme extended from <a href="http://rb2js.rubyforge.org/">Paul Battley</a>
163
+ </p>
164
+ </div>
165
+
166
+ <script src="http://www.google-analytics.com/urchin.js" type="text/javascript">
167
+ </script>
168
+ <script type="text/javascript">
169
+ _uacct = "UA-2740686-4";
170
+ urchinTracker();
171
+ </script>
172
+
173
+ </body>
174
+ </html>
data/website/index.txt ADDED
@@ -0,0 +1,97 @@
1
+ h1. Text Record Search
2
+
3
+ h1. &#x2192; 'recordsearch'
4
+
5
+
6
+ h2. What
7
+
8
+ With <code>recordsearch</code> you can do binary searches on a text file. I created this library because a few weeks after participating in the "Ruby Quiz #139":http://www.rubyquiz.com/quiz139.html I found myself needing something similar in a project I was working on.
9
+
10
+ h2. Installing
11
+
12
+ <pre syntax="ruby">sudo gem install recordsearch</pre>
13
+
14
+ Or download the gem and install manually.
15
+
16
+ h2. The basics
17
+
18
+ The approach <code>recordsearch</code> uses is to preprocess the text file, generating an index and storing it in a file, before any searching can begin. It's assumed the records are of variable size; it would not make much sense to use an index if you have records of fixed size. After the index is created you can begin to do searches.
19
+
20
+ Bear in mind I created this library to suit my needs, so, for example, when the index is being created, the contents of the original file are used to create another file, as I needed to transform the original file.
21
+
22
+ h2. Demonstration of usage
23
+
24
+ Suppose we have a ~40MB file with a list of inflected words and their corresponding "lemma":http://en.wikipedia.org/wiki/Lemma_%28linguistics%29 in <i>big_file.txt</i>. Each line of the file has the format <i>"inflected_word lemma"</i>.
25
+
26
+ First we require the needed files:
27
+
28
+ <pre syntax="ruby">
29
+ require 'rubygems'
30
+ require 'recordsearch'
31
+ </pre>
32
+
33
+ h3. Creating the index
34
+
35
+ We have to create the index first. To do this, we need a subclass of "RecordSearch::DataSource":http://recordsearch.rubyforge.org/rdoc/classes/RecordSearch/DataSource.html . In this example we are using a "RecordSearch::FileDataSource":http://recordsearch.rubyforge.org/rdoc/classes/RecordSearch/FileDataSource.html which simply reads each line from the file.
36
+
37
+ <pre syntax="ruby">
38
+ data_source = RecordSearch::FileDataSource.new('big_file.txt')
39
+ RecordSearch::index(data_source, 'searchable_file')
40
+ </pre>
41
+
42
+ After this, <i>searchable_file</i> and <i>searchable_file.idx</i> have been created. This is one of the quirks the library has: if you don't need to change the contents of <i>big_file.txt</i>, you end up with <i>searchable_file</i>, which is an exact duplicate of the original.
43
+
44
+ h3. Searching
45
+
46
+ To search we need to subclass "RecordSearch::Search":http://recordsearch.rubyforge.org/rdoc/classes/RecordSearch/Search.html and implement the methods <code>parse</code>, <code>gt</code> and <code>eq</code>. In this example <i>parse</i> reads a line and splits it into an array whose first element is the inflected word and whose second element is the lemma.
47
+
48
+ <pre syntax="ruby">
49
+ class MySearch < RecordSearch::Search
50
+ def initialize
51
+ super('searchable_file')
52
+ end
53
+
54
+ def parse(what)
55
+ what.chomp.split
56
+ end
57
+
58
+ def gt(a, b)
59
+ a[0] > b
60
+ end
61
+
62
+ def eq(a, b)
63
+ a[0] == b
64
+ end
65
+ end
66
+ </pre>
67
+
68
+ And we search like this:
69
+ <pre syntax="ruby">
70
+ my_search = MySearch.new
71
+ my_search.search('item')
72
+ </pre>
73
+
74
+
75
+ h2. Forum
76
+
77
+ There are two forums:
78
+ <ul>
79
+ <li>"open-discussion":http://rubyforge.org/forum/forum.php?forum_id=19017</li>
80
+ <li>"help":http://rubyforge.org/forum/forum.php?forum_id=19018</li>
81
+ </ul>
82
+
83
+ h2. Documentation
84
+
85
+ The rdoc documentation is in the page "RecordSearch API":http://recordsearch.rubyforge.org/rdoc and you can also see the "test coverage report":http://recordsearch.rubyforge.org/coverage
86
+
87
+ h2. Source
88
+
89
+ You can get the source with svn using this url <code>svn://rubyforge.org/var/svn/recordsearch/trunk</code> for anonymous access or go to the "RecordSearch":http://rubyforge.org/projects/recordsearch RubyForge page.
90
+
91
+ h2. License
92
+
93
+ This code is free to use under the terms of the GPL license.
94
+
95
+ h2. Contact
96
+
97
+ Comments are welcome. Send an email to "Luis Parravicini":mailto:lparravi@gmail.com or post a message to one of the available forums.