scripsi 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (6) hide show
  1. data/Manifest +4 -0
  2. data/README.md +29 -0
  3. data/Rakefile +12 -0
  4. data/lib/scripsi.rb +184 -0
  5. data/scripsi.gemspec +32 -0
  6. metadata +76 -0
@@ -0,0 +1,4 @@
1
+ README.md
2
+ Rakefile
3
+ lib/scripsi.rb
4
+ Manifest
@@ -0,0 +1,29 @@
1
+ # Scripsi
2
+
3
+ A flexible text-searching library built on top of redis.
4
+
5
+ ## Sorted suffix indexing
6
+
7
+ Sorted suffix indexing allows you to search for any substring within a set of documents. First, index a collection of documents and associated ids:
8
+
9
+ require 'scripsi'
10
+ Scripsi.connect # connect to a running redis server
11
+
12
+ ssi = Scripsi::SortedSuffixIndexer.new "myindexer"
13
+ ssi.index(1,"Epistulam ad te scripsi.")
14
+ ssi.index(2,"I've written you a letter.")
15
+ ssi.index(3,"Quisnam Tusculo epistulam me misit?")
16
+ ssi.index(4,"Who in Tusculum would've sent me a letter?")
17
+
18
+ You can then search for any substring, and the indexer will return the ids of the documents where that substring appears.
19
+
20
+ ssi = Scripsi.indexer "myindexer"
21
+ ssi.search("te") # => [1,2,4]
22
+ ssi.search("Tuscul") # => [3,4]
23
+ ssi.search("Tusculu") # => [4]
24
+ ssi.search("you a le") # => [2]
25
+
26
+ You can also retrieve the stored documents efficiently:
27
+
28
+ ssi.documents # lazy list of documents
29
+ ssi.documents[3] # => the text of the document with id 3
@@ -0,0 +1,12 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+ require 'echoe'
4
+
5
# Packaging definition for the scripsi gem (consumed by echoe to generate
# the Manifest, the gemspec and the usual build/release rake tasks).
Echoe.new('scripsi', '0.0.1') do |p|
  p.description = 'a flexible text-searching library built on top of redis'
  p.url = 'https://github.com/alecbenzer/scripsi'
  p.author = 'Alec Benzer'
  p.email = 'alecbenzer@gmail.com'
  # redis dump files created while developing must not end up in the gem
  p.ignore_pattern = ['*.rdb']
  # lib/scripsi.rb does `require 'redis'` at load time, so redis has to be
  # installed together with the gem: it is a runtime dependency, not a
  # development-only one.
  p.runtime_dependencies = ['redis >=2.1.1']
  # Set explicitly (as the original did) so the development dependency list
  # stays fully under this file's control.
  p.development_dependencies = []
end
@@ -0,0 +1,184 @@
1
+ require 'redis'
2
+ require 'set'
3
+
4
# A small text-searching library that stores its indexes in redis.
#
# Call {Scripsi.connect} once, then create a {Scripsi::SortedSuffixIndexer}
# (or look an existing one up with {Scripsi.indexer}).
module Scripsi
  # Number of characters folded into one score. 27**10 still fits in the
  # 53-bit mantissa of a double, so ten base-27 digits can be told apart.
  @partition_size = 10
  @redis = nil

  # Connects to a redis server.
  #
  # @param [Hash] options passed unchanged to Redis.new
  # @return [Redis] the new connection
  def self.connect(options = {})
    @redis = Redis.new(options)
  end

  # The connection created by {connect}.
  #
  # @return [Redis]
  # @raise [RuntimeError] if {connect} has not been called yet
  def self.redis
    @redis || raise('not connected to redis - call Scripsi.connect first')
  end

  # @return [Integer] how many characters are scored together
  def self.partition_size
    @partition_size
  end

  # Generates the 'scores' of a string, used for storing it in sorted sets.
  #
  # The (downcased) string is cut into slices of {partition_size} characters
  # and every slice is read as a base 27 fraction, where 0 corresponds to
  # "no letter" (this includes every character outside a-z), 1 to a, 2 to b,
  # and so on.
  #
  # @param [String] str the string we are computing scores for
  # @return [Array<Float>] one score per slice, empty for an empty string
  def self.score(str)
    base = 'a'.ord - 1
    str.downcase.each_char.each_slice(partition_size).map do |slice|
      weight = 1.0
      slice.inject(0.0) do |total, char|
        weight /= 27
        if char >= 'a' && char <= 'z'
          total + (char.ord - base) * weight
        else
          total
        end
      end
    end
  end

  # Gets the indexer with the given id.
  #
  # @param [#to_s] id the id the indexer was created with
  # @return [SortedSuffixIndexer, nil] nil if no such indexer is registered
  def self.indexer(id)
    type = Scripsi.redis.hget('scripsi:used', id.to_s)
    SortedSuffixIndexer.build(id) if type == 'ssi'
  end

  # (see #indexer)
  def self.find(id)
    indexer(id)
  end

  # Indexes every suffix of every document so that any substring can be
  # searched for.
  #
  # Redis layout (all keys end in the indexer id):
  # * scripsi:index:ID:J    - sorted set; member = position of a suffix in
  #                           the document blob, score = score of the J-th
  #                           slice of that suffix
  # * scripsi:document:ID   - one string: "text\0docid\0text\0docid\0..."
  # * scripsi:documents:ID  - hash: docid => marshalled [first, last] byte
  #                           offsets of the text inside the blob
  class SortedSuffixIndexer
    # How many bytes of the document blob are fetched per redis call while
    # looking for the id that terminates a document.
    READ_CHUNK = 64

    # @param [#to_s, nil] id id of the indexer; nil picks an unused number
    # @param [Boolean] check when false nothing is verified or registered
    #   (used by {build} to attach to an existing indexer)
    # @raise [RuntimeError] if check is true and the id is already in use
    def initialize(id = nil, check = true)
      if check
        if id && Scripsi.redis.hexists('scripsi:used', id.to_s)
          raise "id '#{id}' in use"
        end
        @id = id ? id.to_s : next_free_id
        Scripsi.redis.hset 'scripsi:used', @id, 'ssi'
      else
        @id = id
      end
      @index_key = "scripsi:index:#{@id}"
      @document_key = "scripsi:document:#{@id}"
      @documents_key = "scripsi:documents:#{@id}"
      @search_length = 30 # not read anywhere in this file
    end

    # Adds a document to this indexer.
    #
    # @param [Integer] id a number representing the id of this document
    # @param [String] str the text of the document
    # @return [Boolean] true if the document was indexed; false if the id is
    #   already taken or the text contains a NUL character (NUL is the
    #   separator used inside the document blob)
    def index(id, str)
      id = id.to_s
      return false if str.include?("\0")
      return false if Scripsi.redis.hexists(@documents_key, id)

      offset = Scripsi.redis.strlen(@document_key)
      # The member is a character index added to a byte offset. For
      # multibyte text it may point a little before the real byte position
      # of the suffix; read_to_id only scans forward to the document's
      # terminator, so the position still resolves to this document.
      suffixes(str).each do |suffix, i|
        Scripsi.score(suffix).each_with_index do |scr, partition|
          Scripsi.redis.zadd "#{@index_key}:#{partition}", scr, i + offset
        end
      end
      Scripsi.redis.append @document_key, "#{str}\0#{id}\0"
      # redis offsets are bytes, hence bytesize rather than size
      endpoints = Marshal.dump([offset, offset + str.bytesize - 1])
      Scripsi.redis.hset @documents_key, id, endpoints
      true
    end

    # A lazy list of documents associated with a SortedSuffixIndexer.
    class Documents
      # @param [String] doc_key key of the document blob
      # @param [String] endpoints_key key of the hash holding the offsets
      def initialize(doc_key, endpoints_key)
        @doc_key = doc_key
        @endpoints_key = endpoints_key
      end

      # Fetches the text of one document.
      #
      # @param [#to_s] id the id the document was indexed with
      # @return [String, nil] the stored text, nil for an unknown id
      def [](id)
        endpoints = Scripsi.redis.hget(@endpoints_key, id.to_s)
        return nil unless endpoints

        # NOTE(security): Marshal.load trusts whatever is stored in redis.
        # Kept so indexes written by earlier versions stay readable; do not
        # point this at a redis instance that untrusted parties can write to.
        first, last = Marshal.load(endpoints)
        first = first.to_i
        last = last.to_i
        # An empty document is stored as [offset, offset - 1]. At offset 0
        # that would be getrange(0, -1), i.e. the whole blob.
        return '' if last < first

        Scripsi.redis.getrange @doc_key, first, last
      end
    end

    # Retrieves the documents stored in this indexer.
    #
    # @return [Documents] look documents up with documents[id]
    def documents
      Documents.new(@document_key, @documents_key)
    end

    # Searches for documents containing the substring term.
    #
    # Matching is case-insensitive and every character outside a-z counts
    # as "no letter", so such characters all match each other.
    #
    # @param [String] term the substring to search for
    # @return [Array<Integer>] ids of the matching documents; empty if
    #   nothing matches or the term is empty
    def search(term)
      term = term.downcase
      remaining = term.length
      positions = nil
      Scripsi.score(term).each_with_index do |scr, partition|
        lower = scr.to_s
        upper = (scr + 1.0 / (27**remaining)).to_s
        # When the window is too small for a Float, both bounds print the
        # same and the range has to stay inclusive to match anything.
        upper = "(#{upper}" unless lower == upper
        hits = Scripsi.redis.zrangebyscore("#{@index_key}:#{partition}", lower, upper)
        hits = Set.new(hits)
        positions = positions ? positions & hits : hits
        break if positions.empty?

        remaining -= Scripsi.partition_size
      end
      return [] unless positions

      positions.map { |pos| read_to_id(pos.to_i) }.uniq
    end

    # Creates an indexer with the given id WITHOUT CHECKING.
    # This method is used internally - calling it yourself may result in
    # deleting an indexer, unless you know the id you're using is valid.
    #
    # @param [#to_s] id id of an existing indexer
    # @return [SortedSuffixIndexer]
    def self.build(id)
      new(id, false)
    end

    # @return [String] short description including the indexer id
    def inspect
      "#<Scripsi::SortedSuffixIndexer id=#{@id}>"
    end

    private

    # Draws numbers from the scripsi:next_id counter until one is found
    # that is not already registered as an indexer id.
    def next_free_id
      loop do
        candidate = Scripsi.redis.incr('scripsi:next_id')
        return candidate unless Scripsi.redis.hexists('scripsi:used', candidate.to_s)
      end
    end

    # All suffixes of the downcased text paired with their start index.
    #
    # @return [Array<Array(String, Integer)>]
    def suffixes(str)
      lowered = str.downcase
      (0...lowered.length).map { |i| [lowered[i..-1], i] }
    end

    # Position stored at the given rank of the index.
    # NOTE(review): this reads @index_key without a partition suffix, a key
    # that #index never writes - confirm intent before using this method.
    def document_index(index)
      Scripsi.redis.zrange(@index_key, index, index).first.to_i
    end

    # Compares str character by character with the stored text starting at
    # doc_index.
    #
    # @return [Integer, nil] result of the first non-zero <=>, 0 if equal
    def compare_with_index(str, doc_index)
      str.split('').each_with_index do |char, offset|
        stored = Scripsi.redis.getrange(@document_key, doc_index + offset, doc_index + offset)
        comp = char <=> stored.downcase
        return comp unless comp == 0
      end
      0
    end

    # Finds the id of the document that contains the given blob position by
    # scanning forward to the "\0id\0" marker that follows the text.
    #
    # @param [Integer] doc_index a position inside the document blob
    # @return [Integer] the document id
    # @raise [RuntimeError] if no complete marker follows the position
    def read_to_id(doc_index)
      last = Scripsi.redis.strlen(@document_key)
      buffer = String.new # binary, chunks may end in the middle of a character
      pos = doc_index
      while pos < last
        chunk = Scripsi.redis.getrange(@document_key, pos, pos + READ_CHUNK - 1)
        buffer << chunk.dup.force_encoding(Encoding::BINARY)
        pos += READ_CHUNK
        first = buffer.index("\0")
        next unless first

        second = buffer.index("\0", first + 1)
        return buffer[(first + 1)...second].to_i if second
      end
      raise 'index is corrupt'
    end
  end
end
@@ -0,0 +1,32 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
# Gem specification for scripsi 0.0.1.
Gem::Specification.new do |s|
  s.name = 'scripsi'
  s.version = '0.0.1'

  s.required_rubygems_version = Gem::Requirement.new('>= 1.2') if s.respond_to? :required_rubygems_version=
  s.authors = ['Alec Benzer']
  s.date = '2011-02-09'
  s.description = 'a flexible text-searching library built on top of redis'
  s.email = 'alecbenzer@gmail.com'
  s.extra_rdoc_files = ['README.md', 'lib/scripsi.rb']
  s.files = ['README.md', 'Rakefile', 'lib/scripsi.rb', 'Manifest', 'scripsi.gemspec']
  s.homepage = 'https://github.com/alecbenzer/scripsi'
  s.rdoc_options = ['--line-numbers', '--inline-source', '--title', 'Scripsi', '--main', 'README.md']
  s.require_paths = ['lib']
  # rubyforge_project= is deprecated in newer RubyGems; only set it where
  # the setter is available.
  s.rubyforge_project = 'scripsi' if s.respond_to? :rubyforge_project=
  s.rubygems_version = '1.5.0'
  s.summary = 'a flexible text-searching library built on top of redis'

  s.specification_version = 3 if s.respond_to? :specification_version

  # lib/scripsi.rb requires redis when it is loaded, so redis must be a
  # runtime dependency. add_dependency declares a runtime dependency, so
  # no branching on the RubyGems version is needed.
  s.add_dependency('redis', ['>= 2.1.1'])
end
metadata ADDED
@@ -0,0 +1,76 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: scripsi
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 0.0.1
6
+ platform: ruby
7
+ authors:
8
+ - Alec Benzer
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2011-02-09 00:00:00 -05:00
14
+ default_executable:
15
+ dependencies:
16
+ - !ruby/object:Gem::Dependency
17
+ name: redis
18
+ prerelease: false
19
+ requirement: &id001 !ruby/object:Gem::Requirement
20
+ none: false
21
+ requirements:
22
+ - - ">="
23
+ - !ruby/object:Gem::Version
24
+ version: 2.1.1
25
+ type: :development
26
+ version_requirements: *id001
27
+ description: a flexible text-searching library built on top of redis
28
+ email: alecbenzer@gmail.com
29
+ executables: []
30
+
31
+ extensions: []
32
+
33
+ extra_rdoc_files:
34
+ - README.md
35
+ - lib/scripsi.rb
36
+ files:
37
+ - README.md
38
+ - Rakefile
39
+ - lib/scripsi.rb
40
+ - Manifest
41
+ - scripsi.gemspec
42
+ has_rdoc: true
43
+ homepage: https://github.com/alecbenzer/scripsi
44
+ licenses: []
45
+
46
+ post_install_message:
47
+ rdoc_options:
48
+ - --line-numbers
49
+ - --inline-source
50
+ - --title
51
+ - Scripsi
52
+ - --main
53
+ - README.md
54
+ require_paths:
55
+ - lib
56
+ required_ruby_version: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: "0"
62
+ required_rubygems_version: !ruby/object:Gem::Requirement
63
+ none: false
64
+ requirements:
65
+ - - ">="
66
+ - !ruby/object:Gem::Version
67
+ version: "1.2"
68
+ requirements: []
69
+
70
+ rubyforge_project: scripsi
71
+ rubygems_version: 1.5.0
72
+ signing_key:
73
+ specification_version: 3
74
+ summary: a flexible text-searching library built on top of redis
75
+ test_files: []
76
+