scripsi 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/Manifest +4 -0
- data/README.md +29 -0
- data/Rakefile +12 -0
- data/lib/scripsi.rb +184 -0
- data/scripsi.gemspec +32 -0
- metadata +76 -0
data/Manifest
ADDED
data/README.md
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# Scripsi
|
2
|
+
|
3
|
+
A flexible text-searching library built on top of redis.
|
4
|
+
|
5
|
+
## Sorted suffix indexing
|
6
|
+
|
7
|
+
Sorted suffix indexing allows you to search for any substring within a set of documents. First, index a collection of documents and associated ids:
|
8
|
+
|
9
|
+
require 'scripsi'
|
10
|
+
Scripsi.connect # connect to a running redis server
|
11
|
+
|
12
|
+
ssi = Scripsi::SortedSuffixIndexer.new "myindexer"
|
13
|
+
ssi.index(1,"Epistulam ad te scripsi.")
|
14
|
+
ssi.index(2,"I've written you a letter.")
|
15
|
+
ssi.index(3,"Quisnam Tusculo epistulam me misit?")
|
16
|
+
ssi.index(4,"Who in Tusculum would've sent me a letter?")
|
17
|
+
|
18
|
+
You can then search for any substring, and the indexer will return the ids of the documents where that substring appears.
|
19
|
+
|
20
|
+
ssi = Scripsi.indexer "myindexer"
|
21
|
+
ssi.search("te") # => [1,2,4]
|
22
|
+
ssi.search("Tuscul") # => [3,4]
|
23
|
+
ssi.search("Tusculu") # => [4]
|
24
|
+
ssi.search("you a le") # => [2]
|
25
|
+
|
26
|
+
You can also retrieve the stored documents efficiently:
|
27
|
+
|
28
|
+
ssi.documents # lazy list of documents
|
29
|
+
ssi.documents[3] # text of the document with id 3, as a string
|
data/Rakefile
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
require 'rubygems'
require 'rake'
require 'echoe'

# Gem packaging configuration (Echoe generates the gemspec and Manifest
# from the values set in this block).
Echoe.new('scripsi', '0.0.1') do |p|
  p.description = "a flexible text-searching library built on top of redis"
  p.url = "https://github.com/alecbenzer/scripsi"
  p.author = "Alec Benzer"
  p.email = "alecbenzer@gmail.com"
  # keep redis dump files out of the packaged gem
  p.ignore_pattern = ["*.rdb"]
  # lib/scripsi.rb does `require 'redis'` at load time, so redis has to be
  # installed together with the gem: declare it as a runtime dependency
  # rather than a development-only one.
  p.runtime_dependencies = ["redis >=2.1.1"]
end
|
data/lib/scripsi.rb
ADDED
@@ -0,0 +1,184 @@
|
|
1
|
+
require 'redis'
|
2
|
+
require 'set'
|
3
|
+
|
4
|
+
module Scripsi
  # Connects to a redis server and remembers the connection for all indexers.
  #
  # @param [Hash] options options passed straight through to Redis.new
  # @return [Redis] the new connection
  def self.connect(options = {})
    @@redis = Redis.new(options)
  end

  # @return [Redis] the connection created by {Scripsi.connect}
  def self.redis
    @@redis
  end

  # number of characters folded into a single score (see {Scripsi.score})
  @@partition_size = 10

  # @return [Integer] how many characters are packed into one score
  def self.partition_size
    @@partition_size
  end

  # generate the 'scores' for a string
  # used for storing it in sorted sets
  #
  # The string is downcased and cut into partitions of partition_size
  # characters. Each partition is turned into a base 27 fraction, where 0
  # corresponds to no letter (or any character outside a-z), 1 to A, 2 to B,
  # etc.
  #
  # @param [String] str the string we are computing scores for
  # @return [Array<Float>] one score per partition (empty for an empty string)
  def self.score(str)
    str.downcase.each_char.each_slice(partition_size).map do |chars|
      mult = 1.0
      chars.inject(0.0) do |scr, char|
        mult /= 27
        # characters outside a-z occupy a position but contribute nothing
        char.between?('a', 'z') ? scr + (char.ord - 'a'.ord + 1) * mult : scr
      end
    end
  end

  # get the indexer with the given id
  #
  # @param [#to_s] id the id the indexer was registered under
  # @return [SortedSuffixIndexer, nil] the indexer, or nil when the id is not
  #   registered as a sorted suffix indexer
  def self.indexer(id)
    type = Scripsi.redis.hget "scripsi:used", id.to_s
    SortedSuffixIndexer.build(id) if type == "ssi"
  end

  # (see #indexer)
  def self.find(id)
    indexer(id)
  end

  # Indexes every suffix of every document in redis sorted sets so that any
  # substring can be looked up with a score range query.
  class SortedSuffixIndexer
    # number of bytes fetched per round trip while scanning for a document id
    READ_CHUNK_SIZE = 256

    # @param [#to_s, nil] id name of the indexer; when nil a fresh id is taken
    #   from the "scripsi:next_id" counter
    # @param [Boolean] check when true the id is verified to be unused and is
    #   registered; when false the id is used as given (see {build})
    # @raise [RuntimeError] if check is true and the id is already registered
    def initialize(id=nil,check=true)
      if check
        if id && Scripsi.redis.hexists("scripsi:used", id.to_s)
          raise "id '#{id}' in use"
        end
        @id = id ? id.to_s : Scripsi.redis.incr("scripsi:next_id")
        Scripsi.redis.hset "scripsi:used", @id, "ssi"
      else
        @id = id
      end
      @index_key = "scripsi:index:#{@id}"
      @document_key = "scripsi:document:#{@id}"
      @documents_key = "scripsi:documents:#{@id}"
      @search_length = 30
    end

    # adds a document to this indexer
    #
    # All documents of an indexer are appended to one redis string, each
    # followed by "\0<id>\0". Every suffix of the document is scored and its
    # byte position in that string is stored in one sorted set per partition.
    #
    # @param [Integer] id a number representing the id of this document
    # @param [String] str the text of the document
    # @return [Boolean] returns true if the document was successfully indexed,
    #   false if a document with this id is already stored
    def index(id,str)
      id = id.to_s
      return false if Scripsi.redis.hexists @documents_key, id
      offset = Scripsi.redis.strlen @document_key
      suffixes(str).each do |suffix, position|
        Scripsi.score(suffix).each_with_index do |scr, j|
          Scripsi.redis.zadd "#{@index_key}:#{j}", scr, position + offset
        end
      end
      Scripsi.redis.append @document_key, "#{str}\0#{id}\0"
      # STRLEN/GETRANGE count bytes, so the end point must be computed from
      # the byte size (not the character count) or multibyte text is cut short
      endpoints = Marshal.dump([offset, offset + str.bytesize - 1])
      Scripsi.redis.hset @documents_key, id, endpoints
      true
    end

    # a lazy list of documents associated with a SortedSuffixIndexer
    class Documents
      # @param [String] doc_key redis key of the string holding all documents
      # @param [String] endpoints_key redis key of the hash mapping document
      #   ids to their marshalled [first, last] byte positions
      def initialize(doc_key, endpoints_key)
        @doc_key = doc_key
        @endpoints_key = endpoints_key
      end

      # fetch the text of a single document
      #
      # @param [#to_s] id the id the document was indexed under
      # @return [String, nil] the document text, or nil for an unknown id
      def [](id)
        endpoints = Scripsi.redis.hget(@endpoints_key, id.to_s)
        return nil unless endpoints
        # NOTE(review): Marshal.load must only ever see data written by #index;
        # it is unsafe if untrusted clients can write to this redis hash.
        first, last = Marshal.load(endpoints).map(&:to_i)
        # an empty document is stored with last == first - 1; answering it
        # here avoids GETRANGE 0 -1, which would return the whole store
        return "" if last < first
        Scripsi.redis.getrange @doc_key, first, last
      end
    end

    # the documents stored in this indexer
    #
    # @return [Documents] a lazy list; use documents[id] to fetch one text
    def documents
      Documents.new(@document_key,@documents_key)
    end

    # searches for documents containing the substring term
    # (case-insensitively; characters outside a-z all score alike)
    #
    # @param [String] term the substring to search for
    # @return [Array<Integer>] ids of the matching documents (empty when
    #   nothing matches or the term is empty)
    def search(term)
      term = term.downcase
      length = term.length
      set = nil
      Scripsi.score(term).each_with_index do |scr,i|
        # every suffix starting with this partition scores in [scr, scr + 27^-length)
        a,b = scr.to_s, "#{scr+1.0/(27**length)}"
        # "(" makes the upper bound exclusive; when the width is lost to
        # float precision the range degenerates to an exact match
        b = "(" + b unless a == b
        ids = Scripsi.redis.zrangebyscore("#{@index_key}:#{i}",a,b)
        set = set ? set & Set.new(ids) : Set.new(ids)
        break if set.empty? # later partitions cannot add matches
        length -= Scripsi.partition_size
      end
      # an empty term has no partitions at all
      return [] unless set
      set.map{|i| read_to_id(i.to_i)}.uniq
    end

    # creates an indexer with the given id WITHOUT CHECKING
    # this method is used internally - calling it yourself may result in deleting an indexer, unless you know the id you're using is valid
    def self.build(id)
      new(id,false)
    end

    def inspect
      "#<Scripsi::SortedSuffixIndexer id=#{@id}>"
    end

    private

    # every suffix of str (downcased) paired with the byte offset at which it
    # starts, so positions line up with redis' byte-based string commands
    def suffixes(str)
      byte_offset = 0
      str.each_char.each_with_index.map do |char, i|
        entry = [str[i..-1].downcase, byte_offset]
        byte_offset += char.bytesize
        entry
      end
    end

    # NOTE(review): reads the un-partitioned @index_key, which #index never
    # writes to - looks like a leftover helper; confirm before relying on it
    def document_index(index)
      Scripsi.redis.zrange(@index_key, index, index).first.to_i
    end

    # compares str, character by character, with the stored text starting at
    # doc_index; returns the first non-zero <=> result, or 0 if all are equal
    def compare_with_index(str,doc_index)
      str.each_char.with_index do |char,offset|
        s = Scripsi.redis.getrange @document_key, doc_index+offset, doc_index+offset
        comp = char <=> s.downcase
        return comp unless comp == 0
      end
      0
    end

    # scans forward from doc_index to the "\0<id>\0" marker that terminates
    # the enclosing document and returns that id
    #
    # The store is read in chunks rather than one byte per round trip, and the
    # scan stops at the end of the store so a missing marker raises instead of
    # looping forever.
    #
    # @raise [RuntimeError] if no complete marker follows doc_index
    def read_to_id(doc_index)
      last = Scripsi.redis.strlen @document_key
      buffer = "".b
      position = doc_index
      while position < last
        chunk = Scripsi.redis.getrange(@document_key, position, position + READ_CHUNK_SIZE - 1)
        buffer << chunk.to_s.b
        position += READ_CHUNK_SIZE
        opening = buffer.index("\0")
        next unless opening
        closing = buffer.index("\0", opening + 1)
        return buffer[(opening + 1)...closing].to_i if closing
      end
      raise "index is corrupt"
    end
  end

end
|
data/scripsi.gemspec
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
# Gem specification for scripsi (originally generated by Echoe from the Rakefile).
Gem::Specification.new do |s|
  s.name = %q{scripsi}
  s.version = "0.0.1"

  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
  s.authors = ["Alec Benzer"]
  s.date = %q{2011-02-09}
  s.description = %q{a flexible text-searching library built on top of redis}
  s.email = %q{alecbenzer@gmail.com}
  s.extra_rdoc_files = ["README.md", "lib/scripsi.rb"]
  s.files = ["README.md", "Rakefile", "lib/scripsi.rb", "Manifest", "scripsi.gemspec"]
  s.homepage = %q{https://github.com/alecbenzer/scripsi}
  s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Scripsi", "--main", "README.md"]
  s.require_paths = ["lib"]
  # guarded like required_rubygems_version above, so the spec still loads on
  # RubyGems versions that do not provide this setter
  s.rubyforge_project = %q{scripsi} if s.respond_to? :rubyforge_project=
  s.rubygems_version = %q{1.5.0}
  s.summary = %q{a flexible text-searching library built on top of redis}

  s.specification_version = 3 if s.respond_to? :specification_version

  # lib/scripsi.rb requires 'redis' at load time, so it is a runtime
  # dependency on every RubyGems version (add_dependency declares a runtime
  # dependency), not a development-only one.
  s.add_dependency(%q<redis>, [">= 2.1.1"])
end
|
metadata
ADDED
@@ -0,0 +1,76 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: scripsi
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease:
|
5
|
+
version: 0.0.1
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Alec Benzer
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
|
13
|
+
date: 2011-02-09 00:00:00 -05:00
|
14
|
+
default_executable:
|
15
|
+
dependencies:
|
16
|
+
- !ruby/object:Gem::Dependency
|
17
|
+
name: redis
|
18
|
+
prerelease: false
|
19
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
20
|
+
none: false
|
21
|
+
requirements:
|
22
|
+
- - ">="
|
23
|
+
- !ruby/object:Gem::Version
|
24
|
+
version: 2.1.1
|
25
|
+
type: :development
|
26
|
+
version_requirements: *id001
|
27
|
+
description: a flexible text-searching library built on top of redis
|
28
|
+
email: alecbenzer@gmail.com
|
29
|
+
executables: []
|
30
|
+
|
31
|
+
extensions: []
|
32
|
+
|
33
|
+
extra_rdoc_files:
|
34
|
+
- README.md
|
35
|
+
- lib/scripsi.rb
|
36
|
+
files:
|
37
|
+
- README.md
|
38
|
+
- Rakefile
|
39
|
+
- lib/scripsi.rb
|
40
|
+
- Manifest
|
41
|
+
- scripsi.gemspec
|
42
|
+
has_rdoc: true
|
43
|
+
homepage: https://github.com/alecbenzer/scripsi
|
44
|
+
licenses: []
|
45
|
+
|
46
|
+
post_install_message:
|
47
|
+
rdoc_options:
|
48
|
+
- --line-numbers
|
49
|
+
- --inline-source
|
50
|
+
- --title
|
51
|
+
- Scripsi
|
52
|
+
- --main
|
53
|
+
- README.md
|
54
|
+
require_paths:
|
55
|
+
- lib
|
56
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: "0"
|
62
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
63
|
+
none: false
|
64
|
+
requirements:
|
65
|
+
- - ">="
|
66
|
+
- !ruby/object:Gem::Version
|
67
|
+
version: "1.2"
|
68
|
+
requirements: []
|
69
|
+
|
70
|
+
rubyforge_project: scripsi
|
71
|
+
rubygems_version: 1.5.0
|
72
|
+
signing_key:
|
73
|
+
specification_version: 3
|
74
|
+
summary: a flexible text-searching library built on top of redis
|
75
|
+
test_files: []
|
76
|
+
|