scripsi 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Manifest +4 -0
- data/README.md +29 -0
- data/Rakefile +12 -0
- data/lib/scripsi.rb +184 -0
- data/scripsi.gemspec +32 -0
- metadata +76 -0
data/Manifest
ADDED
data/README.md
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# Scripsi
|
2
|
+
|
3
|
+
A flexible text-searching library built on top of redis.
|
4
|
+
|
5
|
+
## Sorted suffix indexing
|
6
|
+
|
7
|
+
Sorted suffix indexing allows you to search for any substring within a set of documents. First, index a collection of documents and associated ids:
|
8
|
+
|
9
|
+
require 'scripsi'
|
10
|
+
Scripsi.connect # connect to a running redis server
|
11
|
+
|
12
|
+
ssi = Scripsi::SortedSuffixIndexer.new "myindexer"
|
13
|
+
ssi.index(1,"Epistulam ad te scripsi.")
|
14
|
+
ssi.index(2,"I've written you a letter.")
|
15
|
+
ssi.index(3,"Quisnam Tusculo epistulam me misit?")
|
16
|
+
ssi.index(4,"Who in Tusculum would've sent me a letter?")
|
17
|
+
|
18
|
+
You can then search for any substring, and the indexer will return the ids of the documents where that substring appears.
|
19
|
+
|
20
|
+
ssi = Scripsi.indexer "myindexer"
|
21
|
+
ssi.search("te") # => [1,2,4]
|
22
|
+
ssi.search("Tuscul") # => [3,4]
|
23
|
+
ssi.search("Tusculu") # => [4]
|
24
|
+
ssi.search("you a le") # => [2]
|
25
|
+
|
26
|
+
You can also retrieve the stored documents efficiently:
|
27
|
+
|
28
|
+
ssi.documents # lazy list of documents
|
29
|
+
ssi.documents[3] # text of the document that was indexed with id 3
|
data/Rakefile
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'rake'
|
3
|
+
require 'echoe'
|
4
|
+
|
5
|
+
# Gem packaging configuration (Echoe generates the gemspec from this block).
Echoe.new('scripsi','0.0.1') do |p|
  p.description = "a flexible text-searching library built on top of redis"
  p.url = "https://github.com/alecbenzer/scripsi"
  p.author = "Alec Benzer"
  p.email = "alecbenzer@gmail.com"
  p.ignore_pattern = ["*.rdb"]
  # lib/scripsi.rb does `require 'redis'` at load time, so redis has to be
  # installed together with the gem: it is a runtime dependency, not a
  # development-only one.
  p.runtime_dependencies = ["redis >=2.1.1"]
  p.development_dependencies = []
end
|
data/lib/scripsi.rb
ADDED
@@ -0,0 +1,184 @@
|
|
1
|
+
require 'redis'
|
2
|
+
require 'set'
|
3
|
+
|
4
|
+
module Scripsi
|
5
|
+
# Opens a connection to a redis server and remembers it for all indexers.
#
# @param [Hash] options connection options, handed unchanged to Redis.new
# @return [Redis] the connection that was just created
def self.connect(options = {})
  @redis = Redis.new(options)
end

# The connection created by the most recent call to connect.
#
# @return [Redis]
# @raise [RuntimeError] if connect has not been called yet
def self.redis
  @redis || raise("not connected to redis: call Scripsi.connect first")
end

# Number of characters that score folds into a single score value.
# Kept in a module-level instance variable rather than a class variable so
# the value is not shared with anything that includes or nests in the module.
@partition_size = 10

# @return [Integer] how many characters make up one score partition
def self.partition_size
  @partition_size
end
|
19
|
+
|
20
|
+
# Computes the score(s) used to store a string in redis sorted sets.
#
# The string is lowercased and cut into consecutive partitions of
# partition_size characters. Each partition is read as a base 27 fraction:
# the k-th character of the partition contributes digit / 27**k, where the
# digit is 0 for anything that is not a letter a-z, 1 for a, 2 for b, ...
# and 26 for z.
#
# @param [String] str the string we are computing a score for
# @return [Array<Float>] one score per partition, in order ([] for "")
def self.score(str)
  base = 'a'.ord - 1
  str.downcase.each_char.each_slice(partition_size).map do |partition|
    mult = 1.0
    partition.inject(0.0) do |scr, char|
      # the multiplier shrinks for every character, letter or not, so a
      # non-letter still occupies its position with digit 0
      mult /= 27
      ('a'..'z').include?(char) ? scr + (char.ord - base) * mult : scr
    end
  end
end
|
42
|
+
|
43
|
+
# Looks up a previously created indexer by its id.
#
# The type registered for the id is read from the "scripsi:used" hash.
#
# @param [#to_s] id the id the indexer was created with
# @return [SortedSuffixIndexer, nil] the indexer, or nil when the id is not
#   registered as a sorted suffix indexer
def self.indexer(id)
  kind = Scripsi.redis.hget("scripsi:used", id.to_s)
  return unless kind == "ssi"
  SortedSuffixIndexer.build(id)
end
|
50
|
+
|
51
|
+
# Same lookup as indexer, under a second name.
#
# @param [#to_s] id the id the indexer was created with
# @return whatever indexer returns for that id
def self.find(id)
  indexer id
end
|
55
|
+
|
56
|
+
class SortedSuffixIndexer
|
57
|
+
# Creates an indexer and derives the redis keys it works with.
#
# @param [#to_s, nil] id the id for this indexer; with check enabled and no
#   id given, the next value of the "scripsi:next_id" counter is used
# @param [Boolean] check when true the id must not be registered yet and is
#   then registered as "ssi" in the "scripsi:used" hash; when false the id
#   is taken as given and redis is not contacted
# @raise [RuntimeError] if check is true and the id is already registered
def initialize(id=nil,check=true)
  if check
    taken = id && Scripsi.redis.hexists("scripsi:used", id.to_s)
    raise "id '#{id}' in use" if taken
    @id = id ? id.to_s : Scripsi.redis.incr("scripsi:next_id")
    Scripsi.redis.hset("scripsi:used", @id, "ssi")
  else
    @id = id
  end

  @index_key = "scripsi:index:#{@id}"
  @document_key = "scripsi:document:#{@id}"
  @documents_key = "scripsi:documents:#{@id}"
  # not read by any method visible in this file
  @search_length = 30
end
|
72
|
+
|
73
|
+
# Adds a document to this indexer.
#
# Every suffix of the document is scored and added to one sorted set per
# score partition; the member is the position of the suffix in the stored
# text. The text itself is appended to a single redis string, followed by
# NUL, the document id and another NUL, and the first/last byte offsets of
# the text are recorded under the document id.
#
# @param [Integer] id a number representing the id of this document
# @param [String] str the text of the document
# @return [Boolean] true if the document was indexed, false if a document
#   with this id had already been indexed
def index(id,str)
  id = id.to_s
  return false if Scripsi.redis.hexists @documents_key, id

  offset = Scripsi.redis.strlen @document_key
  # no sorting needed: the contents of a sorted set do not depend on the
  # order in which members are added
  suffixes(str).each do |suffix, position|
    Scripsi.score(suffix).each_with_index do |scr, partition|
      # position is a character index; it never exceeds the byte index, so
      # offset + position always falls inside this document's stored bytes
      Scripsi.redis.zadd "#{@index_key}:#{partition}", scr, position + offset
    end
  end

  Scripsi.redis.append @document_key, "#{str}\0#{id}\0"
  # strlen/getrange count bytes, so the end offset must use bytesize
  endpoints = Marshal.dump([offset, offset + str.bytesize - 1])
  Scripsi.redis.hset @documents_key, id, endpoints
  true
end
|
93
|
+
|
94
|
+
# A lazy list of the documents associated with a SortedSuffixIndexer.
# Nothing is read from redis until a document is asked for with [].
class Documents
  # @param [String] doc_key redis key of the string holding all document text
  # @param [String] endpoints_key redis key of the hash that maps a document
  #   id to its marshalled [first, last] byte offsets
  def initialize(doc_key, endpoints_key)
    @doc_key = doc_key
    @endpoints_key = endpoints_key
  end

  # Fetches the text of one document.
  #
  # @param [#to_s] id the id the document was indexed with
  # @return [String, nil] the document text, or nil if the id is unknown
  def [](id)
    # ids are stored as strings by SortedSuffixIndexer#index
    endpoints = Scripsi.redis.hget(@endpoints_key, id.to_s)
    return nil unless endpoints

    # NOTE(review): Marshal.load is only safe while everything written to
    # this hash comes from this library; it must not be pointed at data an
    # untrusted party can write.
    first, last = Marshal.load(endpoints)
    first, last = first.to_i, last.to_i
    # an empty document is stored as [offset, offset - 1]; at offset 0 that
    # is (0, -1), which GETRANGE would interpret as "the whole string"
    return "" if last < first

    Scripsi.redis.getrange @doc_key, first, last
  end
end
|
108
|
+
|
109
|
+
# Gives access to the documents stored in this indexer.
#
# @return [Documents] a lazy list; documents[id] fetches the text of the
#   document that was indexed with that id
def documents
  Documents.new(@document_key, @documents_key)
end
|
113
|
+
|
114
|
+
# Searches for documents containing the substring term.
#
# Each partition of the term's score is looked up as a score range in the
# sorted set of the same partition; only suffix positions found in every
# partition are kept, and each surviving position is resolved to the id of
# the document it belongs to.
#
# @param [String] term the substring to search for
# @return [Array<Integer>] ids of the matching documents, without
#   duplicates; [] when nothing matches or the term is empty
def search(term)
  term, length = term.downcase, term.length
  matches = nil
  Scripsi.score(term).each_with_index do |scr, partition|
    # the range covers every score that starts with the same base 27 digits
    # as the remaining part of the term
    low, high = scr.to_s, "#{scr+1.0/(27**length)}"
    # "(" makes the upper bound exclusive; when the increment is too small
    # to change the float, fall back to an exact-score lookup
    high = "(" + high unless low == high
    positions = Set.new(Scripsi.redis.zrangebyscore("#{@index_key}:#{partition}", low, high))
    matches = matches ? matches & positions : positions
    # nothing can match any more, so skip the remaining lookups
    break if matches.empty?
    length -= Scripsi.partition_size
  end
  # an empty term has no score partitions at all
  return [] unless matches

  matches.map { |position| read_to_id(position.to_i) }.uniq
end
|
129
|
+
|
130
|
+
# Instantiates an indexer for the given id WITHOUT CHECKING the id.
#
# Used internally: the in-use check and the registration in "scripsi:used"
# are both skipped, so the returned object works on whatever redis keys
# belong to that id. Only call this yourself with an id you know is valid.
#
# @param id the id of the indexer
# @return [SortedSuffixIndexer]
def self.build(id)
  new id, false
end
|
135
|
+
|
136
|
+
def inspect
|
137
|
+
"#<Scripsi::SortedSuffixIndexer id=#{@id}>"
|
138
|
+
end
|
139
|
+
|
140
|
+
private
|
141
|
+
|
142
|
+
# Lists every suffix of the lowercased string with its start position.
#
# @param [String] str the text to take suffixes of
# @return [Array<Array(String, Integer)>] [suffix, start index] pairs,
#   ordered by start index
def suffixes(str)
  lowered = str.downcase
  Array.new(lowered.length) { |start| [lowered[start..-1], start] }
end
|
146
|
+
|
147
|
+
# Reads the member stored at the given rank of the sorted set at @index_key.
#
# NOTE(review): the index method visible in this file only writes to
# "#{@index_key}:<partition>" keys, never to @index_key itself - confirm
# which key this lookup is meant to read.
#
# @param [Integer] index rank within the sorted set
# @return [Integer] the member at that rank as an integer (0 if there is none)
def document_index(index)
  Scripsi.redis.zrange(@index_key, index, index).first.to_i
end
|
150
|
+
|
151
|
+
# Compares a string with the stored text that starts at a given position.
#
# The stored text is lowercased before comparing; str is compared as given.
# The text is fetched with one GETRANGE instead of one request per character.
#
# NOTE(review): GETRANGE works on bytes, so this assumes one byte per
# character in the compared region - confirm for non-ASCII documents.
#
# @param [String] str the string to compare
# @param [Integer] doc_index position in the stored text to compare against
# @return [Integer] -1, 0 or 1 from the first differing character; 0 when
#   every character of str matches
def compare_with_index(str,doc_index)
  # nothing to compare, and a (doc_index, doc_index - 1) range must be avoided
  return 0 if str.empty?

  stored = Scripsi.redis.getrange(@document_key, doc_index, doc_index + str.length - 1).downcase
  str.each_char.with_index do |char, offset|
    # past the end of the stored text there is nothing left: compare with ""
    comp = char <=> stored[offset].to_s
    return comp unless comp == 0
  end
  0
end
|
160
|
+
|
161
|
+
# Finds the id of the document that contains a given position.
#
# Documents are stored back to back as text, NUL, id, NUL. Starting at
# doc_index, the stored string is read forward in fixed-size windows until
# the first NUL-delimited id has been read completely.
#
# @param [Integer] doc_index a position inside the stored text
# @return [Integer] the id that follows the text doc_index points into
# @raise [RuntimeError] "index is corrupt" if no complete NUL-delimited id
#   follows doc_index
def read_to_id(doc_index)
  total = Scripsi.redis.strlen @document_key
  window = 256
  # work on raw bytes so a window boundary inside a multi-byte character
  # cannot disturb the NUL search
  buffer = "".dup.force_encoding(Encoding::BINARY)
  cursor = doc_index

  while cursor < total
    chunk = Scripsi.redis.getrange(@document_key, cursor, cursor + window - 1)
    buffer << chunk.dup.force_encoding(Encoding::BINARY)
    cursor += window

    opening = buffer.index("\0")
    next unless opening
    closing = buffer.index("\0", opening + 1)
    return buffer[(opening + 1)...closing].to_i if closing
  end

  # ran out of data without finding both delimiters
  raise "index is corrupt"
end
|
182
|
+
end
|
183
|
+
|
184
|
+
end
|
data/scripsi.gemspec
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
# Gem specification for scripsi 0.0.1.
Gem::Specification.new do |s|
  s.name = %q{scripsi}
  s.version = "0.0.1"

  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
  s.authors = ["Alec Benzer"]
  s.date = %q{2011-02-09}
  s.description = %q{a flexible text-searching library built on top of redis}
  s.email = %q{alecbenzer@gmail.com}
  s.extra_rdoc_files = ["README.md", "lib/scripsi.rb"]
  s.files = ["README.md", "Rakefile", "lib/scripsi.rb", "Manifest", "scripsi.gemspec"]
  s.homepage = %q{https://github.com/alecbenzer/scripsi}
  s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Scripsi", "--main", "README.md"]
  s.require_paths = ["lib"]
  # guarded like the other optional setters, in case the running RubyGems
  # does not provide this attribute
  s.rubyforge_project = %q{scripsi} if s.respond_to? :rubyforge_project=
  s.rubygems_version = %q{1.5.0}
  s.summary = %q{a flexible text-searching library built on top of redis}

  s.specification_version = 3 if s.respond_to? :specification_version=

  # lib/scripsi.rb does `require 'redis'` at load time, so redis must be a
  # runtime dependency; as a development dependency it would not be
  # installed for users of the gem.
  s.add_dependency(%q<redis>, [">= 2.1.1"])
end
|
metadata
ADDED
@@ -0,0 +1,76 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: scripsi
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease:
|
5
|
+
version: 0.0.1
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Alec Benzer
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
|
13
|
+
date: 2011-02-09 00:00:00 -05:00
|
14
|
+
default_executable:
|
15
|
+
dependencies:
|
16
|
+
- !ruby/object:Gem::Dependency
|
17
|
+
name: redis
|
18
|
+
prerelease: false
|
19
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
20
|
+
none: false
|
21
|
+
requirements:
|
22
|
+
- - ">="
|
23
|
+
- !ruby/object:Gem::Version
|
24
|
+
version: 2.1.1
|
25
|
+
type: :development
|
26
|
+
version_requirements: *id001
|
27
|
+
description: a flexible text-searching library built on top of redis
|
28
|
+
email: alecbenzer@gmail.com
|
29
|
+
executables: []
|
30
|
+
|
31
|
+
extensions: []
|
32
|
+
|
33
|
+
extra_rdoc_files:
|
34
|
+
- README.md
|
35
|
+
- lib/scripsi.rb
|
36
|
+
files:
|
37
|
+
- README.md
|
38
|
+
- Rakefile
|
39
|
+
- lib/scripsi.rb
|
40
|
+
- Manifest
|
41
|
+
- scripsi.gemspec
|
42
|
+
has_rdoc: true
|
43
|
+
homepage: https://github.com/alecbenzer/scripsi
|
44
|
+
licenses: []
|
45
|
+
|
46
|
+
post_install_message:
|
47
|
+
rdoc_options:
|
48
|
+
- --line-numbers
|
49
|
+
- --inline-source
|
50
|
+
- --title
|
51
|
+
- Scripsi
|
52
|
+
- --main
|
53
|
+
- README.md
|
54
|
+
require_paths:
|
55
|
+
- lib
|
56
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: "0"
|
62
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
63
|
+
none: false
|
64
|
+
requirements:
|
65
|
+
- - ">="
|
66
|
+
- !ruby/object:Gem::Version
|
67
|
+
version: "1.2"
|
68
|
+
requirements: []
|
69
|
+
|
70
|
+
rubyforge_project: scripsi
|
71
|
+
rubygems_version: 1.5.0
|
72
|
+
signing_key:
|
73
|
+
specification_version: 3
|
74
|
+
summary: a flexible text-searching library built on top of redis
|
75
|
+
test_files: []
|
76
|
+
|