scripsi 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (6) hide show
  1. data/Manifest +4 -0
  2. data/README.md +29 -0
  3. data/Rakefile +12 -0
  4. data/lib/scripsi.rb +184 -0
  5. data/scripsi.gemspec +32 -0
  6. metadata +76 -0
@@ -0,0 +1,4 @@
1
+ README.md
2
+ Rakefile
3
+ lib/scripsi.rb
4
+ Manifest
@@ -0,0 +1,29 @@
1
+ # Scripsi
2
+
3
+ A flexible text-searching library built on top of redis.
4
+
5
+ ## Sorted suffix indexing
6
+
7
+ Sorted suffix indexing allows you to search for any substring within a set of documents. First, index a collection of documents and associated ids:
8
+
9
+ require 'scripsi'
10
+ Scripsi.connect # connect to a running redis server
11
+
12
+ ssi = Scripsi::SortedSuffixIndexer.new "myindexer"
13
+ ssi.index(1,"Epistulam ad te scripsi.")
14
+ ssi.index(2,"I've written you a letter.")
15
+ ssi.index(3,"Quisnam Tusculo epistulam me misit?")
16
+ ssi.index(4,"Who in Tusculum would've sent me a letter?")
17
+
18
+ You can then search for any substring, and the indexer will return the ids of the documents where that substring appears.
19
+
20
+ ssi = Scripsi.indexer "myindexer"
21
+ ssi.search("te") # => [1,2,4]
22
+ ssi.search("Tuscul") # => [3,4]
23
+ ssi.search("Tusculu") # => [4]
24
+ ssi.search("you a le") # => [2]
25
+
26
+ You can also retrieve the stored documents efficiently:
27
+
28
+ ssi.documents # lazy list of documents
29
+ ssi.documents[3] # text of the document that was indexed with id 3
@@ -0,0 +1,12 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+ require 'echoe'
4
+
5
# Packaging configuration for the scripsi gem (consumed by Echoe's rake tasks).
Echoe.new('scripsi', '0.0.1') do |p|
  p.description = "a flexible text-searching library built on top of redis"
  p.url = "https://github.com/alecbenzer/scripsi"
  p.author = "Alec Benzer"
  p.email = "alecbenzer@gmail.com"
  # keep local redis dump files out of the packaged gem
  p.ignore_pattern = ["*.rdb"]
  # lib/scripsi.rb does `require 'redis'` at load time, so redis has to be
  # installed together with the gem: declare it as a runtime dependency
  # rather than a development-only one.
  p.runtime_dependencies = ["redis >=2.1.1"]
end
@@ -0,0 +1,184 @@
1
+ require 'redis'
2
+ require 'set'
3
+
4
# Text-search helpers backed by redis.
#
# Call {Scripsi.connect} once before using any indexer; every other method in
# this module talks to redis through {Scripsi.redis}.
module Scripsi
  # Module-level state: the redis client (set by .connect) and the number of
  # characters folded into a single score by .score.
  @redis = nil
  @partition_size = 10

  # connect to a redis server
  #
  # @param [Hash] options passed unchanged to Redis.new
  # @return [Redis] the new connection
  def self.connect(options = {})
    @redis = Redis.new(options)
  end

  # the connection created by {connect}
  #
  # @return [Redis]
  # @raise [RuntimeError] if {connect} has not been called yet
  def self.redis
    @redis || raise("Scripsi is not connected to redis - call Scripsi.connect first")
  end

  # @return [Integer] how many characters each score returned by {score} covers
  def self.partition_size
    @partition_size
  end

  # generate a 'score' for a string
  # used for storing it in a sorted set
  #
  # The string is downcased and cut into slices of {partition_size} characters.
  # Each slice is turned into a base 27 floating point number, where 0
  # corresponds to no letter (or any character outside a-z), 1 to A, 2 to B, etc.
  #
  # @param [String] str the string we are computing a score for
  # @return [Array<Float>] one score per slice, in order; empty for an empty string
  def self.score(str)
    letters = ('a'..'z')
    base = 'a'.ord - 1
    str.downcase.each_char.each_slice(partition_size).map do |slice|
      mult = 1.0
      slice.inject(0.0) do |scr, char|
        # each position is worth 1/27 of the previous one
        mult /= 27
        letters.include?(char) ? scr + (char.ord - base) * mult : scr
      end
    end
  end

  # get the indexer with the given id
  #
  # @param [#to_s] id the id the indexer was created with
  # @return [SortedSuffixIndexer, nil] nil when no "ssi" indexer is registered under id
  def self.indexer(id)
    type = Scripsi.redis.hget "scripsi:used", id.to_s
    SortedSuffixIndexer.build(id) if type == "ssi"
  end

  # (see #indexer)
  def self.find(id)
    indexer(id)
  end

  # Indexes every suffix of every document in redis sorted sets so that
  # documents can be looked up by substring.
  class SortedSuffixIndexer
    # @param [#to_s, nil] id name of the indexer; when nil a fresh id is taken
    #   from the "scripsi:next_id" counter
    # @param [Boolean] check when true the id is checked against and registered
    #   in the "scripsi:used" hash; when false the id is used as given
    # @raise [RuntimeError] if check is true and the id is already registered
    def initialize(id=nil,check=true)
      if check
        if id && Scripsi.redis.hexists("scripsi:used", id.to_s)
          raise "id '#{id}' in use"
        end
        @id = id ? id.to_s : Scripsi.redis.incr("scripsi:next_id")
        Scripsi.redis.hset "scripsi:used", @id, "ssi"
      else
        @id = id
      end
      @index_key = "scripsi:index:#{@id}"
      @document_key = "scripsi:document:#{@id}"
      @documents_key = "scripsi:documents:#{@id}"
      @search_length = 30
    end

    # adds a document to this indexer
    #
    # @param [Integer] id a number representing the id of this document
    # @param [String] str the text of the document
    # @return [Boolean] returns true if the document was successfully indexed,
    #   false if a document with this id is already stored
    def index(id,str)
      id = id.to_s
      redis = Scripsi.redis
      return false if redis.hexists @documents_key, id
      # documents are appended to one redis string; this is where the new one starts
      offset = redis.strlen @document_key
      # sorted-set members are unordered on insert, so the suffixes need no sorting
      suffixes(str).each do |suffix,i|
        Scripsi.score(suffix).each_with_index do |scr,j|
          redis.zadd "#{@index_key}:#{j}", scr, i+offset
        end
      end
      # the id is stored after the text, between NUL markers, for read_to_id
      redis.append @document_key, "#{str}\0#{id}\0"
      # strlen/getrange count bytes, so the end point must be a byte offset too
      endpoints = Marshal.dump([offset, offset + str.bytesize - 1])
      redis.hset @documents_key, id, endpoints
      true
    end

    # a lazy list of documents associated with a SortedSuffixIndexer
    class Documents
      # @param [String] doc_key redis key of the string holding all documents
      # @param [String] endpoints_key redis key of the hash mapping document ids
      #   to their marshalled [first, last] offsets
      def initialize(doc_key, endpoints_key)
        @doc_key = doc_key
        @endpoints_key = endpoints_key
      end

      # @param [#to_s] id id of a document
      # @return [String, nil] the stored text, or nil if no such document exists
      def [](id)
        endpoints = Scripsi.redis.hget(@endpoints_key, id.to_s)
        return nil unless endpoints
        # NOTE(review): Marshal.load trusts whatever is stored in redis; only
        # safe while nothing untrusted can write to this hash.
        first, last = Marshal.load(endpoints).map { |n| n.to_i }
        # an empty document is stored as [offset, offset - 1]; asking redis for
        # that range could wrap around (getrange 0 -1 is the whole string)
        return "" if last < first
        Scripsi.redis.getrange @doc_key, first, last
      end
    end

    # retrieve the lazy list of documents stored by this indexer
    #
    # @return [Documents]
    def documents
      Documents.new(@document_key,@documents_key)
    end

    # searches for documents containing the substring term
    #
    # Matching is done on scores, so case is ignored and every character
    # outside a-z is treated alike.
    #
    # @param [String] term the substring to search for
    # @return [Array<Integer>] ids of the matching documents; empty for an empty term
    def search(term)
      term = term.downcase
      length = term.length
      matches = nil
      Scripsi.score(term).each_with_index do |scr,i|
        lower = scr.to_s
        upper = "#{scr+1.0/(27**length)}"
        # exclusive upper bound, unless the increment vanished in float rounding
        upper = "(" + upper unless lower == upper
        ids = Scripsi.redis.zrangebyscore("#{@index_key}:#{i}",lower,upper)
        matches = matches ? matches & Set.new(ids) : Set.new(ids)
        length -= Scripsi.partition_size
      end
      # an empty term produces no scores, hence no candidate set
      return [] unless matches
      matches.map{|i| read_to_id(i.to_i)}.uniq
    end

    # creates an indexer with the given id WITHOUT CHECKING
    # this method is used internally - calling it yourself may result in deleting an indexer, unless you know the id you're using is valid
    def self.build(id)
      new(id,false)
    end

    def inspect
      "#<Scripsi::SortedSuffixIndexer id=#{@id}>"
    end

    private

    # @return [Array<Array(String, Integer)>] every suffix of the downcased
    #   string paired with the character index it starts at
    def suffixes(str)
      str = str.downcase
      (0...str.length).map {|i| [str[i..-1],i] }
    end

    # Reads the member at the given rank of the sorted set stored at @index_key.
    # Not called anywhere in this file.
    def document_index(index)
      Scripsi.redis.zrange(@index_key, index, index).first.to_i
    end

    # Compares str character by character with the stored text starting at
    # doc_index. Not called anywhere in this file.
    #
    # @return [Integer] result of the first non-zero <=>, or 0 if all characters match
    def compare_with_index(str,doc_index)
      str.split('').each_with_index do |char,offset|
        s = Scripsi.redis.getrange @document_key, doc_index+offset, doc_index+offset
        comp = char <=> s.downcase
        return comp unless comp == 0
      end
      0
    end

    # Scans forward from doc_index to the "\0<id>\0" marker that follows the
    # document and returns that id.
    #
    # @param [Integer] doc_index offset inside the stored documents string
    # @return [Integer] id of the document containing doc_index
    # @raise [RuntimeError] if the markers cannot be found before the end of the string
    def read_to_id(doc_index)
      redis = Scripsi.redis
      last = redis.strlen @document_key
      pos = doc_index
      # OPTIMIZE: this is one round trip per byte
      pos += 1 while pos < last && redis.getrange(@document_key, pos, pos) != "\0"
      raise "index is corrupt" if pos >= last
      id = String.new
      pos += 1
      # bounded by the string length so a missing closing NUL cannot loop forever
      while pos < last
        char = redis.getrange(@document_key, pos, pos)
        return id.to_i if char == "\0"
        id << char
        pos += 1
      end
      raise "index is corrupt"
    end
  end

end
@@ -0,0 +1,32 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
# Gem specification for scripsi 0.0.1.
Gem::Specification.new do |s|
  s.name = %q{scripsi}
  s.version = "0.0.1"

  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
  s.authors = ["Alec Benzer"]
  s.date = %q{2011-02-09}
  s.description = %q{a flexible text-searching library built on top of redis}
  s.email = %q{alecbenzer@gmail.com}
  s.extra_rdoc_files = ["README.md", "lib/scripsi.rb"]
  s.files = ["README.md", "Rakefile", "lib/scripsi.rb", "Manifest", "scripsi.gemspec"]
  s.homepage = %q{https://github.com/alecbenzer/scripsi}
  s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Scripsi", "--main", "README.md"]
  s.require_paths = ["lib"]
  # guarded like the other optional setters in this file
  s.rubyforge_project = %q{scripsi} if s.respond_to? :rubyforge_project=
  s.rubygems_version = %q{1.5.0}
  s.summary = %q{a flexible text-searching library built on top of redis}

  if s.respond_to? :specification_version then
    s.specification_version = 3

    if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
      # lib/scripsi.rb requires redis at load time, so it is needed at runtime,
      # not only for development
      s.add_runtime_dependency(%q<redis>, [">= 2.1.1"])
    else
      s.add_dependency(%q<redis>, [">= 2.1.1"])
    end
  else
    s.add_dependency(%q<redis>, [">= 2.1.1"])
  end
end
metadata ADDED
@@ -0,0 +1,76 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: scripsi
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 0.0.1
6
+ platform: ruby
7
+ authors:
8
+ - Alec Benzer
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2011-02-09 00:00:00 -05:00
14
+ default_executable:
15
+ dependencies:
16
+ - !ruby/object:Gem::Dependency
17
+ name: redis
18
+ prerelease: false
19
+ requirement: &id001 !ruby/object:Gem::Requirement
20
+ none: false
21
+ requirements:
22
+ - - ">="
23
+ - !ruby/object:Gem::Version
24
+ version: 2.1.1
25
+ type: :development
26
+ version_requirements: *id001
27
+ description: a flexible text-searching library built on top of redis
28
+ email: alecbenzer@gmail.com
29
+ executables: []
30
+
31
+ extensions: []
32
+
33
+ extra_rdoc_files:
34
+ - README.md
35
+ - lib/scripsi.rb
36
+ files:
37
+ - README.md
38
+ - Rakefile
39
+ - lib/scripsi.rb
40
+ - Manifest
41
+ - scripsi.gemspec
42
+ has_rdoc: true
43
+ homepage: https://github.com/alecbenzer/scripsi
44
+ licenses: []
45
+
46
+ post_install_message:
47
+ rdoc_options:
48
+ - --line-numbers
49
+ - --inline-source
50
+ - --title
51
+ - Scripsi
52
+ - --main
53
+ - README.md
54
+ require_paths:
55
+ - lib
56
+ required_ruby_version: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: "0"
62
+ required_rubygems_version: !ruby/object:Gem::Requirement
63
+ none: false
64
+ requirements:
65
+ - - ">="
66
+ - !ruby/object:Gem::Version
67
+ version: "1.2"
68
+ requirements: []
69
+
70
+ rubyforge_project: scripsi
71
+ rubygems_version: 1.5.0
72
+ signing_key:
73
+ specification_version: 3
74
+ summary: a flexible text-searching library built on top of redis
75
+ test_files: []
76
+