picky 0.0.0 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/picky +14 -0
- data/lib/bundling.rb +10 -0
- data/lib/constants.rb +9 -0
- data/lib/deployment.rb +212 -0
- data/lib/picky/application.rb +40 -0
- data/lib/picky/cacher/convenience.rb +3 -0
- data/lib/picky/cacher/generator.rb +17 -0
- data/lib/picky/cacher/partial/default.rb +7 -0
- data/lib/picky/cacher/partial/none.rb +19 -0
- data/lib/picky/cacher/partial/strategy.rb +7 -0
- data/lib/picky/cacher/partial/subtoken.rb +91 -0
- data/lib/picky/cacher/partial_generator.rb +15 -0
- data/lib/picky/cacher/similarity/default.rb +7 -0
- data/lib/picky/cacher/similarity/double_levenshtone.rb +73 -0
- data/lib/picky/cacher/similarity/none.rb +25 -0
- data/lib/picky/cacher/similarity/strategy.rb +7 -0
- data/lib/picky/cacher/similarity_generator.rb +15 -0
- data/lib/picky/cacher/weights/default.rb +7 -0
- data/lib/picky/cacher/weights/logarithmic.rb +39 -0
- data/lib/picky/cacher/weights/strategy.rb +7 -0
- data/lib/picky/cacher/weights_generator.rb +15 -0
- data/lib/picky/configuration/configuration.rb +13 -0
- data/lib/picky/configuration/field.rb +68 -0
- data/lib/picky/configuration/indexes.rb +60 -0
- data/lib/picky/configuration/queries.rb +32 -0
- data/lib/picky/configuration/type.rb +52 -0
- data/lib/picky/cores.rb +101 -0
- data/lib/picky/db/configuration.rb +23 -0
- data/lib/picky/ext/ruby19/extconf.rb +7 -0
- data/lib/picky/ext/ruby19/performant.c +339 -0
- data/lib/picky/extensions/array.rb +45 -0
- data/lib/picky/extensions/hash.rb +11 -0
- data/lib/picky/extensions/module.rb +15 -0
- data/lib/picky/extensions/symbol.rb +18 -0
- data/lib/picky/generator.rb +156 -0
- data/lib/picky/helpers/cache.rb +23 -0
- data/lib/picky/helpers/gc.rb +11 -0
- data/lib/picky/helpers/measuring.rb +45 -0
- data/lib/picky/helpers/search.rb +27 -0
- data/lib/picky/index/bundle.rb +328 -0
- data/lib/picky/index/category.rb +109 -0
- data/lib/picky/index/combined.rb +38 -0
- data/lib/picky/index/type.rb +30 -0
- data/lib/picky/indexers/base.rb +77 -0
- data/lib/picky/indexers/default.rb +3 -0
- data/lib/picky/indexers/field.rb +13 -0
- data/lib/picky/indexers/no_source_specified_error.rb +5 -0
- data/lib/picky/indexers/solr.rb +60 -0
- data/lib/picky/indexes.rb +180 -0
- data/lib/picky/initializers/ext.rb +6 -0
- data/lib/picky/initializers/mysql.rb +22 -0
- data/lib/picky/loader.rb +287 -0
- data/lib/picky/loggers/search.rb +19 -0
- data/lib/picky/performant/array.rb +23 -0
- data/lib/picky/query/allocation.rb +82 -0
- data/lib/picky/query/allocations.rb +131 -0
- data/lib/picky/query/base.rb +124 -0
- data/lib/picky/query/combination.rb +69 -0
- data/lib/picky/query/combinations.rb +106 -0
- data/lib/picky/query/combinator.rb +92 -0
- data/lib/picky/query/full.rb +15 -0
- data/lib/picky/query/live.rb +22 -0
- data/lib/picky/query/qualifiers.rb +73 -0
- data/lib/picky/query/solr.rb +77 -0
- data/lib/picky/query/token.rb +215 -0
- data/lib/picky/query/tokens.rb +102 -0
- data/lib/picky/query/weigher.rb +159 -0
- data/lib/picky/query/weights.rb +55 -0
- data/lib/picky/rack/harakiri.rb +37 -0
- data/lib/picky/results/base.rb +103 -0
- data/lib/picky/results/full.rb +19 -0
- data/lib/picky/results/live.rb +19 -0
- data/lib/picky/routing.rb +165 -0
- data/lib/picky/signals.rb +11 -0
- data/lib/picky/solr/schema_generator.rb +73 -0
- data/lib/picky/sources/base.rb +19 -0
- data/lib/picky/sources/csv.rb +30 -0
- data/lib/picky/sources/db.rb +77 -0
- data/lib/picky/tokenizers/base.rb +130 -0
- data/lib/picky/tokenizers/default.rb +3 -0
- data/lib/picky/tokenizers/index.rb +73 -0
- data/lib/picky/tokenizers/query.rb +70 -0
- data/lib/picky/umlaut_substituter.rb +21 -0
- data/lib/picky-tasks.rb +6 -0
- data/lib/picky.rb +18 -0
- data/lib/tasks/application.rake +5 -0
- data/lib/tasks/cache.rake +53 -0
- data/lib/tasks/framework.rake +4 -0
- data/lib/tasks/index.rake +29 -0
- data/lib/tasks/server.rake +48 -0
- data/lib/tasks/shortcuts.rake +13 -0
- data/lib/tasks/solr.rake +36 -0
- data/lib/tasks/spec.rake +11 -0
- data/lib/tasks/statistics.rake +13 -0
- data/lib/tasks/try.rake +29 -0
- data/prototype_project/Gemfile +23 -0
- data/prototype_project/Rakefile +1 -0
- data/prototype_project/app/README +6 -0
- data/prototype_project/app/application.rb +50 -0
- data/prototype_project/app/application.ru +29 -0
- data/prototype_project/app/db.yml +10 -0
- data/prototype_project/app/logging.rb +20 -0
- data/prototype_project/app/unicorn.ru +10 -0
- data/prototype_project/log/README +1 -0
- data/prototype_project/script/console +34 -0
- data/prototype_project/tmp/README +0 -0
- data/prototype_project/tmp/pids/README +0 -0
- data/spec/ext/performant_spec.rb +64 -0
- data/spec/lib/application_spec.rb +61 -0
- data/spec/lib/cacher/partial/subtoken_spec.rb +89 -0
- data/spec/lib/cacher/partial_generator_spec.rb +35 -0
- data/spec/lib/cacher/similarity/double_levenshtone_spec.rb +60 -0
- data/spec/lib/cacher/similarity/none_spec.rb +23 -0
- data/spec/lib/cacher/similarity_generator_spec.rb +22 -0
- data/spec/lib/cacher/weights/logarithmic_spec.rb +30 -0
- data/spec/lib/cacher/weights_generator_spec.rb +21 -0
- data/spec/lib/configuration/configuration_spec.rb +38 -0
- data/spec/lib/configuration/type_spec.rb +49 -0
- data/spec/lib/configuration_spec.rb +8 -0
- data/spec/lib/cores_spec.rb +65 -0
- data/spec/lib/extensions/array_spec.rb +37 -0
- data/spec/lib/extensions/hash_spec.rb +11 -0
- data/spec/lib/extensions/module_spec.rb +27 -0
- data/spec/lib/extensions/symbol_spec.rb +85 -0
- data/spec/lib/generator_spec.rb +135 -0
- data/spec/lib/helpers/cache_spec.rb +35 -0
- data/spec/lib/helpers/gc_spec.rb +71 -0
- data/spec/lib/helpers/measuring_spec.rb +18 -0
- data/spec/lib/helpers/search_spec.rb +50 -0
- data/spec/lib/index/bundle_partial_generation_speed_spec.rb +47 -0
- data/spec/lib/index/bundle_spec.rb +260 -0
- data/spec/lib/index/category_spec.rb +203 -0
- data/spec/lib/indexers/base_spec.rb +73 -0
- data/spec/lib/indexers/field_spec.rb +20 -0
- data/spec/lib/loader_spec.rb +48 -0
- data/spec/lib/loggers/search_spec.rb +19 -0
- data/spec/lib/performant/array_spec.rb +13 -0
- data/spec/lib/query/allocation_spec.rb +194 -0
- data/spec/lib/query/allocations_spec.rb +336 -0
- data/spec/lib/query/base_spec.rb +104 -0
- data/spec/lib/query/combination_spec.rb +90 -0
- data/spec/lib/query/combinations_spec.rb +83 -0
- data/spec/lib/query/combinator_spec.rb +112 -0
- data/spec/lib/query/full_spec.rb +22 -0
- data/spec/lib/query/live_spec.rb +61 -0
- data/spec/lib/query/qualifiers_spec.rb +31 -0
- data/spec/lib/query/solr_spec.rb +51 -0
- data/spec/lib/query/token_spec.rb +297 -0
- data/spec/lib/query/tokens_spec.rb +189 -0
- data/spec/lib/query/weights_spec.rb +47 -0
- data/spec/lib/results/base_spec.rb +233 -0
- data/spec/lib/routing_spec.rb +318 -0
- data/spec/lib/solr/schema_generator_spec.rb +42 -0
- data/spec/lib/sources/db_spec.rb +91 -0
- data/spec/lib/tokenizers/base_spec.rb +61 -0
- data/spec/lib/tokenizers/index_spec.rb +51 -0
- data/spec/lib/tokenizers/query_spec.rb +105 -0
- data/spec/lib/umlaut_substituter_spec.rb +84 -0
- data/spec/specific/speed_spec.rb +55 -0
- metadata +371 -15
- data/README.textile +0 -9
data/bin/picky
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
#
|
|
3
|
+
|
|
4
|
+
# Load the generator. If it cannot be required directly, fall back to
# rubygems and the lib directory that sits next to this script.
#
begin
  require 'picky/generator'
rescue LoadError
  require 'rubygems'
  lib_dir = File.expand_path('../../lib', __FILE__)
  if File.directory?(lib_dir) && !$:.include?(lib_dir)
    $:.unshift(lib_dir)
  end
  require 'picky/generator'
end

# Hand the command line arguments to a fresh generator.
#
Picky::Generator.new.generate ARGV
|
data/lib/bundling.rb
ADDED
data/lib/constants.rb
ADDED
data/lib/deployment.rb
ADDED
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
require File.expand_path(File.join(File.dirname(__FILE__), 'constants'))
|
|
2
|
+
|
|
3
|
+
module Picky
|
|
4
|
+
module Capistrano
|
|
5
|
+
|
|
6
|
+
# Include all
|
|
7
|
+
#
|
|
8
|
+
# Umbrella module: extending a Capistrano configuration with it defines the
# execute_rake_task helper on that configuration and then extends the
# configuration with Standard, Deploy, Caching and Overrides, in that order.
#
module All
  def self.extended cap_config

    cap_config.instance_eval do

      # Runs the named rake task from within current_path.
      #
      # Options:
      #   * env: Value used for SEARCH_ENV. Pass false to leave SEARCH_ENV
      #     off the command line. Default: production.
      #   * Every remaining option is handed on to run.
      #
      # Note: the :env key is removed from the given options hash.
      #
      def execute_rake_task name, options = {}, &block
        search_env = options.delete :env
        env_setting = if search_env == false
          ''
        else
          "SEARCH_ENV=#{search_env || 'production'}"
        end
        run "cd #{current_path}; rake #{name} #{env_setting}", options, &block
      end

    end

    # The order matters: it is the order in which the modules get extended.
    #
    [Standard, Deploy, Caching, Overrides].each do |recipes|
      cap_config.extend recipes
    end

  end
end
|
|
34
|
+
|
|
35
|
+
# Removes unneeded Rails defaults.
|
|
36
|
+
#
|
|
37
|
+
# Extending a Capistrano configuration with this module deletes a fixed list
# of tasks from the :deploy namespace and from its nested :web namespace.
#
module Overrides
  def self.extended cap_config
    cap_config.instance_eval do

      namespace :deploy do
        # Tasks dropped from deploy itself.
        #
        [:check, :cold, :migrations, :migrate, :upload].each do |unneeded|
          tasks.delete unneeded
        end

        # Tasks dropped from deploy:web.
        #
        namespace :web do
          [:enable, :disable].each do |unneeded|
            tasks.delete unneeded
          end
        end
      end

    end
  end
end
|
|
57
|
+
|
|
58
|
+
# Extending a Capistrano configuration with this module makes it load the
# 'standard' recipes first and the 'deploy' recipes second.
#
module Standard
  def self.extended cap_config
    %w(standard deploy).each { |recipe| cap_config.load recipe }
  end
end
|
|
64
|
+
|
|
65
|
+
# Extending a Capistrano configuration with this module defines the tasks of
# the :deploy namespace (start, stop, restart, hot, setup, default,
# update_code, cleanup, symlink) and of its nested :rollback namespace.
#
module Deploy

  def self.extended cap_config
    cap_config.instance_eval do

      namespace :deploy do

        # Start and stop just run the rake task of the same name.
        #
        [:start, :stop].each do |action|
          desc "#{action} the Servers"
          task action, :roles => :app do
            execute_rake_task "server:#{action}"
          end
        end

        # One execute_rake_task call per app server, one after the other.
        #
        desc "Restart the Servers sequentially"
        task :restart, :roles => :app do
          app_servers = find_servers :roles => :app
          app_servers.each do |app_server|
            execute_rake_task "server:restart", :hosts => app_server.host
          end
        end

        desc 'Hot deploy the code'
        task 'hot', :roles => :app do
          update
          execute_rake_task 'server:usr1', :env => false # No env needed.
        end

        # Clones the repository next to current_path, removes current_path
        # and moves the clone into its place.
        #
        desc "Setup a GitHub-style deployment."
        task :setup, :roles => :app do
          steps = [
            "git clone #{repository} #{current_path}-clone-cache &&",
            "rm #{current_path} &&",
            "mv #{current_path}-clone-cache #{current_path}"
          ]
          run steps.join
        end

        desc "Deploy"
        task :default, :roles => :app do
          update
          restart
        end

        # Fetches, checks out and pulls the branch inside current_path,
        # then recreates the symlinks.
        #
        desc "Update the deployed code."
        task :update_code do # code needs to be updated with all servers
          puts "updating code to branch #{branch}"
          steps = [
            "cd #{current_path} &&",
            "git fetch origin &&",
            "(git checkout -f #{branch} || git checkout -b #{branch} origin/#{branch}) &&",
            "git pull;",
            "git branch"
          ]
          run steps.join
          symlink
        end

        desc "Cleans up the git checkout"
        task :cleanup, :roles => :app do
          run "cd #{current_path} && git gc --aggressive"
        end

        desc "create the symlinks to the shared dirs"
        task :symlink do
          set :user, 'deploy'
          # Replace the log and index directories by links into shared_path.
          #
          %w(log index).each do |dir|
            run "rm -rf #{current_path}/#{dir}; ln -sf #{shared_path}/#{dir} #{current_path}/#{dir}"
          end
          # link database-config files
          #
          %w(base.yml source.yml).each do |db_config|
            run "ln -sf #{shared_path}/config/#{db_config} #{current_path}/config/db/#{db_config}"
          end
          # link unicorn.ru
          #
          run "ln -sf #{shared_path}/config/unicorn.ru #{current_path}/config/unicorn.ru" # TODO change path
        end

        namespace :rollback do
          # Sets the branch to the second to last entry of branches and
          # updates the code to it.
          #
          desc "Rollback to last release."
          task :default, :roles => :app do
            set :branch, branches[-2]
            puts "rolling back to branch #{branch}"
            deploy.update_code
          end

          task :code, :roles => :app do
            # implicit
          end
        end

      end

    end
  end

end
|
|
151
|
+
|
|
152
|
+
# Extending a Capistrano configuration with this module defines the tasks of
# the :cache namespace (check, structure:create) and of the :solr namespace
# (index, start, stop, restart). Each task runs the rake task of the same
# name through execute_rake_task.
#
module Caching

  def self.extended cap_config
    cap_config.instance_eval do

      namespace :cache do
        desc "check the index files if they are ready to be used"
        task :check, :roles => :cache do
          execute_rake_task 'cache:check'
        end

        namespace :structure do
          desc "create the index cache structure"
          task :create, :roles => :app do
            execute_rake_task 'cache:structure:create'
          end
        end
      end

      namespace :solr do
        desc "create the index cache structure"
        task :index, :roles => :cache do
          execute_rake_task 'solr:index'
        end

        %w|start stop restart|.collect(&:to_sym).each do |action|
          desc "#{action} the solr server"
          task action, :roles => :app do
            # Run the rake task matching the action. Previously every action
            # ran 'solr:start', so stop and restart started the server.
            #
            execute_rake_task "solr:#{action}"
          end
        end
      end

    end
  end

end
|
|
186
|
+
|
|
187
|
+
# Extending a Capistrano configuration with this module defines the start,
# stop and restart tasks of the :statistics namespace.
#
module Statistics

  def self.extended cap_config
    # The tasks have to be defined on the configuration being extended, as in
    # the sibling modules. Without instance_eval, namespace/task/desc would
    # be sent to this module instead of to cap_config.
    #
    cap_config.instance_eval do

      namespace :statistics do
        desc 'Start the statistics server'
        task :start, :roles => :statistics do
          set :user, 'root'
          run "daemonize -c #{current_path} -u deploy -v #{current_path}/script/statistics/start production"
        end

        desc 'Stop the statistics server'
        task :stop, :roles => :statistics do
          run "#{current_path}/script/statistics/stop production"
        end

        # Stops, waits two seconds, then starts again.
        #
        desc 'Restart the statistics server'
        task :restart, :roles => :statistics do
          stop
          sleep 2
          start
        end
      end

    end
  end

end
|
|
210
|
+
|
|
211
|
+
end
|
|
212
|
+
end
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# Class-level entry point: holds one memoized Routing, which handles every
# call, plus one indexes configuration and one queries configuration that
# are filled in through the indexes and queries blocks.
#
class Application

  class << self

    # The routing that requests are delegated to. Created on first use.
    #
    def routing
      @routing ||= Routing.new
    end

    # Passes env to the routing and returns what the routing returns.
    #
    def call env
      routing.call env
    end

    # Evaluates the block in the context of the indexes configuration, hands
    # that configuration to ::Indexes and then calls ::Indexes.setup.
    #
    # TODO Multiple indexes?
    #
    def indexes &block
      indexes_configuration.instance_eval(&block)
      # TODO Uglyyyyyy.
      ::Indexes.configuration = indexes_configuration
      ::Indexes.setup # TODO Think about setup/reload.
    end

    # The current indexes configuration. A new one is created if none exists.
    #
    def indexes_configuration
      return @indexes if @indexes
      reset_indexes
    end

    # Replaces the indexes configuration with a new one and returns it.
    #
    def reset_indexes
      @indexes = Configuration::Indexes.new # Is instance a problem?
    end

    # Evaluates the block in the context of the queries configuration and
    # freezes the routing afterwards.
    #
    # TODO Multiple Queries?
    #
    def queries &block
      queries_configuration.instance_eval(&block)
      routing.freeze
    end

    # The current queries configuration. A new one is created if none exists.
    #
    def queries_configuration
      return @queries if @queries
      reset_queries
    end

    # Replaces the queries configuration with a new one, built around the
    # routing, and returns it.
    #
    def reset_queries
      @queries = Configuration::Queries.new routing # Is instance a problem?
    end

  end

end
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
module Cacher
|
|
2
|
+
|
|
3
|
+
module Partial
|
|
4
|
+
|
|
5
|
+
# The subtoken partial strategy.
|
|
6
|
+
#
|
|
7
|
+
# If given
|
|
8
|
+
# "florian"
|
|
9
|
+
# will index
|
|
10
|
+
# "floria"
|
|
11
|
+
# "flori"
|
|
12
|
+
# "flor"
|
|
13
|
+
# "flo"
|
|
14
|
+
# "fl"
|
|
15
|
+
# "f"
|
|
16
|
+
# Depending on what the given down_to value is. (Example with down_to == 1)
|
|
17
|
+
#
|
|
18
|
+
# Partial strategy that maps every subtoken of an indexed token to the ids
# of that token.
#
class Subtoken < Strategy

  attr_reader :down_to, :starting_at

  # Options:
  #   * down_to: Passed to subtokens when generating the subtokens of a
  #     token. Default: 1.
  #   * starting_at: Where to clip a token before generating its subtokens.
  #     Zero (the default) leaves the token as it is. Any other value is
  #     stored reduced by one and used as the end index of token[0..index].
  #
  # Examples:
  #   With :hello, and starting_at 0
  #     * down to == 1: [:hello, :hell, :hel, :he, :h]
  #     * down to == 4: [:hello, :hell]
  #
  #   With :hello, and starting_at -1
  #     * down to == 1: [:hell, :hel, :he, :h]
  #     * down to == 4: [:hell]
  #
  def initialize options = {}
    @down_to = options[:down_to] || 1
    starting_at = options[:starting_at] || 0
    @starting_at = starting_at.zero? ? 0 : starting_at - 1
  end

  # Generates a partial index from the given index.
  #
  # Returns a new hash of subtoken => ids. The given index is not modified.
  #
  def generate_from index
    result = {}

    # Generate for each key token the subtokens.
    # A progress line is printed every 5000 tokens.
    #
    i = 5000
    index.each_key do |token|
      i -= 1
      if i == 0
        puts "#{Time.now}: Generating partial tokens for token #{token}. This appears every 5000 tokens."
        i = 5000
      end
      generate_for token, index, result
    end

    # Remove duplicate ids. Several tokens can share a subtoken, so the
    # merged id lists may contain the same id more than once.
    #
    # TODO If it is unique for a subtoken, it is
    #      unique for all derived longer tokens.
    #
    result.each_value(&:uniq!)

    result
  end

  private

    # To each shortened token of :test
    #   :test, :tes, :te, :t
    # add all ids of :test
    #
    # "token" here means just text.
    #
    # TODO Could be improved by appending the aforegoing ids?
    #
    def generate_for token, index, result
      clipped_token = starting_at.zero? ? token : token[0..starting_at].to_sym
      ids = index[token]
      clipped_token.subtokens(down_to).each do |subtoken|
        if result[subtoken]
          # The stored array is a private copy (see the dup below), so it
          # can be extended in place instead of being reallocated by +=.
          #
          result[subtoken].concat ids
        else
          result[subtoken] = ids.dup
        end
      end
    end

end
|
|
88
|
+
|
|
89
|
+
end
|
|
90
|
+
|
|
91
|
+
end
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
module Cacher
|
|
2
|
+
|
|
3
|
+
# The partial generator uses a subtoken(downto:1) generator as default.
|
|
4
|
+
#
|
|
5
|
+
# Generator for partial indexes.
#
class PartialGenerator < Generator

  # Generate a partial index based on the given index.
  #
  # The strategy does the work; without an explicit strategy, a subtoken
  # strategy with down_to 1 is used.
  #
  def generate(strategy = Partial::Subtoken.new(:down_to => 1))
    strategy.generate_from(self.index)
  end

end
|
|
14
|
+
|
|
15
|
+
end
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# encoding: utf-8
|
|
2
|
+
#
|
|
3
|
+
module Cacher
|
|
4
|
+
|
|
5
|
+
module Similarity
|
|
6
|
+
|
|
7
|
+
# DoubleLevenshtone means that it's a combination of
|
|
8
|
+
# * DoubleMetaphone
|
|
9
|
+
# and
|
|
10
|
+
# * Levenshtein
|
|
11
|
+
# :)
|
|
12
|
+
#
|
|
13
|
+
# Similarity strategy that groups tokens by their double metaphone code and
# orders each group with sort_by_levenshtein!.
#
class DoubleLevenshtone < Strategy

  attr_reader :amount

  # amount is the maximum number of tokens kept per code. Default: 10.
  #
  def initialize amount = 10
    @amount = amount
  end

  # Encodes the given symbol.
  #
  # Returns the first double metaphone code as a symbol, or nil if the
  # encoder returned no codes.
  #
  def encoded sym
    codes = Text::Metaphone.double_metaphone sym.to_s
    return if codes.empty?
    codes.first.to_sym
  end

  # Generates an index for the given index (in full index style).
  #
  # In the following form:
  # [:meier, :mueller, :peter, :pater] => { :MR => [:meier], :MLR => [:mueller], :PTR => [:peter, :pater] }
  #
  def generate_from index
    sort hashify(index.keys)
  end

  private

    # Sorts the index values in place and cuts each of them down to
    # amount entries. Returns the index.
    #
    def sort index
      index.each do |code, similars|
        similars.sort_by_levenshtein! code
        similars.slice! amount, similars.size # size is not perfectly correct, but anyway
      end
      index
    end

    # Hashifies a list of symbols.
    #
    # Where:
    # { encoded_sym => [syms] }
    #
    # Symbols without a code are left out.
    #
    def hashify list
      list.each_with_object({}) do |element, total|
        code = encoded element
        (total[code] ||= []) << element if code
      end
    end

end
|
|
70
|
+
|
|
71
|
+
end
|
|
72
|
+
|
|
73
|
+
end
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
module Cacher
|
|
2
|
+
|
|
3
|
+
module Similarity
|
|
4
|
+
|
|
5
|
+
# Similarity strategy that does nothing.
|
|
6
|
+
#
|
|
7
|
+
# Similarity strategy that does nothing.
#
class None < Strategy

  # Does not encode text. Always returns nil.
  #
  def encoded(text)
  end

  # Ignores the given index and returns a new, empty one.
  #
  def generate_from(index)
    Hash.new
  end

end
|
|
22
|
+
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
end
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
module Cacher
|
|
2
|
+
|
|
3
|
+
# Uses no similarity as default.
|
|
4
|
+
#
|
|
5
|
+
# Generator for similarity indexes.
#
class SimilarityGenerator < Generator

  # Generate a similarity index based on the given index.
  #
  # The strategy does the work; without an explicit strategy, the None
  # similarity strategy is used.
  #
  def generate(strategy = Similarity::None.new)
    strategy.generate_from(self.index)
  end

end
|
|
14
|
+
|
|
15
|
+
end
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
module Cacher
|
|
2
|
+
|
|
3
|
+
module Weights
|
|
4
|
+
|
|
5
|
+
# Uses a logarithmic weight.
|
|
6
|
+
# If for a key k we have x ids, the weight is:
|
|
7
|
+
# w(x): log(x)
|
|
8
|
+
# Special case: If x < 1, then we use 0.
|
|
9
|
+
#
|
|
10
|
+
# Weights strategy using the natural logarithm of the number of ids.
#
class Logarithmic < Strategy

  # Generates a weights index from the given index.
  #
  # Returns a hash of text => weight, where the weight is weight_for the
  # number of ids of that text, rounded to two digits.
  #
  def generate_from index
    weights = {}
    index.each do |text, ids|
      weight = weight_for ids.size
      weights[text] ||= weight.round(2) if weight
    end
    weights
  end

  # Calculates the weight for the given amount of ids.
  #
  # For amounts below 1 the logarithm is not used; 0 is returned instead.
  # An amount of exactly 1 yields 0.0.
  #
  # The value is needed, even if 0. To designate that there is a weight!
  #
  def weight_for amount
    amount < 1 ? 0 : Math.log(amount)
  end

end
|
|
36
|
+
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
end
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
module Cacher
|
|
2
|
+
|
|
3
|
+
# Uses a logarithmic algorithm as default.
|
|
4
|
+
#
|
|
5
|
+
# Generator for weights indexes.
#
class WeightsGenerator < Generator

  # Generate a weights index based on the given index.
  #
  # The strategy does the work; without an explicit strategy, the
  # logarithmic weights strategy is used.
  #
  def generate(strategy = Weights::Logarithmic.new)
    strategy.generate_from(self.index)
  end

end
|
|
14
|
+
|
|
15
|
+
end
|