picky 0.2.1 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/picky/application.rb +13 -0
- data/lib/picky/index/combined.rb +4 -0
- data/lib/picky/loader.rb +4 -0
- data/lib/picky/query/base.rb +10 -10
- data/lib/picky/results/base.rb +5 -6
- data/lib/picky/tokenizers/base.rb +7 -0
- data/lib/picky/tokenizers/index.rb +1 -3
- data/lib/picky/tokenizers/query.rb +4 -6
- data/lib/picky/umlaut_substituter.rb +16 -3
- data/prototype_project/config.ru +0 -3
- data/spec/ext/performant_spec.rb +2 -11
- data/spec/lib/application_spec.rb +1 -1
- data/spec/lib/cacher/partial/subtoken_spec.rb +3 -3
- data/spec/lib/extensions/hash_spec.rb +1 -1
- data/spec/lib/extensions/symbol_spec.rb +0 -5
- data/spec/lib/index/bundle_partial_generation_speed_spec.rb +1 -7
- data/spec/lib/query/allocations_spec.rb +1 -1
- data/spec/lib/query/combinations_spec.rb +3 -3
- data/spec/lib/query/live_spec.rb +2 -2
- data/spec/lib/results/base_spec.rb +8 -9
- data/spec/lib/umlaut_substituter_spec.rb +5 -3
- data/spec/specific/speed_spec.rb +4 -9
- metadata +3 -3
data/lib/picky/application.rb
CHANGED
@@ -3,6 +3,19 @@
|
|
3
3
|
class Application
|
4
4
|
class << self
|
5
5
|
|
6
|
+
# Finalize the subclass as soon as it
|
7
|
+
# has finished loading.
|
8
|
+
#
|
9
|
+
# Note: finalize finalizes the routes.
|
10
|
+
#
|
11
|
+
def inherited app
|
12
|
+
@apps ||= []
|
13
|
+
@apps << app
|
14
|
+
end
|
15
|
+
def finalize_apps
|
16
|
+
@apps.each &:finalize
|
17
|
+
end
|
18
|
+
|
6
19
|
# An application simply delegates to the routing to handle a request.
|
7
20
|
#
|
8
21
|
def call env
|
data/lib/picky/index/combined.rb
CHANGED
@@ -9,6 +9,8 @@ module Index
|
|
9
9
|
#
|
10
10
|
# TODO Need to use the right subtokens. Bake in?
|
11
11
|
#
|
12
|
+
# TODO One can use it as a wrapper, and it will extract the indexes itself. Rename: ExactFirst.
|
13
|
+
#
|
12
14
|
class Combined < Bundle
|
13
15
|
|
14
16
|
delegate :similar,
|
@@ -25,6 +27,8 @@ module Index
|
|
25
27
|
:load,
|
26
28
|
:to => :@partial
|
27
29
|
|
30
|
+
# TODO initialize type_or_category # => installs itself on all full and partial
|
31
|
+
#
|
28
32
|
def initialize full, partial
|
29
33
|
@full = full
|
30
34
|
@partial = partial
|
data/lib/picky/loader.rb
CHANGED
data/lib/picky/query/base.rb
CHANGED
@@ -32,7 +32,7 @@ module Query
|
|
32
32
|
results = nil
|
33
33
|
|
34
34
|
duration = timed do
|
35
|
-
results = execute(tokens, offset) || empty_results # TODO Does not work yet
|
35
|
+
results = execute(tokens, offset) || empty_results(offset) # TODO Does not work yet
|
36
36
|
end
|
37
37
|
results.duration = duration.round 6
|
38
38
|
|
@@ -42,21 +42,21 @@ module Query
|
|
42
42
|
# Return nil if no results have been found.
|
43
43
|
#
|
44
44
|
def execute tokens, offset
|
45
|
-
results_from sorted_allocations(tokens)
|
45
|
+
results_from offset, sorted_allocations(tokens)
|
46
46
|
end
|
47
|
-
|
47
|
+
|
48
48
|
# Returns an empty result with default values.
|
49
49
|
#
|
50
|
-
def empty_results
|
51
|
-
result_type.new
|
50
|
+
def empty_results offset = 0
|
51
|
+
result_type.new offset
|
52
52
|
end
|
53
|
-
|
53
|
+
|
54
54
|
# Delegates the tokenizing to the query tokenizer.
|
55
55
|
#
|
56
56
|
def tokenized text
|
57
57
|
@tokenizer.tokenize text
|
58
58
|
end
|
59
|
-
|
59
|
+
|
60
60
|
# Gets sorted allocations for the tokens.
|
61
61
|
#
|
62
62
|
# This generates the possible allocations, sorted.
|
@@ -111,9 +111,9 @@ module Query
|
|
111
111
|
#
|
112
112
|
# TODO Move to results. result_type.from allocations, offset
|
113
113
|
#
|
114
|
-
def results_from
|
115
|
-
results = result_type.new allocations
|
116
|
-
results.prepare!
|
114
|
+
def results_from offset = 0, allocations = nil
|
115
|
+
results = result_type.new offset, allocations
|
116
|
+
results.prepare!
|
117
117
|
results
|
118
118
|
end
|
119
119
|
|
data/lib/picky/results/base.rb
CHANGED
@@ -5,16 +5,16 @@ module Results
|
|
5
5
|
#
|
6
6
|
class Base
|
7
7
|
|
8
|
-
# Duration is set by the query.
|
8
|
+
# Duration is set externally by the query.
|
9
9
|
#
|
10
10
|
attr_writer :duration
|
11
11
|
attr_reader :allocations, :offset
|
12
12
|
|
13
13
|
# Takes instances of Query::Allocations as param.
|
14
14
|
#
|
15
|
-
def initialize allocations = nil
|
15
|
+
def initialize offset = 0, allocations = nil
|
16
|
+
@offset = offset
|
16
17
|
@allocations = allocations || Query::Allocations.new
|
17
|
-
@offset = 0
|
18
18
|
end
|
19
19
|
|
20
20
|
def add more_results
|
@@ -49,9 +49,8 @@ module Results
|
|
49
49
|
# Without this, the allocations are not processed,
|
50
50
|
# and no ids are calculated.
|
51
51
|
#
|
52
|
-
def prepare!
|
53
|
-
|
54
|
-
allocations.process! self.max_results, offset
|
52
|
+
def prepare!
|
53
|
+
allocations.process! self.max_results, self.offset
|
55
54
|
end
|
56
55
|
|
57
56
|
# Duration default is 0.
|
@@ -88,6 +88,13 @@ module Tokenizers
|
|
88
88
|
process tokens # processing tokens / strings
|
89
89
|
end
|
90
90
|
|
91
|
+
attr_accessor :substituter
|
92
|
+
alias substituter? substituter
|
93
|
+
|
94
|
+
def initialize substituter = UmlautSubstituter.new
|
95
|
+
@substituter = substituter
|
96
|
+
end
|
97
|
+
|
91
98
|
# Hooks.
|
92
99
|
#
|
93
100
|
|
@@ -5,8 +5,6 @@ module Tokenizers
|
|
5
5
|
#
|
6
6
|
class Index < Base
|
7
7
|
|
8
|
-
include UmlautSubstituter
|
9
|
-
|
10
8
|
# Default handling definitions. Override in config.
|
11
9
|
#
|
12
10
|
removes_characters(//)
|
@@ -26,7 +24,7 @@ module Tokenizers
|
|
26
24
|
# 5. Remove non-single stopwords. (Stopwords that occur with other words)
|
27
25
|
#
|
28
26
|
def preprocess text
|
29
|
-
text =
|
27
|
+
text = substituter.substitute text if substituter?
|
30
28
|
text.downcase!
|
31
29
|
remove_illegals text
|
32
30
|
contract text
|
@@ -13,8 +13,6 @@ module Tokenizers
|
|
13
13
|
#
|
14
14
|
class Query < Base
|
15
15
|
|
16
|
-
include UmlautSubstituter
|
17
|
-
|
18
16
|
# Default query tokenizer behaviour. Override in config.
|
19
17
|
#
|
20
18
|
removes_characters(//)
|
@@ -53,10 +51,10 @@ module Tokenizers
|
|
53
51
|
# TODO Perhaps move to Normalizer?
|
54
52
|
#
|
55
53
|
def normalize text
|
56
|
-
text =
|
57
|
-
text.downcase!
|
58
|
-
normalize_with_patterns text
|
59
|
-
text.to_sym
|
54
|
+
text = substituter.substitute text if substituter? # Substitute special characters TODO Move to subclass
|
55
|
+
text.downcase! # Downcase all text
|
56
|
+
normalize_with_patterns text # normalize
|
57
|
+
text.to_sym # symbolize
|
60
58
|
end
|
61
59
|
|
62
60
|
# Returns a token for a word.
|
@@ -1,8 +1,20 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
#
|
3
|
-
|
4
|
-
|
5
|
-
|
3
|
+
|
4
|
+
# Substitutes certain umlauts, like
|
5
|
+
# ä, ö, ü => ae, oe, ue.
|
6
|
+
# (and more, see specs)
|
7
|
+
#
|
8
|
+
class UmlautSubstituter
|
9
|
+
|
10
|
+
attr_reader :chars
|
11
|
+
|
12
|
+
def initialize
|
13
|
+
@chars = ActiveSupport::Multibyte.proxy_class
|
14
|
+
end
|
15
|
+
|
16
|
+
def substitute text
|
17
|
+
trans = chars.new(text).normalize(:kd)
|
6
18
|
|
7
19
|
# substitute special cases
|
8
20
|
#
|
@@ -18,4 +30,5 @@ module UmlautSubstituter
|
|
18
30
|
cp < 0x0300 || cp > 0x035F
|
19
31
|
}.pack('U*')
|
20
32
|
end
|
33
|
+
|
21
34
|
end
|
data/prototype_project/config.ru
CHANGED
@@ -23,8 +23,6 @@ Indexes.load_from_cache
|
|
23
23
|
# Use Harakiri middleware to kill unicorn child after X requests.
|
24
24
|
#
|
25
25
|
# See http://vimeo.com/12614970 for more info.
|
26
|
-
#
|
27
|
-
# Note: Comment this.
|
28
26
|
#
|
29
27
|
Rack::Harakiri.after = 50
|
30
28
|
use Rack::Harakiri
|
@@ -33,5 +31,4 @@ use Rack::Harakiri
|
|
33
31
|
#
|
34
32
|
# Note: Needs to be the same constant name as in app/application.rb.
|
35
33
|
#
|
36
|
-
PickySearch.finalize
|
37
34
|
run PickySearch
|
data/spec/ext/performant_spec.rb
CHANGED
@@ -3,13 +3,6 @@ require File.dirname(__FILE__) + '/../spec_helper'
|
|
3
3
|
describe Performant::Array do
|
4
4
|
|
5
5
|
describe "memory_efficient_intersect" do
|
6
|
-
before(:each) do
|
7
|
-
GC.disable
|
8
|
-
end
|
9
|
-
after(:each) do
|
10
|
-
GC.enable
|
11
|
-
GC.start
|
12
|
-
end
|
13
6
|
it "should intersect empty arrays correctly" do
|
14
7
|
arys = [[3,4], [1,2,3], []]
|
15
8
|
|
@@ -45,15 +38,13 @@ describe Performant::Array do
|
|
45
38
|
arys = [(1..50).to_a, (10_000..20_000).to_a << 7]
|
46
39
|
|
47
40
|
# brute force
|
48
|
-
|
49
|
-
Performant::Array.memory_efficient_intersect(arys.sort_by(&:size))
|
50
|
-
end.should < 0.001
|
41
|
+
performance_of { Performant::Array.memory_efficient_intersect(arys.sort_by(&:size)) }.should < 0.001
|
51
42
|
end
|
52
43
|
it "should be optimal for 2 small arrays of 50/10_000" do
|
53
44
|
arys = [(1..50).to_a, (10_000..20_000).to_a << 7]
|
54
45
|
|
55
46
|
# &
|
56
|
-
|
47
|
+
performance_of do
|
57
48
|
arys.inject(arys.shift.dup) do |total, ary|
|
58
49
|
total & arys
|
59
50
|
end
|
@@ -24,7 +24,7 @@ describe Application do
|
|
24
24
|
# Note that Picky needs the following characters to
|
25
25
|
# pass through, as they are control characters: *"~:
|
26
26
|
#
|
27
|
-
querying.removes_characters(/[^a-zA-Z0-9
|
27
|
+
querying.removes_characters(/[^a-zA-Z0-9äöü\s\/\-\,\&\"\~\*\:]/)
|
28
28
|
querying.stopwords(/\b(and|the|of|it|in|for)\b/)
|
29
29
|
querying.splits_text_on(/[\s\/\-\,\&]+/)
|
30
30
|
querying.normalizes_words([
|
@@ -28,7 +28,7 @@ describe Cacher::Partial::Subtoken do
|
|
28
28
|
}
|
29
29
|
end
|
30
30
|
it "should be fast" do
|
31
|
-
|
31
|
+
performance_of { @cacher.generate_from( :florian => [1], :flavia => [2] ) }.should < 0.0001
|
32
32
|
end
|
33
33
|
it "should handle duplicate ids" do
|
34
34
|
@cacher.generate_from( :flo => [1], :fla => [1] ).should == {
|
@@ -106,7 +106,7 @@ describe Cacher::Partial::Subtoken do
|
|
106
106
|
end
|
107
107
|
end
|
108
108
|
it "should be fast" do
|
109
|
-
|
109
|
+
performance_of { @cacher.generate_from(@index) }.should < 0.005
|
110
110
|
end
|
111
111
|
end
|
112
112
|
describe "a bigger example with almost identical symbols" do
|
@@ -118,7 +118,7 @@ describe Cacher::Partial::Subtoken do
|
|
118
118
|
end
|
119
119
|
end
|
120
120
|
it "should be fast" do
|
121
|
-
|
121
|
+
performance_of { @cacher.generate_from(@index) }.should < 0.003
|
122
122
|
end
|
123
123
|
end
|
124
124
|
end
|
@@ -29,15 +29,9 @@ describe Index::Bundle do
|
|
29
29
|
random_keys = generate_random_keys 500
|
30
30
|
random_ids = generate_random_ids 500
|
31
31
|
@full.index = Hash[random_keys.zip(random_ids)]
|
32
|
-
|
33
|
-
GC.disable
|
34
|
-
end
|
35
|
-
after(:each) do
|
36
|
-
GC.enable
|
37
|
-
GC.start
|
38
32
|
end
|
39
33
|
it 'should be fast' do
|
40
|
-
|
34
|
+
performance_of do
|
41
35
|
@full.generate_partial
|
42
36
|
end.should < 0.2
|
43
37
|
end
|
@@ -328,7 +328,7 @@ describe Query::Allocations do
|
|
328
328
|
@allocations.total.should == 110
|
329
329
|
end
|
330
330
|
it 'should be fast' do
|
331
|
-
|
331
|
+
performance_of { @allocations.process!(20, 0) }.should < 0.0001
|
332
332
|
end
|
333
333
|
end
|
334
334
|
end
|
@@ -62,21 +62,21 @@ describe 'Query::Combinations' do
|
|
62
62
|
@combination2.should_receive(:ids).once.with.and_return (1..100).to_a
|
63
63
|
@combination3.should_receive(:ids).once.with.and_return (1..10).to_a
|
64
64
|
|
65
|
-
|
65
|
+
performance_of { @combinations.ids }.should < 0.004
|
66
66
|
end
|
67
67
|
it "should be fast" do
|
68
68
|
@combination1.should_receive(:ids).once.with.and_return (1..1000).to_a
|
69
69
|
@combination2.should_receive(:ids).once.with.and_return (1..100).to_a
|
70
70
|
@combination3.should_receive(:ids).once.with.and_return (1..10).to_a
|
71
71
|
|
72
|
-
|
72
|
+
performance_of { @combinations.ids }.should < 0.00015
|
73
73
|
end
|
74
74
|
it "should be fast" do
|
75
75
|
@combination1.should_receive(:ids).once.with.and_return (1..1000).to_a
|
76
76
|
@combination2.should_receive(:ids).once.with.and_return (901..1000).to_a
|
77
77
|
@combination3.should_receive(:ids).once.with.and_return (1..10).to_a
|
78
78
|
|
79
|
-
|
79
|
+
performance_of { @combinations.ids }.should < 0.0001
|
80
80
|
end
|
81
81
|
end
|
82
82
|
|
data/spec/lib/query/live_spec.rb
CHANGED
@@ -31,7 +31,7 @@ describe Query::Live do
|
|
31
31
|
allocations = stub :allocations
|
32
32
|
@query.should_receive(:sorted_allocations).and_return allocations
|
33
33
|
|
34
|
-
@query.should_receive(:results_from).once.with(
|
34
|
+
@query.should_receive(:results_from).once.with(0, allocations).and_return stub(:results, :prepare! => true)
|
35
35
|
|
36
36
|
@query.execute 'some query', 0
|
37
37
|
end
|
@@ -53,7 +53,7 @@ describe Query::Live do
|
|
53
53
|
@query.results_from(@allocations).duration.should == 0
|
54
54
|
end
|
55
55
|
it "should generate a result with the allocations" do
|
56
|
-
@query.results_from(@allocations).allocations.should == @allocations
|
56
|
+
@query.results_from(0, @allocations).allocations.should == @allocations
|
57
57
|
end
|
58
58
|
end
|
59
59
|
end
|
@@ -20,10 +20,9 @@ describe Results do
|
|
20
20
|
@allocations = stub :allocations,
|
21
21
|
:process! => nil, :size => 12
|
22
22
|
|
23
|
-
@results = Results::Base.new @allocations
|
23
|
+
@results = Results::Base.new 1234, @allocations
|
24
24
|
@results.stub! :duration => 0.1234567890,
|
25
|
-
:total => 12345678
|
26
|
-
:offset => 1234
|
25
|
+
:total => 12345678
|
27
26
|
end
|
28
27
|
it 'should output a specific log' do
|
29
28
|
@results.to_log('some_query').should == '|0-08-16 10:07:33|0.123457|some_query |12345678|1234|12|'
|
@@ -46,11 +45,11 @@ describe Results do
|
|
46
45
|
before(:each) do
|
47
46
|
@allocations = stub :allocations, :process! => nil, :to_result => :allocations, :total => :some_total
|
48
47
|
|
49
|
-
@results = Results::Base.new @allocations
|
48
|
+
@results = Results::Base.new :some_offset, @allocations
|
50
49
|
@results.duration = :some_duration
|
51
50
|
end
|
52
51
|
it 'should do it correctly' do
|
53
|
-
@results.prepare!
|
52
|
+
@results.prepare!
|
54
53
|
|
55
54
|
@results.serialize.should == { :allocations => :allocations, :offset => :some_offset, :duration => :some_duration, :total => :some_total }
|
56
55
|
end
|
@@ -139,7 +138,7 @@ describe Results do
|
|
139
138
|
}.should_not raise_error
|
140
139
|
end
|
141
140
|
it 'should set the allocations to an empty array' do
|
142
|
-
Results::Full.new(:some_allocations).instance_variable_get(:@allocations).should == :some_allocations
|
141
|
+
Results::Full.new(:unimportant, :some_allocations).instance_variable_get(:@allocations).should == :some_allocations
|
143
142
|
end
|
144
143
|
end
|
145
144
|
describe 'Live' do
|
@@ -149,7 +148,7 @@ describe Results do
|
|
149
148
|
}.should_not raise_error
|
150
149
|
end
|
151
150
|
it 'should set the allocations to an empty array' do
|
152
|
-
Results::Live.new(:some_allocations).instance_variable_get(:@allocations).should == :some_allocations
|
151
|
+
Results::Live.new(:unimportant, :some_allocations).instance_variable_get(:@allocations).should == :some_allocations
|
153
152
|
end
|
154
153
|
end
|
155
154
|
end
|
@@ -188,7 +187,7 @@ describe Results do
|
|
188
187
|
describe 'Full' do
|
189
188
|
it 'should delegate to allocations.total' do
|
190
189
|
allocations = stub :allocations
|
191
|
-
results = Results::Full.new allocations
|
190
|
+
results = Results::Full.new nil, allocations
|
192
191
|
|
193
192
|
allocations.should_receive(:total).once
|
194
193
|
|
@@ -198,7 +197,7 @@ describe Results do
|
|
198
197
|
describe 'Live' do
|
199
198
|
it 'should delegate to allocations.total' do
|
200
199
|
allocations = stub :allocations
|
201
|
-
results = Results::Live.new allocations
|
200
|
+
results = Results::Live.new nil, allocations
|
202
201
|
|
203
202
|
allocations.should_receive(:total).once
|
204
203
|
|
@@ -2,18 +2,20 @@
|
|
2
2
|
require 'spec_helper'
|
3
3
|
|
4
4
|
describe UmlautSubstituter do
|
5
|
-
|
5
|
+
before(:each) do
|
6
|
+
@substituter = UmlautSubstituter.new
|
7
|
+
end
|
6
8
|
|
7
9
|
# A bit of metaprogramming to help with the myriads of its.
|
8
10
|
#
|
9
11
|
def self.it_should_substitute(special_character, normal_character)
|
10
12
|
it "should substitute #{special_character} with #{normal_character}" do
|
11
|
-
|
13
|
+
@substituter.substitute(special_character).should == normal_character
|
12
14
|
end
|
13
15
|
end
|
14
16
|
def self.it_should_not_substitute(special_character)
|
15
17
|
it "should not substitute #{special_character}" do
|
16
|
-
|
18
|
+
@substituter.substitute(special_character).should == special_character
|
17
19
|
end
|
18
20
|
end
|
19
21
|
|
data/spec/specific/speed_spec.rb
CHANGED
@@ -9,15 +9,10 @@ describe "Speccing Ruby for speed" do
|
|
9
9
|
:speed => (1..5_000).to_a,
|
10
10
|
:test => (1..1_000).to_a
|
11
11
|
}
|
12
|
-
GC.disable
|
13
|
-
end
|
14
|
-
after(:each) do
|
15
|
-
GC.enable
|
16
|
-
GC.start # start the GC to minimize the chance that it will run again during the speed spec
|
17
12
|
end
|
18
13
|
describe "+" do
|
19
14
|
it "should be fast" do
|
20
|
-
|
15
|
+
performance_of do
|
21
16
|
@allocs.inject([]) do |total, alloc|
|
22
17
|
total + @ids[alloc]
|
23
18
|
end
|
@@ -26,14 +21,14 @@ describe "Speccing Ruby for speed" do
|
|
26
21
|
end
|
27
22
|
describe "map and flatten!(1)" do
|
28
23
|
it "should be fast" do
|
29
|
-
|
24
|
+
performance_of do
|
30
25
|
@allocs.map { |alloc| @ids[alloc] }.flatten!(1)
|
31
26
|
end.should < 0.02
|
32
27
|
end
|
33
28
|
end
|
34
29
|
describe "<< and flatten!(1)" do
|
35
30
|
it "should be fast" do
|
36
|
-
|
31
|
+
performance_of do
|
37
32
|
@allocs.inject([]) do |total, alloc|
|
38
33
|
total << @ids[alloc]
|
39
34
|
end.flatten!(1)
|
@@ -42,7 +37,7 @@ describe "Speccing Ruby for speed" do
|
|
42
37
|
end
|
43
38
|
describe "<< and flatten!" do
|
44
39
|
it "should be fast" do
|
45
|
-
|
40
|
+
performance_of do
|
46
41
|
@allocs.inject([]) do |total, alloc|
|
47
42
|
total << @ids[alloc]
|
48
43
|
end.flatten!
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 2
|
8
|
-
- 1
|
9
|
-
version: 0.2.1
|
8
|
+
- 2
|
9
|
+
version: 0.2.2
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Florian Hanke
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2010-10-
|
17
|
+
date: 2010-10-14 00:00:00 +02:00
|
18
18
|
default_executable: picky
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|