picky 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/picky/application.rb +13 -0
- data/lib/picky/index/combined.rb +4 -0
- data/lib/picky/loader.rb +4 -0
- data/lib/picky/query/base.rb +10 -10
- data/lib/picky/results/base.rb +5 -6
- data/lib/picky/tokenizers/base.rb +7 -0
- data/lib/picky/tokenizers/index.rb +1 -3
- data/lib/picky/tokenizers/query.rb +4 -6
- data/lib/picky/umlaut_substituter.rb +16 -3
- data/prototype_project/config.ru +0 -3
- data/spec/ext/performant_spec.rb +2 -11
- data/spec/lib/application_spec.rb +1 -1
- data/spec/lib/cacher/partial/subtoken_spec.rb +3 -3
- data/spec/lib/extensions/hash_spec.rb +1 -1
- data/spec/lib/extensions/symbol_spec.rb +0 -5
- data/spec/lib/index/bundle_partial_generation_speed_spec.rb +1 -7
- data/spec/lib/query/allocations_spec.rb +1 -1
- data/spec/lib/query/combinations_spec.rb +3 -3
- data/spec/lib/query/live_spec.rb +2 -2
- data/spec/lib/results/base_spec.rb +8 -9
- data/spec/lib/umlaut_substituter_spec.rb +5 -3
- data/spec/specific/speed_spec.rb +4 -9
- metadata +3 -3
data/lib/picky/application.rb
CHANGED
@@ -3,6 +3,19 @@
|
|
3
3
|
class Application
|
4
4
|
class << self
|
5
5
|
|
6
|
+
# Finalize the subclass as soon as it
|
7
|
+
# has finished loading.
|
8
|
+
#
|
9
|
+
# Note: finalize finalizes the routes.
|
10
|
+
#
|
11
|
+
def inherited app
|
12
|
+
@apps ||= []
|
13
|
+
@apps << app
|
14
|
+
end
|
15
|
+
def finalize_apps
|
16
|
+
@apps.each &:finalize
|
17
|
+
end
|
18
|
+
|
6
19
|
# An application simply delegates to the routing to handle a request.
|
7
20
|
#
|
8
21
|
def call env
|
data/lib/picky/index/combined.rb
CHANGED
@@ -9,6 +9,8 @@ module Index
|
|
9
9
|
#
|
10
10
|
# TODO Need to use the right subtokens. Bake in?
|
11
11
|
#
|
12
|
+
# TODO One can use it as a wrapper, and it will extract the indexes itself. Rename: ExactFirst.
|
13
|
+
#
|
12
14
|
class Combined < Bundle
|
13
15
|
|
14
16
|
delegate :similar,
|
@@ -25,6 +27,8 @@ module Index
|
|
25
27
|
:load,
|
26
28
|
:to => :@partial
|
27
29
|
|
30
|
+
# TODO initialize type_or_category # => installs itself on all full and partial
|
31
|
+
#
|
28
32
|
def initialize full, partial
|
29
33
|
@full = full
|
30
34
|
@partial = partial
|
data/lib/picky/loader.rb
CHANGED
data/lib/picky/query/base.rb
CHANGED
@@ -32,7 +32,7 @@ module Query
|
|
32
32
|
results = nil
|
33
33
|
|
34
34
|
duration = timed do
|
35
|
-
results = execute(tokens, offset) || empty_results # TODO Does not work yet
|
35
|
+
results = execute(tokens, offset) || empty_results(offset) # TODO Does not work yet
|
36
36
|
end
|
37
37
|
results.duration = duration.round 6
|
38
38
|
|
@@ -42,21 +42,21 @@ module Query
|
|
42
42
|
# Return nil if no results have been found.
|
43
43
|
#
|
44
44
|
def execute tokens, offset
|
45
|
-
results_from sorted_allocations(tokens)
|
45
|
+
results_from offset, sorted_allocations(tokens)
|
46
46
|
end
|
47
|
-
|
47
|
+
|
48
48
|
# Returns an empty result with default values.
|
49
49
|
#
|
50
|
-
def empty_results
|
51
|
-
result_type.new
|
50
|
+
def empty_results offset = 0
|
51
|
+
result_type.new offset
|
52
52
|
end
|
53
|
-
|
53
|
+
|
54
54
|
# Delegates the tokenizing to the query tokenizer.
|
55
55
|
#
|
56
56
|
def tokenized text
|
57
57
|
@tokenizer.tokenize text
|
58
58
|
end
|
59
|
-
|
59
|
+
|
60
60
|
# Gets sorted allocations for the tokens.
|
61
61
|
#
|
62
62
|
# This generates the possible allocations, sorted.
|
@@ -111,9 +111,9 @@ module Query
|
|
111
111
|
#
|
112
112
|
# TODO Move to results. result_type.from allocations, offset
|
113
113
|
#
|
114
|
-
def results_from
|
115
|
-
results = result_type.new allocations
|
116
|
-
results.prepare!
|
114
|
+
def results_from offset = 0, allocations = nil
|
115
|
+
results = result_type.new offset, allocations
|
116
|
+
results.prepare!
|
117
117
|
results
|
118
118
|
end
|
119
119
|
|
data/lib/picky/results/base.rb
CHANGED
@@ -5,16 +5,16 @@ module Results
|
|
5
5
|
#
|
6
6
|
class Base
|
7
7
|
|
8
|
-
# Duration is set by the query.
|
8
|
+
# Duration is set externally by the query.
|
9
9
|
#
|
10
10
|
attr_writer :duration
|
11
11
|
attr_reader :allocations, :offset
|
12
12
|
|
13
13
|
# Takes instances of Query::Allocations as param.
|
14
14
|
#
|
15
|
-
def initialize allocations = nil
|
15
|
+
def initialize offset = 0, allocations = nil
|
16
|
+
@offset = offset
|
16
17
|
@allocations = allocations || Query::Allocations.new
|
17
|
-
@offset = 0
|
18
18
|
end
|
19
19
|
|
20
20
|
def add more_results
|
@@ -49,9 +49,8 @@ module Results
|
|
49
49
|
# Without this, the allocations are not processed,
|
50
50
|
# and no ids are calculated.
|
51
51
|
#
|
52
|
-
def prepare!
|
53
|
-
|
54
|
-
allocations.process! self.max_results, offset
|
52
|
+
def prepare!
|
53
|
+
allocations.process! self.max_results, self.offset
|
55
54
|
end
|
56
55
|
|
57
56
|
# Duration default is 0.
|
@@ -88,6 +88,13 @@ module Tokenizers
|
|
88
88
|
process tokens # processing tokens / strings
|
89
89
|
end
|
90
90
|
|
91
|
+
attr_accessor :substituter
|
92
|
+
alias substituter? substituter
|
93
|
+
|
94
|
+
def initialize substituter = UmlautSubstituter.new
|
95
|
+
@substituter = substituter
|
96
|
+
end
|
97
|
+
|
91
98
|
# Hooks.
|
92
99
|
#
|
93
100
|
|
@@ -5,8 +5,6 @@ module Tokenizers
|
|
5
5
|
#
|
6
6
|
class Index < Base
|
7
7
|
|
8
|
-
include UmlautSubstituter
|
9
|
-
|
10
8
|
# Default handling definitions. Override in config.
|
11
9
|
#
|
12
10
|
removes_characters(//)
|
@@ -26,7 +24,7 @@ module Tokenizers
|
|
26
24
|
# 5. Remove non-single stopwords. (Stopwords that occur with other words)
|
27
25
|
#
|
28
26
|
def preprocess text
|
29
|
-
text =
|
27
|
+
text = substituter.substitute text if substituter?
|
30
28
|
text.downcase!
|
31
29
|
remove_illegals text
|
32
30
|
contract text
|
@@ -13,8 +13,6 @@ module Tokenizers
|
|
13
13
|
#
|
14
14
|
class Query < Base
|
15
15
|
|
16
|
-
include UmlautSubstituter
|
17
|
-
|
18
16
|
# Default query tokenizer behaviour. Override in config.
|
19
17
|
#
|
20
18
|
removes_characters(//)
|
@@ -53,10 +51,10 @@ module Tokenizers
|
|
53
51
|
# TODO Perhaps move to Normalizer?
|
54
52
|
#
|
55
53
|
def normalize text
|
56
|
-
text =
|
57
|
-
text.downcase!
|
58
|
-
normalize_with_patterns text
|
59
|
-
text.to_sym
|
54
|
+
text = substituter.substitute text if substituter? # Substitute special characters TODO Move to subclass
|
55
|
+
text.downcase! # Downcase all text
|
56
|
+
normalize_with_patterns text # normalize
|
57
|
+
text.to_sym # symbolize
|
60
58
|
end
|
61
59
|
|
62
60
|
# Returns a token for a word.
|
@@ -1,8 +1,20 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
#
|
3
|
-
|
4
|
-
|
5
|
-
|
3
|
+
|
4
|
+
# Substitutes certain umlauts, like
|
5
|
+
# ä, ö, ü => ae, oe, ue.
|
6
|
+
# (and more, see specs)
|
7
|
+
#
|
8
|
+
class UmlautSubstituter
|
9
|
+
|
10
|
+
attr_reader :chars
|
11
|
+
|
12
|
+
def initialize
|
13
|
+
@chars = ActiveSupport::Multibyte.proxy_class
|
14
|
+
end
|
15
|
+
|
16
|
+
def substitute text
|
17
|
+
trans = chars.new(text).normalize(:kd)
|
6
18
|
|
7
19
|
# substitute special cases
|
8
20
|
#
|
@@ -18,4 +30,5 @@ module UmlautSubstituter
|
|
18
30
|
cp < 0x0300 || cp > 0x035F
|
19
31
|
}.pack('U*')
|
20
32
|
end
|
33
|
+
|
21
34
|
end
|
data/prototype_project/config.ru
CHANGED
@@ -23,8 +23,6 @@ Indexes.load_from_cache
|
|
23
23
|
# Use Harakiri middleware to kill unicorn child after X requests.
|
24
24
|
#
|
25
25
|
# See http://vimeo.com/12614970 for more info.
|
26
|
-
#
|
27
|
-
# Note: Comment this.
|
28
26
|
#
|
29
27
|
Rack::Harakiri.after = 50
|
30
28
|
use Rack::Harakiri
|
@@ -33,5 +31,4 @@ use Rack::Harakiri
|
|
33
31
|
#
|
34
32
|
# Note: Needs to be the same constant name as in app/application.rb.
|
35
33
|
#
|
36
|
-
PickySearch.finalize
|
37
34
|
run PickySearch
|
data/spec/ext/performant_spec.rb
CHANGED
@@ -3,13 +3,6 @@ require File.dirname(__FILE__) + '/../spec_helper'
|
|
3
3
|
describe Performant::Array do
|
4
4
|
|
5
5
|
describe "memory_efficient_intersect" do
|
6
|
-
before(:each) do
|
7
|
-
GC.disable
|
8
|
-
end
|
9
|
-
after(:each) do
|
10
|
-
GC.enable
|
11
|
-
GC.start
|
12
|
-
end
|
13
6
|
it "should intersect empty arrays correctly" do
|
14
7
|
arys = [[3,4], [1,2,3], []]
|
15
8
|
|
@@ -45,15 +38,13 @@ describe Performant::Array do
|
|
45
38
|
arys = [(1..50).to_a, (10_000..20_000).to_a << 7]
|
46
39
|
|
47
40
|
# brute force
|
48
|
-
|
49
|
-
Performant::Array.memory_efficient_intersect(arys.sort_by(&:size))
|
50
|
-
end.should < 0.001
|
41
|
+
performance_of { Performant::Array.memory_efficient_intersect(arys.sort_by(&:size)) }.should < 0.001
|
51
42
|
end
|
52
43
|
it "should be optimal for 2 small arrays of 50/10_000" do
|
53
44
|
arys = [(1..50).to_a, (10_000..20_000).to_a << 7]
|
54
45
|
|
55
46
|
# &
|
56
|
-
|
47
|
+
performance_of do
|
57
48
|
arys.inject(arys.shift.dup) do |total, ary|
|
58
49
|
total & arys
|
59
50
|
end
|
@@ -24,7 +24,7 @@ describe Application do
|
|
24
24
|
# Note that Picky needs the following characters to
|
25
25
|
# pass through, as they are control characters: *"~:
|
26
26
|
#
|
27
|
-
querying.removes_characters(/[^a-zA-Z0-9
|
27
|
+
querying.removes_characters(/[^a-zA-Z0-9äöü\s\/\-\,\&\"\~\*\:]/)
|
28
28
|
querying.stopwords(/\b(and|the|of|it|in|for)\b/)
|
29
29
|
querying.splits_text_on(/[\s\/\-\,\&]+/)
|
30
30
|
querying.normalizes_words([
|
@@ -28,7 +28,7 @@ describe Cacher::Partial::Subtoken do
|
|
28
28
|
}
|
29
29
|
end
|
30
30
|
it "should be fast" do
|
31
|
-
|
31
|
+
performance_of { @cacher.generate_from( :florian => [1], :flavia => [2] ) }.should < 0.0001
|
32
32
|
end
|
33
33
|
it "should handle duplicate ids" do
|
34
34
|
@cacher.generate_from( :flo => [1], :fla => [1] ).should == {
|
@@ -106,7 +106,7 @@ describe Cacher::Partial::Subtoken do
|
|
106
106
|
end
|
107
107
|
end
|
108
108
|
it "should be fast" do
|
109
|
-
|
109
|
+
performance_of { @cacher.generate_from(@index) }.should < 0.005
|
110
110
|
end
|
111
111
|
end
|
112
112
|
describe "a bigger example with almost identical symbols" do
|
@@ -118,7 +118,7 @@ describe Cacher::Partial::Subtoken do
|
|
118
118
|
end
|
119
119
|
end
|
120
120
|
it "should be fast" do
|
121
|
-
|
121
|
+
performance_of { @cacher.generate_from(@index) }.should < 0.003
|
122
122
|
end
|
123
123
|
end
|
124
124
|
end
|
@@ -29,15 +29,9 @@ describe Index::Bundle do
|
|
29
29
|
random_keys = generate_random_keys 500
|
30
30
|
random_ids = generate_random_ids 500
|
31
31
|
@full.index = Hash[random_keys.zip(random_ids)]
|
32
|
-
|
33
|
-
GC.disable
|
34
|
-
end
|
35
|
-
after(:each) do
|
36
|
-
GC.enable
|
37
|
-
GC.start
|
38
32
|
end
|
39
33
|
it 'should be fast' do
|
40
|
-
|
34
|
+
performance_of do
|
41
35
|
@full.generate_partial
|
42
36
|
end.should < 0.2
|
43
37
|
end
|
@@ -328,7 +328,7 @@ describe Query::Allocations do
|
|
328
328
|
@allocations.total.should == 110
|
329
329
|
end
|
330
330
|
it 'should be fast' do
|
331
|
-
|
331
|
+
performance_of { @allocations.process!(20, 0) }.should < 0.0001
|
332
332
|
end
|
333
333
|
end
|
334
334
|
end
|
@@ -62,21 +62,21 @@ describe 'Query::Combinations' do
|
|
62
62
|
@combination2.should_receive(:ids).once.with.and_return (1..100).to_a
|
63
63
|
@combination3.should_receive(:ids).once.with.and_return (1..10).to_a
|
64
64
|
|
65
|
-
|
65
|
+
performance_of { @combinations.ids }.should < 0.004
|
66
66
|
end
|
67
67
|
it "should be fast" do
|
68
68
|
@combination1.should_receive(:ids).once.with.and_return (1..1000).to_a
|
69
69
|
@combination2.should_receive(:ids).once.with.and_return (1..100).to_a
|
70
70
|
@combination3.should_receive(:ids).once.with.and_return (1..10).to_a
|
71
71
|
|
72
|
-
|
72
|
+
performance_of { @combinations.ids }.should < 0.00015
|
73
73
|
end
|
74
74
|
it "should be fast" do
|
75
75
|
@combination1.should_receive(:ids).once.with.and_return (1..1000).to_a
|
76
76
|
@combination2.should_receive(:ids).once.with.and_return (901..1000).to_a
|
77
77
|
@combination3.should_receive(:ids).once.with.and_return (1..10).to_a
|
78
78
|
|
79
|
-
|
79
|
+
performance_of { @combinations.ids }.should < 0.0001
|
80
80
|
end
|
81
81
|
end
|
82
82
|
|
data/spec/lib/query/live_spec.rb
CHANGED
@@ -31,7 +31,7 @@ describe Query::Live do
|
|
31
31
|
allocations = stub :allocations
|
32
32
|
@query.should_receive(:sorted_allocations).and_return allocations
|
33
33
|
|
34
|
-
@query.should_receive(:results_from).once.with(
|
34
|
+
@query.should_receive(:results_from).once.with(0, allocations).and_return stub(:results, :prepare! => true)
|
35
35
|
|
36
36
|
@query.execute 'some query', 0
|
37
37
|
end
|
@@ -53,7 +53,7 @@ describe Query::Live do
|
|
53
53
|
@query.results_from(@allocations).duration.should == 0
|
54
54
|
end
|
55
55
|
it "should generate a result with the allocations" do
|
56
|
-
@query.results_from(@allocations).allocations.should == @allocations
|
56
|
+
@query.results_from(0, @allocations).allocations.should == @allocations
|
57
57
|
end
|
58
58
|
end
|
59
59
|
end
|
@@ -20,10 +20,9 @@ describe Results do
|
|
20
20
|
@allocations = stub :allocations,
|
21
21
|
:process! => nil, :size => 12
|
22
22
|
|
23
|
-
@results = Results::Base.new @allocations
|
23
|
+
@results = Results::Base.new 1234, @allocations
|
24
24
|
@results.stub! :duration => 0.1234567890,
|
25
|
-
:total => 12345678
|
26
|
-
:offset => 1234
|
25
|
+
:total => 12345678
|
27
26
|
end
|
28
27
|
it 'should output a specific log' do
|
29
28
|
@results.to_log('some_query').should == '|0-08-16 10:07:33|0.123457|some_query |12345678|1234|12|'
|
@@ -46,11 +45,11 @@ describe Results do
|
|
46
45
|
before(:each) do
|
47
46
|
@allocations = stub :allocations, :process! => nil, :to_result => :allocations, :total => :some_total
|
48
47
|
|
49
|
-
@results = Results::Base.new @allocations
|
48
|
+
@results = Results::Base.new :some_offset, @allocations
|
50
49
|
@results.duration = :some_duration
|
51
50
|
end
|
52
51
|
it 'should do it correctly' do
|
53
|
-
@results.prepare!
|
52
|
+
@results.prepare!
|
54
53
|
|
55
54
|
@results.serialize.should == { :allocations => :allocations, :offset => :some_offset, :duration => :some_duration, :total => :some_total }
|
56
55
|
end
|
@@ -139,7 +138,7 @@ describe Results do
|
|
139
138
|
}.should_not raise_error
|
140
139
|
end
|
141
140
|
it 'should set the allocations to an empty array' do
|
142
|
-
Results::Full.new(:some_allocations).instance_variable_get(:@allocations).should == :some_allocations
|
141
|
+
Results::Full.new(:unimportant, :some_allocations).instance_variable_get(:@allocations).should == :some_allocations
|
143
142
|
end
|
144
143
|
end
|
145
144
|
describe 'Live' do
|
@@ -149,7 +148,7 @@ describe Results do
|
|
149
148
|
}.should_not raise_error
|
150
149
|
end
|
151
150
|
it 'should set the allocations to an empty array' do
|
152
|
-
Results::Live.new(:some_allocations).instance_variable_get(:@allocations).should == :some_allocations
|
151
|
+
Results::Live.new(:unimportant, :some_allocations).instance_variable_get(:@allocations).should == :some_allocations
|
153
152
|
end
|
154
153
|
end
|
155
154
|
end
|
@@ -188,7 +187,7 @@ describe Results do
|
|
188
187
|
describe 'Full' do
|
189
188
|
it 'should delegate to allocations.total' do
|
190
189
|
allocations = stub :allocations
|
191
|
-
results = Results::Full.new allocations
|
190
|
+
results = Results::Full.new nil, allocations
|
192
191
|
|
193
192
|
allocations.should_receive(:total).once
|
194
193
|
|
@@ -198,7 +197,7 @@ describe Results do
|
|
198
197
|
describe 'Live' do
|
199
198
|
it 'should delegate to allocations.total' do
|
200
199
|
allocations = stub :allocations
|
201
|
-
results = Results::Live.new allocations
|
200
|
+
results = Results::Live.new nil, allocations
|
202
201
|
|
203
202
|
allocations.should_receive(:total).once
|
204
203
|
|
@@ -2,18 +2,20 @@
|
|
2
2
|
require 'spec_helper'
|
3
3
|
|
4
4
|
describe UmlautSubstituter do
|
5
|
-
|
5
|
+
before(:each) do
|
6
|
+
@substituter = UmlautSubstituter.new
|
7
|
+
end
|
6
8
|
|
7
9
|
# A bit of metaprogramming to help with the myriads of its.
|
8
10
|
#
|
9
11
|
def self.it_should_substitute(special_character, normal_character)
|
10
12
|
it "should substitute #{special_character} with #{normal_character}" do
|
11
|
-
|
13
|
+
@substituter.substitute(special_character).should == normal_character
|
12
14
|
end
|
13
15
|
end
|
14
16
|
def self.it_should_not_substitute(special_character)
|
15
17
|
it "should not substitute #{special_character}" do
|
16
|
-
|
18
|
+
@substituter.substitute(special_character).should == special_character
|
17
19
|
end
|
18
20
|
end
|
19
21
|
|
data/spec/specific/speed_spec.rb
CHANGED
@@ -9,15 +9,10 @@ describe "Speccing Ruby for speed" do
|
|
9
9
|
:speed => (1..5_000).to_a,
|
10
10
|
:test => (1..1_000).to_a
|
11
11
|
}
|
12
|
-
GC.disable
|
13
|
-
end
|
14
|
-
after(:each) do
|
15
|
-
GC.enable
|
16
|
-
GC.start # start the GC to minimize the chance that it will run again during the speed spec
|
17
12
|
end
|
18
13
|
describe "+" do
|
19
14
|
it "should be fast" do
|
20
|
-
|
15
|
+
performance_of do
|
21
16
|
@allocs.inject([]) do |total, alloc|
|
22
17
|
total + @ids[alloc]
|
23
18
|
end
|
@@ -26,14 +21,14 @@ describe "Speccing Ruby for speed" do
|
|
26
21
|
end
|
27
22
|
describe "map and flatten!(1)" do
|
28
23
|
it "should be fast" do
|
29
|
-
|
24
|
+
performance_of do
|
30
25
|
@allocs.map { |alloc| @ids[alloc] }.flatten!(1)
|
31
26
|
end.should < 0.02
|
32
27
|
end
|
33
28
|
end
|
34
29
|
describe "<< and flatten!(1)" do
|
35
30
|
it "should be fast" do
|
36
|
-
|
31
|
+
performance_of do
|
37
32
|
@allocs.inject([]) do |total, alloc|
|
38
33
|
total << @ids[alloc]
|
39
34
|
end.flatten!(1)
|
@@ -42,7 +37,7 @@ describe "Speccing Ruby for speed" do
|
|
42
37
|
end
|
43
38
|
describe "<< and flatten!" do
|
44
39
|
it "should be fast" do
|
45
|
-
|
40
|
+
performance_of do
|
46
41
|
@allocs.inject([]) do |total, alloc|
|
47
42
|
total << @ids[alloc]
|
48
43
|
end.flatten!
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 2
|
8
|
-
- 1
|
9
|
-
version: 0.2.1
|
8
|
+
- 2
|
9
|
+
version: 0.2.2
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Florian Hanke
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2010-10-
|
17
|
+
date: 2010-10-14 00:00:00 +02:00
|
18
18
|
default_executable: picky
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|