picky 1.4.1 → 1.4.2
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/picky/{alias_instances.rb → aliases.rb} +1 -3
- data/lib/picky/application.rb +18 -19
- data/lib/picky/cores.rb +1 -1
- data/lib/picky/generators/aliases.rb +3 -0
- data/lib/picky/index/base.rb +179 -0
- data/lib/picky/index/memory.rb +28 -0
- data/lib/picky/index/redis.rb +28 -0
- data/lib/picky/{indexes_api.rb → index_bundle.rb} +16 -16
- data/lib/picky/indexed/indexes.rb +11 -7
- data/lib/picky/indexing/indexes.rb +14 -8
- data/lib/picky/internals/adapters/rack/base.rb +27 -0
- data/lib/picky/internals/adapters/rack/live_parameters.rb +37 -0
- data/lib/picky/internals/adapters/rack/query.rb +63 -0
- data/lib/picky/internals/adapters/rack.rb +34 -0
- data/lib/picky/{calculations → internals/calculations}/location.rb +0 -0
- data/lib/picky/{cli.rb → internals/cli.rb} +0 -0
- data/lib/picky/{configuration → internals/configuration}/index.rb +8 -2
- data/lib/picky/{ext → internals/ext}/maybe_compile.rb +0 -0
- data/lib/picky/{ext → internals/ext}/ruby19/extconf.rb +0 -0
- data/lib/picky/{ext → internals/ext}/ruby19/performant.c +0 -0
- data/lib/picky/{extensions → internals/extensions}/array.rb +0 -0
- data/lib/picky/{extensions → internals/extensions}/hash.rb +0 -0
- data/lib/picky/{extensions → internals/extensions}/module.rb +0 -0
- data/lib/picky/{extensions → internals/extensions}/object.rb +0 -0
- data/lib/picky/{extensions → internals/extensions}/symbol.rb +0 -0
- data/lib/picky/internals/frontend_adapters/rack.rb +154 -0
- data/lib/picky/internals/generators/base.rb +19 -0
- data/lib/picky/internals/generators/partial/default.rb +7 -0
- data/lib/picky/internals/generators/partial/none.rb +35 -0
- data/lib/picky/internals/generators/partial/strategy.rb +29 -0
- data/lib/picky/internals/generators/partial/substring.rb +122 -0
- data/lib/picky/internals/generators/partial_generator.rb +19 -0
- data/lib/picky/internals/generators/similarity/default.rb +9 -0
- data/lib/picky/internals/generators/similarity/double_levenshtone.rb +81 -0
- data/lib/picky/internals/generators/similarity/none.rb +35 -0
- data/lib/picky/internals/generators/similarity/strategy.rb +11 -0
- data/lib/picky/internals/generators/similarity_generator.rb +19 -0
- data/lib/picky/internals/generators/strategy.rb +18 -0
- data/lib/picky/internals/generators/weights/default.rb +9 -0
- data/lib/picky/internals/generators/weights/logarithmic.rb +43 -0
- data/lib/picky/internals/generators/weights/strategy.rb +11 -0
- data/lib/picky/internals/generators/weights_generator.rb +19 -0
- data/lib/picky/{helpers → internals/helpers}/measuring.rb +0 -0
- data/lib/picky/internals/index/backend.rb +113 -0
- data/lib/picky/internals/index/file/basic.rb +101 -0
- data/lib/picky/internals/index/file/json.rb +38 -0
- data/lib/picky/internals/index/file/marshal.rb +38 -0
- data/lib/picky/internals/index/file/text.rb +60 -0
- data/lib/picky/internals/index/files.rb +24 -0
- data/lib/picky/internals/index/redis/basic.rb +77 -0
- data/lib/picky/internals/index/redis/list_hash.rb +46 -0
- data/lib/picky/internals/index/redis/string_hash.rb +35 -0
- data/lib/picky/internals/index/redis.rb +44 -0
- data/lib/picky/internals/indexed/bundle/base.rb +72 -0
- data/lib/picky/internals/indexed/bundle/memory.rb +69 -0
- data/lib/picky/internals/indexed/bundle/redis.rb +70 -0
- data/lib/picky/internals/indexed/categories.rb +135 -0
- data/lib/picky/internals/indexed/category.rb +90 -0
- data/lib/picky/internals/indexed/index.rb +57 -0
- data/lib/picky/{indexed → internals/indexed}/wrappers/bundle/calculation.rb +0 -0
- data/lib/picky/{indexed → internals/indexed}/wrappers/bundle/location.rb +4 -2
- data/lib/picky/{indexed → internals/indexed}/wrappers/bundle/wrapper.rb +1 -1
- data/lib/picky/internals/indexed/wrappers/exact_first.rb +65 -0
- data/lib/picky/{indexers → internals/indexers}/no_source_specified_error.rb +0 -0
- data/lib/picky/{indexers → internals/indexers}/serial.rb +2 -2
- data/lib/picky/{indexers → internals/indexers}/solr.rb +0 -0
- data/lib/picky/internals/indexing/bundle/base.rb +219 -0
- data/lib/picky/internals/indexing/bundle/memory.rb +25 -0
- data/lib/picky/internals/indexing/bundle/redis.rb +28 -0
- data/lib/picky/internals/indexing/bundle/super_base.rb +65 -0
- data/lib/picky/internals/indexing/categories.rb +42 -0
- data/lib/picky/internals/indexing/category.rb +120 -0
- data/lib/picky/internals/indexing/index.rb +67 -0
- data/lib/picky/{performant.rb → internals/performant.rb} +0 -0
- data/lib/picky/internals/query/allocation.rb +88 -0
- data/lib/picky/internals/query/allocations.rb +137 -0
- data/lib/picky/internals/query/combination.rb +80 -0
- data/lib/picky/internals/query/combinations/base.rb +84 -0
- data/lib/picky/internals/query/combinations/memory.rb +58 -0
- data/lib/picky/internals/query/combinations/redis.rb +59 -0
- data/lib/picky/internals/query/indexes.rb +180 -0
- data/lib/picky/internals/query/qualifiers.rb +81 -0
- data/lib/picky/internals/query/token.rb +215 -0
- data/lib/picky/internals/query/tokens.rb +89 -0
- data/lib/picky/{query → internals/query}/weights.rb +0 -0
- data/lib/picky/internals/results/base.rb +106 -0
- data/lib/picky/internals/results/full.rb +17 -0
- data/lib/picky/internals/results/live.rb +17 -0
- data/lib/picky/{solr → internals/solr}/schema_generator.rb +0 -0
- data/lib/picky/internals/tokenizers/base.rb +166 -0
- data/lib/picky/internals/tokenizers/index.rb +63 -0
- data/lib/picky/internals/tokenizers/query.rb +79 -0
- data/lib/picky/loader.rb +148 -112
- data/lib/picky/query/base.rb +57 -26
- data/lib/picky/query/full.rb +1 -1
- data/lib/picky/query/live.rb +1 -1
- data/lib/picky/sources/db.rb +27 -6
- data/lib/tasks/index.rake +3 -3
- data/lib/tasks/try.rake +2 -2
- data/spec/lib/aliases_spec.rb +9 -0
- data/spec/lib/application_spec.rb +3 -3
- data/spec/lib/generators/aliases_spec.rb +1 -0
- data/spec/lib/{index_api_spec.rb → index/base_spec.rb} +7 -7
- data/spec/lib/index_bundle_spec.rb +71 -0
- data/spec/lib/indexed/indexes_spec.rb +61 -0
- data/spec/lib/indexing/indexes_spec.rb +94 -24
- data/spec/lib/{adapters → internals/adapters}/rack/base_spec.rb +2 -2
- data/spec/lib/{adapters → internals/adapters}/rack/live_parameters_spec.rb +2 -2
- data/spec/lib/{adapters → internals/adapters}/rack/query_spec.rb +2 -2
- data/spec/lib/{calculations → internals/calculations}/location_spec.rb +0 -0
- data/spec/lib/{cli_spec.rb → internals/cli_spec.rb} +4 -1
- data/spec/lib/{configuration → internals/configuration}/index_spec.rb +1 -1
- data/spec/lib/{cores_spec.rb → internals/cores_spec.rb} +0 -0
- data/spec/lib/{extensions → internals/extensions}/array_spec.rb +0 -0
- data/spec/lib/{extensions → internals/extensions}/hash_spec.rb +0 -0
- data/spec/lib/{extensions → internals/extensions}/module_spec.rb +0 -0
- data/spec/lib/{extensions → internals/extensions}/object_spec.rb +0 -0
- data/spec/lib/{extensions → internals/extensions}/symbol_spec.rb +0 -0
- data/spec/lib/{frontend_adapters → internals/frontend_adapters}/rack_spec.rb +11 -11
- data/spec/lib/{cacher → internals/generators}/cacher_strategy_spec.rb +2 -2
- data/spec/lib/internals/generators/partial/default_spec.rb +17 -0
- data/spec/lib/internals/generators/partial/none_spec.rb +17 -0
- data/spec/lib/{cacher → internals/generators}/partial/substring_spec.rb +26 -27
- data/spec/lib/{cacher → internals/generators}/partial_generator_spec.rb +5 -5
- data/spec/lib/{cacher → internals/generators}/similarity/double_levenshtone_spec.rb +4 -4
- data/spec/lib/{cacher → internals/generators}/similarity/none_spec.rb +2 -2
- data/spec/lib/{cacher → internals/generators}/similarity_generator_spec.rb +4 -4
- data/spec/lib/{cacher → internals/generators}/weights/logarithmic_spec.rb +2 -2
- data/spec/lib/internals/generators/weights_generator_spec.rb +21 -0
- data/spec/lib/{helpers → internals/helpers}/measuring_spec.rb +0 -0
- data/spec/lib/{index → internals/index}/file/basic_spec.rb +2 -2
- data/spec/lib/{index → internals/index}/file/json_spec.rb +2 -2
- data/spec/lib/{index → internals/index}/file/marshal_spec.rb +2 -2
- data/spec/lib/{index → internals/index}/file/text_spec.rb +2 -2
- data/spec/lib/{index → internals/index}/files_spec.rb +2 -2
- data/spec/lib/{indexed/bundle_spec.rb → internals/indexed/bundle/memory_spec.rb} +4 -5
- data/spec/lib/{indexed → internals/indexed}/categories_spec.rb +13 -13
- data/spec/lib/{indexed → internals/indexed}/category_spec.rb +59 -32
- data/spec/lib/{indexed → internals/indexed}/index_spec.rb +5 -5
- data/spec/lib/{indexed → internals/indexed}/wrappers/bundle/calculation_spec.rb +0 -0
- data/spec/lib/{indexed → internals/indexed}/wrappers/bundle/wrapper_spec.rb +0 -0
- data/spec/lib/{indexed → internals/indexed}/wrappers/exact_first_spec.rb +5 -5
- data/spec/lib/{indexers → internals/indexers}/serial_spec.rb +0 -0
- data/spec/lib/{indexing/bundle_partial_generation_speed_spec.rb → internals/indexing/bundle/memory_partial_generation_speed_spec.rb} +3 -3
- data/spec/lib/{indexing/bundle_spec.rb → internals/indexing/bundle/memory_spec.rb} +3 -3
- data/spec/lib/{index/bundle_spec.rb → internals/indexing/bundle/super_base_spec.rb} +9 -3
- data/spec/lib/{indexing → internals/indexing}/category_spec.rb +3 -3
- data/spec/lib/{indexing → internals/indexing}/index_spec.rb +3 -3
- data/spec/lib/internals/indexing/indexes_spec.rb +36 -0
- data/spec/lib/{interfaces → internals/interfaces}/live_parameters_spec.rb +0 -0
- data/spec/lib/internals/results/base_spec.rb +105 -0
- data/spec/lib/internals/results/full_spec.rb +78 -0
- data/spec/lib/internals/results/live_spec.rb +88 -0
- data/spec/lib/{solr → internals/solr}/schema_generator_spec.rb +0 -0
- data/spec/lib/{tokenizers → internals/tokenizers}/base_spec.rb +3 -3
- data/spec/lib/{tokenizers → internals/tokenizers}/index_spec.rb +9 -9
- data/spec/lib/{tokenizers → internals/tokenizers}/query_spec.rb +11 -11
- data/spec/lib/query/allocation_spec.rb +12 -12
- data/spec/lib/query/allocations_spec.rb +19 -19
- data/spec/lib/query/base_spec.rb +28 -4
- data/spec/lib/query/combination_spec.rb +8 -9
- data/spec/lib/query/combinations/base_spec.rb +116 -0
- data/spec/lib/query/{combinations_spec.rb → combinations/memory_spec.rb} +14 -14
- data/spec/lib/query/combinations/redis_spec.rb +132 -0
- data/spec/lib/query/full_spec.rb +2 -2
- data/spec/lib/query/indexes_spec.rb +81 -0
- data/spec/lib/query/live_spec.rb +3 -3
- data/spec/lib/query/qualifiers_spec.rb +6 -6
- data/spec/lib/query/token_spec.rb +38 -38
- data/spec/lib/query/tokens_spec.rb +35 -35
- data/spec/lib/sources/db_spec.rb +23 -18
- metadata +212 -181
- data/lib/picky/adapters/rack/base.rb +0 -23
- data/lib/picky/adapters/rack/live_parameters.rb +0 -33
- data/lib/picky/adapters/rack/query.rb +0 -59
- data/lib/picky/adapters/rack.rb +0 -28
- data/lib/picky/cacher/convenience.rb +0 -3
- data/lib/picky/cacher/generator.rb +0 -15
- data/lib/picky/cacher/partial/default.rb +0 -5
- data/lib/picky/cacher/partial/none.rb +0 -31
- data/lib/picky/cacher/partial/strategy.rb +0 -21
- data/lib/picky/cacher/partial/substring.rb +0 -118
- data/lib/picky/cacher/partial_generator.rb +0 -15
- data/lib/picky/cacher/similarity/default.rb +0 -7
- data/lib/picky/cacher/similarity/double_levenshtone.rb +0 -77
- data/lib/picky/cacher/similarity/none.rb +0 -31
- data/lib/picky/cacher/similarity/strategy.rb +0 -9
- data/lib/picky/cacher/similarity_generator.rb +0 -15
- data/lib/picky/cacher/strategy.rb +0 -12
- data/lib/picky/cacher/weights/default.rb +0 -7
- data/lib/picky/cacher/weights/logarithmic.rb +0 -39
- data/lib/picky/cacher/weights/strategy.rb +0 -9
- data/lib/picky/cacher/weights_generator.rb +0 -15
- data/lib/picky/frontend_adapters/rack.rb +0 -150
- data/lib/picky/index/bundle.rb +0 -54
- data/lib/picky/index/file/basic.rb +0 -97
- data/lib/picky/index/file/json.rb +0 -34
- data/lib/picky/index/file/marshal.rb +0 -34
- data/lib/picky/index/file/text.rb +0 -56
- data/lib/picky/index/files.rb +0 -118
- data/lib/picky/index_api.rb +0 -175
- data/lib/picky/indexed/bundle.rb +0 -54
- data/lib/picky/indexed/categories.rb +0 -131
- data/lib/picky/indexed/category.rb +0 -85
- data/lib/picky/indexed/index.rb +0 -39
- data/lib/picky/indexed/wrappers/exact_first.rb +0 -61
- data/lib/picky/indexing/bundle.rb +0 -213
- data/lib/picky/indexing/categories.rb +0 -38
- data/lib/picky/indexing/category.rb +0 -117
- data/lib/picky/indexing/index.rb +0 -55
- data/lib/picky/query/allocation.rb +0 -82
- data/lib/picky/query/allocations.rb +0 -130
- data/lib/picky/query/combination.rb +0 -74
- data/lib/picky/query/combinations.rb +0 -105
- data/lib/picky/query/qualifiers.rb +0 -77
- data/lib/picky/query/token.rb +0 -202
- data/lib/picky/query/tokens.rb +0 -86
- data/lib/picky/query/weigher.rb +0 -165
- data/lib/picky/results/base.rb +0 -102
- data/lib/picky/results/full.rb +0 -13
- data/lib/picky/results/live.rb +0 -13
- data/lib/picky/tokenizers/base.rb +0 -161
- data/lib/picky/tokenizers/index.rb +0 -58
- data/lib/picky/tokenizers/query.rb +0 -74
- data/spec/lib/cacher/partial/default_spec.rb +0 -15
- data/spec/lib/cacher/partial/none_spec.rb +0 -17
- data/spec/lib/cacher/weights_generator_spec.rb +0 -21
- data/spec/lib/results/base_spec.rb +0 -257
- data/spec/lib/results/live_spec.rb +0 -15
@@ -0,0 +1,215 @@
|
|
1
|
+
module Internals

  module Query

    # This is a query token. Together with other tokens it makes up a query.
    #
    # It remembers the original form, and a normalized form.
    #
    # It also knows whether it needs to look for similarity (bla~), or whether it is a partial (bla*).
    #
    # TODO Make partial / similarity char configurable.
    #
    class Token # :nodoc:all

      attr_reader :text, :original
      attr_writer :similar

      # blank? is delegated to the text (requires ActiveSupport's
      # Module#delegate and Object#blank? to be loaded).
      #
      delegate :blank?, :to => :text

      # Normal initializer.
      #
      # Note: Use this if you do not want a qualified and normalized token.
      #
      def initialize text
        @text = text
      end

      # Returns a qualified and normalized token.
      #
      # Note: Use this in the search engine if you need a qualified
      #       and normalized token. I.e. one prepared for a search.
      #
      # The order of the process steps matters: qualify strips the
      # "qualifier:" prefix before the original is captured, and
      # remove_illegals runs last so partialize/similarize can still
      # see the trailing * / ~ / " markers.
      #
      def self.processed text
        token = new text
        token.qualify
        token.extract_original
        token.partialize
        token.similarize
        token.remove_illegals
        token
      end

      # This returns a predefined category name if the user has given one.
      #
      # Returns the qualifier extracted by #qualify (or nil).
      #
      def user_defined_category_name
        @qualifier
      end

      # Extracts a qualifier for this token and pre-assigns an allocation.
      #
      # Note: Removes the qualifier if it is not allowed.
      #
      # NOTE(review): Query::Qualifiers is an external singleton;
      # presumably #normalize maps qualifier aliases to canonical names
      # and returns nil for unknown ones — confirm in qualifiers.rb.
      #
      def qualify
        @qualifier, @text = split @text
        @qualifier = Query::Qualifiers.instance.normalize @qualifier
      end
      # Remembers the current (already qualified) text as the original.
      #
      def extract_original
        @original = @text.dup
      end

      # Partial is a conditional setter.
      #
      # It is only settable if it hasn't been set yet.
      #
      def partial= partial
        @partial = partial if @partial.nil?
      end
      # A token that is (or has become) similar is never treated as partial.
      #
      def partial?
        !@similar && @partial
      end

      # If the text ends with *, partialize it. If with ", don't.
      #
      # Note: in `self.partial = false and return`, the assignment
      # expression evaluates to false, so the `return` never fires.
      # This is harmless because partial= is write-once: once set to
      # false here it cannot be overwritten by the next line.
      #
      @@no_partial = /\"\Z/
      @@partial = /\*\Z/
      def partialize
        self.partial = false and return if @text =~ @@no_partial
        self.partial = true if @text =~ @@partial
      end

      # If the text ends with ~ similarize it. If with ", don't.
      #
      # Same `and return` quirk as in #partialize; safe here because a
      # text cannot end in both " and ~, so the two branches never clash.
      #
      @@no_similar = /\"\Z/
      @@similar = /\~\Z/
      def similarize
        self.similar = false and return if @text =~ @@no_similar
        self.similar = true if @text =~ @@similar
      end

      def similar?
        @similar
      end

      # Normalizes this token's text.
      #
      # Strips the control characters " * ~ in place (if any text is left).
      #
      @@illegals = /["*~]/
      def remove_illegals
        @text.gsub! @@illegals, '' unless @text.blank?
      end

      # Visitor for tokenizer.
      #
      # Replaces the text with the tokenizer-normalized form.
      #
      # TODO Rewrite!!!
      #
      def tokenize_with tokenizer
        @text = tokenizer.normalize @text
      end
      # Yields each text piece the tokenizer splits this token's text into.
      #
      # TODO spec!
      #
      # TODO Rewrite!!
      #
      def tokenized tokenizer
        tokenizer.tokenize(@text.to_s).each do |text|
          yield text
        end
      end

      # Returns an array of possible combinations.
      #
      # Double-dispatches to the given type (an index/category holder).
      #
      def possible_combinations_in type
        type.possible_combinations self
      end

      # Returns a token with the next similar text.
      #
      # Returns nil when no further similar word is available.
      #
      # TODO Rewrite this. It is hard to understand. Also spec performance.
      #
      def next_similar_token category
        token = self.dup
        token if token.next_similar category.bundle_for(token)
      end
      # Sets and returns the next similar word.
      #
      # Note: Also overrides the original.
      #
      # Uses shift, i.e. it consumes the cached similarity list one
      # word per call; returns nil (via the early return) when exhausted.
      #
      def next_similar bundle
        @text = @original = (similarity(bundle).shift || return) if similar?
      end
      # Lazy similar reader.
      #
      # Caches the similarity list on first access.
      #
      def similarity bundle = nil
        @similarity || @similarity = generate_similarity_for(bundle)
      end
      # Returns an enumerator that traverses over the similar.
      #
      # Note: The dup isn't too nice – since it is needed on account of the shift, above.
      #       (We avoid a StopIteration exception. Which of both is less evil?)
      #
      def generate_similarity_for bundle
        bundle.similar(@text).dup || []
      end

      # Generates a solr term from this token.
      #
      # E.g. "name:heroes~0.75"
      #
      # Fuzziness is keyed on text length: short words get exact match,
      # longer words an increasingly strict similarity threshold.
      #
      @@solr_fuzzy_mapping = {
        1 => :'',
        2 => :'',
        3 => :'',
        4 => :'~0.74',
        5 => :'~0.78',
        6 => :'~0.81',
        7 => :'~0.83',
        8 => :'~0.85',
        9 => :'~0.87',
        10 => :'~0.89'
      }
      @@solr_fuzzy_mapping.default = :'~0.9'
      def to_solr
        blank? ? '' : (to_s + @@solr_fuzzy_mapping[@text.size].to_s)
      end

      # Pair of [original, current] text, e.g. for result highlighting.
      #
      def to_result
        [@original, @text]
      end

      # Internal identifier.
      #
      # TODO Uh.
      #
      def identifier
        "#{similar?? :similarity : :index}:#{@text}"
      end

      # Displays the qualifier text and the text, joined.
      #
      # e.g. name:meier
      #
      def to_s
        [@qualifier, @text].compact.join ':'
      end

      private

        # Splits text into a qualifier and text.
        #
        # Returns [qualifier, text].
        #
        # Splits on the first colon only; "a:b:c" => ["a", "b:c"].
        # Text without a colon yields [nil, text].
        #
        def split unqualified_text
          qualifier, text = (unqualified_text || '').split(':', 2)
          if text.blank?
            [nil, (qualifier || '')]
          else
            [qualifier, text]
          end
        end

    end

  end

end
|
@@ -0,0 +1,89 @@
|
|
1
|
+
# encoding: utf-8
#
module Internals

  #
  #
  module Query

    # This class primarily handles switching through similar token constellations.
    #
    class Tokens # :nodoc:all

      # Basically delegates to its internal tokens array.
      #
      self.delegate *[Enumerable.instance_methods, :slice!, :[], :uniq!, :last, :reject!, :length, :size, :empty?, :each, :exit, { :to => :@tokens }].flatten

      # Wraps the given array of tokens (empty by default).
      #
      def initialize tokens = []
        @tokens = tokens
      end

      # Runs the given tokenizer over every contained token.
      #
      def tokenize_with tokenizer
        @tokens.each { |token| token.tokenize_with tokenizer }
      end

      # Generates an array in the form of
      # [
      #  [combination],                           # of token 1
      #  [combination, combination, combination], # of token 2
      #  [combination, combination]               # of token 3
      # ]
      #
      # TODO If we want token behaviour defined per Query, we can
      #      compact! here
      #
      def possible_combinations_in type
        @tokens.map { |token| token.possible_combinations_in type }
        # TODO compact! if ignore_unassigned_tokens
      end

      # Makes the last of the tokens partial.
      #
      def partialize_last
        return if empty?
        @tokens.last.partial = true
      end

      # Caps the tokens to the maximum.
      #
      def cap maximum
        return unless cap? maximum
        @tokens.slice! maximum..-1
      end
      # Are there more tokens than the maximum allows?
      #
      def cap? maximum
        @tokens.size > maximum
      end

      # Rejects blank tokens.
      #
      def reject
        @tokens.reject!(&:blank?)
      end

      # Returns a solr query.
      #
      def to_solr_query
        @tokens.map(&:to_solr).join ' '
      end

      # The original texts of all tokens.
      #
      def originals
        @tokens.map(&:original)
      end

      # Just join the token original texts.
      #
      def to_s
        originals.join ' '
      end

    end

  end

end
|
File without changes
|
@@ -0,0 +1,106 @@
|
|
1
|
+
module Internals

  module Results # :nodoc:all

    # This is the internal results object. Usually, to_marshal, or to_json
    # is called on it to get a string for the answer.
    #
    class Base

      # Duration is set externally by the query.
      #
      attr_writer :duration
      attr_reader :allocations, :offset

      # Takes instances of Query::Allocations as param.
      #
      def initialize offset = 0, allocations = Query::Allocations.new
        @offset      = offset
        @allocations = allocations # || Query::Allocations.new
      end
      # Create new results and calculate the ids.
      #
      def self.from offset, allocations
        results = new offset, allocations
        results.prepare!
        results
      end

      # Hash representation of the results, used as the wire format.
      #
      def serialize
        { allocations: allocations.to_result,
          offset:      offset,
          duration:    duration,
          total:       total }
      end
      # The default format is json.
      #
      def to_response options = {}
        to_json options
      end
      # Convert to json format.
      #
      # NOTE(review): relies on Hash#to_json taking an options param
      # (ActiveSupport / json gem behavior) — confirm which is loaded.
      #
      def to_json options = {}
        serialize.to_json options
      end

      # This starts the actual processing.
      #
      # Without this, the allocations are not processed,
      # and no ids are calculated.
      #
      def prepare!
        allocations.process! self.max_results, self.offset
      end

      # Duration default is 0.
      #
      def duration
        @duration || 0
      end
      # The total results. Delegates to the allocations.
      #
      # Caches.
      #
      def total
        @total || @total = allocations.total || 0
      end

      # How many results are returned.
      #
      # Set in config using
      #   Results::Full.max_results = 20
      #
      # NOTE(review): class_inheritable_accessor comes from legacy
      # ActiveSupport (removed in Rails 3.1) — each subclass gets its
      # own inheritable copy of the setting.
      #
      class_inheritable_accessor :max_results
      def max_results
        self.class.max_results
      end

      # Convenience methods.
      #

      # Delegates to allocations.
      #
      def ids amount = 20
        allocations.ids amount
      end
      # Gets an amout of random ids from the allocations.
      #
      # Note: Basically delegates to the allocations.
      #
      def random_ids amount = 1
        allocations.random_ids amount
      end

      # Human readable log.
      #
      # Fixed-width pipe-separated line: timestamp, duration, query,
      # total, offset, allocation count.
      #
      def to_log query
        "|#{Time.now.to_s(:db)}|#{'%8f' % duration}|#{'%-50s' % query}|#{'%8d' % total}|#{'%4d' % offset}|#{'%2d' % allocations.size}|"
      end

    end

  end

end
|
File without changes
|
@@ -0,0 +1,166 @@
|
|
1
|
+
module Internals

  module Tokenizers # :nodoc:all

    # Defines tokenizing processes used both in indexing and querying.
    #
    class Base

      # TODO Move EMPTY_STRING top level.
      #
      EMPTY_STRING = ''.freeze

      # Stopwords.
      #
      def stopwords regexp
        @remove_stopwords_regexp = regexp
      end
      # Strips configured stopwords from text, in place. Returns text.
      #
      def remove_stopwords text
        text.gsub! @remove_stopwords_regexp, EMPTY_STRING if @remove_stopwords_regexp
        text
      end
      # Matches text that consists of a single word (possibly with a
      # trailing . * ~ marker) — such text is left untouched so a
      # lone stopword query still searches for something.
      #
      @@non_single_stopword_regexp = /^\b[\w:]+?\b[\.\*\~]?\s?$/
      def remove_non_single_stopwords text
        return text if text.match @@non_single_stopword_regexp
        remove_stopwords text
      end

      # Illegals.
      #
      # TODO Should there be a legal?
      #
      def removes_characters regexp
        @removes_characters_regexp = regexp
      end
      # Strips configured illegal characters from text, in place. Returns text.
      #
      def remove_illegals text
        text.gsub! @removes_characters_regexp, EMPTY_STRING if @removes_characters_regexp
        text
      end

      # Splitting.
      #
      def splits_text_on regexp
        @splits_text_on_regexp = regexp
      end
      # Splits the text into words on the configured regexp.
      #
      def split text
        text.split @splits_text_on_regexp
      end

      # Normalizing.
      #
      # Expects an array of [regexp, replacement] pairs.
      #
      def normalizes_words regexp_replaces
        @normalizes_words_regexp_replaces = regexp_replaces
      end
      # Applies the first matching normalization pattern (gsub! returns
      # nil when nothing matched, so `and break` stops after the first
      # pattern that actually replaced something). Mutates and returns text.
      #
      def normalize_with_patterns text
        return text unless @normalizes_words_regexp_replaces

        @normalizes_words_regexp_replaces.each do |regex, replace|
          # This should be sufficient
          #
          text.gsub!(regex, replace) and break
        end
        remove_after_normalizing_illegals text
        text
      end

      # Illegal after normalizing.
      #
      def removes_characters_after_splitting regexp
        @removes_characters_after_splitting_regexp = regexp
      end
      # In-place removal; note gsub! returns nil if nothing was removed,
      # so callers must not rely on this method's return value.
      #
      def remove_after_normalizing_illegals text
        text.gsub! @removes_characters_after_splitting_regexp, EMPTY_STRING if @removes_characters_after_splitting_regexp
      end

      # Substitute Characters with this substituter.
      #
      # Default is European Character substitution.
      #
      def substitutes_characters_with substituter = CharacterSubstituters::WestEuropean.new
        # TODO Raise if it doesn't quack substitute?
        @substituter = substituter
      end
      def substitute_characters text
        substituter?? substituter.substitute(text) : text
      end

      # Reject tokens after tokenizing based on the given criteria.
      #
      # Note: Currently only for indexing. TODO Redesign and write for both!
      #
      def reject_token_if &condition
        @reject_condition = condition
      end
      def reject tokens
        tokens.reject! &@reject_condition
      end


      # Returns a number of tokens, generated from the given text.
      #
      # Note:
      #  * preprocess, pretokenize are hooks
      #
      def tokenize text
        text   = preprocess text  # processing the text
        return empty_tokens if text.blank?
        words  = pretokenize text # splitting and preparations for tokenizing
        return empty_tokens if words.empty?
        tokens = tokens_for words # creating tokens / strings
                 process tokens   # processing tokens / strings
      end

      attr_reader :substituter
      alias substituter? substituter

      # Configures the tokenizer from an options hash; any option that
      # is absent leaves the corresponding step disabled (except the
      # defaults below).
      #
      # NOTE(review): contracts_expressions is not defined in this class
      # — presumably added by a subclass or extension; verify.
      #
      def initialize options = {}
        removes_characters options[:removes_characters]                                     if options[:removes_characters]
        contracts_expressions *options[:contracts_expressions]                              if options[:contracts_expressions]
        stopwords options[:stopwords]                                                       if options[:stopwords]
        normalizes_words options[:normalizes_words]                                         if options[:normalizes_words]
        removes_characters_after_splitting options[:removes_characters_after_splitting]     if options[:removes_characters_after_splitting]
        substitutes_characters_with options[:substitutes_characters_with]                   if options[:substitutes_characters_with]

        # Defaults.
        #
        splits_text_on options[:splits_text_on] || /\s/
        reject_token_if &(options[:reject_token_if] || :blank?)
      end

      # Hooks.
      #

      # Preprocessing.
      #
      def preprocess text; end
      # Pretokenizing.
      #
      def pretokenize text; end
      # Postprocessing.
      #
      def process tokens
        reject tokens # Reject any tokens that don't meet criteria
        tokens
      end

      # Converts words into real tokens.
      #
      def tokens_for words
        Internals::Query::Tokens.new words.collect! { |word| token_for word }
      end
      # Turns non-blank text into symbols.
      #
      def symbolize text
        text.blank? ? nil : text.to_sym
      end
      # Returns a tokens object.
      #
      def empty_tokens
        Internals::Query::Tokens.new
      end

    end

  end

end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
module Internals

  module Tokenizers

    # The base indexing tokenizer.
    #
    # Override in indexing subclasses and define in configuration.
    #
    class Index < Base

      # Class-level default instance; lazily created, replaceable
      # via Index.default = some_tokenizer.
      #
      class << self
        attr_writer :default
        def default
          @default ||= new
        end
      end

      # Default indexing preprocessing hook.
      #
      # Does:
      #  1. Character substitution.
      #  2. Downcasing.
      #  3. Remove illegal expressions.
      #  4. Remove non-single stopwords. (Stopwords that occur with other words)
      #
      def preprocess text
        text = substitute_characters text
        text.downcase!
        remove_illegals text
        # we do not remove single stopwords for an entirely different
        # reason than in the query tokenizer.
        # An indexed thing with just name "UND" (a possible stopword) should not lose its name.
        #
        remove_non_single_stopwords text
        text
      end

      # Default indexing pretokenizing hook.
      #
      # Does:
      #  1. Split the text into words.
      #  2. Normalize each word (in place).
      #
      def pretokenize text
        split(text).each do |word|
          normalize_with_patterns word
        end
      end

      # Does not actually return a token, but a
      # symbol "token".
      #
      def token_for text
        symbolize text
      end

    end

  end

end
|