picky 1.4.1 → 1.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/picky/{alias_instances.rb → aliases.rb} +1 -3
- data/lib/picky/application.rb +18 -19
- data/lib/picky/cores.rb +1 -1
- data/lib/picky/generators/aliases.rb +3 -0
- data/lib/picky/index/base.rb +179 -0
- data/lib/picky/index/memory.rb +28 -0
- data/lib/picky/index/redis.rb +28 -0
- data/lib/picky/{indexes_api.rb → index_bundle.rb} +16 -16
- data/lib/picky/indexed/indexes.rb +11 -7
- data/lib/picky/indexing/indexes.rb +14 -8
- data/lib/picky/internals/adapters/rack/base.rb +27 -0
- data/lib/picky/internals/adapters/rack/live_parameters.rb +37 -0
- data/lib/picky/internals/adapters/rack/query.rb +63 -0
- data/lib/picky/internals/adapters/rack.rb +34 -0
- data/lib/picky/{calculations → internals/calculations}/location.rb +0 -0
- data/lib/picky/{cli.rb → internals/cli.rb} +0 -0
- data/lib/picky/{configuration → internals/configuration}/index.rb +8 -2
- data/lib/picky/{ext → internals/ext}/maybe_compile.rb +0 -0
- data/lib/picky/{ext → internals/ext}/ruby19/extconf.rb +0 -0
- data/lib/picky/{ext → internals/ext}/ruby19/performant.c +0 -0
- data/lib/picky/{extensions → internals/extensions}/array.rb +0 -0
- data/lib/picky/{extensions → internals/extensions}/hash.rb +0 -0
- data/lib/picky/{extensions → internals/extensions}/module.rb +0 -0
- data/lib/picky/{extensions → internals/extensions}/object.rb +0 -0
- data/lib/picky/{extensions → internals/extensions}/symbol.rb +0 -0
- data/lib/picky/internals/frontend_adapters/rack.rb +154 -0
- data/lib/picky/internals/generators/base.rb +19 -0
- data/lib/picky/internals/generators/partial/default.rb +7 -0
- data/lib/picky/internals/generators/partial/none.rb +35 -0
- data/lib/picky/internals/generators/partial/strategy.rb +29 -0
- data/lib/picky/internals/generators/partial/substring.rb +122 -0
- data/lib/picky/internals/generators/partial_generator.rb +19 -0
- data/lib/picky/internals/generators/similarity/default.rb +9 -0
- data/lib/picky/internals/generators/similarity/double_levenshtone.rb +81 -0
- data/lib/picky/internals/generators/similarity/none.rb +35 -0
- data/lib/picky/internals/generators/similarity/strategy.rb +11 -0
- data/lib/picky/internals/generators/similarity_generator.rb +19 -0
- data/lib/picky/internals/generators/strategy.rb +18 -0
- data/lib/picky/internals/generators/weights/default.rb +9 -0
- data/lib/picky/internals/generators/weights/logarithmic.rb +43 -0
- data/lib/picky/internals/generators/weights/strategy.rb +11 -0
- data/lib/picky/internals/generators/weights_generator.rb +19 -0
- data/lib/picky/{helpers → internals/helpers}/measuring.rb +0 -0
- data/lib/picky/internals/index/backend.rb +113 -0
- data/lib/picky/internals/index/file/basic.rb +101 -0
- data/lib/picky/internals/index/file/json.rb +38 -0
- data/lib/picky/internals/index/file/marshal.rb +38 -0
- data/lib/picky/internals/index/file/text.rb +60 -0
- data/lib/picky/internals/index/files.rb +24 -0
- data/lib/picky/internals/index/redis/basic.rb +77 -0
- data/lib/picky/internals/index/redis/list_hash.rb +46 -0
- data/lib/picky/internals/index/redis/string_hash.rb +35 -0
- data/lib/picky/internals/index/redis.rb +44 -0
- data/lib/picky/internals/indexed/bundle/base.rb +72 -0
- data/lib/picky/internals/indexed/bundle/memory.rb +69 -0
- data/lib/picky/internals/indexed/bundle/redis.rb +70 -0
- data/lib/picky/internals/indexed/categories.rb +135 -0
- data/lib/picky/internals/indexed/category.rb +90 -0
- data/lib/picky/internals/indexed/index.rb +57 -0
- data/lib/picky/{indexed → internals/indexed}/wrappers/bundle/calculation.rb +0 -0
- data/lib/picky/{indexed → internals/indexed}/wrappers/bundle/location.rb +4 -2
- data/lib/picky/{indexed → internals/indexed}/wrappers/bundle/wrapper.rb +1 -1
- data/lib/picky/internals/indexed/wrappers/exact_first.rb +65 -0
- data/lib/picky/{indexers → internals/indexers}/no_source_specified_error.rb +0 -0
- data/lib/picky/{indexers → internals/indexers}/serial.rb +2 -2
- data/lib/picky/{indexers → internals/indexers}/solr.rb +0 -0
- data/lib/picky/internals/indexing/bundle/base.rb +219 -0
- data/lib/picky/internals/indexing/bundle/memory.rb +25 -0
- data/lib/picky/internals/indexing/bundle/redis.rb +28 -0
- data/lib/picky/internals/indexing/bundle/super_base.rb +65 -0
- data/lib/picky/internals/indexing/categories.rb +42 -0
- data/lib/picky/internals/indexing/category.rb +120 -0
- data/lib/picky/internals/indexing/index.rb +67 -0
- data/lib/picky/{performant.rb → internals/performant.rb} +0 -0
- data/lib/picky/internals/query/allocation.rb +88 -0
- data/lib/picky/internals/query/allocations.rb +137 -0
- data/lib/picky/internals/query/combination.rb +80 -0
- data/lib/picky/internals/query/combinations/base.rb +84 -0
- data/lib/picky/internals/query/combinations/memory.rb +58 -0
- data/lib/picky/internals/query/combinations/redis.rb +59 -0
- data/lib/picky/internals/query/indexes.rb +180 -0
- data/lib/picky/internals/query/qualifiers.rb +81 -0
- data/lib/picky/internals/query/token.rb +215 -0
- data/lib/picky/internals/query/tokens.rb +89 -0
- data/lib/picky/{query → internals/query}/weights.rb +0 -0
- data/lib/picky/internals/results/base.rb +106 -0
- data/lib/picky/internals/results/full.rb +17 -0
- data/lib/picky/internals/results/live.rb +17 -0
- data/lib/picky/{solr → internals/solr}/schema_generator.rb +0 -0
- data/lib/picky/internals/tokenizers/base.rb +166 -0
- data/lib/picky/internals/tokenizers/index.rb +63 -0
- data/lib/picky/internals/tokenizers/query.rb +79 -0
- data/lib/picky/loader.rb +148 -112
- data/lib/picky/query/base.rb +57 -26
- data/lib/picky/query/full.rb +1 -1
- data/lib/picky/query/live.rb +1 -1
- data/lib/picky/sources/db.rb +27 -6
- data/lib/tasks/index.rake +3 -3
- data/lib/tasks/try.rake +2 -2
- data/spec/lib/aliases_spec.rb +9 -0
- data/spec/lib/application_spec.rb +3 -3
- data/spec/lib/generators/aliases_spec.rb +1 -0
- data/spec/lib/{index_api_spec.rb → index/base_spec.rb} +7 -7
- data/spec/lib/index_bundle_spec.rb +71 -0
- data/spec/lib/indexed/indexes_spec.rb +61 -0
- data/spec/lib/indexing/indexes_spec.rb +94 -24
- data/spec/lib/{adapters → internals/adapters}/rack/base_spec.rb +2 -2
- data/spec/lib/{adapters → internals/adapters}/rack/live_parameters_spec.rb +2 -2
- data/spec/lib/{adapters → internals/adapters}/rack/query_spec.rb +2 -2
- data/spec/lib/{calculations → internals/calculations}/location_spec.rb +0 -0
- data/spec/lib/{cli_spec.rb → internals/cli_spec.rb} +4 -1
- data/spec/lib/{configuration → internals/configuration}/index_spec.rb +1 -1
- data/spec/lib/{cores_spec.rb → internals/cores_spec.rb} +0 -0
- data/spec/lib/{extensions → internals/extensions}/array_spec.rb +0 -0
- data/spec/lib/{extensions → internals/extensions}/hash_spec.rb +0 -0
- data/spec/lib/{extensions → internals/extensions}/module_spec.rb +0 -0
- data/spec/lib/{extensions → internals/extensions}/object_spec.rb +0 -0
- data/spec/lib/{extensions → internals/extensions}/symbol_spec.rb +0 -0
- data/spec/lib/{frontend_adapters → internals/frontend_adapters}/rack_spec.rb +11 -11
- data/spec/lib/{cacher → internals/generators}/cacher_strategy_spec.rb +2 -2
- data/spec/lib/internals/generators/partial/default_spec.rb +17 -0
- data/spec/lib/internals/generators/partial/none_spec.rb +17 -0
- data/spec/lib/{cacher → internals/generators}/partial/substring_spec.rb +26 -27
- data/spec/lib/{cacher → internals/generators}/partial_generator_spec.rb +5 -5
- data/spec/lib/{cacher → internals/generators}/similarity/double_levenshtone_spec.rb +4 -4
- data/spec/lib/{cacher → internals/generators}/similarity/none_spec.rb +2 -2
- data/spec/lib/{cacher → internals/generators}/similarity_generator_spec.rb +4 -4
- data/spec/lib/{cacher → internals/generators}/weights/logarithmic_spec.rb +2 -2
- data/spec/lib/internals/generators/weights_generator_spec.rb +21 -0
- data/spec/lib/{helpers → internals/helpers}/measuring_spec.rb +0 -0
- data/spec/lib/{index → internals/index}/file/basic_spec.rb +2 -2
- data/spec/lib/{index → internals/index}/file/json_spec.rb +2 -2
- data/spec/lib/{index → internals/index}/file/marshal_spec.rb +2 -2
- data/spec/lib/{index → internals/index}/file/text_spec.rb +2 -2
- data/spec/lib/{index → internals/index}/files_spec.rb +2 -2
- data/spec/lib/{indexed/bundle_spec.rb → internals/indexed/bundle/memory_spec.rb} +4 -5
- data/spec/lib/{indexed → internals/indexed}/categories_spec.rb +13 -13
- data/spec/lib/{indexed → internals/indexed}/category_spec.rb +59 -32
- data/spec/lib/{indexed → internals/indexed}/index_spec.rb +5 -5
- data/spec/lib/{indexed → internals/indexed}/wrappers/bundle/calculation_spec.rb +0 -0
- data/spec/lib/{indexed → internals/indexed}/wrappers/bundle/wrapper_spec.rb +0 -0
- data/spec/lib/{indexed → internals/indexed}/wrappers/exact_first_spec.rb +5 -5
- data/spec/lib/{indexers → internals/indexers}/serial_spec.rb +0 -0
- data/spec/lib/{indexing/bundle_partial_generation_speed_spec.rb → internals/indexing/bundle/memory_partial_generation_speed_spec.rb} +3 -3
- data/spec/lib/{indexing/bundle_spec.rb → internals/indexing/bundle/memory_spec.rb} +3 -3
- data/spec/lib/{index/bundle_spec.rb → internals/indexing/bundle/super_base_spec.rb} +9 -3
- data/spec/lib/{indexing → internals/indexing}/category_spec.rb +3 -3
- data/spec/lib/{indexing → internals/indexing}/index_spec.rb +3 -3
- data/spec/lib/internals/indexing/indexes_spec.rb +36 -0
- data/spec/lib/{interfaces → internals/interfaces}/live_parameters_spec.rb +0 -0
- data/spec/lib/internals/results/base_spec.rb +105 -0
- data/spec/lib/internals/results/full_spec.rb +78 -0
- data/spec/lib/internals/results/live_spec.rb +88 -0
- data/spec/lib/{solr → internals/solr}/schema_generator_spec.rb +0 -0
- data/spec/lib/{tokenizers → internals/tokenizers}/base_spec.rb +3 -3
- data/spec/lib/{tokenizers → internals/tokenizers}/index_spec.rb +9 -9
- data/spec/lib/{tokenizers → internals/tokenizers}/query_spec.rb +11 -11
- data/spec/lib/query/allocation_spec.rb +12 -12
- data/spec/lib/query/allocations_spec.rb +19 -19
- data/spec/lib/query/base_spec.rb +28 -4
- data/spec/lib/query/combination_spec.rb +8 -9
- data/spec/lib/query/combinations/base_spec.rb +116 -0
- data/spec/lib/query/{combinations_spec.rb → combinations/memory_spec.rb} +14 -14
- data/spec/lib/query/combinations/redis_spec.rb +132 -0
- data/spec/lib/query/full_spec.rb +2 -2
- data/spec/lib/query/indexes_spec.rb +81 -0
- data/spec/lib/query/live_spec.rb +3 -3
- data/spec/lib/query/qualifiers_spec.rb +6 -6
- data/spec/lib/query/token_spec.rb +38 -38
- data/spec/lib/query/tokens_spec.rb +35 -35
- data/spec/lib/sources/db_spec.rb +23 -18
- metadata +212 -181
- data/lib/picky/adapters/rack/base.rb +0 -23
- data/lib/picky/adapters/rack/live_parameters.rb +0 -33
- data/lib/picky/adapters/rack/query.rb +0 -59
- data/lib/picky/adapters/rack.rb +0 -28
- data/lib/picky/cacher/convenience.rb +0 -3
- data/lib/picky/cacher/generator.rb +0 -15
- data/lib/picky/cacher/partial/default.rb +0 -5
- data/lib/picky/cacher/partial/none.rb +0 -31
- data/lib/picky/cacher/partial/strategy.rb +0 -21
- data/lib/picky/cacher/partial/substring.rb +0 -118
- data/lib/picky/cacher/partial_generator.rb +0 -15
- data/lib/picky/cacher/similarity/default.rb +0 -7
- data/lib/picky/cacher/similarity/double_levenshtone.rb +0 -77
- data/lib/picky/cacher/similarity/none.rb +0 -31
- data/lib/picky/cacher/similarity/strategy.rb +0 -9
- data/lib/picky/cacher/similarity_generator.rb +0 -15
- data/lib/picky/cacher/strategy.rb +0 -12
- data/lib/picky/cacher/weights/default.rb +0 -7
- data/lib/picky/cacher/weights/logarithmic.rb +0 -39
- data/lib/picky/cacher/weights/strategy.rb +0 -9
- data/lib/picky/cacher/weights_generator.rb +0 -15
- data/lib/picky/frontend_adapters/rack.rb +0 -150
- data/lib/picky/index/bundle.rb +0 -54
- data/lib/picky/index/file/basic.rb +0 -97
- data/lib/picky/index/file/json.rb +0 -34
- data/lib/picky/index/file/marshal.rb +0 -34
- data/lib/picky/index/file/text.rb +0 -56
- data/lib/picky/index/files.rb +0 -118
- data/lib/picky/index_api.rb +0 -175
- data/lib/picky/indexed/bundle.rb +0 -54
- data/lib/picky/indexed/categories.rb +0 -131
- data/lib/picky/indexed/category.rb +0 -85
- data/lib/picky/indexed/index.rb +0 -39
- data/lib/picky/indexed/wrappers/exact_first.rb +0 -61
- data/lib/picky/indexing/bundle.rb +0 -213
- data/lib/picky/indexing/categories.rb +0 -38
- data/lib/picky/indexing/category.rb +0 -117
- data/lib/picky/indexing/index.rb +0 -55
- data/lib/picky/query/allocation.rb +0 -82
- data/lib/picky/query/allocations.rb +0 -130
- data/lib/picky/query/combination.rb +0 -74
- data/lib/picky/query/combinations.rb +0 -105
- data/lib/picky/query/qualifiers.rb +0 -77
- data/lib/picky/query/token.rb +0 -202
- data/lib/picky/query/tokens.rb +0 -86
- data/lib/picky/query/weigher.rb +0 -165
- data/lib/picky/results/base.rb +0 -102
- data/lib/picky/results/full.rb +0 -13
- data/lib/picky/results/live.rb +0 -13
- data/lib/picky/tokenizers/base.rb +0 -161
- data/lib/picky/tokenizers/index.rb +0 -58
- data/lib/picky/tokenizers/query.rb +0 -74
- data/spec/lib/cacher/partial/default_spec.rb +0 -15
- data/spec/lib/cacher/partial/none_spec.rb +0 -17
- data/spec/lib/cacher/weights_generator_spec.rb +0 -21
- data/spec/lib/results/base_spec.rb +0 -257
- data/spec/lib/results/live_spec.rb +0 -15
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
module Internals
|
|
2
|
+
|
|
3
|
+
module Query
|
|
4
|
+
|
|
5
|
+
# This is a query token. Together with other tokens it makes up a query.
|
|
6
|
+
#
|
|
7
|
+
# It remembers the original form and a normalized form.
|
|
8
|
+
#
|
|
9
|
+
# It also knows whether it needs to look for similarity (bla~), or whether it is a partial (bla*).
|
|
10
|
+
#
|
|
11
|
+
# TODO Make partial / similarity char configurable.
|
|
12
|
+
#
|
|
13
|
+
class Token # :nodoc:all
|
|
14
|
+
|
|
15
|
+
attr_reader :text, :original
|
|
16
|
+
attr_writer :similar
|
|
17
|
+
|
|
18
|
+
delegate :blank?, :to => :text
|
|
19
|
+
|
|
20
|
+
# Normal initializer.
|
|
21
|
+
#
|
|
22
|
+
# Note: Use this if you do not want a qualified and normalized token.
|
|
23
|
+
#
|
|
24
|
+
def initialize text
|
|
25
|
+
@text = text
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
# Returns a qualified and normalized token.
|
|
29
|
+
#
|
|
30
|
+
# Note: Use this in the search engine if you need a qualified
|
|
31
|
+
# and normalized token. I.e. one prepared for a search.
|
|
32
|
+
#
|
|
33
|
+
def self.processed text
|
|
34
|
+
token = new text
|
|
35
|
+
token.qualify
|
|
36
|
+
token.extract_original
|
|
37
|
+
token.partialize
|
|
38
|
+
token.similarize
|
|
39
|
+
token.remove_illegals
|
|
40
|
+
token
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
# This returns a predefined category name if the user has given one.
|
|
44
|
+
#
|
|
45
|
+
def user_defined_category_name
|
|
46
|
+
@qualifier
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
# Extracts a qualifier for this token and pre-assigns an allocation.
|
|
50
|
+
#
|
|
51
|
+
# Note: Removes the qualifier if it is not allowed.
|
|
52
|
+
#
|
|
53
|
+
def qualify
|
|
54
|
+
@qualifier, @text = split @text
|
|
55
|
+
@qualifier = Query::Qualifiers.instance.normalize @qualifier
|
|
56
|
+
end
|
|
57
|
+
def extract_original
|
|
58
|
+
@original = @text.dup
|
|
59
|
+
end
|
|
60
|
+
|
|
61
|
+
# Partial is a conditional setter.
|
|
62
|
+
#
|
|
63
|
+
# It is only settable if it hasn't been set yet.
|
|
64
|
+
#
|
|
65
|
+
def partial= partial
|
|
66
|
+
@partial = partial if @partial.nil?
|
|
67
|
+
end
|
|
68
|
+
def partial?
|
|
69
|
+
!@similar && @partial
|
|
70
|
+
end
|
|
71
|
+
|
|
72
|
+
# If the text ends with *, partialize it. If with ", don't.
|
|
73
|
+
#
|
|
74
|
+
@@no_partial = /\"\Z/
|
|
75
|
+
@@partial = /\*\Z/
|
|
76
|
+
def partialize
  # Inspects the end of @text to decide on partial matching:
  #  * a trailing " explicitly switches partial matching off,
  #  * a trailing * switches it on,
  #  * anything else leaves the flag untouched.
  #
  # Note: partial= only takes effect while @partial is still nil.
  #
  # Note: This used to read `self.partial = false and return if ...`.
  #       The assignment evaluates to false, so the `return` was never
  #       reached. An explicit if/elsif states the intent directly.
  #
  if @text =~ @@no_partial
    self.partial = false
  elsif @text =~ @@partial
    self.partial = true
  end
end
|
|
80
|
+
|
|
81
|
+
# If the text ends with ~ similarize it. If with ", don't.
|
|
82
|
+
#
|
|
83
|
+
@@no_similar = /\"\Z/
|
|
84
|
+
@@similar = /\~\Z/
|
|
85
|
+
def similarize
  # Inspects the end of @text to decide on similarity matching:
  #  * a trailing " explicitly switches similarity off,
  #  * a trailing ~ switches it on,
  #  * anything else leaves the flag untouched.
  #
  # Note: This used to read `self.similar = false and return if ...`.
  #       The assignment evaluates to false, so the `return` was never
  #       reached. An explicit if/elsif states the intent directly.
  #
  if @text =~ @@no_similar
    self.similar = false
  elsif @text =~ @@similar
    self.similar = true
  end
end
|
|
89
|
+
|
|
90
|
+
def similar?
|
|
91
|
+
@similar
|
|
92
|
+
end
|
|
93
|
+
|
|
94
|
+
# Normalizes this token's text.
|
|
95
|
+
#
|
|
96
|
+
@@illegals = /["*~]/
|
|
97
|
+
def remove_illegals
|
|
98
|
+
@text.gsub! @@illegals, '' unless @text.blank?
|
|
99
|
+
end
|
|
100
|
+
|
|
101
|
+
# Visitor for tokenizer.
|
|
102
|
+
#
|
|
103
|
+
# TODO Rewrite!!!
|
|
104
|
+
#
|
|
105
|
+
def tokenize_with tokenizer
|
|
106
|
+
@text = tokenizer.normalize @text
|
|
107
|
+
end
|
|
108
|
+
# TODO spec!
|
|
109
|
+
#
|
|
110
|
+
# TODO Rewrite!!
|
|
111
|
+
#
|
|
112
|
+
def tokenized tokenizer
|
|
113
|
+
tokenizer.tokenize(@text.to_s).each do |text|
|
|
114
|
+
yield text
|
|
115
|
+
end
|
|
116
|
+
end
|
|
117
|
+
|
|
118
|
+
# Returns an array of possible combinations.
|
|
119
|
+
#
|
|
120
|
+
def possible_combinations_in type
|
|
121
|
+
type.possible_combinations self
|
|
122
|
+
end
|
|
123
|
+
|
|
124
|
+
# Returns a token with the next similar text.
|
|
125
|
+
#
|
|
126
|
+
# TODO Rewrite this. It is hard to understand. Also spec performance.
|
|
127
|
+
#
|
|
128
|
+
def next_similar_token category
|
|
129
|
+
token = self.dup
|
|
130
|
+
token if token.next_similar category.bundle_for(token)
|
|
131
|
+
end
|
|
132
|
+
# Sets and returns the next similar word.
|
|
133
|
+
#
|
|
134
|
+
# Note: Also overrides the original.
|
|
135
|
+
#
|
|
136
|
+
def next_similar bundle
|
|
137
|
+
@text = @original = (similarity(bundle).shift || return) if similar?
|
|
138
|
+
end
|
|
139
|
+
# Lazy similar reader.
|
|
140
|
+
#
|
|
141
|
+
def similarity bundle = nil
|
|
142
|
+
@similarity || @similarity = generate_similarity_for(bundle)
|
|
143
|
+
end
|
|
144
|
+
# Returns a copy of the list of texts that are similar to this token's text.
|
|
145
|
+
#
|
|
146
|
+
# Note: The dup isn't too nice – but it is needed on account of the shift, above.
|
|
147
|
+
# (We avoid a StopIteration exception. Which of both is less evil?)
|
|
148
|
+
#
|
|
149
|
+
def generate_similarity_for bundle
  # Asks the bundle for the texts similar to @text and returns a copy of
  # that list, so that shifting elements off it (see next_similar) does
  # not modify the bundle's own list.
  #
  # Note: The empty array fallback must be applied before the dup.
  #       With `similar(...).dup || []`, a nil result was dup'ed first,
  #       which raises a TypeError on Rubies older than 2.4.
  #
  (bundle.similar(@text) || []).dup
end
|
|
152
|
+
|
|
153
|
+
# Generates a solr term from this token.
|
|
154
|
+
#
|
|
155
|
+
# E.g. "name:heroes~0.75"
|
|
156
|
+
#
|
|
157
|
+
@@solr_fuzzy_mapping = {
|
|
158
|
+
1 => :'',
|
|
159
|
+
2 => :'',
|
|
160
|
+
3 => :'',
|
|
161
|
+
4 => :'~0.74',
|
|
162
|
+
5 => :'~0.78',
|
|
163
|
+
6 => :'~0.81',
|
|
164
|
+
7 => :'~0.83',
|
|
165
|
+
8 => :'~0.85',
|
|
166
|
+
9 => :'~0.87',
|
|
167
|
+
10 => :'~0.89'
|
|
168
|
+
}
|
|
169
|
+
@@solr_fuzzy_mapping.default = :'~0.9'
|
|
170
|
+
def to_solr
|
|
171
|
+
blank? ? '' : (to_s + @@solr_fuzzy_mapping[@text.size].to_s)
|
|
172
|
+
end
|
|
173
|
+
|
|
174
|
+
#
|
|
175
|
+
#
|
|
176
|
+
def to_result
|
|
177
|
+
[@original, @text]
|
|
178
|
+
end
|
|
179
|
+
|
|
180
|
+
# Internal identifier.
|
|
181
|
+
#
|
|
182
|
+
# TODO Uh.
|
|
183
|
+
#
|
|
184
|
+
def identifier
|
|
185
|
+
"#{similar?? :similarity : :index}:#{@text}"
|
|
186
|
+
end
|
|
187
|
+
|
|
188
|
+
# Displays the qualifier text and the text, joined.
|
|
189
|
+
#
|
|
190
|
+
# e.g. name:meier
|
|
191
|
+
#
|
|
192
|
+
def to_s
|
|
193
|
+
[@qualifier, @text].compact.join ':'
|
|
194
|
+
end
|
|
195
|
+
|
|
196
|
+
private
|
|
197
|
+
|
|
198
|
+
# Splits text into a qualifier and text.
|
|
199
|
+
#
|
|
200
|
+
# Returns [qualifier, text].
|
|
201
|
+
#
|
|
202
|
+
def split unqualified_text
|
|
203
|
+
qualifier, text = (unqualified_text || '').split(':', 2)
|
|
204
|
+
if text.blank?
|
|
205
|
+
[nil, (qualifier || '')]
|
|
206
|
+
else
|
|
207
|
+
[qualifier, text]
|
|
208
|
+
end
|
|
209
|
+
end
|
|
210
|
+
|
|
211
|
+
end
|
|
212
|
+
|
|
213
|
+
end
|
|
214
|
+
|
|
215
|
+
end
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
# encoding: utf-8
|
|
2
|
+
#
|
|
3
|
+
module Internals
|
|
4
|
+
|
|
5
|
+
#
|
|
6
|
+
#
|
|
7
|
+
module Query
|
|
8
|
+
|
|
9
|
+
# This class primarily handles switching through similar token constellations.
|
|
10
|
+
#
|
|
11
|
+
class Tokens # :nodoc:all
|
|
12
|
+
|
|
13
|
+
# Basically delegates to its internal tokens array.
|
|
14
|
+
#
|
|
15
|
+
self.delegate *[Enumerable.instance_methods, :slice!, :[], :uniq!, :last, :reject!, :length, :size, :empty?, :each, :exit, { :to => :@tokens }].flatten
|
|
16
|
+
|
|
17
|
+
#
|
|
18
|
+
#
|
|
19
|
+
def initialize tokens = []
|
|
20
|
+
@tokens = tokens
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
#
|
|
24
|
+
#
|
|
25
|
+
def tokenize_with tokenizer
|
|
26
|
+
@tokens.each { |token| token.tokenize_with(tokenizer) }
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
# Generates an array in the form of
|
|
30
|
+
# [
|
|
31
|
+
# [combination], # of token 1
|
|
32
|
+
# [combination, combination, combination], # of token 2
|
|
33
|
+
# [combination, combination] # of token 3
|
|
34
|
+
# ]
|
|
35
|
+
#
|
|
36
|
+
# TODO If we want token behaviour defined per Query, we can
|
|
37
|
+
# compact! here
|
|
38
|
+
#
|
|
39
|
+
def possible_combinations_in type
  # Collects, in token order, what each token returns from its own
  # possible_combinations_in for the given type. The result is a new
  # array with exactly one entry per token.
  #
  @tokens.map { |token| token.possible_combinations_in type }
  # TODO compact! if ignore_unassigned_tokens
end
|
|
45
|
+
|
|
46
|
+
# Makes the last of the tokens partial.
|
|
47
|
+
#
|
|
48
|
+
def partialize_last
|
|
49
|
+
@tokens.last.partial = true unless empty?
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
# Caps the tokens to the maximum.
|
|
53
|
+
#
|
|
54
|
+
def cap maximum
|
|
55
|
+
@tokens.slice!(maximum..-1) if cap?(maximum)
|
|
56
|
+
end
|
|
57
|
+
def cap? maximum
|
|
58
|
+
@tokens.size > maximum
|
|
59
|
+
end
|
|
60
|
+
|
|
61
|
+
# Rejects blank tokens.
|
|
62
|
+
#
|
|
63
|
+
def reject
|
|
64
|
+
@tokens.reject! &:blank?
|
|
65
|
+
end
|
|
66
|
+
|
|
67
|
+
# Returns a solr query.
|
|
68
|
+
#
|
|
69
|
+
def to_solr_query
|
|
70
|
+
@tokens.map(&:to_solr).join ' '
|
|
71
|
+
end
|
|
72
|
+
|
|
73
|
+
#
|
|
74
|
+
#
|
|
75
|
+
def originals
|
|
76
|
+
@tokens.map(&:original)
|
|
77
|
+
end
|
|
78
|
+
|
|
79
|
+
# Just join the token original texts.
|
|
80
|
+
#
|
|
81
|
+
def to_s
|
|
82
|
+
originals.join ' '
|
|
83
|
+
end
|
|
84
|
+
|
|
85
|
+
end
|
|
86
|
+
|
|
87
|
+
end
|
|
88
|
+
|
|
89
|
+
end
|
|
File without changes
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
module Internals
|
|
2
|
+
|
|
3
|
+
module Results # :nodoc:all
|
|
4
|
+
|
|
5
|
+
# This is the internal results object. Usually, to_marshal, or to_json
|
|
6
|
+
# is called on it to get a string for the answer.
|
|
7
|
+
#
|
|
8
|
+
class Base
|
|
9
|
+
|
|
10
|
+
# Duration is set externally by the query.
|
|
11
|
+
#
|
|
12
|
+
attr_writer :duration
|
|
13
|
+
attr_reader :allocations, :offset
|
|
14
|
+
|
|
15
|
+
# Takes instances of Query::Allocations as param.
|
|
16
|
+
#
|
|
17
|
+
def initialize offset = 0, allocations = Query::Allocations.new
|
|
18
|
+
@offset = offset
|
|
19
|
+
@allocations = allocations # || Query::Allocations.new
|
|
20
|
+
end
|
|
21
|
+
# Create new results and calculate the ids.
|
|
22
|
+
#
|
|
23
|
+
def self.from offset, allocations
|
|
24
|
+
results = new offset, allocations
|
|
25
|
+
results.prepare!
|
|
26
|
+
results
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
#
|
|
30
|
+
#
|
|
31
|
+
def serialize
|
|
32
|
+
{ allocations: allocations.to_result,
|
|
33
|
+
offset: offset,
|
|
34
|
+
duration: duration,
|
|
35
|
+
total: total }
|
|
36
|
+
end
|
|
37
|
+
# The default format is json.
|
|
38
|
+
#
|
|
39
|
+
def to_response options = {}
|
|
40
|
+
to_json options
|
|
41
|
+
end
|
|
42
|
+
# Convert to json format.
|
|
43
|
+
#
|
|
44
|
+
def to_json options = {}
|
|
45
|
+
serialize.to_json options
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
# This starts the actual processing.
|
|
49
|
+
#
|
|
50
|
+
# Without this, the allocations are not processed,
|
|
51
|
+
# and no ids are calculated.
|
|
52
|
+
#
|
|
53
|
+
def prepare!
|
|
54
|
+
allocations.process! self.max_results, self.offset
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
# Duration default is 0.
|
|
58
|
+
#
|
|
59
|
+
def duration
|
|
60
|
+
@duration || 0
|
|
61
|
+
end
|
|
62
|
+
# The total results. Delegates to the allocations.
|
|
63
|
+
#
|
|
64
|
+
# Caches.
|
|
65
|
+
#
|
|
66
|
+
def total
|
|
67
|
+
@total || @total = allocations.total || 0
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
# How many results are returned.
|
|
71
|
+
#
|
|
72
|
+
# Set in config using
|
|
73
|
+
# Results::Full.max_results = 20
|
|
74
|
+
#
|
|
75
|
+
class_inheritable_accessor :max_results
|
|
76
|
+
def max_results
|
|
77
|
+
self.class.max_results
|
|
78
|
+
end
|
|
79
|
+
|
|
80
|
+
# Convenience methods.
|
|
81
|
+
#
|
|
82
|
+
|
|
83
|
+
# Delegates to allocations.
|
|
84
|
+
#
|
|
85
|
+
def ids amount = 20
|
|
86
|
+
allocations.ids amount
|
|
87
|
+
end
|
|
88
|
+
# Gets an amount of random ids from the allocations.
|
|
89
|
+
#
|
|
90
|
+
# Note: Basically delegates to the allocations.
|
|
91
|
+
#
|
|
92
|
+
def random_ids amount = 1
|
|
93
|
+
allocations.random_ids amount
|
|
94
|
+
end
|
|
95
|
+
|
|
96
|
+
# Human readable log.
|
|
97
|
+
#
|
|
98
|
+
def to_log query
|
|
99
|
+
"|#{Time.now.to_s(:db)}|#{'%8f' % duration}|#{'%-50s' % query}|#{'%8d' % total}|#{'%4d' % offset}|#{'%2d' % allocations.size}|"
|
|
100
|
+
end
|
|
101
|
+
|
|
102
|
+
end
|
|
103
|
+
|
|
104
|
+
end
|
|
105
|
+
|
|
106
|
+
end
|
|
File without changes
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
module Internals
|
|
2
|
+
|
|
3
|
+
module Tokenizers # :nodoc:all
|
|
4
|
+
|
|
5
|
+
# Defines tokenizing processes used both in indexing and querying.
|
|
6
|
+
#
|
|
7
|
+
class Base
|
|
8
|
+
|
|
9
|
+
# TODO Move EMPTY_STRING top level.
|
|
10
|
+
#
|
|
11
|
+
EMPTY_STRING = ''.freeze
|
|
12
|
+
|
|
13
|
+
# Stopwords.
|
|
14
|
+
#
|
|
15
|
+
def stopwords regexp
|
|
16
|
+
@remove_stopwords_regexp = regexp
|
|
17
|
+
end
|
|
18
|
+
def remove_stopwords text
|
|
19
|
+
text.gsub! @remove_stopwords_regexp, EMPTY_STRING if @remove_stopwords_regexp
|
|
20
|
+
text
|
|
21
|
+
end
|
|
22
|
+
@@non_single_stopword_regexp = /^\b[\w:]+?\b[\.\*\~]?\s?$/
|
|
23
|
+
def remove_non_single_stopwords text
|
|
24
|
+
return text if text.match @@non_single_stopword_regexp
|
|
25
|
+
remove_stopwords text
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
# Illegals.
|
|
29
|
+
#
|
|
30
|
+
# TODO Should there be a legal?
|
|
31
|
+
#
|
|
32
|
+
def removes_characters regexp
|
|
33
|
+
@removes_characters_regexp = regexp
|
|
34
|
+
end
|
|
35
|
+
def remove_illegals text
|
|
36
|
+
text.gsub! @removes_characters_regexp, EMPTY_STRING if @removes_characters_regexp
|
|
37
|
+
text
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
# Splitting.
|
|
41
|
+
#
|
|
42
|
+
def splits_text_on regexp
|
|
43
|
+
@splits_text_on_regexp = regexp
|
|
44
|
+
end
|
|
45
|
+
def split text
|
|
46
|
+
text.split @splits_text_on_regexp
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
# Normalizing.
|
|
50
|
+
#
|
|
51
|
+
def normalizes_words regexp_replaces
|
|
52
|
+
@normalizes_words_regexp_replaces = regexp_replaces
|
|
53
|
+
end
|
|
54
|
+
def normalize_with_patterns text
  # Normalizes the given text in place and returns it.
  #
  # 1. If normalizing patterns are configured, they are tried in order
  #    and only the first one that actually changes the text is applied.
  # 2. The characters configured to be removed after splitting are
  #    removed.
  #
  # Note: Step 2 used to be skipped whenever no normalizing patterns were
  #       configured (early return), so removes_characters_after_splitting
  #       silently had no effect on its own. It is now always applied.
  #
  if @normalizes_words_regexp_replaces
    @normalizes_words_regexp_replaces.each do |regex, replace|
      # gsub! returns nil if nothing was replaced, so we stop at the
      # first pattern that matched. This should be sufficient.
      #
      break if text.gsub!(regex, replace)
    end
  end
  remove_after_normalizing_illegals text
  text
end
|
|
65
|
+
|
|
66
|
+
# Illegal after normalizing.
|
|
67
|
+
#
|
|
68
|
+
def removes_characters_after_splitting regexp
|
|
69
|
+
@removes_characters_after_splitting_regexp = regexp
|
|
70
|
+
end
|
|
71
|
+
def remove_after_normalizing_illegals text
|
|
72
|
+
text.gsub! @removes_characters_after_splitting_regexp, EMPTY_STRING if @removes_characters_after_splitting_regexp
|
|
73
|
+
end
|
|
74
|
+
|
|
75
|
+
# Substitute Characters with this substituter.
|
|
76
|
+
#
|
|
77
|
+
# Default is European Character substitution.
|
|
78
|
+
#
|
|
79
|
+
def substitutes_characters_with substituter = CharacterSubstituters::WestEuropean.new
|
|
80
|
+
# TODO Raise if it doesn't quack substitute?
|
|
81
|
+
@substituter = substituter
|
|
82
|
+
end
|
|
83
|
+
def substitute_characters text
|
|
84
|
+
substituter?? substituter.substitute(text) : text
|
|
85
|
+
end
|
|
86
|
+
|
|
87
|
+
# Reject tokens after tokenizing based on the given criteria.
|
|
88
|
+
#
|
|
89
|
+
# Note: Currently only for indexing. TODO Redesign and write for both!
|
|
90
|
+
#
|
|
91
|
+
def reject_token_if &condition
|
|
92
|
+
@reject_condition = condition
|
|
93
|
+
end
|
|
94
|
+
def reject tokens
|
|
95
|
+
tokens.reject! &@reject_condition
|
|
96
|
+
end
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
# Returns a number of tokens, generated from the given text.
|
|
100
|
+
#
|
|
101
|
+
# Note:
|
|
102
|
+
# * preprocess, pretokenize are hooks
|
|
103
|
+
#
|
|
104
|
+
def tokenize text
|
|
105
|
+
text = preprocess text # processing the text
|
|
106
|
+
return empty_tokens if text.blank?
|
|
107
|
+
words = pretokenize text # splitting and preparations for tokenizing
|
|
108
|
+
return empty_tokens if words.empty?
|
|
109
|
+
tokens = tokens_for words # creating tokens / strings
|
|
110
|
+
process tokens # processing tokens / strings
|
|
111
|
+
end
|
|
112
|
+
|
|
113
|
+
attr_reader :substituter
|
|
114
|
+
alias substituter? substituter
|
|
115
|
+
|
|
116
|
+
def initialize options = {}
|
|
117
|
+
removes_characters options[:removes_characters] if options[:removes_characters]
|
|
118
|
+
contracts_expressions *options[:contracts_expressions] if options[:contracts_expressions]
|
|
119
|
+
stopwords options[:stopwords] if options[:stopwords]
|
|
120
|
+
normalizes_words options[:normalizes_words] if options[:normalizes_words]
|
|
121
|
+
removes_characters_after_splitting options[:removes_characters_after_splitting] if options[:removes_characters_after_splitting]
|
|
122
|
+
substitutes_characters_with options[:substitutes_characters_with] if options[:substitutes_characters_with]
|
|
123
|
+
|
|
124
|
+
# Defaults.
|
|
125
|
+
#
|
|
126
|
+
splits_text_on options[:splits_text_on] || /\s/
|
|
127
|
+
reject_token_if &(options[:reject_token_if] || :blank?)
|
|
128
|
+
end
|
|
129
|
+
|
|
130
|
+
# Hooks.
|
|
131
|
+
#
|
|
132
|
+
|
|
133
|
+
# Preprocessing.
|
|
134
|
+
#
|
|
135
|
+
def preprocess text; end
|
|
136
|
+
# Pretokenizing.
|
|
137
|
+
#
|
|
138
|
+
def pretokenize text; end
|
|
139
|
+
# Postprocessing.
|
|
140
|
+
#
|
|
141
|
+
def process tokens
|
|
142
|
+
reject tokens # Reject any tokens that don't meet criteria
|
|
143
|
+
tokens
|
|
144
|
+
end
|
|
145
|
+
|
|
146
|
+
# Converts words into real tokens.
|
|
147
|
+
#
|
|
148
|
+
def tokens_for words
|
|
149
|
+
Internals::Query::Tokens.new words.collect! { |word| token_for word }
|
|
150
|
+
end
|
|
151
|
+
# Turns non-blank text into symbols.
|
|
152
|
+
#
|
|
153
|
+
def symbolize text
|
|
154
|
+
text.blank? ? nil : text.to_sym
|
|
155
|
+
end
|
|
156
|
+
# Returns a tokens object.
|
|
157
|
+
#
|
|
158
|
+
def empty_tokens
|
|
159
|
+
Internals::Query::Tokens.new
|
|
160
|
+
end
|
|
161
|
+
|
|
162
|
+
end
|
|
163
|
+
|
|
164
|
+
end
|
|
165
|
+
|
|
166
|
+
end
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
module Internals
|
|
2
|
+
|
|
3
|
+
module Tokenizers
|
|
4
|
+
|
|
5
|
+
# The base indexing tokenizer.
|
|
6
|
+
#
|
|
7
|
+
# Override in indexing subclasses and define in configuration.
|
|
8
|
+
#
|
|
9
|
+
class Index < Base
|
|
10
|
+
|
|
11
|
+
def self.default= new_default
|
|
12
|
+
@default = new_default
|
|
13
|
+
end
|
|
14
|
+
def self.default
|
|
15
|
+
@default ||= new
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
# Default indexing preprocessing hook.
|
|
19
|
+
#
|
|
20
|
+
# Does:
|
|
21
|
+
# 1. Character substitution.
|
|
22
|
+
# 2. Downcasing.
|
|
23
|
+
# 3. Remove illegal expressions.
|
|
24
|
+
# 4. Remove non-single stopwords. (Stopwords that occur with other words)
|
|
25
|
+
#
|
|
26
|
+
def preprocess text
|
|
27
|
+
text = substitute_characters text
|
|
28
|
+
text.downcase!
|
|
29
|
+
remove_illegals text
|
|
30
|
+
# we do not remove single stopwords for an entirely different
|
|
31
|
+
# reason than in the query tokenizer.
|
|
32
|
+
# An indexed thing with just name "UND" (a possible stopword) should not lose its name.
|
|
33
|
+
#
|
|
34
|
+
remove_non_single_stopwords text
|
|
35
|
+
text
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
# Default indexing pretokenizing hook.
|
|
39
|
+
#
|
|
40
|
+
# Does:
|
|
41
|
+
# 1. Split the text into words.
|
|
42
|
+
# 2. Normalize each word.
|
|
43
|
+
#
|
|
44
|
+
def pretokenize text
  # Splits the text into words, normalizes each word and returns the
  # array of words.
  #
  # Note: The return value of normalize_with_patterns is not used – the
  #       words stay the very same objects – so a plain each is enough.
  #       (This used to be a collect! whose block returned the word
  #       itself.)
  #
  words = split text
  words.each { |word| normalize_with_patterns word }
end
|
|
51
|
+
|
|
52
|
+
# Does not actually return a token, but a
|
|
53
|
+
# symbol "token".
|
|
54
|
+
#
|
|
55
|
+
def token_for text
|
|
56
|
+
symbolize text
|
|
57
|
+
end
|
|
58
|
+
|
|
59
|
+
end
|
|
60
|
+
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
end
|