picky 1.4.1 → 1.4.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (229) hide show
  1. data/lib/picky/{alias_instances.rb → aliases.rb} +1 -3
  2. data/lib/picky/application.rb +18 -19
  3. data/lib/picky/cores.rb +1 -1
  4. data/lib/picky/generators/aliases.rb +3 -0
  5. data/lib/picky/index/base.rb +179 -0
  6. data/lib/picky/index/memory.rb +28 -0
  7. data/lib/picky/index/redis.rb +28 -0
  8. data/lib/picky/{indexes_api.rb → index_bundle.rb} +16 -16
  9. data/lib/picky/indexed/indexes.rb +11 -7
  10. data/lib/picky/indexing/indexes.rb +14 -8
  11. data/lib/picky/internals/adapters/rack/base.rb +27 -0
  12. data/lib/picky/internals/adapters/rack/live_parameters.rb +37 -0
  13. data/lib/picky/internals/adapters/rack/query.rb +63 -0
  14. data/lib/picky/internals/adapters/rack.rb +34 -0
  15. data/lib/picky/{calculations → internals/calculations}/location.rb +0 -0
  16. data/lib/picky/{cli.rb → internals/cli.rb} +0 -0
  17. data/lib/picky/{configuration → internals/configuration}/index.rb +8 -2
  18. data/lib/picky/{ext → internals/ext}/maybe_compile.rb +0 -0
  19. data/lib/picky/{ext → internals/ext}/ruby19/extconf.rb +0 -0
  20. data/lib/picky/{ext → internals/ext}/ruby19/performant.c +0 -0
  21. data/lib/picky/{extensions → internals/extensions}/array.rb +0 -0
  22. data/lib/picky/{extensions → internals/extensions}/hash.rb +0 -0
  23. data/lib/picky/{extensions → internals/extensions}/module.rb +0 -0
  24. data/lib/picky/{extensions → internals/extensions}/object.rb +0 -0
  25. data/lib/picky/{extensions → internals/extensions}/symbol.rb +0 -0
  26. data/lib/picky/internals/frontend_adapters/rack.rb +154 -0
  27. data/lib/picky/internals/generators/base.rb +19 -0
  28. data/lib/picky/internals/generators/partial/default.rb +7 -0
  29. data/lib/picky/internals/generators/partial/none.rb +35 -0
  30. data/lib/picky/internals/generators/partial/strategy.rb +29 -0
  31. data/lib/picky/internals/generators/partial/substring.rb +122 -0
  32. data/lib/picky/internals/generators/partial_generator.rb +19 -0
  33. data/lib/picky/internals/generators/similarity/default.rb +9 -0
  34. data/lib/picky/internals/generators/similarity/double_levenshtone.rb +81 -0
  35. data/lib/picky/internals/generators/similarity/none.rb +35 -0
  36. data/lib/picky/internals/generators/similarity/strategy.rb +11 -0
  37. data/lib/picky/internals/generators/similarity_generator.rb +19 -0
  38. data/lib/picky/internals/generators/strategy.rb +18 -0
  39. data/lib/picky/internals/generators/weights/default.rb +9 -0
  40. data/lib/picky/internals/generators/weights/logarithmic.rb +43 -0
  41. data/lib/picky/internals/generators/weights/strategy.rb +11 -0
  42. data/lib/picky/internals/generators/weights_generator.rb +19 -0
  43. data/lib/picky/{helpers → internals/helpers}/measuring.rb +0 -0
  44. data/lib/picky/internals/index/backend.rb +113 -0
  45. data/lib/picky/internals/index/file/basic.rb +101 -0
  46. data/lib/picky/internals/index/file/json.rb +38 -0
  47. data/lib/picky/internals/index/file/marshal.rb +38 -0
  48. data/lib/picky/internals/index/file/text.rb +60 -0
  49. data/lib/picky/internals/index/files.rb +24 -0
  50. data/lib/picky/internals/index/redis/basic.rb +77 -0
  51. data/lib/picky/internals/index/redis/list_hash.rb +46 -0
  52. data/lib/picky/internals/index/redis/string_hash.rb +35 -0
  53. data/lib/picky/internals/index/redis.rb +44 -0
  54. data/lib/picky/internals/indexed/bundle/base.rb +72 -0
  55. data/lib/picky/internals/indexed/bundle/memory.rb +69 -0
  56. data/lib/picky/internals/indexed/bundle/redis.rb +70 -0
  57. data/lib/picky/internals/indexed/categories.rb +135 -0
  58. data/lib/picky/internals/indexed/category.rb +90 -0
  59. data/lib/picky/internals/indexed/index.rb +57 -0
  60. data/lib/picky/{indexed → internals/indexed}/wrappers/bundle/calculation.rb +0 -0
  61. data/lib/picky/{indexed → internals/indexed}/wrappers/bundle/location.rb +4 -2
  62. data/lib/picky/{indexed → internals/indexed}/wrappers/bundle/wrapper.rb +1 -1
  63. data/lib/picky/internals/indexed/wrappers/exact_first.rb +65 -0
  64. data/lib/picky/{indexers → internals/indexers}/no_source_specified_error.rb +0 -0
  65. data/lib/picky/{indexers → internals/indexers}/serial.rb +2 -2
  66. data/lib/picky/{indexers → internals/indexers}/solr.rb +0 -0
  67. data/lib/picky/internals/indexing/bundle/base.rb +219 -0
  68. data/lib/picky/internals/indexing/bundle/memory.rb +25 -0
  69. data/lib/picky/internals/indexing/bundle/redis.rb +28 -0
  70. data/lib/picky/internals/indexing/bundle/super_base.rb +65 -0
  71. data/lib/picky/internals/indexing/categories.rb +42 -0
  72. data/lib/picky/internals/indexing/category.rb +120 -0
  73. data/lib/picky/internals/indexing/index.rb +67 -0
  74. data/lib/picky/{performant.rb → internals/performant.rb} +0 -0
  75. data/lib/picky/internals/query/allocation.rb +88 -0
  76. data/lib/picky/internals/query/allocations.rb +137 -0
  77. data/lib/picky/internals/query/combination.rb +80 -0
  78. data/lib/picky/internals/query/combinations/base.rb +84 -0
  79. data/lib/picky/internals/query/combinations/memory.rb +58 -0
  80. data/lib/picky/internals/query/combinations/redis.rb +59 -0
  81. data/lib/picky/internals/query/indexes.rb +180 -0
  82. data/lib/picky/internals/query/qualifiers.rb +81 -0
  83. data/lib/picky/internals/query/token.rb +215 -0
  84. data/lib/picky/internals/query/tokens.rb +89 -0
  85. data/lib/picky/{query → internals/query}/weights.rb +0 -0
  86. data/lib/picky/internals/results/base.rb +106 -0
  87. data/lib/picky/internals/results/full.rb +17 -0
  88. data/lib/picky/internals/results/live.rb +17 -0
  89. data/lib/picky/{solr → internals/solr}/schema_generator.rb +0 -0
  90. data/lib/picky/internals/tokenizers/base.rb +166 -0
  91. data/lib/picky/internals/tokenizers/index.rb +63 -0
  92. data/lib/picky/internals/tokenizers/query.rb +79 -0
  93. data/lib/picky/loader.rb +148 -112
  94. data/lib/picky/query/base.rb +57 -26
  95. data/lib/picky/query/full.rb +1 -1
  96. data/lib/picky/query/live.rb +1 -1
  97. data/lib/picky/sources/db.rb +27 -6
  98. data/lib/tasks/index.rake +3 -3
  99. data/lib/tasks/try.rake +2 -2
  100. data/spec/lib/aliases_spec.rb +9 -0
  101. data/spec/lib/application_spec.rb +3 -3
  102. data/spec/lib/generators/aliases_spec.rb +1 -0
  103. data/spec/lib/{index_api_spec.rb → index/base_spec.rb} +7 -7
  104. data/spec/lib/index_bundle_spec.rb +71 -0
  105. data/spec/lib/indexed/indexes_spec.rb +61 -0
  106. data/spec/lib/indexing/indexes_spec.rb +94 -24
  107. data/spec/lib/{adapters → internals/adapters}/rack/base_spec.rb +2 -2
  108. data/spec/lib/{adapters → internals/adapters}/rack/live_parameters_spec.rb +2 -2
  109. data/spec/lib/{adapters → internals/adapters}/rack/query_spec.rb +2 -2
  110. data/spec/lib/{calculations → internals/calculations}/location_spec.rb +0 -0
  111. data/spec/lib/{cli_spec.rb → internals/cli_spec.rb} +4 -1
  112. data/spec/lib/{configuration → internals/configuration}/index_spec.rb +1 -1
  113. data/spec/lib/{cores_spec.rb → internals/cores_spec.rb} +0 -0
  114. data/spec/lib/{extensions → internals/extensions}/array_spec.rb +0 -0
  115. data/spec/lib/{extensions → internals/extensions}/hash_spec.rb +0 -0
  116. data/spec/lib/{extensions → internals/extensions}/module_spec.rb +0 -0
  117. data/spec/lib/{extensions → internals/extensions}/object_spec.rb +0 -0
  118. data/spec/lib/{extensions → internals/extensions}/symbol_spec.rb +0 -0
  119. data/spec/lib/{frontend_adapters → internals/frontend_adapters}/rack_spec.rb +11 -11
  120. data/spec/lib/{cacher → internals/generators}/cacher_strategy_spec.rb +2 -2
  121. data/spec/lib/internals/generators/partial/default_spec.rb +17 -0
  122. data/spec/lib/internals/generators/partial/none_spec.rb +17 -0
  123. data/spec/lib/{cacher → internals/generators}/partial/substring_spec.rb +26 -27
  124. data/spec/lib/{cacher → internals/generators}/partial_generator_spec.rb +5 -5
  125. data/spec/lib/{cacher → internals/generators}/similarity/double_levenshtone_spec.rb +4 -4
  126. data/spec/lib/{cacher → internals/generators}/similarity/none_spec.rb +2 -2
  127. data/spec/lib/{cacher → internals/generators}/similarity_generator_spec.rb +4 -4
  128. data/spec/lib/{cacher → internals/generators}/weights/logarithmic_spec.rb +2 -2
  129. data/spec/lib/internals/generators/weights_generator_spec.rb +21 -0
  130. data/spec/lib/{helpers → internals/helpers}/measuring_spec.rb +0 -0
  131. data/spec/lib/{index → internals/index}/file/basic_spec.rb +2 -2
  132. data/spec/lib/{index → internals/index}/file/json_spec.rb +2 -2
  133. data/spec/lib/{index → internals/index}/file/marshal_spec.rb +2 -2
  134. data/spec/lib/{index → internals/index}/file/text_spec.rb +2 -2
  135. data/spec/lib/{index → internals/index}/files_spec.rb +2 -2
  136. data/spec/lib/{indexed/bundle_spec.rb → internals/indexed/bundle/memory_spec.rb} +4 -5
  137. data/spec/lib/{indexed → internals/indexed}/categories_spec.rb +13 -13
  138. data/spec/lib/{indexed → internals/indexed}/category_spec.rb +59 -32
  139. data/spec/lib/{indexed → internals/indexed}/index_spec.rb +5 -5
  140. data/spec/lib/{indexed → internals/indexed}/wrappers/bundle/calculation_spec.rb +0 -0
  141. data/spec/lib/{indexed → internals/indexed}/wrappers/bundle/wrapper_spec.rb +0 -0
  142. data/spec/lib/{indexed → internals/indexed}/wrappers/exact_first_spec.rb +5 -5
  143. data/spec/lib/{indexers → internals/indexers}/serial_spec.rb +0 -0
  144. data/spec/lib/{indexing/bundle_partial_generation_speed_spec.rb → internals/indexing/bundle/memory_partial_generation_speed_spec.rb} +3 -3
  145. data/spec/lib/{indexing/bundle_spec.rb → internals/indexing/bundle/memory_spec.rb} +3 -3
  146. data/spec/lib/{index/bundle_spec.rb → internals/indexing/bundle/super_base_spec.rb} +9 -3
  147. data/spec/lib/{indexing → internals/indexing}/category_spec.rb +3 -3
  148. data/spec/lib/{indexing → internals/indexing}/index_spec.rb +3 -3
  149. data/spec/lib/internals/indexing/indexes_spec.rb +36 -0
  150. data/spec/lib/{interfaces → internals/interfaces}/live_parameters_spec.rb +0 -0
  151. data/spec/lib/internals/results/base_spec.rb +105 -0
  152. data/spec/lib/internals/results/full_spec.rb +78 -0
  153. data/spec/lib/internals/results/live_spec.rb +88 -0
  154. data/spec/lib/{solr → internals/solr}/schema_generator_spec.rb +0 -0
  155. data/spec/lib/{tokenizers → internals/tokenizers}/base_spec.rb +3 -3
  156. data/spec/lib/{tokenizers → internals/tokenizers}/index_spec.rb +9 -9
  157. data/spec/lib/{tokenizers → internals/tokenizers}/query_spec.rb +11 -11
  158. data/spec/lib/query/allocation_spec.rb +12 -12
  159. data/spec/lib/query/allocations_spec.rb +19 -19
  160. data/spec/lib/query/base_spec.rb +28 -4
  161. data/spec/lib/query/combination_spec.rb +8 -9
  162. data/spec/lib/query/combinations/base_spec.rb +116 -0
  163. data/spec/lib/query/{combinations_spec.rb → combinations/memory_spec.rb} +14 -14
  164. data/spec/lib/query/combinations/redis_spec.rb +132 -0
  165. data/spec/lib/query/full_spec.rb +2 -2
  166. data/spec/lib/query/indexes_spec.rb +81 -0
  167. data/spec/lib/query/live_spec.rb +3 -3
  168. data/spec/lib/query/qualifiers_spec.rb +6 -6
  169. data/spec/lib/query/token_spec.rb +38 -38
  170. data/spec/lib/query/tokens_spec.rb +35 -35
  171. data/spec/lib/sources/db_spec.rb +23 -18
  172. metadata +212 -181
  173. data/lib/picky/adapters/rack/base.rb +0 -23
  174. data/lib/picky/adapters/rack/live_parameters.rb +0 -33
  175. data/lib/picky/adapters/rack/query.rb +0 -59
  176. data/lib/picky/adapters/rack.rb +0 -28
  177. data/lib/picky/cacher/convenience.rb +0 -3
  178. data/lib/picky/cacher/generator.rb +0 -15
  179. data/lib/picky/cacher/partial/default.rb +0 -5
  180. data/lib/picky/cacher/partial/none.rb +0 -31
  181. data/lib/picky/cacher/partial/strategy.rb +0 -21
  182. data/lib/picky/cacher/partial/substring.rb +0 -118
  183. data/lib/picky/cacher/partial_generator.rb +0 -15
  184. data/lib/picky/cacher/similarity/default.rb +0 -7
  185. data/lib/picky/cacher/similarity/double_levenshtone.rb +0 -77
  186. data/lib/picky/cacher/similarity/none.rb +0 -31
  187. data/lib/picky/cacher/similarity/strategy.rb +0 -9
  188. data/lib/picky/cacher/similarity_generator.rb +0 -15
  189. data/lib/picky/cacher/strategy.rb +0 -12
  190. data/lib/picky/cacher/weights/default.rb +0 -7
  191. data/lib/picky/cacher/weights/logarithmic.rb +0 -39
  192. data/lib/picky/cacher/weights/strategy.rb +0 -9
  193. data/lib/picky/cacher/weights_generator.rb +0 -15
  194. data/lib/picky/frontend_adapters/rack.rb +0 -150
  195. data/lib/picky/index/bundle.rb +0 -54
  196. data/lib/picky/index/file/basic.rb +0 -97
  197. data/lib/picky/index/file/json.rb +0 -34
  198. data/lib/picky/index/file/marshal.rb +0 -34
  199. data/lib/picky/index/file/text.rb +0 -56
  200. data/lib/picky/index/files.rb +0 -118
  201. data/lib/picky/index_api.rb +0 -175
  202. data/lib/picky/indexed/bundle.rb +0 -54
  203. data/lib/picky/indexed/categories.rb +0 -131
  204. data/lib/picky/indexed/category.rb +0 -85
  205. data/lib/picky/indexed/index.rb +0 -39
  206. data/lib/picky/indexed/wrappers/exact_first.rb +0 -61
  207. data/lib/picky/indexing/bundle.rb +0 -213
  208. data/lib/picky/indexing/categories.rb +0 -38
  209. data/lib/picky/indexing/category.rb +0 -117
  210. data/lib/picky/indexing/index.rb +0 -55
  211. data/lib/picky/query/allocation.rb +0 -82
  212. data/lib/picky/query/allocations.rb +0 -130
  213. data/lib/picky/query/combination.rb +0 -74
  214. data/lib/picky/query/combinations.rb +0 -105
  215. data/lib/picky/query/qualifiers.rb +0 -77
  216. data/lib/picky/query/token.rb +0 -202
  217. data/lib/picky/query/tokens.rb +0 -86
  218. data/lib/picky/query/weigher.rb +0 -165
  219. data/lib/picky/results/base.rb +0 -102
  220. data/lib/picky/results/full.rb +0 -13
  221. data/lib/picky/results/live.rb +0 -13
  222. data/lib/picky/tokenizers/base.rb +0 -161
  223. data/lib/picky/tokenizers/index.rb +0 -58
  224. data/lib/picky/tokenizers/query.rb +0 -74
  225. data/spec/lib/cacher/partial/default_spec.rb +0 -15
  226. data/spec/lib/cacher/partial/none_spec.rb +0 -17
  227. data/spec/lib/cacher/weights_generator_spec.rb +0 -21
  228. data/spec/lib/results/base_spec.rb +0 -257
  229. data/spec/lib/results/live_spec.rb +0 -15
@@ -0,0 +1,215 @@
1
+ module Internals
2
+
3
+ module Query
4
+
5
+ # This is a query token. Together with other tokens it makes up a query.
6
+ #
7
+ # It remembers the original form, and a normalized form.
8
+ #
9
+ # It also knows whether it needs to look for similarity (bla~), or whether it is a partial (bla*).
10
+ #
11
+ # TODO Make partial / similarity char configurable.
12
+ #
13
+ class Token # :nodoc:all
14
+
15
+ attr_reader :text, :original
16
+ attr_writer :similar
17
+
18
+ delegate :blank?, :to => :text
19
+
20
+ # Normal initializer.
21
+ #
22
+ # Note: Use this if you do not want a qualified and normalized token.
23
+ #
24
+ def initialize text
25
+ @text = text
26
+ end
27
+
28
+ # Returns a qualified and normalized token.
29
+ #
30
+ # Note: Use this in the search engine if you need a qualified
31
+ # and normalized token. I.e. one prepared for a search.
32
+ #
33
+ def self.processed text
34
+ token = new text
35
+ token.qualify
36
+ token.extract_original
37
+ token.partialize
38
+ token.similarize
39
+ token.remove_illegals
40
+ token
41
+ end
42
+
43
+ # This returns a predefined category name if the user has given one.
44
+ #
45
+ def user_defined_category_name
46
+ @qualifier
47
+ end
48
+
49
+ # Extracts a qualifier for this token and pre-assigns an allocation.
50
+ #
51
+ # Note: Removes the qualifier if it is not allowed.
52
+ #
53
+ def qualify
54
+ @qualifier, @text = split @text
55
+ @qualifier = Query::Qualifiers.instance.normalize @qualifier
56
+ end
57
+ def extract_original
58
+ @original = @text.dup
59
+ end
60
+
61
+ # Partial is a conditional setter.
62
+ #
63
+ # It is only settable if it hasn't been set yet.
64
+ #
65
+ def partial= partial
66
+ @partial = partial if @partial.nil?
67
+ end
68
+ def partial?
69
+ !@similar && @partial
70
+ end
71
+
72
+ # If the text ends with *, partialize it. If with ", don't.
73
+ #
74
+ @@no_partial = /\"\Z/
75
+ @@partial = /\*\Z/
76
+ def partialize
77
+ self.partial = false and return if @text =~ @@no_partial
78
+ self.partial = true if @text =~ @@partial
79
+ end
80
+
81
+ # If the text ends with ~ similarize it. If with ", don't.
82
+ #
83
+ @@no_similar = /\"\Z/
84
+ @@similar = /\~\Z/
85
+ def similarize
86
+ self.similar = false and return if @text =~ @@no_similar
87
+ self.similar = true if @text =~ @@similar
88
+ end
89
+
90
+ def similar?
91
+ @similar
92
+ end
93
+
94
+ # Normalizes this token's text.
95
+ #
96
+ @@illegals = /["*~]/
97
+ def remove_illegals
98
+ @text.gsub! @@illegals, '' unless @text.blank?
99
+ end
100
+
101
+ # Visitor for tokenizer.
102
+ #
103
+ # TODO Rewrite!!!
104
+ #
105
+ def tokenize_with tokenizer
106
+ @text = tokenizer.normalize @text
107
+ end
108
+ # TODO spec!
109
+ #
110
+ # TODO Rewrite!!
111
+ #
112
+ def tokenized tokenizer
113
+ tokenizer.tokenize(@text.to_s).each do |text|
114
+ yield text
115
+ end
116
+ end
117
+
118
+ # Returns an array of possible combinations.
119
+ #
120
+ def possible_combinations_in type
121
+ type.possible_combinations self
122
+ end
123
+
124
+ # Returns a token with the next similar text.
125
+ #
126
+ # TODO Rewrite this. It is hard to understand. Also spec performance.
127
+ #
128
+ def next_similar_token category
129
+ token = self.dup
130
+ token if token.next_similar category.bundle_for(token)
131
+ end
132
+ # Sets and returns the next similar word.
133
+ #
134
+ # Note: Also overrides the original.
135
+ #
136
+ def next_similar bundle
137
+ @text = @original = (similarity(bundle).shift || return) if similar?
138
+ end
139
+ # Lazy similar reader.
140
+ #
141
+ def similarity bundle = nil
142
+ @similarity || @similarity = generate_similarity_for(bundle)
143
+ end
144
+ # Returns a copy of the similar texts the bundle has for this token's text.
145
+ #
146
+ # Note: The dup isn't too nice, but it is needed on account of the shift in next_similar, above.
147
+ # (We avoid a StopIteration exception. Which of both is less evil?)
148
+ #
149
+ def generate_similarity_for bundle
150
+ bundle.similar(@text).dup || []
151
+ end
152
+
153
+ # Generates a solr term from this token.
154
+ #
155
+ # E.g. "name:heroes~0.75"
156
+ #
157
+ @@solr_fuzzy_mapping = {
158
+ 1 => :'',
159
+ 2 => :'',
160
+ 3 => :'',
161
+ 4 => :'~0.74',
162
+ 5 => :'~0.78',
163
+ 6 => :'~0.81',
164
+ 7 => :'~0.83',
165
+ 8 => :'~0.85',
166
+ 9 => :'~0.87',
167
+ 10 => :'~0.89'
168
+ }
169
+ @@solr_fuzzy_mapping.default = :'~0.9'
170
+ def to_solr
171
+ blank? ? '' : (to_s + @@solr_fuzzy_mapping[@text.size].to_s)
172
+ end
173
+
174
+ #
175
+ #
176
+ def to_result
177
+ [@original, @text]
178
+ end
179
+
180
+ # Internal identifier.
181
+ #
182
+ # TODO Uh.
183
+ #
184
+ def identifier
185
+ "#{similar?? :similarity : :index}:#{@text}"
186
+ end
187
+
188
+ # Displays the qualifier text and the text, joined.
189
+ #
190
+ # e.g. name:meier
191
+ #
192
+ def to_s
193
+ [@qualifier, @text].compact.join ':'
194
+ end
195
+
196
+ private
197
+
198
+ # Splits text into a qualifier and text.
199
+ #
200
+ # Returns [qualifier, text].
201
+ #
202
+ def split unqualified_text
203
+ qualifier, text = (unqualified_text || '').split(':', 2)
204
+ if text.blank?
205
+ [nil, (qualifier || '')]
206
+ else
207
+ [qualifier, text]
208
+ end
209
+ end
210
+
211
+ end
212
+
213
+ end
214
+
215
+ end
@@ -0,0 +1,89 @@
1
+ # encoding: utf-8
2
+ #
3
+ module Internals
4
+
5
+ #
6
+ #
7
+ module Query
8
+
9
+ # This class primarily handles switching through similar token constellations.
10
+ #
11
+ class Tokens # :nodoc:all
12
+
13
+ # Basically delegates to its internal tokens array.
14
+ #
15
+ self.delegate *[Enumerable.instance_methods, :slice!, :[], :uniq!, :last, :reject!, :length, :size, :empty?, :each, :exit, { :to => :@tokens }].flatten
16
+
17
+ #
18
+ #
19
+ def initialize tokens = []
20
+ @tokens = tokens
21
+ end
22
+
23
+ #
24
+ #
25
+ def tokenize_with tokenizer
26
+ @tokens.each { |token| token.tokenize_with(tokenizer) }
27
+ end
28
+
29
+ # Generates an array in the form of
30
+ # [
31
+ # [combination], # of token 1
32
+ # [combination, combination, combination], # of token 2
33
+ # [combination, combination] # of token 3
34
+ # ]
35
+ #
36
+ # TODO If we want token behaviour defined per Query, we can
37
+ # compact! here
38
+ #
39
+ def possible_combinations_in type
40
+ @tokens.inject([]) do |combinations, token|
41
+ combinations << token.possible_combinations_in(type)
42
+ end
43
+ # TODO compact! if ignore_unassigned_tokens
44
+ end
45
+
46
+ # Makes the last of the tokens partial.
47
+ #
48
+ def partialize_last
49
+ @tokens.last.partial = true unless empty?
50
+ end
51
+
52
+ # Caps the tokens to the maximum.
53
+ #
54
+ def cap maximum
55
+ @tokens.slice!(maximum..-1) if cap?(maximum)
56
+ end
57
+ def cap? maximum
58
+ @tokens.size > maximum
59
+ end
60
+
61
+ # Rejects blank tokens.
62
+ #
63
+ def reject
64
+ @tokens.reject! &:blank?
65
+ end
66
+
67
+ # Returns a solr query.
68
+ #
69
+ def to_solr_query
70
+ @tokens.map(&:to_solr).join ' '
71
+ end
72
+
73
+ #
74
+ #
75
+ def originals
76
+ @tokens.map(&:original)
77
+ end
78
+
79
+ # Just join the token original texts.
80
+ #
81
+ def to_s
82
+ originals.join ' '
83
+ end
84
+
85
+ end
86
+
87
+ end
88
+
89
+ end
File without changes
@@ -0,0 +1,106 @@
1
+ module Internals
2
+
3
+ module Results # :nodoc:all
4
+
5
+ # This is the internal results object. Usually, to_marshal, or to_json
6
+ # is called on it to get a string for the answer.
7
+ #
8
+ class Base
9
+
10
+ # Duration is set externally by the query.
11
+ #
12
+ attr_writer :duration
13
+ attr_reader :allocations, :offset
14
+
15
+ # Takes instances of Query::Allocations as param.
16
+ #
17
+ def initialize offset = 0, allocations = Query::Allocations.new
18
+ @offset = offset
19
+ @allocations = allocations # || Query::Allocations.new
20
+ end
21
+ # Create new results and calculate the ids.
22
+ #
23
+ def self.from offset, allocations
24
+ results = new offset, allocations
25
+ results.prepare!
26
+ results
27
+ end
28
+
29
+ #
30
+ #
31
+ def serialize
32
+ { allocations: allocations.to_result,
33
+ offset: offset,
34
+ duration: duration,
35
+ total: total }
36
+ end
37
+ # The default format is json.
38
+ #
39
+ def to_response options = {}
40
+ to_json options
41
+ end
42
+ # Convert to json format.
43
+ #
44
+ def to_json options = {}
45
+ serialize.to_json options
46
+ end
47
+
48
+ # This starts the actual processing.
49
+ #
50
+ # Without this, the allocations are not processed,
51
+ # and no ids are calculated.
52
+ #
53
+ def prepare!
54
+ allocations.process! self.max_results, self.offset
55
+ end
56
+
57
+ # Duration default is 0.
58
+ #
59
+ def duration
60
+ @duration || 0
61
+ end
62
+ # The total results. Delegates to the allocations.
63
+ #
64
+ # Caches.
65
+ #
66
+ def total
67
+ @total || @total = allocations.total || 0
68
+ end
69
+
70
+ # How many results are returned.
71
+ #
72
+ # Set in config using
73
+ # Results::Full.max_results = 20
74
+ #
75
+ class_inheritable_accessor :max_results
76
+ def max_results
77
+ self.class.max_results
78
+ end
79
+
80
+ # Convenience methods.
81
+ #
82
+
83
+ # Delegates to allocations.
84
+ #
85
+ def ids amount = 20
86
+ allocations.ids amount
87
+ end
88
+ # Gets an amount of random ids from the allocations.
89
+ #
90
+ # Note: Basically delegates to the allocations.
91
+ #
92
+ def random_ids amount = 1
93
+ allocations.random_ids amount
94
+ end
95
+
96
+ # Human readable log.
97
+ #
98
+ def to_log query
99
+ "|#{Time.now.to_s(:db)}|#{'%8f' % duration}|#{'%-50s' % query}|#{'%8d' % total}|#{'%4d' % offset}|#{'%2d' % allocations.size}|"
100
+ end
101
+
102
+ end
103
+
104
+ end
105
+
106
+ end
@@ -0,0 +1,17 @@
1
+ module Internals
2
+
3
+ module Results
4
+ # Full results are limited to at most 20 results (by default).
5
+ #
6
+ class Full < Base
7
+
8
+ self.max_results = 20
9
+
10
+ def to_log *args
11
+ ?> + super
12
+ end
13
+
14
+ end
15
+ end
16
+
17
+ end
@@ -0,0 +1,17 @@
1
+ module Internals
2
+
3
+ module Results
4
+ # Live results do not return any results (max_results is 0).
5
+ #
6
+ class Live < Base
7
+
8
+ self.max_results = 0
9
+
10
+ def to_log *args
11
+ ?. + super
12
+ end
13
+
14
+ end
15
+ end
16
+
17
+ end
@@ -0,0 +1,166 @@
1
+ module Internals
2
+
3
+ module Tokenizers # :nodoc:all
4
+
5
+ # Defines tokenizing processes used both in indexing and querying.
6
+ #
7
+ class Base
8
+
9
+ # TODO Move EMPTY_STRING top level.
10
+ #
11
+ EMPTY_STRING = ''.freeze
12
+
13
+ # Stopwords.
14
+ #
15
+ def stopwords regexp
16
+ @remove_stopwords_regexp = regexp
17
+ end
18
+ def remove_stopwords text
19
+ text.gsub! @remove_stopwords_regexp, EMPTY_STRING if @remove_stopwords_regexp
20
+ text
21
+ end
22
+ @@non_single_stopword_regexp = /^\b[\w:]+?\b[\.\*\~]?\s?$/
23
+ def remove_non_single_stopwords text
24
+ return text if text.match @@non_single_stopword_regexp
25
+ remove_stopwords text
26
+ end
27
+
28
+ # Illegals.
29
+ #
30
+ # TODO Should there be a legal?
31
+ #
32
+ def removes_characters regexp
33
+ @removes_characters_regexp = regexp
34
+ end
35
+ def remove_illegals text
36
+ text.gsub! @removes_characters_regexp, EMPTY_STRING if @removes_characters_regexp
37
+ text
38
+ end
39
+
40
+ # Splitting.
41
+ #
42
+ def splits_text_on regexp
43
+ @splits_text_on_regexp = regexp
44
+ end
45
+ def split text
46
+ text.split @splits_text_on_regexp
47
+ end
48
+
49
+ # Normalizing.
50
+ #
51
+ def normalizes_words regexp_replaces
52
+ @normalizes_words_regexp_replaces = regexp_replaces
53
+ end
54
+ def normalize_with_patterns text
55
+ return text unless @normalizes_words_regexp_replaces
56
+
57
+ @normalizes_words_regexp_replaces.each do |regex, replace|
58
+ # Stop after the first pattern that substitutes something; this should be sufficient.
59
+ #
60
+ text.gsub!(regex, replace) and break
61
+ end
62
+ remove_after_normalizing_illegals text
63
+ text
64
+ end
65
+
66
+ # Illegal after normalizing.
67
+ #
68
+ def removes_characters_after_splitting regexp
69
+ @removes_characters_after_splitting_regexp = regexp
70
+ end
71
+ def remove_after_normalizing_illegals text
72
+ text.gsub! @removes_characters_after_splitting_regexp, EMPTY_STRING if @removes_characters_after_splitting_regexp
73
+ end
74
+
75
+ # Substitute Characters with this substituter.
76
+ #
77
+ # Default is European Character substitution.
78
+ #
79
+ def substitutes_characters_with substituter = CharacterSubstituters::WestEuropean.new
80
+ # TODO Raise if it doesn't quack substitute?
81
+ @substituter = substituter
82
+ end
83
+ def substitute_characters text
84
+ substituter?? substituter.substitute(text) : text
85
+ end
86
+
87
+ # Reject tokens after tokenizing based on the given criteria.
88
+ #
89
+ # Note: Currently only for indexing. TODO Redesign and write for both!
90
+ #
91
+ def reject_token_if &condition
92
+ @reject_condition = condition
93
+ end
94
+ def reject tokens
95
+ tokens.reject! &@reject_condition
96
+ end
97
+
98
+
99
+ # Returns a number of tokens, generated from the given text.
100
+ #
101
+ # Note:
102
+ # * preprocess, pretokenize are hooks
103
+ #
104
+ def tokenize text
105
+ text = preprocess text # processing the text
106
+ return empty_tokens if text.blank?
107
+ words = pretokenize text # splitting and preparations for tokenizing
108
+ return empty_tokens if words.empty?
109
+ tokens = tokens_for words # creating tokens / strings
110
+ process tokens # processing tokens / strings
111
+ end
112
+
113
+ attr_reader :substituter
114
+ alias substituter? substituter
115
+
116
+ def initialize options = {}
117
+ removes_characters options[:removes_characters] if options[:removes_characters]
118
+ contracts_expressions *options[:contracts_expressions] if options[:contracts_expressions]
119
+ stopwords options[:stopwords] if options[:stopwords]
120
+ normalizes_words options[:normalizes_words] if options[:normalizes_words]
121
+ removes_characters_after_splitting options[:removes_characters_after_splitting] if options[:removes_characters_after_splitting]
122
+ substitutes_characters_with options[:substitutes_characters_with] if options[:substitutes_characters_with]
123
+
124
+ # Defaults.
125
+ #
126
+ splits_text_on options[:splits_text_on] || /\s/
127
+ reject_token_if &(options[:reject_token_if] || :blank?)
128
+ end
129
+
130
+ # Hooks.
131
+ #
132
+
133
+ # Preprocessing.
134
+ #
135
+ def preprocess text; end
136
+ # Pretokenizing.
137
+ #
138
+ def pretokenize text; end
139
+ # Postprocessing.
140
+ #
141
+ def process tokens
142
+ reject tokens # Reject any tokens that don't meet criteria
143
+ tokens
144
+ end
145
+
146
+ # Converts words into real tokens.
147
+ #
148
+ def tokens_for words
149
+ Internals::Query::Tokens.new words.collect! { |word| token_for word }
150
+ end
151
+ # Turns non-blank text into symbols.
152
+ #
153
+ def symbolize text
154
+ text.blank? ? nil : text.to_sym
155
+ end
156
+ # Returns a tokens object.
157
+ #
158
+ def empty_tokens
159
+ Internals::Query::Tokens.new
160
+ end
161
+
162
+ end
163
+
164
+ end
165
+
166
+ end
@@ -0,0 +1,63 @@
1
+ module Internals
2
+
3
+ module Tokenizers
4
+
5
+ # The base indexing tokenizer.
6
+ #
7
+ # Override in indexing subclasses and define in configuration.
8
+ #
9
+ class Index < Base
10
+
11
+ def self.default= new_default
12
+ @default = new_default
13
+ end
14
+ def self.default
15
+ @default ||= new
16
+ end
17
+
18
+ # Default indexing preprocessing hook.
19
+ #
20
+ # Does:
21
+ # 1. Character substitution.
22
+ # 2. Downcasing.
23
+ # 3. Remove illegal expressions.
24
+ # 4. Remove non-single stopwords. (Stopwords that occur with other words)
25
+ #
26
+ def preprocess text
27
+ text = substitute_characters text
28
+ text.downcase!
29
+ remove_illegals text
30
+ # we do not remove single stopwords for an entirely different
31
+ # reason than in the query tokenizer.
32
+ # An indexed thing with just name "UND" (a possible stopword) should not lose its name.
33
+ #
34
+ remove_non_single_stopwords text
35
+ text
36
+ end
37
+
38
+ # Default indexing pretokenizing hook.
39
+ #
40
+ # Does:
41
+ # 1. Split the text into words.
42
+ # 2. Normalize each word.
43
+ #
44
+ def pretokenize text
45
+ words = split text
46
+ words.collect! do |word|
47
+ normalize_with_patterns word
48
+ word
49
+ end
50
+ end
51
+
52
+ # Does not actually return a token, but a
53
+ # symbol "token".
54
+ #
55
+ def token_for text
56
+ symbolize text
57
+ end
58
+
59
+ end
60
+
61
+ end
62
+
63
+ end