picky 1.4.1 → 1.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (229) hide show
  1. data/lib/picky/{alias_instances.rb → aliases.rb} +1 -3
  2. data/lib/picky/application.rb +18 -19
  3. data/lib/picky/cores.rb +1 -1
  4. data/lib/picky/generators/aliases.rb +3 -0
  5. data/lib/picky/index/base.rb +179 -0
  6. data/lib/picky/index/memory.rb +28 -0
  7. data/lib/picky/index/redis.rb +28 -0
  8. data/lib/picky/{indexes_api.rb → index_bundle.rb} +16 -16
  9. data/lib/picky/indexed/indexes.rb +11 -7
  10. data/lib/picky/indexing/indexes.rb +14 -8
  11. data/lib/picky/internals/adapters/rack/base.rb +27 -0
  12. data/lib/picky/internals/adapters/rack/live_parameters.rb +37 -0
  13. data/lib/picky/internals/adapters/rack/query.rb +63 -0
  14. data/lib/picky/internals/adapters/rack.rb +34 -0
  15. data/lib/picky/{calculations → internals/calculations}/location.rb +0 -0
  16. data/lib/picky/{cli.rb → internals/cli.rb} +0 -0
  17. data/lib/picky/{configuration → internals/configuration}/index.rb +8 -2
  18. data/lib/picky/{ext → internals/ext}/maybe_compile.rb +0 -0
  19. data/lib/picky/{ext → internals/ext}/ruby19/extconf.rb +0 -0
  20. data/lib/picky/{ext → internals/ext}/ruby19/performant.c +0 -0
  21. data/lib/picky/{extensions → internals/extensions}/array.rb +0 -0
  22. data/lib/picky/{extensions → internals/extensions}/hash.rb +0 -0
  23. data/lib/picky/{extensions → internals/extensions}/module.rb +0 -0
  24. data/lib/picky/{extensions → internals/extensions}/object.rb +0 -0
  25. data/lib/picky/{extensions → internals/extensions}/symbol.rb +0 -0
  26. data/lib/picky/internals/frontend_adapters/rack.rb +154 -0
  27. data/lib/picky/internals/generators/base.rb +19 -0
  28. data/lib/picky/internals/generators/partial/default.rb +7 -0
  29. data/lib/picky/internals/generators/partial/none.rb +35 -0
  30. data/lib/picky/internals/generators/partial/strategy.rb +29 -0
  31. data/lib/picky/internals/generators/partial/substring.rb +122 -0
  32. data/lib/picky/internals/generators/partial_generator.rb +19 -0
  33. data/lib/picky/internals/generators/similarity/default.rb +9 -0
  34. data/lib/picky/internals/generators/similarity/double_levenshtone.rb +81 -0
  35. data/lib/picky/internals/generators/similarity/none.rb +35 -0
  36. data/lib/picky/internals/generators/similarity/strategy.rb +11 -0
  37. data/lib/picky/internals/generators/similarity_generator.rb +19 -0
  38. data/lib/picky/internals/generators/strategy.rb +18 -0
  39. data/lib/picky/internals/generators/weights/default.rb +9 -0
  40. data/lib/picky/internals/generators/weights/logarithmic.rb +43 -0
  41. data/lib/picky/internals/generators/weights/strategy.rb +11 -0
  42. data/lib/picky/internals/generators/weights_generator.rb +19 -0
  43. data/lib/picky/{helpers → internals/helpers}/measuring.rb +0 -0
  44. data/lib/picky/internals/index/backend.rb +113 -0
  45. data/lib/picky/internals/index/file/basic.rb +101 -0
  46. data/lib/picky/internals/index/file/json.rb +38 -0
  47. data/lib/picky/internals/index/file/marshal.rb +38 -0
  48. data/lib/picky/internals/index/file/text.rb +60 -0
  49. data/lib/picky/internals/index/files.rb +24 -0
  50. data/lib/picky/internals/index/redis/basic.rb +77 -0
  51. data/lib/picky/internals/index/redis/list_hash.rb +46 -0
  52. data/lib/picky/internals/index/redis/string_hash.rb +35 -0
  53. data/lib/picky/internals/index/redis.rb +44 -0
  54. data/lib/picky/internals/indexed/bundle/base.rb +72 -0
  55. data/lib/picky/internals/indexed/bundle/memory.rb +69 -0
  56. data/lib/picky/internals/indexed/bundle/redis.rb +70 -0
  57. data/lib/picky/internals/indexed/categories.rb +135 -0
  58. data/lib/picky/internals/indexed/category.rb +90 -0
  59. data/lib/picky/internals/indexed/index.rb +57 -0
  60. data/lib/picky/{indexed → internals/indexed}/wrappers/bundle/calculation.rb +0 -0
  61. data/lib/picky/{indexed → internals/indexed}/wrappers/bundle/location.rb +4 -2
  62. data/lib/picky/{indexed → internals/indexed}/wrappers/bundle/wrapper.rb +1 -1
  63. data/lib/picky/internals/indexed/wrappers/exact_first.rb +65 -0
  64. data/lib/picky/{indexers → internals/indexers}/no_source_specified_error.rb +0 -0
  65. data/lib/picky/{indexers → internals/indexers}/serial.rb +2 -2
  66. data/lib/picky/{indexers → internals/indexers}/solr.rb +0 -0
  67. data/lib/picky/internals/indexing/bundle/base.rb +219 -0
  68. data/lib/picky/internals/indexing/bundle/memory.rb +25 -0
  69. data/lib/picky/internals/indexing/bundle/redis.rb +28 -0
  70. data/lib/picky/internals/indexing/bundle/super_base.rb +65 -0
  71. data/lib/picky/internals/indexing/categories.rb +42 -0
  72. data/lib/picky/internals/indexing/category.rb +120 -0
  73. data/lib/picky/internals/indexing/index.rb +67 -0
  74. data/lib/picky/{performant.rb → internals/performant.rb} +0 -0
  75. data/lib/picky/internals/query/allocation.rb +88 -0
  76. data/lib/picky/internals/query/allocations.rb +137 -0
  77. data/lib/picky/internals/query/combination.rb +80 -0
  78. data/lib/picky/internals/query/combinations/base.rb +84 -0
  79. data/lib/picky/internals/query/combinations/memory.rb +58 -0
  80. data/lib/picky/internals/query/combinations/redis.rb +59 -0
  81. data/lib/picky/internals/query/indexes.rb +180 -0
  82. data/lib/picky/internals/query/qualifiers.rb +81 -0
  83. data/lib/picky/internals/query/token.rb +215 -0
  84. data/lib/picky/internals/query/tokens.rb +89 -0
  85. data/lib/picky/{query → internals/query}/weights.rb +0 -0
  86. data/lib/picky/internals/results/base.rb +106 -0
  87. data/lib/picky/internals/results/full.rb +17 -0
  88. data/lib/picky/internals/results/live.rb +17 -0
  89. data/lib/picky/{solr → internals/solr}/schema_generator.rb +0 -0
  90. data/lib/picky/internals/tokenizers/base.rb +166 -0
  91. data/lib/picky/internals/tokenizers/index.rb +63 -0
  92. data/lib/picky/internals/tokenizers/query.rb +79 -0
  93. data/lib/picky/loader.rb +148 -112
  94. data/lib/picky/query/base.rb +57 -26
  95. data/lib/picky/query/full.rb +1 -1
  96. data/lib/picky/query/live.rb +1 -1
  97. data/lib/picky/sources/db.rb +27 -6
  98. data/lib/tasks/index.rake +3 -3
  99. data/lib/tasks/try.rake +2 -2
  100. data/spec/lib/aliases_spec.rb +9 -0
  101. data/spec/lib/application_spec.rb +3 -3
  102. data/spec/lib/generators/aliases_spec.rb +1 -0
  103. data/spec/lib/{index_api_spec.rb → index/base_spec.rb} +7 -7
  104. data/spec/lib/index_bundle_spec.rb +71 -0
  105. data/spec/lib/indexed/indexes_spec.rb +61 -0
  106. data/spec/lib/indexing/indexes_spec.rb +94 -24
  107. data/spec/lib/{adapters → internals/adapters}/rack/base_spec.rb +2 -2
  108. data/spec/lib/{adapters → internals/adapters}/rack/live_parameters_spec.rb +2 -2
  109. data/spec/lib/{adapters → internals/adapters}/rack/query_spec.rb +2 -2
  110. data/spec/lib/{calculations → internals/calculations}/location_spec.rb +0 -0
  111. data/spec/lib/{cli_spec.rb → internals/cli_spec.rb} +4 -1
  112. data/spec/lib/{configuration → internals/configuration}/index_spec.rb +1 -1
  113. data/spec/lib/{cores_spec.rb → internals/cores_spec.rb} +0 -0
  114. data/spec/lib/{extensions → internals/extensions}/array_spec.rb +0 -0
  115. data/spec/lib/{extensions → internals/extensions}/hash_spec.rb +0 -0
  116. data/spec/lib/{extensions → internals/extensions}/module_spec.rb +0 -0
  117. data/spec/lib/{extensions → internals/extensions}/object_spec.rb +0 -0
  118. data/spec/lib/{extensions → internals/extensions}/symbol_spec.rb +0 -0
  119. data/spec/lib/{frontend_adapters → internals/frontend_adapters}/rack_spec.rb +11 -11
  120. data/spec/lib/{cacher → internals/generators}/cacher_strategy_spec.rb +2 -2
  121. data/spec/lib/internals/generators/partial/default_spec.rb +17 -0
  122. data/spec/lib/internals/generators/partial/none_spec.rb +17 -0
  123. data/spec/lib/{cacher → internals/generators}/partial/substring_spec.rb +26 -27
  124. data/spec/lib/{cacher → internals/generators}/partial_generator_spec.rb +5 -5
  125. data/spec/lib/{cacher → internals/generators}/similarity/double_levenshtone_spec.rb +4 -4
  126. data/spec/lib/{cacher → internals/generators}/similarity/none_spec.rb +2 -2
  127. data/spec/lib/{cacher → internals/generators}/similarity_generator_spec.rb +4 -4
  128. data/spec/lib/{cacher → internals/generators}/weights/logarithmic_spec.rb +2 -2
  129. data/spec/lib/internals/generators/weights_generator_spec.rb +21 -0
  130. data/spec/lib/{helpers → internals/helpers}/measuring_spec.rb +0 -0
  131. data/spec/lib/{index → internals/index}/file/basic_spec.rb +2 -2
  132. data/spec/lib/{index → internals/index}/file/json_spec.rb +2 -2
  133. data/spec/lib/{index → internals/index}/file/marshal_spec.rb +2 -2
  134. data/spec/lib/{index → internals/index}/file/text_spec.rb +2 -2
  135. data/spec/lib/{index → internals/index}/files_spec.rb +2 -2
  136. data/spec/lib/{indexed/bundle_spec.rb → internals/indexed/bundle/memory_spec.rb} +4 -5
  137. data/spec/lib/{indexed → internals/indexed}/categories_spec.rb +13 -13
  138. data/spec/lib/{indexed → internals/indexed}/category_spec.rb +59 -32
  139. data/spec/lib/{indexed → internals/indexed}/index_spec.rb +5 -5
  140. data/spec/lib/{indexed → internals/indexed}/wrappers/bundle/calculation_spec.rb +0 -0
  141. data/spec/lib/{indexed → internals/indexed}/wrappers/bundle/wrapper_spec.rb +0 -0
  142. data/spec/lib/{indexed → internals/indexed}/wrappers/exact_first_spec.rb +5 -5
  143. data/spec/lib/{indexers → internals/indexers}/serial_spec.rb +0 -0
  144. data/spec/lib/{indexing/bundle_partial_generation_speed_spec.rb → internals/indexing/bundle/memory_partial_generation_speed_spec.rb} +3 -3
  145. data/spec/lib/{indexing/bundle_spec.rb → internals/indexing/bundle/memory_spec.rb} +3 -3
  146. data/spec/lib/{index/bundle_spec.rb → internals/indexing/bundle/super_base_spec.rb} +9 -3
  147. data/spec/lib/{indexing → internals/indexing}/category_spec.rb +3 -3
  148. data/spec/lib/{indexing → internals/indexing}/index_spec.rb +3 -3
  149. data/spec/lib/internals/indexing/indexes_spec.rb +36 -0
  150. data/spec/lib/{interfaces → internals/interfaces}/live_parameters_spec.rb +0 -0
  151. data/spec/lib/internals/results/base_spec.rb +105 -0
  152. data/spec/lib/internals/results/full_spec.rb +78 -0
  153. data/spec/lib/internals/results/live_spec.rb +88 -0
  154. data/spec/lib/{solr → internals/solr}/schema_generator_spec.rb +0 -0
  155. data/spec/lib/{tokenizers → internals/tokenizers}/base_spec.rb +3 -3
  156. data/spec/lib/{tokenizers → internals/tokenizers}/index_spec.rb +9 -9
  157. data/spec/lib/{tokenizers → internals/tokenizers}/query_spec.rb +11 -11
  158. data/spec/lib/query/allocation_spec.rb +12 -12
  159. data/spec/lib/query/allocations_spec.rb +19 -19
  160. data/spec/lib/query/base_spec.rb +28 -4
  161. data/spec/lib/query/combination_spec.rb +8 -9
  162. data/spec/lib/query/combinations/base_spec.rb +116 -0
  163. data/spec/lib/query/{combinations_spec.rb → combinations/memory_spec.rb} +14 -14
  164. data/spec/lib/query/combinations/redis_spec.rb +132 -0
  165. data/spec/lib/query/full_spec.rb +2 -2
  166. data/spec/lib/query/indexes_spec.rb +81 -0
  167. data/spec/lib/query/live_spec.rb +3 -3
  168. data/spec/lib/query/qualifiers_spec.rb +6 -6
  169. data/spec/lib/query/token_spec.rb +38 -38
  170. data/spec/lib/query/tokens_spec.rb +35 -35
  171. data/spec/lib/sources/db_spec.rb +23 -18
  172. metadata +212 -181
  173. data/lib/picky/adapters/rack/base.rb +0 -23
  174. data/lib/picky/adapters/rack/live_parameters.rb +0 -33
  175. data/lib/picky/adapters/rack/query.rb +0 -59
  176. data/lib/picky/adapters/rack.rb +0 -28
  177. data/lib/picky/cacher/convenience.rb +0 -3
  178. data/lib/picky/cacher/generator.rb +0 -15
  179. data/lib/picky/cacher/partial/default.rb +0 -5
  180. data/lib/picky/cacher/partial/none.rb +0 -31
  181. data/lib/picky/cacher/partial/strategy.rb +0 -21
  182. data/lib/picky/cacher/partial/substring.rb +0 -118
  183. data/lib/picky/cacher/partial_generator.rb +0 -15
  184. data/lib/picky/cacher/similarity/default.rb +0 -7
  185. data/lib/picky/cacher/similarity/double_levenshtone.rb +0 -77
  186. data/lib/picky/cacher/similarity/none.rb +0 -31
  187. data/lib/picky/cacher/similarity/strategy.rb +0 -9
  188. data/lib/picky/cacher/similarity_generator.rb +0 -15
  189. data/lib/picky/cacher/strategy.rb +0 -12
  190. data/lib/picky/cacher/weights/default.rb +0 -7
  191. data/lib/picky/cacher/weights/logarithmic.rb +0 -39
  192. data/lib/picky/cacher/weights/strategy.rb +0 -9
  193. data/lib/picky/cacher/weights_generator.rb +0 -15
  194. data/lib/picky/frontend_adapters/rack.rb +0 -150
  195. data/lib/picky/index/bundle.rb +0 -54
  196. data/lib/picky/index/file/basic.rb +0 -97
  197. data/lib/picky/index/file/json.rb +0 -34
  198. data/lib/picky/index/file/marshal.rb +0 -34
  199. data/lib/picky/index/file/text.rb +0 -56
  200. data/lib/picky/index/files.rb +0 -118
  201. data/lib/picky/index_api.rb +0 -175
  202. data/lib/picky/indexed/bundle.rb +0 -54
  203. data/lib/picky/indexed/categories.rb +0 -131
  204. data/lib/picky/indexed/category.rb +0 -85
  205. data/lib/picky/indexed/index.rb +0 -39
  206. data/lib/picky/indexed/wrappers/exact_first.rb +0 -61
  207. data/lib/picky/indexing/bundle.rb +0 -213
  208. data/lib/picky/indexing/categories.rb +0 -38
  209. data/lib/picky/indexing/category.rb +0 -117
  210. data/lib/picky/indexing/index.rb +0 -55
  211. data/lib/picky/query/allocation.rb +0 -82
  212. data/lib/picky/query/allocations.rb +0 -130
  213. data/lib/picky/query/combination.rb +0 -74
  214. data/lib/picky/query/combinations.rb +0 -105
  215. data/lib/picky/query/qualifiers.rb +0 -77
  216. data/lib/picky/query/token.rb +0 -202
  217. data/lib/picky/query/tokens.rb +0 -86
  218. data/lib/picky/query/weigher.rb +0 -165
  219. data/lib/picky/results/base.rb +0 -102
  220. data/lib/picky/results/full.rb +0 -13
  221. data/lib/picky/results/live.rb +0 -13
  222. data/lib/picky/tokenizers/base.rb +0 -161
  223. data/lib/picky/tokenizers/index.rb +0 -58
  224. data/lib/picky/tokenizers/query.rb +0 -74
  225. data/spec/lib/cacher/partial/default_spec.rb +0 -15
  226. data/spec/lib/cacher/partial/none_spec.rb +0 -17
  227. data/spec/lib/cacher/weights_generator_spec.rb +0 -21
  228. data/spec/lib/results/base_spec.rb +0 -257
  229. data/spec/lib/results/live_spec.rb +0 -15
@@ -0,0 +1,215 @@
1
+ module Internals
2
+
3
+ module Query
4
+
5
+ # This is a query token. Together with other tokens it makes up a query.
6
+ #
7
+ # It remembers the original form, and a normalized form.
8
+ #
9
+ # It also knows whether it needs to look for similarity (bla~), or whether it is a partial (bla*).
10
+ #
11
+ # TODO Make partial / similarity char configurable.
12
+ #
13
+ class Token # :nodoc:all
14
+
15
+ attr_reader :text, :original
16
+ attr_writer :similar
17
+
18
+ delegate :blank?, :to => :text
19
+
20
+ # Normal initializer.
21
+ #
22
+ # Note: Use this if you do not want a qualified and normalized token.
23
+ #
24
+ def initialize text
25
+ @text = text
26
+ end
27
+
28
+ # Returns a qualified and normalized token.
29
+ #
30
+ # Note: Use this in the search engine if you need a qualified
31
+ # and normalized token. I.e. one prepared for a search.
32
+ #
33
+ def self.processed text
34
+ token = new text
35
+ token.qualify
36
+ token.extract_original
37
+ token.partialize
38
+ token.similarize
39
+ token.remove_illegals
40
+ token
41
+ end
42
+
43
+ # This returns a predefined category name if the user has given one.
44
+ #
45
+ def user_defined_category_name
46
+ @qualifier
47
+ end
48
+
49
+ # Extracts a qualifier for this token and pre-assigns an allocation.
50
+ #
51
+ # Note: Removes the qualifier if it is not allowed.
52
+ #
53
+ def qualify
54
+ @qualifier, @text = split @text
55
+ @qualifier = Query::Qualifiers.instance.normalize @qualifier
56
+ end
57
+ def extract_original
58
+ @original = @text.dup
59
+ end
60
+
61
+ # Partial is a conditional setter.
62
+ #
63
+ # It is only settable if it hasn't been set yet.
64
+ #
65
+ def partial= partial
66
+ @partial = partial if @partial.nil?
67
+ end
68
+ def partial?
69
+ !@similar && @partial
70
+ end
71
+
72
+ # If the text ends with *, partialize it. If with ", don't.
73
+ #
74
+ @@no_partial = /\"\Z/
75
+ @@partial = /\*\Z/
76
+ def partialize
77
+ self.partial = false and return if @text =~ @@no_partial
78
+ self.partial = true if @text =~ @@partial
79
+ end
80
+
81
+ # If the text ends with ~ similarize it. If with ", don't.
82
+ #
83
+ @@no_similar = /\"\Z/
84
+ @@similar = /\~\Z/
85
+ def similarize
86
+ self.similar = false and return if @text =~ @@no_similar
87
+ self.similar = true if @text =~ @@similar
88
+ end
89
+
90
+ def similar?
91
+ @similar
92
+ end
93
+
94
+ # Normalizes this token's text.
95
+ #
96
+ @@illegals = /["*~]/
97
+ def remove_illegals
98
+ @text.gsub! @@illegals, '' unless @text.blank?
99
+ end
100
+
101
+ # Visitor for tokenizer.
102
+ #
103
+ # TODO Rewrite!!!
104
+ #
105
+ def tokenize_with tokenizer
106
+ @text = tokenizer.normalize @text
107
+ end
108
+ # TODO spec!
109
+ #
110
+ # TODO Rewrite!!
111
+ #
112
+ def tokenized tokenizer
113
+ tokenizer.tokenize(@text.to_s).each do |text|
114
+ yield text
115
+ end
116
+ end
117
+
118
+ # Returns an array of possible combinations.
119
+ #
120
+ def possible_combinations_in type
121
+ type.possible_combinations self
122
+ end
123
+
124
+ # Returns a token with the next similar text.
125
+ #
126
+ # TODO Rewrite this. It is hard to understand. Also spec performance.
127
+ #
128
+ def next_similar_token category
129
+ token = self.dup
130
+ token if token.next_similar category.bundle_for(token)
131
+ end
132
+ # Sets and returns the next similar word.
133
+ #
134
+ # Note: Also overrides the original.
135
+ #
136
+ def next_similar bundle
137
+ @text = @original = (similarity(bundle).shift || return) if similar?
138
+ end
139
+ # Lazy similar reader.
140
+ #
141
+ def similarity bundle = nil
142
+ @similarity || @similarity = generate_similarity_for(bundle)
143
+ end
144
+ # Returns an enumerator that traverses over the similar.
145
+ #
146
+ # Note: The dup isn't too nice – but it is needed on account of the shift, above.
147
+ # (We avoid a StopIteration exception. Which of both is less evil?)
148
+ #
149
+ def generate_similarity_for bundle
150
+ bundle.similar(@text).dup || []
151
+ end
152
+
153
+ # Generates a solr term from this token.
154
+ #
155
+ # E.g. "name:heroes~0.75"
156
+ #
157
+ @@solr_fuzzy_mapping = {
158
+ 1 => :'',
159
+ 2 => :'',
160
+ 3 => :'',
161
+ 4 => :'~0.74',
162
+ 5 => :'~0.78',
163
+ 6 => :'~0.81',
164
+ 7 => :'~0.83',
165
+ 8 => :'~0.85',
166
+ 9 => :'~0.87',
167
+ 10 => :'~0.89'
168
+ }
169
+ @@solr_fuzzy_mapping.default = :'~0.9'
170
+ def to_solr
171
+ blank? ? '' : (to_s + @@solr_fuzzy_mapping[@text.size].to_s)
172
+ end
173
+
174
+ #
175
+ #
176
+ def to_result
177
+ [@original, @text]
178
+ end
179
+
180
+ # Internal identifier.
181
+ #
182
+ # TODO Uh.
183
+ #
184
+ def identifier
185
+ "#{similar?? :similarity : :index}:#{@text}"
186
+ end
187
+
188
+ # Displays the qualifier text and the text, joined.
189
+ #
190
+ # e.g. name:meier
191
+ #
192
+ def to_s
193
+ [@qualifier, @text].compact.join ':'
194
+ end
195
+
196
+ private
197
+
198
+ # Splits text into a qualifier and text.
199
+ #
200
+ # Returns [qualifier, text].
201
+ #
202
+ def split unqualified_text
203
+ qualifier, text = (unqualified_text || '').split(':', 2)
204
+ if text.blank?
205
+ [nil, (qualifier || '')]
206
+ else
207
+ [qualifier, text]
208
+ end
209
+ end
210
+
211
+ end
212
+
213
+ end
214
+
215
+ end
@@ -0,0 +1,89 @@
1
+ # encoding: utf-8
2
+ #
3
+ module Internals
4
+
5
+ #
6
+ #
7
+ module Query
8
+
9
+ # This class primarily handles switching through similar token constellations.
10
+ #
11
+ class Tokens # :nodoc:all
12
+
13
+ # Basically delegates to its internal tokens array.
14
+ #
15
+ self.delegate *[Enumerable.instance_methods, :slice!, :[], :uniq!, :last, :reject!, :length, :size, :empty?, :each, :exit, { :to => :@tokens }].flatten
16
+
17
+ #
18
+ #
19
+ def initialize tokens = []
20
+ @tokens = tokens
21
+ end
22
+
23
+ #
24
+ #
25
+ def tokenize_with tokenizer
26
+ @tokens.each { |token| token.tokenize_with(tokenizer) }
27
+ end
28
+
29
+ # Generates an array in the form of
30
+ # [
31
+ # [combination], # of token 1
32
+ # [combination, combination, combination], # of token 2
33
+ # [combination, combination] # of token 3
34
+ # ]
35
+ #
36
+ # TODO If we want token behaviour defined per Query, we can
37
+ # compact! here
38
+ #
39
+ def possible_combinations_in type
40
+ @tokens.inject([]) do |combinations, token|
41
+ combinations << token.possible_combinations_in(type)
42
+ end
43
+ # TODO compact! if ignore_unassigned_tokens
44
+ end
45
+
46
+ # Makes the last of the tokens partial.
47
+ #
48
+ def partialize_last
49
+ @tokens.last.partial = true unless empty?
50
+ end
51
+
52
+ # Caps the tokens to the maximum.
53
+ #
54
+ def cap maximum
55
+ @tokens.slice!(maximum..-1) if cap?(maximum)
56
+ end
57
+ def cap? maximum
58
+ @tokens.size > maximum
59
+ end
60
+
61
+ # Rejects blank tokens.
62
+ #
63
+ def reject
64
+ @tokens.reject! &:blank?
65
+ end
66
+
67
+ # Returns a solr query.
68
+ #
69
+ def to_solr_query
70
+ @tokens.map(&:to_solr).join ' '
71
+ end
72
+
73
+ #
74
+ #
75
+ def originals
76
+ @tokens.map(&:original)
77
+ end
78
+
79
+ # Just join the token original texts.
80
+ #
81
+ def to_s
82
+ originals.join ' '
83
+ end
84
+
85
+ end
86
+
87
+ end
88
+
89
+ end
File without changes
@@ -0,0 +1,106 @@
1
+ module Internals
2
+
3
+ module Results # :nodoc:all
4
+
5
+ # This is the internal results object. Usually, to_marshal, or to_json
6
+ # is called on it to get a string for the answer.
7
+ #
8
+ class Base
9
+
10
+ # Duration is set externally by the query.
11
+ #
12
+ attr_writer :duration
13
+ attr_reader :allocations, :offset
14
+
15
+ # Takes instances of Query::Allocations as param.
16
+ #
17
+ def initialize offset = 0, allocations = Query::Allocations.new
18
+ @offset = offset
19
+ @allocations = allocations # || Query::Allocations.new
20
+ end
21
+ # Create new results and calculate the ids.
22
+ #
23
+ def self.from offset, allocations
24
+ results = new offset, allocations
25
+ results.prepare!
26
+ results
27
+ end
28
+
29
+ #
30
+ #
31
+ def serialize
32
+ { allocations: allocations.to_result,
33
+ offset: offset,
34
+ duration: duration,
35
+ total: total }
36
+ end
37
+ # The default format is json.
38
+ #
39
+ def to_response options = {}
40
+ to_json options
41
+ end
42
+ # Convert to json format.
43
+ #
44
+ def to_json options = {}
45
+ serialize.to_json options
46
+ end
47
+
48
+ # This starts the actual processing.
49
+ #
50
+ # Without this, the allocations are not processed,
51
+ # and no ids are calculated.
52
+ #
53
+ def prepare!
54
+ allocations.process! self.max_results, self.offset
55
+ end
56
+
57
+ # Duration default is 0.
58
+ #
59
+ def duration
60
+ @duration || 0
61
+ end
62
+ # The total results. Delegates to the allocations.
63
+ #
64
+ # Caches.
65
+ #
66
+ def total
67
+ @total || @total = allocations.total || 0
68
+ end
69
+
70
+ # How many results are returned.
71
+ #
72
+ # Set in config using
73
+ # Results::Full.max_results = 20
74
+ #
75
+ class_inheritable_accessor :max_results
76
+ def max_results
77
+ self.class.max_results
78
+ end
79
+
80
+ # Convenience methods.
81
+ #
82
+
83
+ # Delegates to allocations.
84
+ #
85
+ def ids amount = 20
86
+ allocations.ids amount
87
+ end
88
+ # Gets an amout of random ids from the allocations.
89
+ #
90
+ # Note: Basically delegates to the allocations.
91
+ #
92
+ def random_ids amount = 1
93
+ allocations.random_ids amount
94
+ end
95
+
96
+ # Human readable log.
97
+ #
98
+ def to_log query
99
+ "|#{Time.now.to_s(:db)}|#{'%8f' % duration}|#{'%-50s' % query}|#{'%8d' % total}|#{'%4d' % offset}|#{'%2d' % allocations.size}|"
100
+ end
101
+
102
+ end
103
+
104
+ end
105
+
106
+ end
@@ -0,0 +1,17 @@
1
+ module Internals
2
+
3
+ module Results
4
+ # Full results are limited to a maximum of 20 results (by default).
5
+ #
6
+ class Full < Base
7
+
8
+ self.max_results = 20
9
+
10
+ def to_log *args
11
+ ?> + super
12
+ end
13
+
14
+ end
15
+ end
16
+
17
+ end
@@ -0,0 +1,17 @@
1
+ module Internals
2
+
3
+ module Results
4
+ # Live results are not returning any results.
5
+ #
6
+ class Live < Base
7
+
8
+ self.max_results = 0
9
+
10
+ def to_log *args
11
+ ?. + super
12
+ end
13
+
14
+ end
15
+ end
16
+
17
+ end
@@ -0,0 +1,166 @@
1
+ module Internals
2
+
3
+ module Tokenizers # :nodoc:all
4
+
5
+ # Defines tokenizing processes used both in indexing and querying.
6
+ #
7
+ class Base
8
+
9
+ # TODO Move EMPTY_STRING top level.
10
+ #
11
+ EMPTY_STRING = ''.freeze
12
+
13
+ # Stopwords.
14
+ #
15
+ def stopwords regexp
16
+ @remove_stopwords_regexp = regexp
17
+ end
18
+ def remove_stopwords text
19
+ text.gsub! @remove_stopwords_regexp, EMPTY_STRING if @remove_stopwords_regexp
20
+ text
21
+ end
22
+ @@non_single_stopword_regexp = /^\b[\w:]+?\b[\.\*\~]?\s?$/
23
+ def remove_non_single_stopwords text
24
+ return text if text.match @@non_single_stopword_regexp
25
+ remove_stopwords text
26
+ end
27
+
28
+ # Illegals.
29
+ #
30
+ # TODO Should there be a legal?
31
+ #
32
+ def removes_characters regexp
33
+ @removes_characters_regexp = regexp
34
+ end
35
+ def remove_illegals text
36
+ text.gsub! @removes_characters_regexp, EMPTY_STRING if @removes_characters_regexp
37
+ text
38
+ end
39
+
40
+ # Splitting.
41
+ #
42
+ def splits_text_on regexp
43
+ @splits_text_on_regexp = regexp
44
+ end
45
+ def split text
46
+ text.split @splits_text_on_regexp
47
+ end
48
+
49
+ # Normalizing.
50
+ #
51
+ def normalizes_words regexp_replaces
52
+ @normalizes_words_regexp_replaces = regexp_replaces
53
+ end
54
+ def normalize_with_patterns text
55
+ return text unless @normalizes_words_regexp_replaces
56
+
57
+ @normalizes_words_regexp_replaces.each do |regex, replace|
58
+ # One successful replacement should be sufficient, so stop at the first pattern that changes the text.
59
+ #
60
+ text.gsub!(regex, replace) and break
61
+ end
62
+ remove_after_normalizing_illegals text
63
+ text
64
+ end
65
+
66
+ # Illegal after normalizing.
67
+ #
68
+ def removes_characters_after_splitting regexp
69
+ @removes_characters_after_splitting_regexp = regexp
70
+ end
71
+ def remove_after_normalizing_illegals text
72
+ text.gsub! @removes_characters_after_splitting_regexp, EMPTY_STRING if @removes_characters_after_splitting_regexp
73
+ end
74
+
75
+ # Substitute Characters with this substituter.
76
+ #
77
+ # Default is European Character substitution.
78
+ #
79
+ def substitutes_characters_with substituter = CharacterSubstituters::WestEuropean.new
80
+ # TODO Raise if it doesn't quack substitute?
81
+ @substituter = substituter
82
+ end
83
+ def substitute_characters text
84
+ substituter?? substituter.substitute(text) : text
85
+ end
86
+
87
+ # Reject tokens after tokenizing based on the given criteria.
88
+ #
89
+ # Note: Currently only for indexing. TODO Redesign and write for both!
90
+ #
91
+ def reject_token_if &condition
92
+ @reject_condition = condition
93
+ end
94
+ def reject tokens
95
+ tokens.reject! &@reject_condition
96
+ end
97
+
98
+
99
+ # Returns a number of tokens, generated from the given text.
100
+ #
101
+ # Note:
102
+ # * preprocess, pretokenize are hooks
103
+ #
104
+ def tokenize text
105
+ text = preprocess text # processing the text
106
+ return empty_tokens if text.blank?
107
+ words = pretokenize text # splitting and preparations for tokenizing
108
+ return empty_tokens if words.empty?
109
+ tokens = tokens_for words # creating tokens / strings
110
+ process tokens # processing tokens / strings
111
+ end
112
+
113
+ attr_reader :substituter
114
+ alias substituter? substituter
115
+
116
+ def initialize options = {}
117
+ removes_characters options[:removes_characters] if options[:removes_characters]
118
+ contracts_expressions *options[:contracts_expressions] if options[:contracts_expressions]
119
+ stopwords options[:stopwords] if options[:stopwords]
120
+ normalizes_words options[:normalizes_words] if options[:normalizes_words]
121
+ removes_characters_after_splitting options[:removes_characters_after_splitting] if options[:removes_characters_after_splitting]
122
+ substitutes_characters_with options[:substitutes_characters_with] if options[:substitutes_characters_with]
123
+
124
+ # Defaults.
125
+ #
126
+ splits_text_on options[:splits_text_on] || /\s/
127
+ reject_token_if &(options[:reject_token_if] || :blank?)
128
+ end
129
+
130
+ # Hooks.
131
+ #
132
+
133
+ # Preprocessing.
134
+ #
135
+ def preprocess text; end
136
+ # Pretokenizing.
137
+ #
138
+ def pretokenize text; end
139
+ # Postprocessing.
140
+ #
141
+ def process tokens
142
+ reject tokens # Reject any tokens that don't meet criteria
143
+ tokens
144
+ end
145
+
146
+ # Converts words into real tokens.
147
+ #
148
+ def tokens_for words
149
+ Internals::Query::Tokens.new words.collect! { |word| token_for word }
150
+ end
151
+ # Turns non-blank text into symbols.
152
+ #
153
+ def symbolize text
154
+ text.blank? ? nil : text.to_sym
155
+ end
156
+ # Returns a tokens object.
157
+ #
158
+ def empty_tokens
159
+ Internals::Query::Tokens.new
160
+ end
161
+
162
+ end
163
+
164
+ end
165
+
166
+ end
@@ -0,0 +1,63 @@
1
+ module Internals
2
+
3
+ module Tokenizers
4
+
5
+ # The base indexing tokenizer.
6
+ #
7
+ # Override in indexing subclasses and define in configuration.
8
+ #
9
+ class Index < Base
10
+
11
+ def self.default= new_default
12
+ @default = new_default
13
+ end
14
+ def self.default
15
+ @default ||= new
16
+ end
17
+
18
+ # Default indexing preprocessing hook.
19
+ #
20
+ # Does:
21
+ # 1. Character substitution.
22
+ # 2. Downcasing.
23
+ # 3. Remove illegal expressions.
24
+ # 4. Remove non-single stopwords. (Stopwords that occur with other words)
25
+ #
26
+ def preprocess text
27
+ text = substitute_characters text
28
+ text.downcase!
29
+ remove_illegals text
30
+ # we do not remove single stopwords for an entirely different
31
+ # reason than in the query tokenizer.
32
+ # An indexed thing with just name "UND" (a possible stopword) should not lose its name.
33
+ #
34
+ remove_non_single_stopwords text
35
+ text
36
+ end
37
+
38
+ # Default indexing pretokenizing hook.
39
+ #
40
+ # Does:
41
+ # 1. Split the text into words.
42
+ # 2. Normalize each word.
43
+ #
44
+ def pretokenize text
45
+ words = split text
46
+ words.collect! do |word|
47
+ normalize_with_patterns word
48
+ word
49
+ end
50
+ end
51
+
52
+ # Does not actually return a token, but a
53
+ # symbol "token".
54
+ #
55
+ def token_for text
56
+ symbolize text
57
+ end
58
+
59
+ end
60
+
61
+ end
62
+
63
+ end