picky 1.5.2 → 1.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. data/lib/picky/analyzer.rb +154 -0
  2. data/lib/picky/application.rb +53 -33
  3. data/lib/picky/character_substituters/west_european.rb +10 -6
  4. data/lib/picky/cli.rb +18 -18
  5. data/lib/picky/index/base.rb +44 -13
  6. data/lib/picky/index_bundle.rb +13 -4
  7. data/lib/picky/indexed/indexes.rb +26 -10
  8. data/lib/picky/indexing/indexes.rb +26 -24
  9. data/lib/picky/interfaces/live_parameters.rb +23 -16
  10. data/lib/picky/internals/extensions/object.rb +13 -6
  11. data/lib/picky/internals/frontend_adapters/rack.rb +30 -34
  12. data/lib/picky/internals/index/backend.rb +1 -2
  13. data/lib/picky/internals/index/file/basic.rb +18 -14
  14. data/lib/picky/internals/index/files.rb +16 -6
  15. data/lib/picky/internals/index/redis/basic.rb +12 -5
  16. data/lib/picky/internals/index/redis.rb +2 -2
  17. data/lib/picky/internals/indexed/bundle/base.rb +58 -14
  18. data/lib/picky/internals/indexed/bundle/memory.rb +40 -14
  19. data/lib/picky/internals/indexed/bundle/redis.rb +9 -30
  20. data/lib/picky/internals/indexed/categories.rb +19 -14
  21. data/lib/picky/internals/indexed/category.rb +44 -20
  22. data/lib/picky/internals/indexed/index.rb +23 -13
  23. data/lib/picky/internals/indexed/wrappers/bundle/wrapper.rb +27 -9
  24. data/lib/picky/internals/indexers/serial.rb +1 -1
  25. data/lib/picky/internals/indexing/bundle/base.rb +28 -28
  26. data/lib/picky/internals/indexing/bundle/memory.rb +14 -7
  27. data/lib/picky/internals/indexing/categories.rb +15 -11
  28. data/lib/picky/internals/indexing/category.rb +30 -20
  29. data/lib/picky/internals/indexing/index.rb +22 -14
  30. data/lib/picky/internals/query/allocations.rb +0 -15
  31. data/lib/picky/internals/query/combinations/base.rb +0 -4
  32. data/lib/picky/internals/query/combinations/redis.rb +19 -8
  33. data/lib/picky/internals/query/indexes.rb +3 -6
  34. data/lib/picky/internals/query/token.rb +0 -4
  35. data/lib/picky/internals/query/weights.rb +2 -11
  36. data/lib/picky/internals/results/base.rb +3 -10
  37. data/lib/picky/internals/tokenizers/base.rb +64 -28
  38. data/lib/picky/internals/tokenizers/index.rb +8 -8
  39. data/lib/picky/loader.rb +59 -53
  40. data/lib/picky/query/base.rb +23 -29
  41. data/lib/picky/sources/base.rb +10 -10
  42. data/lib/picky/sources/couch.rb +14 -10
  43. data/lib/picky/sources/csv.rb +21 -14
  44. data/lib/picky/sources/db.rb +37 -31
  45. data/lib/picky/sources/delicious.rb +11 -8
  46. data/lib/picky/sources/wrappers/base.rb +3 -1
  47. data/lib/picky/statistics.rb +66 -0
  48. data/lib/tasks/application.rake +3 -0
  49. data/lib/tasks/checks.rake +11 -0
  50. data/lib/tasks/framework.rake +3 -0
  51. data/lib/tasks/index.rake +9 -11
  52. data/lib/tasks/routes.rake +3 -2
  53. data/lib/tasks/shortcuts.rake +17 -5
  54. data/lib/tasks/statistics.rake +20 -12
  55. data/lib/tasks/try.rake +14 -14
  56. data/spec/lib/application_spec.rb +3 -3
  57. data/spec/lib/index/base_spec.rb +25 -3
  58. data/spec/lib/internals/extensions/object_spec.rb +46 -20
  59. data/spec/lib/internals/frontend_adapters/rack_spec.rb +3 -3
  60. data/spec/lib/internals/index/redis/basic_spec.rb +67 -0
  61. data/spec/lib/internals/indexers/serial_spec.rb +1 -1
  62. data/spec/lib/internals/results/base_spec.rb +0 -12
  63. data/spec/lib/internals/tokenizers/base_spec.rb +49 -1
  64. data/spec/lib/query/allocations_spec.rb +0 -56
  65. data/spec/lib/query/base_spec.rb +25 -21
  66. data/spec/lib/query/combinations/redis_spec.rb +6 -1
  67. data/spec/lib/sources/delicious_spec.rb +2 -2
  68. data/spec/lib/statistics_spec.rb +31 -0
  69. metadata +9 -2
@@ -1,45 +1,61 @@
1
1
  module Indexed
2
-
2
+
3
3
  # Registers the indexes held at runtime, for queries.
4
4
  #
5
5
  class Indexes
6
-
6
+
7
7
  attr_reader :indexes, :index_mapping
8
-
8
+
9
9
  each_delegate :load_from_cache,
10
10
  :to => :indexes
11
-
11
+
12
12
  def initialize
13
13
  clear
14
14
  end
15
-
15
+
16
+ def to_s
17
+ indexes.indented_to_s
18
+ end
19
+
16
20
  # Clears the indexes and the mapping.
17
21
  #
18
22
  def clear
19
23
  @indexes = []
20
24
  @index_mapping = {}
21
25
  end
22
-
26
+
23
27
  # Reloads all indexes, one after another,
24
28
  # in the order they were added.
25
29
  #
26
30
  def reload
27
31
  load_from_cache
28
32
  end
29
-
33
+
30
34
  # Registers an index with the indexes.
31
35
  #
32
36
  def register index
33
37
  self.indexes << index
34
38
  self.index_mapping[index.name] = index
35
39
  end
36
-
40
+
41
+ # Load each index, and analyze it.
42
+ #
43
+ # Returns a hash with the findings.
44
+ #
45
+ def analyze
46
+ result = {}
47
+ self.indexes.each do |index|
48
+ index.analyze result
49
+ end
50
+ result
51
+ end
52
+
37
53
  # Extracts an index, given its identifier.
38
54
  #
39
55
  def [] identifier
40
56
  index_mapping[identifier.to_sym]
41
57
  end
42
-
58
+
43
59
  end
44
-
60
+
45
61
  end
@@ -1,11 +1,11 @@
1
1
  module Indexing
2
-
2
+
3
3
  # Registers the indexes held at index time, for indexing.
4
4
  #
5
5
  class Indexes
6
-
6
+
7
7
  attr_reader :indexes
8
-
8
+
9
9
  each_delegate :take_snapshot,
10
10
  :generate_caches,
11
11
  :backup_caches,
@@ -14,57 +14,59 @@ module Indexing
14
14
  :clear_caches,
15
15
  :create_directory_structure,
16
16
  :to => :indexes
17
-
17
+
18
18
  def initialize
19
19
  clear
20
20
  end
21
-
21
+
22
+ def to_s
23
+ indexes.indented_to_s
24
+ end
25
+
22
26
  # Clears the array of indexes.
23
27
  #
24
28
  def clear
25
29
  @indexes = []
26
30
  end
27
-
31
+
28
32
  # Registers an index with the indexes.
29
33
  #
30
34
  def register index
31
35
  self.indexes << index
32
36
  end
33
-
37
+
34
38
  # Runs the indexers in parallel (index + cache).
35
39
  #
36
40
  # TODO Spec.
37
41
  #
38
42
  def index randomly = true
39
43
  take_snapshot
40
-
44
+
41
45
  # Run in parallel.
42
46
  #
43
- timed_exclaim "INDEXING USING #{Cores.max_processors} PROCESSORS, IN #{randomly ? 'RANDOM' : 'GIVEN'} ORDER."
44
-
45
- # TODO Think about having serial work units.
47
+ timed_exclaim "Indexing using #{Cores.max_processors} processors, in #{randomly ? 'random' : 'given'} order."
48
+
49
+ # Run indexing/caching forked.
46
50
  #
47
51
  Cores.forked self.indexes, { randomly: randomly } do |an_index|
48
52
  an_index.index
49
- # TODO
50
- # end
51
- # Cores.forked self.indexes, { randomly: randomly } do |an_index|
52
53
  an_index.cache
53
54
  end
54
- timed_exclaim "INDEXING FINISHED."
55
+
56
+ timed_exclaim "Indexing finished."
55
57
  end
56
-
58
+
57
59
  # For integration testing – indexes for the tests without forking and shouting ;)
58
60
  #
59
61
  def index_for_tests
60
62
  take_snapshot
61
-
63
+
62
64
  self.indexes.each do |an_index|
63
65
  an_index.index
64
66
  an_index.cache
65
67
  end
66
68
  end
67
-
69
+
68
70
  # Generate only the index for the given index:category pair.
69
71
  #
70
72
  def generate_index_only index_name, category_name = nil
@@ -77,23 +79,23 @@ module Indexing
77
79
  found = find index_name, category_name
78
80
  found.generate_caches if found
79
81
  end
80
-
82
+
81
83
  # Find a given index:category pair.
82
84
  #
83
85
  def find index_name, category_name
84
86
  index_name = index_name.to_sym
85
-
87
+
86
88
  indexes.each do |index|
87
89
  next unless index.name == index_name
88
-
90
+
89
91
  return index unless category_name
90
-
92
+
91
93
  found = index.categories.find category_name
92
94
  return found if found
93
95
  end
94
-
96
+
95
97
  raise %Q{Index "#{index_name}" not found. Possible indexes: "#{indexes.map(&:name).join('", "')}".}
96
98
  end
97
-
99
+
98
100
  end
99
101
  end
@@ -9,12 +9,12 @@ module Interfaces
9
9
  # Important Note: This will only work in Master/Child configurations.
10
10
  #
11
11
  class LiveParameters
12
-
12
+
13
13
  def initialize
14
14
  @child, @parent = IO.pipe
15
15
  start_master_process_thread
16
16
  end
17
-
17
+
18
18
  # This runs a thread that listens to child processes.
19
19
  #
20
20
  def start_master_process_thread
@@ -30,18 +30,19 @@ module Interfaces
30
30
  exclaim "Trying to update MASTER configuration."
31
31
  try_updating_configuration_with configuration_hash
32
32
  kill_each_worker_except pid
33
- # TODO rescue on error.
34
-
33
+
34
+ # Fails hard on an error.
35
+ #
35
36
  end
36
37
  end
37
38
  end
38
-
39
+
39
40
  # TODO This needs to be webserver agnostic.
40
41
  #
41
42
  def worker_pids
42
43
  Unicorn::HttpServer::WORKERS.keys
43
44
  end
44
-
45
+
45
46
  # Taken from Unicorn.
46
47
  #
47
48
  def kill_each_worker_except pid
@@ -61,14 +62,14 @@ module Interfaces
61
62
  def remove_worker wpid
62
63
  worker = Unicorn::HttpServer::WORKERS.delete(wpid) and worker.tmp.close rescue nil
63
64
  end
64
-
65
+
65
66
  # Updates any parameters with the ones given and
66
67
  # returns the updated params.
67
68
  #
68
69
  # The params are a strictly defined hash of:
69
70
  # * querying_removes_characters: Regexp
70
71
  # * querying_stopwords: Regexp
71
- # TODO etc.
72
+ # * querying_splits_text_on: Regexp
72
73
  #
73
74
  # This first tries to update in the child process,
74
75
  # and if successful, in the parent process
@@ -103,7 +104,7 @@ module Interfaces
103
104
  def close_child
104
105
  @child.close unless @child.closed?
105
106
  end
106
-
107
+
107
108
  class CouldNotUpdateConfigurationError < StandardError
108
109
  attr_reader :config_key
109
110
  def initialize config_key, message
@@ -111,9 +112,9 @@ module Interfaces
111
112
  @config_key = config_key
112
113
  end
113
114
  end
114
-
115
+
115
116
  # Tries updating the configuration in the child process or parent process.
116
- #
117
+ #
117
118
  def try_updating_configuration_with configuration_hash
118
119
  current_key = nil
119
120
  begin
@@ -128,7 +129,7 @@ module Interfaces
128
129
  raise CouldNotUpdateConfigurationError.new current_key, e.message
129
130
  end
130
131
  end
131
-
132
+
132
133
  def extract_configuration
133
134
  {
134
135
  querying_removes_characters: querying_removes_characters,
@@ -136,7 +137,7 @@ module Interfaces
136
137
  querying_splits_text_on: querying_splits_text_on
137
138
  }
138
139
  end
139
-
140
+
140
141
  # TODO Move to Interface object.
141
142
  #
142
143
  def querying_removes_characters
@@ -157,11 +158,17 @@ module Interfaces
157
158
  def querying_splits_text_on= new_value
158
159
  Tokenizers::Query.default.instance_variable_set(:@splits_text_on_regexp, %r{#{new_value}})
159
160
  end
160
-
161
+
162
+ #
163
+ #
164
+ def to_s
165
+ "Suckerfish Live Interface (Use the picky-live gem to introspect)"
166
+ end
167
+
161
168
  end
162
-
169
+
163
170
  # Aka.
164
171
  #
165
172
  ::LiveParameters = LiveParameters
166
-
173
+
167
174
  end
@@ -1,22 +1,29 @@
1
1
  class Object # :nodoc:all
2
-
2
+
3
3
  # Puts a text in the form:
4
4
  # 12:34:56: text here
5
5
  #
6
6
  def timed_exclaim text
7
7
  exclaim "#{Time.now.strftime("%H:%M:%S")}: #{text}"
8
8
  end
9
-
9
+
10
10
  # Just puts the given text.
11
11
  #
12
12
  def exclaim text
13
13
  puts text
14
14
  end
15
-
15
+
16
16
  # Puts a text that informs the user of a missing gem.
17
17
  #
18
- def puts_gem_missing gem_name, message
19
- puts "#{gem_name} gem missing!\nTo use #{message}, you need to:\n 1. Add the following line to Gemfile:\n gem '#{gem_name}'\n 2. Then, run:\n bundle update\n"
18
+ def warn_gem_missing gem_name, message
19
+ warn "#{gem_name} gem missing!\nTo use #{message}, you need to:\n 1. Add the following line to Gemfile:\n gem '#{gem_name}'\n 2. Then, run:\n bundle update\n"
20
20
  end
21
-
21
+
22
+ # Indents each line by <tt>amount=2</tt> spaces.
23
+ #
24
+ def indented_to_s amount = 2
25
+ ary = self.respond_to?(:join) ? self : self.to_s.split("\n")
26
+ ary.map { |s| "#{" "*amount}#{s}"}.join("\n")
27
+ end
28
+
22
29
  end
@@ -7,18 +7,8 @@ module Internals
7
7
  # TODO Rename to Routing again. Push everything back into appropriate Adapters.
8
8
  #
9
9
  class Rack # :nodoc:all
10
-
11
- @@defaults = {
12
- query_key: 'query'.freeze,
13
- offset_key: 'offset'.freeze,
14
- content_type: 'application/octet-stream'.freeze # TODO Wrong.
15
- }
16
-
17
- def initialize
18
- @defaults = @@defaults.dup
19
- end
20
-
21
- #
10
+
11
+ #
22
12
  #
23
13
  def reset_routes
24
14
  @routes = ::Rack::Mount::RouteSet.new
@@ -29,13 +19,13 @@ module Internals
29
19
  def finalize
30
20
  routes.freeze
31
21
  end
32
-
22
+
33
23
  # Routing simply delegates to the route set to handle a request.
34
24
  #
35
25
  def call env
36
26
  routes.call env
37
27
  end
38
-
28
+
39
29
  # API method.
40
30
  #
41
31
  def route options = {}
@@ -61,7 +51,7 @@ module Internals
61
51
  end
62
52
  def route_one url, query, route_options = {}
63
53
  raise RouteTargetNilError.new(url) unless query
64
- routes.add_route Internals::Adapters::Rack.app_for(query, route_options), default_options(url, route_options)
54
+ routes.add_route Internals::Adapters::Rack.app_for(query, route_options), default_options(url, route_options), {}, query.to_s
65
55
  end
66
56
  class RouteTargetNilError < StandardError
67
57
  def initialize url
@@ -81,9 +71,9 @@ module Internals
81
71
  def default status
82
72
  answer nil, STATUSES[status]
83
73
  end
84
-
85
-
86
-
74
+
75
+
76
+
87
77
  # TODO Can Rack handle this for me?
88
78
  #
89
79
  # Note: Rack-mount already handles the 404.
@@ -92,21 +82,21 @@ module Internals
92
82
  200 => lambda { |_| [200, { 'Content-Type' => 'text/html', 'Content-Length' => '0' }, ['']] },
93
83
  404 => lambda { |_| [404, { 'Content-Type' => 'text/html', 'Content-Length' => '0' }, ['']] }
94
84
  }
95
-
85
+
96
86
  #
97
87
  #
98
88
  def default_options url, route_options = {}
99
89
  url = normalized url
100
-
90
+
101
91
  options = { request_method: 'GET' }.merge route_options
102
-
92
+
103
93
  options[:path_info] = url if url
104
-
94
+
105
95
  options.delete :content_type
106
-
96
+
107
97
  query_params = options.delete :query
108
98
  options[:query_string] = %r{#{generate_query_string(query_params)}} if query_params
109
-
99
+
110
100
  options
111
101
  end
112
102
  #
@@ -117,38 +107,44 @@ module Internals
117
107
  k, v = query_params.first
118
108
  "#{k}=#{v}"
119
109
  end
120
-
110
+
121
111
  # Setup a route that answers using the given app.
122
112
  #
123
113
  def answer url = nil, app = nil
124
114
  routes.add_route (app || STATUSES[200]), default_options(url)
125
115
  end
126
-
116
+
127
117
  # Returns a regular expression for the url if it is given a String-like object.
128
118
  #
129
119
  def normalized url
130
120
  url.respond_to?(:to_str) ? %r{#{url}} : url
131
121
  end
132
-
122
+
133
123
  # Returns true if there are no routes defined.
134
124
  #
135
125
  def empty?
136
126
  routes.length.zero?
137
127
  end
138
-
139
- # TODO Beautify.
128
+
129
+ # TODO Beautify. Rewrite!
140
130
  #
141
131
  def to_s
132
+ max_length = routes.instance_variable_get(:@routes).reduce(0) do |current_max, route|
133
+ route_length = route.conditions[:path_info].source.to_s.size
134
+ route_length > current_max ? route_length : current_max
135
+ end
136
+ "Note: Anchored (\u2713) regexps are faster, e.g. /\\A.*\\Z/ or /^.*$/.\n\n" +
142
137
  routes.instance_variable_get(:@routes).map do |route|
143
138
  path_info = route.conditions[:path_info]
144
139
  anchored = ::Rack::Mount::Utils.regexp_anchored?(path_info)
145
140
  anchored_ok = anchored ? "\u2713" : " "
146
- "#{anchored_ok} #{path_info.source}"
147
- end.join "\n"
141
+ source = path_info.source
142
+ "#{anchored_ok} #{source.ljust(max_length)} => #{route.name}"
143
+ end.join("\n")
148
144
  end
149
-
145
+
150
146
  end
151
-
147
+
152
148
  end
153
-
149
+
154
150
  end
@@ -95,8 +95,7 @@ module Internals
95
95
  similarity.restore
96
96
  configuration.restore
97
97
  end
98
-
99
-
98
+
100
99
  # Delete all index files.
101
100
  #
102
101
  def delete
@@ -1,11 +1,11 @@
1
1
  module Internals
2
2
 
3
3
  module Index
4
-
4
+
5
5
  # Handles all aspects of index files, such as dumping/loading.
6
6
  #
7
7
  module File
8
-
8
+
9
9
  # Base class for all index files.
10
10
  #
11
11
  # Provides necessary helper methods for its
@@ -14,22 +14,26 @@ module Internals
14
14
  # dump/load methods.
15
15
  #
16
16
  class Basic
17
-
17
+
18
18
  attr_reader :cache_path
19
-
19
+
20
20
  # An index cache takes a path, without file extension,
21
21
  # which will be provided by the subclasses.
22
22
  #
23
23
  def initialize cache_path
24
24
  @cache_path = "#{cache_path}.#{extension}"
25
25
  end
26
-
26
+
27
+ def to_s
28
+ cache_path
29
+ end
30
+
27
31
  # The default extension for index files is "index".
28
32
  #
29
33
  def extension
30
34
  :index
31
35
  end
32
-
36
+
33
37
  # Will copy the index file to a location that
34
38
  # is in a directory named "backup" right under
35
39
  # the directory the index file is in.
@@ -49,7 +53,7 @@ module Internals
49
53
  def prepare_backup target
50
54
  FileUtils.mkdir target unless Dir.exists?(target)
51
55
  end
52
-
56
+
53
57
  # Copies the file from its backup location back
54
58
  # to the original location.
55
59
  #
@@ -62,16 +66,16 @@ module Internals
62
66
  dir, name = ::File.split path
63
67
  ::File.join dir, 'backup', name
64
68
  end
65
-
69
+
66
70
  # Deletes the file.
67
71
  #
68
72
  def delete
69
73
  `rm -Rf #{cache_path}`
70
74
  end
71
-
75
+
72
76
  # Checks.
73
77
  #
74
-
78
+
75
79
  # Is this cache file suspiciously small?
76
80
  # (less than 8 Bytes of size)
77
81
  #
@@ -91,11 +95,11 @@ module Internals
91
95
  def size_of path
92
96
  `ls -l #{path} | awk '{print $5}'`.to_i
93
97
  end
94
-
98
+
95
99
  end
96
-
100
+
97
101
  end
98
-
102
+
99
103
  end
100
-
104
+
101
105
  end
@@ -1,12 +1,12 @@
1
1
  module Internals
2
2
 
3
3
  module Index
4
-
4
+
5
5
  class Files < Backend
6
-
6
+
7
7
  def initialize bundle_name, config
8
8
  super bundle_name, config
9
-
9
+
10
10
  # Note: We marshal the similarity, as the
11
11
  # Yajl json lib cannot load symbolized
12
12
  # values, just keys.
@@ -16,9 +16,19 @@ module Internals
16
16
  @similarity = File::Marshal.new config.index_path(bundle_name, :similarity)
17
17
  @configuration = File::JSON.new config.index_path(bundle_name, :configuration)
18
18
  end
19
-
19
+
20
+ def to_s
21
+ <<-FILES
22
+ Files:
23
+ #{"Index: #{@index}".indented_to_s}
24
+ #{"Weights: #{@weights}".indented_to_s}
25
+ #{"Similarity: #{@similarity}".indented_to_s}
26
+ #{"Config: #{@configuration}".indented_to_s}
27
+ FILES
28
+ end
29
+
20
30
  end
21
-
31
+
22
32
  end
23
-
33
+
24
34
  end
@@ -49,7 +49,10 @@ module Internals
49
49
  # Deletes the Redis index namespace.
50
50
  #
51
51
  def delete
52
- # TODO @backend.
52
+ # Not implemented here.
53
+ # Note: backend.flushdb might be the way to go,
54
+ # but since we cannot delete by key pattern,
55
+ # we don't do anything.
53
56
  end
54
57
 
55
58
  # Checks.
@@ -58,19 +61,23 @@ module Internals
58
61
  # Is this cache suspiciously small?
59
62
  #
60
63
  def cache_small?
61
- false # TODO
64
+ size < 1
62
65
  end
63
66
  # Is the cache ok?
64
67
  #
65
68
  # A small cache is still ok.
66
69
  #
67
70
  def cache_ok?
68
- false # TODO
71
+ size > 0
69
72
  end
70
73
  # Extracts the size of the file in Bytes.
71
74
  #
72
- def size_of path
73
- # TODO
75
+ # Note: This is a very forgiving implementation.
76
+ # But as long as Redis does not implement
77
+ # DBSIZE KEYPATTERN, we are stuck with this.
78
+ #
79
+ def size
80
+ backend.dbsize
74
81
  end
75
82
 
76
83
  end
@@ -8,8 +8,8 @@ module Internals
8
8
 
9
9
  def initialize bundle_name, config
10
10
  super bundle_name, config
11
-
12
- # TODO
11
+
12
+ # Refine a few Redis "types".
13
13
  #
14
14
  @index = Redis::ListHash.new "#{config.identifier}:#{bundle_name}:index"
15
15
  @weights = Redis::StringHash.new "#{config.identifier}:#{bundle_name}:weights"