picky 2.0.0.pre2 → 2.0.0.pre3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (52)
  1. data/lib/picky/application.rb +1 -1
  2. data/lib/picky/cli.rb +2 -2
  3. data/lib/picky/index/base.rb +1 -1
  4. data/lib/picky/internals/generators/similarity/double_metaphone.rb +32 -0
  5. data/lib/picky/internals/generators/similarity/metaphone.rb +32 -0
  6. data/lib/picky/internals/generators/similarity/{double_levenshtone.rb → phonetic.rb} +9 -21
  7. data/lib/picky/internals/generators/similarity/soundex.rb +32 -0
  8. data/lib/picky/internals/index/redis/basic.rb +15 -15
  9. data/lib/picky/internals/index/redis/list_hash.rb +13 -13
  10. data/lib/picky/internals/index/redis/string_hash.rb +11 -9
  11. data/lib/picky/internals/indexers/serial.rb +8 -8
  12. data/lib/picky/internals/indexing/bundle/base.rb +1 -1
  13. data/lib/picky/internals/indexing/bundle/memory.rb +1 -4
  14. data/lib/picky/internals/indexing/category.rb +3 -3
  15. data/lib/picky/internals/query/combinations/base.rb +5 -11
  16. data/lib/picky/internals/query/combinations/redis.rb +44 -24
  17. data/lib/picky/internals/query/indexes.rb +29 -24
  18. data/lib/picky/internals/query/token.rb +12 -12
  19. data/lib/picky/internals/tokenizers/base.rb +1 -1
  20. data/lib/picky/loader.rb +4 -4
  21. data/lib/picky/sources/couch.rb +4 -6
  22. data/lib/picky/sources/delicious.rb +1 -1
  23. data/spec/lib/analyzer_spec.rb +18 -0
  24. data/spec/lib/application_spec.rb +13 -3
  25. data/spec/lib/bundling_spec.rb +21 -0
  26. data/spec/lib/character_substituters/west_european_spec.rb +8 -2
  27. data/spec/lib/cli_spec.rb +45 -17
  28. data/spec/lib/index/redis_spec.rb +15 -0
  29. data/spec/lib/internals/adapters/rack/live_parameters_spec.rb +11 -6
  30. data/spec/lib/internals/frontend_adapters/rack_spec.rb +22 -0
  31. data/spec/lib/internals/generators/similarity/{double_levenshtone_spec.rb → double_metaphone_spec.rb} +1 -7
  32. data/spec/lib/internals/generators/similarity/metaphone_spec.rb +60 -0
  33. data/spec/lib/internals/generators/similarity/phonetic_spec.rb +13 -0
  34. data/spec/lib/internals/generators/similarity/soundex_spec.rb +60 -0
  35. data/spec/lib/internals/generators/similarity_generator_spec.rb +1 -1
  36. data/spec/lib/internals/index/file/basic_spec.rb +15 -5
  37. data/spec/lib/internals/index/redis/list_hash_spec.rb +34 -0
  38. data/spec/lib/internals/index/redis/string_hash_spec.rb +12 -0
  39. data/spec/lib/internals/indexed/bundle/memory_spec.rb +66 -0
  40. data/spec/lib/internals/indexing/bundle/memory_spec.rb +87 -71
  41. data/spec/lib/internals/indexing/bundle/redis_spec.rb +282 -0
  42. data/spec/lib/internals/indexing/bundle/super_base_spec.rb +1 -1
  43. data/spec/lib/internals/indexing/categories_spec.rb +49 -0
  44. data/spec/lib/internals/indexing/category_spec.rb +68 -35
  45. data/spec/lib/query/combinations/base_spec.rb +0 -9
  46. data/spec/lib/query/combinations/memory_spec.rb +0 -9
  47. data/spec/lib/query/combinations/redis_spec.rb +40 -5
  48. data/spec/lib/sources/couch_spec.rb +22 -0
  49. data/spec/lib/sources/csv_spec.rb +7 -0
  50. data/spec/lib/sources/db_spec.rb +7 -1
  51. data/spec/lib/sources/delicious_spec.rb +6 -2
  52. metadata +26 -5
@@ -130,7 +130,7 @@
130
130
  # books.define_category :title,
131
131
  # qualifiers: [:t, :title, :titre],
132
132
  # partial: Partial::Substring.new(:from => 1),
133
- # similarity: Similarity::Phonetic.new(2)
133
+ # similarity: Similarity::DoubleMetaphone.new(2)
134
134
  # books.define_category :author,
135
135
  # partial: Partial::Substring.new(:from => -2)
136
136
  # books.define_category :isbn
data/lib/picky/cli.rb CHANGED
@@ -71,7 +71,7 @@ module Picky
71
71
  end
72
72
  class Generate < Base
73
73
  def execute name, args, params
74
- system "picky-generate #{args.join(' ')}"
74
+ Kernel.system "picky-generate #{args.join(' ')}"
75
75
  end
76
76
  end
77
77
  class Help < Base
@@ -83,7 +83,7 @@ module Picky
83
83
  " picky #{command} #{params_to_s(params)}"
84
84
  end.join(?\n)
85
85
 
86
- puts "Possible commands:\n" + commands
86
+ Kernel.puts "Possible commands:\n#{commands}\n"
87
87
  end
88
88
  end
89
89
 
@@ -79,7 +79,7 @@ INDEX
79
79
  #
80
80
  # === Options
81
81
  # * partial: Partial::None.new or Partial::Substring.new(from: starting_char, to: ending_char). Default is Partial::Substring.new(from: -3, to: -1).
82
- # * similarity: Similarity::None.new or Similarity::Phonetic.new(similar_words_searched). Default is Similarity::None.new.
82
+ # * similarity: Similarity::None.new or Similarity::DoubleMetaphone.new(similar_words_searched). Default is Similarity::None.new.
83
83
  # * qualifiers: An array of qualifiers with which you can define which category you’d like to search, for example “title:hobbit” will search for hobbit in just title categories. Example: qualifiers: [:t, :titre, :title] (use it for example with multiple languages). Default is the name of the category.
84
84
  # * qualifier: Convenience options if you just need a single qualifier, see above. Example: qualifiers => :title. Default is the name of the category.
85
85
  # * source: Use a different source than the index uses. If you think you need that, there might be a better solution to your problem. Please post to the mailing list first with your application.rb :)
@@ -0,0 +1,32 @@
1
+ # encoding: utf-8
2
+ #
3
+ module Internals
4
+
5
+ module Generators
6
+
7
+ module Similarity
8
+
9
+ # It's actually a combination of double metaphone
10
+ # and Levenshtein.
11
+ #
12
+ # It uses the double metaphone to get similar words
13
+ # and ranks them using the levenshtein.
14
+ #
15
+ class DoubleMetaphone < Phonetic
16
+
17
+ # Encodes the given symbol.
18
+ #
19
+ # Returns a symbol.
20
+ #
21
+ def encoded sym
22
+ codes = Text::Metaphone.double_metaphone sym.to_s
23
+ codes.first.to_sym unless codes.empty?
24
+ end
25
+
26
+ end
27
+
28
+ end
29
+
30
+ end
31
+
32
+ end
@@ -0,0 +1,32 @@
1
+ # encoding: utf-8
2
+ #
3
+ module Internals
4
+
5
+ module Generators
6
+
7
+ module Similarity
8
+
9
+ # It's actually a combination of metaphone
10
+ # and Levenshtein.
11
+ #
12
+ # It uses the metaphone to get similar words
13
+ # and ranks them using the levenshtein.
14
+ #
15
+ class Metaphone < Phonetic
16
+
17
+ # Encodes the given symbol.
18
+ #
19
+ # Returns a symbol.
20
+ #
21
+ def encoded sym
22
+ code = Text::Metaphone.metaphone sym.to_s
23
+ code.to_sym if code
24
+ end
25
+
26
+ end
27
+
28
+ end
29
+
30
+ end
31
+
32
+ end
@@ -6,31 +6,23 @@ module Internals
6
6
 
7
7
  module Similarity
8
8
 
9
- # DoubleLevensthone means that it's a combination of
10
- # * DoubleMetaphone
11
- # and
12
- # * Levenshtein
13
- # :)
9
+ # It's actually a combination of double metaphone
10
+ # and Levenshtein.
14
11
  #
15
- class DoubleLevenshtone < Strategy
12
+ # It uses the double metaphone to get similar words
13
+ # and ranks them using the levenshtein.
14
+ #
15
+ class Phonetic < Strategy
16
16
 
17
17
  attr_reader :amount
18
18
 
19
19
  #
20
20
  #
21
21
  def initialize amount = 10
22
+ raise "From Picky 2.0 on you need to use the DoubleMetaphone similarity instead of the Phonetic similarity." if self.class == Phonetic
22
23
  @amount = amount
23
24
  end
24
25
 
25
- # Encodes the given symbol.
26
- #
27
- # Returns a symbol.
28
- #
29
- def encoded sym
30
- codes = Text::Metaphone.double_metaphone sym.to_s
31
- codes.first.to_sym unless codes.empty?
32
- end
33
-
34
26
  # Generates an index for the given index (in exact index style).
35
27
  #
36
28
  # In the following form:
@@ -41,7 +33,7 @@ module Internals
41
33
  sort hash
42
34
  end
43
35
 
44
- private
36
+ protected
45
37
 
46
38
  # Sorts the index values in place.
47
39
  #
@@ -69,13 +61,9 @@ module Internals
69
61
  end
70
62
 
71
63
  end
72
-
73
- # ... aka Phonetic.
74
- #
75
- Phonetic = DoubleLevenshtone
76
64
 
77
65
  end
78
66
 
79
67
  end
80
-
68
+
81
69
  end
@@ -0,0 +1,32 @@
1
+ # encoding: utf-8
2
+ #
3
+ module Internals
4
+
5
+ module Generators
6
+
7
+ module Similarity
8
+
9
+ # It's actually a combination of soundex
10
+ # and Levenshtein.
11
+ #
12
+ # It uses the soundex to get similar words
13
+ # and ranks them using the levenshtein.
14
+ #
15
+ class Soundex < Phonetic
16
+
17
+ # Encodes the given symbol.
18
+ #
19
+ # Returns a symbol.
20
+ #
21
+ def encoded sym
22
+ code = Text::Soundex.soundex sym.to_s
23
+ code.to_sym if code
24
+ end
25
+
26
+ end
27
+
28
+ end
29
+
30
+ end
31
+
32
+ end
@@ -1,9 +1,9 @@
1
1
  module Internals
2
2
 
3
3
  module Index
4
-
4
+
5
5
  class Redis
6
-
6
+
7
7
  # Redis Backend Accessor.
8
8
  #
9
9
  # Provides necessary helper methods for its
@@ -12,23 +12,23 @@ module Internals
12
12
  # dump/load methods.
13
13
  #
14
14
  class Basic
15
-
15
+
16
16
  attr_reader :namespace, :backend
17
-
17
+
18
18
  # An index cache takes a path, without file extension,
19
19
  # which will be provided by the subclasses.
20
20
  #
21
21
  def initialize namespace
22
22
  @namespace = namespace
23
-
23
+
24
24
  # TODO Turn this inside out such that people can pass in
25
25
  # their own Redis instance.
26
- #
26
+ #
27
27
  # TODO Make the :db a real option.
28
28
  #
29
29
  @backend = ::Redis.new :db => 15
30
30
  end
31
-
31
+
32
32
  # Does nothing.
33
33
  #
34
34
  def load
@@ -39,13 +39,13 @@ module Internals
39
39
  def retrieve
40
40
  # Nothing.
41
41
  end
42
-
42
+
43
43
  # Redis does not backup.
44
44
  #
45
45
  def backup
46
46
  # Nothing.
47
47
  end
48
-
48
+
49
49
  # Deletes the Redis index namespace.
50
50
  #
51
51
  def delete
@@ -54,10 +54,10 @@ module Internals
54
54
  # but since we cannot delete by key pattern,
55
55
  # we don't do anything.
56
56
  end
57
-
57
+
58
58
  # Checks.
59
59
  #
60
-
60
+
61
61
  # Is this cache suspiciously small?
62
62
  #
63
63
  def cache_small?
@@ -79,11 +79,11 @@ module Internals
79
79
  def size
80
80
  backend.dbsize
81
81
  end
82
-
82
+
83
83
  end
84
-
84
+
85
85
  end
86
-
86
+
87
87
  end
88
-
88
+
89
89
  end
@@ -1,11 +1,11 @@
1
1
  module Internals
2
2
 
3
3
  module Index
4
-
4
+
5
5
  class Redis
6
-
6
+
7
7
  class ListHash < Basic
8
-
8
+
9
9
  # Writes the hash into Redis.
10
10
  #
11
11
  # TODO Performance: rpush as you get the values instead of putting it together in an array first.
@@ -14,33 +14,33 @@ module Internals
14
14
  hash.each_pair do |key, values|
15
15
  redis_key = "#{namespace}:#{key}"
16
16
  i = 0
17
- @backend.multi do
18
- @backend.del redis_key
19
-
17
+ backend.multi do
18
+ backend.del redis_key
19
+
20
20
  values.each do |value|
21
21
  i += 1
22
- @backend.zadd redis_key, i, value
22
+ backend.zadd redis_key, i, value
23
23
  end
24
24
  end
25
25
  end
26
26
  end
27
-
27
+
28
28
  # Get a collection.
29
29
  #
30
30
  def collection sym
31
- @backend.lrange "#{namespace}:#{sym}", 0, -1
31
+ backend.lrange "#{namespace}:#{sym}", 0, -1
32
32
  end
33
-
33
+
34
34
  # Get a single value.
35
35
  #
36
36
  def member sym
37
37
  raise "Can't retrieve a single value from a Redis ListHash. Use Index::Redis::StringHash."
38
38
  end
39
-
39
+
40
40
  end
41
-
41
+
42
42
  end
43
-
43
+
44
44
  end
45
45
 
46
46
  end
@@ -1,36 +1,38 @@
1
1
  module Internals
2
2
 
3
3
  module Index
4
-
4
+
5
5
  class Redis
6
-
6
+
7
7
  class StringHash < Basic
8
-
8
+
9
9
  # Writes the hash into Redis.
10
10
  #
11
+ # TODO Could we use multi?
12
+ #
11
13
  def dump hash
12
14
  redis = backend
13
15
  hash.each_pair do |key, value|
14
16
  redis.hset namespace, key, value
15
17
  end
16
18
  end
17
-
19
+
18
20
  # Get a collection.
19
21
  #
20
22
  def collection sym
21
23
  raise "Can't retrieve a collection from a StringHash. Use Index::Redis::ListHash."
22
24
  end
23
-
25
+
24
26
  # Get a single value.
25
27
  #
26
28
  def member sym
27
29
  backend.hget namespace, sym
28
30
  end
29
-
31
+
30
32
  end
31
-
33
+
32
34
  end
33
-
35
+
34
36
  end
35
-
37
+
36
38
  end
@@ -1,25 +1,25 @@
1
1
  # encoding: utf-8
2
2
  #
3
3
  module Indexers
4
-
4
+
5
5
  # The indexer defines the control flow.
6
6
  #
7
7
  class Serial
8
-
8
+
9
9
  attr_accessor :tokenizer, :source
10
-
10
+
11
11
  def initialize configuration, source, tokenizer
12
12
  @configuration = configuration
13
13
  @source = source || raise_no_source
14
14
  @tokenizer = tokenizer
15
15
  end
16
-
16
+
17
17
  # Raise a no source exception.
18
18
  #
19
19
  def raise_no_source
20
20
  raise NoSourceSpecifiedException.new("No source given for #{@configuration}.")
21
21
  end
22
-
22
+
23
23
  # Delegates the key format to the source.
24
24
  #
25
25
  # Default is to_i.
@@ -27,7 +27,7 @@ module Indexers
27
27
  def key_format
28
28
  @source.key_format || :to_i
29
29
  end
30
-
30
+
31
31
  # Selects the original id (indexed id) and a column to process. The column data is called "token".
32
32
  #
33
33
  # Note: Puts together the parts first in an array, then releasing the array from time to time by joining.
@@ -39,7 +39,7 @@ module Indexers
39
39
  def process
40
40
  comma = ?,
41
41
  newline = ?\n
42
-
42
+
43
43
  # TODO Move open to config?
44
44
  #
45
45
  # @category.prepared_index do |file|
@@ -67,6 +67,6 @@ module Indexers
67
67
  def indexing_message
68
68
  timed_exclaim %Q{"#{@configuration.identifier}": Starting indexing.}
69
69
  end
70
-
70
+
71
71
  end
72
72
  end
@@ -180,7 +180,7 @@ module Internals
180
180
  # Raises an appropriate error message for the given cache.
181
181
  #
182
182
  def raise_cache_missing what
183
- raise "#{what} cache for #{identifier} missing."
183
+ raise "Error: The #{what} cache for #{identifier} is missing."
184
184
  end
185
185
 
186
186
  # Warns the user if the similarity index is small.
@@ -17,10 +17,7 @@ module Internals
17
17
  alias backend files
18
18
 
19
19
  def to_s
20
- <<-MEMORY
21
- Memory
22
- #{@backend.indented_to_s}
23
- MEMORY
20
+ "Memory\n#{@backend.indented_to_s}"
24
21
  end
25
22
 
26
23
  end
@@ -14,7 +14,7 @@ module Internals
14
14
  # * index: Index to which this category is attached to.
15
15
  # Options:
16
16
  # * partial: Partial::None.new, Partial::Substring.new(from:start_char, to:up_to_char) (defaults from:-3, to:-1)
17
- # * similarity: Similarity::None.new (default), Similarity::Phonetic.new(amount_of_similarly_linked_words)
17
+ # * similarity: Similarity::None.new (default), Similarity::DoubleMetaphone.new(amount_of_similarly_linked_words)
18
18
  # * source: Use if the category should use a different source.
19
19
  # * from: The source category identifier to take the data from.
20
20
  #
@@ -23,6 +23,8 @@ module Internals
23
23
  # * weights:
24
24
  # * tokenizer:
25
25
  #
26
+ # TODO Should source be not optional, or taken from the index?
27
+ #
26
28
  def initialize name, index, options = {}
27
29
  @name = name
28
30
  @from = options[:from]
@@ -59,8 +61,6 @@ Category(#{name} from #{from}):
59
61
  @from || name
60
62
  end
61
63
 
62
- # TODO Spec.
63
- #
64
64
  def backup_caches
65
65
  timed_exclaim "Backing up #{identifier}."
66
66
  exact.backup
@@ -8,11 +8,11 @@ module Internals
8
8
  # An allocation consists of a number of combinations.
9
9
  #
10
10
  module Combinations # :nodoc:all
11
-
11
+
12
12
  # Base Combinations contain methods for calculating score and ids.
13
13
  #
14
14
  class Base
15
-
15
+
16
16
  attr_reader :combinations
17
17
 
18
18
  delegate :empty?, :to => :@combinations
@@ -36,13 +36,7 @@ module Internals
36
36
  def weighted_score weights
37
37
  weights.score @combinations
38
38
  end
39
-
40
- # Wrap the combinations into an allocation with the result_identifier.
41
- #
42
- def pack_into_allocation result_identifier
43
- Allocation.new self, result_identifier
44
- end
45
-
39
+
46
40
  # Filters the tokens and identifiers such that only identifiers
47
41
  # that are passed in, remain, including their tokens.
48
42
  #
@@ -70,11 +64,11 @@ module Internals
70
64
  def to_result
71
65
  @combinations.map &:to_result
72
66
  end
73
-
67
+
74
68
  end
75
69
 
76
70
  end
77
71
 
78
72
  end
79
-
73
+
80
74
  end
@@ -8,63 +8,83 @@ module Internals
8
8
  # An allocation consists of a number of combinations.
9
9
  #
10
10
  module Combinations # :nodoc:all
11
-
11
+
12
12
  # Redis Combinations contain specific methods for
13
13
  # calculating score and ids in memory.
14
14
  #
15
15
  class Redis < Base
16
-
17
- # TODO Err… yeah. Wrap in Picky specific wrapper.
16
+
17
+ # Connect to the backend.
18
+ #
19
+ # TODO Use specific Picky Redis wrapper.
20
+ #
21
+ def self.redis
22
+ @redis ||= ::Redis.new :db => 15
23
+ end
24
+
25
+ attr_reader :redis
26
+
27
+ #
18
28
  #
19
29
  def initialize combinations
20
30
  super combinations
21
-
22
- @@redis ||= ::Redis.new :db => 15
31
+
32
+ @redis = self.class.redis
23
33
  end
24
-
34
+
25
35
  # Returns the result ids for the allocation.
26
36
  #
27
37
  def ids amount, offset
28
38
  return [] if @combinations.empty?
29
-
39
+
30
40
  identifiers = @combinations.inject([]) do |identifiers, combination|
31
41
  identifiers << "#{combination.identifier}"
32
42
  end
33
-
43
+
34
44
  result_id = generate_intermediate_result_id
35
-
45
+
36
46
  # Intersect and store.
37
47
  #
38
- @@redis.zinterstore result_id, identifiers
39
-
48
+ redis.zinterstore result_id, identifiers
49
+
40
50
  # Get the stored result.
41
51
  #
42
- results = @@redis.zrange result_id, offset, (offset + amount)
43
-
52
+ results = redis.zrange result_id, offset, (offset + amount)
53
+
44
54
  # Delete the stored result as it was only for temporary purposes.
45
55
  #
46
- @@redis.del result_id
47
-
56
+ # Note: I could also not delete it, but that would not be clean at all.
57
+ #
58
+ redis.del result_id
59
+
48
60
  results
49
61
  end
50
-
62
+
51
63
  # Generate a multiple host/process safe result id.
52
64
  #
65
+ # Note: Generated when this class loads.
66
+ #
53
67
  require 'socket'
54
- @@host = Socket.gethostname
55
- define_method :host do
56
- @@host
68
+ def self.extract_host
69
+ @host ||= Socket.gethostname
70
+ end
71
+ def host
72
+ self.class.extract_host
73
+ end
74
+ extract_host
75
+ def pid
76
+ @pid ||= Process.pid
57
77
  end
58
78
  # Use the host and pid (generated lazily in child processes) for the result.
59
79
  #
60
80
  def generate_intermediate_result_id
61
- :"#{host}:#{@pid ||= Process.pid}:picky:result"
81
+ :"#{host}:#{pid}:picky:result"
62
82
  end
63
-
83
+
64
84
  end
65
-
85
+
66
86
  end
67
-
87
+
68
88
  end
69
-
89
+
70
90
  end