picky 2.0.0.pre2 → 2.0.0.pre3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. data/lib/picky/application.rb +1 -1
  2. data/lib/picky/cli.rb +2 -2
  3. data/lib/picky/index/base.rb +1 -1
  4. data/lib/picky/internals/generators/similarity/double_metaphone.rb +32 -0
  5. data/lib/picky/internals/generators/similarity/metaphone.rb +32 -0
  6. data/lib/picky/internals/generators/similarity/{double_levenshtone.rb → phonetic.rb} +9 -21
  7. data/lib/picky/internals/generators/similarity/soundex.rb +32 -0
  8. data/lib/picky/internals/index/redis/basic.rb +15 -15
  9. data/lib/picky/internals/index/redis/list_hash.rb +13 -13
  10. data/lib/picky/internals/index/redis/string_hash.rb +11 -9
  11. data/lib/picky/internals/indexers/serial.rb +8 -8
  12. data/lib/picky/internals/indexing/bundle/base.rb +1 -1
  13. data/lib/picky/internals/indexing/bundle/memory.rb +1 -4
  14. data/lib/picky/internals/indexing/category.rb +3 -3
  15. data/lib/picky/internals/query/combinations/base.rb +5 -11
  16. data/lib/picky/internals/query/combinations/redis.rb +44 -24
  17. data/lib/picky/internals/query/indexes.rb +29 -24
  18. data/lib/picky/internals/query/token.rb +12 -12
  19. data/lib/picky/internals/tokenizers/base.rb +1 -1
  20. data/lib/picky/loader.rb +4 -4
  21. data/lib/picky/sources/couch.rb +4 -6
  22. data/lib/picky/sources/delicious.rb +1 -1
  23. data/spec/lib/analyzer_spec.rb +18 -0
  24. data/spec/lib/application_spec.rb +13 -3
  25. data/spec/lib/bundling_spec.rb +21 -0
  26. data/spec/lib/character_substituters/west_european_spec.rb +8 -2
  27. data/spec/lib/cli_spec.rb +45 -17
  28. data/spec/lib/index/redis_spec.rb +15 -0
  29. data/spec/lib/internals/adapters/rack/live_parameters_spec.rb +11 -6
  30. data/spec/lib/internals/frontend_adapters/rack_spec.rb +22 -0
  31. data/spec/lib/internals/generators/similarity/{double_levenshtone_spec.rb → double_metaphone_spec.rb} +1 -7
  32. data/spec/lib/internals/generators/similarity/metaphone_spec.rb +60 -0
  33. data/spec/lib/internals/generators/similarity/phonetic_spec.rb +13 -0
  34. data/spec/lib/internals/generators/similarity/soundex_spec.rb +60 -0
  35. data/spec/lib/internals/generators/similarity_generator_spec.rb +1 -1
  36. data/spec/lib/internals/index/file/basic_spec.rb +15 -5
  37. data/spec/lib/internals/index/redis/list_hash_spec.rb +34 -0
  38. data/spec/lib/internals/index/redis/string_hash_spec.rb +12 -0
  39. data/spec/lib/internals/indexed/bundle/memory_spec.rb +66 -0
  40. data/spec/lib/internals/indexing/bundle/memory_spec.rb +87 -71
  41. data/spec/lib/internals/indexing/bundle/redis_spec.rb +282 -0
  42. data/spec/lib/internals/indexing/bundle/super_base_spec.rb +1 -1
  43. data/spec/lib/internals/indexing/categories_spec.rb +49 -0
  44. data/spec/lib/internals/indexing/category_spec.rb +68 -35
  45. data/spec/lib/query/combinations/base_spec.rb +0 -9
  46. data/spec/lib/query/combinations/memory_spec.rb +0 -9
  47. data/spec/lib/query/combinations/redis_spec.rb +40 -5
  48. data/spec/lib/sources/couch_spec.rb +22 -0
  49. data/spec/lib/sources/csv_spec.rb +7 -0
  50. data/spec/lib/sources/db_spec.rb +7 -1
  51. data/spec/lib/sources/delicious_spec.rb +6 -2
  52. metadata +26 -5
@@ -130,7 +130,7 @@
130
130
  # books.define_category :title,
131
131
  # qualifiers: [:t, :title, :titre],
132
132
  # partial: Partial::Substring.new(:from => 1),
133
- # similarity: Similarity::Phonetic.new(2)
133
+ # similarity: Similarity::DoubleMetaphone.new(2)
134
134
  # books.define_category :author,
135
135
  # partial: Partial::Substring.new(:from => -2)
136
136
  # books.define_category :isbn
data/lib/picky/cli.rb CHANGED
@@ -71,7 +71,7 @@ module Picky
71
71
  end
72
72
  class Generate < Base
73
73
  def execute name, args, params
74
- system "picky-generate #{args.join(' ')}"
74
+ Kernel.system "picky-generate #{args.join(' ')}"
75
75
  end
76
76
  end
77
77
  class Help < Base
@@ -83,7 +83,7 @@ module Picky
83
83
  " picky #{command} #{params_to_s(params)}"
84
84
  end.join(?\n)
85
85
 
86
- puts "Possible commands:\n" + commands
86
+ Kernel.puts "Possible commands:\n#{commands}\n"
87
87
  end
88
88
  end
89
89
 
@@ -79,7 +79,7 @@ INDEX
79
79
  #
80
80
  # === Options
81
81
  # * partial: Partial::None.new or Partial::Substring.new(from: starting_char, to: ending_char). Default is Partial::Substring.new(from: -3, to: -1).
82
- # * similarity: Similarity::None.new or Similarity::Phonetic.new(similar_words_searched). Default is Similarity::None.new.
82
+ # * similarity: Similarity::None.new or Similarity::DoubleMetaphone.new(similar_words_searched). Default is Similarity::None.new.
83
83
  # * qualifiers: An array of qualifiers with which you can define which category you’d like to search, for example “title:hobbit” will search for hobbit in just title categories. Example: qualifiers: [:t, :titre, :title] (use it for example with multiple languages). Default is the name of the category.
84
84
  # * qualifier: Convenience options if you just need a single qualifier, see above. Example: qualifiers => :title. Default is the name of the category.
85
85
  # * source: Use a different source than the index uses. If you think you need that, there might be a better solution to your problem. Please post to the mailing list first with your application.rb :)
@@ -0,0 +1,32 @@
1
+ # encoding: utf-8
2
+ #
3
+ module Internals
4
+
5
+ module Generators
6
+
7
+ module Similarity
8
+
9
+ # It's actually a combination of double metaphone
10
+ # and Levenshtein.
11
+ #
12
+ # It uses the double metaphone to get similar words
13
+ # and ranks them using the levenshtein.
14
+ #
15
+ class DoubleMetaphone < Phonetic
16
+
17
+ # Encodes the given symbol.
18
+ #
19
+ # Returns a symbol.
20
+ #
21
+ def encoded sym
22
+ codes = Text::Metaphone.double_metaphone sym.to_s
23
+ codes.first.to_sym unless codes.empty?
24
+ end
25
+
26
+ end
27
+
28
+ end
29
+
30
+ end
31
+
32
+ end
@@ -0,0 +1,32 @@
1
+ # encoding: utf-8
2
+ #
3
+ module Internals
4
+
5
+ module Generators
6
+
7
+ module Similarity
8
+
9
+ # It's actually a combination of metaphone
10
+ # and Levenshtein.
11
+ #
12
+ # It uses the metaphone to get similar words
13
+ # and ranks them using the levenshtein.
14
+ #
15
+ class Metaphone < Phonetic
16
+
17
+ # Encodes the given symbol.
18
+ #
19
+ # Returns a symbol.
20
+ #
21
+ def encoded sym
22
+ code = Text::Metaphone.metaphone sym.to_s
23
+ code.to_sym if code
24
+ end
25
+
26
+ end
27
+
28
+ end
29
+
30
+ end
31
+
32
+ end
@@ -6,31 +6,23 @@ module Internals
6
6
 
7
7
  module Similarity
8
8
 
9
- # DoubleLevensthone means that it's a combination of
10
- # * DoubleMetaphone
11
- # and
12
- # * Levenshtein
13
- # :)
9
+ # It's actually a combination of double metaphone
10
+ # and Levenshtein.
14
11
  #
15
- class DoubleLevenshtone < Strategy
12
+ # It uses the double metaphone to get similar words
13
+ # and ranks them using the levenshtein.
14
+ #
15
+ class Phonetic < Strategy
16
16
 
17
17
  attr_reader :amount
18
18
 
19
19
  #
20
20
  #
21
21
  def initialize amount = 10
22
+ raise "From Picky 2.0 on you need to use the DoubleMetaphone similarity instead of the Phonetic similarity." if self.class == Phonetic
22
23
  @amount = amount
23
24
  end
24
25
 
25
- # Encodes the given symbol.
26
- #
27
- # Returns a symbol.
28
- #
29
- def encoded sym
30
- codes = Text::Metaphone.double_metaphone sym.to_s
31
- codes.first.to_sym unless codes.empty?
32
- end
33
-
34
26
  # Generates an index for the given index (in exact index style).
35
27
  #
36
28
  # In the following form:
@@ -41,7 +33,7 @@ module Internals
41
33
  sort hash
42
34
  end
43
35
 
44
- private
36
+ protected
45
37
 
46
38
  # Sorts the index values in place.
47
39
  #
@@ -69,13 +61,9 @@ module Internals
69
61
  end
70
62
 
71
63
  end
72
-
73
- # ... aka Phonetic.
74
- #
75
- Phonetic = DoubleLevenshtone
76
64
 
77
65
  end
78
66
 
79
67
  end
80
-
68
+
81
69
  end
@@ -0,0 +1,32 @@
1
+ # encoding: utf-8
2
+ #
3
+ module Internals
4
+
5
+ module Generators
6
+
7
+ module Similarity
8
+
9
+ # It's actually a combination of soundex
10
+ # and Levenshtein.
11
+ #
12
+ # It uses the soundex to get similar words
13
+ # and ranks them using the levenshtein.
14
+ #
15
+ class Soundex < Phonetic
16
+
17
+ # Encodes the given symbol.
18
+ #
19
+ # Returns a symbol.
20
+ #
21
+ def encoded sym
22
+ code = Text::Soundex.soundex sym.to_s
23
+ code.to_sym if code
24
+ end
25
+
26
+ end
27
+
28
+ end
29
+
30
+ end
31
+
32
+ end
@@ -1,9 +1,9 @@
1
1
  module Internals
2
2
 
3
3
  module Index
4
-
4
+
5
5
  class Redis
6
-
6
+
7
7
  # Redis Backend Accessor.
8
8
  #
9
9
  # Provides necessary helper methods for its
@@ -12,23 +12,23 @@ module Internals
12
12
  # dump/load methods.
13
13
  #
14
14
  class Basic
15
-
15
+
16
16
  attr_reader :namespace, :backend
17
-
17
+
18
18
  # An index cache takes a path, without file extension,
19
19
  # which will be provided by the subclasses.
20
20
  #
21
21
  def initialize namespace
22
22
  @namespace = namespace
23
-
23
+
24
24
  # TODO Turn this inside out such that people can pass in
25
25
  # their own Redis instance.
26
- #
26
+ #
27
27
  # TODO Make the :db a real option.
28
28
  #
29
29
  @backend = ::Redis.new :db => 15
30
30
  end
31
-
31
+
32
32
  # Does nothing.
33
33
  #
34
34
  def load
@@ -39,13 +39,13 @@ module Internals
39
39
  def retrieve
40
40
  # Nothing.
41
41
  end
42
-
42
+
43
43
  # Redis does not backup.
44
44
  #
45
45
  def backup
46
46
  # Nothing.
47
47
  end
48
-
48
+
49
49
  # Deletes the Redis index namespace.
50
50
  #
51
51
  def delete
@@ -54,10 +54,10 @@ module Internals
54
54
  # but since we cannot delete by key pattern,
55
55
  # we don't do anything.
56
56
  end
57
-
57
+
58
58
  # Checks.
59
59
  #
60
-
60
+
61
61
  # Is this cache suspiciously small?
62
62
  #
63
63
  def cache_small?
@@ -79,11 +79,11 @@ module Internals
79
79
  def size
80
80
  backend.dbsize
81
81
  end
82
-
82
+
83
83
  end
84
-
84
+
85
85
  end
86
-
86
+
87
87
  end
88
-
88
+
89
89
  end
@@ -1,11 +1,11 @@
1
1
  module Internals
2
2
 
3
3
  module Index
4
-
4
+
5
5
  class Redis
6
-
6
+
7
7
  class ListHash < Basic
8
-
8
+
9
9
  # Writes the hash into Redis.
10
10
  #
11
11
  # TODO Performance: rpush as you get the values instead of putting it together in an array first.
@@ -14,33 +14,33 @@ module Internals
14
14
  hash.each_pair do |key, values|
15
15
  redis_key = "#{namespace}:#{key}"
16
16
  i = 0
17
- @backend.multi do
18
- @backend.del redis_key
19
-
17
+ backend.multi do
18
+ backend.del redis_key
19
+
20
20
  values.each do |value|
21
21
  i += 1
22
- @backend.zadd redis_key, i, value
22
+ backend.zadd redis_key, i, value
23
23
  end
24
24
  end
25
25
  end
26
26
  end
27
-
27
+
28
28
  # Get a collection.
29
29
  #
30
30
  def collection sym
31
- @backend.lrange "#{namespace}:#{sym}", 0, -1
31
+ backend.lrange "#{namespace}:#{sym}", 0, -1
32
32
  end
33
-
33
+
34
34
  # Get a single value.
35
35
  #
36
36
  def member sym
37
37
  raise "Can't retrieve a single value from a Redis ListHash. Use Index::Redis::StringHash."
38
38
  end
39
-
39
+
40
40
  end
41
-
41
+
42
42
  end
43
-
43
+
44
44
  end
45
45
 
46
46
  end
@@ -1,36 +1,38 @@
1
1
  module Internals
2
2
 
3
3
  module Index
4
-
4
+
5
5
  class Redis
6
-
6
+
7
7
  class StringHash < Basic
8
-
8
+
9
9
  # Writes the hash into Redis.
10
10
  #
11
+ # TODO Could we use multi?
12
+ #
11
13
  def dump hash
12
14
  redis = backend
13
15
  hash.each_pair do |key, value|
14
16
  redis.hset namespace, key, value
15
17
  end
16
18
  end
17
-
19
+
18
20
  # Get a collection.
19
21
  #
20
22
  def collection sym
21
23
  raise "Can't retrieve a collection from a StringHash. Use Index::Redis::ListHash."
22
24
  end
23
-
25
+
24
26
  # Get a single value.
25
27
  #
26
28
  def member sym
27
29
  backend.hget namespace, sym
28
30
  end
29
-
31
+
30
32
  end
31
-
33
+
32
34
  end
33
-
35
+
34
36
  end
35
-
37
+
36
38
  end
@@ -1,25 +1,25 @@
1
1
  # encoding: utf-8
2
2
  #
3
3
  module Indexers
4
-
4
+
5
5
  # The indexer defines the control flow.
6
6
  #
7
7
  class Serial
8
-
8
+
9
9
  attr_accessor :tokenizer, :source
10
-
10
+
11
11
  def initialize configuration, source, tokenizer
12
12
  @configuration = configuration
13
13
  @source = source || raise_no_source
14
14
  @tokenizer = tokenizer
15
15
  end
16
-
16
+
17
17
  # Raise a no source exception.
18
18
  #
19
19
  def raise_no_source
20
20
  raise NoSourceSpecifiedException.new("No source given for #{@configuration}.")
21
21
  end
22
-
22
+
23
23
  # Delegates the key format to the source.
24
24
  #
25
25
  # Default is to_i.
@@ -27,7 +27,7 @@ module Indexers
27
27
  def key_format
28
28
  @source.key_format || :to_i
29
29
  end
30
-
30
+
31
31
  # Selects the original id (indexed id) and a column to process. The column data is called "token".
32
32
  #
33
33
  # Note: Puts together the parts first in an array, then releasing the array from time to time by joining.
@@ -39,7 +39,7 @@ module Indexers
39
39
  def process
40
40
  comma = ?,
41
41
  newline = ?\n
42
-
42
+
43
43
  # TODO Move open to config?
44
44
  #
45
45
  # @category.prepared_index do |file|
@@ -67,6 +67,6 @@ module Indexers
67
67
  def indexing_message
68
68
  timed_exclaim %Q{"#{@configuration.identifier}": Starting indexing.}
69
69
  end
70
-
70
+
71
71
  end
72
72
  end
@@ -180,7 +180,7 @@ module Internals
180
180
  # Raises an appropriate error message for the given cache.
181
181
  #
182
182
  def raise_cache_missing what
183
- raise "#{what} cache for #{identifier} missing."
183
+ raise "Error: The #{what} cache for #{identifier} is missing."
184
184
  end
185
185
 
186
186
  # Warns the user if the similarity index is small.
@@ -17,10 +17,7 @@ module Internals
17
17
  alias backend files
18
18
 
19
19
  def to_s
20
- <<-MEMORY
21
- Memory
22
- #{@backend.indented_to_s}
23
- MEMORY
20
+ "Memory\n#{@backend.indented_to_s}"
24
21
  end
25
22
 
26
23
  end
@@ -14,7 +14,7 @@ module Internals
14
14
  # * index: Index to which this category is attached to.
15
15
  # Options:
16
16
  # * partial: Partial::None.new, Partial::Substring.new(from:start_char, to:up_to_char) (defaults from:-3, to:-1)
17
- # * similarity: Similarity::None.new (default), Similarity::Phonetic.new(amount_of_similarly_linked_words)
17
+ # * similarity: Similarity::None.new (default), Similarity::DoubleMetaphone.new(amount_of_similarly_linked_words)
18
18
  # * source: Use if the category should use a different source.
19
19
  # * from: The source category identifier to take the data from.
20
20
  #
@@ -23,6 +23,8 @@ module Internals
23
23
  # * weights:
24
24
  # * tokenizer:
25
25
  #
26
+ # TODO Should source be not optional, or taken from the index?
27
+ #
26
28
  def initialize name, index, options = {}
27
29
  @name = name
28
30
  @from = options[:from]
@@ -59,8 +61,6 @@ Category(#{name} from #{from}):
59
61
  @from || name
60
62
  end
61
63
 
62
- # TODO Spec.
63
- #
64
64
  def backup_caches
65
65
  timed_exclaim "Backing up #{identifier}."
66
66
  exact.backup
@@ -8,11 +8,11 @@ module Internals
8
8
  # An allocation consists of a number of combinations.
9
9
  #
10
10
  module Combinations # :nodoc:all
11
-
11
+
12
12
  # Base Combinations contain methods for calculating score and ids.
13
13
  #
14
14
  class Base
15
-
15
+
16
16
  attr_reader :combinations
17
17
 
18
18
  delegate :empty?, :to => :@combinations
@@ -36,13 +36,7 @@ module Internals
36
36
  def weighted_score weights
37
37
  weights.score @combinations
38
38
  end
39
-
40
- # Wrap the combinations into an allocation with the result_identifier.
41
- #
42
- def pack_into_allocation result_identifier
43
- Allocation.new self, result_identifier
44
- end
45
-
39
+
46
40
  # Filters the tokens and identifiers such that only identifiers
47
41
  # that are passed in, remain, including their tokens.
48
42
  #
@@ -70,11 +64,11 @@ module Internals
70
64
  def to_result
71
65
  @combinations.map &:to_result
72
66
  end
73
-
67
+
74
68
  end
75
69
 
76
70
  end
77
71
 
78
72
  end
79
-
73
+
80
74
  end
@@ -8,63 +8,83 @@ module Internals
8
8
  # An allocation consists of a number of combinations.
9
9
  #
10
10
  module Combinations # :nodoc:all
11
-
11
+
12
12
  # Redis Combinations contain specific methods for
13
13
  # calculating score and ids in memory.
14
14
  #
15
15
  class Redis < Base
16
-
17
- # TODO Err… yeah. Wrap in Picky specific wrapper.
16
+
17
+ # Connect to the backend.
18
+ #
19
+ # TODO Use specific Picky Redis wrapper.
20
+ #
21
+ def self.redis
22
+ @redis ||= ::Redis.new :db => 15
23
+ end
24
+
25
+ attr_reader :redis
26
+
27
+ #
18
28
  #
19
29
  def initialize combinations
20
30
  super combinations
21
-
22
- @@redis ||= ::Redis.new :db => 15
31
+
32
+ @redis = self.class.redis
23
33
  end
24
-
34
+
25
35
  # Returns the result ids for the allocation.
26
36
  #
27
37
  def ids amount, offset
28
38
  return [] if @combinations.empty?
29
-
39
+
30
40
  identifiers = @combinations.inject([]) do |identifiers, combination|
31
41
  identifiers << "#{combination.identifier}"
32
42
  end
33
-
43
+
34
44
  result_id = generate_intermediate_result_id
35
-
45
+
36
46
  # Intersect and store.
37
47
  #
38
- @@redis.zinterstore result_id, identifiers
39
-
48
+ redis.zinterstore result_id, identifiers
49
+
40
50
  # Get the stored result.
41
51
  #
42
- results = @@redis.zrange result_id, offset, (offset + amount)
43
-
52
+ results = redis.zrange result_id, offset, (offset + amount)
53
+
44
54
  # Delete the stored result as it was only for temporary purposes.
45
55
  #
46
- @@redis.del result_id
47
-
56
+ # Note: I could also not delete it, but that would not be clean at all.
57
+ #
58
+ redis.del result_id
59
+
48
60
  results
49
61
  end
50
-
62
+
51
63
  # Generate a multiple host/process safe result id.
52
64
  #
65
+ # Note: Generated when this class loads.
66
+ #
53
67
  require 'socket'
54
- @@host = Socket.gethostname
55
- define_method :host do
56
- @@host
68
+ def self.extract_host
69
+ @host ||= Socket.gethostname
70
+ end
71
+ def host
72
+ self.class.extract_host
73
+ end
74
+ extract_host
75
+ def pid
76
+ @pid ||= Process.pid
57
77
  end
58
78
  # Use the host and pid (generated lazily in child processes) for the result.
59
79
  #
60
80
  def generate_intermediate_result_id
61
- :"#{host}:#{@pid ||= Process.pid}:picky:result"
81
+ :"#{host}:#{pid}:picky:result"
62
82
  end
63
-
83
+
64
84
  end
65
-
85
+
66
86
  end
67
-
87
+
68
88
  end
69
-
89
+
70
90
  end