RubyGems - picky - Versions diffs - 0.10.5 → 0.11.0 - Mend

picky 0.10.5 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (52) hide show

data/lib/picky/alias_instances.rb +1 -0
data/lib/picky/application.rb +6 -7
data/lib/picky/bundle.rb +31 -0
data/lib/picky/configuration/indexes.rb +30 -41
data/lib/picky/configuration/type.rb +6 -40
data/lib/picky/ext/maybe_compile.rb +9 -0
data/lib/picky/index/bundle.rb +1 -139
data/lib/picky/{query/combinator.rb → index/categories.rb} +16 -18
data/lib/picky/index/category.rb +20 -46
data/lib/picky/index/type.rb +16 -12
data/lib/picky/index/types.rb +41 -0
data/lib/picky/index/wrappers/exact_first.rb +5 -1
data/lib/picky/indexers/base.rb +9 -8
data/lib/picky/indexing/bundle.rb +152 -0
data/lib/picky/indexing/categories.rb +36 -0
data/lib/picky/indexing/category.rb +145 -0
data/lib/picky/indexing/type.rb +45 -0
data/lib/picky/indexing/types.rb +74 -0
data/lib/picky/loader.rb +17 -7
data/lib/picky/query/base.rb +5 -4
data/lib/picky/sources/wrappers/base.rb +23 -0
data/lib/picky/sources/wrappers/location.rb +92 -0
data/lib/picky/tokenizers/index.rb +4 -1
data/lib/picky/type.rb +46 -0
data/lib/picky/types.rb +38 -0
data/lib/tasks/index.rake +4 -0
data/project_prototype/Gemfile +1 -1
data/project_prototype/app/application.rb +12 -12
data/spec/lib/application_spec.rb +6 -9
data/spec/lib/configuration/indexes_spec.rb +0 -85
data/spec/lib/index/bundle_spec.rb +2 -94
data/spec/lib/index/category_spec.rb +7 -86
data/spec/lib/index/type_spec.rb +14 -26
data/spec/lib/index/wrappers/exact_first_spec.rb +12 -12
data/spec/lib/{index → indexing}/bundle_partial_generation_speed_spec.rb +2 -2
data/spec/lib/indexing/bundle_spec.rb +174 -0
data/spec/lib/{query/combinator_spec.rb → indexing/categories_spec.rb} +30 -34
data/spec/lib/indexing/category_spec.rb +257 -0
data/spec/lib/indexing/type_spec.rb +32 -0
data/spec/lib/loader_spec.rb +0 -2
data/spec/lib/query/base_spec.rb +8 -17
data/spec/lib/query/full_spec.rb +3 -6
data/spec/lib/query/live_spec.rb +4 -3
data/spec/lib/sources/wrappers/base_spec.rb +35 -0
data/spec/lib/sources/wrappers/location_spec.rb +68 -0
data/spec/lib/tokenizers/index_spec.rb +2 -5
metadata +32 -16
data/lib/picky/configuration/field.rb +0 -73
data/lib/picky/indexes.rb +0 -179
data/lib/picky/initializers/ext.rb +0 -1
data/spec/lib/configuration/field_spec.rb +0 -208
data/spec/lib/configuration/type_spec.rb +0 -49

data/lib/picky/indexers/base.rb CHANGED Viewed

@@ -7,22 +7,22 @@ module Indexers
   #
   class Base
-    def initialize type, field
+    def initialize type, category
       @type       = type
-      @field      = field
+      @category   = category
     end
     # Convenience method for getting the right Tokenizer.
     #
     def tokenizer
-      @field.tokenizer
+      @category.tokenizer
     end
     # Convenience methods for user subclasses.
     #
     # TODO Duplicate code in Index::Files.
     #
     def search_index_file_name
-      @field.search_index_file_name
+      @category.search_index_file_name
     end
     # Executes the specific strategy.
@@ -34,10 +34,10 @@ module Indexers
     # Get the source where the data is taken from.
     #
     def source
-      @field.source || raise_no_source
+      @category.source || raise_no_source
     end
     def raise_no_source
-      raise NoSourceSpecifiedException.new "No source given for index:#{@type.name}, field:#{@field.name}." # TODO field.identifier
+      raise NoSourceSpecifiedException.new "No source given for index:#{@type.name}, category:#{@category.name}." # TODO field.identifier
     end
     # Selects the original id (indexed id) and a column to process. The column data is called "token".
@@ -54,8 +54,9 @@ module Indexers
       #
       File.open(search_index_file_name, 'w:binary') do |file|
         result = []
-        source.harvest(@type, @field) do |indexed_id, text|
+        source.harvest(@type, @category) do |indexed_id, text|
           tokenizer.tokenize(text).each do |token_text|
+            next unless token_text
             result << indexed_id << comma << token_text << newline
           end
           file.write(result.join) && result.clear if result.size > 100_000
@@ -65,7 +66,7 @@ module Indexers
     end
     def indexing_message
-      timed_exclaim "INDEX #{@type.name} #{@field.name}" #:#{@field.indexed_name}." # TODO field.identifier
+      timed_exclaim "INDEX #{@type.name} #{@category.name}" #:#{@category.indexed_as}." # TODO field.identifier
     end
   end

data/lib/picky/indexing/bundle.rb ADDED Viewed

@@ -0,0 +1,152 @@
+# encoding: utf-8
+#
+module Indexing
+  # This is the indexing bundle.
+  # It does all menial tasks that have nothing to do
+  # with the actual index running etc.
+  #
+  # TODO Superclass?
+  #
+  class Bundle < ::Bundle
+    attr_accessor :partial_strategy, :weights_strategy
+    attr_reader   :files
+    # Path is in which directory the cache is located.
+    #
+    def initialize name, category, type, similarity_strategy, partial_strategy, weights_strategy
+      super name, category, type, similarity_strategy
+      @partial_strategy    = partial_strategy
+      @weights_strategy    = weights_strategy
+    end
+    # Generation
+    #
+    # This method
+    # * loads the base index from the db
+    # * generates derived indexes
+    # * dumps all the indexes into files
+    #
+    def generate_caches_from_source
+      load_from_index_file
+      generate_caches_from_memory
+    end
+    # Generates derived indexes from the index and dumps.
+    #
+    # Note: assumes that there is something in the index
+    #
+    def generate_caches_from_memory
+      cache_from_memory_generation_message
+      generate_derived
+    end
+    def cache_from_memory_generation_message
+      timed_exclaim "CACHE FROM MEMORY #{identifier}."
+    end
+    # Generates the weights and similarity from the main index.
+    #
+    def generate_derived
+      generate_weights
+      generate_similarity
+    end
+    # Load the data from the db.
+    #
+    def load_from_index_file
+      load_from_index_generation_message
+      clear
+      retrieve
+    end
+    def load_from_index_generation_message
+      timed_exclaim "LOAD INDEX #{identifier}."
+    end
+    # Retrieves the data into the index.
+    #
+    def retrieve
+      files.retrieve do |id, token|
+        initialize_index_for token
+        index[token] << id
+      end
+    end
+    def initialize_index_for token
+      index[token] ||= []
+    end
+    # Generators.
+    #
+    # TODO Move somewhere more fitting.
+    #
+    # Generates a new index (writes its index) using the
+    # given partial caching strategy.
+    #
+    def generate_partial
+      generator = Cacher::PartialGenerator.new self.index
+      self.index = generator.generate self.partial_strategy
+    end
+    def generate_partial_from exact_index
+      timed_exclaim "PARTIAL GENERATE #{identifier}."
+      self.index = exact_index
+      self.generate_partial
+      self
+    end
+    # Generates a new similarity index (writes its index) using the
+    # given similarity caching strategy.
+    #
+    def generate_similarity
+      generator = Cacher::SimilarityGenerator.new self.index
+      self.similarity = generator.generate self.similarity_strategy
+    end
+    # Generates a new weights index (writes its index) using the
+    # given weight caching strategy.
+    #
+    def generate_weights
+      generator = Cacher::WeightsGenerator.new self.index
+      self.weights = generator.generate self.weights_strategy
+    end
+    # Saves the index in a dump file.
+    #
+    def dump
+      dump_index
+      dump_similarity
+      dump_weights
+    end
+    def dump_index
+      timed_exclaim "DUMP INDEX #{identifier}."
+      files.dump_index index
+    end
+    def dump_similarity
+      timed_exclaim "DUMP SIMILARITY #{identifier}."
+      files.dump_similarity similarity
+    end
+    def dump_weights
+      timed_exclaim "DUMP WEIGHTS #{identifier}."
+      files.dump_weights weights
+    end
+    # Alerts the user if an index is missing.
+    #
+    def raise_unless_cache_exists
+      warn_cache_small :index      if files.index_cache_small?
+      warn_cache_small :similarity if files.similarity_cache_small?
+      warn_cache_small :weights    if files.weights_cache_small?
+      raise_cache_missing :index      unless files.index_cache_ok?
+      raise_cache_missing :similarity unless files.similarity_cache_ok?
+      raise_cache_missing :weights    unless files.weights_cache_ok?
+    end
+    def warn_cache_small what
+      puts "#{what} cache for #{identifier} smaller than 16 bytes."
+    end
+    # Raises an appropriate error message.
+    #
+    def raise_cache_missing what
+      raise "#{what} cache for #{identifier} missing."
+    end
+  end
+end

data/lib/picky/indexing/categories.rb ADDED Viewed

@@ -0,0 +1,36 @@
+module Indexing
+  class Categories
+    attr_reader :categories
+    each_delegate :index,
+                  :cache,
+                  :generate_caches,
+                  :backup_caches,
+                  :restore_caches,
+                  :check_caches,
+                  :clear_caches,
+                  :create_directory_structure,
+                  :to => :categories
+    def initialize
+      @categories = []
+    end
+    def << category
+      categories << category
+    end
+    def find category_name
+      category_name = category_name.to_sym
+      categories.each do |category|
+        next unless category.name == category_name
+        return category
+      end
+    end
+  end
+end

data/lib/picky/indexing/category.rb ADDED Viewed

@@ -0,0 +1,145 @@
+module Indexing
+  class Category
+    attr_reader :name, :type, :indexed_as, :virtual, :tokenizer, :source, :exact, :partial
+    # TODO Dup the options?
+    #
+    def initialize name, type, options = {}
+      @name = name
+      @type = type
+      @source        = options[:source]
+      @tokenizer     = options[:tokenizer] || Tokenizers::Index.default
+      @indexer_class = options[:indexer]   || Indexers::Default
+      @indexed_as    = options[:as]        || name
+      @virtual       = options[:virtual]   || false # TODO What is this again?
+      # TODO Push into Bundle.
+      #
+      partial    = options[:partial]    || Cacher::Partial::Default
+      weights    = options[:weights]    || Cacher::Weights::Default
+      similarity = options[:similarity] || Cacher::Similarity::Default
+      @exact   = options[:exact_indexing_bundle]   || Bundle.new(:exact,   self, type, similarity, Cacher::Partial::None.new, weights)
+      @partial = options[:partial_indexing_bundle] || Bundle.new(:partial, self, type, Cacher::Similarity::None.new, partial, weights)
+      # @remove          = options[:remove]        || false
+      # @filter          = options[:filter]        || true
+      @options = options # TODO Remove?
+    end
+    # TODO Move to initializer?
+    #
+    def identifier
+      @identifier ||= "#{type.name} #{name}"
+    end
+    # Note: Most of the time the source of the type is used.
+    #
+    def source
+      @source || type.source
+    end
+    # TODO Spec.
+    #
+    def backup_caches
+      timed_exclaim "Backing up #{identifier}."
+      exact.backup
+      partial.backup
+    end
+    def restore_caches
+      timed_exclaim "Restoring #{identifier}."
+      exact.restore
+      partial.restore
+    end
+    def check_caches
+      timed_exclaim "Checking #{identifier}."
+      exact.raise_unless_cache_exists
+      partial.raise_unless_cache_exists
+    end
+    def clear_caches
+      timed_exclaim "Deleting #{identifier}."
+      exact.delete
+      partial.delete
+    end
+    def create_directory_structure
+      timed_exclaim "Creating directory structure for #{identifier}."
+      exact.create_directory
+      partial.create_directory
+    end
+    # Used for testing.
+    #
+    # TODO Remove?
+    #
+    def generate_indexes_from_exact_index
+      generate_derived_exact
+      generate_partial
+      generate_derived_partial
+    end
+    def generate_derived_exact
+      exact.generate_derived
+    end
+    def generate_derived_partial
+      partial.generate_derived
+    end
+    # Generates all caches for this category.
+    #
+    def cache
+      prepare_cache_directory
+      generate_caches
+    end
+    def generate_caches
+      generate_caches_from_source
+      generate_partial
+      generate_caches_from_memory
+      dump_caches
+      timed_exclaim "CACHE FINISHED #{identifier}."
+    end
+    def generate_caches_from_source
+      exact.generate_caches_from_source
+    end
+    def generate_partial
+      partial.generate_partial_from exact.index
+    end
+    def generate_caches_from_memory
+      partial.generate_caches_from_memory
+    end
+    def dump_caches
+      exact.dump
+      partial.dump
+    end
+    # TODO Partially move to type. Duplicate Code in indexers/field.rb.
+    #
+    def search_index_root
+      File.join PICKY_ROOT, 'index'
+    end
+    def cache_directory
+      File.join search_index_root, PICKY_ENVIRONMENT, type.name.to_s
+    end
+    def search_index_file_name
+      File.join cache_directory, "prepared_#{name}_index.txt"
+    end
+    def index
+      prepare_cache_directory
+      indexer.index
+    end
+    def prepare_cache_directory
+      FileUtils.mkdir_p cache_directory
+    end
+    def indexer
+      @indexer || @indexer = @indexer_class.new(type, self)
+    end
+    def virtual?
+      !!virtual
+    end
+  end
+end

data/lib/picky/indexing/type.rb ADDED Viewed

@@ -0,0 +1,45 @@
+module Indexing
+  class Type
+    attr_reader :name, :source, :categories, :after_indexing
+    # Delegators for indexing.
+    #
+    delegate :connect_backend,
+             :to => :source
+    delegate :index,
+             :cache,
+             :generate_caches,
+             :backup_caches,
+             :restore_caches,
+             :check_caches,
+             :clear_caches,
+             :create_directory_structure,
+             :to => :categories
+    def initialize name, source, options = {}
+      @name   = name
+      @source = source
+      @after_indexing = options[:after_indexing]
+      @categories = Categories.new
+    end
+    # TODO Spec. Doc.
+    #
+    def add_category name, options = {}
+      categories << Category.new(name, self, options)
+    end
+    # Indexing.
+    #
+    def take_snapshot
+      source.take_snapshot self
+    end
+  end
+end

data/lib/picky/indexing/types.rb ADDED Viewed

@@ -0,0 +1,74 @@
+module Indexing
+  class Types
+    attr_reader :types
+    each_delegate :take_snapshot,
+                  :generate_caches,
+                  :backup_caches,
+                  :restore_caches,
+                  :check_caches,
+                  :clear_caches,
+                  :create_directory_structure,
+                  :to => :types
+    def initialize
+      clear
+    end
+    # TODO Spec.
+    #
+    def clear
+      @types = []
+    end
+    # TODO Spec. Superclass?
+    #
+    def register type
+      self.types << type
+    end
+    # Runs the indexers in parallel (index + cache).
+    #
+    # TODO Spec.
+    #
+    def index randomly = true
+      take_snapshot
+      # Run in parallel.
+      #
+      timed_exclaim "INDEXING USING #{Cores.max_processors} PROCESSORS, IN #{randomly ? 'RANDOM' : 'GIVEN'} ORDER."
+      Cores.forked self.types, { randomly: randomly } do |type|
+        type.index
+        type.cache
+      end
+      timed_exclaim "INDEXING FINISHED."
+    end
+    # TODO Spec
+    #
+    def generate_index_only type_name, field_name
+      found = find type_name, field_name
+      found.index if found
+    end
+    def generate_cache_only type_name, category_name
+      found = find type_name, field_name
+      found.generate_caches if found
+    end
+    # TODO Spec
+    #
+    def find type_name, category_name
+      type_name     = type_name.to_sym
+      types.each do |type|
+        next unless type.name == type_name
+        found = type.categories.find category_name
+        return found if found
+      end
+    end
+  end
+end

data/lib/picky/loader.rb CHANGED Viewed

@@ -84,7 +84,7 @@ module Loader
   def self.load_framework
     # Load compiled C code.
     #
-    require_relative 'initializers/ext'
+    require_relative 'ext/maybe_compile'
     # Load extensions.
     #
@@ -166,9 +166,23 @@ module Loader
     # Index types.
     #
+    load_relative 'bundle'
+    load_relative 'indexing/bundle'
+    load_relative 'indexing/category'
+    load_relative 'indexing/categories'
+    load_relative 'indexing/type'
+    load_relative 'indexing/types'
     load_relative 'index/bundle'
     load_relative 'index/category'
+    load_relative 'index/categories'
     load_relative 'index/type'
+    load_relative 'index/types'
+    load_relative 'types'
+    load_relative 'alias_instances'
+    load_relative 'type'
     load_relative 'index/wrappers/exact_first'
@@ -193,7 +207,6 @@ module Loader
     load_relative 'query/qualifiers'
     load_relative 'query/weigher'
-    load_relative 'query/combinator'
     load_relative 'query/weights'
@@ -219,14 +232,11 @@ module Loader
     load_relative 'sources/delicious'
     load_relative 'sources/couch'
-    # Indexes.
-    #
-    load_relative 'indexes'
+    load_relative 'sources/wrappers/base'
+    load_relative 'sources/wrappers/location'
     # Configuration.
     #
-    load_relative 'configuration/field'
-    load_relative 'configuration/type'
     load_relative 'configuration/indexes'
     # ... in Application.

data/lib/picky/query/base.rb CHANGED Viewed

@@ -17,10 +17,11 @@ module Query
     #    * tokenizer: Tokenizers::Query.default by default.
     #    * weights:   A hash of weights, or a Query::Weights object.
     #
-    def initialize *index_types
-      options      = Hash === index_types.last ? index_types.pop : {}
-      @index_types = index_types
-      @weigher     = options[:weigher]   || Weigher.new(index_types)
+    def initialize *index_type_definitions
+      options      = Hash === index_type_definitions.last ? index_type_definitions.pop : {}
+      indexes      = index_type_definitions.map &:index
+      @weigher     = options[:weigher]   || Weigher.new(indexes)
       @tokenizer   = options[:tokenizer] || Tokenizers::Query.default
       weights      = options[:weights] || Weights.new
       @weights     = Hash === weights ? Weights.new(weights) : weights

data/lib/picky/sources/wrappers/base.rb ADDED Viewed

@@ -0,0 +1,23 @@
+module Sources
+  module Wrappers
+    class Base
+      attr_reader :backend
+      # Wraps a backend
+      #
+      def initialize backend
+        @backend = backend
+      end
+      # Default is delegation for all methods
+      #
+      delegate :harvest, :connect_backend, :take_snapshot, :to => :backend
+    end
+  end
+end

data/lib/picky/sources/wrappers/location.rb ADDED Viewed

@@ -0,0 +1,92 @@
+module Sources
+  module Wrappers
+    class Location < Base
+      attr_reader :precision, :grid
+      # TODO Save min and grid!
+      #
+      def initialize backend, options = {}
+        super backend
+        @user_grid = extract_user_grid options
+        @precision = extract_precision options
+        @grid      = @user_grid / (@precision + 0.5)
+      end
+      #
+      #
+      def extract_user_grid options
+        options[:grid] || raise # TODO
+      end
+      # Extracts an amount of grids that this
+      # Precision is given in a value.
+      # 1 is low (up to 16.6% error), 5 is very high (up to 5% error).
+      #
+      # We don't recommend using values higher than 5.
+      #
+      # Default is 1.
+      #
+      def extract_precision options
+        options[:precision] || 1
+      end
+      def reset
+        @min = 1.0/0
+      end
+      # Yield the data (id, text for id) for the given type and field.
+      #
+      def harvest type, field
+        reset
+        # Cache. TODO Make option?
+        #
+        locations = []
+        # Gather min/max.
+        #
+        backend.harvest type, field do |indexed_id, location|
+          location = location.to_f
+          @min = location if location < @min
+          locations << [indexed_id, location]
+        end
+        # Add a margin.
+        #
+        marginize
+        # Recalculate locations.
+        #
+        locations.each do |indexed_id, location|
+          locations_for(location).each do |new_location|
+            yield indexed_id, new_location.to_s
+          end
+        end
+      end
+      def marginize
+        @min -= @user_grid
+      end
+      # Put location onto multiple places on a grid.
+      #
+      # Note: Always returns an integer.
+      #
+      def locations_for location
+        new_location = ((location - @min) / grid).floor
+        min_location = new_location - precision
+        max_location = new_location + precision
+        (min_location..max_location).to_a
+      end
+    end
+  end
+end