RubyGems - picky - Versions diffs - 0.9.4 → 0.10.0 - Mend

picky 0.9.4 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37)

data/lib/picky/application.rb +42 -29
data/lib/picky/character_substitution/european.rb +33 -0
data/lib/picky/configuration/field.rb +3 -3
data/lib/picky/configuration/indexes.rb +4 -10
data/lib/picky/configuration/queries.rb +2 -10
data/lib/picky/index/bundle.rb +2 -7
data/lib/picky/index/file/text.rb +6 -1
data/lib/picky/loader.rb +4 -4
data/lib/picky/query/base.rb +1 -1
data/lib/picky/query/tokens.rb +4 -11
data/lib/picky/tokenizers/base.rb +23 -5
data/lib/picky/tokenizers/index.rb +8 -1
data/lib/picky/tokenizers/query.rb +21 -7
data/lib/tasks/server.rake +3 -3
data/lib/tasks/shortcuts.rake +4 -4
data/lib/tasks/spec.rake +1 -1
data/lib/tasks/try.rake +6 -8
data/project_prototype/Gemfile +3 -2
data/project_prototype/app/application.rb +35 -47
data/spec/lib/application_spec.rb +36 -19
data/spec/lib/{umlaut_substituter_spec.rb → character_substitution/european_spec.rb} +14 -2
data/spec/lib/configuration/field_spec.rb +12 -12
data/spec/lib/configuration/indexes_spec.rb +3 -40
data/spec/lib/extensions/array_spec.rb +1 -1
data/spec/lib/extensions/hash_spec.rb +1 -1
data/spec/lib/index/file/text_spec.rb +14 -1
data/spec/lib/query/combination_spec.rb +1 -1
data/spec/lib/query/tokens_spec.rb +11 -19
data/spec/lib/tokenizers/base_spec.rb +21 -1
data/spec/lib/tokenizers/index_spec.rb +23 -0
data/spec/lib/tokenizers/query_spec.rb +32 -0
metadata +7 -13
data/lib/picky/tokenizers/default/index.rb +0 -7
data/lib/picky/tokenizers/default/query.rb +0 -7
data/lib/picky/umlaut_substituter.rb +0 -34
data/spec/lib/tokenizers/default/index_spec.rb +0 -11
data/spec/lib/tokenizers/default/query_spec.rb +0 -11

data/lib/picky/application.rb CHANGED Viewed

@@ -1,55 +1,68 @@
 # The Picky application wherein the indexing and querying is defined.
 #
 class Application
   class << self
-    attr_reader :apps
+    # Returns a configured tokenizer that
+    # is used for indexing by default.
+    #
+    def default_indexing options = {}
+      indexing.default_tokenizer options
+    end
-    # Finalize the subclass as soon as it
-    # has finished loading.
+    # Returns a configured tokenizer that
+    # is used for querying by default.
+    #
+    def default_querying options = {}
+      querying.default_tokenizer options
+    end
+    # Routes.
     #
-    # Note: finalize finalizes the routes.
+    delegate :route, :root, :to => :routing
+    # Index, Field.
     #
-    def inherited app
-      @apps ||= []
-      @apps << app
-    end
-    def finalize_apps
-      @apps.each &:finalize
-    end
+    # TODO Rename category.
+    #
+    delegate :field, :to => :indexing
+    def category *args; indexing.field *args;        end
+    def index *args;    indexing.define_index *args; end
     # An application simply delegates to the routing to handle a request.
     #
     def call env
       routing.call env
     end
-    # Freezes the routes.
-    #
-    def finalize
-      routing.freeze
-    end
     def routing
       @routing ||= Routing.new
     end
-    # Routes.
-    #
-    delegate :route, :root, :to => :routing
-    # TODO Rename to default_indexing?
-    #
     def indexing
       @indexing ||= Configuration::Indexes.new
     end
-    def index *args
-      self.type *args
+    def querying
+      @queries ||= Configuration::Queries.new
     end
-    delegate :type, :field, :to => :indexing
-    # TODO Rename to default_querying?
+    # Finalize the subclass as soon as it
+    # has finished loading.
     #
-    def querying
-      @queries ||= Configuration::Queries.new
+    attr_reader :apps
+    def initialize_apps
+      @apps ||= []
+    end
+    def inherited app
+      initialize_apps
+      apps << app
+    end
+    def finalize_apps
+      initialize_apps
+      apps.each &:finalize
+    end
+    # Finalizes the routes.
+    #
+    def finalize
+      routing.freeze
     end
     # TODO Add more info.

data/lib/picky/character_substitution/european.rb ADDED Viewed

@@ -0,0 +1,33 @@
+# encoding: utf-8
+#
+module CharacterSubstitution
+  # Substitutes Umlauts like
+  # ä, ö, ü => ae, oe, ue.
+  # (and more, see specs)
+  #
+  class European
+    def initialize
+      @chars = ActiveSupport::Multibyte.proxy_class
+    end
+    def substitute text
+      trans = @chars.new(text).normalize(:kd)
+      # substitute special cases
+      #
+      trans.gsub!('ß', 'ss')
+      # substitute umlauts (of A,O,U,a,o,u)
+      #
+      trans.gsub!(/([AOUaou])\314\210/u, '\1e')
+      # get rid of ecutes, graves and …
+      #
+      trans.unpack('U*').select { |cp|
+        cp < 0x0300 || cp > 0x035F
+      }.pack('U*')
+    end
+  end
+end

data/lib/picky/configuration/field.rb CHANGED Viewed

@@ -5,10 +5,10 @@ module Configuration
   #
   class Field
     attr_reader :name, :indexed_name, :virtual, :tokenizer
-    attr_accessor :type # convenience
-    def initialize name, tokenizer, options = {}
+    attr_accessor :type # convenience TODO Still needed?
+    def initialize name, options = {}
       @name            = name.to_sym
-      @tokenizer       = tokenizer
+      @tokenizer       = options[:tokenizer] || Tokenizers::Index.default
       # TODO Dup the options?

data/lib/picky/configuration/indexes.rb CHANGED Viewed

@@ -10,17 +10,13 @@ module Configuration
       @types = []
     end
-    def default_tokenizer
-      @default_tokenizer ||= Tokenizers::Default::Index
+    def default_tokenizer options = {}
+      Tokenizers::Index.default = Tokenizers::Index.new(options)
     end
-    # Delegates
-    #
-    delegate :removes_characters, :contracts_expressions, :stopwords, :splits_text_on, :normalizes_words, :removes_characters_after_splitting, :to => :default_tokenizer
     # TODO Rewrite all this configuration handling.
     #
-    def type name, source, *fields
+    def define_index name, source, *fields
       new_type = Type.new name, source, *fields
       types << new_type
       ::Indexes.configuration ||= self
@@ -30,9 +26,7 @@ module Configuration
       generated
     end
     def field name, options = {}
-      tokenizer = options[:tokenizer] || default_tokenizer
-      Field.new name, tokenizer, options
+      Field.new name, options
     end
     #

data/lib/picky/configuration/queries.rb CHANGED Viewed

@@ -6,16 +6,8 @@ module Configuration
     #
     #
-    def default_tokenizer
-      @default_tokenizer ||= Tokenizers::Default::Query
-    end
-    delegate :removes_characters, :contracts_expressions, :stopwords, :splits_text_on, :normalizes_words, :removes_characters_after_splitting, :to => :default_tokenizer
-    # Delegates.
-    #
-    def maximum_tokens amount
-      Query::Tokens.maximum = amount
+    def default_tokenizer options = {}
+      Tokenizers::Query.default = Tokenizers::Query.new(options)
     end
   end

data/lib/picky/index/bundle.rb CHANGED Viewed

@@ -98,15 +98,10 @@ module Index
     end
     # Retrieves the data into the index.
     #
-    # TODO Beautify.
-    #
     def retrieve
-      files.retrieve do |indexed_id, token|
-        token.chomp!
-        token = token.to_sym
+      files.retrieve do |id, token|
         initialize_index_for token
-        index[token] << indexed_id.to_i
+        index[token] << id
       end
     end
     def initialize_index_for token

data/lib/picky/index/file/text.rb CHANGED Viewed

@@ -13,10 +13,15 @@ module Index
       def dump hash
         raise "Can't dump to text file. Use JSON or Marshal."
       end
+      # Yields an id and a symbol token.
+      #
       def retrieve
+        id, token =
         ::File.open(cache_path, 'r:binary') do |file|
           file.each_line do |line|
-            yield line.split ?,, 2
+            id, token = line.split ?,, 2
+            yield id.to_i, (token.chomp! || token).to_sym
           end
         end
       end

data/lib/picky/loader.rb CHANGED Viewed

@@ -104,6 +104,10 @@ module Loader
     load_relative 'helpers/cache'
     load_relative 'helpers/measuring'
+    # Character Substitution
+    #
+    load_relative 'character_substitution/european'
     # Signal handling
     #
     load_relative 'signals'
@@ -111,7 +115,6 @@ module Loader
     # Various.
     #
     load_relative 'loggers/search'
-    load_relative 'umlaut_substituter'
     # Index generation strategies.
     #
@@ -180,9 +183,6 @@ module Loader
     load_relative 'tokenizers/index'
     load_relative 'tokenizers/query'
-    load_relative 'tokenizers/default/index'
-    load_relative 'tokenizers/default/query'
     # Query combinations, qualifiers, weigher.
     #
     load_relative 'query/combination'

data/lib/picky/query/base.rb CHANGED Viewed

@@ -16,7 +16,7 @@ module Query
       options      = Hash === index_types.last ? index_types.pop : {}
       @index_types = index_types
       @weigher     = options[:weigher]   || Weigher.new(index_types)
-      @tokenizer   = options[:tokenizer] || Tokenizers::Default::Query
+      @tokenizer   = options[:tokenizer] || Tokenizers::Query.default
       @weights     = options[:weights]   || Weights.new
     end

data/lib/picky/query/tokens.rb CHANGED Viewed

@@ -6,11 +6,6 @@ module Query
   #
   class Tokens
-    #
-    #
-    cattr_accessor :maximum
-    self.maximum = 5
     # Basically delegates to its internal tokens array.
     #
     self.delegate *[Enumerable.instance_methods, :slice!, :[], :uniq!, :last, :reject!, :length, :size, :empty?, :each, :exit, { :to => :@tokens }].flatten
@@ -52,13 +47,11 @@ module Query
     # Caps the tokens to the maximum.
     #
-    # Note: We could parametrize this if necessary.
-    #
-    def cap
-      @tokens.slice!(@@maximum..-1) if cap?
+    def cap maximum
+      @tokens.slice!(maximum..-1) if cap?(maximum)
     end
-    def cap?
-      @tokens.size > @@maximum
+    def cap? maximum
+      @tokens.size > maximum
     end
     # Rejects blank tokens.

data/lib/picky/tokenizers/base.rb CHANGED Viewed

@@ -79,6 +79,19 @@ module Tokenizers
       text.gsub! @removes_characters_after_splitting_regexp, '' if @removes_characters_after_splitting_regexp
     end
+    # Substitute Characters with this substituter.
+    #
+    # Default is European Character substitution.
+    #
+    def substitutes_characters_with substituter = CharacterSubstitution::European.new
+      # TODO Raise if it doesn't quack substitute?
+      @substituter = substituter
+    end
+    def substitute_characters text
+      substituter?? substituter.substitute(text) : text
+    end
     # Returns a number of tokens, generated from the given text.
     #
     # Note:
@@ -93,15 +106,20 @@ module Tokenizers
                process tokens   # processing tokens / strings
     end
-    attr_accessor :substituter
+    attr_reader :substituter
     alias substituter? substituter
-    def initialize substituter = UmlautSubstituter.new
-      @substituter = substituter
+    def initialize options = {}
+      removes_characters options[:removes_characters]                                 if options[:removes_characters]
+      contracts_expressions *options[:contracts_expressions]                          if options[:contracts_expressions]
+      stopwords options[:stopwords]                                                   if options[:stopwords]
+      normalizes_words options[:normalizes_words]                                     if options[:normalizes_words]
+      removes_characters_after_splitting options[:removes_characters_after_splitting] if options[:removes_characters_after_splitting]
+      substitutes_characters_with options[:substitutes_characters_with]               if options[:substitutes_characters_with]
-      # TODO Default handling.
+      # Defaults.
       #
-      splits_text_on(/\s/)
+      splits_text_on options[:splits_text_on] || /\s/
     end
     # Hooks.

data/lib/picky/tokenizers/index.rb CHANGED Viewed

@@ -5,6 +5,13 @@ module Tokenizers
   #
   class Index < Base
+    def self.default= new_default
+      @default = new_default
+    end
+    def self.default
+      @default ||= new
+    end
     # Default indexing preprocessing hook.
     #
     # Does:
@@ -15,7 +22,7 @@ module Tokenizers
     #   5. Remove non-single stopwords. (Stopwords that occur with other words)
     #
     def preprocess text
-      text = substituter.substitute text if substituter?
+      text = substitute_characters text
       text.downcase!
       remove_illegals text
       contract text

data/lib/picky/tokenizers/query.rb CHANGED Viewed

@@ -13,6 +13,20 @@ module Tokenizers
   #
   class Query < Base
+    def self.default= new_default
+      @default = new_default
+    end
+    def self.default
+      @default ||= new
+    end
+    attr_reader :maximum_tokens
+    def initialize options = {}
+      super options
+      @maximum_tokens = options[:maximum_tokens] || 5
+    end
     def preprocess text
       remove_illegals text             # Remove illegal characters
       remove_non_single_stopwords text # remove stop words
@@ -33,9 +47,9 @@ module Tokenizers
     #
     def process tokens
       tokens.tokenize_with self
-      tokens.reject          # Reject any tokens that don't meet criteria
-      tokens.cap             # Cut off superfluous tokens
-      tokens.partialize_last # Set certain tokens as partial
+      tokens.reject              # Reject any tokens that don't meet criteria
+      tokens.cap maximum_tokens  # Cut off superfluous tokens
+      tokens.partialize_last     # Set certain tokens as partial
       tokens
     end
@@ -44,10 +58,10 @@ module Tokenizers
     # TODO Perhaps move to Normalizer?
     #
     def normalize text
-      text = substituter.substitute text if substituter? # Substitute special characters TODO Move to subclass
-      text.downcase!                                     # Downcase all text
-      normalize_with_patterns text                       # normalize
-      text.to_sym                                        # symbolize
+      text = substitute_characters text # Substitute special characters TODO Move to subclass
+      text.downcase!                    # Downcase all text
+      normalize_with_patterns text      # normalize
+      text.to_sym                       # symbolize
     end
     # Returns a token for a word.

data/lib/tasks/server.rake CHANGED Viewed

@@ -11,7 +11,7 @@ namespace :server do
     pid.blank? ? nil : pid.chomp
   end
-  desc "Start the unicorns. (Wehee!)"
+  # desc "Start the unicorns. (Wehee!)"
   task :start => :framework do
     chdir_to_root
     # Rake::Task[:"solr:start"].invoke # TODO Move to better place.
@@ -21,13 +21,13 @@ namespace :server do
     exec command
   end
-  desc "Stop the unicorns. (Blam!)"
+  # desc "Stop the unicorns. (Blam!)"
   task :stop => :framework do
     `kill -QUIT #{current_pid}` if current_pid
     # Rake::Task[:"solr:stop"].invoke # TODO Move to better place.
   end
-  desc "Restart the unicorns."
+  # desc "Restart the unicorns."
   task :restart do
     Rake::Task[:"server:stop"].invoke
     sleep 5

data/lib/tasks/shortcuts.rake CHANGED Viewed

@@ -1,20 +1,20 @@
-desc "Shortcut for index:generate."
+desc "Generate the index."
 task :index => :application do
   Rake::Task[:'index:generate'].invoke
 end
-desc "Shortcut for try:both"
+desc "Try the given text in the indexer/query (type:field optional)."
 task :try, [:text, :type_and_field] => :application do |_, options|
   text, type_and_field = options.text, options.type_and_field
   Rake::Task[:'try:both'].invoke text, type_and_field
 end
-desc "shortcut for server:start"
+desc "Start the server."
 task :start do
   Rake::Task[:'server:start'].invoke
 end
-desc "shortcut for server:stop"
+desc "Stop the server."
 task :stop do
   Rake::Task[:'server:stop'].invoke
 end

data/lib/tasks/spec.rake CHANGED Viewed

@@ -3,7 +3,7 @@ require 'spec/rake/spectask'
 task :default => :spec
-desc "Run all specs in spec directory (excluding plugin specs)"
+desc "Run all specs"
 Spec::Rake::SpecTask.new(:spec) do |t|
   spec_root = File.join(File.dirname(__FILE__), '..', '..', 'spec')
   t.spec_opts = ['--options', "\"#{File.join(spec_root, 'spec.opts')}\""]

data/lib/tasks/try.rake CHANGED Viewed

@@ -2,25 +2,23 @@
 #
 namespace :try do
-  desc "Try how a given word would be tokenized when indexing (type:field optional)."
+  # desc "Try how a given word would be tokenized when indexing (type:field optional)."
   task :index, [:text, :type_and_field] => :application do |_, options|
     text, type_and_field = options.text, options.type_and_field
-    tokenizer = type_and_field ? Indexes.find(*type_and_field.split(':')).tokenizer : Tokenizers::Default::Index
+    tokenizer = type_and_field ? Indexes.find(*type_and_field.split(':')).tokenizer : Tokenizers::Index.default
-    puts "\"#{text}\" is index tokenized as #{tokenizer.tokenize(text).to_a}"
+    puts "\"#{text}\" is index tokenized as #{tokenizer.tokenize(text.dup).to_a}"
   end
-  desc "Try how a given word would be tokenized when querying."
+  # desc "Try how a given word would be tokenized when querying."
   task :query, [:text] => :application do |_, options|
     text = options.text
-    # TODO Text is destroyed.
-    #
-    puts "\"#{text}\" is query tokenized as #{Tokenizers::Default::Query.tokenize(text.dup).to_a.map(&:to_s).map(&:to_sym)}"
+    puts "\"#{text}\" is query tokenized as #{Tokenizers::Query.default.tokenize(text.dup).to_a.map(&:to_s).map(&:to_sym)}"
   end
-  desc "Try the given text with both the index and the query (type:field optional)."
+  # desc "Try the given text with both the index and the query (type:field optional)."
   task :both, [:text, :type_and_field] => :application do |_, options|
     text, type_and_field = options.text, options.type_and_field

data/project_prototype/Gemfile CHANGED Viewed

@@ -2,8 +2,9 @@ source :gemcutter
 # Gems required by Picky.
 #
-gem 'picky',            '~> 0.9.0'
-gem 'bundler',          '>= 0.9.26'
+gem 'picky',            '~> 0.10.0'
+gem 'rake'
+gem 'bundler'
 gem 'rack',             '~> 1.2.1'
 gem 'rack-mount',       '~> 0.6.9'
 gem 'text',             '~> 0.2.0'

data/project_prototype/app/application.rb CHANGED Viewed

@@ -1,58 +1,46 @@
 # encoding: utf-8
 #
+# TODO Adapt the generated example
+#      (a library books finder) to what you need.
+#
+# Check the Wiki http://github.com/floere/picky/wiki for more options.
+# Ask me or the google group if you have questions or specific requests.
+#
 class PickySearch < Application
-  # TODO Adapt the generated example
-  #      (a library books finder) to what you need.
-  #
-  # Check the Wiki http://github.com/floere/picky/wiki for more options.
+  # Indexing: How text is indexed.
+  # Querying: How query text is handled.
+  #
+  default_indexing removes_characters: /[^a-zA-Z0-9\s\/\-\"\&\.]/,
+                   stopwords:          /\b(and|the|of|it|in|for)\b/,
+                   splits_text_on:     /[\s\/\-\"\&\.]/
+  default_querying removes_characters: /[^a-zA-Z0-9\s\/\-\,\&\"\~\*\:]/, # Picky needs control chars *"~: to pass through.
+                   stopwords:          /\b(and|the|of|it|in|for)\b/,
+                   splits_text_on:     /[\s\/\-\,\&]+/,
+                   maximum_tokens: 5, # Max amount of tokens passing into a query. 5 is the default.
+                   substitutes_characters_with: CharacterSubstitution::European.new # Normalizes special user input, Ä -> Ae, ñ -> n etc.
+  # Define an index. Use a database etc. source? http://github.com/floere/picky/wiki/Sources-Configuration#sources
   #
-  # Ask me or the google group if you have questions or specific requests.
-  #
-  indexing.removes_characters(/[^a-zA-Z0-9\s\/\-\"\&\.]/)
-  indexing.stopwords(/\b(and|the|of|it|in|for)\b/)
-  indexing.splits_text_on(/[\s\/\-\"\&\.]/)
   books_index = index :books,
-                      Sources::CSV.new(:title, :author, :isbn, :year, :publisher, :subjects, :file => 'app/library.csv'),
-                      # Use a database as source:
-                      # Sources::DB.new('SELECT id, title, author, isbn13 as isbn FROM books', :file => 'app/db.yml'),
-                      # Or delicious:
-                      # Sources::Delicious.new('username', 'password'), # offers title, tags, url fields.
-                      field(:title,
-                            :partial => Partial::Substring.new(:from => 1), # Index substrings upwards from character 1 (default: -3),
-                                                                              # e.g. picky -> p, pi, pic, pick, picky
-                                                                              # Like this, you'll find picky even when entering just a "p".
-                            :similarity => Similarity::DoubleLevenshtone.new(3)), # Up to three similar title word indexed (default: no similarity).
-                      field(:author, :partial => Partial::Substring.new(:from => 1)),
-                      field(:isbn,   :partial => Partial::None.new) # Partial substring searching on an ISBN makes not much sense, neither does similarity.
-  # Defines the maximum tokens (words) that pass through to the engine.
-  #
-  querying.maximum_tokens 5
+                      Sources::CSV.new(:title, :author, :isbn, :year, :publisher, :subjects, file: 'app/library.csv'),
+                      category(:title,
+                               partial: Partial::Substring.new(from: 1), # Indexes substrings upwards from character 1 (default: -3),
+                                                                         # You'll find "picky" even when entering just a "p".
+                               similarity: Similarity::DoubleLevenshtone.new(3)), # Up to three similar title word indexed (default: No similarity).
+                      category(:author,
+                               partial: Partial::Substring.new(from: 1)),
+                      category(:isbn,
+                               partial: Partial::None.new) # Partial substring searching on an ISBN makes not much sense, neither does similarity.
-  # Note that Picky needs the following characters to
-  # pass through, as they are control characters: *"~:
-  #
-  querying.removes_characters(/[^a-zA-Z0-9\s\/\-\,\&\"\~\*\:]/)
-  querying.stopwords(/\b(and|the|of|it|in|for)\b/)
-  querying.splits_text_on(/[\s\/\-\,\&]+/)
+  full_books = Query::Full.new books_index    # A Full query returns ids, combinations, and counts.
+  live_books = Query::Live.new books_index    # A Live query does return all that Full returns, except ids.
-  # The example defines two queries that use the same index(es).
-  #
-  # A Full query returns ids, combinations, and counts.
-  # A Live query does return all that Full returns, without ids.
-  #
-  # Note: You can pass a query multiple indexes and it will combine them.
-  #
-  full_books = Query::Full.new books_index
-  live_books = Query::Live.new books_index
+  route %r{\A/books/full\Z} => full_books        # Routing is simple: url_path_regexp => query
+  route %r{\A/books/live\Z} => live_books        #
-  # Routing is simple.
-  # A path regexp pointing to a query that will be run.
-  #
-  route %r{^/books/full} => full_books
-  route %r{^/books/live} => live_books
+  # Note: You can pass a query multiple indexes and it will query in all of them.
 end

data/spec/lib/application_spec.rb CHANGED Viewed

@@ -5,32 +5,49 @@ require 'spec_helper'
 describe Application do
   describe "integration" do
+    it "should run ok" do
+      lambda {
+        class MinimalTestApplication < Application
+          books = index :books,
+                        Sources::DB.new('SELECT id, title FROM books', :file => 'app/db.yml'),
+                        category(:title)
+          full = Query::Full.new books
+          live = Query::Live.new books
+          route %r{^/books/full} => full
+          route %r{^/books/live} => live
+        end
+        Tokenizers::Index.default.tokenize 'some text'
+        Tokenizers::Query.default.tokenize 'some text'
+      }.should_not raise_error
+    end
     it "should run ok" do
       lambda {
         # TODO Add all possible cases.
         #
         class TestApplication < Application
-          indexing.removes_characters(/[^a-zA-Z0-9\s\/\-\"\&\.]/)
-          indexing.contracts_expressions(/mr\.\s*|mister\s*/i, 'mr ')
-          indexing.stopwords(/\b(and|the|of|it|in|for)\b/)
-          indexing.splits_text_on(/[\s\/\-\"\&\.]/)
-          indexing.removes_characters_after_splitting(/[\.]/)
+          default_indexing removes_characters:                 /[^a-zA-Z0-9\s\/\-\"\&\.]/,
+                           contracts_expressions:              [/mr\.\s*|mister\s*/i, 'mr '],
+                           stopwords:                          /\b(and|the|of|it|in|for)\b/,
+                           splits_text_on:                     /[\s\/\-\"\&\.]/,
+                           removes_characters_after_splitting: /[\.]/
+          default_querying removes_characters: /[^a-zA-Z0-9äöü\s\/\-\,\&\"\~\*\:]/,
+                           stopwords:          /\b(and|the|of|it|in|for)\b/,
+                           splits_text_on:     /[\s\/\-\,\&]+/,
+                           normalizes_words:   [[/Deoxyribonucleic Acid/i, 'DNA']],
+                           substitutes_characters_with: CharacterSubstitution::European.new,
+                           maximum_tokens:     5
-          books_index = index Sources::DB.new('SELECT id, title, author, isbn13 as isbn FROM books', :file => 'app/db.yml'),
-                              field(:title, :similarity => Similarity::DoubleLevenshtone.new(3)), # Up to three similar title word indexed.
-                              field(:author),
-                              field(:isbn,  :partial => Partial::None.new) # Partially searching on an ISBN makes not much sense.
+          books_index = index :books,
+                              Sources::DB.new('SELECT id, title, author, isbn13 as isbn FROM books', :file => 'app/db.yml'),
+                              category(:title, :similarity => Similarity::DoubleLevenshtone.new(3)), # Up to three similar title word indexed.
+                              category(:author),
+                              category(:isbn,  :partial => Partial::None.new) # Partially searching on an ISBN makes not much sense.
-          # Note that Picky needs the following characters to
-          # pass through, as they are control characters: *"~:
-          #
-          querying.removes_characters(/[^a-zA-Z0-9äöü\s\/\-\,\&\"\~\*\:]/)
-          querying.stopwords(/\b(and|the|of|it|in|for)\b/)
-          querying.splits_text_on(/[\s\/\-\,\&]+/)
-          querying.normalizes_words([
-            [/Deoxyribonucleic Acid/i, 'DNA']
-          ])
-          querying.maximum_tokens 5
           full = Query::Full.new books_index
           live = Query::Live.new books_index

data/spec/lib/{umlaut_substituter_spec.rb → character_substitution/european_spec.rb} RENAMED Viewed

@@ -1,9 +1,10 @@
 # encoding: utf-8
+#
 require 'spec_helper'
-describe UmlautSubstituter do
+describe CharacterSubstitution do
   before(:each) do
-    @substituter = UmlautSubstituter.new
+    @substituter = CharacterSubstitution::European.new
   end
   # A bit of metaprogramming to help with the myriads of its.
@@ -82,5 +83,16 @@ describe UmlautSubstituter do
     it_should_substitute 'å', 'a'
     it_should_substitute 'Å', 'A'
   end
+  describe "diacritic" do
+    it_should_substitute 'ñ', 'n'
+  end
+  describe "speed" do
+    it "is fast" do
+      result = performance_of { @substituter.substitute('ä') }
+      result.should < 0.00009
+    end
+  end
 end

data/spec/lib/configuration/field_spec.rb CHANGED Viewed

@@ -5,7 +5,7 @@ describe Configuration::Field do
     describe "virtual?" do
       context "with virtual true" do
         before(:each) do
-          @field = Configuration::Field.new :some_name, :some_tokenizer, :virtual => true
+          @field = Configuration::Field.new :some_name, :virtual => true
         end
         it "returns the right value" do
           @field.virtual?.should == true
@@ -13,7 +13,7 @@ describe Configuration::Field do
       end
       context "with virtual object" do
         before(:each) do
-          @field = Configuration::Field.new :some_name, :some_tokenizer, :virtual => 123.6
+          @field = Configuration::Field.new :some_name, :virtual => 123.6
         end
         it "returns the right value" do
           @field.virtual?.should == true
@@ -39,7 +39,7 @@ describe Configuration::Field do
     describe "tokenizer" do
       context "with specific tokenizer" do
         before(:each) do
-          @field = Configuration::Field.new :some_name, Tokenizers::Index.new
+          @field = Configuration::Field.new :some_name, tokenizer: Tokenizers::Index.new
           @field.type = :some_type
         end
@@ -54,7 +54,7 @@ describe Configuration::Field do
     describe "indexer" do
       context "with default indexer" do
         before(:each) do
-          @field = Configuration::Field.new :some_name, :some_tokenizer
+          @field = Configuration::Field.new :some_name
         end
         it "caches" do
           @field.indexer.should == @field.indexer
@@ -62,7 +62,7 @@ describe Configuration::Field do
       end
       context "with specific indexer" do
         before(:each) do
-          @field = Configuration::Field.new :some_name, :indexer => Indexers::Default
+          @field = Configuration::Field.new :some_name, tokenizer: Indexers::Default
           @field.type = :some_type
         end
@@ -81,7 +81,7 @@ describe Configuration::Field do
     end
     describe "cache" do
       before(:each) do
-        @field = Configuration::Field.new :some_name, :some_tokenizer
+        @field = Configuration::Field.new :some_name
         @field.stub! :prepare_cache_directory
         @generated = stub :generated, :generate_caches => nil
@@ -100,7 +100,7 @@ describe Configuration::Field do
     end
     describe "prepare_cache_directory" do
       before(:each) do
-        @field = Configuration::Field.new :some_name, :some_tokenizer
+        @field = Configuration::Field.new :some_name
         @field.stub! :cache_directory => :some_cache_directory
       end
@@ -112,7 +112,7 @@ describe Configuration::Field do
     end
     describe "index" do
       before(:each) do
-        @field = Configuration::Field.new :some_name, :some_tokenizer
+        @field = Configuration::Field.new :some_name
         @field.stub! :prepare_cache_directory
         @indexer = stub :indexer, :index => nil
@@ -132,7 +132,7 @@ describe Configuration::Field do
     describe "source" do
       context "with source" do
         before(:each) do
-          @field = Configuration::Field.new :some_name, :some_tokenizer, :source => :some_given_source
+          @field = Configuration::Field.new :some_name, :source => :some_given_source
           @type = stub :type, :name => :some_type
           @field.type = @type
@@ -143,7 +143,7 @@ describe Configuration::Field do
       end
       context "without source" do
         before(:each) do
-          @field = Configuration::Field.new :some_name, :some_tokenizer
+          @field = Configuration::Field.new :some_name
           @type = stub :type, :name => :some_type, :source => :some_type_source
           @field.type = @type
@@ -155,7 +155,7 @@ describe Configuration::Field do
     end
     context "name symbol" do
       before(:each) do
-        @field = Configuration::Field.new :some_name, :some_tokenizer
+        @field = Configuration::Field.new :some_name
         @type = stub :type, :name => :some_type
         @field.type = @type
@@ -189,7 +189,7 @@ describe Configuration::Field do
     end
     context "name string" do
       before(:each) do
-        @field = Configuration::Field.new 'some_name', :some_tokenizer
+        @field = Configuration::Field.new 'some_name'
       end
       describe "generate_qualifiers_from" do
         context "without qualifiers" do

data/spec/lib/configuration/indexes_spec.rb CHANGED Viewed

@@ -18,47 +18,10 @@ describe Configuration::Indexes do
   describe "default_tokenizer" do
     it "is a default tokenizer" do
-      @config.default_tokenizer.should == Tokenizers::Default::Index
+      @config.default_tokenizer.should be_kind_of(Tokenizers::Index)
     end
-    it "caches" do
-      @config.default_tokenizer.should == @config.default_tokenizer
-    end
-  end
-  describe "delegates" do
-    before(:each) do
-      @receiver = mock :receiver
-      @config.stub! :default_tokenizer => @receiver
-    end
-    it "delegates" do
-      @receiver.should_receive(:removes_characters).once
-      @config.removes_characters
-    end
-    it "delegates" do
-      @receiver.should_receive(:contracts_expressions).once
-      @config.contracts_expressions
-    end
-    it "delegates" do
-      @receiver.should_receive(:stopwords).once
-      @config.stopwords
-    end
-    it "delegates" do
-      @receiver.should_receive(:splits_text_on).once
-      @config.splits_text_on
-    end
-    it "delegates" do
-      @receiver.should_receive(:normalizes_words).once
-      @config.normalizes_words
-    end
-    it "delegates" do
-      @receiver.should_receive(:removes_characters_after_splitting).once
-      @config.removes_characters_after_splitting
+    it "does not cache" do
+      @config.default_tokenizer.should_not == @config.default_tokenizer
     end
   end

data/spec/lib/extensions/array_spec.rb CHANGED Viewed

@@ -51,7 +51,7 @@ describe Array do
       [:test1, :test1, :test2, :test2, :test3].clustered_uniq.should == [:test1, :test2, :test3]
     end
     it "is fast" do
-      performance_of { [:test1, :test1, :test2, :test2, :test3].clustered_uniq }.should < 0.00001
+      performance_of { [:test1, :test1, :test2, :test2, :test3].clustered_uniq }.should < 0.000012
     end
   end

data/spec/lib/extensions/hash_spec.rb CHANGED Viewed

@@ -65,7 +65,7 @@ describe Hash do
       lambda { @obj.to_json(:some => :option) }.should_not raise_error
     end
     it "should be fast" do
-      performance_of { @obj.to_json }.should < 0.00006
+      performance_of { @obj.to_json }.should < 0.000065
     end
   end

data/spec/lib/index/file/text_spec.rb CHANGED Viewed

@@ -21,7 +21,20 @@ describe Index::File::Text do
     end
   end
   describe "retrieve" do
-    it
+    before(:each) do
+      @io = stub :io
+      @io.should_receive(:each_line).once.with.and_yield '123456,some_nice_token'
+      File.should_receive(:open).any_number_of_times.and_yield @io
+    end
+    it "yields split lines and returns the id and token text" do
+      @file.retrieve do |id, token|
+        id.should    == 123456
+        token.should == :some_nice_token
+      end
+    end
+    it "is fast" do
+      performance_of { @file.retrieve { |id, token| } }.should < 0.00005
+    end
   end
 end

data/spec/lib/query/combination_spec.rb CHANGED Viewed

@@ -46,7 +46,7 @@ describe 'Query::Combination' do
         @combination = Query::Combination.new token, @category
       end
       it 'should return a correct result' do
-        @combination.to_result.should == [:some_category_name, 'Blä~', :blae]
+        @combination.to_result.should == [:some_category_name, 'Blä~', :blä] # Note: Characters not substituted. That's ok.
       end
     end
     it 'should return a correct result' do

data/spec/lib/query/tokens_spec.rb CHANGED Viewed

@@ -36,31 +36,26 @@ describe Query::Tokens do
       @tokens.instance_variable_get(:@tokens).should == [@nonblank, @nonblank]
     end
   end
-  describe "class variables" do
-    describe "maximal query words" do
-      it "should answer" do
-        lambda { Query::Tokens.maximum }.should_not raise_error
-      end
-    end
-  end
   describe 'cap' do
     context 'one token' do
       before(:each) do
         @token = Query::Token.processed 'Token'
         @tokens = Query::Tokens.new [@token]
       end
-      it 'should not cut it down' do
-        @tokens.cap
+      it 'does not cut it down' do
+        @tokens.cap 5
         @tokens.instance_variable_get(:@tokens).should == [@token]
       end
+      it 'cuts it down' do
+        @tokens.cap 0
+        @tokens.instance_variable_get(:@tokens).should == []
+      end
     end
     context 'many tokens' do
       before(:each) do
-        @old_maximum = Query::Tokens.maximum
-        Query::Tokens.maximum = 3
         @first = Query::Token.processed 'Hello'
         @second = Query::Token.processed 'I'
         @third = Query::Token.processed 'Am'
@@ -72,12 +67,9 @@ describe Query::Tokens do
           Query::Token.processed('Token')
         ]
       end
-      after(:each) do
-        Query::Tokens.maximum = @old_maximum
-      end
       it 'should cap the number of tokens' do
-        @tokens.cap
+        @tokens.cap 3
         @tokens.instance_variable_get(:@tokens).should == [@first, @second, @third]
       end
     end

data/spec/lib/tokenizers/base_spec.rb CHANGED Viewed

@@ -1,4 +1,5 @@
-# coding: utf-8
+# encoding: utf-8
+#
 require 'spec_helper'
 describe Tokenizers::Base do
@@ -7,6 +8,22 @@ describe Tokenizers::Base do
     @tokenizer = Tokenizers::Base.new
   end
+  describe "substitute(s)_characters*" do
+    it "doesn't substitute if there is no substituter" do
+      @tokenizer.substitute_characters('abcdefghijklmnopqrstuvwxyzäöü').should == 'abcdefghijklmnopqrstuvwxyzäöü'
+    end
+    it "uses the substituter to replace characters" do
+      @tokenizer.substitutes_characters_with CharacterSubstitution::European.new
+      @tokenizer.substitute_characters('abcdefghijklmnopqrstuvwxyzäöü').should == 'abcdefghijklmnopqrstuvwxyzaeoeue'
+    end
+    it "uses the european substituter as default" do
+      @tokenizer.substitutes_characters_with
+      @tokenizer.substitute_characters('abcdefghijklmnopqrstuvwxyzäöü').should == 'abcdefghijklmnopqrstuvwxyzaeoeue'
+    end
+  end
   describe "removes_characters_after_splitting" do
     context "without removes_characters_after_splitting called" do
       it "has remove_after_normalizing_illegals" do
@@ -64,6 +81,9 @@ describe Tokenizers::Base do
       it 'should define a method split that splits by default on \s' do
         @tokenizer.split('a b/c.d').should == ['a', 'b/c.d']
       end
+      it 'splits text on /\s/ by default' do
+        @tokenizer.split('this is a test').should == ['this', 'is', 'a', 'test']
+      end
     end
     context "with removes_characters called" do
       before(:each) do

data/spec/lib/tokenizers/index_spec.rb CHANGED Viewed

@@ -8,6 +8,29 @@ describe Tokenizers::Index do
     @tokenizer = Tokenizers::Index.new
   end
+  describe "default*" do
+    before(:all) do
+      @old = Tokenizers::Index.default
+    end
+    after(:all) do
+      Tokenizers::Index.default = @old
+    end
+    it "has a reader" do
+      lambda { Tokenizers::Index.default }.should_not raise_error
+    end
+    it "returns by default a new Index" do
+      Tokenizers::Index.default.should be_kind_of(Tokenizers::Index)
+    end
+    it "has a writer" do
+      lambda { Tokenizers::Index.default = :bla }.should_not raise_error
+    end
+    it "returns what has been written, if something has been written" do
+      Tokenizers::Index.default = :some_default
+      Tokenizers::Index.default.should == :some_default
+    end
+  end
   describe "remove_removes_characters" do
     it "should not remove ' from a query by default" do
       @tokenizer.remove_illegals("Lugi's").should == "Lugi's"

data/spec/lib/tokenizers/query_spec.rb CHANGED Viewed

@@ -7,6 +7,38 @@ describe Tokenizers::Query do
     @tokenizer = Tokenizers::Query.new
   end
+  describe "default*" do
+    before(:all) do
+      @old = Tokenizers::Query.default
+    end
+    after(:all) do
+      Tokenizers::Query.default = @old
+    end
+    it "has a reader" do
+      lambda { Tokenizers::Query.default }.should_not raise_error
+    end
+    it "returns by default a new Index" do
+      Tokenizers::Query.default.should be_kind_of(Tokenizers::Query)
+    end
+    it "has a writer" do
+      lambda { Tokenizers::Query.default = :bla }.should_not raise_error
+    end
+    it "returns what has been written, if something has been written" do
+      Tokenizers::Query.default = :some_default
+      Tokenizers::Query.default.should == :some_default
+    end
+  end
+  describe "maximum_tokens" do
+    it "should be set to 5 by default" do
+      @tokenizer.maximum_tokens.should == 5
+    end
+    it "should be settable" do
+      Tokenizers::Query.new(maximum_tokens: 3).maximum_tokens.should == 3
+    end
+  end
   describe 'preprocess' do
     it 'should call methods in order' do
       text = stub :text

metadata CHANGED Viewed

@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
   prerelease: false
   segments:
   - 0
-  - 9
-  - 4
-  version: 0.9.4
+  - 10
+  - 0
+  version: 0.10.0
 platform: ruby
 authors:
 - Florian Hanke
@@ -14,7 +14,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2010-10-30 00:00:00 +02:00
+date: 2010-10-31 00:00:00 +02:00
 default_executable: picky
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -59,6 +59,7 @@ files:
 - lib/picky/cacher/weights/logarithmic.rb
 - lib/picky/cacher/weights/strategy.rb
 - lib/picky/cacher/weights_generator.rb
+- lib/picky/character_substitution/european.rb
 - lib/picky/configuration/field.rb
 - lib/picky/configuration/indexes.rb
 - lib/picky/configuration/queries.rb
@@ -118,11 +119,8 @@ files:
 - lib/picky/sources/db.rb
 - lib/picky/sources/delicious.rb
 - lib/picky/tokenizers/base.rb
-- lib/picky/tokenizers/default/index.rb
-- lib/picky/tokenizers/default/query.rb
 - lib/picky/tokenizers/index.rb
 - lib/picky/tokenizers/query.rb
-- lib/picky/umlaut_substituter.rb
 - lib/picky-tasks.rb
 - lib/picky.rb
 - lib/tasks/application.rake
@@ -161,6 +159,7 @@ files:
 - spec/lib/cacher/similarity_generator_spec.rb
 - spec/lib/cacher/weights/logarithmic_spec.rb
 - spec/lib/cacher/weights_generator_spec.rb
+- spec/lib/character_substitution/european_spec.rb
 - spec/lib/configuration/field_spec.rb
 - spec/lib/configuration/indexes_spec.rb
 - spec/lib/configuration/type_spec.rb
@@ -210,11 +209,8 @@ files:
 - spec/lib/sources/db_spec.rb
 - spec/lib/sources/delicious_spec.rb
 - spec/lib/tokenizers/base_spec.rb
-- spec/lib/tokenizers/default/index_spec.rb
-- spec/lib/tokenizers/default/query_spec.rb
 - spec/lib/tokenizers/index_spec.rb
 - spec/lib/tokenizers/query_spec.rb
-- spec/lib/umlaut_substituter_spec.rb
 - spec/specific/speed_spec.rb
 - bin/picky
 has_rdoc: true
@@ -261,6 +257,7 @@ test_files:
 - spec/lib/cacher/similarity_generator_spec.rb
 - spec/lib/cacher/weights/logarithmic_spec.rb
 - spec/lib/cacher/weights_generator_spec.rb
+- spec/lib/character_substitution/european_spec.rb
 - spec/lib/configuration/field_spec.rb
 - spec/lib/configuration/indexes_spec.rb
 - spec/lib/configuration/type_spec.rb
@@ -310,9 +307,6 @@ test_files:
 - spec/lib/sources/db_spec.rb
 - spec/lib/sources/delicious_spec.rb
 - spec/lib/tokenizers/base_spec.rb
-- spec/lib/tokenizers/default/index_spec.rb
-- spec/lib/tokenizers/default/query_spec.rb
 - spec/lib/tokenizers/index_spec.rb
 - spec/lib/tokenizers/query_spec.rb
-- spec/lib/umlaut_substituter_spec.rb
 - spec/specific/speed_spec.rb

data/lib/picky/tokenizers/default/index.rb DELETED Viewed

@@ -1,7 +0,0 @@
-module Tokenizers
-  module Default
-    # Default is always an instance.
-    #
-    Index = ::Tokenizers::Index.new
-  end
-end

data/lib/picky/tokenizers/default/query.rb DELETED Viewed

@@ -1,7 +0,0 @@
-module Tokenizers
-  module Default
-    # Default is always an instance.
-    #
-    Query = ::Tokenizers::Query.new
-  end
-end

data/lib/picky/umlaut_substituter.rb DELETED Viewed

@@ -1,34 +0,0 @@
-# encoding: utf-8
-#
-# Substitutes certain umlauts, like
-# ä, ö, ü => ae, oe, ue.
-# (and more, see specs)
-#
-class UmlautSubstituter
-  attr_reader :chars
-  def initialize
-    @chars = ActiveSupport::Multibyte.proxy_class
-  end
-  def substitute text
-    trans = chars.new(text).normalize(:kd)
-    # substitute special cases
-    #
-    trans.gsub!('ß', 'ss')
-    # substitute umlauts (of A,O,U,a,o,u)
-    #
-    trans.gsub!(/([AOUaou])\314\210/u, '\1e')
-    # get rid of ecutes, graves and …
-    #
-    trans.unpack('U*').select { |cp|
-      cp < 0x0300 || cp > 0x035F
-    }.pack('U*')
-  end
-end

data/spec/lib/tokenizers/default/index_spec.rb DELETED Viewed

@@ -1,11 +0,0 @@
-# encoding: utf-8
-#
-require 'spec_helper'
-describe Tokenizers::Default::Index do
-  it "is an instance of the index tokenizer" do
-    Tokenizers::Default::Index.should be_kind_of(Tokenizers::Index)
-  end
-end

data/spec/lib/tokenizers/default/query_spec.rb DELETED Viewed

@@ -1,11 +0,0 @@
-# encoding: utf-8
-#
-require 'spec_helper'
-describe Tokenizers::Default::Query do
-  it "is an instance of the index tokenizer" do
-    Tokenizers::Default::Query.should be_kind_of(Tokenizers::Query)
-  end
-end