picky 0.9.4 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. data/lib/picky/application.rb +42 -29
  2. data/lib/picky/character_substitution/european.rb +33 -0
  3. data/lib/picky/configuration/field.rb +3 -3
  4. data/lib/picky/configuration/indexes.rb +4 -10
  5. data/lib/picky/configuration/queries.rb +2 -10
  6. data/lib/picky/index/bundle.rb +2 -7
  7. data/lib/picky/index/file/text.rb +6 -1
  8. data/lib/picky/loader.rb +4 -4
  9. data/lib/picky/query/base.rb +1 -1
  10. data/lib/picky/query/tokens.rb +4 -11
  11. data/lib/picky/tokenizers/base.rb +23 -5
  12. data/lib/picky/tokenizers/index.rb +8 -1
  13. data/lib/picky/tokenizers/query.rb +21 -7
  14. data/lib/tasks/server.rake +3 -3
  15. data/lib/tasks/shortcuts.rake +4 -4
  16. data/lib/tasks/spec.rake +1 -1
  17. data/lib/tasks/try.rake +6 -8
  18. data/project_prototype/Gemfile +3 -2
  19. data/project_prototype/app/application.rb +35 -47
  20. data/spec/lib/application_spec.rb +36 -19
  21. data/spec/lib/{umlaut_substituter_spec.rb → character_substitution/european_spec.rb} +14 -2
  22. data/spec/lib/configuration/field_spec.rb +12 -12
  23. data/spec/lib/configuration/indexes_spec.rb +3 -40
  24. data/spec/lib/extensions/array_spec.rb +1 -1
  25. data/spec/lib/extensions/hash_spec.rb +1 -1
  26. data/spec/lib/index/file/text_spec.rb +14 -1
  27. data/spec/lib/query/combination_spec.rb +1 -1
  28. data/spec/lib/query/tokens_spec.rb +11 -19
  29. data/spec/lib/tokenizers/base_spec.rb +21 -1
  30. data/spec/lib/tokenizers/index_spec.rb +23 -0
  31. data/spec/lib/tokenizers/query_spec.rb +32 -0
  32. metadata +7 -13
  33. data/lib/picky/tokenizers/default/index.rb +0 -7
  34. data/lib/picky/tokenizers/default/query.rb +0 -7
  35. data/lib/picky/umlaut_substituter.rb +0 -34
  36. data/spec/lib/tokenizers/default/index_spec.rb +0 -11
  37. data/spec/lib/tokenizers/default/query_spec.rb +0 -11
@@ -1,55 +1,68 @@
1
1
  # The Picky application wherein the indexing and querying is defined.
2
2
  #
3
3
  class Application
4
+
4
5
  class << self
5
6
 
6
- attr_reader :apps
7
+ # Returns a configured tokenizer that
8
+ # is used for indexing by default.
9
+ #
10
+ def default_indexing options = {}
11
+ indexing.default_tokenizer options
12
+ end
7
13
 
8
- # Finalize the subclass as soon as it
9
- # has finished loading.
14
+ # Returns a configured tokenizer that
15
+ # is used for querying by default.
16
+ #
17
+ def default_querying options = {}
18
+ querying.default_tokenizer options
19
+ end
20
+
21
+ # Routes.
10
22
  #
11
- # Note: finalize finalizes the routes.
23
+ delegate :route, :root, :to => :routing
24
+ # Index, Field.
12
25
  #
13
- def inherited app
14
- @apps ||= []
15
- @apps << app
16
- end
17
- def finalize_apps
18
- @apps.each &:finalize
19
- end
26
+ # TODO Rename category.
27
+ #
28
+ delegate :field, :to => :indexing
29
+ def category *args; indexing.field *args; end
30
+ def index *args; indexing.define_index *args; end
20
31
 
21
32
  # An application simply delegates to the routing to handle a request.
22
33
  #
23
34
  def call env
24
35
  routing.call env
25
36
  end
26
-
27
- # Freezes the routes.
28
- #
29
- def finalize
30
- routing.freeze
31
- end
32
37
  def routing
33
38
  @routing ||= Routing.new
34
39
  end
35
- # Routes.
36
- #
37
- delegate :route, :root, :to => :routing
38
-
39
- # TODO Rename to default_indexing?
40
- #
41
40
  def indexing
42
41
  @indexing ||= Configuration::Indexes.new
43
42
  end
44
- def index *args
45
- self.type *args
43
+ def querying
44
+ @queries ||= Configuration::Queries.new
46
45
  end
47
- delegate :type, :field, :to => :indexing
48
46
 
49
- # TODO Rename to default_querying?
47
+ # Finalize the subclass as soon as it
48
+ # has finished loading.
50
49
  #
51
- def querying
52
- @queries ||= Configuration::Queries.new
50
+ attr_reader :apps
51
+ def initialize_apps
52
+ @apps ||= []
53
+ end
54
+ def inherited app
55
+ initialize_apps
56
+ apps << app
57
+ end
58
+ def finalize_apps
59
+ initialize_apps
60
+ apps.each &:finalize
61
+ end
62
+ # Finalizes the routes.
63
+ #
64
+ def finalize
65
+ routing.freeze
53
66
  end
54
67
 
55
68
  # TODO Add more info.
@@ -0,0 +1,33 @@
1
+ # encoding: utf-8
2
+ #
3
+ module CharacterSubstitution
4
+ # Substitutes Umlauts like
5
+ # ä, ö, ü => ae, oe, ue.
6
+ # (and more, see specs)
7
+ #
8
+ class European
9
+
10
+ def initialize
11
+ @chars = ActiveSupport::Multibyte.proxy_class
12
+ end
13
+
14
+ def substitute text
15
+ trans = @chars.new(text).normalize(:kd)
16
+
17
+ # substitute special cases
18
+ #
19
+ trans.gsub!('ß', 'ss')
20
+
21
+ # substitute umlauts (of A,O,U,a,o,u)
22
+ #
23
+ trans.gsub!(/([AOUaou])\314\210/u, '\1e')
24
+
25
+ # get rid of acutes, graves and …
26
+ #
27
+ trans.unpack('U*').select { |cp|
28
+ cp < 0x0300 || cp > 0x035F
29
+ }.pack('U*')
30
+ end
31
+
32
+ end
33
+ end
@@ -5,10 +5,10 @@ module Configuration
5
5
  #
6
6
  class Field
7
7
  attr_reader :name, :indexed_name, :virtual, :tokenizer
8
- attr_accessor :type # convenience
9
- def initialize name, tokenizer, options = {}
8
+ attr_accessor :type # convenience TODO Still needed?
9
+ def initialize name, options = {}
10
10
  @name = name.to_sym
11
- @tokenizer = tokenizer
11
+ @tokenizer = options[:tokenizer] || Tokenizers::Index.default
12
12
 
13
13
  # TODO Dup the options?
14
14
 
@@ -10,17 +10,13 @@ module Configuration
10
10
  @types = []
11
11
  end
12
12
 
13
- def default_tokenizer
14
- @default_tokenizer ||= Tokenizers::Default::Index
13
+ def default_tokenizer options = {}
14
+ Tokenizers::Index.default = Tokenizers::Index.new(options)
15
15
  end
16
16
 
17
- # Delegates
18
- #
19
- delegate :removes_characters, :contracts_expressions, :stopwords, :splits_text_on, :normalizes_words, :removes_characters_after_splitting, :to => :default_tokenizer
20
-
21
17
  # TODO Rewrite all this configuration handling.
22
18
  #
23
- def type name, source, *fields
19
+ def define_index name, source, *fields
24
20
  new_type = Type.new name, source, *fields
25
21
  types << new_type
26
22
  ::Indexes.configuration ||= self
@@ -30,9 +26,7 @@ module Configuration
30
26
  generated
31
27
  end
32
28
  def field name, options = {}
33
- tokenizer = options[:tokenizer] || default_tokenizer
34
-
35
- Field.new name, tokenizer, options
29
+ Field.new name, options
36
30
  end
37
31
 
38
32
  #
@@ -6,16 +6,8 @@ module Configuration
6
6
 
7
7
  #
8
8
  #
9
- def default_tokenizer
10
- @default_tokenizer ||= Tokenizers::Default::Query
11
- end
12
-
13
- delegate :removes_characters, :contracts_expressions, :stopwords, :splits_text_on, :normalizes_words, :removes_characters_after_splitting, :to => :default_tokenizer
14
-
15
- # Delegates.
16
- #
17
- def maximum_tokens amount
18
- Query::Tokens.maximum = amount
9
+ def default_tokenizer options = {}
10
+ Tokenizers::Query.default = Tokenizers::Query.new(options)
19
11
  end
20
12
 
21
13
  end
@@ -98,15 +98,10 @@ module Index
98
98
  end
99
99
  # Retrieves the data into the index.
100
100
  #
101
- # TODO Beautify.
102
- #
103
101
  def retrieve
104
- files.retrieve do |indexed_id, token|
105
- token.chomp!
106
- token = token.to_sym
107
-
102
+ files.retrieve do |id, token|
108
103
  initialize_index_for token
109
- index[token] << indexed_id.to_i
104
+ index[token] << id
110
105
  end
111
106
  end
112
107
  def initialize_index_for token
@@ -13,10 +13,15 @@ module Index
13
13
  def dump hash
14
14
  raise "Can't dump to text file. Use JSON or Marshal."
15
15
  end
16
+
17
+ # Yields an id and a symbol token.
18
+ #
16
19
  def retrieve
20
+ id, token =
17
21
  ::File.open(cache_path, 'r:binary') do |file|
18
22
  file.each_line do |line|
19
- yield line.split ?,, 2
23
+ id, token = line.split ?,, 2
24
+ yield id.to_i, (token.chomp! || token).to_sym
20
25
  end
21
26
  end
22
27
  end
data/lib/picky/loader.rb CHANGED
@@ -104,6 +104,10 @@ module Loader
104
104
  load_relative 'helpers/cache'
105
105
  load_relative 'helpers/measuring'
106
106
 
107
+ # Character Substitution
108
+ #
109
+ load_relative 'character_substitution/european'
110
+
107
111
  # Signal handling
108
112
  #
109
113
  load_relative 'signals'
@@ -111,7 +115,6 @@ module Loader
111
115
  # Various.
112
116
  #
113
117
  load_relative 'loggers/search'
114
- load_relative 'umlaut_substituter'
115
118
 
116
119
  # Index generation strategies.
117
120
  #
@@ -180,9 +183,6 @@ module Loader
180
183
  load_relative 'tokenizers/index'
181
184
  load_relative 'tokenizers/query'
182
185
 
183
- load_relative 'tokenizers/default/index'
184
- load_relative 'tokenizers/default/query'
185
-
186
186
  # Query combinations, qualifiers, weigher.
187
187
  #
188
188
  load_relative 'query/combination'
@@ -16,7 +16,7 @@ module Query
16
16
  options = Hash === index_types.last ? index_types.pop : {}
17
17
  @index_types = index_types
18
18
  @weigher = options[:weigher] || Weigher.new(index_types)
19
- @tokenizer = options[:tokenizer] || Tokenizers::Default::Query
19
+ @tokenizer = options[:tokenizer] || Tokenizers::Query.default
20
20
  @weights = options[:weights] || Weights.new
21
21
  end
22
22
 
@@ -6,11 +6,6 @@ module Query
6
6
  #
7
7
  class Tokens
8
8
 
9
- #
10
- #
11
- cattr_accessor :maximum
12
- self.maximum = 5
13
-
14
9
  # Basically delegates to its internal tokens array.
15
10
  #
16
11
  self.delegate *[Enumerable.instance_methods, :slice!, :[], :uniq!, :last, :reject!, :length, :size, :empty?, :each, :exit, { :to => :@tokens }].flatten
@@ -52,13 +47,11 @@ module Query
52
47
 
53
48
  # Caps the tokens to the maximum.
54
49
  #
55
- # Note: We could parametrize this if necessary.
56
- #
57
- def cap
58
- @tokens.slice!(@@maximum..-1) if cap?
50
+ def cap maximum
51
+ @tokens.slice!(maximum..-1) if cap?(maximum)
59
52
  end
60
- def cap?
61
- @tokens.size > @@maximum
53
+ def cap? maximum
54
+ @tokens.size > maximum
62
55
  end
63
56
 
64
57
  # Rejects blank tokens.
@@ -79,6 +79,19 @@ module Tokenizers
79
79
  text.gsub! @removes_characters_after_splitting_regexp, '' if @removes_characters_after_splitting_regexp
80
80
  end
81
81
 
82
+ # Substitute Characters with this substituter.
83
+ #
84
+ # Default is European Character substitution.
85
+ #
86
+ def substitutes_characters_with substituter = CharacterSubstitution::European.new
87
+ # TODO Raise if it doesn't quack substitute?
88
+ @substituter = substituter
89
+ end
90
+ def substitute_characters text
91
+ substituter?? substituter.substitute(text) : text
92
+ end
93
+
94
+
82
95
  # Returns a number of tokens, generated from the given text.
83
96
  #
84
97
  # Note:
@@ -93,15 +106,20 @@ module Tokenizers
93
106
  process tokens # processing tokens / strings
94
107
  end
95
108
 
96
- attr_accessor :substituter
109
+ attr_reader :substituter
97
110
  alias substituter? substituter
98
111
 
99
- def initialize substituter = UmlautSubstituter.new
100
- @substituter = substituter
112
+ def initialize options = {}
113
+ removes_characters options[:removes_characters] if options[:removes_characters]
114
+ contracts_expressions *options[:contracts_expressions] if options[:contracts_expressions]
115
+ stopwords options[:stopwords] if options[:stopwords]
116
+ normalizes_words options[:normalizes_words] if options[:normalizes_words]
117
+ removes_characters_after_splitting options[:removes_characters_after_splitting] if options[:removes_characters_after_splitting]
118
+ substitutes_characters_with options[:substitutes_characters_with] if options[:substitutes_characters_with]
101
119
 
102
- # TODO Default handling.
120
+ # Defaults.
103
121
  #
104
- splits_text_on(/\s/)
122
+ splits_text_on options[:splits_text_on] || /\s/
105
123
  end
106
124
 
107
125
  # Hooks.
@@ -5,6 +5,13 @@ module Tokenizers
5
5
  #
6
6
  class Index < Base
7
7
 
8
+ def self.default= new_default
9
+ @default = new_default
10
+ end
11
+ def self.default
12
+ @default ||= new
13
+ end
14
+
8
15
  # Default indexing preprocessing hook.
9
16
  #
10
17
  # Does:
@@ -15,7 +22,7 @@ module Tokenizers
15
22
  # 5. Remove non-single stopwords. (Stopwords that occur with other words)
16
23
  #
17
24
  def preprocess text
18
- text = substituter.substitute text if substituter?
25
+ text = substitute_characters text
19
26
  text.downcase!
20
27
  remove_illegals text
21
28
  contract text
@@ -13,6 +13,20 @@ module Tokenizers
13
13
  #
14
14
  class Query < Base
15
15
 
16
+ def self.default= new_default
17
+ @default = new_default
18
+ end
19
+ def self.default
20
+ @default ||= new
21
+ end
22
+
23
+ attr_reader :maximum_tokens
24
+
25
+ def initialize options = {}
26
+ super options
27
+ @maximum_tokens = options[:maximum_tokens] || 5
28
+ end
29
+
16
30
  def preprocess text
17
31
  remove_illegals text # Remove illegal characters
18
32
  remove_non_single_stopwords text # remove stop words
@@ -33,9 +47,9 @@ module Tokenizers
33
47
  #
34
48
  def process tokens
35
49
  tokens.tokenize_with self
36
- tokens.reject # Reject any tokens that don't meet criteria
37
- tokens.cap # Cut off superfluous tokens
38
- tokens.partialize_last # Set certain tokens as partial
50
+ tokens.reject # Reject any tokens that don't meet criteria
51
+ tokens.cap maximum_tokens # Cut off superfluous tokens
52
+ tokens.partialize_last # Set certain tokens as partial
39
53
  tokens
40
54
  end
41
55
 
@@ -44,10 +58,10 @@ module Tokenizers
44
58
  # TODO Perhaps move to Normalizer?
45
59
  #
46
60
  def normalize text
47
- text = substituter.substitute text if substituter? # Substitute special characters TODO Move to subclass
48
- text.downcase! # Downcase all text
49
- normalize_with_patterns text # normalize
50
- text.to_sym # symbolize
61
+ text = substitute_characters text # Substitute special characters TODO Move to subclass
62
+ text.downcase! # Downcase all text
63
+ normalize_with_patterns text # normalize
64
+ text.to_sym # symbolize
51
65
  end
52
66
 
53
67
  # Returns a token for a word.
@@ -11,7 +11,7 @@ namespace :server do
11
11
  pid.blank? ? nil : pid.chomp
12
12
  end
13
13
 
14
- desc "Start the unicorns. (Wehee!)"
14
+ # desc "Start the unicorns. (Wehee!)"
15
15
  task :start => :framework do
16
16
  chdir_to_root
17
17
  # Rake::Task[:"solr:start"].invoke # TODO Move to better place.
@@ -21,13 +21,13 @@ namespace :server do
21
21
  exec command
22
22
  end
23
23
 
24
- desc "Stop the unicorns. (Blam!)"
24
+ # desc "Stop the unicorns. (Blam!)"
25
25
  task :stop => :framework do
26
26
  `kill -QUIT #{current_pid}` if current_pid
27
27
  # Rake::Task[:"solr:stop"].invoke # TODO Move to better place.
28
28
  end
29
29
 
30
- desc "Restart the unicorns."
30
+ # desc "Restart the unicorns."
31
31
  task :restart do
32
32
  Rake::Task[:"server:stop"].invoke
33
33
  sleep 5
@@ -1,20 +1,20 @@
1
- desc "Shortcut for index:generate."
1
+ desc "Generate the index."
2
2
  task :index => :application do
3
3
  Rake::Task[:'index:generate'].invoke
4
4
  end
5
5
 
6
- desc "Shortcut for try:both"
6
+ desc "Try the given text in the indexer/query (type:field optional)."
7
7
  task :try, [:text, :type_and_field] => :application do |_, options|
8
8
  text, type_and_field = options.text, options.type_and_field
9
9
 
10
10
  Rake::Task[:'try:both'].invoke text, type_and_field
11
11
  end
12
12
 
13
- desc "shortcut for server:start"
13
+ desc "Start the server."
14
14
  task :start do
15
15
  Rake::Task[:'server:start'].invoke
16
16
  end
17
- desc "shortcut for server:stop"
17
+ desc "Stop the server."
18
18
  task :stop do
19
19
  Rake::Task[:'server:stop'].invoke
20
20
  end
data/lib/tasks/spec.rake CHANGED
@@ -3,7 +3,7 @@ require 'spec/rake/spectask'
3
3
 
4
4
  task :default => :spec
5
5
 
6
- desc "Run all specs in spec directory (excluding plugin specs)"
6
+ desc "Run all specs"
7
7
  Spec::Rake::SpecTask.new(:spec) do |t|
8
8
  spec_root = File.join(File.dirname(__FILE__), '..', '..', 'spec')
9
9
  t.spec_opts = ['--options', "\"#{File.join(spec_root, 'spec.opts')}\""]
data/lib/tasks/try.rake CHANGED
@@ -2,25 +2,23 @@
2
2
  #
3
3
  namespace :try do
4
4
 
5
- desc "Try how a given word would be tokenized when indexing (type:field optional)."
5
+ # desc "Try how a given word would be tokenized when indexing (type:field optional)."
6
6
  task :index, [:text, :type_and_field] => :application do |_, options|
7
7
  text, type_and_field = options.text, options.type_and_field
8
8
 
9
- tokenizer = type_and_field ? Indexes.find(*type_and_field.split(':')).tokenizer : Tokenizers::Default::Index
9
+ tokenizer = type_and_field ? Indexes.find(*type_and_field.split(':')).tokenizer : Tokenizers::Index.default
10
10
 
11
- puts "\"#{text}\" is index tokenized as #{tokenizer.tokenize(text).to_a}"
11
+ puts "\"#{text}\" is index tokenized as #{tokenizer.tokenize(text.dup).to_a}"
12
12
  end
13
13
 
14
- desc "Try how a given word would be tokenized when querying."
14
+ # desc "Try how a given word would be tokenized when querying."
15
15
  task :query, [:text] => :application do |_, options|
16
16
  text = options.text
17
17
 
18
- # TODO Text is destroyed.
19
- #
20
- puts "\"#{text}\" is query tokenized as #{Tokenizers::Default::Query.tokenize(text.dup).to_a.map(&:to_s).map(&:to_sym)}"
18
+ puts "\"#{text}\" is query tokenized as #{Tokenizers::Query.default.tokenize(text.dup).to_a.map(&:to_s).map(&:to_sym)}"
21
19
  end
22
20
 
23
- desc "Try the given text with both the index and the query (type:field optional)."
21
+ # desc "Try the given text with both the index and the query (type:field optional)."
24
22
  task :both, [:text, :type_and_field] => :application do |_, options|
25
23
  text, type_and_field = options.text, options.type_and_field
26
24
 
@@ -2,8 +2,9 @@ source :gemcutter
2
2
 
3
3
  # Gems required by Picky.
4
4
  #
5
- gem 'picky', '~> 0.9.0'
6
- gem 'bundler', '>= 0.9.26'
5
+ gem 'picky', '~> 0.10.0'
6
+ gem 'rake'
7
+ gem 'bundler'
7
8
  gem 'rack', '~> 1.2.1'
8
9
  gem 'rack-mount', '~> 0.6.9'
9
10
  gem 'text', '~> 0.2.0'
@@ -1,58 +1,46 @@
1
1
  # encoding: utf-8
2
2
  #
3
+ # TODO Adapt the generated example
4
+ # (a library books finder) to what you need.
5
+ #
6
+ # Check the Wiki http://github.com/floere/picky/wiki for more options.
7
+ # Ask me or the google group if you have questions or specific requests.
8
+ #
3
9
  class PickySearch < Application
4
10
 
5
- # TODO Adapt the generated example
6
- # (a library books finder) to what you need.
7
- #
8
- # Check the Wiki http://github.com/floere/picky/wiki for more options.
11
+ # Indexing: How text is indexed.
12
+ # Querying: How query text is handled.
13
+ #
14
+ default_indexing removes_characters: /[^a-zA-Z0-9\s\/\-\"\&\.]/,
15
+ stopwords: /\b(and|the|of|it|in|for)\b/,
16
+ splits_text_on: /[\s\/\-\"\&\.]/
17
+
18
+ default_querying removes_characters: /[^a-zA-Z0-9\s\/\-\,\&\"\~\*\:]/, # Picky needs control chars *"~: to pass through.
19
+ stopwords: /\b(and|the|of|it|in|for)\b/,
20
+ splits_text_on: /[\s\/\-\,\&]+/,
21
+
22
+ maximum_tokens: 5, # Max amount of tokens passing into a query. 5 is the default.
23
+ substitutes_characters_with: CharacterSubstitution::European.new # Normalizes special user input, Ä -> Ae, ñ -> n etc.
24
+
25
+ # Define an index. Use a database etc. source? http://github.com/floere/picky/wiki/Sources-Configuration#sources
9
26
  #
10
- # Ask me or the google group if you have questions or specific requests.
11
- #
12
-
13
- indexing.removes_characters(/[^a-zA-Z0-9\s\/\-\"\&\.]/)
14
- indexing.stopwords(/\b(and|the|of|it|in|for)\b/)
15
- indexing.splits_text_on(/[\s\/\-\"\&\.]/)
16
-
17
27
  books_index = index :books,
18
- Sources::CSV.new(:title, :author, :isbn, :year, :publisher, :subjects, :file => 'app/library.csv'),
19
- # Use a database as source:
20
- # Sources::DB.new('SELECT id, title, author, isbn13 as isbn FROM books', :file => 'app/db.yml'),
21
- # Or delicious:
22
- # Sources::Delicious.new('username', 'password'), # offers title, tags, url fields.
23
- field(:title,
24
- :partial => Partial::Substring.new(:from => 1), # Index substrings upwards from character 1 (default: -3),
25
- # e.g. picky -> p, pi, pic, pick, picky
26
- # Like this, you'll find picky even when entering just a "p".
27
- :similarity => Similarity::DoubleLevenshtone.new(3)), # Up to three similar title word indexed (default: no similarity).
28
- field(:author, :partial => Partial::Substring.new(:from => 1)),
29
- field(:isbn, :partial => Partial::None.new) # Partial substring searching on an ISBN makes not much sense, neither does similarity.
30
-
31
- # Defines the maximum tokens (words) that pass through to the engine.
32
- #
33
- querying.maximum_tokens 5
28
+ Sources::CSV.new(:title, :author, :isbn, :year, :publisher, :subjects, file: 'app/library.csv'),
29
+ category(:title,
30
+ partial: Partial::Substring.new(from: 1), # Indexes substrings upwards from character 1 (default: -3),
31
+ # You'll find "picky" even when entering just a "p".
32
+ similarity: Similarity::DoubleLevenshtone.new(3)), # Up to three similar title word indexed (default: No similarity).
33
+ category(:author,
34
+ partial: Partial::Substring.new(from: 1)),
35
+ category(:isbn,
36
+ partial: Partial::None.new) # Partial substring searching on an ISBN makes not much sense, neither does similarity.
34
37
 
35
- # Note that Picky needs the following characters to
36
- # pass through, as they are control characters: *"~:
37
- #
38
- querying.removes_characters(/[^a-zA-Z0-9\s\/\-\,\&\"\~\*\:]/)
39
- querying.stopwords(/\b(and|the|of|it|in|for)\b/)
40
- querying.splits_text_on(/[\s\/\-\,\&]+/)
38
+ full_books = Query::Full.new books_index # A Full query returns ids, combinations, and counts.
39
+ live_books = Query::Live.new books_index # A Live query does return all that Full returns, except ids.
41
40
 
42
- # The example defines two queries that use the same index(es).
43
- #
44
- # A Full query returns ids, combinations, and counts.
45
- # A Live query does return all that Full returns, without ids.
46
- #
47
- # Note: You can pass a query multiple indexes and it will combine them.
48
- #
49
- full_books = Query::Full.new books_index
50
- live_books = Query::Live.new books_index
41
+ route %r{\A/books/full\Z} => full_books # Routing is simple: url_path_regexp => query
42
+ route %r{\A/books/live\Z} => live_books #
51
43
 
52
- # Routing is simple.
53
- # A path regexp pointing to a query that will be run.
54
- #
55
- route %r{^/books/full} => full_books
56
- route %r{^/books/live} => live_books
44
+ # Note: You can pass a query multiple indexes and it will query in all of them.
57
45
 
58
46
  end
@@ -5,32 +5,49 @@ require 'spec_helper'
5
5
  describe Application do
6
6
 
7
7
  describe "integration" do
8
+ it "should run ok" do
9
+ lambda {
10
+ class MinimalTestApplication < Application
11
+ books = index :books,
12
+ Sources::DB.new('SELECT id, title FROM books', :file => 'app/db.yml'),
13
+ category(:title)
14
+
15
+
16
+ full = Query::Full.new books
17
+ live = Query::Live.new books
18
+
19
+ route %r{^/books/full} => full
20
+ route %r{^/books/live} => live
21
+ end
22
+ Tokenizers::Index.default.tokenize 'some text'
23
+ Tokenizers::Query.default.tokenize 'some text'
24
+ }.should_not raise_error
25
+ end
8
26
  it "should run ok" do
9
27
  lambda {
10
28
  # TODO Add all possible cases.
11
29
  #
12
30
  class TestApplication < Application
13
- indexing.removes_characters(/[^a-zA-Z0-9\s\/\-\"\&\.]/)
14
- indexing.contracts_expressions(/mr\.\s*|mister\s*/i, 'mr ')
15
- indexing.stopwords(/\b(and|the|of|it|in|for)\b/)
16
- indexing.splits_text_on(/[\s\/\-\"\&\.]/)
17
- indexing.removes_characters_after_splitting(/[\.]/)
31
+ default_indexing removes_characters: /[^a-zA-Z0-9\s\/\-\"\&\.]/,
32
+ contracts_expressions: [/mr\.\s*|mister\s*/i, 'mr '],
33
+ stopwords: /\b(and|the|of|it|in|for)\b/,
34
+ splits_text_on: /[\s\/\-\"\&\.]/,
35
+ removes_characters_after_splitting: /[\.]/
36
+
37
+ default_querying removes_characters: /[^a-zA-Z0-9äöü\s\/\-\,\&\"\~\*\:]/,
38
+ stopwords: /\b(and|the|of|it|in|for)\b/,
39
+ splits_text_on: /[\s\/\-\,\&]+/,
40
+ normalizes_words: [[/Deoxyribonucleic Acid/i, 'DNA']],
41
+
42
+ substitutes_characters_with: CharacterSubstitution::European.new,
43
+ maximum_tokens: 5
18
44
 
19
- books_index = index Sources::DB.new('SELECT id, title, author, isbn13 as isbn FROM books', :file => 'app/db.yml'),
20
- field(:title, :similarity => Similarity::DoubleLevenshtone.new(3)), # Up to three similar title word indexed.
21
- field(:author),
22
- field(:isbn, :partial => Partial::None.new) # Partially searching on an ISBN makes not much sense.
45
+ books_index = index :books,
46
+ Sources::DB.new('SELECT id, title, author, isbn13 as isbn FROM books', :file => 'app/db.yml'),
47
+ category(:title, :similarity => Similarity::DoubleLevenshtone.new(3)), # Up to three similar title word indexed.
48
+ category(:author),
49
+ category(:isbn, :partial => Partial::None.new) # Partially searching on an ISBN makes not much sense.
23
50
 
24
- # Note that Picky needs the following characters to
25
- # pass through, as they are control characters: *"~:
26
- #
27
- querying.removes_characters(/[^a-zA-Z0-9äöü\s\/\-\,\&\"\~\*\:]/)
28
- querying.stopwords(/\b(and|the|of|it|in|for)\b/)
29
- querying.splits_text_on(/[\s\/\-\,\&]+/)
30
- querying.normalizes_words([
31
- [/Deoxyribonucleic Acid/i, 'DNA']
32
- ])
33
- querying.maximum_tokens 5
34
51
 
35
52
  full = Query::Full.new books_index
36
53
  live = Query::Live.new books_index
@@ -1,9 +1,10 @@
1
1
  # encoding: utf-8
2
+ #
2
3
  require 'spec_helper'
3
4
 
4
- describe UmlautSubstituter do
5
+ describe CharacterSubstitution do
5
6
  before(:each) do
6
- @substituter = UmlautSubstituter.new
7
+ @substituter = CharacterSubstitution::European.new
7
8
  end
8
9
 
9
10
  # A bit of metaprogramming to help with the myriads of its.
@@ -82,5 +83,16 @@ describe UmlautSubstituter do
82
83
  it_should_substitute 'å', 'a'
83
84
  it_should_substitute 'Å', 'A'
84
85
  end
86
+
87
+ describe "diacritic" do
88
+ it_should_substitute 'ñ', 'n'
89
+ end
90
+
91
+ describe "speed" do
92
+ it "is fast" do
93
+ result = performance_of { @substituter.substitute('ä') }
94
+ result.should < 0.00009
95
+ end
96
+ end
85
97
 
86
98
  end
@@ -5,7 +5,7 @@ describe Configuration::Field do
5
5
  describe "virtual?" do
6
6
  context "with virtual true" do
7
7
  before(:each) do
8
- @field = Configuration::Field.new :some_name, :some_tokenizer, :virtual => true
8
+ @field = Configuration::Field.new :some_name, :virtual => true
9
9
  end
10
10
  it "returns the right value" do
11
11
  @field.virtual?.should == true
@@ -13,7 +13,7 @@ describe Configuration::Field do
13
13
  end
14
14
  context "with virtual object" do
15
15
  before(:each) do
16
- @field = Configuration::Field.new :some_name, :some_tokenizer, :virtual => 123.6
16
+ @field = Configuration::Field.new :some_name, :virtual => 123.6
17
17
  end
18
18
  it "returns the right value" do
19
19
  @field.virtual?.should == true
@@ -39,7 +39,7 @@ describe Configuration::Field do
39
39
  describe "tokenizer" do
40
40
  context "with specific tokenizer" do
41
41
  before(:each) do
42
- @field = Configuration::Field.new :some_name, Tokenizers::Index.new
42
+ @field = Configuration::Field.new :some_name, tokenizer: Tokenizers::Index.new
43
43
 
44
44
  @field.type = :some_type
45
45
  end
@@ -54,7 +54,7 @@ describe Configuration::Field do
54
54
  describe "indexer" do
55
55
  context "with default indexer" do
56
56
  before(:each) do
57
- @field = Configuration::Field.new :some_name, :some_tokenizer
57
+ @field = Configuration::Field.new :some_name
58
58
  end
59
59
  it "caches" do
60
60
  @field.indexer.should == @field.indexer
@@ -62,7 +62,7 @@ describe Configuration::Field do
62
62
  end
63
63
  context "with specific indexer" do
64
64
  before(:each) do
65
- @field = Configuration::Field.new :some_name, :indexer => Indexers::Default
65
+ @field = Configuration::Field.new :some_name, tokenizer: Indexers::Default
66
66
 
67
67
  @field.type = :some_type
68
68
  end
@@ -81,7 +81,7 @@ describe Configuration::Field do
81
81
  end
82
82
  describe "cache" do
83
83
  before(:each) do
84
- @field = Configuration::Field.new :some_name, :some_tokenizer
84
+ @field = Configuration::Field.new :some_name
85
85
  @field.stub! :prepare_cache_directory
86
86
 
87
87
  @generated = stub :generated, :generate_caches => nil
@@ -100,7 +100,7 @@ describe Configuration::Field do
100
100
  end
101
101
  describe "prepare_cache_directory" do
102
102
  before(:each) do
103
- @field = Configuration::Field.new :some_name, :some_tokenizer
103
+ @field = Configuration::Field.new :some_name
104
104
 
105
105
  @field.stub! :cache_directory => :some_cache_directory
106
106
  end
@@ -112,7 +112,7 @@ describe Configuration::Field do
112
112
  end
113
113
  describe "index" do
114
114
  before(:each) do
115
- @field = Configuration::Field.new :some_name, :some_tokenizer
115
+ @field = Configuration::Field.new :some_name
116
116
  @field.stub! :prepare_cache_directory
117
117
 
118
118
  @indexer = stub :indexer, :index => nil
@@ -132,7 +132,7 @@ describe Configuration::Field do
132
132
  describe "source" do
133
133
  context "with source" do
134
134
  before(:each) do
135
- @field = Configuration::Field.new :some_name, :some_tokenizer, :source => :some_given_source
135
+ @field = Configuration::Field.new :some_name, :source => :some_given_source
136
136
 
137
137
  @type = stub :type, :name => :some_type
138
138
  @field.type = @type
@@ -143,7 +143,7 @@ describe Configuration::Field do
143
143
  end
144
144
  context "without source" do
145
145
  before(:each) do
146
- @field = Configuration::Field.new :some_name, :some_tokenizer
146
+ @field = Configuration::Field.new :some_name
147
147
 
148
148
  @type = stub :type, :name => :some_type, :source => :some_type_source
149
149
  @field.type = @type
@@ -155,7 +155,7 @@ describe Configuration::Field do
155
155
  end
156
156
  context "name symbol" do
157
157
  before(:each) do
158
- @field = Configuration::Field.new :some_name, :some_tokenizer
158
+ @field = Configuration::Field.new :some_name
159
159
 
160
160
  @type = stub :type, :name => :some_type
161
161
  @field.type = @type
@@ -189,7 +189,7 @@ describe Configuration::Field do
189
189
  end
190
190
  context "name string" do
191
191
  before(:each) do
192
- @field = Configuration::Field.new 'some_name', :some_tokenizer
192
+ @field = Configuration::Field.new 'some_name'
193
193
  end
194
194
  describe "generate_qualifiers_from" do
195
195
  context "without qualifiers" do
@@ -18,47 +18,10 @@ describe Configuration::Indexes do
18
18
 
19
19
  describe "default_tokenizer" do
20
20
  it "is a default tokenizer" do
21
- @config.default_tokenizer.should == Tokenizers::Default::Index
21
+ @config.default_tokenizer.should be_kind_of(Tokenizers::Index)
22
22
  end
23
- it "caches" do
24
- @config.default_tokenizer.should == @config.default_tokenizer
25
- end
26
- end
27
-
28
- describe "delegates" do
29
- before(:each) do
30
- @receiver = mock :receiver
31
- @config.stub! :default_tokenizer => @receiver
32
- end
33
- it "delegates" do
34
- @receiver.should_receive(:removes_characters).once
35
-
36
- @config.removes_characters
37
- end
38
- it "delegates" do
39
- @receiver.should_receive(:contracts_expressions).once
40
-
41
- @config.contracts_expressions
42
- end
43
- it "delegates" do
44
- @receiver.should_receive(:stopwords).once
45
-
46
- @config.stopwords
47
- end
48
- it "delegates" do
49
- @receiver.should_receive(:splits_text_on).once
50
-
51
- @config.splits_text_on
52
- end
53
- it "delegates" do
54
- @receiver.should_receive(:normalizes_words).once
55
-
56
- @config.normalizes_words
57
- end
58
- it "delegates" do
59
- @receiver.should_receive(:removes_characters_after_splitting).once
60
-
61
- @config.removes_characters_after_splitting
23
+ it "does not cache" do
24
+ @config.default_tokenizer.should_not == @config.default_tokenizer
62
25
  end
63
26
  end
64
27
 
@@ -51,7 +51,7 @@ describe Array do
51
51
  [:test1, :test1, :test2, :test2, :test3].clustered_uniq.should == [:test1, :test2, :test3]
52
52
  end
53
53
  it "is fast" do
54
- performance_of { [:test1, :test1, :test2, :test2, :test3].clustered_uniq }.should < 0.00001
54
+ performance_of { [:test1, :test1, :test2, :test2, :test3].clustered_uniq }.should < 0.000012
55
55
  end
56
56
  end
57
57
 
@@ -65,7 +65,7 @@ describe Hash do
65
65
  lambda { @obj.to_json(:some => :option) }.should_not raise_error
66
66
  end
67
67
  it "should be fast" do
68
- performance_of { @obj.to_json }.should < 0.00006
68
+ performance_of { @obj.to_json }.should < 0.000065
69
69
  end
70
70
  end
71
71
 
@@ -21,7 +21,20 @@ describe Index::File::Text do
21
21
  end
22
22
  end
23
23
  describe "retrieve" do
24
- it
24
+ before(:each) do
25
+ @io = stub :io
26
+ @io.should_receive(:each_line).once.with.and_yield '123456,some_nice_token'
27
+ File.should_receive(:open).any_number_of_times.and_yield @io
28
+ end
29
+ it "yields split lines and returns the id and token text" do
30
+ @file.retrieve do |id, token|
31
+ id.should == 123456
32
+ token.should == :some_nice_token
33
+ end
34
+ end
35
+ it "is fast" do
36
+ performance_of { @file.retrieve { |id, token| } }.should < 0.00005
37
+ end
25
38
  end
26
39
 
27
40
  end
@@ -46,7 +46,7 @@ describe 'Query::Combination' do
46
46
  @combination = Query::Combination.new token, @category
47
47
  end
48
48
  it 'should return a correct result' do
49
- @combination.to_result.should == [:some_category_name, 'Blä~', :blae]
49
+ @combination.to_result.should == [:some_category_name, 'Blä~', :blä] # Note: Characters not substituted. That's ok.
50
50
  end
51
51
  end
52
52
  it 'should return a correct result' do
@@ -36,31 +36,26 @@ describe Query::Tokens do
36
36
  @tokens.instance_variable_get(:@tokens).should == [@nonblank, @nonblank]
37
37
  end
38
38
  end
39
-
40
- describe "class variables" do
41
- describe "maximal query words" do
42
- it "should answer" do
43
- lambda { Query::Tokens.maximum }.should_not raise_error
44
- end
45
- end
46
- end
47
-
39
+
48
40
  describe 'cap' do
49
41
  context 'one token' do
50
42
  before(:each) do
51
43
  @token = Query::Token.processed 'Token'
52
44
  @tokens = Query::Tokens.new [@token]
53
45
  end
54
- it 'should not cut it down' do
55
- @tokens.cap
56
-
46
+ it 'does not cut it down' do
47
+ @tokens.cap 5
48
+
57
49
  @tokens.instance_variable_get(:@tokens).should == [@token]
58
50
  end
51
+ it 'cuts it down' do
52
+ @tokens.cap 0
53
+
54
+ @tokens.instance_variable_get(:@tokens).should == []
55
+ end
59
56
  end
60
57
  context 'many tokens' do
61
58
  before(:each) do
62
- @old_maximum = Query::Tokens.maximum
63
- Query::Tokens.maximum = 3
64
59
  @first = Query::Token.processed 'Hello'
65
60
  @second = Query::Token.processed 'I'
66
61
  @third = Query::Token.processed 'Am'
@@ -72,12 +67,9 @@ describe Query::Tokens do
72
67
  Query::Token.processed('Token')
73
68
  ]
74
69
  end
75
- after(:each) do
76
- Query::Tokens.maximum = @old_maximum
77
- end
78
70
  it 'should cap the number of tokens' do
79
- @tokens.cap
80
-
71
+ @tokens.cap 3
72
+
81
73
  @tokens.instance_variable_get(:@tokens).should == [@first, @second, @third]
82
74
  end
83
75
  end
@@ -1,4 +1,5 @@
1
- # coding: utf-8
1
+ # encoding: utf-8
2
+ #
2
3
  require 'spec_helper'
3
4
 
4
5
  describe Tokenizers::Base do
@@ -7,6 +8,22 @@ describe Tokenizers::Base do
7
8
  @tokenizer = Tokenizers::Base.new
8
9
  end
9
10
 
11
+ describe "substitute(s)_characters*" do
12
+ it "doesn't substitute if there is no substituter" do
13
+ @tokenizer.substitute_characters('abcdefghijklmnopqrstuvwxyzäöü').should == 'abcdefghijklmnopqrstuvwxyzäöü'
14
+ end
15
+ it "uses the substituter to replace characters" do
16
+ @tokenizer.substitutes_characters_with CharacterSubstitution::European.new
17
+
18
+ @tokenizer.substitute_characters('abcdefghijklmnopqrstuvwxyzäöü').should == 'abcdefghijklmnopqrstuvwxyzaeoeue'
19
+ end
20
+ it "uses the european substituter as default" do
21
+ @tokenizer.substitutes_characters_with
22
+
23
+ @tokenizer.substitute_characters('abcdefghijklmnopqrstuvwxyzäöü').should == 'abcdefghijklmnopqrstuvwxyzaeoeue'
24
+ end
25
+ end
26
+
10
27
  describe "removes_characters_after_splitting" do
11
28
  context "without removes_characters_after_splitting called" do
12
29
  it "has remove_after_normalizing_illegals" do
@@ -64,6 +81,9 @@ describe Tokenizers::Base do
64
81
  it 'should define a method split that splits by default on \s' do
65
82
  @tokenizer.split('a b/c.d').should == ['a', 'b/c.d']
66
83
  end
84
+ it 'splits text on /\s/ by default' do
85
+ @tokenizer.split('this is a test').should == ['this', 'is', 'a', 'test']
86
+ end
67
87
  end
68
88
  context "with removes_characters called" do
69
89
  before(:each) do
@@ -8,6 +8,29 @@ describe Tokenizers::Index do
8
8
  @tokenizer = Tokenizers::Index.new
9
9
  end
10
10
 
11
+ describe "default*" do
12
+ before(:all) do
13
+ @old = Tokenizers::Index.default
14
+ end
15
+ after(:all) do
16
+ Tokenizers::Index.default = @old
17
+ end
18
+ it "has a reader" do
19
+ lambda { Tokenizers::Index.default }.should_not raise_error
20
+ end
21
+ it "returns by default a new Index" do
22
+ Tokenizers::Index.default.should be_kind_of(Tokenizers::Index)
23
+ end
24
+ it "has a writer" do
25
+ lambda { Tokenizers::Index.default = :bla }.should_not raise_error
26
+ end
27
+ it "returns what has been written, if something has been written" do
28
+ Tokenizers::Index.default = :some_default
29
+
30
+ Tokenizers::Index.default.should == :some_default
31
+ end
32
+ end
33
+
11
34
  describe "remove_removes_characters" do
12
35
  it "should not remove ' from a query by default" do
13
36
  @tokenizer.remove_illegals("Lugi's").should == "Lugi's"
@@ -7,6 +7,38 @@ describe Tokenizers::Query do
7
7
  @tokenizer = Tokenizers::Query.new
8
8
  end
9
9
 
10
+ describe "default*" do
11
+ before(:all) do
12
+ @old = Tokenizers::Query.default
13
+ end
14
+ after(:all) do
15
+ Tokenizers::Query.default = @old
16
+ end
17
+ it "has a reader" do
18
+ lambda { Tokenizers::Query.default }.should_not raise_error
19
+ end
20
+ it "returns by default a new Index" do
21
+ Tokenizers::Query.default.should be_kind_of(Tokenizers::Query)
22
+ end
23
+ it "has a writer" do
24
+ lambda { Tokenizers::Query.default = :bla }.should_not raise_error
25
+ end
26
+ it "returns what has been written, if something has been written" do
27
+ Tokenizers::Query.default = :some_default
28
+
29
+ Tokenizers::Query.default.should == :some_default
30
+ end
31
+ end
32
+
33
+ describe "maximum_tokens" do
34
+ it "should be set to 5 by default" do
35
+ @tokenizer.maximum_tokens.should == 5
36
+ end
37
+ it "should be settable" do
38
+ Tokenizers::Query.new(maximum_tokens: 3).maximum_tokens.should == 3
39
+ end
40
+ end
41
+
10
42
  describe 'preprocess' do
11
43
  it 'should call methods in order' do
12
44
  text = stub :text
metadata CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
4
4
  prerelease: false
5
5
  segments:
6
6
  - 0
7
- - 9
8
- - 4
9
- version: 0.9.4
7
+ - 10
8
+ - 0
9
+ version: 0.10.0
10
10
  platform: ruby
11
11
  authors:
12
12
  - Florian Hanke
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2010-10-30 00:00:00 +02:00
17
+ date: 2010-10-31 00:00:00 +02:00
18
18
  default_executable: picky
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
@@ -59,6 +59,7 @@ files:
59
59
  - lib/picky/cacher/weights/logarithmic.rb
60
60
  - lib/picky/cacher/weights/strategy.rb
61
61
  - lib/picky/cacher/weights_generator.rb
62
+ - lib/picky/character_substitution/european.rb
62
63
  - lib/picky/configuration/field.rb
63
64
  - lib/picky/configuration/indexes.rb
64
65
  - lib/picky/configuration/queries.rb
@@ -118,11 +119,8 @@ files:
118
119
  - lib/picky/sources/db.rb
119
120
  - lib/picky/sources/delicious.rb
120
121
  - lib/picky/tokenizers/base.rb
121
- - lib/picky/tokenizers/default/index.rb
122
- - lib/picky/tokenizers/default/query.rb
123
122
  - lib/picky/tokenizers/index.rb
124
123
  - lib/picky/tokenizers/query.rb
125
- - lib/picky/umlaut_substituter.rb
126
124
  - lib/picky-tasks.rb
127
125
  - lib/picky.rb
128
126
  - lib/tasks/application.rake
@@ -161,6 +159,7 @@ files:
161
159
  - spec/lib/cacher/similarity_generator_spec.rb
162
160
  - spec/lib/cacher/weights/logarithmic_spec.rb
163
161
  - spec/lib/cacher/weights_generator_spec.rb
162
+ - spec/lib/character_substitution/european_spec.rb
164
163
  - spec/lib/configuration/field_spec.rb
165
164
  - spec/lib/configuration/indexes_spec.rb
166
165
  - spec/lib/configuration/type_spec.rb
@@ -210,11 +209,8 @@ files:
210
209
  - spec/lib/sources/db_spec.rb
211
210
  - spec/lib/sources/delicious_spec.rb
212
211
  - spec/lib/tokenizers/base_spec.rb
213
- - spec/lib/tokenizers/default/index_spec.rb
214
- - spec/lib/tokenizers/default/query_spec.rb
215
212
  - spec/lib/tokenizers/index_spec.rb
216
213
  - spec/lib/tokenizers/query_spec.rb
217
- - spec/lib/umlaut_substituter_spec.rb
218
214
  - spec/specific/speed_spec.rb
219
215
  - bin/picky
220
216
  has_rdoc: true
@@ -261,6 +257,7 @@ test_files:
261
257
  - spec/lib/cacher/similarity_generator_spec.rb
262
258
  - spec/lib/cacher/weights/logarithmic_spec.rb
263
259
  - spec/lib/cacher/weights_generator_spec.rb
260
+ - spec/lib/character_substitution/european_spec.rb
264
261
  - spec/lib/configuration/field_spec.rb
265
262
  - spec/lib/configuration/indexes_spec.rb
266
263
  - spec/lib/configuration/type_spec.rb
@@ -310,9 +307,6 @@ test_files:
310
307
  - spec/lib/sources/db_spec.rb
311
308
  - spec/lib/sources/delicious_spec.rb
312
309
  - spec/lib/tokenizers/base_spec.rb
313
- - spec/lib/tokenizers/default/index_spec.rb
314
- - spec/lib/tokenizers/default/query_spec.rb
315
310
  - spec/lib/tokenizers/index_spec.rb
316
311
  - spec/lib/tokenizers/query_spec.rb
317
- - spec/lib/umlaut_substituter_spec.rb
318
312
  - spec/specific/speed_spec.rb
@@ -1,7 +0,0 @@
1
- module Tokenizers
2
- module Default
3
- # Default is always an instance.
4
- #
5
- Index = ::Tokenizers::Index.new
6
- end
7
- end
@@ -1,7 +0,0 @@
1
- module Tokenizers
2
- module Default
3
- # Default is always an instance.
4
- #
5
- Query = ::Tokenizers::Query.new
6
- end
7
- end
@@ -1,34 +0,0 @@
1
- # encoding: utf-8
2
- #
3
-
4
- # Substitutes certain umlauts, like
5
- # ä, ö, ü => ae, oe, ue.
6
- # (and more, see specs)
7
- #
8
- class UmlautSubstituter
9
-
10
- attr_reader :chars
11
-
12
- def initialize
13
- @chars = ActiveSupport::Multibyte.proxy_class
14
- end
15
-
16
- def substitute text
17
- trans = chars.new(text).normalize(:kd)
18
-
19
- # substitute special cases
20
- #
21
- trans.gsub!('ß', 'ss')
22
-
23
- # substitute umlauts (of A,O,U,a,o,u)
24
- #
25
- trans.gsub!(/([AOUaou])\314\210/u, '\1e')
26
-
27
- # get rid of ecutes, graves and …
28
- #
29
- trans.unpack('U*').select { |cp|
30
- cp < 0x0300 || cp > 0x035F
31
- }.pack('U*')
32
- end
33
-
34
- end
@@ -1,11 +0,0 @@
1
- # encoding: utf-8
2
- #
3
- require 'spec_helper'
4
-
5
- describe Tokenizers::Default::Index do
6
-
7
- it "is an instance of the index tokenizer" do
8
- Tokenizers::Default::Index.should be_kind_of(Tokenizers::Index)
9
- end
10
-
11
- end
@@ -1,11 +0,0 @@
1
- # encoding: utf-8
2
- #
3
- require 'spec_helper'
4
-
5
- describe Tokenizers::Default::Query do
6
-
7
- it "is an instance of the index tokenizer" do
8
- Tokenizers::Default::Query.should be_kind_of(Tokenizers::Query)
9
- end
10
-
11
- end