picky 0.9.4 → 0.10.0

Files changed (37)
  1. data/lib/picky/application.rb +42 -29
  2. data/lib/picky/character_substitution/european.rb +33 -0
  3. data/lib/picky/configuration/field.rb +3 -3
  4. data/lib/picky/configuration/indexes.rb +4 -10
  5. data/lib/picky/configuration/queries.rb +2 -10
  6. data/lib/picky/index/bundle.rb +2 -7
  7. data/lib/picky/index/file/text.rb +6 -1
  8. data/lib/picky/loader.rb +4 -4
  9. data/lib/picky/query/base.rb +1 -1
  10. data/lib/picky/query/tokens.rb +4 -11
  11. data/lib/picky/tokenizers/base.rb +23 -5
  12. data/lib/picky/tokenizers/index.rb +8 -1
  13. data/lib/picky/tokenizers/query.rb +21 -7
  14. data/lib/tasks/server.rake +3 -3
  15. data/lib/tasks/shortcuts.rake +4 -4
  16. data/lib/tasks/spec.rake +1 -1
  17. data/lib/tasks/try.rake +6 -8
  18. data/project_prototype/Gemfile +3 -2
  19. data/project_prototype/app/application.rb +35 -47
  20. data/spec/lib/application_spec.rb +36 -19
  21. data/spec/lib/{umlaut_substituter_spec.rb → character_substitution/european_spec.rb} +14 -2
  22. data/spec/lib/configuration/field_spec.rb +12 -12
  23. data/spec/lib/configuration/indexes_spec.rb +3 -40
  24. data/spec/lib/extensions/array_spec.rb +1 -1
  25. data/spec/lib/extensions/hash_spec.rb +1 -1
  26. data/spec/lib/index/file/text_spec.rb +14 -1
  27. data/spec/lib/query/combination_spec.rb +1 -1
  28. data/spec/lib/query/tokens_spec.rb +11 -19
  29. data/spec/lib/tokenizers/base_spec.rb +21 -1
  30. data/spec/lib/tokenizers/index_spec.rb +23 -0
  31. data/spec/lib/tokenizers/query_spec.rb +32 -0
  32. metadata +7 -13
  33. data/lib/picky/tokenizers/default/index.rb +0 -7
  34. data/lib/picky/tokenizers/default/query.rb +0 -7
  35. data/lib/picky/umlaut_substituter.rb +0 -34
  36. data/spec/lib/tokenizers/default/index_spec.rb +0 -11
  37. data/spec/lib/tokenizers/default/query_spec.rb +0 -11
data/lib/picky/application.rb CHANGED
@@ -1,55 +1,68 @@
 # The Picky application wherein the indexing and querying is defined.
 #
 class Application
+
   class << self
 
-    attr_reader :apps
+    # Returns a configured tokenizer that
+    # is used for indexing by default.
+    #
+    def default_indexing options = {}
+      indexing.default_tokenizer options
+    end
 
-    # Finalize the subclass as soon as it
-    # has finished loading.
+    # Returns a configured tokenizer that
+    # is used for querying by default.
+    #
+    def default_querying options = {}
+      querying.default_tokenizer options
+    end
+
+    # Routes.
     #
-    # Note: finalize finalizes the routes.
+    delegate :route, :root, :to => :routing
+    # Index, Field.
     #
-    def inherited app
-      @apps ||= []
-      @apps << app
-    end
-    def finalize_apps
-      @apps.each &:finalize
-    end
+    # TODO Rename category.
+    #
+    delegate :field, :to => :indexing
+    def category *args; indexing.field *args; end
+    def index *args; indexing.define_index *args; end
 
    # An application simply delegates to the routing to handle a request.
    #
    def call env
      routing.call env
    end
-
-    # Freezes the routes.
-    #
-    def finalize
-      routing.freeze
-    end
    def routing
      @routing ||= Routing.new
    end
-    # Routes.
-    #
-    delegate :route, :root, :to => :routing
-
-    # TODO Rename to default_indexing?
-    #
    def indexing
      @indexing ||= Configuration::Indexes.new
    end
-    def index *args
-      self.type *args
+    def querying
+      @queries ||= Configuration::Queries.new
    end
-    delegate :type, :field, :to => :indexing
 
-    # TODO Rename to default_querying?
+    # Finalize the subclass as soon as it
+    # has finished loading.
    #
-    def querying
-      @queries ||= Configuration::Queries.new
+    attr_reader :apps
+    def initialize_apps
+      @apps ||= []
+    end
+    def inherited app
+      initialize_apps
+      apps << app
+    end
+    def finalize_apps
+      initialize_apps
+      apps.each &:finalize
+    end
+    # Finalizes the routes.
+    #
+    def finalize
+      routing.freeze
    end
 
    # TODO Add more info.
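
Taken together, this hunk replaces the instance-level indexing.*/querying.* configuration calls with class-level default_indexing/default_querying options hashes, and type/field become index/category. A minimal sketch of what an app looks like on the new 0.10.0 DSL (names and option values here are illustrative only, mirroring the project prototype further down):

  class BooksSearch < Application

    default_indexing splits_text_on: /[\s\/\-\"\&\.]/
    default_querying maximum_tokens: 5

    books = index :books,
                  Sources::CSV.new(:title, :author, file: 'app/library.csv'),
                  category(:title),
                  category(:author)

    route %r{\A/books\Z} => Query::Full.new(books)

  end
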
data/lib/picky/character_substitution/european.rb ADDED
@@ -0,0 +1,33 @@
+# encoding: utf-8
+#
+module CharacterSubstitution
+  # Substitutes Umlauts like
+  # ä, ö, ü => ae, oe, ue.
+  # (and more, see specs)
+  #
+  class European
+
+    def initialize
+      @chars = ActiveSupport::Multibyte.proxy_class
+    end
+
+    def substitute text
+      trans = @chars.new(text).normalize(:kd)
+
+      # substitute special cases
+      #
+      trans.gsub!('ß', 'ss')
+
+      # substitute umlauts (of A,O,U,a,o,u)
+      #
+      trans.gsub!(/([AOUaou])\314\210/u, '\1e')
+
+      # get rid of ecutes, graves and …
+      #
+      trans.unpack('U*').select { |cp|
+        cp < 0x0300 || cp > 0x035F
+      }.pack('U*')
+    end
+
+  end
+end
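
The substituter decomposes the text to NFKD, rewrites umlauts and ß explicitly, and then strips the remaining combining marks (codepoints 0x0300-0x035F), so it covers accents well beyond German. A quick sketch of the expected behaviour, per the specs (assumes ActiveSupport is loaded):

  substituter = CharacterSubstitution::European.new
  substituter.substitute 'Müßig'  # => "Muessig" (ü -> ue, ß -> ss)
  substituter.substitute 'piñata' # => "pinata" (combining tilde stripped)
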
data/lib/picky/configuration/field.rb CHANGED
@@ -5,10 +5,10 @@ module Configuration
   #
   class Field
     attr_reader :name, :indexed_name, :virtual, :tokenizer
-    attr_accessor :type # convenience
-    def initialize name, tokenizer, options = {}
+    attr_accessor :type # convenience TODO Still needed?
+    def initialize name, options = {}
      @name = name.to_sym
-      @tokenizer = tokenizer
+      @tokenizer = options[:tokenizer] || Tokenizers::Index.default
 
      # TODO Dup the options?
 
data/lib/picky/configuration/indexes.rb CHANGED
@@ -10,17 +10,13 @@ module Configuration
      @types = []
    end
 
-    def default_tokenizer
-      @default_tokenizer ||= Tokenizers::Default::Index
+    def default_tokenizer options = {}
+      Tokenizers::Index.default = Tokenizers::Index.new(options)
    end
 
-    # Delegates
-    #
-    delegate :removes_characters, :contracts_expressions, :stopwords, :splits_text_on, :normalizes_words, :removes_characters_after_splitting, :to => :default_tokenizer
-
    # TODO Rewrite all this configuration handling.
    #
-    def type name, source, *fields
+    def define_index name, source, *fields
      new_type = Type.new name, source, *fields
      types << new_type
      ::Indexes.configuration ||= self
@@ -30,9 +26,7 @@ module Configuration
      generated
    end
    def field name, options = {}
-      tokenizer = options[:tokenizer] || default_tokenizer
-
-      Field.new name, tokenizer, options
+      Field.new name, options
    end
 
    #
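
Note the semantics change here: default_tokenizer no longer memoizes a constant but builds a fresh Tokenizers::Index from the given options on every call and installs it as the class-wide default (the indexes_spec change further down asserts exactly this non-caching behaviour). Roughly, with config standing in for a hypothetical Configuration::Indexes instance:

  config.default_tokenizer splits_text_on: /[\s,]/ # builds and installs a new tokenizer
  Tokenizers::Index.default                        # => that freshly built instance
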
data/lib/picky/configuration/queries.rb CHANGED
@@ -6,16 +6,8 @@ module Configuration
 
    #
    #
-    def default_tokenizer
-      @default_tokenizer ||= Tokenizers::Default::Query
-    end
-
-    delegate :removes_characters, :contracts_expressions, :stopwords, :splits_text_on, :normalizes_words, :removes_characters_after_splitting, :to => :default_tokenizer
-
-    # Delegates.
-    #
-    def maximum_tokens amount
-      Query::Tokens.maximum = amount
+    def default_tokenizer options = {}
+      Tokenizers::Query.default = Tokenizers::Query.new(options)
    end
 
  end
data/lib/picky/index/bundle.rb CHANGED
@@ -98,15 +98,10 @@ module Index
    end
    # Retrieves the data into the index.
    #
-    # TODO Beautify.
-    #
    def retrieve
-      files.retrieve do |indexed_id, token|
-        token.chomp!
-        token = token.to_sym
-
+      files.retrieve do |id, token|
        initialize_index_for token
-        index[token] << indexed_id.to_i
+        index[token] << id
      end
    end
    def initialize_index_for token
data/lib/picky/index/file/text.rb CHANGED
@@ -13,10 +13,15 @@ module Index
    def dump hash
      raise "Can't dump to text file. Use JSON or Marshal."
    end
+
+    # Yields an id and a symbol token.
+    #
    def retrieve
+      id, token =
      ::File.open(cache_path, 'r:binary') do |file|
        file.each_line do |line|
-          yield line.split ?,, 2
+          id, token = line.split ?,, 2
+          yield id.to_i, (token.chomp! || token).to_sym
        end
      end
    end
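
With this change the text file takes over the line parsing that Index::Bundle#retrieve used to do, and yields a typed pair instead of a raw split line. What one yielded line amounts to, with a hypothetical input line:

  line = "123456,some_nice_token\n"
  id, token = line.split ?,, 2
  [id.to_i, (token.chomp! || token).to_sym] # => [123456, :some_nice_token]

The chomp! || token guard matters because chomp! returns nil when there is nothing to remove, e.g. on a last line without a trailing newline.
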
data/lib/picky/loader.rb CHANGED
@@ -104,6 +104,10 @@ module Loader
  load_relative 'helpers/cache'
  load_relative 'helpers/measuring'
 
+  # Character Substitution
+  #
+  load_relative 'character_substitution/european'
+
  # Signal handling
  #
  load_relative 'signals'
@@ -111,7 +115,6 @@ module Loader
  # Various.
  #
  load_relative 'loggers/search'
-  load_relative 'umlaut_substituter'
 
  # Index generation strategies.
  #
@@ -180,9 +183,6 @@ module Loader
  load_relative 'tokenizers/index'
  load_relative 'tokenizers/query'
 
-  load_relative 'tokenizers/default/index'
-  load_relative 'tokenizers/default/query'
-
  # Query combinations, qualifiers, weigher.
  #
  load_relative 'query/combination'
data/lib/picky/query/base.rb CHANGED
@@ -16,7 +16,7 @@ module Query
      options = Hash === index_types.last ? index_types.pop : {}
      @index_types = index_types
      @weigher = options[:weigher] || Weigher.new(index_types)
-      @tokenizer = options[:tokenizer] || Tokenizers::Default::Query
+      @tokenizer = options[:tokenizer] || Tokenizers::Query.default
      @weights = options[:weights] || Weights.new
    end
 
data/lib/picky/query/tokens.rb CHANGED
@@ -6,11 +6,6 @@ module Query
  #
  class Tokens
 
-    #
-    #
-    cattr_accessor :maximum
-    self.maximum = 5
-
    # Basically delegates to its internal tokens array.
    #
    self.delegate *[Enumerable.instance_methods, :slice!, :[], :uniq!, :last, :reject!, :length, :size, :empty?, :each, :exit, { :to => :@tokens }].flatten
@@ -52,13 +47,11 @@ module Query
 
    # Caps the tokens to the maximum.
    #
-    # Note: We could parametrize this if necessary.
-    #
-    def cap
-      @tokens.slice!(@@maximum..-1) if cap?
+    def cap maximum
+      @tokens.slice!(maximum..-1) if cap?(maximum)
    end
-    def cap?
-      @tokens.size > @@maximum
+    def cap? maximum
+      @tokens.size > maximum
    end
 
    # Rejects blank tokens.
data/lib/picky/tokenizers/base.rb CHANGED
@@ -79,6 +79,19 @@ module Tokenizers
      text.gsub! @removes_characters_after_splitting_regexp, '' if @removes_characters_after_splitting_regexp
    end
 
+    # Substitute Characters with this substituter.
+    #
+    # Default is European Character substitution.
+    #
+    def substitutes_characters_with substituter = CharacterSubstitution::European.new
+      # TODO Raise if it doesn't quack substitute?
+      @substituter = substituter
+    end
+    def substitute_characters text
+      substituter?? substituter.substitute(text) : text
+    end
+
+
    # Returns a number of tokens, generated from the given text.
    #
    # Note:
@@ -93,15 +106,20 @@ module Tokenizers
      process tokens # processing tokens / strings
    end
 
-    attr_accessor :substituter
+    attr_reader :substituter
    alias substituter? substituter
 
-    def initialize substituter = UmlautSubstituter.new
-      @substituter = substituter
+    def initialize options = {}
+      removes_characters options[:removes_characters] if options[:removes_characters]
+      contracts_expressions *options[:contracts_expressions] if options[:contracts_expressions]
+      stopwords options[:stopwords] if options[:stopwords]
+      normalizes_words options[:normalizes_words] if options[:normalizes_words]
+      removes_characters_after_splitting options[:removes_characters_after_splitting] if options[:removes_characters_after_splitting]
+      substitutes_characters_with options[:substitutes_characters_with] if options[:substitutes_characters_with]
 
-      # TODO Default handling.
+      # Defaults.
      #
-      splits_text_on(/\s/)
+      splits_text_on options[:splits_text_on] || /\s/
    end
 
    # Hooks.
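
With the options-hash initializer, everything that previously had to be configured through separate method calls on a default instance can be passed at construction time; omitted keys simply leave the corresponding hook unset, except splits_text_on, which defaults to /\s/. A sketch, with example option values:

  tokenizer = Tokenizers::Index.new(
    removes_characters: /[^a-z0-9\s]/i,
    stopwords:          /\b(and|the|of)\b/,
    splits_text_on:     /[\s\/]/,
    substitutes_characters_with: CharacterSubstitution::European.new
  )
  tokenizer.tokenize 'The Löwe of Zürich'.dup # dup, since tokenizing mutates the text (cf. try.rake below)
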
data/lib/picky/tokenizers/index.rb CHANGED
@@ -5,6 +5,13 @@ module Tokenizers
  #
  class Index < Base
 
+    def self.default= new_default
+      @default = new_default
+    end
+    def self.default
+      @default ||= new
+    end
+
    # Default indexing preprocessing hook.
    #
    # Does:
@@ -15,7 +22,7 @@ module Tokenizers
    # 5. Remove non-single stopwords. (Stopwords that occur with other words)
    #
    def preprocess text
-      text = substituter.substitute text if substituter?
+      text = substitute_characters text
      text.downcase!
      remove_illegals text
      contract text
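
The class-level default accessor pattern (here and in Tokenizers::Query below) replaces the old Tokenizers::Default::Index/::Query constants: it lazily builds a plain instance, and default_indexing/default_querying swap it out at configuration time. Sketch:

  Tokenizers::Index.default # lazily built, a bare Tokenizers::Index.new
  Tokenizers::Index.default = Tokenizers::Index.new(splits_text_on: /[\s,]/)
  Tokenizers::Index.default # => the instance installed above
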
data/lib/picky/tokenizers/query.rb CHANGED
@@ -13,6 +13,20 @@ module Tokenizers
  #
  class Query < Base
 
+    def self.default= new_default
+      @default = new_default
+    end
+    def self.default
+      @default ||= new
+    end
+
+    attr_reader :maximum_tokens
+
+    def initialize options = {}
+      super options
+      @maximum_tokens = options[:maximum_tokens] || 5
+    end
+
    def preprocess text
      remove_illegals text # Remove illegal characters
      remove_non_single_stopwords text # remove stop words
@@ -33,9 +47,9 @@ module Tokenizers
    #
    def process tokens
      tokens.tokenize_with self
-      tokens.reject # Reject any tokens that don't meet criteria
-      tokens.cap # Cut off superfluous tokens
-      tokens.partialize_last # Set certain tokens as partial
+      tokens.reject # Reject any tokens that don't meet criteria
+      tokens.cap maximum_tokens # Cut off superfluous tokens
+      tokens.partialize_last # Set certain tokens as partial
      tokens
    end
 
@@ -44,10 +58,10 @@ module Tokenizers
    # TODO Perhaps move to Normalizer?
    #
    def normalize text
-      text = substituter.substitute text if substituter? # Substitute special characters TODO Move to subclass
-      text.downcase! # Downcase all text
-      normalize_with_patterns text # normalize
-      text.to_sym # symbolize
+      text = substitute_characters text # Substitute special characters TODO Move to subclass
+      text.downcase! # Downcase all text
+      normalize_with_patterns text # normalize
+      text.to_sym # symbolize
    end
 
    # Returns a token for a word.
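
Capping thus moves from the class-wide Query::Tokens.maximum (removed above) to a per-tokenizer maximum_tokens option, so different query tokenizers can cap at different lengths. Sketch:

  tokenizer = Tokenizers::Query.new maximum_tokens: 3
  tokenizer.maximum_tokens # => 3
  # during process, tokens are then capped via: tokens.cap maximum_tokens
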
data/lib/tasks/server.rake CHANGED
@@ -11,7 +11,7 @@ namespace :server do
    pid.blank? ? nil : pid.chomp
  end
 
-  desc "Start the unicorns. (Wehee!)"
+  # desc "Start the unicorns. (Wehee!)"
  task :start => :framework do
    chdir_to_root
    # Rake::Task[:"solr:start"].invoke # TODO Move to better place.
@@ -21,13 +21,13 @@ namespace :server do
    exec command
  end
 
-  desc "Stop the unicorns. (Blam!)"
+  # desc "Stop the unicorns. (Blam!)"
  task :stop => :framework do
    `kill -QUIT #{current_pid}` if current_pid
    # Rake::Task[:"solr:stop"].invoke # TODO Move to better place.
  end
 
-  desc "Restart the unicorns."
+  # desc "Restart the unicorns."
  task :restart do
    Rake::Task[:"server:stop"].invoke
    sleep 5
data/lib/tasks/shortcuts.rake CHANGED
@@ -1,20 +1,20 @@
-desc "Shortcut for index:generate."
+desc "Generate the index."
 task :index => :application do
   Rake::Task[:'index:generate'].invoke
 end
 
-desc "Shortcut for try:both"
+desc "Try the given text in the indexer/query (type:field optional)."
 task :try, [:text, :type_and_field] => :application do |_, options|
   text, type_and_field = options.text, options.type_and_field
 
   Rake::Task[:'try:both'].invoke text, type_and_field
 end
 
-desc "shortcut for server:start"
+desc "Start the server."
 task :start do
   Rake::Task[:'server:start'].invoke
 end
-desc "shortcut for server:stop"
+desc "Stop the server."
 task :stop do
   Rake::Task[:'server:stop'].invoke
 end
data/lib/tasks/spec.rake CHANGED
@@ -3,7 +3,7 @@ require 'spec/rake/spectask'
 
 task :default => :spec
 
-desc "Run all specs in spec directory (excluding plugin specs)"
+desc "Run all specs"
 Spec::Rake::SpecTask.new(:spec) do |t|
   spec_root = File.join(File.dirname(__FILE__), '..', '..', 'spec')
   t.spec_opts = ['--options', "\"#{File.join(spec_root, 'spec.opts')}\""]
data/lib/tasks/try.rake CHANGED
@@ -2,25 +2,23 @@
 #
 namespace :try do
 
-  desc "Try how a given word would be tokenized when indexing (type:field optional)."
+  # desc "Try how a given word would be tokenized when indexing (type:field optional)."
  task :index, [:text, :type_and_field] => :application do |_, options|
    text, type_and_field = options.text, options.type_and_field
 
-    tokenizer = type_and_field ? Indexes.find(*type_and_field.split(':')).tokenizer : Tokenizers::Default::Index
+    tokenizer = type_and_field ? Indexes.find(*type_and_field.split(':')).tokenizer : Tokenizers::Index.default
 
-    puts "\"#{text}\" is index tokenized as #{tokenizer.tokenize(text).to_a}"
+    puts "\"#{text}\" is index tokenized as #{tokenizer.tokenize(text.dup).to_a}"
  end
 
-  desc "Try how a given word would be tokenized when querying."
+  # desc "Try how a given word would be tokenized when querying."
  task :query, [:text] => :application do |_, options|
    text = options.text
 
-    # TODO Text is destroyed.
-    #
-    puts "\"#{text}\" is query tokenized as #{Tokenizers::Default::Query.tokenize(text.dup).to_a.map(&:to_s).map(&:to_sym)}"
+    puts "\"#{text}\" is query tokenized as #{Tokenizers::Query.default.tokenize(text.dup).to_a.map(&:to_s).map(&:to_sym)}"
  end
 
-  desc "Try the given text with both the index and the query (type:field optional)."
+  # desc "Try the given text with both the index and the query (type:field optional)."
  task :both, [:text, :type_and_field] => :application do |_, options|
    text, type_and_field = options.text, options.type_and_field
 
data/project_prototype/Gemfile CHANGED
@@ -2,8 +2,9 @@ source :gemcutter
 
 # Gems required by Picky.
 #
-gem 'picky', '~> 0.9.0'
-gem 'bundler', '>= 0.9.26'
+gem 'picky', '~> 0.10.0'
+gem 'rake'
+gem 'bundler'
 gem 'rack', '~> 1.2.1'
 gem 'rack-mount', '~> 0.6.9'
 gem 'text', '~> 0.2.0'
data/project_prototype/app/application.rb CHANGED
@@ -1,58 +1,46 @@
 # encoding: utf-8
 #
+# TODO Adapt the generated example
+# (a library books finder) to what you need.
+#
+# Check the Wiki http://github.com/floere/picky/wiki for more options.
+# Ask me or the google group if you have questions or specific requests.
+#
 class PickySearch < Application
 
-  # TODO Adapt the generated example
-  # (a library books finder) to what you need.
-  #
-  # Check the Wiki http://github.com/floere/picky/wiki for more options.
+  # Indexing: How text is indexed.
+  # Querying: How query text is handled.
+  #
+  default_indexing removes_characters: /[^a-zA-Z0-9\s\/\-\"\&\.]/,
+                   stopwords: /\b(and|the|of|it|in|for)\b/,
+                   splits_text_on: /[\s\/\-\"\&\.]/
+
+  default_querying removes_characters: /[^a-zA-Z0-9\s\/\-\,\&\"\~\*\:]/, # Picky needs control chars *"~: to pass through.
+                   stopwords: /\b(and|the|of|it|in|for)\b/,
+                   splits_text_on: /[\s\/\-\,\&]+/,
+
+                   maximum_tokens: 5, # Max amount of tokens passing into a query. 5 is the default.
+                   substitutes_characters_with: CharacterSubstitution::European.new # Normalizes special user input, Ä -> Ae, ñ -> n etc.
+
+  # Define an index. Use a database etc. source? http://github.com/floere/picky/wiki/Sources-Configuration#sources
  #
-  # Ask me or the google group if you have questions or specific requests.
-  #
-
-  indexing.removes_characters(/[^a-zA-Z0-9\s\/\-\"\&\.]/)
-  indexing.stopwords(/\b(and|the|of|it|in|for)\b/)
-  indexing.splits_text_on(/[\s\/\-\"\&\.]/)
-
  books_index = index :books,
-                      Sources::CSV.new(:title, :author, :isbn, :year, :publisher, :subjects, :file => 'app/library.csv'),
-                      # Use a database as source:
-                      # Sources::DB.new('SELECT id, title, author, isbn13 as isbn FROM books', :file => 'app/db.yml'),
-                      # Or delicious:
-                      # Sources::Delicious.new('username', 'password'), # offers title, tags, url fields.
-                      field(:title,
-                            :partial => Partial::Substring.new(:from => 1), # Index substrings upwards from character 1 (default: -3),
-                                                                            # e.g. picky -> p, pi, pic, pick, picky
-                                                                            # Like this, you'll find picky even when entering just a "p".
-                            :similarity => Similarity::DoubleLevenshtone.new(3)), # Up to three similar title word indexed (default: no similarity).
-                      field(:author, :partial => Partial::Substring.new(:from => 1)),
-                      field(:isbn, :partial => Partial::None.new) # Partial substring searching on an ISBN makes not much sense, neither does similarity.
-
-  # Defines the maximum tokens (words) that pass through to the engine.
-  #
-  querying.maximum_tokens 5
+                      Sources::CSV.new(:title, :author, :isbn, :year, :publisher, :subjects, file: 'app/library.csv'),
+                      category(:title,
+                               partial: Partial::Substring.new(from: 1), # Indexes substrings upwards from character 1 (default: -3),
+                                                                         # You'll find "picky" even when entering just a "p".
+                               similarity: Similarity::DoubleLevenshtone.new(3)), # Up to three similar title word indexed (default: No similarity).
+                      category(:author,
+                               partial: Partial::Substring.new(from: 1)),
+                      category(:isbn,
+                               partial: Partial::None.new) # Partial substring searching on an ISBN makes not much sense, neither does similarity.
 
-  # Note that Picky needs the following characters to
-  # pass through, as they are control characters: *"~:
-  #
-  querying.removes_characters(/[^a-zA-Z0-9\s\/\-\,\&\"\~\*\:]/)
-  querying.stopwords(/\b(and|the|of|it|in|for)\b/)
-  querying.splits_text_on(/[\s\/\-\,\&]+/)
+  full_books = Query::Full.new books_index # A Full query returns ids, combinations, and counts.
+  live_books = Query::Live.new books_index # A Live query does return all that Full returns, except ids.
 
-  # The example defines two queries that use the same index(es).
-  #
-  # A Full query returns ids, combinations, and counts.
-  # A Live query does return all that Full returns, without ids.
-  #
-  # Note: You can pass a query multiple indexes and it will combine them.
-  #
-  full_books = Query::Full.new books_index
-  live_books = Query::Live.new books_index
+  route %r{\A/books/full\Z} => full_books # Routing is simple: url_path_regexp => query
+  route %r{\A/books/live\Z} => live_books #
 
-  # Routing is simple.
-  # A path regexp pointing to a query that will be run.
-  #
-  route %r{^/books/full} => full_books
-  route %r{^/books/live} => live_books
+  # Note: You can pass a query multiple indexes and it will query in all of them.
 
 end
data/spec/lib/application_spec.rb CHANGED
@@ -5,32 +5,49 @@ require 'spec_helper'
 describe Application do
 
   describe "integration" do
+    it "should run ok" do
+      lambda {
+        class MinimalTestApplication < Application
+          books = index :books,
+                        Sources::DB.new('SELECT id, title FROM books', :file => 'app/db.yml'),
+                        category(:title)
+
+
+          full = Query::Full.new books
+          live = Query::Live.new books
+
+          route %r{^/books/full} => full
+          route %r{^/books/live} => live
+        end
+        Tokenizers::Index.default.tokenize 'some text'
+        Tokenizers::Query.default.tokenize 'some text'
+      }.should_not raise_error
+    end
    it "should run ok" do
      lambda {
        # TODO Add all possible cases.
        #
        class TestApplication < Application
-          indexing.removes_characters(/[^a-zA-Z0-9\s\/\-\"\&\.]/)
-          indexing.contracts_expressions(/mr\.\s*|mister\s*/i, 'mr ')
-          indexing.stopwords(/\b(and|the|of|it|in|for)\b/)
-          indexing.splits_text_on(/[\s\/\-\"\&\.]/)
-          indexing.removes_characters_after_splitting(/[\.]/)
+          default_indexing removes_characters: /[^a-zA-Z0-9\s\/\-\"\&\.]/,
+                           contracts_expressions: [/mr\.\s*|mister\s*/i, 'mr '],
+                           stopwords: /\b(and|the|of|it|in|for)\b/,
+                           splits_text_on: /[\s\/\-\"\&\.]/,
+                           removes_characters_after_splitting: /[\.]/
+
+          default_querying removes_characters: /[^a-zA-Z0-9äöü\s\/\-\,\&\"\~\*\:]/,
+                           stopwords: /\b(and|the|of|it|in|for)\b/,
+                           splits_text_on: /[\s\/\-\,\&]+/,
+                           normalizes_words: [[/Deoxyribonucleic Acid/i, 'DNA']],
+
+                           substitutes_characters_with: CharacterSubstitution::European.new,
+                           maximum_tokens: 5
 
-          books_index = index Sources::DB.new('SELECT id, title, author, isbn13 as isbn FROM books', :file => 'app/db.yml'),
-                              field(:title, :similarity => Similarity::DoubleLevenshtone.new(3)), # Up to three similar title word indexed.
-                              field(:author),
-                              field(:isbn, :partial => Partial::None.new) # Partially searching on an ISBN makes not much sense.
+          books_index = index :books,
+                              Sources::DB.new('SELECT id, title, author, isbn13 as isbn FROM books', :file => 'app/db.yml'),
+                              category(:title, :similarity => Similarity::DoubleLevenshtone.new(3)), # Up to three similar title word indexed.
+                              category(:author),
+                              category(:isbn, :partial => Partial::None.new) # Partially searching on an ISBN makes not much sense.
 
-          # Note that Picky needs the following characters to
-          # pass through, as they are control characters: *"~:
-          #
-          querying.removes_characters(/[^a-zA-Z0-9äöü\s\/\-\,\&\"\~\*\:]/)
-          querying.stopwords(/\b(and|the|of|it|in|for)\b/)
-          querying.splits_text_on(/[\s\/\-\,\&]+/)
-          querying.normalizes_words([
-            [/Deoxyribonucleic Acid/i, 'DNA']
-          ])
-          querying.maximum_tokens 5
 
          full = Query::Full.new books_index
          live = Query::Live.new books_index
data/spec/lib/{umlaut_substituter_spec.rb → character_substitution/european_spec.rb} RENAMED
@@ -1,9 +1,10 @@
 # encoding: utf-8
+#
 require 'spec_helper'
 
-describe UmlautSubstituter do
+describe CharacterSubstitution do
   before(:each) do
-    @substituter = UmlautSubstituter.new
+    @substituter = CharacterSubstitution::European.new
   end
 
   # A bit of metaprogramming to help with the myriads of its.
@@ -82,5 +83,16 @@ describe UmlautSubstituter do
     it_should_substitute 'å', 'a'
     it_should_substitute 'Å', 'A'
   end
+
+  describe "diacritic" do
+    it_should_substitute 'ñ', 'n'
+  end
+
+  describe "speed" do
+    it "is fast" do
+      result = performance_of { @substituter.substitute('ä') }
+      result.should < 0.00009
+    end
+  end
 
 end
data/spec/lib/configuration/field_spec.rb CHANGED
@@ -5,7 +5,7 @@ describe Configuration::Field do
  describe "virtual?" do
    context "with virtual true" do
      before(:each) do
-        @field = Configuration::Field.new :some_name, :some_tokenizer, :virtual => true
+        @field = Configuration::Field.new :some_name, :virtual => true
      end
      it "returns the right value" do
        @field.virtual?.should == true
@@ -13,7 +13,7 @@ describe Configuration::Field do
    end
    context "with virtual object" do
      before(:each) do
-        @field = Configuration::Field.new :some_name, :some_tokenizer, :virtual => 123.6
+        @field = Configuration::Field.new :some_name, :virtual => 123.6
      end
      it "returns the right value" do
        @field.virtual?.should == true
@@ -39,7 +39,7 @@ describe Configuration::Field do
  describe "tokenizer" do
    context "with specific tokenizer" do
      before(:each) do
-        @field = Configuration::Field.new :some_name, Tokenizers::Index.new
+        @field = Configuration::Field.new :some_name, tokenizer: Tokenizers::Index.new
 
        @field.type = :some_type
      end
@@ -54,7 +54,7 @@ describe Configuration::Field do
  describe "indexer" do
    context "with default indexer" do
      before(:each) do
-        @field = Configuration::Field.new :some_name, :some_tokenizer
+        @field = Configuration::Field.new :some_name
      end
      it "caches" do
        @field.indexer.should == @field.indexer
@@ -62,7 +62,7 @@ describe Configuration::Field do
    end
    context "with specific indexer" do
      before(:each) do
-        @field = Configuration::Field.new :some_name, :indexer => Indexers::Default
+        @field = Configuration::Field.new :some_name, tokenizer: Indexers::Default
 
        @field.type = :some_type
      end
@@ -81,7 +81,7 @@ describe Configuration::Field do
  end
  describe "cache" do
    before(:each) do
-      @field = Configuration::Field.new :some_name, :some_tokenizer
+      @field = Configuration::Field.new :some_name
      @field.stub! :prepare_cache_directory
 
      @generated = stub :generated, :generate_caches => nil
@@ -100,7 +100,7 @@ describe Configuration::Field do
  end
  describe "prepare_cache_directory" do
    before(:each) do
-      @field = Configuration::Field.new :some_name, :some_tokenizer
+      @field = Configuration::Field.new :some_name
 
      @field.stub! :cache_directory => :some_cache_directory
    end
@@ -112,7 +112,7 @@ describe Configuration::Field do
  end
  describe "index" do
    before(:each) do
-      @field = Configuration::Field.new :some_name, :some_tokenizer
+      @field = Configuration::Field.new :some_name
      @field.stub! :prepare_cache_directory
 
      @indexer = stub :indexer, :index => nil
@@ -132,7 +132,7 @@ describe Configuration::Field do
  describe "source" do
    context "with source" do
      before(:each) do
-        @field = Configuration::Field.new :some_name, :some_tokenizer, :source => :some_given_source
+        @field = Configuration::Field.new :some_name, :source => :some_given_source
 
        @type = stub :type, :name => :some_type
        @field.type = @type
@@ -143,7 +143,7 @@ describe Configuration::Field do
    end
    context "without source" do
      before(:each) do
-        @field = Configuration::Field.new :some_name, :some_tokenizer
+        @field = Configuration::Field.new :some_name
 
        @type = stub :type, :name => :some_type, :source => :some_type_source
        @field.type = @type
@@ -155,7 +155,7 @@ describe Configuration::Field do
  end
  context "name symbol" do
    before(:each) do
-      @field = Configuration::Field.new :some_name, :some_tokenizer
+      @field = Configuration::Field.new :some_name
 
      @type = stub :type, :name => :some_type
      @field.type = @type
@@ -189,7 +189,7 @@ describe Configuration::Field do
  end
  context "name string" do
    before(:each) do
-      @field = Configuration::Field.new 'some_name', :some_tokenizer
+      @field = Configuration::Field.new 'some_name'
    end
    describe "generate_qualifiers_from" do
      context "without qualifiers" do
data/spec/lib/configuration/indexes_spec.rb CHANGED
@@ -18,47 +18,10 @@ describe Configuration::Indexes do
 
  describe "default_tokenizer" do
    it "is a default tokenizer" do
-      @config.default_tokenizer.should == Tokenizers::Default::Index
+      @config.default_tokenizer.should be_kind_of(Tokenizers::Index)
    end
-    it "caches" do
-      @config.default_tokenizer.should == @config.default_tokenizer
-    end
-  end
-
-  describe "delegates" do
-    before(:each) do
-      @receiver = mock :receiver
-      @config.stub! :default_tokenizer => @receiver
-    end
-    it "delegates" do
-      @receiver.should_receive(:removes_characters).once
-
-      @config.removes_characters
-    end
-    it "delegates" do
-      @receiver.should_receive(:contracts_expressions).once
-
-      @config.contracts_expressions
-    end
-    it "delegates" do
-      @receiver.should_receive(:stopwords).once
-
-      @config.stopwords
-    end
-    it "delegates" do
-      @receiver.should_receive(:splits_text_on).once
-
-      @config.splits_text_on
-    end
-    it "delegates" do
-      @receiver.should_receive(:normalizes_words).once
-
-      @config.normalizes_words
-    end
-    it "delegates" do
-      @receiver.should_receive(:removes_characters_after_splitting).once
-
-      @config.removes_characters_after_splitting
+    it "does not cache" do
+      @config.default_tokenizer.should_not == @config.default_tokenizer
    end
  end
 
data/spec/lib/extensions/array_spec.rb CHANGED
@@ -51,7 +51,7 @@ describe Array do
      [:test1, :test1, :test2, :test2, :test3].clustered_uniq.should == [:test1, :test2, :test3]
    end
    it "is fast" do
-      performance_of { [:test1, :test1, :test2, :test2, :test3].clustered_uniq }.should < 0.00001
+      performance_of { [:test1, :test1, :test2, :test2, :test3].clustered_uniq }.should < 0.000012
    end
  end
 
data/spec/lib/extensions/hash_spec.rb CHANGED
@@ -65,7 +65,7 @@ describe Hash do
      lambda { @obj.to_json(:some => :option) }.should_not raise_error
    end
    it "should be fast" do
-      performance_of { @obj.to_json }.should < 0.00006
+      performance_of { @obj.to_json }.should < 0.000065
    end
  end
 
data/spec/lib/index/file/text_spec.rb CHANGED
@@ -21,7 +21,20 @@ describe Index::File::Text do
    end
  end
  describe "retrieve" do
-    it
+    before(:each) do
+      @io = stub :io
+      @io.should_receive(:each_line).once.with.and_yield '123456,some_nice_token'
+      File.should_receive(:open).any_number_of_times.and_yield @io
+    end
+    it "yields split lines and returns the id and token text" do
+      @file.retrieve do |id, token|
+        id.should == 123456
+        token.should == :some_nice_token
+      end
+    end
+    it "is fast" do
+      performance_of { @file.retrieve { |id, token| } }.should < 0.00005
+    end
  end
 
 end
data/spec/lib/query/combination_spec.rb CHANGED
@@ -46,7 +46,7 @@ describe 'Query::Combination' do
      @combination = Query::Combination.new token, @category
    end
    it 'should return a correct result' do
-      @combination.to_result.should == [:some_category_name, 'Blä~', :blae]
+      @combination.to_result.should == [:some_category_name, 'Blä~', :blä] # Note: Characters not substituted. That's ok.
    end
  end
  it 'should return a correct result' do
data/spec/lib/query/tokens_spec.rb CHANGED
@@ -36,31 +36,26 @@ describe Query::Tokens do
      @tokens.instance_variable_get(:@tokens).should == [@nonblank, @nonblank]
    end
  end
-
-  describe "class variables" do
-    describe "maximal query words" do
-      it "should answer" do
-        lambda { Query::Tokens.maximum }.should_not raise_error
-      end
-    end
-  end
-
+
  describe 'cap' do
    context 'one token' do
      before(:each) do
        @token = Query::Token.processed 'Token'
        @tokens = Query::Tokens.new [@token]
      end
-      it 'should not cut it down' do
-        @tokens.cap
-
+      it 'does not cut it down' do
+        @tokens.cap 5
+
        @tokens.instance_variable_get(:@tokens).should == [@token]
      end
+      it 'cuts it down' do
+        @tokens.cap 0
+
+        @tokens.instance_variable_get(:@tokens).should == []
+      end
    end
    context 'many tokens' do
      before(:each) do
-        @old_maximum = Query::Tokens.maximum
-        Query::Tokens.maximum = 3
        @first = Query::Token.processed 'Hello'
        @second = Query::Token.processed 'I'
        @third = Query::Token.processed 'Am'
@@ -72,12 +67,9 @@ describe Query::Tokens do
          Query::Token.processed('Token')
        ]
      end
-      after(:each) do
-        Query::Tokens.maximum = @old_maximum
-      end
      it 'should cap the number of tokens' do
-        @tokens.cap
-
+        @tokens.cap 3
+
        @tokens.instance_variable_get(:@tokens).should == [@first, @second, @third]
      end
    end
data/spec/lib/tokenizers/base_spec.rb CHANGED
@@ -1,4 +1,5 @@
-# coding: utf-8
+# encoding: utf-8
+#
 require 'spec_helper'
 
 describe Tokenizers::Base do
@@ -7,6 +8,22 @@ describe Tokenizers::Base do
    @tokenizer = Tokenizers::Base.new
  end
 
+  describe "substitute(s)_characters*" do
+    it "doesn't substitute if there is no substituter" do
+      @tokenizer.substitute_characters('abcdefghijklmnopqrstuvwxyzäöü').should == 'abcdefghijklmnopqrstuvwxyzäöü'
+    end
+    it "uses the substituter to replace characters" do
+      @tokenizer.substitutes_characters_with CharacterSubstitution::European.new
+
+      @tokenizer.substitute_characters('abcdefghijklmnopqrstuvwxyzäöü').should == 'abcdefghijklmnopqrstuvwxyzaeoeue'
+    end
+    it "uses the european substituter as default" do
+      @tokenizer.substitutes_characters_with
+
+      @tokenizer.substitute_characters('abcdefghijklmnopqrstuvwxyzäöü').should == 'abcdefghijklmnopqrstuvwxyzaeoeue'
+    end
+  end
+
  describe "removes_characters_after_splitting" do
    context "without removes_characters_after_splitting called" do
      it "has remove_after_normalizing_illegals" do
@@ -64,6 +81,9 @@ describe Tokenizers::Base do
    it 'should define a method split that splits by default on \s' do
      @tokenizer.split('a b/c.d').should == ['a', 'b/c.d']
    end
+    it 'splits text on /\s/ by default' do
+      @tokenizer.split('this is a test').should == ['this', 'is', 'a', 'test']
+    end
  end
  context "with removes_characters called" do
    before(:each) do
data/spec/lib/tokenizers/index_spec.rb CHANGED
@@ -8,6 +8,29 @@ describe Tokenizers::Index do
    @tokenizer = Tokenizers::Index.new
  end
 
+  describe "default*" do
+    before(:all) do
+      @old = Tokenizers::Index.default
+    end
+    after(:all) do
+      Tokenizers::Index.default = @old
+    end
+    it "has a reader" do
+      lambda { Tokenizers::Index.default }.should_not raise_error
+    end
+    it "returns by default a new Index" do
+      Tokenizers::Index.default.should be_kind_of(Tokenizers::Index)
+    end
+    it "has a writer" do
+      lambda { Tokenizers::Index.default = :bla }.should_not raise_error
+    end
+    it "returns what has been written, if something has been written" do
+      Tokenizers::Index.default = :some_default
+
+      Tokenizers::Index.default.should == :some_default
+    end
+  end
+
  describe "remove_removes_characters" do
    it "should not remove ' from a query by default" do
      @tokenizer.remove_illegals("Lugi's").should == "Lugi's"
data/spec/lib/tokenizers/query_spec.rb CHANGED
@@ -7,6 +7,38 @@ describe Tokenizers::Query do
    @tokenizer = Tokenizers::Query.new
  end
 
+  describe "default*" do
+    before(:all) do
+      @old = Tokenizers::Query.default
+    end
+    after(:all) do
+      Tokenizers::Query.default = @old
+    end
+    it "has a reader" do
+      lambda { Tokenizers::Query.default }.should_not raise_error
+    end
+    it "returns by default a new Index" do
+      Tokenizers::Query.default.should be_kind_of(Tokenizers::Query)
+    end
+    it "has a writer" do
+      lambda { Tokenizers::Query.default = :bla }.should_not raise_error
+    end
+    it "returns what has been written, if something has been written" do
+      Tokenizers::Query.default = :some_default
+
+      Tokenizers::Query.default.should == :some_default
+    end
+  end
+
+  describe "maximum_tokens" do
+    it "should be set to 5 by default" do
+      @tokenizer.maximum_tokens.should == 5
+    end
+    it "should be settable" do
+      Tokenizers::Query.new(maximum_tokens: 3).maximum_tokens.should == 3
+    end
+  end
+
  describe 'preprocess' do
    it 'should call methods in order' do
      text = stub :text
metadata CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
 prerelease: false
 segments:
 - 0
-- 9
-- 4
-version: 0.9.4
+- 10
+- 0
+version: 0.10.0
 platform: ruby
 authors:
 - Florian Hanke
@@ -14,7 +14,7 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2010-10-30 00:00:00 +02:00
+date: 2010-10-31 00:00:00 +02:00
 default_executable: picky
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -59,6 +59,7 @@ files:
 - lib/picky/cacher/weights/logarithmic.rb
 - lib/picky/cacher/weights/strategy.rb
 - lib/picky/cacher/weights_generator.rb
+- lib/picky/character_substitution/european.rb
 - lib/picky/configuration/field.rb
 - lib/picky/configuration/indexes.rb
 - lib/picky/configuration/queries.rb
@@ -118,11 +119,8 @@ files:
 - lib/picky/sources/db.rb
 - lib/picky/sources/delicious.rb
 - lib/picky/tokenizers/base.rb
-- lib/picky/tokenizers/default/index.rb
-- lib/picky/tokenizers/default/query.rb
 - lib/picky/tokenizers/index.rb
 - lib/picky/tokenizers/query.rb
-- lib/picky/umlaut_substituter.rb
 - lib/picky-tasks.rb
 - lib/picky.rb
 - lib/tasks/application.rake
@@ -161,6 +159,7 @@ files:
 - spec/lib/cacher/similarity_generator_spec.rb
 - spec/lib/cacher/weights/logarithmic_spec.rb
 - spec/lib/cacher/weights_generator_spec.rb
+- spec/lib/character_substitution/european_spec.rb
 - spec/lib/configuration/field_spec.rb
 - spec/lib/configuration/indexes_spec.rb
 - spec/lib/configuration/type_spec.rb
@@ -210,11 +209,8 @@ files:
 - spec/lib/sources/db_spec.rb
 - spec/lib/sources/delicious_spec.rb
 - spec/lib/tokenizers/base_spec.rb
-- spec/lib/tokenizers/default/index_spec.rb
-- spec/lib/tokenizers/default/query_spec.rb
 - spec/lib/tokenizers/index_spec.rb
 - spec/lib/tokenizers/query_spec.rb
-- spec/lib/umlaut_substituter_spec.rb
 - spec/specific/speed_spec.rb
 - bin/picky
 has_rdoc: true
@@ -261,6 +257,7 @@ test_files:
 - spec/lib/cacher/similarity_generator_spec.rb
 - spec/lib/cacher/weights/logarithmic_spec.rb
 - spec/lib/cacher/weights_generator_spec.rb
+- spec/lib/character_substitution/european_spec.rb
 - spec/lib/configuration/field_spec.rb
 - spec/lib/configuration/indexes_spec.rb
 - spec/lib/configuration/type_spec.rb
@@ -310,9 +307,6 @@ test_files:
 - spec/lib/sources/db_spec.rb
 - spec/lib/sources/delicious_spec.rb
 - spec/lib/tokenizers/base_spec.rb
-- spec/lib/tokenizers/default/index_spec.rb
-- spec/lib/tokenizers/default/query_spec.rb
 - spec/lib/tokenizers/index_spec.rb
 - spec/lib/tokenizers/query_spec.rb
-- spec/lib/umlaut_substituter_spec.rb
 - spec/specific/speed_spec.rb
data/lib/picky/tokenizers/default/index.rb DELETED
@@ -1,7 +0,0 @@
-module Tokenizers
-  module Default
-    # Default is always an instance.
-    #
-    Index = ::Tokenizers::Index.new
-  end
-end
data/lib/picky/tokenizers/default/query.rb DELETED
@@ -1,7 +0,0 @@
-module Tokenizers
-  module Default
-    # Default is always an instance.
-    #
-    Query = ::Tokenizers::Query.new
-  end
-end
data/lib/picky/umlaut_substituter.rb DELETED
@@ -1,34 +0,0 @@
-# encoding: utf-8
-#
-
-# Substitutes certain umlauts, like
-# ä, ö, ü => ae, oe, ue.
-# (and more, see specs)
-#
-class UmlautSubstituter
-
-  attr_reader :chars
-
-  def initialize
-    @chars = ActiveSupport::Multibyte.proxy_class
-  end
-
-  def substitute text
-    trans = chars.new(text).normalize(:kd)
-
-    # substitute special cases
-    #
-    trans.gsub!('ß', 'ss')
-
-    # substitute umlauts (of A,O,U,a,o,u)
-    #
-    trans.gsub!(/([AOUaou])\314\210/u, '\1e')
-
-    # get rid of ecutes, graves and …
-    #
-    trans.unpack('U*').select { |cp|
-      cp < 0x0300 || cp > 0x035F
-    }.pack('U*')
-  end
-
-end
data/spec/lib/tokenizers/default/index_spec.rb DELETED
@@ -1,11 +0,0 @@
-# encoding: utf-8
-#
-require 'spec_helper'
-
-describe Tokenizers::Default::Index do
-
-  it "is an instance of the index tokenizer" do
-    Tokenizers::Default::Index.should be_kind_of(Tokenizers::Index)
-  end
-
-end
data/spec/lib/tokenizers/default/query_spec.rb DELETED
@@ -1,11 +0,0 @@
-# encoding: utf-8
-#
-require 'spec_helper'
-
-describe Tokenizers::Default::Query do
-
-  it "is an instance of the index tokenizer" do
-    Tokenizers::Default::Query.should be_kind_of(Tokenizers::Query)
-  end
-
-end