picky 0.9.4 → 0.10.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/picky/application.rb +42 -29
- data/lib/picky/character_substitution/european.rb +33 -0
- data/lib/picky/configuration/field.rb +3 -3
- data/lib/picky/configuration/indexes.rb +4 -10
- data/lib/picky/configuration/queries.rb +2 -10
- data/lib/picky/index/bundle.rb +2 -7
- data/lib/picky/index/file/text.rb +6 -1
- data/lib/picky/loader.rb +4 -4
- data/lib/picky/query/base.rb +1 -1
- data/lib/picky/query/tokens.rb +4 -11
- data/lib/picky/tokenizers/base.rb +23 -5
- data/lib/picky/tokenizers/index.rb +8 -1
- data/lib/picky/tokenizers/query.rb +21 -7
- data/lib/tasks/server.rake +3 -3
- data/lib/tasks/shortcuts.rake +4 -4
- data/lib/tasks/spec.rake +1 -1
- data/lib/tasks/try.rake +6 -8
- data/project_prototype/Gemfile +3 -2
- data/project_prototype/app/application.rb +35 -47
- data/spec/lib/application_spec.rb +36 -19
- data/spec/lib/{umlaut_substituter_spec.rb → character_substitution/european_spec.rb} +14 -2
- data/spec/lib/configuration/field_spec.rb +12 -12
- data/spec/lib/configuration/indexes_spec.rb +3 -40
- data/spec/lib/extensions/array_spec.rb +1 -1
- data/spec/lib/extensions/hash_spec.rb +1 -1
- data/spec/lib/index/file/text_spec.rb +14 -1
- data/spec/lib/query/combination_spec.rb +1 -1
- data/spec/lib/query/tokens_spec.rb +11 -19
- data/spec/lib/tokenizers/base_spec.rb +21 -1
- data/spec/lib/tokenizers/index_spec.rb +23 -0
- data/spec/lib/tokenizers/query_spec.rb +32 -0
- metadata +7 -13
- data/lib/picky/tokenizers/default/index.rb +0 -7
- data/lib/picky/tokenizers/default/query.rb +0 -7
- data/lib/picky/umlaut_substituter.rb +0 -34
- data/spec/lib/tokenizers/default/index_spec.rb +0 -11
- data/spec/lib/tokenizers/default/query_spec.rb +0 -11
data/lib/picky/application.rb
CHANGED
@@ -1,55 +1,68 @@
|
|
1
1
|
# The Picky application wherein the indexing and querying is defined.
|
2
2
|
#
|
3
3
|
class Application
|
4
|
+
|
4
5
|
class << self
|
5
6
|
|
6
|
-
|
7
|
+
# Returns a configured tokenizer that
|
8
|
+
# is used for indexing by default.
|
9
|
+
#
|
10
|
+
def default_indexing options = {}
|
11
|
+
indexing.default_tokenizer options
|
12
|
+
end
|
7
13
|
|
8
|
-
#
|
9
|
-
#
|
14
|
+
# Returns a configured tokenizer that
|
15
|
+
# is used for querying by default.
|
16
|
+
#
|
17
|
+
def default_querying options = {}
|
18
|
+
querying.default_tokenizer options
|
19
|
+
end
|
20
|
+
|
21
|
+
# Routes.
|
10
22
|
#
|
11
|
-
|
23
|
+
delegate :route, :root, :to => :routing
|
24
|
+
# Index, Field.
|
12
25
|
#
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
end
|
17
|
-
def
|
18
|
-
@apps.each &:finalize
|
19
|
-
end
|
26
|
+
# TODO Rename category.
|
27
|
+
#
|
28
|
+
delegate :field, :to => :indexing
|
29
|
+
def category *args; indexing.field *args; end
|
30
|
+
def index *args; indexing.define_index *args; end
|
20
31
|
|
21
32
|
# An application simply delegates to the routing to handle a request.
|
22
33
|
#
|
23
34
|
def call env
|
24
35
|
routing.call env
|
25
36
|
end
|
26
|
-
|
27
|
-
# Freezes the routes.
|
28
|
-
#
|
29
|
-
def finalize
|
30
|
-
routing.freeze
|
31
|
-
end
|
32
37
|
def routing
|
33
38
|
@routing ||= Routing.new
|
34
39
|
end
|
35
|
-
# Routes.
|
36
|
-
#
|
37
|
-
delegate :route, :root, :to => :routing
|
38
|
-
|
39
|
-
# TODO Rename to default_indexing?
|
40
|
-
#
|
41
40
|
def indexing
|
42
41
|
@indexing ||= Configuration::Indexes.new
|
43
42
|
end
|
44
|
-
def
|
45
|
-
|
43
|
+
def querying
|
44
|
+
@queries ||= Configuration::Queries.new
|
46
45
|
end
|
47
|
-
delegate :type, :field, :to => :indexing
|
48
46
|
|
49
|
-
#
|
47
|
+
# Finalize the subclass as soon as it
|
48
|
+
# has finished loading.
|
50
49
|
#
|
51
|
-
|
52
|
-
|
50
|
+
attr_reader :apps
|
51
|
+
def initialize_apps
|
52
|
+
@apps ||= []
|
53
|
+
end
|
54
|
+
def inherited app
|
55
|
+
initialize_apps
|
56
|
+
apps << app
|
57
|
+
end
|
58
|
+
def finalize_apps
|
59
|
+
initialize_apps
|
60
|
+
apps.each &:finalize
|
61
|
+
end
|
62
|
+
# Finalizes the routes.
|
63
|
+
#
|
64
|
+
def finalize
|
65
|
+
routing.freeze
|
53
66
|
end
|
54
67
|
|
55
68
|
# TODO Add more info.
|
@@ -0,0 +1,33 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
#
|
3
|
+
module CharacterSubstitution
|
4
|
+
# Substitutes Umlauts like
|
5
|
+
# ä, ö, ü => ae, oe, ue.
|
6
|
+
# (and more, see specs)
|
7
|
+
#
|
8
|
+
class European
|
9
|
+
|
10
|
+
def initialize
|
11
|
+
@chars = ActiveSupport::Multibyte.proxy_class
|
12
|
+
end
|
13
|
+
|
14
|
+
def substitute text
|
15
|
+
trans = @chars.new(text).normalize(:kd)
|
16
|
+
|
17
|
+
# substitute special cases
|
18
|
+
#
|
19
|
+
trans.gsub!('ß', 'ss')
|
20
|
+
|
21
|
+
# substitute umlauts (of A,O,U,a,o,u)
|
22
|
+
#
|
23
|
+
trans.gsub!(/([AOUaou])\314\210/u, '\1e')
|
24
|
+
|
25
|
+
# get rid of acutes, graves and …
|
26
|
+
#
|
27
|
+
trans.unpack('U*').select { |cp|
|
28
|
+
cp < 0x0300 || cp > 0x035F
|
29
|
+
}.pack('U*')
|
30
|
+
end
|
31
|
+
|
32
|
+
end
|
33
|
+
end
|
@@ -5,10 +5,10 @@ module Configuration
|
|
5
5
|
#
|
6
6
|
class Field
|
7
7
|
attr_reader :name, :indexed_name, :virtual, :tokenizer
|
8
|
-
attr_accessor :type # convenience
|
9
|
-
def initialize name,
|
8
|
+
attr_accessor :type # convenience TODO Still needed?
|
9
|
+
def initialize name, options = {}
|
10
10
|
@name = name.to_sym
|
11
|
-
@tokenizer = tokenizer
|
11
|
+
@tokenizer = options[:tokenizer] || Tokenizers::Index.default
|
12
12
|
|
13
13
|
# TODO Dup the options?
|
14
14
|
|
@@ -10,17 +10,13 @@ module Configuration
|
|
10
10
|
@types = []
|
11
11
|
end
|
12
12
|
|
13
|
-
def default_tokenizer
|
14
|
-
|
13
|
+
def default_tokenizer options = {}
|
14
|
+
Tokenizers::Index.default = Tokenizers::Index.new(options)
|
15
15
|
end
|
16
16
|
|
17
|
-
# Delegates
|
18
|
-
#
|
19
|
-
delegate :removes_characters, :contracts_expressions, :stopwords, :splits_text_on, :normalizes_words, :removes_characters_after_splitting, :to => :default_tokenizer
|
20
|
-
|
21
17
|
# TODO Rewrite all this configuration handling.
|
22
18
|
#
|
23
|
-
def
|
19
|
+
def define_index name, source, *fields
|
24
20
|
new_type = Type.new name, source, *fields
|
25
21
|
types << new_type
|
26
22
|
::Indexes.configuration ||= self
|
@@ -30,9 +26,7 @@ module Configuration
|
|
30
26
|
generated
|
31
27
|
end
|
32
28
|
def field name, options = {}
|
33
|
-
|
34
|
-
|
35
|
-
Field.new name, tokenizer, options
|
29
|
+
Field.new name, options
|
36
30
|
end
|
37
31
|
|
38
32
|
#
|
@@ -6,16 +6,8 @@ module Configuration
|
|
6
6
|
|
7
7
|
#
|
8
8
|
#
|
9
|
-
def default_tokenizer
|
10
|
-
|
11
|
-
end
|
12
|
-
|
13
|
-
delegate :removes_characters, :contracts_expressions, :stopwords, :splits_text_on, :normalizes_words, :removes_characters_after_splitting, :to => :default_tokenizer
|
14
|
-
|
15
|
-
# Delegates.
|
16
|
-
#
|
17
|
-
def maximum_tokens amount
|
18
|
-
Query::Tokens.maximum = amount
|
9
|
+
def default_tokenizer options = {}
|
10
|
+
Tokenizers::Query.default = Tokenizers::Query.new(options)
|
19
11
|
end
|
20
12
|
|
21
13
|
end
|
data/lib/picky/index/bundle.rb
CHANGED
@@ -98,15 +98,10 @@ module Index
|
|
98
98
|
end
|
99
99
|
# Retrieves the data into the index.
|
100
100
|
#
|
101
|
-
# TODO Beautify.
|
102
|
-
#
|
103
101
|
def retrieve
|
104
|
-
files.retrieve do |
|
105
|
-
token.chomp!
|
106
|
-
token = token.to_sym
|
107
|
-
|
102
|
+
files.retrieve do |id, token|
|
108
103
|
initialize_index_for token
|
109
|
-
index[token] <<
|
104
|
+
index[token] << id
|
110
105
|
end
|
111
106
|
end
|
112
107
|
def initialize_index_for token
|
@@ -13,10 +13,15 @@ module Index
|
|
13
13
|
def dump hash
|
14
14
|
raise "Can't dump to text file. Use JSON or Marshal."
|
15
15
|
end
|
16
|
+
|
17
|
+
# Yields an id and a symbol token.
|
18
|
+
#
|
16
19
|
def retrieve
|
20
|
+
id, token =
|
17
21
|
::File.open(cache_path, 'r:binary') do |file|
|
18
22
|
file.each_line do |line|
|
19
|
-
|
23
|
+
id, token = line.split ?,, 2
|
24
|
+
yield id.to_i, (token.chomp! || token).to_sym
|
20
25
|
end
|
21
26
|
end
|
22
27
|
end
|
data/lib/picky/loader.rb
CHANGED
@@ -104,6 +104,10 @@ module Loader
|
|
104
104
|
load_relative 'helpers/cache'
|
105
105
|
load_relative 'helpers/measuring'
|
106
106
|
|
107
|
+
# Character Substitution
|
108
|
+
#
|
109
|
+
load_relative 'character_substitution/european'
|
110
|
+
|
107
111
|
# Signal handling
|
108
112
|
#
|
109
113
|
load_relative 'signals'
|
@@ -111,7 +115,6 @@ module Loader
|
|
111
115
|
# Various.
|
112
116
|
#
|
113
117
|
load_relative 'loggers/search'
|
114
|
-
load_relative 'umlaut_substituter'
|
115
118
|
|
116
119
|
# Index generation strategies.
|
117
120
|
#
|
@@ -180,9 +183,6 @@ module Loader
|
|
180
183
|
load_relative 'tokenizers/index'
|
181
184
|
load_relative 'tokenizers/query'
|
182
185
|
|
183
|
-
load_relative 'tokenizers/default/index'
|
184
|
-
load_relative 'tokenizers/default/query'
|
185
|
-
|
186
186
|
# Query combinations, qualifiers, weigher.
|
187
187
|
#
|
188
188
|
load_relative 'query/combination'
|
data/lib/picky/query/base.rb
CHANGED
@@ -16,7 +16,7 @@ module Query
|
|
16
16
|
options = Hash === index_types.last ? index_types.pop : {}
|
17
17
|
@index_types = index_types
|
18
18
|
@weigher = options[:weigher] || Weigher.new(index_types)
|
19
|
-
@tokenizer = options[:tokenizer] || Tokenizers::
|
19
|
+
@tokenizer = options[:tokenizer] || Tokenizers::Query.default
|
20
20
|
@weights = options[:weights] || Weights.new
|
21
21
|
end
|
22
22
|
|
data/lib/picky/query/tokens.rb
CHANGED
@@ -6,11 +6,6 @@ module Query
|
|
6
6
|
#
|
7
7
|
class Tokens
|
8
8
|
|
9
|
-
#
|
10
|
-
#
|
11
|
-
cattr_accessor :maximum
|
12
|
-
self.maximum = 5
|
13
|
-
|
14
9
|
# Basically delegates to its internal tokens array.
|
15
10
|
#
|
16
11
|
self.delegate *[Enumerable.instance_methods, :slice!, :[], :uniq!, :last, :reject!, :length, :size, :empty?, :each, :exit, { :to => :@tokens }].flatten
|
@@ -52,13 +47,11 @@ module Query
|
|
52
47
|
|
53
48
|
# Caps the tokens to the maximum.
|
54
49
|
#
|
55
|
-
|
56
|
-
|
57
|
-
def cap
|
58
|
-
@tokens.slice!(@@maximum..-1) if cap?
|
50
|
+
def cap maximum
|
51
|
+
@tokens.slice!(maximum..-1) if cap?(maximum)
|
59
52
|
end
|
60
|
-
def cap?
|
61
|
-
@tokens.size >
|
53
|
+
def cap? maximum
|
54
|
+
@tokens.size > maximum
|
62
55
|
end
|
63
56
|
|
64
57
|
# Rejects blank tokens.
|
@@ -79,6 +79,19 @@ module Tokenizers
|
|
79
79
|
text.gsub! @removes_characters_after_splitting_regexp, '' if @removes_characters_after_splitting_regexp
|
80
80
|
end
|
81
81
|
|
82
|
+
# Substitute Characters with this substituter.
|
83
|
+
#
|
84
|
+
# Default is European Character substitution.
|
85
|
+
#
|
86
|
+
def substitutes_characters_with substituter = CharacterSubstitution::European.new
|
87
|
+
# TODO Raise if it doesn't quack substitute?
|
88
|
+
@substituter = substituter
|
89
|
+
end
|
90
|
+
def substitute_characters text
|
91
|
+
substituter?? substituter.substitute(text) : text
|
92
|
+
end
|
93
|
+
|
94
|
+
|
82
95
|
# Returns a number of tokens, generated from the given text.
|
83
96
|
#
|
84
97
|
# Note:
|
@@ -93,15 +106,20 @@ module Tokenizers
|
|
93
106
|
process tokens # processing tokens / strings
|
94
107
|
end
|
95
108
|
|
96
|
-
|
109
|
+
attr_reader :substituter
|
97
110
|
alias substituter? substituter
|
98
111
|
|
99
|
-
def initialize
|
100
|
-
|
112
|
+
def initialize options = {}
|
113
|
+
removes_characters options[:removes_characters] if options[:removes_characters]
|
114
|
+
contracts_expressions *options[:contracts_expressions] if options[:contracts_expressions]
|
115
|
+
stopwords options[:stopwords] if options[:stopwords]
|
116
|
+
normalizes_words options[:normalizes_words] if options[:normalizes_words]
|
117
|
+
removes_characters_after_splitting options[:removes_characters_after_splitting] if options[:removes_characters_after_splitting]
|
118
|
+
substitutes_characters_with options[:substitutes_characters_with] if options[:substitutes_characters_with]
|
101
119
|
|
102
|
-
#
|
120
|
+
# Defaults.
|
103
121
|
#
|
104
|
-
splits_text_on
|
122
|
+
splits_text_on options[:splits_text_on] || /\s/
|
105
123
|
end
|
106
124
|
|
107
125
|
# Hooks.
|
@@ -5,6 +5,13 @@ module Tokenizers
|
|
5
5
|
#
|
6
6
|
class Index < Base
|
7
7
|
|
8
|
+
def self.default= new_default
|
9
|
+
@default = new_default
|
10
|
+
end
|
11
|
+
def self.default
|
12
|
+
@default ||= new
|
13
|
+
end
|
14
|
+
|
8
15
|
# Default indexing preprocessing hook.
|
9
16
|
#
|
10
17
|
# Does:
|
@@ -15,7 +22,7 @@ module Tokenizers
|
|
15
22
|
# 5. Remove non-single stopwords. (Stopwords that occur with other words)
|
16
23
|
#
|
17
24
|
def preprocess text
|
18
|
-
text =
|
25
|
+
text = substitute_characters text
|
19
26
|
text.downcase!
|
20
27
|
remove_illegals text
|
21
28
|
contract text
|
@@ -13,6 +13,20 @@ module Tokenizers
|
|
13
13
|
#
|
14
14
|
class Query < Base
|
15
15
|
|
16
|
+
def self.default= new_default
|
17
|
+
@default = new_default
|
18
|
+
end
|
19
|
+
def self.default
|
20
|
+
@default ||= new
|
21
|
+
end
|
22
|
+
|
23
|
+
attr_reader :maximum_tokens
|
24
|
+
|
25
|
+
def initialize options = {}
|
26
|
+
super options
|
27
|
+
@maximum_tokens = options[:maximum_tokens] || 5
|
28
|
+
end
|
29
|
+
|
16
30
|
def preprocess text
|
17
31
|
remove_illegals text # Remove illegal characters
|
18
32
|
remove_non_single_stopwords text # remove stop words
|
@@ -33,9 +47,9 @@ module Tokenizers
|
|
33
47
|
#
|
34
48
|
def process tokens
|
35
49
|
tokens.tokenize_with self
|
36
|
-
tokens.reject
|
37
|
-
tokens.cap
|
38
|
-
tokens.partialize_last
|
50
|
+
tokens.reject # Reject any tokens that don't meet criteria
|
51
|
+
tokens.cap maximum_tokens # Cut off superfluous tokens
|
52
|
+
tokens.partialize_last # Set certain tokens as partial
|
39
53
|
tokens
|
40
54
|
end
|
41
55
|
|
@@ -44,10 +58,10 @@ module Tokenizers
|
|
44
58
|
# TODO Perhaps move to Normalizer?
|
45
59
|
#
|
46
60
|
def normalize text
|
47
|
-
text =
|
48
|
-
text.downcase!
|
49
|
-
normalize_with_patterns text
|
50
|
-
text.to_sym
|
61
|
+
text = substitute_characters text # Substitute special characters TODO Move to subclass
|
62
|
+
text.downcase! # Downcase all text
|
63
|
+
normalize_with_patterns text # normalize
|
64
|
+
text.to_sym # symbolize
|
51
65
|
end
|
52
66
|
|
53
67
|
# Returns a token for a word.
|
data/lib/tasks/server.rake
CHANGED
@@ -11,7 +11,7 @@ namespace :server do
|
|
11
11
|
pid.blank? ? nil : pid.chomp
|
12
12
|
end
|
13
13
|
|
14
|
-
desc "Start the unicorns. (Wehee!)"
|
14
|
+
# desc "Start the unicorns. (Wehee!)"
|
15
15
|
task :start => :framework do
|
16
16
|
chdir_to_root
|
17
17
|
# Rake::Task[:"solr:start"].invoke # TODO Move to better place.
|
@@ -21,13 +21,13 @@ namespace :server do
|
|
21
21
|
exec command
|
22
22
|
end
|
23
23
|
|
24
|
-
desc "Stop the unicorns. (Blam!)"
|
24
|
+
# desc "Stop the unicorns. (Blam!)"
|
25
25
|
task :stop => :framework do
|
26
26
|
`kill -QUIT #{current_pid}` if current_pid
|
27
27
|
# Rake::Task[:"solr:stop"].invoke # TODO Move to better place.
|
28
28
|
end
|
29
29
|
|
30
|
-
desc "Restart the unicorns."
|
30
|
+
# desc "Restart the unicorns."
|
31
31
|
task :restart do
|
32
32
|
Rake::Task[:"server:stop"].invoke
|
33
33
|
sleep 5
|
data/lib/tasks/shortcuts.rake
CHANGED
@@ -1,20 +1,20 @@
|
|
1
|
-
desc "
|
1
|
+
desc "Generate the index."
|
2
2
|
task :index => :application do
|
3
3
|
Rake::Task[:'index:generate'].invoke
|
4
4
|
end
|
5
5
|
|
6
|
-
desc "
|
6
|
+
desc "Try the given text in the indexer/query (type:field optional)."
|
7
7
|
task :try, [:text, :type_and_field] => :application do |_, options|
|
8
8
|
text, type_and_field = options.text, options.type_and_field
|
9
9
|
|
10
10
|
Rake::Task[:'try:both'].invoke text, type_and_field
|
11
11
|
end
|
12
12
|
|
13
|
-
desc "
|
13
|
+
desc "Start the server."
|
14
14
|
task :start do
|
15
15
|
Rake::Task[:'server:start'].invoke
|
16
16
|
end
|
17
|
-
desc "
|
17
|
+
desc "Stop the server."
|
18
18
|
task :stop do
|
19
19
|
Rake::Task[:'server:stop'].invoke
|
20
20
|
end
|
data/lib/tasks/spec.rake
CHANGED
@@ -3,7 +3,7 @@ require 'spec/rake/spectask'
|
|
3
3
|
|
4
4
|
task :default => :spec
|
5
5
|
|
6
|
-
desc "Run all specs
|
6
|
+
desc "Run all specs"
|
7
7
|
Spec::Rake::SpecTask.new(:spec) do |t|
|
8
8
|
spec_root = File.join(File.dirname(__FILE__), '..', '..', 'spec')
|
9
9
|
t.spec_opts = ['--options', "\"#{File.join(spec_root, 'spec.opts')}\""]
|
data/lib/tasks/try.rake
CHANGED
@@ -2,25 +2,23 @@
|
|
2
2
|
#
|
3
3
|
namespace :try do
|
4
4
|
|
5
|
-
desc "Try how a given word would be tokenized when indexing (type:field optional)."
|
5
|
+
# desc "Try how a given word would be tokenized when indexing (type:field optional)."
|
6
6
|
task :index, [:text, :type_and_field] => :application do |_, options|
|
7
7
|
text, type_and_field = options.text, options.type_and_field
|
8
8
|
|
9
|
-
tokenizer = type_and_field ? Indexes.find(*type_and_field.split(':')).tokenizer : Tokenizers::
|
9
|
+
tokenizer = type_and_field ? Indexes.find(*type_and_field.split(':')).tokenizer : Tokenizers::Index.default
|
10
10
|
|
11
|
-
puts "\"#{text}\" is index tokenized as #{tokenizer.tokenize(text).to_a}"
|
11
|
+
puts "\"#{text}\" is index tokenized as #{tokenizer.tokenize(text.dup).to_a}"
|
12
12
|
end
|
13
13
|
|
14
|
-
desc "Try how a given word would be tokenized when querying."
|
14
|
+
# desc "Try how a given word would be tokenized when querying."
|
15
15
|
task :query, [:text] => :application do |_, options|
|
16
16
|
text = options.text
|
17
17
|
|
18
|
-
#
|
19
|
-
#
|
20
|
-
puts "\"#{text}\" is query tokenized as #{Tokenizers::Default::Query.tokenize(text.dup).to_a.map(&:to_s).map(&:to_sym)}"
|
18
|
+
puts "\"#{text}\" is query tokenized as #{Tokenizers::Query.default.tokenize(text.dup).to_a.map(&:to_s).map(&:to_sym)}"
|
21
19
|
end
|
22
20
|
|
23
|
-
desc "Try the given text with both the index and the query (type:field optional)."
|
21
|
+
# desc "Try the given text with both the index and the query (type:field optional)."
|
24
22
|
task :both, [:text, :type_and_field] => :application do |_, options|
|
25
23
|
text, type_and_field = options.text, options.type_and_field
|
26
24
|
|
data/project_prototype/Gemfile
CHANGED
@@ -1,58 +1,46 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
#
|
3
|
+
# TODO Adapt the generated example
|
4
|
+
# (a library books finder) to what you need.
|
5
|
+
#
|
6
|
+
# Check the Wiki http://github.com/floere/picky/wiki for more options.
|
7
|
+
# Ask me or the google group if you have questions or specific requests.
|
8
|
+
#
|
3
9
|
class PickySearch < Application
|
4
10
|
|
5
|
-
#
|
6
|
-
#
|
7
|
-
#
|
8
|
-
|
11
|
+
# Indexing: How text is indexed.
|
12
|
+
# Querying: How query text is handled.
|
13
|
+
#
|
14
|
+
default_indexing removes_characters: /[^a-zA-Z0-9\s\/\-\"\&\.]/,
|
15
|
+
stopwords: /\b(and|the|of|it|in|for)\b/,
|
16
|
+
splits_text_on: /[\s\/\-\"\&\.]/
|
17
|
+
|
18
|
+
default_querying removes_characters: /[^a-zA-Z0-9\s\/\-\,\&\"\~\*\:]/, # Picky needs control chars *"~: to pass through.
|
19
|
+
stopwords: /\b(and|the|of|it|in|for)\b/,
|
20
|
+
splits_text_on: /[\s\/\-\,\&]+/,
|
21
|
+
|
22
|
+
maximum_tokens: 5, # Max amount of tokens passing into a query. 5 is the default.
|
23
|
+
substitutes_characters_with: CharacterSubstitution::European.new # Normalizes special user input, Ä -> Ae, ñ -> n etc.
|
24
|
+
|
25
|
+
# Define an index. Use a database etc. source? http://github.com/floere/picky/wiki/Sources-Configuration#sources
|
9
26
|
#
|
10
|
-
# Ask me or the google group if you have questions or specific requests.
|
11
|
-
#
|
12
|
-
|
13
|
-
indexing.removes_characters(/[^a-zA-Z0-9\s\/\-\"\&\.]/)
|
14
|
-
indexing.stopwords(/\b(and|the|of|it|in|for)\b/)
|
15
|
-
indexing.splits_text_on(/[\s\/\-\"\&\.]/)
|
16
|
-
|
17
27
|
books_index = index :books,
|
18
|
-
Sources::CSV.new(:title, :author, :isbn, :year, :publisher, :subjects, :
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
:similarity => Similarity::DoubleLevenshtone.new(3)), # Up to three similar title word indexed (default: no similarity).
|
28
|
-
field(:author, :partial => Partial::Substring.new(:from => 1)),
|
29
|
-
field(:isbn, :partial => Partial::None.new) # Partial substring searching on an ISBN makes not much sense, neither does similarity.
|
30
|
-
|
31
|
-
# Defines the maximum tokens (words) that pass through to the engine.
|
32
|
-
#
|
33
|
-
querying.maximum_tokens 5
|
28
|
+
Sources::CSV.new(:title, :author, :isbn, :year, :publisher, :subjects, file: 'app/library.csv'),
|
29
|
+
category(:title,
|
30
|
+
partial: Partial::Substring.new(from: 1), # Indexes substrings upwards from character 1 (default: -3),
|
31
|
+
# You'll find "picky" even when entering just a "p".
|
32
|
+
similarity: Similarity::DoubleLevenshtone.new(3)), # Up to three similar title words are indexed (default: No similarity).
|
33
|
+
category(:author,
|
34
|
+
partial: Partial::Substring.new(from: 1)),
|
35
|
+
category(:isbn,
|
36
|
+
partial: Partial::None.new) # Partial substring searching on an ISBN does not make much sense, and neither does similarity.
|
34
37
|
|
35
|
-
#
|
36
|
-
#
|
37
|
-
#
|
38
|
-
querying.removes_characters(/[^a-zA-Z0-9\s\/\-\,\&\"\~\*\:]/)
|
39
|
-
querying.stopwords(/\b(and|the|of|it|in|for)\b/)
|
40
|
-
querying.splits_text_on(/[\s\/\-\,\&]+/)
|
38
|
+
full_books = Query::Full.new books_index # A Full query returns ids, combinations, and counts.
|
39
|
+
live_books = Query::Live.new books_index # A Live query does return all that Full returns, except ids.
|
41
40
|
|
42
|
-
|
43
|
-
#
|
44
|
-
# A Full query returns ids, combinations, and counts.
|
45
|
-
# A Live query does return all that Full returns, without ids.
|
46
|
-
#
|
47
|
-
# Note: You can pass a query multiple indexes and it will combine them.
|
48
|
-
#
|
49
|
-
full_books = Query::Full.new books_index
|
50
|
-
live_books = Query::Live.new books_index
|
41
|
+
route %r{\A/books/full\Z} => full_books # Routing is simple: url_path_regexp => query
|
42
|
+
route %r{\A/books/live\Z} => live_books #
|
51
43
|
|
52
|
-
#
|
53
|
-
# A path regexp pointing to a query that will be run.
|
54
|
-
#
|
55
|
-
route %r{^/books/full} => full_books
|
56
|
-
route %r{^/books/live} => live_books
|
44
|
+
# Note: You can pass a query multiple indexes and it will query in all of them.
|
57
45
|
|
58
46
|
end
|
@@ -5,32 +5,49 @@ require 'spec_helper'
|
|
5
5
|
describe Application do
|
6
6
|
|
7
7
|
describe "integration" do
|
8
|
+
it "should run ok" do
|
9
|
+
lambda {
|
10
|
+
class MinimalTestApplication < Application
|
11
|
+
books = index :books,
|
12
|
+
Sources::DB.new('SELECT id, title FROM books', :file => 'app/db.yml'),
|
13
|
+
category(:title)
|
14
|
+
|
15
|
+
|
16
|
+
full = Query::Full.new books
|
17
|
+
live = Query::Live.new books
|
18
|
+
|
19
|
+
route %r{^/books/full} => full
|
20
|
+
route %r{^/books/live} => live
|
21
|
+
end
|
22
|
+
Tokenizers::Index.default.tokenize 'some text'
|
23
|
+
Tokenizers::Query.default.tokenize 'some text'
|
24
|
+
}.should_not raise_error
|
25
|
+
end
|
8
26
|
it "should run ok" do
|
9
27
|
lambda {
|
10
28
|
# TODO Add all possible cases.
|
11
29
|
#
|
12
30
|
class TestApplication < Application
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
31
|
+
default_indexing removes_characters: /[^a-zA-Z0-9\s\/\-\"\&\.]/,
|
32
|
+
contracts_expressions: [/mr\.\s*|mister\s*/i, 'mr '],
|
33
|
+
stopwords: /\b(and|the|of|it|in|for)\b/,
|
34
|
+
splits_text_on: /[\s\/\-\"\&\.]/,
|
35
|
+
removes_characters_after_splitting: /[\.]/
|
36
|
+
|
37
|
+
default_querying removes_characters: /[^a-zA-Z0-9äöü\s\/\-\,\&\"\~\*\:]/,
|
38
|
+
stopwords: /\b(and|the|of|it|in|for)\b/,
|
39
|
+
splits_text_on: /[\s\/\-\,\&]+/,
|
40
|
+
normalizes_words: [[/Deoxyribonucleic Acid/i, 'DNA']],
|
41
|
+
|
42
|
+
substitutes_characters_with: CharacterSubstitution::European.new,
|
43
|
+
maximum_tokens: 5
|
18
44
|
|
19
|
-
books_index = index
|
20
|
-
|
21
|
-
|
22
|
-
|
45
|
+
books_index = index :books,
|
46
|
+
Sources::DB.new('SELECT id, title, author, isbn13 as isbn FROM books', :file => 'app/db.yml'),
|
47
|
+
category(:title, :similarity => Similarity::DoubleLevenshtone.new(3)), # Up to three similar title word indexed.
|
48
|
+
category(:author),
|
49
|
+
category(:isbn, :partial => Partial::None.new) # Partially searching on an ISBN makes not much sense.
|
23
50
|
|
24
|
-
# Note that Picky needs the following characters to
|
25
|
-
# pass through, as they are control characters: *"~:
|
26
|
-
#
|
27
|
-
querying.removes_characters(/[^a-zA-Z0-9äöü\s\/\-\,\&\"\~\*\:]/)
|
28
|
-
querying.stopwords(/\b(and|the|of|it|in|for)\b/)
|
29
|
-
querying.splits_text_on(/[\s\/\-\,\&]+/)
|
30
|
-
querying.normalizes_words([
|
31
|
-
[/Deoxyribonucleic Acid/i, 'DNA']
|
32
|
-
])
|
33
|
-
querying.maximum_tokens 5
|
34
51
|
|
35
52
|
full = Query::Full.new books_index
|
36
53
|
live = Query::Live.new books_index
|
@@ -1,9 +1,10 @@
|
|
1
1
|
# encoding: utf-8
|
2
|
+
#
|
2
3
|
require 'spec_helper'
|
3
4
|
|
4
|
-
describe
|
5
|
+
describe CharacterSubstitution do
|
5
6
|
before(:each) do
|
6
|
-
@substituter =
|
7
|
+
@substituter = CharacterSubstitution::European.new
|
7
8
|
end
|
8
9
|
|
9
10
|
# A bit of metaprogramming to help with the myriads of its.
|
@@ -82,5 +83,16 @@ describe UmlautSubstituter do
|
|
82
83
|
it_should_substitute 'å', 'a'
|
83
84
|
it_should_substitute 'Å', 'A'
|
84
85
|
end
|
86
|
+
|
87
|
+
describe "diacritic" do
|
88
|
+
it_should_substitute 'ñ', 'n'
|
89
|
+
end
|
90
|
+
|
91
|
+
describe "speed" do
|
92
|
+
it "is fast" do
|
93
|
+
result = performance_of { @substituter.substitute('ä') }
|
94
|
+
result.should < 0.00009
|
95
|
+
end
|
96
|
+
end
|
85
97
|
|
86
98
|
end
|
@@ -5,7 +5,7 @@ describe Configuration::Field do
|
|
5
5
|
describe "virtual?" do
|
6
6
|
context "with virtual true" do
|
7
7
|
before(:each) do
|
8
|
-
@field = Configuration::Field.new :some_name, :
|
8
|
+
@field = Configuration::Field.new :some_name, :virtual => true
|
9
9
|
end
|
10
10
|
it "returns the right value" do
|
11
11
|
@field.virtual?.should == true
|
@@ -13,7 +13,7 @@ describe Configuration::Field do
|
|
13
13
|
end
|
14
14
|
context "with virtual object" do
|
15
15
|
before(:each) do
|
16
|
-
@field = Configuration::Field.new :some_name, :
|
16
|
+
@field = Configuration::Field.new :some_name, :virtual => 123.6
|
17
17
|
end
|
18
18
|
it "returns the right value" do
|
19
19
|
@field.virtual?.should == true
|
@@ -39,7 +39,7 @@ describe Configuration::Field do
|
|
39
39
|
describe "tokenizer" do
|
40
40
|
context "with specific tokenizer" do
|
41
41
|
before(:each) do
|
42
|
-
@field = Configuration::Field.new :some_name, Tokenizers::Index.new
|
42
|
+
@field = Configuration::Field.new :some_name, tokenizer: Tokenizers::Index.new
|
43
43
|
|
44
44
|
@field.type = :some_type
|
45
45
|
end
|
@@ -54,7 +54,7 @@ describe Configuration::Field do
|
|
54
54
|
describe "indexer" do
|
55
55
|
context "with default indexer" do
|
56
56
|
before(:each) do
|
57
|
-
@field = Configuration::Field.new :some_name
|
57
|
+
@field = Configuration::Field.new :some_name
|
58
58
|
end
|
59
59
|
it "caches" do
|
60
60
|
@field.indexer.should == @field.indexer
|
@@ -62,7 +62,7 @@ describe Configuration::Field do
|
|
62
62
|
end
|
63
63
|
context "with specific indexer" do
|
64
64
|
before(:each) do
|
65
|
-
@field = Configuration::Field.new :some_name, :
|
65
|
+
@field = Configuration::Field.new :some_name, tokenizer: Indexers::Default
|
66
66
|
|
67
67
|
@field.type = :some_type
|
68
68
|
end
|
@@ -81,7 +81,7 @@ describe Configuration::Field do
|
|
81
81
|
end
|
82
82
|
describe "cache" do
|
83
83
|
before(:each) do
|
84
|
-
@field = Configuration::Field.new :some_name
|
84
|
+
@field = Configuration::Field.new :some_name
|
85
85
|
@field.stub! :prepare_cache_directory
|
86
86
|
|
87
87
|
@generated = stub :generated, :generate_caches => nil
|
@@ -100,7 +100,7 @@ describe Configuration::Field do
|
|
100
100
|
end
|
101
101
|
describe "prepare_cache_directory" do
|
102
102
|
before(:each) do
|
103
|
-
@field = Configuration::Field.new :some_name
|
103
|
+
@field = Configuration::Field.new :some_name
|
104
104
|
|
105
105
|
@field.stub! :cache_directory => :some_cache_directory
|
106
106
|
end
|
@@ -112,7 +112,7 @@ describe Configuration::Field do
|
|
112
112
|
end
|
113
113
|
describe "index" do
|
114
114
|
before(:each) do
|
115
|
-
@field = Configuration::Field.new :some_name
|
115
|
+
@field = Configuration::Field.new :some_name
|
116
116
|
@field.stub! :prepare_cache_directory
|
117
117
|
|
118
118
|
@indexer = stub :indexer, :index => nil
|
@@ -132,7 +132,7 @@ describe Configuration::Field do
|
|
132
132
|
describe "source" do
|
133
133
|
context "with source" do
|
134
134
|
before(:each) do
|
135
|
-
@field = Configuration::Field.new :some_name, :
|
135
|
+
@field = Configuration::Field.new :some_name, :source => :some_given_source
|
136
136
|
|
137
137
|
@type = stub :type, :name => :some_type
|
138
138
|
@field.type = @type
|
@@ -143,7 +143,7 @@ describe Configuration::Field do
|
|
143
143
|
end
|
144
144
|
context "without source" do
|
145
145
|
before(:each) do
|
146
|
-
@field = Configuration::Field.new :some_name
|
146
|
+
@field = Configuration::Field.new :some_name
|
147
147
|
|
148
148
|
@type = stub :type, :name => :some_type, :source => :some_type_source
|
149
149
|
@field.type = @type
|
@@ -155,7 +155,7 @@ describe Configuration::Field do
|
|
155
155
|
end
|
156
156
|
context "name symbol" do
|
157
157
|
before(:each) do
|
158
|
-
@field = Configuration::Field.new :some_name
|
158
|
+
@field = Configuration::Field.new :some_name
|
159
159
|
|
160
160
|
@type = stub :type, :name => :some_type
|
161
161
|
@field.type = @type
|
@@ -189,7 +189,7 @@ describe Configuration::Field do
|
|
189
189
|
end
|
190
190
|
context "name string" do
|
191
191
|
before(:each) do
|
192
|
-
@field = Configuration::Field.new 'some_name'
|
192
|
+
@field = Configuration::Field.new 'some_name'
|
193
193
|
end
|
194
194
|
describe "generate_qualifiers_from" do
|
195
195
|
context "without qualifiers" do
|
@@ -18,47 +18,10 @@ describe Configuration::Indexes do
|
|
18
18
|
|
19
19
|
describe "default_tokenizer" do
|
20
20
|
it "is a default tokenizer" do
|
21
|
-
@config.default_tokenizer.should
|
21
|
+
@config.default_tokenizer.should be_kind_of(Tokenizers::Index)
|
22
22
|
end
|
23
|
-
it "
|
24
|
-
@config.default_tokenizer.
|
25
|
-
end
|
26
|
-
end
|
27
|
-
|
28
|
-
describe "delegates" do
|
29
|
-
before(:each) do
|
30
|
-
@receiver = mock :receiver
|
31
|
-
@config.stub! :default_tokenizer => @receiver
|
32
|
-
end
|
33
|
-
it "delegates" do
|
34
|
-
@receiver.should_receive(:removes_characters).once
|
35
|
-
|
36
|
-
@config.removes_characters
|
37
|
-
end
|
38
|
-
it "delegates" do
|
39
|
-
@receiver.should_receive(:contracts_expressions).once
|
40
|
-
|
41
|
-
@config.contracts_expressions
|
42
|
-
end
|
43
|
-
it "delegates" do
|
44
|
-
@receiver.should_receive(:stopwords).once
|
45
|
-
|
46
|
-
@config.stopwords
|
47
|
-
end
|
48
|
-
it "delegates" do
|
49
|
-
@receiver.should_receive(:splits_text_on).once
|
50
|
-
|
51
|
-
@config.splits_text_on
|
52
|
-
end
|
53
|
-
it "delegates" do
|
54
|
-
@receiver.should_receive(:normalizes_words).once
|
55
|
-
|
56
|
-
@config.normalizes_words
|
57
|
-
end
|
58
|
-
it "delegates" do
|
59
|
-
@receiver.should_receive(:removes_characters_after_splitting).once
|
60
|
-
|
61
|
-
@config.removes_characters_after_splitting
|
23
|
+
it "does not cache" do
|
24
|
+
@config.default_tokenizer.should_not == @config.default_tokenizer
|
62
25
|
end
|
63
26
|
end
|
64
27
|
|
@@ -51,7 +51,7 @@ describe Array do
|
|
51
51
|
[:test1, :test1, :test2, :test2, :test3].clustered_uniq.should == [:test1, :test2, :test3]
|
52
52
|
end
|
53
53
|
it "is fast" do
|
54
|
-
performance_of { [:test1, :test1, :test2, :test2, :test3].clustered_uniq }.should < 0.
|
54
|
+
performance_of { [:test1, :test1, :test2, :test2, :test3].clustered_uniq }.should < 0.000012
|
55
55
|
end
|
56
56
|
end
|
57
57
|
|
@@ -21,7 +21,20 @@ describe Index::File::Text do
|
|
21
21
|
end
|
22
22
|
end
|
23
23
|
describe "retrieve" do
|
24
|
-
|
24
|
+
before(:each) do
|
25
|
+
@io = stub :io
|
26
|
+
@io.should_receive(:each_line).once.with.and_yield '123456,some_nice_token'
|
27
|
+
File.should_receive(:open).any_number_of_times.and_yield @io
|
28
|
+
end
|
29
|
+
it "yields split lines and returns the id and token text" do
|
30
|
+
@file.retrieve do |id, token|
|
31
|
+
id.should == 123456
|
32
|
+
token.should == :some_nice_token
|
33
|
+
end
|
34
|
+
end
|
35
|
+
it "is fast" do
|
36
|
+
performance_of { @file.retrieve { |id, token| } }.should < 0.00005
|
37
|
+
end
|
25
38
|
end
|
26
39
|
|
27
40
|
end
|
@@ -46,7 +46,7 @@ describe 'Query::Combination' do
|
|
46
46
|
@combination = Query::Combination.new token, @category
|
47
47
|
end
|
48
48
|
it 'should return a correct result' do
|
49
|
-
@combination.to_result.should == [:some_category_name, 'Blä~', :
|
49
|
+
@combination.to_result.should == [:some_category_name, 'Blä~', :blä] # Note: Characters not substituted. That's ok.
|
50
50
|
end
|
51
51
|
end
|
52
52
|
it 'should return a correct result' do
|
@@ -36,31 +36,26 @@ describe Query::Tokens do
|
|
36
36
|
@tokens.instance_variable_get(:@tokens).should == [@nonblank, @nonblank]
|
37
37
|
end
|
38
38
|
end
|
39
|
-
|
40
|
-
describe "class variables" do
|
41
|
-
describe "maximal query words" do
|
42
|
-
it "should answer" do
|
43
|
-
lambda { Query::Tokens.maximum }.should_not raise_error
|
44
|
-
end
|
45
|
-
end
|
46
|
-
end
|
47
|
-
|
39
|
+
|
48
40
|
describe 'cap' do
|
49
41
|
context 'one token' do
|
50
42
|
before(:each) do
|
51
43
|
@token = Query::Token.processed 'Token'
|
52
44
|
@tokens = Query::Tokens.new [@token]
|
53
45
|
end
|
54
|
-
it '
|
55
|
-
@tokens.cap
|
56
|
-
|
46
|
+
it 'does not cut it down' do
|
47
|
+
@tokens.cap 5
|
48
|
+
|
57
49
|
@tokens.instance_variable_get(:@tokens).should == [@token]
|
58
50
|
end
|
51
|
+
it 'cuts it down' do
|
52
|
+
@tokens.cap 0
|
53
|
+
|
54
|
+
@tokens.instance_variable_get(:@tokens).should == []
|
55
|
+
end
|
59
56
|
end
|
60
57
|
context 'many tokens' do
|
61
58
|
before(:each) do
|
62
|
-
@old_maximum = Query::Tokens.maximum
|
63
|
-
Query::Tokens.maximum = 3
|
64
59
|
@first = Query::Token.processed 'Hello'
|
65
60
|
@second = Query::Token.processed 'I'
|
66
61
|
@third = Query::Token.processed 'Am'
|
@@ -72,12 +67,9 @@ describe Query::Tokens do
|
|
72
67
|
Query::Token.processed('Token')
|
73
68
|
]
|
74
69
|
end
|
75
|
-
after(:each) do
|
76
|
-
Query::Tokens.maximum = @old_maximum
|
77
|
-
end
|
78
70
|
it 'should cap the number of tokens' do
|
79
|
-
@tokens.cap
|
80
|
-
|
71
|
+
@tokens.cap 3
|
72
|
+
|
81
73
|
@tokens.instance_variable_get(:@tokens).should == [@first, @second, @third]
|
82
74
|
end
|
83
75
|
end
|
@@ -1,4 +1,5 @@
|
|
1
|
-
#
|
1
|
+
# encoding: utf-8
|
2
|
+
#
|
2
3
|
require 'spec_helper'
|
3
4
|
|
4
5
|
describe Tokenizers::Base do
|
@@ -7,6 +8,22 @@ describe Tokenizers::Base do
|
|
7
8
|
@tokenizer = Tokenizers::Base.new
|
8
9
|
end
|
9
10
|
|
11
|
+
describe "substitute(s)_characters*" do
|
12
|
+
it "doesn't substitute if there is no substituter" do
|
13
|
+
@tokenizer.substitute_characters('abcdefghijklmnopqrstuvwxyzäöü').should == 'abcdefghijklmnopqrstuvwxyzäöü'
|
14
|
+
end
|
15
|
+
it "uses the substituter to replace characters" do
|
16
|
+
@tokenizer.substitutes_characters_with CharacterSubstitution::European.new
|
17
|
+
|
18
|
+
@tokenizer.substitute_characters('abcdefghijklmnopqrstuvwxyzäöü').should == 'abcdefghijklmnopqrstuvwxyzaeoeue'
|
19
|
+
end
|
20
|
+
it "uses the european substituter as default" do
|
21
|
+
@tokenizer.substitutes_characters_with
|
22
|
+
|
23
|
+
@tokenizer.substitute_characters('abcdefghijklmnopqrstuvwxyzäöü').should == 'abcdefghijklmnopqrstuvwxyzaeoeue'
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
10
27
|
describe "removes_characters_after_splitting" do
|
11
28
|
context "without removes_characters_after_splitting called" do
|
12
29
|
it "has remove_after_normalizing_illegals" do
|
@@ -64,6 +81,9 @@ describe Tokenizers::Base do
|
|
64
81
|
it 'should define a method split that splits by default on \s' do
|
65
82
|
@tokenizer.split('a b/c.d').should == ['a', 'b/c.d']
|
66
83
|
end
|
84
|
+
it 'splits text on /\s/ by default' do
|
85
|
+
@tokenizer.split('this is a test').should == ['this', 'is', 'a', 'test']
|
86
|
+
end
|
67
87
|
end
|
68
88
|
context "with removes_characters called" do
|
69
89
|
before(:each) do
|
@@ -8,6 +8,29 @@ describe Tokenizers::Index do
|
|
8
8
|
@tokenizer = Tokenizers::Index.new
|
9
9
|
end
|
10
10
|
|
11
|
+
describe "default*" do
|
12
|
+
before(:all) do
|
13
|
+
@old = Tokenizers::Index.default
|
14
|
+
end
|
15
|
+
after(:all) do
|
16
|
+
Tokenizers::Index.default = @old
|
17
|
+
end
|
18
|
+
it "has a reader" do
|
19
|
+
lambda { Tokenizers::Index.default }.should_not raise_error
|
20
|
+
end
|
21
|
+
it "returns by default a new Index" do
|
22
|
+
Tokenizers::Index.default.should be_kind_of(Tokenizers::Index)
|
23
|
+
end
|
24
|
+
it "has a writer" do
|
25
|
+
lambda { Tokenizers::Index.default = :bla }.should_not raise_error
|
26
|
+
end
|
27
|
+
it "returns what has been written, if something has been written" do
|
28
|
+
Tokenizers::Index.default = :some_default
|
29
|
+
|
30
|
+
Tokenizers::Index.default.should == :some_default
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
11
34
|
describe "remove_removes_characters" do
|
12
35
|
it "should not remove ' from a query by default" do
|
13
36
|
@tokenizer.remove_illegals("Lugi's").should == "Lugi's"
|
@@ -7,6 +7,38 @@ describe Tokenizers::Query do
|
|
7
7
|
@tokenizer = Tokenizers::Query.new
|
8
8
|
end
|
9
9
|
|
10
|
+
describe "default*" do
|
11
|
+
before(:all) do
|
12
|
+
@old = Tokenizers::Query.default
|
13
|
+
end
|
14
|
+
after(:all) do
|
15
|
+
Tokenizers::Query.default = @old
|
16
|
+
end
|
17
|
+
it "has a reader" do
|
18
|
+
lambda { Tokenizers::Query.default }.should_not raise_error
|
19
|
+
end
|
20
|
+
it "returns by default a new Index" do
|
21
|
+
Tokenizers::Query.default.should be_kind_of(Tokenizers::Query)
|
22
|
+
end
|
23
|
+
it "has a writer" do
|
24
|
+
lambda { Tokenizers::Query.default = :bla }.should_not raise_error
|
25
|
+
end
|
26
|
+
it "returns what has been written, if something has been written" do
|
27
|
+
Tokenizers::Query.default = :some_default
|
28
|
+
|
29
|
+
Tokenizers::Query.default.should == :some_default
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
describe "maximum_tokens" do
|
34
|
+
it "should be set to 5 by default" do
|
35
|
+
@tokenizer.maximum_tokens.should == 5
|
36
|
+
end
|
37
|
+
it "should be settable" do
|
38
|
+
Tokenizers::Query.new(maximum_tokens: 3).maximum_tokens.should == 3
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
10
42
|
describe 'preprocess' do
|
11
43
|
it 'should call methods in order' do
|
12
44
|
text = stub :text
|
metadata
CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
|
|
4
4
|
prerelease: false
|
5
5
|
segments:
|
6
6
|
- 0
|
7
|
-
-
|
8
|
-
-
|
9
|
-
version: 0.
|
7
|
+
- 10
|
8
|
+
- 0
|
9
|
+
version: 0.10.0
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Florian Hanke
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2010-10-
|
17
|
+
date: 2010-10-31 00:00:00 +02:00
|
18
18
|
default_executable: picky
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -59,6 +59,7 @@ files:
|
|
59
59
|
- lib/picky/cacher/weights/logarithmic.rb
|
60
60
|
- lib/picky/cacher/weights/strategy.rb
|
61
61
|
- lib/picky/cacher/weights_generator.rb
|
62
|
+
- lib/picky/character_substitution/european.rb
|
62
63
|
- lib/picky/configuration/field.rb
|
63
64
|
- lib/picky/configuration/indexes.rb
|
64
65
|
- lib/picky/configuration/queries.rb
|
@@ -118,11 +119,8 @@ files:
|
|
118
119
|
- lib/picky/sources/db.rb
|
119
120
|
- lib/picky/sources/delicious.rb
|
120
121
|
- lib/picky/tokenizers/base.rb
|
121
|
-
- lib/picky/tokenizers/default/index.rb
|
122
|
-
- lib/picky/tokenizers/default/query.rb
|
123
122
|
- lib/picky/tokenizers/index.rb
|
124
123
|
- lib/picky/tokenizers/query.rb
|
125
|
-
- lib/picky/umlaut_substituter.rb
|
126
124
|
- lib/picky-tasks.rb
|
127
125
|
- lib/picky.rb
|
128
126
|
- lib/tasks/application.rake
|
@@ -161,6 +159,7 @@ files:
|
|
161
159
|
- spec/lib/cacher/similarity_generator_spec.rb
|
162
160
|
- spec/lib/cacher/weights/logarithmic_spec.rb
|
163
161
|
- spec/lib/cacher/weights_generator_spec.rb
|
162
|
+
- spec/lib/character_substitution/european_spec.rb
|
164
163
|
- spec/lib/configuration/field_spec.rb
|
165
164
|
- spec/lib/configuration/indexes_spec.rb
|
166
165
|
- spec/lib/configuration/type_spec.rb
|
@@ -210,11 +209,8 @@ files:
|
|
210
209
|
- spec/lib/sources/db_spec.rb
|
211
210
|
- spec/lib/sources/delicious_spec.rb
|
212
211
|
- spec/lib/tokenizers/base_spec.rb
|
213
|
-
- spec/lib/tokenizers/default/index_spec.rb
|
214
|
-
- spec/lib/tokenizers/default/query_spec.rb
|
215
212
|
- spec/lib/tokenizers/index_spec.rb
|
216
213
|
- spec/lib/tokenizers/query_spec.rb
|
217
|
-
- spec/lib/umlaut_substituter_spec.rb
|
218
214
|
- spec/specific/speed_spec.rb
|
219
215
|
- bin/picky
|
220
216
|
has_rdoc: true
|
@@ -261,6 +257,7 @@ test_files:
|
|
261
257
|
- spec/lib/cacher/similarity_generator_spec.rb
|
262
258
|
- spec/lib/cacher/weights/logarithmic_spec.rb
|
263
259
|
- spec/lib/cacher/weights_generator_spec.rb
|
260
|
+
- spec/lib/character_substitution/european_spec.rb
|
264
261
|
- spec/lib/configuration/field_spec.rb
|
265
262
|
- spec/lib/configuration/indexes_spec.rb
|
266
263
|
- spec/lib/configuration/type_spec.rb
|
@@ -310,9 +307,6 @@ test_files:
|
|
310
307
|
- spec/lib/sources/db_spec.rb
|
311
308
|
- spec/lib/sources/delicious_spec.rb
|
312
309
|
- spec/lib/tokenizers/base_spec.rb
|
313
|
-
- spec/lib/tokenizers/default/index_spec.rb
|
314
|
-
- spec/lib/tokenizers/default/query_spec.rb
|
315
310
|
- spec/lib/tokenizers/index_spec.rb
|
316
311
|
- spec/lib/tokenizers/query_spec.rb
|
317
|
-
- spec/lib/umlaut_substituter_spec.rb
|
318
312
|
- spec/specific/speed_spec.rb
|
@@ -1,34 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
#
|
3
|
-
|
4
|
-
# Substitutes certain umlauts, like
|
5
|
-
# ä, ö, ü => ae, oe, ue.
|
6
|
-
# (and more, see specs)
|
7
|
-
#
|
8
|
-
class UmlautSubstituter
|
9
|
-
|
10
|
-
attr_reader :chars
|
11
|
-
|
12
|
-
def initialize
|
13
|
-
@chars = ActiveSupport::Multibyte.proxy_class
|
14
|
-
end
|
15
|
-
|
16
|
-
def substitute text
|
17
|
-
trans = chars.new(text).normalize(:kd)
|
18
|
-
|
19
|
-
# substitute special cases
|
20
|
-
#
|
21
|
-
trans.gsub!('ß', 'ss')
|
22
|
-
|
23
|
-
# substitute umlauts (of A,O,U,a,o,u)
|
24
|
-
#
|
25
|
-
trans.gsub!(/([AOUaou])\314\210/u, '\1e')
|
26
|
-
|
27
|
-
# get rid of ecutes, graves and …
|
28
|
-
#
|
29
|
-
trans.unpack('U*').select { |cp|
|
30
|
-
cp < 0x0300 || cp > 0x035F
|
31
|
-
}.pack('U*')
|
32
|
-
end
|
33
|
-
|
34
|
-
end
|