picky 0.9.4 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/picky/application.rb +42 -29
- data/lib/picky/character_substitution/european.rb +33 -0
- data/lib/picky/configuration/field.rb +3 -3
- data/lib/picky/configuration/indexes.rb +4 -10
- data/lib/picky/configuration/queries.rb +2 -10
- data/lib/picky/index/bundle.rb +2 -7
- data/lib/picky/index/file/text.rb +6 -1
- data/lib/picky/loader.rb +4 -4
- data/lib/picky/query/base.rb +1 -1
- data/lib/picky/query/tokens.rb +4 -11
- data/lib/picky/tokenizers/base.rb +23 -5
- data/lib/picky/tokenizers/index.rb +8 -1
- data/lib/picky/tokenizers/query.rb +21 -7
- data/lib/tasks/server.rake +3 -3
- data/lib/tasks/shortcuts.rake +4 -4
- data/lib/tasks/spec.rake +1 -1
- data/lib/tasks/try.rake +6 -8
- data/project_prototype/Gemfile +3 -2
- data/project_prototype/app/application.rb +35 -47
- data/spec/lib/application_spec.rb +36 -19
- data/spec/lib/{umlaut_substituter_spec.rb → character_substitution/european_spec.rb} +14 -2
- data/spec/lib/configuration/field_spec.rb +12 -12
- data/spec/lib/configuration/indexes_spec.rb +3 -40
- data/spec/lib/extensions/array_spec.rb +1 -1
- data/spec/lib/extensions/hash_spec.rb +1 -1
- data/spec/lib/index/file/text_spec.rb +14 -1
- data/spec/lib/query/combination_spec.rb +1 -1
- data/spec/lib/query/tokens_spec.rb +11 -19
- data/spec/lib/tokenizers/base_spec.rb +21 -1
- data/spec/lib/tokenizers/index_spec.rb +23 -0
- data/spec/lib/tokenizers/query_spec.rb +32 -0
- metadata +7 -13
- data/lib/picky/tokenizers/default/index.rb +0 -7
- data/lib/picky/tokenizers/default/query.rb +0 -7
- data/lib/picky/umlaut_substituter.rb +0 -34
- data/spec/lib/tokenizers/default/index_spec.rb +0 -11
- data/spec/lib/tokenizers/default/query_spec.rb +0 -11
data/lib/picky/application.rb
CHANGED
@@ -1,55 +1,68 @@
 # The Picky application wherein the indexing and querying is defined.
 #
 class Application
+
   class << self
 
-
+    # Returns a configured tokenizer that
+    # is used for indexing by default.
+    #
+    def default_indexing options = {}
+      indexing.default_tokenizer options
+    end
 
-    #
-    #
+    # Returns a configured tokenizer that
+    # is used for querying by default.
+    #
+    def default_querying options = {}
+      querying.default_tokenizer options
+    end
+
+    # Routes.
     #
-
+    delegate :route, :root, :to => :routing
+    # Index, Field.
     #
-
-
-
-    end
-    def finalize_apps
-      @apps.each &:finalize
-    end
+    # TODO Rename category.
+    #
+    delegate :field, :to => :indexing
+    def category *args; indexing.field *args; end
+    def index *args; indexing.define_index *args; end
 
     # An application simply delegates to the routing to handle a request.
     #
     def call env
       routing.call env
     end
-
-    # Freezes the routes.
-    #
-    def finalize
-      routing.freeze
-    end
     def routing
       @routing ||= Routing.new
     end
-    # Routes.
-    #
-    delegate :route, :root, :to => :routing
-
-    # TODO Rename to default_indexing?
-    #
     def indexing
       @indexing ||= Configuration::Indexes.new
     end
-    def
-
+    def querying
+      @queries ||= Configuration::Queries.new
     end
-    delegate :type, :field, :to => :indexing
 
-    #
+    # Finalize the subclass as soon as it
+    # has finished loading.
     #
-
-
+    attr_reader :apps
+    def initialize_apps
+      @apps ||= []
+    end
+    def inherited app
+      initialize_apps
+      apps << app
+    end
+    def finalize_apps
+      initialize_apps
+      apps.each &:finalize
+    end
+    # Finalizes the routes.
+    #
+    def finalize
+      routing.freeze
     end
 
     # TODO Add more info.
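A minimal sketch of the new self-registration lifecycle shown above, with MySearch as a hypothetical application; Ruby fires Application.inherited when the subclass is defined, which records it for later finalization:

    # Defining a subclass triggers Application.inherited(MySearch),
    # which registers it in Application.apps.
    class MySearch < Application
    end

    Application.apps          # => [MySearch]
    Application.finalize_apps # calls MySearch.finalize, freezing its routes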
data/lib/picky/character_substitution/european.rb
ADDED
@@ -0,0 +1,33 @@
+# encoding: utf-8
+#
+module CharacterSubstitution
+  # Substitutes Umlauts like
+  # ä, ö, ü => ae, oe, ue.
+  # (and more, see specs)
+  #
+  class European
+
+    def initialize
+      @chars = ActiveSupport::Multibyte.proxy_class
+    end
+
+    def substitute text
+      trans = @chars.new(text).normalize(:kd)
+
+      # substitute special cases
+      #
+      trans.gsub!('ß', 'ss')
+
+      # substitute umlauts (of A,O,U,a,o,u)
+      #
+      trans.gsub!(/([AOUaou])\314\210/u, '\1e')
+
+      # get rid of ecutes, graves and …
+      #
+      trans.unpack('U*').select { |cp|
+        cp < 0x0300 || cp > 0x035F
+      }.pack('U*')
+    end
+
+  end
+end
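Assuming ActiveSupport is loaded (Picky loads it itself), the substituter can be exercised standalone; a small sketch of what the class above does:

    substituter = CharacterSubstitution::European.new
    substituter.substitute 'Müller'  # => "Mueller" (umlaut expanded to ue)
    substituter.substitute 'Straße'  # => "Strasse" (special case ß -> ss)
    substituter.substitute 'résumé'  # => "resume"  (combining accents stripped)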
data/lib/picky/configuration/field.rb
CHANGED
@@ -5,10 +5,10 @@ module Configuration
   #
   class Field
     attr_reader :name, :indexed_name, :virtual, :tokenizer
-    attr_accessor :type # convenience
-    def initialize name, tokenizer, options = {}
+    attr_accessor :type # convenience TODO Still needed?
+    def initialize name, options = {}
       @name = name.to_sym
-      @tokenizer = tokenizer
+      @tokenizer = options[:tokenizer] || Tokenizers::Index.default
 
       # TODO Dup the options?
 
data/lib/picky/configuration/indexes.rb
CHANGED
@@ -10,17 +10,13 @@ module Configuration
       @types = []
     end
 
-    def default_tokenizer
-
+    def default_tokenizer options = {}
+      Tokenizers::Index.default = Tokenizers::Index.new(options)
     end
 
-    # Delegates
-    #
-    delegate :removes_characters, :contracts_expressions, :stopwords, :splits_text_on, :normalizes_words, :removes_characters_after_splitting, :to => :default_tokenizer
-
     # TODO Rewrite all this configuration handling.
     #
-    def
+    def define_index name, source, *fields
       new_type = Type.new name, source, *fields
       types << new_type
       ::Indexes.configuration ||= self
@@ -30,9 +26,7 @@ module Configuration
       generated
     end
     def field name, options = {}
-
-
-      Field.new name, tokenizer, options
+      Field.new name, options
     end
 
     #
data/lib/picky/configuration/queries.rb
CHANGED
@@ -6,16 +6,8 @@ module Configuration
 
     #
    #
-    def default_tokenizer
-
-    end
-
-    delegate :removes_characters, :contracts_expressions, :stopwords, :splits_text_on, :normalizes_words, :removes_characters_after_splitting, :to => :default_tokenizer
-
-    # Delegates.
-    #
-    def maximum_tokens amount
-      Query::Tokens.maximum = amount
+    def default_tokenizer options = {}
+      Tokenizers::Query.default = Tokenizers::Query.new(options)
     end
 
   end
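Both configuration objects now simply build a tokenizer from the options hash and install it as the class-level default. In effect, inside an Application subclass, a call like the following (option values are illustrative):

    default_querying stopwords: /\b(and|the)\b/, maximum_tokens: 3

is equivalent to:

    Tokenizers::Query.default = Tokenizers::Query.new(stopwords: /\b(and|the)\b/, maximum_tokens: 3)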
data/lib/picky/index/bundle.rb
CHANGED
@@ -98,15 +98,10 @@ module Index
     end
     # Retrieves the data into the index.
     #
-    # TODO Beautify.
-    #
     def retrieve
-      files.retrieve do |
-        token.chomp!
-        token = token.to_sym
-
+      files.retrieve do |id, token|
         initialize_index_for token
-        index[token] <<
+        index[token] << id
       end
     end
     def initialize_index_for token
data/lib/picky/index/file/text.rb
CHANGED
@@ -13,10 +13,15 @@ module Index
       def dump hash
         raise "Can't dump to text file. Use JSON or Marshal."
       end
+
+      # Yields an id and a symbol token.
+      #
       def retrieve
+        id, token = nil, nil
         ::File.open(cache_path, 'r:binary') do |file|
           file.each_line do |line|
-
+            id, token = line.split ?,, 2
+            yield id.to_i, (token.chomp! || token).to_sym
           end
         end
       end
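The retrieval format is one id,token pair per line. The two-field limit on split keeps tokens that themselves contain commas intact, and the chomp! || token idiom copes with chomp! returning nil when nothing was removed. A standalone sketch of the parsing:

    line = "123456,some_nice_token\n"
    id, token = line.split ?,, 2    # ?, is the character literal for ','
    id.to_i                         # => 123456
    (token.chomp! || token).to_sym  # => :some_nice_token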
data/lib/picky/loader.rb
CHANGED
@@ -104,6 +104,10 @@ module Loader
     load_relative 'helpers/cache'
     load_relative 'helpers/measuring'
 
+    # Character Substitution
+    #
+    load_relative 'character_substitution/european'
+
     # Signal handling
     #
     load_relative 'signals'
@@ -111,7 +115,6 @@ module Loader
     # Various.
     #
     load_relative 'loggers/search'
-    load_relative 'umlaut_substituter'
 
     # Index generation strategies.
     #
@@ -180,9 +183,6 @@ module Loader
     load_relative 'tokenizers/index'
     load_relative 'tokenizers/query'
 
-    load_relative 'tokenizers/default/index'
-    load_relative 'tokenizers/default/query'
-
     # Query combinations, qualifiers, weigher.
     #
     load_relative 'query/combination'
data/lib/picky/query/base.rb
CHANGED
@@ -16,7 +16,7 @@ module Query
       options = Hash === index_types.last ? index_types.pop : {}
       @index_types = index_types
       @weigher = options[:weigher] || Weigher.new(index_types)
-      @tokenizer = options[:tokenizer] || Tokenizers::Default::Query
+      @tokenizer = options[:tokenizer] || Tokenizers::Query.default
       @weights = options[:weights] || Weights.new
     end
 
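Since the tokenizer is just an option here, a query can still be given its own tokenizer instead of the shared default. A sketch, assuming books_index is a defined index and that Query::Full passes its options through this initializer:

    Query::Full.new books_index,
                    tokenizer: Tokenizers::Query.new(maximum_tokens: 3)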
data/lib/picky/query/tokens.rb
CHANGED
@@ -6,11 +6,6 @@ module Query
   #
   class Tokens
 
-    #
-    #
-    cattr_accessor :maximum
-    self.maximum = 5
-
     # Basically delegates to its internal tokens array.
     #
     self.delegate *[Enumerable.instance_methods, :slice!, :[], :uniq!, :last, :reject!, :length, :size, :empty?, :each, :exit, { :to => :@tokens }].flatten
@@ -52,13 +47,11 @@ module Query
 
     # Caps the tokens to the maximum.
     #
-
-
-    def cap
-      @tokens.slice!(@@maximum..-1) if cap?
+    def cap maximum
+      @tokens.slice!(maximum..-1) if cap?(maximum)
     end
-    def cap?
-      @tokens.size > @@maximum
+    def cap? maximum
+      @tokens.size > maximum
     end
 
     # Rejects blank tokens.
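cap is now explicitly parameterized instead of reading a class variable; a sketch with hypothetical tokens t1..t4:

    tokens = Query::Tokens.new [t1, t2, t3, t4]
    tokens.cap 2   # slices off t3 and t4 via slice!(2..-1)
    tokens.cap 5   # no-op: size 2 is not greater than 5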
data/lib/picky/tokenizers/base.rb
CHANGED
@@ -79,6 +79,19 @@ module Tokenizers
       text.gsub! @removes_characters_after_splitting_regexp, '' if @removes_characters_after_splitting_regexp
     end
 
+    # Substitute Characters with this substituter.
+    #
+    # Default is European Character substitution.
+    #
+    def substitutes_characters_with substituter = CharacterSubstitution::European.new
+      # TODO Raise if it doesn't quack substitute?
+      @substituter = substituter
+    end
+    def substitute_characters text
+      substituter?? substituter.substitute(text) : text
+    end
+
+
     # Returns a number of tokens, generated from the given text.
     #
     # Note:
@@ -93,15 +106,20 @@ module Tokenizers
       process tokens # processing tokens / strings
     end
 
-
+    attr_reader :substituter
     alias substituter? substituter
 
-    def initialize
-
+    def initialize options = {}
+      removes_characters options[:removes_characters] if options[:removes_characters]
+      contracts_expressions *options[:contracts_expressions] if options[:contracts_expressions]
+      stopwords options[:stopwords] if options[:stopwords]
+      normalizes_words options[:normalizes_words] if options[:normalizes_words]
+      removes_characters_after_splitting options[:removes_characters_after_splitting] if options[:removes_characters_after_splitting]
+      substitutes_characters_with options[:substitutes_characters_with] if options[:substitutes_characters_with]
 
-      #
+      # Defaults.
       #
-      splits_text_on /\s/
+      splits_text_on options[:splits_text_on] || /\s/
     end
 
     # Hooks.
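With the new initializer, a tokenizer is configured in one constructor call rather than by invoking each configuration method separately; a sketch with illustrative option values:

    tokenizer = Tokenizers::Index.new removes_characters: /[^a-z0-9\s]/,
                                      stopwords: /\b(and|the)\b/,
                                      splits_text_on: /[\s,]+/
    # Omitted options keep their defaults; splits_text_on falls back
    # to /\s/ when not given.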
data/lib/picky/tokenizers/index.rb
CHANGED
@@ -5,6 +5,13 @@ module Tokenizers
   #
   class Index < Base
 
+    def self.default= new_default
+      @default = new_default
+    end
+    def self.default
+      @default ||= new
+    end
+
     # Default indexing preprocessing hook.
     #
     # Does:
@@ -15,7 +22,7 @@ module Tokenizers
     # 5. Remove non-single stopwords. (Stopwords that occur with other words)
     #
     def preprocess text
-      text =
+      text = substitute_characters text
       text.downcase!
       remove_illegals text
       contract text
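The class-level default is written lazily: reading it builds and memoizes a plain instance, and writing it replaces that instance. A sketch:

    Tokenizers::Index.default   # builds and memoizes Tokenizers::Index.new
    Tokenizers::Index.default = Tokenizers::Index.new(stopwords: /\b(the)\b/)
    Tokenizers::Index.default   # now returns the assigned instance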
data/lib/picky/tokenizers/query.rb
CHANGED
@@ -13,6 +13,20 @@ module Tokenizers
   #
   class Query < Base
 
+    def self.default= new_default
+      @default = new_default
+    end
+    def self.default
+      @default ||= new
+    end
+
+    attr_reader :maximum_tokens
+
+    def initialize options = {}
+      super options
+      @maximum_tokens = options[:maximum_tokens] || 5
+    end
+
     def preprocess text
       remove_illegals text # Remove illegal characters
       remove_non_single_stopwords text # remove stop words
@@ -33,9 +47,9 @@ module Tokenizers
     #
     def process tokens
       tokens.tokenize_with self
-      tokens.reject
-      tokens.cap
-      tokens.partialize_last
+      tokens.reject # Reject any tokens that don't meet criteria
+      tokens.cap maximum_tokens # Cut off superfluous tokens
+      tokens.partialize_last # Set certain tokens as partial
       tokens
     end
 
@@ -44,10 +58,10 @@ module Tokenizers
     # TODO Perhaps move to Normalizer?
     #
     def normalize text
-      text =
-      text.downcase!
-      normalize_with_patterns text
-      text.to_sym
+      text = substitute_characters text # Substitute special characters TODO Move to subclass
+      text.downcase! # Downcase all text
+      normalize_with_patterns text # normalize
+      text.to_sym # symbolize
     end
 
     # Returns a token for a word.
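The former Query::Tokens.maximum class setting is now a per-tokenizer option; a sketch:

    Tokenizers::Query.new.maximum_tokens                     # => 5 (default)
    Tokenizers::Query.new(maximum_tokens: 3).maximum_tokens  # => 3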
data/lib/tasks/server.rake
CHANGED
@@ -11,7 +11,7 @@ namespace :server do
     pid.blank? ? nil : pid.chomp
   end
 
-  desc "Start the unicorns. (Wehee!)"
+  # desc "Start the unicorns. (Wehee!)"
   task :start => :framework do
     chdir_to_root
     # Rake::Task[:"solr:start"].invoke # TODO Move to better place.
@@ -21,13 +21,13 @@ namespace :server do
     exec command
   end
 
-  desc "Stop the unicorns. (Blam!)"
+  # desc "Stop the unicorns. (Blam!)"
   task :stop => :framework do
     `kill -QUIT #{current_pid}` if current_pid
     # Rake::Task[:"solr:stop"].invoke # TODO Move to better place.
   end
 
-  desc "Restart the unicorns."
+  # desc "Restart the unicorns."
   task :restart do
     Rake::Task[:"server:stop"].invoke
     sleep 5
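Commenting out a task's desc hides it from the rake -T listing while the task itself stays invokable, which is how the shortcut tasks still reach it; for example:

    Rake::Task[:'server:start'].invoke  # works even without a desc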
data/lib/tasks/shortcuts.rake
CHANGED
@@ -1,20 +1,20 @@
-desc "
+desc "Generate the index."
 task :index => :application do
   Rake::Task[:'index:generate'].invoke
 end
 
-desc "
+desc "Try the given text in the indexer/query (type:field optional)."
 task :try, [:text, :type_and_field] => :application do |_, options|
   text, type_and_field = options.text, options.type_and_field
 
   Rake::Task[:'try:both'].invoke text, type_and_field
 end
 
-desc "
+desc "Start the server."
 task :start do
   Rake::Task[:'server:start'].invoke
 end
-desc "
+desc "Stop the server."
 task :stop do
   Rake::Task[:'server:stop'].invoke
 end
data/lib/tasks/spec.rake
CHANGED
@@ -3,7 +3,7 @@ require 'spec/rake/spectask'
 
 task :default => :spec
 
-desc "Run all specs
+desc "Run all specs"
 Spec::Rake::SpecTask.new(:spec) do |t|
   spec_root = File.join(File.dirname(__FILE__), '..', '..', 'spec')
   t.spec_opts = ['--options', "\"#{File.join(spec_root, 'spec.opts')}\""]
data/lib/tasks/try.rake
CHANGED
@@ -2,25 +2,23 @@
 #
 namespace :try do
 
-  desc "Try how a given word would be tokenized when indexing (type:field optional)."
+  # desc "Try how a given word would be tokenized when indexing (type:field optional)."
   task :index, [:text, :type_and_field] => :application do |_, options|
     text, type_and_field = options.text, options.type_and_field
 
-    tokenizer = type_and_field ? Indexes.find(*type_and_field.split(':')).tokenizer : Tokenizers::Default::Index
+    tokenizer = type_and_field ? Indexes.find(*type_and_field.split(':')).tokenizer : Tokenizers::Index.default
 
-    puts "\"#{text}\" is index tokenized as #{tokenizer.tokenize(text).to_a}"
+    puts "\"#{text}\" is index tokenized as #{tokenizer.tokenize(text.dup).to_a}"
   end
 
-  desc "Try how a given word would be tokenized when querying."
+  # desc "Try how a given word would be tokenized when querying."
   task :query, [:text] => :application do |_, options|
     text = options.text
 
-    #
-    #
-    puts "\"#{text}\" is query tokenized as #{Tokenizers::Default::Query.tokenize(text.dup).to_a.map(&:to_s).map(&:to_sym)}"
+    puts "\"#{text}\" is query tokenized as #{Tokenizers::Query.default.tokenize(text.dup).to_a.map(&:to_s).map(&:to_sym)}"
   end
 
-  desc "Try the given text with both the index and the query (type:field optional)."
+  # desc "Try the given text with both the index and the query (type:field optional)."
   task :both, [:text, :type_and_field] => :application do |_, options|
     text, type_and_field = options.text, options.type_and_field
 
data/project_prototype/app/application.rb
CHANGED
@@ -1,58 +1,46 @@
 # encoding: utf-8
 #
+# TODO Adapt the generated example
+# (a library books finder) to what you need.
+#
+# Check the Wiki http://github.com/floere/picky/wiki for more options.
+# Ask me or the google group if you have questions or specific requests.
+#
 class PickySearch < Application
 
-  #
-  #
-  #
-
+  # Indexing: How text is indexed.
+  # Querying: How query text is handled.
+  #
+  default_indexing removes_characters: /[^a-zA-Z0-9\s\/\-\"\&\.]/,
+                   stopwords: /\b(and|the|of|it|in|for)\b/,
+                   splits_text_on: /[\s\/\-\"\&\.]/
+
+  default_querying removes_characters: /[^a-zA-Z0-9\s\/\-\,\&\"\~\*\:]/, # Picky needs control chars *"~: to pass through.
+                   stopwords: /\b(and|the|of|it|in|for)\b/,
+                   splits_text_on: /[\s\/\-\,\&]+/,
+
+                   maximum_tokens: 5, # Max amount of tokens passing into a query. 5 is the default.
+                   substitutes_characters_with: CharacterSubstitution::European.new # Normalizes special user input, Ä -> Ae, ñ -> n etc.
+
+  # Define an index. Use a database etc. source? http://github.com/floere/picky/wiki/Sources-Configuration#sources
   #
-  # Ask me or the google group if you have questions or specific requests.
-  #
-
-  indexing.removes_characters(/[^a-zA-Z0-9\s\/\-\"\&\.]/)
-  indexing.stopwords(/\b(and|the|of|it|in|for)\b/)
-  indexing.splits_text_on(/[\s\/\-\"\&\.]/)
-
   books_index = index :books,
-                      Sources::CSV.new(:title, :author, :isbn, :year, :publisher, :subjects, :file => 'app/library.csv'),
-
-
-
-
-
-
-
-
-                      :similarity => Similarity::DoubleLevenshtone.new(3)), # Up to three similar title word indexed (default: no similarity).
-                      field(:author, :partial => Partial::Substring.new(:from => 1)),
-                      field(:isbn, :partial => Partial::None.new) # Partial substring searching on an ISBN makes not much sense, neither does similarity.
-
-  # Defines the maximum tokens (words) that pass through to the engine.
-  #
-  querying.maximum_tokens 5
+                      Sources::CSV.new(:title, :author, :isbn, :year, :publisher, :subjects, file: 'app/library.csv'),
+                      category(:title,
+                               partial: Partial::Substring.new(from: 1), # Indexes substrings upwards from character 1 (default: -3),
+                                                                         # You'll find "picky" even when entering just a "p".
+                               similarity: Similarity::DoubleLevenshtone.new(3)), # Up to three similar title word indexed (default: No similarity).
+                      category(:author,
+                               partial: Partial::Substring.new(from: 1)),
+                      category(:isbn,
+                               partial: Partial::None.new) # Partial substring searching on an ISBN makes not much sense, neither does similarity.
 
-  #
-  #
-  #
-  querying.removes_characters(/[^a-zA-Z0-9\s\/\-\,\&\"\~\*\:]/)
-  querying.stopwords(/\b(and|the|of|it|in|for)\b/)
-  querying.splits_text_on(/[\s\/\-\,\&]+/)
+  full_books = Query::Full.new books_index # A Full query returns ids, combinations, and counts.
+  live_books = Query::Live.new books_index # A Live query does return all that Full returns, except ids.
 
-
-  #
-  # A Full query returns ids, combinations, and counts.
-  # A Live query does return all that Full returns, without ids.
-  #
-  # Note: You can pass a query multiple indexes and it will combine them.
-  #
-  full_books = Query::Full.new books_index
-  live_books = Query::Live.new books_index
+  route %r{\A/books/full\Z} => full_books # Routing is simple: url_path_regexp => query
+  route %r{\A/books/live\Z} => live_books #
 
-  #
-  # A path regexp pointing to a query that will be run.
-  #
-  route %r{^/books/full} => full_books
-  route %r{^/books/live} => live_books
+  # Note: You can pass a query multiple indexes and it will query in all of them.
 
 end
data/spec/lib/application_spec.rb
CHANGED
@@ -5,32 +5,49 @@ require 'spec_helper'
 describe Application do
 
   describe "integration" do
+    it "should run ok" do
+      lambda {
+        class MinimalTestApplication < Application
+          books = index :books,
+                        Sources::DB.new('SELECT id, title FROM books', :file => 'app/db.yml'),
+                        category(:title)
+
+
+          full = Query::Full.new books
+          live = Query::Live.new books
+
+          route %r{^/books/full} => full
+          route %r{^/books/live} => live
+        end
+        Tokenizers::Index.default.tokenize 'some text'
+        Tokenizers::Query.default.tokenize 'some text'
+      }.should_not raise_error
+    end
     it "should run ok" do
       lambda {
         # TODO Add all possible cases.
         #
         class TestApplication < Application
-
-
-
-
-
+          default_indexing removes_characters: /[^a-zA-Z0-9\s\/\-\"\&\.]/,
+                           contracts_expressions: [/mr\.\s*|mister\s*/i, 'mr '],
+                           stopwords: /\b(and|the|of|it|in|for)\b/,
+                           splits_text_on: /[\s\/\-\"\&\.]/,
+                           removes_characters_after_splitting: /[\.]/
+
+          default_querying removes_characters: /[^a-zA-Z0-9äöü\s\/\-\,\&\"\~\*\:]/,
+                           stopwords: /\b(and|the|of|it|in|for)\b/,
+                           splits_text_on: /[\s\/\-\,\&]+/,
+                           normalizes_words: [[/Deoxyribonucleic Acid/i, 'DNA']],
+
+                           substitutes_characters_with: CharacterSubstitution::European.new,
+                           maximum_tokens: 5
 
-          books_index = index
-
-
-
+          books_index = index :books,
+                              Sources::DB.new('SELECT id, title, author, isbn13 as isbn FROM books', :file => 'app/db.yml'),
+                              category(:title, :similarity => Similarity::DoubleLevenshtone.new(3)), # Up to three similar title word indexed.
+                              category(:author),
+                              category(:isbn, :partial => Partial::None.new) # Partially searching on an ISBN makes not much sense.
 
-          # Note that Picky needs the following characters to
-          # pass through, as they are control characters: *"~:
-          #
-          querying.removes_characters(/[^a-zA-Z0-9äöü\s\/\-\,\&\"\~\*\:]/)
-          querying.stopwords(/\b(and|the|of|it|in|for)\b/)
-          querying.splits_text_on(/[\s\/\-\,\&]+/)
-          querying.normalizes_words([
-            [/Deoxyribonucleic Acid/i, 'DNA']
-          ])
-          querying.maximum_tokens 5
 
           full = Query::Full.new books_index
           live = Query::Live.new books_index
data/spec/lib/{umlaut_substituter_spec.rb → character_substitution/european_spec.rb}
CHANGED
@@ -1,9 +1,10 @@
 # encoding: utf-8
+#
 require 'spec_helper'
 
-describe UmlautSubstituter do
+describe CharacterSubstitution do
   before(:each) do
-    @substituter = UmlautSubstituter.new
+    @substituter = CharacterSubstitution::European.new
   end
 
   # A bit of metaprogramming to help with the myriads of its.
@@ -82,5 +83,16 @@ describe UmlautSubstituter do
     it_should_substitute 'å', 'a'
     it_should_substitute 'Å', 'A'
   end
+
+  describe "diacritic" do
+    it_should_substitute 'ñ', 'n'
+  end
+
+  describe "speed" do
+    it "is fast" do
+      result = performance_of { @substituter.substitute('ä') }
+      result.should < 0.00009
+    end
+  end
 
 end
data/spec/lib/configuration/field_spec.rb
CHANGED
@@ -5,7 +5,7 @@ describe Configuration::Field do
   describe "virtual?" do
     context "with virtual true" do
       before(:each) do
-        @field = Configuration::Field.new :some_name, :
+        @field = Configuration::Field.new :some_name, :virtual => true
       end
       it "returns the right value" do
         @field.virtual?.should == true
@@ -13,7 +13,7 @@ describe Configuration::Field do
     end
     context "with virtual object" do
       before(:each) do
-        @field = Configuration::Field.new :some_name, :
+        @field = Configuration::Field.new :some_name, :virtual => 123.6
       end
       it "returns the right value" do
         @field.virtual?.should == true
@@ -39,7 +39,7 @@ describe Configuration::Field do
   describe "tokenizer" do
     context "with specific tokenizer" do
       before(:each) do
-        @field = Configuration::Field.new :some_name, Tokenizers::Index.new
+        @field = Configuration::Field.new :some_name, tokenizer: Tokenizers::Index.new
 
         @field.type = :some_type
       end
@@ -54,7 +54,7 @@ describe Configuration::Field do
   describe "indexer" do
     context "with default indexer" do
       before(:each) do
-        @field = Configuration::Field.new :some_name
+        @field = Configuration::Field.new :some_name
       end
       it "caches" do
         @field.indexer.should == @field.indexer
@@ -62,7 +62,7 @@ describe Configuration::Field do
     end
     context "with specific indexer" do
      before(:each) do
-        @field = Configuration::Field.new :some_name, :
+        @field = Configuration::Field.new :some_name, tokenizer: Indexers::Default
 
         @field.type = :some_type
       end
@@ -81,7 +81,7 @@ describe Configuration::Field do
     end
   end
   describe "cache" do
     before(:each) do
-      @field = Configuration::Field.new :some_name
+      @field = Configuration::Field.new :some_name
       @field.stub! :prepare_cache_directory
 
       @generated = stub :generated, :generate_caches => nil
@@ -100,7 +100,7 @@ describe Configuration::Field do
   end
   describe "prepare_cache_directory" do
     before(:each) do
-      @field = Configuration::Field.new :some_name
+      @field = Configuration::Field.new :some_name
 
       @field.stub! :cache_directory => :some_cache_directory
     end
@@ -112,7 +112,7 @@ describe Configuration::Field do
   end
   describe "index" do
     before(:each) do
-      @field = Configuration::Field.new :some_name
+      @field = Configuration::Field.new :some_name
       @field.stub! :prepare_cache_directory
 
      @indexer = stub :indexer, :index => nil
@@ -132,7 +132,7 @@ describe Configuration::Field do
   describe "source" do
     context "with source" do
       before(:each) do
-        @field = Configuration::Field.new :some_name, :
+        @field = Configuration::Field.new :some_name, :source => :some_given_source
 
         @type = stub :type, :name => :some_type
         @field.type = @type
@@ -143,7 +143,7 @@ describe Configuration::Field do
     end
     context "without source" do
       before(:each) do
-        @field = Configuration::Field.new :some_name
+        @field = Configuration::Field.new :some_name
 
         @type = stub :type, :name => :some_type, :source => :some_type_source
         @field.type = @type
@@ -155,7 +155,7 @@ describe Configuration::Field do
     end
     context "name symbol" do
       before(:each) do
-        @field = Configuration::Field.new :some_name
+        @field = Configuration::Field.new :some_name
 
         @type = stub :type, :name => :some_type
         @field.type = @type
@@ -189,7 +189,7 @@ describe Configuration::Field do
     end
     context "name string" do
       before(:each) do
-        @field = Configuration::Field.new 'some_name'
+        @field = Configuration::Field.new 'some_name'
       end
       describe "generate_qualifiers_from" do
         context "without qualifiers" do
data/spec/lib/configuration/indexes_spec.rb
CHANGED
@@ -18,47 +18,10 @@ describe Configuration::Indexes do
 
   describe "default_tokenizer" do
     it "is a default tokenizer" do
-      @config.default_tokenizer.should
+      @config.default_tokenizer.should be_kind_of(Tokenizers::Index)
     end
-    it "
-      @config.default_tokenizer.
-    end
-  end
-
-  describe "delegates" do
-    before(:each) do
-      @receiver = mock :receiver
-      @config.stub! :default_tokenizer => @receiver
-    end
-    it "delegates" do
-      @receiver.should_receive(:removes_characters).once
-
-      @config.removes_characters
-    end
-    it "delegates" do
-      @receiver.should_receive(:contracts_expressions).once
-
-      @config.contracts_expressions
-    end
-    it "delegates" do
-      @receiver.should_receive(:stopwords).once
-
-      @config.stopwords
-    end
-    it "delegates" do
-      @receiver.should_receive(:splits_text_on).once
-
-      @config.splits_text_on
-    end
-    it "delegates" do
-      @receiver.should_receive(:normalizes_words).once
-
-      @config.normalizes_words
-    end
-    it "delegates" do
-      @receiver.should_receive(:removes_characters_after_splitting).once
-
-      @config.removes_characters_after_splitting
+    it "does not cache" do
+      @config.default_tokenizer.should_not == @config.default_tokenizer
     end
   end
 
data/spec/lib/extensions/array_spec.rb
CHANGED
@@ -51,7 +51,7 @@ describe Array do
       [:test1, :test1, :test2, :test2, :test3].clustered_uniq.should == [:test1, :test2, :test3]
     end
     it "is fast" do
-      performance_of { [:test1, :test1, :test2, :test2, :test3].clustered_uniq }.should < 0.
+      performance_of { [:test1, :test1, :test2, :test2, :test3].clustered_uniq }.should < 0.000012
     end
   end
 
data/spec/lib/index/file/text_spec.rb
CHANGED
@@ -21,7 +21,20 @@ describe Index::File::Text do
     end
   end
   describe "retrieve" do
-
+    before(:each) do
+      @io = stub :io
+      @io.should_receive(:each_line).once.with.and_yield '123456,some_nice_token'
+      File.should_receive(:open).any_number_of_times.and_yield @io
+    end
+    it "yields split lines and returns the id and token text" do
+      @file.retrieve do |id, token|
+        id.should == 123456
+        token.should == :some_nice_token
+      end
+    end
+    it "is fast" do
+      performance_of { @file.retrieve { |id, token| } }.should < 0.00005
+    end
   end
 
 end
data/spec/lib/query/combination_spec.rb
CHANGED
@@ -46,7 +46,7 @@ describe 'Query::Combination' do
       @combination = Query::Combination.new token, @category
     end
     it 'should return a correct result' do
-      @combination.to_result.should == [:some_category_name, 'Blä~', :
+      @combination.to_result.should == [:some_category_name, 'Blä~', :blä] # Note: Characters not substituted. That's ok.
     end
   end
   it 'should return a correct result' do
data/spec/lib/query/tokens_spec.rb
CHANGED
@@ -36,31 +36,26 @@ describe Query::Tokens do
       @tokens.instance_variable_get(:@tokens).should == [@nonblank, @nonblank]
     end
   end
-
-  describe "class variables" do
-    describe "maximal query words" do
-      it "should answer" do
-        lambda { Query::Tokens.maximum }.should_not raise_error
-      end
-    end
-  end
-
+
   describe 'cap' do
     context 'one token' do
       before(:each) do
         @token = Query::Token.processed 'Token'
         @tokens = Query::Tokens.new [@token]
       end
-      it '
-        @tokens.cap
-
+      it 'does not cut it down' do
+        @tokens.cap 5
+
         @tokens.instance_variable_get(:@tokens).should == [@token]
       end
+      it 'cuts it down' do
+        @tokens.cap 0
+
+        @tokens.instance_variable_get(:@tokens).should == []
+      end
     end
     context 'many tokens' do
       before(:each) do
-        @old_maximum = Query::Tokens.maximum
-        Query::Tokens.maximum = 3
         @first = Query::Token.processed 'Hello'
         @second = Query::Token.processed 'I'
         @third = Query::Token.processed 'Am'
@@ -72,12 +67,9 @@ describe Query::Tokens do
         Query::Token.processed('Token')
       ]
     end
-    after(:each) do
-      Query::Tokens.maximum = @old_maximum
-    end
     it 'should cap the number of tokens' do
-      @tokens.cap
-
+      @tokens.cap 3
+
       @tokens.instance_variable_get(:@tokens).should == [@first, @second, @third]
     end
   end
data/spec/lib/tokenizers/base_spec.rb
CHANGED
@@ -1,4 +1,5 @@
-#
+# encoding: utf-8
+#
 require 'spec_helper'
 
 describe Tokenizers::Base do
@@ -7,6 +8,22 @@ describe Tokenizers::Base do
     @tokenizer = Tokenizers::Base.new
   end
 
+  describe "substitute(s)_characters*" do
+    it "doesn't substitute if there is no substituter" do
+      @tokenizer.substitute_characters('abcdefghijklmnopqrstuvwxyzäöü').should == 'abcdefghijklmnopqrstuvwxyzäöü'
+    end
+    it "uses the substituter to replace characters" do
+      @tokenizer.substitutes_characters_with CharacterSubstitution::European.new
+
+      @tokenizer.substitute_characters('abcdefghijklmnopqrstuvwxyzäöü').should == 'abcdefghijklmnopqrstuvwxyzaeoeue'
+    end
+    it "uses the european substituter as default" do
+      @tokenizer.substitutes_characters_with
+
+      @tokenizer.substitute_characters('abcdefghijklmnopqrstuvwxyzäöü').should == 'abcdefghijklmnopqrstuvwxyzaeoeue'
+    end
+  end
+
   describe "removes_characters_after_splitting" do
     context "without removes_characters_after_splitting called" do
       it "has remove_after_normalizing_illegals" do
@@ -64,6 +81,9 @@ describe Tokenizers::Base do
     it 'should define a method split that splits by default on \s' do
       @tokenizer.split('a b/c.d').should == ['a', 'b/c.d']
     end
+    it 'splits text on /\s/ by default' do
+      @tokenizer.split('this is a test').should == ['this', 'is', 'a', 'test']
+    end
   end
   context "with removes_characters called" do
     before(:each) do
data/spec/lib/tokenizers/index_spec.rb
CHANGED
@@ -8,6 +8,29 @@ describe Tokenizers::Index do
     @tokenizer = Tokenizers::Index.new
   end
 
+  describe "default*" do
+    before(:all) do
+      @old = Tokenizers::Index.default
+    end
+    after(:all) do
+      Tokenizers::Index.default = @old
+    end
+    it "has a reader" do
+      lambda { Tokenizers::Index.default }.should_not raise_error
+    end
+    it "returns by default a new Index" do
+      Tokenizers::Index.default.should be_kind_of(Tokenizers::Index)
+    end
+    it "has a writer" do
+      lambda { Tokenizers::Index.default = :bla }.should_not raise_error
+    end
+    it "returns what has been written, if something has been written" do
+      Tokenizers::Index.default = :some_default
+
+      Tokenizers::Index.default.should == :some_default
+    end
+  end
+
   describe "remove_removes_characters" do
     it "should not remove ' from a query by default" do
       @tokenizer.remove_illegals("Lugi's").should == "Lugi's"
data/spec/lib/tokenizers/query_spec.rb
CHANGED
@@ -7,6 +7,38 @@ describe Tokenizers::Query do
     @tokenizer = Tokenizers::Query.new
   end
 
+  describe "default*" do
+    before(:all) do
+      @old = Tokenizers::Query.default
+    end
+    after(:all) do
+      Tokenizers::Query.default = @old
+    end
+    it "has a reader" do
+      lambda { Tokenizers::Query.default }.should_not raise_error
+    end
+    it "returns by default a new Index" do
+      Tokenizers::Query.default.should be_kind_of(Tokenizers::Query)
+    end
+    it "has a writer" do
+      lambda { Tokenizers::Query.default = :bla }.should_not raise_error
+    end
+    it "returns what has been written, if something has been written" do
+      Tokenizers::Query.default = :some_default
+
+      Tokenizers::Query.default.should == :some_default
+    end
+  end
+
+  describe "maximum_tokens" do
+    it "should be set to 5 by default" do
+      @tokenizer.maximum_tokens.should == 5
+    end
+    it "should be settable" do
+      Tokenizers::Query.new(maximum_tokens: 3).maximum_tokens.should == 3
+    end
+  end
+
   describe 'preprocess' do
     it 'should call methods in order' do
       text = stub :text
metadata
CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
   prerelease: false
   segments:
   - 0
-  - 9
-  - 4
-  version: 0.9.4
+  - 10
+  - 0
+  version: 0.10.0
 platform: ruby
 authors:
 - Florian Hanke
@@ -14,7 +14,7 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2010-10-
+date: 2010-10-31 00:00:00 +02:00
 default_executable: picky
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -59,6 +59,7 @@ files:
 - lib/picky/cacher/weights/logarithmic.rb
 - lib/picky/cacher/weights/strategy.rb
 - lib/picky/cacher/weights_generator.rb
+- lib/picky/character_substitution/european.rb
 - lib/picky/configuration/field.rb
 - lib/picky/configuration/indexes.rb
 - lib/picky/configuration/queries.rb
@@ -118,11 +119,8 @@ files:
 - lib/picky/sources/db.rb
 - lib/picky/sources/delicious.rb
 - lib/picky/tokenizers/base.rb
-- lib/picky/tokenizers/default/index.rb
-- lib/picky/tokenizers/default/query.rb
 - lib/picky/tokenizers/index.rb
 - lib/picky/tokenizers/query.rb
-- lib/picky/umlaut_substituter.rb
 - lib/picky-tasks.rb
 - lib/picky.rb
 - lib/tasks/application.rake
@@ -161,6 +159,7 @@ files:
 - spec/lib/cacher/similarity_generator_spec.rb
 - spec/lib/cacher/weights/logarithmic_spec.rb
 - spec/lib/cacher/weights_generator_spec.rb
+- spec/lib/character_substitution/european_spec.rb
 - spec/lib/configuration/field_spec.rb
 - spec/lib/configuration/indexes_spec.rb
 - spec/lib/configuration/type_spec.rb
@@ -210,11 +209,8 @@ files:
 - spec/lib/sources/db_spec.rb
 - spec/lib/sources/delicious_spec.rb
 - spec/lib/tokenizers/base_spec.rb
-- spec/lib/tokenizers/default/index_spec.rb
-- spec/lib/tokenizers/default/query_spec.rb
 - spec/lib/tokenizers/index_spec.rb
 - spec/lib/tokenizers/query_spec.rb
-- spec/lib/umlaut_substituter_spec.rb
 - spec/specific/speed_spec.rb
 - bin/picky
 has_rdoc: true
@@ -261,6 +257,7 @@ test_files:
 - spec/lib/cacher/similarity_generator_spec.rb
 - spec/lib/cacher/weights/logarithmic_spec.rb
 - spec/lib/cacher/weights_generator_spec.rb
+- spec/lib/character_substitution/european_spec.rb
 - spec/lib/configuration/field_spec.rb
 - spec/lib/configuration/indexes_spec.rb
 - spec/lib/configuration/type_spec.rb
@@ -310,9 +307,6 @@ test_files:
 - spec/lib/sources/db_spec.rb
 - spec/lib/sources/delicious_spec.rb
 - spec/lib/tokenizers/base_spec.rb
-- spec/lib/tokenizers/default/index_spec.rb
-- spec/lib/tokenizers/default/query_spec.rb
 - spec/lib/tokenizers/index_spec.rb
 - spec/lib/tokenizers/query_spec.rb
-- spec/lib/umlaut_substituter_spec.rb
 - spec/specific/speed_spec.rb
data/lib/picky/umlaut_substituter.rb
DELETED
@@ -1,34 +0,0 @@
-# encoding: utf-8
-#
-
-# Substitutes certain umlauts, like
-# ä, ö, ü => ae, oe, ue.
-# (and more, see specs)
-#
-class UmlautSubstituter
-
-  attr_reader :chars
-
-  def initialize
-    @chars = ActiveSupport::Multibyte.proxy_class
-  end
-
-  def substitute text
-    trans = chars.new(text).normalize(:kd)
-
-    # substitute special cases
-    #
-    trans.gsub!('ß', 'ss')
-
-    # substitute umlauts (of A,O,U,a,o,u)
-    #
-    trans.gsub!(/([AOUaou])\314\210/u, '\1e')
-
-    # get rid of ecutes, graves and …
-    #
-    trans.unpack('U*').select { |cp|
-      cp < 0x0300 || cp > 0x035F
-    }.pack('U*')
-  end
-
-end