picky 3.0.1 → 3.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/picky/application.rb +12 -12
- data/lib/picky/backends/backend.rb +17 -0
- data/lib/picky/{backend → backends}/file/basic.rb +1 -1
- data/lib/picky/{backend → backends}/file/json.rb +1 -1
- data/lib/picky/{backend → backends}/file/marshal.rb +1 -1
- data/lib/picky/{backend → backends}/file/text.rb +1 -1
- data/lib/picky/backends/memory.rb +53 -0
- data/lib/picky/{backend → backends}/redis/basic.rb +9 -14
- data/lib/picky/backends/redis/float_hash.rb +26 -0
- data/lib/picky/{backend → backends}/redis/list_hash.rb +7 -11
- data/lib/picky/{backend → backends}/redis/string_hash.rb +7 -11
- data/lib/picky/backends/redis.rb +87 -0
- data/lib/picky/bundle.rb +107 -11
- data/lib/picky/category.rb +5 -5
- data/lib/picky/index.rb +329 -0
- data/lib/picky/index_indexed.rb +31 -0
- data/lib/picky/index_indexing.rb +161 -0
- data/lib/picky/indexed/bundle.rb +112 -0
- data/lib/picky/indexed/wrappers/exact_first.rb +1 -1
- data/lib/picky/indexers/parallel.rb +2 -1
- data/lib/picky/indexers/serial.rb +2 -1
- data/lib/picky/indexes_indexing.rb +1 -1
- data/lib/picky/indexing/bundle.rb +188 -0
- data/lib/picky/indexing/wrappers/category/location.rb +1 -1
- data/lib/picky/interfaces/live_parameters.rb +8 -8
- data/lib/picky/loader.rb +24 -38
- data/lib/picky/migrations/from_30_to_31.rb +61 -0
- data/lib/picky/query/allocation.rb +10 -5
- data/lib/picky/query/combinations.rb +70 -0
- data/lib/picky/query/indexes.rb +8 -7
- data/lib/picky/query/indexes_check.rb +47 -0
- data/lib/picky/query/token.rb +16 -29
- data/lib/picky/query/tokens.rb +4 -20
- data/lib/picky/search.rb +51 -58
- data/lib/picky/tokenizer.rb +231 -0
- data/lib/picky/tokenizers/location.rb +1 -1
- data/lib/tasks/try.rake +4 -12
- data/lib/tasks/try.rb +37 -0
- data/spec/lib/application_spec.rb +5 -5
- data/spec/lib/{backend → backends}/file/basic_spec.rb +2 -2
- data/spec/lib/{backend → backends}/file/json_spec.rb +2 -2
- data/spec/lib/{backend → backends}/file/marshal_spec.rb +2 -2
- data/spec/lib/{backend → backends}/file/text_spec.rb +1 -1
- data/spec/lib/backends/memory_spec.rb +77 -0
- data/spec/lib/{backend → backends}/redis/basic_spec.rb +19 -21
- data/spec/lib/backends/redis/float_hash_spec.rb +38 -0
- data/spec/lib/backends/redis/list_hash_spec.rb +27 -0
- data/spec/lib/backends/redis/string_hash_spec.rb +38 -0
- data/spec/lib/backends/redis_spec.rb +79 -0
- data/spec/lib/categories_indexed_spec.rb +3 -3
- data/spec/lib/category_indexed_spec.rb +6 -6
- data/spec/lib/category_indexing_spec.rb +1 -1
- data/spec/lib/category_spec.rb +1 -1
- data/spec/lib/frontend_adapters/rack_spec.rb +2 -2
- data/spec/lib/{indexes/index_indexed_spec.rb → index_indexed_spec.rb} +1 -1
- data/spec/lib/{indexes/index_indexing_spec.rb → index_indexing_spec.rb} +1 -1
- data/spec/lib/{indexes/index_spec.rb → index_spec.rb} +1 -1
- data/spec/lib/indexed/{bundle/memory_spec.rb → memory_spec.rb} +18 -18
- data/spec/lib/indexed/wrappers/exact_first_spec.rb +2 -2
- data/spec/lib/indexing/{bundle/memory_partial_generation_speed_spec.rb → bundle_partial_generation_speed_spec.rb} +3 -3
- data/spec/lib/indexing/bundle_spec.rb +302 -0
- data/spec/lib/query/allocation_spec.rb +21 -11
- data/spec/lib/query/combination_spec.rb +2 -2
- data/spec/lib/query/{combinations/base_spec.rb → combinations_spec.rb} +1 -1
- data/spec/lib/query/indexes_check_spec.rb +25 -0
- data/spec/lib/query/indexes_spec.rb +5 -1
- data/spec/lib/query/token_spec.rb +18 -20
- data/spec/lib/query/tokens_spec.rb +14 -65
- data/spec/lib/search_spec.rb +36 -37
- data/spec/lib/tasks/try_spec.rb +51 -0
- data/spec/lib/{tokenizers/base_spec.rb → tokenizer_spec.rb} +15 -44
- metadata +64 -81
- data/lib/picky/backend/base.rb +0 -121
- data/lib/picky/backend/files.rb +0 -28
- data/lib/picky/backend/redis.rb +0 -44
- data/lib/picky/indexed/bundle/base.rb +0 -47
- data/lib/picky/indexed/bundle/memory.rb +0 -88
- data/lib/picky/indexed/bundle/redis.rb +0 -91
- data/lib/picky/indexes/index.rb +0 -328
- data/lib/picky/indexes/index_indexed.rb +0 -35
- data/lib/picky/indexes/index_indexing.rb +0 -165
- data/lib/picky/indexes/memory.rb +0 -20
- data/lib/picky/indexes/redis.rb +0 -20
- data/lib/picky/indexing/bundle/base.rb +0 -242
- data/lib/picky/indexing/bundle/memory.rb +0 -26
- data/lib/picky/indexing/bundle/redis.rb +0 -26
- data/lib/picky/query/combinations/base.rb +0 -74
- data/lib/picky/query/combinations/memory.rb +0 -52
- data/lib/picky/query/combinations/redis.rb +0 -90
- data/lib/picky/query.rb +0 -6
- data/lib/picky/tokenizers/base.rb +0 -231
- data/lib/picky/tokenizers/index.rb +0 -34
- data/lib/picky/tokenizers/query.rb +0 -61
- data/spec/lib/backend/files_spec.rb +0 -189
- data/spec/lib/backend/redis/list_hash_spec.rb +0 -40
- data/spec/lib/backend/redis/string_hash_spec.rb +0 -47
- data/spec/lib/backend/redis_spec.rb +0 -170
- data/spec/lib/indexed/bundle/redis_spec.rb +0 -41
- data/spec/lib/indexes/redis_spec.rb +0 -15
- data/spec/lib/indexing/bundle/base_spec.rb +0 -38
- data/spec/lib/indexing/bundle/memory_spec.rb +0 -287
- data/spec/lib/indexing/bundle/redis_spec.rb +0 -283
- data/spec/lib/query/combinations/memory_spec.rb +0 -158
- data/spec/lib/query/combinations/redis_spec.rb +0 -172
- data/spec/lib/tokenizers/index_spec.rb +0 -69
- data/spec/lib/tokenizers/query_spec.rb +0 -121
data/lib/picky/query/tokens.rb
CHANGED
@@ -20,17 +20,16 @@ module Picky
|
|
20
20
|
|
21
21
|
# Creates a new Tokens object from a number of Strings.
|
22
22
|
#
|
23
|
-
|
24
|
-
|
25
|
-
#
|
26
|
-
def self.processed words, downcase = true
|
27
|
-
new words.collect! { |word| Token.processed word, downcase }
|
23
|
+
def self.processed words, originals
|
24
|
+
new words.zip(originals).collect! { |word, original| Token.processed word, original }
|
28
25
|
end
|
29
26
|
|
30
27
|
# Tokenizes each token.
|
31
28
|
#
|
32
29
|
# Note: Passed tokenizer needs to offer #normalize(text).
|
33
30
|
#
|
31
|
+
# TODO Still needed?
|
32
|
+
#
|
34
33
|
def tokenize_with tokenizer
|
35
34
|
@tokens.each { |token| token.tokenize_with(tokenizer) }
|
36
35
|
end
|
@@ -63,21 +62,6 @@ module Picky
|
|
63
62
|
@tokens.last.partial = true unless empty?
|
64
63
|
end
|
65
64
|
|
66
|
-
# Caps the tokens to the maximum.
|
67
|
-
#
|
68
|
-
def cap maximum
|
69
|
-
@tokens.slice!(maximum..-1) if cap?(maximum)
|
70
|
-
end
|
71
|
-
def cap? maximum
|
72
|
-
@tokens.size > maximum
|
73
|
-
end
|
74
|
-
|
75
|
-
# Rejects blank tokens.
|
76
|
-
#
|
77
|
-
def reject
|
78
|
-
@tokens.reject! &:blank?
|
79
|
-
end
|
80
|
-
|
81
65
|
#
|
82
66
|
#
|
83
67
|
def categorize mapper
|
data/lib/picky/search.rb
CHANGED
@@ -18,7 +18,8 @@ module Picky
|
|
18
18
|
include Helpers::Measuring
|
19
19
|
|
20
20
|
attr_reader :indexes
|
21
|
-
attr_accessor :tokenizer,
|
21
|
+
attr_accessor :tokenizer,
|
22
|
+
:weights
|
22
23
|
|
23
24
|
# Takes:
|
24
25
|
# * A number of indexes
|
@@ -29,16 +30,17 @@ module Picky
|
|
29
30
|
# It is also possible to define the tokenizer and weights like so.
|
30
31
|
# Example:
|
31
32
|
# search = Search.new(index1, index2, index3) do
|
32
|
-
# searching removes_characters: /[^a-z]
|
33
|
-
# weights [:author, :title] => +3,
|
33
|
+
# searching removes_characters: /[^a-z]/ # etc.
|
34
|
+
# weights [:author, :title] => +3,
|
35
|
+
# [:title, :isbn] => +1
|
34
36
|
# end
|
35
37
|
#
|
36
38
|
def initialize *index_definitions
|
37
|
-
@indexes = Query::Indexes.new *index_definitions
|
39
|
+
@indexes = Query::Indexes.new *index_definitions
|
38
40
|
|
39
41
|
instance_eval(&Proc.new) if block_given?
|
40
42
|
|
41
|
-
@tokenizer ||=
|
43
|
+
@tokenizer ||= Tokenizer.query_default # THINK Not dynamic. Ok?
|
42
44
|
@weights ||= Query::Weights.new
|
43
45
|
|
44
46
|
self
|
@@ -58,28 +60,50 @@ module Picky
|
|
58
60
|
@tokenizer = if options.respond_to?(:tokenize)
|
59
61
|
options
|
60
62
|
else
|
61
|
-
options &&
|
63
|
+
options && Tokenizer.new(options)
|
62
64
|
end
|
63
65
|
end
|
64
66
|
|
65
|
-
#
|
67
|
+
# Examples:
|
66
68
|
# search = Search.new(books_index, dvd_index, mp3_index) do
|
67
69
|
# boost [:author, :title] => +3,
|
68
70
|
# [:title, :isbn] => +1
|
69
71
|
# end
|
70
72
|
#
|
73
|
+
# or
|
74
|
+
#
|
75
|
+
# # Explicitly add a random number (0...1) to the weights.
|
76
|
+
# #
|
77
|
+
# my_weights = Class.new do
|
78
|
+
# # Instance only needs to implement
|
79
|
+
# # score_for combinations
|
80
|
+
# # and return a number that is
|
81
|
+
# # added to the weight.
|
82
|
+
# #
|
83
|
+
# def score_for combinations
|
84
|
+
# rand
|
85
|
+
# end
|
86
|
+
# end.new
|
87
|
+
#
|
88
|
+
# search = Search.new(books_index, dvd_index, mp3_index) do
|
89
|
+
# boost my_weights
|
90
|
+
# end
|
91
|
+
#
|
71
92
|
def boost weights
|
72
|
-
weights
|
73
|
-
|
93
|
+
@weights = if weights.respond_to?(:score_for)
|
94
|
+
weights
|
95
|
+
else
|
96
|
+
Query::Weights.new weights
|
97
|
+
end
|
74
98
|
end
|
75
99
|
|
76
100
|
# This is the main entry point for a query.
|
77
101
|
# Use this in specs and also for running queries.
|
78
102
|
#
|
79
103
|
# Parameters:
|
80
|
-
# * text:
|
81
|
-
# * ids = 20:
|
82
|
-
# * offset = 0:
|
104
|
+
# * text: The search text.
|
105
|
+
# * ids = 20: The amount of ids to calculate (with offset).
|
106
|
+
# * offset = 0: The offset from which position to return the ids. Useful for pagination.
|
83
107
|
#
|
84
108
|
# Note: The Rack adapter calls this method after unravelling the HTTP request.
|
85
109
|
#
|
@@ -89,7 +113,7 @@ module Picky
|
|
89
113
|
|
90
114
|
# Runs the actual search using Query::Tokens.
|
91
115
|
#
|
92
|
-
# Note: Internal method, use #search
|
116
|
+
# Note: Internal method, use #search to search.
|
93
117
|
#
|
94
118
|
def search_with tokens, ids = 20, offset = 0, original_text = nil
|
95
119
|
results = nil
|
@@ -104,7 +128,7 @@ module Picky
|
|
104
128
|
|
105
129
|
# Execute a search using Query::Tokens.
|
106
130
|
#
|
107
|
-
# Note: Internal method, use #search.
|
131
|
+
# Note: Internal method, use #search to search.
|
108
132
|
#
|
109
133
|
def execute tokens, ids, offset, original_text = nil
|
110
134
|
Results.from original_text, ids, offset, sorted_allocations(tokens)
|
@@ -113,10 +137,16 @@ module Picky
|
|
113
137
|
# Delegates the tokenizing to the query tokenizer.
|
114
138
|
#
|
115
139
|
# Parameters:
|
116
|
-
# * text: The
|
140
|
+
# * text: The string to tokenize.
|
141
|
+
#
|
142
|
+
# Returns:
|
143
|
+
# * A Picky::Query::Tokens instance.
|
117
144
|
#
|
118
145
|
def tokenized text
|
119
|
-
tokenizer.tokenize text
|
146
|
+
tokens, originals = tokenizer.tokenize text
|
147
|
+
tokens = Query::Tokens.processed tokens, originals || tokens
|
148
|
+
tokens.partialize_last # Note: In the standard Picky search, the last token is always partial.
|
149
|
+
tokens
|
120
150
|
end
|
121
151
|
|
122
152
|
# Gets sorted allocations for the tokens.
|
@@ -125,52 +155,15 @@ module Picky
|
|
125
155
|
indexes.prepared_allocations_for tokens, weights
|
126
156
|
end
|
127
157
|
|
128
|
-
# Returns the right combinations strategy for
|
129
|
-
# a number of query indexes.
|
130
|
-
#
|
131
|
-
# Currently it isn't possible using Memory and Redis etc.
|
132
|
-
# indexes in the same query index group.
|
133
|
-
#
|
134
|
-
# Picky will raise a Query::Indexes::DifferentTypesError.
|
135
|
-
#
|
136
|
-
@@mapping = {
|
137
|
-
Indexes::Memory => Query::Combinations::Memory,
|
138
|
-
Indexes::Redis => Query::Combinations::Redis
|
139
|
-
}
|
140
|
-
def combinations_type_for index_definitions_ary
|
141
|
-
index_types = extract_index_types index_definitions_ary
|
142
|
-
!index_types.empty? && @@mapping[*index_types] || Query::Combinations::Memory
|
143
|
-
end
|
144
|
-
def extract_index_types index_definitions_ary
|
145
|
-
index_types = index_definitions_ary.map(&:class)
|
146
|
-
index_types.uniq!
|
147
|
-
check_index_types index_types
|
148
|
-
index_types
|
149
|
-
end
|
150
|
-
def check_index_types index_types
|
151
|
-
raise_different index_types if index_types.size > 1
|
152
|
-
end
|
153
|
-
# Currently it isn't possible using Memory and Redis etc.
|
154
|
-
# indexes in the same query index group.
|
155
|
-
#
|
156
|
-
class DifferentTypesError < StandardError
|
157
|
-
def initialize types
|
158
|
-
@types = types
|
159
|
-
end
|
160
|
-
def to_s
|
161
|
-
"Currently it isn't possible to mix #{@types.join(" and ")} Indexes in the same Search instance."
|
162
|
-
end
|
163
|
-
end
|
164
|
-
def raise_different index_types
|
165
|
-
raise DifferentTypesError.new(index_types)
|
166
|
-
end
|
167
|
-
|
168
158
|
# Display some nice information for the user.
|
169
159
|
#
|
170
160
|
def to_s
|
171
161
|
s = "#{self.class}("
|
172
|
-
|
173
|
-
|
162
|
+
unless @indexes.indexes.empty?
|
163
|
+
s << @indexes.indexes.map(&:name).join(', ')
|
164
|
+
s << ", "
|
165
|
+
end
|
166
|
+
s << "weights: #{@weights}"
|
174
167
|
s << ")"
|
175
168
|
s
|
176
169
|
end
|
@@ -0,0 +1,231 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
#
|
3
|
+
module Picky
|
4
|
+
|
5
|
+
# Defines tokenizing processes used both in indexing and querying.
|
6
|
+
#
|
7
|
+
class Tokenizer
|
8
|
+
|
9
|
+
def self.index_default= new_default
|
10
|
+
@index_default = new_default
|
11
|
+
end
|
12
|
+
def self.index_default
|
13
|
+
@index_default ||= new
|
14
|
+
end
|
15
|
+
|
16
|
+
def self.query_default= new_default
|
17
|
+
@query_default = new_default
|
18
|
+
end
|
19
|
+
def self.query_default
|
20
|
+
@query_default ||= new
|
21
|
+
end
|
22
|
+
|
23
|
+
# TODO Move EMPTY_STRING top level.
|
24
|
+
#
|
25
|
+
EMPTY_STRING = ''.freeze
|
26
|
+
|
27
|
+
def to_s
|
28
|
+
reject_condition_location = @reject_condition.to_s[/:(\d+) \(lambda\)/, 1]
|
29
|
+
<<-TOKENIZER
|
30
|
+
Removes characters: #{@removes_characters_regexp ? "/#{@removes_characters_regexp.source}/" : '-'}
|
31
|
+
Stopwords: #{@remove_stopwords_regexp ? "/#{@remove_stopwords_regexp.source}/" : '-'}
|
32
|
+
Splits text on: #{@splits_text_on.respond_to?(:source) ? "/#{@splits_text_on.source}/" : (@splits_text_on ? @splits_text_on : '-')}
|
33
|
+
Normalizes words: #{@normalizes_words_regexp_replaces ? @normalizes_words_regexp_replaces : '-'}
|
34
|
+
Rejects tokens? #{reject_condition_location ? "Yes, see line #{reject_condition_location} in app/application.rb" : '-'}
|
35
|
+
Substitutes chars? #{@substituter ? "Yes, using #{@substituter}." : '-' }
|
36
|
+
Case sensitive? #{@case_sensitive ? "Yes." : "-"}
|
37
|
+
TOKENIZER
|
38
|
+
end
|
39
|
+
|
40
|
+
# Stopwords.
|
41
|
+
#
|
42
|
+
# We only allow regexps (even if string would be okay
|
43
|
+
# too for gsub! - it's too hard to understand)
|
44
|
+
#
|
45
|
+
def stopwords regexp
|
46
|
+
check_argument_in __method__, Regexp, regexp
|
47
|
+
@remove_stopwords_regexp = regexp
|
48
|
+
end
|
49
|
+
def remove_stopwords text
|
50
|
+
text.gsub! @remove_stopwords_regexp, EMPTY_STRING if @remove_stopwords_regexp
|
51
|
+
text
|
52
|
+
end
|
53
|
+
@@non_single_stopword_regexp = /^\b[\w:]+?\b[\.\*\~]?\s?$/
|
54
|
+
def remove_non_single_stopwords text
|
55
|
+
return text unless @remove_stopwords_regexp
|
56
|
+
return text if text.match @@non_single_stopword_regexp
|
57
|
+
remove_stopwords text
|
58
|
+
end
|
59
|
+
|
60
|
+
# Illegals.
|
61
|
+
#
|
62
|
+
# We only allow regexps (even if string would be okay
|
63
|
+
# too for gsub! - it's too hard to understand)
|
64
|
+
#
|
65
|
+
def removes_characters regexp
|
66
|
+
check_argument_in __method__, Regexp, regexp
|
67
|
+
@removes_characters_regexp = regexp
|
68
|
+
end
|
69
|
+
def remove_illegals text
|
70
|
+
text.gsub! @removes_characters_regexp, EMPTY_STRING if @removes_characters_regexp
|
71
|
+
text
|
72
|
+
end
|
73
|
+
|
74
|
+
# Splitting.
|
75
|
+
#
|
76
|
+
# We allow Strings and Regexps.
|
77
|
+
# Note: We do not test against to_str since symbols do not work with String#split.
|
78
|
+
#
|
79
|
+
def splits_text_on regexp_or_string
|
80
|
+
raise ArgumentError.new "#{__method__} takes a Regexp or String as argument, not a #{regexp_or_string.class}." unless Regexp === regexp_or_string || String === regexp_or_string
|
81
|
+
@splits_text_on = regexp_or_string
|
82
|
+
end
|
83
|
+
def split text
|
84
|
+
text.split @splits_text_on
|
85
|
+
end
|
86
|
+
|
87
|
+
# Normalizing.
|
88
|
+
#
|
89
|
+
# We only allow arrays.
|
90
|
+
#
|
91
|
+
def normalizes_words regexp_replaces
|
92
|
+
raise ArgumentError.new "#{__method__} takes an Array of replaces as argument, not a #{regexp_replaces.class}." unless regexp_replaces.respond_to?(:to_ary)
|
93
|
+
@normalizes_words_regexp_replaces = regexp_replaces
|
94
|
+
end
|
95
|
+
def normalize_with_patterns text
|
96
|
+
return text unless @normalizes_words_regexp_replaces
|
97
|
+
|
98
|
+
@normalizes_words_regexp_replaces.each do |regex, replace|
|
99
|
+
# This should be sufficient
|
100
|
+
#
|
101
|
+
text.gsub!(regex, replace) and break
|
102
|
+
end
|
103
|
+
|
104
|
+
text
|
105
|
+
end
|
106
|
+
def normalize_with_patterns?
|
107
|
+
@normalizes_words_regexp_replaces
|
108
|
+
end
|
109
|
+
|
110
|
+
# Substitute Characters with this substituter.
|
111
|
+
#
|
112
|
+
# Default is European Character substitution.
|
113
|
+
#
|
114
|
+
def substitutes_characters_with substituter = CharacterSubstituters::WestEuropean.new
|
115
|
+
raise ArgumentError.new "The substitutes_characters_with option needs a character substituter, which responds to #substitute." unless substituter.respond_to?(:substitute)
|
116
|
+
@substituter = substituter
|
117
|
+
end
|
118
|
+
def substitute_characters text
|
119
|
+
substituter?? substituter.substitute(text) : text
|
120
|
+
end
|
121
|
+
|
122
|
+
# Reject tokens after tokenizing based on the given criteria.
|
123
|
+
#
|
124
|
+
def rejects_token_if &condition
|
125
|
+
@reject_condition = condition
|
126
|
+
end
|
127
|
+
def reject tokens
|
128
|
+
tokens.reject! &@reject_condition
|
129
|
+
end
|
130
|
+
|
131
|
+
def case_sensitive case_sensitive
|
132
|
+
@case_sensitive = case_sensitive
|
133
|
+
end
|
134
|
+
def downcase?
|
135
|
+
!@case_sensitive
|
136
|
+
end
|
137
|
+
|
138
|
+
def maximum_tokens amount
|
139
|
+
@maximum_tokens = amount
|
140
|
+
end
|
141
|
+
def cap words
|
142
|
+
words.slice!(@maximum_tokens..-1) if cap?(words)
|
143
|
+
end
|
144
|
+
def cap? words
|
145
|
+
@maximum_tokens && words.size > @maximum_tokens
|
146
|
+
end
|
147
|
+
|
148
|
+
# Checks if the right argument type has been given.
|
149
|
+
#
|
150
|
+
def check_argument_in method, type, argument, &condition
|
151
|
+
raise ArgumentError.new "Application##{method} takes a #{type} as argument, not a #{argument.class}." unless type === argument
|
152
|
+
end
|
153
|
+
|
154
|
+
attr_reader :substituter
|
155
|
+
alias substituter? substituter
|
156
|
+
|
157
|
+
def initialize options = {}
|
158
|
+
substitutes_characters_with options[:substitutes_characters_with] if options[:substitutes_characters_with]
|
159
|
+
removes_characters options[:removes_characters] if options[:removes_characters]
|
160
|
+
stopwords options[:stopwords] if options[:stopwords]
|
161
|
+
splits_text_on options[:splits_text_on] || /\s/
|
162
|
+
normalizes_words options[:normalizes_words] if options[:normalizes_words]
|
163
|
+
maximum_tokens options[:maximum_tokens]
|
164
|
+
rejects_token_if &(options[:rejects_token_if] || :blank?)
|
165
|
+
case_sensitive options[:case_sensitive] unless options[:case_sensitive].nil?
|
166
|
+
end
|
167
|
+
|
168
|
+
# Returns a number of tokens, generated from the given text,
|
169
|
+
# based on the parameters given.
|
170
|
+
#
|
171
|
+
# Returns:
|
172
|
+
# [[:token1, :token2], ["Original1", "Original2"]]
|
173
|
+
#
|
174
|
+
def tokenize text
|
175
|
+
text = preprocess text # processing the text
|
176
|
+
return empty_tokens if text.blank?
|
177
|
+
words = pretokenize text # splitting and preparations for tokenizing
|
178
|
+
return empty_tokens if words.empty?
|
179
|
+
tokens = tokens_for words # creating tokens / strings
|
180
|
+
[tokens, words]
|
181
|
+
end
|
182
|
+
|
183
|
+
# Default preprocessing hook.
|
184
|
+
#
|
185
|
+
# Does:
|
186
|
+
# 1. Character substitution.
|
187
|
+
# 2. Remove illegal expressions.
|
188
|
+
# 3. Remove non-single stopwords. (Stopwords that occur with other words)
|
189
|
+
#
|
190
|
+
def preprocess text
|
191
|
+
text = substitute_characters text
|
192
|
+
remove_illegals text
|
193
|
+
# We do not remove single stopwords e.g. in the indexer for
|
194
|
+
# an entirely different reason than in the query tokenizer.
|
195
|
+
# An indexed thing with just name "UND" (a possible stopword)
|
196
|
+
# should not lose its name.
|
197
|
+
#
|
198
|
+
remove_non_single_stopwords text
|
199
|
+
text
|
200
|
+
end
|
201
|
+
|
202
|
+
# Pretokenizing.
|
203
|
+
#
|
204
|
+
# Does:
|
205
|
+
# * Split the text into words.
|
206
|
+
# * Cap the amount of tokens if maximum_tokens is set.
|
207
|
+
#
|
208
|
+
def pretokenize text
|
209
|
+
words = split text
|
210
|
+
words.collect! { |word| normalize_with_patterns word } if normalize_with_patterns?
|
211
|
+
reject words
|
212
|
+
cap words if cap?(words)
|
213
|
+
words
|
214
|
+
end
|
215
|
+
|
216
|
+
# Downcases.
|
217
|
+
#
|
218
|
+
def tokens_for words
|
219
|
+
words.collect! { |word| word.downcase!; word } if downcase?
|
220
|
+
words
|
221
|
+
end
|
222
|
+
|
223
|
+
# Returns empty tokens.
|
224
|
+
#
|
225
|
+
def empty_tokens
|
226
|
+
[[], []]
|
227
|
+
end
|
228
|
+
|
229
|
+
end
|
230
|
+
|
231
|
+
end
|
data/lib/tasks/try.rake
CHANGED
@@ -2,18 +2,10 @@
|
|
2
2
|
#
|
3
3
|
desc "Try the given text in the indexer/query (index and category optional)."
|
4
4
|
task :try, [:text, :index, :category] => :application do |_, options|
|
5
|
-
text, index, category = options.text, options.index, options.category
|
6
|
-
|
7
5
|
puts
|
8
|
-
fail "\x1b[31mrake try needs a text to try indexing and query preparation\x1b[m, e.g. rake 'try[yourtext]'." unless text
|
9
|
-
|
10
|
-
specific = Picky::Indexes
|
11
|
-
specific = specific[index] if index
|
12
|
-
specific = specific[category] if category
|
6
|
+
fail "\x1b[31mrake try needs a text to try indexing and query preparation\x1b[m, e.g. rake 'try[yourtext]'." unless options.text
|
13
7
|
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
puts
|
18
|
-
puts "(category qualifiers, e.g. title: are removed if they do not exist as a qualifier, so 'toitle:bla' -> 'bla')"
|
8
|
+
require File.expand_path '../try', __FILE__
|
9
|
+
try = Picky::Try.new options.text, options.index, options.category
|
10
|
+
try.to_stdout
|
19
11
|
end
|
data/lib/tasks/try.rb
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
module Picky
|
2
|
+
|
3
|
+
class Try
|
4
|
+
|
5
|
+
attr_reader :text, :specific
|
6
|
+
|
7
|
+
def initialize text, index = nil, category = nil
|
8
|
+
@text = text
|
9
|
+
@specific = Picky::Indexes
|
10
|
+
@specific = @specific[index.to_sym] if index
|
11
|
+
@specific = @specific[category.to_sym] if category
|
12
|
+
end
|
13
|
+
|
14
|
+
def saved
|
15
|
+
specific.tokenizer.tokenize(text.dup).first
|
16
|
+
end
|
17
|
+
|
18
|
+
def searched
|
19
|
+
Picky::Tokenizer.query_default.tokenize(text.dup).first
|
20
|
+
end
|
21
|
+
|
22
|
+
def output
|
23
|
+
<<-OUTPUT
|
24
|
+
\"#{text}\" is saved in the #{specific.identifier} index as #{saved}
|
25
|
+
\"#{text}\" as a search will be tokenized as #{searched}
|
26
|
+
|
27
|
+
(category qualifiers, e.g. title: are removed if they do not exist as a qualifier, so 'toitle:bla' -> 'bla')
|
28
|
+
OUTPUT
|
29
|
+
end
|
30
|
+
|
31
|
+
def to_stdout
|
32
|
+
puts output
|
33
|
+
end
|
34
|
+
|
35
|
+
end
|
36
|
+
|
37
|
+
end
|
@@ -8,7 +8,7 @@ describe Picky::Application do
|
|
8
8
|
it "should run ok" do
|
9
9
|
lambda {
|
10
10
|
class MinimalTestApplication < described_class
|
11
|
-
books = Picky::
|
11
|
+
books = Picky::Index.new :books do
|
12
12
|
source Picky::Sources::DB.new(
|
13
13
|
'SELECT id, title FROM books',
|
14
14
|
:file => 'app/db.yml'
|
@@ -20,8 +20,8 @@ describe Picky::Application do
|
|
20
20
|
|
21
21
|
route %r{^/books} => Picky::Search.new(books)
|
22
22
|
end
|
23
|
-
Picky::
|
24
|
-
Picky::
|
23
|
+
Picky::Tokenizer.index_default.tokenize 'some text'
|
24
|
+
Picky::Tokenizer.query_default.tokenize 'some text'
|
25
25
|
}.should_not raise_error
|
26
26
|
end
|
27
27
|
it "should run ok" do
|
@@ -44,7 +44,7 @@ describe Picky::Application do
|
|
44
44
|
substitutes_characters_with: Picky::CharacterSubstituters::WestEuropean.new,
|
45
45
|
maximum_tokens: 5
|
46
46
|
|
47
|
-
books_index = Picky::
|
47
|
+
books_index = Picky::Index.new :books do
|
48
48
|
source Picky::Sources::DB.new(
|
49
49
|
'SELECT id, title, author, isbn13 as isbn FROM books',
|
50
50
|
:file => 'app/db.yml'
|
@@ -58,7 +58,7 @@ describe Picky::Application do
|
|
58
58
|
books_index.define_category :isbn,
|
59
59
|
partial: Picky::Partial::None.new # Partially searching on an ISBN makes not much sense.
|
60
60
|
|
61
|
-
geo_index = Picky::
|
61
|
+
geo_index = Picky::Index.new :geo do
|
62
62
|
source Picky::Sources::CSV.new(:location, :north, :east, file: 'data/ch.csv', col_sep: ',')
|
63
63
|
indexing removes_characters: /[^a-z]/
|
64
64
|
category :location,
|
@@ -1,6 +1,6 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
|
-
describe Picky::
|
3
|
+
describe Picky::Backends::File::Basic do
|
4
4
|
|
5
5
|
let(:file) { described_class.new 'some/cache/path/to/file' }
|
6
6
|
|
@@ -18,7 +18,7 @@ describe Picky::Backend::File::Basic do
|
|
18
18
|
|
19
19
|
describe 'to_s' do
|
20
20
|
it 'returns the cache path with the default file extension' do
|
21
|
-
file.to_s.should == 'Picky::
|
21
|
+
file.to_s.should == 'Picky::Backends::File::Basic(some/cache/path/to/file.index)'
|
22
22
|
end
|
23
23
|
end
|
24
24
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
|
-
describe Picky::
|
3
|
+
describe Picky::Backends::File::JSON do
|
4
4
|
|
5
5
|
let(:file) { described_class.new 'some/cache/path/to/file' }
|
6
6
|
|
@@ -24,7 +24,7 @@ describe Picky::Backend::File::JSON do
|
|
24
24
|
|
25
25
|
describe 'to_s' do
|
26
26
|
it 'returns the cache path with the default file extension' do
|
27
|
-
file.to_s.should == 'Picky::
|
27
|
+
file.to_s.should == 'Picky::Backends::File::JSON(some/cache/path/to/file.json)'
|
28
28
|
end
|
29
29
|
end
|
30
30
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
|
-
describe Picky::
|
3
|
+
describe Picky::Backends::File::Marshal do
|
4
4
|
|
5
5
|
let(:file) { described_class.new 'some/cache/path/to/file' }
|
6
6
|
|
@@ -24,7 +24,7 @@ describe Picky::Backend::File::Marshal do
|
|
24
24
|
|
25
25
|
describe 'to_s' do
|
26
26
|
it 'returns the cache path with the default file extension' do
|
27
|
-
file.to_s.should == 'Picky::
|
27
|
+
file.to_s.should == 'Picky::Backends::File::Marshal(some/cache/path/to/file.dump)'
|
28
28
|
end
|
29
29
|
end
|
30
30
|
|