picky 2.5.2 → 2.6.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/picky/adapters/rack/base.rb +23 -0
- data/lib/picky/adapters/rack/live_parameters.rb +33 -0
- data/lib/picky/adapters/rack/query.rb +65 -0
- data/lib/picky/adapters/rack.rb +30 -0
- data/lib/picky/application.rb +5 -5
- data/lib/picky/backend/backend.rb +108 -0
- data/lib/picky/backend/file/basic.rb +101 -0
- data/lib/picky/backend/file/json.rb +34 -0
- data/lib/picky/backend/file/marshal.rb +34 -0
- data/lib/picky/backend/file/text.rb +56 -0
- data/lib/picky/backend/files.rb +30 -0
- data/lib/picky/backend/redis/basic.rb +85 -0
- data/lib/picky/backend/redis/list_hash.rb +49 -0
- data/lib/picky/backend/redis/string_hash.rb +40 -0
- data/lib/picky/backend/redis.rb +40 -0
- data/lib/picky/calculations/location.rb +57 -0
- data/lib/picky/categories.rb +62 -0
- data/lib/picky/categories_indexed.rb +93 -0
- data/lib/picky/categories_indexing.rb +12 -0
- data/lib/picky/category.rb +127 -0
- data/lib/picky/category_indexed.rb +64 -0
- data/lib/picky/category_indexing.rb +145 -0
- data/lib/picky/{internals/ext → ext}/maybe_compile.rb +0 -0
- data/lib/picky/{internals/ext → ext}/ruby19/extconf.rb +0 -0
- data/lib/picky/{internals/ext → ext}/ruby19/performant.c +0 -0
- data/lib/picky/{internals/extensions → extensions}/array.rb +0 -0
- data/lib/picky/extensions/class.rb +11 -0
- data/lib/picky/{internals/extensions → extensions}/hash.rb +0 -0
- data/lib/picky/{internals/extensions → extensions}/module.rb +0 -0
- data/lib/picky/{internals/extensions → extensions}/object.rb +0 -0
- data/lib/picky/{internals/extensions → extensions}/symbol.rb +0 -0
- data/lib/picky/frontend_adapters/rack.rb +146 -0
- data/lib/picky/generators/aliases.rb +3 -3
- data/lib/picky/generators/base.rb +15 -0
- data/lib/picky/generators/partial/default.rb +5 -0
- data/lib/picky/generators/partial/none.rb +31 -0
- data/lib/picky/generators/partial/strategy.rb +25 -0
- data/lib/picky/generators/partial/substring.rb +118 -0
- data/lib/picky/generators/partial_generator.rb +15 -0
- data/lib/picky/generators/similarity/default.rb +7 -0
- data/lib/picky/generators/similarity/double_metaphone.rb +28 -0
- data/lib/picky/generators/similarity/metaphone.rb +28 -0
- data/lib/picky/generators/similarity/none.rb +31 -0
- data/lib/picky/generators/similarity/phonetic.rb +65 -0
- data/lib/picky/generators/similarity/soundex.rb +28 -0
- data/lib/picky/generators/similarity/strategy.rb +9 -0
- data/lib/picky/generators/similarity_generator.rb +15 -0
- data/lib/picky/generators/strategy.rb +14 -0
- data/lib/picky/generators/weights/default.rb +7 -0
- data/lib/picky/generators/weights/logarithmic.rb +39 -0
- data/lib/picky/generators/weights/strategy.rb +9 -0
- data/lib/picky/generators/weights_generator.rb +15 -0
- data/lib/picky/{internals/helpers → helpers}/measuring.rb +0 -0
- data/lib/picky/index/base.rb +119 -104
- data/lib/picky/index/base_indexed.rb +27 -0
- data/lib/picky/index/base_indexing.rb +119 -0
- data/lib/picky/index/memory.rb +6 -18
- data/lib/picky/index/redis.rb +6 -18
- data/lib/picky/indexed/bundle/base.rb +110 -0
- data/lib/picky/indexed/bundle/memory.rb +91 -0
- data/lib/picky/indexed/bundle/redis.rb +45 -0
- data/lib/picky/indexed/wrappers/bundle/calculation.rb +35 -0
- data/lib/picky/indexed/wrappers/bundle/location.rb +42 -0
- data/lib/picky/indexed/wrappers/bundle/wrapper.rb +43 -0
- data/lib/picky/indexed/wrappers/category/location.rb +25 -0
- data/lib/picky/indexed/wrappers/exact_first.rb +55 -0
- data/lib/picky/{internals/indexers → indexers}/base.rb +0 -0
- data/lib/picky/{internals/indexers → indexers}/parallel.rb +0 -0
- data/lib/picky/{internals/indexers → indexers}/serial.rb +0 -0
- data/lib/picky/{internals/indexers → indexers}/solr.rb +0 -0
- data/lib/picky/indexes.rb +73 -0
- data/lib/picky/indexes_indexed.rb +29 -0
- data/lib/picky/indexes_indexing.rb +49 -0
- data/lib/picky/indexing/bundle/base.rb +212 -0
- data/lib/picky/indexing/bundle/memory.rb +25 -0
- data/lib/picky/indexing/bundle/redis.rb +24 -0
- data/lib/picky/indexing/bundle/super_base.rb +61 -0
- data/lib/picky/indexing/wrappers/category/location.rb +25 -0
- data/lib/picky/interfaces/live_parameters.rb +8 -8
- data/lib/picky/loader.rb +89 -95
- data/lib/picky/{internals/performant.rb → performant.rb} +0 -0
- data/lib/picky/query/allocation.rb +84 -0
- data/lib/picky/query/allocations.rb +114 -0
- data/lib/picky/query/combination.rb +76 -0
- data/lib/picky/query/combinations/base.rb +70 -0
- data/lib/picky/query/combinations/memory.rb +48 -0
- data/lib/picky/query/combinations/redis.rb +86 -0
- data/lib/picky/query/indexes.rb +195 -0
- data/lib/picky/query/qualifiers.rb +76 -0
- data/lib/picky/query/token.rb +198 -0
- data/lib/picky/query/tokens.rb +103 -0
- data/lib/picky/{internals/query → query}/weights.rb +0 -0
- data/lib/picky/results.rb +1 -1
- data/lib/picky/search.rb +6 -6
- data/lib/picky/{internals/solr → solr}/schema_generator.rb +0 -0
- data/lib/picky/sources/db.rb +7 -7
- data/lib/picky/sources/wrappers/location.rb +2 -2
- data/lib/picky/tokenizers/base.rb +224 -0
- data/lib/picky/tokenizers/index.rb +30 -0
- data/lib/picky/tokenizers/location.rb +49 -0
- data/lib/picky/tokenizers/query.rb +55 -0
- data/lib/tasks/index.rake +4 -3
- data/lib/tasks/try.rake +2 -2
- data/spec/lib/{internals/adapters → adapters}/rack/base_spec.rb +1 -1
- data/spec/lib/{internals/adapters → adapters}/rack/live_parameters_spec.rb +1 -1
- data/spec/lib/{internals/adapters → adapters}/rack/query_spec.rb +1 -1
- data/spec/lib/application_spec.rb +3 -3
- data/spec/lib/{internals/index → backend}/file/basic_spec.rb +1 -1
- data/spec/lib/{internals/index → backend}/file/json_spec.rb +1 -1
- data/spec/lib/{internals/index → backend}/file/marshal_spec.rb +1 -1
- data/spec/lib/{internals/index → backend}/file/text_spec.rb +1 -1
- data/spec/lib/{internals/index → backend}/files_spec.rb +3 -3
- data/spec/lib/{internals/index → backend}/redis/basic_spec.rb +1 -1
- data/spec/lib/{internals/index → backend}/redis/list_hash_spec.rb +1 -1
- data/spec/lib/{internals/index → backend}/redis/string_hash_spec.rb +1 -1
- data/spec/lib/{internals/index → backend}/redis_spec.rb +11 -5
- data/spec/lib/{internals/calculations → calculations}/location_spec.rb +1 -1
- data/spec/lib/{internals/indexed/categories_spec.rb → categories_indexed_spec.rb} +10 -10
- data/spec/lib/{internals/indexed/category_spec.rb → category_indexed_spec.rb} +12 -12
- data/spec/lib/{internals/indexing/category_spec.rb → category_indexing_spec.rb} +10 -10
- data/spec/lib/{internals/cores_spec.rb → cores_spec.rb} +0 -0
- data/spec/lib/{internals/extensions → extensions}/array_spec.rb +0 -0
- data/spec/lib/{internals/extensions → extensions}/hash_spec.rb +0 -0
- data/spec/lib/{internals/extensions → extensions}/module_spec.rb +0 -0
- data/spec/lib/{internals/extensions → extensions}/object_spec.rb +0 -0
- data/spec/lib/{internals/extensions → extensions}/symbol_spec.rb +0 -0
- data/spec/lib/{internals/frontend_adapters → frontend_adapters}/rack_spec.rb +10 -10
- data/spec/lib/generators/aliases_spec.rb +3 -3
- data/spec/lib/{internals/generators → generators}/cacher_strategy_spec.rb +1 -1
- data/spec/lib/{internals/generators → generators}/partial/default_spec.rb +3 -3
- data/spec/lib/{internals/generators → generators}/partial/none_spec.rb +2 -2
- data/spec/lib/{internals/generators → generators}/partial/substring_spec.rb +1 -1
- data/spec/lib/{internals/generators → generators}/partial_generator_spec.rb +3 -3
- data/spec/lib/{internals/generators → generators}/similarity/double_metaphone_spec.rb +1 -1
- data/spec/lib/{internals/generators → generators}/similarity/metaphone_spec.rb +1 -1
- data/spec/lib/{internals/generators → generators}/similarity/none_spec.rb +1 -1
- data/spec/lib/{internals/generators → generators}/similarity/phonetic_spec.rb +1 -1
- data/spec/lib/{internals/generators → generators}/similarity/soundex_spec.rb +1 -1
- data/spec/lib/{internals/generators → generators}/similarity_generator_spec.rb +2 -2
- data/spec/lib/{internals/generators → generators}/weights/logarithmic_spec.rb +1 -1
- data/spec/lib/{internals/generators → generators}/weights_generator_spec.rb +5 -5
- data/spec/lib/{internals/helpers → helpers}/measuring_spec.rb +0 -0
- data/spec/lib/{internals/indexed/index_spec.rb → index/base_indexed_spec.rb} +5 -5
- data/spec/lib/{internals/indexing/index_spec.rb → index/base_indexing_spec.rb} +6 -19
- data/spec/lib/index/base_spec.rb +10 -53
- data/spec/lib/{internals/indexed → indexed}/bundle/memory_spec.rb +5 -5
- data/spec/lib/{internals/indexed → indexed}/bundle/redis_spec.rb +4 -4
- data/spec/lib/{internals/indexed → indexed}/wrappers/bundle/calculation_spec.rb +1 -1
- data/spec/lib/{internals/indexed → indexed}/wrappers/bundle/wrapper_spec.rb +1 -1
- data/spec/lib/{internals/indexed → indexed}/wrappers/exact_first_spec.rb +7 -7
- data/spec/lib/{internals/indexers → indexers}/base_spec.rb +0 -0
- data/spec/lib/{internals/indexers → indexers}/parallel_spec.rb +0 -0
- data/spec/lib/{internals/indexers → indexers}/serial_spec.rb +0 -0
- data/spec/lib/indexes_class_spec.rb +30 -0
- data/spec/lib/{indexed/indexes_spec.rb → indexes_indexed_spec.rb} +1 -1
- data/spec/lib/{indexing/indexes_spec.rb → indexes_indexing_spec.rb} +8 -8
- data/spec/lib/{internals/indexing/indexes_spec.rb → indexes_spec.rb} +15 -12
- data/spec/lib/{internals/indexing → indexing}/bundle/memory_partial_generation_speed_spec.rb +4 -4
- data/spec/lib/{internals/indexing → indexing}/bundle/memory_spec.rb +3 -3
- data/spec/lib/{internals/indexing → indexing}/bundle/redis_spec.rb +3 -3
- data/spec/lib/{internals/indexing → indexing}/bundle/super_base_spec.rb +2 -2
- data/spec/lib/{internals/interfaces → interfaces}/live_parameters_spec.rb +0 -0
- data/spec/lib/query/allocation_spec.rb +1 -1
- data/spec/lib/query/allocations_spec.rb +1 -1
- data/spec/lib/query/combination_spec.rb +5 -5
- data/spec/lib/query/combinations/base_spec.rb +1 -1
- data/spec/lib/query/combinations/memory_spec.rb +1 -1
- data/spec/lib/query/combinations/redis_spec.rb +1 -1
- data/spec/lib/query/indexes_spec.rb +1 -1
- data/spec/lib/query/qualifiers_spec.rb +4 -4
- data/spec/lib/query/token_spec.rb +3 -3
- data/spec/lib/query/tokens_spec.rb +32 -32
- data/spec/lib/search_spec.rb +5 -5
- data/spec/lib/{internals/solr → solr}/schema_generator_spec.rb +0 -0
- data/spec/lib/sources/db_spec.rb +4 -8
- data/spec/lib/sources/wrappers/location_spec.rb +1 -1
- data/spec/lib/{internals/tokenizers → tokenizers}/base_spec.rb +1 -1
- data/spec/lib/{internals/tokenizers → tokenizers}/index_spec.rb +1 -1
- data/spec/lib/{internals/tokenizers → tokenizers}/query_spec.rb +1 -1
- metadata +214 -215
- data/lib/picky/aliases.rb +0 -4
- data/lib/picky/index_bundle.rb +0 -48
- data/lib/picky/indexed/indexes.rb +0 -59
- data/lib/picky/indexing/indexes.rb +0 -87
- data/lib/picky/internals/adapters/rack/base.rb +0 -27
- data/lib/picky/internals/adapters/rack/live_parameters.rb +0 -37
- data/lib/picky/internals/adapters/rack/query.rb +0 -69
- data/lib/picky/internals/adapters/rack.rb +0 -34
- data/lib/picky/internals/calculations/location.rb +0 -59
- data/lib/picky/internals/frontend_adapters/rack.rb +0 -150
- data/lib/picky/internals/generators/base.rb +0 -19
- data/lib/picky/internals/generators/partial/default.rb +0 -7
- data/lib/picky/internals/generators/partial/none.rb +0 -35
- data/lib/picky/internals/generators/partial/strategy.rb +0 -29
- data/lib/picky/internals/generators/partial/substring.rb +0 -122
- data/lib/picky/internals/generators/partial_generator.rb +0 -19
- data/lib/picky/internals/generators/similarity/default.rb +0 -9
- data/lib/picky/internals/generators/similarity/double_metaphone.rb +0 -32
- data/lib/picky/internals/generators/similarity/metaphone.rb +0 -32
- data/lib/picky/internals/generators/similarity/none.rb +0 -35
- data/lib/picky/internals/generators/similarity/phonetic.rb +0 -69
- data/lib/picky/internals/generators/similarity/soundex.rb +0 -32
- data/lib/picky/internals/generators/similarity/strategy.rb +0 -11
- data/lib/picky/internals/generators/similarity_generator.rb +0 -19
- data/lib/picky/internals/generators/strategy.rb +0 -18
- data/lib/picky/internals/generators/weights/default.rb +0 -9
- data/lib/picky/internals/generators/weights/logarithmic.rb +0 -43
- data/lib/picky/internals/generators/weights/strategy.rb +0 -11
- data/lib/picky/internals/generators/weights_generator.rb +0 -19
- data/lib/picky/internals/index/backend.rb +0 -112
- data/lib/picky/internals/index/file/basic.rb +0 -105
- data/lib/picky/internals/index/file/json.rb +0 -38
- data/lib/picky/internals/index/file/marshal.rb +0 -38
- data/lib/picky/internals/index/file/text.rb +0 -60
- data/lib/picky/internals/index/files.rb +0 -34
- data/lib/picky/internals/index/redis/basic.rb +0 -89
- data/lib/picky/internals/index/redis/list_hash.rb +0 -53
- data/lib/picky/internals/index/redis/string_hash.rb +0 -44
- data/lib/picky/internals/index/redis.rb +0 -44
- data/lib/picky/internals/indexed/bundle/base.rb +0 -114
- data/lib/picky/internals/indexed/bundle/memory.rb +0 -95
- data/lib/picky/internals/indexed/bundle/redis.rb +0 -49
- data/lib/picky/internals/indexed/categories.rb +0 -140
- data/lib/picky/internals/indexed/category.rb +0 -111
- data/lib/picky/internals/indexed/index.rb +0 -63
- data/lib/picky/internals/indexed/wrappers/bundle/calculation.rb +0 -37
- data/lib/picky/internals/indexed/wrappers/bundle/location.rb +0 -44
- data/lib/picky/internals/indexed/wrappers/bundle/wrapper.rb +0 -45
- data/lib/picky/internals/indexed/wrappers/category/location.rb +0 -27
- data/lib/picky/internals/indexed/wrappers/exact_first.rb +0 -59
- data/lib/picky/internals/indexing/bundle/base.rb +0 -216
- data/lib/picky/internals/indexing/bundle/memory.rb +0 -29
- data/lib/picky/internals/indexing/bundle/redis.rb +0 -28
- data/lib/picky/internals/indexing/bundle/super_base.rb +0 -65
- data/lib/picky/internals/indexing/category.rb +0 -153
- data/lib/picky/internals/indexing/index.rb +0 -142
- data/lib/picky/internals/indexing/wrappers/category/location.rb +0 -27
- data/lib/picky/internals/query/allocation.rb +0 -88
- data/lib/picky/internals/query/allocations.rb +0 -118
- data/lib/picky/internals/query/combination.rb +0 -80
- data/lib/picky/internals/query/combinations/base.rb +0 -74
- data/lib/picky/internals/query/combinations/memory.rb +0 -52
- data/lib/picky/internals/query/combinations/redis.rb +0 -90
- data/lib/picky/internals/query/indexes.rb +0 -199
- data/lib/picky/internals/query/qualifiers.rb +0 -82
- data/lib/picky/internals/query/token.rb +0 -202
- data/lib/picky/internals/query/tokens.rb +0 -109
- data/lib/picky/internals/shared/category.rb +0 -52
- data/lib/picky/internals/tokenizers/base.rb +0 -228
- data/lib/picky/internals/tokenizers/index.rb +0 -34
- data/lib/picky/internals/tokenizers/location.rb +0 -54
- data/lib/picky/internals/tokenizers/query.rb +0 -59
- data/lib/picky/internals.rb +0 -2
- data/spec/lib/aliases_spec.rb +0 -9
- data/spec/lib/index_bundle_spec.rb +0 -69
@@ -1,109 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
#
|
3
|
-
module Internals
|
4
|
-
|
5
|
-
#
|
6
|
-
#
|
7
|
-
module Query
|
8
|
-
|
9
|
-
# This class primarily handles switching through similar token constellations.
|
10
|
-
#
|
11
|
-
class Tokens # :nodoc:all
|
12
|
-
|
13
|
-
# Basically delegates to its internal tokens array.
|
14
|
-
#
|
15
|
-
self.delegate *[Enumerable.instance_methods, :slice!, :[], :uniq!, :last, :reject!, :length, :size, :empty?, :each, :exit, { :to => :@tokens }].flatten
|
16
|
-
|
17
|
-
# Create a new Tokens object with the array of tokens passed in.
|
18
|
-
#
|
19
|
-
def initialize tokens = []
|
20
|
-
@tokens = tokens
|
21
|
-
end
|
22
|
-
|
23
|
-
# Creates a new Tokens object from a number of Strings.
|
24
|
-
#
|
25
|
-
# Options:
|
26
|
-
# * downcase: Whether to downcase the passed strings (default is true)
|
27
|
-
#
|
28
|
-
def self.processed words, downcase = true
|
29
|
-
new words.collect! { |word| Token.processed word, downcase }
|
30
|
-
end
|
31
|
-
|
32
|
-
# Tokenizes each token.
|
33
|
-
#
|
34
|
-
# Note: Passed tokenizer needs to offer #normalize(text).
|
35
|
-
#
|
36
|
-
def tokenize_with tokenizer
|
37
|
-
@tokens.each { |token| token.tokenize_with(tokenizer) }
|
38
|
-
end
|
39
|
-
|
40
|
-
# Generates an array in the form of
|
41
|
-
# [
|
42
|
-
# [combination], # of token 1
|
43
|
-
# [combination, combination, combination], # of token 2
|
44
|
-
# [combination, combination] # of token 3
|
45
|
-
# ]
|
46
|
-
#
|
47
|
-
def possible_combinations_in index
|
48
|
-
@tokens.inject([]) do |combinations, token|
|
49
|
-
possible_combinations = token.possible_combinations_in index
|
50
|
-
|
51
|
-
# TODO Could move the ignore_unassigned_tokens here!
|
52
|
-
#
|
53
|
-
# Note: Optimization for ignoring tokens that allocate to nothing and
|
54
|
-
# can be ignored.
|
55
|
-
# For example in a special search, where "florian" is not
|
56
|
-
# mapped to any category.
|
57
|
-
#
|
58
|
-
possible_combinations ? combinations << possible_combinations : combinations
|
59
|
-
end
|
60
|
-
end
|
61
|
-
|
62
|
-
# Makes the last of the tokens partial.
|
63
|
-
#
|
64
|
-
def partialize_last
|
65
|
-
@tokens.last.partial = true unless empty?
|
66
|
-
end
|
67
|
-
|
68
|
-
# Caps the tokens to the maximum.
|
69
|
-
#
|
70
|
-
def cap maximum
|
71
|
-
@tokens.slice!(maximum..-1) if cap?(maximum)
|
72
|
-
end
|
73
|
-
def cap? maximum
|
74
|
-
@tokens.size > maximum
|
75
|
-
end
|
76
|
-
|
77
|
-
# Rejects blank tokens.
|
78
|
-
#
|
79
|
-
def reject
|
80
|
-
@tokens.reject! &:blank?
|
81
|
-
end
|
82
|
-
|
83
|
-
# Returns a solr query.
|
84
|
-
#
|
85
|
-
def to_solr_query
|
86
|
-
@tokens.map(&:to_solr).join ' '
|
87
|
-
end
|
88
|
-
|
89
|
-
#
|
90
|
-
#
|
91
|
-
def originals
|
92
|
-
@tokens.map(&:original)
|
93
|
-
end
|
94
|
-
|
95
|
-
def == other
|
96
|
-
self.tokens == other.tokens
|
97
|
-
end
|
98
|
-
|
99
|
-
# Just join the token original texts.
|
100
|
-
#
|
101
|
-
def to_s
|
102
|
-
originals.join ' '
|
103
|
-
end
|
104
|
-
|
105
|
-
end
|
106
|
-
|
107
|
-
end
|
108
|
-
|
109
|
-
end
|
@@ -1,52 +0,0 @@
|
|
1
|
-
module Internals
|
2
|
-
module Shared
|
3
|
-
|
4
|
-
module Category
|
5
|
-
|
6
|
-
def index_name
|
7
|
-
index.name
|
8
|
-
end
|
9
|
-
def category_name
|
10
|
-
name
|
11
|
-
end
|
12
|
-
|
13
|
-
# Path and partial filename of a specific index on this category.
|
14
|
-
#
|
15
|
-
def index_path bundle_name, type
|
16
|
-
"#{index_directory}/#{name}_#{bundle_name}_#{type}"
|
17
|
-
end
|
18
|
-
|
19
|
-
#
|
20
|
-
#
|
21
|
-
def prepared_index_path
|
22
|
-
@prepared_index_path ||= "#{index_directory}/prepared_#{name}_index"
|
23
|
-
end
|
24
|
-
def prepared_index_file &block
|
25
|
-
@prepared_index_file ||= Internals::Index::File::Text.new prepared_index_path
|
26
|
-
@prepared_index_file.open_for_indexing &block
|
27
|
-
end
|
28
|
-
|
29
|
-
# Identifier for internal use.
|
30
|
-
#
|
31
|
-
def identifier
|
32
|
-
@identifier ||= "#{index.name}:#{name}"
|
33
|
-
end
|
34
|
-
def to_s
|
35
|
-
"#{index.name} #{name}"
|
36
|
-
end
|
37
|
-
|
38
|
-
# The index directory for this category.
|
39
|
-
#
|
40
|
-
def index_directory
|
41
|
-
@index_directory ||= "#{PICKY_ROOT}/index/#{PICKY_ENVIRONMENT}/#{index.name}"
|
42
|
-
end
|
43
|
-
# Creates the index directory including all necessary paths above it.
|
44
|
-
#
|
45
|
-
def prepare_index_directory
|
46
|
-
FileUtils.mkdir_p index_directory
|
47
|
-
end
|
48
|
-
|
49
|
-
end
|
50
|
-
|
51
|
-
end
|
52
|
-
end
|
@@ -1,228 +0,0 @@
|
|
1
|
-
module Internals
|
2
|
-
|
3
|
-
module Tokenizers # :nodoc:all
|
4
|
-
|
5
|
-
# Defines tokenizing processes used both in indexing and querying.
|
6
|
-
#
|
7
|
-
class Base
|
8
|
-
|
9
|
-
# TODO Move EMPTY_STRING top level.
|
10
|
-
#
|
11
|
-
EMPTY_STRING = ''.freeze
|
12
|
-
|
13
|
-
def to_s
|
14
|
-
reject_condition_location = @reject_condition.to_s[/:(\d+) \(lambda\)/, 1]
|
15
|
-
<<-TOKENIZER
|
16
|
-
Removes characters: #{@removes_characters_regexp ? "/#{@removes_characters_regexp.source}/" : '-'}
|
17
|
-
Stopwords: #{@remove_stopwords_regexp ? "/#{@remove_stopwords_regexp.source}/" : '-'}
|
18
|
-
Splits text on: #{@splits_text_on.respond_to?(:source) ? "/#{@splits_text_on.source}/" : (@splits_text_on ? @splits_text_on : '-')}
|
19
|
-
Removes chars after split: #{@removes_characters_after_splitting_regexp ? "/#{@removes_characters_after_splitting_regexp.source}/" : '-'}
|
20
|
-
Normalizes words: #{@normalizes_words_regexp_replaces ? @normalizes_words_regexp_replaces : '-'}
|
21
|
-
Rejects tokens? #{reject_condition_location ? "Yes, see line #{reject_condition_location} in app/application.rb" : '-'}
|
22
|
-
Substitutes chars? #{@substituter ? "Yes, using #{@substituter}." : '-' }
|
23
|
-
Case sensitive? #{@case_sensitive ? "Yes." : "-"}
|
24
|
-
TOKENIZER
|
25
|
-
end
|
26
|
-
|
27
|
-
# Stopwords.
|
28
|
-
#
|
29
|
-
# We only allow regexps (even if string would be okay
|
30
|
-
# too for gsub! - it's too hard to understand)
|
31
|
-
#
|
32
|
-
def stopwords regexp
|
33
|
-
check_argument_in __method__, Regexp, regexp
|
34
|
-
@remove_stopwords_regexp = regexp
|
35
|
-
end
|
36
|
-
def remove_stopwords text
|
37
|
-
text.gsub! @remove_stopwords_regexp, EMPTY_STRING if @remove_stopwords_regexp
|
38
|
-
text
|
39
|
-
end
|
40
|
-
@@non_single_stopword_regexp = /^\b[\w:]+?\b[\.\*\~]?\s?$/
|
41
|
-
def remove_non_single_stopwords text
|
42
|
-
return text if text.match @@non_single_stopword_regexp
|
43
|
-
remove_stopwords text
|
44
|
-
end
|
45
|
-
|
46
|
-
# Illegals.
|
47
|
-
#
|
48
|
-
# We only allow regexps (even if string would be okay
|
49
|
-
# too for gsub! - it's too hard to understand)
|
50
|
-
#
|
51
|
-
def removes_characters regexp
|
52
|
-
check_argument_in __method__, Regexp, regexp
|
53
|
-
@removes_characters_regexp = regexp
|
54
|
-
end
|
55
|
-
def remove_illegals text
|
56
|
-
text.gsub! @removes_characters_regexp, EMPTY_STRING if @removes_characters_regexp
|
57
|
-
text
|
58
|
-
end
|
59
|
-
|
60
|
-
# Splitting.
|
61
|
-
#
|
62
|
-
# We allow Strings and Regexps.
|
63
|
-
# Note: We do not test against to_str since symbols do not work with String#split.
|
64
|
-
#
|
65
|
-
def splits_text_on regexp_or_string
|
66
|
-
raise ArgumentError.new "#{__method__} takes a Regexp or String as argument, not a #{regexp_or_string.class}." unless Regexp === regexp_or_string || String === regexp_or_string
|
67
|
-
@splits_text_on = regexp_or_string
|
68
|
-
end
|
69
|
-
def split text
|
70
|
-
text.split @splits_text_on
|
71
|
-
end
|
72
|
-
|
73
|
-
# Normalizing.
|
74
|
-
#
|
75
|
-
# We only allow arrays.
|
76
|
-
#
|
77
|
-
def normalizes_words regexp_replaces
|
78
|
-
raise ArgumentError.new "#{__method__} takes an Array of replaces as argument, not a #{regexp_replaces.class}." unless regexp_replaces.respond_to?(:to_ary)
|
79
|
-
@normalizes_words_regexp_replaces = regexp_replaces
|
80
|
-
end
|
81
|
-
def normalize_with_patterns text
|
82
|
-
return text unless @normalizes_words_regexp_replaces
|
83
|
-
|
84
|
-
@normalizes_words_regexp_replaces.each do |regex, replace|
|
85
|
-
# This should be sufficient
|
86
|
-
#
|
87
|
-
text.gsub!(regex, replace) and break
|
88
|
-
end
|
89
|
-
remove_after_normalizing_illegals text
|
90
|
-
text
|
91
|
-
end
|
92
|
-
|
93
|
-
# Illegal after normalizing.
|
94
|
-
#
|
95
|
-
# We only allow regexps (even if string would be okay
|
96
|
-
# too for gsub! - it's too hard to understand)
|
97
|
-
#
|
98
|
-
def removes_characters_after_splitting regexp
|
99
|
-
check_argument_in __method__, Regexp, regexp
|
100
|
-
@removes_characters_after_splitting_regexp = regexp
|
101
|
-
end
|
102
|
-
def remove_after_normalizing_illegals text
|
103
|
-
text.gsub! @removes_characters_after_splitting_regexp, EMPTY_STRING if @removes_characters_after_splitting_regexp
|
104
|
-
end
|
105
|
-
|
106
|
-
# Substitute Characters with this substituter.
|
107
|
-
#
|
108
|
-
# Default is European Character substitution.
|
109
|
-
#
|
110
|
-
def substitutes_characters_with substituter = CharacterSubstituters::WestEuropean.new
|
111
|
-
raise ArgumentError.new "The substitutes_characters_with option needs a character substituter, which responds to #substitute." unless substituter.respond_to?(:substitute)
|
112
|
-
@substituter = substituter
|
113
|
-
end
|
114
|
-
def substitute_characters text
|
115
|
-
substituter?? substituter.substitute(text) : text
|
116
|
-
end
|
117
|
-
|
118
|
-
# Reject tokens after tokenizing based on the given criteria.
|
119
|
-
#
|
120
|
-
# Note: Currently only for indexing.
|
121
|
-
#
|
122
|
-
def reject_token_if &condition
|
123
|
-
@reject_condition = condition
|
124
|
-
end
|
125
|
-
def reject tokens
|
126
|
-
tokens.reject! &@reject_condition
|
127
|
-
end
|
128
|
-
|
129
|
-
def case_sensitive case_sensitive
|
130
|
-
@case_sensitive = case_sensitive
|
131
|
-
end
|
132
|
-
def downcase?
|
133
|
-
!@case_sensitive
|
134
|
-
end
|
135
|
-
|
136
|
-
# Checks if the right argument type has been given.
|
137
|
-
#
|
138
|
-
def check_argument_in method, type, argument, &condition
|
139
|
-
raise ArgumentError.new "Application##{method} takes a #{type} as argument, not a #{argument.class}." unless type === argument
|
140
|
-
end
|
141
|
-
|
142
|
-
|
143
|
-
# Returns a number of tokens, generated from the given text.
|
144
|
-
#
|
145
|
-
# Note:
|
146
|
-
# * preprocess, pretokenize are hooks
|
147
|
-
#
|
148
|
-
def tokenize text
|
149
|
-
text = preprocess text # processing the text
|
150
|
-
return empty_tokens if text.blank?
|
151
|
-
words = pretokenize text # splitting and preparations for tokenizing
|
152
|
-
return empty_tokens if words.empty?
|
153
|
-
tokens = tokens_for words # creating tokens / strings
|
154
|
-
process tokens # processing tokens / strings
|
155
|
-
end
|
156
|
-
|
157
|
-
attr_reader :substituter
|
158
|
-
alias substituter? substituter
|
159
|
-
|
160
|
-
def initialize options = {}
|
161
|
-
removes_characters options[:removes_characters] if options[:removes_characters]
|
162
|
-
contracts_expressions *options[:contracts_expressions] if options[:contracts_expressions]
|
163
|
-
stopwords options[:stopwords] if options[:stopwords]
|
164
|
-
normalizes_words options[:normalizes_words] if options[:normalizes_words]
|
165
|
-
removes_characters_after_splitting options[:removes_characters_after_splitting] if options[:removes_characters_after_splitting]
|
166
|
-
substitutes_characters_with options[:substitutes_characters_with] if options[:substitutes_characters_with]
|
167
|
-
case_sensitive options[:case_sensitive] unless options[:case_sensitive].nil?
|
168
|
-
|
169
|
-
# Defaults.
|
170
|
-
#
|
171
|
-
splits_text_on options[:splits_text_on] || /\s/
|
172
|
-
reject_token_if &(options[:reject_token_if] || options[:rejects_token_if] || :blank?) # TODO Decide on using an s or not.
|
173
|
-
end
|
174
|
-
|
175
|
-
# Default preprocessing hook.
|
176
|
-
#
|
177
|
-
# Does:
|
178
|
-
# 1. Character substitution.
|
179
|
-
# 2. Remove illegal expressions.
|
180
|
-
# 3. Remove non-single stopwords. (Stopwords that occur with other words)
|
181
|
-
#
|
182
|
-
def preprocess text
|
183
|
-
text = substitute_characters text
|
184
|
-
remove_illegals text
|
185
|
-
# We do not remove single stopwords e.g. in the indexer for
|
186
|
-
# an entirely different reason than in the query tokenizer.
|
187
|
-
# An indexed thing with just name "UND" (a possible stopword)
|
188
|
-
# should not lose its name.
|
189
|
-
#
|
190
|
-
remove_non_single_stopwords text
|
191
|
-
text
|
192
|
-
end
|
193
|
-
# Pretokenizing.
|
194
|
-
#
|
195
|
-
# Does:
|
196
|
-
# 1. Split the text into words.
|
197
|
-
# 2. Normalize each word.
|
198
|
-
#
|
199
|
-
def pretokenize text
|
200
|
-
words = split text
|
201
|
-
words.collect! do |word|
|
202
|
-
normalize_with_patterns word
|
203
|
-
word
|
204
|
-
end
|
205
|
-
end
|
206
|
-
# Basic postprocessing (overridden in both query/index tokenizers).
|
207
|
-
#
|
208
|
-
def process tokens
|
209
|
-
reject tokens # Reject any tokens that don't meet criteria
|
210
|
-
tokens
|
211
|
-
end
|
212
|
-
|
213
|
-
# # Converts words into real tokens.
|
214
|
-
# #
|
215
|
-
# def tokens_for words
|
216
|
-
# Internals::Query::Tokens.new words.collect! { |word| token_for word }
|
217
|
-
# end
|
218
|
-
# Turns non-blank text into symbols.
|
219
|
-
#
|
220
|
-
def symbolize text
|
221
|
-
text.blank? ? nil : text.to_sym
|
222
|
-
end
|
223
|
-
|
224
|
-
end
|
225
|
-
|
226
|
-
end
|
227
|
-
|
228
|
-
end
|
@@ -1,34 +0,0 @@
|
|
1
|
-
module Internals
|
2
|
-
|
3
|
-
module Tokenizers
|
4
|
-
|
5
|
-
# The base indexing tokenizer.
|
6
|
-
#
|
7
|
-
# Override in indexing subclasses and define in configuration.
|
8
|
-
#
|
9
|
-
class Index < Base
|
10
|
-
|
11
|
-
def self.default= new_default
|
12
|
-
@default = new_default
|
13
|
-
end
|
14
|
-
def self.default
|
15
|
-
@default ||= new
|
16
|
-
end
|
17
|
-
|
18
|
-
# Does not actually return a token, but a
|
19
|
-
# symbol "token".
|
20
|
-
#
|
21
|
-
def tokens_for words
|
22
|
-
words.collect! { |word| word.downcase! if downcase?; word.to_sym }
|
23
|
-
end
|
24
|
-
# Returns empty tokens.
|
25
|
-
#
|
26
|
-
def empty_tokens
|
27
|
-
[]
|
28
|
-
end
|
29
|
-
|
30
|
-
end
|
31
|
-
|
32
|
-
end
|
33
|
-
|
34
|
-
end
|
@@ -1,54 +0,0 @@
|
|
1
|
-
module Internals
|
2
|
-
|
3
|
-
module Tokenizers
|
4
|
-
|
5
|
-
|
6
|
-
class Location < Base
|
7
|
-
|
8
|
-
attr_reader :calculation
|
9
|
-
|
10
|
-
def initialize options = {}
|
11
|
-
super options
|
12
|
-
|
13
|
-
grid = options[:grid]
|
14
|
-
precision = options[:precision] || 1
|
15
|
-
|
16
|
-
@calculation = Internals::Calculations::Location.new grid, precision
|
17
|
-
|
18
|
-
@minimum = 1.0 / 0
|
19
|
-
|
20
|
-
@locations = []
|
21
|
-
end
|
22
|
-
|
23
|
-
# TODO Work on this!
|
24
|
-
#
|
25
|
-
def tokenize text
|
26
|
-
|
27
|
-
# Gather min/max.
|
28
|
-
#
|
29
|
-
source.harvest category do |indexed_id, location|
|
30
|
-
location = location.to_f
|
31
|
-
minimum = location if location < minimum
|
32
|
-
locations << [indexed_id, location]
|
33
|
-
end
|
34
|
-
|
35
|
-
calculation.minimum = minimum
|
36
|
-
|
37
|
-
# Recalculate locations.
|
38
|
-
#
|
39
|
-
locations.each do |indexed_id, location|
|
40
|
-
calculation.recalculated_range(location).each do |new_location|
|
41
|
-
yield indexed_id, new_location.to_s
|
42
|
-
end
|
43
|
-
end
|
44
|
-
|
45
|
-
# TODO Move to the right place.
|
46
|
-
#
|
47
|
-
category.exact[:location_minimum] = minimum
|
48
|
-
end
|
49
|
-
|
50
|
-
end
|
51
|
-
|
52
|
-
end
|
53
|
-
|
54
|
-
end
|
@@ -1,59 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
#
|
3
|
-
module Internals
|
4
|
-
|
5
|
-
module Tokenizers
|
6
|
-
|
7
|
-
# There are a few class methods that you can use to configure how a query works.
|
8
|
-
#
|
9
|
-
# removes_characters regexp
|
10
|
-
# illegal_after_normalizing regexp
|
11
|
-
# stopwords regexp
|
12
|
-
# contracts_expressions regexp, to_string
|
13
|
-
# splits_text_on regexp
|
14
|
-
# normalizes_words [[/regexp1/, 'replacement1'], [/regexp2/, 'replacement2']]
|
15
|
-
#
|
16
|
-
class Query < Base
|
17
|
-
|
18
|
-
def self.default= new_default
|
19
|
-
@default = new_default
|
20
|
-
end
|
21
|
-
def self.default
|
22
|
-
@default ||= new
|
23
|
-
end
|
24
|
-
|
25
|
-
attr_reader :maximum_tokens
|
26
|
-
|
27
|
-
def initialize options = {}
|
28
|
-
super options
|
29
|
-
@maximum_tokens = options[:maximum_tokens] || 5
|
30
|
-
end
|
31
|
-
|
32
|
-
# Let each token process itself.
|
33
|
-
# Reject, limit, and partialize tokens.
|
34
|
-
#
|
35
|
-
# In querying we work with real tokens (in indexing it's just symbols).
|
36
|
-
#
|
37
|
-
def process tokens
|
38
|
-
tokens.reject # Reject any tokens that don't meet criteria.
|
39
|
-
tokens.cap maximum_tokens # Cut off superfluous tokens.
|
40
|
-
tokens.partialize_last # Set certain tokens as partial.
|
41
|
-
tokens
|
42
|
-
end
|
43
|
-
|
44
|
-
# Converts words into real tokens.
|
45
|
-
#
|
46
|
-
def tokens_for words
|
47
|
-
Internals::Query::Tokens.processed words, downcase?
|
48
|
-
end
|
49
|
-
# Returns a tokens object.
|
50
|
-
#
|
51
|
-
def empty_tokens
|
52
|
-
Internals::Query::Tokens.new
|
53
|
-
end
|
54
|
-
|
55
|
-
end
|
56
|
-
|
57
|
-
end
|
58
|
-
|
59
|
-
end
|
data/lib/picky/internals.rb
DELETED
data/spec/lib/aliases_spec.rb
DELETED
@@ -1,69 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
#
|
3
|
-
require 'spec_helper'
|
4
|
-
|
5
|
-
describe IndexBundle do
|
6
|
-
|
7
|
-
let(:some_index) { stub :index, :name => :some_index, :internal_indexed => :indexed_index, :internal_indexing => :indexing_index }
|
8
|
-
let(:indexes) { described_class.new }
|
9
|
-
let(:indexed) { stub :indexed, :register => nil }
|
10
|
-
let(:indexing) { stub :indexing, :register => nil }
|
11
|
-
|
12
|
-
before(:each) do
|
13
|
-
indexes.stub! :indexing => indexing
|
14
|
-
indexes.stub! :indexed => indexed
|
15
|
-
end
|
16
|
-
|
17
|
-
def self.it_delegates method, receiver
|
18
|
-
it "delegates #{method} to #{receiver}" do
|
19
|
-
indexes.send(receiver).should_receive(method.to_sym).once
|
20
|
-
|
21
|
-
indexes.send method
|
22
|
-
end
|
23
|
-
end
|
24
|
-
|
25
|
-
describe 'delegation' do
|
26
|
-
it_delegates :reload, :indexed
|
27
|
-
it_delegates :load_from_cache, :indexed
|
28
|
-
|
29
|
-
it_delegates :check_caches, :indexing
|
30
|
-
it_delegates :find, :indexing
|
31
|
-
it_delegates :index, :indexing
|
32
|
-
it_delegates :index_for_tests, :indexing
|
33
|
-
end
|
34
|
-
|
35
|
-
describe '[]' do
|
36
|
-
before(:each) do
|
37
|
-
indexes.register some_index
|
38
|
-
end
|
39
|
-
it 'takes strings' do
|
40
|
-
indexes['some_index'].should == some_index
|
41
|
-
end
|
42
|
-
it 'takes symbols' do
|
43
|
-
indexes[:some_index].should == some_index
|
44
|
-
end
|
45
|
-
end
|
46
|
-
|
47
|
-
describe 'register' do
|
48
|
-
it 'registers with the indexes' do
|
49
|
-
indexes.register some_index
|
50
|
-
|
51
|
-
indexes.indexes.should == [some_index]
|
52
|
-
end
|
53
|
-
it 'registers with the index map' do
|
54
|
-
indexes.register some_index
|
55
|
-
|
56
|
-
indexes[some_index.name].should == some_index
|
57
|
-
end
|
58
|
-
it 'registers with the indexing' do
|
59
|
-
indexing.should_receive(:register).once.with :indexing_index
|
60
|
-
|
61
|
-
indexes.register some_index
|
62
|
-
end
|
63
|
-
it 'registers with the indexed' do
|
64
|
-
indexed.should_receive(:register).once.with :indexed_index
|
65
|
-
|
66
|
-
indexes.register some_index
|
67
|
-
end
|
68
|
-
end
|
69
|
-
end
|