picky 2.7.0 → 3.0.0.pre1
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/picky/adapters/rack/base.rb +20 -16
- data/lib/picky/adapters/rack/live_parameters.rb +28 -24
- data/lib/picky/adapters/rack/search.rb +67 -0
- data/lib/picky/adapters/rack.rb +27 -23
- data/lib/picky/application.rb +246 -236
- data/lib/picky/backend/base.rb +115 -119
- data/lib/picky/backend/file/basic.rb +102 -98
- data/lib/picky/backend/file/json.rb +27 -23
- data/lib/picky/backend/file/marshal.rb +32 -28
- data/lib/picky/backend/file/text.rb +45 -41
- data/lib/picky/backend/files.rb +19 -15
- data/lib/picky/backend/redis/basic.rb +76 -72
- data/lib/picky/backend/redis/list_hash.rb +40 -36
- data/lib/picky/backend/redis/string_hash.rb +30 -26
- data/lib/picky/backend/redis.rb +32 -28
- data/lib/picky/bundle.rb +82 -57
- data/lib/{bundling.rb → picky/bundling.rb} +0 -0
- data/lib/picky/calculations/location.rb +51 -47
- data/lib/picky/categories.rb +60 -56
- data/lib/picky/categories_indexed.rb +73 -82
- data/lib/picky/categories_indexing.rb +12 -8
- data/lib/picky/category.rb +109 -120
- data/lib/picky/category_indexed.rb +39 -41
- data/lib/picky/category_indexing.rb +123 -125
- data/lib/picky/character_substituters/west_european.rb +32 -26
- data/lib/{constants.rb → picky/constants.rb} +0 -0
- data/lib/picky/cores.rb +96 -92
- data/lib/{deployment.rb → picky/deployment.rb} +0 -0
- data/lib/picky/frontend_adapters/rack.rb +133 -118
- data/lib/picky/generators/aliases.rb +5 -3
- data/lib/picky/generators/base.rb +11 -7
- data/lib/picky/generators/partial/default.rb +7 -3
- data/lib/picky/generators/partial/none.rb +24 -20
- data/lib/picky/generators/partial/strategy.rb +20 -16
- data/lib/picky/generators/partial/substring.rb +94 -90
- data/lib/picky/generators/partial_generator.rb +11 -7
- data/lib/picky/generators/similarity/default.rb +9 -5
- data/lib/picky/generators/similarity/double_metaphone.rb +20 -16
- data/lib/picky/generators/similarity/metaphone.rb +20 -16
- data/lib/picky/generators/similarity/none.rb +23 -19
- data/lib/picky/generators/similarity/phonetic.rb +49 -45
- data/lib/picky/generators/similarity/soundex.rb +20 -16
- data/lib/picky/generators/similarity/strategy.rb +10 -6
- data/lib/picky/generators/similarity_generator.rb +11 -7
- data/lib/picky/generators/strategy.rb +14 -10
- data/lib/picky/generators/weights/default.rb +9 -5
- data/lib/picky/generators/weights/logarithmic.rb +30 -26
- data/lib/picky/generators/weights/strategy.rb +10 -6
- data/lib/picky/generators/weights_generator.rb +11 -7
- data/lib/picky/helpers/measuring.rb +20 -16
- data/lib/picky/indexed/bundle/base.rb +39 -37
- data/lib/picky/indexed/bundle/memory.rb +68 -64
- data/lib/picky/indexed/bundle/redis.rb +73 -69
- data/lib/picky/indexed/wrappers/bundle/calculation.rb +26 -22
- data/lib/picky/indexed/wrappers/bundle/location.rb +30 -26
- data/lib/picky/indexed/wrappers/bundle/wrapper.rb +36 -32
- data/lib/picky/indexed/wrappers/category/location.rb +17 -13
- data/lib/picky/indexed/wrappers/exact_first.rb +46 -42
- data/lib/picky/indexers/base.rb +26 -22
- data/lib/picky/indexers/parallel.rb +62 -58
- data/lib/picky/indexers/serial.rb +41 -37
- data/lib/picky/indexes/index.rb +400 -0
- data/lib/picky/indexes/index_indexed.rb +24 -0
- data/lib/picky/indexes/index_indexing.rb +138 -0
- data/lib/picky/indexes/memory.rb +20 -0
- data/lib/picky/indexes/redis.rb +20 -0
- data/lib/picky/indexes.rb +68 -61
- data/lib/picky/indexes_indexed.rb +16 -12
- data/lib/picky/indexes_indexing.rb +41 -37
- data/lib/picky/indexing/bundle/base.rb +216 -205
- data/lib/picky/indexing/bundle/memory.rb +16 -11
- data/lib/picky/indexing/bundle/redis.rb +14 -12
- data/lib/picky/indexing/wrappers/category/location.rb +17 -13
- data/lib/picky/interfaces/live_parameters.rb +159 -154
- data/lib/picky/loader.rb +267 -304
- data/lib/picky/loggers/search.rb +20 -13
- data/lib/picky/no_source_specified_exception.rb +7 -3
- data/lib/picky/performant.rb +6 -2
- data/lib/picky/query/allocation.rb +71 -67
- data/lib/picky/query/allocations.rb +99 -94
- data/lib/picky/query/combination.rb +70 -66
- data/lib/picky/query/combinations/base.rb +56 -52
- data/lib/picky/query/combinations/memory.rb +36 -32
- data/lib/picky/query/combinations/redis.rb +66 -62
- data/lib/picky/query/indexes.rb +175 -160
- data/lib/picky/query/qualifier_category_mapper.rb +43 -0
- data/lib/picky/query/token.rb +165 -172
- data/lib/picky/query/tokens.rb +86 -82
- data/lib/picky/query/weights.rb +44 -48
- data/lib/picky/query.rb +5 -1
- data/lib/picky/rack/harakiri.rb +51 -47
- data/lib/picky/results.rb +81 -77
- data/lib/picky/search.rb +169 -158
- data/lib/picky/sinatra.rb +34 -0
- data/lib/picky/sources/base.rb +73 -70
- data/lib/picky/sources/couch.rb +61 -57
- data/lib/picky/sources/csv.rb +68 -64
- data/lib/picky/sources/db.rb +139 -135
- data/lib/picky/sources/delicious.rb +52 -48
- data/lib/picky/sources/mongo.rb +68 -63
- data/lib/picky/sources/wrappers/base.rb +20 -16
- data/lib/picky/sources/wrappers/location.rb +37 -33
- data/lib/picky/statistics.rb +46 -43
- data/lib/picky/tasks.rb +3 -0
- data/lib/picky/tokenizers/base.rb +192 -187
- data/lib/picky/tokenizers/index.rb +25 -21
- data/lib/picky/tokenizers/location.rb +33 -29
- data/lib/picky/tokenizers/query.rb +49 -43
- data/lib/picky.rb +21 -13
- data/lib/tasks/application.rake +1 -1
- data/lib/tasks/index.rake +3 -3
- data/lib/tasks/routes.rake +1 -1
- data/lib/tasks/server.rake +1 -1
- data/spec/lib/adapters/rack/base_spec.rb +1 -1
- data/spec/lib/adapters/rack/live_parameters_spec.rb +1 -1
- data/spec/lib/adapters/rack/query_spec.rb +1 -1
- data/spec/lib/application_spec.rb +39 -32
- data/spec/lib/backend/file/basic_spec.rb +2 -2
- data/spec/lib/backend/file/json_spec.rb +2 -2
- data/spec/lib/backend/file/marshal_spec.rb +2 -2
- data/spec/lib/backend/file/text_spec.rb +1 -1
- data/spec/lib/backend/files_spec.rb +14 -24
- data/spec/lib/backend/redis/basic_spec.rb +2 -2
- data/spec/lib/backend/redis/list_hash_spec.rb +3 -3
- data/spec/lib/backend/redis/string_hash_spec.rb +3 -3
- data/spec/lib/backend/redis_spec.rb +20 -13
- data/spec/lib/calculations/location_spec.rb +1 -1
- data/spec/lib/categories_indexed_spec.rb +16 -34
- data/spec/lib/category_indexed_spec.rb +9 -27
- data/spec/lib/category_indexing_spec.rb +2 -3
- data/spec/lib/category_spec.rb +10 -10
- data/spec/lib/character_substituters/west_european_spec.rb +6 -5
- data/spec/lib/cores_spec.rb +17 -17
- data/spec/lib/extensions/symbol_spec.rb +15 -1
- data/spec/lib/frontend_adapters/rack_spec.rb +20 -20
- data/spec/lib/generators/aliases_spec.rb +3 -3
- data/spec/lib/generators/cacher_strategy_spec.rb +1 -1
- data/spec/lib/generators/partial/default_spec.rb +3 -3
- data/spec/lib/generators/partial/none_spec.rb +2 -2
- data/spec/lib/generators/partial/substring_spec.rb +1 -1
- data/spec/lib/generators/partial_generator_spec.rb +3 -3
- data/spec/lib/generators/similarity/double_metaphone_spec.rb +1 -1
- data/spec/lib/generators/similarity/metaphone_spec.rb +1 -1
- data/spec/lib/generators/similarity/none_spec.rb +1 -1
- data/spec/lib/generators/similarity/phonetic_spec.rb +1 -1
- data/spec/lib/generators/similarity/soundex_spec.rb +1 -1
- data/spec/lib/generators/similarity_generator_spec.rb +2 -2
- data/spec/lib/generators/weights/logarithmic_spec.rb +1 -1
- data/spec/lib/generators/weights_generator_spec.rb +1 -1
- data/spec/lib/helpers/measuring_spec.rb +2 -2
- data/spec/lib/indexed/bundle/memory_spec.rb +6 -6
- data/spec/lib/indexed/bundle/redis_spec.rb +4 -4
- data/spec/lib/indexed/wrappers/bundle/calculation_spec.rb +2 -3
- data/spec/lib/indexed/wrappers/bundle/wrapper_spec.rb +2 -2
- data/spec/lib/indexed/wrappers/exact_first_spec.rb +5 -5
- data/spec/lib/indexers/base_spec.rb +1 -1
- data/spec/lib/indexers/parallel_spec.rb +1 -1
- data/spec/lib/indexers/serial_spec.rb +1 -1
- data/spec/lib/{index/base_indexed_spec.rb → indexes/index_indexed_spec.rb} +3 -3
- data/spec/lib/{index/base_indexing_spec.rb → indexes/index_indexing_spec.rb} +19 -2
- data/spec/lib/{index/base_spec.rb → indexes/index_spec.rb} +6 -25
- data/spec/lib/{index → indexes}/redis_spec.rb +1 -1
- data/spec/lib/indexes_class_spec.rb +2 -2
- data/spec/lib/indexes_indexed_spec.rb +1 -1
- data/spec/lib/indexes_indexing_spec.rb +1 -1
- data/spec/lib/indexes_spec.rb +1 -1
- data/spec/lib/indexing/bundle/base_spec.rb +7 -5
- data/spec/lib/indexing/bundle/memory_partial_generation_speed_spec.rb +4 -4
- data/spec/lib/indexing/bundle/memory_spec.rb +15 -15
- data/spec/lib/indexing/bundle/redis_spec.rb +9 -9
- data/spec/lib/interfaces/live_parameters_spec.rb +5 -5
- data/spec/lib/loader_spec.rb +17 -19
- data/spec/lib/loggers/search_spec.rb +2 -2
- data/spec/lib/query/allocation_spec.rb +1 -1
- data/spec/lib/query/allocations_spec.rb +1 -1
- data/spec/lib/query/combination_spec.rb +4 -4
- data/spec/lib/query/combinations/base_spec.rb +1 -1
- data/spec/lib/query/combinations/memory_spec.rb +1 -1
- data/spec/lib/query/combinations/redis_spec.rb +1 -1
- data/spec/lib/query/indexes_spec.rb +7 -2
- data/spec/lib/query/qualifier_category_mapper_spec.rb +34 -0
- data/spec/lib/query/token_spec.rb +32 -53
- data/spec/lib/query/tokens_spec.rb +30 -35
- data/spec/lib/query/weights_spec.rb +16 -16
- data/spec/lib/rack/harakiri_spec.rb +5 -5
- data/spec/lib/results_spec.rb +1 -1
- data/spec/lib/search_spec.rb +24 -22
- data/spec/lib/sinatra_spec.rb +36 -0
- data/spec/lib/sources/base_spec.rb +1 -1
- data/spec/lib/sources/couch_spec.rb +9 -9
- data/spec/lib/sources/csv_spec.rb +7 -7
- data/spec/lib/sources/db_spec.rb +2 -2
- data/spec/lib/sources/delicious_spec.rb +5 -5
- data/spec/lib/sources/mongo_spec.rb +7 -7
- data/spec/lib/sources/wrappers/base_spec.rb +2 -2
- data/spec/lib/sources/wrappers/location_spec.rb +1 -1
- data/spec/lib/statistics_spec.rb +1 -1
- data/spec/lib/tokenizers/base_spec.rb +2 -2
- data/spec/lib/tokenizers/index_spec.rb +1 -1
- data/spec/lib/tokenizers/query_spec.rb +1 -1
- metadata +30 -30
- data/lib/picky/adapters/rack/query.rb +0 -65
- data/lib/picky/index/base.rb +0 -409
- data/lib/picky/index/base_indexed.rb +0 -29
- data/lib/picky/index/base_indexing.rb +0 -127
- data/lib/picky/index/memory.rb +0 -16
- data/lib/picky/index/redis.rb +0 -16
- data/lib/picky/query/qualifiers.rb +0 -76
- data/lib/picky/query/solr.rb +0 -60
- data/lib/picky/signals.rb +0 -8
- data/lib/picky-tasks.rb +0 -6
- data/lib/tasks/spec.rake +0 -11
- data/spec/lib/query/qualifiers_spec.rb +0 -31
@@ -1,48 +1,52 @@
|
|
1
|
-
module
|
1
|
+
module Picky
|
2
2
|
|
3
|
-
module
|
3
|
+
module Sources
|
4
4
|
|
5
|
-
|
6
|
-
#
|
7
|
-
class Location < Base
|
5
|
+
module Wrappers
|
8
6
|
|
9
|
-
|
10
|
-
|
11
|
-
def initialize source, grid, precision = 1
|
12
|
-
super source
|
13
|
-
@calculation = Calculations::Location.new grid, precision
|
14
|
-
end
|
15
|
-
|
16
|
-
# Yield the data (id, text for id) for the given category.
|
7
|
+
# Should this actually just be a tokenizer?
|
17
8
|
#
|
18
|
-
|
19
|
-
minimum = 1.0/0
|
9
|
+
class Location < Base
|
20
10
|
|
21
|
-
|
22
|
-
#
|
23
|
-
locations = []
|
11
|
+
attr_reader :calculation
|
24
12
|
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
location = location.to_f
|
29
|
-
minimum = location if location < minimum
|
30
|
-
locations << [indexed_id, location]
|
13
|
+
def initialize source, grid, precision = 1
|
14
|
+
super source
|
15
|
+
@calculation = Calculations::Location.new grid, precision
|
31
16
|
end
|
32
17
|
|
33
|
-
|
34
|
-
|
35
|
-
# Recalculate locations.
|
18
|
+
# Yield the data (id, text for id) for the given category.
|
36
19
|
#
|
37
|
-
|
38
|
-
|
39
|
-
|
20
|
+
def harvest category
|
21
|
+
minimum = 1.0/0
|
22
|
+
|
23
|
+
# Cache.
|
24
|
+
#
|
25
|
+
locations = []
|
26
|
+
|
27
|
+
# Gather min/max.
|
28
|
+
#
|
29
|
+
source.harvest category do |indexed_id, location|
|
30
|
+
location = location.to_f
|
31
|
+
minimum = location if location < minimum
|
32
|
+
locations << [indexed_id, location]
|
40
33
|
end
|
34
|
+
|
35
|
+
calculation.minimum = minimum
|
36
|
+
|
37
|
+
# Recalculate locations.
|
38
|
+
#
|
39
|
+
locations.each do |indexed_id, location|
|
40
|
+
calculation.recalculated_range(location).each do |new_location|
|
41
|
+
yield indexed_id, new_location.to_s
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
# TODO Move to the right place.
|
46
|
+
#
|
47
|
+
category.indexing_exact[:location_minimum] = minimum
|
41
48
|
end
|
42
49
|
|
43
|
-
# TODO Move to the right place.
|
44
|
-
#
|
45
|
-
category.indexing_exact[:location_minimum] = minimum
|
46
50
|
end
|
47
51
|
|
48
52
|
end
|
data/lib/picky/statistics.rb
CHANGED
@@ -1,60 +1,63 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
#
|
3
|
+
module Picky
|
3
4
|
|
4
|
-
# Gathers various statistics.
|
5
|
-
#
|
6
|
-
class Statistics # :nodoc:all
|
5
|
+
# Gathers various statistics.
|
6
|
+
#
|
7
|
+
class Statistics # :nodoc:all
|
7
8
|
|
8
|
-
|
9
|
-
|
10
|
-
|
9
|
+
def initialize
|
10
|
+
@indexes = ["\033[1mIndexes analysis\033[m:"]
|
11
|
+
end
|
11
12
|
|
12
|
-
|
13
|
-
|
13
|
+
def preamble
|
14
|
+
loc = lines_of_code File.open('app/application.rb').read
|
14
15
|
|
15
|
-
|
16
|
-
\033[1mApplication(s)\033[m
|
17
|
-
|
18
|
-
|
19
|
-
PREAMBLE
|
20
|
-
|
16
|
+
@preamble ||= <<-PREAMBLE
|
17
|
+
\033[1mApplication(s)\033[m
|
18
|
+
Definition LOC: #{"%4d" % loc}
|
19
|
+
Indexes defined: #{"%4d" % Indexes.size}
|
20
|
+
PREAMBLE
|
21
|
+
end
|
21
22
|
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
23
|
+
# Gathers information about the application.
|
24
|
+
#
|
25
|
+
def application
|
26
|
+
preamble
|
27
|
+
@application = Application.apps.map &:indented_to_s
|
28
|
+
end
|
28
29
|
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
#{"#{category.index_name}".indented_to_s}\n
|
35
|
-
#{"#{category.name}".indented_to_s(4)}\n
|
36
|
-
#{"exact\n#{Analyzer.new.analyze(category.indexed_exact).indented_to_s}".indented_to_s(6)}\n
|
37
|
-
#{"partial\n#{Analyzer.new.analyze(category.indexed_partial).indented_to_s}".indented_to_s(6)}
|
38
|
-
ANALYSIS
|
30
|
+
# Gathers information about the indexes.
|
31
|
+
#
|
32
|
+
def analyze object
|
33
|
+
object.each_category do |category|
|
34
|
+
@indexes << <<-ANALYSIS
|
35
|
+
#{"#{category.index_name}".indented_to_s}\n
|
36
|
+
#{"#{category.name}".indented_to_s(4)}\n
|
37
|
+
#{"exact\n#{Analyzer.new.analyze(category.indexed_exact).indented_to_s}".indented_to_s(6)}\n
|
38
|
+
#{"partial\n#{Analyzer.new.analyze(category.indexed_partial).indented_to_s}".indented_to_s(6)}
|
39
|
+
ANALYSIS
|
40
|
+
end
|
39
41
|
end
|
40
|
-
end
|
41
42
|
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
43
|
+
# Outputs all gathered statistics.
|
44
|
+
#
|
45
|
+
def to_s
|
46
|
+
<<-STATS
|
46
47
|
|
47
|
-
Picky Configuration:
|
48
|
+
Picky Configuration:
|
48
49
|
|
49
|
-
#{[@preamble, @application, @indexes.join("\n")].compact.join("\n")}
|
50
|
-
STATS
|
51
|
-
|
50
|
+
#{[@preamble, @application, @indexes.join("\n")].compact.join("\n")}
|
51
|
+
STATS
|
52
|
+
end
|
52
53
|
|
53
|
-
|
54
|
-
|
54
|
+
# Internal methods.
|
55
|
+
#
|
56
|
+
|
57
|
+
def lines_of_code text
|
58
|
+
text.scan(/^\s*[^#\s].*$/).size
|
59
|
+
end
|
55
60
|
|
56
|
-
def lines_of_code text
|
57
|
-
text.scan(/^\s*[^#\s].*$/).size
|
58
61
|
end
|
59
62
|
|
60
63
|
end
|
data/lib/picky/tasks.rb
ADDED
@@ -1,16 +1,18 @@
|
|
1
|
-
module
|
1
|
+
module Picky
|
2
2
|
|
3
|
-
|
4
|
-
#
|
5
|
-
class Base
|
3
|
+
module Tokenizers # :nodoc:all
|
6
4
|
|
7
|
-
#
|
5
|
+
# Defines tokenizing processes used both in indexing and querying.
|
8
6
|
#
|
9
|
-
|
7
|
+
class Base
|
10
8
|
|
11
|
-
|
12
|
-
|
13
|
-
|
9
|
+
# TODO Move EMPTY_STRING top level.
|
10
|
+
#
|
11
|
+
EMPTY_STRING = ''.freeze
|
12
|
+
|
13
|
+
def to_s
|
14
|
+
reject_condition_location = @reject_condition.to_s[/:(\d+) \(lambda\)/, 1]
|
15
|
+
<<-TOKENIZER
|
14
16
|
Removes characters: #{@removes_characters_regexp ? "/#{@removes_characters_regexp.source}/" : '-'}
|
15
17
|
Stopwords: #{@remove_stopwords_regexp ? "/#{@remove_stopwords_regexp.source}/" : '-'}
|
16
18
|
Splits text on: #{@splits_text_on.respond_to?(:source) ? "/#{@splits_text_on.source}/" : (@splits_text_on ? @splits_text_on : '-')}
|
@@ -19,204 +21,207 @@ Normalizes words: #{@normalizes_words_regexp_replaces ? @normalizes_wor
|
|
19
21
|
Rejects tokens? #{reject_condition_location ? "Yes, see line #{reject_condition_location} in app/application.rb" : '-'}
|
20
22
|
Substitutes chars? #{@substituter ? "Yes, using #{@substituter}." : '-' }
|
21
23
|
Case sensitive? #{@case_sensitive ? "Yes." : "-"}
|
22
|
-
|
23
|
-
|
24
|
+
TOKENIZER
|
25
|
+
end
|
24
26
|
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
27
|
+
# Stopwords.
|
28
|
+
#
|
29
|
+
# We only allow regexps (even if string would be okay
|
30
|
+
# too for gsub! - it's too hard to understand)
|
31
|
+
#
|
32
|
+
def stopwords regexp
|
33
|
+
check_argument_in __method__, Regexp, regexp
|
34
|
+
@remove_stopwords_regexp = regexp
|
35
|
+
end
|
36
|
+
def remove_stopwords text
|
37
|
+
text.gsub! @remove_stopwords_regexp, EMPTY_STRING if @remove_stopwords_regexp
|
38
|
+
text
|
39
|
+
end
|
40
|
+
@@non_single_stopword_regexp = /^\b[\w:]+?\b[\.\*\~]?\s?$/
|
41
|
+
def remove_non_single_stopwords text
|
42
|
+
return text if text.match @@non_single_stopword_regexp
|
43
|
+
remove_stopwords text
|
44
|
+
end
|
43
45
|
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
46
|
+
# Illegals.
|
47
|
+
#
|
48
|
+
# We only allow regexps (even if string would be okay
|
49
|
+
# too for gsub! - it's too hard to understand)
|
50
|
+
#
|
51
|
+
def removes_characters regexp
|
52
|
+
check_argument_in __method__, Regexp, regexp
|
53
|
+
@removes_characters_regexp = regexp
|
54
|
+
end
|
55
|
+
def remove_illegals text
|
56
|
+
text.gsub! @removes_characters_regexp, EMPTY_STRING if @removes_characters_regexp
|
57
|
+
text
|
58
|
+
end
|
57
59
|
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
60
|
+
# Splitting.
|
61
|
+
#
|
62
|
+
# We allow Strings and Regexps.
|
63
|
+
# Note: We do not test against to_str since symbols do not work with String#split.
|
64
|
+
#
|
65
|
+
def splits_text_on regexp_or_string
|
66
|
+
raise ArgumentError.new "#{__method__} takes a Regexp or String as argument, not a #{regexp_or_string.class}." unless Regexp === regexp_or_string || String === regexp_or_string
|
67
|
+
@splits_text_on = regexp_or_string
|
68
|
+
end
|
69
|
+
def split text
|
70
|
+
text.split @splits_text_on
|
71
|
+
end
|
70
72
|
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
73
|
+
# Normalizing.
|
74
|
+
#
|
75
|
+
# We only allow arrays.
|
76
|
+
#
|
77
|
+
def normalizes_words regexp_replaces
|
78
|
+
raise ArgumentError.new "#{__method__} takes an Array of replaces as argument, not a #{regexp_replaces.class}." unless regexp_replaces.respond_to?(:to_ary)
|
79
|
+
@normalizes_words_regexp_replaces = regexp_replaces
|
80
|
+
end
|
81
|
+
def normalize_with_patterns text
|
82
|
+
return text unless @normalizes_words_regexp_replaces
|
81
83
|
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
84
|
+
@normalizes_words_regexp_replaces.each do |regex, replace|
|
85
|
+
# This should be sufficient
|
86
|
+
#
|
87
|
+
text.gsub!(regex, replace) and break
|
88
|
+
end
|
89
|
+
|
90
|
+
remove_after_normalizing_illegals text
|
91
|
+
text
|
86
92
|
end
|
87
|
-
remove_after_normalizing_illegals text
|
88
|
-
text
|
89
|
-
end
|
90
93
|
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
94
|
+
# Illegal after normalizing.
|
95
|
+
#
|
96
|
+
# We only allow regexps (even if string would be okay
|
97
|
+
# too for gsub! - it's too hard to understand)
|
98
|
+
#
|
99
|
+
def removes_characters_after_splitting regexp
|
100
|
+
check_argument_in __method__, Regexp, regexp
|
101
|
+
@removes_characters_after_splitting_regexp = regexp
|
102
|
+
end
|
103
|
+
def remove_after_normalizing_illegals text
|
104
|
+
text.gsub! @removes_characters_after_splitting_regexp, EMPTY_STRING if @removes_characters_after_splitting_regexp
|
105
|
+
end
|
103
106
|
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
107
|
+
# Substitute Characters with this substituter.
|
108
|
+
#
|
109
|
+
# Default is European Character substitution.
|
110
|
+
#
|
111
|
+
def substitutes_characters_with substituter = CharacterSubstituters::WestEuropean.new
|
112
|
+
raise ArgumentError.new "The substitutes_characters_with option needs a character substituter, which responds to #substitute." unless substituter.respond_to?(:substitute)
|
113
|
+
@substituter = substituter
|
114
|
+
end
|
115
|
+
def substitute_characters text
|
116
|
+
substituter?? substituter.substitute(text) : text
|
117
|
+
end
|
115
118
|
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
119
|
+
# Reject tokens after tokenizing based on the given criteria.
|
120
|
+
#
|
121
|
+
# Note: Currently only for indexing.
|
122
|
+
#
|
123
|
+
def reject_token_if &condition
|
124
|
+
@reject_condition = condition
|
125
|
+
end
|
126
|
+
def reject tokens
|
127
|
+
tokens.reject! &@reject_condition
|
128
|
+
end
|
126
129
|
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
130
|
+
def case_sensitive case_sensitive
|
131
|
+
@case_sensitive = case_sensitive
|
132
|
+
end
|
133
|
+
def downcase?
|
134
|
+
!@case_sensitive
|
135
|
+
end
|
133
136
|
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
137
|
+
# Checks if the right argument type has been given.
|
138
|
+
#
|
139
|
+
def check_argument_in method, type, argument, &condition
|
140
|
+
raise ArgumentError.new "Application##{method} takes a #{type} as argument, not a #{argument.class}." unless type === argument
|
141
|
+
end
|
139
142
|
|
140
143
|
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
144
|
+
# Returns a number of tokens, generated from the given text.
|
145
|
+
#
|
146
|
+
# Note:
|
147
|
+
# * preprocess, pretokenize are hooks
|
148
|
+
#
|
149
|
+
def tokenize text
|
150
|
+
text = preprocess text # processing the text
|
151
|
+
return empty_tokens if text.blank?
|
152
|
+
words = pretokenize text # splitting and preparations for tokenizing
|
153
|
+
return empty_tokens if words.empty?
|
154
|
+
tokens = tokens_for words # creating tokens / strings
|
155
|
+
process tokens # processing tokens / strings
|
156
|
+
end
|
157
|
+
|
158
|
+
attr_reader :substituter
|
159
|
+
alias substituter? substituter
|
154
160
|
|
155
|
-
|
156
|
-
|
161
|
+
def initialize options = {}
|
162
|
+
removes_characters options[:removes_characters] if options[:removes_characters]
|
163
|
+
contracts_expressions *options[:contracts_expressions] if options[:contracts_expressions]
|
164
|
+
stopwords options[:stopwords] if options[:stopwords]
|
165
|
+
normalizes_words options[:normalizes_words] if options[:normalizes_words]
|
166
|
+
removes_characters_after_splitting options[:removes_characters_after_splitting] if options[:removes_characters_after_splitting]
|
167
|
+
substitutes_characters_with options[:substitutes_characters_with] if options[:substitutes_characters_with]
|
168
|
+
case_sensitive options[:case_sensitive] unless options[:case_sensitive].nil?
|
157
169
|
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
removes_characters_after_splitting options[:removes_characters_after_splitting] if options[:removes_characters_after_splitting]
|
164
|
-
substitutes_characters_with options[:substitutes_characters_with] if options[:substitutes_characters_with]
|
165
|
-
case_sensitive options[:case_sensitive] unless options[:case_sensitive].nil?
|
170
|
+
# Defaults.
|
171
|
+
#
|
172
|
+
splits_text_on options[:splits_text_on] || /\s/
|
173
|
+
reject_token_if &(options[:reject_token_if] || options[:rejects_token_if] || :blank?) # TODO Decide on using an s or not.
|
174
|
+
end
|
166
175
|
|
167
|
-
#
|
176
|
+
# Default preprocessing hook.
|
168
177
|
#
|
169
|
-
|
170
|
-
|
171
|
-
|
178
|
+
# Does:
|
179
|
+
# 1. Character substitution.
|
180
|
+
# 2. Remove illegal expressions.
|
181
|
+
# 3. Remove non-single stopwords. (Stopwords that occur with other words)
|
182
|
+
#
|
183
|
+
def preprocess text
|
184
|
+
text = substitute_characters text
|
185
|
+
remove_illegals text
|
186
|
+
# We do not remove single stopwords e.g. in the indexer for
|
187
|
+
# an entirely different reason than in the query tokenizer.
|
188
|
+
# An indexed thing with just name "UND" (a possible stopword)
|
189
|
+
# should not lose its name.
|
190
|
+
#
|
191
|
+
remove_non_single_stopwords text
|
192
|
+
text
|
193
|
+
end
|
194
|
+
# Pretokenizing.
|
195
|
+
#
|
196
|
+
# Does:
|
197
|
+
# 1. Split the text into words.
|
198
|
+
# 2. Normalize each word.
|
199
|
+
#
|
200
|
+
def pretokenize text
|
201
|
+
words = split text
|
202
|
+
words.collect! do |word|
|
203
|
+
normalize_with_patterns word
|
204
|
+
word
|
205
|
+
end
|
206
|
+
end
|
207
|
+
# Basic postprocessing (overridden in both query/index tokenizers).
|
208
|
+
#
|
209
|
+
def process tokens
|
210
|
+
reject tokens # Reject any tokens that don't meet criteria
|
211
|
+
tokens
|
212
|
+
end
|
172
213
|
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
remove_illegals text
|
183
|
-
# We do not remove single stopwords e.g. in the indexer for
|
184
|
-
# an entirely different reason than in the query tokenizer.
|
185
|
-
# An indexed thing with just name "UND" (a possible stopword)
|
186
|
-
# should not lose its name.
|
187
|
-
#
|
188
|
-
remove_non_single_stopwords text
|
189
|
-
text
|
190
|
-
end
|
191
|
-
# Pretokenizing.
|
192
|
-
#
|
193
|
-
# Does:
|
194
|
-
# 1. Split the text into words.
|
195
|
-
# 2. Normalize each word.
|
196
|
-
#
|
197
|
-
def pretokenize text
|
198
|
-
words = split text
|
199
|
-
words.collect! do |word|
|
200
|
-
normalize_with_patterns word
|
201
|
-
word
|
214
|
+
# # Converts words into real tokens.
|
215
|
+
# #
|
216
|
+
# def tokens_for words
|
217
|
+
# Query::Tokens.new words.collect! { |word| token_for word }
|
218
|
+
# end
|
219
|
+
# Turns non-blank text into symbols.
|
220
|
+
#
|
221
|
+
def symbolize text
|
222
|
+
text.blank? ? nil : text.to_sym
|
202
223
|
end
|
203
|
-
end
|
204
|
-
# Basic postprocessing (overridden in both query/index tokenizers).
|
205
|
-
#
|
206
|
-
def process tokens
|
207
|
-
reject tokens # Reject any tokens that don't meet criteria
|
208
|
-
tokens
|
209
|
-
end
|
210
224
|
|
211
|
-
# # Converts words into real tokens.
|
212
|
-
# #
|
213
|
-
# def tokens_for words
|
214
|
-
# Query::Tokens.new words.collect! { |word| token_for word }
|
215
|
-
# end
|
216
|
-
# Turns non-blank text into symbols.
|
217
|
-
#
|
218
|
-
def symbolize text
|
219
|
-
text.blank? ? nil : text.to_sym
|
220
225
|
end
|
221
226
|
|
222
227
|
end
|
@@ -1,28 +1,32 @@
|
|
1
|
-
module
|
1
|
+
module Picky
|
2
2
|
|
3
|
-
|
4
|
-
#
|
5
|
-
# Override in indexing subclasses and define in configuration.
|
6
|
-
#
|
7
|
-
class Index < Base
|
3
|
+
module Tokenizers
|
8
4
|
|
9
|
-
|
10
|
-
@default = new_default
|
11
|
-
end
|
12
|
-
def self.default
|
13
|
-
@default ||= new
|
14
|
-
end
|
15
|
-
|
16
|
-
# Does not actually return a token, but a
|
17
|
-
# symbol "token".
|
5
|
+
# The base indexing tokenizer.
|
18
6
|
#
|
19
|
-
|
20
|
-
words.collect! { |word| word.downcase! if downcase?; word.to_sym }
|
21
|
-
end
|
22
|
-
# Returns empty tokens.
|
7
|
+
# Override in indexing subclasses and define in configuration.
|
23
8
|
#
|
24
|
-
|
25
|
-
|
9
|
+
class Index < Base
|
10
|
+
|
11
|
+
def self.default= new_default
|
12
|
+
@default = new_default
|
13
|
+
end
|
14
|
+
def self.default
|
15
|
+
@default ||= new
|
16
|
+
end
|
17
|
+
|
18
|
+
# Does not actually return a token, but a
|
19
|
+
# symbol "token".
|
20
|
+
#
|
21
|
+
def tokens_for words
|
22
|
+
words.collect! { |word| word.downcase! if downcase?; word.to_sym }
|
23
|
+
end
|
24
|
+
# Returns empty tokens.
|
25
|
+
#
|
26
|
+
def empty_tokens
|
27
|
+
[]
|
28
|
+
end
|
29
|
+
|
26
30
|
end
|
27
31
|
|
28
32
|
end
|