picky 2.7.0 → 3.0.0.pre1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/picky/adapters/rack/base.rb +20 -16
- data/lib/picky/adapters/rack/live_parameters.rb +28 -24
- data/lib/picky/adapters/rack/search.rb +67 -0
- data/lib/picky/adapters/rack.rb +27 -23
- data/lib/picky/application.rb +246 -236
- data/lib/picky/backend/base.rb +115 -119
- data/lib/picky/backend/file/basic.rb +102 -98
- data/lib/picky/backend/file/json.rb +27 -23
- data/lib/picky/backend/file/marshal.rb +32 -28
- data/lib/picky/backend/file/text.rb +45 -41
- data/lib/picky/backend/files.rb +19 -15
- data/lib/picky/backend/redis/basic.rb +76 -72
- data/lib/picky/backend/redis/list_hash.rb +40 -36
- data/lib/picky/backend/redis/string_hash.rb +30 -26
- data/lib/picky/backend/redis.rb +32 -28
- data/lib/picky/bundle.rb +82 -57
- data/lib/{bundling.rb → picky/bundling.rb} +0 -0
- data/lib/picky/calculations/location.rb +51 -47
- data/lib/picky/categories.rb +60 -56
- data/lib/picky/categories_indexed.rb +73 -82
- data/lib/picky/categories_indexing.rb +12 -8
- data/lib/picky/category.rb +109 -120
- data/lib/picky/category_indexed.rb +39 -41
- data/lib/picky/category_indexing.rb +123 -125
- data/lib/picky/character_substituters/west_european.rb +32 -26
- data/lib/{constants.rb → picky/constants.rb} +0 -0
- data/lib/picky/cores.rb +96 -92
- data/lib/{deployment.rb → picky/deployment.rb} +0 -0
- data/lib/picky/frontend_adapters/rack.rb +133 -118
- data/lib/picky/generators/aliases.rb +5 -3
- data/lib/picky/generators/base.rb +11 -7
- data/lib/picky/generators/partial/default.rb +7 -3
- data/lib/picky/generators/partial/none.rb +24 -20
- data/lib/picky/generators/partial/strategy.rb +20 -16
- data/lib/picky/generators/partial/substring.rb +94 -90
- data/lib/picky/generators/partial_generator.rb +11 -7
- data/lib/picky/generators/similarity/default.rb +9 -5
- data/lib/picky/generators/similarity/double_metaphone.rb +20 -16
- data/lib/picky/generators/similarity/metaphone.rb +20 -16
- data/lib/picky/generators/similarity/none.rb +23 -19
- data/lib/picky/generators/similarity/phonetic.rb +49 -45
- data/lib/picky/generators/similarity/soundex.rb +20 -16
- data/lib/picky/generators/similarity/strategy.rb +10 -6
- data/lib/picky/generators/similarity_generator.rb +11 -7
- data/lib/picky/generators/strategy.rb +14 -10
- data/lib/picky/generators/weights/default.rb +9 -5
- data/lib/picky/generators/weights/logarithmic.rb +30 -26
- data/lib/picky/generators/weights/strategy.rb +10 -6
- data/lib/picky/generators/weights_generator.rb +11 -7
- data/lib/picky/helpers/measuring.rb +20 -16
- data/lib/picky/indexed/bundle/base.rb +39 -37
- data/lib/picky/indexed/bundle/memory.rb +68 -64
- data/lib/picky/indexed/bundle/redis.rb +73 -69
- data/lib/picky/indexed/wrappers/bundle/calculation.rb +26 -22
- data/lib/picky/indexed/wrappers/bundle/location.rb +30 -26
- data/lib/picky/indexed/wrappers/bundle/wrapper.rb +36 -32
- data/lib/picky/indexed/wrappers/category/location.rb +17 -13
- data/lib/picky/indexed/wrappers/exact_first.rb +46 -42
- data/lib/picky/indexers/base.rb +26 -22
- data/lib/picky/indexers/parallel.rb +62 -58
- data/lib/picky/indexers/serial.rb +41 -37
- data/lib/picky/indexes/index.rb +400 -0
- data/lib/picky/indexes/index_indexed.rb +24 -0
- data/lib/picky/indexes/index_indexing.rb +138 -0
- data/lib/picky/indexes/memory.rb +20 -0
- data/lib/picky/indexes/redis.rb +20 -0
- data/lib/picky/indexes.rb +68 -61
- data/lib/picky/indexes_indexed.rb +16 -12
- data/lib/picky/indexes_indexing.rb +41 -37
- data/lib/picky/indexing/bundle/base.rb +216 -205
- data/lib/picky/indexing/bundle/memory.rb +16 -11
- data/lib/picky/indexing/bundle/redis.rb +14 -12
- data/lib/picky/indexing/wrappers/category/location.rb +17 -13
- data/lib/picky/interfaces/live_parameters.rb +159 -154
- data/lib/picky/loader.rb +267 -304
- data/lib/picky/loggers/search.rb +20 -13
- data/lib/picky/no_source_specified_exception.rb +7 -3
- data/lib/picky/performant.rb +6 -2
- data/lib/picky/query/allocation.rb +71 -67
- data/lib/picky/query/allocations.rb +99 -94
- data/lib/picky/query/combination.rb +70 -66
- data/lib/picky/query/combinations/base.rb +56 -52
- data/lib/picky/query/combinations/memory.rb +36 -32
- data/lib/picky/query/combinations/redis.rb +66 -62
- data/lib/picky/query/indexes.rb +175 -160
- data/lib/picky/query/qualifier_category_mapper.rb +43 -0
- data/lib/picky/query/token.rb +165 -172
- data/lib/picky/query/tokens.rb +86 -82
- data/lib/picky/query/weights.rb +44 -48
- data/lib/picky/query.rb +5 -1
- data/lib/picky/rack/harakiri.rb +51 -47
- data/lib/picky/results.rb +81 -77
- data/lib/picky/search.rb +169 -158
- data/lib/picky/sinatra.rb +34 -0
- data/lib/picky/sources/base.rb +73 -70
- data/lib/picky/sources/couch.rb +61 -57
- data/lib/picky/sources/csv.rb +68 -64
- data/lib/picky/sources/db.rb +139 -135
- data/lib/picky/sources/delicious.rb +52 -48
- data/lib/picky/sources/mongo.rb +68 -63
- data/lib/picky/sources/wrappers/base.rb +20 -16
- data/lib/picky/sources/wrappers/location.rb +37 -33
- data/lib/picky/statistics.rb +46 -43
- data/lib/picky/tasks.rb +3 -0
- data/lib/picky/tokenizers/base.rb +192 -187
- data/lib/picky/tokenizers/index.rb +25 -21
- data/lib/picky/tokenizers/location.rb +33 -29
- data/lib/picky/tokenizers/query.rb +49 -43
- data/lib/picky.rb +21 -13
- data/lib/tasks/application.rake +1 -1
- data/lib/tasks/index.rake +3 -3
- data/lib/tasks/routes.rake +1 -1
- data/lib/tasks/server.rake +1 -1
- data/spec/lib/adapters/rack/base_spec.rb +1 -1
- data/spec/lib/adapters/rack/live_parameters_spec.rb +1 -1
- data/spec/lib/adapters/rack/query_spec.rb +1 -1
- data/spec/lib/application_spec.rb +39 -32
- data/spec/lib/backend/file/basic_spec.rb +2 -2
- data/spec/lib/backend/file/json_spec.rb +2 -2
- data/spec/lib/backend/file/marshal_spec.rb +2 -2
- data/spec/lib/backend/file/text_spec.rb +1 -1
- data/spec/lib/backend/files_spec.rb +14 -24
- data/spec/lib/backend/redis/basic_spec.rb +2 -2
- data/spec/lib/backend/redis/list_hash_spec.rb +3 -3
- data/spec/lib/backend/redis/string_hash_spec.rb +3 -3
- data/spec/lib/backend/redis_spec.rb +20 -13
- data/spec/lib/calculations/location_spec.rb +1 -1
- data/spec/lib/categories_indexed_spec.rb +16 -34
- data/spec/lib/category_indexed_spec.rb +9 -27
- data/spec/lib/category_indexing_spec.rb +2 -3
- data/spec/lib/category_spec.rb +10 -10
- data/spec/lib/character_substituters/west_european_spec.rb +6 -5
- data/spec/lib/cores_spec.rb +17 -17
- data/spec/lib/extensions/symbol_spec.rb +15 -1
- data/spec/lib/frontend_adapters/rack_spec.rb +20 -20
- data/spec/lib/generators/aliases_spec.rb +3 -3
- data/spec/lib/generators/cacher_strategy_spec.rb +1 -1
- data/spec/lib/generators/partial/default_spec.rb +3 -3
- data/spec/lib/generators/partial/none_spec.rb +2 -2
- data/spec/lib/generators/partial/substring_spec.rb +1 -1
- data/spec/lib/generators/partial_generator_spec.rb +3 -3
- data/spec/lib/generators/similarity/double_metaphone_spec.rb +1 -1
- data/spec/lib/generators/similarity/metaphone_spec.rb +1 -1
- data/spec/lib/generators/similarity/none_spec.rb +1 -1
- data/spec/lib/generators/similarity/phonetic_spec.rb +1 -1
- data/spec/lib/generators/similarity/soundex_spec.rb +1 -1
- data/spec/lib/generators/similarity_generator_spec.rb +2 -2
- data/spec/lib/generators/weights/logarithmic_spec.rb +1 -1
- data/spec/lib/generators/weights_generator_spec.rb +1 -1
- data/spec/lib/helpers/measuring_spec.rb +2 -2
- data/spec/lib/indexed/bundle/memory_spec.rb +6 -6
- data/spec/lib/indexed/bundle/redis_spec.rb +4 -4
- data/spec/lib/indexed/wrappers/bundle/calculation_spec.rb +2 -3
- data/spec/lib/indexed/wrappers/bundle/wrapper_spec.rb +2 -2
- data/spec/lib/indexed/wrappers/exact_first_spec.rb +5 -5
- data/spec/lib/indexers/base_spec.rb +1 -1
- data/spec/lib/indexers/parallel_spec.rb +1 -1
- data/spec/lib/indexers/serial_spec.rb +1 -1
- data/spec/lib/{index/base_indexed_spec.rb → indexes/index_indexed_spec.rb} +3 -3
- data/spec/lib/{index/base_indexing_spec.rb → indexes/index_indexing_spec.rb} +19 -2
- data/spec/lib/{index/base_spec.rb → indexes/index_spec.rb} +6 -25
- data/spec/lib/{index → indexes}/redis_spec.rb +1 -1
- data/spec/lib/indexes_class_spec.rb +2 -2
- data/spec/lib/indexes_indexed_spec.rb +1 -1
- data/spec/lib/indexes_indexing_spec.rb +1 -1
- data/spec/lib/indexes_spec.rb +1 -1
- data/spec/lib/indexing/bundle/base_spec.rb +7 -5
- data/spec/lib/indexing/bundle/memory_partial_generation_speed_spec.rb +4 -4
- data/spec/lib/indexing/bundle/memory_spec.rb +15 -15
- data/spec/lib/indexing/bundle/redis_spec.rb +9 -9
- data/spec/lib/interfaces/live_parameters_spec.rb +5 -5
- data/spec/lib/loader_spec.rb +17 -19
- data/spec/lib/loggers/search_spec.rb +2 -2
- data/spec/lib/query/allocation_spec.rb +1 -1
- data/spec/lib/query/allocations_spec.rb +1 -1
- data/spec/lib/query/combination_spec.rb +4 -4
- data/spec/lib/query/combinations/base_spec.rb +1 -1
- data/spec/lib/query/combinations/memory_spec.rb +1 -1
- data/spec/lib/query/combinations/redis_spec.rb +1 -1
- data/spec/lib/query/indexes_spec.rb +7 -2
- data/spec/lib/query/qualifier_category_mapper_spec.rb +34 -0
- data/spec/lib/query/token_spec.rb +32 -53
- data/spec/lib/query/tokens_spec.rb +30 -35
- data/spec/lib/query/weights_spec.rb +16 -16
- data/spec/lib/rack/harakiri_spec.rb +5 -5
- data/spec/lib/results_spec.rb +1 -1
- data/spec/lib/search_spec.rb +24 -22
- data/spec/lib/sinatra_spec.rb +36 -0
- data/spec/lib/sources/base_spec.rb +1 -1
- data/spec/lib/sources/couch_spec.rb +9 -9
- data/spec/lib/sources/csv_spec.rb +7 -7
- data/spec/lib/sources/db_spec.rb +2 -2
- data/spec/lib/sources/delicious_spec.rb +5 -5
- data/spec/lib/sources/mongo_spec.rb +7 -7
- data/spec/lib/sources/wrappers/base_spec.rb +2 -2
- data/spec/lib/sources/wrappers/location_spec.rb +1 -1
- data/spec/lib/statistics_spec.rb +1 -1
- data/spec/lib/tokenizers/base_spec.rb +2 -2
- data/spec/lib/tokenizers/index_spec.rb +1 -1
- data/spec/lib/tokenizers/query_spec.rb +1 -1
- metadata +30 -30
- data/lib/picky/adapters/rack/query.rb +0 -65
- data/lib/picky/index/base.rb +0 -409
- data/lib/picky/index/base_indexed.rb +0 -29
- data/lib/picky/index/base_indexing.rb +0 -127
- data/lib/picky/index/memory.rb +0 -16
- data/lib/picky/index/redis.rb +0 -16
- data/lib/picky/query/qualifiers.rb +0 -76
- data/lib/picky/query/solr.rb +0 -60
- data/lib/picky/signals.rb +0 -8
- data/lib/picky-tasks.rb +0 -6
- data/lib/tasks/spec.rake +0 -11
- data/spec/lib/query/qualifiers_spec.rb +0 -31
|
@@ -1,48 +1,52 @@
|
|
|
1
|
-
module
|
|
1
|
+
module Picky
|
|
2
2
|
|
|
3
|
-
module
|
|
3
|
+
module Sources
|
|
4
4
|
|
|
5
|
-
|
|
6
|
-
#
|
|
7
|
-
class Location < Base
|
|
5
|
+
module Wrappers
|
|
8
6
|
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
def initialize source, grid, precision = 1
|
|
12
|
-
super source
|
|
13
|
-
@calculation = Calculations::Location.new grid, precision
|
|
14
|
-
end
|
|
15
|
-
|
|
16
|
-
# Yield the data (id, text for id) for the given category.
|
|
7
|
+
# Should this actually just be a tokenizer?
|
|
17
8
|
#
|
|
18
|
-
|
|
19
|
-
minimum = 1.0/0
|
|
9
|
+
class Location < Base
|
|
20
10
|
|
|
21
|
-
|
|
22
|
-
#
|
|
23
|
-
locations = []
|
|
11
|
+
attr_reader :calculation
|
|
24
12
|
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
location = location.to_f
|
|
29
|
-
minimum = location if location < minimum
|
|
30
|
-
locations << [indexed_id, location]
|
|
13
|
+
def initialize source, grid, precision = 1
|
|
14
|
+
super source
|
|
15
|
+
@calculation = Calculations::Location.new grid, precision
|
|
31
16
|
end
|
|
32
17
|
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
# Recalculate locations.
|
|
18
|
+
# Yield the data (id, text for id) for the given category.
|
|
36
19
|
#
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
20
|
+
def harvest category
|
|
21
|
+
minimum = 1.0/0
|
|
22
|
+
|
|
23
|
+
# Cache.
|
|
24
|
+
#
|
|
25
|
+
locations = []
|
|
26
|
+
|
|
27
|
+
# Gather min/max.
|
|
28
|
+
#
|
|
29
|
+
source.harvest category do |indexed_id, location|
|
|
30
|
+
location = location.to_f
|
|
31
|
+
minimum = location if location < minimum
|
|
32
|
+
locations << [indexed_id, location]
|
|
40
33
|
end
|
|
34
|
+
|
|
35
|
+
calculation.minimum = minimum
|
|
36
|
+
|
|
37
|
+
# Recalculate locations.
|
|
38
|
+
#
|
|
39
|
+
locations.each do |indexed_id, location|
|
|
40
|
+
calculation.recalculated_range(location).each do |new_location|
|
|
41
|
+
yield indexed_id, new_location.to_s
|
|
42
|
+
end
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
# TODO Move to the right place.
|
|
46
|
+
#
|
|
47
|
+
category.indexing_exact[:location_minimum] = minimum
|
|
41
48
|
end
|
|
42
49
|
|
|
43
|
-
# TODO Move to the right place.
|
|
44
|
-
#
|
|
45
|
-
category.indexing_exact[:location_minimum] = minimum
|
|
46
50
|
end
|
|
47
51
|
|
|
48
52
|
end
|
data/lib/picky/statistics.rb
CHANGED
|
@@ -1,60 +1,63 @@
|
|
|
1
1
|
# encoding: utf-8
|
|
2
2
|
#
|
|
3
|
+
module Picky
|
|
3
4
|
|
|
4
|
-
# Gathers various statistics.
|
|
5
|
-
#
|
|
6
|
-
class Statistics # :nodoc:all
|
|
5
|
+
# Gathers various statistics.
|
|
6
|
+
#
|
|
7
|
+
class Statistics # :nodoc:all
|
|
7
8
|
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
9
|
+
def initialize
|
|
10
|
+
@indexes = ["\033[1mIndexes analysis\033[m:"]
|
|
11
|
+
end
|
|
11
12
|
|
|
12
|
-
|
|
13
|
-
|
|
13
|
+
def preamble
|
|
14
|
+
loc = lines_of_code File.open('app/application.rb').read
|
|
14
15
|
|
|
15
|
-
|
|
16
|
-
\033[1mApplication(s)\033[m
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
PREAMBLE
|
|
20
|
-
|
|
16
|
+
@preamble ||= <<-PREAMBLE
|
|
17
|
+
\033[1mApplication(s)\033[m
|
|
18
|
+
Definition LOC: #{"%4d" % loc}
|
|
19
|
+
Indexes defined: #{"%4d" % Indexes.size}
|
|
20
|
+
PREAMBLE
|
|
21
|
+
end
|
|
21
22
|
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
23
|
+
# Gathers information about the application.
|
|
24
|
+
#
|
|
25
|
+
def application
|
|
26
|
+
preamble
|
|
27
|
+
@application = Application.apps.map &:indented_to_s
|
|
28
|
+
end
|
|
28
29
|
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
#{"#{category.index_name}".indented_to_s}\n
|
|
35
|
-
#{"#{category.name}".indented_to_s(4)}\n
|
|
36
|
-
#{"exact\n#{Analyzer.new.analyze(category.indexed_exact).indented_to_s}".indented_to_s(6)}\n
|
|
37
|
-
#{"partial\n#{Analyzer.new.analyze(category.indexed_partial).indented_to_s}".indented_to_s(6)}
|
|
38
|
-
ANALYSIS
|
|
30
|
+
# Gathers information about the indexes.
|
|
31
|
+
#
|
|
32
|
+
def analyze object
|
|
33
|
+
object.each_category do |category|
|
|
34
|
+
@indexes << <<-ANALYSIS
|
|
35
|
+
#{"#{category.index_name}".indented_to_s}\n
|
|
36
|
+
#{"#{category.name}".indented_to_s(4)}\n
|
|
37
|
+
#{"exact\n#{Analyzer.new.analyze(category.indexed_exact).indented_to_s}".indented_to_s(6)}\n
|
|
38
|
+
#{"partial\n#{Analyzer.new.analyze(category.indexed_partial).indented_to_s}".indented_to_s(6)}
|
|
39
|
+
ANALYSIS
|
|
40
|
+
end
|
|
39
41
|
end
|
|
40
|
-
end
|
|
41
42
|
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
43
|
+
# Outputs all gathered statistics.
|
|
44
|
+
#
|
|
45
|
+
def to_s
|
|
46
|
+
<<-STATS
|
|
46
47
|
|
|
47
|
-
Picky Configuration:
|
|
48
|
+
Picky Configuration:
|
|
48
49
|
|
|
49
|
-
#{[@preamble, @application, @indexes.join("\n")].compact.join("\n")}
|
|
50
|
-
STATS
|
|
51
|
-
|
|
50
|
+
#{[@preamble, @application, @indexes.join("\n")].compact.join("\n")}
|
|
51
|
+
STATS
|
|
52
|
+
end
|
|
52
53
|
|
|
53
|
-
|
|
54
|
-
|
|
54
|
+
# Internal methods.
|
|
55
|
+
#
|
|
56
|
+
|
|
57
|
+
def lines_of_code text
|
|
58
|
+
text.scan(/^\s*[^#\s].*$/).size
|
|
59
|
+
end
|
|
55
60
|
|
|
56
|
-
def lines_of_code text
|
|
57
|
-
text.scan(/^\s*[^#\s].*$/).size
|
|
58
61
|
end
|
|
59
62
|
|
|
60
63
|
end
|
data/lib/picky/tasks.rb
ADDED
|
@@ -1,16 +1,18 @@
|
|
|
1
|
-
module
|
|
1
|
+
module Picky
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
#
|
|
5
|
-
class Base
|
|
3
|
+
module Tokenizers # :nodoc:all
|
|
6
4
|
|
|
7
|
-
#
|
|
5
|
+
# Defines tokenizing processes used both in indexing and querying.
|
|
8
6
|
#
|
|
9
|
-
|
|
7
|
+
class Base
|
|
10
8
|
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
9
|
+
# TODO Move EMPTY_STRING top level.
|
|
10
|
+
#
|
|
11
|
+
EMPTY_STRING = ''.freeze
|
|
12
|
+
|
|
13
|
+
def to_s
|
|
14
|
+
reject_condition_location = @reject_condition.to_s[/:(\d+) \(lambda\)/, 1]
|
|
15
|
+
<<-TOKENIZER
|
|
14
16
|
Removes characters: #{@removes_characters_regexp ? "/#{@removes_characters_regexp.source}/" : '-'}
|
|
15
17
|
Stopwords: #{@remove_stopwords_regexp ? "/#{@remove_stopwords_regexp.source}/" : '-'}
|
|
16
18
|
Splits text on: #{@splits_text_on.respond_to?(:source) ? "/#{@splits_text_on.source}/" : (@splits_text_on ? @splits_text_on : '-')}
|
|
@@ -19,204 +21,207 @@ Normalizes words: #{@normalizes_words_regexp_replaces ? @normalizes_wor
|
|
|
19
21
|
Rejects tokens? #{reject_condition_location ? "Yes, see line #{reject_condition_location} in app/application.rb" : '-'}
|
|
20
22
|
Substitutes chars? #{@substituter ? "Yes, using #{@substituter}." : '-' }
|
|
21
23
|
Case sensitive? #{@case_sensitive ? "Yes." : "-"}
|
|
22
|
-
|
|
23
|
-
|
|
24
|
+
TOKENIZER
|
|
25
|
+
end
|
|
24
26
|
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
27
|
+
# Stopwords.
|
|
28
|
+
#
|
|
29
|
+
# We only allow regexps (even if string would be okay
|
|
30
|
+
# too for gsub! - it's too hard to understand)
|
|
31
|
+
#
|
|
32
|
+
def stopwords regexp
|
|
33
|
+
check_argument_in __method__, Regexp, regexp
|
|
34
|
+
@remove_stopwords_regexp = regexp
|
|
35
|
+
end
|
|
36
|
+
def remove_stopwords text
|
|
37
|
+
text.gsub! @remove_stopwords_regexp, EMPTY_STRING if @remove_stopwords_regexp
|
|
38
|
+
text
|
|
39
|
+
end
|
|
40
|
+
@@non_single_stopword_regexp = /^\b[\w:]+?\b[\.\*\~]?\s?$/
|
|
41
|
+
def remove_non_single_stopwords text
|
|
42
|
+
return text if text.match @@non_single_stopword_regexp
|
|
43
|
+
remove_stopwords text
|
|
44
|
+
end
|
|
43
45
|
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
46
|
+
# Illegals.
|
|
47
|
+
#
|
|
48
|
+
# We only allow regexps (even if string would be okay
|
|
49
|
+
# too for gsub! - it's too hard to understand)
|
|
50
|
+
#
|
|
51
|
+
def removes_characters regexp
|
|
52
|
+
check_argument_in __method__, Regexp, regexp
|
|
53
|
+
@removes_characters_regexp = regexp
|
|
54
|
+
end
|
|
55
|
+
def remove_illegals text
|
|
56
|
+
text.gsub! @removes_characters_regexp, EMPTY_STRING if @removes_characters_regexp
|
|
57
|
+
text
|
|
58
|
+
end
|
|
57
59
|
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
60
|
+
# Splitting.
|
|
61
|
+
#
|
|
62
|
+
# We allow Strings and Regexps.
|
|
63
|
+
# Note: We do not test against to_str since symbols do not work with String#split.
|
|
64
|
+
#
|
|
65
|
+
def splits_text_on regexp_or_string
|
|
66
|
+
raise ArgumentError.new "#{__method__} takes a Regexp or String as argument, not a #{regexp_or_string.class}." unless Regexp === regexp_or_string || String === regexp_or_string
|
|
67
|
+
@splits_text_on = regexp_or_string
|
|
68
|
+
end
|
|
69
|
+
def split text
|
|
70
|
+
text.split @splits_text_on
|
|
71
|
+
end
|
|
70
72
|
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
73
|
+
# Normalizing.
|
|
74
|
+
#
|
|
75
|
+
# We only allow arrays.
|
|
76
|
+
#
|
|
77
|
+
def normalizes_words regexp_replaces
|
|
78
|
+
raise ArgumentError.new "#{__method__} takes an Array of replaces as argument, not a #{regexp_replaces.class}." unless regexp_replaces.respond_to?(:to_ary)
|
|
79
|
+
@normalizes_words_regexp_replaces = regexp_replaces
|
|
80
|
+
end
|
|
81
|
+
def normalize_with_patterns text
|
|
82
|
+
return text unless @normalizes_words_regexp_replaces
|
|
81
83
|
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
84
|
+
@normalizes_words_regexp_replaces.each do |regex, replace|
|
|
85
|
+
# This should be sufficient
|
|
86
|
+
#
|
|
87
|
+
text.gsub!(regex, replace) and break
|
|
88
|
+
end
|
|
89
|
+
|
|
90
|
+
remove_after_normalizing_illegals text
|
|
91
|
+
text
|
|
86
92
|
end
|
|
87
|
-
remove_after_normalizing_illegals text
|
|
88
|
-
text
|
|
89
|
-
end
|
|
90
93
|
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
94
|
+
# Illegal after normalizing.
|
|
95
|
+
#
|
|
96
|
+
# We only allow regexps (even if string would be okay
|
|
97
|
+
# too for gsub! - it's too hard to understand)
|
|
98
|
+
#
|
|
99
|
+
def removes_characters_after_splitting regexp
|
|
100
|
+
check_argument_in __method__, Regexp, regexp
|
|
101
|
+
@removes_characters_after_splitting_regexp = regexp
|
|
102
|
+
end
|
|
103
|
+
def remove_after_normalizing_illegals text
|
|
104
|
+
text.gsub! @removes_characters_after_splitting_regexp, EMPTY_STRING if @removes_characters_after_splitting_regexp
|
|
105
|
+
end
|
|
103
106
|
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
107
|
+
# Substitute Characters with this substituter.
|
|
108
|
+
#
|
|
109
|
+
# Default is European Character substitution.
|
|
110
|
+
#
|
|
111
|
+
def substitutes_characters_with substituter = CharacterSubstituters::WestEuropean.new
|
|
112
|
+
raise ArgumentError.new "The substitutes_characters_with option needs a character substituter, which responds to #substitute." unless substituter.respond_to?(:substitute)
|
|
113
|
+
@substituter = substituter
|
|
114
|
+
end
|
|
115
|
+
def substitute_characters text
|
|
116
|
+
substituter?? substituter.substitute(text) : text
|
|
117
|
+
end
|
|
115
118
|
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
119
|
+
# Reject tokens after tokenizing based on the given criteria.
|
|
120
|
+
#
|
|
121
|
+
# Note: Currently only for indexing.
|
|
122
|
+
#
|
|
123
|
+
def reject_token_if &condition
|
|
124
|
+
@reject_condition = condition
|
|
125
|
+
end
|
|
126
|
+
def reject tokens
|
|
127
|
+
tokens.reject! &@reject_condition
|
|
128
|
+
end
|
|
126
129
|
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
130
|
+
def case_sensitive case_sensitive
|
|
131
|
+
@case_sensitive = case_sensitive
|
|
132
|
+
end
|
|
133
|
+
def downcase?
|
|
134
|
+
!@case_sensitive
|
|
135
|
+
end
|
|
133
136
|
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
137
|
+
# Checks if the right argument type has been given.
|
|
138
|
+
#
|
|
139
|
+
def check_argument_in method, type, argument, &condition
|
|
140
|
+
raise ArgumentError.new "Application##{method} takes a #{type} as argument, not a #{argument.class}." unless type === argument
|
|
141
|
+
end
|
|
139
142
|
|
|
140
143
|
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
144
|
+
# Returns a number of tokens, generated from the given text.
|
|
145
|
+
#
|
|
146
|
+
# Note:
|
|
147
|
+
# * preprocess, pretokenize are hooks
|
|
148
|
+
#
|
|
149
|
+
def tokenize text
|
|
150
|
+
text = preprocess text # processing the text
|
|
151
|
+
return empty_tokens if text.blank?
|
|
152
|
+
words = pretokenize text # splitting and preparations for tokenizing
|
|
153
|
+
return empty_tokens if words.empty?
|
|
154
|
+
tokens = tokens_for words # creating tokens / strings
|
|
155
|
+
process tokens # processing tokens / strings
|
|
156
|
+
end
|
|
157
|
+
|
|
158
|
+
attr_reader :substituter
|
|
159
|
+
alias substituter? substituter
|
|
154
160
|
|
|
155
|
-
|
|
156
|
-
|
|
161
|
+
def initialize options = {}
|
|
162
|
+
removes_characters options[:removes_characters] if options[:removes_characters]
|
|
163
|
+
contracts_expressions *options[:contracts_expressions] if options[:contracts_expressions]
|
|
164
|
+
stopwords options[:stopwords] if options[:stopwords]
|
|
165
|
+
normalizes_words options[:normalizes_words] if options[:normalizes_words]
|
|
166
|
+
removes_characters_after_splitting options[:removes_characters_after_splitting] if options[:removes_characters_after_splitting]
|
|
167
|
+
substitutes_characters_with options[:substitutes_characters_with] if options[:substitutes_characters_with]
|
|
168
|
+
case_sensitive options[:case_sensitive] unless options[:case_sensitive].nil?
|
|
157
169
|
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
removes_characters_after_splitting options[:removes_characters_after_splitting] if options[:removes_characters_after_splitting]
|
|
164
|
-
substitutes_characters_with options[:substitutes_characters_with] if options[:substitutes_characters_with]
|
|
165
|
-
case_sensitive options[:case_sensitive] unless options[:case_sensitive].nil?
|
|
170
|
+
# Defaults.
|
|
171
|
+
#
|
|
172
|
+
splits_text_on options[:splits_text_on] || /\s/
|
|
173
|
+
reject_token_if &(options[:reject_token_if] || options[:rejects_token_if] || :blank?) # TODO Decide on using an s or not.
|
|
174
|
+
end
|
|
166
175
|
|
|
167
|
-
#
|
|
176
|
+
# Default preprocessing hook.
|
|
168
177
|
#
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
178
|
+
# Does:
|
|
179
|
+
# 1. Character substitution.
|
|
180
|
+
# 2. Remove illegal expressions.
|
|
181
|
+
# 3. Remove non-single stopwords. (Stopwords that occur with other words)
|
|
182
|
+
#
|
|
183
|
+
def preprocess text
|
|
184
|
+
text = substitute_characters text
|
|
185
|
+
remove_illegals text
|
|
186
|
+
# We do not remove single stopwords e.g. in the indexer for
|
|
187
|
+
# an entirely different reason than in the query tokenizer.
|
|
188
|
+
# An indexed thing with just name "UND" (a possible stopword)
|
|
189
|
+
# should not lose its name.
|
|
190
|
+
#
|
|
191
|
+
remove_non_single_stopwords text
|
|
192
|
+
text
|
|
193
|
+
end
|
|
194
|
+
# Pretokenizing.
|
|
195
|
+
#
|
|
196
|
+
# Does:
|
|
197
|
+
# 1. Split the text into words.
|
|
198
|
+
# 2. Normalize each word.
|
|
199
|
+
#
|
|
200
|
+
def pretokenize text
|
|
201
|
+
words = split text
|
|
202
|
+
words.collect! do |word|
|
|
203
|
+
normalize_with_patterns word
|
|
204
|
+
word
|
|
205
|
+
end
|
|
206
|
+
end
|
|
207
|
+
# Basic postprocessing (overridden in both query/index tokenizers).
|
|
208
|
+
#
|
|
209
|
+
def process tokens
|
|
210
|
+
reject tokens # Reject any tokens that don't meet criteria
|
|
211
|
+
tokens
|
|
212
|
+
end
|
|
172
213
|
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
remove_illegals text
|
|
183
|
-
# We do not remove single stopwords e.g. in the indexer for
|
|
184
|
-
# an entirely different reason than in the query tokenizer.
|
|
185
|
-
# An indexed thing with just name "UND" (a possible stopword)
|
|
186
|
-
# should not lose its name.
|
|
187
|
-
#
|
|
188
|
-
remove_non_single_stopwords text
|
|
189
|
-
text
|
|
190
|
-
end
|
|
191
|
-
# Pretokenizing.
|
|
192
|
-
#
|
|
193
|
-
# Does:
|
|
194
|
-
# 1. Split the text into words.
|
|
195
|
-
# 2. Normalize each word.
|
|
196
|
-
#
|
|
197
|
-
def pretokenize text
|
|
198
|
-
words = split text
|
|
199
|
-
words.collect! do |word|
|
|
200
|
-
normalize_with_patterns word
|
|
201
|
-
word
|
|
214
|
+
# # Converts words into real tokens.
|
|
215
|
+
# #
|
|
216
|
+
# def tokens_for words
|
|
217
|
+
# Query::Tokens.new words.collect! { |word| token_for word }
|
|
218
|
+
# end
|
|
219
|
+
# Turns non-blank text into symbols.
|
|
220
|
+
#
|
|
221
|
+
def symbolize text
|
|
222
|
+
text.blank? ? nil : text.to_sym
|
|
202
223
|
end
|
|
203
|
-
end
|
|
204
|
-
# Basic postprocessing (overridden in both query/index tokenizers).
|
|
205
|
-
#
|
|
206
|
-
def process tokens
|
|
207
|
-
reject tokens # Reject any tokens that don't meet criteria
|
|
208
|
-
tokens
|
|
209
|
-
end
|
|
210
224
|
|
|
211
|
-
# # Converts words into real tokens.
|
|
212
|
-
# #
|
|
213
|
-
# def tokens_for words
|
|
214
|
-
# Query::Tokens.new words.collect! { |word| token_for word }
|
|
215
|
-
# end
|
|
216
|
-
# Turns non-blank text into symbols.
|
|
217
|
-
#
|
|
218
|
-
def symbolize text
|
|
219
|
-
text.blank? ? nil : text.to_sym
|
|
220
225
|
end
|
|
221
226
|
|
|
222
227
|
end
|
|
@@ -1,28 +1,32 @@
|
|
|
1
|
-
module
|
|
1
|
+
module Picky
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
#
|
|
5
|
-
# Override in indexing subclasses and define in configuration.
|
|
6
|
-
#
|
|
7
|
-
class Index < Base
|
|
3
|
+
module Tokenizers
|
|
8
4
|
|
|
9
|
-
|
|
10
|
-
@default = new_default
|
|
11
|
-
end
|
|
12
|
-
def self.default
|
|
13
|
-
@default ||= new
|
|
14
|
-
end
|
|
15
|
-
|
|
16
|
-
# Does not actually return a token, but a
|
|
17
|
-
# symbol "token".
|
|
5
|
+
# The base indexing tokenizer.
|
|
18
6
|
#
|
|
19
|
-
|
|
20
|
-
words.collect! { |word| word.downcase! if downcase?; word.to_sym }
|
|
21
|
-
end
|
|
22
|
-
# Returns empty tokens.
|
|
7
|
+
# Override in indexing subclasses and define in configuration.
|
|
23
8
|
#
|
|
24
|
-
|
|
25
|
-
|
|
9
|
+
class Index < Base
|
|
10
|
+
|
|
11
|
+
def self.default= new_default
|
|
12
|
+
@default = new_default
|
|
13
|
+
end
|
|
14
|
+
def self.default
|
|
15
|
+
@default ||= new
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
# Does not actually return a token, but a
|
|
19
|
+
# symbol "token".
|
|
20
|
+
#
|
|
21
|
+
def tokens_for words
|
|
22
|
+
words.collect! { |word| word.downcase! if downcase?; word.to_sym }
|
|
23
|
+
end
|
|
24
|
+
# Returns empty tokens.
|
|
25
|
+
#
|
|
26
|
+
def empty_tokens
|
|
27
|
+
[]
|
|
28
|
+
end
|
|
29
|
+
|
|
26
30
|
end
|
|
27
31
|
|
|
28
32
|
end
|