picky 2.7.0 → 3.0.0.pre1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (213) hide show
  1. data/lib/picky/adapters/rack/base.rb +20 -16
  2. data/lib/picky/adapters/rack/live_parameters.rb +28 -24
  3. data/lib/picky/adapters/rack/search.rb +67 -0
  4. data/lib/picky/adapters/rack.rb +27 -23
  5. data/lib/picky/application.rb +246 -236
  6. data/lib/picky/backend/base.rb +115 -119
  7. data/lib/picky/backend/file/basic.rb +102 -98
  8. data/lib/picky/backend/file/json.rb +27 -23
  9. data/lib/picky/backend/file/marshal.rb +32 -28
  10. data/lib/picky/backend/file/text.rb +45 -41
  11. data/lib/picky/backend/files.rb +19 -15
  12. data/lib/picky/backend/redis/basic.rb +76 -72
  13. data/lib/picky/backend/redis/list_hash.rb +40 -36
  14. data/lib/picky/backend/redis/string_hash.rb +30 -26
  15. data/lib/picky/backend/redis.rb +32 -28
  16. data/lib/picky/bundle.rb +82 -57
  17. data/lib/{bundling.rb → picky/bundling.rb} +0 -0
  18. data/lib/picky/calculations/location.rb +51 -47
  19. data/lib/picky/categories.rb +60 -56
  20. data/lib/picky/categories_indexed.rb +73 -82
  21. data/lib/picky/categories_indexing.rb +12 -8
  22. data/lib/picky/category.rb +109 -120
  23. data/lib/picky/category_indexed.rb +39 -41
  24. data/lib/picky/category_indexing.rb +123 -125
  25. data/lib/picky/character_substituters/west_european.rb +32 -26
  26. data/lib/{constants.rb → picky/constants.rb} +0 -0
  27. data/lib/picky/cores.rb +96 -92
  28. data/lib/{deployment.rb → picky/deployment.rb} +0 -0
  29. data/lib/picky/frontend_adapters/rack.rb +133 -118
  30. data/lib/picky/generators/aliases.rb +5 -3
  31. data/lib/picky/generators/base.rb +11 -7
  32. data/lib/picky/generators/partial/default.rb +7 -3
  33. data/lib/picky/generators/partial/none.rb +24 -20
  34. data/lib/picky/generators/partial/strategy.rb +20 -16
  35. data/lib/picky/generators/partial/substring.rb +94 -90
  36. data/lib/picky/generators/partial_generator.rb +11 -7
  37. data/lib/picky/generators/similarity/default.rb +9 -5
  38. data/lib/picky/generators/similarity/double_metaphone.rb +20 -16
  39. data/lib/picky/generators/similarity/metaphone.rb +20 -16
  40. data/lib/picky/generators/similarity/none.rb +23 -19
  41. data/lib/picky/generators/similarity/phonetic.rb +49 -45
  42. data/lib/picky/generators/similarity/soundex.rb +20 -16
  43. data/lib/picky/generators/similarity/strategy.rb +10 -6
  44. data/lib/picky/generators/similarity_generator.rb +11 -7
  45. data/lib/picky/generators/strategy.rb +14 -10
  46. data/lib/picky/generators/weights/default.rb +9 -5
  47. data/lib/picky/generators/weights/logarithmic.rb +30 -26
  48. data/lib/picky/generators/weights/strategy.rb +10 -6
  49. data/lib/picky/generators/weights_generator.rb +11 -7
  50. data/lib/picky/helpers/measuring.rb +20 -16
  51. data/lib/picky/indexed/bundle/base.rb +39 -37
  52. data/lib/picky/indexed/bundle/memory.rb +68 -64
  53. data/lib/picky/indexed/bundle/redis.rb +73 -69
  54. data/lib/picky/indexed/wrappers/bundle/calculation.rb +26 -22
  55. data/lib/picky/indexed/wrappers/bundle/location.rb +30 -26
  56. data/lib/picky/indexed/wrappers/bundle/wrapper.rb +36 -32
  57. data/lib/picky/indexed/wrappers/category/location.rb +17 -13
  58. data/lib/picky/indexed/wrappers/exact_first.rb +46 -42
  59. data/lib/picky/indexers/base.rb +26 -22
  60. data/lib/picky/indexers/parallel.rb +62 -58
  61. data/lib/picky/indexers/serial.rb +41 -37
  62. data/lib/picky/indexes/index.rb +400 -0
  63. data/lib/picky/indexes/index_indexed.rb +24 -0
  64. data/lib/picky/indexes/index_indexing.rb +138 -0
  65. data/lib/picky/indexes/memory.rb +20 -0
  66. data/lib/picky/indexes/redis.rb +20 -0
  67. data/lib/picky/indexes.rb +68 -61
  68. data/lib/picky/indexes_indexed.rb +16 -12
  69. data/lib/picky/indexes_indexing.rb +41 -37
  70. data/lib/picky/indexing/bundle/base.rb +216 -205
  71. data/lib/picky/indexing/bundle/memory.rb +16 -11
  72. data/lib/picky/indexing/bundle/redis.rb +14 -12
  73. data/lib/picky/indexing/wrappers/category/location.rb +17 -13
  74. data/lib/picky/interfaces/live_parameters.rb +159 -154
  75. data/lib/picky/loader.rb +267 -304
  76. data/lib/picky/loggers/search.rb +20 -13
  77. data/lib/picky/no_source_specified_exception.rb +7 -3
  78. data/lib/picky/performant.rb +6 -2
  79. data/lib/picky/query/allocation.rb +71 -67
  80. data/lib/picky/query/allocations.rb +99 -94
  81. data/lib/picky/query/combination.rb +70 -66
  82. data/lib/picky/query/combinations/base.rb +56 -52
  83. data/lib/picky/query/combinations/memory.rb +36 -32
  84. data/lib/picky/query/combinations/redis.rb +66 -62
  85. data/lib/picky/query/indexes.rb +175 -160
  86. data/lib/picky/query/qualifier_category_mapper.rb +43 -0
  87. data/lib/picky/query/token.rb +165 -172
  88. data/lib/picky/query/tokens.rb +86 -82
  89. data/lib/picky/query/weights.rb +44 -48
  90. data/lib/picky/query.rb +5 -1
  91. data/lib/picky/rack/harakiri.rb +51 -47
  92. data/lib/picky/results.rb +81 -77
  93. data/lib/picky/search.rb +169 -158
  94. data/lib/picky/sinatra.rb +34 -0
  95. data/lib/picky/sources/base.rb +73 -70
  96. data/lib/picky/sources/couch.rb +61 -57
  97. data/lib/picky/sources/csv.rb +68 -64
  98. data/lib/picky/sources/db.rb +139 -135
  99. data/lib/picky/sources/delicious.rb +52 -48
  100. data/lib/picky/sources/mongo.rb +68 -63
  101. data/lib/picky/sources/wrappers/base.rb +20 -16
  102. data/lib/picky/sources/wrappers/location.rb +37 -33
  103. data/lib/picky/statistics.rb +46 -43
  104. data/lib/picky/tasks.rb +3 -0
  105. data/lib/picky/tokenizers/base.rb +192 -187
  106. data/lib/picky/tokenizers/index.rb +25 -21
  107. data/lib/picky/tokenizers/location.rb +33 -29
  108. data/lib/picky/tokenizers/query.rb +49 -43
  109. data/lib/picky.rb +21 -13
  110. data/lib/tasks/application.rake +1 -1
  111. data/lib/tasks/index.rake +3 -3
  112. data/lib/tasks/routes.rake +1 -1
  113. data/lib/tasks/server.rake +1 -1
  114. data/spec/lib/adapters/rack/base_spec.rb +1 -1
  115. data/spec/lib/adapters/rack/live_parameters_spec.rb +1 -1
  116. data/spec/lib/adapters/rack/query_spec.rb +1 -1
  117. data/spec/lib/application_spec.rb +39 -32
  118. data/spec/lib/backend/file/basic_spec.rb +2 -2
  119. data/spec/lib/backend/file/json_spec.rb +2 -2
  120. data/spec/lib/backend/file/marshal_spec.rb +2 -2
  121. data/spec/lib/backend/file/text_spec.rb +1 -1
  122. data/spec/lib/backend/files_spec.rb +14 -24
  123. data/spec/lib/backend/redis/basic_spec.rb +2 -2
  124. data/spec/lib/backend/redis/list_hash_spec.rb +3 -3
  125. data/spec/lib/backend/redis/string_hash_spec.rb +3 -3
  126. data/spec/lib/backend/redis_spec.rb +20 -13
  127. data/spec/lib/calculations/location_spec.rb +1 -1
  128. data/spec/lib/categories_indexed_spec.rb +16 -34
  129. data/spec/lib/category_indexed_spec.rb +9 -27
  130. data/spec/lib/category_indexing_spec.rb +2 -3
  131. data/spec/lib/category_spec.rb +10 -10
  132. data/spec/lib/character_substituters/west_european_spec.rb +6 -5
  133. data/spec/lib/cores_spec.rb +17 -17
  134. data/spec/lib/extensions/symbol_spec.rb +15 -1
  135. data/spec/lib/frontend_adapters/rack_spec.rb +20 -20
  136. data/spec/lib/generators/aliases_spec.rb +3 -3
  137. data/spec/lib/generators/cacher_strategy_spec.rb +1 -1
  138. data/spec/lib/generators/partial/default_spec.rb +3 -3
  139. data/spec/lib/generators/partial/none_spec.rb +2 -2
  140. data/spec/lib/generators/partial/substring_spec.rb +1 -1
  141. data/spec/lib/generators/partial_generator_spec.rb +3 -3
  142. data/spec/lib/generators/similarity/double_metaphone_spec.rb +1 -1
  143. data/spec/lib/generators/similarity/metaphone_spec.rb +1 -1
  144. data/spec/lib/generators/similarity/none_spec.rb +1 -1
  145. data/spec/lib/generators/similarity/phonetic_spec.rb +1 -1
  146. data/spec/lib/generators/similarity/soundex_spec.rb +1 -1
  147. data/spec/lib/generators/similarity_generator_spec.rb +2 -2
  148. data/spec/lib/generators/weights/logarithmic_spec.rb +1 -1
  149. data/spec/lib/generators/weights_generator_spec.rb +1 -1
  150. data/spec/lib/helpers/measuring_spec.rb +2 -2
  151. data/spec/lib/indexed/bundle/memory_spec.rb +6 -6
  152. data/spec/lib/indexed/bundle/redis_spec.rb +4 -4
  153. data/spec/lib/indexed/wrappers/bundle/calculation_spec.rb +2 -3
  154. data/spec/lib/indexed/wrappers/bundle/wrapper_spec.rb +2 -2
  155. data/spec/lib/indexed/wrappers/exact_first_spec.rb +5 -5
  156. data/spec/lib/indexers/base_spec.rb +1 -1
  157. data/spec/lib/indexers/parallel_spec.rb +1 -1
  158. data/spec/lib/indexers/serial_spec.rb +1 -1
  159. data/spec/lib/{index/base_indexed_spec.rb → indexes/index_indexed_spec.rb} +3 -3
  160. data/spec/lib/{index/base_indexing_spec.rb → indexes/index_indexing_spec.rb} +19 -2
  161. data/spec/lib/{index/base_spec.rb → indexes/index_spec.rb} +6 -25
  162. data/spec/lib/{index → indexes}/redis_spec.rb +1 -1
  163. data/spec/lib/indexes_class_spec.rb +2 -2
  164. data/spec/lib/indexes_indexed_spec.rb +1 -1
  165. data/spec/lib/indexes_indexing_spec.rb +1 -1
  166. data/spec/lib/indexes_spec.rb +1 -1
  167. data/spec/lib/indexing/bundle/base_spec.rb +7 -5
  168. data/spec/lib/indexing/bundle/memory_partial_generation_speed_spec.rb +4 -4
  169. data/spec/lib/indexing/bundle/memory_spec.rb +15 -15
  170. data/spec/lib/indexing/bundle/redis_spec.rb +9 -9
  171. data/spec/lib/interfaces/live_parameters_spec.rb +5 -5
  172. data/spec/lib/loader_spec.rb +17 -19
  173. data/spec/lib/loggers/search_spec.rb +2 -2
  174. data/spec/lib/query/allocation_spec.rb +1 -1
  175. data/spec/lib/query/allocations_spec.rb +1 -1
  176. data/spec/lib/query/combination_spec.rb +4 -4
  177. data/spec/lib/query/combinations/base_spec.rb +1 -1
  178. data/spec/lib/query/combinations/memory_spec.rb +1 -1
  179. data/spec/lib/query/combinations/redis_spec.rb +1 -1
  180. data/spec/lib/query/indexes_spec.rb +7 -2
  181. data/spec/lib/query/qualifier_category_mapper_spec.rb +34 -0
  182. data/spec/lib/query/token_spec.rb +32 -53
  183. data/spec/lib/query/tokens_spec.rb +30 -35
  184. data/spec/lib/query/weights_spec.rb +16 -16
  185. data/spec/lib/rack/harakiri_spec.rb +5 -5
  186. data/spec/lib/results_spec.rb +1 -1
  187. data/spec/lib/search_spec.rb +24 -22
  188. data/spec/lib/sinatra_spec.rb +36 -0
  189. data/spec/lib/sources/base_spec.rb +1 -1
  190. data/spec/lib/sources/couch_spec.rb +9 -9
  191. data/spec/lib/sources/csv_spec.rb +7 -7
  192. data/spec/lib/sources/db_spec.rb +2 -2
  193. data/spec/lib/sources/delicious_spec.rb +5 -5
  194. data/spec/lib/sources/mongo_spec.rb +7 -7
  195. data/spec/lib/sources/wrappers/base_spec.rb +2 -2
  196. data/spec/lib/sources/wrappers/location_spec.rb +1 -1
  197. data/spec/lib/statistics_spec.rb +1 -1
  198. data/spec/lib/tokenizers/base_spec.rb +2 -2
  199. data/spec/lib/tokenizers/index_spec.rb +1 -1
  200. data/spec/lib/tokenizers/query_spec.rb +1 -1
  201. metadata +30 -30
  202. data/lib/picky/adapters/rack/query.rb +0 -65
  203. data/lib/picky/index/base.rb +0 -409
  204. data/lib/picky/index/base_indexed.rb +0 -29
  205. data/lib/picky/index/base_indexing.rb +0 -127
  206. data/lib/picky/index/memory.rb +0 -16
  207. data/lib/picky/index/redis.rb +0 -16
  208. data/lib/picky/query/qualifiers.rb +0 -76
  209. data/lib/picky/query/solr.rb +0 -60
  210. data/lib/picky/signals.rb +0 -8
  211. data/lib/picky-tasks.rb +0 -6
  212. data/lib/tasks/spec.rake +0 -11
  213. data/spec/lib/query/qualifiers_spec.rb +0 -31
@@ -1,48 +1,52 @@
1
- module Sources
1
+ module Picky
2
2
 
3
- module Wrappers
3
+ module Sources
4
4
 
5
- # Should this actually just be a tokenizer?
6
- #
7
- class Location < Base
5
+ module Wrappers
8
6
 
9
- attr_reader :calculation
10
-
11
- def initialize source, grid, precision = 1
12
- super source
13
- @calculation = Calculations::Location.new grid, precision
14
- end
15
-
16
- # Yield the data (id, text for id) for the given category.
7
+ # Should this actually just be a tokenizer?
17
8
  #
18
- def harvest category
19
- minimum = 1.0/0
9
+ class Location < Base
20
10
 
21
- # Cache. TODO Make option?
22
- #
23
- locations = []
11
+ attr_reader :calculation
24
12
 
25
- # Gather min/max.
26
- #
27
- source.harvest category do |indexed_id, location|
28
- location = location.to_f
29
- minimum = location if location < minimum
30
- locations << [indexed_id, location]
13
+ def initialize source, grid, precision = 1
14
+ super source
15
+ @calculation = Calculations::Location.new grid, precision
31
16
  end
32
17
 
33
- calculation.minimum = minimum
34
-
35
- # Recalculate locations.
18
+ # Yield the data (id, text for id) for the given category.
36
19
  #
37
- locations.each do |indexed_id, location|
38
- calculation.recalculated_range(location).each do |new_location|
39
- yield indexed_id, new_location.to_s
20
+ def harvest category
21
+ minimum = 1.0/0
22
+
23
+ # Cache.
24
+ #
25
+ locations = []
26
+
27
+ # Gather min/max.
28
+ #
29
+ source.harvest category do |indexed_id, location|
30
+ location = location.to_f
31
+ minimum = location if location < minimum
32
+ locations << [indexed_id, location]
40
33
  end
34
+
35
+ calculation.minimum = minimum
36
+
37
+ # Recalculate locations.
38
+ #
39
+ locations.each do |indexed_id, location|
40
+ calculation.recalculated_range(location).each do |new_location|
41
+ yield indexed_id, new_location.to_s
42
+ end
43
+ end
44
+
45
+ # TODO Move to the right place.
46
+ #
47
+ category.indexing_exact[:location_minimum] = minimum
41
48
  end
42
49
 
43
- # TODO Move to the right place.
44
- #
45
- category.indexing_exact[:location_minimum] = minimum
46
50
  end
47
51
 
48
52
  end
@@ -1,60 +1,63 @@
1
1
  # encoding: utf-8
2
2
  #
3
+ module Picky
3
4
 
4
- # Gathers various statistics.
5
- #
6
- class Statistics # :nodoc:all
5
+ # Gathers various statistics.
6
+ #
7
+ class Statistics # :nodoc:all
7
8
 
8
- def initialize
9
- @indexes = ["\033[1mIndexes analysis\033[m:"]
10
- end
9
+ def initialize
10
+ @indexes = ["\033[1mIndexes analysis\033[m:"]
11
+ end
11
12
 
12
- def preamble
13
- loc = lines_of_code File.open('app/application.rb').read
13
+ def preamble
14
+ loc = lines_of_code File.open('app/application.rb').read
14
15
 
15
- @preamble ||= <<-PREAMBLE
16
- \033[1mApplication(s)\033[m
17
- Definition LOC: #{"%4d" % loc}
18
- Indexes defined: #{"%4d" % Indexes.size}
19
- PREAMBLE
20
- end
16
+ @preamble ||= <<-PREAMBLE
17
+ \033[1mApplication(s)\033[m
18
+ Definition LOC: #{"%4d" % loc}
19
+ Indexes defined: #{"%4d" % Indexes.size}
20
+ PREAMBLE
21
+ end
21
22
 
22
- # Gathers information about the application.
23
- #
24
- def application
25
- preamble
26
- @application = Application.apps.map &:indented_to_s
27
- end
23
+ # Gathers information about the application.
24
+ #
25
+ def application
26
+ preamble
27
+ @application = Application.apps.map &:indented_to_s
28
+ end
28
29
 
29
- # Gathers information about the indexes.
30
- #
31
- def analyze object
32
- object.each_category do |category|
33
- @indexes << <<-ANALYSIS
34
- #{"#{category.index_name}".indented_to_s}\n
35
- #{"#{category.name}".indented_to_s(4)}\n
36
- #{"exact\n#{Analyzer.new.analyze(category.indexed_exact).indented_to_s}".indented_to_s(6)}\n
37
- #{"partial\n#{Analyzer.new.analyze(category.indexed_partial).indented_to_s}".indented_to_s(6)}
38
- ANALYSIS
30
+ # Gathers information about the indexes.
31
+ #
32
+ def analyze object
33
+ object.each_category do |category|
34
+ @indexes << <<-ANALYSIS
35
+ #{"#{category.index_name}".indented_to_s}\n
36
+ #{"#{category.name}".indented_to_s(4)}\n
37
+ #{"exact\n#{Analyzer.new.analyze(category.indexed_exact).indented_to_s}".indented_to_s(6)}\n
38
+ #{"partial\n#{Analyzer.new.analyze(category.indexed_partial).indented_to_s}".indented_to_s(6)}
39
+ ANALYSIS
40
+ end
39
41
  end
40
- end
41
42
 
42
- # Outputs all gathered statistics.
43
- #
44
- def to_s
45
- <<-STATS
43
+ # Outputs all gathered statistics.
44
+ #
45
+ def to_s
46
+ <<-STATS
46
47
 
47
- Picky Configuration:
48
+ Picky Configuration:
48
49
 
49
- #{[@preamble, @application, @indexes.join("\n")].compact.join("\n")}
50
- STATS
51
- end
50
+ #{[@preamble, @application, @indexes.join("\n")].compact.join("\n")}
51
+ STATS
52
+ end
52
53
 
53
- # Internal methods.
54
- #
54
+ # Internal methods.
55
+ #
56
+
57
+ def lines_of_code text
58
+ text.scan(/^\s*[^#\s].*$/).size
59
+ end
55
60
 
56
- def lines_of_code text
57
- text.scan(/^\s*[^#\s].*$/).size
58
61
  end
59
62
 
60
63
  end
@@ -0,0 +1,3 @@
1
+ all_rake_files = File.expand_path '../../tasks/*.rake', __FILE__
2
+
3
+ Dir[all_rake_files].each { |rakefile| load rakefile }
@@ -1,16 +1,18 @@
1
- module Tokenizers # :nodoc:all
1
+ module Picky
2
2
 
3
- # Defines tokenizing processes used both in indexing and querying.
4
- #
5
- class Base
3
+ module Tokenizers # :nodoc:all
6
4
 
7
- # TODO Move EMPTY_STRING top level.
5
+ # Defines tokenizing processes used both in indexing and querying.
8
6
  #
9
- EMPTY_STRING = ''.freeze
7
+ class Base
10
8
 
11
- def to_s
12
- reject_condition_location = @reject_condition.to_s[/:(\d+) \(lambda\)/, 1]
13
- <<-TOKENIZER
9
+ # TODO Move EMPTY_STRING top level.
10
+ #
11
+ EMPTY_STRING = ''.freeze
12
+
13
+ def to_s
14
+ reject_condition_location = @reject_condition.to_s[/:(\d+) \(lambda\)/, 1]
15
+ <<-TOKENIZER
14
16
  Removes characters: #{@removes_characters_regexp ? "/#{@removes_characters_regexp.source}/" : '-'}
15
17
  Stopwords: #{@remove_stopwords_regexp ? "/#{@remove_stopwords_regexp.source}/" : '-'}
16
18
  Splits text on: #{@splits_text_on.respond_to?(:source) ? "/#{@splits_text_on.source}/" : (@splits_text_on ? @splits_text_on : '-')}
@@ -19,204 +21,207 @@ Normalizes words: #{@normalizes_words_regexp_replaces ? @normalizes_wor
19
21
  Rejects tokens? #{reject_condition_location ? "Yes, see line #{reject_condition_location} in app/application.rb" : '-'}
20
22
  Substitutes chars? #{@substituter ? "Yes, using #{@substituter}." : '-' }
21
23
  Case sensitive? #{@case_sensitive ? "Yes." : "-"}
22
- TOKENIZER
23
- end
24
+ TOKENIZER
25
+ end
24
26
 
25
- # Stopwords.
26
- #
27
- # We only allow regexps (even if string would be okay
28
- # too for gsub! - it's too hard to understand)
29
- #
30
- def stopwords regexp
31
- check_argument_in __method__, Regexp, regexp
32
- @remove_stopwords_regexp = regexp
33
- end
34
- def remove_stopwords text
35
- text.gsub! @remove_stopwords_regexp, EMPTY_STRING if @remove_stopwords_regexp
36
- text
37
- end
38
- @@non_single_stopword_regexp = /^\b[\w:]+?\b[\.\*\~]?\s?$/
39
- def remove_non_single_stopwords text
40
- return text if text.match @@non_single_stopword_regexp
41
- remove_stopwords text
42
- end
27
+ # Stopwords.
28
+ #
29
+ # We only allow regexps (even if string would be okay
30
+ # too for gsub! - it's too hard to understand)
31
+ #
32
+ def stopwords regexp
33
+ check_argument_in __method__, Regexp, regexp
34
+ @remove_stopwords_regexp = regexp
35
+ end
36
+ def remove_stopwords text
37
+ text.gsub! @remove_stopwords_regexp, EMPTY_STRING if @remove_stopwords_regexp
38
+ text
39
+ end
40
+ @@non_single_stopword_regexp = /^\b[\w:]+?\b[\.\*\~]?\s?$/
41
+ def remove_non_single_stopwords text
42
+ return text if text.match @@non_single_stopword_regexp
43
+ remove_stopwords text
44
+ end
43
45
 
44
- # Illegals.
45
- #
46
- # We only allow regexps (even if string would be okay
47
- # too for gsub! - it's too hard to understand)
48
- #
49
- def removes_characters regexp
50
- check_argument_in __method__, Regexp, regexp
51
- @removes_characters_regexp = regexp
52
- end
53
- def remove_illegals text
54
- text.gsub! @removes_characters_regexp, EMPTY_STRING if @removes_characters_regexp
55
- text
56
- end
46
+ # Illegals.
47
+ #
48
+ # We only allow regexps (even if string would be okay
49
+ # too for gsub! - it's too hard to understand)
50
+ #
51
+ def removes_characters regexp
52
+ check_argument_in __method__, Regexp, regexp
53
+ @removes_characters_regexp = regexp
54
+ end
55
+ def remove_illegals text
56
+ text.gsub! @removes_characters_regexp, EMPTY_STRING if @removes_characters_regexp
57
+ text
58
+ end
57
59
 
58
- # Splitting.
59
- #
60
- # We allow Strings and Regexps.
61
- # Note: We do not test against to_str since symbols do not work with String#split.
62
- #
63
- def splits_text_on regexp_or_string
64
- raise ArgumentError.new "#{__method__} takes a Regexp or String as argument, not a #{regexp_or_string.class}." unless Regexp === regexp_or_string || String === regexp_or_string
65
- @splits_text_on = regexp_or_string
66
- end
67
- def split text
68
- text.split @splits_text_on
69
- end
60
+ # Splitting.
61
+ #
62
+ # We allow Strings and Regexps.
63
+ # Note: We do not test against to_str since symbols do not work with String#split.
64
+ #
65
+ def splits_text_on regexp_or_string
66
+ raise ArgumentError.new "#{__method__} takes a Regexp or String as argument, not a #{regexp_or_string.class}." unless Regexp === regexp_or_string || String === regexp_or_string
67
+ @splits_text_on = regexp_or_string
68
+ end
69
+ def split text
70
+ text.split @splits_text_on
71
+ end
70
72
 
71
- # Normalizing.
72
- #
73
- # We only allow arrays.
74
- #
75
- def normalizes_words regexp_replaces
76
- raise ArgumentError.new "#{__method__} takes an Array of replaces as argument, not a #{regexp_replaces.class}." unless regexp_replaces.respond_to?(:to_ary)
77
- @normalizes_words_regexp_replaces = regexp_replaces
78
- end
79
- def normalize_with_patterns text
80
- return text unless @normalizes_words_regexp_replaces
73
+ # Normalizing.
74
+ #
75
+ # We only allow arrays.
76
+ #
77
+ def normalizes_words regexp_replaces
78
+ raise ArgumentError.new "#{__method__} takes an Array of replaces as argument, not a #{regexp_replaces.class}." unless regexp_replaces.respond_to?(:to_ary)
79
+ @normalizes_words_regexp_replaces = regexp_replaces
80
+ end
81
+ def normalize_with_patterns text
82
+ return text unless @normalizes_words_regexp_replaces
81
83
 
82
- @normalizes_words_regexp_replaces.each do |regex, replace|
83
- # This should be sufficient
84
- #
85
- text.gsub!(regex, replace) and break
84
+ @normalizes_words_regexp_replaces.each do |regex, replace|
85
+ # This should be sufficient
86
+ #
87
+ text.gsub!(regex, replace) and break
88
+ end
89
+
90
+ remove_after_normalizing_illegals text
91
+ text
86
92
  end
87
- remove_after_normalizing_illegals text
88
- text
89
- end
90
93
 
91
- # Illegal after normalizing.
92
- #
93
- # We only allow regexps (even if string would be okay
94
- # too for gsub! - it's too hard to understand)
95
- #
96
- def removes_characters_after_splitting regexp
97
- check_argument_in __method__, Regexp, regexp
98
- @removes_characters_after_splitting_regexp = regexp
99
- end
100
- def remove_after_normalizing_illegals text
101
- text.gsub! @removes_characters_after_splitting_regexp, EMPTY_STRING if @removes_characters_after_splitting_regexp
102
- end
94
+ # Illegal after normalizing.
95
+ #
96
+ # We only allow regexps (even if string would be okay
97
+ # too for gsub! - it's too hard to understand)
98
+ #
99
+ def removes_characters_after_splitting regexp
100
+ check_argument_in __method__, Regexp, regexp
101
+ @removes_characters_after_splitting_regexp = regexp
102
+ end
103
+ def remove_after_normalizing_illegals text
104
+ text.gsub! @removes_characters_after_splitting_regexp, EMPTY_STRING if @removes_characters_after_splitting_regexp
105
+ end
103
106
 
104
- # Substitute Characters with this substituter.
105
- #
106
- # Default is European Character substitution.
107
- #
108
- def substitutes_characters_with substituter = CharacterSubstituters::WestEuropean.new
109
- raise ArgumentError.new "The substitutes_characters_with option needs a character substituter, which responds to #substitute." unless substituter.respond_to?(:substitute)
110
- @substituter = substituter
111
- end
112
- def substitute_characters text
113
- substituter?? substituter.substitute(text) : text
114
- end
107
+ # Substitute Characters with this substituter.
108
+ #
109
+ # Default is European Character substitution.
110
+ #
111
+ def substitutes_characters_with substituter = CharacterSubstituters::WestEuropean.new
112
+ raise ArgumentError.new "The substitutes_characters_with option needs a character substituter, which responds to #substitute." unless substituter.respond_to?(:substitute)
113
+ @substituter = substituter
114
+ end
115
+ def substitute_characters text
116
+ substituter?? substituter.substitute(text) : text
117
+ end
115
118
 
116
- # Reject tokens after tokenizing based on the given criteria.
117
- #
118
- # Note: Currently only for indexing.
119
- #
120
- def reject_token_if &condition
121
- @reject_condition = condition
122
- end
123
- def reject tokens
124
- tokens.reject! &@reject_condition
125
- end
119
+ # Reject tokens after tokenizing based on the given criteria.
120
+ #
121
+ # Note: Currently only for indexing.
122
+ #
123
+ def reject_token_if &condition
124
+ @reject_condition = condition
125
+ end
126
+ def reject tokens
127
+ tokens.reject! &@reject_condition
128
+ end
126
129
 
127
- def case_sensitive case_sensitive
128
- @case_sensitive = case_sensitive
129
- end
130
- def downcase?
131
- !@case_sensitive
132
- end
130
+ def case_sensitive case_sensitive
131
+ @case_sensitive = case_sensitive
132
+ end
133
+ def downcase?
134
+ !@case_sensitive
135
+ end
133
136
 
134
- # Checks if the right argument type has been given.
135
- #
136
- def check_argument_in method, type, argument, &condition
137
- raise ArgumentError.new "Application##{method} takes a #{type} as argument, not a #{argument.class}." unless type === argument
138
- end
137
+ # Checks if the right argument type has been given.
138
+ #
139
+ def check_argument_in method, type, argument, &condition
140
+ raise ArgumentError.new "Application##{method} takes a #{type} as argument, not a #{argument.class}." unless type === argument
141
+ end
139
142
 
140
143
 
141
- # Returns a number of tokens, generated from the given text.
142
- #
143
- # Note:
144
- # * preprocess, pretokenize are hooks
145
- #
146
- def tokenize text
147
- text = preprocess text # processing the text
148
- return empty_tokens if text.blank?
149
- words = pretokenize text # splitting and preparations for tokenizing
150
- return empty_tokens if words.empty?
151
- tokens = tokens_for words # creating tokens / strings
152
- process tokens # processing tokens / strings
153
- end
144
+ # Returns a number of tokens, generated from the given text.
145
+ #
146
+ # Note:
147
+ # * preprocess, pretokenize are hooks
148
+ #
149
+ def tokenize text
150
+ text = preprocess text # processing the text
151
+ return empty_tokens if text.blank?
152
+ words = pretokenize text # splitting and preparations for tokenizing
153
+ return empty_tokens if words.empty?
154
+ tokens = tokens_for words # creating tokens / strings
155
+ process tokens # processing tokens / strings
156
+ end
157
+
158
+ attr_reader :substituter
159
+ alias substituter? substituter
154
160
 
155
- attr_reader :substituter
156
- alias substituter? substituter
161
+ def initialize options = {}
162
+ removes_characters options[:removes_characters] if options[:removes_characters]
163
+ contracts_expressions *options[:contracts_expressions] if options[:contracts_expressions]
164
+ stopwords options[:stopwords] if options[:stopwords]
165
+ normalizes_words options[:normalizes_words] if options[:normalizes_words]
166
+ removes_characters_after_splitting options[:removes_characters_after_splitting] if options[:removes_characters_after_splitting]
167
+ substitutes_characters_with options[:substitutes_characters_with] if options[:substitutes_characters_with]
168
+ case_sensitive options[:case_sensitive] unless options[:case_sensitive].nil?
157
169
 
158
- def initialize options = {}
159
- removes_characters options[:removes_characters] if options[:removes_characters]
160
- contracts_expressions *options[:contracts_expressions] if options[:contracts_expressions]
161
- stopwords options[:stopwords] if options[:stopwords]
162
- normalizes_words options[:normalizes_words] if options[:normalizes_words]
163
- removes_characters_after_splitting options[:removes_characters_after_splitting] if options[:removes_characters_after_splitting]
164
- substitutes_characters_with options[:substitutes_characters_with] if options[:substitutes_characters_with]
165
- case_sensitive options[:case_sensitive] unless options[:case_sensitive].nil?
170
+ # Defaults.
171
+ #
172
+ splits_text_on options[:splits_text_on] || /\s/
173
+ reject_token_if &(options[:reject_token_if] || options[:rejects_token_if] || :blank?) # TODO Decide on using an s or not.
174
+ end
166
175
 
167
- # Defaults.
176
+ # Default preprocessing hook.
168
177
  #
169
- splits_text_on options[:splits_text_on] || /\s/
170
- reject_token_if &(options[:reject_token_if] || options[:rejects_token_if] || :blank?) # TODO Decide on using an s or not.
171
- end
178
+ # Does:
179
+ # 1. Character substitution.
180
+ # 2. Remove illegal expressions.
181
+ # 3. Remove non-single stopwords. (Stopwords that occur with other words)
182
+ #
183
+ def preprocess text
184
+ text = substitute_characters text
185
+ remove_illegals text
186
+ # We do not remove single stopwords e.g. in the indexer for
187
+ # an entirely different reason than in the query tokenizer.
188
+ # An indexed thing with just name "UND" (a possible stopword)
189
+ # should not lose its name.
190
+ #
191
+ remove_non_single_stopwords text
192
+ text
193
+ end
194
+ # Pretokenizing.
195
+ #
196
+ # Does:
197
+ # 1. Split the text into words.
198
+ # 2. Normalize each word.
199
+ #
200
+ def pretokenize text
201
+ words = split text
202
+ words.collect! do |word|
203
+ normalize_with_patterns word
204
+ word
205
+ end
206
+ end
207
+ # Basic postprocessing (overridden in both query/index tokenizers).
208
+ #
209
+ def process tokens
210
+ reject tokens # Reject any tokens that don't meet criteria
211
+ tokens
212
+ end
172
213
 
173
- # Default preprocessing hook.
174
- #
175
- # Does:
176
- # 1. Character substitution.
177
- # 2. Remove illegal expressions.
178
- # 3. Remove non-single stopwords. (Stopwords that occur with other words)
179
- #
180
- def preprocess text
181
- text = substitute_characters text
182
- remove_illegals text
183
- # We do not remove single stopwords e.g. in the indexer for
184
- # an entirely different reason than in the query tokenizer.
185
- # An indexed thing with just name "UND" (a possible stopword)
186
- # should not lose its name.
187
- #
188
- remove_non_single_stopwords text
189
- text
190
- end
191
- # Pretokenizing.
192
- #
193
- # Does:
194
- # 1. Split the text into words.
195
- # 2. Normalize each word.
196
- #
197
- def pretokenize text
198
- words = split text
199
- words.collect! do |word|
200
- normalize_with_patterns word
201
- word
214
+ # # Converts words into real tokens.
215
+ # #
216
+ # def tokens_for words
217
+ # Query::Tokens.new words.collect! { |word| token_for word }
218
+ # end
219
+ # Turns non-blank text into symbols.
220
+ #
221
+ def symbolize text
222
+ text.blank? ? nil : text.to_sym
202
223
  end
203
- end
204
- # Basic postprocessing (overridden in both query/index tokenizers).
205
- #
206
- def process tokens
207
- reject tokens # Reject any tokens that don't meet criteria
208
- tokens
209
- end
210
224
 
211
- # # Converts words into real tokens.
212
- # #
213
- # def tokens_for words
214
- # Query::Tokens.new words.collect! { |word| token_for word }
215
- # end
216
- # Turns non-blank text into symbols.
217
- #
218
- def symbolize text
219
- text.blank? ? nil : text.to_sym
220
225
  end
221
226
 
222
227
  end
@@ -1,28 +1,32 @@
1
- module Tokenizers
1
+ module Picky
2
2
 
3
- # The base indexing tokenizer.
4
- #
5
- # Override in indexing subclasses and define in configuration.
6
- #
7
- class Index < Base
3
+ module Tokenizers
8
4
 
9
- def self.default= new_default
10
- @default = new_default
11
- end
12
- def self.default
13
- @default ||= new
14
- end
15
-
16
- # Does not actually return a token, but a
17
- # symbol "token".
5
+ # The base indexing tokenizer.
18
6
  #
19
- def tokens_for words
20
- words.collect! { |word| word.downcase! if downcase?; word.to_sym }
21
- end
22
- # Returns empty tokens.
7
+ # Override in indexing subclasses and define in configuration.
23
8
  #
24
- def empty_tokens
25
- []
9
+ class Index < Base
10
+
11
+ def self.default= new_default
12
+ @default = new_default
13
+ end
14
+ def self.default
15
+ @default ||= new
16
+ end
17
+
18
+ # Does not actually return a token, but a
19
+ # symbol "token".
20
+ #
21
+ def tokens_for words
22
+ words.collect! { |word| word.downcase! if downcase?; word.to_sym }
23
+ end
24
+ # Returns empty tokens.
25
+ #
26
+ def empty_tokens
27
+ []
28
+ end
29
+
26
30
  end
27
31
 
28
32
  end