picky 2.7.0 → 3.0.0.pre1

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (213)
  1. data/lib/picky/adapters/rack/base.rb +20 -16
  2. data/lib/picky/adapters/rack/live_parameters.rb +28 -24
  3. data/lib/picky/adapters/rack/search.rb +67 -0
  4. data/lib/picky/adapters/rack.rb +27 -23
  5. data/lib/picky/application.rb +246 -236
  6. data/lib/picky/backend/base.rb +115 -119
  7. data/lib/picky/backend/file/basic.rb +102 -98
  8. data/lib/picky/backend/file/json.rb +27 -23
  9. data/lib/picky/backend/file/marshal.rb +32 -28
  10. data/lib/picky/backend/file/text.rb +45 -41
  11. data/lib/picky/backend/files.rb +19 -15
  12. data/lib/picky/backend/redis/basic.rb +76 -72
  13. data/lib/picky/backend/redis/list_hash.rb +40 -36
  14. data/lib/picky/backend/redis/string_hash.rb +30 -26
  15. data/lib/picky/backend/redis.rb +32 -28
  16. data/lib/picky/bundle.rb +82 -57
  17. data/lib/{bundling.rb → picky/bundling.rb} +0 -0
  18. data/lib/picky/calculations/location.rb +51 -47
  19. data/lib/picky/categories.rb +60 -56
  20. data/lib/picky/categories_indexed.rb +73 -82
  21. data/lib/picky/categories_indexing.rb +12 -8
  22. data/lib/picky/category.rb +109 -120
  23. data/lib/picky/category_indexed.rb +39 -41
  24. data/lib/picky/category_indexing.rb +123 -125
  25. data/lib/picky/character_substituters/west_european.rb +32 -26
  26. data/lib/{constants.rb → picky/constants.rb} +0 -0
  27. data/lib/picky/cores.rb +96 -92
  28. data/lib/{deployment.rb → picky/deployment.rb} +0 -0
  29. data/lib/picky/frontend_adapters/rack.rb +133 -118
  30. data/lib/picky/generators/aliases.rb +5 -3
  31. data/lib/picky/generators/base.rb +11 -7
  32. data/lib/picky/generators/partial/default.rb +7 -3
  33. data/lib/picky/generators/partial/none.rb +24 -20
  34. data/lib/picky/generators/partial/strategy.rb +20 -16
  35. data/lib/picky/generators/partial/substring.rb +94 -90
  36. data/lib/picky/generators/partial_generator.rb +11 -7
  37. data/lib/picky/generators/similarity/default.rb +9 -5
  38. data/lib/picky/generators/similarity/double_metaphone.rb +20 -16
  39. data/lib/picky/generators/similarity/metaphone.rb +20 -16
  40. data/lib/picky/generators/similarity/none.rb +23 -19
  41. data/lib/picky/generators/similarity/phonetic.rb +49 -45
  42. data/lib/picky/generators/similarity/soundex.rb +20 -16
  43. data/lib/picky/generators/similarity/strategy.rb +10 -6
  44. data/lib/picky/generators/similarity_generator.rb +11 -7
  45. data/lib/picky/generators/strategy.rb +14 -10
  46. data/lib/picky/generators/weights/default.rb +9 -5
  47. data/lib/picky/generators/weights/logarithmic.rb +30 -26
  48. data/lib/picky/generators/weights/strategy.rb +10 -6
  49. data/lib/picky/generators/weights_generator.rb +11 -7
  50. data/lib/picky/helpers/measuring.rb +20 -16
  51. data/lib/picky/indexed/bundle/base.rb +39 -37
  52. data/lib/picky/indexed/bundle/memory.rb +68 -64
  53. data/lib/picky/indexed/bundle/redis.rb +73 -69
  54. data/lib/picky/indexed/wrappers/bundle/calculation.rb +26 -22
  55. data/lib/picky/indexed/wrappers/bundle/location.rb +30 -26
  56. data/lib/picky/indexed/wrappers/bundle/wrapper.rb +36 -32
  57. data/lib/picky/indexed/wrappers/category/location.rb +17 -13
  58. data/lib/picky/indexed/wrappers/exact_first.rb +46 -42
  59. data/lib/picky/indexers/base.rb +26 -22
  60. data/lib/picky/indexers/parallel.rb +62 -58
  61. data/lib/picky/indexers/serial.rb +41 -37
  62. data/lib/picky/indexes/index.rb +400 -0
  63. data/lib/picky/indexes/index_indexed.rb +24 -0
  64. data/lib/picky/indexes/index_indexing.rb +138 -0
  65. data/lib/picky/indexes/memory.rb +20 -0
  66. data/lib/picky/indexes/redis.rb +20 -0
  67. data/lib/picky/indexes.rb +68 -61
  68. data/lib/picky/indexes_indexed.rb +16 -12
  69. data/lib/picky/indexes_indexing.rb +41 -37
  70. data/lib/picky/indexing/bundle/base.rb +216 -205
  71. data/lib/picky/indexing/bundle/memory.rb +16 -11
  72. data/lib/picky/indexing/bundle/redis.rb +14 -12
  73. data/lib/picky/indexing/wrappers/category/location.rb +17 -13
  74. data/lib/picky/interfaces/live_parameters.rb +159 -154
  75. data/lib/picky/loader.rb +267 -304
  76. data/lib/picky/loggers/search.rb +20 -13
  77. data/lib/picky/no_source_specified_exception.rb +7 -3
  78. data/lib/picky/performant.rb +6 -2
  79. data/lib/picky/query/allocation.rb +71 -67
  80. data/lib/picky/query/allocations.rb +99 -94
  81. data/lib/picky/query/combination.rb +70 -66
  82. data/lib/picky/query/combinations/base.rb +56 -52
  83. data/lib/picky/query/combinations/memory.rb +36 -32
  84. data/lib/picky/query/combinations/redis.rb +66 -62
  85. data/lib/picky/query/indexes.rb +175 -160
  86. data/lib/picky/query/qualifier_category_mapper.rb +43 -0
  87. data/lib/picky/query/token.rb +165 -172
  88. data/lib/picky/query/tokens.rb +86 -82
  89. data/lib/picky/query/weights.rb +44 -48
  90. data/lib/picky/query.rb +5 -1
  91. data/lib/picky/rack/harakiri.rb +51 -47
  92. data/lib/picky/results.rb +81 -77
  93. data/lib/picky/search.rb +169 -158
  94. data/lib/picky/sinatra.rb +34 -0
  95. data/lib/picky/sources/base.rb +73 -70
  96. data/lib/picky/sources/couch.rb +61 -57
  97. data/lib/picky/sources/csv.rb +68 -64
  98. data/lib/picky/sources/db.rb +139 -135
  99. data/lib/picky/sources/delicious.rb +52 -48
  100. data/lib/picky/sources/mongo.rb +68 -63
  101. data/lib/picky/sources/wrappers/base.rb +20 -16
  102. data/lib/picky/sources/wrappers/location.rb +37 -33
  103. data/lib/picky/statistics.rb +46 -43
  104. data/lib/picky/tasks.rb +3 -0
  105. data/lib/picky/tokenizers/base.rb +192 -187
  106. data/lib/picky/tokenizers/index.rb +25 -21
  107. data/lib/picky/tokenizers/location.rb +33 -29
  108. data/lib/picky/tokenizers/query.rb +49 -43
  109. data/lib/picky.rb +21 -13
  110. data/lib/tasks/application.rake +1 -1
  111. data/lib/tasks/index.rake +3 -3
  112. data/lib/tasks/routes.rake +1 -1
  113. data/lib/tasks/server.rake +1 -1
  114. data/spec/lib/adapters/rack/base_spec.rb +1 -1
  115. data/spec/lib/adapters/rack/live_parameters_spec.rb +1 -1
  116. data/spec/lib/adapters/rack/query_spec.rb +1 -1
  117. data/spec/lib/application_spec.rb +39 -32
  118. data/spec/lib/backend/file/basic_spec.rb +2 -2
  119. data/spec/lib/backend/file/json_spec.rb +2 -2
  120. data/spec/lib/backend/file/marshal_spec.rb +2 -2
  121. data/spec/lib/backend/file/text_spec.rb +1 -1
  122. data/spec/lib/backend/files_spec.rb +14 -24
  123. data/spec/lib/backend/redis/basic_spec.rb +2 -2
  124. data/spec/lib/backend/redis/list_hash_spec.rb +3 -3
  125. data/spec/lib/backend/redis/string_hash_spec.rb +3 -3
  126. data/spec/lib/backend/redis_spec.rb +20 -13
  127. data/spec/lib/calculations/location_spec.rb +1 -1
  128. data/spec/lib/categories_indexed_spec.rb +16 -34
  129. data/spec/lib/category_indexed_spec.rb +9 -27
  130. data/spec/lib/category_indexing_spec.rb +2 -3
  131. data/spec/lib/category_spec.rb +10 -10
  132. data/spec/lib/character_substituters/west_european_spec.rb +6 -5
  133. data/spec/lib/cores_spec.rb +17 -17
  134. data/spec/lib/extensions/symbol_spec.rb +15 -1
  135. data/spec/lib/frontend_adapters/rack_spec.rb +20 -20
  136. data/spec/lib/generators/aliases_spec.rb +3 -3
  137. data/spec/lib/generators/cacher_strategy_spec.rb +1 -1
  138. data/spec/lib/generators/partial/default_spec.rb +3 -3
  139. data/spec/lib/generators/partial/none_spec.rb +2 -2
  140. data/spec/lib/generators/partial/substring_spec.rb +1 -1
  141. data/spec/lib/generators/partial_generator_spec.rb +3 -3
  142. data/spec/lib/generators/similarity/double_metaphone_spec.rb +1 -1
  143. data/spec/lib/generators/similarity/metaphone_spec.rb +1 -1
  144. data/spec/lib/generators/similarity/none_spec.rb +1 -1
  145. data/spec/lib/generators/similarity/phonetic_spec.rb +1 -1
  146. data/spec/lib/generators/similarity/soundex_spec.rb +1 -1
  147. data/spec/lib/generators/similarity_generator_spec.rb +2 -2
  148. data/spec/lib/generators/weights/logarithmic_spec.rb +1 -1
  149. data/spec/lib/generators/weights_generator_spec.rb +1 -1
  150. data/spec/lib/helpers/measuring_spec.rb +2 -2
  151. data/spec/lib/indexed/bundle/memory_spec.rb +6 -6
  152. data/spec/lib/indexed/bundle/redis_spec.rb +4 -4
  153. data/spec/lib/indexed/wrappers/bundle/calculation_spec.rb +2 -3
  154. data/spec/lib/indexed/wrappers/bundle/wrapper_spec.rb +2 -2
  155. data/spec/lib/indexed/wrappers/exact_first_spec.rb +5 -5
  156. data/spec/lib/indexers/base_spec.rb +1 -1
  157. data/spec/lib/indexers/parallel_spec.rb +1 -1
  158. data/spec/lib/indexers/serial_spec.rb +1 -1
  159. data/spec/lib/{index/base_indexed_spec.rb → indexes/index_indexed_spec.rb} +3 -3
  160. data/spec/lib/{index/base_indexing_spec.rb → indexes/index_indexing_spec.rb} +19 -2
  161. data/spec/lib/{index/base_spec.rb → indexes/index_spec.rb} +6 -25
  162. data/spec/lib/{index → indexes}/redis_spec.rb +1 -1
  163. data/spec/lib/indexes_class_spec.rb +2 -2
  164. data/spec/lib/indexes_indexed_spec.rb +1 -1
  165. data/spec/lib/indexes_indexing_spec.rb +1 -1
  166. data/spec/lib/indexes_spec.rb +1 -1
  167. data/spec/lib/indexing/bundle/base_spec.rb +7 -5
  168. data/spec/lib/indexing/bundle/memory_partial_generation_speed_spec.rb +4 -4
  169. data/spec/lib/indexing/bundle/memory_spec.rb +15 -15
  170. data/spec/lib/indexing/bundle/redis_spec.rb +9 -9
  171. data/spec/lib/interfaces/live_parameters_spec.rb +5 -5
  172. data/spec/lib/loader_spec.rb +17 -19
  173. data/spec/lib/loggers/search_spec.rb +2 -2
  174. data/spec/lib/query/allocation_spec.rb +1 -1
  175. data/spec/lib/query/allocations_spec.rb +1 -1
  176. data/spec/lib/query/combination_spec.rb +4 -4
  177. data/spec/lib/query/combinations/base_spec.rb +1 -1
  178. data/spec/lib/query/combinations/memory_spec.rb +1 -1
  179. data/spec/lib/query/combinations/redis_spec.rb +1 -1
  180. data/spec/lib/query/indexes_spec.rb +7 -2
  181. data/spec/lib/query/qualifier_category_mapper_spec.rb +34 -0
  182. data/spec/lib/query/token_spec.rb +32 -53
  183. data/spec/lib/query/tokens_spec.rb +30 -35
  184. data/spec/lib/query/weights_spec.rb +16 -16
  185. data/spec/lib/rack/harakiri_spec.rb +5 -5
  186. data/spec/lib/results_spec.rb +1 -1
  187. data/spec/lib/search_spec.rb +24 -22
  188. data/spec/lib/sinatra_spec.rb +36 -0
  189. data/spec/lib/sources/base_spec.rb +1 -1
  190. data/spec/lib/sources/couch_spec.rb +9 -9
  191. data/spec/lib/sources/csv_spec.rb +7 -7
  192. data/spec/lib/sources/db_spec.rb +2 -2
  193. data/spec/lib/sources/delicious_spec.rb +5 -5
  194. data/spec/lib/sources/mongo_spec.rb +7 -7
  195. data/spec/lib/sources/wrappers/base_spec.rb +2 -2
  196. data/spec/lib/sources/wrappers/location_spec.rb +1 -1
  197. data/spec/lib/statistics_spec.rb +1 -1
  198. data/spec/lib/tokenizers/base_spec.rb +2 -2
  199. data/spec/lib/tokenizers/index_spec.rb +1 -1
  200. data/spec/lib/tokenizers/query_spec.rb +1 -1
  201. metadata +30 -30
  202. data/lib/picky/adapters/rack/query.rb +0 -65
  203. data/lib/picky/index/base.rb +0 -409
  204. data/lib/picky/index/base_indexed.rb +0 -29
  205. data/lib/picky/index/base_indexing.rb +0 -127
  206. data/lib/picky/index/memory.rb +0 -16
  207. data/lib/picky/index/redis.rb +0 -16
  208. data/lib/picky/query/qualifiers.rb +0 -76
  209. data/lib/picky/query/solr.rb +0 -60
  210. data/lib/picky/signals.rb +0 -8
  211. data/lib/picky-tasks.rb +0 -6
  212. data/lib/tasks/spec.rake +0 -11
  213. data/spec/lib/query/qualifiers_spec.rb +0 -31
@@ -1,48 +1,52 @@
1
- module Sources
1
+ module Picky
2
2
 
3
- module Wrappers
3
+ module Sources
4
4
 
5
- # Should this actually just be a tokenizer?
6
- #
7
- class Location < Base
5
+ module Wrappers
8
6
 
9
- attr_reader :calculation
10
-
11
- def initialize source, grid, precision = 1
12
- super source
13
- @calculation = Calculations::Location.new grid, precision
14
- end
15
-
16
- # Yield the data (id, text for id) for the given category.
7
+ # Should this actually just be a tokenizer?
17
8
  #
18
- def harvest category
19
- minimum = 1.0/0
9
+ class Location < Base
20
10
 
21
- # Cache. TODO Make option?
22
- #
23
- locations = []
11
+ attr_reader :calculation
24
12
 
25
- # Gather min/max.
26
- #
27
- source.harvest category do |indexed_id, location|
28
- location = location.to_f
29
- minimum = location if location < minimum
30
- locations << [indexed_id, location]
13
+ def initialize source, grid, precision = 1
14
+ super source
15
+ @calculation = Calculations::Location.new grid, precision
31
16
  end
32
17
 
33
- calculation.minimum = minimum
34
-
35
- # Recalculate locations.
18
+ # Yield the data (id, text for id) for the given category.
36
19
  #
37
- locations.each do |indexed_id, location|
38
- calculation.recalculated_range(location).each do |new_location|
39
- yield indexed_id, new_location.to_s
20
+ def harvest category
21
+ minimum = 1.0/0
22
+
23
+ # Cache.
24
+ #
25
+ locations = []
26
+
27
+ # Gather min/max.
28
+ #
29
+ source.harvest category do |indexed_id, location|
30
+ location = location.to_f
31
+ minimum = location if location < minimum
32
+ locations << [indexed_id, location]
40
33
  end
34
+
35
+ calculation.minimum = minimum
36
+
37
+ # Recalculate locations.
38
+ #
39
+ locations.each do |indexed_id, location|
40
+ calculation.recalculated_range(location).each do |new_location|
41
+ yield indexed_id, new_location.to_s
42
+ end
43
+ end
44
+
45
+ # TODO Move to the right place.
46
+ #
47
+ category.indexing_exact[:location_minimum] = minimum
41
48
  end
42
49
 
43
- # TODO Move to the right place.
44
- #
45
- category.indexing_exact[:location_minimum] = minimum
46
50
  end
47
51
 
48
52
  end
@@ -1,60 +1,63 @@
1
1
  # encoding: utf-8
2
2
  #
3
+ module Picky
3
4
 
4
- # Gathers various statistics.
5
- #
6
- class Statistics # :nodoc:all
5
+ # Gathers various statistics.
6
+ #
7
+ class Statistics # :nodoc:all
7
8
 
8
- def initialize
9
- @indexes = ["\033[1mIndexes analysis\033[m:"]
10
- end
9
+ def initialize
10
+ @indexes = ["\033[1mIndexes analysis\033[m:"]
11
+ end
11
12
 
12
- def preamble
13
- loc = lines_of_code File.open('app/application.rb').read
13
+ def preamble
14
+ loc = lines_of_code File.open('app/application.rb').read
14
15
 
15
- @preamble ||= <<-PREAMBLE
16
- \033[1mApplication(s)\033[m
17
- Definition LOC: #{"%4d" % loc}
18
- Indexes defined: #{"%4d" % Indexes.size}
19
- PREAMBLE
20
- end
16
+ @preamble ||= <<-PREAMBLE
17
+ \033[1mApplication(s)\033[m
18
+ Definition LOC: #{"%4d" % loc}
19
+ Indexes defined: #{"%4d" % Indexes.size}
20
+ PREAMBLE
21
+ end
21
22
 
22
- # Gathers information about the application.
23
- #
24
- def application
25
- preamble
26
- @application = Application.apps.map &:indented_to_s
27
- end
23
+ # Gathers information about the application.
24
+ #
25
+ def application
26
+ preamble
27
+ @application = Application.apps.map &:indented_to_s
28
+ end
28
29
 
29
- # Gathers information about the indexes.
30
- #
31
- def analyze object
32
- object.each_category do |category|
33
- @indexes << <<-ANALYSIS
34
- #{"#{category.index_name}".indented_to_s}\n
35
- #{"#{category.name}".indented_to_s(4)}\n
36
- #{"exact\n#{Analyzer.new.analyze(category.indexed_exact).indented_to_s}".indented_to_s(6)}\n
37
- #{"partial\n#{Analyzer.new.analyze(category.indexed_partial).indented_to_s}".indented_to_s(6)}
38
- ANALYSIS
30
+ # Gathers information about the indexes.
31
+ #
32
+ def analyze object
33
+ object.each_category do |category|
34
+ @indexes << <<-ANALYSIS
35
+ #{"#{category.index_name}".indented_to_s}\n
36
+ #{"#{category.name}".indented_to_s(4)}\n
37
+ #{"exact\n#{Analyzer.new.analyze(category.indexed_exact).indented_to_s}".indented_to_s(6)}\n
38
+ #{"partial\n#{Analyzer.new.analyze(category.indexed_partial).indented_to_s}".indented_to_s(6)}
39
+ ANALYSIS
40
+ end
39
41
  end
40
- end
41
42
 
42
- # Outputs all gathered statistics.
43
- #
44
- def to_s
45
- <<-STATS
43
+ # Outputs all gathered statistics.
44
+ #
45
+ def to_s
46
+ <<-STATS
46
47
 
47
- Picky Configuration:
48
+ Picky Configuration:
48
49
 
49
- #{[@preamble, @application, @indexes.join("\n")].compact.join("\n")}
50
- STATS
51
- end
50
+ #{[@preamble, @application, @indexes.join("\n")].compact.join("\n")}
51
+ STATS
52
+ end
52
53
 
53
- # Internal methods.
54
- #
54
+ # Internal methods.
55
+ #
56
+
57
+ def lines_of_code text
58
+ text.scan(/^\s*[^#\s].*$/).size
59
+ end
55
60
 
56
- def lines_of_code text
57
- text.scan(/^\s*[^#\s].*$/).size
58
61
  end
59
62
 
60
63
  end
@@ -0,0 +1,3 @@
1
+ all_rake_files = File.expand_path '../../tasks/*.rake', __FILE__
2
+
3
+ Dir[all_rake_files].each { |rakefile| load rakefile }
@@ -1,16 +1,18 @@
1
- module Tokenizers # :nodoc:all
1
+ module Picky
2
2
 
3
- # Defines tokenizing processes used both in indexing and querying.
4
- #
5
- class Base
3
+ module Tokenizers # :nodoc:all
6
4
 
7
- # TODO Move EMPTY_STRING top level.
5
+ # Defines tokenizing processes used both in indexing and querying.
8
6
  #
9
- EMPTY_STRING = ''.freeze
7
+ class Base
10
8
 
11
- def to_s
12
- reject_condition_location = @reject_condition.to_s[/:(\d+) \(lambda\)/, 1]
13
- <<-TOKENIZER
9
+ # TODO Move EMPTY_STRING top level.
10
+ #
11
+ EMPTY_STRING = ''.freeze
12
+
13
+ def to_s
14
+ reject_condition_location = @reject_condition.to_s[/:(\d+) \(lambda\)/, 1]
15
+ <<-TOKENIZER
14
16
  Removes characters: #{@removes_characters_regexp ? "/#{@removes_characters_regexp.source}/" : '-'}
15
17
  Stopwords: #{@remove_stopwords_regexp ? "/#{@remove_stopwords_regexp.source}/" : '-'}
16
18
  Splits text on: #{@splits_text_on.respond_to?(:source) ? "/#{@splits_text_on.source}/" : (@splits_text_on ? @splits_text_on : '-')}
@@ -19,204 +21,207 @@ Normalizes words: #{@normalizes_words_regexp_replaces ? @normalizes_wor
19
21
  Rejects tokens? #{reject_condition_location ? "Yes, see line #{reject_condition_location} in app/application.rb" : '-'}
20
22
  Substitutes chars? #{@substituter ? "Yes, using #{@substituter}." : '-' }
21
23
  Case sensitive? #{@case_sensitive ? "Yes." : "-"}
22
- TOKENIZER
23
- end
24
+ TOKENIZER
25
+ end
24
26
 
25
- # Stopwords.
26
- #
27
- # We only allow regexps (even if string would be okay
28
- # too for gsub! - it's too hard to understand)
29
- #
30
- def stopwords regexp
31
- check_argument_in __method__, Regexp, regexp
32
- @remove_stopwords_regexp = regexp
33
- end
34
- def remove_stopwords text
35
- text.gsub! @remove_stopwords_regexp, EMPTY_STRING if @remove_stopwords_regexp
36
- text
37
- end
38
- @@non_single_stopword_regexp = /^\b[\w:]+?\b[\.\*\~]?\s?$/
39
- def remove_non_single_stopwords text
40
- return text if text.match @@non_single_stopword_regexp
41
- remove_stopwords text
42
- end
27
+ # Stopwords.
28
+ #
29
+ # We only allow regexps (even if string would be okay
30
+ # too for gsub! - it's too hard to understand)
31
+ #
32
+ def stopwords regexp
33
+ check_argument_in __method__, Regexp, regexp
34
+ @remove_stopwords_regexp = regexp
35
+ end
36
+ def remove_stopwords text
37
+ text.gsub! @remove_stopwords_regexp, EMPTY_STRING if @remove_stopwords_regexp
38
+ text
39
+ end
40
+ @@non_single_stopword_regexp = /^\b[\w:]+?\b[\.\*\~]?\s?$/
41
+ def remove_non_single_stopwords text
42
+ return text if text.match @@non_single_stopword_regexp
43
+ remove_stopwords text
44
+ end
43
45
 
44
- # Illegals.
45
- #
46
- # We only allow regexps (even if string would be okay
47
- # too for gsub! - it's too hard to understand)
48
- #
49
- def removes_characters regexp
50
- check_argument_in __method__, Regexp, regexp
51
- @removes_characters_regexp = regexp
52
- end
53
- def remove_illegals text
54
- text.gsub! @removes_characters_regexp, EMPTY_STRING if @removes_characters_regexp
55
- text
56
- end
46
+ # Illegals.
47
+ #
48
+ # We only allow regexps (even if string would be okay
49
+ # too for gsub! - it's too hard to understand)
50
+ #
51
+ def removes_characters regexp
52
+ check_argument_in __method__, Regexp, regexp
53
+ @removes_characters_regexp = regexp
54
+ end
55
+ def remove_illegals text
56
+ text.gsub! @removes_characters_regexp, EMPTY_STRING if @removes_characters_regexp
57
+ text
58
+ end
57
59
 
58
- # Splitting.
59
- #
60
- # We allow Strings and Regexps.
61
- # Note: We do not test against to_str since symbols do not work with String#split.
62
- #
63
- def splits_text_on regexp_or_string
64
- raise ArgumentError.new "#{__method__} takes a Regexp or String as argument, not a #{regexp_or_string.class}." unless Regexp === regexp_or_string || String === regexp_or_string
65
- @splits_text_on = regexp_or_string
66
- end
67
- def split text
68
- text.split @splits_text_on
69
- end
60
+ # Splitting.
61
+ #
62
+ # We allow Strings and Regexps.
63
+ # Note: We do not test against to_str since symbols do not work with String#split.
64
+ #
65
+ def splits_text_on regexp_or_string
66
+ raise ArgumentError.new "#{__method__} takes a Regexp or String as argument, not a #{regexp_or_string.class}." unless Regexp === regexp_or_string || String === regexp_or_string
67
+ @splits_text_on = regexp_or_string
68
+ end
69
+ def split text
70
+ text.split @splits_text_on
71
+ end
70
72
 
71
- # Normalizing.
72
- #
73
- # We only allow arrays.
74
- #
75
- def normalizes_words regexp_replaces
76
- raise ArgumentError.new "#{__method__} takes an Array of replaces as argument, not a #{regexp_replaces.class}." unless regexp_replaces.respond_to?(:to_ary)
77
- @normalizes_words_regexp_replaces = regexp_replaces
78
- end
79
- def normalize_with_patterns text
80
- return text unless @normalizes_words_regexp_replaces
73
+ # Normalizing.
74
+ #
75
+ # We only allow arrays.
76
+ #
77
+ def normalizes_words regexp_replaces
78
+ raise ArgumentError.new "#{__method__} takes an Array of replaces as argument, not a #{regexp_replaces.class}." unless regexp_replaces.respond_to?(:to_ary)
79
+ @normalizes_words_regexp_replaces = regexp_replaces
80
+ end
81
+ def normalize_with_patterns text
82
+ return text unless @normalizes_words_regexp_replaces
81
83
 
82
- @normalizes_words_regexp_replaces.each do |regex, replace|
83
- # This should be sufficient
84
- #
85
- text.gsub!(regex, replace) and break
84
+ @normalizes_words_regexp_replaces.each do |regex, replace|
85
+ # This should be sufficient
86
+ #
87
+ text.gsub!(regex, replace) and break
88
+ end
89
+
90
+ remove_after_normalizing_illegals text
91
+ text
86
92
  end
87
- remove_after_normalizing_illegals text
88
- text
89
- end
90
93
 
91
- # Illegal after normalizing.
92
- #
93
- # We only allow regexps (even if string would be okay
94
- # too for gsub! - it's too hard to understand)
95
- #
96
- def removes_characters_after_splitting regexp
97
- check_argument_in __method__, Regexp, regexp
98
- @removes_characters_after_splitting_regexp = regexp
99
- end
100
- def remove_after_normalizing_illegals text
101
- text.gsub! @removes_characters_after_splitting_regexp, EMPTY_STRING if @removes_characters_after_splitting_regexp
102
- end
94
+ # Illegal after normalizing.
95
+ #
96
+ # We only allow regexps (even if string would be okay
97
+ # too for gsub! - it's too hard to understand)
98
+ #
99
+ def removes_characters_after_splitting regexp
100
+ check_argument_in __method__, Regexp, regexp
101
+ @removes_characters_after_splitting_regexp = regexp
102
+ end
103
+ def remove_after_normalizing_illegals text
104
+ text.gsub! @removes_characters_after_splitting_regexp, EMPTY_STRING if @removes_characters_after_splitting_regexp
105
+ end
103
106
 
104
- # Substitute Characters with this substituter.
105
- #
106
- # Default is European Character substitution.
107
- #
108
- def substitutes_characters_with substituter = CharacterSubstituters::WestEuropean.new
109
- raise ArgumentError.new "The substitutes_characters_with option needs a character substituter, which responds to #substitute." unless substituter.respond_to?(:substitute)
110
- @substituter = substituter
111
- end
112
- def substitute_characters text
113
- substituter?? substituter.substitute(text) : text
114
- end
107
+ # Substitute Characters with this substituter.
108
+ #
109
+ # Default is European Character substitution.
110
+ #
111
+ def substitutes_characters_with substituter = CharacterSubstituters::WestEuropean.new
112
+ raise ArgumentError.new "The substitutes_characters_with option needs a character substituter, which responds to #substitute." unless substituter.respond_to?(:substitute)
113
+ @substituter = substituter
114
+ end
115
+ def substitute_characters text
116
+ substituter?? substituter.substitute(text) : text
117
+ end
115
118
 
116
- # Reject tokens after tokenizing based on the given criteria.
117
- #
118
- # Note: Currently only for indexing.
119
- #
120
- def reject_token_if &condition
121
- @reject_condition = condition
122
- end
123
- def reject tokens
124
- tokens.reject! &@reject_condition
125
- end
119
+ # Reject tokens after tokenizing based on the given criteria.
120
+ #
121
+ # Note: Currently only for indexing.
122
+ #
123
+ def reject_token_if &condition
124
+ @reject_condition = condition
125
+ end
126
+ def reject tokens
127
+ tokens.reject! &@reject_condition
128
+ end
126
129
 
127
- def case_sensitive case_sensitive
128
- @case_sensitive = case_sensitive
129
- end
130
- def downcase?
131
- !@case_sensitive
132
- end
130
+ def case_sensitive case_sensitive
131
+ @case_sensitive = case_sensitive
132
+ end
133
+ def downcase?
134
+ !@case_sensitive
135
+ end
133
136
 
134
- # Checks if the right argument type has been given.
135
- #
136
- def check_argument_in method, type, argument, &condition
137
- raise ArgumentError.new "Application##{method} takes a #{type} as argument, not a #{argument.class}." unless type === argument
138
- end
137
+ # Checks if the right argument type has been given.
138
+ #
139
+ def check_argument_in method, type, argument, &condition
140
+ raise ArgumentError.new "Application##{method} takes a #{type} as argument, not a #{argument.class}." unless type === argument
141
+ end
139
142
 
140
143
 
141
- # Returns a number of tokens, generated from the given text.
142
- #
143
- # Note:
144
- # * preprocess, pretokenize are hooks
145
- #
146
- def tokenize text
147
- text = preprocess text # processing the text
148
- return empty_tokens if text.blank?
149
- words = pretokenize text # splitting and preparations for tokenizing
150
- return empty_tokens if words.empty?
151
- tokens = tokens_for words # creating tokens / strings
152
- process tokens # processing tokens / strings
153
- end
144
+ # Returns a number of tokens, generated from the given text.
145
+ #
146
+ # Note:
147
+ # * preprocess, pretokenize are hooks
148
+ #
149
+ def tokenize text
150
+ text = preprocess text # processing the text
151
+ return empty_tokens if text.blank?
152
+ words = pretokenize text # splitting and preparations for tokenizing
153
+ return empty_tokens if words.empty?
154
+ tokens = tokens_for words # creating tokens / strings
155
+ process tokens # processing tokens / strings
156
+ end
157
+
158
+ attr_reader :substituter
159
+ alias substituter? substituter
154
160
 
155
- attr_reader :substituter
156
- alias substituter? substituter
161
+ def initialize options = {}
162
+ removes_characters options[:removes_characters] if options[:removes_characters]
163
+ contracts_expressions *options[:contracts_expressions] if options[:contracts_expressions]
164
+ stopwords options[:stopwords] if options[:stopwords]
165
+ normalizes_words options[:normalizes_words] if options[:normalizes_words]
166
+ removes_characters_after_splitting options[:removes_characters_after_splitting] if options[:removes_characters_after_splitting]
167
+ substitutes_characters_with options[:substitutes_characters_with] if options[:substitutes_characters_with]
168
+ case_sensitive options[:case_sensitive] unless options[:case_sensitive].nil?
157
169
 
158
- def initialize options = {}
159
- removes_characters options[:removes_characters] if options[:removes_characters]
160
- contracts_expressions *options[:contracts_expressions] if options[:contracts_expressions]
161
- stopwords options[:stopwords] if options[:stopwords]
162
- normalizes_words options[:normalizes_words] if options[:normalizes_words]
163
- removes_characters_after_splitting options[:removes_characters_after_splitting] if options[:removes_characters_after_splitting]
164
- substitutes_characters_with options[:substitutes_characters_with] if options[:substitutes_characters_with]
165
- case_sensitive options[:case_sensitive] unless options[:case_sensitive].nil?
170
+ # Defaults.
171
+ #
172
+ splits_text_on options[:splits_text_on] || /\s/
173
+ reject_token_if &(options[:reject_token_if] || options[:rejects_token_if] || :blank?) # TODO Decide on using an s or not.
174
+ end
166
175
 
167
- # Defaults.
176
+ # Default preprocessing hook.
168
177
  #
169
- splits_text_on options[:splits_text_on] || /\s/
170
- reject_token_if &(options[:reject_token_if] || options[:rejects_token_if] || :blank?) # TODO Decide on using an s or not.
171
- end
178
+ # Does:
179
+ # 1. Character substitution.
180
+ # 2. Remove illegal expressions.
181
+ # 3. Remove non-single stopwords. (Stopwords that occur with other words)
182
+ #
183
+ def preprocess text
184
+ text = substitute_characters text
185
+ remove_illegals text
186
+ # We do not remove single stopwords e.g. in the indexer for
187
+ # an entirely different reason than in the query tokenizer.
188
+ # An indexed thing with just name "UND" (a possible stopword)
189
+ # should not lose its name.
190
+ #
191
+ remove_non_single_stopwords text
192
+ text
193
+ end
194
+ # Pretokenizing.
195
+ #
196
+ # Does:
197
+ # 1. Split the text into words.
198
+ # 2. Normalize each word.
199
+ #
200
+ def pretokenize text
201
+ words = split text
202
+ words.collect! do |word|
203
+ normalize_with_patterns word
204
+ word
205
+ end
206
+ end
207
+ # Basic postprocessing (overridden in both query/index tokenizers).
208
+ #
209
+ def process tokens
210
+ reject tokens # Reject any tokens that don't meet criteria
211
+ tokens
212
+ end
172
213
 
173
- # Default preprocessing hook.
174
- #
175
- # Does:
176
- # 1. Character substitution.
177
- # 2. Remove illegal expressions.
178
- # 3. Remove non-single stopwords. (Stopwords that occur with other words)
179
- #
180
- def preprocess text
181
- text = substitute_characters text
182
- remove_illegals text
183
- # We do not remove single stopwords e.g. in the indexer for
184
- # an entirely different reason than in the query tokenizer.
185
- # An indexed thing with just name "UND" (a possible stopword)
186
- # should not lose its name.
187
- #
188
- remove_non_single_stopwords text
189
- text
190
- end
191
- # Pretokenizing.
192
- #
193
- # Does:
194
- # 1. Split the text into words.
195
- # 2. Normalize each word.
196
- #
197
- def pretokenize text
198
- words = split text
199
- words.collect! do |word|
200
- normalize_with_patterns word
201
- word
214
+ # # Converts words into real tokens.
215
+ # #
216
+ # def tokens_for words
217
+ # Query::Tokens.new words.collect! { |word| token_for word }
218
+ # end
219
+ # Turns non-blank text into symbols.
220
+ #
221
+ def symbolize text
222
+ text.blank? ? nil : text.to_sym
202
223
  end
203
- end
204
- # Basic postprocessing (overridden in both query/index tokenizers).
205
- #
206
- def process tokens
207
- reject tokens # Reject any tokens that don't meet criteria
208
- tokens
209
- end
210
224
 
211
- # # Converts words into real tokens.
212
- # #
213
- # def tokens_for words
214
- # Query::Tokens.new words.collect! { |word| token_for word }
215
- # end
216
- # Turns non-blank text into symbols.
217
- #
218
- def symbolize text
219
- text.blank? ? nil : text.to_sym
220
225
  end
221
226
 
222
227
  end
@@ -1,28 +1,32 @@
1
- module Tokenizers
1
+ module Picky
2
2
 
3
- # The base indexing tokenizer.
4
- #
5
- # Override in indexing subclasses and define in configuration.
6
- #
7
- class Index < Base
3
+ module Tokenizers
8
4
 
9
- def self.default= new_default
10
- @default = new_default
11
- end
12
- def self.default
13
- @default ||= new
14
- end
15
-
16
- # Does not actually return a token, but a
17
- # symbol "token".
5
+ # The base indexing tokenizer.
18
6
  #
19
- def tokens_for words
20
- words.collect! { |word| word.downcase! if downcase?; word.to_sym }
21
- end
22
- # Returns empty tokens.
7
+ # Override in indexing subclasses and define in configuration.
23
8
  #
24
- def empty_tokens
25
- []
9
+ class Index < Base
10
+
11
+ def self.default= new_default
12
+ @default = new_default
13
+ end
14
+ def self.default
15
+ @default ||= new
16
+ end
17
+
18
+ # Does not actually return a token, but a
19
+ # symbol "token".
20
+ #
21
+ def tokens_for words
22
+ words.collect! { |word| word.downcase! if downcase?; word.to_sym }
23
+ end
24
+ # Returns empty tokens.
25
+ #
26
+ def empty_tokens
27
+ []
28
+ end
29
+
26
30
  end
27
31
 
28
32
  end