picky 0.0.9 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. data/lib/picky/application.rb +38 -37
  2. data/lib/picky/cacher/partial/default.rb +1 -3
  3. data/lib/picky/cacher/partial/subtoken.rb +44 -18
  4. data/lib/picky/configuration/field.rb +6 -2
  5. data/lib/picky/configuration/indexes.rb +16 -7
  6. data/lib/picky/configuration/queries.rb +3 -13
  7. data/lib/picky/extensions/symbol.rb +19 -4
  8. data/lib/picky/generator.rb +9 -0
  9. data/lib/picky/helpers/measuring.rb +3 -3
  10. data/lib/picky/index/bundle.rb +5 -4
  11. data/lib/picky/index/category.rb +14 -7
  12. data/lib/picky/index/combined.rb +6 -1
  13. data/lib/picky/indexers/no_source_specified_error.rb +2 -0
  14. data/lib/picky/indexes.rb +3 -9
  15. data/lib/picky/query/allocation.rb +1 -1
  16. data/lib/picky/query/allocations.rb +2 -2
  17. data/lib/picky/rack/harakiri.rb +10 -8
  18. data/lib/picky/routing.rb +19 -21
  19. data/lib/picky/solr/schema_generator.rb +4 -4
  20. data/lib/picky/sources/base.rb +16 -4
  21. data/lib/picky/sources/csv.rb +3 -0
  22. data/lib/picky/sources/db.rb +30 -22
  23. data/lib/picky/tokenizers/base.rb +7 -5
  24. data/lib/picky/tokenizers/index.rb +5 -5
  25. data/lib/picky/tokenizers/query.rb +9 -9
  26. data/prototype_project/app/application.rb +36 -29
  27. data/prototype_project/app/db.yml +1 -1
  28. data/prototype_project/config.ru +3 -2
  29. data/spec/ext/performant_spec.rb +2 -2
  30. data/spec/lib/application_spec.rb +54 -8
  31. data/spec/lib/cacher/partial/default_spec.rb +15 -0
  32. data/spec/lib/cacher/partial/subtoken_spec.rb +54 -2
  33. data/spec/lib/extensions/symbol_spec.rb +124 -30
  34. data/spec/lib/index/bundle_partial_generation_speed_spec.rb +1 -1
  35. data/spec/lib/query/allocations_spec.rb +5 -5
  36. data/spec/lib/query/combinations_spec.rb +3 -3
  37. data/spec/lib/rack/harakiri_spec.rb +29 -0
  38. data/spec/lib/routing_spec.rb +22 -98
  39. data/spec/lib/tokenizers/index_spec.rb +1 -1
  40. data/spec/specific/speed_spec.rb +4 -5
  41. metadata +7 -3
@@ -1,5 +1,7 @@
1
1
  module Indexers
2
2
 
3
+ # Raised if no source is available on a field.
4
+ #
3
5
  class NoSourceSpecifiedException < StandardError; end
4
6
 
5
7
  end
data/lib/picky/indexes.rb CHANGED
@@ -171,16 +171,10 @@ module Indexes
171
171
  end
172
172
  end
173
173
 
174
- # Loads all index definitions.
175
- #
176
- def self.setup
177
- self.types ||= []
178
- self.type_mapping ||= {}
179
- configuration.types.each do |type|
180
- add type.generate
181
- end
182
- end
183
174
  def self.add type
175
+ self.type_mapping ||= {}
176
+ self.types ||= []
177
+
184
178
  self.type_mapping[type.name] = type
185
179
  self.types << type
186
180
  end
@@ -61,7 +61,7 @@ module Query
61
61
  # Transform the allocation into result form.
62
62
  #
63
63
  def to_result
64
- [self.result_type, self.score, self.count, @combinations.to_result, self.ids] if self.count > 0
64
+ [self.result_type, self.score, count, @combinations.to_result, self.ids] if count = self.count > 0
65
65
  end
66
66
 
67
67
  # Json representation of this allocation.
@@ -66,8 +66,8 @@ module Query
66
66
  # TODO can there be no @allocations???
67
67
  return [] if @allocations.empty?
68
68
  ids = @allocations.first.ids
69
- indexes = Array.new(ids.size) { |i| i }.sort_by { rand }
70
- indexes.first(amount).map { |i| ids[i] }
69
+ indexes = Array.new(ids.size) { |id| id }.sort_by { rand }
70
+ indexes.first(amount).map { |id| ids[id] }
71
71
  end
72
72
 
73
73
  # This is the main method of this class that will replace ids and count.
@@ -1,22 +1,24 @@
1
- # Simple Rack Middleware to kill Unicorns after X requests.
2
- #
3
- # Use as follows in e.g. your rackup File:
4
- #
5
- # Rack::Harakiri.after = 50
6
- # use Rack::Harakiri
7
- #
8
1
  module Rack
2
+
3
+ # Simple Rack Middleware to kill Unicorns after X requests.
4
+ #
5
+ # Use as follows in e.g. your rackup File:
6
+ #
7
+ # Rack::Harakiri.after = 50
8
+ # use Rack::Harakiri
9
+ #
9
10
  class Harakiri
10
11
 
11
12
  # Set the amount of requests before the Unicorn commits Harakiri.
12
13
  #
13
14
  cattr_accessor :after
15
+ attr_reader :quit_after_requests
14
16
 
15
17
  def initialize app
16
18
  @app = app
17
19
 
18
20
  @requests = 0
19
- @quit_after_requests = @@after || 50
21
+ @quit_after_requests = self.class.after || 50
20
22
  end
21
23
 
22
24
  def call env
data/lib/picky/routing.rb CHANGED
@@ -14,14 +14,6 @@ class Routing
14
14
  @defaults = @@defaults.dup
15
15
  end
16
16
 
17
- # #
18
- # #
19
- # def define_using &block
20
- # reset_routes
21
- # instance_eval &block
22
- # routes.freeze
23
- # end
24
-
25
17
  #
26
18
  #
27
19
  def reset_routes
@@ -60,21 +52,27 @@ class Routing
60
52
 
61
53
  #
62
54
  #
63
- def route url, query, route_options = {}
64
- query.tokenizer = @defaults[:tokenizer] if @defaults[:tokenizer]
65
- routes.add_route generate_app(query, route_options), default_options(url, route_options)
55
+ def route options = {}
56
+ mappings, route_options = split options
57
+ mappings.each do |url, query|
58
+ route_one url, query, route_options
59
+ end
66
60
  end
67
- #
68
- #
69
- def live url, *indexes_and_options
70
- route_options = Hash === indexes_and_options.last ? indexes_and_options.pop : {}
71
- route url, Query::Live.new(*indexes_and_options), route_options
61
+ def split options
62
+ mappings = {}
63
+ route_options = {}
64
+ options.each_pair do |key, value|
65
+ if Regexp === key or String === key
66
+ mappings[key] = value
67
+ else
68
+ route_options[key] = value
69
+ end
70
+ end
71
+ [mappings, route_options]
72
72
  end
73
- #
74
- #
75
- def full url, *indexes_and_options
76
- route_options = Hash === indexes_and_options.last ? indexes_and_options.pop : {}
77
- route url, Query::Full.new(*indexes_and_options), route_options
73
+ def route_one url, query, route_options = {}
74
+ query.tokenizer = @defaults[:tokenizer] if @defaults[:tokenizer]
75
+ routes.add_route generate_app(query, route_options), default_options(url, route_options)
78
76
  end
79
77
  #
80
78
  #
@@ -54,8 +54,8 @@ module Solr
54
54
  def read_template
55
55
  template_path = File.join PICKY_ROOT, 'solr', 'conf', 'schema.xml.erb'
56
56
  schema = ''
57
- File.open(template_path, 'r') do |f|
58
- schema = f.read
57
+ File.open(template_path, 'r') do |file|
58
+ schema = file.read
59
59
  end
60
60
  schema
61
61
  end
@@ -64,8 +64,8 @@ module Solr
64
64
  #
65
65
  def write result
66
66
  schema_path = File.join PICKY_ROOT, 'solr', 'conf', 'schema.xml'
67
- File.open(schema_path, 'w') do |f|
68
- f << result
67
+ File.open(schema_path, 'w') do |file|
68
+ file << result
69
69
  end
70
70
  end
71
71
 
@@ -1,16 +1,28 @@
1
1
  module Sources
2
2
 
3
+ # Sources are where your data comes from.
4
+ # Harvest is the most important method, as it is always used to get data.
5
+ #
3
6
  class Base
4
7
 
5
- def connect_backend
6
-
8
+ # Note: Methods listed for illustrative purposes.
9
+ #
10
+
11
+ # Yield the data (id, text for id) for the given type and field.
12
+ #
13
+ def harvest type, field
14
+ # yields nothing
7
15
  end
8
16
 
9
- def take_snapshot type
17
+ # Connects to the backend.
18
+ #
19
+ def connect_backend
10
20
 
11
21
  end
12
22
 
13
- def harvest type, field
23
+ # Take a snapshot of your data, if it is fast changing.
24
+ #
25
+ def take_snapshot type
14
26
 
15
27
  end
16
28
 
@@ -2,6 +2,9 @@ require 'csv'
2
2
 
3
3
  module Sources
4
4
 
5
+ # Describes a CSV source, a file with csv in it.
6
+ # Give it a sequence of field names and a file option with the filename.
7
+ #
5
8
  class NoCSVFileGiven < StandardError; end
6
9
 
7
10
  class CSV < Base
@@ -1,13 +1,16 @@
1
1
  module Sources
2
2
 
3
+ # Describes a database source. Just give it a select statement
4
+ # (with id in it), and a file option or the options from an AR config file.
5
+ #
3
6
  class DB < Base
4
7
 
5
8
  attr_reader :select_statement, :database, :connection_options
6
9
 
7
- def initialize select_statement, with_options = { :file => 'app/db.yml' }
10
+ def initialize select_statement, options = { :file => 'app/db.yml' }
8
11
  @select_statement = select_statement
9
12
  @database = create_database_adapter
10
- configure with_options
13
+ @options = options
11
14
  end
12
15
 
13
16
  # Get a configured Database backend.
@@ -34,7 +37,7 @@ module Sources
34
37
  #
35
38
  def configure options
36
39
  @connection_options = if filename = options[:file]
37
- File.open(File.join(PICKY_ROOT, filename)) { |f| YAML::load(f) }
40
+ File.open(File.join(PICKY_ROOT, filename)) { |file| YAML::load(file) }
38
41
  else
39
42
  options
40
43
  end
@@ -44,6 +47,7 @@ module Sources
44
47
  # Connect the backend.
45
48
  #
46
49
  def connect_backend
50
+ configure @options
47
51
  return if PICKY_ENVIRONMENT.to_s == 'test' # TODO Unclean.
48
52
  raise "Database backend not configured" unless connection_options
49
53
  database.establish_connection connection_options
@@ -55,15 +59,17 @@ module Sources
55
59
  connect_backend
56
60
 
57
61
  origin = snapshot_table_name type
58
-
59
- database.connection.execute "DROP TABLE IF EXISTS #{origin}"
60
- database.connection.execute "CREATE TABLE #{origin} AS #{select_statement}"
61
- database.connection.execute "ALTER TABLE #{origin} CHANGE COLUMN id indexed_id INTEGER"
62
- database.connection.execute "ALTER TABLE #{origin} ADD COLUMN id INTEGER NOT NULL PRIMARY KEY AUTO_INCREMENT"
63
-
62
+
63
+ on_database = database.connection
64
+
65
+ on_database.execute "DROP TABLE IF EXISTS #{origin}"
66
+ on_database.execute "CREATE TABLE #{origin} AS #{select_statement}"
67
+ on_database.execute "ALTER TABLE #{origin} CHANGE COLUMN id indexed_id INTEGER"
68
+ on_database.execute "ALTER TABLE #{origin} ADD COLUMN id INTEGER NOT NULL PRIMARY KEY AUTO_INCREMENT"
69
+
64
70
  # Execute any special queries this type needs executed.
65
71
  #
66
- database.connection.execute type.after_indexing if type.after_indexing
72
+ on_database.execute type.after_indexing if type.after_indexing
67
73
  end
68
74
 
69
75
  # Counts all the entries that are used for the index.
@@ -86,6 +92,8 @@ module Sources
86
92
  # Example:
87
93
  # "SELECT indexed_id, value FROM bla_table st WHERE kind = 'bla'"
88
94
  #
95
+ # TODO Perhaps it should be just harvest field.
96
+ #
89
97
  def harvest type, field
90
98
  connect_backend
91
99
 
@@ -98,24 +106,12 @@ module Sources
98
106
  end
99
107
  end
100
108
 
101
- # Override in subclasses.
102
- #
103
- def chunksize
104
- 25_000
105
- end
106
-
107
109
  # Gets data from the backend.
108
110
  #
109
111
  def get_data type, field, offset
110
112
  database.connection.execute harvest_statement_with_offset(type, field, offset)
111
113
  end
112
114
 
113
- # Base harvest statement for dbs.
114
- #
115
- def harvest_statement type, field
116
- "SELECT indexed_id, #{field.name} FROM #{snapshot_table_name(type)} st"
117
- end
118
-
119
115
  # Builds a harvest statement for getting data to index.
120
116
  #
121
117
  # TODO Use the adapter for this.
@@ -128,6 +124,18 @@ module Sources
128
124
  "#{statement} st.id > #{offset} LIMIT #{chunksize}"
129
125
  end
130
126
 
127
+ # Base harvest statement for dbs.
128
+ #
129
+ def harvest_statement type, field
130
+ "SELECT indexed_id, #{field.name} FROM #{snapshot_table_name(type)} st"
131
+ end
132
+
133
+ # Override in subclasses.
134
+ #
135
+ def chunksize
136
+ 25_000
137
+ end
138
+
131
139
  end
132
140
 
133
141
  end
@@ -1,5 +1,7 @@
1
1
  module Tokenizers
2
2
 
3
+ # Defines tokenizing processes used both in indexing and querying.
4
+ #
3
5
  class Base
4
6
 
5
7
  # Stopwords.
@@ -21,7 +23,7 @@ module Tokenizers
21
23
 
22
24
  # Contraction.
23
25
  #
24
- def self.contract_expressions what, to_what
26
+ def self.contracts_expressions what, to_what
25
27
  define_method :contract do |text|
26
28
  text.gsub! what, to_what
27
29
  end
@@ -32,7 +34,7 @@ module Tokenizers
32
34
  #
33
35
  # TODO Should there be a legal?
34
36
  #
35
- def self.illegal_characters regexp
37
+ def self.removes_characters regexp
36
38
  define_method :remove_illegals do |text|
37
39
  text.gsub! regexp, ''
38
40
  end
@@ -41,7 +43,7 @@ module Tokenizers
41
43
 
42
44
  # Splitting.
43
45
  #
44
- def self.split_text_on regexp
46
+ def self.splits_text_on regexp
45
47
  define_method :split do |text|
46
48
  text.split regexp
47
49
  end
@@ -50,7 +52,7 @@ module Tokenizers
50
52
 
51
53
  # Normalizing.
52
54
  #
53
- def self.normalize_words regexp_replaces
55
+ def self.normalizes_words regexp_replaces
54
56
  define_method :normalize_with_patterns do |text|
55
57
  regexp_replaces.each do |regex, replace|
56
58
  # This should be sufficient
@@ -65,7 +67,7 @@ module Tokenizers
65
67
 
66
68
  # Illegal after normalizing.
67
69
  #
68
- def self.illegal_characters_after_splitting regexp
70
+ def self.removes_characters_after_splitting regexp
69
71
  define_method :remove_after_normalizing_illegals do |text|
70
72
  text.gsub! regexp, ''
71
73
  end
@@ -9,12 +9,12 @@ module Tokenizers
9
9
 
10
10
  # Default handling definitions. Override in config.
11
11
  #
12
- illegal_characters(//)
12
+ removes_characters(//)
13
13
  stopwords(//)
14
- contract_expressions(//, '')
15
- split_text_on(/\s/)
16
- normalize_words([])
17
- illegal_characters_after_splitting(//)
14
+ contracts_expressions(//, '')
15
+ splits_text_on(/\s/)
16
+ normalizes_words([])
17
+ removes_characters_after_splitting(//)
18
18
 
19
19
  # Default indexing preprocessing hook.
20
20
  #
@@ -4,12 +4,12 @@ module Tokenizers
4
4
 
5
5
  # There are a few class methods that you can use to configure how a query works.
6
6
  #
7
- # illegal_characters regexp
7
+ # removes_characters regexp
8
8
  # removes_characters_after_splitting regexp
9
9
  # stopwords regexp
10
- # contract_expressions regexp, to_string
11
- # split_text_on regexp
12
- # normalize_words [[/regexp1/, 'replacement1'], [/regexp2/, 'replacement2']]
10
+ # contracts_expressions regexp, to_string
11
+ # splits_text_on regexp
12
+ # normalizes_words [[/regexp1/, 'replacement1'], [/regexp2/, 'replacement2']]
13
13
  #
14
14
  class Query < Base
15
15
 
@@ -17,12 +17,12 @@ module Tokenizers
17
17
 
18
18
  # Default query tokenizer behaviour. Override in config.
19
19
  #
20
- illegal_characters(//)
20
+ removes_characters(//)
21
21
  stopwords(//)
22
- contract_expressions(//, '')
23
- split_text_on(/\s/)
24
- normalize_words([])
25
- illegal_characters_after_splitting(//)
22
+ contracts_expressions(//, '')
23
+ splits_text_on(/\s/)
24
+ normalizes_words([])
25
+ removes_characters_after_splitting(//)
26
26
 
27
27
  def preprocess text
28
28
  remove_illegals text # Remove illegal characters
@@ -4,43 +4,50 @@
4
4
  #
5
5
  # Have fun with Picky!
6
6
  #
7
- class PickySearch < Application # The App Constant needs to be identical in application.ru.
7
+ class PickySearch < Application # The App Constant needs to be identical in config.ru.
8
8
 
9
9
  # This is an example with books that you can adapt.
10
10
  #
11
11
  # Note: Much more is possible, but let's start out super easy.
12
12
  #
13
- # Ask me if you have questions or specific requests!
13
+ # Ask me if you have questions or specific requests.
14
14
  #
15
15
 
16
- indexes do
17
- illegal_characters(/[^a-zA-Z0-9\s\/\-\"\&\.]/)
18
- stopwords(/\b(and|the|of|it|in|for)\b/)
19
- split_text_on(/[\s\/\-\"\&\.]/)
16
+ indexing.removes_characters(/[^a-zA-Z0-9\s\/\-\"\&\.]/)
17
+ indexing.stopwords(/\b(and|the|of|it|in|for)\b/)
18
+ indexing.splits_text_on(/[\s\/\-\"\&\.]/)
20
19
 
21
- add_index :books,
22
- Sources::DB.new('SELECT id, title, author, isbn13 as isbn FROM books', :file => 'app/db.yml'),
23
- field(:title, :similarity => Similarity::DoubleLevenshtone.new(3)), # Up to three similar title word indexed.
24
- field(:author),
25
- field(:isbn, :partial => Partial::None.new) # Partially searching on an ISBN makes not much sense.
26
- end
20
+ books_index = index :books,
21
+ Sources::DB.new('SELECT id, title, author, isbn13 as isbn FROM books', :file => 'app/db.yml'),
22
+ field(:title, :similarity => Similarity::DoubleLevenshtone.new(3)), # Up to three similar title word indexed.
23
+ field(:author),
24
+ field(:isbn, :partial => Partial::None.new) # Partially searching on an ISBN makes not much sense.
27
25
 
28
- queries do
29
- maximum_tokens 5
30
- # Note that Picky needs the following characters to
31
- # pass through, as they are control characters: *"~:
32
- #
33
- illegal_characters(/[^a-zA-Z0-9\s\/\-\,\&\"\~\*\:]/)
34
- stopwords(/\b(and|the|of|it|in|for)\b/)
35
- split_text_on(/[\s\/\-\,\&]+/)
36
-
37
- # The example defines two queries that use the same index(es).
38
- #
39
- # A Full query returns ids, combinations, and counts.
40
- # A Live query does return all that Full returns, without ids.
41
- #
42
- route %r{^/books/full}, Query::Full.new(Indexes[:books])
43
- route %r{^/books/live}, Query::Live.new(Indexes[:books])
44
- end
26
+ # Defines the maximum tokens (words) that pass through to the engine.
27
+ #
28
+ querying.maximum_tokens 5
29
+
30
+ # Note that Picky needs the following characters to
31
+ # pass through, as they are control characters: *"~:
32
+ #
33
+ querying.removes_characters(/[^a-zA-Z0-9\s\/\-\,\&\"\~\*\:]/)
34
+ querying.stopwords(/\b(and|the|of|it|in|for)\b/)
35
+ querying.splits_text_on(/[\s\/\-\,\&]+/)
36
+
37
+ # The example defines two queries that use the same index(es).
38
+ #
39
+ # A Full query returns ids, combinations, and counts.
40
+ # A Live query returns everything a Full query returns, except the ids.
41
+ #
42
+ # Note: You can pass a query multiple indexes and it will combine them.
43
+ #
44
+ full_books = Query::Full.new books_index
45
+ live_books = Query::Live.new books_index
46
+
47
+ # Routing is simple.
48
+ # A path regexp pointing to a query that will be run.
49
+ #
50
+ route %r{^/books/full} => full_books
51
+ route %r{^/books/live} => live_books
45
52
 
46
53
  end