picky 0.11.2 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81)
  1. data/lib/picky/Index_api.rb +49 -0
  2. data/lib/picky/alias_instances.rb +4 -1
  3. data/lib/picky/application.rb +16 -15
  4. data/lib/picky/cacher/partial/{subtoken.rb → substring.rb} +19 -18
  5. data/lib/picky/{character_substitution/european.rb → character_substituters/west_european.rb} +2 -2
  6. data/lib/picky/configuration/index.rb +67 -0
  7. data/lib/picky/cores.rb +3 -0
  8. data/lib/picky/index/bundle.rb +35 -51
  9. data/lib/picky/index/file/basic.rb +39 -5
  10. data/lib/picky/index/file/json.rb +10 -0
  11. data/lib/picky/index/file/marshal.rb +10 -0
  12. data/lib/picky/index/file/text.rb +22 -0
  13. data/lib/picky/index/files.rb +11 -36
  14. data/lib/picky/indexed/bundle.rb +61 -0
  15. data/lib/picky/{index → indexed}/categories.rb +1 -1
  16. data/lib/picky/{index → indexed}/category.rb +13 -16
  17. data/lib/picky/{index/type.rb → indexed/index.rb} +6 -6
  18. data/lib/picky/{index/types.rb → indexed/indexes.rb} +10 -10
  19. data/lib/picky/{index → indexed}/wrappers/exact_first.rb +8 -8
  20. data/lib/picky/indexers/no_source_specified_error.rb +1 -1
  21. data/lib/picky/indexers/serial.rb +64 -0
  22. data/lib/picky/indexers/solr.rb +1 -3
  23. data/lib/picky/indexes_api.rb +41 -0
  24. data/lib/picky/indexing/bundle.rb +43 -13
  25. data/lib/picky/indexing/category.rb +17 -64
  26. data/lib/picky/indexing/{type.rb → index.rb} +13 -3
  27. data/lib/picky/indexing/{types.rb → indexes.rb} +22 -22
  28. data/lib/picky/loader.rb +17 -22
  29. data/lib/picky/query/base.rb +1 -1
  30. data/lib/picky/rack/harakiri.rb +9 -2
  31. data/lib/picky/signals.rb +1 -1
  32. data/lib/picky/sources/base.rb +14 -14
  33. data/lib/picky/sources/couch.rb +8 -7
  34. data/lib/picky/sources/csv.rb +10 -10
  35. data/lib/picky/sources/db.rb +8 -8
  36. data/lib/picky/sources/delicious.rb +2 -2
  37. data/lib/picky/sources/wrappers/location.rb +3 -3
  38. data/lib/picky/tokenizers/base.rb +1 -11
  39. data/lib/picky/tokenizers/index.rb +0 -1
  40. data/lib/picky/tokenizers/query.rb +0 -1
  41. data/lib/tasks/index.rake +4 -4
  42. data/lib/tasks/shortcuts.rake +4 -4
  43. data/lib/tasks/try.rake +8 -8
  44. data/project_prototype/Gemfile +1 -1
  45. data/project_prototype/app/application.rb +13 -12
  46. data/spec/lib/application_spec.rb +10 -38
  47. data/spec/lib/cacher/partial/{subtoken_spec.rb → substring_spec.rb} +0 -0
  48. data/spec/lib/{character_substitution/european_spec.rb → character_substituters/west_european_spec.rb} +6 -2
  49. data/spec/lib/configuration/index_spec.rb +80 -0
  50. data/spec/lib/cores_spec.rb +1 -1
  51. data/spec/lib/index/file/text_spec.rb +1 -1
  52. data/spec/lib/index/files_spec.rb +12 -32
  53. data/spec/lib/indexed/bundle_spec.rb +119 -0
  54. data/spec/lib/{indexing → indexed}/categories_spec.rb +13 -14
  55. data/spec/lib/{index → indexed}/category_spec.rb +6 -6
  56. data/spec/lib/{index/type_spec.rb → indexed/index_spec.rb} +3 -3
  57. data/spec/lib/{index → indexed}/wrappers/exact_first_spec.rb +5 -5
  58. data/spec/lib/indexers/serial_spec.rb +62 -0
  59. data/spec/lib/indexing/bundle_partial_generation_speed_spec.rb +7 -5
  60. data/spec/lib/indexing/bundle_spec.rb +9 -14
  61. data/spec/lib/indexing/category_spec.rb +9 -125
  62. data/spec/lib/indexing/{type_spec.rb → index_spec.rb} +3 -3
  63. data/spec/lib/query/base_spec.rb +1 -1
  64. data/spec/lib/query/full_spec.rb +1 -1
  65. data/spec/lib/query/live_spec.rb +2 -4
  66. data/spec/lib/sources/couch_spec.rb +5 -5
  67. data/spec/lib/sources/db_spec.rb +6 -7
  68. data/spec/lib/tokenizers/base_spec.rb +1 -24
  69. data/spec/lib/tokenizers/query_spec.rb +0 -1
  70. metadata +38 -41
  71. data/lib/picky/bundle.rb +0 -33
  72. data/lib/picky/configuration/indexes.rb +0 -51
  73. data/lib/picky/configuration/queries.rb +0 -15
  74. data/lib/picky/indexers/base.rb +0 -85
  75. data/lib/picky/indexers/default.rb +0 -3
  76. data/lib/picky/type.rb +0 -46
  77. data/lib/picky/types.rb +0 -41
  78. data/lib/tasks/cache.rake +0 -46
  79. data/spec/lib/configuration/indexes_spec.rb +0 -28
  80. data/spec/lib/index/bundle_spec.rb +0 -151
  81. data/spec/lib/indexers/base_spec.rb +0 -89
data/lib/picky/configuration/queries.rb DELETED
@@ -1,15 +0,0 @@
1
- module Configuration
2
-
3
- #
4
- #
5
- class Queries
6
-
7
- #
8
- #
9
- def default_tokenizer options = {}
10
- Tokenizers::Query.default = Tokenizers::Query.new(options)
11
- end
12
-
13
- end
14
-
15
- end
data/lib/picky/indexers/base.rb DELETED
@@ -1,85 +0,0 @@
1
- # encoding: utf-8
2
- module Indexers
3
- # Indexer.
4
- #
5
- # 1. Gets data from the original table and copies it into a "snapshot table".
6
- # 3. Processes the data. I.e. takes the snapshot table data words and tokenizes etc. them. Writes the result into a txt file.
7
- #
8
- class Base
9
-
10
- def initialize type, category
11
- @type = type
12
- @category = category
13
- end
14
-
15
- # Convenience method for getting the right Tokenizer.
16
- #
17
- def tokenizer
18
- @category.tokenizer
19
- end
20
- # Convenience methods for user subclasses.
21
- #
22
- # TODO Duplicate code in Index::Files.
23
- #
24
- # TODO Rename to prepared_index_file_name.
25
- #
26
- def search_index_file_name
27
- @category.search_index_file_name
28
- end
29
-
30
- # Executes the specific strategy.
31
- #
32
- def index
33
- process
34
- end
35
-
36
- # Get the source where the data is taken from.
37
- #
38
- def source
39
- @category.source || raise_no_source
40
- end
41
- def raise_no_source
42
- raise NoSourceSpecifiedException.new("No source given for index:#{@type.name}, category:#{@category.name}.") # TODO field.identifier
43
- end
44
-
45
- # Selects the original id (indexed id) and a column to process. The column data is called "token".
46
- #
47
- # Note: Puts together the parts first in an array, then releasing the array from time to time by joining.
48
- #
49
- def process
50
- comma = ?,
51
- newline = ?\n
52
-
53
- indexing_message
54
-
55
- # TODO Move open to Index::File.
56
- #
57
- # @category.prepared_index do |file|
58
- # source.harvest(@type, @category) do |indexed_id, text|
59
- # tokenizer.tokenize(text).each do |token_text|
60
- # next unless token_text
61
- # file.buffer indexed_id << comma << token_text << newline
62
- # end
63
- # file.write_maybe
64
- # end
65
- # end
66
- #
67
- File.open(search_index_file_name, 'w:binary') do |file|
68
- result = []
69
- source.harvest(@type, @category) do |indexed_id, text|
70
- tokenizer.tokenize(text).each do |token_text|
71
- next unless token_text
72
- result << indexed_id << comma << token_text << newline
73
- end
74
- file.write(result.join) && result.clear if result.size > 100_000
75
- end
76
- file.write result.join
77
- end
78
- end
79
-
80
- def indexing_message
81
- timed_exclaim "INDEX #{@type.name} #{@category.name}" #:#{@category.indexed_as}." # TODO field.identifier
82
- end
83
-
84
- end
85
- end
data/lib/picky/indexers/default.rb DELETED
@@ -1,3 +0,0 @@
1
- module Indexers
2
- Default = Base
3
- end
data/lib/picky/type.rb DELETED
@@ -1,46 +0,0 @@
1
- # This class defines the indexing and index API.
2
- #
3
- # Note: A Type holds both an Index::Type and an Indexing::Type.
4
- #
5
- class Type
6
-
7
- # TODO Delegation.
8
- #
9
-
10
- attr_reader :name, :indexing, :index
11
-
12
- def initialize name, source, options = {}
13
- @name = name
14
- @indexing = Indexing::Type.new name, source, options
15
- @index = Index::Type.new name, options
16
-
17
- # Centralized registry.
18
- #
19
- ::Indexes.register self
20
- end
21
-
22
- # API.
23
- #
24
- # TODO Spec! Doc!
25
- #
26
- def category name, options = {}
27
- name = name.to_sym
28
-
29
- indexing.add_category name, options
30
- index.add_category name, options
31
-
32
- self
33
- end
34
- # def location name, options = {}
35
- # grid = options.delete :grid
36
- # precision = options.delete :precision
37
- #
38
- # options[:index_tokenizer] ||= Tokenizers::Index.new # TODO Or a specific location tokenizer.
39
- # options[:query_tokenizer] ||= Tokenizers::Query.new # TODO Or a specific location tokenizer.
40
- # options[:source_wrapper] ||= Sources::Wrappers::Location.new(options)
41
- #
42
- # new_category = category name, options
43
- # :source => Sources::Wrappers::Location.new(source, grid:2), :tokenizer => Tokenizers::Index.new
44
- # end
45
-
46
- end
data/lib/picky/types.rb DELETED
@@ -1,41 +0,0 @@
1
- # Comfortable API convenience class, splits methods to indexes.
2
- #
3
- class Types
4
-
5
- attr_reader :types, :type_mapping
6
-
7
- delegate :reload,
8
- :load_from_cache,
9
- :to => :@indexes
10
-
11
- delegate :check_caches,
12
- :find,
13
- :generate_cache_only,
14
- :generate_index_only,
15
- :index,
16
- :index_for_tests,
17
- :to => :@indexings
18
-
19
- def initialize
20
- @types = []
21
- @type_mapping = {}
22
-
23
- @indexes = Index::Types.new
24
- @indexings = Indexing::Types.new
25
- end
26
-
27
- def register type
28
- self.types << type
29
- self.type_mapping[type.name] = type
30
-
31
- @indexings.register type.indexing
32
- @indexes.register type.index # TODO Even necessary?
33
- end
34
-
35
- def [] name
36
- name = name.to_sym
37
-
38
- self.type_mapping[name]
39
- end
40
-
41
- end
data/lib/tasks/cache.rake DELETED
@@ -1,46 +0,0 @@
1
- namespace :cache do
2
-
3
- # Move to index namespace.
4
- #
5
-
6
- # desc "Generates the index cache files."
7
- # task :generate => :application do
8
- # Indexes.generate_caches
9
- # puts "Caches generated."
10
- # end
11
-
12
- # desc "Generates a specific index cache file like field=books:title. Note: Index tables need to be there. Will generate just the cache."
13
- # task :only => :application do
14
- # type_and_field = ENV['FIELD'] || ENV['field']
15
- # type, field = type_and_field.split ':'
16
- # Indexes.generate_cache_only type.to_sym, field.to_sym
17
- # end
18
-
19
-
20
- # desc 'Checks the index cache files'
21
- # task :check => :application do
22
- # Indexes.check_caches
23
- # puts "All caches look ok."
24
- # end
25
-
26
-
27
- # desc "Removes the index cache files."
28
- # task :clear => :application do
29
- # Indexes.clear_caches
30
- # puts "All index cache files removed."
31
- # end
32
-
33
-
34
- # desc 'Backup the index cache files'
35
- # task :backup => :application do
36
- # Indexes.backup_caches
37
- # puts "Index cache files moved to the backup directory"
38
- # end
39
-
40
- # desc 'Restore the index cache files'
41
- # task :restore => :application do
42
- # Indexes.restore_caches
43
- # puts "Index cache files restored from the backup directory"
44
- # end
45
-
46
- end
data/spec/lib/configuration/indexes_spec.rb DELETED
@@ -1,28 +0,0 @@
1
- # encoding: utf-8
2
- require 'spec_helper'
3
-
4
- describe Configuration::Indexes do
5
-
6
- before(:each) do
7
- @config = Configuration::Indexes.new
8
- end
9
-
10
- describe "types" do
11
- it "exists" do
12
- lambda { @config.types }.should_not raise_error
13
- end
14
- it "is initially empty" do
15
- @config.types.should be_empty
16
- end
17
- end
18
-
19
- describe "default_tokenizer" do
20
- it "is a default tokenizer" do
21
- @config.default_tokenizer.should be_kind_of(Tokenizers::Index)
22
- end
23
- it "does not cache" do
24
- @config.default_tokenizer.should_not == @config.default_tokenizer
25
- end
26
- end
27
-
28
- end
data/spec/lib/index/bundle_spec.rb DELETED
@@ -1,151 +0,0 @@
1
- require 'spec_helper'
2
-
3
- describe Index::Bundle do
4
-
5
- before(:each) do
6
- @category = stub :category, :name => :some_category
7
- @type = stub :type, :name => :some_type
8
- @similarity = stub :similarity
9
- @index_class = Index::Bundle
10
- @index = @index_class.new :some_name, @category, @type, @similarity
11
- end
12
-
13
- describe 'identifier' do
14
- it 'should return a specific identifier' do
15
- @index.identifier.should == 'some_type: some_name some_category'
16
- end
17
- end
18
-
19
- describe 'initialize_index_for' do
20
- context 'token not yet assigned' do
21
- before(:each) do
22
- @index.stub! :index => {}
23
- end
24
- it 'should assign it an empty array' do
25
- @index.initialize_index_for :some_token
26
-
27
- @index.index[:some_token].should == []
28
- end
29
- end
30
- context 'token already assigned' do
31
- before(:each) do
32
- @index.stub! :index => { :some_token => :already_assigned }
33
- end
34
- it 'should not assign it anymore' do
35
- @index.initialize_index_for :some_token
36
-
37
- @index.index[:some_token].should == :already_assigned
38
- end
39
- end
40
- end
41
-
42
- # TODO
43
- #
44
- # describe 'retrieve' do
45
- # it 'should call the other methods correctly' do
46
- # results = stub :results
47
- # @index.stub! :execute_query => results
48
- # @index.should_receive(:extract).once.with results
49
- #
50
- # @index.retrieve
51
- # end
52
- # end
53
-
54
- describe 'load_from_index_file' do
55
- it 'should call two methods in order' do
56
- @index.should_receive(:load_from_index_generation_message).once.ordered
57
- @index.should_receive(:clear).once.ordered
58
- @index.should_receive(:retrieve).once.ordered
59
-
60
- @index.load_from_index_file
61
- end
62
- end
63
-
64
- describe 'ids' do
65
- before(:each) do
66
- @index.instance_variable_set :@index, { :existing => :some_ids }
67
- end
68
- it 'should return an empty array if not found' do
69
- @index.ids(:non_existing).should == []
70
- end
71
- it 'should return the ids if found' do
72
- @index.ids(:existing).should == :some_ids
73
- end
74
- end
75
-
76
- describe 'weight' do
77
- before(:each) do
78
- @index.instance_variable_set :@weights, { :existing => :specific }
79
- end
80
- it 'should return nil' do
81
- @index.weight(:non_existing).should == nil
82
- end
83
- it 'should return the weight for the text' do
84
- @index.weight(:existing).should == :specific
85
- end
86
- end
87
-
88
- describe 'load' do
89
- it 'should trigger loads' do
90
- @index.should_receive(:load_index).once.with
91
- @index.should_receive(:load_similarity).once.with
92
- @index.should_receive(:load_weights).once.with
93
-
94
- @index.load
95
- end
96
- end
97
- describe "loading indexes" do
98
- before(:each) do
99
- @index.stub! :timed_exclaim
100
- end
101
- describe "load_index" do
102
- it "uses the right file" do
103
- Yajl::Parser.stub! :parse
104
-
105
- File.should_receive(:open).once.with 'some/search/root/index/test/some_type/some_name_some_category_index.json', 'r'
106
-
107
- @index.load_index
108
- end
109
- end
110
- describe "load_similarity" do
111
- it "uses the right file" do
112
- Marshal.stub! :load
113
-
114
- File.should_receive(:open).once.with 'some/search/root/index/test/some_type/some_name_some_category_similarity.dump', 'r:binary'
115
-
116
- @index.load_similarity
117
- end
118
- end
119
- describe "load_weights" do
120
- it "uses the right file" do
121
- Yajl::Parser.stub! :parse
122
-
123
- File.should_receive(:open).once.with 'some/search/root/index/test/some_type/some_name_some_category_weights.json', 'r'
124
-
125
- @index.load_weights
126
- end
127
- end
128
- end
129
-
130
- describe 'initialization' do
131
- before(:each) do
132
- @category = stub :category, :name => :some_category
133
- @type = stub :type, :name => :some_type
134
-
135
- @index = @index_class.new :some_name, @category, @type, :similarity
136
- end
137
- it 'should initialize the index correctly' do
138
- @index.index.should == {}
139
- end
140
- it 'should initialize the weights index correctly' do
141
- @index.weights.should == {}
142
- end
143
- it 'should initialize the similarity index correctly' do
144
- @index.similarity.should == {}
145
- end
146
- it 'should initialize the similarity strategy correctly' do
147
- @index.similarity_strategy.should == :similarity
148
- end
149
- end
150
-
151
- end
data/spec/lib/indexers/base_spec.rb DELETED
@@ -1,89 +0,0 @@
1
- require 'spec_helper'
2
-
3
- describe Indexers::Base do
4
-
5
- before(:each) do
6
- @type = stub :type,
7
- :name => :some_type,
8
- :snapshot_table_name => :some_prepared_table_name
9
- @field = stub :field,
10
- :name => :some_field_name,
11
- :search_index_file_name => :some_search_index_name,
12
- :indexed_name => :some_indexed_field_name
13
- @indexer = Indexers::Base.new @type, @field
14
- @indexer.stub! :timed_exclaim
15
- end
16
-
17
- describe "tokenizer" do
18
- it "delegates to the field" do
19
- @field.should_receive(:tokenizer).once.with
20
-
21
- @indexer.tokenizer
22
- end
23
- end
24
-
25
- describe "indexing_message" do
26
- it "informs the user about what it is going to index" do
27
- @indexer.should_receive(:timed_exclaim).once.with 'INDEX some_type some_field_name'
28
-
29
- @indexer.indexing_message
30
- end
31
- end
32
-
33
- describe "tokenizer" do
34
- it "should delegate to field" do
35
- @indexer.should_receive(:tokenizer).once.with
36
-
37
- @indexer.tokenizer
38
- end
39
- end
40
-
41
- describe 'search_index_file_name' do
42
- it 'should return a specific name' do
43
- @indexer.search_index_file_name.should == :some_search_index_name
44
- end
45
- end
46
-
47
- describe "index" do
48
- it "should execute! the indexer" do
49
- @indexer.should_receive(:process).once.with
50
-
51
- @indexer.index
52
- end
53
- end
54
-
55
- describe "source" do
56
- before(:each) do
57
- @source = stub :source
58
- end
59
- context "field has one" do
60
- before(:each) do
61
- @field.stub! :source => @source
62
- end
63
- it "should return that one" do
64
- @indexer.source.should == @source
65
- end
66
- end
67
- context "field doesn't have one" do
68
- before(:each) do
69
- @field.stub! :source => nil
70
- end
71
- it "should call raise_no_source" do
72
- @indexer.should_receive(:raise_no_source).once.with
73
-
74
- @indexer.source
75
- end
76
- end
77
- end
78
-
79
- describe "raise_no_source" do
80
- it "should raise" do
81
- lambda { @indexer.raise_no_source }.should raise_error(Indexers::NoSourceSpecifiedException)
82
- end
83
- end
84
-
85
- describe "chunked" do
86
-
87
- end
88
-
89
- end