picky 0.0.0 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/picky +14 -0
- data/lib/bundling.rb +10 -0
- data/lib/constants.rb +9 -0
- data/lib/deployment.rb +212 -0
- data/lib/picky/application.rb +40 -0
- data/lib/picky/cacher/convenience.rb +3 -0
- data/lib/picky/cacher/generator.rb +17 -0
- data/lib/picky/cacher/partial/default.rb +7 -0
- data/lib/picky/cacher/partial/none.rb +19 -0
- data/lib/picky/cacher/partial/strategy.rb +7 -0
- data/lib/picky/cacher/partial/subtoken.rb +91 -0
- data/lib/picky/cacher/partial_generator.rb +15 -0
- data/lib/picky/cacher/similarity/default.rb +7 -0
- data/lib/picky/cacher/similarity/double_levenshtone.rb +73 -0
- data/lib/picky/cacher/similarity/none.rb +25 -0
- data/lib/picky/cacher/similarity/strategy.rb +7 -0
- data/lib/picky/cacher/similarity_generator.rb +15 -0
- data/lib/picky/cacher/weights/default.rb +7 -0
- data/lib/picky/cacher/weights/logarithmic.rb +39 -0
- data/lib/picky/cacher/weights/strategy.rb +7 -0
- data/lib/picky/cacher/weights_generator.rb +15 -0
- data/lib/picky/configuration/configuration.rb +13 -0
- data/lib/picky/configuration/field.rb +68 -0
- data/lib/picky/configuration/indexes.rb +60 -0
- data/lib/picky/configuration/queries.rb +32 -0
- data/lib/picky/configuration/type.rb +52 -0
- data/lib/picky/cores.rb +101 -0
- data/lib/picky/db/configuration.rb +23 -0
- data/lib/picky/ext/ruby19/extconf.rb +7 -0
- data/lib/picky/ext/ruby19/performant.c +339 -0
- data/lib/picky/extensions/array.rb +45 -0
- data/lib/picky/extensions/hash.rb +11 -0
- data/lib/picky/extensions/module.rb +15 -0
- data/lib/picky/extensions/symbol.rb +18 -0
- data/lib/picky/generator.rb +156 -0
- data/lib/picky/helpers/cache.rb +23 -0
- data/lib/picky/helpers/gc.rb +11 -0
- data/lib/picky/helpers/measuring.rb +45 -0
- data/lib/picky/helpers/search.rb +27 -0
- data/lib/picky/index/bundle.rb +328 -0
- data/lib/picky/index/category.rb +109 -0
- data/lib/picky/index/combined.rb +38 -0
- data/lib/picky/index/type.rb +30 -0
- data/lib/picky/indexers/base.rb +77 -0
- data/lib/picky/indexers/default.rb +3 -0
- data/lib/picky/indexers/field.rb +13 -0
- data/lib/picky/indexers/no_source_specified_error.rb +5 -0
- data/lib/picky/indexers/solr.rb +60 -0
- data/lib/picky/indexes.rb +180 -0
- data/lib/picky/initializers/ext.rb +6 -0
- data/lib/picky/initializers/mysql.rb +22 -0
- data/lib/picky/loader.rb +287 -0
- data/lib/picky/loggers/search.rb +19 -0
- data/lib/picky/performant/array.rb +23 -0
- data/lib/picky/query/allocation.rb +82 -0
- data/lib/picky/query/allocations.rb +131 -0
- data/lib/picky/query/base.rb +124 -0
- data/lib/picky/query/combination.rb +69 -0
- data/lib/picky/query/combinations.rb +106 -0
- data/lib/picky/query/combinator.rb +92 -0
- data/lib/picky/query/full.rb +15 -0
- data/lib/picky/query/live.rb +22 -0
- data/lib/picky/query/qualifiers.rb +73 -0
- data/lib/picky/query/solr.rb +77 -0
- data/lib/picky/query/token.rb +215 -0
- data/lib/picky/query/tokens.rb +102 -0
- data/lib/picky/query/weigher.rb +159 -0
- data/lib/picky/query/weights.rb +55 -0
- data/lib/picky/rack/harakiri.rb +37 -0
- data/lib/picky/results/base.rb +103 -0
- data/lib/picky/results/full.rb +19 -0
- data/lib/picky/results/live.rb +19 -0
- data/lib/picky/routing.rb +165 -0
- data/lib/picky/signals.rb +11 -0
- data/lib/picky/solr/schema_generator.rb +73 -0
- data/lib/picky/sources/base.rb +19 -0
- data/lib/picky/sources/csv.rb +30 -0
- data/lib/picky/sources/db.rb +77 -0
- data/lib/picky/tokenizers/base.rb +130 -0
- data/lib/picky/tokenizers/default.rb +3 -0
- data/lib/picky/tokenizers/index.rb +73 -0
- data/lib/picky/tokenizers/query.rb +70 -0
- data/lib/picky/umlaut_substituter.rb +21 -0
- data/lib/picky-tasks.rb +6 -0
- data/lib/picky.rb +18 -0
- data/lib/tasks/application.rake +5 -0
- data/lib/tasks/cache.rake +53 -0
- data/lib/tasks/framework.rake +4 -0
- data/lib/tasks/index.rake +29 -0
- data/lib/tasks/server.rake +48 -0
- data/lib/tasks/shortcuts.rake +13 -0
- data/lib/tasks/solr.rake +36 -0
- data/lib/tasks/spec.rake +11 -0
- data/lib/tasks/statistics.rake +13 -0
- data/lib/tasks/try.rake +29 -0
- data/prototype_project/Gemfile +23 -0
- data/prototype_project/Rakefile +1 -0
- data/prototype_project/app/README +6 -0
- data/prototype_project/app/application.rb +50 -0
- data/prototype_project/app/application.ru +29 -0
- data/prototype_project/app/db.yml +10 -0
- data/prototype_project/app/logging.rb +20 -0
- data/prototype_project/app/unicorn.ru +10 -0
- data/prototype_project/log/README +1 -0
- data/prototype_project/script/console +34 -0
- data/prototype_project/tmp/README +0 -0
- data/prototype_project/tmp/pids/README +0 -0
- data/spec/ext/performant_spec.rb +64 -0
- data/spec/lib/application_spec.rb +61 -0
- data/spec/lib/cacher/partial/subtoken_spec.rb +89 -0
- data/spec/lib/cacher/partial_generator_spec.rb +35 -0
- data/spec/lib/cacher/similarity/double_levenshtone_spec.rb +60 -0
- data/spec/lib/cacher/similarity/none_spec.rb +23 -0
- data/spec/lib/cacher/similarity_generator_spec.rb +22 -0
- data/spec/lib/cacher/weights/logarithmic_spec.rb +30 -0
- data/spec/lib/cacher/weights_generator_spec.rb +21 -0
- data/spec/lib/configuration/configuration_spec.rb +38 -0
- data/spec/lib/configuration/type_spec.rb +49 -0
- data/spec/lib/configuration_spec.rb +8 -0
- data/spec/lib/cores_spec.rb +65 -0
- data/spec/lib/extensions/array_spec.rb +37 -0
- data/spec/lib/extensions/hash_spec.rb +11 -0
- data/spec/lib/extensions/module_spec.rb +27 -0
- data/spec/lib/extensions/symbol_spec.rb +85 -0
- data/spec/lib/generator_spec.rb +135 -0
- data/spec/lib/helpers/cache_spec.rb +35 -0
- data/spec/lib/helpers/gc_spec.rb +71 -0
- data/spec/lib/helpers/measuring_spec.rb +18 -0
- data/spec/lib/helpers/search_spec.rb +50 -0
- data/spec/lib/index/bundle_partial_generation_speed_spec.rb +47 -0
- data/spec/lib/index/bundle_spec.rb +260 -0
- data/spec/lib/index/category_spec.rb +203 -0
- data/spec/lib/indexers/base_spec.rb +73 -0
- data/spec/lib/indexers/field_spec.rb +20 -0
- data/spec/lib/loader_spec.rb +48 -0
- data/spec/lib/loggers/search_spec.rb +19 -0
- data/spec/lib/performant/array_spec.rb +13 -0
- data/spec/lib/query/allocation_spec.rb +194 -0
- data/spec/lib/query/allocations_spec.rb +336 -0
- data/spec/lib/query/base_spec.rb +104 -0
- data/spec/lib/query/combination_spec.rb +90 -0
- data/spec/lib/query/combinations_spec.rb +83 -0
- data/spec/lib/query/combinator_spec.rb +112 -0
- data/spec/lib/query/full_spec.rb +22 -0
- data/spec/lib/query/live_spec.rb +61 -0
- data/spec/lib/query/qualifiers_spec.rb +31 -0
- data/spec/lib/query/solr_spec.rb +51 -0
- data/spec/lib/query/token_spec.rb +297 -0
- data/spec/lib/query/tokens_spec.rb +189 -0
- data/spec/lib/query/weights_spec.rb +47 -0
- data/spec/lib/results/base_spec.rb +233 -0
- data/spec/lib/routing_spec.rb +318 -0
- data/spec/lib/solr/schema_generator_spec.rb +42 -0
- data/spec/lib/sources/db_spec.rb +91 -0
- data/spec/lib/tokenizers/base_spec.rb +61 -0
- data/spec/lib/tokenizers/index_spec.rb +51 -0
- data/spec/lib/tokenizers/query_spec.rb +105 -0
- data/spec/lib/umlaut_substituter_spec.rb +84 -0
- data/spec/specific/speed_spec.rb +55 -0
- metadata +371 -15
- data/README.textile +0 -9
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
module Solr

  # Generates a Solr schema.xml by rendering an ERB template with the
  # Solr field names gathered from all configured index types.
  #
  class SchemaGenerator

    attr_reader :types

    # Takes an array of index type configs.
    #
    def initialize configuration
      @types = configuration.types
    end

    # Renders the schema template and writes the resulting schema.xml.
    #
    def generate
      generate_schema_for bound_field_names
    end

    # Returns a binding with the values needed for the schema xml.
    # (The local +field_names+ is referenced from inside the template.)
    #
    def bound_field_names
      field_names = combine_field_names
      binding
    end

    # Collects the unique Solr field names over all types.
    #
    def combine_field_names
      # flat_map + uniq replaces the original manual += accumulation.
      types.flat_map { |type| type.solr_fields.map(&:name) }.uniq
    end

    # Renders the template in the given binding and writes the result.
    #
    def generate_schema_for binding
      template_text = read_template
      result = evaluate_erb template_text, binding
      write result
    end

    # Evaluates the given ERB text in the given binding.
    #
    def evaluate_erb text, binding
      require 'erb'
      ERB.new(text).result binding
    end

    # Reads the schema template from the solr conf directory.
    #
    def read_template
      # File.read replaces the original manual open/assign/read dance.
      File.read File.join(SEARCH_ROOT, 'solr', 'conf', 'schema.xml.erb')
    end

    # Writes the generated schema xml next to the template.
    #
    def write result
      schema_path = File.join SEARCH_ROOT, 'solr', 'conf', 'schema.xml'
      File.open(schema_path, 'w') do |f|
        f << result
      end
    end

  end

end
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
module Sources

  # A source that reads the data to index from a CSV file.
  #
  class CSV < Base

    attr_reader :file_name

    # file_name: path to the CSV file.
    # field_names: the fields each line of the file contains.
    #
    def initialize file_name, *field_names
      @file_name   = file_name
      # Bug fix: the original line read `@field_names` without
      # assigning, so the ivar silently stayed nil.
      @field_names = field_names
    end

    # Counts all the entries that are used for the index.
    #
    # NOTE(review): file_name is interpolated into a shell command —
    # do not pass untrusted input.
    #
    def count type
      # .to_i for consistency with Sources::DB#count, which also
      # returns an Integer ("  30 file\n".to_i => 30).
      `wc -l #{file_name}`.to_i
    end

    # Harvests the data to index, chunked.
    #
    # Returns an open read handle on the file; the offset parameter is
    # currently not used (the caller is expected to seek/skip).
    #
    def harvest offset
      File.open file_name, 'r'
    end

  end

end
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
module Sources

  # A source that snapshots data out of a database via SQL and then
  # harvests it for indexing in chunks.
  #
  class DB < Base

    attr_reader :select_statement, :database

    def initialize select_statement, database_adapter
      @select_statement = select_statement
      @database         = database_adapter
    end

    # Take the snapshot: materializes the select statement into a
    # snapshot table, renames its id to indexed_id, and adds a fresh
    # autoincrementing id for chunked harvesting.
    #
    def take_snapshot type
      database.connect

      table = snapshot_table_name type

      [
        "DROP TABLE IF EXISTS #{table}",
        "CREATE TABLE #{table} AS #{select_statement}",
        "ALTER TABLE #{table} CHANGE COLUMN id indexed_id INTEGER",
        "ALTER TABLE #{table} ADD COLUMN id INTEGER NOT NULL PRIMARY KEY AUTO_INCREMENT"
      ].each { |sql| database.connection.execute sql }

      # Execute any special queries this type needs executed.
      #
      database.connection.execute type.after_indexing if type.after_indexing
    end

    # Counts all the entries that are used for the index.
    #
    def count type
      database.connection.select_value("SELECT COUNT(id) FROM #{snapshot_table_name(type)}").to_i
    end

    # Ok here?
    #
    # Name of the snapshot table for the given type.
    #
    def snapshot_table_name type
      "#{type.name}_type_index"
    end

    # Harvests the data to index, chunked.
    #
    # Subclasses should override harvest_statement to define how their data is found.
    # Example:
    #   "SELECT indexed_id, value FROM bla_table st WHERE kind = 'bla'"
    #
    def harvest type, field, offset, chunksize
      database.connect

      database.connection.execute harvest_statement_with_offset(type, field, offset, chunksize)
    end

    # Base harvest statement for dbs.
    #
    def harvest_statement type, field
      "SELECT indexed_id, #{field.name} FROM #{snapshot_table_name(type)} st"
    end

    # Builds a harvest statement for getting data to index.
    #
    # TODO Use the adapter for this.
    #
    def harvest_statement_with_offset type, field, offset, chunksize
      sql = harvest_statement type, field

      # Append with AND if the statement already filters, else with WHERE.
      sql += sql.include?('WHERE') ? ' AND' : ' WHERE'

      "#{sql} st.id > #{offset} LIMIT #{chunksize}"
    end

  end

end
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
module Tokenizers

  # The Tokenizers
  #
  # Base class for tokenizing strategies. Behaviour is configured via
  # the class-level macros below (stopwords, contract_expressions,
  # illegal_characters, ...), each of which defines the corresponding
  # instance method. The no-op defaults make every hook optional.
  #
  class Base # TODO Rename Strategy.

    # Matches text that consists of just a single (possibly qualified)
    # word. For such text stopwords are NOT removed, so that e.g. an
    # indexed thing actually named like a stopword stays findable.
    #
    # (Replaces the original @@ class variable: the value never changed
    # between assignments, so a constant is the safer equivalent and
    # avoids inheritance-tree-wide class-variable sharing.)
    #
    NON_SINGLE_STOPWORD_REGEXP = /^\b[\w:]+?\b[\.\*\~]?\s?$/

    # Stopwords.
    #
    # Defines #remove_stopwords (destructive gsub!) and
    # #remove_non_single_stopwords.
    #
    def self.stopwords regexp
      define_method :remove_stopwords do |text|
        text.gsub! regexp, ''
      end
      # Use this method if you don't want to remove
      # stopwords if it is just one word.
      #
      define_method :remove_non_single_stopwords do |text|
        return text if text.match NON_SINGLE_STOPWORD_REGEXP
        remove_stopwords text
      end
    end
    def remove_stopwords text; end
    # Robustness: default no-op so subclasses that never call
    # .stopwords still respond to this hook.
    #
    def remove_non_single_stopwords text; end

    # Contraction.
    #
    def self.contract_expressions what, to_what
      define_method :contract do |text|
        text.gsub! what, to_what
      end
    end
    def contract text; end

    # Illegals.
    #
    # TODO Should there be a legal?
    #
    def self.illegal_characters regexp
      define_method :remove_illegals do |text|
        text.gsub! regexp, ''
      end
    end
    def remove_illegals text; end

    # Splitting.
    #
    def self.split_text_on regexp
      define_method :split do |text|
        text.split regexp
      end
    end
    def split text; end

    # Normalizing.
    #
    # regexp_replaces: array of [regexp, replacement] pairs; the first
    # pattern that matches is applied, then the rest are skipped.
    #
    def self.normalize_words regexp_replaces
      define_method :normalize_with_patterns do |text|
        regexp_replaces.each do |regex, replace|
          # This should be sufficient
          #
          text.gsub!(regex, replace) and break
        end
        remove_after_normalizing_illegals text
        text
      end
    end
    def normalize_with_patterns text; end

    # Illegal after normalizing.
    #
    # TODO Rename illegal_after_tokenizing?
    #
    def self.illegal_characters_after regexp
      define_method :remove_after_normalizing_illegals do |text|
        text.gsub! regexp, ''
      end
    end
    def remove_after_normalizing_illegals text; end

    # Returns a number of tokens, generated from the given text.
    #
    # Note:
    #  * preprocess, pretokenize are hooks
    #
    def tokenize text
      text = preprocess text # processing the text
      return empty_tokens if text.blank?
      words = pretokenize text # splitting and preparations for tokenizing
      return empty_tokens if words.empty?
      tokens = tokens_for words # creating tokens / strings
      process tokens # processing tokens / strings
    end

    # Hooks.
    #

    # Preprocessing.
    #
    def preprocess text; end
    # Pretokenizing.
    #
    def pretokenize text; end
    # Postprocessing.
    #
    def process tokens
      reject tokens # Reject any tokens that don't meet criteria
      tokens
    end

    # Rejects blank tokens.
    #
    def reject tokens
      tokens.reject! &:blank?
    end
    # Converts words into real tokens.
    #
    def tokens_for words
      ::Query::Tokens.new words.collect! { |word| token_for word }
    end
    # Turns non-blank text into symbols.
    #
    def symbolize text
      text.blank? ? nil : text.to_sym
    end
    # Returns a tokens object.
    #
    def empty_tokens
      ::Query::Tokens.new
    end

  end

end
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
module Tokenizers

  # The base indexing tokenizer.
  #
  # Override in indexing subclasses and define in configuration.
  #
  class Index < Base

    include UmlautSubstituter

    # Default handling definitions. Override in config.
    # (Each macro defines the corresponding instance hook; the empty
    # regexps make the defaults effectively no-ops.)
    #
    illegal_characters(//)
    stopwords(//)
    contract_expressions(//, '')
    split_text_on(/\s/)
    normalize_words([])
    illegal_characters_after(//)

    # Default indexing preprocessing hook.
    #
    # Does:
    #  1. Umlaut substitution.
    #  2. Downcasing.
    #  3. Remove illegal expressions.
    #  4. Contraction.
    #  5. Remove non-single stopwords. (Stopwords that occur with other words)
    #
    # Note: steps 2-5 mutate text in place (downcase!/gsub!); the
    # (possibly mutated) string is returned explicitly at the end.
    #
    def preprocess text
      text = substitute_umlauts text
      text.downcase!
      remove_illegals text
      contract text
      # we do not remove single stopwords for an entirely different
      # reason than in the query tokenizer.
      # An indexed thing with just name "UND" (a stopword) should not lose its name.
      #
      remove_non_single_stopwords text
      text
    end

    # Default indexing pretokenizing hook.
    #
    # Does:
    #  1. Split the text into words.
    #  2. Normalize each word.
    #
    # TODO Rename into wordize? Or somesuch?
    #
    # Note: normalize_with_patterns mutates each word in place; the
    # block returns the (mutated) word so collect! keeps the objects.
    #
    def pretokenize text
      words = split text
      words.collect! do |word|
        normalize_with_patterns word
        word
      end
    end

    # Does not actually return a token, but a
    # symbol "token".
    #
    def token_for text
      symbolize text
    end

    # Rejects tokens if they are too short (or blank).
    #
    # Override in subclasses to redefine behaviour.
    #
    def reject tokens
      tokens.reject! { |token| token.to_s.size < 2 }
    end

  end
end
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# encoding: utf-8
#
module Tokenizers

  # There are a few class methods that you can use to configure how a query works.
  #
  # illegal_characters regexp
  # illegal_after_normalizing regexp
  # stopwords regexp
  # contract_expressions regexp, to_string
  # split_text_on regexp
  # normalize_words [[/regexp1/, 'replacement1'], [/regexp2/, 'replacement2']]
  #
  class Query < Base

    include UmlautSubstituter

    # Default query tokenizer behaviour. Override in config.
    # (Empty regexps make the defaults effectively no-ops.)
    #
    illegal_characters(//)
    stopwords(//)
    contract_expressions(//, '')
    split_text_on(/\s/)
    normalize_words([])
    illegal_characters_after(//)

    # Query preprocessing hook: cleans the raw query string.
    # Note: the helpers mutate text in place via gsub!.
    #
    def preprocess text
      remove_illegals text             # Remove illegal characters
      remove_non_single_stopwords text # remove stop words
      contract text                    # contract st sankt etc
      text
    end

    # Split the text and put some back together.
    #
    def pretokenize text
      split text
    end

    # Let each token process itself.
    # Reject, limit, and partialize tokens.
    #
    def process tokens
      tokens.tokenize_with self
      tokens.reject          # Reject any tokens that don't meet criteria
      tokens.cap             # Cut off superfluous tokens
      tokens.partialize_last # Set certain tokens as partial
      tokens
    end

    # Called by the token.
    #
    # TODO Perhaps move to Normalizer?
    #
    # Note: mutates text in place (downcase!/gsub!) before symbolizing.
    #
    def normalize text
      text = substitute_umlauts text # Substitute special characters TODO Move to subclass
      text.downcase!                 # Downcase all text
      normalize_with_patterns text   # normalize
      text.to_sym                    # symbolize
    end

    # Returns a token for a word.
    # The basic query tokenizer uses new tokens.
    #
    def token_for word
      ::Query::Token.processed word
    end

  end
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# encoding: utf-8
#
# Transliterates German special characters and strips combining
# diacritical marks from text.
#
# NOTE(review): depends on ActiveSupport's multibyte proxy class —
# confirm ActiveSupport is loaded before this module is used.
#
module UmlautSubstituter
  # Returns a new string with ß replaced by ss, umlauts (Ä/Ö/Ü/ä/ö/ü)
  # replaced by Ae/Oe/Ue/ae/oe/ue, and any remaining combining
  # diacritics (acutes, graves, ...) dropped.
  def substitute_umlauts text
    # :kd = compatibility decomposition, which splits base characters
    # from their combining marks so they can be handled separately.
    trans = ActiveSupport::Multibyte.proxy_class.new(text).normalize(:kd)

    # substitute special cases
    #
    trans.gsub!('ß', 'ss')

    # substitute umlauts (of A,O,U,a,o,u)
    # \314\210 is the octal (UTF-8) escape for U+0308 COMBINING DIAERESIS.
    #
    trans.gsub!(/([AOUaou])\314\210/u, '\1e')

    # get rid of ecutes, graves and …
    # (drop any codepoint in the combining-diacritics range U+0300..U+035F)
    #
    trans.unpack('U*').select { |cp|
      cp < 0x0300 || cp > 0x035F
    }.pack('U*')
  end
end
|
data/lib/picky-tasks.rb
ADDED
data/lib/picky.rb
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# Bootstrap file for the picky framework: loads constants, library
# bundling and the loader, then boots the framework itself.
#
# Require the constants.
#
# TODO Move to app?
#
require File.expand_path(File.join(File.dirname(__FILE__), 'constants'))

# Library bundling.
#
require File.expand_path(File.join(File.dirname(__FILE__), 'bundling'))

# Loader which handles framework and app loading.
#
require File.expand_path(File.join(File.dirname(__FILE__), 'picky', 'loader'))

# Load the framework
#
Loader.load_framework
puts "Loaded picky with environment '#{SEARCH_ENVIRONMENT}' in #{SEARCH_ROOT} on Ruby #{RUBY_VERSION}."
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# Rake tasks for managing the index cache files.
#
namespace :cache do

  namespace :structure do

    desc 'create the directory structure for the cache indexes'
    task :create => :application do
      Indexes.create_directory_structure
      puts "Directory structure generated."
    end

  end

  desc "Generates the index cache files."
  task :generate => :application do
    Indexes.generate_caches
    puts "Caches generated."
  end

  desc "Generates a specific index cache file like field=books:title. Note: Index tables need to be there. Will generate just the cache."
  task :only => :application do
    type_and_field = ENV['FIELD'] || ENV['field']
    # Robustness: fail with a clear message instead of a cryptic
    # NoMethodError on nil when FIELD is not given.
    raise "Usage: rake cache:only FIELD=type:field" unless type_and_field
    type, field = type_and_field.split ':'
    Indexes.generate_cache_only type.to_sym, field.to_sym
  end

  desc 'Checks the index cache files'
  task :check => :application do
    Indexes.check_caches
    puts "All caches look ok."
  end

  desc "Removes the index cache files."
  task :clear => :application do
    Indexes.clear_caches
    puts "All index cache files removed."
  end

  desc 'Backup the index cache files'
  task :backup => :application do
    Indexes.backup_caches
    puts "Index cache files moved to the backup directory"
  end

  desc 'Restore the index cache files'
  task :restore => :application do
    Indexes.restore_caches
    puts "Index cache files restored from the backup directory"
  end

end
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# Global.
#
# Rake tasks for generating the index tables.
#
namespace :index do

  desc "Generates the temp tables: Copies the data from the public entries."
  task :generate_temp_tables => :application do
    Indexes.take_snapshot
  end

  desc "rake index:tables:update"
  task :prepare => :application do
    Indexes.take_snapshot
    Indexes.configuration.index
  end

  # NOTE(review): both tasks below assume the :type and :field task
  # arguments are given; a missing argument raises on nil.to_sym.
  #
  desc "E.g. Generates a specific index table. Note: temp tables need to be there. Will generate just the index table."
  task :only, [:type, :field] => :application do |_, options|
    type, field = options.type, options.field
    Indexes.generate_index_only type.to_sym, field.to_sym
  end

  desc "E.g. Generates a specific index cache file. Note: temp tables need to be there. Will generate index table and cache."
  task :specific, [:type, :field] => :application do |_, options|
    type, field = options.type, options.field
    Indexes.generate_index_only type.to_sym, field.to_sym
    Indexes.generate_cache_only type.to_sym, field.to_sym
  end

end
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
# TODO This file needs some love.
#
# Rake tasks for starting/stopping the unicorn app server.
#
namespace :server do

  # Changes the working directory to the app root.
  #
  def chdir_to_root
    Dir.chdir SEARCH_ROOT
  end

  # Reads the unicorn master pid from tmp/pids; nil when not running.
  #
  def current_pid
    pid = `cat #{File.join(SEARCH_ROOT, 'tmp/pids/unicorn.pid')}`
    pid.blank? ? nil : pid.chomp
  end

  desc "Start the unicorns. Weheee!"
  task :start => :application do
    chdir_to_root
    # Rake::Task[:"solr:start"].invoke # TODO Move to better place.
    config = {}
    config['production'] = {
      :port => 6000,
      :daemonize => true
    }
    config['development'] = {
      :port => 4000,
      :daemonize => false
    }
    # TODO Move port!
    # (Removed the original's unused local `port` – the port is taken
    # from the config hash below.)
    `export SEARCH_ENV=#{SEARCH_ENVIRONMENT}; unicorn -p #{config[SEARCH_ENVIRONMENT][:port]} -c #{File.join(SEARCH_ROOT, 'app/unicorn.ru')} #{config[SEARCH_ENVIRONMENT][:daemonize] ? '-D' : ''} #{File.join(SEARCH_ROOT, 'app/application.ru')}`
  end

  desc "Restart the unicorns!"
  task :restart do
    Rake::Task[:"server:stop"].invoke
    sleep 15
    Rake::Task[:"server:start"].invoke
  end

  desc "Stop the unicorns. Blam!"
  task :stop => :application do
    chdir_to_root
    `kill -QUIT #{current_pid}` if current_pid
    # Rake::Task[:"solr:stop"].invoke # TODO Move to better place.
  end

  # TODO
  #
  desc 'send the USR1 signal to the thin server'
  task :usr1 => :ruby_version do
    puts "Sending USR1 signal to the thin server."
    `pidof thin#{RUBY_VERSION_APPENDIX}`.split.each { |pid| Process.kill('USR1', pid.to_i) }
  end
end
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# Top-level shortcut tasks delegating to the longer-named tasks.
#
desc "Shortcut for indexing and caching."
task :index => :application do
  Indexes.index
end

desc "shortcut for server:start"
task :start do
  Rake::Task[:'server:start'].invoke
end

desc "shortcut for server:stop"
task :stop do
  Rake::Task[:'server:stop'].invoke
end
|