runestone 1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +4 -0
- data/.tm_properties +1 -0
- data/Gemfile +4 -0
- data/LICENSE +21 -0
- data/README.md +10 -0
- data/Rakefile +14 -0
- data/db/migrate/20181101150207_create_ts_tables.rb +31 -0
- data/lib/runestone.rb +103 -0
- data/lib/runestone/active_record/base_methods.rb +135 -0
- data/lib/runestone/active_record/relation_methods.rb +83 -0
- data/lib/runestone/corpus.rb +45 -0
- data/lib/runestone/engine.rb +21 -0
- data/lib/runestone/indexing_job.rb +8 -0
- data/lib/runestone/model.rb +92 -0
- data/lib/runestone/settings.rb +106 -0
- data/lib/runestone/version.rb +3 -0
- data/lib/runestone/web_search.rb +203 -0
- data/lib/runestone/web_search/and.rb +17 -0
- data/lib/runestone/web_search/or.rb +11 -0
- data/lib/runestone/web_search/phrase.rb +19 -0
- data/lib/runestone/web_search/token.rb +27 -0
- data/runestone.gemspec +32 -0
- data/test/corpus_test.rb +42 -0
- data/test/database.rb +119 -0
- data/test/delayed_index_test.rb +34 -0
- data/test/helper_test.rb +40 -0
- data/test/highlight_test.rb +26 -0
- data/test/indexing_test.rb +151 -0
- data/test/multi_index_test.rb +177 -0
- data/test/query_test.rb +129 -0
- data/test/synonym_test.rb +128 -0
- data/test/test_helper.rb +185 -0
- metadata +239 -0
@@ -0,0 +1,21 @@
|
|
1
|
+
# Rails engine that exposes Runestone's configuration namespace and its
# migrations to the host application.
class Runestone::Engine < Rails::Engine
  # Options read by the "runestone.set_configs" initializer below.
  config.runestone = ActiveSupport::OrderedOptions.new

  # Appends this engine's expanded "db/migrate" paths to the application's
  # migration paths.
  initializer :append_migrations do |app|
    # Skipped when the application root path contains the engine root path.
    # A plain substring test is used because String#match would compile the
    # path into a regular expression, so characters such as "+", "." or "("
    # in a directory name could mis-match or raise RegexpError.
    unless app.root.to_s.include?(root.to_s)
      config.paths["db/migrate"].expanded.each do |expanded_path|
        app.config.paths["db/migrate"] << expanded_path
      end
    end
  end

  # Copies every option that was set on config.runestone onto the Runestone
  # module; unset options leave the module's current values untouched.
  initializer "runestone.set_configs" do |app|
    options = app.config.runestone

    Runestone.runner = options.runner if options.runner
    Runestone.dictionary = options.dictionary if options.dictionary
    Runestone.job_queue = options.job_queue if options.job_queue
    Runestone.typo_tolerances = options.typo_tolerances if options.typo_tolerances
  end
end
|
@@ -0,0 +1,92 @@
|
|
1
|
+
class Runestone::Model < ActiveRecord::Base
|
2
|
+
|
3
|
+
self.table_name = :runestones
|
4
|
+
|
5
|
+
attr_accessor :highlights
|
6
|
+
|
7
|
+
belongs_to :record, polymorphic: true
|
8
|
+
|
9
|
+
# Computes highlighted versions of the indexed data for a set of results.
#
# Every highlightable value of every record is collected with get_binds, the
# distinct values are highlighted in a single get_highlights call, and the
# results are written to each record's +highlights+ accessor through
# highlight_data.
#
# records - collection of rows; the dictionary of the first record is used
#           for all of them.
# query   - the search string, forwarded to get_highlights.
# prefix  - forwarded to get_highlights.
#
# Returns [] when +records+ is empty, otherwise the result of records.each.
def self.highlight(records, query, prefix: nil)
  return [] if records.empty?

  dictionary = records.first.dictionary

  # The same layout must drive both extraction and distribution, otherwise
  # the shared queue of headlines below would be consumed out of step.
  layouts = Hash.new do |cache, type|
    cache[type] = type.constantize.highlights(dictionary: dictionary)
  end

  binds = []
  records.each do |record|
    binds.concat(get_binds(record.data, layouts[record.record_type]))
  end

  # Highlight each distinct value once, then expand back to one entry per
  # original value using a hash lookup.
  unique = binds.uniq
  headlines = get_highlights(unique, query, prefix: prefix, dictionary: dictionary)
  by_value = unique.zip(headlines).to_h
  queue = binds.map { |value| by_value[value] }

  # highlight_data shifts entries off the front of the shared queue, so the
  # records must be visited in the same order as above.
  records.each do |record|
    record.highlights = highlight_data(
      record.data,
      queue,
      layouts[record.record_type]
    )
  end
end
|
37
|
+
|
38
|
+
# Rebuilds the structure of +data+ using highlighted strings.
#
# data    - Hash of indexed data.
# hlights - Array of highlighted strings in the order get_binds produced the
#           original values; entries are removed from the front as they are
#           used (the array is mutated).
# indexes - layout of highlightable keys; nested layouts are looked up with
#           indexes[key].
#
# Returns a Hash containing only the keys of +indexes+ that are present (and
# truthy) in +data+.
def self.highlight_data(data, hlights, indexes)
  indexes.each_with_object({}) do |(key, _), result|
    value = data[key]
    next unless value

    result[key] =
      case value
      when Hash
        highlight_data(value, hlights, indexes[key])
      when Array
        value.map do |item|
          case item
          when Hash
            highlight_data(item, hlights, indexes[key])
          when Array
            # get_binds contributes one entry per element of a nested array,
            # so the same number has to be consumed here to stay aligned.
            hlights.shift(item.size)
          else
            hlights.shift
          end
        end
      else
        hlights.shift
      end
  end
end
|
59
|
+
|
60
|
+
# Asks the database to highlight +words+ against +query+ using ts_headline.
#
# words      - Array of values to highlight.
# query      - search string; parsed with Runestone::WebSearch.parse and then
#              expanded with typos and synonyms.
# prefix     - forwarded to Runestone::WebSearch.parse.
# dictionary - text search configuration; defaults to Runestone.dictionary.
#
# Returns an Array with one highlighted value per entry of +words+.
def self.get_highlights(words, query, prefix: nil, dictionary: nil)
  # Nothing to highlight: an empty array unnests to zero rows, so the answer
  # is known without a database round trip.
  return [] if words.empty?

  dictionary ||= Runestone.dictionary

  tsquery = Arel::Nodes::TSQuery.new(
    Runestone::WebSearch.parse(query, prefix: prefix).typos.synonymize.to_s,
    language: dictionary
  ).to_sql
  quoted_words = words.map { |word| connection.quote(word) }.join(', ')

  connection.exec_query(<<~SQL).cast_values
    SELECT ts_headline(#{connection.quote(dictionary)}, words, #{tsquery}, 'ShortWord=2')
    FROM unnest(ARRAY[ #{quoted_words} ]::varchar[]) AS words
  SQL
end
|
69
|
+
|
70
|
+
# Collects, in layout order, every value of +hash+ that should be highlighted.
#
# hash      - Hash of indexed data.
# highlight - layout of highlightable keys; nested layouts are looked up with
#             highlight[key].
#
# Values are returned as strings so the list can be sent to the database as a
# single varchar[] literal; nil array elements are kept as nil.
#
# Returns a flat Array.
def self.get_binds(hash, highlight)
  highlight.each_with_object([]) do |(key, _), binds|
    value = hash[key]
    next unless value

    case value
    when Hash
      binds.concat(get_binds(value, highlight[key]))
    when Array
      value.each do |item|
        case item
        when Hash
          binds.concat(get_binds(item, highlight[key]))
        when Array
          # One entry per element of a nested array.
          binds.concat(item.map { |element| element&.to_s })
        else
          binds << item&.to_s
        end
      end
    else
      binds << value.to_s
    end
  end
end
|
91
|
+
|
92
|
+
end
|
@@ -0,0 +1,106 @@
|
|
1
|
+
class Runestone::Settings
|
2
|
+
|
3
|
+
attr_reader :indexes, :dictionary
|
4
|
+
|
5
|
+
# Builds the settings for one index definition.
#
# model      - accepted for the caller's benefit; not used here.
# name       - stored in @name.
# dictionary - stored in @dictionary and exposed through the reader.
# block      - optional configuration block, evaluated in the context of the
#              new instance so it can call +index+ and +attribute+ directly.
def initialize(model, name: , dictionary: , &block)
  @name = name
  @dictionary = dictionary
  @indexes = {}
  # Initialised up front so extract_attributes works even when the block
  # declares no attribute.
  @attributes = {}
  # instance_exec without a block raises, so only evaluate one when given.
  instance_exec(&block) if block
end
|
11
|
+
|
12
|
+
# Declares attribute paths to index at the given weight.
#
# args   - one or more paths; stored as strings.
# weight - key under which the paths are stored in @indexes (default 1).
#
# Paths accumulate, so calling +index+ several times with the same weight
# keeps the paths from every call instead of only the last one.
#
# Returns the Array of paths registered for +weight+.
def index(*args, weight: 1)
  (@indexes[weight] ||= []).concat(args.map(&:to_s))
end
|
15
|
+
|
16
|
+
# Declares attributes to extract from a record.
#
# names - attribute names; stored as symbols.
# block - optional block stored as the source for the attribute. It is only
#         allowed together with a single name.
#
# Raises ArgumentError when a block is given with more than one name.
# Returns +names+.
def attribute(*names, &block)
  if block && names.length > 1
    raise ArgumentError, 'Cannot pass multiple attribute names if block given'
  end

  @attributes ||= {}
  # A nil value marks an attribute without a block.
  names.each { |name| @attributes[name.to_sym] = block }
end
alias :attributes :attribute
|
25
|
+
|
26
|
+
# Reads every declared attribute from +record+.
#
# Attributes declared with a block are computed by evaluating the block in the
# context of the record; all others are read by sending the attribute name to
# the record.
#
# Returns the extracted Hash after it has been passed through remove_nulls.
def extract_attributes(record)
  # @attributes is nil when no attribute was ever declared.
  extracted = (@attributes || {}).each_with_object({}) do |(name, source), values|
    values[name] = source.is_a?(Proc) ? record.instance_exec(&source) : record.send(name)
  end

  remove_nulls(extracted)
end
|
39
|
+
|
40
|
+
# Builds the SQL fragments that make up the tsvector for +data+.
#
# Each indexed value becomes a weighted to_tsvector(...) expression. Values of
# at most five characters use the 'simple' configuration; longer values use
# the configured dictionary.
#
# Raises ArgumentError when an index was declared with a weight other than
# 1, 2, 3 or 4. Such a weight has no tsvector label and would otherwise be
# quoted as a nil weight.
# Returns an Array of SQL strings; ["to_tsvector('')"] when nothing is indexed.
def vectorize(data)
  conn = Runestone::Model.connection
  weights = { 1 => 'A', 2 => 'B', 3 => 'C', 4 => 'D' }
  tsvector = []

  @indexes.each do |weight, paths|
    tsweight = weights.fetch(weight) do
      raise ArgumentError, "Unsupported index weight #{weight.inspect}; expected one of #{weights.keys.inspect}"
    end

    paths.each do |path|
      dig(data, path.to_s.split('.')).each do |value|
        next unless value

        text = value.to_s
        language = text.size <= 5 ? 'simple' : @dictionary
        tsvector << "setweight(to_tsvector(#{conn.quote(language)}, #{conn.quote(text.downcase)}), #{conn.quote(tsweight)})"
      end
    end
  end

  tsvector.empty? ? ["to_tsvector('')"] : tsvector
end
|
58
|
+
|
59
|
+
# Lists the individual words found in the indexed values of +data+.
#
# Values are split on whitespace, downcased, and stripped of leading and
# trailing non-word characters. The Unicode-aware [[:word:]] class is used
# because \W treats accented letters as non-word characters and would strip
# them. Words that are empty after stripping are skipped.
#
# Returns an Array of Strings (duplicates are kept).
def corpus(data)
  words = []

  @indexes.each_value do |paths|
    paths.each do |path|
      dig(data, path.to_s.split('.')).each do |value|
        next unless value

        value.to_s.split(/\s+/).each do |word|
          word = word.downcase.gsub(/\A[^[:word:]]+|[^[:word:]]+\z/, '')
          words << word unless word.empty?
        end
      end
    end
  end

  words
end
|
75
|
+
|
76
|
+
# Recursively cleans a value before it is stored.
#
# - Hash:   each value is cleaned; entries that end up nil or as an empty
#           Hash are dropped.
# - Array:  each element is cleaned; elements that end up nil or empty are
#           dropped. Elements that do not respond to +empty?+ (numbers,
#           booleans) are kept.
# - String: normalised to Unicode NFC.
# - other:  returned unchanged.
#
# Returns a new structure; the argument is not modified.
def remove_nulls(value)
  case value
  when Hash
    value.each_with_object({}) do |(key, item), cleaned|
      item = remove_nulls(item)
      next if item.nil? || (item.is_a?(Hash) && item.empty?)

      cleaned[key] = item
    end
  when Array
    value
      .map { |item| remove_nulls(item) }
      .reject { |item| item.nil? || (item.respond_to?(:empty?) && item.empty?) }
  when String
    value.unicode_normalize(:nfc)
  else
    value
  end
end
|
94
|
+
|
95
|
+
# Collects the values found in +data+ by following +keys+.
#
# data - Hash, Array or scalar.
# keys - Array of path segments; each segment is tried as a symbol key first
#        and as a string key second. The array is not modified.
#
# Arrays are traversed element by element with the remaining keys, and their
# results are flattened with nils removed. A path that ends on a Hash yields
# no values.
#
# Returns an Array of values (a single nil is possible when a key is missing).
def dig(data, keys)
  case data
  when Hash
    key, *rest = keys
    # The path stopped on a Hash: there is no value to index here.
    return [] if key.nil?

    dig(data[key.to_sym] || data[key.to_s], rest)
  when Array
    data.map { |item| dig(item, keys) }.flatten.compact
  else
    [data]
  end
end
|
105
|
+
|
106
|
+
end
|
@@ -0,0 +1,203 @@
|
|
1
|
+
class Runestone::WebSearch
|
2
|
+
autoload :Or, "#{File.dirname(__FILE__)}/web_search/or"
|
3
|
+
autoload :And, "#{File.dirname(__FILE__)}/web_search/and"
|
4
|
+
autoload :Token, "#{File.dirname(__FILE__)}/web_search/token"
|
5
|
+
autoload :Phrase, "#{File.dirname(__FILE__)}/web_search/phrase"
|
6
|
+
|
7
|
+
# A synonym substitution found for part of a query.
#
# index        - position the substitution applies to; synonymize_part stores
#                either a single Integer position or a Range of positions.
# substitution - the replacement object for that position or range.
class Match
  attr_accessor :index, :substitution

  def initialize(index, substitution)
    @index = index
    @substitution = substitution
  end
end
|
14
|
+
|
15
|
+
# A multi-word synonym that has matched so far but is not yet complete.
#
# start_index  - position of the first matched token.
# end_index    - position of the most recently matched token.
# substitution - the remaining lookup table, keyed by the next expected
#                token value.
class PartialMatch
  attr_accessor :start_index, :end_index, :substitution

  def initialize(start_index, end_index, substitution)
    @start_index = start_index
    @end_index = end_index
    @substitution = substitution
  end
end
|
23
|
+
|
24
|
+
attr_accessor :values
|
25
|
+
|
26
|
+
# Parses a user-entered search string into tokens and quoted phrases.
#
# query  - the search string. It is copied before being normalised and
#          downcased, so the caller's string (which may be frozen) is never
#          modified. nil is treated as an empty query.
# prefix - prefix options: :all, :last, :none (default: :last). With :last
#          only a plain token in the final position is marked as a prefix;
#          with :all every non-negated plain token is.
#
# A leading "-" negates a token. Double quotes group tokens into a Phrase.
# The characters ( ) : | ! & * are removed from every token.
#
# Returns a new instance wrapping the parsed values.
def self.parse(query, prefix: :last)
  prefix ||= :last
  query = query.to_s.dup
  begin
    query.unicode_normalize!
  rescue Encoding::CompatibilityError
    # The string's encoding cannot be normalised; continue with it as is.
  end
  query.downcase!

  q = []
  stack = []
  tokens = query.gsub(/\"\s+\"/, '""').split(' ')
  tokens.each_with_index do |token, i|
    token.gsub!(/\(|\)|:|\||!|\&|\*/, '')
    knot = token.start_with?('-')
    token.delete_prefix!('-') if knot

    next if token.empty? || token == '""' || %w(' ").include?(token)

    if token.start_with?('"') && token.end_with?('"')
      token.delete_prefix!('"')
      token.delete_suffix!('"')

      q << Phrase.new([token], negative: knot)
    elsif token.start_with?('"')
      token.delete_prefix!('"')
      stack.push(:phrase)
      q << Phrase.new([Token.new(token)], negative: knot)
    elsif token.end_with?('"') && stack.last == :phrase
      token.delete_suffix!('"')
      q.last.values << Token.new(token)
      stack.pop
    else
      # A closing quote with no phrase open is dropped and the token is
      # handled as a plain token. Inside an open phrase this is a no-op,
      # because tokens ending in a quote are handled by the branch above.
      token.delete_suffix!('"')

      token = Token.new(token, negative: knot)
      if !knot && prefix == :last && tokens.size - 1 == i
        token.prefix = true
      elsif !knot && prefix == :all
        token.prefix = true
      end

      if stack.last == :phrase
        q.last.values << token
      else
        q << token
      end
    end
  end

  new(q)
end
|
81
|
+
|
82
|
+
# values - Array of query parts that make up this search; exposed through the
#          +values+ accessor.
def initialize(values)
  @values = values
end
|
85
|
+
|
86
|
+
# Adds typo alternatives to the plain, non-negated tokens of this search.
#
# The values of those tokens are passed to Runestone::Corpus.similar_words;
# every such token that has an entry in the result is replaced by a copy
# carrying the alternatives in +alts+. Phrases and negated tokens are passed
# through unchanged, and the corpus is not queried when there is no token to
# look up.
#
# Returns a new Runestone::WebSearch.
def typos
  correctable = @values.select { |t| t.is_a?(Token) && !t.negative }
  similar =
    if correctable.empty?
      {}
    else
      Runestone::Corpus.similar_words(*correctable.map(&:value))
    end

  corrected = @values.map do |t|
    if t.is_a?(Token) && !t.negative && similar.key?(t.value)
      Token.new(t.value, prefix: t.prefix, negative: t.negative, alts: similar[t.value])
    else
      t
    end
  end

  Runestone::WebSearch.new(corrected)
end
|
99
|
+
|
100
|
+
# Expands synonyms in this search.
#
# Consecutive plain, non-negated tokens are grouped into runs and each run is
# handed to synonymize_part. Phrases and negated tokens are kept as they are
# and break a run.
#
# Returns a new Runestone::WebSearch.
def synonymize
  parts = []
  @values.each do |token|
    if token.is_a?(Phrase) || token.negative
      parts << token
    elsif parts.last.is_a?(Array)
      # Continue the current run of plain tokens.
      parts.last << token
    else
      parts << [token]
    end
  end

  expanded = parts.map do |part|
    part.is_a?(Array) ? synonymize_part(part) : part
  end

  Runestone::WebSearch.new(expanded)
end
|
121
|
+
|
122
|
+
# Expands synonyms inside one run of plain tokens.
#
# part - Array of tokens; entries that have a single-word synonym match are
#        replaced in place by an Or of the token and its substitutions.
#
# Runestone.synonyms is looked up by token value. A String entry is a
# substitution for that one token; a Hash entry starts a multi-word synonym
# whose keys are the next expected token values.
#
# Returns an And of +part+ when no multi-word synonym matched. Otherwise
# returns an Or with one alternative per group of non-overlapping multi-word
# matches; each alternative covers every token of +part+ exactly once.
def synonymize_part(part)
  pending_matches = []
  matches = []

  part.each_with_index do |token, i|
    # Advance the multi-word synonyms that were waiting for this position.
    pending_matches.select! do |pending|
      continuations = pending.end_index + 1 == i && pending.substitution[token.value]
      next false unless continuations

      extended = false
      continuations.each do |nm|
        if nm.is_a?(Hash)
          # More words are expected: remember the remaining lookup table.
          pending.end_index = i
          pending.substitution = nm
          extended = true
        else
          matches << Match.new(pending.start_index..i, Phrase.new(Array(nm), distance: 1))
        end
      end
      # Keep waiting only if the synonym can still grow.
      extended
    end

    next unless (synonyms = Runestone.synonyms[token.value])

    synonyms.each do |m|
      if m.is_a?(Hash)
        pending_matches << PartialMatch.new(i, i, m)
      else
        matches << Match.new(i, Phrase.new(m.split(/\s+/), distance: 1))
      end
    end
  end

  # Single-position matches are merged straight into +part+; only matches
  # spanning a Range of positions remain afterwards.
  matches.select! do |match|
    next true unless match.index.is_a?(Integer)

    case part[match.index]
    when Or
      part[match.index].values << match.substitution
    else
      part[match.index] = Or.new([part[match.index], match.substitution])
    end

    false
  end

  # Put each match into the first group it does not overlap with.
  groups = matches.each_with_object([]) do |match, memo|
    slot = memo.index { |group| group.none? { |other| other.index.overlaps?(match.index) } }
    if slot
      memo[slot] << match
    else
      memo << [match]
    end
  end

  return And.new(part) if groups.empty?

  orrs = Or.new([])
  groups.each do |group|
    pieces = []
    cursor = 0 # first position of +part+ not yet emitted

    group.sort_by { |m| m.index.begin }.each do |m|
      # Tokens before the match (or between two matches) are kept as they are.
      pieces << And.new(part[cursor...m.index.begin]) if m.index.begin > cursor
      pieces << Or.new([And.new(part[m.index]), m.substitution])
      cursor = m.index.end + 1
    end

    # Trailing tokens, only when there are any.
    pieces << And.new(part[cursor..-1]) if cursor < part.size
    orrs.values << And.new(pieces)
  end
  orrs
end
|
199
|
+
|
200
|
+
# Renders the search as a string by joining its values with " & ".
#
# use_synonyms, allow_typos - accepted for callers that pass them; neither is
# consulted here.
def to_s(use_synonyms: true, allow_typos: true)
  values.join(' & ')
end
|
203
|
+
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
class Runestone::WebSearch::And
|
2
|
+
attr_accessor :values, :negative
|
3
|
+
# values   - Array of parts that must all match.
# negative - when true, to_s renders the conjunction negated (default false).
def initialize(values, negative: false)
  @values = values
  @negative = negative
end
|
7
|
+
|
8
|
+
# Renders the conjunction as a string: the values joined with " & ".
#
# When negated, a conjunction of several values is wrapped in parentheses.
# In tsquery syntax "!" binds more tightly than "&", so "!a & b" would negate
# only "a".
def to_s
  joined = values.map(&:to_s).join(' & ')
  return joined unless negative

  values.size > 1 ? "!(#{joined})" : "!#{joined}"
end
|
17
|
+
end
|