soulheart 0.1.4 → 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/soulheart +69 -31
- data/lib/soulheart/base.rb +9 -8
- data/lib/soulheart/config.rb +44 -2
- data/lib/soulheart/helpers.rb +1 -1
- data/lib/soulheart/loader.rb +8 -8
- data/lib/soulheart/matcher.rb +6 -6
- data/lib/soulheart/server.rb +2 -0
- data/lib/soulheart/version.rb +1 -1
- metadata +3 -17
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 92cb6b53b32cdadf8fe407f940ee941f0c9cc0e4
|
4
|
+
data.tar.gz: 1af49cbdf332eade831a421fa3b47079e130516a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 90287822a0fdb40a18ff3abbbdac18548e7f029886a11dfacad86e5ce6b57f10238d7504e6e5de68199a7a09117071c6f1e1788028dce56dda5ffff3edad40e6
|
7
|
+
data.tar.gz: 24940196b5f5d04374d9e068ef0fdb621dd62559fc01750529651b8b6512a48f9e7ac5cfa7ec03c728c14f997b3418198dd96d8115a8af45339da5e81f127157
|
data/bin/soulheart
CHANGED
@@ -11,6 +11,13 @@ require 'soulheart'
|
|
11
11
|
require 'optparse'
|
12
12
|
require 'tempfile'
|
13
13
|
|
14
|
+
@batch_size = 1000
|
15
|
+
@no_all = false
|
16
|
+
@no_combinatorial = false
|
17
|
+
@normalize_regex = false
|
18
|
+
@normalize_no_sym = false
|
19
|
+
@remove_results = false
|
20
|
+
|
14
21
|
parser = OptionParser.new do |opts|
|
15
22
|
opts.banner = 'Usage: soulheart [options] COMMAND'
|
16
23
|
|
@@ -21,54 +28,68 @@ parser = OptionParser.new do |opts|
|
|
21
28
|
Soulheart.redis = host
|
22
29
|
end
|
23
30
|
|
24
|
-
opts.on('-s', '--stop-words [FILE]', 'Path to file containing a list of stop words to overwrite defaults - "the", "at", "vs"') do |fn|
|
25
|
-
File.open(fn) do |file|
|
26
|
-
Soulheart.stop_words = file.readlines.map(&:strip).reject(&:empty?)
|
27
|
-
end
|
28
|
-
end
|
29
|
-
|
30
31
|
opts.on('-h', '--help', 'Show this message') do
|
31
32
|
puts opts
|
32
33
|
exit
|
33
34
|
end
|
34
35
|
|
35
|
-
opts.
|
36
|
-
|
36
|
+
opts.separator ''
|
37
|
+
opts.separator ' load options:'
|
38
|
+
|
39
|
+
opts.on('-A', '--no-all', 'Do not add items into the "all" category') do |size|
|
40
|
+
@no_all = true
|
37
41
|
end
|
38
42
|
|
39
|
-
opts.on('-C', '--no-combinatorial', 'Do not create combined categories, do not add items to combined categories
|
40
|
-
|
43
|
+
opts.on('-C', '--no-combinatorial', 'Do not create combined categories, do not add items to combined categories') do |size|
|
44
|
+
@no_combinatorial = true
|
41
45
|
end
|
42
46
|
|
43
|
-
opts.
|
44
|
-
|
47
|
+
opts.separator ''
|
48
|
+
opts.separator ' normalize options:'
|
49
|
+
|
50
|
+
opts.on('-s', '--with-symbols', 'Do not remove symbols when normalizing terms') do |size|
|
51
|
+
@normalize_no_sym = true
|
52
|
+
end
|
53
|
+
|
54
|
+
opts.on('-x', '--regex', 'Use the first line from the FILE as the regular expression for normalizing terms') do |size|
|
55
|
+
@normalize_regex = true
|
56
|
+
end
|
57
|
+
|
58
|
+
opts.separator ''
|
59
|
+
opts.separator ' clear options:'
|
60
|
+
|
61
|
+
opts.on('-R', '--remove-results', 'Remove results data - breaks the cache, fully clears all loaded data') do |size|
|
62
|
+
@remove_results = true
|
45
63
|
end
|
46
64
|
|
47
65
|
opts.separator ''
|
48
66
|
opts.separator 'Commands:'
|
49
|
-
opts.separator ' load
|
50
|
-
opts.separator "
|
67
|
+
opts.separator ' load FILE Loads data from a FILE - can be a local file or a url. Accepted formats are .json, .tsv and .csv'
|
68
|
+
opts.separator " stop-words FILE Load file containing a list of stop words to overwrite defaults - 'the', 'at' and 'vs'"
|
69
|
+
opts.separator " normalize Set the way that terms are normalized. Requires a file when including the --regex option"
|
70
|
+
opts.separator " clear Removes existing items and categories from the database"
|
51
71
|
opts.separator ''
|
52
72
|
opts.separator 'Additional info: https://sethherr.github.io/soulheart/commands/'
|
53
73
|
opts.separator ''
|
54
74
|
end
|
55
75
|
|
56
|
-
|
57
|
-
def load(file)
|
76
|
+
def open_file(file)
|
58
77
|
require 'uri'
|
59
78
|
if file =~ URI.regexp
|
60
79
|
require 'open-uri'
|
61
|
-
|
80
|
+
open(file)
|
62
81
|
elsif File.exist?(file)
|
63
|
-
|
82
|
+
File.open(file)
|
64
83
|
else
|
65
|
-
|
66
|
-
return true
|
84
|
+
raise StandardError, "Couldn't open file: #{file}"
|
67
85
|
end
|
86
|
+
end
|
68
87
|
|
88
|
+
def load(file)
|
89
|
+
f = open_file(file)
|
69
90
|
start_time = Time.now.to_i
|
70
91
|
count = 0
|
71
|
-
loader = Soulheart::Loader.new({no_all:
|
92
|
+
loader = Soulheart::Loader.new({no_all: @no_all, no_combinatorial: @no_combinatorial})
|
72
93
|
lines = []
|
73
94
|
begin
|
74
95
|
if file.match(/(c|t)sv\z/i)
|
@@ -81,10 +102,10 @@ def load(file)
|
|
81
102
|
end
|
82
103
|
elsif file.match(/json\z/i)
|
83
104
|
puts 'Reading JSON'
|
84
|
-
puts "Loading items in batches of #{
|
105
|
+
puts "Loading items in batches of #{@batch_size} ..."
|
85
106
|
until f.eof?
|
86
107
|
lines = []
|
87
|
-
|
108
|
+
@batch_size.times do
|
88
109
|
break if f.eof?
|
89
110
|
lines << MultiJson.decode(f.gets)
|
90
111
|
count += 1
|
@@ -100,20 +121,37 @@ def load(file)
|
|
100
121
|
puts "Total time to load: #{Time.now.to_i - start_time} second(s)"
|
101
122
|
end
|
102
123
|
|
103
|
-
def
|
104
|
-
|
124
|
+
def stop_words(file)
|
125
|
+
f = open_file(file)
|
126
|
+
Soulheart.stop_words = f.readlines.map(&:strip).reject(&:empty?)
|
105
127
|
end
|
106
128
|
|
107
|
-
|
129
|
+
def normalize(file=nil)
|
130
|
+
if @normalize_regex
|
131
|
+
f = open_file(file)
|
132
|
+
puts f.readlines.map(&:strip).reject(&:empty?).first
|
133
|
+
Soulheart.normalizer = f.readlines.map(&:strip).reject(&:empty?).first
|
134
|
+
elsif @normalize_no_sym
|
135
|
+
Soulheart.normalizer = ''
|
136
|
+
else
|
137
|
+
Soulheart.normalizer = Soulheart.default_normalizer
|
138
|
+
end
|
139
|
+
end
|
108
140
|
|
141
|
+
def clear
|
142
|
+
Soulheart::Loader.new.clear(@remove_results)
|
143
|
+
end
|
144
|
+
|
145
|
+
parser.parse!
|
109
146
|
case ARGV[0]
|
110
147
|
when 'load'
|
111
|
-
BATCH_SIZE ||= 1000
|
112
|
-
NO_ALL ||= false
|
113
|
-
NO_COMBINED_CATEGORIES ||= false
|
114
148
|
load ARGV[1]
|
115
|
-
when '
|
116
|
-
|
149
|
+
when 'stop-words'
|
150
|
+
stop_words ARGV[1]
|
151
|
+
when 'normalize'
|
152
|
+
ARGV[1] ? normalize(ARGV[1]) : normalize
|
153
|
+
when 'clear'
|
154
|
+
clear
|
117
155
|
load ARGV[1] if ARGV[1]
|
118
156
|
else
|
119
157
|
puts parser.help
|
data/lib/soulheart/base.rb
CHANGED
@@ -12,10 +12,6 @@ module Soulheart
|
|
12
12
|
10 * 60 # Setting to 10 minutes, but making it possible to edit down the line
|
13
13
|
end
|
14
14
|
|
15
|
-
def base_id
|
16
|
-
ENV['RACK_ENV'] != 'test' ? 'soulheart:' : 'soulheart_test:'
|
17
|
-
end
|
18
|
-
|
19
15
|
def sorted_category_array
|
20
16
|
redis.smembers(categories_id).map { |c| normalize(c) }.uniq.sort
|
21
17
|
end
|
@@ -39,7 +35,7 @@ module Soulheart
|
|
39
35
|
end
|
40
36
|
|
41
37
|
def category_combos_id
|
42
|
-
"#{base_id}category_combos:"
|
38
|
+
"#{Soulheart.base_id}category_combos:"
|
43
39
|
end
|
44
40
|
|
45
41
|
def category_combos
|
@@ -47,7 +43,7 @@ module Soulheart
|
|
47
43
|
end
|
48
44
|
|
49
45
|
def categories_id
|
50
|
-
"#{base_id}categories:"
|
46
|
+
"#{Soulheart.base_id}categories:"
|
51
47
|
end
|
52
48
|
|
53
49
|
def hidden_categories_id
|
@@ -63,11 +59,16 @@ module Soulheart
|
|
63
59
|
end
|
64
60
|
|
65
61
|
def results_hashes_id
|
66
|
-
"#{base_id}database:"
|
62
|
+
"#{Soulheart.base_id}database:"
|
67
63
|
end
|
68
64
|
|
65
|
+
def normalize_type_id
|
66
|
+
"#{Soulheart.base_id}normalize:"
|
67
|
+
end
|
68
|
+
|
69
|
+
|
69
70
|
def cache_id(type = 'all')
|
70
|
-
"#{base_id}cache:#{type}:"
|
71
|
+
"#{Soulheart.base_id}cache:#{type}:"
|
71
72
|
end
|
72
73
|
end
|
73
74
|
end
|
data/lib/soulheart/config.rb
CHANGED
@@ -3,7 +3,7 @@ require 'redis'
|
|
3
3
|
|
4
4
|
module Soulheart
|
5
5
|
module Config
|
6
|
-
|
6
|
+
|
7
7
|
|
8
8
|
# Accepts:
|
9
9
|
# 1. A Redis URL String 'redis://host:port/db'
|
@@ -37,12 +37,54 @@ module Soulheart
|
|
37
37
|
)
|
38
38
|
end
|
39
39
|
|
40
|
+
def base_id
|
41
|
+
ENV['RACK_ENV'] != 'test' ? 'soulheart:' : 'soulheart_test:'
|
42
|
+
end
|
43
|
+
|
44
|
+
def stop_words_id
|
45
|
+
"#{base_id}stop_list:"
|
46
|
+
end
|
47
|
+
|
48
|
+
def default_stop_words
|
49
|
+
%w(vs at the)
|
50
|
+
end
|
51
|
+
|
52
|
+
def redis_stop_words
|
53
|
+
return false unless redis.exists stop_words_id
|
54
|
+
redis.lrange(stop_words_id, 0, -1)
|
55
|
+
end
|
56
|
+
|
40
57
|
def stop_words
|
41
|
-
@stop_words ||=
|
58
|
+
@stop_words ||= redis_stop_words || default_stop_words
|
42
59
|
end
|
43
60
|
|
44
61
|
def stop_words=(arr)
|
62
|
+
redis.expire stop_words_id, 0
|
45
63
|
@stop_words = Array(arr).flatten
|
64
|
+
redis.lpush stop_words_id, @stop_words
|
65
|
+
end
|
66
|
+
|
67
|
+
def normalizer_id
|
68
|
+
"#{base_id}normalizer:"
|
69
|
+
end
|
70
|
+
|
71
|
+
def default_normalizer
|
72
|
+
'[^\p{Word}\ ]'
|
73
|
+
end
|
74
|
+
|
75
|
+
def redis_normalizer
|
76
|
+
return false unless redis.exists normalizer_id
|
77
|
+
redis.get normalizer_id
|
78
|
+
end
|
79
|
+
|
80
|
+
def normalizer
|
81
|
+
@normalizer ||= redis_normalizer || default_normalizer
|
82
|
+
end
|
83
|
+
|
84
|
+
def normalizer=(str)
|
85
|
+
redis.expire normalizer_id, 0
|
86
|
+
@normalizer = str
|
87
|
+
redis.set normalizer_id, @normalizer
|
46
88
|
end
|
47
89
|
end
|
48
90
|
end
|
data/lib/soulheart/helpers.rb
CHANGED
@@ -3,7 +3,7 @@ module Soulheart
|
|
3
3
|
module Helpers
|
4
4
|
def normalize(str)
|
5
5
|
# Letter, Mark, Number, Connector_Punctuation (Chinese, Japanese, etc.)
|
6
|
-
str.downcase.gsub(
|
6
|
+
str.downcase.gsub(/#{Soulheart.normalizer}/i, '').strip
|
7
7
|
end
|
8
8
|
|
9
9
|
def prefixes_for_phrase(phrase)
|
data/lib/soulheart/loader.rb
CHANGED
@@ -39,9 +39,9 @@ module Soulheart
|
|
39
39
|
redis.sadd categories_id, categories
|
40
40
|
end
|
41
41
|
|
42
|
-
def delete_data(id="#{base_id}:")
|
42
|
+
def delete_data(id="#{Soulheart.base_id}:")
|
43
43
|
# delete the sorted sets for this type
|
44
|
-
phrases = redis.smembers(base_id)
|
44
|
+
phrases = redis.smembers(Soulheart.base_id)
|
45
45
|
redis.pipelined do
|
46
46
|
phrases.each do |p|
|
47
47
|
redis.del("#{id}#{p}")
|
@@ -72,10 +72,10 @@ module Soulheart
|
|
72
72
|
end
|
73
73
|
|
74
74
|
def load(items)
|
75
|
-
#
|
75
|
+
Soulheart.stop_words # Load stop words so we don't pipeline redis_stop_words accidentally
|
76
76
|
i = 0
|
77
77
|
items.each do |item|
|
78
|
-
item.replace(add_item(item))
|
78
|
+
item.replace(add_item(item)) # Replace with item return so we know we have category_id
|
79
79
|
i += 1
|
80
80
|
end
|
81
81
|
set_category_combos_array.each do |category_combo|
|
@@ -84,10 +84,10 @@ module Soulheart
|
|
84
84
|
next
|
85
85
|
elsif category_combo == 'all'
|
86
86
|
next if @no_all
|
87
|
-
elsif !category_combo.match(item['category'])
|
88
|
-
next
|
89
87
|
elsif @no_combinatorial
|
90
88
|
next
|
89
|
+
elsif !category_combo.match(item['category'])
|
90
|
+
next
|
91
91
|
end
|
92
92
|
add_item(item, category_id(category_combo), true) # send it base
|
93
93
|
i += 1
|
@@ -114,7 +114,7 @@ module Soulheart
|
|
114
114
|
item
|
115
115
|
end
|
116
116
|
|
117
|
-
def add_item(item, category_base_id=nil, cleaned=false)
|
117
|
+
def add_item(item, category_base_id=nil, cleaned=false)
|
118
118
|
item = clean(item) unless cleaned
|
119
119
|
category_base_id ||= category_id(item['category'])
|
120
120
|
priority = (-item['priority'])
|
@@ -125,7 +125,7 @@ module Soulheart
|
|
125
125
|
phrase = ([item['term']] + (item['aliases'] || [])).join(' ')
|
126
126
|
# Store all the prefixes
|
127
127
|
prefixes_for_phrase(phrase).each do |p|
|
128
|
-
redis.sadd(base_id, p) unless cleaned # remember prefix in a master set
|
128
|
+
redis.sadd(Soulheart.base_id, p) unless cleaned # remember prefix in a master set
|
129
129
|
# store the normalized term in the index for each of the categories
|
130
130
|
redis.zadd("#{category_base_id}#{p}", priority, item['term'])
|
131
131
|
end
|
data/lib/soulheart/matcher.rb
CHANGED
@@ -61,10 +61,10 @@ module Soulheart
|
|
61
61
|
redis.expire(@cachekey, cache_duration) # cache_duration is set in base.rb
|
62
62
|
end
|
63
63
|
|
64
|
-
def matching_hashes(
|
65
|
-
return [] unless
|
66
|
-
results = redis.hmget(results_hashes_id, *
|
67
|
-
results = results.reject(&:nil?) # handle cached results for
|
64
|
+
def matching_hashes(terms)
|
65
|
+
return [] unless terms.size > 0
|
66
|
+
results = redis.hmget(results_hashes_id, *terms)
|
67
|
+
results = results.reject(&:nil?) # handle cached results for terms which have since been deleted
|
68
68
|
results.map { |r| MultiJson.decode(r) }
|
69
69
|
end
|
70
70
|
|
@@ -74,8 +74,8 @@ module Soulheart
|
|
74
74
|
limit = @opts['per_page'].to_i + offset - 1
|
75
75
|
|
76
76
|
limit = 0 if limit < 0
|
77
|
-
|
78
|
-
matching_hashes(
|
77
|
+
terms = redis.zrange(@cachekey, offset, limit)
|
78
|
+
matching_hashes(terms)
|
79
79
|
end
|
80
80
|
end
|
81
81
|
end
|
data/lib/soulheart/server.rb
CHANGED
data/lib/soulheart/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: soulheart
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.4
|
4
|
+
version: 0.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Seth Herr
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-07-
|
11
|
+
date: 2015-07-29 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: hiredis
|
@@ -108,20 +108,6 @@ dependencies:
|
|
108
108
|
- - ">="
|
109
109
|
- !ruby/object:Gem::Version
|
110
110
|
version: '0'
|
111
|
-
- !ruby/object:Gem::Dependency
|
112
|
-
name: rubocop
|
113
|
-
requirement: !ruby/object:Gem::Requirement
|
114
|
-
requirements:
|
115
|
-
- - ">="
|
116
|
-
- !ruby/object:Gem::Version
|
117
|
-
version: '0'
|
118
|
-
type: :development
|
119
|
-
prerelease: false
|
120
|
-
version_requirements: !ruby/object:Gem::Requirement
|
121
|
-
requirements:
|
122
|
-
- - ">="
|
123
|
-
- !ruby/object:Gem::Version
|
124
|
-
version: '0'
|
125
111
|
description: Simple, fast autocomplete server for Ruby and Rails
|
126
112
|
email:
|
127
113
|
- seth.william.herr@gmail.com
|
@@ -164,7 +150,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
164
150
|
version: '0'
|
165
151
|
requirements: []
|
166
152
|
rubyforge_project:
|
167
|
-
rubygems_version: 2.
|
153
|
+
rubygems_version: 2.2.2
|
168
154
|
signing_key:
|
169
155
|
specification_version: 4
|
170
156
|
summary: Simple, fast autocomplete server for Ruby and Rails
|