soulheart 0.1.4 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/soulheart +69 -31
- data/lib/soulheart/base.rb +9 -8
- data/lib/soulheart/config.rb +44 -2
- data/lib/soulheart/helpers.rb +1 -1
- data/lib/soulheart/loader.rb +8 -8
- data/lib/soulheart/matcher.rb +6 -6
- data/lib/soulheart/server.rb +2 -0
- data/lib/soulheart/version.rb +1 -1
- metadata +3 -17
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 92cb6b53b32cdadf8fe407f940ee941f0c9cc0e4
|
4
|
+
data.tar.gz: 1af49cbdf332eade831a421fa3b47079e130516a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 90287822a0fdb40a18ff3abbbdac18548e7f029886a11dfacad86e5ce6b57f10238d7504e6e5de68199a7a09117071c6f1e1788028dce56dda5ffff3edad40e6
|
7
|
+
data.tar.gz: 24940196b5f5d04374d9e068ef0fdb621dd62559fc01750529651b8b6512a48f9e7ac5cfa7ec03c728c14f997b3418198dd96d8115a8af45339da5e81f127157
|
data/bin/soulheart
CHANGED
@@ -11,6 +11,13 @@ require 'soulheart'
|
|
11
11
|
require 'optparse'
|
12
12
|
require 'tempfile'
|
13
13
|
|
14
|
+
@batch_size = 1000
|
15
|
+
@no_all = false
|
16
|
+
@no_combinatorial = false
|
17
|
+
@normalize_regex = false
|
18
|
+
@normalize_no_sym = false
|
19
|
+
@remove_results = false
|
20
|
+
|
14
21
|
parser = OptionParser.new do |opts|
|
15
22
|
opts.banner = 'Usage: soulheart [options] COMMAND'
|
16
23
|
|
@@ -21,54 +28,68 @@ parser = OptionParser.new do |opts|
|
|
21
28
|
Soulheart.redis = host
|
22
29
|
end
|
23
30
|
|
24
|
-
opts.on('-s', '--stop-words [FILE]', 'Path to file containing a list of stop words to overwrite defaults - "the", "at", "vs"') do |fn|
|
25
|
-
File.open(fn) do |file|
|
26
|
-
Soulheart.stop_words = file.readlines.map(&:strip).reject(&:empty?)
|
27
|
-
end
|
28
|
-
end
|
29
|
-
|
30
31
|
opts.on('-h', '--help', 'Show this message') do
|
31
32
|
puts opts
|
32
33
|
exit
|
33
34
|
end
|
34
35
|
|
35
|
-
opts.
|
36
|
-
|
36
|
+
opts.separator ''
|
37
|
+
opts.separator ' load options:'
|
38
|
+
|
39
|
+
opts.on('-A', '--no-all', 'Do not add items into the "all" category') do |size|
|
40
|
+
@no_all = true
|
37
41
|
end
|
38
42
|
|
39
|
-
opts.on('-C', '--no-combinatorial', 'Do not create combined categories, do not add items to combined categories
|
40
|
-
|
43
|
+
opts.on('-C', '--no-combinatorial', 'Do not create combined categories, do not add items to combined categories') do |size|
|
44
|
+
@no_combinatorial = true
|
41
45
|
end
|
42
46
|
|
43
|
-
opts.
|
44
|
-
|
47
|
+
opts.separator ''
|
48
|
+
opts.separator ' normalize options:'
|
49
|
+
|
50
|
+
opts.on('-s', '--with-symbols', 'Do not remove symbols when normalizing terms') do |size|
|
51
|
+
@normalize_no_sym = true
|
52
|
+
end
|
53
|
+
|
54
|
+
opts.on('-x', '--regex', 'Use the first line from the FILE as the regular expression for normalizing terms') do |size|
|
55
|
+
@normalize_regex = true
|
56
|
+
end
|
57
|
+
|
58
|
+
opts.separator ''
|
59
|
+
opts.separator ' clear options:'
|
60
|
+
|
61
|
+
opts.on('-R', '--remove-results', 'Remove results data - breaks the cache, fully clears all loaded data') do |size|
|
62
|
+
@remove_results = true
|
45
63
|
end
|
46
64
|
|
47
65
|
opts.separator ''
|
48
66
|
opts.separator 'Commands:'
|
49
|
-
opts.separator ' load
|
50
|
-
opts.separator "
|
67
|
+
opts.separator ' load FILE Loads data from a FILE - can be a local file or a url. Accepted formats are .json, .tsv and .csv'
|
68
|
+
opts.separator " stop-words FILE Load file containing a list of stop words to overwrite defaults - 'the', 'at' and 'vs'"
|
69
|
+
opts.separator " normalize Set the way that terms are normalized. Requires a file when including the --regex option"
|
70
|
+
opts.separator " clear Removes existing items and categories from the database"
|
51
71
|
opts.separator ''
|
52
72
|
opts.separator 'Additional info: https://sethherr.github.io/soulheart/commands/'
|
53
73
|
opts.separator ''
|
54
74
|
end
|
55
75
|
|
56
|
-
|
57
|
-
def load(file)
|
76
|
+
def open_file(file)
|
58
77
|
require 'uri'
|
59
78
|
if file =~ URI.regexp
|
60
79
|
require 'open-uri'
|
61
|
-
|
80
|
+
open(file)
|
62
81
|
elsif File.exist?(file)
|
63
|
-
|
82
|
+
File.open(file)
|
64
83
|
else
|
65
|
-
|
66
|
-
return true
|
84
|
+
raise StandardError, "Couldn't open file: #{file}"
|
67
85
|
end
|
86
|
+
end
|
68
87
|
|
88
|
+
def load(file)
|
89
|
+
f = open_file(file)
|
69
90
|
start_time = Time.now.to_i
|
70
91
|
count = 0
|
71
|
-
loader = Soulheart::Loader.new({no_all:
|
92
|
+
loader = Soulheart::Loader.new({no_all: @no_all, no_combinatorial: @no_combinatorial})
|
72
93
|
lines = []
|
73
94
|
begin
|
74
95
|
if file.match(/(c|t)sv\z/i)
|
@@ -81,10 +102,10 @@ def load(file)
|
|
81
102
|
end
|
82
103
|
elsif file.match(/json\z/i)
|
83
104
|
puts 'Reading JSON'
|
84
|
-
puts "Loading items in batches of #{
|
105
|
+
puts "Loading items in batches of #{@batch_size} ..."
|
85
106
|
until f.eof?
|
86
107
|
lines = []
|
87
|
-
|
108
|
+
@batch_size.times do
|
88
109
|
break if f.eof?
|
89
110
|
lines << MultiJson.decode(f.gets)
|
90
111
|
count += 1
|
@@ -100,20 +121,37 @@ def load(file)
|
|
100
121
|
puts "Total time to load: #{Time.now.to_i - start_time} second(s)"
|
101
122
|
end
|
102
123
|
|
103
|
-
def
|
104
|
-
|
124
|
+
def stop_words(file)
|
125
|
+
f = open_file(file)
|
126
|
+
Soulheart.stop_words = f.readlines.map(&:strip).reject(&:empty?)
|
105
127
|
end
|
106
128
|
|
107
|
-
|
129
|
+
def normalize(file=nil)
|
130
|
+
if @normalize_regex
|
131
|
+
f = open_file(file)
|
132
|
+
puts f.readlines.map(&:strip).reject(&:empty?).first
|
133
|
+
Soulheart.normalizer = f.readlines.map(&:strip).reject(&:empty?).first
|
134
|
+
elsif @normalize_no_sym
|
135
|
+
Soulheart.normalizer = ''
|
136
|
+
else
|
137
|
+
Soulheart.normalizer = Soulheart.default_normalizer
|
138
|
+
end
|
139
|
+
end
|
108
140
|
|
141
|
+
def clear
|
142
|
+
Soulheart::Loader.new.clear(@remove_results)
|
143
|
+
end
|
144
|
+
|
145
|
+
parser.parse!
|
109
146
|
case ARGV[0]
|
110
147
|
when 'load'
|
111
|
-
BATCH_SIZE ||= 1000
|
112
|
-
NO_ALL ||= false
|
113
|
-
NO_COMBINED_CATEGORIES ||= false
|
114
148
|
load ARGV[1]
|
115
|
-
when '
|
116
|
-
|
149
|
+
when 'stop-words'
|
150
|
+
stop_words ARGV[1]
|
151
|
+
when 'normalize'
|
152
|
+
ARGV[1] ? normalize(ARGV[1]) : normalize
|
153
|
+
when 'clear'
|
154
|
+
clear
|
117
155
|
load ARGV[1] if ARGV[1]
|
118
156
|
else
|
119
157
|
puts parser.help
|
data/lib/soulheart/base.rb
CHANGED
@@ -12,10 +12,6 @@ module Soulheart
|
|
12
12
|
10 * 60 # Setting to 10 minutes, but making it possible to edit down the line
|
13
13
|
end
|
14
14
|
|
15
|
-
def base_id
|
16
|
-
ENV['RACK_ENV'] != 'test' ? 'soulheart:' : 'soulheart_test:'
|
17
|
-
end
|
18
|
-
|
19
15
|
def sorted_category_array
|
20
16
|
redis.smembers(categories_id).map { |c| normalize(c) }.uniq.sort
|
21
17
|
end
|
@@ -39,7 +35,7 @@ module Soulheart
|
|
39
35
|
end
|
40
36
|
|
41
37
|
def category_combos_id
|
42
|
-
"#{base_id}category_combos:"
|
38
|
+
"#{Soulheart.base_id}category_combos:"
|
43
39
|
end
|
44
40
|
|
45
41
|
def category_combos
|
@@ -47,7 +43,7 @@ module Soulheart
|
|
47
43
|
end
|
48
44
|
|
49
45
|
def categories_id
|
50
|
-
"#{base_id}categories:"
|
46
|
+
"#{Soulheart.base_id}categories:"
|
51
47
|
end
|
52
48
|
|
53
49
|
def hidden_categories_id
|
@@ -63,11 +59,16 @@ module Soulheart
|
|
63
59
|
end
|
64
60
|
|
65
61
|
def results_hashes_id
|
66
|
-
"#{base_id}database:"
|
62
|
+
"#{Soulheart.base_id}database:"
|
67
63
|
end
|
68
64
|
|
65
|
+
def normalize_type_id
|
66
|
+
"#{Soulheart.base_id}normalize:"
|
67
|
+
end
|
68
|
+
|
69
|
+
|
69
70
|
def cache_id(type = 'all')
|
70
|
-
"#{base_id}cache:#{type}:"
|
71
|
+
"#{Soulheart.base_id}cache:#{type}:"
|
71
72
|
end
|
72
73
|
end
|
73
74
|
end
|
data/lib/soulheart/config.rb
CHANGED
@@ -3,7 +3,7 @@ require 'redis'
|
|
3
3
|
|
4
4
|
module Soulheart
|
5
5
|
module Config
|
6
|
-
|
6
|
+
|
7
7
|
|
8
8
|
# Accepts:
|
9
9
|
# 1. A Redis URL String 'redis://host:port/db'
|
@@ -37,12 +37,54 @@ module Soulheart
|
|
37
37
|
)
|
38
38
|
end
|
39
39
|
|
40
|
+
def base_id
|
41
|
+
ENV['RACK_ENV'] != 'test' ? 'soulheart:' : 'soulheart_test:'
|
42
|
+
end
|
43
|
+
|
44
|
+
def stop_words_id
|
45
|
+
"#{base_id}stop_list:"
|
46
|
+
end
|
47
|
+
|
48
|
+
def default_stop_words
|
49
|
+
%w(vs at the)
|
50
|
+
end
|
51
|
+
|
52
|
+
def redis_stop_words
|
53
|
+
return false unless redis.exists stop_words_id
|
54
|
+
redis.lrange(stop_words_id, 0, -1)
|
55
|
+
end
|
56
|
+
|
40
57
|
def stop_words
|
41
|
-
@stop_words ||=
|
58
|
+
@stop_words ||= redis_stop_words || default_stop_words
|
42
59
|
end
|
43
60
|
|
44
61
|
def stop_words=(arr)
|
62
|
+
redis.expire stop_words_id, 0
|
45
63
|
@stop_words = Array(arr).flatten
|
64
|
+
redis.lpush stop_words_id, @stop_words
|
65
|
+
end
|
66
|
+
|
67
|
+
def normalizer_id
|
68
|
+
"#{base_id}normalizer:"
|
69
|
+
end
|
70
|
+
|
71
|
+
def default_normalizer
|
72
|
+
'[^\p{Word}\ ]'
|
73
|
+
end
|
74
|
+
|
75
|
+
def redis_normalizer
|
76
|
+
return false unless redis.exists normalizer_id
|
77
|
+
redis.get normalizer_id
|
78
|
+
end
|
79
|
+
|
80
|
+
def normalizer
|
81
|
+
@normalizer ||= redis_normalizer || default_normalizer
|
82
|
+
end
|
83
|
+
|
84
|
+
def normalizer=(str)
|
85
|
+
redis.expire normalizer_id, 0
|
86
|
+
@normalizer = str
|
87
|
+
redis.set normalizer_id, @normalizer
|
46
88
|
end
|
47
89
|
end
|
48
90
|
end
|
data/lib/soulheart/helpers.rb
CHANGED
@@ -3,7 +3,7 @@ module Soulheart
|
|
3
3
|
module Helpers
|
4
4
|
def normalize(str)
|
5
5
|
# Letter, Mark, Number, Connector_Punctuation (Chinese, Japanese, etc.)
|
6
|
-
str.downcase.gsub(
|
6
|
+
str.downcase.gsub(/#{Soulheart.normalizer}/i, '').strip
|
7
7
|
end
|
8
8
|
|
9
9
|
def prefixes_for_phrase(phrase)
|
data/lib/soulheart/loader.rb
CHANGED
@@ -39,9 +39,9 @@ module Soulheart
|
|
39
39
|
redis.sadd categories_id, categories
|
40
40
|
end
|
41
41
|
|
42
|
-
def delete_data(id="#{base_id}:")
|
42
|
+
def delete_data(id="#{Soulheart.base_id}:")
|
43
43
|
# delete the sorted sets for this type
|
44
|
-
phrases = redis.smembers(base_id)
|
44
|
+
phrases = redis.smembers(Soulheart.base_id)
|
45
45
|
redis.pipelined do
|
46
46
|
phrases.each do |p|
|
47
47
|
redis.del("#{id}#{p}")
|
@@ -72,10 +72,10 @@ module Soulheart
|
|
72
72
|
end
|
73
73
|
|
74
74
|
def load(items)
|
75
|
-
#
|
75
|
+
Soulheart.stop_words # Load stop words so we don't pipeline redis_stop_words accidentally
|
76
76
|
i = 0
|
77
77
|
items.each do |item|
|
78
|
-
item.replace(add_item(item))
|
78
|
+
item.replace(add_item(item)) # Replace with item return so we know we have category_id
|
79
79
|
i += 1
|
80
80
|
end
|
81
81
|
set_category_combos_array.each do |category_combo|
|
@@ -84,10 +84,10 @@ module Soulheart
|
|
84
84
|
next
|
85
85
|
elsif category_combo == 'all'
|
86
86
|
next if @no_all
|
87
|
-
elsif !category_combo.match(item['category'])
|
88
|
-
next
|
89
87
|
elsif @no_combinatorial
|
90
88
|
next
|
89
|
+
elsif !category_combo.match(item['category'])
|
90
|
+
next
|
91
91
|
end
|
92
92
|
add_item(item, category_id(category_combo), true) # send it base
|
93
93
|
i += 1
|
@@ -114,7 +114,7 @@ module Soulheart
|
|
114
114
|
item
|
115
115
|
end
|
116
116
|
|
117
|
-
def add_item(item, category_base_id=nil, cleaned=false)
|
117
|
+
def add_item(item, category_base_id=nil, cleaned=false)
|
118
118
|
item = clean(item) unless cleaned
|
119
119
|
category_base_id ||= category_id(item['category'])
|
120
120
|
priority = (-item['priority'])
|
@@ -125,7 +125,7 @@ module Soulheart
|
|
125
125
|
phrase = ([item['term']] + (item['aliases'] || [])).join(' ')
|
126
126
|
# Store all the prefixes
|
127
127
|
prefixes_for_phrase(phrase).each do |p|
|
128
|
-
redis.sadd(base_id, p) unless cleaned # remember prefix in a master set
|
128
|
+
redis.sadd(Soulheart.base_id, p) unless cleaned # remember prefix in a master set
|
129
129
|
# store the normalized term in the index for each of the categories
|
130
130
|
redis.zadd("#{category_base_id}#{p}", priority, item['term'])
|
131
131
|
end
|
data/lib/soulheart/matcher.rb
CHANGED
@@ -61,10 +61,10 @@ module Soulheart
|
|
61
61
|
redis.expire(@cachekey, cache_duration) # cache_duration is set in base.rb
|
62
62
|
end
|
63
63
|
|
64
|
-
def matching_hashes(
|
65
|
-
return [] unless
|
66
|
-
results = redis.hmget(results_hashes_id, *
|
67
|
-
results = results.reject(&:nil?) # handle cached results for
|
64
|
+
def matching_hashes(terms)
|
65
|
+
return [] unless terms.size > 0
|
66
|
+
results = redis.hmget(results_hashes_id, *terms)
|
67
|
+
results = results.reject(&:nil?) # handle cached results for terms which have since been deleted
|
68
68
|
results.map { |r| MultiJson.decode(r) }
|
69
69
|
end
|
70
70
|
|
@@ -74,8 +74,8 @@ module Soulheart
|
|
74
74
|
limit = @opts['per_page'].to_i + offset - 1
|
75
75
|
|
76
76
|
limit = 0 if limit < 0
|
77
|
-
|
78
|
-
matching_hashes(
|
77
|
+
terms = redis.zrange(@cachekey, offset, limit)
|
78
|
+
matching_hashes(terms)
|
79
79
|
end
|
80
80
|
end
|
81
81
|
end
|
data/lib/soulheart/server.rb
CHANGED
data/lib/soulheart/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: soulheart
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.4
|
4
|
+
version: 0.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Seth Herr
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-07-
|
11
|
+
date: 2015-07-29 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: hiredis
|
@@ -108,20 +108,6 @@ dependencies:
|
|
108
108
|
- - ">="
|
109
109
|
- !ruby/object:Gem::Version
|
110
110
|
version: '0'
|
111
|
-
- !ruby/object:Gem::Dependency
|
112
|
-
name: rubocop
|
113
|
-
requirement: !ruby/object:Gem::Requirement
|
114
|
-
requirements:
|
115
|
-
- - ">="
|
116
|
-
- !ruby/object:Gem::Version
|
117
|
-
version: '0'
|
118
|
-
type: :development
|
119
|
-
prerelease: false
|
120
|
-
version_requirements: !ruby/object:Gem::Requirement
|
121
|
-
requirements:
|
122
|
-
- - ">="
|
123
|
-
- !ruby/object:Gem::Version
|
124
|
-
version: '0'
|
125
111
|
description: Simple, fast autocomplete server for Ruby and Rails
|
126
112
|
email:
|
127
113
|
- seth.william.herr@gmail.com
|
@@ -164,7 +150,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
164
150
|
version: '0'
|
165
151
|
requirements: []
|
166
152
|
rubyforge_project:
|
167
|
-
rubygems_version: 2.
|
153
|
+
rubygems_version: 2.2.2
|
168
154
|
signing_key:
|
169
155
|
specification_version: 4
|
170
156
|
summary: Simple, fast autocomplete server for Ruby and Rails
|