soulheart 0.1.4 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 0a9835278571fb659e0eae5ad1d39918bcd41049
4
- data.tar.gz: e3dbbeecea70b9cf11327f012f3d47e02f64f04b
3
+ metadata.gz: 92cb6b53b32cdadf8fe407f940ee941f0c9cc0e4
4
+ data.tar.gz: 1af49cbdf332eade831a421fa3b47079e130516a
5
5
  SHA512:
6
- metadata.gz: 01215919be332be2cc67d9885c5bb21380110fa04caf4a424e91f7649aacf696d08c88f24c5a95b0bf88b8329915c93bf1c37976ee3f95b0e96c1cfb7fd2449b
7
- data.tar.gz: e5b336efcc86b4250cecdb4d61c8427c831dca3eea2ba25f8bf2e362f2cd7ea00612951da3c1160f9f0eb1f9a0e62833b37aab69d20a7507bfafb3ccfbb105bc
6
+ metadata.gz: 90287822a0fdb40a18ff3abbbdac18548e7f029886a11dfacad86e5ce6b57f10238d7504e6e5de68199a7a09117071c6f1e1788028dce56dda5ffff3edad40e6
7
+ data.tar.gz: 24940196b5f5d04374d9e068ef0fdb621dd62559fc01750529651b8b6512a48f9e7ac5cfa7ec03c728c14f997b3418198dd96d8115a8af45339da5e81f127157
data/bin/soulheart CHANGED
@@ -11,6 +11,13 @@ require 'soulheart'
11
11
  require 'optparse'
12
12
  require 'tempfile'
13
13
 
14
+ @batch_size = 1000
15
+ @no_all = false
16
+ @no_combinatorial = false
17
+ @normalize_regex = false
18
+ @normalize_no_sym = false
19
+ @remove_results = false
20
+
14
21
  parser = OptionParser.new do |opts|
15
22
  opts.banner = 'Usage: soulheart [options] COMMAND'
16
23
 
@@ -21,54 +28,68 @@ parser = OptionParser.new do |opts|
21
28
  Soulheart.redis = host
22
29
  end
23
30
 
24
- opts.on('-s', '--stop-words [FILE]', 'Path to file containing a list of stop words to overwrite defaults - "the", "at", "vs"') do |fn|
25
- File.open(fn) do |file|
26
- Soulheart.stop_words = file.readlines.map(&:strip).reject(&:empty?)
27
- end
28
- end
29
-
30
31
  opts.on('-h', '--help', 'Show this message') do
31
32
  puts opts
32
33
  exit
33
34
  end
34
35
 
35
- opts.on('-A', '--no-all', 'on load: Do not add items into the "all" category (on load)') do |size|
36
- NO_ALL = true
36
+ opts.separator ''
37
+ opts.separator ' load options:'
38
+
39
+ opts.on('-A', '--no-all', 'Do not add items into the "all" category') do |size|
40
+ @no_all = true
37
41
  end
38
42
 
39
- opts.on('-C', '--no-combinatorial', 'Do not create combined categories, do not add items to combined categories (on load)') do |size|
40
- NO_COMBINATORIAL = true
43
+ opts.on('-C', '--no-combinatorial', 'Do not create combined categories, do not add items to combined categories') do |size|
44
+ @no_combinatorial = true
41
45
  end
42
46
 
43
- opts.on('-R', '--remove-results', 'Remove results data - breaks the cache, fully clears all loaded data. (on clear)') do |size|
44
- REMOVE_RESULTS = true
47
+ opts.separator ''
48
+ opts.separator ' normalize options:'
49
+
50
+ opts.on('-s', '--with-symbols', 'Do not remove symbols when normalizing terms') do |size|
51
+ @normalize_no_sym = true
52
+ end
53
+
54
+ opts.on('-x', '--regex', 'Use the first line from the FILE as the regular expression for normalizing terms') do |size|
55
+ @normalize_regex = true
56
+ end
57
+
58
+ opts.separator ''
59
+ opts.separator ' clear options:'
60
+
61
+ opts.on('-R', '--remove-results', 'Remove results data - breaks the cache, fully clears all loaded data') do |size|
62
+ @remove_results = true
45
63
  end
46
64
 
47
65
  opts.separator ''
48
66
  opts.separator 'Commands:'
49
- opts.separator ' load FILE Loads data from a FILE - can be a local file or a url. Accepted formats are .json, .tsv and .csv'
50
- opts.separator " reset Removes all existing data (optionally pass a file to run load after reset)"
67
+ opts.separator ' load FILE Loads data from a FILE - can be a local file or a url. Accepted formats are .json, .tsv and .csv'
68
+ opts.separator " stop-words FILE Load file containing a list of stop words to overwrite defaults - 'the', 'at' and 'vs'"
69
+ opts.separator " normalize Set the way that terms are normalized. Requires a file when including the --regex option"
70
+ opts.separator " clear Removes existing items and categories from the database"
51
71
  opts.separator ''
52
72
  opts.separator 'Additional info: https://sethherr.github.io/soulheart/commands/'
53
73
  opts.separator ''
54
74
  end
55
75
 
56
-
57
- def load(file)
76
+ def open_file(file)
58
77
  require 'uri'
59
78
  if file =~ URI.regexp
60
79
  require 'open-uri'
61
- f = open(file)
80
+ open(file)
62
81
  elsif File.exist?(file)
63
- f = File.open(file)
82
+ File.open(file)
64
83
  else
65
- puts "Couldn't open file: #{file}"
66
- return true
84
+ raise StandardError, "Couldn't open file: #{file}"
67
85
  end
86
+ end
68
87
 
88
+ def load(file)
89
+ f = open_file(file)
69
90
  start_time = Time.now.to_i
70
91
  count = 0
71
- loader = Soulheart::Loader.new({no_all: NO_ALL, no_combinatorial: NO_COMBINATORIAL})
92
+ loader = Soulheart::Loader.new({no_all: @no_all, no_combinatorial: @no_combinatorial})
72
93
  lines = []
73
94
  begin
74
95
  if file.match(/(c|t)sv\z/i)
@@ -81,10 +102,10 @@ def load(file)
81
102
  end
82
103
  elsif file.match(/json\z/i)
83
104
  puts 'Reading JSON'
84
- puts "Loading items in batches of #{BATCH_SIZE} ..."
105
+ puts "Loading items in batches of #{@batch_size} ..."
85
106
  until f.eof?
86
107
  lines = []
87
- BATCH_SIZE.times do
108
+ @batch_size.times do
88
109
  break if f.eof?
89
110
  lines << MultiJson.decode(f.gets)
90
111
  count += 1
@@ -100,20 +121,37 @@ def load(file)
100
121
  puts "Total time to load: #{Time.now.to_i - start_time} second(s)"
101
122
  end
102
123
 
103
- def clear(remove_results)
104
- Soulheart::Loader.new.clear(remove_results)
124
+ def stop_words(file)
125
+ f = open_file(file)
126
+ Soulheart.stop_words = f.readlines.map(&:strip).reject(&:empty?)
105
127
  end
106
128
 
107
- parser.parse!
129
+ def normalize(file=nil)
130
+ if @normalize_regex
131
+ f = open_file(file)
132
+ puts f.readlines.map(&:strip).reject(&:empty?).first
133
+ Soulheart.normalizer = f.readlines.map(&:strip).reject(&:empty?).first
134
+ elsif @normalize_no_sym
135
+ Soulheart.normalizer = ''
136
+ else
137
+ Soulheart.normalizer = Soulheart.default_normalizer
138
+ end
139
+ end
108
140
 
141
+ def clear
142
+ Soulheart::Loader.new.clear(@remove_results)
143
+ end
144
+
145
+ parser.parse!
109
146
  case ARGV[0]
110
147
  when 'load'
111
- BATCH_SIZE ||= 1000
112
- NO_ALL ||= false
113
- NO_COMBINED_CATEGORIES ||= false
114
148
  load ARGV[1]
115
- when 'reset'
116
- clear(REMOVE_RESULTS)
149
+ when 'stop-words'
150
+ stop_words ARGV[1]
151
+ when 'normalize'
152
+ ARGV[1] ? normalize(ARGV[1]) : normalize
153
+ when 'clear'
154
+ clear
117
155
  load ARGV[1] if ARGV[1]
118
156
  else
119
157
  puts parser.help
@@ -12,10 +12,6 @@ module Soulheart
12
12
  10 * 60 # Setting to 10 minutes, but making it possible to edit down the line
13
13
  end
14
14
 
15
- def base_id
16
- ENV['RACK_ENV'] != 'test' ? 'soulheart:' : 'soulheart_test:'
17
- end
18
-
19
15
  def sorted_category_array
20
16
  redis.smembers(categories_id).map { |c| normalize(c) }.uniq.sort
21
17
  end
@@ -39,7 +35,7 @@ module Soulheart
39
35
  end
40
36
 
41
37
  def category_combos_id
42
- "#{base_id}category_combos:"
38
+ "#{Soulheart.base_id}category_combos:"
43
39
  end
44
40
 
45
41
  def category_combos
@@ -47,7 +43,7 @@ module Soulheart
47
43
  end
48
44
 
49
45
  def categories_id
50
- "#{base_id}categories:"
46
+ "#{Soulheart.base_id}categories:"
51
47
  end
52
48
 
53
49
  def hidden_categories_id
@@ -63,11 +59,16 @@ module Soulheart
63
59
  end
64
60
 
65
61
  def results_hashes_id
66
- "#{base_id}database:"
62
+ "#{Soulheart.base_id}database:"
67
63
  end
68
64
 
65
+ def normalize_type_id
66
+ "#{Soulheart.base_id}normalize:"
67
+ end
68
+
69
+
69
70
  def cache_id(type = 'all')
70
- "#{base_id}cache:#{type}:"
71
+ "#{Soulheart.base_id}cache:#{type}:"
71
72
  end
72
73
  end
73
74
  end
@@ -3,7 +3,7 @@ require 'redis'
3
3
 
4
4
  module Soulheart
5
5
  module Config
6
- DEFAULT_STOP_WORDS = %w(vs at the)
6
+
7
7
 
8
8
  # Accepts:
9
9
  # 1. A Redis URL String 'redis://host:port/db'
@@ -37,12 +37,54 @@ module Soulheart
37
37
  )
38
38
  end
39
39
 
40
+ def base_id
41
+ ENV['RACK_ENV'] != 'test' ? 'soulheart:' : 'soulheart_test:'
42
+ end
43
+
44
+ def stop_words_id
45
+ "#{base_id}stop_list:"
46
+ end
47
+
48
+ def default_stop_words
49
+ %w(vs at the)
50
+ end
51
+
52
+ def redis_stop_words
53
+ return false unless redis.exists stop_words_id
54
+ redis.lrange(stop_words_id, 0, -1)
55
+ end
56
+
40
57
  def stop_words
41
- @stop_words ||= DEFAULT_STOP_WORDS
58
+ @stop_words ||= redis_stop_words || default_stop_words
42
59
  end
43
60
 
44
61
  def stop_words=(arr)
62
+ redis.expire stop_words_id, 0
45
63
  @stop_words = Array(arr).flatten
64
+ redis.lpush stop_words_id, @stop_words
65
+ end
66
+
67
+ def normalizer_id
68
+ "#{base_id}normalizer:"
69
+ end
70
+
71
+ def default_normalizer
72
+ '[^\p{Word}\ ]'
73
+ end
74
+
75
+ def redis_normalizer
76
+ return false unless redis.exists normalizer_id
77
+ redis.get normalizer_id
78
+ end
79
+
80
+ def normalizer
81
+ @normalizer ||= redis_normalizer || default_normalizer
82
+ end
83
+
84
+ def normalizer=(str)
85
+ redis.expire normalizer_id, 0
86
+ @normalizer = str
87
+ redis.set normalizer_id, @normalizer
46
88
  end
47
89
  end
48
90
  end
@@ -3,7 +3,7 @@ module Soulheart
3
3
  module Helpers
4
4
  def normalize(str)
5
5
  # Letter, Mark, Number, Connector_Punctuation (Chinese, Japanese, etc.)
6
- str.downcase.gsub(/[^\p{Word}\ ]/i, '').strip
6
+ str.downcase.gsub(/#{Soulheart.normalizer}/i, '').strip
7
7
  end
8
8
 
9
9
  def prefixes_for_phrase(phrase)
@@ -39,9 +39,9 @@ module Soulheart
39
39
  redis.sadd categories_id, categories
40
40
  end
41
41
 
42
- def delete_data(id="#{base_id}:")
42
+ def delete_data(id="#{Soulheart.base_id}:")
43
43
  # delete the sorted sets for this type
44
- phrases = redis.smembers(base_id)
44
+ phrases = redis.smembers(Soulheart.base_id)
45
45
  redis.pipelined do
46
46
  phrases.each do |p|
47
47
  redis.del("#{id}#{p}")
@@ -72,10 +72,10 @@ module Soulheart
72
72
  end
73
73
 
74
74
  def load(items)
75
- # Replace with item return so we know we have category_id
75
+ Soulheart.stop_words # Load stop words so we don't pipeline redis_stop_words accidentally
76
76
  i = 0
77
77
  items.each do |item|
78
- item.replace(add_item(item))
78
+ item.replace(add_item(item)) # Replace with item return so we know we have category_id
79
79
  i += 1
80
80
  end
81
81
  set_category_combos_array.each do |category_combo|
@@ -84,10 +84,10 @@ module Soulheart
84
84
  next
85
85
  elsif category_combo == 'all'
86
86
  next if @no_all
87
- elsif !category_combo.match(item['category'])
88
- next
89
87
  elsif @no_combinatorial
90
88
  next
89
+ elsif !category_combo.match(item['category'])
90
+ next
91
91
  end
92
92
  add_item(item, category_id(category_combo), true) # send it base
93
93
  i += 1
@@ -114,7 +114,7 @@ module Soulheart
114
114
  item
115
115
  end
116
116
 
117
- def add_item(item, category_base_id=nil, cleaned=false)
117
+ def add_item(item, category_base_id=nil, cleaned=false)
118
118
  item = clean(item) unless cleaned
119
119
  category_base_id ||= category_id(item['category'])
120
120
  priority = (-item['priority'])
@@ -125,7 +125,7 @@ module Soulheart
125
125
  phrase = ([item['term']] + (item['aliases'] || [])).join(' ')
126
126
  # Store all the prefixes
127
127
  prefixes_for_phrase(phrase).each do |p|
128
- redis.sadd(base_id, p) unless cleaned # remember prefix in a master set
128
+ redis.sadd(Soulheart.base_id, p) unless cleaned # remember prefix in a master set
129
129
  # store the normalized term in the index for each of the categories
130
130
  redis.zadd("#{category_base_id}#{p}", priority, item['term'])
131
131
  end
@@ -61,10 +61,10 @@ module Soulheart
61
61
  redis.expire(@cachekey, cache_duration) # cache_duration is set in base.rb
62
62
  end
63
63
 
64
- def matching_hashes(ids)
65
- return [] unless ids.size > 0
66
- results = redis.hmget(results_hashes_id, *ids)
67
- results = results.reject(&:nil?) # handle cached results for ids which have since been deleted
64
+ def matching_hashes(terms)
65
+ return [] unless terms.size > 0
66
+ results = redis.hmget(results_hashes_id, *terms)
67
+ results = results.reject(&:nil?) # handle cached results for terms which have since been deleted
68
68
  results.map { |r| MultiJson.decode(r) }
69
69
  end
70
70
 
@@ -74,8 +74,8 @@ module Soulheart
74
74
  limit = @opts['per_page'].to_i + offset - 1
75
75
 
76
76
  limit = 0 if limit < 0
77
- ids = redis.zrange(@cachekey, offset, limit) # Using 'ids', even though keys are now terms
78
- matching_hashes(ids)
77
+ terms = redis.zrange(@cachekey, offset, limit)
78
+ matching_hashes(terms)
79
79
  end
80
80
  end
81
81
  end
@@ -28,6 +28,8 @@ module Soulheart
28
28
  soulheart_version: Soulheart::VERSION,
29
29
  current_time: Time.now.utc.strftime('%H:%M:%S UTC'),
30
30
  redis_used_memory: info['used_memory_human'],
31
+ stop_words: Soulheart.stop_words,
32
+ normalizer: Soulheart.normalizer,
31
33
  })
32
34
  end
33
35
 
@@ -1,3 +1,3 @@
1
1
  module Soulheart
2
- VERSION = '0.1.4'
2
+ VERSION = '0.2.1'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: soulheart
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.4
4
+ version: 0.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Seth Herr
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-07-23 00:00:00.000000000 Z
11
+ date: 2015-07-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: hiredis
@@ -108,20 +108,6 @@ dependencies:
108
108
  - - ">="
109
109
  - !ruby/object:Gem::Version
110
110
  version: '0'
111
- - !ruby/object:Gem::Dependency
112
- name: rubocop
113
- requirement: !ruby/object:Gem::Requirement
114
- requirements:
115
- - - ">="
116
- - !ruby/object:Gem::Version
117
- version: '0'
118
- type: :development
119
- prerelease: false
120
- version_requirements: !ruby/object:Gem::Requirement
121
- requirements:
122
- - - ">="
123
- - !ruby/object:Gem::Version
124
- version: '0'
125
111
  description: Simple, fast autocomplete server for Ruby and Rails
126
112
  email:
127
113
  - seth.william.herr@gmail.com
@@ -164,7 +150,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
164
150
  version: '0'
165
151
  requirements: []
166
152
  rubyforge_project:
167
- rubygems_version: 2.4.8
153
+ rubygems_version: 2.2.2
168
154
  signing_key:
169
155
  specification_version: 4
170
156
  summary: Simple, fast autocomplete server for Ruby and Rails