soulheart 0.1.4 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 0a9835278571fb659e0eae5ad1d39918bcd41049
4
- data.tar.gz: e3dbbeecea70b9cf11327f012f3d47e02f64f04b
3
+ metadata.gz: 92cb6b53b32cdadf8fe407f940ee941f0c9cc0e4
4
+ data.tar.gz: 1af49cbdf332eade831a421fa3b47079e130516a
5
5
  SHA512:
6
- metadata.gz: 01215919be332be2cc67d9885c5bb21380110fa04caf4a424e91f7649aacf696d08c88f24c5a95b0bf88b8329915c93bf1c37976ee3f95b0e96c1cfb7fd2449b
7
- data.tar.gz: e5b336efcc86b4250cecdb4d61c8427c831dca3eea2ba25f8bf2e362f2cd7ea00612951da3c1160f9f0eb1f9a0e62833b37aab69d20a7507bfafb3ccfbb105bc
6
+ metadata.gz: 90287822a0fdb40a18ff3abbbdac18548e7f029886a11dfacad86e5ce6b57f10238d7504e6e5de68199a7a09117071c6f1e1788028dce56dda5ffff3edad40e6
7
+ data.tar.gz: 24940196b5f5d04374d9e068ef0fdb621dd62559fc01750529651b8b6512a48f9e7ac5cfa7ec03c728c14f997b3418198dd96d8115a8af45339da5e81f127157
data/bin/soulheart CHANGED
@@ -11,6 +11,13 @@ require 'soulheart'
11
11
  require 'optparse'
12
12
  require 'tempfile'
13
13
 
14
+ @batch_size = 1000
15
+ @no_all = false
16
+ @no_combinatorial = false
17
+ @normalize_regex = false
18
+ @normalize_no_sym = false
19
+ @remove_results = false
20
+
14
21
  parser = OptionParser.new do |opts|
15
22
  opts.banner = 'Usage: soulheart [options] COMMAND'
16
23
 
@@ -21,54 +28,68 @@ parser = OptionParser.new do |opts|
21
28
  Soulheart.redis = host
22
29
  end
23
30
 
24
- opts.on('-s', '--stop-words [FILE]', 'Path to file containing a list of stop words to overwrite defaults - "the", "at", "vs"') do |fn|
25
- File.open(fn) do |file|
26
- Soulheart.stop_words = file.readlines.map(&:strip).reject(&:empty?)
27
- end
28
- end
29
-
30
31
  opts.on('-h', '--help', 'Show this message') do
31
32
  puts opts
32
33
  exit
33
34
  end
34
35
 
35
- opts.on('-A', '--no-all', 'on load: Do not add items into the "all" category (on load)') do |size|
36
- NO_ALL = true
36
+ opts.separator ''
37
+ opts.separator ' load options:'
38
+
39
+ opts.on('-A', '--no-all', 'Do not add items into the "all" category') do |size|
40
+ @no_all = true
37
41
  end
38
42
 
39
- opts.on('-C', '--no-combinatorial', 'Do not create combined categories, do not add items to combined categories (on load)') do |size|
40
- NO_COMBINATORIAL = true
43
+ opts.on('-C', '--no-combinatorial', 'Do not create combined categories, do not add items to combined categories') do |size|
44
+ @no_combinatorial = true
41
45
  end
42
46
 
43
- opts.on('-R', '--remove-results', 'Remove results data - breaks the cache, fully clears all loaded data. (on clear)') do |size|
44
- REMOVE_RESULTS = true
47
+ opts.separator ''
48
+ opts.separator ' normalize options:'
49
+
50
+ opts.on('-s', '--with-symbols', 'Do not remove symbols when normalizing terms') do |size|
51
+ @normalize_no_sym = true
52
+ end
53
+
54
+ opts.on('-x', '--regex', 'Use the first line from the FILE as the regular expression for normalizing terms') do |size|
55
+ @normalize_regex = true
56
+ end
57
+
58
+ opts.separator ''
59
+ opts.separator ' clear options:'
60
+
61
+ opts.on('-R', '--remove-results', 'Remove results data - breaks the cache, fully clears all loaded data') do |size|
62
+ @remove_results = true
45
63
  end
46
64
 
47
65
  opts.separator ''
48
66
  opts.separator 'Commands:'
49
- opts.separator ' load FILE Loads data from a FILE - can be a local file or a url. Accepted formats are .json, .tsv and .csv'
50
- opts.separator " reset Removes all existing data (optionally pass a file to run load after reset)"
67
+ opts.separator ' load FILE Loads data from a FILE - can be a local file or a url. Accepted formats are .json, .tsv and .csv'
68
+ opts.separator " stop-words FILE Load file containing a list of stop words to overwrite defaults - 'the', 'at' and 'vs'"
69
+ opts.separator " normalize Set the way that terms are normalized. Requires a file when including the --regex option"
70
+ opts.separator " clear Removes existing items and categories from the database"
51
71
  opts.separator ''
52
72
  opts.separator 'Additional info: https://sethherr.github.io/soulheart/commands/'
53
73
  opts.separator ''
54
74
  end
55
75
 
56
-
57
- def load(file)
76
+ def open_file(file)
58
77
  require 'uri'
59
78
  if file =~ URI.regexp
60
79
  require 'open-uri'
61
- f = open(file)
80
+ open(file)
62
81
  elsif File.exist?(file)
63
- f = File.open(file)
82
+ File.open(file)
64
83
  else
65
- puts "Couldn't open file: #{file}"
66
- return true
84
+ raise StandardError, "Couldn't open file: #{file}"
67
85
  end
86
+ end
68
87
 
88
+ def load(file)
89
+ f = open_file(file)
69
90
  start_time = Time.now.to_i
70
91
  count = 0
71
- loader = Soulheart::Loader.new({no_all: NO_ALL, no_combinatorial: NO_COMBINATORIAL})
92
+ loader = Soulheart::Loader.new({no_all: @no_all, no_combinatorial: @no_combinatorial})
72
93
  lines = []
73
94
  begin
74
95
  if file.match(/(c|t)sv\z/i)
@@ -81,10 +102,10 @@ def load(file)
81
102
  end
82
103
  elsif file.match(/json\z/i)
83
104
  puts 'Reading JSON'
84
- puts "Loading items in batches of #{BATCH_SIZE} ..."
105
+ puts "Loading items in batches of #{@batch_size} ..."
85
106
  until f.eof?
86
107
  lines = []
87
- BATCH_SIZE.times do
108
+ @batch_size.times do
88
109
  break if f.eof?
89
110
  lines << MultiJson.decode(f.gets)
90
111
  count += 1
@@ -100,20 +121,37 @@ def load(file)
100
121
  puts "Total time to load: #{Time.now.to_i - start_time} second(s)"
101
122
  end
102
123
 
103
- def clear(remove_results)
104
- Soulheart::Loader.new.clear(remove_results)
124
+ def stop_words(file)
125
+ f = open_file(file)
126
+ Soulheart.stop_words = f.readlines.map(&:strip).reject(&:empty?)
105
127
  end
106
128
 
107
- parser.parse!
129
+ def normalize(file=nil)
130
+ if @normalize_regex
131
+ f = open_file(file)
132
+ puts f.readlines.map(&:strip).reject(&:empty?).first
133
+ Soulheart.normalizer = f.readlines.map(&:strip).reject(&:empty?).first
134
+ elsif @normalize_no_sym
135
+ Soulheart.normalizer = ''
136
+ else
137
+ Soulheart.normalizer = Soulheart.default_normalizer
138
+ end
139
+ end
108
140
 
141
+ def clear
142
+ Soulheart::Loader.new.clear(@remove_results)
143
+ end
144
+
145
+ parser.parse!
109
146
  case ARGV[0]
110
147
  when 'load'
111
- BATCH_SIZE ||= 1000
112
- NO_ALL ||= false
113
- NO_COMBINED_CATEGORIES ||= false
114
148
  load ARGV[1]
115
- when 'reset'
116
- clear(REMOVE_RESULTS)
149
+ when 'stop-words'
150
+ stop_words ARGV[1]
151
+ when 'normalize'
152
+ ARGV[1] ? normalize(ARGV[1]) : normalize
153
+ when 'clear'
154
+ clear
117
155
  load ARGV[1] if ARGV[1]
118
156
  else
119
157
  puts parser.help
@@ -12,10 +12,6 @@ module Soulheart
12
12
  10 * 60 # Setting to 10 minutes, but making it possible to edit down the line
13
13
  end
14
14
 
15
- def base_id
16
- ENV['RACK_ENV'] != 'test' ? 'soulheart:' : 'soulheart_test:'
17
- end
18
-
19
15
  def sorted_category_array
20
16
  redis.smembers(categories_id).map { |c| normalize(c) }.uniq.sort
21
17
  end
@@ -39,7 +35,7 @@ module Soulheart
39
35
  end
40
36
 
41
37
  def category_combos_id
42
- "#{base_id}category_combos:"
38
+ "#{Soulheart.base_id}category_combos:"
43
39
  end
44
40
 
45
41
  def category_combos
@@ -47,7 +43,7 @@ module Soulheart
47
43
  end
48
44
 
49
45
  def categories_id
50
- "#{base_id}categories:"
46
+ "#{Soulheart.base_id}categories:"
51
47
  end
52
48
 
53
49
  def hidden_categories_id
@@ -63,11 +59,16 @@ module Soulheart
63
59
  end
64
60
 
65
61
  def results_hashes_id
66
- "#{base_id}database:"
62
+ "#{Soulheart.base_id}database:"
67
63
  end
68
64
 
65
+ def normalize_type_id
66
+ "#{Soulheart.base_id}normalize:"
67
+ end
68
+
69
+
69
70
  def cache_id(type = 'all')
70
- "#{base_id}cache:#{type}:"
71
+ "#{Soulheart.base_id}cache:#{type}:"
71
72
  end
72
73
  end
73
74
  end
@@ -3,7 +3,7 @@ require 'redis'
3
3
 
4
4
  module Soulheart
5
5
  module Config
6
- DEFAULT_STOP_WORDS = %w(vs at the)
6
+
7
7
 
8
8
  # Accepts:
9
9
  # 1. A Redis URL String 'redis://host:port/db'
@@ -37,12 +37,54 @@ module Soulheart
37
37
  )
38
38
  end
39
39
 
40
+ def base_id
41
+ ENV['RACK_ENV'] != 'test' ? 'soulheart:' : 'soulheart_test:'
42
+ end
43
+
44
+ def stop_words_id
45
+ "#{base_id}stop_list:"
46
+ end
47
+
48
+ def default_stop_words
49
+ %w(vs at the)
50
+ end
51
+
52
+ def redis_stop_words
53
+ return false unless redis.exists stop_words_id
54
+ redis.lrange(stop_words_id, 0, -1)
55
+ end
56
+
40
57
  def stop_words
41
- @stop_words ||= DEFAULT_STOP_WORDS
58
+ @stop_words ||= redis_stop_words || default_stop_words
42
59
  end
43
60
 
44
61
  def stop_words=(arr)
62
+ redis.expire stop_words_id, 0
45
63
  @stop_words = Array(arr).flatten
64
+ redis.lpush stop_words_id, @stop_words
65
+ end
66
+
67
+ def normalizer_id
68
+ "#{base_id}normalizer:"
69
+ end
70
+
71
+ def default_normalizer
72
+ '[^\p{Word}\ ]'
73
+ end
74
+
75
+ def redis_normalizer
76
+ return false unless redis.exists normalizer_id
77
+ redis.get normalizer_id
78
+ end
79
+
80
+ def normalizer
81
+ @normalizer ||= redis_normalizer || default_normalizer
82
+ end
83
+
84
+ def normalizer=(str)
85
+ redis.expire normalizer_id, 0
86
+ @normalizer = str
87
+ redis.set normalizer_id, @normalizer
46
88
  end
47
89
  end
48
90
  end
@@ -3,7 +3,7 @@ module Soulheart
3
3
  module Helpers
4
4
  def normalize(str)
5
5
  # Letter, Mark, Number, Connector_Punctuation (Chinese, Japanese, etc.)
6
- str.downcase.gsub(/[^\p{Word}\ ]/i, '').strip
6
+ str.downcase.gsub(/#{Soulheart.normalizer}/i, '').strip
7
7
  end
8
8
 
9
9
  def prefixes_for_phrase(phrase)
@@ -39,9 +39,9 @@ module Soulheart
39
39
  redis.sadd categories_id, categories
40
40
  end
41
41
 
42
- def delete_data(id="#{base_id}:")
42
+ def delete_data(id="#{Soulheart.base_id}:")
43
43
  # delete the sorted sets for this type
44
- phrases = redis.smembers(base_id)
44
+ phrases = redis.smembers(Soulheart.base_id)
45
45
  redis.pipelined do
46
46
  phrases.each do |p|
47
47
  redis.del("#{id}#{p}")
@@ -72,10 +72,10 @@ module Soulheart
72
72
  end
73
73
 
74
74
  def load(items)
75
- # Replace with item return so we know we have category_id
75
+ Soulheart.stop_words # Load stop words so we don't pipeline redis_stop_words accidentally
76
76
  i = 0
77
77
  items.each do |item|
78
- item.replace(add_item(item))
78
+ item.replace(add_item(item)) # Replace with item return so we know we have category_id
79
79
  i += 1
80
80
  end
81
81
  set_category_combos_array.each do |category_combo|
@@ -84,10 +84,10 @@ module Soulheart
84
84
  next
85
85
  elsif category_combo == 'all'
86
86
  next if @no_all
87
- elsif !category_combo.match(item['category'])
88
- next
89
87
  elsif @no_combinatorial
90
88
  next
89
+ elsif !category_combo.match(item['category'])
90
+ next
91
91
  end
92
92
  add_item(item, category_id(category_combo), true) # send it base
93
93
  i += 1
@@ -114,7 +114,7 @@ module Soulheart
114
114
  item
115
115
  end
116
116
 
117
- def add_item(item, category_base_id=nil, cleaned=false)
117
+ def add_item(item, category_base_id=nil, cleaned=false)
118
118
  item = clean(item) unless cleaned
119
119
  category_base_id ||= category_id(item['category'])
120
120
  priority = (-item['priority'])
@@ -125,7 +125,7 @@ module Soulheart
125
125
  phrase = ([item['term']] + (item['aliases'] || [])).join(' ')
126
126
  # Store all the prefixes
127
127
  prefixes_for_phrase(phrase).each do |p|
128
- redis.sadd(base_id, p) unless cleaned # remember prefix in a master set
128
+ redis.sadd(Soulheart.base_id, p) unless cleaned # remember prefix in a master set
129
129
  # store the normalized term in the index for each of the categories
130
130
  redis.zadd("#{category_base_id}#{p}", priority, item['term'])
131
131
  end
@@ -61,10 +61,10 @@ module Soulheart
61
61
  redis.expire(@cachekey, cache_duration) # cache_duration is set in base.rb
62
62
  end
63
63
 
64
- def matching_hashes(ids)
65
- return [] unless ids.size > 0
66
- results = redis.hmget(results_hashes_id, *ids)
67
- results = results.reject(&:nil?) # handle cached results for ids which have since been deleted
64
+ def matching_hashes(terms)
65
+ return [] unless terms.size > 0
66
+ results = redis.hmget(results_hashes_id, *terms)
67
+ results = results.reject(&:nil?) # handle cached results for terms which have since been deleted
68
68
  results.map { |r| MultiJson.decode(r) }
69
69
  end
70
70
 
@@ -74,8 +74,8 @@ module Soulheart
74
74
  limit = @opts['per_page'].to_i + offset - 1
75
75
 
76
76
  limit = 0 if limit < 0
77
- ids = redis.zrange(@cachekey, offset, limit) # Using 'ids', even though keys are now terms
78
- matching_hashes(ids)
77
+ terms = redis.zrange(@cachekey, offset, limit)
78
+ matching_hashes(terms)
79
79
  end
80
80
  end
81
81
  end
@@ -28,6 +28,8 @@ module Soulheart
28
28
  soulheart_version: Soulheart::VERSION,
29
29
  current_time: Time.now.utc.strftime('%H:%M:%S UTC'),
30
30
  redis_used_memory: info['used_memory_human'],
31
+ stop_words: Soulheart.stop_words,
32
+ normalizer: Soulheart.normalizer,
31
33
  })
32
34
  end
33
35
 
@@ -1,3 +1,3 @@
1
1
  module Soulheart
2
- VERSION = '0.1.4'
2
+ VERSION = '0.2.1'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: soulheart
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.4
4
+ version: 0.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Seth Herr
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-07-23 00:00:00.000000000 Z
11
+ date: 2015-07-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: hiredis
@@ -108,20 +108,6 @@ dependencies:
108
108
  - - ">="
109
109
  - !ruby/object:Gem::Version
110
110
  version: '0'
111
- - !ruby/object:Gem::Dependency
112
- name: rubocop
113
- requirement: !ruby/object:Gem::Requirement
114
- requirements:
115
- - - ">="
116
- - !ruby/object:Gem::Version
117
- version: '0'
118
- type: :development
119
- prerelease: false
120
- version_requirements: !ruby/object:Gem::Requirement
121
- requirements:
122
- - - ">="
123
- - !ruby/object:Gem::Version
124
- version: '0'
125
111
  description: Simple, fast autocomplete server for Ruby and Rails
126
112
  email:
127
113
  - seth.william.herr@gmail.com
@@ -164,7 +150,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
164
150
  version: '0'
165
151
  requirements: []
166
152
  rubyforge_project:
167
- rubygems_version: 2.4.8
153
+ rubygems_version: 2.2.2
168
154
  signing_key:
169
155
  specification_version: 4
170
156
  summary: Simple, fast autocomplete server for Ruby and Rails