trials 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 303d61afdd0937269e514488c1462361d6bf24a7ade281e65d3aae0270e9ae31
4
- data.tar.gz: e0e8e292b51e7166af634fdf2e6117ded4aa700567aa197280dfc35f8de96f27
3
+ metadata.gz: 8f8af3df6ab16bb8ba4b08d0ea7b1dd11507aeb2c6113a0598a5e9965d0264c9
4
+ data.tar.gz: c1e9cc7305f5cee913a7091d321811153d33e9e67d383bad0f1518034a195556
5
5
  SHA512:
6
- metadata.gz: 83503c1320739905b44c54e3bf31b469435562a508cbae1670749e0eeba6cc439a7c3f53b4a15389eeebbf8c3218473c778befbb05a2b75499b7904a0d07ddad
7
- data.tar.gz: ae364df5de6484dc9c461838b809d24644202e6979a65c90e206a95e60dabe755a08d695f1a7aa042fb490c52193dbfbf928404f83f75a6bf70499cf93390a54
6
+ metadata.gz: d4bdc989741b4f86a3b80f91979a3882c2666da4749ee7864f0b45596311cbdebd297e17950d27bb84d39efc6d30b16fa18fc1a1bd4f65b98cd213ec4e0cbba7
7
+ data.tar.gz: d3fa929a7157792cd5fce04b64f0d0312ce240f59de755d323dc1ee910a21f9a966e74342a5e4c0d05d732291ab28a7b7a324bad6a5e1afb3c0293ea0c682c1c
data/bin/trial CHANGED
@@ -1,12 +1,13 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require_relative '../lib/trials'
4
-
5
3
  # env
6
4
  START_TIME = Time.now
7
5
  SCRIPT_NAME = ARGV[0].gsub('.rb', '')
8
6
  RUN = "#{START_TIME.strftime("%Y%m%dT%H%M%S")}_#{SCRIPT_NAME}"
9
7
  ROOT = Dir.pwd
8
+
9
+ require_relative '../lib/trials'
10
+
10
11
  SECRETS = begin
11
12
  if File.exists?(File.expand_path('~/.trials/secrets.yml'))
12
13
  JSON.parse(
@@ -22,6 +23,10 @@ end
22
23
  FileUtils.mkdir_p("seeds")
23
24
  FileUtils.mkdir_p("results/#{RUN}")
24
25
  FileUtils.mkdir_p("tmp")
26
+ FileUtils.mkdir_p("cache")
27
+
28
+ # allow to init the current folder
29
+ return if SCRIPT_NAME == 'init'
25
30
 
26
31
  # log start
27
32
  log "starting #{SCRIPT_NAME} at #{START_TIME.iso8601}"
data/lib/trials.rb CHANGED
@@ -10,18 +10,18 @@ require 'nokogiri'
10
10
  require 'ostruct'
11
11
  require 'pp'
12
12
  require 'pry'
13
+ require 'rack'
13
14
  require 'rest-client'
15
+ require 'securerandom'
14
16
  require 'set'
15
17
  require 'smalltext'
16
18
  require 'sqlite3'
17
19
  require 'street_address'
18
20
  require 'tty-table'
19
21
  require 'yaml'
20
- require 'rack'
21
22
 
22
23
  require_relative 'trials/utils/various'
23
24
  require_relative 'trials/utils/csvs'
24
- require_relative 'trials/utils/hashes'
25
25
  require_relative 'trials/utils/logging'
26
26
  require_relative 'trials/utils/strings'
27
27
  require_relative 'trials/utils/files'
@@ -34,7 +34,5 @@ require_relative 'trials/utils/rollbar'
34
34
  require_relative 'trials/utils/aws'
35
35
  require_relative 'trials/utils/google_drive'
36
36
  require_relative 'trials/utils/h_array'
37
- require_relative 'trials/data_handling/addresses'
38
- require_relative 'trials/data_handling/names'
39
37
  require_relative 'trials/extensions/hash'
40
38
  require_relative 'trials/extensions/array'
@@ -6,7 +6,7 @@ def ddb_connection
6
6
  )
7
7
  end
8
8
 
9
- def cached_ddb_scan(query)
9
+ def ddb_scan_with_cache(query)
10
10
  json_cache(query.dig(:table_name)) { ddb_scan(query) }
11
11
  end
12
12
 
@@ -31,7 +31,6 @@ end
31
31
 
32
32
  def ddb_scan_without_segmentation(query)
33
33
  result = nil
34
- requests = 0
35
34
  items = []
36
35
 
37
36
  loop do
@@ -1,15 +1,15 @@
1
1
  def read_csv(filename)
2
2
  CSV
3
- .foreach(seeds_path(filename), headers: true)
3
+ .foreach(seed_path(filename), headers: true)
4
4
  .map(&:to_h)
5
5
  .map(&:symbolize_keys)
6
6
  .select { |i| i.values.any?(&:present?) }
7
7
  end
8
8
 
9
9
  def write_csv_from_hashes(file, hash_set, attrs: nil)
10
- attrs ||= uniq_hash_keys(hash_set)
10
+ attrs ||= hash_set.to_harray.uniq_keys
11
11
 
12
- CSV.open(results_path(file), 'w') do |csv|
12
+ CSV.open(result_path(file), 'w') do |csv|
13
13
  csv << attrs
14
14
 
15
15
  hash_set.each do |c|
@@ -1,110 +1,72 @@
1
- # paths
2
-
3
- def seeds_path(name)
4
- "#{ROOT}/seeds/#{name}"
5
- end
6
-
7
- alias seed_path seeds_path
8
-
9
- def seed_exists?(name)
10
- File.exist?(seed_path(name))
11
- end
12
-
13
- alias seeds_exist? seed_exists?
14
-
15
- def results_path(name)
16
- "#{ROOT}/results/#{RUN}/#{name}"
17
- end
18
-
19
- alias result_path results_path
20
-
21
- def result_exists?(name)
22
- File.exist?(result_path(name))
23
- end
24
-
25
- alias results_exist? result_exists?
26
-
27
- def tmp_path(name)
28
- "#{ROOT}/tmp/#{name}"
29
- end
30
-
31
- def tmp_exists?(name)
32
- File.exist?(tmp_path(name))
33
- end
34
-
35
- def list_dir(dir)
36
- Dir["#{seeds_path(dir)}/**/*"]
37
- end
38
-
39
- # reading
40
-
41
- def read(file)
42
- return unless seed_exists?(file)
43
- File.read(seeds_path(file))
44
- end
45
-
46
- alias read_seed read
47
- alias read_seeds read
48
-
49
- def read_tmp(file)
50
- return unless tmp_exists?(file)
51
- File.read(tmp_path(file))
52
- end
53
-
54
- def readlines(file)
55
- File.read(seeds_path(file)).split("\n")
56
- end
57
-
58
- # writing
59
-
60
- def write(file, content)
61
- FileUtils.mkdir_p(File.dirname(results_path(file)))
62
- File.open(results_path(file), 'w') { |f| f << content }
63
- end
64
-
65
- alias write_result write
66
- alias write_results write
67
-
68
- def write_tmp(file, content)
69
- FileUtils.mkdir_p(File.dirname(tmp_path(file)))
70
- File.open(tmp_path(file), 'w') { |f| f << content }
71
- end
72
-
73
- def append(file, content)
74
- File.open(results_path(file), 'a') { |f| f << content }
75
- end
76
-
77
- alias append_result append
78
- alias append_results append
79
-
80
- # deleting
81
-
82
- def delete(file)
83
- return unless result_exists?(file)
84
- FileUtils.rm_r(results_path(file))
85
- end
86
-
87
- alias delete_result delete
88
- alias delete_results delete
89
-
90
- def delete_tmp(file)
91
- return unless tmp_exists?(file)
92
- FileUtils.rm_r(tmp_path(file))
93
- end
94
-
95
- def delete_seeds(file)
96
- return unless seed_exists?(file)
97
- FileUtils.rm_r(seeds_path(file))
98
- end
99
-
100
- alias delete_seed delete_seeds
101
-
102
- # other
103
-
104
- def make_seed(file)
105
- FileUtils.cp(results_path(file), seeds_path(file))
106
- end
107
-
108
- def make_tmp(file)
109
- FileUtils.cp(results_path(file), tmp_path(file))
1
+ dirs = {
2
+ seed: { run: false, pluralized: ActiveSupport::Inflector.pluralize('seed') },
3
+ result: { run: true, pluralized: ActiveSupport::Inflector.pluralize('result') },
4
+ tmp: { run: false, pluralized: 'tmp' },
5
+ cache: { run: false, pluralized: 'cache' },
6
+ }
7
+
8
+ dirs.each do |dir, opts|
9
+ eval <<~RUBY
10
+ def #{opts.dig(:pluralized)}_root
11
+ File.join(
12
+ ROOT,
13
+ '#{opts.dig(:pluralized)}',
14
+ '#{opts.dig(:run) ? RUN : nil}',
15
+ ).to_s
16
+ end
17
+
18
+ def #{dir}_path(name)
19
+ File.join(
20
+ #{opts.dig(:pluralized)}_root,
21
+ name,
22
+ ).to_s
23
+ end
24
+
25
+ def #{dir}_exists?(name)
26
+ File.exist?(#{dir}_path(name))
27
+ end
28
+
29
+ def list_#{opts.dig(:pluralized)}(name = nil)
30
+ Dir[
31
+ File.join(
32
+ *[
33
+ #{opts.dig(:pluralized)}_root,
34
+ name,
35
+ '**',
36
+ '*',
37
+ ].compact
38
+ )
39
+ ].reject { |d| File.directory?(d) }
40
+ end
41
+
42
+ def read_#{dir}(name)
43
+ File.read(#{dir}_path(name)) if #{dir}_exists?(name)
44
+ end
45
+
46
+ def readlines_#{dir}(name)
47
+ read_#{dir}(name).split("\\n")
48
+ end
49
+
50
+ def write_#{dir}(name, content)
51
+ FileUtils.mkdir_p(File.dirname(#{dir}_path(name)))
52
+ File.open(#{dir}_path(name), 'w') { |f| f << content }
53
+ end
54
+
55
+ def append_to_#{dir}(name, content)
56
+ File.open(#{dir}_path(name), 'a') { |f| f << content }
57
+ end
58
+
59
+ def delete_#{dir}(name)
60
+ FileUtils.rm_r(#{dir}_path(name)) if #{dir}_exists?(name)
61
+ end
62
+ RUBY
63
+
64
+ dirs.each do |o_dir, o_opts|
65
+ next if o_dir == dir
66
+ eval <<~RUBY
67
+ def cp_#{o_dir}_to_#{opts.dig(:pluralized)}(name)
68
+ FileUtils.cp(#{o_dir}_path(name), #{dir}_path(name))
69
+ end
70
+ RUBY
71
+ end
110
72
  end
@@ -1,7 +1,7 @@
1
1
  def gd_session
2
2
  @gd_session ||= begin
3
- write('config.json', secrets.google.drive_config_json)
4
- session = GoogleDrive::Session.from_config(results_path("config.json"))
3
+ write_tmp('config.json', secrets.google.drive_config_json)
4
+ session = GoogleDrive::Session.from_config(tmp_path("config.json"))
5
5
  delete('config.json')
6
6
  session
7
7
  end
@@ -20,6 +20,14 @@ class HArray < Array
20
20
  end
21
21
  end
22
22
 
23
+ def normalize_keys
24
+ hashes.map do |h|
25
+ h
26
+ .map { |k, v| [normalize_key(k), v] }
27
+ .to_h
28
+ end
29
+ end
30
+
23
31
  def uniq_keys
24
32
  hashes.flat_map(&:keys).uniq.compact
25
33
  end
@@ -49,4 +57,10 @@ class HArray < Array
49
57
  .reduce(&:merge)
50
58
  end
51
59
  end
60
+
61
+ private
62
+
63
+ def normalize_key(k)
64
+ k.downcase.to_sym
65
+ end
52
66
  end
@@ -1,5 +1,5 @@
1
1
  def read_json(filename)
2
- result = JSON.parse(read(filename))
2
+ result = JSON.parse(read_seed(filename))
3
3
 
4
4
  aggressive_deep_symbolize_keys(result)
5
5
  end
@@ -11,12 +11,12 @@ end
11
11
  def json_cache(key)
12
12
  name = "json/#{key}.json"
13
13
 
14
- return aggressive_deep_symbolize_keys(JSON.parse(read_tmp(name))) if tmp_exists?(name)
14
+ return aggressive_deep_symbolize_keys(JSON.parse(read_cache(name))) if cache_exists?(name)
15
15
 
16
- write_tmp(name, yield.to_json)
16
+ write_cache(name, yield.to_json)
17
17
  json_cache(key)
18
18
  end
19
19
 
20
20
  def invalidate_json_cache
21
- delete_tmp("json")
21
+ delete_cache("json")
22
22
  end
@@ -1,7 +1,7 @@
1
1
  def render_table_from_hashes(hash_set, sort: true, headers: nil)
2
2
  return 'no data' if hash_set.blank?
3
3
 
4
- headers = headers || uniq_hash_keys(hash_set)
4
+ headers = headers || hash_set.to_harray.uniq_keys
5
5
 
6
6
  headers.sort! if sort
7
7
 
@@ -21,7 +21,7 @@ def log(item, nl: true, quiet: false, each: true)
21
21
  return
22
22
  end
23
23
 
24
- File.open(results_path('log.txt'), 'a') do |f|
24
+ File.open(result_path('log.txt'), 'a') do |f|
25
25
  f << begin
26
26
  if item.is_a?(String) || item.is_a?(Numeric)
27
27
  item.to_s
@@ -1,4 +1,4 @@
1
1
  def pdf_to_text(path)
2
- `pdftotext "#{seeds_path(path)}" #{tmp_path('tmp_pdf.txt')}; \
2
+ `pdftotext "#{seed_path(path)}" #{tmp_path('tmp_pdf.txt')}; \
3
3
  cat #{tmp_path('tmp_pdf.txt')}`
4
4
  end
@@ -1,6 +1,6 @@
1
1
  def db(db_name = nil)
2
2
  db_name ||= 'data.db'
3
- db = SQLite3::Database.new(tmp_path(db_name))
3
+ db = SQLite3::Database.new(cache_path(db_name))
4
4
  db.results_as_hash = true
5
5
  db
6
6
  end
@@ -14,7 +14,7 @@ def get_db(db_name = nil)
14
14
  end
15
15
 
16
16
  def import_csv_into_db(db_name = 'data.db', table, csv)
17
- system("sqlite3 -csv #{tmp_path(db_name)} '.import #{seed_path(csv)} #{table}'")
17
+ system("sqlite3 -csv #{cache_path(db_name)} '.import #{seed_path(csv)} #{table}'")
18
18
  end
19
19
 
20
20
  def query_db(db_name = nil, query)
@@ -2,6 +2,14 @@ def secrets
2
2
  SECRETS
3
3
  end
4
4
 
5
+ def run
6
+ RUN
7
+ end
8
+
9
+ def root_path
10
+ ROOT
11
+ end
12
+
5
13
  def or_nil
6
14
  val = yield
7
15
  raise if val.blank?
@@ -25,3 +33,13 @@ def aggressive_deep_symbolize_keys(maybe)
25
33
 
26
34
  maybe
27
35
  end
36
+
37
+ def marshal_fetch(key)
38
+ return Marshal.load(read_cache(key)) if cache_exists?(key)
39
+
40
+ File.open(cache_path(key), 'wb') do |f|
41
+ f.write(Marshal.dump(yield))
42
+ end
43
+
44
+ marshal_fetch(key)
45
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: trials
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.1.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - grahamotte
@@ -299,8 +299,6 @@ extra_rdoc_files: []
299
299
  files:
300
300
  - bin/trial
301
301
  - lib/trials.rb
302
- - lib/trials/data_handling/addresses.rb
303
- - lib/trials/data_handling/names.rb
304
302
  - lib/trials/extensions/array.rb
305
303
  - lib/trials/extensions/hash.rb
306
304
  - lib/trials/utils/aws.rb
@@ -309,7 +307,6 @@ files:
309
307
  - lib/trials/utils/files.rb
310
308
  - lib/trials/utils/google_drive.rb
311
309
  - lib/trials/utils/h_array.rb
312
- - lib/trials/utils/hashes.rb
313
310
  - lib/trials/utils/jsons.rb
314
311
  - lib/trials/utils/logging.rb
315
312
  - lib/trials/utils/pdfs.rb
@@ -1,37 +0,0 @@
1
- # normal format <number> <STREET NAME>, <CITY>, <STATE> <postal>
2
-
3
- def normalize_address(address)
4
- return if address.blank?
5
-
6
- cleaner_string = if address.respond_to?(:address1)
7
- "#{address.address1}, #{address.city}, #{address.state} #{address.postal_code}"
8
- else
9
- address
10
- end
11
-
12
- cleaner_string = cleaner_string.gsub('#', ' #').squish
13
- parsed = StreetAddress::US.parse(cleaner_string)
14
-
15
- return if parsed.blank?
16
- return if parsed.number.blank?
17
- return if parsed.street.blank?
18
- return if parsed.city.blank?
19
- return if parsed.state.blank?
20
- return if parsed.postal_code.blank?
21
-
22
- parsed.prefix = nil
23
- parsed.suffix = nil
24
- parsed.unit_prefix = nil
25
- parsed.unit = nil
26
- parsed.postal_code_ext = nil
27
-
28
- parsed.to_s.upcase
29
- end
30
-
31
- def parse_address(address_string)
32
- StreetAddress::US.parse(address_string)
33
- end
34
-
35
- def normalize_and_parse_address(address_string)
36
- StreetAddress::US.parse(normalize_address(address_string))
37
- end
@@ -1,84 +0,0 @@
1
- # normal <FIRST> <LAST>
2
-
3
- Name = Struct.new(:first_name, :middle_name, :last_name)
4
-
5
- class NamePartsParser
6
- def initialize(name_string)
7
- @namae = Namae.parse((name_string || '').upcase).first
8
- end
9
-
10
- def first
11
- normalize_name(split_first_and_middle(given).first)
12
- end
13
-
14
- def middle
15
- normalize_name(split_first_and_middle(given).last)
16
- end
17
-
18
- def last
19
- normalize_name(family)
20
- end
21
-
22
- private
23
-
24
- def given
25
- @namae&.given || ''
26
- end
27
-
28
- def family
29
- @namae&.family || ''
30
- end
31
-
32
- def split_first_and_middle(first_and_middle)
33
- names = first_and_middle.split(' ', 2)
34
- names.length == 1 ? names + [''] : names
35
- end
36
- end
37
-
38
- def normalize_name(name)
39
- return if name.blank?
40
-
41
- name = name_from_parts(name) if name.respond_to?(:first_name)
42
-
43
- name.strip.upcase.delete('^A-Z\ \-').squeeze(" ")
44
- end
45
-
46
- def normalize_full_names(names)
47
- Array.wrap(names)
48
- .map { |n| normalize_full_name(n) }
49
- .map(&:to_s)
50
- .map(&:presence)
51
- .compact
52
- .uniq
53
- end
54
-
55
- def normalize_full_name(name)
56
- return if name.blank?
57
-
58
- name.strip.upcase.delete('^A-Z\ \-').squeeze(" ")
59
- end
60
-
61
- def parse_name(name)
62
- NamePartsParser.new(name)
63
- end
64
-
65
- def normalize_and_parse_name(name)
66
- NamePartsParser.new(normalize_name(name))
67
- end
68
-
69
- # private
70
-
71
- def name_from_parts(name)
72
- first = normalize_name_part(name.first_name)
73
- last = normalize_name_part(name.last_name)
74
-
75
- [first, last].join(' ')
76
- end
77
-
78
- def full_name_from_parts(name)
79
- first = normalize_name_part(name.first_name)
80
- middle = normalize_name_part(name.middle_name)
81
- last = normalize_name_part(name.last_name)
82
-
83
- [first, middle, last].join(' ')
84
- end
@@ -1,90 +0,0 @@
1
- def uniq_hash_keys(hashes)
2
- hashes.flat_map(&:keys).uniq.compact
3
- end
4
-
5
- def sanitize_hash_value(hash, key:, type:, date_format: '%Y-%m-%d')
6
- hash.merge(
7
- key => or_nil do
8
- case type
9
- when :date
10
- Date.strptime(hash.dig(key), date_format)
11
- when :datetime
12
- DateTime.parse(hash.dig(key))
13
- when :integer, :int
14
- hash.dig(key).to_i
15
- when :float
16
- hash.dig(key).to_f
17
- when :string
18
- hash.dig(key).to_s
19
- when :alphanum
20
- string_to_alphanum(hash.dig(key))
21
- when :present?
22
- hash.dig(key).present?
23
- end
24
- end
25
- )
26
- end
27
-
28
- def sanitize_hash_values(hash, scheme = {})
29
- scheme.each do |k, v|
30
- hash = sanitize_hash_value(hash, key: k, type: v)
31
- end
32
-
33
- hash
34
- end
35
-
36
- def rename_hash_key(hash, from:, to:)
37
- hash[to] = hash.delete(from)
38
- hash
39
- end
40
-
41
- def rename_hash_keys(hash, scheme = {})
42
- scheme.each do |k, v|
43
- hash = rename_hash_key(hash, from: k, to: v)
44
- end
45
-
46
- hash
47
- end
48
-
49
- def merge_hash_groups(*groups, key:, join_type: :inner)
50
- groups = groups.map { |group| group.map { |g| [g.dig(key), g] }.to_h }
51
-
52
- keys = begin
53
- case join_type
54
- when :inner
55
- groups.map(&:keys).reduce(&:&)
56
- when :all
57
- groups.flat_map(&:keys).uniq
58
- when :first
59
- groups.first.keys
60
- end
61
- end
62
-
63
- keys.map { |key| groups.map { |g| g.dig(key) }.compact.reduce(&:merge) }
64
- end
65
-
66
- def count_for_group_by(batch, &block)
67
- batch
68
- .group_by(&block)
69
- .map { |k, v| [k, v.length] }
70
- .to_h
71
- end
72
-
73
- def array_to_count_hash(list)
74
- list.uniq.reduce({}) do |h, i|
75
- h[i] = list.count(i)
76
- h
77
- end
78
- end
79
-
80
- def update_counts_hash(counts, update)
81
- update.each do |k, v|
82
- if counts.key?(k)
83
- counts[k] += v
84
- else
85
- counts[k] = v
86
- end
87
- end
88
-
89
- counts
90
- end