mode 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +17 -0
- data/.rspec +2 -0
- data/.travis.yml +7 -0
- data/Gemfile +6 -0
- data/LICENSE.txt +202 -0
- data/README.md +138 -0
- data/Rakefile +6 -0
- data/bin/mode +13 -0
- data/lib/mode.rb +13 -0
- data/lib/mode/cli.rb +6 -0
- data/lib/mode/cli/analyze.rb +20 -0
- data/lib/mode/cli/base.rb +15 -0
- data/lib/mode/cli/helpers.rb +22 -0
- data/lib/mode/cli/import.rb +62 -0
- data/lib/mode/cli/package.rb +13 -0
- data/lib/mode/cli/setup.rb +12 -0
- data/lib/mode/commands/analyze_field.rb +70 -0
- data/lib/mode/commands/analyze_schema.rb +88 -0
- data/lib/mode/commands/helpers.rb +10 -0
- data/lib/mode/commands/import.rb +37 -0
- data/lib/mode/commands/package.rb +60 -0
- data/lib/mode/commands/setup.rb +36 -0
- data/lib/mode/config.rb +54 -0
- data/lib/mode/version.rb +3 -0
- data/mode.gemspec +36 -0
- data/spec/commands/analyze_schema_spec.rb +24 -0
- data/spec/commands/setup_spec.rb +62 -0
- data/spec/config_spec.rb +34 -0
- data/spec/fixtures/MOCK_DATA.csv +100001 -0
- data/spec/fixtures/cb_clean_small.csv +100000 -0
- data/spec/fixtures/duplicate_keys.csv +3 -0
- data/spec/fixtures/espn_draft.csv +1 -0
- data/spec/fixtures/format_examples.csv.txt +6 -0
- data/spec/fixtures/format_examples_after_excel.csv.txt +1 -0
- data/spec/spec_helper.rb +19 -0
- metadata +232 -0
@@ -0,0 +1,15 @@
|
|
1
|
+
module Mode
  module CLI
    # Root Thor class for the command-line interface.
    #
    # The individual CLI commands are kept in separate files, each of which
    # simply reopens this class to add its command. An alternative would be
    # to have this class pull the commands in explicitly, which would give
    # one explicit list of the commands being included.
    class Base < Thor
      # Shared validation/naming helpers used by the command methods.
      include Mode::CLI::Helpers

      private
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module Mode
  module CLI
    # Small validation and naming helpers shared by the CLI commands.
    module Helpers
      # Returns true when +file+ is non-nil and names an existing
      # filesystem entry.
      def valid_file?(file)
        !file.nil? && File.exist?(file)
      end

      # Returns true when +table+ has the exact form ACCOUNT/TABLENAME, where
      # both parts consist only of word characters (letters, digits,
      # underscore) and hyphens.
      #
      # The pattern is anchored to the whole string so that inputs such as
      # "a/b/c" or "bad name/table" are rejected instead of matching on a
      # substring. nil and other non-String values are treated as invalid.
      def valid_table?(table)
        !(table.to_s =~ /\A[\w\-]+\/[\w\-]+\z/).nil?
      end

      # Derives a package name from +path+: the last '/'-separated segment,
      # cut at its first '.' (e.g. "data/gdp_quarterly.csv" => "gdp_quarterly").
      def pkg_name(path)
        path.split('/').last.split('.').first
      end

      # Returns the sample rate DataKit's CSV analyzer picks for the size
      # (in bytes) of the file at +path+.
      def sample_rate(path)
        file_size = File.size(path)
        DataKit::CSV::Analyzer.sample_rate(file_size)
      end
    end
  end
end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
require 'tmpdir'

module Mode
  module CLI
    class Base < Thor
      desc "import SOURCE ACCOUNT/TABLENAME [--update | --replace]", "Import a flat file into the Mode data warehouse"
      long_desc <<-LONGDESC
        The import commands allows you to create, update and
        replace tables in the Mode data warehouse with data from
        flat files and data packages. The default action is create
        with optional flags to update or replace a table.

        Data can be imported from one of three source types

        1. CSV File
        \x5> $ mode import gdp_quarterly.csv besquared/gdp_quarterly

        2. JSON File (LD-JSON)
        \x5> $ mode import gdp_quarterly.json besquared/gdp_quarterly

        3. Data Package
        \x5> $ mode import gdp_us/quarterly besquared/gdp_quarterly


        Note: If you do not specify a data package resource name we'll attempt to use the first resource in the package.
      LONGDESC
      option :update, :type => :boolean
      option :replace, :type => :boolean
      option :primary_key, :banner => 'pos1[,pos2,...] (ex: 0,2)'
      # Imports +source+ into the table named by +table+ (ACCOUNT/TABLENAME).
      #
      # When +source+ is a directory it is opened as an existing data package;
      # otherwise it is treated as a flat file and packaged into a temporary
      # directory first. In both cases the first resource of the package is
      # the one handed to Mode::Commands::Import.
      #
      # NOTE: the :update, :replace and :primary_key options are declared
      # above but are not read by this method.
      def import(source, table)
        unless valid_table?(table)
          error "Error: Invalid account or table name given"
          return
        end

        # Only set when we build a throwaway package ourselves; the ensure
        # clause below uses it to decide whether there is anything to remove.
        tmp_dir = nil

        if File.directory?(source)
          unless Mode::Package::Base.exist?(source)
            error "Error: Invalid package given"
            return
          end

          package = Mode::Package::Base.open(source)
        else
          unless valid_file?(source)
            error "Error: Invalid source file given"
            return
          end

          tmp_dir = Dir.mktmpdir
          src_data = Mode::CSV::Parser.new(source)
          builder = Mode::Package::Builder.new(src_data, tmp_dir, pkg_name(source), sample_rate(source))

          package = builder.execute # make the package
        end

        account, table_name = *table.split('/')
        resource_name = package.resources.first.name

        Mode::Commands::Import.new(account, table_name, package, resource_name).execute
      ensure
        # Dir.mktmpdir without a block never cleans up after itself, so remove
        # the temporary package once the import has finished (or failed).
        FileUtils.remove_entry(tmp_dir) if tmp_dir && File.directory?(tmp_dir)
      end
    end
  end
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
module Mode
  module CLI
    class Base < Thor
      desc "package <csv path> <package path>", "Creates a new data package from a csv file"
      option :sample, :banner => '<sampling rate> (ex: 0.5)'
      option :keys, :banner => '<positions> (ex: 0,2)', :default => String.new
      # Runs the package command for the CSV file +csv+ and the target +path+.
      #
      # The --keys option is a comma-separated list of positions; each entry
      # is stripped and converted with to_i before being merged back into
      # the options passed to Mode::Commands::Package.
      def package(csv, path)
        key_positions = options[:keys].split(',').map { |position| position.strip.to_i }
        command = Mode::Commands::Package.new(csv, path, options.merge(keys: key_positions))
        command.execute
      end
    end
  end
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
module Mode
  module CLI
    class Base < Thor
      desc "setup", "Setup a new mode configuration in the given directory (defaults to home)"
      option :dir, :default => '~'
      option :host, :default => 'www.modeanalytics.com'
      # Runs the interactive configuration setup with the parsed options
      # (:dir and :host).
      #
      # The command object for this is Mode::Commands::Setup; the previous
      # reference to Mode::Commands::Init named a class that is not defined
      # and raised NameError as soon as the command ran.
      def setup
        Mode::Commands::Setup.new(options).execute
      end
    end
  end
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
require 'terminal-table'
|
2
|
+
|
3
|
+
module Mode
  module Commands
    # Analyzes a single field (column) of a CSV file with DataKit and prints
    # one table row per analyzed value: row number, detected type and value.
    class AnalyzeField < Thor
      attr_accessor :path, :field_pos, :options

      # path      - location of the CSV file to analyze
      # field_pos - position of the field within the CSV headers (coerced
      #             with to_i)
      # options   - hash of options; only :match_type is read here
      def initialize(path, field_pos, options = {})
        @path = path
        @field_pos = field_pos.to_i
        @options = options
      end

      no_commands do
        include Mode::Commands::Helpers

        # Runs the analysis and prints the result. Reports an error and
        # returns early when no file exists at +path+.
        def execute
          unless !path.nil? && File.exist?(path)
            error "Couldn't find file at #{path}"
            return
          end

          csv = DataKit::CSV::Parser.new(path)
          field_name = csv.headers[field_pos]
          percent_sampled = '%.2f' % (100 * sampling_rate)

          say "Analyzing #{field_name} at #{path || 'input'} (Sampling #{percent_sampled}%)..."

          analyzer_options = {
            :match_type => match_type,
            :sampling_rate => sampling_rate
          }

          analysis, total_time = timer_block do
            DataKit::CSV::FieldAnalyzer.analyze(csv, field_pos, analyzer_options)
          end

          elapsed = '%.2f' % total_time
          say "Analyzed #{analysis.sample_count} of #{analysis.row_count} rows in #{elapsed} seconds\n"

          display(analysis)
        end
      end

      private

      # Prints every analyzed row, grouped by detected type, as a table.
      def display(analysis)
        table = Terminal::Table.new(:headings => ['Row No.', 'Type', 'Value'])

        analysis.types.each do |type, rows|
          rows.each { |row_num| table.add_row [row_num, type, analysis.value_at(row_num)] }
        end

        say table
      end

      # The :match_type option as a symbol, or :any when it was not given.
      def match_type
        requested = options[:match_type]
        requested ? requested.to_sym : :any
      end

      # Currently fixed at 1; the size-based calculation below is disabled.
      def sampling_rate
        1
        # file_size = File.size(path)
        # options[:sample].to_f || DataKit::CSV::SchemaAnalyzer.sampling_rate(file_size)
      end
    end
  end
end
|
@@ -0,0 +1,88 @@
|
|
1
|
+
require 'terminal-table'
|
2
|
+
|
3
|
+
module Mode
  module Commands
    # Analyzes the schema of a CSV file with DataKit and prints one table row
    # per field: its position, name, chosen type and the percentage of
    # sampled rows that matched each known field type.
    class AnalyzeSchema < Thor
      attr_accessor :path
      attr_accessor :options

      # path    - location of the CSV file to analyze
      # options - hash of options; :sample overrides the sampling rate
      def initialize(path, options = {})
        @path = path
        @options = options
      end

      no_commands do
        include Mode::Commands::Helpers

        # Runs the analysis and prints the result. Reports an error and
        # returns early when no file exists at +path+.
        def execute
          if path.nil? || !File.exist?(path)
            error "Couldn't find file at #{path}"
            return
          end

          csv = DataKit::CSV::Parser.new(path)
          rate = sample_rate

          say "Analyzing #{path} (Sampling #{'%.2f' % (100 * rate)}%)..."

          analyzer, total_time = timer_block do
            DataKit::CSV::SchemaAnalyzer.analyze(csv, :sampling_rate => rate)
          end

          say "Analyzed #{analyzer.sample_count} of #{analyzer.row_count} rows in #{'%.2f' % total_time} seconds\n"

          display(analyzer)
        end
      end

      private

      # Builds a right-aligned table cell showing numerator/denominator as a
      # percentage with two decimals. A zero numerator yields an empty cell.
      def format_percentage_cell(numerator, denominator)
        cell = { :alignment => :right }

        if numerator == 0
          cell[:value] = nil
        else
          cell[:value] = '%.2f' % (100 * (numerator / denominator.to_f)) + '%'
        end

        cell
      end

      # Prints the per-field summary table.
      def display(analysis)
        table = Terminal::Table.new(:headings => [
          'Field No.', 'Field', 'Type',
          'String (%)', 'Integer (%)', 'Number (%)',
          'Date/Time (%)', 'Boolean (%)', 'Empty (%)'
        ])

        analysis.fields.each_with_index do |field_name, index|
          row = [index, field_name]

          field_type = analysis.type?(field_name)

          if analysis.has_single_type?(field_name) || analysis.has_only_numeric_types?(field_name)
            row << field_type
          else
            # Fields with mixed, non-numeric types are flagged with '** '.
            # Interpolation is used so a non-String type cannot raise.
            row << "** #{field_type}"
          end

          DataKit::Dataset::Field::Types.each do |type|
            type_count = analysis.type_count(field_name, type)
            row << format_percentage_cell(type_count, analysis.sample_count)
          end

          table.add_row(row)
        end

        say table
      end

      # The sampling rate to use: the :sample option when given, otherwise
      # the rate DataKit suggests for the size of the file.
      #
      # The option is coerced with to_f because a value supplied on the
      # command line may arrive as a String, which would make the
      # `100 * rate` arithmetic in #execute raise a TypeError.
      def sample_rate
        requested = options[:sample]
        return requested.to_f if requested

        DataKit::CSV::SchemaAnalyzer.sampling_rate(File.size(path))
      end
    end
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module Mode
  module Commands
    # Command object for importing one resource of a data package into an
    # account's table. At present it only announces the import; the upload
    # steps are still outlined as comments in #execute.
    class Import < Thor
      attr_accessor :account, :table_name, :package, :resource_name

      def initialize(account, table_name, package, resource_name)
        @account, @table_name = account, table_name
        @package, @resource_name = package, resource_name
      end

      no_commands do
        # Prints which package resource is going to which table.
        def execute
          origin = "#{package.name}/#{resource_name}"
          destination = "#{account}/#{table_name}"

          puts "Importing #{origin} to #{destination}"

          # Before uploading, both the datapackage.md5 and the md5 of the
          # source need to be checked; if either changed, the package has
          # to be reverified.
          #
          # Planned steps:
          #   1. Compress to temporary dir
          #   2. POST /imports with name and zipfile
          #   3. Poll for execution status until finished or timeout
          #      (what's the timeout?)
        end
      end

      private

      # Placeholder: the real check is still commented out, so this
      # currently returns nil.
      def valid_package?(package)
        # package.valid?
      end
    end
  end
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
require 'digest/md5'
require 'fileutils'
|
2
|
+
|
3
|
+
module Mode
  module Commands
    # Creates a new data package directory from a CSV data file.
    class Package < Thor
      attr_accessor :data
      attr_accessor :path
      attr_accessor :name
      attr_accessor :options

      # data    - location of the CSV file to package
      # name    - package name; also used as the directory the package is
      #           created in
      # options - hash of options; :sample overrides the sampling rate
      def initialize(data, name, options = {})
        @data = data
        @path = name
        @name = name
        @options = options
      end

      no_commands do
        # Validates the inputs, creates the package directory and builds the
        # package. Each failed validation reports an error and returns early.
        def execute
          unless valid_data?(data)
            error "Error: Couldn't find valid data file at #{data}"
            return
          end

          unless valid_name?(name)
            error "Error: Data package names can only contain letters, numbers, hyphens and underscores"
            return
          end

          if Mode::Package::Base.exist?(name)
            error "Error: A data package already exists at #{name}"
            return
          else
            FileUtils.mkdir_p(name)
          end

          csv = Mode::CSV::Parser.new(data)
          Mode::Package::Builder.new(csv, path, name, sample_rate).execute

          say "Finished packaging #{name}!"
          # Point at the CLI's import command (`import SOURCE ACCOUNT/TABLENAME`).
          say "Use `mode import #{name} <account>/<tablename>` to create a table in the public data warehouse"
        end
      end

      private

      # True when +name+ consists only of word characters and hyphens.
      def valid_name?(name)
        name =~ /\A[\w\d\-\_]+\z/
      end

      # True when +data+ is non-nil and names an existing filesystem entry.
      def valid_data?(data)
        !data.nil? && File.exist?(data)
      end

      # The sampling rate handed to the builder: the :sample option when
      # given (coerced with to_f, since it may arrive as a String), otherwise
      # the rate the CSV analyzer picks for the size of the data file.
      def sample_rate
        requested = options[:sample]
        return requested.to_f if requested

        Mode::CSV::Analyzer.sample_rate(File.size(data))
      end
    end
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
module Mode
  module Commands
    # Interactive setup: creates (or overwrites) the Mode configuration file
    # and fills it with a username and access token asked from the user.
    class Setup < Thor
      attr_accessor :options

      # options - hash of options; :dir is the directory holding the
      #           configuration and :host the Mode host used in the
      #           access-token hint
      def initialize(options = {})
        @options = options
      end

      no_commands do
        # Walks the user through the configuration. When a configuration
        # already exists the user is asked to confirm before it is replaced.
        def execute
          # Fall back to the same defaults the CLI declares, so that an
          # options hash without :dir or :host (including the constructor's
          # own default of {}) does not crash on File.join(nil, ...).
          dir = options[:dir] || '~'
          host = options[:host] || 'www.modeanalytics.com'

          config_path = Mode::Config.full_path(dir)

          if File.exist?(config_path)
            say "Configuration at #{config_path} already exists"
            unless yes? "Would you like to modify the existing configuration? (y/n):"
              return
            end
          end

          say "Initializing configuration at #{config_path}"

          config = Mode::Config.init(dir)

          config.username = ask "Mode username:"

          say "You can view your access tokens at http://#{host}/accounts/#{config.username}/access_tokens"

          config.access_token = ask "Access token for #{config.username}:"

          config.save
        end
      end
    end
  end
end
|
data/lib/mode/config.rb
ADDED
@@ -0,0 +1,54 @@
|
|
1
|
+
require 'yaml'
|
2
|
+
|
3
|
+
module Mode
  # Loads and saves the Mode configuration file, a YAML document holding
  # the username and access token.
  class Config
    FILENAME = '.mode.yml'

    # Permissions applied when the configuration file is created: readable
    # and writable by the owner only, because the file stores an access token.
    FILE_MODE = 0600

    attr_accessor :path

    attr_accessor :username, :access_token

    # Loads the configuration stored in directory +path+.
    #
    # path     - directory containing the configuration file
    # filename - optional file name; defaults to FILENAME
    #
    # Raises RuntimeError when the file does not exist.
    def initialize(path, filename = nil)
      @path = self.class.full_path(path, filename)

      unless File.exist?(@path)
        raise "Could not load configuration file from #{@path}"
      end

      # An empty file makes YAML.load_file return a falsy value; treat that
      # as an empty configuration instead of failing inside #configure.
      configure(YAML.load_file(@path) || {})
    end

    # Writes the current username and access token to the file at #path.
    def save
      File.open(path, 'w+', FILE_MODE) do |file|
        file.write(to_yaml)
      end
    end

    class << self
      # Writes an empty configuration file into directory +path+ (replacing
      # any existing content) and returns a Config loaded from it.
      def init(path, filename = nil)
        File.open(full_path(path, filename), 'w+', Mode::Config::FILE_MODE) do |file|
          file.write({}.to_yaml)
        end

        new(path, filename)
      end

      # Expanded, absolute location of the configuration file inside
      # directory +path+.
      def full_path(path, filename = nil)
        File.expand_path(File.join(path, filename || Mode::Config::FILENAME))
      end
    end

    private

    # Copies the known keys out of the parsed YAML hash.
    def configure(config)
      @username = config['username']
      @access_token = config['access_token']
    end

    # Serializes the stored settings as a YAML document.
    def to_yaml
      {
        'username' => username,
        'access_token' => access_token
      }.to_yaml
    end
  end
end
|