flat_kit 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/HISTORY.md +6 -0
- data/Manifest.txt +34 -0
- data/Rakefile +1 -0
- data/examples/stream-active-record-to-csv.rb +42 -0
- data/lib/flat_kit.rb +9 -2
- data/lib/flat_kit/cli.rb +11 -0
- data/lib/flat_kit/command.rb +1 -0
- data/lib/flat_kit/command/stats.rb +94 -0
- data/lib/flat_kit/descendant_tracker.rb +9 -0
- data/lib/flat_kit/event_emitter.rb +2 -2
- data/lib/flat_kit/field_stats.rb +241 -0
- data/lib/flat_kit/field_type.rb +75 -0
- data/lib/flat_kit/field_type/boolean_type.rb +48 -0
- data/lib/flat_kit/field_type/date_type.rb +179 -0
- data/lib/flat_kit/field_type/float_type.rb +37 -0
- data/lib/flat_kit/field_type/guess_type.rb +20 -0
- data/lib/flat_kit/field_type/integer_type.rb +34 -0
- data/lib/flat_kit/field_type/null_type.rb +35 -0
- data/lib/flat_kit/field_type/string_type.rb +22 -0
- data/lib/flat_kit/field_type/timestamp_type.rb +47 -0
- data/lib/flat_kit/field_type/unknown_type.rb +26 -0
- data/lib/flat_kit/jsonl/record.rb +2 -2
- data/lib/flat_kit/jsonl/writer.rb +18 -9
- data/lib/flat_kit/merge.rb +4 -2
- data/lib/flat_kit/output.rb +4 -0
- data/lib/flat_kit/position.rb +19 -0
- data/lib/flat_kit/stat_type.rb +65 -0
- data/lib/flat_kit/stat_type/nominal_stats.rb +58 -0
- data/lib/flat_kit/stat_type/numerical_stats.rb +120 -0
- data/lib/flat_kit/stat_type/ordinal_stats.rb +42 -0
- data/lib/flat_kit/stats.rb +66 -0
- data/lib/flat_kit/writer.rb +17 -2
- data/lib/flat_kit/xsv/writer.rb +24 -9
- data/test/field_type/test_boolean_type.rb +65 -0
- data/test/field_type/test_date_type.rb +71 -0
- data/test/field_type/test_float_type.rb +56 -0
- data/test/field_type/test_guess_type.rb +14 -0
- data/test/field_type/test_integer_type.rb +52 -0
- data/test/field_type/test_null_type.rb +41 -0
- data/test/field_type/test_string_type.rb +18 -0
- data/test/field_type/test_timestamp_type.rb +108 -0
- data/test/field_type/test_unknown_type.rb +35 -0
- data/test/jsonl/test_writer.rb +21 -3
- data/test/run +23 -0
- data/test/stat_type/test_nominal_stats.rb +69 -0
- data/test/stat_type/test_numerical_stats.rb +118 -0
- data/test/stat_type/test_ordinal_stats.rb +92 -0
- data/test/test_event_emitter.rb +19 -2
- data/test/test_field_stats.rb +134 -0
- data/test/test_field_type.rb +34 -0
- data/test/xsv/test_writer.rb +25 -4
- metadata +65 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8ee02f6b5e9ed51f565da86c04a0d4600cc0af271e3c8b90f67e22f00ff450fd
|
4
|
+
data.tar.gz: ee58c4ee864c91dc2e11c429891a5c4a7455541f3747f745aa8de0203b6c2142
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1cbcb7d15633b06818d0647935475f1e87f9944baa2c0c8450fb9bb54bcad3c4f82a178c5ef3eb43529f05ef61091958f1eee751291dcf45f3632a18c2b0bfeb
|
7
|
+
data.tar.gz: ff78cd8f3e0795da93f50fc796b209a996a10b4fda7524ebf0878895ca467bbb178e2f4bf6da9547d08ba7c5fff1c8791a22c5f4341a59f569af0b22e8d8858c
|
data/HISTORY.md
CHANGED
@@ -1,4 +1,10 @@
|
|
1
1
|
# FlatKit Changelog
|
2
|
+
## Version 0.3.0
|
3
|
+
|
4
|
+
* Change the event listening API to include metadata about the event
|
5
|
+
* Add field type detection
|
6
|
+
* Add a 'stats' command to generate stats about the data file
|
7
|
+
|
2
8
|
## Version 0.2.0
|
3
9
|
|
4
10
|
* add in event listening to allow for additional integrations
|
data/Manifest.txt
CHANGED
@@ -5,15 +5,28 @@ Manifest.txt
|
|
5
5
|
README.md
|
6
6
|
Rakefile
|
7
7
|
bin/fk
|
8
|
+
examples/stream-active-record-to-csv.rb
|
8
9
|
lib/flat_kit.rb
|
9
10
|
lib/flat_kit/cli.rb
|
10
11
|
lib/flat_kit/command.rb
|
11
12
|
lib/flat_kit/command/cat.rb
|
12
13
|
lib/flat_kit/command/merge.rb
|
13
14
|
lib/flat_kit/command/sort.rb
|
15
|
+
lib/flat_kit/command/stats.rb
|
14
16
|
lib/flat_kit/descendant_tracker.rb
|
15
17
|
lib/flat_kit/error.rb
|
16
18
|
lib/flat_kit/event_emitter.rb
|
19
|
+
lib/flat_kit/field_stats.rb
|
20
|
+
lib/flat_kit/field_type.rb
|
21
|
+
lib/flat_kit/field_type/boolean_type.rb
|
22
|
+
lib/flat_kit/field_type/date_type.rb
|
23
|
+
lib/flat_kit/field_type/float_type.rb
|
24
|
+
lib/flat_kit/field_type/guess_type.rb
|
25
|
+
lib/flat_kit/field_type/integer_type.rb
|
26
|
+
lib/flat_kit/field_type/null_type.rb
|
27
|
+
lib/flat_kit/field_type/string_type.rb
|
28
|
+
lib/flat_kit/field_type/timestamp_type.rb
|
29
|
+
lib/flat_kit/field_type/unknown_type.rb
|
17
30
|
lib/flat_kit/format.rb
|
18
31
|
lib/flat_kit/input.rb
|
19
32
|
lib/flat_kit/input/file.rb
|
@@ -31,11 +44,17 @@ lib/flat_kit/merge_tree.rb
|
|
31
44
|
lib/flat_kit/output.rb
|
32
45
|
lib/flat_kit/output/file.rb
|
33
46
|
lib/flat_kit/output/io.rb
|
47
|
+
lib/flat_kit/position.rb
|
34
48
|
lib/flat_kit/reader.rb
|
35
49
|
lib/flat_kit/record.rb
|
36
50
|
lib/flat_kit/sentinel_internal_node.rb
|
37
51
|
lib/flat_kit/sentinel_leaf_node.rb
|
38
52
|
lib/flat_kit/sort.rb
|
53
|
+
lib/flat_kit/stat_type.rb
|
54
|
+
lib/flat_kit/stat_type/nominal_stats.rb
|
55
|
+
lib/flat_kit/stat_type/numerical_stats.rb
|
56
|
+
lib/flat_kit/stat_type/ordinal_stats.rb
|
57
|
+
lib/flat_kit/stats.rb
|
39
58
|
lib/flat_kit/writer.rb
|
40
59
|
lib/flat_kit/xsv.rb
|
41
60
|
lib/flat_kit/xsv/format.rb
|
@@ -47,6 +66,15 @@ tasks/extension.rake
|
|
47
66
|
tasks/man.rake
|
48
67
|
tasks/this.rb
|
49
68
|
test/device_dataset.rb
|
69
|
+
test/field_type/test_boolean_type.rb
|
70
|
+
test/field_type/test_date_type.rb
|
71
|
+
test/field_type/test_float_type.rb
|
72
|
+
test/field_type/test_guess_type.rb
|
73
|
+
test/field_type/test_integer_type.rb
|
74
|
+
test/field_type/test_null_type.rb
|
75
|
+
test/field_type/test_string_type.rb
|
76
|
+
test/field_type/test_timestamp_type.rb
|
77
|
+
test/field_type/test_unknown_type.rb
|
50
78
|
test/input/test_file.rb
|
51
79
|
test/input/test_io.rb
|
52
80
|
test/jsonl/test_format.rb
|
@@ -55,8 +83,14 @@ test/jsonl/test_record.rb
|
|
55
83
|
test/jsonl/test_writer.rb
|
56
84
|
test/output/test_file.rb
|
57
85
|
test/output/test_io.rb
|
86
|
+
test/run
|
87
|
+
test/stat_type/test_nominal_stats.rb
|
88
|
+
test/stat_type/test_numerical_stats.rb
|
89
|
+
test/stat_type/test_ordinal_stats.rb
|
58
90
|
test/test_conversions.rb
|
59
91
|
test/test_event_emitter.rb
|
92
|
+
test/test_field_stats.rb
|
93
|
+
test/test_field_type.rb
|
60
94
|
test/test_format.rb
|
61
95
|
test/test_helper.rb
|
62
96
|
test/test_merge.rb
|
data/Rakefile
CHANGED
@@ -10,6 +10,7 @@ This.ruby_gemspec do |spec|
|
|
10
10
|
spec.add_dependency('oj', '~> 3.0')
|
11
11
|
spec.add_dependency('optimist', '~> 3.0')
|
12
12
|
|
13
|
+
spec.add_development_dependency( 'faker' , '~> 2.16')
|
13
14
|
spec.add_development_dependency( 'rake' , '~> 13.0')
|
14
15
|
spec.add_development_dependency( 'minitest' , '~> 5.11' )
|
15
16
|
spec.add_development_dependency( 'minitest-focus' , '~> 1.2' )
|
@@ -0,0 +1,42 @@
|
|
1
|
+
#!/usr/bin/env ruby

#------------------------------------------------------------------------------
# This is an example to show how to stream an active record scope to a CSV file
# using FlatKit.
#------------------------------------------------------------------------------

require 'flat_kit'     # gem 'flat_kit'
require 'progress_bar' # gem 'progress-bar'

# Get an appropriate scope from one of your models - or any scope for that
# matter.
scope = MyActiveRecordModel.all

# Output to a file that is csv, and automatically gzipped
# (per the destination file name).
output_csv = ::FlatKit::Xsv::Writer.new(destination: "export.csv.gz")

begin
  # handy progress bar
  bar = ProgressBar.new(scope.count)

  # Use active record in batches to not pull all the records from the database
  # at once.
  #
  # https://api.rubyonrails.org/classes/ActiveRecord/Batches.html#method-i-find_each
  scope.find_each do |record|
    # Generate an XSV Record by pulling the attributes out of the active record
    # model. You may also want to generate a hash from a query or something
    # along those lines. In any case pass in a Hash to complete_structured_data:
    # and nil to data.
    xsv_record = ::FlatKit::Xsv::Record.new(data: nil, complete_structured_data: record.attributes)

    # FlatKit will automatically handle writing out the header line based upon
    # the fields in the first record.
    output_csv.write(xsv_record)

    bar.increment!
  end
ensure
  # Close the output file explicitly, even if the export raised part way
  # through, so the file handle is always released.
  output_csv.close
end
|
data/lib/flat_kit.rb
CHANGED
@@ -1,11 +1,14 @@
|
|
1
1
|
module FlatKit
|
2
|
-
VERSION = "0.
|
2
|
+
VERSION = "0.3.0"
|
3
3
|
end
|
4
4
|
require 'flat_kit/error'
|
5
|
+
require 'flat_kit/descendant_tracker'
|
5
6
|
require 'flat_kit/logger'
|
6
7
|
require 'flat_kit/event_emitter'
|
7
|
-
|
8
|
+
|
9
|
+
require 'flat_kit/field_type'
|
8
10
|
require 'flat_kit/format'
|
11
|
+
require 'flat_kit/position'
|
9
12
|
require 'flat_kit/record'
|
10
13
|
require 'flat_kit/reader'
|
11
14
|
require 'flat_kit/writer'
|
@@ -16,6 +19,10 @@ require 'flat_kit/xsv'
|
|
16
19
|
require 'flat_kit/jsonl'
|
17
20
|
require 'flat_kit/merge'
|
18
21
|
require 'flat_kit/sort'
|
22
|
+
require 'flat_kit/stats'
|
23
|
+
|
24
|
+
require 'flat_kit/stat_type'
|
25
|
+
require 'flat_kit/field_stats'
|
19
26
|
|
20
27
|
require 'flat_kit/merge_tree'
|
21
28
|
require 'flat_kit/internal_node'
|
data/lib/flat_kit/cli.rb
CHANGED
@@ -72,7 +72,18 @@ module FlatKit
|
|
72
72
|
::FlatKit.logger.debug argv
|
73
73
|
|
74
74
|
command_name = argv.shift
|
75
|
+
if command_name.downcase == "help" then
|
76
|
+
parser.educate
|
77
|
+
exit 0
|
78
|
+
end
|
79
|
+
|
75
80
|
command_klass = FlatKit::Command.for(command_name)
|
81
|
+
if command_klass.nil? then
|
82
|
+
$stdout.puts "ERROR: Unknown command '#{command_name}'"
|
83
|
+
parser.educate
|
84
|
+
exit 0
|
85
|
+
end
|
86
|
+
|
76
87
|
command = command_klass.new(argv: argv, logger: ::FlatKit.logger, env: env)
|
77
88
|
command.call
|
78
89
|
end
|
data/lib/flat_kit/command.rb
CHANGED
@@ -0,0 +1,94 @@
|
|
1
|
+
module FlatKit
  class Command
    # The `stats` sub-command: parses the command line and builds a
    # ::FlatKit::Stats instance that is run when the command is called.
    class Stats < ::FlatKit::Command

      # The name used to select this command on the command line.
      def self.name
        "stats"
      end

      # One line description of this command; also used as the first banner
      # line of the help output.
      def self.description
        "Collect and report stats on the inputfile"
      end

      # Build the Optimist parser describing the banner text and the options
      # accepted by this command.
      def self.parser
        ::Optimist::Parser.new do
          # Use this command's own description (not Sort's) as the banner.
          banner "#{Stats.description}"
          banner ""

          banner <<~BANNER
            Given an input file collect basic statistics.

            The statistics can vary based upon the datatype of the field.

            Numeric fields will report the basic count, min, max, mean, standard deviation and sum.
            Non-numeric fields that are comparable, like dates, will report count, min and max.
            Other non-numeric fields will only report the count.

            Adding --cardinality will report the count, and frequency of distinct values in the result.
            This will allow for reporting the median value.

            The fields upon which stats are collected may be selected with the --select parameter.
            By default statistics are collected on all fields.

            The flatfile type(s) will be automatically determined by the file name.

            The output can be dumped as a CSV, JSON or a formatted ascii table.

          BANNER

          banner <<~USAGE

            Usage:
            fk stats --everything file.json
            fk stats --select surname,given_name file.csv
            fk stats --select surname,given_name --output-format json file.csv > stats.json
            fk stats --select field1,field2 --output-format json input.csv
            fk stats --select field1 file.json.gz -o stats.csv
            gunzip -c file.json.gz | fk stats --input-format json --output-format text

          USAGE

          banner <<~OPTIONS

            Options:

          OPTIONS

          opt :output, "Send the output to the given path instead of standard out.", default: "<stdout>"
          opt :input_format, "Input format, csv or json", default: "auto", short: :none
          opt :output_format, "Output format, csv or json", default: "auto", short: :none
          opt :select, "The comma separated list of field(s) to report stats on", required: false, type: :string
          opt :everything, "Show all statistics that are possible", default: false
          opt :cardinality, "Show the cardinality of the fields, this requires additional memory", default: false
        end
      end

      # Parse `argv` (presumably provided by the parent Command class - confirm
      # there) and store the configured ::FlatKit::Stats in @stats.
      #
      # At most one input path is accepted; with no path, "-" is used so that
      # input is read from stdin. Any ::FlatKit::Error raised while building
      # the stats object is re-raised as an Optimist::CommandlineError so it is
      # reported through Optimist's standard exception handling.
      def parse
        parser = self.class.parser
        ::Optimist::with_standard_exception_handling(parser) do
          begin
            opts = parser.parse(argv)

            # --select is a comma separated list; default to all fields.
            fields = opts[:select] ? CSV.parse_line(opts[:select]) : ::FlatKit::Stats::AllFields

            # Core stats are always collected; cardinality only on request.
            stats = [FieldStats::CORE_STATS]
            stats << FieldStats::CARDINALITY_STATS if opts[:cardinality] || opts[:everything]

            paths = parser.leftovers
            raise ::Optimist::CommandlineError, "1 and only 1 input file is allowed" if paths.size > 1

            path = paths.first || "-" # default to stdin
            @stats = ::FlatKit::Stats.new(input: path, input_fallback: opts[:input_format],
                                          output: opts[:output], output_fallback: opts[:output_format],
                                          fields_to_stat: fields, stats_to_collect: stats)
          rescue ::FlatKit::Error => e
            raise ::Optimist::CommandlineError, e.message
          end
        end
      end

      # Run the stats collection configured by #parse.
      def call
        @stats.call
      end
    end
  end
end
|
@@ -23,5 +23,14 @@ module FlatKit
|
|
23
23
|
child_klass.send(method, *args)
|
24
24
|
end
|
25
25
|
end
|
26
|
+
|
27
|
+
#
|
28
|
+
# Find all the children that return truthy from the given method with args
|
29
|
+
#
|
30
|
+
def find_children(method, *args)
  # Keep every child class for which sending +method+ (with +args+) returns a
  # truthy value.
  children.select { |klass| klass.send(method, *args) }
end
|
26
35
|
end
|
27
36
|
end
|
@@ -20,9 +20,9 @@ module FlatKit
|
|
20
20
|
_listeners.clear
|
21
21
|
end
|
22
22
|
|
23
|
-
def notify_listeners(name:, data:)
|
23
|
+
def notify_listeners(name:, data:, meta: nil)
|
24
24
|
_listeners.each do |l|
|
25
|
-
l.on_event(name: name, data: data)
|
25
|
+
l.on_event(name: name, data: data, meta: meta)
|
26
26
|
end
|
27
27
|
end
|
28
28
|
|
@@ -0,0 +1,241 @@
|
|
1
|
+
module FlatKit
  # Collect stats on a single field. We may not know what the field data type is
  # to start with, so collect a bunch of values until we have the threshold, and
  # then calculate stats based upon the data type determined by the guess.
  #
  # When an explicit +type:+ is given the stats collectors are created
  # immediately and every value is fed straight into them.
  class FieldStats
    DEFAULT_GUESS_THRESHOLD = 1000

    CORE_STATS        = :core
    CARDINALITY_STATS = :cardinality

    ALL_STATS = [CORE_STATS, CARDINALITY_STATS].freeze

    # The methods, by name and in order, that #to_hash exports.
    EXPORT_FIELDS = %w[
      name
      type
      count
      max
      mean
      min
      stddev
      sum
      mode
      unique_count

      max_length
      mean_length
      min_length
      stddev_length
      mode_length
      unique_count_lengths

      null_count
      unknown_count
      out_of_type_count
      total_count
      null_percent
      unknown_percent
    ].freeze

    attr_reader :type_counts
    attr_reader :field_type
    attr_reader :name

    # Number of values that could not be coerced to the field type.
    attr_reader :out_of_type_count

    # name:             the field name
    # stats_to_collect: one of, or an array of, CORE_STATS / CARDINALITY_STATS
    # type:             a direct subclass of FieldType; GuessType (the default)
    #                   defers the decision until enough values are seen
    # guess_threshold:  how many values to buffer before resolving the guess
    #
    # Raises ArgumentError on an unknown stats collection set or an invalid type.
    def initialize(name:, stats_to_collect: CORE_STATS, type: ::FlatKit::FieldType::GuessType, guess_threshold: DEFAULT_GUESS_THRESHOLD)
      @name              = name
      @field_type        = type
      @guess_threshold   = guess_threshold
      @type_counts       = Hash.new(0)
      @out_of_type_count = 0
      @values            = []
      @stats             = nil
      @length_stats      = nil
      @stats_to_collect  = [stats_to_collect].flatten

      @stats_to_collect.each do |collection_set|
        next if ALL_STATS.include?(collection_set)
        raise ArgumentError, "#{collection_set} is not a valid stats collection set, must be one of #{ALL_STATS.map(&:to_s).join(", ") }"
      end
      raise ArgumentError, "type: must be FieldType subclasses - not #{type}" unless type.kind_of?(Class) && (type.superclass == ::FlatKit::FieldType)

      # With an explicit type there is no guess to resolve later, so the
      # collectors must exist before the first update.
      build_stats if field_type_determined?
    end

    # True once the field type is something other than GuessType.
    def field_type_determined?
      @field_type != ::FlatKit::FieldType::GuessType
    end

    # Record a value: tally its best-guess type, then either update the stats
    # directly or buffer it until the guess threshold is reached.
    def update(value)
      update_type_count(value)

      if field_type_determined?
        update_stats(value)
      else
        @values << value
        resolve_guess if @values.size >= @guess_threshold
      end
    end

    def collecting_frequencies?
      @stats_to_collect.include?(CARDINALITY_STATS)
    end

    def type
      @field_type.type_name
    end

    def count
      stats.count
    end

    # max, mean, min, stddev and sum are only reported when the underlying
    # stats object supports them; otherwise nil.
    def max
      stats.max if stats.respond_to?(:max)
    end

    def mean
      stats.mean if stats.respond_to?(:mean)
    end

    def min
      stats.min if stats.respond_to?(:min)
    end

    def stddev
      stats.stddev if stats.respond_to?(:stddev)
    end

    def sum
      stats.sum if stats.respond_to?(:sum)
    end

    # The frequency based values are only reported when cardinality stats
    # were requested; otherwise nil.
    def mode
      stats.mode if collecting_frequencies?
    end

    def unique_count
      stats.unique_count if collecting_frequencies?
    end

    def unique_values
      stats.unique_values if collecting_frequencies?
    end

    def frequencies
      stats.frequencies if collecting_frequencies?
    end

    # Length stats only exist for string fields; these return nil otherwise.
    def min_length
      length_stats.min if @length_stats
    end

    def max_length
      length_stats.max if @length_stats
    end

    def mean_length
      length_stats.mean if @length_stats
    end

    def stddev_length
      length_stats.stddev if @length_stats
    end

    def mode_length
      length_stats.mode if @length_stats && collecting_frequencies?
    end

    def unique_count_lengths
      length_stats.unique_count if @length_stats && collecting_frequencies?
    end

    def unique_values_lengths
      length_stats.unique_values if @length_stats && collecting_frequencies?
    end

    def length_frequencies
      length_stats.frequencies if @length_stats && collecting_frequencies?
    end

    def null_count
      type_counts[FieldType::NullType]
    end

    # Values counted in the stats plus those that failed coercion.
    def total_count
      stats.count + @out_of_type_count
    end

    def null_percent
      return 0 if total_count.zero?
      ((null_count.to_f / total_count) * 100.0).truncate(2)
    end

    def unknown_count
      type_counts[FieldType::UnknownType]
    end

    def unknown_percent
      return 0 if total_count.zero?
      ((unknown_count.to_f / total_count) * 100.0).truncate(2)
    end

    # Export every EXPORT_FIELDS value keyed by its name, resolving the type
    # guess first if that has not happened yet.
    def to_hash
      resolve_guess

      EXPORT_FIELDS.each_with_object({}) do |field, h|
        h[field] = public_send(field)
      end
    end

    private

    def stats
      resolve_guess
      @stats
    end

    def length_stats
      resolve_guess
      @length_stats
    end

    # Create the stats collectors for the current field type. String fields
    # additionally get numerical stats on the value lengths.
    def build_stats
      @stats = StatType.for(@field_type).new(collecting_frequencies: collecting_frequencies?)
      if @field_type == ::FlatKit::FieldType::StringType
        @length_stats = ::FlatKit::StatType::NumericalStats.new(collecting_frequencies: collecting_frequencies?)
      end
    end

    # Coerce the value to the field type and feed it to the collectors; values
    # that fail coercion are only counted as out of type.
    def update_stats(value)
      coerced_value = @field_type.coerce(value)
      if coerced_value == FieldType::CoerceFailure
        @out_of_type_count += 1
        return
      end

      @stats.update(coerced_value)
      @length_stats.update(coerced_value.to_s.length) if @length_stats
    end

    def update_type_count(value)
      guess = FieldType.best_guess(value)
      type_counts[guess] += 1
      guess
    end

    # Settle on the most frequently guessed type, build the collectors for it
    # and replay the buffered values through them.
    def resolve_guess
      return if field_type_determined?

      # NOTE(review): if no value was ever recorded, type_counts is empty and
      # the best guess is nil - confirm how StatType.for should treat that.
      best_guess_type, _best_guess_count = type_counts.max_by { |_type, seen| seen }
      @field_type = best_guess_type
      build_stats
      @values.each { |v| update_stats(v) }
      @values.clear
    end
  end
end
|