flat_kit 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/HISTORY.md +6 -0
- data/Manifest.txt +34 -0
- data/Rakefile +1 -0
- data/examples/stream-active-record-to-csv.rb +42 -0
- data/lib/flat_kit.rb +9 -2
- data/lib/flat_kit/cli.rb +11 -0
- data/lib/flat_kit/command.rb +1 -0
- data/lib/flat_kit/command/stats.rb +94 -0
- data/lib/flat_kit/descendant_tracker.rb +9 -0
- data/lib/flat_kit/event_emitter.rb +2 -2
- data/lib/flat_kit/field_stats.rb +241 -0
- data/lib/flat_kit/field_type.rb +75 -0
- data/lib/flat_kit/field_type/boolean_type.rb +48 -0
- data/lib/flat_kit/field_type/date_type.rb +179 -0
- data/lib/flat_kit/field_type/float_type.rb +37 -0
- data/lib/flat_kit/field_type/guess_type.rb +20 -0
- data/lib/flat_kit/field_type/integer_type.rb +34 -0
- data/lib/flat_kit/field_type/null_type.rb +35 -0
- data/lib/flat_kit/field_type/string_type.rb +22 -0
- data/lib/flat_kit/field_type/timestamp_type.rb +47 -0
- data/lib/flat_kit/field_type/unknown_type.rb +26 -0
- data/lib/flat_kit/jsonl/record.rb +2 -2
- data/lib/flat_kit/jsonl/writer.rb +18 -9
- data/lib/flat_kit/merge.rb +4 -2
- data/lib/flat_kit/output.rb +4 -0
- data/lib/flat_kit/position.rb +19 -0
- data/lib/flat_kit/stat_type.rb +65 -0
- data/lib/flat_kit/stat_type/nominal_stats.rb +58 -0
- data/lib/flat_kit/stat_type/numerical_stats.rb +120 -0
- data/lib/flat_kit/stat_type/ordinal_stats.rb +42 -0
- data/lib/flat_kit/stats.rb +66 -0
- data/lib/flat_kit/writer.rb +17 -2
- data/lib/flat_kit/xsv/writer.rb +24 -9
- data/test/field_type/test_boolean_type.rb +65 -0
- data/test/field_type/test_date_type.rb +71 -0
- data/test/field_type/test_float_type.rb +56 -0
- data/test/field_type/test_guess_type.rb +14 -0
- data/test/field_type/test_integer_type.rb +52 -0
- data/test/field_type/test_null_type.rb +41 -0
- data/test/field_type/test_string_type.rb +18 -0
- data/test/field_type/test_timestamp_type.rb +108 -0
- data/test/field_type/test_unknown_type.rb +35 -0
- data/test/jsonl/test_writer.rb +21 -3
- data/test/run +23 -0
- data/test/stat_type/test_nominal_stats.rb +69 -0
- data/test/stat_type/test_numerical_stats.rb +118 -0
- data/test/stat_type/test_ordinal_stats.rb +92 -0
- data/test/test_event_emitter.rb +19 -2
- data/test/test_field_stats.rb +134 -0
- data/test/test_field_type.rb +34 -0
- data/test/xsv/test_writer.rb +25 -4
- metadata +65 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8ee02f6b5e9ed51f565da86c04a0d4600cc0af271e3c8b90f67e22f00ff450fd
|
4
|
+
data.tar.gz: ee58c4ee864c91dc2e11c429891a5c4a7455541f3747f745aa8de0203b6c2142
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1cbcb7d15633b06818d0647935475f1e87f9944baa2c0c8450fb9bb54bcad3c4f82a178c5ef3eb43529f05ef61091958f1eee751291dcf45f3632a18c2b0bfeb
|
7
|
+
data.tar.gz: ff78cd8f3e0795da93f50fc796b209a996a10b4fda7524ebf0878895ca467bbb178e2f4bf6da9547d08ba7c5fff1c8791a22c5f4341a59f569af0b22e8d8858c
|
data/HISTORY.md
CHANGED
@@ -1,4 +1,10 @@
|
|
1
1
|
# FlatKit Changelog
|
2
|
+
## Version 0.3.0
|
3
|
+
|
4
|
+
* Change the event listening API to include metadata about the event
|
5
|
+
* Add field type detection
|
6
|
+
* Add a 'stats' command to generate stats about the data file
|
7
|
+
|
2
8
|
## Version 0.2.0
|
3
9
|
|
4
10
|
* add in event listening to allow for additional integrations
|
data/Manifest.txt
CHANGED
@@ -5,15 +5,28 @@ Manifest.txt
|
|
5
5
|
README.md
|
6
6
|
Rakefile
|
7
7
|
bin/fk
|
8
|
+
examples/stream-active-record-to-csv.rb
|
8
9
|
lib/flat_kit.rb
|
9
10
|
lib/flat_kit/cli.rb
|
10
11
|
lib/flat_kit/command.rb
|
11
12
|
lib/flat_kit/command/cat.rb
|
12
13
|
lib/flat_kit/command/merge.rb
|
13
14
|
lib/flat_kit/command/sort.rb
|
15
|
+
lib/flat_kit/command/stats.rb
|
14
16
|
lib/flat_kit/descendant_tracker.rb
|
15
17
|
lib/flat_kit/error.rb
|
16
18
|
lib/flat_kit/event_emitter.rb
|
19
|
+
lib/flat_kit/field_stats.rb
|
20
|
+
lib/flat_kit/field_type.rb
|
21
|
+
lib/flat_kit/field_type/boolean_type.rb
|
22
|
+
lib/flat_kit/field_type/date_type.rb
|
23
|
+
lib/flat_kit/field_type/float_type.rb
|
24
|
+
lib/flat_kit/field_type/guess_type.rb
|
25
|
+
lib/flat_kit/field_type/integer_type.rb
|
26
|
+
lib/flat_kit/field_type/null_type.rb
|
27
|
+
lib/flat_kit/field_type/string_type.rb
|
28
|
+
lib/flat_kit/field_type/timestamp_type.rb
|
29
|
+
lib/flat_kit/field_type/unknown_type.rb
|
17
30
|
lib/flat_kit/format.rb
|
18
31
|
lib/flat_kit/input.rb
|
19
32
|
lib/flat_kit/input/file.rb
|
@@ -31,11 +44,17 @@ lib/flat_kit/merge_tree.rb
|
|
31
44
|
lib/flat_kit/output.rb
|
32
45
|
lib/flat_kit/output/file.rb
|
33
46
|
lib/flat_kit/output/io.rb
|
47
|
+
lib/flat_kit/position.rb
|
34
48
|
lib/flat_kit/reader.rb
|
35
49
|
lib/flat_kit/record.rb
|
36
50
|
lib/flat_kit/sentinel_internal_node.rb
|
37
51
|
lib/flat_kit/sentinel_leaf_node.rb
|
38
52
|
lib/flat_kit/sort.rb
|
53
|
+
lib/flat_kit/stat_type.rb
|
54
|
+
lib/flat_kit/stat_type/nominal_stats.rb
|
55
|
+
lib/flat_kit/stat_type/numerical_stats.rb
|
56
|
+
lib/flat_kit/stat_type/ordinal_stats.rb
|
57
|
+
lib/flat_kit/stats.rb
|
39
58
|
lib/flat_kit/writer.rb
|
40
59
|
lib/flat_kit/xsv.rb
|
41
60
|
lib/flat_kit/xsv/format.rb
|
@@ -47,6 +66,15 @@ tasks/extension.rake
|
|
47
66
|
tasks/man.rake
|
48
67
|
tasks/this.rb
|
49
68
|
test/device_dataset.rb
|
69
|
+
test/field_type/test_boolean_type.rb
|
70
|
+
test/field_type/test_date_type.rb
|
71
|
+
test/field_type/test_float_type.rb
|
72
|
+
test/field_type/test_guess_type.rb
|
73
|
+
test/field_type/test_integer_type.rb
|
74
|
+
test/field_type/test_null_type.rb
|
75
|
+
test/field_type/test_string_type.rb
|
76
|
+
test/field_type/test_timestamp_type.rb
|
77
|
+
test/field_type/test_unknown_type.rb
|
50
78
|
test/input/test_file.rb
|
51
79
|
test/input/test_io.rb
|
52
80
|
test/jsonl/test_format.rb
|
@@ -55,8 +83,14 @@ test/jsonl/test_record.rb
|
|
55
83
|
test/jsonl/test_writer.rb
|
56
84
|
test/output/test_file.rb
|
57
85
|
test/output/test_io.rb
|
86
|
+
test/run
|
87
|
+
test/stat_type/test_nominal_stats.rb
|
88
|
+
test/stat_type/test_numerical_stats.rb
|
89
|
+
test/stat_type/test_ordinal_stats.rb
|
58
90
|
test/test_conversions.rb
|
59
91
|
test/test_event_emitter.rb
|
92
|
+
test/test_field_stats.rb
|
93
|
+
test/test_field_type.rb
|
60
94
|
test/test_format.rb
|
61
95
|
test/test_helper.rb
|
62
96
|
test/test_merge.rb
|
data/Rakefile
CHANGED
@@ -10,6 +10,7 @@ This.ruby_gemspec do |spec|
|
|
10
10
|
spec.add_dependency('oj', '~> 3.0')
|
11
11
|
spec.add_dependency('optimist', '~> 3.0')
|
12
12
|
|
13
|
+
spec.add_development_dependency( 'faker' , '~> 2.16')
|
13
14
|
spec.add_development_dependency( 'rake' , '~> 13.0')
|
14
15
|
spec.add_development_dependency( 'minitest' , '~> 5.11' )
|
15
16
|
spec.add_development_dependency( 'minitest-focus' , '~> 1.2' )
|
@@ -0,0 +1,42 @@
|
|
1
|
+
#!/usr/bin/env
|
2
|
+
|
3
|
+
#------------------------------------------------------------------------------
|
4
|
+
# This is an example to show how to stream an active record scope to a CSV file
|
5
|
+
# using FlatKit.
|
6
|
+
#------------------------------------------------------------------------------
|
7
|
+
|
8
|
+
require 'flat_kit' # gem 'flat_kit'
|
9
|
+
require 'progress_bar' # gem 'progress-bar'
|
10
|
+
|
11
|
+
# get an appropriate scope from one of your models - or any scope for that
|
12
|
+
# matter
|
13
|
+
scope = MyActiveRecordModel.all
|
14
|
+
|
15
|
+
# Output to a file that is csv, and automatically gzipped
|
16
|
+
#
|
17
|
+
output_csv = ::FlatKit::Xsv::Writer.new(destination: "export.csv.gz")
|
18
|
+
|
19
|
+
# handy progress bar
|
20
|
+
bar = ProgressBar.new(scope.count)
|
21
|
+
|
22
|
+
# using active record in batches to not pull all the records from the database at
|
23
|
+
# once
|
24
|
+
#
|
25
|
+
# https://api.rubyonrails.org/classes/ActiveRecord/Batches.html#method-i-find_each
|
26
|
+
scope.find_each do |record|
|
27
|
+
|
28
|
+
# generate an XSV Record by pulling the attributes out of the active record
|
29
|
+
# model. You may also want to generate a hash from a query or something
|
30
|
+
# along those lines. In any case pass in a Hash to complete_structured_data:
|
31
|
+
# and nil to data.
|
32
|
+
xsv_record = ::FlatKit::Xsv::Record.new(data: nil, complete_structured_data: record.attributes)
|
33
|
+
|
34
|
+
# FlatKit will automatically handle writing out the header line based upon
|
35
|
+
# the fields in the first record.
|
36
|
+
output_csv.write(xsv_record)
|
37
|
+
|
38
|
+
bar.increment!
|
39
|
+
end
|
40
|
+
|
41
|
+
# close the output file explicitly
|
42
|
+
output_csv.close
|
data/lib/flat_kit.rb
CHANGED
@@ -1,11 +1,14 @@
|
|
1
1
|
module FlatKit
|
2
|
-
VERSION = "0.
|
2
|
+
VERSION = "0.3.0"
|
3
3
|
end
|
4
4
|
require 'flat_kit/error'
|
5
|
+
require 'flat_kit/descendant_tracker'
|
5
6
|
require 'flat_kit/logger'
|
6
7
|
require 'flat_kit/event_emitter'
|
7
|
-
|
8
|
+
|
9
|
+
require 'flat_kit/field_type'
|
8
10
|
require 'flat_kit/format'
|
11
|
+
require 'flat_kit/position'
|
9
12
|
require 'flat_kit/record'
|
10
13
|
require 'flat_kit/reader'
|
11
14
|
require 'flat_kit/writer'
|
@@ -16,6 +19,10 @@ require 'flat_kit/xsv'
|
|
16
19
|
require 'flat_kit/jsonl'
|
17
20
|
require 'flat_kit/merge'
|
18
21
|
require 'flat_kit/sort'
|
22
|
+
require 'flat_kit/stats'
|
23
|
+
|
24
|
+
require 'flat_kit/stat_type'
|
25
|
+
require 'flat_kit/field_stats'
|
19
26
|
|
20
27
|
require 'flat_kit/merge_tree'
|
21
28
|
require 'flat_kit/internal_node'
|
data/lib/flat_kit/cli.rb
CHANGED
@@ -72,7 +72,18 @@ module FlatKit
|
|
72
72
|
::FlatKit.logger.debug argv
|
73
73
|
|
74
74
|
command_name = argv.shift
|
75
|
+
if command_name.downcase == "help" then
|
76
|
+
parser.educate
|
77
|
+
exit 0
|
78
|
+
end
|
79
|
+
|
75
80
|
command_klass = FlatKit::Command.for(command_name)
|
81
|
+
if command_klass.nil? then
|
82
|
+
$stdout.puts "ERROR: Unknown command '#{command_name}'"
|
83
|
+
parser.educate
|
84
|
+
exit 0
|
85
|
+
end
|
86
|
+
|
76
87
|
command = command_klass.new(argv: argv, logger: ::FlatKit.logger, env: env)
|
77
88
|
command.call
|
78
89
|
end
|
data/lib/flat_kit/command.rb
CHANGED
@@ -0,0 +1,94 @@
|
|
1
|
+
module FlatKit
|
2
|
+
class Command
|
3
|
+
class Stats < ::FlatKit::Command
|
4
|
+
|
5
|
+
def self.name
|
6
|
+
"stats"
|
7
|
+
end
|
8
|
+
|
9
|
+
def self.description
|
10
|
+
"Collect and report stats on the inputfile"
|
11
|
+
end
|
12
|
+
|
13
|
+
def self.parser
|
14
|
+
::Optimist::Parser.new do
|
15
|
+
banner "#{Sort.description}"
|
16
|
+
banner ""
|
17
|
+
|
18
|
+
banner <<~BANNER
|
19
|
+
Given an input file collect basic statistics.
|
20
|
+
|
21
|
+
The statistics can vary based upon the datatype of the field.
|
22
|
+
|
23
|
+
Numeric fields will report the basic count, min, max, mean, standard deviation and sum.
|
24
|
+
Non-numeric fields that are comparable, like dates, will report count, min and max.
|
25
|
+
Other non-numeric fields will only report the count.
|
26
|
+
|
27
|
+
Adding --cardinality will report the count, and frequency of distinct values in the result.
|
28
|
+
This will allow for reporting the median value.
|
29
|
+
|
30
|
+
The fields upon which stats are collected may be selected with the --fields parameter.
|
31
|
+
By default statistics are collected on all fields.
|
32
|
+
|
33
|
+
The flatfile type(s) will be automatically determined by the file name.
|
34
|
+
|
35
|
+
The output can be dumped as a CSV, JSON or a a formated ascii table.
|
36
|
+
|
37
|
+
BANNER
|
38
|
+
|
39
|
+
banner <<~USAGE
|
40
|
+
|
41
|
+
Usage:
|
42
|
+
fk stats --everything file.json
|
43
|
+
fk stats --select surname,given_name file.csv
|
44
|
+
fk stats --select surname,given_name --output-format json file.csv > stats.json
|
45
|
+
fk stats --select field1,field2 --output-format json input.csv
|
46
|
+
fk stats --select field1 file.json.gz -o stats.csv
|
47
|
+
gunzip -c file.json.gz | fk stats --input-format json --output-format text
|
48
|
+
|
49
|
+
USAGE
|
50
|
+
|
51
|
+
banner <<~OPTIONS
|
52
|
+
|
53
|
+
Options:
|
54
|
+
|
55
|
+
OPTIONS
|
56
|
+
|
57
|
+
opt :output, "Send the output to the given path instead of standard out.", default: "<stdout>"
|
58
|
+
opt :input_format, "Input format, csv or json", default: "auto", short: :none
|
59
|
+
opt :output_format, "Output format, csv or json", default: "auto", short: :none
|
60
|
+
opt :select, "The comma separted list of field(s) to report stats on", required: false, type: :string
|
61
|
+
opt :everything, "Show all statistics that are possible", default: false
|
62
|
+
opt :cardinality, "Show the cardinality of the fields, this requires additional memory", default: false
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
66
|
+
def parse
|
67
|
+
parser = self.class.parser
|
68
|
+
::Optimist::with_standard_exception_handling(parser) do
|
69
|
+
begin
|
70
|
+
opts = parser.parse(argv)
|
71
|
+
fields = ::FlatKit::Stats::AllFields
|
72
|
+
fields = CSV.parse_line(opts[:select]) if opts[:select]
|
73
|
+
|
74
|
+
stats = [FieldStats::CORE_STATS]
|
75
|
+
stats << FieldStats::CARDINALITY_STATS if opts[:cardinality] || opts[:everything]
|
76
|
+
|
77
|
+
paths = parser.leftovers
|
78
|
+
raise ::Optimist::CommandlineError, "1 and only 1 input file is allowed" if paths.size > 1
|
79
|
+
path = paths.first || "-" # default to stdin
|
80
|
+
@stats = ::FlatKit::Stats.new(input: path, input_fallback: opts[:input_format],
|
81
|
+
output: opts[:output], output_fallback: opts[:output_format],
|
82
|
+
fields_to_stat: fields, stats_to_collect: stats)
|
83
|
+
rescue ::FlatKit::Error => e
|
84
|
+
raise ::Optimist::CommandlineError, e.message
|
85
|
+
end
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
def call
|
90
|
+
@stats.call
|
91
|
+
end
|
92
|
+
end
|
93
|
+
end
|
94
|
+
end
|
@@ -23,5 +23,14 @@ module FlatKit
|
|
23
23
|
child_klass.send(method, *args)
|
24
24
|
end
|
25
25
|
end
|
26
|
+
|
27
|
+
#
|
28
|
+
# Find all the children that return truthy from the given method with args
|
29
|
+
#
|
30
|
+
def find_children(method, *args)
|
31
|
+
children.select do |child_klass|
|
32
|
+
child_klass.send(method, *args)
|
33
|
+
end
|
34
|
+
end
|
26
35
|
end
|
27
36
|
end
|
@@ -20,9 +20,9 @@ module FlatKit
|
|
20
20
|
_listeners.clear
|
21
21
|
end
|
22
22
|
|
23
|
-
def notify_listeners(name:, data:)
|
23
|
+
def notify_listeners(name:, data:, meta: nil)
|
24
24
|
_listeners.each do |l|
|
25
|
-
l.on_event(name: name, data: data)
|
25
|
+
l.on_event(name: name, data: data, meta: meta)
|
26
26
|
end
|
27
27
|
end
|
28
28
|
|
@@ -0,0 +1,241 @@
|
|
1
|
+
module FlatKit
|
2
|
+
# Collect stats on a single field. We may not know what the field data type is
|
3
|
+
# to start with, so collect a bunch of values until we have the threshold, and
|
4
|
+
# then calculate stats based upon the data type determined by the guess
|
5
|
+
#
|
6
|
+
class FieldStats
|
7
|
+
DEFAULT_GUESS_THRESHOLD = 1000
|
8
|
+
|
9
|
+
CORE_STATS = :core
|
10
|
+
CARDINALITY_STATS = :cardinality
|
11
|
+
|
12
|
+
ALL_STATS = [ CORE_STATS, CARDINALITY_STATS ]
|
13
|
+
|
14
|
+
EXPORT_FIELDS = %w[
|
15
|
+
name
|
16
|
+
type
|
17
|
+
count
|
18
|
+
max
|
19
|
+
mean
|
20
|
+
min
|
21
|
+
stddev
|
22
|
+
sum
|
23
|
+
mode
|
24
|
+
unique_count
|
25
|
+
|
26
|
+
max_length
|
27
|
+
mean_length
|
28
|
+
min_length
|
29
|
+
stddev_length
|
30
|
+
mode_length
|
31
|
+
unique_count_lengths
|
32
|
+
|
33
|
+
null_count
|
34
|
+
unknown_count
|
35
|
+
out_of_type_count
|
36
|
+
total_count
|
37
|
+
null_percent
|
38
|
+
unknown_percent
|
39
|
+
]
|
40
|
+
|
41
|
+
|
42
|
+
attr_reader :type_counts
|
43
|
+
attr_reader :field_type
|
44
|
+
attr_reader :name
|
45
|
+
|
46
|
+
def initialize(name:, stats_to_collect: CORE_STATS, type: ::FlatKit::FieldType::GuessType, guess_threshold: DEFAULT_GUESS_THRESHOLD)
|
47
|
+
@name = name
|
48
|
+
@field_type = type
|
49
|
+
@guess_threshold = guess_threshold
|
50
|
+
@type_counts = Hash.new(0)
|
51
|
+
@out_of_type_count = 0
|
52
|
+
@values = []
|
53
|
+
@stats = nil
|
54
|
+
@length_stats = nil
|
55
|
+
@stats_to_collect = [stats_to_collect].flatten
|
56
|
+
|
57
|
+
@stats_to_collect.each do |collection_set|
|
58
|
+
next if ALL_STATS.include?(collection_set)
|
59
|
+
raise ArgumentError, "#{collection_set} is not a valid stats collection set, must be one of #{ALL_STATS.map { |s| s.to_s }.join(", ") }"
|
60
|
+
end
|
61
|
+
raise ArgumentError, "type: must be FieldType subclasses - not #{type}" unless type.kind_of?(Class) && (type.superclass == ::FlatKit::FieldType)
|
62
|
+
end
|
63
|
+
|
64
|
+
def field_type_determined?
|
65
|
+
@field_type != ::FlatKit::FieldType::GuessType
|
66
|
+
end
|
67
|
+
|
68
|
+
def update(value)
|
69
|
+
update_type_count(value)
|
70
|
+
|
71
|
+
if field_type_determined? then
|
72
|
+
update_stats(value)
|
73
|
+
else
|
74
|
+
@values << value
|
75
|
+
|
76
|
+
if @values.size >= @guess_threshold then
|
77
|
+
resolve_guess
|
78
|
+
end
|
79
|
+
end
|
80
|
+
end
|
81
|
+
|
82
|
+
def collecting_frequencies?
|
83
|
+
@stats_to_collect.include?(CARDINALITY_STATS)
|
84
|
+
end
|
85
|
+
|
86
|
+
def type
|
87
|
+
@field_type.type_name
|
88
|
+
end
|
89
|
+
|
90
|
+
def count
|
91
|
+
stats.count
|
92
|
+
end
|
93
|
+
|
94
|
+
def max
|
95
|
+
stats.max if stats.respond_to?(:max)
|
96
|
+
end
|
97
|
+
|
98
|
+
def mean
|
99
|
+
stats.mean if stats.respond_to?(:mean)
|
100
|
+
end
|
101
|
+
|
102
|
+
def min
|
103
|
+
stats.min if stats.respond_to?(:min)
|
104
|
+
end
|
105
|
+
|
106
|
+
def stddev
|
107
|
+
stats.stddev if stats.respond_to?(:stddev)
|
108
|
+
end
|
109
|
+
|
110
|
+
def sum
|
111
|
+
stats.sum if stats.respond_to?(:sum)
|
112
|
+
end
|
113
|
+
|
114
|
+
def mode
|
115
|
+
stats.mode if collecting_frequencies?
|
116
|
+
end
|
117
|
+
|
118
|
+
def unique_count
|
119
|
+
stats.unique_count if collecting_frequencies?
|
120
|
+
end
|
121
|
+
|
122
|
+
def unique_values
|
123
|
+
stats.unique_values if collecting_frequencies?
|
124
|
+
end
|
125
|
+
|
126
|
+
def frequencies
|
127
|
+
stats.frequencies if collecting_frequencies?
|
128
|
+
end
|
129
|
+
|
130
|
+
def min_length
|
131
|
+
length_stats.min if @length_stats
|
132
|
+
end
|
133
|
+
|
134
|
+
def max_length
|
135
|
+
length_stats.max if @length_stats
|
136
|
+
end
|
137
|
+
|
138
|
+
def mean_length
|
139
|
+
length_stats.mean if @length_stats
|
140
|
+
end
|
141
|
+
|
142
|
+
def stddev_length
|
143
|
+
length_stats.stddev if @length_stats
|
144
|
+
end
|
145
|
+
|
146
|
+
def mode_length
|
147
|
+
length_stats.mode if @length_stats && collecting_frequencies?
|
148
|
+
end
|
149
|
+
|
150
|
+
def unique_count_lengths
|
151
|
+
length_stats.unique_count if @length_stats && collecting_frequencies?
|
152
|
+
end
|
153
|
+
|
154
|
+
def unique_values_lengths
|
155
|
+
length_stats.unique_values if @length_stats && collecting_frequencies?
|
156
|
+
end
|
157
|
+
|
158
|
+
def length_frequencies
|
159
|
+
length_stats.frequencies if @length_stats && collecting_frequencies?
|
160
|
+
end
|
161
|
+
|
162
|
+
def null_count
|
163
|
+
type_counts[FieldType::NullType]
|
164
|
+
end
|
165
|
+
|
166
|
+
def total_count
|
167
|
+
stats.count + @out_of_type_count
|
168
|
+
end
|
169
|
+
|
170
|
+
def out_of_type_count
|
171
|
+
@out_of_type_count
|
172
|
+
end
|
173
|
+
|
174
|
+
def null_percent
|
175
|
+
return 0 if total_count.zero?
|
176
|
+
((null_count.to_f / total_count) * 100.0).truncate(2)
|
177
|
+
end
|
178
|
+
|
179
|
+
def unknown_count
|
180
|
+
type_counts[FieldType::UnknownType]
|
181
|
+
end
|
182
|
+
|
183
|
+
def unknown_percent
|
184
|
+
return 0 if total_count.zero?
|
185
|
+
((unknown_count.to_f / total_count) * 100.0).truncate(2)
|
186
|
+
end
|
187
|
+
|
188
|
+
def to_hash
|
189
|
+
resolve_guess
|
190
|
+
|
191
|
+
Hash.new.tap do |h|
|
192
|
+
EXPORT_FIELDS.each do |n|
|
193
|
+
h[n] = self.send(n)
|
194
|
+
end
|
195
|
+
end
|
196
|
+
end
|
197
|
+
|
198
|
+
private
|
199
|
+
|
200
|
+
def stats
|
201
|
+
resolve_guess
|
202
|
+
@stats
|
203
|
+
end
|
204
|
+
|
205
|
+
def length_stats
|
206
|
+
resolve_guess
|
207
|
+
@length_stats
|
208
|
+
end
|
209
|
+
|
210
|
+
def update_stats(value)
|
211
|
+
coerced_value = @field_type.coerce(value)
|
212
|
+
if coerced_value == FieldType::CoerceFailure then
|
213
|
+
@out_of_type_count += 1
|
214
|
+
return
|
215
|
+
end
|
216
|
+
|
217
|
+
@stats.update(coerced_value)
|
218
|
+
@length_stats.update(coerced_value.to_s.length) if @length_stats
|
219
|
+
end
|
220
|
+
|
221
|
+
def update_type_count(value)
|
222
|
+
guess = FieldType.best_guess(value)
|
223
|
+
type_counts[guess] += 1
|
224
|
+
return guess
|
225
|
+
end
|
226
|
+
|
227
|
+
def resolve_guess
|
228
|
+
return if field_type_determined?
|
229
|
+
best_guess_type, _best_guess_count = type_counts.max_by { |k, v| v }
|
230
|
+
@field_type = best_guess_type
|
231
|
+
@stats = StatType.for(@field_type).new(collecting_frequencies: collecting_frequencies?)
|
232
|
+
if @field_type == ::FlatKit::FieldType::StringType then
|
233
|
+
@length_stats = ::FlatKit::StatType::NumericalStats.new(collecting_frequencies: collecting_frequencies?)
|
234
|
+
end
|
235
|
+
@values.each do |v|
|
236
|
+
update_stats(v)
|
237
|
+
end
|
238
|
+
@values.clear
|
239
|
+
end
|
240
|
+
end
|
241
|
+
end
|