flat_kit 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/HISTORY.md +6 -0
- data/Manifest.txt +34 -0
- data/Rakefile +1 -0
- data/examples/stream-active-record-to-csv.rb +42 -0
- data/lib/flat_kit.rb +9 -2
- data/lib/flat_kit/cli.rb +11 -0
- data/lib/flat_kit/command.rb +1 -0
- data/lib/flat_kit/command/stats.rb +94 -0
- data/lib/flat_kit/descendant_tracker.rb +9 -0
- data/lib/flat_kit/event_emitter.rb +2 -2
- data/lib/flat_kit/field_stats.rb +241 -0
- data/lib/flat_kit/field_type.rb +75 -0
- data/lib/flat_kit/field_type/boolean_type.rb +48 -0
- data/lib/flat_kit/field_type/date_type.rb +179 -0
- data/lib/flat_kit/field_type/float_type.rb +37 -0
- data/lib/flat_kit/field_type/guess_type.rb +20 -0
- data/lib/flat_kit/field_type/integer_type.rb +34 -0
- data/lib/flat_kit/field_type/null_type.rb +35 -0
- data/lib/flat_kit/field_type/string_type.rb +22 -0
- data/lib/flat_kit/field_type/timestamp_type.rb +47 -0
- data/lib/flat_kit/field_type/unknown_type.rb +26 -0
- data/lib/flat_kit/jsonl/record.rb +2 -2
- data/lib/flat_kit/jsonl/writer.rb +18 -9
- data/lib/flat_kit/merge.rb +4 -2
- data/lib/flat_kit/output.rb +4 -0
- data/lib/flat_kit/position.rb +19 -0
- data/lib/flat_kit/stat_type.rb +65 -0
- data/lib/flat_kit/stat_type/nominal_stats.rb +58 -0
- data/lib/flat_kit/stat_type/numerical_stats.rb +120 -0
- data/lib/flat_kit/stat_type/ordinal_stats.rb +42 -0
- data/lib/flat_kit/stats.rb +66 -0
- data/lib/flat_kit/writer.rb +17 -2
- data/lib/flat_kit/xsv/writer.rb +24 -9
- data/test/field_type/test_boolean_type.rb +65 -0
- data/test/field_type/test_date_type.rb +71 -0
- data/test/field_type/test_float_type.rb +56 -0
- data/test/field_type/test_guess_type.rb +14 -0
- data/test/field_type/test_integer_type.rb +52 -0
- data/test/field_type/test_null_type.rb +41 -0
- data/test/field_type/test_string_type.rb +18 -0
- data/test/field_type/test_timestamp_type.rb +108 -0
- data/test/field_type/test_unknown_type.rb +35 -0
- data/test/jsonl/test_writer.rb +21 -3
- data/test/run +23 -0
- data/test/stat_type/test_nominal_stats.rb +69 -0
- data/test/stat_type/test_numerical_stats.rb +118 -0
- data/test/stat_type/test_ordinal_stats.rb +92 -0
- data/test/test_event_emitter.rb +19 -2
- data/test/test_field_stats.rb +134 -0
- data/test/test_field_type.rb +34 -0
- data/test/xsv/test_writer.rb +25 -4
- metadata +65 -2
@@ -0,0 +1,41 @@
|
|
1
|
+
require_relative '../test_helper'
|
2
|
+
|
3
|
+
module TestFieldType
  # Checks FlatKit::FieldType::NullType.matches? and .coerce against nil,
  # null-like strings, and values that are expected not to be null.
  class TestNullType < ::Minitest::Test
    # String spellings these tests expect NullType to treat as null.
    def nully_items
      [ "null", "NULL", "nil", "\\N" ]
    end

    # A literal nil matches.
    def test_null
      assert(null_type.matches?(nil))
    end

    # Every null-like string matches.
    def test_nully_items
      nully_items.each do |candidate|
        assert(null_type.matches?(candidate), "#{candidate} should be null")
      end
    end

    # Integers, plain objects and booleans do not match.
    def test_other_class_does_not_match
      non_nulls = [ 42, Object.new, true, false ]
      non_nulls.each do |candidate|
        refute(null_type.matches?(candidate), "#{candidate} should not be == null")
      end
    end

    # Coercing nil yields nil.
    def test_coerce_coerces_nil
      assert_nil(null_type.coerce(nil))
    end

    # Coercing any null-like string yields nil.
    def test_coerces_nully_items
      nully_items.each { |candidate| assert_nil(null_type.coerce(candidate)) }
    end

    # Anything that is not null-like coerces to the CoerceFailure sentinel.
    def test_coerce_failure_non_non_nully_items
      failure = ::FlatKit::FieldType::CoerceFailure
      rejected = [ "whatever", 42, Object.new, true, false, Class]
      rejected.each do |candidate|
        assert_equal(failure, null_type.coerce(candidate))
      end
    end

    private

    # The class under test, named once so each assertion stays short.
    def null_type
      FlatKit::FieldType::NullType
    end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
require_relative '../test_helper'
|
2
|
+
|
3
|
+
module TestFieldType
  # Checks the negative paths of FlatKit::FieldType::StringType.
  class TestStringType < ::Minitest::Test

    # Numbers, booleans and arbitrary objects are not reported as strings.
    def test_string_will_not_match_non_string_data
      non_strings = [42, false, true, 12.5, Object.new]
      non_strings.each do |value|
        refute(FlatKit::FieldType::StringType.matches?(value))
      end
    end

    # A BasicObject instance coerces to the CoerceFailure sentinel.
    # NOTE(review): presumably chosen because BasicObject has no #to_s --
    # confirm against StringType.coerce.
    def test_string_type_returns_coerce_failures
      uncoercible = BasicObject.new
      result = ::FlatKit::FieldType::StringType.coerce(uncoercible)
      assert_equal(::FlatKit::FieldType::CoerceFailure, result)
    end
  end
end
|
@@ -0,0 +1,108 @@
|
|
1
|
+
require_relative '../test_helper'
|
2
|
+
require 'set'
|
3
|
+
|
4
|
+
module TestFieldType
  # Checks FlatKit::FieldType::TimestampType: which values it matches, which
  # string formats it accepts, and what .coerce returns for each kind of input.
  class TestTimestampType < ::Minitest::Test
    # A Time instance matches.
    def test_time
      assert(FlatKit::FieldType::TimestampType.matches?(Time.now))
    end

    # A Date instance does not match.
    def test_date
      refute(FlatKit::FieldType::TimestampType.matches?(Date.today))
    end

    # A DateTime instance does not match.
    def test_date_time
      refute(FlatKit::FieldType::TimestampType.matches?(DateTime.now))
    end

    # Strings produced by Time#httpdate, #iso8601 and #rfc2822 match and
    # coerce to Time, for both the local and the UTC rendering.
    def test_builtin_formats
      local = Time.now
      # getutc returns a new Time; Time#utc would convert `local` in place.
      utc = local.getutc

      stock_formats = [
        local.httpdate,
        utc.httpdate,
        local.iso8601,
        utc.iso8601,
        local.rfc2822,
        utc.rfc2822,
      ]

      stock_formats.each do |t|
        assert(FlatKit::FieldType::TimestampType.matches?(t), "#{t} should match timestamp")
        assert_instance_of(Time, FlatKit::FieldType::TimestampType.coerce(t), "#{t} should convert to timestamp")
      end
    end

    # The list of parse formats contains no repeated entry.
    def test_no_duplicate_formats
      parse_formats = ::FlatKit::FieldType::TimestampType.parse_formats

      assert_equal(parse_formats.size, parse_formats.uniq.size)
    end

    # Every registered parse format, plus a set of fractional-second
    # variants, round-trips: strftime output matches and coerces to Time.
    def test_parse_formats
      # dup so the concat below does not modify the list owned by TimestampType
      parse_formats = ::FlatKit::FieldType::TimestampType.parse_formats.dup
      extra_formats = [
        "%Y-%m-%dT%H:%M:%S.%N%z",
        "%Y-%m-%d %H:%M:%S.%NZ",
        "%Y-%m-%d %H:%M:%S.%N",
        "%Y-%m-%dT%H:%M:%S.%3N%z",
        "%Y-%m-%d %H:%M:%S.%3NZ",
        "%Y-%m-%d %H:%M:%S.%3N",
        "%Y-%m-%dT%H:%M:%S.%6N%z",
        "%Y-%m-%d %H:%M:%S.%6NZ",
        "%Y-%m-%d %H:%M:%S.%6N",
        "%Y-%m-%dT%H:%M:%S.%9N%z",
        "%Y-%m-%d %H:%M:%S.%9NZ",
        "%Y-%m-%d %H:%M:%S.%9N",
      ]
      parse_formats.concat(extra_formats)

      now = Time.now
      parse_formats.each do |format|
        str = now.strftime(format)

        assert(FlatKit::FieldType::TimestampType.matches?(str), "#{str} should match timestamp")
        coerced = FlatKit::FieldType::TimestampType.coerce(str)

        assert_instance_of(Time, coerced)
      end
    end

    # Integers, plain objects and booleans do not match.
    def test_other_class_does_not_match
      [ 42, Object.new, true, false ].each do |x|
        refute(FlatKit::FieldType::TimestampType.matches?(x), "#{x} should not be a timestamp")
      end
    end

    # A string starting with "N" followed by digits does not match.
    def test_N_number_does_not_match
      x = "N89362"
      refute(FlatKit::FieldType::TimestampType.matches?(x), "#{x} should not be a timestamp")
    end

    # Coercing a Time returns an equal Time.
    def test_coerce_passthrough_time
      t = Time.now
      assert_equal(t, FlatKit::FieldType::TimestampType.coerce(t))
    end

    # Coercing a Date yields the CoerceFailure sentinel.
    def test_coerce_do_not_passthrough_date
      t = Date.today
      assert_equal(::FlatKit::FieldType::CoerceFailure, FlatKit::FieldType::TimestampType.coerce(t))
    end

    # NOTE(review): despite its name this test passes a Time, not a DateTime,
    # so it repeats test_coerce_passthrough_time. Confirm the intended
    # DateTime behaviour before changing the input.
    def test_date_coerce_passthrough_datetime
      t = Time.now
      assert_equal(t, FlatKit::FieldType::TimestampType.coerce(t))
    end

    # A date-only string (no time of day) yields CoerceFailure.
    def test_date_only_does_not_parse
      t = Time.now.strftime("%Y-%m-%d")
      assert_equal(::FlatKit::FieldType::CoerceFailure, FlatKit::FieldType::TimestampType.coerce(t))
    end

    # An Integer yields CoerceFailure.
    def test_number_coerce_failure
      assert_equal(::FlatKit::FieldType::CoerceFailure, FlatKit::FieldType::TimestampType.coerce(42))
    end

    # A string of digit groups that is not a timestamp yields CoerceFailure.
    def test_number_coerce_failure_bad_parse
      assert_equal(::FlatKit::FieldType::CoerceFailure, FlatKit::FieldType::TimestampType.coerce("1234 56 78 90"))
    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require_relative '../test_helper'
|
2
|
+
|
3
|
+
module TestFieldType
  # Checks FlatKit::FieldType::UnknownType.matches? and .coerce against the
  # "unknown" marker strings and against values expected to be rejected.
  class TestUnknownType < ::Minitest::Test

    # Marker strings these tests expect UnknownType to recognise.
    def unknown_items
      [ 'na', 'n/a', 'unk', 'unknown']
    end

    # Every marker string matches.
    def test_unknown_items
      unknown_items.each do |u|
        assert(FlatKit::FieldType::UnknownType.matches?(u), "#{u} should be unknown")
      end
    end

    # Non-strings and an ordinary string do not match.
    def test_other_class_does_not_match
      [ 42, Object.new, true, false, "whatever" ].each do |x|
        refute(FlatKit::FieldType::UnknownType.matches?(x), "#{x} should not be unknown")
      end
    end

    # Coercing a marker string returns the same string.
    def test_coerce_unknown
      unknown_items.each do |u|
        assert_equal(u, FlatKit::FieldType::UnknownType.coerce(u), "#{u} should be unknown")
      end
    end

    # Anything else coerces to the CoerceFailure sentinel.
    def test_other_class_does_not_coerce
      [ 42, Object.new, true, false, "whatever" ].each do |x|
        assert_equal(::FlatKit::FieldType::CoerceFailure, FlatKit::FieldType::UnknownType.coerce(x))
      end
    end

  end
end
|
data/test/jsonl/test_writer.rb
CHANGED
@@ -41,12 +41,30 @@ module TestJsonl
|
|
41
41
|
assert_equal(expected, actual)
|
42
42
|
end
|
43
43
|
|
44
|
-
def
|
44
|
+
def test_postion
|
45
45
|
File.open(@write_path, "w+") do |f|
|
46
46
|
writer = ::FlatKit::Jsonl::Writer.new(destination: f)
|
47
47
|
|
48
|
-
|
49
|
-
|
48
|
+
byte_offset = 0
|
49
|
+
@records.each_with_index do |r, idx|
|
50
|
+
record_length = r.data.bytesize
|
51
|
+
|
52
|
+
position = writer.write(r)
|
53
|
+
|
54
|
+
# make sure write stores the last_position api and returns that value
|
55
|
+
assert_equal(position, writer.last_position)
|
56
|
+
|
57
|
+
assert_equal(idx, position.index)
|
58
|
+
assert_equal(byte_offset, position.offset)
|
59
|
+
assert_equal(record_length, position.bytesize)
|
60
|
+
|
61
|
+
byte_offset += record_length
|
62
|
+
|
63
|
+
current_position = writer.current_position
|
64
|
+
assert_equal(idx+1, current_position.index)
|
65
|
+
assert_equal(byte_offset, current_position.offset)
|
66
|
+
assert_equal(0, current_position.bytesize)
|
67
|
+
|
50
68
|
end
|
51
69
|
writer.close
|
52
70
|
|
data/test/run
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
|
4
|
+
# Test runner: with no arguments, loads every test_*.rb file found under this
# directory (recursively); otherwise loads only the files named on the
# command line.
test_dir = __dir__

# Make both the library (../lib) and the test directory requirable.
$LOAD_PATH << File.join(File.dirname(test_dir), "lib")
$LOAD_PATH << test_dir

require_relative './test_helper'

require 'find'

if ARGV.empty?
  Find.find(test_dir) do |path|
    # \z anchors at the true end of the name; \Z would also accept a
    # trailing newline.
    next unless File.file?(path) && File.basename(path).match?(/\Atest_.*\.rb\z/)
    require path
  end
else
  ARGV.each do |f|
    require File.expand_path(f)
  end
end
|
23
|
+
|
@@ -0,0 +1,69 @@
|
|
1
|
+
require_relative '../test_helper'
|
2
|
+
|
3
|
+
module TestStatType
  # Checks FlatKit::StatType::NominalStats with and without frequency
  # collection enabled.
  class TestNominalStats < ::Minitest::Test
    # Builds a data set of the letters 'a'..'f', each repeated a random number
    # of times, and feeds it to two collectors: a default one (@stats) and one
    # created with collecting_frequencies: true (@all_stats).
    def setup
      @unique_values = ('a'..'f').to_a

      # Distinct repeat counts (sampled without replacement from 1..42) so no
      # two letters tie for the highest frequency; a tie would make the
      # expected "mode" depend on tie-breaking order.
      repeat_counts = (1..42).to_a.sample(@unique_values.size)
      @values = @unique_values.zip(repeat_counts).flat_map do |letter, count|
        [letter] * count
      end

      @frequencies = @values.tally

      @stats = ::FlatKit::StatType::NominalStats.new
      @all_stats = ::FlatKit::StatType::NominalStats.new(collecting_frequencies: true)

      @values.each do |v|
        @stats.update(v)
        @all_stats.update(v)
      end
    end

    # Both collectors count every value they were given.
    def test_count
      assert_equal(@values.size, @stats.count)
      assert_equal(@values.size, @all_stats.count)
    end

    # The default collector reports nil for unique_count.
    def test_does_not_collect_unique_count_by_default
      assert_nil(@stats.unique_count)
    end

    # The default collector reports nil for unique_values.
    def test_does_not_collect_unique_values_by_default
      assert_nil(@stats.unique_values)
    end

    # The default collector reports nil for frequencies.
    def test_does_not_collect_frequencies_by_default
      assert_nil(@stats.frequencies)
    end

    # With frequency collection on, unique_count is the number of letters.
    def test_unique_count
      assert_equal(@unique_values.size, @all_stats.unique_count)
    end

    # With frequency collection on, unique_values holds exactly the letters
    # (compared order-insensitively).
    def test_unique_values
      assert_equal(@unique_values.sort, @all_stats.unique_values.sort)
    end

    # With frequency collection on, frequencies equals the tally of the input.
    def test_frequencies
      assert_equal(@frequencies, @all_stats.frequencies)
    end

    # The default collector's hash contains only "count".
    def test_default_to_hash
      expecting = { "count" => @values.size }
      assert_equal(expecting, @stats.to_hash)
    end

    # The frequency-collecting hash adds unique_count, unique_values and mode.
    def test_all_stats_hash
      expecting = {
        "count"         => @values.size,
        "unique_count"  => @unique_values.size,
        "unique_values" => @unique_values.sort,
        "mode"          => @frequencies.max_by { |_value, count| count }.first
      }
      assert_equal(expecting, @all_stats.to_hash)
    end
  end
end
|
69
|
+
|
@@ -0,0 +1,118 @@
|
|
1
|
+
require_relative '../test_helper'
|
2
|
+
|
3
|
+
|
4
|
+
module TestStatType
  # Checks FlatKit::StatType::NumericalStats: initial values, the running
  # statistics for the data set 1, 2, 3, hash/JSON export, and mode tracking
  # when frequency collection is enabled.
  class TestNumericalStats < ::Minitest::Test
    # @stats      - never updated
    # @full_stats - updated with 1, 2, 3
    # @all_stats  - created with collecting_frequencies: true, updated by the
    #               individual tests that need it
    def setup
      @stats = FlatKit::StatType::NumericalStats.new
      @full_stats = FlatKit::StatType::NumericalStats.new
      @all_stats = FlatKit::StatType::NumericalStats.new(collecting_frequencies: true)
      [ 1, 2, 3 ].each { |i| @full_stats.update(i) }
    end

    # A fresh collector starts at count 0, min +Infinity, max -Infinity,
    # sum 0.0 and rate 0.0.
    def test_intialized_with_usable_values
      assert_equal(0, @stats.count)
      assert_equal(Float::INFINITY, @stats.min)
      assert_equal(-Float::INFINITY, @stats.max)
      assert_equal(0.0, @stats.sum)
      assert_equal(0.0, @stats.rate)
    end

    def test_calculates_mean
      assert_equal(2.0, @full_stats.mean)
    end

    # NOTE(review): 0.5 equals count / sum (3 / 6) for this data --
    # presumably how rate is defined; confirm against NumericalStats#rate.
    def test_calculates_rate
      assert_equal(0.5, @full_stats.rate)
    end

    def test_tracks_the_maximum_value
      assert_equal(3.0, @full_stats.max)
    end

    def test_tracks_the_minimum_value
      assert_equal(1.0, @full_stats.min)
    end

    def test_tracks_the_count
      assert_equal(3, @full_stats.count)
    end

    def test_tracks_the_sum
      assert_equal(6.0, @full_stats.sum)
    end

    def test_calculates_the_standard_deviation
      assert_equal(1.0, @full_stats.stddev)
    end

    # 1**2 + 2**2 + 3**2 == 14
    def test_calculates_the_sum_of_squares
      assert_equal(14, @full_stats.sumsq)
    end

    # With no arguments, to_hash has exactly the default_stats keys.
    def test_converts_to_a_hash
      h = @full_stats.to_hash
      assert_equal(::FlatKit::StatType::NumericalStats.default_stats.size, h.size)
      assert_equal(::FlatKit::StatType::NumericalStats.default_stats, h.keys.sort)
    end

    # to_hash accepts stat names either as separate arguments or as one array.
    def test_converts_to_a_limited_hash_if_given_arguments
      h = @full_stats.to_hash("min", "max", "mean")
      assert_equal(3, h.size)
      assert_equal(%w[ max mean min ], h.keys.sort)

      h = @full_stats.to_hash(%w[ count rate ])
      assert_equal(2, h.size)
      assert_equal(%w[ count rate ], h.keys.sort)
    end

    def test_raises_nomethoderror_if_an_invalid_stat_is_used
      assert_raises(NoMethodError) { @full_stats.to_hash("wibble") }
    end

    # With no arguments, to_json encodes exactly the default_stats keys.
    def test_converts_to_a_json_string
      j = @full_stats.to_json
      h = JSON.parse(j)
      assert_equal(::FlatKit::StatType::NumericalStats.default_stats.size, h.size)
      assert_equal(::FlatKit::StatType::NumericalStats.default_stats, h.keys.sort)
    end

    # to_json accepts stat names either as separate arguments or as one array.
    def test_converts_to_a_limited_json_hash_if_given_arguments
      j = @full_stats.to_json("min", "max", "mean")
      h = JSON.parse(j)
      assert_equal(3, h.size)
      assert_equal(%w[ max mean min ], h.keys.sort)

      j = @full_stats.to_json(%w[ count rate ])
      h = JSON.parse(j)
      assert_equal(2, h.size)
      assert_equal(%w[ count rate ], h.keys.sort)
    end

    def test_raises_nomethoderror_if_an_invalid_json_stat_is_used
      assert_raises(NoMethodError) { @full_stats.to_json("wibble") }
    end

    # The frequency-collecting instance reports the most frequent value.
    def test_collects_mode
      # Give each of the values 0..9 a distinct random number of occurrences
      # so exactly one value is the most frequent; with tied counts the
      # expected mode would depend on tie-breaking order.
      occurrences = (1..19).to_a.sample(10)
      values = (0...10).zip(occurrences).flat_map { |value, count| [value] * count }.shuffle

      values.each { |n| @all_stats.update(n) }

      mode_value = values.tally.max_by { |_value, count| count }.first

      assert_equal(mode_value, @all_stats.mode)
    end

    # With frequency collection on, collected_stats also lists mode,
    # unique_count and unique_values.
    def test_collecting_frequences_reports_extra_stat_names
      stat_names = @all_stats.collected_stats
      assert_includes(stat_names, "mode")
      assert_includes(stat_names, "unique_count")
      assert_includes(stat_names, "unique_values")
    end
  end
end
|