flat_kit 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/HISTORY.md +6 -0
- data/Manifest.txt +34 -0
- data/Rakefile +1 -0
- data/examples/stream-active-record-to-csv.rb +42 -0
- data/lib/flat_kit.rb +9 -2
- data/lib/flat_kit/cli.rb +11 -0
- data/lib/flat_kit/command.rb +1 -0
- data/lib/flat_kit/command/stats.rb +94 -0
- data/lib/flat_kit/descendant_tracker.rb +9 -0
- data/lib/flat_kit/event_emitter.rb +2 -2
- data/lib/flat_kit/field_stats.rb +241 -0
- data/lib/flat_kit/field_type.rb +75 -0
- data/lib/flat_kit/field_type/boolean_type.rb +48 -0
- data/lib/flat_kit/field_type/date_type.rb +179 -0
- data/lib/flat_kit/field_type/float_type.rb +37 -0
- data/lib/flat_kit/field_type/guess_type.rb +20 -0
- data/lib/flat_kit/field_type/integer_type.rb +34 -0
- data/lib/flat_kit/field_type/null_type.rb +35 -0
- data/lib/flat_kit/field_type/string_type.rb +22 -0
- data/lib/flat_kit/field_type/timestamp_type.rb +47 -0
- data/lib/flat_kit/field_type/unknown_type.rb +26 -0
- data/lib/flat_kit/jsonl/record.rb +2 -2
- data/lib/flat_kit/jsonl/writer.rb +18 -9
- data/lib/flat_kit/merge.rb +4 -2
- data/lib/flat_kit/output.rb +4 -0
- data/lib/flat_kit/position.rb +19 -0
- data/lib/flat_kit/stat_type.rb +65 -0
- data/lib/flat_kit/stat_type/nominal_stats.rb +58 -0
- data/lib/flat_kit/stat_type/numerical_stats.rb +120 -0
- data/lib/flat_kit/stat_type/ordinal_stats.rb +42 -0
- data/lib/flat_kit/stats.rb +66 -0
- data/lib/flat_kit/writer.rb +17 -2
- data/lib/flat_kit/xsv/writer.rb +24 -9
- data/test/field_type/test_boolean_type.rb +65 -0
- data/test/field_type/test_date_type.rb +71 -0
- data/test/field_type/test_float_type.rb +56 -0
- data/test/field_type/test_guess_type.rb +14 -0
- data/test/field_type/test_integer_type.rb +52 -0
- data/test/field_type/test_null_type.rb +41 -0
- data/test/field_type/test_string_type.rb +18 -0
- data/test/field_type/test_timestamp_type.rb +108 -0
- data/test/field_type/test_unknown_type.rb +35 -0
- data/test/jsonl/test_writer.rb +21 -3
- data/test/run +23 -0
- data/test/stat_type/test_nominal_stats.rb +69 -0
- data/test/stat_type/test_numerical_stats.rb +118 -0
- data/test/stat_type/test_ordinal_stats.rb +92 -0
- data/test/test_event_emitter.rb +19 -2
- data/test/test_field_stats.rb +134 -0
- data/test/test_field_type.rb +34 -0
- data/test/xsv/test_writer.rb +25 -4
- metadata +65 -2
data/test/field_type/test_null_type.rb
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
require_relative '../test_helper'
|
2
|
+
|
3
|
+
module TestFieldType
|
4
|
+
class TestNullType < ::Minitest::Test
|
5
|
+
def nully_items
|
6
|
+
[ "null", "NULL", "nil", "\\N" ]
|
7
|
+
end
|
8
|
+
|
9
|
+
def test_null
|
10
|
+
assert(FlatKit::FieldType::NullType.matches?(nil))
|
11
|
+
end
|
12
|
+
|
13
|
+
def test_nully_items
|
14
|
+
nully_items.each do |s|
|
15
|
+
assert(FlatKit::FieldType::NullType.matches?(s), "#{s} should be null")
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
def test_other_class_does_not_match
|
20
|
+
[ 42, Object.new, true, false ].each do |x|
|
21
|
+
refute(FlatKit::FieldType::NullType.matches?(x), "#{x} should not be == null")
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
def test_coerce_coerces_nil
|
26
|
+
assert_nil(FlatKit::FieldType::NullType.coerce(nil))
|
27
|
+
end
|
28
|
+
|
29
|
+
def test_coerces_nully_items
|
30
|
+
nully_items.each do |s|
|
31
|
+
assert_nil(FlatKit::FieldType::NullType.coerce(s))
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
def test_coerce_failure_non_non_nully_items
|
36
|
+
[ "whatever", 42, Object.new, true, false, Class].each do |x|
|
37
|
+
assert_equal(::FlatKit::FieldType::CoerceFailure, FlatKit::FieldType::NullType.coerce(x))
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
data/test/field_type/test_string_type.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
require_relative '../test_helper'
|
2
|
+
|
3
|
+
module TestFieldType
|
4
|
+
class TestStringType < ::Minitest::Test
|
5
|
+
|
6
|
+
def test_string_will_not_match_non_string_data
|
7
|
+
[42, false, true, 12.5, Object.new].each do |o|
|
8
|
+
refute(FlatKit::FieldType::StringType.matches?(o))
|
9
|
+
end
|
10
|
+
end
|
11
|
+
|
12
|
+
def test_string_type_returns_coerce_failures
|
13
|
+
[BasicObject.new].each do |o|
|
14
|
+
assert_equal(::FlatKit::FieldType::CoerceFailure, ::FlatKit::FieldType::StringType.coerce(o))
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
data/test/field_type/test_timestamp_type.rb
ADDED
@@ -0,0 +1,108 @@
|
|
1
|
+
require_relative '../test_helper'
|
2
|
+
require 'set'
|
3
|
+
|
4
|
+
module TestFieldType
|
5
|
+
class TestTimestampType < ::Minitest::Test
|
6
|
+
def test_time
|
7
|
+
assert(FlatKit::FieldType::TimestampType.matches?(Time.now))
|
8
|
+
end
|
9
|
+
|
10
|
+
def test_date
|
11
|
+
refute(FlatKit::FieldType::TimestampType.matches?(Date.today))
|
12
|
+
end
|
13
|
+
|
14
|
+
def test_date_time
|
15
|
+
refute(FlatKit::FieldType::TimestampType.matches?(DateTime.now))
|
16
|
+
end
|
17
|
+
|
18
|
+
def test_builtin_formats
|
19
|
+
stock_formats = [
|
20
|
+
Time.now.httpdate,
|
21
|
+
Time.now.utc.httpdate,
|
22
|
+
Time.now.iso8601,
|
23
|
+
Time.now.utc.iso8601,
|
24
|
+
Time.now.rfc2822,
|
25
|
+
Time.now.utc.rfc2822,
|
26
|
+
]
|
27
|
+
|
28
|
+
stock_formats.each do |t|
|
29
|
+
assert(FlatKit::FieldType::TimestampType.matches?(t), "#{t} should match timestamp")
|
30
|
+
assert_instance_of(Time, FlatKit::FieldType::TimestampType.coerce(t), "#{t} should convert to timestamp")
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
def test_no_duplicate_formats
|
35
|
+
parse_formats = ::FlatKit::FieldType::TimestampType.parse_formats
|
36
|
+
|
37
|
+
assert_equal(parse_formats.size, parse_formats.sort.uniq.size)
|
38
|
+
end
|
39
|
+
|
40
|
+
def test_parse_formats
|
41
|
+
parse_formats = ::FlatKit::FieldType::TimestampType.parse_formats.dup
|
42
|
+
extra_formats = [
|
43
|
+
"%Y-%m-%dT%H:%M:%S.%N%z",
|
44
|
+
"%Y-%m-%d %H:%M:%S.%NZ",
|
45
|
+
"%Y-%m-%d %H:%M:%S.%N",
|
46
|
+
"%Y-%m-%dT%H:%M:%S.%3N%z",
|
47
|
+
"%Y-%m-%d %H:%M:%S.%3NZ",
|
48
|
+
"%Y-%m-%d %H:%M:%S.%3N",
|
49
|
+
"%Y-%m-%dT%H:%M:%S.%6N%z",
|
50
|
+
"%Y-%m-%d %H:%M:%S.%6NZ",
|
51
|
+
"%Y-%m-%d %H:%M:%S.%6N",
|
52
|
+
"%Y-%m-%dT%H:%M:%S.%9N%z",
|
53
|
+
"%Y-%m-%d %H:%M:%S.%9NZ",
|
54
|
+
"%Y-%m-%d %H:%M:%S.%9N",
|
55
|
+
]
|
56
|
+
parse_formats.concat(extra_formats)
|
57
|
+
|
58
|
+
parse_formats.each do |format|
|
59
|
+
now = Time.now
|
60
|
+
str = now.strftime(format)
|
61
|
+
|
62
|
+
assert(FlatKit::FieldType::TimestampType.matches?(str), "#{str} should match timestamp")
|
63
|
+
coerced = FlatKit::FieldType::TimestampType.coerce(str)
|
64
|
+
|
65
|
+
assert_instance_of(Time, coerced)
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
def test_other_class_does_not_match
|
70
|
+
[ 42, Object.new, true, false ].each do |x|
|
71
|
+
refute(FlatKit::FieldType::TimestampType.matches?(x), "#{x} should not be date")
|
72
|
+
end
|
73
|
+
end
|
74
|
+
|
75
|
+
def test_N_number_does_not_match
|
76
|
+
x = "N89362"
|
77
|
+
refute(FlatKit::FieldType::TimestampType.matches?(x), "#{x} should not be date")
|
78
|
+
end
|
79
|
+
|
80
|
+
def test_coerce_passthrough_time
|
81
|
+
t = Time.now
|
82
|
+
assert_equal(t, FlatKit::FieldType::TimestampType.coerce(t))
|
83
|
+
end
|
84
|
+
|
85
|
+
def test_coerce_do_not_passthrough_date
|
86
|
+
t = Date.today
|
87
|
+
assert_equal(::FlatKit::FieldType::CoerceFailure, FlatKit::FieldType::TimestampType.coerce(t))
|
88
|
+
end
|
89
|
+
|
90
|
+
def test_date_coerce_passthrough_datetime
|
91
|
+
t = Time.now
|
92
|
+
assert_equal(t, FlatKit::FieldType::TimestampType.coerce(t))
|
93
|
+
end
|
94
|
+
|
95
|
+
def test_date_only_does_not_parse
|
96
|
+
t = Time.now.strftime("%Y-%m-%d")
|
97
|
+
assert_equal(::FlatKit::FieldType::CoerceFailure, FlatKit::FieldType::TimestampType.coerce(t))
|
98
|
+
end
|
99
|
+
|
100
|
+
def test_number_coerce_failure
|
101
|
+
assert_equal(::FlatKit::FieldType::CoerceFailure, FlatKit::FieldType::TimestampType.coerce(42))
|
102
|
+
end
|
103
|
+
|
104
|
+
def test_number_coerce_failure_bad_parse
|
105
|
+
assert_equal(::FlatKit::FieldType::CoerceFailure, FlatKit::FieldType::TimestampType.coerce("1234 56 78 90"))
|
106
|
+
end
|
107
|
+
end
|
108
|
+
end
|
data/test/field_type/test_unknown_type.rb
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
require_relative '../test_helper'
|
2
|
+
|
3
|
+
module TestFieldType
|
4
|
+
class TestUnknownType < ::Minitest::Test
|
5
|
+
|
6
|
+
def unknown_items
|
7
|
+
[ 'na', 'n/a', 'unk', 'unknown']
|
8
|
+
end
|
9
|
+
|
10
|
+
def test_unknown_items
|
11
|
+
unknown_items.each do |u|
|
12
|
+
assert(FlatKit::FieldType::UnknownType.matches?(u), "#{u} should be unknown")
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
def test_other_class_does_not_match
|
17
|
+
[ 42, Object.new, true, false, "whatever" ].each do |x|
|
18
|
+
refute(FlatKit::FieldType::UnknownType.matches?(x), "#{x} should not unknown ")
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
def test_coerce_unknown
|
23
|
+
unknown_items.each do |u|
|
24
|
+
assert_equal(u, FlatKit::FieldType::UnknownType.coerce(u), "#{u} should be unknown")
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
def test_other_class_does_not_coerce
|
29
|
+
[ 42, Object.new, true, false, "whatever" ].each do |x|
|
30
|
+
assert_equal(::FlatKit::FieldType::CoerceFailure, FlatKit::FieldType::UnknownType.coerce(x))
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
end
|
35
|
+
end
|
data/test/jsonl/test_writer.rb
CHANGED
@@ -41,12 +41,30 @@ module TestJsonl
|
|
41
41
|
assert_equal(expected, actual)
|
42
42
|
end
|
43
43
|
|
44
|
-
def
|
44
|
+
def test_postion
|
45
45
|
File.open(@write_path, "w+") do |f|
|
46
46
|
writer = ::FlatKit::Jsonl::Writer.new(destination: f)
|
47
47
|
|
48
|
-
|
49
|
-
|
48
|
+
byte_offset = 0
|
49
|
+
@records.each_with_index do |r, idx|
|
50
|
+
record_length = r.data.bytesize
|
51
|
+
|
52
|
+
position = writer.write(r)
|
53
|
+
|
54
|
+
# make sure write stores the last_position api and returns that value
|
55
|
+
assert_equal(position, writer.last_position)
|
56
|
+
|
57
|
+
assert_equal(idx, position.index)
|
58
|
+
assert_equal(byte_offset, position.offset)
|
59
|
+
assert_equal(record_length, position.bytesize)
|
60
|
+
|
61
|
+
byte_offset += record_length
|
62
|
+
|
63
|
+
current_position = writer.current_position
|
64
|
+
assert_equal(idx+1, current_position.index)
|
65
|
+
assert_equal(byte_offset, current_position.offset)
|
66
|
+
assert_equal(0, current_position.bytesize)
|
67
|
+
|
50
68
|
end
|
51
69
|
writer.close
|
52
70
|
|
data/test/run
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
|
4
|
+
test_dir = __dir__
|
5
|
+
|
6
|
+
$: << File.join(File.dirname(test_dir), "lib")
|
7
|
+
$: << test_dir
|
8
|
+
|
9
|
+
require_relative './test_helper'
|
10
|
+
|
11
|
+
require 'find'
|
12
|
+
|
13
|
+
if ARGV.empty? then
|
14
|
+
Find.find(test_dir) do |path|
|
15
|
+
next unless File.file?(path) && File.basename(path) =~ /\Atest_.*\.rb\Z/
|
16
|
+
require path
|
17
|
+
end
|
18
|
+
else
|
19
|
+
ARGV.each do |f|
|
20
|
+
require File.expand_path(f)
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
data/test/stat_type/test_nominal_stats.rb
ADDED
@@ -0,0 +1,69 @@
|
|
1
|
+
require_relative '../test_helper'
|
2
|
+
|
3
|
+
module TestStatType
|
4
|
+
class TestNominalStats < ::Minitest::Test
|
5
|
+
def setup
|
6
|
+
@unique_values = ('a'..'f').to_a
|
7
|
+
@values = Array.new.tap do |a|
|
8
|
+
@unique_values.each do |letter|
|
9
|
+
(Random.rand(42) + 1).times { a << letter }
|
10
|
+
end
|
11
|
+
end
|
12
|
+
|
13
|
+
@frequencies = @values.tally
|
14
|
+
|
15
|
+
@stats = ::FlatKit::StatType::NominalStats.new
|
16
|
+
@all_stats = ::FlatKit::StatType::NominalStats.new(collecting_frequencies: true)
|
17
|
+
|
18
|
+
@values.each do |v|
|
19
|
+
@stats.update(v)
|
20
|
+
@all_stats.update(v)
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
def test_count
|
25
|
+
assert_equal(@values.size, @stats.count)
|
26
|
+
assert_equal(@values.size, @all_stats.count)
|
27
|
+
end
|
28
|
+
|
29
|
+
def test_does_not_collect_unique_count_by_default
|
30
|
+
assert_nil(@stats.unique_count)
|
31
|
+
end
|
32
|
+
|
33
|
+
def test_does_not_collect_unique_values_by_default
|
34
|
+
assert_nil(@stats.unique_values)
|
35
|
+
end
|
36
|
+
|
37
|
+
def test_does_not_collect_frequencies_by_default
|
38
|
+
assert_nil(@stats.frequencies)
|
39
|
+
end
|
40
|
+
|
41
|
+
def test_unique_count
|
42
|
+
assert_equal(@unique_values.size, @all_stats.unique_count)
|
43
|
+
end
|
44
|
+
|
45
|
+
def test_unique_values
|
46
|
+
assert_equal(@unique_values.sort, @all_stats.unique_values.sort)
|
47
|
+
end
|
48
|
+
|
49
|
+
def test_frequencies
|
50
|
+
assert_equal(@frequencies, @all_stats.frequencies)
|
51
|
+
end
|
52
|
+
|
53
|
+
def test_default_to_hash
|
54
|
+
expecting = { "count" => @values.size }
|
55
|
+
assert_equal(expecting, @stats.to_hash)
|
56
|
+
end
|
57
|
+
|
58
|
+
def test_all_stats_hash
|
59
|
+
expecting = {
|
60
|
+
"count" => @values.size,
|
61
|
+
"unique_count" => @unique_values.size,
|
62
|
+
"unique_values" => @unique_values.sort,
|
63
|
+
"mode" => @frequencies.max_by { |k,v| v }.first
|
64
|
+
}
|
65
|
+
assert_equal(expecting, @all_stats.to_hash)
|
66
|
+
end
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
data/test/stat_type/test_numerical_stats.rb
ADDED
@@ -0,0 +1,118 @@
|
|
1
|
+
require_relative '../test_helper'
|
2
|
+
|
3
|
+
|
4
|
+
module TestStatType
|
5
|
+
class TestNumericalStats < ::Minitest::Test
|
6
|
+
def setup
|
7
|
+
@stats = FlatKit::StatType::NumericalStats.new
|
8
|
+
@full_stats = FlatKit::StatType::NumericalStats.new
|
9
|
+
@all_stats = FlatKit::StatType::NumericalStats.new(collecting_frequencies: true)
|
10
|
+
[ 1, 2, 3].each { |i| @full_stats.update( i ) }
|
11
|
+
end
|
12
|
+
|
13
|
+
def test_intialized_with_usable_values
|
14
|
+
assert_equal(0, @stats.count)
|
15
|
+
assert_equal(Float::INFINITY, @stats.min)
|
16
|
+
assert_equal(-Float::INFINITY, @stats.max)
|
17
|
+
assert_equal(0.0, @stats.sum)
|
18
|
+
assert_equal(0.0, @stats.rate)
|
19
|
+
end
|
20
|
+
|
21
|
+
def test_calculates_mean
|
22
|
+
assert_equal(2.0, @full_stats.mean)
|
23
|
+
end
|
24
|
+
|
25
|
+
def test_calculates_rate
|
26
|
+
assert_equal(0.5, @full_stats.rate)
|
27
|
+
end
|
28
|
+
|
29
|
+
def test_tracks_the_maximum_value
|
30
|
+
assert_equal(3.0, @full_stats.max)
|
31
|
+
end
|
32
|
+
|
33
|
+
def test_tracks_the_minimum_value
|
34
|
+
assert_equal(1.0, @full_stats.min)
|
35
|
+
end
|
36
|
+
|
37
|
+
def test_tracks_the_count
|
38
|
+
assert_equal(3,@full_stats.count)
|
39
|
+
end
|
40
|
+
|
41
|
+
def test_tracks_the_sum
|
42
|
+
assert_equal(6.0, @full_stats.sum)
|
43
|
+
end
|
44
|
+
|
45
|
+
def test_calculates_the_standard_deviation
|
46
|
+
assert_equal(1.0, @full_stats.stddev)
|
47
|
+
end
|
48
|
+
|
49
|
+
def test_calculates_the_sum_of_squares
|
50
|
+
assert_equal(14, @full_stats.sumsq)
|
51
|
+
end
|
52
|
+
|
53
|
+
def test_converts_to_a_hash
|
54
|
+
h = @full_stats.to_hash
|
55
|
+
assert_equal(::FlatKit::StatType::NumericalStats.default_stats.size, h.size)
|
56
|
+
assert_equal(::FlatKit::StatType::NumericalStats.default_stats, h.keys.sort)
|
57
|
+
end
|
58
|
+
|
59
|
+
def test_converts_to_a_limited_hash_if_given_arguments
|
60
|
+
h = @full_stats.to_hash( "min", "max", "mean" )
|
61
|
+
assert_equal(3, h.size)
|
62
|
+
assert_equal(%w[ max mean min], h.keys.sort)
|
63
|
+
|
64
|
+
h = @full_stats.to_hash( %w[ count rate ] )
|
65
|
+
assert_equal(2, h.size)
|
66
|
+
assert_equal(%w[ count rate ], h.keys.sort)
|
67
|
+
end
|
68
|
+
|
69
|
+
def test_raises_nomethoderror_if_an_invalid_stat_is_used
|
70
|
+
assert_raises(NoMethodError) { @full_stats.to_hash( "wibble" ) }
|
71
|
+
end
|
72
|
+
|
73
|
+
def test_converts_to_a_json_string
|
74
|
+
j = @full_stats.to_json
|
75
|
+
h = JSON.parse( j )
|
76
|
+
assert_equal(::FlatKit::StatType::NumericalStats.default_stats.size, h.size)
|
77
|
+
assert_equal(::FlatKit::StatType::NumericalStats.default_stats, h.keys.sort)
|
78
|
+
end
|
79
|
+
|
80
|
+
def test_converts_to_a_limited_json_hash_if_given_arguments
|
81
|
+
j = @full_stats.to_json( "min", "max", "mean" )
|
82
|
+
h = JSON.parse( j )
|
83
|
+
assert_equal(3, h.size)
|
84
|
+
assert_equal(%w[ max mean min], h.keys.sort)
|
85
|
+
|
86
|
+
j = @full_stats.to_json( %w[ count rate ] )
|
87
|
+
h = JSON.parse( j )
|
88
|
+
assert_equal(2, h.size)
|
89
|
+
assert_equal(%w[ count rate ], h.keys.sort)
|
90
|
+
end
|
91
|
+
|
92
|
+
def test_raises_nomethoderror_if_an_invalid_json_stat_is_used
|
93
|
+
assert_raises(NoMethodError) { @full_stats.to_json( "wibble" ) }
|
94
|
+
end
|
95
|
+
|
96
|
+
def test_collects_mode
|
97
|
+
values = Array.new.tap do |a|
|
98
|
+
100.times {
|
99
|
+
n = Random.rand(10)
|
100
|
+
a << n
|
101
|
+
@all_stats.update(n)
|
102
|
+
}
|
103
|
+
end
|
104
|
+
|
105
|
+
tally = values.tally
|
106
|
+
mode_value = tally.max_by { |v, count| count }.first
|
107
|
+
|
108
|
+
assert_equal(mode_value, @all_stats.mode)
|
109
|
+
end
|
110
|
+
|
111
|
+
def test_collecting_frequences_reports_extra_stat_names
|
112
|
+
stat_names = @all_stats.collected_stats
|
113
|
+
assert_includes(stat_names, "mode")
|
114
|
+
assert_includes(stat_names, "unique_count")
|
115
|
+
assert_includes(stat_names, "unique_values")
|
116
|
+
end
|
117
|
+
end
|
118
|
+
end
|