flat_kit 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. checksums.yaml +4 -4
  2. data/HISTORY.md +6 -0
  3. data/Manifest.txt +34 -0
  4. data/Rakefile +1 -0
  5. data/examples/stream-active-record-to-csv.rb +42 -0
  6. data/lib/flat_kit.rb +9 -2
  7. data/lib/flat_kit/cli.rb +11 -0
  8. data/lib/flat_kit/command.rb +1 -0
  9. data/lib/flat_kit/command/stats.rb +94 -0
  10. data/lib/flat_kit/descendant_tracker.rb +9 -0
  11. data/lib/flat_kit/event_emitter.rb +2 -2
  12. data/lib/flat_kit/field_stats.rb +241 -0
  13. data/lib/flat_kit/field_type.rb +75 -0
  14. data/lib/flat_kit/field_type/boolean_type.rb +48 -0
  15. data/lib/flat_kit/field_type/date_type.rb +179 -0
  16. data/lib/flat_kit/field_type/float_type.rb +37 -0
  17. data/lib/flat_kit/field_type/guess_type.rb +20 -0
  18. data/lib/flat_kit/field_type/integer_type.rb +34 -0
  19. data/lib/flat_kit/field_type/null_type.rb +35 -0
  20. data/lib/flat_kit/field_type/string_type.rb +22 -0
  21. data/lib/flat_kit/field_type/timestamp_type.rb +47 -0
  22. data/lib/flat_kit/field_type/unknown_type.rb +26 -0
  23. data/lib/flat_kit/jsonl/record.rb +2 -2
  24. data/lib/flat_kit/jsonl/writer.rb +18 -9
  25. data/lib/flat_kit/merge.rb +4 -2
  26. data/lib/flat_kit/output.rb +4 -0
  27. data/lib/flat_kit/position.rb +19 -0
  28. data/lib/flat_kit/stat_type.rb +65 -0
  29. data/lib/flat_kit/stat_type/nominal_stats.rb +58 -0
  30. data/lib/flat_kit/stat_type/numerical_stats.rb +120 -0
  31. data/lib/flat_kit/stat_type/ordinal_stats.rb +42 -0
  32. data/lib/flat_kit/stats.rb +66 -0
  33. data/lib/flat_kit/writer.rb +17 -2
  34. data/lib/flat_kit/xsv/writer.rb +24 -9
  35. data/test/field_type/test_boolean_type.rb +65 -0
  36. data/test/field_type/test_date_type.rb +71 -0
  37. data/test/field_type/test_float_type.rb +56 -0
  38. data/test/field_type/test_guess_type.rb +14 -0
  39. data/test/field_type/test_integer_type.rb +52 -0
  40. data/test/field_type/test_null_type.rb +41 -0
  41. data/test/field_type/test_string_type.rb +18 -0
  42. data/test/field_type/test_timestamp_type.rb +108 -0
  43. data/test/field_type/test_unknown_type.rb +35 -0
  44. data/test/jsonl/test_writer.rb +21 -3
  45. data/test/run +23 -0
  46. data/test/stat_type/test_nominal_stats.rb +69 -0
  47. data/test/stat_type/test_numerical_stats.rb +118 -0
  48. data/test/stat_type/test_ordinal_stats.rb +92 -0
  49. data/test/test_event_emitter.rb +19 -2
  50. data/test/test_field_stats.rb +134 -0
  51. data/test/test_field_type.rb +34 -0
  52. data/test/xsv/test_writer.rb +25 -4
  53. metadata +65 -2
@@ -0,0 +1,41 @@
1
+ require_relative '../test_helper'
2
+
3
module TestFieldType
  # Checks FlatKit::FieldType::NullType.matches? and .coerce for nil, for
  # null-like strings, and for values the tests expect to be rejected.
  class TestNullType < ::Minitest::Test
    # Strings these tests expect NullType to treat as null.
    def nully_items
      [ "null", "NULL", "nil", "\\N" ]
    end

    # nil itself must match.
    def test_null
      matched = FlatKit::FieldType::NullType.matches?(nil)
      assert(matched)
    end

    # Every null-like string must match.
    def test_nully_items
      nully_items.each { |text| assert(FlatKit::FieldType::NullType.matches?(text), "#{text} should be null") }
    end

    # Non-string, non-nil values must not match.
    def test_other_class_does_not_match
      non_nulls = [ 42, Object.new, true, false ]
      non_nulls.each do |value|
        refute(FlatKit::FieldType::NullType.matches?(value), "#{value} should not be == null")
      end
    end

    # Coercing nil yields nil.
    def test_coerce_coerces_nil
      coerced = FlatKit::FieldType::NullType.coerce(nil)
      assert_nil(coerced)
    end

    # Coercing any null-like string yields nil.
    def test_coerces_nully_items
      nully_items.each { |text| assert_nil(FlatKit::FieldType::NullType.coerce(text)) }
    end

    # Anything else coerces to the CoerceFailure sentinel.
    def test_coerce_failure_non_non_nully_items
      rejected = [ "whatever", 42, Object.new, true, false, Class]
      rejected.each do |value|
        outcome = FlatKit::FieldType::NullType.coerce(value)
        assert_equal(::FlatKit::FieldType::CoerceFailure, outcome)
      end
    end
  end
end
@@ -0,0 +1,18 @@
1
+ require_relative '../test_helper'
2
+
3
module TestFieldType
  # Checks that FlatKit::FieldType::StringType rejects non-string data and
  # reports a coercion failure for an object it cannot convert.
  class TestStringType < ::Minitest::Test
    # Numbers, booleans and plain objects are not string data.
    def test_string_will_not_match_non_string_data
      non_strings = [42, false, true, 12.5, Object.new]
      non_strings.each { |candidate| refute(FlatKit::FieldType::StringType.matches?(candidate)) }
    end

    # A BasicObject instance coerces to the CoerceFailure sentinel.
    def test_string_type_returns_coerce_failures
      [BasicObject.new].each do |candidate|
        outcome = ::FlatKit::FieldType::StringType.coerce(candidate)
        assert_equal(::FlatKit::FieldType::CoerceFailure, outcome)
      end
    end
  end
end
@@ -0,0 +1,108 @@
1
+ require_relative '../test_helper'
2
+ require 'set'
3
+
4
module TestFieldType
  # Tests for FlatKit::FieldType::TimestampType: which values match, and what
  # coerce returns for Time objects, formatted strings, and rejected input.
  class TestTimestampType < ::Minitest::Test
    # A Time instance matches.
    def test_time
      assert(FlatKit::FieldType::TimestampType.matches?(Time.now))
    end

    # A Date instance does not match.
    def test_date
      refute(FlatKit::FieldType::TimestampType.matches?(Date.today))
    end

    # A DateTime instance does not match.
    def test_date_time
      refute(FlatKit::FieldType::TimestampType.matches?(DateTime.now))
    end

    # Strings produced by Time's httpdate / iso8601 / rfc2822 formatters, in
    # both local time and UTC, must match and coerce to a Time.
    def test_builtin_formats
      now = Time.now
      stock_formats = [
        now.httpdate,
        now.utc.httpdate,
        now.iso8601,
        now.utc.iso8601,
        now.rfc2822,
        now.utc.rfc2822,
      ]

      stock_formats.each do |t|
        assert(FlatKit::FieldType::TimestampType.matches?(t), "#{t} should match timestamp")
        assert_instance_of(Time, FlatKit::FieldType::TimestampType.coerce(t), "#{t} should convert to timestamp")
      end
    end

    # The list of parse formats must not contain the same format twice.
    # Reporting the offending formats makes a failure actionable.
    def test_no_duplicate_formats
      parse_formats = ::FlatKit::FieldType::TimestampType.parse_formats
      duplicates = parse_formats.select { |format| parse_formats.count(format) > 1 }.uniq

      assert_empty(duplicates, "duplicate parse formats: #{duplicates.inspect}")
    end

    # Every declared parse format, plus fractional-second variants, must
    # round-trip: strftime output matches and coerces to a Time.
    def test_parse_formats
      # dup so that concat below does not mutate the list owned by TimestampType
      parse_formats = ::FlatKit::FieldType::TimestampType.parse_formats.dup
      extra_formats = [
        "%Y-%m-%dT%H:%M:%S.%N%z",
        "%Y-%m-%d %H:%M:%S.%NZ",
        "%Y-%m-%d %H:%M:%S.%N",
        "%Y-%m-%dT%H:%M:%S.%3N%z",
        "%Y-%m-%d %H:%M:%S.%3NZ",
        "%Y-%m-%d %H:%M:%S.%3N",
        "%Y-%m-%dT%H:%M:%S.%6N%z",
        "%Y-%m-%d %H:%M:%S.%6NZ",
        "%Y-%m-%d %H:%M:%S.%6N",
        "%Y-%m-%dT%H:%M:%S.%9N%z",
        "%Y-%m-%d %H:%M:%S.%9NZ",
        "%Y-%m-%d %H:%M:%S.%9N",
      ]
      parse_formats.concat(extra_formats)

      now = Time.now
      parse_formats.each do |format|
        str = now.strftime(format)

        assert(FlatKit::FieldType::TimestampType.matches?(str), "#{str} should match timestamp")
        coerced = FlatKit::FieldType::TimestampType.coerce(str)

        assert_instance_of(Time, coerced)
      end
    end

    # Integers, plain objects and booleans are not timestamps.
    def test_other_class_does_not_match
      [ 42, Object.new, true, false ].each do |x|
        refute(FlatKit::FieldType::TimestampType.matches?(x), "#{x} should not be timestamp")
      end
    end

    # A string that merely starts with a letter used in strftime directives
    # must not be mistaken for a timestamp.
    def test_N_number_does_not_match
      x = "N89362"
      refute(FlatKit::FieldType::TimestampType.matches?(x), "#{x} should not be timestamp")
    end

    # Coercing a Time returns an equal Time.
    def test_coerce_passthrough_time
      t = Time.now
      assert_equal(t, FlatKit::FieldType::TimestampType.coerce(t))
    end

    # A Date is not accepted by coerce.
    def test_coerce_do_not_passthrough_date
      t = Date.today
      assert_equal(::FlatKit::FieldType::CoerceFailure, FlatKit::FieldType::TimestampType.coerce(t))
    end

    # NOTE(review): this body is identical to test_coerce_passthrough_time and
    # uses Time.now; the name suggests a DateTime case was intended - confirm
    # the expected DateTime behaviour before changing the input.
    def test_date_coerce_passthrough_datetime
      t = Time.now
      assert_equal(t, FlatKit::FieldType::TimestampType.coerce(t))
    end

    # A date-only string (no time component) fails to coerce.
    def test_date_only_does_not_parse
      t = Time.now.strftime("%Y-%m-%d")
      assert_equal(::FlatKit::FieldType::CoerceFailure, FlatKit::FieldType::TimestampType.coerce(t))
    end

    # A bare Integer fails to coerce.
    def test_number_coerce_failure
      assert_equal(::FlatKit::FieldType::CoerceFailure, FlatKit::FieldType::TimestampType.coerce(42))
    end

    # A digit string that fits no format fails to coerce.
    def test_number_coerce_failure_bad_parse
      assert_equal(::FlatKit::FieldType::CoerceFailure, FlatKit::FieldType::TimestampType.coerce("1234 56 78 90"))
    end
  end
end
@@ -0,0 +1,35 @@
1
+ require_relative '../test_helper'
2
+
3
module TestFieldType
  # Tests for FlatKit::FieldType::UnknownType: placeholder strings such as
  # 'na' or 'unknown' match and coerce to themselves; everything else in
  # these tests is rejected.
  class TestUnknownType < ::Minitest::Test

    # Placeholder strings the tests expect UnknownType to recognize.
    def unknown_items
      [ 'na', 'n/a', 'unk', 'unknown']
    end

    # Every placeholder string must match.
    def test_unknown_items
      unknown_items.each do |u|
        assert(FlatKit::FieldType::UnknownType.matches?(u), "#{u} should be unknown")
      end
    end

    # Non-placeholder values, including an ordinary string, must not match.
    def test_other_class_does_not_match
      [ 42, Object.new, true, false, "whatever" ].each do |x|
        refute(FlatKit::FieldType::UnknownType.matches?(x), "#{x} should not be unknown")
      end
    end

    # Coercing a placeholder string returns an equal string.
    def test_coerce_unknown
      unknown_items.each do |u|
        assert_equal(u, FlatKit::FieldType::UnknownType.coerce(u), "#{u} should be unknown")
      end
    end

    # Non-placeholder values coerce to the CoerceFailure sentinel.
    def test_other_class_does_not_coerce
      [ 42, Object.new, true, false, "whatever" ].each do |x|
        assert_equal(::FlatKit::FieldType::CoerceFailure, FlatKit::FieldType::UnknownType.coerce(x))
      end
    end

  end
end
@@ -41,12 +41,30 @@ module TestJsonl
41
41
  assert_equal(expected, actual)
42
42
  end
43
43
 
44
- def test_writes_to_io
44
+ def test_postion
45
45
  File.open(@write_path, "w+") do |f|
46
46
  writer = ::FlatKit::Jsonl::Writer.new(destination: f)
47
47
 
48
- @records.each do |r|
49
- writer.write(r)
48
+ byte_offset = 0
49
+ @records.each_with_index do |r, idx|
50
+ record_length = r.data.bytesize
51
+
52
+ position = writer.write(r)
53
+
54
+ # make sure write stores the last_position api and returns that value
55
+ assert_equal(position, writer.last_position)
56
+
57
+ assert_equal(idx, position.index)
58
+ assert_equal(byte_offset, position.offset)
59
+ assert_equal(record_length, position.bytesize)
60
+
61
+ byte_offset += record_length
62
+
63
+ current_position = writer.current_position
64
+ assert_equal(idx+1, current_position.index)
65
+ assert_equal(byte_offset, current_position.offset)
66
+ assert_equal(0, current_position.bytesize)
67
+
50
68
  end
51
69
  writer.close
52
70
 
data/test/run ADDED
@@ -0,0 +1,23 @@
1
#!/usr/bin/env ruby

# Test runner script.
#
# With no arguments, loads every file named test_*.rb found under this
# directory. With arguments, loads exactly the files named on the command line.

test_dir = __dir__

# Make the sibling lib/ directory and the test directory itself loadable.
[File.join(File.dirname(test_dir), "lib"), test_dir].each { |dir| $LOAD_PATH << dir }

require_relative './test_helper'

require 'find'

if ARGV.empty?
  Find.find(test_dir) do |candidate|
    is_test_file = File.file?(candidate) && File.basename(candidate) =~ /\Atest_.*\.rb\Z/
    require candidate if is_test_file
  end
else
  ARGV.each { |requested| require File.expand_path(requested) }
end
23
+
@@ -0,0 +1,69 @@
1
+ require_relative '../test_helper'
2
+
3
module TestStatType
  # Tests for FlatKit::StatType::NominalStats, both in its default mode and
  # with collecting_frequencies enabled.
  class TestNominalStats < ::Minitest::Test
    # Builds a sample where each letter 'a'..'f' appears a random number of
    # times (1..42) and feeds it to a default and a frequency-collecting
    # stats object.
    def setup
      @unique_values = ('a'..'f').to_a
      @values = @unique_values.flat_map do |letter|
        Array.new(Random.rand(42) + 1, letter)
      end

      @frequencies = @values.tally

      @stats     = ::FlatKit::StatType::NominalStats.new
      @all_stats = ::FlatKit::StatType::NominalStats.new(collecting_frequencies: true)

      @values.each do |value|
        @stats.update(value)
        @all_stats.update(value)
      end
    end

    # Both stats objects count every value they were given.
    def test_count
      expected = @values.size
      assert_equal(expected, @stats.count)
      assert_equal(expected, @all_stats.count)
    end

    # Without collecting_frequencies there is no unique count.
    def test_does_not_collect_unique_count_by_default
      assert_nil(@stats.unique_count)
    end

    # Without collecting_frequencies there are no unique values.
    def test_does_not_collect_unique_values_by_default
      assert_nil(@stats.unique_values)
    end

    # Without collecting_frequencies there are no frequencies.
    def test_does_not_collect_frequencies_by_default
      assert_nil(@stats.frequencies)
    end

    def test_unique_count
      assert_equal(@unique_values.size, @all_stats.unique_count)
    end

    # Order is not asserted, so both sides are sorted before comparing.
    def test_unique_values
      expected = @unique_values.sort
      actual   = @all_stats.unique_values.sort
      assert_equal(expected, actual)
    end

    def test_frequencies
      assert_equal(@frequencies, @all_stats.frequencies)
    end

    # The default hash reports only the count.
    def test_default_to_hash
      assert_equal({ "count" => @values.size }, @stats.to_hash)
    end

    # With frequencies collected the hash also carries unique_count,
    # unique_values and mode.
    def test_all_stats_hash
      most_frequent, _occurrences = @frequencies.max_by { |_value, occurrences| occurrences }
      expecting = {
        "count"         => @values.size,
        "unique_count"  => @unique_values.size,
        "unique_values" => @unique_values.sort,
        "mode"          => most_frequent
      }
      assert_equal(expecting, @all_stats.to_hash)
    end
  end
end
69
+
@@ -0,0 +1,118 @@
1
+ require_relative '../test_helper'
2
+
3
+
4
module TestStatType
  # Tests for FlatKit::StatType::NumericalStats: initial values, running
  # aggregates over the sample 1, 2, 3, hash/JSON export, and mode collection.
  class TestNumericalStats < ::Minitest::Test
    # @stats stays empty, @full_stats receives 1, 2, 3, and @all_stats is
    # created with frequency collection turned on.
    def setup
      @stats      = FlatKit::StatType::NumericalStats.new
      @full_stats = FlatKit::StatType::NumericalStats.new
      @all_stats  = FlatKit::StatType::NumericalStats.new(collecting_frequencies: true)
      (1..3).each { |number| @full_stats.update(number) }
    end

    # A fresh stats object starts with zero count/sum/rate and infinite
    # min/max sentinels.
    def test_intialized_with_usable_values
      assert_equal(0, @stats.count)
      assert_equal(Float::INFINITY, @stats.min)
      assert_equal(-Float::INFINITY, @stats.max)
      assert_equal(0.0, @stats.sum)
      assert_equal(0.0, @stats.rate)
    end

    def test_calculates_mean
      assert_equal(2.0, @full_stats.mean)
    end

    def test_calculates_rate
      assert_equal(0.5, @full_stats.rate)
    end

    def test_tracks_the_maximum_value
      assert_equal(3.0, @full_stats.max)
    end

    def test_tracks_the_minimum_value
      assert_equal(1.0, @full_stats.min)
    end

    def test_tracks_the_count
      assert_equal(3, @full_stats.count)
    end

    def test_tracks_the_sum
      assert_equal(6.0, @full_stats.sum)
    end

    def test_calculates_the_standard_deviation
      assert_equal(1.0, @full_stats.stddev)
    end

    def test_calculates_the_sum_of_squares
      assert_equal(14, @full_stats.sumsq)
    end

    # With no arguments, to_hash has exactly the default stat names as keys.
    def test_converts_to_a_hash
      exported = @full_stats.to_hash
      assert_equal(::FlatKit::StatType::NumericalStats.default_stats.size, exported.size)
      assert_equal(::FlatKit::StatType::NumericalStats.default_stats, exported.keys.sort)
    end

    # to_hash accepts stat names either as separate arguments or as one array.
    def test_converts_to_a_limited_hash_if_given_arguments
      by_splat = @full_stats.to_hash( "min", "max", "mean" )
      assert_equal(3, by_splat.size)
      assert_equal(%w[ max mean min], by_splat.keys.sort)

      by_array = @full_stats.to_hash( %w[ count rate ] )
      assert_equal(2, by_array.size)
      assert_equal(%w[ count rate ], by_array.keys.sort)
    end

    def test_raises_nomethoderror_if_an_invalid_stat_is_used
      assert_raises(NoMethodError) do
        @full_stats.to_hash( "wibble" )
      end
    end

    # With no arguments, to_json encodes exactly the default stat names.
    def test_converts_to_a_json_string
      decoded = JSON.parse( @full_stats.to_json )
      assert_equal(::FlatKit::StatType::NumericalStats.default_stats.size, decoded.size)
      assert_equal(::FlatKit::StatType::NumericalStats.default_stats, decoded.keys.sort)
    end

    # to_json accepts stat names either as separate arguments or as one array.
    def test_converts_to_a_limited_json_hash_if_given_arguments
      by_splat = JSON.parse( @full_stats.to_json( "min", "max", "mean" ) )
      assert_equal(3, by_splat.size)
      assert_equal(%w[ max mean min], by_splat.keys.sort)

      by_array = JSON.parse( @full_stats.to_json( %w[ count rate ] ) )
      assert_equal(2, by_array.size)
      assert_equal(%w[ count rate ], by_array.keys.sort)
    end

    def test_raises_nomethoderror_if_an_invalid_json_stat_is_used
      assert_raises(NoMethodError) do
        @full_stats.to_json( "wibble" )
      end
    end

    # Feeds 100 random integers in 0...10 and compares the reported mode with
    # the most frequent value computed from a tally of the same numbers.
    def test_collects_mode
      values = 100.times.map do
        drawn = Random.rand(10)
        @all_stats.update(drawn)
        drawn
      end

      mode_value, _occurrences = values.tally.max_by { |_value, occurrences| occurrences }

      assert_equal(mode_value, @all_stats.mode)
    end

    # Enabling frequency collection adds mode, unique_count and unique_values
    # to the collected stat names.
    def test_collecting_frequences_reports_extra_stat_names
      stat_names = @all_stats.collected_stats
      %w[mode unique_count unique_values].each do |expected_name|
        assert_includes(stat_names, expected_name)
      end
    end
  end
end