flat_kit 0.2.0 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (53)
  1. checksums.yaml +4 -4
  2. data/HISTORY.md +6 -0
  3. data/Manifest.txt +34 -0
  4. data/Rakefile +1 -0
  5. data/examples/stream-active-record-to-csv.rb +42 -0
  6. data/lib/flat_kit.rb +9 -2
  7. data/lib/flat_kit/cli.rb +11 -0
  8. data/lib/flat_kit/command.rb +1 -0
  9. data/lib/flat_kit/command/stats.rb +94 -0
  10. data/lib/flat_kit/descendant_tracker.rb +9 -0
  11. data/lib/flat_kit/event_emitter.rb +2 -2
  12. data/lib/flat_kit/field_stats.rb +241 -0
  13. data/lib/flat_kit/field_type.rb +75 -0
  14. data/lib/flat_kit/field_type/boolean_type.rb +48 -0
  15. data/lib/flat_kit/field_type/date_type.rb +179 -0
  16. data/lib/flat_kit/field_type/float_type.rb +37 -0
  17. data/lib/flat_kit/field_type/guess_type.rb +20 -0
  18. data/lib/flat_kit/field_type/integer_type.rb +34 -0
  19. data/lib/flat_kit/field_type/null_type.rb +35 -0
  20. data/lib/flat_kit/field_type/string_type.rb +22 -0
  21. data/lib/flat_kit/field_type/timestamp_type.rb +47 -0
  22. data/lib/flat_kit/field_type/unknown_type.rb +26 -0
  23. data/lib/flat_kit/jsonl/record.rb +2 -2
  24. data/lib/flat_kit/jsonl/writer.rb +18 -9
  25. data/lib/flat_kit/merge.rb +4 -2
  26. data/lib/flat_kit/output.rb +4 -0
  27. data/lib/flat_kit/position.rb +19 -0
  28. data/lib/flat_kit/stat_type.rb +65 -0
  29. data/lib/flat_kit/stat_type/nominal_stats.rb +58 -0
  30. data/lib/flat_kit/stat_type/numerical_stats.rb +120 -0
  31. data/lib/flat_kit/stat_type/ordinal_stats.rb +42 -0
  32. data/lib/flat_kit/stats.rb +66 -0
  33. data/lib/flat_kit/writer.rb +17 -2
  34. data/lib/flat_kit/xsv/writer.rb +24 -9
  35. data/test/field_type/test_boolean_type.rb +65 -0
  36. data/test/field_type/test_date_type.rb +71 -0
  37. data/test/field_type/test_float_type.rb +56 -0
  38. data/test/field_type/test_guess_type.rb +14 -0
  39. data/test/field_type/test_integer_type.rb +52 -0
  40. data/test/field_type/test_null_type.rb +41 -0
  41. data/test/field_type/test_string_type.rb +18 -0
  42. data/test/field_type/test_timestamp_type.rb +108 -0
  43. data/test/field_type/test_unknown_type.rb +35 -0
  44. data/test/jsonl/test_writer.rb +21 -3
  45. data/test/run +23 -0
  46. data/test/stat_type/test_nominal_stats.rb +69 -0
  47. data/test/stat_type/test_numerical_stats.rb +118 -0
  48. data/test/stat_type/test_ordinal_stats.rb +92 -0
  49. data/test/test_event_emitter.rb +19 -2
  50. data/test/test_field_stats.rb +134 -0
  51. data/test/test_field_type.rb +34 -0
  52. data/test/xsv/test_writer.rb +25 -4
  53. metadata +65 -2
@@ -0,0 +1,41 @@
1
+ require_relative '../test_helper'
2
+
3
+ module TestFieldType
4
+ class TestNullType < ::Minitest::Test
5
+ def nully_items
6
+ [ "null", "NULL", "nil", "\\N" ]
7
+ end
8
+
9
+ def test_null
10
+ assert(FlatKit::FieldType::NullType.matches?(nil))
11
+ end
12
+
13
+ def test_nully_items
14
+ nully_items.each do |s|
15
+ assert(FlatKit::FieldType::NullType.matches?(s), "#{s} should be null")
16
+ end
17
+ end
18
+
19
+ def test_other_class_does_not_match
20
+ [ 42, Object.new, true, false ].each do |x|
21
+ refute(FlatKit::FieldType::NullType.matches?(x), "#{x} should not be == null")
22
+ end
23
+ end
24
+
25
+ def test_coerce_coerces_nil
26
+ assert_nil(FlatKit::FieldType::NullType.coerce(nil))
27
+ end
28
+
29
+ def test_coerces_nully_items
30
+ nully_items.each do |s|
31
+ assert_nil(FlatKit::FieldType::NullType.coerce(s))
32
+ end
33
+ end
34
+
35
+ def test_coerce_failure_non_non_nully_items
36
+ [ "whatever", 42, Object.new, true, false, Class].each do |x|
37
+ assert_equal(::FlatKit::FieldType::CoerceFailure, FlatKit::FieldType::NullType.coerce(x))
38
+ end
39
+ end
40
+ end
41
+ end
@@ -0,0 +1,18 @@
1
+ require_relative '../test_helper'
2
+
3
+ module TestFieldType
4
+ class TestStringType < ::Minitest::Test
5
+
6
+ def test_string_will_not_match_non_string_data
7
+ [42, false, true, 12.5, Object.new].each do |o|
8
+ refute(FlatKit::FieldType::StringType.matches?(o))
9
+ end
10
+ end
11
+
12
+ def test_string_type_returns_coerce_failures
13
+ [BasicObject.new].each do |o|
14
+ assert_equal(::FlatKit::FieldType::CoerceFailure, ::FlatKit::FieldType::StringType.coerce(o))
15
+ end
16
+ end
17
+ end
18
+ end
@@ -0,0 +1,108 @@
1
+ require_relative '../test_helper'
2
+ require 'set'
3
+
4
+ module TestFieldType
5
+ class TestTimestampType < ::Minitest::Test
6
+ def test_time
7
+ assert(FlatKit::FieldType::TimestampType.matches?(Time.now))
8
+ end
9
+
10
+ def test_date
11
+ refute(FlatKit::FieldType::TimestampType.matches?(Date.today))
12
+ end
13
+
14
+ def test_date_time
15
+ refute(FlatKit::FieldType::TimestampType.matches?(DateTime.now))
16
+ end
17
+
18
+ def test_builtin_formats
19
+ stock_formats = [
20
+ Time.now.httpdate,
21
+ Time.now.utc.httpdate,
22
+ Time.now.iso8601,
23
+ Time.now.utc.iso8601,
24
+ Time.now.rfc2822,
25
+ Time.now.utc.rfc2822,
26
+ ]
27
+
28
+ stock_formats.each do |t|
29
+ assert(FlatKit::FieldType::TimestampType.matches?(t), "#{t} should match timestamp")
30
+ assert_instance_of(Time, FlatKit::FieldType::TimestampType.coerce(t), "#{t} should convert to timestamp")
31
+ end
32
+ end
33
+
34
+ def test_no_duplicate_formats
35
+ parse_formats = ::FlatKit::FieldType::TimestampType.parse_formats
36
+
37
+ assert_equal(parse_formats.size, parse_formats.sort.uniq.size)
38
+ end
39
+
40
+ def test_parse_formats
41
+ parse_formats = ::FlatKit::FieldType::TimestampType.parse_formats.dup
42
+ extra_formats = [
43
+ "%Y-%m-%dT%H:%M:%S.%N%z",
44
+ "%Y-%m-%d %H:%M:%S.%NZ",
45
+ "%Y-%m-%d %H:%M:%S.%N",
46
+ "%Y-%m-%dT%H:%M:%S.%3N%z",
47
+ "%Y-%m-%d %H:%M:%S.%3NZ",
48
+ "%Y-%m-%d %H:%M:%S.%3N",
49
+ "%Y-%m-%dT%H:%M:%S.%6N%z",
50
+ "%Y-%m-%d %H:%M:%S.%6NZ",
51
+ "%Y-%m-%d %H:%M:%S.%6N",
52
+ "%Y-%m-%dT%H:%M:%S.%9N%z",
53
+ "%Y-%m-%d %H:%M:%S.%9NZ",
54
+ "%Y-%m-%d %H:%M:%S.%9N",
55
+ ]
56
+ parse_formats.concat(extra_formats)
57
+
58
+ parse_formats.each do |format|
59
+ now = Time.now
60
+ str = now.strftime(format)
61
+
62
+ assert(FlatKit::FieldType::TimestampType.matches?(str), "#{str} should match timestamp")
63
+ coerced = FlatKit::FieldType::TimestampType.coerce(str)
64
+
65
+ assert_instance_of(Time, coerced)
66
+ end
67
+ end
68
+
69
+ def test_other_class_does_not_match
70
+ [ 42, Object.new, true, false ].each do |x|
71
+ refute(FlatKit::FieldType::TimestampType.matches?(x), "#{x} should not be date")
72
+ end
73
+ end
74
+
75
+ def test_N_number_does_not_match
76
+ x = "N89362"
77
+ refute(FlatKit::FieldType::TimestampType.matches?(x), "#{x} should not be date")
78
+ end
79
+
80
+ def test_coerce_passthrough_time
81
+ t = Time.now
82
+ assert_equal(t, FlatKit::FieldType::TimestampType.coerce(t))
83
+ end
84
+
85
+ def test_coerce_do_not_passthrough_date
86
+ t = Date.today
87
+ assert_equal(::FlatKit::FieldType::CoerceFailure, FlatKit::FieldType::TimestampType.coerce(t))
88
+ end
89
+
90
+ def test_date_coerce_passthrough_datetime
91
+ t = Time.now
92
+ assert_equal(t, FlatKit::FieldType::TimestampType.coerce(t))
93
+ end
94
+
95
+ def test_date_only_does_not_parse
96
+ t = Time.now.strftime("%Y-%m-%d")
97
+ assert_equal(::FlatKit::FieldType::CoerceFailure, FlatKit::FieldType::TimestampType.coerce(t))
98
+ end
99
+
100
+ def test_number_coerce_failure
101
+ assert_equal(::FlatKit::FieldType::CoerceFailure, FlatKit::FieldType::TimestampType.coerce(42))
102
+ end
103
+
104
+ def test_number_coerce_failure_bad_parse
105
+ assert_equal(::FlatKit::FieldType::CoerceFailure, FlatKit::FieldType::TimestampType.coerce("1234 56 78 90"))
106
+ end
107
+ end
108
+ end
@@ -0,0 +1,35 @@
1
+ require_relative '../test_helper'
2
+
3
+ module TestFieldType
4
+ class TestUnknownType < ::Minitest::Test
5
+
6
+ def unknown_items
7
+ [ 'na', 'n/a', 'unk', 'unknown']
8
+ end
9
+
10
+ def test_unknown_items
11
+ unknown_items.each do |u|
12
+ assert(FlatKit::FieldType::UnknownType.matches?(u), "#{u} should be unknown")
13
+ end
14
+ end
15
+
16
+ def test_other_class_does_not_match
17
+ [ 42, Object.new, true, false, "whatever" ].each do |x|
18
+ refute(FlatKit::FieldType::UnknownType.matches?(x), "#{x} should not unknown ")
19
+ end
20
+ end
21
+
22
+ def test_coerce_unknown
23
+ unknown_items.each do |u|
24
+ assert_equal(u, FlatKit::FieldType::UnknownType.coerce(u), "#{u} should be unknown")
25
+ end
26
+ end
27
+
28
+ def test_other_class_does_not_coerce
29
+ [ 42, Object.new, true, false, "whatever" ].each do |x|
30
+ assert_equal(::FlatKit::FieldType::CoerceFailure, FlatKit::FieldType::UnknownType.coerce(x))
31
+ end
32
+ end
33
+
34
+ end
35
+ end
@@ -41,12 +41,30 @@ module TestJsonl
41
41
  assert_equal(expected, actual)
42
42
  end
43
43
 
44
- def test_writes_to_io
44
+ def test_postion
45
45
  File.open(@write_path, "w+") do |f|
46
46
  writer = ::FlatKit::Jsonl::Writer.new(destination: f)
47
47
 
48
- @records.each do |r|
49
- writer.write(r)
48
+ byte_offset = 0
49
+ @records.each_with_index do |r, idx|
50
+ record_length = r.data.bytesize
51
+
52
+ position = writer.write(r)
53
+
54
+ # make sure write stores the last_position api and returns that value
55
+ assert_equal(position, writer.last_position)
56
+
57
+ assert_equal(idx, position.index)
58
+ assert_equal(byte_offset, position.offset)
59
+ assert_equal(record_length, position.bytesize)
60
+
61
+ byte_offset += record_length
62
+
63
+ current_position = writer.current_position
64
+ assert_equal(idx+1, current_position.index)
65
+ assert_equal(byte_offset, current_position.offset)
66
+ assert_equal(0, current_position.bytesize)
67
+
50
68
  end
51
69
  writer.close
52
70
 
data/test/run ADDED
@@ -0,0 +1,23 @@
1
+ #!/usr/bin/env ruby
2
+
3
+
4
+ test_dir = __dir__
5
+
6
+ $: << File.join(File.dirname(test_dir), "lib")
7
+ $: << test_dir
8
+
9
+ require_relative './test_helper'
10
+
11
+ require 'find'
12
+
13
+ if ARGV.empty? then
14
+ Find.find(test_dir) do |path|
15
+ next unless File.file?(path) && File.basename(path) =~ /\Atest_.*\.rb\Z/
16
+ require path
17
+ end
18
+ else
19
+ ARGV.each do |f|
20
+ require File.expand_path(f)
21
+ end
22
+ end
23
+
@@ -0,0 +1,69 @@
1
+ require_relative '../test_helper'
2
+
3
+ module TestStatType
4
+ class TestNominalStats < ::Minitest::Test
5
+ def setup
6
+ @unique_values = ('a'..'f').to_a
7
+ @values = Array.new.tap do |a|
8
+ @unique_values.each do |letter|
9
+ (Random.rand(42) + 1).times { a << letter }
10
+ end
11
+ end
12
+
13
+ @frequencies = @values.tally
14
+
15
+ @stats = ::FlatKit::StatType::NominalStats.new
16
+ @all_stats = ::FlatKit::StatType::NominalStats.new(collecting_frequencies: true)
17
+
18
+ @values.each do |v|
19
+ @stats.update(v)
20
+ @all_stats.update(v)
21
+ end
22
+ end
23
+
24
+ def test_count
25
+ assert_equal(@values.size, @stats.count)
26
+ assert_equal(@values.size, @all_stats.count)
27
+ end
28
+
29
+ def test_does_not_collect_unique_count_by_default
30
+ assert_nil(@stats.unique_count)
31
+ end
32
+
33
+ def test_does_not_collect_unique_values_by_default
34
+ assert_nil(@stats.unique_values)
35
+ end
36
+
37
+ def test_does_not_collect_frequencies_by_default
38
+ assert_nil(@stats.frequencies)
39
+ end
40
+
41
+ def test_unique_count
42
+ assert_equal(@unique_values.size, @all_stats.unique_count)
43
+ end
44
+
45
+ def test_unique_values
46
+ assert_equal(@unique_values.sort, @all_stats.unique_values.sort)
47
+ end
48
+
49
+ def test_frequencies
50
+ assert_equal(@frequencies, @all_stats.frequencies)
51
+ end
52
+
53
+ def test_default_to_hash
54
+ expecting = { "count" => @values.size }
55
+ assert_equal(expecting, @stats.to_hash)
56
+ end
57
+
58
+ def test_all_stats_hash
59
+ expecting = {
60
+ "count" => @values.size,
61
+ "unique_count" => @unique_values.size,
62
+ "unique_values" => @unique_values.sort,
63
+ "mode" => @frequencies.max_by { |k,v| v }.first
64
+ }
65
+ assert_equal(expecting, @all_stats.to_hash)
66
+ end
67
+ end
68
+ end
69
+
@@ -0,0 +1,118 @@
1
+ require_relative '../test_helper'
2
+
3
+
4
+ module TestStatType
5
+ class TestNumericalStats < ::Minitest::Test
6
+ def setup
7
+ @stats = FlatKit::StatType::NumericalStats.new
8
+ @full_stats = FlatKit::StatType::NumericalStats.new
9
+ @all_stats = FlatKit::StatType::NumericalStats.new(collecting_frequencies: true)
10
+ [ 1, 2, 3].each { |i| @full_stats.update( i ) }
11
+ end
12
+
13
+ def test_intialized_with_usable_values
14
+ assert_equal(0, @stats.count)
15
+ assert_equal(Float::INFINITY, @stats.min)
16
+ assert_equal(-Float::INFINITY, @stats.max)
17
+ assert_equal(0.0, @stats.sum)
18
+ assert_equal(0.0, @stats.rate)
19
+ end
20
+
21
+ def test_calculates_mean
22
+ assert_equal(2.0, @full_stats.mean)
23
+ end
24
+
25
+ def test_calculates_rate
26
+ assert_equal(0.5, @full_stats.rate)
27
+ end
28
+
29
+ def test_tracks_the_maximum_value
30
+ assert_equal(3.0, @full_stats.max)
31
+ end
32
+
33
+ def test_tracks_the_minimum_value
34
+ assert_equal(1.0, @full_stats.min)
35
+ end
36
+
37
+ def test_tracks_the_count
38
+ assert_equal(3,@full_stats.count)
39
+ end
40
+
41
+ def test_tracks_the_sum
42
+ assert_equal(6.0, @full_stats.sum)
43
+ end
44
+
45
+ def test_calculates_the_standard_deviation
46
+ assert_equal(1.0, @full_stats.stddev)
47
+ end
48
+
49
+ def test_calculates_the_sum_of_squares
50
+ assert_equal(14, @full_stats.sumsq)
51
+ end
52
+
53
+ def test_converts_to_a_hash
54
+ h = @full_stats.to_hash
55
+ assert_equal(::FlatKit::StatType::NumericalStats.default_stats.size, h.size)
56
+ assert_equal(::FlatKit::StatType::NumericalStats.default_stats, h.keys.sort)
57
+ end
58
+
59
+ def test_converts_to_a_limited_hash_if_given_arguments
60
+ h = @full_stats.to_hash( "min", "max", "mean" )
61
+ assert_equal(3, h.size)
62
+ assert_equal(%w[ max mean min], h.keys.sort)
63
+
64
+ h = @full_stats.to_hash( %w[ count rate ] )
65
+ assert_equal(2, h.size)
66
+ assert_equal(%w[ count rate ], h.keys.sort)
67
+ end
68
+
69
+ def test_raises_nomethoderror_if_an_invalid_stat_is_used
70
+ assert_raises(NoMethodError) { @full_stats.to_hash( "wibble" ) }
71
+ end
72
+
73
+ def test_converts_to_a_json_string
74
+ j = @full_stats.to_json
75
+ h = JSON.parse( j )
76
+ assert_equal(::FlatKit::StatType::NumericalStats.default_stats.size, h.size)
77
+ assert_equal(::FlatKit::StatType::NumericalStats.default_stats, h.keys.sort)
78
+ end
79
+
80
+ def test_converts_to_a_limited_json_hash_if_given_arguments
81
+ j = @full_stats.to_json( "min", "max", "mean" )
82
+ h = JSON.parse( j )
83
+ assert_equal(3, h.size)
84
+ assert_equal(%w[ max mean min], h.keys.sort)
85
+
86
+ j = @full_stats.to_json( %w[ count rate ] )
87
+ h = JSON.parse( j )
88
+ assert_equal(2, h.size)
89
+ assert_equal(%w[ count rate ], h.keys.sort)
90
+ end
91
+
92
+ def test_raises_nomethoderror_if_an_invalid_json_stat_is_used
93
+ assert_raises(NoMethodError) { @full_stats.to_json( "wibble" ) }
94
+ end
95
+
96
+ def test_collects_mode
97
+ values = Array.new.tap do |a|
98
+ 100.times {
99
+ n = Random.rand(10)
100
+ a << n
101
+ @all_stats.update(n)
102
+ }
103
+ end
104
+
105
+ tally = values.tally
106
+ mode_value = tally.max_by { |v, count| count }.first
107
+
108
+ assert_equal(mode_value, @all_stats.mode)
109
+ end
110
+
111
+ def test_collecting_frequences_reports_extra_stat_names
112
+ stat_names = @all_stats.collected_stats
113
+ assert_includes(stat_names, "mode")
114
+ assert_includes(stat_names, "unique_count")
115
+ assert_includes(stat_names, "unique_values")
116
+ end
117
+ end
118
+ end