flat_kit 0.3.0 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (104) hide show
  1. checksums.yaml +4 -4
  2. data/CONTRIBUTING.md +1 -2
  3. data/HISTORY.md +9 -0
  4. data/Manifest.txt +3 -42
  5. data/{bin → exe}/fk +2 -1
  6. data/flat_kit.gemspec +33 -0
  7. data/lib/flat_kit/cli.rb +46 -32
  8. data/lib/flat_kit/command/cat.rb +34 -32
  9. data/lib/flat_kit/command/merge.rb +37 -36
  10. data/lib/flat_kit/command/sort.rb +37 -37
  11. data/lib/flat_kit/command/stats.rb +41 -39
  12. data/lib/flat_kit/command.rb +10 -11
  13. data/lib/flat_kit/descendant_tracker.rb +9 -6
  14. data/lib/flat_kit/error.rb +4 -0
  15. data/lib/flat_kit/event_emitter.rb +5 -2
  16. data/lib/flat_kit/field_stats.rb +31 -26
  17. data/lib/flat_kit/field_type/boolean_type.rb +9 -5
  18. data/lib/flat_kit/field_type/date_type.rb +19 -17
  19. data/lib/flat_kit/field_type/float_type.rb +15 -9
  20. data/lib/flat_kit/field_type/guess_type.rb +9 -6
  21. data/lib/flat_kit/field_type/integer_type.rb +6 -4
  22. data/lib/flat_kit/field_type/null_type.rb +5 -1
  23. data/lib/flat_kit/field_type/string_type.rb +8 -6
  24. data/lib/flat_kit/field_type/timestamp_type.rb +11 -10
  25. data/lib/flat_kit/field_type/unknown_type.rb +12 -8
  26. data/lib/flat_kit/field_type.rb +52 -44
  27. data/lib/flat_kit/format.rb +11 -5
  28. data/lib/flat_kit/input/file.rb +11 -9
  29. data/lib/flat_kit/input/io.rb +18 -21
  30. data/lib/flat_kit/input.rb +8 -7
  31. data/lib/flat_kit/internal_node.rb +22 -19
  32. data/lib/flat_kit/jsonl/format.rb +6 -2
  33. data/lib/flat_kit/jsonl/reader.rb +7 -4
  34. data/lib/flat_kit/jsonl/record.rb +15 -18
  35. data/lib/flat_kit/jsonl/writer.rb +8 -10
  36. data/lib/flat_kit/jsonl.rb +8 -4
  37. data/lib/flat_kit/leaf_node.rb +6 -5
  38. data/lib/flat_kit/log_formatter.rb +20 -0
  39. data/lib/flat_kit/logger.rb +12 -19
  40. data/lib/flat_kit/merge.rb +21 -18
  41. data/lib/flat_kit/merge_tree.rb +5 -6
  42. data/lib/flat_kit/output/file.rb +13 -9
  43. data/lib/flat_kit/output/io.rb +40 -35
  44. data/lib/flat_kit/output.rb +8 -7
  45. data/lib/flat_kit/position.rb +3 -4
  46. data/lib/flat_kit/reader.rb +8 -8
  47. data/lib/flat_kit/record.rb +12 -12
  48. data/lib/flat_kit/sentinel_internal_node.rb +6 -5
  49. data/lib/flat_kit/sentinel_leaf_node.rb +4 -1
  50. data/lib/flat_kit/sort.rb +8 -9
  51. data/lib/flat_kit/stat_type/nominal_stats.rb +13 -7
  52. data/lib/flat_kit/stat_type/numerical_stats.rb +18 -18
  53. data/lib/flat_kit/stat_type/ordinal_stats.rb +8 -13
  54. data/lib/flat_kit/stat_type.rb +18 -13
  55. data/lib/flat_kit/stats.rb +12 -14
  56. data/lib/flat_kit/writer.rb +5 -6
  57. data/lib/flat_kit/xsv/format.rb +6 -2
  58. data/lib/flat_kit/xsv/reader.rb +8 -6
  59. data/lib/flat_kit/xsv/record.rb +21 -15
  60. data/lib/flat_kit/xsv/writer.rb +13 -10
  61. data/lib/flat_kit/xsv.rb +7 -4
  62. data/lib/flat_kit.rb +31 -26
  63. metadata +20 -158
  64. data/Rakefile +0 -21
  65. data/examples/stream-active-record-to-csv.rb +0 -42
  66. data/tasks/default.rake +0 -242
  67. data/tasks/extension.rake +0 -38
  68. data/tasks/man.rake +0 -7
  69. data/tasks/this.rb +0 -208
  70. data/test/device_dataset.rb +0 -117
  71. data/test/field_type/test_boolean_type.rb +0 -65
  72. data/test/field_type/test_date_type.rb +0 -71
  73. data/test/field_type/test_float_type.rb +0 -56
  74. data/test/field_type/test_guess_type.rb +0 -14
  75. data/test/field_type/test_integer_type.rb +0 -52
  76. data/test/field_type/test_null_type.rb +0 -41
  77. data/test/field_type/test_string_type.rb +0 -18
  78. data/test/field_type/test_timestamp_type.rb +0 -108
  79. data/test/field_type/test_unknown_type.rb +0 -35
  80. data/test/input/test_file.rb +0 -73
  81. data/test/input/test_io.rb +0 -93
  82. data/test/jsonl/test_format.rb +0 -22
  83. data/test/jsonl/test_reader.rb +0 -49
  84. data/test/jsonl/test_record.rb +0 -61
  85. data/test/jsonl/test_writer.rb +0 -86
  86. data/test/output/test_file.rb +0 -60
  87. data/test/output/test_io.rb +0 -104
  88. data/test/run +0 -23
  89. data/test/stat_type/test_nominal_stats.rb +0 -69
  90. data/test/stat_type/test_numerical_stats.rb +0 -118
  91. data/test/stat_type/test_ordinal_stats.rb +0 -92
  92. data/test/test_conversions.rb +0 -45
  93. data/test/test_event_emitter.rb +0 -89
  94. data/test/test_field_stats.rb +0 -134
  95. data/test/test_field_type.rb +0 -34
  96. data/test/test_format.rb +0 -24
  97. data/test/test_helper.rb +0 -26
  98. data/test/test_merge.rb +0 -40
  99. data/test/test_merge_tree.rb +0 -64
  100. data/test/test_version.rb +0 -11
  101. data/test/xsv/test_format.rb +0 -22
  102. data/test/xsv/test_reader.rb +0 -61
  103. data/test/xsv/test_record.rb +0 -69
  104. data/test/xsv/test_writer.rb +0 -89
data/test/run DELETED
@@ -1,23 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
-
4
- test_dir = __dir__
5
-
6
- $: << File.join(File.dirname(test_dir), "lib")
7
- $: << test_dir
8
-
9
- require_relative './test_helper'
10
-
11
- require 'find'
12
-
13
- if ARGV.empty? then
14
- Find.find(test_dir) do |path|
15
- next unless File.file?(path) && File.basename(path) =~ /\Atest_.*\.rb\Z/
16
- require path
17
- end
18
- else
19
- ARGV.each do |f|
20
- require File.expand_path(f)
21
- end
22
- end
23
-
data/test/stat_type/test_nominal_stats.rb DELETED
@@ -1,69 +0,0 @@
1
- require_relative '../test_helper'
2
-
3
- module TestStatType
4
- class TestNominalStats < ::Minitest::Test
5
- def setup
6
- @unique_values = ('a'..'f').to_a
7
- @values = Array.new.tap do |a|
8
- @unique_values.each do |letter|
9
- (Random.rand(42) + 1).times { a << letter }
10
- end
11
- end
12
-
13
- @frequencies = @values.tally
14
-
15
- @stats = ::FlatKit::StatType::NominalStats.new
16
- @all_stats = ::FlatKit::StatType::NominalStats.new(collecting_frequencies: true)
17
-
18
- @values.each do |v|
19
- @stats.update(v)
20
- @all_stats.update(v)
21
- end
22
- end
23
-
24
- def test_count
25
- assert_equal(@values.size, @stats.count)
26
- assert_equal(@values.size, @all_stats.count)
27
- end
28
-
29
- def test_does_not_collect_unique_count_by_default
30
- assert_nil(@stats.unique_count)
31
- end
32
-
33
- def test_does_not_collect_unique_values_by_default
34
- assert_nil(@stats.unique_values)
35
- end
36
-
37
- def test_does_not_collect_frequencies_by_default
38
- assert_nil(@stats.frequencies)
39
- end
40
-
41
- def test_unique_count
42
- assert_equal(@unique_values.size, @all_stats.unique_count)
43
- end
44
-
45
- def test_unique_values
46
- assert_equal(@unique_values.sort, @all_stats.unique_values.sort)
47
- end
48
-
49
- def test_frequencies
50
- assert_equal(@frequencies, @all_stats.frequencies)
51
- end
52
-
53
- def test_default_to_hash
54
- expecting = { "count" => @values.size }
55
- assert_equal(expecting, @stats.to_hash)
56
- end
57
-
58
- def test_all_stats_hash
59
- expecting = {
60
- "count" => @values.size,
61
- "unique_count" => @unique_values.size,
62
- "unique_values" => @unique_values.sort,
63
- "mode" => @frequencies.max_by { |k,v| v }.first
64
- }
65
- assert_equal(expecting, @all_stats.to_hash)
66
- end
67
- end
68
- end
69
-
data/test/stat_type/test_numerical_stats.rb DELETED
@@ -1,118 +0,0 @@
1
- require_relative '../test_helper'
2
-
3
-
4
- module TestStatType
5
- class TestNumericalStats < ::Minitest::Test
6
- def setup
7
- @stats = FlatKit::StatType::NumericalStats.new
8
- @full_stats = FlatKit::StatType::NumericalStats.new
9
- @all_stats = FlatKit::StatType::NumericalStats.new(collecting_frequencies: true)
10
- [ 1, 2, 3].each { |i| @full_stats.update( i ) }
11
- end
12
-
13
- def test_intialized_with_usable_values
14
- assert_equal(0, @stats.count)
15
- assert_equal(Float::INFINITY, @stats.min)
16
- assert_equal(-Float::INFINITY, @stats.max)
17
- assert_equal(0.0, @stats.sum)
18
- assert_equal(0.0, @stats.rate)
19
- end
20
-
21
- def test_calculates_mean
22
- assert_equal(2.0, @full_stats.mean)
23
- end
24
-
25
- def test_calculates_rate
26
- assert_equal(0.5, @full_stats.rate)
27
- end
28
-
29
- def test_tracks_the_maximum_value
30
- assert_equal(3.0, @full_stats.max)
31
- end
32
-
33
- def test_tracks_the_minimum_value
34
- assert_equal(1.0, @full_stats.min)
35
- end
36
-
37
- def test_tracks_the_count
38
- assert_equal(3,@full_stats.count)
39
- end
40
-
41
- def test_tracks_the_sum
42
- assert_equal(6.0, @full_stats.sum)
43
- end
44
-
45
- def test_calculates_the_standard_deviation
46
- assert_equal(1.0, @full_stats.stddev)
47
- end
48
-
49
- def test_calculates_the_sum_of_squares
50
- assert_equal(14, @full_stats.sumsq)
51
- end
52
-
53
- def test_converts_to_a_hash
54
- h = @full_stats.to_hash
55
- assert_equal(::FlatKit::StatType::NumericalStats.default_stats.size, h.size)
56
- assert_equal(::FlatKit::StatType::NumericalStats.default_stats, h.keys.sort)
57
- end
58
-
59
- def test_converts_to_a_limited_hash_if_given_arguments
60
- h = @full_stats.to_hash( "min", "max", "mean" )
61
- assert_equal(3, h.size)
62
- assert_equal(%w[ max mean min], h.keys.sort)
63
-
64
- h = @full_stats.to_hash( %w[ count rate ] )
65
- assert_equal(2, h.size)
66
- assert_equal(%w[ count rate ], h.keys.sort)
67
- end
68
-
69
- def test_raises_nomethoderror_if_an_invalid_stat_is_used
70
- assert_raises(NoMethodError) { @full_stats.to_hash( "wibble" ) }
71
- end
72
-
73
- def test_converts_to_a_json_string
74
- j = @full_stats.to_json
75
- h = JSON.parse( j )
76
- assert_equal(::FlatKit::StatType::NumericalStats.default_stats.size, h.size)
77
- assert_equal(::FlatKit::StatType::NumericalStats.default_stats, h.keys.sort)
78
- end
79
-
80
- def test_converts_to_a_limited_json_hash_if_given_arguments
81
- j = @full_stats.to_json( "min", "max", "mean" )
82
- h = JSON.parse( j )
83
- assert_equal(3, h.size)
84
- assert_equal(%w[ max mean min], h.keys.sort)
85
-
86
- j = @full_stats.to_json( %w[ count rate ] )
87
- h = JSON.parse( j )
88
- assert_equal(2, h.size)
89
- assert_equal(%w[ count rate ], h.keys.sort)
90
- end
91
-
92
- def test_raises_nomethoderror_if_an_invalid_json_stat_is_used
93
- assert_raises(NoMethodError) { @full_stats.to_json( "wibble" ) }
94
- end
95
-
96
- def test_collects_mode
97
- values = Array.new.tap do |a|
98
- 100.times {
99
- n = Random.rand(10)
100
- a << n
101
- @all_stats.update(n)
102
- }
103
- end
104
-
105
- tally = values.tally
106
- mode_value = tally.max_by { |v, count| count }.first
107
-
108
- assert_equal(mode_value, @all_stats.mode)
109
- end
110
-
111
- def test_collecting_frequences_reports_extra_stat_names
112
- stat_names = @all_stats.collected_stats
113
- assert_includes(stat_names, "mode")
114
- assert_includes(stat_names, "unique_count")
115
- assert_includes(stat_names, "unique_values")
116
- end
117
- end
118
- end
data/test/stat_type/test_ordinal_stats.rb DELETED
@@ -1,92 +0,0 @@
1
- require_relative '../test_helper'
2
-
3
- module TestStatType
4
- class TestOrdinalStats < ::Minitest::Test
5
- def setup
6
- today = Date.today
7
- next_month = today >> 1
8
- last_day_of_month = (next_month - 1).mday
9
-
10
- @start_date = Date.new(today.year, today.month, 1)
11
- @end_date = Date.new(today.year, today.month, last_day_of_month)
12
-
13
- @unique_values = (@start_date..@end_date).to_a
14
- @values = Array.new.tap do |a|
15
- @unique_values.each do |date|
16
- (Random.rand(42) + 1).times { a << date}
17
- end
18
- end
19
-
20
- @frequencies = @values.tally
21
-
22
- @stats = ::FlatKit::StatType::OrdinalStats.new
23
- @all_stats = ::FlatKit::StatType::OrdinalStats.new(collecting_frequencies: true)
24
-
25
- @values.each do |v|
26
- @stats.update(v)
27
- @all_stats.update(v)
28
- end
29
- end
30
-
31
- def test_count
32
- assert_equal(@values.size, @stats.count)
33
- assert_equal(@values.size, @all_stats.count)
34
- end
35
-
36
- def test_min
37
- assert_equal(@values.min, @stats.min)
38
- assert_equal(@values.min, @all_stats.min)
39
- end
40
-
41
- def test_max
42
- assert_equal(@values.max, @stats.max)
43
- assert_equal(@values.max, @all_stats.max)
44
- end
45
-
46
- def test_does_not_collect_unique_count_by_default
47
- assert_nil(@stats.unique_count)
48
- end
49
-
50
- def test_does_not_collect_unique_values_by_default
51
- assert_nil(@stats.unique_values)
52
- end
53
-
54
- def test_does_not_collect_frequencies_by_default
55
- assert_nil(@stats.frequencies)
56
- end
57
-
58
- def test_unique_count
59
- assert_equal(@unique_values.size, @all_stats.unique_count)
60
- end
61
-
62
- def test_unique_values
63
- assert_equal(@unique_values.sort, @all_stats.unique_values.sort)
64
- end
65
-
66
- def test_frequencies
67
- assert_equal(@frequencies, @all_stats.frequencies)
68
- end
69
-
70
- def test_default_to_hash
71
- expecting = {
72
- "count" => @values.size,
73
- "max" => @values.max,
74
- "min" => @values.min,
75
- }
76
- assert_equal(expecting, @stats.to_hash)
77
- end
78
-
79
- def test_all_stats_hash
80
- expecting = {
81
- "count" => @values.size,
82
- "unique_count" => @unique_values.size,
83
- "unique_values" => @unique_values.sort,
84
- "mode" => @frequencies.max_by { |k,v| v }.first,
85
- "max" => @values.max,
86
- "min" => @values.min,
87
- }
88
- assert_equal(expecting, @all_stats.to_hash)
89
- end
90
- end
91
- end
92
-
data/test/test_conversions.rb DELETED
@@ -1,45 +0,0 @@
1
- require 'test_helper'
2
-
3
- class TestConversions < ::Minitest::Test
4
- def setup
5
- @one_row_dataset = DeviceDataset.new(count: 1)
6
- @src_record = @one_row_dataset.records.first
7
- @csv_row = @one_row_dataset.records_as_csv_rows.first
8
- @compare_fields = @one_row_dataset.compare_fields
9
- end
10
-
11
- def test_from_csv_to_json
12
- xsv_record = FlatKit::Xsv::Record.new(data: @csv_row, compare_fields: @compare_fields)
13
- json_record = FlatKit::Jsonl::Record.from_record(xsv_record)
14
-
15
- assert_equal(@one_row_dataset.records.first, xsv_record.to_hash)
16
- assert_equal(@one_row_dataset.records.first, json_record.to_hash)
17
- assert_equal(xsv_record, json_record)
18
- end
19
-
20
- def test_from_json_to_csv
21
- src_json = JSON.generate(@src_record)
22
- json_record = FlatKit::Jsonl::Record.new(data: src_json, compare_fields: @compare_fields)
23
- xsv_record = FlatKit::Xsv::Record.from_record(json_record)
24
-
25
- assert_equal(@one_row_dataset.records.first, xsv_record.to_hash)
26
- assert_equal(@one_row_dataset.records.first, json_record.to_hash)
27
- assert_equal(xsv_record, json_record)
28
- end
29
-
30
- def test_roundtrip_csv_json_csv
31
- xsv_record = FlatKit::Xsv::Record.new(data: @csv_row, compare_fields: @compare_fields)
32
- json_record = FlatKit::Jsonl::Record.from_record(xsv_record)
33
- xsv2 = FlatKit::Xsv::Record.from_record(json_record)
34
-
35
- assert_equal(xsv_record.to_s, xsv2.to_s)
36
- end
37
-
38
- def test_roundtrip_json_csv_json
39
- src_json = JSON.generate(@src_record)
40
- json_record = FlatKit::Jsonl::Record.new(data: src_json, compare_fields: @compare_fields)
41
- xsv_record = FlatKit::Xsv::Record.from_record(json_record)
42
- json2 = FlatKit::Jsonl::Record.from_record(xsv_record)
43
- assert_equal(src_json, json2.to_s)
44
- end
45
- end
data/test/test_event_emitter.rb DELETED
@@ -1,89 +0,0 @@
1
- require 'test_helper'
2
-
3
- class TestEventEmitter < ::Minitest::Test
4
- class Pub
5
- include ::FlatKit::EventEmitter
6
- end
7
-
8
- class Sub
9
- attr_reader :name
10
- attr_reader :data
11
- attr_reader :meta
12
-
13
- def initialize
14
- @name = nil
15
- @data = nil
16
- @meta = nil
17
- end
18
-
19
- def [](key)
20
- @meta[key]
21
- end
22
-
23
- def on_event(name:, data:, meta:)
24
- @name = name
25
- @data = data
26
- @meta = meta
27
- end
28
- end
29
-
30
- class BadSub; end
31
-
32
- def setup
33
- @emitter = Pub.new
34
- @receiver = Sub.new
35
- end
36
-
37
- def test_counts_no_listeners_before_adding_one
38
- assert_equal(0, @emitter.count_listeners)
39
- end
40
-
41
- def test_adds_listener
42
- @emitter.add_listener(@receiver)
43
- assert_equal(1, @emitter.count_listeners)
44
- end
45
-
46
- def test_removes_listener
47
- @emitter.add_listener(@receiver)
48
- assert_equal(1, @emitter.count_listeners)
49
-
50
- @emitter.remove_listener(@receiver)
51
- assert_equal(0, @emitter.count_listeners)
52
- end
53
-
54
- def test_only_adds_an_listener_once
55
- @emitter.add_listener(@receiver)
56
- assert_equal(1, @emitter.count_listeners)
57
-
58
- @emitter.add_listener(@receiver)
59
- assert_equal(1, @emitter.count_listeners)
60
- end
61
-
62
- def test_verifies_reciever_responds_t_observed
63
- assert_raises(::NoMethodError) { @emitter.add_listener(BadSub.new) }
64
- end
65
-
66
- def test_listeners_get_notified
67
- @receiver_2 = Sub.new
68
- @emitter.add_listener(@receiver)
69
- @emitter.add_listener(@receiver_2)
70
-
71
- meta = {
72
- foo: "foo",
73
- bar: 42,
74
- }
75
- @emitter.notify_listeners(name: :notification, data: "DATA!", meta: meta)
76
-
77
- assert_equal(:notification, @receiver.name)
78
- assert_equal(:notification, @receiver_2.name)
79
-
80
- assert_equal("DATA!", @receiver.data)
81
- assert_equal("DATA!", @receiver_2.data)
82
-
83
- assert_equal("foo", @receiver[:foo])
84
- assert_equal("foo", @receiver_2[:foo])
85
-
86
- assert_equal(42, @receiver[:bar])
87
- assert_equal(42, @receiver_2[:bar])
88
- end
89
- end
data/test/test_field_stats.rb DELETED
@@ -1,134 +0,0 @@
1
- require_relative './test_helper'
2
-
3
- require 'faker'
4
-
5
- class TestFieldStats < ::Minitest::Test
6
- # returns [FieldStats, Array] where the array is the original data
7
- def generate_data_with(count: 100, stats: ::FlatKit::FieldStats.new(name: "data"), &block)
8
- list = Array.new.tap do |a|
9
- count.times do
10
- n = block.call
11
- stats.update(n)
12
- a << n
13
- end
14
- end
15
- [stats, list]
16
- end
17
-
18
- def test_raises_error_on_invalid_stats
19
- assert_raises(ArgumentError) { ::FlatKit::FieldStats.new(name: "test", stats_to_collect: :whatever) }
20
- end
21
-
22
- def test_collects_numeric_default_stats
23
- field_stats, number_data = generate_data_with { Faker::Number.within(range: 1.0..100.0) }
24
-
25
- null_count = 5
26
- null_count.times {
27
- field_stats.update(nil)
28
- }
29
-
30
- avg = number_data.sum / number_data.size
31
- min = number_data.min
32
- max = number_data.max
33
- sum = number_data.sum
34
-
35
- refute(field_stats.field_type_determined?)
36
-
37
- assert_equal(null_count, field_stats.null_count)
38
- assert_equal(number_data.size, field_stats.count)
39
-
40
- assert(field_stats.field_type_determined?)
41
-
42
- assert_in_epsilon(avg, field_stats.mean)
43
- assert_equal(min, field_stats.min)
44
- assert_equal(max, field_stats.max)
45
- assert_in_epsilon(sum, field_stats.sum)
46
- expected_percent = (null_count.to_f / (null_count + number_data.size)) * 100.0
47
- assert_in_epsilon(expected_percent, field_stats.null_percent)
48
- end
49
-
50
- def test_collect_numeric_cardinality_stats
51
- field_stats = ::FlatKit::FieldStats.new(name: "number-cardinality",
52
- stats_to_collect: ::FlatKit::FieldStats::ALL_STATS)
53
- field_stats, number_data = generate_data_with(stats: field_stats) {
54
- Faker::Number.within(range:1..25)
55
- }
56
-
57
- avg = number_data.sum.to_f / number_data.size
58
- min = number_data.min
59
- max = number_data.max
60
-
61
- assert(field_stats.collecting_frequencies?)
62
- refute(field_stats.field_type_determined?)
63
-
64
- assert_equal(number_data.size, field_stats.count)
65
-
66
- assert(field_stats.field_type_determined?)
67
-
68
- assert_in_epsilon(avg, field_stats.mean)
69
- assert_equal(min, field_stats.min)
70
- assert_equal(max, field_stats.max)
71
-
72
- assert_equal(number_data.tally.keys.size, field_stats.unique_count)
73
- assert_equal(number_data.tally.keys.sort, field_stats.unique_values.sort)
74
- assert_equal(number_data.tally, field_stats.frequencies)
75
-
76
- mode = number_data.tally.max_by{ |k,v| v }.first
77
- assert_equal(mode, field_stats.mode)
78
- end
79
-
80
- def test_unknown_type_stats
81
- field_stats = ::FlatKit::FieldStats.new(name: "numeric-with-unknown")
82
- field_stats, number_data = generate_data_with { Faker::Number.within(range: 1.0..100.0) }
83
-
84
- unknown_count = 20
85
- unknown_count.times {
86
- field_stats.update("unknown")
87
- }
88
-
89
- refute(field_stats.field_type_determined?)
90
-
91
- assert_equal(unknown_count, field_stats.unknown_count)
92
- assert_equal(unknown_count + number_data.size, field_stats.total_count)
93
-
94
- expected_percent = (unknown_count.to_f / (unknown_count + number_data.size)) * 100.0
95
-
96
- assert_in_epsilon(expected_percent, field_stats.unknown_percent)
97
- end
98
-
99
- def test_resolves_type_automatically
100
- field_stats = ::FlatKit::FieldStats.new(name: "numeric-autoresolve",guess_threshold: 101)
101
- field_stats, _ = generate_data_with(stats: field_stats) { Faker::Number.within(range: 1.0..100.0) }
102
-
103
- refute(field_stats.field_type_determined?)
104
- field_stats, _ = generate_data_with(stats: field_stats) { Faker::Number.within(range: 200.0..300.0) }
105
- assert(field_stats.field_type_determined?)
106
- end
107
-
108
- def test_resolves_integer_appropriately_with_mixed_data
109
- field_stats = ::FlatKit::FieldStats.new(name: "numeric-integer",guess_threshold: 100)
110
- field_stats, _ = generate_data_with(count: 40, stats: field_stats) { Faker::Number.within(range: 0..1).to_s }
111
- field_stats, _ = generate_data_with(count: 70, stats: field_stats) { Faker::Number.within(range: 0..200).to_s }
112
-
113
- assert_equal(::FlatKit::FieldType::IntegerType, field_stats.field_type)
114
-
115
- end
116
-
117
- def test_resolves_boolean_appropriately_with_mixed_data
118
- field_stats = ::FlatKit::FieldStats.new(name: "numeric-integer",guess_threshold: 100)
119
- field_stats, _ = generate_data_with(count: 70, stats: field_stats) { Faker::Boolean.boolean.to_s }
120
- field_stats, _ = generate_data_with(count: 40, stats: field_stats) { Faker::Number.within(range: 0..200).to_s }
121
- assert_equal(::FlatKit::FieldType::BooleanType, field_stats.field_type)
122
- end
123
-
124
- def test_resolves_string_appropriately_with_mixed_data
125
- field_stats = ::FlatKit::FieldStats.new(name: "string",guess_threshold: 100)
126
- field_stats, _ = generate_data_with(count: 61, stats: field_stats) { Faker::Color.name.to_s }
127
- field_stats, _ = generate_data_with(count: 59, stats: field_stats) { Faker::Number.within(range: 0..200).to_s }
128
- assert_equal(::FlatKit::FieldType::StringType, field_stats.field_type)
129
-
130
- assert_equal(120, field_stats.count)
131
- assert_equal(0, field_stats.unknown_count)
132
- assert_equal(0, field_stats.null_count)
133
- end
134
- end
data/test/test_field_type.rb DELETED
@@ -1,34 +0,0 @@
1
- require_relative './test_helper'
2
-
3
- module TestFieldType
4
- class TestFieldType < ::Minitest::Test
5
-
6
- def test_weight_raises_exception
7
- assert_raises(NotImplementedError) { ::FlatKit::FieldType.weight }
8
- end
9
-
10
- def test_best_guesses
11
- guesses = {
12
- "t" => ::FlatKit::FieldType::BooleanType,
13
- "1" => ::FlatKit::FieldType::BooleanType,
14
- "0" => ::FlatKit::FieldType::BooleanType,
15
- "n" => ::FlatKit::FieldType::BooleanType,
16
- "42" => ::FlatKit::FieldType::IntegerType,
17
- "nil" => ::FlatKit::FieldType::NullType,
18
- "n/a" => ::FlatKit::FieldType::UnknownType,
19
- "foo" => ::FlatKit::FieldType::StringType,
20
- "12.3" => ::FlatKit::FieldType::FloatType,
21
- "2021-02-26" => ::FlatKit::FieldType::DateType,
22
- "2020-03-03T12:34:56Z" => ::FlatKit::FieldType::TimestampType,
23
- }
24
-
25
- guesses.each do |test, expected|
26
- assert_equal(expected, ::FlatKit::FieldType.best_guess(test), "Expected '#{test}' to be #{expected}")
27
- end
28
- end
29
-
30
- def test_children_exist
31
- assert_equal(9,::FlatKit::FieldType.children.size)
32
- end
33
- end
34
- end
data/test/test_format.rb DELETED
@@ -1,24 +0,0 @@
1
- require_relative 'test_helper'
2
-
3
- class TestFormat < ::Minitest::Test
4
- def test_finds_jsonl_format
5
- klass = ::FlatKit::Format.for("data.json.gz")
6
- assert_equal(::FlatKit::Jsonl::Format, klass)
7
- end
8
-
9
- def test_finds_xsv_format
10
- klass = ::FlatKit::Format.for("data.csv.gz")
11
- assert_equal(::FlatKit::Xsv::Format, klass)
12
- end
13
-
14
- def test_finds_jsonl_format_for_full_path
15
- klass = ::FlatKit::Format.for("tmp/sorted/foo.jsonl")
16
- assert_equal(::FlatKit::Jsonl::Format, klass)
17
- end
18
-
19
- def test_finds_jsonl_format_with_fallback
20
- path = "tmp/sorted/foo.json"
21
- klass = ::FlatKit::Format.for_with_fallback!(path: path, fallback: "auto")
22
- assert_equal(::FlatKit::Jsonl::Format, klass)
23
- end
24
- end
data/test/test_helper.rb DELETED
@@ -1,26 +0,0 @@
1
- require 'simplecov'
2
- SimpleCov.start if ENV['COVERAGE']
3
-
4
- require 'byebug'
5
-
6
- require 'minitest/autorun'
7
- require 'minitest/focus'
8
- require 'minitest/pride'
9
-
10
- module TestHelper
11
- def scratch_dir
12
- p = Pathname.new(__FILE__).parent.parent.join('tmp/testing_scratch')
13
- p.mkpath
14
- p
15
- end
16
-
17
- def generate_slug(length: 10)
18
- SecureRandom.alphanumeric(10)
19
- end
20
-
21
- def scratch_file(prefix: "test_", slug: generate_slug, extension: ".jsonl")
22
- scratch_dir.join("#{prefix}#{slug}#{extension}")
23
- end
24
- end
25
- require_relative '../lib/flat_kit'
26
- require_relative './device_dataset'
data/test/test_merge.rb DELETED
@@ -1,40 +0,0 @@
1
- require 'test_helper'
2
-
3
- class TestMerge < ::Minitest::Test
4
-
5
- def test_can_use_use_dash_as_output
6
- merge = ::FlatKit::Merge.new(inputs: [], input_fallback: "json",
7
- output: "-", output_fallback: "json", compare_fields: [])
8
- assert_match(/STDOUT/, merge.writer.output.name)
9
- assert_instance_of(::FlatKit::Output::IO, merge.writer.output)
10
- end
11
-
12
- def test_can_use_a_text_path_as_output
13
- test_path = "tmp/test_can_use_a_text_path_as_output.json"
14
- begin
15
- merge = ::FlatKit::Merge.new(output: test_path, inputs: [], input_fallback: "json", compare_fields: [])
16
- assert_equal(test_path, merge.writer.output.name)
17
- assert_instance_of(::FlatKit::Output::File, merge.writer.output)
18
- merge.writer.close
19
- ensure
20
- File.unlink(test_path) if File.exist?(test_path)
21
- end
22
- end
23
-
24
- def test_can_use_a_pathname_as_output
25
- test_path = Pathname.new("tmp/test_can_use_a_pathname_as_output.json")
26
- begin
27
- merge = ::FlatKit::Merge.new(output: test_path, inputs: [], input_fallback: "json", compare_fields: [])
28
- assert_equal(test_path.to_s, merge.writer.output.name)
29
- assert_instance_of(::FlatKit::Output::File, merge.writer.output)
30
- merge.writer.close
31
- ensure
32
- test_path.unlink if test_path.exist?
33
- end
34
- end
35
-
36
- def test_raises_error_if_unable_to_parse_output
37
- test_path = Object.new
38
- assert_raises(FlatKit::Error) { ::FlatKit::Merge.new(output: test_path, inputs: [], compare_fields: []) }
39
- end
40
- end