flat_kit 0.3.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104)
  1. checksums.yaml +4 -4
  2. data/CONTRIBUTING.md +1 -2
  3. data/HISTORY.md +9 -0
  4. data/Manifest.txt +3 -42
  5. data/{bin → exe}/fk +2 -1
  6. data/flat_kit.gemspec +33 -0
  7. data/lib/flat_kit/cli.rb +46 -32
  8. data/lib/flat_kit/command/cat.rb +34 -32
  9. data/lib/flat_kit/command/merge.rb +37 -36
  10. data/lib/flat_kit/command/sort.rb +37 -37
  11. data/lib/flat_kit/command/stats.rb +41 -39
  12. data/lib/flat_kit/command.rb +10 -11
  13. data/lib/flat_kit/descendant_tracker.rb +9 -6
  14. data/lib/flat_kit/error.rb +4 -0
  15. data/lib/flat_kit/event_emitter.rb +5 -2
  16. data/lib/flat_kit/field_stats.rb +31 -26
  17. data/lib/flat_kit/field_type/boolean_type.rb +9 -5
  18. data/lib/flat_kit/field_type/date_type.rb +19 -17
  19. data/lib/flat_kit/field_type/float_type.rb +15 -9
  20. data/lib/flat_kit/field_type/guess_type.rb +9 -6
  21. data/lib/flat_kit/field_type/integer_type.rb +6 -4
  22. data/lib/flat_kit/field_type/null_type.rb +5 -1
  23. data/lib/flat_kit/field_type/string_type.rb +8 -6
  24. data/lib/flat_kit/field_type/timestamp_type.rb +11 -10
  25. data/lib/flat_kit/field_type/unknown_type.rb +12 -8
  26. data/lib/flat_kit/field_type.rb +52 -44
  27. data/lib/flat_kit/format.rb +11 -5
  28. data/lib/flat_kit/input/file.rb +11 -9
  29. data/lib/flat_kit/input/io.rb +18 -21
  30. data/lib/flat_kit/input.rb +8 -7
  31. data/lib/flat_kit/internal_node.rb +22 -19
  32. data/lib/flat_kit/jsonl/format.rb +6 -2
  33. data/lib/flat_kit/jsonl/reader.rb +7 -4
  34. data/lib/flat_kit/jsonl/record.rb +15 -18
  35. data/lib/flat_kit/jsonl/writer.rb +8 -10
  36. data/lib/flat_kit/jsonl.rb +8 -4
  37. data/lib/flat_kit/leaf_node.rb +6 -5
  38. data/lib/flat_kit/log_formatter.rb +20 -0
  39. data/lib/flat_kit/logger.rb +12 -19
  40. data/lib/flat_kit/merge.rb +21 -18
  41. data/lib/flat_kit/merge_tree.rb +5 -6
  42. data/lib/flat_kit/output/file.rb +13 -9
  43. data/lib/flat_kit/output/io.rb +40 -35
  44. data/lib/flat_kit/output.rb +8 -7
  45. data/lib/flat_kit/position.rb +3 -4
  46. data/lib/flat_kit/reader.rb +8 -8
  47. data/lib/flat_kit/record.rb +12 -12
  48. data/lib/flat_kit/sentinel_internal_node.rb +6 -5
  49. data/lib/flat_kit/sentinel_leaf_node.rb +4 -1
  50. data/lib/flat_kit/sort.rb +8 -9
  51. data/lib/flat_kit/stat_type/nominal_stats.rb +13 -7
  52. data/lib/flat_kit/stat_type/numerical_stats.rb +18 -18
  53. data/lib/flat_kit/stat_type/ordinal_stats.rb +8 -13
  54. data/lib/flat_kit/stat_type.rb +18 -13
  55. data/lib/flat_kit/stats.rb +12 -14
  56. data/lib/flat_kit/writer.rb +5 -6
  57. data/lib/flat_kit/xsv/format.rb +6 -2
  58. data/lib/flat_kit/xsv/reader.rb +8 -6
  59. data/lib/flat_kit/xsv/record.rb +21 -15
  60. data/lib/flat_kit/xsv/writer.rb +13 -10
  61. data/lib/flat_kit/xsv.rb +7 -4
  62. data/lib/flat_kit.rb +31 -26
  63. metadata +20 -158
  64. data/Rakefile +0 -21
  65. data/examples/stream-active-record-to-csv.rb +0 -42
  66. data/tasks/default.rake +0 -242
  67. data/tasks/extension.rake +0 -38
  68. data/tasks/man.rake +0 -7
  69. data/tasks/this.rb +0 -208
  70. data/test/device_dataset.rb +0 -117
  71. data/test/field_type/test_boolean_type.rb +0 -65
  72. data/test/field_type/test_date_type.rb +0 -71
  73. data/test/field_type/test_float_type.rb +0 -56
  74. data/test/field_type/test_guess_type.rb +0 -14
  75. data/test/field_type/test_integer_type.rb +0 -52
  76. data/test/field_type/test_null_type.rb +0 -41
  77. data/test/field_type/test_string_type.rb +0 -18
  78. data/test/field_type/test_timestamp_type.rb +0 -108
  79. data/test/field_type/test_unknown_type.rb +0 -35
  80. data/test/input/test_file.rb +0 -73
  81. data/test/input/test_io.rb +0 -93
  82. data/test/jsonl/test_format.rb +0 -22
  83. data/test/jsonl/test_reader.rb +0 -49
  84. data/test/jsonl/test_record.rb +0 -61
  85. data/test/jsonl/test_writer.rb +0 -86
  86. data/test/output/test_file.rb +0 -60
  87. data/test/output/test_io.rb +0 -104
  88. data/test/run +0 -23
  89. data/test/stat_type/test_nominal_stats.rb +0 -69
  90. data/test/stat_type/test_numerical_stats.rb +0 -118
  91. data/test/stat_type/test_ordinal_stats.rb +0 -92
  92. data/test/test_conversions.rb +0 -45
  93. data/test/test_event_emitter.rb +0 -89
  94. data/test/test_field_stats.rb +0 -134
  95. data/test/test_field_type.rb +0 -34
  96. data/test/test_format.rb +0 -24
  97. data/test/test_helper.rb +0 -26
  98. data/test/test_merge.rb +0 -40
  99. data/test/test_merge_tree.rb +0 -64
  100. data/test/test_version.rb +0 -11
  101. data/test/xsv/test_format.rb +0 -22
  102. data/test/xsv/test_reader.rb +0 -61
  103. data/test/xsv/test_record.rb +0 -69
  104. data/test/xsv/test_writer.rb +0 -89
data/test/run DELETED
@@ -1,23 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
-
4
- test_dir = __dir__
5
-
6
- $: << File.join(File.dirname(test_dir), "lib")
7
- $: << test_dir
8
-
9
- require_relative './test_helper'
10
-
11
- require 'find'
12
-
13
- if ARGV.empty? then
14
- Find.find(test_dir) do |path|
15
- next unless File.file?(path) && File.basename(path) =~ /\Atest_.*\.rb\Z/
16
- require path
17
- end
18
- else
19
- ARGV.each do |f|
20
- require File.expand_path(f)
21
- end
22
- end
23
-
data/test/stat_type/test_nominal_stats.rb DELETED
@@ -1,69 +0,0 @@
1
- require_relative '../test_helper'
2
-
3
- module TestStatType
4
- class TestNominalStats < ::Minitest::Test
5
- def setup
6
- @unique_values = ('a'..'f').to_a
7
- @values = Array.new.tap do |a|
8
- @unique_values.each do |letter|
9
- (Random.rand(42) + 1).times { a << letter }
10
- end
11
- end
12
-
13
- @frequencies = @values.tally
14
-
15
- @stats = ::FlatKit::StatType::NominalStats.new
16
- @all_stats = ::FlatKit::StatType::NominalStats.new(collecting_frequencies: true)
17
-
18
- @values.each do |v|
19
- @stats.update(v)
20
- @all_stats.update(v)
21
- end
22
- end
23
-
24
- def test_count
25
- assert_equal(@values.size, @stats.count)
26
- assert_equal(@values.size, @all_stats.count)
27
- end
28
-
29
- def test_does_not_collect_unique_count_by_default
30
- assert_nil(@stats.unique_count)
31
- end
32
-
33
- def test_does_not_collect_unique_values_by_default
34
- assert_nil(@stats.unique_values)
35
- end
36
-
37
- def test_does_not_collect_frequencies_by_default
38
- assert_nil(@stats.frequencies)
39
- end
40
-
41
- def test_unique_count
42
- assert_equal(@unique_values.size, @all_stats.unique_count)
43
- end
44
-
45
- def test_unique_values
46
- assert_equal(@unique_values.sort, @all_stats.unique_values.sort)
47
- end
48
-
49
- def test_frequencies
50
- assert_equal(@frequencies, @all_stats.frequencies)
51
- end
52
-
53
- def test_default_to_hash
54
- expecting = { "count" => @values.size }
55
- assert_equal(expecting, @stats.to_hash)
56
- end
57
-
58
- def test_all_stats_hash
59
- expecting = {
60
- "count" => @values.size,
61
- "unique_count" => @unique_values.size,
62
- "unique_values" => @unique_values.sort,
63
- "mode" => @frequencies.max_by { |k,v| v }.first
64
- }
65
- assert_equal(expecting, @all_stats.to_hash)
66
- end
67
- end
68
- end
69
-
data/test/stat_type/test_numerical_stats.rb DELETED
@@ -1,118 +0,0 @@
1
- require_relative '../test_helper'
2
-
3
-
4
- module TestStatType
5
- class TestNumericalStats < ::Minitest::Test
6
- def setup
7
- @stats = FlatKit::StatType::NumericalStats.new
8
- @full_stats = FlatKit::StatType::NumericalStats.new
9
- @all_stats = FlatKit::StatType::NumericalStats.new(collecting_frequencies: true)
10
- [ 1, 2, 3].each { |i| @full_stats.update( i ) }
11
- end
12
-
13
- def test_intialized_with_usable_values
14
- assert_equal(0, @stats.count)
15
- assert_equal(Float::INFINITY, @stats.min)
16
- assert_equal(-Float::INFINITY, @stats.max)
17
- assert_equal(0.0, @stats.sum)
18
- assert_equal(0.0, @stats.rate)
19
- end
20
-
21
- def test_calculates_mean
22
- assert_equal(2.0, @full_stats.mean)
23
- end
24
-
25
- def test_calculates_rate
26
- assert_equal(0.5, @full_stats.rate)
27
- end
28
-
29
- def test_tracks_the_maximum_value
30
- assert_equal(3.0, @full_stats.max)
31
- end
32
-
33
- def test_tracks_the_minimum_value
34
- assert_equal(1.0, @full_stats.min)
35
- end
36
-
37
- def test_tracks_the_count
38
- assert_equal(3,@full_stats.count)
39
- end
40
-
41
- def test_tracks_the_sum
42
- assert_equal(6.0, @full_stats.sum)
43
- end
44
-
45
- def test_calculates_the_standard_deviation
46
- assert_equal(1.0, @full_stats.stddev)
47
- end
48
-
49
- def test_calculates_the_sum_of_squares
50
- assert_equal(14, @full_stats.sumsq)
51
- end
52
-
53
- def test_converts_to_a_hash
54
- h = @full_stats.to_hash
55
- assert_equal(::FlatKit::StatType::NumericalStats.default_stats.size, h.size)
56
- assert_equal(::FlatKit::StatType::NumericalStats.default_stats, h.keys.sort)
57
- end
58
-
59
- def test_converts_to_a_limited_hash_if_given_arguments
60
- h = @full_stats.to_hash( "min", "max", "mean" )
61
- assert_equal(3, h.size)
62
- assert_equal(%w[ max mean min], h.keys.sort)
63
-
64
- h = @full_stats.to_hash( %w[ count rate ] )
65
- assert_equal(2, h.size)
66
- assert_equal(%w[ count rate ], h.keys.sort)
67
- end
68
-
69
- def test_raises_nomethoderror_if_an_invalid_stat_is_used
70
- assert_raises(NoMethodError) { @full_stats.to_hash( "wibble" ) }
71
- end
72
-
73
- def test_converts_to_a_json_string
74
- j = @full_stats.to_json
75
- h = JSON.parse( j )
76
- assert_equal(::FlatKit::StatType::NumericalStats.default_stats.size, h.size)
77
- assert_equal(::FlatKit::StatType::NumericalStats.default_stats, h.keys.sort)
78
- end
79
-
80
- def test_converts_to_a_limited_json_hash_if_given_arguments
81
- j = @full_stats.to_json( "min", "max", "mean" )
82
- h = JSON.parse( j )
83
- assert_equal(3, h.size)
84
- assert_equal(%w[ max mean min], h.keys.sort)
85
-
86
- j = @full_stats.to_json( %w[ count rate ] )
87
- h = JSON.parse( j )
88
- assert_equal(2, h.size)
89
- assert_equal(%w[ count rate ], h.keys.sort)
90
- end
91
-
92
- def test_raises_nomethoderror_if_an_invalid_json_stat_is_used
93
- assert_raises(NoMethodError) { @full_stats.to_json( "wibble" ) }
94
- end
95
-
96
- def test_collects_mode
97
- values = Array.new.tap do |a|
98
- 100.times {
99
- n = Random.rand(10)
100
- a << n
101
- @all_stats.update(n)
102
- }
103
- end
104
-
105
- tally = values.tally
106
- mode_value = tally.max_by { |v, count| count }.first
107
-
108
- assert_equal(mode_value, @all_stats.mode)
109
- end
110
-
111
- def test_collecting_frequences_reports_extra_stat_names
112
- stat_names = @all_stats.collected_stats
113
- assert_includes(stat_names, "mode")
114
- assert_includes(stat_names, "unique_count")
115
- assert_includes(stat_names, "unique_values")
116
- end
117
- end
118
- end
data/test/stat_type/test_ordinal_stats.rb DELETED
@@ -1,92 +0,0 @@
1
- require_relative '../test_helper'
2
-
3
- module TestStatType
4
- class TestOrdinalStats < ::Minitest::Test
5
- def setup
6
- today = Date.today
7
- next_month = today >> 1
8
- last_day_of_month = (next_month - 1).mday
9
-
10
- @start_date = Date.new(today.year, today.month, 1)
11
- @end_date = Date.new(today.year, today.month, last_day_of_month)
12
-
13
- @unique_values = (@start_date..@end_date).to_a
14
- @values = Array.new.tap do |a|
15
- @unique_values.each do |date|
16
- (Random.rand(42) + 1).times { a << date}
17
- end
18
- end
19
-
20
- @frequencies = @values.tally
21
-
22
- @stats = ::FlatKit::StatType::OrdinalStats.new
23
- @all_stats = ::FlatKit::StatType::OrdinalStats.new(collecting_frequencies: true)
24
-
25
- @values.each do |v|
26
- @stats.update(v)
27
- @all_stats.update(v)
28
- end
29
- end
30
-
31
- def test_count
32
- assert_equal(@values.size, @stats.count)
33
- assert_equal(@values.size, @all_stats.count)
34
- end
35
-
36
- def test_min
37
- assert_equal(@values.min, @stats.min)
38
- assert_equal(@values.min, @all_stats.min)
39
- end
40
-
41
- def test_max
42
- assert_equal(@values.max, @stats.max)
43
- assert_equal(@values.max, @all_stats.max)
44
- end
45
-
46
- def test_does_not_collect_unique_count_by_default
47
- assert_nil(@stats.unique_count)
48
- end
49
-
50
- def test_does_not_collect_unique_values_by_default
51
- assert_nil(@stats.unique_values)
52
- end
53
-
54
- def test_does_not_collect_frequencies_by_default
55
- assert_nil(@stats.frequencies)
56
- end
57
-
58
- def test_unique_count
59
- assert_equal(@unique_values.size, @all_stats.unique_count)
60
- end
61
-
62
- def test_unique_values
63
- assert_equal(@unique_values.sort, @all_stats.unique_values.sort)
64
- end
65
-
66
- def test_frequencies
67
- assert_equal(@frequencies, @all_stats.frequencies)
68
- end
69
-
70
- def test_default_to_hash
71
- expecting = {
72
- "count" => @values.size,
73
- "max" => @values.max,
74
- "min" => @values.min,
75
- }
76
- assert_equal(expecting, @stats.to_hash)
77
- end
78
-
79
- def test_all_stats_hash
80
- expecting = {
81
- "count" => @values.size,
82
- "unique_count" => @unique_values.size,
83
- "unique_values" => @unique_values.sort,
84
- "mode" => @frequencies.max_by { |k,v| v }.first,
85
- "max" => @values.max,
86
- "min" => @values.min,
87
- }
88
- assert_equal(expecting, @all_stats.to_hash)
89
- end
90
- end
91
- end
92
-
data/test/test_conversions.rb DELETED
@@ -1,45 +0,0 @@
1
- require 'test_helper'
2
-
3
- class TestConversions < ::Minitest::Test
4
- def setup
5
- @one_row_dataset = DeviceDataset.new(count: 1)
6
- @src_record = @one_row_dataset.records.first
7
- @csv_row = @one_row_dataset.records_as_csv_rows.first
8
- @compare_fields = @one_row_dataset.compare_fields
9
- end
10
-
11
- def test_from_csv_to_json
12
- xsv_record = FlatKit::Xsv::Record.new(data: @csv_row, compare_fields: @compare_fields)
13
- json_record = FlatKit::Jsonl::Record.from_record(xsv_record)
14
-
15
- assert_equal(@one_row_dataset.records.first, xsv_record.to_hash)
16
- assert_equal(@one_row_dataset.records.first, json_record.to_hash)
17
- assert_equal(xsv_record, json_record)
18
- end
19
-
20
- def test_from_json_to_csv
21
- src_json = JSON.generate(@src_record)
22
- json_record = FlatKit::Jsonl::Record.new(data: src_json, compare_fields: @compare_fields)
23
- xsv_record = FlatKit::Xsv::Record.from_record(json_record)
24
-
25
- assert_equal(@one_row_dataset.records.first, xsv_record.to_hash)
26
- assert_equal(@one_row_dataset.records.first, json_record.to_hash)
27
- assert_equal(xsv_record, json_record)
28
- end
29
-
30
- def test_roundtrip_csv_json_csv
31
- xsv_record = FlatKit::Xsv::Record.new(data: @csv_row, compare_fields: @compare_fields)
32
- json_record = FlatKit::Jsonl::Record.from_record(xsv_record)
33
- xsv2 = FlatKit::Xsv::Record.from_record(json_record)
34
-
35
- assert_equal(xsv_record.to_s, xsv2.to_s)
36
- end
37
-
38
- def test_roundtrip_json_csv_json
39
- src_json = JSON.generate(@src_record)
40
- json_record = FlatKit::Jsonl::Record.new(data: src_json, compare_fields: @compare_fields)
41
- xsv_record = FlatKit::Xsv::Record.from_record(json_record)
42
- json2 = FlatKit::Jsonl::Record.from_record(xsv_record)
43
- assert_equal(src_json, json2.to_s)
44
- end
45
- end
data/test/test_event_emitter.rb DELETED
@@ -1,89 +0,0 @@
1
- require 'test_helper'
2
-
3
- class TestEventEmitter < ::Minitest::Test
4
- class Pub
5
- include ::FlatKit::EventEmitter
6
- end
7
-
8
- class Sub
9
- attr_reader :name
10
- attr_reader :data
11
- attr_reader :meta
12
-
13
- def initialize
14
- @name = nil
15
- @data = nil
16
- @meta = nil
17
- end
18
-
19
- def [](key)
20
- @meta[key]
21
- end
22
-
23
- def on_event(name:, data:, meta:)
24
- @name = name
25
- @data = data
26
- @meta = meta
27
- end
28
- end
29
-
30
- class BadSub; end
31
-
32
- def setup
33
- @emitter = Pub.new
34
- @receiver = Sub.new
35
- end
36
-
37
- def test_counts_no_listeners_before_adding_one
38
- assert_equal(0, @emitter.count_listeners)
39
- end
40
-
41
- def test_adds_listener
42
- @emitter.add_listener(@receiver)
43
- assert_equal(1, @emitter.count_listeners)
44
- end
45
-
46
- def test_removes_listener
47
- @emitter.add_listener(@receiver)
48
- assert_equal(1, @emitter.count_listeners)
49
-
50
- @emitter.remove_listener(@receiver)
51
- assert_equal(0, @emitter.count_listeners)
52
- end
53
-
54
- def test_only_adds_an_listener_once
55
- @emitter.add_listener(@receiver)
56
- assert_equal(1, @emitter.count_listeners)
57
-
58
- @emitter.add_listener(@receiver)
59
- assert_equal(1, @emitter.count_listeners)
60
- end
61
-
62
- def test_verifies_reciever_responds_t_observed
63
- assert_raises(::NoMethodError) { @emitter.add_listener(BadSub.new) }
64
- end
65
-
66
- def test_listeners_get_notified
67
- @receiver_2 = Sub.new
68
- @emitter.add_listener(@receiver)
69
- @emitter.add_listener(@receiver_2)
70
-
71
- meta = {
72
- foo: "foo",
73
- bar: 42,
74
- }
75
- @emitter.notify_listeners(name: :notification, data: "DATA!", meta: meta)
76
-
77
- assert_equal(:notification, @receiver.name)
78
- assert_equal(:notification, @receiver_2.name)
79
-
80
- assert_equal("DATA!", @receiver.data)
81
- assert_equal("DATA!", @receiver_2.data)
82
-
83
- assert_equal("foo", @receiver[:foo])
84
- assert_equal("foo", @receiver_2[:foo])
85
-
86
- assert_equal(42, @receiver[:bar])
87
- assert_equal(42, @receiver_2[:bar])
88
- end
89
- end
data/test/test_field_stats.rb DELETED
@@ -1,134 +0,0 @@
1
- require_relative './test_helper'
2
-
3
- require 'faker'
4
-
5
- class TestFieldStats < ::Minitest::Test
6
- # returns [FieldStats, Array] where the array is the original data
7
- def generate_data_with(count: 100, stats: ::FlatKit::FieldStats.new(name: "data"), &block)
8
- list = Array.new.tap do |a|
9
- count.times do
10
- n = block.call
11
- stats.update(n)
12
- a << n
13
- end
14
- end
15
- [stats, list]
16
- end
17
-
18
- def test_raises_error_on_invalid_stats
19
- assert_raises(ArgumentError) { ::FlatKit::FieldStats.new(name: "test", stats_to_collect: :whatever) }
20
- end
21
-
22
- def test_collects_numeric_default_stats
23
- field_stats, number_data = generate_data_with { Faker::Number.within(range: 1.0..100.0) }
24
-
25
- null_count = 5
26
- null_count.times {
27
- field_stats.update(nil)
28
- }
29
-
30
- avg = number_data.sum / number_data.size
31
- min = number_data.min
32
- max = number_data.max
33
- sum = number_data.sum
34
-
35
- refute(field_stats.field_type_determined?)
36
-
37
- assert_equal(null_count, field_stats.null_count)
38
- assert_equal(number_data.size, field_stats.count)
39
-
40
- assert(field_stats.field_type_determined?)
41
-
42
- assert_in_epsilon(avg, field_stats.mean)
43
- assert_equal(min, field_stats.min)
44
- assert_equal(max, field_stats.max)
45
- assert_in_epsilon(sum, field_stats.sum)
46
- expected_percent = (null_count.to_f / (null_count + number_data.size)) * 100.0
47
- assert_in_epsilon(expected_percent, field_stats.null_percent)
48
- end
49
-
50
- def test_collect_numeric_cardinality_stats
51
- field_stats = ::FlatKit::FieldStats.new(name: "number-cardinality",
52
- stats_to_collect: ::FlatKit::FieldStats::ALL_STATS)
53
- field_stats, number_data = generate_data_with(stats: field_stats) {
54
- Faker::Number.within(range:1..25)
55
- }
56
-
57
- avg = number_data.sum.to_f / number_data.size
58
- min = number_data.min
59
- max = number_data.max
60
-
61
- assert(field_stats.collecting_frequencies?)
62
- refute(field_stats.field_type_determined?)
63
-
64
- assert_equal(number_data.size, field_stats.count)
65
-
66
- assert(field_stats.field_type_determined?)
67
-
68
- assert_in_epsilon(avg, field_stats.mean)
69
- assert_equal(min, field_stats.min)
70
- assert_equal(max, field_stats.max)
71
-
72
- assert_equal(number_data.tally.keys.size, field_stats.unique_count)
73
- assert_equal(number_data.tally.keys.sort, field_stats.unique_values.sort)
74
- assert_equal(number_data.tally, field_stats.frequencies)
75
-
76
- mode = number_data.tally.max_by{ |k,v| v }.first
77
- assert_equal(mode, field_stats.mode)
78
- end
79
-
80
- def test_unknown_type_stats
81
- field_stats = ::FlatKit::FieldStats.new(name: "numeric-with-unknown")
82
- field_stats, number_data = generate_data_with { Faker::Number.within(range: 1.0..100.0) }
83
-
84
- unknown_count = 20
85
- unknown_count.times {
86
- field_stats.update("unknown")
87
- }
88
-
89
- refute(field_stats.field_type_determined?)
90
-
91
- assert_equal(unknown_count, field_stats.unknown_count)
92
- assert_equal(unknown_count + number_data.size, field_stats.total_count)
93
-
94
- expected_percent = (unknown_count.to_f / (unknown_count + number_data.size)) * 100.0
95
-
96
- assert_in_epsilon(expected_percent, field_stats.unknown_percent)
97
- end
98
-
99
- def test_resolves_type_automatically
100
- field_stats = ::FlatKit::FieldStats.new(name: "numeric-autoresolve",guess_threshold: 101)
101
- field_stats, _ = generate_data_with(stats: field_stats) { Faker::Number.within(range: 1.0..100.0) }
102
-
103
- refute(field_stats.field_type_determined?)
104
- field_stats, _ = generate_data_with(stats: field_stats) { Faker::Number.within(range: 200.0..300.0) }
105
- assert(field_stats.field_type_determined?)
106
- end
107
-
108
- def test_resolves_integer_appropriately_with_mixed_data
109
- field_stats = ::FlatKit::FieldStats.new(name: "numeric-integer",guess_threshold: 100)
110
- field_stats, _ = generate_data_with(count: 40, stats: field_stats) { Faker::Number.within(range: 0..1).to_s }
111
- field_stats, _ = generate_data_with(count: 70, stats: field_stats) { Faker::Number.within(range: 0..200).to_s }
112
-
113
- assert_equal(::FlatKit::FieldType::IntegerType, field_stats.field_type)
114
-
115
- end
116
-
117
- def test_resolves_boolean_appropriately_with_mixed_data
118
- field_stats = ::FlatKit::FieldStats.new(name: "numeric-integer",guess_threshold: 100)
119
- field_stats, _ = generate_data_with(count: 70, stats: field_stats) { Faker::Boolean.boolean.to_s }
120
- field_stats, _ = generate_data_with(count: 40, stats: field_stats) { Faker::Number.within(range: 0..200).to_s }
121
- assert_equal(::FlatKit::FieldType::BooleanType, field_stats.field_type)
122
- end
123
-
124
- def test_resolves_string_appropriately_with_mixed_data
125
- field_stats = ::FlatKit::FieldStats.new(name: "string",guess_threshold: 100)
126
- field_stats, _ = generate_data_with(count: 61, stats: field_stats) { Faker::Color.name.to_s }
127
- field_stats, _ = generate_data_with(count: 59, stats: field_stats) { Faker::Number.within(range: 0..200).to_s }
128
- assert_equal(::FlatKit::FieldType::StringType, field_stats.field_type)
129
-
130
- assert_equal(120, field_stats.count)
131
- assert_equal(0, field_stats.unknown_count)
132
- assert_equal(0, field_stats.null_count)
133
- end
134
- end
data/test/test_field_type.rb DELETED
@@ -1,34 +0,0 @@
1
- require_relative './test_helper'
2
-
3
- module TestFieldType
4
- class TestFieldType < ::Minitest::Test
5
-
6
- def test_weight_raises_exception
7
- assert_raises(NotImplementedError) { ::FlatKit::FieldType.weight }
8
- end
9
-
10
- def test_best_guesses
11
- guesses = {
12
- "t" => ::FlatKit::FieldType::BooleanType,
13
- "1" => ::FlatKit::FieldType::BooleanType,
14
- "0" => ::FlatKit::FieldType::BooleanType,
15
- "n" => ::FlatKit::FieldType::BooleanType,
16
- "42" => ::FlatKit::FieldType::IntegerType,
17
- "nil" => ::FlatKit::FieldType::NullType,
18
- "n/a" => ::FlatKit::FieldType::UnknownType,
19
- "foo" => ::FlatKit::FieldType::StringType,
20
- "12.3" => ::FlatKit::FieldType::FloatType,
21
- "2021-02-26" => ::FlatKit::FieldType::DateType,
22
- "2020-03-03T12:34:56Z" => ::FlatKit::FieldType::TimestampType,
23
- }
24
-
25
- guesses.each do |test, expected|
26
- assert_equal(expected, ::FlatKit::FieldType.best_guess(test), "Expected '#{test}' to be #{expected}")
27
- end
28
- end
29
-
30
- def test_children_exist
31
- assert_equal(9,::FlatKit::FieldType.children.size)
32
- end
33
- end
34
- end
data/test/test_format.rb DELETED
@@ -1,24 +0,0 @@
1
- require_relative 'test_helper'
2
-
3
- class TestFormat < ::Minitest::Test
4
- def test_finds_jsonl_format
5
- klass = ::FlatKit::Format.for("data.json.gz")
6
- assert_equal(::FlatKit::Jsonl::Format, klass)
7
- end
8
-
9
- def test_finds_xsv_format
10
- klass = ::FlatKit::Format.for("data.csv.gz")
11
- assert_equal(::FlatKit::Xsv::Format, klass)
12
- end
13
-
14
- def test_finds_jsonl_format_for_full_path
15
- klass = ::FlatKit::Format.for("tmp/sorted/foo.jsonl")
16
- assert_equal(::FlatKit::Jsonl::Format, klass)
17
- end
18
-
19
- def test_finds_jsonl_format_with_fallback
20
- path = "tmp/sorted/foo.json"
21
- klass = ::FlatKit::Format.for_with_fallback!(path: path, fallback: "auto")
22
- assert_equal(::FlatKit::Jsonl::Format, klass)
23
- end
24
- end
data/test/test_helper.rb DELETED
@@ -1,26 +0,0 @@
1
- require 'simplecov'
2
- SimpleCov.start if ENV['COVERAGE']
3
-
4
- require 'byebug'
5
-
6
- require 'minitest/autorun'
7
- require 'minitest/focus'
8
- require 'minitest/pride'
9
-
10
- module TestHelper
11
- def scratch_dir
12
- p = Pathname.new(__FILE__).parent.parent.join('tmp/testing_scratch')
13
- p.mkpath
14
- p
15
- end
16
-
17
- def generate_slug(length: 10)
18
- SecureRandom.alphanumeric(10)
19
- end
20
-
21
- def scratch_file(prefix: "test_", slug: generate_slug, extension: ".jsonl")
22
- scratch_dir.join("#{prefix}#{slug}#{extension}")
23
- end
24
- end
25
- require_relative '../lib/flat_kit'
26
- require_relative './device_dataset'
data/test/test_merge.rb DELETED
@@ -1,40 +0,0 @@
1
- require 'test_helper'
2
-
3
- class TestMerge < ::Minitest::Test
4
-
5
- def test_can_use_use_dash_as_output
6
- merge = ::FlatKit::Merge.new(inputs: [], input_fallback: "json",
7
- output: "-", output_fallback: "json", compare_fields: [])
8
- assert_match(/STDOUT/, merge.writer.output.name)
9
- assert_instance_of(::FlatKit::Output::IO, merge.writer.output)
10
- end
11
-
12
- def test_can_use_a_text_path_as_output
13
- test_path = "tmp/test_can_use_a_text_path_as_output.json"
14
- begin
15
- merge = ::FlatKit::Merge.new(output: test_path, inputs: [], input_fallback: "json", compare_fields: [])
16
- assert_equal(test_path, merge.writer.output.name)
17
- assert_instance_of(::FlatKit::Output::File, merge.writer.output)
18
- merge.writer.close
19
- ensure
20
- File.unlink(test_path) if File.exist?(test_path)
21
- end
22
- end
23
-
24
- def test_can_use_a_pathname_as_output
25
- test_path = Pathname.new("tmp/test_can_use_a_pathname_as_output.json")
26
- begin
27
- merge = ::FlatKit::Merge.new(output: test_path, inputs: [], input_fallback: "json", compare_fields: [])
28
- assert_equal(test_path.to_s, merge.writer.output.name)
29
- assert_instance_of(::FlatKit::Output::File, merge.writer.output)
30
- merge.writer.close
31
- ensure
32
- test_path.unlink if test_path.exist?
33
- end
34
- end
35
-
36
- def test_raises_error_if_unable_to_parse_output
37
- test_path = Object.new
38
- assert_raises(FlatKit::Error) { ::FlatKit::Merge.new(output: test_path, inputs: [], compare_fields: []) }
39
- end
40
- end