flat_kit 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. checksums.yaml +4 -4
  2. data/HISTORY.md +6 -0
  3. data/Manifest.txt +34 -0
  4. data/Rakefile +1 -0
  5. data/examples/stream-active-record-to-csv.rb +42 -0
  6. data/lib/flat_kit.rb +9 -2
  7. data/lib/flat_kit/cli.rb +11 -0
  8. data/lib/flat_kit/command.rb +1 -0
  9. data/lib/flat_kit/command/stats.rb +94 -0
  10. data/lib/flat_kit/descendant_tracker.rb +9 -0
  11. data/lib/flat_kit/event_emitter.rb +2 -2
  12. data/lib/flat_kit/field_stats.rb +241 -0
  13. data/lib/flat_kit/field_type.rb +75 -0
  14. data/lib/flat_kit/field_type/boolean_type.rb +48 -0
  15. data/lib/flat_kit/field_type/date_type.rb +179 -0
  16. data/lib/flat_kit/field_type/float_type.rb +37 -0
  17. data/lib/flat_kit/field_type/guess_type.rb +20 -0
  18. data/lib/flat_kit/field_type/integer_type.rb +34 -0
  19. data/lib/flat_kit/field_type/null_type.rb +35 -0
  20. data/lib/flat_kit/field_type/string_type.rb +22 -0
  21. data/lib/flat_kit/field_type/timestamp_type.rb +47 -0
  22. data/lib/flat_kit/field_type/unknown_type.rb +26 -0
  23. data/lib/flat_kit/jsonl/record.rb +2 -2
  24. data/lib/flat_kit/jsonl/writer.rb +18 -9
  25. data/lib/flat_kit/merge.rb +4 -2
  26. data/lib/flat_kit/output.rb +4 -0
  27. data/lib/flat_kit/position.rb +19 -0
  28. data/lib/flat_kit/stat_type.rb +65 -0
  29. data/lib/flat_kit/stat_type/nominal_stats.rb +58 -0
  30. data/lib/flat_kit/stat_type/numerical_stats.rb +120 -0
  31. data/lib/flat_kit/stat_type/ordinal_stats.rb +42 -0
  32. data/lib/flat_kit/stats.rb +66 -0
  33. data/lib/flat_kit/writer.rb +17 -2
  34. data/lib/flat_kit/xsv/writer.rb +24 -9
  35. data/test/field_type/test_boolean_type.rb +65 -0
  36. data/test/field_type/test_date_type.rb +71 -0
  37. data/test/field_type/test_float_type.rb +56 -0
  38. data/test/field_type/test_guess_type.rb +14 -0
  39. data/test/field_type/test_integer_type.rb +52 -0
  40. data/test/field_type/test_null_type.rb +41 -0
  41. data/test/field_type/test_string_type.rb +18 -0
  42. data/test/field_type/test_timestamp_type.rb +108 -0
  43. data/test/field_type/test_unknown_type.rb +35 -0
  44. data/test/jsonl/test_writer.rb +21 -3
  45. data/test/run +23 -0
  46. data/test/stat_type/test_nominal_stats.rb +69 -0
  47. data/test/stat_type/test_numerical_stats.rb +118 -0
  48. data/test/stat_type/test_ordinal_stats.rb +92 -0
  49. data/test/test_event_emitter.rb +19 -2
  50. data/test/test_field_stats.rb +134 -0
  51. data/test/test_field_type.rb +34 -0
  52. data/test/xsv/test_writer.rb +25 -4
  53. metadata +65 -2
@@ -0,0 +1,92 @@
1
+ require_relative '../test_helper'
2
+
3
+ module TestStatType
4
+ class TestOrdinalStats < ::Minitest::Test
5
+ def setup
6
+ today = Date.today
7
+ next_month = today >> 1
8
+ last_day_of_month = (next_month - 1).mday
9
+
10
+ @start_date = Date.new(today.year, today.month, 1)
11
+ @end_date = Date.new(today.year, today.month, last_day_of_month)
12
+
13
+ @unique_values = (@start_date..@end_date).to_a
14
+ @values = Array.new.tap do |a|
15
+ @unique_values.each do |date|
16
+ (Random.rand(42) + 1).times { a << date}
17
+ end
18
+ end
19
+
20
+ @frequencies = @values.tally
21
+
22
+ @stats = ::FlatKit::StatType::OrdinalStats.new
23
+ @all_stats = ::FlatKit::StatType::OrdinalStats.new(collecting_frequencies: true)
24
+
25
+ @values.each do |v|
26
+ @stats.update(v)
27
+ @all_stats.update(v)
28
+ end
29
+ end
30
+
31
+ def test_count
32
+ assert_equal(@values.size, @stats.count)
33
+ assert_equal(@values.size, @all_stats.count)
34
+ end
35
+
36
+ def test_min
37
+ assert_equal(@values.min, @stats.min)
38
+ assert_equal(@values.min, @all_stats.min)
39
+ end
40
+
41
+ def test_max
42
+ assert_equal(@values.max, @stats.max)
43
+ assert_equal(@values.max, @all_stats.max)
44
+ end
45
+
46
+ def test_does_not_collect_unique_count_by_default
47
+ assert_nil(@stats.unique_count)
48
+ end
49
+
50
+ def test_does_not_collect_unique_values_by_default
51
+ assert_nil(@stats.unique_values)
52
+ end
53
+
54
+ def test_does_not_collect_frequencies_by_default
55
+ assert_nil(@stats.frequencies)
56
+ end
57
+
58
+ def test_unique_count
59
+ assert_equal(@unique_values.size, @all_stats.unique_count)
60
+ end
61
+
62
+ def test_unique_values
63
+ assert_equal(@unique_values.sort, @all_stats.unique_values.sort)
64
+ end
65
+
66
+ def test_frequencies
67
+ assert_equal(@frequencies, @all_stats.frequencies)
68
+ end
69
+
70
+ def test_default_to_hash
71
+ expecting = {
72
+ "count" => @values.size,
73
+ "max" => @values.max,
74
+ "min" => @values.min,
75
+ }
76
+ assert_equal(expecting, @stats.to_hash)
77
+ end
78
+
79
+ def test_all_stats_hash
80
+ expecting = {
81
+ "count" => @values.size,
82
+ "unique_count" => @unique_values.size,
83
+ "unique_values" => @unique_values.sort,
84
+ "mode" => @frequencies.max_by { |k,v| v }.first,
85
+ "max" => @values.max,
86
+ "min" => @values.min,
87
+ }
88
+ assert_equal(expecting, @all_stats.to_hash)
89
+ end
90
+ end
91
+ end
92
+
@@ -8,15 +8,22 @@ class TestEventEmitter < ::Minitest::Test
8
8
  class Sub
9
9
  attr_reader :name
10
10
  attr_reader :data
11
+ attr_reader :meta
11
12
 
12
13
  def initialize
13
14
  @name = nil
14
15
  @data = nil
16
+ @meta = nil
15
17
  end
16
18
 
17
- def on_event(name:, data:)
19
+ def [](key)
20
+ @meta[key]
21
+ end
22
+
23
+ def on_event(name:, data:, meta:)
18
24
  @name = name
19
25
  @data = data
26
+ @meta = meta
20
27
  end
21
28
  end
22
29
 
@@ -61,12 +68,22 @@ class TestEventEmitter < ::Minitest::Test
61
68
  @emitter.add_listener(@receiver)
62
69
  @emitter.add_listener(@receiver_2)
63
70
 
64
- @emitter.notify_listeners(name: :notification, data: "DATA!")
71
+ meta = {
72
+ foo: "foo",
73
+ bar: 42,
74
+ }
75
+ @emitter.notify_listeners(name: :notification, data: "DATA!", meta: meta)
65
76
 
66
77
  assert_equal(:notification, @receiver.name)
67
78
  assert_equal(:notification, @receiver_2.name)
68
79
 
69
80
  assert_equal("DATA!", @receiver.data)
70
81
  assert_equal("DATA!", @receiver_2.data)
82
+
83
+ assert_equal("foo", @receiver[:foo])
84
+ assert_equal("foo", @receiver_2[:foo])
85
+
86
+ assert_equal(42, @receiver[:bar])
87
+ assert_equal(42, @receiver_2[:bar])
71
88
  end
72
89
  end
@@ -0,0 +1,134 @@
1
+ require_relative './test_helper'
2
+
3
+ require 'faker'
4
+
5
+ class TestFieldStats < ::Minitest::Test
6
+ # returns [FieldStats, Array] where the array is the original data
7
+ def generate_data_with(count: 100, stats: ::FlatKit::FieldStats.new(name: "data"), &block)
8
+ list = Array.new.tap do |a|
9
+ count.times do
10
+ n = block.call
11
+ stats.update(n)
12
+ a << n
13
+ end
14
+ end
15
+ [stats, list]
16
+ end
17
+
18
+ def test_raises_error_on_invalid_stats
19
+ assert_raises(ArgumentError) { ::FlatKit::FieldStats.new(name: "test", stats_to_collect: :whatever) }
20
+ end
21
+
22
+ def test_collects_numeric_default_stats
23
+ field_stats, number_data = generate_data_with { Faker::Number.within(range: 1.0..100.0) }
24
+
25
+ null_count = 5
26
+ null_count.times {
27
+ field_stats.update(nil)
28
+ }
29
+
30
+ avg = number_data.sum / number_data.size
31
+ min = number_data.min
32
+ max = number_data.max
33
+ sum = number_data.sum
34
+
35
+ refute(field_stats.field_type_determined?)
36
+
37
+ assert_equal(null_count, field_stats.null_count)
38
+ assert_equal(number_data.size, field_stats.count)
39
+
40
+ assert(field_stats.field_type_determined?)
41
+
42
+ assert_in_epsilon(avg, field_stats.mean)
43
+ assert_equal(min, field_stats.min)
44
+ assert_equal(max, field_stats.max)
45
+ assert_in_epsilon(sum, field_stats.sum)
46
+ expected_percent = (null_count.to_f / (null_count + number_data.size)) * 100.0
47
+ assert_in_epsilon(expected_percent, field_stats.null_percent)
48
+ end
49
+
50
+ def test_collect_numeric_cardinality_stats
51
+ field_stats = ::FlatKit::FieldStats.new(name: "number-cardinality",
52
+ stats_to_collect: ::FlatKit::FieldStats::ALL_STATS)
53
+ field_stats, number_data = generate_data_with(stats: field_stats) {
54
+ Faker::Number.within(range:1..25)
55
+ }
56
+
57
+ avg = number_data.sum.to_f / number_data.size
58
+ min = number_data.min
59
+ max = number_data.max
60
+
61
+ assert(field_stats.collecting_frequencies?)
62
+ refute(field_stats.field_type_determined?)
63
+
64
+ assert_equal(number_data.size, field_stats.count)
65
+
66
+ assert(field_stats.field_type_determined?)
67
+
68
+ assert_in_epsilon(avg, field_stats.mean)
69
+ assert_equal(min, field_stats.min)
70
+ assert_equal(max, field_stats.max)
71
+
72
+ assert_equal(number_data.tally.keys.size, field_stats.unique_count)
73
+ assert_equal(number_data.tally.keys.sort, field_stats.unique_values.sort)
74
+ assert_equal(number_data.tally, field_stats.frequencies)
75
+
76
+ mode = number_data.tally.max_by{ |k,v| v }.first
77
+ assert_equal(mode, field_stats.mode)
78
+ end
79
+
80
+ def test_unknown_type_stats
81
+ field_stats = ::FlatKit::FieldStats.new(name: "numeric-with-unknown")
82
+ field_stats, number_data = generate_data_with { Faker::Number.within(range: 1.0..100.0) }
83
+
84
+ unknown_count = 20
85
+ unknown_count.times {
86
+ field_stats.update("unknown")
87
+ }
88
+
89
+ refute(field_stats.field_type_determined?)
90
+
91
+ assert_equal(unknown_count, field_stats.unknown_count)
92
+ assert_equal(unknown_count + number_data.size, field_stats.total_count)
93
+
94
+ expected_percent = (unknown_count.to_f / (unknown_count + number_data.size)) * 100.0
95
+
96
+ assert_in_epsilon(expected_percent, field_stats.unknown_percent)
97
+ end
98
+
99
+ def test_resolves_type_automatically
100
+ field_stats = ::FlatKit::FieldStats.new(name: "numeric-autoresolve",guess_threshold: 101)
101
+ field_stats, _ = generate_data_with(stats: field_stats) { Faker::Number.within(range: 1.0..100.0) }
102
+
103
+ refute(field_stats.field_type_determined?)
104
+ field_stats, _ = generate_data_with(stats: field_stats) { Faker::Number.within(range: 200.0..300.0) }
105
+ assert(field_stats.field_type_determined?)
106
+ end
107
+
108
+ def test_resolves_integer_appropriately_with_mixed_data
109
+ field_stats = ::FlatKit::FieldStats.new(name: "numeric-integer",guess_threshold: 100)
110
+ field_stats, _ = generate_data_with(count: 40, stats: field_stats) { Faker::Number.within(range: 0..1).to_s }
111
+ field_stats, _ = generate_data_with(count: 70, stats: field_stats) { Faker::Number.within(range: 0..200).to_s }
112
+
113
+ assert_equal(::FlatKit::FieldType::IntegerType, field_stats.field_type)
114
+
115
+ end
116
+
117
+ def test_resolves_boolean_appropriately_with_mixed_data
118
+ field_stats = ::FlatKit::FieldStats.new(name: "numeric-integer",guess_threshold: 100)
119
+ field_stats, _ = generate_data_with(count: 70, stats: field_stats) { Faker::Boolean.boolean.to_s }
120
+ field_stats, _ = generate_data_with(count: 40, stats: field_stats) { Faker::Number.within(range: 0..200).to_s }
121
+ assert_equal(::FlatKit::FieldType::BooleanType, field_stats.field_type)
122
+ end
123
+
124
+ def test_resolves_string_appropriately_with_mixed_data
125
+ field_stats = ::FlatKit::FieldStats.new(name: "string",guess_threshold: 100)
126
+ field_stats, _ = generate_data_with(count: 61, stats: field_stats) { Faker::Color.name.to_s }
127
+ field_stats, _ = generate_data_with(count: 59, stats: field_stats) { Faker::Number.within(range: 0..200).to_s }
128
+ assert_equal(::FlatKit::FieldType::StringType, field_stats.field_type)
129
+
130
+ assert_equal(120, field_stats.count)
131
+ assert_equal(0, field_stats.unknown_count)
132
+ assert_equal(0, field_stats.null_count)
133
+ end
134
+ end
@@ -0,0 +1,34 @@
1
+ require_relative './test_helper'
2
+
3
+ module TestFieldType
4
+ class TestFieldType < ::Minitest::Test
5
+
6
+ def test_weight_raises_exception
7
+ assert_raises(NotImplementedError) { ::FlatKit::FieldType.weight }
8
+ end
9
+
10
+ def test_best_guesses
11
+ guesses = {
12
+ "t" => ::FlatKit::FieldType::BooleanType,
13
+ "1" => ::FlatKit::FieldType::BooleanType,
14
+ "0" => ::FlatKit::FieldType::BooleanType,
15
+ "n" => ::FlatKit::FieldType::BooleanType,
16
+ "42" => ::FlatKit::FieldType::IntegerType,
17
+ "nil" => ::FlatKit::FieldType::NullType,
18
+ "n/a" => ::FlatKit::FieldType::UnknownType,
19
+ "foo" => ::FlatKit::FieldType::StringType,
20
+ "12.3" => ::FlatKit::FieldType::FloatType,
21
+ "2021-02-26" => ::FlatKit::FieldType::DateType,
22
+ "2020-03-03T12:34:56Z" => ::FlatKit::FieldType::TimestampType,
23
+ }
24
+
25
+ guesses.each do |test, expected|
26
+ assert_equal(expected, ::FlatKit::FieldType.best_guess(test), "Expected '#{test}' to be #{expected}")
27
+ end
28
+ end
29
+
30
+ def test_children_exist
31
+ assert_equal(9,::FlatKit::FieldType.children.size)
32
+ end
33
+ end
34
+ end
@@ -41,12 +41,33 @@ module TestXsv
41
41
  assert_equal(expected, actual)
42
42
  end
43
43
 
44
- def test_writes_to_io
44
+ def test_position
45
45
  File.open(@write_path, "w+") do |f|
46
- writer = ::FlatKit::Xsv::Writer.new(destination: f,fields: @reader.fields)
46
+ writer = ::FlatKit::Xsv::Writer.new(destination: f,fields: :auto)
47
+ records_bytes = 0
48
+ header_bytes = nil
49
+
50
+ @records.each_with_index do |r, idx|
51
+ record_length = r.to_s.bytesize
52
+
53
+ position = writer.write(r)
54
+ # make sure write stores the last_position api and returns that value
55
+ assert_equal(position, writer.last_position)
56
+
57
+ header_bytes = writer.header_bytes if header_bytes == nil
58
+ assert(header_bytes > 0)
59
+
60
+ assert_equal(idx, position.index)
61
+ assert_equal(header_bytes + records_bytes, position.offset)
62
+ assert_equal(record_length, position.bytesize)
63
+
64
+ records_bytes += record_length
65
+
66
+ current_position = writer.current_position
67
+ assert_equal(idx+1, current_position.index)
68
+ assert_equal(header_bytes + records_bytes, current_position.offset)
69
+ assert_equal(0, current_position.bytesize)
47
70
 
48
- @records.each do |r|
49
- writer.write(r)
50
71
  end
51
72
  writer.close
52
73
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: flat_kit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jeremy Hinegardner
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-02-03 00:00:00.000000000 Z
11
+ date: 2021-03-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: oj
@@ -38,6 +38,20 @@ dependencies:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
40
  version: '3.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: faker
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '2.16'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '2.16'
41
55
  - !ruby/object:Gem::Dependency
42
56
  name: rake
43
57
  requirement: !ruby/object:Gem::Requirement
@@ -128,15 +142,28 @@ files:
128
142
  - README.md
129
143
  - Rakefile
130
144
  - bin/fk
145
+ - examples/stream-active-record-to-csv.rb
131
146
  - lib/flat_kit.rb
132
147
  - lib/flat_kit/cli.rb
133
148
  - lib/flat_kit/command.rb
134
149
  - lib/flat_kit/command/cat.rb
135
150
  - lib/flat_kit/command/merge.rb
136
151
  - lib/flat_kit/command/sort.rb
152
+ - lib/flat_kit/command/stats.rb
137
153
  - lib/flat_kit/descendant_tracker.rb
138
154
  - lib/flat_kit/error.rb
139
155
  - lib/flat_kit/event_emitter.rb
156
+ - lib/flat_kit/field_stats.rb
157
+ - lib/flat_kit/field_type.rb
158
+ - lib/flat_kit/field_type/boolean_type.rb
159
+ - lib/flat_kit/field_type/date_type.rb
160
+ - lib/flat_kit/field_type/float_type.rb
161
+ - lib/flat_kit/field_type/guess_type.rb
162
+ - lib/flat_kit/field_type/integer_type.rb
163
+ - lib/flat_kit/field_type/null_type.rb
164
+ - lib/flat_kit/field_type/string_type.rb
165
+ - lib/flat_kit/field_type/timestamp_type.rb
166
+ - lib/flat_kit/field_type/unknown_type.rb
140
167
  - lib/flat_kit/format.rb
141
168
  - lib/flat_kit/input.rb
142
169
  - lib/flat_kit/input/file.rb
@@ -154,11 +181,17 @@ files:
154
181
  - lib/flat_kit/output.rb
155
182
  - lib/flat_kit/output/file.rb
156
183
  - lib/flat_kit/output/io.rb
184
+ - lib/flat_kit/position.rb
157
185
  - lib/flat_kit/reader.rb
158
186
  - lib/flat_kit/record.rb
159
187
  - lib/flat_kit/sentinel_internal_node.rb
160
188
  - lib/flat_kit/sentinel_leaf_node.rb
161
189
  - lib/flat_kit/sort.rb
190
+ - lib/flat_kit/stat_type.rb
191
+ - lib/flat_kit/stat_type/nominal_stats.rb
192
+ - lib/flat_kit/stat_type/numerical_stats.rb
193
+ - lib/flat_kit/stat_type/ordinal_stats.rb
194
+ - lib/flat_kit/stats.rb
162
195
  - lib/flat_kit/writer.rb
163
196
  - lib/flat_kit/xsv.rb
164
197
  - lib/flat_kit/xsv/format.rb
@@ -170,6 +203,15 @@ files:
170
203
  - tasks/man.rake
171
204
  - tasks/this.rb
172
205
  - test/device_dataset.rb
206
+ - test/field_type/test_boolean_type.rb
207
+ - test/field_type/test_date_type.rb
208
+ - test/field_type/test_float_type.rb
209
+ - test/field_type/test_guess_type.rb
210
+ - test/field_type/test_integer_type.rb
211
+ - test/field_type/test_null_type.rb
212
+ - test/field_type/test_string_type.rb
213
+ - test/field_type/test_timestamp_type.rb
214
+ - test/field_type/test_unknown_type.rb
173
215
  - test/input/test_file.rb
174
216
  - test/input/test_io.rb
175
217
  - test/jsonl/test_format.rb
@@ -178,8 +220,14 @@ files:
178
220
  - test/jsonl/test_writer.rb
179
221
  - test/output/test_file.rb
180
222
  - test/output/test_io.rb
223
+ - test/run
224
+ - test/stat_type/test_nominal_stats.rb
225
+ - test/stat_type/test_numerical_stats.rb
226
+ - test/stat_type/test_ordinal_stats.rb
181
227
  - test/test_conversions.rb
182
228
  - test/test_event_emitter.rb
229
+ - test/test_field_stats.rb
230
+ - test/test_field_type.rb
183
231
  - test/test_format.rb
184
232
  - test/test_helper.rb
185
233
  - test/test_merge.rb
@@ -219,6 +267,15 @@ summary: A library and commandline program for reading, writing, indexing, sorti
219
267
  and merging CSV, TSV, JSON and other flat-file formats.
220
268
  test_files:
221
269
  - test/device_dataset.rb
270
+ - test/field_type/test_boolean_type.rb
271
+ - test/field_type/test_date_type.rb
272
+ - test/field_type/test_float_type.rb
273
+ - test/field_type/test_guess_type.rb
274
+ - test/field_type/test_integer_type.rb
275
+ - test/field_type/test_null_type.rb
276
+ - test/field_type/test_string_type.rb
277
+ - test/field_type/test_timestamp_type.rb
278
+ - test/field_type/test_unknown_type.rb
222
279
  - test/input/test_file.rb
223
280
  - test/input/test_io.rb
224
281
  - test/jsonl/test_format.rb
@@ -227,8 +284,14 @@ test_files:
227
284
  - test/jsonl/test_writer.rb
228
285
  - test/output/test_file.rb
229
286
  - test/output/test_io.rb
287
+ - test/run
288
+ - test/stat_type/test_nominal_stats.rb
289
+ - test/stat_type/test_numerical_stats.rb
290
+ - test/stat_type/test_ordinal_stats.rb
230
291
  - test/test_conversions.rb
231
292
  - test/test_event_emitter.rb
293
+ - test/test_field_stats.rb
294
+ - test/test_field_type.rb
232
295
  - test/test_format.rb
233
296
  - test/test_helper.rb
234
297
  - test/test_merge.rb