flat_kit 0.2.0 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (53) hide show
  1. checksums.yaml +4 -4
  2. data/HISTORY.md +6 -0
  3. data/Manifest.txt +34 -0
  4. data/Rakefile +1 -0
  5. data/examples/stream-active-record-to-csv.rb +42 -0
  6. data/lib/flat_kit.rb +9 -2
  7. data/lib/flat_kit/cli.rb +11 -0
  8. data/lib/flat_kit/command.rb +1 -0
  9. data/lib/flat_kit/command/stats.rb +94 -0
  10. data/lib/flat_kit/descendant_tracker.rb +9 -0
  11. data/lib/flat_kit/event_emitter.rb +2 -2
  12. data/lib/flat_kit/field_stats.rb +241 -0
  13. data/lib/flat_kit/field_type.rb +75 -0
  14. data/lib/flat_kit/field_type/boolean_type.rb +48 -0
  15. data/lib/flat_kit/field_type/date_type.rb +179 -0
  16. data/lib/flat_kit/field_type/float_type.rb +37 -0
  17. data/lib/flat_kit/field_type/guess_type.rb +20 -0
  18. data/lib/flat_kit/field_type/integer_type.rb +34 -0
  19. data/lib/flat_kit/field_type/null_type.rb +35 -0
  20. data/lib/flat_kit/field_type/string_type.rb +22 -0
  21. data/lib/flat_kit/field_type/timestamp_type.rb +47 -0
  22. data/lib/flat_kit/field_type/unknown_type.rb +26 -0
  23. data/lib/flat_kit/jsonl/record.rb +2 -2
  24. data/lib/flat_kit/jsonl/writer.rb +18 -9
  25. data/lib/flat_kit/merge.rb +4 -2
  26. data/lib/flat_kit/output.rb +4 -0
  27. data/lib/flat_kit/position.rb +19 -0
  28. data/lib/flat_kit/stat_type.rb +65 -0
  29. data/lib/flat_kit/stat_type/nominal_stats.rb +58 -0
  30. data/lib/flat_kit/stat_type/numerical_stats.rb +120 -0
  31. data/lib/flat_kit/stat_type/ordinal_stats.rb +42 -0
  32. data/lib/flat_kit/stats.rb +66 -0
  33. data/lib/flat_kit/writer.rb +17 -2
  34. data/lib/flat_kit/xsv/writer.rb +24 -9
  35. data/test/field_type/test_boolean_type.rb +65 -0
  36. data/test/field_type/test_date_type.rb +71 -0
  37. data/test/field_type/test_float_type.rb +56 -0
  38. data/test/field_type/test_guess_type.rb +14 -0
  39. data/test/field_type/test_integer_type.rb +52 -0
  40. data/test/field_type/test_null_type.rb +41 -0
  41. data/test/field_type/test_string_type.rb +18 -0
  42. data/test/field_type/test_timestamp_type.rb +108 -0
  43. data/test/field_type/test_unknown_type.rb +35 -0
  44. data/test/jsonl/test_writer.rb +21 -3
  45. data/test/run +23 -0
  46. data/test/stat_type/test_nominal_stats.rb +69 -0
  47. data/test/stat_type/test_numerical_stats.rb +118 -0
  48. data/test/stat_type/test_ordinal_stats.rb +92 -0
  49. data/test/test_event_emitter.rb +19 -2
  50. data/test/test_field_stats.rb +134 -0
  51. data/test/test_field_type.rb +34 -0
  52. data/test/xsv/test_writer.rb +25 -4
  53. metadata +65 -2
@@ -0,0 +1,92 @@
1
+ require_relative '../test_helper'
2
+
3
+ module TestStatType
4
+ class TestOrdinalStats < ::Minitest::Test
5
+ def setup
6
+ today = Date.today
7
+ next_month = today >> 1
8
+ last_day_of_month = (next_month - 1).mday
9
+
10
+ @start_date = Date.new(today.year, today.month, 1)
11
+ @end_date = Date.new(today.year, today.month, last_day_of_month)
12
+
13
+ @unique_values = (@start_date..@end_date).to_a
14
+ @values = Array.new.tap do |a|
15
+ @unique_values.each do |date|
16
+ (Random.rand(42) + 1).times { a << date}
17
+ end
18
+ end
19
+
20
+ @frequencies = @values.tally
21
+
22
+ @stats = ::FlatKit::StatType::OrdinalStats.new
23
+ @all_stats = ::FlatKit::StatType::OrdinalStats.new(collecting_frequencies: true)
24
+
25
+ @values.each do |v|
26
+ @stats.update(v)
27
+ @all_stats.update(v)
28
+ end
29
+ end
30
+
31
+ def test_count
32
+ assert_equal(@values.size, @stats.count)
33
+ assert_equal(@values.size, @all_stats.count)
34
+ end
35
+
36
+ def test_min
37
+ assert_equal(@values.min, @stats.min)
38
+ assert_equal(@values.min, @all_stats.min)
39
+ end
40
+
41
+ def test_max
42
+ assert_equal(@values.max, @stats.max)
43
+ assert_equal(@values.max, @all_stats.max)
44
+ end
45
+
46
+ def test_does_not_collect_unique_count_by_default
47
+ assert_nil(@stats.unique_count)
48
+ end
49
+
50
+ def test_does_not_collect_unique_values_by_default
51
+ assert_nil(@stats.unique_values)
52
+ end
53
+
54
+ def test_does_not_collect_frequencies_by_default
55
+ assert_nil(@stats.frequencies)
56
+ end
57
+
58
+ def test_unique_count
59
+ assert_equal(@unique_values.size, @all_stats.unique_count)
60
+ end
61
+
62
+ def test_unique_values
63
+ assert_equal(@unique_values.sort, @all_stats.unique_values.sort)
64
+ end
65
+
66
+ def test_frequencies
67
+ assert_equal(@frequencies, @all_stats.frequencies)
68
+ end
69
+
70
+ def test_default_to_hash
71
+ expecting = {
72
+ "count" => @values.size,
73
+ "max" => @values.max,
74
+ "min" => @values.min,
75
+ }
76
+ assert_equal(expecting, @stats.to_hash)
77
+ end
78
+
79
+ def test_all_stats_hash
80
+ expecting = {
81
+ "count" => @values.size,
82
+ "unique_count" => @unique_values.size,
83
+ "unique_values" => @unique_values.sort,
84
+ "mode" => @frequencies.max_by { |k,v| v }.first,
85
+ "max" => @values.max,
86
+ "min" => @values.min,
87
+ }
88
+ assert_equal(expecting, @all_stats.to_hash)
89
+ end
90
+ end
91
+ end
92
+
@@ -8,15 +8,22 @@ class TestEventEmitter < ::Minitest::Test
8
8
  class Sub
9
9
  attr_reader :name
10
10
  attr_reader :data
11
+ attr_reader :meta
11
12
 
12
13
  def initialize
13
14
  @name = nil
14
15
  @data = nil
16
+ @meta = nil
15
17
  end
16
18
 
17
- def on_event(name:, data:)
19
+ def [](key)
20
+ @meta[key]
21
+ end
22
+
23
+ def on_event(name:, data:, meta:)
18
24
  @name = name
19
25
  @data = data
26
+ @meta = meta
20
27
  end
21
28
  end
22
29
 
@@ -61,12 +68,22 @@ class TestEventEmitter < ::Minitest::Test
61
68
  @emitter.add_listener(@receiver)
62
69
  @emitter.add_listener(@receiver_2)
63
70
 
64
- @emitter.notify_listeners(name: :notification, data: "DATA!")
71
+ meta = {
72
+ foo: "foo",
73
+ bar: 42,
74
+ }
75
+ @emitter.notify_listeners(name: :notification, data: "DATA!", meta: meta)
65
76
 
66
77
  assert_equal(:notification, @receiver.name)
67
78
  assert_equal(:notification, @receiver_2.name)
68
79
 
69
80
  assert_equal("DATA!", @receiver.data)
70
81
  assert_equal("DATA!", @receiver_2.data)
82
+
83
+ assert_equal("foo", @receiver[:foo])
84
+ assert_equal("foo", @receiver_2[:foo])
85
+
86
+ assert_equal(42, @receiver[:bar])
87
+ assert_equal(42, @receiver_2[:bar])
71
88
  end
72
89
  end
@@ -0,0 +1,134 @@
1
+ require_relative './test_helper'
2
+
3
+ require 'faker'
4
+
5
+ class TestFieldStats < ::Minitest::Test
6
+ # returns [FieldStats, Array] where the array is the original data
7
+ def generate_data_with(count: 100, stats: ::FlatKit::FieldStats.new(name: "data"), &block)
8
+ list = Array.new.tap do |a|
9
+ count.times do
10
+ n = block.call
11
+ stats.update(n)
12
+ a << n
13
+ end
14
+ end
15
+ [stats, list]
16
+ end
17
+
18
+ def test_raises_error_on_invalid_stats
19
+ assert_raises(ArgumentError) { ::FlatKit::FieldStats.new(name: "test", stats_to_collect: :whatever) }
20
+ end
21
+
22
+ def test_collects_numeric_default_stats
23
+ field_stats, number_data = generate_data_with { Faker::Number.within(range: 1.0..100.0) }
24
+
25
+ null_count = 5
26
+ null_count.times {
27
+ field_stats.update(nil)
28
+ }
29
+
30
+ avg = number_data.sum / number_data.size
31
+ min = number_data.min
32
+ max = number_data.max
33
+ sum = number_data.sum
34
+
35
+ refute(field_stats.field_type_determined?)
36
+
37
+ assert_equal(null_count, field_stats.null_count)
38
+ assert_equal(number_data.size, field_stats.count)
39
+
40
+ assert(field_stats.field_type_determined?)
41
+
42
+ assert_in_epsilon(avg, field_stats.mean)
43
+ assert_equal(min, field_stats.min)
44
+ assert_equal(max, field_stats.max)
45
+ assert_in_epsilon(sum, field_stats.sum)
46
+ expected_percent = (null_count.to_f / (null_count + number_data.size)) * 100.0
47
+ assert_in_epsilon(expected_percent, field_stats.null_percent)
48
+ end
49
+
50
+ def test_collect_numeric_cardinality_stats
51
+ field_stats = ::FlatKit::FieldStats.new(name: "number-cardinality",
52
+ stats_to_collect: ::FlatKit::FieldStats::ALL_STATS)
53
+ field_stats, number_data = generate_data_with(stats: field_stats) {
54
+ Faker::Number.within(range:1..25)
55
+ }
56
+
57
+ avg = number_data.sum.to_f / number_data.size
58
+ min = number_data.min
59
+ max = number_data.max
60
+
61
+ assert(field_stats.collecting_frequencies?)
62
+ refute(field_stats.field_type_determined?)
63
+
64
+ assert_equal(number_data.size, field_stats.count)
65
+
66
+ assert(field_stats.field_type_determined?)
67
+
68
+ assert_in_epsilon(avg, field_stats.mean)
69
+ assert_equal(min, field_stats.min)
70
+ assert_equal(max, field_stats.max)
71
+
72
+ assert_equal(number_data.tally.keys.size, field_stats.unique_count)
73
+ assert_equal(number_data.tally.keys.sort, field_stats.unique_values.sort)
74
+ assert_equal(number_data.tally, field_stats.frequencies)
75
+
76
+ mode = number_data.tally.max_by{ |k,v| v }.first
77
+ assert_equal(mode, field_stats.mode)
78
+ end
79
+
80
+ def test_unknown_type_stats
81
+ field_stats = ::FlatKit::FieldStats.new(name: "numeric-with-unknown")
82
+ field_stats, number_data = generate_data_with { Faker::Number.within(range: 1.0..100.0) }
83
+
84
+ unknown_count = 20
85
+ unknown_count.times {
86
+ field_stats.update("unknown")
87
+ }
88
+
89
+ refute(field_stats.field_type_determined?)
90
+
91
+ assert_equal(unknown_count, field_stats.unknown_count)
92
+ assert_equal(unknown_count + number_data.size, field_stats.total_count)
93
+
94
+ expected_percent = (unknown_count.to_f / (unknown_count + number_data.size)) * 100.0
95
+
96
+ assert_in_epsilon(expected_percent, field_stats.unknown_percent)
97
+ end
98
+
99
+ def test_resolves_type_automatically
100
+ field_stats = ::FlatKit::FieldStats.new(name: "numeric-autoresolve",guess_threshold: 101)
101
+ field_stats, _ = generate_data_with(stats: field_stats) { Faker::Number.within(range: 1.0..100.0) }
102
+
103
+ refute(field_stats.field_type_determined?)
104
+ field_stats, _ = generate_data_with(stats: field_stats) { Faker::Number.within(range: 200.0..300.0) }
105
+ assert(field_stats.field_type_determined?)
106
+ end
107
+
108
+ def test_resolves_integer_appropriately_with_mixed_data
109
+ field_stats = ::FlatKit::FieldStats.new(name: "numeric-integer",guess_threshold: 100)
110
+ field_stats, _ = generate_data_with(count: 40, stats: field_stats) { Faker::Number.within(range: 0..1).to_s }
111
+ field_stats, _ = generate_data_with(count: 70, stats: field_stats) { Faker::Number.within(range: 0..200).to_s }
112
+
113
+ assert_equal(::FlatKit::FieldType::IntegerType, field_stats.field_type)
114
+
115
+ end
116
+
117
+ def test_resolves_boolean_appropriately_with_mixed_data
118
+ field_stats = ::FlatKit::FieldStats.new(name: "numeric-integer",guess_threshold: 100)
119
+ field_stats, _ = generate_data_with(count: 70, stats: field_stats) { Faker::Boolean.boolean.to_s }
120
+ field_stats, _ = generate_data_with(count: 40, stats: field_stats) { Faker::Number.within(range: 0..200).to_s }
121
+ assert_equal(::FlatKit::FieldType::BooleanType, field_stats.field_type)
122
+ end
123
+
124
+ def test_resolves_string_appropriately_with_mixed_data
125
+ field_stats = ::FlatKit::FieldStats.new(name: "string",guess_threshold: 100)
126
+ field_stats, _ = generate_data_with(count: 61, stats: field_stats) { Faker::Color.name.to_s }
127
+ field_stats, _ = generate_data_with(count: 59, stats: field_stats) { Faker::Number.within(range: 0..200).to_s }
128
+ assert_equal(::FlatKit::FieldType::StringType, field_stats.field_type)
129
+
130
+ assert_equal(120, field_stats.count)
131
+ assert_equal(0, field_stats.unknown_count)
132
+ assert_equal(0, field_stats.null_count)
133
+ end
134
+ end
@@ -0,0 +1,34 @@
1
+ require_relative './test_helper'
2
+
3
+ module TestFieldType
4
+ class TestFieldType < ::Minitest::Test
5
+
6
+ def test_weight_raises_exception
7
+ assert_raises(NotImplementedError) { ::FlatKit::FieldType.weight }
8
+ end
9
+
10
+ def test_best_guesses
11
+ guesses = {
12
+ "t" => ::FlatKit::FieldType::BooleanType,
13
+ "1" => ::FlatKit::FieldType::BooleanType,
14
+ "0" => ::FlatKit::FieldType::BooleanType,
15
+ "n" => ::FlatKit::FieldType::BooleanType,
16
+ "42" => ::FlatKit::FieldType::IntegerType,
17
+ "nil" => ::FlatKit::FieldType::NullType,
18
+ "n/a" => ::FlatKit::FieldType::UnknownType,
19
+ "foo" => ::FlatKit::FieldType::StringType,
20
+ "12.3" => ::FlatKit::FieldType::FloatType,
21
+ "2021-02-26" => ::FlatKit::FieldType::DateType,
22
+ "2020-03-03T12:34:56Z" => ::FlatKit::FieldType::TimestampType,
23
+ }
24
+
25
+ guesses.each do |test, expected|
26
+ assert_equal(expected, ::FlatKit::FieldType.best_guess(test), "Expected '#{test}' to be #{expected}")
27
+ end
28
+ end
29
+
30
+ def test_children_exist
31
+ assert_equal(9,::FlatKit::FieldType.children.size)
32
+ end
33
+ end
34
+ end
@@ -41,12 +41,33 @@ module TestXsv
41
41
  assert_equal(expected, actual)
42
42
  end
43
43
 
44
- def test_writes_to_io
44
+ def test_position
45
45
  File.open(@write_path, "w+") do |f|
46
- writer = ::FlatKit::Xsv::Writer.new(destination: f,fields: @reader.fields)
46
+ writer = ::FlatKit::Xsv::Writer.new(destination: f,fields: :auto)
47
+ records_bytes = 0
48
+ header_bytes = nil
49
+
50
+ @records.each_with_index do |r, idx|
51
+ record_length = r.to_s.bytesize
52
+
53
+ position = writer.write(r)
54
+ # make sure write stores the last_position api and returns that value
55
+ assert_equal(position, writer.last_position)
56
+
57
+ header_bytes = writer.header_bytes if header_bytes == nil
58
+ assert(header_bytes > 0)
59
+
60
+ assert_equal(idx, position.index)
61
+ assert_equal(header_bytes + records_bytes, position.offset)
62
+ assert_equal(record_length, position.bytesize)
63
+
64
+ records_bytes += record_length
65
+
66
+ current_position = writer.current_position
67
+ assert_equal(idx+1, current_position.index)
68
+ assert_equal(header_bytes + records_bytes, current_position.offset)
69
+ assert_equal(0, current_position.bytesize)
47
70
 
48
- @records.each do |r|
49
- writer.write(r)
50
71
  end
51
72
  writer.close
52
73
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: flat_kit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jeremy Hinegardner
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-02-03 00:00:00.000000000 Z
11
+ date: 2021-03-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: oj
@@ -38,6 +38,20 @@ dependencies:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
40
  version: '3.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: faker
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '2.16'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '2.16'
41
55
  - !ruby/object:Gem::Dependency
42
56
  name: rake
43
57
  requirement: !ruby/object:Gem::Requirement
@@ -128,15 +142,28 @@ files:
128
142
  - README.md
129
143
  - Rakefile
130
144
  - bin/fk
145
+ - examples/stream-active-record-to-csv.rb
131
146
  - lib/flat_kit.rb
132
147
  - lib/flat_kit/cli.rb
133
148
  - lib/flat_kit/command.rb
134
149
  - lib/flat_kit/command/cat.rb
135
150
  - lib/flat_kit/command/merge.rb
136
151
  - lib/flat_kit/command/sort.rb
152
+ - lib/flat_kit/command/stats.rb
137
153
  - lib/flat_kit/descendant_tracker.rb
138
154
  - lib/flat_kit/error.rb
139
155
  - lib/flat_kit/event_emitter.rb
156
+ - lib/flat_kit/field_stats.rb
157
+ - lib/flat_kit/field_type.rb
158
+ - lib/flat_kit/field_type/boolean_type.rb
159
+ - lib/flat_kit/field_type/date_type.rb
160
+ - lib/flat_kit/field_type/float_type.rb
161
+ - lib/flat_kit/field_type/guess_type.rb
162
+ - lib/flat_kit/field_type/integer_type.rb
163
+ - lib/flat_kit/field_type/null_type.rb
164
+ - lib/flat_kit/field_type/string_type.rb
165
+ - lib/flat_kit/field_type/timestamp_type.rb
166
+ - lib/flat_kit/field_type/unknown_type.rb
140
167
  - lib/flat_kit/format.rb
141
168
  - lib/flat_kit/input.rb
142
169
  - lib/flat_kit/input/file.rb
@@ -154,11 +181,17 @@ files:
154
181
  - lib/flat_kit/output.rb
155
182
  - lib/flat_kit/output/file.rb
156
183
  - lib/flat_kit/output/io.rb
184
+ - lib/flat_kit/position.rb
157
185
  - lib/flat_kit/reader.rb
158
186
  - lib/flat_kit/record.rb
159
187
  - lib/flat_kit/sentinel_internal_node.rb
160
188
  - lib/flat_kit/sentinel_leaf_node.rb
161
189
  - lib/flat_kit/sort.rb
190
+ - lib/flat_kit/stat_type.rb
191
+ - lib/flat_kit/stat_type/nominal_stats.rb
192
+ - lib/flat_kit/stat_type/numerical_stats.rb
193
+ - lib/flat_kit/stat_type/ordinal_stats.rb
194
+ - lib/flat_kit/stats.rb
162
195
  - lib/flat_kit/writer.rb
163
196
  - lib/flat_kit/xsv.rb
164
197
  - lib/flat_kit/xsv/format.rb
@@ -170,6 +203,15 @@ files:
170
203
  - tasks/man.rake
171
204
  - tasks/this.rb
172
205
  - test/device_dataset.rb
206
+ - test/field_type/test_boolean_type.rb
207
+ - test/field_type/test_date_type.rb
208
+ - test/field_type/test_float_type.rb
209
+ - test/field_type/test_guess_type.rb
210
+ - test/field_type/test_integer_type.rb
211
+ - test/field_type/test_null_type.rb
212
+ - test/field_type/test_string_type.rb
213
+ - test/field_type/test_timestamp_type.rb
214
+ - test/field_type/test_unknown_type.rb
173
215
  - test/input/test_file.rb
174
216
  - test/input/test_io.rb
175
217
  - test/jsonl/test_format.rb
@@ -178,8 +220,14 @@ files:
178
220
  - test/jsonl/test_writer.rb
179
221
  - test/output/test_file.rb
180
222
  - test/output/test_io.rb
223
+ - test/run
224
+ - test/stat_type/test_nominal_stats.rb
225
+ - test/stat_type/test_numerical_stats.rb
226
+ - test/stat_type/test_ordinal_stats.rb
181
227
  - test/test_conversions.rb
182
228
  - test/test_event_emitter.rb
229
+ - test/test_field_stats.rb
230
+ - test/test_field_type.rb
183
231
  - test/test_format.rb
184
232
  - test/test_helper.rb
185
233
  - test/test_merge.rb
@@ -219,6 +267,15 @@ summary: A library and commandline program for reading, writing, indexing, sorti
219
267
  and merging CSV, TSV, JSON and other flat-file formats.
220
268
  test_files:
221
269
  - test/device_dataset.rb
270
+ - test/field_type/test_boolean_type.rb
271
+ - test/field_type/test_date_type.rb
272
+ - test/field_type/test_float_type.rb
273
+ - test/field_type/test_guess_type.rb
274
+ - test/field_type/test_integer_type.rb
275
+ - test/field_type/test_null_type.rb
276
+ - test/field_type/test_string_type.rb
277
+ - test/field_type/test_timestamp_type.rb
278
+ - test/field_type/test_unknown_type.rb
222
279
  - test/input/test_file.rb
223
280
  - test/input/test_io.rb
224
281
  - test/jsonl/test_format.rb
@@ -227,8 +284,14 @@ test_files:
227
284
  - test/jsonl/test_writer.rb
228
285
  - test/output/test_file.rb
229
286
  - test/output/test_io.rb
287
+ - test/run
288
+ - test/stat_type/test_nominal_stats.rb
289
+ - test/stat_type/test_numerical_stats.rb
290
+ - test/stat_type/test_ordinal_stats.rb
230
291
  - test/test_conversions.rb
231
292
  - test/test_event_emitter.rb
293
+ - test/test_field_stats.rb
294
+ - test/test_field_type.rb
232
295
  - test/test_format.rb
233
296
  - test/test_helper.rb
234
297
  - test/test_merge.rb