flat_kit 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/HISTORY.md +6 -0
- data/Manifest.txt +34 -0
- data/Rakefile +1 -0
- data/examples/stream-active-record-to-csv.rb +42 -0
- data/lib/flat_kit.rb +9 -2
- data/lib/flat_kit/cli.rb +11 -0
- data/lib/flat_kit/command.rb +1 -0
- data/lib/flat_kit/command/stats.rb +94 -0
- data/lib/flat_kit/descendant_tracker.rb +9 -0
- data/lib/flat_kit/event_emitter.rb +2 -2
- data/lib/flat_kit/field_stats.rb +241 -0
- data/lib/flat_kit/field_type.rb +75 -0
- data/lib/flat_kit/field_type/boolean_type.rb +48 -0
- data/lib/flat_kit/field_type/date_type.rb +179 -0
- data/lib/flat_kit/field_type/float_type.rb +37 -0
- data/lib/flat_kit/field_type/guess_type.rb +20 -0
- data/lib/flat_kit/field_type/integer_type.rb +34 -0
- data/lib/flat_kit/field_type/null_type.rb +35 -0
- data/lib/flat_kit/field_type/string_type.rb +22 -0
- data/lib/flat_kit/field_type/timestamp_type.rb +47 -0
- data/lib/flat_kit/field_type/unknown_type.rb +26 -0
- data/lib/flat_kit/jsonl/record.rb +2 -2
- data/lib/flat_kit/jsonl/writer.rb +18 -9
- data/lib/flat_kit/merge.rb +4 -2
- data/lib/flat_kit/output.rb +4 -0
- data/lib/flat_kit/position.rb +19 -0
- data/lib/flat_kit/stat_type.rb +65 -0
- data/lib/flat_kit/stat_type/nominal_stats.rb +58 -0
- data/lib/flat_kit/stat_type/numerical_stats.rb +120 -0
- data/lib/flat_kit/stat_type/ordinal_stats.rb +42 -0
- data/lib/flat_kit/stats.rb +66 -0
- data/lib/flat_kit/writer.rb +17 -2
- data/lib/flat_kit/xsv/writer.rb +24 -9
- data/test/field_type/test_boolean_type.rb +65 -0
- data/test/field_type/test_date_type.rb +71 -0
- data/test/field_type/test_float_type.rb +56 -0
- data/test/field_type/test_guess_type.rb +14 -0
- data/test/field_type/test_integer_type.rb +52 -0
- data/test/field_type/test_null_type.rb +41 -0
- data/test/field_type/test_string_type.rb +18 -0
- data/test/field_type/test_timestamp_type.rb +108 -0
- data/test/field_type/test_unknown_type.rb +35 -0
- data/test/jsonl/test_writer.rb +21 -3
- data/test/run +23 -0
- data/test/stat_type/test_nominal_stats.rb +69 -0
- data/test/stat_type/test_numerical_stats.rb +118 -0
- data/test/stat_type/test_ordinal_stats.rb +92 -0
- data/test/test_event_emitter.rb +19 -2
- data/test/test_field_stats.rb +134 -0
- data/test/test_field_type.rb +34 -0
- data/test/xsv/test_writer.rb +25 -4
- metadata +65 -2
@@ -0,0 +1,66 @@
|
|
1
|
+
module FlatKit
  # Reads every record from an input, accumulates a FieldStats per field, and
  # writes one summary record per field to an output.
  #
  # Usage is two steps: build the object (which creates the reader and the
  # writer), then invoke #call to do the work and close the writer.
  class Stats
    include ::FlatKit::EventEmitter

    # Sentinel value for +fields_to_stat+ meaning "collect on every field".
    AllFields = Class.new.freeze

    attr_reader :reader
    attr_reader :writer
    attr_reader :fields_to_stat
    attr_reader :stats_to_collect
    attr_reader :stats_by_field

    # input::            path handed to Reader.create_reader_from_path
    # input_fallback::   fallback handed to the reader factory
    # output::           path handed to Writer.create_writer_from_path
    # output_fallback::  fallback handed to the writer factory
    # fields_to_stat::   AllFields, or a collection responding to #include?
    #                    that holds the field names to collect on
    # stats_to_collect:: passed through unchanged to each FieldStats
    def initialize(input:, input_fallback: "auto",
                   output:, output_fallback: "auto",
                   fields_to_stat: AllFields, stats_to_collect: FieldStats::CORE_STATS)

      @fields_to_stat = fields_to_stat
      @stats_to_collect = stats_to_collect
      @stats_by_field = {}
      @record_count = 0

      @reader = ::FlatKit::Reader.create_reader_from_path(path: input, fallback: input_fallback)
      @writer = ::FlatKit::Writer.create_writer_from_path(path: output, fallback: output_fallback,
                                                          reader_format: @reader.format_name)
    end

    # Calculate the statistics, write them out, and close the writer.
    #
    # Returns whatever the writer's #close returns. If calculating or writing
    # raises, the writer is still closed before the error propagates, so a
    # failure part way through does not leave the output open.
    def call
      closed_normally = false
      calculate_stats
      write_stat_records
      closed_normally = true
      @writer.close
    ensure
      # Only reached with the flag unset when an error interrupted the work
      # above; the flag prevents a second close on the normal path.
      @writer.close unless closed_normally
    end

    # True when statistics should be gathered for the field called +name+:
    # either every field was requested, or +name+ is in the requested set.
    def collecting_stats_on_field?(name)
      @fields_to_stat == AllFields || @fields_to_stat.include?(name)
    end

    private

    # Walk every record from the reader, feeding each selected field value
    # into that field's stats and counting the records seen.
    def calculate_stats
      ::FlatKit.logger.debug "Calculating statistics on #{reader.source}"
      reader.each do |record|
        record.to_hash.each do |field_name, field_value|
          next unless collecting_stats_on_field?(field_name)

          update_stats_for_field(name: field_name, value: field_value)
        end
        @record_count += 1
      end
    end

    # Feed +value+ into the FieldStats for +name+, creating it on first use.
    def update_stats_for_field(name:, value:)
      field_stats = @stats_by_field[name] ||= FieldStats.new(name: name, stats_to_collect: @stats_to_collect)
      field_stats.update(value)
    end

    # Write one Jsonl record per field: the field's stats hash plus the total
    # number of records read under "total_record_count".
    def write_stat_records
      @stats_by_field.each_value do |stats|
        h = stats.to_hash.merge("total_record_count" => @record_count)
        record = ::FlatKit::Jsonl::Record.new(data: nil, complete_structured_data: h)

        @writer.write(record)
      end
    end
  end
end
|
data/lib/flat_kit/writer.rb
CHANGED
@@ -12,6 +12,9 @@ module FlatKit
|
|
12
12
|
#
|
13
13
|
class Writer
|
14
14
|
attr_reader :destination
|
15
|
+
attr_reader :output
|
16
|
+
attr_reader :count
|
17
|
+
attr_reader :last_position
|
15
18
|
|
16
19
|
def self.create_writer_from_path(path:, fallback:, reader_format:)
|
17
20
|
fallback = reader_format if fallback == "auto"
|
@@ -21,18 +24,30 @@ module FlatKit
|
|
21
24
|
|
22
25
|
def initialize(destination:)
|
23
26
|
@destination = destination
|
27
|
+
@output = ::FlatKit::Output.from(@destination)
|
28
|
+
@count = 0
|
29
|
+
@last_position = nil
|
24
30
|
end
|
25
31
|
|
26
32
|
def format_name
|
27
33
|
self.class.format_name
|
28
34
|
end
|
29
35
|
|
36
|
+
def current_position
|
37
|
+
::FlatKit::Position.new(index: @count, # since this hasn't been written yet its the right index
|
38
|
+
offset: output.tell,
|
39
|
+
bytesize: 0) # nothing has been written yet
|
40
|
+
end
|
41
|
+
|
42
|
+
# The write method MUST return a Position object detailing the location the
|
43
|
+
# record was written in the output stream.
|
44
|
+
#
|
30
45
|
def write(record)
|
31
|
-
raise NotImplementedError, "#{self.class} needs to implement #write"
|
46
|
+
raise NotImplementedError, "#{self.class} needs to implement #write that returns Position"
|
32
47
|
end
|
33
48
|
|
34
49
|
def close
|
35
|
-
|
50
|
+
output.close
|
36
51
|
end
|
37
52
|
end
|
38
53
|
end
|
data/lib/flat_kit/xsv/writer.rb
CHANGED
@@ -1,9 +1,8 @@
|
|
1
1
|
module FlatKit
|
2
2
|
module Xsv
|
3
3
|
class Writer < ::FlatKit::Writer
|
4
|
-
attr_reader :output
|
5
|
-
attr_reader :count
|
6
4
|
attr_reader :fields
|
5
|
+
attr_reader :header_bytes
|
7
6
|
|
8
7
|
def self.format_name
|
9
8
|
::FlatKit::Xsv::Format.format_name
|
@@ -19,8 +18,6 @@ module FlatKit
|
|
19
18
|
def initialize(destination:, fields: :auto, **csv_options)
|
20
19
|
super(destination: destination)
|
21
20
|
@fields = fields
|
22
|
-
@output = ::FlatKit::Output.from(@destination)
|
23
|
-
@count = 0
|
24
21
|
@we_write_the_header = nil
|
25
22
|
@csv_options = Writer.default_csv_options.dup
|
26
23
|
|
@@ -31,10 +28,16 @@ module FlatKit
|
|
31
28
|
@we_write_the_header = false
|
32
29
|
end
|
33
30
|
|
31
|
+
@header_bytes = 0
|
34
32
|
@csv_options.merge!(csv_options)
|
35
33
|
@csv = CSV.new(output.io, **@csv_options)
|
36
34
|
end
|
37
35
|
|
36
|
+
# write the record and return the Position the record was written
|
37
|
+
#
|
38
|
+
# In the case of the header being written automatically, the Position returned is the
|
39
|
+
# position of the record, not the header
|
40
|
+
#
|
38
41
|
def write(record)
|
39
42
|
case record
|
40
43
|
when FlatKit::Xsv::Record
|
@@ -52,18 +55,30 @@ module FlatKit
|
|
52
55
|
raise ::FlatKit::Error, e
|
53
56
|
end
|
54
57
|
|
55
|
-
def close
|
56
|
-
@output.close
|
57
|
-
end
|
58
|
-
|
59
58
|
private
|
60
59
|
|
61
60
|
def write_record(record)
|
62
61
|
if @we_write_the_header && @count == 0 then
|
63
62
|
@csv << record.ordered_fields
|
63
|
+
@header_bytes = output.tell
|
64
64
|
end
|
65
|
-
|
65
|
+
|
66
|
+
# the index of the record being written is the same as the count of records written so far
|
67
|
+
record_index = @count
|
68
|
+
|
69
|
+
# get the current output stream position to calculate bytes written
|
70
|
+
start_offset = output.tell
|
71
|
+
|
66
72
|
@csv << record.to_a
|
73
|
+
|
74
|
+
ending_offset = output.io.tell
|
75
|
+
bytes_written = (ending_offset - start_offset)
|
76
|
+
|
77
|
+
@count += 1
|
78
|
+
|
79
|
+
@last_position = ::FlatKit::Position.new(index: record_index,
|
80
|
+
offset: start_offset,
|
81
|
+
bytesize: bytes_written)
|
67
82
|
end
|
68
83
|
end
|
69
84
|
end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
require_relative '../test_helper'
|
2
|
+
|
3
|
+
module TestFieldType
  # Tests for FlatKit::FieldType::BooleanType: which values it matches and
  # what they coerce to.
  class TestBooleanType < ::Minitest::Test
    # Sample values the tests treat as "true": several strings plus integer 1.
    def truthy_items
      %w[ yes Y true t 1 y ] << 1
    end

    # Sample values the tests treat as "false": several strings plus integer 0.
    def falsey_items
      %w[ no n false f 0 N ] << 0
    end

    def test_true
      assert(FlatKit::FieldType::BooleanType.matches?(true))
    end

    def test_false
      assert(FlatKit::FieldType::BooleanType.matches?(false))
    end

    def test_truthy_items
      truthy_items.each do |s|
        assert(FlatKit::FieldType::BooleanType.matches?(s), "#{s} should be boolean")
      end
    end

    def test_falsey_items
      falsey_items.each do |s|
        assert(FlatKit::FieldType::BooleanType.matches?(s), "#{s} should be boolean")
      end
    end

    def test_other_class_does_not_match
      refute(FlatKit::FieldType::BooleanType.matches?(Object.new))
    end

    def test_coerces_falsey_to_boolean
      falsey_items.each do |item|
        # The message names the offending item, since the loop hides it.
        refute(FlatKit::FieldType::BooleanType.coerce(item), "#{item.inspect} should coerce to false")
      end
    end

    def test_true_is_truthy
      assert(FlatKit::FieldType::BooleanType.coerce(true))
    end

    def test_false_is_falsey
      refute(FlatKit::FieldType::BooleanType.coerce(false))
    end

    def test_0_is_false
      refute(FlatKit::FieldType::BooleanType.coerce(0))
    end

    # Integer 1 coerces to a truthy value.
    def test_1_is_true
      assert(FlatKit::FieldType::BooleanType.coerce(1))
    end

    # A non-zero float (42.0) coerces to a truthy value.
    def test_42_is_true
      assert(FlatKit::FieldType::BooleanType.coerce(42.0))
    end
  end
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
require_relative '../test_helper'
|
2
|
+
|
3
|
+
module TestFieldType
  # Tests for FlatKit::FieldType::DateType: which values it matches and how
  # it coerces them.
  class TestDateType < ::Minitest::Test
    def test_time_does_not_match
      refute(date_type.matches?(Time.now))
    end

    def test_date
      assert(date_type.matches?(Date.today))
    end

    def test_datetime_does_not_match
      refute(date_type.matches?(DateTime.now))
    end

    # The format list has no duplicates, and the current time rendered with
    # each format is matched as a date.
    def test_formats
      known_formats = date_type.parse_formats

      assert_equal(known_formats.size, known_formats.sort.uniq.size)

      known_formats.each do |format|
        s = now_formatted_with(format)
        assert(date_type.matches?(s), "#{s} should match date")
      end
    end

    def test_other_class_does_not_match
      non_dates = [ 42, Object.new, true, false ]
      non_dates.each do |x|
        refute(date_type.matches?(x), "#{x} should not be date")
      end
    end

    def test_N_number_does_not_match
      x = "N89362"
      refute(date_type.matches?(x), "#{x} should not be date")
    end

    # The current time rendered with each known format coerces to a Date.
    def test_coerce
      date_type.parse_formats.each do |format|
        s = now_formatted_with(format)
        assert_instance_of(Date, date_type.coerce(s), "#{s} should convert to date")
      end
    end

    def test_date_coerce_does_not_passthrough_time
      assert_equal(coerce_failure, date_type.coerce(Time.now))
    end

    def test_date_coerce_passthrough_date
      today = Date.today
      assert_equal(today, date_type.coerce(today))
    end

    def test_date_coerce_does_not_passthrough_datetime
      assert_equal(coerce_failure, date_type.coerce(DateTime.now))
    end

    def test_number_coerce_failure
      assert_equal(coerce_failure, date_type.coerce(42))
    end

    def test_number_coerce_failure_bad_parse
      assert_equal(coerce_failure, date_type.coerce("1234 56 78 90"))
    end

    private

    # The type under test.
    def date_type
      ::FlatKit::FieldType::DateType
    end

    # The value the tests expect from a coercion that does not succeed.
    def coerce_failure
      ::FlatKit::FieldType::CoerceFailure
    end

    # Render the current time using the given strftime format.
    def now_formatted_with(format)
      Time.now.strftime(format.to_s)
    end
  end
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
require_relative '../test_helper'
|
2
|
+
|
3
|
+
module TestFieldType
  # Tests for FlatKit::FieldType::FloatType: which values it matches and how
  # it coerces them.
  class TestFloatType < ::Minitest::Test
    def test_float_matches
      assert(FlatKit::FieldType::FloatType.matches?(42.0))
      assert(FlatKit::FieldType::FloatType.matches?(42.1))
    end

    def test_integer_does_not_match
      refute(FlatKit::FieldType::FloatType.matches?(42))
    end

    def test_string_of_digits_does_not_match
      refute(FlatKit::FieldType::FloatType.matches?("42"))
    end

    def test_string_of_digits_with_dot_matches
      assert(FlatKit::FieldType::FloatType.matches?("42.0"))
    end

    def test_string_of_letters_does_not_match
      refute(FlatKit::FieldType::FloatType.matches?("abc"))
    end

    def test_scientific_notation_matches
      assert(FlatKit::FieldType::FloatType.matches?("1e-10"))
    end

    def test_other_class_does_not_match
      refute(FlatKit::FieldType::FloatType.matches?(Object.new))
    end

    def test_integer_coerces
      assert_equal(42.0, ::FlatKit::FieldType::FloatType.coerce(42))
    end

    def test_integer_strings_coerce
      assert_equal(42.0, ::FlatKit::FieldType::FloatType.coerce("42"))
    end

    def test_float_strings_coerce
      assert_equal(42.6, ::FlatKit::FieldType::FloatType.coerce("42.6"))
    end

    def test_float_coerces
      assert_equal(42.6, ::FlatKit::FieldType::FloatType.coerce(42.6))
    end

    def test_non_numeric_do_not_coerce
      [ "eleven", nil, false, Object.new ].each do |nope|
        # The message names the offending input, since the loop hides it.
        assert_equal(::FlatKit::FieldType::CoerceFailure,
                     ::FlatKit::FieldType::FloatType.coerce(nope),
                     "#{nope.inspect} should not coerce to float")
      end
    end
  end
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
require_relative '../test_helper'
|
2
|
+
|
3
|
+
module TestFieldType
  # Tests for FlatKit::FieldType::GuessType, using nil as the sample value.
  class TestGuessType < ::Minitest::Test

    # GuessType reports no match for nil.
    def test_guess_type_should_not_match_anything
      matched = FlatKit::FieldType::GuessType.matches?(nil)
      refute(matched)
    end

    # Coercing nil through GuessType yields the CoerceFailure marker.
    def test_guess_type_returns_coerce_failure
      coerced = ::FlatKit::FieldType::GuessType.coerce(nil)
      assert_equal(::FlatKit::FieldType::CoerceFailure, coerced)
    end
  end
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
require_relative '../test_helper'
|
2
|
+
|
3
|
+
module TestFieldType
  # Tests for FlatKit::FieldType::IntegerType: which values it matches and
  # how it coerces them.
  class TestIntegerType < ::Minitest::Test
    def test_matches_integer
      assert(FlatKit::FieldType::IntegerType.matches?(42))
    end

    def test_matches_negative_integer
      assert(FlatKit::FieldType::IntegerType.matches?("-42"))
    end

    def test_float_does_not_match
      refute(FlatKit::FieldType::IntegerType.matches?(42.0))
    end

    def test_string_of_digits_matches
      assert(FlatKit::FieldType::IntegerType.matches?("42"))
    end

    def test_string_with_some_digits_does_not_match
      refute(FlatKit::FieldType::IntegerType.matches?("42.0"))
      refute(FlatKit::FieldType::IntegerType.matches?("abc"))
    end

    def test_other_class_does_not_match
      refute(FlatKit::FieldType::IntegerType.matches?(Object.new))
    end

    def test_integer_coerces
      assert_equal(42, ::FlatKit::FieldType::IntegerType.coerce(42))
    end

    def test_integer_strings_coerce
      assert_equal(42, ::FlatKit::FieldType::IntegerType.coerce("42"))
    end

    # A Float input is expected to come back as the integer 42 (not 43).
    def test_float_coerces
      assert_equal(42, ::FlatKit::FieldType::IntegerType.coerce(42.6))
    end

    def test_float_strings_do_not_coerce
      assert_equal(::FlatKit::FieldType::CoerceFailure, ::FlatKit::FieldType::IntegerType.coerce("42.6"))
    end

    def test_non_numeric_do_not_coerce
      [ "eleven", nil, false, Object.new ].each do |nope|
        # The message names the offending input, since the loop hides it.
        assert_equal(::FlatKit::FieldType::CoerceFailure,
                     ::FlatKit::FieldType::IntegerType.coerce(nope),
                     "#{nope.inspect} should not coerce to integer")
      end
    end
  end
end
|