flat_kit 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/HISTORY.md +6 -0
- data/Manifest.txt +34 -0
- data/Rakefile +1 -0
- data/examples/stream-active-record-to-csv.rb +42 -0
- data/lib/flat_kit.rb +9 -2
- data/lib/flat_kit/cli.rb +11 -0
- data/lib/flat_kit/command.rb +1 -0
- data/lib/flat_kit/command/stats.rb +94 -0
- data/lib/flat_kit/descendant_tracker.rb +9 -0
- data/lib/flat_kit/event_emitter.rb +2 -2
- data/lib/flat_kit/field_stats.rb +241 -0
- data/lib/flat_kit/field_type.rb +75 -0
- data/lib/flat_kit/field_type/boolean_type.rb +48 -0
- data/lib/flat_kit/field_type/date_type.rb +179 -0
- data/lib/flat_kit/field_type/float_type.rb +37 -0
- data/lib/flat_kit/field_type/guess_type.rb +20 -0
- data/lib/flat_kit/field_type/integer_type.rb +34 -0
- data/lib/flat_kit/field_type/null_type.rb +35 -0
- data/lib/flat_kit/field_type/string_type.rb +22 -0
- data/lib/flat_kit/field_type/timestamp_type.rb +47 -0
- data/lib/flat_kit/field_type/unknown_type.rb +26 -0
- data/lib/flat_kit/jsonl/record.rb +2 -2
- data/lib/flat_kit/jsonl/writer.rb +18 -9
- data/lib/flat_kit/merge.rb +4 -2
- data/lib/flat_kit/output.rb +4 -0
- data/lib/flat_kit/position.rb +19 -0
- data/lib/flat_kit/stat_type.rb +65 -0
- data/lib/flat_kit/stat_type/nominal_stats.rb +58 -0
- data/lib/flat_kit/stat_type/numerical_stats.rb +120 -0
- data/lib/flat_kit/stat_type/ordinal_stats.rb +42 -0
- data/lib/flat_kit/stats.rb +66 -0
- data/lib/flat_kit/writer.rb +17 -2
- data/lib/flat_kit/xsv/writer.rb +24 -9
- data/test/field_type/test_boolean_type.rb +65 -0
- data/test/field_type/test_date_type.rb +71 -0
- data/test/field_type/test_float_type.rb +56 -0
- data/test/field_type/test_guess_type.rb +14 -0
- data/test/field_type/test_integer_type.rb +52 -0
- data/test/field_type/test_null_type.rb +41 -0
- data/test/field_type/test_string_type.rb +18 -0
- data/test/field_type/test_timestamp_type.rb +108 -0
- data/test/field_type/test_unknown_type.rb +35 -0
- data/test/jsonl/test_writer.rb +21 -3
- data/test/run +23 -0
- data/test/stat_type/test_nominal_stats.rb +69 -0
- data/test/stat_type/test_numerical_stats.rb +118 -0
- data/test/stat_type/test_ordinal_stats.rb +92 -0
- data/test/test_event_emitter.rb +19 -2
- data/test/test_field_stats.rb +134 -0
- data/test/test_field_type.rb +34 -0
- data/test/xsv/test_writer.rb +25 -4
- metadata +65 -2
@@ -0,0 +1,66 @@
|
|
1
|
+
module FlatKit
  # Reads every record from an input, accumulates a FieldStats object per
  # field name, and writes one summary record per field to an output.
  class Stats
    include ::FlatKit::EventEmitter

    # Sentinel used as the default for +fields_to_stat+; when it is in effect
    # no field is filtered out.
    AllFields = Class.new.freeze

    attr_reader :reader, :writer
    attr_reader :fields_to_stat, :stats_to_collect
    attr_reader :stats_by_field

    # input            - path handed to Reader.create_reader_from_path
    # input_fallback   - fallback passed along with the input path
    # output           - path handed to Writer.create_writer_from_path
    # output_fallback  - fallback passed along with the output path
    # fields_to_stat   - AllFields, or a collection answering include?(name)
    # stats_to_collect - forwarded to every FieldStats that gets created
    def initialize(input:, input_fallback: "auto",
                   output:, output_fallback: "auto",
                   fields_to_stat: AllFields, stats_to_collect: FieldStats::CORE_STATS)
      @record_count     = 0
      @stats_by_field   = {}
      @fields_to_stat   = fields_to_stat
      @stats_to_collect = stats_to_collect

      # The reader is created first because the writer is told the reader's format.
      @reader = ::FlatKit::Reader.create_reader_from_path(path: input, fallback: input_fallback)
      @writer = ::FlatKit::Writer.create_writer_from_path(path: output,
                                                          fallback: output_fallback,
                                                          reader_format: @reader.format_name)
    end

    # Gather the statistics, write them out, then close the writer.
    def call
      calculate_stats
      write_stat_records
      @writer.close
    end

    # True when stats are gathered for every field, otherwise whatever
    # fields_to_stat.include?(name) answers.
    def collecting_stats_on_field?(name)
      @fields_to_stat == AllFields || @fields_to_stat.include?(name)
    end

    private

    # Walk every record of the reader, feeding each selected field value into
    # that field's stats and counting the records seen.
    def calculate_stats
      ::FlatKit.logger.debug "Calculating statistics on #{reader.source}"
      reader.each do |record|
        record.to_hash.each do |field_name, field_value|
          next unless collecting_stats_on_field?(field_name)

          update_stats_for_field(name: field_name, value: field_value)
        end
        @record_count += 1
      end
    end

    # Feed +value+ to the FieldStats for +name+, creating it on first use.
    def update_stats_for_field(name:, value:)
      @stats_by_field[name] ||= FieldStats.new(name: name, stats_to_collect: @stats_to_collect)
      @stats_by_field[name].update(value)
    end

    # Emit one Jsonl record per field: the field's stats hash plus the total
    # number of records that were read.
    def write_stat_records
      @stats_by_field.each do |_field_name, field_stats|
        summary = field_stats.to_hash.merge("total_record_count" => @record_count)
        @writer.write(::FlatKit::Jsonl::Record.new(data: nil, complete_structured_data: summary))
      end
    end
  end
end
|
data/lib/flat_kit/writer.rb
CHANGED
@@ -12,6 +12,9 @@ module FlatKit
|
|
12
12
|
#
|
13
13
|
class Writer
|
14
14
|
attr_reader :destination
|
15
|
+
attr_reader :output
|
16
|
+
attr_reader :count
|
17
|
+
attr_reader :last_position
|
15
18
|
|
16
19
|
def self.create_writer_from_path(path:, fallback:, reader_format:)
|
17
20
|
fallback = reader_format if fallback == "auto"
|
@@ -21,18 +24,30 @@ module FlatKit
|
|
21
24
|
|
22
25
|
# Set up the shared writer state.
#
# destination - where records go; it is passed to ::FlatKit::Output.from to
#               obtain the output object.
#
# The record count starts at 0 and there is no last position yet.
def initialize(destination:)
  @destination   = destination
  @count         = 0
  @last_position = nil
  @output        = ::FlatKit::Output.from(@destination)
end
|
25
31
|
|
26
32
|
# Instance-level convenience: the format name reported by this writer's class.
def format_name
  writer_class = self.class
  writer_class.format_name
end
|
29
35
|
|
36
|
+
# Position at which the next record would be written.
#
# Returns a ::FlatKit::Position built from the current record count, the
# output's current tell, and a bytesize of 0.
def current_position
  next_index = @count      # that record has not been written yet, so the count is its index
  here       = output.tell
  ::FlatKit::Position.new(index: next_index,
                          offset: here,
                          bytesize: 0) # nothing has been written for it yet
end
|
41
|
+
|
42
|
+
# Abstract hook for writing +record+ to the output.
#
# Implementations MUST return a Position object detailing the location the
# record was written in the output stream.
#
# Raises NotImplementedError unless a subclass overrides it.
def write(record)
  message = "#{self.class} needs to implement #write that returns Position"
  raise NotImplementedError, message
end
|
33
48
|
|
34
49
|
# Finish writing by closing the underlying output object.
def close
  output.close
end
|
37
52
|
end
|
38
53
|
end
|
data/lib/flat_kit/xsv/writer.rb
CHANGED
@@ -1,9 +1,8 @@
|
|
1
1
|
module FlatKit
|
2
2
|
module Xsv
|
3
3
|
class Writer < ::FlatKit::Writer
|
4
|
-
attr_reader :output
|
5
|
-
attr_reader :count
|
6
4
|
attr_reader :fields
|
5
|
+
attr_reader :header_bytes
|
7
6
|
|
8
7
|
def self.format_name
|
9
8
|
::FlatKit::Xsv::Format.format_name
|
@@ -19,8 +18,6 @@ module FlatKit
|
|
19
18
|
def initialize(destination:, fields: :auto, **csv_options)
|
20
19
|
super(destination: destination)
|
21
20
|
@fields = fields
|
22
|
-
@output = ::FlatKit::Output.from(@destination)
|
23
|
-
@count = 0
|
24
21
|
@we_write_the_header = nil
|
25
22
|
@csv_options = Writer.default_csv_options.dup
|
26
23
|
|
@@ -31,10 +28,16 @@ module FlatKit
|
|
31
28
|
@we_write_the_header = false
|
32
29
|
end
|
33
30
|
|
31
|
+
@header_bytes = 0
|
34
32
|
@csv_options.merge!(csv_options)
|
35
33
|
@csv = CSV.new(output.io, **@csv_options)
|
36
34
|
end
|
37
35
|
|
36
|
+
# write the record and return the Position the record was written
|
37
|
+
#
|
38
|
+
# In the case of the header being written automatically, the Position returned is the
|
39
|
+
# position of the record, not the header
|
40
|
+
#
|
38
41
|
def write(record)
|
39
42
|
case record
|
40
43
|
when FlatKit::Xsv::Record
|
@@ -52,18 +55,30 @@ module FlatKit
|
|
52
55
|
raise ::FlatKit::Error, e
|
53
56
|
end
|
54
57
|
|
55
|
-
def close
|
56
|
-
@output.close
|
57
|
-
end
|
58
|
-
|
59
58
|
private
|
60
59
|
|
61
60
|
# Append +record+ to the CSV stream and remember where it landed.
#
# If this writer is responsible for the header and no record has been written
# yet, the header row (record.ordered_fields) is written first and the
# output's tell afterwards is stored in @header_bytes.
#
# Increments @count, stores the resulting ::FlatKit::Position in
# @last_position and returns it. The position covers the record row only.
def write_record(record)
  if @we_write_the_header && @count.zero?
    @csv << record.ordered_fields
    @header_bytes = output.tell
  end

  # The record about to be written gets the number of records written so far
  # as its index; the offsets on either side of the write give its byte size.
  index_of_record = @count
  offset_before   = output.tell

  @csv << record.to_a

  # NOTE(review): the end offset is read from output.io.tell while the start
  # offset uses output.tell -- presumably equivalent; confirm against Output.
  offset_after = output.io.tell
  @count += 1

  @last_position = ::FlatKit::Position.new(index: index_of_record,
                                           offset: offset_before,
                                           bytesize: offset_after - offset_before)
end
|
68
83
|
end
|
69
84
|
end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
require_relative '../test_helper'
|
2
|
+
|
3
|
+
module TestFieldType
  # Exercises FlatKit::FieldType::BooleanType.matches? and .coerce.
  class TestBooleanType < ::Minitest::Test
    # Strings, plus the Integer 1, that the tests expect to match as boolean.
    def truthy_items
      t = %w[ yes Y true t 1 y ]
      t << 1
    end

    # Strings, plus the Integer 0, that the tests expect to match as boolean
    # and to coerce to a falsey value.
    def falsey_items
      f = %w[ no n false f 0 N ]
      f << 0
    end

    def test_true
      assert(FlatKit::FieldType::BooleanType.matches?(true))
    end

    def test_false
      assert(FlatKit::FieldType::BooleanType.matches?(false))
    end

    def test_truthy_items
      truthy_items.each do |s|
        assert(FlatKit::FieldType::BooleanType.matches?(s), "#{s} should be boolean")
      end
    end

    def test_falsey_items
      falsey_items.each do |s|
        assert(FlatKit::FieldType::BooleanType.matches?(s), "#{s} should be boolean")
      end
    end

    def test_other_class_does_not_match
      refute(FlatKit::FieldType::BooleanType.matches?(Object.new))
    end

    def test_coerces_falsey_to_boolean
      falsey_items.each do |t|
        refute(FlatKit::FieldType::BooleanType.coerce(t))
      end
    end

    def test_true_is_truthy
      assert(FlatKit::FieldType::BooleanType.coerce(true))
    end

    def test_false_is_falsey
      refute(FlatKit::FieldType::BooleanType.coerce(false))
    end

    def test_0_is_false
      refute(FlatKit::FieldType::BooleanType.coerce(0))
    end

    # Formerly named test_1_is_false, although it asserts a truthy result.
    def test_1_is_true
      assert(FlatKit::FieldType::BooleanType.coerce(1))
    end

    # Formerly named test_42_is_false, although it asserts a truthy result.
    def test_42_is_true
      assert(FlatKit::FieldType::BooleanType.coerce(42.0))
    end
  end
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
require_relative '../test_helper'
|
2
|
+
|
3
|
+
module TestFieldType
  # Exercises FlatKit::FieldType::DateType.matches?, .coerce and .parse_formats.
  class TestDateType < ::Minitest::Test
    def test_time_does_not_match
      now = Time.now
      refute(date_type.matches?(now))
    end

    def test_date
      today = Date.today
      assert(date_type.matches?(today))
    end

    def test_datetime_does_not_match
      now = DateTime.now
      refute(date_type.matches?(now))
    end

    # The format list must contain no duplicates, and the current time rendered
    # with each format must match as a date.
    def test_formats
      all_formats = date_type.parse_formats
      deduplicated = all_formats.sort.uniq

      assert_equal(all_formats.size, deduplicated.size)

      all_formats.each do |df|
        s = Time.now.strftime(df.to_s)
        assert(date_type.matches?(s), "#{s} should match date")
      end
    end

    def test_other_class_does_not_match
      non_dates = [ 42, Object.new, true, false ]
      non_dates.each do |x|
        refute(date_type.matches?(x), "#{x} should not be date")
      end
    end

    def test_N_number_does_not_match
      x = "N89362"
      is_date = date_type.matches?(x)
      refute(is_date, "#{x} should not be date")
    end

    # The current time rendered with each format must coerce to a Date.
    def test_coerce
      date_type.parse_formats.each do |df|
        s = Time.now.strftime(df.to_s)
        coerced = date_type.coerce(s)
        assert_instance_of(Date, coerced, "#{s} should convert to date")
      end
    end

    def test_date_coerce_does_not_passthrough_time
      coerced = date_type.coerce(Time.now)
      assert_equal(coerce_failure, coerced)
    end

    def test_date_coerce_passthrough_date
      today = Date.today
      assert_equal(today, date_type.coerce(today))
    end

    def test_date_coerce_does_not_passthrough_datetime
      coerced = date_type.coerce(DateTime.now)
      assert_equal(coerce_failure, coerced)
    end

    def test_number_coerce_failure
      coerced = date_type.coerce(42)
      assert_equal(coerce_failure, coerced)
    end

    def test_number_coerce_failure_bad_parse
      coerced = date_type.coerce("1234 56 78 90")
      assert_equal(coerce_failure, coerced)
    end

    private

    # The type under test.
    def date_type
      ::FlatKit::FieldType::DateType
    end

    # The value coerce is expected to return when it cannot convert.
    def coerce_failure
      ::FlatKit::FieldType::CoerceFailure
    end
  end
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
require_relative '../test_helper'
|
2
|
+
|
3
|
+
module TestFieldType
  # Exercises FlatKit::FieldType::FloatType.matches? and .coerce.
  class TestFloatType < ::Minitest::Test
    def test_float_matches
      [ 42.0, 42.1 ].each do |number|
        assert(float_type.matches?(number))
      end
    end

    def test_integer_does_not_match
      matched = float_type.matches?(42)
      refute(matched)
    end

    def test_string_of_digits_does_not_match
      matched = float_type.matches?("42")
      refute(matched)
    end

    def test_string_of_digits_with_dot_matches
      matched = float_type.matches?("42.0")
      assert(matched)
    end

    def test_string_of_leters_does_not_match
      matched = float_type.matches?("abc")
      refute(matched)
    end

    def test_scientific_notation_matches
      matched = float_type.matches?("1e-10")
      assert(matched)
    end

    def test_other_class_does_not_match
      matched = float_type.matches?(Object.new)
      refute(matched)
    end

    def test_integer_coerces
      coerced = float_type.coerce(42)
      assert_equal(42.0, coerced)
    end

    def test_integer_strings_coerce
      coerced = float_type.coerce("42")
      assert_equal(42.0, coerced)
    end

    def test_float_strings_coerce
      coerced = float_type.coerce("42.6")
      assert_equal(42.6, coerced)
    end

    def test_float_coerces
      coerced = float_type.coerce(42.6)
      assert_equal(42.6, coerced)
    end

    # A word, nil, false and a bare Object must each yield CoerceFailure.
    def test_non_numercic_do_not_coerce
      rejected = [ "eleven", nil, false, Object.new ]
      rejected.each do |nope|
        assert_equal(coerce_failure, float_type.coerce(nope))
      end
    end

    private

    # The type under test.
    def float_type
      ::FlatKit::FieldType::FloatType
    end

    # The value coerce is expected to return when it cannot convert.
    def coerce_failure
      ::FlatKit::FieldType::CoerceFailure
    end
  end
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
require_relative '../test_helper'
|
2
|
+
|
3
|
+
module TestFieldType
  # Exercises FlatKit::FieldType::GuessType with a nil input.
  class TestGuessType < ::Minitest::Test
    # matches?(nil) is expected to be falsey.
    def test_guess_type_should_not_match_anything
      matched = FlatKit::FieldType::GuessType.matches?(nil)
      refute(matched)
    end

    # coerce(nil) is expected to give back the CoerceFailure marker.
    def test_guess_type_returns_coerce_failure
      coerced = ::FlatKit::FieldType::GuessType.coerce(nil)
      assert_equal(::FlatKit::FieldType::CoerceFailure, coerced)
    end
  end
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
require_relative '../test_helper'
|
2
|
+
|
3
|
+
module TestFieldType
  # Exercises FlatKit::FieldType::IntegerType.matches? and .coerce.
  class TestIntegerType < ::Minitest::Test
    def test_matches_integer
      matched = integer_type.matches?(42)
      assert(matched)
    end

    def test_matches_negative_integer
      matched = integer_type.matches?("-42")
      assert(matched)
    end

    def test_float_does_not_match
      matched = integer_type.matches?(42.0)
      refute(matched)
    end

    def test_string_of_digits_matches
      matched = integer_type.matches?("42")
      assert(matched)
    end

    # Neither a decimal string nor a purely alphabetic string may match.
    def test_string_with_some_digiets_does_not_match
      [ "42.0", "abc" ].each do |text|
        refute(integer_type.matches?(text))
      end
    end

    def test_other_class_does_not_match
      matched = integer_type.matches?(Object.new)
      refute(matched)
    end

    def test_integer_coerces
      coerced = integer_type.coerce(42)
      assert_equal(42, coerced)
    end

    def test_integer_strings_coerce
      coerced = integer_type.coerce("42")
      assert_equal(42, coerced)
    end

    # A Float input is expected to come back as the Integer 42.
    def test_float_coerces
      coerced = integer_type.coerce(42.6)
      assert_equal(42, coerced)
    end

    def test_float_strings_do_not_coerce
      coerced = integer_type.coerce("42.6")
      assert_equal(coerce_failure, coerced)
    end

    # A word, nil, false and a bare Object must each yield CoerceFailure.
    def test_non_numercic_do_not_coerce
      rejected = [ "eleven", nil, false, Object.new ]
      rejected.each do |nope|
        assert_equal(coerce_failure, integer_type.coerce(nope))
      end
    end

    private

    # The type under test.
    def integer_type
      ::FlatKit::FieldType::IntegerType
    end

    # The value coerce is expected to return when it cannot convert.
    def coerce_failure
      ::FlatKit::FieldType::CoerceFailure
    end
  end
end
|