flat_kit 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. checksums.yaml +4 -4
  2. data/HISTORY.md +6 -0
  3. data/Manifest.txt +34 -0
  4. data/Rakefile +1 -0
  5. data/examples/stream-active-record-to-csv.rb +42 -0
  6. data/lib/flat_kit.rb +9 -2
  7. data/lib/flat_kit/cli.rb +11 -0
  8. data/lib/flat_kit/command.rb +1 -0
  9. data/lib/flat_kit/command/stats.rb +94 -0
  10. data/lib/flat_kit/descendant_tracker.rb +9 -0
  11. data/lib/flat_kit/event_emitter.rb +2 -2
  12. data/lib/flat_kit/field_stats.rb +241 -0
  13. data/lib/flat_kit/field_type.rb +75 -0
  14. data/lib/flat_kit/field_type/boolean_type.rb +48 -0
  15. data/lib/flat_kit/field_type/date_type.rb +179 -0
  16. data/lib/flat_kit/field_type/float_type.rb +37 -0
  17. data/lib/flat_kit/field_type/guess_type.rb +20 -0
  18. data/lib/flat_kit/field_type/integer_type.rb +34 -0
  19. data/lib/flat_kit/field_type/null_type.rb +35 -0
  20. data/lib/flat_kit/field_type/string_type.rb +22 -0
  21. data/lib/flat_kit/field_type/timestamp_type.rb +47 -0
  22. data/lib/flat_kit/field_type/unknown_type.rb +26 -0
  23. data/lib/flat_kit/jsonl/record.rb +2 -2
  24. data/lib/flat_kit/jsonl/writer.rb +18 -9
  25. data/lib/flat_kit/merge.rb +4 -2
  26. data/lib/flat_kit/output.rb +4 -0
  27. data/lib/flat_kit/position.rb +19 -0
  28. data/lib/flat_kit/stat_type.rb +65 -0
  29. data/lib/flat_kit/stat_type/nominal_stats.rb +58 -0
  30. data/lib/flat_kit/stat_type/numerical_stats.rb +120 -0
  31. data/lib/flat_kit/stat_type/ordinal_stats.rb +42 -0
  32. data/lib/flat_kit/stats.rb +66 -0
  33. data/lib/flat_kit/writer.rb +17 -2
  34. data/lib/flat_kit/xsv/writer.rb +24 -9
  35. data/test/field_type/test_boolean_type.rb +65 -0
  36. data/test/field_type/test_date_type.rb +71 -0
  37. data/test/field_type/test_float_type.rb +56 -0
  38. data/test/field_type/test_guess_type.rb +14 -0
  39. data/test/field_type/test_integer_type.rb +52 -0
  40. data/test/field_type/test_null_type.rb +41 -0
  41. data/test/field_type/test_string_type.rb +18 -0
  42. data/test/field_type/test_timestamp_type.rb +108 -0
  43. data/test/field_type/test_unknown_type.rb +35 -0
  44. data/test/jsonl/test_writer.rb +21 -3
  45. data/test/run +23 -0
  46. data/test/stat_type/test_nominal_stats.rb +69 -0
  47. data/test/stat_type/test_numerical_stats.rb +118 -0
  48. data/test/stat_type/test_ordinal_stats.rb +92 -0
  49. data/test/test_event_emitter.rb +19 -2
  50. data/test/test_field_stats.rb +134 -0
  51. data/test/test_field_type.rb +34 -0
  52. data/test/xsv/test_writer.rb +25 -4
  53. metadata +65 -2
@@ -0,0 +1,66 @@
1
+ module FlatKit
2
+ class Stats
3
+ include ::FlatKit::EventEmitter
4
+
5
+ AllFields = Class.new.freeze
6
+
7
+ attr_reader :reader
8
+ attr_reader :writer
9
+ attr_reader :fields_to_stat
10
+ attr_reader :stats_to_collect
11
+ attr_reader :stats_by_field
12
+
13
+ def initialize(input:, input_fallback: "auto",
14
+ output:, output_fallback: "auto",
15
+ fields_to_stat: AllFields, stats_to_collect: FieldStats::CORE_STATS)
16
+
17
+ @fields_to_stat = fields_to_stat
18
+ @stats_to_collect = stats_to_collect
19
+ @stats_by_field = Hash.new
20
+ @record_count = 0
21
+
22
+ @reader = ::FlatKit::Reader.create_reader_from_path(path: input, fallback: input_fallback)
23
+ @writer = ::FlatKit::Writer.create_writer_from_path(path: output, fallback: output_fallback,
24
+ reader_format: @reader.format_name)
25
+ end
26
+
27
+ def call
28
+ calculate_stats
29
+ write_stat_records
30
+ @writer.close
31
+ end
32
+
33
+ def collecting_stats_on_field?(name)
34
+ return true if @fields_to_stat == AllFields
35
+ return @fields_to_stat.include?(name)
36
+ end
37
+
38
+ private
39
+
40
+ def calculate_stats
41
+ ::FlatKit.logger.debug "Calculating statistics on #{reader.source}"
42
+ reader.each do |record|
43
+ record.to_hash.each do |field_name, field_value|
44
+ if collecting_stats_on_field?(field_name) then
45
+ update_stats_for_field(name: field_name, value: field_value)
46
+ end
47
+ end
48
+ @record_count += 1
49
+ end
50
+ end
51
+
52
+ def update_stats_for_field(name:, value:)
53
+ field_stats = @stats_by_field[name] ||= FieldStats.new(name: name, stats_to_collect: @stats_to_collect)
54
+ field_stats.update(value)
55
+ end
56
+
57
+ def write_stat_records
58
+ @stats_by_field.each do |name, stats|
59
+ h = stats.to_hash.merge({"total_record_count" => @record_count })
60
+ record = ::FlatKit::Jsonl::Record.new(data: nil, complete_structured_data: h)
61
+
62
+ @writer.write(record)
63
+ end
64
+ end
65
+ end
66
+ end
@@ -12,6 +12,9 @@ module FlatKit
12
12
  #
13
13
  class Writer
14
14
  attr_reader :destination
15
+ attr_reader :output
16
+ attr_reader :count
17
+ attr_reader :last_position
15
18
 
16
19
  def self.create_writer_from_path(path:, fallback:, reader_format:)
17
20
  fallback = reader_format if fallback == "auto"
@@ -21,18 +24,30 @@ module FlatKit
21
24
 
22
25
  def initialize(destination:)
23
26
  @destination = destination
27
+ @output = ::FlatKit::Output.from(@destination)
28
+ @count = 0
29
+ @last_position = nil
24
30
  end
25
31
 
26
32
  def format_name
27
33
  self.class.format_name
28
34
  end
29
35
 
36
+ def current_position
37
+ ::FlatKit::Position.new(index: @count, # since this hasn't been written yet, it's the right index
38
+ offset: output.tell,
39
+ bytesize: 0) # nothing has been written yet
40
+ end
41
+
42
+ # The write method MUST return a Position object detailing the location the
43
+ # record was written in the output stream.
44
+ #
30
45
  def write(record)
31
- raise NotImplementedError, "#{self.class} needs to implement #write"
46
+ raise NotImplementedError, "#{self.class} needs to implement #write that returns Position"
32
47
  end
33
48
 
34
49
  def close
35
- raise NotImplementedError, "#{self.class} needs to implement #close"
50
+ output.close
36
51
  end
37
52
  end
38
53
  end
@@ -1,9 +1,8 @@
1
1
  module FlatKit
2
2
  module Xsv
3
3
  class Writer < ::FlatKit::Writer
4
- attr_reader :output
5
- attr_reader :count
6
4
  attr_reader :fields
5
+ attr_reader :header_bytes
7
6
 
8
7
  def self.format_name
9
8
  ::FlatKit::Xsv::Format.format_name
@@ -19,8 +18,6 @@ module FlatKit
19
18
  def initialize(destination:, fields: :auto, **csv_options)
20
19
  super(destination: destination)
21
20
  @fields = fields
22
- @output = ::FlatKit::Output.from(@destination)
23
- @count = 0
24
21
  @we_write_the_header = nil
25
22
  @csv_options = Writer.default_csv_options.dup
26
23
 
@@ -31,10 +28,16 @@ module FlatKit
31
28
  @we_write_the_header = false
32
29
  end
33
30
 
31
+ @header_bytes = 0
34
32
  @csv_options.merge!(csv_options)
35
33
  @csv = CSV.new(output.io, **@csv_options)
36
34
  end
37
35
 
36
+ # Write the record and return the Position at which the record was written.
37
+ #
38
+ # In the case of the header being written automatically, the Position returned is the
39
+ # position of the record, not the header
40
+ #
38
41
  def write(record)
39
42
  case record
40
43
  when FlatKit::Xsv::Record
@@ -52,18 +55,30 @@ module FlatKit
52
55
  raise ::FlatKit::Error, e
53
56
  end
54
57
 
55
- def close
56
- @output.close
57
- end
58
-
59
58
  private
60
59
 
61
60
  def write_record(record)
62
61
  if @we_write_the_header && @count == 0 then
63
62
  @csv << record.ordered_fields
63
+ @header_bytes = output.tell
64
64
  end
65
- @count += 1
65
+
66
+ # the index of the record being written is the same as the count of records written so far
67
+ record_index = @count
68
+
69
+ # get the current output stream position to calculate bytes written
70
+ start_offset = output.tell
71
+
66
72
  @csv << record.to_a
73
+
74
+ ending_offset = output.io.tell
75
+ bytes_written = (ending_offset - start_offset)
76
+
77
+ @count += 1
78
+
79
+ @last_position = ::FlatKit::Position.new(index: record_index,
80
+ offset: start_offset,
81
+ bytesize: bytes_written)
67
82
  end
68
83
  end
69
84
  end
@@ -0,0 +1,65 @@
1
+ require_relative '../test_helper'
2
+
3
+ module TestFieldType
4
+ class TestBooleanType < ::Minitest::Test
5
+ def truthy_items
6
+ t = %w[ yes Y true t 1 y ]
7
+ t << 1
8
+ end
9
+
10
+ def falsey_items
11
+ f = %w[ no n false f 0 N ]
12
+ f << 0
13
+ end
14
+
15
+ def test_true
16
+ assert(FlatKit::FieldType::BooleanType.matches?(true))
17
+ end
18
+
19
+ def test_false
20
+ assert(FlatKit::FieldType::BooleanType.matches?(false))
21
+ end
22
+
23
+ def test_truthy_items
24
+ truthy_items.each do |s|
25
+ assert(FlatKit::FieldType::BooleanType.matches?(s), "#{s} should be boolean")
26
+ end
27
+ end
28
+
29
+ def test_falsey_items
30
+ falsey_items.each do |s|
31
+ assert(FlatKit::FieldType::BooleanType.matches?(s), "#{s} should be boolean")
32
+ end
33
+ end
34
+
35
+ def test_other_class_does_not_match
36
+ refute(FlatKit::FieldType::BooleanType.matches?(Object.new))
37
+ end
38
+
39
+ def test_coerces_falsey_to_boolean
40
+ falsey_items.each do |t|
41
+ refute(FlatKit::FieldType::BooleanType.coerce(t))
42
+ end
43
+ end
44
+
45
+ def test_true_is_truthy
46
+ assert(FlatKit::FieldType::BooleanType.coerce(true))
47
+ end
48
+
49
+ def test_false_is_falsey
50
+ refute(FlatKit::FieldType::BooleanType.coerce(false))
51
+ end
52
+
53
+ def test_0_is_false
54
+ refute(FlatKit::FieldType::BooleanType.coerce(0))
55
+ end
56
+
57
+ def test_1_is_false
58
+ assert(FlatKit::FieldType::BooleanType.coerce(1))
59
+ end
60
+
61
+ def test_42_is_false
62
+ assert(FlatKit::FieldType::BooleanType.coerce(42.0))
63
+ end
64
+ end
65
+ end
@@ -0,0 +1,71 @@
1
+ require_relative '../test_helper'
2
+
3
+ module TestFieldType
4
+ class TestDateType < ::Minitest::Test
5
+ def test_time_does_not_match
6
+ refute(FlatKit::FieldType::DateType.matches?(Time.now))
7
+ end
8
+
9
+ def test_date
10
+ assert(FlatKit::FieldType::DateType.matches?(Date.today))
11
+ end
12
+
13
+ def test_datetime_does_not_match
14
+ refute(FlatKit::FieldType::DateType.matches?(DateTime.now))
15
+ end
16
+
17
+ def test_formats
18
+ formats = ::FlatKit::FieldType::DateType.parse_formats
19
+
20
+ assert_equal(formats.size, formats.sort.uniq.size)
21
+
22
+ formats.each do |df|
23
+ s = Time.now.strftime("#{df}")
24
+ assert(FlatKit::FieldType::DateType.matches?(s), "#{s} should match date")
25
+ end
26
+ end
27
+
28
+ def test_other_class_does_not_match
29
+ [ 42, Object.new, true, false ].each do |x|
30
+ refute(FlatKit::FieldType::DateType.matches?(x), "#{x} should not be date")
31
+ end
32
+ end
33
+
34
+ def test_N_number_does_not_match
35
+ x = "N89362"
36
+ refute(FlatKit::FieldType::DateType.matches?(x), "#{x} should not be date")
37
+ end
38
+
39
+ def test_coerce
40
+ formats = ::FlatKit::FieldType::DateType.parse_formats
41
+
42
+ formats.each do |df|
43
+ s = Time.now.strftime("#{df}")
44
+ assert_instance_of(Date, FlatKit::FieldType::DateType.coerce(s), "#{s} should convert to date")
45
+ end
46
+ end
47
+
48
+ def test_date_coerce_does_not_passthrough_time
49
+ t = Time.now
50
+ assert_equal(::FlatKit::FieldType::CoerceFailure, FlatKit::FieldType::DateType.coerce(t))
51
+ end
52
+
53
+ def test_date_coerce_passthrough_date
54
+ t = Date.today
55
+ assert_equal(t, FlatKit::FieldType::DateType.coerce(t))
56
+ end
57
+
58
+ def test_date_coerce_does_not_passthrough_datetime
59
+ t = DateTime.now
60
+ assert_equal(::FlatKit::FieldType::CoerceFailure, FlatKit::FieldType::DateType.coerce(t))
61
+ end
62
+
63
+ def test_number_coerce_failure
64
+ assert_equal(::FlatKit::FieldType::CoerceFailure, FlatKit::FieldType::DateType.coerce(42))
65
+ end
66
+
67
+ def test_number_coerce_failure_bad_parse
68
+ assert_equal(::FlatKit::FieldType::CoerceFailure, FlatKit::FieldType::DateType.coerce("1234 56 78 90"))
69
+ end
70
+ end
71
+ end
@@ -0,0 +1,56 @@
1
+ require_relative '../test_helper'
2
+
3
+ module TestFieldType
4
+ class TestFloatType < ::Minitest::Test
5
+ def test_float_matches
6
+ assert(FlatKit::FieldType::FloatType.matches?(42.0))
7
+ assert(FlatKit::FieldType::FloatType.matches?(42.1))
8
+ end
9
+
10
+ def test_integer_does_not_match
11
+ refute(FlatKit::FieldType::FloatType.matches?(42))
12
+ end
13
+
14
+ def test_string_of_digits_does_not_match
15
+ refute(FlatKit::FieldType::FloatType.matches?("42"))
16
+ end
17
+
18
+ def test_string_of_digits_with_dot_matches
19
+ assert(FlatKit::FieldType::FloatType.matches?("42.0"))
20
+ end
21
+
22
+ def test_string_of_leters_does_not_match
23
+ refute(FlatKit::FieldType::FloatType.matches?("abc"))
24
+ end
25
+
26
+ def test_scientific_notation_matches
27
+ assert(FlatKit::FieldType::FloatType.matches?("1e-10"))
28
+ end
29
+
30
+ def test_other_class_does_not_match
31
+ refute(FlatKit::FieldType::FloatType.matches?(Object.new))
32
+ end
33
+
34
+ def test_integer_coerces
35
+ assert_equal(42.0, ::FlatKit::FieldType::FloatType.coerce(42))
36
+ end
37
+
38
+ def test_integer_strings_coerce
39
+ assert_equal(42.0, ::FlatKit::FieldType::FloatType.coerce("42"))
40
+ end
41
+
42
+ def test_float_strings_coerce
43
+ assert_equal(42.6, ::FlatKit::FieldType::FloatType.coerce("42.6"))
44
+ end
45
+
46
+ def test_float_coerces
47
+ assert_equal(42.6, ::FlatKit::FieldType::FloatType.coerce(42.6))
48
+ end
49
+
50
+ def test_non_numercic_do_not_coerce
51
+ [ "eleven", nil, false, Object.new ].each do |nope|
52
+ assert_equal(::FlatKit::FieldType::CoerceFailure, ::FlatKit::FieldType::FloatType.coerce(nope))
53
+ end
54
+ end
55
+ end
56
+ end
@@ -0,0 +1,14 @@
1
+ require_relative '../test_helper'
2
+
3
+ module TestFieldType
4
+ class TestGuessType < ::Minitest::Test
5
+
6
+ def test_guess_type_should_not_match_anything
7
+ refute(FlatKit::FieldType::GuessType.matches?(nil))
8
+ end
9
+
10
+ def test_guess_type_returns_coerce_failure
11
+ assert_equal(::FlatKit::FieldType::CoerceFailure, ::FlatKit::FieldType::GuessType.coerce(nil))
12
+ end
13
+ end
14
+ end
@@ -0,0 +1,52 @@
1
+ require_relative '../test_helper'
2
+
3
+ module TestFieldType
4
+ class TestIntegerType < ::Minitest::Test
5
+ def test_matches_integer
6
+ assert(FlatKit::FieldType::IntegerType.matches?(42))
7
+ end
8
+
9
+ def test_matches_negative_integer
10
+ assert(FlatKit::FieldType::IntegerType.matches?("-42"))
11
+ end
12
+
13
+ def test_float_does_not_match
14
+ refute(FlatKit::FieldType::IntegerType.matches?(42.0))
15
+ end
16
+
17
+ def test_string_of_digits_matches
18
+ assert(FlatKit::FieldType::IntegerType.matches?("42"))
19
+ end
20
+
21
+ def test_string_with_some_digiets_does_not_match
22
+ refute(FlatKit::FieldType::IntegerType.matches?("42.0"))
23
+ refute(FlatKit::FieldType::IntegerType.matches?("abc"))
24
+ end
25
+
26
+ def test_other_class_does_not_match
27
+ refute(FlatKit::FieldType::IntegerType.matches?(Object.new))
28
+ end
29
+
30
+ def test_integer_coerces
31
+ assert_equal(42, ::FlatKit::FieldType::IntegerType.coerce(42))
32
+ end
33
+
34
+ def test_integer_strings_coerce
35
+ assert_equal(42, ::FlatKit::FieldType::IntegerType.coerce("42"))
36
+ end
37
+
38
+ def test_float_coerces
39
+ assert_equal(42, ::FlatKit::FieldType::IntegerType.coerce(42.6))
40
+ end
41
+
42
+ def test_float_strings_do_not_coerce
43
+ assert_equal(::FlatKit::FieldType::CoerceFailure, ::FlatKit::FieldType::IntegerType.coerce("42.6"))
44
+ end
45
+
46
+ def test_non_numercic_do_not_coerce
47
+ [ "eleven", nil, false, Object.new ].each do |nope|
48
+ assert_equal(::FlatKit::FieldType::CoerceFailure, ::FlatKit::FieldType::IntegerType.coerce(nope))
49
+ end
50
+ end
51
+ end
52
+ end