flat_kit 0.2.0 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (53) hide show
  1. checksums.yaml +4 -4
  2. data/HISTORY.md +6 -0
  3. data/Manifest.txt +34 -0
  4. data/Rakefile +1 -0
  5. data/examples/stream-active-record-to-csv.rb +42 -0
  6. data/lib/flat_kit.rb +9 -2
  7. data/lib/flat_kit/cli.rb +11 -0
  8. data/lib/flat_kit/command.rb +1 -0
  9. data/lib/flat_kit/command/stats.rb +94 -0
  10. data/lib/flat_kit/descendant_tracker.rb +9 -0
  11. data/lib/flat_kit/event_emitter.rb +2 -2
  12. data/lib/flat_kit/field_stats.rb +241 -0
  13. data/lib/flat_kit/field_type.rb +75 -0
  14. data/lib/flat_kit/field_type/boolean_type.rb +48 -0
  15. data/lib/flat_kit/field_type/date_type.rb +179 -0
  16. data/lib/flat_kit/field_type/float_type.rb +37 -0
  17. data/lib/flat_kit/field_type/guess_type.rb +20 -0
  18. data/lib/flat_kit/field_type/integer_type.rb +34 -0
  19. data/lib/flat_kit/field_type/null_type.rb +35 -0
  20. data/lib/flat_kit/field_type/string_type.rb +22 -0
  21. data/lib/flat_kit/field_type/timestamp_type.rb +47 -0
  22. data/lib/flat_kit/field_type/unknown_type.rb +26 -0
  23. data/lib/flat_kit/jsonl/record.rb +2 -2
  24. data/lib/flat_kit/jsonl/writer.rb +18 -9
  25. data/lib/flat_kit/merge.rb +4 -2
  26. data/lib/flat_kit/output.rb +4 -0
  27. data/lib/flat_kit/position.rb +19 -0
  28. data/lib/flat_kit/stat_type.rb +65 -0
  29. data/lib/flat_kit/stat_type/nominal_stats.rb +58 -0
  30. data/lib/flat_kit/stat_type/numerical_stats.rb +120 -0
  31. data/lib/flat_kit/stat_type/ordinal_stats.rb +42 -0
  32. data/lib/flat_kit/stats.rb +66 -0
  33. data/lib/flat_kit/writer.rb +17 -2
  34. data/lib/flat_kit/xsv/writer.rb +24 -9
  35. data/test/field_type/test_boolean_type.rb +65 -0
  36. data/test/field_type/test_date_type.rb +71 -0
  37. data/test/field_type/test_float_type.rb +56 -0
  38. data/test/field_type/test_guess_type.rb +14 -0
  39. data/test/field_type/test_integer_type.rb +52 -0
  40. data/test/field_type/test_null_type.rb +41 -0
  41. data/test/field_type/test_string_type.rb +18 -0
  42. data/test/field_type/test_timestamp_type.rb +108 -0
  43. data/test/field_type/test_unknown_type.rb +35 -0
  44. data/test/jsonl/test_writer.rb +21 -3
  45. data/test/run +23 -0
  46. data/test/stat_type/test_nominal_stats.rb +69 -0
  47. data/test/stat_type/test_numerical_stats.rb +118 -0
  48. data/test/stat_type/test_ordinal_stats.rb +92 -0
  49. data/test/test_event_emitter.rb +19 -2
  50. data/test/test_field_stats.rb +134 -0
  51. data/test/test_field_type.rb +34 -0
  52. data/test/xsv/test_writer.rb +25 -4
  53. metadata +65 -2
@@ -0,0 +1,66 @@
1
+ module FlatKit
2
+ class Stats
3
+ include ::FlatKit::EventEmitter
4
+
5
+ AllFields = Class.new.freeze
6
+
7
+ attr_reader :reader
8
+ attr_reader :writer
9
+ attr_reader :fields_to_stat
10
+ attr_reader :stats_to_collect
11
+ attr_reader :stats_by_field
12
+
13
+ def initialize(input:, input_fallback: "auto",
14
+ output:, output_fallback: "auto",
15
+ fields_to_stat: AllFields, stats_to_collect: FieldStats::CORE_STATS)
16
+
17
+ @fields_to_stat = fields_to_stat
18
+ @stats_to_collect = stats_to_collect
19
+ @stats_by_field = Hash.new
20
+ @record_count = 0
21
+
22
+ @reader = ::FlatKit::Reader.create_reader_from_path(path: input, fallback: input_fallback)
23
+ @writer = ::FlatKit::Writer.create_writer_from_path(path: output, fallback: output_fallback,
24
+ reader_format: @reader.format_name)
25
+ end
26
+
27
+ def call
28
+ calculate_stats
29
+ write_stat_records
30
+ @writer.close
31
+ end
32
+
33
+ def collecting_stats_on_field?(name)
34
+ return true if @fields_to_stat == AllFields
35
+ return @fields_to_stat.include?(name)
36
+ end
37
+
38
+ private
39
+
40
+ def calculate_stats
41
+ ::FlatKit.logger.debug "Calculating statistics on #{reader.source}"
42
+ reader.each do |record|
43
+ record.to_hash.each do |field_name, field_value|
44
+ if collecting_stats_on_field?(field_name) then
45
+ update_stats_for_field(name: field_name, value: field_value)
46
+ end
47
+ end
48
+ @record_count += 1
49
+ end
50
+ end
51
+
52
+ def update_stats_for_field(name:, value:)
53
+ field_stats = @stats_by_field[name] ||= FieldStats.new(name: name, stats_to_collect: @stats_to_collect)
54
+ field_stats.update(value)
55
+ end
56
+
57
+ def write_stat_records
58
+ @stats_by_field.each do |name, stats|
59
+ h = stats.to_hash.merge({"total_record_count" => @record_count })
60
+ record = ::FlatKit::Jsonl::Record.new(data: nil, complete_structured_data: h)
61
+
62
+ @writer.write(record)
63
+ end
64
+ end
65
+ end
66
+ end
@@ -12,6 +12,9 @@ module FlatKit
12
12
  #
13
13
  class Writer
14
14
  attr_reader :destination
15
+ attr_reader :output
16
+ attr_reader :count
17
+ attr_reader :last_position
15
18
 
16
19
  def self.create_writer_from_path(path:, fallback:, reader_format:)
17
20
  fallback = reader_format if fallback == "auto"
@@ -21,18 +24,30 @@ module FlatKit
21
24
 
22
25
  def initialize(destination:)
23
26
  @destination = destination
27
+ @output = ::FlatKit::Output.from(@destination)
28
+ @count = 0
29
+ @last_position = nil
24
30
  end
25
31
 
26
32
  def format_name
27
33
  self.class.format_name
28
34
  end
29
35
 
36
+ def current_position
37
+ ::FlatKit::Position.new(index: @count, # since this hasn't been written yet it's the right index
38
+ offset: output.tell,
39
+ bytesize: 0) # nothing has been written yet
40
+ end
41
+
42
+ # The write method MUST return a Position object detailing the location the
43
+ # record was written in the output stream.
44
+ #
30
45
  def write(record)
31
- raise NotImplementedError, "#{self.class} needs to implement #write"
46
+ raise NotImplementedError, "#{self.class} needs to implement #write that returns Position"
32
47
  end
33
48
 
34
49
  def close
35
- raise NotImplementedError, "#{self.class} needs to implement #close"
50
+ output.close
36
51
  end
37
52
  end
38
53
  end
@@ -1,9 +1,8 @@
1
1
  module FlatKit
2
2
  module Xsv
3
3
  class Writer < ::FlatKit::Writer
4
- attr_reader :output
5
- attr_reader :count
6
4
  attr_reader :fields
5
+ attr_reader :header_bytes
7
6
 
8
7
  def self.format_name
9
8
  ::FlatKit::Xsv::Format.format_name
@@ -19,8 +18,6 @@ module FlatKit
19
18
  def initialize(destination:, fields: :auto, **csv_options)
20
19
  super(destination: destination)
21
20
  @fields = fields
22
- @output = ::FlatKit::Output.from(@destination)
23
- @count = 0
24
21
  @we_write_the_header = nil
25
22
  @csv_options = Writer.default_csv_options.dup
26
23
 
@@ -31,10 +28,16 @@ module FlatKit
31
28
  @we_write_the_header = false
32
29
  end
33
30
 
31
+ @header_bytes = 0
34
32
  @csv_options.merge!(csv_options)
35
33
  @csv = CSV.new(output.io, **@csv_options)
36
34
  end
37
35
 
36
+ # write the record and return the Position at which the record was written
37
+ #
38
+ # In the case of the header being written automatically, the Position returned is the
39
+ # position of the record, not the header
40
+ #
38
41
  def write(record)
39
42
  case record
40
43
  when FlatKit::Xsv::Record
@@ -52,18 +55,30 @@ module FlatKit
52
55
  raise ::FlatKit::Error, e
53
56
  end
54
57
 
55
- def close
56
- @output.close
57
- end
58
-
59
58
  private
60
59
 
61
60
  def write_record(record)
62
61
  if @we_write_the_header && @count == 0 then
63
62
  @csv << record.ordered_fields
63
+ @header_bytes = output.tell
64
64
  end
65
- @count += 1
65
+
66
+ # the index of the record being written is the same as the count of records written so far
67
+ record_index = @count
68
+
69
+ # get the current output stream position to calculate bytes written
70
+ start_offset = output.tell
71
+
66
72
  @csv << record.to_a
73
+
74
+ ending_offset = output.io.tell
75
+ bytes_written = (ending_offset - start_offset)
76
+
77
+ @count += 1
78
+
79
+ @last_position = ::FlatKit::Position.new(index: record_index,
80
+ offset: start_offset,
81
+ bytesize: bytes_written)
67
82
  end
68
83
  end
69
84
  end
@@ -0,0 +1,65 @@
1
+ require_relative '../test_helper'
2
+
3
+ module TestFieldType
4
+ class TestBooleanType < ::Minitest::Test
5
+ def truthy_items
6
+ t = %w[ yes Y true t 1 y ]
7
+ t << 1
8
+ end
9
+
10
+ def falsey_items
11
+ f = %w[ no n false f 0 N ]
12
+ f << 0
13
+ end
14
+
15
+ def test_true
16
+ assert(FlatKit::FieldType::BooleanType.matches?(true))
17
+ end
18
+
19
+ def test_false
20
+ assert(FlatKit::FieldType::BooleanType.matches?(false))
21
+ end
22
+
23
+ def test_truthy_items
24
+ truthy_items.each do |s|
25
+ assert(FlatKit::FieldType::BooleanType.matches?(s), "#{s} should be boolean")
26
+ end
27
+ end
28
+
29
+ def test_falsey_items
30
+ falsey_items.each do |s|
31
+ assert(FlatKit::FieldType::BooleanType.matches?(s), "#{s} should be boolean")
32
+ end
33
+ end
34
+
35
+ def test_other_class_does_not_match
36
+ refute(FlatKit::FieldType::BooleanType.matches?(Object.new))
37
+ end
38
+
39
+ def test_coerces_falsey_to_boolean
40
+ falsey_items.each do |t|
41
+ refute(FlatKit::FieldType::BooleanType.coerce(t))
42
+ end
43
+ end
44
+
45
+ def test_true_is_truthy
46
+ assert(FlatKit::FieldType::BooleanType.coerce(true))
47
+ end
48
+
49
+ def test_false_is_falsey
50
+ refute(FlatKit::FieldType::BooleanType.coerce(false))
51
+ end
52
+
53
+ def test_0_is_false
54
+ refute(FlatKit::FieldType::BooleanType.coerce(0))
55
+ end
56
+
57
+ def test_1_is_false
58
+ assert(FlatKit::FieldType::BooleanType.coerce(1))
59
+ end
60
+
61
+ def test_42_is_false
62
+ assert(FlatKit::FieldType::BooleanType.coerce(42.0))
63
+ end
64
+ end
65
+ end
@@ -0,0 +1,71 @@
1
+ require_relative '../test_helper'
2
+
3
+ module TestFieldType
4
+ class TestDateType < ::Minitest::Test
5
+ def test_time_does_not_match
6
+ refute(FlatKit::FieldType::DateType.matches?(Time.now))
7
+ end
8
+
9
+ def test_date
10
+ assert(FlatKit::FieldType::DateType.matches?(Date.today))
11
+ end
12
+
13
+ def test_datetime_does_not_match
14
+ refute(FlatKit::FieldType::DateType.matches?(DateTime.now))
15
+ end
16
+
17
+ def test_formats
18
+ formats = ::FlatKit::FieldType::DateType.parse_formats
19
+
20
+ assert_equal(formats.size, formats.sort.uniq.size)
21
+
22
+ formats.each do |df|
23
+ s = Time.now.strftime("#{df}")
24
+ assert(FlatKit::FieldType::DateType.matches?(s), "#{s} should match date")
25
+ end
26
+ end
27
+
28
+ def test_other_class_does_not_match
29
+ [ 42, Object.new, true, false ].each do |x|
30
+ refute(FlatKit::FieldType::DateType.matches?(x), "#{x} should not be date")
31
+ end
32
+ end
33
+
34
+ def test_N_number_does_not_match
35
+ x = "N89362"
36
+ refute(FlatKit::FieldType::DateType.matches?(x), "#{x} should not be date")
37
+ end
38
+
39
+ def test_coerce
40
+ formats = ::FlatKit::FieldType::DateType.parse_formats
41
+
42
+ formats.each do |df|
43
+ s = Time.now.strftime("#{df}")
44
+ assert_instance_of(Date, FlatKit::FieldType::DateType.coerce(s), "#{s} should convert to date")
45
+ end
46
+ end
47
+
48
+ def test_date_coerce_does_not_passthrough_time
49
+ t = Time.now
50
+ assert_equal(::FlatKit::FieldType::CoerceFailure, FlatKit::FieldType::DateType.coerce(t))
51
+ end
52
+
53
+ def test_date_coerce_passthrough_date
54
+ t = Date.today
55
+ assert_equal(t, FlatKit::FieldType::DateType.coerce(t))
56
+ end
57
+
58
+ def test_date_coerce_does_not_passthrough_datetime
59
+ t = DateTime.now
60
+ assert_equal(::FlatKit::FieldType::CoerceFailure, FlatKit::FieldType::DateType.coerce(t))
61
+ end
62
+
63
+ def test_number_coerce_failure
64
+ assert_equal(::FlatKit::FieldType::CoerceFailure, FlatKit::FieldType::DateType.coerce(42))
65
+ end
66
+
67
+ def test_number_coerce_failure_bad_parse
68
+ assert_equal(::FlatKit::FieldType::CoerceFailure, FlatKit::FieldType::DateType.coerce("1234 56 78 90"))
69
+ end
70
+ end
71
+ end
@@ -0,0 +1,56 @@
1
+ require_relative '../test_helper'
2
+
3
+ module TestFieldType
4
+ class TestFloatType < ::Minitest::Test
5
+ def test_float_matches
6
+ assert(FlatKit::FieldType::FloatType.matches?(42.0))
7
+ assert(FlatKit::FieldType::FloatType.matches?(42.1))
8
+ end
9
+
10
+ def test_integer_does_not_match
11
+ refute(FlatKit::FieldType::FloatType.matches?(42))
12
+ end
13
+
14
+ def test_string_of_digits_does_not_match
15
+ refute(FlatKit::FieldType::FloatType.matches?("42"))
16
+ end
17
+
18
+ def test_string_of_digits_with_dot_matches
19
+ assert(FlatKit::FieldType::FloatType.matches?("42.0"))
20
+ end
21
+
22
+ def test_string_of_leters_does_not_match
23
+ refute(FlatKit::FieldType::FloatType.matches?("abc"))
24
+ end
25
+
26
+ def test_scientific_notation_matches
27
+ assert(FlatKit::FieldType::FloatType.matches?("1e-10"))
28
+ end
29
+
30
+ def test_other_class_does_not_match
31
+ refute(FlatKit::FieldType::FloatType.matches?(Object.new))
32
+ end
33
+
34
+ def test_integer_coerces
35
+ assert_equal(42.0, ::FlatKit::FieldType::FloatType.coerce(42))
36
+ end
37
+
38
+ def test_integer_strings_coerce
39
+ assert_equal(42.0, ::FlatKit::FieldType::FloatType.coerce("42"))
40
+ end
41
+
42
+ def test_float_strings_coerce
43
+ assert_equal(42.6, ::FlatKit::FieldType::FloatType.coerce("42.6"))
44
+ end
45
+
46
+ def test_float_coerces
47
+ assert_equal(42.6, ::FlatKit::FieldType::FloatType.coerce(42.6))
48
+ end
49
+
50
+ def test_non_numercic_do_not_coerce
51
+ [ "eleven", nil, false, Object.new ].each do |nope|
52
+ assert_equal(::FlatKit::FieldType::CoerceFailure, ::FlatKit::FieldType::FloatType.coerce(nope))
53
+ end
54
+ end
55
+ end
56
+ end
@@ -0,0 +1,14 @@
1
+ require_relative '../test_helper'
2
+
3
+ module TestFieldType
4
+ class TestGuessType < ::Minitest::Test
5
+
6
+ def test_guess_type_should_not_match_anything
7
+ refute(FlatKit::FieldType::GuessType.matches?(nil))
8
+ end
9
+
10
+ def test_guess_type_returns_coerce_failure
11
+ assert_equal(::FlatKit::FieldType::CoerceFailure, ::FlatKit::FieldType::GuessType.coerce(nil))
12
+ end
13
+ end
14
+ end
@@ -0,0 +1,52 @@
1
+ require_relative '../test_helper'
2
+
3
+ module TestFieldType
4
+ class TestIntegerType < ::Minitest::Test
5
+ def test_matches_integer
6
+ assert(FlatKit::FieldType::IntegerType.matches?(42))
7
+ end
8
+
9
+ def test_matches_negative_integer
10
+ assert(FlatKit::FieldType::IntegerType.matches?("-42"))
11
+ end
12
+
13
+ def test_float_does_not_match
14
+ refute(FlatKit::FieldType::IntegerType.matches?(42.0))
15
+ end
16
+
17
+ def test_string_of_digits_matches
18
+ assert(FlatKit::FieldType::IntegerType.matches?("42"))
19
+ end
20
+
21
+ def test_string_with_some_digiets_does_not_match
22
+ refute(FlatKit::FieldType::IntegerType.matches?("42.0"))
23
+ refute(FlatKit::FieldType::IntegerType.matches?("abc"))
24
+ end
25
+
26
+ def test_other_class_does_not_match
27
+ refute(FlatKit::FieldType::IntegerType.matches?(Object.new))
28
+ end
29
+
30
+ def test_integer_coerces
31
+ assert_equal(42, ::FlatKit::FieldType::IntegerType.coerce(42))
32
+ end
33
+
34
+ def test_integer_strings_coerce
35
+ assert_equal(42, ::FlatKit::FieldType::IntegerType.coerce("42"))
36
+ end
37
+
38
+ def test_float_coerces
39
+ assert_equal(42, ::FlatKit::FieldType::IntegerType.coerce(42.6))
40
+ end
41
+
42
+ def test_float_strings_do_not_coerce
43
+ assert_equal(::FlatKit::FieldType::CoerceFailure, ::FlatKit::FieldType::IntegerType.coerce("42.6"))
44
+ end
45
+
46
+ def test_non_numercic_do_not_coerce
47
+ [ "eleven", nil, false, Object.new ].each do |nope|
48
+ assert_equal(::FlatKit::FieldType::CoerceFailure, ::FlatKit::FieldType::IntegerType.coerce(nope))
49
+ end
50
+ end
51
+ end
52
+ end