flat_kit 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. checksums.yaml +4 -4
  2. data/HISTORY.md +6 -0
  3. data/Manifest.txt +34 -0
  4. data/Rakefile +1 -0
  5. data/examples/stream-active-record-to-csv.rb +42 -0
  6. data/lib/flat_kit.rb +9 -2
  7. data/lib/flat_kit/cli.rb +11 -0
  8. data/lib/flat_kit/command.rb +1 -0
  9. data/lib/flat_kit/command/stats.rb +94 -0
  10. data/lib/flat_kit/descendant_tracker.rb +9 -0
  11. data/lib/flat_kit/event_emitter.rb +2 -2
  12. data/lib/flat_kit/field_stats.rb +241 -0
  13. data/lib/flat_kit/field_type.rb +75 -0
  14. data/lib/flat_kit/field_type/boolean_type.rb +48 -0
  15. data/lib/flat_kit/field_type/date_type.rb +179 -0
  16. data/lib/flat_kit/field_type/float_type.rb +37 -0
  17. data/lib/flat_kit/field_type/guess_type.rb +20 -0
  18. data/lib/flat_kit/field_type/integer_type.rb +34 -0
  19. data/lib/flat_kit/field_type/null_type.rb +35 -0
  20. data/lib/flat_kit/field_type/string_type.rb +22 -0
  21. data/lib/flat_kit/field_type/timestamp_type.rb +47 -0
  22. data/lib/flat_kit/field_type/unknown_type.rb +26 -0
  23. data/lib/flat_kit/jsonl/record.rb +2 -2
  24. data/lib/flat_kit/jsonl/writer.rb +18 -9
  25. data/lib/flat_kit/merge.rb +4 -2
  26. data/lib/flat_kit/output.rb +4 -0
  27. data/lib/flat_kit/position.rb +19 -0
  28. data/lib/flat_kit/stat_type.rb +65 -0
  29. data/lib/flat_kit/stat_type/nominal_stats.rb +58 -0
  30. data/lib/flat_kit/stat_type/numerical_stats.rb +120 -0
  31. data/lib/flat_kit/stat_type/ordinal_stats.rb +42 -0
  32. data/lib/flat_kit/stats.rb +66 -0
  33. data/lib/flat_kit/writer.rb +17 -2
  34. data/lib/flat_kit/xsv/writer.rb +24 -9
  35. data/test/field_type/test_boolean_type.rb +65 -0
  36. data/test/field_type/test_date_type.rb +71 -0
  37. data/test/field_type/test_float_type.rb +56 -0
  38. data/test/field_type/test_guess_type.rb +14 -0
  39. data/test/field_type/test_integer_type.rb +52 -0
  40. data/test/field_type/test_null_type.rb +41 -0
  41. data/test/field_type/test_string_type.rb +18 -0
  42. data/test/field_type/test_timestamp_type.rb +108 -0
  43. data/test/field_type/test_unknown_type.rb +35 -0
  44. data/test/jsonl/test_writer.rb +21 -3
  45. data/test/run +23 -0
  46. data/test/stat_type/test_nominal_stats.rb +69 -0
  47. data/test/stat_type/test_numerical_stats.rb +118 -0
  48. data/test/stat_type/test_ordinal_stats.rb +92 -0
  49. data/test/test_event_emitter.rb +19 -2
  50. data/test/test_field_stats.rb +134 -0
  51. data/test/test_field_type.rb +34 -0
  52. data/test/xsv/test_writer.rb +25 -4
  53. metadata +65 -2
@@ -0,0 +1,47 @@
1
module FlatKit
  class FieldType
    # Field type for values that carry both a date and a time of day.
    #
    # Strings are tried against every entry of parse_formats, in order, using
    # Time.strptime; the first format that parses wins.
    class TimestampType < FieldType

      # The strptime formats attempted by coerce, in priority order.
      #
      # A trailing "Z" (the UTC designator) is parsed with %z instead of being
      # matched as a literal character. Time.strptime builds a *local* time
      # when the format captures no zone, so a literal "Z" caused UTC
      # timestamps to be read as if they were in the machine's local zone and
      # then shifted by the subsequent conversion to UTC.
      #
      # Returns a frozen Array of String, memoized on the class.
      def self.parse_formats
        @timestamp_formats ||= [
          "%Y-%m-%d %H:%M:%S.%N%z",
          "%Y-%m-%d %H:%M:%S.%N",
          "%Y-%m-%dT%H:%M:%S.%N%z", # w3cdtf
          "%Y-%m-%d %H:%M:%S",
          "%Y-%m-%dT%H:%M:%S%z",    # also covers a trailing "Z"
          "%Y%m%dT%H%M%S",
          "%a, %d %b %Y %H:%M:%S %z", # rfc2822, httpdate
        ].freeze
      end

      # The name of this field type.
      def self.type_name
        "timestamp"
      end

      # Returns true when data is a Time or a String that coerce can parse.
      def self.matches?(data)
        coerce(data).kind_of?(Time)
      end

      # Convert data into a Time.
      #
      # data - a Time (returned unchanged) or a String to parse.
      #
      # Returns the parsed Time converted to UTC for a String that matches one
      # of parse_formats, the given object for a Time, and CoerceFailure for
      # anything else.
      def self.coerce(data)
        case data
        when Time
          data
        when String
          parse_formats.each do |format|
            begin
              return Time.strptime(data, format).utc
            rescue StandardError
              # this format did not match, fall through to the next one
              next
            end
          end
          CoerceFailure
        else
          CoerceFailure
        end
      end
    end
  end
end
@@ -0,0 +1,26 @@
1
module FlatKit
  class FieldType
    # Field type for placeholder values that stand in for "no usable data":
    # the empty string and the case-insensitive markers na, n/a, unk and
    # unknown.
    class UnknownType < FieldType

      REGEX = %r{\A(na|n/a|unk|unknown)\Z}i

      # The name of this field type.
      def self.type_name
        "unknown"
      end

      # Returns true when data is an empty String or a String matching REGEX.
      # Anything that is not a String never matches.
      def self.matches?(data)
        return false unless data.kind_of?(String)
        return true if data.empty?
        REGEX.match?(data)
      end

      # Return data unchanged when it is recognized as an unknown marker,
      # otherwise CoerceFailure.
      #
      # The empty string is accepted here as well so that coerce agrees with
      # matches?; previously matches?("") was true while coerce("") failed.
      # Values that REGEX cannot be applied to raise inside match? and are
      # reported as CoerceFailure.
      def self.coerce(data)
        return data if data.kind_of?(String) && data.empty?
        return data if REGEX.match?(data)
        CoerceFailure
      rescue StandardError
        CoerceFailure
      end

    end
  end
end
@@ -48,7 +48,7 @@ module FlatKit
48
48
  end
49
49
 
50
50
  def complete_structured_data
51
- @complete_structured_data ||= Oj.load(data)
51
+ @complete_structured_data ||= Oj.load(data, mode: :strict)
52
52
  end
53
53
  alias to_hash complete_structured_data
54
54
 
@@ -60,7 +60,7 @@ module FlatKit
60
60
  # to parse
61
61
  def data
62
62
  if @data.nil? && complete_structured_data? then
63
- @data = Oj.dump(complete_structured_data)
63
+ @data = Oj.dump(complete_structured_data, mode: :json)
64
64
  end
65
65
  @data
66
66
  end
@@ -1,8 +1,6 @@
1
1
  module FlatKit
2
2
  module Jsonl
3
3
  class Writer < ::FlatKit::Writer
4
- attr_reader :output
5
- attr_reader :count
6
4
 
7
5
  def self.format_name
8
6
  ::FlatKit::Jsonl::Format.format_name
@@ -10,10 +8,10 @@ module FlatKit
10
8
 
11
9
  def initialize(destination:)
12
10
  super
13
- @output = ::FlatKit::Output.from(@destination)
14
- @count = 0
15
11
  end
16
12
 
13
+ # write the record and return the Position at which the record was written
14
+ #
17
15
  def write(record)
18
16
  case record
19
17
  when FlatKit::Jsonl::Record
@@ -31,14 +29,25 @@ module FlatKit
31
29
  raise ::FlatKit::Error, e
32
30
  end
33
31
 
34
- def close
35
- @output.close
36
- end
37
-
38
32
  def write_record(record)
39
- # enforces ending in newlin if it doesn't already have one
33
+ # the index of the record being written is the same as the count of records written so far
34
+ record_index = @count
35
+
36
+ # get the current output stream position to calculate bytes written
37
+ start_offset = output.io.tell
38
+
39
+ # enforces ending in newline if it doesn't already have one
40
40
  output.io.puts record.to_s
41
+
42
+ ending_offset = output.io.tell
43
+ bytes_written = (ending_offset - start_offset)
44
+
41
45
  @count += 1
46
+
47
+ @last_position = ::FlatKit::Position.new(index: record_index,
48
+ offset: start_offset,
49
+ bytesize: bytes_written)
50
+
42
51
  end
43
52
  end
44
53
  end
@@ -28,8 +28,10 @@ module FlatKit
28
28
 
29
29
  notify_listeners(name: :start, data: :start)
30
30
  merge_tree.each do |record|
31
- writer.write(record)
32
- notify_listeners(name: :record, data: record)
31
+
32
+ position = writer.write(record)
33
+ meta = { position: position }
34
+ notify_listeners(name: :record, data: record, meta: meta)
33
35
  end
34
36
  notify_listeners(name: :stop, data: :stop)
35
37
 
@@ -22,6 +22,10 @@ module FlatKit
22
22
  raise NotImplementedError, "#{self.class} must implement #io"
23
23
  end
24
24
 
25
+ def tell
26
+ io.tell
27
+ end
28
+
25
29
  def close
26
30
  raise NotImplementedError, "#{self.class} must implement #close"
27
31
  end
@@ -0,0 +1,19 @@
1
module FlatKit
  # Describes where a single record sits inside an IO stream.
  #
  # Writers hand one of these back after writing a record so the caller can
  # learn which record it was and which bytes it occupies.
  #
  # index    - zero based position of the record among the records
  # offset   - byte offset of the record in the IO stream
  # bytesize - byte length of the record
  class Position

    attr_reader :index, :offset, :bytesize

    # All three values are optional and default to nil.
    def initialize(index: nil, offset: nil, bytesize: nil)
      @index, @offset, @bytesize = index, offset, bytesize
    end
  end
end
@@ -0,0 +1,65 @@
1
module FlatKit
  # Base class of the statistics collectors and the lookup from a field type
  # to the collector class that should be used for it.
  class StatType
    # Field types whose values are only counted / tallied.
    def self.nominal_types
      [FieldType::BooleanType, FieldType::StringType, FieldType::NullType ]
    end

    # Field types whose values can additionally be ordered (min / max).
    def self.ordinal_types
      [FieldType::DateType, FieldType::TimestampType]
    end

    # Field types whose values support arithmetic.
    def self.numerical_types
      [FieldType::FloatType, FieldType::IntegerType]
    end

    # Return the stats class to use for the given field type class.
    #
    # Raises ArgumentError when the type is in none of the three lists.
    def self.for(type)
      return OrdinalStats   if ordinal_types.include?(type)
      return NominalStats   if nominal_types.include?(type)
      return NumericalStats if numerical_types.include?(type)
      raise ArgumentError, "Unknown stat type for #{type}"
    end

    # The names of the stats this instance reports by default. Subclasses
    # must implement this; the base implementation raises NotImplementedError.
    def collected_stats
      raise NotImplementedError, "#{self.class.name} must implement #collected_stats"
    end

    #
    # call-seq:
    #   stat.to_hash -> Hash
    #   stat.to_hash( %w[ count max mean ]) -> Hash
    #
    # return a hash of the stats. By default this returns a hash of all the
    # stats named by #collected_stats, but passing in an array of items will
    # limit the stats returned to only those in the Array.
    #
    # If an empty array or nil is passed to to_hash then #collected_stats is
    # used as the list of stats to return in the hash.
    #
    def to_hash( *args )
      # compact so that to_hash(nil) falls back to the defaults, as documented,
      # instead of attempting send(nil)
      names = args.flatten.compact
      names = collected_stats if names.empty?
      names.each_with_object({}) do |meth, h|
        h[meth] = send( meth )
      end
    end

    #
    # call-seq:
    #   stat.to_json -> String
    #   stat.to_json( *args ) -> String
    #
    # return a json string of the stats. The arguments are handed to #to_hash
    # unchanged, so by default this is a json string of all the collected
    # stats, and passing stat names limits the output to those stats.
    #
    def to_json( *args )
      h = to_hash( *args )
      Oj.dump(h)
    end
  end
end
63
+ require 'flat_kit/stat_type/nominal_stats'
64
+ require 'flat_kit/stat_type/ordinal_stats'
65
+ require 'flat_kit/stat_type/numerical_stats'
@@ -0,0 +1,58 @@
1
module FlatKit
  class StatType

    # Stats object to keep track of the count and, optionally, the frequency
    # of the values it is given.
    #
    # Frequencies are only recorded when the object is created with
    # collecting_frequencies: true; without it mode, unique_count,
    # unique_values and frequencies all return nil.
    class NominalStats < StatType

      # the number of values passed to #update
      attr_reader :count

      # The stats reported when frequencies are not being collected.
      def self.default_stats
        @default_stats ||= %w[ count ]
      end

      # The stats reported when frequencies are being collected.
      def self.all_stats
        @all_stats ||= %w[ count unique_count unique_values mode ]
      end

      def initialize(collecting_frequencies: false)
        @mutex = Mutex.new
        @count = 0
        @collecting_frequencies = collecting_frequencies
        @frequencies = Hash.new(0)
      end

      # The list of stat names that apply to this instance.
      def collected_stats
        return self.class.default_stats unless @collecting_frequencies
        self.class.all_stats
      end

      # The value seen most often, or nil when frequencies are not collected
      # or no value has been recorded yet.
      def mode
        return nil unless @collecting_frequencies
        # max_by returns nil for an empty hash, so guard the call to first
        @frequencies.max_by { |_item, item_count| item_count }&.first
      end

      # The number of distinct values seen, or nil when not collecting.
      def unique_count
        return nil unless @collecting_frequencies
        @frequencies.size
      end

      # The distinct values seen, or nil when not collecting.
      def unique_values
        return nil unless @collecting_frequencies
        @frequencies.keys
      end

      # The Hash of value => number of times seen, or nil when not collecting.
      def frequencies
        return nil unless @collecting_frequencies
        @frequencies
      end

      # Record one more value. The count and the frequency table are changed
      # together while holding the mutex.
      def update(value)
        @mutex.synchronize do
          @count += 1
          @frequencies[value] += 1 if @collecting_frequencies
        end
      end
    end
  end
end
@@ -0,0 +1,120 @@
1
+ #--
2
+ # Copyright (c) 2008, 2009 Jeremy Hinegardner
3
+ # All rights reserved. See LICENSE and/or COPYING for details.
4
+ #
5
+ # Pulled from Hitimes, which I also wrote
6
+ #++
7
+
8
+ require 'thread'
9
+ require 'oj'
10
+
11
module FlatKit
  class StatType
    #
    # Stats object will keep track of the _min_, _max_, _count_, _sum_ and _sumsq_
    # and when you want you may also retrieve the _mean_, _stddev_ and _rate_.
    #
    # this contrived example shows getting a list of all the files in a directory
    # and running stats on file sizes.
    #
    #     s = FlatKit::StatType::NumericalStats.new
    #     dir = ARGV.shift || Dir.pwd
    #     Dir.entries( dir ).each do |entry|
    #       fs = File.stat( entry )
    #       if fs.file? then
    #         s.update( fs.size )
    #       end
    #     end
    #
    #     %w[ count min max mean sum stddev rate ].each do |m|
    #       puts "#{m.rjust(6)} : #{s.send( m ) }"
    #     end
    #
    class NumericalStats < NominalStats

      attr_reader :min
      attr_reader :max
      attr_reader :sum
      attr_reader :sumsq

      # The stats reported when frequencies are not being collected.
      def self.default_stats
        @default_stats ||= %w[ count max mean min rate stddev sum sumsq ]
      end

      # The stats reported when frequencies are being collected.
      def self.all_stats
        @all_stats ||= %w[ count max mean min mode rate stddev sum sumsq unique_count unique_values ]
      end

      # min and max start at +/- infinity so that the first value passed to
      # #update replaces both of them.
      def initialize(collecting_frequencies: false)
        super
        @min   =  Float::INFINITY
        @max   = -Float::INFINITY
        @sum   = 0.0
        @sumsq = 0.0
      end

      # call-seq:
      #   stat.update( val ) -> val
      #
      # Update the running stats with the new value.
      # Return the input value.
      def update(value)
        @mutex.synchronize do
          @min = (value < @min) ? value : @min
          @max = (value > @max) ? value : @max

          @count += 1
          @sum   += value
          @sumsq += (value * value)

          # same bookkeeping as NominalStats#update
          @frequencies[value] += 1 if @collecting_frequencies
        end

        return value
      end

      # call-seq:
      #   stat.mean -> Float
      #
      # Return the arithmetic mean of the values put into the Stats object. If no
      # values have passed through the stats object then 0.0 is returned;
      def mean
        return 0.0 if @count.zero?
        return @sum / @count
      end

      # call-seq:
      #   stat.rate -> Float
      #
      # Return the +count+ divided by +sum+, or 0.0 when +sum+ is zero.
      #
      # In many cases when Stats#update( _value_ ) is called, the _value_ is a unit
      # of time, typically seconds or microseconds. #rate is a convenience for those
      # times. In this case, where _value_ is a unit of time, then count divided by
      # sum is a useful value, i.e. +something per unit of time+.
      #
      # In the case where _value_ is a non-time related value, then the value
      # returned by _rate_ is not really useful.
      #
      def rate
        return 0.0 if @sum.zero?
        return @count / @sum
      end

      #
      # call-seq:
      #   stat.stddev -> Float
      #
      # Return the sample standard deviation (n - 1 denominator) of all the
      # values that have passed through the Stats object. The standard deviation
      # has no meaning unless the count is > 1, therefore if the current
      # _stat.count_ is <= 1 then 0.0 will be returned;
      #
      def stddev
        return 0.0 unless @count > 1
        variance = (@sumsq - ((@sum * @sum) / @count)) / (@count - 1)
        # Floating point cancellation can leave a variance that should be zero
        # slightly negative, and Math.sqrt raises Math::DomainError for a
        # negative argument, so clamp at zero first.
        variance = 0.0 if variance < 0
        Math.sqrt(variance)
      end
    end
  end
end
@@ -0,0 +1,42 @@
1
module FlatKit
  class StatType
    # Counts and optionally tallies values like NominalStats does, and in
    # addition remembers the smallest and the largest value it has been given.
    #
    class OrdinalStats < NominalStats

      attr_reader :min, :max

      # The stats reported when frequencies are not being collected.
      def self.default_stats
        @default_stats ||= %w[ count max min ]
      end

      # The stats reported when frequencies are being collected.
      def self.all_stats
        @all_stats ||= %w[ count max min unique_count unique_values mode ]
      end

      # min and max stay nil until the first value arrives.
      def initialize(collecting_frequencies: false)
        super
        @min = @max = nil
      end

      # Record one more value, widening min / max when the value falls
      # outside the range seen so far.
      def update(value)
        @mutex.synchronize do
          @min = value if @min.nil? || (value < @min)
          @max = value if @max.nil? || (value > @max)

          @count += 1

          @frequencies[value] += 1 if @collecting_frequencies
        end
      end
    end
  end
end