flat_kit 0.2.0 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (53) hide show
  1. checksums.yaml +4 -4
  2. data/HISTORY.md +6 -0
  3. data/Manifest.txt +34 -0
  4. data/Rakefile +1 -0
  5. data/examples/stream-active-record-to-csv.rb +42 -0
  6. data/lib/flat_kit.rb +9 -2
  7. data/lib/flat_kit/cli.rb +11 -0
  8. data/lib/flat_kit/command.rb +1 -0
  9. data/lib/flat_kit/command/stats.rb +94 -0
  10. data/lib/flat_kit/descendant_tracker.rb +9 -0
  11. data/lib/flat_kit/event_emitter.rb +2 -2
  12. data/lib/flat_kit/field_stats.rb +241 -0
  13. data/lib/flat_kit/field_type.rb +75 -0
  14. data/lib/flat_kit/field_type/boolean_type.rb +48 -0
  15. data/lib/flat_kit/field_type/date_type.rb +179 -0
  16. data/lib/flat_kit/field_type/float_type.rb +37 -0
  17. data/lib/flat_kit/field_type/guess_type.rb +20 -0
  18. data/lib/flat_kit/field_type/integer_type.rb +34 -0
  19. data/lib/flat_kit/field_type/null_type.rb +35 -0
  20. data/lib/flat_kit/field_type/string_type.rb +22 -0
  21. data/lib/flat_kit/field_type/timestamp_type.rb +47 -0
  22. data/lib/flat_kit/field_type/unknown_type.rb +26 -0
  23. data/lib/flat_kit/jsonl/record.rb +2 -2
  24. data/lib/flat_kit/jsonl/writer.rb +18 -9
  25. data/lib/flat_kit/merge.rb +4 -2
  26. data/lib/flat_kit/output.rb +4 -0
  27. data/lib/flat_kit/position.rb +19 -0
  28. data/lib/flat_kit/stat_type.rb +65 -0
  29. data/lib/flat_kit/stat_type/nominal_stats.rb +58 -0
  30. data/lib/flat_kit/stat_type/numerical_stats.rb +120 -0
  31. data/lib/flat_kit/stat_type/ordinal_stats.rb +42 -0
  32. data/lib/flat_kit/stats.rb +66 -0
  33. data/lib/flat_kit/writer.rb +17 -2
  34. data/lib/flat_kit/xsv/writer.rb +24 -9
  35. data/test/field_type/test_boolean_type.rb +65 -0
  36. data/test/field_type/test_date_type.rb +71 -0
  37. data/test/field_type/test_float_type.rb +56 -0
  38. data/test/field_type/test_guess_type.rb +14 -0
  39. data/test/field_type/test_integer_type.rb +52 -0
  40. data/test/field_type/test_null_type.rb +41 -0
  41. data/test/field_type/test_string_type.rb +18 -0
  42. data/test/field_type/test_timestamp_type.rb +108 -0
  43. data/test/field_type/test_unknown_type.rb +35 -0
  44. data/test/jsonl/test_writer.rb +21 -3
  45. data/test/run +23 -0
  46. data/test/stat_type/test_nominal_stats.rb +69 -0
  47. data/test/stat_type/test_numerical_stats.rb +118 -0
  48. data/test/stat_type/test_ordinal_stats.rb +92 -0
  49. data/test/test_event_emitter.rb +19 -2
  50. data/test/test_field_stats.rb +134 -0
  51. data/test/test_field_type.rb +34 -0
  52. data/test/xsv/test_writer.rb +25 -4
  53. metadata +65 -2
@@ -0,0 +1,47 @@
module FlatKit
  class FieldType
    # Field type for values that either are a Time or are a String that can
    # be parsed into one using one of the known timestamp formats.
    class TimestampType < FieldType

      # The strptime formats that are attempted, in order, when coercing a
      # String. The first format that parses wins.
      #
      # A trailing "Z" (the UTC designator) on the fractional-second format is
      # parsed with %z instead of being matched as a literal character. A
      # literal "Z" carries no zone information, so Time.strptime would read
      # the value as local time and the later conversion to UTC would shift
      # the timestamp by the local offset.
      #
      # Returns a frozen Array of format Strings.
      def self.parse_formats
        @timestamp_formats ||= [
          "%Y-%m-%d %H:%M:%S.%N%z",
          "%Y-%m-%d %H:%M:%S.%N",
          "%Y-%m-%dT%H:%M:%S.%N%z", # w3cdtf
          "%Y-%m-%d %H:%M:%S",
          "%Y-%m-%dT%H:%M:%S%z",
          # NOTE(review): probably never reached, the %z format directly above
          # should already accept a trailing "Z" - confirm before removing
          "%Y-%m-%dT%H:%M:%SZ",
          "%Y%m%dT%H%M%S",
          "%a, %d %b %Y %H:%M:%S %z", # rfc2822, httpdate
        ].freeze
      end

      # The name this type is known by.
      def self.type_name
        "timestamp"
      end

      # Returns true when +data+ is a Time, or a String that coerce is able to
      # parse into a Time.
      def self.matches?(data)
        coerce(data).kind_of?(Time)
      end

      # Convert +data+ into a Time.
      #
      # A Time is returned untouched. A String is tried against each of the
      # parse_formats and the first successful parse is returned, converted to
      # UTC. Anything else, or a String that matches none of the formats,
      # results in CoerceFailure.
      def self.coerce(data)
        case data
        when Time
          data
        when String
          parse_formats.each do |format|
            begin
              return Time.strptime(data, format).utc
            rescue StandardError
              # this format does not fit the data, move on to the next one
            end
          end
          CoerceFailure
        else
          CoerceFailure
        end
      end
    end
  end
end
@@ -0,0 +1,26 @@
module FlatKit
  class FieldType
    # Field type for the placeholder strings commonly used where a real value
    # is not known: "na", "n/a", "unk" and "unknown", in any letter case.
    class UnknownType < FieldType

      # Anchored with \A and \z so the whole value has to be one of the
      # placeholders. \z is used rather than \Z because \Z would also accept a
      # value that ends in a newline, such as "na\n".
      REGEX = %r{\A(na|n/a|unk|unknown)\z}i

      # The name this type is known by.
      def self.type_name
        "unknown"
      end

      # Returns true when +data+ is an empty String or a String that is one of
      # the placeholder values. Anything that is not a String never matches.
      def self.matches?(data)
        return false unless data.kind_of?(String)
        return true if data.length == 0
        REGEX.match?(data)
      end

      # Returns +data+ itself when it is one of the placeholder values, and
      # CoerceFailure otherwise. Data that cannot be tested against the regex
      # at all is also a CoerceFailure.
      #
      # NOTE(review): matches? accepts the empty String but coerce does not -
      # confirm that this difference is intended.
      def self.coerce(data)
        return data if REGEX.match?(data)
        CoerceFailure
      rescue StandardError
        CoerceFailure
      end

    end
  end
end
@@ -48,7 +48,7 @@ module FlatKit
48
48
  end
49
49
 
50
50
  def complete_structured_data
51
- @complete_structured_data ||= Oj.load(data)
51
+ @complete_structured_data ||= Oj.load(data, mode: :strict)
52
52
  end
53
53
  alias to_hash complete_structured_data
54
54
 
@@ -60,7 +60,7 @@ module FlatKit
60
60
  # to parse
61
61
  def data
62
62
  if @data.nil? && complete_structured_data? then
63
- @data = Oj.dump(complete_structured_data)
63
+ @data = Oj.dump(complete_structured_data, mode: :json)
64
64
  end
65
65
  @data
66
66
  end
@@ -1,8 +1,6 @@
1
1
  module FlatKit
2
2
  module Jsonl
3
3
  class Writer < ::FlatKit::Writer
4
- attr_reader :output
5
- attr_reader :count
6
4
 
7
5
  def self.format_name
8
6
  ::FlatKit::Jsonl::Format.format_name
@@ -10,10 +8,10 @@ module FlatKit
10
8
 
11
9
  def initialize(destination:)
12
10
  super
13
- @output = ::FlatKit::Output.from(@destination)
14
- @count = 0
15
11
  end
16
12
 
13
+ # write the record and return the Position at which the record was written
14
+ #
17
15
  def write(record)
18
16
  case record
19
17
  when FlatKit::Jsonl::Record
@@ -31,14 +29,25 @@ module FlatKit
31
29
  raise ::FlatKit::Error, e
32
30
  end
33
31
 
34
- def close
35
- @output.close
36
- end
37
-
38
32
  def write_record(record)
39
- # enforces ending in newlin if it doesn't already have one
33
+ # the index of the record being written is the same as the count of records written so far
34
+ record_index = @count
35
+
36
+ # get the current output stream position to calculate bytes written
37
+ start_offset = output.io.tell
38
+
39
+ # enforces ending in newline if it doesn't already have one
40
40
  output.io.puts record.to_s
41
+
42
+ ending_offset = output.io.tell
43
+ bytes_written = (ending_offset - start_offset)
44
+
41
45
  @count += 1
46
+
47
+ @last_position = ::FlatKit::Position.new(index: record_index,
48
+ offset: start_offset,
49
+ bytesize: bytes_written)
50
+
42
51
  end
43
52
  end
44
53
  end
@@ -28,8 +28,10 @@ module FlatKit
28
28
 
29
29
  notify_listeners(name: :start, data: :start)
30
30
  merge_tree.each do |record|
31
- writer.write(record)
32
- notify_listeners(name: :record, data: record)
31
+
32
+ position = writer.write(record)
33
+ meta = { position: position }
34
+ notify_listeners(name: :record, data: record, meta: meta)
33
35
  end
34
36
  notify_listeners(name: :stop, data: :stop)
35
37
 
@@ -22,6 +22,10 @@ module FlatKit
22
22
  raise NotImplementedError, "#{self.class} must implement #io"
23
23
  end
24
24
 
25
+ def tell
26
+ io.tell
27
+ end
28
+
25
29
  def close
26
30
  raise NotImplementedError, "#{self.class} must implement #close"
27
31
  end
@@ -0,0 +1,19 @@
module FlatKit
  # Describes where a single record sits within an IO stream.
  #
  # This is generally what a write_record method hands back, describing the
  # record that it has just written.
  class Position

    # index    - zero based
    # offset   - byte offset in the IO stream
    # bytesize - byte length of the record
    attr_reader :index, :offset, :bytesize

    # Every value is optional and is nil when not given.
    def initialize(index: nil, offset: nil, bytesize: nil)
      @index, @offset, @bytesize = index, offset, bytesize
    end
  end
end
@@ -0,0 +1,65 @@
module FlatKit
  # Base class of the statistics collectors. It maps a field type to the
  # collector class used for it and provides the shared Hash / JSON export.
  #
  # Subclasses must implement #collected_stats.
  class StatType
    # The field types that are summarized with NominalStats.
    def self.nominal_types
      [FieldType::BooleanType, FieldType::StringType, FieldType::NullType]
    end

    # The field types that are summarized with OrdinalStats.
    def self.ordinal_types
      [FieldType::DateType, FieldType::TimestampType]
    end

    # The field types that are summarized with NumericalStats.
    def self.numerical_types
      [FieldType::FloatType, FieldType::IntegerType]
    end

    # Look up the stats class to use for the given field type.
    #
    # Raises ArgumentError when the type is in none of the type lists.
    def self.for(type)
      return OrdinalStats if ordinal_types.include?(type)
      return NominalStats if nominal_types.include?(type)
      return NumericalStats if numerical_types.include?(type)
      raise ArgumentError, "Unknown stat type for #{type}"
    end

    # The names of the stats this object is collecting. Must be implemented by
    # each subclass.
    def collected_stats
      raise NotImplementedError, "#{self.class.name} must implement #collected_stats"
    end

    #
    # call-seq:
    #   stat.to_hash -> Hash
    #   stat.to_hash( %w[ count max mean ]) -> Hash
    #
    # return a hash of the stats. By default this returns a hash of all the
    # collected stats, but passing in an array of items will limit the stats
    # returned to only those in the Array.
    #
    # If nothing, nil, or an empty array is passed in then #collected_stats is
    # used as the list of stats to return in the hash.
    #
    def to_hash(*args)
      # compact so that an explicit nil means "everything" instead of
      # attempting to call a method named nil
      names = args.flatten.compact
      names = collected_stats if names.empty?

      # public_send so that only the public stat readers can be reached by name
      names.each_with_object({}) do |meth, h|
        h[meth] = public_send(meth)
      end
    end

    #
    # call-seq:
    #   stat.to_json -> String
    #   stat.to_json( *args ) -> String
    #
    # return a json string of the stats. The arguments are handed to #to_hash
    # and select the stats that end up in the json output in the same way.
    #
    def to_json(*args)
      Oj.dump(to_hash(*args))
    end
  end
end
63
+ require 'flat_kit/stat_type/nominal_stats'
64
+ require 'flat_kit/stat_type/ordinal_stats'
65
+ require 'flat_kit/stat_type/numerical_stats'
@@ -0,0 +1,58 @@
module FlatKit
  class StatType

    # Stats object to keep track of the count and, optionally, the frequency
    # of each of the values it has been updated with.
    #
    class NominalStats < StatType

      attr_reader :count

      # The stats reported when frequencies are not being collected.
      def self.default_stats
        @default_stats ||= %w[ count ].freeze
      end

      # The stats reported when frequencies are being collected.
      def self.all_stats
        @all_stats ||= %w[ count unique_count unique_values mode ].freeze
      end

      # collecting_frequencies - when true a tally of every distinct value is
      #                          kept, which is what makes mode, unique_count,
      #                          unique_values and frequencies available.
      def initialize(collecting_frequencies: false)
        @mutex = Mutex.new
        @count = 0
        @collecting_frequencies = collecting_frequencies
        @frequencies = Hash.new(0)
      end

      # The names of the stats that are available from this instance.
      def collected_stats
        return self.class.default_stats unless @collecting_frequencies
        self.class.all_stats
      end

      # The value that has been seen most often.
      #
      # Returns nil when frequencies are not being collected, or when no value
      # has been recorded yet.
      def mode
        return nil unless @collecting_frequencies
        # max_by returns nil on an empty Hash, hence the safe navigation
        @frequencies.max_by { |_item, item_count| item_count }&.first
      end

      # The number of distinct values seen, nil when not collecting frequencies.
      def unique_count
        return nil unless @collecting_frequencies
        @frequencies.size
      end

      # The distinct values seen, nil when not collecting frequencies.
      def unique_values
        return nil unless @collecting_frequencies
        @frequencies.keys
      end

      # The Hash of value => times seen, nil when not collecting frequencies.
      def frequencies
        return nil unless @collecting_frequencies
        @frequencies
      end

      # Record one more occurrence of +value+.
      def update(value)
        @mutex.synchronize do
          @count += 1
          @frequencies[value] += 1 if @collecting_frequencies
        end
      end
    end
  end
end
@@ -0,0 +1,120 @@
1
+ #--
2
+ # Copyright (c) 2008, 2009 Jeremy Hinegardner
3
+ # All rights reserved. See LICENSE and/or COPYING for details.
4
+ #
5
+ # Pulled from Hitimes, which I also wrote
6
+ #++
7
+
8
+ require 'thread'
9
+ require 'oj'
10
+
module FlatKit
  class StatType
    #
    # Stats object will keep track of the _min_, _max_, _count_, _sum_ and _sumsq_
    # and when you want you may also retrieve the _mean_, _stddev_ and _rate_.
    #
    # this contrived example shows getting a list of all the files in a directory
    # and running stats on file sizes.
    #
    #     s = FlatKit::StatType::NumericalStats.new
    #     dir = ARGV.shift || Dir.pwd
    #     Dir.entries( dir ).each do |entry|
    #       fs = File.stat( File.join( dir, entry ) )
    #       if fs.file? then
    #         s.update( fs.size )
    #       end
    #     end
    #
    #     %w[ count min max mean sum stddev rate ].each do |m|
    #       puts "#{m.rjust(6)} : #{s.send( m ) }"
    #     end
    #
    class NumericalStats < NominalStats

      attr_reader :min
      attr_reader :max
      attr_reader :sum
      attr_reader :sumsq

      # The stats reported when frequencies are not being collected.
      def self.default_stats
        @default_stats ||= %w[ count max mean min rate stddev sum sumsq ]
      end

      # The stats reported when frequencies are being collected.
      def self.all_stats
        @all_stats ||= %w[ count max mean min mode rate stddev sum sumsq unique_count unique_values ]
      end

      # Until the first value arrives min is +Infinity and max is -Infinity so
      # that any real value replaces them.
      def initialize(collecting_frequencies: false)
        super
        @min = Float::INFINITY
        @max = -Float::INFINITY
        @sum = 0.0
        @sumsq = 0.0
      end

      # call-seq:
      #    stat.update( val ) -> val
      #
      # Update the running stats with the new value.
      # Return the input value.
      def update(value)
        @mutex.synchronize do
          @min = (value < @min) ? value : @min
          @max = (value > @max) ? value : @max

          @count += 1
          @sum += value
          @sumsq += (value * value)

          # the same frequency tally as NominalStats#update; super is not
          # called because the lock is already held at this point
          @frequencies[value] += 1 if @collecting_frequencies
        end

        return value
      end

      # call-seq:
      #    stat.mean -> Float
      #
      # Return the arithmetic mean of the values put into the Stats object. If no
      # values have passed through the stats object then 0.0 is returned;
      def mean
        return 0.0 if @count.zero?
        return @sum / @count
      end

      # call-seq:
      #    stat.rate -> Float
      #
      # Return the +count+ divided by +sum+. If the sum is zero then 0.0 is
      # returned.
      #
      # In many cases when Stats#update( _value_ ) is called, the _value_ is a unit
      # of time, typically seconds or microseconds. #rate is a convenience for those
      # times. In this case, where _value_ is a unit of time, then count divided by
      # sum is a useful value, i.e. +something per unit of time+.
      #
      # In the case where _value_ is a non-time related value, then the value
      # returned by _rate_ is not really useful.
      #
      def rate
        return 0.0 if @sum.zero?
        return @count / @sum
      end

      #
      # call-seq:
      #    stat.stddev -> Float
      #
      # Return the standard deviation of all the values that have passed through the
      # Stats object. The standard deviation has no meaning unless the count is > 1,
      # therefore if the current _stat.count_ is <= 1 then 0.0 will be returned;
      #
      def stddev
        return 0.0 unless @count > 1

        variance = (@sumsq - ((@sum * @sum) / @count)) / (@count - 1)

        # Floating point rounding can leave the variance of (nearly) constant
        # data a hair below zero, and Math.sqrt raises on a negative number.
        Math.sqrt(variance < 0 ? 0.0 : variance)
      end
    end
  end
end
@@ -0,0 +1,42 @@
module FlatKit
  class StatType
    # Everything NominalStats keeps track of, plus the smallest and the
    # largest value seen.
    #
    class OrdinalStats < NominalStats

      attr_reader :min, :max

      # The stats reported when frequencies are not being collected.
      def self.default_stats
        @default_stats ||= %w[ count max min ]
      end

      # The stats reported when frequencies are being collected.
      def self.all_stats
        @all_stats ||= %w[ count max min unique_count unique_values mode ]
      end

      # min and max stay nil until the first value has been recorded.
      def initialize(collecting_frequencies: false)
        super
        @min = nil
        @max = nil
      end

      # Record one more occurrence of +value+, moving min and max when the
      # value falls outside of what has been seen so far.
      def update(value)
        @mutex.synchronize do
          @min = value if @min.nil? || (value < @min)
          @max = value if @max.nil? || (value > @max)

          @count += 1

          @frequencies[value] += 1 if @collecting_frequencies
        end
      end
    end
  end
end