flat_kit 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/HISTORY.md +6 -0
- data/Manifest.txt +34 -0
- data/Rakefile +1 -0
- data/examples/stream-active-record-to-csv.rb +42 -0
- data/lib/flat_kit.rb +9 -2
- data/lib/flat_kit/cli.rb +11 -0
- data/lib/flat_kit/command.rb +1 -0
- data/lib/flat_kit/command/stats.rb +94 -0
- data/lib/flat_kit/descendant_tracker.rb +9 -0
- data/lib/flat_kit/event_emitter.rb +2 -2
- data/lib/flat_kit/field_stats.rb +241 -0
- data/lib/flat_kit/field_type.rb +75 -0
- data/lib/flat_kit/field_type/boolean_type.rb +48 -0
- data/lib/flat_kit/field_type/date_type.rb +179 -0
- data/lib/flat_kit/field_type/float_type.rb +37 -0
- data/lib/flat_kit/field_type/guess_type.rb +20 -0
- data/lib/flat_kit/field_type/integer_type.rb +34 -0
- data/lib/flat_kit/field_type/null_type.rb +35 -0
- data/lib/flat_kit/field_type/string_type.rb +22 -0
- data/lib/flat_kit/field_type/timestamp_type.rb +47 -0
- data/lib/flat_kit/field_type/unknown_type.rb +26 -0
- data/lib/flat_kit/jsonl/record.rb +2 -2
- data/lib/flat_kit/jsonl/writer.rb +18 -9
- data/lib/flat_kit/merge.rb +4 -2
- data/lib/flat_kit/output.rb +4 -0
- data/lib/flat_kit/position.rb +19 -0
- data/lib/flat_kit/stat_type.rb +65 -0
- data/lib/flat_kit/stat_type/nominal_stats.rb +58 -0
- data/lib/flat_kit/stat_type/numerical_stats.rb +120 -0
- data/lib/flat_kit/stat_type/ordinal_stats.rb +42 -0
- data/lib/flat_kit/stats.rb +66 -0
- data/lib/flat_kit/writer.rb +17 -2
- data/lib/flat_kit/xsv/writer.rb +24 -9
- data/test/field_type/test_boolean_type.rb +65 -0
- data/test/field_type/test_date_type.rb +71 -0
- data/test/field_type/test_float_type.rb +56 -0
- data/test/field_type/test_guess_type.rb +14 -0
- data/test/field_type/test_integer_type.rb +52 -0
- data/test/field_type/test_null_type.rb +41 -0
- data/test/field_type/test_string_type.rb +18 -0
- data/test/field_type/test_timestamp_type.rb +108 -0
- data/test/field_type/test_unknown_type.rb +35 -0
- data/test/jsonl/test_writer.rb +21 -3
- data/test/run +23 -0
- data/test/stat_type/test_nominal_stats.rb +69 -0
- data/test/stat_type/test_numerical_stats.rb +118 -0
- data/test/stat_type/test_ordinal_stats.rb +92 -0
- data/test/test_event_emitter.rb +19 -2
- data/test/test_field_stats.rb +134 -0
- data/test/test_field_type.rb +34 -0
- data/test/xsv/test_writer.rb +25 -4
- metadata +65 -2
@@ -0,0 +1,47 @@
|
|
1
|
+
require 'time' # Time.strptime is provided by the stdlib 'time' extension

module FlatKit
  class FieldType
    # Field type for timestamp values.
    #
    # A value is a timestamp when it already is a Time, or when it is a String
    # that one of the known strptime formats can parse.
    class TimestampType < FieldType

      # The strptime formats tried, in order, when coercing a String.
      #
      # The first format uses %z rather than a literal "Z". A literal "Z"
      # carries no zone information for strptime, so the text would be read as
      # local time and then shifted when converted to UTC; %z reads "Z" (and
      # numeric offsets) as a real zone.
      #
      # Returns a frozen Array of String formats.
      def self.parse_formats
        @timestamp_formats ||= [
          "%Y-%m-%d %H:%M:%S.%N%z",
          "%Y-%m-%d %H:%M:%S.%N",
          "%Y-%m-%dT%H:%M:%S.%N%z", # w3cdtf
          "%Y-%m-%d %H:%M:%S",
          "%Y-%m-%dT%H:%M:%S%z",
          "%Y-%m-%dT%H:%M:%SZ",
          "%Y%m%dT%H%M%S",
          "%a, %d %b %Y %H:%M:%S %z", # rfc2822, httpdate
        ].freeze
      end

      # The name this type reports for itself.
      def self.type_name
        "timestamp"
      end

      # Returns true when data can be coerced to a Time.
      def self.matches?(data)
        coerce(data).kind_of?(Time)
      end

      # Convert data to a Time.
      #
      # data - a Time (returned unchanged) or a String (parsed with the first
      #        format in parse_formats that accepts it, then converted to UTC).
      #
      # Returns the Time, or CoerceFailure when data is neither a Time nor a
      # String that any format can parse.
      def self.coerce(data)
        case data
        when Time
          data
        when String
          parse_formats.each do |format|
            begin
              return Time.strptime(data, format).utc
            rescue StandardError
              # this format did not fit the text; fall through to the next one
            end
          end
          CoerceFailure
        else
          CoerceFailure
        end
      end
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module FlatKit
  class FieldType
    # Field type for placeholder values that stand in for "no data": the empty
    # string and the case-insensitive markers na, n/a, unk and unknown.
    class UnknownType < FieldType

      REGEX = %r{\A(na|n/a|unk|unknown)\Z}i

      # The name this type reports for itself.
      def self.type_name
        "unknown"
      end

      # Returns true when data is a String that is empty or is one of the
      # recognized markers; false for everything else, including non-Strings.
      def self.matches?(data)
        return false unless data.kind_of?(String)
        return true if data.empty?
        REGEX.match?(data)
      end

      # Pass through a value that counts as unknown.
      #
      # The empty string is accepted here as well, so that coerce agrees with
      # matches? about which Strings belong to this type.
      #
      # Returns data unchanged when it is accepted, otherwise CoerceFailure.
      # Input that the regular expression cannot be applied to also results in
      # CoerceFailure.
      def self.coerce(data)
        return data if data.kind_of?(String) && data.empty?
        return data if REGEX.match?(data)
        CoerceFailure
      rescue StandardError
        CoerceFailure
      end

    end
  end
end
|
@@ -48,7 +48,7 @@ module FlatKit
|
|
48
48
|
end
|
49
49
|
|
50
50
|
def complete_structured_data
|
51
|
-
@complete_structured_data ||= Oj.load(data)
|
51
|
+
@complete_structured_data ||= Oj.load(data, mode: :strict)
|
52
52
|
end
|
53
53
|
alias to_hash complete_structured_data
|
54
54
|
|
@@ -60,7 +60,7 @@ module FlatKit
|
|
60
60
|
# to parse
|
61
61
|
def data
|
62
62
|
if @data.nil? && complete_structured_data? then
|
63
|
-
@data = Oj.dump(complete_structured_data)
|
63
|
+
@data = Oj.dump(complete_structured_data, mode: :json)
|
64
64
|
end
|
65
65
|
@data
|
66
66
|
end
|
@@ -1,8 +1,6 @@
|
|
1
1
|
module FlatKit
|
2
2
|
module Jsonl
|
3
3
|
class Writer < ::FlatKit::Writer
|
4
|
-
attr_reader :output
|
5
|
-
attr_reader :count
|
6
4
|
|
7
5
|
def self.format_name
|
8
6
|
::FlatKit::Jsonl::Format.format_name
|
@@ -10,10 +8,10 @@ module FlatKit
|
|
10
8
|
|
11
9
|
def initialize(destination:)
|
12
10
|
super
|
13
|
-
@output = ::FlatKit::Output.from(@destination)
|
14
|
-
@count = 0
|
15
11
|
end
|
16
12
|
|
13
|
+
# write the record and return the Position at which the record was written
|
14
|
+
#
|
17
15
|
def write(record)
|
18
16
|
case record
|
19
17
|
when FlatKit::Jsonl::Record
|
@@ -31,14 +29,25 @@ module FlatKit
|
|
31
29
|
raise ::FlatKit::Error, e
|
32
30
|
end
|
33
31
|
|
34
|
-
def close
|
35
|
-
@output.close
|
36
|
-
end
|
37
|
-
|
38
32
|
def write_record(record)
|
39
|
-
#
|
33
|
+
# the index of the record being written is the same as the count of records written so far
|
34
|
+
record_index = @count
|
35
|
+
|
36
|
+
# get the current output stream position to calculate bytes written
|
37
|
+
start_offset = output.io.tell
|
38
|
+
|
39
|
+
# enforces ending in newline if it doesn't already have one
|
40
40
|
output.io.puts record.to_s
|
41
|
+
|
42
|
+
ending_offset = output.io.tell
|
43
|
+
bytes_written = (ending_offset - start_offset)
|
44
|
+
|
41
45
|
@count += 1
|
46
|
+
|
47
|
+
@last_position = ::FlatKit::Position.new(index: record_index,
|
48
|
+
offset: start_offset,
|
49
|
+
bytesize: bytes_written)
|
50
|
+
|
42
51
|
end
|
43
52
|
end
|
44
53
|
end
|
data/lib/flat_kit/merge.rb
CHANGED
@@ -28,8 +28,10 @@ module FlatKit
|
|
28
28
|
|
29
29
|
notify_listeners(name: :start, data: :start)
|
30
30
|
merge_tree.each do |record|
|
31
|
-
|
32
|
-
|
31
|
+
|
32
|
+
position = writer.write(record)
|
33
|
+
meta = { position: position }
|
34
|
+
notify_listeners(name: :record, data: record, meta: meta)
|
33
35
|
end
|
34
36
|
notify_listeners(name: :stop, data: :stop)
|
35
37
|
|
data/lib/flat_kit/output.rb
CHANGED
@@ -0,0 +1,19 @@
|
|
1
|
+
module FlatKit
  # Describes where a single record sits inside an IO stream.
  #
  # Writers hand one of these back after writing a record so the caller knows
  # which record it was, where it starts and how many bytes it occupies.
  #
  class Position
    # index    - zero based record number
    # offset   - byte offset of the record in the IO stream
    # bytesize - byte length of the record
    attr_reader :index, :offset, :bytesize

    # Every attribute is optional and is nil when not supplied.
    def initialize(index: nil, offset: nil, bytesize: nil)
      @index, @offset, @bytesize = index, offset, bytesize
    end
  end
end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
module FlatKit
  # Base class of the statistics collectors, and the lookup from a field type
  # to the collector class that should be used for it.
  class StatType
    # Field types whose values are only counted and compared for equality.
    def self.nominal_types
      [FieldType::BooleanType, FieldType::StringType, FieldType::NullType ]
    end

    # Field types whose values can additionally be ordered.
    def self.ordinal_types
      [FieldType::DateType, FieldType::TimestampType]
    end

    # Field types whose values support arithmetic.
    def self.numerical_types
      [FieldType::FloatType, FieldType::IntegerType]
    end

    # Return the stats class to use for the given field type.
    #
    # Raises ArgumentError when the type is in none of the three lists.
    def self.for(type)
      return OrdinalStats if ordinal_types.include?(type)
      return NominalStats if nominal_types.include?(type)
      return NumericalStats if numerical_types.include?(type)
      raise ArgumentError, "Unknown stat type for #{type}"
    end

    # The names of the stats this object reports by default. Subclasses must
    # override this; the base implementation raises NotImplementedError.
    def collected_stats
      raise NotImplementedError, "#{self.class.name} must implement #collected_stats"
    end

    #
    # call-seq:
    #   stat.to_hash   -> Hash
    #   stat.to_hash( %w[ count max mean ]) -> Hash
    #
    # return a hash of the stats. By default this returns a hash of all the
    # stats named by #collected_stats, but passing in an array of items will
    # limit the stats returned to only those in the Array.
    #
    # If passed an empty array or nil then #collected_stats is used as the
    # list of stats to return in the hash. nil entries are dropped before the
    # list is used, which is what makes the nil case work.
    #
    def to_hash( *args )
      names = args.flatten.compact
      names = collected_stats if names.empty?
      names.each_with_object({}) do |meth, h|
        h[meth] = send( meth )
      end
    end

    #
    # call-seq:
    #   stat.to_json  -> String
    #   stat.to_json( *args ) -> String
    #
    # return a json string of the stats. The arguments are handed to #to_hash
    # unchanged, so they select the stats in the same way, and the resulting
    # hash is serialized with Oj.
    #
    def to_json( *args )
      Oj.dump(to_hash( *args ))
    end
  end
end
|
63
|
+
require 'flat_kit/stat_type/nominal_stats'
|
64
|
+
require 'flat_kit/stat_type/ordinal_stats'
|
65
|
+
require 'flat_kit/stat_type/numerical_stats'
|
@@ -0,0 +1,58 @@
|
|
1
|
+
module FlatKit
  class StatType

    # Stats object to keep track of the count and, optionally, the frequency
    # of each value it is given.
    #
    class NominalStats < StatType

      attr_reader :count

      # Stats reported when frequencies are not being collected.
      def self.default_stats
        @default_stats ||= %w[ count ]
      end

      # Stats reported when frequencies are being collected.
      def self.all_stats
        @all_stats ||= %w[ count unique_count unique_values mode ]
      end

      # collecting_frequencies - when true, a per-value tally is kept, which
      #                          enables mode, unique_count, unique_values
      #                          and frequencies.
      def initialize(collecting_frequencies: false)
        @mutex = Mutex.new
        @count = 0
        @collecting_frequencies = collecting_frequencies
        @frequencies = Hash.new(0)
      end

      # The names of the stats that are meaningful for this instance.
      def collected_stats
        return self.class.default_stats unless @collecting_frequencies
        return self.class.all_stats
      end

      # The value seen most often.
      #
      # Returns nil when frequencies are not collected, and also when no value
      # has been recorded yet (there is no most frequent value of nothing).
      def mode
        return nil unless @collecting_frequencies
        most_frequent = @frequencies.max_by { |_item, item_count| item_count }
        most_frequent&.first
      end

      # Number of distinct values seen, or nil when frequencies are not
      # collected.
      def unique_count
        return nil unless @collecting_frequencies
        @frequencies.size
      end

      # The distinct values seen, or nil when frequencies are not collected.
      def unique_values
        return nil unless @collecting_frequencies
        @frequencies.keys
      end

      # The value => occurrences tally, or nil when frequencies are not
      # collected.
      def frequencies
        return nil unless @collecting_frequencies
        @frequencies
      end

      # Record one more value. The counters are changed while holding the
      # instance mutex.
      def update(value)
        @mutex.synchronize do
          @count += 1
          @frequencies[value] += 1 if @collecting_frequencies
        end
      end
    end
  end
end
|
@@ -0,0 +1,120 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2008, 2009 Jeremy Hinegardner
|
3
|
+
# All rights reserved. See LICENSE and/or COPYING for details.
|
4
|
+
#
|
5
|
+
# Pulled from Hitimes, which I also wrote
|
6
|
+
#++
|
7
|
+
|
8
|
+
require 'thread'
|
9
|
+
require 'oj'
|
10
|
+
|
11
|
+
module FlatKit
  class StatType
    #
    # Stats object will keep track of the _min_, _max_, _count_, _sum_ and _sumsq_
    # and when you want you may also retrieve the _mean_, _stddev_ and _rate_.
    #
    # this contrived example shows getting a list of all the files in a directory
    # and running stats on file sizes.
    #
    #     s = FlatKit::StatType::NumericalStats.new
    #     dir = ARGV.shift || Dir.pwd
    #     Dir.entries( dir ).each do |entry|
    #       fs = File.stat( entry )
    #       if fs.file? then
    #         s.update( fs.size )
    #        end
    #     end
    #
    #     %w[ count min max mean sum stddev rate ].each do |m|
    #       puts "#{m.rjust(6)} : #{s.send( m ) }"
    #     end
    #
    class NumericalStats < NominalStats

      attr_reader :min
      attr_reader :max
      attr_reader :sum
      attr_reader :sumsq

      # Stats reported when frequencies are not being collected.
      def self.default_stats
        @default_stats ||= %w[ count max mean min rate stddev sum sumsq ]
      end

      # Stats reported when frequencies are being collected.
      def self.all_stats
        @all_stats ||= %w[ count max mean min mode rate stddev sum sumsq unique_count unique_values ]
      end

      # min and max start at +/- infinity so that the first value given to
      # #update replaces both of them.
      def initialize(collecting_frequencies: false)
        super
        @min   =  Float::INFINITY
        @max   = -Float::INFINITY
        @sum   = 0.0
        @sumsq = 0.0
      end

      # call-seq:
      #    stat.update( val ) -> val
      #
      # Update the running stats with the new value.
      # Return the input value.
      def update(value)
        @mutex.synchronize do
          @min = (value < @min) ? value : @min
          @max = (value > @max) ? value : @max

          @count += 1
          @sum   += value
          @sumsq += (value * value)

          # same bookkeeping as the Nominal update
          @frequencies[value] += 1 if @collecting_frequencies
        end

        return value
      end

      # call-seq:
      #    stat.mean -> Float
      #
      # Return the arithmetic mean of the values put into the Stats object.  If no
      # values have passed through the stats object then 0.0 is returned;
      def mean
        return 0.0 if @count.zero?
        return @sum / @count
      end

      # call-seq:
      #    stat.rate -> Float
      #
      # Return the +count+ divided by +sum+.
      #
      # In many cases when Stats#update( _value_ ) is called, the _value_ is a unit
      # of time, typically seconds or microseconds.  #rate is a convenience for those
      # times.  In this case, where _value_ is a unit of time, then count divided by
      # sum is a useful value, i.e. +something per unit of time+.
      #
      # In the case where _value_ is a non-time related value, then the value
      # returned by _rate_ is not really useful.
      #
      def rate
        return 0.0 if @sum.zero?
        return @count / @sum
      end

      #
      # call-seq:
      #    stat.stddev -> Float
      #
      # Return the standard deviation of all the values that have passed through the
      # Stats object.  The standard deviation has no meaning unless the count is > 1,
      # therefore if the current _stat.count_ is <= 1 then 0.0 will be returned;
      #
      def stddev
        return 0.0 unless @count > 1
        variance = (@sumsq - ((@sum * @sum) / @count)) / (@count - 1)
        # Floating point rounding in the sum-of-squares shortcut can leave the
        # variance a hair below zero for (nearly) identical values, and
        # Math.sqrt raises on a negative argument, so clamp it at zero.
        Math.sqrt(variance < 0 ? 0.0 : variance)
      end
    end
  end
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
module FlatKit
  class StatType
    # Everything NominalStats tracks, plus the smallest and largest value seen.
    #
    class OrdinalStats < NominalStats

      attr_reader :min, :max

      # Stats reported when frequencies are not being collected.
      def self.default_stats
        @default_stats ||= %w[ count max min ]
      end

      # Stats reported when frequencies are being collected.
      def self.all_stats
        @all_stats ||= %w[ count max min unique_count unique_values mode ]
      end

      # min and max stay nil until the first value arrives.
      def initialize(collecting_frequencies: false)
        super
        @min = @max = nil
      end

      # Record one more value, widening min / max when the value falls outside
      # the range seen so far. All changes happen while holding the instance
      # mutex.
      def update(value)
        @mutex.synchronize do
          @min = value if @min.nil? || (value < @min)
          @max = value if @max.nil? || (value > @max)

          @count += 1

          @frequencies[value] += 1 if @collecting_frequencies
        end
      end
    end
  end
end
|