flat_kit 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/HISTORY.md +6 -0
- data/Manifest.txt +34 -0
- data/Rakefile +1 -0
- data/examples/stream-active-record-to-csv.rb +42 -0
- data/lib/flat_kit.rb +9 -2
- data/lib/flat_kit/cli.rb +11 -0
- data/lib/flat_kit/command.rb +1 -0
- data/lib/flat_kit/command/stats.rb +94 -0
- data/lib/flat_kit/descendant_tracker.rb +9 -0
- data/lib/flat_kit/event_emitter.rb +2 -2
- data/lib/flat_kit/field_stats.rb +241 -0
- data/lib/flat_kit/field_type.rb +75 -0
- data/lib/flat_kit/field_type/boolean_type.rb +48 -0
- data/lib/flat_kit/field_type/date_type.rb +179 -0
- data/lib/flat_kit/field_type/float_type.rb +37 -0
- data/lib/flat_kit/field_type/guess_type.rb +20 -0
- data/lib/flat_kit/field_type/integer_type.rb +34 -0
- data/lib/flat_kit/field_type/null_type.rb +35 -0
- data/lib/flat_kit/field_type/string_type.rb +22 -0
- data/lib/flat_kit/field_type/timestamp_type.rb +47 -0
- data/lib/flat_kit/field_type/unknown_type.rb +26 -0
- data/lib/flat_kit/jsonl/record.rb +2 -2
- data/lib/flat_kit/jsonl/writer.rb +18 -9
- data/lib/flat_kit/merge.rb +4 -2
- data/lib/flat_kit/output.rb +4 -0
- data/lib/flat_kit/position.rb +19 -0
- data/lib/flat_kit/stat_type.rb +65 -0
- data/lib/flat_kit/stat_type/nominal_stats.rb +58 -0
- data/lib/flat_kit/stat_type/numerical_stats.rb +120 -0
- data/lib/flat_kit/stat_type/ordinal_stats.rb +42 -0
- data/lib/flat_kit/stats.rb +66 -0
- data/lib/flat_kit/writer.rb +17 -2
- data/lib/flat_kit/xsv/writer.rb +24 -9
- data/test/field_type/test_boolean_type.rb +65 -0
- data/test/field_type/test_date_type.rb +71 -0
- data/test/field_type/test_float_type.rb +56 -0
- data/test/field_type/test_guess_type.rb +14 -0
- data/test/field_type/test_integer_type.rb +52 -0
- data/test/field_type/test_null_type.rb +41 -0
- data/test/field_type/test_string_type.rb +18 -0
- data/test/field_type/test_timestamp_type.rb +108 -0
- data/test/field_type/test_unknown_type.rb +35 -0
- data/test/jsonl/test_writer.rb +21 -3
- data/test/run +23 -0
- data/test/stat_type/test_nominal_stats.rb +69 -0
- data/test/stat_type/test_numerical_stats.rb +118 -0
- data/test/stat_type/test_ordinal_stats.rb +92 -0
- data/test/test_event_emitter.rb +19 -2
- data/test/test_field_stats.rb +134 -0
- data/test/test_field_type.rb +34 -0
- data/test/xsv/test_writer.rb +25 -4
- metadata +65 -2
@@ -0,0 +1,47 @@
|
|
1
|
+
module FlatKit
  class FieldType
    # Field type for values that already are a Time, or are a String that can
    # be parsed into one using one of the known timestamp formats.
    class TimestampType < FieldType

      # The strptime format strings that are tried, in order, when coercing a
      # String. The list is built on first use and frozen.
      def self.parse_formats
        return @timestamp_formats if @timestamp_formats

        @timestamp_formats = [
          "%Y-%m-%d %H:%M:%S.%NZ",
          "%Y-%m-%d %H:%M:%S.%N",
          "%Y-%m-%dT%H:%M:%S.%N%z", # w3cdtf
          "%Y-%m-%d %H:%M:%S",
          "%Y-%m-%dT%H:%M:%S%z",
          "%Y-%m-%dT%H:%M:%SZ",
          "%Y%m%dT%H%M%S",
          "%a, %d %b %Y %H:%M:%S %z", # rfc2822, httpdate
        ].freeze
      end

      # The name this field type is known by.
      def self.type_name
        "timestamp"
      end

      # True when the data can be coerced into a Time.
      def self.matches?(data)
        coerce(data).kind_of?(Time)
      end

      # Convert the data to a Time.
      #
      # A Time is returned untouched. A String is run through each of the
      # parse_formats in order and the first successful parse is returned,
      # converted to UTC. Anything else, or a String that matches none of the
      # formats, results in CoerceFailure.
      def self.coerce(data)
        return data if data.kind_of?(Time)
        return CoerceFailure unless data.kind_of?(String)

        parse_formats.each do |format|
          begin
            return Time.strptime(data, format).utc
          rescue StandardError
            # this format did not fit, move on to the next one
            next
          end
        end

        CoerceFailure
      end
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module FlatKit
  class FieldType
    # Field type for placeholder values that stand in for missing data: the
    # empty string, or one of na, n/a, unk, unknown (case insensitive).
    class UnknownType < FieldType

      REGEX = %r{\A(na|n/a|unk|unknown)\Z}i

      # The name this field type is known by.
      def self.type_name
        "unknown"
      end

      # True when the data is a String that is either empty or one of the
      # placeholder words in REGEX. Non-String data never matches.
      def self.matches?(data)
        return false unless data.kind_of?(String)
        return true if data.length == 0
        return REGEX.match?(data)
      end

      # Return the data unchanged when it is an unknown-style placeholder,
      # CoerceFailure otherwise.
      #
      # Every value accepted by matches? is accepted here as well, so the empty
      # string coerces to itself instead of failing.
      def self.coerce(data)
        return data if matches?(data)

        # kept for backward compatibility: non-String data that the regex can
        # still be applied to (a Symbol for instance) is returned as is
        return data if REGEX.match?(data)

        CoerceFailure
      rescue
        # data the regex cannot be applied to at all is a failure, not an error
        CoerceFailure
      end

    end
  end
end
|
@@ -48,7 +48,7 @@ module FlatKit
|
|
48
48
|
end
|
49
49
|
|
50
50
|
def complete_structured_data
|
51
|
-
@complete_structured_data ||= Oj.load(data)
|
51
|
+
@complete_structured_data ||= Oj.load(data, mode: :strict)
|
52
52
|
end
|
53
53
|
alias to_hash complete_structured_data
|
54
54
|
|
@@ -60,7 +60,7 @@ module FlatKit
|
|
60
60
|
# to parse
|
61
61
|
def data
|
62
62
|
if @data.nil? && complete_structured_data? then
|
63
|
-
@data = Oj.dump(complete_structured_data)
|
63
|
+
@data = Oj.dump(complete_structured_data, mode: :json)
|
64
64
|
end
|
65
65
|
@data
|
66
66
|
end
|
@@ -1,8 +1,6 @@
|
|
1
1
|
module FlatKit
|
2
2
|
module Jsonl
|
3
3
|
class Writer < ::FlatKit::Writer
|
4
|
-
attr_reader :output
|
5
|
-
attr_reader :count
|
6
4
|
|
7
5
|
def self.format_name
|
8
6
|
::FlatKit::Jsonl::Format.format_name
|
@@ -10,10 +8,10 @@ module FlatKit
|
|
10
8
|
|
11
9
|
def initialize(destination:)
|
12
10
|
super
|
13
|
-
@output = ::FlatKit::Output.from(@destination)
|
14
|
-
@count = 0
|
15
11
|
end
|
16
12
|
|
13
|
+
# write the record and return the Position the record was written
|
14
|
+
#
|
17
15
|
def write(record)
|
18
16
|
case record
|
19
17
|
when FlatKit::Jsonl::Record
|
@@ -31,14 +29,25 @@ module FlatKit
|
|
31
29
|
raise ::FlatKit::Error, e
|
32
30
|
end
|
33
31
|
|
34
|
-
def close
|
35
|
-
@output.close
|
36
|
-
end
|
37
|
-
|
38
32
|
def write_record(record)
|
39
|
-
#
|
33
|
+
# the index of the record being written is the same as the count of records written so far
|
34
|
+
record_index = @count
|
35
|
+
|
36
|
+
# get the current output stream position to calculate bytes written
|
37
|
+
start_offset = output.io.tell
|
38
|
+
|
39
|
+
# enforces ending in newline if it doesn't already have one
|
40
40
|
output.io.puts record.to_s
|
41
|
+
|
42
|
+
ending_offset = output.io.tell
|
43
|
+
bytes_written = (ending_offset - start_offset)
|
44
|
+
|
41
45
|
@count += 1
|
46
|
+
|
47
|
+
@last_position = ::FlatKit::Position.new(index: record_index,
|
48
|
+
offset: start_offset,
|
49
|
+
bytesize: bytes_written)
|
50
|
+
|
42
51
|
end
|
43
52
|
end
|
44
53
|
end
|
data/lib/flat_kit/merge.rb
CHANGED
@@ -28,8 +28,10 @@ module FlatKit
|
|
28
28
|
|
29
29
|
notify_listeners(name: :start, data: :start)
|
30
30
|
merge_tree.each do |record|
|
31
|
-
|
32
|
-
|
31
|
+
|
32
|
+
position = writer.write(record)
|
33
|
+
meta = { position: position }
|
34
|
+
notify_listeners(name: :record, data: record, meta: meta)
|
33
35
|
end
|
34
36
|
notify_listeners(name: :stop, data: :stop)
|
35
37
|
|
data/lib/flat_kit/output.rb
CHANGED
@@ -0,0 +1,19 @@
|
|
1
|
+
module FlatKit
  # Describes where a single record sits inside an IO stream.
  #
  # Generally an instance of this is what a write_record method hands back to
  # describe the record it just wrote.
  #
  class Position

    # index    - zero based index of the record
    # offset   - byte offset of the record in the IO stream
    # bytesize - byte length of the record
    attr_reader :index, :offset, :bytesize

    # Every value is optional and is nil when not given.
    def initialize(index: nil, offset: nil, bytesize: nil)
      @index, @offset, @bytesize = index, offset, bytesize
    end
  end
end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
module FlatKit
  # Base class of the statistics collectors. It maps a field type to the
  # collector class used for it and provides the hash / json export shared by
  # all collectors.
  class StatType
    # The field types whose values are only counted.
    def self.nominal_types
      [FieldType::BooleanType, FieldType::StringType, FieldType::NullType ]
    end

    # The field types whose values also have an order (a min and a max).
    def self.ordinal_types
      [FieldType::DateType, FieldType::TimestampType]
    end

    # The field types whose values are numbers.
    def self.numerical_types
      [FieldType::FloatType, FieldType::IntegerType]
    end

    # Return the stats class to use for the given field type.
    #
    # Raises ArgumentError when the type is in none of the known lists.
    def self.for(type)
      return OrdinalStats if ordinal_types.include?(type)
      return NominalStats if nominal_types.include?(type)
      return NumericalStats if numerical_types.include?(type)
      raise ArgumentError, "Unknown stat type for #{type}"
    end

    # The names of the stats this instance reports by default. Subclasses must
    # implement this.
    def collected_stats
      raise NotImplementedError, "#{self.class.name} must implement #collected_stats"
    end

    #
    # call-seq:
    #   stat.to_hash   -> Hash
    #   stat.to_hash( %w[ count max mean ]) -> Hash
    #
    # return a hash of the stats. By default this returns a hash of all the
    # #collected_stats but passing in an array of items will limit the stats
    # returned to only those in the Array.
    #
    # If passed in an empty array or nil to to_hash then #collected_stats is
    # assumed to be the list of stats to return in the hash.
    #
    def to_hash( *args )
      # compact so an explicit nil behaves the same as passing nothing
      names = args.flatten.compact
      names = collected_stats if names.empty?

      # public_send: only the public stat readers may be reached by name
      names.each_with_object({}) do |meth, h|
        h[meth] = public_send( meth )
      end
    end

    #
    # call-seq:
    #   stat.to_json  -> String
    #   stat.to_json( *args ) -> String
    #
    # return a json string of the stats. The arguments are handed to #to_hash
    # unchanged, so they select the stats in the same way.
    #
    def to_json( *args )
      h = to_hash( *args )
      Oj.dump(h)
    end
  end
end
|
63
|
+
require 'flat_kit/stat_type/nominal_stats'
|
64
|
+
require 'flat_kit/stat_type/ordinal_stats'
|
65
|
+
require 'flat_kit/stat_type/numerical_stats'
|
@@ -0,0 +1,58 @@
|
|
1
|
+
module FlatKit
  class StatType

    # Stats object to keep track of the count and, optionally, the frequency
    # of the values it is updated with.
    #
    class NominalStats < StatType

      attr_reader :count

      # The stats reported when frequencies are not being collected.
      def self.default_stats
        @default_stats ||= %w[ count ]
      end

      # The stats reported when frequencies are being collected.
      def self.all_stats
        @all_stats ||= %w[ count unique_count unique_values mode ]
      end

      # collecting_frequencies - when true, also keep a tally per distinct
      #                          value so mode, unique_count and unique_values
      #                          are available.
      def initialize(collecting_frequencies: false)
        @mutex = Mutex.new
        @count = 0
        @collecting_frequencies = collecting_frequencies
        @frequencies = Hash.new(0)
      end

      # The names of the stats this instance reports: all_stats when
      # frequencies are collected, default_stats otherwise.
      def collected_stats
        return self.class.default_stats unless @collecting_frequencies
        return self.class.all_stats
      end

      # The most frequently seen value. nil when frequencies are not being
      # collected, or when no value has been recorded yet.
      def mode
        return nil unless @collecting_frequencies
        top = @frequencies.max_by{ |_item, item_count| item_count }
        # max_by returns nil for an empty tally, there is no mode in that case
        top && top.first
      end

      # The number of distinct values seen, nil when not collecting
      # frequencies.
      def unique_count
        return nil unless @collecting_frequencies
        @frequencies.size
      end

      # The distinct values seen, nil when not collecting frequencies.
      def unique_values
        return nil unless @collecting_frequencies
        @frequencies.keys
      end

      # The value => count tally, nil when not collecting frequencies.
      def frequencies
        return nil unless @collecting_frequencies
        @frequencies
      end

      # Record one more value. The counters are changed while holding the
      # mutex.
      def update(value)
        @mutex.synchronize do
          @count += 1
          @frequencies[value] += 1 if @collecting_frequencies
        end
      end
    end
  end
end
|
@@ -0,0 +1,120 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2008, 2009 Jeremy Hinegardner
|
3
|
+
# All rights reserved. See LICENSE and/or COPYING for details.
|
4
|
+
#
|
5
|
+
# Pulled from Hitimes, which I also wrote
|
6
|
+
#++
|
7
|
+
|
8
|
+
require 'thread'
|
9
|
+
require 'oj'
|
10
|
+
|
11
|
+
module FlatKit
  class StatType
    #
    # Stats object will keep track of the _min_, _max_, _count_, _sum_ and _sumsq_
    # and when you want you may also retrieve the _mean_, _stddev_ and _rate_.
    #
    # this contrived example shows getting a list of all the files in a directory
    # and running stats on file sizes.
    #
    #     s = FlatKit::StatType::NumericalStats.new
    #     dir = ARGV.shift || Dir.pwd
    #     Dir.entries( dir ).each do |entry|
    #       fs = File.stat( File.join( dir, entry ) )
    #       if fs.file? then
    #         s.update( fs.size )
    #       end
    #     end
    #
    #     %w[ count min max mean sum stddev rate ].each do |m|
    #       puts "#{m.rjust(6)} : #{s.send( m ) }"
    #     end
    #
    class NumericalStats < NominalStats

      attr_reader :min
      attr_reader :max
      attr_reader :sum
      attr_reader :sumsq

      # The stats reported when frequencies are not being collected.
      def self.default_stats
        @default_stats ||= %w[ count max mean min rate stddev sum sumsq ]
      end

      # The stats reported when frequencies are being collected.
      def self.all_stats
        @all_stats ||= %w[ count max mean min mode rate stddev sum sumsq unique_count unique_values ]
      end

      # Until the first update, min is +Infinity and max is -Infinity so that
      # any value replaces them.
      def initialize(collecting_frequencies: false)
        super
        @min = Float::INFINITY
        @max = -Float::INFINITY
        @sum = 0.0
        @sumsq = 0.0
      end

      # call-seq:
      #    stat.update( val ) -> val
      #
      # Update the running stats with the new value.
      # Return the input value.
      def update(value)
        @mutex.synchronize do
          @min = (value < @min) ? value : @min
          @max = (value > @max) ? value : @max

          @count += 1
          @sum   += value
          @sumsq += (value * value)

          # same tally as NominalStats#update, repeated here instead of
          # calling super
          @frequencies[value] += 1 if @collecting_frequencies
        end

        return value
      end

      # call-seq:
      #    stat.mean -> Float
      #
      # Return the arithmetic mean of the values put into the Stats object.  If no
      # values have passed through the stats object then 0.0 is returned;
      def mean
        return 0.0 if @count.zero?
        return @sum / @count
      end

      # call-seq:
      #    stat.rate -> Float
      #
      # Return the +count+ divided by +sum+. If the sum is zero then 0.0 is
      # returned.
      #
      # In many cases when Stats#update( _value_ ) is called, the _value_ is a unit
      # of time, typically seconds or microseconds.  #rate is a convenience for those
      # times.  In this case, where _value_ is a unit of time, then count divided by
      # sum is a useful value, i.e. +something per unit of time+.
      #
      # In the case where _value_ is a non-time related value, then the value
      # returned by _rate_ is not really useful.
      #
      def rate
        return 0.0 if @sum.zero?
        return @count / @sum
      end

      #
      # call-seq:
      #    stat.stddev -> Float
      #
      # Return the standard deviation of all the values that have passed through the
      # Stats object.  The standard deviation has no meaning unless the count is > 1,
      # therefore if the current _stat.count_ is <= 1 then 0.0 will be returned;
      #
      def stddev
        return 0.0 unless @count > 1
        variance = (@sumsq - ((@sum * @sum)/@count)) / (@count - 1)

        # Floating point rounding in the sum-of-squares formula can leave the
        # variance a hair below zero (for instance when every value is the
        # same), and Math.sqrt raises Math::DomainError for a negative number.
        variance = 0.0 if variance < 0
        Math.sqrt(variance)
      end
    end
  end
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
module FlatKit
  class StatType
    # Everything NominalStats tracks, plus the smallest and the largest value
    # seen.
    #
    class OrdinalStats < NominalStats

      attr_reader :min, :max

      # The stats reported when frequencies are not being collected.
      def self.default_stats
        @default_stats ||= %w[ count max min ]
      end

      # The stats reported when frequencies are being collected.
      def self.all_stats
        @all_stats ||= %w[ count max min unique_count unique_values mode ]
      end

      # min and max stay nil until the first value is recorded.
      def initialize(collecting_frequencies: false)
        super
        @min = @max = nil
      end

      # Record one more value, adjusting min and max as needed. All of the
      # bookkeeping is done while holding the mutex.
      def update(value)
        @mutex.synchronize do
          @min = value if @min.nil? || (value < @min)
          @max = value if @max.nil? || (value > @max)

          @count += 1

          @frequencies[value] += 1 if @collecting_frequencies
        end
      end
    end
  end
end
|