flat_kit 0.3.0 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CONTRIBUTING.md +1 -2
- data/HISTORY.md +9 -0
- data/Manifest.txt +3 -42
- data/{bin → exe}/fk +2 -1
- data/flat_kit.gemspec +33 -0
- data/lib/flat_kit/cli.rb +46 -32
- data/lib/flat_kit/command/cat.rb +34 -32
- data/lib/flat_kit/command/merge.rb +37 -36
- data/lib/flat_kit/command/sort.rb +37 -37
- data/lib/flat_kit/command/stats.rb +41 -39
- data/lib/flat_kit/command.rb +10 -11
- data/lib/flat_kit/descendant_tracker.rb +9 -6
- data/lib/flat_kit/error.rb +4 -0
- data/lib/flat_kit/event_emitter.rb +5 -2
- data/lib/flat_kit/field_stats.rb +31 -26
- data/lib/flat_kit/field_type/boolean_type.rb +9 -5
- data/lib/flat_kit/field_type/date_type.rb +19 -17
- data/lib/flat_kit/field_type/float_type.rb +15 -9
- data/lib/flat_kit/field_type/guess_type.rb +9 -6
- data/lib/flat_kit/field_type/integer_type.rb +6 -4
- data/lib/flat_kit/field_type/null_type.rb +5 -1
- data/lib/flat_kit/field_type/string_type.rb +8 -6
- data/lib/flat_kit/field_type/timestamp_type.rb +11 -10
- data/lib/flat_kit/field_type/unknown_type.rb +12 -8
- data/lib/flat_kit/field_type.rb +52 -44
- data/lib/flat_kit/format.rb +11 -5
- data/lib/flat_kit/input/file.rb +11 -9
- data/lib/flat_kit/input/io.rb +18 -21
- data/lib/flat_kit/input.rb +8 -7
- data/lib/flat_kit/internal_node.rb +22 -19
- data/lib/flat_kit/jsonl/format.rb +6 -2
- data/lib/flat_kit/jsonl/reader.rb +7 -4
- data/lib/flat_kit/jsonl/record.rb +15 -18
- data/lib/flat_kit/jsonl/writer.rb +8 -10
- data/lib/flat_kit/jsonl.rb +8 -4
- data/lib/flat_kit/leaf_node.rb +6 -5
- data/lib/flat_kit/log_formatter.rb +20 -0
- data/lib/flat_kit/logger.rb +12 -19
- data/lib/flat_kit/merge.rb +21 -18
- data/lib/flat_kit/merge_tree.rb +5 -6
- data/lib/flat_kit/output/file.rb +13 -9
- data/lib/flat_kit/output/io.rb +40 -35
- data/lib/flat_kit/output.rb +8 -7
- data/lib/flat_kit/position.rb +3 -4
- data/lib/flat_kit/reader.rb +8 -8
- data/lib/flat_kit/record.rb +12 -12
- data/lib/flat_kit/sentinel_internal_node.rb +6 -5
- data/lib/flat_kit/sentinel_leaf_node.rb +4 -1
- data/lib/flat_kit/sort.rb +8 -9
- data/lib/flat_kit/stat_type/nominal_stats.rb +13 -7
- data/lib/flat_kit/stat_type/numerical_stats.rb +18 -18
- data/lib/flat_kit/stat_type/ordinal_stats.rb +8 -13
- data/lib/flat_kit/stat_type.rb +18 -13
- data/lib/flat_kit/stats.rb +12 -14
- data/lib/flat_kit/writer.rb +5 -6
- data/lib/flat_kit/xsv/format.rb +6 -2
- data/lib/flat_kit/xsv/reader.rb +8 -6
- data/lib/flat_kit/xsv/record.rb +21 -15
- data/lib/flat_kit/xsv/writer.rb +13 -10
- data/lib/flat_kit/xsv.rb +7 -4
- data/lib/flat_kit.rb +31 -26
- metadata +20 -158
- data/Rakefile +0 -21
- data/examples/stream-active-record-to-csv.rb +0 -42
- data/tasks/default.rake +0 -242
- data/tasks/extension.rake +0 -38
- data/tasks/man.rake +0 -7
- data/tasks/this.rb +0 -208
- data/test/device_dataset.rb +0 -117
- data/test/field_type/test_boolean_type.rb +0 -65
- data/test/field_type/test_date_type.rb +0 -71
- data/test/field_type/test_float_type.rb +0 -56
- data/test/field_type/test_guess_type.rb +0 -14
- data/test/field_type/test_integer_type.rb +0 -52
- data/test/field_type/test_null_type.rb +0 -41
- data/test/field_type/test_string_type.rb +0 -18
- data/test/field_type/test_timestamp_type.rb +0 -108
- data/test/field_type/test_unknown_type.rb +0 -35
- data/test/input/test_file.rb +0 -73
- data/test/input/test_io.rb +0 -93
- data/test/jsonl/test_format.rb +0 -22
- data/test/jsonl/test_reader.rb +0 -49
- data/test/jsonl/test_record.rb +0 -61
- data/test/jsonl/test_writer.rb +0 -86
- data/test/output/test_file.rb +0 -60
- data/test/output/test_io.rb +0 -104
- data/test/run +0 -23
- data/test/stat_type/test_nominal_stats.rb +0 -69
- data/test/stat_type/test_numerical_stats.rb +0 -118
- data/test/stat_type/test_ordinal_stats.rb +0 -92
- data/test/test_conversions.rb +0 -45
- data/test/test_event_emitter.rb +0 -89
- data/test/test_field_stats.rb +0 -134
- data/test/test_field_type.rb +0 -34
- data/test/test_format.rb +0 -24
- data/test/test_helper.rb +0 -26
- data/test/test_merge.rb +0 -40
- data/test/test_merge_tree.rb +0 -64
- data/test/test_version.rb +0 -11
- data/test/xsv/test_format.rb +0 -22
- data/test/xsv/test_reader.rb +0 -61
- data/test/xsv/test_record.rb +0 -69
- data/test/xsv/test_writer.rb +0 -89
data/lib/flat_kit/logger.rb
CHANGED
@@ -1,19 +1,12 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
2
|
|
3
|
-
|
4
|
-
class LogFormatter < ::Logger::Formatter
|
5
|
-
FORMAT = "%s %5d %05s : %s\n".freeze
|
6
|
-
DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ".freeze
|
7
|
-
def initialize
|
8
|
-
super
|
9
|
-
self.datetime_format = DATETIME_FORMAT
|
10
|
-
end
|
11
|
-
|
12
|
-
def call(severity, time, progname, msg)
|
13
|
-
FORMAT % [format_datetime(time.utc), Process.pid, severity, msg2str(msg)]
|
14
|
-
end
|
15
|
-
end
|
3
|
+
require "logger"
|
16
4
|
|
5
|
+
# Public: Top level namespace for the gem
|
6
|
+
#
|
7
|
+
module FlatKit
|
8
|
+
# Internal: Logger class
|
9
|
+
#
|
17
10
|
class Logger
|
18
11
|
def self.for_io(io)
|
19
12
|
::Logger.new(io, formatter: LogFormatter.new)
|
@@ -26,11 +19,11 @@ module FlatKit
|
|
26
19
|
end
|
27
20
|
|
28
21
|
def self.log_to(destination = $stderr)
|
29
|
-
if destination.
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
22
|
+
@logger = if destination.is_a?(::IO)
|
23
|
+
::FlatKit::Logger.for_io(destination)
|
24
|
+
else
|
25
|
+
::FlatKit::Logger.for_path(destination)
|
26
|
+
end
|
34
27
|
end
|
35
28
|
|
36
29
|
def self.logger
|
data/lib/flat_kit/merge.rb
CHANGED
@@ -1,15 +1,14 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
4
|
+
# Internal: Class implementing merging from N inputs and output to 1 output.
|
5
|
+
#
|
2
6
|
class Merge
|
3
|
-
|
4
7
|
include ::FlatKit::EventEmitter
|
5
8
|
|
6
|
-
attr_reader :readers
|
7
|
-
attr_reader :writer
|
8
|
-
attr_reader :compare_fields
|
9
|
+
attr_reader :readers, :writer, :compare_fields
|
9
10
|
|
10
|
-
def initialize(inputs:, input_fallback: "auto",
|
11
|
-
output:, output_fallback: "auto",
|
12
|
-
compare_fields:)
|
11
|
+
def initialize(inputs:, output:, compare_fields:, input_fallback: "auto", output_fallback: "auto")
|
13
12
|
@compare_fields = compare_fields
|
14
13
|
@readers = ::FlatKit::Reader.create_readers_from_paths(paths: inputs, compare_fields: @compare_fields,
|
15
14
|
fallback: input_fallback)
|
@@ -19,21 +18,12 @@ module FlatKit
|
|
19
18
|
|
20
19
|
def call
|
21
20
|
::FlatKit.logger.debug "Merging the following files into #{writer.destination}"
|
22
|
-
::FlatKit.logger.debug "Using this key for sorting: #{compare_fields.join(
|
21
|
+
::FlatKit.logger.debug "Using this key for sorting: #{compare_fields.join(', ')}"
|
23
22
|
readers.each do |r|
|
24
23
|
::FlatKit.logger.debug " #{r.source}"
|
25
24
|
end
|
26
25
|
|
27
|
-
|
28
|
-
|
29
|
-
notify_listeners(name: :start, data: :start)
|
30
|
-
merge_tree.each do |record|
|
31
|
-
|
32
|
-
position = writer.write(record)
|
33
|
-
meta = { position: position }
|
34
|
-
notify_listeners(name: :record, data: record, meta: meta)
|
35
|
-
end
|
36
|
-
notify_listeners(name: :stop, data: :stop)
|
26
|
+
run_merge(readers)
|
37
27
|
|
38
28
|
readers.each do |r|
|
39
29
|
::FlatKit.logger.debug " #{r.source} produced #{r.count} records"
|
@@ -42,5 +32,18 @@ module FlatKit
|
|
42
32
|
writer.close
|
43
33
|
::FlatKit.logger.debug "Wrote #{writer.count} records to #{writer.destination}"
|
44
34
|
end
|
35
|
+
|
36
|
+
private
|
37
|
+
|
38
|
+
def run_merge(readers)
|
39
|
+
tree = ::FlatKit::MergeTree.new(readers)
|
40
|
+
notify_listeners(name: :start, data: :start)
|
41
|
+
tree.each do |record|
|
42
|
+
position = writer.write(record)
|
43
|
+
meta = { position: position }
|
44
|
+
notify_listeners(name: :record, data: record, meta: meta)
|
45
|
+
end
|
46
|
+
notify_listeners(name: :stop, data: :stop)
|
47
|
+
end
|
45
48
|
end
|
46
49
|
end
|
data/lib/flat_kit/merge_tree.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
2
4
|
# Public: Merge a list of sorted records from Readers into a single output Writer
|
3
5
|
#
|
@@ -29,9 +31,7 @@ module FlatKit
|
|
29
31
|
class MergeTree
|
30
32
|
include Enumerable
|
31
33
|
|
32
|
-
attr_reader :leaves
|
33
|
-
attr_reader :levels
|
34
|
-
attr_reader :readers
|
34
|
+
attr_reader :leaves, :levels, :readers
|
35
35
|
|
36
36
|
def initialize(readers)
|
37
37
|
@readers = readers
|
@@ -44,9 +44,7 @@ module FlatKit
|
|
44
44
|
|
45
45
|
# Need to pad the leaves to an even number so that the slicing by 2 for
|
46
46
|
# the tournament will work
|
47
|
-
if @leaves.size.odd?
|
48
|
-
@leaves << SentinelLeafNode.new
|
49
|
-
end
|
47
|
+
@leaves << SentinelLeafNode.new if @leaves.size.odd?
|
50
48
|
|
51
49
|
init_tree
|
52
50
|
end
|
@@ -94,6 +92,7 @@ module FlatKit
|
|
94
92
|
def each
|
95
93
|
loop do
|
96
94
|
break if root.leaf.finished?
|
95
|
+
|
97
96
|
yield root.value
|
98
97
|
# consume the yielded value and have the tournament tree replay those
|
99
98
|
# brackets affected
|
data/lib/flat_kit/output/file.rb
CHANGED
@@ -1,22 +1,31 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "zlib"
|
4
|
+
require "pathname"
|
2
5
|
|
3
6
|
module FlatKit
|
4
7
|
class Output
|
8
|
+
# Internal: File output implementation
|
9
|
+
#
|
5
10
|
class File < Output
|
6
11
|
attr_reader :path
|
7
12
|
|
13
|
+
# internal api method for testing purposes
|
14
|
+
attr_reader :io
|
15
|
+
|
8
16
|
def self.handles?(obj)
|
9
17
|
return true if obj.instance_of?(Pathname)
|
10
18
|
return false unless obj.instance_of?(String)
|
11
19
|
|
12
20
|
# in case these get loaded in different orders
|
13
|
-
return false if ::FlatKit::Output::IO.
|
14
|
-
return false if ::FlatKit::Output::IO.
|
21
|
+
return false if ::FlatKit::Output::IO.stdout?(obj)
|
22
|
+
return false if ::FlatKit::Output::IO.stderr?(obj)
|
15
23
|
|
16
|
-
|
24
|
+
true
|
17
25
|
end
|
18
26
|
|
19
27
|
def initialize(obj)
|
28
|
+
super()
|
20
29
|
@path = Pathname.new(obj)
|
21
30
|
path.dirname.mkpath
|
22
31
|
@io = open_output(path)
|
@@ -30,11 +39,6 @@ module FlatKit
|
|
30
39
|
@io.close
|
31
40
|
end
|
32
41
|
|
33
|
-
# internal api method for testing purposes
|
34
|
-
def io
|
35
|
-
@io
|
36
|
-
end
|
37
|
-
|
38
42
|
private
|
39
43
|
|
40
44
|
# open the appropriate output type depending on the destination file name
|
data/lib/flat_kit/output/io.rb
CHANGED
@@ -1,73 +1,78 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
2
4
|
class Output
|
5
|
+
# Internal: Non-file Output implementation - this is generally to stdout or stderr
|
6
|
+
#
|
3
7
|
class IO < Output
|
4
|
-
attr_reader :count
|
8
|
+
attr_reader :count, :name
|
9
|
+
|
10
|
+
# internal api method for testing
|
11
|
+
attr_reader :io
|
5
12
|
|
6
|
-
STDOUTS = %w[
|
7
|
-
STDERRS = %w[
|
13
|
+
STDOUTS = %w[stdout STDOUT - <stdout>].freeze
|
14
|
+
STDERRS = %w[stderr STDERR <stderr>].freeze
|
8
15
|
|
9
16
|
def self.handles?(obj)
|
10
|
-
return true if
|
11
|
-
return true if
|
12
|
-
return true if [
|
13
|
-
|
17
|
+
return true if stderr?(obj)
|
18
|
+
return true if stdout?(obj)
|
19
|
+
return true if [::File, ::StringIO, ::IO].any? { |klass| obj.is_a?(klass) }
|
20
|
+
|
21
|
+
false
|
14
22
|
end
|
15
23
|
|
16
|
-
def self.
|
24
|
+
def self.stderr?(obj)
|
17
25
|
case obj
|
18
26
|
when String
|
19
27
|
return true if STDERRS.include?(obj)
|
20
28
|
when ::IO
|
21
|
-
return true if obj ==
|
29
|
+
return true if obj == $stderr
|
22
30
|
end
|
23
|
-
|
31
|
+
false
|
24
32
|
end
|
25
33
|
|
26
|
-
def self.
|
34
|
+
def self.stdout?(obj)
|
27
35
|
case obj
|
28
36
|
when String
|
29
37
|
return true if STDOUTS.include?(obj)
|
30
38
|
when ::IO
|
31
|
-
return true if obj ==
|
39
|
+
return true if obj == $stdout
|
32
40
|
end
|
33
|
-
|
41
|
+
false
|
34
42
|
end
|
35
43
|
|
36
44
|
def initialize(obj)
|
45
|
+
super()
|
37
46
|
@count = 0
|
38
|
-
|
47
|
+
@name = nil
|
48
|
+
@io = nil
|
49
|
+
init_name_and_io(obj)
|
50
|
+
end
|
51
|
+
|
52
|
+
# this goes to an io stream and we are not in charge of opening it
|
53
|
+
def close
|
54
|
+
@io.close
|
55
|
+
end
|
56
|
+
|
57
|
+
private
|
58
|
+
|
59
|
+
def init_name_and_io(obj)
|
60
|
+
if self.class.stdout?(obj)
|
39
61
|
@name = "<STDOUT>"
|
40
62
|
@io = $stdout
|
41
|
-
elsif self.class.
|
63
|
+
elsif self.class.stderr?(obj)
|
42
64
|
@name = "<STDERR>"
|
43
65
|
@io = $stderr
|
44
|
-
elsif obj.
|
45
|
-
@name = obj.path
|
66
|
+
elsif obj.is_a?(::IO)
|
67
|
+
@name = (obj.respond_to?(:path) && obj.path) || obj.inspect
|
46
68
|
@io = obj
|
47
|
-
elsif obj.
|
48
|
-
@name = obj.inspect
|
49
|
-
@io = obj
|
50
|
-
elsif obj.kind_of?(::IO) then
|
69
|
+
elsif obj.is_a?(::StringIO)
|
51
70
|
@name = obj.inspect
|
52
71
|
@io = obj
|
53
72
|
else
|
54
73
|
raise ::FlatKit::Error, "Unable to create #{self.class} from #{obj.class} : #{obj.inspect}"
|
55
74
|
end
|
56
75
|
end
|
57
|
-
|
58
|
-
def name
|
59
|
-
@name
|
60
|
-
end
|
61
|
-
|
62
|
-
# this goes to an io stream and we are not in charge of opening it
|
63
|
-
def close
|
64
|
-
@io.close
|
65
|
-
end
|
66
|
-
|
67
|
-
# internal api method for testing
|
68
|
-
def io
|
69
|
-
@io
|
70
|
-
end
|
71
76
|
end
|
72
77
|
end
|
73
78
|
end
|
data/lib/flat_kit/output.rb
CHANGED
@@ -1,14 +1,16 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
4
|
+
# Internal: Base class for all output handlers
|
5
|
+
#
|
2
6
|
class Output
|
3
7
|
extend DescendantTracker
|
4
8
|
|
5
9
|
def self.from(out)
|
6
|
-
return out if out.
|
10
|
+
return out if out.is_a?(::FlatKit::Output)
|
7
11
|
|
8
12
|
out_klass = find_child(:handles?, out)
|
9
|
-
if out_klass
|
10
|
-
return out_klass.new(out)
|
11
|
-
end
|
13
|
+
return out_klass.new(out) if out_klass
|
12
14
|
|
13
15
|
raise FlatKit::Error, "Unable to create output from #{out.class} : #{out.inspect}"
|
14
16
|
end
|
@@ -17,7 +19,6 @@ module FlatKit
|
|
17
19
|
raise NotImplementedError, "#{self.class} must implement #name"
|
18
20
|
end
|
19
21
|
|
20
|
-
#
|
21
22
|
def io
|
22
23
|
raise NotImplementedError, "#{self.class} must implement #io"
|
23
24
|
end
|
@@ -32,5 +33,5 @@ module FlatKit
|
|
32
33
|
end
|
33
34
|
end
|
34
35
|
|
35
|
-
require
|
36
|
-
require
|
36
|
+
require "flat_kit/output/io"
|
37
|
+
require "flat_kit/output/file"
|
data/lib/flat_kit/position.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
2
4
|
# The information about the position of a record in an IO stream
|
3
5
|
#
|
@@ -5,10 +7,7 @@ module FlatKit
|
|
5
7
|
# information about the record that was just written
|
6
8
|
#
|
7
9
|
class Position
|
8
|
-
|
9
|
-
attr_reader :index # zero based
|
10
|
-
attr_reader :offset # byte offset in the IO stream
|
11
|
-
attr_reader :bytesize # byte length of the record
|
10
|
+
attr_reader :index, :offset, :bytesize # index: zero based; offset: byte offset in the IO stream; bytesize: byte length of the record
|
12
11
|
|
13
12
|
def initialize(index: nil, offset: nil, bytesize: nil)
|
14
13
|
@index = index
|
data/lib/flat_kit/reader.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
2
4
|
# Public: the base class for all format readers.
|
3
5
|
#
|
@@ -14,24 +16,21 @@ module FlatKit
|
|
14
16
|
# API:
|
15
17
|
#
|
16
18
|
# initialize(source:, compare_fields:)
|
17
|
-
# each -> Yields / returns
|
19
|
+
# each -> Yields / returns
|
18
20
|
#
|
19
21
|
class Reader
|
20
22
|
include Enumerable
|
21
23
|
|
22
|
-
attr_reader :source
|
23
|
-
attr_reader :compare_fields
|
24
|
+
attr_reader :source, :compare_fields
|
24
25
|
|
25
26
|
def self.create_reader_from_path(path: "-", fallback: "auto", compare_fields: :none)
|
26
27
|
format = ::FlatKit::Format.for_with_fallback!(path: path, fallback: fallback)
|
27
|
-
|
28
|
+
format.reader.new(source: path, compare_fields: compare_fields)
|
28
29
|
end
|
29
30
|
|
30
31
|
def self.create_readers_from_paths(paths:, fallback: "auto", compare_fields: :none)
|
31
32
|
# default to stdin if there are no paths
|
32
|
-
if paths.empty?
|
33
|
-
paths << "-"
|
34
|
-
end
|
33
|
+
paths << "-" if paths.empty?
|
35
34
|
|
36
35
|
paths.map do |path|
|
37
36
|
create_reader_from_path(path: path, fallback: fallback, compare_fields: compare_fields)
|
@@ -55,7 +54,8 @@ module FlatKit
|
|
55
54
|
|
56
55
|
def resolve_compare_fields(value)
|
57
56
|
return [] if value == :none
|
58
|
-
|
57
|
+
|
58
|
+
value
|
59
59
|
end
|
60
60
|
end
|
61
61
|
end
|
data/lib/flat_kit/record.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
2
4
|
# Public: The base class that all record classes should inherit from.
|
3
5
|
#
|
@@ -35,11 +37,9 @@ module FlatKit
|
|
35
37
|
# # the initialize method must call super(data:, compare_fields:) to
|
36
38
|
# initialize the root data structures
|
37
39
|
class Record
|
38
|
-
|
39
40
|
include Comparable
|
40
41
|
|
41
|
-
attr_reader :data
|
42
|
-
attr_reader :compare_fields
|
42
|
+
attr_reader :data, :compare_fields
|
43
43
|
|
44
44
|
def initialize(data:, compare_fields:)
|
45
45
|
@data = data
|
@@ -57,15 +57,15 @@ module FlatKit
|
|
57
57
|
my_val = self[field]
|
58
58
|
other_val = other[field]
|
59
59
|
|
60
|
-
if my_val.nil? && other_val.nil?
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
60
|
+
compare_result = if my_val.nil? && other_val.nil?
|
61
|
+
0
|
62
|
+
elsif my_val.nil?
|
63
|
+
-1
|
64
|
+
elsif other_val.nil?
|
65
|
+
1
|
66
|
+
else
|
67
|
+
my_val <=> (other_val)
|
68
|
+
end
|
69
69
|
|
70
70
|
return compare_result unless compare_result.zero?
|
71
71
|
end
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
2
4
|
# Private: The Sentinel Internal Node is a private class used by the MergeTree
|
3
5
|
# class.
|
@@ -8,12 +10,10 @@ module FlatKit
|
|
8
10
|
class SentinelInternalNode
|
9
11
|
include Comparable
|
10
12
|
|
11
|
-
attr_reader :left
|
12
|
-
attr_reader :right
|
13
|
-
attr_reader :winner
|
13
|
+
attr_reader :left, :right, :winner
|
14
14
|
attr_accessor :next_level
|
15
15
|
|
16
|
-
def initialize(
|
16
|
+
def initialize(*)
|
17
17
|
@left = nil
|
18
18
|
@right = nil
|
19
19
|
@winner = nil
|
@@ -31,7 +31,8 @@ module FlatKit
|
|
31
31
|
# A sentinel node is always greater than any other node
|
32
32
|
def <=>(other)
|
33
33
|
return 0 if other.sentinel?
|
34
|
-
|
34
|
+
|
35
|
+
1
|
35
36
|
end
|
36
37
|
end
|
37
38
|
end
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
2
4
|
# Private: The Sentinel Leaf Node is used internally by the MergeTree
|
3
5
|
#
|
@@ -31,7 +33,8 @@ module FlatKit
|
|
31
33
|
# A sentinel node is always greater than any other node
|
32
34
|
def <=>(other)
|
33
35
|
return 0 if other.sentinel?
|
34
|
-
|
36
|
+
|
37
|
+
1
|
35
38
|
end
|
36
39
|
end
|
37
40
|
end
|
data/lib/flat_kit/sort.rb
CHANGED
@@ -1,13 +1,12 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
4
|
+
# Internal: Sorts an Input and sends the sorted records to an Output
|
5
|
+
#
|
2
6
|
class Sort
|
3
|
-
attr_reader :reader
|
4
|
-
attr_reader :writer
|
5
|
-
attr_reader :compare_fields
|
6
|
-
|
7
|
-
def initialize(input:, input_fallback: "auto",
|
8
|
-
output:, output_fallback: "auto",
|
9
|
-
compare_fields:)
|
7
|
+
attr_reader :reader, :writer, :compare_fields
|
10
8
|
|
9
|
+
def initialize(input:, output:, compare_fields:, input_fallback: "auto", output_fallback: "auto")
|
11
10
|
@compare_fields = compare_fields
|
12
11
|
@reader = ::FlatKit::Reader.create_reader_from_path(path: input, compare_fields: @compare_fields,
|
13
12
|
fallback: input_fallback)
|
@@ -16,8 +15,8 @@ module FlatKit
|
|
16
15
|
end
|
17
16
|
|
18
17
|
def call
|
19
|
-
::FlatKit.logger.info "Sorting #{reader.source} into #{writer.destination} using key #{compare_fields.join(
|
20
|
-
records =
|
18
|
+
::FlatKit.logger.info "Sorting #{reader.source} into #{writer.destination} using key #{compare_fields.join(', ')}"
|
19
|
+
records = [].tap do |a|
|
21
20
|
reader.each do |r|
|
22
21
|
a << r
|
23
22
|
end
|
@@ -1,21 +1,22 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
2
4
|
class StatType
|
3
|
-
|
4
|
-
# Status object to keep track of the count and frequency of values
|
5
|
+
# Internal: Status object to keep track of the count and frequency of values.
|
5
6
|
#
|
6
7
|
class NominalStats < StatType
|
7
|
-
|
8
8
|
attr_reader :count
|
9
9
|
|
10
10
|
def self.default_stats
|
11
|
-
@default_stats ||= %w[
|
11
|
+
@default_stats ||= %w[count]
|
12
12
|
end
|
13
13
|
|
14
14
|
def self.all_stats
|
15
|
-
@all_stats ||= %w[
|
15
|
+
@all_stats ||= %w[count unique_count unique_values mode]
|
16
16
|
end
|
17
17
|
|
18
18
|
def initialize(collecting_frequencies: false)
|
19
|
+
super()
|
19
20
|
@mutex = Mutex.new
|
20
21
|
@count = 0
|
21
22
|
@collecting_frequencies = collecting_frequencies
|
@@ -24,26 +25,31 @@ module FlatKit
|
|
24
25
|
|
25
26
|
def collected_stats
|
26
27
|
return self.class.default_stats unless @collecting_frequencies
|
27
|
-
|
28
|
+
|
29
|
+
self.class.all_stats
|
28
30
|
end
|
29
31
|
|
30
32
|
def mode
|
31
33
|
return nil unless @collecting_frequencies
|
32
|
-
|
34
|
+
|
35
|
+
@frequencies.max_by { |_item, item_count| item_count }.first
|
33
36
|
end
|
34
37
|
|
35
38
|
def unique_count
|
36
39
|
return nil unless @collecting_frequencies
|
40
|
+
|
37
41
|
@frequencies.size
|
38
42
|
end
|
39
43
|
|
40
44
|
def unique_values
|
41
45
|
return nil unless @collecting_frequencies
|
46
|
+
|
42
47
|
@frequencies.keys
|
43
48
|
end
|
44
49
|
|
45
50
|
def frequencies
|
46
51
|
return nil unless @collecting_frequencies
|
52
|
+
|
47
53
|
@frequencies
|
48
54
|
end
|
49
55
|
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
#--
|
2
4
|
# Copyright (c) 2008, 2009 Jeremy Hinegardner
|
3
5
|
# All rights reserved. See LICENSE and/or COPYING for details.
|
@@ -5,16 +7,14 @@
|
|
5
7
|
# Pulled from Hitimes, which I also wrote
|
6
8
|
#++
|
7
9
|
|
8
|
-
require
|
9
|
-
require 'oj'
|
10
|
+
require "oj"
|
10
11
|
|
11
12
|
module FlatKit
|
12
13
|
class StatType
|
13
|
-
#
|
14
|
-
# Stats object will keep track of the _min_, _max_, _count_, _sum_ and _sumsq_
|
14
|
+
# Internal: Stats object to keep track of the _min_, _max_, _count_, _sum_ and _sumsq_
|
15
15
|
# and when you want you may also retrieve the _mean_, _stddev_ and _rate_.
|
16
16
|
#
|
17
|
-
#
|
17
|
+
# This contrived example shows getting a list of all the files in a directory
|
18
18
|
# and running stats on file sizes.
|
19
19
|
#
|
20
20
|
# s = FlatKit::Stats.new
|
@@ -33,17 +33,14 @@ module FlatKit
|
|
33
33
|
class NumericalStats < NominalStats
|
34
34
|
# A list of the available stats
|
35
35
|
|
36
|
-
attr_reader :min
|
37
|
-
attr_reader :max
|
38
|
-
attr_reader :sum
|
39
|
-
attr_reader :sumsq
|
36
|
+
attr_reader :min, :max, :sum, :sumsq
|
40
37
|
|
41
38
|
def self.default_stats
|
42
|
-
@default_stats ||= %w[
|
39
|
+
@default_stats ||= %w[count max mean min rate stddev sum sumsq]
|
43
40
|
end
|
44
41
|
|
45
42
|
def self.all_stats
|
46
|
-
@all_stats ||= %w[
|
43
|
+
@all_stats ||= %w[count max mean min mode rate stddev sum sumsq unique_count unique_values]
|
47
44
|
end
|
48
45
|
|
49
46
|
def initialize(collecting_frequencies: false)
|
@@ -61,8 +58,8 @@ module FlatKit
|
|
61
58
|
# Return the input value.
|
62
59
|
def update(value)
|
63
60
|
@mutex.synchronize do
|
64
|
-
@min =
|
65
|
-
@max =
|
61
|
+
@min = [value, @min].min
|
62
|
+
@max = [value, @max].max
|
66
63
|
|
67
64
|
@count += 1
|
68
65
|
@sum += value
|
@@ -72,17 +69,18 @@ module FlatKit
|
|
72
69
|
@frequencies[value] += 1 if @collecting_frequencies
|
73
70
|
end
|
74
71
|
|
75
|
-
|
72
|
+
value
|
76
73
|
end
|
77
74
|
|
78
75
|
# call-seq:
|
79
76
|
# stat.mean -> Float
|
80
|
-
#
|
77
|
+
#
|
81
78
|
# Return the arithmetic mean of the values put into the Stats object. If no
|
82
79
|
# values have passed through the stats object then 0.0 is returned;
|
83
80
|
def mean
|
84
81
|
return 0.0 if @count.zero?
|
85
|
-
|
82
|
+
|
83
|
+
@sum / @count
|
86
84
|
end
|
87
85
|
|
88
86
|
# call-seq:
|
@@ -100,7 +98,8 @@ module FlatKit
|
|
100
98
|
#
|
101
99
|
def rate
|
102
100
|
return 0.0 if @sum.zero?
|
103
|
-
|
101
|
+
|
102
|
+
@count / @sum
|
104
103
|
end
|
105
104
|
|
106
105
|
#
|
@@ -113,7 +112,8 @@ module FlatKit
|
|
113
112
|
#
|
114
113
|
def stddev
|
115
114
|
return 0.0 unless @count > 1
|
116
|
-
|
115
|
+
|
116
|
+
Math.sqrt((@sumsq - ((@sum * @sum) / @count)) / (@count - 1))
|
117
117
|
end
|
118
118
|
end
|
119
119
|
end
|