flat_kit 0.3.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CONTRIBUTING.md +1 -2
- data/HISTORY.md +9 -0
- data/Manifest.txt +3 -42
- data/{bin → exe}/fk +2 -1
- data/flat_kit.gemspec +33 -0
- data/lib/flat_kit/cli.rb +46 -32
- data/lib/flat_kit/command/cat.rb +34 -32
- data/lib/flat_kit/command/merge.rb +37 -36
- data/lib/flat_kit/command/sort.rb +37 -37
- data/lib/flat_kit/command/stats.rb +41 -39
- data/lib/flat_kit/command.rb +10 -11
- data/lib/flat_kit/descendant_tracker.rb +9 -6
- data/lib/flat_kit/error.rb +4 -0
- data/lib/flat_kit/event_emitter.rb +5 -2
- data/lib/flat_kit/field_stats.rb +31 -26
- data/lib/flat_kit/field_type/boolean_type.rb +9 -5
- data/lib/flat_kit/field_type/date_type.rb +19 -17
- data/lib/flat_kit/field_type/float_type.rb +15 -9
- data/lib/flat_kit/field_type/guess_type.rb +9 -6
- data/lib/flat_kit/field_type/integer_type.rb +6 -4
- data/lib/flat_kit/field_type/null_type.rb +5 -1
- data/lib/flat_kit/field_type/string_type.rb +8 -6
- data/lib/flat_kit/field_type/timestamp_type.rb +11 -10
- data/lib/flat_kit/field_type/unknown_type.rb +12 -8
- data/lib/flat_kit/field_type.rb +52 -44
- data/lib/flat_kit/format.rb +11 -5
- data/lib/flat_kit/input/file.rb +11 -9
- data/lib/flat_kit/input/io.rb +18 -21
- data/lib/flat_kit/input.rb +8 -7
- data/lib/flat_kit/internal_node.rb +22 -19
- data/lib/flat_kit/jsonl/format.rb +6 -2
- data/lib/flat_kit/jsonl/reader.rb +7 -4
- data/lib/flat_kit/jsonl/record.rb +15 -18
- data/lib/flat_kit/jsonl/writer.rb +8 -10
- data/lib/flat_kit/jsonl.rb +8 -4
- data/lib/flat_kit/leaf_node.rb +6 -5
- data/lib/flat_kit/log_formatter.rb +20 -0
- data/lib/flat_kit/logger.rb +12 -19
- data/lib/flat_kit/merge.rb +21 -18
- data/lib/flat_kit/merge_tree.rb +5 -6
- data/lib/flat_kit/output/file.rb +13 -9
- data/lib/flat_kit/output/io.rb +40 -35
- data/lib/flat_kit/output.rb +8 -7
- data/lib/flat_kit/position.rb +3 -4
- data/lib/flat_kit/reader.rb +8 -8
- data/lib/flat_kit/record.rb +12 -12
- data/lib/flat_kit/sentinel_internal_node.rb +6 -5
- data/lib/flat_kit/sentinel_leaf_node.rb +4 -1
- data/lib/flat_kit/sort.rb +8 -9
- data/lib/flat_kit/stat_type/nominal_stats.rb +13 -7
- data/lib/flat_kit/stat_type/numerical_stats.rb +18 -18
- data/lib/flat_kit/stat_type/ordinal_stats.rb +8 -13
- data/lib/flat_kit/stat_type.rb +18 -13
- data/lib/flat_kit/stats.rb +12 -14
- data/lib/flat_kit/writer.rb +5 -6
- data/lib/flat_kit/xsv/format.rb +6 -2
- data/lib/flat_kit/xsv/reader.rb +8 -6
- data/lib/flat_kit/xsv/record.rb +21 -15
- data/lib/flat_kit/xsv/writer.rb +13 -10
- data/lib/flat_kit/xsv.rb +7 -4
- data/lib/flat_kit.rb +31 -26
- metadata +20 -158
- data/Rakefile +0 -21
- data/examples/stream-active-record-to-csv.rb +0 -42
- data/tasks/default.rake +0 -242
- data/tasks/extension.rake +0 -38
- data/tasks/man.rake +0 -7
- data/tasks/this.rb +0 -208
- data/test/device_dataset.rb +0 -117
- data/test/field_type/test_boolean_type.rb +0 -65
- data/test/field_type/test_date_type.rb +0 -71
- data/test/field_type/test_float_type.rb +0 -56
- data/test/field_type/test_guess_type.rb +0 -14
- data/test/field_type/test_integer_type.rb +0 -52
- data/test/field_type/test_null_type.rb +0 -41
- data/test/field_type/test_string_type.rb +0 -18
- data/test/field_type/test_timestamp_type.rb +0 -108
- data/test/field_type/test_unknown_type.rb +0 -35
- data/test/input/test_file.rb +0 -73
- data/test/input/test_io.rb +0 -93
- data/test/jsonl/test_format.rb +0 -22
- data/test/jsonl/test_reader.rb +0 -49
- data/test/jsonl/test_record.rb +0 -61
- data/test/jsonl/test_writer.rb +0 -86
- data/test/output/test_file.rb +0 -60
- data/test/output/test_io.rb +0 -104
- data/test/run +0 -23
- data/test/stat_type/test_nominal_stats.rb +0 -69
- data/test/stat_type/test_numerical_stats.rb +0 -118
- data/test/stat_type/test_ordinal_stats.rb +0 -92
- data/test/test_conversions.rb +0 -45
- data/test/test_event_emitter.rb +0 -89
- data/test/test_field_stats.rb +0 -134
- data/test/test_field_type.rb +0 -34
- data/test/test_format.rb +0 -24
- data/test/test_helper.rb +0 -26
- data/test/test_merge.rb +0 -40
- data/test/test_merge_tree.rb +0 -64
- data/test/test_version.rb +0 -11
- data/test/xsv/test_format.rb +0 -22
- data/test/xsv/test_reader.rb +0 -61
- data/test/xsv/test_record.rb +0 -69
- data/test/xsv/test_writer.rb +0 -89
data/lib/flat_kit/logger.rb
CHANGED
@@ -1,19 +1,12 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
2
|
|
3
|
-
|
4
|
-
class LogFormatter < ::Logger::Formatter
|
5
|
-
FORMAT = "%s %5d %05s : %s\n".freeze
|
6
|
-
DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ".freeze
|
7
|
-
def initialize
|
8
|
-
super
|
9
|
-
self.datetime_format = DATETIME_FORMAT
|
10
|
-
end
|
11
|
-
|
12
|
-
def call(severity, time, progname, msg)
|
13
|
-
FORMAT % [format_datetime(time.utc), Process.pid, severity, msg2str(msg)]
|
14
|
-
end
|
15
|
-
end
|
3
|
+
require "logger"
|
16
4
|
|
5
|
+
# Public: Top level namespace for the gem
|
6
|
+
#
|
7
|
+
module FlatKit
|
8
|
+
# Internal: Logger class
|
9
|
+
#
|
17
10
|
class Logger
|
18
11
|
def self.for_io(io)
|
19
12
|
::Logger.new(io, formatter: LogFormatter.new)
|
@@ -26,11 +19,11 @@ module FlatKit
|
|
26
19
|
end
|
27
20
|
|
28
21
|
def self.log_to(destination = $stderr)
|
29
|
-
if destination.
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
22
|
+
@logger = if destination.is_a?(::IO)
|
23
|
+
::FlatKit::Logger.for_io(destination)
|
24
|
+
else
|
25
|
+
::FlatKit::Logger.for_path(destination)
|
26
|
+
end
|
34
27
|
end
|
35
28
|
|
36
29
|
def self.logger
|
data/lib/flat_kit/merge.rb
CHANGED
@@ -1,15 +1,14 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
4
|
+
# Internal: Class implementing merging from N inputs and output to 1 output.
|
5
|
+
#
|
2
6
|
class Merge
|
3
|
-
|
4
7
|
include ::FlatKit::EventEmitter
|
5
8
|
|
6
|
-
attr_reader :readers
|
7
|
-
attr_reader :writer
|
8
|
-
attr_reader :compare_fields
|
9
|
+
attr_reader :readers, :writer, :compare_fields
|
9
10
|
|
10
|
-
def initialize(inputs:, input_fallback: "auto",
|
11
|
-
output:, output_fallback: "auto",
|
12
|
-
compare_fields:)
|
11
|
+
def initialize(inputs:, output:, compare_fields:, input_fallback: "auto", output_fallback: "auto")
|
13
12
|
@compare_fields = compare_fields
|
14
13
|
@readers = ::FlatKit::Reader.create_readers_from_paths(paths: inputs, compare_fields: @compare_fields,
|
15
14
|
fallback: input_fallback)
|
@@ -19,21 +18,12 @@ module FlatKit
|
|
19
18
|
|
20
19
|
def call
|
21
20
|
::FlatKit.logger.debug "Merging the following files into #{writer.destination}"
|
22
|
-
::FlatKit.logger.debug "Using this key for sorting: #{compare_fields.join(
|
21
|
+
::FlatKit.logger.debug "Using this key for sorting: #{compare_fields.join(', ')}"
|
23
22
|
readers.each do |r|
|
24
23
|
::FlatKit.logger.debug " #{r.source}"
|
25
24
|
end
|
26
25
|
|
27
|
-
|
28
|
-
|
29
|
-
notify_listeners(name: :start, data: :start)
|
30
|
-
merge_tree.each do |record|
|
31
|
-
|
32
|
-
position = writer.write(record)
|
33
|
-
meta = { position: position }
|
34
|
-
notify_listeners(name: :record, data: record, meta: meta)
|
35
|
-
end
|
36
|
-
notify_listeners(name: :stop, data: :stop)
|
26
|
+
run_merge(readers)
|
37
27
|
|
38
28
|
readers.each do |r|
|
39
29
|
::FlatKit.logger.debug " #{r.source} produced #{r.count} records"
|
@@ -42,5 +32,18 @@ module FlatKit
|
|
42
32
|
writer.close
|
43
33
|
::FlatKit.logger.debug "Wrote #{writer.count} records to #{writer.destination}"
|
44
34
|
end
|
35
|
+
|
36
|
+
private
|
37
|
+
|
38
|
+
def run_merge(readers)
|
39
|
+
tree = ::FlatKit::MergeTree.new(readers)
|
40
|
+
notify_listeners(name: :start, data: :start)
|
41
|
+
tree.each do |record|
|
42
|
+
position = writer.write(record)
|
43
|
+
meta = { position: position }
|
44
|
+
notify_listeners(name: :record, data: record, meta: meta)
|
45
|
+
end
|
46
|
+
notify_listeners(name: :stop, data: :stop)
|
47
|
+
end
|
45
48
|
end
|
46
49
|
end
|
data/lib/flat_kit/merge_tree.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
2
4
|
# Public: Merge a list of sorted records from Readers into a single output Writer
|
3
5
|
#
|
@@ -29,9 +31,7 @@ module FlatKit
|
|
29
31
|
class MergeTree
|
30
32
|
include Enumerable
|
31
33
|
|
32
|
-
attr_reader :leaves
|
33
|
-
attr_reader :levels
|
34
|
-
attr_reader :readers
|
34
|
+
attr_reader :leaves, :levels, :readers
|
35
35
|
|
36
36
|
def initialize(readers)
|
37
37
|
@readers = readers
|
@@ -44,9 +44,7 @@ module FlatKit
|
|
44
44
|
|
45
45
|
# Need to pad the leaves to an even number so that the slicing by 2 for
|
46
46
|
# the tournament will work
|
47
|
-
if @leaves.size.odd?
|
48
|
-
@leaves << SentinelLeafNode.new
|
49
|
-
end
|
47
|
+
@leaves << SentinelLeafNode.new if @leaves.size.odd?
|
50
48
|
|
51
49
|
init_tree
|
52
50
|
end
|
@@ -94,6 +92,7 @@ module FlatKit
|
|
94
92
|
def each
|
95
93
|
loop do
|
96
94
|
break if root.leaf.finished?
|
95
|
+
|
97
96
|
yield root.value
|
98
97
|
# consume the yielded value and have the tournament tree replay those
|
99
98
|
# brackets affected
|
data/lib/flat_kit/output/file.rb
CHANGED
@@ -1,22 +1,31 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "zlib"
|
4
|
+
require "pathname"
|
2
5
|
|
3
6
|
module FlatKit
|
4
7
|
class Output
|
8
|
+
# Internal: File output implementation
|
9
|
+
#
|
5
10
|
class File < Output
|
6
11
|
attr_reader :path
|
7
12
|
|
13
|
+
# internal api method for testing purposes
|
14
|
+
attr_reader :io
|
15
|
+
|
8
16
|
def self.handles?(obj)
|
9
17
|
return true if obj.instance_of?(Pathname)
|
10
18
|
return false unless obj.instance_of?(String)
|
11
19
|
|
12
20
|
# in case these get loaded in a different order
|
13
|
-
return false if ::FlatKit::Output::IO.
|
14
|
-
return false if ::FlatKit::Output::IO.
|
21
|
+
return false if ::FlatKit::Output::IO.stdout?(obj)
|
22
|
+
return false if ::FlatKit::Output::IO.stderr?(obj)
|
15
23
|
|
16
|
-
|
24
|
+
true
|
17
25
|
end
|
18
26
|
|
19
27
|
def initialize(obj)
|
28
|
+
super()
|
20
29
|
@path = Pathname.new(obj)
|
21
30
|
path.dirname.mkpath
|
22
31
|
@io = open_output(path)
|
@@ -30,11 +39,6 @@ module FlatKit
|
|
30
39
|
@io.close
|
31
40
|
end
|
32
41
|
|
33
|
-
# internal api method for testing purposes
|
34
|
-
def io
|
35
|
-
@io
|
36
|
-
end
|
37
|
-
|
38
42
|
private
|
39
43
|
|
40
44
|
# open the appropriate output type depending on the destination file name
|
data/lib/flat_kit/output/io.rb
CHANGED
@@ -1,73 +1,78 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
2
4
|
class Output
|
5
|
+
# Internal: Non-file Output implementation - this is generally to stdout or stderr
|
6
|
+
#
|
3
7
|
class IO < Output
|
4
|
-
attr_reader :count
|
8
|
+
attr_reader :count, :name
|
9
|
+
|
10
|
+
# internal api method for testing
|
11
|
+
attr_reader :io
|
5
12
|
|
6
|
-
STDOUTS = %w[
|
7
|
-
STDERRS = %w[
|
13
|
+
STDOUTS = %w[stdout STDOUT - <stdout>].freeze
|
14
|
+
STDERRS = %w[stderr STDERR <stderr>].freeze
|
8
15
|
|
9
16
|
def self.handles?(obj)
|
10
|
-
return true if
|
11
|
-
return true if
|
12
|
-
return true if [
|
13
|
-
|
17
|
+
return true if stderr?(obj)
|
18
|
+
return true if stdout?(obj)
|
19
|
+
return true if [::File, ::StringIO, ::IO].any? { |klass| obj.is_a?(klass) }
|
20
|
+
|
21
|
+
false
|
14
22
|
end
|
15
23
|
|
16
|
-
def self.
|
24
|
+
def self.stderr?(obj)
|
17
25
|
case obj
|
18
26
|
when String
|
19
27
|
return true if STDERRS.include?(obj)
|
20
28
|
when ::IO
|
21
|
-
return true if obj ==
|
29
|
+
return true if obj == $stderr
|
22
30
|
end
|
23
|
-
|
31
|
+
false
|
24
32
|
end
|
25
33
|
|
26
|
-
def self.
|
34
|
+
def self.stdout?(obj)
|
27
35
|
case obj
|
28
36
|
when String
|
29
37
|
return true if STDOUTS.include?(obj)
|
30
38
|
when ::IO
|
31
|
-
return true if obj ==
|
39
|
+
return true if obj == $stdout
|
32
40
|
end
|
33
|
-
|
41
|
+
false
|
34
42
|
end
|
35
43
|
|
36
44
|
def initialize(obj)
|
45
|
+
super()
|
37
46
|
@count = 0
|
38
|
-
|
47
|
+
@name = nil
|
48
|
+
@io = nil
|
49
|
+
init_name_and_io(obj)
|
50
|
+
end
|
51
|
+
|
52
|
+
# this goes to an io stream and we are not in charge of opening it
|
53
|
+
def close
|
54
|
+
@io.close
|
55
|
+
end
|
56
|
+
|
57
|
+
private
|
58
|
+
|
59
|
+
def init_name_and_io(obj)
|
60
|
+
if self.class.stdout?(obj)
|
39
61
|
@name = "<STDOUT>"
|
40
62
|
@io = $stdout
|
41
|
-
elsif self.class.
|
63
|
+
elsif self.class.stderr?(obj)
|
42
64
|
@name = "<STDERR>"
|
43
65
|
@io = $stderr
|
44
|
-
elsif obj.
|
45
|
-
@name = obj.path
|
66
|
+
elsif obj.is_a?(::IO)
|
67
|
+
@name = (obj.respond_to?(:path) && obj.path) || obj.inspect
|
46
68
|
@io = obj
|
47
|
-
elsif obj.
|
48
|
-
@name = obj.inspect
|
49
|
-
@io = obj
|
50
|
-
elsif obj.kind_of?(::IO) then
|
69
|
+
elsif obj.is_a?(::StringIO)
|
51
70
|
@name = obj.inspect
|
52
71
|
@io = obj
|
53
72
|
else
|
54
73
|
raise ::FlatKit::Error, "Unable to create #{self.class} from #{obj.class} : #{obj.inspect}"
|
55
74
|
end
|
56
75
|
end
|
57
|
-
|
58
|
-
def name
|
59
|
-
@name
|
60
|
-
end
|
61
|
-
|
62
|
-
# this goes to an io stream and we are not in charge of opening it
|
63
|
-
def close
|
64
|
-
@io.close
|
65
|
-
end
|
66
|
-
|
67
|
-
# internal api method for testing
|
68
|
-
def io
|
69
|
-
@io
|
70
|
-
end
|
71
76
|
end
|
72
77
|
end
|
73
78
|
end
|
data/lib/flat_kit/output.rb
CHANGED
@@ -1,14 +1,16 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
4
|
+
# Internal: Base class for all output handlers
|
5
|
+
#
|
2
6
|
class Output
|
3
7
|
extend DescendantTracker
|
4
8
|
|
5
9
|
def self.from(out)
|
6
|
-
return out if out.
|
10
|
+
return out if out.is_a?(::FlatKit::Output)
|
7
11
|
|
8
12
|
out_klass = find_child(:handles?, out)
|
9
|
-
if out_klass
|
10
|
-
return out_klass.new(out)
|
11
|
-
end
|
13
|
+
return out_klass.new(out) if out_klass
|
12
14
|
|
13
15
|
raise FlatKit::Error, "Unable to create output from #{out.class} : #{out.inspect}"
|
14
16
|
end
|
@@ -17,7 +19,6 @@ module FlatKit
|
|
17
19
|
raise NotImplementedError, "#{self.class} must implement #name"
|
18
20
|
end
|
19
21
|
|
20
|
-
#
|
21
22
|
def io
|
22
23
|
raise NotImplementedError, "#{self.class} must implement #io"
|
23
24
|
end
|
@@ -32,5 +33,5 @@ module FlatKit
|
|
32
33
|
end
|
33
34
|
end
|
34
35
|
|
35
|
-
require
|
36
|
-
require
|
36
|
+
require "flat_kit/output/io"
|
37
|
+
require "flat_kit/output/file"
|
data/lib/flat_kit/position.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
2
4
|
# The information about the position of a record in an IO stream
|
3
5
|
#
|
@@ -5,10 +7,7 @@ module FlatKit
|
|
5
7
|
# information about the record that was just written
|
6
8
|
#
|
7
9
|
class Position
|
8
|
-
|
9
|
-
attr_reader :index # zero based
|
10
|
-
attr_reader :offset # byte offset in the IO stream
|
11
|
-
attr_reader :bytesize # byte length of the record
|
10
|
+
attr_reader :index, :offset, :bytesize # index: zero based; offset: byte offset in the IO stream; bytesize: byte length of the record
|
12
11
|
|
13
12
|
def initialize(index: nil, offset: nil, bytesize: nil)
|
14
13
|
@index = index
|
data/lib/flat_kit/reader.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
2
4
|
# Public: the base class for all format readers.
|
3
5
|
#
|
@@ -14,24 +16,21 @@ module FlatKit
|
|
14
16
|
# API:
|
15
17
|
#
|
16
18
|
# initialize(source:, compare_fields:)
|
17
|
-
# each -> Yields / returns
|
19
|
+
# each -> Yields / returns
|
18
20
|
#
|
19
21
|
class Reader
|
20
22
|
include Enumerable
|
21
23
|
|
22
|
-
attr_reader :source
|
23
|
-
attr_reader :compare_fields
|
24
|
+
attr_reader :source, :compare_fields
|
24
25
|
|
25
26
|
def self.create_reader_from_path(path: "-", fallback: "auto", compare_fields: :none)
|
26
27
|
format = ::FlatKit::Format.for_with_fallback!(path: path, fallback: fallback)
|
27
|
-
|
28
|
+
format.reader.new(source: path, compare_fields: compare_fields)
|
28
29
|
end
|
29
30
|
|
30
31
|
def self.create_readers_from_paths(paths:, fallback: "auto", compare_fields: :none)
|
31
32
|
# default to stdin if there are no paths
|
32
|
-
if paths.empty?
|
33
|
-
paths << "-"
|
34
|
-
end
|
33
|
+
paths << "-" if paths.empty?
|
35
34
|
|
36
35
|
paths.map do |path|
|
37
36
|
create_reader_from_path(path: path, fallback: fallback, compare_fields: compare_fields)
|
@@ -55,7 +54,8 @@ module FlatKit
|
|
55
54
|
|
56
55
|
def resolve_compare_fields(value)
|
57
56
|
return [] if value == :none
|
58
|
-
|
57
|
+
|
58
|
+
value
|
59
59
|
end
|
60
60
|
end
|
61
61
|
end
|
data/lib/flat_kit/record.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
2
4
|
# Public: The base class that all record classes should inherit from.
|
3
5
|
#
|
@@ -35,11 +37,9 @@ module FlatKit
|
|
35
37
|
# # the initialize method must call super(data:, compare_fields:) to
|
36
38
|
# initialize the root data structures
|
37
39
|
class Record
|
38
|
-
|
39
40
|
include Comparable
|
40
41
|
|
41
|
-
attr_reader :data
|
42
|
-
attr_reader :compare_fields
|
42
|
+
attr_reader :data, :compare_fields
|
43
43
|
|
44
44
|
def initialize(data:, compare_fields:)
|
45
45
|
@data = data
|
@@ -57,15 +57,15 @@ module FlatKit
|
|
57
57
|
my_val = self[field]
|
58
58
|
other_val = other[field]
|
59
59
|
|
60
|
-
if my_val.nil? && other_val.nil?
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
60
|
+
compare_result = if my_val.nil? && other_val.nil?
|
61
|
+
0
|
62
|
+
elsif my_val.nil?
|
63
|
+
-1
|
64
|
+
elsif other_val.nil?
|
65
|
+
1
|
66
|
+
else
|
67
|
+
my_val <=> (other_val)
|
68
|
+
end
|
69
69
|
|
70
70
|
return compare_result unless compare_result.zero?
|
71
71
|
end
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
2
4
|
# Private: The Sentinel Internal Node is a private class used by the MergeTree
|
3
5
|
# class.
|
@@ -8,12 +10,10 @@ module FlatKit
|
|
8
10
|
class SentinelInternalNode
|
9
11
|
include Comparable
|
10
12
|
|
11
|
-
attr_reader :left
|
12
|
-
attr_reader :right
|
13
|
-
attr_reader :winner
|
13
|
+
attr_reader :left, :right, :winner
|
14
14
|
attr_accessor :next_level
|
15
15
|
|
16
|
-
def initialize(
|
16
|
+
def initialize(*)
|
17
17
|
@left = nil
|
18
18
|
@right = nil
|
19
19
|
@winner = nil
|
@@ -31,7 +31,8 @@ module FlatKit
|
|
31
31
|
# A sentinel node is always greater than any other node
|
32
32
|
def <=>(other)
|
33
33
|
return 0 if other.sentinel?
|
34
|
-
|
34
|
+
|
35
|
+
1
|
35
36
|
end
|
36
37
|
end
|
37
38
|
end
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
2
4
|
# Private: The Sentinel Leaf Node is used internally by the MergeTree
|
3
5
|
#
|
@@ -31,7 +33,8 @@ module FlatKit
|
|
31
33
|
# A sentinel node is always greater than any other node
|
32
34
|
def <=>(other)
|
33
35
|
return 0 if other.sentinel?
|
34
|
-
|
36
|
+
|
37
|
+
1
|
35
38
|
end
|
36
39
|
end
|
37
40
|
end
|
data/lib/flat_kit/sort.rb
CHANGED
@@ -1,13 +1,12 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
4
|
+
# Internal: Sorts an Input and sends the sorted records to an Output
|
5
|
+
#
|
2
6
|
class Sort
|
3
|
-
attr_reader :reader
|
4
|
-
attr_reader :writer
|
5
|
-
attr_reader :compare_fields
|
6
|
-
|
7
|
-
def initialize(input:, input_fallback: "auto",
|
8
|
-
output:, output_fallback: "auto",
|
9
|
-
compare_fields:)
|
7
|
+
attr_reader :reader, :writer, :compare_fields
|
10
8
|
|
9
|
+
def initialize(input:, output:, compare_fields:, input_fallback: "auto", output_fallback: "auto")
|
11
10
|
@compare_fields = compare_fields
|
12
11
|
@reader = ::FlatKit::Reader.create_reader_from_path(path: input, compare_fields: @compare_fields,
|
13
12
|
fallback: input_fallback)
|
@@ -16,8 +15,8 @@ module FlatKit
|
|
16
15
|
end
|
17
16
|
|
18
17
|
def call
|
19
|
-
::FlatKit.logger.info "Sorting #{reader.source} into #{writer.destination} using key #{compare_fields.join(
|
20
|
-
records =
|
18
|
+
::FlatKit.logger.info "Sorting #{reader.source} into #{writer.destination} using key #{compare_fields.join(', ')}"
|
19
|
+
records = [].tap do |a|
|
21
20
|
reader.each do |r|
|
22
21
|
a << r
|
23
22
|
end
|
@@ -1,21 +1,22 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
2
4
|
class StatType
|
3
|
-
|
4
|
-
# Status object to keep track of the count and frequency of values
|
5
|
+
# Internal: Status object to keep track of the count and frequency of values.
|
5
6
|
#
|
6
7
|
class NominalStats < StatType
|
7
|
-
|
8
8
|
attr_reader :count
|
9
9
|
|
10
10
|
def self.default_stats
|
11
|
-
@default_stats ||= %w[
|
11
|
+
@default_stats ||= %w[count]
|
12
12
|
end
|
13
13
|
|
14
14
|
def self.all_stats
|
15
|
-
@all_stats ||= %w[
|
15
|
+
@all_stats ||= %w[count unique_count unique_values mode]
|
16
16
|
end
|
17
17
|
|
18
18
|
def initialize(collecting_frequencies: false)
|
19
|
+
super()
|
19
20
|
@mutex = Mutex.new
|
20
21
|
@count = 0
|
21
22
|
@collecting_frequencies = collecting_frequencies
|
@@ -24,26 +25,31 @@ module FlatKit
|
|
24
25
|
|
25
26
|
def collected_stats
|
26
27
|
return self.class.default_stats unless @collecting_frequencies
|
27
|
-
|
28
|
+
|
29
|
+
self.class.all_stats
|
28
30
|
end
|
29
31
|
|
30
32
|
def mode
|
31
33
|
return nil unless @collecting_frequencies
|
32
|
-
|
34
|
+
|
35
|
+
@frequencies.max_by { |_item, item_count| item_count }.first
|
33
36
|
end
|
34
37
|
|
35
38
|
def unique_count
|
36
39
|
return nil unless @collecting_frequencies
|
40
|
+
|
37
41
|
@frequencies.size
|
38
42
|
end
|
39
43
|
|
40
44
|
def unique_values
|
41
45
|
return nil unless @collecting_frequencies
|
46
|
+
|
42
47
|
@frequencies.keys
|
43
48
|
end
|
44
49
|
|
45
50
|
def frequencies
|
46
51
|
return nil unless @collecting_frequencies
|
52
|
+
|
47
53
|
@frequencies
|
48
54
|
end
|
49
55
|
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
#--
|
2
4
|
# Copyright (c) 2008, 2009 Jeremy Hinegardner
|
3
5
|
# All rights reserved. See LICENSE and/or COPYING for details.
|
@@ -5,16 +7,14 @@
|
|
5
7
|
# Pulled from Hitimes, which I also wrote
|
6
8
|
#++
|
7
9
|
|
8
|
-
require
|
9
|
-
require 'oj'
|
10
|
+
require "oj"
|
10
11
|
|
11
12
|
module FlatKit
|
12
13
|
class StatType
|
13
|
-
#
|
14
|
-
# Stats object will keep track of the _min_, _max_, _count_, _sum_ and _sumsq_
|
14
|
+
# Internal: Stats object to keep track of the _min_, _max_, _count_, _sum_ and _sumsq_
|
15
15
|
# and when you want you may also retrieve the _mean_, _stddev_ and _rate_.
|
16
16
|
#
|
17
|
-
#
|
17
|
+
# This contrived example shows getting a list of all the files in a directory
|
18
18
|
# and running stats on file sizes.
|
19
19
|
#
|
20
20
|
# s = FlatKit::Stats.new
|
@@ -33,17 +33,14 @@ module FlatKit
|
|
33
33
|
class NumericalStats < NominalStats
|
34
34
|
# A list of the available stats
|
35
35
|
|
36
|
-
attr_reader :min
|
37
|
-
attr_reader :max
|
38
|
-
attr_reader :sum
|
39
|
-
attr_reader :sumsq
|
36
|
+
attr_reader :min, :max, :sum, :sumsq
|
40
37
|
|
41
38
|
def self.default_stats
|
42
|
-
@default_stats ||= %w[
|
39
|
+
@default_stats ||= %w[count max mean min rate stddev sum sumsq]
|
43
40
|
end
|
44
41
|
|
45
42
|
def self.all_stats
|
46
|
-
@all_stats ||= %w[
|
43
|
+
@all_stats ||= %w[count max mean min mode rate stddev sum sumsq unique_count unique_values]
|
47
44
|
end
|
48
45
|
|
49
46
|
def initialize(collecting_frequencies: false)
|
@@ -61,8 +58,8 @@ module FlatKit
|
|
61
58
|
# Return the input value.
|
62
59
|
def update(value)
|
63
60
|
@mutex.synchronize do
|
64
|
-
@min =
|
65
|
-
@max =
|
61
|
+
@min = [value, @min].min
|
62
|
+
@max = [value, @max].max
|
66
63
|
|
67
64
|
@count += 1
|
68
65
|
@sum += value
|
@@ -72,17 +69,18 @@ module FlatKit
|
|
72
69
|
@frequencies[value] += 1 if @collecting_frequencies
|
73
70
|
end
|
74
71
|
|
75
|
-
|
72
|
+
value
|
76
73
|
end
|
77
74
|
|
78
75
|
# call-seq:
|
79
76
|
# stat.mean -> Float
|
80
|
-
#
|
77
|
+
#
|
81
78
|
# Return the arithmetic mean of the values put into the Stats object. If no
|
82
79
|
# values have passed through the stats object then 0.0 is returned;
|
83
80
|
def mean
|
84
81
|
return 0.0 if @count.zero?
|
85
|
-
|
82
|
+
|
83
|
+
@sum / @count
|
86
84
|
end
|
87
85
|
|
88
86
|
# call-seq:
|
@@ -100,7 +98,8 @@ module FlatKit
|
|
100
98
|
#
|
101
99
|
def rate
|
102
100
|
return 0.0 if @sum.zero?
|
103
|
-
|
101
|
+
|
102
|
+
@count / @sum
|
104
103
|
end
|
105
104
|
|
106
105
|
#
|
@@ -113,7 +112,8 @@ module FlatKit
|
|
113
112
|
#
|
114
113
|
def stddev
|
115
114
|
return 0.0 unless @count > 1
|
116
|
-
|
115
|
+
|
116
|
+
Math.sqrt((@sumsq - ((@sum * @sum) / @count)) / (@count - 1))
|
117
117
|
end
|
118
118
|
end
|
119
119
|
end
|