flat_kit 0.3.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CONTRIBUTING.md +1 -2
- data/HISTORY.md +13 -0
- data/Manifest.txt +3 -42
- data/README.md +2 -0
- data/{bin → exe}/fk +2 -1
- data/flat_kit.gemspec +33 -0
- data/lib/flat_kit/cli.rb +46 -32
- data/lib/flat_kit/command/cat.rb +34 -32
- data/lib/flat_kit/command/merge.rb +37 -36
- data/lib/flat_kit/command/sort.rb +37 -37
- data/lib/flat_kit/command/stats.rb +41 -39
- data/lib/flat_kit/command.rb +10 -11
- data/lib/flat_kit/descendant_tracker.rb +9 -6
- data/lib/flat_kit/error.rb +4 -0
- data/lib/flat_kit/event_emitter.rb +6 -3
- data/lib/flat_kit/field_stats.rb +31 -26
- data/lib/flat_kit/field_type/boolean_type.rb +9 -5
- data/lib/flat_kit/field_type/date_type.rb +19 -17
- data/lib/flat_kit/field_type/float_type.rb +15 -9
- data/lib/flat_kit/field_type/guess_type.rb +9 -6
- data/lib/flat_kit/field_type/integer_type.rb +6 -4
- data/lib/flat_kit/field_type/null_type.rb +5 -1
- data/lib/flat_kit/field_type/string_type.rb +8 -6
- data/lib/flat_kit/field_type/timestamp_type.rb +11 -10
- data/lib/flat_kit/field_type/unknown_type.rb +12 -8
- data/lib/flat_kit/field_type.rb +52 -44
- data/lib/flat_kit/format.rb +11 -5
- data/lib/flat_kit/input/file.rb +11 -9
- data/lib/flat_kit/input/io.rb +18 -21
- data/lib/flat_kit/input.rb +8 -7
- data/lib/flat_kit/internal_node.rb +22 -19
- data/lib/flat_kit/jsonl/format.rb +6 -2
- data/lib/flat_kit/jsonl/reader.rb +7 -4
- data/lib/flat_kit/jsonl/record.rb +15 -18
- data/lib/flat_kit/jsonl/writer.rb +8 -10
- data/lib/flat_kit/jsonl.rb +8 -4
- data/lib/flat_kit/leaf_node.rb +6 -5
- data/lib/flat_kit/log_formatter.rb +20 -0
- data/lib/flat_kit/logger.rb +13 -21
- data/lib/flat_kit/merge.rb +21 -18
- data/lib/flat_kit/merge_tree.rb +5 -6
- data/lib/flat_kit/output/file.rb +13 -9
- data/lib/flat_kit/output/io.rb +40 -35
- data/lib/flat_kit/output.rb +8 -7
- data/lib/flat_kit/position.rb +3 -4
- data/lib/flat_kit/reader.rb +8 -8
- data/lib/flat_kit/record.rb +12 -12
- data/lib/flat_kit/sentinel_internal_node.rb +6 -5
- data/lib/flat_kit/sentinel_leaf_node.rb +4 -1
- data/lib/flat_kit/sort.rb +8 -13
- data/lib/flat_kit/stat_type/nominal_stats.rb +13 -7
- data/lib/flat_kit/stat_type/numerical_stats.rb +18 -18
- data/lib/flat_kit/stat_type/ordinal_stats.rb +8 -13
- data/lib/flat_kit/stat_type.rb +18 -13
- data/lib/flat_kit/stats.rb +12 -15
- data/lib/flat_kit/writer.rb +5 -6
- data/lib/flat_kit/xsv/format.rb +6 -2
- data/lib/flat_kit/xsv/reader.rb +8 -6
- data/lib/flat_kit/xsv/record.rb +22 -18
- data/lib/flat_kit/xsv/writer.rb +13 -10
- data/lib/flat_kit/xsv.rb +7 -4
- data/lib/flat_kit.rb +31 -26
- metadata +20 -161
- data/Rakefile +0 -21
- data/examples/stream-active-record-to-csv.rb +0 -42
- data/tasks/default.rake +0 -242
- data/tasks/extension.rake +0 -38
- data/tasks/man.rake +0 -7
- data/tasks/this.rb +0 -208
- data/test/device_dataset.rb +0 -117
- data/test/field_type/test_boolean_type.rb +0 -65
- data/test/field_type/test_date_type.rb +0 -71
- data/test/field_type/test_float_type.rb +0 -56
- data/test/field_type/test_guess_type.rb +0 -14
- data/test/field_type/test_integer_type.rb +0 -52
- data/test/field_type/test_null_type.rb +0 -41
- data/test/field_type/test_string_type.rb +0 -18
- data/test/field_type/test_timestamp_type.rb +0 -108
- data/test/field_type/test_unknown_type.rb +0 -35
- data/test/input/test_file.rb +0 -73
- data/test/input/test_io.rb +0 -93
- data/test/jsonl/test_format.rb +0 -22
- data/test/jsonl/test_reader.rb +0 -49
- data/test/jsonl/test_record.rb +0 -61
- data/test/jsonl/test_writer.rb +0 -86
- data/test/output/test_file.rb +0 -60
- data/test/output/test_io.rb +0 -104
- data/test/run +0 -23
- data/test/stat_type/test_nominal_stats.rb +0 -69
- data/test/stat_type/test_numerical_stats.rb +0 -118
- data/test/stat_type/test_ordinal_stats.rb +0 -92
- data/test/test_conversions.rb +0 -45
- data/test/test_event_emitter.rb +0 -89
- data/test/test_field_stats.rb +0 -134
- data/test/test_field_type.rb +0 -34
- data/test/test_format.rb +0 -24
- data/test/test_helper.rb +0 -26
- data/test/test_merge.rb +0 -40
- data/test/test_merge_tree.rb +0 -64
- data/test/test_version.rb +0 -11
- data/test/xsv/test_format.rb +0 -22
- data/test/xsv/test_reader.rb +0 -61
- data/test/xsv/test_record.rb +0 -69
- data/test/xsv/test_writer.rb +0 -89
data/lib/flat_kit/logger.rb
CHANGED
|
@@ -1,36 +1,28 @@
|
|
|
1
|
-
|
|
1
|
+
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
class LogFormatter < ::Logger::Formatter
|
|
5
|
-
FORMAT = "%s %5d %05s : %s\n".freeze
|
|
6
|
-
DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ".freeze
|
|
7
|
-
def initialize
|
|
8
|
-
super
|
|
9
|
-
self.datetime_format = DATETIME_FORMAT
|
|
10
|
-
end
|
|
11
|
-
|
|
12
|
-
def call(severity, time, progname, msg)
|
|
13
|
-
FORMAT % [format_datetime(time.utc), Process.pid, severity, msg2str(msg)]
|
|
14
|
-
end
|
|
15
|
-
end
|
|
3
|
+
require "logger"
|
|
16
4
|
|
|
5
|
+
# Public: Top level namespace for the gem
|
|
6
|
+
#
|
|
7
|
+
module FlatKit
|
|
8
|
+
# Internal: Logger class
|
|
9
|
+
#
|
|
17
10
|
class Logger
|
|
18
11
|
def self.for_io(io)
|
|
19
12
|
::Logger.new(io, formatter: LogFormatter.new)
|
|
20
13
|
end
|
|
21
14
|
|
|
22
15
|
def self.for_path(path)
|
|
23
|
-
|
|
24
|
-
for_io(io)
|
|
16
|
+
for_io(File.open(path.to_s, "a"))
|
|
25
17
|
end
|
|
26
18
|
end
|
|
27
19
|
|
|
28
20
|
def self.log_to(destination = $stderr)
|
|
29
|
-
if destination.
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
21
|
+
@logger = if destination.is_a?(::IO)
|
|
22
|
+
::FlatKit::Logger.for_io(destination)
|
|
23
|
+
else
|
|
24
|
+
::FlatKit::Logger.for_path(destination)
|
|
25
|
+
end
|
|
34
26
|
end
|
|
35
27
|
|
|
36
28
|
def self.logger
|
data/lib/flat_kit/merge.rb
CHANGED
|
@@ -1,15 +1,14 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
module FlatKit
|
|
4
|
+
# Internal: Class implementing merging from N inputs and output to 1 output.
|
|
5
|
+
#
|
|
2
6
|
class Merge
|
|
3
|
-
|
|
4
7
|
include ::FlatKit::EventEmitter
|
|
5
8
|
|
|
6
|
-
attr_reader :readers
|
|
7
|
-
attr_reader :writer
|
|
8
|
-
attr_reader :compare_fields
|
|
9
|
+
attr_reader :readers, :writer, :compare_fields
|
|
9
10
|
|
|
10
|
-
def initialize(inputs:, input_fallback: "auto",
|
|
11
|
-
output:, output_fallback: "auto",
|
|
12
|
-
compare_fields:)
|
|
11
|
+
def initialize(inputs:, output:, compare_fields:, input_fallback: "auto", output_fallback: "auto")
|
|
13
12
|
@compare_fields = compare_fields
|
|
14
13
|
@readers = ::FlatKit::Reader.create_readers_from_paths(paths: inputs, compare_fields: @compare_fields,
|
|
15
14
|
fallback: input_fallback)
|
|
@@ -19,21 +18,12 @@ module FlatKit
|
|
|
19
18
|
|
|
20
19
|
def call
|
|
21
20
|
::FlatKit.logger.debug "Merging the following files into #{writer.destination}"
|
|
22
|
-
::FlatKit.logger.debug "Using this key for sorting: #{compare_fields.join(
|
|
21
|
+
::FlatKit.logger.debug "Using this key for sorting: #{compare_fields.join(', ')}"
|
|
23
22
|
readers.each do |r|
|
|
24
23
|
::FlatKit.logger.debug " #{r.source}"
|
|
25
24
|
end
|
|
26
25
|
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
notify_listeners(name: :start, data: :start)
|
|
30
|
-
merge_tree.each do |record|
|
|
31
|
-
|
|
32
|
-
position = writer.write(record)
|
|
33
|
-
meta = { position: position }
|
|
34
|
-
notify_listeners(name: :record, data: record, meta: meta)
|
|
35
|
-
end
|
|
36
|
-
notify_listeners(name: :stop, data: :stop)
|
|
26
|
+
run_merge(readers)
|
|
37
27
|
|
|
38
28
|
readers.each do |r|
|
|
39
29
|
::FlatKit.logger.debug " #{r.source} produced #{r.count} records"
|
|
@@ -42,5 +32,18 @@ module FlatKit
|
|
|
42
32
|
writer.close
|
|
43
33
|
::FlatKit.logger.debug "Wrote #{writer.count} records to #{writer.destination}"
|
|
44
34
|
end
|
|
35
|
+
|
|
36
|
+
private
|
|
37
|
+
|
|
38
|
+
def run_merge(readers)
|
|
39
|
+
tree = ::FlatKit::MergeTree.new(readers)
|
|
40
|
+
notify_listeners(name: :start, data: :start)
|
|
41
|
+
tree.each do |record|
|
|
42
|
+
position = writer.write(record)
|
|
43
|
+
meta = { position: position }
|
|
44
|
+
notify_listeners(name: :record, data: record, meta: meta)
|
|
45
|
+
end
|
|
46
|
+
notify_listeners(name: :stop, data: :stop)
|
|
47
|
+
end
|
|
45
48
|
end
|
|
46
49
|
end
|
data/lib/flat_kit/merge_tree.rb
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
module FlatKit
|
|
2
4
|
# Public: Merge a list of sorted records from Readers into a single output Writer
|
|
3
5
|
#
|
|
@@ -29,9 +31,7 @@ module FlatKit
|
|
|
29
31
|
class MergeTree
|
|
30
32
|
include Enumerable
|
|
31
33
|
|
|
32
|
-
attr_reader :leaves
|
|
33
|
-
attr_reader :levels
|
|
34
|
-
attr_reader :readers
|
|
34
|
+
attr_reader :leaves, :levels, :readers
|
|
35
35
|
|
|
36
36
|
def initialize(readers)
|
|
37
37
|
@readers = readers
|
|
@@ -44,9 +44,7 @@ module FlatKit
|
|
|
44
44
|
|
|
45
45
|
# Need to pad the leaves to an even number so that the slicing by 2 for
|
|
46
46
|
# the tournament will work
|
|
47
|
-
if @leaves.size.odd?
|
|
48
|
-
@leaves << SentinelLeafNode.new
|
|
49
|
-
end
|
|
47
|
+
@leaves << SentinelLeafNode.new if @leaves.size.odd?
|
|
50
48
|
|
|
51
49
|
init_tree
|
|
52
50
|
end
|
|
@@ -94,6 +92,7 @@ module FlatKit
|
|
|
94
92
|
def each
|
|
95
93
|
loop do
|
|
96
94
|
break if root.leaf.finished?
|
|
95
|
+
|
|
97
96
|
yield root.value
|
|
98
97
|
# consume the yielded value and have the tournament tree replay those
|
|
99
98
|
# brackets affected
|
data/lib/flat_kit/output/file.rb
CHANGED
|
@@ -1,22 +1,31 @@
|
|
|
1
|
-
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "zlib"
|
|
4
|
+
require "pathname"
|
|
2
5
|
|
|
3
6
|
module FlatKit
|
|
4
7
|
class Output
|
|
8
|
+
# Internal: File output implementation
|
|
9
|
+
#
|
|
5
10
|
class File < Output
|
|
6
11
|
attr_reader :path
|
|
7
12
|
|
|
13
|
+
# internal api method for testing purposes
|
|
14
|
+
attr_reader :io
|
|
15
|
+
|
|
8
16
|
def self.handles?(obj)
|
|
9
17
|
return true if obj.instance_of?(Pathname)
|
|
10
18
|
return false unless obj.instance_of?(String)
|
|
11
19
|
|
|
12
20
|
# incase these get loaded in different orders
|
|
13
|
-
return false if ::FlatKit::Output::IO.
|
|
14
|
-
return false if ::FlatKit::Output::IO.
|
|
21
|
+
return false if ::FlatKit::Output::IO.stdout?(obj)
|
|
22
|
+
return false if ::FlatKit::Output::IO.stderr?(obj)
|
|
15
23
|
|
|
16
|
-
|
|
24
|
+
true
|
|
17
25
|
end
|
|
18
26
|
|
|
19
27
|
def initialize(obj)
|
|
28
|
+
super()
|
|
20
29
|
@path = Pathname.new(obj)
|
|
21
30
|
path.dirname.mkpath
|
|
22
31
|
@io = open_output(path)
|
|
@@ -30,11 +39,6 @@ module FlatKit
|
|
|
30
39
|
@io.close
|
|
31
40
|
end
|
|
32
41
|
|
|
33
|
-
# internal api method for testing purposes
|
|
34
|
-
def io
|
|
35
|
-
@io
|
|
36
|
-
end
|
|
37
|
-
|
|
38
42
|
private
|
|
39
43
|
|
|
40
44
|
# open the opropriate otuput type depending on the destination file name
|
data/lib/flat_kit/output/io.rb
CHANGED
|
@@ -1,73 +1,78 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
module FlatKit
|
|
2
4
|
class Output
|
|
5
|
+
# Internal: Non-file Output impelementation - this is genrally to stdout or stderr
|
|
6
|
+
#
|
|
3
7
|
class IO < Output
|
|
4
|
-
attr_reader :count
|
|
8
|
+
attr_reader :count, :name
|
|
9
|
+
|
|
10
|
+
# internal api method for testing
|
|
11
|
+
attr_reader :io
|
|
5
12
|
|
|
6
|
-
STDOUTS = %w[
|
|
7
|
-
STDERRS = %w[
|
|
13
|
+
STDOUTS = %w[stdout STDOUT - <stdout>].freeze
|
|
14
|
+
STDERRS = %w[stderr STDERR <stderr>].freeze
|
|
8
15
|
|
|
9
16
|
def self.handles?(obj)
|
|
10
|
-
return true if
|
|
11
|
-
return true if
|
|
12
|
-
return true if [
|
|
13
|
-
|
|
17
|
+
return true if stderr?(obj)
|
|
18
|
+
return true if stdout?(obj)
|
|
19
|
+
return true if [::File, ::StringIO, ::IO].any? { |klass| obj.is_a?(klass) }
|
|
20
|
+
|
|
21
|
+
false
|
|
14
22
|
end
|
|
15
23
|
|
|
16
|
-
def self.
|
|
24
|
+
def self.stderr?(obj)
|
|
17
25
|
case obj
|
|
18
26
|
when String
|
|
19
27
|
return true if STDERRS.include?(obj)
|
|
20
28
|
when ::IO
|
|
21
|
-
return true if obj ==
|
|
29
|
+
return true if obj == $stderr
|
|
22
30
|
end
|
|
23
|
-
|
|
31
|
+
false
|
|
24
32
|
end
|
|
25
33
|
|
|
26
|
-
def self.
|
|
34
|
+
def self.stdout?(obj)
|
|
27
35
|
case obj
|
|
28
36
|
when String
|
|
29
37
|
return true if STDOUTS.include?(obj)
|
|
30
38
|
when ::IO
|
|
31
|
-
return true if obj ==
|
|
39
|
+
return true if obj == $stdout
|
|
32
40
|
end
|
|
33
|
-
|
|
41
|
+
false
|
|
34
42
|
end
|
|
35
43
|
|
|
36
44
|
def initialize(obj)
|
|
45
|
+
super()
|
|
37
46
|
@count = 0
|
|
38
|
-
|
|
47
|
+
@name = nil
|
|
48
|
+
@io = nil
|
|
49
|
+
init_name_and_io(obj)
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
# this goes to an io stream and we are not in charge of opening it
|
|
53
|
+
def close
|
|
54
|
+
@io.close
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
private
|
|
58
|
+
|
|
59
|
+
def init_name_and_io(obj)
|
|
60
|
+
if self.class.stdout?(obj)
|
|
39
61
|
@name = "<STDOUT>"
|
|
40
62
|
@io = $stdout
|
|
41
|
-
elsif self.class.
|
|
63
|
+
elsif self.class.stderr?(obj)
|
|
42
64
|
@name = "<STDERR>"
|
|
43
65
|
@io = $stderr
|
|
44
|
-
elsif obj.
|
|
45
|
-
@name = obj.path
|
|
66
|
+
elsif obj.is_a?(::IO)
|
|
67
|
+
@name = (obj.respond_to?(:path) && obj.path) || obj.inspect
|
|
46
68
|
@io = obj
|
|
47
|
-
elsif obj.
|
|
48
|
-
@name = obj.inspect
|
|
49
|
-
@io = obj
|
|
50
|
-
elsif obj.kind_of?(::IO) then
|
|
69
|
+
elsif obj.is_a?(::StringIO)
|
|
51
70
|
@name = obj.inspect
|
|
52
71
|
@io = obj
|
|
53
72
|
else
|
|
54
73
|
raise ::FlatKit::Error, "Unable to create #{self.class} from #{obj.class} : #{obj.inspect}"
|
|
55
74
|
end
|
|
56
75
|
end
|
|
57
|
-
|
|
58
|
-
def name
|
|
59
|
-
@name
|
|
60
|
-
end
|
|
61
|
-
|
|
62
|
-
# this goes to an io stream and we are not in charge of opening it
|
|
63
|
-
def close
|
|
64
|
-
@io.close
|
|
65
|
-
end
|
|
66
|
-
|
|
67
|
-
# internal api method for testing
|
|
68
|
-
def io
|
|
69
|
-
@io
|
|
70
|
-
end
|
|
71
76
|
end
|
|
72
77
|
end
|
|
73
78
|
end
|
data/lib/flat_kit/output.rb
CHANGED
|
@@ -1,14 +1,16 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
module FlatKit
|
|
4
|
+
# Internal: Base clases for all output handlers
|
|
5
|
+
#
|
|
2
6
|
class Output
|
|
3
7
|
extend DescendantTracker
|
|
4
8
|
|
|
5
9
|
def self.from(out)
|
|
6
|
-
return out if out.
|
|
10
|
+
return out if out.is_a?(::FlatKit::Output)
|
|
7
11
|
|
|
8
12
|
out_klass = find_child(:handles?, out)
|
|
9
|
-
if out_klass
|
|
10
|
-
return out_klass.new(out)
|
|
11
|
-
end
|
|
13
|
+
return out_klass.new(out) if out_klass
|
|
12
14
|
|
|
13
15
|
raise FlatKit::Error, "Unable to create output from #{out.class} : #{out.inspect}"
|
|
14
16
|
end
|
|
@@ -17,7 +19,6 @@ module FlatKit
|
|
|
17
19
|
raise NotImplementedError, "#{self.class} must implement #name"
|
|
18
20
|
end
|
|
19
21
|
|
|
20
|
-
#
|
|
21
22
|
def io
|
|
22
23
|
raise NotImplementedError, "#{self.class} must implement #io"
|
|
23
24
|
end
|
|
@@ -32,5 +33,5 @@ module FlatKit
|
|
|
32
33
|
end
|
|
33
34
|
end
|
|
34
35
|
|
|
35
|
-
require
|
|
36
|
-
require
|
|
36
|
+
require "flat_kit/output/io"
|
|
37
|
+
require "flat_kit/output/file"
|
data/lib/flat_kit/position.rb
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
module FlatKit
|
|
2
4
|
# The information about the position of a record in an IO stream
|
|
3
5
|
#
|
|
@@ -5,10 +7,7 @@ module FlatKit
|
|
|
5
7
|
# information about the record that was just written
|
|
6
8
|
#
|
|
7
9
|
class Position
|
|
8
|
-
|
|
9
|
-
attr_reader :index # zero based
|
|
10
|
-
attr_reader :offset # byte offset in the IO stream
|
|
11
|
-
attr_reader :bytesize # byte length of the record
|
|
10
|
+
attr_reader :index, :offset, :bytesize # zero based # byte offset in the IO stream # byte length of the record
|
|
12
11
|
|
|
13
12
|
def initialize(index: nil, offset: nil, bytesize: nil)
|
|
14
13
|
@index = index
|
data/lib/flat_kit/reader.rb
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
module FlatKit
|
|
2
4
|
# Public: the base class for all format readers.
|
|
3
5
|
#
|
|
@@ -14,24 +16,21 @@ module FlatKit
|
|
|
14
16
|
# API:
|
|
15
17
|
#
|
|
16
18
|
# initialize(source:, compare_fields:)
|
|
17
|
-
# each -> Yields / returns
|
|
19
|
+
# each -> Yields / returns
|
|
18
20
|
#
|
|
19
21
|
class Reader
|
|
20
22
|
include Enumerable
|
|
21
23
|
|
|
22
|
-
attr_reader :source
|
|
23
|
-
attr_reader :compare_fields
|
|
24
|
+
attr_reader :source, :compare_fields
|
|
24
25
|
|
|
25
26
|
def self.create_reader_from_path(path: "-", fallback: "auto", compare_fields: :none)
|
|
26
27
|
format = ::FlatKit::Format.for_with_fallback!(path: path, fallback: fallback)
|
|
27
|
-
|
|
28
|
+
format.reader.new(source: path, compare_fields: compare_fields)
|
|
28
29
|
end
|
|
29
30
|
|
|
30
31
|
def self.create_readers_from_paths(paths:, fallback: "auto", compare_fields: :none)
|
|
31
32
|
# default to stdin if there are no paths
|
|
32
|
-
if paths.empty?
|
|
33
|
-
paths << "-"
|
|
34
|
-
end
|
|
33
|
+
paths << "-" if paths.empty?
|
|
35
34
|
|
|
36
35
|
paths.map do |path|
|
|
37
36
|
create_reader_from_path(path: path, fallback: fallback, compare_fields: compare_fields)
|
|
@@ -55,7 +54,8 @@ module FlatKit
|
|
|
55
54
|
|
|
56
55
|
def resolve_compare_fields(value)
|
|
57
56
|
return [] if value == :none
|
|
58
|
-
|
|
57
|
+
|
|
58
|
+
value
|
|
59
59
|
end
|
|
60
60
|
end
|
|
61
61
|
end
|
data/lib/flat_kit/record.rb
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
module FlatKit
|
|
2
4
|
# Public: The base class that all record classes should inherit from.
|
|
3
5
|
#
|
|
@@ -35,11 +37,9 @@ module FlatKit
|
|
|
35
37
|
# # the initialize method must call super(data:, compare_fields:) to
|
|
36
38
|
# initializa the root data structures
|
|
37
39
|
class Record
|
|
38
|
-
|
|
39
40
|
include Comparable
|
|
40
41
|
|
|
41
|
-
attr_reader :data
|
|
42
|
-
attr_reader :compare_fields
|
|
42
|
+
attr_reader :data, :compare_fields
|
|
43
43
|
|
|
44
44
|
def initialize(data:, compare_fields:)
|
|
45
45
|
@data = data
|
|
@@ -57,15 +57,15 @@ module FlatKit
|
|
|
57
57
|
my_val = self[field]
|
|
58
58
|
other_val = other[field]
|
|
59
59
|
|
|
60
|
-
if my_val.nil? && other_val.nil?
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
60
|
+
compare_result = if my_val.nil? && other_val.nil?
|
|
61
|
+
0
|
|
62
|
+
elsif my_val.nil?
|
|
63
|
+
-1
|
|
64
|
+
elsif other_val.nil?
|
|
65
|
+
1
|
|
66
|
+
else
|
|
67
|
+
my_val <=> other_val
|
|
68
|
+
end
|
|
69
69
|
|
|
70
70
|
return compare_result unless compare_result.zero?
|
|
71
71
|
end
|
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
module FlatKit
|
|
2
4
|
# Private: The Sentinel Internal Node is a private class used by the MergeTree
|
|
3
5
|
# class.
|
|
@@ -8,12 +10,10 @@ module FlatKit
|
|
|
8
10
|
class SentinelInternalNode
|
|
9
11
|
include Comparable
|
|
10
12
|
|
|
11
|
-
attr_reader :left
|
|
12
|
-
attr_reader :right
|
|
13
|
-
attr_reader :winner
|
|
13
|
+
attr_reader :left, :right, :winner
|
|
14
14
|
attr_accessor :next_level
|
|
15
15
|
|
|
16
|
-
def initialize(
|
|
16
|
+
def initialize(*)
|
|
17
17
|
@left = nil
|
|
18
18
|
@right = nil
|
|
19
19
|
@winner = nil
|
|
@@ -31,7 +31,8 @@ module FlatKit
|
|
|
31
31
|
# A sentinal node is always greater than any other node
|
|
32
32
|
def <=>(other)
|
|
33
33
|
return 0 if other.sentinel?
|
|
34
|
-
|
|
34
|
+
|
|
35
|
+
1
|
|
35
36
|
end
|
|
36
37
|
end
|
|
37
38
|
end
|
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
module FlatKit
|
|
2
4
|
# Private: The Sentinel Leaf Node is used internally by the MergeTree
|
|
3
5
|
#
|
|
@@ -31,7 +33,8 @@ module FlatKit
|
|
|
31
33
|
# A sentinal node is always greater than any other node
|
|
32
34
|
def <=>(other)
|
|
33
35
|
return 0 if other.sentinel?
|
|
34
|
-
|
|
36
|
+
|
|
37
|
+
1
|
|
35
38
|
end
|
|
36
39
|
end
|
|
37
40
|
end
|
data/lib/flat_kit/sort.rb
CHANGED
|
@@ -1,13 +1,12 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
module FlatKit
|
|
4
|
+
# Internal: Sorts an Input and sends the sorted records to an Output
|
|
5
|
+
#
|
|
2
6
|
class Sort
|
|
3
|
-
attr_reader :reader
|
|
4
|
-
attr_reader :writer
|
|
5
|
-
attr_reader :compare_fields
|
|
6
|
-
|
|
7
|
-
def initialize(input:, input_fallback: "auto",
|
|
8
|
-
output:, output_fallback: "auto",
|
|
9
|
-
compare_fields:)
|
|
7
|
+
attr_reader :reader, :writer, :compare_fields
|
|
10
8
|
|
|
9
|
+
def initialize(input:, output:, compare_fields:, input_fallback: "auto", output_fallback: "auto")
|
|
11
10
|
@compare_fields = compare_fields
|
|
12
11
|
@reader = ::FlatKit::Reader.create_reader_from_path(path: input, compare_fields: @compare_fields,
|
|
13
12
|
fallback: input_fallback)
|
|
@@ -16,12 +15,8 @@ module FlatKit
|
|
|
16
15
|
end
|
|
17
16
|
|
|
18
17
|
def call
|
|
19
|
-
::FlatKit.logger.info "Sorting #{reader.source} into #{writer.destination} using key #{compare_fields.join(
|
|
20
|
-
records =
|
|
21
|
-
reader.each do |r|
|
|
22
|
-
a << r
|
|
23
|
-
end
|
|
24
|
-
end
|
|
18
|
+
::FlatKit.logger.info "Sorting #{reader.source} into #{writer.destination} using key #{compare_fields.join(', ')}"
|
|
19
|
+
records = reader.map { |r| r }
|
|
25
20
|
::FlatKit.logger.info "Read #{reader.count} records into #{records.size} element array"
|
|
26
21
|
records.sort!
|
|
27
22
|
::FlatKit.logger.info "Sorted #{records.size} records"
|
|
@@ -1,21 +1,22 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
module FlatKit
|
|
2
4
|
class StatType
|
|
3
|
-
|
|
4
|
-
# Status object to keep track of the count and frequency of values
|
|
5
|
+
# Internal: Status object to keep track of the count and frequency of values.
|
|
5
6
|
#
|
|
6
7
|
class NominalStats < StatType
|
|
7
|
-
|
|
8
8
|
attr_reader :count
|
|
9
9
|
|
|
10
10
|
def self.default_stats
|
|
11
|
-
@default_stats ||= %w[
|
|
11
|
+
@default_stats ||= %w[count]
|
|
12
12
|
end
|
|
13
13
|
|
|
14
14
|
def self.all_stats
|
|
15
|
-
@all_stats ||= %w[
|
|
15
|
+
@all_stats ||= %w[count unique_count unique_values mode]
|
|
16
16
|
end
|
|
17
17
|
|
|
18
18
|
def initialize(collecting_frequencies: false)
|
|
19
|
+
super()
|
|
19
20
|
@mutex = Mutex.new
|
|
20
21
|
@count = 0
|
|
21
22
|
@collecting_frequencies = collecting_frequencies
|
|
@@ -24,26 +25,31 @@ module FlatKit
|
|
|
24
25
|
|
|
25
26
|
def collected_stats
|
|
26
27
|
return self.class.default_stats unless @collecting_frequencies
|
|
27
|
-
|
|
28
|
+
|
|
29
|
+
self.class.all_stats
|
|
28
30
|
end
|
|
29
31
|
|
|
30
32
|
def mode
|
|
31
33
|
return nil unless @collecting_frequencies
|
|
32
|
-
|
|
34
|
+
|
|
35
|
+
@frequencies.max_by { |_item, item_count| item_count }.first
|
|
33
36
|
end
|
|
34
37
|
|
|
35
38
|
def unique_count
|
|
36
39
|
return nil unless @collecting_frequencies
|
|
40
|
+
|
|
37
41
|
@frequencies.size
|
|
38
42
|
end
|
|
39
43
|
|
|
40
44
|
def unique_values
|
|
41
45
|
return nil unless @collecting_frequencies
|
|
46
|
+
|
|
42
47
|
@frequencies.keys
|
|
43
48
|
end
|
|
44
49
|
|
|
45
50
|
def frequencies
|
|
46
51
|
return nil unless @collecting_frequencies
|
|
52
|
+
|
|
47
53
|
@frequencies
|
|
48
54
|
end
|
|
49
55
|
|