flat_kit 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CONTRIBUTING.md +46 -0
- data/HISTORY.md +5 -0
- data/LICENSE.txt +21 -0
- data/Manifest.txt +66 -0
- data/README.md +80 -0
- data/Rakefile +20 -0
- data/bin/fk +4 -0
- data/lib/flat_kit.rb +23 -0
- data/lib/flat_kit/cli.rb +80 -0
- data/lib/flat_kit/command.rb +53 -0
- data/lib/flat_kit/command/cat.rb +93 -0
- data/lib/flat_kit/command/merge.rb +88 -0
- data/lib/flat_kit/command/sort.rb +88 -0
- data/lib/flat_kit/descendant_tracker.rb +27 -0
- data/lib/flat_kit/error.rb +5 -0
- data/lib/flat_kit/format.rb +34 -0
- data/lib/flat_kit/input.rb +32 -0
- data/lib/flat_kit/input/file.rb +53 -0
- data/lib/flat_kit/input/io.rb +54 -0
- data/lib/flat_kit/internal_node.rb +84 -0
- data/lib/flat_kit/jsonl.rb +8 -0
- data/lib/flat_kit/jsonl/format.rb +25 -0
- data/lib/flat_kit/jsonl/reader.rb +30 -0
- data/lib/flat_kit/jsonl/record.rb +84 -0
- data/lib/flat_kit/jsonl/writer.rb +45 -0
- data/lib/flat_kit/leaf_node.rb +71 -0
- data/lib/flat_kit/logger.rb +39 -0
- data/lib/flat_kit/merge.rb +35 -0
- data/lib/flat_kit/merge_tree.rb +104 -0
- data/lib/flat_kit/output.rb +32 -0
- data/lib/flat_kit/output/file.rb +55 -0
- data/lib/flat_kit/output/io.rb +73 -0
- data/lib/flat_kit/reader.rb +61 -0
- data/lib/flat_kit/record.rb +83 -0
- data/lib/flat_kit/sentinel_internal_node.rb +37 -0
- data/lib/flat_kit/sentinel_leaf_node.rb +37 -0
- data/lib/flat_kit/sort.rb +35 -0
- data/lib/flat_kit/writer.rb +38 -0
- data/lib/flat_kit/xsv.rb +8 -0
- data/lib/flat_kit/xsv/format.rb +25 -0
- data/lib/flat_kit/xsv/reader.rb +45 -0
- data/lib/flat_kit/xsv/record.rb +90 -0
- data/lib/flat_kit/xsv/writer.rb +70 -0
- data/tasks/default.rake +242 -0
- data/tasks/extension.rake +38 -0
- data/tasks/man.rake +7 -0
- data/tasks/this.rb +208 -0
- data/test/device_dataset.rb +117 -0
- data/test/input/test_file.rb +73 -0
- data/test/input/test_io.rb +93 -0
- data/test/jsonl/test_format.rb +22 -0
- data/test/jsonl/test_reader.rb +49 -0
- data/test/jsonl/test_record.rb +61 -0
- data/test/jsonl/test_writer.rb +68 -0
- data/test/output/test_file.rb +60 -0
- data/test/output/test_io.rb +104 -0
- data/test/test_conversions.rb +45 -0
- data/test/test_format.rb +24 -0
- data/test/test_helper.rb +26 -0
- data/test/test_merge.rb +40 -0
- data/test/test_merge_tree.rb +64 -0
- data/test/test_version.rb +11 -0
- data/test/xsv/test_format.rb +22 -0
- data/test/xsv/test_reader.rb +61 -0
- data/test/xsv/test_record.rb +69 -0
- data/test/xsv/test_writer.rb +68 -0
- metadata +237 -0
@@ -0,0 +1,32 @@
|
|
1
|
+
module FlatKit
  # Base class for every output destination.
  #
  # Concrete outputs subclass Output and provide a class-level `handles?(obj)`
  # predicate plus the instance methods #name, #io and #close. Output.from
  # asks the DescendantTracker extension (via find_child) for the subclass
  # whose `handles?` accepts the object it was given.
  class Output
    extend DescendantTracker

    # Build an Output for +out+.
    #
    # out - an existing Output (returned untouched) or any object that one of
    #       the registered subclasses reports it can handle.
    #
    # Returns an Output instance.
    # Raises FlatKit::Error when no subclass handles +out+.
    def self.from(out)
      return out if out.kind_of?(::FlatKit::Output)

      handler = find_child(:handles?, out)
      return handler.new(out) if handler

      raise FlatKit::Error, "Unable to create output from #{out.class} : #{out.inspect}"
    end

    # Abstract: a human readable name for the destination.
    def name
      raise NotImplementedError, "#{self.class} must implement #name"
    end

    # Abstract: the underlying IO-like object that data is written to.
    def io
      raise NotImplementedError, "#{self.class} must implement #io"
    end

    # Abstract: close the underlying destination.
    def close
      raise NotImplementedError, "#{self.class} must implement #close"
    end
  end
end
|
30
|
+
|
31
|
+
require 'flat_kit/output/io'
|
32
|
+
require 'flat_kit/output/file'
|
@@ -0,0 +1,55 @@
|
|
1
|
+
require 'zlib'
|
2
|
+
|
3
|
+
module FlatKit
  class Output
    # An Output that writes to a path on the filesystem.
    #
    # The parent directory of the path is created on construction, and paths
    # ending in ".gz" are written through a Zlib::GzipWriter.
    class File < Output
      # path - the Pathname being written to
      # io   - the opened stream; internal api, exposed for testing purposes
      attr_reader :path, :io

      # Does this class know how to write to +obj+?
      #
      # Pathname instances are always accepted. Strings are accepted unless
      # Output::IO recognizes them as a name for stdout or stderr. Everything
      # else is rejected.
      #
      # Returns true or false.
      def self.handles?(obj)
        return true if obj.instance_of?(Pathname)
        return false unless obj.instance_of?(String)

        # checked here too, in case the Output classes get loaded in a
        # different order
        std_stream = ::FlatKit::Output::IO.is_stdout?(obj) || ::FlatKit::Output::IO.is_stderr?(obj)
        !std_stream
      end

      # obj - a String or Pathname naming the destination file.
      #
      # Creates any missing parent directories and opens the file right away.
      def initialize(obj)
        @path = Pathname.new(obj)
        @path.dirname.mkpath
        @io = open_output(@path)
      end

      # The destination path as a String.
      def name
        path.to_s
      end

      # Close the underlying file / gzip stream.
      def close
        io.close
      end

      private

      # Open the appropriate output type depending on the destination file
      # name: a gzip writer for ".gz", otherwise a plain binary-mode file.
      #
      # TODO: add in bzip
      def open_output(path)
        if path.extname == ".gz"
          Zlib::GzipWriter.open(path.to_s)
        else
          path.open("wb")
        end
      end
    end
  end
end
|
@@ -0,0 +1,73 @@
|
|
1
|
+
module FlatKit
  class Output
    # An Output wrapping an already-open stream: stdout, stderr (by name or by
    # object), a ::File, a ::StringIO or any other ::IO.
    class IO < Output
      # count - set to 0 on construction; nothing in this class updates it
      # name  - a label for the stream ("<STDOUT>", "<STDERR>", the file path,
      #         or the stream's #inspect output)
      # io    - the wrapped stream; internal api, exposed for testing
      attr_reader :count, :name, :io

      # Strings accepted as names for standard output / standard error.
      STDOUTS = %w[ stdout STDOUT - <stdout> ]
      STDERRS = %w[ stderr STDERR <stderr> ]

      # Does this class know how to write to +obj+?
      #
      # Returns true for a stdout/stderr name or object, and for any ::File,
      # ::StringIO or ::IO instance; false otherwise.
      def self.handles?(obj)
        return true if is_stderr?(obj) || is_stdout?(obj)

        [ ::File, ::StringIO, ::IO ].any? { |klass| obj.kind_of?(klass) }
      end

      # Is +obj+ one of the STDERRS names, or the ::STDERR stream itself?
      def self.is_stderr?(obj)
        case obj
        when String then STDERRS.include?(obj)
        when ::IO   then obj == ::STDERR
        else false
        end
      end

      # Is +obj+ one of the STDOUTS names, or the ::STDOUT stream itself?
      def self.is_stdout?(obj)
        case obj
        when String then STDOUTS.include?(obj)
        when ::IO   then obj == ::STDOUT
        else false
        end
      end

      # obj - a stdout/stderr name, or an open ::File / ::StringIO / ::IO.
      #
      # Raises FlatKit::Error when +obj+ is none of those.
      def initialize(obj)
        @count = 0
        @name, @io =
          if self.class.is_stdout?(obj)
            [ "<STDOUT>", $stdout ]
          elsif self.class.is_stderr?(obj)
            [ "<STDERR>", $stderr ]
          elsif obj.kind_of?(::File)
            [ obj.path, obj ]
          elsif obj.kind_of?(::StringIO) || obj.kind_of?(::IO)
            [ obj.inspect, obj ]
          else
            raise ::FlatKit::Error, "Unable to create #{self.class} from #{obj.class} : #{obj.inspect}"
          end
      end

      # Close the wrapped stream. The stream was handed to us already open
      # (or is $stdout / $stderr), so this closes the caller's object as well.
      def close
        io.close
      end
    end
  end
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
module FlatKit
  # Public: the base class for all format readers.
  #
  # A format reader needs to be able to open the appropriate file format and
  # implement Enumerable to iterate over all the records in the file format.
  #
  # If it is appropriate for the reader to be able to read from an IO object
  # directly, that needs to be supported also.
  #
  # The ::FlatKit::Reader class should never be used directly, only the reader
  # from the appropriate format should be used.
  #
  # API:
  #
  #   initialize(source:, compare_fields:)
  #   each -> Yields / returns
  #
  class Reader
    include Enumerable

    attr_reader :source
    attr_reader :compare_fields

    # Create the format-specific reader for a single path.
    #
    # path           - the source to read; defaults to "-"
    # fallback       - passed through to Format.for_with_fallback!
    # compare_fields - the fields the reader's records are compared on, or
    #                  :none
    #
    # Returns an instance of the reader class reported by the resolved format.
    def self.create_reader_from_path(path: "-", fallback: "auto", compare_fields: :none)
      format = ::FlatKit::Format.for_with_fallback!(path: path, fallback: fallback)
      format.reader.new(source: path, compare_fields: compare_fields)
    end

    # Create one reader per path.
    #
    # paths          - a list of paths (nil or a single path is also accepted);
    #                  when empty, a single reader for "-" is created
    # fallback       - passed through to create_reader_from_path
    # compare_fields - passed through to create_reader_from_path
    #
    # The +paths+ argument is never modified, so a frozen or shared array is
    # safe to pass in.
    #
    # Returns an Array of readers, in the same order as +paths+.
    def self.create_readers_from_paths(paths:, fallback: "auto", compare_fields: :none)
      sources = Array(paths)

      # default to "-" if there are no paths, without touching the caller's array
      sources = ["-"] if sources.empty?

      sources.map do |path|
        create_reader_from_path(path: path, fallback: fallback, compare_fields: compare_fields)
      end
    end

    # source         - where the records come from
    # compare_fields - list of field names, or :none for an empty list
    def initialize(source:, compare_fields: :none)
      @source = source
      @compare_fields = resolve_compare_fields(compare_fields)
    end

    # The format name, as reported by the concrete reader class.
    def format_name
      self.class.format_name
    end

    # Abstract: concrete readers must yield each record.
    def each
      raise NotImplementedError, "#{self.class} needs to implement #each"
    end

    private

    # Translate the :none marker into an empty field list; any other value is
    # used as given.
    def resolve_compare_fields(value)
      return [] if value == :none

      value
    end
  end
end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
module FlatKit
  # Public: The base class that all record classes should inherit from.
  #
  # Its goal is to be an efficient comparator of data that can be inflated from
  # a source structure to a fully realized hash.
  #
  # All records need to be able to be initialized from a data structure that
  # is handed to them by the Reader instance within the same Format.
  #
  # Records are generally not going to be created outside of this library, they
  # are tied to a specific format and provide a common interface that can be
  # used for:
  #
  # * comparison between records from different source / destination formats
  # * conversion to a different format
  #
  # Given that - the way to create a record is either from another Record
  # instance:
  #
  #   Record.from_record(other)  # create a record from another record
  #
  # or the way a Reader will do it
  #
  #   Record.new(...)            # generally only used by a Reader instance to
  #                              # yield new records
  #
  #
  # When implementing a new Format, the corresponding Record class for that
  # Format must:
  #
  # * implement `#[](key)` which will be used to lookup the values of the
  #   comparable fields.
  # * implement `#to_hash` which is used in conversions
  # * implement `.from_record` which is used in conversion
  # * have its initialize method call super(data:, compare_fields:) to
  #   initialize the root data structures
  class Record
    include Comparable

    attr_reader :data
    attr_reader :compare_fields

    # data           - the format-specific source structure for this record
    # compare_fields - the ordered list of field names used by #<=>
    def initialize(data:, compare_fields:)
      @data = data
      @compare_fields = compare_fields
    end

    # The format name, as reported by the concrete record class.
    def format_name
      self.class.format_name
    end

    # Compare this record to +other+, field by field, in compare_fields order.
    #
    # For each field a nil value sorts before a non-nil value, and two nil
    # values are a tie for that field. The first field that is not a tie
    # decides the result.
    #
    # Returns -1, 0 or 1; nil when the two values of a field cannot be
    # compared with each other, or when there are no compare_fields at all.
    def <=>(other)
      compare_result = nil

      compare_fields.each do |field|
        my_val = self[field]
        other_val = other[field]

        compare_result =
          if my_val.nil? && other_val.nil?
            # a tie on this field only - later fields still get a say
            0
          elsif my_val.nil?
            -1
          elsif other_val.nil?
            1
          else
            my_val <=> other_val
          end

        # nil means "not comparable"; hand that back instead of calling
        # #zero? on it
        return compare_result if compare_result.nil? || !compare_result.zero?
      end

      compare_result
    end

    # Abstract: look up the value of +key+ in the record.
    def [](key)
      raise NotImplementedError, "#{self.class} must implement #[](key)"
    end

    # Abstract: the record as a fully realized Hash.
    def to_hash
      raise NotImplementedError, "#{self.class} must implement #to_hash"
    end

    # Abstract: build a record of this class from another record.
    #
    # The argument is optional here only so that both the documented call
    # form, Record.from_record(other), and a bare call reach the
    # NotImplementedError below.
    def self.from_record(_record = nil)
      raise NotImplementedError, "#{self} must implement #{self}.from_record"
    end
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module FlatKit
  # Private: The Sentinel Internal Node is a private class used by the
  # MergeTree class.
  #
  # This class represents an empty / completed node in the merge tree where
  # all the data from the descendant leaf node is fully used up.
  #
  class SentinelInternalNode
    include Comparable

    attr_reader :left, :right, :winner
    attr_accessor :next_level

    # The left: and right: keywords are accepted but not stored; every
    # attribute of a sentinel starts out as nil.
    def initialize(left: nil, right: nil)
      @left = @right = @winner = @next_level = nil
    end

    # Always true for this class.
    def sentinel?
      true
    end

    # Always true for this class.
    def leaf?
      true
    end

    # A sentinel node is equal to any other sentinel and greater than
    # every non-sentinel node.
    def <=>(other)
      other.sentinel? ? 0 : 1
    end
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module FlatKit
  # Private: The Sentinel Leaf Node is used internally by the MergeTree.
  #
  # This class represents a LeafNode that has no more data: it is always
  # finished, and both its value and its next item are nil.
  #
  class SentinelLeafNode
    include Comparable

    attr_accessor :next_level

    # Always true for this class.
    def sentinel?
      true
    end

    # Always true for this class.
    def leaf?
      true
    end

    # There is never a next item.
    def next
      nil
    end

    # There is nothing left to read, so this is always true.
    def finished?
      true
    end

    # There is never a current value.
    def value
      nil
    end

    # A sentinel node is equal to any other sentinel and greater than
    # every non-sentinel node.
    def <=>(other)
      other.sentinel? ? 0 : 1
    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
module FlatKit
  # Reads every record from one input, sorts them in memory on the given
  # compare fields, and writes the sorted records to one output.
  class Sort
    attr_reader :reader, :writer, :compare_fields

    # input           - the path handed to Reader.create_reader_from_path
    # input_fallback  - fallback format for the input; defaults to "auto"
    # output          - the path handed to Writer.create_writer_from_path
    # output_fallback - fallback format for the output; defaults to "auto"
    # compare_fields  - the fields the records are sorted on
    #
    # The writer is created after the reader because it is given the
    # reader's format name.
    def initialize(input:, input_fallback: "auto",
                   output:, output_fallback: "auto",
                   compare_fields:)

      @compare_fields = compare_fields
      @reader = ::FlatKit::Reader.create_reader_from_path(
        path: input,
        compare_fields: @compare_fields,
        fallback: input_fallback
      )
      @writer = ::FlatKit::Writer.create_writer_from_path(
        path: output,
        fallback: output_fallback,
        reader_format: @reader.format_name
      )
    end

    # Run the sort: load all records, sort them in place, write them out in
    # order, then close the writer. Progress is reported through
    # FlatKit.logger at info level.
    def call
      ::FlatKit.logger.info "Sorting #{reader.source} into #{writer.destination} using key #{compare_fields.join(", ")}"

      records = []
      reader.each { |record| records << record }
      ::FlatKit.logger.info "Read #{reader.count} records into #{records.size} element array"

      records.sort!
      ::FlatKit.logger.info "Sorted #{records.size} records"

      records.each { |record| writer.write(record) }
      writer.close
      ::FlatKit.logger.info "Wrote #{writer.count} records to #{writer.destination}"
    end
  end
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
module FlatKit
  # Public: The base class for all format writers.
  #
  # A format writer only writes Records, and of those, only Records of its
  # own format.
  #
  # It must implement a #write method that takes a Record. It can convert the
  # record to one matching its own format if it wishes, but it should in any
  # case check the Record format to make sure it matches.
  #
  # See the Xsv::Writer and Jsonl::Writer for examples.
  #
  class Writer
    attr_reader :destination

    # Create the format-specific writer for +path+.
    #
    # path          - the destination to write to
    # fallback      - the fallback format; the value "auto" means "use the
    #                 reader's format"
    # reader_format - the format name of the reader feeding this writer
    #
    # Returns an instance of the writer class reported by the resolved format.
    def self.create_writer_from_path(path:, fallback:, reader_format:)
      effective_fallback = fallback == "auto" ? reader_format : fallback
      resolved = ::FlatKit::Format.for_with_fallback!(path: path, fallback: effective_fallback)
      resolved.writer.new(destination: path)
    end

    # destination - where the records are written to
    def initialize(destination:)
      @destination = destination
    end

    # The format name, as reported by the concrete writer class.
    def format_name
      self.class.format_name
    end

    # Abstract: write a single record to the destination.
    def write(record)
      raise NotImplementedError, "#{self.class} needs to implement #write"
    end

    # Abstract: close the destination.
    def close
      raise NotImplementedError, "#{self.class} needs to implement #close"
    end
  end
end
|