flat_kit 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CONTRIBUTING.md +46 -0
- data/HISTORY.md +5 -0
- data/LICENSE.txt +21 -0
- data/Manifest.txt +66 -0
- data/README.md +80 -0
- data/Rakefile +20 -0
- data/bin/fk +4 -0
- data/lib/flat_kit.rb +23 -0
- data/lib/flat_kit/cli.rb +80 -0
- data/lib/flat_kit/command.rb +53 -0
- data/lib/flat_kit/command/cat.rb +93 -0
- data/lib/flat_kit/command/merge.rb +88 -0
- data/lib/flat_kit/command/sort.rb +88 -0
- data/lib/flat_kit/descendant_tracker.rb +27 -0
- data/lib/flat_kit/error.rb +5 -0
- data/lib/flat_kit/format.rb +34 -0
- data/lib/flat_kit/input.rb +32 -0
- data/lib/flat_kit/input/file.rb +53 -0
- data/lib/flat_kit/input/io.rb +54 -0
- data/lib/flat_kit/internal_node.rb +84 -0
- data/lib/flat_kit/jsonl.rb +8 -0
- data/lib/flat_kit/jsonl/format.rb +25 -0
- data/lib/flat_kit/jsonl/reader.rb +30 -0
- data/lib/flat_kit/jsonl/record.rb +84 -0
- data/lib/flat_kit/jsonl/writer.rb +45 -0
- data/lib/flat_kit/leaf_node.rb +71 -0
- data/lib/flat_kit/logger.rb +39 -0
- data/lib/flat_kit/merge.rb +35 -0
- data/lib/flat_kit/merge_tree.rb +104 -0
- data/lib/flat_kit/output.rb +32 -0
- data/lib/flat_kit/output/file.rb +55 -0
- data/lib/flat_kit/output/io.rb +73 -0
- data/lib/flat_kit/reader.rb +61 -0
- data/lib/flat_kit/record.rb +83 -0
- data/lib/flat_kit/sentinel_internal_node.rb +37 -0
- data/lib/flat_kit/sentinel_leaf_node.rb +37 -0
- data/lib/flat_kit/sort.rb +35 -0
- data/lib/flat_kit/writer.rb +38 -0
- data/lib/flat_kit/xsv.rb +8 -0
- data/lib/flat_kit/xsv/format.rb +25 -0
- data/lib/flat_kit/xsv/reader.rb +45 -0
- data/lib/flat_kit/xsv/record.rb +90 -0
- data/lib/flat_kit/xsv/writer.rb +70 -0
- data/tasks/default.rake +242 -0
- data/tasks/extension.rake +38 -0
- data/tasks/man.rake +7 -0
- data/tasks/this.rb +208 -0
- data/test/device_dataset.rb +117 -0
- data/test/input/test_file.rb +73 -0
- data/test/input/test_io.rb +93 -0
- data/test/jsonl/test_format.rb +22 -0
- data/test/jsonl/test_reader.rb +49 -0
- data/test/jsonl/test_record.rb +61 -0
- data/test/jsonl/test_writer.rb +68 -0
- data/test/output/test_file.rb +60 -0
- data/test/output/test_io.rb +104 -0
- data/test/test_conversions.rb +45 -0
- data/test/test_format.rb +24 -0
- data/test/test_helper.rb +26 -0
- data/test/test_merge.rb +40 -0
- data/test/test_merge_tree.rb +64 -0
- data/test/test_version.rb +11 -0
- data/test/xsv/test_format.rb +22 -0
- data/test/xsv/test_reader.rb +61 -0
- data/test/xsv/test_record.rb +69 -0
- data/test/xsv/test_writer.rb +68 -0
- metadata +237 -0
@@ -0,0 +1,32 @@
|
|
1
|
+
module FlatKit
  # Public: Abstract base class for output destinations.
  #
  # The class extends DescendantTracker and relies on a find_child helper
  # (presumably supplied by that module -- confirm there) to locate the child
  # class whose .handles? accepts a given object.
  #
  # #name, #io and #close all raise NotImplementedError here and are meant to
  # be overridden by concrete outputs.
  class Output
    extend DescendantTracker

    # Public: Coerce an arbitrary object into an Output.
    #
    # out - an Output instance, which is returned untouched, or any other
    #       object, which is handed to the class returned by
    #       find_child(:handles?, out).
    #
    # Returns an Output instance.
    # Raises FlatKit::Error when find_child yields nothing for out.
    def self.from(out)
      return out if out.kind_of?(::FlatKit::Output)

      out_klass = find_child(:handles?, out)
      return out_klass.new(out) if out_klass

      raise FlatKit::Error, "Unable to create output from #{out.class} : #{out.inspect}"
    end

    # Public: The display name of this output.
    #
    # Raises NotImplementedError unless overridden.
    def name
      raise NotImplementedError, "#{self.class} must implement #name"
    end

    # Public: The underlying io object of this output.
    #
    # Raises NotImplementedError unless overridden.
    def io
      raise NotImplementedError, "#{self.class} must implement #io"
    end

    # Public: Close this output.
    #
    # Raises NotImplementedError unless overridden.
    def close
      raise NotImplementedError, "#{self.class} must implement #close"
    end
  end
end
|
30
|
+
|
31
|
+
require 'flat_kit/output/io'
|
32
|
+
require 'flat_kit/output/file'
|
@@ -0,0 +1,55 @@
|
|
1
|
+
require 'zlib'
|
2
|
+
|
3
|
+
module FlatKit
  class Output
    # Public: An Output backed by a file on disk.
    #
    # The destination is opened for writing as soon as the instance is created;
    # a destination whose extension is ".gz" is written through
    # Zlib::GzipWriter, anything else is opened as a plain binary file.
    class File < Output
      # Public: The destination as a Pathname.
      attr_reader :path

      # Public: Does this class know how to write to obj?
      #
      # obj - the candidate destination.
      #
      # Returns true for a Pathname, and for a String that Output::IO does not
      # recognise as a stdout / stderr alias; false for everything else.
      def self.handles?(obj)
        return true if obj.instance_of?(Pathname)
        return false unless obj.instance_of?(String)

        # In case these get loaded in different orders: ask Output::IO
        # directly whether the string names stdout or stderr.
        return false if ::FlatKit::Output::IO.is_stdout?(obj)
        return false if ::FlatKit::Output::IO.is_stderr?(obj)

        true
      end

      # Public: Open the destination for writing.
      #
      # obj - a String or Pathname naming the destination file. Missing parent
      #       directories are created before the file is opened.
      def initialize(obj)
        @path = Pathname.new(obj)
        path.dirname.mkpath
        @io = open_output(path)
      end

      # Public: The destination path as a String.
      def name
        path.to_s
      end

      # Public: Close the underlying io.
      def close
        @io.close
      end

      # Internal: api method for testing purposes.
      #
      # Returns the io object opened in #initialize.
      def io
        @io
      end

      private

      # Internal: Open the appropriate output type depending on the destination
      # file name.
      #
      # TODO: add in bzip
      #
      # path - the Pathname of the destination.
      #
      # Returns a Zlib::GzipWriter for ".gz" paths, otherwise a File opened
      # in "wb" mode.
      def open_output(path)
        case path.extname
        when ".gz"
          Zlib::GzipWriter.open(path.to_s)
        else
          path.open("wb")
        end
      end
    end
  end
end
|
@@ -0,0 +1,73 @@
|
|
1
|
+
module FlatKit
  class Output
    # Public: An Output that wraps an already-open stream.
    #
    # Accepts the stdout / stderr aliases listed in STDOUTS and STDERRS, the
    # STDOUT / STDERR objects themselves, or any File, StringIO or IO instance.
    class IO < Output
      # Public: Set to 0 in #initialize; nothing in this class changes it.
      attr_reader :count

      # Strings that select standard output / standard error. Frozen so the
      # alias lists cannot be altered at runtime.
      STDOUTS = %w[ stdout STDOUT - <stdout> ].freeze
      STDERRS = %w[ stderr STDERR <stderr> ].freeze

      # Public: Does this class know how to write to obj?
      #
      # Returns true for a stdout / stderr alias or object, or for any
      # File, StringIO or IO instance; false otherwise.
      def self.handles?(obj)
        return true if is_stderr?(obj)
        return true if is_stdout?(obj)

        [ ::File, ::StringIO, ::IO ].any? { |klass| obj.kind_of?(klass) }
      end

      # Public: Is obj one of the STDERRS aliases, or the STDERR object?
      def self.is_stderr?(obj)
        case obj
        when String then STDERRS.include?(obj)
        when ::IO   then obj == ::STDERR
        else false
        end
      end

      # Public: Is obj one of the STDOUTS aliases, or the STDOUT object?
      def self.is_stdout?(obj)
        case obj
        when String then STDOUTS.include?(obj)
        when ::IO   then obj == ::STDOUT
        else false
        end
      end

      # Public: Wrap obj.
      #
      # obj - a stdout / stderr alias or object, or a File, StringIO or IO.
      #
      # Raises FlatKit::Error for anything else.
      def initialize(obj)
        @count = 0
        if self.class.is_stdout?(obj)
          @name = "<STDOUT>"
          @io   = $stdout
        elsif self.class.is_stderr?(obj)
          @name = "<STDERR>"
          @io   = $stderr
        elsif obj.kind_of?(::File)
          # File is checked before the generic IO case so the path is used as
          # the name.
          @name = obj.path
          @io   = obj
        elsif obj.kind_of?(::StringIO) || obj.kind_of?(::IO)
          @name = obj.inspect
          @io   = obj
        else
          raise ::FlatKit::Error, "Unable to create #{self.class} from #{obj.class} : #{obj.inspect}"
        end
      end

      # Public: The display name chosen in #initialize.
      def name
        @name
      end

      # Public: Close the wrapped stream.
      #
      # NOTE(review): the stream is handed to us already open ("we are not in
      # charge of opening it"), yet it is closed here -- including $stdout and
      # $stderr when those were selected. Confirm callers rely on this.
      def close
        @io.close
      end

      # Internal: api method for testing.
      #
      # Returns the wrapped io object.
      def io
        @io
      end
    end
  end
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
module FlatKit
  # Public: The base class for all format readers.
  #
  # A format reader needs to be able to open the appropriate file format and
  # implement Enumerable to iterate over all the records in the file format.
  #
  # If it is appropriate for the reader to be able to read from an IO object
  # directly, that needs to be supported also.
  #
  # The ::FlatKit::Reader class should never be used directly, only the reader
  # from the appropriate format should be used.
  #
  # API:
  #
  #   initialize(source:, compare_fields:)
  #   each -> must be implemented by the format reader; raises
  #           NotImplementedError here
  #
  class Reader
    include Enumerable

    attr_reader :source
    attr_reader :compare_fields

    # Public: Build the reader matching the format of a single path.
    #
    # path           - the source to read, "-" by default.
    # fallback       - passed through to Format.for_with_fallback!.
    # compare_fields - passed through to the reader's constructor.
    #
    # Returns an instance of the reader class of the resolved format.
    def self.create_reader_from_path(path: "-", fallback: "auto", compare_fields: :none)
      format = ::FlatKit::Format.for_with_fallback!(path: path, fallback: fallback)
      format.reader.new(source: path, compare_fields: compare_fields)
    end

    # Public: Build one reader per path.
    #
    # paths          - a list of sources; an empty list means a single "-"
    #                  source. The list itself is never modified.
    # fallback       - see .create_reader_from_path.
    # compare_fields - see .create_reader_from_path.
    #
    # Returns an Array of readers, in the order of paths.
    def self.create_readers_from_paths(paths:, fallback: "auto", compare_fields: :none)
      # default to "-" if there are no paths, without appending to the
      # caller's array (which may be frozen or reused)
      sources = paths.empty? ? ["-"] : paths

      sources.map do |path|
        create_reader_from_path(path: path, fallback: fallback, compare_fields: compare_fields)
      end
    end

    # Public: Store the source and the resolved compare fields.
    #
    # source         - where the records come from.
    # compare_fields - the fields to compare records on; :none is stored as
    #                  an empty Array.
    def initialize(source:, compare_fields: :none)
      @source = source
      @compare_fields = resolve_compare_fields(compare_fields)
    end

    # Public: The format name reported by the reader's class.
    def format_name
      self.class.format_name
    end

    # Public: Iterate over the records; format readers must override this.
    #
    # Raises NotImplementedError.
    def each
      raise NotImplementedError, "#{self.class} needs to implement #each"
    end

    private

    # Internal: Map the :none placeholder to an empty list; any other value
    # is returned unchanged.
    def resolve_compare_fields(value)
      return [] if value == :none

      value
    end
  end
end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
module FlatKit
  # Public: The base class that all record classes should inherit from.
  #
  # Its goal is to be an efficient comparator of data that can be inflated from
  # a source structure to a fully realized hash.
  #
  # All records need to be able to be initialized from a data structure that
  # is handed to them by the Reader instance within the same Format.
  #
  # Records are generally not going to be created outside of this library, they
  # are tied to a specific format and provide a common interface that can be
  # used for:
  #
  # * comparison between records from different source / destination formats
  # * conversion to a different format
  #
  # Given that - the way to create a record is either from another Record
  # instance:
  #
  #   Record.from_record(other)  # create a record from another record
  #
  # or the way a Reader will do it
  #
  #   Record.new(...)            # generally only used by a Reader instance to
  #                              # yield new records
  #
  #
  # When implementing a new Format, the corresponding Record class for that
  # Format must:
  #
  # * implement `#[](key)` which will be used to lookup the values of the
  #   comparable fields.
  # * implement `#to_hash` which is used in conversions
  # * implement `.from_record` which is used in conversion
  # * have its initialize method call super(data:, compare_fields:) to
  #   initialize the root data structures
  class Record
    include Comparable

    attr_reader :data
    attr_reader :compare_fields

    # Public: Store the raw data and the ordered list of fields to compare on.
    def initialize(data:, compare_fields:)
      @data = data
      @compare_fields = compare_fields
    end

    # Public: The format name reported by the record's class.
    def format_name
      self.class.format_name
    end

    # Public: Compare two records field by field, in compare_fields order.
    #
    # For each field a nil value sorts before any non-nil value; when both
    # sides are nil the field is a tie and the next field decides. The first
    # field that does not tie determines the result.
    #
    # other - any object answering #[] for the compare fields.
    #
    # Returns -1, 0 or 1; nil when there are no compare fields at all, or
    # when two field values cannot be compared with each other.
    def <=>(other)
      outcome = nil

      compare_fields.each do |field|
        mine   = self[field]
        theirs = other[field]

        if mine.nil? && theirs.nil?
          # a tie on this field only -- later fields may still differ
          outcome = 0
          next
        end
        return -1 if mine.nil?
        return 1 if theirs.nil?

        outcome = mine <=> theirs

        # non-zero settles the ordering; nil means "not comparable" and is
        # handed back as is, following the Comparable convention
        return outcome unless outcome == 0
      end

      outcome
    end

    # Public: Look up the value of a field; must be overridden.
    #
    # Raises NotImplementedError.
    def [](key)
      raise NotImplementedError, "#{self.class} must implement #[](key)"
    end

    # Public: Convert the record to a Hash; must be overridden.
    #
    # Raises NotImplementedError.
    def to_hash
      raise NotImplementedError, "#{self.class} must implement #to_hash"
    end

    # Public: Create a record of this class from another record; must be
    # overridden.
    #
    # _record - the record to convert; optional and ignored here so that both
    #           from_record and from_record(other) reach the error below.
    #
    # Raises NotImplementedError.
    def self.from_record(_record = nil)
      raise NotImplementedError, "#{self} must implement #{self}.from_record"
    end
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module FlatKit
  # Private: The Sentinel Internal Node is a private class used by the MergeTree
  # class.
  #
  # This class represents an empty / completed node in the merge tree where all
  # the data from the descendant leaf nodes is fully used up.
  #
  class SentinelInternalNode
    include Comparable

    attr_reader :left
    attr_reader :right
    attr_reader :winner
    attr_accessor :next_level

    # left:  - accepted but ignored.
    # right: - accepted but ignored.
    #
    # Every attribute starts out as nil, whatever is passed in. The keywords
    # are presumably kept so the constructor can be called like that of a
    # regular internal node -- confirm against the callers.
    def initialize(left: nil, right: nil)
      @left = nil
      @right = nil
      @winner = nil
      @next_level = nil
    end

    # Always true: this node is a sentinel.
    def sentinel?
      true
    end

    # Always true.
    # NOTE(review): an "internal" node reporting itself as a leaf looks odd;
    # kept as is because tree traversal may depend on it.
    def leaf?
      true
    end

    # A sentinel node is always greater than any other node, and equal to
    # another sentinel.
    #
    # other - any object answering #sentinel?
    #
    # Returns 0 when other is a sentinel, 1 otherwise.
    def <=>(other)
      other.sentinel? ? 0 : 1
    end
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module FlatKit
  # Private: The Sentinel Leaf Node is used internally by the MergeTree
  #
  # This class represents a LeafNode that has no more data
  #
  class SentinelLeafNode
    include Comparable

    attr_accessor :next_level

    # Always true: this node is a sentinel.
    def sentinel?
      true
    end

    # Always true: this node is a leaf.
    def leaf?
      true
    end

    # There is never a next item; always nil.
    def next
      nil
    end

    # Always true: a sentinel leaf has nothing left to give.
    def finished?
      true
    end

    # A sentinel leaf holds no value; always nil.
    def value
      nil
    end

    # A sentinel node is always greater than any other node, and equal to
    # another sentinel.
    #
    # other - any object answering #sentinel?
    #
    # Returns 0 when other is a sentinel, 1 otherwise.
    def <=>(other)
      other.sentinel? ? 0 : 1
    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
module FlatKit
  # Public: Sort the records of one input into one output.
  #
  # All records are read into memory, sorted with Array#sort! (which uses the
  # records' own <=>), and then written out in order.
  class Sort
    attr_reader :reader
    attr_reader :writer
    attr_reader :compare_fields

    # Public: Build the reader and writer for the sort.
    #
    # input:           - the source handed to Reader.create_reader_from_path.
    # input_fallback:  - the fallback passed to the reader factory.
    # output:          - the destination handed to
    #                    Writer.create_writer_from_path.
    # output_fallback: - the fallback passed to the writer factory.
    # compare_fields:  - the fields to sort on; handed to the reader and
    #                    joined with ", " for logging, so it must respond
    #                    to #join.
    def initialize(input:, input_fallback: "auto",
                   output:, output_fallback: "auto",
                   compare_fields:)

      @compare_fields = compare_fields
      @reader = ::FlatKit::Reader.create_reader_from_path(path: input, compare_fields: @compare_fields,
                                                          fallback: input_fallback)
      @writer = ::FlatKit::Writer.create_writer_from_path(path: output, fallback: output_fallback,
                                                          reader_format: @reader.format_name)
    end

    # Public: Read, sort and write all the records.
    #
    # The writer is closed whether or not reading, sorting or writing
    # succeeds, so a failure part-way through does not leave the destination
    # open.
    def call
      ::FlatKit.logger.info "Sorting #{reader.source} into #{writer.destination} using key #{compare_fields.join(", ")}"

      begin
        records = reader.to_a
        ::FlatKit.logger.info "Read #{reader.count} records into #{records.size} element array"

        records.sort!
        ::FlatKit.logger.info "Sorted #{records.size} records"

        records.each { |record| writer.write(record) }
      ensure
        writer.close
      end

      ::FlatKit.logger.info "Wrote #{writer.count} records to #{writer.destination}"
    end
  end
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
module FlatKit
  # Public: The base class for all format writers.
  #
  # A format writer only writes Records, and of those, only Records of its own
  # format.
  #
  # It must implement a #write method that takes a Record. It can convert the
  # record to one matching its own format if it wishes, but it should in any
  # case check the Record format to make sure it matches.
  #
  # See the Xsv::Writer and Jsonl::Writer for examples.
  #
  class Writer
    attr_reader :destination

    # Public: Build the writer matching the format of a destination path.
    #
    # path:          - the destination, also handed to the writer.
    # fallback:      - passed to Format.for_with_fallback!; the value "auto"
    #                  is replaced by reader_format first.
    # reader_format: - the format name of the reader feeding this writer.
    #
    # Returns an instance of the writer class of the resolved format.
    def self.create_writer_from_path(path:, fallback:, reader_format:)
      effective_fallback = fallback == "auto" ? reader_format : fallback
      format = ::FlatKit::Format.for_with_fallback!(path: path, fallback: effective_fallback)
      format.writer.new(destination: path)
    end

    # Public: Remember where the records are going.
    def initialize(destination:)
      @destination = destination
    end

    # Public: The format name reported by the writer's class.
    def format_name
      self.class.format_name
    end

    # Public: Write a single record; format writers must override this.
    #
    # Raises NotImplementedError.
    def write(record)
      raise NotImplementedError, "#{self.class} needs to implement #write"
    end

    # Public: Close the destination; format writers must override this.
    #
    # Raises NotImplementedError.
    def close
      raise NotImplementedError, "#{self.class} needs to implement #close"
    end
  end
end
|