flat_kit 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CONTRIBUTING.md +46 -0
- data/HISTORY.md +5 -0
- data/LICENSE.txt +21 -0
- data/Manifest.txt +66 -0
- data/README.md +80 -0
- data/Rakefile +20 -0
- data/bin/fk +4 -0
- data/lib/flat_kit.rb +23 -0
- data/lib/flat_kit/cli.rb +80 -0
- data/lib/flat_kit/command.rb +53 -0
- data/lib/flat_kit/command/cat.rb +93 -0
- data/lib/flat_kit/command/merge.rb +88 -0
- data/lib/flat_kit/command/sort.rb +88 -0
- data/lib/flat_kit/descendant_tracker.rb +27 -0
- data/lib/flat_kit/error.rb +5 -0
- data/lib/flat_kit/format.rb +34 -0
- data/lib/flat_kit/input.rb +32 -0
- data/lib/flat_kit/input/file.rb +53 -0
- data/lib/flat_kit/input/io.rb +54 -0
- data/lib/flat_kit/internal_node.rb +84 -0
- data/lib/flat_kit/jsonl.rb +8 -0
- data/lib/flat_kit/jsonl/format.rb +25 -0
- data/lib/flat_kit/jsonl/reader.rb +30 -0
- data/lib/flat_kit/jsonl/record.rb +84 -0
- data/lib/flat_kit/jsonl/writer.rb +45 -0
- data/lib/flat_kit/leaf_node.rb +71 -0
- data/lib/flat_kit/logger.rb +39 -0
- data/lib/flat_kit/merge.rb +35 -0
- data/lib/flat_kit/merge_tree.rb +104 -0
- data/lib/flat_kit/output.rb +32 -0
- data/lib/flat_kit/output/file.rb +55 -0
- data/lib/flat_kit/output/io.rb +73 -0
- data/lib/flat_kit/reader.rb +61 -0
- data/lib/flat_kit/record.rb +83 -0
- data/lib/flat_kit/sentinel_internal_node.rb +37 -0
- data/lib/flat_kit/sentinel_leaf_node.rb +37 -0
- data/lib/flat_kit/sort.rb +35 -0
- data/lib/flat_kit/writer.rb +38 -0
- data/lib/flat_kit/xsv.rb +8 -0
- data/lib/flat_kit/xsv/format.rb +25 -0
- data/lib/flat_kit/xsv/reader.rb +45 -0
- data/lib/flat_kit/xsv/record.rb +90 -0
- data/lib/flat_kit/xsv/writer.rb +70 -0
- data/tasks/default.rake +242 -0
- data/tasks/extension.rake +38 -0
- data/tasks/man.rake +7 -0
- data/tasks/this.rb +208 -0
- data/test/device_dataset.rb +117 -0
- data/test/input/test_file.rb +73 -0
- data/test/input/test_io.rb +93 -0
- data/test/jsonl/test_format.rb +22 -0
- data/test/jsonl/test_reader.rb +49 -0
- data/test/jsonl/test_record.rb +61 -0
- data/test/jsonl/test_writer.rb +68 -0
- data/test/output/test_file.rb +60 -0
- data/test/output/test_io.rb +104 -0
- data/test/test_conversions.rb +45 -0
- data/test/test_format.rb +24 -0
- data/test/test_helper.rb +26 -0
- data/test/test_merge.rb +40 -0
- data/test/test_merge_tree.rb +64 -0
- data/test/test_version.rb +11 -0
- data/test/xsv/test_format.rb +22 -0
- data/test/xsv/test_reader.rb +61 -0
- data/test/xsv/test_record.rb +69 -0
- data/test/xsv/test_writer.rb +68 -0
- metadata +237 -0
@@ -0,0 +1,88 @@
|
|
1
|
+
require 'csv'
|
2
|
+
module FlatKit
  class Command
    # The `merge` sub-command of the command line tool.
    #
    # It parses the command line arguments, builds a ::FlatKit::Merge from
    # them, and runs that merge when #call is invoked.
    #
    # NOTE(review): `argv` and `opts` are not defined in this class; they are
    # presumably supplied by ::FlatKit::Command - confirm against the parent.
    class Merge < ::FlatKit::Command

      # Returns the name of this sub-command as used on the command line.
      def self.name
        "merge"
      end

      # Returns the one line summary shown at the top of the help output.
      def self.description
        "Merge sorted files together that have the same structure."
      end

      # Builds the Optimist parser holding the help text and the option
      # definitions for this command.
      #
      # Returns a new ::Optimist::Parser on every call.
      def self.parser
        ::Optimist::Parser.new do
          banner Merge.description
          banner ""

          banner <<~BANNER
            Given a set of input files that have the same structure, and are already
            sorted by a set of keys, the merge command will merge all those files
            into a single output file.

            The --key parameter is required, and it must be a comma separated list
            of field names in the input to use as the sort key for the merge
            process.

            There must also be at least 2 input files. Merging only 1 file into an
            output file is the same as the 'cat' command.

            The flatfile type(s) will be automatically determined by the file name.
            If the output is not a file, but to stdout then the output type will
            be the same as the first input file, or it can be specified as a commandline
            switch.

            The merge will do a single pass through the input to generate the
            output.
          BANNER

          banner <<~USAGE

            Usage:
              fk merge --key surname,given_name file1.csv file2.csv > all.csv
              fk merge --key surname,given_name --output-format json file1.csv file2.csv > all.json
              fk merge --key field1,field2 --output-format json input*.csv | gzip -c > all.json.gz
              fk merge --key field12 file*.json.gz -o all.json.gz

          USAGE

          banner <<~OPTIONS

            Options:

          OPTIONS

          opt :output, "Send the output to the given path instead of standard out.", default: "<stdout>"
          opt :input_format, "Input format, csv or json", default: "auto", short: :none
          opt :output_format, "Output format, csv or json", default: "auto", short: :none
          opt :key, "The comma separated list of field(s) to use for sorting the input", required: true, type: :string
        end
      end

      # The list of field names, parsed from --key, that records are compared on.
      attr_reader :compare_keys

      # Parses the command line and prepares the merge operation.
      #
      # Any problem (bad --key, fewer than 2 input files, or a ::FlatKit::Error
      # raised while building the merge) is converted into an
      # ::Optimist::CommandlineError so that Optimist's standard exception
      # handling reports it to the user.
      def parse
        parser = self.class.parser
        ::Optimist::with_standard_exception_handling(parser) do
          begin
            @opts = parser.parse(argv)
            @compare_keys = parse_compare_keys(opts[:key])
            paths = parser.leftovers
            raise ::Optimist::CommandlineError, "At least 2 input files are required" if paths.size < 2

            @merge = ::FlatKit::Merge.new(inputs: paths, input_fallback: opts[:input_format],
                                          compare_fields: @compare_keys,
                                          output: opts[:output], output_fallback: opts[:output_format])
          rescue ::FlatKit::Error => e
            raise ::Optimist::CommandlineError, e.message
          end
        end
      end

      # Runs the merge that was prepared by #parse.
      def call
        @merge.call
      end

      private

      # Splits the raw --key value into a list of field names.
      #
      # CSV.parse_line returns nil for an empty string and nil entries for
      # empty fields, so a value that names no field at all is rejected here
      # instead of being handed to the merge as nil.
      #
      # Raises ::Optimist::CommandlineError when no field is named or when the
      # value is not parseable as a CSV line.
      def parse_compare_keys(raw)
        keys = CSV.parse_line(raw)
        raise ::Optimist::CommandlineError, "--key must name at least one field" if Array(keys).compact.empty?

        keys
      rescue ::CSV::MalformedCSVError => e
        raise ::Optimist::CommandlineError, "Invalid --key value: #{e.message}"
      end
    end
  end
end
|
@@ -0,0 +1,88 @@
|
|
1
|
+
require 'csv'
|
2
|
+
module FlatKit
  class Command
    # The `sort` sub-command of the command line tool.
    #
    # It parses the command line arguments, builds a ::FlatKit::Sort from
    # them, and runs that sort when #call is invoked.
    #
    # NOTE(review): `argv` and `opts` are not defined in this class; they are
    # presumably supplied by ::FlatKit::Command - confirm against the parent.
    class Sort < ::FlatKit::Command

      # Returns the name of this sub-command as used on the command line.
      def self.name
        "sort"
      end

      # Returns the one line summary shown at the top of the help output.
      def self.description
        "Sort a given file by a set of fields."
      end

      # Builds the Optimist parser holding the help text and the option
      # definitions for this command.
      #
      # Returns a new ::Optimist::Parser on every call.
      def self.parser
        ::Optimist::Parser.new do
          banner Sort.description
          banner ""

          banner <<~BANNER
            Given an input file and a sort key, order the records in that file by that
            key. If no input file is given then stdin is assumed. If no output file
            is given then stdout is assumed.

            The --key parameter is required, and it must be a comma separated list
            of field names in the input to use as the sort key.

            There must be at most 1 input file.

            The flatfile type(s) will be automatically determined by the file name.
            If the output is not a file, but to stdout then the output type will
            be the same as the first input file, or it can be specified as a commandline
            switch.

          BANNER

          banner <<~USAGE

            Usage:
              fk sort --key surname,given_name file.csv > sorted.csv
              fk sort --key surname,given_name --output-format json file.csv > sorted.json
              fk sort --key field1,field2 --output-format json input.csv | gzip -c > sorted.json.gz
              fk sort --key field1 file.json.gz -o sorted.json.gz
              gunzip -c file.json.gz | fk sort --key field1 --input-format json --output-format json | gzip -c > sorted.json.gz

          USAGE

          banner <<~OPTIONS

            Options:

          OPTIONS

          opt :output, "Send the output to the given path instead of standard out.", default: "<stdout>"
          opt :input_format, "Input format, csv or json", default: "auto", short: :none
          opt :output_format, "Output format, csv or json", default: "auto", short: :none
          opt :key, "The comma separated list of field(s) to use for sorting the input", required: true, type: :string
        end
      end

      # The list of field names, parsed from --key, that records are sorted on.
      attr_reader :compare_keys
      # Never assigned in this class; kept so the public interface is unchanged.
      attr_reader :reader
      # The ::FlatKit::Sort built by #parse.
      attr_reader :sort

      # Parses the command line and prepares the sort operation.
      #
      # Zero or one input path is accepted; with no path the input is "-",
      # i.e. stdin. Any problem (bad --key, more than 1 input file, or a
      # ::FlatKit::Error raised while building the sort) is converted into an
      # ::Optimist::CommandlineError so that Optimist's standard exception
      # handling reports it to the user.
      def parse
        parser = self.class.parser
        ::Optimist::with_standard_exception_handling(parser) do
          begin
            @opts = parser.parse(argv)
            @compare_keys = parse_compare_keys(opts[:key])
            paths = parser.leftovers
            raise ::Optimist::CommandlineError, "1 and only 1 input file is allowed" if paths.size > 1

            path = paths.first || "-" # default to stdin
            @sort = ::FlatKit::Sort.new(input: path, input_fallback: opts[:input_format],
                                        output: opts[:output], output_fallback: opts[:output_format],
                                        compare_fields: @compare_keys)
          rescue ::FlatKit::Error => e
            raise ::Optimist::CommandlineError, e.message
          end
        end
      end

      # Runs the sort that was prepared by #parse.
      def call
        sort.call
      end

      private

      # Splits the raw --key value into a list of field names.
      #
      # CSV.parse_line returns nil for an empty string and nil entries for
      # empty fields, so a value that names no field at all is rejected here
      # instead of being handed to the sort as nil.
      #
      # Raises ::Optimist::CommandlineError when no field is named or when the
      # value is not parseable as a CSV line.
      def parse_compare_keys(raw)
        keys = CSV.parse_line(raw)
        raise ::Optimist::CommandlineError, "--key must name at least one field" if Array(keys).compact.empty?

        keys
      rescue ::CSV::MalformedCSVError => e
        raise ::Optimist::CommandlineError, "Invalid --key value: #{e.message}"
      end
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'set'
|
2
|
+
|
3
|
+
module FlatKit
  # Mixin, meant to be used with `extend`, that records the direct subclasses
  # of a class as they are defined and allows searching through them.
  module DescendantTracker
    # Ruby hook invoked whenever a subclass of the extending class is created.
    # The new subclass is added to #children when it is a plain Class.
    def inherited(klass)
      super
      children << klass if klass.instance_of?(Class)
    end

    # The Set of subclasses recorded so far; created lazily on first access.
    def children
      @_children ||= Set.new
    end

    #
    # Find the first child that returns truthy from the given method with args
    #
    # Returns the matching child class, or nil when no child matches.
    #
    def find_child(method, *args)
      children.find { |candidate| candidate.send(method, *args) }
    end
  end
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
module FlatKit
  # Base class of the flat file formats.
  #
  # Subclasses are recorded through DescendantTracker, and the lookup methods
  # here pick the first recorded subclass whose `handles?` class method
  # returns truthy for the value given.
  class Format
    extend DescendantTracker

    # The name of the format; every subclass has to provide its own.
    #
    # Raises NotImplementedError naming the class the method was called on.
    def self.format_name
      # Inside a class method `self` is already the class; `self.class` would
      # always be `Class` and make the message useless.
      raise NotImplementedError, "#{self} must implement #{self}.format_name"
    end

    # Instance level convenience for the class level format name.
    def format_name
      self.class.format_name
    end

    # Looks up the format class for the given path.
    #
    # path - anything responding to #to_s; the string form is what is passed
    #        to each child's `handles?`.
    #
    # Returns the first child class that handles the path, or nil.
    def self.for(path)
      find_child(:handles?, path.to_s)
    end

    # Looks up the format class for the path, and when none is found, looks up
    # the format class for the fallback value instead.
    #
    # Returns the format class found, or nil when neither lookup matches.
    def self.for_with_fallback(path:, fallback: "auto")
      # test by path
      format = ::FlatKit::Format.for(path)
      return format unless format.nil?

      # now try the fallback
      ::FlatKit::Format.for(fallback)
    end

    # Same as for_with_fallback, except that failing to find a format is an
    # error.
    #
    # Raises ::FlatKit::Error::UnknownFormat when no format is found.
    def self.for_with_fallback!(path:, fallback: "auto")
      format = for_with_fallback(path: path, fallback: fallback)
      raise ::FlatKit::Error::UnknownFormat, "Unable to figure out format for '#{path}' with fallback '#{fallback}'" if format.nil?

      format
    end
  end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module FlatKit
  # Base class of the input sources.
  #
  # Subclasses are recorded through DescendantTracker, and Input.from picks
  # the first recorded subclass whose `handles?` class method returns truthy
  # for the object given.
  class Input
    extend DescendantTracker

    # Converts the given object into an Input.
    #
    # An object that already is an Input is returned untouched; anything else
    # is wrapped by the first child class that handles it.
    #
    # Raises FlatKit::Error when no child class handles the object.
    def self.from(input)
      return input if input.kind_of?(::FlatKit::Input)

      handler = find_child(:handles?, input)
      raise FlatKit::Error, "Unable to create input from #{input.class} : #{input.inspect}" unless handler

      handler.new(input)
    end

    # The name of the input; to be provided by the subclass.
    def name
      raise NotImplementedError, "#{self.class} must implement #name"
    end

    # The underlying io object; to be provided by the subclass.
    def io
      raise NotImplementedError, "#{self.class} must implement #io"
    end

    # Closes the input; to be provided by the subclass.
    def close
      raise NotImplementedError, "#{self.class} must implement #close"
    end
  end
end
|
30
|
+
|
31
|
+
require 'flat_kit/input/io'
|
32
|
+
require 'flat_kit/input/file'
|
@@ -0,0 +1,53 @@
|
|
1
|
+
require 'zlib'
|
2
|
+
|
3
|
+
module FlatKit
  class Input
    # An Input that reads from a path on the filesystem, transparently
    # decompressing it when the file name ends in ".gz".
    class File < Input
      # path  - the Pathname of the file being read
      # count - set to 0 on creation; not updated anywhere in this class
      # io    - the opened stream
      attr_reader :path, :count, :io

      # Does this class know how to read from the given object?
      #
      # True for a Pathname, and for a String that is not one of the stdin
      # aliases; false for everything else.
      def self.handles?(obj)
        return true if obj.instance_of?(Pathname)

        # the stdin check lives here too, in case these get loaded in
        # different orders
        obj.instance_of?(String) && !::FlatKit::Input::IO.is_stdin?(obj)
      end

      # Opens the file at the given path for reading.
      #
      # Raises FlatKit::Error when the path is not readable.
      def initialize(obj)
        @count = 0
        @path = Pathname.new(obj)
        raise FlatKit::Error, "Input #{obj} is not readable" unless @path.readable?

        @io = open_input(path)
      end

      # The path of the file as a String.
      def name
        path.to_s
      end

      # Closes the opened stream.
      def close
        @io.close
      end

      private

      # Open the appropriate input type depending on the source file name:
      # a gzip reader for ".gz" files, a binary mode read for anything else.
      #
      # TODO: add in bzip
      def open_input(path)
        if path.extname == ".gz"
          Zlib::GzipReader.open(path.to_s)
        else
          path.open("rb")
        end
      end
    end
  end
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
module FlatKit
  class Input
    # An Input wrapping an io object that already exists: stdin (given either
    # as the ::STDIN object or as one of its string aliases), a File, a
    # StringIO or any other IO.
    class IO < Input
      # The strings that are accepted as meaning "read from stdin".
      STDINS = %w[ stdin STDIN - <stdin> ]

      # The name given to this input, see #initialize for how it is derived.
      attr_reader :name

      # The wrapped io object.
      attr_reader :io

      # Does this class know how to read from the given object?
      #
      # True for anything that means stdin and for File, StringIO and IO
      # instances; false for everything else.
      def self.handles?(obj)
        return true if is_stdin?(obj)

        [ ::File, ::StringIO, ::IO ].any? { |klass| obj.kind_of?(klass) }
      end

      # Is the given object stdin? Either one of the STDINS strings, or an IO
      # that is equal to ::STDIN.
      def self.is_stdin?(obj)
        case obj
        when String then STDINS.include?(obj)
        when ::IO   then obj == ::STDIN
        else false
        end
      end

      # Wraps the given object.
      #
      # The name is "<STDIN>" for stdin, the path for a File, and the result
      # of #inspect for a StringIO or any other IO.
      #
      # Raises ::FlatKit::Error for any other kind of object.
      def initialize(obj)
        if self.class.is_stdin?(obj)
          @name = "<STDIN>"
          @io = $stdin
          return
        end

        case obj
        when ::File
          @name = obj.path
        when ::StringIO, ::IO
          @name = obj.inspect
        else
          raise ::FlatKit::Error, "Unable to create #{self.class} from #{obj.class} : #{obj.inspect}"
        end
        @io = obj
      end

      # this goes to an io stream and we are not in charge of opening it
      def close
        @io.close
      end
    end
  end
end
|
@@ -0,0 +1,84 @@
|
|
1
|
+
module FlatKit
  # Private: This is a class used internally by MergeTree and should not be used
  # outside of that context.
  #
  # An InternalNode is a single match of the tournament tree algorithm. It
  # holds references to the two nodes that compete in it and to whichever of
  # the two is currently the winner.
  #
  # A reference to the leaf node that is associated with the winner is also kept
  # here.
  #
  class InternalNode
    include Comparable

    attr_accessor :left       # Internal Node
    attr_accessor :right      # Internal Node
    attr_accessor :winner     # Internal Node
    attr_accessor :next_level # Who to tell
    attr_accessor :leaf       # winning leaf node

    # Creates the node from its two competitors, registers itself as the next
    # level of both of them, and plays the first match.
    def initialize(left:, right:)
      @left = left
      @left.next_level = self

      @right = right
      @right.next_level = self

      @next_level = nil

      play
    end

    # The value of this node is the value of its current winner.
    def value
      winner.value
    end

    # An InternalNode is never a sentinel.
    def sentinel?
      false
    end

    # An InternalNode is never a leaf.
    def leaf?
      false
    end

    # We are being told that the passed in node no longer has data in it and is
    # to be removed from the tree.
    #
    # Our reference to that node is swapped for a sentinel node so that
    # comparisons continue to work.
    #
    # Once both of our competitors are sentinels there is nothing left on this
    # side of the tree, so the level above, when there is one, is told that we
    # are finished too.
    #
    # Raises FlatKit::Error when the node is neither of our competitors.
    def player_finished(node)
      # identity, not equality, decides which side is being removed
      if left.object_id == node.object_id
        @left = SentinelInternalNode.new
        @left.next_level = self
      elsif right.object_id == node.object_id
        @right = SentinelInternalNode.new
        @right.next_level = self
      else
        raise FlatKit::Error, "Unknown player #{node}"
      end

      return unless @right.sentinel? && @left.sentinel?

      next_level.player_finished(self) if next_level
    end

    # Plays the match between left and right. The left node wins ties. The
    # winning leaf is only updated when the winner is not a sentinel, and the
    # result is then propagated to the level above, when there is one.
    def play
      @winner = if left <= right
                  left
                else
                  right
                end
      @leaf = winner.leaf unless @winner.sentinel?
      next_level.play if next_level
    end

    # Nodes are ordered by their values, and any node sorts before a sentinel.
    def <=>(other)
      other.sentinel? ? -1 : value.<=>(other.value)
    end
  end
end
|