flat_kit 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CONTRIBUTING.md +46 -0
- data/HISTORY.md +5 -0
- data/LICENSE.txt +21 -0
- data/Manifest.txt +66 -0
- data/README.md +80 -0
- data/Rakefile +20 -0
- data/bin/fk +4 -0
- data/lib/flat_kit.rb +23 -0
- data/lib/flat_kit/cli.rb +80 -0
- data/lib/flat_kit/command.rb +53 -0
- data/lib/flat_kit/command/cat.rb +93 -0
- data/lib/flat_kit/command/merge.rb +88 -0
- data/lib/flat_kit/command/sort.rb +88 -0
- data/lib/flat_kit/descendant_tracker.rb +27 -0
- data/lib/flat_kit/error.rb +5 -0
- data/lib/flat_kit/format.rb +34 -0
- data/lib/flat_kit/input.rb +32 -0
- data/lib/flat_kit/input/file.rb +53 -0
- data/lib/flat_kit/input/io.rb +54 -0
- data/lib/flat_kit/internal_node.rb +84 -0
- data/lib/flat_kit/jsonl.rb +8 -0
- data/lib/flat_kit/jsonl/format.rb +25 -0
- data/lib/flat_kit/jsonl/reader.rb +30 -0
- data/lib/flat_kit/jsonl/record.rb +84 -0
- data/lib/flat_kit/jsonl/writer.rb +45 -0
- data/lib/flat_kit/leaf_node.rb +71 -0
- data/lib/flat_kit/logger.rb +39 -0
- data/lib/flat_kit/merge.rb +35 -0
- data/lib/flat_kit/merge_tree.rb +104 -0
- data/lib/flat_kit/output.rb +32 -0
- data/lib/flat_kit/output/file.rb +55 -0
- data/lib/flat_kit/output/io.rb +73 -0
- data/lib/flat_kit/reader.rb +61 -0
- data/lib/flat_kit/record.rb +83 -0
- data/lib/flat_kit/sentinel_internal_node.rb +37 -0
- data/lib/flat_kit/sentinel_leaf_node.rb +37 -0
- data/lib/flat_kit/sort.rb +35 -0
- data/lib/flat_kit/writer.rb +38 -0
- data/lib/flat_kit/xsv.rb +8 -0
- data/lib/flat_kit/xsv/format.rb +25 -0
- data/lib/flat_kit/xsv/reader.rb +45 -0
- data/lib/flat_kit/xsv/record.rb +90 -0
- data/lib/flat_kit/xsv/writer.rb +70 -0
- data/tasks/default.rake +242 -0
- data/tasks/extension.rake +38 -0
- data/tasks/man.rake +7 -0
- data/tasks/this.rb +208 -0
- data/test/device_dataset.rb +117 -0
- data/test/input/test_file.rb +73 -0
- data/test/input/test_io.rb +93 -0
- data/test/jsonl/test_format.rb +22 -0
- data/test/jsonl/test_reader.rb +49 -0
- data/test/jsonl/test_record.rb +61 -0
- data/test/jsonl/test_writer.rb +68 -0
- data/test/output/test_file.rb +60 -0
- data/test/output/test_io.rb +104 -0
- data/test/test_conversions.rb +45 -0
- data/test/test_format.rb +24 -0
- data/test/test_helper.rb +26 -0
- data/test/test_merge.rb +40 -0
- data/test/test_merge_tree.rb +64 -0
- data/test/test_version.rb +11 -0
- data/test/xsv/test_format.rb +22 -0
- data/test/xsv/test_reader.rb +61 -0
- data/test/xsv/test_record.rb +69 -0
- data/test/xsv/test_writer.rb +68 -0
- metadata +237 -0
@@ -0,0 +1,88 @@
|
|
1
|
+
require 'csv'
|
2
|
+
module FlatKit
  class Command
    # The `merge` sub-command of the command line tool.
    #
    # It parses the command line, validates that a usable --key and at least
    # two input paths were given, and builds a ::FlatKit::Merge that performs
    # the actual work when #call is invoked.
    #
    # NOTE(review): #parse relies on `argv` and `opts` readers that are
    # presumably provided by ::FlatKit::Command -- confirm against that class.
    class Merge < ::FlatKit::Command

      # Returns the sub-command name.
      def self.name
        "merge"
      end

      # Returns the one line description shown at the top of the help text.
      def self.description
        "Merge sorted files together that have the same structure."
      end

      # Builds the Optimist parser describing the banner text and the options
      # accepted by this command. A new parser is built on every call.
      def self.parser
        ::Optimist::Parser.new do
          banner Merge.description
          banner ""

          banner <<~BANNER
            Given a set of input files that have the same structure, and are already
            sorted by a set of keys. The Merge command will merge all those files
            into a single output file.

            The --key parameter is required, and it must be a comma separated list
            of field names on the input on which to use as the sort key for the merge
            process.

            There must also be at least 2 input files. Merging only 1 file into an
            output file is the same as the 'cat' command.

            The flatfile type(s) will be automatically determined by the file name.
            If the output is not a file, but to stdout then the output type will
            be the same as the first input file, or it can be specified as a commandline
            switch.

            The merge will do a single pass through the input to generate the
            output.
          BANNER

          banner <<~USAGE

            Usage:
              fk merge --key surname,given_name file1.csv file2.csv > all.csv
              fk merge --key surname,given_name --output-format json file1.csv file2.csv > all.json
              fk merge --key field1,field2 --output-format json input*.csv | gzip -c > all.json.gz
              fk merge --key field12 file*.json.gz -o all.json.gz

          USAGE

          banner <<~OPTIONS

            Options:

          OPTIONS

          opt :output, "Send the output to the given path instead of standard out.", default: "<stdout>"
          opt :input_format, "Input format, csv or json", default: "auto", short: :none
          opt :output_format, "Output format, csv or json", default: "auto", short: :none
          opt :key, "The comma separated list of field(s) to use for sorting the input", required: true, type: :string
        end
      end

      # The list of field names given to --key, available after #parse.
      attr_reader :compare_keys

      # Parses the command line and prepares the merge.
      #
      # Every problem found here (unusable --key, fewer than 2 inputs, or a
      # ::FlatKit::Error raised while building the merge) is raised as an
      # ::Optimist::CommandlineError inside Optimist's standard exception
      # handling block.
      def parse
        parser = self.class.parser
        ::Optimist::with_standard_exception_handling(parser) do
          begin
            @opts = parser.parse(argv)
            @compare_keys = parse_compare_keys(opts[:key])
            paths = parser.leftovers
            raise ::Optimist::CommandlineError, "At least 2 input files are required" if paths.size < 2

            @merge = ::FlatKit::Merge.new(inputs: paths, input_fallback: opts[:input_format],
                                          compare_fields: @compare_keys,
                                          output: opts[:output], output_fallback: opts[:output_format])
          rescue ::FlatKit::Error => e
            raise ::Optimist::CommandlineError, e.message
          end
        end
      end

      # Runs the merge that was prepared by #parse.
      def call
        @merge.call
      end

      private

      # Splits the --key value into field names using CSV rules, so quoted
      # names containing commas are supported.
      #
      # key_list - the raw String given to --key
      #
      # Returns an Array of non-empty field name Strings.
      # Raises ::Optimist::CommandlineError when the value is empty, contains
      # an empty field name, or is not parseable as a CSV line.
      def parse_compare_keys(key_list)
        # CSV.parse_line returns nil for an empty string and nil entries for
        # empty fields ("a,,b"); neither is a usable sort key.
        keys = ::CSV.parse_line(key_list.to_s)
        if keys.nil? || keys.empty? || keys.any? { |key| key.nil? || key.empty? }
          raise ::Optimist::CommandlineError, "--key requires a comma separated list of non-empty field names"
        end
        keys
      rescue ::CSV::MalformedCSVError => e
        raise ::Optimist::CommandlineError, "Unable to parse --key '#{key_list}': #{e.message}"
      end
    end
  end
end
|
@@ -0,0 +1,88 @@
|
|
1
|
+
require 'csv'
|
2
|
+
module FlatKit
  class Command
    # The `sort` sub-command of the command line tool.
    #
    # It parses the command line, validates the --key and that at most one
    # input path was given (falling back to stdin when none is), and builds a
    # ::FlatKit::Sort that performs the actual work when #call is invoked.
    #
    # NOTE(review): #parse relies on `argv` and `opts` readers that are
    # presumably provided by ::FlatKit::Command -- confirm against that class.
    class Sort < ::FlatKit::Command

      # Returns the sub-command name.
      def self.name
        "sort"
      end

      # Returns the one line description shown at the top of the help text.
      def self.description
        "Sort a given file by a set of fields."
      end

      # Builds the Optimist parser describing the banner text and the options
      # accepted by this command. A new parser is built on every call.
      def self.parser
        ::Optimist::Parser.new do
          banner Sort.description
          banner ""

          banner <<~BANNER
            Given an input file and a sort key, order the records in that file by that
            key. If no input file is given then stdin is assumed. If no output file
            is given then stdout is assumed.

            The --key parameter is required, and it must be a comma separated list
            of field names on the input to use as the sort key.

            There may be at most 1 input file.

            The flatfile type(s) will be automatically determined by the file name.
            If the output is not a file, but to stdout then the output type will
            be the same as the first input file, or it can be specified as a commandline
            switch.

          BANNER

          banner <<~USAGE

            Usage:
              fk sort --key surname,given_name file.csv > sorted.csv
              fk sort --key surname,given_name --output-format json file.csv > sorted.json
              fk sort --key field1,field2 --output-format json input.csv | gzip -c > sorted.json.gz
              fk sort --key field1 file.json.gz -o sorted.json.gz
              gunzip -c file.json.gz | fk sort --key field1 --input-format json --output-format json | gzip -c > sorted.json.gz

          USAGE

          banner <<~OPTIONS

            Options:

          OPTIONS

          opt :output, "Send the output to the given path instead of standard out.", default: "<stdout>"
          opt :input_format, "Input format, csv or json", default: "auto", short: :none
          opt :output_format, "Output format, csv or json", default: "auto", short: :none
          opt :key, "The comma separated list of field(s) to use for sorting the input", required: true, type: :string
        end
      end

      # The list of field names given to --key, available after #parse.
      attr_reader :compare_keys
      # Declared for callers; never assigned anywhere in this class.
      attr_reader :reader
      # The ::FlatKit::Sort built by #parse.
      attr_reader :sort

      # Parses the command line and prepares the sort.
      #
      # Every problem found here (unusable --key, more than 1 input, or a
      # ::FlatKit::Error raised while building the sort) is raised as an
      # ::Optimist::CommandlineError inside Optimist's standard exception
      # handling block.
      def parse
        parser = self.class.parser
        ::Optimist::with_standard_exception_handling(parser) do
          begin
            @opts = parser.parse(argv)
            @compare_keys = parse_compare_keys(opts[:key])
            paths = parser.leftovers
            raise ::Optimist::CommandlineError, "1 and only 1 input file is allowed" if paths.size > 1
            path = paths.first || "-" # default to stdin
            @sort = ::FlatKit::Sort.new(input: path, input_fallback: opts[:input_format],
                                        output: opts[:output], output_fallback: opts[:output_format],
                                        compare_fields: @compare_keys)
          rescue ::FlatKit::Error => e
            raise ::Optimist::CommandlineError, e.message
          end
        end
      end

      # Runs the sort that was prepared by #parse.
      def call
        sort.call
      end

      private

      # Splits the --key value into field names using CSV rules, so quoted
      # names containing commas are supported.
      #
      # key_list - the raw String given to --key
      #
      # Returns an Array of non-empty field name Strings.
      # Raises ::Optimist::CommandlineError when the value is empty, contains
      # an empty field name, or is not parseable as a CSV line.
      def parse_compare_keys(key_list)
        # CSV.parse_line returns nil for an empty string and nil entries for
        # empty fields ("a,,b"); neither is a usable sort key.
        keys = ::CSV.parse_line(key_list.to_s)
        if keys.nil? || keys.empty? || keys.any? { |key| key.nil? || key.empty? }
          raise ::Optimist::CommandlineError, "--key requires a comma separated list of non-empty field names"
        end
        keys
      rescue ::CSV::MalformedCSVError => e
        raise ::Optimist::CommandlineError, "Unable to parse --key '#{key_list}': #{e.message}"
      end
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'set'
|
2
|
+
|
3
|
+
module FlatKit
  # Mixin meant to be added to a class with `extend`. It hooks into Ruby's
  # `inherited` callback so the class remembers the classes that subclass it.
  #
  # The set is stored in an instance variable of the class receiving the
  # callback, so each class keeps its own list of the subclasses reported to
  # it.
  module DescendantTracker
    # Ruby callback, invoked with the newly created subclass.
    #
    # Only objects that are exactly a Class are recorded.
    def inherited(klass)
      super
      children << klass if klass.instance_of?(Class)
    end

    # Returns the Set of recorded subclasses, creating it on first use.
    def children
      return @_children if defined?(@_children)

      @_children = Set.new
    end

    #
    # Find the first child that returns truthy from the given method with args
    #
    # method - the name of the method to invoke on each child class
    # args   - the arguments handed to that method
    #
    # Returns the matching child class, or nil when no child matches.
    def find_child(method, *args)
      children.find { |candidate| candidate.send(method, *args) }
    end
  end
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
module FlatKit
  # Base class for the supported flat file formats.
  #
  # Subclasses are recorded through DescendantTracker, and looked up by asking
  # each one whether it `handles?` a given path or format name.
  class Format
    extend DescendantTracker

    # The name of the format; must be overridden by every subclass.
    #
    # Raises NotImplementedError naming the class that lacks the override.
    def self.format_name
      # `self` is already the class here; `self.class` would always be `Class`.
      raise NotImplementedError, "#{self} must implement #{self}.format_name"
    end

    # Instance level convenience for the class level format name.
    def format_name
      self.class.format_name
    end

    # Finds the format class that handles the given path.
    #
    # path - anything responding to #to_s, usually a file name
    #
    # Returns the first child class whose `handles?` is truthy, or nil.
    def self.for(path)
      find_child(:handles?, path.to_s)
    end

    # Finds the format for the path, and if the path is not recognized tries
    # the fallback value instead.
    #
    # Returns the format class, or nil when neither value is recognized.
    def self.for_with_fallback(path:, fallback: "auto")
      ::FlatKit::Format.for(path) || ::FlatKit::Format.for(fallback)
    end

    # Same as for_with_fallback, but failure to find a format is an error.
    #
    # Raises ::FlatKit::Error::UnknownFormat when no format is found.
    def self.for_with_fallback!(path:, fallback: "auto")
      format = for_with_fallback(path: path, fallback: fallback)
      raise ::FlatKit::Error::UnknownFormat, "Unable to figure out format for '#{path}' with fallback '#{fallback}'" if format.nil?

      format
    end
  end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module FlatKit
  # Abstract base class for the places records can be read from.
  #
  # Subclasses are recorded through DescendantTracker; each one is expected to
  # provide a class level `handles?` along with #name, #io and #close.
  class Input
    extend DescendantTracker

    # Converts the given object into an Input.
    #
    # input - an existing Input (returned untouched), or any object that one
    #         of the recorded child classes reports it `handles?`
    #
    # Returns an Input instance.
    # Raises FlatKit::Error when no child class handles the object.
    def self.from(input)
      return input if input.is_a?(::FlatKit::Input)

      handler = find_child(:handles?, input)
      return handler.new(input) if handler

      raise FlatKit::Error, "Unable to create input from #{input.class} : #{input.inspect}"
    end

    # A printable name for this input; subclasses must override.
    def name
      raise NotImplementedError, "#{self.class} must implement #name"
    end

    # The underlying stream to read from; subclasses must override.
    def io
      raise NotImplementedError, "#{self.class} must implement #io"
    end

    # Releases the underlying stream; subclasses must override.
    def close
      raise NotImplementedError, "#{self.class} must implement #close"
    end
  end
end
|
30
|
+
|
31
|
+
require 'flat_kit/input/io'
|
32
|
+
require 'flat_kit/input/file'
|
@@ -0,0 +1,53 @@
|
|
1
|
+
require 'zlib'
|
2
|
+
|
3
|
+
module FlatKit
  class Input
    # An Input that reads from a path on the filesystem, transparently
    # decompressing files whose name ends in ".gz".
    #
    # NOTE(review): Pathname is used here without a visible require;
    # presumably it is loaded elsewhere in the library -- confirm.
    class File < Input
      # The Pathname of the file being read.
      attr_reader :path
      # Set to 0 at construction and not changed anywhere in this class.
      attr_reader :count
      # The opened stream for the file.
      attr_reader :io

      # Reports whether this class can build an input from the object.
      #
      # A Pathname is always accepted. A String is accepted unless it is one
      # of the names that mean stdin. Everything else is rejected.
      def self.handles?(obj)
        return true if obj.instance_of?(Pathname)

        # the stdin names are checked here too, in case the input classes get
        # loaded in different orders
        obj.instance_of?(String) && !::FlatKit::Input::IO.is_stdin?(obj)
      end

      # Opens the file at the given location.
      #
      # obj - a String or Pathname naming the file
      #
      # Raises FlatKit::Error if the path is not readable.
      def initialize(obj)
        @count = 0
        @path = Pathname.new(obj)
        raise FlatKit::Error, "Input #{obj} is not readable" unless @path.readable?
        @io = open_input(path)
      end

      # The path of the file, as a String.
      def name
        path.to_s
      end

      # Closes the opened stream.
      def close
        @io.close
      end

      private

      # Open the appropriate input type depending on the source file name.
      #
      # TODO: add in bzip
      def open_input(path)
        return Zlib::GzipReader.open(path.to_s) if path.extname == ".gz"

        path.open("rb")
      end
    end
  end
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
require 'stringio'

module FlatKit
  class Input
    # An Input wrapping an already opened stream: stdin (given either as the
    # STDIN object or as one of the STDINS names), a ::File, a ::StringIO, or
    # any other ::IO.
    class IO < Input
      # The strings that are understood to mean "read from standard input".
      STDINS = %w[ stdin STDIN - <stdin> ].freeze

      # Reports whether this class can build an input from the object.
      #
      # Returns true for anything is_stdin? accepts and for any ::File,
      # ::StringIO or ::IO; false otherwise.
      def self.handles?(obj)
        return true if is_stdin?(obj)

        [ ::File, ::StringIO, ::IO ].any? { |klass| obj.kind_of?(klass) }
      end

      # Reports whether the object designates standard input: a String listed
      # in STDINS, or an ::IO equal to ::STDIN.
      def self.is_stdin?(obj)
        case obj
        when String then STDINS.include?(obj)
        when ::IO then obj == ::STDIN
        else false
        end
      end

      # Wraps the given stream.
      #
      # obj - a stdin designator (see is_stdin?), a ::File, a ::StringIO or
      #       an ::IO
      #
      # Raises ::FlatKit::Error for any other kind of object.
      def initialize(obj)
        if self.class.is_stdin?(obj)
          @name = "<STDIN>"
          @io = $stdin
        elsif obj.kind_of?(::File)
          # checked before ::IO so that files are named by their path
          @name = obj.path
          @io = obj
        elsif obj.kind_of?(::StringIO) || obj.kind_of?(::IO)
          @name = obj.inspect
          @io = obj
        else
          raise ::FlatKit::Error, "Unable to create #{self.class} from #{obj.class} : #{obj.inspect}"
        end
      end

      # The printable name chosen in #initialize.
      def name
        @name
      end

      # Closes the wrapped stream. The stream was opened by someone else, but
      # it is still closed here -- including $stdin when that is the stream.
      def close
        @io.close
      end

      # The wrapped stream.
      def io
        @io
      end
    end
  end
end
|
@@ -0,0 +1,84 @@
|
|
1
|
+
module FlatKit
  # Private: This is a class used internally by MergeTree and should not be used
  # outside of that context.
  #
  # An InternalNode is one match in the tournament tree algorithm. It holds
  # the two nodes that compete in it (left and right), remembers which of the
  # two won, and reports changes upward to the node at the next level.
  #
  # A reference to the leaf node associated with the winner is also kept
  # here.
  #
  class InternalNode

    include Comparable

    attr_accessor :left       # Internal Node
    attr_accessor :right      # Internal Node
    attr_accessor :winner     # Internal Node
    attr_accessor :next_level # Who to tell
    attr_accessor :leaf       # winning leaf node

    # Wires both competitors to report to this node, then plays the first
    # match between them.
    def initialize(left:, right:)
      @left = left
      @left.next_level = self

      @right = right
      @right.next_level = self
      @next_level = nil

      play
    end

    # The value held by the current winner.
    def value
      winner.value
    end

    # An InternalNode is never a sentinel.
    def sentinel?
      false
    end

    # An InternalNode is never a leaf.
    def leaf?
      false
    end

    # We are being told that the passed in node no longer has data in it and is
    # to be removed from the tree.
    #
    # The reference to that node is swapped for a sentinel node so that
    # comparisons keep working.
    #
    # Once the swap is done, if both competitors are sentinels this node has
    # nothing left either, so the next level (when there is one) is told to
    # remove this node in turn.
    #
    # Raises FlatKit::Error if the node is neither the left nor the right
    # competitor.
    def player_finished(node)
      if left.equal?(node)
        @left = SentinelInternalNode.new
        @left.next_level = self
      elsif right.equal?(node)
        @right = SentinelInternalNode.new
        @right.next_level = self
      else
        raise FlatKit::Error, "Unknown player #{node}"
      end

      return unless @right.sentinel? && @left.sentinel?

      next_level.player_finished(self) if next_level
    end

    # Plays the match: the left competitor wins ties. The winning leaf is only
    # updated when the winner is not a sentinel, so the previous leaf is kept
    # otherwise. The result is then replayed at the next level, if any.
    def play
      @winner = (left <= right) ? left : right
      @leaf = winner.leaf unless @winner.sentinel?
      next_level.play if next_level
    end

    # Orders this node before any sentinel, otherwise by value.
    def <=>(other)
      other.sentinel? ? -1 : (value <=> other.value)
    end
  end
end
|