flat_kit 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CONTRIBUTING.md +46 -0
- data/HISTORY.md +5 -0
- data/LICENSE.txt +21 -0
- data/Manifest.txt +66 -0
- data/README.md +80 -0
- data/Rakefile +20 -0
- data/bin/fk +4 -0
- data/lib/flat_kit.rb +23 -0
- data/lib/flat_kit/cli.rb +80 -0
- data/lib/flat_kit/command.rb +53 -0
- data/lib/flat_kit/command/cat.rb +93 -0
- data/lib/flat_kit/command/merge.rb +88 -0
- data/lib/flat_kit/command/sort.rb +88 -0
- data/lib/flat_kit/descendant_tracker.rb +27 -0
- data/lib/flat_kit/error.rb +5 -0
- data/lib/flat_kit/format.rb +34 -0
- data/lib/flat_kit/input.rb +32 -0
- data/lib/flat_kit/input/file.rb +53 -0
- data/lib/flat_kit/input/io.rb +54 -0
- data/lib/flat_kit/internal_node.rb +84 -0
- data/lib/flat_kit/jsonl.rb +8 -0
- data/lib/flat_kit/jsonl/format.rb +25 -0
- data/lib/flat_kit/jsonl/reader.rb +30 -0
- data/lib/flat_kit/jsonl/record.rb +84 -0
- data/lib/flat_kit/jsonl/writer.rb +45 -0
- data/lib/flat_kit/leaf_node.rb +71 -0
- data/lib/flat_kit/logger.rb +39 -0
- data/lib/flat_kit/merge.rb +35 -0
- data/lib/flat_kit/merge_tree.rb +104 -0
- data/lib/flat_kit/output.rb +32 -0
- data/lib/flat_kit/output/file.rb +55 -0
- data/lib/flat_kit/output/io.rb +73 -0
- data/lib/flat_kit/reader.rb +61 -0
- data/lib/flat_kit/record.rb +83 -0
- data/lib/flat_kit/sentinel_internal_node.rb +37 -0
- data/lib/flat_kit/sentinel_leaf_node.rb +37 -0
- data/lib/flat_kit/sort.rb +35 -0
- data/lib/flat_kit/writer.rb +38 -0
- data/lib/flat_kit/xsv.rb +8 -0
- data/lib/flat_kit/xsv/format.rb +25 -0
- data/lib/flat_kit/xsv/reader.rb +45 -0
- data/lib/flat_kit/xsv/record.rb +90 -0
- data/lib/flat_kit/xsv/writer.rb +70 -0
- data/tasks/default.rake +242 -0
- data/tasks/extension.rake +38 -0
- data/tasks/man.rake +7 -0
- data/tasks/this.rb +208 -0
- data/test/device_dataset.rb +117 -0
- data/test/input/test_file.rb +73 -0
- data/test/input/test_io.rb +93 -0
- data/test/jsonl/test_format.rb +22 -0
- data/test/jsonl/test_reader.rb +49 -0
- data/test/jsonl/test_record.rb +61 -0
- data/test/jsonl/test_writer.rb +68 -0
- data/test/output/test_file.rb +60 -0
- data/test/output/test_io.rb +104 -0
- data/test/test_conversions.rb +45 -0
- data/test/test_format.rb +24 -0
- data/test/test_helper.rb +26 -0
- data/test/test_merge.rb +40 -0
- data/test/test_merge_tree.rb +64 -0
- data/test/test_version.rb +11 -0
- data/test/xsv/test_format.rb +22 -0
- data/test/xsv/test_reader.rb +61 -0
- data/test/xsv/test_record.rb +69 -0
- data/test/xsv/test_writer.rb +68 -0
- metadata +237 -0
@@ -0,0 +1,25 @@
|
|
1
|
+
module FlatKit
  module Jsonl
    # Format descriptor for newline-delimited JSON: it names the format,
    # decides which filenames it claims, and points at the Reader and Writer
    # classes that handle it.
    class Format < ::FlatKit::Format
      # Returns the String name this format is known by.
      def self.format_name
        "jsonl"
      end

      # Decide whether this format claims the given name.
      #
      # filename - a String; it is split on "." and every resulting piece is
      #            checked, not only the last one, so "data.jsonl.gz" and a
      #            bare "json" both match.
      #
      # Returns true when any piece is exactly "json", "jsonl" or "ndjson",
      # false otherwise.
      def self.handles?(filename)
        pieces = filename.split(".")
        %w[ json jsonl ndjson ].any? { |known| pieces.include?(known) }
      end

      # Returns the class used to read this format.
      def self.reader
        ::FlatKit::Jsonl::Reader
      end

      # Returns the class used to write this format.
      def self.writer
        ::FlatKit::Jsonl::Writer
      end
    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module FlatKit
  module Jsonl
    # Reads newline-delimited JSON from a source and yields one
    # ::FlatKit::Jsonl::Record per line.
    class Reader < ::FlatKit::Reader
      # The object returned by ::FlatKit::Input.from for the source.
      attr_reader :input
      # Number of records built from the input so far.
      attr_reader :count

      # Returns the String format name, shared with ::FlatKit::Jsonl::Format.
      def self.format_name
        ::FlatKit::Jsonl::Format.format_name
      end

      # source         - what to read from; handed to ::FlatKit::Input.from.
      # compare_fields - the fields records are compared on (default :none).
      #
      # Both keywords are forwarded unchanged to the parent initializer by the
      # bare `super`.
      def initialize(source:, compare_fields: :none)
        super
        @input = ::FlatKit::Input.from(source)
        @count = 0
      end

      # Yield every line of the input wrapped in a Jsonl::Record.
      #
      # The input is closed when iteration ends, whether it finished normally,
      # the consumer broke out early, or an error was raised; previously a
      # failure left the input open.
      #
      # Raises ::FlatKit::Error for any StandardError raised while reading,
      # building a record, or inside the consumer's block (the rescue covers
      # the whole loop); the original error is logged first.
      def each
        while line = input.io.gets do
          record = ::FlatKit::Jsonl::Record.new(data: line, compare_fields: compare_fields)
          @count += 1
          yield record
        end
      rescue => e
        ::FlatKit.logger.error "Error reading jsonl records from #{input.name}: #{e}"
        raise ::FlatKit::Error, e
      ensure
        input.close
      end
    end
  end
end
|
@@ -0,0 +1,84 @@
|
|
1
|
+
require 'oj'
|
2
|
+
require 'flat_kit/record'
|
3
|
+
|
4
|
+
module FlatKit
  module Jsonl
    # One JSON document. It can be built from the raw JSON text, from an
    # already parsed structure, or from both; whichever half is missing is
    # produced on demand.
    class Record < ::FlatKit::Record
      # Hash of field => value used by #[]; either the values extracted for
      # the compare fields or the complete structured data.
      attr_reader :compare_data

      # Returns the String format name, shared with ::FlatKit::Jsonl::Format.
      def self.format_name
        ::FlatKit::Jsonl::Format.format_name
      end

      # Build a Jsonl::Record from another record.
      #
      # record - exactly a Jsonl::Record (its raw data, compare data and any
      #          already parsed structure are reused), or any other record
      #          responding to #compare_fields and #to_hash.
      #
      # Returns a new Jsonl::Record.
      def self.from_record(record)
        if record.instance_of?(FlatKit::Jsonl::Record) then
          # only hand over the parsed structure if it has already been
          # loaded, asking for it unconditionally would force a full parse
          structured = record.complete_structured_data? ? record.complete_structured_data : nil

          new(data: record.data, compare_fields: record.compare_fields,
              compare_data: record.compare_data,
              complete_structured_data: structured)
        else
          new(data: nil, compare_fields: record.compare_fields,
              complete_structured_data: record.to_hash)
        end
      end

      # data                     - the raw JSON String, or nil when only
      #                            structured data is available.
      # compare_fields           - the fields this record is compared on.
      # compare_data             - previously extracted compare values; nil is
      #                            treated the same as an empty Hash.
      # complete_structured_data - the fully parsed document, if available.
      def initialize(data:, compare_fields: :none,
                     compare_data: {},
                     complete_structured_data: nil)
        super(data: data, compare_fields: compare_fields)

        @complete_structured_data = complete_structured_data

        # a nil compare_data used to be tolerated by the first check only and
        # then raised NoMethodError on the second one
        supplied = compare_data || {}

        if complete_structured_data? && supplied.empty? then
          @compare_data = complete_structured_data
        else
          @compare_data = supplied
        end

        # only load compare data if it doesn't exist yet. The resolved
        # @compare_data is checked, not the argument, so a record that already
        # carries its structured data is not parsed a second time.
        quick_parse if data && @compare_data.empty?
      end

      # Returns the compare value stored for key, or nil.
      def [](key)
        compare_data[key]
      end

      # Returns the fully parsed document, loading it from the raw JSON with
      # Oj.load on first use.
      def complete_structured_data
        @complete_structured_data ||= Oj.load(data)
      end
      alias to_hash complete_structured_data

      # Returns true when a non-empty parsed document is already held; never
      # triggers a parse.
      def complete_structured_data?
        !(@complete_structured_data.nil? || @complete_structured_data.empty?)
      end

      # overriding parent accessor since we may be initialized without raw
      # bytes to parse; in that case the JSON text is dumped from the
      # structured data on first use
      def data
        if @data.nil? && complete_structured_data? then
          @data = Oj.dump(complete_structured_data)
        end
        @data
      end
      alias to_s data

      private

      # Reads just the compare fields out of the raw JSON through Oj::Doc,
      # rather than loading the complete document with Oj.load, and stores
      # them in @compare_data.
      def quick_parse
        Oj::Doc.open(data) do |doc|
          compare_fields.each do |field|
            val = doc.fetch("/#{field}")
            @compare_data[field] = val
          end
        end
      end
    end
  end
end
|
82
|
+
|
83
|
+
|
84
|
+
|
@@ -0,0 +1,45 @@
|
|
1
|
+
module FlatKit
  module Jsonl
    # Writes records as newline-delimited JSON to a destination.
    class Writer < ::FlatKit::Writer
      # The object returned by ::FlatKit::Output.from for the destination.
      attr_reader :output
      # Number of records written so far.
      attr_reader :count

      # Returns the String format name, shared with ::FlatKit::Jsonl::Format.
      def self.format_name
        ::FlatKit::Jsonl::Format.format_name
      end

      # destination - where to write; forwarded to the parent initializer by
      #               the bare `super`.
      #
      # NOTE(review): reads @destination afterwards, so this presumably relies
      # on the parent initializer storing it - the parent is not visible here.
      def initialize(destination:)
        super
        @output = ::FlatKit::Output.from(@destination)
        @count = 0
      end

      # Write one record, converting it to a Jsonl::Record first when it is
      # some other kind of FlatKit::Record.
      #
      # Raises FlatKit::Error when record is not a FlatKit::Record, or when
      # any other StandardError happens while writing (logged first).
      def write(record)
        case record
        when FlatKit::Jsonl::Record
          write_record(record)
        when FlatKit::Record
          converted_record = ::FlatKit::Jsonl::Record.from_record(record)
          write_record(converted_record)
        else
          raise FlatKit::Error, "Unable to write records of type #{record.class}"
        end
      rescue FlatKit::Error
        # already the error type callers expect, pass it through untouched
        raise
      rescue => e
        # this is the write path; the message used to say "reading ... from"
        ::FlatKit.logger.error "Error writing jsonl records to #{output.name}: #{e}"
        raise ::FlatKit::Error, e
      end

      # Close the underlying output.
      def close
        @output.close
      end

      # Write an already converted record and bump the count.
      def write_record(record)
        # enforces ending in newline if it doesn't already have one
        output.io.puts record.to_s
        @count += 1
      end
    end
  end
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
module FlatKit
  # Private: Adapts a single Reader to the node api the MergeTree works with.
  #
  # A LeafNode always holds the current head record of its reader. Once that
  # record has been consumed it pulls the following one and tells the next
  # level of the MergeTree to play again, since its value has changed.
  #
  # When the reader has nothing left, the next level is told about that too,
  # so it can take this leaf out of the tree.
  class LeafNode
    include Comparable

    attr_reader :reader, :value
    attr_accessor :next_level

    # reader - the object to pull records from, via its #to_enum.
    #
    # The first record is fetched right away; a StopIteration raised by an
    # empty reader is not rescued here and reaches the caller.
    def initialize(reader)
      @next_level = nil
      @reader = reader
      @enum = reader.to_enum
      @value = @enum.next
    end

    # The winner of a leaf is simply its current value.
    def winner
      value
    end

    # A LeafNode is never a sentinel.
    def sentinel?
      false
    end

    # A LeafNode is always a leaf.
    def leaf?
      true
    end

    # Returns this node itself.
    def leaf
      self
    end

    # Move on to the following record, report to the next level when the
    # reader has run dry, and in every case ask the next level to play again.
    def update_and_replay
      self.next
      announce_finished if finished?
      next_level.play
    end

    # Pull the following record from the reader.
    #
    # Returns the new value, or nil once the reader is used up.
    def next
      @value = @enum.next
    rescue StopIteration
      @value = nil
    end

    # Truthy once there is no current value left to hand out.
    def finished?
      @enum && @value.nil?
    end

    # Any sentinel sorts after this node; otherwise the current values decide.
    def <=>(other)
      other.sentinel? ? -1 : (value <=> other.value)
    end

    private

    # Log that the reader is used up and tell the next level about it.
    def announce_finished
      ::FlatKit.logger.debug "#{reader.source} has finished reading #{reader.count} records"
      next_level.player_finished(self)
    end
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
require 'logger'
|
2
|
+
|
3
|
+
module FlatKit
  # Formats every log line as "<UTC timestamp> <pid> <severity> : <message>".
  class LogFormatter < ::Logger::Formatter
    FORMAT = "%s %5d %05s : %s\n".freeze
    DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ".freeze

    def initialize
      super
      self.datetime_format = DATETIME_FORMAT
    end

    # Build one log line; progname is accepted for the Logger formatter
    # signature but not used.
    #
    # Returns the formatted String, newline terminated.
    def call(severity, time, progname, msg)
      # getutc returns a converted copy; Time#utc would switch the caller's
      # Time object to UTC in place
      FORMAT % [format_datetime(time.getutc), Process.pid, severity, msg2str(msg)]
    end
  end

  # Factory for stdlib ::Logger instances that use the LogFormatter.
  class Logger
    # io - an already open, writable object.
    #
    # Returns a ::Logger writing to io.
    def self.for_io(io)
      ::Logger.new(io, formatter: LogFormatter.new)
    end

    # path - anything whose #to_s is the path of the log file; the file is
    #        opened for appending.
    #
    # Returns a ::Logger writing to that file.
    def self.for_path(path)
      io = File.open(path.to_s, "a")
      # ::Logger only turns on sync for files it opens itself, so without
      # this the log lines sit in the File buffer until it is flushed
      io.sync = true
      for_io(io)
    end
  end

  # Point the FlatKit logger at a new destination.
  #
  # destination - an IO-like object (anything responding to both #write and
  #               #close, which is the same check ::Logger applies, so
  #               StringIO works too), or otherwise a path.
  #
  # Returns the new logger.
  def self.log_to(destination = $stderr)
    if destination.respond_to?(:write) && destination.respond_to?(:close) then
      @logger = ::FlatKit::Logger.for_io(destination)
    else
      @logger = ::FlatKit::Logger.for_path(destination)
    end
  end

  # Returns the current logger, creating one on $stderr on first use.
  def self.logger
    @logger ||= ::FlatKit::Logger.for_io($stderr)
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
module FlatKit
  # Merges several inputs into a single output, using a MergeTree over one
  # reader per input.
  class Merge
    attr_reader :readers
    attr_reader :writer
    attr_reader :compare_fields

    # inputs          - the paths to read from.
    # input_fallback  - fallback passed on when creating the readers
    #                   (default "auto").
    # output          - the path to write to.
    # output_fallback - fallback passed on when creating the writer
    #                   (default "auto").
    # compare_fields  - the fields the records are ordered by; must respond
    #                   to #join, it is logged that way in #call.
    #
    # The writer is created with the format name of the first reader.
    def initialize(inputs:, input_fallback: "auto",
                   output:, output_fallback: "auto",
                   compare_fields:)
      @compare_fields = compare_fields
      @readers = ::FlatKit::Reader.create_readers_from_paths(paths: inputs, compare_fields: @compare_fields,
                                                             fallback: input_fallback)
      @writer = ::FlatKit::Writer.create_writer_from_path(path: output, fallback: output_fallback,
                                                          reader_format: @readers.first.format_name)
    end

    # Run the merge: every record the MergeTree yields is written to the
    # writer, with progress logged before and after.
    #
    # The writer is closed even when the merge raises, so a failure does not
    # leave the output open; the error itself still propagates.
    def call
      ::FlatKit.logger.info "Merging the following files into #{writer.destination}"
      ::FlatKit.logger.info "Using this key for sorting: #{compare_fields.join(", ")}"
      readers.each do |r|
        ::FlatKit.logger.info " #{r.source}"
      end

      begin
        merge_tree = ::FlatKit::MergeTree.new(readers)
        merge_tree.each do |record|
          writer.write(record)
        end
        readers.each do |r|
          ::FlatKit.logger.info " #{r.source} produced #{r.count} records"
        end
      ensure
        writer.close
      end
      ::FlatKit.logger.info "Wrote #{writer.count} records to #{writer.destination}"
    end
  end
end
|
@@ -0,0 +1,104 @@
|
|
1
|
+
module FlatKit
  # Public: Merge a list of sorted records from Readers into a single output Writer
  #
  # The MergeTree implements a Tournament Tree algorithm to do a k-way merge
  # between Reader objects:
  #
  #   https://en.wikipedia.org/wiki/K-way_merge_algorithm
  #
  # Usage:
  #
  #   compare_fields = %w[ key timestamp ]
  #   format = ::FlatKit::Format.for('json')
  #
  #   readers = ARGV.map do |path|
  #     format.reader.new(source: path, compare_fields: compare_fields)
  #   end
  #
  #   write_path = "merged.jsonl"
  #   writer = format.writer.new(destination: write_path.to_s)
  #
  #   tree = ::FlatKit::MergeTree.new(readers)
  #
  #   tree.each do |record|
  #     writer.write(record)
  #   end
  #   writer.close
  #
  #
  class MergeTree
    include Enumerable

    attr_reader :leaves
    attr_reader :levels
    attr_reader :readers

    # readers - the list of Reader objects to merge; it may be empty, in
    #           which case the tree has no levels and #each yields nothing.
    def initialize(readers)
      @readers = readers
      @leaves = []
      @levels = []

      @readers.each do |reader|
        @leaves << LeafNode.new(reader)
      end

      # Need to pad the leaves to an even number so that the slicing by 2 for
      # the tournament will work
      if @leaves.size.odd? then
        @leaves << SentinelLeafNode.new
      end

      init_tree
    end

    #
    # Initialize the tournament tree, go in depths - bottom layer will be the
    # winners of the 2 leaf nodes, continuing until top layer is just 1 node
    #
    def init_tree
      values = @leaves.dup

      # stop at one node, and also at none: with no readers there are no
      # leaves, and waiting for exactly one node would loop forever, adding
      # an empty level each time round
      while values.size > 1
        winners = []

        # we always need a left and a right node, there is the possibility of
        # adding a single Sentinel node as the final right node in each level
        values.each_slice(2) do |left, right|
          right = SentinelInternalNode.new if right.nil?
          winners << InternalNode.new(left: left, right: right)
        end
        values = winners
        @levels << winners
      end
    end

    #
    # Root is the last level - should only have one node. Returns nil when the
    # tree was built from no readers and so has no levels.
    #
    def root
      top = @levels.last
      top && top.first
    end

    #
    # The number of levels; every level has half the nodes (rounded up) of the
    # one below it, so this should be about log2 of the number of leaves
    #
    def depth
      @levels.size
    end

    #
    # Iterate over all the elements from all the readers yielding them in sorted
    # order. Returns an Enumerator when called without a block.
    #
    def each
      return to_enum(:each) unless block_given?

      loop do
        break if root.nil? || root.leaf.finished?
        yield root.value
        # consume the yielded value and have the tournament tree replay those
        # brackets affected
        root.leaf.update_and_replay
      end
    end
  end
end
|