flat_kit 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CONTRIBUTING.md +46 -0
- data/HISTORY.md +5 -0
- data/LICENSE.txt +21 -0
- data/Manifest.txt +66 -0
- data/README.md +80 -0
- data/Rakefile +20 -0
- data/bin/fk +4 -0
- data/lib/flat_kit.rb +23 -0
- data/lib/flat_kit/cli.rb +80 -0
- data/lib/flat_kit/command.rb +53 -0
- data/lib/flat_kit/command/cat.rb +93 -0
- data/lib/flat_kit/command/merge.rb +88 -0
- data/lib/flat_kit/command/sort.rb +88 -0
- data/lib/flat_kit/descendant_tracker.rb +27 -0
- data/lib/flat_kit/error.rb +5 -0
- data/lib/flat_kit/format.rb +34 -0
- data/lib/flat_kit/input.rb +32 -0
- data/lib/flat_kit/input/file.rb +53 -0
- data/lib/flat_kit/input/io.rb +54 -0
- data/lib/flat_kit/internal_node.rb +84 -0
- data/lib/flat_kit/jsonl.rb +8 -0
- data/lib/flat_kit/jsonl/format.rb +25 -0
- data/lib/flat_kit/jsonl/reader.rb +30 -0
- data/lib/flat_kit/jsonl/record.rb +84 -0
- data/lib/flat_kit/jsonl/writer.rb +45 -0
- data/lib/flat_kit/leaf_node.rb +71 -0
- data/lib/flat_kit/logger.rb +39 -0
- data/lib/flat_kit/merge.rb +35 -0
- data/lib/flat_kit/merge_tree.rb +104 -0
- data/lib/flat_kit/output.rb +32 -0
- data/lib/flat_kit/output/file.rb +55 -0
- data/lib/flat_kit/output/io.rb +73 -0
- data/lib/flat_kit/reader.rb +61 -0
- data/lib/flat_kit/record.rb +83 -0
- data/lib/flat_kit/sentinel_internal_node.rb +37 -0
- data/lib/flat_kit/sentinel_leaf_node.rb +37 -0
- data/lib/flat_kit/sort.rb +35 -0
- data/lib/flat_kit/writer.rb +38 -0
- data/lib/flat_kit/xsv.rb +8 -0
- data/lib/flat_kit/xsv/format.rb +25 -0
- data/lib/flat_kit/xsv/reader.rb +45 -0
- data/lib/flat_kit/xsv/record.rb +90 -0
- data/lib/flat_kit/xsv/writer.rb +70 -0
- data/tasks/default.rake +242 -0
- data/tasks/extension.rake +38 -0
- data/tasks/man.rake +7 -0
- data/tasks/this.rb +208 -0
- data/test/device_dataset.rb +117 -0
- data/test/input/test_file.rb +73 -0
- data/test/input/test_io.rb +93 -0
- data/test/jsonl/test_format.rb +22 -0
- data/test/jsonl/test_reader.rb +49 -0
- data/test/jsonl/test_record.rb +61 -0
- data/test/jsonl/test_writer.rb +68 -0
- data/test/output/test_file.rb +60 -0
- data/test/output/test_io.rb +104 -0
- data/test/test_conversions.rb +45 -0
- data/test/test_format.rb +24 -0
- data/test/test_helper.rb +26 -0
- data/test/test_merge.rb +40 -0
- data/test/test_merge_tree.rb +64 -0
- data/test/test_version.rb +11 -0
- data/test/xsv/test_format.rb +22 -0
- data/test/xsv/test_reader.rb +61 -0
- data/test/xsv/test_record.rb +69 -0
- data/test/xsv/test_writer.rb +68 -0
- metadata +237 -0
@@ -0,0 +1,25 @@
|
|
1
|
+
module FlatKit
  module Jsonl
    # Format descriptor for JSON-lines data: it names the format, decides
    # which filenames belong to it, and points at its Reader and Writer.
    class Format < ::FlatKit::Format
      # Returns the name this format is registered under.
      def self.format_name
        "jsonl"
      end

      # Returns true when any dot-separated segment of +filename+ is one of
      # json, jsonl or ndjson, and false otherwise. Every segment is checked,
      # not only the last one, so "data.jsonl.gz" is matched as well.
      def self.handles?(filename)
        segments = filename.split(".")
        %w[ json jsonl ndjson ].any? { |ext| segments.include?(ext) }
      end

      # Returns the class used to read this format.
      def self.reader
        ::FlatKit::Jsonl::Reader
      end

      # Returns the class used to write this format.
      def self.writer
        ::FlatKit::Jsonl::Writer
      end
    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module FlatKit
  module Jsonl
    # Reads a JSON-lines source one line at a time, wrapping every line in a
    # ::FlatKit::Jsonl::Record.
    class Reader < ::FlatKit::Reader
      # input - the object returned by ::FlatKit::Input.from for the source
      # count - how many records have been created so far
      attr_reader :input, :count

      def self.format_name
        ::FlatKit::Jsonl::Format.format_name
      end

      # source         - handed to ::FlatKit::Input.from to obtain the input
      # compare_fields - forwarded to the parent class and to each Record
      def initialize(source:, compare_fields: :none)
        super
        @input = ::FlatKit::Input.from(source)
        @count = 0
      end

      # Yields one Record per line read from the input's io, incrementing
      # +count+ after each record is built and before it is yielded. The input
      # is closed once no more lines are available.
      #
      # Any StandardError raised while reading, building a record, yielding or
      # closing is logged and re-raised as ::FlatKit::Error.
      def each
        while (line = input.io.gets)
          parsed = ::FlatKit::Jsonl::Record.new(data: line, compare_fields: compare_fields)
          @count += 1
          yield parsed
        end
        input.close
      rescue => e
        ::FlatKit.logger.error "Error reading jsonl records from #{input.name}: #{e}"
        raise ::FlatKit::Error, e
      end
    end
  end
end
|
@@ -0,0 +1,84 @@
|
|
1
|
+
require 'oj'
|
2
|
+
require 'flat_kit/record'
|
3
|
+
|
4
|
+
module FlatKit
  module Jsonl
    # A single JSON-lines record.
    #
    # A record can be built from a raw JSON string (+data+), from an already
    # parsed structure (+complete_structured_data+), or both. The values
    # exposed through #[] come from +compare_data+.
    class Record < ::FlatKit::Record
      # The hash consulted by #[].
      attr_reader :compare_data

      def self.format_name
        ::FlatKit::Jsonl::Format.format_name
      end

      # Build a Jsonl::Record from another record.
      #
      # A record that is exactly a Jsonl::Record is copied, reusing its raw
      # data, its compare data and - only if it has already been parsed - its
      # structured data. Any other record is converted through its #to_hash.
      def self.from_record(record)
        if record.instance_of?(FlatKit::Jsonl::Record)
          structured = record.complete_structured_data? ? record.complete_structured_data : nil

          new(data: record.data, compare_fields: record.compare_fields,
              compare_data: record.compare_data,
              complete_structured_data: structured)
        else
          new(data: nil, compare_fields: record.compare_fields,
              complete_structured_data: record.to_hash)
        end
      end

      # data                     - raw JSON text, or nil when only structured
      #                            data is available
      # compare_fields           - the fields to extract for comparison
      # compare_data             - previously extracted compare values; nil is
      #                            treated the same as an empty hash
      # complete_structured_data - the fully parsed record, if already known
      def initialize(data:, compare_fields: :none,
                     compare_data: Hash.new,
                     complete_structured_data: nil)
        super(data: data, compare_fields: compare_fields)

        @complete_structured_data = complete_structured_data

        # A nil compare_data used to reach `.empty?` below and raise.
        supplied = compare_data || Hash.new

        @compare_data = if complete_structured_data? && supplied.empty?
                          complete_structured_data
                        else
                          supplied
                        end

        # only load compare data if it doesn't exist; checking the effective
        # compare data avoids re-parsing the raw text (and writing into the
        # structured hash) when the structured data already provides it.
        quick_parse if data && @compare_data.empty?
      end

      # Returns the compare value stored under +key+.
      def [](key)
        compare_data[key]
      end

      # The fully parsed record; parsed from the raw data on first use.
      def complete_structured_data
        @complete_structured_data ||= Oj.load(data)
      end
      alias to_hash complete_structured_data

      # True when parsed data is already present and non-empty. Never triggers
      # a parse.
      def complete_structured_data?
        !(@complete_structured_data.nil? || @complete_structured_data.empty?)
      end

      # overriding parent accessor since we may be initialized without raw bytes
      # to parse; in that case the raw text is generated from the structured
      # data on first access.
      def data
        if @data.nil? && complete_structured_data?
          @data = Oj.dump(complete_structured_data)
        end
        @data
      end
      alias to_s data

      private

      # Pull only the compare fields out of the raw JSON via Oj::Doc and store
      # them in @compare_data.
      def quick_parse
        # compare_fields defaults to :none, which cannot be iterated; in that
        # case there is nothing to extract.
        return unless compare_fields.respond_to?(:each)

        Oj::Doc.open(data) do |doc|
          compare_fields.each do |field|
            @compare_data[field] = doc.fetch("/#{field}")
          end
        end
      end
    end
  end
end
|
82
|
+
|
83
|
+
|
84
|
+
|
@@ -0,0 +1,45 @@
|
|
1
|
+
module FlatKit
  module Jsonl
    # Writes records to a destination as JSON-lines, one record per line.
    class Writer < ::FlatKit::Writer
      # The object returned by ::FlatKit::Output.from for the destination.
      attr_reader :output
      # How many records have been written so far.
      attr_reader :count

      def self.format_name
        ::FlatKit::Jsonl::Format.format_name
      end

      # destination - forwarded to the parent class; @destination is presumably
      #               set there (TODO confirm) and is used to open the output.
      def initialize(destination:)
        super
        @output = ::FlatKit::Output.from(@destination)
        @count = 0
      end

      # Write a single record.
      #
      # Jsonl records are written as they are; any other FlatKit::Record is
      # first converted with Jsonl::Record.from_record.
      #
      # Raises FlatKit::Error for objects that are not records. A FlatKit::Error
      # raised while writing is passed through untouched; any other
      # StandardError is logged and re-raised as FlatKit::Error.
      def write(record)
        case record
        when FlatKit::Jsonl::Record
          write_record(record)
        when FlatKit::Record
          converted_record = ::FlatKit::Jsonl::Record.from_record(record)
          write_record(converted_record)
        else
          raise FlatKit::Error, "Unable to write records of type #{record.class}"
        end
      rescue FlatKit::Error
        raise
      rescue => e
        # This message used to say "reading ... from", copied from the reader.
        ::FlatKit.logger.error "Error writing jsonl records to #{output.name}: #{e}"
        raise ::FlatKit::Error, e
      end

      # Close the underlying output.
      def close
        @output.close
      end

      # Emit the record's string form on the output's io and count it.
      def write_record(record)
        # puts enforces ending in a newline if it doesn't already have one
        output.io.puts record.to_s
        @count += 1
      end
    end
  end
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
module FlatKit
  # Private: The LeafNode is a wrapper around a Reader object to enable
  # a consistent api for use in the MergeTree
  #
  # The LeafNode keeps track of the head of the reader list and when its value
  # is used up, it pulls the next value and then notifies the next level of the
  # MergeTree that its value has changed and so should do another play.
  #
  # If all the data is used up from the reader, it also notifies the next level
  # of that so the next level can remove it from the tree.
  class LeafNode

    include Comparable

    attr_reader :reader
    attr_reader :value

    attr_accessor :next_level

    # reader - any object whose #to_enum yields the values in order
    #
    # The first value is loaded immediately. A reader that has no values at
    # all leaves the node finished (value nil) instead of letting
    # StopIteration escape from the constructor.
    def initialize(reader)
      @reader = reader
      @enum = @reader.to_enum
      @value = nil
      @next_level = nil

      self.next
    end

    # The value this node currently contributes.
    def winner
      value
    end

    def sentinel?
      false
    end

    def leaf?
      true
    end

    def leaf
      self
    end

    # Consume the current value, load the following one and ask the next level
    # to play again. When the reader has run out, the next level is told that
    # this player finished before it replays.
    def update_and_replay
      self.next
      if finished?
        ::FlatKit.logger.debug "#{reader.source} has finished reading #{reader.count} records"
        next_level.player_finished(self)
      end
      next_level.play
    end

    # Advance to the following value of the reader and return it; once the
    # reader is used up the value becomes nil.
    def next
      @value = @enum.next
    rescue StopIteration
      @value = nil
    end

    # True once the reader has no more values.
    def finished?
      @enum && @value.nil?
    end

    # Order nodes by their current value.
    #
    # This node always sorts before a sentinel. A finished node has no value
    # to compare, so it sorts after every node that still has one; two
    # finished nodes are equal.
    def <=>(other)
      return -1 if other.sentinel?

      other_finished = other.respond_to?(:finished?) && other.finished?
      return (other_finished ? 0 : 1) if finished?
      return -1 if other_finished

      value <=> other.value
    end
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
require 'logger'
|
2
|
+
|
3
|
+
module FlatKit
  # Formats log lines as "<UTC timestamp> <pid> <severity> : <message>".
  class LogFormatter < ::Logger::Formatter
    FORMAT = "%s %5d %05s : %s\n".freeze
    DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ".freeze

    def initialize
      super
      self.datetime_format = DATETIME_FORMAT
    end

    # Build one log line. +progname+ is accepted to satisfy the formatter
    # interface but is not part of the output.
    def call(severity, time, progname, msg)
      # getutc returns a new Time; Time#utc would convert the caller's object
      # in place (and raise on a frozen Time).
      FORMAT % [format_datetime(time.getutc), Process.pid, severity, msg2str(msg)]
    end
  end

  # Factory methods building a stdlib ::Logger that uses LogFormatter.
  class Logger
    # Returns a ::Logger writing to the given io object.
    def self.for_io(io)
      ::Logger.new(io, formatter: LogFormatter.new)
    end

    # Returns a ::Logger appending to the file at +path+.
    def self.for_path(path)
      io = File.open(path.to_s, "a")
      for_io(io)
    end
  end

  # Replace the library logger with one writing to +destination+ and return it.
  #
  # destination - an IO, any IO-like object responding to both #write and
  #               #close (StringIO, Tempfile, ...), or otherwise a path that
  #               is opened for appending.
  #
  # Only real ::IO instances used to be recognised, so IO-like objects were
  # stringified and opened as a file path.
  def self.log_to(destination = $stderr)
    io_like = destination.kind_of?(::IO) ||
              (destination.respond_to?(:write) && destination.respond_to?(:close))

    @logger = if io_like
                ::FlatKit::Logger.for_io(destination)
              else
                ::FlatKit::Logger.for_path(destination)
              end
  end

  # The library logger; logs to $stderr unless log_to has been called.
  def self.logger
    @logger ||= ::FlatKit::Logger.for_io($stderr)
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
module FlatKit
  # Merges several inputs into one output, ordering the records with a
  # MergeTree.
  class Merge
    attr_reader :readers
    attr_reader :writer
    attr_reader :compare_fields

    # inputs          - paths handed to Reader.create_readers_from_paths
    # input_fallback  - fallback passed along when creating the readers
    # output          - path handed to Writer.create_writer_from_path
    # output_fallback - fallback passed along when creating the writer
    # compare_fields  - the fields the records are ordered by
    def initialize(inputs:, input_fallback: "auto",
                   output:, output_fallback: "auto",
                   compare_fields:)
      @compare_fields = compare_fields
      @readers = ::FlatKit::Reader.create_readers_from_paths(paths: inputs, compare_fields: @compare_fields,
                                                             fallback: input_fallback)
      @writer = ::FlatKit::Writer.create_writer_from_path(path: output, fallback: output_fallback,
                                                          reader_format: @readers.first.format_name)
    end

    # Run the merge: every record produced by the MergeTree is written to the
    # writer, with progress reported through the library logger.
    #
    # The writer is closed even when the merge raises, so a failure part way
    # through does not leave the output open.
    def call
      begin
        ::FlatKit.logger.info "Merging the following files into #{writer.destination}"
        ::FlatKit.logger.info "Using this key for sorting: #{compare_fields.join(", ")}"
        readers.each do |r|
          ::FlatKit.logger.info " #{r.source}"
        end

        merge_tree = ::FlatKit::MergeTree.new(readers)
        merge_tree.each do |record|
          writer.write(record)
        end
        readers.each do |r|
          ::FlatKit.logger.info " #{r.source} produced #{r.count} records"
        end
      ensure
        writer.close
      end
      ::FlatKit.logger.info "Wrote #{writer.count} records to #{writer.destination}"
    end
  end
end
|
@@ -0,0 +1,104 @@
|
|
1
|
+
module FlatKit
  # Public: Merge a list of sorted records from Readers into a single output Writer
  #
  # The MergeTree implements a Tournament Tree algorithm to do a k-way merge
  # between Reader objects:
  #
  #   https://en.wikipedia.org/wiki/K-way_merge_algorithm
  #
  # Usage:
  #
  #   compare_fields = %w[ key timestamp ]
  #   format = ::FlatKit::Format.for('json')
  #
  #   readers = ARGV.map do |path|
  #     format.reader.new(source: path, compare_fields: compare_fields)
  #   end
  #
  #   write_path = "merged.jsonl"
  #   writer = format.writer.new(destination: write_path.to_s)
  #
  #   tree = ::FlatKit::MergeTree.new(readers)
  #
  #   tree.each do |record|
  #     writer.write(record)
  #   end
  #   writer.close
  #
  #
  class MergeTree
    include Enumerable

    attr_reader :leaves
    attr_reader :levels
    attr_reader :readers

    # readers - the Reader objects to merge. Readers that have no records at
    #           all are left out of the tree, since they have nothing to
    #           contribute to the tournament.
    def initialize(readers)
      @readers = readers
      @leaves  = []
      @levels  = []

      @readers.each do |reader|
        leaf = build_leaf(reader)
        @leaves << leaf if leaf
      end

      # Need to pad the leaves to an even number so that the slicing by 2 for
      # the tournament will work
      if @leaves.size.odd?
        @leaves << SentinelLeafNode.new
      end

      init_tree
    end

    #
    # Initialize the tournament tree, go in depths - bottom layer will be the
    # winners of the 2 leaf nodes, continuing until top layer is just 1 node
    #
    def init_tree
      values = @leaves.dup
      loop do
        # <= 1 rather than == 1: with no leaves at all there is nothing to
        # pair up, and the loop would otherwise append empty levels forever.
        break if values.size <= 1

        winners = []

        # we always need a left and a right node, there is the possibility of
        # adding a single Sentinel node as the final right node in each level
        values.each_slice(2) do |left, right|
          right = SentinelInternalNode.new if right.nil?
          winners << InternalNode.new(left: left, right: right)
        end
        values = winners
        @levels << winners
      end
    end

    #
    # Root is the last level - should only have one node. Returns nil when the
    # tree is empty.
    #
    def root
      (@levels.last || []).first
    end

    #
    # The number of levels, this should be the logn(readers.size)
    #
    def depth
      @levels.size
    end

    #
    # Iterate over all the elements from all the readers yielding them in sorted
    # order. Returns an Enumerator when called without a block, and yields
    # nothing when the tree is empty.
    #
    def each
      return to_enum(:each) unless block_given?
      return if root.nil?

      loop do
        break if root.leaf.finished?
        yield root.value
        # consume the yielded value and have the tournament tree replay those
        # brackets affected
        root.leaf.update_and_replay
      end
    end

    private

    # Wrap a reader in a LeafNode; returns nil for a reader with no records,
    # whether the leaf signals that by raising StopIteration or by being
    # finished straight away.
    def build_leaf(reader)
      leaf = LeafNode.new(reader)
      leaf.finished? ? nil : leaf
    rescue StopIteration
      nil
    end
  end
end
|