flat_kit 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. checksums.yaml +7 -0
  2. data/CONTRIBUTING.md +46 -0
  3. data/HISTORY.md +5 -0
  4. data/LICENSE.txt +21 -0
  5. data/Manifest.txt +66 -0
  6. data/README.md +80 -0
  7. data/Rakefile +20 -0
  8. data/bin/fk +4 -0
  9. data/lib/flat_kit.rb +23 -0
  10. data/lib/flat_kit/cli.rb +80 -0
  11. data/lib/flat_kit/command.rb +53 -0
  12. data/lib/flat_kit/command/cat.rb +93 -0
  13. data/lib/flat_kit/command/merge.rb +88 -0
  14. data/lib/flat_kit/command/sort.rb +88 -0
  15. data/lib/flat_kit/descendant_tracker.rb +27 -0
  16. data/lib/flat_kit/error.rb +5 -0
  17. data/lib/flat_kit/format.rb +34 -0
  18. data/lib/flat_kit/input.rb +32 -0
  19. data/lib/flat_kit/input/file.rb +53 -0
  20. data/lib/flat_kit/input/io.rb +54 -0
  21. data/lib/flat_kit/internal_node.rb +84 -0
  22. data/lib/flat_kit/jsonl.rb +8 -0
  23. data/lib/flat_kit/jsonl/format.rb +25 -0
  24. data/lib/flat_kit/jsonl/reader.rb +30 -0
  25. data/lib/flat_kit/jsonl/record.rb +84 -0
  26. data/lib/flat_kit/jsonl/writer.rb +45 -0
  27. data/lib/flat_kit/leaf_node.rb +71 -0
  28. data/lib/flat_kit/logger.rb +39 -0
  29. data/lib/flat_kit/merge.rb +35 -0
  30. data/lib/flat_kit/merge_tree.rb +104 -0
  31. data/lib/flat_kit/output.rb +32 -0
  32. data/lib/flat_kit/output/file.rb +55 -0
  33. data/lib/flat_kit/output/io.rb +73 -0
  34. data/lib/flat_kit/reader.rb +61 -0
  35. data/lib/flat_kit/record.rb +83 -0
  36. data/lib/flat_kit/sentinel_internal_node.rb +37 -0
  37. data/lib/flat_kit/sentinel_leaf_node.rb +37 -0
  38. data/lib/flat_kit/sort.rb +35 -0
  39. data/lib/flat_kit/writer.rb +38 -0
  40. data/lib/flat_kit/xsv.rb +8 -0
  41. data/lib/flat_kit/xsv/format.rb +25 -0
  42. data/lib/flat_kit/xsv/reader.rb +45 -0
  43. data/lib/flat_kit/xsv/record.rb +90 -0
  44. data/lib/flat_kit/xsv/writer.rb +70 -0
  45. data/tasks/default.rake +242 -0
  46. data/tasks/extension.rake +38 -0
  47. data/tasks/man.rake +7 -0
  48. data/tasks/this.rb +208 -0
  49. data/test/device_dataset.rb +117 -0
  50. data/test/input/test_file.rb +73 -0
  51. data/test/input/test_io.rb +93 -0
  52. data/test/jsonl/test_format.rb +22 -0
  53. data/test/jsonl/test_reader.rb +49 -0
  54. data/test/jsonl/test_record.rb +61 -0
  55. data/test/jsonl/test_writer.rb +68 -0
  56. data/test/output/test_file.rb +60 -0
  57. data/test/output/test_io.rb +104 -0
  58. data/test/test_conversions.rb +45 -0
  59. data/test/test_format.rb +24 -0
  60. data/test/test_helper.rb +26 -0
  61. data/test/test_merge.rb +40 -0
  62. data/test/test_merge_tree.rb +64 -0
  63. data/test/test_version.rb +11 -0
  64. data/test/xsv/test_format.rb +22 -0
  65. data/test/xsv/test_reader.rb +61 -0
  66. data/test/xsv/test_record.rb +69 -0
  67. data/test/xsv/test_writer.rb +68 -0
  68. metadata +237 -0
@@ -0,0 +1,8 @@
# Top-level namespace of the flat_kit library.
module FlatKit
  # Namespace for the JSON-lines (jsonl) support classes. It is declared
  # empty here so that it exists before the individual jsonl files, which
  # reopen it, are required.
  module Jsonl; end
end
5
+ require 'flat_kit/jsonl/record'
6
+ require 'flat_kit/jsonl/reader'
7
+ require 'flat_kit/jsonl/writer'
8
+ require 'flat_kit/jsonl/format'
@@ -0,0 +1,25 @@
module FlatKit
  module Jsonl
    # Format descriptor for JSON-lines data: it names the format, decides
    # whether a file name belongs to it, and points at the reader and writer
    # classes that handle it.
    class Format < ::FlatKit::Format
      # Returns the String name of this format.
      def self.format_name
        "jsonl"
      end

      # Returns true when any dot-separated component of +filename+ is one of
      # "json", "jsonl" or "ndjson", false otherwise. Every component is
      # checked, not only the last one, so "data.jsonl.gz" and the bare name
      # "json" both match.
      def self.handles?(filename)
        recognized = %w[ json jsonl ndjson ]
        filename.split(".").any? { |component| recognized.include?(component) }
      end

      # Returns the class used to read this format.
      def self.reader
        ::FlatKit::Jsonl::Reader
      end

      # Returns the class used to write this format.
      def self.writer
        ::FlatKit::Jsonl::Writer
      end
    end
  end
end
@@ -0,0 +1,30 @@
module FlatKit
  module Jsonl
    # Reads a JSON-lines source one line at a time, wrapping every line in a
    # ::FlatKit::Jsonl::Record.
    class Reader < ::FlatKit::Reader
      # The object returned by ::FlatKit::Input.from for the source.
      attr_reader :input

      # Number of records yielded so far.
      attr_reader :count

      # Returns the String name of the format this reader handles.
      def self.format_name
        ::FlatKit::Jsonl::Format.format_name
      end

      # source         - whatever ::FlatKit::Input.from accepts.
      # compare_fields - forwarded untouched to the parent class and to every
      #                  record that is created (default: :none).
      def initialize(source:, compare_fields: :none)
        super
        @input = ::FlatKit::Input.from(source)
        @count = 0
      end

      # Yields one Record per line of the input and closes the input once the
      # last line has been consumed. Without a block an Enumerator is
      # returned.
      #
      # Any StandardError raised while reading, building a record, or inside
      # the caller's block is logged and re-raised as ::FlatKit::Error. The
      # input is closed on that path too, so a failed read does not leak the
      # underlying handle.
      def each
        return to_enum(:each) unless block_given?

        begin
          while (line = input.io.gets)
            record = ::FlatKit::Jsonl::Record.new(data: line, compare_fields: compare_fields)
            @count += 1
            yield record
          end
          input.close
        rescue => e
          ::FlatKit.logger.error "Error reading jsonl records from #{input.name}: #{e}"
          close_input_after_failure
          raise ::FlatKit::Error, e
        end
      end

      private

      # Best-effort close used on the error path. A failure to close is only
      # logged so that it cannot mask the error that is about to be raised.
      def close_input_after_failure
        input.close
      rescue StandardError => close_error
        ::FlatKit.logger.debug "Unable to close #{input.name}: #{close_error}"
      end
    end
  end
end
@@ -0,0 +1,84 @@
1
+ require 'oj'
2
+ require 'flat_kit/record'
3
+
module FlatKit
  module Jsonl
    # A single JSON-lines record.
    #
    # A record is backed by the raw JSON text (+data+), by an already parsed
    # structure (+complete_structured_data+), or by both. Whichever one is
    # missing is produced on demand from the other. Only the values needed
    # for comparisons are extracted eagerly, into +compare_data+.
    class Record < ::FlatKit::Record
      # Hash of the values used when comparing records.
      attr_reader :compare_data

      # Returns the String name of the format this record belongs to.
      def self.format_name
        ::FlatKit::Jsonl::Format.format_name
      end

      # Builds a Jsonl::Record from another record.
      #
      # A record that is exactly a Jsonl::Record is copied together with its
      # raw data, its compare data and, if it has already been parsed, its
      # structured data. Any other record is converted through its #to_hash
      # and carries no raw data.
      def self.from_record(record)
        if record.instance_of?(FlatKit::Jsonl::Record)
          structured = record.complete_structured_data? ? record.complete_structured_data : nil

          new(data: record.data, compare_fields: record.compare_fields,
              compare_data: record.compare_data,
              complete_structured_data: structured)
        else
          new(data: nil, compare_fields: record.compare_fields,
              complete_structured_data: record.to_hash)
        end
      end

      # data                     - raw JSON text of the record, or nil.
      # compare_fields           - the fields used for comparison, handed to
      #                            the parent class.
      # compare_data             - already extracted comparison values; nil
      #                            is treated like an empty Hash.
      # complete_structured_data - the fully parsed record, or nil.
      def initialize(data:, compare_fields: :none,
                     compare_data: Hash.new,
                     complete_structured_data: nil)
        super(data: data, compare_fields: compare_fields)

        @complete_structured_data = complete_structured_data

        # Normalise once so a nil compare_data can neither break the empty?
        # check below nor later lookups through #[].
        supplied = compare_data || {}

        @compare_data =
          if complete_structured_data? && supplied.empty?
            complete_structured_data
          else
            supplied
          end

        # only load compare data if it doesn't exist
        quick_parse if data && supplied.empty?
      end

      # Returns the comparison value stored under +key+.
      def [](key)
        compare_data[key]
      end

      # Returns the fully parsed record, parsing the raw data on first use.
      def complete_structured_data
        @complete_structured_data ||= Oj.load(data)
      end
      alias to_hash complete_structured_data

      # Returns true when parsed data is already present and not empty. It
      # never triggers parsing.
      def complete_structured_data?
        !(@complete_structured_data.nil? || @complete_structured_data.empty?)
      end

      # Overrides the parent accessor since the record may have been created
      # without raw bytes; in that case the text is generated from the
      # structured data on first use.
      def data
        if @data.nil? && complete_structured_data?
          @data = Oj.dump(complete_structured_data)
        end
        @data
      end
      alias to_s data

      private

      # Pulls just the compare fields out of the raw JSON without building
      # the complete structure.
      def quick_parse
        # This class defaults compare_fields to the Symbol :none, which cannot
        # be iterated, so there is nothing to extract in that case.
        # NOTE(review): assumes the parent returns compare_fields as given - confirm.
        return unless compare_fields.respond_to?(:each)

        Oj::Doc.open(data) do |doc|
          compare_fields.each do |field|
            @compare_data[field] = doc.fetch("/#{field}")
          end
        end
      end
    end
  end
end
82
+
83
+
84
+
@@ -0,0 +1,45 @@
module FlatKit
  module Jsonl
    # Writes records to an output as JSON lines, one record per line.
    class Writer < ::FlatKit::Writer
      # The object returned by ::FlatKit::Output.from for the destination.
      attr_reader :output

      # Number of records written so far.
      attr_reader :count

      # Returns the String name of the format this writer produces.
      def self.format_name
        ::FlatKit::Jsonl::Format.format_name
      end

      # destination - handed to the parent class; the output is then built
      #               from the @destination the parent is expected to set.
      def initialize(destination:)
        super
        @output = ::FlatKit::Output.from(@destination)
        @count = 0
      end

      # Writes +record+ to the output.
      #
      # Jsonl records are written as they are; any other FlatKit::Record is
      # converted with Jsonl::Record.from_record first.
      #
      # Raises FlatKit::Error when +record+ is not a FlatKit::Record, or when
      # anything goes wrong while converting or writing. A FlatKit::Error
      # passes through unchanged; other errors are logged and wrapped.
      def write(record)
        case record
        when FlatKit::Jsonl::Record
          write_record(record)
        when FlatKit::Record
          write_record(::FlatKit::Jsonl::Record.from_record(record))
        else
          raise FlatKit::Error, "Unable to write records of type #{record.class}"
        end
      rescue FlatKit::Error
        raise
      rescue => e
        # This is the write path, so the message describes a write failure.
        ::FlatKit.logger.error "Error writing jsonl records to #{output.name}: #{e}"
        raise ::FlatKit::Error, e
      end

      # Closes the underlying output.
      def close
        @output.close
      end

      # Writes the record's text and bumps the record count.
      def write_record(record)
        # puts enforces a trailing newline if the text does not already have one
        output.io.puts record.to_s
        @count += 1
      end
    end
  end
end
@@ -0,0 +1,71 @@
module FlatKit
  # Private: A LeafNode adapts a Reader to the node api the MergeTree works
  # with.
  #
  # It holds the reader's current head record as its value. When that value
  # has been consumed it pulls the next one and asks the level above it to
  # play again. When the reader has nothing left it first tells the level
  # above that this player has finished.
  class LeafNode
    include Comparable

    attr_reader :reader, :value
    attr_accessor :next_level

    # reader - any object whose #to_enum enumerates its records.
    #
    # The first record is fetched right away, so a reader without records
    # raises StopIteration from here.
    def initialize(reader)
      @reader     = reader
      @enum       = reader.to_enum
      @next_level = nil
      @value      = @enum.next
    end

    # A leaf is always its own winner.
    def winner
      value
    end

    def sentinel?
      false
    end

    def leaf?
      true
    end

    def leaf
      self
    end

    # Moves on to the next record and has the next level replay. If the
    # reader ran dry the next level is told about it before the replay.
    def update_and_replay
      self.next

      if finished?
        ::FlatKit.logger.debug "#{reader.source} has finished reading #{reader.count} records"
        next_level.player_finished(self)
      end

      next_level.play
    end

    # Replaces the value with the next record, or with nil once the reader
    # is exhausted. Returns the new value.
    def next
      @value = @enum.next
    rescue StopIteration
      @value = nil
    end

    # True once the reader has been exhausted.
    def finished?
      @enum && @value.nil?
    end

    # A leaf sorts ahead of any sentinel; otherwise the values decide.
    def <=>(other)
      other.sentinel? ? -1 : value <=> other.value
    end
  end
end
@@ -0,0 +1,39 @@
1
+ require 'logger'
2
+
module FlatKit
  # Formats log lines as "<UTC timestamp> <pid> <severity> : <message>".
  class LogFormatter < ::Logger::Formatter
    FORMAT = "%s %5d %05s : %s\n".freeze
    DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ".freeze

    def initialize
      super
      self.datetime_format = DATETIME_FORMAT
    end

    # Returns the formatted line for one log call. +progname+ is accepted for
    # compatibility with the formatter interface but is not part of the line.
    def call(severity, time, progname, msg)
      # getutc returns a converted copy; Time#utc would switch the caller's
      # Time object to UTC in place.
      FORMAT % [format_datetime(time.getutc), Process.pid, severity, msg2str(msg)]
    end
  end

  # Factory for the standard library loggers used by FlatKit.
  class Logger
    # Returns a ::Logger that writes to +io+ using LogFormatter.
    def self.for_io(io)
      ::Logger.new(io, formatter: LogFormatter.new)
    end

    # Returns a ::Logger that appends to the file at +path+, which may be
    # anything that responds to #to_s.
    def self.for_path(path)
      io = File.open(path.to_s, "a")
      # Unbuffered, so each line reaches the file when it is logged instead
      # of whenever the buffer happens to be flushed.
      io.sync = true
      for_io(io)
    end
  end

  # Sends FlatKit's logging to +destination+ and returns the new logger.
  #
  # destination - an IO, any other stream-like object that responds to both
  #               #write and #close (for example a StringIO), or a path.
  #               Defaults to $stderr.
  def self.log_to(destination = $stderr)
    # Requiring #close as well keeps Pathname, which also responds to #write,
    # on the path branch.
    stream = destination.kind_of?(::IO) ||
             (destination.respond_to?(:write) && destination.respond_to?(:close))

    @logger = if stream
                ::FlatKit::Logger.for_io(destination)
              else
                ::FlatKit::Logger.for_path(destination)
              end
  end

  # Returns the current logger, creating one that writes to $stderr on first
  # use.
  def self.logger
    @logger ||= ::FlatKit::Logger.for_io($stderr)
  end
end
@@ -0,0 +1,35 @@
module FlatKit
  # Merges several inputs into one output through a MergeTree, using
  # +compare_fields+ as the sort key.
  class Merge
    attr_reader :readers
    attr_reader :writer
    attr_reader :compare_fields

    # inputs          - the paths handed to Reader.create_readers_from_paths.
    # input_fallback  - fallback passed along when creating the readers.
    # output          - the path handed to Writer.create_writer_from_path.
    # output_fallback - fallback passed along when creating the writer.
    # compare_fields  - the fields the records are ordered by.
    #
    # Raises ::FlatKit::Error when no reader could be created, since the
    # writer's format is derived from the first reader.
    def initialize(inputs:, input_fallback: "auto",
                   output:, output_fallback: "auto",
                   compare_fields:)
      @compare_fields = compare_fields
      @readers = ::FlatKit::Reader.create_readers_from_paths(paths: inputs, compare_fields: @compare_fields,
                                                             fallback: input_fallback)
      raise ::FlatKit::Error, "Unable to merge: no inputs were given" if @readers.empty?

      @writer = ::FlatKit::Writer.create_writer_from_path(path: output, fallback: output_fallback,
                                                          reader_format: @readers.first.format_name)
    end

    # Runs the merge, writing every record to the writer and logging
    # progress along the way. The writer is closed even when the merge
    # fails, so the output handle is never left open.
    def call
      ::FlatKit.logger.info "Merging the following files into #{writer.destination}"
      ::FlatKit.logger.info "Using this key for sorting: #{compare_fields.join(", ")}"
      readers.each do |r|
        ::FlatKit.logger.info " #{r.source}"
      end

      begin
        merge_tree = ::FlatKit::MergeTree.new(readers)
        merge_tree.each do |record|
          writer.write(record)
        end
        readers.each do |r|
          ::FlatKit.logger.info " #{r.source} produced #{r.count} records"
        end
      ensure
        writer.close
      end

      ::FlatKit.logger.info "Wrote #{writer.count} records to #{writer.destination}"
    end
  end
end
@@ -0,0 +1,104 @@
module FlatKit
  # Public: Merge a list of sorted records from Readers into a single output Writer
  #
  # The MergeTree implements a Tournament Tree algorithm to do a k-way merge
  # between Reader objects:
  #
  #   https://en.wikipedia.org/wiki/K-way_merge_algorithm
  #
  # Usage:
  #
  #   compare_fields = %w[ key timestamp ]
  #   format = ::FlatKit::Format.for('json')
  #
  #   readers = ARGV.map do |path|
  #     format.reader.new(source: path, compare_fields: compare_fields)
  #   end
  #
  #   write_path = "merged.jsonl"
  #   writer = format.writer.new(destination: write_path.to_s)
  #
  #   tree = ::FlatKit::MergeTree.new(readers)
  #
  #   tree.each do |record|
  #     writer.write(record)
  #   end
  #   writer.close
  #
  class MergeTree
    include Enumerable

    attr_reader :leaves
    attr_reader :levels
    attr_reader :readers

    # readers - the list of readers to merge. Readers that have no records
    #           stay in #readers but get no leaf in the tree.
    def initialize(readers)
      @readers = readers
      @leaves = []
      @levels = []

      @readers.each do |reader|
        leaf = leaf_for(reader)
        @leaves << leaf if leaf
      end

      # Need to pad the leaves to an even number so that the slicing by 2 for
      # the tournament will work
      if @leaves.size.odd?
        @leaves << SentinelLeafNode.new
      end

      init_tree
    end

    #
    # Initialize the tournament tree, go in depths - bottom layer will be the
    # winners of the 2 leaf nodes, continuing until top layer is just 1 node.
    # With no leaves at all there is nothing to build and no level is created.
    #
    def init_tree
      values = @leaves.dup
      loop do
        break if values.size <= 1

        winners = []

        # we always need a left and a right node, there is the possibility of
        # adding a single Sentinel node as the final right node in each level
        values.each_slice(2) do |left, right|
          right = SentinelInternalNode.new if right.nil?
          winners << InternalNode.new(left: left, right: right)
        end
        values = winners
        @levels << winners
      end
    end

    #
    # Root is the last level - should only have one node. Returns nil when
    # the tree has no levels.
    #
    def root
      @levels.last&.first
    end

    #
    # The number of levels, this should be the logn(readers.size)
    #
    def depth
      @levels.size
    end

    #
    # Iterate over all the elements from all the readers yielding them in
    # sorted order. Returns an Enumerator when no block is given.
    #
    def each
      return enum_for(:each) unless block_given?

      loop do
        top = root
        break if top.nil? || top.leaf.finished?
        yield top.value
        # consume the yielded value and have the tournament tree replay those
        # brackets affected
        top.leaf.update_and_replay
      end
    end

    private

    # Returns a LeafNode for +reader+, or nil when the reader has no records.
    # LeafNode fetches its first record while it is being built, which raises
    # StopIteration for an empty reader; the finished? check covers a leaf
    # that reports exhaustion instead of raising.
    def leaf_for(reader)
      leaf = LeafNode.new(reader)
      leaf.respond_to?(:finished?) && leaf.finished? ? nil : leaf
    rescue StopIteration
      nil
    end
  end
end