flat_kit 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (68) hide show
  1. checksums.yaml +7 -0
  2. data/CONTRIBUTING.md +46 -0
  3. data/HISTORY.md +5 -0
  4. data/LICENSE.txt +21 -0
  5. data/Manifest.txt +66 -0
  6. data/README.md +80 -0
  7. data/Rakefile +20 -0
  8. data/bin/fk +4 -0
  9. data/lib/flat_kit.rb +23 -0
  10. data/lib/flat_kit/cli.rb +80 -0
  11. data/lib/flat_kit/command.rb +53 -0
  12. data/lib/flat_kit/command/cat.rb +93 -0
  13. data/lib/flat_kit/command/merge.rb +88 -0
  14. data/lib/flat_kit/command/sort.rb +88 -0
  15. data/lib/flat_kit/descendant_tracker.rb +27 -0
  16. data/lib/flat_kit/error.rb +5 -0
  17. data/lib/flat_kit/format.rb +34 -0
  18. data/lib/flat_kit/input.rb +32 -0
  19. data/lib/flat_kit/input/file.rb +53 -0
  20. data/lib/flat_kit/input/io.rb +54 -0
  21. data/lib/flat_kit/internal_node.rb +84 -0
  22. data/lib/flat_kit/jsonl.rb +8 -0
  23. data/lib/flat_kit/jsonl/format.rb +25 -0
  24. data/lib/flat_kit/jsonl/reader.rb +30 -0
  25. data/lib/flat_kit/jsonl/record.rb +84 -0
  26. data/lib/flat_kit/jsonl/writer.rb +45 -0
  27. data/lib/flat_kit/leaf_node.rb +71 -0
  28. data/lib/flat_kit/logger.rb +39 -0
  29. data/lib/flat_kit/merge.rb +35 -0
  30. data/lib/flat_kit/merge_tree.rb +104 -0
  31. data/lib/flat_kit/output.rb +32 -0
  32. data/lib/flat_kit/output/file.rb +55 -0
  33. data/lib/flat_kit/output/io.rb +73 -0
  34. data/lib/flat_kit/reader.rb +61 -0
  35. data/lib/flat_kit/record.rb +83 -0
  36. data/lib/flat_kit/sentinel_internal_node.rb +37 -0
  37. data/lib/flat_kit/sentinel_leaf_node.rb +37 -0
  38. data/lib/flat_kit/sort.rb +35 -0
  39. data/lib/flat_kit/writer.rb +38 -0
  40. data/lib/flat_kit/xsv.rb +8 -0
  41. data/lib/flat_kit/xsv/format.rb +25 -0
  42. data/lib/flat_kit/xsv/reader.rb +45 -0
  43. data/lib/flat_kit/xsv/record.rb +90 -0
  44. data/lib/flat_kit/xsv/writer.rb +70 -0
  45. data/tasks/default.rake +242 -0
  46. data/tasks/extension.rake +38 -0
  47. data/tasks/man.rake +7 -0
  48. data/tasks/this.rb +208 -0
  49. data/test/device_dataset.rb +117 -0
  50. data/test/input/test_file.rb +73 -0
  51. data/test/input/test_io.rb +93 -0
  52. data/test/jsonl/test_format.rb +22 -0
  53. data/test/jsonl/test_reader.rb +49 -0
  54. data/test/jsonl/test_record.rb +61 -0
  55. data/test/jsonl/test_writer.rb +68 -0
  56. data/test/output/test_file.rb +60 -0
  57. data/test/output/test_io.rb +104 -0
  58. data/test/test_conversions.rb +45 -0
  59. data/test/test_format.rb +24 -0
  60. data/test/test_helper.rb +26 -0
  61. data/test/test_merge.rb +40 -0
  62. data/test/test_merge_tree.rb +64 -0
  63. data/test/test_version.rb +11 -0
  64. data/test/xsv/test_format.rb +22 -0
  65. data/test/xsv/test_reader.rb +61 -0
  66. data/test/xsv/test_record.rb +69 -0
  67. data/test/xsv/test_writer.rb +68 -0
  68. metadata +237 -0
@@ -0,0 +1,8 @@
1
+ module FlatKit
2
+ module Jsonl # namespace for the jsonl format classes (record, reader, writer, format) required below
3
+ end
4
+ end
5
+ require 'flat_kit/jsonl/record'
6
+ require 'flat_kit/jsonl/reader'
7
+ require 'flat_kit/jsonl/writer'
8
+ require 'flat_kit/jsonl/format'
@@ -0,0 +1,25 @@
1
module FlatKit
  module Jsonl
    # Format descriptor for the "jsonl" format: exposes the format name, a
    # filename test, and the Reader / Writer classes that go with it.
    class Format < ::FlatKit::Format
      class << self
        # The name this format is known by.
        def format_name
          "jsonl"
        end

        # Returns true when any dot-separated segment of +filename+ is one of
        # json, jsonl or ndjson; false otherwise.
        def handles?(filename)
          segments = filename.split(".")
          %w[ json jsonl ndjson ].any? { |ext| segments.include?(ext) }
        end

        # The class used to read this format.
        def reader
          ::FlatKit::Jsonl::Reader
        end

        # The class used to write this format.
        def writer
          ::FlatKit::Jsonl::Writer
        end
      end
    end
  end
end
@@ -0,0 +1,30 @@
1
module FlatKit
  module Jsonl
    # Reads a source line by line, wrapping every line in a
    # ::FlatKit::Jsonl::Record.
    class Reader < ::FlatKit::Reader
      # The input object built from the source by ::FlatKit::Input.from
      attr_reader :input

      # Number of records yielded so far
      attr_reader :count

      def self.format_name
        ::FlatKit::Jsonl::Format.format_name
      end

      # source         - passed to ::FlatKit::Input.from to obtain the input
      # compare_fields - forwarded to every Record created by #each
      def initialize(source:, compare_fields: :none)
        super
        @input = ::FlatKit::Input.from(source)
        @count = 0
      end

      # Yield one Record per line of the input and close the input afterwards.
      #
      # The input is closed in an ensure so that it is released when parsing a
      # line or the caller's block raises, not only when the input is read to
      # its end.
      #
      # Raises ::FlatKit::Error wrapping any StandardError raised while
      # reading, yielding or closing.
      def each
        begin
          while (line = input.io.gets)
            record = ::FlatKit::Jsonl::Record.new(data: line, compare_fields: compare_fields)
            @count += 1
            yield record
          end
        ensure
          input.close
        end
      rescue => e
        ::FlatKit.logger.error "Error reading jsonl records from #{input.name}: #{e}"
        raise ::FlatKit::Error, e
      end
    end
  end
end
@@ -0,0 +1,84 @@
1
+ require 'oj'
2
+ require 'flat_kit/record'
3
+
4
module FlatKit
  module Jsonl
    # A single jsonl record. It may be backed by the raw JSON text (data), by
    # an already parsed structure (complete_structured_data), or by both.
    # compare_data holds the values looked up through #[].
    class Record < ::FlatKit::Record
      attr_reader :compare_data

      def self.format_name
        ::FlatKit::Jsonl::Format.format_name
      end

      # Build a Jsonl::Record from another record.
      #
      # An exact Jsonl::Record is copied field by field, reusing its parsed
      # structure only when it already has one. Any other record is converted
      # through its #to_hash.
      def self.from_record(record)
        if record.instance_of?(FlatKit::Jsonl::Record)
          structured = record.complete_structured_data? ? record.complete_structured_data : nil

          new(data: record.data, compare_fields: record.compare_fields,
              compare_data: record.compare_data,
              complete_structured_data: structured)
        else
          new(data: nil, compare_fields: record.compare_fields,
              complete_structured_data: record.to_hash)
        end
      end

      # data                     - raw JSON text for the record, may be nil
      # compare_fields           - the fields to extract for comparison
      # compare_data             - previously extracted compare values, if any
      # complete_structured_data - previously parsed form of data, if any
      def initialize(data:, compare_fields: :none,
                     compare_data: Hash.new,
                     complete_structured_data: nil)
        super(data: data, compare_fields: compare_fields)

        @complete_structured_data = complete_structured_data

        # an explicit nil is treated the same as "no compare data supplied"
        compare_data ||= {}

        if !compare_data.empty?
          @compare_data = compare_data
        elsif complete_structured_data?
          # The parsed structure already answers every lookup. Do not run
          # quick_parse here: @compare_data is the very same hash, and writing
          # the compare fields into it would add keys to the structured data.
          @compare_data = complete_structured_data
        else
          # only load compare data if it doesn't exist
          @compare_data = compare_data
          quick_parse if data
        end
      end

      def [](key)
        compare_data[key]
      end

      # The fully parsed record, parsed from the raw text on first use.
      #
      # NOTE(review): Oj.load runs with Oj's default options here; if records
      # can come from untrusted input, confirm the Oj mode in effect is a safe
      # one.
      def complete_structured_data
        @complete_structured_data ||= Oj.load(data)
      end
      alias to_hash complete_structured_data

      # True when a non-empty parsed structure is already in memory.
      def complete_structured_data?
        !(@complete_structured_data.nil? || @complete_structured_data.empty?)
      end

      # overriding parent accessor since we may be initialized without raw bytes
      # to parse
      def data
        if @data.nil? && complete_structured_data?
          @data = Oj.dump(complete_structured_data)
        end
        @data
      end
      alias to_s data

      private

      # Extract only the compare fields from the raw text, without building
      # the whole structure.
      def quick_parse
        # the default compare_fields of :none is not a list of fields, so
        # there is nothing to extract
        return unless compare_fields.respond_to?(:each)

        Oj::Doc.open(data) do |doc|
          compare_fields.each do |field|
            @compare_data[field] = doc.fetch("/#{field}")
          end
        end
      end
    end
  end
end
82
+
83
+
84
+
@@ -0,0 +1,45 @@
1
module FlatKit
  module Jsonl
    # Writes records to a destination, one record per line.
    class Writer < ::FlatKit::Writer
      # The output object built from the destination by ::FlatKit::Output.from
      attr_reader :output

      # Number of records written so far
      attr_reader :count

      def self.format_name
        ::FlatKit::Jsonl::Format.format_name
      end

      def initialize(destination:)
        super
        @output = ::FlatKit::Output.from(@destination)
        @count = 0
      end

      # Write a single record. A Jsonl::Record is written as is, any other
      # FlatKit::Record is converted with Jsonl::Record.from_record first.
      #
      # Raises FlatKit::Error for anything that is not a FlatKit::Record, and
      # wraps any other StandardError raised while writing in a
      # FlatKit::Error.
      def write(record)
        case record
        when FlatKit::Jsonl::Record
          write_record(record)
        when FlatKit::Record
          converted_record = ::FlatKit::Jsonl::Record.from_record(record)
          write_record(converted_record)
        else
          raise FlatKit::Error, "Unable to write records of type #{record.class}"
        end
      rescue FlatKit::Error
        # already the library's error type, pass it through untouched
        raise
      rescue => e
        # this is the write path, so report it as a write failure
        ::FlatKit.logger.error "Error writing jsonl records to #{output.name}: #{e}"
        raise ::FlatKit::Error, e
      end

      def close
        @output.close
      end

      # Write an already converted record and bump the count.
      def write_record(record)
        # puts enforces ending in a newline if it doesn't already have one
        output.io.puts record.to_s
        @count += 1
      end
    end
  end
end
@@ -0,0 +1,71 @@
1
module FlatKit
  # Private: The LeafNode is a wrapper around a Reader object to enable
  # a consistent api for use in the MergeTree
  #
  # The LeafNode keeps track of the head of the reader list and when its value
  # is used up, it pulls the next value and then notifies the next level of the
  # MergeTree that its value has changed and so should do another play.
  #
  # If all the data is used up from the reader, it also notifies the next level
  # of that so the next level can remove it from the tree.
  class LeafNode
    include Comparable

    attr_reader :reader
    attr_reader :value

    attr_accessor :next_level

    # reader - any object whose #to_enum enumerates the values to merge
    #
    # The first value is loaded immediately.
    #
    # NOTE(review): a reader with no values at all raises StopIteration here,
    # because this first fetch is not rescued the way #next is - confirm how
    # empty inputs are meant to be handled before changing it.
    def initialize(reader)
      @reader = reader
      @enum = @reader.to_enum
      @value = @enum.next

      @next_level = nil
    end

    def winner
      value
    end

    def sentinel?
      false
    end

    def leaf?
      true
    end

    def leaf
      self
    end

    # Advance to the next value, report to the next level when the reader is
    # used up, then ask the next level to play again.
    def update_and_replay
      self.next
      if finished?
        ::FlatKit.logger.debug "#{reader.source} has finished reading #{reader.count} records"
        next_level.player_finished(self)
      end
      next_level.play
    end

    # Load and return the next value, or nil once the reader is used up.
    def next
      begin
        @value = @enum.next
      rescue StopIteration
        @value = nil
      end
      @value
    end

    def finished?
      @enum && @value.nil?
    end

    # Order nodes by their current value; a sentinel always sorts after a
    # real leaf.
    #
    # Returns nil for objects that are not tree nodes, so that Comparable
    # methods such as == answer false instead of raising NoMethodError.
    def <=>(other)
      return nil unless other.respond_to?(:sentinel?)
      return -1 if other.sentinel?
      self.value.<=>(other.value)
    end
  end
end
@@ -0,0 +1,39 @@
1
+ require 'logger'
2
+
3
module FlatKit
  # Formats each log line as "<UTC timestamp> <pid> <severity> : <message>".
  class LogFormatter < ::Logger::Formatter
    FORMAT = "%s %5d %05s : %s\n".freeze
    DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ".freeze

    def initialize
      super
      self.datetime_format = DATETIME_FORMAT
    end

    # Build the log line. The program name is not part of the output.
    def call(severity, time, _progname, msg)
      # getutc returns a new Time; Time#utc would convert the caller's object
      # in place.
      FORMAT % [format_datetime(time.getutc), Process.pid, severity, msg2str(msg)]
    end
  end

  # Factory for ::Logger instances that use the LogFormatter.
  class Logger
    # io - an already open, writable stream
    def self.for_io(io)
      ::Logger.new(io, formatter: LogFormatter.new)
    end

    # path - location of a log file (anything with #to_s), opened for append
    def self.for_path(path)
      io = File.open(path.to_s, "a")
      # write log lines through immediately rather than leaving them in the
      # file buffer
      io.sync = true
      for_io(io)
    end
  end

  # Replace the library logger.
  #
  # destination - an IO-like object (responds to both write and close, for
  #               example $stderr or a StringIO) or a path to append to.
  #
  # Returns the new logger.
  def self.log_to(destination = $stderr)
    # Same duck-type test ::Logger uses for its log device. A StringIO is not
    # an ::IO, and a Pathname responds to write but not to close, so both end
    # up in the right branch.
    io_like = destination.respond_to?(:write) && destination.respond_to?(:close)

    @logger = if io_like
                ::FlatKit::Logger.for_io(destination)
              else
                ::FlatKit::Logger.for_path(destination)
              end
  end

  # The library logger, writing to $stderr unless log_to was called.
  def self.logger
    @logger ||= ::FlatKit::Logger.for_io($stderr)
  end
end
@@ -0,0 +1,35 @@
1
module FlatKit
  # Merges several inputs into one output, ordering records by compare_fields.
  class Merge
    attr_reader :readers
    attr_reader :writer
    attr_reader :compare_fields

    # inputs          - the paths to read from
    # input_fallback  - passed as fallback: when creating the readers
    # output          - the path to write to
    # output_fallback - passed as fallback: when creating the writer
    # compare_fields  - the fields records are ordered by
    #
    # The writer is created with the format name of the first reader.
    def initialize(inputs:, input_fallback: "auto",
                   output:, output_fallback: "auto",
                   compare_fields:)
      @compare_fields = compare_fields
      @readers = ::FlatKit::Reader.create_readers_from_paths(paths: inputs, compare_fields: @compare_fields,
                                                             fallback: input_fallback)
      @writer = ::FlatKit::Writer.create_writer_from_path(path: output, fallback: output_fallback,
                                                          reader_format: @readers.first.format_name)
    end

    # Run the merge: log the plan, stream every record from the merge tree to
    # the writer, then log per-reader and total record counts.
    def call
      ::FlatKit.logger.info "Merging the following files into #{writer.destination}"
      ::FlatKit.logger.info "Using this key for sorting: #{compare_fields.join(", ")}"
      readers.each do |r|
        ::FlatKit.logger.info "  #{r.source}"
      end

      begin
        merge_tree = ::FlatKit::MergeTree.new(readers)
        merge_tree.each do |record|
          writer.write(record)
        end
        readers.each do |r|
          ::FlatKit.logger.info "  #{r.source} produced #{r.count} records"
        end
      ensure
        # close the writer even when the merge raises, so the output is not
        # left open
        writer.close
      end
      ::FlatKit.logger.info "Wrote #{writer.count} records to #{writer.destination}"
    end
  end
end
@@ -0,0 +1,104 @@
1
module FlatKit
  # Public: Merge a list of sorted records from Readers into a single output Writer
  #
  # The MergeTree implements a Tournament Tree algorithm to do a k-way merge
  # between Reader objects:
  #
  #   https://en.wikipedia.org/wiki/K-way_merge_algorithm
  #
  # Usage:
  #
  #   compare_fields = %w[ key timestamp ]
  #   format = ::FlatKit::Format.for('json')
  #
  #   readers = ARGV.map do |path|
  #     format.reader.new(source: path, compare_fields: compare_fields)
  #   end
  #
  #   write_path = "merged.jsonl"
  #   writer = format.writer.new(destination: write_path.to_s)
  #
  #   tree = ::FlatKit::MergeTree.new(readers)
  #
  #   tree.each do |record|
  #     writer.write(record)
  #   end
  #   writer.close
  #
  class MergeTree
    include Enumerable

    attr_reader :leaves
    attr_reader :levels
    attr_reader :readers

    # readers - the list of readers to merge; each one is wrapped in a
    #           LeafNode. An empty list gives a tree with no levels that
    #           yields nothing.
    def initialize(readers)
      @readers = readers
      @leaves = []
      @levels = []

      @readers.each do |reader|
        @leaves << LeafNode.new(reader)
      end

      # Need to pad the leaves to an even number so that the slicing by 2 for
      # the tournament will work
      if @leaves.size.odd?
        @leaves << SentinelLeafNode.new
      end

      init_tree
    end

    #
    # Initialize the tournament tree, go in depths - bottom layer will be the
    # winners of the 2 leaf nodes, continuing until top layer is just 1 node
    #
    def init_tree
      values = @leaves.dup

      # "> 1" rather than "until exactly 1": with no leaves at all the size
      # never becomes 1, and the loop would never terminate.
      while values.size > 1
        # we always need a left and a right node, there is the possibility of
        # adding a single Sentinel node as the final right node in each level
        winners = values.each_slice(2).map do |left, right|
          right = SentinelInternalNode.new if right.nil?
          InternalNode.new(left: left, right: right)
        end

        values = winners
        @levels << winners
      end
    end

    #
    # Root is the last level - should only have one node. Returns nil when
    # the tree has no levels (no readers were given).
    #
    def root
      top_level = @levels.last
      top_level && top_level.first
    end

    #
    # The number of levels, this should be about log2(readers.size)
    #
    def depth
      @levels.size
    end

    #
    # Iterate over all the elements from all the readers yielding them in sorted
    # order. Returns an Enumerator when no block is given.
    #
    def each
      return to_enum(:each) unless block_given?
      return if root.nil?

      loop do
        break if root.leaf.finished?
        yield root.value
        # consume the yielded value and have the tournament tree replay those
        # brackets affected
        root.leaf.update_and_replay
      end
    end
  end
end