flat_kit 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. checksums.yaml +7 -0
  2. data/CONTRIBUTING.md +46 -0
  3. data/HISTORY.md +5 -0
  4. data/LICENSE.txt +21 -0
  5. data/Manifest.txt +66 -0
  6. data/README.md +80 -0
  7. data/Rakefile +20 -0
  8. data/bin/fk +4 -0
  9. data/lib/flat_kit.rb +23 -0
  10. data/lib/flat_kit/cli.rb +80 -0
  11. data/lib/flat_kit/command.rb +53 -0
  12. data/lib/flat_kit/command/cat.rb +93 -0
  13. data/lib/flat_kit/command/merge.rb +88 -0
  14. data/lib/flat_kit/command/sort.rb +88 -0
  15. data/lib/flat_kit/descendant_tracker.rb +27 -0
  16. data/lib/flat_kit/error.rb +5 -0
  17. data/lib/flat_kit/format.rb +34 -0
  18. data/lib/flat_kit/input.rb +32 -0
  19. data/lib/flat_kit/input/file.rb +53 -0
  20. data/lib/flat_kit/input/io.rb +54 -0
  21. data/lib/flat_kit/internal_node.rb +84 -0
  22. data/lib/flat_kit/jsonl.rb +8 -0
  23. data/lib/flat_kit/jsonl/format.rb +25 -0
  24. data/lib/flat_kit/jsonl/reader.rb +30 -0
  25. data/lib/flat_kit/jsonl/record.rb +84 -0
  26. data/lib/flat_kit/jsonl/writer.rb +45 -0
  27. data/lib/flat_kit/leaf_node.rb +71 -0
  28. data/lib/flat_kit/logger.rb +39 -0
  29. data/lib/flat_kit/merge.rb +35 -0
  30. data/lib/flat_kit/merge_tree.rb +104 -0
  31. data/lib/flat_kit/output.rb +32 -0
  32. data/lib/flat_kit/output/file.rb +55 -0
  33. data/lib/flat_kit/output/io.rb +73 -0
  34. data/lib/flat_kit/reader.rb +61 -0
  35. data/lib/flat_kit/record.rb +83 -0
  36. data/lib/flat_kit/sentinel_internal_node.rb +37 -0
  37. data/lib/flat_kit/sentinel_leaf_node.rb +37 -0
  38. data/lib/flat_kit/sort.rb +35 -0
  39. data/lib/flat_kit/writer.rb +38 -0
  40. data/lib/flat_kit/xsv.rb +8 -0
  41. data/lib/flat_kit/xsv/format.rb +25 -0
  42. data/lib/flat_kit/xsv/reader.rb +45 -0
  43. data/lib/flat_kit/xsv/record.rb +90 -0
  44. data/lib/flat_kit/xsv/writer.rb +70 -0
  45. data/tasks/default.rake +242 -0
  46. data/tasks/extension.rake +38 -0
  47. data/tasks/man.rake +7 -0
  48. data/tasks/this.rb +208 -0
  49. data/test/device_dataset.rb +117 -0
  50. data/test/input/test_file.rb +73 -0
  51. data/test/input/test_io.rb +93 -0
  52. data/test/jsonl/test_format.rb +22 -0
  53. data/test/jsonl/test_reader.rb +49 -0
  54. data/test/jsonl/test_record.rb +61 -0
  55. data/test/jsonl/test_writer.rb +68 -0
  56. data/test/output/test_file.rb +60 -0
  57. data/test/output/test_io.rb +104 -0
  58. data/test/test_conversions.rb +45 -0
  59. data/test/test_format.rb +24 -0
  60. data/test/test_helper.rb +26 -0
  61. data/test/test_merge.rb +40 -0
  62. data/test/test_merge_tree.rb +64 -0
  63. data/test/test_version.rb +11 -0
  64. data/test/xsv/test_format.rb +22 -0
  65. data/test/xsv/test_reader.rb +61 -0
  66. data/test/xsv/test_record.rb +69 -0
  67. data/test/xsv/test_writer.rb +68 -0
  68. metadata +237 -0
@@ -0,0 +1,32 @@
1
module FlatKit
  # Public: the base class for every output destination.
  #
  # Subclasses are expected to provide #name, #io and #close; the versions
  # defined here only raise NotImplementedError.
  class Output
    extend DescendantTracker

    # Public: Build an Output for the given object.
    #
    # out - an Output instance (returned untouched) or any object for which
    #       find_child(:handles?, out) yields a class.
    #       NOTE(review): find_child is presumably supplied by
    #       DescendantTracker - confirm against that module.
    #
    # Returns an Output instance.
    # Raises FlatKit::Error when no class is found for out.
    def self.from(out)
      return out if out.is_a?(::FlatKit::Output)

      handler = find_child(:handles?, out)
      return handler.new(out) if handler

      raise FlatKit::Error, "Unable to create output from #{out.class} : #{out.inspect}"
    end

    # Public: the display name of this output; subclasses must override.
    def name
      raise NotImplementedError, "#{self.class} must implement #name"
    end

    # Public: the underlying writable stream; subclasses must override.
    def io
      raise NotImplementedError, "#{self.class} must implement #io"
    end

    # Public: close the underlying stream; subclasses must override.
    def close
      raise NotImplementedError, "#{self.class} must implement #close"
    end
  end
end
30
+
31
+ require 'flat_kit/output/io'
32
+ require 'flat_kit/output/file'
@@ -0,0 +1,55 @@
1
+ require 'zlib'
2
+
3
require 'pathname'

module FlatKit
  class Output
    # Public: an Output that writes to a path on the filesystem.
    #
    # The parent directory of the path is created on construction, and the
    # file is opened for writing immediately. Paths ending in ".gz" are written
    # through Zlib::GzipWriter; everything else is opened in binary write mode.
    class File < Output
      attr_reader :path

      # Public: report whether this class can create an output for obj.
      #
      # obj - the candidate destination.
      #
      # Returns true for a Pathname, or for a String that is not one of the
      # names Output::IO treats as stdout / stderr; false otherwise.
      def self.handles?(obj)
        return true if obj.instance_of?(Pathname)
        return false unless obj.instance_of?(String)

        # in case these get loaded in different orders
        return false if ::FlatKit::Output::IO.is_stdout?(obj)
        return false if ::FlatKit::Output::IO.is_stderr?(obj)

        true
      end

      # Public: create the output and open its destination for writing.
      #
      # obj - a String or Pathname naming the destination file.
      def initialize(obj)
        @path = Pathname.new(obj)
        # make sure the directory that will hold the file exists
        path.dirname.mkpath
        @io = open_output(path)
      end

      # Public: the destination path as a String.
      def name
        path.to_s
      end

      # Public: close the underlying stream.
      def close
        @io.close
      end

      # internal api method for testing purposes
      def io
        @io
      end

      private

      # open the appropriate output type depending on the destination file name
      #
      # TODO: add in bzip
      def open_output(path)
        case path.extname
        when ".gz"
          Zlib::GzipWriter.open(path.to_s)
        # when ".gz"
        #   ::IO.popen("gzip -c > #{path}", "w")
        else
          path.open("wb")
        end
      end
    end
  end
end
@@ -0,0 +1,73 @@
1
require 'stringio'

module FlatKit
  class Output
    # Public: an Output wrapping an already-open stream, or one of the
    # standard streams selected by name.
    class IO < Output
      # Initialized to 0 in #initialize; nothing in this class changes it.
      attr_reader :count

      # Names that select standard output / standard error.
      STDOUTS = %w[ stdout STDOUT - <stdout> ].freeze
      STDERRS = %w[ stderr STDERR <stderr> ].freeze

      # Public: report whether this class can create an output for obj.
      #
      # Returns true for a stdout / stderr name or stream, or for any File,
      # StringIO or IO instance; false otherwise.
      def self.handles?(obj)
        return true if is_stderr?(obj)
        return true if is_stdout?(obj)

        [ ::File, ::StringIO, ::IO ].any? { |klass| obj.kind_of?(klass) }
      end

      # Public: true when obj is a String listed in STDERRS or an IO equal to
      # ::STDERR.
      def self.is_stderr?(obj)
        case obj
        when String
          STDERRS.include?(obj)
        when ::IO
          obj == ::STDERR
        else
          false
        end
      end

      # Public: true when obj is a String listed in STDOUTS or an IO equal to
      # ::STDOUT.
      def self.is_stdout?(obj)
        case obj
        when String
          STDOUTS.include?(obj)
        when ::IO
          obj == ::STDOUT
        else
          false
        end
      end

      # Public: wrap obj as an output.
      #
      # obj - a stdout / stderr name or stream, a File, a StringIO or an IO.
      #
      # Raises FlatKit::Error for anything else.
      def initialize(obj)
        @count = 0
        if self.class.is_stdout?(obj)
          @name = "<STDOUT>"
          @io = $stdout
        elsif self.class.is_stderr?(obj)
          @name = "<STDERR>"
          @io = $stderr
        elsif obj.kind_of?(::File)
          @name = obj.path
          @io = obj
        elsif obj.kind_of?(::StringIO) || obj.kind_of?(::IO)
          @name = obj.inspect
          @io = obj
        else
          raise ::FlatKit::Error, "Unable to create #{self.class} from #{obj.class} : #{obj.inspect}"
        end
      end

      # Public: the display name chosen in #initialize.
      def name
        @name
      end

      # The stream was opened by the caller, not by this class; closing it
      # here closes the caller's stream (including $stdout / $stderr when one
      # of those was selected).
      def close
        @io.close
      end

      # internal api method for testing
      def io
        @io
      end
    end
  end
end
@@ -0,0 +1,61 @@
1
module FlatKit
  # Public: the base class for all format readers.
  #
  # A format reader needs to be able to open the appropriate file format and
  # implement Enumerable to iterate over all the records in the file format.
  #
  # If it is appropriate for the reader to be able to read from an IO object
  # directly, that needs to be supported also.
  #
  # The ::FlatKit::Reader class should never be used directly, only the reader
  # from the appropriate format should be used.
  #
  #
  # API:
  #
  #   initialize(source:, compare_fields:)
  #   each -> Yields / returns
  #
  class Reader
    include Enumerable

    attr_reader :source
    attr_reader :compare_fields

    # Public: create the reader for a single path.
    #
    # path           - the source to read; defaults to "-".
    # fallback       - passed to Format.for_with_fallback! along with path.
    # compare_fields - handed to the reader; :none means no fields.
    #
    # Returns an instance of the reader class of the resolved format.
    def self.create_reader_from_path(path: "-", fallback: "auto", compare_fields: :none)
      format = ::FlatKit::Format.for_with_fallback!(path: path, fallback: fallback)
      format.reader.new(source: path, compare_fields: compare_fields)
    end

    # Public: create one reader per path.
    #
    # paths          - an Array of sources; when empty a single "-" is used.
    # fallback       - see create_reader_from_path.
    # compare_fields - see create_reader_from_path.
    #
    # The caller's Array is never modified, so frozen or shared arrays are
    # safe to pass in.
    #
    # Returns an Array of readers, in the same order as paths.
    def self.create_readers_from_paths(paths:, fallback: "auto", compare_fields: :none)
      # default to "-" if there are no paths; rebind the local instead of
      # appending to the array the caller handed in
      paths = ["-"] if paths.empty?

      paths.map do |path|
        create_reader_from_path(path: path, fallback: fallback, compare_fields: compare_fields)
      end
    end

    # source         - where the records come from.
    # compare_fields - the fields used for comparison, or :none for an empty
    #                  list.
    def initialize(source:, compare_fields: :none)
      @source = source
      @compare_fields = resolve_compare_fields(compare_fields)
    end

    # Public: the format name, delegated to the reader's class.
    def format_name
      self.class.format_name
    end

    # Public: iterate over the records; format readers must override.
    def each
      raise NotImplementedError, "#{self.class} needs to implement #each"
    end

    private

    # Translate the :none marker into an empty list; any other value is
    # returned unchanged.
    def resolve_compare_fields(value)
      return [] if value == :none

      value
    end
  end
end
@@ -0,0 +1,83 @@
1
module FlatKit
  # Public: The base class that all record classes should inherit from.
  #
  # Its goal is to be an efficient comparator of data that can be inflated from
  # a source structure to a fully realized hash.
  #
  # All records need to be able to be initialized from a data structure that
  # is handed to them by the Reader instance within the same Format.
  #
  # Records are generally not going to be created outside of this library, they
  # are tied to a specific format and provide a common interface that can be
  # used for:
  #
  # * comparison between records from different source / destination formats
  # * conversion to a different format
  #
  # Given that - the way to create a record is either from another Record
  # instance:
  #
  #   Record.from_record(other)  # create a record from another record
  #
  # or the way a Reader will do it
  #
  #   Record.new(...)            # generally only used by a Reader instance to
  #                              # yield new records
  #
  #
  # When implementing a new Format, the corresponding Record class for that
  # Format must:
  #
  # * implement `#[](key)` which will be used to lookup the values of the
  #   comparable fields.
  # * implement `#to_hash` which is used in conversions
  # * implement `.from_record` which is used in conversion
  # * have its initialize method call super(data:, compare_fields:) to
  #   initialize the root data structures
  class Record
    include Comparable

    attr_reader :data
    attr_reader :compare_fields

    # data           - the format specific source structure.
    # compare_fields - the list of keys used, in order, by #<=>.
    def initialize(data:, compare_fields:)
      @data = data
      @compare_fields = compare_fields
    end

    # Public: the format name, delegated to the record's class.
    def format_name
      self.class.format_name
    end

    # Public: compare this record to another, field by field.
    #
    # Fields are examined in compare_fields order and the first decisive field
    # ends the comparison:
    #
    # * both values nil          -> 0 (later fields are not examined)
    # * only this value nil      -> -1
    # * only the other value nil -> 1
    # * values not comparable    -> nil
    # * otherwise the first non-zero result of value <=> other_value
    #
    # Returns -1, 0 or 1; or nil when two values cannot be compared or when
    # compare_fields is empty.
    def <=>(other)
      compare_result = nil

      compare_fields.each do |field|
        my_val = self[field]
        other_val = other[field]

        return 0 if my_val.nil? && other_val.nil?
        return -1 if my_val.nil?
        return 1 if other_val.nil?

        compare_result = my_val <=> other_val

        # nil means the two values are not comparable; hand that back to the
        # caller instead of calling #zero? on nil.
        return compare_result if compare_result.nil? || !compare_result.zero?
      end
      compare_result
    end

    # Public: look up the value for key; format records must override.
    def [](key)
      raise NotImplementedError, "#{self.class} must implement #[](key)"
    end

    # Public: the fully realized hash; format records must override.
    def to_hash
      raise NotImplementedError, "#{self.class} must implement #to_hash"
    end

    # Public: create a record of this class from another record; format
    # records must override.
    #
    # record - the record to convert. Optional here so that both the
    #          documented call form and a bare call reach the error below.
    def self.from_record(record = nil)
      raise NotImplementedError, "#{self} must implement #{self}.from_record"
    end
  end
end
@@ -0,0 +1,37 @@
1
module FlatKit
  # Private: The Sentinel Internal Node is a private class used by the MergeTree
  # class.
  #
  # This class represents an empty / completed node in the merge tree where all
  # the data from the descendant leaf node is fully used up.
  #
  class SentinelInternalNode
    include Comparable

    attr_reader :left, :right, :winner
    attr_accessor :next_level

    # The left: and right: keywords are accepted but not stored; every
    # attribute of a sentinel starts out nil.
    def initialize(left: nil, right: nil)
      @left = @right = @winner = @next_level = nil
    end

    # Always true for this class.
    def sentinel?
      true
    end

    # Always true for this class.
    def leaf?
      true
    end

    # A sentinel node is equal to any other sentinel and greater than
    # everything else.
    def <=>(other)
      other.sentinel? ? 0 : 1
    end
  end
end
@@ -0,0 +1,37 @@
1
module FlatKit
  # Private: The Sentinel Leaf Node is used internally by the MergeTree
  #
  # This class represents a LeafNode that has no more data
  #
  class SentinelLeafNode
    include Comparable

    attr_accessor :next_level

    # Always true for this class.
    def sentinel?
      true
    end

    # Always true for this class.
    def leaf?
      true
    end

    # Always true for this class.
    def finished?
      true
    end

    # A sentinel never has a following item.
    def next
      nil
    end

    # A sentinel never has a value.
    def value
      nil
    end

    # A sentinel node is equal to any other sentinel and greater than
    # everything else.
    def <=>(other)
      other.sentinel? ? 0 : 1
    end
  end
end
@@ -0,0 +1,35 @@
1
module FlatKit
  # Public: read every record from one input, sort the records in memory and
  # write them to one output.
  class Sort
    attr_reader :reader, :writer, :compare_fields

    # input           - the path handed to Reader.create_reader_from_path.
    # input_fallback  - the fallback handed to the reader factory.
    # output          - the path handed to Writer.create_writer_from_path.
    # output_fallback - the fallback handed to the writer factory.
    # compare_fields  - the fields handed to the reader and joined with ", "
    #                   in the log output.
    def initialize(input:, input_fallback: "auto",
                   output:, output_fallback: "auto",
                   compare_fields:)

      @compare_fields = compare_fields
      @reader = ::FlatKit::Reader.create_reader_from_path(
        path: input,
        compare_fields: @compare_fields,
        fallback: input_fallback
      )
      @writer = ::FlatKit::Writer.create_writer_from_path(
        path: output,
        fallback: output_fallback,
        reader_format: @reader.format_name
      )
    end

    # Public: run the sort.
    #
    # All records are collected into an Array, sorted in place, written out
    # one at a time, and the writer is closed. Progress is reported through
    # FlatKit.logger at info level.
    def call
      log_info "Sorting #{reader.source} into #{writer.destination} using key #{compare_fields.join(", ")}"

      records = []
      reader.each { |record| records << record }
      log_info "Read #{reader.count} records into #{records.size} element array"

      records.sort!
      log_info "Sorted #{records.size} records"

      records.each { |record| writer.write(record) }
      writer.close
      log_info "Wrote #{writer.count} records to #{writer.destination}"
    end

    private

    # Send one message to the library logger at info level.
    def log_info(message)
      ::FlatKit.logger.info message
    end
  end
end
@@ -0,0 +1,38 @@
1
module FlatKit
  # Public: The base class for all format writers.
  #
  # A format writer writes Records, and only Records of its own format.
  #
  # It must implement a #write method that takes a Record. It can convert the
  # record to one matching its own format if it wishes, but it should in any
  # case check the Record format to make sure it matches.
  #
  # See the Xsv::Writer and Jsonl::Writer for examples.
  #
  class Writer
    attr_reader :destination

    # Public: create the writer for a path.
    #
    # path          - the destination; also passed to the writer as
    #                 destination:.
    # fallback      - the fallback handed to Format.for_with_fallback!; the
    #                 value "auto" is replaced by reader_format.
    # reader_format - the format name to fall back to when fallback is "auto".
    #
    # Returns an instance of the writer class of the resolved format.
    def self.create_writer_from_path(path:, fallback:, reader_format:)
      effective_fallback = fallback == "auto" ? reader_format : fallback
      writer_class = ::FlatKit::Format.for_with_fallback!(path: path, fallback: effective_fallback).writer
      writer_class.new(destination: path)
    end

    # destination - where the records are written.
    def initialize(destination:)
      @destination = destination
    end

    # Public: the format name, delegated to the writer's class.
    def format_name
      self.class.format_name
    end

    # Public: write a single record; format writers must override.
    def write(record)
      raise NotImplementedError, "#{self.class} needs to implement #write"
    end

    # Public: finish writing; format writers must override.
    def close
      raise NotImplementedError, "#{self.class} needs to implement #close"
    end
  end
end