flat_kit 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (68) hide show
  1. checksums.yaml +7 -0
  2. data/CONTRIBUTING.md +46 -0
  3. data/HISTORY.md +5 -0
  4. data/LICENSE.txt +21 -0
  5. data/Manifest.txt +66 -0
  6. data/README.md +80 -0
  7. data/Rakefile +20 -0
  8. data/bin/fk +4 -0
  9. data/lib/flat_kit.rb +23 -0
  10. data/lib/flat_kit/cli.rb +80 -0
  11. data/lib/flat_kit/command.rb +53 -0
  12. data/lib/flat_kit/command/cat.rb +93 -0
  13. data/lib/flat_kit/command/merge.rb +88 -0
  14. data/lib/flat_kit/command/sort.rb +88 -0
  15. data/lib/flat_kit/descendant_tracker.rb +27 -0
  16. data/lib/flat_kit/error.rb +5 -0
  17. data/lib/flat_kit/format.rb +34 -0
  18. data/lib/flat_kit/input.rb +32 -0
  19. data/lib/flat_kit/input/file.rb +53 -0
  20. data/lib/flat_kit/input/io.rb +54 -0
  21. data/lib/flat_kit/internal_node.rb +84 -0
  22. data/lib/flat_kit/jsonl.rb +8 -0
  23. data/lib/flat_kit/jsonl/format.rb +25 -0
  24. data/lib/flat_kit/jsonl/reader.rb +30 -0
  25. data/lib/flat_kit/jsonl/record.rb +84 -0
  26. data/lib/flat_kit/jsonl/writer.rb +45 -0
  27. data/lib/flat_kit/leaf_node.rb +71 -0
  28. data/lib/flat_kit/logger.rb +39 -0
  29. data/lib/flat_kit/merge.rb +35 -0
  30. data/lib/flat_kit/merge_tree.rb +104 -0
  31. data/lib/flat_kit/output.rb +32 -0
  32. data/lib/flat_kit/output/file.rb +55 -0
  33. data/lib/flat_kit/output/io.rb +73 -0
  34. data/lib/flat_kit/reader.rb +61 -0
  35. data/lib/flat_kit/record.rb +83 -0
  36. data/lib/flat_kit/sentinel_internal_node.rb +37 -0
  37. data/lib/flat_kit/sentinel_leaf_node.rb +37 -0
  38. data/lib/flat_kit/sort.rb +35 -0
  39. data/lib/flat_kit/writer.rb +38 -0
  40. data/lib/flat_kit/xsv.rb +8 -0
  41. data/lib/flat_kit/xsv/format.rb +25 -0
  42. data/lib/flat_kit/xsv/reader.rb +45 -0
  43. data/lib/flat_kit/xsv/record.rb +90 -0
  44. data/lib/flat_kit/xsv/writer.rb +70 -0
  45. data/tasks/default.rake +242 -0
  46. data/tasks/extension.rake +38 -0
  47. data/tasks/man.rake +7 -0
  48. data/tasks/this.rb +208 -0
  49. data/test/device_dataset.rb +117 -0
  50. data/test/input/test_file.rb +73 -0
  51. data/test/input/test_io.rb +93 -0
  52. data/test/jsonl/test_format.rb +22 -0
  53. data/test/jsonl/test_reader.rb +49 -0
  54. data/test/jsonl/test_record.rb +61 -0
  55. data/test/jsonl/test_writer.rb +68 -0
  56. data/test/output/test_file.rb +60 -0
  57. data/test/output/test_io.rb +104 -0
  58. data/test/test_conversions.rb +45 -0
  59. data/test/test_format.rb +24 -0
  60. data/test/test_helper.rb +26 -0
  61. data/test/test_merge.rb +40 -0
  62. data/test/test_merge_tree.rb +64 -0
  63. data/test/test_version.rb +11 -0
  64. data/test/xsv/test_format.rb +22 -0
  65. data/test/xsv/test_reader.rb +61 -0
  66. data/test/xsv/test_record.rb +69 -0
  67. data/test/xsv/test_writer.rb +68 -0
  68. metadata +237 -0
@@ -0,0 +1,32 @@
1
module FlatKit
  # Public: the base class for all output destinations.
  #
  # Concrete outputs inherit from this class and supply #name, #io and #close.
  # Child classes are looked up through the DescendantTracker extension.
  class Output
    extend DescendantTracker

    # Public: Build an Output for the given destination.
    #
    # out - an Output instance, which is returned as is, or an object that one
    #       of the child classes answers `handles?` for (presumably what
    #       find_child checks - confirm against DescendantTracker).
    #
    # Returns an Output instance.
    # Raises FlatKit::Error if no child class is found for `out`.
    def self.from(out)
      return out if out.kind_of?(::FlatKit::Output)

      handler = find_child(:handles?, out)
      return handler.new(out) if handler

      raise FlatKit::Error, "Unable to create output from #{out.class} : #{out.inspect}"
    end

    # Public: The name of this output. Child classes must override.
    #
    # Raises NotImplementedError always.
    def name
      raise NotImplementedError, "#{self.class} must implement #name"
    end

    # Internal: The underlying io object. Child classes must override.
    #
    # Raises NotImplementedError always.
    def io
      raise NotImplementedError, "#{self.class} must implement #io"
    end

    # Public: Close the output. Child classes must override.
    #
    # Raises NotImplementedError always.
    def close
      raise NotImplementedError, "#{self.class} must implement #close"
    end
  end
end
30
+
31
+ require 'flat_kit/output/io'
32
+ require 'flat_kit/output/file'
@@ -0,0 +1,55 @@
1
+ require 'zlib'
2
+
3
module FlatKit
  class Output
    # Public: An Output that writes to a file on disk, identified by a String
    # or Pathname. Destinations ending in ".gz" are written through a gzip
    # writer.
    class File < Output
      attr_reader :path

      # Internal api for testing purposes: the opened io object.
      attr_reader :io

      # Public: Does this class handle the given destination object?
      #
      # obj - the candidate destination
      #
      # Returns true for a Pathname, or for a String that is not one of the
      # stdout / stderr names. Returns false for everything else.
      def self.handles?(obj)
        return true if obj.instance_of?(Pathname)

        # The stdout / stderr checks are done here too, in case the child
        # classes get loaded (and so asked) in a different order.
        obj.instance_of?(String) &&
          !::FlatKit::Output::IO.is_stdout?(obj) &&
          !::FlatKit::Output::IO.is_stderr?(obj)
      end

      # Public: Open the destination for writing, creating the parent
      # directories when they do not exist.
      #
      # obj - the String or Pathname of the file to write
      def initialize(obj)
        @path = Pathname.new(obj)
        @path.dirname.mkpath
        @io = open_output(@path)
      end

      # Public: The path of the destination as a String.
      def name
        path.to_s
      end

      # Public: Close the underlying io.
      def close
        io.close
      end

      private

      # Open the appropriate output type depending on the destination file name.
      #
      # TODO: add in bzip
      #
      # path - the Pathname of the destination
      #
      # Returns a Zlib::GzipWriter for ".gz" files, otherwise a File opened in
      # binary write mode.
      def open_output(path)
        return Zlib::GzipWriter.open(path.to_s) if path.extname == ".gz"

        path.open("wb")
      end
    end
  end
end
@@ -0,0 +1,73 @@
1
module FlatKit
  class Output
    # Public: An Output wrapping an already existing stream: stdout, stderr
    # (given as a name or as the stream itself), a File, a StringIO or any
    # other IO.
    class IO < Output
      # The count is set to 0 on creation; nothing in this class updates it.
      attr_reader :count

      # Public: The display name of the stream.
      attr_reader :name

      # Internal api for testing: the wrapped io object.
      attr_reader :io

      # Names that are accepted as meaning standard out / standard error.
      # Frozen so the shared lists can not be altered at runtime.
      STDOUTS = %w[ stdout STDOUT - <stdout> ].freeze
      STDERRS = %w[ stderr STDERR <stderr> ].freeze

      # Public: Does this class handle the given destination object?
      #
      # obj - the candidate destination
      #
      # Returns true for a stdout / stderr name or stream, a StringIO or any
      # IO (which includes File), false otherwise.
      def self.handles?(obj)
        return true if is_stderr?(obj)
        return true if is_stdout?(obj)

        obj.kind_of?(::StringIO) || obj.kind_of?(::IO)
      end

      # Public: Is the object one of the stderr names, or STDERR itself?
      #
      # Returns true or false.
      def self.is_stderr?(obj)
        case obj
        when String then STDERRS.include?(obj)
        when ::IO   then obj == ::STDERR
        else false
        end
      end

      # Public: Is the object one of the stdout names, or STDOUT itself?
      #
      # Returns true or false.
      def self.is_stdout?(obj)
        case obj
        when String then STDOUTS.include?(obj)
        when ::IO   then obj == ::STDOUT
        else false
        end
      end

      # Public: Wrap the given stream.
      #
      # obj - a stdout / stderr name or stream, a File, a StringIO or an IO
      #
      # Raises FlatKit::Error when obj is none of the above.
      def initialize(obj)
        @count = 0
        if self.class.is_stdout?(obj)
          @name = "<STDOUT>"
          @io = $stdout
        elsif self.class.is_stderr?(obj)
          @name = "<STDERR>"
          @io = $stderr
        elsif obj.kind_of?(::File)
          @name = obj.path
          @io = obj
        elsif obj.kind_of?(::StringIO) || obj.kind_of?(::IO)
          @name = obj.inspect
          @io = obj
        else
          raise ::FlatKit::Error, "Unable to create #{self.class} from #{obj.class} : #{obj.inspect}"
        end
      end

      # Public: Close the wrapped stream.
      #
      # The stream was opened by someone else, but it is still closed here.
      # NOTE(review): this also closes $stdout / $stderr when they are the
      # wrapped stream - confirm that is the intent.
      def close
        @io.close
      end
    end
  end
end
@@ -0,0 +1,61 @@
1
module FlatKit
  # Public: the base class for all format readers.
  #
  # A format reader needs to be able to open the appropriate file format and
  # implement Enumerable to iterate over all the records in the file format.
  #
  # If it is appropriate for the reader to be able to read from an IO object
  # directly, that needs to be supported also.
  #
  # The ::FlatKit::Reader class should never be used directly, only the reader
  # from the appropriate format should be used.
  #
  #
  # API:
  #
  #   initialize(source:, compare_fields:)
  #   each -> Yields / returns
  #
  class Reader
    include Enumerable

    attr_reader :source
    attr_reader :compare_fields

    # Public: Create the reader of the format matching the given path.
    #
    # path           - the source to read, "-" by default
    # fallback       - the format fallback handed to the Format lookup
    # compare_fields - the fields handed to the new reader, or :none
    #
    # Returns an instance of the reader class of the resolved format.
    def self.create_reader_from_path(path: "-", fallback: "auto", compare_fields: :none)
      format = ::FlatKit::Format.for_with_fallback!(path: path, fallback: fallback)
      format.reader.new(source: path, compare_fields: compare_fields)
    end

    # Public: Create one reader per path.
    #
    # paths          - the list of sources; when empty a single "-" source is
    #                  used. The list itself is never modified.
    # fallback       - the format fallback handed to the Format lookup
    # compare_fields - the fields handed to every reader, or :none
    #
    # Returns an Array of readers.
    def self.create_readers_from_paths(paths:, fallback: "auto", compare_fields: :none)
      # default to stdin if there are no paths, without touching the caller's
      # array (which may be frozen or reused by the caller)
      sources = paths.empty? ? ["-"] : paths

      sources.map do |path|
        create_reader_from_path(path: path, fallback: fallback, compare_fields: compare_fields)
      end
    end

    # Public: Initialize the common reader state.
    #
    # source         - where the records are read from
    # compare_fields - the list of fields, or :none for an empty list
    def initialize(source:, compare_fields: :none)
      @source = source
      @compare_fields = resolve_compare_fields(compare_fields)
    end

    # Public: The format name, as reported by the reader's class.
    def format_name
      self.class.format_name
    end

    # Public: Iterate over the records. Format readers must override.
    #
    # Raises NotImplementedError always.
    def each
      raise NotImplementedError, "#{self.class} needs to implement #each"
    end

    private

    # Turn the :none marker into an empty list, pass anything else through.
    def resolve_compare_fields(value)
      return [] if value == :none

      value
    end
  end
end
@@ -0,0 +1,83 @@
1
module FlatKit
  # Public: The base class that all record classes should inherit from.
  #
  # Its goal is to be an efficient comparator of data that can be inflated from
  # a source structure to a fully realized hash.
  #
  # All records need to be able to be initialized from a data structure that
  # is handed to them by the Reader instance within the same Format.
  #
  # Records are generally not going to be created outside of this library, they
  # are tied to a specific format and provide a common interface that can be
  # used for:
  #
  # * comparison between records from different source / destination formats
  # * conversion to a different format
  #
  # Given that - the way to create a record is either from another Record
  # instance:
  #
  #   Record.from_record(other)  # create a record from another record
  #
  # or the way a Reader will do it
  #
  #   Record.new(...)            # generally only used by a Reader instance to
  #                              # yield new records
  #
  #
  # When implementing a new Format, the corresponding Record class for that
  # Format must:
  #
  # * implement `#[](key)` which will be used to lookup the values of the
  #   comparable fields.
  # * implement `#to_hash` which is used in conversions
  # * implement `.from_record` which is used in conversion
  # * the initialize method must call super(data:, compare_fields:) to
  #   initialize the root data structures
  class Record
    include Comparable

    attr_reader :data
    attr_reader :compare_fields

    # Public: Store the raw data and the fields used for comparison.
    #
    # data           - the format specific data structure of the record
    # compare_fields - the list of field names used by #<=>
    def initialize(data:, compare_fields:)
      @data = data
      @compare_fields = compare_fields
    end

    # Public: The format name, as reported by the record's class.
    def format_name
      self.class.format_name
    end

    # Public: Compare this record with another one, field by field, in the
    # order given by compare_fields.
    #
    # For each field: a missing (nil) value sorts before a present one, and
    # the first field whose values differ decides the result.
    #
    # NOTE(review): when both records have nil for a field the comparison
    # stops right there with 0, the remaining fields are not looked at -
    # confirm that this is intended for multi field keys.
    #
    # other - the record to compare with, it must respond to #[]
    #
    # Returns -1, 0 or 1. Returns nil when compare_fields is empty or when the
    # two values of a field can not be compared with each other.
    def <=>(other)
      outcome = nil

      compare_fields.each do |field|
        mine   = self[field]
        theirs = other[field]

        return 0  if mine.nil? && theirs.nil?
        return -1 if mine.nil?
        return 1  if theirs.nil?

        outcome = mine <=> theirs

        # a nil outcome (incomparable values) is handed back as nil, the way
        # Comparable expects it, instead of blowing up on nil.zero?
        return outcome unless outcome == 0
      end

      outcome
    end

    # Public: Lookup the value of a field. Record classes must override.
    #
    # Raises NotImplementedError always.
    def [](key)
      raise NotImplementedError, "#{self.class} must implement #[](key)"
    end

    # Public: Convert to a Hash. Record classes must override.
    #
    # Raises NotImplementedError always.
    def to_hash
      raise NotImplementedError, "#{self.class} must implement #to_hash"
    end

    # Public: Create a record from another record. Record classes must
    # override.
    #
    # _record - the record to convert; accepted so that the documented call
    #           `Record.from_record(other)` reaches the error below.
    #
    # Raises NotImplementedError always.
    def self.from_record(_record = nil)
      raise NotImplementedError, "#{self} must implement #{self}.from_record"
    end
  end
end
@@ -0,0 +1,37 @@
1
module FlatKit
  # Private: The Sentinel Internal Node is a private class used by the MergeTree
  # class.
  #
  # This class represents an empty / completed node in the merge tree where all
  # the data from the descendant leaf node is fully used up.
  #
  class SentinelInternalNode
    include Comparable

    attr_reader :left, :right, :winner
    attr_accessor :next_level

    # Create the sentinel. The left: and right: keywords are accepted but not
    # used; every attribute of a sentinel starts out as nil.
    def initialize(left: nil, right: nil)
      @left = @right = @winner = @next_level = nil
    end

    # This node is a sentinel.
    def sentinel?
      true
    end

    # Reports true.
    # NOTE(review): an internal node answering true to leaf? looks odd -
    # confirm against how the MergeTree uses it before changing.
    def leaf?
      true
    end

    # A sentinel node is always greater than any other node, and equal to
    # any other sentinel.
    def <=>(other)
      other.sentinel? ? 0 : 1
    end
  end
end
@@ -0,0 +1,37 @@
1
module FlatKit
  # Private: The Sentinel Leaf Node is used internally by the MergeTree
  #
  # This class represents a LeafNode that has no more data
  #
  class SentinelLeafNode
    include Comparable

    attr_accessor :next_level

    # This node is a sentinel.
    def sentinel?
      true
    end

    # This node is a leaf.
    def leaf?
      true
    end

    # There is never a next item.
    def next
      nil
    end

    # A sentinel is always finished.
    def finished?
      true
    end

    # A sentinel holds no value.
    def value
      nil
    end

    # A sentinel node is always greater than any other node, and equal to
    # any other sentinel.
    def <=>(other)
      other.sentinel? ? 0 : 1
    end
  end
end
@@ -0,0 +1,35 @@
1
module FlatKit
  # Public: Sort the records of one input and write them to one output.
  #
  # All the records are read into memory, sorted using the ordering the
  # records define themselves, and then written out.
  class Sort
    attr_reader :reader
    attr_reader :writer
    attr_reader :compare_fields

    # Public: Create the reader and the writer for the sort.
    #
    # input           - the path of the input
    # input_fallback  - the format fallback for the input
    # output          - the path of the output
    # output_fallback - the format fallback for the output; the writer is
    #                   also told the format name of the reader
    # compare_fields  - the fields the records are compared on
    def initialize(input:, input_fallback: "auto",
                   output:, output_fallback: "auto",
                   compare_fields:)

      @compare_fields = compare_fields
      @reader = ::FlatKit::Reader.create_reader_from_path(path: input, compare_fields: @compare_fields,
                                                          fallback: input_fallback)
      @writer = ::FlatKit::Writer.create_writer_from_path(path: output, fallback: output_fallback,
                                                          reader_format: @reader.format_name)
    end

    # Public: Run the sort: read everything, sort it, write it, and close the
    # writer. Progress is reported on the FlatKit logger at info level.
    #
    # The writer is closed even when reading, sorting or writing raises, so a
    # failed sort does not leave the output open.
    #
    # NOTE(review): reader.count is Enumerable#count unless the concrete
    # reader defines its own #count, in which case the source would be
    # iterated a second time - confirm against the format readers.
    def call
      ::FlatKit.logger.info "Sorting #{reader.source} into #{writer.destination} using key #{compare_fields.join(", ")}"
      begin
        records = reader.to_a
        ::FlatKit.logger.info "Read #{reader.count} records into #{records.size} element array"
        records.sort!
        ::FlatKit.logger.info "Sorted #{records.size} records"
        records.each { |record| writer.write(record) }
      ensure
        writer.close
      end
      ::FlatKit.logger.info "Wrote #{writer.count} records to #{writer.destination}"
    end
  end
end
@@ -0,0 +1,38 @@
1
module FlatKit
  # Public: The base class for all format writers.
  #
  # A format writer will only write Records, and of those, only the ones of
  # its own format.
  #
  # It must implement a #write method that takes a Record. It can convert the
  # record to one matching its own format if it wishes. But it should in any
  # case check the Record format to make sure it matches.
  #
  # See the Xsv::Writer and Jsonl::Writer for examples.
  #
  class Writer
    attr_reader :destination

    # Public: Create the writer of the format matching the given path.
    #
    # path          - the destination to write to
    # fallback      - the format fallback; "auto" means use reader_format
    # reader_format - the format name of the reader feeding this writer
    #
    # Returns an instance of the writer class of the resolved format.
    def self.create_writer_from_path(path:, fallback:, reader_format:)
      effective_fallback = fallback == "auto" ? reader_format : fallback
      resolved = ::FlatKit::Format.for_with_fallback!(path: path, fallback: effective_fallback)
      resolved.writer.new(destination: path)
    end

    # Public: Remember where the records are written to.
    #
    # destination - the destination of the writer
    def initialize(destination:)
      @destination = destination
    end

    # Public: The format name, as reported by the writer's class.
    def format_name
      self.class.format_name
    end

    # Public: Write a single record. Format writers must override.
    #
    # Raises NotImplementedError always.
    def write(record)
      raise NotImplementedError, "#{self.class} needs to implement #write"
    end

    # Public: Close the writer. Format writers must override.
    #
    # Raises NotImplementedError always.
    def close
      raise NotImplementedError, "#{self.class} needs to implement #close"
    end
  end
end