flat_kit 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. checksums.yaml +7 -0
  2. data/CONTRIBUTING.md +46 -0
  3. data/HISTORY.md +5 -0
  4. data/LICENSE.txt +21 -0
  5. data/Manifest.txt +66 -0
  6. data/README.md +80 -0
  7. data/Rakefile +20 -0
  8. data/bin/fk +4 -0
  9. data/lib/flat_kit.rb +23 -0
  10. data/lib/flat_kit/cli.rb +80 -0
  11. data/lib/flat_kit/command.rb +53 -0
  12. data/lib/flat_kit/command/cat.rb +93 -0
  13. data/lib/flat_kit/command/merge.rb +88 -0
  14. data/lib/flat_kit/command/sort.rb +88 -0
  15. data/lib/flat_kit/descendant_tracker.rb +27 -0
  16. data/lib/flat_kit/error.rb +5 -0
  17. data/lib/flat_kit/format.rb +34 -0
  18. data/lib/flat_kit/input.rb +32 -0
  19. data/lib/flat_kit/input/file.rb +53 -0
  20. data/lib/flat_kit/input/io.rb +54 -0
  21. data/lib/flat_kit/internal_node.rb +84 -0
  22. data/lib/flat_kit/jsonl.rb +8 -0
  23. data/lib/flat_kit/jsonl/format.rb +25 -0
  24. data/lib/flat_kit/jsonl/reader.rb +30 -0
  25. data/lib/flat_kit/jsonl/record.rb +84 -0
  26. data/lib/flat_kit/jsonl/writer.rb +45 -0
  27. data/lib/flat_kit/leaf_node.rb +71 -0
  28. data/lib/flat_kit/logger.rb +39 -0
  29. data/lib/flat_kit/merge.rb +35 -0
  30. data/lib/flat_kit/merge_tree.rb +104 -0
  31. data/lib/flat_kit/output.rb +32 -0
  32. data/lib/flat_kit/output/file.rb +55 -0
  33. data/lib/flat_kit/output/io.rb +73 -0
  34. data/lib/flat_kit/reader.rb +61 -0
  35. data/lib/flat_kit/record.rb +83 -0
  36. data/lib/flat_kit/sentinel_internal_node.rb +37 -0
  37. data/lib/flat_kit/sentinel_leaf_node.rb +37 -0
  38. data/lib/flat_kit/sort.rb +35 -0
  39. data/lib/flat_kit/writer.rb +38 -0
  40. data/lib/flat_kit/xsv.rb +8 -0
  41. data/lib/flat_kit/xsv/format.rb +25 -0
  42. data/lib/flat_kit/xsv/reader.rb +45 -0
  43. data/lib/flat_kit/xsv/record.rb +90 -0
  44. data/lib/flat_kit/xsv/writer.rb +70 -0
  45. data/tasks/default.rake +242 -0
  46. data/tasks/extension.rake +38 -0
  47. data/tasks/man.rake +7 -0
  48. data/tasks/this.rb +208 -0
  49. data/test/device_dataset.rb +117 -0
  50. data/test/input/test_file.rb +73 -0
  51. data/test/input/test_io.rb +93 -0
  52. data/test/jsonl/test_format.rb +22 -0
  53. data/test/jsonl/test_reader.rb +49 -0
  54. data/test/jsonl/test_record.rb +61 -0
  55. data/test/jsonl/test_writer.rb +68 -0
  56. data/test/output/test_file.rb +60 -0
  57. data/test/output/test_io.rb +104 -0
  58. data/test/test_conversions.rb +45 -0
  59. data/test/test_format.rb +24 -0
  60. data/test/test_helper.rb +26 -0
  61. data/test/test_merge.rb +40 -0
  62. data/test/test_merge_tree.rb +64 -0
  63. data/test/test_version.rb +11 -0
  64. data/test/xsv/test_format.rb +22 -0
  65. data/test/xsv/test_reader.rb +61 -0
  66. data/test/xsv/test_record.rb +69 -0
  67. data/test/xsv/test_writer.rb +68 -0
  68. metadata +237 -0
@@ -0,0 +1,88 @@
1
+ require 'csv'
2
module FlatKit
  class Command
    # The `merge` sub-command: combines two or more inputs that are already
    # sorted by the same key fields into one output.
    #
    # Command line parsing is done with Optimist; the actual work is delegated
    # to ::FlatKit::Merge, which is built in #parse and run in #call.
    class Merge < ::FlatKit::Command

      # The sub-command name.
      def self.name
        "merge"
      end

      # One line summary, also used as the first line of the help banner.
      def self.description
        "Merge sorted files together that have the same structure."
      end

      # Build the Optimist parser describing the help text and the options
      # this command accepts.
      #
      # Returns a new ::Optimist::Parser.
      def self.parser
        ::Optimist::Parser.new do
          banner Merge.description
          banner ""

          banner <<~BANNER
            Given a set of input files that have the same structure, and are already
            sorted by a set of keys. The Merge command will merge all those files
            into a single output file.

            The --key parameter is required, and it must be a comma separated list
            of field names on the input on which to use as the sort key for the merge
            process.

            There must also be at least 2 input files. Merging only 1 file into an
            output file is the same as the 'cat' command.

            The flatfile type(s) will be automatically determined by the file name.
            If the output is not a file, but to stdout then the output type will
            be the same as the first input file, or it can be specified as a commandline
            switch.

            The merge will do a single pass through the input to generate the
            output.
          BANNER

          banner <<~USAGE

            Usage:
              fk merge --key surname,given_name file1.csv file2.csv > all.csv
              fk merge --key surname,given_name --output-format json file1.csv file2.csv > all.json
              fk merge --key field1,field2 --output-format json input*.csv | gzip -c > all.json.gz
              fk merge --key field12 file*.json.gz -o all.json.gz

          USAGE

          banner <<~OPTIONS

            Options:

          OPTIONS

          opt :output, "Send the output to the given path instead of standard out.", default: "<stdout>"
          opt :input_format, "Input format, csv or json", default: "auto", short: :none
          opt :output_format, "Output format, csv or json", default: "auto", short: :none
          opt :key, "The comma separated list of field(s) to use for sorting the input", required: true, type: :string
        end
      end

      # The list of field names parsed out of --key.
      attr_reader :compare_keys

      # Parse the command line and build the ::FlatKit::Merge to run.
      #
      # `argv` and `opts` are not defined in this class; presumably they are
      # provided by ::FlatKit::Command -- confirm there.
      #
      # Any usage problem is raised as ::Optimist::CommandlineError so that
      # Optimist's standard exception handling reports it. ::FlatKit::Error
      # raised while building the merge is converted the same way.
      def parse
        parser = self.class.parser
        ::Optimist::with_standard_exception_handling(parser) do
          begin
            @opts = parser.parse(argv)

            # CSV.parse_line returns nil for an empty string, so an empty --key
            # is reported as a usage error instead of being passed along as nil.
            @compare_keys = CSV.parse_line(opts[:key])
            if @compare_keys.nil? || @compare_keys.empty?
              raise ::Optimist::CommandlineError, "--key requires at least one field name"
            end

            paths = parser.leftovers
            raise ::Optimist::CommandlineError, "At least 2 input files are required" if paths.size < 2

            @merge = ::FlatKit::Merge.new(inputs: paths, input_fallback: opts[:input_format],
                                          compare_fields: @compare_keys,
                                          output: opts[:output], output_fallback: opts[:output_format])
          rescue ::FlatKit::Error => e
            raise ::Optimist::CommandlineError, e.message
          end
        end
      end

      # Run the merge that was built by #parse.
      def call
        @merge.call
      end
    end
  end
end
@@ -0,0 +1,88 @@
1
+ require 'csv'
2
module FlatKit
  class Command
    # The `sort` sub-command: orders the records of a single input by a set of
    # key fields.
    #
    # Command line parsing is done with Optimist; the actual work is delegated
    # to ::FlatKit::Sort, which is built in #parse and run in #call.
    class Sort < ::FlatKit::Command

      # The sub-command name.
      def self.name
        "sort"
      end

      # One line summary, also used as the first line of the help banner.
      def self.description
        "Sort a given file by a set of fields."
      end

      # Build the Optimist parser describing the help text and the options
      # this command accepts.
      #
      # Returns a new ::Optimist::Parser.
      def self.parser
        ::Optimist::Parser.new do
          banner Sort.description
          banner ""

          banner <<~BANNER
            Given an input file and a sort key, order the records in that file by that
            key. If no input file is given then stdin is assumed. If no output file
            is given then stdout is assumed.

            The --key parameter is required, and it must be a comma separated list
            of field names on the input on which to use as the sort key.

            There may be at most 1 input file.

            The flatfile type(s) will be automatically determined by the file name.
            If the output is not a file, but to stdout then the output type will
            be the same as the input file, or it can be specified as a commandline
            switch.

          BANNER

          banner <<~USAGE

            Usage:
              fk sort --key surname,given_name file.csv > sorted.csv
              fk sort --key surname,given_name --output-format json file.csv > sorted.json
              fk sort --key field1,field2 --output-format json input.csv | gzip -c > sorted.json.gz
              fk sort --key field1 file.json.gz -o sorted.json.gz
              gunzip -c file.json.gz | fk sort --key field1 --input-format json --output-format json | gzip -c > sorted.json.gz

          USAGE

          banner <<~OPTIONS

            Options:

          OPTIONS

          opt :output, "Send the output to the given path instead of standard out.", default: "<stdout>"
          opt :input_format, "Input format, csv or json", default: "auto", short: :none
          opt :output_format, "Output format, csv or json", default: "auto", short: :none
          opt :key, "The comma separated list of field(s) to use for sorting the input", required: true, type: :string
        end
      end

      # The list of field names parsed out of --key.
      attr_reader :compare_keys
      # NOTE(review): never assigned in this class; kept for interface compatibility.
      attr_reader :reader
      # The ::FlatKit::Sort built by #parse.
      attr_reader :sort

      # Parse the command line and build the ::FlatKit::Sort to run.
      #
      # `argv` and `opts` are not defined in this class; presumably they are
      # provided by ::FlatKit::Command -- confirm there.
      #
      # Zero or one input path is accepted; with no path, "-" is used so the
      # input is read from stdin. Any usage problem is raised as
      # ::Optimist::CommandlineError so that Optimist's standard exception
      # handling reports it. ::FlatKit::Error raised while building the sort is
      # converted the same way.
      def parse
        parser = self.class.parser
        ::Optimist::with_standard_exception_handling(parser) do
          begin
            @opts = parser.parse(argv)

            # CSV.parse_line returns nil for an empty string, so an empty --key
            # is reported as a usage error instead of being passed along as nil.
            @compare_keys = CSV.parse_line(opts[:key])
            if @compare_keys.nil? || @compare_keys.empty?
              raise ::Optimist::CommandlineError, "--key requires at least one field name"
            end

            paths = parser.leftovers
            raise ::Optimist::CommandlineError, "Only 1 input file is allowed" if paths.size > 1

            path = paths.first || "-" # default to stdin
            @sort = ::FlatKit::Sort.new(input: path, input_fallback: opts[:input_format],
                                        output: opts[:output], output_fallback: opts[:output_format],
                                        compare_fields: @compare_keys)
          rescue ::FlatKit::Error => e
            raise ::Optimist::CommandlineError, e.message
          end
        end
      end

      # Run the sort that was built by #parse.
      def call
        sort.call
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
+ require 'set'
2
+
3
module FlatKit
  # Mixin to be `extend`ed onto a base class so that the base class records
  # every class that directly subclasses it.
  #
  # The registry lives in an instance variable of the extended class itself,
  # so each class that receives these methods keeps its own, separate set.
  module DescendantTracker
    # Ruby hook invoked whenever the extended class is subclassed. Records the
    # new subclass, provided it is a plain Class instance.
    def inherited(klass)
      super
      children << klass if klass.instance_of?(Class)
    end

    # The Set of recorded subclasses, created on first use.
    def children
      @_children ||= Set.new
    end

    #
    # Find the first child that returns truthy from the given method with args.
    # Returns nil when no child does.
    #
    def find_child(method, *args)
      children.find { |candidate| candidate.send(method, *args) }
    end
  end
end
@@ -0,0 +1,5 @@
1
module FlatKit
  # Base class of the errors raised by FlatKit; rescue this to catch all of them.
  class Error < ::StandardError
    # Raised when no format can be determined for a path / fallback pair.
    UnknownFormat = Class.new(self)
  end
end
@@ -0,0 +1,34 @@
1
module FlatKit
  # Base class of the flat file formats.
  #
  # Subclasses are recorded through DescendantTracker and are expected to
  # answer the class methods `handles?(path)` and `format_name`.
  class Format
    extend DescendantTracker

    # The name of the format; must be overridden by each subclass.
    #
    # Raises NotImplementedError naming the class that lacks the override.
    def self.format_name
      # `self` is already the class here; `self.class` would always be `Class`.
      raise NotImplementedError, "#{self} must implement #{self}.format_name"
    end

    # Instance level convenience for the class level format_name.
    def format_name
      self.class.format_name
    end

    # Find the format class for the given path.
    #
    # path - anything responding to #to_s, it is handed to each child's
    #        `handles?` as a String.
    #
    # Returns the first child class whose `handles?` is truthy, or nil.
    def self.for(path)
      find_child(:handles?, path.to_s)
    end

    # Find the format class for the path and, when the path is not recognised,
    # for the fallback value instead.
    #
    # Returns a format class, or nil when neither value is handled by a child.
    def self.for_with_fallback(path:, fallback: "auto")
      ::FlatKit::Format.for(path) || ::FlatKit::Format.for(fallback)
    end

    # Same as for_with_fallback, but a missing format is an error.
    #
    # Raises ::FlatKit::Error::UnknownFormat when no format class is found.
    def self.for_with_fallback!(path:, fallback: "auto")
      format = for_with_fallback(path: path, fallback: fallback)
      raise ::FlatKit::Error::UnknownFormat, "Unable to figure out format for '#{path}' with fallback '#{fallback}'" if format.nil?

      format
    end
  end
end
@@ -0,0 +1,32 @@
1
module FlatKit
  # Abstract base class of the input sources.
  #
  # Subclasses are recorded through DescendantTracker; each is expected to
  # answer the class method `handles?(obj)` and to implement #name, #io and
  # #close.
  class Input
    extend DescendantTracker

    # Turn the given object into an Input.
    #
    # An object that already is an Input is returned untouched. Otherwise the
    # first recorded subclass whose `handles?` accepts the object is
    # instantiated with it.
    #
    # Raises FlatKit::Error when no subclass handles the object.
    def self.from(input)
      return input if input.kind_of?(::FlatKit::Input)

      handler = find_child(:handles?, input)
      unless handler
        raise FlatKit::Error, "Unable to create input from #{input.class} : #{input.inspect}"
      end

      handler.new(input)
    end

    # Abstract: a name for this input; subclasses must override.
    def name
      raise NotImplementedError, "#{self.class} must implement #name"
    end

    # Abstract: the underlying io object; subclasses must override.
    def io
      raise NotImplementedError, "#{self.class} must implement #io"
    end

    # Abstract: close the input; subclasses must override.
    def close
      raise NotImplementedError, "#{self.class} must implement #close"
    end
  end
end
30
+
31
+ require 'flat_kit/input/io'
32
+ require 'flat_kit/input/file'
@@ -0,0 +1,53 @@
1
+ require 'zlib'
2
+
3
module FlatKit
  class Input
    # An Input that reads from a path on the filesystem. Paths ending in
    # ".gz" are read through a gzip reader.
    class File < Input
      attr_reader :path, :count

      # The opened io for the path.
      attr_reader :io

      # This class handles Pathname instances and plain Strings, except for
      # the Strings that Input::IO treats as names for stdin.
      def self.handles?(obj)
        return true if obj.instance_of?(Pathname)

        # the stdin check is done here too, in case these get loaded in
        # different orders
        obj.instance_of?(String) && !::FlatKit::Input::IO.is_stdin?(obj)
      end

      # Open the given path for reading.
      #
      # Raises FlatKit::Error when the path is not readable.
      def initialize(obj)
        @count = 0
        @path = Pathname.new(obj)
        unless @path.readable?
          raise FlatKit::Error, "Input #{obj} is not readable"
        end

        @io = open_input(path)
      end

      # The path, as a String.
      def name
        path.to_s
      end

      # Close the underlying io.
      def close
        @io.close
      end

      private

      # Open the appropriate input type depending on the source file name.
      #
      # TODO: add in bzip
      def open_input(path)
        gzipped = path.extname == ".gz"
        gzipped ? Zlib::GzipReader.open(path.to_s) : path.open("rb")
      end
    end
  end
end
@@ -0,0 +1,54 @@
1
module FlatKit
  class Input
    # An Input wrapping an io object that already exists: stdin, a ::File,
    # a ::StringIO or any other ::IO.
    class IO < Input
      # The Strings that are accepted as a way of saying "read from stdin".
      STDINS = %w[ stdin STDIN - <stdin> ]

      # The display name and the wrapped io object.
      attr_reader :name, :io

      # This class handles anything that means stdin, plus ::File, ::StringIO
      # and ::IO instances.
      def self.handles?(obj)
        return true if is_stdin?(obj)

        [ ::File, ::StringIO, ::IO ].any? { |io_class| obj.kind_of?(io_class) }
      end

      # Is the object one of the stdin names, or the STDIN constant itself?
      def self.is_stdin?(obj)
        case obj
        when String then STDINS.include?(obj)
        when ::IO   then obj == ::STDIN
        else false
        end
      end

      # Wrap the given object.
      #
      # The stdin test comes first, and ::File is tested before ::IO so that a
      # file gets its path as a name rather than its inspect string.
      #
      # Raises ::FlatKit::Error for any object of another kind.
      def initialize(obj)
        if self.class.is_stdin?(obj)
          @name = "<STDIN>"
          @io = $stdin
        elsif obj.kind_of?(::File)
          @name = obj.path
          @io = obj
        elsif obj.kind_of?(::StringIO) || obj.kind_of?(::IO)
          @name = obj.inspect
          @io = obj
        else
          raise ::FlatKit::Error, "Unable to create #{self.class} from #{obj.class} : #{obj.inspect}"
        end
      end

      # Close the wrapped io. The io was handed to us already open; it is
      # closed here all the same.
      def close
        @io.close
      end
    end
  end
end
@@ -0,0 +1,84 @@
1
module FlatKit
  # Private: This is a class used internally by MergeTree and should not be used
  # outside of that context.
  #
  # An InternalNode is one match of the tournament tree algorithm: it holds the
  # two nodes that compete against each other here, remembers which of them is
  # the current winner, and keeps a reference to the leaf node associated with
  # that winner.
  #
  class InternalNode
    include Comparable

    # left / right - the two competing nodes
    # winner       - whichever of left / right won the last #play
    # next_level   - the node one level up, the one to notify of changes
    # leaf         - the leaf node of the winner
    attr_accessor :left, :right, :winner, :next_level, :leaf

    # Take ownership of the two competitors, point both of them back at this
    # node, and play the first match.
    def initialize(left:, right:)
      @left = left
      @right = right
      @next_level = nil
      [@left, @right].each { |competitor| competitor.next_level = self }

      play
    end

    # The value of the current winner.
    def value
      winner.value
    end

    # An InternalNode is never a sentinel.
    def sentinel?
      false
    end

    # An InternalNode is never a leaf.
    def leaf?
      false
    end

    # We are being told that the passed in node no longer has data in it and is
    # to be removed from the tree.
    #
    # Our reference to that node is swapped for a sentinel node so that
    # comparisons keep working. Once both of our competitors are sentinels we
    # have nothing left either, and the level above (if there is one) is told
    # to remove us in turn.
    #
    # Raises FlatKit::Error when the node is neither of our competitors.
    def player_finished(node)
      if left.equal?(node)
        @left = SentinelInternalNode.new
        @left.next_level = self
      elsif right.equal?(node)
        @right = SentinelInternalNode.new
        @right.next_level = self
      else
        raise FlatKit::Error, "Unknown player #{node}"
      end

      exhausted = @right.sentinel? && @left.sentinel?
      next_level.player_finished(self) if exhausted && next_level
    end

    # Play the match: left wins unless right compares strictly lower. The leaf
    # is only updated when the winner is not a sentinel. The result is then
    # propagated by replaying the level above, if there is one.
    def play
      @winner = if left <= right then left else right end
      @leaf = winner.leaf unless @winner.sentinel?
      next_level.play if next_level
    end

    # Order against another node: anything is lower than a sentinel, otherwise
    # the values decide.
    def <=>(other)
      other.sentinel? ? -1 : (value <=> other.value)
    end
  end
end