flat_kit 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (68) hide show
  1. checksums.yaml +7 -0
  2. data/CONTRIBUTING.md +46 -0
  3. data/HISTORY.md +5 -0
  4. data/LICENSE.txt +21 -0
  5. data/Manifest.txt +66 -0
  6. data/README.md +80 -0
  7. data/Rakefile +20 -0
  8. data/bin/fk +4 -0
  9. data/lib/flat_kit.rb +23 -0
  10. data/lib/flat_kit/cli.rb +80 -0
  11. data/lib/flat_kit/command.rb +53 -0
  12. data/lib/flat_kit/command/cat.rb +93 -0
  13. data/lib/flat_kit/command/merge.rb +88 -0
  14. data/lib/flat_kit/command/sort.rb +88 -0
  15. data/lib/flat_kit/descendant_tracker.rb +27 -0
  16. data/lib/flat_kit/error.rb +5 -0
  17. data/lib/flat_kit/format.rb +34 -0
  18. data/lib/flat_kit/input.rb +32 -0
  19. data/lib/flat_kit/input/file.rb +53 -0
  20. data/lib/flat_kit/input/io.rb +54 -0
  21. data/lib/flat_kit/internal_node.rb +84 -0
  22. data/lib/flat_kit/jsonl.rb +8 -0
  23. data/lib/flat_kit/jsonl/format.rb +25 -0
  24. data/lib/flat_kit/jsonl/reader.rb +30 -0
  25. data/lib/flat_kit/jsonl/record.rb +84 -0
  26. data/lib/flat_kit/jsonl/writer.rb +45 -0
  27. data/lib/flat_kit/leaf_node.rb +71 -0
  28. data/lib/flat_kit/logger.rb +39 -0
  29. data/lib/flat_kit/merge.rb +35 -0
  30. data/lib/flat_kit/merge_tree.rb +104 -0
  31. data/lib/flat_kit/output.rb +32 -0
  32. data/lib/flat_kit/output/file.rb +55 -0
  33. data/lib/flat_kit/output/io.rb +73 -0
  34. data/lib/flat_kit/reader.rb +61 -0
  35. data/lib/flat_kit/record.rb +83 -0
  36. data/lib/flat_kit/sentinel_internal_node.rb +37 -0
  37. data/lib/flat_kit/sentinel_leaf_node.rb +37 -0
  38. data/lib/flat_kit/sort.rb +35 -0
  39. data/lib/flat_kit/writer.rb +38 -0
  40. data/lib/flat_kit/xsv.rb +8 -0
  41. data/lib/flat_kit/xsv/format.rb +25 -0
  42. data/lib/flat_kit/xsv/reader.rb +45 -0
  43. data/lib/flat_kit/xsv/record.rb +90 -0
  44. data/lib/flat_kit/xsv/writer.rb +70 -0
  45. data/tasks/default.rake +242 -0
  46. data/tasks/extension.rake +38 -0
  47. data/tasks/man.rake +7 -0
  48. data/tasks/this.rb +208 -0
  49. data/test/device_dataset.rb +117 -0
  50. data/test/input/test_file.rb +73 -0
  51. data/test/input/test_io.rb +93 -0
  52. data/test/jsonl/test_format.rb +22 -0
  53. data/test/jsonl/test_reader.rb +49 -0
  54. data/test/jsonl/test_record.rb +61 -0
  55. data/test/jsonl/test_writer.rb +68 -0
  56. data/test/output/test_file.rb +60 -0
  57. data/test/output/test_io.rb +104 -0
  58. data/test/test_conversions.rb +45 -0
  59. data/test/test_format.rb +24 -0
  60. data/test/test_helper.rb +26 -0
  61. data/test/test_merge.rb +40 -0
  62. data/test/test_merge_tree.rb +64 -0
  63. data/test/test_version.rb +11 -0
  64. data/test/xsv/test_format.rb +22 -0
  65. data/test/xsv/test_reader.rb +61 -0
  66. data/test/xsv/test_record.rb +69 -0
  67. data/test/xsv/test_writer.rb +68 -0
  68. metadata +237 -0
@@ -0,0 +1,88 @@
1
+ require 'csv'
2
module FlatKit
  class Command
    # The `merge` sub-command: combines two or more already-sorted inputs of
    # the same structure into a single output, ordered by the `--key` fields.
    #
    # `argv` and `opts` are not defined here; presumably they are provided by
    # ::FlatKit::Command -- verify against the parent class.
    class Merge < ::FlatKit::Command

      # Returns the sub-command name as typed on the command line.
      def self.name
        "merge"
      end

      # Returns the one-line summary used at the top of the help banner.
      def self.description
        "Merge sorted files together that have the same structure."
      end

      # Builds the Optimist parser (help banner plus options) for `fk merge`.
      def self.parser
        ::Optimist::Parser.new do
          banner Merge.description
          banner ""

          banner <<~BANNER
            Given a set of input files that have the same structure, and are already
            sorted by a set of keys. The Merge command will merge all those files
            into a single output file.

            The --key parameter is required, and it must be a comma separated list
            of field names on the input on which to use as the sort key for the merge
            process.

            There must also be at least 2 input files. Merging only 1 file into an
            output file is the same as the 'cat' command.

            The flatfile type(s) will be automatically determined by the file name.
            If the output is not a file, but to stdout then the output type will
            be the same as the first input file, or it can be specified as a commandline
            switch.

            The merge will do a single pass through the input to generate the
            output.
          BANNER

          banner <<~USAGE

            Usage:
            fk merge --key surname,given_name file1.csv file2.csv > all.csv
            fk merge --key surname,given_name --output-format json file1.csv file2.csv > all.json
            fk merge --key field1,field2 --output-format json input*.csv | gzip -c > all.json.gz
            fk merge --key field12 file*.json.gz -o all.json.gz

          USAGE

          banner <<~OPTIONS

            Options:

          OPTIONS

          opt :output, "Send the output to the given path instead of standard out.", default: "<stdout>"
          opt :input_format, "Input format, csv or json", default: "auto", short: :none
          opt :output_format, "Output format, csv or json", default: "auto", short: :none
          opt :key, "The comma separated list of field(s) to use for sorting the input", required: true, type: :string
        end
      end

      # The list of field names parsed from --key; set by #parse.
      attr_reader :compare_keys

      # Parses argv, validates it and builds the ::FlatKit::Merge to run.
      #
      # Every problem is raised as ::Optimist::CommandlineError inside
      # with_standard_exception_handling, so Optimist reports it to the user:
      # an empty --key, fewer than 2 input paths, or any ::FlatKit::Error
      # raised while constructing the merge.
      def parse
        parser = self.class.parser
        ::Optimist::with_standard_exception_handling(parser) do
          begin
            @opts = parser.parse(argv)
            @compare_keys = parse_compare_keys(opts[:key])
            paths = parser.leftovers
            raise ::Optimist::CommandlineError, "At least 2 input files are required" if paths.size < 2

            @merge = ::FlatKit::Merge.new(inputs: paths, input_fallback: opts[:input_format],
                                          compare_fields: @compare_keys,
                                          output: opts[:output], output_fallback: opts[:output_format])
          rescue ::FlatKit::Error => e
            raise ::Optimist::CommandlineError, e.message
          end
        end
      end

      # Runs the merge prepared by #parse.
      def call
        @merge.call
      end

      private

      # Splits the raw --key value using CSV rules so quoted field names work.
      # CSV.parse_line returns nil for an empty string, so that case is turned
      # into a command line error instead of handing nil compare fields on.
      def parse_compare_keys(raw)
        keys = CSV.parse_line(raw.to_s)
        raise ::Optimist::CommandlineError, "--key must list at least one field" if keys.nil? || keys.empty?

        keys
      end
    end
  end
end
@@ -0,0 +1,88 @@
1
+ require 'csv'
2
module FlatKit
  class Command
    # The `sort` sub-command: orders the records of a single input (a file, or
    # stdin when no path is given) by the `--key` fields.
    #
    # `argv` and `opts` are not defined here; presumably they are provided by
    # ::FlatKit::Command -- verify against the parent class.
    class Sort < ::FlatKit::Command

      # Returns the sub-command name as typed on the command line.
      def self.name
        "sort"
      end

      # Returns the one-line summary used at the top of the help banner.
      def self.description
        "Sort a given file by a set of fields."
      end

      # Builds the Optimist parser (help banner plus options) for `fk sort`.
      def self.parser
        ::Optimist::Parser.new do
          banner Sort.description
          banner ""

          banner <<~BANNER
            Given an input file and a sort key, order the records in that file by that
            key. If no input file is given then stdin is assumed. If no output file
            is given then stdout is assumed.

            The --key parameter is required, and it must be a comma separated list
            of field names on the input to use as the sort key.

            There must be at most 1 input file.

            The flatfile type(s) will be automatically determined by the file name.
            If the output is not a file, but to stdout then the output type will
            be the same as the first input file, or it can be specified as a commandline
            switch.

          BANNER

          banner <<~USAGE

            Usage:
            fk sort --key surname,given_name file.csv > sorted.csv
            fk sort --key surname,given_name --output-format json file.csv > sorted.json
            fk sort --key field1,field2 --output-format json input.csv | gzip -c > sorted.json.gz
            fk sort --key field1 file.json.gz -o sorted.json.gz
            gunzip -c file.json.gz | fk sort --key field1 --input-format json --output-format json | gzip -c > sorted.json.gz

          USAGE

          banner <<~OPTIONS

            Options:

          OPTIONS

          opt :output, "Send the output to the given path instead of standard out.", default: "<stdout>"
          opt :input_format, "Input format, csv or json", default: "auto", short: :none
          opt :output_format, "Output format, csv or json", default: "auto", short: :none
          opt :key, "The comma separated list of field(s) to use for sorting the input", required: true, type: :string
        end
      end

      # The list of field names parsed from --key; set by #parse.
      attr_reader :compare_keys
      # Declared but never assigned anywhere in this class.
      attr_reader :reader
      # The ::FlatKit::Sort built by #parse.
      attr_reader :sort

      # Parses argv, validates it and builds the ::FlatKit::Sort to run.
      #
      # Every problem is raised as ::Optimist::CommandlineError inside
      # with_standard_exception_handling, so Optimist reports it to the user:
      # an empty --key, more than 1 input path, or any ::FlatKit::Error raised
      # while constructing the sort.
      def parse
        parser = self.class.parser
        ::Optimist::with_standard_exception_handling(parser) do
          begin
            @opts = parser.parse(argv)
            @compare_keys = parse_compare_keys(opts[:key])
            paths = parser.leftovers
            raise ::Optimist::CommandlineError, "1 and only 1 input file is allowed" if paths.size > 1

            path = paths.first || "-" # default to stdin
            @sort = ::FlatKit::Sort.new(input: path, input_fallback: opts[:input_format],
                                        output: opts[:output], output_fallback: opts[:output_format],
                                        compare_fields: @compare_keys)
          rescue ::FlatKit::Error => e
            raise ::Optimist::CommandlineError, e.message
          end
        end
      end

      # Runs the sort prepared by #parse.
      def call
        sort.call
      end

      private

      # Splits the raw --key value using CSV rules so quoted field names work.
      # CSV.parse_line returns nil for an empty string, so that case is turned
      # into a command line error instead of handing nil compare fields on.
      def parse_compare_keys(raw)
        keys = CSV.parse_line(raw.to_s)
        raise ::Optimist::CommandlineError, "--key must list at least one field" if keys.nil? || keys.empty?

        keys
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
+ require 'set'
2
+
3
module FlatKit
  # Mixin intended to be `extend`ed onto a base class. It hooks `inherited`
  # so that each class records the classes that subclass it directly, and
  # offers a lookup over those recorded classes.
  module DescendantTracker
    # Ruby's subclassing hook. Chains to any other `inherited` definition
    # and then records +klass+, provided it is a plain Class instance.
    def inherited(klass)
      super
      children.add(klass) if klass.instance_of?(Class)
    end

    # The Set of classes recorded for the receiver, created on first access.
    def children
      @_children ||= Set.new
    end

    #
    # Find the first child that returns truthy from the given method with args
    #
    # Returns the matching class, or nil when no child matches.
    #
    def find_child(method, *args)
      children.find { |candidate| candidate.__send__(method, *args) }
    end
  end
end
@@ -0,0 +1,5 @@
1
module FlatKit
  # Base class for the errors raised by FlatKit itself; rescuing it also
  # covers the more specific errors nested underneath it.
  class Error < ::StandardError
    # Specialised error for a flat-file format that could not be determined.
    class UnknownFormat < Error
    end
  end
end
@@ -0,0 +1,34 @@
1
module FlatKit
  # Base class for flat-file formats. Subclasses are recorded through
  # DescendantTracker, and the lookup methods below ask each recorded
  # subclass `handles?` to pick the one responsible for a path or name.
  class Format
    extend DescendantTracker

    # Name of the format; every subclass must override this.
    #
    # Raises NotImplementedError naming the class that lacks the override.
    # (`self` is already the class here; `self.class` would just be `Class`.)
    def self.format_name
      raise NotImplementedError, "#{self} must implement #{self}.format_name"
    end

    # Instance-level convenience for the class-level format name.
    def format_name
      self.class.format_name
    end

    # Returns the first recorded subclass whose `handles?` is truthy for
    # +path+ (converted with to_s), or nil when none matches.
    def self.for(path)
      find_child(:handles?, path.to_s)
    end

    # Looks the format up by +path+ first and, when that finds nothing, by
    # the +fallback+ name instead. Returns the format class or nil.
    def self.for_with_fallback(path:, fallback: "auto")
      ::FlatKit::Format.for(path) || ::FlatKit::Format.for(fallback)
    end

    # Same as for_with_fallback, but raises instead of returning nil.
    #
    # Raises ::FlatKit::Error::UnknownFormat when neither +path+ nor
    # +fallback+ resolves to a format.
    def self.for_with_fallback!(path:, fallback: "auto")
      format = for_with_fallback(path: path, fallback: fallback)
      raise ::FlatKit::Error::UnknownFormat, "Unable to figure out format for '#{path}' with fallback '#{fallback}'" if format.nil?

      format
    end
  end
end
@@ -0,0 +1,32 @@
1
module FlatKit
  # Abstract base class for input sources. Subclasses are recorded through
  # DescendantTracker and chosen by Input.from via their `handles?` method.
  class Input
    extend DescendantTracker

    # Coerces +input+ into an Input.
    #
    # An object that already is an Input is returned untouched. Otherwise the
    # first recorded subclass whose `handles?` accepts +input+ is instantiated
    # with it.
    #
    # Raises FlatKit::Error when no subclass accepts +input+.
    def self.from(input)
      return input if input.is_a?(::FlatKit::Input)

      handler = find_child(:handles?, input)
      raise FlatKit::Error, "Unable to create input from #{input.class} : #{input.inspect}" unless handler

      handler.new(input)
    end

    # Abstract: the name of this input. Subclasses must override.
    def name
      raise NotImplementedError, "#{self.class} must implement #name"
    end

    # Abstract: the underlying io object. Subclasses must override.
    def io
      raise NotImplementedError, "#{self.class} must implement #io"
    end

    # Abstract: close the input. Subclasses must override.
    def close
      raise NotImplementedError, "#{self.class} must implement #close"
    end
  end
end
30
+
31
+ require 'flat_kit/input/io'
32
+ require 'flat_kit/input/file'
@@ -0,0 +1,53 @@
1
+ require 'zlib'
2
+
3
module FlatKit
  class Input
    # Input that reads from a file on disk, given as a String or a Pathname.
    # Files whose extension is ".gz" are read through Zlib::GzipReader.
    class File < Input
      # The Pathname of the file being read.
      attr_reader :path
      # Set to 0 on construction and not modified anywhere in this class.
      attr_reader :count
      # The opened stream for the file.
      attr_reader :io

      # True for a Pathname, and for any String that is not one of the stdin
      # names recognised by ::FlatKit::Input::IO; false for everything else.
      def self.handles?(obj)
        return true if obj.instance_of?(Pathname)

        # the stdin check keeps the result independent of which input class
        # happens to be asked first
        obj.instance_of?(String) && !::FlatKit::Input::IO.is_stdin?(obj)
      end

      # Opens the file named by +obj+ for reading.
      #
      # Raises FlatKit::Error when the path is not readable.
      def initialize(obj)
        @count = 0
        @path = Pathname.new(obj)
        raise FlatKit::Error, "Input #{obj} is not readable" unless @path.readable?

        @io = open_input(path)
      end

      # The file path as a String.
      def name
        @path.to_s
      end

      # Closes the underlying stream.
      def close
        io.close
      end

      private

      # Open the appropriate reader for the source file, chosen by extension.
      #
      # TODO: add in bzip
      def open_input(path)
        if path.extname == ".gz"
          Zlib::GzipReader.open(path.to_s)
        else
          path.open("rb")
        end
      end
    end
  end
end
@@ -0,0 +1,54 @@
1
# StringIO is referenced below and is not loaded by default.
require 'stringio'

module FlatKit
  class Input
    # Input that wraps an already-open stream: standard input (named by one of
    # the STDINS strings or given as the STDIN object), a ::File, a ::StringIO
    # or any other ::IO.
    class IO < Input
      # The strings accepted as meaning "read from standard input".
      STDINS = %w[ stdin STDIN - <stdin> ].freeze

      # A display name: "<STDIN>", the path of a ::File, or `inspect` of any
      # other stream.
      attr_reader :name
      # The wrapped stream.
      attr_reader :io

      # True when +obj+ denotes stdin or is a ::File, ::StringIO or ::IO.
      def self.handles?(obj)
        return true if is_stdin?(obj)

        [ ::File, ::StringIO, ::IO ].any? { |klass| obj.kind_of?(klass) }
      end

      # True when +obj+ is one of the STDINS strings or the STDIN stream.
      def self.is_stdin?(obj)
        case obj
        when String then STDINS.include?(obj)
        when ::IO   then obj == ::STDIN
        else false
        end
      end

      # Wraps +obj+ and derives the display name for it.
      #
      # Raises ::FlatKit::Error when +obj+ is none of the supported kinds.
      def initialize(obj)
        if self.class.is_stdin?(obj)
          @name = "<STDIN>"
          @io = $stdin
        elsif obj.kind_of?(::File)
          @name = obj.path
          @io = obj
        elsif obj.kind_of?(::StringIO) || obj.kind_of?(::IO)
          @name = obj.inspect
          @io = obj
        else
          raise ::FlatKit::Error, "Unable to create #{self.class} from #{obj.class} : #{obj.inspect}"
        end
      end

      # Closes the wrapped stream.
      #
      # NOTE(review): the stream was opened by the caller (or is $stdin) and
      # is still closed here -- confirm that callers expect this.
      def close
        @io.close
      end
    end
  end
end
@@ -0,0 +1,84 @@
1
module FlatKit
  # Private: This is a class used internally by MergeTree and should not be used
  # outside of that context.
  #
  # An InternalNode is one match in the tournament tree algorithm. It holds the
  # two nodes that compete in it (left and right), which of the two is the
  # current winner, and the leaf node associated with that winner.
  #
  class InternalNode

    include Comparable

    attr_accessor :left        # Internal Node
    attr_accessor :right       # Internal Node
    attr_accessor :winner      # Internal Node
    attr_accessor :next_level  # Who to tell
    attr_accessor :leaf        # winning leaf node

    # Registers this node as the next level of both competitors and plays
    # the first match between them.
    def initialize(left:, right:)
      @left = left
      @left.next_level = self

      @right = right
      @right.next_level = self
      @next_level = nil

      play
    end

    # The value of the current winner.
    def value
      winner.value
    end

    # An InternalNode is never a sentinel.
    def sentinel?
      false
    end

    # An InternalNode is never a leaf.
    def leaf?
      false
    end

    # The given node has no more data and is to be removed from the tree.
    #
    # Whichever side holds that exact object is replaced with a sentinel node
    # so that comparisons keep working. When both sides have become
    # sentinels, the next level (if there is one) is told that this node is
    # finished as well.
    #
    # Raises FlatKit::Error when the node is neither of our two players.
    #
    def player_finished(node)
      if left.equal?(node)
        @left = vacated_seat
      elsif right.equal?(node)
        @right = vacated_seat
      else
        raise FlatKit::Error, "Unknown player #{node}"
      end

      return unless @right.sentinel? && @left.sentinel?

      next_level&.player_finished(self)
    end

    # Plays the match: the left node wins unless the right one compares lower.
    # The winning leaf is only updated for a non-sentinel winner, and the
    # result is then propagated by replaying the next level, if any.
    def play
      @winner = if left <= right
                  left
                else
                  right
                end
      @leaf = winner.leaf unless @winner.sentinel?
      next_level.play if next_level
    end

    # Orders nodes by value; any sentinel sorts after this node.
    def <=>(other)
      other.sentinel? ? -1 : value <=> other.value
    end

    private

    # A fresh sentinel node that reports to this node.
    def vacated_seat
      SentinelInternalNode.new.tap { |seat| seat.next_level = self }
    end
  end
end