flat_kit 0.2.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CONTRIBUTING.md +1 -2
- data/HISTORY.md +15 -0
- data/Manifest.txt +21 -26
- data/{bin → exe}/fk +2 -1
- data/flat_kit.gemspec +33 -0
- data/lib/flat_kit/cli.rb +48 -23
- data/lib/flat_kit/command/cat.rb +34 -32
- data/lib/flat_kit/command/merge.rb +37 -36
- data/lib/flat_kit/command/sort.rb +37 -37
- data/lib/flat_kit/command/stats.rb +96 -0
- data/lib/flat_kit/command.rb +10 -10
- data/lib/flat_kit/descendant_tracker.rb +17 -5
- data/lib/flat_kit/error.rb +4 -0
- data/lib/flat_kit/event_emitter.rb +7 -4
- data/lib/flat_kit/field_stats.rb +246 -0
- data/lib/flat_kit/field_type/boolean_type.rb +52 -0
- data/lib/flat_kit/field_type/date_type.rb +181 -0
- data/lib/flat_kit/field_type/float_type.rb +43 -0
- data/lib/flat_kit/field_type/guess_type.rb +23 -0
- data/lib/flat_kit/field_type/integer_type.rb +36 -0
- data/lib/flat_kit/field_type/null_type.rb +39 -0
- data/lib/flat_kit/field_type/string_type.rb +24 -0
- data/lib/flat_kit/field_type/timestamp_type.rb +48 -0
- data/lib/flat_kit/field_type/unknown_type.rb +30 -0
- data/lib/flat_kit/field_type.rb +83 -0
- data/lib/flat_kit/format.rb +11 -5
- data/lib/flat_kit/input/file.rb +11 -9
- data/lib/flat_kit/input/io.rb +18 -21
- data/lib/flat_kit/input.rb +8 -7
- data/lib/flat_kit/internal_node.rb +22 -19
- data/lib/flat_kit/jsonl/format.rb +6 -2
- data/lib/flat_kit/jsonl/reader.rb +7 -4
- data/lib/flat_kit/jsonl/record.rb +16 -19
- data/lib/flat_kit/jsonl/writer.rb +25 -18
- data/lib/flat_kit/jsonl.rb +8 -4
- data/lib/flat_kit/leaf_node.rb +6 -5
- data/lib/flat_kit/log_formatter.rb +20 -0
- data/lib/flat_kit/logger.rb +12 -19
- data/lib/flat_kit/merge.rb +21 -16
- data/lib/flat_kit/merge_tree.rb +5 -6
- data/lib/flat_kit/output/file.rb +13 -9
- data/lib/flat_kit/output/io.rb +40 -35
- data/lib/flat_kit/output.rb +12 -7
- data/lib/flat_kit/position.rb +18 -0
- data/lib/flat_kit/reader.rb +8 -8
- data/lib/flat_kit/record.rb +12 -12
- data/lib/flat_kit/sentinel_internal_node.rb +6 -5
- data/lib/flat_kit/sentinel_leaf_node.rb +4 -1
- data/lib/flat_kit/sort.rb +8 -9
- data/lib/flat_kit/stat_type/nominal_stats.rb +64 -0
- data/lib/flat_kit/stat_type/numerical_stats.rb +120 -0
- data/lib/flat_kit/stat_type/ordinal_stats.rb +37 -0
- data/lib/flat_kit/stat_type.rb +70 -0
- data/lib/flat_kit/stats.rb +64 -0
- data/lib/flat_kit/writer.rb +17 -3
- data/lib/flat_kit/xsv/format.rb +6 -2
- data/lib/flat_kit/xsv/reader.rb +8 -6
- data/lib/flat_kit/xsv/record.rb +21 -15
- data/lib/flat_kit/xsv/writer.rb +36 -18
- data/lib/flat_kit/xsv.rb +7 -4
- data/lib/flat_kit.rb +33 -21
- metadata +38 -113
- data/Rakefile +0 -20
- data/tasks/default.rake +0 -242
- data/tasks/extension.rake +0 -38
- data/tasks/man.rake +0 -7
- data/tasks/this.rb +0 -208
- data/test/device_dataset.rb +0 -117
- data/test/input/test_file.rb +0 -73
- data/test/input/test_io.rb +0 -93
- data/test/jsonl/test_format.rb +0 -22
- data/test/jsonl/test_reader.rb +0 -49
- data/test/jsonl/test_record.rb +0 -61
- data/test/jsonl/test_writer.rb +0 -68
- data/test/output/test_file.rb +0 -60
- data/test/output/test_io.rb +0 -104
- data/test/test_conversions.rb +0 -45
- data/test/test_event_emitter.rb +0 -72
- data/test/test_format.rb +0 -24
- data/test/test_helper.rb +0 -26
- data/test/test_merge.rb +0 -40
- data/test/test_merge_tree.rb +0 -64
- data/test/test_version.rb +0 -11
- data/test/xsv/test_format.rb +0 -22
- data/test/xsv/test_reader.rb +0 -61
- data/test/xsv/test_record.rb +0 -69
- data/test/xsv/test_writer.rb +0 -68
@@ -0,0 +1,43 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module FlatKit
  class FieldType
    # Internal: Represents floating point data and the conversion to it.
    #
    class FloatType < FieldType
      # Returns the short String name of this type.
      def self.type_name
        "float"
      end

      # Is the given data a float, or a String that reads as one?
      #
      # data - the value to test
      #
      # Returns true for Float instances and for Strings that Kernel#Float can
      # parse but that IntegerType does not already claim; false otherwise
      # (including Integer instances).
      def self.matches?(data)
        case data
        when Float
          true
        when String
          # integer looking strings belong to IntegerType
          !IntegerType.matches?(data) && maybe_float?(data)
        else
          false
        end
      end

      # Convert the data to a Float.
      #
      # Returns the Float, or CoerceFailure when Kernel#Float rejects the data.
      def self.coerce(data)
        Float(data)
      rescue TypeError, ArgumentError
        CoerceFailure
      end

      # Can Kernel#Float convert the data?
      #
      # Kernel#Float raises ArgumentError for unparseable strings and TypeError
      # for values it cannot convert at all (nil for example); both mean "not a
      # float" so both answer false rather than escaping from a predicate.
      #
      # Returns true or false.
      def self.maybe_float?(data)
        Float(data)
        true
      rescue ArgumentError, TypeError
        false
      end
    end
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module FlatKit
  class FieldType
    # Internal: Sentinel field type used while the real type of a field is not
    # yet known and still has to be guessed. It never matches any data and
    # never coerces anything.
    #
    class GuessType < FieldType
      class << self
        # Returns the class name itself as the type name.
        def type_name
          name
        end

        # Accepts any arguments and always answers false.
        def matches?(*)
          false
        end

        # Accepts any arguments and always answers CoerceFailure.
        def coerce(*)
          CoerceFailure
        end
      end
    end
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module FlatKit
  class FieldType
    # Internal: Class representing the Integer type and coercion to it.
    #
    class IntegerType < FieldType
      # An optionally signed run of decimal digits.
      REGEX = /\A[-+]?\d+\Z/

      # Returns the short String name of this type.
      def self.type_name
        "integer"
      end

      # Is the given data an integer, or a String of decimal digits?
      #
      # data - the value to test
      #
      # Returns true for Integer instances and Strings matching REGEX, false
      # for everything else (including Float instances).
      def self.matches?(data)
        case data
        when Integer
          true
        when String
          REGEX.match?(data)
        else
          false
        end
      end

      # Convert the data to an Integer.
      #
      # Strings that matches? accepts are always parsed as base 10. Without an
      # explicit base Kernel#Integer honors radix prefixes, and a leading zero
      # is an octal prefix: "010" would become 8 and "08" would be rejected.
      # Every other input keeps going through plain Kernel#Integer.
      #
      # Returns the Integer, or CoerceFailure when the data cannot be converted.
      def self.coerce(data)
        return Integer(data, 10) if data.is_a?(String) && REGEX.match?(data)

        Integer(data)
      rescue TypeError, ArgumentError
        CoerceFailure
      end
    end
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module FlatKit
  class FieldType
    # Internal: Field type for null values: nil itself and the String
    # spellings of it accepted by REGEX.
    #
    class NullType < FieldType
      # "null" or "nil" in any letter case, or a backslash followed by N.
      REGEX = Regexp.union(/\A(null|nil)\Z/i, /\A\\N\Z/)

      class << self
        # Returns the short String name of this type.
        def type_name
          "null"
        end

        # True for nil and for Strings matching REGEX, false for anything else.
        def matches?(data)
          return true if data.nil?
          return false unless data.is_a?(String)

          REGEX.match?(data)
        end

        # Returns nil for nil and for Strings matching REGEX, CoerceFailure for
        # every other value.
        def coerce(data)
          return nil if data.nil?
          return nil if data.is_a?(String) && REGEX.match?(data)

          CoerceFailure
        end
      end
    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module FlatKit
  class FieldType
    # Internal: StringType is the fallback type; types whose values also have
    # a string representation are expected to win over it.
    #
    class StringType < FieldType
      class << self
        # Returns the short String name of this type.
        def type_name
          "string"
        end

        # True only when the data is a String.
        def matches?(data)
          String === data
        end

        # Returns the to_s form of the data, or CoerceFailure if to_s raises.
        def coerce(data)
          data.to_s
        rescue StandardError
          CoerceFailure
        end
      end
    end
  end
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "time" # Time.strptime is defined by the "time" standard library

module FlatKit
  class FieldType
    # Internal: Type for all timestamp types more granular than Date.
    #
    class TimestampType < FieldType
      # The strptime formats tried, in this order, when coercing a String.
      #
      # Returns a frozen Array of format Strings.
      def self.parse_formats
        @parse_formats ||= [
          # %z accepts the "Z" designator as UTC. A literal "Z" in the format
          # would drop the zone, the value would be read in the local timezone
          # and then shifted by the local offset when converted to UTC.
          "%Y-%m-%d %H:%M:%S.%N%z",
          "%Y-%m-%d %H:%M:%S.%N",
          "%Y-%m-%dT%H:%M:%S.%N%z", # w3cdtf
          "%Y-%m-%d %H:%M:%S",
          "%Y-%m-%dT%H:%M:%S%z",
          "%Y-%m-%dT%H:%M:%SZ",
          "%Y%m%dT%H%M%S",
          "%a, %d %b %Y %H:%M:%S %z", # rfc2822, httpdate
        ].freeze
      end

      # Returns the short String name of this type.
      def self.type_name
        "timestamp"
      end

      # True when coerce can turn the data into a Time.
      def self.matches?(data)
        coerce(data).is_a?(Time)
      end

      # Convert the data to a Time.
      #
      # data - a Time (returned untouched) or a String in one of parse_formats
      #
      # Returns the Time (in UTC when parsed from a String) of the first format
      # that parses, or CoerceFailure for unparseable Strings and other types.
      def self.coerce(data)
        case data
        when Time
          data
        when String
          parse_formats.each do |format|
            return Time.strptime(data, format).utc
          rescue StandardError
            # this format did not fit, try the next one
            next
          end
          CoerceFailure
        else
          CoerceFailure
        end
      end
    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module FlatKit
  class FieldType
    # Internal: Unknown type, this is what we use for unknown values in the data
    #
    class UnknownType < FieldType
      # The spellings of "unknown" that are recognized, in any letter case.
      REGEX = %r{\A(na|n/a|unk|unknown)\Z}i

      # Returns the short String name of this type.
      def self.type_name
        "unknown"
      end

      # Is the data an unknown marker?
      #
      # Returns true for the empty String and for Strings matching REGEX,
      # false for everything else.
      def self.matches?(data)
        return false unless data.is_a?(String)

        data.empty? || REGEX.match?(data)
      end

      # Coerce the data to its unknown representation, which is the data itself.
      #
      # The empty String is accepted here as well so that coerce agrees with
      # matches?, which treats it as unknown.
      #
      # Returns the data when it is an unknown marker, CoerceFailure otherwise
      # (also when the regular expression match itself raises).
      def self.coerce(data)
        return data if data.is_a?(String) && data.empty?
        return data if REGEX.match?(data)

        CoerceFailure
      rescue StandardError
        CoerceFailure
      end
    end
  end
end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module FlatKit
  # Internal: The base class for all field types
  #
  class FieldType
    extend FlatKit::DescendantTracker

    # Sentinel object handed back by coerce implementations when the data
    # cannot be converted to the type.
    CoerceFailure = Class.new(::Object).freeze

    # The precedence table of all field types, a higher weight wins.
    #
    # Returns a Hash of type class to Integer weight.
    def self.weights
      @weights ||= {
        # Boolean has crossover with Integer so going to let it overrule Integer
        BooleanType => 5,

        # Integer could potentially overlap with Float, but it is more restrictive
        # so let it override Float
        IntegerType => 4,
        FloatType => 3,

        # Date and Timestamps string representation shouldn't intersect with anything so
        # leaving it at the same level as Null and Unknown
        DateType => 2,
        TimestampType => 2,

        # Null and Unknown shouldn't conflict since their string representations
        # do not intersect
        NullType => 2,
        UnknownType => 2,

        # Stringtype is the fallback for anything that has a string
        # representation, so it should lose out on integers, floats, nulls,
        # unknowns as strings
        StringType => 1,

        # at the bottom - since it should never match anywhere
        GuessType => 0,
      }
    end

    # The types that claim the data, as answered by find_children with
    # :matches? (find_children is not defined in this class; presumably it
    # comes from DescendantTracker - confirm there).
    def self.candidate_types(data)
      find_children(:matches?, data)
    end

    # rubocop:disable Style/RedundantSort
    # We need the stable sort, max_by(&:weight) returns the wrong one
    def self.best_guess(data)
      candidate_types(data).sort_by(&:weight).last
    end
    # rubocop:enable Style/RedundantSort

    # Subclasses must return their short type name.
    #
    # The message interpolates the class name: interpolating type_name itself
    # would call this method again and recurse until the stack overflows
    # instead of raising NotImplementedError.
    def self.type_name
      raise NotImplementedError, "must implement #{name}.type_name"
    end

    # Subclasses must answer whether the data is of their type.
    def self.matches?(data)
      raise NotImplementedError, "must implement #{name}.matches?(data)"
    end

    # Subclasses must convert the data to their type.
    def self.coerce(data)
      raise NotImplementedError, "must implement #{name}.coerce(data)"
    end

    # Each type has a weight so if a value matches multiple types, then the list
    # can be compared to see where the tie breakers are
    #
    # All the weights are here so that we can see the order of precedence
    #
    # Raises NotImplementedError when the type has no entry in weights.
    def self.weight
      weights.fetch(self) { raise NotImplementedError, "No weight assigned to type #{self} - fix immediately" }
    end
  end
end
|
74
|
+
|
75
|
+
require "flat_kit/field_type/guess_type"
|
76
|
+
require "flat_kit/field_type/boolean_type"
|
77
|
+
require "flat_kit/field_type/date_type"
|
78
|
+
require "flat_kit/field_type/timestamp_type"
|
79
|
+
require "flat_kit/field_type/integer_type"
|
80
|
+
require "flat_kit/field_type/float_type"
|
81
|
+
require "flat_kit/field_type/null_type"
|
82
|
+
require "flat_kit/field_type/string_type"
|
83
|
+
require "flat_kit/field_type/unknown_type"
|
data/lib/flat_kit/format.rb
CHANGED
@@ -1,9 +1,13 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
4
|
+
# Internal: The base class of all data file format classes
|
5
|
+
#
|
2
6
|
class Format
|
3
7
|
extend DescendantTracker
|
4
8
|
|
5
9
|
# Subclasses must return the String name of their format.
#
# The receiver itself is interpolated: inside a class method self.class is
# always Class, which would make the message read
# "Class must implement Class.format_name" for every format.
def self.format_name
  raise NotImplementedError, "#{self} must implement #{self}.format_name"
end
|
8
12
|
|
9
13
|
def format_name
|
@@ -20,15 +24,17 @@ module FlatKit
|
|
20
24
|
return format unless format.nil?
|
21
25
|
|
22
26
|
# now try the fallback
|
23
|
-
|
24
|
-
return format
|
27
|
+
::FlatKit::Format.for(fallback)
|
25
28
|
end
|
26
29
|
|
27
30
|
def self.for_with_fallback!(path:, fallback: "auto")
|
28
31
|
format = for_with_fallback(path: path, fallback: fallback)
|
29
|
-
|
32
|
+
if format.nil?
|
33
|
+
raise ::FlatKit::Error::UnknownFormat,
|
34
|
+
"Unable to figure out format for '#{path}' with fallback '#{fallback}'"
|
35
|
+
end
|
30
36
|
|
31
|
-
|
37
|
+
format
|
32
38
|
end
|
33
39
|
end
|
34
40
|
end
|
data/lib/flat_kit/input/file.rb
CHANGED
@@ -1,25 +1,31 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "zlib"
|
4
|
+
require "pathname"
|
2
5
|
|
3
6
|
module FlatKit
|
4
7
|
class Input
|
8
|
+
# Internal: Handler for file based input
|
9
|
+
#
|
5
10
|
class File < Input
|
6
|
-
attr_reader :path
|
7
|
-
attr_reader :count
|
11
|
+
attr_reader :path, :count, :io
|
8
12
|
|
9
13
|
def self.handles?(obj)
|
10
14
|
return true if obj.instance_of?(Pathname)
|
11
15
|
return false unless obj.instance_of?(String)
|
12
16
|
|
13
17
|
# in case these get loaded in different orders
|
14
|
-
return false if ::FlatKit::Input::IO.
|
18
|
+
return false if ::FlatKit::Input::IO.stdin?(obj)
|
15
19
|
|
16
|
-
|
20
|
+
true
|
17
21
|
end
|
18
22
|
|
19
23
|
def initialize(obj)
|
24
|
+
super()
|
20
25
|
@count = 0
|
21
26
|
@path = Pathname.new(obj)
|
22
27
|
raise FlatKit::Error, "Input #{obj} is not readable" unless @path.readable?
|
28
|
+
|
23
29
|
@io = open_input(path)
|
24
30
|
end
|
25
31
|
|
@@ -31,10 +37,6 @@ module FlatKit
|
|
31
37
|
@io.close
|
32
38
|
end
|
33
39
|
|
34
|
-
def io
|
35
|
-
@io
|
36
|
-
end
|
37
|
-
|
38
40
|
private
|
39
41
|
|
40
42
|
# open the appropriate input type depending on the source file name
|
data/lib/flat_kit/input/io.rb
CHANGED
@@ -1,35 +1,38 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
2
4
|
class Input
|
5
|
+
# Internal: Handler for non-filebased input. Generally this is just stdin
|
6
|
+
#
|
3
7
|
class IO < Input
|
4
|
-
STDINS = %w[
|
8
|
+
STDINS = %w[stdin STDIN - <stdin>].freeze
|
5
9
|
|
6
10
|
def self.handles?(obj)
|
7
|
-
return true if
|
8
|
-
return true if [
|
9
|
-
|
11
|
+
return true if stdin?(obj)
|
12
|
+
return true if [::File, ::StringIO, ::IO].any? { |klass| obj.is_a?(klass) }
|
13
|
+
|
14
|
+
false
|
10
15
|
end
|
11
16
|
|
12
|
-
def self.
|
17
|
+
def self.stdin?(obj)
|
13
18
|
case obj
|
14
19
|
when String
|
15
20
|
return true if STDINS.include?(obj)
|
16
21
|
when ::IO
|
17
|
-
return true if obj ==
|
22
|
+
return true if obj == $stdin
|
18
23
|
end
|
19
|
-
|
24
|
+
false
|
20
25
|
end
|
21
26
|
|
22
27
|
def initialize(obj)
|
23
|
-
|
28
|
+
super()
|
29
|
+
if self.class.stdin?(obj)
|
24
30
|
@name = "<STDIN>"
|
25
31
|
@io = $stdin
|
26
|
-
elsif obj.
|
27
|
-
@name = obj.path
|
28
|
-
@io = obj
|
29
|
-
elsif obj.kind_of?(::StringIO) then
|
30
|
-
@name = obj.inspect
|
32
|
+
elsif obj.is_a?(::IO)
|
33
|
+
@name = (obj.respond_to?(:path) && obj.path) || obj.inspect
|
31
34
|
@io = obj
|
32
|
-
elsif obj.
|
35
|
+
elsif obj.is_a?(::StringIO)
|
33
36
|
@name = obj.inspect
|
34
37
|
@io = obj
|
35
38
|
else
|
@@ -37,18 +40,12 @@ module FlatKit
|
|
37
40
|
end
|
38
41
|
end
|
39
42
|
|
40
|
-
|
41
|
-
@name
|
42
|
-
end
|
43
|
+
attr_reader :name, :io
|
43
44
|
|
44
45
|
# this goes to an io stream and we are not in charge of opening it
|
45
46
|
def close
|
46
47
|
@io.close
|
47
48
|
end
|
48
|
-
|
49
|
-
def io
|
50
|
-
@io
|
51
|
-
end
|
52
49
|
end
|
53
50
|
end
|
54
51
|
end
|
data/lib/flat_kit/input.rb
CHANGED
@@ -1,14 +1,16 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
4
|
+
# Internal: Base class of all input handlers
|
5
|
+
#
|
2
6
|
class Input
|
3
7
|
extend DescendantTracker
|
4
8
|
|
5
9
|
def self.from(input)
|
6
|
-
return input if input.
|
10
|
+
return input if input.is_a?(::FlatKit::Input)
|
7
11
|
|
8
12
|
in_klass = find_child(:handles?, input)
|
9
|
-
if in_klass
|
10
|
-
return in_klass.new(input)
|
11
|
-
end
|
13
|
+
return in_klass.new(input) if in_klass
|
12
14
|
|
13
15
|
raise FlatKit::Error, "Unable to create input from #{input.class} : #{input.inspect}"
|
14
16
|
end
|
@@ -17,7 +19,6 @@ module FlatKit
|
|
17
19
|
raise NotImplementedError, "#{self.class} must implement #name"
|
18
20
|
end
|
19
21
|
|
20
|
-
#
|
21
22
|
def io
|
22
23
|
raise NotImplementedError, "#{self.class} must implement #io"
|
23
24
|
end
|
@@ -28,5 +29,5 @@ module FlatKit
|
|
28
29
|
end
|
29
30
|
end
|
30
31
|
|
31
|
-
require
|
32
|
-
require
|
32
|
+
require "flat_kit/input/io"
|
33
|
+
require "flat_kit/input/file"
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
2
4
|
# Private: This is a class used internally by MergeTree and should not be used
|
3
5
|
# outside of that context.
|
@@ -10,22 +12,24 @@ module FlatKit
|
|
10
12
|
# here.
|
11
13
|
#
|
12
14
|
class InternalNode
|
13
|
-
|
14
15
|
include Comparable
|
15
16
|
|
16
|
-
|
17
|
-
attr_accessor :right
|
18
|
-
|
19
|
-
|
20
|
-
attr_accessor :
|
17
|
+
# Internal Nodes
|
18
|
+
attr_accessor :left, :right, :winner
|
19
|
+
|
20
|
+
# Who to tell
|
21
|
+
attr_accessor :next_level
|
22
|
+
|
23
|
+
# winning leaf node
|
24
|
+
attr_accessor :leaf
|
21
25
|
|
22
26
|
def initialize(left:, right:)
|
23
|
-
@left
|
27
|
+
@left = left
|
24
28
|
@left.next_level = self
|
25
29
|
|
26
|
-
@right
|
30
|
+
@right = right
|
27
31
|
@right.next_level = self
|
28
|
-
@next_level
|
32
|
+
@next_level = nil
|
29
33
|
|
30
34
|
play
|
31
35
|
end
|
@@ -53,32 +57,31 @@ module FlatKit
|
|
53
57
|
# from the tree.
|
54
58
|
#
|
55
59
|
def player_finished(node)
|
56
|
-
if left.
|
60
|
+
if left.equal?(node)
|
57
61
|
@left = SentinelInternalNode.new
|
58
62
|
@left.next_level = self
|
59
|
-
elsif right.
|
63
|
+
elsif right.equal?(node)
|
60
64
|
@right = SentinelInternalNode.new
|
61
65
|
@right.next_level = self
|
62
66
|
else
|
63
67
|
raise FlatKit::Error, "Unknown player #{node}"
|
64
68
|
end
|
65
69
|
|
66
|
-
|
67
|
-
|
68
|
-
|
70
|
+
return unless @right.sentinel? && @left.sentinel?
|
71
|
+
|
72
|
+
next_level.player_finished(self) if next_level
|
69
73
|
end
|
70
74
|
|
71
75
|
def play
|
72
|
-
@winner = left <= right ? left : right
|
73
|
-
|
74
|
-
@leaf = winner.leaf
|
75
|
-
end
|
76
|
+
@winner = (left <= right) ? left : right
|
77
|
+
@leaf = winner.leaf unless @winner.sentinel?
|
76
78
|
next_level.play if next_level
|
77
79
|
end
|
78
80
|
|
79
81
|
def <=>(other)
|
80
82
|
return -1 if other.sentinel?
|
81
|
-
|
83
|
+
|
84
|
+
value <=> (other.value)
|
82
85
|
end
|
83
86
|
end
|
84
87
|
end
|
@@ -1,5 +1,9 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
2
4
|
module Jsonl
|
5
|
+
# Internal: JSONL format class holding the metadata about the JSONL format
|
6
|
+
#
|
3
7
|
class Format < ::FlatKit::Format
|
4
8
|
def self.format_name
|
5
9
|
"jsonl"
|
@@ -7,10 +11,10 @@ module FlatKit
|
|
7
11
|
|
8
12
|
def self.handles?(filename)
|
9
13
|
parts = filename.split(".")
|
10
|
-
%w[
|
14
|
+
%w[json jsonl ndjson].each do |ext|
|
11
15
|
return true if parts.include?(ext)
|
12
16
|
end
|
13
|
-
|
17
|
+
false
|
14
18
|
end
|
15
19
|
|
16
20
|
def self.reader
|
@@ -1,8 +1,11 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
2
4
|
module Jsonl
|
5
|
+
# Internal: Reader class that parses and yields records from JSONL files
|
6
|
+
#
|
3
7
|
class Reader < ::FlatKit::Reader
|
4
|
-
attr_reader :input
|
5
|
-
attr_reader :count
|
8
|
+
attr_reader :input, :count
|
6
9
|
|
7
10
|
def self.format_name
|
8
11
|
::FlatKit::Jsonl::Format.format_name
|
@@ -15,13 +18,13 @@ module FlatKit
|
|
15
18
|
end
|
16
19
|
|
17
20
|
def each
|
18
|
-
while line = input.io.gets
|
21
|
+
while (line = input.io.gets)
|
19
22
|
record = ::FlatKit::Jsonl::Record.new(data: line, compare_fields: compare_fields)
|
20
23
|
@count += 1
|
21
24
|
yield record
|
22
25
|
end
|
23
26
|
input.close
|
24
|
-
rescue => e
|
27
|
+
rescue StandardError => e
|
25
28
|
::FlatKit.logger.error "Error reading jsonl records from #{input.name}: #{e}"
|
26
29
|
raise ::FlatKit::Error, e
|
27
30
|
end
|