flat_kit 0.2.0 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CONTRIBUTING.md +1 -2
- data/HISTORY.md +15 -0
- data/Manifest.txt +21 -26
- data/{bin → exe}/fk +2 -1
- data/flat_kit.gemspec +33 -0
- data/lib/flat_kit/cli.rb +48 -23
- data/lib/flat_kit/command/cat.rb +34 -32
- data/lib/flat_kit/command/merge.rb +37 -36
- data/lib/flat_kit/command/sort.rb +37 -37
- data/lib/flat_kit/command/stats.rb +96 -0
- data/lib/flat_kit/command.rb +10 -10
- data/lib/flat_kit/descendant_tracker.rb +17 -5
- data/lib/flat_kit/error.rb +4 -0
- data/lib/flat_kit/event_emitter.rb +7 -4
- data/lib/flat_kit/field_stats.rb +246 -0
- data/lib/flat_kit/field_type/boolean_type.rb +52 -0
- data/lib/flat_kit/field_type/date_type.rb +181 -0
- data/lib/flat_kit/field_type/float_type.rb +43 -0
- data/lib/flat_kit/field_type/guess_type.rb +23 -0
- data/lib/flat_kit/field_type/integer_type.rb +36 -0
- data/lib/flat_kit/field_type/null_type.rb +39 -0
- data/lib/flat_kit/field_type/string_type.rb +24 -0
- data/lib/flat_kit/field_type/timestamp_type.rb +48 -0
- data/lib/flat_kit/field_type/unknown_type.rb +30 -0
- data/lib/flat_kit/field_type.rb +83 -0
- data/lib/flat_kit/format.rb +11 -5
- data/lib/flat_kit/input/file.rb +11 -9
- data/lib/flat_kit/input/io.rb +18 -21
- data/lib/flat_kit/input.rb +8 -7
- data/lib/flat_kit/internal_node.rb +22 -19
- data/lib/flat_kit/jsonl/format.rb +6 -2
- data/lib/flat_kit/jsonl/reader.rb +7 -4
- data/lib/flat_kit/jsonl/record.rb +16 -19
- data/lib/flat_kit/jsonl/writer.rb +25 -18
- data/lib/flat_kit/jsonl.rb +8 -4
- data/lib/flat_kit/leaf_node.rb +6 -5
- data/lib/flat_kit/log_formatter.rb +20 -0
- data/lib/flat_kit/logger.rb +12 -19
- data/lib/flat_kit/merge.rb +21 -16
- data/lib/flat_kit/merge_tree.rb +5 -6
- data/lib/flat_kit/output/file.rb +13 -9
- data/lib/flat_kit/output/io.rb +40 -35
- data/lib/flat_kit/output.rb +12 -7
- data/lib/flat_kit/position.rb +18 -0
- data/lib/flat_kit/reader.rb +8 -8
- data/lib/flat_kit/record.rb +12 -12
- data/lib/flat_kit/sentinel_internal_node.rb +6 -5
- data/lib/flat_kit/sentinel_leaf_node.rb +4 -1
- data/lib/flat_kit/sort.rb +8 -9
- data/lib/flat_kit/stat_type/nominal_stats.rb +64 -0
- data/lib/flat_kit/stat_type/numerical_stats.rb +120 -0
- data/lib/flat_kit/stat_type/ordinal_stats.rb +37 -0
- data/lib/flat_kit/stat_type.rb +70 -0
- data/lib/flat_kit/stats.rb +64 -0
- data/lib/flat_kit/writer.rb +17 -3
- data/lib/flat_kit/xsv/format.rb +6 -2
- data/lib/flat_kit/xsv/reader.rb +8 -6
- data/lib/flat_kit/xsv/record.rb +21 -15
- data/lib/flat_kit/xsv/writer.rb +36 -18
- data/lib/flat_kit/xsv.rb +7 -4
- data/lib/flat_kit.rb +33 -21
- metadata +38 -113
- data/Rakefile +0 -20
- data/tasks/default.rake +0 -242
- data/tasks/extension.rake +0 -38
- data/tasks/man.rake +0 -7
- data/tasks/this.rb +0 -208
- data/test/device_dataset.rb +0 -117
- data/test/input/test_file.rb +0 -73
- data/test/input/test_io.rb +0 -93
- data/test/jsonl/test_format.rb +0 -22
- data/test/jsonl/test_reader.rb +0 -49
- data/test/jsonl/test_record.rb +0 -61
- data/test/jsonl/test_writer.rb +0 -68
- data/test/output/test_file.rb +0 -60
- data/test/output/test_io.rb +0 -104
- data/test/test_conversions.rb +0 -45
- data/test/test_event_emitter.rb +0 -72
- data/test/test_format.rb +0 -24
- data/test/test_helper.rb +0 -26
- data/test/test_merge.rb +0 -40
- data/test/test_merge_tree.rb +0 -64
- data/test/test_version.rb +0 -11
- data/test/xsv/test_format.rb +0 -22
- data/test/xsv/test_reader.rb +0 -61
- data/test/xsv/test_record.rb +0 -69
- data/test/xsv/test_writer.rb +0 -68
@@ -0,0 +1,43 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module FlatKit
  class FieldType
    # Internal: Representing floating point data and conversion to it
    #
    class FloatType < FieldType
      # Returns the String name of this type.
      def self.type_name
        "float"
      end

      # Does the given data look like a float?
      #
      # data - the value to test
      #
      # Floats match and Integers do not. A String matches when it can be
      # parsed by Float() but is not claimed by IntegerType first. Anything
      # else does not match.
      #
      # Returns true or false.
      def self.matches?(data)
        case data
        when Float
          true
        when Integer
          false
        when String
          # integer-looking strings are parseable by Float() too, so let
          # IntegerType have them
          return false if IntegerType.matches?(data)

          maybe_float?(data)
        else
          false
        end
      end

      # Convert data to a Float.
      #
      # Returns the Float, or CoerceFailure when Float() rejects the data.
      def self.coerce(data)
        Float(data)
      rescue TypeError, ArgumentError
        CoerceFailure
      end

      # Can Float() parse the data?
      #
      # TypeError is rescued along with ArgumentError so that a non-String
      # argument such as nil yields false instead of raising out of a
      # predicate.
      #
      # Returns true or false.
      def self.maybe_float?(data)
        Float(data)
        true
      rescue TypeError, ArgumentError
        false
      end
    end
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module FlatKit
  class FieldType
    # Internal: GuessType stands in for a field whose type is not yet known
    # and still has to be guessed. It is a sentinel: it never matches any data
    # and never coerces anything.
    #
    class GuessType < FieldType
      class << self
        # Returns the class name as the type name.
        def type_name
          name
        end

        # Never matches, whatever the arguments.
        def matches?(*)
          false
        end

        # Never coerces, whatever the arguments.
        def coerce(*)
          CoerceFailure
        end
      end
    end
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module FlatKit
  class FieldType
    # Internal: Class representing the Integer type and coercion to it.
    #
    class IntegerType < FieldType
      # An optional sign followed by one or more digits.
      REGEX = /\A[-+]?\d+\Z/

      class << self
        # Returns the String name of this type.
        def type_name
          "integer"
        end

        # Integers match, Strings match when they look like REGEX, and
        # everything else (Floats included) does not.
        #
        # Returns true or false.
        def matches?(data)
          return true if data.is_a?(Integer)
          return false unless data.is_a?(String)

          REGEX.match?(data)
        end

        # Convert data with Integer().
        #
        # Returns the Integer, or CoerceFailure when Integer() raises
        # TypeError or ArgumentError.
        def coerce(data)
          Integer(data)
        rescue TypeError, ArgumentError
          CoerceFailure
        end
      end
    end
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module FlatKit
  class FieldType
    # Internal: Class representing the null type and coercion to it.
    #
    class NullType < FieldType
      # "null" or "nil" in any letter case, or a literal backslash-N.
      REGEX = Regexp.union(/\A(null|nil)\Z/i, /\A\\N\Z/)

      class << self
        # Returns the String name of this type.
        def type_name
          "null"
        end

        # nil matches, and so does a String that looks like REGEX.
        #
        # Returns true or false.
        def matches?(data)
          return true if data.nil?

          data.is_a?(String) && REGEX.match?(data)
        end

        # Returns nil for nil and for null-looking Strings, CoerceFailure for
        # everything else.
        def coerce(data)
          null_like = data.nil? || (data.is_a?(String) && REGEX.match?(data))
          null_like ? nil : CoerceFailure
        end
      end
    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module FlatKit
  class FieldType
    # Internal: StringType is the fallback type, which is why it carries a
    # lower weight than the other types that have string representations.
    #
    class StringType < FieldType
      class << self
        # Returns the String name of this type.
        def type_name
          "string"
        end

        # Only actual Strings match.
        #
        # Returns true or false.
        def matches?(data)
          data.is_a?(String)
        end

        # Returns data.to_s, or CoerceFailure if to_s raises a StandardError.
        def coerce(data)
          begin
            data.to_s
          rescue StandardError
            CoerceFailure
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Time.strptime is provided by the "time" standard library, not by core Time.
# Without it the NoMethodError would be swallowed by the rescue in coerce and
# every String would quietly fail to coerce.
require "time"

module FlatKit
  class FieldType
    # Internal: Type for all timestamp types more granular than Date.
    #
    class TimestampType < FieldType
      # The strptime formats that are tried, in order, when coercing a String.
      #
      # NOTE(review): in the formats with a literal "Z" the zone letter is
      # matched as plain text, so such values appear to be parsed as local
      # time before .utc is applied - confirm this is intended.
      #
      # Returns a frozen Array of format Strings.
      def self.parse_formats
        @parse_formats ||= [
          "%Y-%m-%d %H:%M:%S.%NZ",
          "%Y-%m-%d %H:%M:%S.%N",
          "%Y-%m-%dT%H:%M:%S.%N%z", # w3cdtf
          "%Y-%m-%d %H:%M:%S",
          "%Y-%m-%dT%H:%M:%S%z",
          "%Y-%m-%dT%H:%M:%SZ",
          "%Y%m%dT%H%M%S",
          "%a, %d %b %Y %H:%M:%S %z", # rfc2822, httpdate
        ].freeze
      end

      # Returns the String name of this type.
      def self.type_name
        "timestamp"
      end

      # Data matches when coercing it produces a Time.
      #
      # Returns true or false.
      def self.matches?(data)
        coerced = coerce(data)
        coerced.is_a?(Time)
      end

      # Convert data to a Time.
      #
      # A Time is returned untouched. A String is tried against each of the
      # parse_formats in turn and the first successful parse is returned,
      # converted to UTC.
      #
      # Returns a Time, or CoerceFailure when data is neither a Time nor a
      # String in one of the known formats.
      def self.coerce(data)
        case data
        when Time
          data
        when String
          parse_formats.each do |format|
            coerced_data = Time.strptime(data, format).utc
            return coerced_data
          rescue StandardError => _e
            # this format did not fit, move on to the next one
          end
          CoerceFailure
        else
          CoerceFailure
        end
      end
    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module FlatKit
  class FieldType
    # Internal: Unknown type, this is what we use for unknown values in the data
    #
    class UnknownType < FieldType
      # The spellings of "unknown" that are recognized, in any letter case.
      REGEX = %r{\A(na|n/a|unk|unknown)\Z}i

      # Returns the String name of this type.
      def self.type_name
        "unknown"
      end

      # Only Strings can match: either the empty String, or one that looks
      # like REGEX.
      #
      # Returns true or false.
      def self.matches?(data)
        return false unless data.is_a?(String)
        return true if data.empty?

        REGEX.match?(data)
      end

      # Hand back data unchanged when it represents an unknown value.
      #
      # The empty String is accepted here as well, so that anything matches?
      # says yes to is also coercible.
      #
      # Returns data, or CoerceFailure when it is not an unknown value or
      # cannot be tested against REGEX.
      def self.coerce(data)
        return data if data.is_a?(String) && data.empty?
        return data if REGEX.match?(data)

        CoerceFailure
      rescue StandardError
        CoerceFailure
      end
    end
  end
end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module FlatKit
  # Internal: The base class for all field types
  #
  # The type_name, matches? and coerce methods defined here only raise
  # NotImplementedError; concrete types are expected to override them.
  #
  class FieldType
    extend FlatKit::DescendantTracker

    # Marker object that coerce implementations return instead of a value
    # when the data cannot be converted.
    CoerceFailure = Class.new(::Object).freeze

    # The precedence table used to break ties when a value matches more than
    # one type. The higher weight wins.
    #
    # Returns a Hash of FieldType subclass => Integer weight.
    def self.weights
      @weights ||= {
        # Boolean has crossover with Integer so going to let it overrule Integer
        BooleanType => 5,

        # Integer could potentially overlap with Float, but it is more restrictive
        # so let it override Float
        IntegerType => 4,
        FloatType => 3,

        # Date and Timestamps string representation shouldn't intersect with anything so
        # leaving it at the same level as Null and Unknown
        DateType => 2,
        TimestampType => 2,

        # Null and Unknown shouldn't conflict since their string representations
        # do not intersect
        NullType => 2,
        UnknownType => 2,

        # Stringtype is the fallback for anything that has a string
        # representation, so it should lose out on integers, floats, nulls,
        # unknowns as strings
        StringType => 1,

        # at the bottom - since it should never match anywhere
        GuessType => 0,
      }
    end

    # All the types whose matches? accepts the data.
    #
    # Delegates to find_children, which is not defined in this class
    # (presumably it comes from DescendantTracker).
    def self.candidate_types(data)
      find_children(:matches?, data)
    end

    # rubocop:disable Style/RedundantSort
    # We need the stable sort, max_by(&:weight) returns the wrong one
    #
    # Returns the candidate type that sorts last by weight.
    def self.best_guess(data)
      candidate_types(data).sort_by(&:weight).last
    end
    # rubocop:enable Style/RedundantSort

    # Subclasses must override this to return their name.
    #
    # The message interpolates the class name. Interpolating type_name itself
    # would call this method again and recurse until SystemStackError instead
    # of raising the intended NotImplementedError.
    def self.type_name
      raise NotImplementedError, "must implement #{name}.type_name"
    end

    # Subclasses must override this to say whether the data is of their type.
    def self.matches?(data)
      raise NotImplementedError, "must implement #{name}.matches?(data)"
    end

    # Subclasses must override this to convert the data to their type.
    def self.coerce(data)
      raise NotImplementedError, "must implement #{name}.coerce(data)"
    end

    # Each type has a weight so if a value matches multiple types, then the list
    # can be compared to see where the tie breakers are
    #
    # All the weights are here so that we can see the order of precedence
    #
    # Raises NotImplementedError when the type has no entry in weights.
    def self.weight
      weights.fetch(self) { raise NotImplementedError, "No weight assigned to type #{self} - fix immediately" }
    end
  end
end
|
74
|
+
|
75
|
+
require "flat_kit/field_type/guess_type"
|
76
|
+
require "flat_kit/field_type/boolean_type"
|
77
|
+
require "flat_kit/field_type/date_type"
|
78
|
+
require "flat_kit/field_type/timestamp_type"
|
79
|
+
require "flat_kit/field_type/integer_type"
|
80
|
+
require "flat_kit/field_type/float_type"
|
81
|
+
require "flat_kit/field_type/null_type"
|
82
|
+
require "flat_kit/field_type/string_type"
|
83
|
+
require "flat_kit/field_type/unknown_type"
|
data/lib/flat_kit/format.rb
CHANGED
@@ -1,9 +1,13 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
4
|
+
# Internal: The base class of all data file format classes
|
5
|
+
#
|
2
6
|
class Format
|
3
7
|
extend DescendantTracker
|
4
8
|
|
5
9
|
def self.format_name
|
6
|
-
raise NotImplementedError, "#{self.class} must
|
10
|
+
raise NotImplementedError, "#{self.class} must implement #{self.class}.format_name"
|
7
11
|
end
|
8
12
|
|
9
13
|
def format_name
|
@@ -20,15 +24,17 @@ module FlatKit
|
|
20
24
|
return format unless format.nil?
|
21
25
|
|
22
26
|
# now try the fallback
|
23
|
-
|
24
|
-
return format
|
27
|
+
::FlatKit::Format.for(fallback)
|
25
28
|
end
|
26
29
|
|
27
30
|
def self.for_with_fallback!(path:, fallback: "auto")
|
28
31
|
format = for_with_fallback(path: path, fallback: fallback)
|
29
|
-
|
32
|
+
if format.nil?
|
33
|
+
raise ::FlatKit::Error::UnknownFormat,
|
34
|
+
"Unable to figure out format for '#{path}' with fallback '#{fallback}'"
|
35
|
+
end
|
30
36
|
|
31
|
-
|
37
|
+
format
|
32
38
|
end
|
33
39
|
end
|
34
40
|
end
|
data/lib/flat_kit/input/file.rb
CHANGED
@@ -1,25 +1,31 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "zlib"
|
4
|
+
require "pathname"
|
2
5
|
|
3
6
|
module FlatKit
|
4
7
|
class Input
|
8
|
+
# Internal: Handler for file based input
|
9
|
+
#
|
5
10
|
class File < Input
|
6
|
-
attr_reader :path
|
7
|
-
attr_reader :count
|
11
|
+
attr_reader :path, :count, :io
|
8
12
|
|
9
13
|
def self.handles?(obj)
|
10
14
|
return true if obj.instance_of?(Pathname)
|
11
15
|
return false unless obj.instance_of?(String)
|
12
16
|
|
13
17
|
# in case these get loaded in different orders
|
14
|
-
return false if ::FlatKit::Input::IO.
|
18
|
+
return false if ::FlatKit::Input::IO.stdin?(obj)
|
15
19
|
|
16
|
-
|
20
|
+
true
|
17
21
|
end
|
18
22
|
|
19
23
|
def initialize(obj)
|
24
|
+
super()
|
20
25
|
@count = 0
|
21
26
|
@path = Pathname.new(obj)
|
22
27
|
raise FlatKit::Error, "Input #{obj} is not readable" unless @path.readable?
|
28
|
+
|
23
29
|
@io = open_input(path)
|
24
30
|
end
|
25
31
|
|
@@ -31,10 +37,6 @@ module FlatKit
|
|
31
37
|
@io.close
|
32
38
|
end
|
33
39
|
|
34
|
-
def io
|
35
|
-
@io
|
36
|
-
end
|
37
|
-
|
38
40
|
private
|
39
41
|
|
40
42
|
# open the appropriate input type depending on the source file name
|
data/lib/flat_kit/input/io.rb
CHANGED
@@ -1,35 +1,38 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
2
4
|
class Input
|
5
|
+
# Internal: Handler for non-filebased input. Generally this is just stdin
|
6
|
+
#
|
3
7
|
class IO < Input
|
4
|
-
STDINS = %w[
|
8
|
+
STDINS = %w[stdin STDIN - <stdin>].freeze
|
5
9
|
|
6
10
|
def self.handles?(obj)
|
7
|
-
return true if
|
8
|
-
return true if [
|
9
|
-
|
11
|
+
return true if stdin?(obj)
|
12
|
+
return true if [::File, ::StringIO, ::IO].any? { |klass| obj.is_a?(klass) }
|
13
|
+
|
14
|
+
false
|
10
15
|
end
|
11
16
|
|
12
|
-
def self.
|
17
|
+
def self.stdin?(obj)
|
13
18
|
case obj
|
14
19
|
when String
|
15
20
|
return true if STDINS.include?(obj)
|
16
21
|
when ::IO
|
17
|
-
return true if obj ==
|
22
|
+
return true if obj == $stdin
|
18
23
|
end
|
19
|
-
|
24
|
+
false
|
20
25
|
end
|
21
26
|
|
22
27
|
def initialize(obj)
|
23
|
-
|
28
|
+
super()
|
29
|
+
if self.class.stdin?(obj)
|
24
30
|
@name = "<STDIN>"
|
25
31
|
@io = $stdin
|
26
|
-
elsif obj.
|
27
|
-
@name = obj.path
|
28
|
-
@io = obj
|
29
|
-
elsif obj.kind_of?(::StringIO) then
|
30
|
-
@name = obj.inspect
|
32
|
+
elsif obj.is_a?(::IO)
|
33
|
+
@name = (obj.respond_to?(:path) && obj.path) || obj.inspect
|
31
34
|
@io = obj
|
32
|
-
elsif obj.
|
35
|
+
elsif obj.is_a?(::StringIO)
|
33
36
|
@name = obj.inspect
|
34
37
|
@io = obj
|
35
38
|
else
|
@@ -37,18 +40,12 @@ module FlatKit
|
|
37
40
|
end
|
38
41
|
end
|
39
42
|
|
40
|
-
|
41
|
-
@name
|
42
|
-
end
|
43
|
+
attr_reader :name, :io
|
43
44
|
|
44
45
|
# this goes to an io stream and we are not in charge of opening it
|
45
46
|
def close
|
46
47
|
@io.close
|
47
48
|
end
|
48
|
-
|
49
|
-
def io
|
50
|
-
@io
|
51
|
-
end
|
52
49
|
end
|
53
50
|
end
|
54
51
|
end
|
data/lib/flat_kit/input.rb
CHANGED
@@ -1,14 +1,16 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
4
|
+
# Internal: Base class of all input handlers
|
5
|
+
#
|
2
6
|
class Input
|
3
7
|
extend DescendantTracker
|
4
8
|
|
5
9
|
def self.from(input)
|
6
|
-
return input if input.
|
10
|
+
return input if input.is_a?(::FlatKit::Input)
|
7
11
|
|
8
12
|
in_klass = find_child(:handles?, input)
|
9
|
-
if in_klass
|
10
|
-
return in_klass.new(input)
|
11
|
-
end
|
13
|
+
return in_klass.new(input) if in_klass
|
12
14
|
|
13
15
|
raise FlatKit::Error, "Unable to create input from #{input.class} : #{input.inspect}"
|
14
16
|
end
|
@@ -17,7 +19,6 @@ module FlatKit
|
|
17
19
|
raise NotImplementedError, "#{self.class} must implement #name"
|
18
20
|
end
|
19
21
|
|
20
|
-
#
|
21
22
|
def io
|
22
23
|
raise NotImplementedError, "#{self.class} must implement #io"
|
23
24
|
end
|
@@ -28,5 +29,5 @@ module FlatKit
|
|
28
29
|
end
|
29
30
|
end
|
30
31
|
|
31
|
-
require
|
32
|
-
require
|
32
|
+
require "flat_kit/input/io"
|
33
|
+
require "flat_kit/input/file"
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
2
4
|
# Private: This is a class used internally by MergeTree and should not be used
|
3
5
|
# outside of that context.
|
@@ -10,22 +12,24 @@ module FlatKit
|
|
10
12
|
# here.
|
11
13
|
#
|
12
14
|
class InternalNode
|
13
|
-
|
14
15
|
include Comparable
|
15
16
|
|
16
|
-
|
17
|
-
attr_accessor :right
|
18
|
-
|
19
|
-
|
20
|
-
attr_accessor :
|
17
|
+
# Internal Nodes
|
18
|
+
attr_accessor :left, :right, :winner
|
19
|
+
|
20
|
+
# Who to tell
|
21
|
+
attr_accessor :next_level
|
22
|
+
|
23
|
+
# winning leaf node
|
24
|
+
attr_accessor :leaf
|
21
25
|
|
22
26
|
def initialize(left:, right:)
|
23
|
-
@left
|
27
|
+
@left = left
|
24
28
|
@left.next_level = self
|
25
29
|
|
26
|
-
@right
|
30
|
+
@right = right
|
27
31
|
@right.next_level = self
|
28
|
-
@next_level
|
32
|
+
@next_level = nil
|
29
33
|
|
30
34
|
play
|
31
35
|
end
|
@@ -53,32 +57,31 @@ module FlatKit
|
|
53
57
|
# from the tree.
|
54
58
|
#
|
55
59
|
def player_finished(node)
|
56
|
-
if left.
|
60
|
+
if left.equal?(node)
|
57
61
|
@left = SentinelInternalNode.new
|
58
62
|
@left.next_level = self
|
59
|
-
elsif right.
|
63
|
+
elsif right.equal?(node)
|
60
64
|
@right = SentinelInternalNode.new
|
61
65
|
@right.next_level = self
|
62
66
|
else
|
63
67
|
raise FlatKit::Error, "Unknown player #{node}"
|
64
68
|
end
|
65
69
|
|
66
|
-
|
67
|
-
|
68
|
-
|
70
|
+
return unless @right.sentinel? && @left.sentinel?
|
71
|
+
|
72
|
+
next_level.player_finished(self) if next_level
|
69
73
|
end
|
70
74
|
|
71
75
|
def play
|
72
|
-
@winner = left <= right ? left : right
|
73
|
-
|
74
|
-
@leaf = winner.leaf
|
75
|
-
end
|
76
|
+
@winner = (left <= right) ? left : right
|
77
|
+
@leaf = winner.leaf unless @winner.sentinel?
|
76
78
|
next_level.play if next_level
|
77
79
|
end
|
78
80
|
|
79
81
|
def <=>(other)
|
80
82
|
return -1 if other.sentinel?
|
81
|
-
|
83
|
+
|
84
|
+
value <=> (other.value)
|
82
85
|
end
|
83
86
|
end
|
84
87
|
end
|
@@ -1,5 +1,9 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
2
4
|
module Jsonl
|
5
|
+
# Internal: JSONL format class holding the metadata about the JSONL format
|
6
|
+
#
|
3
7
|
class Format < ::FlatKit::Format
|
4
8
|
def self.format_name
|
5
9
|
"jsonl"
|
@@ -7,10 +11,10 @@ module FlatKit
|
|
7
11
|
|
8
12
|
def self.handles?(filename)
|
9
13
|
parts = filename.split(".")
|
10
|
-
%w[
|
14
|
+
%w[json jsonl ndjson].each do |ext|
|
11
15
|
return true if parts.include?(ext)
|
12
16
|
end
|
13
|
-
|
17
|
+
false
|
14
18
|
end
|
15
19
|
|
16
20
|
def self.reader
|
@@ -1,8 +1,11 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
2
4
|
module Jsonl
|
5
|
+
# Internal: Reader class that parses and yields records from JSONL files
|
6
|
+
#
|
3
7
|
class Reader < ::FlatKit::Reader
|
4
|
-
attr_reader :input
|
5
|
-
attr_reader :count
|
8
|
+
attr_reader :input, :count
|
6
9
|
|
7
10
|
def self.format_name
|
8
11
|
::FlatKit::Jsonl::Format.format_name
|
@@ -15,13 +18,13 @@ module FlatKit
|
|
15
18
|
end
|
16
19
|
|
17
20
|
def each
|
18
|
-
while line = input.io.gets
|
21
|
+
while (line = input.io.gets)
|
19
22
|
record = ::FlatKit::Jsonl::Record.new(data: line, compare_fields: compare_fields)
|
20
23
|
@count += 1
|
21
24
|
yield record
|
22
25
|
end
|
23
26
|
input.close
|
24
|
-
rescue => e
|
27
|
+
rescue StandardError => e
|
25
28
|
::FlatKit.logger.error "Error reading jsonl records from #{input.name}: #{e}"
|
26
29
|
raise ::FlatKit::Error, e
|
27
30
|
end
|