flat_kit 0.3.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CONTRIBUTING.md +1 -2
- data/HISTORY.md +9 -0
- data/Manifest.txt +3 -42
- data/{bin → exe}/fk +2 -1
- data/flat_kit.gemspec +33 -0
- data/lib/flat_kit/cli.rb +46 -32
- data/lib/flat_kit/command/cat.rb +34 -32
- data/lib/flat_kit/command/merge.rb +37 -36
- data/lib/flat_kit/command/sort.rb +37 -37
- data/lib/flat_kit/command/stats.rb +41 -39
- data/lib/flat_kit/command.rb +10 -11
- data/lib/flat_kit/descendant_tracker.rb +9 -6
- data/lib/flat_kit/error.rb +4 -0
- data/lib/flat_kit/event_emitter.rb +5 -2
- data/lib/flat_kit/field_stats.rb +31 -26
- data/lib/flat_kit/field_type/boolean_type.rb +9 -5
- data/lib/flat_kit/field_type/date_type.rb +19 -17
- data/lib/flat_kit/field_type/float_type.rb +15 -9
- data/lib/flat_kit/field_type/guess_type.rb +9 -6
- data/lib/flat_kit/field_type/integer_type.rb +6 -4
- data/lib/flat_kit/field_type/null_type.rb +5 -1
- data/lib/flat_kit/field_type/string_type.rb +8 -6
- data/lib/flat_kit/field_type/timestamp_type.rb +11 -10
- data/lib/flat_kit/field_type/unknown_type.rb +12 -8
- data/lib/flat_kit/field_type.rb +52 -44
- data/lib/flat_kit/format.rb +11 -5
- data/lib/flat_kit/input/file.rb +11 -9
- data/lib/flat_kit/input/io.rb +18 -21
- data/lib/flat_kit/input.rb +8 -7
- data/lib/flat_kit/internal_node.rb +22 -19
- data/lib/flat_kit/jsonl/format.rb +6 -2
- data/lib/flat_kit/jsonl/reader.rb +7 -4
- data/lib/flat_kit/jsonl/record.rb +15 -18
- data/lib/flat_kit/jsonl/writer.rb +8 -10
- data/lib/flat_kit/jsonl.rb +8 -4
- data/lib/flat_kit/leaf_node.rb +6 -5
- data/lib/flat_kit/log_formatter.rb +20 -0
- data/lib/flat_kit/logger.rb +12 -19
- data/lib/flat_kit/merge.rb +21 -18
- data/lib/flat_kit/merge_tree.rb +5 -6
- data/lib/flat_kit/output/file.rb +13 -9
- data/lib/flat_kit/output/io.rb +40 -35
- data/lib/flat_kit/output.rb +8 -7
- data/lib/flat_kit/position.rb +3 -4
- data/lib/flat_kit/reader.rb +8 -8
- data/lib/flat_kit/record.rb +12 -12
- data/lib/flat_kit/sentinel_internal_node.rb +6 -5
- data/lib/flat_kit/sentinel_leaf_node.rb +4 -1
- data/lib/flat_kit/sort.rb +8 -9
- data/lib/flat_kit/stat_type/nominal_stats.rb +13 -7
- data/lib/flat_kit/stat_type/numerical_stats.rb +18 -18
- data/lib/flat_kit/stat_type/ordinal_stats.rb +8 -13
- data/lib/flat_kit/stat_type.rb +18 -13
- data/lib/flat_kit/stats.rb +12 -14
- data/lib/flat_kit/writer.rb +5 -6
- data/lib/flat_kit/xsv/format.rb +6 -2
- data/lib/flat_kit/xsv/reader.rb +8 -6
- data/lib/flat_kit/xsv/record.rb +21 -15
- data/lib/flat_kit/xsv/writer.rb +13 -10
- data/lib/flat_kit/xsv.rb +7 -4
- data/lib/flat_kit.rb +31 -26
- metadata +20 -158
- data/Rakefile +0 -21
- data/examples/stream-active-record-to-csv.rb +0 -42
- data/tasks/default.rake +0 -242
- data/tasks/extension.rake +0 -38
- data/tasks/man.rake +0 -7
- data/tasks/this.rb +0 -208
- data/test/device_dataset.rb +0 -117
- data/test/field_type/test_boolean_type.rb +0 -65
- data/test/field_type/test_date_type.rb +0 -71
- data/test/field_type/test_float_type.rb +0 -56
- data/test/field_type/test_guess_type.rb +0 -14
- data/test/field_type/test_integer_type.rb +0 -52
- data/test/field_type/test_null_type.rb +0 -41
- data/test/field_type/test_string_type.rb +0 -18
- data/test/field_type/test_timestamp_type.rb +0 -108
- data/test/field_type/test_unknown_type.rb +0 -35
- data/test/input/test_file.rb +0 -73
- data/test/input/test_io.rb +0 -93
- data/test/jsonl/test_format.rb +0 -22
- data/test/jsonl/test_reader.rb +0 -49
- data/test/jsonl/test_record.rb +0 -61
- data/test/jsonl/test_writer.rb +0 -86
- data/test/output/test_file.rb +0 -60
- data/test/output/test_io.rb +0 -104
- data/test/run +0 -23
- data/test/stat_type/test_nominal_stats.rb +0 -69
- data/test/stat_type/test_numerical_stats.rb +0 -118
- data/test/stat_type/test_ordinal_stats.rb +0 -92
- data/test/test_conversions.rb +0 -45
- data/test/test_event_emitter.rb +0 -89
- data/test/test_field_stats.rb +0 -134
- data/test/test_field_type.rb +0 -34
- data/test/test_format.rb +0 -24
- data/test/test_helper.rb +0 -26
- data/test/test_merge.rb +0 -40
- data/test/test_merge_tree.rb +0 -64
- data/test/test_version.rb +0 -11
- data/test/xsv/test_format.rb +0 -22
- data/test/xsv/test_reader.rb +0 -61
- data/test/xsv/test_record.rb +0 -69
- data/test/xsv/test_writer.rb +0 -89
@@ -1,7 +1,10 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
2
4
|
class FieldType
|
5
|
+
# Internal: Unknown type, this is what we use for unknown values in the data
|
6
|
+
#
|
3
7
|
class UnknownType < FieldType
|
4
|
-
|
5
8
|
REGEX = %r{\A(na|n/a|unk|unknown)\Z}i
|
6
9
|
|
7
10
|
def self.type_name
|
@@ -9,18 +12,19 @@ module FlatKit
|
|
9
12
|
end
|
10
13
|
|
11
14
|
def self.matches?(data)
|
12
|
-
return false unless data.
|
13
|
-
return true if data.
|
14
|
-
|
15
|
+
return false unless data.is_a?(String)
|
16
|
+
return true if data.empty?
|
17
|
+
|
18
|
+
REGEX.match?(data)
|
15
19
|
end
|
16
20
|
|
17
21
|
def self.coerce(data)
|
18
22
|
return data if REGEX.match?(data)
|
19
|
-
return CoerceFailure
|
20
|
-
rescue
|
21
|
-
return CoerceFailure
|
22
|
-
end
|
23
23
|
|
24
|
+
CoerceFailure
|
25
|
+
rescue StandardError
|
26
|
+
CoerceFailure
|
27
|
+
end
|
24
28
|
end
|
25
29
|
end
|
26
30
|
end
|
data/lib/flat_kit/field_type.rb
CHANGED
@@ -1,75 +1,83 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
4
|
+
# Internal: The base class for all field types
|
5
|
+
#
|
2
6
|
class FieldType
|
3
|
-
|
4
7
|
extend FlatKit::DescendantTracker
|
5
8
|
|
6
9
|
CoerceFailure = Class.new(::Object).freeze
|
7
10
|
|
11
|
+
def self.weights
|
12
|
+
@weights ||= {
|
13
|
+
# Boolean has crossover with Integer so going to let it overrule Integer
|
14
|
+
BooleanType => 5,
|
15
|
+
|
16
|
+
# Integer could potentially overlap with Float, but it is more restrictive
|
17
|
+
# so let it override Float
|
18
|
+
IntegerType => 4,
|
19
|
+
FloatType => 3,
|
20
|
+
|
21
|
+
# Date and Timestamps string representation shouldn't intersect with anything so
|
22
|
+
# leaving it at the same level as Null and Unknown
|
23
|
+
DateType => 2,
|
24
|
+
TimestampType => 2,
|
25
|
+
|
26
|
+
# Null and Unknown shouldn't conflict since their string representations
|
27
|
+
# do not intersect
|
28
|
+
NullType => 2,
|
29
|
+
UnknownType => 2,
|
30
|
+
|
31
|
+
# Stringtype is the fallback for anything that has a string
|
32
|
+
# representation, so it should lose out on integers, floats, nulls,
|
33
|
+
# unknowns as strings
|
34
|
+
StringType => 1,
|
35
|
+
|
36
|
+
# at the bottom - since it should never match anywhere
|
37
|
+
GuessType => 0,
|
38
|
+
}
|
39
|
+
end
|
40
|
+
|
8
41
|
def self.candidate_types(data)
|
9
42
|
find_children(:matches?, data)
|
10
43
|
end
|
11
44
|
|
45
|
+
# rubocop:disable Style/RedundantSort
|
46
|
+
# We need the stable sort, max_by(&:weight) returns the wrong one
|
12
47
|
def self.best_guess(data)
|
13
|
-
candidate_types(data).sort_by
|
48
|
+
candidate_types(data).sort_by(&:weight).last
|
14
49
|
end
|
50
|
+
# rubocop:enable Style/RedundantSort
|
15
51
|
|
16
52
|
def self.type_name
|
17
|
-
raise NotImplementedError, "must impleent #{
|
53
|
+
raise NotImplementedError, "must impleent #{type_name}"
|
18
54
|
end
|
19
55
|
|
20
56
|
def self.matches?(data)
|
21
|
-
raise NotImplementedError, "must implement #{
|
57
|
+
raise NotImplementedError, "must implement #{name}.matches?(data)"
|
22
58
|
end
|
23
59
|
|
24
60
|
def self.coerce(data)
|
25
|
-
raise NotImplementedError, "must implement #{
|
61
|
+
raise NotImplementedError, "must implement #{name}.coerce(data)"
|
26
62
|
end
|
27
63
|
|
28
64
|
# Each type has a weight so if a value matches multiple types, then the list
|
29
65
|
# can be compared to see where the tie breakers are
|
30
66
|
#
|
31
|
-
# All the weights are here so that
|
32
|
-
#
|
67
|
+
# All the weights are here so that we can see the order of precedence
|
33
68
|
#
|
34
69
|
def self.weight
|
35
|
-
|
36
|
-
return 5 if self == BooleanType
|
37
|
-
|
38
|
-
|
39
|
-
# Integer could potentially overlap with Float, but it is more restrictive
|
40
|
-
# so let it override Flaot
|
41
|
-
return 4 if self == IntegerType
|
42
|
-
return 3 if self == FloatType
|
43
|
-
|
44
|
-
# Date and Timestamps string representation shouldn't intersect with anything so
|
45
|
-
# leaving it at the same level as Null and Unkonwn
|
46
|
-
return 2 if self == DateType
|
47
|
-
return 2 if self == TimestampType
|
48
|
-
|
49
|
-
# Null and Unknown shoulnd't conflict since their string representations
|
50
|
-
# do not intersect
|
51
|
-
return 2 if self == NullType
|
52
|
-
return 2 if self == UnknownType
|
53
|
-
|
54
|
-
# Stringtype is the fallback for anything that has a string
|
55
|
-
# representation, so it should lose out on integers, floats, nulls,
|
56
|
-
# unknowns as strings
|
57
|
-
return 1 if self == StringType
|
58
|
-
|
59
|
-
# at the bottom - since it should never match anywhere
|
60
|
-
return 0 if self == GuessType
|
61
|
-
|
62
|
-
raise NotImplementedError, "No weight assigned to type #{self} - fix immediately"
|
70
|
+
weights.fetch(self) { raise NotImplementedError, "No weight assigned to type #{self} - fix immediately" }
|
63
71
|
end
|
64
72
|
end
|
65
73
|
end
|
66
74
|
|
67
|
-
require
|
68
|
-
require
|
69
|
-
require
|
70
|
-
require
|
71
|
-
require
|
72
|
-
require
|
73
|
-
require
|
74
|
-
require
|
75
|
-
require
|
75
|
+
require "flat_kit/field_type/guess_type"
|
76
|
+
require "flat_kit/field_type/boolean_type"
|
77
|
+
require "flat_kit/field_type/date_type"
|
78
|
+
require "flat_kit/field_type/timestamp_type"
|
79
|
+
require "flat_kit/field_type/integer_type"
|
80
|
+
require "flat_kit/field_type/float_type"
|
81
|
+
require "flat_kit/field_type/null_type"
|
82
|
+
require "flat_kit/field_type/string_type"
|
83
|
+
require "flat_kit/field_type/unknown_type"
|
data/lib/flat_kit/format.rb
CHANGED
@@ -1,9 +1,13 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
4
|
+
# Internal: The base class of all data file format classes
|
5
|
+
#
|
2
6
|
class Format
|
3
7
|
extend DescendantTracker
|
4
8
|
|
5
9
|
def self.format_name
|
6
|
-
raise NotImplementedError, "#{self.class} must
|
10
|
+
raise NotImplementedError, "#{self.class} must implement #{self.class}.format_name"
|
7
11
|
end
|
8
12
|
|
9
13
|
def format_name
|
@@ -20,15 +24,17 @@ module FlatKit
|
|
20
24
|
return format unless format.nil?
|
21
25
|
|
22
26
|
# now try the fallback
|
23
|
-
|
24
|
-
return format
|
27
|
+
::FlatKit::Format.for(fallback)
|
25
28
|
end
|
26
29
|
|
27
30
|
def self.for_with_fallback!(path:, fallback: "auto")
|
28
31
|
format = for_with_fallback(path: path, fallback: fallback)
|
29
|
-
|
32
|
+
if format.nil?
|
33
|
+
raise ::FlatKit::Error::UnknownFormat,
|
34
|
+
"Unable to figure out format for '#{path}' with fallback '#{fallback}'"
|
35
|
+
end
|
30
36
|
|
31
|
-
|
37
|
+
format
|
32
38
|
end
|
33
39
|
end
|
34
40
|
end
|
data/lib/flat_kit/input/file.rb
CHANGED
@@ -1,25 +1,31 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "zlib"
|
4
|
+
require "pathname"
|
2
5
|
|
3
6
|
module FlatKit
|
4
7
|
class Input
|
8
|
+
# Internal: Handler for file based input
|
9
|
+
#
|
5
10
|
class File < Input
|
6
|
-
attr_reader :path
|
7
|
-
attr_reader :count
|
11
|
+
attr_reader :path, :count, :io
|
8
12
|
|
9
13
|
def self.handles?(obj)
|
10
14
|
return true if obj.instance_of?(Pathname)
|
11
15
|
return false unless obj.instance_of?(String)
|
12
16
|
|
13
17
|
# in case these get loaded in different orders
|
14
|
-
return false if ::FlatKit::Input::IO.
|
18
|
+
return false if ::FlatKit::Input::IO.stdin?(obj)
|
15
19
|
|
16
|
-
|
20
|
+
true
|
17
21
|
end
|
18
22
|
|
19
23
|
def initialize(obj)
|
24
|
+
super()
|
20
25
|
@count = 0
|
21
26
|
@path = Pathname.new(obj)
|
22
27
|
raise FlatKit::Error, "Input #{obj} is not readable" unless @path.readable?
|
28
|
+
|
23
29
|
@io = open_input(path)
|
24
30
|
end
|
25
31
|
|
@@ -31,10 +37,6 @@ module FlatKit
|
|
31
37
|
@io.close
|
32
38
|
end
|
33
39
|
|
34
|
-
def io
|
35
|
-
@io
|
36
|
-
end
|
37
|
-
|
38
40
|
private
|
39
41
|
|
40
42
|
# open the appropriate input type depending on the source file name
|
data/lib/flat_kit/input/io.rb
CHANGED
@@ -1,35 +1,38 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
2
4
|
class Input
|
5
|
+
# Internal: Handler for non-filebased input. Generally this is just stdin
|
6
|
+
#
|
3
7
|
class IO < Input
|
4
|
-
STDINS = %w[
|
8
|
+
STDINS = %w[stdin STDIN - <stdin>].freeze
|
5
9
|
|
6
10
|
def self.handles?(obj)
|
7
|
-
return true if
|
8
|
-
return true if [
|
9
|
-
|
11
|
+
return true if stdin?(obj)
|
12
|
+
return true if [::File, ::StringIO, ::IO].any? { |klass| obj.is_a?(klass) }
|
13
|
+
|
14
|
+
false
|
10
15
|
end
|
11
16
|
|
12
|
-
def self.
|
17
|
+
def self.stdin?(obj)
|
13
18
|
case obj
|
14
19
|
when String
|
15
20
|
return true if STDINS.include?(obj)
|
16
21
|
when ::IO
|
17
|
-
return true if obj ==
|
22
|
+
return true if obj == $stdin
|
18
23
|
end
|
19
|
-
|
24
|
+
false
|
20
25
|
end
|
21
26
|
|
22
27
|
def initialize(obj)
|
23
|
-
|
28
|
+
super()
|
29
|
+
if self.class.stdin?(obj)
|
24
30
|
@name = "<STDIN>"
|
25
31
|
@io = $stdin
|
26
|
-
elsif obj.
|
27
|
-
@name = obj.path
|
28
|
-
@io = obj
|
29
|
-
elsif obj.kind_of?(::StringIO) then
|
30
|
-
@name = obj.inspect
|
32
|
+
elsif obj.is_a?(::IO)
|
33
|
+
@name = (obj.respond_to?(:path) && obj.path) || obj.inspect
|
31
34
|
@io = obj
|
32
|
-
elsif obj.
|
35
|
+
elsif obj.is_a?(::StringIO)
|
33
36
|
@name = obj.inspect
|
34
37
|
@io = obj
|
35
38
|
else
|
@@ -37,18 +40,12 @@ module FlatKit
|
|
37
40
|
end
|
38
41
|
end
|
39
42
|
|
40
|
-
|
41
|
-
@name
|
42
|
-
end
|
43
|
+
attr_reader :name, :io
|
43
44
|
|
44
45
|
# this goes to an io stream and we are not in charge of opening it
|
45
46
|
def close
|
46
47
|
@io.close
|
47
48
|
end
|
48
|
-
|
49
|
-
def io
|
50
|
-
@io
|
51
|
-
end
|
52
49
|
end
|
53
50
|
end
|
54
51
|
end
|
data/lib/flat_kit/input.rb
CHANGED
@@ -1,14 +1,16 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
4
|
+
# Internal: Base class of all input handlers
|
5
|
+
#
|
2
6
|
class Input
|
3
7
|
extend DescendantTracker
|
4
8
|
|
5
9
|
def self.from(input)
|
6
|
-
return input if input.
|
10
|
+
return input if input.is_a?(::FlatKit::Input)
|
7
11
|
|
8
12
|
in_klass = find_child(:handles?, input)
|
9
|
-
if in_klass
|
10
|
-
return in_klass.new(input)
|
11
|
-
end
|
13
|
+
return in_klass.new(input) if in_klass
|
12
14
|
|
13
15
|
raise FlatKit::Error, "Unable to create input from #{input.class} : #{input.inspect}"
|
14
16
|
end
|
@@ -17,7 +19,6 @@ module FlatKit
|
|
17
19
|
raise NotImplementedError, "#{self.class} must implement #name"
|
18
20
|
end
|
19
21
|
|
20
|
-
#
|
21
22
|
def io
|
22
23
|
raise NotImplementedError, "#{self.class} must implement #io"
|
23
24
|
end
|
@@ -28,5 +29,5 @@ module FlatKit
|
|
28
29
|
end
|
29
30
|
end
|
30
31
|
|
31
|
-
require
|
32
|
-
require
|
32
|
+
require "flat_kit/input/io"
|
33
|
+
require "flat_kit/input/file"
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
2
4
|
# Private: This is a class used internally by MergeTree and should not be used
|
3
5
|
# outside of that context.
|
@@ -10,22 +12,24 @@ module FlatKit
|
|
10
12
|
# here.
|
11
13
|
#
|
12
14
|
class InternalNode
|
13
|
-
|
14
15
|
include Comparable
|
15
16
|
|
16
|
-
|
17
|
-
attr_accessor :right
|
18
|
-
|
19
|
-
|
20
|
-
attr_accessor :
|
17
|
+
# Internal Nodes
|
18
|
+
attr_accessor :left, :right, :winner
|
19
|
+
|
20
|
+
# Who to tell
|
21
|
+
attr_accessor :next_level
|
22
|
+
|
23
|
+
# winning leaf node
|
24
|
+
attr_accessor :leaf
|
21
25
|
|
22
26
|
def initialize(left:, right:)
|
23
|
-
@left
|
27
|
+
@left = left
|
24
28
|
@left.next_level = self
|
25
29
|
|
26
|
-
@right
|
30
|
+
@right = right
|
27
31
|
@right.next_level = self
|
28
|
-
@next_level
|
32
|
+
@next_level = nil
|
29
33
|
|
30
34
|
play
|
31
35
|
end
|
@@ -53,32 +57,31 @@ module FlatKit
|
|
53
57
|
# from the tree.
|
54
58
|
#
|
55
59
|
def player_finished(node)
|
56
|
-
if left.
|
60
|
+
if left.equal?(node)
|
57
61
|
@left = SentinelInternalNode.new
|
58
62
|
@left.next_level = self
|
59
|
-
elsif right.
|
63
|
+
elsif right.equal?(node)
|
60
64
|
@right = SentinelInternalNode.new
|
61
65
|
@right.next_level = self
|
62
66
|
else
|
63
67
|
raise FlatKit::Error, "Unknown player #{node}"
|
64
68
|
end
|
65
69
|
|
66
|
-
|
67
|
-
|
68
|
-
|
70
|
+
return unless @right.sentinel? && @left.sentinel?
|
71
|
+
|
72
|
+
next_level.player_finished(self) if next_level
|
69
73
|
end
|
70
74
|
|
71
75
|
def play
|
72
|
-
@winner = left <= right ? left : right
|
73
|
-
|
74
|
-
@leaf = winner.leaf
|
75
|
-
end
|
76
|
+
@winner = (left <= right) ? left : right
|
77
|
+
@leaf = winner.leaf unless @winner.sentinel?
|
76
78
|
next_level.play if next_level
|
77
79
|
end
|
78
80
|
|
79
81
|
def <=>(other)
|
80
82
|
return -1 if other.sentinel?
|
81
|
-
|
83
|
+
|
84
|
+
value <=> (other.value)
|
82
85
|
end
|
83
86
|
end
|
84
87
|
end
|
@@ -1,5 +1,9 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
2
4
|
module Jsonl
|
5
|
+
# Internal: JSONL format class holding the metadata about the JSONL format
|
6
|
+
#
|
3
7
|
class Format < ::FlatKit::Format
|
4
8
|
def self.format_name
|
5
9
|
"jsonl"
|
@@ -7,10 +11,10 @@ module FlatKit
|
|
7
11
|
|
8
12
|
def self.handles?(filename)
|
9
13
|
parts = filename.split(".")
|
10
|
-
%w[
|
14
|
+
%w[json jsonl ndjson].each do |ext|
|
11
15
|
return true if parts.include?(ext)
|
12
16
|
end
|
13
|
-
|
17
|
+
false
|
14
18
|
end
|
15
19
|
|
16
20
|
def self.reader
|
@@ -1,8 +1,11 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
2
4
|
module Jsonl
|
5
|
+
# Internal: Reader class that parses and yields records from JSONL files
|
6
|
+
#
|
3
7
|
class Reader < ::FlatKit::Reader
|
4
|
-
attr_reader :input
|
5
|
-
attr_reader :count
|
8
|
+
attr_reader :input, :count
|
6
9
|
|
7
10
|
def self.format_name
|
8
11
|
::FlatKit::Jsonl::Format.format_name
|
@@ -15,13 +18,13 @@ module FlatKit
|
|
15
18
|
end
|
16
19
|
|
17
20
|
def each
|
18
|
-
while line = input.io.gets
|
21
|
+
while (line = input.io.gets)
|
19
22
|
record = ::FlatKit::Jsonl::Record.new(data: line, compare_fields: compare_fields)
|
20
23
|
@count += 1
|
21
24
|
yield record
|
22
25
|
end
|
23
26
|
input.close
|
24
|
-
rescue => e
|
27
|
+
rescue StandardError => e
|
25
28
|
::FlatKit.logger.error "Error reading jsonl records from #{input.name}: #{e}"
|
26
29
|
raise ::FlatKit::Error, e
|
27
30
|
end
|
@@ -1,8 +1,12 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "oj"
|
4
|
+
require "flat_kit/record"
|
3
5
|
|
4
6
|
module FlatKit
|
5
7
|
module Jsonl
|
8
|
+
# Internal: Class that exposes data from a JSONL format record to the flatkit api
|
9
|
+
#
|
6
10
|
class Record < ::FlatKit::Record
|
7
11
|
attr_reader :compare_data
|
8
12
|
|
@@ -11,7 +15,7 @@ module FlatKit
|
|
11
15
|
end
|
12
16
|
|
13
17
|
def self.from_record(record)
|
14
|
-
if record.instance_of?(FlatKit::Jsonl::Record)
|
18
|
+
if record.instance_of?(FlatKit::Jsonl::Record)
|
15
19
|
|
16
20
|
structured = record.complete_structured_data? ? record.complete_structured_data : nil
|
17
21
|
|
@@ -25,22 +29,20 @@ module FlatKit
|
|
25
29
|
end
|
26
30
|
|
27
31
|
def initialize(data:, compare_fields: :none,
|
28
|
-
compare_data:
|
32
|
+
compare_data: {},
|
29
33
|
complete_structured_data: nil)
|
30
34
|
super(data: data, compare_fields: compare_fields)
|
31
35
|
|
32
36
|
@complete_structured_data = complete_structured_data
|
33
37
|
|
34
|
-
if complete_structured_data? && (compare_data.nil? || compare_data.empty?)
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
38
|
+
@compare_data = if complete_structured_data? && (compare_data.nil? || compare_data.empty?)
|
39
|
+
complete_structured_data
|
40
|
+
else
|
41
|
+
compare_data
|
42
|
+
end
|
39
43
|
|
40
44
|
# only load compare data if it doesn't exist
|
41
|
-
if data && compare_data.empty?
|
42
|
-
quick_parse
|
43
|
-
end
|
45
|
+
quick_parse if data && compare_data.empty?
|
44
46
|
end
|
45
47
|
|
46
48
|
def [](key)
|
@@ -59,9 +61,7 @@ module FlatKit
|
|
59
61
|
# overriding parent accessor since we may be initialized without raw bytes
|
60
62
|
# to parse
|
61
63
|
def data
|
62
|
-
if @data.nil? && complete_structured_data?
|
63
|
-
@data = Oj.dump(complete_structured_data, mode: :json)
|
64
|
-
end
|
64
|
+
@data = Oj.dump(complete_structured_data, mode: :json) if @data.nil? && complete_structured_data?
|
65
65
|
@data
|
66
66
|
end
|
67
67
|
alias to_s data
|
@@ -79,6 +79,3 @@ module FlatKit
|
|
79
79
|
end
|
80
80
|
end
|
81
81
|
end
|
82
|
-
|
83
|
-
|
84
|
-
|
@@ -1,15 +1,14 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
2
4
|
module Jsonl
|
5
|
+
# Internal: Class that writes flatkit records to JSONL files
|
6
|
+
#
|
3
7
|
class Writer < ::FlatKit::Writer
|
4
|
-
|
5
8
|
def self.format_name
|
6
9
|
::FlatKit::Jsonl::Format.format_name
|
7
10
|
end
|
8
11
|
|
9
|
-
def initialize(destination:)
|
10
|
-
super
|
11
|
-
end
|
12
|
-
|
13
12
|
# write the record and return the Position the record was written
|
14
13
|
#
|
15
14
|
def write(record)
|
@@ -22,10 +21,10 @@ module FlatKit
|
|
22
21
|
else
|
23
22
|
raise FlatKit::Error, "Unable to write records of type #{record.class}"
|
24
23
|
end
|
25
|
-
rescue FlatKit::Error =>
|
26
|
-
raise
|
27
|
-
rescue => e
|
28
|
-
::FlatKit.logger.error "Error
|
24
|
+
rescue FlatKit::Error => e
|
25
|
+
raise e
|
26
|
+
rescue StandardError => e
|
27
|
+
::FlatKit.logger.error "Error writing jsonl records to #{output.name}: #{e}"
|
29
28
|
raise ::FlatKit::Error, e
|
30
29
|
end
|
31
30
|
|
@@ -47,7 +46,6 @@ module FlatKit
|
|
47
46
|
@last_position = ::FlatKit::Position.new(index: record_index,
|
48
47
|
offset: start_offset,
|
49
48
|
bytesize: bytes_written)
|
50
|
-
|
51
49
|
end
|
52
50
|
end
|
53
51
|
end
|
data/lib/flat_kit/jsonl.rb
CHANGED
@@ -1,8 +1,12 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
4
|
+
# Internal: Top level namespace for the newline-oriented JSON format
|
5
|
+
#
|
2
6
|
module Jsonl
|
3
7
|
end
|
4
8
|
end
|
5
|
-
require
|
6
|
-
require
|
7
|
-
require
|
8
|
-
require
|
9
|
+
require "flat_kit/jsonl/record"
|
10
|
+
require "flat_kit/jsonl/reader"
|
11
|
+
require "flat_kit/jsonl/writer"
|
12
|
+
require "flat_kit/jsonl/format"
|
data/lib/flat_kit/leaf_node.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
2
4
|
# Private: The LeafNode is a wrapper around a Reader object to enable
|
3
5
|
# a consistent api for use in the MergeTree
|
@@ -9,11 +11,9 @@ module FlatKit
|
|
9
11
|
# If all the data is used up from the reader, it also notifies the next level
|
10
12
|
# of that so the next level can remove it from the tree.
|
11
13
|
class LeafNode
|
12
|
-
|
13
14
|
include Comparable
|
14
15
|
|
15
|
-
attr_reader :reader
|
16
|
-
attr_reader :value
|
16
|
+
attr_reader :reader, :value
|
17
17
|
|
18
18
|
attr_accessor :next_level
|
19
19
|
|
@@ -43,7 +43,7 @@ module FlatKit
|
|
43
43
|
|
44
44
|
def update_and_replay
|
45
45
|
self.next
|
46
|
-
if finished?
|
46
|
+
if finished?
|
47
47
|
::FlatKit.logger.debug "#{reader.source} has finished reading #{reader.count} records"
|
48
48
|
next_level.player_finished(self)
|
49
49
|
end
|
@@ -65,7 +65,8 @@ module FlatKit
|
|
65
65
|
|
66
66
|
def <=>(other)
|
67
67
|
return -1 if other.sentinel?
|
68
|
-
|
68
|
+
|
69
|
+
value <=> (other.value)
|
69
70
|
end
|
70
71
|
end
|
71
72
|
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "logger"
|
4
|
+
|
5
|
+
module FlatKit
|
6
|
+
# Internal: Log formatting class for FlatKit
|
7
|
+
#
|
8
|
+
class LogFormatter < ::Logger::Formatter
|
9
|
+
FORMAT = "%s %5d %05s : %s\n"
|
10
|
+
DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ"
|
11
|
+
def initialize
|
12
|
+
super
|
13
|
+
self.datetime_format = DATETIME_FORMAT
|
14
|
+
end
|
15
|
+
|
16
|
+
def call(severity, time, _progname, msg)
|
17
|
+
format(FORMAT, format_datetime(time.utc), Process.pid, severity, msg2str(msg))
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|