flat_kit 0.3.0 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CONTRIBUTING.md +1 -2
- data/HISTORY.md +9 -0
- data/Manifest.txt +3 -42
- data/{bin → exe}/fk +2 -1
- data/flat_kit.gemspec +33 -0
- data/lib/flat_kit/cli.rb +46 -32
- data/lib/flat_kit/command/cat.rb +34 -32
- data/lib/flat_kit/command/merge.rb +37 -36
- data/lib/flat_kit/command/sort.rb +37 -37
- data/lib/flat_kit/command/stats.rb +41 -39
- data/lib/flat_kit/command.rb +10 -11
- data/lib/flat_kit/descendant_tracker.rb +9 -6
- data/lib/flat_kit/error.rb +4 -0
- data/lib/flat_kit/event_emitter.rb +5 -2
- data/lib/flat_kit/field_stats.rb +31 -26
- data/lib/flat_kit/field_type/boolean_type.rb +9 -5
- data/lib/flat_kit/field_type/date_type.rb +19 -17
- data/lib/flat_kit/field_type/float_type.rb +15 -9
- data/lib/flat_kit/field_type/guess_type.rb +9 -6
- data/lib/flat_kit/field_type/integer_type.rb +6 -4
- data/lib/flat_kit/field_type/null_type.rb +5 -1
- data/lib/flat_kit/field_type/string_type.rb +8 -6
- data/lib/flat_kit/field_type/timestamp_type.rb +11 -10
- data/lib/flat_kit/field_type/unknown_type.rb +12 -8
- data/lib/flat_kit/field_type.rb +52 -44
- data/lib/flat_kit/format.rb +11 -5
- data/lib/flat_kit/input/file.rb +11 -9
- data/lib/flat_kit/input/io.rb +18 -21
- data/lib/flat_kit/input.rb +8 -7
- data/lib/flat_kit/internal_node.rb +22 -19
- data/lib/flat_kit/jsonl/format.rb +6 -2
- data/lib/flat_kit/jsonl/reader.rb +7 -4
- data/lib/flat_kit/jsonl/record.rb +15 -18
- data/lib/flat_kit/jsonl/writer.rb +8 -10
- data/lib/flat_kit/jsonl.rb +8 -4
- data/lib/flat_kit/leaf_node.rb +6 -5
- data/lib/flat_kit/log_formatter.rb +20 -0
- data/lib/flat_kit/logger.rb +12 -19
- data/lib/flat_kit/merge.rb +21 -18
- data/lib/flat_kit/merge_tree.rb +5 -6
- data/lib/flat_kit/output/file.rb +13 -9
- data/lib/flat_kit/output/io.rb +40 -35
- data/lib/flat_kit/output.rb +8 -7
- data/lib/flat_kit/position.rb +3 -4
- data/lib/flat_kit/reader.rb +8 -8
- data/lib/flat_kit/record.rb +12 -12
- data/lib/flat_kit/sentinel_internal_node.rb +6 -5
- data/lib/flat_kit/sentinel_leaf_node.rb +4 -1
- data/lib/flat_kit/sort.rb +8 -9
- data/lib/flat_kit/stat_type/nominal_stats.rb +13 -7
- data/lib/flat_kit/stat_type/numerical_stats.rb +18 -18
- data/lib/flat_kit/stat_type/ordinal_stats.rb +8 -13
- data/lib/flat_kit/stat_type.rb +18 -13
- data/lib/flat_kit/stats.rb +12 -14
- data/lib/flat_kit/writer.rb +5 -6
- data/lib/flat_kit/xsv/format.rb +6 -2
- data/lib/flat_kit/xsv/reader.rb +8 -6
- data/lib/flat_kit/xsv/record.rb +21 -15
- data/lib/flat_kit/xsv/writer.rb +13 -10
- data/lib/flat_kit/xsv.rb +7 -4
- data/lib/flat_kit.rb +31 -26
- metadata +20 -158
- data/Rakefile +0 -21
- data/examples/stream-active-record-to-csv.rb +0 -42
- data/tasks/default.rake +0 -242
- data/tasks/extension.rake +0 -38
- data/tasks/man.rake +0 -7
- data/tasks/this.rb +0 -208
- data/test/device_dataset.rb +0 -117
- data/test/field_type/test_boolean_type.rb +0 -65
- data/test/field_type/test_date_type.rb +0 -71
- data/test/field_type/test_float_type.rb +0 -56
- data/test/field_type/test_guess_type.rb +0 -14
- data/test/field_type/test_integer_type.rb +0 -52
- data/test/field_type/test_null_type.rb +0 -41
- data/test/field_type/test_string_type.rb +0 -18
- data/test/field_type/test_timestamp_type.rb +0 -108
- data/test/field_type/test_unknown_type.rb +0 -35
- data/test/input/test_file.rb +0 -73
- data/test/input/test_io.rb +0 -93
- data/test/jsonl/test_format.rb +0 -22
- data/test/jsonl/test_reader.rb +0 -49
- data/test/jsonl/test_record.rb +0 -61
- data/test/jsonl/test_writer.rb +0 -86
- data/test/output/test_file.rb +0 -60
- data/test/output/test_io.rb +0 -104
- data/test/run +0 -23
- data/test/stat_type/test_nominal_stats.rb +0 -69
- data/test/stat_type/test_numerical_stats.rb +0 -118
- data/test/stat_type/test_ordinal_stats.rb +0 -92
- data/test/test_conversions.rb +0 -45
- data/test/test_event_emitter.rb +0 -89
- data/test/test_field_stats.rb +0 -134
- data/test/test_field_type.rb +0 -34
- data/test/test_format.rb +0 -24
- data/test/test_helper.rb +0 -26
- data/test/test_merge.rb +0 -40
- data/test/test_merge_tree.rb +0 -64
- data/test/test_version.rb +0 -11
- data/test/xsv/test_format.rb +0 -22
- data/test/xsv/test_reader.rb +0 -61
- data/test/xsv/test_record.rb +0 -69
- data/test/xsv/test_writer.rb +0 -89
@@ -1,7 +1,10 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
2
4
|
class FieldType
|
5
|
+
# Internal: Unknown type, this is what we use for unknown values in the data
|
6
|
+
#
|
3
7
|
class UnknownType < FieldType
|
4
|
-
|
5
8
|
REGEX = %r{\A(na|n/a|unk|unknown)\Z}i
|
6
9
|
|
7
10
|
def self.type_name
|
@@ -9,18 +12,19 @@ module FlatKit
|
|
9
12
|
end
|
10
13
|
|
11
14
|
def self.matches?(data)
|
12
|
-
return false unless data.
|
13
|
-
return true if data.
|
14
|
-
|
15
|
+
return false unless data.is_a?(String)
|
16
|
+
return true if data.empty?
|
17
|
+
|
18
|
+
REGEX.match?(data)
|
15
19
|
end
|
16
20
|
|
17
21
|
def self.coerce(data)
|
18
22
|
return data if REGEX.match?(data)
|
19
|
-
return CoerceFailure
|
20
|
-
rescue
|
21
|
-
return CoerceFailure
|
22
|
-
end
|
23
23
|
|
24
|
+
CoerceFailure
|
25
|
+
rescue StandardError
|
26
|
+
CoerceFailure
|
27
|
+
end
|
24
28
|
end
|
25
29
|
end
|
26
30
|
end
|
data/lib/flat_kit/field_type.rb
CHANGED
@@ -1,75 +1,83 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
4
|
+
# Internal: The base class for all field types
|
5
|
+
#
|
2
6
|
class FieldType
|
3
|
-
|
4
7
|
extend FlatKit::DescendantTracker
|
5
8
|
|
6
9
|
CoerceFailure = Class.new(::Object).freeze
|
7
10
|
|
11
|
+
def self.weights
|
12
|
+
@weights ||= {
|
13
|
+
# Boolean has crossover with Integer so going to let it overrule Integer
|
14
|
+
BooleanType => 5,
|
15
|
+
|
16
|
+
# Integer could potentially overlap with Float, but it is more restrictive
|
17
|
+
# so let it override Flaot
|
18
|
+
IntegerType => 4,
|
19
|
+
FloatType => 3,
|
20
|
+
|
21
|
+
# Date and Timestamps string representation shouldn't intersect with anything so
|
22
|
+
# leaving it at the same level as Null and Unkonwn
|
23
|
+
DateType => 2,
|
24
|
+
TimestampType => 2,
|
25
|
+
|
26
|
+
# Null and Unknown shoulnd't conflict since their string representations
|
27
|
+
# do not intersect
|
28
|
+
NullType => 2,
|
29
|
+
UnknownType => 2,
|
30
|
+
|
31
|
+
# Stringtype is the fallback for anything that has a string
|
32
|
+
# representation, so it should lose out on integers, floats, nulls,
|
33
|
+
# unknowns as strings
|
34
|
+
StringType => 1,
|
35
|
+
|
36
|
+
# at the bottom - since it should never match anywhere
|
37
|
+
GuessType => 0,
|
38
|
+
}
|
39
|
+
end
|
40
|
+
|
8
41
|
def self.candidate_types(data)
|
9
42
|
find_children(:matches?, data)
|
10
43
|
end
|
11
44
|
|
45
|
+
# rubocop:disable Style/RedundantSort
|
46
|
+
# We need the stable sort, max_by(&:weight) returns the wrong one
|
12
47
|
def self.best_guess(data)
|
13
|
-
candidate_types(data).sort_by
|
48
|
+
candidate_types(data).sort_by(&:weight).last
|
14
49
|
end
|
50
|
+
# rubocop:enable Style/RedundantSort
|
15
51
|
|
16
52
|
def self.type_name
|
17
|
-
raise NotImplementedError, "must impleent #{
|
53
|
+
raise NotImplementedError, "must impleent #{type_name}"
|
18
54
|
end
|
19
55
|
|
20
56
|
def self.matches?(data)
|
21
|
-
raise NotImplementedError, "must implement #{
|
57
|
+
raise NotImplementedError, "must implement #{name}.matches?(data)"
|
22
58
|
end
|
23
59
|
|
24
60
|
def self.coerce(data)
|
25
|
-
raise NotImplementedError, "must implement #{
|
61
|
+
raise NotImplementedError, "must implement #{name}.coerce(data)"
|
26
62
|
end
|
27
63
|
|
28
64
|
# Each type has a weight so if a value matches multiple types, then the list
|
29
65
|
# can be compared to see where the tie breakers are
|
30
66
|
#
|
31
|
-
# All the weights are here so that
|
32
|
-
#
|
67
|
+
# All the weights are here so that we can see the order of precedence
|
33
68
|
#
|
34
69
|
def self.weight
|
35
|
-
|
36
|
-
return 5 if self == BooleanType
|
37
|
-
|
38
|
-
|
39
|
-
# Integer could potentially overlap with Float, but it is more restrictive
|
40
|
-
# so let it override Flaot
|
41
|
-
return 4 if self == IntegerType
|
42
|
-
return 3 if self == FloatType
|
43
|
-
|
44
|
-
# Date and Timestamps string representation shouldn't intersect with anything so
|
45
|
-
# leaving it at the same level as Null and Unkonwn
|
46
|
-
return 2 if self == DateType
|
47
|
-
return 2 if self == TimestampType
|
48
|
-
|
49
|
-
# Null and Unknown shoulnd't conflict since their string representations
|
50
|
-
# do not intersect
|
51
|
-
return 2 if self == NullType
|
52
|
-
return 2 if self == UnknownType
|
53
|
-
|
54
|
-
# Stringtype is the fallback for anything that has a string
|
55
|
-
# representation, so it should lose out on integers, floats, nulls,
|
56
|
-
# unknowns as strings
|
57
|
-
return 1 if self == StringType
|
58
|
-
|
59
|
-
# at the bottom - since it should never match anywhere
|
60
|
-
return 0 if self == GuessType
|
61
|
-
|
62
|
-
raise NotImplementedError, "No weight assigned to type #{self} - fix immediately"
|
70
|
+
weights.fetch(self) { raise NotImplementedError, "No weight assigned to type #{self} - fix immediately" }
|
63
71
|
end
|
64
72
|
end
|
65
73
|
end
|
66
74
|
|
67
|
-
require
|
68
|
-
require
|
69
|
-
require
|
70
|
-
require
|
71
|
-
require
|
72
|
-
require
|
73
|
-
require
|
74
|
-
require
|
75
|
-
require
|
75
|
+
require "flat_kit/field_type/guess_type"
|
76
|
+
require "flat_kit/field_type/boolean_type"
|
77
|
+
require "flat_kit/field_type/date_type"
|
78
|
+
require "flat_kit/field_type/timestamp_type"
|
79
|
+
require "flat_kit/field_type/integer_type"
|
80
|
+
require "flat_kit/field_type/float_type"
|
81
|
+
require "flat_kit/field_type/null_type"
|
82
|
+
require "flat_kit/field_type/string_type"
|
83
|
+
require "flat_kit/field_type/unknown_type"
|
data/lib/flat_kit/format.rb
CHANGED
@@ -1,9 +1,13 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
4
|
+
# Internal: The base class of all data file format classes
|
5
|
+
#
|
2
6
|
class Format
|
3
7
|
extend DescendantTracker
|
4
8
|
|
5
9
|
def self.format_name
|
6
|
-
raise NotImplementedError, "#{self.class} must
|
10
|
+
raise NotImplementedError, "#{self.class} must implement #{self.class}.format_name"
|
7
11
|
end
|
8
12
|
|
9
13
|
def format_name
|
@@ -20,15 +24,17 @@ module FlatKit
|
|
20
24
|
return format unless format.nil?
|
21
25
|
|
22
26
|
# now try the fallback
|
23
|
-
|
24
|
-
return format
|
27
|
+
::FlatKit::Format.for(fallback)
|
25
28
|
end
|
26
29
|
|
27
30
|
def self.for_with_fallback!(path:, fallback: "auto")
|
28
31
|
format = for_with_fallback(path: path, fallback: fallback)
|
29
|
-
|
32
|
+
if format.nil?
|
33
|
+
raise ::FlatKit::Error::UnknownFormat,
|
34
|
+
"Unable to figure out format for '#{path}' with fallback '#{fallback}'"
|
35
|
+
end
|
30
36
|
|
31
|
-
|
37
|
+
format
|
32
38
|
end
|
33
39
|
end
|
34
40
|
end
|
data/lib/flat_kit/input/file.rb
CHANGED
@@ -1,25 +1,31 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "zlib"
|
4
|
+
require "pathname"
|
2
5
|
|
3
6
|
module FlatKit
|
4
7
|
class Input
|
8
|
+
# Internal: Handler for file based input
|
9
|
+
#
|
5
10
|
class File < Input
|
6
|
-
attr_reader :path
|
7
|
-
attr_reader :count
|
11
|
+
attr_reader :path, :count, :io
|
8
12
|
|
9
13
|
def self.handles?(obj)
|
10
14
|
return true if obj.instance_of?(Pathname)
|
11
15
|
return false unless obj.instance_of?(String)
|
12
16
|
|
13
17
|
# in case these get loaded in different orders
|
14
|
-
return false if ::FlatKit::Input::IO.
|
18
|
+
return false if ::FlatKit::Input::IO.stdin?(obj)
|
15
19
|
|
16
|
-
|
20
|
+
true
|
17
21
|
end
|
18
22
|
|
19
23
|
def initialize(obj)
|
24
|
+
super()
|
20
25
|
@count = 0
|
21
26
|
@path = Pathname.new(obj)
|
22
27
|
raise FlatKit::Error, "Input #{obj} is not readable" unless @path.readable?
|
28
|
+
|
23
29
|
@io = open_input(path)
|
24
30
|
end
|
25
31
|
|
@@ -31,10 +37,6 @@ module FlatKit
|
|
31
37
|
@io.close
|
32
38
|
end
|
33
39
|
|
34
|
-
def io
|
35
|
-
@io
|
36
|
-
end
|
37
|
-
|
38
40
|
private
|
39
41
|
|
40
42
|
# open the appropriate input type depending on the source file name
|
data/lib/flat_kit/input/io.rb
CHANGED
@@ -1,35 +1,38 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
2
4
|
class Input
|
5
|
+
# Internal: Handler for non-filebased input. Generally this is just stdin
|
6
|
+
#
|
3
7
|
class IO < Input
|
4
|
-
STDINS = %w[
|
8
|
+
STDINS = %w[stdin STDIN - <stdin>].freeze
|
5
9
|
|
6
10
|
def self.handles?(obj)
|
7
|
-
return true if
|
8
|
-
return true if [
|
9
|
-
|
11
|
+
return true if stdin?(obj)
|
12
|
+
return true if [::File, ::StringIO, ::IO].any? { |klass| obj.is_a?(klass) }
|
13
|
+
|
14
|
+
false
|
10
15
|
end
|
11
16
|
|
12
|
-
def self.
|
17
|
+
def self.stdin?(obj)
|
13
18
|
case obj
|
14
19
|
when String
|
15
20
|
return true if STDINS.include?(obj)
|
16
21
|
when ::IO
|
17
|
-
return true if obj ==
|
22
|
+
return true if obj == $stdin
|
18
23
|
end
|
19
|
-
|
24
|
+
false
|
20
25
|
end
|
21
26
|
|
22
27
|
def initialize(obj)
|
23
|
-
|
28
|
+
super()
|
29
|
+
if self.class.stdin?(obj)
|
24
30
|
@name = "<STDIN>"
|
25
31
|
@io = $stdin
|
26
|
-
elsif obj.
|
27
|
-
@name = obj.path
|
28
|
-
@io = obj
|
29
|
-
elsif obj.kind_of?(::StringIO) then
|
30
|
-
@name = obj.inspect
|
32
|
+
elsif obj.is_a?(::IO)
|
33
|
+
@name = (obj.respond_to?(:path) && obj.path) || obj.inspect
|
31
34
|
@io = obj
|
32
|
-
elsif obj.
|
35
|
+
elsif obj.is_a?(::StringIO)
|
33
36
|
@name = obj.inspect
|
34
37
|
@io = obj
|
35
38
|
else
|
@@ -37,18 +40,12 @@ module FlatKit
|
|
37
40
|
end
|
38
41
|
end
|
39
42
|
|
40
|
-
|
41
|
-
@name
|
42
|
-
end
|
43
|
+
attr_reader :name, :io
|
43
44
|
|
44
45
|
# this goes to an io stream and we are not in charge of opening it
|
45
46
|
def close
|
46
47
|
@io.close
|
47
48
|
end
|
48
|
-
|
49
|
-
def io
|
50
|
-
@io
|
51
|
-
end
|
52
49
|
end
|
53
50
|
end
|
54
51
|
end
|
data/lib/flat_kit/input.rb
CHANGED
@@ -1,14 +1,16 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
4
|
+
# Internal: Base class of all input handlers
|
5
|
+
#
|
2
6
|
class Input
|
3
7
|
extend DescendantTracker
|
4
8
|
|
5
9
|
def self.from(input)
|
6
|
-
return input if input.
|
10
|
+
return input if input.is_a?(::FlatKit::Input)
|
7
11
|
|
8
12
|
in_klass = find_child(:handles?, input)
|
9
|
-
if in_klass
|
10
|
-
return in_klass.new(input)
|
11
|
-
end
|
13
|
+
return in_klass.new(input) if in_klass
|
12
14
|
|
13
15
|
raise FlatKit::Error, "Unable to create input from #{input.class} : #{input.inspect}"
|
14
16
|
end
|
@@ -17,7 +19,6 @@ module FlatKit
|
|
17
19
|
raise NotImplementedError, "#{self.class} must implement #name"
|
18
20
|
end
|
19
21
|
|
20
|
-
#
|
21
22
|
def io
|
22
23
|
raise NotImplementedError, "#{self.class} must implement #io"
|
23
24
|
end
|
@@ -28,5 +29,5 @@ module FlatKit
|
|
28
29
|
end
|
29
30
|
end
|
30
31
|
|
31
|
-
require
|
32
|
-
require
|
32
|
+
require "flat_kit/input/io"
|
33
|
+
require "flat_kit/input/file"
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
2
4
|
# Private: This is a class used internally by MergeTree and should not be used
|
3
5
|
# outside of that context.
|
@@ -10,22 +12,24 @@ module FlatKit
|
|
10
12
|
# here.
|
11
13
|
#
|
12
14
|
class InternalNode
|
13
|
-
|
14
15
|
include Comparable
|
15
16
|
|
16
|
-
|
17
|
-
attr_accessor :right
|
18
|
-
|
19
|
-
|
20
|
-
attr_accessor :
|
17
|
+
# Internal Nodes
|
18
|
+
attr_accessor :left, :right, :winner
|
19
|
+
|
20
|
+
# Who to tell
|
21
|
+
attr_accessor :next_level
|
22
|
+
|
23
|
+
# winning leaf node
|
24
|
+
attr_accessor :leaf
|
21
25
|
|
22
26
|
def initialize(left:, right:)
|
23
|
-
@left
|
27
|
+
@left = left
|
24
28
|
@left.next_level = self
|
25
29
|
|
26
|
-
@right
|
30
|
+
@right = right
|
27
31
|
@right.next_level = self
|
28
|
-
@next_level
|
32
|
+
@next_level = nil
|
29
33
|
|
30
34
|
play
|
31
35
|
end
|
@@ -53,32 +57,31 @@ module FlatKit
|
|
53
57
|
# from the tree.
|
54
58
|
#
|
55
59
|
def player_finished(node)
|
56
|
-
if left.
|
60
|
+
if left.equal?(node)
|
57
61
|
@left = SentinelInternalNode.new
|
58
62
|
@left.next_level = self
|
59
|
-
elsif right.
|
63
|
+
elsif right.equal?(node)
|
60
64
|
@right = SentinelInternalNode.new
|
61
65
|
@right.next_level = self
|
62
66
|
else
|
63
67
|
raise FlatKit::Error, "Unknown player #{node}"
|
64
68
|
end
|
65
69
|
|
66
|
-
|
67
|
-
|
68
|
-
|
70
|
+
return unless @right.sentinel? && @left.sentinel?
|
71
|
+
|
72
|
+
next_level.player_finished(self) if next_level
|
69
73
|
end
|
70
74
|
|
71
75
|
def play
|
72
|
-
@winner = left <= right ? left : right
|
73
|
-
|
74
|
-
@leaf = winner.leaf
|
75
|
-
end
|
76
|
+
@winner = (left <= right) ? left : right
|
77
|
+
@leaf = winner.leaf unless @winner.sentinel?
|
76
78
|
next_level.play if next_level
|
77
79
|
end
|
78
80
|
|
79
81
|
def <=>(other)
|
80
82
|
return -1 if other.sentinel?
|
81
|
-
|
83
|
+
|
84
|
+
value <=> (other.value)
|
82
85
|
end
|
83
86
|
end
|
84
87
|
end
|
@@ -1,5 +1,9 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
2
4
|
module Jsonl
|
5
|
+
# Internal: JSONL format class holding the metadata about the JSONL format
|
6
|
+
#
|
3
7
|
class Format < ::FlatKit::Format
|
4
8
|
def self.format_name
|
5
9
|
"jsonl"
|
@@ -7,10 +11,10 @@ module FlatKit
|
|
7
11
|
|
8
12
|
def self.handles?(filename)
|
9
13
|
parts = filename.split(".")
|
10
|
-
%w[
|
14
|
+
%w[json jsonl ndjson].each do |ext|
|
11
15
|
return true if parts.include?(ext)
|
12
16
|
end
|
13
|
-
|
17
|
+
false
|
14
18
|
end
|
15
19
|
|
16
20
|
def self.reader
|
@@ -1,8 +1,11 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
2
4
|
module Jsonl
|
5
|
+
# Internal: Reader class that parses and yields records from JSONL files
|
6
|
+
#
|
3
7
|
class Reader < ::FlatKit::Reader
|
4
|
-
attr_reader :input
|
5
|
-
attr_reader :count
|
8
|
+
attr_reader :input, :count
|
6
9
|
|
7
10
|
def self.format_name
|
8
11
|
::FlatKit::Jsonl::Format.format_name
|
@@ -15,13 +18,13 @@ module FlatKit
|
|
15
18
|
end
|
16
19
|
|
17
20
|
def each
|
18
|
-
while line = input.io.gets
|
21
|
+
while (line = input.io.gets)
|
19
22
|
record = ::FlatKit::Jsonl::Record.new(data: line, compare_fields: compare_fields)
|
20
23
|
@count += 1
|
21
24
|
yield record
|
22
25
|
end
|
23
26
|
input.close
|
24
|
-
rescue => e
|
27
|
+
rescue StandardError => e
|
25
28
|
::FlatKit.logger.error "Error reading jsonl records from #{input.name}: #{e}"
|
26
29
|
raise ::FlatKit::Error, e
|
27
30
|
end
|
@@ -1,8 +1,12 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "oj"
|
4
|
+
require "flat_kit/record"
|
3
5
|
|
4
6
|
module FlatKit
|
5
7
|
module Jsonl
|
8
|
+
# Internal: Class that exposes data from a JSONL format record to the flatkit api
|
9
|
+
#
|
6
10
|
class Record < ::FlatKit::Record
|
7
11
|
attr_reader :compare_data
|
8
12
|
|
@@ -11,7 +15,7 @@ module FlatKit
|
|
11
15
|
end
|
12
16
|
|
13
17
|
def self.from_record(record)
|
14
|
-
if record.instance_of?(FlatKit::Jsonl::Record)
|
18
|
+
if record.instance_of?(FlatKit::Jsonl::Record)
|
15
19
|
|
16
20
|
structured = record.complete_structured_data? ? record.complete_structured_data : nil
|
17
21
|
|
@@ -25,22 +29,20 @@ module FlatKit
|
|
25
29
|
end
|
26
30
|
|
27
31
|
def initialize(data:, compare_fields: :none,
|
28
|
-
compare_data:
|
32
|
+
compare_data: {},
|
29
33
|
complete_structured_data: nil)
|
30
34
|
super(data: data, compare_fields: compare_fields)
|
31
35
|
|
32
36
|
@complete_structured_data = complete_structured_data
|
33
37
|
|
34
|
-
if complete_structured_data? && (compare_data.nil? || compare_data.empty?)
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
38
|
+
@compare_data = if complete_structured_data? && (compare_data.nil? || compare_data.empty?)
|
39
|
+
complete_structured_data
|
40
|
+
else
|
41
|
+
compare_data
|
42
|
+
end
|
39
43
|
|
40
44
|
# only load compare data if it doesn't exist
|
41
|
-
if data && compare_data.empty?
|
42
|
-
quick_parse
|
43
|
-
end
|
45
|
+
quick_parse if data && compare_data.empty?
|
44
46
|
end
|
45
47
|
|
46
48
|
def [](key)
|
@@ -59,9 +61,7 @@ module FlatKit
|
|
59
61
|
# overriding parent accessor since we may be initialized without raw bytes
|
60
62
|
# to parse
|
61
63
|
def data
|
62
|
-
if @data.nil? && complete_structured_data?
|
63
|
-
@data = Oj.dump(complete_structured_data, mode: :json)
|
64
|
-
end
|
64
|
+
@data = Oj.dump(complete_structured_data, mode: :json) if @data.nil? && complete_structured_data?
|
65
65
|
@data
|
66
66
|
end
|
67
67
|
alias to_s data
|
@@ -79,6 +79,3 @@ module FlatKit
|
|
79
79
|
end
|
80
80
|
end
|
81
81
|
end
|
82
|
-
|
83
|
-
|
84
|
-
|
@@ -1,15 +1,14 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
2
4
|
module Jsonl
|
5
|
+
# Internal: Class that writes flatkit records to JSONL files
|
6
|
+
#
|
3
7
|
class Writer < ::FlatKit::Writer
|
4
|
-
|
5
8
|
def self.format_name
|
6
9
|
::FlatKit::Jsonl::Format.format_name
|
7
10
|
end
|
8
11
|
|
9
|
-
def initialize(destination:)
|
10
|
-
super
|
11
|
-
end
|
12
|
-
|
13
12
|
# write the record and return the Position the record was written
|
14
13
|
#
|
15
14
|
def write(record)
|
@@ -22,10 +21,10 @@ module FlatKit
|
|
22
21
|
else
|
23
22
|
raise FlatKit::Error, "Unable to write records of type #{record.class}"
|
24
23
|
end
|
25
|
-
rescue FlatKit::Error =>
|
26
|
-
raise
|
27
|
-
rescue => e
|
28
|
-
::FlatKit.logger.error "Error
|
24
|
+
rescue FlatKit::Error => e
|
25
|
+
raise e
|
26
|
+
rescue StandardError => e
|
27
|
+
::FlatKit.logger.error "Error writing jsonl records to #{output.name}: #{e}"
|
29
28
|
raise ::FlatKit::Error, e
|
30
29
|
end
|
31
30
|
|
@@ -47,7 +46,6 @@ module FlatKit
|
|
47
46
|
@last_position = ::FlatKit::Position.new(index: record_index,
|
48
47
|
offset: start_offset,
|
49
48
|
bytesize: bytes_written)
|
50
|
-
|
51
49
|
end
|
52
50
|
end
|
53
51
|
end
|
data/lib/flat_kit/jsonl.rb
CHANGED
@@ -1,8 +1,12 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
4
|
+
# Internal: Top level namespace for the newline-oriented JSON format
|
5
|
+
#
|
2
6
|
module Jsonl
|
3
7
|
end
|
4
8
|
end
|
5
|
-
require
|
6
|
-
require
|
7
|
-
require
|
8
|
-
require
|
9
|
+
require "flat_kit/jsonl/record"
|
10
|
+
require "flat_kit/jsonl/reader"
|
11
|
+
require "flat_kit/jsonl/writer"
|
12
|
+
require "flat_kit/jsonl/format"
|
data/lib/flat_kit/leaf_node.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
2
4
|
# Private: The LeafNode is a wrapper around a Reader object to enable
|
3
5
|
# a consistent api for use in the MergeTree
|
@@ -9,11 +11,9 @@ module FlatKit
|
|
9
11
|
# If all the data is used up from the reader, it also notifies the next level
|
10
12
|
# of that so the next level can remove it from the tree.
|
11
13
|
class LeafNode
|
12
|
-
|
13
14
|
include Comparable
|
14
15
|
|
15
|
-
attr_reader :reader
|
16
|
-
attr_reader :value
|
16
|
+
attr_reader :reader, :value
|
17
17
|
|
18
18
|
attr_accessor :next_level
|
19
19
|
|
@@ -43,7 +43,7 @@ module FlatKit
|
|
43
43
|
|
44
44
|
def update_and_replay
|
45
45
|
self.next
|
46
|
-
if finished?
|
46
|
+
if finished?
|
47
47
|
::FlatKit.logger.debug "#{reader.source} has finished reading #{reader.count} records"
|
48
48
|
next_level.player_finished(self)
|
49
49
|
end
|
@@ -65,7 +65,8 @@ module FlatKit
|
|
65
65
|
|
66
66
|
def <=>(other)
|
67
67
|
return -1 if other.sentinel?
|
68
|
-
|
68
|
+
|
69
|
+
value <=> (other.value)
|
69
70
|
end
|
70
71
|
end
|
71
72
|
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "logger"
|
4
|
+
|
5
|
+
module FlatKit
|
6
|
+
# Internal: Log formatting class for FlatKit
|
7
|
+
#
|
8
|
+
class LogFormatter < ::Logger::Formatter
|
9
|
+
FORMAT = "%s %5d %05s : %s\n"
|
10
|
+
DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ"
|
11
|
+
def initialize
|
12
|
+
super
|
13
|
+
self.datetime_format = DATETIME_FORMAT
|
14
|
+
end
|
15
|
+
|
16
|
+
def call(severity, time, _progname, msg)
|
17
|
+
format(FORMAT, format_datetime(time.utc), Process.pid, severity, msg2str(msg))
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|