flat_kit 0.2.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CONTRIBUTING.md +1 -2
- data/HISTORY.md +15 -0
- data/Manifest.txt +21 -26
- data/{bin → exe}/fk +2 -1
- data/flat_kit.gemspec +33 -0
- data/lib/flat_kit/cli.rb +48 -23
- data/lib/flat_kit/command/cat.rb +34 -32
- data/lib/flat_kit/command/merge.rb +37 -36
- data/lib/flat_kit/command/sort.rb +37 -37
- data/lib/flat_kit/command/stats.rb +96 -0
- data/lib/flat_kit/command.rb +10 -10
- data/lib/flat_kit/descendant_tracker.rb +17 -5
- data/lib/flat_kit/error.rb +4 -0
- data/lib/flat_kit/event_emitter.rb +7 -4
- data/lib/flat_kit/field_stats.rb +246 -0
- data/lib/flat_kit/field_type/boolean_type.rb +52 -0
- data/lib/flat_kit/field_type/date_type.rb +181 -0
- data/lib/flat_kit/field_type/float_type.rb +43 -0
- data/lib/flat_kit/field_type/guess_type.rb +23 -0
- data/lib/flat_kit/field_type/integer_type.rb +36 -0
- data/lib/flat_kit/field_type/null_type.rb +39 -0
- data/lib/flat_kit/field_type/string_type.rb +24 -0
- data/lib/flat_kit/field_type/timestamp_type.rb +48 -0
- data/lib/flat_kit/field_type/unknown_type.rb +30 -0
- data/lib/flat_kit/field_type.rb +83 -0
- data/lib/flat_kit/format.rb +11 -5
- data/lib/flat_kit/input/file.rb +11 -9
- data/lib/flat_kit/input/io.rb +18 -21
- data/lib/flat_kit/input.rb +8 -7
- data/lib/flat_kit/internal_node.rb +22 -19
- data/lib/flat_kit/jsonl/format.rb +6 -2
- data/lib/flat_kit/jsonl/reader.rb +7 -4
- data/lib/flat_kit/jsonl/record.rb +16 -19
- data/lib/flat_kit/jsonl/writer.rb +25 -18
- data/lib/flat_kit/jsonl.rb +8 -4
- data/lib/flat_kit/leaf_node.rb +6 -5
- data/lib/flat_kit/log_formatter.rb +20 -0
- data/lib/flat_kit/logger.rb +12 -19
- data/lib/flat_kit/merge.rb +21 -16
- data/lib/flat_kit/merge_tree.rb +5 -6
- data/lib/flat_kit/output/file.rb +13 -9
- data/lib/flat_kit/output/io.rb +40 -35
- data/lib/flat_kit/output.rb +12 -7
- data/lib/flat_kit/position.rb +18 -0
- data/lib/flat_kit/reader.rb +8 -8
- data/lib/flat_kit/record.rb +12 -12
- data/lib/flat_kit/sentinel_internal_node.rb +6 -5
- data/lib/flat_kit/sentinel_leaf_node.rb +4 -1
- data/lib/flat_kit/sort.rb +8 -9
- data/lib/flat_kit/stat_type/nominal_stats.rb +64 -0
- data/lib/flat_kit/stat_type/numerical_stats.rb +120 -0
- data/lib/flat_kit/stat_type/ordinal_stats.rb +37 -0
- data/lib/flat_kit/stat_type.rb +70 -0
- data/lib/flat_kit/stats.rb +64 -0
- data/lib/flat_kit/writer.rb +17 -3
- data/lib/flat_kit/xsv/format.rb +6 -2
- data/lib/flat_kit/xsv/reader.rb +8 -6
- data/lib/flat_kit/xsv/record.rb +21 -15
- data/lib/flat_kit/xsv/writer.rb +36 -18
- data/lib/flat_kit/xsv.rb +7 -4
- data/lib/flat_kit.rb +33 -21
- metadata +38 -113
- data/Rakefile +0 -20
- data/tasks/default.rake +0 -242
- data/tasks/extension.rake +0 -38
- data/tasks/man.rake +0 -7
- data/tasks/this.rb +0 -208
- data/test/device_dataset.rb +0 -117
- data/test/input/test_file.rb +0 -73
- data/test/input/test_io.rb +0 -93
- data/test/jsonl/test_format.rb +0 -22
- data/test/jsonl/test_reader.rb +0 -49
- data/test/jsonl/test_record.rb +0 -61
- data/test/jsonl/test_writer.rb +0 -68
- data/test/output/test_file.rb +0 -60
- data/test/output/test_io.rb +0 -104
- data/test/test_conversions.rb +0 -45
- data/test/test_event_emitter.rb +0 -72
- data/test/test_format.rb +0 -24
- data/test/test_helper.rb +0 -26
- data/test/test_merge.rb +0 -40
- data/test/test_merge_tree.rb +0 -64
- data/test/test_version.rb +0 -11
- data/test/xsv/test_format.rb +0 -22
- data/test/xsv/test_reader.rb +0 -61
- data/test/xsv/test_record.rb +0 -69
- data/test/xsv/test_writer.rb +0 -68
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
2
4
|
# Private: The Sentinel Internal Node is a private class used by the MergeTree
|
3
5
|
# class.
|
@@ -8,12 +10,10 @@ module FlatKit
|
|
8
10
|
class SentinelInternalNode
|
9
11
|
include Comparable
|
10
12
|
|
11
|
-
attr_reader :left
|
12
|
-
attr_reader :right
|
13
|
-
attr_reader :winner
|
13
|
+
attr_reader :left, :right, :winner
|
14
14
|
attr_accessor :next_level
|
15
15
|
|
16
|
-
def initialize(
|
16
|
+
def initialize(*)
|
17
17
|
@left = nil
|
18
18
|
@right = nil
|
19
19
|
@winner = nil
|
@@ -31,7 +31,8 @@ module FlatKit
|
|
31
31
|
# A sentinel node is always greater than any other node
|
32
32
|
def <=>(other)
|
33
33
|
return 0 if other.sentinel?
|
34
|
-
|
34
|
+
|
35
|
+
1
|
35
36
|
end
|
36
37
|
end
|
37
38
|
end
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
2
4
|
# Private: The Sentinel Leaf Node is used internally by the MergeTree
|
3
5
|
#
|
@@ -31,7 +33,8 @@ module FlatKit
|
|
31
33
|
# A sentinel node is always greater than any other node
|
32
34
|
def <=>(other)
|
33
35
|
return 0 if other.sentinel?
|
34
|
-
|
36
|
+
|
37
|
+
1
|
35
38
|
end
|
36
39
|
end
|
37
40
|
end
|
data/lib/flat_kit/sort.rb
CHANGED
@@ -1,13 +1,12 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
4
|
+
# Internal: Sorts an Input and sends the sorted records to an Output
|
5
|
+
#
|
2
6
|
class Sort
|
3
|
-
attr_reader :reader
|
4
|
-
attr_reader :writer
|
5
|
-
attr_reader :compare_fields
|
6
|
-
|
7
|
-
def initialize(input:, input_fallback: "auto",
|
8
|
-
output:, output_fallback: "auto",
|
9
|
-
compare_fields:)
|
7
|
+
attr_reader :reader, :writer, :compare_fields
|
10
8
|
|
9
|
+
def initialize(input:, output:, compare_fields:, input_fallback: "auto", output_fallback: "auto")
|
11
10
|
@compare_fields = compare_fields
|
12
11
|
@reader = ::FlatKit::Reader.create_reader_from_path(path: input, compare_fields: @compare_fields,
|
13
12
|
fallback: input_fallback)
|
@@ -16,8 +15,8 @@ module FlatKit
|
|
16
15
|
end
|
17
16
|
|
18
17
|
def call
|
19
|
-
::FlatKit.logger.info "Sorting #{reader.source} into #{writer.destination} using key #{compare_fields.join(
|
20
|
-
records =
|
18
|
+
::FlatKit.logger.info "Sorting #{reader.source} into #{writer.destination} using key #{compare_fields.join(', ')}"
|
19
|
+
records = [].tap do |a|
|
21
20
|
reader.each do |r|
|
22
21
|
a << r
|
23
22
|
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module FlatKit
|
4
|
+
class StatType
|
5
|
+
# Internal: Stats object to keep track of the count and frequency of values.
|
6
|
+
#
|
7
|
+
class NominalStats < StatType
|
8
|
+
attr_reader :count
|
9
|
+
|
10
|
+
def self.default_stats
|
11
|
+
@default_stats ||= %w[count]
|
12
|
+
end
|
13
|
+
|
14
|
+
def self.all_stats
|
15
|
+
@all_stats ||= %w[count unique_count unique_values mode]
|
16
|
+
end
|
17
|
+
|
18
|
+
def initialize(collecting_frequencies: false)
|
19
|
+
super()
|
20
|
+
@mutex = Mutex.new
|
21
|
+
@count = 0
|
22
|
+
@collecting_frequencies = collecting_frequencies
|
23
|
+
@frequencies = Hash.new(0)
|
24
|
+
end
|
25
|
+
|
26
|
+
def collected_stats
|
27
|
+
return self.class.default_stats unless @collecting_frequencies
|
28
|
+
|
29
|
+
self.class.all_stats
|
30
|
+
end
|
31
|
+
|
32
|
+
def mode
|
33
|
+
return nil unless @collecting_frequencies
|
34
|
+
|
35
|
+
@frequencies.max_by { |_item, item_count| item_count }.first
|
36
|
+
end
|
37
|
+
|
38
|
+
def unique_count
|
39
|
+
return nil unless @collecting_frequencies
|
40
|
+
|
41
|
+
@frequencies.size
|
42
|
+
end
|
43
|
+
|
44
|
+
def unique_values
|
45
|
+
return nil unless @collecting_frequencies
|
46
|
+
|
47
|
+
@frequencies.keys
|
48
|
+
end
|
49
|
+
|
50
|
+
def frequencies
|
51
|
+
return nil unless @collecting_frequencies
|
52
|
+
|
53
|
+
@frequencies
|
54
|
+
end
|
55
|
+
|
56
|
+
def update(value)
|
57
|
+
@mutex.synchronize do
|
58
|
+
@count += 1
|
59
|
+
@frequencies[value] += 1 if @collecting_frequencies
|
60
|
+
end
|
61
|
+
end
|
62
|
+
end
|
63
|
+
end
|
64
|
+
end
|
@@ -0,0 +1,120 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
#--
|
4
|
+
# Copyright (c) 2008, 2009 Jeremy Hinegardner
|
5
|
+
# All rights reserved. See LICENSE and/or COPYING for details.
|
6
|
+
#
|
7
|
+
# Pulled from Hitimes, which I also wrote
|
8
|
+
#++
|
9
|
+
|
10
|
+
require "oj"
|
11
|
+
|
12
|
+
module FlatKit
|
13
|
+
class StatType
|
14
|
+
# Internal: Stats object to keep track of the _min_, _max_, _count_, _sum_ and _sumsq_
|
15
|
+
# and when you want you may also retrieve the _mean_, _stddev_ and _rate_.
|
16
|
+
#
|
17
|
+
# This contrived example shows getting a list of all the files in a directory
|
18
|
+
# and running stats on file sizes.
|
19
|
+
#
|
20
|
+
# s = FlatKit::StatType::NumericalStats.new
|
21
|
+
# dir = ARGV.shift || Dir.pwd
|
22
|
+
# Dir.entries( dir ).each do |entry|
|
23
|
+
# fs = File.stat( entry )
|
24
|
+
# if fs.file? then
|
25
|
+
# s.update( fs.size )
|
26
|
+
# end
|
27
|
+
# end
|
28
|
+
#
|
29
|
+
# %w[ count min max mean sum stddev rate ].each do |m|
|
30
|
+
# puts "#{m.rjust(6)} : #{s.send( m ) }"
|
31
|
+
# end
|
32
|
+
#
|
33
|
+
class NumericalStats < NominalStats
|
34
|
+
# A list of the available stats
|
35
|
+
|
36
|
+
attr_reader :min, :max, :sum, :sumsq
|
37
|
+
|
38
|
+
def self.default_stats
|
39
|
+
@default_stats ||= %w[count max mean min rate stddev sum sumsq]
|
40
|
+
end
|
41
|
+
|
42
|
+
def self.all_stats
|
43
|
+
@all_stats ||= %w[count max mean min mode rate stddev sum sumsq unique_count unique_values]
|
44
|
+
end
|
45
|
+
|
46
|
+
def initialize(collecting_frequencies: false)
|
47
|
+
super
|
48
|
+
@min = Float::INFINITY
|
49
|
+
@max = -Float::INFINITY
|
50
|
+
@sum = 0.0
|
51
|
+
@sumsq = 0.0
|
52
|
+
end
|
53
|
+
|
54
|
+
# call-seq:
|
55
|
+
# stat.update( val ) -> val
|
56
|
+
#
|
57
|
+
# Update the running stats with the new value.
|
58
|
+
# Return the input value.
|
59
|
+
def update(value)
|
60
|
+
@mutex.synchronize do
|
61
|
+
@min = [value, @min].min
|
62
|
+
@max = [value, @max].max
|
63
|
+
|
64
|
+
@count += 1
|
65
|
+
@sum += value
|
66
|
+
@sumsq += (value * value)
|
67
|
+
|
68
|
+
# from Nominal update
|
69
|
+
@frequencies[value] += 1 if @collecting_frequencies
|
70
|
+
end
|
71
|
+
|
72
|
+
value
|
73
|
+
end
|
74
|
+
|
75
|
+
# call-seq:
|
76
|
+
# stat.mean -> Float
|
77
|
+
#
|
78
|
+
# Return the arithmetic mean of the values put into the Stats object. If no
|
79
|
+
# values have passed through the stats object then 0.0 is returned;
|
80
|
+
def mean
|
81
|
+
return 0.0 if @count.zero?
|
82
|
+
|
83
|
+
@sum / @count
|
84
|
+
end
|
85
|
+
|
86
|
+
# call-seq:
|
87
|
+
# stat.rate -> Float
|
88
|
+
#
|
89
|
+
# Return the +count+ divided by +sum+.
|
90
|
+
#
|
91
|
+
# In many cases when Stats#update( _value_ ) is called, the _value_ is a unit
|
92
|
+
# of time, typically seconds or microseconds. #rate is a convenience for those
|
93
|
+
# times. In this case, where _value_ is a unit of time, then count divided by
|
94
|
+
# sum is a useful value, i.e. +something per unit of time+.
|
95
|
+
#
|
96
|
+
# In the case where _value_ is a non-time related value, then the value
|
97
|
+
# returned by _rate_ is not really useful.
|
98
|
+
#
|
99
|
+
def rate
|
100
|
+
return 0.0 if @sum.zero?
|
101
|
+
|
102
|
+
@count / @sum
|
103
|
+
end
|
104
|
+
|
105
|
+
#
|
106
|
+
# call-seq:
|
107
|
+
# stat.stddev -> Float
|
108
|
+
#
|
109
|
+
# Return the standard deviation of all the values that have passed through the
|
110
|
+
# Stats object. The standard deviation has no meaning unless the count is > 1,
|
111
|
+
# therefore if the current _stat.count_ is <= 1 then 0.0 will be returned;
|
112
|
+
#
|
113
|
+
def stddev
|
114
|
+
return 0.0 unless @count > 1
|
115
|
+
|
116
|
+
Math.sqrt((@sumsq - ((@sum * @sum) / @count)) / (@count - 1))
|
117
|
+
end
|
118
|
+
end
|
119
|
+
end
|
120
|
+
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module FlatKit
|
4
|
+
class StatType
|
5
|
+
# Internal: Same as NominalStats and also collects min and max
|
6
|
+
#
|
7
|
+
class OrdinalStats < NominalStats
|
8
|
+
attr_reader :min, :max
|
9
|
+
|
10
|
+
def self.default_stats
|
11
|
+
@default_stats ||= %w[count max min]
|
12
|
+
end
|
13
|
+
|
14
|
+
def self.all_stats
|
15
|
+
@all_stats ||= %w[count max min unique_count unique_values mode]
|
16
|
+
end
|
17
|
+
|
18
|
+
def initialize(collecting_frequencies: false)
|
19
|
+
super
|
20
|
+
@min = nil
|
21
|
+
@max = nil
|
22
|
+
end
|
23
|
+
|
24
|
+
def update(value)
|
25
|
+
@mutex.synchronize do
|
26
|
+
@min = value if @min.nil? || (value < @min)
|
27
|
+
|
28
|
+
@max = value if @max.nil? || (value > @max)
|
29
|
+
|
30
|
+
@count += 1
|
31
|
+
|
32
|
+
@frequencies[value] += 1 if @collecting_frequencies
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module FlatKit
|
4
|
+
# Internal: Base class of all the statistic types
|
5
|
+
#
|
6
|
+
class StatType
|
7
|
+
def self.nominal_types
|
8
|
+
[FieldType::BooleanType, FieldType::StringType, FieldType::NullType]
|
9
|
+
end
|
10
|
+
|
11
|
+
def self.ordinal_types
|
12
|
+
[FieldType::DateType, FieldType::TimestampType]
|
13
|
+
end
|
14
|
+
|
15
|
+
def self.numerical_types
|
16
|
+
[FieldType::FloatType, FieldType::IntegerType]
|
17
|
+
end
|
18
|
+
|
19
|
+
def self.for(type)
|
20
|
+
return OrdinalStats if ordinal_types.include?(type)
|
21
|
+
return NominalStats if nominal_types.include?(type)
|
22
|
+
return NumericalStats if numerical_types.include?(type)
|
23
|
+
|
24
|
+
raise ArgumentError, "Unknown stat type for #{type}"
|
25
|
+
end
|
26
|
+
|
27
|
+
def collected_stats
|
28
|
+
raise NotImplementedError, "#{self.class.name} must implement #collected_stats"
|
29
|
+
end
|
30
|
+
|
31
|
+
#
|
32
|
+
# call-seq:
|
33
|
+
# stat.to_hash -> Hash
|
34
|
+
# stat.to_hash( %w[ count max mean ]) -> Hash
|
35
|
+
#
|
36
|
+
# return a hash of the stats. By default this returns a hash of all stats
|
37
|
+
# but passing in an array of items will limit the stats returned to only
|
38
|
+
# those in the Array.
|
39
|
+
#
|
40
|
+
# If passed an empty array or no arguments then #collected_stats is assumed to be
|
41
|
+
# the list of stats to return in the hash.
|
42
|
+
#
|
43
|
+
def to_hash(*args)
|
44
|
+
h = {}
|
45
|
+
args = [args].flatten
|
46
|
+
args = collected_stats if args.empty?
|
47
|
+
args.each do |meth|
|
48
|
+
h[meth] = send(meth)
|
49
|
+
end
|
50
|
+
h
|
51
|
+
end
|
52
|
+
|
53
|
+
#
|
54
|
+
# call-seq:
|
55
|
+
# stat.to_json -> String
|
56
|
+
# stat.to_json( *args ) -> String
|
57
|
+
#
|
58
|
+
# return a json string of the stats. By default this returns a json string
|
59
|
+
# of all the stats. If an array of items is passed in, those that match the
|
60
|
+
# known stats will be all that is included in the json output.
|
61
|
+
#
|
62
|
+
def to_json(*args)
|
63
|
+
h = to_hash(*args)
|
64
|
+
Oj.dump(h)
|
65
|
+
end
|
66
|
+
end
|
67
|
+
end
|
68
|
+
require "flat_kit/stat_type/nominal_stats"
|
69
|
+
require "flat_kit/stat_type/ordinal_stats"
|
70
|
+
require "flat_kit/stat_type/numerical_stats"
|
@@ -0,0 +1,64 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module FlatKit
|
4
|
+
# Internal: Collects stats from an Input and sends those stats to an Output
|
5
|
+
#
|
6
|
+
class Stats
|
7
|
+
include ::FlatKit::EventEmitter
|
8
|
+
|
9
|
+
AllFields = Class.new.freeze
|
10
|
+
|
11
|
+
attr_reader :reader, :writer, :fields_to_stat, :stats_to_collect, :stats_by_field
|
12
|
+
|
13
|
+
def initialize(input:, output:, input_fallback: "auto", output_fallback: "auto",
|
14
|
+
fields_to_stat: AllFields, stats_to_collect: FieldStats::CORE_STATS)
|
15
|
+
|
16
|
+
@fields_to_stat = fields_to_stat
|
17
|
+
@stats_to_collect = stats_to_collect
|
18
|
+
@stats_by_field = {}
|
19
|
+
@record_count = 0
|
20
|
+
|
21
|
+
@reader = ::FlatKit::Reader.create_reader_from_path(path: input, fallback: input_fallback)
|
22
|
+
@writer = ::FlatKit::Writer.create_writer_from_path(path: output, fallback: output_fallback,
|
23
|
+
reader_format: @reader.format_name)
|
24
|
+
end
|
25
|
+
|
26
|
+
def call
|
27
|
+
calculate_stats
|
28
|
+
write_stat_records
|
29
|
+
@writer.close
|
30
|
+
end
|
31
|
+
|
32
|
+
def collecting_stats_on_field?(name)
|
33
|
+
return true if @fields_to_stat == AllFields
|
34
|
+
|
35
|
+
@fields_to_stat.include?(name)
|
36
|
+
end
|
37
|
+
|
38
|
+
private
|
39
|
+
|
40
|
+
def calculate_stats
|
41
|
+
::FlatKit.logger.debug "Calculating statistics on #{reader.source}"
|
42
|
+
reader.each do |record|
|
43
|
+
record.to_hash.each do |field_name, field_value|
|
44
|
+
update_stats_for_field(name: field_name, value: field_value) if collecting_stats_on_field?(field_name)
|
45
|
+
end
|
46
|
+
@record_count += 1
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
def update_stats_for_field(name:, value:)
|
51
|
+
field_stats = @stats_by_field[name] ||= FieldStats.new(name: name, stats_to_collect: @stats_to_collect)
|
52
|
+
field_stats.update(value)
|
53
|
+
end
|
54
|
+
|
55
|
+
def write_stat_records
|
56
|
+
@stats_by_field.each_value do |stats|
|
57
|
+
h = stats.to_hash.merge({ "total_record_count" => @record_count })
|
58
|
+
record = ::FlatKit::Jsonl::Record.new(data: nil, complete_structured_data: h)
|
59
|
+
|
60
|
+
@writer.write(record)
|
61
|
+
end
|
62
|
+
end
|
63
|
+
end
|
64
|
+
end
|
data/lib/flat_kit/writer.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
2
4
|
# Public: The base class for all format writers.
|
3
5
|
#
|
@@ -11,7 +13,7 @@ module FlatKit
|
|
11
13
|
# See the Xsv::Writer and Jsonl::Writer for examples.
|
12
14
|
#
|
13
15
|
class Writer
|
14
|
-
attr_reader :destination
|
16
|
+
attr_reader :destination, :output, :count, :last_position
|
15
17
|
|
16
18
|
def self.create_writer_from_path(path:, fallback:, reader_format:)
|
17
19
|
fallback = reader_format if fallback == "auto"
|
@@ -21,18 +23,30 @@ module FlatKit
|
|
21
23
|
|
22
24
|
def initialize(destination:)
|
23
25
|
@destination = destination
|
26
|
+
@output = ::FlatKit::Output.from(@destination)
|
27
|
+
@count = 0
|
28
|
+
@last_position = nil
|
24
29
|
end
|
25
30
|
|
26
31
|
def format_name
|
27
32
|
self.class.format_name
|
28
33
|
end
|
29
34
|
|
35
|
+
def current_position
|
36
|
+
::FlatKit::Position.new(index: @count, # since this hasn't been written yet its the right index
|
37
|
+
offset: output.tell,
|
38
|
+
bytesize: 0) # nothing has been written yet
|
39
|
+
end
|
40
|
+
|
41
|
+
# The write method MUST return a Position object detailing the location the
|
42
|
+
# record was written in the output stream.
|
43
|
+
#
|
30
44
|
def write(record)
|
31
|
-
raise NotImplementedError, "#{self.class} needs to implement #write"
|
45
|
+
raise NotImplementedError, "#{self.class} needs to implement #write that returns Position"
|
32
46
|
end
|
33
47
|
|
34
48
|
def close
|
35
|
-
|
49
|
+
output.close
|
36
50
|
end
|
37
51
|
end
|
38
52
|
end
|
data/lib/flat_kit/xsv/format.rb
CHANGED
@@ -1,5 +1,9 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
2
4
|
module Xsv
|
5
|
+
# Internal: xsv format class holding the metadata about the xsv format utilities
|
6
|
+
#
|
3
7
|
class Format < ::FlatKit::Format
|
4
8
|
def self.format_name
|
5
9
|
"xsv"
|
@@ -7,10 +11,10 @@ module FlatKit
|
|
7
11
|
|
8
12
|
def self.handles?(filename)
|
9
13
|
parts = filename.split(".")
|
10
|
-
%w[
|
14
|
+
%w[csv tsv txt].each do |ext|
|
11
15
|
return true if parts.include?(ext)
|
12
16
|
end
|
13
|
-
|
17
|
+
false
|
14
18
|
end
|
15
19
|
|
16
20
|
def self.reader
|
data/lib/flat_kit/xsv/reader.rb
CHANGED
@@ -1,11 +1,13 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "csv"
|
2
4
|
|
3
5
|
module FlatKit
|
4
6
|
module Xsv
|
7
|
+
# Internal: Reader class that parses and yields records from xsv files
|
8
|
+
#
|
5
9
|
class Reader < ::FlatKit::Reader
|
6
|
-
attr_reader :input
|
7
|
-
attr_reader :count
|
8
|
-
attr_reader :fields
|
10
|
+
attr_reader :input, :count, :fields
|
9
11
|
|
10
12
|
def self.format_name
|
11
13
|
::FlatKit::Xsv::Format.format_name
|
@@ -15,7 +17,7 @@ module FlatKit
|
|
15
17
|
{
|
16
18
|
headers: :first_row,
|
17
19
|
converters: :numeric,
|
18
|
-
return_headers: false
|
20
|
+
return_headers: false,
|
19
21
|
}
|
20
22
|
end
|
21
23
|
|
@@ -36,7 +38,7 @@ module FlatKit
|
|
36
38
|
yield record
|
37
39
|
end
|
38
40
|
input.close
|
39
|
-
rescue => e
|
41
|
+
rescue StandardError => e
|
40
42
|
::FlatKit.logger.error "Error reading xsv records from #{input.name}: #{e}"
|
41
43
|
raise ::FlatKit::Error, e
|
42
44
|
end
|
data/lib/flat_kit/xsv/record.rb
CHANGED
@@ -1,8 +1,12 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "csv"
|
4
|
+
require "flat_kit/record"
|
3
5
|
|
4
6
|
module FlatKit
|
5
7
|
module Xsv
|
8
|
+
# Internal: Class that exposes the data from an XSV format record to the flatkit API
|
9
|
+
#
|
6
10
|
class Record < ::FlatKit::Record
|
7
11
|
attr_reader :ordered_fields
|
8
12
|
|
@@ -10,8 +14,8 @@ module FlatKit
|
|
10
14
|
::FlatKit::Xsv::Format.format_name
|
11
15
|
end
|
12
16
|
|
13
|
-
def self.from_record(record
|
14
|
-
if record.instance_of?(FlatKit::Xsv::Record)
|
17
|
+
def self.from_record(record)
|
18
|
+
if record.instance_of?(FlatKit::Xsv::Record)
|
15
19
|
new(data: record.data, compare_fields: record.compare_fields)
|
16
20
|
else
|
17
21
|
new(data: nil, compare_fields: record.compare_fields,
|
@@ -28,9 +32,9 @@ module FlatKit
|
|
28
32
|
@complete_structured_data = complete_structured_data
|
29
33
|
@ordered_fields = ordered_fields
|
30
34
|
|
31
|
-
if data.nil? && (complete_structured_data.nil? || complete_structured_data.empty?)
|
35
|
+
if data.nil? && (complete_structured_data.nil? || complete_structured_data.empty?)
|
32
36
|
raise FlatKit::Error,
|
33
|
-
|
37
|
+
"#{self.class} requires initialization from data: or complete_structured_data:"
|
34
38
|
end
|
35
39
|
|
36
40
|
resolve_ordered_fields
|
@@ -38,7 +42,8 @@ module FlatKit
|
|
38
42
|
|
39
43
|
def [](key)
|
40
44
|
return nil unless @compare_fields.include?(key)
|
41
|
-
|
45
|
+
|
46
|
+
if data.nil? && !@complete_structured_data.nil?
|
42
47
|
@complete_structured_data[key]
|
43
48
|
else
|
44
49
|
data[key]
|
@@ -53,7 +58,7 @@ module FlatKit
|
|
53
58
|
def to_a
|
54
59
|
return data.fields unless data.nil?
|
55
60
|
|
56
|
-
|
61
|
+
[].tap do |a|
|
57
62
|
@ordered_fields.each do |field|
|
58
63
|
a << @complete_structured_data[field]
|
59
64
|
end
|
@@ -71,19 +76,20 @@ module FlatKit
|
|
71
76
|
# values in that order.
|
72
77
|
def to_s
|
73
78
|
return data.to_csv unless data.nil?
|
79
|
+
|
74
80
|
CSV.generate_line(to_a)
|
75
81
|
end
|
76
82
|
|
77
83
|
private
|
78
84
|
|
79
85
|
def resolve_ordered_fields
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
86
|
+
return unless (@ordered_fields == :auto) || (@ordered_fields.nil? || @ordered_fields.empty?)
|
87
|
+
|
88
|
+
@ordered_fields = if @data.nil? || @data.empty?
|
89
|
+
complete_structured_data.keys
|
90
|
+
else
|
91
|
+
@data.headers
|
92
|
+
end
|
87
93
|
end
|
88
94
|
end
|
89
95
|
end
|