flat_kit 0.2.0 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CONTRIBUTING.md +1 -2
- data/HISTORY.md +15 -0
- data/Manifest.txt +21 -26
- data/{bin → exe}/fk +2 -1
- data/flat_kit.gemspec +33 -0
- data/lib/flat_kit/cli.rb +48 -23
- data/lib/flat_kit/command/cat.rb +34 -32
- data/lib/flat_kit/command/merge.rb +37 -36
- data/lib/flat_kit/command/sort.rb +37 -37
- data/lib/flat_kit/command/stats.rb +96 -0
- data/lib/flat_kit/command.rb +10 -10
- data/lib/flat_kit/descendant_tracker.rb +17 -5
- data/lib/flat_kit/error.rb +4 -0
- data/lib/flat_kit/event_emitter.rb +7 -4
- data/lib/flat_kit/field_stats.rb +246 -0
- data/lib/flat_kit/field_type/boolean_type.rb +52 -0
- data/lib/flat_kit/field_type/date_type.rb +181 -0
- data/lib/flat_kit/field_type/float_type.rb +43 -0
- data/lib/flat_kit/field_type/guess_type.rb +23 -0
- data/lib/flat_kit/field_type/integer_type.rb +36 -0
- data/lib/flat_kit/field_type/null_type.rb +39 -0
- data/lib/flat_kit/field_type/string_type.rb +24 -0
- data/lib/flat_kit/field_type/timestamp_type.rb +48 -0
- data/lib/flat_kit/field_type/unknown_type.rb +30 -0
- data/lib/flat_kit/field_type.rb +83 -0
- data/lib/flat_kit/format.rb +11 -5
- data/lib/flat_kit/input/file.rb +11 -9
- data/lib/flat_kit/input/io.rb +18 -21
- data/lib/flat_kit/input.rb +8 -7
- data/lib/flat_kit/internal_node.rb +22 -19
- data/lib/flat_kit/jsonl/format.rb +6 -2
- data/lib/flat_kit/jsonl/reader.rb +7 -4
- data/lib/flat_kit/jsonl/record.rb +16 -19
- data/lib/flat_kit/jsonl/writer.rb +25 -18
- data/lib/flat_kit/jsonl.rb +8 -4
- data/lib/flat_kit/leaf_node.rb +6 -5
- data/lib/flat_kit/log_formatter.rb +20 -0
- data/lib/flat_kit/logger.rb +12 -19
- data/lib/flat_kit/merge.rb +21 -16
- data/lib/flat_kit/merge_tree.rb +5 -6
- data/lib/flat_kit/output/file.rb +13 -9
- data/lib/flat_kit/output/io.rb +40 -35
- data/lib/flat_kit/output.rb +12 -7
- data/lib/flat_kit/position.rb +18 -0
- data/lib/flat_kit/reader.rb +8 -8
- data/lib/flat_kit/record.rb +12 -12
- data/lib/flat_kit/sentinel_internal_node.rb +6 -5
- data/lib/flat_kit/sentinel_leaf_node.rb +4 -1
- data/lib/flat_kit/sort.rb +8 -9
- data/lib/flat_kit/stat_type/nominal_stats.rb +64 -0
- data/lib/flat_kit/stat_type/numerical_stats.rb +120 -0
- data/lib/flat_kit/stat_type/ordinal_stats.rb +37 -0
- data/lib/flat_kit/stat_type.rb +70 -0
- data/lib/flat_kit/stats.rb +64 -0
- data/lib/flat_kit/writer.rb +17 -3
- data/lib/flat_kit/xsv/format.rb +6 -2
- data/lib/flat_kit/xsv/reader.rb +8 -6
- data/lib/flat_kit/xsv/record.rb +21 -15
- data/lib/flat_kit/xsv/writer.rb +36 -18
- data/lib/flat_kit/xsv.rb +7 -4
- data/lib/flat_kit.rb +33 -21
- metadata +38 -113
- data/Rakefile +0 -20
- data/tasks/default.rake +0 -242
- data/tasks/extension.rake +0 -38
- data/tasks/man.rake +0 -7
- data/tasks/this.rb +0 -208
- data/test/device_dataset.rb +0 -117
- data/test/input/test_file.rb +0 -73
- data/test/input/test_io.rb +0 -93
- data/test/jsonl/test_format.rb +0 -22
- data/test/jsonl/test_reader.rb +0 -49
- data/test/jsonl/test_record.rb +0 -61
- data/test/jsonl/test_writer.rb +0 -68
- data/test/output/test_file.rb +0 -60
- data/test/output/test_io.rb +0 -104
- data/test/test_conversions.rb +0 -45
- data/test/test_event_emitter.rb +0 -72
- data/test/test_format.rb +0 -24
- data/test/test_helper.rb +0 -26
- data/test/test_merge.rb +0 -40
- data/test/test_merge_tree.rb +0 -64
- data/test/test_version.rb +0 -11
- data/test/xsv/test_format.rb +0 -22
- data/test/xsv/test_reader.rb +0 -61
- data/test/xsv/test_record.rb +0 -69
- data/test/xsv/test_writer.rb +0 -68
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
2
4
|
# Private: The Sentinel Internal Node is a private class used by the MergeTree
|
3
5
|
# class.
|
@@ -8,12 +10,10 @@ module FlatKit
|
|
8
10
|
class SentinelInternalNode
|
9
11
|
include Comparable
|
10
12
|
|
11
|
-
attr_reader :left
|
12
|
-
attr_reader :right
|
13
|
-
attr_reader :winner
|
13
|
+
attr_reader :left, :right, :winner
|
14
14
|
attr_accessor :next_level
|
15
15
|
|
16
|
-
def initialize(
|
16
|
+
def initialize(*)
|
17
17
|
@left = nil
|
18
18
|
@right = nil
|
19
19
|
@winner = nil
|
@@ -31,7 +31,8 @@ module FlatKit
|
|
31
31
|
# A sentinel node is always greater than any other node
|
32
32
|
def <=>(other)
|
33
33
|
return 0 if other.sentinel?
|
34
|
-
|
34
|
+
|
35
|
+
1
|
35
36
|
end
|
36
37
|
end
|
37
38
|
end
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
2
4
|
# Private: The Sentinel Leaf Node is used internally by the MergeTree
|
3
5
|
#
|
@@ -31,7 +33,8 @@ module FlatKit
|
|
31
33
|
# A sentinel node is always greater than any other node
|
32
34
|
def <=>(other)
|
33
35
|
return 0 if other.sentinel?
|
34
|
-
|
36
|
+
|
37
|
+
1
|
35
38
|
end
|
36
39
|
end
|
37
40
|
end
|
data/lib/flat_kit/sort.rb
CHANGED
@@ -1,13 +1,12 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
4
|
+
# Internal: Sorts an Input and sends the sorted records to an Output
|
5
|
+
#
|
2
6
|
class Sort
|
3
|
-
attr_reader :reader
|
4
|
-
attr_reader :writer
|
5
|
-
attr_reader :compare_fields
|
6
|
-
|
7
|
-
def initialize(input:, input_fallback: "auto",
|
8
|
-
output:, output_fallback: "auto",
|
9
|
-
compare_fields:)
|
7
|
+
attr_reader :reader, :writer, :compare_fields
|
10
8
|
|
9
|
+
def initialize(input:, output:, compare_fields:, input_fallback: "auto", output_fallback: "auto")
|
11
10
|
@compare_fields = compare_fields
|
12
11
|
@reader = ::FlatKit::Reader.create_reader_from_path(path: input, compare_fields: @compare_fields,
|
13
12
|
fallback: input_fallback)
|
@@ -16,8 +15,8 @@ module FlatKit
|
|
16
15
|
end
|
17
16
|
|
18
17
|
def call
|
19
|
-
::FlatKit.logger.info "Sorting #{reader.source} into #{writer.destination} using key #{compare_fields.join(
|
20
|
-
records =
|
18
|
+
::FlatKit.logger.info "Sorting #{reader.source} into #{writer.destination} using key #{compare_fields.join(', ')}"
|
19
|
+
records = [].tap do |a|
|
21
20
|
reader.each do |r|
|
22
21
|
a << r
|
23
22
|
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module FlatKit
|
4
|
+
class StatType
|
5
|
+
# Internal: Stats object to keep track of the count and frequency of values.
|
6
|
+
#
|
7
|
+
class NominalStats < StatType
|
8
|
+
attr_reader :count
|
9
|
+
|
10
|
+
def self.default_stats
|
11
|
+
@default_stats ||= %w[count]
|
12
|
+
end
|
13
|
+
|
14
|
+
def self.all_stats
|
15
|
+
@all_stats ||= %w[count unique_count unique_values mode]
|
16
|
+
end
|
17
|
+
|
18
|
+
def initialize(collecting_frequencies: false)
|
19
|
+
super()
|
20
|
+
@mutex = Mutex.new
|
21
|
+
@count = 0
|
22
|
+
@collecting_frequencies = collecting_frequencies
|
23
|
+
@frequencies = Hash.new(0)
|
24
|
+
end
|
25
|
+
|
26
|
+
def collected_stats
|
27
|
+
return self.class.default_stats unless @collecting_frequencies
|
28
|
+
|
29
|
+
self.class.all_stats
|
30
|
+
end
|
31
|
+
|
32
|
+
def mode
|
33
|
+
return nil unless @collecting_frequencies
|
34
|
+
|
35
|
+
@frequencies.max_by { |_item, item_count| item_count }.first
|
36
|
+
end
|
37
|
+
|
38
|
+
def unique_count
|
39
|
+
return nil unless @collecting_frequencies
|
40
|
+
|
41
|
+
@frequencies.size
|
42
|
+
end
|
43
|
+
|
44
|
+
def unique_values
|
45
|
+
return nil unless @collecting_frequencies
|
46
|
+
|
47
|
+
@frequencies.keys
|
48
|
+
end
|
49
|
+
|
50
|
+
def frequencies
|
51
|
+
return nil unless @collecting_frequencies
|
52
|
+
|
53
|
+
@frequencies
|
54
|
+
end
|
55
|
+
|
56
|
+
def update(value)
|
57
|
+
@mutex.synchronize do
|
58
|
+
@count += 1
|
59
|
+
@frequencies[value] += 1 if @collecting_frequencies
|
60
|
+
end
|
61
|
+
end
|
62
|
+
end
|
63
|
+
end
|
64
|
+
end
|
@@ -0,0 +1,120 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
#--
|
4
|
+
# Copyright (c) 2008, 2009 Jeremy Hinegardner
|
5
|
+
# All rights reserved. See LICENSE and/or COPYING for details.
|
6
|
+
#
|
7
|
+
# Pulled from Hitimes, which I also wrote
|
8
|
+
#++
|
9
|
+
|
10
|
+
require "oj"
|
11
|
+
|
12
|
+
module FlatKit
|
13
|
+
class StatType
|
14
|
+
# Internal: Stats object to keep track of the _min_, _max_, _count_, _sum_ and _sumsq_
|
15
|
+
# and when you want you may also retrieve the _mean_, _stddev_ and _rate_.
|
16
|
+
#
|
17
|
+
# This contrived example shows getting a list of all the files in a directory
|
18
|
+
# and running stats on file sizes.
|
19
|
+
#
|
20
|
+
# s = FlatKit::StatType::NumericalStats.new
|
21
|
+
# dir = ARGV.shift || Dir.pwd
|
22
|
+
# Dir.entries( dir ).each do |entry|
|
23
|
+
# fs = File.stat( entry )
|
24
|
+
# if fs.file? then
|
25
|
+
# s.update( fs.size )
|
26
|
+
# end
|
27
|
+
# end
|
28
|
+
#
|
29
|
+
# %w[ count min max mean sum stddev rate ].each do |m|
|
30
|
+
# puts "#{m.rjust(6)} : #{s.send( m ) }"
|
31
|
+
# end
|
32
|
+
#
|
33
|
+
class NumericalStats < NominalStats
|
34
|
+
# A list of the available stats
|
35
|
+
|
36
|
+
attr_reader :min, :max, :sum, :sumsq
|
37
|
+
|
38
|
+
def self.default_stats
|
39
|
+
@default_stats ||= %w[count max mean min rate stddev sum sumsq]
|
40
|
+
end
|
41
|
+
|
42
|
+
def self.all_stats
|
43
|
+
@all_stats ||= %w[count max mean min mode rate stddev sum sumsq unique_count unique_values]
|
44
|
+
end
|
45
|
+
|
46
|
+
def initialize(collecting_frequencies: false)
|
47
|
+
super
|
48
|
+
@min = Float::INFINITY
|
49
|
+
@max = -Float::INFINITY
|
50
|
+
@sum = 0.0
|
51
|
+
@sumsq = 0.0
|
52
|
+
end
|
53
|
+
|
54
|
+
# call-seq:
|
55
|
+
# stat.update( val ) -> val
|
56
|
+
#
|
57
|
+
# Update the running stats with the new value.
|
58
|
+
# Return the input value.
|
59
|
+
def update(value)
|
60
|
+
@mutex.synchronize do
|
61
|
+
@min = [value, @min].min
|
62
|
+
@max = [value, @max].max
|
63
|
+
|
64
|
+
@count += 1
|
65
|
+
@sum += value
|
66
|
+
@sumsq += (value * value)
|
67
|
+
|
68
|
+
# from Nominal update
|
69
|
+
@frequencies[value] += 1 if @collecting_frequencies
|
70
|
+
end
|
71
|
+
|
72
|
+
value
|
73
|
+
end
|
74
|
+
|
75
|
+
# call-seq:
|
76
|
+
# stat.mean -> Float
|
77
|
+
#
|
78
|
+
# Return the arithmetic mean of the values put into the Stats object. If no
|
79
|
+
# values have passed through the stats object then 0.0 is returned;
|
80
|
+
def mean
|
81
|
+
return 0.0 if @count.zero?
|
82
|
+
|
83
|
+
@sum / @count
|
84
|
+
end
|
85
|
+
|
86
|
+
# call-seq:
|
87
|
+
# stat.rate -> Float
|
88
|
+
#
|
89
|
+
# Return the +count+ divided by +sum+.
|
90
|
+
#
|
91
|
+
# In many cases when Stats#update( _value_ ) is called, the _value_ is a unit
|
92
|
+
# of time, typically seconds or microseconds. #rate is a convenience for those
|
93
|
+
# times. In this case, where _value_ is a unit of time, then count divided by
|
94
|
+
# sum is a useful value, i.e. +something per unit of time+.
|
95
|
+
#
|
96
|
+
# In the case where _value_ is a non-time related value, then the value
|
97
|
+
# returned by _rate_ is not really useful.
|
98
|
+
#
|
99
|
+
def rate
|
100
|
+
return 0.0 if @sum.zero?
|
101
|
+
|
102
|
+
@count / @sum
|
103
|
+
end
|
104
|
+
|
105
|
+
#
|
106
|
+
# call-seq:
|
107
|
+
# stat.stddev -> Float
|
108
|
+
#
|
109
|
+
# Return the standard deviation of all the values that have passed through the
|
110
|
+
# Stats object. The standard deviation has no meaning unless the count is > 1,
|
111
|
+
# therefore if the current _stat.count_ is <= 1 then 0.0 will be returned.
|
112
|
+
#
|
113
|
+
def stddev
|
114
|
+
return 0.0 unless @count > 1
|
115
|
+
|
116
|
+
Math.sqrt((@sumsq - ((@sum * @sum) / @count)) / (@count - 1))
|
117
|
+
end
|
118
|
+
end
|
119
|
+
end
|
120
|
+
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module FlatKit
|
4
|
+
class StatType
|
5
|
+
# Internal: Same as NominalStats and also collects min and max
|
6
|
+
#
|
7
|
+
class OrdinalStats < NominalStats
|
8
|
+
attr_reader :min, :max
|
9
|
+
|
10
|
+
def self.default_stats
|
11
|
+
@default_stats ||= %w[count max min]
|
12
|
+
end
|
13
|
+
|
14
|
+
def self.all_stats
|
15
|
+
@all_stats ||= %w[count max min unique_count unique_values mode]
|
16
|
+
end
|
17
|
+
|
18
|
+
def initialize(collecting_frequencies: false)
|
19
|
+
super
|
20
|
+
@min = nil
|
21
|
+
@max = nil
|
22
|
+
end
|
23
|
+
|
24
|
+
def update(value)
|
25
|
+
@mutex.synchronize do
|
26
|
+
@min = value if @min.nil? || (value < @min)
|
27
|
+
|
28
|
+
@max = value if @max.nil? || (value > @max)
|
29
|
+
|
30
|
+
@count += 1
|
31
|
+
|
32
|
+
@frequencies[value] += 1 if @collecting_frequencies
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module FlatKit
|
4
|
+
# Internal: Base class of all the statistic types
|
5
|
+
#
|
6
|
+
class StatType
|
7
|
+
def self.nominal_types
|
8
|
+
[FieldType::BooleanType, FieldType::StringType, FieldType::NullType]
|
9
|
+
end
|
10
|
+
|
11
|
+
def self.ordinal_types
|
12
|
+
[FieldType::DateType, FieldType::TimestampType]
|
13
|
+
end
|
14
|
+
|
15
|
+
def self.numerical_types
|
16
|
+
[FieldType::FloatType, FieldType::IntegerType]
|
17
|
+
end
|
18
|
+
|
19
|
+
def self.for(type)
|
20
|
+
return OrdinalStats if ordinal_types.include?(type)
|
21
|
+
return NominalStats if nominal_types.include?(type)
|
22
|
+
return NumericalStats if numerical_types.include?(type)
|
23
|
+
|
24
|
+
raise ArgumentError, "Unknown stat type for #{type}"
|
25
|
+
end
|
26
|
+
|
27
|
+
def collected_stats
|
28
|
+
raise NotImplementedError, "#{self.class.name} must implement #collected_stats"
|
29
|
+
end
|
30
|
+
|
31
|
+
#
|
32
|
+
# call-seq:
|
33
|
+
# stat.to_hash -> Hash
|
34
|
+
# stat.to_hash( %w[ count max mean ]) -> Hash
|
35
|
+
#
|
36
|
+
# return a hash of the stats. By default this returns a hash of all stats
|
37
|
+
# but passing in an array of items will limit the stats returned to only
|
38
|
+
# those in the Array.
|
39
|
+
#
|
40
|
+
# If no arguments or an empty array are passed to to_hash then collected_stats is assumed to be
|
41
|
+
# the list of stats to return in the hash.
|
42
|
+
#
|
43
|
+
def to_hash(*args)
|
44
|
+
h = {}
|
45
|
+
args = [args].flatten
|
46
|
+
args = collected_stats if args.empty?
|
47
|
+
args.each do |meth|
|
48
|
+
h[meth] = send(meth)
|
49
|
+
end
|
50
|
+
h
|
51
|
+
end
|
52
|
+
|
53
|
+
#
|
54
|
+
# call-seq:
|
55
|
+
# stat.to_json -> String
|
56
|
+
# stat.to_json( *args ) -> String
|
57
|
+
#
|
58
|
+
# return a json string of the stats. By default this returns a json string
|
59
|
+
# of all the stats. If an array of items is passed in, those that match the
|
60
|
+
# known stats will be all that is included in the json output.
|
61
|
+
#
|
62
|
+
def to_json(*args)
|
63
|
+
h = to_hash(*args)
|
64
|
+
Oj.dump(h)
|
65
|
+
end
|
66
|
+
end
|
67
|
+
end
|
68
|
+
require "flat_kit/stat_type/nominal_stats"
|
69
|
+
require "flat_kit/stat_type/ordinal_stats"
|
70
|
+
require "flat_kit/stat_type/numerical_stats"
|
@@ -0,0 +1,64 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module FlatKit
|
4
|
+
# Internal: Collects stats from an Input and sends those stats to an Output
|
5
|
+
#
|
6
|
+
class Stats
|
7
|
+
include ::FlatKit::EventEmitter
|
8
|
+
|
9
|
+
AllFields = Class.new.freeze
|
10
|
+
|
11
|
+
attr_reader :reader, :writer, :fields_to_stat, :stats_to_collect, :stats_by_field
|
12
|
+
|
13
|
+
def initialize(input:, output:, input_fallback: "auto", output_fallback: "auto",
|
14
|
+
fields_to_stat: AllFields, stats_to_collect: FieldStats::CORE_STATS)
|
15
|
+
|
16
|
+
@fields_to_stat = fields_to_stat
|
17
|
+
@stats_to_collect = stats_to_collect
|
18
|
+
@stats_by_field = {}
|
19
|
+
@record_count = 0
|
20
|
+
|
21
|
+
@reader = ::FlatKit::Reader.create_reader_from_path(path: input, fallback: input_fallback)
|
22
|
+
@writer = ::FlatKit::Writer.create_writer_from_path(path: output, fallback: output_fallback,
|
23
|
+
reader_format: @reader.format_name)
|
24
|
+
end
|
25
|
+
|
26
|
+
def call
|
27
|
+
calculate_stats
|
28
|
+
write_stat_records
|
29
|
+
@writer.close
|
30
|
+
end
|
31
|
+
|
32
|
+
def collecting_stats_on_field?(name)
|
33
|
+
return true if @fields_to_stat == AllFields
|
34
|
+
|
35
|
+
@fields_to_stat.include?(name)
|
36
|
+
end
|
37
|
+
|
38
|
+
private
|
39
|
+
|
40
|
+
def calculate_stats
|
41
|
+
::FlatKit.logger.debug "Calculating statistics on #{reader.source}"
|
42
|
+
reader.each do |record|
|
43
|
+
record.to_hash.each do |field_name, field_value|
|
44
|
+
update_stats_for_field(name: field_name, value: field_value) if collecting_stats_on_field?(field_name)
|
45
|
+
end
|
46
|
+
@record_count += 1
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
def update_stats_for_field(name:, value:)
|
51
|
+
field_stats = @stats_by_field[name] ||= FieldStats.new(name: name, stats_to_collect: @stats_to_collect)
|
52
|
+
field_stats.update(value)
|
53
|
+
end
|
54
|
+
|
55
|
+
def write_stat_records
|
56
|
+
@stats_by_field.each_value do |stats|
|
57
|
+
h = stats.to_hash.merge({ "total_record_count" => @record_count })
|
58
|
+
record = ::FlatKit::Jsonl::Record.new(data: nil, complete_structured_data: h)
|
59
|
+
|
60
|
+
@writer.write(record)
|
61
|
+
end
|
62
|
+
end
|
63
|
+
end
|
64
|
+
end
|
data/lib/flat_kit/writer.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
2
4
|
# Public: The base class for all format writers.
|
3
5
|
#
|
@@ -11,7 +13,7 @@ module FlatKit
|
|
11
13
|
# See the Xsv::Writer and Jsonl::Writer for examples.
|
12
14
|
#
|
13
15
|
class Writer
|
14
|
-
attr_reader :destination
|
16
|
+
attr_reader :destination, :output, :count, :last_position
|
15
17
|
|
16
18
|
def self.create_writer_from_path(path:, fallback:, reader_format:)
|
17
19
|
fallback = reader_format if fallback == "auto"
|
@@ -21,18 +23,30 @@ module FlatKit
|
|
21
23
|
|
22
24
|
def initialize(destination:)
|
23
25
|
@destination = destination
|
26
|
+
@output = ::FlatKit::Output.from(@destination)
|
27
|
+
@count = 0
|
28
|
+
@last_position = nil
|
24
29
|
end
|
25
30
|
|
26
31
|
def format_name
|
27
32
|
self.class.format_name
|
28
33
|
end
|
29
34
|
|
35
|
+
def current_position
|
36
|
+
::FlatKit::Position.new(index: @count, # since this hasn't been written yet it's the right index
|
37
|
+
offset: output.tell,
|
38
|
+
bytesize: 0) # nothing has been written yet
|
39
|
+
end
|
40
|
+
|
41
|
+
# The write method MUST return a Position object detailing the location the
|
42
|
+
# record was written in the output stream.
|
43
|
+
#
|
30
44
|
def write(record)
|
31
|
-
raise NotImplementedError, "#{self.class} needs to implement #write"
|
45
|
+
raise NotImplementedError, "#{self.class} needs to implement #write that returns Position"
|
32
46
|
end
|
33
47
|
|
34
48
|
def close
|
35
|
-
|
49
|
+
output.close
|
36
50
|
end
|
37
51
|
end
|
38
52
|
end
|
data/lib/flat_kit/xsv/format.rb
CHANGED
@@ -1,5 +1,9 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
2
4
|
module Xsv
|
5
|
+
# Internal: xsv format class holding the metadata about the xsv format utilities
|
6
|
+
#
|
3
7
|
class Format < ::FlatKit::Format
|
4
8
|
def self.format_name
|
5
9
|
"xsv"
|
@@ -7,10 +11,10 @@ module FlatKit
|
|
7
11
|
|
8
12
|
def self.handles?(filename)
|
9
13
|
parts = filename.split(".")
|
10
|
-
%w[
|
14
|
+
%w[csv tsv txt].each do |ext|
|
11
15
|
return true if parts.include?(ext)
|
12
16
|
end
|
13
|
-
|
17
|
+
false
|
14
18
|
end
|
15
19
|
|
16
20
|
def self.reader
|
data/lib/flat_kit/xsv/reader.rb
CHANGED
@@ -1,11 +1,13 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "csv"
|
2
4
|
|
3
5
|
module FlatKit
|
4
6
|
module Xsv
|
7
|
+
# Internal: Reader class that parses and yields records from xsv files
|
8
|
+
#
|
5
9
|
class Reader < ::FlatKit::Reader
|
6
|
-
attr_reader :input
|
7
|
-
attr_reader :count
|
8
|
-
attr_reader :fields
|
10
|
+
attr_reader :input, :count, :fields
|
9
11
|
|
10
12
|
def self.format_name
|
11
13
|
::FlatKit::Xsv::Format.format_name
|
@@ -15,7 +17,7 @@ module FlatKit
|
|
15
17
|
{
|
16
18
|
headers: :first_row,
|
17
19
|
converters: :numeric,
|
18
|
-
return_headers: false
|
20
|
+
return_headers: false,
|
19
21
|
}
|
20
22
|
end
|
21
23
|
|
@@ -36,7 +38,7 @@ module FlatKit
|
|
36
38
|
yield record
|
37
39
|
end
|
38
40
|
input.close
|
39
|
-
rescue => e
|
41
|
+
rescue StandardError => e
|
40
42
|
::FlatKit.logger.error "Error reading xsv records from #{input.name}: #{e}"
|
41
43
|
raise ::FlatKit::Error, e
|
42
44
|
end
|
data/lib/flat_kit/xsv/record.rb
CHANGED
@@ -1,8 +1,12 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "csv"
|
4
|
+
require "flat_kit/record"
|
3
5
|
|
4
6
|
module FlatKit
|
5
7
|
module Xsv
|
8
|
+
# Internal: Class that exposes the data from an XSV format record to the flatkit API
|
9
|
+
#
|
6
10
|
class Record < ::FlatKit::Record
|
7
11
|
attr_reader :ordered_fields
|
8
12
|
|
@@ -10,8 +14,8 @@ module FlatKit
|
|
10
14
|
::FlatKit::Xsv::Format.format_name
|
11
15
|
end
|
12
16
|
|
13
|
-
def self.from_record(record
|
14
|
-
if record.instance_of?(FlatKit::Xsv::Record)
|
17
|
+
def self.from_record(record)
|
18
|
+
if record.instance_of?(FlatKit::Xsv::Record)
|
15
19
|
new(data: record.data, compare_fields: record.compare_fields)
|
16
20
|
else
|
17
21
|
new(data: nil, compare_fields: record.compare_fields,
|
@@ -28,9 +32,9 @@ module FlatKit
|
|
28
32
|
@complete_structured_data = complete_structured_data
|
29
33
|
@ordered_fields = ordered_fields
|
30
34
|
|
31
|
-
if data.nil? && (complete_structured_data.nil? || complete_structured_data.empty?)
|
35
|
+
if data.nil? && (complete_structured_data.nil? || complete_structured_data.empty?)
|
32
36
|
raise FlatKit::Error,
|
33
|
-
|
37
|
+
"#{self.class} requires initialization from data: or complete_structured_data:"
|
34
38
|
end
|
35
39
|
|
36
40
|
resolve_ordered_fields
|
@@ -38,7 +42,8 @@ module FlatKit
|
|
38
42
|
|
39
43
|
def [](key)
|
40
44
|
return nil unless @compare_fields.include?(key)
|
41
|
-
|
45
|
+
|
46
|
+
if data.nil? && !@complete_structured_data.nil?
|
42
47
|
@complete_structured_data[key]
|
43
48
|
else
|
44
49
|
data[key]
|
@@ -53,7 +58,7 @@ module FlatKit
|
|
53
58
|
def to_a
|
54
59
|
return data.fields unless data.nil?
|
55
60
|
|
56
|
-
|
61
|
+
[].tap do |a|
|
57
62
|
@ordered_fields.each do |field|
|
58
63
|
a << @complete_structured_data[field]
|
59
64
|
end
|
@@ -71,19 +76,20 @@ module FlatKit
|
|
71
76
|
# values in that order.
|
72
77
|
def to_s
|
73
78
|
return data.to_csv unless data.nil?
|
79
|
+
|
74
80
|
CSV.generate_line(to_a)
|
75
81
|
end
|
76
82
|
|
77
83
|
private
|
78
84
|
|
79
85
|
def resolve_ordered_fields
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
86
|
+
return unless (@ordered_fields == :auto) || (@ordered_fields.nil? || @ordered_fields.empty?)
|
87
|
+
|
88
|
+
@ordered_fields = if @data.nil? || @data.empty?
|
89
|
+
complete_structured_data.keys
|
90
|
+
else
|
91
|
+
@data.headers
|
92
|
+
end
|
87
93
|
end
|
88
94
|
end
|
89
95
|
end
|