flat_kit 0.2.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CONTRIBUTING.md +1 -2
- data/HISTORY.md +15 -0
- data/Manifest.txt +21 -26
- data/{bin → exe}/fk +2 -1
- data/flat_kit.gemspec +33 -0
- data/lib/flat_kit/cli.rb +48 -23
- data/lib/flat_kit/command/cat.rb +34 -32
- data/lib/flat_kit/command/merge.rb +37 -36
- data/lib/flat_kit/command/sort.rb +37 -37
- data/lib/flat_kit/command/stats.rb +96 -0
- data/lib/flat_kit/command.rb +10 -10
- data/lib/flat_kit/descendant_tracker.rb +17 -5
- data/lib/flat_kit/error.rb +4 -0
- data/lib/flat_kit/event_emitter.rb +7 -4
- data/lib/flat_kit/field_stats.rb +246 -0
- data/lib/flat_kit/field_type/boolean_type.rb +52 -0
- data/lib/flat_kit/field_type/date_type.rb +181 -0
- data/lib/flat_kit/field_type/float_type.rb +43 -0
- data/lib/flat_kit/field_type/guess_type.rb +23 -0
- data/lib/flat_kit/field_type/integer_type.rb +36 -0
- data/lib/flat_kit/field_type/null_type.rb +39 -0
- data/lib/flat_kit/field_type/string_type.rb +24 -0
- data/lib/flat_kit/field_type/timestamp_type.rb +48 -0
- data/lib/flat_kit/field_type/unknown_type.rb +30 -0
- data/lib/flat_kit/field_type.rb +83 -0
- data/lib/flat_kit/format.rb +11 -5
- data/lib/flat_kit/input/file.rb +11 -9
- data/lib/flat_kit/input/io.rb +18 -21
- data/lib/flat_kit/input.rb +8 -7
- data/lib/flat_kit/internal_node.rb +22 -19
- data/lib/flat_kit/jsonl/format.rb +6 -2
- data/lib/flat_kit/jsonl/reader.rb +7 -4
- data/lib/flat_kit/jsonl/record.rb +16 -19
- data/lib/flat_kit/jsonl/writer.rb +25 -18
- data/lib/flat_kit/jsonl.rb +8 -4
- data/lib/flat_kit/leaf_node.rb +6 -5
- data/lib/flat_kit/log_formatter.rb +20 -0
- data/lib/flat_kit/logger.rb +12 -19
- data/lib/flat_kit/merge.rb +21 -16
- data/lib/flat_kit/merge_tree.rb +5 -6
- data/lib/flat_kit/output/file.rb +13 -9
- data/lib/flat_kit/output/io.rb +40 -35
- data/lib/flat_kit/output.rb +12 -7
- data/lib/flat_kit/position.rb +18 -0
- data/lib/flat_kit/reader.rb +8 -8
- data/lib/flat_kit/record.rb +12 -12
- data/lib/flat_kit/sentinel_internal_node.rb +6 -5
- data/lib/flat_kit/sentinel_leaf_node.rb +4 -1
- data/lib/flat_kit/sort.rb +8 -9
- data/lib/flat_kit/stat_type/nominal_stats.rb +64 -0
- data/lib/flat_kit/stat_type/numerical_stats.rb +120 -0
- data/lib/flat_kit/stat_type/ordinal_stats.rb +37 -0
- data/lib/flat_kit/stat_type.rb +70 -0
- data/lib/flat_kit/stats.rb +64 -0
- data/lib/flat_kit/writer.rb +17 -3
- data/lib/flat_kit/xsv/format.rb +6 -2
- data/lib/flat_kit/xsv/reader.rb +8 -6
- data/lib/flat_kit/xsv/record.rb +21 -15
- data/lib/flat_kit/xsv/writer.rb +36 -18
- data/lib/flat_kit/xsv.rb +7 -4
- data/lib/flat_kit.rb +33 -21
- metadata +38 -113
- data/Rakefile +0 -20
- data/tasks/default.rake +0 -242
- data/tasks/extension.rake +0 -38
- data/tasks/man.rake +0 -7
- data/tasks/this.rb +0 -208
- data/test/device_dataset.rb +0 -117
- data/test/input/test_file.rb +0 -73
- data/test/input/test_io.rb +0 -93
- data/test/jsonl/test_format.rb +0 -22
- data/test/jsonl/test_reader.rb +0 -49
- data/test/jsonl/test_record.rb +0 -61
- data/test/jsonl/test_writer.rb +0 -68
- data/test/output/test_file.rb +0 -60
- data/test/output/test_io.rb +0 -104
- data/test/test_conversions.rb +0 -45
- data/test/test_event_emitter.rb +0 -72
- data/test/test_format.rb +0 -24
- data/test/test_helper.rb +0 -26
- data/test/test_merge.rb +0 -40
- data/test/test_merge_tree.rb +0 -64
- data/test/test_version.rb +0 -11
- data/test/xsv/test_format.rb +0 -22
- data/test/xsv/test_reader.rb +0 -61
- data/test/xsv/test_record.rb +0 -69
- data/test/xsv/test_writer.rb +0 -68
@@ -0,0 +1,96 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module FlatKit
  class Command
    # Internal: The implementation of the stats command.
    #
    # Turns the `fk stats` command line into a ::FlatKit::Stats instance
    # (see #parse) and runs it (see #call).
    class Stats < ::FlatKit::Command
      # Returns the String name used to select this command on the command line.
      def self.name
        "stats"
      end

      # Returns the one line String description of this command.
      def self.description
        "Collect and report stats on the inputfile"
      end

      # Build the Optimist parser holding the help text and the options of the
      # stats command.
      #
      # Returns an ::Optimist::Parser.
      def self.parser
        ::Optimist::Parser.new do
          # Fully qualified on purpose: the banner must show the description of
          # this command (not Sort's), and ::FlatKit::Stats is a different class.
          banner ::FlatKit::Command::Stats.description.to_s
          banner ""

          banner <<~BANNER
            Given an input file collect basic statistics.

            The statistics can vary based upon the datatype of the field.

            Numeric fields will report the basic count, min, max, mean, standard deviation and sum.
            Non-numeric fields that are comparable, like dates, will report count, min and max.
            Other non-numeric fields will only report the count.

            Adding --cardinality will report the count, and frequency of distinct values in the result.
            This will allow for reporting the median value.

            The fields upon which stats are collected may be selected with the --select parameter.
            By default statistics are collected on all fields.

            The flatfile type(s) will be automatically determined by the file name.

            The output can be dumped as a CSV, JSON or a formatted ascii table.

          BANNER

          banner <<~USAGE

            Usage:
              fk stats --everything file.json
              fk stats --select surname,given_name file.csv
              fk stats --select surname,given_name --output-format json file.csv > stats.json
              fk stats --select field1,field2 --output-format json input.csv
              fk stats --select field1 file.json.gz -o stats.csv
              gunzip -c file.json.gz | fk stats --input-format json --output-format text

          USAGE

          banner <<~OPTIONS

            Options:

          OPTIONS

          opt :output, "Send the output to the given path instead of standard out.", default: "<stdout>"
          opt :input_format, "Input format, csv or json", default: "auto", short: :none
          opt :output_format, "Output format, csv or json", default: "auto", short: :none
          opt :select, "The comma separated list of field(s) to report stats on", required: false, type: :string
          opt :everything, "Show all statistics that are possible", default: false
          opt :cardinality, "Show the cardinality of the fields, this requires additional memory", default: false
        end
      end

      # Parse argv and build the ::FlatKit::Stats instance that #call will run.
      #
      # At most one input path may be given; with no path, "-" is used so the
      # input is read from stdin. Any ::FlatKit::Error raised while building the
      # stats object is re-raised as an ::Optimist::CommandlineError so it goes
      # through Optimist's standard exception handling.
      def parse
        parser = self.class.parser
        ::Optimist.with_standard_exception_handling(parser) do
          opts = parser.parse(argv)

          # --select is a single comma separated value; without it every field
          # is used.
          fields = ::FlatKit::Stats::AllFields
          fields = CSV.parse_line(opts[:select]) if opts[:select]

          # Core stats are always collected, cardinality only on request.
          stats = [FieldStats::CORE_STATS]
          stats << FieldStats::CARDINALITY_STATS if opts[:cardinality] || opts[:everything]

          paths = parser.leftovers
          raise ::Optimist::CommandlineError, "1 and only 1 input file is allowed" if paths.size > 1

          path = paths.first || "-" # default to stdin
          @stats = ::FlatKit::Stats.new(input: path, input_fallback: opts[:input_format],
                                        output: opts[:output], output_fallback: opts[:output_format],
                                        fields_to_stat: fields, stats_to_collect: stats)
        rescue ::FlatKit::Error => e
          raise ::Optimist::CommandlineError, e.message
        end
      end

      # Run the stats collection prepared by #parse.
      def call
        @stats.call
      end
    end
  end
end
|
data/lib/flat_kit/command.rb
CHANGED
@@ -1,13 +1,12 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
4
|
+
# Internal: The base class for all commands in the CLI
|
5
|
+
#
|
2
6
|
class Command
|
3
7
|
extend DescendantTracker
|
4
8
|
|
5
|
-
attr_reader :argv
|
6
|
-
attr_reader :env
|
7
|
-
attr_reader :logger
|
8
|
-
attr_reader :opts
|
9
|
-
attr_reader :readers
|
10
|
-
attr_reader :writer
|
9
|
+
attr_reader :argv, :env, :logger, :opts, :readers, :writer
|
11
10
|
|
12
11
|
def self.name
|
13
12
|
raise NotImplementedError, "#{self.class} must implement #{self.class}.name"
|
@@ -22,7 +21,7 @@ module FlatKit
|
|
22
21
|
end
|
23
22
|
|
24
23
|
def self.names
|
25
|
-
children.map
|
24
|
+
children.map(&:name)
|
26
25
|
end
|
27
26
|
|
28
27
|
def self.for(name)
|
@@ -48,6 +47,7 @@ module FlatKit
|
|
48
47
|
end
|
49
48
|
end
|
50
49
|
|
51
|
-
require
|
52
|
-
require
|
53
|
-
require
|
50
|
+
require "flat_kit/command/cat"
|
51
|
+
require "flat_kit/command/merge"
|
52
|
+
require "flat_kit/command/sort"
|
53
|
+
require "flat_kit/command/stats"
|
@@ -1,17 +1,20 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "set"
|
2
4
|
|
3
5
|
module FlatKit
|
6
|
+
# Internal: A module to track descendants of a class
|
7
|
+
#
|
4
8
|
module DescendantTracker
|
5
9
|
def inherited(klass)
|
6
10
|
super
|
7
11
|
return unless klass.instance_of?(Class)
|
8
|
-
|
12
|
+
|
13
|
+
children << klass
|
9
14
|
end
|
10
15
|
|
11
16
|
def children
|
12
|
-
unless defined? @_children
|
13
|
-
@_children = Set.new
|
14
|
-
end
|
17
|
+
@_children = Set.new unless defined? @_children
|
15
18
|
@_children
|
16
19
|
end
|
17
20
|
|
@@ -23,5 +26,14 @@ module FlatKit
|
|
23
26
|
child_klass.send(method, *args)
|
24
27
|
end
|
25
28
|
end
|
29
|
+
|
30
|
+
# Select the tracked children that answer with a truthy value when sent
# `method` with `args`.
#
# Returns the matching subset of `children`.
def find_children(method, *args)
  children.select { |klass| klass.send(method, *args) }
end
|
26
38
|
end
|
27
39
|
end
|
data/lib/flat_kit/error.rb
CHANGED
@@ -1,9 +1,12 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module FlatKit
|
2
|
-
# A simplified Observable class for use internally
|
4
|
+
# Internal: A simplified Observable class for use internally
|
3
5
|
#
|
4
6
|
module EventEmitter
|
5
7
|
# Register a listener to be notified of events.
#
# listener - any object that responds to #on_event
#
# A listener is stored only once, adding it again is a no-op.
#
# Raises ::NoMethodError if listener does not respond to #on_event.
def add_listener(listener)
  raise ::NoMethodError, "#{listener} does not respond to #on_event" unless listener.respond_to?(:on_event)

  # _listeners lazily creates its backing array, so no separate
  # initialization step is needed before appending.
  _listeners << listener unless _listeners.include?(listener)
end
|
@@ -20,14 +23,14 @@ module FlatKit
|
|
20
23
|
_listeners.clear
|
21
24
|
end
|
22
25
|
|
23
|
-
def notify_listeners(name:, data:)
|
26
|
+
def notify_listeners(name:, data:, meta: nil)
|
24
27
|
_listeners.each do |l|
|
25
|
-
l.on_event(name: name, data: data)
|
28
|
+
l.on_event(name: name, data: data, meta: meta)
|
26
29
|
end
|
27
30
|
end
|
28
31
|
|
29
32
|
def _listeners
|
30
|
-
@_listeners ||=
|
33
|
+
@_listeners ||= []
|
31
34
|
end
|
32
35
|
end
|
33
36
|
end
|
@@ -0,0 +1,246 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module FlatKit
  # Internal: Collect stats on a single field.
  #
  # We may not know what the field data type is to start with, so collect a
  # bunch of values until we have the threshold, and then calculate stats based
  # upon the data type determined by the guess.
  #
  # When an explicit FieldType is passed to the constructor there is nothing to
  # guess, so the stat collectors are created right away and every value is
  # fed to them as it arrives.
  class FieldStats
    DEFAULT_GUESS_THRESHOLD = 1000

    CORE_STATS        = :core
    CARDINALITY_STATS = :cardinality

    ALL_STATS = [CORE_STATS, CARDINALITY_STATS].freeze

    # The names of the public methods exported, in this order, by #to_hash.
    EXPORT_FIELDS = %w[
      name
      type
      count
      max
      mean
      min
      stddev
      sum
      mode
      unique_count

      max_length
      mean_length
      min_length
      stddev_length
      mode_length
      unique_count_lengths

      null_count
      unknown_count
      out_of_type_count
      total_count
      null_percent
      unknown_percent
    ].freeze

    attr_reader :type_counts, :field_type, :name, :out_of_type_count

    # Create the stats collector of one field.
    #
    # name:             - the name of the field
    # stats_to_collect: - one of, or an Array of, the ALL_STATS values
    # type:             - a direct subclass of ::FlatKit::FieldType; the
    #                     default GuessType defers the decision until
    #                     guess_threshold values have been seen
    # guess_threshold:  - how many values to buffer before guessing the type
    #
    # Raises ArgumentError on an unknown stats collection set or when type is
    # not a direct FieldType subclass.
    def initialize(name:, stats_to_collect: CORE_STATS,
                   type: ::FlatKit::FieldType::GuessType,
                   guess_threshold: DEFAULT_GUESS_THRESHOLD)
      @name = name
      @field_type = type
      @guess_threshold = guess_threshold
      @type_counts = Hash.new(0)
      @out_of_type_count = 0
      @values = []
      @stats = nil
      @length_stats = nil
      @stats_to_collect = [stats_to_collect].flatten

      validate_stats_to_collect!
      validate_field_type!(type)

      # resolve_guess is a no-op once the type is determined, so with an
      # explicit type nothing else would ever create the collectors and the
      # first #update would fail on a nil @stats.
      build_stat_collectors if field_type_determined?
    end

    # Returns true once the field type is something other than GuessType.
    def field_type_determined?
      @field_type != ::FlatKit::FieldType::GuessType
    end

    # Record one more value of the field.
    #
    # The per-type tally is always updated. The value then goes straight into
    # the stats when the type is known, otherwise it is buffered until
    # guess_threshold values are available to guess the type from.
    def update(value)
      update_type_count(value)

      if field_type_determined?
        update_stats(value)
      else
        @values << value

        resolve_guess if @values.size >= @guess_threshold
      end
    end

    # Returns true when the cardinality stats were requested.
    def collecting_frequencies?
      @stats_to_collect.include?(CARDINALITY_STATS)
    end

    # Returns the type_name of the current field type.
    def type
      @field_type.type_name
    end

    def count
      stats.count
    end

    # max, mean, min, stddev and sum are only available on some collectors;
    # they return nil when the collector in use does not provide them.
    def max
      stats.max if stats.respond_to?(:max)
    end

    def mean
      stats.mean if stats.respond_to?(:mean)
    end

    def min
      stats.min if stats.respond_to?(:min)
    end

    def stddev
      stats.stddev if stats.respond_to?(:stddev)
    end

    def sum
      stats.sum if stats.respond_to?(:sum)
    end

    # mode, unique_count, unique_values and frequencies return nil unless the
    # cardinality stats are being collected.
    def mode
      stats.mode if collecting_frequencies?
    end

    def unique_count
      stats.unique_count if collecting_frequencies?
    end

    def unique_values
      stats.unique_values if collecting_frequencies?
    end

    def frequencies
      stats.frequencies if collecting_frequencies?
    end

    # The *_length readers report on the length of the values. Length stats are
    # only created for StringType fields, every reader returns nil otherwise.
    def min_length
      length_stats.min if @length_stats
    end

    def max_length
      length_stats.max if @length_stats
    end

    def mean_length
      length_stats.mean if @length_stats
    end

    def stddev_length
      length_stats.stddev if @length_stats
    end

    def mode_length
      length_stats.mode if @length_stats && collecting_frequencies?
    end

    def unique_count_lengths
      length_stats.unique_count if @length_stats && collecting_frequencies?
    end

    def unique_values_lengths
      length_stats.unique_values if @length_stats && collecting_frequencies?
    end

    def length_frequencies
      length_stats.frequencies if @length_stats && collecting_frequencies?
    end

    # Returns how many values were best-guessed as NullType.
    def null_count
      type_counts[FieldType::NullType]
    end

    # Returns the number of values in the stats plus those that failed
    # coercion to the field type.
    def total_count
      stats.count + @out_of_type_count
    end

    # Returns null_count as a percentage of total_count, truncated to 2
    # decimals, or 0 when there are no values.
    def null_percent
      return 0 if total_count.zero?

      ((null_count.to_f / total_count) * 100.0).truncate(2)
    end

    # Returns how many values were best-guessed as UnknownType.
    def unknown_count
      type_counts[FieldType::UnknownType]
    end

    # Returns unknown_count as a percentage of total_count, truncated to 2
    # decimals, or 0 when there are no values.
    def unknown_percent
      return 0 if total_count.zero?

      ((unknown_count.to_f / total_count) * 100.0).truncate(2)
    end

    # Returns a Hash of every EXPORT_FIELDS name to its current value. The type
    # guess is resolved first so buffered values are included.
    def to_hash
      resolve_guess

      EXPORT_FIELDS.each_with_object({}) do |field, exported|
        exported[field] = public_send(field)
      end
    end

    private

    # Raise ArgumentError unless every requested collection set is known.
    def validate_stats_to_collect!
      @stats_to_collect.each do |collection_set|
        next if ALL_STATS.include?(collection_set)

        valid_sets = ALL_STATS.map(&:to_s).join(", ")

        raise ArgumentError, "#{collection_set} is not a valid stats collection set, must be one of #{valid_sets}"
      end
    end

    # Raise ArgumentError unless type is a direct subclass of FieldType.
    def validate_field_type!(type)
      return if type.is_a?(Class) && (type.superclass == ::FlatKit::FieldType)

      raise ArgumentError, "type: must be FieldType subclasses - not #{type}"
    end

    # The value stats, resolving the type guess first when still pending.
    def stats
      resolve_guess
      @stats
    end

    # The length stats, resolving the type guess first when still pending.
    def length_stats
      resolve_guess
      @length_stats
    end

    # Create the collectors matching @field_type: the value stats always, the
    # length stats only for StringType fields.
    def build_stat_collectors
      @stats = StatType.for(@field_type).new(collecting_frequencies: collecting_frequencies?)
      return unless @field_type == ::FlatKit::FieldType::StringType

      @length_stats = ::FlatKit::StatType::NumericalStats.new(collecting_frequencies: collecting_frequencies?)
    end

    # Coerce value to the field type and feed it to the collectors. A value
    # that cannot be coerced is only counted in out_of_type_count.
    def update_stats(value)
      coerced_value = @field_type.coerce(value)
      if coerced_value == FieldType::CoerceFailure
        @out_of_type_count += 1
        return
      end

      @stats.update(coerced_value)
      @length_stats.update(coerced_value.to_s.length) if @length_stats
    end

    # Tally the best guess type of value and return that guess.
    def update_type_count(value)
      guess = FieldType.best_guess(value)
      type_counts[guess] += 1
      guess
    end

    # Settle on the most frequently guessed type, create the collectors for it
    # and replay the buffered values. Does nothing once the type is determined.
    def resolve_guess
      return if field_type_determined?

      # NOTE(review): with no values seen type_counts is empty and the best
      # guess is nil - confirm how StatType.for is expected to handle that.
      best_guess_type, _best_guess_count = type_counts.max_by { |_k, v| v }
      @field_type = best_guess_type
      build_stat_collectors

      @values.each do |v|
        update_stats(v)
      end
      @values.clear
    end
  end
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module FlatKit
  class FieldType
    # Internal: Implementation of the boolean type and coercion to the type
    #
    # true/false, the Integers 0 and 1 and a small set of case-insensitive
    # words (true/t/1/yes/y/on and false/f/0/no/n/off) are treated as booleans.
    class BooleanType < FieldType
      TRUTHY_REGEX = /\A(true|t|1|yes|y|on)\Z/i
      FALSEY_REGEX = /\A(false|f|0|no|n|off)\Z/i
      REGEX        = Regexp.union(TRUTHY_REGEX, FALSEY_REGEX)

      # Returns the String name of this type.
      def self.type_name
        "boolean"
      end

      # Does data look like a boolean?
      #
      # Returns true for true/false, for a String matching REGEX and for the
      # Integers 0 and 1; false for everything else.
      def self.matches?(data)
        case data
        when TrueClass, FalseClass
          true
        when String
          REGEX.match?(data)
        when Integer
          data.zero? || data == 1
        else
          false
        end
      end

      # Convert data to true or false.
      #
      # Returns true or false, or CoerceFailure when data has no boolean
      # meaning. That includes every unsupported class (nil, Array, ...), so
      # callers comparing the result with CoerceFailure never mistake a nil
      # for a successfully coerced value.
      def self.coerce(data)
        case data
        when TrueClass
          true
        when FalseClass
          false
        when Numeric
          return false if data.zero?
          return true if data == 1

          CoerceFailure
        when String
          return true if TRUTHY_REGEX.match?(data)
          return false if FALSEY_REGEX.match?(data)

          CoerceFailure
        else
          CoerceFailure
        end
      end
    end
  end
end
|
@@ -0,0 +1,181 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module FlatKit
  class FieldType
    # Internal: Representing the type of data which only includes data up to
    # the day resolution
    #
    class DateType < FieldType
      # Directive reference for the formats below:
      #
      # %Y  4 digit year
      # %y  2 digit year (%Y mod 100) (00..99)
      # %m  month of year zero padded
      # %-m month of year no-padding
      # %B  Full month name
      # %b  Abbreviated month name
      # %^b uppercased month name
      # %d  day of month zero padded
      # %-d day of month not padded
      # %e  day of month blank padded
      # %j  day of year zero padded

      # The strptime formats tried, in order, by try_parse.
      #
      # parse formats are not the same as print formats as parsing does not deal
      # with flags and widths
      def self.parse_formats
        @parse_formats ||= begin
          year_first = ["%Y-%m-%d", "%Y%m%d", "%Y/%m/%d", "%Y %m %d."]

          day_first = [
            "%d %B %Y", "%d %b %Y", "%d-%b-%Y", "%d/%b/%Y", "%d-%m-%Y",
            "%d-%m-%y", "%d %b, %Y", "%d %b,%Y", "%d %B, %Y", "%d %B,%Y",
          ]

          month_first = [
            "%m/%d/%Y", "%m-%d-%Y", "%m/%d/%y", "%m-%d-%y",
            "%B %d, %Y", "%b %d, %Y",
          ]

          others = ["%Y-%j", "%a %b %d %Y"]

          [*year_first, *day_first, *month_first, *others].freeze
        end
      end

      # https://en.wikipedia.org/wiki/Date_format_by_country
      # List of formats culled from the above - not using all as it is
      # definitely a performance issue at the moment
      # def self.known_formats
      #   @known_formats ||= [
      #     # YMD formats
      #     "%Y-%m-%d",
      #     "%Y%m%d",
      #     "%Y/%m/%d",
      #     "%Y.%m.%d",
      #     "%Y.%m.%d.",
      #     "%Y %m %d.",
      #     "%Y %b %d",
      #     "%Y %b %-d",
      #     "%Y %B %-d",
      #     "%Y %B %d",
      #     "%Y-%m%d",
      #     "%Y. %m. %-d.",
      #     "%Y. %m. %d.",
      #     "%Y.%-m.%-d.",
      #     "%Y.%-m.%-d",
      #     "%Y, %d %B",
      #     "%Y, %d %b",
      #
      #     "%y.%-m.%-d",
      #     "%y.%-m.%-d.",
      #     "%y.%m.%d.",
      #     "%y.%m.%d",
      #     "%y/%m/%d",
      #
      #     # DMY formats
      #     "%-d %b %Y",
      #     "%-d %B %Y",
      #     "%-d-%-m-%Y",
      #     "%-d. %-m. %Y",
      #     "%-d. %-m. %Y.",
      #     "%-d. %B %Y",
      #     "%-d. %B %Y.",
      #     "%-d.%-m.%Y",
      #     "%-d.%-m.%Y.",
      #     "%-d.%m.%Y.",
      #     "%-d.%m.%Y",
      #     "%-d.%b.%Y",
      #     "%-d.%B.%Y",
      #     "%-d/%-m %Y",
      #     "%-d/%-m/%Y",
      #     "%d %B %Y",
      #     "%d %b %Y",
      #     "%d-%m-%Y",
      #     "%d-%b-%Y",
      #     "%d-%B-%Y",
      #     "%d.%m.%Y",
      #     "%d/%m %Y",
      #     "%d/%m/%Y",
      #
      #     "%-d.%b.%y",
      #     "%-d.%B.%y",
      #     "%-d.%-m.%y",
      #     "%-d/%-m-%y",
      #     "%-d/%-m/%y",
      #     "%d/%m/%y",
      #     "%d-%m-%y",
      #     "%d.%m.%y",
      #     "%d%m%y",
      #
      #     # MDY formats
      #     "%-m/%-d/%Y",
      #     "%m/%d/%Y",
      #     "%m-%d-%Y",
      #     "%b-%d-%Y",
      #     "%B %-d, %Y",
      #     "%B %-d. %Y",
      #     "%B %d, %Y",
      #     "%B-%d-%Y",
      #     "%B/%d/%Y",
      #
      #     "%-m/%-d/%y",
      #
      #     # other formats
      #     "%Y-%j",
      #     "%Y%m",
      #     "%Y-%m",
      #     "%Y %m",
      #   ]
      # end

      # Returns the String name of this type.
      def self.type_name
        "date"
      end

      # Returns true when data can be coerced to a Date.
      def self.matches?(data)
        coerce(data).is_a?(Date)
      end

      # Convert data to a Date.
      #
      # A Date is returned as is and a String is run through try_parse.
      # Anything else is a CoerceFailure - including a DateTime, which is
      # checked before Date because DateTime is a subclass of Date.
      def self.coerce(data)
        return CoerceFailure if data.is_a?(DateTime)
        return data if data.is_a?(Date)
        return try_parse(data) if data.is_a?(String)

        CoerceFailure
      end

      # Try each of the parse_formats in turn against data.
      #
      # Returns the Date of the first format that parses, or CoerceFailure when
      # none of them does.
      def self.try_parse(data)
        parse_formats.each do |candidate|
          begin
            return Date.strptime(data, candidate)
          rescue StandardError
            # this format does not fit, move on to the next one
            next
          end
        end

        CoerceFailure
      end
    end
  end
end
|
178
|
+
|
179
|
+
|
180
|
+
__END__
|
181
|
+
|