flat_kit 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/HISTORY.md +6 -0
- data/Manifest.txt +34 -0
- data/Rakefile +1 -0
- data/examples/stream-active-record-to-csv.rb +42 -0
- data/lib/flat_kit.rb +9 -2
- data/lib/flat_kit/cli.rb +11 -0
- data/lib/flat_kit/command.rb +1 -0
- data/lib/flat_kit/command/stats.rb +94 -0
- data/lib/flat_kit/descendant_tracker.rb +9 -0
- data/lib/flat_kit/event_emitter.rb +2 -2
- data/lib/flat_kit/field_stats.rb +241 -0
- data/lib/flat_kit/field_type.rb +75 -0
- data/lib/flat_kit/field_type/boolean_type.rb +48 -0
- data/lib/flat_kit/field_type/date_type.rb +179 -0
- data/lib/flat_kit/field_type/float_type.rb +37 -0
- data/lib/flat_kit/field_type/guess_type.rb +20 -0
- data/lib/flat_kit/field_type/integer_type.rb +34 -0
- data/lib/flat_kit/field_type/null_type.rb +35 -0
- data/lib/flat_kit/field_type/string_type.rb +22 -0
- data/lib/flat_kit/field_type/timestamp_type.rb +47 -0
- data/lib/flat_kit/field_type/unknown_type.rb +26 -0
- data/lib/flat_kit/jsonl/record.rb +2 -2
- data/lib/flat_kit/jsonl/writer.rb +18 -9
- data/lib/flat_kit/merge.rb +4 -2
- data/lib/flat_kit/output.rb +4 -0
- data/lib/flat_kit/position.rb +19 -0
- data/lib/flat_kit/stat_type.rb +65 -0
- data/lib/flat_kit/stat_type/nominal_stats.rb +58 -0
- data/lib/flat_kit/stat_type/numerical_stats.rb +120 -0
- data/lib/flat_kit/stat_type/ordinal_stats.rb +42 -0
- data/lib/flat_kit/stats.rb +66 -0
- data/lib/flat_kit/writer.rb +17 -2
- data/lib/flat_kit/xsv/writer.rb +24 -9
- data/test/field_type/test_boolean_type.rb +65 -0
- data/test/field_type/test_date_type.rb +71 -0
- data/test/field_type/test_float_type.rb +56 -0
- data/test/field_type/test_guess_type.rb +14 -0
- data/test/field_type/test_integer_type.rb +52 -0
- data/test/field_type/test_null_type.rb +41 -0
- data/test/field_type/test_string_type.rb +18 -0
- data/test/field_type/test_timestamp_type.rb +108 -0
- data/test/field_type/test_unknown_type.rb +35 -0
- data/test/jsonl/test_writer.rb +21 -3
- data/test/run +23 -0
- data/test/stat_type/test_nominal_stats.rb +69 -0
- data/test/stat_type/test_numerical_stats.rb +118 -0
- data/test/stat_type/test_ordinal_stats.rb +92 -0
- data/test/test_event_emitter.rb +19 -2
- data/test/test_field_stats.rb +134 -0
- data/test/test_field_type.rb +34 -0
- data/test/xsv/test_writer.rb +25 -4
- metadata +65 -2
@@ -0,0 +1,75 @@
|
|
1
|
+
module FlatKit
  # Abstract base class for the field types used to classify and coerce raw
  # values. Concrete types subclass FieldType and implement the class methods
  # type_name, matches? and coerce.
  class FieldType

    # NOTE(review): find_children, used below, is presumably provided by
    # DescendantTracker - confirm against that module.
    extend FlatKit::DescendantTracker

    # Sentinel returned by coerce implementations when the data cannot be
    # converted to the type.
    CoerceFailure = Class.new(::Object).freeze

    # All the child types whose matches? accepts the given data.
    def self.candidate_types(data)
      find_children(:matches?, data)
    end

    # The candidate type with the highest weight for the given data.
    def self.best_guess(data)
      candidate_types(data).sort_by { |t| t.weight }.last
    end

    # The short name of the type - must be implemented by every child class.
    #
    # Raises NotImplementedError when the child class does not override it.
    def self.type_name
      # Use the class name here: interpolating type_name itself would recurse
      # forever and end in a SystemStackError instead of this error.
      raise NotImplementedError, "must implement #{self.name}.type_name"
    end

    # Does the data look like this type - must be implemented by every child
    # class.
    def self.matches?(data)
      raise NotImplementedError, "must implement #{self.name}.matches?(data)"
    end

    # Convert the data to this type - must be implemented by every child
    # class.
    def self.coerce(data)
      raise NotImplementedError, "must implement #{self.name}.coerce(data)"
    end

    # Each type has a weight so if a value matches multiple types, then the list
    # can be compared to see where the tie breakers are
    #
    # All the weights are kept together in this one method.
    #
    # Raises NotImplementedError for a type that has no weight assigned.
    def self.weight
      # Boolean has crossover with Integer so going to let it overrule Integer
      return 5 if self == BooleanType

      # Integer could potentially overlap with Float, but it is more restrictive
      # so let it override Float
      return 4 if self == IntegerType
      return 3 if self == FloatType

      # Date and Timestamps string representation shouldn't intersect with anything so
      # leaving it at the same level as Null and Unknown
      return 2 if self == DateType
      return 2 if self == TimestampType

      # Null and Unknown shouldn't conflict since their string representations
      # do not intersect
      return 2 if self == NullType
      return 2 if self == UnknownType

      # StringType is the fallback for anything that has a string
      # representation, so it should lose out on integers, floats, nulls,
      # unknowns as strings
      return 1 if self == StringType

      # at the bottom - since it should never match anywhere
      return 0 if self == GuessType

      raise NotImplementedError, "No weight assigned to type #{self} - fix immediately"
    end
  end
end
|
66
|
+
|
67
|
+
require 'flat_kit/field_type/guess_type'
|
68
|
+
require 'flat_kit/field_type/boolean_type'
|
69
|
+
require 'flat_kit/field_type/date_type'
|
70
|
+
require 'flat_kit/field_type/timestamp_type'
|
71
|
+
require 'flat_kit/field_type/integer_type'
|
72
|
+
require 'flat_kit/field_type/float_type'
|
73
|
+
require 'flat_kit/field_type/null_type'
|
74
|
+
require 'flat_kit/field_type/string_type'
|
75
|
+
require 'flat_kit/field_type/unknown_type'
|
@@ -0,0 +1,48 @@
|
|
1
|
+
module FlatKit
  class FieldType
    # Field type for boolean values: true/false themselves, the integers 0 and
    # 1, and the common textual spellings listed in the regexes below.
    class BooleanType < FieldType

      TRUTHY_REGEX = /\A(true|t|1|yes|y|on)\Z/i
      FALSEY_REGEX = /\A(false|f|0|no|n|off)\Z/i
      REGEX        = Regexp.union(TRUTHY_REGEX, FALSEY_REGEX)

      def self.type_name
        "boolean"
      end

      # True when data is a boolean, a String matching one of the truthy or
      # falsey spellings, or the Integer 0 or 1.
      def self.matches?(data)
        case data
        when TrueClass, FalseClass
          true
        when String
          REGEX.match?(data)
        when Integer
          data.zero? || data == 1
        else
          false
        end
      end

      # Convert data to true or false.
      #
      # Returns CoerceFailure when data has no boolean interpretation. This
      # includes values of unsupported classes (nil, Array, ...), which
      # previously fell out of the case statement and returned nil.
      def self.coerce(data)
        case data
        when TrueClass
          true
        when FalseClass
          false
        when Numeric
          return false if data.zero?
          return true if data == 1
          CoerceFailure
        when String
          return true if TRUTHY_REGEX.match?(data)
          return false if FALSEY_REGEX.match?(data)
          CoerceFailure
        else
          CoerceFailure
        end
      end
    end
  end
end
|
@@ -0,0 +1,179 @@
|
|
1
|
+
module FlatKit
  class FieldType
    # Representing the type of data which only includes data up to the day
    # resolution
    #
    # NOTE(review): this relies on Date and DateTime already being loaded
    # (require 'date') - confirm that happens before this file is required.
    class DateType < FieldType

      # %Y   4 digit year
      # %y   2 digit year (%Y mod 100) (00..99)
      # %m   month of year zero padded
      # %-m  month of year no-padding
      # %B   Full month name
      # %b   Abbreviated month name
      # %^b  uppercased month name
      # %d   day of month zero padded
      # %-d  day of month not padded
      # %e   day of month blank padded
      # %j   day of year zero padded

      # parse formats are not the same as print formats as parsing does not deal
      # with flags and widths
      #
      # The formats are tried in the order listed here, and the first one that
      # parses wins.
      def self.parse_formats
        @parse_formats ||= [
          # YMD formats
          "%Y-%m-%d",
          "%Y%m%d",
          "%Y/%m/%d",
          "%Y %m %d.",

          # DMY formats
          "%d %B %Y",
          "%d %b %Y",
          "%d-%b-%Y",
          "%d/%b/%Y",
          "%d-%m-%Y",
          "%d-%m-%y",
          "%d %b, %Y",
          "%d %b,%Y",
          "%d %B, %Y",
          "%d %B,%Y",

          # MDY formats
          "%m/%d/%Y",
          "%m-%d-%Y",
          "%m/%d/%y",
          "%m-%d-%y",

          "%B %d, %Y",
          "%b %d, %Y",

          # other formats
          "%Y-%j",
          "%a %b %d %Y"
        ]
      end

      # https://en.wikipedia.org/wiki/Date_format_by_country
      # List of formats culled from the above - not using all as it is
      # definitely a performance issue at the moment
      # def self.known_formats
      #   @known_formats ||= [
      #     # YMD formats
      #     "%Y-%m-%d",
      #     "%Y%m%d",
      #     "%Y/%m/%d",
      #     "%Y.%m.%d",
      #     "%Y.%m.%d.",
      #     "%Y %m %d.",
      #     "%Y %b %d",
      #     "%Y %b %-d",
      #     "%Y %B %-d",
      #     "%Y %B %d",
      #     "%Y-%m%d",
      #     "%Y. %m. %-d.",
      #     "%Y. %m. %d.",
      #     "%Y.%-m.%-d.",
      #     "%Y.%-m.%-d",
      #     "%Y, %d %B",
      #     "%Y, %d %b",
      #
      #     "%y.%-m.%-d",
      #     "%y.%-m.%-d.",
      #     "%y.%m.%d.",
      #     "%y.%m.%d",
      #     "%y/%m/%d",
      #
      #     # DMY formats
      #     "%-d %b %Y",
      #     "%-d %B %Y",
      #     "%-d-%-m-%Y",
      #     "%-d. %-m. %Y",
      #     "%-d. %-m. %Y.",
      #     "%-d. %B %Y",
      #     "%-d. %B %Y.",
      #     "%-d.%-m.%Y",
      #     "%-d.%-m.%Y.",
      #     "%-d.%m.%Y.",
      #     "%-d.%m.%Y",
      #     "%-d.%b.%Y",
      #     "%-d.%B.%Y",
      #     "%-d/%-m %Y",
      #     "%-d/%-m/%Y",
      #     "%d %B %Y",
      #     "%d %b %Y",
      #     "%d-%m-%Y",
      #     "%d-%b-%Y",
      #     "%d-%B-%Y",
      #     "%d.%m.%Y",
      #     "%d/%m %Y",
      #     "%d/%m/%Y",
      #
      #     "%-d.%b.%y",
      #     "%-d.%B.%y",
      #     "%-d.%-m.%y",
      #     "%-d/%-m-%y",
      #     "%-d/%-m/%y",
      #     "%d/%m/%y",
      #     "%d-%m-%y",
      #     "%d.%m.%y",
      #     "%d%m%y",
      #
      #     # MDY formats
      #     "%-m/%-d/%Y",
      #     "%m/%d/%Y",
      #     "%m-%d-%Y",
      #     "%b-%d-%Y",
      #     "%B %-d, %Y",
      #     "%B %-d. %Y",
      #     "%B %d, %Y",
      #     "%B-%d-%Y",
      #     "%B/%d/%Y",
      #
      #     "%-m/%-d/%y",
      #
      #     # other formats
      #     "%Y-%j",
      #     "%Y%m",
      #     "%Y-%m",
      #     "%Y %m",
      #   ]
      # end

      def self.type_name
        "date"
      end

      # True when coerce turns the data into a Date.
      def self.matches?(data)
        coerce(data).kind_of?(Date)
      end

      # Convert data to a Date.
      #
      # A Date is returned unchanged and a String is parsed with the first
      # entry of parse_formats that accepts it. Everything else, including
      # DateTime, yields CoerceFailure.
      def self.coerce(data)
        case data
        when DateTime
          # DateTime is a subclass of Date, so it has to be rejected before
          # the Date branch is reached.
          CoerceFailure
        when Date
          data
        when String
          parse_formats.each do |format|
            begin
              return Date.strptime(data, format)
            rescue ArgumentError
              # Date::Error is an ArgumentError: this format did not fit, so
              # move on to the next one. Any other error is a real problem
              # and is allowed to propagate.
              next
            end
          end
          CoerceFailure
        else
          CoerceFailure
        end
      end
    end
  end
end
|
176
|
+
|
177
|
+
|
178
|
+
__END__
|
179
|
+
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module FlatKit
  class FieldType
    # Field type for floating point values: Float instances and Strings that
    # Kernel#Float can parse but that IntegerType does not already claim.
    class FloatType < FieldType
      class << self
        def type_name
          "float"
        end

        # True for a Float, or for a String that parses as a float and is not
        # an integer string. Everything else, Integer included, is not a
        # match.
        def matches?(data)
          return true if data.is_a?(Float)
          return false unless data.is_a?(String)
          return false if IntegerType.matches?(data)

          begin
            Float(data)
            true
          rescue ArgumentError
            false
          end
        end

        # Convert data with Kernel#Float, returning CoerceFailure when the
        # conversion raises TypeError or ArgumentError.
        def coerce(data)
          Float(data)
        rescue TypeError, ArgumentError
          CoerceFailure
        end
      end
    end
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module FlatKit
  class FieldType
    # Sentinel type standing in for "type not yet known, it has to be
    # guessed". It never matches any data and never coerces anything.
    class GuessType < FieldType
      class << self
        # The type name is the full class name.
        def type_name
          name
        end

        # No data ever matches the sentinel.
        def matches?(data)
          false
        end

        # Coercion always fails for the sentinel.
        def coerce(data)
          CoerceFailure
        end
      end
    end
  end
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
module FlatKit
  class FieldType
    # Field type for integer values: Integer instances and Strings made up of
    # an optional sign followed by decimal digits.
    class IntegerType < FieldType

      REGEX = /\A[-+]?\d+\Z/

      def self.type_name
        "integer"
      end

      # True for an Integer, or for a String matching REGEX. Everything else,
      # Float included, is not a match.
      def self.matches?(data)
        case data
        when Integer
          true
        when String
          REGEX.match?(data)
        else
          false
        end
      end

      # Convert data to an Integer, returning CoerceFailure when that is not
      # possible.
      #
      # Strings are always parsed as base 10. Without the explicit base,
      # Kernel#Integer honours radix prefixes, so "010" would become 8 and
      # "09" would fail even though matches? accepts both.
      #
      # Non-String data goes through Kernel#Integer unchanged, so a finite
      # Float is truncated. NaN and Infinity raise FloatDomainError there,
      # which is reported as CoerceFailure as well.
      def self.coerce(data)
        # The base argument is only valid for String input.
        data.is_a?(String) ? Integer(data, 10) : Integer(data)
      rescue TypeError, ArgumentError, FloatDomainError
        CoerceFailure
      end
    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
module FlatKit
  class FieldType
    # Field type for missing values: nil itself and the textual spellings
    # "null" / "nil" (any case) and the literal \N.
    class NullType < FieldType

      REGEX = Regexp.union(/\A(null|nil)\Z/i, /\A\\N\Z/)

      class << self
        def type_name
          "null"
        end

        # True for nil and for a String matching REGEX.
        def matches?(data)
          case data
          when NilClass then true
          when String   then REGEX.match?(data)
          else               false
          end
        end

        # nil for anything that matches?, CoerceFailure for everything else.
        def coerce(data)
          matches?(data) ? nil : CoerceFailure
        end
      end
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module FlatKit
  class FieldType
    # The fallback type: it accepts any String, so it is weighted lower than
    # the other types whose values may also arrive as strings.
    class StringType < FieldType
      class << self
        def type_name
          "string"
        end

        # True for any String instance.
        def matches?(data)
          data.is_a?(String)
        end

        # The to_s representation of data, or CoerceFailure when calling to_s
        # raises a StandardError.
        def coerce(data)
          data.to_s
        rescue StandardError
          CoerceFailure
        end
      end
    end
  end
end
|