flat_kit 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/HISTORY.md +6 -0
- data/Manifest.txt +34 -0
- data/Rakefile +1 -0
- data/examples/stream-active-record-to-csv.rb +42 -0
- data/lib/flat_kit.rb +9 -2
- data/lib/flat_kit/cli.rb +11 -0
- data/lib/flat_kit/command.rb +1 -0
- data/lib/flat_kit/command/stats.rb +94 -0
- data/lib/flat_kit/descendant_tracker.rb +9 -0
- data/lib/flat_kit/event_emitter.rb +2 -2
- data/lib/flat_kit/field_stats.rb +241 -0
- data/lib/flat_kit/field_type.rb +75 -0
- data/lib/flat_kit/field_type/boolean_type.rb +48 -0
- data/lib/flat_kit/field_type/date_type.rb +179 -0
- data/lib/flat_kit/field_type/float_type.rb +37 -0
- data/lib/flat_kit/field_type/guess_type.rb +20 -0
- data/lib/flat_kit/field_type/integer_type.rb +34 -0
- data/lib/flat_kit/field_type/null_type.rb +35 -0
- data/lib/flat_kit/field_type/string_type.rb +22 -0
- data/lib/flat_kit/field_type/timestamp_type.rb +47 -0
- data/lib/flat_kit/field_type/unknown_type.rb +26 -0
- data/lib/flat_kit/jsonl/record.rb +2 -2
- data/lib/flat_kit/jsonl/writer.rb +18 -9
- data/lib/flat_kit/merge.rb +4 -2
- data/lib/flat_kit/output.rb +4 -0
- data/lib/flat_kit/position.rb +19 -0
- data/lib/flat_kit/stat_type.rb +65 -0
- data/lib/flat_kit/stat_type/nominal_stats.rb +58 -0
- data/lib/flat_kit/stat_type/numerical_stats.rb +120 -0
- data/lib/flat_kit/stat_type/ordinal_stats.rb +42 -0
- data/lib/flat_kit/stats.rb +66 -0
- data/lib/flat_kit/writer.rb +17 -2
- data/lib/flat_kit/xsv/writer.rb +24 -9
- data/test/field_type/test_boolean_type.rb +65 -0
- data/test/field_type/test_date_type.rb +71 -0
- data/test/field_type/test_float_type.rb +56 -0
- data/test/field_type/test_guess_type.rb +14 -0
- data/test/field_type/test_integer_type.rb +52 -0
- data/test/field_type/test_null_type.rb +41 -0
- data/test/field_type/test_string_type.rb +18 -0
- data/test/field_type/test_timestamp_type.rb +108 -0
- data/test/field_type/test_unknown_type.rb +35 -0
- data/test/jsonl/test_writer.rb +21 -3
- data/test/run +23 -0
- data/test/stat_type/test_nominal_stats.rb +69 -0
- data/test/stat_type/test_numerical_stats.rb +118 -0
- data/test/stat_type/test_ordinal_stats.rb +92 -0
- data/test/test_event_emitter.rb +19 -2
- data/test/test_field_stats.rb +134 -0
- data/test/test_field_type.rb +34 -0
- data/test/xsv/test_writer.rb +25 -4
- metadata +65 -2
@@ -0,0 +1,75 @@
|
|
1
|
+
module FlatKit
  # Abstract base class for the field types used to classify a single value.
  #
  # Every concrete type is a subclass that implements the class methods
  # +type_name+, +matches?+ and +coerce+. The subclasses are discovered through
  # FlatKit::DescendantTracker, which this class extends.
  class FieldType

    extend FlatKit::DescendantTracker

    # Sentinel returned by +coerce+ implementations when a value cannot be
    # converted to the type. It is a distinct frozen class so it can never be
    # confused with a legitimately coerced value such as nil or false.
    CoerceFailure = Class.new(::Object).freeze

    # Returns the child types whose +matches?+ accepts +data+.
    #
    # NOTE(review): find_children is not defined in this file; presumably it is
    # provided by FlatKit::DescendantTracker - confirm there.
    def self.candidate_types(data)
      find_children(:matches?, data)
    end

    # Returns the candidate type for +data+ with the highest weight, or nil
    # when there are no candidates.
    def self.best_guess(data)
      candidate_types(data).sort_by(&:weight).last
    end

    # Name of the type; must be overridden by every subclass.
    #
    # Raises NotImplementedError when called on a class that does not override
    # it. The message is built from +name+ and not +type_name+, since calling
    # +type_name+ here would recurse forever.
    def self.type_name
      raise NotImplementedError, "must implement #{self.name}.type_name"
    end

    # Whether +data+ is a valid value of this type; must be overridden.
    def self.matches?(data)
      raise NotImplementedError, "must implement #{self.name}.matches?(data)"
    end

    # Converts +data+ to this type; must be overridden.
    def self.coerce(data)
      raise NotImplementedError, "must implement #{self.name}.coerce(data)"
    end

    # Each type has a weight so if a value matches multiple types, then the list
    # can be compared to see where the tie breakers are.
    #
    # All the weights live in this one method so the relative ordering of the
    # types can be read in a single place.
    #
    # Raises NotImplementedError for a type that has no weight assigned.
    def self.weight
      # Boolean has crossover with Integer so going to let it overrule Integer
      return 5 if self == BooleanType

      # Integer could potentially overlap with Float, but it is more restrictive
      # so let it override Float
      return 4 if self == IntegerType
      return 3 if self == FloatType

      # Date and Timestamp string representations shouldn't intersect with
      # anything so leaving them at the same level as Null and Unknown
      return 2 if self == DateType
      return 2 if self == TimestampType

      # Null and Unknown shouldn't conflict since their string representations
      # do not intersect
      return 2 if self == NullType
      return 2 if self == UnknownType

      # StringType is the fallback for anything that has a string
      # representation, so it should lose out on integers, floats, nulls,
      # unknowns as strings
      return 1 if self == StringType

      # at the bottom - since it should never match anywhere
      return 0 if self == GuessType

      raise NotImplementedError, "No weight assigned to type #{self} - fix immediately"
    end
  end
end
|
66
|
+
|
67
|
+
require 'flat_kit/field_type/guess_type'
|
68
|
+
require 'flat_kit/field_type/boolean_type'
|
69
|
+
require 'flat_kit/field_type/date_type'
|
70
|
+
require 'flat_kit/field_type/timestamp_type'
|
71
|
+
require 'flat_kit/field_type/integer_type'
|
72
|
+
require 'flat_kit/field_type/float_type'
|
73
|
+
require 'flat_kit/field_type/null_type'
|
74
|
+
require 'flat_kit/field_type/string_type'
|
75
|
+
require 'flat_kit/field_type/unknown_type'
|
@@ -0,0 +1,48 @@
|
|
1
|
+
module FlatKit
  class FieldType
    # Field type for boolean values: true/false themselves, the integers 0 and
    # 1, and a fixed set of case-insensitive truthy/falsey words.
    class BooleanType < FieldType

      TRUTHY_REGEX = /\A(true|t|1|yes|y|on)\Z/i
      FALSEY_REGEX = /\A(false|f|0|no|n|off)\Z/i
      REGEX        = Regexp.union(TRUTHY_REGEX, FALSEY_REGEX)

      def self.type_name
        "boolean"
      end

      # Returns true when +data+ is true, false, the Integer 0 or 1, or a
      # String matching one of the truthy/falsey words; false otherwise.
      def self.matches?(data)
        case data
        when TrueClass, FalseClass
          true
        when String
          REGEX.match?(data)
        when Integer
          data.zero? || data == 1
        else
          false
        end
      end

      # Converts +data+ to true or false.
      #
      # Returns CoerceFailure when the value has no boolean interpretation.
      # This includes values of any unsupported class (nil, Array, ...), which
      # previously fell out of the case statement and returned nil.
      def self.coerce(data)
        case data
        when TrueClass
          true
        when FalseClass
          false
        when Numeric
          # coerce is more lenient than matches? here: any Numeric equal to
          # zero or one is accepted, not only Integers.
          return false if data.zero?
          return true if data == 1
          CoerceFailure
        when String
          return true if TRUTHY_REGEX.match?(data)
          return false if FALSEY_REGEX.match?(data)
          CoerceFailure
        else
          CoerceFailure
        end
      end
    end
  end
end
|
@@ -0,0 +1,179 @@
|
|
1
|
+
module FlatKit
  class FieldType
    # Representing the type of data which only includes data up to the day
    # resolution
    class DateType < FieldType

      # %Y  4 digit year
      # %y  2 digit year (%Y mod 100) (00..99)
      # %m  month of year zero padded
      # %-m month of year no-padding
      # %B  Full month name
      # %b  Abbreviated month name
      # %^b uppercased month name
      # %d  day of month zero padded
      # %-d day of month not padded
      # %e  day of month blank padded
      # %j  day of year zero padded

      # parse formats are not the same as print formats as parsing does not deal
      # with flags and widths
      #
      # Returns the list of strptime formats that coerce tries, in order. The
      # list is built once and memoized; both the array and its strings are
      # frozen so a caller cannot alter the shared list.
      def self.parse_formats
        @parse_formats ||= [
          # YMD formats
          "%Y-%m-%d",
          "%Y%m%d",
          "%Y/%m/%d",
          "%Y %m %d.",

          # DMY formats
          "%d %B %Y",
          "%d %b %Y",
          "%d-%b-%Y",
          "%d/%b/%Y",
          "%d-%m-%Y",
          "%d-%m-%y",
          "%d %b, %Y",
          "%d %b,%Y",
          "%d %B, %Y",
          "%d %B,%Y",

          # MDY formats
          "%m/%d/%Y",
          "%m-%d-%Y",
          "%m/%d/%y",
          "%m-%d-%y",

          "%B %d, %Y",
          "%b %d, %Y",

          # other formats
          "%Y-%j",
          "%a %b %d %Y"
        ].each(&:freeze).freeze
      end

      # https://en.wikipedia.org/wiki/Date_format_by_country
      # List of formats culled from the above - not using all as it is
      # definitely a performance issue at the moment
      # def self.known_formats
      #   @known_formats ||= [
      #     # YMD formats
      #     "%Y-%m-%d",
      #     "%Y%m%d",
      #     "%Y/%m/%d",
      #     "%Y.%m.%d",
      #     "%Y.%m.%d.",
      #     "%Y %m %d.",
      #     "%Y %b %d",
      #     "%Y %b %-d",
      #     "%Y %B %-d",
      #     "%Y %B %d",
      #     "%Y-%m%d",
      #     "%Y. %m. %-d.",
      #     "%Y. %m. %d.",
      #     "%Y.%-m.%-d.",
      #     "%Y.%-m.%-d",
      #     "%Y, %d %B",
      #     "%Y, %d %b",
      #
      #     "%y.%-m.%-d",
      #     "%y.%-m.%-d.",
      #     "%y.%m.%d.",
      #     "%y.%m.%d",
      #     "%y/%m/%d",
      #
      #     # DMY formats
      #     "%-d %b %Y",
      #     "%-d %B %Y",
      #     "%-d-%-m-%Y",
      #     "%-d. %-m. %Y",
      #     "%-d. %-m. %Y.",
      #     "%-d. %B %Y",
      #     "%-d. %B %Y.",
      #     "%-d.%-m.%Y",
      #     "%-d.%-m.%Y.",
      #     "%-d.%m.%Y.",
      #     "%-d.%m.%Y",
      #     "%-d.%b.%Y",
      #     "%-d.%B.%Y",
      #     "%-d/%-m %Y",
      #     "%-d/%-m/%Y",
      #     "%d %B %Y",
      #     "%d %b %Y",
      #     "%d-%m-%Y",
      #     "%d-%b-%Y",
      #     "%d-%B-%Y",
      #     "%d.%m.%Y",
      #     "%d/%m %Y",
      #     "%d/%m/%Y",
      #
      #     "%-d.%b.%y",
      #     "%-d.%B.%y",
      #     "%-d.%-m.%y",
      #     "%-d/%-m-%y",
      #     "%-d/%-m/%y",
      #     "%d/%m/%y",
      #     "%d-%m-%y",
      #     "%d.%m.%y",
      #     "%d%m%y",
      #
      #     # MDY formats
      #     "%-m/%-d/%Y",
      #     "%m/%d/%Y",
      #     "%m-%d-%Y",
      #     "%b-%d-%Y",
      #     "%B %-d, %Y",
      #     "%B %-d. %Y",
      #     "%B %d, %Y",
      #     "%B-%d-%Y",
      #     "%B/%d/%Y",
      #
      #     "%-m/%-d/%y",
      #
      #     # other formats
      #     "%Y-%j",
      #     "%Y%m",
      #     "%Y-%m",
      #     "%Y %m",
      #   ]
      # end

      def self.type_name
        "date"
      end

      # Returns true when +data+ coerces to a Date.
      def self.matches?(data)
        coerce(data).kind_of?(Date)
      end

      # Converts +data+ to a Date.
      #
      # A Date is returned as-is. A String is tried against each entry of
      # parse_formats in order and the first successful parse wins. DateTime
      # values are rejected, as is everything else, by returning CoerceFailure.
      def self.coerce(data)
        case data
        when DateTime
          # DateTime is a subclass of Date, so it has to be rejected before the
          # Date branch is reached.
          CoerceFailure
        when Date
          data
        when String
          coerced_data = CoerceFailure
          parse_formats.each do |format|
            begin
              coerced_data = Date.strptime(data, format)
              break
            rescue StandardError
              # this format did not fit - move on to the next one
              false
            end
          end
          coerced_data
        else
          CoerceFailure
        end
      end
    end
  end
end
|
176
|
+
|
177
|
+
|
178
|
+
__END__
|
179
|
+
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module FlatKit
  class FieldType
    # Field type for floating point values: Float instances and Strings that
    # parse as a float but are not integers.
    class FloatType < FieldType

      def self.type_name
        "float"
      end

      # Returns true for a Float, or for a String that IntegerType does not
      # match and that Kernel#Float can parse. Everything else, Integer
      # included, is not a float.
      def self.matches?(data)
        case data
        when Float
          true
        when String
          !IntegerType.matches?(data) && parses_as_float?(data)
        else
          false
        end
      end

      # Converts +data+ with Kernel#Float, returning CoerceFailure when the
      # conversion raises TypeError or ArgumentError.
      def self.coerce(data)
        Float(data)
      rescue TypeError, ArgumentError
        CoerceFailure
      end

      # Whether Kernel#Float accepts +text+.
      def self.parses_as_float?(text)
        Float(text)
        true
      rescue ArgumentError
        false
      end
      private_class_method :parses_as_float?
    end
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module FlatKit
  class FieldType
    # Sentinel type standing in for "type not known yet, it still has to be
    # guessed". It never matches any data and never coerces anything.
    class GuessType < FieldType
      class << self
        # The fully qualified class name doubles as the type name.
        def type_name
          name
        end

        # Never matches, whatever +data+ is.
        def matches?(data)
          false
        end

        # Never coerces, whatever +data+ is.
        def coerce(data)
          CoerceFailure
        end
      end
    end
  end
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
module FlatKit
  class FieldType
    # Field type for whole numbers: Integer instances and Strings made of an
    # optional sign followed by decimal digits.
    class IntegerType < FieldType

      REGEX = /\A[-+]?\d+\Z/

      def self.type_name
        "integer"
      end

      # Returns true for an Integer or for a String matching REGEX. Floats and
      # all other values are not integers.
      def self.matches?(data)
        case data
        when Integer
          true
        when Float
          false
        when String
          REGEX.match?(data)
        else
          false
        end
      end

      # Converts +data+ to an Integer, returning CoerceFailure when it cannot
      # be converted.
      #
      # Strings are parsed strictly as base 10. Without an explicit base,
      # Kernel#Integer treats a leading zero as an octal prefix, so "010" would
      # become 8 and "08" would be rejected, although matches? accepts both as
      # plain decimal numbers.
      def self.coerce(data)
        # A base may only be passed together with a String argument.
        return Integer(data, 10) if data.kind_of?(String)

        Integer(data)
      rescue TypeError, ArgumentError, FloatDomainError
        # FloatDomainError covers NaN and Infinity, which have no Integer value.
        CoerceFailure
      end

    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
module FlatKit
  class FieldType
    # Field type for missing values: nil itself and the strings "null" / "nil"
    # in any letter case, plus the literal two character sequence \N.
    class NullType < FieldType

      REGEX = Regexp.union(/\A(null|nil)\Z/i, /\A\\N\Z/)

      def self.type_name
        "null"
      end

      # Returns true for nil and for a String matching REGEX, false otherwise.
      def self.matches?(data)
        nil.equal?(data) || (String === data && REGEX.match?(data))
      end

      # Returns nil for every value that matches?, CoerceFailure for the rest.
      def self.coerce(data)
        return nil if nil.equal?(data)
        return nil if String === data && REGEX.match?(data)

        CoerceFailure
      end
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module FlatKit
  class FieldType
    # Fallback type: it matches every String, which is why it carries a lower
    # weight than the other types that also have string representations.
    class StringType < FieldType
      class << self
        def type_name
          "string"
        end

        # Returns true only when +data+ is a String.
        def matches?(data)
          data.is_a?(String)
        end

        # Returns the +to_s+ representation of +data+, or CoerceFailure when
        # calling +to_s+ raises a StandardError.
        def coerce(data)
          begin
            data.to_s
          rescue StandardError
            CoerceFailure
          end
        end
      end
    end
  end
end
|