flat_kit 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. checksums.yaml +4 -4
  2. data/HISTORY.md +6 -0
  3. data/Manifest.txt +34 -0
  4. data/Rakefile +1 -0
  5. data/examples/stream-active-record-to-csv.rb +42 -0
  6. data/lib/flat_kit.rb +9 -2
  7. data/lib/flat_kit/cli.rb +11 -0
  8. data/lib/flat_kit/command.rb +1 -0
  9. data/lib/flat_kit/command/stats.rb +94 -0
  10. data/lib/flat_kit/descendant_tracker.rb +9 -0
  11. data/lib/flat_kit/event_emitter.rb +2 -2
  12. data/lib/flat_kit/field_stats.rb +241 -0
  13. data/lib/flat_kit/field_type.rb +75 -0
  14. data/lib/flat_kit/field_type/boolean_type.rb +48 -0
  15. data/lib/flat_kit/field_type/date_type.rb +179 -0
  16. data/lib/flat_kit/field_type/float_type.rb +37 -0
  17. data/lib/flat_kit/field_type/guess_type.rb +20 -0
  18. data/lib/flat_kit/field_type/integer_type.rb +34 -0
  19. data/lib/flat_kit/field_type/null_type.rb +35 -0
  20. data/lib/flat_kit/field_type/string_type.rb +22 -0
  21. data/lib/flat_kit/field_type/timestamp_type.rb +47 -0
  22. data/lib/flat_kit/field_type/unknown_type.rb +26 -0
  23. data/lib/flat_kit/jsonl/record.rb +2 -2
  24. data/lib/flat_kit/jsonl/writer.rb +18 -9
  25. data/lib/flat_kit/merge.rb +4 -2
  26. data/lib/flat_kit/output.rb +4 -0
  27. data/lib/flat_kit/position.rb +19 -0
  28. data/lib/flat_kit/stat_type.rb +65 -0
  29. data/lib/flat_kit/stat_type/nominal_stats.rb +58 -0
  30. data/lib/flat_kit/stat_type/numerical_stats.rb +120 -0
  31. data/lib/flat_kit/stat_type/ordinal_stats.rb +42 -0
  32. data/lib/flat_kit/stats.rb +66 -0
  33. data/lib/flat_kit/writer.rb +17 -2
  34. data/lib/flat_kit/xsv/writer.rb +24 -9
  35. data/test/field_type/test_boolean_type.rb +65 -0
  36. data/test/field_type/test_date_type.rb +71 -0
  37. data/test/field_type/test_float_type.rb +56 -0
  38. data/test/field_type/test_guess_type.rb +14 -0
  39. data/test/field_type/test_integer_type.rb +52 -0
  40. data/test/field_type/test_null_type.rb +41 -0
  41. data/test/field_type/test_string_type.rb +18 -0
  42. data/test/field_type/test_timestamp_type.rb +108 -0
  43. data/test/field_type/test_unknown_type.rb +35 -0
  44. data/test/jsonl/test_writer.rb +21 -3
  45. data/test/run +23 -0
  46. data/test/stat_type/test_nominal_stats.rb +69 -0
  47. data/test/stat_type/test_numerical_stats.rb +118 -0
  48. data/test/stat_type/test_ordinal_stats.rb +92 -0
  49. data/test/test_event_emitter.rb +19 -2
  50. data/test/test_field_stats.rb +134 -0
  51. data/test/test_field_type.rb +34 -0
  52. data/test/xsv/test_writer.rb +25 -4
  53. metadata +65 -2
@@ -0,0 +1,75 @@
1
module FlatKit
  # Base class for the field types used to classify raw values and convert
  # them to Ruby objects.
  #
  # Concrete types subclass FieldType and implement the class methods
  # type_name, matches?(data) and coerce(data). Subclass bookkeeping is
  # delegated to FlatKit::DescendantTracker.
  class FieldType

    extend FlatKit::DescendantTracker

    # Sentinel returned by coerce(data) implementations when a value cannot
    # be converted; it is distinct from nil/false, which are valid results.
    CoerceFailure = Class.new(::Object).freeze

    # The field types that could represent +data+.
    #
    # Delegates to DescendantTracker#find_children(:matches?, data).
    # NOTE(review): presumably this yields every subclass whose
    # matches?(data) is truthy - confirm against DescendantTracker.
    def self.candidate_types(data)
      find_children(:matches?, data)
    end

    # The candidate type with the greatest weight for +data+.
    def self.best_guess(data)
      candidate_types(data).sort_by { |t| t.weight }.last
    end

    # Short name of the type; must be overridden by every subclass.
    #
    # Raises NotImplementedError when a subclass has not overridden it.
    def self.type_name
      # The message must not call type_name itself: doing so recursed until
      # SystemStackError instead of raising the intended NotImplementedError.
      raise NotImplementedError, "must implement #{self.name}.type_name"
    end

    # Whether +data+ can be represented by this type; must be overridden.
    #
    # Raises NotImplementedError when a subclass has not overridden it.
    def self.matches?(data)
      raise NotImplementedError, "must implement #{self.name}.matches?(data)"
    end

    # Convert +data+ to this type's Ruby representation; must be overridden.
    #
    # Raises NotImplementedError when a subclass has not overridden it.
    def self.coerce(data)
      raise NotImplementedError, "must implement #{self.name}.coerce(data)"
    end

    # Each type has a weight so if a value matches multiple types, then the list
    # can be compared to see where the tie breakers are
    #
    # All the weights are kept here, in one method, so that the relative
    # precedence of every type can be read in a single place.
    #
    # Raises NotImplementedError for a type that has no weight assigned.
    def self.weight
      # Boolean has crossover with Integer so going to let it overrule Integer
      return 5 if self == BooleanType

      # Integer could potentially overlap with Float, but it is more restrictive
      # so let it override Float
      return 4 if self == IntegerType
      return 3 if self == FloatType

      # Date and Timestamps string representation shouldn't intersect with anything so
      # leaving it at the same level as Null and Unknown
      return 2 if self == DateType
      return 2 if self == TimestampType

      # Null and Unknown shouldn't conflict since their string representations
      # do not intersect
      return 2 if self == NullType
      return 2 if self == UnknownType

      # StringType is the fallback for anything that has a string
      # representation, so it should lose out on integers, floats, nulls,
      # unknowns as strings
      return 1 if self == StringType

      # at the bottom - since it should never match anywhere
      return 0 if self == GuessType

      raise NotImplementedError, "No weight assigned to type #{self} - fix immediately"
    end
  end
end
66
+
67
+ require 'flat_kit/field_type/guess_type'
68
+ require 'flat_kit/field_type/boolean_type'
69
+ require 'flat_kit/field_type/date_type'
70
+ require 'flat_kit/field_type/timestamp_type'
71
+ require 'flat_kit/field_type/integer_type'
72
+ require 'flat_kit/field_type/float_type'
73
+ require 'flat_kit/field_type/null_type'
74
+ require 'flat_kit/field_type/string_type'
75
+ require 'flat_kit/field_type/unknown_type'
@@ -0,0 +1,48 @@
1
module FlatKit
  class FieldType
    # Field type for boolean values: true/false themselves, the integers 0
    # and 1, and the case-insensitive strings listed in the regexes below.
    class BooleanType < FieldType

      TRUTHY_REGEX = /\A(true|t|1|yes|y|on)\Z/i
      FALSEY_REGEX = /\A(false|f|0|no|n|off)\Z/i
      REGEX = Regexp.union(TRUTHY_REGEX, FALSEY_REGEX)

      # Short name of this type.
      def self.type_name
        "boolean"
      end

      # Whether +data+ is a boolean, a recognised boolean string, or the
      # Integer 0 or 1. Always returns true or false.
      def self.matches?(data)
        case data
        when TrueClass, FalseClass
          true
        when String
          REGEX.match?(data)
        when Integer
          data.zero? || data == 1
        else
          false
        end
      end

      # Convert +data+ to true or false.
      #
      # Returns CoerceFailure for anything that is not a recognised boolean
      # representation.
      def self.coerce(data)
        case data
        when TrueClass
          true
        when FalseClass
          false
        when Numeric
          return false if data.zero?
          return true if data == 1
          CoerceFailure
        when String
          return true if TRUTHY_REGEX.match?(data)
          return false if FALSEY_REGEX.match?(data)
          CoerceFailure
        else
          # Without this branch unsupported input (nil, Array, ...) fell out
          # of the case as nil, which callers cannot tell apart from a
          # successful conversion.
          CoerceFailure
        end
      end
    end
  end
end
@@ -0,0 +1,179 @@
1
module FlatKit
  class FieldType
    # Field type for calendar dates, i.e. values whose resolution stops at
    # the day. DateTime values are deliberately rejected by this type.
    class DateType < FieldType

      # strftime/strptime directives referenced by the format lists below:
      #
      # %Y   4 digit year
      # %y   2 digit year (%Y mod 100) (00..99)
      # %m   month of year zero padded
      # %-m  month of year no-padding
      # %B   Full month name
      # %b   Abbreviated month name
      # %^b  uppercased month name
      # %d   day of month zero padded
      # %-d  day of month not padded
      # %e   day of month blank padded
      # %j   day of year zero padded

      # The strptime formats tried, in order, when coercing a String.
      #
      # Parse formats differ from print formats because parsing does not
      # deal with flags and widths. The list is built once and memoized.
      def self.parse_formats
        @parse_formats ||= [
          # YMD formats
          "%Y-%m-%d",
          "%Y%m%d",
          "%Y/%m/%d",
          "%Y %m %d.",

          # DMY formats
          "%d %B %Y",
          "%d %b %Y",
          "%d-%b-%Y",
          "%d/%b/%Y",
          "%d-%m-%Y",
          "%d-%m-%y",
          "%d %b, %Y",
          "%d %b,%Y",
          "%d %B, %Y",
          "%d %B,%Y",

          # MDY formats
          "%m/%d/%Y",
          "%m-%d-%Y",
          "%m/%d/%y",
          "%m-%d-%y",

          "%B %d, %Y",
          "%b %d, %Y",

          # other formats
          "%Y-%j",
          "%a %b %d %Y"
        ]
      end

      # https://en.wikipedia.org/wiki/Date_format_by_country
      # List of formats culled from the above - not using all as it is
      # definitely a performance issue at the moment
      # def self.known_formats
      #   @known_formats ||= [
      #     # YMD formats
      #     "%Y-%m-%d",
      #     "%Y%m%d",
      #     "%Y/%m/%d",
      #     "%Y.%m.%d",
      #     "%Y.%m.%d.",
      #     "%Y %m %d.",
      #     "%Y %b %d",
      #     "%Y %b %-d",
      #     "%Y %B %-d",
      #     "%Y %B %d",
      #     "%Y-%m%d",
      #     "%Y. %m. %-d.",
      #     "%Y. %m. %d.",
      #     "%Y.%-m.%-d.",
      #     "%Y.%-m.%-d",
      #     "%Y, %d %B",
      #     "%Y, %d %b",
      #
      #     "%y.%-m.%-d",
      #     "%y.%-m.%-d.",
      #     "%y.%m.%d.",
      #     "%y.%m.%d",
      #     "%y/%m/%d",
      #
      #     # DMY formats
      #     "%-d %b %Y",
      #     "%-d %B %Y",
      #     "%-d-%-m-%Y",
      #     "%-d. %-m. %Y",
      #     "%-d. %-m. %Y.",
      #     "%-d. %B %Y",
      #     "%-d. %B %Y.",
      #     "%-d.%-m.%Y",
      #     "%-d.%-m.%Y.",
      #     "%-d.%m.%Y.",
      #     "%-d.%m.%Y",
      #     "%-d.%b.%Y",
      #     "%-d.%B.%Y",
      #     "%-d/%-m %Y",
      #     "%-d/%-m/%Y",
      #     "%d %B %Y",
      #     "%d %b %Y",
      #     "%d-%m-%Y",
      #     "%d-%b-%Y",
      #     "%d-%B-%Y",
      #     "%d.%m.%Y",
      #     "%d/%m %Y",
      #     "%d/%m/%Y",
      #
      #     "%-d.%b.%y",
      #     "%-d.%B.%y",
      #     "%-d.%-m.%y",
      #     "%-d/%-m-%y",
      #     "%-d/%-m/%y",
      #     "%d/%m/%y",
      #     "%d-%m-%y",
      #     "%d.%m.%y",
      #     "%d%m%y",
      #
      #     # MDY formats
      #     "%-m/%-d/%Y",
      #     "%m/%d/%Y",
      #     "%m-%d-%Y",
      #     "%b-%d-%Y",
      #     "%B %-d, %Y",
      #     "%B %-d. %Y",
      #     "%B %d, %Y",
      #     "%B-%d-%Y",
      #     "%B/%d/%Y",
      #
      #     "%-m/%-d/%y",
      #
      #     # other formats
      #     "%Y-%j",
      #     "%Y%m",
      #     "%Y-%m",
      #     "%Y %m",
      #   ]
      # end

      # Short name of this type.
      def self.type_name
        "date"
      end

      # True when coerce(data) produces a Date.
      def self.matches?(data)
        coerce(data).is_a?(Date)
      end

      # Convert +data+ to a Date.
      #
      # A Date is returned unchanged; a String is parsed with the first
      # entry of parse_formats that accepts it. Everything else - including
      # DateTime, which is a Date subclass and so must be checked first -
      # yields CoerceFailure.
      def self.coerce(data)
        return CoerceFailure if data.is_a?(DateTime)
        return data if data.is_a?(Date)
        return CoerceFailure unless data.is_a?(String)

        parse_formats.each do |candidate_format|
          begin
            return Date.strptime(data, candidate_format)
          rescue StandardError
            # this format does not fit; fall through to the next one
            next
          end
        end

        CoerceFailure
      end
    end
  end
end
176
+
177
+
178
+ __END__
179
+
@@ -0,0 +1,37 @@
1
module FlatKit
  class FieldType
    # Field type for floating point values: Float instances and strings
    # that Kernel#Float accepts but IntegerType does not claim.
    class FloatType < FieldType

      # Short name of this type.
      def self.type_name
        "float"
      end

      # Whether +data+ is a Float, or a String that parses as a float and
      # is not already an integer string. Integers never match.
      def self.matches?(data)
        return true if data.is_a?(Float)
        # Integer instances and every other non-String are rejected here
        return false unless data.is_a?(String)
        return false if IntegerType.matches?(data)

        begin
          Float(data)
          true
        rescue ArgumentError
          false
        end
      end

      # Convert +data+ with Kernel#Float; CoerceFailure when it is rejected.
      def self.coerce(data)
        Float(data)
      rescue TypeError, ArgumentError
        CoerceFailure
      end
    end
  end
end
@@ -0,0 +1,20 @@
1
module FlatKit
  class FieldType
    # Sentinel type meaning "the real type of this field is not known yet and
    # has to be guessed". It never matches and never coerces any data.
    class GuessType < FieldType
      class << self
        # The fully qualified class name doubles as the type name.
        def type_name
          name
        end

        # No data is ever considered a match for the sentinel.
        def matches?(data)
          false
        end

        # Coercion always fails for the sentinel.
        def coerce(data)
          CoerceFailure
        end
      end
    end
  end
end
@@ -0,0 +1,34 @@
1
module FlatKit
  class FieldType
    # Field type for integer values: Integer instances and strings made of
    # an optional sign followed by decimal digits.
    class IntegerType < FieldType

      REGEX = /\A[-+]?\d+\Z/

      # Short name of this type.
      def self.type_name
        "integer"
      end

      # Whether +data+ is an Integer or a decimal-digit String. Floats never
      # match. Always returns true or false.
      def self.matches?(data)
        case data
        when Integer
          true
        when Float
          false
        when String
          REGEX.match?(data)
        else
          false
        end
      end

      # Convert +data+ to an Integer; CoerceFailure when it is rejected.
      #
      # Strings are always read as base 10. Kernel#Integer without a base
      # honours radix prefixes, so a zero-padded value such as "010" (which
      # matches? accepts as a decimal integer) was converted to 8, and
      # "0x1A" to 26.
      def self.coerce(data)
        return Integer(data, 10) if data.is_a?(String)

        Integer(data)
      rescue TypeError, ArgumentError
        CoerceFailure
      end

    end
  end
end
@@ -0,0 +1,35 @@
1
module FlatKit
  class FieldType
    # Field type for missing values: nil itself, the case-insensitive words
    # "null" and "nil", and the literal two-character marker \N.
    class NullType < FieldType

      REGEX = Regexp.union(/\A(null|nil)\Z/i, /\A\\N\Z/)

      # Short name of this type.
      def self.type_name
        "null"
      end

      # Whether +data+ is nil or one of the recognised null strings.
      def self.matches?(data)
        return true if data.nil?

        data.is_a?(String) && REGEX.match?(data)
      end

      # Convert +data+ to nil.
      #
      # Returns nil for nil and for recognised null strings; every other
      # value yields CoerceFailure.
      def self.coerce(data)
        return nil if data.nil?
        return nil if data.is_a?(String) && REGEX.match?(data)

        CoerceFailure
      end
    end
  end
end
@@ -0,0 +1,22 @@
1
module FlatKit
  class FieldType
    # The fallback type: only actual String instances match it, which is why
    # it is weighted below the types that also have string representations.
    class StringType < FieldType

      # Short name of this type.
      def self.type_name
        "string"
      end

      # Whether +data+ is a String instance.
      def self.matches?(data)
        data.is_a?(String)
      end

      # Convert +data+ with #to_s; CoerceFailure if that call raises.
      def self.coerce(data)
        begin
          data.to_s
        rescue StandardError
          CoerceFailure
        end
      end
    end
  end
end