flat_kit 0.2.0 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (53) hide show
  1. checksums.yaml +4 -4
  2. data/HISTORY.md +6 -0
  3. data/Manifest.txt +34 -0
  4. data/Rakefile +1 -0
  5. data/examples/stream-active-record-to-csv.rb +42 -0
  6. data/lib/flat_kit.rb +9 -2
  7. data/lib/flat_kit/cli.rb +11 -0
  8. data/lib/flat_kit/command.rb +1 -0
  9. data/lib/flat_kit/command/stats.rb +94 -0
  10. data/lib/flat_kit/descendant_tracker.rb +9 -0
  11. data/lib/flat_kit/event_emitter.rb +2 -2
  12. data/lib/flat_kit/field_stats.rb +241 -0
  13. data/lib/flat_kit/field_type.rb +75 -0
  14. data/lib/flat_kit/field_type/boolean_type.rb +48 -0
  15. data/lib/flat_kit/field_type/date_type.rb +179 -0
  16. data/lib/flat_kit/field_type/float_type.rb +37 -0
  17. data/lib/flat_kit/field_type/guess_type.rb +20 -0
  18. data/lib/flat_kit/field_type/integer_type.rb +34 -0
  19. data/lib/flat_kit/field_type/null_type.rb +35 -0
  20. data/lib/flat_kit/field_type/string_type.rb +22 -0
  21. data/lib/flat_kit/field_type/timestamp_type.rb +47 -0
  22. data/lib/flat_kit/field_type/unknown_type.rb +26 -0
  23. data/lib/flat_kit/jsonl/record.rb +2 -2
  24. data/lib/flat_kit/jsonl/writer.rb +18 -9
  25. data/lib/flat_kit/merge.rb +4 -2
  26. data/lib/flat_kit/output.rb +4 -0
  27. data/lib/flat_kit/position.rb +19 -0
  28. data/lib/flat_kit/stat_type.rb +65 -0
  29. data/lib/flat_kit/stat_type/nominal_stats.rb +58 -0
  30. data/lib/flat_kit/stat_type/numerical_stats.rb +120 -0
  31. data/lib/flat_kit/stat_type/ordinal_stats.rb +42 -0
  32. data/lib/flat_kit/stats.rb +66 -0
  33. data/lib/flat_kit/writer.rb +17 -2
  34. data/lib/flat_kit/xsv/writer.rb +24 -9
  35. data/test/field_type/test_boolean_type.rb +65 -0
  36. data/test/field_type/test_date_type.rb +71 -0
  37. data/test/field_type/test_float_type.rb +56 -0
  38. data/test/field_type/test_guess_type.rb +14 -0
  39. data/test/field_type/test_integer_type.rb +52 -0
  40. data/test/field_type/test_null_type.rb +41 -0
  41. data/test/field_type/test_string_type.rb +18 -0
  42. data/test/field_type/test_timestamp_type.rb +108 -0
  43. data/test/field_type/test_unknown_type.rb +35 -0
  44. data/test/jsonl/test_writer.rb +21 -3
  45. data/test/run +23 -0
  46. data/test/stat_type/test_nominal_stats.rb +69 -0
  47. data/test/stat_type/test_numerical_stats.rb +118 -0
  48. data/test/stat_type/test_ordinal_stats.rb +92 -0
  49. data/test/test_event_emitter.rb +19 -2
  50. data/test/test_field_stats.rb +134 -0
  51. data/test/test_field_type.rb +34 -0
  52. data/test/xsv/test_writer.rb +25 -4
  53. metadata +65 -2
@@ -0,0 +1,75 @@
module FlatKit
  # Abstract base class for the field types. Concrete types subclass this and
  # implement the class methods type_name, matches? and coerce.
  class FieldType

    extend FlatKit::DescendantTracker

    # Sentinel value returned by the coerce implementations when the data
    # cannot be converted to the type.
    CoerceFailure = Class.new(::Object).freeze

    # The child types whose matches? accepts the given data.
    #
    # Delegates to find_children, which is not defined in this class -
    # presumably it is supplied by DescendantTracker.
    def self.candidate_types(data)
      find_children(:matches?, data)
    end

    # The candidate type with the highest weight for the given data.
    def self.best_guess(data)
      candidate_types(data).sort_by(&:weight).last
    end

    # The short name of the type; subclasses must override.
    #
    # Raises NotImplementedError when not overridden. The message is built from
    # the class name - interpolating type_name itself here would recurse until
    # the stack overflows instead of raising the intended error.
    def self.type_name
      raise NotImplementedError, "must implement #{self.name}.type_name"
    end

    # Whether the data can be represented as this type; subclasses must
    # override.
    def self.matches?(data)
      raise NotImplementedError, "must implement #{self.name}.matches?(data)"
    end

    # Convert the data to this type; subclasses must override.
    def self.coerce(data)
      raise NotImplementedError, "must implement #{self.name}.coerce(data)"
    end

    # Each type has a weight so if a value matches multiple types, then the list
    # can be compared to see where the tie breakers are.
    #
    # All the weights are defined here, in one place, so that the relative
    # ordering of the types can be read at a glance.
    #
    # Raises NotImplementedError for a type that has no weight assigned.
    def self.weight
      # Boolean has crossover with Integer so going to let it overrule Integer
      return 5 if self == BooleanType

      # Integer could potentially overlap with Float, but it is more restrictive
      # so let it override Float
      return 4 if self == IntegerType
      return 3 if self == FloatType

      # Date and Timestamp string representations shouldn't intersect with
      # anything so leaving them at the same level as Null and Unknown
      return 2 if self == DateType
      return 2 if self == TimestampType

      # Null and Unknown shouldn't conflict since their string representations
      # do not intersect
      return 2 if self == NullType
      return 2 if self == UnknownType

      # StringType is the fallback for anything that has a string
      # representation, so it should lose out on integers, floats, nulls,
      # unknowns as strings
      return 1 if self == StringType

      # at the bottom - since it should never match anywhere
      return 0 if self == GuessType

      raise NotImplementedError, "No weight assigned to type #{self} - fix immediately"
    end
  end
end
66
+
67
+ require 'flat_kit/field_type/guess_type'
68
+ require 'flat_kit/field_type/boolean_type'
69
+ require 'flat_kit/field_type/date_type'
70
+ require 'flat_kit/field_type/timestamp_type'
71
+ require 'flat_kit/field_type/integer_type'
72
+ require 'flat_kit/field_type/float_type'
73
+ require 'flat_kit/field_type/null_type'
74
+ require 'flat_kit/field_type/string_type'
75
+ require 'flat_kit/field_type/unknown_type'
@@ -0,0 +1,48 @@
module FlatKit
  class FieldType
    # Field type for boolean data: true / false, the integers 0 and 1, and the
    # usual string spellings (true/false, t/f, 1/0, yes/no, y/n, on/off) in any
    # letter case.
    class BooleanType < FieldType

      TRUTHY_REGEX = /\A(true|t|1|yes|y|on)\Z/i
      FALSEY_REGEX = /\A(false|f|0|no|n|off)\Z/i
      REGEX        = Regexp.union(TRUTHY_REGEX, FALSEY_REGEX)

      def self.type_name
        "boolean"
      end

      # True when data is a boolean, a String matching one of the known
      # spellings, or the Integer 0 or 1. Everything else is not a boolean.
      def self.matches?(data)
        case data
        when TrueClass, FalseClass
          true
        when String
          REGEX.match?(data)
        when Integer
          data.zero? || data == 1
        else
          false
        end
      end

      # Convert data to true or false.
      #
      # Returns CoerceFailure when the data has no boolean interpretation. This
      # includes data of an unsupported class, so that a failure is never
      # reported as a bare nil.
      def self.coerce(data)
        case data
        when TrueClass
          true
        when FalseClass
          false
        when Numeric
          return false if data.zero?
          return true  if data == 1
          CoerceFailure
        when String
          return true  if TRUTHY_REGEX.match?(data)
          return false if FALSEY_REGEX.match?(data)
          CoerceFailure
        else
          CoerceFailure
        end
      end
    end
  end
end
@@ -0,0 +1,179 @@
# Date and DateTime come from the date standard library; load it here so this
# file does not depend on some other file having required it first.
require 'date'

module FlatKit
  class FieldType
    # Representing the type of data which only includes data up to the day
    # resolution
    class DateType < FieldType

      # %Y  4 digit year
      # %y  2 digit year (%Y mod 100) (00..99)
      # %m  month of year zero padded
      # %-m month of year no-padding
      # %B  Full month name
      # %b  Abbreviated month name
      # %^b uppercased month name
      # %d  day of month zero padded
      # %-d day of month not padded
      # %e  day of month blank padded
      # %j  day of year zero padded

      # parse formats are not the same as print formats as parsing does not deal
      # with flags and widths
      #
      # Returns the memoized list of strptime formats, in the order in which
      # coerce tries them.
      def self.parse_formats
        @parse_formats ||= [
          # YMD formats
          "%Y-%m-%d",
          "%Y%m%d",
          "%Y/%m/%d",
          "%Y %m %d.",

          # DMY formats
          "%d %B %Y",
          "%d %b %Y",
          "%d-%b-%Y",
          "%d/%b/%Y",
          "%d-%m-%Y",
          "%d-%m-%y",
          "%d %b, %Y",
          "%d %b,%Y",
          "%d %B, %Y",
          "%d %B,%Y",

          # MDY formats
          "%m/%d/%Y",
          "%m-%d-%Y",
          "%m/%d/%y",
          "%m-%d-%y",

          "%B %d, %Y",
          "%b %d, %Y",

          # other formats
          "%Y-%j",
          "%a %b %d %Y"
        ]
      end

      # https://en.wikipedia.org/wiki/Date_format_by_country
      # List of formats culled from the above - not using all as it is
      # definitely a performance issue at the moment
      # def self.known_formats
      #   @known_formats ||= [
      #     # YMD formats
      #     "%Y-%m-%d",
      #     "%Y%m%d",
      #     "%Y/%m/%d",
      #     "%Y.%m.%d",
      #     "%Y.%m.%d.",
      #     "%Y %m %d.",
      #     "%Y %b %d",
      #     "%Y %b %-d",
      #     "%Y %B %-d",
      #     "%Y %B %d",
      #     "%Y-%m%d",
      #     "%Y. %m. %-d.",
      #     "%Y. %m. %d.",
      #     "%Y.%-m.%-d.",
      #     "%Y.%-m.%-d",
      #     "%Y, %d %B",
      #     "%Y, %d %b",
      #
      #     "%y.%-m.%-d",
      #     "%y.%-m.%-d.",
      #     "%y.%m.%d.",
      #     "%y.%m.%d",
      #     "%y/%m/%d",
      #
      #     # DMY formats
      #     "%-d %b %Y",
      #     "%-d %B %Y",
      #     "%-d-%-m-%Y",
      #     "%-d. %-m. %Y",
      #     "%-d. %-m. %Y.",
      #     "%-d. %B %Y",
      #     "%-d. %B %Y.",
      #     "%-d.%-m.%Y",
      #     "%-d.%-m.%Y.",
      #     "%-d.%m.%Y.",
      #     "%-d.%m.%Y",
      #     "%-d.%b.%Y",
      #     "%-d.%B.%Y",
      #     "%-d/%-m %Y",
      #     "%-d/%-m/%Y",
      #     "%d %B %Y",
      #     "%d %b %Y",
      #     "%d-%m-%Y",
      #     "%d-%b-%Y",
      #     "%d-%B-%Y",
      #     "%d.%m.%Y",
      #     "%d/%m %Y",
      #     "%d/%m/%Y",
      #
      #     "%-d.%b.%y",
      #     "%-d.%B.%y",
      #     "%-d.%-m.%y",
      #     "%-d/%-m-%y",
      #     "%-d/%-m/%y",
      #     "%d/%m/%y",
      #     "%d-%m-%y",
      #     "%d.%m.%y",
      #     "%d%m%y",
      #
      #     # MDY formats
      #     "%-m/%-d/%Y",
      #     "%m/%d/%Y",
      #     "%m-%d-%Y",
      #     "%b-%d-%Y",
      #     "%B %-d, %Y",
      #     "%B %-d. %Y",
      #     "%B %d, %Y",
      #     "%B-%d-%Y",
      #     "%B/%d/%Y",
      #
      #     "%-m/%-d/%y",
      #
      #     # other formats
      #     "%Y-%j",
      #     "%Y%m",
      #     "%Y-%m",
      #     "%Y %m",
      #   ]
      # end

      def self.type_name
        "date"
      end

      # True when coerce is able to produce a Date from the data.
      def self.matches?(data)
        coerce(data).kind_of?(Date)
      end

      # Convert data to a Date.
      #
      # A Date is returned as is, and a String is parsed with the first of the
      # parse_formats that accepts it. A DateTime is rejected even though it is
      # a Date subclass, since it carries more than day resolution.
      #
      # Returns CoerceFailure when no Date can be produced.
      def self.coerce(data)
        case data
        when DateTime
          CoerceFailure
        when Date
          data
        when String
          coerced_data = CoerceFailure
          parse_formats.each do |format|
            begin
              coerced_data = Date.strptime(data, format)
              break
            rescue StandardError
              # this format does not fit the string - move on to the next one
            end
          end
          coerced_data
        else
          CoerceFailure
        end
      end
    end
  end
end
176
+
177
+
178
+ __END__
179
+
@@ -0,0 +1,37 @@
module FlatKit
  class FieldType
    # Field type for floating point data: Float values, and Strings that
    # Kernel#Float can parse but that IntegerType does not already claim.
    class FloatType < FieldType

      def self.type_name
        "float"
      end

      # True for a Float, and for a String that parses as a float without
      # being an integer string. Everything else, Integer included, is false.
      def self.matches?(data)
        return true if data.is_a?(Float)
        return false unless data.is_a?(String)
        return false if IntegerType.matches?(data)

        begin
          Float(data)
          true
        rescue ArgumentError
          false
        end
      end

      # Convert data with Kernel#Float; returns CoerceFailure when that raises
      # a TypeError or an ArgumentError.
      def self.coerce(data)
        Float(data)
      rescue TypeError, ArgumentError
        CoerceFailure
      end
    end
  end
end
@@ -0,0 +1,20 @@
module FlatKit
  class FieldType
    # GuessType stands in for a field whose type is not yet known and has to
    # be guessed. It is a sentinel: it matches no data and coerces nothing.
    class GuessType < FieldType
      class << self
        # The type name is the full class name.
        def type_name
          name
        end

        # Never matches, whatever the data.
        def matches?(data)
          false
        end

        # Always fails, whatever the data.
        def coerce(data)
          CoerceFailure
        end
      end
    end
  end
end
@@ -0,0 +1,34 @@
module FlatKit
  class FieldType
    # Field type for integer data: Integer values and Strings made up of an
    # optional sign followed by decimal digits.
    class IntegerType < FieldType

      REGEX = /\A[-+]?\d+\Z/

      def self.type_name
        "integer"
      end

      # True for an Integer and for a String of optionally signed decimal
      # digits. A Float is never an integer here.
      def self.matches?(data)
        case data
        when Integer
          true
        when Float
          false
        when String
          REGEX.match?(data)
        else
          false
        end
      end

      # Convert data to an Integer.
      #
      # Strings are parsed explicitly in base 10. Without the base, Kernel#Integer
      # honours radix prefixes, so a leading zero means octal: "010" would
      # become 8 and "08" would be rejected, although matches? accepts both as
      # decimal integers. The base can only be given for Strings, so other
      # data goes through the plain conversion.
      #
      # Returns CoerceFailure when the conversion raises a TypeError or an
      # ArgumentError.
      def self.coerce(data)
        return Integer(data, 10) if data.is_a?(String)

        Integer(data)
      rescue TypeError, ArgumentError
        CoerceFailure
      end

    end
  end
end
@@ -0,0 +1,35 @@
module FlatKit
  class FieldType
    # Field type for missing data: nil itself, and the strings "null" and
    # "nil" in any letter case, plus the literal \N marker.
    class NullType < FieldType

      REGEX = Regexp.union(/\A(null|nil)\Z/i, /\A\\N\Z/)

      def self.type_name
        "null"
      end

      # True for nil and for a String spelling of null; false otherwise.
      def self.matches?(data)
        return true if data.nil?
        return false unless data.is_a?(String)

        REGEX.match?(data)
      end

      # Convert data to nil. Returns CoerceFailure for anything that is neither
      # nil nor a String spelling of null.
      def self.coerce(data)
        return nil if data.nil?
        return nil if data.is_a?(String) && REGEX.match?(data)

        CoerceFailure
      end
    end
  end
end
@@ -0,0 +1,22 @@
module FlatKit
  class FieldType
    # StringType is the catch-all type, which is why it carries a lower weight
    # than the other types that also have string representations.
    class StringType < FieldType

      def self.type_name
        "string"
      end

      # Only actual String instances match.
      def self.matches?(data)
        data.is_a?(String)
      end

      # Convert data with to_s; returns CoerceFailure if calling to_s raises a
      # StandardError.
      def self.coerce(data)
        begin
          data.to_s
        rescue StandardError
          CoerceFailure
        end
      end
    end
  end
end