schema-inference 1.1.0 → 1.2.0
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/schema/inference/schema_inferrer.rb +57 -16
- data/lib/schema/inference/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2f26792c5eb829a7d9971d33fa1f8f5969cde64b
|
4
|
+
data.tar.gz: 810e9c35acc28b06c5d2858d92e3d2fb08cbabb5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2c8e5baefeb83e102dffc70d7f7a57638f5e1d9c886701947c032f93e1f6b1cc82fb370dcd8873b9e414c41fd291fb089b0ccb73383fa5764e18cd8fc2118935
|
7
|
+
data.tar.gz: 162cace85faea7adc7e66861b874e1ea13a9b01b761ba735e615c4f5d1bc95a2e426754381de87e9e1b38c4776e881bc84e7c1984dee68a2cec8bebc10d6f123
|
@@ -50,30 +50,53 @@ module Schema
|
|
50
50
|
raise ArgumentError, 'dataset must be an array or a hash'
|
51
51
|
end
|
52
52
|
|
53
|
+
FIXNUM_MAX = (2**(0.size * 8 -2) -1)
|
54
|
+
|
53
55
|
def data_schema(data)
|
54
56
|
table_schema = {}
|
55
57
|
data.each do |record|
|
56
58
|
# fetch the record schema & update the general schema
|
57
59
|
rec_schema = record_schema(record)
|
58
|
-
rec_schema.each do |
|
59
|
-
table_schema[
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
60
|
+
rec_schema.each do |field|
|
61
|
+
field_schema = table_schema[field[:field]] ||= {type: field[:type], usage_count: 0}
|
62
|
+
field_schema = table_schema[field[:field]]
|
63
|
+
if field_schema[:type] != field[:type]
|
64
|
+
if field_schema[:type] == NilClass
|
65
|
+
# if it was set as nil, we now set it to a concrete type
|
66
|
+
field_schema[:type] = field[:type]
|
67
|
+
elsif field[:type] != nil
|
68
|
+
# if it had a different (non-nil) type, then try to upcast
|
69
|
+
field_schema[:type] = lowest_common_type(field[:type], field_schema[:type])
|
65
70
|
end
|
66
71
|
end
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
72
|
+
|
73
|
+
field_schema[:usage_count] += 1
|
74
|
+
field_schema[:types] ||= {}
|
75
|
+
field_schema[:types][field[:type]] ||= { count: 0 }
|
76
|
+
field_schema[:types][field[:type]][:count] += 1
|
77
|
+
|
78
|
+
if type_has_min_max?(field[:type])
|
79
|
+
field_size = value_length(field[:inferred_value])
|
80
|
+
field_schema[:types][field[:type]][:min] = [field_schema[:types][field[:type]][:min] || FIXNUM_MAX, field_size].min
|
81
|
+
field_schema[:types][field[:type]][:max] = [field_schema[:types][field[:type]][:max] || 0, field_size].max
|
82
|
+
end
|
83
|
+
|
84
|
+
|
71
85
|
end
|
72
86
|
end
|
73
87
|
|
74
88
|
table_schema
|
75
89
|
end
|
76
90
|
|
91
|
+
def type_has_min_max?(type)
|
92
|
+
type == String || NumericTypes.include?(type)
|
93
|
+
end
|
94
|
+
|
95
|
+
def value_length(value)
|
96
|
+
return value.length if value.is_a?(String)
|
97
|
+
value # leave as-is otherwise
|
98
|
+
end
|
99
|
+
|
77
100
|
def process_schema_results(results, total_count, extended)
|
78
101
|
# aggregate the results
|
79
102
|
table_schema = results[0]
|
@@ -81,10 +104,14 @@ module Schema
|
|
81
104
|
table_schema.each { |k, v|
|
82
105
|
next if res[k].blank?
|
83
106
|
|
84
|
-
# aggregate types count
|
85
|
-
res[k][:types].each { |type,
|
86
|
-
table_schema[k][:types][type] ||= 0
|
87
|
-
table_schema[k][:types][type] += count
|
107
|
+
# aggregate types count, set min and max
|
108
|
+
res[k][:types].each { |type, info|
|
109
|
+
table_schema[k][:types][type] ||= { count: 0 }
|
110
|
+
table_schema[k][:types][type][:count] += info[:count]
|
111
|
+
if type_has_min_max?(type)
|
112
|
+
table_schema[k][:types][type][:min] = [table_schema[k][:types][type][:min] || FIXNUM_MAX, info[:min]].min
|
113
|
+
table_schema[k][:types][type][:max] = [table_schema[k][:types][type][:max] || 0, info[:max]].max
|
114
|
+
end
|
88
115
|
}
|
89
116
|
|
90
117
|
# aggregate other informations
|
@@ -174,7 +201,7 @@ module Schema
|
|
174
201
|
record_schema(x, field_name)
|
175
202
|
}
|
176
203
|
else
|
177
|
-
{ field: name, type: detect_type_of(record) }
|
204
|
+
{ field: name, type: detect_type_of(record), inferred_value: inferred_value_of(record) }
|
178
205
|
end
|
179
206
|
end
|
180
207
|
|
@@ -196,6 +223,20 @@ module Schema
|
|
196
223
|
Object
|
197
224
|
end
|
198
225
|
|
226
|
+
def inferred_value_of(value)
|
227
|
+
return value unless value.is_a?(String)
|
228
|
+
|
229
|
+
return value.to_i if value =~ /^[-+]?[0-9]+$/
|
230
|
+
return value.to_f if value =~ /^[-+]?[0-9]*\.?[0-9]+$/
|
231
|
+
return true if value.downcase == 'true'
|
232
|
+
return false if value.downcase == 'false'
|
233
|
+
|
234
|
+
time_value = Timeliness.parse(value)
|
235
|
+
return time_value if time_value
|
236
|
+
|
237
|
+
value
|
238
|
+
end
|
239
|
+
|
199
240
|
def key_access_tokens(key:)
|
200
241
|
key.split(separator).map { |token|
|
201
242
|
# only parse integers for array indexing
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: schema-inference
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.0
|
4
|
+
version: 1.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Eurico Doirado
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-10-
|
11
|
+
date: 2016-10-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|