schema-inference 1.1.0 → 1.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/schema/inference/schema_inferrer.rb +57 -16
- data/lib/schema/inference/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2f26792c5eb829a7d9971d33fa1f8f5969cde64b
|
4
|
+
data.tar.gz: 810e9c35acc28b06c5d2858d92e3d2fb08cbabb5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2c8e5baefeb83e102dffc70d7f7a57638f5e1d9c886701947c032f93e1f6b1cc82fb370dcd8873b9e414c41fd291fb089b0ccb73383fa5764e18cd8fc2118935
|
7
|
+
data.tar.gz: 162cace85faea7adc7e66861b874e1ea13a9b01b761ba735e615c4f5d1bc95a2e426754381de87e9e1b38c4776e881bc84e7c1984dee68a2cec8bebc10d6f123
|
@@ -50,30 +50,53 @@ module Schema
|
|
50
50
|
raise ArgumentError, 'dataset must be an array or a hash'
|
51
51
|
end
|
52
52
|
|
53
|
+
FIXNUM_MAX = (2**(0.size * 8 -2) -1)
|
54
|
+
|
53
55
|
def data_schema(data)
|
54
56
|
table_schema = {}
|
55
57
|
data.each do |record|
|
56
58
|
# fetch the record schema & update the general schema
|
57
59
|
rec_schema = record_schema(record)
|
58
|
-
rec_schema.each do |
|
59
|
-
table_schema[
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
60
|
+
rec_schema.each do |field|
|
61
|
+
field_schema = table_schema[field[:field]] ||= {type: field[:type], usage_count: 0}
|
62
|
+
field_schema = table_schema[field[:field]]
|
63
|
+
if field_schema[:type] != field[:type]
|
64
|
+
if field_schema[:type] == NilClass
|
65
|
+
# if it was set as nil, we now set it to a concrete type
|
66
|
+
field_schema[:type] = field[:type]
|
67
|
+
elsif field[:type] != nil
|
68
|
+
# if it had a different (non-nil) type, then try to upcast
|
69
|
+
field_schema[:type] = lowest_common_type(field[:type], field_schema[:type])
|
65
70
|
end
|
66
71
|
end
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
72
|
+
|
73
|
+
field_schema[:usage_count] += 1
|
74
|
+
field_schema[:types] ||= {}
|
75
|
+
field_schema[:types][field[:type]] ||= { count: 0 }
|
76
|
+
field_schema[:types][field[:type]][:count] += 1
|
77
|
+
|
78
|
+
if type_has_min_max?(field[:type])
|
79
|
+
field_size = value_length(field[:inferred_value])
|
80
|
+
field_schema[:types][field[:type]][:min] = [field_schema[:types][field[:type]][:min] || FIXNUM_MAX, field_size].min
|
81
|
+
field_schema[:types][field[:type]][:max] = [field_schema[:types][field[:type]][:max] || 0, field_size].max
|
82
|
+
end
|
83
|
+
|
84
|
+
|
71
85
|
end
|
72
86
|
end
|
73
87
|
|
74
88
|
table_schema
|
75
89
|
end
|
76
90
|
|
91
|
+
def type_has_min_max?(type)
|
92
|
+
type == String || NumericTypes.include?(type)
|
93
|
+
end
|
94
|
+
|
95
|
+
def value_length(value)
|
96
|
+
return value.length if value.is_a?(String)
|
97
|
+
value # leave as-is otherwise
|
98
|
+
end
|
99
|
+
|
77
100
|
def process_schema_results(results, total_count, extended)
|
78
101
|
# aggregate the results
|
79
102
|
table_schema = results[0]
|
@@ -81,10 +104,14 @@ module Schema
|
|
81
104
|
table_schema.each { |k, v|
|
82
105
|
next if res[k].blank?
|
83
106
|
|
84
|
-
# aggregate types count
|
85
|
-
res[k][:types].each { |type, count|
|
86
|
-
table_schema[k][:types][type] ||= 0
|
87
|
-
table_schema[k][:types][type] += count
|
107
|
+
# aggregate types count, set min and max
|
108
|
+
res[k][:types].each { |type, info|
|
109
|
+
table_schema[k][:types][type] ||= { count: 0 }
|
110
|
+
table_schema[k][:types][type][:count] += info[:count]
|
111
|
+
if type_has_min_max?(type)
|
112
|
+
table_schema[k][:types][type][:min] = [table_schema[k][:types][type][:min] || FIXNUM_MAX, info[:min]].min
|
113
|
+
table_schema[k][:types][type][:max] = [table_schema[k][:types][type][:max] || 0, info[:max]].max
|
114
|
+
end
|
88
115
|
}
|
89
116
|
|
90
117
|
# aggregate other informations
|
@@ -174,7 +201,7 @@ module Schema
|
|
174
201
|
record_schema(x, field_name)
|
175
202
|
}
|
176
203
|
else
|
177
|
-
{ field: name, type: detect_type_of(record) }
|
204
|
+
{ field: name, type: detect_type_of(record), inferred_value: inferred_value_of(record) }
|
178
205
|
end
|
179
206
|
end
|
180
207
|
|
@@ -196,6 +223,20 @@ module Schema
|
|
196
223
|
Object
|
197
224
|
end
|
198
225
|
|
226
|
+
def inferred_value_of(value)
|
227
|
+
return value unless value.is_a?(String)
|
228
|
+
|
229
|
+
return value.to_i if value =~ /^[-+]?[0-9]+$/
|
230
|
+
return value.to_f if value =~ /^[-+]?[0-9]*\.?[0-9]+$/
|
231
|
+
return true if value.downcase == 'true'
|
232
|
+
return false if value.downcase == 'false'
|
233
|
+
|
234
|
+
time_value = Timeliness.parse(value)
|
235
|
+
return time_value if time_value
|
236
|
+
|
237
|
+
value
|
238
|
+
end
|
239
|
+
|
199
240
|
def key_access_tokens(key:)
|
200
241
|
key.split(separator).map { |token|
|
201
242
|
# only parse integers for array indexing
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: schema-inference
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.0
|
4
|
+
version: 1.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Eurico Doirado
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-10-
|
11
|
+
date: 2016-10-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|