schema-inference 1.1.0 → 1.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: e783c172078cf4000dc5efabd8d5ad85cc3072ff
4
- data.tar.gz: b84bdc0aa7c1cfaec9e6228f70c60e070fffe30f
3
+ metadata.gz: 2f26792c5eb829a7d9971d33fa1f8f5969cde64b
4
+ data.tar.gz: 810e9c35acc28b06c5d2858d92e3d2fb08cbabb5
5
5
  SHA512:
6
- metadata.gz: 9157d2db16ed2b70a6d731f11d363e976a914635dffe32fe1a750576890e3cca675ef844bb34c48ffce19696d7fc7d3826f3e82696a0c5549d5d63176359d950
7
- data.tar.gz: b97153fe64406c10783f28d90e94e3f66a246138c4abeb2bb002cb359f807a211215ca16f6068d74fc74346c3a801eabb354c21c9f42271dc4c48fadc262845b
6
+ metadata.gz: 2c8e5baefeb83e102dffc70d7f7a57638f5e1d9c886701947c032f93e1f6b1cc82fb370dcd8873b9e414c41fd291fb089b0ccb73383fa5764e18cd8fc2118935
7
+ data.tar.gz: 162cace85faea7adc7e66861b874e1ea13a9b01b761ba735e615c4f5d1bc95a2e426754381de87e9e1b38c4776e881bc84e7c1984dee68a2cec8bebc10d6f123
@@ -50,30 +50,53 @@ module Schema
50
50
  raise ArgumentError, 'dataset must be an array or a hash'
51
51
  end
52
52
 
53
+ FIXNUM_MAX = (2**(0.size * 8 -2) -1)
54
+
53
55
  def data_schema(data)
54
56
  table_schema = {}
55
57
  data.each do |record|
56
58
  # fetch the record schema & update the general schema
57
59
  rec_schema = record_schema(record)
58
- rec_schema.each do |field_schema|
59
- table_schema[field_schema[:field]] ||= {type: field_schema[:type], usage_count: 0}
60
- if table_schema[field_schema[:field]][:type] != field_schema[:type]
61
- if table_schema[field_schema[:field]][:type] == NilClass
62
- table_schema[field_schema[:field]][:type] = field_schema[:type]
63
- elsif field_schema[:type] != nil
64
- table_schema[field_schema[:field]][:type] = lowest_common_type(field_schema[:type], table_schema[field_schema[:field]][:type])
60
+ rec_schema.each do |field|
61
+ field_schema = table_schema[field[:field]] ||= {type: field[:type], usage_count: 0}
62
+ field_schema = table_schema[field[:field]]
63
+ if field_schema[:type] != field[:type]
64
+ if field_schema[:type] == NilClass
65
+ # if it was set as nil, we now set it to a concrete type
66
+ field_schema[:type] = field[:type]
67
+ elsif field[:type] != nil
68
+ # if it had a different (non-nil) type, then try to upcast
69
+ field_schema[:type] = lowest_common_type(field[:type], field_schema[:type])
65
70
  end
66
71
  end
67
- table_schema[field_schema[:field]][:usage_count] += 1
68
- table_schema[field_schema[:field]][:types] ||= {}
69
- table_schema[field_schema[:field]][:types][field_schema[:type]] ||= 0
70
- table_schema[field_schema[:field]][:types][field_schema[:type]] += 1
72
+
73
+ field_schema[:usage_count] += 1
74
+ field_schema[:types] ||= {}
75
+ field_schema[:types][field[:type]] ||= { count: 0 }
76
+ field_schema[:types][field[:type]][:count] += 1
77
+
78
+ if type_has_min_max?(field[:type])
79
+ field_size = value_length(field[:inferred_value])
80
+ field_schema[:types][field[:type]][:min] = [field_schema[:types][field[:type]][:min] || FIXNUM_MAX, field_size].min
81
+ field_schema[:types][field[:type]][:max] = [field_schema[:types][field[:type]][:max] || 0, field_size].max
82
+ end
83
+
84
+
71
85
  end
72
86
  end
73
87
 
74
88
  table_schema
75
89
  end
76
90
 
91
+ def type_has_min_max?(type)
92
+ type == String || NumericTypes.include?(type)
93
+ end
94
+
95
+ def value_length(value)
96
+ return value.length if value.is_a?(String)
97
+ value # leave as-is otherwise
98
+ end
99
+
77
100
  def process_schema_results(results, total_count, extended)
78
101
  # aggregate the results
79
102
  table_schema = results[0]
@@ -81,10 +104,14 @@ module Schema
81
104
  table_schema.each { |k, v|
82
105
  next if res[k].blank?
83
106
 
84
- # aggregate types count
85
- res[k][:types].each { |type, count|
86
- table_schema[k][:types][type] ||= 0
87
- table_schema[k][:types][type] += count
107
+ # aggregate types count, set min and max
108
+ res[k][:types].each { |type, info|
109
+ table_schema[k][:types][type] ||= { count: 0 }
110
+ table_schema[k][:types][type][:count] += info[:count]
111
+ if type_has_min_max?(type)
112
+ table_schema[k][:types][type][:min] = [table_schema[k][:types][type][:min] || FIXNUM_MAX, info[:min]].min
113
+ table_schema[k][:types][type][:max] = [table_schema[k][:types][type][:max] || 0, info[:max]].max
114
+ end
88
115
  }
89
116
 
90
117
  # aggregate other informations
@@ -174,7 +201,7 @@ module Schema
174
201
  record_schema(x, field_name)
175
202
  }
176
203
  else
177
- { field: name, type: detect_type_of(record) }
204
+ { field: name, type: detect_type_of(record), inferred_value: inferred_value_of(record) }
178
205
  end
179
206
  end
180
207
 
@@ -196,6 +223,20 @@ module Schema
196
223
  Object
197
224
  end
198
225
 
226
+ def inferred_value_of(value)
227
+ return value unless value.is_a?(String)
228
+
229
+ return value.to_i if value =~ /^[-+]?[0-9]+$/
230
+ return value.to_f if value =~ /^[-+]?[0-9]*\.?[0-9]+$/
231
+ return true if value.downcase == 'true'
232
+ return false if value.downcase == 'false'
233
+
234
+ time_value = Timeliness.parse(value)
235
+ return time_value if time_value
236
+
237
+ value
238
+ end
239
+
199
240
  def key_access_tokens(key:)
200
241
  key.split(separator).map { |token|
201
242
  # only parse integers for array indexing
@@ -1,5 +1,5 @@
1
1
  module Schema
2
2
  module Inference
3
- VERSION = '1.1.0'
3
+ VERSION = '1.2.0'
4
4
  end
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: schema-inference
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.0
4
+ version: 1.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Eurico Doirado
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2016-10-20 00:00:00.000000000 Z
11
+ date: 2016-10-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler