schema-inference 1.1.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: e783c172078cf4000dc5efabd8d5ad85cc3072ff
4
- data.tar.gz: b84bdc0aa7c1cfaec9e6228f70c60e070fffe30f
3
+ metadata.gz: 2f26792c5eb829a7d9971d33fa1f8f5969cde64b
4
+ data.tar.gz: 810e9c35acc28b06c5d2858d92e3d2fb08cbabb5
5
5
  SHA512:
6
- metadata.gz: 9157d2db16ed2b70a6d731f11d363e976a914635dffe32fe1a750576890e3cca675ef844bb34c48ffce19696d7fc7d3826f3e82696a0c5549d5d63176359d950
7
- data.tar.gz: b97153fe64406c10783f28d90e94e3f66a246138c4abeb2bb002cb359f807a211215ca16f6068d74fc74346c3a801eabb354c21c9f42271dc4c48fadc262845b
6
+ metadata.gz: 2c8e5baefeb83e102dffc70d7f7a57638f5e1d9c886701947c032f93e1f6b1cc82fb370dcd8873b9e414c41fd291fb089b0ccb73383fa5764e18cd8fc2118935
7
+ data.tar.gz: 162cace85faea7adc7e66861b874e1ea13a9b01b761ba735e615c4f5d1bc95a2e426754381de87e9e1b38c4776e881bc84e7c1984dee68a2cec8bebc10d6f123
@@ -50,30 +50,53 @@ module Schema
50
50
  raise ArgumentError, 'dataset must be an array or a hash'
51
51
  end
52
52
 
53
+ FIXNUM_MAX = (2**(0.size * 8 -2) -1)
54
+
53
55
  def data_schema(data)
54
56
  table_schema = {}
55
57
  data.each do |record|
56
58
  # fetch the record schema & update the general schema
57
59
  rec_schema = record_schema(record)
58
- rec_schema.each do |field_schema|
59
- table_schema[field_schema[:field]] ||= {type: field_schema[:type], usage_count: 0}
60
- if table_schema[field_schema[:field]][:type] != field_schema[:type]
61
- if table_schema[field_schema[:field]][:type] == NilClass
62
- table_schema[field_schema[:field]][:type] = field_schema[:type]
63
- elsif field_schema[:type] != nil
64
- table_schema[field_schema[:field]][:type] = lowest_common_type(field_schema[:type], table_schema[field_schema[:field]][:type])
60
+ rec_schema.each do |field|
61
+ field_schema = table_schema[field[:field]] ||= {type: field[:type], usage_count: 0}
62
+ field_schema = table_schema[field[:field]]
63
+ if field_schema[:type] != field[:type]
64
+ if field_schema[:type] == NilClass
65
+ # if it was set as nil, we now set it to a concrete type
66
+ field_schema[:type] = field[:type]
67
+ elsif field[:type] != nil
68
+ # if it had a different (non-nil) type, then try to upcast
69
+ field_schema[:type] = lowest_common_type(field[:type], field_schema[:type])
65
70
  end
66
71
  end
67
- table_schema[field_schema[:field]][:usage_count] += 1
68
- table_schema[field_schema[:field]][:types] ||= {}
69
- table_schema[field_schema[:field]][:types][field_schema[:type]] ||= 0
70
- table_schema[field_schema[:field]][:types][field_schema[:type]] += 1
72
+
73
+ field_schema[:usage_count] += 1
74
+ field_schema[:types] ||= {}
75
+ field_schema[:types][field[:type]] ||= { count: 0 }
76
+ field_schema[:types][field[:type]][:count] += 1
77
+
78
+ if type_has_min_max?(field[:type])
79
+ field_size = value_length(field[:inferred_value])
80
+ field_schema[:types][field[:type]][:min] = [field_schema[:types][field[:type]][:min] || FIXNUM_MAX, field_size].min
81
+ field_schema[:types][field[:type]][:max] = [field_schema[:types][field[:type]][:max] || 0, field_size].max
82
+ end
83
+
84
+
71
85
  end
72
86
  end
73
87
 
74
88
  table_schema
75
89
  end
76
90
 
91
+ def type_has_min_max?(type)
92
+ type == String || NumericTypes.include?(type)
93
+ end
94
+
95
+ def value_length(value)
96
+ return value.length if value.is_a?(String)
97
+ value # leave as-is otherwise
98
+ end
99
+
77
100
  def process_schema_results(results, total_count, extended)
78
101
  # aggregate the results
79
102
  table_schema = results[0]
@@ -81,10 +104,14 @@ module Schema
81
104
  table_schema.each { |k, v|
82
105
  next if res[k].blank?
83
106
 
84
- # aggregate types count
85
- res[k][:types].each { |type, count|
86
- table_schema[k][:types][type] ||= 0
87
- table_schema[k][:types][type] += count
107
+ # aggregate types count, set min and max
108
+ res[k][:types].each { |type, info|
109
+ table_schema[k][:types][type] ||= { count: 0 }
110
+ table_schema[k][:types][type][:count] += info[:count]
111
+ if type_has_min_max?(type)
112
+ table_schema[k][:types][type][:min] = [table_schema[k][:types][type][:min] || FIXNUM_MAX, info[:min]].min
113
+ table_schema[k][:types][type][:max] = [table_schema[k][:types][type][:max] || 0, info[:max]].max
114
+ end
88
115
  }
89
116
 
90
117
  # aggregate other informations
@@ -174,7 +201,7 @@ module Schema
174
201
  record_schema(x, field_name)
175
202
  }
176
203
  else
177
- { field: name, type: detect_type_of(record) }
204
+ { field: name, type: detect_type_of(record), inferred_value: inferred_value_of(record) }
178
205
  end
179
206
  end
180
207
 
@@ -196,6 +223,20 @@ module Schema
196
223
  Object
197
224
  end
198
225
 
226
+ def inferred_value_of(value)
227
+ return value unless value.is_a?(String)
228
+
229
+ return value.to_i if value =~ /^[-+]?[0-9]+$/
230
+ return value.to_f if value =~ /^[-+]?[0-9]*\.?[0-9]+$/
231
+ return true if value.downcase == 'true'
232
+ return false if value.downcase == 'false'
233
+
234
+ time_value = Timeliness.parse(value)
235
+ return time_value if time_value
236
+
237
+ value
238
+ end
239
+
199
240
  def key_access_tokens(key:)
200
241
  key.split(separator).map { |token|
201
242
  # only parse integers for array indexing
@@ -1,5 +1,5 @@
1
1
  module Schema
2
2
  module Inference
3
- VERSION = '1.1.0'
3
+ VERSION = '1.2.0'
4
4
  end
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: schema-inference
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.0
4
+ version: 1.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Eurico Doirado
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2016-10-20 00:00:00.000000000 Z
11
+ date: 2016-10-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler