duckdb 0.6.2-dev1971.0 → 0.6.2-dev2015.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/duckdb/extension/json/buffered_json_reader.cpp +132 -18
- package/src/duckdb/extension/json/include/buffered_json_reader.hpp +29 -9
- package/src/duckdb/extension/json/include/json_common.hpp +56 -0
- package/src/duckdb/extension/json/include/json_functions.hpp +9 -0
- package/src/duckdb/extension/json/include/json_scan.hpp +115 -25
- package/src/duckdb/extension/json/include/json_structure.hpp +73 -0
- package/src/duckdb/extension/json/include/json_transform.hpp +57 -0
- package/src/duckdb/extension/json/json-extension.cpp +3 -0
- package/src/duckdb/extension/json/json_functions/json_contains.cpp +1 -1
- package/src/duckdb/extension/json/json_functions/json_create.cpp +6 -10
- package/src/duckdb/extension/json/json_functions/json_extract.cpp +1 -1
- package/src/duckdb/extension/json/json_functions/json_keys.cpp +60 -0
- package/src/duckdb/extension/json/json_functions/json_structure.cpp +404 -150
- package/src/duckdb/extension/json/json_functions/json_transform.cpp +216 -60
- package/src/duckdb/extension/json/json_functions/read_json.cpp +224 -0
- package/src/duckdb/extension/json/json_functions/read_json_objects.cpp +6 -6
- package/src/duckdb/extension/json/json_functions.cpp +25 -0
- package/src/duckdb/extension/json/json_scan.cpp +192 -86
- package/src/duckdb/extension/json/yyjson/include/yyjson.hpp +18 -9
- package/src/duckdb/extension/json/yyjson/yyjson.cpp +58 -13
- package/src/duckdb/src/function/table/copy_csv.cpp +16 -11
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/function/scalar/strftime.hpp +2 -2
- package/src/duckdb/src/include/duckdb/main/extension_functions.hpp +5 -0
- package/src/duckdb/ub_extension_json_json_functions.cpp +4 -0
|
@@ -1,19 +1,35 @@
|
|
|
1
|
+
#include "json_transform.hpp"
|
|
2
|
+
|
|
1
3
|
#include "duckdb/common/types.hpp"
|
|
2
4
|
#include "duckdb/execution/expression_executor.hpp"
|
|
3
5
|
#include "duckdb/function/scalar/nested_functions.hpp"
|
|
4
|
-
#include "json_common.hpp"
|
|
5
6
|
#include "json_functions.hpp"
|
|
7
|
+
#include "json_scan.hpp"
|
|
6
8
|
|
|
7
9
|
namespace duckdb {
|
|
8
10
|
|
|
11
|
+
void JSONTransformOptions::Serialize(FieldWriter &writer) {
|
|
12
|
+
writer.WriteField(strict_cast);
|
|
13
|
+
writer.WriteField(error_duplicate_key);
|
|
14
|
+
writer.WriteField(error_missing_key);
|
|
15
|
+
writer.WriteField(error_unknown_key);
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
void JSONTransformOptions::Deserialize(FieldReader &reader) {
|
|
19
|
+
strict_cast = reader.ReadRequired<bool>();
|
|
20
|
+
error_duplicate_key = reader.ReadRequired<bool>();
|
|
21
|
+
error_missing_key = reader.ReadRequired<bool>();
|
|
22
|
+
error_unknown_key = reader.ReadRequired<bool>();
|
|
23
|
+
}
|
|
24
|
+
|
|
9
25
|
//! Forward declaration for recursion
|
|
10
|
-
static LogicalType
|
|
26
|
+
static LogicalType StructureStringToType(yyjson_val *val, ClientContext &context);
|
|
11
27
|
|
|
12
|
-
static LogicalType
|
|
28
|
+
static LogicalType StructureStringToTypeArray(yyjson_val *arr, ClientContext &context) {
|
|
13
29
|
if (yyjson_arr_size(arr) != 1) {
|
|
14
30
|
throw InvalidInputException("Too many values in array of JSON structure");
|
|
15
31
|
}
|
|
16
|
-
return LogicalType::LIST(
|
|
32
|
+
return LogicalType::LIST(StructureStringToType(yyjson_arr_get_first(arr), context));
|
|
17
33
|
}
|
|
18
34
|
|
|
19
35
|
static LogicalType StructureToTypeObject(yyjson_val *obj, ClientContext &context) {
|
|
@@ -23,25 +39,25 @@ static LogicalType StructureToTypeObject(yyjson_val *obj, ClientContext &context
|
|
|
23
39
|
yyjson_val *key, *val;
|
|
24
40
|
yyjson_obj_foreach(obj, idx, max, key, val) {
|
|
25
41
|
val = yyjson_obj_iter_get_val(key);
|
|
26
|
-
auto key_str =
|
|
42
|
+
auto key_str = unsafe_yyjson_get_str(key);
|
|
27
43
|
if (names.find(key_str) != names.end()) {
|
|
28
44
|
JSONCommon::ThrowValFormatError("Duplicate keys in object in JSON structure: %s", val);
|
|
29
45
|
}
|
|
30
46
|
names.insert(key_str);
|
|
31
|
-
child_types.emplace_back(key_str,
|
|
47
|
+
child_types.emplace_back(key_str, StructureStringToType(val, context));
|
|
32
48
|
}
|
|
33
49
|
D_ASSERT(yyjson_obj_size(obj) == names.size());
|
|
34
50
|
return LogicalType::STRUCT(child_types);
|
|
35
51
|
}
|
|
36
52
|
|
|
37
|
-
static LogicalType
|
|
53
|
+
static LogicalType StructureStringToType(yyjson_val *val, ClientContext &context) {
|
|
38
54
|
switch (yyjson_get_tag(val)) {
|
|
39
55
|
case YYJSON_TYPE_ARR | YYJSON_SUBTYPE_NONE:
|
|
40
|
-
return
|
|
56
|
+
return StructureStringToTypeArray(val, context);
|
|
41
57
|
case YYJSON_TYPE_OBJ | YYJSON_SUBTYPE_NONE:
|
|
42
58
|
return StructureToTypeObject(val, context);
|
|
43
59
|
case YYJSON_TYPE_STR | YYJSON_SUBTYPE_NONE:
|
|
44
|
-
return TransformStringToLogicalType(
|
|
60
|
+
return TransformStringToLogicalType(unsafe_yyjson_get_str(val), context);
|
|
45
61
|
default:
|
|
46
62
|
throw InvalidInputException("invalid JSON structure");
|
|
47
63
|
}
|
|
@@ -70,7 +86,7 @@ static unique_ptr<FunctionData> JSONTransformBind(ClientContext &context, Scalar
|
|
|
70
86
|
if (err.code != YYJSON_READ_SUCCESS) {
|
|
71
87
|
JSONCommon::ThrowParseError(structure_string.GetDataUnsafe(), structure_string.GetSize(), err);
|
|
72
88
|
}
|
|
73
|
-
bound_function.return_type =
|
|
89
|
+
bound_function.return_type = StructureStringToType(doc->root, context);
|
|
74
90
|
}
|
|
75
91
|
return make_unique<VariableReturnBindData>(bound_function.return_type);
|
|
76
92
|
}
|
|
@@ -155,7 +171,7 @@ static inline bool GetValueString(yyjson_val *val, yyjson_alc *alc, string_t &re
|
|
|
155
171
|
case YYJSON_TYPE_NULL | YYJSON_SUBTYPE_NONE:
|
|
156
172
|
return false;
|
|
157
173
|
case YYJSON_TYPE_STR | YYJSON_SUBTYPE_NONE:
|
|
158
|
-
result =
|
|
174
|
+
result = string_t(unsafe_yyjson_get_str(val), unsafe_yyjson_get_len(val));
|
|
159
175
|
return true;
|
|
160
176
|
case YYJSON_TYPE_ARR | YYJSON_SUBTYPE_NONE:
|
|
161
177
|
case YYJSON_TYPE_OBJ | YYJSON_SUBTYPE_NONE:
|
|
@@ -179,9 +195,6 @@ static inline bool GetValueString(yyjson_val *val, yyjson_alc *alc, string_t &re
|
|
|
179
195
|
}
|
|
180
196
|
}
|
|
181
197
|
|
|
182
|
-
//! Forward declaration for recursion
|
|
183
|
-
static void Transform(yyjson_val *vals[], yyjson_alc *alc, Vector &result, const idx_t count, bool strict);
|
|
184
|
-
|
|
185
198
|
template <class T>
|
|
186
199
|
static void TransformNumerical(yyjson_val *vals[], Vector &result, const idx_t count, const bool strict) {
|
|
187
200
|
auto data = (T *)FlatVector::GetData(result);
|
|
@@ -207,9 +220,8 @@ static void TransformDecimal(yyjson_val *vals[], Vector &result, const idx_t cou
|
|
|
207
220
|
}
|
|
208
221
|
}
|
|
209
222
|
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
Vector string_vector(LogicalTypeId::VARCHAR, count);
|
|
223
|
+
void JSONTransform::GetStringVector(yyjson_val *vals[], const idx_t count, const LogicalType &target,
|
|
224
|
+
Vector &string_vector, const bool strict) {
|
|
213
225
|
auto data = (string_t *)FlatVector::GetData(string_vector);
|
|
214
226
|
auto &validity = FlatVector::Validity(string_vector);
|
|
215
227
|
|
|
@@ -223,6 +235,11 @@ static void TransformFromString(yyjson_val *vals[], Vector &result, const idx_t
|
|
|
223
235
|
data[i] = GetString(val);
|
|
224
236
|
}
|
|
225
237
|
}
|
|
238
|
+
}
|
|
239
|
+
|
|
240
|
+
static void TransformFromString(yyjson_val *vals[], Vector &result, const idx_t count, const bool strict) {
|
|
241
|
+
Vector string_vector(LogicalTypeId::VARCHAR, count);
|
|
242
|
+
JSONTransform::GetStringVector(vals, count, result.GetType(), string_vector, strict);
|
|
226
243
|
|
|
227
244
|
string error_message;
|
|
228
245
|
if (!VectorOperations::DefaultTryCast(string_vector, result, count, &error_message, strict) && strict) {
|
|
@@ -230,6 +247,63 @@ static void TransformFromString(yyjson_val *vals[], Vector &result, const idx_t
|
|
|
230
247
|
}
|
|
231
248
|
}
|
|
232
249
|
|
|
250
|
+
template <class OP, class T>
|
|
251
|
+
static bool TransformStringWithFormat(Vector &string_vector, StrpTimeFormat &format, const idx_t count, Vector &result,
|
|
252
|
+
string &error_message) {
|
|
253
|
+
const auto source_strings = FlatVector::GetData<string_t>(string_vector);
|
|
254
|
+
const auto &source_validity = FlatVector::Validity(string_vector);
|
|
255
|
+
|
|
256
|
+
auto target_vals = FlatVector::GetData<T>(result);
|
|
257
|
+
auto &target_validity = FlatVector::Validity(result);
|
|
258
|
+
|
|
259
|
+
bool success = true;
|
|
260
|
+
if (source_validity.AllValid()) {
|
|
261
|
+
for (idx_t i = 0; i < count; i++) {
|
|
262
|
+
if (!OP::template Operation<T>(format, source_strings[i], target_vals[i], error_message)) {
|
|
263
|
+
target_validity.SetInvalid(i);
|
|
264
|
+
success = false;
|
|
265
|
+
}
|
|
266
|
+
}
|
|
267
|
+
} else {
|
|
268
|
+
for (idx_t i = 0; i < count; i++) {
|
|
269
|
+
if (!source_validity.RowIsValid(i)) {
|
|
270
|
+
target_validity.SetInvalid(i);
|
|
271
|
+
} else if (!OP::template Operation<T>(format, source_strings[i], target_vals[i], error_message)) {
|
|
272
|
+
target_validity.SetInvalid(i);
|
|
273
|
+
success = false;
|
|
274
|
+
}
|
|
275
|
+
}
|
|
276
|
+
}
|
|
277
|
+
return success;
|
|
278
|
+
}
|
|
279
|
+
|
|
280
|
+
static void TransformFromStringWithFormat(yyjson_val *vals[], Vector &result, const idx_t count,
|
|
281
|
+
const JSONTransformOptions &options) {
|
|
282
|
+
Vector string_vector(LogicalTypeId::VARCHAR, count);
|
|
283
|
+
JSONTransform::GetStringVector(vals, count, result.GetType(), string_vector, options.strict_cast);
|
|
284
|
+
|
|
285
|
+
const auto &result_type = result.GetType().id();
|
|
286
|
+
auto &format = options.date_format_map->GetFormat(result_type);
|
|
287
|
+
|
|
288
|
+
bool success;
|
|
289
|
+
string error_message;
|
|
290
|
+
switch (result_type) {
|
|
291
|
+
case LogicalTypeId::DATE:
|
|
292
|
+
success = TransformStringWithFormat<TryParseDate, date_t>(string_vector, format, count, result, error_message);
|
|
293
|
+
break;
|
|
294
|
+
case LogicalTypeId::TIMESTAMP:
|
|
295
|
+
success = TransformStringWithFormat<TryParseTimeStamp, timestamp_t>(string_vector, format, count, result,
|
|
296
|
+
error_message);
|
|
297
|
+
break;
|
|
298
|
+
default:
|
|
299
|
+
throw InternalException("No date/timestamp formats for %s", LogicalTypeIdToString(result.GetType().id()));
|
|
300
|
+
}
|
|
301
|
+
|
|
302
|
+
if (options.strict_cast && !success) {
|
|
303
|
+
throw CastException(error_message);
|
|
304
|
+
}
|
|
305
|
+
}
|
|
306
|
+
|
|
233
307
|
static void TransformToString(yyjson_val *vals[], yyjson_alc *alc, Vector &result, const idx_t count) {
|
|
234
308
|
auto data = (string_t *)FlatVector::GetData(result);
|
|
235
309
|
auto &validity = FlatVector::Validity(result);
|
|
@@ -241,44 +315,118 @@ static void TransformToString(yyjson_val *vals[], yyjson_alc *alc, Vector &resul
|
|
|
241
315
|
}
|
|
242
316
|
}
|
|
243
317
|
|
|
244
|
-
static void
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
318
|
+
static void Transform(yyjson_val *vals[], yyjson_alc *alc, Vector &result, const idx_t count,
|
|
319
|
+
const JSONTransformOptions &options);
|
|
320
|
+
|
|
321
|
+
void JSONTransform::TransformObject(yyjson_val *objects[], yyjson_alc *alc, const idx_t count,
|
|
322
|
+
const vector<string> &names, const vector<Vector *> &result_vectors,
|
|
323
|
+
const JSONTransformOptions &options) {
|
|
324
|
+
D_ASSERT(alc);
|
|
325
|
+
D_ASSERT(names.size() == result_vectors.size());
|
|
326
|
+
const idx_t column_count = names.size();
|
|
327
|
+
|
|
328
|
+
// Build hash map from key to column index so we don't have to linearly search using the key
|
|
329
|
+
json_key_map_t<idx_t> key_map;
|
|
330
|
+
vector<yyjson_val **> nested_vals;
|
|
331
|
+
nested_vals.reserve(column_count);
|
|
332
|
+
for (idx_t col_idx = 0; col_idx < column_count; col_idx++) {
|
|
333
|
+
key_map.insert({{names[col_idx].c_str(), names[col_idx].length()}, col_idx});
|
|
334
|
+
nested_vals.push_back((yyjson_val **)alc->malloc(alc->ctx, sizeof(yyjson_val *) * count));
|
|
335
|
+
}
|
|
336
|
+
|
|
337
|
+
idx_t found_key_count;
|
|
338
|
+
auto found_keys = (bool *)alc->malloc(alc->ctx, sizeof(bool) * column_count);
|
|
339
|
+
|
|
340
|
+
size_t idx, max;
|
|
341
|
+
yyjson_val *key, *val;
|
|
342
|
+
for (idx_t i = 0; i < count; i++) {
|
|
343
|
+
if (objects[i]) {
|
|
344
|
+
found_key_count = 0;
|
|
345
|
+
memset(found_keys, false, column_count);
|
|
346
|
+
yyjson_obj_foreach(objects[i], idx, max, key, val) {
|
|
347
|
+
auto key_ptr = yyjson_get_str(key);
|
|
348
|
+
auto key_len = yyjson_get_len(key);
|
|
349
|
+
auto it = key_map.find({key_ptr, key_len});
|
|
350
|
+
if (it != key_map.end()) {
|
|
351
|
+
const auto &col_idx = it->second;
|
|
352
|
+
if (options.error_duplicate_key && found_keys[col_idx]) {
|
|
353
|
+
JSONCommon::ThrowValFormatError(
|
|
354
|
+
"Duplicate key \"" + string(key_ptr, key_len) + "\" in object %s", objects[i]);
|
|
355
|
+
}
|
|
356
|
+
nested_vals[col_idx][i] = val;
|
|
357
|
+
found_keys[col_idx] = true;
|
|
358
|
+
if (++found_key_count == column_count) {
|
|
359
|
+
break;
|
|
360
|
+
}
|
|
361
|
+
} else if (options.error_unknown_key) {
|
|
362
|
+
JSONCommon::ThrowValFormatError("Object %s has unknown key \"" + string(key_ptr, key_len) + "\"",
|
|
363
|
+
objects[i]);
|
|
364
|
+
}
|
|
365
|
+
}
|
|
366
|
+
if (found_key_count != column_count) {
|
|
367
|
+
// If 'error_missing_key, we throw an error if one of the keys was not found.
|
|
368
|
+
// If not, we set the nested val to null so the recursion doesn't break
|
|
369
|
+
for (idx_t col_idx = 0; col_idx < column_count; col_idx++) {
|
|
370
|
+
if (!found_keys[col_idx]) {
|
|
371
|
+
if (options.error_missing_key) {
|
|
372
|
+
JSONCommon::ThrowValFormatError("Object %s does not have key \"" + names[col_idx] + "\"",
|
|
373
|
+
objects[i]);
|
|
374
|
+
} else {
|
|
375
|
+
nested_vals[col_idx][i] = nullptr;
|
|
376
|
+
}
|
|
377
|
+
}
|
|
378
|
+
}
|
|
379
|
+
}
|
|
380
|
+
} else {
|
|
381
|
+
// Set nested val to null so the recursion doesn't break
|
|
382
|
+
for (idx_t col_idx = 0; col_idx < column_count; col_idx++) {
|
|
383
|
+
nested_vals[col_idx][i] = nullptr;
|
|
384
|
+
}
|
|
385
|
+
}
|
|
386
|
+
}
|
|
387
|
+
|
|
388
|
+
for (idx_t col_idx = 0; col_idx < column_count; col_idx++) {
|
|
389
|
+
Transform(nested_vals[col_idx], alc, *result_vectors[col_idx], count, options);
|
|
390
|
+
}
|
|
391
|
+
}
|
|
392
|
+
|
|
393
|
+
static void TransformObject(yyjson_val *objects[], yyjson_alc *alc, Vector &result, const idx_t count,
|
|
394
|
+
const LogicalType &type, const JSONTransformOptions &options) {
|
|
395
|
+
// Get child vectors and names
|
|
250
396
|
auto &child_vs = StructVector::GetEntries(result);
|
|
397
|
+
vector<string> child_names;
|
|
398
|
+
vector<Vector *> child_vectors;
|
|
399
|
+
child_names.reserve(child_vs.size());
|
|
400
|
+
child_vectors.reserve(child_vs.size());
|
|
251
401
|
for (idx_t child_i = 0; child_i < child_vs.size(); child_i++) {
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
auto name_len = name.size();
|
|
255
|
-
for (idx_t i = 0; i < count; i++) {
|
|
256
|
-
nested_vals[i] = yyjson_obj_getn(vals[i], name_ptr, name_len);
|
|
257
|
-
}
|
|
258
|
-
// Transform child values
|
|
259
|
-
Transform(nested_vals, alc, *child_vs[child_i], count, strict);
|
|
402
|
+
child_names.push_back(StructType::GetChildName(type, child_i));
|
|
403
|
+
child_vectors.push_back(child_vs[child_i].get());
|
|
260
404
|
}
|
|
405
|
+
|
|
406
|
+
JSONTransform::TransformObject(objects, alc, count, child_names, child_vectors, options);
|
|
261
407
|
}
|
|
262
408
|
|
|
263
|
-
static void TransformArray(yyjson_val *
|
|
409
|
+
static void TransformArray(yyjson_val *arrays[], yyjson_alc *alc, Vector &result, const idx_t count,
|
|
410
|
+
const JSONTransformOptions &options) {
|
|
264
411
|
// Initialize list vector
|
|
265
412
|
auto list_entries = FlatVector::GetData<list_entry_t>(result);
|
|
266
413
|
auto &list_validity = FlatVector::Validity(result);
|
|
267
414
|
idx_t offset = 0;
|
|
268
415
|
for (idx_t i = 0; i < count; i++) {
|
|
269
|
-
if (!
|
|
416
|
+
if (!arrays[i] || yyjson_is_null(arrays[i])) {
|
|
270
417
|
list_validity.SetInvalid(i);
|
|
271
418
|
}
|
|
272
419
|
auto &entry = list_entries[i];
|
|
273
420
|
entry.offset = offset;
|
|
274
|
-
entry.length = yyjson_arr_size(
|
|
421
|
+
entry.length = yyjson_arr_size(arrays[i]);
|
|
275
422
|
offset += entry.length;
|
|
276
423
|
}
|
|
277
424
|
ListVector::SetListSize(result, offset);
|
|
278
425
|
ListVector::Reserve(result, offset);
|
|
426
|
+
|
|
279
427
|
// Initialize array for the nested values
|
|
280
|
-
auto
|
|
281
|
-
|
|
428
|
+
auto nested_vals = (yyjson_val **)alc->malloc(alc->ctx, sizeof(yyjson_val *) * offset);
|
|
429
|
+
|
|
282
430
|
// Get array values
|
|
283
431
|
size_t idx, max;
|
|
284
432
|
yyjson_val *val;
|
|
@@ -288,57 +436,63 @@ static void TransformArray(yyjson_val *vals[], yyjson_alc *alc, Vector &result,
|
|
|
288
436
|
// We already marked this as invalid
|
|
289
437
|
continue;
|
|
290
438
|
}
|
|
291
|
-
yyjson_arr_foreach(
|
|
439
|
+
yyjson_arr_foreach(arrays[i], idx, max, val) {
|
|
292
440
|
nested_vals[list_i] = val;
|
|
293
441
|
list_i++;
|
|
294
442
|
}
|
|
295
443
|
}
|
|
296
444
|
D_ASSERT(list_i == offset);
|
|
297
445
|
// Transform array values
|
|
298
|
-
Transform(nested_vals, alc, ListVector::GetEntry(result), offset,
|
|
446
|
+
Transform(nested_vals, alc, ListVector::GetEntry(result), offset, options);
|
|
299
447
|
}
|
|
300
448
|
|
|
301
|
-
static void Transform(yyjson_val *vals[], yyjson_alc *alc, Vector &result, const idx_t count,
|
|
449
|
+
static void Transform(yyjson_val *vals[], yyjson_alc *alc, Vector &result, const idx_t count,
|
|
450
|
+
const JSONTransformOptions &options) {
|
|
302
451
|
auto result_type = result.GetType();
|
|
452
|
+
if (options.date_format_map && (result_type == LogicalTypeId::TIMESTAMP || result_type == LogicalTypeId::DATE)) {
|
|
453
|
+
TransformFromStringWithFormat(vals, result, count, options);
|
|
454
|
+
return;
|
|
455
|
+
}
|
|
456
|
+
|
|
303
457
|
switch (result_type.id()) {
|
|
304
458
|
case LogicalTypeId::SQLNULL:
|
|
305
459
|
return;
|
|
306
460
|
case LogicalTypeId::BOOLEAN:
|
|
307
|
-
return TransformNumerical<bool>(vals, result, count,
|
|
461
|
+
return TransformNumerical<bool>(vals, result, count, options.strict_cast);
|
|
308
462
|
case LogicalTypeId::TINYINT:
|
|
309
|
-
return TransformNumerical<int8_t>(vals, result, count,
|
|
463
|
+
return TransformNumerical<int8_t>(vals, result, count, options.strict_cast);
|
|
310
464
|
case LogicalTypeId::SMALLINT:
|
|
311
|
-
return TransformNumerical<int16_t>(vals, result, count,
|
|
465
|
+
return TransformNumerical<int16_t>(vals, result, count, options.strict_cast);
|
|
312
466
|
case LogicalTypeId::INTEGER:
|
|
313
|
-
return TransformNumerical<int32_t>(vals, result, count,
|
|
467
|
+
return TransformNumerical<int32_t>(vals, result, count, options.strict_cast);
|
|
314
468
|
case LogicalTypeId::BIGINT:
|
|
315
|
-
return TransformNumerical<int64_t>(vals, result, count,
|
|
469
|
+
return TransformNumerical<int64_t>(vals, result, count, options.strict_cast);
|
|
316
470
|
case LogicalTypeId::UTINYINT:
|
|
317
|
-
return TransformNumerical<uint8_t>(vals, result, count,
|
|
471
|
+
return TransformNumerical<uint8_t>(vals, result, count, options.strict_cast);
|
|
318
472
|
case LogicalTypeId::USMALLINT:
|
|
319
|
-
return TransformNumerical<uint16_t>(vals, result, count,
|
|
473
|
+
return TransformNumerical<uint16_t>(vals, result, count, options.strict_cast);
|
|
320
474
|
case LogicalTypeId::UINTEGER:
|
|
321
|
-
return TransformNumerical<uint32_t>(vals, result, count,
|
|
475
|
+
return TransformNumerical<uint32_t>(vals, result, count, options.strict_cast);
|
|
322
476
|
case LogicalTypeId::UBIGINT:
|
|
323
|
-
return TransformNumerical<uint64_t>(vals, result, count,
|
|
477
|
+
return TransformNumerical<uint64_t>(vals, result, count, options.strict_cast);
|
|
324
478
|
case LogicalTypeId::HUGEINT:
|
|
325
|
-
return TransformNumerical<hugeint_t>(vals, result, count,
|
|
479
|
+
return TransformNumerical<hugeint_t>(vals, result, count, options.strict_cast);
|
|
326
480
|
case LogicalTypeId::FLOAT:
|
|
327
|
-
return TransformNumerical<float>(vals, result, count,
|
|
481
|
+
return TransformNumerical<float>(vals, result, count, options.strict_cast);
|
|
328
482
|
case LogicalTypeId::DOUBLE:
|
|
329
|
-
return TransformNumerical<double>(vals, result, count,
|
|
483
|
+
return TransformNumerical<double>(vals, result, count, options.strict_cast);
|
|
330
484
|
case LogicalTypeId::DECIMAL: {
|
|
331
485
|
auto width = DecimalType::GetWidth(result_type);
|
|
332
486
|
auto scale = DecimalType::GetScale(result_type);
|
|
333
487
|
switch (result_type.InternalType()) {
|
|
334
488
|
case PhysicalType::INT16:
|
|
335
|
-
return TransformDecimal<int16_t>(vals, result, count, width, scale,
|
|
489
|
+
return TransformDecimal<int16_t>(vals, result, count, width, scale, options.strict_cast);
|
|
336
490
|
case PhysicalType::INT32:
|
|
337
|
-
return TransformDecimal<int32_t>(vals, result, count, width, scale,
|
|
491
|
+
return TransformDecimal<int32_t>(vals, result, count, width, scale, options.strict_cast);
|
|
338
492
|
case PhysicalType::INT64:
|
|
339
|
-
return TransformDecimal<int64_t>(vals, result, count, width, scale,
|
|
493
|
+
return TransformDecimal<int64_t>(vals, result, count, width, scale, options.strict_cast);
|
|
340
494
|
case PhysicalType::INT128:
|
|
341
|
-
return TransformDecimal<hugeint_t>(vals, result, count, width, scale,
|
|
495
|
+
return TransformDecimal<hugeint_t>(vals, result, count, width, scale, options.strict_cast);
|
|
342
496
|
default:
|
|
343
497
|
throw InternalException("Unimplemented physical type for decimal");
|
|
344
498
|
}
|
|
@@ -355,14 +509,14 @@ static void Transform(yyjson_val *vals[], yyjson_alc *alc, Vector &result, const
|
|
|
355
509
|
case LogicalTypeId::TIMESTAMP_MS:
|
|
356
510
|
case LogicalTypeId::TIMESTAMP_SEC:
|
|
357
511
|
case LogicalTypeId::UUID:
|
|
358
|
-
return TransformFromString(vals, result, count,
|
|
512
|
+
return TransformFromString(vals, result, count, options.strict_cast);
|
|
359
513
|
case LogicalTypeId::VARCHAR:
|
|
360
514
|
case LogicalTypeId::BLOB:
|
|
361
515
|
return TransformToString(vals, alc, result, count);
|
|
362
516
|
case LogicalTypeId::STRUCT:
|
|
363
|
-
return TransformObject(vals, alc, result, count, result_type,
|
|
517
|
+
return TransformObject(vals, alc, result, count, result_type, options);
|
|
364
518
|
case LogicalTypeId::LIST:
|
|
365
|
-
return TransformArray(vals, alc, result, count,
|
|
519
|
+
return TransformArray(vals, alc, result, count, options);
|
|
366
520
|
default:
|
|
367
521
|
throw InternalException("Unexpected type at JSON Transform %s", result_type.ToString());
|
|
368
522
|
}
|
|
@@ -393,8 +547,10 @@ static void TransformFunction(DataChunk &args, ExpressionState &state, Vector &r
|
|
|
393
547
|
vals[i] = docs[i]->root;
|
|
394
548
|
}
|
|
395
549
|
}
|
|
396
|
-
|
|
397
|
-
|
|
550
|
+
|
|
551
|
+
const JSONTransformOptions options {strict, strict, strict, false, nullptr};
|
|
552
|
+
|
|
553
|
+
Transform(vals, alc, result, count, options);
|
|
398
554
|
|
|
399
555
|
if (args.AllConstant()) {
|
|
400
556
|
result.SetVectorType(VectorType::CONSTANT_VECTOR);
|
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
#include "json_functions.hpp"
|
|
2
|
+
#include "json_scan.hpp"
|
|
3
|
+
#include "json_structure.hpp"
|
|
4
|
+
#include "json_transform.hpp"
|
|
5
|
+
|
|
6
|
+
namespace duckdb {
|
|
7
|
+
|
|
8
|
+
void AutoDetect(ClientContext &context, JSONScanData &bind_data, vector<LogicalType> &return_types,
|
|
9
|
+
vector<string> &names) {
|
|
10
|
+
auto original_scan_type = bind_data.type;
|
|
11
|
+
bind_data.type = JSONScanType::SAMPLE; // Set scan type to sample for the auto-detect, we restore it later
|
|
12
|
+
JSONScanGlobalState gstate(context, bind_data);
|
|
13
|
+
JSONScanLocalState lstate(context, gstate);
|
|
14
|
+
ArenaAllocator allocator(BufferAllocator::Get(context));
|
|
15
|
+
|
|
16
|
+
static const unordered_map<LogicalTypeId, vector<const char *>, LogicalTypeIdHash> FORMAT_TEMPLATES = {
|
|
17
|
+
{LogicalTypeId::DATE, {"%m-%d-%Y", "%m-%d-%y", "%d-%m-%Y", "%d-%m-%y", "%Y-%m-%d", "%y-%m-%d"}},
|
|
18
|
+
{LogicalTypeId::TIMESTAMP,
|
|
19
|
+
{"%Y-%m-%d %H:%M:%S.%f", "%m-%d-%Y %I:%M:%S %p", "%m-%d-%y %I:%M:%S %p", "%d-%m-%Y %H:%M:%S",
|
|
20
|
+
"%d-%m-%y %H:%M:%S", "%Y-%m-%d %H:%M:%S", "%y-%m-%d %H:%M:%S"}},
|
|
21
|
+
};
|
|
22
|
+
|
|
23
|
+
// Populate possible date/timestamp formats, assume this is consistent across columns
|
|
24
|
+
bind_data.date_format_map.Initialize(FORMAT_TEMPLATES);
|
|
25
|
+
|
|
26
|
+
// Read for the specified sample size
|
|
27
|
+
JSONStructureNode node;
|
|
28
|
+
Vector string_vector(LogicalType::VARCHAR);
|
|
29
|
+
idx_t read = 0;
|
|
30
|
+
while (read < bind_data.sample_size) {
|
|
31
|
+
allocator.Reset();
|
|
32
|
+
auto count = lstate.ReadNext(gstate);
|
|
33
|
+
if (count == 0) {
|
|
34
|
+
break;
|
|
35
|
+
}
|
|
36
|
+
idx_t i;
|
|
37
|
+
for (i = 0; i < count; i++) {
|
|
38
|
+
if (lstate.objects[i]) {
|
|
39
|
+
JSONStructure::ExtractStructure(lstate.objects[i], node);
|
|
40
|
+
}
|
|
41
|
+
if (++read == bind_data.sample_size) {
|
|
42
|
+
break;
|
|
43
|
+
}
|
|
44
|
+
}
|
|
45
|
+
if (!node.ContainsVarchar()) { // Can't refine non-VARCHAR types
|
|
46
|
+
continue;
|
|
47
|
+
}
|
|
48
|
+
node.InitializeCandidateTypes(bind_data.max_depth);
|
|
49
|
+
node.RefineCandidateTypes(lstate.objects, i, string_vector, allocator, bind_data.date_format_map);
|
|
50
|
+
}
|
|
51
|
+
bind_data.type = original_scan_type;
|
|
52
|
+
bind_data.transform_options.date_format_map = &bind_data.date_format_map;
|
|
53
|
+
|
|
54
|
+
const auto type = JSONStructure::StructureToType(context, node, bind_data.max_depth);
|
|
55
|
+
if (type.id() != LogicalTypeId::STRUCT) {
|
|
56
|
+
return_types.emplace_back(type);
|
|
57
|
+
names.emplace_back("json");
|
|
58
|
+
return;
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
const auto &child_types = StructType::GetChildTypes(type);
|
|
62
|
+
return_types.reserve(child_types.size());
|
|
63
|
+
names.reserve(child_types.size());
|
|
64
|
+
for (auto &child_type : child_types) {
|
|
65
|
+
return_types.emplace_back(child_type.second);
|
|
66
|
+
names.emplace_back(child_type.first);
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
for (auto &reader : gstate.json_readers) {
|
|
70
|
+
if (reader->IsOpen()) {
|
|
71
|
+
reader->Reset();
|
|
72
|
+
}
|
|
73
|
+
}
|
|
74
|
+
bind_data.stored_readers = std::move(gstate.json_readers);
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
unique_ptr<FunctionData> ReadJSONBind(ClientContext &context, TableFunctionBindInput &input,
|
|
78
|
+
vector<LogicalType> &return_types, vector<string> &names) {
|
|
79
|
+
// First bind default params
|
|
80
|
+
auto result = JSONScanData::Bind(context, input);
|
|
81
|
+
auto &bind_data = (JSONScanData &)*result;
|
|
82
|
+
|
|
83
|
+
for (auto &kv : input.named_parameters) {
|
|
84
|
+
auto loption = StringUtil::Lower(kv.first);
|
|
85
|
+
if (loption == "columns") {
|
|
86
|
+
auto &child_type = kv.second.type();
|
|
87
|
+
if (child_type.id() != LogicalTypeId::STRUCT) {
|
|
88
|
+
throw BinderException("read_json \"columns\" parameter requires a struct as input");
|
|
89
|
+
}
|
|
90
|
+
auto &struct_children = StructValue::GetChildren(kv.second);
|
|
91
|
+
D_ASSERT(StructType::GetChildCount(child_type) == struct_children.size());
|
|
92
|
+
for (idx_t i = 0; i < struct_children.size(); i++) {
|
|
93
|
+
auto &name = StructType::GetChildName(child_type, i);
|
|
94
|
+
auto &val = struct_children[i];
|
|
95
|
+
names.push_back(name);
|
|
96
|
+
if (val.type().id() != LogicalTypeId::VARCHAR) {
|
|
97
|
+
throw BinderException("read_json \"columns\" parameter type specification must be VARCHAR");
|
|
98
|
+
}
|
|
99
|
+
return_types.emplace_back(TransformStringToLogicalType(StringValue::Get(val), context));
|
|
100
|
+
}
|
|
101
|
+
D_ASSERT(names.size() == return_types.size());
|
|
102
|
+
if (names.empty()) {
|
|
103
|
+
throw BinderException("read_json \"columns\" parameter needs at least one column");
|
|
104
|
+
}
|
|
105
|
+
bind_data.names = names;
|
|
106
|
+
} else if (loption == "auto_detect") {
|
|
107
|
+
bind_data.auto_detect = BooleanValue::Get(kv.second);
|
|
108
|
+
} else if (loption == "sample_size") {
|
|
109
|
+
auto arg = BigIntValue::Get(kv.second);
|
|
110
|
+
if (arg == -1) {
|
|
111
|
+
bind_data.sample_size = NumericLimits<idx_t>::Maximum();
|
|
112
|
+
} else if (arg > 0) {
|
|
113
|
+
bind_data.sample_size = arg;
|
|
114
|
+
} else {
|
|
115
|
+
throw BinderException(
|
|
116
|
+
"read_json \"sample_size\" parameter must be positive, or -1 to sample the entire file");
|
|
117
|
+
}
|
|
118
|
+
} else if (loption == "maximum_depth") {
|
|
119
|
+
auto arg = BigIntValue::Get(kv.second);
|
|
120
|
+
if (arg == -1) {
|
|
121
|
+
bind_data.max_depth = NumericLimits<idx_t>::Maximum();
|
|
122
|
+
} else {
|
|
123
|
+
bind_data.max_depth = arg;
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
if (!bind_data.names.empty()) {
|
|
129
|
+
bind_data.auto_detect = false; // override auto-detect when columns are specified
|
|
130
|
+
} else if (!bind_data.auto_detect) {
|
|
131
|
+
throw BinderException("read_json \"columns\" parameter is required when auto_detect is false");
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
if (bind_data.auto_detect) {
|
|
135
|
+
AutoDetect(context, bind_data, return_types, names);
|
|
136
|
+
bind_data.names = names;
|
|
137
|
+
}
|
|
138
|
+
|
|
139
|
+
auto &transform_options = bind_data.transform_options;
|
|
140
|
+
transform_options.strict_cast = !bind_data.ignore_errors;
|
|
141
|
+
transform_options.error_duplicate_key = !bind_data.ignore_errors;
|
|
142
|
+
transform_options.error_missing_key = false;
|
|
143
|
+
transform_options.error_unknown_key = bind_data.auto_detect && !bind_data.ignore_errors;
|
|
144
|
+
|
|
145
|
+
return result;
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
static void ReadJSONFunction(ClientContext &context, TableFunctionInput &data_p, DataChunk &output) {
|
|
149
|
+
auto &gstate = ((JSONGlobalTableFunctionState &)*data_p.global_state).state;
|
|
150
|
+
auto &lstate = ((JSONLocalTableFunctionState &)*data_p.local_state).state;
|
|
151
|
+
D_ASSERT(output.ColumnCount() == gstate.bind_data.names.size());
|
|
152
|
+
|
|
153
|
+
// Fetch next lines
|
|
154
|
+
const auto count = lstate.ReadNext(gstate);
|
|
155
|
+
const auto objects = lstate.objects;
|
|
156
|
+
|
|
157
|
+
vector<Vector *> result_vectors;
|
|
158
|
+
result_vectors.reserve(output.ColumnCount());
|
|
159
|
+
for (idx_t col_idx = 0; col_idx < output.ColumnCount(); col_idx++) {
|
|
160
|
+
result_vectors.push_back(&output.data[col_idx]);
|
|
161
|
+
}
|
|
162
|
+
|
|
163
|
+
// TODO: if errors occur during transformation, we don't have line number information
|
|
164
|
+
JSONTransform::TransformObject(objects, lstate.GetAllocator(), count, gstate.bind_data.names, result_vectors,
|
|
165
|
+
gstate.bind_data.transform_options);
|
|
166
|
+
output.SetCardinality(count);
|
|
167
|
+
}
|
|
168
|
+
|
|
169
|
+
TableFunction GetReadJSONTableFunction(bool list_parameter, shared_ptr<JSONScanInfo> function_info) {
|
|
170
|
+
auto parameter = list_parameter ? LogicalType::LIST(LogicalType::VARCHAR) : LogicalType::VARCHAR;
|
|
171
|
+
TableFunction table_function({parameter}, ReadJSONFunction, ReadJSONBind, JSONGlobalTableFunctionState::Init,
|
|
172
|
+
JSONLocalTableFunctionState::Init);
|
|
173
|
+
|
|
174
|
+
JSONScan::TableFunctionDefaults(table_function);
|
|
175
|
+
table_function.named_parameters["columns"] = LogicalType::ANY;
|
|
176
|
+
table_function.named_parameters["auto_detect"] = LogicalType::BOOLEAN;
|
|
177
|
+
table_function.named_parameters["sample_size"] = LogicalType::BIGINT;
|
|
178
|
+
|
|
179
|
+
table_function.projection_pushdown = true;
|
|
180
|
+
|
|
181
|
+
table_function.function_info = std::move(function_info);
|
|
182
|
+
|
|
183
|
+
return table_function;
|
|
184
|
+
}
|
|
185
|
+
|
|
186
|
+
TableFunction GetReadJSONAutoTableFunction(bool list_parameter, shared_ptr<JSONScanInfo> function_info) {
|
|
187
|
+
auto table_function = GetReadJSONTableFunction(list_parameter, std::move(function_info));
|
|
188
|
+
table_function.named_parameters["maximum_depth"] = LogicalType::BIGINT;
|
|
189
|
+
return table_function;
|
|
190
|
+
}
|
|
191
|
+
|
|
192
|
+
CreateTableFunctionInfo JSONFunctions::GetReadJSONFunction() {
|
|
193
|
+
TableFunctionSet function_set("read_json");
|
|
194
|
+
auto function_info = make_shared<JSONScanInfo>(JSONScanType::READ_JSON, JSONFormat::UNSTRUCTURED, false);
|
|
195
|
+
function_set.AddFunction(GetReadJSONTableFunction(false, function_info));
|
|
196
|
+
function_set.AddFunction(GetReadJSONTableFunction(true, function_info));
|
|
197
|
+
return CreateTableFunctionInfo(function_set);
|
|
198
|
+
}
|
|
199
|
+
|
|
200
|
+
CreateTableFunctionInfo JSONFunctions::GetReadNDJSONFunction() {
|
|
201
|
+
TableFunctionSet function_set("read_ndjson");
|
|
202
|
+
auto function_info = make_shared<JSONScanInfo>(JSONScanType::READ_JSON, JSONFormat::NEWLINE_DELIMITED, false);
|
|
203
|
+
function_set.AddFunction(GetReadJSONTableFunction(false, function_info));
|
|
204
|
+
function_set.AddFunction(GetReadJSONTableFunction(true, function_info));
|
|
205
|
+
return CreateTableFunctionInfo(function_set);
|
|
206
|
+
}
|
|
207
|
+
|
|
208
|
+
CreateTableFunctionInfo JSONFunctions::GetReadJSONAutoFunction() {
|
|
209
|
+
TableFunctionSet function_set("read_json_auto");
|
|
210
|
+
auto function_info = make_shared<JSONScanInfo>(JSONScanType::READ_JSON, JSONFormat::AUTO_DETECT, true);
|
|
211
|
+
function_set.AddFunction(GetReadJSONAutoTableFunction(false, function_info));
|
|
212
|
+
function_set.AddFunction(GetReadJSONAutoTableFunction(true, function_info));
|
|
213
|
+
return CreateTableFunctionInfo(function_set);
|
|
214
|
+
}
|
|
215
|
+
|
|
216
|
+
CreateTableFunctionInfo JSONFunctions::GetReadNDJSONAutoFunction() {
|
|
217
|
+
TableFunctionSet function_set("read_ndjson_auto");
|
|
218
|
+
auto function_info = make_shared<JSONScanInfo>(JSONScanType::READ_JSON, JSONFormat::NEWLINE_DELIMITED, true);
|
|
219
|
+
function_set.AddFunction(GetReadJSONAutoTableFunction(false, function_info));
|
|
220
|
+
function_set.AddFunction(GetReadJSONAutoTableFunction(true, function_info));
|
|
221
|
+
return CreateTableFunctionInfo(function_set);
|
|
222
|
+
}
|
|
223
|
+
|
|
224
|
+
} // namespace duckdb
|