duckdb 0.6.2-dev1971.0 → 0.6.2-dev2015.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. package/package.json +1 -1
  2. package/src/duckdb/extension/json/buffered_json_reader.cpp +132 -18
  3. package/src/duckdb/extension/json/include/buffered_json_reader.hpp +29 -9
  4. package/src/duckdb/extension/json/include/json_common.hpp +56 -0
  5. package/src/duckdb/extension/json/include/json_functions.hpp +9 -0
  6. package/src/duckdb/extension/json/include/json_scan.hpp +115 -25
  7. package/src/duckdb/extension/json/include/json_structure.hpp +73 -0
  8. package/src/duckdb/extension/json/include/json_transform.hpp +57 -0
  9. package/src/duckdb/extension/json/json-extension.cpp +3 -0
  10. package/src/duckdb/extension/json/json_functions/json_contains.cpp +1 -1
  11. package/src/duckdb/extension/json/json_functions/json_create.cpp +6 -10
  12. package/src/duckdb/extension/json/json_functions/json_extract.cpp +1 -1
  13. package/src/duckdb/extension/json/json_functions/json_keys.cpp +60 -0
  14. package/src/duckdb/extension/json/json_functions/json_structure.cpp +404 -150
  15. package/src/duckdb/extension/json/json_functions/json_transform.cpp +216 -60
  16. package/src/duckdb/extension/json/json_functions/read_json.cpp +224 -0
  17. package/src/duckdb/extension/json/json_functions/read_json_objects.cpp +6 -6
  18. package/src/duckdb/extension/json/json_functions.cpp +25 -0
  19. package/src/duckdb/extension/json/json_scan.cpp +192 -86
  20. package/src/duckdb/extension/json/yyjson/include/yyjson.hpp +18 -9
  21. package/src/duckdb/extension/json/yyjson/yyjson.cpp +58 -13
  22. package/src/duckdb/src/function/table/copy_csv.cpp +16 -11
  23. package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
  24. package/src/duckdb/src/include/duckdb/function/scalar/strftime.hpp +2 -2
  25. package/src/duckdb/src/include/duckdb/main/extension_functions.hpp +5 -0
  26. package/src/duckdb/ub_extension_json_json_functions.cpp +4 -0
@@ -1,19 +1,35 @@
1
+ #include "json_transform.hpp"
2
+
1
3
  #include "duckdb/common/types.hpp"
2
4
  #include "duckdb/execution/expression_executor.hpp"
3
5
  #include "duckdb/function/scalar/nested_functions.hpp"
4
- #include "json_common.hpp"
5
6
  #include "json_functions.hpp"
7
+ #include "json_scan.hpp"
6
8
 
7
9
  namespace duckdb {
8
10
 
11
//! Writes the four boolean transform options to `writer`, in this fixed order:
//! strict_cast, error_duplicate_key, error_missing_key, error_unknown_key.
//! JSONTransformOptions::Deserialize reads the fields back in the same order.
// NOTE(review): only these four flags are written. The options object is also used with a
// date_format_map member elsewhere in this diff; that member is not serialized here.
+ void JSONTransformOptions::Serialize(FieldWriter &writer) {
12
+ writer.WriteField(strict_cast);
13
+ writer.WriteField(error_duplicate_key);
14
+ writer.WriteField(error_missing_key);
15
+ writer.WriteField(error_unknown_key);
16
+ }
17
+
18
//! Restores the four boolean transform options from `reader`. Each field is read with
//! ReadRequired<bool>() in the order strict_cast, error_duplicate_key, error_missing_key,
//! error_unknown_key, which is the order JSONTransformOptions::Serialize writes them.
// NOTE(review): no other member of the options object is assigned here.
+ void JSONTransformOptions::Deserialize(FieldReader &reader) {
19
+ strict_cast = reader.ReadRequired<bool>();
20
+ error_duplicate_key = reader.ReadRequired<bool>();
21
+ error_missing_key = reader.ReadRequired<bool>();
22
+ error_unknown_key = reader.ReadRequired<bool>();
23
+ }
24
+
9
25
  //! Forward declaration for recursion
10
- static LogicalType StructureToType(yyjson_val *val, ClientContext &context);
26
+ static LogicalType StructureStringToType(yyjson_val *val, ClientContext &context);
11
27
 
12
//! Maps a JSON array found in a structure string to a LIST type.
//! The array must contain exactly one element; that element is converted recursively with
//! StructureStringToType and becomes the child type of the returned LIST.
//! Throws InvalidInputException when the array size is not 1.
//! This hunk only renames the function (and its recursive callee) from StructureToType* to
//! StructureStringToType*; the body is otherwise unchanged.
- static LogicalType StructureToTypeArray(yyjson_val *arr, ClientContext &context) {
28
+ static LogicalType StructureStringToTypeArray(yyjson_val *arr, ClientContext &context) {
13
29
// NOTE(review): the check is `!= 1`, so an empty array is rejected with the same message
// that talks about too many values.
  if (yyjson_arr_size(arr) != 1) {
14
30
  throw InvalidInputException("Too many values in array of JSON structure");
15
31
  }
16
- return LogicalType::LIST(StructureToType(yyjson_arr_get_first(arr), context));
32
+ return LogicalType::LIST(StructureStringToType(yyjson_arr_get_first(arr), context));
17
33
  }
18
34
 
19
35
  static LogicalType StructureToTypeObject(yyjson_val *obj, ClientContext &context) {
@@ -23,25 +39,25 @@ static LogicalType StructureToTypeObject(yyjson_val *obj, ClientContext &context
23
39
  yyjson_val *key, *val;
24
40
  yyjson_obj_foreach(obj, idx, max, key, val) {
25
41
  val = yyjson_obj_iter_get_val(key);
26
- auto key_str = yyjson_get_str(key);
42
+ auto key_str = unsafe_yyjson_get_str(key);
27
43
  if (names.find(key_str) != names.end()) {
28
44
  JSONCommon::ThrowValFormatError("Duplicate keys in object in JSON structure: %s", val);
29
45
  }
30
46
  names.insert(key_str);
31
- child_types.emplace_back(key_str, StructureToType(val, context));
47
+ child_types.emplace_back(key_str, StructureStringToType(val, context));
32
48
  }
33
49
  D_ASSERT(yyjson_obj_size(obj) == names.size());
34
50
  return LogicalType::STRUCT(child_types);
35
51
  }
36
52
 
37
- static LogicalType StructureToType(yyjson_val *val, ClientContext &context) {
53
+ static LogicalType StructureStringToType(yyjson_val *val, ClientContext &context) {
38
54
  switch (yyjson_get_tag(val)) {
39
55
  case YYJSON_TYPE_ARR | YYJSON_SUBTYPE_NONE:
40
- return StructureToTypeArray(val, context);
56
+ return StructureStringToTypeArray(val, context);
41
57
  case YYJSON_TYPE_OBJ | YYJSON_SUBTYPE_NONE:
42
58
  return StructureToTypeObject(val, context);
43
59
  case YYJSON_TYPE_STR | YYJSON_SUBTYPE_NONE:
44
- return TransformStringToLogicalType(yyjson_get_str(val), context);
60
+ return TransformStringToLogicalType(unsafe_yyjson_get_str(val), context);
45
61
  default:
46
62
  throw InvalidInputException("invalid JSON structure");
47
63
  }
@@ -70,7 +86,7 @@ static unique_ptr<FunctionData> JSONTransformBind(ClientContext &context, Scalar
70
86
  if (err.code != YYJSON_READ_SUCCESS) {
71
87
  JSONCommon::ThrowParseError(structure_string.GetDataUnsafe(), structure_string.GetSize(), err);
72
88
  }
73
- bound_function.return_type = StructureToType(doc->root, context);
89
+ bound_function.return_type = StructureStringToType(doc->root, context);
74
90
  }
75
91
  return make_unique<VariableReturnBindData>(bound_function.return_type);
76
92
  }
@@ -155,7 +171,7 @@ static inline bool GetValueString(yyjson_val *val, yyjson_alc *alc, string_t &re
155
171
  case YYJSON_TYPE_NULL | YYJSON_SUBTYPE_NONE:
156
172
  return false;
157
173
  case YYJSON_TYPE_STR | YYJSON_SUBTYPE_NONE:
158
- result = StringVector::AddString(vector, unsafe_yyjson_get_str(val), unsafe_yyjson_get_len(val));
174
+ result = string_t(unsafe_yyjson_get_str(val), unsafe_yyjson_get_len(val));
159
175
  return true;
160
176
  case YYJSON_TYPE_ARR | YYJSON_SUBTYPE_NONE:
161
177
  case YYJSON_TYPE_OBJ | YYJSON_SUBTYPE_NONE:
@@ -179,9 +195,6 @@ static inline bool GetValueString(yyjson_val *val, yyjson_alc *alc, string_t &re
179
195
  }
180
196
  }
181
197
 
182
- //! Forward declaration for recursion
183
- static void Transform(yyjson_val *vals[], yyjson_alc *alc, Vector &result, const idx_t count, bool strict);
184
-
185
198
  template <class T>
186
199
  static void TransformNumerical(yyjson_val *vals[], Vector &result, const idx_t count, const bool strict) {
187
200
  auto data = (T *)FlatVector::GetData(result);
@@ -207,9 +220,8 @@ static void TransformDecimal(yyjson_val *vals[], Vector &result, const idx_t cou
207
220
  }
208
221
  }
209
222
 
210
- static void TransformFromString(yyjson_val *vals[], Vector &result, const idx_t count, const LogicalType &target,
211
- const bool strict) {
212
- Vector string_vector(LogicalTypeId::VARCHAR, count);
223
+ void JSONTransform::GetStringVector(yyjson_val *vals[], const idx_t count, const LogicalType &target,
224
+ Vector &string_vector, const bool strict) {
213
225
  auto data = (string_t *)FlatVector::GetData(string_vector);
214
226
  auto &validity = FlatVector::Validity(string_vector);
215
227
 
@@ -223,6 +235,11 @@ static void TransformFromString(yyjson_val *vals[], Vector &result, const idx_t
223
235
  data[i] = GetString(val);
224
236
  }
225
237
  }
238
+ }
239
+
240
+ static void TransformFromString(yyjson_val *vals[], Vector &result, const idx_t count, const bool strict) {
241
+ Vector string_vector(LogicalTypeId::VARCHAR, count);
242
+ JSONTransform::GetStringVector(vals, count, result.GetType(), string_vector, strict);
226
243
 
227
244
  string error_message;
228
245
  if (!VectorOperations::DefaultTryCast(string_vector, result, count, &error_message, strict) && strict) {
@@ -230,6 +247,63 @@ static void TransformFromString(yyjson_val *vals[], Vector &result, const idx_t
230
247
  }
231
248
  }
232
249
 
250
//! Parses every string in `string_vector` into a value of type T using `format` and stores it in `result`.
//!   OP            - policy type; OP::Operation<T>(format, source, target, error_message) is called once per valid
//!                   row and its boolean result is treated as parse success.
//!   string_vector - source strings, accessed as a flat vector of string_t.
//!   result        - target vector, accessed as a flat vector of T; rows that fail to parse are marked invalid.
//!   error_message - passed to every OP call; what it holds after several failures depends on OP.
//! Returns true only if no row failed to parse. Rows that are already invalid in the source are marked invalid in
//! the result without being parsed and do not count as failures.
+ template <class OP, class T>
251
+ static bool TransformStringWithFormat(Vector &string_vector, StrpTimeFormat &format, const idx_t count, Vector &result,
252
+ string &error_message) {
253
+ const auto source_strings = FlatVector::GetData<string_t>(string_vector);
254
+ const auto &source_validity = FlatVector::Validity(string_vector);
255
+
256
+ auto target_vals = FlatVector::GetData<T>(result);
257
+ auto &target_validity = FlatVector::Validity(result);
258
+
259
+ bool success = true;
260
// Two loops with the same per-row work: the first skips the validity lookup when every source row is valid.
+ if (source_validity.AllValid()) {
261
+ for (idx_t i = 0; i < count; i++) {
262
+ if (!OP::template Operation<T>(format, source_strings[i], target_vals[i], error_message)) {
263
+ target_validity.SetInvalid(i);
264
+ success = false;
265
+ }
266
+ }
267
+ } else {
268
+ for (idx_t i = 0; i < count; i++) {
269
+ if (!source_validity.RowIsValid(i)) {
270
// Source row is NULL: propagate the NULL, this is not a parse failure.
+ target_validity.SetInvalid(i);
271
+ } else if (!OP::template Operation<T>(format, source_strings[i], target_vals[i], error_message)) {
272
+ target_validity.SetInvalid(i);
273
+ success = false;
274
+ }
275
+ }
276
+ }
277
+ return success;
278
+ }
279
+
280
//! Transforms JSON values into a DATE or TIMESTAMP `result` vector using the format registered for that type in
//! options.date_format_map.
//! Steps: collect the values as strings (JSONTransform::GetStringVector), look up the format for the result type,
//! then parse with TryParseDate or TryParseTimeStamp through TransformStringWithFormat.
//! Throws InternalException for any other result type, and CastException(error_message) when parsing failed and
//! options.strict_cast is set. With strict_cast unset, failed rows are left as marked by the parse helper.
+ static void TransformFromStringWithFormat(yyjson_val *vals[], Vector &result, const idx_t count,
281
+ const JSONTransformOptions &options) {
282
+ Vector string_vector(LogicalTypeId::VARCHAR, count);
283
+ JSONTransform::GetStringVector(vals, count, result.GetType(), string_vector, options.strict_cast);
284
+
285
+ const auto &result_type = result.GetType().id();
286
// date_format_map is dereferenced without a null check here; the caller visible in this diff (Transform) only
// takes this path when options.date_format_map is non-null.
+ auto &format = options.date_format_map->GetFormat(result_type);
287
+
288
+ bool success;
289
+ string error_message;
290
+ switch (result_type) {
291
+ case LogicalTypeId::DATE:
292
+ success = TransformStringWithFormat<TryParseDate, date_t>(string_vector, format, count, result, error_message);
293
+ break;
294
+ case LogicalTypeId::TIMESTAMP:
295
+ success = TransformStringWithFormat<TryParseTimeStamp, timestamp_t>(string_vector, format, count, result,
296
+ error_message);
297
+ break;
298
+ default:
299
+ throw InternalException("No date/timestamp formats for %s", LogicalTypeIdToString(result.GetType().id()));
300
+ }
301
+
302
+ if (options.strict_cast && !success) {
303
+ throw CastException(error_message);
304
+ }
305
+ }
306
+
233
307
  static void TransformToString(yyjson_val *vals[], yyjson_alc *alc, Vector &result, const idx_t count) {
234
308
  auto data = (string_t *)FlatVector::GetData(result);
235
309
  auto &validity = FlatVector::Validity(result);
@@ -241,44 +315,118 @@ static void TransformToString(yyjson_val *vals[], yyjson_alc *alc, Vector &resul
241
315
  }
242
316
  }
243
317
 
244
- static void TransformObject(yyjson_val *vals[], yyjson_alc *alc, Vector &result, const idx_t count,
245
- const LogicalType &type, bool strict) {
246
- // Initialize array for the nested values
247
- auto nested_vals_ptr = unique_ptr<yyjson_val *[]>(new yyjson_val *[count]);
248
- auto nested_vals = nested_vals_ptr.get();
249
- // Loop through child types
318
+ static void Transform(yyjson_val *vals[], yyjson_alc *alc, Vector &result, const idx_t count,
319
+ const JSONTransformOptions &options);
320
+
321
//! Extracts the values for the columns listed in `names` from each JSON object in `objects` and transforms them,
//! one column at a time, into the matching entry of `result_vectors`.
//!   objects        - `count` entries; a null entry produces a null nested value for every column.
//!   alc            - yyjson allocator (asserted non-null) used for the scratch arrays below. Nothing in this
//!                    function frees them; presumably the allocator releases them in bulk - TODO confirm.
//!   names          - column names; must have the same size as `result_vectors` (asserted).
//!   options        - error_duplicate_key, error_unknown_key and error_missing_key select whether the respective
//!                    condition raises an error through JSONCommon::ThrowValFormatError or is tolerated.
//! Each object is scanned once; keys are matched to columns through a hash map instead of one lookup per column.
+ void JSONTransform::TransformObject(yyjson_val *objects[], yyjson_alc *alc, const idx_t count,
322
+ const vector<string> &names, const vector<Vector *> &result_vectors,
323
+ const JSONTransformOptions &options) {
324
+ D_ASSERT(alc);
325
+ D_ASSERT(names.size() == result_vectors.size());
326
+ const idx_t column_count = names.size();
327
+
328
+ // Build hash map from key to column index so we don't have to linearly search using the key
329
+ json_key_map_t<idx_t> key_map;
330
+ vector<yyjson_val **> nested_vals;
331
+ nested_vals.reserve(column_count);
332
+ for (idx_t col_idx = 0; col_idx < column_count; col_idx++) {
333
+ key_map.insert({{names[col_idx].c_str(), names[col_idx].length()}, col_idx});
334
// One array of `count` value pointers per column; entries are filled per row in the loop below.
+ nested_vals.push_back((yyjson_val **)alc->malloc(alc->ctx, sizeof(yyjson_val *) * count));
335
+ }
336
+
337
+ idx_t found_key_count;
338
+ auto found_keys = (bool *)alc->malloc(alc->ctx, sizeof(bool) * column_count);
339
+
340
+ size_t idx, max;
341
+ yyjson_val *key, *val;
342
+ for (idx_t i = 0; i < count; i++) {
343
+ if (objects[i]) {
344
+ found_key_count = 0;
345
// Clears column_count bytes; this covers the whole allocation only if sizeof(bool) is 1.
+ memset(found_keys, false, column_count);
346
+ yyjson_obj_foreach(objects[i], idx, max, key, val) {
347
+ auto key_ptr = yyjson_get_str(key);
348
+ auto key_len = yyjson_get_len(key);
349
+ auto it = key_map.find({key_ptr, key_len});
350
+ if (it != key_map.end()) {
351
+ const auto &col_idx = it->second;
352
+ if (options.error_duplicate_key && found_keys[col_idx]) {
353
+ JSONCommon::ThrowValFormatError(
354
+ "Duplicate key \"" + string(key_ptr, key_len) + "\" in object %s", objects[i]);
355
+ }
356
// Without error_duplicate_key a repeated key simply overwrites the earlier value (last occurrence wins).
+ nested_vals[col_idx][i] = val;
357
+ found_keys[col_idx] = true;
358
// NOTE(review): the counter is incremented even when found_keys[col_idx] was already true. With
// error_duplicate_key unset, a duplicate can make found_key_count reach column_count while another column was
// never seen; the null-fill loop below is then skipped and nested_vals for that column keeps whatever the
// allocator returned for row i. Confirm and count only first occurrences.
// Once every column is matched the scan stops, so later keys are not checked for duplicates or unknown names.
+ if (++found_key_count == column_count) {
359
+ break;
360
+ }
361
+ } else if (options.error_unknown_key) {
362
+ JSONCommon::ThrowValFormatError("Object %s has unknown key \"" + string(key_ptr, key_len) + "\"",
363
+ objects[i]);
364
+ }
365
+ }
366
+ if (found_key_count != column_count) {
367
+ // If 'error_missing_key, we throw an error if one of the keys was not found.
368
+ // If not, we set the nested val to null so the recursion doesn't break
369
+ for (idx_t col_idx = 0; col_idx < column_count; col_idx++) {
370
+ if (!found_keys[col_idx]) {
371
+ if (options.error_missing_key) {
372
+ JSONCommon::ThrowValFormatError("Object %s does not have key \"" + names[col_idx] + "\"",
373
+ objects[i]);
374
+ } else {
375
+ nested_vals[col_idx][i] = nullptr;
376
+ }
377
+ }
378
+ }
379
+ }
380
+ } else {
381
+ // Set nested val to null so the recursion doesn't break
382
+ for (idx_t col_idx = 0; col_idx < column_count; col_idx++) {
383
+ nested_vals[col_idx][i] = nullptr;
384
+ }
385
+ }
386
+ }
387
+
388
// Recurse: transform each column over all rows with the generic Transform dispatcher.
+ for (idx_t col_idx = 0; col_idx < column_count; col_idx++) {
389
+ Transform(nested_vals[col_idx], alc, *result_vectors[col_idx], count, options);
390
+ }
391
+ }
392
+
393
//! STRUCT case of the transform: gathers the child names of `type` and raw pointers to the child vectors of
//! `result`, then delegates the per-key extraction to JSONTransform::TransformObject.
//! The removed lines show the previous approach: one yyjson_obj_getn lookup per child and per row, followed by a
//! Transform call per child.
+ static void TransformObject(yyjson_val *objects[], yyjson_alc *alc, Vector &result, const idx_t count,
394
+ const LogicalType &type, const JSONTransformOptions &options) {
395
+ // Get child vectors and names
250
396
  auto &child_vs = StructVector::GetEntries(result);
397
+ vector<string> child_names;
398
+ vector<Vector *> child_vectors;
399
+ child_names.reserve(child_vs.size());
400
+ child_vectors.reserve(child_vs.size());
251
401
  for (idx_t child_i = 0; child_i < child_vs.size(); child_i++) {
252
- const auto &name = StructType::GetChildName(type, child_i);
253
- auto name_ptr = name.c_str();
254
- auto name_len = name.size();
255
- for (idx_t i = 0; i < count; i++) {
256
- nested_vals[i] = yyjson_obj_getn(vals[i], name_ptr, name_len);
257
- }
258
- // Transform child values
259
- Transform(nested_vals, alc, *child_vs[child_i], count, strict);
402
// Names and vectors are pushed in the same child order, so index k of both lists refers to the same child.
+ child_names.push_back(StructType::GetChildName(type, child_i));
403
+ child_vectors.push_back(child_vs[child_i].get());
260
404
  }
405
+
406
+ JSONTransform::TransformObject(objects, alc, count, child_names, child_vectors, options);
261
407
  }
262
408
 
263
- static void TransformArray(yyjson_val *vals[], yyjson_alc *alc, Vector &result, const idx_t count, bool strict) {
409
+ static void TransformArray(yyjson_val *arrays[], yyjson_alc *alc, Vector &result, const idx_t count,
410
+ const JSONTransformOptions &options) {
264
411
  // Initialize list vector
265
412
  auto list_entries = FlatVector::GetData<list_entry_t>(result);
266
413
  auto &list_validity = FlatVector::Validity(result);
267
414
  idx_t offset = 0;
268
415
  for (idx_t i = 0; i < count; i++) {
269
- if (!vals[i] || yyjson_is_null(vals[i])) {
416
+ if (!arrays[i] || yyjson_is_null(arrays[i])) {
270
417
  list_validity.SetInvalid(i);
271
418
  }
272
419
  auto &entry = list_entries[i];
273
420
  entry.offset = offset;
274
- entry.length = yyjson_arr_size(vals[i]);
421
+ entry.length = yyjson_arr_size(arrays[i]);
275
422
  offset += entry.length;
276
423
  }
277
424
  ListVector::SetListSize(result, offset);
278
425
  ListVector::Reserve(result, offset);
426
+
279
427
  // Initialize array for the nested values
280
- auto nested_vals_ptr = unique_ptr<yyjson_val *[]>(new yyjson_val *[offset]);
281
- auto nested_vals = nested_vals_ptr.get();
428
+ auto nested_vals = (yyjson_val **)alc->malloc(alc->ctx, sizeof(yyjson_val *) * offset);
429
+
282
430
  // Get array values
283
431
  size_t idx, max;
284
432
  yyjson_val *val;
@@ -288,57 +436,63 @@ static void TransformArray(yyjson_val *vals[], yyjson_alc *alc, Vector &result,
288
436
  // We already marked this as invalid
289
437
  continue;
290
438
  }
291
- yyjson_arr_foreach(vals[i], idx, max, val) {
439
+ yyjson_arr_foreach(arrays[i], idx, max, val) {
292
440
  nested_vals[list_i] = val;
293
441
  list_i++;
294
442
  }
295
443
  }
296
444
  D_ASSERT(list_i == offset);
297
445
  // Transform array values
298
- Transform(nested_vals, alc, ListVector::GetEntry(result), offset, strict);
446
+ Transform(nested_vals, alc, ListVector::GetEntry(result), offset, options);
299
447
  }
300
448
 
301
- static void Transform(yyjson_val *vals[], yyjson_alc *alc, Vector &result, const idx_t count, bool strict) {
449
+ static void Transform(yyjson_val *vals[], yyjson_alc *alc, Vector &result, const idx_t count,
450
+ const JSONTransformOptions &options) {
302
451
  auto result_type = result.GetType();
452
+ if (options.date_format_map && (result_type == LogicalTypeId::TIMESTAMP || result_type == LogicalTypeId::DATE)) {
453
+ TransformFromStringWithFormat(vals, result, count, options);
454
+ return;
455
+ }
456
+
303
457
  switch (result_type.id()) {
304
458
  case LogicalTypeId::SQLNULL:
305
459
  return;
306
460
  case LogicalTypeId::BOOLEAN:
307
- return TransformNumerical<bool>(vals, result, count, strict);
461
+ return TransformNumerical<bool>(vals, result, count, options.strict_cast);
308
462
  case LogicalTypeId::TINYINT:
309
- return TransformNumerical<int8_t>(vals, result, count, strict);
463
+ return TransformNumerical<int8_t>(vals, result, count, options.strict_cast);
310
464
  case LogicalTypeId::SMALLINT:
311
- return TransformNumerical<int16_t>(vals, result, count, strict);
465
+ return TransformNumerical<int16_t>(vals, result, count, options.strict_cast);
312
466
  case LogicalTypeId::INTEGER:
313
- return TransformNumerical<int32_t>(vals, result, count, strict);
467
+ return TransformNumerical<int32_t>(vals, result, count, options.strict_cast);
314
468
  case LogicalTypeId::BIGINT:
315
- return TransformNumerical<int64_t>(vals, result, count, strict);
469
+ return TransformNumerical<int64_t>(vals, result, count, options.strict_cast);
316
470
  case LogicalTypeId::UTINYINT:
317
- return TransformNumerical<uint8_t>(vals, result, count, strict);
471
+ return TransformNumerical<uint8_t>(vals, result, count, options.strict_cast);
318
472
  case LogicalTypeId::USMALLINT:
319
- return TransformNumerical<uint16_t>(vals, result, count, strict);
473
+ return TransformNumerical<uint16_t>(vals, result, count, options.strict_cast);
320
474
  case LogicalTypeId::UINTEGER:
321
- return TransformNumerical<uint32_t>(vals, result, count, strict);
475
+ return TransformNumerical<uint32_t>(vals, result, count, options.strict_cast);
322
476
  case LogicalTypeId::UBIGINT:
323
- return TransformNumerical<uint64_t>(vals, result, count, strict);
477
+ return TransformNumerical<uint64_t>(vals, result, count, options.strict_cast);
324
478
  case LogicalTypeId::HUGEINT:
325
- return TransformNumerical<hugeint_t>(vals, result, count, strict);
479
+ return TransformNumerical<hugeint_t>(vals, result, count, options.strict_cast);
326
480
  case LogicalTypeId::FLOAT:
327
- return TransformNumerical<float>(vals, result, count, strict);
481
+ return TransformNumerical<float>(vals, result, count, options.strict_cast);
328
482
  case LogicalTypeId::DOUBLE:
329
- return TransformNumerical<double>(vals, result, count, strict);
483
+ return TransformNumerical<double>(vals, result, count, options.strict_cast);
330
484
  case LogicalTypeId::DECIMAL: {
331
485
  auto width = DecimalType::GetWidth(result_type);
332
486
  auto scale = DecimalType::GetScale(result_type);
333
487
  switch (result_type.InternalType()) {
334
488
  case PhysicalType::INT16:
335
- return TransformDecimal<int16_t>(vals, result, count, width, scale, strict);
489
+ return TransformDecimal<int16_t>(vals, result, count, width, scale, options.strict_cast);
336
490
  case PhysicalType::INT32:
337
- return TransformDecimal<int32_t>(vals, result, count, width, scale, strict);
491
+ return TransformDecimal<int32_t>(vals, result, count, width, scale, options.strict_cast);
338
492
  case PhysicalType::INT64:
339
- return TransformDecimal<int64_t>(vals, result, count, width, scale, strict);
493
+ return TransformDecimal<int64_t>(vals, result, count, width, scale, options.strict_cast);
340
494
  case PhysicalType::INT128:
341
- return TransformDecimal<hugeint_t>(vals, result, count, width, scale, strict);
495
+ return TransformDecimal<hugeint_t>(vals, result, count, width, scale, options.strict_cast);
342
496
  default:
343
497
  throw InternalException("Unimplemented physical type for decimal");
344
498
  }
@@ -355,14 +509,14 @@ static void Transform(yyjson_val *vals[], yyjson_alc *alc, Vector &result, const
355
509
  case LogicalTypeId::TIMESTAMP_MS:
356
510
  case LogicalTypeId::TIMESTAMP_SEC:
357
511
  case LogicalTypeId::UUID:
358
- return TransformFromString(vals, result, count, result_type, strict);
512
+ return TransformFromString(vals, result, count, options.strict_cast);
359
513
  case LogicalTypeId::VARCHAR:
360
514
  case LogicalTypeId::BLOB:
361
515
  return TransformToString(vals, alc, result, count);
362
516
  case LogicalTypeId::STRUCT:
363
- return TransformObject(vals, alc, result, count, result_type, strict);
517
+ return TransformObject(vals, alc, result, count, result_type, options);
364
518
  case LogicalTypeId::LIST:
365
- return TransformArray(vals, alc, result, count, strict);
519
+ return TransformArray(vals, alc, result, count, options);
366
520
  default:
367
521
  throw InternalException("Unexpected type at JSON Transform %s", result_type.ToString());
368
522
  }
@@ -393,8 +547,10 @@ static void TransformFunction(DataChunk &args, ExpressionState &state, Vector &r
393
547
  vals[i] = docs[i]->root;
394
548
  }
395
549
  }
396
- // Transform
397
- Transform(vals, alc, result, count, strict);
550
+
551
+ const JSONTransformOptions options {strict, strict, strict, false, nullptr};
552
+
553
+ Transform(vals, alc, result, count, options);
398
554
 
399
555
  if (args.AllConstant()) {
400
556
  result.SetVectorType(VectorType::CONSTANT_VECTOR);
@@ -0,0 +1,224 @@
1
+ #include "json_functions.hpp"
2
+ #include "json_scan.hpp"
3
+ #include "json_structure.hpp"
4
+ #include "json_transform.hpp"
5
+
6
+ namespace duckdb {
7
+
8
//! Samples the input to infer the result schema and appends it to `return_types` / `names`.
//! Flow, as visible below:
//!   1. Temporarily switch bind_data.type to JSONScanType::SAMPLE and create throw-away global/local scan states.
//!   2. Initialize bind_data.date_format_map with the candidate DATE / TIMESTAMP formats.
//!   3. Read batches until bind_data.sample_size values were seen or the input is exhausted, folding each non-null
//!      value into one JSONStructureNode and refining its candidate types when it contains VARCHAR.
//!   4. Restore the scan type, point transform_options.date_format_map at the bind data map, and convert the node
//!      to a LogicalType.
//!   5. A STRUCT result becomes one column per child; any other type becomes a single column named json.
//!   6. On the STRUCT path, open readers are reset and moved into bind_data.stored_readers.
+ void AutoDetect(ClientContext &context, JSONScanData &bind_data, vector<LogicalType> &return_types,
9
+ vector<string> &names) {
10
+ auto original_scan_type = bind_data.type;
11
+ bind_data.type = JSONScanType::SAMPLE; // Set scan type to sample for the auto-detect, we restore it later
12
+ JSONScanGlobalState gstate(context, bind_data);
13
+ JSONScanLocalState lstate(context, gstate);
14
+ ArenaAllocator allocator(BufferAllocator::Get(context));
15
+
16
+ static const unordered_map<LogicalTypeId, vector<const char *>, LogicalTypeIdHash> FORMAT_TEMPLATES = {
17
+ {LogicalTypeId::DATE, {"%m-%d-%Y", "%m-%d-%y", "%d-%m-%Y", "%d-%m-%y", "%Y-%m-%d", "%y-%m-%d"}},
18
+ {LogicalTypeId::TIMESTAMP,
19
+ {"%Y-%m-%d %H:%M:%S.%f", "%m-%d-%Y %I:%M:%S %p", "%m-%d-%y %I:%M:%S %p", "%d-%m-%Y %H:%M:%S",
20
+ "%d-%m-%y %H:%M:%S", "%Y-%m-%d %H:%M:%S", "%y-%m-%d %H:%M:%S"}},
21
+ };
22
+
23
+ // Populate possible date/timestamp formats, assume this is consistent across columns
24
+ bind_data.date_format_map.Initialize(FORMAT_TEMPLATES);
25
+
26
+ // Read for the specified sample size
27
+ JSONStructureNode node;
28
+ Vector string_vector(LogicalType::VARCHAR);
29
+ idx_t read = 0;
30
+ while (read < bind_data.sample_size) {
31
+ allocator.Reset();
32
+ auto count = lstate.ReadNext(gstate);
33
+ if (count == 0) {
34
+ break;
35
+ }
36
// `i` is declared outside the loop because it is reused after the loop as the number of values to refine.
+ idx_t i;
37
+ for (i = 0; i < count; i++) {
38
+ if (lstate.objects[i]) {
39
+ JSONStructure::ExtractStructure(lstate.objects[i], node);
40
+ }
41
+ if (++read == bind_data.sample_size) {
42
+ break;
43
+ }
44
+ }
45
+ if (!node.ContainsVarchar()) { // Can't refine non-VARCHAR types
46
+ continue;
47
+ }
48
+ node.InitializeCandidateTypes(bind_data.max_depth);
49
// NOTE(review): when the inner loop exits through the break above, `i` was not incremented for the value just
// extracted, so `i` is one less than the number of values processed in this batch. If the second argument is a
// count (presumably - confirm), the last sampled value is not used for refinement.
+ node.RefineCandidateTypes(lstate.objects, i, string_vector, allocator, bind_data.date_format_map);
50
+ }
51
+ bind_data.type = original_scan_type;
52
+ bind_data.transform_options.date_format_map = &bind_data.date_format_map;
53
+
54
+ const auto type = JSONStructure::StructureToType(context, node, bind_data.max_depth);
55
+ if (type.id() != LogicalTypeId::STRUCT) {
56
+ return_types.emplace_back(type);
57
+ names.emplace_back("json");
58
// NOTE(review): this early return skips the reader Reset() loop and the move into bind_data.stored_readers at
// the end of the function, so for a non-STRUCT result the sampled readers are neither rewound nor kept.
// Confirm whether that is intended.
+ return;
59
+ }
60
+
61
+ const auto &child_types = StructType::GetChildTypes(type);
62
+ return_types.reserve(child_types.size());
63
+ names.reserve(child_types.size());
64
+ for (auto &child_type : child_types) {
65
+ return_types.emplace_back(child_type.second);
66
+ names.emplace_back(child_type.first);
67
+ }
68
+
69
// Rewind every reader that sampling opened, then hand the readers over to the bind data.
+ for (auto &reader : gstate.json_readers) {
70
+ if (reader->IsOpen()) {
71
+ reader->Reset();
72
+ }
73
+ }
74
+ bind_data.stored_readers = std::move(gstate.json_readers);
75
+ }
76
+
77
//! Bind function for the read_json family: determines the output columns and fills the transform options.
//! Starts from JSONScanData::Bind, then handles these named parameters (names are compared lower-cased):
//!   columns       - STRUCT whose child names are the column names and whose VARCHAR values are type strings;
//!                   each is converted with TransformStringToLogicalType. At least one column is required.
//!   auto_detect   - boolean stored in bind_data.auto_detect.
//!   sample_size   - positive number of values to sample, or -1 for no limit.
//!   maximum_depth - stored in bind_data.max_depth, with -1 meaning no limit.
//! Any other parameter name is ignored by this loop (presumably handled in JSONScanData::Bind - TODO confirm).
//! Explicit columns disable auto-detection; without columns, auto_detect must be true or a BinderException is
//! thrown. Returns the bind data created by JSONScanData::Bind.
+ unique_ptr<FunctionData> ReadJSONBind(ClientContext &context, TableFunctionBindInput &input,
78
+ vector<LogicalType> &return_types, vector<string> &names) {
79
+ // First bind default params
80
+ auto result = JSONScanData::Bind(context, input);
81
+ auto &bind_data = (JSONScanData &)*result;
82
+
83
+ for (auto &kv : input.named_parameters) {
84
+ auto loption = StringUtil::Lower(kv.first);
85
+ if (loption == "columns") {
86
+ auto &child_type = kv.second.type();
87
+ if (child_type.id() != LogicalTypeId::STRUCT) {
88
+ throw BinderException("read_json \"columns\" parameter requires a struct as input");
89
+ }
90
+ auto &struct_children = StructValue::GetChildren(kv.second);
91
+ D_ASSERT(StructType::GetChildCount(child_type) == struct_children.size());
92
+ for (idx_t i = 0; i < struct_children.size(); i++) {
93
+ auto &name = StructType::GetChildName(child_type, i);
94
+ auto &val = struct_children[i];
95
+ names.push_back(name);
96
+ if (val.type().id() != LogicalTypeId::VARCHAR) {
97
+ throw BinderException("read_json \"columns\" parameter type specification must be VARCHAR");
98
+ }
99
+ return_types.emplace_back(TransformStringToLogicalType(StringValue::Get(val), context));
100
+ }
101
+ D_ASSERT(names.size() == return_types.size());
102
+ if (names.empty()) {
103
+ throw BinderException("read_json \"columns\" parameter needs at least one column");
104
+ }
105
+ bind_data.names = names;
106
+ } else if (loption == "auto_detect") {
107
+ bind_data.auto_detect = BooleanValue::Get(kv.second);
108
+ } else if (loption == "sample_size") {
109
+ auto arg = BigIntValue::Get(kv.second);
110
+ if (arg == -1) {
111
+ bind_data.sample_size = NumericLimits<idx_t>::Maximum();
112
+ } else if (arg > 0) {
113
+ bind_data.sample_size = arg;
114
+ } else {
115
+ throw BinderException(
116
+ "read_json \"sample_size\" parameter must be positive, or -1 to sample the entire file");
117
+ }
118
+ } else if (loption == "maximum_depth") {
119
+ auto arg = BigIntValue::Get(kv.second);
120
+ if (arg == -1) {
121
+ bind_data.max_depth = NumericLimits<idx_t>::Maximum();
122
+ } else {
123
// NOTE(review): unlike sample_size, values below -1 are not rejected here; they are assigned as-is to max_depth.
+ bind_data.max_depth = arg;
124
+ }
125
+ }
126
+ }
127
+
128
+ if (!bind_data.names.empty()) {
129
+ bind_data.auto_detect = false; // override auto-detect when columns are specified
130
+ } else if (!bind_data.auto_detect) {
131
+ throw BinderException("read_json \"columns\" parameter is required when auto_detect is false");
132
+ }
133
+
134
+ if (bind_data.auto_detect) {
135
+ AutoDetect(context, bind_data, return_types, names);
136
+ bind_data.names = names;
137
+ }
138
+
139
// Error behaviour is derived from ignore_errors: strict casting and duplicate-key errors are on unless errors
// are ignored; missing keys never raise; unknown keys raise only for auto-detected schemas.
+ auto &transform_options = bind_data.transform_options;
140
+ transform_options.strict_cast = !bind_data.ignore_errors;
141
+ transform_options.error_duplicate_key = !bind_data.ignore_errors;
142
+ transform_options.error_missing_key = false;
143
+ transform_options.error_unknown_key = bind_data.auto_detect && !bind_data.ignore_errors;
144
+
145
+ return result;
146
+ }
147
+
148
//! Scan function of the read_json table functions: reads the next batch of JSON values through the local scan
//! state and transforms them into the columns of `output`.
//! The output column count must equal the number of bound column names (asserted). Each output column is passed
//! by pointer to JSONTransform::TransformObject together with the bound names and transform options, and the
//! chunk cardinality is set to the number of values read.
+ static void ReadJSONFunction(ClientContext &context, TableFunctionInput &data_p, DataChunk &output) {
149
+ auto &gstate = ((JSONGlobalTableFunctionState &)*data_p.global_state).state;
150
+ auto &lstate = ((JSONLocalTableFunctionState &)*data_p.local_state).state;
151
+ D_ASSERT(output.ColumnCount() == gstate.bind_data.names.size());
152
+
153
+ // Fetch next lines
154
+ const auto count = lstate.ReadNext(gstate);
155
+ const auto objects = lstate.objects;
156
+
157
+ vector<Vector *> result_vectors;
158
+ result_vectors.reserve(output.ColumnCount());
159
+ for (idx_t col_idx = 0; col_idx < output.ColumnCount(); col_idx++) {
160
+ result_vectors.push_back(&output.data[col_idx]);
161
+ }
162
+
163
+ // TODO: if errors occur during transformation, we don't have line number information
164
+ JSONTransform::TransformObject(objects, lstate.GetAllocator(), count, gstate.bind_data.names, result_vectors,
165
+ gstate.bind_data.transform_options);
166
+ output.SetCardinality(count);
167
+ }
168
+
169
//! Builds the TableFunction shared by the read_json variants.
//!   list_parameter - true: the single positional argument is LIST(VARCHAR); false: it is VARCHAR.
//!   function_info  - scan info moved into the function's function_info slot.
//! Wires ReadJSONFunction / ReadJSONBind and the JSON global/local state initializers, applies
//! JSONScan::TableFunctionDefaults, registers the named parameters columns, auto_detect and sample_size, and
//! enables projection pushdown.
// NOTE(review): ReadJSONBind also reads a maximum_depth parameter; it is not registered here but is added by
// GetReadJSONAutoTableFunction.
+ TableFunction GetReadJSONTableFunction(bool list_parameter, shared_ptr<JSONScanInfo> function_info) {
170
+ auto parameter = list_parameter ? LogicalType::LIST(LogicalType::VARCHAR) : LogicalType::VARCHAR;
171
+ TableFunction table_function({parameter}, ReadJSONFunction, ReadJSONBind, JSONGlobalTableFunctionState::Init,
172
+ JSONLocalTableFunctionState::Init);
173
+
174
+ JSONScan::TableFunctionDefaults(table_function);
175
+ table_function.named_parameters["columns"] = LogicalType::ANY;
176
+ table_function.named_parameters["auto_detect"] = LogicalType::BOOLEAN;
177
+ table_function.named_parameters["sample_size"] = LogicalType::BIGINT;
178
+
179
+ table_function.projection_pushdown = true;
180
+
181
+ table_function.function_info = std::move(function_info);
182
+
183
+ return table_function;
184
+ }
185
+
186
//! Same as GetReadJSONTableFunction, plus the BIGINT named parameter maximum_depth.
//! Used by the *_auto function sets defined in this file.
+ TableFunction GetReadJSONAutoTableFunction(bool list_parameter, shared_ptr<JSONScanInfo> function_info) {
187
+ auto table_function = GetReadJSONTableFunction(list_parameter, std::move(function_info));
188
+ table_function.named_parameters["maximum_depth"] = LogicalType::BIGINT;
189
+ return table_function;
190
+ }
191
+
192
//! Creates the read_json function set: one overload taking a VARCHAR and one taking a LIST(VARCHAR).
//! Both overloads share one JSONScanInfo built with JSONScanType::READ_JSON, JSONFormat::UNSTRUCTURED and false
//! as the third constructor argument (presumably the auto_detect default - TODO confirm against JSONScanInfo).
+ CreateTableFunctionInfo JSONFunctions::GetReadJSONFunction() {
193
+ TableFunctionSet function_set("read_json");
194
+ auto function_info = make_shared<JSONScanInfo>(JSONScanType::READ_JSON, JSONFormat::UNSTRUCTURED, false);
195
+ function_set.AddFunction(GetReadJSONTableFunction(false, function_info));
196
+ function_set.AddFunction(GetReadJSONTableFunction(true, function_info));
197
+ return CreateTableFunctionInfo(function_set);
198
+ }
199
+
200
//! Creates the read_ndjson function set: one overload taking a VARCHAR and one taking a LIST(VARCHAR).
//! Differs from read_json only in the format passed to JSONScanInfo: JSONFormat::NEWLINE_DELIMITED.
//! The third constructor argument is false (presumably the auto_detect default - TODO confirm).
+ CreateTableFunctionInfo JSONFunctions::GetReadNDJSONFunction() {
201
+ TableFunctionSet function_set("read_ndjson");
202
+ auto function_info = make_shared<JSONScanInfo>(JSONScanType::READ_JSON, JSONFormat::NEWLINE_DELIMITED, false);
203
+ function_set.AddFunction(GetReadJSONTableFunction(false, function_info));
204
+ function_set.AddFunction(GetReadJSONTableFunction(true, function_info));
205
+ return CreateTableFunctionInfo(function_set);
206
+ }
207
+
208
//! Creates the read_json_auto function set: one overload taking a VARCHAR and one taking a LIST(VARCHAR).
//! Uses GetReadJSONAutoTableFunction, so both overloads also accept maximum_depth. The shared JSONScanInfo is
//! built with JSONFormat::AUTO_DETECT and true as the third argument (presumably auto_detect - TODO confirm).
+ CreateTableFunctionInfo JSONFunctions::GetReadJSONAutoFunction() {
209
+ TableFunctionSet function_set("read_json_auto");
210
+ auto function_info = make_shared<JSONScanInfo>(JSONScanType::READ_JSON, JSONFormat::AUTO_DETECT, true);
211
+ function_set.AddFunction(GetReadJSONAutoTableFunction(false, function_info));
212
+ function_set.AddFunction(GetReadJSONAutoTableFunction(true, function_info));
213
+ return CreateTableFunctionInfo(function_set);
214
+ }
215
+
216
//! Creates the read_ndjson_auto function set: one overload taking a VARCHAR and one taking a LIST(VARCHAR).
//! Uses GetReadJSONAutoTableFunction, so both overloads also accept maximum_depth. The shared JSONScanInfo is
//! built with JSONFormat::NEWLINE_DELIMITED and true as the third argument (presumably auto_detect - TODO confirm).
+ CreateTableFunctionInfo JSONFunctions::GetReadNDJSONAutoFunction() {
217
+ TableFunctionSet function_set("read_ndjson_auto");
218
+ auto function_info = make_shared<JSONScanInfo>(JSONScanType::READ_JSON, JSONFormat::NEWLINE_DELIMITED, true);
219
+ function_set.AddFunction(GetReadJSONAutoTableFunction(false, function_info));
220
+ function_set.AddFunction(GetReadJSONAutoTableFunction(true, function_info));
221
+ return CreateTableFunctionInfo(function_set);
222
+ }
223
+
224
+ } // namespace duckdb