duckdb 0.7.1-dev16.0 → 0.7.1-dev187.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. package/binding.gyp +7 -7
  2. package/package.json +1 -1
  3. package/src/duckdb/extension/json/buffered_json_reader.cpp +29 -5
  4. package/src/duckdb/extension/json/include/buffered_json_reader.hpp +5 -1
  5. package/src/duckdb/extension/json/include/json_scan.hpp +17 -2
  6. package/src/duckdb/extension/json/json_functions/json_transform.cpp +19 -0
  7. package/src/duckdb/extension/json/json_functions/read_json.cpp +30 -28
  8. package/src/duckdb/extension/json/json_functions.cpp +6 -0
  9. package/src/duckdb/extension/json/json_scan.cpp +111 -23
  10. package/src/duckdb/extension/parquet/parquet-extension.cpp +3 -2
  11. package/src/duckdb/src/common/enums/logical_operator_type.cpp +2 -0
  12. package/src/duckdb/src/common/enums/physical_operator_type.cpp +2 -0
  13. package/src/duckdb/src/common/enums/statement_type.cpp +2 -0
  14. package/src/duckdb/src/common/file_system.cpp +14 -0
  15. package/src/duckdb/src/common/hive_partitioning.cpp +1 -0
  16. package/src/duckdb/src/common/operator/cast_operators.cpp +14 -8
  17. package/src/duckdb/src/common/printer.cpp +1 -1
  18. package/src/duckdb/src/common/types/time.cpp +1 -1
  19. package/src/duckdb/src/common/types/timestamp.cpp +35 -4
  20. package/src/duckdb/src/common/types.cpp +36 -10
  21. package/src/duckdb/src/execution/column_binding_resolver.cpp +5 -2
  22. package/src/duckdb/src/execution/operator/join/physical_iejoin.cpp +7 -9
  23. package/src/duckdb/src/execution/operator/persistent/base_csv_reader.cpp +6 -11
  24. package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp +13 -13
  25. package/src/duckdb/src/execution/operator/persistent/parallel_csv_reader.cpp +1 -1
  26. package/src/duckdb/src/execution/operator/schema/physical_detach.cpp +37 -0
  27. package/src/duckdb/src/execution/operator/schema/physical_drop.cpp +0 -5
  28. package/src/duckdb/src/execution/physical_plan/plan_simple.cpp +4 -0
  29. package/src/duckdb/src/execution/physical_plan_generator.cpp +1 -0
  30. package/src/duckdb/src/function/pragma/pragma_queries.cpp +36 -9
  31. package/src/duckdb/src/function/table/read_csv.cpp +15 -4
  32. package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
  33. package/src/duckdb/src/include/duckdb/common/enums/logical_operator_type.hpp +1 -0
  34. package/src/duckdb/src/include/duckdb/common/enums/physical_operator_type.hpp +1 -0
  35. package/src/duckdb/src/include/duckdb/common/enums/statement_type.hpp +2 -1
  36. package/src/duckdb/src/include/duckdb/common/exception.hpp +10 -0
  37. package/src/duckdb/src/include/duckdb/common/file_system.hpp +1 -0
  38. package/src/duckdb/src/include/duckdb/common/hive_partitioning.hpp +9 -1
  39. package/src/duckdb/src/include/duckdb/common/radix_partitioning.hpp +4 -4
  40. package/src/duckdb/src/include/duckdb/common/types/timestamp.hpp +5 -1
  41. package/src/duckdb/src/include/duckdb/execution/operator/persistent/base_csv_reader.hpp +1 -3
  42. package/src/duckdb/src/include/duckdb/execution/operator/persistent/buffered_csv_reader.hpp +0 -2
  43. package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_reader_options.hpp +2 -0
  44. package/src/duckdb/src/include/duckdb/execution/operator/schema/physical_detach.hpp +32 -0
  45. package/src/duckdb/src/include/duckdb/main/client_data.hpp +2 -2
  46. package/src/duckdb/src/include/duckdb/main/config.hpp +0 -3
  47. package/src/duckdb/src/include/duckdb/parser/parsed_data/create_database_info.hpp +0 -4
  48. package/src/duckdb/src/include/duckdb/parser/parsed_data/detach_info.hpp +32 -0
  49. package/src/duckdb/src/include/duckdb/parser/statement/detach_statement.hpp +29 -0
  50. package/src/duckdb/src/include/duckdb/parser/statement/list.hpp +1 -0
  51. package/src/duckdb/src/include/duckdb/parser/tokens.hpp +1 -0
  52. package/src/duckdb/src/include/duckdb/parser/transformer.hpp +1 -0
  53. package/src/duckdb/src/include/duckdb/planner/binder.hpp +1 -0
  54. package/src/duckdb/src/include/duckdb/planner/operator/logical_execute.hpp +1 -5
  55. package/src/duckdb/src/include/duckdb/planner/operator/logical_show.hpp +1 -2
  56. package/src/duckdb/src/include/duckdb/storage/storage_extension.hpp +7 -0
  57. package/src/duckdb/src/include/duckdb/storage/table/update_segment.hpp +2 -0
  58. package/src/duckdb/src/main/client_context.cpp +2 -0
  59. package/src/duckdb/src/main/extension/extension_alias.cpp +2 -1
  60. package/src/duckdb/src/optimizer/statistics/operator/propagate_join.cpp +2 -6
  61. package/src/duckdb/src/parser/statement/copy_statement.cpp +2 -13
  62. package/src/duckdb/src/parser/statement/delete_statement.cpp +3 -0
  63. package/src/duckdb/src/parser/statement/detach_statement.cpp +15 -0
  64. package/src/duckdb/src/parser/statement/insert_statement.cpp +9 -0
  65. package/src/duckdb/src/parser/statement/update_statement.cpp +3 -0
  66. package/src/duckdb/src/parser/transform/expression/transform_case.cpp +3 -3
  67. package/src/duckdb/src/parser/transform/statement/transform_create_database.cpp +0 -1
  68. package/src/duckdb/src/parser/transform/statement/transform_detach.cpp +19 -0
  69. package/src/duckdb/src/parser/transformer.cpp +2 -0
  70. package/src/duckdb/src/planner/binder/statement/bind_copy.cpp +6 -3
  71. package/src/duckdb/src/planner/binder/statement/bind_create.cpp +16 -14
  72. package/src/duckdb/src/planner/binder/statement/bind_detach.cpp +19 -0
  73. package/src/duckdb/src/planner/binder/statement/bind_drop.cpp +29 -4
  74. package/src/duckdb/src/planner/binder/statement/bind_insert.cpp +22 -1
  75. package/src/duckdb/src/planner/binder/tableref/bind_joinref.cpp +2 -1
  76. package/src/duckdb/src/planner/binder.cpp +2 -0
  77. package/src/duckdb/src/planner/expression_binder/lateral_binder.cpp +21 -5
  78. package/src/duckdb/src/planner/logical_operator.cpp +4 -0
  79. package/src/duckdb/src/planner/planner.cpp +1 -0
  80. package/src/duckdb/src/storage/storage_info.cpp +2 -1
  81. package/src/duckdb/src/storage/table/column_data.cpp +4 -2
  82. package/src/duckdb/src/storage/table/update_segment.cpp +15 -0
  83. package/src/duckdb/third_party/fmt/include/fmt/core.h +1 -2
  84. package/src/duckdb/third_party/libpg_query/include/nodes/nodes.hpp +1 -0
  85. package/src/duckdb/third_party/libpg_query/include/nodes/parsenodes.hpp +14 -0
  86. package/src/duckdb/third_party/libpg_query/include/parser/gram.hpp +530 -1006
  87. package/src/duckdb/third_party/libpg_query/src_backend_parser_gram.cpp +17659 -17626
  88. package/src/duckdb/ub_extension_icu_third_party_icu_i18n.cpp +4 -4
  89. package/src/duckdb/ub_src_execution_operator_schema.cpp +2 -0
  90. package/src/duckdb/ub_src_parser_statement.cpp +2 -0
  91. package/src/duckdb/ub_src_parser_transform_statement.cpp +2 -0
  92. package/src/duckdb/ub_src_planner_binder_statement.cpp +2 -0
  93. package/src/duckdb/src/include/duckdb/function/create_database_extension.hpp +0 -37
package/binding.gyp CHANGED
@@ -222,16 +222,16 @@
   "src/duckdb/third_party/zstd/compress/zstd_lazy.cpp",
   "src/duckdb/third_party/zstd/compress/zstd_ldm.cpp",
   "src/duckdb/third_party/zstd/compress/zstd_opt.cpp",
-  "src/duckdb/extension/icu/./icu-timezone.cpp",
-  "src/duckdb/extension/icu/./icu-makedate.cpp",
-  "src/duckdb/extension/icu/./icu-datepart.cpp",
-  "src/duckdb/extension/icu/./icu-datesub.cpp",
+  "src/duckdb/extension/icu/./icu-dateadd.cpp",
   "src/duckdb/extension/icu/./icu-datetrunc.cpp",
-  "src/duckdb/extension/icu/./icu-timebucket.cpp",
   "src/duckdb/extension/icu/./icu-strptime.cpp",
-  "src/duckdb/extension/icu/./icu-extension.cpp",
-  "src/duckdb/extension/icu/./icu-dateadd.cpp",
   "src/duckdb/extension/icu/./icu-datefunc.cpp",
+  "src/duckdb/extension/icu/./icu-extension.cpp",
+  "src/duckdb/extension/icu/./icu-makedate.cpp",
+  "src/duckdb/extension/icu/./icu-timezone.cpp",
+  "src/duckdb/extension/icu/./icu-datesub.cpp",
+  "src/duckdb/extension/icu/./icu-timebucket.cpp",
+  "src/duckdb/extension/icu/./icu-datepart.cpp",
   "src/duckdb/ub_extension_icu_third_party_icu_common.cpp",
   "src/duckdb/ub_extension_icu_third_party_icu_i18n.cpp",
   "src/duckdb/extension/icu/third_party/icu/stubdata/stubdata.cpp",
package/package.json CHANGED
@@ -2,7 +2,7 @@
   "name": "duckdb",
   "main": "./lib/duckdb.js",
   "types": "./lib/duckdb.d.ts",
-  "version": "0.7.1-dev16.0",
+  "version": "0.7.1-dev187.0",
   "description": "DuckDB node.js API",
   "gypfile": true,
   "dependencies": {
package/src/duckdb/extension/json/buffered_json_reader.cpp CHANGED
@@ -25,7 +25,12 @@ JSONBufferHandle::JSONBufferHandle(idx_t buffer_index_p, idx_t readers_p, Alloca
 JSONFileHandle::JSONFileHandle(unique_ptr<FileHandle> file_handle_p, Allocator &allocator_p)
     : file_handle(std::move(file_handle_p)), allocator(allocator_p), can_seek(file_handle->CanSeek()),
       plain_file_source(file_handle->OnDiskFile() && can_seek), file_size(file_handle->GetFileSize()), read_position(0),
-      cached_size(0) {
+      requested_reads(0), actual_reads(0), cached_size(0) {
+}
+
+void JSONFileHandle::Close() {
+    file_handle->Close();
+    cached_buffers.clear();
 }
 
 idx_t JSONFileHandle::FileSize() const {
@@ -36,10 +41,6 @@ idx_t JSONFileHandle::Remaining() const {
     return file_size - read_position;
 }
 
-bool JSONFileHandle::PlainFileSource() const {
-    return plain_file_source;
-}
-
 bool JSONFileHandle::CanSeek() const {
     return can_seek;
 }
@@ -53,6 +54,9 @@ idx_t JSONFileHandle::GetPositionAndSize(idx_t &position, idx_t requested_size)
     position = read_position;
     auto actual_size = MinValue<idx_t>(requested_size, Remaining());
     read_position += actual_size;
+    if (actual_size != 0) {
+        requested_reads++;
+    }
     return actual_size;
 }
 
@@ -60,11 +64,13 @@ void JSONFileHandle::ReadAtPosition(const char *pointer, idx_t size, idx_t posit
     D_ASSERT(size != 0);
     if (plain_file_source) {
         file_handle->Read((void *)pointer, size, position);
+        actual_reads++;
         return;
     }
 
     if (sample_run) { // Cache the buffer
         file_handle->Read((void *)pointer, size, position);
+        actual_reads++;
         cached_buffers.emplace_back(allocator.Allocate(size));
         memcpy(cached_buffers.back().get(), pointer, size);
         cached_size += size;
@@ -73,9 +79,11 @@ void JSONFileHandle::ReadAtPosition(const char *pointer, idx_t size, idx_t posit
 
     if (!cached_buffers.empty() || position < cached_size) {
         ReadFromCache(pointer, size, position);
+        actual_reads++;
     }
     if (size != 0) {
         file_handle->Read((void *)pointer, size, position);
+        actual_reads++;
     }
 }
 
@@ -143,6 +151,16 @@ void BufferedJSONReader::OpenJSONFile() {
     file_handle = make_unique<JSONFileHandle>(std::move(regular_file_handle), BufferAllocator::Get(context));
 }
 
+void BufferedJSONReader::CloseJSONFile() {
+    while (true) {
+        lock_guard<mutex> guard(lock);
+        if (file_handle->RequestedReadsComplete()) {
+            file_handle->Close();
+            break;
+        }
+    }
+}
+
 bool BufferedJSONReader::IsOpen() {
     return file_handle != nullptr;
 }
@@ -246,9 +264,15 @@ void BufferedJSONReader::Reset() {
 
 void JSONFileHandle::Reset() {
     read_position = 0;
+    requested_reads = 0;
+    actual_reads = 0;
     if (plain_file_source) {
         file_handle->Reset();
    }
 }
 
+bool JSONFileHandle::RequestedReadsComplete() {
+    return requested_reads == actual_reads;
+}
+
 } // namespace duckdb
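
Note: the requested_reads/actual_reads pair added above is a small read-completion handshake: GetPositionAndSize hands out byte ranges under the reader's lock and counts each request, reader threads bump the atomic actual_reads when their read finishes, and CloseJSONFile spins until both counters agree before closing the handle. A minimal standalone sketch of the same pattern (types and names here are stand-ins, not DuckDB's):

#include <atomic>
#include <cstdint>
#include <mutex>

// A handle that may only be closed once every read that was handed out
// has actually completed, mirroring requested_reads/actual_reads above.
struct TrackedHandle {
    std::mutex lock;                       // serializes handing out read ranges
    uint64_t requested_reads = 0;          // only modified while holding `lock`
    std::atomic<uint64_t> actual_reads{0}; // bumped by any reader thread

    void ReserveRead() { // call under `lock` when a read range is handed out
        requested_reads++;
    }

    void FinishRead() { // call from the reader thread once its read is done
        actual_reads++;
    }

    void CloseWhenDone() { // spin until all reserved reads have finished
        while (true) {
            std::lock_guard<std::mutex> guard(lock);
            if (actual_reads == requested_reads) {
                // safe to close the underlying file handle here
                break;
            }
        }
    }
};

Keeping requested_reads a plain integer while actual_reads is atomic works because requests are only ever issued under the lock, whereas completions arrive from arbitrary threads.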
package/src/duckdb/extension/json/include/buffered_json_reader.hpp CHANGED
@@ -58,11 +58,11 @@ public:
 struct JSONFileHandle {
 public:
     JSONFileHandle(unique_ptr<FileHandle> file_handle, Allocator &allocator);
+    void Close();
 
     idx_t FileSize() const;
     idx_t Remaining() const;
 
-    bool PlainFileSource() const;
     bool CanSeek() const;
     void Seek(idx_t position);
 
@@ -71,6 +71,7 @@ public:
     idx_t Read(const char *pointer, idx_t requested_size, bool sample_run);
 
     void Reset();
+    bool RequestedReadsComplete();
 
 private:
     idx_t ReadFromCache(const char *&pointer, idx_t &size, idx_t &position);
@@ -87,6 +88,8 @@ private:
 
     //! Read properties
     idx_t read_position;
+    idx_t requested_reads;
+    atomic<idx_t> actual_reads;
 
     //! Cached buffers for resetting when reading stream
     vector<AllocatedData> cached_buffers;
@@ -98,6 +101,7 @@ public:
     BufferedJSONReader(ClientContext &context, BufferedJSONReaderOptions options, string file_path);
 
     void OpenJSONFile();
+    void CloseJSONFile();
     bool IsOpen();
 
     BufferedJSONReaderOptions &GetOptions();
package/src/duckdb/extension/json/include/json_scan.hpp CHANGED
@@ -26,6 +26,16 @@ enum class JSONScanType : uint8_t {
     SAMPLE = 3,
 };
 
+enum class JSONScanTopLevelType : uint8_t {
+    INVALID = 0,
+    //! Sequential objects, e.g., NDJSON
+    OBJECTS = 1,
+    //! Top-level array containing objects
+    ARRAY_OF_OBJECTS = 2,
+    //! Other, e.g., array of integer, or just strings
+    OTHER = 3
+};
+
 //! Even though LogicalTypeId is just a uint8_t, this is still needed ...
 struct LogicalTypeIdHash {
     inline std::size_t operator()(const LogicalTypeId &id) const {
@@ -105,7 +115,7 @@ public:
     //! Max depth we go to detect nested JSON schema (defaults to unlimited)
     idx_t max_depth = NumericLimits<idx_t>::Maximum();
     //! Whether we're parsing objects (usually), or something else like arrays
-    bool objects = true;
+    JSONScanTopLevelType top_level_type = JSONScanTopLevelType::OBJECTS;
     //! Forced date/timestamp formats
     string date_format;
     string timestamp_format;
@@ -181,9 +191,14 @@ public:
     yyjson_alc *GetAllocator();
     void ThrowTransformError(idx_t count, idx_t object_index, const string &error_message);
 
+    idx_t scan_count;
     JSONLine lines[STANDARD_VECTOR_SIZE];
     yyjson_val *objects[STANDARD_VECTOR_SIZE];
 
+    idx_t array_idx;
+    idx_t array_offset;
+    yyjson_val *array_objects[STANDARD_VECTOR_SIZE];
+
     idx_t batch_index;
 
     //! Options when transforming the JSON to columnar data
@@ -192,6 +207,7 @@
 
 private:
     yyjson_val *ParseLine(char *line_start, idx_t line_size, idx_t remaining, JSONLine &line);
+    idx_t GetObjectsFromArray();
 
 private:
     //! Bind data
@@ -300,7 +316,6 @@ public:
     table_function.serialize = JSONScanSerialize;
     table_function.deserialize = JSONScanDeserialize;
 
-    // TODO: might be able to do some of these
     table_function.projection_pushdown = false;
     table_function.filter_pushdown = false;
     table_function.filter_prune = false;
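
Note: the new JSONScanTopLevelType, together with scan_count/array_idx/array_offset, lets a scan re-consume one batch of parsed top-level values element by element when the input is a single top-level array. How auto-detection picks a variant is shown in the read_json.cpp hunks later in this diff; the decision reduces to roughly this sketch (Shape is a stand-in for the detected LogicalType, not a DuckDB type):

// Hedged sketch of the top-level classification performed during auto-detection.
enum class TopLevelType { OBJECTS, ARRAY_OF_OBJECTS, OTHER };

struct Shape {
    bool is_struct;         // sampled values look like JSON objects
    bool is_list_of_struct; // a sampled value is an array of objects
};

TopLevelType Classify(const Shape &shape, bool more_than_one_value) {
    if (shape.is_struct) {
        return TopLevelType::OBJECTS; // NDJSON-style: one object per value
    }
    if (!more_than_one_value && shape.is_list_of_struct) {
        // exactly one top-level array of objects: unwrap it and emit
        // its elements as rows (read single-threaded as UNSTRUCTURED)
        return TopLevelType::ARRAY_OF_OBJECTS;
    }
    return TopLevelType::OTHER; // e.g., array of integers: one "json" column
}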
package/src/duckdb/extension/json/json_functions/json_transform.cpp CHANGED
@@ -523,6 +523,21 @@ static bool TransformArray(yyjson_val *arrays[], yyjson_alc *alc, Vector &result
     return success;
 }
 
+bool TransformToJSON(yyjson_val *vals[], yyjson_alc *alc, Vector &result, const idx_t count) {
+    auto data = (string_t *)FlatVector::GetData(result);
+    auto &validity = FlatVector::Validity(result);
+    for (idx_t i = 0; i < count; i++) {
+        const auto &val = vals[i];
+        if (!val) {
+            validity.SetInvalid(i);
+        } else {
+            data[i] = JSONCommon::WriteVal(val, alc);
+        }
+    }
+    // Can always transform to JSON
+    return true;
+}
+
 bool JSONTransform::Transform(yyjson_val *vals[], yyjson_alc *alc, Vector &result, const idx_t count,
                               JSONTransformOptions &options) {
     auto result_type = result.GetType();
@@ -531,6 +546,10 @@ bool JSONTransform::Transform(yyjson_val *vals[], yyjson_alc *alc, Vector &resul
         return TransformFromStringWithFormat(vals, result, count, options);
     }
 
+    if (JSONCommon::LogicalTypeIsJSON(result_type)) {
+        return TransformToJSON(vals, alc, result, count);
+    }
+
     switch (result_type.id()) {
     case LogicalTypeId::SQLNULL:
         return true;
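
Note: TransformToJSON covers the case where the target column type is JSON itself: instead of converting, it re-serializes the parsed yyjson value, which can never fail. Outside DuckDB the same step looks roughly like this with yyjson's public writer API (JSONCommon::WriteVal wraps something similar, using an arena allocator rather than malloc/free):

#include <cstdio>
#include <cstdlib>
#include "yyjson.h"

// Round-trip a parsed value back to a JSON string, as TransformToJSON does
// for JSON-typed result columns.
int main() {
    const char json[] = "{\"a\": [1, 2, 3], \"b\": null}";
    yyjson_doc *doc = yyjson_read(json, sizeof(json) - 1, 0);
    if (!doc) {
        return 1;
    }
    yyjson_val *root = yyjson_doc_get_root(doc);

    size_t len;
    char *serialized = yyjson_val_write(root, 0, &len); // malloc'd C string
    if (serialized) {
        printf("%.*s\n", (int)len, serialized);
        free(serialized);
    }
    yyjson_doc_free(doc);
    return 0;
}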
package/src/duckdb/extension/json/json_functions/read_json.cpp CHANGED
@@ -13,32 +13,17 @@ void JSONScan::AutoDetect(ClientContext &context, JSONScanData &bind_data, vecto
     JSONScanLocalState lstate(context, gstate);
     ArenaAllocator allocator(BufferAllocator::Get(context));
 
-    static const unordered_map<LogicalTypeId, vector<const char *>, LogicalTypeIdHash> FORMAT_TEMPLATES = {
-        {LogicalTypeId::DATE, {"%m-%d-%Y", "%m-%d-%y", "%d-%m-%Y", "%d-%m-%y", "%Y-%m-%d", "%y-%m-%d"}},
-        {LogicalTypeId::TIMESTAMP,
-         {"%Y-%m-%d %H:%M:%S.%f", "%m-%d-%Y %I:%M:%S %p", "%m-%d-%y %I:%M:%S %p", "%d-%m-%Y %H:%M:%S",
-          "%d-%m-%y %H:%M:%S", "%Y-%m-%d %H:%M:%S", "%y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%SZ"}},
-    };
-
-    // Populate possible date/timestamp formats, assume this is consistent across columns
-    for (auto &kv : FORMAT_TEMPLATES) {
-        const auto &type = kv.first;
-        if (bind_data.date_format_map.HasFormats(type)) {
-            continue; // Already populated
-        }
-        const auto &format_strings = kv.second;
-        for (auto &format_string : format_strings) {
-            bind_data.date_format_map.AddFormat(type, format_string);
-        }
-    }
-
     // Read for the specified sample size
     JSONStructureNode node;
+    bool more_than_one = false;
     Vector string_vector(LogicalType::VARCHAR);
     idx_t remaining = bind_data.sample_size;
     while (remaining != 0) {
         allocator.Reset();
         auto read_count = lstate.ReadNext(gstate);
+        if (read_count > 1) {
+            more_than_one = true;
+        }
         if (read_count == 0) {
             break;
         }
@@ -54,15 +39,29 @@ void JSONScan::AutoDetect(ClientContext &context, JSONScanData &bind_data, vecto
         node.InitializeCandidateTypes(bind_data.max_depth);
         node.RefineCandidateTypes(lstate.objects, next, string_vector, allocator, bind_data.date_format_map);
         remaining -= next;
+
+        if (gstate.file_index == 10) {
+            // We really shouldn't open more than 10 files when sampling
+            break;
+        }
     }
     bind_data.type = original_scan_type;
     bind_data.transform_options.date_format_map = &bind_data.date_format_map;
 
-    const auto type = JSONStructure::StructureToType(context, node, bind_data.max_depth);
+    auto type = JSONStructure::StructureToType(context, node, bind_data.max_depth);
+    if (type.id() == LogicalTypeId::STRUCT) {
+        bind_data.top_level_type = JSONScanTopLevelType::OBJECTS;
+    } else if (!more_than_one && type.id() == LogicalTypeId::LIST &&
+               ListType::GetChildType(type).id() == LogicalTypeId::STRUCT) {
+        bind_data.top_level_type = JSONScanTopLevelType::ARRAY_OF_OBJECTS;
+        bind_data.options.format = JSONFormat::UNSTRUCTURED;
+        type = ListType::GetChildType(type);
+    }
+
     if (type.id() != LogicalTypeId::STRUCT) {
         return_types.emplace_back(type);
         names.emplace_back("json");
-        bind_data.objects = false;
+        bind_data.top_level_type = JSONScanTopLevelType::OTHER;
     } else {
         const auto &child_types = StructType::GetChildTypes(type);
         return_types.reserve(child_types.size());
@@ -189,9 +188,11 @@ static void ReadJSONFunction(ClientContext &context, TableFunctionInput &data_p,
     auto &gstate = ((JSONGlobalTableFunctionState &)*data_p.global_state).state;
     auto &lstate = ((JSONLocalTableFunctionState &)*data_p.local_state).state;
 
-    // Fetch next lines
     const auto count = lstate.ReadNext(gstate);
-    const auto objects = lstate.objects;
+    const auto objects = gstate.bind_data.top_level_type == JSONScanTopLevelType::ARRAY_OF_OBJECTS
+                             ? lstate.array_objects
+                             : lstate.objects;
+    output.SetCardinality(count);
 
     vector<Vector *> result_vectors;
     result_vectors.reserve(output.ColumnCount());
@@ -202,13 +203,14 @@ static void ReadJSONFunction(ClientContext &context, TableFunctionInput &data_p,
 
     // Pass current reader to transform options so we can get line number information if an error occurs
     bool success;
-    if (gstate.bind_data.objects) {
-        success = JSONTransform::TransformObject(objects, lstate.GetAllocator(), count, gstate.bind_data.names,
-                                                 result_vectors, lstate.transform_options);
-    } else {
+    if (gstate.bind_data.top_level_type == JSONScanTopLevelType::OTHER) {
         success = JSONTransform::Transform(objects, lstate.GetAllocator(), *result_vectors[0], count,
                                            lstate.transform_options);
+    } else {
+        success = JSONTransform::TransformObject(objects, lstate.GetAllocator(), count, gstate.bind_data.names,
+                                                 result_vectors, lstate.transform_options);
     }
+
     if (!success) {
         string hint = gstate.bind_data.auto_detect
                           ? "\nTry increasing 'sample_size', reducing 'maximum_depth', specifying 'columns' manually, "
@@ -217,7 +219,6 @@ static void ReadJSONFunction(ClientContext &context, TableFunctionInput &data_p,
         lstate.ThrowTransformError(count, lstate.transform_options.object_index,
                                    lstate.transform_options.error_message + hint);
     }
-    output.SetCardinality(count);
 }
 
 TableFunction JSONFunctions::GetReadJSONTableFunction(bool list_parameter, shared_ptr<JSONScanInfo> function_info) {
@@ -235,6 +236,7 @@ TableFunction JSONFunctions::GetReadJSONTableFunction(bool list_parameter, share
     table_function.named_parameters["timestamp_format"] = LogicalType::VARCHAR;
 
     table_function.projection_pushdown = true;
+    // TODO: might be able to do filter pushdown/prune too
 
     table_function.function_info = std::move(function_info);
 
package/src/duckdb/extension/json/json_functions.cpp CHANGED
@@ -166,6 +166,12 @@ vector<CreateTableFunctionInfo> JSONFunctions::GetTableFunctions() {
 unique_ptr<TableRef> JSONFunctions::ReadJSONReplacement(ClientContext &context, const string &table_name,
                                                         ReplacementScanData *data) {
     auto lower_name = StringUtil::Lower(table_name);
+    // remove any compression
+    if (StringUtil::EndsWith(lower_name, ".gz")) {
+        lower_name = lower_name.substr(0, lower_name.size() - 3);
+    } else if (StringUtil::EndsWith(lower_name, ".zst")) {
+        lower_name = lower_name.substr(0, lower_name.size() - 4);
+    }
     if (!StringUtil::EndsWith(lower_name, ".json") && !StringUtil::Contains(lower_name, ".json?") &&
         !StringUtil::EndsWith(lower_name, ".ndjson") && !StringUtil::Contains(lower_name, ".ndjson?")) {
         return nullptr;
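
Note: stripping the compression suffix before the extension check means replacement scans now also match names like data.json.gz or logs.ndjson.zst, so querying such a path directly resolves to the JSON reader. A standalone sketch of the matching rule (helper names are illustrative; the query-string cases like ".json?" are omitted):

#include <string>

static bool EndsWith(const std::string &s, const std::string &suffix) {
    return s.size() >= suffix.size() &&
           s.compare(s.size() - suffix.size(), suffix.size(), suffix) == 0;
}

// Decide whether a (lower-cased) file name should be routed to read_json,
// mirroring the suffix logic in the hunk above.
bool LooksLikeJSONFile(std::string name) {
    // remove any compression suffix first
    if (EndsWith(name, ".gz")) {
        name = name.substr(0, name.size() - 3);
    } else if (EndsWith(name, ".zst")) {
        name = name.substr(0, name.size() - 4);
    }
    return EndsWith(name, ".json") || EndsWith(name, ".ndjson");
}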
package/src/duckdb/extension/json/json_scan.cpp CHANGED
@@ -3,6 +3,7 @@
 #include "duckdb/main/database.hpp"
 #include "duckdb/parallel/task_scheduler.hpp"
 #include "duckdb/storage/buffer_manager.hpp"
+#include "duckdb/main/extension_helper.hpp"
 
 namespace duckdb {
 
@@ -47,8 +48,11 @@ unique_ptr<FunctionData> JSONScanData::Bind(ClientContext &context, TableFunctio
             options.format = JSONFormat::UNSTRUCTURED;
         } else if (format == "newline_delimited") {
             options.format = JSONFormat::NEWLINE_DELIMITED;
+        } else if (format == "array_of_objects") {
+            result->top_level_type = JSONScanTopLevelType::ARRAY_OF_OBJECTS;
         } else {
-            throw BinderException("format must be one of ['auto', 'unstructured', 'newline_delimited']");
+            throw BinderException(
+                "format must be one of ['auto', 'unstructured', 'newline_delimited', 'array_of_objects']");
         }
     } else if (loption == "compression") {
         auto compression = StringUtil::Lower(StringValue::Get(kv.second));
@@ -66,6 +70,10 @@
         }
     }
 
+    if (result->top_level_type == JSONScanTopLevelType::ARRAY_OF_OBJECTS) {
+        result->options.format = JSONFormat::UNSTRUCTURED;
+    }
+
     return std::move(result);
 }
 
@@ -75,7 +83,7 @@ void JSONScanData::InitializeFilePaths(ClientContext &context, const vector<stri
     for (auto &file_pattern : patterns) {
         auto found_files = fs.Glob(file_pattern, context);
         if (found_files.empty()) {
-            throw IOException("No files found that match the pattern \"%s\"", file_pattern);
+            throw FileSystem::MissingFileException(file_pattern, context);
         }
         file_paths.insert(file_paths.end(), found_files.begin(), found_files.end());
     }
@@ -97,6 +105,27 @@ void JSONScanData::InitializeFormats() {
     if (!timestamp_format.empty()) {
         date_format_map.AddFormat(LogicalTypeId::TIMESTAMP, timestamp_format);
     }
+
+    if (auto_detect) {
+        static const unordered_map<LogicalTypeId, vector<const char *>, LogicalTypeIdHash> FORMAT_TEMPLATES = {
+            {LogicalTypeId::DATE, {"%m-%d-%Y", "%m-%d-%y", "%d-%m-%Y", "%d-%m-%y", "%Y-%m-%d", "%y-%m-%d"}},
+            {LogicalTypeId::TIMESTAMP,
+             {"%Y-%m-%d %H:%M:%S.%f", "%m-%d-%Y %I:%M:%S %p", "%m-%d-%y %I:%M:%S %p", "%d-%m-%Y %H:%M:%S",
+              "%d-%m-%y %H:%M:%S", "%Y-%m-%d %H:%M:%S", "%y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%SZ"}},
+        };
+
+        // Populate possible date/timestamp formats, assume this is consistent across columns
+        for (auto &kv : FORMAT_TEMPLATES) {
+            const auto &type = kv.first;
+            if (date_format_map.HasFormats(type)) {
+                continue; // Already populated
+            }
+            const auto &format_strings = kv.second;
+            for (auto &format_string : format_strings) {
+                date_format_map.AddFormat(type, format_string);
+            }
+        }
+    }
 }
 
 void JSONScanData::Serialize(FieldWriter &writer) {
@@ -111,9 +140,17 @@ void JSONScanData::Serialize(FieldWriter &writer) {
     writer.WriteList<string>(names);
     writer.WriteList<idx_t>(valid_cols);
     writer.WriteField<idx_t>(max_depth);
-    writer.WriteField<bool>(objects);
-    writer.WriteString(date_format);
-    writer.WriteString(timestamp_format);
+    writer.WriteField<JSONScanTopLevelType>(top_level_type);
+    if (!date_format.empty()) {
+        writer.WriteString(date_format);
+    } else {
+        writer.WriteString(date_format_map.GetFormat(LogicalTypeId::DATE).format_specifier);
+    }
+    if (!timestamp_format.empty()) {
+        writer.WriteString(timestamp_format);
+    } else {
+        writer.WriteString(date_format_map.GetFormat(LogicalTypeId::TIMESTAMP).format_specifier);
+    }
 }
 
 void JSONScanData::Deserialize(FieldReader &reader) {
@@ -128,9 +165,12 @@ void JSONScanData::Deserialize(FieldReader &reader) {
     names = reader.ReadRequiredList<string>();
     valid_cols = reader.ReadRequiredList<idx_t>();
     max_depth = reader.ReadRequired<idx_t>();
-    objects = reader.ReadRequired<bool>();
+    top_level_type = reader.ReadRequired<JSONScanTopLevelType>();
    date_format = reader.ReadRequired<string>();
     timestamp_format = reader.ReadRequired<string>();
+
+    InitializeFormats();
+    transform_options.date_format_map = &date_format_map;
 }
 
 JSONScanGlobalState::JSONScanGlobalState(ClientContext &context, JSONScanData &bind_data_p)
@@ -149,9 +189,9 @@ JSONScanGlobalState::JSONScanGlobalState(ClientContext &context, JSONScanData &b
 }
 
 JSONScanLocalState::JSONScanLocalState(ClientContext &context, JSONScanGlobalState &gstate)
-    : batch_index(DConstants::INVALID_INDEX), bind_data(gstate.bind_data),
+    : scan_count(0), array_idx(0), array_offset(0), batch_index(DConstants::INVALID_INDEX), bind_data(gstate.bind_data),
       json_allocator(BufferAllocator::Get(context)), current_reader(nullptr), current_buffer_handle(nullptr),
-      buffer_size(0), buffer_offset(0), prev_buffer_remainder(0) {
+      is_last(false), buffer_size(0), buffer_offset(0), prev_buffer_remainder(0) {
 
     // Buffer to reconstruct JSON objects when they cross a buffer boundary
     reconstruct_buffer = gstate.allocator.Allocate(gstate.bind_data.maximum_object_size + YYJSON_PADDING_SIZE);
@@ -173,11 +213,6 @@ unique_ptr<GlobalTableFunctionState> JSONGlobalTableFunctionState::Init(ClientCo
     // Perform projection pushdown
     if (bind_data.type == JSONScanType::READ_JSON) {
        D_ASSERT(input.column_ids.size() <= bind_data.names.size()); // Can't project to have more columns
-        if (bind_data.auto_detect && input.column_ids.size() < bind_data.names.size()) {
-            // If we are auto-detecting, but don't need all columns present in the file,
-            // then we don't need to throw an error if we encounter an unseen column
-            bind_data.transform_options.error_unknown_key = false;
-        }
         vector<string> names;
         names.reserve(input.column_ids.size());
         for (idx_t i = 0; i < input.column_ids.size(); i++) {
@@ -188,6 +223,11 @@ unique_ptr<GlobalTableFunctionState> JSONGlobalTableFunctionState::Init(ClientCo
             names.push_back(std::move(bind_data.names[id]));
             bind_data.valid_cols.push_back(i);
         }
+        if (names.size() < bind_data.names.size()) {
+            // If we are auto-detecting, but don't need all columns present in the file,
+            // then we don't need to throw an error if we encounter an unseen column
+            bind_data.transform_options.error_unknown_key = false;
+        }
         bind_data.names = std::move(names);
     }
     return result;
@@ -230,6 +270,10 @@ static inline void SkipWhitespace(const char *buffer_ptr, idx_t &buffer_offset,
 idx_t JSONScanLocalState::ReadNext(JSONScanGlobalState &gstate) {
     json_allocator.Reset();
 
+    if (gstate.bind_data.top_level_type == JSONScanTopLevelType::ARRAY_OF_OBJECTS && array_idx < scan_count) {
+        return GetObjectsFromArray();
+    }
+
     idx_t count = 0;
     if (buffer_offset == buffer_size) {
        if (!ReadNextBuffer(gstate)) {
@@ -253,10 +297,20 @@ idx_t JSONScanLocalState::ReadNext(JSONScanGlobalState &gstate) {
     default:
         throw InternalException("Unknown JSON format");
     }
+    scan_count = count;
 
     // Skip over any remaining whitespace for the next scan
     SkipWhitespace(buffer_ptr, buffer_offset, buffer_size);
 
+    if (gstate.bind_data.top_level_type == JSONScanTopLevelType::ARRAY_OF_OBJECTS) {
+        if (scan_count > 1) {
+            throw InvalidInputException("File must have exactly one array of objects when format='array_of_objects'");
+        }
+        array_idx = 0;
+        array_offset = 0;
+        return GetObjectsFromArray();
+    }
+
     return count;
 }
 
@@ -331,10 +385,39 @@ yyjson_val *JSONScanLocalState::ParseLine(char *line_start, idx_t line_size, idx
     }
 }
 
+idx_t JSONScanLocalState::GetObjectsFromArray() {
+    idx_t arr_count = 0;
+
+    size_t idx, max;
+    yyjson_val *val;
+    for (; array_idx < scan_count; array_idx++, array_offset = 0) {
+        if (objects[array_idx]) {
+            yyjson_arr_foreach(objects[array_idx], idx, max, val) {
+                if (idx < array_offset) {
+                    continue;
+                }
+                array_objects[arr_count++] = val;
+                if (arr_count == STANDARD_VECTOR_SIZE) {
+                    break;
+                }
+            }
+            array_offset = idx + 1;
+            if (arr_count == STANDARD_VECTOR_SIZE) {
+                break;
+            }
+        }
+    }
+    return arr_count;
+}
+
 bool JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate) {
     if (current_reader) {
         D_ASSERT(current_buffer_handle);
         current_reader->SetBufferLineOrObjectCount(current_buffer_handle->buffer_index, lines_or_objects_in_buffer);
+        if (is_last && gstate.bind_data.type != JSONScanType::SAMPLE) {
+            // Close files that are done if we're not sampling
+            current_reader->CloseJSONFile();
+        }
     }
 
     AllocatedData buffer;
@@ -395,7 +478,9 @@ bool JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate) {
         // Unopened file
         current_reader->OpenJSONFile();
         batch_index = gstate.batch_index++;
-        if (options.format == JSONFormat::UNSTRUCTURED) {
+        if (options.format == JSONFormat::UNSTRUCTURED || (options.format == JSONFormat::NEWLINE_DELIMITED &&
+                                                           options.compression != FileCompressionType::UNCOMPRESSED &&
+                                                           gstate.file_index < gstate.json_readers.size())) {
             gstate.file_index++; // UNSTRUCTURED necessitates single-threaded read
         }
         if (options.format != JSONFormat::AUTO_DETECT) {
@@ -449,9 +534,6 @@ bool JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate) {
     auto json_buffer_handle = make_unique<JSONBufferHandle>(buffer_index, readers, std::move(buffer), buffer_size);
     current_buffer_handle = json_buffer_handle.get();
     current_reader->InsertBuffer(buffer_index, std::move(json_buffer_handle));
-    if (!current_reader->GetFileHandle().PlainFileSource() && gstate.bind_data.type == JSONScanType::SAMPLE) {
-        // TODO: store buffer
-    }
 
     buffer_offset = 0;
     prev_buffer_remainder = 0;
@@ -507,16 +589,18 @@ void JSONScanLocalState::ReadNextBufferSeek(JSONScanGlobalState &gstate, idx_t &
 }
 
 void JSONScanLocalState::ReadNextBufferNoSeek(JSONScanGlobalState &gstate, idx_t &buffer_index) {
-    auto &file_handle = current_reader->GetFileHandle();
-
     idx_t request_size = gstate.buffer_capacity - prev_buffer_remainder - YYJSON_PADDING_SIZE;
     idx_t read_size;
     {
         lock_guard<mutex> reader_guard(current_reader->lock);
         buffer_index = current_reader->GetBufferIndex();
 
-        read_size = file_handle.Read(buffer_ptr + prev_buffer_remainder, request_size,
-                                     gstate.bind_data.type == JSONScanType::SAMPLE);
+        if (current_reader->IsOpen()) {
+            read_size = current_reader->GetFileHandle().Read(buffer_ptr + prev_buffer_remainder, request_size,
+                                                             gstate.bind_data.type == JSONScanType::SAMPLE);
+        } else {
+            read_size = 0;
+        }
         is_last = read_size < request_size;
 
         if (!gstate.bind_data.ignore_errors && read_size == 0 && prev_buffer_remainder != 0) {
@@ -582,6 +666,11 @@ void JSONScanLocalState::ReconstructFirstObject(JSONScanGlobalState &gstate) {
 }
 
 void JSONScanLocalState::ReadUnstructured(idx_t &count) {
+    // yyjson does not always return YYJSON_READ_ERROR_UNEXPECTED_END properly
+    // if a different error code happens within the last 50 bytes
+    // we assume it should be YYJSON_READ_ERROR_UNEXPECTED_END instead
+    static constexpr idx_t END_BOUND = 50;
+
     const auto max_obj_size = reconstruct_buffer.GetSize();
     yyjson_read_err error;
     for (; count < STANDARD_VECTOR_SIZE; count++) {
@@ -607,8 +696,7 @@ void JSONScanLocalState::ReadUnstructured(idx_t &count) {
         } else if (error.pos > max_obj_size) {
             current_reader->ThrowParseError(current_buffer_handle->buffer_index, lines_or_objects_in_buffer, error,
                                             "Try increasing \"maximum_object_size\".");
-
-        } else if (error.code == YYJSON_READ_ERROR_UNEXPECTED_END && !is_last) {
+        } else if (!is_last && (error.code == YYJSON_READ_ERROR_UNEXPECTED_END || remaining - error.pos < END_BOUND)) {
             // Copy remaining to reconstruct_buffer
             const auto reconstruct_ptr = reconstruct_buffer.get();
             memcpy(reconstruct_ptr, obj_copy_start, remaining);
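
Note: GetObjectsFromArray is what makes format='array_of_objects' stream in vector-sized chunks: ReadNext parses the single top-level array once, then each call emits the next STANDARD_VECTOR_SIZE elements, remembering array_offset because yyjson_arr_foreach can only restart from the front of the array. A self-contained sketch of that resumable chunking (CHUNK_SIZE stands in for STANDARD_VECTOR_SIZE):

#include <cstddef>
#include "yyjson.h"

static constexpr size_t CHUNK_SIZE = 2048;

// Emits elements of a parsed yyjson array CHUNK_SIZE at a time; `offset`
// records how many elements earlier calls already produced, and the foreach
// below skips past them, since yyjson iteration cannot resume mid-array.
struct ArrayCursor {
    yyjson_val *arr;   // parsed top-level array
    size_t offset = 0; // elements already emitted

    size_t Next(yyjson_val *out[]) {
        if (!arr || !yyjson_is_arr(arr)) {
            return 0;
        }
        size_t count = 0;
        size_t idx, max;
        yyjson_val *val;
        yyjson_arr_foreach(arr, idx, max, val) {
            if (idx < offset) {
                continue; // emitted by a previous call
            }
            out[count++] = val;
            if (count == CHUNK_SIZE) {
                break;
            }
        }
        offset += count;
        return count;
    }
};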
package/src/duckdb/extension/parquet/parquet-extension.cpp CHANGED
@@ -223,7 +223,7 @@ public:
     FileSystem &fs = FileSystem::GetFileSystem(context);
     auto files = fs.Glob(info.file_path, context);
     if (files.empty()) {
-        throw IOException("No files found that match the pattern \"%s\"", info.file_path);
+        throw FileSystem::MissingFileException(info.file_path, context);
     }
 
     // The most likely path (Parquet read without union by name option)
@@ -363,8 +363,9 @@
 
 static vector<string> ParquetGlob(FileSystem &fs, const string &glob, ClientContext &context) {
     auto files = fs.Glob(glob, FileSystem::GetFileOpener(context));
+
     if (files.empty()) {
-        throw IOException("No files found that match the pattern \"%s\"", glob);
+        throw FileSystem::MissingFileException(glob, context);
     }
     return files;
 }