duckdb 0.7.1-dev90.0 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131)
  1. package/README.md +1 -1
  2. package/binding.gyp +7 -7
  3. package/package.json +3 -3
  4. package/src/duckdb/extension/json/buffered_json_reader.cpp +50 -9
  5. package/src/duckdb/extension/json/include/buffered_json_reader.hpp +7 -2
  6. package/src/duckdb/extension/json/include/json_scan.hpp +45 -10
  7. package/src/duckdb/extension/json/json_functions/copy_json.cpp +35 -22
  8. package/src/duckdb/extension/json/json_functions/json_create.cpp +8 -8
  9. package/src/duckdb/extension/json/json_functions/json_structure.cpp +8 -3
  10. package/src/duckdb/extension/json/json_functions/json_transform.cpp +54 -10
  11. package/src/duckdb/extension/json/json_functions/read_json.cpp +104 -49
  12. package/src/duckdb/extension/json/json_functions/read_json_objects.cpp +5 -3
  13. package/src/duckdb/extension/json/json_functions.cpp +7 -0
  14. package/src/duckdb/extension/json/json_scan.cpp +144 -38
  15. package/src/duckdb/extension/parquet/column_reader.cpp +7 -0
  16. package/src/duckdb/extension/parquet/include/column_reader.hpp +1 -0
  17. package/src/duckdb/extension/parquet/parquet-extension.cpp +2 -10
  18. package/src/duckdb/src/catalog/catalog.cpp +62 -13
  19. package/src/duckdb/src/catalog/catalog_entry/index_catalog_entry.cpp +8 -7
  20. package/src/duckdb/src/catalog/catalog_entry/schema_catalog_entry.cpp +1 -1
  21. package/src/duckdb/src/catalog/catalog_set.cpp +1 -1
  22. package/src/duckdb/src/catalog/default/default_functions.cpp +1 -0
  23. package/src/duckdb/src/catalog/default/default_views.cpp +1 -1
  24. package/src/duckdb/src/common/bind_helpers.cpp +55 -0
  25. package/src/duckdb/src/common/file_system.cpp +23 -9
  26. package/src/duckdb/src/common/hive_partitioning.cpp +1 -0
  27. package/src/duckdb/src/common/local_file_system.cpp +4 -4
  28. package/src/duckdb/src/common/string_util.cpp +8 -4
  29. package/src/duckdb/src/common/types/partitioned_column_data.cpp +1 -0
  30. package/src/duckdb/src/common/types.cpp +37 -11
  31. package/src/duckdb/src/execution/column_binding_resolver.cpp +5 -2
  32. package/src/duckdb/src/execution/index/art/art.cpp +117 -67
  33. package/src/duckdb/src/execution/index/art/art_key.cpp +24 -12
  34. package/src/duckdb/src/execution/index/art/leaf.cpp +7 -8
  35. package/src/duckdb/src/execution/index/art/node.cpp +13 -27
  36. package/src/duckdb/src/execution/index/art/node16.cpp +5 -8
  37. package/src/duckdb/src/execution/index/art/node256.cpp +3 -5
  38. package/src/duckdb/src/execution/index/art/node4.cpp +4 -7
  39. package/src/duckdb/src/execution/index/art/node48.cpp +5 -8
  40. package/src/duckdb/src/execution/index/art/prefix.cpp +2 -3
  41. package/src/duckdb/src/execution/operator/aggregate/physical_window.cpp +6 -27
  42. package/src/duckdb/src/execution/operator/helper/physical_reset.cpp +1 -9
  43. package/src/duckdb/src/execution/operator/helper/physical_set.cpp +1 -9
  44. package/src/duckdb/src/execution/operator/join/physical_iejoin.cpp +7 -9
  45. package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp +9 -0
  46. package/src/duckdb/src/execution/physical_operator.cpp +6 -6
  47. package/src/duckdb/src/function/pragma/pragma_queries.cpp +38 -11
  48. package/src/duckdb/src/function/scalar/generic/current_setting.cpp +2 -2
  49. package/src/duckdb/src/function/scalar/list/array_slice.cpp +2 -3
  50. package/src/duckdb/src/function/scalar/map/map.cpp +69 -21
  51. package/src/duckdb/src/function/scalar/string/like.cpp +6 -3
  52. package/src/duckdb/src/function/table/read_csv.cpp +16 -5
  53. package/src/duckdb/src/function/table/system/duckdb_temporary_files.cpp +59 -0
  54. package/src/duckdb/src/function/table/system_functions.cpp +1 -0
  55. package/src/duckdb/src/function/table/table_scan.cpp +3 -0
  56. package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
  57. package/src/duckdb/src/include/duckdb/catalog/catalog.hpp +7 -1
  58. package/src/duckdb/src/include/duckdb/catalog/catalog_entry/duck_index_entry.hpp +1 -1
  59. package/src/duckdb/src/include/duckdb/catalog/catalog_entry/index_catalog_entry.hpp +1 -1
  60. package/src/duckdb/src/include/duckdb/common/bind_helpers.hpp +2 -0
  61. package/src/duckdb/src/include/duckdb/common/enums/statement_type.hpp +1 -1
  62. package/src/duckdb/src/include/duckdb/common/enums/wal_type.hpp +3 -0
  63. package/src/duckdb/src/include/duckdb/common/file_system.hpp +1 -1
  64. package/src/duckdb/src/include/duckdb/common/hive_partitioning.hpp +9 -1
  65. package/src/duckdb/src/include/duckdb/common/radix_partitioning.hpp +4 -4
  66. package/src/duckdb/src/include/duckdb/common/string_util.hpp +9 -2
  67. package/src/duckdb/src/include/duckdb/execution/index/art/art.hpp +37 -41
  68. package/src/duckdb/src/include/duckdb/execution/index/art/art_key.hpp +8 -11
  69. package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_reader_options.hpp +2 -0
  70. package/src/duckdb/src/include/duckdb/function/scalar/string_functions.hpp +2 -1
  71. package/src/duckdb/src/include/duckdb/function/table/system_functions.hpp +4 -0
  72. package/src/duckdb/src/include/duckdb/main/client_data.hpp +2 -2
  73. package/src/duckdb/src/include/duckdb/main/config.hpp +2 -0
  74. package/src/duckdb/src/include/duckdb/main/{extension_functions.hpp → extension_entries.hpp} +27 -5
  75. package/src/duckdb/src/include/duckdb/main/extension_helper.hpp +11 -1
  76. package/src/duckdb/src/include/duckdb/main/settings.hpp +9 -0
  77. package/src/duckdb/src/include/duckdb/parallel/pipeline_executor.hpp +0 -7
  78. package/src/duckdb/src/include/duckdb/parser/query_node/select_node.hpp +1 -1
  79. package/src/duckdb/src/include/duckdb/parser/sql_statement.hpp +2 -2
  80. package/src/duckdb/src/include/duckdb/parser/statement/copy_statement.hpp +1 -1
  81. package/src/duckdb/src/include/duckdb/parser/statement/select_statement.hpp +3 -3
  82. package/src/duckdb/src/include/duckdb/parser/tableref/subqueryref.hpp +1 -1
  83. package/src/duckdb/src/include/duckdb/planner/binder.hpp +3 -0
  84. package/src/duckdb/src/include/duckdb/planner/expression_binder/index_binder.hpp +10 -3
  85. package/src/duckdb/src/include/duckdb/planner/operator/logical_execute.hpp +1 -5
  86. package/src/duckdb/src/include/duckdb/planner/operator/logical_show.hpp +1 -2
  87. package/src/duckdb/src/include/duckdb/storage/buffer_manager.hpp +8 -0
  88. package/src/duckdb/src/include/duckdb/storage/data_table.hpp +7 -1
  89. package/src/duckdb/src/include/duckdb/storage/index.hpp +47 -38
  90. package/src/duckdb/src/include/duckdb/storage/write_ahead_log.hpp +7 -0
  91. package/src/duckdb/src/main/client_context.cpp +2 -0
  92. package/src/duckdb/src/main/config.cpp +1 -0
  93. package/src/duckdb/src/main/database.cpp +14 -5
  94. package/src/duckdb/src/main/extension/extension_alias.cpp +2 -1
  95. package/src/duckdb/src/main/extension/extension_helper.cpp +15 -0
  96. package/src/duckdb/src/main/extension/extension_install.cpp +60 -16
  97. package/src/duckdb/src/main/extension/extension_load.cpp +62 -13
  98. package/src/duckdb/src/main/settings/settings.cpp +16 -0
  99. package/src/duckdb/src/optimizer/statistics/operator/propagate_join.cpp +2 -6
  100. package/src/duckdb/src/parallel/pipeline_executor.cpp +1 -55
  101. package/src/duckdb/src/parser/parsed_data/create_index_info.cpp +3 -0
  102. package/src/duckdb/src/parser/statement/copy_statement.cpp +2 -13
  103. package/src/duckdb/src/parser/statement/delete_statement.cpp +3 -0
  104. package/src/duckdb/src/parser/statement/insert_statement.cpp +9 -0
  105. package/src/duckdb/src/parser/statement/update_statement.cpp +3 -0
  106. package/src/duckdb/src/parser/transform/expression/transform_case.cpp +3 -3
  107. package/src/duckdb/src/planner/bind_context.cpp +1 -1
  108. package/src/duckdb/src/planner/binder/expression/bind_aggregate_expression.cpp +3 -0
  109. package/src/duckdb/src/planner/binder/statement/bind_copy.cpp +7 -14
  110. package/src/duckdb/src/planner/binder/statement/bind_create_table.cpp +13 -0
  111. package/src/duckdb/src/planner/binder/statement/bind_drop.cpp +2 -2
  112. package/src/duckdb/src/planner/binder/statement/bind_insert.cpp +22 -1
  113. package/src/duckdb/src/planner/expression_binder/index_binder.cpp +32 -1
  114. package/src/duckdb/src/planner/logical_operator.cpp +4 -1
  115. package/src/duckdb/src/storage/buffer_manager.cpp +105 -26
  116. package/src/duckdb/src/storage/compression/bitpacking.cpp +16 -7
  117. package/src/duckdb/src/storage/data_table.cpp +66 -3
  118. package/src/duckdb/src/storage/index.cpp +1 -1
  119. package/src/duckdb/src/storage/local_storage.cpp +1 -1
  120. package/src/duckdb/src/storage/table_index_list.cpp +1 -2
  121. package/src/duckdb/src/storage/wal_replay.cpp +68 -0
  122. package/src/duckdb/src/storage/write_ahead_log.cpp +21 -1
  123. package/src/duckdb/src/transaction/commit_state.cpp +5 -2
  124. package/src/duckdb/third_party/concurrentqueue/blockingconcurrentqueue.h +2 -2
  125. package/src/duckdb/third_party/fmt/include/fmt/core.h +1 -2
  126. package/src/duckdb/ub_extension_icu_third_party_icu_i18n.cpp +4 -4
  127. package/src/duckdb/ub_src_function_table_system.cpp +2 -0
  128. package/src/statement.cpp +46 -12
  129. package/test/arrow.test.ts +3 -3
  130. package/test/prepare.test.ts +39 -1
  131. package/test/typescript_decls.test.ts +1 -1
@@ -1,9 +1,9 @@
1
1
  #include "json_scan.hpp"
2
2
 
3
3
  #include "duckdb/main/database.hpp"
4
+ #include "duckdb/main/extension_helper.hpp"
4
5
  #include "duckdb/parallel/task_scheduler.hpp"
5
6
  #include "duckdb/storage/buffer_manager.hpp"
6
- #include "duckdb/main/extension_helper.hpp"
7
7
 
8
8
  namespace duckdb {
9
9
 
@@ -20,8 +20,9 @@ unique_ptr<FunctionData> JSONScanData::Bind(ClientContext &context, TableFunctio
20
20
  auto &options = result->options;
21
21
 
22
22
  auto &info = (JSONScanInfo &)*input.info;
23
- options.format = info.format;
24
23
  result->type = info.type;
24
+ options.format = info.format;
25
+ result->record_type = info.record_type;
25
26
  result->auto_detect = info.auto_detect;
26
27
 
27
28
  vector<string> patterns;
@@ -40,16 +41,16 @@ unique_ptr<FunctionData> JSONScanData::Bind(ClientContext &context, TableFunctio
40
41
  result->ignore_errors = BooleanValue::Get(kv.second);
41
42
  } else if (loption == "maximum_object_size") {
42
43
  result->maximum_object_size = MaxValue<idx_t>(UIntegerValue::Get(kv.second), result->maximum_object_size);
43
- } else if (loption == "format") {
44
+ } else if (loption == "lines") {
44
45
  auto format = StringUtil::Lower(StringValue::Get(kv.second));
45
46
  if (format == "auto") {
46
47
  options.format = JSONFormat::AUTO_DETECT;
47
- } else if (format == "unstructured") {
48
+ } else if (format == "false") {
48
49
  options.format = JSONFormat::UNSTRUCTURED;
49
- } else if (format == "newline_delimited") {
50
+ } else if (format == "true") {
50
51
  options.format = JSONFormat::NEWLINE_DELIMITED;
51
52
  } else {
52
- throw BinderException("format must be one of ['auto', 'unstructured', 'newline_delimited']");
53
+ throw BinderException("\"lines\" must be one of ['auto', 'true', 'false']");
53
54
  }
54
55
  } else if (loption == "compression") {
55
56
  auto compression = StringUtil::Lower(StringValue::Get(kv.second));
@@ -74,10 +75,7 @@ void JSONScanData::InitializeFilePaths(ClientContext &context, const vector<stri
74
75
  vector<string> &file_paths) {
75
76
  auto &fs = FileSystem::GetFileSystem(context);
76
77
  for (auto &file_pattern : patterns) {
77
- auto found_files = fs.Glob(file_pattern, context);
78
- if (found_files.empty()) {
79
- throw FileSystem::MissingFileException(file_pattern, context);
80
- }
78
+ auto found_files = fs.GlobFiles(file_pattern, context);
81
79
  file_paths.insert(file_paths.end(), found_files.begin(), found_files.end());
82
80
  }
83
81
  }
@@ -98,6 +96,27 @@ void JSONScanData::InitializeFormats() {
98
96
  if (!timestamp_format.empty()) {
99
97
  date_format_map.AddFormat(LogicalTypeId::TIMESTAMP, timestamp_format);
100
98
  }
99
+
100
+ if (auto_detect) {
101
+ static const unordered_map<LogicalTypeId, vector<const char *>, LogicalTypeIdHash> FORMAT_TEMPLATES = {
102
+ {LogicalTypeId::DATE, {"%m-%d-%Y", "%m-%d-%y", "%d-%m-%Y", "%d-%m-%y", "%Y-%m-%d", "%y-%m-%d"}},
103
+ {LogicalTypeId::TIMESTAMP,
104
+ {"%Y-%m-%d %H:%M:%S.%f", "%m-%d-%Y %I:%M:%S %p", "%m-%d-%y %I:%M:%S %p", "%d-%m-%Y %H:%M:%S",
105
+ "%d-%m-%y %H:%M:%S", "%Y-%m-%d %H:%M:%S", "%y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%SZ"}},
106
+ };
107
+
108
+ // Populate possible date/timestamp formats, assume this is consistent across columns
109
+ for (auto &kv : FORMAT_TEMPLATES) {
110
+ const auto &type = kv.first;
111
+ if (date_format_map.HasFormats(type)) {
112
+ continue; // Already populated
113
+ }
114
+ const auto &format_strings = kv.second;
115
+ for (auto &format_string : format_strings) {
116
+ date_format_map.AddFormat(type, format_string);
117
+ }
118
+ }
119
+ }
101
120
  }
102
121
 
103
122
  void JSONScanData::Serialize(FieldWriter &writer) {
@@ -112,9 +131,17 @@ void JSONScanData::Serialize(FieldWriter &writer) {
112
131
  writer.WriteList<string>(names);
113
132
  writer.WriteList<idx_t>(valid_cols);
114
133
  writer.WriteField<idx_t>(max_depth);
115
- writer.WriteField<bool>(objects);
116
- writer.WriteString(date_format);
117
- writer.WriteString(timestamp_format);
134
+ writer.WriteField<JSONRecordType>(record_type);
135
+ if (!date_format.empty()) {
136
+ writer.WriteString(date_format);
137
+ } else {
138
+ writer.WriteString(date_format_map.GetFormat(LogicalTypeId::DATE).format_specifier);
139
+ }
140
+ if (!timestamp_format.empty()) {
141
+ writer.WriteString(timestamp_format);
142
+ } else {
143
+ writer.WriteString(date_format_map.GetFormat(LogicalTypeId::TIMESTAMP).format_specifier);
144
+ }
118
145
  }
119
146
 
120
147
  void JSONScanData::Deserialize(FieldReader &reader) {
@@ -129,9 +156,12 @@ void JSONScanData::Deserialize(FieldReader &reader) {
129
156
  names = reader.ReadRequiredList<string>();
130
157
  valid_cols = reader.ReadRequiredList<idx_t>();
131
158
  max_depth = reader.ReadRequired<idx_t>();
132
- objects = reader.ReadRequired<bool>();
159
+ record_type = reader.ReadRequired<JSONRecordType>();
133
160
  date_format = reader.ReadRequired<string>();
134
161
  timestamp_format = reader.ReadRequired<string>();
162
+
163
+ InitializeFormats();
164
+ transform_options.date_format_map = &date_format_map;
135
165
  }
136
166
 
137
167
  JSONScanGlobalState::JSONScanGlobalState(ClientContext &context, JSONScanData &bind_data_p)
@@ -150,11 +180,11 @@ JSONScanGlobalState::JSONScanGlobalState(ClientContext &context, JSONScanData &b
150
180
  }
151
181
 
152
182
  JSONScanLocalState::JSONScanLocalState(ClientContext &context, JSONScanGlobalState &gstate)
153
- : batch_index(DConstants::INVALID_INDEX), bind_data(gstate.bind_data),
183
+ : scan_count(0), array_idx(0), array_offset(0), batch_index(DConstants::INVALID_INDEX), bind_data(gstate.bind_data),
154
184
  json_allocator(BufferAllocator::Get(context)), current_reader(nullptr), current_buffer_handle(nullptr),
155
- buffer_size(0), buffer_offset(0), prev_buffer_remainder(0) {
185
+ is_last(false), buffer_size(0), buffer_offset(0), prev_buffer_remainder(0) {
156
186
 
157
- // Buffer to reconstruct JSON objects when they cross a buffer boundary
187
+ // Buffer to reconstruct JSON values when they cross a buffer boundary
158
188
  reconstruct_buffer = gstate.allocator.Allocate(gstate.bind_data.maximum_object_size + YYJSON_PADDING_SIZE);
159
189
 
160
190
  // This is needed for JSONFormat::UNSTRUCTURED, to make use of YYJSON_READ_INSITU
@@ -174,11 +204,6 @@ unique_ptr<GlobalTableFunctionState> JSONGlobalTableFunctionState::Init(ClientCo
174
204
  // Perform projection pushdown
175
205
  if (bind_data.type == JSONScanType::READ_JSON) {
176
206
  D_ASSERT(input.column_ids.size() <= bind_data.names.size()); // Can't project to have more columns
177
- if (bind_data.auto_detect && input.column_ids.size() < bind_data.names.size()) {
178
- // If we are auto-detecting, but don't need all columns present in the file,
179
- // then we don't need to throw an error if we encounter an unseen column
180
- bind_data.transform_options.error_unknown_key = false;
181
- }
182
207
  vector<string> names;
183
208
  names.reserve(input.column_ids.size());
184
209
  for (idx_t i = 0; i < input.column_ids.size(); i++) {
@@ -189,13 +214,37 @@ unique_ptr<GlobalTableFunctionState> JSONGlobalTableFunctionState::Init(ClientCo
189
214
  names.push_back(std::move(bind_data.names[id]));
190
215
  bind_data.valid_cols.push_back(i);
191
216
  }
217
+ if (names.size() < bind_data.names.size()) {
218
+ // If we are auto-detecting, but don't need all columns present in the file,
219
+ // then we don't need to throw an error if we encounter an unseen column
220
+ bind_data.transform_options.error_unknown_key = false;
221
+ }
192
222
  bind_data.names = std::move(names);
193
223
  }
194
224
  return result;
195
225
  }
196
226
 
197
227
  idx_t JSONGlobalTableFunctionState::MaxThreads() const {
198
- return state.system_threads;
228
+ auto &bind_data = state.bind_data;
229
+
230
+ auto num_files = bind_data.file_paths.size();
231
+ idx_t readers_per_file;
232
+ if (bind_data.options.format == JSONFormat::UNSTRUCTURED) {
233
+ // Unstructured necessitates single thread
234
+ readers_per_file = 1;
235
+ } else if (!state.json_readers.empty() && state.json_readers[0]->IsOpen()) {
236
+ auto &reader = *state.json_readers[0];
237
+ const auto &options = reader.GetOptions();
238
+ if (options.format == JSONFormat::UNSTRUCTURED || options.compression != FileCompressionType::UNCOMPRESSED) {
239
+ // Auto-detected unstructured - same story, compression also really limits parallelism
240
+ readers_per_file = 1;
241
+ } else {
242
+ return state.system_threads;
243
+ }
244
+ } else {
245
+ return state.system_threads;
246
+ }
247
+ return num_files * readers_per_file;
199
248
  }
200
249
 
201
250
  JSONLocalTableFunctionState::JSONLocalTableFunctionState(ClientContext &context, JSONScanGlobalState &gstate)
@@ -231,6 +280,12 @@ static inline void SkipWhitespace(const char *buffer_ptr, idx_t &buffer_offset,
231
280
  idx_t JSONScanLocalState::ReadNext(JSONScanGlobalState &gstate) {
232
281
  json_allocator.Reset();
233
282
 
283
+ if ((gstate.bind_data.record_type == JSONRecordType::ARRAY_OF_RECORDS ||
284
+ gstate.bind_data.record_type == JSONRecordType::ARRAY_OF_JSON) &&
285
+ array_idx < scan_count) {
286
+ return GetObjectsFromArray(gstate);
287
+ }
288
+
234
289
  idx_t count = 0;
235
290
  if (buffer_offset == buffer_size) {
236
291
  if (!ReadNextBuffer(gstate)) {
@@ -254,10 +309,18 @@ idx_t JSONScanLocalState::ReadNext(JSONScanGlobalState &gstate) {
254
309
  default:
255
310
  throw InternalException("Unknown JSON format");
256
311
  }
312
+ scan_count = count;
257
313
 
258
314
  // Skip over any remaining whitespace for the next scan
259
315
  SkipWhitespace(buffer_ptr, buffer_offset, buffer_size);
260
316
 
317
+ if (gstate.bind_data.record_type == JSONRecordType::ARRAY_OF_RECORDS ||
318
+ gstate.bind_data.record_type == JSONRecordType::ARRAY_OF_JSON) {
319
+ array_idx = 0;
320
+ array_offset = 0;
321
+ return GetObjectsFromArray(gstate);
322
+ }
323
+
261
324
  return count;
262
325
  }
263
326
 
@@ -332,10 +395,48 @@ yyjson_val *JSONScanLocalState::ParseLine(char *line_start, idx_t line_size, idx
332
395
  }
333
396
  }
334
397
 
398
+ idx_t JSONScanLocalState::GetObjectsFromArray(JSONScanGlobalState &gstate) {
399
+ idx_t arr_count = 0;
400
+
401
+ size_t idx, max;
402
+ yyjson_val *val;
403
+ for (; array_idx < scan_count; array_idx++, array_offset = 0) {
404
+ auto &value = values[array_idx];
405
+ if (!value) {
406
+ continue;
407
+ }
408
+ if (unsafe_yyjson_is_arr(value)) {
409
+ yyjson_arr_foreach(value, idx, max, val) {
410
+ if (idx < array_offset) {
411
+ continue;
412
+ }
413
+ array_values[arr_count++] = val;
414
+ if (arr_count == STANDARD_VECTOR_SIZE) {
415
+ break;
416
+ }
417
+ }
418
+ array_offset = idx + 1;
419
+ if (arr_count == STANDARD_VECTOR_SIZE) {
420
+ break;
421
+ }
422
+ } else if (!gstate.bind_data.ignore_errors) {
423
+ ThrowTransformError(
424
+ array_idx,
425
+ StringUtil::Format("Expected JSON ARRAY but got %s: %s\nTry setting json_format to 'records'",
426
+ JSONCommon::ValTypeToString(value), JSONCommon::ValToString(value, 50)));
427
+ }
428
+ }
429
+ return arr_count;
430
+ }
431
+
335
432
  bool JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate) {
336
433
  if (current_reader) {
337
434
  D_ASSERT(current_buffer_handle);
338
435
  current_reader->SetBufferLineOrObjectCount(current_buffer_handle->buffer_index, lines_or_objects_in_buffer);
436
+ if (is_last && gstate.bind_data.type != JSONScanType::SAMPLE) {
437
+ // Close files that are done if we're not sampling
438
+ current_reader->CloseJSONFile();
439
+ }
339
440
  }
340
441
 
341
442
  AllocatedData buffer;
@@ -396,7 +497,9 @@ bool JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate) {
396
497
  // Unopened file
397
498
  current_reader->OpenJSONFile();
398
499
  batch_index = gstate.batch_index++;
399
- if (options.format == JSONFormat::UNSTRUCTURED) {
500
+ if (options.format == JSONFormat::UNSTRUCTURED || (options.format == JSONFormat::NEWLINE_DELIMITED &&
501
+ options.compression != FileCompressionType::UNCOMPRESSED &&
502
+ gstate.file_index < gstate.json_readers.size())) {
400
503
  gstate.file_index++; // UNSTRUCTURED necessitates single-threaded read
401
504
  }
402
505
  if (options.format != JSONFormat::AUTO_DETECT) {
@@ -450,9 +553,6 @@ bool JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate) {
450
553
  auto json_buffer_handle = make_unique<JSONBufferHandle>(buffer_index, readers, std::move(buffer), buffer_size);
451
554
  current_buffer_handle = json_buffer_handle.get();
452
555
  current_reader->InsertBuffer(buffer_index, std::move(json_buffer_handle));
453
- if (!current_reader->GetFileHandle().PlainFileSource() && gstate.bind_data.type == JSONScanType::SAMPLE) {
454
- // TODO: store buffer
455
- }
456
556
 
457
557
  buffer_offset = 0;
458
558
  prev_buffer_remainder = 0;
@@ -508,16 +608,18 @@ void JSONScanLocalState::ReadNextBufferSeek(JSONScanGlobalState &gstate, idx_t &
508
608
  }
509
609
 
510
610
  void JSONScanLocalState::ReadNextBufferNoSeek(JSONScanGlobalState &gstate, idx_t &buffer_index) {
511
- auto &file_handle = current_reader->GetFileHandle();
512
-
513
611
  idx_t request_size = gstate.buffer_capacity - prev_buffer_remainder - YYJSON_PADDING_SIZE;
514
612
  idx_t read_size;
515
613
  {
516
614
  lock_guard<mutex> reader_guard(current_reader->lock);
517
615
  buffer_index = current_reader->GetBufferIndex();
518
616
 
519
- read_size = file_handle.Read(buffer_ptr + prev_buffer_remainder, request_size,
520
- gstate.bind_data.type == JSONScanType::SAMPLE);
617
+ if (current_reader->IsOpen()) {
618
+ read_size = current_reader->GetFileHandle().Read(buffer_ptr + prev_buffer_remainder, request_size,
619
+ gstate.bind_data.type == JSONScanType::SAMPLE);
620
+ } else {
621
+ read_size = 0;
622
+ }
521
623
  is_last = read_size < request_size;
522
624
 
523
625
  if (!gstate.bind_data.ignore_errors && read_size == 0 && prev_buffer_remainder != 0) {
@@ -579,10 +681,15 @@ void JSONScanLocalState::ReconstructFirstObject(JSONScanGlobalState &gstate) {
579
681
  current_reader->RemoveBuffer(current_buffer_handle->buffer_index - 1);
580
682
  }
581
683
 
582
- objects[0] = ParseLine((char *)reconstruct_ptr, line_size, line_size, lines[0]);
684
+ values[0] = ParseLine((char *)reconstruct_ptr, line_size, line_size, lines[0]);
583
685
  }
584
686
 
585
687
  void JSONScanLocalState::ReadUnstructured(idx_t &count) {
688
+ // yyjson does not always return YYJSON_READ_ERROR_UNEXPECTED_END properly
689
+ // if a different error code happens within the last 50 bytes
690
+ // we assume it should be YYJSON_READ_ERROR_UNEXPECTED_END instead
691
+ static constexpr idx_t END_BOUND = 50;
692
+
586
693
  const auto max_obj_size = reconstruct_buffer.GetSize();
587
694
  yyjson_read_err error;
588
695
  for (; count < STANDARD_VECTOR_SIZE; count++) {
@@ -608,8 +715,7 @@ void JSONScanLocalState::ReadUnstructured(idx_t &count) {
608
715
  } else if (error.pos > max_obj_size) {
609
716
  current_reader->ThrowParseError(current_buffer_handle->buffer_index, lines_or_objects_in_buffer, error,
610
717
  "Try increasing \"maximum_object_size\".");
611
-
612
- } else if (error.code == YYJSON_READ_ERROR_UNEXPECTED_END && !is_last) {
718
+ } else if (!is_last && (error.code == YYJSON_READ_ERROR_UNEXPECTED_END || remaining - error.pos < END_BOUND)) {
613
719
  // Copy remaining to reconstruct_buffer
614
720
  const auto reconstruct_ptr = reconstruct_buffer.get();
615
721
  memcpy(reconstruct_ptr, obj_copy_start, remaining);
@@ -619,7 +725,7 @@ void JSONScanLocalState::ReadUnstructured(idx_t &count) {
619
725
  } else {
620
726
  current_reader->ThrowParseError(current_buffer_handle->buffer_index, lines_or_objects_in_buffer, error);
621
727
  }
622
- objects[count] = read_doc->root;
728
+ values[count] = read_doc->root;
623
729
  }
624
730
  }
625
731
 
@@ -645,7 +751,7 @@ void JSONScanLocalState::ReadNewlineDelimited(idx_t &count) {
645
751
  }
646
752
  idx_t line_size = line_end - line_start;
647
753
 
648
- objects[count] = ParseLine((char *)line_start, line_size, remaining, lines[count]);
754
+ values[count] = ParseLine((char *)line_start, line_size, remaining, lines[count]);
649
755
 
650
756
  buffer_offset += line_size;
651
757
  SkipWhitespace(buffer_ptr, buffer_offset, buffer_size);
@@ -656,11 +762,11 @@ yyjson_alc *JSONScanLocalState::GetAllocator() {
656
762
  return json_allocator.GetYYJSONAllocator();
657
763
  }
658
764
 
659
- void JSONScanLocalState::ThrowTransformError(idx_t count, idx_t object_index, const string &error_message) {
765
+ void JSONScanLocalState::ThrowTransformError(idx_t object_index, const string &error_message) {
660
766
  D_ASSERT(current_reader);
661
767
  D_ASSERT(current_buffer_handle);
662
768
  D_ASSERT(object_index != DConstants::INVALID_INDEX);
663
- auto line_or_object_in_buffer = lines_or_objects_in_buffer - count + object_index;
769
+ auto line_or_object_in_buffer = lines_or_objects_in_buffer - scan_count + object_index;
664
770
  current_reader->ThrowTransformError(current_buffer_handle->buffer_index, line_or_object_in_buffer, error_message);
665
771
  }
666
772
 
@@ -589,6 +589,7 @@ void StringColumnReader::PrepareDeltaLengthByteArray(ResizeableBuffer &buffer) {
589
589
  }
590
590
  auto length_data = (uint32_t *)length_buffer->ptr;
591
591
  byte_array_data = make_unique<Vector>(LogicalType::VARCHAR, value_count);
592
+ byte_array_count = value_count;
592
593
  auto string_data = FlatVector::GetData<string_t>(*byte_array_data);
593
594
  for (idx_t i = 0; i < value_count; i++) {
594
595
  auto str_len = length_data[i];
@@ -615,6 +616,7 @@ void StringColumnReader::PrepareDeltaByteArray(ResizeableBuffer &buffer) {
615
616
  auto prefix_data = (uint32_t *)prefix_buffer->ptr;
616
617
  auto suffix_data = (uint32_t *)suffix_buffer->ptr;
617
618
  byte_array_data = make_unique<Vector>(LogicalType::VARCHAR, prefix_count);
619
+ byte_array_count = prefix_count;
618
620
  auto string_data = FlatVector::GetData<string_t>(*byte_array_data);
619
621
  for (idx_t i = 0; i < prefix_count; i++) {
620
622
  auto str_len = prefix_data[i] + suffix_data[i];
@@ -646,6 +648,11 @@ void StringColumnReader::DeltaByteArray(uint8_t *defines, idx_t num_values, parq
646
648
  continue;
647
649
  }
648
650
  if (filter[row_idx + result_offset]) {
651
+ if (delta_offset >= byte_array_count) {
652
+ throw IOException("DELTA_BYTE_ARRAY - length mismatch between values and byte array lengths (attempted "
653
+ "read of %d from %d entries) - corrupt file?",
654
+ delta_offset + 1, byte_array_count);
655
+ }
649
656
  result_ptr[row_idx + result_offset] = string_data[delta_offset++];
650
657
  } else {
651
658
  delta_offset++;
@@ -131,6 +131,7 @@ protected:
131
131
  ParquetReader &reader;
132
132
  LogicalType type;
133
133
  unique_ptr<Vector> byte_array_data;
134
+ idx_t byte_array_count = 0;
134
135
 
135
136
  idx_t pending_skips = 0;
136
137
 
@@ -221,10 +221,7 @@ public:
221
221
  }
222
222
 
223
223
  FileSystem &fs = FileSystem::GetFileSystem(context);
224
- auto files = fs.Glob(info.file_path, context);
225
- if (files.empty()) {
226
- throw FileSystem::MissingFileException(info.file_path, context);
227
- }
224
+ auto files = fs.GlobFiles(info.file_path, context);
228
225
 
229
226
  // The most likely path (Parquet read without union by name option)
230
227
  if (!parquet_options.union_by_name) {
@@ -362,12 +359,7 @@ public:
362
359
  }
363
360
 
364
361
  static vector<string> ParquetGlob(FileSystem &fs, const string &glob, ClientContext &context) {
365
- auto files = fs.Glob(glob, FileSystem::GetFileOpener(context));
366
-
367
- if (files.empty()) {
368
- throw FileSystem::MissingFileException(glob, context);
369
- }
370
- return files;
362
+ return fs.GlobFiles(glob, context);
371
363
  }
372
364
 
373
365
  static unique_ptr<FunctionData> ParquetScanBind(ClientContext &context, TableFunctionBindInput &input,
@@ -2,6 +2,7 @@
2
2
 
3
3
  #include "duckdb/catalog/catalog_search_path.hpp"
4
4
  #include "duckdb/catalog/catalog_entry/list.hpp"
5
+ #include "duckdb/catalog/catalog_entry/table_catalog_entry.hpp"
5
6
  #include "duckdb/catalog/catalog_set.hpp"
6
7
  #include "duckdb/catalog/default/default_schemas.hpp"
7
8
  #include "duckdb/catalog/catalog_entry/type_catalog_entry.hpp"
@@ -26,7 +27,7 @@
26
27
  #include "duckdb/planner/parsed_data/bound_create_table_info.hpp"
27
28
  #include "duckdb/planner/binder.hpp"
28
29
  #include "duckdb/catalog/default/default_types.hpp"
29
- #include "duckdb/main/extension_functions.hpp"
30
+ #include "duckdb/main/extension_entries.hpp"
30
31
  #include "duckdb/main/connection.hpp"
31
32
  #include "duckdb/main/attached_database.hpp"
32
33
  #include "duckdb/main/database_manager.hpp"
@@ -251,6 +252,20 @@ CatalogEntry *Catalog::CreateCollation(CatalogTransaction transaction, SchemaCat
251
252
  return schema->CreateCollation(transaction, info);
252
253
  }
253
254
 
255
+ //===--------------------------------------------------------------------===//
256
+ // Index
257
+ //===--------------------------------------------------------------------===//
258
+ CatalogEntry *Catalog::CreateIndex(CatalogTransaction transaction, CreateIndexInfo *info) {
259
+ auto &context = transaction.GetContext();
260
+ return CreateIndex(context, info);
261
+ }
262
+
263
+ CatalogEntry *Catalog::CreateIndex(ClientContext &context, CreateIndexInfo *info) {
264
+ auto schema = GetSchema(context, info->schema);
265
+ auto table = GetEntry<TableCatalogEntry>(context, schema->name, info->table->table_name);
266
+ return schema->CreateIndex(context, info, table);
267
+ }
268
+
254
269
  //===--------------------------------------------------------------------===//
255
270
  // Lookup Structures
256
271
  //===--------------------------------------------------------------------===//
@@ -317,17 +332,26 @@ SimilarCatalogEntry Catalog::SimilarEntryInSchemas(ClientContext &context, const
317
332
  return result;
318
333
  }
319
334
 
320
- string FindExtension(const string &function_name) {
321
- auto size = sizeof(EXTENSION_FUNCTIONS) / sizeof(ExtensionFunction);
322
- auto it = std::lower_bound(
323
- EXTENSION_FUNCTIONS, EXTENSION_FUNCTIONS + size, function_name,
324
- [](const ExtensionFunction &element, const string &value) { return element.function < value; });
325
- if (it != EXTENSION_FUNCTIONS + size && it->function == function_name) {
335
+ string FindExtensionGeneric(const string &name, const ExtensionEntry entries[], idx_t size) {
336
+ auto lcase = StringUtil::Lower(name);
337
+ auto it = std::lower_bound(entries, entries + size, lcase,
338
+ [](const ExtensionEntry &element, const string &value) { return element.name < value; });
339
+ if (it != entries + size && it->name == lcase) {
326
340
  return it->extension;
327
341
  }
328
342
  return "";
329
343
  }
330
344
 
345
+ string FindExtensionForFunction(const string &name) {
346
+ idx_t size = sizeof(EXTENSION_FUNCTIONS) / sizeof(ExtensionEntry);
347
+ return FindExtensionGeneric(name, EXTENSION_FUNCTIONS, size);
348
+ }
349
+
350
+ string FindExtensionForSetting(const string &name) {
351
+ idx_t size = sizeof(EXTENSION_SETTINGS) / sizeof(ExtensionEntry);
352
+ return FindExtensionGeneric(name, EXTENSION_SETTINGS, size);
353
+ }
354
+
331
355
  vector<CatalogSearchEntry> GetCatalogEntries(ClientContext &context, const string &catalog, const string &schema) {
332
356
  vector<CatalogSearchEntry> entries;
333
357
  auto &search_path = *context.client_data->catalog_search_path;
@@ -392,6 +416,26 @@ void FindMinimalQualification(ClientContext &context, const string &catalog_name
392
416
  qualify_schema = true;
393
417
  }
394
418
 
419
+ CatalogException Catalog::UnrecognizedConfigurationError(ClientContext &context, const string &name) {
420
+ // check if the setting exists in any extensions
421
+ auto extension_name = FindExtensionForSetting(name);
422
+ if (!extension_name.empty()) {
423
+ return CatalogException(
424
+ "Setting with name \"%s\" is not in the catalog, but it exists in the %s extension.\n\nTo "
425
+ "install and load the extension, run:\nINSTALL %s;\nLOAD %s;",
426
+ name, extension_name, extension_name, extension_name);
427
+ }
428
+ // the setting is not in an extension
429
+ // get a list of all options
430
+ vector<string> potential_names = DBConfig::GetOptionNames();
431
+ for (auto &entry : DBConfig::GetConfig(context).extension_parameters) {
432
+ potential_names.push_back(entry.first);
433
+ }
434
+
435
+ throw CatalogException("unrecognized configuration parameter \"%s\"\n%s", name,
436
+ StringUtil::CandidatesErrorMessage(potential_names, name, "Did you mean"));
437
+ }
438
+
395
439
  CatalogException Catalog::CreateMissingEntryException(ClientContext &context, const string &entry_name,
396
440
  CatalogType type,
397
441
  const unordered_set<SchemaCatalogEntry *> &schemas,
@@ -408,13 +452,18 @@ CatalogException Catalog::CreateMissingEntryException(ClientContext &context, co
408
452
  unseen_schemas.insert(current_schema);
409
453
  }
410
454
  }
411
- auto unseen_entry = SimilarEntryInSchemas(context, entry_name, type, unseen_schemas);
412
- auto extension_name = FindExtension(entry_name);
413
- if (!extension_name.empty()) {
414
- return CatalogException("Function with name %s is not on the catalog, but it exists in the %s extension. To "
415
- "Install and Load the extension, run: INSTALL %s; LOAD %s;",
416
- entry_name, extension_name, extension_name, extension_name);
455
+ // check if the entry exists in any extension
456
+ if (type == CatalogType::TABLE_FUNCTION_ENTRY || type == CatalogType::SCALAR_FUNCTION_ENTRY ||
457
+ type == CatalogType::AGGREGATE_FUNCTION_ENTRY) {
458
+ auto extension_name = FindExtensionForFunction(entry_name);
459
+ if (!extension_name.empty()) {
460
+ return CatalogException(
461
+ "Function with name \"%s\" is not in the catalog, but it exists in the %s extension.\n\nTo "
462
+ "install and load the extension, run:\nINSTALL %s;\nLOAD %s;",
463
+ entry_name, extension_name, extension_name, extension_name);
464
+ }
417
465
  }
466
+ auto unseen_entry = SimilarEntryInSchemas(context, entry_name, type, unseen_schemas);
418
467
  string did_you_mean;
419
468
  if (unseen_entry.Found() && unseen_entry.distance < entry.distance) {
420
469
  // the closest matching entry requires qualification as it is not in the default search path
@@ -19,10 +19,11 @@ string IndexCatalogEntry::ToSQL() {
19
19
  return sql;
20
20
  }
21
21
 
22
- void IndexCatalogEntry::Serialize(duckdb::MetaBlockWriter &serializer) {
23
- // Here we serialize the index metadata in the following order:
24
- // schema name, table name, index name, sql, index type, index constraint type, expression list.
25
- // column_ids, unbound_expression
22
+ void IndexCatalogEntry::Serialize(Serializer &serializer) {
23
+ // here we serialize the index metadata in the following order:
24
+ // schema name, table name, index name, sql, index type, index constraint type, expression list, parsed expressions,
25
+ // column IDs
26
+
26
27
  FieldWriter writer(serializer);
27
28
  writer.WriteString(GetSchemaName());
28
29
  writer.WriteString(GetTableName());
@@ -37,9 +38,9 @@ void IndexCatalogEntry::Serialize(duckdb::MetaBlockWriter &serializer) {
37
38
  }
38
39
 
39
40
  unique_ptr<CreateIndexInfo> IndexCatalogEntry::Deserialize(Deserializer &source, ClientContext &context) {
40
- // Here we deserialize the index metadata in the following order:
41
- // root block, root offset, schema name, table name, index name, sql, index type, index constraint type, expression
42
- // list.
41
+ // here we deserialize the index metadata in the following order:
42
+ // schema name, table schema name, table name, index name, sql, index type, index constraint type, expression list,
43
+ // parsed expression list, column IDs
43
44
 
44
45
  auto create_index_info = make_unique<CreateIndexInfo>();
45
46
 
@@ -24,7 +24,7 @@ SimilarCatalogEntry SchemaCatalogEntry::GetSimilarEntry(CatalogTransaction trans
24
24
  const string &name) {
25
25
  SimilarCatalogEntry result;
26
26
  Scan(transaction.GetContext(), type, [&](CatalogEntry *entry) {
27
- auto ldist = StringUtil::LevenshteinDistance(entry->name, name);
27
+ auto ldist = StringUtil::SimilarityScore(entry->name, name);
28
28
  if (ldist < result.distance) {
29
29
  result.distance = ldist;
30
30
  result.name = entry->name;
@@ -460,7 +460,7 @@ SimilarCatalogEntry CatalogSet::SimilarEntry(CatalogTransaction transaction, con
460
460
  for (auto &kv : mapping) {
461
461
  auto mapping_value = GetMapping(transaction, kv.first);
462
462
  if (mapping_value && !mapping_value->deleted) {
463
- auto ldist = StringUtil::LevenshteinDistance(kv.first, name);
463
+ auto ldist = StringUtil::SimilarityScore(kv.first, name);
464
464
  if (ldist < result.distance) {
465
465
  result.distance = ldist;
466
466
  result.name = kv.first;
@@ -93,6 +93,7 @@ static DefaultMacro internal_macros[] = {
93
93
  {DEFAULT_SCHEMA, "fdiv", {"x", "y", nullptr}, "floor(x/y)"},
94
94
  {DEFAULT_SCHEMA, "fmod", {"x", "y", nullptr}, "(x-y*floor(x/y))"},
95
95
  {DEFAULT_SCHEMA, "count_if", {"l", nullptr}, "sum(if(l, 1, 0))"},
96
+ {DEFAULT_SCHEMA, "split_part", {"string", "delimiter", "position", nullptr}, "coalesce(string_split(string, delimiter)[position],'')"},
96
97
 
97
98
  // algebraic list aggregates
98
99
  {DEFAULT_SCHEMA, "list_avg", {"l", nullptr}, "list_aggr(l, 'avg')"},
@@ -48,7 +48,7 @@ static DefaultView internal_views[] = {
48
48
  {"pg_catalog", "pg_views", "SELECT schema_name schemaname, view_name viewname, 'duckdb' viewowner, sql definition FROM duckdb_views()"},
49
49
  {"information_schema", "columns", "SELECT database_name table_catalog, schema_name table_schema, table_name, column_name, column_index ordinal_position, column_default, CASE WHEN is_nullable THEN 'YES' ELSE 'NO' END is_nullable, data_type, character_maximum_length, NULL character_octet_length, numeric_precision, numeric_precision_radix, numeric_scale, NULL datetime_precision, NULL interval_type, NULL interval_precision, NULL character_set_catalog, NULL character_set_schema, NULL character_set_name, NULL collation_catalog, NULL collation_schema, NULL collation_name, NULL domain_catalog, NULL domain_schema, NULL domain_name, NULL udt_catalog, NULL udt_schema, NULL udt_name, NULL scope_catalog, NULL scope_schema, NULL scope_name, NULL maximum_cardinality, NULL dtd_identifier, NULL is_self_referencing, NULL is_identity, NULL identity_generation, NULL identity_start, NULL identity_increment, NULL identity_maximum, NULL identity_minimum, NULL identity_cycle, NULL is_generated, NULL generation_expression, NULL is_updatable FROM duckdb_columns;"},
50
50
  {"information_schema", "schemata", "SELECT database_name catalog_name, schema_name, 'duckdb' schema_owner, NULL default_character_set_catalog, NULL default_character_set_schema, NULL default_character_set_name, sql sql_path FROM duckdb_schemas()"},
51
- {"information_schema", "tables", "SELECT database_name table_catalog, schema_name table_schema, table_name, CASE WHEN temporary THEN 'LOCAL TEMPORARY' ELSE 'BASE TABLE' END table_type, NULL self_referencing_column_name, NULL reference_generation, NULL user_defined_type_catalog, NULL user_defined_type_schema, NULL user_defined_type_name, 'YES' is_insertable_into, 'NO' is_typed, CASE WHEN temporary THEN 'PRESERVE' ELSE NULL END commit_action FROM duckdb_tables() UNION ALL SELECT NULL table_catalog, schema_name table_schema, view_name table_name, 'VIEW' table_type, NULL self_referencing_column_name, NULL reference_generation, NULL user_defined_type_catalog, NULL user_defined_type_schema, NULL user_defined_type_name, 'NO' is_insertable_into, 'NO' is_typed, NULL commit_action FROM duckdb_views;"},
51
+ {"information_schema", "tables", "SELECT database_name table_catalog, schema_name table_schema, table_name, CASE WHEN temporary THEN 'LOCAL TEMPORARY' ELSE 'BASE TABLE' END table_type, NULL self_referencing_column_name, NULL reference_generation, NULL user_defined_type_catalog, NULL user_defined_type_schema, NULL user_defined_type_name, 'YES' is_insertable_into, 'NO' is_typed, CASE WHEN temporary THEN 'PRESERVE' ELSE NULL END commit_action FROM duckdb_tables() UNION ALL SELECT database_name table_catalog, schema_name table_schema, view_name table_name, 'VIEW' table_type, NULL self_referencing_column_name, NULL reference_generation, NULL user_defined_type_catalog, NULL user_defined_type_schema, NULL user_defined_type_name, 'NO' is_insertable_into, 'NO' is_typed, NULL commit_action FROM duckdb_views;"},
52
52
  {nullptr, nullptr, nullptr}};
53
53
 
54
54
  static unique_ptr<CreateViewInfo> GetDefaultView(ClientContext &context, const string &input_schema, const string &input_name) {