duckdb 0.7.2-dev3515.0 → 0.7.2-dev3666.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (133)
  1. package/configure.py +2 -0
  2. package/package.json +1 -1
  3. package/src/database.cpp +1 -0
  4. package/src/duckdb/extension/json/buffered_json_reader.cpp +56 -17
  5. package/src/duckdb/extension/json/include/buffered_json_reader.hpp +56 -31
  6. package/src/duckdb/extension/json/include/json_common.hpp +5 -4
  7. package/src/duckdb/extension/json/include/json_executors.hpp +13 -18
  8. package/src/duckdb/extension/json/include/json_functions.hpp +3 -0
  9. package/src/duckdb/extension/json/include/json_scan.hpp +106 -153
  10. package/src/duckdb/extension/json/include/json_transform.hpp +2 -2
  11. package/src/duckdb/extension/json/json_common.cpp +1 -1
  12. package/src/duckdb/extension/json/json_functions/copy_json.cpp +94 -38
  13. package/src/duckdb/extension/json/json_functions/json_contains.cpp +7 -8
  14. package/src/duckdb/extension/json/json_functions/json_create.cpp +7 -7
  15. package/src/duckdb/extension/json/json_functions/json_merge_patch.cpp +4 -4
  16. package/src/duckdb/extension/json/json_functions/json_serialize_sql.cpp +4 -4
  17. package/src/duckdb/extension/json/json_functions/json_structure.cpp +7 -5
  18. package/src/duckdb/extension/json/json_functions/json_transform.cpp +10 -8
  19. package/src/duckdb/extension/json/json_functions/json_valid.cpp +1 -1
  20. package/src/duckdb/extension/json/json_functions/read_json.cpp +167 -169
  21. package/src/duckdb/extension/json/json_functions/read_json_objects.cpp +37 -16
  22. package/src/duckdb/extension/json/json_functions.cpp +11 -4
  23. package/src/duckdb/extension/json/json_scan.cpp +593 -374
  24. package/src/duckdb/extension/parquet/parquet-extension.cpp +5 -0
  25. package/src/duckdb/src/catalog/catalog_entry/macro_catalog_entry.cpp +42 -0
  26. package/src/duckdb/src/catalog/catalog_search_path.cpp +5 -0
  27. package/src/duckdb/src/catalog/catalog_set.cpp +1 -1
  28. package/src/duckdb/src/common/constants.cpp +1 -0
  29. package/src/duckdb/src/common/file_system.cpp +26 -6
  30. package/src/duckdb/src/common/local_file_system.cpp +0 -13
  31. package/src/duckdb/src/common/types/vector.cpp +3 -3
  32. package/src/duckdb/src/common/types/vector_buffer.cpp +11 -3
  33. package/src/duckdb/src/common/types/vector_cache.cpp +5 -5
  34. package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp +12 -6
  35. package/src/duckdb/src/execution/operator/persistent/csv_reader_options.cpp +10 -0
  36. package/src/duckdb/src/execution/operator/schema/physical_create_type.cpp +2 -2
  37. package/src/duckdb/src/function/macro_function.cpp +43 -0
  38. package/src/duckdb/src/function/pragma/pragma_queries.cpp +5 -3
  39. package/src/duckdb/src/function/scalar/strftime_format.cpp +1 -0
  40. package/src/duckdb/src/function/scalar_macro_function.cpp +10 -0
  41. package/src/duckdb/src/function/table/copy_csv.cpp +68 -18
  42. package/src/duckdb/src/function/table/read_csv.cpp +30 -3
  43. package/src/duckdb/src/function/table/version/pragma_version.cpp +8 -2
  44. package/src/duckdb/src/function/table_macro_function.cpp +10 -0
  45. package/src/duckdb/src/include/duckdb/catalog/catalog_entry/column_dependency_manager.hpp +1 -1
  46. package/src/duckdb/src/include/duckdb/catalog/catalog_entry/macro_catalog_entry.hpp +3 -1
  47. package/src/duckdb/src/include/duckdb/catalog/catalog_entry/scalar_macro_catalog_entry.hpp +0 -6
  48. package/src/duckdb/src/include/duckdb/catalog/catalog_entry/table_macro_catalog_entry.hpp +0 -6
  49. package/src/duckdb/src/include/duckdb/catalog/catalog_search_path.hpp +1 -1
  50. package/src/duckdb/src/include/duckdb/catalog/similar_catalog_entry.hpp +1 -1
  51. package/src/duckdb/src/include/duckdb/common/constants.hpp +2 -0
  52. package/src/duckdb/src/include/duckdb/common/exception.hpp +3 -3
  53. package/src/duckdb/src/include/duckdb/common/field_writer.hpp +3 -3
  54. package/src/duckdb/src/include/duckdb/common/file_system.hpp +5 -0
  55. package/src/duckdb/src/include/duckdb/common/http_state.hpp +2 -1
  56. package/src/duckdb/src/include/duckdb/common/hugeint.hpp +6 -6
  57. package/src/duckdb/src/include/duckdb/common/limits.hpp +46 -46
  58. package/src/duckdb/src/include/duckdb/common/operator/cast_operators.hpp +8 -8
  59. package/src/duckdb/src/include/duckdb/common/operator/comparison_operators.hpp +6 -6
  60. package/src/duckdb/src/include/duckdb/common/operator/convert_to_string.hpp +1 -1
  61. package/src/duckdb/src/include/duckdb/common/operator/decimal_cast_operators.hpp +2 -4
  62. package/src/duckdb/src/include/duckdb/common/operator/string_cast.hpp +1 -1
  63. package/src/duckdb/src/include/duckdb/common/operator/subtract.hpp +1 -1
  64. package/src/duckdb/src/include/duckdb/common/preserved_error.hpp +1 -1
  65. package/src/duckdb/src/include/duckdb/common/re2_regex.hpp +1 -1
  66. package/src/duckdb/src/include/duckdb/common/string_util.hpp +7 -7
  67. package/src/duckdb/src/include/duckdb/common/types/chunk_collection.hpp +10 -10
  68. package/src/duckdb/src/include/duckdb/common/types/column/column_data_collection.hpp +12 -12
  69. package/src/duckdb/src/include/duckdb/common/types/column/column_data_collection_iterators.hpp +2 -2
  70. package/src/duckdb/src/include/duckdb/common/types/value.hpp +1 -1
  71. package/src/duckdb/src/include/duckdb/common/types/vector_buffer.hpp +12 -2
  72. package/src/duckdb/src/include/duckdb/common/types.hpp +2 -2
  73. package/src/duckdb/src/include/duckdb/common/winapi.hpp +1 -1
  74. package/src/duckdb/src/include/duckdb/execution/expression_executor_state.hpp +1 -1
  75. package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_reader_options.hpp +9 -5
  76. package/src/duckdb/src/include/duckdb/execution/operator/schema/physical_create_type.hpp +1 -1
  77. package/src/duckdb/src/include/duckdb/function/aggregate_function.hpp +10 -14
  78. package/src/duckdb/src/include/duckdb/function/macro_function.hpp +7 -1
  79. package/src/duckdb/src/include/duckdb/function/scalar/strftime_format.hpp +3 -4
  80. package/src/duckdb/src/include/duckdb/function/scalar_macro_function.hpp +7 -2
  81. package/src/duckdb/src/include/duckdb/function/table_function.hpp +1 -1
  82. package/src/duckdb/src/include/duckdb/function/table_macro_function.hpp +5 -0
  83. package/src/duckdb/src/include/duckdb/function/udf_function.hpp +56 -50
  84. package/src/duckdb/src/include/duckdb/main/appender.hpp +2 -2
  85. package/src/duckdb/src/include/duckdb/main/client_context.hpp +2 -2
  86. package/src/duckdb/src/include/duckdb/main/client_data.hpp +3 -1
  87. package/src/duckdb/src/include/duckdb/main/connection.hpp +8 -9
  88. package/src/duckdb/src/include/duckdb/main/extension_entries.hpp +1 -0
  89. package/src/duckdb/src/include/duckdb/main/query_result.hpp +3 -3
  90. package/src/duckdb/src/include/duckdb/main/relation.hpp +6 -7
  91. package/src/duckdb/src/include/duckdb/optimizer/optimizer_extension.hpp +1 -1
  92. package/src/duckdb/src/include/duckdb/parser/column_list.hpp +7 -7
  93. package/src/duckdb/src/include/duckdb/parser/parsed_data/attach_info.hpp +4 -7
  94. package/src/duckdb/src/include/duckdb/parser/parsed_data/create_macro_info.hpp +8 -12
  95. package/src/duckdb/src/include/duckdb/parser/parsed_data/create_sequence_info.hpp +6 -20
  96. package/src/duckdb/src/include/duckdb/parser/parsed_data/create_type_info.hpp +6 -18
  97. package/src/duckdb/src/include/duckdb/parser/parsed_data/detach_info.hpp +4 -8
  98. package/src/duckdb/src/include/duckdb/parser/parsed_data/drop_info.hpp +4 -38
  99. package/src/duckdb/src/include/duckdb/parser/parsed_data/transaction_info.hpp +5 -2
  100. package/src/duckdb/src/include/duckdb/parser/parsed_data/vacuum_info.hpp +10 -10
  101. package/src/duckdb/src/include/duckdb/parser/parser_extension.hpp +2 -2
  102. package/src/duckdb/src/include/duckdb/parser/sql_statement.hpp +1 -1
  103. package/src/duckdb/src/include/duckdb/parser/statement/select_statement.hpp +1 -1
  104. package/src/duckdb/src/include/duckdb/planner/operator_extension.hpp +2 -2
  105. package/src/duckdb/src/include/duckdb/storage/storage_extension.hpp +2 -2
  106. package/src/duckdb/src/parser/parsed_data/attach_info.cpp +42 -0
  107. package/src/duckdb/src/parser/parsed_data/create_index_info.cpp +0 -7
  108. package/src/duckdb/src/parser/parsed_data/create_info.cpp +19 -8
  109. package/src/duckdb/src/parser/parsed_data/create_macro_info.cpp +46 -0
  110. package/src/duckdb/src/parser/parsed_data/create_sequence_info.cpp +56 -0
  111. package/src/duckdb/src/parser/parsed_data/create_type_info.cpp +47 -0
  112. package/src/duckdb/src/parser/parsed_data/detach_info.cpp +34 -0
  113. package/src/duckdb/src/parser/parsed_data/drop_info.cpp +46 -0
  114. package/src/duckdb/src/parser/parsed_data/transaction_info.cpp +24 -0
  115. package/src/duckdb/src/parser/parsed_data/vacuum_info.cpp +37 -0
  116. package/src/duckdb/src/planner/binder/expression/bind_star_expression.cpp +27 -9
  117. package/src/duckdb/src/planner/binder/statement/bind_copy.cpp +9 -4
  118. package/src/duckdb/src/planner/binder/statement/bind_create.cpp +2 -1
  119. package/src/duckdb/src/planner/binder/statement/bind_create_table.cpp +1 -0
  120. package/src/duckdb/src/planner/binder/tableref/bind_basetableref.cpp +1 -1
  121. package/src/duckdb/src/planner/logical_operator.cpp +1 -2
  122. package/src/duckdb/src/planner/operator/logical_create_index.cpp +16 -25
  123. package/src/duckdb/src/planner/operator/logical_insert.cpp +30 -0
  124. package/src/duckdb/src/planner/operator/logical_simple.cpp +33 -5
  125. package/src/duckdb/src/planner/parsed_data/bound_create_table_info.cpp +6 -16
  126. package/src/duckdb/src/planner/planner.cpp +4 -13
  127. package/src/duckdb/src/storage/checkpoint_manager.cpp +12 -6
  128. package/src/duckdb/src/storage/single_file_block_manager.cpp +0 -4
  129. package/src/duckdb/src/storage/storage_info.cpp +1 -1
  130. package/src/duckdb/third_party/libpg_query/src_backend_parser_gram.cpp +5735 -5773
  131. package/src/duckdb/ub_src_catalog_catalog_entry.cpp +1 -1
  132. package/src/duckdb/ub_src_parser_parsed_data.cpp +16 -0
  133. package/src/duckdb/src/catalog/catalog_entry/scalar_macro_catalog_entry.cpp +0 -104
@@ -9,7 +9,10 @@
9
9
  #pragma once
10
10
 
11
11
  #include "buffered_json_reader.hpp"
12
+ #include "duckdb/common/multi_file_reader.hpp"
12
13
  #include "duckdb/common/mutex.hpp"
14
+ #include "duckdb/common/pair.hpp"
15
+ #include "duckdb/common/types/type_map.hpp"
13
16
  #include "duckdb/function/scalar/strftime_format.hpp"
14
17
  #include "duckdb/function/table_function.hpp"
15
18
  #include "json_transform.hpp"
@@ -26,29 +29,29 @@ enum class JSONScanType : uint8_t {
26
29
  SAMPLE = 3,
27
30
  };
28
31
 
29
- enum class JSONRecordType : uint8_t {
30
- //! Sequential values
31
- RECORDS = 0,
32
- //! Array of values
33
- ARRAY_OF_RECORDS = 1,
34
- //! Sequential non-object JSON
35
- JSON = 2,
36
- //! Array of non-object JSON
37
- ARRAY_OF_JSON = 3,
38
- //! Auto-detect
39
- AUTO = 4,
40
- };
32
+ struct JSONString {
33
+ public:
34
+ JSONString() {
35
+ }
36
+ JSONString(const char *pointer_p, idx_t size_p) : pointer(pointer_p), size(size_p) {
37
+ }
38
+
39
+ const char *pointer;
40
+ idx_t size;
41
+
42
+ public:
43
+ string ToString() {
44
+ return string(pointer, size);
45
+ }
41
46
 
42
- //! Even though LogicalTypeId is just a uint8_t, this is still needed ...
43
- struct LogicalTypeIdHash {
44
- inline std::size_t operator()(const LogicalTypeId &id) const {
45
- return (size_t)id;
47
+ const char &operator[](size_t i) const {
48
+ return pointer[i];
46
49
  }
47
50
  };
48
51
 
49
52
  struct DateFormatMap {
50
53
  public:
51
- void Initialize(const unordered_map<LogicalTypeId, vector<const char *>, LogicalTypeIdHash> &format_templates) {
54
+ void Initialize(const type_id_map_t<vector<const char *>> &format_templates) {
52
55
  for (const auto &entry : format_templates) {
53
56
  const auto &type = entry.first;
54
57
  for (const auto &format_string : entry.second) {
@@ -74,65 +77,79 @@ public:
74
77
  }
75
78
 
76
79
  StrpTimeFormat &GetFormat(LogicalTypeId type) {
77
- return candidate_formats[type].back();
80
+ D_ASSERT(candidate_formats.find(type) != candidate_formats.end());
81
+ return candidate_formats.find(type)->second.back();
82
+ }
83
+
84
+ const StrpTimeFormat &GetFormat(LogicalTypeId type) const {
85
+ D_ASSERT(candidate_formats.find(type) != candidate_formats.end());
86
+ return candidate_formats.find(type)->second.back();
78
87
  }
79
88
 
80
89
  private:
81
- unordered_map<LogicalTypeId, vector<StrpTimeFormat>, LogicalTypeIdHash> candidate_formats;
90
+ type_id_map_t<vector<StrpTimeFormat>> candidate_formats;
82
91
  };
83
92
 
84
93
  struct JSONScanData : public TableFunctionData {
85
94
  public:
86
95
  JSONScanData();
87
96
 
88
- static unique_ptr<FunctionData> Bind(ClientContext &context, TableFunctionBindInput &input);
97
+ void Bind(ClientContext &context, TableFunctionBindInput &input);
98
+
99
+ void InitializeReaders(ClientContext &context);
89
100
  void InitializeFormats();
90
101
  void InitializeFormats(bool auto_detect);
102
+ void SetCompression(const string &compression);
91
103
 
92
- void Serialize(FieldWriter &writer);
93
- void Deserialize(FieldReader &reader);
104
+ void Serialize(FieldWriter &writer) const;
105
+ void Deserialize(ClientContext &context, FieldReader &reader);
94
106
 
95
107
  public:
96
108
  //! Scan type
97
109
  JSONScanType type;
110
+
98
111
  //! File-specific options
99
112
  BufferedJSONReaderOptions options;
113
+
114
+ //! Multi-file reader stuff
115
+ MultiFileReaderBindData reader_bind;
116
+
100
117
  //! The files we're reading
101
- vector<string> file_paths;
118
+ vector<string> files;
119
+ //! Initial file reader
120
+ unique_ptr<BufferedJSONReader> initial_reader;
121
+ //! The readers
122
+ vector<unique_ptr<BufferedJSONReader>> union_readers;
102
123
 
103
124
  //! Whether or not we should ignore malformed JSON (default to NULL)
104
125
  bool ignore_errors = false;
105
- //! Maximum JSON object size (defaults to 1MB minimum)
106
- idx_t maximum_object_size = 1048576;
107
- //! Options when transforming the JSON to columnar data
108
- JSONTransformOptions transform_options;
109
-
126
+ //! Maximum JSON object size (defaults to 16MB minimum)
127
+ idx_t maximum_object_size = 16777216;
110
128
  //! Whether we auto-detect a schema
111
129
  bool auto_detect = false;
112
130
  //! Sample size for detecting schema
113
- idx_t sample_size = STANDARD_VECTOR_SIZE;
114
- //! Column names (in order)
115
- vector<string> names;
116
- //! Valid cols (ROW_TYPE cols are considered invalid)
117
- vector<idx_t> valid_cols;
131
+ idx_t sample_size = idx_t(STANDARD_VECTOR_SIZE) * 10;
118
132
  //! Max depth we go to detect nested JSON schema (defaults to unlimited)
119
133
  idx_t max_depth = NumericLimits<idx_t>::Maximum();
120
- //! Whether we're parsing values (usually), or something else
121
- JSONRecordType record_type = JSONRecordType::RECORDS;
134
+
135
+ //! All column names (in order)
136
+ vector<string> names;
137
+ //! Options when transforming the JSON to columnar data
138
+ JSONTransformOptions transform_options;
122
139
  //! Forced date/timestamp formats
123
140
  string date_format;
124
141
  string timestamp_format;
125
-
126
- //! Stored readers for when we're detecting the schema
127
- vector<duckdb::unique_ptr<BufferedJSONReader>> stored_readers;
128
142
  //! Candidate date formats
129
143
  DateFormatMap date_format_map;
144
+
145
+ //! The inferred avg tuple size
146
+ idx_t avg_tuple_size = 420;
130
147
  };
131
148
 
132
149
  struct JSONScanInfo : public TableFunctionInfo {
133
150
  public:
134
151
  explicit JSONScanInfo(JSONScanType type_p = JSONScanType::INVALID, JSONFormat format_p = JSONFormat::AUTO_DETECT,
135
- JSONRecordType record_type_p = JSONRecordType::AUTO, bool auto_detect_p = false)
152
+ JSONRecordType record_type_p = JSONRecordType::AUTO_DETECT, bool auto_detect_p = false)
136
153
  : type(type_p), format(format_p), record_type(record_type_p), auto_detect(auto_detect_p) {
137
154
  }
138
155
 
@@ -144,11 +161,17 @@ public:
144
161
 
145
162
  struct JSONScanGlobalState {
146
163
  public:
147
- JSONScanGlobalState(ClientContext &context, JSONScanData &bind_data);
164
+ JSONScanGlobalState(ClientContext &context, const JSONScanData &bind_data);
148
165
 
149
166
  public:
150
167
  //! Bound data
151
- JSONScanData &bind_data;
168
+ const JSONScanData &bind_data;
169
+ //! Options when transforming the JSON to columnar data
170
+ JSONTransformOptions transform_options;
171
+
172
+ //! Column names that we're actually reading (after projection pushdown)
173
+ vector<string> names;
174
+ vector<column_t> column_indices;
152
175
 
153
176
  //! Buffer manager allocator
154
177
  Allocator &allocator;
@@ -157,7 +180,7 @@ public:
157
180
 
158
181
  mutex lock;
159
182
  //! One JSON reader per file
160
- vector<duckdb::unique_ptr<BufferedJSONReader>> json_readers;
183
+ vector<optional_ptr<BufferedJSONReader>> json_readers;
161
184
  //! Current file/batch index
162
185
  idx_t file_index;
163
186
  atomic<idx_t> batch_index;
@@ -166,62 +189,58 @@ public:
166
189
  idx_t system_threads;
167
190
  };
168
191
 
169
- struct JSONLine {
170
- public:
171
- JSONLine() {
172
- }
173
- JSONLine(const char *pointer_p, idx_t size_p) : pointer(pointer_p), size(size_p) {
174
- }
175
-
176
- const char *pointer;
177
- idx_t size;
178
-
179
- public:
180
- string ToString() {
181
- return string(pointer, size);
182
- }
183
-
184
- const char &operator[](size_t i) const {
185
- return pointer[i];
186
- }
187
- };
188
-
189
192
  struct JSONScanLocalState {
190
193
  public:
191
194
  JSONScanLocalState(ClientContext &context, JSONScanGlobalState &gstate);
192
195
 
193
196
  public:
194
197
  idx_t ReadNext(JSONScanGlobalState &gstate);
195
- yyjson_alc *GetAllocator();
196
198
  void ThrowTransformError(idx_t object_index, const string &error_message);
197
199
 
200
+ yyjson_alc *GetAllocator();
201
+ const MultiFileReaderData &GetReaderData() const;
202
+
203
+ public:
204
+ //! Current scan data
198
205
  idx_t scan_count;
199
- JSONLine lines[STANDARD_VECTOR_SIZE];
206
+ JSONString units[STANDARD_VECTOR_SIZE];
200
207
  yyjson_val *values[STANDARD_VECTOR_SIZE];
201
208
 
202
- idx_t array_idx;
203
- idx_t array_offset;
204
- yyjson_val *array_values[STANDARD_VECTOR_SIZE];
205
-
209
+ //! Batch index for order-preserving parallelism
206
210
  idx_t batch_index;
207
211
 
208
212
  //! Options when transforming the JSON to columnar data
209
213
  DateFormatMap date_format_map;
210
214
  JSONTransformOptions transform_options;
211
215
 
216
+ //! For determining average tuple size
217
+ idx_t total_read_size;
218
+ idx_t total_tuple_count;
219
+
212
220
  private:
213
- yyjson_val *ParseLine(char *line_start, idx_t line_size, idx_t remaining, JSONLine &line);
214
- idx_t GetObjectsFromArray(JSONScanGlobalState &gstate);
221
+ bool ReadNextBuffer(JSONScanGlobalState &gstate);
222
+ void ReadNextBufferInternal(JSONScanGlobalState &gstate, idx_t &buffer_index);
223
+ void ReadNextBufferSeek(JSONScanGlobalState &gstate, idx_t &buffer_index);
224
+ void ReadNextBufferNoSeek(JSONScanGlobalState &gstate, idx_t &buffer_index);
225
+ void SkipOverArrayStart();
226
+
227
+ bool ReadAndAutoDetect(JSONScanGlobalState &gstate, idx_t &buffer_index, const bool already_incremented_file_idx);
228
+ void ReconstructFirstObject(JSONScanGlobalState &gstate);
229
+ void ParseNextChunk();
230
+
231
+ void ParseJSON(char *const json_start, const idx_t json_size, const idx_t remaining);
232
+ void ThrowObjectSizeError(const idx_t object_size);
233
+ void ThrowInvalidAtEndError();
215
234
 
216
235
  private:
217
236
  //! Bind data
218
- JSONScanData &bind_data;
237
+ const JSONScanData &bind_data;
219
238
  //! Thread-local allocator
220
- JSONAllocator json_allocator;
239
+ JSONAllocator allocator;
221
240
 
222
241
  //! Current reader and buffer handle
223
- BufferedJSONReader *current_reader;
224
- JSONBufferHandle *current_buffer_handle;
242
+ optional_ptr<BufferedJSONReader> current_reader;
243
+ optional_ptr<JSONBufferHandle> current_buffer_handle;
225
244
  //! Whether this is the last batch of the file
226
245
  bool is_last;
227
246
 
@@ -234,26 +253,12 @@ private:
234
253
 
235
254
  //! Buffer to reconstruct split values
236
255
  AllocatedData reconstruct_buffer;
237
- //! Copy of current buffer for YYJSON_READ_INSITU
238
- AllocatedData current_buffer_copy;
239
- const char *buffer_copy_ptr;
240
-
241
- private:
242
- bool ReadNextBuffer(JSONScanGlobalState &gstate);
243
- void ReadNextBuffer(JSONScanGlobalState &gstate, idx_t &buffer_index);
244
- void ReadNextBufferSeek(JSONScanGlobalState &gstate, idx_t &buffer_index);
245
- void ReadNextBufferNoSeek(JSONScanGlobalState &gstate, idx_t &buffer_index);
246
-
247
- void ReconstructFirstObject(JSONScanGlobalState &gstate);
248
-
249
- void ReadUnstructured(idx_t &count);
250
- void ReadNewlineDelimited(idx_t &count);
251
256
  };
252
257
 
253
258
  struct JSONGlobalTableFunctionState : public GlobalTableFunctionState {
254
259
  public:
255
260
  JSONGlobalTableFunctionState(ClientContext &context, TableFunctionInitInput &input);
256
- static duckdb::unique_ptr<GlobalTableFunctionState> Init(ClientContext &context, TableFunctionInitInput &input);
261
+ static unique_ptr<GlobalTableFunctionState> Init(ClientContext &context, TableFunctionInitInput &input);
257
262
  idx_t MaxThreads() const override;
258
263
 
259
264
  public:
@@ -263,8 +268,8 @@ public:
263
268
  struct JSONLocalTableFunctionState : public LocalTableFunctionState {
264
269
  public:
265
270
  JSONLocalTableFunctionState(ClientContext &context, JSONScanGlobalState &gstate);
266
- static duckdb::unique_ptr<LocalTableFunctionState> Init(ExecutionContext &context, TableFunctionInitInput &input,
267
- GlobalTableFunctionState *global_state);
271
+ static unique_ptr<LocalTableFunctionState> Init(ExecutionContext &context, TableFunctionInitInput &input,
272
+ GlobalTableFunctionState *global_state);
268
273
  idx_t GetBatchIndex() const;
269
274
 
270
275
  public:
@@ -276,70 +281,18 @@ public:
276
281
  static void AutoDetect(ClientContext &context, JSONScanData &bind_data, vector<LogicalType> &return_types,
277
282
  vector<string> &names);
278
283
 
279
- static void InitializeBindData(ClientContext &context, JSONScanData &bind_data,
280
- const named_parameter_map_t &named_parameters, vector<string> &names,
281
- vector<LogicalType> &return_types);
284
+ static double ScanProgress(ClientContext &context, const FunctionData *bind_data_p,
285
+ const GlobalTableFunctionState *global_state);
286
+ static idx_t GetBatchIndex(ClientContext &context, const FunctionData *bind_data_p,
287
+ LocalTableFunctionState *local_state, GlobalTableFunctionState *global_state);
288
+ static unique_ptr<NodeStatistics> Cardinality(ClientContext &context, const FunctionData *bind_data);
289
+ static void ComplexFilterPushdown(ClientContext &context, LogicalGet &get, FunctionData *bind_data_p,
290
+ vector<unique_ptr<Expression>> &filters);
282
291
 
283
- static double JSONScanProgress(ClientContext &context, const FunctionData *bind_data_p,
284
- const GlobalTableFunctionState *global_state) {
285
- auto &gstate = ((JSONGlobalTableFunctionState &)*global_state).state;
286
- double progress = 0;
287
- for (auto &reader : gstate.json_readers) {
288
- progress += reader->GetProgress();
289
- }
290
- return progress / double(gstate.json_readers.size());
291
- }
292
+ static void Serialize(FieldWriter &writer, const FunctionData *bind_data_p, const TableFunction &function);
293
+ static unique_ptr<FunctionData> Deserialize(ClientContext &context, FieldReader &reader, TableFunction &function);
292
294
 
293
- static idx_t JSONScanGetBatchIndex(ClientContext &context, const FunctionData *bind_data_p,
294
- LocalTableFunctionState *local_state, GlobalTableFunctionState *global_state) {
295
- auto &lstate = (JSONLocalTableFunctionState &)*local_state;
296
- return lstate.GetBatchIndex();
297
- }
298
-
299
- static unique_ptr<NodeStatistics> JSONScanCardinality(ClientContext &context, const FunctionData *bind_data) {
300
- auto &data = (JSONScanData &)*bind_data;
301
- idx_t per_file_cardinality;
302
- if (data.stored_readers.empty()) {
303
- // The cardinality of an unknown JSON file is the almighty number 42 except when it's not
304
- per_file_cardinality = 42;
305
- } else {
306
- // If we multiply the almighty number 42 by 10, we get the exact average size of a JSON
307
- // Not really, but the average size of a lineitem row in JSON is around 360 bytes
308
- per_file_cardinality = data.stored_readers[0]->GetFileHandle().FileSize() / 420;
309
- }
310
- // Obviously this can be improved but this is better than defaulting to 0
311
- return make_uniq<NodeStatistics>(per_file_cardinality * data.file_paths.size());
312
- }
313
-
314
- static void JSONScanSerialize(FieldWriter &writer, const FunctionData *bind_data_p, const TableFunction &function) {
315
- auto &bind_data = (JSONScanData &)*bind_data_p;
316
- bind_data.Serialize(writer);
317
- }
318
-
319
- static duckdb::unique_ptr<FunctionData> JSONScanDeserialize(ClientContext &context, FieldReader &reader,
320
- TableFunction &function) {
321
- auto result = make_uniq<JSONScanData>();
322
- result->Deserialize(reader);
323
- return std::move(result);
324
- }
325
-
326
- static void TableFunctionDefaults(TableFunction &table_function) {
327
- table_function.named_parameters["maximum_object_size"] = LogicalType::UINTEGER;
328
- table_function.named_parameters["ignore_errors"] = LogicalType::BOOLEAN;
329
- table_function.named_parameters["lines"] = LogicalType::VARCHAR;
330
- table_function.named_parameters["compression"] = LogicalType::VARCHAR;
331
-
332
- table_function.table_scan_progress = JSONScanProgress;
333
- table_function.get_batch_index = JSONScanGetBatchIndex;
334
- table_function.cardinality = JSONScanCardinality;
335
-
336
- table_function.serialize = JSONScanSerialize;
337
- table_function.deserialize = JSONScanDeserialize;
338
-
339
- table_function.projection_pushdown = false;
340
- table_function.filter_pushdown = false;
341
- table_function.filter_prune = false;
342
- }
295
+ static void TableFunctionDefaults(TableFunction &table_function);
343
296
  };
344
297
 
345
298
  } // namespace duckdb
@@ -35,14 +35,14 @@ public:
35
35
  //! Whether to delay the error when transforming (e.g., when non-strict casting or reading from file)
36
36
  bool delay_error = false;
37
37
  //! Date format used for parsing (can be NULL)
38
- DateFormatMap *date_format_map = nullptr;
38
+ optional_ptr<DateFormatMap> date_format_map = nullptr;
39
39
  //! String to store errors in
40
40
  string error_message;
41
41
  //! Index of the object where the error occurred
42
42
  idx_t object_index = DConstants::INVALID_INDEX;
43
43
 
44
44
  public:
45
- void Serialize(FieldWriter &writer);
45
+ void Serialize(FieldWriter &writer) const;
46
46
  void Deserialize(FieldReader &reader);
47
47
  };
48
48
 
@@ -5,7 +5,7 @@ namespace duckdb {
5
5
  string JSONCommon::ValToString(yyjson_val *val, idx_t max_len) {
6
6
  JSONAllocator json_allocator(Allocator::DefaultAllocator());
7
7
  idx_t len;
8
- auto data = JSONCommon::WriteVal<yyjson_val>(val, json_allocator.GetYYJSONAllocator(), len);
8
+ auto data = JSONCommon::WriteVal<yyjson_val>(val, json_allocator.GetYYAlc(), len);
9
9
  if (max_len < len) {
10
10
  return string(data, max_len) + "...";
11
11
  } else {
@@ -11,11 +11,47 @@
11
11
 
12
12
  namespace duckdb {
13
13
 
14
+ static void ThrowJSONCopyParameterException(const string &loption) { // Raises a BinderException naming the COPY (FORMAT JSON) option that received the wrong number of arguments
15
+ throw BinderException("COPY (FORMAT JSON) parameter %s expects a single argument.", loption); // loption must be passed so the %s placeholder is filled in
16
+ }
17
+
14
18
  static BoundStatement CopyToJSONPlan(Binder &binder, CopyStatement &stmt) {
15
19
  auto stmt_copy = stmt.Copy();
16
20
  auto &copy = stmt_copy->Cast<CopyStatement>();
17
21
  auto &info = *copy.info;
18
22
 
23
+ // Parse the options, creating options for the CSV writer while doing so
24
+ string date_format;
25
+ string timestamp_format;
26
+ case_insensitive_map_t<vector<Value>> csv_copy_options;
27
+ for (const auto &kv : info.options) {
28
+ const auto &loption = StringUtil::Lower(kv.first);
29
+ if (loption == "dateformat" || loption == "date_format") {
30
+ if (kv.second.size() != 1) {
31
+ ThrowJSONCopyParameterException(loption);
32
+ }
33
+ date_format = StringValue::Get(kv.second.back());
34
+ } else if (loption == "timestampformat" || loption == "timestamp_format") {
35
+ if (kv.second.size() != 1) {
36
+ ThrowJSONCopyParameterException(loption);
37
+ }
38
+ timestamp_format = StringValue::Get(kv.second.back());
39
+ } else if (loption == "compression") {
40
+ csv_copy_options.insert(kv);
41
+ } else if (loption == "array") {
42
+ if (kv.second.size() > 1) {
43
+ ThrowJSONCopyParameterException(loption);
44
+ }
45
+ if (kv.second.empty() || BooleanValue::Get(kv.second.back().DefaultCastAs(LogicalTypeId::BOOLEAN))) {
46
+ csv_copy_options["prefix"] = {"[\n\t"};
47
+ csv_copy_options["suffix"] = {"\n]\n"};
48
+ csv_copy_options["new_line"] = {",\n\t"};
49
+ }
50
+ } else {
51
+ throw BinderException("Unknown option for COPY ... TO ... (FORMAT JSON): \"%s\".", loption);
52
+ }
53
+ }
54
+
19
55
  // Bind the select statement of the original to resolve the types
20
56
  auto dummy_binder = Binder::CreateBinder(binder.context, &binder, true);
21
57
  auto bound_original = dummy_binder->Bind(*stmt.select_statement);
@@ -29,26 +65,24 @@ static BoundStatement CopyToJSONPlan(Binder &binder, CopyStatement &stmt) {
29
65
  new_select_node.from_table = std::move(subquery_ref);
30
66
 
31
67
  // Create new select list
32
- vector<duckdb::unique_ptr<ParsedExpression>> select_list;
68
+ vector<unique_ptr<ParsedExpression>> select_list;
33
69
  select_list.reserve(bound_original.types.size());
34
70
 
35
71
  // strftime if the user specified a format (loop also gives columns a name, needed for struct_pack)
36
72
  // TODO: deal with date/timestamp within nested types
37
- const auto date_it = info.options.find("dateformat");
38
- const auto timestamp_it = info.options.find("timestampformat");
39
- vector<duckdb::unique_ptr<ParsedExpression>> strftime_children;
73
+ vector<unique_ptr<ParsedExpression>> strftime_children;
40
74
  for (idx_t col_idx = 0; col_idx < bound_original.types.size(); col_idx++) {
41
75
  auto column = make_uniq_base<ParsedExpression, PositionalReferenceExpression>(col_idx + 1);
42
- strftime_children.clear();
76
+ strftime_children = vector<unique_ptr<ParsedExpression>>();
43
77
  const auto &type = bound_original.types[col_idx];
44
78
  const auto &name = bound_original.names[col_idx];
45
- if (date_it != info.options.end() && type == LogicalTypeId::DATE) {
79
+ if (!date_format.empty() && type == LogicalTypeId::DATE) {
46
80
  strftime_children.emplace_back(std::move(column));
47
- strftime_children.emplace_back(make_uniq<ConstantExpression>(date_it->second.back()));
81
+ strftime_children.emplace_back(make_uniq<ConstantExpression>(date_format));
48
82
  column = make_uniq<FunctionExpression>("strftime", std::move(strftime_children));
49
- } else if (timestamp_it != info.options.end() && type == LogicalTypeId::TIMESTAMP) {
83
+ } else if (!timestamp_format.empty() && type == LogicalTypeId::TIMESTAMP) {
50
84
  strftime_children.emplace_back(std::move(column));
51
- strftime_children.emplace_back(make_uniq<ConstantExpression>(timestamp_it->second.back()));
85
+ strftime_children.emplace_back(make_uniq<ConstantExpression>(timestamp_format));
52
86
  column = make_uniq<FunctionExpression>("strftime", std::move(strftime_children));
53
87
  }
54
88
  column->alias = name;
@@ -63,6 +97,7 @@ static BoundStatement CopyToJSONPlan(Binder &binder, CopyStatement &stmt) {
63
97
 
64
98
  // Now we can just use the CSV writer
65
99
  info.format = "csv";
100
+ info.options = std::move(csv_copy_options);
66
101
  info.options["quote"] = {""};
67
102
  info.options["escape"] = {""};
68
103
  info.options["delimiter"] = {"\n"};
@@ -71,49 +106,70 @@ static BoundStatement CopyToJSONPlan(Binder &binder, CopyStatement &stmt) {
71
106
  return binder.Bind(*stmt_copy);
72
107
  }
73
108
 
74
- static duckdb::unique_ptr<FunctionData> CopyFromJSONBind(ClientContext &context, CopyInfo &info,
75
- vector<string> &expected_names,
76
- vector<LogicalType> &expected_types) {
109
+ static unique_ptr<FunctionData> CopyFromJSONBind(ClientContext &context, CopyInfo &info, vector<string> &expected_names,
110
+ vector<LogicalType> &expected_types) {
77
111
  auto bind_data = make_uniq<JSONScanData>();
112
+ bind_data->type = JSONScanType::READ_JSON;
113
+ bind_data->options.record_type = JSONRecordType::RECORDS;
114
+ bind_data->options.format = JSONFormat::NEWLINE_DELIMITED;
78
115
 
79
- bind_data->file_paths.emplace_back(info.file_path);
116
+ bind_data->files.emplace_back(info.file_path);
80
117
  bind_data->names = expected_names;
81
- for (idx_t col_idx = 0; col_idx < expected_names.size(); col_idx++) {
82
- bind_data->valid_cols.emplace_back(col_idx);
83
- }
84
118
 
85
- auto it = info.options.find("dateformat");
86
- if (it == info.options.end()) {
87
- it = info.options.find("date_format");
88
- }
89
- if (it != info.options.end()) {
90
- bind_data->date_format = StringValue::Get(it->second.back());
91
- }
92
-
93
- it = info.options.find("timestampformat");
94
- if (it == info.options.end()) {
95
- it = info.options.find("timestamp_format");
119
+ bool auto_detect = false;
120
+ for (auto &kv : info.options) {
121
+ const auto &loption = StringUtil::Lower(kv.first);
122
+ if (loption == "dateformat" || loption == "date_format") {
123
+ if (kv.second.size() != 1) {
124
+ ThrowJSONCopyParameterException(loption);
125
+ }
126
+ bind_data->date_format = StringValue::Get(kv.second.back());
127
+ } else if (loption == "timestampformat" || loption == "timestamp_format") {
128
+ if (kv.second.size() != 1) {
129
+ ThrowJSONCopyParameterException(loption);
130
+ }
131
+ bind_data->timestamp_format = StringValue::Get(kv.second.back());
132
+ } else if (loption == "auto_detect") {
133
+ if (kv.second.empty()) {
134
+ auto_detect = true;
135
+ } else if (kv.second.size() != 1) {
136
+ ThrowJSONCopyParameterException(loption);
137
+ } else {
138
+ auto_detect = BooleanValue::Get(kv.second.back().DefaultCastAs(LogicalTypeId::BOOLEAN));
139
+ }
140
+ } else if (loption == "compression") {
141
+ if (kv.second.size() != 1) {
142
+ ThrowJSONCopyParameterException(loption);
143
+ }
144
+ bind_data->SetCompression(StringValue::Get(kv.second.back()));
145
+ } else if (loption == "array") {
146
+ if (kv.second.empty()) {
147
+ bind_data->options.format = JSONFormat::ARRAY;
148
+ } else if (kv.second.size() != 1) {
149
+ ThrowJSONCopyParameterException(loption);
150
+ } else if (BooleanValue::Get(kv.second.back().DefaultCastAs(LogicalTypeId::BOOLEAN))) {
151
+ bind_data->options.format = JSONFormat::ARRAY;
152
+ }
153
+ } else {
154
+ throw BinderException("Unknown option for COPY ... FROM ... (FORMAT JSON): \"%s\".", loption);
155
+ }
96
156
  }
97
- if (it != info.options.end()) {
98
- bind_data->timestamp_format = StringValue::Get(it->second.back());
157
+ bind_data->InitializeFormats(auto_detect);
158
+ if (auto_detect && bind_data->options.format != JSONFormat::ARRAY) {
159
+ bind_data->options.format = JSONFormat::AUTO_DETECT;
99
160
  }
100
161
 
101
162
  bind_data->transform_options = JSONTransformOptions(true, true, true, true);
102
163
  bind_data->transform_options.delay_error = true;
103
164
 
104
- it = info.options.find("auto_detect");
105
- if (it != info.options.end()) {
106
- // Wrap this with auto detect true/false so we can detect date/timestamp formats
107
- // Note that auto_detect for names/types is not actually true because these are already know when we COPY
108
- bind_data->InitializeFormats(true);
109
- bind_data->options.format = JSONFormat::AUTO_DETECT;
110
- bind_data->record_type = JSONRecordType::AUTO;
165
+ bind_data->InitializeReaders(context);
166
+ if (auto_detect) {
111
167
  JSONScan::AutoDetect(context, *bind_data, expected_types, expected_names);
112
168
  bind_data->auto_detect = true;
113
- } else {
114
- bind_data->InitializeFormats();
115
169
  }
116
170
 
171
+ bind_data->transform_options.date_format_map = &bind_data->date_format_map;
172
+
117
173
  return std::move(bind_data);
118
174
  }
119
175
 
@@ -115,20 +115,19 @@ static void JSONContainsFunction(DataChunk &args, ExpressionState &state, Vector
115
115
 
116
116
  if (needles.GetVectorType() == VectorType::CONSTANT_VECTOR) {
117
117
  auto &needle_str = *ConstantVector::GetData<string_t>(needles);
118
- auto needle_doc =
119
- JSONCommon::ReadDocument(needle_str, JSONCommon::READ_FLAG, lstate.json_allocator.GetYYJSONAllocator());
118
+ auto needle_doc = JSONCommon::ReadDocument(needle_str, JSONCommon::READ_FLAG, lstate.json_allocator.GetYYAlc());
120
119
  UnaryExecutor::Execute<string_t, bool>(haystacks, result, args.size(), [&](string_t haystack_str) {
121
- auto haystack_doc = JSONCommon::ReadDocument(haystack_str, JSONCommon::READ_FLAG,
122
- lstate.json_allocator.GetYYJSONAllocator());
120
+ auto haystack_doc =
121
+ JSONCommon::ReadDocument(haystack_str, JSONCommon::READ_FLAG, lstate.json_allocator.GetYYAlc());
123
122
  return JSONContains(haystack_doc->root, needle_doc->root);
124
123
  });
125
124
  } else {
126
125
  BinaryExecutor::Execute<string_t, string_t, bool>(
127
126
  haystacks, needles, result, args.size(), [&](string_t haystack_str, string_t needle_str) {
128
- auto needle_doc = JSONCommon::ReadDocument(needle_str, JSONCommon::READ_FLAG,
129
- lstate.json_allocator.GetYYJSONAllocator());
130
- auto haystack_doc = JSONCommon::ReadDocument(haystack_str, JSONCommon::READ_FLAG,
131
- lstate.json_allocator.GetYYJSONAllocator());
127
+ auto needle_doc =
128
+ JSONCommon::ReadDocument(needle_str, JSONCommon::READ_FLAG, lstate.json_allocator.GetYYAlc());
129
+ auto haystack_doc =
130
+ JSONCommon::ReadDocument(haystack_str, JSONCommon::READ_FLAG, lstate.json_allocator.GetYYAlc());
132
131
  return JSONContains(haystack_doc->root, needle_doc->root);
133
132
  });
134
133
  }