duckdb 0.7.2-dev3515.0 → 0.7.2-dev3666.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/configure.py +2 -0
- package/package.json +1 -1
- package/src/database.cpp +1 -0
- package/src/duckdb/extension/json/buffered_json_reader.cpp +56 -17
- package/src/duckdb/extension/json/include/buffered_json_reader.hpp +56 -31
- package/src/duckdb/extension/json/include/json_common.hpp +5 -4
- package/src/duckdb/extension/json/include/json_executors.hpp +13 -18
- package/src/duckdb/extension/json/include/json_functions.hpp +3 -0
- package/src/duckdb/extension/json/include/json_scan.hpp +106 -153
- package/src/duckdb/extension/json/include/json_transform.hpp +2 -2
- package/src/duckdb/extension/json/json_common.cpp +1 -1
- package/src/duckdb/extension/json/json_functions/copy_json.cpp +94 -38
- package/src/duckdb/extension/json/json_functions/json_contains.cpp +7 -8
- package/src/duckdb/extension/json/json_functions/json_create.cpp +7 -7
- package/src/duckdb/extension/json/json_functions/json_merge_patch.cpp +4 -4
- package/src/duckdb/extension/json/json_functions/json_serialize_sql.cpp +4 -4
- package/src/duckdb/extension/json/json_functions/json_structure.cpp +7 -5
- package/src/duckdb/extension/json/json_functions/json_transform.cpp +10 -8
- package/src/duckdb/extension/json/json_functions/json_valid.cpp +1 -1
- package/src/duckdb/extension/json/json_functions/read_json.cpp +167 -169
- package/src/duckdb/extension/json/json_functions/read_json_objects.cpp +37 -16
- package/src/duckdb/extension/json/json_functions.cpp +11 -4
- package/src/duckdb/extension/json/json_scan.cpp +593 -374
- package/src/duckdb/extension/parquet/parquet-extension.cpp +5 -0
- package/src/duckdb/src/catalog/catalog_entry/macro_catalog_entry.cpp +42 -0
- package/src/duckdb/src/catalog/catalog_search_path.cpp +5 -0
- package/src/duckdb/src/catalog/catalog_set.cpp +1 -1
- package/src/duckdb/src/common/constants.cpp +1 -0
- package/src/duckdb/src/common/file_system.cpp +26 -6
- package/src/duckdb/src/common/local_file_system.cpp +0 -13
- package/src/duckdb/src/common/types/vector.cpp +3 -3
- package/src/duckdb/src/common/types/vector_buffer.cpp +11 -3
- package/src/duckdb/src/common/types/vector_cache.cpp +5 -5
- package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp +12 -6
- package/src/duckdb/src/execution/operator/persistent/csv_reader_options.cpp +10 -0
- package/src/duckdb/src/execution/operator/schema/physical_create_type.cpp +2 -2
- package/src/duckdb/src/function/macro_function.cpp +43 -0
- package/src/duckdb/src/function/pragma/pragma_queries.cpp +5 -3
- package/src/duckdb/src/function/scalar/strftime_format.cpp +1 -0
- package/src/duckdb/src/function/scalar_macro_function.cpp +10 -0
- package/src/duckdb/src/function/table/copy_csv.cpp +68 -18
- package/src/duckdb/src/function/table/read_csv.cpp +30 -3
- package/src/duckdb/src/function/table/version/pragma_version.cpp +8 -2
- package/src/duckdb/src/function/table_macro_function.cpp +10 -0
- package/src/duckdb/src/include/duckdb/catalog/catalog_entry/column_dependency_manager.hpp +1 -1
- package/src/duckdb/src/include/duckdb/catalog/catalog_entry/macro_catalog_entry.hpp +3 -1
- package/src/duckdb/src/include/duckdb/catalog/catalog_entry/scalar_macro_catalog_entry.hpp +0 -6
- package/src/duckdb/src/include/duckdb/catalog/catalog_entry/table_macro_catalog_entry.hpp +0 -6
- package/src/duckdb/src/include/duckdb/catalog/catalog_search_path.hpp +1 -1
- package/src/duckdb/src/include/duckdb/catalog/similar_catalog_entry.hpp +1 -1
- package/src/duckdb/src/include/duckdb/common/constants.hpp +2 -0
- package/src/duckdb/src/include/duckdb/common/exception.hpp +3 -3
- package/src/duckdb/src/include/duckdb/common/field_writer.hpp +3 -3
- package/src/duckdb/src/include/duckdb/common/file_system.hpp +5 -0
- package/src/duckdb/src/include/duckdb/common/http_state.hpp +2 -1
- package/src/duckdb/src/include/duckdb/common/hugeint.hpp +6 -6
- package/src/duckdb/src/include/duckdb/common/limits.hpp +46 -46
- package/src/duckdb/src/include/duckdb/common/operator/cast_operators.hpp +8 -8
- package/src/duckdb/src/include/duckdb/common/operator/comparison_operators.hpp +6 -6
- package/src/duckdb/src/include/duckdb/common/operator/convert_to_string.hpp +1 -1
- package/src/duckdb/src/include/duckdb/common/operator/decimal_cast_operators.hpp +2 -4
- package/src/duckdb/src/include/duckdb/common/operator/string_cast.hpp +1 -1
- package/src/duckdb/src/include/duckdb/common/operator/subtract.hpp +1 -1
- package/src/duckdb/src/include/duckdb/common/preserved_error.hpp +1 -1
- package/src/duckdb/src/include/duckdb/common/re2_regex.hpp +1 -1
- package/src/duckdb/src/include/duckdb/common/string_util.hpp +7 -7
- package/src/duckdb/src/include/duckdb/common/types/chunk_collection.hpp +10 -10
- package/src/duckdb/src/include/duckdb/common/types/column/column_data_collection.hpp +12 -12
- package/src/duckdb/src/include/duckdb/common/types/column/column_data_collection_iterators.hpp +2 -2
- package/src/duckdb/src/include/duckdb/common/types/value.hpp +1 -1
- package/src/duckdb/src/include/duckdb/common/types/vector_buffer.hpp +12 -2
- package/src/duckdb/src/include/duckdb/common/types.hpp +2 -2
- package/src/duckdb/src/include/duckdb/common/winapi.hpp +1 -1
- package/src/duckdb/src/include/duckdb/execution/expression_executor_state.hpp +1 -1
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_reader_options.hpp +9 -5
- package/src/duckdb/src/include/duckdb/execution/operator/schema/physical_create_type.hpp +1 -1
- package/src/duckdb/src/include/duckdb/function/aggregate_function.hpp +10 -14
- package/src/duckdb/src/include/duckdb/function/macro_function.hpp +7 -1
- package/src/duckdb/src/include/duckdb/function/scalar/strftime_format.hpp +3 -4
- package/src/duckdb/src/include/duckdb/function/scalar_macro_function.hpp +7 -2
- package/src/duckdb/src/include/duckdb/function/table_function.hpp +1 -1
- package/src/duckdb/src/include/duckdb/function/table_macro_function.hpp +5 -0
- package/src/duckdb/src/include/duckdb/function/udf_function.hpp +56 -50
- package/src/duckdb/src/include/duckdb/main/appender.hpp +2 -2
- package/src/duckdb/src/include/duckdb/main/client_context.hpp +2 -2
- package/src/duckdb/src/include/duckdb/main/client_data.hpp +3 -1
- package/src/duckdb/src/include/duckdb/main/connection.hpp +8 -9
- package/src/duckdb/src/include/duckdb/main/extension_entries.hpp +1 -0
- package/src/duckdb/src/include/duckdb/main/query_result.hpp +3 -3
- package/src/duckdb/src/include/duckdb/main/relation.hpp +6 -7
- package/src/duckdb/src/include/duckdb/optimizer/optimizer_extension.hpp +1 -1
- package/src/duckdb/src/include/duckdb/parser/column_list.hpp +7 -7
- package/src/duckdb/src/include/duckdb/parser/parsed_data/attach_info.hpp +4 -7
- package/src/duckdb/src/include/duckdb/parser/parsed_data/create_macro_info.hpp +8 -12
- package/src/duckdb/src/include/duckdb/parser/parsed_data/create_sequence_info.hpp +6 -20
- package/src/duckdb/src/include/duckdb/parser/parsed_data/create_type_info.hpp +6 -18
- package/src/duckdb/src/include/duckdb/parser/parsed_data/detach_info.hpp +4 -8
- package/src/duckdb/src/include/duckdb/parser/parsed_data/drop_info.hpp +4 -38
- package/src/duckdb/src/include/duckdb/parser/parsed_data/transaction_info.hpp +5 -2
- package/src/duckdb/src/include/duckdb/parser/parsed_data/vacuum_info.hpp +10 -10
- package/src/duckdb/src/include/duckdb/parser/parser_extension.hpp +2 -2
- package/src/duckdb/src/include/duckdb/parser/sql_statement.hpp +1 -1
- package/src/duckdb/src/include/duckdb/parser/statement/select_statement.hpp +1 -1
- package/src/duckdb/src/include/duckdb/planner/operator_extension.hpp +2 -2
- package/src/duckdb/src/include/duckdb/storage/storage_extension.hpp +2 -2
- package/src/duckdb/src/parser/parsed_data/attach_info.cpp +42 -0
- package/src/duckdb/src/parser/parsed_data/create_index_info.cpp +0 -7
- package/src/duckdb/src/parser/parsed_data/create_info.cpp +19 -8
- package/src/duckdb/src/parser/parsed_data/create_macro_info.cpp +46 -0
- package/src/duckdb/src/parser/parsed_data/create_sequence_info.cpp +56 -0
- package/src/duckdb/src/parser/parsed_data/create_type_info.cpp +47 -0
- package/src/duckdb/src/parser/parsed_data/detach_info.cpp +34 -0
- package/src/duckdb/src/parser/parsed_data/drop_info.cpp +46 -0
- package/src/duckdb/src/parser/parsed_data/transaction_info.cpp +24 -0
- package/src/duckdb/src/parser/parsed_data/vacuum_info.cpp +37 -0
- package/src/duckdb/src/planner/binder/expression/bind_star_expression.cpp +27 -9
- package/src/duckdb/src/planner/binder/statement/bind_copy.cpp +9 -4
- package/src/duckdb/src/planner/binder/statement/bind_create.cpp +2 -1
- package/src/duckdb/src/planner/binder/statement/bind_create_table.cpp +1 -0
- package/src/duckdb/src/planner/binder/tableref/bind_basetableref.cpp +1 -1
- package/src/duckdb/src/planner/logical_operator.cpp +1 -2
- package/src/duckdb/src/planner/operator/logical_create_index.cpp +16 -25
- package/src/duckdb/src/planner/operator/logical_insert.cpp +30 -0
- package/src/duckdb/src/planner/operator/logical_simple.cpp +33 -5
- package/src/duckdb/src/planner/parsed_data/bound_create_table_info.cpp +6 -16
- package/src/duckdb/src/planner/planner.cpp +4 -13
- package/src/duckdb/src/storage/checkpoint_manager.cpp +12 -6
- package/src/duckdb/src/storage/single_file_block_manager.cpp +0 -4
- package/src/duckdb/src/storage/storage_info.cpp +1 -1
- package/src/duckdb/third_party/libpg_query/src_backend_parser_gram.cpp +5735 -5773
- package/src/duckdb/ub_src_catalog_catalog_entry.cpp +1 -1
- package/src/duckdb/ub_src_parser_parsed_data.cpp +16 -0
- package/src/duckdb/src/catalog/catalog_entry/scalar_macro_catalog_entry.cpp +0 -104
package/src/duckdb/extension/json/include/json_scan.hpp

@@ -9,7 +9,10 @@
 #pragma once
 
 #include "buffered_json_reader.hpp"
+#include "duckdb/common/multi_file_reader.hpp"
 #include "duckdb/common/mutex.hpp"
+#include "duckdb/common/pair.hpp"
+#include "duckdb/common/types/type_map.hpp"
 #include "duckdb/function/scalar/strftime_format.hpp"
 #include "duckdb/function/table_function.hpp"
 #include "json_transform.hpp"
@@ -26,29 +29,29 @@ enum class JSONScanType : uint8_t {
 	SAMPLE = 3,
 };
 
-
-
-
-
-
-
-
-
-
-
-
-
+struct JSONString {
+public:
+	JSONString() {
+	}
+	JSONString(const char *pointer_p, idx_t size_p) : pointer(pointer_p), size(size_p) {
+	}
+
+	const char *pointer;
+	idx_t size;
+
+public:
+	string ToString() {
+		return string(pointer, size);
+	}
 
-
-
-	inline std::size_t operator()(const LogicalTypeId &id) const {
-		return (size_t)id;
+	const char &operator[](size_t i) const {
+		return pointer[i];
 	}
 };
 
 struct DateFormatMap {
 public:
-	void Initialize(const
+	void Initialize(const type_id_map_t<vector<const char *>> &format_templates) {
 		for (const auto &entry : format_templates) {
 			const auto &type = entry.first;
 			for (const auto &format_string : entry.second) {
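
The JSONString struct added above is a non-owning pointer-plus-length view into the scan buffer; it replaces the JSONLine helper that is removed further down in this header. A minimal standalone sketch of the same idea follows; idx_t is assumed here to be a 64-bit unsigned index, and the buffer contents are made up:

    #include <cstddef>
    #include <cstdint>
    #include <iostream>
    #include <string>

    using idx_t = uint64_t; // stand-in for duckdb::idx_t

    // Non-owning view over a slice of a larger buffer, mirroring JSONString above.
    struct JSONStringSketch {
        JSONStringSketch(const char *pointer_p, idx_t size_p) : pointer(pointer_p), size(size_p) {
        }
        const char *pointer;
        idx_t size;

        // Copy into an owned string only when one is actually needed.
        std::string ToString() const {
            return std::string(pointer, size);
        }
        const char &operator[](std::size_t i) const {
            return pointer[i];
        }
    };

    int main() {
        const std::string buffer = "{\"key\": 42}\n{\"key\": 43}\n";
        // View the first newline-delimited record without copying it out of the buffer.
        JSONStringSketch first(buffer.data(), buffer.find('\n'));
        std::cout << first.ToString() << " starts with '" << first[0] << "'\n";
        return 0;
    }
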
@@ -74,65 +77,79 @@ public:
 	}
 
 	StrpTimeFormat &GetFormat(LogicalTypeId type) {
-
+		D_ASSERT(candidate_formats.find(type) != candidate_formats.end());
+		return candidate_formats.find(type)->second.back();
+	}
+
+	const StrpTimeFormat &GetFormat(LogicalTypeId type) const {
+		D_ASSERT(candidate_formats.find(type) != candidate_formats.end());
+		return candidate_formats.find(type)->second.back();
 	}
 
 private:
-
+	type_id_map_t<vector<StrpTimeFormat>> candidate_formats;
 };
 
 struct JSONScanData : public TableFunctionData {
 public:
 	JSONScanData();
 
-
+	void Bind(ClientContext &context, TableFunctionBindInput &input);
+
+	void InitializeReaders(ClientContext &context);
 	void InitializeFormats();
 	void InitializeFormats(bool auto_detect);
+	void SetCompression(const string &compression);
 
-	void Serialize(FieldWriter &writer);
-	void Deserialize(FieldReader &reader);
+	void Serialize(FieldWriter &writer) const;
+	void Deserialize(ClientContext &context, FieldReader &reader);
 
 public:
 	//! Scan type
 	JSONScanType type;
+
 	//! File-specific options
 	BufferedJSONReaderOptions options;
+
+	//! Multi-file reader stuff
+	MultiFileReaderBindData reader_bind;
+
 	//! The files we're reading
-	vector<string>
+	vector<string> files;
+	//! Initial file reader
+	unique_ptr<BufferedJSONReader> initial_reader;
+	//! The readers
+	vector<unique_ptr<BufferedJSONReader>> union_readers;
 
 	//! Whether or not we should ignore malformed JSON (default to NULL)
 	bool ignore_errors = false;
-	//! Maximum JSON object size (defaults to
-	idx_t maximum_object_size =
-	//! Options when transforming the JSON to columnar data
-	JSONTransformOptions transform_options;
-
+	//! Maximum JSON object size (defaults to 16MB minimum)
+	idx_t maximum_object_size = 16777216;
 	//! Whether we auto-detect a schema
 	bool auto_detect = false;
 	//! Sample size for detecting schema
-	idx_t sample_size = STANDARD_VECTOR_SIZE;
-	//! Column names (in order)
-	vector<string> names;
-	//! Valid cols (ROW_TYPE cols are considered invalid)
-	vector<idx_t> valid_cols;
+	idx_t sample_size = idx_t(STANDARD_VECTOR_SIZE) * 10;
 	//! Max depth we go to detect nested JSON schema (defaults to unlimited)
 	idx_t max_depth = NumericLimits<idx_t>::Maximum();
-
-
+
+	//! All column names (in order)
+	vector<string> names;
+	//! Options when transforming the JSON to columnar data
+	JSONTransformOptions transform_options;
 	//! Forced date/timestamp formats
 	string date_format;
 	string timestamp_format;
-
-	//! Stored readers for when we're detecting the schema
-	vector<duckdb::unique_ptr<BufferedJSONReader>> stored_readers;
 	//! Candidate date formats
 	DateFormatMap date_format_map;
+
+	//! The inferred avg tuple size
+	idx_t avg_tuple_size = 420;
 };
 
 struct JSONScanInfo : public TableFunctionInfo {
 public:
 	explicit JSONScanInfo(JSONScanType type_p = JSONScanType::INVALID, JSONFormat format_p = JSONFormat::AUTO_DETECT,
-	                      JSONRecordType record_type_p = JSONRecordType::
+	                      JSONRecordType record_type_p = JSONRecordType::AUTO_DETECT, bool auto_detect_p = false)
 	    : type(type_p), format(format_p), record_type(record_type_p), auto_detect(auto_detect_p) {
 	}
 
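
DateFormatMap now keys its candidates with type_id_map_t and GetFormat returns the last surviving entry (back()). The header does not show how candidates get eliminated, so the loop below is only a guess at the intended usage, and the Matches helper is a toy stand-in for StrpTimeFormat rather than a real date parser:

    #include <algorithm>
    #include <cctype>
    #include <cstddef>
    #include <iostream>
    #include <string>
    #include <vector>

    // Toy matcher: '#' must line up with a digit, every other character must match exactly.
    static bool Matches(const std::string &pattern, const std::string &value) {
        if (pattern.size() != value.size()) {
            return false;
        }
        for (std::size_t i = 0; i < pattern.size(); i++) {
            const bool digit_ok = std::isdigit(static_cast<unsigned char>(value[i])) != 0;
            if (pattern[i] == '#' ? !digit_ok : pattern[i] != value[i]) {
                return false;
            }
        }
        return true;
    }

    int main() {
        // Candidate "formats" for one logical type, roughly playing the role of candidate_formats.
        std::vector<std::string> candidates = {"####-##-##", "##/##/####", "####.##.##"};
        const std::vector<std::string> sample = {"2023-04-01", "2023-12-31"};

        // Drop candidates that fail on any sampled value; GetFormat() above corresponds to back().
        for (const auto &value : sample) {
            candidates.erase(std::remove_if(candidates.begin(), candidates.end(),
                                            [&](const std::string &fmt) { return !Matches(fmt, value); }),
                             candidates.end());
        }
        std::cout << "surviving format: " << (candidates.empty() ? "<none>" : candidates.back()) << "\n";
        return 0;
    }
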
@@ -144,11 +161,17 @@ public:
 
 struct JSONScanGlobalState {
 public:
-	JSONScanGlobalState(ClientContext &context, JSONScanData &bind_data);
+	JSONScanGlobalState(ClientContext &context, const JSONScanData &bind_data);
 
 public:
 	//! Bound data
-	JSONScanData &bind_data;
+	const JSONScanData &bind_data;
+	//! Options when transforming the JSON to columnar data
+	JSONTransformOptions transform_options;
+
+	//! Column names that we're actually reading (after projection pushdown)
+	vector<string> names;
+	vector<column_t> column_indices;
 
 	//! Buffer manager allocator
 	Allocator &allocator;
@@ -157,7 +180,7 @@ public:
 
 	mutex lock;
 	//! One JSON reader per file
-	vector<
+	vector<optional_ptr<BufferedJSONReader>> json_readers;
 	//! Current file/batch index
 	idx_t file_index;
 	atomic<idx_t> batch_index;
@@ -166,62 +189,58 @@ public:
 	idx_t system_threads;
 };
 
-struct JSONLine {
-public:
-	JSONLine() {
-	}
-	JSONLine(const char *pointer_p, idx_t size_p) : pointer(pointer_p), size(size_p) {
-	}
-
-	const char *pointer;
-	idx_t size;
-
-public:
-	string ToString() {
-		return string(pointer, size);
-	}
-
-	const char &operator[](size_t i) const {
-		return pointer[i];
-	}
-};
-
 struct JSONScanLocalState {
 public:
 	JSONScanLocalState(ClientContext &context, JSONScanGlobalState &gstate);
 
 public:
 	idx_t ReadNext(JSONScanGlobalState &gstate);
-	yyjson_alc *GetAllocator();
 	void ThrowTransformError(idx_t object_index, const string &error_message);
 
+	yyjson_alc *GetAllocator();
+	const MultiFileReaderData &GetReaderData() const;
+
+public:
+	//! Current scan data
 	idx_t scan_count;
-
+	JSONString units[STANDARD_VECTOR_SIZE];
 	yyjson_val *values[STANDARD_VECTOR_SIZE];
 
-
-	idx_t array_offset;
-	yyjson_val *array_values[STANDARD_VECTOR_SIZE];
-
+	//! Batch index for order-preserving parallelism
 	idx_t batch_index;
 
 	//! Options when transforming the JSON to columnar data
 	DateFormatMap date_format_map;
 	JSONTransformOptions transform_options;
 
+	//! For determining average tuple size
+	idx_t total_read_size;
+	idx_t total_tuple_count;
+
 private:
-
-
+	bool ReadNextBuffer(JSONScanGlobalState &gstate);
+	void ReadNextBufferInternal(JSONScanGlobalState &gstate, idx_t &buffer_index);
+	void ReadNextBufferSeek(JSONScanGlobalState &gstate, idx_t &buffer_index);
+	void ReadNextBufferNoSeek(JSONScanGlobalState &gstate, idx_t &buffer_index);
+	void SkipOverArrayStart();
+
+	bool ReadAndAutoDetect(JSONScanGlobalState &gstate, idx_t &buffer_index, const bool already_incremented_file_idx);
+	void ReconstructFirstObject(JSONScanGlobalState &gstate);
+	void ParseNextChunk();
+
+	void ParseJSON(char *const json_start, const idx_t json_size, const idx_t remaining);
+	void ThrowObjectSizeError(const idx_t object_size);
+	void ThrowInvalidAtEndError();
 
 private:
 	//! Bind data
-	JSONScanData &bind_data;
+	const JSONScanData &bind_data;
 	//! Thread-local allocator
-	JSONAllocator
+	JSONAllocator allocator;
 
 	//! Current reader and buffer handle
-	BufferedJSONReader
-	JSONBufferHandle
+	optional_ptr<BufferedJSONReader> current_reader;
+	optional_ptr<JSONBufferHandle> current_buffer_handle;
 	//! Whether this is the last batch of the file
 	bool is_last;
 
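
The private helpers declared above (ReadNextBuffer, ReadNextBufferSeek/NoSeek, SkipOverArrayStart, ReconstructFirstObject, ParseNextChunk, ParseJSON) suggest a buffer-at-a-time scan loop that fills units/values up to STANDARD_VECTOR_SIZE per ReadNext call. The miniature below is only a guess at that control flow, using whole strings instead of yyjson values and ignoring records that straddle buffer boundaries (the case ReconstructFirstObject handles in the real reader):

    #include <cstddef>
    #include <iostream>
    #include <string>
    #include <utility>
    #include <vector>

    constexpr std::size_t STANDARD_VECTOR_SIZE = 2048; // DuckDB's default vector size

    // Miniature stand-in for JSONScanLocalState::ReadNext: pull newline-delimited records
    // out of the current buffer, refilling it from the "file" whenever it runs dry.
    struct MiniScanState {
        explicit MiniScanState(std::vector<std::string> file_buffers_p) : file_buffers(std::move(file_buffers_p)) {
        }

        std::size_t ReadNext(std::vector<std::string> &out) {
            out.clear();
            while (out.size() < STANDARD_VECTOR_SIZE) {
                if (position >= buffer.size() && !ReadNextBuffer()) {
                    break; // no more data in any buffer
                }
                ParseNextChunk(out);
            }
            return out.size();
        }

    private:
        bool ReadNextBuffer() {
            if (buffer_index >= file_buffers.size()) {
                return false;
            }
            buffer = file_buffers[buffer_index++];
            position = 0;
            return true;
        }

        void ParseNextChunk(std::vector<std::string> &out) {
            while (position < buffer.size() && out.size() < STANDARD_VECTOR_SIZE) {
                auto next = buffer.find('\n', position);
                if (next == std::string::npos) {
                    next = buffer.size();
                }
                out.emplace_back(buffer.substr(position, next - position));
                position = next + 1;
            }
        }

        std::vector<std::string> file_buffers;
        std::size_t buffer_index = 0;
        std::string buffer;
        std::size_t position = 0;
    };

    int main() {
        MiniScanState state({"{\"a\":1}\n{\"a\":2}", "{\"a\":3}"});
        std::vector<std::string> chunk;
        while (state.ReadNext(chunk) > 0) {
            std::cout << "read " << chunk.size() << " records\n";
        }
        return 0;
    }
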
@@ -234,26 +253,12 @@ private:
 
 	//! Buffer to reconstruct split values
 	AllocatedData reconstruct_buffer;
-	//! Copy of current buffer for YYJSON_READ_INSITU
-	AllocatedData current_buffer_copy;
-	const char *buffer_copy_ptr;
-
-private:
-	bool ReadNextBuffer(JSONScanGlobalState &gstate);
-	void ReadNextBuffer(JSONScanGlobalState &gstate, idx_t &buffer_index);
-	void ReadNextBufferSeek(JSONScanGlobalState &gstate, idx_t &buffer_index);
-	void ReadNextBufferNoSeek(JSONScanGlobalState &gstate, idx_t &buffer_index);
-
-	void ReconstructFirstObject(JSONScanGlobalState &gstate);
-
-	void ReadUnstructured(idx_t &count);
-	void ReadNewlineDelimited(idx_t &count);
 };
 
 struct JSONGlobalTableFunctionState : public GlobalTableFunctionState {
 public:
 	JSONGlobalTableFunctionState(ClientContext &context, TableFunctionInitInput &input);
-	static
+	static unique_ptr<GlobalTableFunctionState> Init(ClientContext &context, TableFunctionInitInput &input);
 	idx_t MaxThreads() const override;
 
 public:
@@ -263,8 +268,8 @@ public:
 struct JSONLocalTableFunctionState : public LocalTableFunctionState {
 public:
 	JSONLocalTableFunctionState(ClientContext &context, JSONScanGlobalState &gstate);
-	static
-
+	static unique_ptr<LocalTableFunctionState> Init(ExecutionContext &context, TableFunctionInitInput &input,
+	                                                GlobalTableFunctionState *global_state);
 	idx_t GetBatchIndex() const;
 
 public:
@@ -276,70 +281,18 @@ public:
 	static void AutoDetect(ClientContext &context, JSONScanData &bind_data, vector<LogicalType> &return_types,
 	                       vector<string> &names);
 
-	static
-
-
+	static double ScanProgress(ClientContext &context, const FunctionData *bind_data_p,
+	                           const GlobalTableFunctionState *global_state);
+	static idx_t GetBatchIndex(ClientContext &context, const FunctionData *bind_data_p,
+	                           LocalTableFunctionState *local_state, GlobalTableFunctionState *global_state);
+	static unique_ptr<NodeStatistics> Cardinality(ClientContext &context, const FunctionData *bind_data);
+	static void ComplexFilterPushdown(ClientContext &context, LogicalGet &get, FunctionData *bind_data_p,
+	                                  vector<unique_ptr<Expression>> &filters);
 
-	static
-
-		auto &gstate = ((JSONGlobalTableFunctionState &)*global_state).state;
-		double progress = 0;
-		for (auto &reader : gstate.json_readers) {
-			progress += reader->GetProgress();
-		}
-		return progress / double(gstate.json_readers.size());
-	}
+	static void Serialize(FieldWriter &writer, const FunctionData *bind_data_p, const TableFunction &function);
+	static unique_ptr<FunctionData> Deserialize(ClientContext &context, FieldReader &reader, TableFunction &function);
 
-	static
-	                              LocalTableFunctionState *local_state, GlobalTableFunctionState *global_state) {
-		auto &lstate = (JSONLocalTableFunctionState &)*local_state;
-		return lstate.GetBatchIndex();
-	}
-
-	static unique_ptr<NodeStatistics> JSONScanCardinality(ClientContext &context, const FunctionData *bind_data) {
-		auto &data = (JSONScanData &)*bind_data;
-		idx_t per_file_cardinality;
-		if (data.stored_readers.empty()) {
-			// The cardinality of an unknown JSON file is the almighty number 42 except when it's not
-			per_file_cardinality = 42;
-		} else {
-			// If we multiply the almighty number 42 by 10, we get the exact average size of a JSON
-			// Not really, but the average size of a lineitem row in JSON is around 360 bytes
-			per_file_cardinality = data.stored_readers[0]->GetFileHandle().FileSize() / 420;
-		}
-		// Obviously this can be improved but this is better than defaulting to 0
-		return make_uniq<NodeStatistics>(per_file_cardinality * data.file_paths.size());
-	}
-
-	static void JSONScanSerialize(FieldWriter &writer, const FunctionData *bind_data_p, const TableFunction &function) {
-		auto &bind_data = (JSONScanData &)*bind_data_p;
-		bind_data.Serialize(writer);
-	}
-
-	static duckdb::unique_ptr<FunctionData> JSONScanDeserialize(ClientContext &context, FieldReader &reader,
-	                                                            TableFunction &function) {
-		auto result = make_uniq<JSONScanData>();
-		result->Deserialize(reader);
-		return std::move(result);
-	}
-
-	static void TableFunctionDefaults(TableFunction &table_function) {
-		table_function.named_parameters["maximum_object_size"] = LogicalType::UINTEGER;
-		table_function.named_parameters["ignore_errors"] = LogicalType::BOOLEAN;
-		table_function.named_parameters["lines"] = LogicalType::VARCHAR;
-		table_function.named_parameters["compression"] = LogicalType::VARCHAR;
-
-		table_function.table_scan_progress = JSONScanProgress;
-		table_function.get_batch_index = JSONScanGetBatchIndex;
-		table_function.cardinality = JSONScanCardinality;
-
-		table_function.serialize = JSONScanSerialize;
-		table_function.deserialize = JSONScanDeserialize;
-
-		table_function.projection_pushdown = false;
-		table_function.filter_pushdown = false;
-		table_function.filter_prune = false;
-	}
+	static void TableFunctionDefaults(TableFunction &table_function);
 };
 
 } // namespace duckdb
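
The inline JSONScanCardinality removed above estimated per-file cardinality as file size divided by 420 bytes, and the new JSONScanData::avg_tuple_size field defaults to the same 420, so the out-of-line definition in json_scan.cpp presumably keeps a similar heuristic. A worked example of that arithmetic; the 42 MB file size is made up:

    #include <cstdint>
    #include <iostream>

    int main() {
        // Rows are estimated as file size in bytes divided by the assumed average tuple size.
        const uint64_t file_size_bytes = 42'000'000; // hypothetical 42 MB JSON file
        const uint64_t avg_tuple_size = 420;         // default from JSONScanData above
        std::cout << "estimated cardinality: " << file_size_bytes / avg_tuple_size << " rows\n";
        return 0;                                    // prints 100000
    }
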
package/src/duckdb/extension/json/include/json_transform.hpp

@@ -35,14 +35,14 @@ public:
 	//! Whether to delay the error when transforming (e.g., when non-strict casting or reading from file)
 	bool delay_error = false;
 	//! Date format used for parsing (can be NULL)
-	DateFormatMap
+	optional_ptr<DateFormatMap> date_format_map = nullptr;
 	//! String to store errors in
 	string error_message;
 	//! Index of the object where the error occurred
 	idx_t object_index = DConstants::INVALID_INDEX;
 
 public:
-	void Serialize(FieldWriter &writer);
+	void Serialize(FieldWriter &writer) const;
 	void Deserialize(FieldReader &reader);
 };
 
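
JSONTransformOptions now carries the date format map as optional_ptr<DateFormatMap> instead of a raw pointer. The sketch below only illustrates the general idea of such a wrapper (non-owning, possibly null, checked on dereference); it is not DuckDB's actual optional_ptr implementation:

    #include <iostream>
    #include <stdexcept>

    // Minimal stand-in for duckdb::optional_ptr: non-owning, possibly null, checked on access.
    template <class T>
    class OptionalPtr {
    public:
        OptionalPtr(T *ptr_p = nullptr) : ptr(ptr_p) {
        }
        explicit operator bool() const {
            return ptr != nullptr;
        }
        T &operator*() const {
            if (!ptr) {
                throw std::runtime_error("attempted to dereference a null optional_ptr");
            }
            return *ptr;
        }

    private:
        T *ptr;
    };

    int main() {
        int value = 7;
        OptionalPtr<int> unset;          // e.g. date_format_map when no format was configured
        OptionalPtr<int> present(&value);
        std::cout << (unset ? "set" : "unset") << ", *present = " << *present << "\n";
        return 0;
    }
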
package/src/duckdb/extension/json/json_common.cpp

@@ -5,7 +5,7 @@ namespace duckdb {
 string JSONCommon::ValToString(yyjson_val *val, idx_t max_len) {
 	JSONAllocator json_allocator(Allocator::DefaultAllocator());
 	idx_t len;
-	auto data = JSONCommon::WriteVal<yyjson_val>(val, json_allocator.
+	auto data = JSONCommon::WriteVal<yyjson_val>(val, json_allocator.GetYYAlc(), len);
 	if (max_len < len) {
 		return string(data, max_len) + "...";
 	} else {
package/src/duckdb/extension/json/json_functions/copy_json.cpp

@@ -11,11 +11,47 @@
 
 namespace duckdb {
 
+static void ThrowJSONCopyParameterException(const string &loption) {
+	throw BinderException("COPY (FORMAT JSON) parameter %s expects a single argument.");
+}
+
 static BoundStatement CopyToJSONPlan(Binder &binder, CopyStatement &stmt) {
 	auto stmt_copy = stmt.Copy();
 	auto &copy = stmt_copy->Cast<CopyStatement>();
 	auto &info = *copy.info;
 
+	// Parse the options, creating options for the CSV writer while doing so
+	string date_format;
+	string timestamp_format;
+	case_insensitive_map_t<vector<Value>> csv_copy_options;
+	for (const auto &kv : info.options) {
+		const auto &loption = StringUtil::Lower(kv.first);
+		if (loption == "dateformat" || loption == "date_format") {
+			if (kv.second.size() != 1) {
+				ThrowJSONCopyParameterException(loption);
+			}
+			date_format = StringValue::Get(kv.second.back());
+		} else if (loption == "timestampformat" || loption == "timestamp_format") {
+			if (kv.second.size() != 1) {
+				ThrowJSONCopyParameterException(loption);
+			}
+			timestamp_format = StringValue::Get(kv.second.back());
+		} else if (loption == "compression") {
+			csv_copy_options.insert(kv);
+		} else if (loption == "array") {
+			if (kv.second.size() > 1) {
+				ThrowJSONCopyParameterException(loption);
+			}
+			if (kv.second.empty() || BooleanValue::Get(kv.second.back().DefaultCastAs(LogicalTypeId::BOOLEAN))) {
+				csv_copy_options["prefix"] = {"[\n\t"};
+				csv_copy_options["suffix"] = {"\n]\n"};
+				csv_copy_options["new_line"] = {",\n\t"};
+			}
+		} else {
+			throw BinderException("Unknown option for COPY ... TO ... (FORMAT JSON): \"%s\".", loption);
+		}
+	}
+
 	// Bind the select statement of the original to resolve the types
 	auto dummy_binder = Binder::CreateBinder(binder.context, &binder, true);
 	auto bound_original = dummy_binder->Bind(*stmt.select_statement);
@@ -29,26 +65,24 @@ static BoundStatement CopyToJSONPlan(Binder &binder, CopyStatement &stmt) {
 	new_select_node.from_table = std::move(subquery_ref);
 
 	// Create new select list
-	vector<
+	vector<unique_ptr<ParsedExpression>> select_list;
 	select_list.reserve(bound_original.types.size());
 
 	// strftime if the user specified a format (loop also gives columns a name, needed for struct_pack)
 	// TODO: deal with date/timestamp within nested types
-
-	const auto timestamp_it = info.options.find("timestampformat");
-	vector<duckdb::unique_ptr<ParsedExpression>> strftime_children;
+	vector<unique_ptr<ParsedExpression>> strftime_children;
 	for (idx_t col_idx = 0; col_idx < bound_original.types.size(); col_idx++) {
 		auto column = make_uniq_base<ParsedExpression, PositionalReferenceExpression>(col_idx + 1);
-		strftime_children
+		strftime_children = vector<unique_ptr<ParsedExpression>>();
 		const auto &type = bound_original.types[col_idx];
 		const auto &name = bound_original.names[col_idx];
-		if (
+		if (!date_format.empty() && type == LogicalTypeId::DATE) {
 			strftime_children.emplace_back(std::move(column));
-			strftime_children.emplace_back(make_uniq<ConstantExpression>(
+			strftime_children.emplace_back(make_uniq<ConstantExpression>(date_format));
 			column = make_uniq<FunctionExpression>("strftime", std::move(strftime_children));
-		} else if (
+		} else if (!timestamp_format.empty() && type == LogicalTypeId::TIMESTAMP) {
 			strftime_children.emplace_back(std::move(column));
-			strftime_children.emplace_back(make_uniq<ConstantExpression>(
+			strftime_children.emplace_back(make_uniq<ConstantExpression>(timestamp_format));
 			column = make_uniq<FunctionExpression>("strftime", std::move(strftime_children));
 		}
 		column->alias = name;
@@ -63,6 +97,7 @@ static BoundStatement CopyToJSONPlan(Binder &binder, CopyStatement &stmt) {
 
 	// Now we can just use the CSV writer
 	info.format = "csv";
+	info.options = std::move(csv_copy_options);
 	info.options["quote"] = {""};
 	info.options["escape"] = {""};
 	info.options["delimiter"] = {"\n"};
@@ -71,49 +106,70 @@ static BoundStatement CopyToJSONPlan(Binder &binder, CopyStatement &stmt) {
 	return binder.Bind(*stmt_copy);
 }
 
-static
-
-                                vector<LogicalType> &expected_types) {
+static unique_ptr<FunctionData> CopyFromJSONBind(ClientContext &context, CopyInfo &info, vector<string> &expected_names,
+                                                 vector<LogicalType> &expected_types) {
 	auto bind_data = make_uniq<JSONScanData>();
+	bind_data->type = JSONScanType::READ_JSON;
+	bind_data->options.record_type = JSONRecordType::RECORDS;
+	bind_data->options.format = JSONFormat::NEWLINE_DELIMITED;
 
-	bind_data->
+	bind_data->files.emplace_back(info.file_path);
 	bind_data->names = expected_names;
-	for (idx_t col_idx = 0; col_idx < expected_names.size(); col_idx++) {
-		bind_data->valid_cols.emplace_back(col_idx);
-	}
 
-
-
-
-
-
-
-
-
-
-
-
+	bool auto_detect = false;
+	for (auto &kv : info.options) {
+		const auto &loption = StringUtil::Lower(kv.first);
+		if (loption == "dateformat" || loption == "date_format") {
+			if (kv.second.size() != 1) {
+				ThrowJSONCopyParameterException(loption);
+			}
+			bind_data->date_format = StringValue::Get(kv.second.back());
+		} else if (loption == "timestampformat" || loption == "timestamp_format") {
+			if (kv.second.size() != 1) {
+				ThrowJSONCopyParameterException(loption);
+			}
+			bind_data->timestamp_format = StringValue::Get(kv.second.back());
+		} else if (loption == "auto_detect") {
+			if (kv.second.empty()) {
+				auto_detect = true;
+			} else if (kv.second.size() != 1) {
+				ThrowJSONCopyParameterException(loption);
+			} else {
+				auto_detect = BooleanValue::Get(kv.second.back().DefaultCastAs(LogicalTypeId::BOOLEAN));
+			}
+		} else if (loption == "compression") {
+			if (kv.second.size() != 1) {
+				ThrowJSONCopyParameterException(loption);
+			}
+			bind_data->SetCompression(StringValue::Get(kv.second.back()));
+		} else if (loption == "array") {
+			if (kv.second.empty()) {
+				bind_data->options.format = JSONFormat::ARRAY;
+			} else if (kv.second.size() != 1) {
+				ThrowJSONCopyParameterException(loption);
+			} else if (BooleanValue::Get(kv.second.back().DefaultCastAs(LogicalTypeId::BOOLEAN))) {
+				bind_data->options.format = JSONFormat::ARRAY;
+			}
+		} else {
+			throw BinderException("Unknown option for COPY ... FROM ... (FORMAT JSON): \"%s\".", loption);
+		}
 	}
-
-
+	bind_data->InitializeFormats(auto_detect);
+	if (auto_detect && bind_data->options.format != JSONFormat::ARRAY) {
+		bind_data->options.format = JSONFormat::AUTO_DETECT;
 	}
 
 	bind_data->transform_options = JSONTransformOptions(true, true, true, true);
 	bind_data->transform_options.delay_error = true;
 
-
-	if (
-		// Wrap this with auto detect true/false so we can detect date/timestamp formats
-		// Note that auto_detect for names/types is not actually true because these are already know when we COPY
-		bind_data->InitializeFormats(true);
-		bind_data->options.format = JSONFormat::AUTO_DETECT;
-		bind_data->record_type = JSONRecordType::AUTO;
+	bind_data->InitializeReaders(context);
+	if (auto_detect) {
 		JSONScan::AutoDetect(context, *bind_data, expected_types, expected_names);
 		bind_data->auto_detect = true;
-	} else {
-		bind_data->InitializeFormats();
 	}
 
+	bind_data->transform_options.date_format_map = &bind_data->date_format_map;
+
 	return std::move(bind_data);
 }
 
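
CopyFromJSONBind above gains the matching read-side options (DATEFORMAT, TIMESTAMPFORMAT, COMPRESSION, AUTO_DETECT, ARRAY). A hedged round-trip of the file written in the previous sketch, again via the embedded C++ API; table and file names are made up:

    #include "duckdb.hpp"

    #include <iostream>

    int main() {
        duckdb::DuckDB db(nullptr);
        duckdb::Connection con(db);
        con.Query("CREATE TABLE events (id BIGINT, d DATE)");

        // Read back the array-formatted file written in the previous sketch.
        auto result = con.Query("COPY events FROM 'events.json' "
                                "(FORMAT JSON, ARRAY TRUE, DATEFORMAT '%Y/%m/%d')");
        if (result->HasError()) {
            std::cerr << result->GetError() << "\n";
        } else {
            std::cout << con.Query("SELECT count(*) FROM events")->ToString();
        }
        return 0;
    }
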
package/src/duckdb/extension/json/json_functions/json_contains.cpp

@@ -115,20 +115,19 @@ static void JSONContainsFunction(DataChunk &args, ExpressionState &state, Vector
 
 	if (needles.GetVectorType() == VectorType::CONSTANT_VECTOR) {
 		auto &needle_str = *ConstantVector::GetData<string_t>(needles);
-		auto needle_doc =
-		    JSONCommon::ReadDocument(needle_str, JSONCommon::READ_FLAG, lstate.json_allocator.GetYYJSONAllocator());
+		auto needle_doc = JSONCommon::ReadDocument(needle_str, JSONCommon::READ_FLAG, lstate.json_allocator.GetYYAlc());
 		UnaryExecutor::Execute<string_t, bool>(haystacks, result, args.size(), [&](string_t haystack_str) {
-			auto haystack_doc =
-
+			auto haystack_doc =
+			    JSONCommon::ReadDocument(haystack_str, JSONCommon::READ_FLAG, lstate.json_allocator.GetYYAlc());
 			return JSONContains(haystack_doc->root, needle_doc->root);
 		});
 	} else {
 		BinaryExecutor::Execute<string_t, string_t, bool>(
 		    haystacks, needles, result, args.size(), [&](string_t haystack_str, string_t needle_str) {
-			    auto needle_doc =
-
-			    auto haystack_doc =
-
+			    auto needle_doc =
+			        JSONCommon::ReadDocument(needle_str, JSONCommon::READ_FLAG, lstate.json_allocator.GetYYAlc());
+			    auto haystack_doc =
+			        JSONCommon::ReadDocument(haystack_str, JSONCommon::READ_FLAG, lstate.json_allocator.GetYYAlc());
 			    return JSONContains(haystack_doc->root, needle_doc->root);
 		    });
 	}