duckdb 0.7.1-dev2.0 → 0.7.1-dev240.0
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
- package/binding.gyp +7 -7
- package/package.json +1 -1
- package/src/duckdb/extension/json/buffered_json_reader.cpp +50 -9
- package/src/duckdb/extension/json/include/buffered_json_reader.hpp +7 -2
- package/src/duckdb/extension/json/include/json_common.hpp +2 -2
- package/src/duckdb/extension/json/include/json_scan.hpp +29 -10
- package/src/duckdb/extension/json/json_functions/copy_json.cpp +35 -22
- package/src/duckdb/extension/json/json_functions/json_create.cpp +8 -8
- package/src/duckdb/extension/json/json_functions/json_transform.cpp +47 -8
- package/src/duckdb/extension/json/json_functions/read_json.cpp +104 -49
- package/src/duckdb/extension/json/json_functions/read_json_objects.cpp +5 -3
- package/src/duckdb/extension/json/json_functions.cpp +6 -0
- package/src/duckdb/extension/json/json_scan.cpp +144 -34
- package/src/duckdb/extension/parquet/parquet-extension.cpp +3 -2
- package/src/duckdb/src/common/enums/logical_operator_type.cpp +2 -0
- package/src/duckdb/src/common/enums/physical_operator_type.cpp +2 -0
- package/src/duckdb/src/common/enums/statement_type.cpp +2 -0
- package/src/duckdb/src/common/file_system.cpp +14 -0
- package/src/duckdb/src/common/hive_partitioning.cpp +1 -0
- package/src/duckdb/src/common/operator/cast_operators.cpp +14 -8
- package/src/duckdb/src/common/printer.cpp +1 -1
- package/src/duckdb/src/common/types/time.cpp +1 -1
- package/src/duckdb/src/common/types/timestamp.cpp +35 -4
- package/src/duckdb/src/common/types.cpp +36 -10
- package/src/duckdb/src/execution/column_binding_resolver.cpp +5 -2
- package/src/duckdb/src/execution/operator/join/physical_iejoin.cpp +7 -9
- package/src/duckdb/src/execution/operator/persistent/base_csv_reader.cpp +6 -11
- package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp +13 -13
- package/src/duckdb/src/execution/operator/persistent/parallel_csv_reader.cpp +1 -1
- package/src/duckdb/src/execution/operator/schema/physical_detach.cpp +37 -0
- package/src/duckdb/src/execution/operator/schema/physical_drop.cpp +0 -5
- package/src/duckdb/src/execution/physical_plan/plan_simple.cpp +4 -0
- package/src/duckdb/src/execution/physical_plan_generator.cpp +1 -0
- package/src/duckdb/src/function/pragma/pragma_queries.cpp +36 -9
- package/src/duckdb/src/function/table/read_csv.cpp +15 -4
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/common/enums/logical_operator_type.hpp +1 -0
- package/src/duckdb/src/include/duckdb/common/enums/physical_operator_type.hpp +1 -0
- package/src/duckdb/src/include/duckdb/common/enums/statement_type.hpp +3 -2
- package/src/duckdb/src/include/duckdb/common/exception.hpp +10 -0
- package/src/duckdb/src/include/duckdb/common/file_system.hpp +1 -0
- package/src/duckdb/src/include/duckdb/common/hive_partitioning.hpp +9 -1
- package/src/duckdb/src/include/duckdb/common/radix_partitioning.hpp +4 -4
- package/src/duckdb/src/include/duckdb/common/types/timestamp.hpp +5 -1
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/base_csv_reader.hpp +1 -3
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/buffered_csv_reader.hpp +0 -2
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_reader_options.hpp +2 -0
- package/src/duckdb/src/include/duckdb/execution/operator/schema/physical_detach.hpp +32 -0
- package/src/duckdb/src/include/duckdb/main/client_data.hpp +2 -2
- package/src/duckdb/src/include/duckdb/main/config.hpp +0 -3
- package/src/duckdb/src/include/duckdb/parser/parsed_data/create_database_info.hpp +0 -4
- package/src/duckdb/src/include/duckdb/parser/parsed_data/detach_info.hpp +32 -0
- package/src/duckdb/src/include/duckdb/parser/query_node/select_node.hpp +1 -1
- package/src/duckdb/src/include/duckdb/parser/sql_statement.hpp +2 -2
- package/src/duckdb/src/include/duckdb/parser/statement/copy_statement.hpp +1 -1
- package/src/duckdb/src/include/duckdb/parser/statement/detach_statement.hpp +29 -0
- package/src/duckdb/src/include/duckdb/parser/statement/list.hpp +1 -0
- package/src/duckdb/src/include/duckdb/parser/statement/select_statement.hpp +3 -3
- package/src/duckdb/src/include/duckdb/parser/tableref/subqueryref.hpp +1 -1
- package/src/duckdb/src/include/duckdb/parser/tokens.hpp +1 -0
- package/src/duckdb/src/include/duckdb/parser/transformer.hpp +1 -0
- package/src/duckdb/src/include/duckdb/planner/binder.hpp +1 -0
- package/src/duckdb/src/include/duckdb/planner/operator/logical_execute.hpp +1 -5
- package/src/duckdb/src/include/duckdb/planner/operator/logical_show.hpp +1 -2
- package/src/duckdb/src/include/duckdb/storage/storage_extension.hpp +7 -0
- package/src/duckdb/src/include/duckdb/storage/table/update_segment.hpp +2 -0
- package/src/duckdb/src/main/client_context.cpp +2 -0
- package/src/duckdb/src/main/extension/extension_alias.cpp +2 -1
- package/src/duckdb/src/optimizer/statistics/operator/propagate_join.cpp +2 -6
- package/src/duckdb/src/parser/statement/copy_statement.cpp +2 -13
- package/src/duckdb/src/parser/statement/delete_statement.cpp +3 -0
- package/src/duckdb/src/parser/statement/detach_statement.cpp +15 -0
- package/src/duckdb/src/parser/statement/insert_statement.cpp +9 -0
- package/src/duckdb/src/parser/statement/update_statement.cpp +3 -0
- package/src/duckdb/src/parser/transform/expression/transform_case.cpp +3 -3
- package/src/duckdb/src/parser/transform/statement/transform_create_database.cpp +0 -1
- package/src/duckdb/src/parser/transform/statement/transform_detach.cpp +19 -0
- package/src/duckdb/src/parser/transformer.cpp +2 -0
- package/src/duckdb/src/planner/binder/expression/bind_aggregate_expression.cpp +3 -0
- package/src/duckdb/src/planner/binder/statement/bind_copy.cpp +6 -3
- package/src/duckdb/src/planner/binder/statement/bind_create.cpp +16 -14
- package/src/duckdb/src/planner/binder/statement/bind_detach.cpp +19 -0
- package/src/duckdb/src/planner/binder/statement/bind_drop.cpp +29 -4
- package/src/duckdb/src/planner/binder/statement/bind_insert.cpp +22 -1
- package/src/duckdb/src/planner/binder/tableref/bind_joinref.cpp +2 -1
- package/src/duckdb/src/planner/binder.cpp +2 -0
- package/src/duckdb/src/planner/expression_binder/lateral_binder.cpp +21 -5
- package/src/duckdb/src/planner/logical_operator.cpp +4 -0
- package/src/duckdb/src/planner/planner.cpp +1 -0
- package/src/duckdb/src/storage/storage_info.cpp +2 -1
- package/src/duckdb/src/storage/table/column_data.cpp +4 -2
- package/src/duckdb/src/storage/table/update_segment.cpp +15 -0
- package/src/duckdb/third_party/fmt/include/fmt/core.h +1 -2
- package/src/duckdb/third_party/libpg_query/include/nodes/nodes.hpp +1 -0
- package/src/duckdb/third_party/libpg_query/include/nodes/parsenodes.hpp +14 -0
- package/src/duckdb/third_party/libpg_query/include/parser/gram.hpp +530 -1006
- package/src/duckdb/third_party/libpg_query/src_backend_parser_gram.cpp +17659 -17626
- package/src/duckdb/third_party/thrift/thrift/Thrift.h +8 -2
- package/src/duckdb/ub_extension_icu_third_party_icu_i18n.cpp +4 -4
- package/src/duckdb/ub_src_execution_operator_schema.cpp +2 -0
- package/src/duckdb/ub_src_parser_statement.cpp +2 -0
- package/src/duckdb/ub_src_parser_transform_statement.cpp +2 -0
- package/src/duckdb/ub_src_planner_binder_statement.cpp +2 -0
- package/src/duckdb/src/include/duckdb/function/create_database_extension.hpp +0 -37
@@ -13,63 +13,88 @@ void JSONScan::AutoDetect(ClientContext &context, JSONScanData &bind_data, vecto
     JSONScanLocalState lstate(context, gstate);
     ArenaAllocator allocator(BufferAllocator::Get(context));
 
-    static const unordered_map<LogicalTypeId, vector<const char *>, LogicalTypeIdHash> FORMAT_TEMPLATES = {
-        {LogicalTypeId::DATE, {"%m-%d-%Y", "%m-%d-%y", "%d-%m-%Y", "%d-%m-%y", "%Y-%m-%d", "%y-%m-%d"}},
-        {LogicalTypeId::TIMESTAMP,
-         {"%Y-%m-%d %H:%M:%S.%f", "%m-%d-%Y %I:%M:%S %p", "%m-%d-%y %I:%M:%S %p", "%d-%m-%Y %H:%M:%S",
-          "%d-%m-%y %H:%M:%S", "%Y-%m-%d %H:%M:%S", "%y-%m-%d %H:%M:%S"}},
-    };
-
-    // Populate possible date/timestamp formats, assume this is consistent across columns
-    for (auto &kv : FORMAT_TEMPLATES) {
-        const auto &type = kv.first;
-        if (bind_data.date_format_map.HasFormats(type)) {
-            continue; // Already populated
-        }
-        const auto &format_strings = kv.second;
-        for (auto &format_string : format_strings) {
-            bind_data.date_format_map.AddFormat(type, format_string);
-        }
-    }
-
     // Read for the specified sample size
     JSONStructureNode node;
+    bool more_than_one = false;
     Vector string_vector(LogicalType::VARCHAR);
     idx_t remaining = bind_data.sample_size;
     while (remaining != 0) {
         allocator.Reset();
         auto read_count = lstate.ReadNext(gstate);
+        if (lstate.scan_count > 1) {
+            more_than_one = true;
+        }
         if (read_count == 0) {
             break;
         }
         idx_t next = MinValue<idx_t>(read_count, remaining);
+        yyjson_val **values;
+        if (bind_data.record_type == JSONRecordType::ARRAY_OF_RECORDS ||
+            bind_data.record_type == JSONRecordType::ARRAY_OF_JSON) {
+            values = lstate.array_values;
+        } else {
+            values = lstate.values;
+        }
         for (idx_t i = 0; i < next; i++) {
-            if (
-                JSONStructure::ExtractStructure(
+            if (values[i]) {
+                JSONStructure::ExtractStructure(values[i], node);
             }
         }
         if (!node.ContainsVarchar()) { // Can't refine non-VARCHAR types
             continue;
         }
         node.InitializeCandidateTypes(bind_data.max_depth);
-        node.RefineCandidateTypes(
+        node.RefineCandidateTypes(values, next, string_vector, allocator, bind_data.date_format_map);
         remaining -= next;
+
+        if (gstate.file_index == 10) {
+            // We really shouldn't open more than 10 files when sampling
+            break;
+        }
     }
     bind_data.type = original_scan_type;
-    bind_data.transform_options.date_format_map = &bind_data.date_format_map;
 
-
-
-
-
-
-
-
-
-
-
-
-
+    // Convert structure to logical type
+    auto type = JSONStructure::StructureToType(context, node, bind_data.max_depth);
+
+    // Detect record type
+    if (bind_data.record_type == JSONRecordType::AUTO) {
+        switch (type.id()) {
+        case LogicalTypeId::STRUCT:
+            bind_data.record_type = JSONRecordType::RECORDS;
+            break;
+        case LogicalTypeId::LIST: {
+            if (more_than_one) {
+                bind_data.record_type = JSONRecordType::JSON;
+            } else {
+                type = ListType::GetChildType(type);
+                if (type.id() == LogicalTypeId::STRUCT) {
+                    bind_data.record_type = JSONRecordType::ARRAY_OF_RECORDS;
+                } else {
+                    bind_data.record_type = JSONRecordType::ARRAY_OF_JSON;
+                }
+            }
+            break;
+        }
+        default:
+            bind_data.record_type = JSONRecordType::JSON;
+        }
+    }
+
+    // Detect return type
+    if (bind_data.auto_detect) {
+        bind_data.transform_options.date_format_map = &bind_data.date_format_map;
+        if (type.id() != LogicalTypeId::STRUCT) {
+            return_types.emplace_back(type);
+            names.emplace_back("json");
+        } else {
+            const auto &child_types = StructType::GetChildTypes(type);
+            return_types.reserve(child_types.size());
+            names.reserve(child_types.size());
+            for (auto &child_type : child_types) {
+                return_types.emplace_back(child_type.second);
+                names.emplace_back(child_type.first);
+            }
+        }
+    }
 
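The new record-type detection above is the heart of this change: after sampling, the merged JSONStructureNode is converted to a LogicalType and the input is classified as a stream of records (STRUCT), a single top-level array (LIST, split further by element type), or plain JSON values. As a rough standalone sketch of the same classification against one document, using only the public yyjson API (the enum and function names here are illustrative, not DuckDB's):

#include <cstdio>
#include <cstring>
#include "yyjson.h"

// Illustrative mirror of the outcomes the diff selects between; DuckDB's
// JSONRecordType also has AUTO and RECORDS-vs-JSON variants.
enum class RecordType { RECORDS, ARRAY_OF_RECORDS, ARRAY_OF_JSON, JSON };

static RecordType ClassifyRoot(const char *json, size_t len) {
    yyjson_doc *doc = yyjson_read(json, len, 0);
    if (!doc) {
        return RecordType::JSON; // unparseable input: leave it to error handling
    }
    yyjson_val *root = yyjson_doc_get_root(doc);
    RecordType result;
    if (yyjson_is_obj(root)) {
        result = RecordType::RECORDS; // one object per value -> one row per value
    } else if (yyjson_is_arr(root)) {
        // a top-level array is unnested; the element type decides the column shape
        yyjson_val *first = yyjson_arr_get_first(root);
        result = (first && yyjson_is_obj(first)) ? RecordType::ARRAY_OF_RECORDS : RecordType::ARRAY_OF_JSON;
    } else {
        result = RecordType::JSON; // scalars etc. come back as a single "json" column
    }
    yyjson_doc_free(doc);
    return result;
}

int main() {
    const char *input = "[{\"a\": 1}, {\"a\": 2}]";
    printf("%d\n", (int)ClassifyRoot(input, strlen(input))); // prints 1 (ARRAY_OF_RECORDS)
}

DuckDB's real detection works on the merged structure of up to sample_size values and falls back to plain JSON when more than one top-level array is seen, but the decision tree is the same.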
@@ -150,6 +175,22 @@ void JSONScan::InitializeBindData(ClientContext &context, JSONScanData &bind_dat
             if (!error.empty()) {
                 throw InvalidInputException("Could not parse TIMESTAMPFORMAT: %s", error.c_str());
             }
+        } else if (loption == "json_format") {
+            auto arg = StringValue::Get(kv.second);
+            if (arg == "records") {
+                bind_data.record_type = JSONRecordType::RECORDS;
+            } else if (arg == "array_of_records") {
+                bind_data.record_type = JSONRecordType::ARRAY_OF_RECORDS;
+            } else if (arg == "values") {
+                bind_data.record_type = JSONRecordType::JSON;
+            } else if (arg == "array_of_values") {
+                bind_data.record_type = JSONRecordType::ARRAY_OF_JSON;
+            } else if (arg == "auto") {
+                bind_data.record_type = JSONRecordType::AUTO;
+            } else {
+                throw InvalidInputException("\"json_format\" must be one of ['records', 'array_of_records', 'json', "
+                                            "'array_of_json', 'auto']");
+            }
         }
     }
 }
@@ -170,7 +211,7 @@ unique_ptr<FunctionData> ReadJSONBind(ClientContext &context, TableFunctionBindI
 
     bind_data.InitializeFormats();
 
-    if (bind_data.auto_detect) {
+    if (bind_data.auto_detect || bind_data.record_type == JSONRecordType::AUTO) {
         JSONScan::AutoDetect(context, bind_data, return_types, names);
         bind_data.names = names;
     }
@@ -189,9 +230,16 @@ static void ReadJSONFunction(ClientContext &context, TableFunctionInput &data_p,
     auto &gstate = ((JSONGlobalTableFunctionState &)*data_p.global_state).state;
     auto &lstate = ((JSONLocalTableFunctionState &)*data_p.local_state).state;
 
-    // Fetch next lines
     const auto count = lstate.ReadNext(gstate);
-
+    yyjson_val **values;
+    if (gstate.bind_data.record_type == JSONRecordType::ARRAY_OF_RECORDS ||
+        gstate.bind_data.record_type == JSONRecordType::ARRAY_OF_JSON) {
+        values = lstate.array_values;
+    } else {
+        D_ASSERT(gstate.bind_data.record_type != JSONRecordType::AUTO);
+        values = lstate.values;
+    }
+    output.SetCardinality(count);
 
     vector<Vector *> result_vectors;
     result_vectors.reserve(output.ColumnCount());
@@ -202,22 +250,23 @@ static void ReadJSONFunction(ClientContext &context, TableFunctionInput &data_p,
 
     // Pass current reader to transform options so we can get line number information if an error occurs
     bool success;
-    if (gstate.bind_data.
-
+    if (gstate.bind_data.record_type == JSONRecordType::RECORDS ||
+        gstate.bind_data.record_type == JSONRecordType::ARRAY_OF_RECORDS) {
+        success = JSONTransform::TransformObject(values, lstate.GetAllocator(), count, gstate.bind_data.names,
                                                  result_vectors, lstate.transform_options);
     } else {
-        success = JSONTransform::Transform(
+        success = JSONTransform::Transform(values, lstate.GetAllocator(), *result_vectors[0], count,
                                            lstate.transform_options);
     }
+
     if (!success) {
         string hint = gstate.bind_data.auto_detect
                           ? "\nTry increasing 'sample_size', reducing 'maximum_depth', specifying 'columns' manually, "
-                            "or setting 'ignore_errors' to true."
-                          : "";
-        lstate.ThrowTransformError(
+                            "specifying 'lines' or 'json_format' manually, or setting 'ignore_errors' to true."
+                          : "\n Try specifying 'lines' or 'json_format' manually, or setting 'ignore_errors' to true.";
+        lstate.ThrowTransformError(lstate.transform_options.object_index,
                                    lstate.transform_options.error_message + hint);
     }
-    output.SetCardinality(count);
 }
 
 TableFunction JSONFunctions::GetReadJSONTableFunction(bool list_parameter, shared_ptr<JSONScanInfo> function_info) {
@@ -233,8 +282,10 @@ TableFunction JSONFunctions::GetReadJSONTableFunction(bool list_parameter, share
     table_function.named_parameters["date_format"] = LogicalType::VARCHAR;
     table_function.named_parameters["timestampformat"] = LogicalType::VARCHAR;
     table_function.named_parameters["timestamp_format"] = LogicalType::VARCHAR;
+    table_function.named_parameters["json_format"] = LogicalType::VARCHAR;
 
     table_function.projection_pushdown = true;
+    // TODO: might be able to do filter pushdown/prune too
 
     table_function.function_info = std::move(function_info);
 
@@ -249,7 +300,8 @@ TableFunction GetReadJSONAutoTableFunction(bool list_parameter, shared_ptr<JSONS
 
 CreateTableFunctionInfo JSONFunctions::GetReadJSONFunction() {
     TableFunctionSet function_set("read_json");
-    auto function_info =
+    auto function_info =
+        make_shared<JSONScanInfo>(JSONScanType::READ_JSON, JSONFormat::UNSTRUCTURED, JSONRecordType::RECORDS, false);
     function_set.AddFunction(JSONFunctions::GetReadJSONTableFunction(false, function_info));
     function_set.AddFunction(JSONFunctions::GetReadJSONTableFunction(true, function_info));
     return CreateTableFunctionInfo(function_set);
@@ -257,7 +309,8 @@ CreateTableFunctionInfo JSONFunctions::GetReadJSONFunction() {
 
 CreateTableFunctionInfo JSONFunctions::GetReadNDJSONFunction() {
     TableFunctionSet function_set("read_ndjson");
-    auto function_info = make_shared<JSONScanInfo>(JSONScanType::READ_JSON, JSONFormat::NEWLINE_DELIMITED,
+    auto function_info = make_shared<JSONScanInfo>(JSONScanType::READ_JSON, JSONFormat::NEWLINE_DELIMITED,
+                                                   JSONRecordType::RECORDS, false);
     function_set.AddFunction(JSONFunctions::GetReadJSONTableFunction(false, function_info));
     function_set.AddFunction(JSONFunctions::GetReadJSONTableFunction(true, function_info));
     return CreateTableFunctionInfo(function_set);
@@ -265,7 +318,8 @@ CreateTableFunctionInfo JSONFunctions::GetReadNDJSONFunction() {
 
 CreateTableFunctionInfo JSONFunctions::GetReadJSONAutoFunction() {
     TableFunctionSet function_set("read_json_auto");
-    auto function_info =
+    auto function_info =
+        make_shared<JSONScanInfo>(JSONScanType::READ_JSON, JSONFormat::AUTO_DETECT, JSONRecordType::AUTO, true);
     function_set.AddFunction(GetReadJSONAutoTableFunction(false, function_info));
     function_set.AddFunction(GetReadJSONAutoTableFunction(true, function_info));
     return CreateTableFunctionInfo(function_set);
@@ -273,7 +327,8 @@ CreateTableFunctionInfo JSONFunctions::GetReadJSONAutoFunction() {
 
 CreateTableFunctionInfo JSONFunctions::GetReadNDJSONAutoFunction() {
     TableFunctionSet function_set("read_ndjson_auto");
-    auto function_info =
+    auto function_info =
+        make_shared<JSONScanInfo>(JSONScanType::READ_JSON, JSONFormat::NEWLINE_DELIMITED, JSONRecordType::AUTO, true);
     function_set.AddFunction(GetReadJSONAutoTableFunction(false, function_info));
     function_set.AddFunction(GetReadJSONAutoTableFunction(true, function_info));
     return CreateTableFunctionInfo(function_set);
@@ -20,7 +20,7 @@ static void ReadJSONObjectsFunction(ClientContext &context, TableFunctionInput &
     // Fetch next lines
     const auto count = lstate.ReadNext(gstate);
     const auto lines = lstate.lines;
-    const auto objects = lstate.
+    const auto objects = lstate.values;
 
     // Create the strings without copying them
     auto strings = FlatVector::GetData<string_t>(output.data[0]);
@@ -48,7 +48,8 @@ TableFunction GetReadJSONObjectsTableFunction(bool list_parameter, shared_ptr<JS
 
 CreateTableFunctionInfo JSONFunctions::GetReadJSONObjectsFunction() {
     TableFunctionSet function_set("read_json_objects");
-    auto function_info =
+    auto function_info =
+        make_shared<JSONScanInfo>(JSONScanType::READ_JSON_OBJECTS, JSONFormat::UNSTRUCTURED, JSONRecordType::JSON);
     function_set.AddFunction(GetReadJSONObjectsTableFunction(false, function_info));
     function_set.AddFunction(GetReadJSONObjectsTableFunction(true, function_info));
     return CreateTableFunctionInfo(function_set);
@@ -56,7 +57,8 @@ CreateTableFunctionInfo JSONFunctions::GetReadJSONObjectsFunction() {
 
 CreateTableFunctionInfo JSONFunctions::GetReadNDJSONObjectsFunction() {
     TableFunctionSet function_set("read_ndjson_objects");
-    auto function_info =
+    auto function_info =
+        make_shared<JSONScanInfo>(JSONScanType::READ_JSON_OBJECTS, JSONFormat::NEWLINE_DELIMITED, JSONRecordType::JSON);
     function_set.AddFunction(GetReadJSONObjectsTableFunction(false, function_info));
     function_set.AddFunction(GetReadJSONObjectsTableFunction(true, function_info));
     return CreateTableFunctionInfo(function_set);
@@ -166,6 +166,12 @@ vector<CreateTableFunctionInfo> JSONFunctions::GetTableFunctions() {
 unique_ptr<TableRef> JSONFunctions::ReadJSONReplacement(ClientContext &context, const string &table_name,
                                                         ReplacementScanData *data) {
     auto lower_name = StringUtil::Lower(table_name);
+    // remove any compression
+    if (StringUtil::EndsWith(lower_name, ".gz")) {
+        lower_name = lower_name.substr(0, lower_name.size() - 3);
+    } else if (StringUtil::EndsWith(lower_name, ".zst")) {
+        lower_name = lower_name.substr(0, lower_name.size() - 4);
+    }
     if (!StringUtil::EndsWith(lower_name, ".json") && !StringUtil::Contains(lower_name, ".json?") &&
         !StringUtil::EndsWith(lower_name, ".ndjson") && !StringUtil::Contains(lower_name, ".ndjson?")) {
         return nullptr;
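The replacement scan (what lets a query reference 'data.json' directly as a table) now strips a trailing .gz or .zst before testing for the .json/.ndjson suffixes, so compressed files are picked up too. The suffix handling itself amounts to this, sketched as a hypothetical standalone helper:

#include <string>

// Hypothetical helper mirroring the hunk above: drop a known compression
// extension so "data.json.gz" is matched on "data.json".
static std::string StripCompressionSuffix(std::string name) {
    auto ends_with = [&](const char *suffix) {
        const std::size_t n = std::char_traits<char>::length(suffix);
        return name.size() >= n && name.compare(name.size() - n, n, suffix) == 0;
    };
    if (ends_with(".gz")) {
        name.resize(name.size() - 3);
    } else if (ends_with(".zst")) {
        name.resize(name.size() - 4);
    }
    return name;
}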
@@ -1,6 +1,7 @@
 #include "json_scan.hpp"
 
 #include "duckdb/main/database.hpp"
+#include "duckdb/main/extension_helper.hpp"
 #include "duckdb/parallel/task_scheduler.hpp"
 #include "duckdb/storage/buffer_manager.hpp"
 
@@ -19,8 +20,9 @@ unique_ptr<FunctionData> JSONScanData::Bind(ClientContext &context, TableFunctio
     auto &options = result->options;
 
     auto &info = (JSONScanInfo &)*input.info;
-    options.format = info.format;
     result->type = info.type;
+    options.format = info.format;
+    result->record_type = info.record_type;
     result->auto_detect = info.auto_detect;
 
     vector<string> patterns;
@@ -39,16 +41,16 @@ unique_ptr<FunctionData> JSONScanData::Bind(ClientContext &context, TableFunctio
             result->ignore_errors = BooleanValue::Get(kv.second);
         } else if (loption == "maximum_object_size") {
             result->maximum_object_size = MaxValue<idx_t>(UIntegerValue::Get(kv.second), result->maximum_object_size);
-        } else if (loption == "
+        } else if (loption == "lines") {
             auto format = StringUtil::Lower(StringValue::Get(kv.second));
             if (format == "auto") {
                 options.format = JSONFormat::AUTO_DETECT;
-            } else if (format == "
+            } else if (format == "false") {
                 options.format = JSONFormat::UNSTRUCTURED;
-            } else if (format == "
+            } else if (format == "true") {
                 options.format = JSONFormat::NEWLINE_DELIMITED;
             } else {
-                throw BinderException("
+                throw BinderException("\"lines\" must be one of ['auto', 'true', 'false']");
             }
         } else if (loption == "compression") {
             auto compression = StringUtil::Lower(StringValue::Get(kv.second));
@@ -75,7 +77,7 @@ void JSONScanData::InitializeFilePaths(ClientContext &context, const vector<stri
     for (auto &file_pattern : patterns) {
         auto found_files = fs.Glob(file_pattern, context);
         if (found_files.empty()) {
-            throw
+            throw FileSystem::MissingFileException(file_pattern, context);
         }
         file_paths.insert(file_paths.end(), found_files.begin(), found_files.end());
     }
@@ -97,6 +99,27 @@ void JSONScanData::InitializeFormats() {
     if (!timestamp_format.empty()) {
         date_format_map.AddFormat(LogicalTypeId::TIMESTAMP, timestamp_format);
     }
+
+    if (auto_detect) {
+        static const unordered_map<LogicalTypeId, vector<const char *>, LogicalTypeIdHash> FORMAT_TEMPLATES = {
+            {LogicalTypeId::DATE, {"%m-%d-%Y", "%m-%d-%y", "%d-%m-%Y", "%d-%m-%y", "%Y-%m-%d", "%y-%m-%d"}},
+            {LogicalTypeId::TIMESTAMP,
+             {"%Y-%m-%d %H:%M:%S.%f", "%m-%d-%Y %I:%M:%S %p", "%m-%d-%y %I:%M:%S %p", "%d-%m-%Y %H:%M:%S",
+              "%d-%m-%y %H:%M:%S", "%Y-%m-%d %H:%M:%S", "%y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%SZ"}},
+        };
+
+        // Populate possible date/timestamp formats, assume this is consistent across columns
+        for (auto &kv : FORMAT_TEMPLATES) {
+            const auto &type = kv.first;
+            if (date_format_map.HasFormats(type)) {
+                continue; // Already populated
+            }
+            const auto &format_strings = kv.second;
+            for (auto &format_string : format_strings) {
+                date_format_map.AddFormat(type, format_string);
+            }
+        }
+    }
 }
 
 void JSONScanData::Serialize(FieldWriter &writer) {
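The candidate date/timestamp templates move from AutoDetect into InitializeFormats (now gated on auto_detect, and with an added ISO-8601-style "%Y-%m-%dT%H:%M:%SZ" template), so they are also repopulated after Deserialize. The idea behind refining candidates is to keep only the formats that parse every sampled string; a minimal sketch of that elimination using std::get_time (DuckDB uses its own StrpTimeFormat, so this is the concept only):

#include <ctime>
#include <iomanip>
#include <sstream>
#include <string>
#include <vector>

// Keep the candidate formats that successfully parse all sample strings.
static std::vector<std::string> RefineFormats(const std::vector<std::string> &candidates,
                                              const std::vector<std::string> &samples) {
    std::vector<std::string> kept;
    for (const auto &fmt : candidates) {
        bool ok = true;
        for (const auto &s : samples) {
            std::tm tm {};
            std::istringstream ss(s);
            ss >> std::get_time(&tm, fmt.c_str());
            if (ss.fail()) {
                ok = false;
                break;
            }
        }
        if (ok) {
            kept.push_back(fmt);
        }
    }
    return kept;
}

// RefineFormats({"%m-%d-%Y", "%d-%m-%Y", "%Y-%m-%d"}, {"2023-03-01"})
// typically keeps only "%Y-%m-%d".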
@@ -111,9 +134,17 @@ void JSONScanData::Serialize(FieldWriter &writer) {
     writer.WriteList<string>(names);
     writer.WriteList<idx_t>(valid_cols);
     writer.WriteField<idx_t>(max_depth);
-    writer.WriteField<
-
-
+    writer.WriteField<JSONRecordType>(record_type);
+    if (!date_format.empty()) {
+        writer.WriteString(date_format);
+    } else {
+        writer.WriteString(date_format_map.GetFormat(LogicalTypeId::DATE).format_specifier);
+    }
+    if (!timestamp_format.empty()) {
+        writer.WriteString(timestamp_format);
+    } else {
+        writer.WriteString(date_format_map.GetFormat(LogicalTypeId::TIMESTAMP).format_specifier);
+    }
 }
 
 void JSONScanData::Deserialize(FieldReader &reader) {
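Serialize now materializes the formats at write time: if no explicit date_format/timestamp_format was given, the current winner from date_format_map is written instead, so a deserialized scan sees the detected formats as if they had been specified, and Deserialize (next hunk) reads the same fields in the same order before rebuilding the map via InitializeFormats. A toy illustration of that order-sensitive, write-a-default pattern, with plain streams rather than DuckDB's FieldWriter/FieldReader:

#include <iostream>
#include <sstream>
#include <string>

// Write fields in a fixed order, substituting the detected value when the
// user gave none; the reader must consume the exact same sequence.
static void WriteScanData(std::ostream &out, int record_type, const std::string &explicit_fmt,
                          const std::string &detected_fmt) {
    out << record_type << '\n';
    out << (explicit_fmt.empty() ? detected_fmt : explicit_fmt) << '\n';
}

static void ReadScanData(std::istream &in, int &record_type, std::string &fmt) {
    in >> record_type;
    in.ignore(); // skip the newline after the integer
    std::getline(in, fmt); // always present: the writer filled in a default
}

int main() {
    std::stringstream buf;
    WriteScanData(buf, 1, "", "%Y-%m-%d");
    int rt;
    std::string fmt;
    ReadScanData(buf, rt, fmt);
    std::cout << rt << " " << fmt << "\n"; // 1 %Y-%m-%d
}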
@@ -128,9 +159,12 @@ void JSONScanData::Deserialize(FieldReader &reader) {
     names = reader.ReadRequiredList<string>();
     valid_cols = reader.ReadRequiredList<idx_t>();
     max_depth = reader.ReadRequired<idx_t>();
-
+    record_type = reader.ReadRequired<JSONRecordType>();
     date_format = reader.ReadRequired<string>();
     timestamp_format = reader.ReadRequired<string>();
+
+    InitializeFormats();
+    transform_options.date_format_map = &date_format_map;
 }
 
 JSONScanGlobalState::JSONScanGlobalState(ClientContext &context, JSONScanData &bind_data_p)
@@ -149,11 +183,11 @@ JSONScanGlobalState::JSONScanGlobalState(ClientContext &context, JSONScanData &b
 }
 
 JSONScanLocalState::JSONScanLocalState(ClientContext &context, JSONScanGlobalState &gstate)
-    : batch_index(DConstants::INVALID_INDEX), bind_data(gstate.bind_data),
+    : scan_count(0), array_idx(0), array_offset(0), batch_index(DConstants::INVALID_INDEX), bind_data(gstate.bind_data),
       json_allocator(BufferAllocator::Get(context)), current_reader(nullptr), current_buffer_handle(nullptr),
-      buffer_size(0), buffer_offset(0), prev_buffer_remainder(0) {
+      is_last(false), buffer_size(0), buffer_offset(0), prev_buffer_remainder(0) {
 
-    // Buffer to reconstruct JSON
+    // Buffer to reconstruct JSON values when they cross a buffer boundary
     reconstruct_buffer = gstate.allocator.Allocate(gstate.bind_data.maximum_object_size + YYJSON_PADDING_SIZE);
 
     // This is needed for JSONFormat::UNSTRUCTURED, to make use of YYJSON_READ_INSITU
|
|
|
173
207
|
// Perform projection pushdown
|
|
174
208
|
if (bind_data.type == JSONScanType::READ_JSON) {
|
|
175
209
|
D_ASSERT(input.column_ids.size() <= bind_data.names.size()); // Can't project to have more columns
|
|
176
|
-
if (bind_data.auto_detect && input.column_ids.size() < bind_data.names.size()) {
|
|
177
|
-
// If we are auto-detecting, but don't need all columns present in the file,
|
|
178
|
-
// then we don't need to throw an error if we encounter an unseen column
|
|
179
|
-
bind_data.transform_options.error_unknown_key = false;
|
|
180
|
-
}
|
|
181
210
|
vector<string> names;
|
|
182
211
|
names.reserve(input.column_ids.size());
|
|
183
212
|
for (idx_t i = 0; i < input.column_ids.size(); i++) {
|
|
@@ -188,13 +217,37 @@ unique_ptr<GlobalTableFunctionState> JSONGlobalTableFunctionState::Init(ClientCo
             names.push_back(std::move(bind_data.names[id]));
             bind_data.valid_cols.push_back(i);
         }
+        if (names.size() < bind_data.names.size()) {
+            // If we are auto-detecting, but don't need all columns present in the file,
+            // then we don't need to throw an error if we encounter an unseen column
+            bind_data.transform_options.error_unknown_key = false;
+        }
         bind_data.names = std::move(names);
     }
     return result;
 }
 
 idx_t JSONGlobalTableFunctionState::MaxThreads() const {
-
+    auto &bind_data = state.bind_data;
+
+    auto num_files = bind_data.file_paths.size();
+    idx_t readers_per_file;
+    if (bind_data.options.format == JSONFormat::UNSTRUCTURED) {
+        // Unstructured necessitates single thread
+        readers_per_file = 1;
+    } else if (!state.json_readers.empty() && state.json_readers[0]->IsOpen()) {
+        auto &reader = *state.json_readers[0];
+        const auto &options = reader.GetOptions();
+        if (options.format == JSONFormat::UNSTRUCTURED || options.compression != FileCompressionType::UNCOMPRESSED) {
+            // Auto-detected unstructured - same story, compression also really limits parallelism
+            readers_per_file = 1;
+        } else {
+            return state.system_threads;
+        }
+    } else {
+        return state.system_threads;
+    }
+    return num_files * readers_per_file;
 }
 
 JSONLocalTableFunctionState::JSONLocalTableFunctionState(ClientContext &context, JSONScanGlobalState &gstate)
@@ -230,6 +283,12 @@ static inline void SkipWhitespace(const char *buffer_ptr, idx_t &buffer_offset,
 idx_t JSONScanLocalState::ReadNext(JSONScanGlobalState &gstate) {
     json_allocator.Reset();
 
+    if ((gstate.bind_data.record_type == JSONRecordType::ARRAY_OF_RECORDS ||
+         gstate.bind_data.record_type == JSONRecordType::ARRAY_OF_JSON) &&
+        array_idx < scan_count) {
+        return GetObjectsFromArray(gstate);
+    }
+
     idx_t count = 0;
     if (buffer_offset == buffer_size) {
         if (!ReadNextBuffer(gstate)) {
@@ -253,10 +312,18 @@ idx_t JSONScanLocalState::ReadNext(JSONScanGlobalState &gstate) {
     default:
         throw InternalException("Unknown JSON format");
     }
+    scan_count = count;
 
     // Skip over any remaining whitespace for the next scan
     SkipWhitespace(buffer_ptr, buffer_offset, buffer_size);
 
+    if (gstate.bind_data.record_type == JSONRecordType::ARRAY_OF_RECORDS ||
+        gstate.bind_data.record_type == JSONRecordType::ARRAY_OF_JSON) {
+        array_idx = 0;
+        array_offset = 0;
+        return GetObjectsFromArray(gstate);
+    }
+
     return count;
 }
 
@@ -331,10 +398,48 @@ yyjson_val *JSONScanLocalState::ParseLine(char *line_start, idx_t line_size, idx
     }
 }
 
+idx_t JSONScanLocalState::GetObjectsFromArray(JSONScanGlobalState &gstate) {
+    idx_t arr_count = 0;
+
+    size_t idx, max;
+    yyjson_val *val;
+    for (; array_idx < scan_count; array_idx++, array_offset = 0) {
+        auto &value = values[array_idx];
+        if (!value) {
+            continue;
+        }
+        if (unsafe_yyjson_is_arr(value)) {
+            yyjson_arr_foreach(value, idx, max, val) {
+                if (idx < array_offset) {
+                    continue;
+                }
+                array_values[arr_count++] = val;
+                if (arr_count == STANDARD_VECTOR_SIZE) {
+                    break;
+                }
+            }
+            array_offset = idx + 1;
+            if (arr_count == STANDARD_VECTOR_SIZE) {
+                break;
+            }
+        } else if (!gstate.bind_data.ignore_errors) {
+            ThrowTransformError(
+                array_idx,
+                StringUtil::Format("Expected JSON ARRAY but got %s: %s\nTry setting json_format to 'records'",
+                                   JSONCommon::ValTypeToString(value), JSONCommon::ValToString(value, 50)));
+        }
+    }
+    return arr_count;
+}
+
 bool JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate) {
     if (current_reader) {
         D_ASSERT(current_buffer_handle);
         current_reader->SetBufferLineOrObjectCount(current_buffer_handle->buffer_index, lines_or_objects_in_buffer);
+        if (is_last && gstate.bind_data.type != JSONScanType::SAMPLE) {
+            // Close files that are done if we're not sampling
+            current_reader->CloseJSONFile();
+        }
     }
 
     AllocatedData buffer;
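GetObjectsFromArray is what makes the new ARRAY_OF_* record types streamable: array_idx remembers which top-level array the scan is in and array_offset how far into it, so a single huge array can be emitted STANDARD_VECTOR_SIZE elements at a time across calls. The resumable iteration boils down to this standalone sketch over one yyjson array (capacity passed in, no error handling):

#include <cstddef>
#include "yyjson.h"

// Emit up to `capacity` elements per call, resuming at *offset.
// Returns how many element pointers were written to `out`.
static size_t NextArrayChunk(yyjson_val *arr, size_t *offset, yyjson_val **out, size_t capacity) {
    size_t count = 0;
    size_t idx, max;
    yyjson_val *val;
    yyjson_arr_foreach(arr, idx, max, val) {
        if (idx < *offset) {
            continue; // already emitted by a previous call
        }
        out[count++] = val;
        if (count == capacity) {
            break;
        }
    }
    *offset += count;
    return count;
}

Calling this in a loop until it returns 0 yields the whole array in fixed-size batches, which is the contract ReadNext needs from GetObjectsFromArray.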
@@ -395,7 +500,9 @@ bool JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate) {
         // Unopened file
         current_reader->OpenJSONFile();
         batch_index = gstate.batch_index++;
-        if (options.format == JSONFormat::UNSTRUCTURED
+        if (options.format == JSONFormat::UNSTRUCTURED || (options.format == JSONFormat::NEWLINE_DELIMITED &&
+                                                           options.compression != FileCompressionType::UNCOMPRESSED &&
+                                                           gstate.file_index < gstate.json_readers.size())) {
             gstate.file_index++; // UNSTRUCTURED necessitates single-threaded read
         }
         if (options.format != JSONFormat::AUTO_DETECT) {
@@ -449,9 +556,6 @@ bool JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate) {
     auto json_buffer_handle = make_unique<JSONBufferHandle>(buffer_index, readers, std::move(buffer), buffer_size);
     current_buffer_handle = json_buffer_handle.get();
     current_reader->InsertBuffer(buffer_index, std::move(json_buffer_handle));
-    if (!current_reader->GetFileHandle().PlainFileSource() && gstate.bind_data.type == JSONScanType::SAMPLE) {
-        // TODO: store buffer
-    }
 
     buffer_offset = 0;
     prev_buffer_remainder = 0;
@@ -507,16 +611,18 @@ void JSONScanLocalState::ReadNextBufferSeek(JSONScanGlobalState &gstate, idx_t &
 }
 
 void JSONScanLocalState::ReadNextBufferNoSeek(JSONScanGlobalState &gstate, idx_t &buffer_index) {
-    auto &file_handle = current_reader->GetFileHandle();
-
     idx_t request_size = gstate.buffer_capacity - prev_buffer_remainder - YYJSON_PADDING_SIZE;
     idx_t read_size;
     {
         lock_guard<mutex> reader_guard(current_reader->lock);
         buffer_index = current_reader->GetBufferIndex();
 
-
-
+        if (current_reader->IsOpen()) {
+            read_size = current_reader->GetFileHandle().Read(buffer_ptr + prev_buffer_remainder, request_size,
+                                                             gstate.bind_data.type == JSONScanType::SAMPLE);
+        } else {
+            read_size = 0;
+        }
         is_last = read_size < request_size;
 
         if (!gstate.bind_data.ignore_errors && read_size == 0 && prev_buffer_remainder != 0) {
@@ -578,10 +684,15 @@ void JSONScanLocalState::ReconstructFirstObject(JSONScanGlobalState &gstate) {
         current_reader->RemoveBuffer(current_buffer_handle->buffer_index - 1);
     }
 
-
+    values[0] = ParseLine((char *)reconstruct_ptr, line_size, line_size, lines[0]);
 }
 
 void JSONScanLocalState::ReadUnstructured(idx_t &count) {
+    // yyjson does not always return YYJSON_READ_ERROR_UNEXPECTED_END properly
+    // if a different error code happens within the last 50 bytes
+    // we assume it should be YYJSON_READ_ERROR_UNEXPECTED_END instead
+    static constexpr idx_t END_BOUND = 50;
+
     const auto max_obj_size = reconstruct_buffer.GetSize();
     yyjson_read_err error;
     for (; count < STANDARD_VECTOR_SIZE; count++) {
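The END_BOUND constant works around yyjson not always reporting YYJSON_READ_ERROR_UNEXPECTED_END for a value that is simply cut off at the end of a read buffer: any parse error within the last 50 bytes is now treated as a probable truncation and the tail is carried over for reconstruction. Roughly, the decision looks like this against the public yyjson error API (simplified; the real code reuses its allocator and reads in situ):

#include <cstdio>
#include "yyjson.h"

// Parse one value from buf; distinguish "real error" from "probably
// truncated, retry with more data". END_BOUND mirrors the diff's heuristic.
static yyjson_doc *TryParseValue(char *buf, size_t len, bool is_last_buffer, bool *retry) {
    static const size_t END_BOUND = 50;
    yyjson_read_err err;
    yyjson_doc *doc = yyjson_read_opts(buf, len, YYJSON_READ_STOP_WHEN_DONE, NULL, &err);
    *retry = false;
    if (doc) {
        return doc;
    }
    if (!is_last_buffer && (err.code == YYJSON_READ_ERROR_UNEXPECTED_END || len - err.pos < END_BOUND)) {
        *retry = true; // caller copies the tail into a reconstruct buffer and reads on
        return NULL;
    }
    fprintf(stderr, "JSON parse error at byte %zu: %s\n", err.pos, err.msg);
    return NULL;
}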
@@ -607,8 +718,7 @@ void JSONScanLocalState::ReadUnstructured(idx_t &count) {
         } else if (error.pos > max_obj_size) {
             current_reader->ThrowParseError(current_buffer_handle->buffer_index, lines_or_objects_in_buffer, error,
                                             "Try increasing \"maximum_object_size\".");
-
-        } else if (error.code == YYJSON_READ_ERROR_UNEXPECTED_END && !is_last) {
+        } else if (!is_last && (error.code == YYJSON_READ_ERROR_UNEXPECTED_END || remaining - error.pos < END_BOUND)) {
             // Copy remaining to reconstruct_buffer
             const auto reconstruct_ptr = reconstruct_buffer.get();
             memcpy(reconstruct_ptr, obj_copy_start, remaining);
@@ -618,7 +728,7 @@ void JSONScanLocalState::ReadUnstructured(idx_t &count) {
         } else {
             current_reader->ThrowParseError(current_buffer_handle->buffer_index, lines_or_objects_in_buffer, error);
         }
-
+        values[count] = read_doc->root;
     }
 }
 
@@ -644,7 +754,7 @@ void JSONScanLocalState::ReadNewlineDelimited(idx_t &count) {
         }
         idx_t line_size = line_end - line_start;
 
-
+        values[count] = ParseLine((char *)line_start, line_size, remaining, lines[count]);
 
         buffer_offset += line_size;
         SkipWhitespace(buffer_ptr, buffer_offset, buffer_size);
|
|
|
655
765
|
return json_allocator.GetYYJSONAllocator();
|
|
656
766
|
}
|
|
657
767
|
|
|
658
|
-
void JSONScanLocalState::ThrowTransformError(idx_t
|
|
768
|
+
void JSONScanLocalState::ThrowTransformError(idx_t object_index, const string &error_message) {
|
|
659
769
|
D_ASSERT(current_reader);
|
|
660
770
|
D_ASSERT(current_buffer_handle);
|
|
661
771
|
D_ASSERT(object_index != DConstants::INVALID_INDEX);
|
|
662
|
-
auto line_or_object_in_buffer = lines_or_objects_in_buffer -
|
|
772
|
+
auto line_or_object_in_buffer = lines_or_objects_in_buffer - scan_count + object_index;
|
|
663
773
|
current_reader->ThrowTransformError(current_buffer_handle->buffer_index, line_or_object_in_buffer, error_message);
|
|
664
774
|
}
|
|
665
775
|
|
|
@@ -223,7 +223,7 @@ public:
     FileSystem &fs = FileSystem::GetFileSystem(context);
     auto files = fs.Glob(info.file_path, context);
     if (files.empty()) {
-        throw
+        throw FileSystem::MissingFileException(info.file_path, context);
     }
 
     // The most likely path (Parquet read without union by name option)
@@ -363,8 +363,9 @@ public:
 
 static vector<string> ParquetGlob(FileSystem &fs, const string &glob, ClientContext &context) {
     auto files = fs.Glob(glob, FileSystem::GetFileOpener(context));
+
     if (files.empty()) {
-        throw
+        throw FileSystem::MissingFileException(glob, context);
     }
     return files;
 }