duckdb 0.7.2-dev3546.0 → 0.7.2-dev3710.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/database.cpp +1 -0
- package/src/duckdb/extension/json/buffered_json_reader.cpp +56 -17
- package/src/duckdb/extension/json/include/buffered_json_reader.hpp +56 -31
- package/src/duckdb/extension/json/include/json_common.hpp +5 -4
- package/src/duckdb/extension/json/include/json_executors.hpp +13 -18
- package/src/duckdb/extension/json/include/json_functions.hpp +3 -0
- package/src/duckdb/extension/json/include/json_scan.hpp +106 -153
- package/src/duckdb/extension/json/include/json_transform.hpp +2 -2
- package/src/duckdb/extension/json/json_common.cpp +1 -1
- package/src/duckdb/extension/json/json_functions/copy_json.cpp +94 -38
- package/src/duckdb/extension/json/json_functions/json_contains.cpp +7 -8
- package/src/duckdb/extension/json/json_functions/json_create.cpp +7 -7
- package/src/duckdb/extension/json/json_functions/json_merge_patch.cpp +4 -4
- package/src/duckdb/extension/json/json_functions/json_serialize_sql.cpp +4 -4
- package/src/duckdb/extension/json/json_functions/json_structure.cpp +7 -5
- package/src/duckdb/extension/json/json_functions/json_transform.cpp +11 -9
- package/src/duckdb/extension/json/json_functions/json_valid.cpp +1 -1
- package/src/duckdb/extension/json/json_functions/read_json.cpp +166 -169
- package/src/duckdb/extension/json/json_functions/read_json_objects.cpp +37 -16
- package/src/duckdb/extension/json/json_functions.cpp +11 -4
- package/src/duckdb/extension/json/json_scan.cpp +593 -374
- package/src/duckdb/extension/parquet/parquet-extension.cpp +5 -0
- package/src/duckdb/src/common/exception.cpp +17 -0
- package/src/duckdb/src/common/exception_format_value.cpp +14 -0
- package/src/duckdb/src/common/file_system.cpp +78 -36
- package/src/duckdb/src/common/local_file_system.cpp +5 -16
- package/src/duckdb/src/common/types.cpp +1 -1
- package/src/duckdb/src/core_functions/scalar/list/list_lambdas.cpp +1 -1
- package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp +12 -6
- package/src/duckdb/src/execution/operator/persistent/csv_reader_options.cpp +10 -0
- package/src/duckdb/src/execution/operator/persistent/physical_export.cpp +2 -2
- package/src/duckdb/src/function/pragma/pragma_queries.cpp +6 -4
- package/src/duckdb/src/function/table/copy_csv.cpp +66 -12
- package/src/duckdb/src/function/table/read_csv.cpp +16 -3
- package/src/duckdb/src/function/table/version/pragma_version.cpp +8 -2
- package/src/duckdb/src/include/duckdb/catalog/catalog_entry/column_dependency_manager.hpp +1 -1
- package/src/duckdb/src/include/duckdb/catalog/catalog_search_path.hpp +1 -1
- package/src/duckdb/src/include/duckdb/catalog/similar_catalog_entry.hpp +1 -1
- package/src/duckdb/src/include/duckdb/common/exception.hpp +3 -3
- package/src/duckdb/src/include/duckdb/common/exception_format_value.hpp +26 -0
- package/src/duckdb/src/include/duckdb/common/file_system.hpp +11 -0
- package/src/duckdb/src/include/duckdb/common/http_state.hpp +2 -1
- package/src/duckdb/src/include/duckdb/common/hugeint.hpp +6 -6
- package/src/duckdb/src/include/duckdb/common/limits.hpp +46 -46
- package/src/duckdb/src/include/duckdb/common/operator/cast_operators.hpp +8 -8
- package/src/duckdb/src/include/duckdb/common/operator/comparison_operators.hpp +6 -6
- package/src/duckdb/src/include/duckdb/common/operator/convert_to_string.hpp +1 -1
- package/src/duckdb/src/include/duckdb/common/operator/decimal_cast_operators.hpp +2 -4
- package/src/duckdb/src/include/duckdb/common/operator/string_cast.hpp +1 -1
- package/src/duckdb/src/include/duckdb/common/operator/subtract.hpp +1 -1
- package/src/duckdb/src/include/duckdb/common/preserved_error.hpp +1 -1
- package/src/duckdb/src/include/duckdb/common/re2_regex.hpp +1 -1
- package/src/duckdb/src/include/duckdb/common/string_util.hpp +25 -7
- package/src/duckdb/src/include/duckdb/common/types/chunk_collection.hpp +10 -10
- package/src/duckdb/src/include/duckdb/common/types/column/column_data_collection.hpp +12 -12
- package/src/duckdb/src/include/duckdb/common/types/column/column_data_collection_iterators.hpp +2 -2
- package/src/duckdb/src/include/duckdb/common/types/value.hpp +1 -1
- package/src/duckdb/src/include/duckdb/common/types.hpp +2 -2
- package/src/duckdb/src/include/duckdb/common/winapi.hpp +1 -1
- package/src/duckdb/src/include/duckdb/execution/expression_executor_state.hpp +1 -1
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_reader_options.hpp +8 -3
- package/src/duckdb/src/include/duckdb/function/aggregate_function.hpp +10 -14
- package/src/duckdb/src/include/duckdb/function/table_function.hpp +1 -1
- package/src/duckdb/src/include/duckdb/function/udf_function.hpp +56 -50
- package/src/duckdb/src/include/duckdb/main/appender.hpp +2 -2
- package/src/duckdb/src/include/duckdb/main/client_context.hpp +2 -2
- package/src/duckdb/src/include/duckdb/main/client_data.hpp +3 -1
- package/src/duckdb/src/include/duckdb/main/connection.hpp +8 -9
- package/src/duckdb/src/include/duckdb/main/extension_entries.hpp +1 -0
- package/src/duckdb/src/include/duckdb/main/query_result.hpp +3 -3
- package/src/duckdb/src/include/duckdb/main/relation.hpp +6 -7
- package/src/duckdb/src/include/duckdb/optimizer/optimizer_extension.hpp +1 -1
- package/src/duckdb/src/include/duckdb/parser/column_list.hpp +7 -7
- package/src/duckdb/src/include/duckdb/parser/expression/function_expression.hpp +1 -1
- package/src/duckdb/src/include/duckdb/parser/expression/operator_expression.hpp +2 -2
- package/src/duckdb/src/include/duckdb/parser/keyword_helper.hpp +5 -0
- package/src/duckdb/src/include/duckdb/parser/parser_extension.hpp +2 -2
- package/src/duckdb/src/include/duckdb/parser/sql_statement.hpp +1 -1
- package/src/duckdb/src/include/duckdb/parser/statement/select_statement.hpp +1 -1
- package/src/duckdb/src/include/duckdb/planner/operator_extension.hpp +2 -2
- package/src/duckdb/src/include/duckdb/storage/storage_extension.hpp +2 -2
- package/src/duckdb/src/main/db_instance_cache.cpp +5 -3
- package/src/duckdb/src/main/extension/extension_install.cpp +22 -18
- package/src/duckdb/src/parser/expression/collate_expression.cpp +1 -1
- package/src/duckdb/src/parser/keyword_helper.cpp +11 -1
- package/src/duckdb/src/parser/query_node/select_node.cpp +1 -1
- package/src/duckdb/src/parser/statement/copy_statement.cpp +2 -2
- package/src/duckdb/src/parser/tableref.cpp +1 -1
- package/src/duckdb/src/planner/binder/statement/bind_copy.cpp +9 -4
- package/src/duckdb/src/planner/binder/tableref/bind_basetableref.cpp +1 -1
- package/src/duckdb/src/storage/single_file_block_manager.cpp +0 -4
- package/src/duckdb/src/storage/storage_manager.cpp +3 -0
- package/src/duckdb/third_party/libpg_query/src_backend_parser_gram.cpp +5735 -5773
package/src/duckdb/extension/json/json_scan.cpp
@@ -1,7 +1,7 @@
 #include "json_scan.hpp"
 
+#include "duckdb/common/enum_util.hpp"
 #include "duckdb/common/multi_file_reader.hpp"
-#include "duckdb/main/database.hpp"
 #include "duckdb/main/extension_helper.hpp"
 #include "duckdb/parallel/task_scheduler.hpp"
 #include "duckdb/storage/buffer_manager.hpp"
@@ -11,51 +11,59 @@ namespace duckdb {
 JSONScanData::JSONScanData() {
 }
 
-
-auto
-
-
-	auto &info = (JSONScanInfo &)*input.info;
-	result->type = info.type;
+void JSONScanData::Bind(ClientContext &context, TableFunctionBindInput &input) {
+	auto &info = input.info->Cast<JSONScanInfo>();
+	type = info.type;
 	options.format = info.format;
-
-
-	result->file_paths = MultiFileReader::GetFileList(context, input.inputs[0], "JSON");
+	options.record_type = info.record_type;
+	auto_detect = info.auto_detect;
 
 	for (auto &kv : input.named_parameters) {
+		if (MultiFileReader::ParseOption(kv.first, kv.second, options.file_options)) {
+			continue;
+		}
 		auto loption = StringUtil::Lower(kv.first);
 		if (loption == "ignore_errors") {
-
+			ignore_errors = BooleanValue::Get(kv.second);
 		} else if (loption == "maximum_object_size") {
-
-		} else if (loption == "
-			auto
-			if (
+			maximum_object_size = MaxValue<idx_t>(UIntegerValue::Get(kv.second), maximum_object_size);
+		} else if (loption == "format") {
+			auto arg = StringValue::Get(kv.second);
+			if (arg == "auto") {
 				options.format = JSONFormat::AUTO_DETECT;
-			} else if (
+			} else if (arg == "unstructured") {
 				options.format = JSONFormat::UNSTRUCTURED;
-			} else if (
+			} else if (arg == "newline_delimited" || arg == "nd") {
 				options.format = JSONFormat::NEWLINE_DELIMITED;
+			} else if (arg == "array") {
+				options.format = JSONFormat::ARRAY;
 			} else {
-				throw
+				throw InvalidInputException(
+				    "format must be one of ['auto', 'unstructured', 'newline_delimited', 'array']");
 			}
 		} else if (loption == "compression") {
-
-			if (compression == "none") {
-				options.compression = FileCompressionType::UNCOMPRESSED;
-			} else if (compression == "gzip") {
-				options.compression = FileCompressionType::GZIP;
-			} else if (compression == "zstd") {
-				options.compression = FileCompressionType::ZSTD;
-			} else if (compression == "auto") {
-				options.compression = FileCompressionType::AUTO_DETECT;
-			} else {
-				throw BinderException("compression must be one of ['none', 'gzip', 'zstd', 'auto']");
-			}
+			SetCompression(StringUtil::Lower(StringValue::Get(kv.second)));
 		}
 	}
 
-
+	files = MultiFileReader::GetFileList(context, input.inputs[0], "JSON");
+
+	if (options.file_options.auto_detect_hive_partitioning) {
+		options.file_options.hive_partitioning = MultiFileReaderOptions::AutoDetectHivePartitioning(files);
+	}
+
+	InitializeReaders(context);
+}
+
+void JSONScanData::InitializeReaders(ClientContext &context) {
+	union_readers.resize(files.empty() ? 0 : files.size() - 1);
+	for (idx_t file_idx = 0; file_idx < files.size(); file_idx++) {
+		if (file_idx == 0) {
+			initial_reader = make_uniq<BufferedJSONReader>(context, options, files[0]);
+		} else {
+			union_readers[file_idx - 1] = make_uniq<BufferedJSONReader>(context, options, files[file_idx]);
+		}
+	}
 }
 
 void JSONScanData::InitializeFormats() {
@@ -63,14 +71,6 @@ void JSONScanData::InitializeFormats() {
 }
 
 void JSONScanData::InitializeFormats(bool auto_detect_p) {
-	// Set defaults for date/timestamp formats if we need to
-	if (!auto_detect_p && date_format.empty()) {
-		date_format = "%Y-%m-%d";
-	}
-	if (!auto_detect_p && timestamp_format.empty()) {
-		timestamp_format = "%Y-%m-%dT%H:%M:%S.%fZ";
-	}
-
 	// Initialize date_format_map if anything was specified
 	if (!date_format.empty()) {
 		date_format_map.AddFormat(LogicalTypeId::DATE, date_format);
@@ -80,7 +80,7 @@ void JSONScanData::InitializeFormats(bool auto_detect_p) {
 	}
 
 	if (auto_detect_p) {
-		static const
+		static const type_id_map_t<vector<const char *>> FORMAT_TEMPLATES = {
 		    {LogicalTypeId::DATE, {"%m-%d-%Y", "%m-%d-%y", "%d-%m-%Y", "%d-%m-%y", "%Y-%m-%d", "%y-%m-%d"}},
 		    {LogicalTypeId::TIMESTAMP,
 		     {"%Y-%m-%d %H:%M:%S.%f", "%m-%d-%Y %I:%M:%S %p", "%m-%d-%y %I:%M:%S %p", "%d-%m-%Y %H:%M:%S",
@@ -89,56 +89,73 @@ void JSONScanData::InitializeFormats(bool auto_detect_p) {
 
 	// Populate possible date/timestamp formats, assume this is consistent across columns
 	for (auto &kv : FORMAT_TEMPLATES) {
-		const auto &
-		if (date_format_map.HasFormats(
+		const auto &logical_type = kv.first;
+		if (date_format_map.HasFormats(logical_type)) {
 			continue; // Already populated
 		}
 		const auto &format_strings = kv.second;
 		for (auto &format_string : format_strings) {
-			date_format_map.AddFormat(
+			date_format_map.AddFormat(logical_type, format_string);
 		}
 	}
 }
 
-void JSONScanData::
+void JSONScanData::SetCompression(const string &compression) {
+	options.compression = EnumUtil::FromString<FileCompressionType>(StringUtil::Upper(compression));
+}
+
+void JSONScanData::Serialize(FieldWriter &writer) const {
 	writer.WriteField<JSONScanType>(type);
+
 	options.Serialize(writer);
-
+
+	writer.WriteSerializable(reader_bind);
+
+	writer.WriteList<string>(files);
+
 	writer.WriteField<bool>(ignore_errors);
 	writer.WriteField<idx_t>(maximum_object_size);
-	transform_options.Serialize(writer);
 	writer.WriteField<bool>(auto_detect);
 	writer.WriteField<idx_t>(sample_size);
-	writer.WriteList<string>(names);
-	writer.WriteList<idx_t>(valid_cols);
 	writer.WriteField<idx_t>(max_depth);
-
+
+	transform_options.Serialize(writer);
+	writer.WriteList<string>(names);
 	if (!date_format.empty()) {
 		writer.WriteString(date_format);
-	} else {
+	} else if (date_format_map.HasFormats(LogicalTypeId::DATE)) {
 		writer.WriteString(date_format_map.GetFormat(LogicalTypeId::DATE).format_specifier);
+	} else {
+		writer.WriteString("");
 	}
 	if (!timestamp_format.empty()) {
 		writer.WriteString(timestamp_format);
-	} else {
+	} else if (date_format_map.HasFormats(LogicalTypeId::TIMESTAMP)) {
 		writer.WriteString(date_format_map.GetFormat(LogicalTypeId::TIMESTAMP).format_specifier);
+	} else {
+		writer.WriteString("");
 	}
 }
 
-void JSONScanData::Deserialize(FieldReader &reader) {
+void JSONScanData::Deserialize(ClientContext &context, FieldReader &reader) {
 	type = reader.ReadRequired<JSONScanType>();
+
 	options.Deserialize(reader);
-
+
+	reader_bind = reader.ReadRequiredSerializable<MultiFileReaderBindData, MultiFileReaderBindData>();
+
+	files = reader.ReadRequiredList<string>();
+	InitializeReaders(context);
+
 	ignore_errors = reader.ReadRequired<bool>();
 	maximum_object_size = reader.ReadRequired<idx_t>();
-	transform_options.Deserialize(reader);
 	auto_detect = reader.ReadRequired<bool>();
 	sample_size = reader.ReadRequired<idx_t>();
-	names = reader.ReadRequiredList<string>();
-	valid_cols = reader.ReadRequiredList<idx_t>();
 	max_depth = reader.ReadRequired<idx_t>();
-
+
+	transform_options.Deserialize(reader);
+	names = reader.ReadRequiredList<string>();
 	date_format = reader.ReadRequired<string>();
 	timestamp_format = reader.ReadRequired<string>();
 
@@ -146,86 +163,97 @@ void JSONScanData::Deserialize(FieldReader &reader) {
 	transform_options.date_format_map = &date_format_map;
 }
 
-JSONScanGlobalState::JSONScanGlobalState(ClientContext &context, JSONScanData &bind_data_p)
-    : bind_data(bind_data_p),
+JSONScanGlobalState::JSONScanGlobalState(ClientContext &context, const JSONScanData &bind_data_p)
+    : bind_data(bind_data_p), transform_options(bind_data.transform_options),
+      allocator(BufferManager::GetBufferManager(context).GetBufferAllocator()),
       buffer_capacity(bind_data.maximum_object_size * 2), file_index(0), batch_index(0),
       system_threads(TaskScheduler::GetScheduler(context).NumberOfThreads()) {
-	if (bind_data.stored_readers.empty()) {
-		json_readers.reserve(bind_data.file_paths.size());
-		for (idx_t i = 0; i < bind_data.file_paths.size(); i++) {
-			json_readers.push_back(make_uniq<BufferedJSONReader>(context, bind_data.options, bind_data.file_paths[i]));
-		}
-	} else {
-		json_readers = std::move(bind_data.stored_readers);
-	}
 }
 
 JSONScanLocalState::JSONScanLocalState(ClientContext &context, JSONScanGlobalState &gstate)
-    : scan_count(0),
-
-      is_last(false), buffer_size(0), buffer_offset(0), prev_buffer_remainder(0) {
+    : scan_count(0), batch_index(DConstants::INVALID_INDEX), total_read_size(0), total_tuple_count(0),
+      bind_data(gstate.bind_data), allocator(BufferAllocator::Get(context)), current_reader(nullptr),
+      current_buffer_handle(nullptr), is_last(false), buffer_size(0), buffer_offset(0), prev_buffer_remainder(0) {
 
 	// Buffer to reconstruct JSON values when they cross a buffer boundary
-	reconstruct_buffer = gstate.allocator.Allocate(gstate.
-
-	// This is needed for JSONFormat::UNSTRUCTURED, to make use of YYJSON_READ_INSITU
-	current_buffer_copy = gstate.allocator.Allocate(gstate.buffer_capacity);
-	buffer_copy_ptr = (const char *)current_buffer_copy.get();
+	reconstruct_buffer = gstate.allocator.Allocate(gstate.buffer_capacity);
 }
 
 JSONGlobalTableFunctionState::JSONGlobalTableFunctionState(ClientContext &context, TableFunctionInitInput &input)
-    : state(context,
+    : state(context, input.bind_data->Cast<JSONScanData>()) {
 }
 
 unique_ptr<GlobalTableFunctionState> JSONGlobalTableFunctionState::Init(ClientContext &context,
                                                                         TableFunctionInitInput &input) {
-	auto &bind_data =
+	auto &bind_data = input.bind_data->Cast<JSONScanData>();
 	auto result = make_uniq<JSONGlobalTableFunctionState>(context, input);
+	auto &gstate = result->state;
 
 	// Perform projection pushdown
-
-
-
-
-
-
-
-
+	for (idx_t col_idx = 0; col_idx < input.column_ids.size(); col_idx++) {
+		const auto &col_id = input.column_ids[col_idx];
+
+		// Skip any multi-file reader / row id stuff
+		if (col_id == bind_data.reader_bind.filename_idx || IsRowIdColumnId(col_id)) {
+			continue;
+		}
+		bool skip = false;
+		for (const auto &hive_partitioning_index : bind_data.reader_bind.hive_partitioning_indexes) {
+			if (col_id == hive_partitioning_index.index) {
+				skip = true;
+				break;
 			}
-		names.push_back(std::move(bind_data.names[id]));
-		bind_data.valid_cols.push_back(i);
 		}
-	if (
-
-		// then we don't need to throw an error if we encounter an unseen column
-		bind_data.transform_options.error_unknown_key = false;
+		if (skip) {
+			continue;
 		}
-
+
+		gstate.column_indices.push_back(col_idx);
+		gstate.names.push_back(bind_data.names[col_id]);
+	}
+
+	if (gstate.names.size() < bind_data.names.size() || bind_data.options.file_options.union_by_name) {
+		// If we are auto-detecting, but don't need all columns present in the file,
+		// then we don't need to throw an error if we encounter an unseen column
+		gstate.transform_options.error_unknown_key = false;
+	}
+
+	// Place readers where they belong
+	if (bind_data.initial_reader) {
+		bind_data.initial_reader->Reset();
+		gstate.json_readers.emplace_back(bind_data.initial_reader.get());
 	}
+	for (const auto &reader : bind_data.union_readers) {
+		reader->Reset();
+		gstate.json_readers.emplace_back(reader.get());
+	}
+
+	vector<LogicalType> dummy_types(input.column_ids.size(), LogicalType::ANY);
+	for (auto &reader : gstate.json_readers) {
+		MultiFileReader::FinalizeBind(reader->GetOptions().file_options, gstate.bind_data.reader_bind,
+		                              reader->GetFileName(), gstate.names, dummy_types, bind_data.names,
+		                              input.column_ids, reader->reader_data);
+	}
+
 	return std::move(result);
 }
 
 idx_t JSONGlobalTableFunctionState::MaxThreads() const {
 	auto &bind_data = state.bind_data;
+	if (bind_data.options.format == JSONFormat::NEWLINE_DELIMITED &&
+	    bind_data.options.compression == FileCompressionType::UNCOMPRESSED) {
+		return state.system_threads;
+	}
 
-
-	idx_t readers_per_file;
-	if (bind_data.options.format == JSONFormat::UNSTRUCTURED) {
-		// Unstructured necessitates single thread
-		readers_per_file = 1;
-	} else if (!state.json_readers.empty() && state.json_readers[0]->IsOpen()) {
+	if (!state.json_readers.empty() && state.json_readers[0]->IsOpen()) {
 		auto &reader = *state.json_readers[0];
-
-		if (options.format == JSONFormat::UNSTRUCTURED || options.compression != FileCompressionType::UNCOMPRESSED) {
-			// Auto-detected unstructured - same story, compression also really limits parallelism
-			readers_per_file = 1;
-		} else {
+		if (reader.IsParallel()) { // Auto-detected parallel scan
 			return state.system_threads;
 		}
-	} else {
-		return state.system_threads;
 	}
-
+
+	// One reader per file
+	return bind_data.files.size();
 }
 
 JSONLocalTableFunctionState::JSONLocalTableFunctionState(ClientContext &context, JSONScanGlobalState &gstate)
@@ -235,12 +263,12 @@ JSONLocalTableFunctionState::JSONLocalTableFunctionState(ClientContext &context,
 unique_ptr<LocalTableFunctionState> JSONLocalTableFunctionState::Init(ExecutionContext &context,
                                                                       TableFunctionInitInput &input,
                                                                       GlobalTableFunctionState *global_state) {
-	auto &gstate = (
+	auto &gstate = global_state->Cast<JSONGlobalTableFunctionState>();
 	auto result = make_uniq<JSONLocalTableFunctionState>(context.client, gstate.state);
 
 	// Copy the transform options / date format map because we need to do thread-local stuff
 	result->state.date_format_map = gstate.state.bind_data.date_format_map;
-	result->state.transform_options = gstate.state.
+	result->state.transform_options = gstate.state.transform_options;
 	result->state.transform_options.date_format_map = &result->state.date_format_map;
 
 	return std::move(result);
@@ -250,7 +278,7 @@ idx_t JSONLocalTableFunctionState::GetBatchIndex() const {
 	return state.batch_index;
 }
 
-static inline void SkipWhitespace(const char *buffer_ptr, idx_t &buffer_offset, idx_t &buffer_size) {
+static inline void SkipWhitespace(const char *buffer_ptr, idx_t &buffer_offset, const idx_t &buffer_size) {
 	for (; buffer_offset != buffer_size; buffer_offset++) {
 		if (!StringUtil::CharacterIsSpace(buffer_ptr[buffer_offset])) {
 			break;
@@ -259,50 +287,21 @@ static inline void SkipWhitespace(const char *buffer_ptr, idx_t &buffer_offset,
 	}
 }
 
 idx_t JSONScanLocalState::ReadNext(JSONScanGlobalState &gstate) {
-
-
-	if ((gstate.bind_data.record_type == JSONRecordType::ARRAY_OF_RECORDS ||
-	     gstate.bind_data.record_type == JSONRecordType::ARRAY_OF_JSON) &&
-	    array_idx < scan_count) {
-		return GetObjectsFromArray(gstate);
-	}
+	allocator.Reset();
 
-
+	scan_count = 0;
 	if (buffer_offset == buffer_size) {
 		if (!ReadNextBuffer(gstate)) {
-			return
+			return scan_count;
 		}
-		if (current_buffer_handle->buffer_index != 0 &&
-		    current_reader->GetOptions().format == JSONFormat::NEWLINE_DELIMITED) {
+		if (current_buffer_handle->buffer_index != 0 && current_reader->GetFormat() == JSONFormat::NEWLINE_DELIMITED) {
 			ReconstructFirstObject(gstate);
-
+			scan_count++;
 		}
 	}
+	ParseNextChunk();
 
-
-	switch (options.format) {
-	case JSONFormat::UNSTRUCTURED:
-		ReadUnstructured(count);
-		break;
-	case JSONFormat::NEWLINE_DELIMITED:
-		ReadNewlineDelimited(count);
-		break;
-	default:
-		throw InternalException("Unknown JSON format");
-	}
-	scan_count = count;
-
-	// Skip over any remaining whitespace for the next scan
-	SkipWhitespace(buffer_ptr, buffer_offset, buffer_size);
-
-	if (gstate.bind_data.record_type == JSONRecordType::ARRAY_OF_RECORDS ||
-	    gstate.bind_data.record_type == JSONRecordType::ARRAY_OF_JSON) {
-		array_idx = 0;
-		array_offset = 0;
-		return GetObjectsFromArray(gstate);
-	}
-
-	return count;
+	return scan_count;
 }
 
 static inline const char *NextNewline(const char *ptr, idx_t size) {
@@ -319,7 +318,71 @@ static inline const char *PreviousNewline(const char *ptr) {
 	return ptr;
 }
 
-static inline
+static inline const char *NextJSONDefault(const char *ptr, const idx_t size, const char *const end) {
+	idx_t parents = 0;
+	while (ptr != end) {
+		switch (*ptr++) {
+		case '{':
+		case '[':
+			parents++;
+			continue;
+		case '}':
+		case ']':
+			parents--;
+			break;
+		case '"':
+			while (ptr != end) {
+				auto string_char = *ptr++;
+				if (string_char == '"') {
+					break;
+				} else if (string_char == '\\') {
+					if (ptr != end) {
+						ptr++; // Skip the escaped char
+					}
+				}
+			}
+			break;
+		default:
+			continue;
+		}
+
+		if (parents == 0) {
+			break;
+		}
+	}
+
+	return ptr;
+}
+
+static inline const char *NextJSON(const char *ptr, const idx_t size) {
+	D_ASSERT(!StringUtil::CharacterIsSpace(*ptr)); // Should be handled before
+
+	const char *const end = ptr + size;
+	switch (*ptr) {
+	case '{':
+	case '[':
+	case '"':
+		ptr = NextJSONDefault(ptr, size, end);
+		break;
+	default:
+		// Special case: JSON array containing JSON without clear "parents", i.e., not obj/arr/str
+		while (ptr != end) {
+			switch (*ptr++) {
+			case ',':
+			case ']':
+				ptr--;
+				break;
+			default:
+				continue;
+			}
+			break;
+		}
+	}
+
+	return ptr == end ? nullptr : ptr;
+}
+
+static inline void TrimWhitespace(JSONString &line) {
 	while (line.size != 0 && StringUtil::CharacterIsSpace(line[0])) {
 		line.pointer++;
 		line.size--;
@@ -329,204 +392,248 @@ static inline void TrimWhitespace(JSONLine &line) {
 	}
 }
 
-
+void JSONScanLocalState::ParseJSON(char *const json_start, const idx_t json_size, const idx_t remaining) {
 	yyjson_doc *doc;
-
-
-
+	yyjson_read_err err;
+	if (bind_data.type == JSONScanType::READ_JSON_OBJECTS) { // If we return strings, we cannot parse INSITU
+		doc = JSONCommon::ReadDocumentUnsafe(json_start, json_size, JSONCommon::READ_STOP_FLAG, allocator.GetYYAlc(),
+		                                     &err);
 	} else {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-		current_reader->ThrowParseError(current_buffer_handle->buffer_index, lines_or_objects_in_buffer, err);
+		doc = JSONCommon::ReadDocumentUnsafe(json_start, remaining, JSONCommon::READ_INSITU_FLAG, allocator.GetYYAlc(),
+		                                     &err);
+	}
+	if (!bind_data.ignore_errors && err.code != YYJSON_READ_SUCCESS) {
+		current_reader->ThrowParseError(current_buffer_handle->buffer_index, lines_or_objects_in_buffer, err);
+	}
+
+	// We parse with YYJSON_STOP_WHEN_DONE, so we need to check this by hand
+	const auto read_size = yyjson_doc_get_read_size(doc);
+	if (read_size > json_size) {
+		// Can't go past the boundary, even with ignore_errors
+		err.code = YYJSON_READ_ERROR_UNEXPECTED_END;
+		err.msg = "unexpected end of data";
+		err.pos = json_size;
+		current_reader->ThrowParseError(current_buffer_handle->buffer_index, lines_or_objects_in_buffer, err,
+		                                "Try auto-detecting the JSON format");
+	} else if (!bind_data.ignore_errors && read_size < json_size) {
+		idx_t off = read_size;
+		idx_t rem = json_size;
+		SkipWhitespace(json_start, off, rem);
+		if (off != rem) { // Between end of document and boundary should be whitespace only
+			err.code = YYJSON_READ_ERROR_UNEXPECTED_CONTENT;
+			err.msg = "unexpected content after document";
+			err.pos = read_size;
+			current_reader->ThrowParseError(current_buffer_handle->buffer_index, lines_or_objects_in_buffer, err,
+			                                "Try auto-detecting the JSON format");
 		}
 	}
-	lines_or_objects_in_buffer++;
 
-
-
-
-
-		return doc->root;
-	} else {
-		return nullptr;
+	lines_or_objects_in_buffer++;
+	if (!doc) {
+		values[scan_count] = nullptr;
+		return;
 	}
+
+	// Set the JSONLine and trim
+	units[scan_count] = JSONString(json_start, json_size);
+	TrimWhitespace(units[scan_count]);
+	values[scan_count] = doc->root;
 }
 
-
-
+void JSONScanLocalState::ThrowObjectSizeError(const idx_t object_size) {
+	throw InvalidInputException(
+	    "\"maximum_object_size\" of %llu bytes exceeded while reading file \"%s\" (>%llu bytes)."
+	    "\n Try increasing \"maximum_object_size\".",
+	    bind_data.maximum_object_size, current_reader->GetFileName(), object_size);
+}
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+void JSONScanLocalState::ThrowInvalidAtEndError() {
+	throw InvalidInputException("Invalid JSON detected at the end of file \"%s\".", current_reader->GetFileName());
+}
+
+static pair<JSONFormat, JSONRecordType> DetectFormatAndRecordType(const char *const buffer_ptr, const idx_t buffer_size,
+                                                                  yyjson_alc *alc) {
+	// First we do the easy check whether it's NEWLINE_DELIMITED
+	auto line_end = NextNewline(buffer_ptr, buffer_size);
+	if (line_end != nullptr) {
+		idx_t line_size = line_end - buffer_ptr;
+		SkipWhitespace(buffer_ptr, line_size, buffer_size);
+
+		yyjson_read_err error;
+		auto doc = JSONCommon::ReadDocumentUnsafe((char *)buffer_ptr, line_size, JSONCommon::READ_FLAG, alc, &error);
+		if (error.code == YYJSON_READ_SUCCESS) { // We successfully read the line
+			if (yyjson_is_arr(doc->root) && line_size == buffer_size) {
+				// It's just one array, let's actually assume ARRAY, not NEWLINE_DELIMITED
+				if (yyjson_arr_size(doc->root) == 0 || yyjson_is_obj(yyjson_arr_get(doc->root, 0))) {
+					// Either an empty array (assume records), or an array of objects
+					return make_pair(JSONFormat::ARRAY, JSONRecordType::RECORDS);
+				} else {
+					return make_pair(JSONFormat::ARRAY, JSONRecordType::VALUES);
 				}
+			} else if (yyjson_is_obj(doc->root)) {
+				return make_pair(JSONFormat::NEWLINE_DELIMITED, JSONRecordType::RECORDS);
+			} else {
+				return make_pair(JSONFormat::NEWLINE_DELIMITED, JSONRecordType::VALUES);
 			}
-			array_offset = idx + 1;
-			if (arr_count == STANDARD_VECTOR_SIZE) {
-				break;
-			}
-		} else if (!gstate.bind_data.ignore_errors) {
-			ThrowTransformError(
-			    array_idx,
-			    StringUtil::Format("Expected JSON ARRAY but got %s: %s\nTry setting json_format to 'records'",
-			                       JSONCommon::ValTypeToString(value), JSONCommon::ValToString(value, 50)));
 		}
 	}
-
+
+	// Skip whitespace
+	idx_t buffer_offset = 0;
+	SkipWhitespace(buffer_ptr, buffer_offset, buffer_size);
+	auto remaining = buffer_size - buffer_offset;
+
+	// We know it's not NEWLINE_DELIMITED at this point, if there's a '{', we know it's not ARRAY either
+	// Also if it's fully whitespace we just return something because we don't know
+	if (remaining == 0 || buffer_ptr[buffer_offset] == '{') {
+		return make_pair(JSONFormat::UNSTRUCTURED, JSONRecordType::RECORDS);
+	}
+
+	// We know it's not top-level records, if it's not '[', it's not ARRAY either
+	if (buffer_ptr[buffer_offset] != '[') {
+		return make_pair(JSONFormat::UNSTRUCTURED, JSONRecordType::VALUES);
+	}
+
+	// It's definitely an ARRAY, but now we have to figure out if there's more than one top-level array
+	yyjson_read_err error;
+	auto doc = JSONCommon::ReadDocumentUnsafe((char *)buffer_ptr + buffer_offset, remaining, JSONCommon::READ_STOP_FLAG,
+	                                          alc, &error);
+	if (error.code == YYJSON_READ_SUCCESS) {
+		D_ASSERT(yyjson_is_arr(doc->root));
+
+		// We successfully read something!
+		buffer_offset += yyjson_doc_get_read_size(doc);
+		SkipWhitespace(buffer_ptr, buffer_offset, buffer_size);
+		remaining = buffer_size - buffer_offset;
+
+		if (remaining != 0) { // There's more
+			return make_pair(JSONFormat::UNSTRUCTURED, JSONRecordType::VALUES);
+		}
+
+		// Just one array, check what's in there
+		if (yyjson_arr_size(doc->root) == 0 || yyjson_is_obj(yyjson_arr_get(doc->root, 0))) {
+			// Either an empty array (assume records), or an array of objects
+			return make_pair(JSONFormat::ARRAY, JSONRecordType::RECORDS);
+		} else {
+			return make_pair(JSONFormat::ARRAY, JSONRecordType::VALUES);
+		}
+	}
+
+	// We weren't able to parse an array, could be broken or an array larger than our buffer size, let's skip over '['
+	SkipWhitespace(buffer_ptr, ++buffer_offset, --remaining);
+	remaining = buffer_size - buffer_offset;
+
+	// If it's '{' we know there's RECORDS in the ARRAY, else it's VALUES
+	if (remaining == 0 || buffer_ptr[buffer_offset] == '{') {
+		return make_pair(JSONFormat::ARRAY, JSONRecordType::RECORDS);
+	}
+
+	// It's not RECORDS, so it must be VALUES
+	return make_pair(JSONFormat::ARRAY, JSONRecordType::VALUES);
 }
 
 bool JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate) {
+	AllocatedData buffer;
 	if (current_reader) {
-
+		// Keep track of this for accurate errors
 		current_reader->SetBufferLineOrObjectCount(current_buffer_handle->buffer_index, lines_or_objects_in_buffer);
-
-
-
+
+		// Try to re-use existing buffer
+		if (current_buffer_handle && --current_buffer_handle->readers == 0) {
+			buffer = current_reader->RemoveBuffer(current_buffer_handle->buffer_index);
+		} else {
+			buffer = gstate.allocator.Allocate(gstate.buffer_capacity);
 		}
-	}
 
-
-
-
-
-
+		if (!is_last) {
+			if (current_reader->GetFormat() != JSONFormat::NEWLINE_DELIMITED) {
+				memcpy(buffer.get(), reconstruct_buffer.get(),
+				       prev_buffer_remainder); // Copy last bit of previous buffer
+			}
+		} else {
+			if (gstate.bind_data.type != JSONScanType::SAMPLE) {
+				current_reader->CloseJSONFile(); // Close files that are done if we're not sampling
+			}
+			current_reader = nullptr;
+		}
 	} else {
-		// Allocate a new buffer
 		buffer = gstate.allocator.Allocate(gstate.buffer_capacity);
 	}
 	buffer_ptr = (const char *)buffer.get();
 
-	if (current_reader && current_reader->GetOptions().format == JSONFormat::UNSTRUCTURED) {
-		// Copy last bit of previous buffer
-		memcpy(buffer.get(), reconstruct_buffer.get(), prev_buffer_remainder);
-	}
-
 	idx_t buffer_index;
 	while (true) {
 		if (current_reader) {
-
-			if (buffer_size
+			ReadNextBufferInternal(gstate, buffer_index);
+			if (buffer_size == 0) {
+				if (is_last && gstate.bind_data.type != JSONScanType::SAMPLE) {
+					current_reader->CloseJSONFile();
+				}
+				if (current_reader->IsParallel()) {
+					// If this threads' current reader is still the one at gstate.file_index,
+					// this thread can end the parallel scan
+					lock_guard<mutex> guard(gstate.lock);
+					if (gstate.file_index < gstate.json_readers.size() &&
+					    current_reader == gstate.json_readers[gstate.file_index].get()) {
+						gstate.file_index++; // End parallel scan
+					}
+				}
+				current_reader = nullptr;
+			} else {
 				break; // We read something!
 			}
 		}
 
-		//
-
-
-
-
-		}
-		if (current_reader && current_reader == gstate.json_readers[gstate.file_index].get() &&
-		    current_reader->GetOptions().format == JSONFormat::NEWLINE_DELIMITED) {
-			// We had a reader, but we didn't read anything, move to the next file
-			gstate.file_index++;
-		}
-		// Check again since we may have just updated
-		if (gstate.file_index == gstate.json_readers.size()) {
-			return false; // No more files left
-		}
-
-		// Try the next reader
-		current_reader = gstate.json_readers[gstate.file_index].get();
-		auto &options = current_reader->GetOptions();
-		if (current_reader->IsOpen()) {
-			if (options.format == JSONFormat::UNSTRUCTURED ||
-			    (options.compression != FileCompressionType::UNCOMPRESSED &&
-			     gstate.file_index < gstate.json_readers.size())) {
-				// Can only be open from schema detection
-				batch_index = gstate.batch_index++;
-				gstate.file_index++;
+		// This thread needs a new reader
+		{
+			lock_guard<mutex> guard(gstate.lock);
+			if (gstate.file_index == gstate.json_readers.size()) {
+				return false; // No more files left
 			}
-			continue; // It's open, this thread joins the scan
-		}
-
-		// Unopened file
-		current_reader->OpenJSONFile();
-		batch_index = gstate.batch_index++;
-		if (options.format == JSONFormat::UNSTRUCTURED || (options.format == JSONFormat::NEWLINE_DELIMITED &&
-		                                                   options.compression != FileCompressionType::UNCOMPRESSED &&
-		                                                   gstate.file_index < gstate.json_readers.size())) {
-			gstate.file_index++; // UNSTRUCTURED necessitates single-threaded read
-		}
-		if (options.format != JSONFormat::AUTO_DETECT) {
-			continue; // Re-enter loop to proceed reading
-		}
 
-
-
-
-
-
-
+			// Try the next reader
+			current_reader = gstate.json_readers[gstate.file_index].get();
+			if (current_reader->IsOpen()) {
+				// Can only be open from auto detection, so these should be known
+				if (!current_reader->IsParallel()) {
+					batch_index = gstate.batch_index++;
+					gstate.file_index++;
+				}
+				continue; // Re-enter the loop to start scanning the assigned file
+			}
 
-
-
-
-
-
-
-
+			current_reader->OpenJSONFile();
+			batch_index = gstate.batch_index++;
+			if (current_reader->GetFormat() != JSONFormat::AUTO_DETECT) {
+				if (!current_reader->IsParallel()) {
+					gstate.file_index++;
+				}
+				continue;
+			}
 
-
-
-
-
-
-
-
-
-			gstate.file_index++; // UNSTRUCTURED necessitates single-threaded read
-		}
+			// If we have a low amount of files, we auto-detect within the lock,
+			// so other threads may join a parallel NDJSON scan
+			if (gstate.json_readers.size() < 100) {
+				if (ReadAndAutoDetect(gstate, buffer_index, false)) {
+					continue;
+				}
+				break;
+			}
 
-
-		if (options.compression != FileCompressionType::UNCOMPRESSED &&
-		    gstate.file_index < gstate.json_readers.size()) {
+			// Increment the file index within the lock, then read/auto-detect outside of the lock
 			gstate.file_index++;
 		}
 
+		// High amount of files, just do 1 thread per file
+		if (ReadAndAutoDetect(gstate, buffer_index, true)) {
+			continue;
+		}
 		break;
 	}
 	D_ASSERT(buffer_size != 0); // We should have read something if we got here
 
-	idx_t readers;
-	if (current_reader->
-		readers = 1;
-	} else {
+	idx_t readers = 1;
+	if (current_reader->IsParallel()) {
 		readers = is_last ? 1 : 2;
 	}
 
@@ -535,24 +642,57 @@ bool JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate) {
 	current_buffer_handle = json_buffer_handle.get();
 	current_reader->InsertBuffer(buffer_index, std::move(json_buffer_handle));
 
-	buffer_offset = 0;
 	prev_buffer_remainder = 0;
 	lines_or_objects_in_buffer = 0;
 
+	// YYJSON needs this
 	memset((void *)(buffer_ptr + buffer_size), 0, YYJSON_PADDING_SIZE);
-	if (current_reader->GetOptions().format == JSONFormat::UNSTRUCTURED) {
-		memcpy((void *)buffer_copy_ptr, buffer_ptr, buffer_size + YYJSON_PADDING_SIZE);
-	}
 
 	return true;
 }
 
-
+bool JSONScanLocalState::ReadAndAutoDetect(JSONScanGlobalState &gstate, idx_t &buffer_index,
+                                           const bool already_incremented_file_idx) {
+	// We have to detect the JSON format - hold the gstate lock while we do this
+	ReadNextBufferInternal(gstate, buffer_index);
+	if (buffer_size == 0) {
+		if (!already_incremented_file_idx) {
+			gstate.file_index++; // Empty file, move to the next one
+		}
+		return true;
+	}
+
+	auto format_and_record_type = DetectFormatAndRecordType(buffer_ptr, buffer_size, allocator.GetYYAlc());
+	current_reader->SetFormat(format_and_record_type.first);
+	if (current_reader->GetRecordType() == JSONRecordType::AUTO_DETECT) {
+		current_reader->SetRecordType(format_and_record_type.second);
+	}
+	if (current_reader->GetFormat() == JSONFormat::ARRAY) {
+		SkipOverArrayStart();
+	}
+
+	if (bind_data.options.record_type == JSONRecordType::RECORDS &&
+	    current_reader->GetRecordType() != JSONRecordType::RECORDS) {
+		throw InvalidInputException("Expected file \"%s\" to contain records, detected non-record JSON instead.",
+		                            current_reader->GetFileName());
+	}
+	if (!already_incremented_file_idx && !current_reader->IsParallel()) {
+		gstate.file_index++;
+	}
+	return false;
+}
+
+void JSONScanLocalState::ReadNextBufferInternal(JSONScanGlobalState &gstate, idx_t &buffer_index) {
 	if (current_reader->GetFileHandle().CanSeek()) {
 		ReadNextBufferSeek(gstate, buffer_index);
 	} else {
 		ReadNextBufferNoSeek(gstate, buffer_index);
 	}
+
+	buffer_offset = 0;
+	if (buffer_index == 0 && current_reader->GetFormat() == JSONFormat::ARRAY) {
+		SkipOverArrayStart();
+	}
 }
 
 void JSONScanLocalState::ReadNextBufferSeek(JSONScanGlobalState &gstate, idx_t &buffer_index) {
@@ -567,13 +707,13 @@ void JSONScanLocalState::ReadNextBufferSeek(JSONScanGlobalState &gstate, idx_t &
 	buffer_index = current_reader->GetBufferIndex();
 
 	read_size = file_handle.GetPositionAndSize(read_position, request_size);
-	is_last =
+	is_last = read_size < request_size;
 
 	if (!gstate.bind_data.ignore_errors && read_size == 0 && prev_buffer_remainder != 0) {
-
+		ThrowInvalidAtEndError();
 	}
 
-	if (current_reader->
+	if (current_reader->GetFormat() == JSONFormat::NEWLINE_DELIMITED) {
 		batch_index = gstate.batch_index++;
 	}
 }
@@ -604,10 +744,10 @@ void JSONScanLocalState::ReadNextBufferNoSeek(JSONScanGlobalState &gstate, idx_t
 	is_last = read_size < request_size;
 
 	if (!gstate.bind_data.ignore_errors && read_size == 0 && prev_buffer_remainder != 0) {
-
+		ThrowInvalidAtEndError();
 	}
 
-	if (current_reader->
+	if (current_reader->GetFormat() == JSONFormat::NEWLINE_DELIMITED) {
 		batch_index = gstate.batch_index++;
 	}
 }
@@ -618,9 +758,38 @@ void JSONScanLocalState::ReadNextBufferNoSeek(JSONScanGlobalState &gstate, idx_t
 	}
 }
 
+void JSONScanLocalState::SkipOverArrayStart() {
+	// First read of this buffer, check if it's actually an array and skip over the bytes
+	SkipWhitespace(buffer_ptr, buffer_offset, buffer_size);
+	if (buffer_offset == buffer_size) {
+		return; // Empty file
+	}
+	if (buffer_ptr[buffer_offset] != '[') {
+		throw InvalidInputException(
+		    "Expected top-level JSON array with format='array', but first character is '%c' in file \"%s\"."
+		    "\n Try setting format='auto' or format='newline_delimited'.",
+		    buffer_ptr[buffer_offset], current_reader->GetFileName());
+	}
+	SkipWhitespace(buffer_ptr, ++buffer_offset, buffer_size);
+	if (buffer_offset >= buffer_size) {
+		throw InvalidInputException("Missing closing brace ']' in JSON array with format='array' in file \"%s\"",
+		                            current_reader->GetFileName());
+	}
+	if (buffer_ptr[buffer_offset] == ']') {
+		// Empty array
+		SkipWhitespace(buffer_ptr, ++buffer_offset, buffer_size);
+		if (buffer_offset != buffer_size) {
+			throw InvalidInputException(
+			    "Empty array with trailing data when parsing JSON array with format='array' in file \"%s\"",
+			    current_reader->GetFileName());
+		}
+		return;
+	}
+}
+
 void JSONScanLocalState::ReconstructFirstObject(JSONScanGlobalState &gstate) {
 	D_ASSERT(current_buffer_handle->buffer_index != 0);
-	D_ASSERT(current_reader->
+	D_ASSERT(current_reader->GetFormat() == JSONFormat::NEWLINE_DELIMITED);
 
 	// Spinlock until the previous batch index has also read its buffer
 	JSONBufferHandle *previous_buffer_handle = nullptr;
@@ -638,9 +807,8 @@ void JSONScanLocalState::ReconstructFirstObject(JSONScanGlobalState &gstate) {
 	memcpy(reconstruct_ptr, part1_ptr, part1_size);
 	// Now find the newline in the current block
 	auto line_end = NextNewline(buffer_ptr, buffer_size);
-	if (line_end == nullptr) {
-
-		                     bind_data.maximum_object_size, buffer_size - buffer_offset);
+	if (line_end == nullptr) {
+		ThrowObjectSizeError(buffer_size - buffer_offset);
 	} else {
 		line_end++;
 	}
@@ -648,8 +816,7 @@ void JSONScanLocalState::ReconstructFirstObject(JSONScanGlobalState &gstate) {
 
 	idx_t line_size = part1_size + part2_size;
 	if (line_size > bind_data.maximum_object_size) {
-
-		                     bind_data.maximum_object_size, line_size);
+		ThrowObjectSizeError(line_size);
 	}
 
 	// And copy the remainder of the line to the reconstruct buffer
@@ -662,85 +829,68 @@ void JSONScanLocalState::ReconstructFirstObject(JSONScanGlobalState &gstate) {
 		current_reader->RemoveBuffer(current_buffer_handle->buffer_index - 1);
 	}
 
-
+	ParseJSON((char *)reconstruct_ptr, line_size, line_size);
 }
 
-void JSONScanLocalState::
-
-	// if a different error code happens within the last 50 bytes
-	// we assume it should be YYJSON_READ_ERROR_UNEXPECTED_END instead
-	static constexpr idx_t END_BOUND = 50;
-
-	const auto max_obj_size = reconstruct_buffer.GetSize();
-	yyjson_read_err error;
-	for (; count < STANDARD_VECTOR_SIZE; count++) {
-		const auto obj_start = buffer_ptr + buffer_offset;
-		const auto obj_copy_start = buffer_copy_ptr + buffer_offset;
-
-		idx_t remaining = buffer_size - buffer_offset;
-		if (remaining == 0) {
-			break;
-		}
-
-		// Read next JSON doc
-		auto read_doc = JSONCommon::ReadDocumentUnsafe((char *)obj_start, remaining, JSONCommon::STOP_READ_FLAG,
-		                                               json_allocator.GetYYJSONAllocator(), &error);
-		if (error.code == YYJSON_READ_SUCCESS) {
-			idx_t line_size = yyjson_doc_get_read_size(read_doc);
-			lines[count] = JSONLine(obj_copy_start, line_size);
-			TrimWhitespace(lines[count]);
+void JSONScanLocalState::ParseNextChunk() {
+	auto buffer_offset_before = buffer_offset;
 
-
-
-
-
-			current_reader->ThrowParseError(current_buffer_handle->buffer_index, lines_or_objects_in_buffer, error,
-			                                "Try increasing \"maximum_object_size\".");
-		} else if (!is_last && (error.code == YYJSON_READ_ERROR_UNEXPECTED_END || remaining - error.pos < END_BOUND)) {
-			// Copy remaining to reconstruct_buffer
-			const auto reconstruct_ptr = reconstruct_buffer.get();
-			memcpy(reconstruct_ptr, obj_copy_start, remaining);
-			prev_buffer_remainder = remaining;
-			buffer_offset = buffer_size;
-			break;
-		} else {
-			current_reader->ThrowParseError(current_buffer_handle->buffer_index, lines_or_objects_in_buffer, error);
-		}
-		values[count] = read_doc->root;
-	}
-}
-
-void JSONScanLocalState::ReadNewlineDelimited(idx_t &count) {
-	for (; count < STANDARD_VECTOR_SIZE; count++) {
-		auto line_start = buffer_ptr + buffer_offset;
+	const auto format = current_reader->GetFormat();
+	for (; scan_count < STANDARD_VECTOR_SIZE; scan_count++) {
+		SkipWhitespace(buffer_ptr, buffer_offset, buffer_size);
+		auto json_start = buffer_ptr + buffer_offset;
 		idx_t remaining = buffer_size - buffer_offset;
 		if (remaining == 0) {
 			break;
 		}
-
-
-
-
-		if (line_end == nullptr) {
+		const char *json_end = format == JSONFormat::NEWLINE_DELIMITED ? NextNewline(json_start, remaining)
+		                                                               : NextJSON(json_start, remaining);
+		if (json_end == nullptr) {
 			// We reached the end of the buffer
 			if (!is_last) {
 				// Last bit of data belongs to the next batch
+				if (format != JSONFormat::NEWLINE_DELIMITED) {
+					if (scan_count == 0) {
+						ThrowObjectSizeError(remaining);
+					}
+					memcpy(reconstruct_buffer.get(), json_start, remaining);
+					prev_buffer_remainder = remaining;
+				}
 				buffer_offset = buffer_size;
 				break;
 			}
-
+			json_end = json_start + remaining;
 		}
-		idx_t line_size = line_end - line_start;
 
-
+		idx_t json_size = json_end - json_start;
+		ParseJSON((char *)json_start, json_size, remaining);
+		buffer_offset += json_size;
 
-
+		if (format == JSONFormat::ARRAY) {
+			SkipWhitespace(buffer_ptr, buffer_offset, buffer_size);
+			if (buffer_ptr[buffer_offset] == ',' || buffer_ptr[buffer_offset] == ']') {
+				buffer_offset++;
+			} else { // We can't ignore this error, even with 'ignore_errors'
+				yyjson_read_err err;
+				err.code = YYJSON_READ_ERROR_UNEXPECTED_CHARACTER;
+				err.msg = "unexpected character";
+				err.pos = json_size;
+				current_reader->ThrowParseError(current_buffer_handle->buffer_index, lines_or_objects_in_buffer, err);
+			}
+		}
 		SkipWhitespace(buffer_ptr, buffer_offset, buffer_size);
 	}
+
+	total_read_size += buffer_offset - buffer_offset_before;
+	total_tuple_count += scan_count;
 }
 
 yyjson_alc *JSONScanLocalState::GetAllocator() {
-	return
+	return allocator.GetYYAlc();
+}
+
+const MultiFileReaderData &JSONScanLocalState::GetReaderData() const {
+	return current_reader->reader_data;
 }
 
 void JSONScanLocalState::ThrowTransformError(idx_t object_index, const string &error_message) {
@@ -751,4 +901,73 @@ void JSONScanLocalState::ThrowTransformError(idx_t object_index, const string &e
 	current_reader->ThrowTransformError(current_buffer_handle->buffer_index, line_or_object_in_buffer, error_message);
 }
 
+double JSONScan::ScanProgress(ClientContext &context, const FunctionData *bind_data_p,
+                              const GlobalTableFunctionState *global_state) {
+	auto &gstate = global_state->Cast<JSONGlobalTableFunctionState>().state;
+	double progress = 0;
+	for (auto &reader : gstate.json_readers) {
+		progress += reader->GetProgress();
+	}
+	return progress / double(gstate.json_readers.size());
+}
+
+idx_t JSONScan::GetBatchIndex(ClientContext &context, const FunctionData *bind_data_p,
+                              LocalTableFunctionState *local_state, GlobalTableFunctionState *global_state) {
+	auto &lstate = local_state->Cast<JSONLocalTableFunctionState>();
+	return lstate.GetBatchIndex();
+}
+
+unique_ptr<NodeStatistics> JSONScan::Cardinality(ClientContext &context, const FunctionData *bind_data) {
+	auto &data = bind_data->Cast<JSONScanData>();
+	idx_t per_file_cardinality;
+	if (data.initial_reader && data.initial_reader->IsOpen()) {
+		per_file_cardinality = data.initial_reader->GetFileHandle().FileSize() / data.avg_tuple_size;
+	} else {
+		per_file_cardinality = 42; // The cardinality of an unknown JSON file is the almighty number 42
+	}
+	return make_uniq<NodeStatistics>(per_file_cardinality * data.files.size());
+}
+
+void JSONScan::ComplexFilterPushdown(ClientContext &context, LogicalGet &get, FunctionData *bind_data_p,
+                                     vector<unique_ptr<Expression>> &filters) {
+	auto &data = bind_data_p->Cast<JSONScanData>();
+	auto reset_reader =
+	    MultiFileReader::ComplexFilterPushdown(context, data.files, data.options.file_options, get, filters);
+	if (reset_reader) {
+		MultiFileReader::PruneReaders(data);
+	}
+}
+
+void JSONScan::Serialize(FieldWriter &writer, const FunctionData *bind_data_p, const TableFunction &function) {
+	auto &bind_data = bind_data_p->Cast<JSONScanData>();
+	bind_data.Serialize(writer);
+}
+
+unique_ptr<FunctionData> JSONScan::Deserialize(ClientContext &context, FieldReader &reader, TableFunction &function) {
+	auto result = make_uniq<JSONScanData>();
+	result->Deserialize(context, reader);
+	return std::move(result);
+}
+
+void JSONScan::TableFunctionDefaults(TableFunction &table_function) {
+	MultiFileReader::AddParameters(table_function);
+
+	table_function.named_parameters["maximum_object_size"] = LogicalType::UINTEGER;
+	table_function.named_parameters["ignore_errors"] = LogicalType::BOOLEAN;
+	table_function.named_parameters["format"] = LogicalType::VARCHAR;
+	table_function.named_parameters["compression"] = LogicalType::VARCHAR;
+
+	table_function.table_scan_progress = ScanProgress;
+	table_function.get_batch_index = GetBatchIndex;
+	table_function.cardinality = Cardinality;
+
+	table_function.serialize = Serialize;
+	table_function.deserialize = Deserialize;
+
+	table_function.projection_pushdown = true;
+	table_function.filter_pushdown = false;
+	table_function.filter_prune = false;
+	table_function.pushdown_complex_filter = ComplexFilterPushdown;
+}
+
 } // namespace duckdb