duckdb 0.6.2-dev1978.0 → 0.6.2-dev2015.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/duckdb/extension/json/buffered_json_reader.cpp +132 -18
- package/src/duckdb/extension/json/include/buffered_json_reader.hpp +29 -9
- package/src/duckdb/extension/json/include/json_common.hpp +56 -0
- package/src/duckdb/extension/json/include/json_functions.hpp +9 -0
- package/src/duckdb/extension/json/include/json_scan.hpp +115 -25
- package/src/duckdb/extension/json/include/json_structure.hpp +73 -0
- package/src/duckdb/extension/json/include/json_transform.hpp +57 -0
- package/src/duckdb/extension/json/json-extension.cpp +3 -0
- package/src/duckdb/extension/json/json_functions/json_contains.cpp +1 -1
- package/src/duckdb/extension/json/json_functions/json_create.cpp +6 -10
- package/src/duckdb/extension/json/json_functions/json_extract.cpp +1 -1
- package/src/duckdb/extension/json/json_functions/json_keys.cpp +60 -0
- package/src/duckdb/extension/json/json_functions/json_structure.cpp +404 -150
- package/src/duckdb/extension/json/json_functions/json_transform.cpp +216 -60
- package/src/duckdb/extension/json/json_functions/read_json.cpp +224 -0
- package/src/duckdb/extension/json/json_functions/read_json_objects.cpp +6 -6
- package/src/duckdb/extension/json/json_functions.cpp +25 -0
- package/src/duckdb/extension/json/json_scan.cpp +192 -86
- package/src/duckdb/extension/json/yyjson/include/yyjson.hpp +18 -9
- package/src/duckdb/extension/json/yyjson/yyjson.cpp +58 -13
- package/src/duckdb/src/function/table/copy_csv.cpp +16 -11
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/function/scalar/strftime.hpp +2 -2
- package/src/duckdb/src/include/duckdb/main/extension_functions.hpp +5 -0
- package/src/duckdb/ub_extension_json_json_functions.cpp +4 -0
|
@@ -14,8 +14,8 @@ unique_ptr<FunctionData> ReadJSONObjectsBind(ClientContext &context, TableFuncti
|
|
|
14
14
|
static void ReadJSONObjectsFunction(ClientContext &context, TableFunctionInput &data_p, DataChunk &output) {
|
|
15
15
|
D_ASSERT(output.ColumnCount() == 1);
|
|
16
16
|
D_ASSERT(JSONCommon::LogicalTypeIsJSON(output.data[0].GetType()));
|
|
17
|
-
auto &gstate = (
|
|
18
|
-
auto &lstate = (
|
|
17
|
+
auto &gstate = ((JSONGlobalTableFunctionState &)*data_p.global_state).state;
|
|
18
|
+
auto &lstate = ((JSONLocalTableFunctionState &)*data_p.local_state).state;
|
|
19
19
|
|
|
20
20
|
// Fetch next lines
|
|
21
21
|
const auto count = lstate.ReadNext(gstate);
|
|
@@ -38,8 +38,8 @@ static void ReadJSONObjectsFunction(ClientContext &context, TableFunctionInput &
|
|
|
38
38
|
|
|
39
39
|
TableFunction GetReadJSONObjectsTableFunction(bool list_parameter, shared_ptr<JSONScanInfo> function_info) {
|
|
40
40
|
auto parameter = list_parameter ? LogicalType::LIST(LogicalType::VARCHAR) : LogicalType::VARCHAR;
|
|
41
|
-
TableFunction table_function({parameter}, ReadJSONObjectsFunction, ReadJSONObjectsBind,
|
|
42
|
-
|
|
41
|
+
TableFunction table_function({parameter}, ReadJSONObjectsFunction, ReadJSONObjectsBind,
|
|
42
|
+
JSONGlobalTableFunctionState::Init, JSONLocalTableFunctionState::Init);
|
|
43
43
|
JSONScan::TableFunctionDefaults(table_function);
|
|
44
44
|
table_function.function_info = std::move(function_info);
|
|
45
45
|
|
|
@@ -48,7 +48,7 @@ TableFunction GetReadJSONObjectsTableFunction(bool list_parameter, shared_ptr<JS
|
|
|
48
48
|
|
|
49
49
|
CreateTableFunctionInfo JSONFunctions::GetReadJSONObjectsFunction() {
|
|
50
50
|
TableFunctionSet function_set("read_json_objects");
|
|
51
|
-
auto function_info = make_shared<JSONScanInfo>(JSONFormat::UNSTRUCTURED
|
|
51
|
+
auto function_info = make_shared<JSONScanInfo>(JSONScanType::READ_JSON_OBJECTS, JSONFormat::UNSTRUCTURED);
|
|
52
52
|
function_set.AddFunction(GetReadJSONObjectsTableFunction(false, function_info));
|
|
53
53
|
function_set.AddFunction(GetReadJSONObjectsTableFunction(true, function_info));
|
|
54
54
|
return CreateTableFunctionInfo(function_set);
|
|
@@ -56,7 +56,7 @@ CreateTableFunctionInfo JSONFunctions::GetReadJSONObjectsFunction() {
|
|
|
56
56
|
|
|
57
57
|
CreateTableFunctionInfo JSONFunctions::GetReadNDJSONObjectsFunction() {
|
|
58
58
|
TableFunctionSet function_set("read_ndjson_objects");
|
|
59
|
-
auto function_info = make_shared<JSONScanInfo>(JSONFormat::NEWLINE_DELIMITED
|
|
59
|
+
auto function_info = make_shared<JSONScanInfo>(JSONScanType::READ_JSON_OBJECTS, JSONFormat::NEWLINE_DELIMITED);
|
|
60
60
|
function_set.AddFunction(GetReadJSONObjectsTableFunction(false, function_info));
|
|
61
61
|
function_set.AddFunction(GetReadJSONObjectsTableFunction(true, function_info));
|
|
62
62
|
return CreateTableFunctionInfo(function_set);
|
|
@@ -3,6 +3,10 @@
|
|
|
3
3
|
#include "duckdb/execution/expression_executor.hpp"
|
|
4
4
|
#include "duckdb/function/cast/cast_function_set.hpp"
|
|
5
5
|
#include "duckdb/function/cast/default_casts.hpp"
|
|
6
|
+
#include "duckdb/function/replacement_scan.hpp"
|
|
7
|
+
#include "duckdb/parser/expression/constant_expression.hpp"
|
|
8
|
+
#include "duckdb/parser/expression/function_expression.hpp"
|
|
9
|
+
#include "duckdb/parser/tableref/table_function_ref.hpp"
|
|
6
10
|
|
|
7
11
|
namespace duckdb {
|
|
8
12
|
|
|
@@ -136,6 +140,7 @@ vector<CreateScalarFunctionInfo> JSONFunctions::GetScalarFunctions() {
|
|
|
136
140
|
// Other
|
|
137
141
|
functions.push_back(GetArrayLengthFunction());
|
|
138
142
|
functions.push_back(GetContainsFunction());
|
|
143
|
+
functions.push_back(GetKeysFunction());
|
|
139
144
|
functions.push_back(GetTypeFunction());
|
|
140
145
|
functions.push_back(GetValidFunction());
|
|
141
146
|
|
|
@@ -149,9 +154,29 @@ vector<CreateTableFunctionInfo> JSONFunctions::GetTableFunctions() {
|
|
|
149
154
|
functions.push_back(GetReadJSONObjectsFunction());
|
|
150
155
|
functions.push_back(GetReadNDJSONObjectsFunction());
|
|
151
156
|
|
|
157
|
+
// Read JSON as columnar data
|
|
158
|
+
functions.push_back(GetReadJSONFunction());
|
|
159
|
+
functions.push_back(GetReadNDJSONFunction());
|
|
160
|
+
functions.push_back(GetReadJSONAutoFunction());
|
|
161
|
+
functions.push_back(GetReadNDJSONAutoFunction());
|
|
162
|
+
|
|
152
163
|
return functions;
|
|
153
164
|
}
|
|
154
165
|
|
|
166
|
+
unique_ptr<TableRef> JSONFunctions::ReadJSONReplacement(ClientContext &context, const string &table_name,
|
|
167
|
+
ReplacementScanData *data) {
|
|
168
|
+
auto lower_name = StringUtil::Lower(table_name);
|
|
169
|
+
if (!StringUtil::EndsWith(lower_name, ".json") && !StringUtil::Contains(lower_name, ".json?") &&
|
|
170
|
+
!StringUtil::EndsWith(lower_name, ".ndjson") && !StringUtil::Contains(lower_name, ".ndjson?")) {
|
|
171
|
+
return nullptr;
|
|
172
|
+
}
|
|
173
|
+
auto table_function = make_unique<TableFunctionRef>();
|
|
174
|
+
vector<unique_ptr<ParsedExpression>> children;
|
|
175
|
+
children.push_back(make_unique<ConstantExpression>(Value(table_name)));
|
|
176
|
+
table_function->function = make_unique<FunctionExpression>("read_json_auto", std::move(children));
|
|
177
|
+
return std::move(table_function);
|
|
178
|
+
}
|
|
179
|
+
|
|
155
180
|
static unique_ptr<FunctionLocalState> InitJSONCastLocalState(ClientContext &context) {
|
|
156
181
|
return make_unique<JSONFunctionLocalState>(context);
|
|
157
182
|
}
|
|
@@ -19,18 +19,16 @@ unique_ptr<FunctionData> JSONScanData::Bind(ClientContext &context, TableFunctio
|
|
|
19
19
|
auto &options = result->options;
|
|
20
20
|
|
|
21
21
|
auto &info = (JSONScanInfo &)*input.info;
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
options.format = info.forced_format;
|
|
26
|
-
result->return_json_strings = info.return_json_strings;
|
|
22
|
+
options.format = info.format;
|
|
23
|
+
result->type = info.type;
|
|
24
|
+
result->auto_detect = info.auto_detect;
|
|
27
25
|
|
|
28
26
|
vector<string> patterns;
|
|
29
|
-
if (input.inputs[0].type().id() == LogicalTypeId::LIST) { //
|
|
27
|
+
if (input.inputs[0].type().id() == LogicalTypeId::LIST) { // List of globs
|
|
30
28
|
for (auto &val : ListValue::GetChildren(input.inputs[0])) {
|
|
31
29
|
patterns.push_back(StringValue::Get(val));
|
|
32
30
|
}
|
|
33
|
-
} else { //
|
|
31
|
+
} else { // Single glob pattern
|
|
34
32
|
patterns.push_back(StringValue::Get(input.inputs[0]));
|
|
35
33
|
}
|
|
36
34
|
InitializeFilePaths(context, patterns, result->file_paths);
|
|
@@ -50,15 +48,24 @@ unique_ptr<FunctionData> JSONScanData::Bind(ClientContext &context, TableFunctio
|
|
|
50
48
|
} else if (format == "newline_delimited") {
|
|
51
49
|
options.format = JSONFormat::NEWLINE_DELIMITED;
|
|
52
50
|
} else {
|
|
53
|
-
throw
|
|
51
|
+
throw BinderException("format must be one of ['auto', 'unstructured', 'newline_delimited']");
|
|
52
|
+
}
|
|
53
|
+
} else if (loption == "compression") {
|
|
54
|
+
auto compression = StringUtil::Lower(StringValue::Get(kv.second));
|
|
55
|
+
if (compression == "none") {
|
|
56
|
+
options.compression = FileCompressionType::UNCOMPRESSED;
|
|
57
|
+
} else if (compression == "gzip") {
|
|
58
|
+
options.compression = FileCompressionType::GZIP;
|
|
59
|
+
} else if (compression == "zstd") {
|
|
60
|
+
options.compression = FileCompressionType::ZSTD;
|
|
61
|
+
} else if (compression == "auto") {
|
|
62
|
+
options.compression = FileCompressionType::AUTO_DETECT;
|
|
63
|
+
} else {
|
|
64
|
+
throw BinderException("compression must be one of ['none', 'gzip', 'zstd', 'auto']");
|
|
54
65
|
}
|
|
55
66
|
}
|
|
56
67
|
}
|
|
57
68
|
|
|
58
|
-
if (result->ignore_errors && options.format == JSONFormat::UNSTRUCTURED) {
|
|
59
|
-
throw InvalidInputException("Cannot ignore errors with unstructured format");
|
|
60
|
-
}
|
|
61
|
-
|
|
62
69
|
return std::move(result);
|
|
63
70
|
}
|
|
64
71
|
|
|
@@ -75,56 +82,99 @@ void JSONScanData::InitializeFilePaths(ClientContext &context, const vector<stri
|
|
|
75
82
|
}
|
|
76
83
|
|
|
77
84
|
void JSONScanData::Serialize(FieldWriter &writer) {
|
|
85
|
+
writer.WriteField<JSONScanType>(type);
|
|
78
86
|
options.Serialize(writer);
|
|
79
87
|
writer.WriteList<string>(file_paths);
|
|
80
88
|
writer.WriteField<bool>(ignore_errors);
|
|
81
89
|
writer.WriteField<idx_t>(maximum_object_size);
|
|
82
|
-
|
|
90
|
+
transform_options.Serialize(writer);
|
|
91
|
+
writer.WriteField<bool>(auto_detect);
|
|
92
|
+
writer.WriteField<idx_t>(sample_size);
|
|
93
|
+
writer.WriteList<string>(names);
|
|
94
|
+
writer.WriteField<idx_t>(max_depth);
|
|
83
95
|
}
|
|
84
96
|
|
|
85
97
|
void JSONScanData::Deserialize(FieldReader &reader) {
|
|
98
|
+
type = reader.ReadRequired<JSONScanType>();
|
|
86
99
|
options.Deserialize(reader);
|
|
87
100
|
file_paths = reader.ReadRequiredList<string>();
|
|
88
101
|
ignore_errors = reader.ReadRequired<bool>();
|
|
89
102
|
maximum_object_size = reader.ReadRequired<idx_t>();
|
|
90
|
-
|
|
103
|
+
transform_options.Deserialize(reader);
|
|
104
|
+
auto_detect = reader.ReadRequired<bool>();
|
|
105
|
+
sample_size = reader.ReadRequired<idx_t>();
|
|
106
|
+
names = reader.ReadRequiredList<string>();
|
|
107
|
+
max_depth = reader.ReadRequired<idx_t>();
|
|
91
108
|
}
|
|
92
109
|
|
|
93
110
|
JSONScanGlobalState::JSONScanGlobalState(ClientContext &context, JSONScanData &bind_data_p)
|
|
94
111
|
: bind_data(bind_data_p), allocator(BufferManager::GetBufferManager(context).GetBufferAllocator()),
|
|
95
112
|
buffer_capacity(bind_data.maximum_object_size * 2), file_index(0), batch_index(0),
|
|
96
113
|
system_threads(TaskScheduler::GetScheduler(context).NumberOfThreads()) {
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
114
|
+
if (bind_data.stored_readers.empty()) {
|
|
115
|
+
json_readers.reserve(bind_data.file_paths.size());
|
|
116
|
+
for (idx_t i = 0; i < bind_data.file_paths.size(); i++) {
|
|
117
|
+
json_readers.push_back(
|
|
118
|
+
make_unique<BufferedJSONReader>(context, bind_data.options, bind_data.file_paths[i]));
|
|
119
|
+
}
|
|
120
|
+
} else {
|
|
121
|
+
json_readers = std::move(bind_data.stored_readers);
|
|
100
122
|
}
|
|
101
123
|
}
|
|
102
124
|
|
|
103
|
-
unique_ptr<GlobalTableFunctionState> JSONScanGlobalState::Init(ClientContext &context, TableFunctionInitInput &input) {
|
|
104
|
-
auto &bind_data = (JSONScanData &)*input.bind_data;
|
|
105
|
-
return make_unique<JSONScanGlobalState>(context, bind_data);
|
|
106
|
-
}
|
|
107
|
-
|
|
108
|
-
idx_t JSONScanGlobalState::MaxThreads() const {
|
|
109
|
-
return system_threads;
|
|
110
|
-
}
|
|
111
|
-
|
|
112
125
|
JSONScanLocalState::JSONScanLocalState(ClientContext &context, JSONScanGlobalState &gstate)
|
|
113
|
-
: batch_index(DConstants::INVALID_INDEX),
|
|
114
|
-
|
|
126
|
+
: batch_index(DConstants::INVALID_INDEX), bind_data(gstate.bind_data),
|
|
127
|
+
json_allocator(BufferAllocator::Get(context)), current_reader(nullptr), current_buffer_handle(nullptr),
|
|
128
|
+
buffer_size(0), buffer_offset(0), prev_buffer_remainder(0) {
|
|
115
129
|
|
|
116
130
|
// Buffer to reconstruct JSON objects when they cross a buffer boundary
|
|
117
|
-
reconstruct_buffer = gstate.allocator.Allocate(gstate.bind_data.maximum_object_size);
|
|
131
|
+
reconstruct_buffer = gstate.allocator.Allocate(gstate.bind_data.maximum_object_size + YYJSON_PADDING_SIZE);
|
|
118
132
|
|
|
119
133
|
// This is needed for JSONFormat::UNSTRUCTURED, to make use of YYJSON_READ_INSITU
|
|
120
134
|
current_buffer_copy = gstate.allocator.Allocate(gstate.buffer_capacity);
|
|
121
135
|
buffer_copy_ptr = (const char *)current_buffer_copy.get();
|
|
122
136
|
}
|
|
123
137
|
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
138
|
+
JSONGlobalTableFunctionState::JSONGlobalTableFunctionState(ClientContext &context, TableFunctionInitInput &input)
|
|
139
|
+
: state(context, (JSONScanData &)*input.bind_data) {
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
unique_ptr<GlobalTableFunctionState> JSONGlobalTableFunctionState::Init(ClientContext &context,
|
|
143
|
+
TableFunctionInitInput &input) {
|
|
144
|
+
auto &bind_data = (JSONScanData &)*input.bind_data;
|
|
145
|
+
auto result = make_unique<JSONGlobalTableFunctionState>(context, input);
|
|
146
|
+
|
|
147
|
+
// Check if we need to do projection pushdown
|
|
148
|
+
if (bind_data.type == JSONScanType::READ_JSON && input.column_ids.size() != bind_data.names.size()) {
|
|
149
|
+
D_ASSERT(input.column_ids.size() < bind_data.names.size()); // Can't project to have more columns
|
|
150
|
+
vector<string> names;
|
|
151
|
+
names.reserve(input.column_ids.size());
|
|
152
|
+
for (const auto &id : input.column_ids) {
|
|
153
|
+
names.push_back(std::move(bind_data.names[id]));
|
|
154
|
+
}
|
|
155
|
+
bind_data.names = std::move(names);
|
|
156
|
+
bind_data.transform_options.error_unknown_key = false;
|
|
157
|
+
}
|
|
158
|
+
return result;
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
idx_t JSONGlobalTableFunctionState::MaxThreads() const {
|
|
162
|
+
return state.system_threads;
|
|
163
|
+
}
|
|
164
|
+
|
|
165
|
+
JSONLocalTableFunctionState::JSONLocalTableFunctionState(ClientContext &context, JSONScanGlobalState &gstate)
|
|
166
|
+
: state(context, gstate) {
|
|
167
|
+
}
|
|
168
|
+
|
|
169
|
+
unique_ptr<LocalTableFunctionState> JSONLocalTableFunctionState::Init(ExecutionContext &context,
|
|
170
|
+
TableFunctionInitInput &input,
|
|
171
|
+
GlobalTableFunctionState *global_state) {
|
|
172
|
+
auto &gstate = (JSONGlobalTableFunctionState &)*global_state;
|
|
173
|
+
return make_unique<JSONLocalTableFunctionState>(context.client, gstate.state);
|
|
174
|
+
}
|
|
175
|
+
|
|
176
|
+
idx_t JSONLocalTableFunctionState::GetBatchIndex() const {
|
|
177
|
+
return state.batch_index;
|
|
128
178
|
}
|
|
129
179
|
|
|
130
180
|
static inline void SkipWhitespace(const char *buffer_ptr, idx_t &buffer_offset, idx_t &buffer_size) {
|
|
@@ -140,12 +190,11 @@ idx_t JSONScanLocalState::ReadNext(JSONScanGlobalState &gstate) {
|
|
|
140
190
|
|
|
141
191
|
idx_t count = 0;
|
|
142
192
|
if (buffer_offset == buffer_size) {
|
|
143
|
-
|
|
144
|
-
if (!ReadNextBuffer(gstate, first_read)) {
|
|
193
|
+
if (!ReadNextBuffer(gstate)) {
|
|
145
194
|
return 0;
|
|
146
195
|
}
|
|
147
|
-
|
|
148
|
-
|
|
196
|
+
if (current_buffer_handle->buffer_index != 0 &&
|
|
197
|
+
current_reader->GetOptions().format == JSONFormat::NEWLINE_DELIMITED) {
|
|
149
198
|
ReconstructFirstObject(gstate);
|
|
150
199
|
count++;
|
|
151
200
|
}
|
|
@@ -157,7 +206,7 @@ idx_t JSONScanLocalState::ReadNext(JSONScanGlobalState &gstate) {
|
|
|
157
206
|
ReadUnstructured(count);
|
|
158
207
|
break;
|
|
159
208
|
case JSONFormat::NEWLINE_DELIMITED:
|
|
160
|
-
ReadNewlineDelimited(count
|
|
209
|
+
ReadNewlineDelimited(count);
|
|
161
210
|
break;
|
|
162
211
|
default:
|
|
163
212
|
throw InternalException("Unknown JSON format");
|
|
@@ -193,28 +242,59 @@ static inline void TrimWhitespace(JSONLine &line) {
|
|
|
193
242
|
}
|
|
194
243
|
}
|
|
195
244
|
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
result = JSONCommon::ReadDocumentUnsafe(line_start, line_size, JSONCommon::READ_FLAG,
|
|
202
|
-
json_allocator.GetYYJSONAllocator());
|
|
245
|
+
yyjson_val *JSONScanLocalState::ParseLine(char *line_start, idx_t line_size, idx_t remaining, JSONLine &line) {
|
|
246
|
+
yyjson_doc *doc;
|
|
247
|
+
if (bind_data.ignore_errors) {
|
|
248
|
+
doc = JSONCommon::ReadDocumentUnsafe(line_start, line_size, JSONCommon::READ_FLAG,
|
|
249
|
+
json_allocator.GetYYJSONAllocator());
|
|
203
250
|
} else {
|
|
204
|
-
|
|
205
|
-
|
|
251
|
+
yyjson_read_err err;
|
|
252
|
+
if (bind_data.type != JSONScanType::READ_JSON_OBJECTS) {
|
|
253
|
+
// Optimization: if we don't ignore errors, and don't need to return strings, we can parse INSITU
|
|
254
|
+
doc = JSONCommon::ReadDocumentUnsafe(line_start, remaining, JSONCommon::STOP_READ_FLAG,
|
|
255
|
+
json_allocator.GetYYJSONAllocator(), &err);
|
|
256
|
+
idx_t read_size = yyjson_doc_get_read_size(doc);
|
|
257
|
+
if (read_size > line_size) {
|
|
258
|
+
err.pos = line_size;
|
|
259
|
+
err.code = YYJSON_READ_ERROR_UNEXPECTED_END;
|
|
260
|
+
err.msg = "unexpected end of data";
|
|
261
|
+
} else if (read_size < line_size) {
|
|
262
|
+
idx_t diff = line_size - read_size;
|
|
263
|
+
char *ptr = line_start + read_size;
|
|
264
|
+
for (idx_t i = 0; i < diff; i++) {
|
|
265
|
+
if (!StringUtil::CharacterIsSpace(ptr[i])) {
|
|
266
|
+
err.pos = read_size;
|
|
267
|
+
err.code = YYJSON_READ_ERROR_UNEXPECTED_CONTENT;
|
|
268
|
+
err.msg = "unexpected content after document";
|
|
269
|
+
}
|
|
270
|
+
}
|
|
271
|
+
}
|
|
272
|
+
} else {
|
|
273
|
+
doc = JSONCommon::ReadDocumentUnsafe(line_start, line_size, JSONCommon::READ_FLAG,
|
|
274
|
+
json_allocator.GetYYJSONAllocator(), &err);
|
|
275
|
+
}
|
|
276
|
+
if (err.code != YYJSON_READ_SUCCESS) {
|
|
277
|
+
current_reader->ThrowParseError(current_buffer_handle->buffer_index, lines_or_objects_in_buffer, err);
|
|
278
|
+
}
|
|
206
279
|
}
|
|
280
|
+
lines_or_objects_in_buffer++;
|
|
207
281
|
|
|
208
|
-
if (
|
|
282
|
+
if (doc) {
|
|
209
283
|
// Set the JSONLine and trim
|
|
210
284
|
line = JSONLine(line_start, line_size);
|
|
211
285
|
TrimWhitespace(line);
|
|
286
|
+
return doc->root;
|
|
287
|
+
} else {
|
|
288
|
+
return nullptr;
|
|
212
289
|
}
|
|
213
|
-
|
|
214
|
-
return result;
|
|
215
290
|
}
|
|
216
291
|
|
|
217
|
-
bool JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate
|
|
292
|
+
bool JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate) {
|
|
293
|
+
if (current_reader) {
|
|
294
|
+
D_ASSERT(current_buffer_handle);
|
|
295
|
+
current_reader->SetBufferLineOrObjectCount(current_buffer_handle->buffer_index, lines_or_objects_in_buffer);
|
|
296
|
+
}
|
|
297
|
+
|
|
218
298
|
AllocatedData buffer;
|
|
219
299
|
if (current_buffer_handle && --current_buffer_handle->readers == 0) {
|
|
220
300
|
D_ASSERT(current_reader);
|
|
@@ -234,16 +314,8 @@ bool JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate, bool &first
|
|
|
234
314
|
idx_t buffer_index;
|
|
235
315
|
while (true) {
|
|
236
316
|
if (current_reader) {
|
|
237
|
-
|
|
238
|
-
ReadNextBufferSeek(gstate, first_read, buffer_index);
|
|
239
|
-
} else {
|
|
240
|
-
ReadNextBufferNoSeek(gstate, first_read, buffer_index);
|
|
241
|
-
}
|
|
317
|
+
ReadNextBuffer(gstate, buffer_index);
|
|
242
318
|
if (buffer_size != 0) {
|
|
243
|
-
if (current_reader->GetOptions().format == JSONFormat::NEWLINE_DELIMITED) {
|
|
244
|
-
lock_guard<mutex> guard(gstate.lock);
|
|
245
|
-
batch_index = gstate.batch_index++;
|
|
246
|
-
}
|
|
247
319
|
break; // We read something!
|
|
248
320
|
}
|
|
249
321
|
}
|
|
@@ -267,6 +339,11 @@ bool JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate, bool &first
|
|
|
267
339
|
// Try the next reader
|
|
268
340
|
current_reader = gstate.json_readers[gstate.file_index].get();
|
|
269
341
|
if (current_reader->IsOpen()) {
|
|
342
|
+
if (current_reader->GetOptions().format == JSONFormat::UNSTRUCTURED) {
|
|
343
|
+
// Can only be open from schema detection
|
|
344
|
+
batch_index = gstate.batch_index++;
|
|
345
|
+
gstate.file_index++;
|
|
346
|
+
}
|
|
270
347
|
continue; // It's open, this thread joins the scan
|
|
271
348
|
}
|
|
272
349
|
|
|
@@ -282,12 +359,7 @@ bool JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate, bool &first
|
|
|
282
359
|
}
|
|
283
360
|
|
|
284
361
|
// We have to detect whether it's UNSTRUCTURED/NEWLINE_DELIMITED - hold the gstate lock while we do this
|
|
285
|
-
|
|
286
|
-
ReadNextBufferSeek(gstate, first_read, buffer_index);
|
|
287
|
-
} else {
|
|
288
|
-
ReadNextBufferNoSeek(gstate, first_read, buffer_index);
|
|
289
|
-
}
|
|
290
|
-
|
|
362
|
+
ReadNextBuffer(gstate, buffer_index);
|
|
291
363
|
if (buffer_size == 0) {
|
|
292
364
|
gstate.file_index++; // Empty file, move to the next one
|
|
293
365
|
continue;
|
|
@@ -323,22 +395,34 @@ bool JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate, bool &first
|
|
|
323
395
|
}
|
|
324
396
|
|
|
325
397
|
// Create an entry and insert it into the map
|
|
326
|
-
auto json_buffer_handle = make_unique<JSONBufferHandle>(buffer_index, readers, move(buffer), buffer_size);
|
|
398
|
+
auto json_buffer_handle = make_unique<JSONBufferHandle>(buffer_index, readers, std::move(buffer), buffer_size);
|
|
327
399
|
current_buffer_handle = json_buffer_handle.get();
|
|
328
400
|
current_reader->InsertBuffer(buffer_index, std::move(json_buffer_handle));
|
|
401
|
+
if (!current_reader->GetFileHandle().PlainFileSource() && gstate.bind_data.type == JSONScanType::SAMPLE) {
|
|
402
|
+
// TODO: store buffer
|
|
403
|
+
}
|
|
329
404
|
|
|
330
405
|
buffer_offset = 0;
|
|
331
406
|
prev_buffer_remainder = 0;
|
|
407
|
+
lines_or_objects_in_buffer = 0;
|
|
332
408
|
|
|
409
|
+
memset((void *)(buffer_ptr + buffer_size), 0, YYJSON_PADDING_SIZE);
|
|
333
410
|
if (current_reader->GetOptions().format == JSONFormat::UNSTRUCTURED) {
|
|
334
|
-
memset((void *)(buffer_ptr + buffer_size), 0, YYJSON_PADDING_SIZE);
|
|
335
411
|
memcpy((void *)buffer_copy_ptr, buffer_ptr, buffer_size + YYJSON_PADDING_SIZE);
|
|
336
412
|
}
|
|
337
413
|
|
|
338
414
|
return true;
|
|
339
415
|
}
|
|
340
416
|
|
|
341
|
-
void JSONScanLocalState::
|
|
417
|
+
void JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate, idx_t &buffer_index) {
|
|
418
|
+
if (current_reader->GetFileHandle().CanSeek()) {
|
|
419
|
+
ReadNextBufferSeek(gstate, buffer_index);
|
|
420
|
+
} else {
|
|
421
|
+
ReadNextBufferNoSeek(gstate, buffer_index);
|
|
422
|
+
}
|
|
423
|
+
}
|
|
424
|
+
|
|
425
|
+
void JSONScanLocalState::ReadNextBufferSeek(JSONScanGlobalState &gstate, idx_t &buffer_index) {
|
|
342
426
|
auto &file_handle = current_reader->GetFileHandle();
|
|
343
427
|
|
|
344
428
|
idx_t request_size = gstate.buffer_capacity - prev_buffer_remainder - YYJSON_PADDING_SIZE;
|
|
@@ -346,44 +430,57 @@ void JSONScanLocalState::ReadNextBufferSeek(JSONScanGlobalState &gstate, bool &f
|
|
|
346
430
|
idx_t read_size;
|
|
347
431
|
|
|
348
432
|
{
|
|
349
|
-
lock_guard<mutex>
|
|
433
|
+
lock_guard<mutex> reader_guard(current_reader->lock);
|
|
350
434
|
buffer_index = current_reader->GetBufferIndex();
|
|
351
435
|
|
|
352
436
|
read_size = file_handle.GetPositionAndSize(read_position, request_size);
|
|
353
|
-
first_read = read_position == 0;
|
|
354
437
|
is_last = file_handle.Remaining() == 0;
|
|
355
438
|
|
|
356
439
|
if (!gstate.bind_data.ignore_errors && read_size == 0 && prev_buffer_remainder != 0) {
|
|
357
440
|
throw InvalidInputException("Invalid JSON detected at the end of file %s", current_reader->file_path);
|
|
358
441
|
}
|
|
442
|
+
|
|
443
|
+
if (current_reader->GetOptions().format == JSONFormat::NEWLINE_DELIMITED) {
|
|
444
|
+
batch_index = gstate.batch_index++;
|
|
445
|
+
}
|
|
359
446
|
}
|
|
360
447
|
buffer_size = prev_buffer_remainder + read_size;
|
|
361
448
|
if (buffer_size == 0) {
|
|
449
|
+
current_reader->SetBufferLineOrObjectCount(buffer_index, 0);
|
|
362
450
|
return;
|
|
363
451
|
}
|
|
364
452
|
|
|
365
453
|
// Now read the file lock-free!
|
|
366
|
-
file_handle.ReadAtPosition(buffer_ptr + prev_buffer_remainder, read_size, read_position
|
|
454
|
+
file_handle.ReadAtPosition(buffer_ptr + prev_buffer_remainder, read_size, read_position,
|
|
455
|
+
gstate.bind_data.type == JSONScanType::SAMPLE);
|
|
367
456
|
}
|
|
368
457
|
|
|
369
|
-
void JSONScanLocalState::ReadNextBufferNoSeek(JSONScanGlobalState &gstate,
|
|
458
|
+
void JSONScanLocalState::ReadNextBufferNoSeek(JSONScanGlobalState &gstate, idx_t &buffer_index) {
|
|
370
459
|
auto &file_handle = current_reader->GetFileHandle();
|
|
371
460
|
|
|
372
461
|
idx_t request_size = gstate.buffer_capacity - prev_buffer_remainder - YYJSON_PADDING_SIZE;
|
|
373
462
|
idx_t read_size;
|
|
374
463
|
{
|
|
375
|
-
lock_guard<mutex>
|
|
464
|
+
lock_guard<mutex> reader_guard(current_reader->lock);
|
|
376
465
|
buffer_index = current_reader->GetBufferIndex();
|
|
377
466
|
|
|
378
|
-
|
|
379
|
-
|
|
467
|
+
read_size = file_handle.Read(buffer_ptr + prev_buffer_remainder, request_size,
|
|
468
|
+
gstate.bind_data.type == JSONScanType::SAMPLE);
|
|
380
469
|
is_last = read_size < request_size;
|
|
381
470
|
|
|
382
471
|
if (!gstate.bind_data.ignore_errors && read_size == 0 && prev_buffer_remainder != 0) {
|
|
383
472
|
throw InvalidInputException("Invalid JSON detected at the end of file %s", current_reader->file_path);
|
|
384
473
|
}
|
|
474
|
+
|
|
475
|
+
if (current_reader->GetOptions().format == JSONFormat::NEWLINE_DELIMITED) {
|
|
476
|
+
batch_index = gstate.batch_index++;
|
|
477
|
+
}
|
|
385
478
|
}
|
|
386
479
|
buffer_size = prev_buffer_remainder + read_size;
|
|
480
|
+
if (buffer_size == 0) {
|
|
481
|
+
current_reader->SetBufferLineOrObjectCount(buffer_index, 0);
|
|
482
|
+
return;
|
|
483
|
+
}
|
|
387
484
|
}
|
|
388
485
|
|
|
389
486
|
void JSONScanLocalState::ReconstructFirstObject(JSONScanGlobalState &gstate) {
|
|
@@ -408,14 +505,21 @@ void JSONScanLocalState::ReconstructFirstObject(JSONScanGlobalState &gstate) {
|
|
|
408
505
|
auto line_end = NextNewline(buffer_ptr, buffer_size);
|
|
409
506
|
if (line_end == nullptr) { // TODO I don't think we can ignore this even with ignore_errors ...
|
|
410
507
|
throw InvalidInputException("maximum_object_size of %llu bytes exceeded (>%llu bytes), is the JSON valid?",
|
|
411
|
-
|
|
508
|
+
bind_data.maximum_object_size, buffer_size - buffer_offset);
|
|
412
509
|
} else {
|
|
413
510
|
line_end++;
|
|
414
511
|
}
|
|
415
512
|
idx_t part2_size = line_end - buffer_ptr;
|
|
416
513
|
|
|
514
|
+
idx_t line_size = part1_size + part2_size;
|
|
515
|
+
if (line_size > bind_data.maximum_object_size) {
|
|
516
|
+
throw InvalidInputException("maximum_object_size of %llu bytes exceeded (%llu bytes), is the JSON valid?",
|
|
517
|
+
bind_data.maximum_object_size, line_size);
|
|
518
|
+
}
|
|
519
|
+
|
|
417
520
|
// And copy the remainder of the line to the reconstruct buffer
|
|
418
521
|
memcpy(reconstruct_ptr + part1_size, buffer_ptr, part2_size);
|
|
522
|
+
memset((void *)(reconstruct_ptr + line_size), 0, YYJSON_PADDING_SIZE);
|
|
419
523
|
buffer_offset += part2_size;
|
|
420
524
|
|
|
421
525
|
// We copied the object, so we are no longer reading the previous buffer
|
|
@@ -423,7 +527,7 @@ void JSONScanLocalState::ReconstructFirstObject(JSONScanGlobalState &gstate) {
|
|
|
423
527
|
current_reader->RemoveBuffer(current_buffer_handle->buffer_index - 1);
|
|
424
528
|
}
|
|
425
529
|
|
|
426
|
-
objects[0] = ParseLine((char *)reconstruct_ptr,
|
|
530
|
+
objects[0] = ParseLine((char *)reconstruct_ptr, line_size, line_size, lines[0]);
|
|
427
531
|
}
|
|
428
532
|
|
|
429
533
|
void JSONScanLocalState::ReadUnstructured(idx_t &count) {
|
|
@@ -448,9 +552,11 @@ void JSONScanLocalState::ReadUnstructured(idx_t &count) {
|
|
|
448
552
|
|
|
449
553
|
buffer_offset += line_size;
|
|
450
554
|
SkipWhitespace(buffer_ptr, buffer_offset, buffer_size);
|
|
555
|
+
lines_or_objects_in_buffer++;
|
|
451
556
|
} else if (error.pos > max_obj_size) {
|
|
452
|
-
|
|
453
|
-
|
|
557
|
+
current_reader->ThrowParseError(current_buffer_handle->buffer_index, lines_or_objects_in_buffer, error,
|
|
558
|
+
"Try increasing \"maximum_object_size\".");
|
|
559
|
+
|
|
454
560
|
} else if (error.code == YYJSON_READ_ERROR_UNEXPECTED_END && !is_last) {
|
|
455
561
|
// Copy remaining to reconstruct_buffer
|
|
456
562
|
const auto reconstruct_ptr = reconstruct_buffer.get();
|
|
@@ -459,13 +565,13 @@ void JSONScanLocalState::ReadUnstructured(idx_t &count) {
|
|
|
459
565
|
buffer_offset = buffer_size;
|
|
460
566
|
break;
|
|
461
567
|
} else {
|
|
462
|
-
|
|
568
|
+
current_reader->ThrowParseError(current_buffer_handle->buffer_index, lines_or_objects_in_buffer, error);
|
|
463
569
|
}
|
|
464
|
-
objects[count] = read_doc;
|
|
570
|
+
objects[count] = read_doc->root;
|
|
465
571
|
}
|
|
466
572
|
}
|
|
467
573
|
|
|
468
|
-
void JSONScanLocalState::ReadNewlineDelimited(idx_t &count
|
|
574
|
+
void JSONScanLocalState::ReadNewlineDelimited(idx_t &count) {
|
|
469
575
|
for (; count < STANDARD_VECTOR_SIZE; count++) {
|
|
470
576
|
auto line_start = buffer_ptr + buffer_offset;
|
|
471
577
|
idx_t remaining = buffer_size - buffer_offset;
|
|
@@ -487,15 +593,15 @@ void JSONScanLocalState::ReadNewlineDelimited(idx_t &count, const bool &ignore_e
|
|
|
487
593
|
}
|
|
488
594
|
idx_t line_size = line_end - line_start;
|
|
489
595
|
|
|
490
|
-
objects[count] = ParseLine((char *)line_start, line_size, lines[count]
|
|
596
|
+
objects[count] = ParseLine((char *)line_start, line_size, remaining, lines[count]);
|
|
491
597
|
|
|
492
598
|
buffer_offset += line_size;
|
|
493
599
|
SkipWhitespace(buffer_ptr, buffer_offset, buffer_size);
|
|
494
600
|
}
|
|
495
601
|
}
|
|
496
602
|
|
|
497
|
-
|
|
498
|
-
return
|
|
603
|
+
yyjson_alc *JSONScanLocalState::GetAllocator() {
|
|
604
|
+
return json_allocator.GetYYJSONAllocator();
|
|
499
605
|
}
|
|
500
606
|
|
|
501
607
|
} // namespace duckdb
|