duckdb 0.6.2-dev1978.0 → 0.6.2-dev2015.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. package/package.json +1 -1
  2. package/src/duckdb/extension/json/buffered_json_reader.cpp +132 -18
  3. package/src/duckdb/extension/json/include/buffered_json_reader.hpp +29 -9
  4. package/src/duckdb/extension/json/include/json_common.hpp +56 -0
  5. package/src/duckdb/extension/json/include/json_functions.hpp +9 -0
  6. package/src/duckdb/extension/json/include/json_scan.hpp +115 -25
  7. package/src/duckdb/extension/json/include/json_structure.hpp +73 -0
  8. package/src/duckdb/extension/json/include/json_transform.hpp +57 -0
  9. package/src/duckdb/extension/json/json-extension.cpp +3 -0
  10. package/src/duckdb/extension/json/json_functions/json_contains.cpp +1 -1
  11. package/src/duckdb/extension/json/json_functions/json_create.cpp +6 -10
  12. package/src/duckdb/extension/json/json_functions/json_extract.cpp +1 -1
  13. package/src/duckdb/extension/json/json_functions/json_keys.cpp +60 -0
  14. package/src/duckdb/extension/json/json_functions/json_structure.cpp +404 -150
  15. package/src/duckdb/extension/json/json_functions/json_transform.cpp +216 -60
  16. package/src/duckdb/extension/json/json_functions/read_json.cpp +224 -0
  17. package/src/duckdb/extension/json/json_functions/read_json_objects.cpp +6 -6
  18. package/src/duckdb/extension/json/json_functions.cpp +25 -0
  19. package/src/duckdb/extension/json/json_scan.cpp +192 -86
  20. package/src/duckdb/extension/json/yyjson/include/yyjson.hpp +18 -9
  21. package/src/duckdb/extension/json/yyjson/yyjson.cpp +58 -13
  22. package/src/duckdb/src/function/table/copy_csv.cpp +16 -11
  23. package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
  24. package/src/duckdb/src/include/duckdb/function/scalar/strftime.hpp +2 -2
  25. package/src/duckdb/src/include/duckdb/main/extension_functions.hpp +5 -0
  26. package/src/duckdb/ub_extension_json_json_functions.cpp +4 -0
@@ -14,8 +14,8 @@ unique_ptr<FunctionData> ReadJSONObjectsBind(ClientContext &context, TableFuncti
14
14
  static void ReadJSONObjectsFunction(ClientContext &context, TableFunctionInput &data_p, DataChunk &output) {
15
15
  D_ASSERT(output.ColumnCount() == 1);
16
16
  D_ASSERT(JSONCommon::LogicalTypeIsJSON(output.data[0].GetType()));
17
- auto &gstate = (JSONScanGlobalState &)*data_p.global_state;
18
- auto &lstate = (JSONScanLocalState &)*data_p.local_state;
17
+ auto &gstate = ((JSONGlobalTableFunctionState &)*data_p.global_state).state;
18
+ auto &lstate = ((JSONLocalTableFunctionState &)*data_p.local_state).state;
19
19
 
20
20
  // Fetch next lines
21
21
  const auto count = lstate.ReadNext(gstate);
@@ -38,8 +38,8 @@ static void ReadJSONObjectsFunction(ClientContext &context, TableFunctionInput &
38
38
 
39
39
  TableFunction GetReadJSONObjectsTableFunction(bool list_parameter, shared_ptr<JSONScanInfo> function_info) {
40
40
  auto parameter = list_parameter ? LogicalType::LIST(LogicalType::VARCHAR) : LogicalType::VARCHAR;
41
- TableFunction table_function({parameter}, ReadJSONObjectsFunction, ReadJSONObjectsBind, JSONScanGlobalState::Init,
42
- JSONScanLocalState::Init);
41
+ TableFunction table_function({parameter}, ReadJSONObjectsFunction, ReadJSONObjectsBind,
42
+ JSONGlobalTableFunctionState::Init, JSONLocalTableFunctionState::Init);
43
43
  JSONScan::TableFunctionDefaults(table_function);
44
44
  table_function.function_info = std::move(function_info);
45
45
 
@@ -48,7 +48,7 @@ TableFunction GetReadJSONObjectsTableFunction(bool list_parameter, shared_ptr<JS
48
48
 
49
49
  CreateTableFunctionInfo JSONFunctions::GetReadJSONObjectsFunction() {
50
50
  TableFunctionSet function_set("read_json_objects");
51
- auto function_info = make_shared<JSONScanInfo>(JSONFormat::UNSTRUCTURED, true);
51
+ auto function_info = make_shared<JSONScanInfo>(JSONScanType::READ_JSON_OBJECTS, JSONFormat::UNSTRUCTURED);
52
52
  function_set.AddFunction(GetReadJSONObjectsTableFunction(false, function_info));
53
53
  function_set.AddFunction(GetReadJSONObjectsTableFunction(true, function_info));
54
54
  return CreateTableFunctionInfo(function_set);
@@ -56,7 +56,7 @@ CreateTableFunctionInfo JSONFunctions::GetReadJSONObjectsFunction() {
56
56
 
57
57
  CreateTableFunctionInfo JSONFunctions::GetReadNDJSONObjectsFunction() {
58
58
  TableFunctionSet function_set("read_ndjson_objects");
59
- auto function_info = make_shared<JSONScanInfo>(JSONFormat::NEWLINE_DELIMITED, true);
59
+ auto function_info = make_shared<JSONScanInfo>(JSONScanType::READ_JSON_OBJECTS, JSONFormat::NEWLINE_DELIMITED);
60
60
  function_set.AddFunction(GetReadJSONObjectsTableFunction(false, function_info));
61
61
  function_set.AddFunction(GetReadJSONObjectsTableFunction(true, function_info));
62
62
  return CreateTableFunctionInfo(function_set);
@@ -3,6 +3,10 @@
3
3
  #include "duckdb/execution/expression_executor.hpp"
4
4
  #include "duckdb/function/cast/cast_function_set.hpp"
5
5
  #include "duckdb/function/cast/default_casts.hpp"
6
+ #include "duckdb/function/replacement_scan.hpp"
7
+ #include "duckdb/parser/expression/constant_expression.hpp"
8
+ #include "duckdb/parser/expression/function_expression.hpp"
9
+ #include "duckdb/parser/tableref/table_function_ref.hpp"
6
10
 
7
11
  namespace duckdb {
8
12
 
@@ -136,6 +140,7 @@ vector<CreateScalarFunctionInfo> JSONFunctions::GetScalarFunctions() {
136
140
  // Other
137
141
  functions.push_back(GetArrayLengthFunction());
138
142
  functions.push_back(GetContainsFunction());
143
+ functions.push_back(GetKeysFunction());
139
144
  functions.push_back(GetTypeFunction());
140
145
  functions.push_back(GetValidFunction());
141
146
 
@@ -149,9 +154,29 @@ vector<CreateTableFunctionInfo> JSONFunctions::GetTableFunctions() {
149
154
  functions.push_back(GetReadJSONObjectsFunction());
150
155
  functions.push_back(GetReadNDJSONObjectsFunction());
151
156
 
157
+ // Read JSON as columnar data
158
+ functions.push_back(GetReadJSONFunction());
159
+ functions.push_back(GetReadNDJSONFunction());
160
+ functions.push_back(GetReadJSONAutoFunction());
161
+ functions.push_back(GetReadNDJSONAutoFunction());
162
+
152
163
  return functions;
153
164
  }
154
165
 
166
+ unique_ptr<TableRef> JSONFunctions::ReadJSONReplacement(ClientContext &context, const string &table_name,
167
+ ReplacementScanData *data) {
168
+ auto lower_name = StringUtil::Lower(table_name);
169
+ if (!StringUtil::EndsWith(lower_name, ".json") && !StringUtil::Contains(lower_name, ".json?") &&
170
+ !StringUtil::EndsWith(lower_name, ".ndjson") && !StringUtil::Contains(lower_name, ".ndjson?")) {
171
+ return nullptr;
172
+ }
173
+ auto table_function = make_unique<TableFunctionRef>();
174
+ vector<unique_ptr<ParsedExpression>> children;
175
+ children.push_back(make_unique<ConstantExpression>(Value(table_name)));
176
+ table_function->function = make_unique<FunctionExpression>("read_json_auto", std::move(children));
177
+ return std::move(table_function);
178
+ }
179
+
155
180
  static unique_ptr<FunctionLocalState> InitJSONCastLocalState(ClientContext &context) {
156
181
  return make_unique<JSONFunctionLocalState>(context);
157
182
  }
@@ -19,18 +19,16 @@ unique_ptr<FunctionData> JSONScanData::Bind(ClientContext &context, TableFunctio
19
19
  auto &options = result->options;
20
20
 
21
21
  auto &info = (JSONScanInfo &)*input.info;
22
- if (info.forced_format == JSONFormat::AUTO_DETECT) {
23
- throw NotImplementedException("Auto-detection of JSON format");
24
- }
25
- options.format = info.forced_format;
26
- result->return_json_strings = info.return_json_strings;
22
+ options.format = info.format;
23
+ result->type = info.type;
24
+ result->auto_detect = info.auto_detect;
27
25
 
28
26
  vector<string> patterns;
29
- if (input.inputs[0].type().id() == LogicalTypeId::LIST) { // list of globs
27
+ if (input.inputs[0].type().id() == LogicalTypeId::LIST) { // List of globs
30
28
  for (auto &val : ListValue::GetChildren(input.inputs[0])) {
31
29
  patterns.push_back(StringValue::Get(val));
32
30
  }
33
- } else { // single glob pattern
31
+ } else { // Single glob pattern
34
32
  patterns.push_back(StringValue::Get(input.inputs[0]));
35
33
  }
36
34
  InitializeFilePaths(context, patterns, result->file_paths);
@@ -50,15 +48,24 @@ unique_ptr<FunctionData> JSONScanData::Bind(ClientContext &context, TableFunctio
50
48
  } else if (format == "newline_delimited") {
51
49
  options.format = JSONFormat::NEWLINE_DELIMITED;
52
50
  } else {
53
- throw InvalidInputException("format must be one of ['auto', 'unstructured', 'newline_delimited']");
51
+ throw BinderException("format must be one of ['auto', 'unstructured', 'newline_delimited']");
52
+ }
53
+ } else if (loption == "compression") {
54
+ auto compression = StringUtil::Lower(StringValue::Get(kv.second));
55
+ if (compression == "none") {
56
+ options.compression = FileCompressionType::UNCOMPRESSED;
57
+ } else if (compression == "gzip") {
58
+ options.compression = FileCompressionType::GZIP;
59
+ } else if (compression == "zstd") {
60
+ options.compression = FileCompressionType::ZSTD;
61
+ } else if (compression == "auto") {
62
+ options.compression = FileCompressionType::AUTO_DETECT;
63
+ } else {
64
+ throw BinderException("compression must be one of ['none', 'gzip', 'zstd', 'auto']");
54
65
  }
55
66
  }
56
67
  }
57
68
 
58
- if (result->ignore_errors && options.format == JSONFormat::UNSTRUCTURED) {
59
- throw InvalidInputException("Cannot ignore errors with unstructured format");
60
- }
61
-
62
69
  return std::move(result);
63
70
  }
64
71
 
@@ -75,56 +82,99 @@ void JSONScanData::InitializeFilePaths(ClientContext &context, const vector<stri
75
82
  }
76
83
 
77
84
  void JSONScanData::Serialize(FieldWriter &writer) {
85
+ writer.WriteField<JSONScanType>(type);
78
86
  options.Serialize(writer);
79
87
  writer.WriteList<string>(file_paths);
80
88
  writer.WriteField<bool>(ignore_errors);
81
89
  writer.WriteField<idx_t>(maximum_object_size);
82
- writer.WriteField<bool>(return_json_strings);
90
+ transform_options.Serialize(writer);
91
+ writer.WriteField<bool>(auto_detect);
92
+ writer.WriteField<idx_t>(sample_size);
93
+ writer.WriteList<string>(names);
94
+ writer.WriteField<idx_t>(max_depth);
83
95
  }
84
96
 
85
97
  void JSONScanData::Deserialize(FieldReader &reader) {
98
+ type = reader.ReadRequired<JSONScanType>();
86
99
  options.Deserialize(reader);
87
100
  file_paths = reader.ReadRequiredList<string>();
88
101
  ignore_errors = reader.ReadRequired<bool>();
89
102
  maximum_object_size = reader.ReadRequired<idx_t>();
90
- return_json_strings = reader.ReadRequired<bool>();
103
+ transform_options.Deserialize(reader);
104
+ auto_detect = reader.ReadRequired<bool>();
105
+ sample_size = reader.ReadRequired<idx_t>();
106
+ names = reader.ReadRequiredList<string>();
107
+ max_depth = reader.ReadRequired<idx_t>();
91
108
  }
92
109
 
93
110
  JSONScanGlobalState::JSONScanGlobalState(ClientContext &context, JSONScanData &bind_data_p)
94
111
  : bind_data(bind_data_p), allocator(BufferManager::GetBufferManager(context).GetBufferAllocator()),
95
112
  buffer_capacity(bind_data.maximum_object_size * 2), file_index(0), batch_index(0),
96
113
  system_threads(TaskScheduler::GetScheduler(context).NumberOfThreads()) {
97
- json_readers.reserve(bind_data.file_paths.size());
98
- for (idx_t i = 0; i < bind_data_p.file_paths.size(); i++) {
99
- json_readers.push_back(make_unique<BufferedJSONReader>(context, bind_data.options, i, bind_data.file_paths[i]));
114
+ if (bind_data.stored_readers.empty()) {
115
+ json_readers.reserve(bind_data.file_paths.size());
116
+ for (idx_t i = 0; i < bind_data.file_paths.size(); i++) {
117
+ json_readers.push_back(
118
+ make_unique<BufferedJSONReader>(context, bind_data.options, bind_data.file_paths[i]));
119
+ }
120
+ } else {
121
+ json_readers = std::move(bind_data.stored_readers);
100
122
  }
101
123
  }
102
124
 
103
- unique_ptr<GlobalTableFunctionState> JSONScanGlobalState::Init(ClientContext &context, TableFunctionInitInput &input) {
104
- auto &bind_data = (JSONScanData &)*input.bind_data;
105
- return make_unique<JSONScanGlobalState>(context, bind_data);
106
- }
107
-
108
- idx_t JSONScanGlobalState::MaxThreads() const {
109
- return system_threads;
110
- }
111
-
112
125
  JSONScanLocalState::JSONScanLocalState(ClientContext &context, JSONScanGlobalState &gstate)
113
- : batch_index(DConstants::INVALID_INDEX), json_allocator(BufferAllocator::Get(context)), current_reader(nullptr),
114
- current_buffer_handle(nullptr), buffer_size(0), buffer_offset(0), prev_buffer_remainder(0) {
126
+ : batch_index(DConstants::INVALID_INDEX), bind_data(gstate.bind_data),
127
+ json_allocator(BufferAllocator::Get(context)), current_reader(nullptr), current_buffer_handle(nullptr),
128
+ buffer_size(0), buffer_offset(0), prev_buffer_remainder(0) {
115
129
 
116
130
  // Buffer to reconstruct JSON objects when they cross a buffer boundary
117
- reconstruct_buffer = gstate.allocator.Allocate(gstate.bind_data.maximum_object_size);
131
+ reconstruct_buffer = gstate.allocator.Allocate(gstate.bind_data.maximum_object_size + YYJSON_PADDING_SIZE);
118
132
 
119
133
  // This is needed for JSONFormat::UNSTRUCTURED, to make use of YYJSON_READ_INSITU
120
134
  current_buffer_copy = gstate.allocator.Allocate(gstate.buffer_capacity);
121
135
  buffer_copy_ptr = (const char *)current_buffer_copy.get();
122
136
  }
123
137
 
124
- unique_ptr<LocalTableFunctionState> JSONScanLocalState::Init(ExecutionContext &context, TableFunctionInitInput &input,
125
- GlobalTableFunctionState *global_state) {
126
- auto &gstate = (JSONScanGlobalState &)*global_state;
127
- return make_unique<JSONScanLocalState>(context.client, gstate);
138
+ JSONGlobalTableFunctionState::JSONGlobalTableFunctionState(ClientContext &context, TableFunctionInitInput &input)
139
+ : state(context, (JSONScanData &)*input.bind_data) {
140
+ }
141
+
142
+ unique_ptr<GlobalTableFunctionState> JSONGlobalTableFunctionState::Init(ClientContext &context,
143
+ TableFunctionInitInput &input) {
144
+ auto &bind_data = (JSONScanData &)*input.bind_data;
145
+ auto result = make_unique<JSONGlobalTableFunctionState>(context, input);
146
+
147
+ // Check if we need to do projection pushdown
148
+ if (bind_data.type == JSONScanType::READ_JSON && input.column_ids.size() != bind_data.names.size()) {
149
+ D_ASSERT(input.column_ids.size() < bind_data.names.size()); // Can't project to have more columns
150
+ vector<string> names;
151
+ names.reserve(input.column_ids.size());
152
+ for (const auto &id : input.column_ids) {
153
+ names.push_back(std::move(bind_data.names[id]));
154
+ }
155
+ bind_data.names = std::move(names);
156
+ bind_data.transform_options.error_unknown_key = false;
157
+ }
158
+ return result;
159
+ }
160
+
161
+ idx_t JSONGlobalTableFunctionState::MaxThreads() const {
162
+ return state.system_threads;
163
+ }
164
+
165
+ JSONLocalTableFunctionState::JSONLocalTableFunctionState(ClientContext &context, JSONScanGlobalState &gstate)
166
+ : state(context, gstate) {
167
+ }
168
+
169
+ unique_ptr<LocalTableFunctionState> JSONLocalTableFunctionState::Init(ExecutionContext &context,
170
+ TableFunctionInitInput &input,
171
+ GlobalTableFunctionState *global_state) {
172
+ auto &gstate = (JSONGlobalTableFunctionState &)*global_state;
173
+ return make_unique<JSONLocalTableFunctionState>(context.client, gstate.state);
174
+ }
175
+
176
+ idx_t JSONLocalTableFunctionState::GetBatchIndex() const {
177
+ return state.batch_index;
128
178
  }
129
179
 
130
180
  static inline void SkipWhitespace(const char *buffer_ptr, idx_t &buffer_offset, idx_t &buffer_size) {
@@ -140,12 +190,11 @@ idx_t JSONScanLocalState::ReadNext(JSONScanGlobalState &gstate) {
140
190
 
141
191
  idx_t count = 0;
142
192
  if (buffer_offset == buffer_size) {
143
- bool first_read;
144
- if (!ReadNextBuffer(gstate, first_read)) {
193
+ if (!ReadNextBuffer(gstate)) {
145
194
  return 0;
146
195
  }
147
-
148
- if (!first_read && current_reader->GetOptions().format == JSONFormat::NEWLINE_DELIMITED) {
196
+ if (current_buffer_handle->buffer_index != 0 &&
197
+ current_reader->GetOptions().format == JSONFormat::NEWLINE_DELIMITED) {
149
198
  ReconstructFirstObject(gstate);
150
199
  count++;
151
200
  }
@@ -157,7 +206,7 @@ idx_t JSONScanLocalState::ReadNext(JSONScanGlobalState &gstate) {
157
206
  ReadUnstructured(count);
158
207
  break;
159
208
  case JSONFormat::NEWLINE_DELIMITED:
160
- ReadNewlineDelimited(count, gstate.bind_data.ignore_errors);
209
+ ReadNewlineDelimited(count);
161
210
  break;
162
211
  default:
163
212
  throw InternalException("Unknown JSON format");
@@ -193,28 +242,59 @@ static inline void TrimWhitespace(JSONLine &line) {
193
242
  }
194
243
  }
195
244
 
196
- yyjson_doc *JSONScanLocalState::ParseLine(char *line_start, idx_t line_size, JSONLine &line,
197
- const bool &ignore_errors) {
198
- // Parse to validate TODO: This is the only place we can maybe parse INSITU (if not returning strings)
199
- yyjson_doc *result;
200
- if (ignore_errors) {
201
- result = JSONCommon::ReadDocumentUnsafe(line_start, line_size, JSONCommon::READ_FLAG,
202
- json_allocator.GetYYJSONAllocator());
245
+ yyjson_val *JSONScanLocalState::ParseLine(char *line_start, idx_t line_size, idx_t remaining, JSONLine &line) {
246
+ yyjson_doc *doc;
247
+ if (bind_data.ignore_errors) {
248
+ doc = JSONCommon::ReadDocumentUnsafe(line_start, line_size, JSONCommon::READ_FLAG,
249
+ json_allocator.GetYYJSONAllocator());
203
250
  } else {
204
- result =
205
- JSONCommon::ReadDocument(line_start, line_size, JSONCommon::READ_FLAG, json_allocator.GetYYJSONAllocator());
251
+ yyjson_read_err err;
252
+ if (bind_data.type != JSONScanType::READ_JSON_OBJECTS) {
253
+ // Optimization: if we don't ignore errors, and don't need to return strings, we can parse INSITU
254
+ doc = JSONCommon::ReadDocumentUnsafe(line_start, remaining, JSONCommon::STOP_READ_FLAG,
255
+ json_allocator.GetYYJSONAllocator(), &err);
256
+ idx_t read_size = yyjson_doc_get_read_size(doc);
257
+ if (read_size > line_size) {
258
+ err.pos = line_size;
259
+ err.code = YYJSON_READ_ERROR_UNEXPECTED_END;
260
+ err.msg = "unexpected end of data";
261
+ } else if (read_size < line_size) {
262
+ idx_t diff = line_size - read_size;
263
+ char *ptr = line_start + read_size;
264
+ for (idx_t i = 0; i < diff; i++) {
265
+ if (!StringUtil::CharacterIsSpace(ptr[i])) {
266
+ err.pos = read_size;
267
+ err.code = YYJSON_READ_ERROR_UNEXPECTED_CONTENT;
268
+ err.msg = "unexpected content after document";
269
+ }
270
+ }
271
+ }
272
+ } else {
273
+ doc = JSONCommon::ReadDocumentUnsafe(line_start, line_size, JSONCommon::READ_FLAG,
274
+ json_allocator.GetYYJSONAllocator(), &err);
275
+ }
276
+ if (err.code != YYJSON_READ_SUCCESS) {
277
+ current_reader->ThrowParseError(current_buffer_handle->buffer_index, lines_or_objects_in_buffer, err);
278
+ }
206
279
  }
280
+ lines_or_objects_in_buffer++;
207
281
 
208
- if (result) {
282
+ if (doc) {
209
283
  // Set the JSONLine and trim
210
284
  line = JSONLine(line_start, line_size);
211
285
  TrimWhitespace(line);
286
+ return doc->root;
287
+ } else {
288
+ return nullptr;
212
289
  }
213
-
214
- return result;
215
290
  }
216
291
 
217
- bool JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate, bool &first_read) {
292
+ bool JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate) {
293
+ if (current_reader) {
294
+ D_ASSERT(current_buffer_handle);
295
+ current_reader->SetBufferLineOrObjectCount(current_buffer_handle->buffer_index, lines_or_objects_in_buffer);
296
+ }
297
+
218
298
  AllocatedData buffer;
219
299
  if (current_buffer_handle && --current_buffer_handle->readers == 0) {
220
300
  D_ASSERT(current_reader);
@@ -234,16 +314,8 @@ bool JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate, bool &first
234
314
  idx_t buffer_index;
235
315
  while (true) {
236
316
  if (current_reader) {
237
- if (current_reader->GetFileHandle().CanSeek()) {
238
- ReadNextBufferSeek(gstate, first_read, buffer_index);
239
- } else {
240
- ReadNextBufferNoSeek(gstate, first_read, buffer_index);
241
- }
317
+ ReadNextBuffer(gstate, buffer_index);
242
318
  if (buffer_size != 0) {
243
- if (current_reader->GetOptions().format == JSONFormat::NEWLINE_DELIMITED) {
244
- lock_guard<mutex> guard(gstate.lock);
245
- batch_index = gstate.batch_index++;
246
- }
247
319
  break; // We read something!
248
320
  }
249
321
  }
@@ -267,6 +339,11 @@ bool JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate, bool &first
267
339
  // Try the next reader
268
340
  current_reader = gstate.json_readers[gstate.file_index].get();
269
341
  if (current_reader->IsOpen()) {
342
+ if (current_reader->GetOptions().format == JSONFormat::UNSTRUCTURED) {
343
+ // Can only be open from schema detection
344
+ batch_index = gstate.batch_index++;
345
+ gstate.file_index++;
346
+ }
270
347
  continue; // It's open, this thread joins the scan
271
348
  }
272
349
 
@@ -282,12 +359,7 @@ bool JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate, bool &first
282
359
  }
283
360
 
284
361
  // We have to detect whether it's UNSTRUCTURED/NEWLINE_DELIMITED - hold the gstate lock while we do this
285
- if (current_reader->GetFileHandle().CanSeek()) {
286
- ReadNextBufferSeek(gstate, first_read, buffer_index);
287
- } else {
288
- ReadNextBufferNoSeek(gstate, first_read, buffer_index);
289
- }
290
-
362
+ ReadNextBuffer(gstate, buffer_index);
291
363
  if (buffer_size == 0) {
292
364
  gstate.file_index++; // Empty file, move to the next one
293
365
  continue;
@@ -323,22 +395,34 @@ bool JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate, bool &first
323
395
  }
324
396
 
325
397
  // Create an entry and insert it into the map
326
- auto json_buffer_handle = make_unique<JSONBufferHandle>(buffer_index, readers, move(buffer), buffer_size);
398
+ auto json_buffer_handle = make_unique<JSONBufferHandle>(buffer_index, readers, std::move(buffer), buffer_size);
327
399
  current_buffer_handle = json_buffer_handle.get();
328
400
  current_reader->InsertBuffer(buffer_index, std::move(json_buffer_handle));
401
+ if (!current_reader->GetFileHandle().PlainFileSource() && gstate.bind_data.type == JSONScanType::SAMPLE) {
402
+ // TODO: store buffer
403
+ }
329
404
 
330
405
  buffer_offset = 0;
331
406
  prev_buffer_remainder = 0;
407
+ lines_or_objects_in_buffer = 0;
332
408
 
409
+ memset((void *)(buffer_ptr + buffer_size), 0, YYJSON_PADDING_SIZE);
333
410
  if (current_reader->GetOptions().format == JSONFormat::UNSTRUCTURED) {
334
- memset((void *)(buffer_ptr + buffer_size), 0, YYJSON_PADDING_SIZE);
335
411
  memcpy((void *)buffer_copy_ptr, buffer_ptr, buffer_size + YYJSON_PADDING_SIZE);
336
412
  }
337
413
 
338
414
  return true;
339
415
  }
340
416
 
341
- void JSONScanLocalState::ReadNextBufferSeek(JSONScanGlobalState &gstate, bool &first_read, idx_t &buffer_index) {
417
+ void JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate, idx_t &buffer_index) {
418
+ if (current_reader->GetFileHandle().CanSeek()) {
419
+ ReadNextBufferSeek(gstate, buffer_index);
420
+ } else {
421
+ ReadNextBufferNoSeek(gstate, buffer_index);
422
+ }
423
+ }
424
+
425
+ void JSONScanLocalState::ReadNextBufferSeek(JSONScanGlobalState &gstate, idx_t &buffer_index) {
342
426
  auto &file_handle = current_reader->GetFileHandle();
343
427
 
344
428
  idx_t request_size = gstate.buffer_capacity - prev_buffer_remainder - YYJSON_PADDING_SIZE;
@@ -346,44 +430,57 @@ void JSONScanLocalState::ReadNextBufferSeek(JSONScanGlobalState &gstate, bool &f
346
430
  idx_t read_size;
347
431
 
348
432
  {
349
- lock_guard<mutex> guard(current_reader->lock);
433
+ lock_guard<mutex> reader_guard(current_reader->lock);
350
434
  buffer_index = current_reader->GetBufferIndex();
351
435
 
352
436
  read_size = file_handle.GetPositionAndSize(read_position, request_size);
353
- first_read = read_position == 0;
354
437
  is_last = file_handle.Remaining() == 0;
355
438
 
356
439
  if (!gstate.bind_data.ignore_errors && read_size == 0 && prev_buffer_remainder != 0) {
357
440
  throw InvalidInputException("Invalid JSON detected at the end of file %s", current_reader->file_path);
358
441
  }
442
+
443
+ if (current_reader->GetOptions().format == JSONFormat::NEWLINE_DELIMITED) {
444
+ batch_index = gstate.batch_index++;
445
+ }
359
446
  }
360
447
  buffer_size = prev_buffer_remainder + read_size;
361
448
  if (buffer_size == 0) {
449
+ current_reader->SetBufferLineOrObjectCount(buffer_index, 0);
362
450
  return;
363
451
  }
364
452
 
365
453
  // Now read the file lock-free!
366
- file_handle.ReadAtPosition(buffer_ptr + prev_buffer_remainder, read_size, read_position);
454
+ file_handle.ReadAtPosition(buffer_ptr + prev_buffer_remainder, read_size, read_position,
455
+ gstate.bind_data.type == JSONScanType::SAMPLE);
367
456
  }
368
457
 
369
- void JSONScanLocalState::ReadNextBufferNoSeek(JSONScanGlobalState &gstate, bool &first_read, idx_t &buffer_index) {
458
+ void JSONScanLocalState::ReadNextBufferNoSeek(JSONScanGlobalState &gstate, idx_t &buffer_index) {
370
459
  auto &file_handle = current_reader->GetFileHandle();
371
460
 
372
461
  idx_t request_size = gstate.buffer_capacity - prev_buffer_remainder - YYJSON_PADDING_SIZE;
373
462
  idx_t read_size;
374
463
  {
375
- lock_guard<mutex> guard(gstate.lock);
464
+ lock_guard<mutex> reader_guard(current_reader->lock);
376
465
  buffer_index = current_reader->GetBufferIndex();
377
466
 
378
- first_read = file_handle.Remaining() == file_handle.FileSize();
379
- read_size = file_handle.Read(buffer_ptr + prev_buffer_remainder, request_size);
467
+ read_size = file_handle.Read(buffer_ptr + prev_buffer_remainder, request_size,
468
+ gstate.bind_data.type == JSONScanType::SAMPLE);
380
469
  is_last = read_size < request_size;
381
470
 
382
471
  if (!gstate.bind_data.ignore_errors && read_size == 0 && prev_buffer_remainder != 0) {
383
472
  throw InvalidInputException("Invalid JSON detected at the end of file %s", current_reader->file_path);
384
473
  }
474
+
475
+ if (current_reader->GetOptions().format == JSONFormat::NEWLINE_DELIMITED) {
476
+ batch_index = gstate.batch_index++;
477
+ }
385
478
  }
386
479
  buffer_size = prev_buffer_remainder + read_size;
480
+ if (buffer_size == 0) {
481
+ current_reader->SetBufferLineOrObjectCount(buffer_index, 0);
482
+ return;
483
+ }
387
484
  }
388
485
 
389
486
  void JSONScanLocalState::ReconstructFirstObject(JSONScanGlobalState &gstate) {
@@ -408,14 +505,21 @@ void JSONScanLocalState::ReconstructFirstObject(JSONScanGlobalState &gstate) {
408
505
  auto line_end = NextNewline(buffer_ptr, buffer_size);
409
506
  if (line_end == nullptr) { // TODO I don't think we can ignore this even with ignore_errors ...
410
507
  throw InvalidInputException("maximum_object_size of %llu bytes exceeded (>%llu bytes), is the JSON valid?",
411
- gstate.bind_data.maximum_object_size, buffer_size - buffer_offset);
508
+ bind_data.maximum_object_size, buffer_size - buffer_offset);
412
509
  } else {
413
510
  line_end++;
414
511
  }
415
512
  idx_t part2_size = line_end - buffer_ptr;
416
513
 
514
+ idx_t line_size = part1_size + part2_size;
515
+ if (line_size > bind_data.maximum_object_size) {
516
+ throw InvalidInputException("maximum_object_size of %llu bytes exceeded (%llu bytes), is the JSON valid?",
517
+ bind_data.maximum_object_size, line_size);
518
+ }
519
+
417
520
  // And copy the remainder of the line to the reconstruct buffer
418
521
  memcpy(reconstruct_ptr + part1_size, buffer_ptr, part2_size);
522
+ memset((void *)(reconstruct_ptr + line_size), 0, YYJSON_PADDING_SIZE);
419
523
  buffer_offset += part2_size;
420
524
 
421
525
  // We copied the object, so we are no longer reading the previous buffer
@@ -423,7 +527,7 @@ void JSONScanLocalState::ReconstructFirstObject(JSONScanGlobalState &gstate) {
423
527
  current_reader->RemoveBuffer(current_buffer_handle->buffer_index - 1);
424
528
  }
425
529
 
426
- objects[0] = ParseLine((char *)reconstruct_ptr, part1_size + part2_size, lines[0], gstate.bind_data.ignore_errors);
530
+ objects[0] = ParseLine((char *)reconstruct_ptr, line_size, line_size, lines[0]);
427
531
  }
428
532
 
429
533
  void JSONScanLocalState::ReadUnstructured(idx_t &count) {
@@ -448,9 +552,11 @@ void JSONScanLocalState::ReadUnstructured(idx_t &count) {
448
552
 
449
553
  buffer_offset += line_size;
450
554
  SkipWhitespace(buffer_ptr, buffer_offset, buffer_size);
555
+ lines_or_objects_in_buffer++;
451
556
  } else if (error.pos > max_obj_size) {
452
- JSONCommon::ThrowParseError(obj_copy_start, remaining, error,
453
- "Have you tried increasing maximum_object_size?");
557
+ current_reader->ThrowParseError(current_buffer_handle->buffer_index, lines_or_objects_in_buffer, error,
558
+ "Try increasing \"maximum_object_size\".");
559
+
454
560
  } else if (error.code == YYJSON_READ_ERROR_UNEXPECTED_END && !is_last) {
455
561
  // Copy remaining to reconstruct_buffer
456
562
  const auto reconstruct_ptr = reconstruct_buffer.get();
@@ -459,13 +565,13 @@ void JSONScanLocalState::ReadUnstructured(idx_t &count) {
459
565
  buffer_offset = buffer_size;
460
566
  break;
461
567
  } else {
462
- JSONCommon::ThrowParseError(obj_copy_start, remaining, error);
568
+ current_reader->ThrowParseError(current_buffer_handle->buffer_index, lines_or_objects_in_buffer, error);
463
569
  }
464
- objects[count] = read_doc;
570
+ objects[count] = read_doc->root;
465
571
  }
466
572
  }
467
573
 
468
- void JSONScanLocalState::ReadNewlineDelimited(idx_t &count, const bool &ignore_errors) {
574
+ void JSONScanLocalState::ReadNewlineDelimited(idx_t &count) {
469
575
  for (; count < STANDARD_VECTOR_SIZE; count++) {
470
576
  auto line_start = buffer_ptr + buffer_offset;
471
577
  idx_t remaining = buffer_size - buffer_offset;
@@ -487,15 +593,15 @@ void JSONScanLocalState::ReadNewlineDelimited(idx_t &count, const bool &ignore_e
487
593
  }
488
594
  idx_t line_size = line_end - line_start;
489
595
 
490
- objects[count] = ParseLine((char *)line_start, line_size, lines[count], ignore_errors);
596
+ objects[count] = ParseLine((char *)line_start, line_size, remaining, lines[count]);
491
597
 
492
598
  buffer_offset += line_size;
493
599
  SkipWhitespace(buffer_ptr, buffer_offset, buffer_size);
494
600
  }
495
601
  }
496
602
 
497
- idx_t JSONScanLocalState::GetBatchIndex() const {
498
- return batch_index;
603
+ yyjson_alc *JSONScanLocalState::GetAllocator() {
604
+ return json_allocator.GetYYJSONAllocator();
499
605
  }
500
606
 
501
607
  } // namespace duckdb