duckdb 0.7.1-dev107.0 → 0.7.1-dev137.0

This diff shows the changes between two publicly released versions of the package, as published to its public registry. It is provided for informational purposes only.
package/package.json CHANGED
@@ -2,7 +2,7 @@
   "name": "duckdb",
   "main": "./lib/duckdb.js",
   "types": "./lib/duckdb.d.ts",
- "version": "0.7.1-dev107.0",
+ "version": "0.7.1-dev137.0",
   "description": "DuckDB node.js API",
   "gypfile": true,
   "dependencies": {
@@ -25,7 +25,12 @@ JSONBufferHandle::JSONBufferHandle(idx_t buffer_index_p, idx_t readers_p, Alloca
  JSONFileHandle::JSONFileHandle(unique_ptr<FileHandle> file_handle_p, Allocator &allocator_p)
      : file_handle(std::move(file_handle_p)), allocator(allocator_p), can_seek(file_handle->CanSeek()),
        plain_file_source(file_handle->OnDiskFile() && can_seek), file_size(file_handle->GetFileSize()), read_position(0),
-       cached_size(0) {
+       requested_reads(0), actual_reads(0), cached_size(0) {
+ }
+
+ void JSONFileHandle::Close() {
+ 	file_handle->Close();
+ 	cached_buffers.clear();
  }

  idx_t JSONFileHandle::FileSize() const {
@@ -36,10 +41,6 @@ idx_t JSONFileHandle::Remaining() const {
  	return file_size - read_position;
  }

- bool JSONFileHandle::PlainFileSource() const {
- 	return plain_file_source;
- }
-
  bool JSONFileHandle::CanSeek() const {
  	return can_seek;
  }
@@ -53,6 +54,9 @@ idx_t JSONFileHandle::GetPositionAndSize(idx_t &position, idx_t requested_size)
  	position = read_position;
  	auto actual_size = MinValue<idx_t>(requested_size, Remaining());
  	read_position += actual_size;
+ 	if (actual_size != 0) {
+ 		requested_reads++;
+ 	}
  	return actual_size;
  }

@@ -60,11 +64,13 @@ void JSONFileHandle::ReadAtPosition(const char *pointer, idx_t size, idx_t posit
  	D_ASSERT(size != 0);
  	if (plain_file_source) {
  		file_handle->Read((void *)pointer, size, position);
+ 		actual_reads++;
  		return;
  	}

  	if (sample_run) { // Cache the buffer
  		file_handle->Read((void *)pointer, size, position);
+ 		actual_reads++;
  		cached_buffers.emplace_back(allocator.Allocate(size));
  		memcpy(cached_buffers.back().get(), pointer, size);
  		cached_size += size;
@@ -73,9 +79,11 @@ void JSONFileHandle::ReadAtPosition(const char *pointer, idx_t size, idx_t posit

  	if (!cached_buffers.empty() || position < cached_size) {
  		ReadFromCache(pointer, size, position);
+ 		actual_reads++;
  	}
  	if (size != 0) {
  		file_handle->Read((void *)pointer, size, position);
+ 		actual_reads++;
  	}
  }

@@ -143,6 +151,16 @@ void BufferedJSONReader::OpenJSONFile() {
  	file_handle = make_unique<JSONFileHandle>(std::move(regular_file_handle), BufferAllocator::Get(context));
  }

+ void BufferedJSONReader::CloseJSONFile() {
+ 	while (true) {
+ 		lock_guard<mutex> guard(lock);
+ 		if (file_handle->RequestedReadsComplete()) {
+ 			file_handle->Close();
+ 			break;
+ 		}
+ 	}
+ }
+
  bool BufferedJSONReader::IsOpen() {
  	return file_handle != nullptr;
  }
@@ -246,9 +264,15 @@ void BufferedJSONReader::Reset() {

  void JSONFileHandle::Reset() {
  	read_position = 0;
+ 	requested_reads = 0;
+ 	actual_reads = 0;
  	if (plain_file_source) {
  		file_handle->Reset();
  	}
  }

+ bool JSONFileHandle::RequestedReadsComplete() {
+ 	return requested_reads == actual_reads;
+ }
+
  } // namespace duckdb
@@ -58,11 +58,11 @@ public:
  struct JSONFileHandle {
  public:
  	JSONFileHandle(unique_ptr<FileHandle> file_handle, Allocator &allocator);
+ 	void Close();

  	idx_t FileSize() const;
  	idx_t Remaining() const;

- 	bool PlainFileSource() const;
  	bool CanSeek() const;
  	void Seek(idx_t position);

@@ -71,6 +71,7 @@ public:
  	idx_t Read(const char *pointer, idx_t requested_size, bool sample_run);

  	void Reset();
+ 	bool RequestedReadsComplete();

  private:
  	idx_t ReadFromCache(const char *&pointer, idx_t &size, idx_t &position);
@@ -87,6 +88,8 @@ private:

  	//! Read properties
  	idx_t read_position;
+ 	idx_t requested_reads;
+ 	atomic<idx_t> actual_reads;

  	//! Cached buffers for resetting when reading stream
  	vector<AllocatedData> cached_buffers;
@@ -98,6 +101,7 @@ public:
  	BufferedJSONReader(ClientContext &context, BufferedJSONReaderOptions options, string file_path);

  	void OpenJSONFile();
+ 	void CloseJSONFile();
  	bool IsOpen();

  	BufferedJSONReaderOptions &GetOptions();
@@ -26,6 +26,16 @@ enum class JSONScanType : uint8_t {
  	SAMPLE = 3,
  };

+ enum class JSONScanTopLevelType : uint8_t {
+ 	INVALID = 0,
+ 	//! Sequential objects, e.g., NDJSON
+ 	OBJECTS = 1,
+ 	//! Top-level array containing objects
+ 	ARRAY_OF_OBJECTS = 2,
+ 	//! Other, e.g., array of integer, or just strings
+ 	OTHER = 3
+ };
+
  //! Even though LogicalTypeId is just a uint8_t, this is still needed ...
  struct LogicalTypeIdHash {
  	inline std::size_t operator()(const LogicalTypeId &id) const {
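The new `JSONScanTopLevelType` enum separates the two layouts the scanner now handles specially: newline-delimited objects (`OBJECTS`) and a single top-level array of objects (`ARRAY_OF_OBJECTS`). A minimal sketch of the difference, seen from this package's Node.js API; file names and contents are hypothetical, and this assumes a build of this dev version:

```ts
import * as duckdb from 'duckdb';

const db = new duckdb.Database(':memory:');

// OBJECTS: newline-delimited JSON (NDJSON), one object per line, e.g.
//   {"id": 1, "name": "a"}
//   {"id": 2, "name": "b"}
db.all("SELECT * FROM read_json_auto('events.ndjson')", (err, rows) => {
  if (err) throw err;
  console.log(rows); // expected: [{ id: 1, name: 'a' }, { id: 2, name: 'b' }]
});

// ARRAY_OF_OBJECTS: the whole file is a single JSON array, e.g.
//   [{"id": 1, "name": "a"}, {"id": 2, "name": "b"}]
// With this change the scanner should unnest the array elements into rows too.
db.all("SELECT * FROM read_json_auto('events.json')", (err, rows) => {
  if (err) throw err;
  console.log(rows); // expected: the same two rows
});
```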
@@ -105,7 +115,7 @@ public:
  	//! Max depth we go to detect nested JSON schema (defaults to unlimited)
  	idx_t max_depth = NumericLimits<idx_t>::Maximum();
  	//! Whether we're parsing objects (usually), or something else like arrays
- 	bool objects = true;
+ 	JSONScanTopLevelType top_level_type = JSONScanTopLevelType::OBJECTS;
  	//! Forced date/timestamp formats
  	string date_format;
  	string timestamp_format;
@@ -181,9 +191,14 @@ public:
  	yyjson_alc *GetAllocator();
  	void ThrowTransformError(idx_t count, idx_t object_index, const string &error_message);

+ 	idx_t scan_count;
  	JSONLine lines[STANDARD_VECTOR_SIZE];
  	yyjson_val *objects[STANDARD_VECTOR_SIZE];

+ 	idx_t array_idx;
+ 	idx_t array_offset;
+ 	yyjson_val *array_objects[STANDARD_VECTOR_SIZE];
+
  	idx_t batch_index;

  	//! Options when transforming the JSON to columnar data
@@ -192,6 +207,7 @@ public:

  private:
  	yyjson_val *ParseLine(char *line_start, idx_t line_size, idx_t remaining, JSONLine &line);
+ 	idx_t GetObjectsFromArray();

  private:
  	//! Bind data
@@ -300,7 +316,6 @@ public:
  	table_function.serialize = JSONScanSerialize;
  	table_function.deserialize = JSONScanDeserialize;

- 	// TODO: might be able to do some of these
  	table_function.projection_pushdown = false;
  	table_function.filter_pushdown = false;
  	table_function.filter_prune = false;
@@ -523,6 +523,21 @@ static bool TransformArray(yyjson_val *arrays[], yyjson_alc *alc, Vector &result
  	return success;
  }

+ bool TransformToJSON(yyjson_val *vals[], yyjson_alc *alc, Vector &result, const idx_t count) {
+ 	auto data = (string_t *)FlatVector::GetData(result);
+ 	auto &validity = FlatVector::Validity(result);
+ 	for (idx_t i = 0; i < count; i++) {
+ 		const auto &val = vals[i];
+ 		if (!val) {
+ 			validity.SetInvalid(i);
+ 		} else {
+ 			data[i] = JSONCommon::WriteVal(val, alc);
+ 		}
+ 	}
+ 	// Can always transform to JSON
+ 	return true;
+ }
+
  bool JSONTransform::Transform(yyjson_val *vals[], yyjson_alc *alc, Vector &result, const idx_t count,
                                JSONTransformOptions &options) {
  	auto result_type = result.GetType();
@@ -531,6 +546,10 @@ bool JSONTransform::Transform(yyjson_val *vals[], yyjson_alc *alc, Vector &resul
  		return TransformFromStringWithFormat(vals, result, count, options);
  	}

+ 	if (JSONCommon::LogicalTypeIsJSON(result_type)) {
+ 		return TransformToJSON(vals, alc, result, count);
+ 	}
+
  	switch (result_type.id()) {
  	case LogicalTypeId::SQLNULL:
  		return true;
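With `JSONCommon::LogicalTypeIsJSON` handled up front in `Transform`, a column requested as `JSON` is serialized back to JSON text via `TransformToJSON` rather than being forced into a detected structure (and, per the comment, this can never fail). A hedged sketch of what that enables, assuming `read_json` in this dev build accepts `JSON` in its `columns` parameter; the file is hypothetical:

```ts
import * as duckdb from 'duckdb';

const db = new duckdb.Database(':memory:');

// 'payload' varies per record; declaring it as JSON should keep the raw
// nested value as JSON text instead of requiring a fixed struct schema.
db.all(
  "SELECT * FROM read_json('logs.ndjson', " +
    "columns={id: 'INTEGER', payload: 'JSON'}, format='newline_delimited')",
  (err, rows) => {
    if (err) throw err;
    console.log(rows[0].payload); // e.g. '{"level":"info","tags":[1,2]}'
  }
);
```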
@@ -13,32 +13,17 @@ void JSONScan::AutoDetect(ClientContext &context, JSONScanData &bind_data, vecto
  	JSONScanLocalState lstate(context, gstate);
  	ArenaAllocator allocator(BufferAllocator::Get(context));

- 	static const unordered_map<LogicalTypeId, vector<const char *>, LogicalTypeIdHash> FORMAT_TEMPLATES = {
- 	    {LogicalTypeId::DATE, {"%m-%d-%Y", "%m-%d-%y", "%d-%m-%Y", "%d-%m-%y", "%Y-%m-%d", "%y-%m-%d"}},
- 	    {LogicalTypeId::TIMESTAMP,
- 	     {"%Y-%m-%d %H:%M:%S.%f", "%m-%d-%Y %I:%M:%S %p", "%m-%d-%y %I:%M:%S %p", "%d-%m-%Y %H:%M:%S",
- 	      "%d-%m-%y %H:%M:%S", "%Y-%m-%d %H:%M:%S", "%y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%SZ"}},
- 	};
-
- 	// Populate possible date/timestamp formats, assume this is consistent across columns
- 	for (auto &kv : FORMAT_TEMPLATES) {
- 		const auto &type = kv.first;
- 		if (bind_data.date_format_map.HasFormats(type)) {
- 			continue; // Already populated
- 		}
- 		const auto &format_strings = kv.second;
- 		for (auto &format_string : format_strings) {
- 			bind_data.date_format_map.AddFormat(type, format_string);
- 		}
- 	}
-
  	// Read for the specified sample size
  	JSONStructureNode node;
+ 	bool more_than_one = false;
  	Vector string_vector(LogicalType::VARCHAR);
  	idx_t remaining = bind_data.sample_size;
  	while (remaining != 0) {
  		allocator.Reset();
  		auto read_count = lstate.ReadNext(gstate);
+ 		if (read_count > 1) {
+ 			more_than_one = true;
+ 		}
  		if (read_count == 0) {
  			break;
  		}
@@ -54,15 +39,29 @@ void JSONScan::AutoDetect(ClientContext &context, JSONScanData &bind_data, vecto
  		node.InitializeCandidateTypes(bind_data.max_depth);
  		node.RefineCandidateTypes(lstate.objects, next, string_vector, allocator, bind_data.date_format_map);
  		remaining -= next;
+
+ 		if (gstate.file_index == 10) {
+ 			// We really shouldn't open more than 10 files when sampling
+ 			break;
+ 		}
  	}
  	bind_data.type = original_scan_type;
  	bind_data.transform_options.date_format_map = &bind_data.date_format_map;

- 	const auto type = JSONStructure::StructureToType(context, node, bind_data.max_depth);
+ 	auto type = JSONStructure::StructureToType(context, node, bind_data.max_depth);
+ 	if (type.id() == LogicalTypeId::STRUCT) {
+ 		bind_data.top_level_type = JSONScanTopLevelType::OBJECTS;
+ 	} else if (!more_than_one && type.id() == LogicalTypeId::LIST &&
+ 	           ListType::GetChildType(type).id() == LogicalTypeId::STRUCT) {
+ 		bind_data.top_level_type = JSONScanTopLevelType::ARRAY_OF_OBJECTS;
+ 		bind_data.options.format = JSONFormat::UNSTRUCTURED;
+ 		type = ListType::GetChildType(type);
+ 	}
+
  	if (type.id() != LogicalTypeId::STRUCT) {
  		return_types.emplace_back(type);
  		names.emplace_back("json");
- 		bind_data.objects = false;
+ 		bind_data.top_level_type = JSONScanTopLevelType::OTHER;
  	} else {
  		const auto &child_types = StructType::GetChildTypes(type);
  		return_types.reserve(child_types.size());
@@ -189,9 +188,11 @@ static void ReadJSONFunction(ClientContext &context, TableFunctionInput &data_p,
  	auto &gstate = ((JSONGlobalTableFunctionState &)*data_p.global_state).state;
  	auto &lstate = ((JSONLocalTableFunctionState &)*data_p.local_state).state;

- 	// Fetch next lines
  	const auto count = lstate.ReadNext(gstate);
- 	const auto objects = lstate.objects;
+ 	const auto objects = gstate.bind_data.top_level_type == JSONScanTopLevelType::ARRAY_OF_OBJECTS
+ 	                         ? lstate.array_objects
+ 	                         : lstate.objects;
+ 	output.SetCardinality(count);

  	vector<Vector *> result_vectors;
  	result_vectors.reserve(output.ColumnCount());
@@ -202,13 +203,14 @@ static void ReadJSONFunction(ClientContext &context, TableFunctionInput &data_p,

  	// Pass current reader to transform options so we can get line number information if an error occurs
  	bool success;
- 	if (gstate.bind_data.objects) {
- 		success = JSONTransform::TransformObject(objects, lstate.GetAllocator(), count, gstate.bind_data.names,
- 		                                         result_vectors, lstate.transform_options);
- 	} else {
+ 	if (gstate.bind_data.top_level_type == JSONScanTopLevelType::OTHER) {
  		success = JSONTransform::Transform(objects, lstate.GetAllocator(), *result_vectors[0], count,
  		                                   lstate.transform_options);
+ 	} else {
+ 		success = JSONTransform::TransformObject(objects, lstate.GetAllocator(), count, gstate.bind_data.names,
+ 		                                         result_vectors, lstate.transform_options);
  	}
+
  	if (!success) {
  		string hint = gstate.bind_data.auto_detect
  		                  ? "\nTry increasing 'sample_size', reducing 'maximum_depth', specifying 'columns' manually, "
@@ -217,7 +219,6 @@ static void ReadJSONFunction(ClientContext &context, TableFunctionInput &data_p,
  		lstate.ThrowTransformError(count, lstate.transform_options.object_index,
  		                           lstate.transform_options.error_message + hint);
  	}
- 	output.SetCardinality(count);
  }

  TableFunction JSONFunctions::GetReadJSONTableFunction(bool list_parameter, shared_ptr<JSONScanInfo> function_info) {
@@ -235,6 +236,7 @@ TableFunction JSONFunctions::GetReadJSONTableFunction(bool list_parameter, share
  	table_function.named_parameters["timestamp_format"] = LogicalType::VARCHAR;

  	table_function.projection_pushdown = true;
+ 	// TODO: might be able to do filter pushdown/prune too

  	table_function.function_info = std::move(function_info);

@@ -166,6 +166,12 @@ vector<CreateTableFunctionInfo> JSONFunctions::GetTableFunctions() {
  unique_ptr<TableRef> JSONFunctions::ReadJSONReplacement(ClientContext &context, const string &table_name,
                                                          ReplacementScanData *data) {
  	auto lower_name = StringUtil::Lower(table_name);
+ 	// remove any compression
+ 	if (StringUtil::EndsWith(lower_name, ".gz")) {
+ 		lower_name = lower_name.substr(0, lower_name.size() - 3);
+ 	} else if (StringUtil::EndsWith(lower_name, ".zst")) {
+ 		lower_name = lower_name.substr(0, lower_name.size() - 4);
+ 	}
  	if (!StringUtil::EndsWith(lower_name, ".json") && !StringUtil::Contains(lower_name, ".json?") &&
  	    !StringUtil::EndsWith(lower_name, ".ndjson") && !StringUtil::Contains(lower_name, ".ndjson?")) {
  		return nullptr;
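The replacement scan now strips a trailing `.gz` or `.zst` before testing for the `.json`/`.ndjson` suffixes, so compressed JSON files can be queried by bare file name. A sketch (hypothetical file):

```ts
import * as duckdb from 'duckdb';

const db = new duckdb.Database(':memory:');

// Previously 'events.ndjson.gz' would not match the JSON replacement scan,
// because the name ends in '.gz'; the compression suffix is now removed
// before the extension check.
db.all("SELECT count(*) AS n FROM 'events.ndjson.gz'", (err, rows) => {
  if (err) throw err;
  console.log(rows[0].n);
});
```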
@@ -48,8 +48,11 @@ unique_ptr<FunctionData> JSONScanData::Bind(ClientContext &context, TableFunctio
  			options.format = JSONFormat::UNSTRUCTURED;
  		} else if (format == "newline_delimited") {
  			options.format = JSONFormat::NEWLINE_DELIMITED;
+ 		} else if (format == "array_of_objects") {
+ 			result->top_level_type = JSONScanTopLevelType::ARRAY_OF_OBJECTS;
  		} else {
- 			throw BinderException("format must be one of ['auto', 'unstructured', 'newline_delimited']");
+ 			throw BinderException(
+ 			    "format must be one of ['auto', 'unstructured', 'newline_delimited', 'array_of_objects']");
  		}
  	} else if (loption == "compression") {
  		auto compression = StringUtil::Lower(StringValue::Get(kv.second));
@@ -67,6 +70,10 @@ unique_ptr<FunctionData> JSONScanData::Bind(ClientContext &context, TableFunctio
  		}
  	}

+ 	if (result->top_level_type == JSONScanTopLevelType::ARRAY_OF_OBJECTS) {
+ 		result->options.format = JSONFormat::UNSTRUCTURED;
+ 	}
+
  	return std::move(result);
  }

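Besides being auto-detected during sampling, the array-of-objects layout can now be requested explicitly with `format='array_of_objects'`; binding then forces `JSONFormat::UNSTRUCTURED`, since the whole array has to be parsed as one document. A sketch under the same assumptions as above (hypothetical file):

```ts
import * as duckdb from 'duckdb';

const db = new duckdb.Database(':memory:');

// data.json is assumed to hold exactly one top-level array, e.g.
//   [{"a": 1}, {"a": 2}, {"a": 3}]
// A file with more than one top-level value raises an InvalidInputException.
db.all(
  "SELECT * FROM read_json('data.json', format='array_of_objects', " +
    "columns={a: 'INTEGER'})",
  (err, rows) => {
    if (err) throw err;
    console.log(rows.length); // expected: 3, one row per array element
  }
);
```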
@@ -98,6 +105,27 @@ void JSONScanData::InitializeFormats() {
  	if (!timestamp_format.empty()) {
  		date_format_map.AddFormat(LogicalTypeId::TIMESTAMP, timestamp_format);
  	}
+
+ 	if (auto_detect) {
+ 		static const unordered_map<LogicalTypeId, vector<const char *>, LogicalTypeIdHash> FORMAT_TEMPLATES = {
+ 		    {LogicalTypeId::DATE, {"%m-%d-%Y", "%m-%d-%y", "%d-%m-%Y", "%d-%m-%y", "%Y-%m-%d", "%y-%m-%d"}},
+ 		    {LogicalTypeId::TIMESTAMP,
+ 		     {"%Y-%m-%d %H:%M:%S.%f", "%m-%d-%Y %I:%M:%S %p", "%m-%d-%y %I:%M:%S %p", "%d-%m-%Y %H:%M:%S",
+ 		      "%d-%m-%y %H:%M:%S", "%Y-%m-%d %H:%M:%S", "%y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%SZ"}},
+ 		};
+
+ 		// Populate possible date/timestamp formats, assume this is consistent across columns
+ 		for (auto &kv : FORMAT_TEMPLATES) {
+ 			const auto &type = kv.first;
+ 			if (date_format_map.HasFormats(type)) {
+ 				continue; // Already populated
+ 			}
+ 			const auto &format_strings = kv.second;
+ 			for (auto &format_string : format_strings) {
+ 				date_format_map.AddFormat(type, format_string);
+ 			}
+ 		}
+ 	}
  }

  void JSONScanData::Serialize(FieldWriter &writer) {
@@ -112,9 +140,17 @@ void JSONScanData::Serialize(FieldWriter &writer) {
  	writer.WriteList<string>(names);
  	writer.WriteList<idx_t>(valid_cols);
  	writer.WriteField<idx_t>(max_depth);
- 	writer.WriteField<bool>(objects);
- 	writer.WriteString(date_format);
- 	writer.WriteString(timestamp_format);
+ 	writer.WriteField<JSONScanTopLevelType>(top_level_type);
+ 	if (!date_format.empty()) {
+ 		writer.WriteString(date_format);
+ 	} else {
+ 		writer.WriteString(date_format_map.GetFormat(LogicalTypeId::DATE).format_specifier);
+ 	}
+ 	if (!timestamp_format.empty()) {
+ 		writer.WriteString(timestamp_format);
+ 	} else {
+ 		writer.WriteString(date_format_map.GetFormat(LogicalTypeId::TIMESTAMP).format_specifier);
+ 	}
  }

  void JSONScanData::Deserialize(FieldReader &reader) {
@@ -129,9 +165,12 @@ void JSONScanData::Deserialize(FieldReader &reader) {
  	names = reader.ReadRequiredList<string>();
  	valid_cols = reader.ReadRequiredList<idx_t>();
  	max_depth = reader.ReadRequired<idx_t>();
- 	objects = reader.ReadRequired<bool>();
+ 	top_level_type = reader.ReadRequired<JSONScanTopLevelType>();
  	date_format = reader.ReadRequired<string>();
  	timestamp_format = reader.ReadRequired<string>();
+
+ 	InitializeFormats();
+ 	transform_options.date_format_map = &date_format_map;
  }

  JSONScanGlobalState::JSONScanGlobalState(ClientContext &context, JSONScanData &bind_data_p)
@@ -150,9 +189,9 @@ JSONScanGlobalState::JSONScanGlobalState(ClientContext &context, JSONScanData &b
  }

  JSONScanLocalState::JSONScanLocalState(ClientContext &context, JSONScanGlobalState &gstate)
-     : batch_index(DConstants::INVALID_INDEX), bind_data(gstate.bind_data),
+     : scan_count(0), array_idx(0), array_offset(0), batch_index(DConstants::INVALID_INDEX), bind_data(gstate.bind_data),
        json_allocator(BufferAllocator::Get(context)), current_reader(nullptr), current_buffer_handle(nullptr),
-       buffer_size(0), buffer_offset(0), prev_buffer_remainder(0) {
+       is_last(false), buffer_size(0), buffer_offset(0), prev_buffer_remainder(0) {

  	// Buffer to reconstruct JSON objects when they cross a buffer boundary
  	reconstruct_buffer = gstate.allocator.Allocate(gstate.bind_data.maximum_object_size + YYJSON_PADDING_SIZE);
@@ -174,11 +213,6 @@ unique_ptr<GlobalTableFunctionState> JSONGlobalTableFunctionState::Init(ClientCo
  	// Perform projection pushdown
  	if (bind_data.type == JSONScanType::READ_JSON) {
  		D_ASSERT(input.column_ids.size() <= bind_data.names.size()); // Can't project to have more columns
- 		if (bind_data.auto_detect && input.column_ids.size() < bind_data.names.size()) {
- 			// If we are auto-detecting, but don't need all columns present in the file,
- 			// then we don't need to throw an error if we encounter an unseen column
- 			bind_data.transform_options.error_unknown_key = false;
- 		}
  		vector<string> names;
  		names.reserve(input.column_ids.size());
  		for (idx_t i = 0; i < input.column_ids.size(); i++) {
@@ -189,6 +223,11 @@ unique_ptr<GlobalTableFunctionState> JSONGlobalTableFunctionState::Init(ClientCo
  			names.push_back(std::move(bind_data.names[id]));
  			bind_data.valid_cols.push_back(i);
  		}
+ 		if (names.size() < bind_data.names.size()) {
+ 			// If we are auto-detecting, but don't need all columns present in the file,
+ 			// then we don't need to throw an error if we encounter an unseen column
+ 			bind_data.transform_options.error_unknown_key = false;
+ 		}
  		bind_data.names = std::move(names);
  	}
  	return result;
@@ -231,6 +270,10 @@ static inline void SkipWhitespace(const char *buffer_ptr, idx_t &buffer_offset,
  idx_t JSONScanLocalState::ReadNext(JSONScanGlobalState &gstate) {
  	json_allocator.Reset();

+ 	if (gstate.bind_data.top_level_type == JSONScanTopLevelType::ARRAY_OF_OBJECTS && array_idx < scan_count) {
+ 		return GetObjectsFromArray();
+ 	}
+
  	idx_t count = 0;
  	if (buffer_offset == buffer_size) {
  		if (!ReadNextBuffer(gstate)) {
@@ -254,10 +297,20 @@ idx_t JSONScanLocalState::ReadNext(JSONScanGlobalState &gstate) {
  	default:
  		throw InternalException("Unknown JSON format");
  	}
+ 	scan_count = count;

  	// Skip over any remaining whitespace for the next scan
  	SkipWhitespace(buffer_ptr, buffer_offset, buffer_size);

+ 	if (gstate.bind_data.top_level_type == JSONScanTopLevelType::ARRAY_OF_OBJECTS) {
+ 		if (scan_count > 1) {
+ 			throw InvalidInputException("File must have exactly one array of objects when format='array_of_objects'");
+ 		}
+ 		array_idx = 0;
+ 		array_offset = 0;
+ 		return GetObjectsFromArray();
+ 	}
+
  	return count;
  }

@@ -332,10 +385,39 @@ yyjson_val *JSONScanLocalState::ParseLine(char *line_start, idx_t line_size, idx
  	}
  }

+ idx_t JSONScanLocalState::GetObjectsFromArray() {
+ 	idx_t arr_count = 0;
+
+ 	size_t idx, max;
+ 	yyjson_val *val;
+ 	for (; array_idx < scan_count; array_idx++, array_offset = 0) {
+ 		if (objects[array_idx]) {
+ 			yyjson_arr_foreach(objects[array_idx], idx, max, val) {
+ 				if (idx < array_offset) {
+ 					continue;
+ 				}
+ 				array_objects[arr_count++] = val;
+ 				if (arr_count == STANDARD_VECTOR_SIZE) {
+ 					break;
+ 				}
+ 			}
+ 			array_offset = idx + 1;
+ 			if (arr_count == STANDARD_VECTOR_SIZE) {
+ 				break;
+ 			}
+ 		}
+ 	}
+ 	return arr_count;
+ }
+
  bool JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate) {
  	if (current_reader) {
  		D_ASSERT(current_buffer_handle);
  		current_reader->SetBufferLineOrObjectCount(current_buffer_handle->buffer_index, lines_or_objects_in_buffer);
+ 		if (is_last && gstate.bind_data.type != JSONScanType::SAMPLE) {
+ 			// Close files that are done if we're not sampling
+ 			current_reader->CloseJSONFile();
+ 		}
  	}

  	AllocatedData buffer;
@@ -396,7 +478,9 @@ bool JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate) {
  		// Unopened file
  		current_reader->OpenJSONFile();
  		batch_index = gstate.batch_index++;
- 		if (options.format == JSONFormat::UNSTRUCTURED) {
+ 		if (options.format == JSONFormat::UNSTRUCTURED || (options.format == JSONFormat::NEWLINE_DELIMITED &&
+ 		                                                   options.compression != FileCompressionType::UNCOMPRESSED &&
+ 		                                                   gstate.file_index < gstate.json_readers.size())) {
  			gstate.file_index++; // UNSTRUCTURED necessitates single-threaded read
  		}
  		if (options.format != JSONFormat::AUTO_DETECT) {
@@ -450,9 +534,6 @@ bool JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate) {
  	auto json_buffer_handle = make_unique<JSONBufferHandle>(buffer_index, readers, std::move(buffer), buffer_size);
  	current_buffer_handle = json_buffer_handle.get();
  	current_reader->InsertBuffer(buffer_index, std::move(json_buffer_handle));
- 	if (!current_reader->GetFileHandle().PlainFileSource() && gstate.bind_data.type == JSONScanType::SAMPLE) {
- 		// TODO: store buffer
- 	}

  	buffer_offset = 0;
  	prev_buffer_remainder = 0;
@@ -508,16 +589,18 @@ void JSONScanLocalState::ReadNextBufferSeek(JSONScanGlobalState &gstate, idx_t &
  }

  void JSONScanLocalState::ReadNextBufferNoSeek(JSONScanGlobalState &gstate, idx_t &buffer_index) {
- 	auto &file_handle = current_reader->GetFileHandle();
-
  	idx_t request_size = gstate.buffer_capacity - prev_buffer_remainder - YYJSON_PADDING_SIZE;
  	idx_t read_size;
  	{
  		lock_guard<mutex> reader_guard(current_reader->lock);
  		buffer_index = current_reader->GetBufferIndex();

- 		read_size = file_handle.Read(buffer_ptr + prev_buffer_remainder, request_size,
- 		                             gstate.bind_data.type == JSONScanType::SAMPLE);
+ 		if (current_reader->IsOpen()) {
+ 			read_size = current_reader->GetFileHandle().Read(buffer_ptr + prev_buffer_remainder, request_size,
+ 			                                                 gstate.bind_data.type == JSONScanType::SAMPLE);
+ 		} else {
+ 			read_size = 0;
+ 		}
  		is_last = read_size < request_size;

  		if (!gstate.bind_data.ignore_errors && read_size == 0 && prev_buffer_remainder != 0) {
@@ -583,6 +666,11 @@ void JSONScanLocalState::ReconstructFirstObject(JSONScanGlobalState &gstate) {
  }

  void JSONScanLocalState::ReadUnstructured(idx_t &count) {
+ 	// yyjson does not always return YYJSON_READ_ERROR_UNEXPECTED_END properly
+ 	// if a different error code happens within the last 50 bytes
+ 	// we assume it should be YYJSON_READ_ERROR_UNEXPECTED_END instead
+ 	static constexpr idx_t END_BOUND = 50;
+
  	const auto max_obj_size = reconstruct_buffer.GetSize();
  	yyjson_read_err error;
  	for (; count < STANDARD_VECTOR_SIZE; count++) {
@@ -608,8 +696,7 @@ void JSONScanLocalState::ReadUnstructured(idx_t &count) {
  		} else if (error.pos > max_obj_size) {
  			current_reader->ThrowParseError(current_buffer_handle->buffer_index, lines_or_objects_in_buffer, error,
  			                                "Try increasing \"maximum_object_size\".");
-
- 		} else if (error.code == YYJSON_READ_ERROR_UNEXPECTED_END && !is_last) {
+ 		} else if (!is_last && (error.code == YYJSON_READ_ERROR_UNEXPECTED_END || remaining - error.pos < END_BOUND)) {
  			// Copy remaining to reconstruct_buffer
  			const auto reconstruct_ptr = reconstruct_buffer.get();
  			memcpy(reconstruct_ptr, obj_copy_start, remaining);
@@ -504,11 +504,32 @@ LogicalType TransformStringToLogicalType(const string &str) {
  	return Parser::ParseColumnList("dummy " + str).GetColumn(LogicalIndex(0)).Type();
  }

+ LogicalType GetUserTypeRecursive(const LogicalType &type, ClientContext &context) {
+ 	if (type.id() == LogicalTypeId::USER && type.HasAlias()) {
+ 		return Catalog::GetSystemCatalog(context).GetType(context, SYSTEM_CATALOG, DEFAULT_SCHEMA, type.GetAlias());
+ 	}
+ 	// Look for LogicalTypeId::USER in nested types
+ 	if (type.id() == LogicalTypeId::STRUCT) {
+ 		child_list_t<LogicalType> children;
+ 		children.reserve(StructType::GetChildCount(type));
+ 		for (auto &child : StructType::GetChildTypes(type)) {
+ 			children.emplace_back(child.first, GetUserTypeRecursive(child.second, context));
+ 		}
+ 		return LogicalType::STRUCT(std::move(children));
+ 	}
+ 	if (type.id() == LogicalTypeId::LIST) {
+ 		return LogicalType::LIST(GetUserTypeRecursive(ListType::GetChildType(type), context));
+ 	}
+ 	if (type.id() == LogicalTypeId::MAP) {
+ 		return LogicalType::MAP(GetUserTypeRecursive(MapType::KeyType(type), context),
+ 		                        GetUserTypeRecursive(MapType::ValueType(type), context));
+ 	}
+ 	// Not LogicalTypeId::USER or a nested type
+ 	return type;
+ }
+
  LogicalType TransformStringToLogicalType(const string &str, ClientContext &context) {
- 	auto type = TransformStringToLogicalType(str);
- 	return type.id() == LogicalTypeId::USER
- 	           ? Catalog::GetSystemCatalog(context).GetType(context, SYSTEM_CATALOG, DEFAULT_SCHEMA, str)
- 	           : type;
+ 	return GetUserTypeRecursive(TransformStringToLogicalType(str), context);
  }

  bool LogicalType::IsIntegral() const {
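`TransformStringToLogicalType` previously looked a type string up in the catalog only when the whole type was `LogicalTypeId::USER`; `GetUserTypeRecursive` now also resolves user types nested inside `STRUCT`, `LIST`, and `MAP`. One plausible motivating case (an assumption, not stated in the diff) is the `JSON` type alias registered by the json extension, which parses as a `USER` type when written inside a nested type string:

```ts
import * as duckdb from 'duckdb';

const db = new duckdb.Database(':memory:');

// 'JSON[]' parses to LIST(USER("JSON")); with the recursive resolution the
// element type should now be looked up in the catalog, where previously
// only a bare 'JSON' would have been resolved.
db.all(
  "SELECT * FROM read_json('nested.ndjson', " +
    "columns={id: 'INTEGER', items: 'JSON[]'}, format='newline_delimited')",
  (err, rows) => {
    if (err) throw err;
    console.log(rows[0].items);
  }
);
```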
@@ -888,18 +909,23 @@ void LogicalType::SetAlias(string alias) {
  }

  string LogicalType::GetAlias() const {
- 	if (!type_info_) {
- 		return string();
- 	} else {
+ 	if (id() == LogicalTypeId::USER) {
+ 		return UserType::GetTypeName(*this);
+ 	}
+ 	if (type_info_) {
  		return type_info_->alias;
  	}
+ 	return string();
  }

  bool LogicalType::HasAlias() const {
- 	if (!type_info_) {
- 		return false;
+ 	if (id() == LogicalTypeId::USER) {
+ 		return !UserType::GetTypeName(*this).empty();
+ 	}
+ 	if (type_info_ && !type_info_->alias.empty()) {
+ 		return true;
  	}
- 	return !type_info_->alias.empty();
+ 	return false;
  }

  void LogicalType::SetCatalog(LogicalType &type, TypeCatalogEntry *catalog_entry) {
@@ -721,6 +721,9 @@ void BufferedCSVReader::DetectHeader(const vector<vector<LogicalType>> &best_sql
  			names.push_back(column_name);
  		}
  	}
+ 	for (idx_t i = 0; i < MinValue<idx_t>(names.size(), options.name_list.size()); i++) {
+ 		names[i] = options.name_list[i];
+ 	}
  }

  vector<LogicalType> BufferedCSVReader::RefineTypeDetection(const vector<LogicalType> &type_candidates,
@@ -99,6 +99,17 @@ static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, TableFunctio
  			if (names.empty()) {
  				throw BinderException("read_csv requires at least a single column as input!");
  			}
+ 		} else if (loption == "column_names" || loption == "names") {
+ 			if (!options.name_list.empty()) {
+ 				throw BinderException("read_csv_auto column_names/names can only be supplied once");
+ 			}
+ 			if (kv.second.IsNull()) {
+ 				throw BinderException("read_csv_auto %s cannot be NULL", kv.first);
+ 			}
+ 			auto &children = ListValue::GetChildren(kv.second);
+ 			for (auto &child : children) {
+ 				options.name_list.push_back(StringValue::Get(child));
+ 			}
  		} else if (loption == "column_types" || loption == "types" || loption == "dtypes") {
  			auto &child_type = kv.second.type();
  			if (child_type.id() != LogicalTypeId::STRUCT && child_type.id() != LogicalTypeId::LIST) {
@@ -961,6 +972,8 @@ TableFunction ReadCSVTableFunction::GetAutoFunction(bool list_parameter) {
  	read_csv_auto.named_parameters["column_types"] = LogicalType::ANY;
  	read_csv_auto.named_parameters["dtypes"] = LogicalType::ANY;
  	read_csv_auto.named_parameters["types"] = LogicalType::ANY;
+ 	read_csv_auto.named_parameters["names"] = LogicalType::LIST(LogicalType::VARCHAR);
+ 	read_csv_auto.named_parameters["column_names"] = LogicalType::LIST(LogicalType::VARCHAR);
  	return read_csv_auto;
  }

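`read_csv_auto` gains `names`/`column_names` parameters that override detected column names positionally in `DetectHeader`; columns beyond the supplied list keep their detected or generated names. A sketch (hypothetical headerless file):

```ts
import * as duckdb from 'duckdb';

const db = new duckdb.Database(':memory:');

// data.csv has no header row:
//   1,foo
//   2,bar
db.all(
  "SELECT id, label FROM read_csv_auto('data.csv', names=['id', 'label'])",
  (err, rows) => {
    if (err) throw err;
    console.log(rows); // expected: [{ id: 1, label: 'foo' }, { id: 2, label: 'bar' }]
  }
);
```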
@@ -1,8 +1,8 @@
  #ifndef DUCKDB_VERSION
- #define DUCKDB_VERSION "0.7.1-dev107"
+ #define DUCKDB_VERSION "0.7.1-dev137"
  #endif
  #ifndef DUCKDB_SOURCE_ID
- #define DUCKDB_SOURCE_ID "8a8581e2e3"
+ #define DUCKDB_SOURCE_ID "eb65c593fe"
  #endif
  #include "duckdb/function/table/system_functions.hpp"
  #include "duckdb/main/database.hpp"
@@ -75,6 +75,8 @@ struct BufferedCSVReaderOptions {
  	case_insensitive_map_t<idx_t> sql_types_per_column;
  	//! User-defined SQL type list
  	vector<LogicalType> sql_type_list;
+ 	//! User-defined name list
+ 	vector<string> name_list;
  	//===--------------------------------------------------------------------===//
  	// ReadCSVOptions
  	//===--------------------------------------------------------------------===//
@@ -11,7 +11,7 @@
  #include "duckdb/common/common.hpp"
  #include "duckdb/common/enums/output_type.hpp"
  #include "duckdb/common/types/value.hpp"
- #include "duckdb/common/unordered_map.hpp"
+ #include "duckdb/common/case_insensitive_map.hpp"
  #include "duckdb/common/atomic.hpp"

  namespace duckdb {
@@ -39,7 +39,7 @@ struct ClientData {
  	//! The set of temporary objects that belong to this client
  	shared_ptr<AttachedDatabase> temporary_objects;
  	//! The set of bound prepared statements that belong to this client
- 	unordered_map<string, shared_ptr<PreparedStatementData>> prepared_statements;
+ 	case_insensitive_map_t<shared_ptr<PreparedStatementData>> prepared_statements;

  	//! The writer used to log queries (if logging is enabled)
  	unique_ptr<BufferedFileWriter> log_query_writer;
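Switching `prepared_statements` to a `case_insensitive_map_t` makes prepared-statement names case-insensitive, in line with other identifiers. A sketch of the behavior this should allow:

```ts
import * as duckdb from 'duckdb';

const db = new duckdb.Database(':memory:');
const con = db.connect();

con.exec("PREPARE GetAnswer AS SELECT 42 AS answer", (err) => {
  if (err) throw err;
  // The statement was stored under 'GetAnswer'; with a case-insensitive
  // map, a differently cased name should now find the same entry.
  con.all("EXECUTE getanswer", (err2, rows) => {
    if (err2) throw err2;
    console.log(rows[0].answer); // 42
  });
});
```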
@@ -11,16 +11,6 @@ CopyStatement::CopyStatement(const CopyStatement &other) : SQLStatement(other),
  	}
  }

- string ConvertOptionValueToString(const Value &val) {
- 	auto type = val.type().id();
- 	switch (type) {
- 	case LogicalTypeId::VARCHAR:
- 		return KeywordHelper::WriteOptionallyQuoted(val.ToString());
- 	default:
- 		return val.ToString();
- 	}
- }
-
  string CopyStatement::CopyOptionsToString(const string &format,
                                            const case_insensitive_map_t<vector<Value>> &options) const {
  	if (format.empty() && options.empty()) {
@@ -45,15 +35,14 @@ string CopyStatement::CopyOptionsToString(const string &format,
  			// Options like HEADER don't need an explicit value
  			// just providing the name already sets it to true
  		} else if (values.size() == 1) {
- 			result += ConvertOptionValueToString(values[0]);
+ 			result += values[0].ToSQLString();
  		} else {
  			result += "( ";
  			for (idx_t i = 0; i < values.size(); i++) {
- 				auto &value = values[i];
  				if (i) {
  					result += ", ";
  				}
- 				result += KeywordHelper::WriteOptionallyQuoted(value.ToString());
+ 				result += values[i].ToSQLString();
  			}
  			result += " )";
  		}
@@ -9,16 +9,16 @@ unique_ptr<ParsedExpression> Transformer::TransformCase(duckdb_libpgquery::PGCas
  	D_ASSERT(root);

  	auto case_node = make_unique<CaseExpression>();
+ 	auto root_arg = TransformExpression(reinterpret_cast<duckdb_libpgquery::PGNode *>(root->arg));
  	for (auto cell = root->args->head; cell != nullptr; cell = cell->next) {
  		CaseCheck case_check;

  		auto w = reinterpret_cast<duckdb_libpgquery::PGCaseWhen *>(cell->data.ptr_value);
  		auto test_raw = TransformExpression(reinterpret_cast<duckdb_libpgquery::PGNode *>(w->expr));
  		unique_ptr<ParsedExpression> test;
- 		auto arg = TransformExpression(reinterpret_cast<duckdb_libpgquery::PGNode *>(root->arg));
- 		if (arg) {
+ 		if (root_arg) {
  			case_check.when_expr =
- 			    make_unique<ComparisonExpression>(ExpressionType::COMPARE_EQUAL, std::move(arg), std::move(test_raw));
+ 			    make_unique<ComparisonExpression>(ExpressionType::COMPARE_EQUAL, root_arg->Copy(), std::move(test_raw));
  		} else {
  			case_check.when_expr = std::move(test_raw);
  		}
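The `CASE` fix transforms the operand (`root->arg`) once, before the loop, and copies it into the equality built for each `WHEN` arm; previously the operand was re-transformed per arm. A simple `CASE` with several arms is exactly the shape that exercises the copied operand:

```ts
import * as duckdb from 'duckdb';

const db = new duckdb.Database(':memory:');

// 'CASE n % 3 WHEN ...' desugars to one (n % 3) = value comparison per arm,
// so the single transformed operand is copied into every branch.
db.all(
  "SELECT CASE n % 3 WHEN 0 THEN 'zero' WHEN 1 THEN 'one' ELSE 'two' END AS tag " +
    "FROM range(6) t(n)",
  (err, rows) => {
    if (err) throw err;
    console.log(rows.map((r) => r.tag)); // ['zero','one','two','zero','one','two']
  }
);
```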