duckdb 0.6.2-dev1978.0 → 0.6.2-dev2015.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. package/package.json +1 -1
  2. package/src/duckdb/extension/json/buffered_json_reader.cpp +132 -18
  3. package/src/duckdb/extension/json/include/buffered_json_reader.hpp +29 -9
  4. package/src/duckdb/extension/json/include/json_common.hpp +56 -0
  5. package/src/duckdb/extension/json/include/json_functions.hpp +9 -0
  6. package/src/duckdb/extension/json/include/json_scan.hpp +115 -25
  7. package/src/duckdb/extension/json/include/json_structure.hpp +73 -0
  8. package/src/duckdb/extension/json/include/json_transform.hpp +57 -0
  9. package/src/duckdb/extension/json/json-extension.cpp +3 -0
  10. package/src/duckdb/extension/json/json_functions/json_contains.cpp +1 -1
  11. package/src/duckdb/extension/json/json_functions/json_create.cpp +6 -10
  12. package/src/duckdb/extension/json/json_functions/json_extract.cpp +1 -1
  13. package/src/duckdb/extension/json/json_functions/json_keys.cpp +60 -0
  14. package/src/duckdb/extension/json/json_functions/json_structure.cpp +404 -150
  15. package/src/duckdb/extension/json/json_functions/json_transform.cpp +216 -60
  16. package/src/duckdb/extension/json/json_functions/read_json.cpp +224 -0
  17. package/src/duckdb/extension/json/json_functions/read_json_objects.cpp +6 -6
  18. package/src/duckdb/extension/json/json_functions.cpp +25 -0
  19. package/src/duckdb/extension/json/json_scan.cpp +192 -86
  20. package/src/duckdb/extension/json/yyjson/include/yyjson.hpp +18 -9
  21. package/src/duckdb/extension/json/yyjson/yyjson.cpp +58 -13
  22. package/src/duckdb/src/function/table/copy_csv.cpp +16 -11
  23. package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
  24. package/src/duckdb/src/include/duckdb/function/scalar/strftime.hpp +2 -2
  25. package/src/duckdb/src/include/duckdb/main/extension_functions.hpp +5 -0
  26. package/src/duckdb/ub_extension_json_json_functions.cpp +4 -0
package/package.json CHANGED
@@ -2,7 +2,7 @@
  "name": "duckdb",
  "main": "./lib/duckdb.js",
  "types": "./lib/duckdb.d.ts",
- "version": "0.6.2-dev1978.0",
+ "version": "0.6.2-dev2015.0",
  "description": "DuckDB node.js API",
  "gypfile": true,
  "dependencies": {
package/src/duckdb/extension/json/buffered_json_reader.cpp CHANGED
@@ -2,7 +2,6 @@

  #include "duckdb/common/field_writer.hpp"
  #include "duckdb/common/file_opener.hpp"
- #include "duckdb/common/file_system.hpp"
  #include "duckdb/common/printer.hpp"

  namespace duckdb {
@@ -20,13 +19,13 @@ void BufferedJSONReaderOptions::Deserialize(FieldReader &reader) {
  }

  JSONBufferHandle::JSONBufferHandle(idx_t buffer_index_p, idx_t readers_p, AllocatedData &&buffer_p, idx_t buffer_size_p)
- : buffer_index(buffer_index_p), readers(readers_p), buffer(move(buffer_p)), buffer_size(buffer_size_p) {
+ : buffer_index(buffer_index_p), readers(readers_p), buffer(std::move(buffer_p)), buffer_size(buffer_size_p) {
  }

- JSONFileHandle::JSONFileHandle(unique_ptr<FileHandle> file_handle_p)
- : file_handle(move(file_handle_p)), can_seek(file_handle->CanSeek()),
- plain_file_source(file_handle->OnDiskFile() && can_seek), file_size(file_handle->GetFileSize()),
- read_position(0) {
+ JSONFileHandle::JSONFileHandle(unique_ptr<FileHandle> file_handle_p, Allocator &allocator_p)
+ : file_handle(std::move(file_handle_p)), allocator(allocator_p), can_seek(file_handle->CanSeek()),
+ plain_file_source(file_handle->OnDiskFile() && can_seek), file_size(file_handle->GetFileSize()), read_position(0),
+ cached_size(0) {
  }

  idx_t JSONFileHandle::FileSize() const {
@@ -37,12 +36,16 @@ idx_t JSONFileHandle::Remaining() const {
  return file_size - read_position;
  }

+ bool JSONFileHandle::PlainFileSource() const {
+ return plain_file_source;
+ }
+
  bool JSONFileHandle::CanSeek() const {
  return can_seek;
  }

- bool JSONFileHandle::PlainFileSource() const {
- return plain_file_source;
+ void JSONFileHandle::Seek(idx_t position) {
+ file_handle->Seek(position);
  }

  idx_t JSONFileHandle::GetPositionAndSize(idx_t &position, idx_t requested_size) {
@@ -53,22 +56,82 @@ idx_t JSONFileHandle::GetPositionAndSize(idx_t &position, idx_t requested_size)
  return actual_size;
  }

- void JSONFileHandle::ReadAtPosition(const char *pointer, idx_t size, idx_t position) {
+ void JSONFileHandle::ReadAtPosition(const char *pointer, idx_t size, idx_t position, bool sample_run) {
  D_ASSERT(size != 0);
- file_handle->Read((void *)pointer, size, position);
+ if (plain_file_source) {
+ file_handle->Read((void *)pointer, size, position);
+ return;
+ }
+
+ if (sample_run) { // Cache the buffer
+ file_handle->Read((void *)pointer, size, position);
+ cached_buffers.emplace_back(allocator.Allocate(size));
+ memcpy(cached_buffers.back().get(), pointer, size);
+ cached_size += size;
+ return;
+ }
+
+ if (!cached_buffers.empty() || position < cached_size) {
+ ReadFromCache(pointer, size, position);
+ }
+ if (size != 0) {
+ file_handle->Read((void *)pointer, size, position);
+ }
  }

- idx_t JSONFileHandle::Read(const char *pointer, idx_t requested_size) {
+ idx_t JSONFileHandle::Read(const char *pointer, idx_t requested_size, bool sample_run) {
  D_ASSERT(requested_size != 0);
- auto actual_size = file_handle->Read((void *)pointer, requested_size);
- read_position += actual_size;
+ if (plain_file_source) {
+ auto actual_size = file_handle->Read((void *)pointer, requested_size);
+ read_position += actual_size;
+ return actual_size;
+ }
+
+ if (sample_run) { // Cache the buffer
+ auto actual_size = file_handle->Read((void *)pointer, requested_size);
+ if (actual_size > 0) {
+ cached_buffers.emplace_back(allocator.Allocate(actual_size));
+ memcpy(cached_buffers.back().get(), pointer, actual_size);
+ }
+ cached_size += actual_size;
+ read_position += actual_size;
+ return actual_size;
+ }
+
+ idx_t actual_size = 0;
+ if (!cached_buffers.empty() || read_position < cached_size) {
+ actual_size += ReadFromCache(pointer, requested_size, read_position);
+ }
+ if (requested_size != 0) {
+ actual_size += file_handle->Read((void *)pointer, requested_size);
+ }
  return actual_size;
  }

- BufferedJSONReader::BufferedJSONReader(ClientContext &context, BufferedJSONReaderOptions options_p, idx_t file_index_p,
- string file_path_p)
- : file_index(file_index_p), file_path(std::move(file_path_p)), context(context), options(std::move(options_p)),
- buffer_index(0) {
+ idx_t JSONFileHandle::ReadFromCache(const char *&pointer, idx_t &size, idx_t &position) {
+ idx_t read_size = 0;
+ idx_t total_offset = 0;
+ for (auto &cached_buffer : cached_buffers) {
+ if (size == 0) {
+ break;
+ }
+ if (position < total_offset + cached_buffer.GetSize()) {
+ idx_t within_buffer_offset = position - total_offset;
+ idx_t copy_size = MinValue<idx_t>(size, cached_buffer.GetSize() - within_buffer_offset);
+ memcpy((void *)pointer, cached_buffer.get() + within_buffer_offset, copy_size);
+
+ read_size += copy_size;
+ pointer += copy_size;
+ size -= copy_size;
+ position += copy_size;
+ }
+ total_offset += cached_buffer.GetSize();
+ }
+ return read_size;
+ }
+
+ BufferedJSONReader::BufferedJSONReader(ClientContext &context, BufferedJSONReaderOptions options_p, string file_path_p)
+ : file_path(std::move(file_path_p)), context(context), options(std::move(options_p)), buffer_index(0) {
  }

  void BufferedJSONReader::OpenJSONFile() {
@@ -77,7 +140,7 @@ void BufferedJSONReader::OpenJSONFile() {
  auto file_opener = FileOpener::Get(context);
  auto regular_file_handle = file_system.OpenFile(file_path.c_str(), FileFlags::FILE_FLAGS_READ,
  FileLockType::NO_LOCK, options.compression, file_opener);
- file_handle = make_unique<JSONFileHandle>(std::move(regular_file_handle));
+ file_handle = make_unique<JSONFileHandle>(std::move(regular_file_handle), BufferAllocator::Get(context));
  }

  bool BufferedJSONReader::IsOpen() {
@@ -113,9 +176,40 @@ AllocatedData BufferedJSONReader::RemoveBuffer(idx_t buffer_idx) {
  }

  idx_t BufferedJSONReader::GetBufferIndex() {
+ buffer_line_or_object_counts.push_back(-1);
  return buffer_index++;
  }

+ void BufferedJSONReader::SetBufferLineOrObjectCount(idx_t index, idx_t count) {
+ lock_guard<mutex> guard(lock);
+ buffer_line_or_object_counts[index] = count;
+ }
+
+ void BufferedJSONReader::ThrowParseError(idx_t buf_index, idx_t line_or_object_in_buf, yyjson_read_err &err,
+ const string &extra) {
+ D_ASSERT(options.format == JSONFormat::UNSTRUCTURED || options.format == JSONFormat::NEWLINE_DELIMITED);
+ while (true) {
+ lock_guard<mutex> guard(lock);
+ idx_t line = line_or_object_in_buf;
+ bool can_throw = true;
+ for (idx_t b_idx = 0; b_idx < buf_index; b_idx++) {
+ if (buffer_line_or_object_counts[b_idx] == -1) {
+ can_throw = false;
+ break;
+ } else {
+ line += buffer_line_or_object_counts[b_idx];
+ }
+ }
+ if (!can_throw) {
+ continue;
+ }
+ string unit = options.format == JSONFormat::NEWLINE_DELIMITED ? "line" : "object";
+ // SQL uses 1-based indexing so I guess we will do that in our exception here as well
+ throw InvalidInputException("Malformed JSON in file \"%s\", at byte %llu in %s %llu: %s. %s", file_path,
+ err.pos + 1, unit, line + 1, err.msg, extra);
+ }
+ }
+
  double BufferedJSONReader::GetProgress() const {
  if (file_handle) {
  return 100.0 * double(file_handle->Remaining()) / double(file_handle->FileSize());
@@ -124,4 +218,24 @@ double BufferedJSONReader::GetProgress() const {
  }
  }

+ void BufferedJSONReader::Reset() {
+ buffer_index = 0;
+ buffer_map.clear();
+ buffer_line_or_object_counts.clear();
+
+ if (file_handle->CanSeek()) {
+ file_handle->Seek(0);
+ } else {
+ file_handle->Reset();
+ }
+ file_handle->Reset();
+ }
+
+ void JSONFileHandle::Reset() {
+ read_position = 0;
+ if (plain_file_source) {
+ file_handle->Reset();
+ }
+ }
+
  } // namespace duckdb
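
Note: the caching added to JSONFileHandle above supports replaying a non-seekable stream (for example gzipped or piped input) after a sampling pass: reads made with sample_run are copied into cached_buffers, Reset() rewinds the handle, and later reads are served from the cache before falling back to the underlying file handle. A minimal standalone sketch of that cache-and-replay pattern follows, using a hypothetical CachingReader class rather than DuckDB's actual API:

#include <algorithm>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>

// Hypothetical stand-in for a forward-only stream plus a replay cache, mirroring
// the sample_run / Reset() flow added to JSONFileHandle (not DuckDB's classes).
class CachingReader {
public:
	explicit CachingReader(std::string src) : source(std::move(src)) {}

	// Sample pass: consume bytes from the source and keep a copy in the cache.
	size_t ReadAndCache(char *out, size_t size) {
		size_t actual = std::min(size, source.size() - source_pos);
		std::memcpy(out, source.data() + source_pos, actual);
		cache.insert(cache.end(), out, out + actual);
		source_pos += actual;
		return actual;
	}

	// Full pass after Reset(): serve cached bytes first, then keep reading the source.
	size_t Read(char *out, size_t size) {
		size_t total = 0;
		if (read_pos < cache.size()) {
			size_t from_cache = std::min(size, cache.size() - read_pos);
			std::memcpy(out, cache.data() + read_pos, from_cache);
			read_pos += from_cache;
			total += from_cache;
		}
		size_t remaining = size - total;
		if (remaining != 0 && source_pos < source.size()) {
			size_t actual = std::min(remaining, source.size() - source_pos);
			std::memcpy(out + total, source.data() + source_pos, actual);
			source_pos += actual;
			read_pos += actual;
			total += actual;
		}
		return total;
	}

	// Rewind to the start; cached bytes are replayed instead of re-read from the source.
	void Reset() {
		read_pos = 0;
	}

private:
	std::string source;
	size_t source_pos = 0;
	std::vector<char> cache;
	size_t read_pos = 0;
};

int main() {
	CachingReader reader("{\"a\": 1}\n{\"a\": 2}\n");
	char sample[8];
	std::cout << "sampled " << reader.ReadAndCache(sample, sizeof(sample)) << " bytes\n";
	reader.Reset();
	char full[64];
	std::cout << "replayed + read " << reader.Read(full, sizeof(full)) << " bytes\n";
}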
package/src/duckdb/extension/json/include/buffered_json_reader.hpp CHANGED
@@ -10,13 +10,12 @@

  #include "duckdb/common/atomic.hpp"
  #include "duckdb/common/enums/file_compression_type.hpp"
+ #include "duckdb/common/file_system.hpp"
  #include "duckdb/common/mutex.hpp"
  #include "json_common.hpp"

  namespace duckdb {

- struct FileHandle;
-
  enum class JSONFormat : uint8_t {
  //! Auto-detect format (UNSTRUCTURED / NEWLINE_DELIMITED)
  AUTO_DETECT = 0,
@@ -58,21 +57,28 @@ public:

  struct JSONFileHandle {
  public:
- explicit JSONFileHandle(unique_ptr<FileHandle> file_handle);
+ JSONFileHandle(unique_ptr<FileHandle> file_handle, Allocator &allocator);

  idx_t FileSize() const;
  idx_t Remaining() const;

- bool CanSeek() const;
  bool PlainFileSource() const;
+ bool CanSeek() const;
+ void Seek(idx_t position);

  idx_t GetPositionAndSize(idx_t &position, idx_t requested_size);
- void ReadAtPosition(const char *pointer, idx_t size, idx_t position);
- idx_t Read(const char *pointer, idx_t requested_size);
+ void ReadAtPosition(const char *pointer, idx_t size, idx_t position, bool sample_run);
+ idx_t Read(const char *pointer, idx_t requested_size, bool sample_run);
+
+ void Reset();
+
+ private:
+ idx_t ReadFromCache(const char *&pointer, idx_t &size, idx_t &position);

  private:
  //! The JSON file handle
  unique_ptr<FileHandle> file_handle;
+ Allocator &allocator;

  //! File properties
  const bool can_seek;
@@ -81,11 +87,15 @@ private:

  //! Read properties
  idx_t read_position;
+
+ //! Cached buffers for resetting when reading stream
+ vector<AllocatedData> cached_buffers;
+ idx_t cached_size;
  };

  class BufferedJSONReader {
  public:
- BufferedJSONReader(ClientContext &context, BufferedJSONReaderOptions options, idx_t file_index, string file_path);
+ BufferedJSONReader(ClientContext &context, BufferedJSONReaderOptions options, string file_path);

  void OpenJSONFile();
  bool IsOpen();
@@ -93,18 +103,25 @@ public:
  BufferedJSONReaderOptions &GetOptions();
  JSONFileHandle &GetFileHandle() const;

+ //! Insert/get/remove buffer (grabs the lock)
  void InsertBuffer(idx_t buffer_idx, unique_ptr<JSONBufferHandle> &&buffer);
  JSONBufferHandle *GetBuffer(idx_t buffer_idx);
  AllocatedData RemoveBuffer(idx_t buffer_idx);
+
+ //! Get a new buffer index (must hold the lock)
  idx_t GetBufferIndex();
+ //! Set line count for a buffer that is done (grabs the lock)
+ void SetBufferLineOrObjectCount(idx_t index, idx_t count);
+ //! Throws an error that mentions the file name and line number
+ void ThrowParseError(idx_t buf_index, idx_t line_or_object_in_buf, yyjson_read_err &err, const string &extra = "");

  double GetProgress() const;
+ void Reset();

  public:
  mutex lock;

- //! File index / path
- const idx_t file_index;
+ //! File path
  const string file_path;

  private:
@@ -118,6 +135,9 @@ private:
  idx_t buffer_index;
  //! Mapping from batch index to currently held buffers
  unordered_map<idx_t, unique_ptr<JSONBufferHandle>> buffer_map;
+
+ //! Line count per buffer
+ vector<int64_t> buffer_line_or_object_counts;
  };

  } // namespace duckdb
package/src/duckdb/extension/json/include/json_common.hpp CHANGED
@@ -50,6 +50,37 @@ private:
  yyjson_alc yyjson_allocator;
  };

+ struct JSONKey {
+ const char *ptr;
+ size_t len;
+ };
+
+ struct JSONKeyHash {
+ inline std::size_t operator()(const JSONKey &k) const {
+ size_t result;
+ if (k.len >= sizeof(size_t)) {
+ memcpy(&result, k.ptr + k.len - sizeof(size_t), sizeof(size_t));
+ } else {
+ result = 0;
+ duckdb::FastMemcpy(&result, k.ptr, k.len);
+ }
+ return result;
+ }
+ };
+
+ struct JSONKeyEquality {
+ inline bool operator()(const JSONKey &a, const JSONKey &b) const {
+ if (a.len != b.len) {
+ return false;
+ }
+ return duckdb::FastMemcmp(a.ptr, b.ptr, a.len) == 0;
+ }
+ };
+
+ template <typename T>
+ using json_key_map_t = unordered_map<JSONKey, T, JSONKeyHash, JSONKeyEquality>;
+ using json_key_set_t = unordered_set<JSONKey, JSONKeyHash, JSONKeyEquality>;
+
  struct JSONCommon {
  public:
  static constexpr auto JSON_TYPE_NAME = "JSON";
@@ -111,6 +142,31 @@ public:
  return string_t(ValTypeToString<YYJSON_VAL_T>(val));
  }

+ template <class YYJSON_VAL_T>
+ static inline const LogicalTypeId ValTypeToLogicalTypeId(YYJSON_VAL_T *val) {
+ switch (GetTag<YYJSON_VAL_T>(val)) {
+ case YYJSON_TYPE_NULL | YYJSON_SUBTYPE_NONE:
+ return LogicalTypeId::SQLNULL;
+ case YYJSON_TYPE_STR | YYJSON_SUBTYPE_NONE:
+ return LogicalTypeId::VARCHAR;
+ case YYJSON_TYPE_ARR | YYJSON_SUBTYPE_NONE:
+ return LogicalTypeId::LIST;
+ case YYJSON_TYPE_OBJ | YYJSON_SUBTYPE_NONE:
+ return LogicalTypeId::STRUCT;
+ case YYJSON_TYPE_BOOL | YYJSON_SUBTYPE_TRUE:
+ case YYJSON_TYPE_BOOL | YYJSON_SUBTYPE_FALSE:
+ return LogicalTypeId::BOOLEAN;
+ case YYJSON_TYPE_NUM | YYJSON_SUBTYPE_UINT:
+ return LogicalTypeId::UBIGINT;
+ case YYJSON_TYPE_NUM | YYJSON_SUBTYPE_SINT:
+ return LogicalTypeId::BIGINT;
+ case YYJSON_TYPE_NUM | YYJSON_SUBTYPE_REAL:
+ return LogicalTypeId::DOUBLE;
+ default:
+ throw InternalException("Unexpected yyjson tag in ValTypeToLogicalTypeId");
+ }
+ }
+
  public:
  static inline yyjson_mut_doc *CreateDocument(yyjson_alc *alc) {
  D_ASSERT(alc);
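
Note: the JSONKey additions above let JSON object keys act as map keys through raw (pointer, length) views, avoiding a string allocation per key; the hash folds the trailing bytes of the key into a size_t, and equality compares length plus bytes. A minimal standalone analogue, with hypothetical KeyView names and plain memcmp/memcpy in place of DuckDB's FastMemcmp/FastMemcpy:

#include <algorithm>
#include <cstring>
#include <iostream>
#include <string>
#include <unordered_map>

// Non-owning (pointer, length) key, analogous to JSONKey: object keys can be
// looked up without materializing a std::string for every key.
struct KeyView {
	const char *ptr;
	size_t len;
};

struct KeyViewHash {
	size_t operator()(const KeyView &k) const {
		// Cheap hash in the spirit of JSONKeyHash: fold up to sizeof(size_t)
		// trailing bytes of the key into a size_t.
		size_t result = 0;
		size_t n = std::min(k.len, sizeof(size_t));
		std::memcpy(&result, k.ptr + (k.len - n), n);
		return result;
	}
};

struct KeyViewEquality {
	bool operator()(const KeyView &a, const KeyView &b) const {
		return a.len == b.len && std::memcmp(a.ptr, b.ptr, a.len) == 0;
	}
};

int main() {
	std::unordered_map<KeyView, int, KeyViewHash, KeyViewEquality> column_ids;
	std::string doc = R"({"id": 1, "name": "duck"})"; // keys point into this buffer
	column_ids[{doc.data() + 2, 2}] = 0;  // "id"
	column_ids[{doc.data() + 11, 4}] = 1; // "name"

	KeyView probe{"name", 4};
	std::cout << "column index of \"name\": " << column_ids.at(probe) << '\n';
}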
package/src/duckdb/extension/json/include/json_functions.hpp CHANGED
@@ -14,6 +14,8 @@

  namespace duckdb {

+ class TableRef;
+ struct ReplacementScanData;
  class CastFunctionSet;
  struct CastParameters;

@@ -62,6 +64,8 @@ class JSONFunctions {
  public:
  static vector<CreateScalarFunctionInfo> GetScalarFunctions();
  static vector<CreateTableFunctionInfo> GetTableFunctions();
+ static unique_ptr<TableRef> ReadJSONReplacement(ClientContext &context, const string &table_name,
+ ReplacementScanData *data);
  static void RegisterCastFunctions(CastFunctionSet &casts);

  private:
@@ -82,6 +86,7 @@ private:

  static CreateScalarFunctionInfo GetArrayLengthFunction();
  static CreateScalarFunctionInfo GetContainsFunction();
+ static CreateScalarFunctionInfo GetKeysFunction();
  static CreateScalarFunctionInfo GetTypeFunction();
  static CreateScalarFunctionInfo GetValidFunction();

@@ -97,6 +102,10 @@ private:
  // Table functions
  static CreateTableFunctionInfo GetReadJSONObjectsFunction();
  static CreateTableFunctionInfo GetReadNDJSONObjectsFunction();
+ static CreateTableFunctionInfo GetReadJSONFunction();
+ static CreateTableFunctionInfo GetReadNDJSONFunction();
+ static CreateTableFunctionInfo GetReadJSONAutoFunction();
+ static CreateTableFunctionInfo GetReadNDJSONAutoFunction();
  };

  } // namespace duckdb
package/src/duckdb/extension/json/include/json_scan.hpp CHANGED
@@ -10,11 +10,59 @@

  #include "buffered_json_reader.hpp"
  #include "duckdb/common/mutex.hpp"
+ #include "duckdb/function/scalar/strftime.hpp"
  #include "duckdb/function/table_function.hpp"
+ #include "json_transform.hpp"

  namespace duckdb {

- struct JSONScanLocalState;
+ enum class JSONScanType : uint8_t {
+ INVALID = 0,
+ //! Read JSON straight to columnar data
+ READ_JSON = 1,
+ //! Read JSON objects as strings
+ READ_JSON_OBJECTS = 2,
+ //! Sample run for schema detection
+ SAMPLE = 3,
+ };
+
+ //! Even though LogicalTypeId is just a uint8_t, this is still needed ...
+ struct LogicalTypeIdHash {
+ inline std::size_t operator()(const LogicalTypeId &id) const {
+ return (size_t)id;
+ }
+ };
+
+ struct DateFormatMap {
+ public:
+ void Initialize(const unordered_map<LogicalTypeId, vector<const char *>, LogicalTypeIdHash> &format_templates) {
+ for (const auto &entry : format_templates) {
+ auto &formats = candidate_formats.emplace(entry.first, vector<StrpTimeFormat>()).first->second;
+ formats.reserve(entry.second.size());
+ for (const auto &format : entry.second) {
+ formats.emplace_back();
+ formats.back().format_specifier = format;
+ StrpTimeFormat::ParseFormatSpecifier(formats.back().format_specifier, formats.back());
+ }
+ }
+ }
+
+ bool HasFormats(LogicalTypeId type) const {
+ return candidate_formats.find(type) != candidate_formats.end();
+ }
+
+ vector<StrpTimeFormat> &GetCandidateFormats(LogicalTypeId type) {
+ D_ASSERT(HasFormats(type));
+ return candidate_formats[type];
+ }
+
+ StrpTimeFormat &GetFormat(LogicalTypeId type) {
+ return candidate_formats[type].back();
+ }
+
+ private:
+ unordered_map<LogicalTypeId, vector<StrpTimeFormat>, LogicalTypeIdHash> candidate_formats;
+ };

  struct JSONScanData : public TableFunctionData {
  public:
@@ -27,6 +75,8 @@ public:
  void Deserialize(FieldReader &reader);

  public:
+ //! Scan type
+ JSONScanType type;
  //! File-specific options
  BufferedJSONReaderOptions options;
  //! The files we're reading
@@ -34,27 +84,41 @@ public:

  //! Whether or not we should ignore malformed JSON (default to NULL)
  bool ignore_errors = false;
- //! Maximum JSON object size (defaults to 1MB)
+ //! Maximum JSON object size (defaults to 1MB minimum)
  idx_t maximum_object_size = 1048576;
- //! Whether we return JSON strings (if not, we return YYJSON documents)
- bool return_json_strings = true;
+ //! Options when transforming the JSON to columnar data
+ JSONTransformOptions transform_options;
+
+ //! Whether we auto-detect a schema
+ bool auto_detect = false;
+ //! Sample size for detecting schema
+ idx_t sample_size = STANDARD_VECTOR_SIZE;
+ //! Column names (in order)
+ vector<string> names;
+ //! Max depth we go to detect nested JSON schema (defaults to unlimited)
+ idx_t max_depth = NumericLimits<idx_t>::Maximum();
+
+ //! Stored readers for when we're detecting the schema
+ vector<unique_ptr<BufferedJSONReader>> stored_readers;
+ //! Candidate date formats
+ DateFormatMap date_format_map;
  };

  struct JSONScanInfo : public TableFunctionInfo {
  public:
- explicit JSONScanInfo(JSONFormat forced_format_p, bool return_strings)
- : forced_format(forced_format_p), return_json_strings(return_strings) {
+ explicit JSONScanInfo(JSONScanType type_p = JSONScanType::INVALID, JSONFormat format_p = JSONFormat::AUTO_DETECT,
+ bool auto_detect_p = false)
+ : type(type_p), format(format_p), auto_detect(auto_detect_p) {
  }

- JSONFormat forced_format;
- bool return_json_strings;
+ JSONScanType type;
+ JSONFormat format;
+ bool auto_detect;
  };

- struct JSONScanGlobalState : public GlobalTableFunctionState {
+ struct JSONScanGlobalState {
  public:
  JSONScanGlobalState(ClientContext &context, JSONScanData &bind_data);
- static unique_ptr<GlobalTableFunctionState> Init(ClientContext &context, TableFunctionInitInput &input);
- idx_t MaxThreads() const override;

  public:
  //! Bound data
@@ -70,7 +134,7 @@ public:
  vector<unique_ptr<BufferedJSONReader>> json_readers;
  //! Current file/batch index
  idx_t file_index;
- idx_t batch_index;
+ atomic<idx_t> batch_index;

  //! Current number of threads active
  idx_t system_threads;
@@ -96,23 +160,25 @@ public:
  }
  };

- struct JSONScanLocalState : public LocalTableFunctionState {
+ struct JSONScanLocalState {
  public:
  JSONScanLocalState(ClientContext &context, JSONScanGlobalState &gstate);
- static unique_ptr<LocalTableFunctionState> Init(ExecutionContext &context, TableFunctionInitInput &input,
- GlobalTableFunctionState *global_state);
+
+ public:
  idx_t ReadNext(JSONScanGlobalState &gstate);
- idx_t GetBatchIndex() const;
+ yyjson_alc *GetAllocator();

  JSONLine lines[STANDARD_VECTOR_SIZE];
- yyjson_doc *objects[STANDARD_VECTOR_SIZE];
+ yyjson_val *objects[STANDARD_VECTOR_SIZE];

  idx_t batch_index;

  private:
- yyjson_doc *ParseLine(char *line_start, idx_t line_size, JSONLine &line, const bool &ignore_errors);
+ yyjson_val *ParseLine(char *line_start, idx_t line_size, idx_t remaining, JSONLine &line);

  private:
+ //! Bind data
+ JSONScanData &bind_data;
  //! Thread-local allocator
  JSONAllocator json_allocator;

@@ -127,6 +193,7 @@ private:
  idx_t buffer_size;
  idx_t buffer_offset;
  idx_t prev_buffer_remainder;
+ idx_t lines_or_objects_in_buffer;

  //! Buffer to reconstruct split objects
  AllocatedData reconstruct_buffer;
@@ -135,21 +202,43 @@ private:
  const char *buffer_copy_ptr;

  private:
- bool ReadNextBuffer(JSONScanGlobalState &gstate, bool &first_read);
- void ReadNextBufferSeek(JSONScanGlobalState &gstate, bool &first_read, idx_t &buffer_index);
- void ReadNextBufferNoSeek(JSONScanGlobalState &gstate, bool &first_read, idx_t &buffer_index);
+ bool ReadNextBuffer(JSONScanGlobalState &gstate);
+ void ReadNextBuffer(JSONScanGlobalState &gstate, idx_t &buffer_index);
+ void ReadNextBufferSeek(JSONScanGlobalState &gstate, idx_t &buffer_index);
+ void ReadNextBufferNoSeek(JSONScanGlobalState &gstate, idx_t &buffer_index);

  void ReconstructFirstObject(JSONScanGlobalState &gstate);

  void ReadUnstructured(idx_t &count);
- void ReadNewlineDelimited(idx_t &count, const bool &ignore_errors);
+ void ReadNewlineDelimited(idx_t &count);
+ };
+
+ struct JSONGlobalTableFunctionState : public GlobalTableFunctionState {
+ public:
+ JSONGlobalTableFunctionState(ClientContext &context, TableFunctionInitInput &input);
+ static unique_ptr<GlobalTableFunctionState> Init(ClientContext &context, TableFunctionInitInput &input);
+ idx_t MaxThreads() const override;
+
+ public:
+ JSONScanGlobalState state;
+ };
+
+ struct JSONLocalTableFunctionState : public LocalTableFunctionState {
+ public:
+ JSONLocalTableFunctionState(ClientContext &context, JSONScanGlobalState &gstate);
+ static unique_ptr<LocalTableFunctionState> Init(ExecutionContext &context, TableFunctionInitInput &input,
+ GlobalTableFunctionState *global_state);
+ idx_t GetBatchIndex() const;
+
+ public:
+ JSONScanLocalState state;
  };

  struct JSONScan {
  public:
  static double JSONScanProgress(ClientContext &context, const FunctionData *bind_data_p,
  const GlobalTableFunctionState *global_state) {
- auto &gstate = (JSONScanGlobalState &)*global_state;
+ auto &gstate = ((JSONGlobalTableFunctionState &)*global_state).state;
  double progress = 0;
  for (auto &reader : gstate.json_readers) {
  progress += reader->GetProgress();
@@ -159,14 +248,13 @@

  static idx_t JSONScanGetBatchIndex(ClientContext &context, const FunctionData *bind_data_p,
  LocalTableFunctionState *local_state, GlobalTableFunctionState *global_state) {
- auto &lstate = (JSONScanLocalState &)*local_state;
+ auto &lstate = (JSONLocalTableFunctionState &)*local_state;
  return lstate.GetBatchIndex();
  }

  static void JSONScanSerialize(FieldWriter &writer, const FunctionData *bind_data_p, const TableFunction &function) {
  auto &bind_data = (JSONScanData &)*bind_data_p;
  bind_data.Serialize(writer);
- bind_data.options.Serialize(writer);
  }

  static unique_ptr<FunctionData> JSONScanDeserialize(ClientContext &context, FieldReader &reader,
@@ -180,6 +268,7 @@ public:
  table_function.named_parameters["maximum_object_size"] = LogicalType::UINTEGER;
  table_function.named_parameters["ignore_errors"] = LogicalType::BOOLEAN;
  table_function.named_parameters["format"] = LogicalType::VARCHAR;
+ table_function.named_parameters["compression"] = LogicalType::VARCHAR;

  table_function.table_scan_progress = JSONScanProgress;
  table_function.get_batch_index = JSONScanGetBatchIndex;
@@ -187,6 +276,7 @@ public:
  table_function.serialize = JSONScanSerialize;
  table_function.deserialize = JSONScanDeserialize;

+ // TODO: might be able to do some of these
  table_function.projection_pushdown = false;
  table_function.filter_pushdown = false;
  table_function.filter_prune = false;
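
Note: the json_scan.hpp changes above also add LogicalTypeIdHash because an enum class key needs an explicit hash functor for unordered_map on some standard library versions; DateFormatMap then uses it to key candidate date/timestamp formats by type. A small standalone sketch of that pattern, with hypothetical TypeId names rather than DuckDB's LogicalTypeId:

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <unordered_map>
#include <vector>

// Hypothetical stand-in for LogicalTypeId; on some standard library versions an
// enum class cannot be an unordered_map key without an explicit hash functor,
// which is what LogicalTypeIdHash provides in the diff above.
enum class TypeId : uint8_t { DATE = 1, TIMESTAMP = 2 };

struct TypeIdHash {
	std::size_t operator()(const TypeId &id) const {
		return static_cast<std::size_t>(id);
	}
};

int main() {
	// Candidate strptime-style formats per type, in the spirit of DateFormatMap::Initialize.
	std::unordered_map<TypeId, std::vector<const char *>, TypeIdHash> candidate_formats;
	candidate_formats[TypeId::DATE] = {"%Y-%m-%d", "%d-%m-%Y"};
	candidate_formats[TypeId::TIMESTAMP] = {"%Y-%m-%dT%H:%M:%S"};
	std::cout << candidate_formats[TypeId::DATE].size() << " candidate DATE formats\n";
}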