duckdb 0.6.2-dev1978.0 → 0.6.2-dev2015.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/duckdb/extension/json/buffered_json_reader.cpp +132 -18
- package/src/duckdb/extension/json/include/buffered_json_reader.hpp +29 -9
- package/src/duckdb/extension/json/include/json_common.hpp +56 -0
- package/src/duckdb/extension/json/include/json_functions.hpp +9 -0
- package/src/duckdb/extension/json/include/json_scan.hpp +115 -25
- package/src/duckdb/extension/json/include/json_structure.hpp +73 -0
- package/src/duckdb/extension/json/include/json_transform.hpp +57 -0
- package/src/duckdb/extension/json/json-extension.cpp +3 -0
- package/src/duckdb/extension/json/json_functions/json_contains.cpp +1 -1
- package/src/duckdb/extension/json/json_functions/json_create.cpp +6 -10
- package/src/duckdb/extension/json/json_functions/json_extract.cpp +1 -1
- package/src/duckdb/extension/json/json_functions/json_keys.cpp +60 -0
- package/src/duckdb/extension/json/json_functions/json_structure.cpp +404 -150
- package/src/duckdb/extension/json/json_functions/json_transform.cpp +216 -60
- package/src/duckdb/extension/json/json_functions/read_json.cpp +224 -0
- package/src/duckdb/extension/json/json_functions/read_json_objects.cpp +6 -6
- package/src/duckdb/extension/json/json_functions.cpp +25 -0
- package/src/duckdb/extension/json/json_scan.cpp +192 -86
- package/src/duckdb/extension/json/yyjson/include/yyjson.hpp +18 -9
- package/src/duckdb/extension/json/yyjson/yyjson.cpp +58 -13
- package/src/duckdb/src/function/table/copy_csv.cpp +16 -11
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/function/scalar/strftime.hpp +2 -2
- package/src/duckdb/src/include/duckdb/main/extension_functions.hpp +5 -0
- package/src/duckdb/ub_extension_json_json_functions.cpp +4 -0
package/package.json
CHANGED
|
@@ -2,7 +2,6 @@
|
|
|
2
2
|
|
|
3
3
|
#include "duckdb/common/field_writer.hpp"
|
|
4
4
|
#include "duckdb/common/file_opener.hpp"
|
|
5
|
-
#include "duckdb/common/file_system.hpp"
|
|
6
5
|
#include "duckdb/common/printer.hpp"
|
|
7
6
|
|
|
8
7
|
namespace duckdb {
|
|
@@ -20,13 +19,13 @@ void BufferedJSONReaderOptions::Deserialize(FieldReader &reader) {
|
|
|
20
19
|
}
|
|
21
20
|
|
|
22
21
|
JSONBufferHandle::JSONBufferHandle(idx_t buffer_index_p, idx_t readers_p, AllocatedData &&buffer_p, idx_t buffer_size_p)
|
|
23
|
-
: buffer_index(buffer_index_p), readers(readers_p), buffer(move(buffer_p)), buffer_size(buffer_size_p) {
|
|
22
|
+
: buffer_index(buffer_index_p), readers(readers_p), buffer(std::move(buffer_p)), buffer_size(buffer_size_p) {
|
|
24
23
|
}
|
|
25
24
|
|
|
26
|
-
JSONFileHandle::JSONFileHandle(unique_ptr<FileHandle> file_handle_p)
|
|
27
|
-
: file_handle(move(file_handle_p)), can_seek(file_handle->CanSeek()),
|
|
28
|
-
plain_file_source(file_handle->OnDiskFile() && can_seek), file_size(file_handle->GetFileSize()),
|
|
29
|
-
|
|
25
|
+
JSONFileHandle::JSONFileHandle(unique_ptr<FileHandle> file_handle_p, Allocator &allocator_p)
|
|
26
|
+
: file_handle(std::move(file_handle_p)), allocator(allocator_p), can_seek(file_handle->CanSeek()),
|
|
27
|
+
plain_file_source(file_handle->OnDiskFile() && can_seek), file_size(file_handle->GetFileSize()), read_position(0),
|
|
28
|
+
cached_size(0) {
|
|
30
29
|
}
|
|
31
30
|
|
|
32
31
|
idx_t JSONFileHandle::FileSize() const {
|
|
@@ -37,12 +36,16 @@ idx_t JSONFileHandle::Remaining() const {
|
|
|
37
36
|
return file_size - read_position;
|
|
38
37
|
}
|
|
39
38
|
|
|
39
|
+
bool JSONFileHandle::PlainFileSource() const {
|
|
40
|
+
return plain_file_source;
|
|
41
|
+
}
|
|
42
|
+
|
|
40
43
|
bool JSONFileHandle::CanSeek() const {
|
|
41
44
|
return can_seek;
|
|
42
45
|
}
|
|
43
46
|
|
|
44
|
-
|
|
45
|
-
|
|
47
|
+
void JSONFileHandle::Seek(idx_t position) {
|
|
48
|
+
file_handle->Seek(position);
|
|
46
49
|
}
|
|
47
50
|
|
|
48
51
|
idx_t JSONFileHandle::GetPositionAndSize(idx_t &position, idx_t requested_size) {
|
|
@@ -53,22 +56,82 @@ idx_t JSONFileHandle::GetPositionAndSize(idx_t &position, idx_t requested_size)
|
|
|
53
56
|
return actual_size;
|
|
54
57
|
}
|
|
55
58
|
|
|
56
|
-
void JSONFileHandle::ReadAtPosition(const char *pointer, idx_t size, idx_t position) {
|
|
59
|
+
void JSONFileHandle::ReadAtPosition(const char *pointer, idx_t size, idx_t position, bool sample_run) {
|
|
57
60
|
D_ASSERT(size != 0);
|
|
58
|
-
|
|
61
|
+
if (plain_file_source) {
|
|
62
|
+
file_handle->Read((void *)pointer, size, position);
|
|
63
|
+
return;
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
if (sample_run) { // Cache the buffer
|
|
67
|
+
file_handle->Read((void *)pointer, size, position);
|
|
68
|
+
cached_buffers.emplace_back(allocator.Allocate(size));
|
|
69
|
+
memcpy(cached_buffers.back().get(), pointer, size);
|
|
70
|
+
cached_size += size;
|
|
71
|
+
return;
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
if (!cached_buffers.empty() || position < cached_size) {
|
|
75
|
+
ReadFromCache(pointer, size, position);
|
|
76
|
+
}
|
|
77
|
+
if (size != 0) {
|
|
78
|
+
file_handle->Read((void *)pointer, size, position);
|
|
79
|
+
}
|
|
59
80
|
}
|
|
60
81
|
|
|
61
|
-
idx_t JSONFileHandle::Read(const char *pointer, idx_t requested_size) {
|
|
82
|
+
// Sequentially reads up to `requested_size` bytes into `pointer` and returns the number of bytes obtained.
// - Plain file sources read directly from the underlying handle and advance read_position.
// - During a sample run on any other source, the bytes read are also copied into the cache.
// - Otherwise bytes come from the cache first, and whatever is still missing is read from the handle.
idx_t JSONFileHandle::Read(const char *pointer, idx_t requested_size, bool sample_run) {
	D_ASSERT(requested_size != 0);

	// Both the plain-file path and the sample-run path start with a direct sequential read
	if (plain_file_source || sample_run) {
		const auto bytes_read = file_handle->Read((void *)pointer, requested_size);
		if (!plain_file_source) {
			// Sample run on a non-plain source: cache the buffer (nothing to cache for an empty read)
			if (bytes_read > 0) {
				cached_buffers.emplace_back(allocator.Allocate(bytes_read));
				memcpy(cached_buffers.back().get(), pointer, bytes_read);
			}
			cached_size += bytes_read;
		}
		read_position += bytes_read;
		return bytes_read;
	}

	idx_t total_read = 0;
	if (!cached_buffers.empty() || read_position < cached_size) {
		// ReadFromCache takes its arguments by reference: it advances pointer and read_position
		// and reduces requested_size by the number of bytes it copied
		total_read += ReadFromCache(pointer, requested_size, read_position);
	}
	if (requested_size != 0) {
		total_read += file_handle->Read((void *)pointer, requested_size);
	}
	return total_read;
}
|
|
67
110
|
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
111
|
+
// Copies up to `size` bytes, starting at logical offset `position`, out of the cached buffers.
// The cached buffers are treated as one contiguous byte range laid out in insertion order.
// All three arguments are updated in place: `pointer` and `position` move forward and `size`
// shrinks by the number of bytes copied. Returns the number of bytes copied.
idx_t JSONFileHandle::ReadFromCache(const char *&pointer, idx_t &size, idx_t &position) {
	idx_t bytes_copied = 0;
	// Logical offset at which the buffer currently being visited begins
	idx_t buffer_begin = 0;
	for (auto &cached_buffer : cached_buffers) {
		if (size == 0) {
			break;
		}
		const idx_t buffer_end = buffer_begin + cached_buffer.GetSize();
		if (position < buffer_end) {
			// The requested offset falls inside this buffer: copy as much of it as is needed
			const idx_t offset_in_buffer = position - buffer_begin;
			const idx_t chunk_size = MinValue<idx_t>(size, buffer_end - position);
			memcpy((void *)pointer, cached_buffer.get() + offset_in_buffer, chunk_size);

			bytes_copied += chunk_size;
			pointer += chunk_size;
			size -= chunk_size;
			position += chunk_size;
		}
		buffer_begin = buffer_end;
	}
	return bytes_copied;
}
|
|
132
|
+
|
|
133
|
+
BufferedJSONReader::BufferedJSONReader(ClientContext &context, BufferedJSONReaderOptions options_p, string file_path_p)
|
|
134
|
+
: file_path(std::move(file_path_p)), context(context), options(std::move(options_p)), buffer_index(0) {
|
|
72
135
|
}
|
|
73
136
|
|
|
74
137
|
void BufferedJSONReader::OpenJSONFile() {
|
|
@@ -77,7 +140,7 @@ void BufferedJSONReader::OpenJSONFile() {
|
|
|
77
140
|
auto file_opener = FileOpener::Get(context);
|
|
78
141
|
auto regular_file_handle = file_system.OpenFile(file_path.c_str(), FileFlags::FILE_FLAGS_READ,
|
|
79
142
|
FileLockType::NO_LOCK, options.compression, file_opener);
|
|
80
|
-
file_handle = make_unique<JSONFileHandle>(std::move(regular_file_handle));
|
|
143
|
+
file_handle = make_unique<JSONFileHandle>(std::move(regular_file_handle), BufferAllocator::Get(context));
|
|
81
144
|
}
|
|
82
145
|
|
|
83
146
|
bool BufferedJSONReader::IsOpen() {
|
|
@@ -113,9 +176,40 @@ AllocatedData BufferedJSONReader::RemoveBuffer(idx_t buffer_idx) {
|
|
|
113
176
|
}
|
|
114
177
|
|
|
115
178
|
// Hands out the next buffer index and appends a -1 placeholder to the per-buffer
// line/object counts (the value ThrowParseError treats as "count not set yet").
idx_t BufferedJSONReader::GetBufferIndex() {
	buffer_line_or_object_counts.push_back(-1);
	const idx_t assigned_index = buffer_index;
	buffer_index++;
	return assigned_index;
}
|
|
118
182
|
|
|
183
|
+
// Records the line/object count of the buffer with the given index; takes the reader lock.
void BufferedJSONReader::SetBufferLineOrObjectCount(idx_t index, idx_t count) {
	lock_guard<mutex> guard(lock);
	auto &count_slot = buffer_line_or_object_counts[index];
	count_slot = count;
}
|
|
187
|
+
|
|
188
|
+
// Throws an InvalidInputException describing malformed JSON, reporting a line/object number
// relative to the whole file. That number is `line_or_object_in_buf` plus the counts of all
// buffers preceding `buf_index`; the function keeps retrying until every one of those counts
// has been set (i.e. is no longer the -1 placeholder).
void BufferedJSONReader::ThrowParseError(idx_t buf_index, idx_t line_or_object_in_buf, yyjson_read_err &err,
                                         const string &extra) {
	D_ASSERT(options.format == JSONFormat::UNSTRUCTURED || options.format == JSONFormat::NEWLINE_DELIMITED);
	while (true) {
		// The guard lives for a single iteration, so the lock is released between attempts
		lock_guard<mutex> guard(lock);

		// Accumulate the counts of the preceding buffers, stopping at the first missing one
		idx_t line = line_or_object_in_buf;
		idx_t b_idx = 0;
		while (b_idx < buf_index && buffer_line_or_object_counts[b_idx] != -1) {
			line += buffer_line_or_object_counts[b_idx];
			b_idx++;
		}
		if (b_idx != buf_index) {
			// A preceding buffer has no count yet: try again
			continue;
		}

		string unit = options.format == JSONFormat::NEWLINE_DELIMITED ? "line" : "object";
		// SQL uses 1-based indexing so I guess we will do that in our exception here as well
		throw InvalidInputException("Malformed JSON in file \"%s\", at byte %llu in %s %llu: %s. %s", file_path,
		                            err.pos + 1, unit, line + 1, err.msg, extra);
	}
}
|
|
212
|
+
|
|
119
213
|
double BufferedJSONReader::GetProgress() const {
|
|
120
214
|
if (file_handle) {
|
|
121
215
|
return 100.0 * double(file_handle->Remaining()) / double(file_handle->FileSize());
|
|
@@ -124,4 +218,24 @@ double BufferedJSONReader::GetProgress() const {
|
|
|
124
218
|
}
|
|
125
219
|
}
|
|
126
220
|
|
|
221
|
+
// Resets the reader so the file can be read again from the start: clears the buffer
// bookkeeping and rewinds the file handle.
void BufferedJSONReader::Reset() {
	buffer_index = 0;
	buffer_map.clear();
	buffer_line_or_object_counts.clear();

	// The file handle may not exist (GetProgress guards against this as well),
	// in which case there is nothing to rewind
	if (!file_handle) {
		return;
	}

	if (file_handle->CanSeek()) {
		file_handle->Seek(0);
	}
	// Reset() is needed in both the seekable and the non-seekable case, so call it exactly once
	file_handle->Reset();
}
|
|
233
|
+
|
|
234
|
+
void JSONFileHandle::Reset() {
|
|
235
|
+
read_position = 0;
|
|
236
|
+
if (plain_file_source) {
|
|
237
|
+
file_handle->Reset();
|
|
238
|
+
}
|
|
239
|
+
}
|
|
240
|
+
|
|
127
241
|
} // namespace duckdb
|
|
@@ -10,13 +10,12 @@
|
|
|
10
10
|
|
|
11
11
|
#include "duckdb/common/atomic.hpp"
|
|
12
12
|
#include "duckdb/common/enums/file_compression_type.hpp"
|
|
13
|
+
#include "duckdb/common/file_system.hpp"
|
|
13
14
|
#include "duckdb/common/mutex.hpp"
|
|
14
15
|
#include "json_common.hpp"
|
|
15
16
|
|
|
16
17
|
namespace duckdb {
|
|
17
18
|
|
|
18
|
-
struct FileHandle;
|
|
19
|
-
|
|
20
19
|
enum class JSONFormat : uint8_t {
|
|
21
20
|
//! Auto-detect format (UNSTRUCTURED / NEWLINE_DELIMITED)
|
|
22
21
|
AUTO_DETECT = 0,
|
|
@@ -58,21 +57,28 @@ public:
|
|
|
58
57
|
|
|
59
58
|
struct JSONFileHandle {
|
|
60
59
|
public:
|
|
61
|
-
|
|
60
|
+
JSONFileHandle(unique_ptr<FileHandle> file_handle, Allocator &allocator);
|
|
62
61
|
|
|
63
62
|
idx_t FileSize() const;
|
|
64
63
|
idx_t Remaining() const;
|
|
65
64
|
|
|
66
|
-
bool CanSeek() const;
|
|
67
65
|
bool PlainFileSource() const;
|
|
66
|
+
bool CanSeek() const;
|
|
67
|
+
void Seek(idx_t position);
|
|
68
68
|
|
|
69
69
|
idx_t GetPositionAndSize(idx_t &position, idx_t requested_size);
|
|
70
|
-
void ReadAtPosition(const char *pointer, idx_t size, idx_t position);
|
|
71
|
-
idx_t Read(const char *pointer, idx_t requested_size);
|
|
70
|
+
void ReadAtPosition(const char *pointer, idx_t size, idx_t position, bool sample_run);
|
|
71
|
+
idx_t Read(const char *pointer, idx_t requested_size, bool sample_run);
|
|
72
|
+
|
|
73
|
+
void Reset();
|
|
74
|
+
|
|
75
|
+
private:
|
|
76
|
+
idx_t ReadFromCache(const char *&pointer, idx_t &size, idx_t &position);
|
|
72
77
|
|
|
73
78
|
private:
|
|
74
79
|
//! The JSON file handle
|
|
75
80
|
unique_ptr<FileHandle> file_handle;
|
|
81
|
+
Allocator &allocator;
|
|
76
82
|
|
|
77
83
|
//! File properties
|
|
78
84
|
const bool can_seek;
|
|
@@ -81,11 +87,15 @@ private:
|
|
|
81
87
|
|
|
82
88
|
//! Read properties
|
|
83
89
|
idx_t read_position;
|
|
90
|
+
|
|
91
|
+
//! Cached buffers for resetting when reading stream
|
|
92
|
+
vector<AllocatedData> cached_buffers;
|
|
93
|
+
idx_t cached_size;
|
|
84
94
|
};
|
|
85
95
|
|
|
86
96
|
class BufferedJSONReader {
|
|
87
97
|
public:
|
|
88
|
-
BufferedJSONReader(ClientContext &context, BufferedJSONReaderOptions options,
|
|
98
|
+
BufferedJSONReader(ClientContext &context, BufferedJSONReaderOptions options, string file_path);
|
|
89
99
|
|
|
90
100
|
void OpenJSONFile();
|
|
91
101
|
bool IsOpen();
|
|
@@ -93,18 +103,25 @@ public:
|
|
|
93
103
|
BufferedJSONReaderOptions &GetOptions();
|
|
94
104
|
JSONFileHandle &GetFileHandle() const;
|
|
95
105
|
|
|
106
|
+
//! Insert/get/remove buffer (grabs the lock)
|
|
96
107
|
void InsertBuffer(idx_t buffer_idx, unique_ptr<JSONBufferHandle> &&buffer);
|
|
97
108
|
JSONBufferHandle *GetBuffer(idx_t buffer_idx);
|
|
98
109
|
AllocatedData RemoveBuffer(idx_t buffer_idx);
|
|
110
|
+
|
|
111
|
+
//! Get a new buffer index (must hold the lock)
|
|
99
112
|
idx_t GetBufferIndex();
|
|
113
|
+
//! Set line count for a buffer that is done (grabs the lock)
|
|
114
|
+
void SetBufferLineOrObjectCount(idx_t index, idx_t count);
|
|
115
|
+
//! Throws an error that mentions the file name and line number
|
|
116
|
+
void ThrowParseError(idx_t buf_index, idx_t line_or_object_in_buf, yyjson_read_err &err, const string &extra = "");
|
|
100
117
|
|
|
101
118
|
double GetProgress() const;
|
|
119
|
+
void Reset();
|
|
102
120
|
|
|
103
121
|
public:
|
|
104
122
|
mutex lock;
|
|
105
123
|
|
|
106
|
-
//! File
|
|
107
|
-
const idx_t file_index;
|
|
124
|
+
//! File path
|
|
108
125
|
const string file_path;
|
|
109
126
|
|
|
110
127
|
private:
|
|
@@ -118,6 +135,9 @@ private:
|
|
|
118
135
|
idx_t buffer_index;
|
|
119
136
|
//! Mapping from batch index to currently held buffers
|
|
120
137
|
unordered_map<idx_t, unique_ptr<JSONBufferHandle>> buffer_map;
|
|
138
|
+
|
|
139
|
+
//! Line count per buffer
|
|
140
|
+
vector<int64_t> buffer_line_or_object_counts;
|
|
121
141
|
};
|
|
122
142
|
|
|
123
143
|
} // namespace duckdb
|
|
@@ -50,6 +50,37 @@ private:
|
|
|
50
50
|
yyjson_alc yyjson_allocator;
|
|
51
51
|
};
|
|
52
52
|
|
|
53
|
+
struct JSONKey {
|
|
54
|
+
const char *ptr;
|
|
55
|
+
size_t len;
|
|
56
|
+
};
|
|
57
|
+
|
|
58
|
+
struct JSONKeyHash {
|
|
59
|
+
inline std::size_t operator()(const JSONKey &k) const {
|
|
60
|
+
size_t result;
|
|
61
|
+
if (k.len >= sizeof(size_t)) {
|
|
62
|
+
memcpy(&result, k.ptr + k.len - sizeof(size_t), sizeof(size_t));
|
|
63
|
+
} else {
|
|
64
|
+
result = 0;
|
|
65
|
+
duckdb::FastMemcpy(&result, k.ptr, k.len);
|
|
66
|
+
}
|
|
67
|
+
return result;
|
|
68
|
+
}
|
|
69
|
+
};
|
|
70
|
+
|
|
71
|
+
struct JSONKeyEquality {
|
|
72
|
+
inline bool operator()(const JSONKey &a, const JSONKey &b) const {
|
|
73
|
+
if (a.len != b.len) {
|
|
74
|
+
return false;
|
|
75
|
+
}
|
|
76
|
+
return duckdb::FastMemcmp(a.ptr, b.ptr, a.len) == 0;
|
|
77
|
+
}
|
|
78
|
+
};
|
|
79
|
+
|
|
80
|
+
template <typename T>
|
|
81
|
+
using json_key_map_t = unordered_map<JSONKey, T, JSONKeyHash, JSONKeyEquality>;
|
|
82
|
+
using json_key_set_t = unordered_set<JSONKey, JSONKeyHash, JSONKeyEquality>;
|
|
83
|
+
|
|
53
84
|
struct JSONCommon {
|
|
54
85
|
public:
|
|
55
86
|
static constexpr auto JSON_TYPE_NAME = "JSON";
|
|
@@ -111,6 +142,31 @@ public:
|
|
|
111
142
|
return string_t(ValTypeToString<YYJSON_VAL_T>(val));
|
|
112
143
|
}
|
|
113
144
|
|
|
145
|
+
template <class YYJSON_VAL_T>
|
|
146
|
+
static inline const LogicalTypeId ValTypeToLogicalTypeId(YYJSON_VAL_T *val) {
|
|
147
|
+
switch (GetTag<YYJSON_VAL_T>(val)) {
|
|
148
|
+
case YYJSON_TYPE_NULL | YYJSON_SUBTYPE_NONE:
|
|
149
|
+
return LogicalTypeId::SQLNULL;
|
|
150
|
+
case YYJSON_TYPE_STR | YYJSON_SUBTYPE_NONE:
|
|
151
|
+
return LogicalTypeId::VARCHAR;
|
|
152
|
+
case YYJSON_TYPE_ARR | YYJSON_SUBTYPE_NONE:
|
|
153
|
+
return LogicalTypeId::LIST;
|
|
154
|
+
case YYJSON_TYPE_OBJ | YYJSON_SUBTYPE_NONE:
|
|
155
|
+
return LogicalTypeId::STRUCT;
|
|
156
|
+
case YYJSON_TYPE_BOOL | YYJSON_SUBTYPE_TRUE:
|
|
157
|
+
case YYJSON_TYPE_BOOL | YYJSON_SUBTYPE_FALSE:
|
|
158
|
+
return LogicalTypeId::BOOLEAN;
|
|
159
|
+
case YYJSON_TYPE_NUM | YYJSON_SUBTYPE_UINT:
|
|
160
|
+
return LogicalTypeId::UBIGINT;
|
|
161
|
+
case YYJSON_TYPE_NUM | YYJSON_SUBTYPE_SINT:
|
|
162
|
+
return LogicalTypeId::BIGINT;
|
|
163
|
+
case YYJSON_TYPE_NUM | YYJSON_SUBTYPE_REAL:
|
|
164
|
+
return LogicalTypeId::DOUBLE;
|
|
165
|
+
default:
|
|
166
|
+
throw InternalException("Unexpected yyjson tag in ValTypeToLogicalTypeId");
|
|
167
|
+
}
|
|
168
|
+
}
|
|
169
|
+
|
|
114
170
|
public:
|
|
115
171
|
static inline yyjson_mut_doc *CreateDocument(yyjson_alc *alc) {
|
|
116
172
|
D_ASSERT(alc);
|
|
@@ -14,6 +14,8 @@
|
|
|
14
14
|
|
|
15
15
|
namespace duckdb {
|
|
16
16
|
|
|
17
|
+
class TableRef;
|
|
18
|
+
struct ReplacementScanData;
|
|
17
19
|
class CastFunctionSet;
|
|
18
20
|
struct CastParameters;
|
|
19
21
|
|
|
@@ -62,6 +64,8 @@ class JSONFunctions {
|
|
|
62
64
|
public:
|
|
63
65
|
static vector<CreateScalarFunctionInfo> GetScalarFunctions();
|
|
64
66
|
static vector<CreateTableFunctionInfo> GetTableFunctions();
|
|
67
|
+
static unique_ptr<TableRef> ReadJSONReplacement(ClientContext &context, const string &table_name,
|
|
68
|
+
ReplacementScanData *data);
|
|
65
69
|
static void RegisterCastFunctions(CastFunctionSet &casts);
|
|
66
70
|
|
|
67
71
|
private:
|
|
@@ -82,6 +86,7 @@ private:
|
|
|
82
86
|
|
|
83
87
|
static CreateScalarFunctionInfo GetArrayLengthFunction();
|
|
84
88
|
static CreateScalarFunctionInfo GetContainsFunction();
|
|
89
|
+
static CreateScalarFunctionInfo GetKeysFunction();
|
|
85
90
|
static CreateScalarFunctionInfo GetTypeFunction();
|
|
86
91
|
static CreateScalarFunctionInfo GetValidFunction();
|
|
87
92
|
|
|
@@ -97,6 +102,10 @@ private:
|
|
|
97
102
|
// Table functions
|
|
98
103
|
static CreateTableFunctionInfo GetReadJSONObjectsFunction();
|
|
99
104
|
static CreateTableFunctionInfo GetReadNDJSONObjectsFunction();
|
|
105
|
+
static CreateTableFunctionInfo GetReadJSONFunction();
|
|
106
|
+
static CreateTableFunctionInfo GetReadNDJSONFunction();
|
|
107
|
+
static CreateTableFunctionInfo GetReadJSONAutoFunction();
|
|
108
|
+
static CreateTableFunctionInfo GetReadNDJSONAutoFunction();
|
|
100
109
|
};
|
|
101
110
|
|
|
102
111
|
} // namespace duckdb
|
|
@@ -10,11 +10,59 @@
|
|
|
10
10
|
|
|
11
11
|
#include "buffered_json_reader.hpp"
|
|
12
12
|
#include "duckdb/common/mutex.hpp"
|
|
13
|
+
#include "duckdb/function/scalar/strftime.hpp"
|
|
13
14
|
#include "duckdb/function/table_function.hpp"
|
|
15
|
+
#include "json_transform.hpp"
|
|
14
16
|
|
|
15
17
|
namespace duckdb {
|
|
16
18
|
|
|
17
|
-
|
|
19
|
+
enum class JSONScanType : uint8_t {
|
|
20
|
+
INVALID = 0,
|
|
21
|
+
//! Read JSON straight to columnar data
|
|
22
|
+
READ_JSON = 1,
|
|
23
|
+
//! Read JSON objects as strings
|
|
24
|
+
READ_JSON_OBJECTS = 2,
|
|
25
|
+
//! Sample run for schema detection
|
|
26
|
+
SAMPLE = 3,
|
|
27
|
+
};
|
|
28
|
+
|
|
29
|
+
//! Even though LogicalTypeId is just a uint8_t, this is still needed ...
|
|
30
|
+
struct LogicalTypeIdHash {
|
|
31
|
+
inline std::size_t operator()(const LogicalTypeId &id) const {
|
|
32
|
+
return (size_t)id;
|
|
33
|
+
}
|
|
34
|
+
};
|
|
35
|
+
|
|
36
|
+
struct DateFormatMap {
|
|
37
|
+
public:
|
|
38
|
+
void Initialize(const unordered_map<LogicalTypeId, vector<const char *>, LogicalTypeIdHash> &format_templates) {
|
|
39
|
+
for (const auto &entry : format_templates) {
|
|
40
|
+
auto &formats = candidate_formats.emplace(entry.first, vector<StrpTimeFormat>()).first->second;
|
|
41
|
+
formats.reserve(entry.second.size());
|
|
42
|
+
for (const auto &format : entry.second) {
|
|
43
|
+
formats.emplace_back();
|
|
44
|
+
formats.back().format_specifier = format;
|
|
45
|
+
StrpTimeFormat::ParseFormatSpecifier(formats.back().format_specifier, formats.back());
|
|
46
|
+
}
|
|
47
|
+
}
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
bool HasFormats(LogicalTypeId type) const {
|
|
51
|
+
return candidate_formats.find(type) != candidate_formats.end();
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
vector<StrpTimeFormat> &GetCandidateFormats(LogicalTypeId type) {
|
|
55
|
+
D_ASSERT(HasFormats(type));
|
|
56
|
+
return candidate_formats[type];
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
StrpTimeFormat &GetFormat(LogicalTypeId type) {
|
|
60
|
+
return candidate_formats[type].back();
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
private:
|
|
64
|
+
unordered_map<LogicalTypeId, vector<StrpTimeFormat>, LogicalTypeIdHash> candidate_formats;
|
|
65
|
+
};
|
|
18
66
|
|
|
19
67
|
struct JSONScanData : public TableFunctionData {
|
|
20
68
|
public:
|
|
@@ -27,6 +75,8 @@ public:
|
|
|
27
75
|
void Deserialize(FieldReader &reader);
|
|
28
76
|
|
|
29
77
|
public:
|
|
78
|
+
//! Scan type
|
|
79
|
+
JSONScanType type;
|
|
30
80
|
//! File-specific options
|
|
31
81
|
BufferedJSONReaderOptions options;
|
|
32
82
|
//! The files we're reading
|
|
@@ -34,27 +84,41 @@ public:
|
|
|
34
84
|
|
|
35
85
|
//! Whether or not we should ignore malformed JSON (default to NULL)
|
|
36
86
|
bool ignore_errors = false;
|
|
37
|
-
//! Maximum JSON object size (defaults to 1MB)
|
|
87
|
+
//! Maximum JSON object size (defaults to 1MB minimum)
|
|
38
88
|
idx_t maximum_object_size = 1048576;
|
|
39
|
-
//!
|
|
40
|
-
|
|
89
|
+
//! Options when transforming the JSON to columnar data
|
|
90
|
+
JSONTransformOptions transform_options;
|
|
91
|
+
|
|
92
|
+
//! Whether we auto-detect a schema
|
|
93
|
+
bool auto_detect = false;
|
|
94
|
+
//! Sample size for detecting schema
|
|
95
|
+
idx_t sample_size = STANDARD_VECTOR_SIZE;
|
|
96
|
+
//! Column names (in order)
|
|
97
|
+
vector<string> names;
|
|
98
|
+
//! Max depth we go to detect nested JSON schema (defaults to unlimited)
|
|
99
|
+
idx_t max_depth = NumericLimits<idx_t>::Maximum();
|
|
100
|
+
|
|
101
|
+
//! Stored readers for when we're detecting the schema
|
|
102
|
+
vector<unique_ptr<BufferedJSONReader>> stored_readers;
|
|
103
|
+
//! Candidate date formats
|
|
104
|
+
DateFormatMap date_format_map;
|
|
41
105
|
};
|
|
42
106
|
|
|
43
107
|
struct JSONScanInfo : public TableFunctionInfo {
|
|
44
108
|
public:
|
|
45
|
-
explicit JSONScanInfo(
|
|
46
|
-
|
|
109
|
+
explicit JSONScanInfo(JSONScanType type_p = JSONScanType::INVALID, JSONFormat format_p = JSONFormat::AUTO_DETECT,
|
|
110
|
+
bool auto_detect_p = false)
|
|
111
|
+
: type(type_p), format(format_p), auto_detect(auto_detect_p) {
|
|
47
112
|
}
|
|
48
113
|
|
|
49
|
-
|
|
50
|
-
|
|
114
|
+
JSONScanType type;
|
|
115
|
+
JSONFormat format;
|
|
116
|
+
bool auto_detect;
|
|
51
117
|
};
|
|
52
118
|
|
|
53
|
-
struct JSONScanGlobalState
|
|
119
|
+
struct JSONScanGlobalState {
|
|
54
120
|
public:
|
|
55
121
|
JSONScanGlobalState(ClientContext &context, JSONScanData &bind_data);
|
|
56
|
-
static unique_ptr<GlobalTableFunctionState> Init(ClientContext &context, TableFunctionInitInput &input);
|
|
57
|
-
idx_t MaxThreads() const override;
|
|
58
122
|
|
|
59
123
|
public:
|
|
60
124
|
//! Bound data
|
|
@@ -70,7 +134,7 @@ public:
|
|
|
70
134
|
vector<unique_ptr<BufferedJSONReader>> json_readers;
|
|
71
135
|
//! Current file/batch index
|
|
72
136
|
idx_t file_index;
|
|
73
|
-
idx_t batch_index;
|
|
137
|
+
atomic<idx_t> batch_index;
|
|
74
138
|
|
|
75
139
|
//! Current number of threads active
|
|
76
140
|
idx_t system_threads;
|
|
@@ -96,23 +160,25 @@ public:
|
|
|
96
160
|
}
|
|
97
161
|
};
|
|
98
162
|
|
|
99
|
-
struct JSONScanLocalState
|
|
163
|
+
struct JSONScanLocalState {
|
|
100
164
|
public:
|
|
101
165
|
JSONScanLocalState(ClientContext &context, JSONScanGlobalState &gstate);
|
|
102
|
-
|
|
103
|
-
|
|
166
|
+
|
|
167
|
+
public:
|
|
104
168
|
idx_t ReadNext(JSONScanGlobalState &gstate);
|
|
105
|
-
|
|
169
|
+
yyjson_alc *GetAllocator();
|
|
106
170
|
|
|
107
171
|
JSONLine lines[STANDARD_VECTOR_SIZE];
|
|
108
|
-
|
|
172
|
+
yyjson_val *objects[STANDARD_VECTOR_SIZE];
|
|
109
173
|
|
|
110
174
|
idx_t batch_index;
|
|
111
175
|
|
|
112
176
|
private:
|
|
113
|
-
|
|
177
|
+
yyjson_val *ParseLine(char *line_start, idx_t line_size, idx_t remaining, JSONLine &line);
|
|
114
178
|
|
|
115
179
|
private:
|
|
180
|
+
//! Bind data
|
|
181
|
+
JSONScanData &bind_data;
|
|
116
182
|
//! Thread-local allocator
|
|
117
183
|
JSONAllocator json_allocator;
|
|
118
184
|
|
|
@@ -127,6 +193,7 @@ private:
|
|
|
127
193
|
idx_t buffer_size;
|
|
128
194
|
idx_t buffer_offset;
|
|
129
195
|
idx_t prev_buffer_remainder;
|
|
196
|
+
idx_t lines_or_objects_in_buffer;
|
|
130
197
|
|
|
131
198
|
//! Buffer to reconstruct split objects
|
|
132
199
|
AllocatedData reconstruct_buffer;
|
|
@@ -135,21 +202,43 @@ private:
|
|
|
135
202
|
const char *buffer_copy_ptr;
|
|
136
203
|
|
|
137
204
|
private:
|
|
138
|
-
bool ReadNextBuffer(JSONScanGlobalState &gstate
|
|
139
|
-
void
|
|
140
|
-
void
|
|
205
|
+
bool ReadNextBuffer(JSONScanGlobalState &gstate);
|
|
206
|
+
void ReadNextBuffer(JSONScanGlobalState &gstate, idx_t &buffer_index);
|
|
207
|
+
void ReadNextBufferSeek(JSONScanGlobalState &gstate, idx_t &buffer_index);
|
|
208
|
+
void ReadNextBufferNoSeek(JSONScanGlobalState &gstate, idx_t &buffer_index);
|
|
141
209
|
|
|
142
210
|
void ReconstructFirstObject(JSONScanGlobalState &gstate);
|
|
143
211
|
|
|
144
212
|
void ReadUnstructured(idx_t &count);
|
|
145
|
-
void ReadNewlineDelimited(idx_t &count
|
|
213
|
+
void ReadNewlineDelimited(idx_t &count);
|
|
214
|
+
};
|
|
215
|
+
|
|
216
|
+
struct JSONGlobalTableFunctionState : public GlobalTableFunctionState {
|
|
217
|
+
public:
|
|
218
|
+
JSONGlobalTableFunctionState(ClientContext &context, TableFunctionInitInput &input);
|
|
219
|
+
static unique_ptr<GlobalTableFunctionState> Init(ClientContext &context, TableFunctionInitInput &input);
|
|
220
|
+
idx_t MaxThreads() const override;
|
|
221
|
+
|
|
222
|
+
public:
|
|
223
|
+
JSONScanGlobalState state;
|
|
224
|
+
};
|
|
225
|
+
|
|
226
|
+
struct JSONLocalTableFunctionState : public LocalTableFunctionState {
|
|
227
|
+
public:
|
|
228
|
+
JSONLocalTableFunctionState(ClientContext &context, JSONScanGlobalState &gstate);
|
|
229
|
+
static unique_ptr<LocalTableFunctionState> Init(ExecutionContext &context, TableFunctionInitInput &input,
|
|
230
|
+
GlobalTableFunctionState *global_state);
|
|
231
|
+
idx_t GetBatchIndex() const;
|
|
232
|
+
|
|
233
|
+
public:
|
|
234
|
+
JSONScanLocalState state;
|
|
146
235
|
};
|
|
147
236
|
|
|
148
237
|
struct JSONScan {
|
|
149
238
|
public:
|
|
150
239
|
static double JSONScanProgress(ClientContext &context, const FunctionData *bind_data_p,
|
|
151
240
|
const GlobalTableFunctionState *global_state) {
|
|
152
|
-
auto &gstate = (
|
|
241
|
+
auto &gstate = ((JSONGlobalTableFunctionState &)*global_state).state;
|
|
153
242
|
double progress = 0;
|
|
154
243
|
for (auto &reader : gstate.json_readers) {
|
|
155
244
|
progress += reader->GetProgress();
|
|
@@ -159,14 +248,13 @@ public:
|
|
|
159
248
|
|
|
160
249
|
static idx_t JSONScanGetBatchIndex(ClientContext &context, const FunctionData *bind_data_p,
|
|
161
250
|
LocalTableFunctionState *local_state, GlobalTableFunctionState *global_state) {
|
|
162
|
-
auto &lstate = (
|
|
251
|
+
auto &lstate = (JSONLocalTableFunctionState &)*local_state;
|
|
163
252
|
return lstate.GetBatchIndex();
|
|
164
253
|
}
|
|
165
254
|
|
|
166
255
|
static void JSONScanSerialize(FieldWriter &writer, const FunctionData *bind_data_p, const TableFunction &function) {
|
|
167
256
|
auto &bind_data = (JSONScanData &)*bind_data_p;
|
|
168
257
|
bind_data.Serialize(writer);
|
|
169
|
-
bind_data.options.Serialize(writer);
|
|
170
258
|
}
|
|
171
259
|
|
|
172
260
|
static unique_ptr<FunctionData> JSONScanDeserialize(ClientContext &context, FieldReader &reader,
|
|
@@ -180,6 +268,7 @@ public:
|
|
|
180
268
|
table_function.named_parameters["maximum_object_size"] = LogicalType::UINTEGER;
|
|
181
269
|
table_function.named_parameters["ignore_errors"] = LogicalType::BOOLEAN;
|
|
182
270
|
table_function.named_parameters["format"] = LogicalType::VARCHAR;
|
|
271
|
+
table_function.named_parameters["compression"] = LogicalType::VARCHAR;
|
|
183
272
|
|
|
184
273
|
table_function.table_scan_progress = JSONScanProgress;
|
|
185
274
|
table_function.get_batch_index = JSONScanGetBatchIndex;
|
|
@@ -187,6 +276,7 @@ public:
|
|
|
187
276
|
table_function.serialize = JSONScanSerialize;
|
|
188
277
|
table_function.deserialize = JSONScanDeserialize;
|
|
189
278
|
|
|
279
|
+
// TODO: might be able to do some of these
|
|
190
280
|
table_function.projection_pushdown = false;
|
|
191
281
|
table_function.filter_pushdown = false;
|
|
192
282
|
table_function.filter_prune = false;
|