duckdb 0.7.1-dev90.0 → 0.7.2-dev0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/binding.gyp +7 -7
- package/package.json +3 -3
- package/src/duckdb/extension/json/buffered_json_reader.cpp +50 -9
- package/src/duckdb/extension/json/include/buffered_json_reader.hpp +7 -2
- package/src/duckdb/extension/json/include/json_scan.hpp +45 -10
- package/src/duckdb/extension/json/json_functions/copy_json.cpp +35 -22
- package/src/duckdb/extension/json/json_functions/json_create.cpp +8 -8
- package/src/duckdb/extension/json/json_functions/json_structure.cpp +8 -3
- package/src/duckdb/extension/json/json_functions/json_transform.cpp +54 -10
- package/src/duckdb/extension/json/json_functions/read_json.cpp +104 -49
- package/src/duckdb/extension/json/json_functions/read_json_objects.cpp +5 -3
- package/src/duckdb/extension/json/json_functions.cpp +7 -0
- package/src/duckdb/extension/json/json_scan.cpp +144 -38
- package/src/duckdb/extension/parquet/column_reader.cpp +7 -0
- package/src/duckdb/extension/parquet/include/column_reader.hpp +1 -0
- package/src/duckdb/extension/parquet/parquet-extension.cpp +2 -10
- package/src/duckdb/src/catalog/catalog.cpp +62 -13
- package/src/duckdb/src/catalog/catalog_entry/index_catalog_entry.cpp +8 -7
- package/src/duckdb/src/catalog/catalog_entry/schema_catalog_entry.cpp +1 -1
- package/src/duckdb/src/catalog/catalog_set.cpp +1 -1
- package/src/duckdb/src/catalog/default/default_functions.cpp +1 -0
- package/src/duckdb/src/catalog/default/default_views.cpp +1 -1
- package/src/duckdb/src/common/bind_helpers.cpp +55 -0
- package/src/duckdb/src/common/file_system.cpp +23 -9
- package/src/duckdb/src/common/hive_partitioning.cpp +1 -0
- package/src/duckdb/src/common/local_file_system.cpp +4 -4
- package/src/duckdb/src/common/string_util.cpp +8 -4
- package/src/duckdb/src/common/types/partitioned_column_data.cpp +1 -0
- package/src/duckdb/src/common/types.cpp +37 -11
- package/src/duckdb/src/execution/column_binding_resolver.cpp +5 -2
- package/src/duckdb/src/execution/index/art/art.cpp +117 -67
- package/src/duckdb/src/execution/index/art/art_key.cpp +24 -12
- package/src/duckdb/src/execution/index/art/leaf.cpp +7 -8
- package/src/duckdb/src/execution/index/art/node.cpp +13 -27
- package/src/duckdb/src/execution/index/art/node16.cpp +5 -8
- package/src/duckdb/src/execution/index/art/node256.cpp +3 -5
- package/src/duckdb/src/execution/index/art/node4.cpp +4 -7
- package/src/duckdb/src/execution/index/art/node48.cpp +5 -8
- package/src/duckdb/src/execution/index/art/prefix.cpp +2 -3
- package/src/duckdb/src/execution/operator/aggregate/physical_window.cpp +6 -27
- package/src/duckdb/src/execution/operator/helper/physical_reset.cpp +1 -9
- package/src/duckdb/src/execution/operator/helper/physical_set.cpp +1 -9
- package/src/duckdb/src/execution/operator/join/physical_iejoin.cpp +7 -9
- package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp +9 -0
- package/src/duckdb/src/execution/physical_operator.cpp +6 -6
- package/src/duckdb/src/function/pragma/pragma_queries.cpp +38 -11
- package/src/duckdb/src/function/scalar/generic/current_setting.cpp +2 -2
- package/src/duckdb/src/function/scalar/list/array_slice.cpp +2 -3
- package/src/duckdb/src/function/scalar/map/map.cpp +69 -21
- package/src/duckdb/src/function/scalar/string/like.cpp +6 -3
- package/src/duckdb/src/function/table/read_csv.cpp +16 -5
- package/src/duckdb/src/function/table/system/duckdb_temporary_files.cpp +59 -0
- package/src/duckdb/src/function/table/system_functions.cpp +1 -0
- package/src/duckdb/src/function/table/table_scan.cpp +3 -0
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/catalog/catalog.hpp +7 -1
- package/src/duckdb/src/include/duckdb/catalog/catalog_entry/duck_index_entry.hpp +1 -1
- package/src/duckdb/src/include/duckdb/catalog/catalog_entry/index_catalog_entry.hpp +1 -1
- package/src/duckdb/src/include/duckdb/common/bind_helpers.hpp +2 -0
- package/src/duckdb/src/include/duckdb/common/enums/statement_type.hpp +1 -1
- package/src/duckdb/src/include/duckdb/common/enums/wal_type.hpp +3 -0
- package/src/duckdb/src/include/duckdb/common/file_system.hpp +1 -1
- package/src/duckdb/src/include/duckdb/common/hive_partitioning.hpp +9 -1
- package/src/duckdb/src/include/duckdb/common/radix_partitioning.hpp +4 -4
- package/src/duckdb/src/include/duckdb/common/string_util.hpp +9 -2
- package/src/duckdb/src/include/duckdb/execution/index/art/art.hpp +37 -41
- package/src/duckdb/src/include/duckdb/execution/index/art/art_key.hpp +8 -11
- package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_reader_options.hpp +2 -0
- package/src/duckdb/src/include/duckdb/function/scalar/string_functions.hpp +2 -1
- package/src/duckdb/src/include/duckdb/function/table/system_functions.hpp +4 -0
- package/src/duckdb/src/include/duckdb/main/client_data.hpp +2 -2
- package/src/duckdb/src/include/duckdb/main/config.hpp +2 -0
- package/src/duckdb/src/include/duckdb/main/{extension_functions.hpp → extension_entries.hpp} +27 -5
- package/src/duckdb/src/include/duckdb/main/extension_helper.hpp +11 -1
- package/src/duckdb/src/include/duckdb/main/settings.hpp +9 -0
- package/src/duckdb/src/include/duckdb/parallel/pipeline_executor.hpp +0 -7
- package/src/duckdb/src/include/duckdb/parser/query_node/select_node.hpp +1 -1
- package/src/duckdb/src/include/duckdb/parser/sql_statement.hpp +2 -2
- package/src/duckdb/src/include/duckdb/parser/statement/copy_statement.hpp +1 -1
- package/src/duckdb/src/include/duckdb/parser/statement/select_statement.hpp +3 -3
- package/src/duckdb/src/include/duckdb/parser/tableref/subqueryref.hpp +1 -1
- package/src/duckdb/src/include/duckdb/planner/binder.hpp +3 -0
- package/src/duckdb/src/include/duckdb/planner/expression_binder/index_binder.hpp +10 -3
- package/src/duckdb/src/include/duckdb/planner/operator/logical_execute.hpp +1 -5
- package/src/duckdb/src/include/duckdb/planner/operator/logical_show.hpp +1 -2
- package/src/duckdb/src/include/duckdb/storage/buffer_manager.hpp +8 -0
- package/src/duckdb/src/include/duckdb/storage/data_table.hpp +7 -1
- package/src/duckdb/src/include/duckdb/storage/index.hpp +47 -38
- package/src/duckdb/src/include/duckdb/storage/write_ahead_log.hpp +7 -0
- package/src/duckdb/src/main/client_context.cpp +2 -0
- package/src/duckdb/src/main/config.cpp +1 -0
- package/src/duckdb/src/main/database.cpp +14 -5
- package/src/duckdb/src/main/extension/extension_alias.cpp +2 -1
- package/src/duckdb/src/main/extension/extension_helper.cpp +15 -0
- package/src/duckdb/src/main/extension/extension_install.cpp +60 -16
- package/src/duckdb/src/main/extension/extension_load.cpp +62 -13
- package/src/duckdb/src/main/settings/settings.cpp +16 -0
- package/src/duckdb/src/optimizer/statistics/operator/propagate_join.cpp +2 -6
- package/src/duckdb/src/parallel/pipeline_executor.cpp +1 -55
- package/src/duckdb/src/parser/parsed_data/create_index_info.cpp +3 -0
- package/src/duckdb/src/parser/statement/copy_statement.cpp +2 -13
- package/src/duckdb/src/parser/statement/delete_statement.cpp +3 -0
- package/src/duckdb/src/parser/statement/insert_statement.cpp +9 -0
- package/src/duckdb/src/parser/statement/update_statement.cpp +3 -0
- package/src/duckdb/src/parser/transform/expression/transform_case.cpp +3 -3
- package/src/duckdb/src/planner/bind_context.cpp +1 -1
- package/src/duckdb/src/planner/binder/expression/bind_aggregate_expression.cpp +3 -0
- package/src/duckdb/src/planner/binder/statement/bind_copy.cpp +7 -14
- package/src/duckdb/src/planner/binder/statement/bind_create_table.cpp +13 -0
- package/src/duckdb/src/planner/binder/statement/bind_drop.cpp +2 -2
- package/src/duckdb/src/planner/binder/statement/bind_insert.cpp +22 -1
- package/src/duckdb/src/planner/expression_binder/index_binder.cpp +32 -1
- package/src/duckdb/src/planner/logical_operator.cpp +4 -1
- package/src/duckdb/src/storage/buffer_manager.cpp +105 -26
- package/src/duckdb/src/storage/compression/bitpacking.cpp +16 -7
- package/src/duckdb/src/storage/data_table.cpp +66 -3
- package/src/duckdb/src/storage/index.cpp +1 -1
- package/src/duckdb/src/storage/local_storage.cpp +1 -1
- package/src/duckdb/src/storage/table_index_list.cpp +1 -2
- package/src/duckdb/src/storage/wal_replay.cpp +68 -0
- package/src/duckdb/src/storage/write_ahead_log.cpp +21 -1
- package/src/duckdb/src/transaction/commit_state.cpp +5 -2
- package/src/duckdb/third_party/concurrentqueue/blockingconcurrentqueue.h +2 -2
- package/src/duckdb/third_party/fmt/include/fmt/core.h +1 -2
- package/src/duckdb/ub_extension_icu_third_party_icu_i18n.cpp +4 -4
- package/src/duckdb/ub_src_function_table_system.cpp +2 -0
- package/src/statement.cpp +46 -12
- package/test/arrow.test.ts +3 -3
- package/test/prepare.test.ts +39 -1
- package/test/typescript_decls.test.ts +1 -1
package/README.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# DuckDB Node Bindings
|
|
2
2
|
|
|
3
|
-
This package provides a node.js API for [DuckDB](https://github.com/
|
|
3
|
+
This package provides a node.js API for [DuckDB](https://github.com/duckdb/duckdb), the "SQLite for Analytics". The API for this client is somewhat compliant to the SQLite node.js client for easier transition (and transition you must eventually).
|
|
4
4
|
|
|
5
5
|
Load the package and create a database object:
|
|
6
6
|
|
package/binding.gyp
CHANGED
|
@@ -222,16 +222,16 @@
|
|
|
222
222
|
"src/duckdb/third_party/zstd/compress/zstd_lazy.cpp",
|
|
223
223
|
"src/duckdb/third_party/zstd/compress/zstd_ldm.cpp",
|
|
224
224
|
"src/duckdb/third_party/zstd/compress/zstd_opt.cpp",
|
|
225
|
-
"src/duckdb/extension/icu/./icu-
|
|
226
|
-
"src/duckdb/extension/icu/./icu-makedate.cpp",
|
|
227
|
-
"src/duckdb/extension/icu/./icu-datepart.cpp",
|
|
228
|
-
"src/duckdb/extension/icu/./icu-datesub.cpp",
|
|
225
|
+
"src/duckdb/extension/icu/./icu-dateadd.cpp",
|
|
229
226
|
"src/duckdb/extension/icu/./icu-datetrunc.cpp",
|
|
230
|
-
"src/duckdb/extension/icu/./icu-timebucket.cpp",
|
|
231
227
|
"src/duckdb/extension/icu/./icu-strptime.cpp",
|
|
232
|
-
"src/duckdb/extension/icu/./icu-extension.cpp",
|
|
233
|
-
"src/duckdb/extension/icu/./icu-dateadd.cpp",
|
|
234
228
|
"src/duckdb/extension/icu/./icu-datefunc.cpp",
|
|
229
|
+
"src/duckdb/extension/icu/./icu-extension.cpp",
|
|
230
|
+
"src/duckdb/extension/icu/./icu-makedate.cpp",
|
|
231
|
+
"src/duckdb/extension/icu/./icu-timezone.cpp",
|
|
232
|
+
"src/duckdb/extension/icu/./icu-datesub.cpp",
|
|
233
|
+
"src/duckdb/extension/icu/./icu-timebucket.cpp",
|
|
234
|
+
"src/duckdb/extension/icu/./icu-datepart.cpp",
|
|
235
235
|
"src/duckdb/ub_extension_icu_third_party_icu_common.cpp",
|
|
236
236
|
"src/duckdb/ub_extension_icu_third_party_icu_i18n.cpp",
|
|
237
237
|
"src/duckdb/extension/icu/third_party/icu/stubdata/stubdata.cpp",
|
package/package.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"name": "duckdb",
|
|
3
3
|
"main": "./lib/duckdb.js",
|
|
4
4
|
"types": "./lib/duckdb.d.ts",
|
|
5
|
-
"version": "0.7.
|
|
5
|
+
"version": "0.7.2-dev0.0",
|
|
6
6
|
"description": "DuckDB node.js API",
|
|
7
7
|
"gypfile": true,
|
|
8
8
|
"dependencies": {
|
|
@@ -41,7 +41,7 @@
|
|
|
41
41
|
},
|
|
42
42
|
"repository": {
|
|
43
43
|
"type": "git",
|
|
44
|
-
"url": "git+https://github.com/
|
|
44
|
+
"url": "git+https://github.com/duckdb/duckdb.git"
|
|
45
45
|
},
|
|
46
46
|
"ts-node": {
|
|
47
47
|
"require": [
|
|
@@ -56,7 +56,7 @@
|
|
|
56
56
|
"author": "Hannes Mühleisen",
|
|
57
57
|
"license": "MPL-2.0",
|
|
58
58
|
"bugs": {
|
|
59
|
-
"url": "https://github.com/
|
|
59
|
+
"url": "https://github.com/duckdb/duckdb/issues"
|
|
60
60
|
},
|
|
61
61
|
"homepage": "https://www.duckdb.org"
|
|
62
62
|
}
|
|
@@ -25,7 +25,12 @@ JSONBufferHandle::JSONBufferHandle(idx_t buffer_index_p, idx_t readers_p, Alloca
|
|
|
25
25
|
JSONFileHandle::JSONFileHandle(unique_ptr<FileHandle> file_handle_p, Allocator &allocator_p)
|
|
26
26
|
: file_handle(std::move(file_handle_p)), allocator(allocator_p), can_seek(file_handle->CanSeek()),
|
|
27
27
|
plain_file_source(file_handle->OnDiskFile() && can_seek), file_size(file_handle->GetFileSize()), read_position(0),
|
|
28
|
-
cached_size(0) {
|
|
28
|
+
requested_reads(0), actual_reads(0), cached_size(0) {
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
void JSONFileHandle::Close() {
|
|
32
|
+
file_handle->Close();
|
|
33
|
+
cached_buffers.clear();
|
|
29
34
|
}
|
|
30
35
|
|
|
31
36
|
idx_t JSONFileHandle::FileSize() const {
|
|
@@ -36,10 +41,6 @@ idx_t JSONFileHandle::Remaining() const {
|
|
|
36
41
|
return file_size - read_position;
|
|
37
42
|
}
|
|
38
43
|
|
|
39
|
-
bool JSONFileHandle::PlainFileSource() const {
|
|
40
|
-
return plain_file_source;
|
|
41
|
-
}
|
|
42
|
-
|
|
43
44
|
bool JSONFileHandle::CanSeek() const {
|
|
44
45
|
return can_seek;
|
|
45
46
|
}
|
|
@@ -53,6 +54,9 @@ idx_t JSONFileHandle::GetPositionAndSize(idx_t &position, idx_t requested_size)
|
|
|
53
54
|
position = read_position;
|
|
54
55
|
auto actual_size = MinValue<idx_t>(requested_size, Remaining());
|
|
55
56
|
read_position += actual_size;
|
|
57
|
+
if (actual_size != 0) {
|
|
58
|
+
requested_reads++;
|
|
59
|
+
}
|
|
56
60
|
return actual_size;
|
|
57
61
|
}
|
|
58
62
|
|
|
@@ -60,11 +64,13 @@ void JSONFileHandle::ReadAtPosition(const char *pointer, idx_t size, idx_t posit
|
|
|
60
64
|
D_ASSERT(size != 0);
|
|
61
65
|
if (plain_file_source) {
|
|
62
66
|
file_handle->Read((void *)pointer, size, position);
|
|
67
|
+
actual_reads++;
|
|
63
68
|
return;
|
|
64
69
|
}
|
|
65
70
|
|
|
66
71
|
if (sample_run) { // Cache the buffer
|
|
67
72
|
file_handle->Read((void *)pointer, size, position);
|
|
73
|
+
actual_reads++;
|
|
68
74
|
cached_buffers.emplace_back(allocator.Allocate(size));
|
|
69
75
|
memcpy(cached_buffers.back().get(), pointer, size);
|
|
70
76
|
cached_size += size;
|
|
@@ -73,22 +79,24 @@ void JSONFileHandle::ReadAtPosition(const char *pointer, idx_t size, idx_t posit
|
|
|
73
79
|
|
|
74
80
|
if (!cached_buffers.empty() || position < cached_size) {
|
|
75
81
|
ReadFromCache(pointer, size, position);
|
|
82
|
+
actual_reads++;
|
|
76
83
|
}
|
|
77
84
|
if (size != 0) {
|
|
78
85
|
file_handle->Read((void *)pointer, size, position);
|
|
86
|
+
actual_reads++;
|
|
79
87
|
}
|
|
80
88
|
}
|
|
81
89
|
|
|
82
90
|
idx_t JSONFileHandle::Read(const char *pointer, idx_t requested_size, bool sample_run) {
|
|
83
91
|
D_ASSERT(requested_size != 0);
|
|
84
92
|
if (plain_file_source) {
|
|
85
|
-
auto actual_size =
|
|
93
|
+
auto actual_size = ReadInternal(pointer, requested_size);
|
|
86
94
|
read_position += actual_size;
|
|
87
95
|
return actual_size;
|
|
88
96
|
}
|
|
89
97
|
|
|
90
98
|
if (sample_run) { // Cache the buffer
|
|
91
|
-
auto actual_size =
|
|
99
|
+
auto actual_size = ReadInternal(pointer, requested_size);
|
|
92
100
|
if (actual_size > 0) {
|
|
93
101
|
cached_buffers.emplace_back(allocator.Allocate(actual_size));
|
|
94
102
|
memcpy(cached_buffers.back().get(), pointer, actual_size);
|
|
@@ -103,7 +111,7 @@ idx_t JSONFileHandle::Read(const char *pointer, idx_t requested_size, bool sampl
|
|
|
103
111
|
actual_size += ReadFromCache(pointer, requested_size, read_position);
|
|
104
112
|
}
|
|
105
113
|
if (requested_size != 0) {
|
|
106
|
-
actual_size +=
|
|
114
|
+
actual_size += ReadInternal(pointer, requested_size);
|
|
107
115
|
}
|
|
108
116
|
return actual_size;
|
|
109
117
|
}
|
|
@@ -111,7 +119,10 @@ idx_t JSONFileHandle::Read(const char *pointer, idx_t requested_size, bool sampl
|
|
|
111
119
|
idx_t JSONFileHandle::ReadFromCache(const char *&pointer, idx_t &size, idx_t &position) {
|
|
112
120
|
idx_t read_size = 0;
|
|
113
121
|
idx_t total_offset = 0;
|
|
114
|
-
|
|
122
|
+
|
|
123
|
+
idx_t cached_buffer_idx;
|
|
124
|
+
for (cached_buffer_idx = 0; cached_buffer_idx < cached_buffers.size(); cached_buffer_idx++) {
|
|
125
|
+
auto &cached_buffer = cached_buffers[cached_buffer_idx];
|
|
115
126
|
if (size == 0) {
|
|
116
127
|
break;
|
|
117
128
|
}
|
|
@@ -127,9 +138,23 @@ idx_t JSONFileHandle::ReadFromCache(const char *&pointer, idx_t &size, idx_t &po
|
|
|
127
138
|
}
|
|
128
139
|
total_offset += cached_buffer.GetSize();
|
|
129
140
|
}
|
|
141
|
+
|
|
130
142
|
return read_size;
|
|
131
143
|
}
|
|
132
144
|
|
|
145
|
+
idx_t JSONFileHandle::ReadInternal(const char *pointer, const idx_t requested_size) {
|
|
146
|
+
// Deal with reading from pipes
|
|
147
|
+
idx_t total_read_size = 0;
|
|
148
|
+
while (total_read_size < requested_size) {
|
|
149
|
+
auto read_size = file_handle->Read((void *)(pointer + total_read_size), requested_size - total_read_size);
|
|
150
|
+
if (read_size == 0) {
|
|
151
|
+
break;
|
|
152
|
+
}
|
|
153
|
+
total_read_size += read_size;
|
|
154
|
+
}
|
|
155
|
+
return total_read_size;
|
|
156
|
+
}
|
|
157
|
+
|
|
133
158
|
BufferedJSONReader::BufferedJSONReader(ClientContext &context, BufferedJSONReaderOptions options_p, string file_path_p)
|
|
134
159
|
: file_path(std::move(file_path_p)), context(context), options(std::move(options_p)), buffer_index(0) {
|
|
135
160
|
}
|
|
@@ -143,6 +168,16 @@ void BufferedJSONReader::OpenJSONFile() {
|
|
|
143
168
|
file_handle = make_unique<JSONFileHandle>(std::move(regular_file_handle), BufferAllocator::Get(context));
|
|
144
169
|
}
|
|
145
170
|
|
|
171
|
+
void BufferedJSONReader::CloseJSONFile() {
|
|
172
|
+
while (true) {
|
|
173
|
+
lock_guard<mutex> guard(lock);
|
|
174
|
+
if (file_handle->RequestedReadsComplete()) {
|
|
175
|
+
file_handle->Close();
|
|
176
|
+
break;
|
|
177
|
+
}
|
|
178
|
+
}
|
|
179
|
+
}
|
|
180
|
+
|
|
146
181
|
bool BufferedJSONReader::IsOpen() {
|
|
147
182
|
return file_handle != nullptr;
|
|
148
183
|
}
|
|
@@ -246,9 +281,15 @@ void BufferedJSONReader::Reset() {
|
|
|
246
281
|
|
|
247
282
|
void JSONFileHandle::Reset() {
|
|
248
283
|
read_position = 0;
|
|
284
|
+
requested_reads = 0;
|
|
285
|
+
actual_reads = 0;
|
|
249
286
|
if (plain_file_source) {
|
|
250
287
|
file_handle->Reset();
|
|
251
288
|
}
|
|
252
289
|
}
|
|
253
290
|
|
|
291
|
+
bool JSONFileHandle::RequestedReadsComplete() {
|
|
292
|
+
return requested_reads == actual_reads;
|
|
293
|
+
}
|
|
294
|
+
|
|
254
295
|
} // namespace duckdb
|
|
@@ -21,7 +21,7 @@ enum class JSONFormat : uint8_t {
|
|
|
21
21
|
AUTO_DETECT = 0,
|
|
22
22
|
//! One object after another, newlines can be anywhere
|
|
23
23
|
UNSTRUCTURED = 1,
|
|
24
|
-
//! Objects are separated by newlines, newlines do not occur within
|
|
24
|
+
//! Objects are separated by newlines, newlines do not occur within values (NDJSON)
|
|
25
25
|
NEWLINE_DELIMITED = 2,
|
|
26
26
|
};
|
|
27
27
|
|
|
@@ -58,11 +58,11 @@ public:
|
|
|
58
58
|
struct JSONFileHandle {
|
|
59
59
|
public:
|
|
60
60
|
JSONFileHandle(unique_ptr<FileHandle> file_handle, Allocator &allocator);
|
|
61
|
+
void Close();
|
|
61
62
|
|
|
62
63
|
idx_t FileSize() const;
|
|
63
64
|
idx_t Remaining() const;
|
|
64
65
|
|
|
65
|
-
bool PlainFileSource() const;
|
|
66
66
|
bool CanSeek() const;
|
|
67
67
|
void Seek(idx_t position);
|
|
68
68
|
|
|
@@ -71,9 +71,11 @@ public:
|
|
|
71
71
|
idx_t Read(const char *pointer, idx_t requested_size, bool sample_run);
|
|
72
72
|
|
|
73
73
|
void Reset();
|
|
74
|
+
bool RequestedReadsComplete();
|
|
74
75
|
|
|
75
76
|
private:
|
|
76
77
|
idx_t ReadFromCache(const char *&pointer, idx_t &size, idx_t &position);
|
|
78
|
+
idx_t ReadInternal(const char *pointer, const idx_t requested_size);
|
|
77
79
|
|
|
78
80
|
private:
|
|
79
81
|
//! The JSON file handle
|
|
@@ -87,6 +89,8 @@ private:
|
|
|
87
89
|
|
|
88
90
|
//! Read properties
|
|
89
91
|
idx_t read_position;
|
|
92
|
+
idx_t requested_reads;
|
|
93
|
+
atomic<idx_t> actual_reads;
|
|
90
94
|
|
|
91
95
|
//! Cached buffers for resetting when reading stream
|
|
92
96
|
vector<AllocatedData> cached_buffers;
|
|
@@ -98,6 +102,7 @@ public:
|
|
|
98
102
|
BufferedJSONReader(ClientContext &context, BufferedJSONReaderOptions options, string file_path);
|
|
99
103
|
|
|
100
104
|
void OpenJSONFile();
|
|
105
|
+
void CloseJSONFile();
|
|
101
106
|
bool IsOpen();
|
|
102
107
|
|
|
103
108
|
BufferedJSONReaderOptions &GetOptions();
|
|
@@ -20,12 +20,25 @@ enum class JSONScanType : uint8_t {
|
|
|
20
20
|
INVALID = 0,
|
|
21
21
|
//! Read JSON straight to columnar data
|
|
22
22
|
READ_JSON = 1,
|
|
23
|
-
//! Read JSON
|
|
23
|
+
//! Read JSON values as strings
|
|
24
24
|
READ_JSON_OBJECTS = 2,
|
|
25
25
|
//! Sample run for schema detection
|
|
26
26
|
SAMPLE = 3,
|
|
27
27
|
};
|
|
28
28
|
|
|
29
|
+
enum class JSONRecordType : uint8_t {
|
|
30
|
+
//! Sequential values
|
|
31
|
+
RECORDS = 0,
|
|
32
|
+
//! Array of values
|
|
33
|
+
ARRAY_OF_RECORDS = 1,
|
|
34
|
+
//! Sequential non-object JSON
|
|
35
|
+
JSON = 2,
|
|
36
|
+
//! Array of non-object JSON
|
|
37
|
+
ARRAY_OF_JSON = 3,
|
|
38
|
+
//! Auto-detect
|
|
39
|
+
AUTO = 4,
|
|
40
|
+
};
|
|
41
|
+
|
|
29
42
|
//! Even though LogicalTypeId is just a uint8_t, this is still needed ...
|
|
30
43
|
struct LogicalTypeIdHash {
|
|
31
44
|
inline std::size_t operator()(const LogicalTypeId &id) const {
|
|
@@ -104,8 +117,8 @@ public:
|
|
|
104
117
|
vector<idx_t> valid_cols;
|
|
105
118
|
//! Max depth we go to detect nested JSON schema (defaults to unlimited)
|
|
106
119
|
idx_t max_depth = NumericLimits<idx_t>::Maximum();
|
|
107
|
-
//! Whether we're parsing
|
|
108
|
-
|
|
120
|
+
//! Whether we're parsing values (usually), or something else
|
|
121
|
+
JSONRecordType record_type = JSONRecordType::RECORDS;
|
|
109
122
|
//! Forced date/timestamp formats
|
|
110
123
|
string date_format;
|
|
111
124
|
string timestamp_format;
|
|
@@ -119,12 +132,13 @@ public:
|
|
|
119
132
|
struct JSONScanInfo : public TableFunctionInfo {
|
|
120
133
|
public:
|
|
121
134
|
explicit JSONScanInfo(JSONScanType type_p = JSONScanType::INVALID, JSONFormat format_p = JSONFormat::AUTO_DETECT,
|
|
122
|
-
bool auto_detect_p = false)
|
|
123
|
-
: type(type_p), format(format_p), auto_detect(auto_detect_p) {
|
|
135
|
+
JSONRecordType record_type_p = JSONRecordType::AUTO, bool auto_detect_p = false)
|
|
136
|
+
: type(type_p), format(format_p), record_type(record_type_p), auto_detect(auto_detect_p) {
|
|
124
137
|
}
|
|
125
138
|
|
|
126
139
|
JSONScanType type;
|
|
127
140
|
JSONFormat format;
|
|
141
|
+
JSONRecordType record_type;
|
|
128
142
|
bool auto_detect;
|
|
129
143
|
};
|
|
130
144
|
|
|
@@ -179,10 +193,15 @@ public:
|
|
|
179
193
|
public:
|
|
180
194
|
idx_t ReadNext(JSONScanGlobalState &gstate);
|
|
181
195
|
yyjson_alc *GetAllocator();
|
|
182
|
-
void ThrowTransformError(idx_t
|
|
196
|
+
void ThrowTransformError(idx_t object_index, const string &error_message);
|
|
183
197
|
|
|
198
|
+
idx_t scan_count;
|
|
184
199
|
JSONLine lines[STANDARD_VECTOR_SIZE];
|
|
185
|
-
yyjson_val *
|
|
200
|
+
yyjson_val *values[STANDARD_VECTOR_SIZE];
|
|
201
|
+
|
|
202
|
+
idx_t array_idx;
|
|
203
|
+
idx_t array_offset;
|
|
204
|
+
yyjson_val *array_values[STANDARD_VECTOR_SIZE];
|
|
186
205
|
|
|
187
206
|
idx_t batch_index;
|
|
188
207
|
|
|
@@ -192,6 +211,7 @@ public:
|
|
|
192
211
|
|
|
193
212
|
private:
|
|
194
213
|
yyjson_val *ParseLine(char *line_start, idx_t line_size, idx_t remaining, JSONLine &line);
|
|
214
|
+
idx_t GetObjectsFromArray(JSONScanGlobalState &gstate);
|
|
195
215
|
|
|
196
216
|
private:
|
|
197
217
|
//! Bind data
|
|
@@ -212,7 +232,7 @@ private:
|
|
|
212
232
|
idx_t prev_buffer_remainder;
|
|
213
233
|
idx_t lines_or_objects_in_buffer;
|
|
214
234
|
|
|
215
|
-
//! Buffer to reconstruct split
|
|
235
|
+
//! Buffer to reconstruct split values
|
|
216
236
|
AllocatedData reconstruct_buffer;
|
|
217
237
|
//! Copy of current buffer for YYJSON_READ_INSITU
|
|
218
238
|
AllocatedData current_buffer_copy;
|
|
@@ -276,6 +296,21 @@ public:
|
|
|
276
296
|
return lstate.GetBatchIndex();
|
|
277
297
|
}
|
|
278
298
|
|
|
299
|
+
static unique_ptr<NodeStatistics> JSONScanCardinality(ClientContext &context, const FunctionData *bind_data) {
|
|
300
|
+
auto &data = (JSONScanData &)*bind_data;
|
|
301
|
+
idx_t per_file_cardinality;
|
|
302
|
+
if (data.stored_readers.empty()) {
|
|
303
|
+
// The cardinality of an unknown JSON file is the almighty number 42 except when it's not
|
|
304
|
+
per_file_cardinality = 42;
|
|
305
|
+
} else {
|
|
306
|
+
// If we multiply the almighty number 42 by 10, we get the exact average size of a JSON
|
|
307
|
+
// Not really, but the average size of a lineitem row in JSON is around 360 bytes
|
|
308
|
+
per_file_cardinality = data.stored_readers[0]->GetFileHandle().FileSize() / 420;
|
|
309
|
+
}
|
|
310
|
+
// Obviously this can be improved but this is better than defaulting to 0
|
|
311
|
+
return make_unique<NodeStatistics>(per_file_cardinality * data.file_paths.size());
|
|
312
|
+
}
|
|
313
|
+
|
|
279
314
|
static void JSONScanSerialize(FieldWriter &writer, const FunctionData *bind_data_p, const TableFunction &function) {
|
|
280
315
|
auto &bind_data = (JSONScanData &)*bind_data_p;
|
|
281
316
|
bind_data.Serialize(writer);
|
|
@@ -291,16 +326,16 @@ public:
|
|
|
291
326
|
static void TableFunctionDefaults(TableFunction &table_function) {
|
|
292
327
|
table_function.named_parameters["maximum_object_size"] = LogicalType::UINTEGER;
|
|
293
328
|
table_function.named_parameters["ignore_errors"] = LogicalType::BOOLEAN;
|
|
294
|
-
table_function.named_parameters["
|
|
329
|
+
table_function.named_parameters["lines"] = LogicalType::VARCHAR;
|
|
295
330
|
table_function.named_parameters["compression"] = LogicalType::VARCHAR;
|
|
296
331
|
|
|
297
332
|
table_function.table_scan_progress = JSONScanProgress;
|
|
298
333
|
table_function.get_batch_index = JSONScanGetBatchIndex;
|
|
334
|
+
table_function.cardinality = JSONScanCardinality;
|
|
299
335
|
|
|
300
336
|
table_function.serialize = JSONScanSerialize;
|
|
301
337
|
table_function.deserialize = JSONScanDeserialize;
|
|
302
338
|
|
|
303
|
-
// TODO: might be able to do some of these
|
|
304
339
|
table_function.projection_pushdown = false;
|
|
305
340
|
table_function.filter_pushdown = false;
|
|
306
341
|
table_function.filter_prune = false;
|
|
@@ -1,7 +1,9 @@
|
|
|
1
1
|
#include "duckdb/function/copy_function.hpp"
|
|
2
2
|
#include "duckdb/parser/expression/constant_expression.hpp"
|
|
3
3
|
#include "duckdb/parser/expression/function_expression.hpp"
|
|
4
|
+
#include "duckdb/parser/expression/positional_reference_expression.hpp"
|
|
4
5
|
#include "duckdb/parser/query_node/select_node.hpp"
|
|
6
|
+
#include "duckdb/parser/tableref/subqueryref.hpp"
|
|
5
7
|
#include "duckdb/planner/binder.hpp"
|
|
6
8
|
#include "json_functions.hpp"
|
|
7
9
|
#include "json_scan.hpp"
|
|
@@ -12,42 +14,52 @@ namespace duckdb {
|
|
|
12
14
|
static BoundStatement CopyToJSONPlan(Binder &binder, CopyStatement &stmt) {
|
|
13
15
|
auto stmt_copy = stmt.Copy();
|
|
14
16
|
auto © = (CopyStatement &)*stmt_copy;
|
|
15
|
-
auto &select_stmt = (SelectNode &)*copy.select_statement;
|
|
16
17
|
auto &info = *copy.info;
|
|
17
18
|
|
|
18
|
-
// strftime if the user specified a format TODO: deal with date/timestamp within nested types
|
|
19
|
-
auto date_it = info.options.find("dateformat");
|
|
20
|
-
auto timestamp_it = info.options.find("timestampformat");
|
|
21
|
-
|
|
22
19
|
// Bind the select statement of the original to resolve the types
|
|
23
20
|
auto dummy_binder = Binder::CreateBinder(binder.context, &binder, true);
|
|
24
21
|
auto bound_original = dummy_binder->Bind(*stmt.select_statement);
|
|
25
|
-
D_ASSERT(bound_original.types.size() == select_stmt.select_list.size());
|
|
26
|
-
const idx_t num_cols = bound_original.types.size();
|
|
27
22
|
|
|
28
|
-
//
|
|
23
|
+
// Create new SelectNode with the original SelectNode as a subquery in the FROM clause
|
|
24
|
+
auto select_stmt = make_unique<SelectStatement>();
|
|
25
|
+
select_stmt->node = std::move(copy.select_statement);
|
|
26
|
+
auto subquery_ref = make_unique<SubqueryRef>(std::move(select_stmt));
|
|
27
|
+
copy.select_statement = make_unique_base<QueryNode, SelectNode>();
|
|
28
|
+
auto &new_select_node = (SelectNode &)*copy.select_statement;
|
|
29
|
+
new_select_node.from_table = std::move(subquery_ref);
|
|
30
|
+
|
|
31
|
+
// Create new select list
|
|
32
|
+
vector<unique_ptr<ParsedExpression>> select_list;
|
|
33
|
+
select_list.reserve(bound_original.types.size());
|
|
34
|
+
|
|
35
|
+
// strftime if the user specified a format (loop also gives columns a name, needed for struct_pack)
|
|
36
|
+
// TODO: deal with date/timestamp within nested types
|
|
37
|
+
const auto date_it = info.options.find("dateformat");
|
|
38
|
+
const auto timestamp_it = info.options.find("timestampformat");
|
|
29
39
|
vector<unique_ptr<ParsedExpression>> strftime_children;
|
|
30
|
-
for (idx_t
|
|
40
|
+
for (idx_t col_idx = 0; col_idx < bound_original.types.size(); col_idx++) {
|
|
41
|
+
auto column = make_unique_base<ParsedExpression, PositionalReferenceExpression>(col_idx + 1);
|
|
31
42
|
strftime_children.clear();
|
|
32
|
-
auto &
|
|
33
|
-
auto name =
|
|
34
|
-
if (
|
|
35
|
-
strftime_children.emplace_back(std::move(
|
|
43
|
+
const auto &type = bound_original.types[col_idx];
|
|
44
|
+
const auto &name = bound_original.names[col_idx];
|
|
45
|
+
if (date_it != info.options.end() && type == LogicalTypeId::DATE) {
|
|
46
|
+
strftime_children.emplace_back(std::move(column));
|
|
36
47
|
strftime_children.emplace_back(make_unique<ConstantExpression>(date_it->second.back()));
|
|
37
|
-
|
|
38
|
-
} else if (
|
|
39
|
-
strftime_children.emplace_back(std::move(
|
|
48
|
+
column = make_unique<FunctionExpression>("strftime", std::move(strftime_children));
|
|
49
|
+
} else if (timestamp_it != info.options.end() && type == LogicalTypeId::TIMESTAMP) {
|
|
50
|
+
strftime_children.emplace_back(std::move(column));
|
|
40
51
|
strftime_children.emplace_back(make_unique<ConstantExpression>(timestamp_it->second.back()));
|
|
41
|
-
|
|
52
|
+
column = make_unique<FunctionExpression>("strftime", std::move(strftime_children));
|
|
42
53
|
}
|
|
43
|
-
|
|
54
|
+
column->alias = name;
|
|
55
|
+
select_list.emplace_back(std::move(column));
|
|
44
56
|
}
|
|
45
57
|
|
|
46
58
|
// Now create the struct_pack/to_json to create a JSON object per row
|
|
59
|
+
auto &select_node = (SelectNode &)*copy.select_statement;
|
|
47
60
|
vector<unique_ptr<ParsedExpression>> struct_pack_child;
|
|
48
|
-
struct_pack_child.emplace_back(make_unique<FunctionExpression>("struct_pack", std::move(
|
|
49
|
-
|
|
50
|
-
select_stmt.select_list.emplace_back(make_unique<FunctionExpression>("to_json", std::move(struct_pack_child)));
|
|
61
|
+
struct_pack_child.emplace_back(make_unique<FunctionExpression>("struct_pack", std::move(select_list)));
|
|
62
|
+
select_node.select_list.emplace_back(make_unique<FunctionExpression>("to_json", std::move(struct_pack_child)));
|
|
51
63
|
|
|
52
64
|
// Now we can just use the CSV writer
|
|
53
65
|
info.format = "csv";
|
|
@@ -101,7 +113,8 @@ CreateCopyFunctionInfo JSONFunctions::GetJSONCopyFunction() {
|
|
|
101
113
|
|
|
102
114
|
function.copy_from_bind = CopyFromJSONBind;
|
|
103
115
|
function.copy_from_function = JSONFunctions::GetReadJSONTableFunction(
|
|
104
|
-
false,
|
|
116
|
+
false,
|
|
117
|
+
make_shared<JSONScanInfo>(JSONScanType::READ_JSON, JSONFormat::AUTO_DETECT, JSONRecordType::RECORDS, false));
|
|
105
118
|
|
|
106
119
|
return CreateCopyFunctionInfo(function);
|
|
107
120
|
}
|
|
@@ -56,7 +56,7 @@ static LogicalType GetJSONType(unordered_map<string, unique_ptr<Vector>> &const_
|
|
|
56
56
|
// The nested types need to conform as well
|
|
57
57
|
case LogicalTypeId::LIST:
|
|
58
58
|
return LogicalType::LIST(GetJSONType(const_struct_names, ListType::GetChildType(type)));
|
|
59
|
-
// Struct and MAP are treated as JSON
|
|
59
|
+
// Struct and MAP are treated as JSON values
|
|
60
60
|
case LogicalTypeId::STRUCT: {
|
|
61
61
|
child_list_t<LogicalType> child_types;
|
|
62
62
|
for (const auto &child_type : StructType::GetChildTypes(type)) {
|
|
@@ -247,14 +247,14 @@ static void TemplatedCreateValues(yyjson_mut_doc *doc, yyjson_mut_val *vals[], V
|
|
|
247
247
|
|
|
248
248
|
static void CreateValuesStruct(const JSONCreateFunctionData &info, yyjson_mut_doc *doc, yyjson_mut_val *vals[],
|
|
249
249
|
Vector &value_v, idx_t count) {
|
|
250
|
-
// Structs become
|
|
250
|
+
// Structs become values, therefore we initialize vals to JSON values
|
|
251
251
|
for (idx_t i = 0; i < count; i++) {
|
|
252
252
|
vals[i] = yyjson_mut_obj(doc);
|
|
253
253
|
}
|
|
254
254
|
// Initialize re-usable array for the nested values
|
|
255
255
|
auto nested_vals = (yyjson_mut_val **)doc->alc.malloc(doc->alc.ctx, sizeof(yyjson_mut_val *) * count);
|
|
256
256
|
|
|
257
|
-
// Add the key/value pairs to the
|
|
257
|
+
// Add the key/value pairs to the values
|
|
258
258
|
auto &entries = StructVector::GetEntries(value_v);
|
|
259
259
|
for (idx_t entry_i = 0; entry_i < entries.size(); entry_i++) {
|
|
260
260
|
auto &struct_key_v = *info.const_struct_names.at(StructType::GetChildName(value_v.GetType(), entry_i));
|
|
@@ -284,7 +284,7 @@ static void CreateValuesMap(const JSONCreateFunctionData &info, yyjson_mut_doc *
|
|
|
284
284
|
auto map_val_count = ListVector::GetListSize(value_v);
|
|
285
285
|
auto nested_vals = (yyjson_mut_val **)doc->alc.malloc(doc->alc.ctx, sizeof(yyjson_mut_val *) * map_val_count);
|
|
286
286
|
CreateValues(info, doc, nested_vals, map_val_v, map_val_count);
|
|
287
|
-
// Add the key/value pairs to the
|
|
287
|
+
// Add the key/value pairs to the values
|
|
288
288
|
UnifiedVectorFormat map_data;
|
|
289
289
|
value_v.ToUnifiedFormat(count, map_data);
|
|
290
290
|
auto map_key_list_entries = (list_entry_t *)map_data.data;
|
|
@@ -308,7 +308,7 @@ static void CreateValuesMap(const JSONCreateFunctionData &info, yyjson_mut_doc *
|
|
|
308
308
|
|
|
309
309
|
static void CreateValuesUnion(const JSONCreateFunctionData &info, yyjson_mut_doc *doc, yyjson_mut_val *vals[],
|
|
310
310
|
Vector &value_v, idx_t count) {
|
|
311
|
-
// Structs become
|
|
311
|
+
// Structs become values, therefore we initialize vals to JSON values
|
|
312
312
|
for (idx_t i = 0; i < count; i++) {
|
|
313
313
|
vals[i] = yyjson_mut_obj(doc);
|
|
314
314
|
}
|
|
@@ -320,7 +320,7 @@ static void CreateValuesUnion(const JSONCreateFunctionData &info, yyjson_mut_doc
|
|
|
320
320
|
UnifiedVectorFormat tag_data;
|
|
321
321
|
tag_v.ToUnifiedFormat(count, tag_data);
|
|
322
322
|
|
|
323
|
-
// Add the key/value pairs to the
|
|
323
|
+
// Add the key/value pairs to the values
|
|
324
324
|
for (idx_t member_idx = 0; member_idx < UnionType::GetMemberCount(value_v.GetType()); member_idx++) {
|
|
325
325
|
auto &member_val_v = UnionVector::GetMember(value_v, member_idx);
|
|
326
326
|
auto &member_key_v = *info.const_struct_names.at(UnionType::GetMemberName(value_v.GetType(), member_idx));
|
|
@@ -425,7 +425,7 @@ static void ObjectFunction(DataChunk &args, ExpressionState &state, Vector &resu
|
|
|
425
425
|
auto &lstate = JSONFunctionLocalState::ResetAndGet(state);
|
|
426
426
|
auto alc = lstate.json_allocator.GetYYJSONAllocator();
|
|
427
427
|
|
|
428
|
-
// Initialize
|
|
428
|
+
// Initialize values
|
|
429
429
|
const idx_t count = args.size();
|
|
430
430
|
auto doc = JSONCommon::CreateDocument(alc);
|
|
431
431
|
yyjson_mut_val *objs[STANDARD_VECTOR_SIZE];
|
|
@@ -440,7 +440,7 @@ static void ObjectFunction(DataChunk &args, ExpressionState &state, Vector &resu
|
|
|
440
440
|
Vector &value_v = args.data[pair_idx * 2 + 1];
|
|
441
441
|
CreateKeyValuePairs(info, doc, objs, vals, key_v, value_v, count);
|
|
442
442
|
}
|
|
443
|
-
// Write JSON
|
|
443
|
+
// Write JSON values to string
|
|
444
444
|
auto objects = FlatVector::GetData<string_t>(result);
|
|
445
445
|
for (idx_t i = 0; i < count; i++) {
|
|
446
446
|
objects[i] = JSONCommon::WriteVal<yyjson_mut_val>(objs[i], alc);
|
|
@@ -214,9 +214,6 @@ void JSONStructureNode::RefineCandidateTypesObject(yyjson_val *vals[], idx_t cou
|
|
|
214
214
|
}
|
|
215
215
|
}
|
|
216
216
|
|
|
217
|
-
if (count > STANDARD_VECTOR_SIZE) {
|
|
218
|
-
string_vector.Initialize(false, count);
|
|
219
|
-
}
|
|
220
217
|
for (idx_t child_idx = 0; child_idx < child_count; child_idx++) {
|
|
221
218
|
desc.children[child_idx].RefineCandidateTypes(child_vals[child_idx], count, string_vector, allocator,
|
|
222
219
|
date_format_map);
|
|
@@ -431,6 +428,10 @@ static inline yyjson_mut_val *ConvertStructureArray(const JSONStructureNode &nod
|
|
|
431
428
|
static inline yyjson_mut_val *ConvertStructureObject(const JSONStructureNode &node, yyjson_mut_doc *doc) {
|
|
432
429
|
D_ASSERT(node.descriptions.size() == 1 && node.descriptions[0].type == LogicalTypeId::STRUCT);
|
|
433
430
|
auto &desc = node.descriptions[0];
|
|
431
|
+
if (desc.children.empty()) {
|
|
432
|
+
// Empty struct - let's do JSON instead
|
|
433
|
+
return yyjson_mut_str(doc, JSONCommon::JSON_TYPE_NAME);
|
|
434
|
+
}
|
|
434
435
|
|
|
435
436
|
auto obj = yyjson_mut_obj(doc);
|
|
436
437
|
for (auto &child : desc.children) {
|
|
@@ -495,6 +496,10 @@ static LogicalType StructureToTypeObject(ClientContext &context, const JSONStruc
|
|
|
495
496
|
idx_t depth) {
|
|
496
497
|
D_ASSERT(node.descriptions.size() == 1 && node.descriptions[0].type == LogicalTypeId::STRUCT);
|
|
497
498
|
auto &desc = node.descriptions[0];
|
|
499
|
+
if (desc.children.empty()) {
|
|
500
|
+
// Empty struct - let's do JSON instead
|
|
501
|
+
return JSONCommon::JSONType();
|
|
502
|
+
}
|
|
498
503
|
|
|
499
504
|
child_list_t<LogicalType> child_types;
|
|
500
505
|
child_types.reserve(desc.children.size());
|