duckdb 0.7.2-dev3515.0 → 0.7.2-dev3666.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (133)
  1. package/configure.py +2 -0
  2. package/package.json +1 -1
  3. package/src/database.cpp +1 -0
  4. package/src/duckdb/extension/json/buffered_json_reader.cpp +56 -17
  5. package/src/duckdb/extension/json/include/buffered_json_reader.hpp +56 -31
  6. package/src/duckdb/extension/json/include/json_common.hpp +5 -4
  7. package/src/duckdb/extension/json/include/json_executors.hpp +13 -18
  8. package/src/duckdb/extension/json/include/json_functions.hpp +3 -0
  9. package/src/duckdb/extension/json/include/json_scan.hpp +106 -153
  10. package/src/duckdb/extension/json/include/json_transform.hpp +2 -2
  11. package/src/duckdb/extension/json/json_common.cpp +1 -1
  12. package/src/duckdb/extension/json/json_functions/copy_json.cpp +94 -38
  13. package/src/duckdb/extension/json/json_functions/json_contains.cpp +7 -8
  14. package/src/duckdb/extension/json/json_functions/json_create.cpp +7 -7
  15. package/src/duckdb/extension/json/json_functions/json_merge_patch.cpp +4 -4
  16. package/src/duckdb/extension/json/json_functions/json_serialize_sql.cpp +4 -4
  17. package/src/duckdb/extension/json/json_functions/json_structure.cpp +7 -5
  18. package/src/duckdb/extension/json/json_functions/json_transform.cpp +10 -8
  19. package/src/duckdb/extension/json/json_functions/json_valid.cpp +1 -1
  20. package/src/duckdb/extension/json/json_functions/read_json.cpp +167 -169
  21. package/src/duckdb/extension/json/json_functions/read_json_objects.cpp +37 -16
  22. package/src/duckdb/extension/json/json_functions.cpp +11 -4
  23. package/src/duckdb/extension/json/json_scan.cpp +593 -374
  24. package/src/duckdb/extension/parquet/parquet-extension.cpp +5 -0
  25. package/src/duckdb/src/catalog/catalog_entry/macro_catalog_entry.cpp +42 -0
  26. package/src/duckdb/src/catalog/catalog_search_path.cpp +5 -0
  27. package/src/duckdb/src/catalog/catalog_set.cpp +1 -1
  28. package/src/duckdb/src/common/constants.cpp +1 -0
  29. package/src/duckdb/src/common/file_system.cpp +26 -6
  30. package/src/duckdb/src/common/local_file_system.cpp +0 -13
  31. package/src/duckdb/src/common/types/vector.cpp +3 -3
  32. package/src/duckdb/src/common/types/vector_buffer.cpp +11 -3
  33. package/src/duckdb/src/common/types/vector_cache.cpp +5 -5
  34. package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp +12 -6
  35. package/src/duckdb/src/execution/operator/persistent/csv_reader_options.cpp +10 -0
  36. package/src/duckdb/src/execution/operator/schema/physical_create_type.cpp +2 -2
  37. package/src/duckdb/src/function/macro_function.cpp +43 -0
  38. package/src/duckdb/src/function/pragma/pragma_queries.cpp +5 -3
  39. package/src/duckdb/src/function/scalar/strftime_format.cpp +1 -0
  40. package/src/duckdb/src/function/scalar_macro_function.cpp +10 -0
  41. package/src/duckdb/src/function/table/copy_csv.cpp +68 -18
  42. package/src/duckdb/src/function/table/read_csv.cpp +30 -3
  43. package/src/duckdb/src/function/table/version/pragma_version.cpp +8 -2
  44. package/src/duckdb/src/function/table_macro_function.cpp +10 -0
  45. package/src/duckdb/src/include/duckdb/catalog/catalog_entry/column_dependency_manager.hpp +1 -1
  46. package/src/duckdb/src/include/duckdb/catalog/catalog_entry/macro_catalog_entry.hpp +3 -1
  47. package/src/duckdb/src/include/duckdb/catalog/catalog_entry/scalar_macro_catalog_entry.hpp +0 -6
  48. package/src/duckdb/src/include/duckdb/catalog/catalog_entry/table_macro_catalog_entry.hpp +0 -6
  49. package/src/duckdb/src/include/duckdb/catalog/catalog_search_path.hpp +1 -1
  50. package/src/duckdb/src/include/duckdb/catalog/similar_catalog_entry.hpp +1 -1
  51. package/src/duckdb/src/include/duckdb/common/constants.hpp +2 -0
  52. package/src/duckdb/src/include/duckdb/common/exception.hpp +3 -3
  53. package/src/duckdb/src/include/duckdb/common/field_writer.hpp +3 -3
  54. package/src/duckdb/src/include/duckdb/common/file_system.hpp +5 -0
  55. package/src/duckdb/src/include/duckdb/common/http_state.hpp +2 -1
  56. package/src/duckdb/src/include/duckdb/common/hugeint.hpp +6 -6
  57. package/src/duckdb/src/include/duckdb/common/limits.hpp +46 -46
  58. package/src/duckdb/src/include/duckdb/common/operator/cast_operators.hpp +8 -8
  59. package/src/duckdb/src/include/duckdb/common/operator/comparison_operators.hpp +6 -6
  60. package/src/duckdb/src/include/duckdb/common/operator/convert_to_string.hpp +1 -1
  61. package/src/duckdb/src/include/duckdb/common/operator/decimal_cast_operators.hpp +2 -4
  62. package/src/duckdb/src/include/duckdb/common/operator/string_cast.hpp +1 -1
  63. package/src/duckdb/src/include/duckdb/common/operator/subtract.hpp +1 -1
  64. package/src/duckdb/src/include/duckdb/common/preserved_error.hpp +1 -1
  65. package/src/duckdb/src/include/duckdb/common/re2_regex.hpp +1 -1
  66. package/src/duckdb/src/include/duckdb/common/string_util.hpp +7 -7
  67. package/src/duckdb/src/include/duckdb/common/types/chunk_collection.hpp +10 -10
  68. package/src/duckdb/src/include/duckdb/common/types/column/column_data_collection.hpp +12 -12
  69. package/src/duckdb/src/include/duckdb/common/types/column/column_data_collection_iterators.hpp +2 -2
  70. package/src/duckdb/src/include/duckdb/common/types/value.hpp +1 -1
  71. package/src/duckdb/src/include/duckdb/common/types/vector_buffer.hpp +12 -2
  72. package/src/duckdb/src/include/duckdb/common/types.hpp +2 -2
  73. package/src/duckdb/src/include/duckdb/common/winapi.hpp +1 -1
  74. package/src/duckdb/src/include/duckdb/execution/expression_executor_state.hpp +1 -1
  75. package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_reader_options.hpp +9 -5
  76. package/src/duckdb/src/include/duckdb/execution/operator/schema/physical_create_type.hpp +1 -1
  77. package/src/duckdb/src/include/duckdb/function/aggregate_function.hpp +10 -14
  78. package/src/duckdb/src/include/duckdb/function/macro_function.hpp +7 -1
  79. package/src/duckdb/src/include/duckdb/function/scalar/strftime_format.hpp +3 -4
  80. package/src/duckdb/src/include/duckdb/function/scalar_macro_function.hpp +7 -2
  81. package/src/duckdb/src/include/duckdb/function/table_function.hpp +1 -1
  82. package/src/duckdb/src/include/duckdb/function/table_macro_function.hpp +5 -0
  83. package/src/duckdb/src/include/duckdb/function/udf_function.hpp +56 -50
  84. package/src/duckdb/src/include/duckdb/main/appender.hpp +2 -2
  85. package/src/duckdb/src/include/duckdb/main/client_context.hpp +2 -2
  86. package/src/duckdb/src/include/duckdb/main/client_data.hpp +3 -1
  87. package/src/duckdb/src/include/duckdb/main/connection.hpp +8 -9
  88. package/src/duckdb/src/include/duckdb/main/extension_entries.hpp +1 -0
  89. package/src/duckdb/src/include/duckdb/main/query_result.hpp +3 -3
  90. package/src/duckdb/src/include/duckdb/main/relation.hpp +6 -7
  91. package/src/duckdb/src/include/duckdb/optimizer/optimizer_extension.hpp +1 -1
  92. package/src/duckdb/src/include/duckdb/parser/column_list.hpp +7 -7
  93. package/src/duckdb/src/include/duckdb/parser/parsed_data/attach_info.hpp +4 -7
  94. package/src/duckdb/src/include/duckdb/parser/parsed_data/create_macro_info.hpp +8 -12
  95. package/src/duckdb/src/include/duckdb/parser/parsed_data/create_sequence_info.hpp +6 -20
  96. package/src/duckdb/src/include/duckdb/parser/parsed_data/create_type_info.hpp +6 -18
  97. package/src/duckdb/src/include/duckdb/parser/parsed_data/detach_info.hpp +4 -8
  98. package/src/duckdb/src/include/duckdb/parser/parsed_data/drop_info.hpp +4 -38
  99. package/src/duckdb/src/include/duckdb/parser/parsed_data/transaction_info.hpp +5 -2
  100. package/src/duckdb/src/include/duckdb/parser/parsed_data/vacuum_info.hpp +10 -10
  101. package/src/duckdb/src/include/duckdb/parser/parser_extension.hpp +2 -2
  102. package/src/duckdb/src/include/duckdb/parser/sql_statement.hpp +1 -1
  103. package/src/duckdb/src/include/duckdb/parser/statement/select_statement.hpp +1 -1
  104. package/src/duckdb/src/include/duckdb/planner/operator_extension.hpp +2 -2
  105. package/src/duckdb/src/include/duckdb/storage/storage_extension.hpp +2 -2
  106. package/src/duckdb/src/parser/parsed_data/attach_info.cpp +42 -0
  107. package/src/duckdb/src/parser/parsed_data/create_index_info.cpp +0 -7
  108. package/src/duckdb/src/parser/parsed_data/create_info.cpp +19 -8
  109. package/src/duckdb/src/parser/parsed_data/create_macro_info.cpp +46 -0
  110. package/src/duckdb/src/parser/parsed_data/create_sequence_info.cpp +56 -0
  111. package/src/duckdb/src/parser/parsed_data/create_type_info.cpp +47 -0
  112. package/src/duckdb/src/parser/parsed_data/detach_info.cpp +34 -0
  113. package/src/duckdb/src/parser/parsed_data/drop_info.cpp +46 -0
  114. package/src/duckdb/src/parser/parsed_data/transaction_info.cpp +24 -0
  115. package/src/duckdb/src/parser/parsed_data/vacuum_info.cpp +37 -0
  116. package/src/duckdb/src/planner/binder/expression/bind_star_expression.cpp +27 -9
  117. package/src/duckdb/src/planner/binder/statement/bind_copy.cpp +9 -4
  118. package/src/duckdb/src/planner/binder/statement/bind_create.cpp +2 -1
  119. package/src/duckdb/src/planner/binder/statement/bind_create_table.cpp +1 -0
  120. package/src/duckdb/src/planner/binder/tableref/bind_basetableref.cpp +1 -1
  121. package/src/duckdb/src/planner/logical_operator.cpp +1 -2
  122. package/src/duckdb/src/planner/operator/logical_create_index.cpp +16 -25
  123. package/src/duckdb/src/planner/operator/logical_insert.cpp +30 -0
  124. package/src/duckdb/src/planner/operator/logical_simple.cpp +33 -5
  125. package/src/duckdb/src/planner/parsed_data/bound_create_table_info.cpp +6 -16
  126. package/src/duckdb/src/planner/planner.cpp +4 -13
  127. package/src/duckdb/src/storage/checkpoint_manager.cpp +12 -6
  128. package/src/duckdb/src/storage/single_file_block_manager.cpp +0 -4
  129. package/src/duckdb/src/storage/storage_info.cpp +1 -1
  130. package/src/duckdb/third_party/libpg_query/src_backend_parser_gram.cpp +5735 -5773
  131. package/src/duckdb/ub_src_catalog_catalog_entry.cpp +1 -1
  132. package/src/duckdb/ub_src_parser_parsed_data.cpp +16 -0
  133. package/src/duckdb/src/catalog/catalog_entry/scalar_macro_catalog_entry.cpp +0 -104
@@ -1,7 +1,7 @@
  #include "json_scan.hpp"

+ #include "duckdb/common/enum_util.hpp"
  #include "duckdb/common/multi_file_reader.hpp"
- #include "duckdb/main/database.hpp"
  #include "duckdb/main/extension_helper.hpp"
  #include "duckdb/parallel/task_scheduler.hpp"
  #include "duckdb/storage/buffer_manager.hpp"
@@ -11,51 +11,59 @@ namespace duckdb {
  JSONScanData::JSONScanData() {
  }

- unique_ptr<FunctionData> JSONScanData::Bind(ClientContext &context, TableFunctionBindInput &input) {
- auto result = make_uniq<JSONScanData>();
- auto &options = result->options;
-
- auto &info = (JSONScanInfo &)*input.info;
- result->type = info.type;
+ void JSONScanData::Bind(ClientContext &context, TableFunctionBindInput &input) {
+ auto &info = input.info->Cast<JSONScanInfo>();
+ type = info.type;
  options.format = info.format;
- result->record_type = info.record_type;
- result->auto_detect = info.auto_detect;
- result->file_paths = MultiFileReader::GetFileList(context, input.inputs[0], "JSON");
+ options.record_type = info.record_type;
+ auto_detect = info.auto_detect;

  for (auto &kv : input.named_parameters) {
+ if (MultiFileReader::ParseOption(kv.first, kv.second, options.file_options)) {
+ continue;
+ }
  auto loption = StringUtil::Lower(kv.first);
  if (loption == "ignore_errors") {
- result->ignore_errors = BooleanValue::Get(kv.second);
+ ignore_errors = BooleanValue::Get(kv.second);
  } else if (loption == "maximum_object_size") {
- result->maximum_object_size = MaxValue<idx_t>(UIntegerValue::Get(kv.second), result->maximum_object_size);
- } else if (loption == "lines") {
- auto format = StringUtil::Lower(StringValue::Get(kv.second));
- if (format == "auto") {
+ maximum_object_size = MaxValue<idx_t>(UIntegerValue::Get(kv.second), maximum_object_size);
+ } else if (loption == "format") {
+ auto arg = StringValue::Get(kv.second);
+ if (arg == "auto") {
  options.format = JSONFormat::AUTO_DETECT;
- } else if (format == "false") {
+ } else if (arg == "unstructured") {
  options.format = JSONFormat::UNSTRUCTURED;
- } else if (format == "true") {
+ } else if (arg == "newline_delimited" || arg == "nd") {
  options.format = JSONFormat::NEWLINE_DELIMITED;
+ } else if (arg == "array") {
+ options.format = JSONFormat::ARRAY;
  } else {
- throw BinderException("\"lines\" must be one of ['auto', 'true', 'false']");
+ throw InvalidInputException(
+ "format must be one of ['auto', 'unstructured', 'newline_delimited', 'array']");
  }
  } else if (loption == "compression") {
- auto compression = StringUtil::Lower(StringValue::Get(kv.second));
- if (compression == "none") {
- options.compression = FileCompressionType::UNCOMPRESSED;
- } else if (compression == "gzip") {
- options.compression = FileCompressionType::GZIP;
- } else if (compression == "zstd") {
- options.compression = FileCompressionType::ZSTD;
- } else if (compression == "auto") {
- options.compression = FileCompressionType::AUTO_DETECT;
- } else {
- throw BinderException("compression must be one of ['none', 'gzip', 'zstd', 'auto']");
- }
+ SetCompression(StringUtil::Lower(StringValue::Get(kv.second)));
  }
  }

- return std::move(result);
+ files = MultiFileReader::GetFileList(context, input.inputs[0], "JSON");
+
+ if (options.file_options.auto_detect_hive_partitioning) {
+ options.file_options.hive_partitioning = MultiFileReaderOptions::AutoDetectHivePartitioning(files);
+ }
+
+ InitializeReaders(context);
+ }
+
+ void JSONScanData::InitializeReaders(ClientContext &context) {
+ union_readers.resize(files.empty() ? 0 : files.size() - 1);
+ for (idx_t file_idx = 0; file_idx < files.size(); file_idx++) {
+ if (file_idx == 0) {
+ initial_reader = make_uniq<BufferedJSONReader>(context, options, files[0]);
+ } else {
+ union_readers[file_idx - 1] = make_uniq<BufferedJSONReader>(context, options, files[file_idx]);
+ }
+ }
  }

  void JSONScanData::InitializeFormats() {
@@ -63,14 +71,6 @@ void JSONScanData::InitializeFormats() {
  }

  void JSONScanData::InitializeFormats(bool auto_detect_p) {
- // Set defaults for date/timestamp formats if we need to
- if (!auto_detect_p && date_format.empty()) {
- date_format = "%Y-%m-%d";
- }
- if (!auto_detect_p && timestamp_format.empty()) {
- timestamp_format = "%Y-%m-%dT%H:%M:%S.%fZ";
- }
-
  // Initialize date_format_map if anything was specified
  if (!date_format.empty()) {
  date_format_map.AddFormat(LogicalTypeId::DATE, date_format);
@@ -80,7 +80,7 @@ void JSONScanData::InitializeFormats(bool auto_detect_p) {
  }

  if (auto_detect_p) {
- static const unordered_map<LogicalTypeId, vector<const char *>, LogicalTypeIdHash> FORMAT_TEMPLATES = {
+ static const type_id_map_t<vector<const char *>> FORMAT_TEMPLATES = {
  {LogicalTypeId::DATE, {"%m-%d-%Y", "%m-%d-%y", "%d-%m-%Y", "%d-%m-%y", "%Y-%m-%d", "%y-%m-%d"}},
  {LogicalTypeId::TIMESTAMP,
  {"%Y-%m-%d %H:%M:%S.%f", "%m-%d-%Y %I:%M:%S %p", "%m-%d-%y %I:%M:%S %p", "%d-%m-%Y %H:%M:%S",
@@ -89,56 +89,73 @@ void JSONScanData::InitializeFormats(bool auto_detect_p) {

  // Populate possible date/timestamp formats, assume this is consistent across columns
  for (auto &kv : FORMAT_TEMPLATES) {
- const auto &type = kv.first;
- if (date_format_map.HasFormats(type)) {
+ const auto &logical_type = kv.first;
+ if (date_format_map.HasFormats(logical_type)) {
  continue; // Already populated
  }
  const auto &format_strings = kv.second;
  for (auto &format_string : format_strings) {
- date_format_map.AddFormat(type, format_string);
+ date_format_map.AddFormat(logical_type, format_string);
  }
  }
  }
  }

- void JSONScanData::Serialize(FieldWriter &writer) {
+ void JSONScanData::SetCompression(const string &compression) {
+ options.compression = EnumUtil::FromString<FileCompressionType>(StringUtil::Upper(compression));
+ }
+
+ void JSONScanData::Serialize(FieldWriter &writer) const {
  writer.WriteField<JSONScanType>(type);
+
  options.Serialize(writer);
- writer.WriteList<string>(file_paths);
+
+ writer.WriteSerializable(reader_bind);
+
+ writer.WriteList<string>(files);
+
  writer.WriteField<bool>(ignore_errors);
  writer.WriteField<idx_t>(maximum_object_size);
- transform_options.Serialize(writer);
  writer.WriteField<bool>(auto_detect);
  writer.WriteField<idx_t>(sample_size);
- writer.WriteList<string>(names);
- writer.WriteList<idx_t>(valid_cols);
  writer.WriteField<idx_t>(max_depth);
- writer.WriteField<JSONRecordType>(record_type);
+
+ transform_options.Serialize(writer);
+ writer.WriteList<string>(names);
  if (!date_format.empty()) {
  writer.WriteString(date_format);
- } else {
+ } else if (date_format_map.HasFormats(LogicalTypeId::DATE)) {
  writer.WriteString(date_format_map.GetFormat(LogicalTypeId::DATE).format_specifier);
+ } else {
+ writer.WriteString("");
  }
  if (!timestamp_format.empty()) {
  writer.WriteString(timestamp_format);
- } else {
+ } else if (date_format_map.HasFormats(LogicalTypeId::TIMESTAMP)) {
  writer.WriteString(date_format_map.GetFormat(LogicalTypeId::TIMESTAMP).format_specifier);
+ } else {
+ writer.WriteString("");
  }
  }

- void JSONScanData::Deserialize(FieldReader &reader) {
+ void JSONScanData::Deserialize(ClientContext &context, FieldReader &reader) {
  type = reader.ReadRequired<JSONScanType>();
+
  options.Deserialize(reader);
- file_paths = reader.ReadRequiredList<string>();
+
+ reader_bind = reader.ReadRequiredSerializable<MultiFileReaderBindData, MultiFileReaderBindData>();
+
+ files = reader.ReadRequiredList<string>();
+ InitializeReaders(context);
+
  ignore_errors = reader.ReadRequired<bool>();
  maximum_object_size = reader.ReadRequired<idx_t>();
- transform_options.Deserialize(reader);
  auto_detect = reader.ReadRequired<bool>();
  sample_size = reader.ReadRequired<idx_t>();
- names = reader.ReadRequiredList<string>();
- valid_cols = reader.ReadRequiredList<idx_t>();
  max_depth = reader.ReadRequired<idx_t>();
- record_type = reader.ReadRequired<JSONRecordType>();
+
+ transform_options.Deserialize(reader);
+ names = reader.ReadRequiredList<string>();
  date_format = reader.ReadRequired<string>();
  timestamp_format = reader.ReadRequired<string>();

@@ -146,86 +163,97 @@ void JSONScanData::Deserialize(FieldReader &reader) {
  transform_options.date_format_map = &date_format_map;
  }

- JSONScanGlobalState::JSONScanGlobalState(ClientContext &context, JSONScanData &bind_data_p)
- : bind_data(bind_data_p), allocator(BufferManager::GetBufferManager(context).GetBufferAllocator()),
+ JSONScanGlobalState::JSONScanGlobalState(ClientContext &context, const JSONScanData &bind_data_p)
+ : bind_data(bind_data_p), transform_options(bind_data.transform_options),
+ allocator(BufferManager::GetBufferManager(context).GetBufferAllocator()),
  buffer_capacity(bind_data.maximum_object_size * 2), file_index(0), batch_index(0),
  system_threads(TaskScheduler::GetScheduler(context).NumberOfThreads()) {
- if (bind_data.stored_readers.empty()) {
- json_readers.reserve(bind_data.file_paths.size());
- for (idx_t i = 0; i < bind_data.file_paths.size(); i++) {
- json_readers.push_back(make_uniq<BufferedJSONReader>(context, bind_data.options, bind_data.file_paths[i]));
- }
- } else {
- json_readers = std::move(bind_data.stored_readers);
- }
  }

  JSONScanLocalState::JSONScanLocalState(ClientContext &context, JSONScanGlobalState &gstate)
- : scan_count(0), array_idx(0), array_offset(0), batch_index(DConstants::INVALID_INDEX), bind_data(gstate.bind_data),
- json_allocator(BufferAllocator::Get(context)), current_reader(nullptr), current_buffer_handle(nullptr),
- is_last(false), buffer_size(0), buffer_offset(0), prev_buffer_remainder(0) {
+ : scan_count(0), batch_index(DConstants::INVALID_INDEX), total_read_size(0), total_tuple_count(0),
+ bind_data(gstate.bind_data), allocator(BufferAllocator::Get(context)), current_reader(nullptr),
+ current_buffer_handle(nullptr), is_last(false), buffer_size(0), buffer_offset(0), prev_buffer_remainder(0) {

  // Buffer to reconstruct JSON values when they cross a buffer boundary
- reconstruct_buffer = gstate.allocator.Allocate(gstate.bind_data.maximum_object_size + YYJSON_PADDING_SIZE);
-
- // This is needed for JSONFormat::UNSTRUCTURED, to make use of YYJSON_READ_INSITU
- current_buffer_copy = gstate.allocator.Allocate(gstate.buffer_capacity);
- buffer_copy_ptr = (const char *)current_buffer_copy.get();
+ reconstruct_buffer = gstate.allocator.Allocate(gstate.buffer_capacity);
  }

  JSONGlobalTableFunctionState::JSONGlobalTableFunctionState(ClientContext &context, TableFunctionInitInput &input)
- : state(context, (JSONScanData &)*input.bind_data) {
+ : state(context, input.bind_data->Cast<JSONScanData>()) {
  }

  unique_ptr<GlobalTableFunctionState> JSONGlobalTableFunctionState::Init(ClientContext &context,
  TableFunctionInitInput &input) {
- auto &bind_data = (JSONScanData &)*input.bind_data;
+ auto &bind_data = input.bind_data->Cast<JSONScanData>();
  auto result = make_uniq<JSONGlobalTableFunctionState>(context, input);
+ auto &gstate = result->state;

  // Perform projection pushdown
- if (bind_data.type == JSONScanType::READ_JSON) {
- D_ASSERT(input.column_ids.size() <= bind_data.names.size()); // Can't project to have more columns
- vector<string> names;
- names.reserve(input.column_ids.size());
- for (idx_t i = 0; i < input.column_ids.size(); i++) {
- const auto &id = input.column_ids[i];
- if (IsRowIdColumnId(id)) {
- continue;
+ for (idx_t col_idx = 0; col_idx < input.column_ids.size(); col_idx++) {
+ const auto &col_id = input.column_ids[col_idx];
+
+ // Skip any multi-file reader / row id stuff
+ if (col_id == bind_data.reader_bind.filename_idx || IsRowIdColumnId(col_id)) {
+ continue;
+ }
+ bool skip = false;
+ for (const auto &hive_partitioning_index : bind_data.reader_bind.hive_partitioning_indexes) {
+ if (col_id == hive_partitioning_index.index) {
+ skip = true;
+ break;
  }
- names.push_back(std::move(bind_data.names[id]));
- bind_data.valid_cols.push_back(i);
  }
- if (names.size() < bind_data.names.size()) {
- // If we are auto-detecting, but don't need all columns present in the file,
- // then we don't need to throw an error if we encounter an unseen column
- bind_data.transform_options.error_unknown_key = false;
+ if (skip) {
+ continue;
  }
- bind_data.names = std::move(names);
+
+ gstate.column_indices.push_back(col_idx);
+ gstate.names.push_back(bind_data.names[col_id]);
+ }
+
+ if (gstate.names.size() < bind_data.names.size() || bind_data.options.file_options.union_by_name) {
+ // If we are auto-detecting, but don't need all columns present in the file,
+ // then we don't need to throw an error if we encounter an unseen column
+ gstate.transform_options.error_unknown_key = false;
+ }
+
+ // Place readers where they belong
+ if (bind_data.initial_reader) {
+ bind_data.initial_reader->Reset();
+ gstate.json_readers.emplace_back(bind_data.initial_reader.get());
  }
+ for (const auto &reader : bind_data.union_readers) {
+ reader->Reset();
+ gstate.json_readers.emplace_back(reader.get());
+ }
+
+ vector<LogicalType> dummy_types(input.column_ids.size(), LogicalType::ANY);
+ for (auto &reader : gstate.json_readers) {
+ MultiFileReader::FinalizeBind(reader->GetOptions().file_options, gstate.bind_data.reader_bind,
+ reader->GetFileName(), gstate.names, dummy_types, bind_data.names,
+ input.column_ids, reader->reader_data);
+ }
+
  return std::move(result);
  }

  idx_t JSONGlobalTableFunctionState::MaxThreads() const {
  auto &bind_data = state.bind_data;
+ if (bind_data.options.format == JSONFormat::NEWLINE_DELIMITED &&
+ bind_data.options.compression == FileCompressionType::UNCOMPRESSED) {
+ return state.system_threads;
+ }

- auto num_files = bind_data.file_paths.size();
- idx_t readers_per_file;
- if (bind_data.options.format == JSONFormat::UNSTRUCTURED) {
- // Unstructured necessitates single thread
- readers_per_file = 1;
- } else if (!state.json_readers.empty() && state.json_readers[0]->IsOpen()) {
+ if (!state.json_readers.empty() && state.json_readers[0]->IsOpen()) {
  auto &reader = *state.json_readers[0];
- const auto &options = reader.GetOptions();
- if (options.format == JSONFormat::UNSTRUCTURED || options.compression != FileCompressionType::UNCOMPRESSED) {
- // Auto-detected unstructured - same story, compression also really limits parallelism
- readers_per_file = 1;
- } else {
+ if (reader.IsParallel()) { // Auto-detected parallel scan
  return state.system_threads;
  }
- } else {
- return state.system_threads;
  }
- return num_files * readers_per_file;
+
+ // One reader per file
+ return bind_data.files.size();
  }

  JSONLocalTableFunctionState::JSONLocalTableFunctionState(ClientContext &context, JSONScanGlobalState &gstate)
@@ -235,12 +263,12 @@ JSONLocalTableFunctionState::JSONLocalTableFunctionState(ClientContext &context,
  unique_ptr<LocalTableFunctionState> JSONLocalTableFunctionState::Init(ExecutionContext &context,
  TableFunctionInitInput &input,
  GlobalTableFunctionState *global_state) {
- auto &gstate = (JSONGlobalTableFunctionState &)*global_state;
+ auto &gstate = global_state->Cast<JSONGlobalTableFunctionState>();
  auto result = make_uniq<JSONLocalTableFunctionState>(context.client, gstate.state);

  // Copy the transform options / date format map because we need to do thread-local stuff
  result->state.date_format_map = gstate.state.bind_data.date_format_map;
- result->state.transform_options = gstate.state.bind_data.transform_options;
+ result->state.transform_options = gstate.state.transform_options;
  result->state.transform_options.date_format_map = &result->state.date_format_map;

  return std::move(result);
@@ -250,7 +278,7 @@ idx_t JSONLocalTableFunctionState::GetBatchIndex() const {
  return state.batch_index;
  }

- static inline void SkipWhitespace(const char *buffer_ptr, idx_t &buffer_offset, idx_t &buffer_size) {
+ static inline void SkipWhitespace(const char *buffer_ptr, idx_t &buffer_offset, const idx_t &buffer_size) {
  for (; buffer_offset != buffer_size; buffer_offset++) {
  if (!StringUtil::CharacterIsSpace(buffer_ptr[buffer_offset])) {
  break;
@@ -259,50 +287,21 @@ static inline void SkipWhitespace(const char *buffer_ptr, idx_t &buffer_offset,
  }

  idx_t JSONScanLocalState::ReadNext(JSONScanGlobalState &gstate) {
- json_allocator.Reset();
-
- if ((gstate.bind_data.record_type == JSONRecordType::ARRAY_OF_RECORDS ||
- gstate.bind_data.record_type == JSONRecordType::ARRAY_OF_JSON) &&
- array_idx < scan_count) {
- return GetObjectsFromArray(gstate);
- }
+ allocator.Reset();

- idx_t count = 0;
+ scan_count = 0;
  if (buffer_offset == buffer_size) {
  if (!ReadNextBuffer(gstate)) {
- return 0;
+ return scan_count;
  }
- if (current_buffer_handle->buffer_index != 0 &&
- current_reader->GetOptions().format == JSONFormat::NEWLINE_DELIMITED) {
+ if (current_buffer_handle->buffer_index != 0 && current_reader->GetFormat() == JSONFormat::NEWLINE_DELIMITED) {
  ReconstructFirstObject(gstate);
- count++;
+ scan_count++;
  }
  }
+ ParseNextChunk();

- auto &options = current_reader->GetOptions();
- switch (options.format) {
- case JSONFormat::UNSTRUCTURED:
- ReadUnstructured(count);
- break;
- case JSONFormat::NEWLINE_DELIMITED:
- ReadNewlineDelimited(count);
- break;
- default:
- throw InternalException("Unknown JSON format");
- }
- scan_count = count;
-
- // Skip over any remaining whitespace for the next scan
- SkipWhitespace(buffer_ptr, buffer_offset, buffer_size);
-
- if (gstate.bind_data.record_type == JSONRecordType::ARRAY_OF_RECORDS ||
- gstate.bind_data.record_type == JSONRecordType::ARRAY_OF_JSON) {
- array_idx = 0;
- array_offset = 0;
- return GetObjectsFromArray(gstate);
- }
-
- return count;
+ return scan_count;
  }

  static inline const char *NextNewline(const char *ptr, idx_t size) {
@@ -319,7 +318,71 @@ static inline const char *PreviousNewline(const char *ptr) {
  return ptr;
  }

- static inline void TrimWhitespace(JSONLine &line) {
+ static inline const char *NextJSONDefault(const char *ptr, const idx_t size, const char *const end) {
+ idx_t parents = 0;
+ while (ptr != end) {
+ switch (*ptr++) {
+ case '{':
+ case '[':
+ parents++;
+ continue;
+ case '}':
+ case ']':
+ parents--;
+ break;
+ case '"':
+ while (ptr != end) {
+ auto string_char = *ptr++;
+ if (string_char == '"') {
+ break;
+ } else if (string_char == '\\') {
+ if (ptr != end) {
+ ptr++; // Skip the escaped char
+ }
+ }
+ }
+ break;
+ default:
+ continue;
+ }
+
+ if (parents == 0) {
+ break;
+ }
+ }
+
+ return ptr;
+ }
+
+ static inline const char *NextJSON(const char *ptr, const idx_t size) {
+ D_ASSERT(!StringUtil::CharacterIsSpace(*ptr)); // Should be handled before
+
+ const char *const end = ptr + size;
+ switch (*ptr) {
+ case '{':
+ case '[':
+ case '"':
+ ptr = NextJSONDefault(ptr, size, end);
+ break;
+ default:
+ // Special case: JSON array containing JSON without clear "parents", i.e., not obj/arr/str
+ while (ptr != end) {
+ switch (*ptr++) {
+ case ',':
+ case ']':
+ ptr--;
+ break;
+ default:
+ continue;
+ }
+ break;
+ }
+ }
+
+ return ptr == end ? nullptr : ptr;
+ }
+
+ static inline void TrimWhitespace(JSONString &line) {
  while (line.size != 0 && StringUtil::CharacterIsSpace(line[0])) {
  line.pointer++;
  line.size--;
@@ -329,204 +392,248 @@ static inline void TrimWhitespace(JSONLine &line) {
  }
  }

- yyjson_val *JSONScanLocalState::ParseLine(char *line_start, idx_t line_size, idx_t remaining, JSONLine &line) {
+ void JSONScanLocalState::ParseJSON(char *const json_start, const idx_t json_size, const idx_t remaining) {
  yyjson_doc *doc;
- if (bind_data.ignore_errors) {
- doc = JSONCommon::ReadDocumentUnsafe(line_start, line_size, JSONCommon::READ_FLAG,
- json_allocator.GetYYJSONAllocator());
+ yyjson_read_err err;
+ if (bind_data.type == JSONScanType::READ_JSON_OBJECTS) { // If we return strings, we cannot parse INSITU
+ doc = JSONCommon::ReadDocumentUnsafe(json_start, json_size, JSONCommon::READ_STOP_FLAG, allocator.GetYYAlc(),
+ &err);
  } else {
- yyjson_read_err err;
- if (bind_data.type != JSONScanType::READ_JSON_OBJECTS) {
- // Optimization: if we don't ignore errors, and don't need to return strings, we can parse INSITU
- doc = JSONCommon::ReadDocumentUnsafe(line_start, remaining, JSONCommon::STOP_READ_FLAG,
- json_allocator.GetYYJSONAllocator(), &err);
- idx_t read_size = yyjson_doc_get_read_size(doc);
- if (read_size > line_size) {
- err.pos = line_size;
- err.code = YYJSON_READ_ERROR_UNEXPECTED_END;
- err.msg = "unexpected end of data";
- } else if (read_size < line_size) {
- idx_t diff = line_size - read_size;
- char *ptr = line_start + read_size;
- for (idx_t i = 0; i < diff; i++) {
- if (!StringUtil::CharacterIsSpace(ptr[i])) {
- err.pos = read_size;
- err.code = YYJSON_READ_ERROR_UNEXPECTED_CONTENT;
- err.msg = "unexpected content after document";
- }
- }
- }
- } else {
- doc = JSONCommon::ReadDocumentUnsafe(line_start, line_size, JSONCommon::READ_FLAG,
- json_allocator.GetYYJSONAllocator(), &err);
- }
- if (err.code != YYJSON_READ_SUCCESS) {
- current_reader->ThrowParseError(current_buffer_handle->buffer_index, lines_or_objects_in_buffer, err);
+ doc = JSONCommon::ReadDocumentUnsafe(json_start, remaining, JSONCommon::READ_INSITU_FLAG, allocator.GetYYAlc(),
+ &err);
+ }
+ if (!bind_data.ignore_errors && err.code != YYJSON_READ_SUCCESS) {
+ current_reader->ThrowParseError(current_buffer_handle->buffer_index, lines_or_objects_in_buffer, err);
+ }
+
+ // We parse with YYJSON_STOP_WHEN_DONE, so we need to check this by hand
+ const auto read_size = yyjson_doc_get_read_size(doc);
+ if (read_size > json_size) {
+ // Can't go past the boundary, even with ignore_errors
+ err.code = YYJSON_READ_ERROR_UNEXPECTED_END;
+ err.msg = "unexpected end of data";
+ err.pos = json_size;
+ current_reader->ThrowParseError(current_buffer_handle->buffer_index, lines_or_objects_in_buffer, err,
+ "Try auto-detecting the JSON format");
+ } else if (!bind_data.ignore_errors && read_size < json_size) {
+ idx_t off = read_size;
+ idx_t rem = json_size;
+ SkipWhitespace(json_start, off, rem);
+ if (off != rem) { // Between end of document and boundary should be whitespace only
+ err.code = YYJSON_READ_ERROR_UNEXPECTED_CONTENT;
+ err.msg = "unexpected content after document";
+ err.pos = read_size;
+ current_reader->ThrowParseError(current_buffer_handle->buffer_index, lines_or_objects_in_buffer, err,
+ "Try auto-detecting the JSON format");
  }
  }
- lines_or_objects_in_buffer++;

- if (doc) {
- // Set the JSONLine and trim
- line = JSONLine(line_start, line_size);
- TrimWhitespace(line);
- return doc->root;
- } else {
- return nullptr;
+ lines_or_objects_in_buffer++;
+ if (!doc) {
+ values[scan_count] = nullptr;
+ return;
  }
+
+ // Set the JSONLine and trim
+ units[scan_count] = JSONString(json_start, json_size);
+ TrimWhitespace(units[scan_count]);
+ values[scan_count] = doc->root;
  }

- idx_t JSONScanLocalState::GetObjectsFromArray(JSONScanGlobalState &gstate) {
- idx_t arr_count = 0;
+ void JSONScanLocalState::ThrowObjectSizeError(const idx_t object_size) {
+ throw InvalidInputException(
+ "\"maximum_object_size\" of %llu bytes exceeded while reading file \"%s\" (>%llu bytes)."
+ "\n Try increasing \"maximum_object_size\".",
+ bind_data.maximum_object_size, current_reader->GetFileName(), object_size);
+ }

- size_t idx, max;
- yyjson_val *val;
- for (; array_idx < scan_count; array_idx++, array_offset = 0) {
- auto &value = values[array_idx];
- if (!value) {
- continue;
- }
- if (unsafe_yyjson_is_arr(value)) {
- yyjson_arr_foreach(value, idx, max, val) {
- if (idx < array_offset) {
- continue;
- }
- array_values[arr_count++] = val;
- if (arr_count == STANDARD_VECTOR_SIZE) {
- break;
+ void JSONScanLocalState::ThrowInvalidAtEndError() {
+ throw InvalidInputException("Invalid JSON detected at the end of file \"%s\".", current_reader->GetFileName());
+ }
+
+ static pair<JSONFormat, JSONRecordType> DetectFormatAndRecordType(const char *const buffer_ptr, const idx_t buffer_size,
+ yyjson_alc *alc) {
+ // First we do the easy check whether it's NEWLINE_DELIMITED
+ auto line_end = NextNewline(buffer_ptr, buffer_size);
+ if (line_end != nullptr) {
+ idx_t line_size = line_end - buffer_ptr;
+ SkipWhitespace(buffer_ptr, line_size, buffer_size);
+
+ yyjson_read_err error;
+ auto doc = JSONCommon::ReadDocumentUnsafe((char *)buffer_ptr, line_size, JSONCommon::READ_FLAG, alc, &error);
+ if (error.code == YYJSON_READ_SUCCESS) { // We successfully read the line
+ if (yyjson_is_arr(doc->root) && line_size == buffer_size) {
+ // It's just one array, let's actually assume ARRAY, not NEWLINE_DELIMITED
+ if (yyjson_arr_size(doc->root) == 0 || yyjson_is_obj(yyjson_arr_get(doc->root, 0))) {
+ // Either an empty array (assume records), or an array of objects
+ return make_pair(JSONFormat::ARRAY, JSONRecordType::RECORDS);
+ } else {
+ return make_pair(JSONFormat::ARRAY, JSONRecordType::VALUES);
  }
+ } else if (yyjson_is_obj(doc->root)) {
+ return make_pair(JSONFormat::NEWLINE_DELIMITED, JSONRecordType::RECORDS);
+ } else {
+ return make_pair(JSONFormat::NEWLINE_DELIMITED, JSONRecordType::VALUES);
  }
- array_offset = idx + 1;
- if (arr_count == STANDARD_VECTOR_SIZE) {
- break;
- }
- } else if (!gstate.bind_data.ignore_errors) {
- ThrowTransformError(
- array_idx,
- StringUtil::Format("Expected JSON ARRAY but got %s: %s\nTry setting json_format to 'records'",
- JSONCommon::ValTypeToString(value), JSONCommon::ValToString(value, 50)));
  }
  }
- return arr_count;
+
+ // Skip whitespace
+ idx_t buffer_offset = 0;
+ SkipWhitespace(buffer_ptr, buffer_offset, buffer_size);
+ auto remaining = buffer_size - buffer_offset;
+
+ // We know it's not NEWLINE_DELIMITED at this point, if there's a '{', we know it's not ARRAY either
+ // Also if it's fully whitespace we just return something because we don't know
+ if (remaining == 0 || buffer_ptr[buffer_offset] == '{') {
+ return make_pair(JSONFormat::UNSTRUCTURED, JSONRecordType::RECORDS);
+ }
+
+ // We know it's not top-level records, if it's not '[', it's not ARRAY either
+ if (buffer_ptr[buffer_offset] != '[') {
+ return make_pair(JSONFormat::UNSTRUCTURED, JSONRecordType::VALUES);
+ }
+
+ // It's definitely an ARRAY, but now we have to figure out if there's more than one top-level array
+ yyjson_read_err error;
+ auto doc = JSONCommon::ReadDocumentUnsafe((char *)buffer_ptr + buffer_offset, remaining, JSONCommon::READ_STOP_FLAG,
+ alc, &error);
+ if (error.code == YYJSON_READ_SUCCESS) {
+ D_ASSERT(yyjson_is_arr(doc->root));
+
+ // We successfully read something!
+ buffer_offset += yyjson_doc_get_read_size(doc);
+ SkipWhitespace(buffer_ptr, buffer_offset, buffer_size);
+ remaining = buffer_size - buffer_offset;
+
+ if (remaining != 0) { // There's more
+ return make_pair(JSONFormat::UNSTRUCTURED, JSONRecordType::VALUES);
+ }
+
+ // Just one array, check what's in there
+ if (yyjson_arr_size(doc->root) == 0 || yyjson_is_obj(yyjson_arr_get(doc->root, 0))) {
+ // Either an empty array (assume records), or an array of objects
+ return make_pair(JSONFormat::ARRAY, JSONRecordType::RECORDS);
+ } else {
+ return make_pair(JSONFormat::ARRAY, JSONRecordType::VALUES);
+ }
+ }
+
+ // We weren't able to parse an array, could be broken or an array larger than our buffer size, let's skip over '['
+ SkipWhitespace(buffer_ptr, ++buffer_offset, --remaining);
+ remaining = buffer_size - buffer_offset;
+
+ // If it's '{' we know there's RECORDS in the ARRAY, else it's VALUES
+ if (remaining == 0 || buffer_ptr[buffer_offset] == '{') {
+ return make_pair(JSONFormat::ARRAY, JSONRecordType::RECORDS);
+ }
+
+ // It's not RECORDS, so it must be VALUES
+ return make_pair(JSONFormat::ARRAY, JSONRecordType::VALUES);
  }

  bool JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate) {
+ AllocatedData buffer;
  if (current_reader) {
- D_ASSERT(current_buffer_handle);
+ // Keep track of this for accurate errors
  current_reader->SetBufferLineOrObjectCount(current_buffer_handle->buffer_index, lines_or_objects_in_buffer);
- if (is_last && gstate.bind_data.type != JSONScanType::SAMPLE) {
- // Close files that are done if we're not sampling
- current_reader->CloseJSONFile();
+
+ // Try to re-use existing buffer
+ if (current_buffer_handle && --current_buffer_handle->readers == 0) {
+ buffer = current_reader->RemoveBuffer(current_buffer_handle->buffer_index);
+ } else {
+ buffer = gstate.allocator.Allocate(gstate.buffer_capacity);
  }
- }

- AllocatedData buffer;
- if (current_buffer_handle && --current_buffer_handle->readers == 0) {
- D_ASSERT(current_reader);
- // Take ownership of the last buffer this thread used and remove entry from map
- buffer = current_reader->RemoveBuffer(current_buffer_handle->buffer_index);
+ if (!is_last) {
+ if (current_reader->GetFormat() != JSONFormat::NEWLINE_DELIMITED) {
+ memcpy(buffer.get(), reconstruct_buffer.get(),
+ prev_buffer_remainder); // Copy last bit of previous buffer
+ }
+ } else {
+ if (gstate.bind_data.type != JSONScanType::SAMPLE) {
+ current_reader->CloseJSONFile(); // Close files that are done if we're not sampling
+ }
+ current_reader = nullptr;
+ }
  } else {
- // Allocate a new buffer
  buffer = gstate.allocator.Allocate(gstate.buffer_capacity);
  }
  buffer_ptr = (const char *)buffer.get();

- if (current_reader && current_reader->GetOptions().format == JSONFormat::UNSTRUCTURED) {
- // Copy last bit of previous buffer
- memcpy(buffer.get(), reconstruct_buffer.get(), prev_buffer_remainder);
- }
-
  idx_t buffer_index;
  while (true) {
  if (current_reader) {
- ReadNextBuffer(gstate, buffer_index);
- if (buffer_size != 0) {
+ ReadNextBufferInternal(gstate, buffer_index);
+ if (buffer_size == 0) {
+ if (is_last && gstate.bind_data.type != JSONScanType::SAMPLE) {
+ current_reader->CloseJSONFile();
+ }
+ if (current_reader->IsParallel()) {
+ // If this threads' current reader is still the one at gstate.file_index,
+ // this thread can end the parallel scan
+ lock_guard<mutex> guard(gstate.lock);
+ if (gstate.file_index < gstate.json_readers.size() &&
+ current_reader == gstate.json_readers[gstate.file_index].get()) {
+ gstate.file_index++; // End parallel scan
+ }
+ }
+ current_reader = nullptr;
+ } else {
  break; // We read something!
  }
  }

- // No reader, or exhausted current reader
- lock_guard<mutex> guard(gstate.lock);
- D_ASSERT(gstate.file_index <= gstate.json_readers.size());
- if (gstate.file_index == gstate.json_readers.size()) {
- return false; // No more files left
- }
- if (current_reader && current_reader == gstate.json_readers[gstate.file_index].get() &&
- current_reader->GetOptions().format == JSONFormat::NEWLINE_DELIMITED) {
- // We had a reader, but we didn't read anything, move to the next file
- gstate.file_index++;
- }
- // Check again since we may have just updated
- if (gstate.file_index == gstate.json_readers.size()) {
- return false; // No more files left
- }
-
- // Try the next reader
- current_reader = gstate.json_readers[gstate.file_index].get();
- auto &options = current_reader->GetOptions();
- if (current_reader->IsOpen()) {
- if (options.format == JSONFormat::UNSTRUCTURED ||
- (options.compression != FileCompressionType::UNCOMPRESSED &&
- gstate.file_index < gstate.json_readers.size())) {
- // Can only be open from schema detection
- batch_index = gstate.batch_index++;
- gstate.file_index++;
+ // This thread needs a new reader
+ {
+ lock_guard<mutex> guard(gstate.lock);
+ if (gstate.file_index == gstate.json_readers.size()) {
+ return false; // No more files left
  }
- continue; // It's open, this thread joins the scan
- }
-
- // Unopened file
- current_reader->OpenJSONFile();
- batch_index = gstate.batch_index++;
- if (options.format == JSONFormat::UNSTRUCTURED || (options.format == JSONFormat::NEWLINE_DELIMITED &&
- options.compression != FileCompressionType::UNCOMPRESSED &&
- gstate.file_index < gstate.json_readers.size())) {
- gstate.file_index++; // UNSTRUCTURED necessitates single-threaded read
- }
- if (options.format != JSONFormat::AUTO_DETECT) {
- continue; // Re-enter loop to proceed reading
- }

- // We have to detect whether it's UNSTRUCTURED/NEWLINE_DELIMITED - hold the gstate lock while we do this
- ReadNextBuffer(gstate, buffer_index);
- if (buffer_size == 0) {
- gstate.file_index++; // Empty file, move to the next one
- continue;
- }
+ // Try the next reader
+ current_reader = gstate.json_readers[gstate.file_index].get();
+ if (current_reader->IsOpen()) {
+ // Can only be open from auto detection, so these should be known
+ if (!current_reader->IsParallel()) {
+ batch_index = gstate.batch_index++;
+ gstate.file_index++;
+ }
+ continue; // Re-enter the loop to start scanning the assigned file
+ }

- auto line_end = NextNewline(buffer_ptr, buffer_size);
- if (line_end == nullptr) {
- options.format = JSONFormat::UNSTRUCTURED; // No newlines in buffer at all
- gstate.file_index++; // UNSTRUCTURED necessitates single-threaded read
- break;
- }
- idx_t line_size = line_end - buffer_ptr;
+ current_reader->OpenJSONFile();
+ batch_index = gstate.batch_index++;
+ if (current_reader->GetFormat() != JSONFormat::AUTO_DETECT) {
+ if (!current_reader->IsParallel()) {
+ gstate.file_index++;
+ }
+ continue;
+ }

- yyjson_read_err error;
- JSONCommon::ReadDocumentUnsafe((char *)buffer_ptr, line_size, JSONCommon::READ_FLAG,
- json_allocator.GetYYJSONAllocator(), &error);
- // Detected format depends on whether we can successfully read the first line
- if (error.code == YYJSON_READ_SUCCESS) {
- options.format = JSONFormat::NEWLINE_DELIMITED;
- } else {
- options.format = JSONFormat::UNSTRUCTURED;
- gstate.file_index++; // UNSTRUCTURED necessitates single-threaded read
- }
+ // If we have a low amount of files, we auto-detect within the lock,
+ // so other threads may join a parallel NDJSON scan
+ if (gstate.json_readers.size() < 100) {
+ if (ReadAndAutoDetect(gstate, buffer_index, false)) {
+ continue;
+ }
+ break;
+ }

- // Optimization: decompression limits parallelism quite a bit
- if (options.compression != FileCompressionType::UNCOMPRESSED &&
- gstate.file_index < gstate.json_readers.size()) {
+ // Increment the file index within the lock, then read/auto-detect outside of the lock
  gstate.file_index++;
  }

+ // High amount of files, just do 1 thread per file
+ if (ReadAndAutoDetect(gstate, buffer_index, true)) {
+ continue;
+ }
  break;
  }
  D_ASSERT(buffer_size != 0); // We should have read something if we got here

- idx_t readers;
- if (current_reader->GetOptions().format == JSONFormat::UNSTRUCTURED) {
- readers = 1;
- } else {
+ idx_t readers = 1;
+ if (current_reader->IsParallel()) {
  readers = is_last ? 1 : 2;
  }

@@ -535,24 +642,57 @@ bool JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate) {
  current_buffer_handle = json_buffer_handle.get();
  current_reader->InsertBuffer(buffer_index, std::move(json_buffer_handle));

- buffer_offset = 0;
  prev_buffer_remainder = 0;
  lines_or_objects_in_buffer = 0;

+ // YYJSON needs this
  memset((void *)(buffer_ptr + buffer_size), 0, YYJSON_PADDING_SIZE);
- if (current_reader->GetOptions().format == JSONFormat::UNSTRUCTURED) {
- memcpy((void *)buffer_copy_ptr, buffer_ptr, buffer_size + YYJSON_PADDING_SIZE);
- }

  return true;
  }

- void JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate, idx_t &buffer_index) {
+ bool JSONScanLocalState::ReadAndAutoDetect(JSONScanGlobalState &gstate, idx_t &buffer_index,
+ const bool already_incremented_file_idx) {
+ // We have to detect the JSON format - hold the gstate lock while we do this
+ ReadNextBufferInternal(gstate, buffer_index);
+ if (buffer_size == 0) {
+ if (!already_incremented_file_idx) {
+ gstate.file_index++; // Empty file, move to the next one
+ }
+ return true;
+ }
+
+ auto format_and_record_type = DetectFormatAndRecordType(buffer_ptr, buffer_size, allocator.GetYYAlc());
+ current_reader->SetFormat(format_and_record_type.first);
+ if (current_reader->GetRecordType() == JSONRecordType::AUTO_DETECT) {
+ current_reader->SetRecordType(format_and_record_type.second);
+ }
+ if (current_reader->GetFormat() == JSONFormat::ARRAY) {
+ SkipOverArrayStart();
+ }
+
+ if (bind_data.options.record_type == JSONRecordType::RECORDS &&
+ current_reader->GetRecordType() != JSONRecordType::RECORDS) {
+ throw InvalidInputException("Expected file \"%s\" to contain records, detected non-record JSON instead.",
+ current_reader->GetFileName());
+ }
+ if (!already_incremented_file_idx && !current_reader->IsParallel()) {
+ gstate.file_index++;
+ }
+ return false;
+ }
+
+ void JSONScanLocalState::ReadNextBufferInternal(JSONScanGlobalState &gstate, idx_t &buffer_index) {
  if (current_reader->GetFileHandle().CanSeek()) {
  ReadNextBufferSeek(gstate, buffer_index);
  } else {
  ReadNextBufferNoSeek(gstate, buffer_index);
  }
+
+ buffer_offset = 0;
+ if (buffer_index == 0 && current_reader->GetFormat() == JSONFormat::ARRAY) {
+ SkipOverArrayStart();
+ }
  }

  void JSONScanLocalState::ReadNextBufferSeek(JSONScanGlobalState &gstate, idx_t &buffer_index) {
@@ -567,13 +707,13 @@ void JSONScanLocalState::ReadNextBufferSeek(JSONScanGlobalState &gstate, idx_t &
  buffer_index = current_reader->GetBufferIndex();

  read_size = file_handle.GetPositionAndSize(read_position, request_size);
- is_last = file_handle.Remaining() == 0;
+ is_last = read_size < request_size;

  if (!gstate.bind_data.ignore_errors && read_size == 0 && prev_buffer_remainder != 0) {
- throw InvalidInputException("Invalid JSON detected at the end of file %s", current_reader->file_path);
+ ThrowInvalidAtEndError();
  }

- if (current_reader->GetOptions().format == JSONFormat::NEWLINE_DELIMITED) {
+ if (current_reader->GetFormat() == JSONFormat::NEWLINE_DELIMITED) {
  batch_index = gstate.batch_index++;
  }
  }
@@ -604,10 +744,10 @@ void JSONScanLocalState::ReadNextBufferNoSeek(JSONScanGlobalState &gstate, idx_t
  is_last = read_size < request_size;

  if (!gstate.bind_data.ignore_errors && read_size == 0 && prev_buffer_remainder != 0) {
- throw InvalidInputException("Invalid JSON detected at the end of file %s", current_reader->file_path);
+ ThrowInvalidAtEndError();
  }

- if (current_reader->GetOptions().format == JSONFormat::NEWLINE_DELIMITED) {
+ if (current_reader->GetFormat() == JSONFormat::NEWLINE_DELIMITED) {
  batch_index = gstate.batch_index++;
  }
  }
@@ -618,9 +758,38 @@ void JSONScanLocalState::ReadNextBufferNoSeek(JSONScanGlobalState &gstate, idx_t
  }
  }

+ void JSONScanLocalState::SkipOverArrayStart() {
+ // First read of this buffer, check if it's actually an array and skip over the bytes
+ SkipWhitespace(buffer_ptr, buffer_offset, buffer_size);
+ if (buffer_offset == buffer_size) {
+ return; // Empty file
+ }
+ if (buffer_ptr[buffer_offset] != '[') {
+ throw InvalidInputException(
+ "Expected top-level JSON array with format='array', but first character is '%c' in file \"%s\"."
+ "\n Try setting format='auto' or format='newline_delimited'.",
+ buffer_ptr[buffer_offset], current_reader->GetFileName());
+ }
+ SkipWhitespace(buffer_ptr, ++buffer_offset, buffer_size);
+ if (buffer_offset >= buffer_size) {
+ throw InvalidInputException("Missing closing brace ']' in JSON array with format='array' in file \"%s\"",
+ current_reader->GetFileName());
+ }
+ if (buffer_ptr[buffer_offset] == ']') {
+ // Empty array
+ SkipWhitespace(buffer_ptr, ++buffer_offset, buffer_size);
+ if (buffer_offset != buffer_size) {
+ throw InvalidInputException(
+ "Empty array with trailing data when parsing JSON array with format='array' in file \"%s\"",
+ current_reader->GetFileName());
+ }
+ return;
+ }
+ }
+
  void JSONScanLocalState::ReconstructFirstObject(JSONScanGlobalState &gstate) {
  D_ASSERT(current_buffer_handle->buffer_index != 0);
- D_ASSERT(current_reader->GetOptions().format == JSONFormat::NEWLINE_DELIMITED);
+ D_ASSERT(current_reader->GetFormat() == JSONFormat::NEWLINE_DELIMITED);

  // Spinlock until the previous batch index has also read its buffer
  JSONBufferHandle *previous_buffer_handle = nullptr;
@@ -638,9 +807,8 @@ void JSONScanLocalState::ReconstructFirstObject(JSONScanGlobalState &gstate) {
  memcpy(reconstruct_ptr, part1_ptr, part1_size);
  // Now find the newline in the current block
  auto line_end = NextNewline(buffer_ptr, buffer_size);
- if (line_end == nullptr) { // TODO I don't think we can ignore this even with ignore_errors ...
- throw InvalidInputException("maximum_object_size of %llu bytes exceeded (>%llu bytes), is the JSON valid?",
- bind_data.maximum_object_size, buffer_size - buffer_offset);
+ if (line_end == nullptr) {
+ ThrowObjectSizeError(buffer_size - buffer_offset);
  } else {
  line_end++;
  }
@@ -648,8 +816,7 @@ void JSONScanLocalState::ReconstructFirstObject(JSONScanGlobalState &gstate) {

  idx_t line_size = part1_size + part2_size;
  if (line_size > bind_data.maximum_object_size) {
- throw InvalidInputException("maximum_object_size of %llu bytes exceeded (%llu bytes), is the JSON valid?",
- bind_data.maximum_object_size, line_size);
+ ThrowObjectSizeError(line_size);
  }

  // And copy the remainder of the line to the reconstruct buffer
@@ -662,85 +829,68 @@ void JSONScanLocalState::ReconstructFirstObject(JSONScanGlobalState &gstate) {
  current_reader->RemoveBuffer(current_buffer_handle->buffer_index - 1);
  }

- values[0] = ParseLine((char *)reconstruct_ptr, line_size, line_size, lines[0]);
+ ParseJSON((char *)reconstruct_ptr, line_size, line_size);
  }

- void JSONScanLocalState::ReadUnstructured(idx_t &count) {
- // yyjson does not always return YYJSON_READ_ERROR_UNEXPECTED_END properly
- // if a different error code happens within the last 50 bytes
- // we assume it should be YYJSON_READ_ERROR_UNEXPECTED_END instead
- static constexpr idx_t END_BOUND = 50;
-
- const auto max_obj_size = reconstruct_buffer.GetSize();
- yyjson_read_err error;
- for (; count < STANDARD_VECTOR_SIZE; count++) {
- const auto obj_start = buffer_ptr + buffer_offset;
- const auto obj_copy_start = buffer_copy_ptr + buffer_offset;
-
- idx_t remaining = buffer_size - buffer_offset;
- if (remaining == 0) {
- break;
- }
-
- // Read next JSON doc
- auto read_doc = JSONCommon::ReadDocumentUnsafe((char *)obj_start, remaining, JSONCommon::STOP_READ_FLAG,
- json_allocator.GetYYJSONAllocator(), &error);
- if (error.code == YYJSON_READ_SUCCESS) {
- idx_t line_size = yyjson_doc_get_read_size(read_doc);
- lines[count] = JSONLine(obj_copy_start, line_size);
- TrimWhitespace(lines[count]);
+ void JSONScanLocalState::ParseNextChunk() {
+ auto buffer_offset_before = buffer_offset;

- buffer_offset += line_size;
- SkipWhitespace(buffer_ptr, buffer_offset, buffer_size);
- lines_or_objects_in_buffer++;
- } else if (error.pos > max_obj_size) {
- current_reader->ThrowParseError(current_buffer_handle->buffer_index, lines_or_objects_in_buffer, error,
- "Try increasing \"maximum_object_size\".");
- } else if (!is_last && (error.code == YYJSON_READ_ERROR_UNEXPECTED_END || remaining - error.pos < END_BOUND)) {
- // Copy remaining to reconstruct_buffer
- const auto reconstruct_ptr = reconstruct_buffer.get();
- memcpy(reconstruct_ptr, obj_copy_start, remaining);
- prev_buffer_remainder = remaining;
- buffer_offset = buffer_size;
- break;
- } else {
- current_reader->ThrowParseError(current_buffer_handle->buffer_index, lines_or_objects_in_buffer, error);
- }
- values[count] = read_doc->root;
- }
- }
-
- void JSONScanLocalState::ReadNewlineDelimited(idx_t &count) {
- for (; count < STANDARD_VECTOR_SIZE; count++) {
- auto line_start = buffer_ptr + buffer_offset;
+ const auto format = current_reader->GetFormat();
+ for (; scan_count < STANDARD_VECTOR_SIZE; scan_count++) {
+ SkipWhitespace(buffer_ptr, buffer_offset, buffer_size);
+ auto json_start = buffer_ptr + buffer_offset;
  idx_t remaining = buffer_size - buffer_offset;
  if (remaining == 0) {
  break;
  }
-
- // Search for newline
- auto line_end = NextNewline(line_start, remaining);
-
- if (line_end == nullptr) {
+ const char *json_end = format == JSONFormat::NEWLINE_DELIMITED ? NextNewline(json_start, remaining)
+ : NextJSON(json_start, remaining);
+ if (json_end == nullptr) {
  // We reached the end of the buffer
  if (!is_last) {
  // Last bit of data belongs to the next batch
+ if (format != JSONFormat::NEWLINE_DELIMITED) {
+ if (scan_count == 0) {
+ ThrowObjectSizeError(remaining);
+ }
+ memcpy(reconstruct_buffer.get(), json_start, remaining);
+ prev_buffer_remainder = remaining;
+ }
  buffer_offset = buffer_size;
  break;
  }
- line_end = line_start + remaining;
+ json_end = json_start + remaining;
  }
- idx_t line_size = line_end - line_start;

- values[count] = ParseLine((char *)line_start, line_size, remaining, lines[count]);
+ idx_t json_size = json_end - json_start;
+ ParseJSON((char *)json_start, json_size, remaining);
+ buffer_offset += json_size;

- buffer_offset += line_size;
+ if (format == JSONFormat::ARRAY) {
+ SkipWhitespace(buffer_ptr, buffer_offset, buffer_size);
+ if (buffer_ptr[buffer_offset] == ',' || buffer_ptr[buffer_offset] == ']') {
+ buffer_offset++;
+ } else { // We can't ignore this error, even with 'ignore_errors'
+ yyjson_read_err err;
+ err.code = YYJSON_READ_ERROR_UNEXPECTED_CHARACTER;
+ err.msg = "unexpected character";
+ err.pos = json_size;
+ current_reader->ThrowParseError(current_buffer_handle->buffer_index, lines_or_objects_in_buffer, err);
+ }
+ }
  SkipWhitespace(buffer_ptr, buffer_offset, buffer_size);
  }
+
+ total_read_size += buffer_offset - buffer_offset_before;
+ total_tuple_count += scan_count;
  }

  yyjson_alc *JSONScanLocalState::GetAllocator() {
- return json_allocator.GetYYJSONAllocator();
+ return allocator.GetYYAlc();
+ }
+
+ const MultiFileReaderData &JSONScanLocalState::GetReaderData() const {
+ return current_reader->reader_data;
  }

  void JSONScanLocalState::ThrowTransformError(idx_t object_index, const string &error_message) {
@@ -751,4 +901,73 @@ void JSONScanLocalState::ThrowTransformError(idx_t object_index, const string &e
  current_reader->ThrowTransformError(current_buffer_handle->buffer_index, line_or_object_in_buffer, error_message);
  }

+ double JSONScan::ScanProgress(ClientContext &context, const FunctionData *bind_data_p,
+ const GlobalTableFunctionState *global_state) {
+ auto &gstate = global_state->Cast<JSONGlobalTableFunctionState>().state;
+ double progress = 0;
+ for (auto &reader : gstate.json_readers) {
+ progress += reader->GetProgress();
+ }
+ return progress / double(gstate.json_readers.size());
+ }
+
+ idx_t JSONScan::GetBatchIndex(ClientContext &context, const FunctionData *bind_data_p,
+ LocalTableFunctionState *local_state, GlobalTableFunctionState *global_state) {
+ auto &lstate = local_state->Cast<JSONLocalTableFunctionState>();
+ return lstate.GetBatchIndex();
+ }
+
+ unique_ptr<NodeStatistics> JSONScan::Cardinality(ClientContext &context, const FunctionData *bind_data) {
+ auto &data = bind_data->Cast<JSONScanData>();
+ idx_t per_file_cardinality;
+ if (data.initial_reader && data.initial_reader->IsOpen()) {
+ per_file_cardinality = data.initial_reader->GetFileHandle().FileSize() / data.avg_tuple_size;
+ } else {
+ per_file_cardinality = 42; // The cardinality of an unknown JSON file is the almighty number 42
+ }
+ return make_uniq<NodeStatistics>(per_file_cardinality * data.files.size());
+ }
+
+ void JSONScan::ComplexFilterPushdown(ClientContext &context, LogicalGet &get, FunctionData *bind_data_p,
+ vector<unique_ptr<Expression>> &filters) {
+ auto &data = bind_data_p->Cast<JSONScanData>();
+ auto reset_reader =
+ MultiFileReader::ComplexFilterPushdown(context, data.files, data.options.file_options, get, filters);
+ if (reset_reader) {
+ MultiFileReader::PruneReaders(data);
+ }
+ }
+
+ void JSONScan::Serialize(FieldWriter &writer, const FunctionData *bind_data_p, const TableFunction &function) {
+ auto &bind_data = bind_data_p->Cast<JSONScanData>();
+ bind_data.Serialize(writer);
+ }
+
+ unique_ptr<FunctionData> JSONScan::Deserialize(ClientContext &context, FieldReader &reader, TableFunction &function) {
+ auto result = make_uniq<JSONScanData>();
+ result->Deserialize(context, reader);
+ return std::move(result);
+ }
+
+ void JSONScan::TableFunctionDefaults(TableFunction &table_function) {
+ MultiFileReader::AddParameters(table_function);
+
+ table_function.named_parameters["maximum_object_size"] = LogicalType::UINTEGER;
+ table_function.named_parameters["ignore_errors"] = LogicalType::BOOLEAN;
+ table_function.named_parameters["format"] = LogicalType::VARCHAR;
+ table_function.named_parameters["compression"] = LogicalType::VARCHAR;
+
+ table_function.table_scan_progress = ScanProgress;
+ table_function.get_batch_index = GetBatchIndex;
+ table_function.cardinality = Cardinality;
+
+ table_function.serialize = Serialize;
+ table_function.deserialize = Deserialize;
+
+ table_function.projection_pushdown = true;
+ table_function.filter_pushdown = false;
+ table_function.filter_prune = false;
+ table_function.pushdown_complex_filter = ComplexFilterPushdown;
+ }
+
  } // namespace duckdb
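
The hunks above collapse the separate ReadUnstructured and ReadNewlineDelimited paths into a single ParseNextChunk driven by the reader's detected format, add SkipOverArrayStart for the new format='array' mode, and register format, maximum_object_size, ignore_errors, and compression as named parameters in TableFunctionDefaults. As a rough illustration only (not taken from this diff), a minimal embedded C++ sketch that exercises those named parameters is shown below; it assumes the bundled json extension is available, and the file name records.json and the option values are hypothetical.

    // Sketch: calling a JSON table function whose defaults are set by TableFunctionDefaults above.
    // Assumes an official DuckDB build with the json extension; "records.json" is an illustrative
    // file expected to contain a top-level JSON array.
    #include "duckdb.hpp"

    #include <iostream>

    int main() {
        duckdb::DuckDB db(nullptr); // in-memory database
        duckdb::Connection con(db);

        // format='array' exercises the SkipOverArrayStart / NextJSON path added in this diff;
        // maximum_object_size bounds a single JSON value (enforced via ThrowObjectSizeError).
        auto result = con.Query("SELECT * FROM read_json_objects('records.json', "
                                "format='array', maximum_object_size=1048576)");
        if (result->HasError()) {
            std::cerr << result->GetError() << std::endl;
            return 1;
        }
        std::cout << result->ToString() << std::endl;
        return 0;
    }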