duckdb 0.7.2-dev3515.0 → 0.7.2-dev3666.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (133)
  1. package/configure.py +2 -0
  2. package/package.json +1 -1
  3. package/src/database.cpp +1 -0
  4. package/src/duckdb/extension/json/buffered_json_reader.cpp +56 -17
  5. package/src/duckdb/extension/json/include/buffered_json_reader.hpp +56 -31
  6. package/src/duckdb/extension/json/include/json_common.hpp +5 -4
  7. package/src/duckdb/extension/json/include/json_executors.hpp +13 -18
  8. package/src/duckdb/extension/json/include/json_functions.hpp +3 -0
  9. package/src/duckdb/extension/json/include/json_scan.hpp +106 -153
  10. package/src/duckdb/extension/json/include/json_transform.hpp +2 -2
  11. package/src/duckdb/extension/json/json_common.cpp +1 -1
  12. package/src/duckdb/extension/json/json_functions/copy_json.cpp +94 -38
  13. package/src/duckdb/extension/json/json_functions/json_contains.cpp +7 -8
  14. package/src/duckdb/extension/json/json_functions/json_create.cpp +7 -7
  15. package/src/duckdb/extension/json/json_functions/json_merge_patch.cpp +4 -4
  16. package/src/duckdb/extension/json/json_functions/json_serialize_sql.cpp +4 -4
  17. package/src/duckdb/extension/json/json_functions/json_structure.cpp +7 -5
  18. package/src/duckdb/extension/json/json_functions/json_transform.cpp +10 -8
  19. package/src/duckdb/extension/json/json_functions/json_valid.cpp +1 -1
  20. package/src/duckdb/extension/json/json_functions/read_json.cpp +167 -169
  21. package/src/duckdb/extension/json/json_functions/read_json_objects.cpp +37 -16
  22. package/src/duckdb/extension/json/json_functions.cpp +11 -4
  23. package/src/duckdb/extension/json/json_scan.cpp +593 -374
  24. package/src/duckdb/extension/parquet/parquet-extension.cpp +5 -0
  25. package/src/duckdb/src/catalog/catalog_entry/macro_catalog_entry.cpp +42 -0
  26. package/src/duckdb/src/catalog/catalog_search_path.cpp +5 -0
  27. package/src/duckdb/src/catalog/catalog_set.cpp +1 -1
  28. package/src/duckdb/src/common/constants.cpp +1 -0
  29. package/src/duckdb/src/common/file_system.cpp +26 -6
  30. package/src/duckdb/src/common/local_file_system.cpp +0 -13
  31. package/src/duckdb/src/common/types/vector.cpp +3 -3
  32. package/src/duckdb/src/common/types/vector_buffer.cpp +11 -3
  33. package/src/duckdb/src/common/types/vector_cache.cpp +5 -5
  34. package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp +12 -6
  35. package/src/duckdb/src/execution/operator/persistent/csv_reader_options.cpp +10 -0
  36. package/src/duckdb/src/execution/operator/schema/physical_create_type.cpp +2 -2
  37. package/src/duckdb/src/function/macro_function.cpp +43 -0
  38. package/src/duckdb/src/function/pragma/pragma_queries.cpp +5 -3
  39. package/src/duckdb/src/function/scalar/strftime_format.cpp +1 -0
  40. package/src/duckdb/src/function/scalar_macro_function.cpp +10 -0
  41. package/src/duckdb/src/function/table/copy_csv.cpp +68 -18
  42. package/src/duckdb/src/function/table/read_csv.cpp +30 -3
  43. package/src/duckdb/src/function/table/version/pragma_version.cpp +8 -2
  44. package/src/duckdb/src/function/table_macro_function.cpp +10 -0
  45. package/src/duckdb/src/include/duckdb/catalog/catalog_entry/column_dependency_manager.hpp +1 -1
  46. package/src/duckdb/src/include/duckdb/catalog/catalog_entry/macro_catalog_entry.hpp +3 -1
  47. package/src/duckdb/src/include/duckdb/catalog/catalog_entry/scalar_macro_catalog_entry.hpp +0 -6
  48. package/src/duckdb/src/include/duckdb/catalog/catalog_entry/table_macro_catalog_entry.hpp +0 -6
  49. package/src/duckdb/src/include/duckdb/catalog/catalog_search_path.hpp +1 -1
  50. package/src/duckdb/src/include/duckdb/catalog/similar_catalog_entry.hpp +1 -1
  51. package/src/duckdb/src/include/duckdb/common/constants.hpp +2 -0
  52. package/src/duckdb/src/include/duckdb/common/exception.hpp +3 -3
  53. package/src/duckdb/src/include/duckdb/common/field_writer.hpp +3 -3
  54. package/src/duckdb/src/include/duckdb/common/file_system.hpp +5 -0
  55. package/src/duckdb/src/include/duckdb/common/http_state.hpp +2 -1
  56. package/src/duckdb/src/include/duckdb/common/hugeint.hpp +6 -6
  57. package/src/duckdb/src/include/duckdb/common/limits.hpp +46 -46
  58. package/src/duckdb/src/include/duckdb/common/operator/cast_operators.hpp +8 -8
  59. package/src/duckdb/src/include/duckdb/common/operator/comparison_operators.hpp +6 -6
  60. package/src/duckdb/src/include/duckdb/common/operator/convert_to_string.hpp +1 -1
  61. package/src/duckdb/src/include/duckdb/common/operator/decimal_cast_operators.hpp +2 -4
  62. package/src/duckdb/src/include/duckdb/common/operator/string_cast.hpp +1 -1
  63. package/src/duckdb/src/include/duckdb/common/operator/subtract.hpp +1 -1
  64. package/src/duckdb/src/include/duckdb/common/preserved_error.hpp +1 -1
  65. package/src/duckdb/src/include/duckdb/common/re2_regex.hpp +1 -1
  66. package/src/duckdb/src/include/duckdb/common/string_util.hpp +7 -7
  67. package/src/duckdb/src/include/duckdb/common/types/chunk_collection.hpp +10 -10
  68. package/src/duckdb/src/include/duckdb/common/types/column/column_data_collection.hpp +12 -12
  69. package/src/duckdb/src/include/duckdb/common/types/column/column_data_collection_iterators.hpp +2 -2
  70. package/src/duckdb/src/include/duckdb/common/types/value.hpp +1 -1
  71. package/src/duckdb/src/include/duckdb/common/types/vector_buffer.hpp +12 -2
  72. package/src/duckdb/src/include/duckdb/common/types.hpp +2 -2
  73. package/src/duckdb/src/include/duckdb/common/winapi.hpp +1 -1
  74. package/src/duckdb/src/include/duckdb/execution/expression_executor_state.hpp +1 -1
  75. package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_reader_options.hpp +9 -5
  76. package/src/duckdb/src/include/duckdb/execution/operator/schema/physical_create_type.hpp +1 -1
  77. package/src/duckdb/src/include/duckdb/function/aggregate_function.hpp +10 -14
  78. package/src/duckdb/src/include/duckdb/function/macro_function.hpp +7 -1
  79. package/src/duckdb/src/include/duckdb/function/scalar/strftime_format.hpp +3 -4
  80. package/src/duckdb/src/include/duckdb/function/scalar_macro_function.hpp +7 -2
  81. package/src/duckdb/src/include/duckdb/function/table_function.hpp +1 -1
  82. package/src/duckdb/src/include/duckdb/function/table_macro_function.hpp +5 -0
  83. package/src/duckdb/src/include/duckdb/function/udf_function.hpp +56 -50
  84. package/src/duckdb/src/include/duckdb/main/appender.hpp +2 -2
  85. package/src/duckdb/src/include/duckdb/main/client_context.hpp +2 -2
  86. package/src/duckdb/src/include/duckdb/main/client_data.hpp +3 -1
  87. package/src/duckdb/src/include/duckdb/main/connection.hpp +8 -9
  88. package/src/duckdb/src/include/duckdb/main/extension_entries.hpp +1 -0
  89. package/src/duckdb/src/include/duckdb/main/query_result.hpp +3 -3
  90. package/src/duckdb/src/include/duckdb/main/relation.hpp +6 -7
  91. package/src/duckdb/src/include/duckdb/optimizer/optimizer_extension.hpp +1 -1
  92. package/src/duckdb/src/include/duckdb/parser/column_list.hpp +7 -7
  93. package/src/duckdb/src/include/duckdb/parser/parsed_data/attach_info.hpp +4 -7
  94. package/src/duckdb/src/include/duckdb/parser/parsed_data/create_macro_info.hpp +8 -12
  95. package/src/duckdb/src/include/duckdb/parser/parsed_data/create_sequence_info.hpp +6 -20
  96. package/src/duckdb/src/include/duckdb/parser/parsed_data/create_type_info.hpp +6 -18
  97. package/src/duckdb/src/include/duckdb/parser/parsed_data/detach_info.hpp +4 -8
  98. package/src/duckdb/src/include/duckdb/parser/parsed_data/drop_info.hpp +4 -38
  99. package/src/duckdb/src/include/duckdb/parser/parsed_data/transaction_info.hpp +5 -2
  100. package/src/duckdb/src/include/duckdb/parser/parsed_data/vacuum_info.hpp +10 -10
  101. package/src/duckdb/src/include/duckdb/parser/parser_extension.hpp +2 -2
  102. package/src/duckdb/src/include/duckdb/parser/sql_statement.hpp +1 -1
  103. package/src/duckdb/src/include/duckdb/parser/statement/select_statement.hpp +1 -1
  104. package/src/duckdb/src/include/duckdb/planner/operator_extension.hpp +2 -2
  105. package/src/duckdb/src/include/duckdb/storage/storage_extension.hpp +2 -2
  106. package/src/duckdb/src/parser/parsed_data/attach_info.cpp +42 -0
  107. package/src/duckdb/src/parser/parsed_data/create_index_info.cpp +0 -7
  108. package/src/duckdb/src/parser/parsed_data/create_info.cpp +19 -8
  109. package/src/duckdb/src/parser/parsed_data/create_macro_info.cpp +46 -0
  110. package/src/duckdb/src/parser/parsed_data/create_sequence_info.cpp +56 -0
  111. package/src/duckdb/src/parser/parsed_data/create_type_info.cpp +47 -0
  112. package/src/duckdb/src/parser/parsed_data/detach_info.cpp +34 -0
  113. package/src/duckdb/src/parser/parsed_data/drop_info.cpp +46 -0
  114. package/src/duckdb/src/parser/parsed_data/transaction_info.cpp +24 -0
  115. package/src/duckdb/src/parser/parsed_data/vacuum_info.cpp +37 -0
  116. package/src/duckdb/src/planner/binder/expression/bind_star_expression.cpp +27 -9
  117. package/src/duckdb/src/planner/binder/statement/bind_copy.cpp +9 -4
  118. package/src/duckdb/src/planner/binder/statement/bind_create.cpp +2 -1
  119. package/src/duckdb/src/planner/binder/statement/bind_create_table.cpp +1 -0
  120. package/src/duckdb/src/planner/binder/tableref/bind_basetableref.cpp +1 -1
  121. package/src/duckdb/src/planner/logical_operator.cpp +1 -2
  122. package/src/duckdb/src/planner/operator/logical_create_index.cpp +16 -25
  123. package/src/duckdb/src/planner/operator/logical_insert.cpp +30 -0
  124. package/src/duckdb/src/planner/operator/logical_simple.cpp +33 -5
  125. package/src/duckdb/src/planner/parsed_data/bound_create_table_info.cpp +6 -16
  126. package/src/duckdb/src/planner/planner.cpp +4 -13
  127. package/src/duckdb/src/storage/checkpoint_manager.cpp +12 -6
  128. package/src/duckdb/src/storage/single_file_block_manager.cpp +0 -4
  129. package/src/duckdb/src/storage/storage_info.cpp +1 -1
  130. package/src/duckdb/third_party/libpg_query/src_backend_parser_gram.cpp +5735 -5773
  131. package/src/duckdb/ub_src_catalog_catalog_entry.cpp +1 -1
  132. package/src/duckdb/ub_src_parser_parsed_data.cpp +16 -0
  133. package/src/duckdb/src/catalog/catalog_entry/scalar_macro_catalog_entry.cpp +0 -104
@@ -8,88 +8,83 @@ namespace duckdb {
8
8
 
9
9
  void JSONScan::AutoDetect(ClientContext &context, JSONScanData &bind_data, vector<LogicalType> &return_types,
10
10
  vector<string> &names) {
11
- auto original_scan_type = bind_data.type;
12
- bind_data.type = JSONScanType::SAMPLE; // Set scan type to sample for the auto-detect, we restore it later
13
- JSONScanGlobalState gstate(context, bind_data);
14
- JSONScanLocalState lstate(context, gstate);
15
- ArenaAllocator allocator(BufferAllocator::Get(context));
11
+ // Change scan type during detection
12
+ bind_data.type = JSONScanType::SAMPLE;
16
13
 
17
- // Read for the specified sample size
14
+ // These are used across files (if union_by_name)
18
15
  JSONStructureNode node;
19
- bool more_than_one = false;
16
+ ArenaAllocator allocator(BufferAllocator::Get(context));
20
17
  Vector string_vector(LogicalType::VARCHAR);
21
- idx_t remaining = bind_data.sample_size;
22
- while (remaining != 0) {
23
- allocator.Reset();
24
-
25
- if (gstate.file_index >= 10) {
26
- // We really shouldn't open more than 10 files when sampling
27
- break;
28
- }
29
18
 
30
- auto read_count = lstate.ReadNext(gstate);
31
- if (lstate.scan_count > 1) {
32
- more_than_one = true;
33
- }
34
- if (read_count == 0) {
35
- break;
36
- }
37
- idx_t next = MinValue<idx_t>(read_count, remaining);
38
- yyjson_val **values;
39
- if (bind_data.record_type == JSONRecordType::ARRAY_OF_RECORDS ||
40
- bind_data.record_type == JSONRecordType::ARRAY_OF_JSON) {
41
- values = lstate.array_values;
19
+ // Loop through the files (if union_by_name, else just sample the first file)
20
+ for (idx_t file_idx = 0; file_idx < bind_data.files.size(); file_idx++) {
21
+ // Create global/local state and place the reader in the right field
22
+ JSONScanGlobalState gstate(context, bind_data);
23
+ JSONScanLocalState lstate(context, gstate);
24
+ if (file_idx == 0) {
25
+ gstate.json_readers.emplace_back(bind_data.initial_reader.get());
42
26
  } else {
43
- values = lstate.values;
27
+ gstate.json_readers.emplace_back(bind_data.union_readers[file_idx - 1].get());
44
28
  }
45
- for (idx_t i = 0; i < next; i++) {
46
- if (values[i]) {
47
- JSONStructure::ExtractStructure(values[i], node);
29
+
30
+ // Read and detect schema
31
+ idx_t remaining = bind_data.sample_size;
32
+ while (remaining != 0) {
33
+ allocator.Reset();
34
+ auto read_count = lstate.ReadNext(gstate);
35
+ if (read_count == 0) {
36
+ break;
37
+ }
38
+
39
+ idx_t next = MinValue<idx_t>(read_count, remaining);
40
+ for (idx_t i = 0; i < next; i++) {
41
+ const auto &val = lstate.values[i];
42
+ if (val) {
43
+ JSONStructure::ExtractStructure(val, node);
44
+ }
48
45
  }
46
+ if (!node.ContainsVarchar()) { // Can't refine non-VARCHAR types
47
+ continue;
48
+ }
49
+ node.InitializeCandidateTypes(bind_data.max_depth);
50
+ node.RefineCandidateTypes(lstate.values, next, string_vector, allocator, bind_data.date_format_map);
51
+ remaining -= next;
52
+ }
53
+
54
+ if (file_idx == 0 && lstate.total_tuple_count != 0) {
55
+ bind_data.avg_tuple_size = lstate.total_read_size / lstate.total_tuple_count;
49
56
  }
50
- if (!node.ContainsVarchar()) { // Can't refine non-VARCHAR types
51
- continue;
57
+
58
+ // Close the file and stop detection if not union_by_name
59
+ if (!bind_data.options.file_options.union_by_name) {
60
+ break;
52
61
  }
53
- node.InitializeCandidateTypes(bind_data.max_depth);
54
- node.RefineCandidateTypes(values, next, string_vector, allocator, bind_data.date_format_map);
55
- remaining -= next;
56
62
  }
57
- bind_data.type = original_scan_type;
63
+
64
+ // Restore the scan type
65
+ bind_data.type = JSONScanType::READ_JSON;
58
66
 
59
67
  // Convert structure to logical type
60
68
  auto type = JSONStructure::StructureToType(context, node, bind_data.max_depth);
61
69
 
62
- // Detect record type
63
- if (bind_data.record_type == JSONRecordType::AUTO) {
64
- switch (type.id()) {
65
- case LogicalTypeId::STRUCT:
66
- bind_data.record_type = JSONRecordType::RECORDS;
67
- break;
68
- case LogicalTypeId::LIST: {
69
- if (more_than_one) {
70
- bind_data.record_type = JSONRecordType::JSON;
71
- } else {
72
- type = ListType::GetChildType(type);
73
- if (type.id() == LogicalTypeId::STRUCT) {
74
- bind_data.record_type = JSONRecordType::ARRAY_OF_RECORDS;
75
- } else {
76
- bind_data.record_type = JSONRecordType::ARRAY_OF_JSON;
77
- }
78
- }
79
- break;
80
- }
81
- default:
82
- bind_data.record_type = JSONRecordType::JSON;
70
+ // Auto-detect record type
71
+ if (bind_data.options.record_type == JSONRecordType::AUTO_DETECT) {
72
+ if (type.id() == LogicalTypeId::STRUCT) {
73
+ bind_data.options.record_type = JSONRecordType::RECORDS;
74
+ } else {
75
+ bind_data.options.record_type = JSONRecordType::VALUES;
83
76
  }
84
77
  }
85
78
 
86
- // Detect return type
87
- if (bind_data.auto_detect) {
88
- bind_data.transform_options.date_format_map = &bind_data.date_format_map;
89
- if (type.id() != LogicalTypeId::STRUCT) {
90
- return_types.emplace_back(type);
91
- names.emplace_back("json");
92
- } else {
79
+ if (!bind_data.auto_detect) {
80
+ return;
81
+ }
82
+
83
+ bind_data.transform_options.date_format_map = &bind_data.date_format_map;
84
+
85
+ // Auto-detect columns
86
+ if (bind_data.options.record_type == JSONRecordType::RECORDS) {
87
+ if (type.id() == LogicalTypeId::STRUCT) {
93
88
  const auto &child_types = StructType::GetChildTypes(type);
94
89
  return_types.reserve(child_types.size());
95
90
  names.reserve(child_types.size());
@@ -97,26 +92,29 @@ void JSONScan::AutoDetect(ClientContext &context, JSONScanData &bind_data, vecto
97
92
  return_types.emplace_back(child_type.second);
98
93
  names.emplace_back(child_type.first);
99
94
  }
95
+ } else {
96
+ throw BinderException("json_read expected records, but got non-record JSON instead."
97
+ "\n Try setting records='auto' or records='false'.");
100
98
  }
99
+ } else {
100
+ D_ASSERT(bind_data.options.record_type == JSONRecordType::VALUES);
101
+ return_types.emplace_back(type);
102
+ names.emplace_back("json");
101
103
  }
102
-
103
- for (auto &reader : gstate.json_readers) {
104
- if (reader->IsOpen()) {
105
- reader->Reset();
106
- }
107
- }
108
- bind_data.stored_readers = std::move(gstate.json_readers);
109
104
  }
110
105
 
111
- void JSONScan::InitializeBindData(ClientContext &context, JSONScanData &bind_data,
112
- const named_parameter_map_t &named_parameters, vector<string> &names,
113
- vector<LogicalType> &return_types) {
114
- for (auto &kv : named_parameters) {
106
+ unique_ptr<FunctionData> ReadJSONBind(ClientContext &context, TableFunctionBindInput &input,
107
+ vector<LogicalType> &return_types, vector<string> &names) {
108
+ // First bind default params
109
+ auto bind_data = make_uniq<JSONScanData>();
110
+ bind_data->Bind(context, input);
111
+
112
+ for (auto &kv : input.named_parameters) {
115
113
  auto loption = StringUtil::Lower(kv.first);
116
114
  if (loption == "columns") {
117
115
  auto &child_type = kv.second.type();
118
116
  if (child_type.id() != LogicalTypeId::STRUCT) {
119
- throw BinderException("read_json \"columns\" parameter requires a struct as input");
117
+ throw BinderException("read_json \"columns\" parameter requires a struct as input.");
120
118
  }
121
119
  auto &struct_children = StructValue::GetChildren(kv.second);
122
120
  D_ASSERT(StructType::GetChildCount(child_type) == struct_children.size());
@@ -125,157 +123,158 @@ void JSONScan::InitializeBindData(ClientContext &context, JSONScanData &bind_dat
125
123
  auto &val = struct_children[i];
126
124
  names.push_back(name);
127
125
  if (val.type().id() != LogicalTypeId::VARCHAR) {
128
- throw BinderException("read_json \"columns\" parameter type specification must be VARCHAR");
126
+ throw BinderException("read_json \"columns\" parameter type specification must be VARCHAR.");
129
127
  }
130
128
  return_types.emplace_back(TransformStringToLogicalType(StringValue::Get(val), context));
131
129
  }
132
130
  D_ASSERT(names.size() == return_types.size());
133
131
  if (names.empty()) {
134
- throw BinderException("read_json \"columns\" parameter needs at least one column");
132
+ throw BinderException("read_json \"columns\" parameter needs at least one column.");
135
133
  }
136
- bind_data.names = names;
134
+ bind_data->names = names;
137
135
  } else if (loption == "auto_detect") {
138
- bind_data.auto_detect = BooleanValue::Get(kv.second);
136
+ bind_data->auto_detect = BooleanValue::Get(kv.second);
139
137
  } else if (loption == "sample_size") {
140
138
  auto arg = BigIntValue::Get(kv.second);
141
139
  if (arg == -1) {
142
- bind_data.sample_size = NumericLimits<idx_t>::Maximum();
140
+ bind_data->sample_size = NumericLimits<idx_t>::Maximum();
143
141
  } else if (arg > 0) {
144
- bind_data.sample_size = arg;
142
+ bind_data->sample_size = arg;
145
143
  } else {
146
144
  throw BinderException(
147
- "read_json \"sample_size\" parameter must be positive, or -1 to sample the entire file");
145
+ "read_json \"sample_size\" parameter must be positive, or -1 to sample the entire file.");
148
146
  }
149
147
  } else if (loption == "maximum_depth") {
150
148
  auto arg = BigIntValue::Get(kv.second);
151
149
  if (arg == -1) {
152
- bind_data.max_depth = NumericLimits<idx_t>::Maximum();
150
+ bind_data->max_depth = NumericLimits<idx_t>::Maximum();
153
151
  } else {
154
- bind_data.max_depth = arg;
152
+ bind_data->max_depth = arg;
155
153
  }
156
154
  } else if (loption == "dateformat" || loption == "date_format") {
157
155
  auto format_string = StringValue::Get(kv.second);
158
156
  if (StringUtil::Lower(format_string) == "iso") {
159
157
  format_string = "%Y-%m-%d";
160
158
  }
161
- bind_data.date_format = format_string;
159
+ bind_data->date_format = format_string;
162
160
 
163
161
  StrpTimeFormat format;
164
162
  auto error = StrTimeFormat::ParseFormatSpecifier(format_string, format);
165
163
  if (!error.empty()) {
166
- throw InvalidInputException("Could not parse DATEFORMAT: %s", error.c_str());
164
+ throw InvalidInputException("read_json could not parse \"dateformat\": '%s'.", error.c_str());
167
165
  }
168
166
  } else if (loption == "timestampformat" || loption == "timestamp_format") {
169
167
  auto format_string = StringValue::Get(kv.second);
170
168
  if (StringUtil::Lower(format_string) == "iso") {
171
169
  format_string = "%Y-%m-%dT%H:%M:%S.%fZ";
172
170
  }
173
- bind_data.timestamp_format = format_string;
171
+ bind_data->timestamp_format = format_string;
174
172
 
175
173
  StrpTimeFormat format;
176
174
  auto error = StrTimeFormat::ParseFormatSpecifier(format_string, format);
177
175
  if (!error.empty()) {
178
- throw InvalidInputException("Could not parse TIMESTAMPFORMAT: %s", error.c_str());
176
+ throw InvalidInputException("read_json could not parse \"timestampformat\": '%s'.", error.c_str());
179
177
  }
180
- } else if (loption == "json_format") {
178
+ } else if (loption == "records") {
181
179
  auto arg = StringValue::Get(kv.second);
182
- if (arg == "records") {
183
- bind_data.record_type = JSONRecordType::RECORDS;
184
- } else if (arg == "array_of_records") {
185
- bind_data.record_type = JSONRecordType::ARRAY_OF_RECORDS;
186
- } else if (arg == "values") {
187
- bind_data.record_type = JSONRecordType::JSON;
188
- } else if (arg == "array_of_values") {
189
- bind_data.record_type = JSONRecordType::ARRAY_OF_JSON;
190
- } else if (arg == "auto") {
191
- bind_data.record_type = JSONRecordType::AUTO;
180
+ if (arg == "auto") {
181
+ bind_data->options.record_type = JSONRecordType::AUTO_DETECT;
182
+ } else if (arg == "true") {
183
+ bind_data->options.record_type = JSONRecordType::RECORDS;
184
+ } else if (arg == "false") {
185
+ bind_data->options.record_type = JSONRecordType::VALUES;
192
186
  } else {
193
- throw InvalidInputException("\"json_format\" must be one of ['records', 'array_of_records', 'json', "
194
- "'array_of_json', 'auto']");
187
+ throw InvalidInputException("read_json requires \"records\" to be one of ['auto', 'true', 'false'].");
195
188
  }
196
189
  }
197
190
  }
198
- }
199
-
200
- unique_ptr<FunctionData> ReadJSONBind(ClientContext &context, TableFunctionBindInput &input,
201
- vector<LogicalType> &return_types, vector<string> &names) {
202
- // First bind default params
203
- auto result = JSONScanData::Bind(context, input);
204
- auto &bind_data = (JSONScanData &)*result;
205
191
 
206
- JSONScan::InitializeBindData(context, bind_data, input.named_parameters, names, return_types);
192
+ // Specifying column names overrides auto-detect
193
+ if (!return_types.empty()) {
194
+ bind_data->auto_detect = false;
195
+ }
207
196
 
208
- if (!bind_data.names.empty()) {
209
- bind_data.auto_detect = false; // override auto_detect when columns are specified
210
- } else if (!bind_data.auto_detect) {
211
- throw BinderException("read_json \"columns\" parameter is required when auto_detect is false");
197
+ if (!bind_data->auto_detect) {
198
+ // Need to specify columns if RECORDS and not auto-detecting
199
+ if (return_types.empty()) {
200
+ throw BinderException("read_json requires columns to be specified through the \"columns\" parameter."
201
+ "\n Use read_json_auto or set auto_detect=true to automatically guess columns.");
202
+ }
203
+ // If we are reading VALUES, we can only have one column
204
+ if (bind_data->options.record_type == JSONRecordType::VALUES && return_types.size() != 1) {
205
+ throw BinderException("read_json requires a single column to be specified through the \"columns\" "
206
+ "parameter when \"records\" is set to 'false'.");
207
+ }
212
208
  }
213
209
 
214
- bind_data.InitializeFormats();
210
+ bind_data->InitializeFormats();
215
211
 
216
- if (bind_data.auto_detect || bind_data.record_type == JSONRecordType::AUTO) {
217
- JSONScan::AutoDetect(context, bind_data, return_types, names);
218
- bind_data.names = names;
212
+ if (bind_data->auto_detect || bind_data->options.record_type == JSONRecordType::AUTO_DETECT) {
213
+ JSONScan::AutoDetect(context, *bind_data, return_types, names);
214
+ bind_data->names = names;
215
+ D_ASSERT(return_types.size() == names.size());
219
216
  }
220
217
 
221
- auto &transform_options = bind_data.transform_options;
222
- transform_options.strict_cast = !bind_data.ignore_errors;
223
- transform_options.error_duplicate_key = !bind_data.ignore_errors;
218
+ bind_data->reader_bind =
219
+ MultiFileReader::BindOptions(bind_data->options.file_options, bind_data->files, return_types, names);
220
+
221
+ auto &transform_options = bind_data->transform_options;
222
+ transform_options.strict_cast = !bind_data->ignore_errors;
223
+ transform_options.error_duplicate_key = !bind_data->ignore_errors;
224
224
  transform_options.error_missing_key = false;
225
- transform_options.error_unknown_key = bind_data.auto_detect && !bind_data.ignore_errors;
225
+ transform_options.error_unknown_key = bind_data->auto_detect && !bind_data->ignore_errors;
226
226
  transform_options.delay_error = true;
227
227
 
228
- return result;
228
+ return bind_data;
229
229
  }
230
230
 
231
231
  static void ReadJSONFunction(ClientContext &context, TableFunctionInput &data_p, DataChunk &output) {
232
- auto &gstate = ((JSONGlobalTableFunctionState &)*data_p.global_state).state;
233
- auto &lstate = ((JSONLocalTableFunctionState &)*data_p.local_state).state;
232
+ auto &gstate = data_p.global_state->Cast<JSONGlobalTableFunctionState>().state;
233
+ auto &lstate = data_p.local_state->Cast<JSONLocalTableFunctionState>().state;
234
234
 
235
235
  const auto count = lstate.ReadNext(gstate);
236
- yyjson_val **values;
237
- if (gstate.bind_data.record_type == JSONRecordType::ARRAY_OF_RECORDS ||
238
- gstate.bind_data.record_type == JSONRecordType::ARRAY_OF_JSON) {
239
- values = lstate.array_values;
240
- } else {
241
- D_ASSERT(gstate.bind_data.record_type != JSONRecordType::AUTO);
242
- values = lstate.values;
243
- }
236
+ yyjson_val **values = lstate.values;
244
237
  output.SetCardinality(count);
245
238
 
246
- vector<Vector *> result_vectors;
247
- result_vectors.reserve(output.ColumnCount());
248
- for (auto &valid_col_idx : gstate.bind_data.valid_cols) {
249
- result_vectors.push_back(&output.data[valid_col_idx]);
250
- }
251
- D_ASSERT(result_vectors.size() == gstate.bind_data.names.size());
252
-
253
- // Pass current reader to transform options so we can get line number information if an error occurs
254
- bool success;
255
- if (gstate.bind_data.record_type == JSONRecordType::RECORDS ||
256
- gstate.bind_data.record_type == JSONRecordType::ARRAY_OF_RECORDS) {
257
- success = JSONTransform::TransformObject(values, lstate.GetAllocator(), count, gstate.bind_data.names,
258
- result_vectors, lstate.transform_options);
259
- } else {
260
- success = JSONTransform::Transform(values, lstate.GetAllocator(), *result_vectors[0], count,
261
- lstate.transform_options);
239
+ if (!gstate.names.empty()) {
240
+ vector<Vector *> result_vectors;
241
+ result_vectors.reserve(gstate.column_indices.size());
242
+ for (const auto &col_idx : gstate.column_indices) {
243
+ result_vectors.emplace_back(&output.data[col_idx]);
244
+ }
245
+
246
+ D_ASSERT(gstate.bind_data.options.record_type != JSONRecordType::AUTO_DETECT);
247
+ bool success;
248
+ if (gstate.bind_data.options.record_type == JSONRecordType::RECORDS) {
249
+ success = JSONTransform::TransformObject(values, lstate.GetAllocator(), count, gstate.names, result_vectors,
250
+ lstate.transform_options);
251
+ } else {
252
+ D_ASSERT(gstate.bind_data.options.record_type == JSONRecordType::VALUES);
253
+ success = JSONTransform::Transform(values, lstate.GetAllocator(), *result_vectors[0], count,
254
+ lstate.transform_options);
255
+ }
256
+
257
+ if (!success) {
258
+ string hint =
259
+ gstate.bind_data.auto_detect
260
+ ? "\nTry increasing 'sample_size', reducing 'maximum_depth', specifying 'columns', 'format' or "
261
+ "'records' manually, or setting 'ignore_errors' to true."
262
+ : "\nTry setting 'auto_detect' to true, specifying 'format' or 'records' manually, or setting "
263
+ "'ignore_errors' to true.";
264
+ lstate.ThrowTransformError(lstate.transform_options.object_index,
265
+ lstate.transform_options.error_message + hint);
266
+ }
262
267
  }
263
268
 
264
- if (!success) {
265
- string hint =
266
- gstate.bind_data.auto_detect
267
- ? "\nTry increasing 'sample_size', reducing 'maximum_depth', specifying 'columns', 'lines' or "
268
- "'json_format' manually, or setting 'ignore_errors' to true."
269
- : "\nTry setting 'auto_detect' to true, specifying 'lines' or 'json_format' manually, or setting "
270
- "'ignore_errors' to true.";
271
- lstate.ThrowTransformError(lstate.transform_options.object_index,
272
- lstate.transform_options.error_message + hint);
269
+ if (output.size() != 0) {
270
+ MultiFileReader::FinalizeChunk(gstate.bind_data.reader_bind, lstate.GetReaderData(), output);
273
271
  }
274
272
  }
275
273
 
276
274
  TableFunction JSONFunctions::GetReadJSONTableFunction(shared_ptr<JSONScanInfo> function_info) {
277
275
  TableFunction table_function({LogicalType::VARCHAR}, ReadJSONFunction, ReadJSONBind,
278
276
  JSONGlobalTableFunctionState::Init, JSONLocalTableFunctionState::Init);
277
+ table_function.name = "read_json";
279
278
 
280
279
  JSONScan::TableFunctionDefaults(table_function);
281
280
  table_function.named_parameters["columns"] = LogicalType::ANY;
@@ -285,10 +284,9 @@ TableFunction JSONFunctions::GetReadJSONTableFunction(shared_ptr<JSONScanInfo> f
285
284
  table_function.named_parameters["date_format"] = LogicalType::VARCHAR;
286
285
  table_function.named_parameters["timestampformat"] = LogicalType::VARCHAR;
287
286
  table_function.named_parameters["timestamp_format"] = LogicalType::VARCHAR;
288
- table_function.named_parameters["json_format"] = LogicalType::VARCHAR;
287
+ table_function.named_parameters["records"] = LogicalType::VARCHAR;
289
288
 
290
- table_function.projection_pushdown = true;
291
- // TODO: might be able to do filter pushdown/prune too
289
+ // TODO: might be able to do filter pushdown/prune ?
292
290
 
293
291
  table_function.function_info = std::move(function_info);
294
292
 
@@ -305,25 +303,25 @@ TableFunctionSet CreateJSONFunctionInfo(string name, shared_ptr<JSONScanInfo> in
305
303
  }
306
304
 
307
305
  TableFunctionSet JSONFunctions::GetReadJSONFunction() {
308
- auto info =
309
- make_shared<JSONScanInfo>(JSONScanType::READ_JSON, JSONFormat::UNSTRUCTURED, JSONRecordType::RECORDS, false);
306
+ auto info = make_shared<JSONScanInfo>(JSONScanType::READ_JSON, JSONFormat::ARRAY, JSONRecordType::RECORDS);
310
307
  return CreateJSONFunctionInfo("read_json", std::move(info));
311
308
  }
312
309
 
313
310
  TableFunctionSet JSONFunctions::GetReadNDJSONFunction() {
314
- auto info = make_shared<JSONScanInfo>(JSONScanType::READ_JSON, JSONFormat::NEWLINE_DELIMITED,
315
- JSONRecordType::RECORDS, false);
311
+ auto info =
312
+ make_shared<JSONScanInfo>(JSONScanType::READ_JSON, JSONFormat::NEWLINE_DELIMITED, JSONRecordType::RECORDS);
316
313
  return CreateJSONFunctionInfo("read_ndjson", std::move(info));
317
314
  }
318
315
 
319
316
  TableFunctionSet JSONFunctions::GetReadJSONAutoFunction() {
320
- auto info = make_shared<JSONScanInfo>(JSONScanType::READ_JSON, JSONFormat::AUTO_DETECT, JSONRecordType::AUTO, true);
317
+ auto info =
318
+ make_shared<JSONScanInfo>(JSONScanType::READ_JSON, JSONFormat::AUTO_DETECT, JSONRecordType::AUTO_DETECT, true);
321
319
  return CreateJSONFunctionInfo("read_json_auto", std::move(info), true);
322
320
  }
323
321
 
324
322
  TableFunctionSet JSONFunctions::GetReadNDJSONAutoFunction() {
325
- auto info =
326
- make_shared<JSONScanInfo>(JSONScanType::READ_JSON, JSONFormat::NEWLINE_DELIMITED, JSONRecordType::AUTO, true);
323
+ auto info = make_shared<JSONScanInfo>(JSONScanType::READ_JSON, JSONFormat::NEWLINE_DELIMITED,
324
+ JSONRecordType::AUTO_DETECT, true);
327
325
  return CreateJSONFunctionInfo("read_ndjson_auto", std::move(info), true);
328
326
  }
329
327
 
@@ -6,34 +6,46 @@ namespace duckdb {
6
6
 
7
7
  unique_ptr<FunctionData> ReadJSONObjectsBind(ClientContext &context, TableFunctionBindInput &input,
8
8
  vector<LogicalType> &return_types, vector<string> &names) {
9
+ auto bind_data = make_uniq<JSONScanData>();
10
+ bind_data->Bind(context, input);
11
+
12
+ bind_data->names.emplace_back("json");
9
13
  return_types.push_back(JSONCommon::JSONType());
10
14
  names.emplace_back("json");
11
- return JSONScanData::Bind(context, input);
15
+
16
+ bind_data->reader_bind =
17
+ MultiFileReader::BindOptions(bind_data->options.file_options, bind_data->files, return_types, names);
18
+
19
+ return bind_data;
12
20
  }
13
21
 
14
22
  static void ReadJSONObjectsFunction(ClientContext &context, TableFunctionInput &data_p, DataChunk &output) {
15
- D_ASSERT(output.ColumnCount() == 1);
16
- D_ASSERT(JSONCommon::LogicalTypeIsJSON(output.data[0].GetType()));
17
- auto &gstate = ((JSONGlobalTableFunctionState &)*data_p.global_state).state;
18
- auto &lstate = ((JSONLocalTableFunctionState &)*data_p.local_state).state;
23
+ auto &gstate = data_p.global_state->Cast<JSONGlobalTableFunctionState>().state;
24
+ auto &lstate = data_p.local_state->Cast<JSONLocalTableFunctionState>().state;
19
25
 
20
26
  // Fetch next lines
21
27
  const auto count = lstate.ReadNext(gstate);
22
- const auto lines = lstate.lines;
28
+ const auto units = lstate.units;
23
29
  const auto objects = lstate.values;
24
30
 
25
- // Create the strings without copying them
26
- auto strings = FlatVector::GetData<string_t>(output.data[0]);
27
- auto &validity = FlatVector::Validity(output.data[0]);
28
- for (idx_t i = 0; i < count; i++) {
29
- if (objects[i]) {
30
- strings[i] = string_t(lines[i].pointer, lines[i].size);
31
- } else {
32
- validity.SetInvalid(i);
31
+ if (!gstate.names.empty()) {
32
+ // Create the strings without copying them
33
+ auto strings = FlatVector::GetData<string_t>(output.data[0]);
34
+ auto &validity = FlatVector::Validity(output.data[0]);
35
+ for (idx_t i = 0; i < count; i++) {
36
+ if (objects[i]) {
37
+ strings[i] = string_t(units[i].pointer, units[i].size);
38
+ } else {
39
+ validity.SetInvalid(i);
40
+ }
33
41
  }
34
42
  }
35
43
 
36
44
  output.SetCardinality(count);
45
+
46
+ if (output.size() != 0) {
47
+ MultiFileReader::FinalizeChunk(gstate.bind_data.reader_bind, lstate.GetReaderData(), output);
48
+ }
37
49
  }
38
50
 
39
51
  TableFunction GetReadJSONObjectsTableFunction(bool list_parameter, shared_ptr<JSONScanInfo> function_info) {
@@ -49,7 +61,7 @@ TableFunction GetReadJSONObjectsTableFunction(bool list_parameter, shared_ptr<JS
49
61
  TableFunctionSet JSONFunctions::GetReadJSONObjectsFunction() {
50
62
  TableFunctionSet function_set("read_json_objects");
51
63
  auto function_info =
52
- make_shared<JSONScanInfo>(JSONScanType::READ_JSON_OBJECTS, JSONFormat::UNSTRUCTURED, JSONRecordType::JSON);
64
+ make_shared<JSONScanInfo>(JSONScanType::READ_JSON_OBJECTS, JSONFormat::ARRAY, JSONRecordType::RECORDS);
53
65
  function_set.AddFunction(GetReadJSONObjectsTableFunction(false, function_info));
54
66
  function_set.AddFunction(GetReadJSONObjectsTableFunction(true, function_info));
55
67
  return function_set;
@@ -57,8 +69,17 @@ TableFunctionSet JSONFunctions::GetReadJSONObjectsFunction() {
57
69
 
58
70
  TableFunctionSet JSONFunctions::GetReadNDJSONObjectsFunction() {
59
71
  TableFunctionSet function_set("read_ndjson_objects");
72
+ auto function_info = make_shared<JSONScanInfo>(JSONScanType::READ_JSON_OBJECTS, JSONFormat::NEWLINE_DELIMITED,
73
+ JSONRecordType::RECORDS);
74
+ function_set.AddFunction(GetReadJSONObjectsTableFunction(false, function_info));
75
+ function_set.AddFunction(GetReadJSONObjectsTableFunction(true, function_info));
76
+ return function_set;
77
+ }
78
+
79
+ TableFunctionSet JSONFunctions::GetReadJSONObjectsAutoFunction() {
80
+ TableFunctionSet function_set("read_json_objects_auto");
60
81
  auto function_info =
61
- make_shared<JSONScanInfo>(JSONScanType::READ_JSON_OBJECTS, JSONFormat::NEWLINE_DELIMITED, JSONRecordType::JSON);
82
+ make_shared<JSONScanInfo>(JSONScanType::READ_JSON_OBJECTS, JSONFormat::AUTO_DETECT, JSONRecordType::RECORDS);
62
83
  function_set.AddFunction(GetReadJSONObjectsTableFunction(false, function_info));
63
84
  function_set.AddFunction(GetReadJSONObjectsTableFunction(true, function_info));
64
85
  return function_set;
@@ -1,5 +1,6 @@
1
1
  #include "json_functions.hpp"
2
2
 
3
+ #include "duckdb/common/file_system.hpp"
3
4
  #include "duckdb/execution/expression_executor.hpp"
4
5
  #include "duckdb/function/cast/cast_function_set.hpp"
5
6
  #include "duckdb/function/cast/default_casts.hpp"
@@ -50,7 +51,7 @@ bool JSONReadFunctionData::Equals(const FunctionData &other_p) const {
50
51
  }
51
52
 
52
53
  unique_ptr<FunctionData> JSONReadFunctionData::Bind(ClientContext &context, ScalarFunction &bound_function,
53
- vector<duckdb::unique_ptr<Expression>> &arguments) {
54
+ vector<unique_ptr<Expression>> &arguments) {
54
55
  D_ASSERT(bound_function.arguments.size() == 2);
55
56
  bool constant = false;
56
57
  string path = "";
@@ -80,7 +81,7 @@ bool JSONReadManyFunctionData::Equals(const FunctionData &other_p) const {
80
81
  }
81
82
 
82
83
  unique_ptr<FunctionData> JSONReadManyFunctionData::Bind(ClientContext &context, ScalarFunction &bound_function,
83
- vector<duckdb::unique_ptr<Expression>> &arguments) {
84
+ vector<unique_ptr<Expression>> &arguments) {
84
85
  D_ASSERT(bound_function.arguments.size() == 2);
85
86
  if (arguments[1]->HasParameter()) {
86
87
  throw ParameterNotResolvedException();
@@ -173,6 +174,7 @@ vector<TableFunctionSet> JSONFunctions::GetTableFunctions() {
173
174
  // Reads JSON as string
174
175
  functions.push_back(GetReadJSONObjectsFunction());
175
176
  functions.push_back(GetReadNDJSONObjectsFunction());
177
+ functions.push_back(GetReadJSONObjectsAutoFunction());
176
178
 
177
179
  // Read JSON as columnar data
178
180
  functions.push_back(GetReadJSONFunction());
@@ -199,16 +201,21 @@ unique_ptr<TableRef> JSONFunctions::ReadJSONReplacement(ClientContext &context,
199
201
  return nullptr;
200
202
  }
201
203
  auto table_function = make_uniq<TableFunctionRef>();
202
- vector<duckdb::unique_ptr<ParsedExpression>> children;
204
+ vector<unique_ptr<ParsedExpression>> children;
203
205
  children.push_back(make_uniq<ConstantExpression>(Value(table_name)));
204
206
  table_function->function = make_uniq<FunctionExpression>("read_json_auto", std::move(children));
207
+
208
+ if (!FileSystem::HasGlob(table_name)) {
209
+ table_function->alias = FileSystem::ExtractBaseName(table_name);
210
+ }
211
+
205
212
  return std::move(table_function);
206
213
  }
207
214
 
208
215
  static bool CastVarcharToJSON(Vector &source, Vector &result, idx_t count, CastParameters &parameters) {
209
216
  auto &lstate = parameters.local_state->Cast<JSONFunctionLocalState>();
210
217
  lstate.json_allocator.Reset();
211
- auto alc = lstate.json_allocator.GetYYJSONAllocator();
218
+ auto alc = lstate.json_allocator.GetYYAlc();
212
219
 
213
220
  bool success = true;
214
221
  UnaryExecutor::ExecuteWithNulls<string_t, string_t>(