duckdb 0.7.2-dev717.0 → 0.7.2-dev832.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.gyp +2 -0
- package/lib/duckdb.d.ts +12 -1
- package/lib/duckdb.js +19 -0
- package/package.json +1 -1
- package/src/duckdb/extension/json/include/json_common.hpp +1 -0
- package/src/duckdb/extension/json/include/json_functions.hpp +1 -0
- package/src/duckdb/extension/json/include/json_serializer.hpp +77 -0
- package/src/duckdb/extension/json/json_functions/json_serialize_sql.cpp +147 -0
- package/src/duckdb/extension/json/json_functions.cpp +1 -0
- package/src/duckdb/extension/json/json_scan.cpp +2 -2
- package/src/duckdb/extension/json/json_serializer.cpp +217 -0
- package/src/duckdb/src/common/enums/expression_type.cpp +8 -222
- package/src/duckdb/src/common/enums/join_type.cpp +3 -22
- package/src/duckdb/src/common/exception.cpp +2 -2
- package/src/duckdb/src/common/serializer/enum_serializer.cpp +1172 -0
- package/src/duckdb/src/common/types/value.cpp +117 -0
- package/src/duckdb/src/common/types/vector.cpp +140 -1
- package/src/duckdb/src/common/types.cpp +166 -89
- package/src/duckdb/src/function/scalar/string/regexp/regexp_extract_all.cpp +243 -0
- package/src/duckdb/src/function/scalar/string/regexp/regexp_util.cpp +79 -0
- package/src/duckdb/src/function/scalar/string/regexp.cpp +21 -80
- package/src/duckdb/src/function/table/arrow_conversion.cpp +7 -1
- package/src/duckdb/src/function/table/table_scan.cpp +1 -1
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/common/enums/aggregate_handling.hpp +2 -0
- package/src/duckdb/src/include/duckdb/common/enums/expression_type.hpp +2 -3
- package/src/duckdb/src/include/duckdb/common/enums/joinref_type.hpp +2 -0
- package/src/duckdb/src/include/duckdb/common/enums/order_type.hpp +2 -0
- package/src/duckdb/src/include/duckdb/common/enums/set_operation_type.hpp +2 -1
- package/src/duckdb/src/include/duckdb/common/exception.hpp +40 -9
- package/src/duckdb/src/include/duckdb/common/preserved_error.hpp +3 -0
- package/src/duckdb/src/include/duckdb/common/serializer/enum_serializer.hpp +113 -0
- package/src/duckdb/src/include/duckdb/common/serializer/format_deserializer.hpp +336 -0
- package/src/duckdb/src/include/duckdb/common/serializer/format_serializer.hpp +268 -0
- package/src/duckdb/src/include/duckdb/common/serializer/serialization_traits.hpp +126 -0
- package/src/duckdb/src/include/duckdb/common/string_util.hpp +12 -0
- package/src/duckdb/src/include/duckdb/common/types/value.hpp +2 -0
- package/src/duckdb/src/include/duckdb/common/types/vector.hpp +3 -0
- package/src/duckdb/src/include/duckdb/common/types.hpp +8 -2
- package/src/duckdb/src/include/duckdb/function/scalar/regexp.hpp +81 -1
- package/src/duckdb/src/include/duckdb/main/extension_entries.hpp +1 -0
- package/src/duckdb/src/include/duckdb/parser/common_table_expression_info.hpp +2 -0
- package/src/duckdb/src/include/duckdb/parser/expression/between_expression.hpp +3 -0
- package/src/duckdb/src/include/duckdb/parser/expression/bound_expression.hpp +2 -0
- package/src/duckdb/src/include/duckdb/parser/expression/case_expression.hpp +5 -0
- package/src/duckdb/src/include/duckdb/parser/expression/cast_expression.hpp +2 -0
- package/src/duckdb/src/include/duckdb/parser/expression/collate_expression.hpp +2 -0
- package/src/duckdb/src/include/duckdb/parser/expression/columnref_expression.hpp +2 -0
- package/src/duckdb/src/include/duckdb/parser/expression/comparison_expression.hpp +2 -0
- package/src/duckdb/src/include/duckdb/parser/expression/conjunction_expression.hpp +2 -0
- package/src/duckdb/src/include/duckdb/parser/expression/constant_expression.hpp +3 -0
- package/src/duckdb/src/include/duckdb/parser/expression/default_expression.hpp +1 -0
- package/src/duckdb/src/include/duckdb/parser/expression/function_expression.hpp +2 -0
- package/src/duckdb/src/include/duckdb/parser/expression/lambda_expression.hpp +2 -0
- package/src/duckdb/src/include/duckdb/parser/expression/operator_expression.hpp +2 -0
- package/src/duckdb/src/include/duckdb/parser/expression/parameter_expression.hpp +2 -0
- package/src/duckdb/src/include/duckdb/parser/expression/positional_reference_expression.hpp +2 -0
- package/src/duckdb/src/include/duckdb/parser/expression/star_expression.hpp +2 -0
- package/src/duckdb/src/include/duckdb/parser/expression/subquery_expression.hpp +2 -0
- package/src/duckdb/src/include/duckdb/parser/expression/window_expression.hpp +5 -0
- package/src/duckdb/src/include/duckdb/parser/parsed_data/sample_options.hpp +2 -0
- package/src/duckdb/src/include/duckdb/parser/parsed_expression.hpp +5 -0
- package/src/duckdb/src/include/duckdb/parser/query_node/recursive_cte_node.hpp +3 -0
- package/src/duckdb/src/include/duckdb/parser/query_node/select_node.hpp +5 -0
- package/src/duckdb/src/include/duckdb/parser/query_node/set_operation_node.hpp +3 -0
- package/src/duckdb/src/include/duckdb/parser/query_node.hpp +11 -1
- package/src/duckdb/src/include/duckdb/parser/result_modifier.hpp +24 -1
- package/src/duckdb/src/include/duckdb/parser/sql_statement.hpp +2 -1
- package/src/duckdb/src/include/duckdb/parser/statement/select_statement.hpp +6 -1
- package/src/duckdb/src/include/duckdb/parser/tableref/basetableref.hpp +4 -0
- package/src/duckdb/src/include/duckdb/parser/tableref/emptytableref.hpp +2 -0
- package/src/duckdb/src/include/duckdb/parser/tableref/expressionlistref.hpp +3 -0
- package/src/duckdb/src/include/duckdb/parser/tableref/joinref.hpp +3 -0
- package/src/duckdb/src/include/duckdb/parser/tableref/pivotref.hpp +9 -0
- package/src/duckdb/src/include/duckdb/parser/tableref/subqueryref.hpp +3 -0
- package/src/duckdb/src/include/duckdb/parser/tableref/table_function_ref.hpp +3 -0
- package/src/duckdb/src/include/duckdb/parser/tableref.hpp +3 -1
- package/src/duckdb/src/main/extension/extension_install.cpp +7 -2
- package/src/duckdb/src/optimizer/deliminator.cpp +1 -1
- package/src/duckdb/src/optimizer/filter_combiner.cpp +1 -1
- package/src/duckdb/src/optimizer/join_order/join_order_optimizer.cpp +3 -3
- package/src/duckdb/src/optimizer/rule/move_constants.cpp +2 -2
- package/src/duckdb/src/optimizer/statistics/operator/propagate_filter.cpp +1 -1
- package/src/duckdb/src/parser/common_table_expression_info.cpp +19 -0
- package/src/duckdb/src/parser/expression/between_expression.cpp +17 -0
- package/src/duckdb/src/parser/expression/case_expression.cpp +28 -0
- package/src/duckdb/src/parser/expression/cast_expression.cpp +17 -0
- package/src/duckdb/src/parser/expression/collate_expression.cpp +16 -0
- package/src/duckdb/src/parser/expression/columnref_expression.cpp +15 -0
- package/src/duckdb/src/parser/expression/comparison_expression.cpp +16 -0
- package/src/duckdb/src/parser/expression/conjunction_expression.cpp +15 -0
- package/src/duckdb/src/parser/expression/constant_expression.cpp +14 -0
- package/src/duckdb/src/parser/expression/default_expression.cpp +7 -0
- package/src/duckdb/src/parser/expression/function_expression.cpp +35 -0
- package/src/duckdb/src/parser/expression/lambda_expression.cpp +16 -0
- package/src/duckdb/src/parser/expression/operator_expression.cpp +15 -0
- package/src/duckdb/src/parser/expression/parameter_expression.cpp +15 -0
- package/src/duckdb/src/parser/expression/positional_reference_expression.cpp +14 -0
- package/src/duckdb/src/parser/expression/star_expression.cpp +20 -0
- package/src/duckdb/src/parser/expression/subquery_expression.cpp +20 -0
- package/src/duckdb/src/parser/expression/window_expression.cpp +43 -0
- package/src/duckdb/src/parser/parsed_data/sample_options.cpp +22 -10
- package/src/duckdb/src/parser/parsed_expression.cpp +72 -0
- package/src/duckdb/src/parser/query_node/recursive_cte_node.cpp +21 -0
- package/src/duckdb/src/parser/query_node/select_node.cpp +31 -0
- package/src/duckdb/src/parser/query_node/set_operation_node.cpp +17 -0
- package/src/duckdb/src/parser/query_node.cpp +50 -0
- package/src/duckdb/src/parser/result_modifier.cpp +78 -0
- package/src/duckdb/src/parser/statement/select_statement.cpp +12 -0
- package/src/duckdb/src/parser/tableref/basetableref.cpp +21 -0
- package/src/duckdb/src/parser/tableref/emptytableref.cpp +4 -0
- package/src/duckdb/src/parser/tableref/expressionlistref.cpp +17 -0
- package/src/duckdb/src/parser/tableref/joinref.cpp +25 -0
- package/src/duckdb/src/parser/tableref/pivotref.cpp +53 -0
- package/src/duckdb/src/parser/tableref/subqueryref.cpp +15 -0
- package/src/duckdb/src/parser/tableref/table_function.cpp +17 -0
- package/src/duckdb/src/parser/tableref.cpp +46 -0
- package/src/duckdb/src/parser/transform/expression/transform_bool_expr.cpp +1 -1
- package/src/duckdb/src/parser/transform/expression/transform_operator.cpp +1 -1
- package/src/duckdb/src/parser/transform/expression/transform_subquery.cpp +1 -1
- package/src/duckdb/src/planner/binder/expression/bind_subquery_expression.cpp +4 -0
- package/src/duckdb/src/planner/binder/tableref/plan_joinref.cpp +1 -1
- package/src/duckdb/src/planner/expression/bound_expression.cpp +4 -0
- package/src/duckdb/src/verification/deserialized_statement_verifier.cpp +0 -1
- package/src/duckdb/third_party/re2/re2/re2.cc +9 -0
- package/src/duckdb/third_party/re2/re2/re2.h +2 -0
- package/src/duckdb/ub_extension_json_json_functions.cpp +2 -0
- package/src/duckdb/ub_src_common_serializer.cpp +2 -0
- package/src/duckdb/ub_src_function_scalar_string_regexp.cpp +4 -0
- package/src/duckdb/ub_src_parser.cpp +2 -0
- package/src/utils.cpp +12 -0
- package/test/extension.test.ts +44 -26
@@ -0,0 +1,243 @@
|
|
1
|
+
#include "duckdb/function/scalar/regexp.hpp"
|
2
|
+
#include "duckdb/execution/expression_executor.hpp"
|
3
|
+
#include "duckdb/planner/expression/bound_function_expression.hpp"
|
4
|
+
#include "duckdb/function/scalar/string_functions.hpp"
|
5
|
+
#include "re2/re2.h"
|
6
|
+
|
7
|
+
namespace duckdb {
|
8
|
+
|
9
|
+
using regexp_util::CreateStringPiece;
|
10
|
+
using regexp_util::Extract;
|
11
|
+
using regexp_util::ParseRegexOptions;
|
12
|
+
using regexp_util::TryParseConstantPattern;
|
13
|
+
|
14
|
+
unique_ptr<FunctionLocalState>
|
15
|
+
RegexpExtractAll::InitLocalState(ExpressionState &state, const BoundFunctionExpression &expr, FunctionData *bind_data) {
|
16
|
+
auto &info = (RegexpBaseBindData &)*bind_data;
|
17
|
+
if (info.constant_pattern) {
|
18
|
+
return make_unique<RegexLocalState>(info, true);
|
19
|
+
}
|
20
|
+
return nullptr;
|
21
|
+
}
|
22
|
+
|
23
|
+
// Forwards startpos automatically
|
24
|
+
bool ExtractAll(duckdb_re2::StringPiece &input, duckdb_re2::RE2 &pattern, idx_t *startpos,
|
25
|
+
duckdb_re2::StringPiece *groups, int ngroups) {
|
26
|
+
|
27
|
+
D_ASSERT(pattern.ok());
|
28
|
+
D_ASSERT(pattern.NumberOfCapturingGroups() == ngroups);
|
29
|
+
|
30
|
+
if (!pattern.Match(input, *startpos, input.size(), pattern.Anchored(), groups, ngroups + 1)) {
|
31
|
+
return false;
|
32
|
+
}
|
33
|
+
idx_t consumed = static_cast<size_t>(groups[0].end() - (input.begin() + *startpos));
|
34
|
+
if (!consumed) {
|
35
|
+
// Empty match found, have to manually forward the input
|
36
|
+
// to avoid an infinite loop
|
37
|
+
// FIXME: support unicode characters
|
38
|
+
consumed++;
|
39
|
+
while (*startpos + consumed < input.length() && !LengthFun::IsCharacter(input[*startpos + consumed])) {
|
40
|
+
consumed++;
|
41
|
+
}
|
42
|
+
}
|
43
|
+
*startpos += consumed;
|
44
|
+
return true;
|
45
|
+
}
|
46
|
+
|
47
|
+
void ExtractSingleTuple(const string_t &string, duckdb_re2::RE2 &pattern, int32_t group, RegexStringPieceArgs &args,
|
48
|
+
Vector &result, idx_t row) {
|
49
|
+
auto input = CreateStringPiece(string);
|
50
|
+
|
51
|
+
auto &child_vector = ListVector::GetEntry(result);
|
52
|
+
auto list_content = FlatVector::GetData<string_t>(child_vector);
|
53
|
+
auto &child_validity = FlatVector::Validity(child_vector);
|
54
|
+
|
55
|
+
auto current_list_size = ListVector::GetListSize(result);
|
56
|
+
auto current_list_capacity = ListVector::GetListCapacity(result);
|
57
|
+
|
58
|
+
auto result_data = FlatVector::GetData<list_entry_t>(result);
|
59
|
+
auto &list_entry = result_data[row];
|
60
|
+
list_entry.offset = current_list_size;
|
61
|
+
|
62
|
+
if (group < 0) {
|
63
|
+
list_entry.length = 0;
|
64
|
+
return;
|
65
|
+
}
|
66
|
+
// If the requested group index is out of bounds
|
67
|
+
// we want to throw only if there is a match
|
68
|
+
bool throw_on_group_found = (idx_t)group > args.size;
|
69
|
+
|
70
|
+
idx_t startpos = 0;
|
71
|
+
for (idx_t iteration = 0; ExtractAll(input, pattern, &startpos, args.group_buffer, args.size); iteration++) {
|
72
|
+
if (!iteration && throw_on_group_found) {
|
73
|
+
throw InvalidInputException("Pattern has %d groups. Cannot access group %d", args.size, group);
|
74
|
+
}
|
75
|
+
|
76
|
+
// Make sure we have enough room for the new entries
|
77
|
+
if (current_list_size + 1 >= current_list_capacity) {
|
78
|
+
ListVector::Reserve(result, current_list_capacity * 2);
|
79
|
+
current_list_capacity = ListVector::GetListCapacity(result);
|
80
|
+
list_content = FlatVector::GetData<string_t>(child_vector);
|
81
|
+
}
|
82
|
+
|
83
|
+
// Write the captured groups into the list-child vector
|
84
|
+
auto &match_group = args.group_buffer[group];
|
85
|
+
|
86
|
+
idx_t child_idx = current_list_size;
|
87
|
+
if (match_group.empty()) {
|
88
|
+
// This group was not matched
|
89
|
+
list_content[child_idx] = string_t(string.GetDataUnsafe(), 0);
|
90
|
+
if (match_group.begin() == nullptr) {
|
91
|
+
// This group is optional
|
92
|
+
child_validity.SetInvalid(child_idx);
|
93
|
+
}
|
94
|
+
} else {
|
95
|
+
// Every group is a substring of the original, we can find out the offset using the pointer
|
96
|
+
// the 'match_group' address is guaranteed to be greater than or equal to that of the source
|
97
|
+
D_ASSERT((const char *)match_group.begin() >= string.GetDataUnsafe());
|
98
|
+
idx_t offset = match_group.begin() - string.GetDataUnsafe();
|
99
|
+
list_content[child_idx] = string_t(string.GetDataUnsafe() + offset, match_group.size());
|
100
|
+
}
|
101
|
+
current_list_size++;
|
102
|
+
if (startpos > input.size()) {
|
103
|
+
// Empty match found at the end of the string
|
104
|
+
break;
|
105
|
+
}
|
106
|
+
}
|
107
|
+
list_entry.length = current_list_size - list_entry.offset;
|
108
|
+
ListVector::SetListSize(result, current_list_size);
|
109
|
+
}
|
110
|
+
|
111
|
+
int32_t GetGroupIndex(DataChunk &args, idx_t row, int32_t &result) {
|
112
|
+
if (args.ColumnCount() < 3) {
|
113
|
+
result = 0;
|
114
|
+
return true;
|
115
|
+
}
|
116
|
+
UnifiedVectorFormat format;
|
117
|
+
args.data[2].ToUnifiedFormat(args.size(), format);
|
118
|
+
idx_t index = format.sel->get_index(row);
|
119
|
+
if (!format.validity.RowIsValid(index)) {
|
120
|
+
return false;
|
121
|
+
}
|
122
|
+
result = ((int32_t *)format.data)[index];
|
123
|
+
return true;
|
124
|
+
}
|
125
|
+
|
126
|
+
duckdb_re2::RE2 &GetPattern(const RegexpBaseBindData &info, ExpressionState &state,
|
127
|
+
unique_ptr<duckdb_re2::RE2> &pattern_p) {
|
128
|
+
if (info.constant_pattern) {
|
129
|
+
auto &lstate = (RegexLocalState &)*ExecuteFunctionState::GetFunctionState(state);
|
130
|
+
return lstate.constant_pattern;
|
131
|
+
}
|
132
|
+
D_ASSERT(pattern_p);
|
133
|
+
return *pattern_p;
|
134
|
+
}
|
135
|
+
|
136
|
+
RegexStringPieceArgs &GetGroupsBuffer(const RegexpBaseBindData &info, ExpressionState &state,
|
137
|
+
unique_ptr<RegexStringPieceArgs> &groups_p) {
|
138
|
+
if (info.constant_pattern) {
|
139
|
+
auto &lstate = (RegexLocalState &)*ExecuteFunctionState::GetFunctionState(state);
|
140
|
+
return lstate.group_buffer;
|
141
|
+
}
|
142
|
+
D_ASSERT(groups_p);
|
143
|
+
return *groups_p;
|
144
|
+
}
|
145
|
+
|
146
|
+
void RegexpExtractAll::Execute(DataChunk &args, ExpressionState &state, Vector &result) {
|
147
|
+
auto &func_expr = (BoundFunctionExpression &)state.expr;
|
148
|
+
const auto &info = (RegexpBaseBindData &)*func_expr.bind_info;
|
149
|
+
|
150
|
+
auto &strings = args.data[0];
|
151
|
+
auto &patterns = args.data[1];
|
152
|
+
D_ASSERT(result.GetType().id() == LogicalTypeId::LIST);
|
153
|
+
auto &output_child = ListVector::GetEntry(result);
|
154
|
+
|
155
|
+
UnifiedVectorFormat strings_data;
|
156
|
+
strings.ToUnifiedFormat(args.size(), strings_data);
|
157
|
+
|
158
|
+
UnifiedVectorFormat pattern_data;
|
159
|
+
patterns.ToUnifiedFormat(args.size(), pattern_data);
|
160
|
+
|
161
|
+
ListVector::Reserve(result, STANDARD_VECTOR_SIZE);
|
162
|
+
// Reference the 'strings' StringBuffer, because we won't need to allocate new data
|
163
|
+
// for the result, all returned strings are substrings of the originals
|
164
|
+
output_child.SetAuxiliary(strings.GetAuxiliary());
|
165
|
+
|
166
|
+
// Avoid doing extra work if all the inputs are constant
|
167
|
+
idx_t tuple_count = args.AllConstant() ? 1 : args.size();
|
168
|
+
|
169
|
+
unique_ptr<RegexStringPieceArgs> non_const_args;
|
170
|
+
unique_ptr<duckdb_re2::RE2> stored_re;
|
171
|
+
if (!info.constant_pattern) {
|
172
|
+
non_const_args = make_unique<RegexStringPieceArgs>();
|
173
|
+
} else {
|
174
|
+
// Verify that the constant pattern is valid
|
175
|
+
auto &re = GetPattern(info, state, stored_re);
|
176
|
+
auto group_count_p = re.NumberOfCapturingGroups();
|
177
|
+
if (group_count_p == -1) {
|
178
|
+
throw InvalidInputException("Pattern failed to parse, error: '%s'", re.error());
|
179
|
+
}
|
180
|
+
}
|
181
|
+
|
182
|
+
for (idx_t row = 0; row < tuple_count; row++) {
|
183
|
+
bool pattern_valid = true;
|
184
|
+
if (!info.constant_pattern) {
|
185
|
+
// Check if the pattern is NULL or not,
|
186
|
+
// and compile the pattern if it's not constant
|
187
|
+
auto pattern_idx = pattern_data.sel->get_index(row);
|
188
|
+
if (!pattern_data.validity.RowIsValid(pattern_idx)) {
|
189
|
+
pattern_valid = false;
|
190
|
+
} else {
|
191
|
+
auto &pattern_p = ((string_t *)pattern_data.data)[pattern_idx];
|
192
|
+
auto pattern_strpiece = CreateStringPiece(pattern_p);
|
193
|
+
stored_re = make_unique<duckdb_re2::RE2>(pattern_strpiece, info.options);
|
194
|
+
|
195
|
+
// Increase the size of the args buffer if needed
|
196
|
+
auto group_count_p = stored_re->NumberOfCapturingGroups();
|
197
|
+
if (group_count_p == -1) {
|
198
|
+
throw InvalidInputException("Pattern failed to parse, error: '%s'", stored_re->error());
|
199
|
+
}
|
200
|
+
non_const_args->SetSize(group_count_p);
|
201
|
+
}
|
202
|
+
}
|
203
|
+
|
204
|
+
auto string_idx = strings_data.sel->get_index(row);
|
205
|
+
int32_t group_index;
|
206
|
+
if (!pattern_valid || !strings_data.validity.RowIsValid(string_idx) || !GetGroupIndex(args, row, group_index)) {
|
207
|
+
// If something is NULL, the result is NULL
|
208
|
+
// FIXME: do we even need 'SPECIAL_HANDLING'?
|
209
|
+
auto result_data = FlatVector::GetData<list_entry_t>(result);
|
210
|
+
auto &result_validity = FlatVector::Validity(result);
|
211
|
+
result_data[row].length = 0;
|
212
|
+
result_data[row].offset = ListVector::GetListSize(result);
|
213
|
+
result_validity.SetInvalid(row);
|
214
|
+
continue;
|
215
|
+
}
|
216
|
+
|
217
|
+
auto &re = GetPattern(info, state, stored_re);
|
218
|
+
auto &groups = GetGroupsBuffer(info, state, non_const_args);
|
219
|
+
auto &string = ((string_t *)strings_data.data)[string_idx];
|
220
|
+
ExtractSingleTuple(string, re, group_index, groups, result, row);
|
221
|
+
}
|
222
|
+
|
223
|
+
if (args.AllConstant()) {
|
224
|
+
result.SetVectorType(VectorType::CONSTANT_VECTOR);
|
225
|
+
}
|
226
|
+
}
|
227
|
+
|
228
|
+
unique_ptr<FunctionData> RegexpExtractAll::Bind(ClientContext &context, ScalarFunction &bound_function,
|
229
|
+
vector<unique_ptr<Expression>> &arguments) {
|
230
|
+
D_ASSERT(arguments.size() >= 2);
|
231
|
+
|
232
|
+
duckdb_re2::RE2::Options options;
|
233
|
+
|
234
|
+
string constant_string;
|
235
|
+
bool constant_pattern = TryParseConstantPattern(context, *arguments[1], constant_string);
|
236
|
+
|
237
|
+
if (arguments.size() >= 4) {
|
238
|
+
ParseRegexOptions(context, *arguments[3], options);
|
239
|
+
}
|
240
|
+
return make_unique<RegexpExtractBindData>(options, std::move(constant_string), constant_pattern, "");
|
241
|
+
}
|
242
|
+
|
243
|
+
} // namespace duckdb
|
@@ -0,0 +1,79 @@
|
|
1
|
+
#include "duckdb/function/scalar/regexp.hpp"
|
2
|
+
#include "duckdb/execution/expression_executor.hpp"
|
3
|
+
|
4
|
+
namespace duckdb {
|
5
|
+
|
6
|
+
namespace regexp_util {
|
7
|
+
|
8
|
+
bool TryParseConstantPattern(ClientContext &context, Expression &expr, string &constant_string) {
|
9
|
+
if (!expr.IsFoldable()) {
|
10
|
+
return false;
|
11
|
+
}
|
12
|
+
Value pattern_str = ExpressionExecutor::EvaluateScalar(context, expr);
|
13
|
+
if (!pattern_str.IsNull() && pattern_str.type().id() == LogicalTypeId::VARCHAR) {
|
14
|
+
constant_string = StringValue::Get(pattern_str);
|
15
|
+
return true;
|
16
|
+
}
|
17
|
+
return false;
|
18
|
+
}
|
19
|
+
|
20
|
+
void ParseRegexOptions(const string &options, duckdb_re2::RE2::Options &result, bool *global_replace) {
|
21
|
+
for (idx_t i = 0; i < options.size(); i++) {
|
22
|
+
switch (options[i]) {
|
23
|
+
case 'c':
|
24
|
+
// case-sensitive matching
|
25
|
+
result.set_case_sensitive(true);
|
26
|
+
break;
|
27
|
+
case 'i':
|
28
|
+
// case-insensitive matching
|
29
|
+
result.set_case_sensitive(false);
|
30
|
+
break;
|
31
|
+
case 'l':
|
32
|
+
// literal matching
|
33
|
+
result.set_literal(true);
|
34
|
+
break;
|
35
|
+
case 'm':
|
36
|
+
case 'n':
|
37
|
+
case 'p':
|
38
|
+
// newline-sensitive matching
|
39
|
+
result.set_dot_nl(false);
|
40
|
+
break;
|
41
|
+
case 's':
|
42
|
+
// non-newline-sensitive matching
|
43
|
+
result.set_dot_nl(true);
|
44
|
+
break;
|
45
|
+
case 'g':
|
46
|
+
// global replace, only available for regexp_replace
|
47
|
+
if (global_replace) {
|
48
|
+
*global_replace = true;
|
49
|
+
} else {
|
50
|
+
throw InvalidInputException("Option 'g' (global replace) is only valid for regexp_replace");
|
51
|
+
}
|
52
|
+
break;
|
53
|
+
case ' ':
|
54
|
+
case '\t':
|
55
|
+
case '\n':
|
56
|
+
// ignore whitespace
|
57
|
+
break;
|
58
|
+
default:
|
59
|
+
throw InvalidInputException("Unrecognized Regex option %c", options[i]);
|
60
|
+
}
|
61
|
+
}
|
62
|
+
}
|
63
|
+
|
64
|
+
void ParseRegexOptions(ClientContext &context, Expression &expr, RE2::Options &target, bool *global_replace) {
|
65
|
+
if (expr.HasParameter()) {
|
66
|
+
throw ParameterNotResolvedException();
|
67
|
+
}
|
68
|
+
if (!expr.IsFoldable()) {
|
69
|
+
throw InvalidInputException("Regex options field must be a constant");
|
70
|
+
}
|
71
|
+
Value options_str = ExpressionExecutor::EvaluateScalar(context, expr);
|
72
|
+
if (!options_str.IsNull() && options_str.type().id() == LogicalTypeId::VARCHAR) {
|
73
|
+
ParseRegexOptions(StringValue::Get(options_str), target, global_replace);
|
74
|
+
}
|
75
|
+
}
|
76
|
+
|
77
|
+
} // namespace regexp_util
|
78
|
+
|
79
|
+
} // namespace duckdb
|
@@ -12,6 +12,11 @@
|
|
12
12
|
|
13
13
|
namespace duckdb {
|
14
14
|
|
15
|
+
using regexp_util::CreateStringPiece;
|
16
|
+
using regexp_util::Extract;
|
17
|
+
using regexp_util::ParseRegexOptions;
|
18
|
+
using regexp_util::TryParseConstantPattern;
|
19
|
+
|
15
20
|
static bool RegexOptionsEquals(const duckdb_re2::RE2::Options &opt_a, const duckdb_re2::RE2::Options &opt_b) {
|
16
21
|
return opt_a.case_sensitive() == opt_b.case_sensitive();
|
17
22
|
}
|
@@ -32,10 +37,6 @@ bool RegexpBaseBindData::Equals(const FunctionData &other_p) const {
|
|
32
37
|
RegexOptionsEquals(options, other.options);
|
33
38
|
}
|
34
39
|
|
35
|
-
static inline duckdb_re2::StringPiece CreateStringPiece(string_t &input) {
|
36
|
-
return duckdb_re2::StringPiece(input.GetDataUnsafe(), input.GetSize());
|
37
|
-
}
|
38
|
-
|
39
40
|
unique_ptr<FunctionLocalState> RegexInitLocalState(ExpressionState &state, const BoundFunctionExpression &expr,
|
40
41
|
FunctionData *bind_data) {
|
41
42
|
auto &info = (RegexpBaseBindData &)*bind_data;
|
@@ -45,75 +46,6 @@ unique_ptr<FunctionLocalState> RegexInitLocalState(ExpressionState &state, const
|
|
45
46
|
return nullptr;
|
46
47
|
}
|
47
48
|
|
48
|
-
static void ParseRegexOptions(const string &options, duckdb_re2::RE2::Options &result, bool *global_replace = nullptr) {
|
49
|
-
for (idx_t i = 0; i < options.size(); i++) {
|
50
|
-
switch (options[i]) {
|
51
|
-
case 'c':
|
52
|
-
// case-sensitive matching
|
53
|
-
result.set_case_sensitive(true);
|
54
|
-
break;
|
55
|
-
case 'i':
|
56
|
-
// case-insensitive matching
|
57
|
-
result.set_case_sensitive(false);
|
58
|
-
break;
|
59
|
-
case 'l':
|
60
|
-
// literal matching
|
61
|
-
result.set_literal(true);
|
62
|
-
break;
|
63
|
-
case 'm':
|
64
|
-
case 'n':
|
65
|
-
case 'p':
|
66
|
-
// newline-sensitive matching
|
67
|
-
result.set_dot_nl(false);
|
68
|
-
break;
|
69
|
-
case 's':
|
70
|
-
// non-newline-sensitive matching
|
71
|
-
result.set_dot_nl(true);
|
72
|
-
break;
|
73
|
-
case 'g':
|
74
|
-
// global replace, only available for regexp_replace
|
75
|
-
if (global_replace) {
|
76
|
-
*global_replace = true;
|
77
|
-
} else {
|
78
|
-
throw InvalidInputException("Option 'g' (global replace) is only valid for regexp_replace");
|
79
|
-
}
|
80
|
-
break;
|
81
|
-
case ' ':
|
82
|
-
case '\t':
|
83
|
-
case '\n':
|
84
|
-
// ignore whitespace
|
85
|
-
break;
|
86
|
-
default:
|
87
|
-
throw InvalidInputException("Unrecognized Regex option %c", options[i]);
|
88
|
-
}
|
89
|
-
}
|
90
|
-
}
|
91
|
-
|
92
|
-
void ParseRegexOptions(ClientContext &context, Expression &expr, RE2::Options &target, bool *global_replace = nullptr) {
|
93
|
-
if (expr.HasParameter()) {
|
94
|
-
throw ParameterNotResolvedException();
|
95
|
-
}
|
96
|
-
if (!expr.IsFoldable()) {
|
97
|
-
throw InvalidInputException("Regex options field must be a constant");
|
98
|
-
}
|
99
|
-
Value options_str = ExpressionExecutor::EvaluateScalar(context, expr);
|
100
|
-
if (!options_str.IsNull() && options_str.type().id() == LogicalTypeId::VARCHAR) {
|
101
|
-
ParseRegexOptions(StringValue::Get(options_str), target, global_replace);
|
102
|
-
}
|
103
|
-
}
|
104
|
-
|
105
|
-
static bool TryParseConstantPattern(ClientContext &context, Expression &expr, string &constant_string) {
|
106
|
-
if (!expr.IsFoldable()) {
|
107
|
-
return false;
|
108
|
-
}
|
109
|
-
Value pattern_str = ExpressionExecutor::EvaluateScalar(context, expr);
|
110
|
-
if (!pattern_str.IsNull() && pattern_str.type().id() == LogicalTypeId::VARCHAR) {
|
111
|
-
constant_string = StringValue::Get(pattern_str);
|
112
|
-
return true;
|
113
|
-
}
|
114
|
-
return false;
|
115
|
-
}
|
116
|
-
|
117
49
|
//===--------------------------------------------------------------------===//
|
118
50
|
// Regexp Matches
|
119
51
|
//===--------------------------------------------------------------------===//
|
@@ -321,13 +253,6 @@ static unique_ptr<FunctionData> RegexExtractBind(ClientContext &context, ScalarF
|
|
321
253
|
std::move(group_string));
|
322
254
|
}
|
323
255
|
|
324
|
-
inline static string_t Extract(const string_t &input, Vector &result, const RE2 &re,
|
325
|
-
const duckdb_re2::StringPiece &rewrite) {
|
326
|
-
std::string extracted;
|
327
|
-
RE2::Extract(input.GetString(), re, rewrite, &extracted);
|
328
|
-
return StringVector::AddString(result, extracted.c_str(), extracted.size());
|
329
|
-
}
|
330
|
-
|
331
256
|
static void RegexExtractFunction(DataChunk &args, ExpressionState &state, Vector &result) {
|
332
257
|
auto &func_expr = (BoundFunctionExpression &)state.expr;
|
333
258
|
const auto &info = (RegexpExtractBindData &)*func_expr.bind_info;
|
@@ -391,10 +316,26 @@ void RegexpFun::RegisterFunction(BuiltinFunctions &set) {
|
|
391
316
|
RegexExtractFunction, RegexExtractBind, nullptr, nullptr, RegexInitLocalState, LogicalType::INVALID,
|
392
317
|
FunctionSideEffects::NO_SIDE_EFFECTS, FunctionNullHandling::SPECIAL_HANDLING));
|
393
318
|
|
319
|
+
ScalarFunctionSet regexp_extract_all("regexp_extract_all");
|
320
|
+
regexp_extract_all.AddFunction(ScalarFunction(
|
321
|
+
{LogicalType::VARCHAR, LogicalType::VARCHAR}, LogicalType::LIST(LogicalType::VARCHAR),
|
322
|
+
RegexpExtractAll::Execute, RegexpExtractAll::Bind, nullptr, nullptr, RegexpExtractAll::InitLocalState,
|
323
|
+
LogicalType::INVALID, FunctionSideEffects::NO_SIDE_EFFECTS, FunctionNullHandling::SPECIAL_HANDLING));
|
324
|
+
regexp_extract_all.AddFunction(ScalarFunction(
|
325
|
+
{LogicalType::VARCHAR, LogicalType::VARCHAR, LogicalType::INTEGER}, LogicalType::LIST(LogicalType::VARCHAR),
|
326
|
+
RegexpExtractAll::Execute, RegexpExtractAll::Bind, nullptr, nullptr, RegexpExtractAll::InitLocalState,
|
327
|
+
LogicalType::INVALID, FunctionSideEffects::NO_SIDE_EFFECTS, FunctionNullHandling::SPECIAL_HANDLING));
|
328
|
+
regexp_extract_all.AddFunction(
|
329
|
+
ScalarFunction({LogicalType::VARCHAR, LogicalType::VARCHAR, LogicalType::INTEGER, LogicalType::VARCHAR},
|
330
|
+
LogicalType::LIST(LogicalType::VARCHAR), RegexpExtractAll::Execute, RegexpExtractAll::Bind,
|
331
|
+
nullptr, nullptr, RegexpExtractAll::InitLocalState, LogicalType::INVALID,
|
332
|
+
FunctionSideEffects::NO_SIDE_EFFECTS, FunctionNullHandling::SPECIAL_HANDLING));
|
333
|
+
|
394
334
|
set.AddFunction(regexp_full_match);
|
395
335
|
set.AddFunction(regexp_partial_match);
|
396
336
|
set.AddFunction(regexp_replace);
|
397
337
|
set.AddFunction(regexp_extract);
|
338
|
+
set.AddFunction(regexp_extract_all);
|
398
339
|
}
|
399
340
|
|
400
341
|
} // namespace duckdb
|
@@ -20,7 +20,13 @@ void ShiftRight(unsigned char *ar, int size, int shift) {
|
|
20
20
|
|
21
21
|
void GetValidityMask(ValidityMask &mask, ArrowArray &array, ArrowScanLocalState &scan_state, idx_t size,
|
22
22
|
int64_t nested_offset = -1, bool add_null = false) {
|
23
|
-
|
23
|
+
// In certain cases we don't need to or cannot copy arrow's validity mask to duckdb.
|
24
|
+
//
|
25
|
+
// The conditions where we do want to copy arrow's mask to duckdb are:
|
26
|
+
// 1. nulls exist
|
27
|
+
// 2. n_buffers > 0, meaning the array's arrow type is not `null`
|
28
|
+
// 3. the validity buffer (the first buffer) is not a nullptr
|
29
|
+
if (array.null_count != 0 && array.n_buffers > 0 && array.buffers[0]) {
|
24
30
|
auto bit_offset = scan_state.chunk_offset + array.offset;
|
25
31
|
if (nested_offset != -1) {
|
26
32
|
bit_offset = nested_offset;
|
@@ -332,7 +332,7 @@ void TableScanPushdownComplexFilter(ClientContext &context, LogicalGet &get, Fun
|
|
332
332
|
auto comparison_type = comparison->type;
|
333
333
|
if (comparison->left->type == ExpressionType::VALUE_CONSTANT) {
|
334
334
|
// the expression is on the right side, we flip them around
|
335
|
-
comparison_type =
|
335
|
+
comparison_type = FlipComparisonExpression(comparison_type);
|
336
336
|
}
|
337
337
|
if (comparison_type == ExpressionType::COMPARE_EQUAL) {
|
338
338
|
// equality value
|
@@ -1,8 +1,8 @@
|
|
1
1
|
#ifndef DUCKDB_VERSION
|
2
|
-
#define DUCKDB_VERSION "0.7.2-dev717"
|
2
|
+
#define DUCKDB_VERSION "0.7.2-dev832"
|
3
3
|
#endif
|
4
4
|
#ifndef DUCKDB_SOURCE_ID
|
5
|
-
#define DUCKDB_SOURCE_ID "
|
5
|
+
#define DUCKDB_SOURCE_ID "a12329e657"
|
6
6
|
#endif
|
7
7
|
#include "duckdb/function/table/system_functions.hpp"
|
8
8
|
#include "duckdb/main/database.hpp"
|
@@ -205,11 +205,10 @@ string ExpressionTypeToOperator(ExpressionType type);
|
|
205
205
|
|
206
206
|
// Operator String to ExpressionType (e.g. + => OPERATOR_ADD)
|
207
207
|
ExpressionType OperatorToExpressionType(const string &op);
|
208
|
-
|
209
208
|
//! Negate a comparison expression, turning e.g. = into !=, or < into >=
|
210
|
-
ExpressionType
|
209
|
+
ExpressionType NegateComparisonExpression(ExpressionType type);
|
211
210
|
//! Flip a comparison expression, turning e.g. < into >, or = into =
|
212
|
-
ExpressionType
|
211
|
+
ExpressionType FlipComparisonExpression(ExpressionType type);
|
213
212
|
|
214
213
|
DUCKDB_API string ExpressionClassToString(ExpressionClass type);
|
215
214
|
|
@@ -9,10 +9,12 @@
|
|
9
9
|
#pragma once
|
10
10
|
|
11
11
|
#include "duckdb/common/constants.hpp"
|
12
|
+
#include "duckdb/common/exception.hpp"
|
12
13
|
|
13
14
|
namespace duckdb {
|
14
15
|
|
15
16
|
enum class OrderType : uint8_t { INVALID = 0, ORDER_DEFAULT = 1, ASCENDING = 2, DESCENDING = 3 };
|
17
|
+
|
16
18
|
enum class OrderByNullType : uint8_t { INVALID = 0, ORDER_DEFAULT = 1, NULLS_FIRST = 2, NULLS_LAST = 3 };
|
17
19
|
|
18
20
|
} // namespace duckdb
|
@@ -13,6 +13,7 @@
|
|
13
13
|
#include "duckdb/common/exception_format_value.hpp"
|
14
14
|
#include "duckdb/common/vector.hpp"
|
15
15
|
|
16
|
+
#include <map>
|
16
17
|
#include <stdexcept>
|
17
18
|
|
18
19
|
namespace duckdb {
|
@@ -283,29 +284,59 @@ public:
|
|
283
284
|
|
284
285
|
class HTTPException : public IOException {
|
285
286
|
public:
|
286
|
-
|
287
|
-
|
287
|
+
template <typename>
|
288
|
+
struct ResponseShape {
|
289
|
+
typedef int status;
|
290
|
+
};
|
291
|
+
|
292
|
+
template <class RESPONSE, typename ResponseShape<decltype(RESPONSE::status)>::status = 0, typename... ARGS>
|
293
|
+
explicit HTTPException(RESPONSE &response, const string &msg, ARGS... params)
|
294
|
+
: HTTPException(response.status, response.body, response.headers, response.reason, msg, params...) {
|
295
|
+
}
|
296
|
+
|
297
|
+
template <typename>
|
298
|
+
struct ResponseWrapperShape {
|
299
|
+
typedef int code;
|
300
|
+
};
|
301
|
+
template <class RESPONSE, typename ResponseWrapperShape<decltype(RESPONSE::code)>::code = 0, typename... ARGS>
|
302
|
+
explicit HTTPException(RESPONSE &response, const string &msg, ARGS... params)
|
303
|
+
: HTTPException(response.code, response.body, response.headers, response.error, msg, params...) {
|
288
304
|
}
|
289
305
|
|
290
|
-
template <typename... ARGS>
|
291
|
-
explicit HTTPException(int status_code, string
|
292
|
-
|
306
|
+
template <typename HEADERS, typename... ARGS>
|
307
|
+
explicit HTTPException(int status_code, string response_body, HEADERS headers, const string &reason,
|
308
|
+
const string &msg, ARGS... params)
|
309
|
+
: IOException(ExceptionType::HTTP, ConstructMessage(msg, params...)), status_code(status_code), reason(reason),
|
310
|
+
response_body(std::move(response_body)) {
|
311
|
+
this->headers.insert(headers.begin(), headers.end());
|
312
|
+
D_ASSERT(this->headers.size() > 0);
|
293
313
|
}
|
294
314
|
|
295
315
|
std::shared_ptr<Exception> Copy() const {
|
296
|
-
return make_shared<HTTPException>(status_code,
|
316
|
+
return make_shared<HTTPException>(status_code, response_body, headers, reason, RawMessage());
|
297
317
|
}
|
298
318
|
|
319
|
+
const std::multimap<std::string, std::string> GetHeaders() const {
|
320
|
+
return headers;
|
321
|
+
}
|
299
322
|
int GetStatusCode() const {
|
300
323
|
return status_code;
|
301
324
|
}
|
302
|
-
const string &
|
303
|
-
return
|
325
|
+
const string &GetResponseBody() const {
|
326
|
+
return response_body;
|
327
|
+
}
|
328
|
+
const string &GetReason() const {
|
329
|
+
return reason;
|
330
|
+
}
|
331
|
+
[[noreturn]] void Throw() const {
|
332
|
+
throw HTTPException(status_code, response_body, headers, reason, RawMessage());
|
304
333
|
}
|
305
334
|
|
306
335
|
private:
|
307
336
|
int status_code;
|
308
|
-
string
|
337
|
+
string reason;
|
338
|
+
string response_body;
|
339
|
+
std::multimap<string, string> headers;
|
309
340
|
};
|
310
341
|
|
311
342
|
class SerializationException : public Exception {
|
@@ -39,6 +39,9 @@ public:
|
|
39
39
|
//! Lets us do things like 'if (error)'
|
40
40
|
DUCKDB_API operator bool() const;
|
41
41
|
DUCKDB_API bool operator==(const PreservedError &other) const;
|
42
|
+
DUCKDB_API const shared_ptr<Exception> &GetError() {
|
43
|
+
return exception_instance;
|
44
|
+
}
|
42
45
|
|
43
46
|
private:
|
44
47
|
//! Whether this PreservedError contains an exception or not
|