duckdb 0.6.2-dev1166.0 → 0.6.2-dev1170.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "name": "duckdb",
3
3
  "main": "./lib/duckdb.js",
4
4
  "types": "./lib/duckdb.d.ts",
5
- "version": "0.6.2-dev1166.0",
5
+ "version": "0.6.2-dev1170.0",
6
6
  "description": "DuckDB node.js API",
7
7
  "gypfile": true,
8
8
  "dependencies": {
@@ -832,6 +832,27 @@ vector<LogicalType> BufferedCSVReader::RefineTypeDetection(const vector<LogicalT
832
832
  return detected_types;
833
833
  }
834
834
 
835
+ string BufferedCSVReader::ColumnTypesError(case_insensitive_map_t<idx_t> sql_types_per_column,
836
+ const vector<string> &names) {
837
+ for (idx_t i = 0; i < names.size(); i++) {
838
+ auto it = sql_types_per_column.find(names[i]);
839
+ if (it != sql_types_per_column.end()) {
840
+ sql_types_per_column.erase(names[i]);
841
+ continue;
842
+ }
843
+ }
844
+ if (sql_types_per_column.empty()) {
845
+ return string();
846
+ }
847
+ string exception = "COLUMN_TYPES error: Columns with names: ";
848
+ for (auto &col : sql_types_per_column) {
849
+ exception += "\"" + col.first + "\",";
850
+ }
851
+ exception.pop_back();
852
+ exception += " do not exist in the CSV File";
853
+ return exception;
854
+ }
855
+
835
856
  vector<LogicalType> BufferedCSVReader::SniffCSV(const vector<LogicalType> &requested_types) {
836
857
  for (auto &type : requested_types) {
837
858
  // auto detect for blobs not supported: there may be invalid UTF-8 in the file
@@ -887,23 +908,38 @@ vector<LogicalType> BufferedCSVReader::SniffCSV(const vector<LogicalType> &reque
887
908
  // #######
888
909
  options.num_cols = best_num_cols;
889
910
  DetectHeader(best_sql_types_candidates, best_header_row);
890
- auto sql_types_per_column = options.sql_types_per_column;
891
- for (idx_t i = 0; i < names.size(); i++) {
892
- auto it = sql_types_per_column.find(names[i]);
893
- if (it != sql_types_per_column.end()) {
894
- best_sql_types_candidates[i] = {it->second};
895
- sql_types_per_column.erase(names[i]);
896
- }
897
- }
898
- if (!sql_types_per_column.empty()) {
899
- string exception = "COLUMN_TYPES error: Columns with names: ";
900
- for (auto &col : sql_types_per_column) {
901
- exception += "\"" + col.first + "\",";
911
+ if (!options.sql_type_list.empty()) {
912
+ // user-defined types were supplied for certain columns
913
+ // override the types
914
+ if (!options.sql_types_per_column.empty()) {
915
+ // types supplied as name -> value map
916
+ idx_t found = 0;
917
+ for (idx_t i = 0; i < names.size(); i++) {
918
+ auto it = options.sql_types_per_column.find(names[i]);
919
+ if (it != options.sql_types_per_column.end()) {
920
+ best_sql_types_candidates[i] = {options.sql_type_list[it->second]};
921
+ found++;
922
+ continue;
923
+ }
924
+ }
925
+ if (!options.union_by_name && found < options.sql_types_per_column.size()) {
926
+ string exception = ColumnTypesError(options.sql_types_per_column, names);
927
+ if (!exception.empty()) {
928
+ throw BinderException(exception);
929
+ }
930
+ }
931
+ } else {
932
+ // types supplied as list
933
+ if (names.size() < options.sql_type_list.size()) {
934
+ throw BinderException("read_csv: %d types were provided, but CSV file only has %d columns",
935
+ options.sql_type_list.size(), names.size());
936
+ }
937
+ for (idx_t i = 0; i < options.sql_type_list.size(); i++) {
938
+ best_sql_types_candidates[i] = {options.sql_type_list[i]};
939
+ }
902
940
  }
903
- exception.pop_back();
904
- exception += " do not exist in the CSV File";
905
- throw BinderException(exception);
906
941
  }
942
+
907
943
  // #######
908
944
  // ### type detection (refining)
909
945
  // #######
@@ -89,24 +89,45 @@ static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, TableFunctio
89
89
  if (names.empty()) {
90
90
  throw BinderException("read_csv requires at least a single column as input!");
91
91
  }
92
- } else if (loption == "column_types") {
92
+ } else if (loption == "column_types" || loption == "types" || loption == "dtypes") {
93
93
  auto &child_type = kv.second.type();
94
- if (child_type.id() != LogicalTypeId::STRUCT) {
95
- throw BinderException("read_csv_auto column_types requires a struct as input");
94
+ if (child_type.id() != LogicalTypeId::STRUCT && child_type.id() != LogicalTypeId::LIST) {
95
+ throw BinderException("read_csv_auto %s requires a struct or list as input", kv.first);
96
96
  }
97
- auto &struct_children = StructValue::GetChildren(kv.second);
98
- D_ASSERT(StructType::GetChildCount(child_type) == struct_children.size());
99
- for (idx_t i = 0; i < struct_children.size(); i++) {
100
- auto &name = StructType::GetChildName(child_type, i);
101
- auto &val = struct_children[i];
102
- if (val.type().id() != LogicalTypeId::VARCHAR) {
103
- throw BinderException("read_csv_auto requires a type specification as string");
97
+ if (!options.sql_type_list.empty()) {
98
+ throw BinderException("read_csv_auto column_types/types/dtypes can only be supplied once");
99
+ }
100
+ vector<string> sql_type_names;
101
+ if (child_type.id() == LogicalTypeId::STRUCT) {
102
+ auto &struct_children = StructValue::GetChildren(kv.second);
103
+ D_ASSERT(StructType::GetChildCount(child_type) == struct_children.size());
104
+ for (idx_t i = 0; i < struct_children.size(); i++) {
105
+ auto &name = StructType::GetChildName(child_type, i);
106
+ auto &val = struct_children[i];
107
+ if (val.type().id() != LogicalTypeId::VARCHAR) {
108
+ throw BinderException("read_csv_auto %s requires a type specification as string", kv.first);
109
+ }
110
+ sql_type_names.push_back(StringValue::Get(val));
111
+ options.sql_types_per_column[name] = i;
104
112
  }
105
- auto def_type = TransformStringToLogicalType(StringValue::Get(val));
113
+ } else {
114
+ auto &list_child = ListType::GetChildType(child_type);
115
+ if (list_child.id() != LogicalTypeId::VARCHAR) {
116
+ throw BinderException("read_csv_auto %s requires a list of types (varchar) as input", kv.first);
117
+ }
118
+ auto &children = ListValue::GetChildren(kv.second);
119
+ for (auto &child : children) {
120
+ sql_type_names.push_back(StringValue::Get(child));
121
+ }
122
+ }
123
+ options.sql_type_list.reserve(sql_type_names.size());
124
+ for (auto &sql_type : sql_type_names) {
125
+ auto def_type = TransformStringToLogicalType(sql_type);
106
126
  if (def_type.id() == LogicalTypeId::USER) {
107
- throw BinderException("Unrecognized type for read_csv_auto column_types definition");
127
+ throw BinderException("Unrecognized type \"%s\" for read_csv_auto %s definition", sql_type,
128
+ kv.first);
108
129
  }
109
- options.sql_types_per_column[name] = def_type;
130
+ options.sql_type_list.push_back(move(def_type));
110
131
  }
111
132
  } else if (loption == "all_varchar") {
112
133
  options.all_varchar = BooleanValue::Get(kv.second);
@@ -173,6 +194,13 @@ static unique_ptr<FunctionData> ReadCSVBind(ClientContext &context, TableFunctio
173
194
  const idx_t first_file_index = 0;
174
195
  result->initial_reader = std::move(result->union_readers[first_file_index]);
175
196
  D_ASSERT(names.size() == return_types.size());
197
+
198
+ if (!options.sql_types_per_column.empty()) {
199
+ auto exception = BufferedCSVReader::ColumnTypesError(options.sql_types_per_column, names);
200
+ if (!exception.empty()) {
201
+ throw BinderException(exception);
202
+ }
203
+ }
176
204
  }
177
205
 
178
206
  if (result->options.include_file_name) {
@@ -830,6 +858,8 @@ TableFunction ReadCSVTableFunction::GetAutoFunction(bool list_parameter) {
830
858
  read_csv_auto.cardinality = CSVReaderCardinality;
831
859
  ReadCSVAddNamedParameters(read_csv_auto);
832
860
  read_csv_auto.named_parameters["column_types"] = LogicalType::ANY;
861
+ read_csv_auto.named_parameters["dtypes"] = LogicalType::ANY;
862
+ read_csv_auto.named_parameters["types"] = LogicalType::ANY;
833
863
  return read_csv_auto;
834
864
  }
835
865
 
@@ -1,8 +1,8 @@
1
1
  #ifndef DUCKDB_VERSION
2
- #define DUCKDB_VERSION "0.6.2-dev1166"
2
+ #define DUCKDB_VERSION "0.6.2-dev1170"
3
3
  #endif
4
4
  #ifndef DUCKDB_SOURCE_ID
5
- #define DUCKDB_SOURCE_ID "67ceaf6e2c"
5
+ #define DUCKDB_SOURCE_ID "72d187c5ff"
6
6
  #endif
7
7
  #include "duckdb/function/table/system_functions.hpp"
8
8
  #include "duckdb/main/database.hpp"
@@ -76,6 +76,8 @@ public:
76
76
  //! Extract a single DataChunk from the CSV file and stores it in insert_chunk
77
77
  void ParseCSV(DataChunk &insert_chunk);
78
78
 
79
+ static string ColumnTypesError(case_insensitive_map_t<idx_t> sql_types_per_column, const vector<string> &names);
80
+
79
81
  private:
80
82
  //! Initialize Parser
81
83
  void Initialize(const vector<LogicalType> &requested_types);
@@ -13,6 +13,7 @@
13
13
  #include "duckdb/function/scalar/strftime.hpp"
14
14
  #include "duckdb/common/types/value.hpp"
15
15
  #include "duckdb/common/field_writer.hpp"
16
+ #include "duckdb/common/case_insensitive_map.hpp"
16
17
 
17
18
  namespace duckdb {
18
19
 
@@ -54,8 +55,10 @@ struct BufferedCSVReaderOptions {
54
55
  //===--------------------------------------------------------------------===//
55
56
  // CSVAutoOptions
56
57
  //===--------------------------------------------------------------------===//
57
- //! SQL Types defined per specific column
58
- unordered_map<string, LogicalType> sql_types_per_column;
58
+ //! SQL Type list mapping of name to SQL type index in sql_type_list
59
+ case_insensitive_map_t<idx_t> sql_types_per_column;
60
+ //! User-defined SQL type list
61
+ vector<LogicalType> sql_type_list;
59
62
  //===--------------------------------------------------------------------===//
60
63
  // ReadCSVOptions
61
64
  //===--------------------------------------------------------------------===//