rbql 0.19.3 → 0.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -32,7 +32,7 @@ The following two functions are avilable in the browser version:
32
32
  Run user query against input array of records and put the result set in the output array:
33
33
 
34
34
  ```
35
- async function query_table(user_query, input_table, output_table, output_warnings, join_table=null, input_column_names=null, join_column_names=null, normalize_column_names=true)
35
+ async function query_table(user_query, input_table, output_table, output_warnings, join_table=null, input_column_names=null, join_column_names=null, output_column_names=null, normalize_column_names=true)
36
36
  ```
37
37
 
38
38
  #### Parameters:
@@ -51,6 +51,8 @@ async function query_table(user_query, input_table, output_table, output_warning
51
51
  Names of _input_table_ columns which users of the app can use in their queries
52
52
  * _join_column_names_: **array**
53
53
  Names of _join_table_ columns which users of the app can use in their queries
54
+ * _output_column_names_: **array**
55
+ Output column names will be stored in this array after the query completion.
54
56
  * _normalize_column_names_: **boolean**
55
57
  If set to true - column names provided with _input_column_names_ and _join_column_names_ will be normalized to "a" and "b" prefix forms e.g. "Age" -> "a.Age", "Sale price" -> "b['Sale price']".
56
58
  If set to false - column names can be used in user queries "as is".
@@ -137,7 +139,7 @@ The following 3 functions are avilable in Node version:
137
139
  Run user query against input_path CSV file and save it as output_path CSV file.
138
140
 
139
141
  ```
140
- async function rbql.query_csv(user_query, input_path, input_delim, input_policy, output_path, output_delim, output_policy, csv_encoding, output_warnings)
142
+ async function rbql.query_csv(user_query, input_path, input_delim, input_policy, output_path, output_delim, output_policy, csv_encoding, output_warnings, with_headers=false, comment_prefix=null)
141
143
  ```
142
144
 
143
145
  #### Parameters:
@@ -161,6 +163,10 @@ async function rbql.query_csv(user_query, input_path, input_delim, input_policy,
161
163
  encoding of input, output and join tables (join table can be defined inside the user query)
162
164
  * _output_warnings_: **array**
163
165
  Warnings will be stored here after the query completion. If no warnings - the array would be empty
166
+ * _with_headers_: **boolean**
167
+ If set to `true`, treat the first record in the input (and join) file as a header.
168
+ * _comment_prefix_: **string**
169
+ Treat lines starting with the prefix as comments and skip them.
164
170
 
165
171
 
166
172
  ## Usage:
@@ -236,16 +242,14 @@ $ rbql-js --input input.csv --output result.csv
236
242
  ### Main Features
237
243
 
238
244
  * Use JavaScript expressions inside _SELECT_, _UPDATE_, _WHERE_ and _ORDER BY_ statements
239
- * Result set of any query immediately becomes a first-class table on it's own
240
- * Supports input tables with inconsistent number of fields per record
241
- * Output records appear in the same order as in input unless _ORDER BY_ is provided
242
- * Each record has a unique NR (record number) identifier
245
+ * Supports multiple input formats
246
+ * Result set of any query immediately becomes a first-class table on its own
247
+ * No need to provide FROM statement in the query when the input table is defined by the current context.
243
248
  * Supports all main SQL keywords
244
249
  * Supports aggregate functions and GROUP BY queries
245
- * Provides some new useful query modes which traditional SQL engines do not have
246
- * Supports both _TOP_ and _LIMIT_ keywords
247
250
  * Supports user-defined functions (UDF)
248
- * Works out of the box, no external dependencies
251
+ * Provides some new useful query modes which traditional SQL engines do not have
252
+ * Lightweight, dependency-free, works out of the box
249
253
 
250
254
  #### Limitations:
251
255
 
@@ -284,24 +288,15 @@ RBQL for CSV files provides the following variables which you can use in your qu
284
288
  Description: Number of fields in the current record
285
289
  * _a.name_, _b.Person_age_, ... _a.{Good_alphanumeric_column_name}_
286
290
  Variable type: **string**
287
- Description: Value of the field referenced by it's "name". You can use this notation if the field in the first (header) CSV line has a "good" alphanumeric name
291
+ Description: Value of the field referenced by its "name". You can use this notation if the field in the header has a "good" alphanumeric name
288
292
  * _a["object id"]_, _a['9.12341234']_, _b["%$ !! 10 20"]_ ... _a["Arbitrary column name!"]_
289
293
  Variable type: **string**
290
- Description: Value of the field referenced by it's "name". You can use this notation to reference fields by arbitrary values in the first (header) CSV line, even when there is no header at all
291
-
292
-
293
- #### Notes:
294
- * You can mix all variable types in a single query, example:
295
- ```select a1, b2 JOIN /path/to/b.csv ON a['Item Id'] == b.Identifier WHERE NR > 1 and parseInt(a.Weight) * 100 > parseInt(b["weight of the item"])```
296
- * Referencing fields by header names does not automatically skip the header line (you can use `where NR > 1` trick to skip it)
297
- * If you want to use RBQL as a library for your own app you can define your own custom variables and do not have to support the above mentioned CSV-related variables.
294
+ Description: Value of the field referenced by its "name". You can use this notation to reference fields by arbitrary values in the header
298
295
 
299
296
 
300
297
  ### UPDATE statement
301
298
 
302
- _UPDATE_ query produces a new table where original values are replaced according to the UPDATE expression, so it can also be considered a special type of SELECT query. This prevents accidental data loss from poorly written queries.
303
- _UPDATE SET_ is synonym to _UPDATE_, because in RBQL there is no need to specify the source table.
304
-
299
+ _UPDATE_ query produces a new table where original values are replaced according to the UPDATE expression, so it can also be considered a special type of SELECT query.
305
300
 
306
301
  ### Aggregate functions and queries
307
302
 
@@ -316,11 +311,11 @@ There is a workaround for the limitation above for _ARRAY_AGG_ function which su
316
311
 
317
312
  ### JOIN statements
318
313
 
319
- Join table B can be referenced either by it's file path or by it's name - an arbitary string which user should provide before executing the JOIN query.
320
- RBQL supports _STRICT LEFT JOIN_ which is like _LEFT JOIN_, but generates an error if any key in left table "A" doesn't have exactly one matching key in the right table "B".
314
+ Join table B can be referenced either by its file path or by its name - an arbitrary string which the user should provide before executing the JOIN query.
315
+ RBQL supports _STRICT LEFT JOIN_ which is like _LEFT JOIN_, but generates an error if any key in the left table "A" doesn't have exactly one matching key in the right table "B".
316
+ Table B path can be either relative to the working dir, relative to the main table or absolute.
321
317
  Limitation: _JOIN_ statements can't contain JavaScript expressions and must have the following form: _<JOIN\_KEYWORD> (/path/to/table.tsv | table_name ) ON a... == b... [AND a... == b... [AND ... ]]_
322
318
 
323
-
324
319
  ### SELECT EXCEPT statement
325
320
 
326
321
  SELECT EXCEPT can be used to select everything except specific columns. E.g. to select everything but columns 2 and 4, run: `SELECT * EXCEPT a2, a4`
@@ -337,6 +332,12 @@ RBQL does not support LIKE operator, instead it provides "like()" function which
337
332
  `SELECT * where like(a1, 'foo%bar')`
338
333
 
339
334
 
335
+ ### WITH (header) and WITH (noheader) statements
336
+ You can set whether the input (and join) CSV file has a header or not using the environment configuration parameters, which could be the `--with-headers` CLI flag, a GUI checkbox, or something else.
337
+ But it is also possible to override this selection directly in the query by adding either `WITH (header)` or `WITH (noheader)` statement at the end of the query.
338
+ Example: `select top 5 NR, * with (header)`
339
+
340
+
340
341
  ### User Defined Functions (UDF)
341
342
 
342
343
  RBQL supports User Defined Functions
@@ -346,8 +347,8 @@ You can define custom functions and/or import libraries in a special file: `~/.r
346
347
  ## Examples of RBQL queries
347
348
 
348
349
  * `select top 100 a1, a2 * 10, a4.length where a1 == "Buy" order by parseInt(a2) desc`
349
- * `select * order by Math.random() where NR > 1` - skip header record and random sort
350
- * `select top 20 a.vehicle_price.length / 10, a2 where NR > 1 && ["car", "plane", "boat"].indexOf(a['Vehicle type']) > -1 limit 20` - referencing columns by names from header record and skipping the header
350
+ * `select * order by Math.random()` - random sort
351
+ * `select top 20 a.vehicle_price.length / 10, a2 where parseInt(a.vehicle_price) < 500 && ["car", "plane", "boat"].indexOf(a['Vehicle type']) > -1 limit 20` - referencing columns by names from header
351
352
  * `update set a3 = 'NPC' where a3.indexOf('Non-playable character') != -1`
352
353
  * `select NR, *` - enumerate records, NR is 1-based
353
354
  * `select a1, b1, b2 inner join ./countries.txt on a2 == b1 order by a1, a3` - example of join query
@@ -355,16 +356,6 @@ You can define custom functions and/or import libraries in a special file: `~/.r
355
356
  * `select ...a1.split(':')` - Using JS "destructuring assignment" syntax to split one column into many. Do not try this with other SQL engines!
356
357
 
357
358
 
358
- ### FAQ
359
-
360
- #### How do I skip header record in CSV files?
361
-
362
- You can use the following trick: add `... where NR > 1 ...` to your query
363
-
364
- And if you are doing math operation you can modify your query like this, example:
365
- `select parseInt(a3) * 1000, a2` -> `select NR > 1 ? parseInt(a3) * 1000 : a3, a2`
366
-
367
-
368
359
  ### References
369
360
 
370
361
  * [RBQL: Official Site](https://rbql.org/)
package/cli_rbql.js CHANGED
@@ -158,7 +158,7 @@ async function autodetect_delim_policy(table_path) {
158
158
  }
159
159
 
160
160
 
161
- function print_colorized(records, delim, show_column_names, skip_header) {
161
+ function print_colorized(records, delim, show_column_names, with_headers) {
162
162
  let reset_color_code = '\x1b[0m';
163
163
  let color_codes = ['\x1b[0m', '\x1b[31m', '\x1b[32m', '\x1b[33m', '\x1b[34m', '\x1b[35m', '\x1b[36m', '\x1b[31;1m', '\x1b[32;1m', '\x1b[33;1m'];
164
164
  for (let r = 0; r < records.length; r++) {
@@ -166,7 +166,7 @@ function print_colorized(records, delim, show_column_names, skip_header) {
166
166
  for (let c = 0; c < records[r].length; c++) {
167
167
  let color_code = color_codes[c % color_codes.length];
168
168
  let field = records[r][c];
169
- let colored_field = (!show_column_names || (skip_header && r == 0)) ? color_code + field : `${color_code}a${c + 1}:${field}`;
169
+ let colored_field = (!show_column_names || (with_headers && r == 0)) ? color_code + field : `${color_code}a${c + 1}:${field}`;
170
170
  out_fields.push(colored_field);
171
171
  }
172
172
  let out_line = out_fields.join(delim) + reset_color_code;
@@ -208,7 +208,7 @@ async function run_with_js(args) {
208
208
  var input_path = get_default(args, 'input', null);
209
209
  var output_path = get_default(args, 'output', null);
210
210
  var csv_encoding = args['encoding'];
211
- var skip_header = args['skip-header'];
211
+ var with_headers = args['with-headers'];
212
212
  var comment_prefix = args['comment-prefix'];
213
213
  var output_delim = get_default(args, 'out-delim', null);
214
214
  var output_policy = get_default(args, 'out-policy', null);
@@ -229,8 +229,8 @@ async function run_with_js(args) {
229
229
  // * binary/latin-1 do not require the decoder anyway
230
230
  // * This is CLI so no way we are in the Electron environment which can't use the TextDecoder
231
231
  // * Streaming mode works a little faster (since we don't need to do the manual validation)
232
- // TODO check if the current node installation doesn't have ICU enabled and report a user-friendly error with an option to use latin-1 encoding or switch the interpreter
233
- await rbql_csv.query_csv(query, input_path, delim, policy, output_path, output_delim, output_policy, csv_encoding, warnings, skip_header, comment_prefix, user_init_code/*, {'bulk_read': true}*/);
232
+ // TODO check if the current node installation doesn't have ICU enabled (which is typically provided by Node.js by default, see https://nodejs.org/api/intl.html) and report a user-friendly error with an option to use latin-1 encoding or switch the interpreter
233
+ await rbql_csv.query_csv(query, input_path, delim, policy, output_path, output_delim, output_policy, csv_encoding, warnings, with_headers, comment_prefix, user_init_code/*, {'bulk_read': true}*/);
234
234
  await handle_query_success(warnings, output_path, csv_encoding, output_delim, output_policy);
235
235
  return true;
236
236
  } catch (e) {
@@ -250,11 +250,11 @@ function get_default_output_path(input_path, delim) {
250
250
  }
251
251
 
252
252
 
253
- async function show_preview(input_path, encoding, delim, policy, skip_header) {
253
+ async function show_preview(input_path, encoding, delim, policy, with_headers) {
254
254
  let [records, warnings] = await sample_records(input_path, encoding, delim, policy);
255
255
  console.log('Input table preview:');
256
256
  console.log('====================================');
257
- print_colorized(records, delim, true, skip_header);
257
+ print_colorized(records, delim, true, with_headers);
258
258
  console.log('====================================\n');
259
259
  for (let warning of warnings) {
260
260
  show_warning(warning);
@@ -280,7 +280,7 @@ async function run_interactive_loop(args) {
280
280
  if (!delim)
281
281
  throw new GenericError('Unable to autodetect table delimiter. Provide column separator explicitly with "--delim" option');
282
282
  }
283
- await show_preview(input_path, args['encoding'], delim, policy, args['skip-header']);
283
+ await show_preview(input_path, args['encoding'], delim, policy, args['with-headers']);
284
284
  args.delim = delim;
285
285
  args.policy = policy;
286
286
  if (!args.output) {
@@ -365,7 +365,7 @@ function main() {
365
365
  '--output': {'help': 'Write output table to FILE instead of stdout', 'metavar': 'FILE'},
366
366
  '--delim': {'help': 'Delimiter character or multicharacter string, e.g. "," or "###". Can be autodetected in interactive mode', 'metavar': 'DELIM'},
367
367
  '--policy': {'help': 'Split policy, see the explanation below. Supported values: "simple", "quoted", "quoted_rfc", "whitespace", "monocolumn". Can be autodetected in interactive mode', 'metavar': 'POLICY'},
368
- '--skip-header': {'boolean': true, 'help': 'Skip header line in input and join tables. Roughly equivalent of ... WHERE NR > 1 ... in your Query'},
368
+ '--with-headers': {'boolean': true, 'help': 'Indicates that input (and join) table has header'},
369
369
  '--comment-prefix': {'help': 'Ignore lines in input and join tables that start with the comment PREFIX, e.g. "#" or ">>"', 'metavar': 'PREFIX'},
370
370
  '--encoding': {'default': 'utf-8', 'help': 'Manually set csv encoding', 'metavar': 'ENCODING'},
371
371
  '--out-format': {'default': 'input', 'help': 'Output format. Supported values: ' + out_format_names.map(v => `"${v}"`).join(', '), 'metavar': 'FORMAT'},
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "rbql",
3
- "version": "0.19.3",
3
+ "version": "0.25.0",
4
4
  "description": "Rainbow Query Language",
5
5
  "keywords": ["CSV", "TSV", "spreadsheet", "SQL", "SQL-like", "transpiler", "CLI", "command-line", "library", "browser", "Node", "select", "update", "join"],
6
6
  "scripts": {
package/rbql.js CHANGED
@@ -66,11 +66,117 @@ class RBQLContext {
66
66
  }
67
67
  }
68
68
 
69
- var query_context = null; // Needs to be global for MIN(), MAX(), etc functions
69
+ var query_context = null; // Needs to be global for MIN(), MAX(), etc functions. TODO find a way to make it local.
70
70
 
71
71
 
72
72
  const wrong_aggregation_usage_error = 'Usage of RBQL aggregation functions inside JavaScript expressions is not allowed, see the docs';
73
- const RBQL_VERSION = '0.19.3';
73
+ const RBQL_VERSION = '0.25.0';
74
+
75
+
76
+ function check_if_brackets_match(opening_bracket, closing_bracket) {
77
+ return (opening_bracket == '[' && closing_bracket == ']') || (opening_bracket == '(' && closing_bracket == ')') || (opening_bracket == '{' && closing_bracket == '}');
78
+ }
79
+
80
+
81
+ function parse_root_bracket_level_text_spans(select_expression) {
82
+ let text_spans = []; // parts of text separated by commas at the root parenthesis level
83
+ let last_pos = 0;
84
+ let bracket_stack = [];
85
+ for (let i = 0; i < select_expression.length; i++) {
86
+ let cur_char = select_expression[i];
87
+ if (cur_char == ',' && bracket_stack.length == 0) {
88
+ text_spans.push(select_expression.substring(last_pos, i));
89
+ last_pos = i + 1;
90
+ } else if (['[', '{', '('].indexOf(cur_char) != -1) {
91
+ bracket_stack.push(cur_char);
92
+ } else if ([']', '}', ')'].indexOf(cur_char) != -1) {
93
+ if (bracket_stack.length && check_if_brackets_match(bracket_stack[bracket_stack.length - 1], cur_char)) {
94
+ bracket_stack.pop();
95
+ } else {
96
+ throw new RbqlParsingError(`Unable to parse column headers in SELECT expression: No matching opening bracket for closing "${cur_char}"`);
97
+ }
98
+ }
99
+ }
100
+ if (bracket_stack.length) {
101
+ throw new RbqlParsingError(`Unable to parse column headers in SELECT expression: No matching closing bracket for opening "${bracket_stack[0]}"`);
102
+ }
103
+ text_spans.push(select_expression.substring(last_pos, select_expression.length));
104
+ text_spans = text_spans.map(span => span.trim());
105
+ return text_spans;
106
+ }
107
+
108
+
109
+ function unquote_string(quoted_str) {
110
+ // It's possible to use eval here to unquote the quoted_column_name, but it would be a little barbaric, let's do it manually instead
111
+ if (!quoted_str || quoted_str.length < 2)
112
+ return null;
113
+ if (quoted_str[0] == "'" && quoted_str[quoted_str.length - 1] == "'") {
114
+ return quoted_str.substring(1, quoted_str.length - 1).replace(/\\'/g, "'").replace(/\\\\/g, "\\");
115
+ } else if (quoted_str[0] == '"' && quoted_str[quoted_str.length - 1] == '"') {
116
+ return quoted_str.substring(1, quoted_str.length - 1).replace(/\\"/g, '"').replace(/\\\\/g, "\\");
117
+ } else {
118
+ return null;
119
+ }
120
+ }
121
+
122
+
123
+ function column_info_from_text_span(text_span, string_literals) {
124
+ // This function is a rough equivalent of "column_info_from_node()" function in python version of RBQL
125
+ text_span = text_span.trim();
126
+ let rbql_star_marker = '__RBQL_INTERNAL_STAR';
127
+ let simple_var_match = /^[_a-zA-Z][_a-zA-Z0-9]*$/.exec(text_span);
128
+ let attribute_match = /^([ab])\.([_a-zA-Z][_a-zA-Z0-9]*)$/.exec(text_span);
129
+ let subscript_int_match = /^([ab])\[([0-9]+)\]$/.exec(text_span);
130
+ let subscript_str_match = /^([ab])\[___RBQL_STRING_LITERAL([0-9]+)___\]$/.exec(text_span);
131
+ if (simple_var_match !== null) {
132
+ if (text_span == rbql_star_marker)
133
+ return {table_name: null, column_index: null, column_name: null, is_star: true};
134
+ if (text_span.startsWith('___RBQL_STRING_LITERAL'))
135
+ return null;
136
+ let match = /^([ab])([0-9]+)$/.exec(text_span);
137
+ if (match !== null) {
138
+ return {table_name: match[1], column_index: parseInt(match[2]) - 1, column_name: null, is_star: false};
139
+ }
140
+ // Some examples for this branch: NR, NF
141
+ return {table_name: null, column_index: null, column_name: text_span, is_star: false};
142
+ } else if (attribute_match !== null) {
143
+ let table_name = attribute_match[1];
144
+ let column_name = attribute_match[2];
145
+ if (column_name == rbql_star_marker) {
146
+ return {table_name: table_name, column_index: null, column_name: null, is_star: true};
147
+ }
148
+ return {table_name: null, column_index: null, column_name: column_name, is_star: false};
149
+ } else if (subscript_int_match != null) {
150
+ let table_name = subscript_int_match[1];
151
+ let column_index = parseInt(subscript_int_match[2]) - 1;
152
+ return {table_name: table_name, column_index: column_index, column_name: null, is_star: false};
153
+ } else if (subscript_str_match != null) {
154
+ let table_name = subscript_str_match[1];
155
+ let replaced_string_literal_id = subscript_str_match[2];
156
+ if (replaced_string_literal_id < string_literals.length) {
157
+ let quoted_column_name = string_literals[replaced_string_literal_id];
158
+ let unquoted_column_name = unquote_string(quoted_column_name);
159
+ if (unquoted_column_name !== null && unquoted_column_name !== undefined) {
160
+ return {table_name: null, column_index: null, column_name: unquoted_column_name, is_star: false};
161
+ }
162
+ }
163
+ }
164
+ return null;
165
+ }
166
+
167
+
168
+ function adhoc_parse_select_expression_to_column_infos(select_expression, string_literals) {
169
+ // It is acceptable for the algorithm to provide null column name when it could be theoretically possible to deduce the name.
170
+ // I.e. this algorithm guarantees precision but doesn't guarantee completeness in all theoretically possible queries.
171
+ // Although the algorithm should be complete in all practical scenarios, i.e. it should be hard to come up with the query that doesn't produce complete set of column names.
172
+ // The null column name just means that the output column will be named as col{i}, so the failure to detect the proper column name can be tolerated.
173
+ // Specifically this function guarantees the following:
174
+ // 1. The number of column_infos is correct and will match the number of fields in each record in the output - otherwise the exception should be thrown
175
+ // 2. If column_info at pos j is not null, it is guaranteed to correctly represent that column name in the output
176
+ let text_spans = parse_root_bracket_level_text_spans(select_expression);
177
+ let column_infos = text_spans.map(ts => column_info_from_text_span(ts, string_literals));
178
+ return column_infos;
179
+ }
74
180
 
75
181
 
76
182
  function stable_compare(a, b) {
@@ -469,10 +575,10 @@ class TopWriter {
469
575
  this.top_count = top_count;
470
576
  }
471
577
 
472
- write(record) {
578
+ async write(record) {
473
579
  if (this.top_count !== null && this.NW >= this.top_count)
474
580
  return false;
475
- this.subwriter.write(record);
581
+ await this.subwriter.write(record);
476
582
  this.NW += 1;
477
583
  return true;
478
584
  }
@@ -489,10 +595,10 @@ class UniqWriter {
489
595
  this.seen = new Set();
490
596
  }
491
597
 
492
- write(record) {
598
+ async write(record) {
493
599
  if (!add_to_set(this.seen, JSON.stringify(record)))
494
600
  return true;
495
- if (!this.subwriter.write(record))
601
+ if (!await this.subwriter.write(record))
496
602
  return false;
497
603
  return true;
498
604
  }
@@ -509,7 +615,7 @@ class UniqCountWriter {
509
615
  this.records = new Map();
510
616
  }
511
617
 
512
- write(record) {
618
+ async write(record) {
513
619
  var key = JSON.stringify(record);
514
620
  var old_val = this.records.get(key);
515
621
  if (old_val) {
@@ -524,7 +630,7 @@ class UniqCountWriter {
524
630
  for (var [key, value] of this.records) {
525
631
  let [count, record] = value;
526
632
  record.unshift(count);
527
- if (!this.subwriter.write(record))
633
+ if (!await this.subwriter.write(record))
528
634
  break;
529
635
  }
530
636
  await this.subwriter.finish();
@@ -539,7 +645,7 @@ class SortedWriter {
539
645
  this.unsorted_entries = [];
540
646
  }
541
647
 
542
- write(stable_entry) {
648
+ async write(stable_entry) {
543
649
  this.unsorted_entries.push(stable_entry);
544
650
  return true;
545
651
  }
@@ -551,7 +657,7 @@ class SortedWriter {
551
657
  unsorted_entries.reverse();
552
658
  for (var i = 0; i < unsorted_entries.length; i++) {
553
659
  var entry = unsorted_entries[i];
554
- if (!this.subwriter.write(entry[entry.length - 1]))
660
+ if (!await this.subwriter.write(entry[entry.length - 1]))
555
661
  break;
556
662
  }
557
663
  await this.subwriter.finish();
@@ -575,7 +681,7 @@ class AggregateWriter {
575
681
  for (var ag of this.aggregators) {
576
682
  out_fields.push(ag.get_final(key));
577
683
  }
578
- if (!this.subwriter.write(out_fields))
684
+ if (!await this.subwriter.write(out_fields))
579
685
  break;
580
686
  }
581
687
  await this.subwriter.finish();
@@ -635,13 +741,13 @@ function select_except(src, except_fields) {
635
741
  }
636
742
 
637
743
 
638
- function select_simple(sort_key, NR, out_fields) {
744
+ async function select_simple(sort_key, NR, out_fields) {
639
745
  if (query_context.sort_key_expression !== null) {
640
746
  var sort_entry = sort_key.concat([NR, out_fields]);
641
- if (!query_context.writer.write(sort_entry))
747
+ if (!await query_context.writer.write(sort_entry))
642
748
  return false;
643
749
  } else {
644
- if (!query_context.writer.write(out_fields))
750
+ if (!await query_context.writer.write(out_fields))
645
751
  return false;
646
752
  }
647
753
  return true;
@@ -683,12 +789,12 @@ function select_aggregated(key, transparent_values) {
683
789
  }
684
790
 
685
791
 
686
- function select_unnested(sort_key, NR, folded_fields) {
792
+ async function select_unnested(sort_key, NR, folded_fields) {
687
793
  let out_fields = folded_fields.slice();
688
794
  let unnest_pos = folded_fields.findIndex(val => val instanceof UnnestMarker);
689
795
  for (var i = 0; i < query_context.unnest_list.length; i++) {
690
796
  out_fields[unnest_pos] = query_context.unnest_list[i];
691
- if (!select_simple(sort_key, NR, out_fields.slice()))
797
+ if (!await select_simple(sort_key, NR, out_fields.slice()))
692
798
  return false;
693
799
  }
694
800
  return true;
@@ -705,10 +811,10 @@ if (__RBQLMP__where_expression) {
705
811
  } else {
706
812
  let sort_key = [__RBQLMP__sort_key_expression];
707
813
  if (query_context.unnest_list !== null) {
708
- if (!select_unnested(sort_key, NR, out_fields))
814
+ if (!await select_unnested(sort_key, NR, out_fields))
709
815
  stop_flag = true;
710
816
  } else {
711
- if (!select_simple(sort_key, NR, out_fields))
817
+ if (!await select_simple(sort_key, NR, out_fields))
712
818
  stop_flag = true;
713
819
  }
714
820
  }
@@ -749,7 +855,7 @@ if (join_matches.length == 1 && (__RBQLMP__where_expression)) {
749
855
  NU += 1;
750
856
  __RBQLMP__update_expressions
751
857
  }
752
- if (!query_context.writer.write(up_fields))
858
+ if (!await query_context.writer.write(up_fields))
753
859
  stop_flag = true;
754
860
  `;
755
861
 
@@ -761,7 +867,7 @@ if (__RBQLMP__where_expression) {
761
867
  NU += 1;
762
868
  __RBQLMP__update_expressions
763
869
  }
764
- if (!query_context.writer.write(up_fields))
870
+ if (!await query_context.writer.write(up_fields))
765
871
  stop_flag = true;
766
872
  `;
767
873
 
@@ -866,7 +972,7 @@ async function compile_and_run(query_context) {
866
972
  if (lower_case_query.indexOf(' like ') != -1)
867
973
  throw new SyntaxError(e.message + "\nRBQL doesn't support \"LIKE\" operator, use like() function instead e.g. ... WHERE like(a1, 'foo%bar') ... "); // UT JSON
868
974
  if (lower_case_query.indexOf(' from ') != -1)
869
- throw new SyntaxError(e.message + "\nRBQL doesn't use \"FROM\" keyword, e.g. you can query 'SELECT *' without FROM"); // UT JSON
975
+ throw new SyntaxError(e.message + "\nTip: If input table is defined by the environment, RBQL query should not have \"FROM\" keyword"); // UT JSON
870
976
  if (e && e.message && String(e.message).toLowerCase().indexOf('unexpected identifier') != -1) {
871
977
  if (lower_case_query.indexOf(' and ') != -1)
872
978
  throw new SyntaxError(e.message + "\nDid you use 'and' keyword in your query?\nJavaScript backend doesn't support 'and' keyword, use '&&' operator instead!");
@@ -893,6 +999,7 @@ const ORDER_BY = 'ORDER BY';
893
999
  const WHERE = 'WHERE';
894
1000
  const LIMIT = 'LIMIT';
895
1001
  const EXCEPT = 'EXCEPT';
1002
+ const WITH = 'WITH';
896
1003
 
897
1004
 
898
1005
  function get_ambiguous_error_msg(variable_name) {
@@ -925,7 +1032,7 @@ function strip_comments(cline) {
925
1032
 
926
1033
  function combine_string_literals(backend_expression, string_literals) {
927
1034
  for (var i = 0; i < string_literals.length; i++) {
928
- backend_expression = replace_all(backend_expression, `###RBQL_STRING_LITERAL${i}###`, string_literals[i]);
1035
+ backend_expression = replace_all(backend_expression, `___RBQL_STRING_LITERAL${i}___`, string_literals[i]);
929
1036
  }
930
1037
  return backend_expression;
931
1038
  }
@@ -1172,6 +1279,24 @@ function replace_star_vars(rbql_expression) {
1172
1279
  }
1173
1280
 
1174
1281
 
1282
+ function replace_star_vars_for_header_parsing(rbql_expression) {
1283
+ let star_rgx = /(?:(?<=^)|(?<=,)) *(\*|a\.\*|b\.\*) *(?=$|,)/g;
1284
+ let matches = get_all_matches(star_rgx, rbql_expression);
1285
+ let last_pos = 0;
1286
+ let result = '';
1287
+ for (let match of matches) {
1288
+ let star_expression = match[1];
1289
+ let replacement_expression = {'*': '__RBQL_INTERNAL_STAR', 'a.*': 'a.__RBQL_INTERNAL_STAR', 'b.*': 'b.__RBQL_INTERNAL_STAR'}[star_expression];
1290
+ if (last_pos < match.index)
1291
+ result += rbql_expression.substring(last_pos, match.index);
1292
+ result += replacement_expression;
1293
+ last_pos = match.index + match[0].length;
1294
+ }
1295
+ result += rbql_expression.substring(last_pos);
1296
+ return result;
1297
+ }
1298
+
1299
+
1175
1300
  function translate_update_expression(update_expression, input_variables_map, string_literals, indent) {
1176
1301
  let first_assignment = str_strip(update_expression.split('=')[0]);
1177
1302
  let first_assignment_error = `Unable to parse "UPDATE" expression: the expression must start with assignment, but "${first_assignment}" does not look like an assignable field name`;
@@ -1203,12 +1328,12 @@ function translate_update_expression(update_expression, input_variables_map, str
1203
1328
 
1204
1329
 
1205
1330
  function translate_select_expression(select_expression) {
1206
- var translated = replace_star_count(select_expression);
1207
- translated = replace_star_vars(translated);
1208
- translated = str_strip(translated);
1331
+ let expression_without_stars = replace_star_count(select_expression);
1332
+ let translated = str_strip(replace_star_vars(expression_without_stars));
1333
+ let translated_for_header = str_strip(replace_star_vars_for_header_parsing(expression_without_stars));
1209
1334
  if (!translated.length)
1210
1335
  throw new RbqlParsingError('"SELECT" expression is empty');
1211
- return `[].concat([${translated}])`;
1336
+ return [`[].concat([${translated}])`, translated_for_header];
1212
1337
  }
1213
1338
 
1214
1339
 
@@ -1225,7 +1350,7 @@ function separate_string_literals(rbql_expression) {
1225
1350
  string_literals.push(string_literal);
1226
1351
  var start_index = match_obj.index;
1227
1352
  format_parts.push(rbql_expression.substring(idx_before, start_index));
1228
- format_parts.push(`###RBQL_STRING_LITERAL${literal_id}###`);
1353
+ format_parts.push(`___RBQL_STRING_LITERAL${literal_id}___`);
1229
1354
  idx_before = rgx.lastIndex;
1230
1355
  }
1231
1356
  format_parts.push(rbql_expression.substring(idx_before));
@@ -1269,8 +1394,13 @@ function locate_statements(rbql_expression) {
1269
1394
 
1270
1395
  function separate_actions(rbql_expression) {
1271
1396
  rbql_expression = str_strip(rbql_expression);
1272
- var ordered_statements = locate_statements(rbql_expression);
1273
1397
  var result = {};
1398
+ let with_match = /^(.*) *[Ww][Ii][Tt][Hh] *\(([a-z]{4,20})\) *$/.exec(rbql_expression);
1399
+ if (with_match !== null) {
1400
+ rbql_expression = with_match[1];
1401
+ result[WITH] = with_match[2];
1402
+ }
1403
+ var ordered_statements = locate_statements(rbql_expression);
1274
1404
  for (var i = 0; i < ordered_statements.length; i++) {
1275
1405
  var statement_start = ordered_statements[i][0];
1276
1406
  var span_start = ordered_statements[i][1];
@@ -1305,7 +1435,7 @@ function separate_actions(rbql_expression) {
1305
1435
  if (statement == SELECT) {
1306
1436
  if (statement_start != 0)
1307
1437
  throw new RbqlParsingError('SELECT keyword must be at the beginning of the query');
1308
- var match = /^ *TOP *([0-9]+) /i.exec(span);
1438
+ let match = /^ *TOP *([0-9]+) /i.exec(span);
1309
1439
  if (match !== null) {
1310
1440
  statement_params['top'] = parseInt(match[1]);
1311
1441
  span = span.substr(match.index + match[0].length);
@@ -1347,7 +1477,7 @@ function find_top(rb_actions) {
1347
1477
  }
1348
1478
 
1349
1479
 
1350
- function translate_except_expression(except_expression, input_variables_map, string_literals) {
1480
+ function translate_except_expression(except_expression, input_variables_map, string_literals, input_header) {
1351
1481
  let skip_vars = except_expression.split(',');
1352
1482
  skip_vars = skip_vars.map(str_strip);
1353
1483
  let skip_indices = [];
@@ -1358,8 +1488,9 @@ function translate_except_expression(except_expression, input_variables_map, str
1358
1488
  skip_indices.push(input_variables_map[var_name].index);
1359
1489
  }
1360
1490
  skip_indices = skip_indices.sort((a, b) => a - b);
1491
+ let output_header = input_header === null ? null : select_except(input_header, skip_indices);
1361
1492
  let indices_str = skip_indices.join(',');
1362
- return `select_except(record_a, [${indices_str}])`;
1493
+ return [output_header, `select_except(record_a, [${indices_str}])`];
1363
1494
  }
1364
1495
 
1365
1496
 
@@ -1428,7 +1559,7 @@ class HashJoinMap {
1428
1559
 
1429
1560
 
1430
1561
  function cleanup_query(query_text) {
1431
- return query_text.split('\n').map(strip_comments).filter(line => line.length).join(' ');
1562
+ return query_text.split('\n').map(strip_comments).filter(line => line.length).join(' ').replace(/;+$/g, '');
1432
1563
  }
1433
1564
 
1434
1565
 
@@ -1439,6 +1570,44 @@ function remove_redundant_table_name(query_text) {
1439
1570
  }
1440
1571
 
1441
1572
 
1573
+ function select_output_header(input_header, join_header, query_column_infos) {
1574
+ if (input_header === null && join_header === null)
1575
+ return null;
1576
+ if (input_header === null)
1577
+ input_header = [];
1578
+ if (join_header === null)
1579
+ join_header = [];
1580
+ let output_header = [];
1581
+ for (let qci of query_column_infos) {
1582
+ // TODO refactor this and python version: extract this code into a function instead to always return something
1583
+ if (qci === null) {
1584
+ output_header.push('col' + (output_header.length + 1));
1585
+ } else if (qci.is_star) {
1586
+ if (qci.table_name === null) {
1587
+ output_header = output_header.concat(input_header).concat(join_header);
1588
+ } else if (qci.table_name === 'a') {
1589
+ output_header = output_header.concat(input_header);
1590
+ } else if (qci.table_name === 'b') {
1591
+ output_header = output_header.concat(join_header);
1592
+ }
1593
+ } else if (qci.column_name !== null) {
1594
+ output_header.push(qci.column_name);
1595
+ } else if (qci.column_index !== null) {
1596
+ if (qci.table_name == 'a' && qci.column_index < input_header.length) {
1597
+ output_header.push(input_header[qci.column_index]);
1598
+ } else if (qci.table_name == 'b' && qci.column_index < join_header.length) {
1599
+ output_header.push(join_header[qci.column_index]);
1600
+ } else {
1601
+ output_header.push('col' + (output_header.length + 1));
1602
+ }
1603
+ } else { // Should never happen
1604
+ output_header.push('col' + (output_header.length + 1));
1605
+ }
1606
+ }
1607
+ return output_header;
1608
+ }
1609
+
1610
+
1442
1611
  function make_inconsistent_num_fields_warning(table_name, inconsistent_records_info) {
1443
1612
  let keys = Object.keys(inconsistent_records_info);
1444
1613
  let entries = [];
@@ -1468,16 +1637,22 @@ class RBQLInputIterator {
1468
1637
  async get_record() {
1469
1638
  throw new Error("Unable to call the interface method");
1470
1639
  }
1640
+ handle_query_modifier() {
1641
+ return; // Reimplement if you need to handle a boolean query modifier that can be used like this: `SELECT * WITH (modifiername)`
1642
+ }
1471
1643
  get_warnings() {
1472
1644
  return []; // Reimplement if your class can produce warnings
1473
1645
  }
1646
+ async get_header() {
1647
+ return null; // Reimplement if your class can provide input header
1648
+ }
1474
1649
  }
1475
1650
 
1476
1651
 
1477
1652
  class RBQLOutputWriter {
1478
1653
  constructor(){}
1479
1654
 
1480
- write(fields) {
1655
+ async write(fields) {
1481
1656
  throw new Error("Unable to call the interface method");
1482
1657
  }
1483
1658
 
@@ -1488,6 +1663,10 @@ class RBQLOutputWriter {
1488
1663
  get_warnings() {
1489
1664
  return []; // Reimplement if your class can produce warnings
1490
1665
  };
1666
+
1667
+ set_header() {
1668
+ return; // Reimplement if your class can handle output headers in a meaningful way
1669
+ }
1491
1670
  }
1492
1671
 
1493
1672
 
@@ -1558,6 +1737,10 @@ class TableIterator extends RBQLInputIterator {
1558
1737
  return [make_inconsistent_num_fields_warning('input', this.fields_info)];
1559
1738
  return [];
1560
1739
  };
1740
+
1741
+ async get_header() {
1742
+ return this.column_names;
1743
+ }
1561
1744
  }
1562
1745
 
1563
1746
 
@@ -1565,12 +1748,17 @@ class TableWriter extends RBQLOutputWriter {
1565
1748
  constructor(external_table) {
1566
1749
  super();
1567
1750
  this.table = external_table;
1751
+ this.header = null;
1568
1752
  }
1569
1753
 
1570
- write(fields) {
1754
+ async write(fields) {
1571
1755
  this.table.push(fields);
1572
1756
  return true;
1573
1757
  };
1758
+
1759
+ set_header(header) {
1760
+ this.header = header;
1761
+ }
1574
1762
  }
1575
1763
 
1576
1764
 
@@ -1595,9 +1783,12 @@ async function shallow_parse_input_query(query_text, input_iterator, join_tables
1595
1783
  query_text = cleanup_query(query_text);
1596
1784
  var [format_expression, string_literals] = separate_string_literals(query_text);
1597
1785
  format_expression = remove_redundant_table_name(format_expression);
1598
- var input_variables_map = await input_iterator.get_variables_map(query_text);
1599
1786
 
1600
1787
  var rb_actions = separate_actions(format_expression);
1788
+ if (rb_actions.hasOwnProperty(WITH)) {
1789
+ input_iterator.handle_query_modifier(rb_actions[WITH]);
1790
+ }
1791
+ var input_variables_map = await input_iterator.get_variables_map(query_text);
1601
1792
 
1602
1793
  if (rb_actions.hasOwnProperty(ORDER_BY) && rb_actions.hasOwnProperty(UPDATE))
1603
1794
  throw new RbqlParsingError('"ORDER BY" is not allowed in "UPDATE" queries');
@@ -1609,6 +1800,7 @@ async function shallow_parse_input_query(query_text, input_iterator, join_tables
1609
1800
  }
1610
1801
 
1611
1802
  let join_variables_map = null;
1803
+ let join_header = null;
1612
1804
  if (rb_actions.hasOwnProperty(JOIN)) {
1613
1805
  var [rhs_table_id, variable_pairs] = parse_join_expression(rb_actions[JOIN]['text']);
1614
1806
  if (join_tables_registry === null)
@@ -1616,7 +1808,11 @@ async function shallow_parse_input_query(query_text, input_iterator, join_tables
1616
1808
  let join_record_iterator = join_tables_registry.get_iterator_by_table_id(rhs_table_id);
1617
1809
  if (!join_record_iterator)
1618
1810
  throw new RbqlParsingError(`Unable to find join table: "${rhs_table_id}"`);
1811
+ if (rb_actions.hasOwnProperty(WITH)) {
1812
+ join_record_iterator.handle_query_modifier(rb_actions[WITH]);
1813
+ }
1619
1814
  join_variables_map = await join_record_iterator.get_variables_map(query_text);
1815
+ join_header = await join_record_iterator.get_header();
1620
1816
  let [lhs_variables, rhs_indices] = resolve_join_variables(input_variables_map, join_variables_map, variable_pairs, string_literals);
1621
1817
  let sql_join_type = {'JOIN': InnerJoiner, 'INNER JOIN': InnerJoiner, 'LEFT JOIN': LeftJoiner, 'LEFT OUTER JOIN': LeftJoiner, 'STRICT LEFT JOIN': StrictLeftJoiner}[rb_actions[JOIN]['join_subtype']];
1622
1818
  query_context.lhs_join_var_expression = lhs_variables.length == 1 ? lhs_variables[0] : 'JSON.stringify([' + lhs_variables.join(',') + '])';
@@ -1634,26 +1830,33 @@ async function shallow_parse_input_query(query_text, input_iterator, join_tables
1634
1830
  query_context.where_expression = combine_string_literals(where_expression, string_literals);
1635
1831
  }
1636
1832
 
1833
+ let input_header = await input_iterator.get_header();
1637
1834
  if (rb_actions.hasOwnProperty(UPDATE)) {
1638
1835
  var update_expression = translate_update_expression(rb_actions[UPDATE]['text'], input_variables_map, string_literals, ' '.repeat(8));
1639
1836
  query_context.update_expressions = combine_string_literals(update_expression, string_literals);
1837
+ query_context.writer.set_header(input_header);
1640
1838
  }
1641
1839
 
1642
1840
  if (rb_actions.hasOwnProperty(SELECT)) {
1643
1841
  query_context.top_count = find_top(rb_actions);
1644
- query_context.writer = new TopWriter(query_context.writer, query_context.top_count);
1842
+ if (rb_actions.hasOwnProperty(EXCEPT)) {
1843
+ let [output_header, select_expression] = translate_except_expression(rb_actions[EXCEPT]['text'], input_variables_map, string_literals, input_header);
1844
+ query_context.select_expression = select_expression;
1845
+ query_context.writer.set_header(output_header);
1846
+ } else {
1847
+ let [select_expression, select_expression_for_ast] = translate_select_expression(rb_actions[SELECT]['text']);
1848
+ query_context.select_expression = combine_string_literals(select_expression, string_literals);
1849
+ let column_infos = adhoc_parse_select_expression_to_column_infos(select_expression_for_ast, string_literals);
1850
+ let output_header = select_output_header(input_header, join_header, column_infos);
1851
+ query_context.writer.set_header(output_header);
1852
+ }
1645
1853
 
1854
+ query_context.writer = new TopWriter(query_context.writer, query_context.top_count);
1646
1855
  if (rb_actions[SELECT].hasOwnProperty('distinct_count')) {
1647
1856
  query_context.writer = new UniqCountWriter(query_context.writer);
1648
1857
  } else if (rb_actions[SELECT].hasOwnProperty('distinct')) {
1649
1858
  query_context.writer = new UniqWriter(query_context.writer);
1650
1859
  }
1651
- if (rb_actions.hasOwnProperty(EXCEPT)) {
1652
- query_context.select_expression = translate_except_expression(rb_actions[EXCEPT]['text'], input_variables_map, string_literals);
1653
- } else {
1654
- let select_expression = translate_select_expression(rb_actions[SELECT]['text']);
1655
- query_context.select_expression = combine_string_literals(select_expression, string_literals);
1656
- }
1657
1860
  }
1658
1861
 
1659
1862
  if (rb_actions.hasOwnProperty(ORDER_BY)) {
@@ -1676,13 +1879,21 @@ async function query(query_text, input_iterator, output_writer, output_warnings,
1676
1879
  }
1677
1880
 
1678
1881
 
1679
- async function query_table(query_text, input_table, output_table, output_warnings, join_table=null, input_column_names=null, join_column_names=null, normalize_column_names=true, user_init_code='') {
1882
+ async function query_table(query_text, input_table, output_table, output_warnings, join_table=null, input_column_names=null, join_column_names=null, output_column_names=null, normalize_column_names=true, user_init_code='') {
1680
1883
  if (!normalize_column_names && input_column_names !== null && join_column_names !== null)
1681
1884
  ensure_no_ambiguous_variables(query_text, input_column_names, join_column_names);
1682
1885
  let input_iterator = new TableIterator(input_table, input_column_names, normalize_column_names);
1683
1886
  let output_writer = new TableWriter(output_table);
1684
1887
  let join_tables_registry = join_table === null ? null : new SingleTableRegistry(join_table, join_column_names, normalize_column_names);
1685
1888
  await query(query_text, input_iterator, output_writer, output_warnings, join_tables_registry, user_init_code);
1889
+ if (output_column_names !== null) {
1890
+ assert(output_column_names.length == 0, '`output_column_names` param must be an empty list or null');
1891
+ if (output_writer.header !== null) {
1892
+ for (let column_name of output_writer.header) {
1893
+ output_column_names.push(column_name);
1894
+ }
1895
+ }
1896
+ }
1686
1897
  }
1687
1898
 
1688
1899
 
@@ -1716,6 +1927,7 @@ exports.exception_to_error_info = exception_to_error_info;
1716
1927
 
1717
1928
 
1718
1929
  // The functions below are exported just for unit tests, they are not part of the rbql API
1930
+ // TODO exports through the special unit_test proxy e.g. exports.unit_test.parse_basic_variables = parse_basic_variables;
1719
1931
  exports.parse_basic_variables = parse_basic_variables;
1720
1932
  exports.parse_array_variables = parse_array_variables;
1721
1933
  exports.parse_dictionary_variables = parse_dictionary_variables;
@@ -1725,11 +1937,15 @@ exports.strip_comments = strip_comments;
1725
1937
  exports.separate_actions = separate_actions;
1726
1938
  exports.separate_string_literals = separate_string_literals;
1727
1939
  exports.combine_string_literals = combine_string_literals;
1728
- exports.translate_except_expression = translate_except_expression;
1729
1940
  exports.parse_join_expression = parse_join_expression;
1730
1941
  exports.resolve_join_variables = resolve_join_variables;
1731
1942
  exports.translate_update_expression = translate_update_expression;
1732
1943
  exports.translate_select_expression = translate_select_expression;
1944
+ exports.translate_except_expression = translate_except_expression;
1733
1945
  exports.like_to_regex = like_to_regex;
1946
+ exports.adhoc_parse_select_expression_to_column_infos = adhoc_parse_select_expression_to_column_infos;
1947
+ exports.replace_star_count = replace_star_count;
1948
+ exports.replace_star_vars_for_header_parsing = replace_star_vars_for_header_parsing;
1949
+ exports.select_output_header = select_output_header;
1734
1950
 
1735
1951
  }(typeof exports === 'undefined' ? this.rbql = {} : exports));
package/rbql_csv.js CHANGED
@@ -117,11 +117,18 @@ function get_index_record(index_path, key) {
117
117
  }
118
118
 
119
119
 
120
- function find_table_path(table_id) {
120
+ function find_table_path(main_table_dir, table_id) {
121
+ // If table_id is a relative path it could be relative either to the current directory or to the main table dir.
121
122
  var candidate_path = expanduser(table_id);
122
123
  if (fs.existsSync(candidate_path)) {
123
124
  return candidate_path;
124
125
  }
126
+ if (main_table_dir && !path.isAbsolute(candidate_path)) {
127
+ candidate_path = path.join(main_table_dir, candidate_path);
128
+ if (fs.existsSync(candidate_path)) {
129
+ return candidate_path;
130
+ }
131
+ }
125
132
  let table_names_settings_path = path.join(os.homedir(), '.rbql_table_names');
126
133
  var name_record = get_index_record(table_names_settings_path, table_id);
127
134
  if (name_record && name_record.length > 1 && fs.existsSync(name_record[1])) {
@@ -152,10 +159,6 @@ class RecordQueue {
152
159
  }
153
160
  return this.pull_stack.pop();
154
161
  }
155
-
156
- return_to_pull_stack(record) {
157
- this.pull_stack.push(record);
158
- }
159
162
  }
160
163
 
161
164
 
@@ -163,7 +166,7 @@ class CSVRecordIterator extends rbql.RBQLInputIterator {
163
166
  // CSVRecordIterator implements a typical async producer-consumer model with an internal buffer:
164
167
  // get_record() - consumer
165
168
  // stream.on('data') - producer
166
- constructor(stream, csv_path, encoding, delim, policy, skip_headers=false, comment_prefix=null, table_name='input', variable_prefix='a') {
169
+ constructor(stream, csv_path, encoding, delim, policy, has_header=false, comment_prefix=null, table_name='input', variable_prefix='a') {
167
170
  super();
168
171
  this.stream = stream;
169
172
  this.csv_path = csv_path;
@@ -171,7 +174,12 @@ class CSVRecordIterator extends rbql.RBQLInputIterator {
171
174
  this.encoding = encoding;
172
175
  this.delim = delim;
173
176
  this.policy = policy;
174
- this.skip_headers = skip_headers;
177
+
178
+ this.has_header = has_header;
179
+ this.first_record = null;
180
+ this.first_record_should_be_emitted = !has_header;
181
+ this.header_preread_complete = false;
182
+
175
183
  this.table_name = table_name;
176
184
  this.variable_prefix = variable_prefix;
177
185
  this.comment_prefix = (comment_prefix !== null && comment_prefix.length) ? comment_prefix : null;
@@ -203,9 +211,13 @@ class CSVRecordIterator extends rbql.RBQLInputIterator {
203
211
  this.rfc_line_buffer = [];
204
212
 
205
213
  this.partially_decoded_line = '';
214
+ this.partially_decoded_line_ends_with_cr = false;
206
215
 
216
+ // Holds an external "resolve" function which is called when everything is fine.
207
217
  this.resolve_current_record = null;
218
+ // Holds an external "reject" function which is called when error has occured.
208
219
  this.reject_current_record = null;
220
+ // Holds last exception if we don't have any reject callbacks from clients yet.
209
221
  this.current_exception = null;
210
222
 
211
223
  this.produced_records_queue = new RecordQueue();
@@ -213,27 +225,56 @@ class CSVRecordIterator extends rbql.RBQLInputIterator {
213
225
  this.process_line_polymorphic = policy == 'quoted_rfc' ? this.process_partial_rfc_record_line : this.process_record_line;
214
226
  }
215
227
 
216
- handle_exception(exception) {
217
- if (this.reject_current_record) {
228
+
229
+ handle_query_modifier(modifier) {
230
+ // For `... WITH (header) ...` syntax
231
+ if (['header', 'headers'].indexOf(modifier) != -1) {
232
+ this.has_header = true;
233
+ this.first_record_should_be_emitted = false;
234
+ }
235
+ if (['noheader', 'noheaders'].indexOf(modifier) != -1) {
236
+ this.has_header = false;
237
+ this.first_record_should_be_emitted = true;
238
+ }
239
+ }
240
+
241
+
242
+ reset_external_callbacks() {
243
+ // Drop external callbacks simultaneously since promises can only resolve once, see: https://stackoverflow.com/a/18218542/2898283
244
+ this.reject_current_record = null;
245
+ this.resolve_current_record = null;
246
+ }
247
+
248
+ try_propagate_exception() {
249
+ if (this.current_exception && this.reject_current_record) {
218
250
  let reject = this.reject_current_record;
219
- this.reject_current_record = null;
220
- this.resolve_current_record = null;
251
+ let exception = this.current_exception;
252
+ this.reset_external_callbacks();
253
+ this.current_exception = null;
221
254
  reject(exception);
222
- } else {
223
- this.current_exception = exception;
224
255
  }
256
+ }
257
+
225
258
 
259
+ store_or_propagate_exception(exception) {
260
+ if (this.current_exception === null)
261
+ // Ignore subsequent exceptions if we already have an unreported error. This way we prioritize earlier errors over the more recent ones.
262
+ this.current_exception = exception;
263
+ this.try_propagate_exception();
226
264
  }
227
265
 
228
- async preread_header() {
229
- let header_record = await this.get_record();
230
- if (header_record === null)
231
- return null;
232
- if (!this.skip_headers)
233
- this.produced_records_queue.return_to_pull_stack(header_record);
266
+
267
+ async preread_first_record() {
268
+ if (this.header_preread_complete)
269
+ return;
270
+ this.first_record = await this.get_record();
271
+ this.header_preread_complete = true; // We must set header_preread_complete to true after calling get_record(), because get_record() uses it internally.
272
+ if (this.first_record === null) {
273
+ return;
274
+ }
234
275
  if (this.stream)
235
276
  this.stream.pause();
236
- return header_record.slice();
277
+ this.first_record = this.first_record.slice();
237
278
  };
238
279
 
239
280
 
@@ -242,24 +283,37 @@ class CSVRecordIterator extends rbql.RBQLInputIterator {
242
283
  rbql.parse_basic_variables(query_text, this.variable_prefix, variable_map);
243
284
  rbql.parse_array_variables(query_text, this.variable_prefix, variable_map);
244
285
 
245
- let header_record = await this.preread_header(); // TODO optimize: do not start the stream if query_text doesn't seem to have dictionary or attribute -looking patterns
246
- if (header_record) {
247
- rbql.parse_attribute_variables(query_text, this.variable_prefix, header_record, 'CSV header line', variable_map);
248
- rbql.parse_dictionary_variables(query_text, this.variable_prefix, header_record, variable_map);
286
+ await this.preread_first_record();
287
+ if (this.has_header && this.first_record) {
288
+ rbql.parse_attribute_variables(query_text, this.variable_prefix, this.first_record, 'CSV header line', variable_map);
289
+ rbql.parse_dictionary_variables(query_text, this.variable_prefix, this.first_record, variable_map);
249
290
  }
250
291
  return variable_map;
251
292
  };
252
293
 
294
+ async get_header() {
295
+ await this.preread_first_record();
296
+ return this.has_header ? this.first_record : null;
297
+ }
298
+
253
299
 
254
300
  try_resolve_next_record() {
301
+ this.try_propagate_exception();
255
302
  if (this.resolve_current_record === null)
256
303
  return;
257
- let record = this.produced_records_queue.dequeue();
304
+
305
+ let record = null;
306
+ if (this.first_record_should_be_emitted && this.header_preread_complete) {
307
+ this.first_record_should_be_emitted = false;
308
+ record = this.first_record;
309
+ } else {
310
+ record = this.produced_records_queue.dequeue();
311
+ }
312
+
258
313
  if (record === null && !this.input_exhausted)
259
314
  return;
260
315
  let resolve = this.resolve_current_record;
261
- this.resolve_current_record = null;
262
- this.reject_current_record = null;
316
+ this.reset_external_callbacks();
263
317
  resolve(record);
264
318
  };
265
319
 
@@ -275,9 +329,6 @@ class CSVRecordIterator extends rbql.RBQLInputIterator {
275
329
  parent_iterator.resolve_current_record = resolve;
276
330
  parent_iterator.reject_current_record = reject;
277
331
  });
278
- if (this.current_exception) {
279
- this.reject_current_record(this.current_exception);
280
- }
281
332
  this.try_resolve_next_record();
282
333
  return current_record_promise;
283
334
  };
@@ -308,7 +359,7 @@ class CSVRecordIterator extends rbql.RBQLInputIterator {
308
359
  if (this.first_defective_line === null) {
309
360
  this.first_defective_line = this.NL;
310
361
  if (this.policy == 'quoted_rfc')
311
- this.handle_exception(new RbqlIOHandlingError(`Inconsistent double quote escaping in ${this.table_name} table at record ${this.NR}, line ${this.NL}`));
362
+ this.store_or_propagate_exception(new RbqlIOHandlingError(`Inconsistent double quote escaping in ${this.table_name} table at record ${this.NR}, line ${this.NL}`));
312
363
  }
313
364
  }
314
365
  let num_fields = record.length;
@@ -359,19 +410,23 @@ class CSVRecordIterator extends rbql.RBQLInputIterator {
359
410
  decoded_string = this.decoder.decode(data_chunk);
360
411
  } catch (e) {
361
412
  if (e instanceof TypeError) {
362
- this.handle_exception(new RbqlIOHandlingError(utf_decoding_error));
413
+ this.store_or_propagate_exception(new RbqlIOHandlingError(utf_decoding_error));
363
414
  } else {
364
- this.handle_exception(e);
415
+ this.store_or_propagate_exception(e);
365
416
  }
366
417
  return;
367
418
  }
368
419
  } else {
369
420
  decoded_string = data_chunk.toString(this.encoding);
370
421
  }
422
+ let line_starts_with_lf = decoded_string.length && decoded_string[0] == '\n';
423
+ let first_line_index = line_starts_with_lf && this.partially_decoded_line_ends_with_cr ? 1 : 0;
424
+ this.partially_decoded_line_ends_with_cr = decoded_string.length && decoded_string[decoded_string.length - 1] == '\r';
371
425
  let lines = csv_utils.split_lines(decoded_string);
372
426
  lines[0] = this.partially_decoded_line + lines[0];
427
+ assert(first_line_index == 0 || lines[0].length == 0);
373
428
  this.partially_decoded_line = lines.pop();
374
- for (let i = 0; i < lines.length; i++) {
429
+ for (let i = first_line_index; i < lines.length; i++) {
375
430
  this.process_line(lines[i]);
376
431
  }
377
432
  };
@@ -384,7 +439,7 @@ class CSVRecordIterator extends rbql.RBQLInputIterator {
384
439
  // TODO get rid of this once TextDecoder is really fixed or when alternative method of reliable decoding appears
385
440
  let control_buffer = Buffer.from(decoded_string, 'utf-8');
386
441
  if (Buffer.compare(data_chunk, control_buffer) != 0) {
387
- this.handle_exception(new RbqlIOHandlingError(utf_decoding_error));
442
+ this.store_or_propagate_exception(new RbqlIOHandlingError(utf_decoding_error));
388
443
  return;
389
444
  }
390
445
  }
@@ -465,6 +520,7 @@ class CSVWriter extends rbql.RBQLOutputWriter {
465
520
  this.encoding = encoding;
466
521
  if (encoding)
467
522
  this.stream.setDefaultEncoding(encoding);
523
+ this.stream.on('error', (error_obj) => { this.store_first_error(error_obj); })
468
524
  this.delim = delim;
469
525
  this.policy = policy;
470
526
  this.line_separator = line_separator;
@@ -474,6 +530,8 @@ class CSVWriter extends rbql.RBQLOutputWriter {
474
530
 
475
531
  this.null_in_output = false;
476
532
  this.delim_in_simple_output = false;
533
+ this.header_len = null;
534
+ this.first_error = null;
477
535
 
478
536
  if (policy == 'simple') {
479
537
  this.polymorphic_join = this.simple_join;
@@ -491,6 +549,20 @@ class CSVWriter extends rbql.RBQLOutputWriter {
491
549
  }
492
550
 
493
551
 
552
+ store_first_error(error_obj) {
553
+ // Store only first error because it is typically more important than the subsequent ones.
554
+ if (this.first_error === null)
555
+ this.first_error = error_obj;
556
+ }
557
+
558
+ set_header(header) {
559
+ if (header !== null) {
560
+ this.header_len = header.length;
561
+ this.write(header);
562
+ }
563
+ }
564
+
565
+
494
566
  quoted_join(fields) {
495
567
  let delim = this.delim;
496
568
  var quoted_fields = fields.map(function(v) { return csv_utils.quote_field(String(v), delim); });
@@ -535,11 +607,20 @@ class CSVWriter extends rbql.RBQLOutputWriter {
535
607
  };
536
608
 
537
609
 
538
- write(fields) {
610
+ async write(fields) {
611
+ if (this.header_len !== null && fields.length != this.header_len)
612
+ throw new RbqlIOHandlingError(`Inconsistent number of columns in output header and the current record: ${this.header_len} != ${fields.length}`);
539
613
  this.normalize_fields(fields);
540
614
  this.stream.write(this.polymorphic_join(fields));
541
615
  this.stream.write(this.line_separator);
542
- return true;
616
+ let writer_error = this.first_error;
617
+ return new Promise(function(resolve, reject) {
618
+ if (writer_error !== null) {
619
+ reject(writer_error);
620
+ } else {
621
+ resolve(true);
622
+ }
623
+ });
543
624
  };
544
625
 
545
626
 
@@ -554,7 +635,11 @@ class CSVWriter extends rbql.RBQLOutputWriter {
554
635
  let close_stream_on_finish = this.close_stream_on_finish;
555
636
  let output_stream = this.stream;
556
637
  let output_encoding = this.encoding;
638
+ let writer_error = this.first_error;
557
639
  let finish_promise = new Promise(function(resolve, reject) {
640
+ if (writer_error !== null) {
641
+ reject(writer_error);
642
+ }
558
643
  if (close_stream_on_finish) {
559
644
  output_stream.end('', output_encoding, () => { resolve(); });
560
645
  } else {
@@ -578,12 +663,13 @@ class CSVWriter extends rbql.RBQLOutputWriter {
578
663
 
579
664
 
580
665
  class FileSystemCSVRegistry extends rbql.RBQLTableRegistry {
581
- constructor(delim, policy, encoding, skip_headers=false, comment_prefix=null, options=null) {
666
+ constructor(input_file_dir, delim, policy, encoding, has_header=false, comment_prefix=null, options=null) {
582
667
  super();
668
+ this.input_file_dir = input_file_dir;
583
669
  this.delim = delim;
584
670
  this.policy = policy;
585
671
  this.encoding = encoding;
586
- this.skip_headers = skip_headers;
672
+ this.has_header = has_header;
587
673
  this.comment_prefix = comment_prefix;
588
674
  this.stream = null;
589
675
  this.record_iterator = null;
@@ -594,7 +680,7 @@ class FileSystemCSVRegistry extends rbql.RBQLTableRegistry {
594
680
  }
595
681
 
596
682
  get_iterator_by_table_id(table_id) {
597
- this.table_path = find_table_path(table_id);
683
+ this.table_path = find_table_path(this.input_file_dir, table_id);
598
684
  if (this.table_path === null) {
599
685
  throw new RbqlIOHandlingError(`Unable to find join table "${table_id}"`);
600
686
  }
@@ -603,19 +689,19 @@ class FileSystemCSVRegistry extends rbql.RBQLTableRegistry {
603
689
  } else {
604
690
  this.stream = fs.createReadStream(this.table_path);
605
691
  }
606
- this.record_iterator = new CSVRecordIterator(this.stream, this.bulk_input_path, this.encoding, this.delim, this.policy, this.skip_headers, this.comment_prefix, table_id, 'b');
692
+ this.record_iterator = new CSVRecordIterator(this.stream, this.bulk_input_path, this.encoding, this.delim, this.policy, this.has_header, this.comment_prefix, table_id, 'b');
607
693
  return this.record_iterator;
608
694
  };
609
695
 
610
696
  get_warnings(output_warnings) {
611
- if (this.record_iterator && this.skip_headers) {
612
- output_warnings.push(`The first (header) record was also skipped in the JOIN file: ${path.basename(this.table_path)}`);
697
+ if (this.record_iterator && this.has_header) {
698
+ output_warnings.push(`The first record in JOIN file ${path.basename(this.table_path)} was also treated as header (and skipped)`);
613
699
  }
614
700
  }
615
701
  }
616
702
 
617
703
 
618
- async function query_csv(query_text, input_path, input_delim, input_policy, output_path, output_delim, output_policy, csv_encoding, output_warnings, skip_headers=false, comment_prefix=null, user_init_code='', options=null) {
704
+ async function query_csv(query_text, input_path, input_delim, input_policy, output_path, output_delim, output_policy, csv_encoding, output_warnings, with_headers=false, comment_prefix=null, user_init_code='', options=null) {
619
705
  let input_stream = null;
620
706
  let bulk_input_path = null;
621
707
  if (options && options['bulk_read'] && input_path) {
@@ -624,6 +710,7 @@ async function query_csv(query_text, input_path, input_delim, input_policy, outp
624
710
  input_stream = input_path === null ? process.stdin : fs.createReadStream(input_path);
625
711
  }
626
712
  let [output_stream, close_output_on_finish] = output_path === null ? [process.stdout, false] : [fs.createWriteStream(output_path), true];
713
+ // FIXME add on(error) handler to avoid async errors, see https://github.com/nodejs/node-v0.x-archive/issues/406
627
714
  if (input_delim == '"' && input_policy == 'quoted')
628
715
  throw new RbqlIOHandlingError('Double quote delimiter is incompatible with "quoted" policy');
629
716
  if (csv_encoding == 'latin-1')
@@ -637,9 +724,9 @@ async function query_csv(query_text, input_path, input_delim, input_policy, outp
637
724
  if (user_init_code == '' && fs.existsSync(default_init_source_path)) {
638
725
  user_init_code = read_user_init_code(default_init_source_path);
639
726
  }
640
-
641
- let join_tables_registry = new FileSystemCSVRegistry(input_delim, input_policy, csv_encoding, skip_headers, comment_prefix, options);
642
- let input_iterator = new CSVRecordIterator(input_stream, bulk_input_path, csv_encoding, input_delim, input_policy, skip_headers, comment_prefix);
727
+ let input_file_dir = input_path ? path.dirname(input_path) : null;
728
+ let join_tables_registry = new FileSystemCSVRegistry(input_file_dir, input_delim, input_policy, csv_encoding, with_headers, comment_prefix, options);
729
+ let input_iterator = new CSVRecordIterator(input_stream, bulk_input_path, csv_encoding, input_delim, input_policy, with_headers, comment_prefix);
643
730
  let output_writer = new CSVWriter(output_stream, close_output_on_finish, csv_encoding, output_delim, output_policy);
644
731
 
645
732
  await rbql.query(query_text, input_iterator, output_writer, output_warnings, join_tables_registry, user_init_code);