rbql 0.25.0 → 0.26.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.eslintrc.json +26 -0
- package/README.md +13 -11
- package/cli_parser.js +0 -0
- package/csv_utils.js +37 -0
- package/index.js +0 -0
- package/package.json +1 -1
- package/rbql.js +60 -34
- package/rbql_csv.js +32 -47
package/.eslintrc.json
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
{
|
|
2
|
+
"env": {
|
|
3
|
+
"browser": false,
|
|
4
|
+
"commonjs": true,
|
|
5
|
+
"es6": true,
|
|
6
|
+
"node": true
|
|
7
|
+
},
|
|
8
|
+
"parserOptions": {
|
|
9
|
+
"ecmaFeatures": {
|
|
10
|
+
"jsx": true
|
|
11
|
+
},
|
|
12
|
+
"sourceType": "module",
|
|
13
|
+
"ecmaVersion": 2018
|
|
14
|
+
},
|
|
15
|
+
"rules": {
|
|
16
|
+
"no-const-assign": "warn",
|
|
17
|
+
"no-this-before-super": "warn",
|
|
18
|
+
"no-undef": "warn",
|
|
19
|
+
"semi": [2, "always"],
|
|
20
|
+
"no-unreachable": "warn",
|
|
21
|
+
"no-unused-vars": "warn",
|
|
22
|
+
"constructor-super": "warn",
|
|
23
|
+
"no-trailing-spaces": "error",
|
|
24
|
+
"valid-typeof": "warn"
|
|
25
|
+
}
|
|
26
|
+
}
|
package/README.md
CHANGED
|
@@ -196,7 +196,7 @@ rbql.query_table(user_query, input_table, output_table, warnings).then(success_h
|
|
|
196
196
|
|
|
197
197
|
#### Example of query_csv() usage:
|
|
198
198
|
```
|
|
199
|
-
const
|
|
199
|
+
const rbql = require('rbql');
|
|
200
200
|
let user_query = 'SELECT a1, parseInt(a2) % 1000 WHERE a3 != "USA" LIMIT 5';
|
|
201
201
|
let error_handler = function(exception) {
|
|
202
202
|
console.log('Error: ' + String(exception));
|
|
@@ -207,7 +207,7 @@ let success_handler = function() {
|
|
|
207
207
|
console.log('warnings: ' + JSON.stringify(warnings));
|
|
208
208
|
console.log('output table: output.csv');
|
|
209
209
|
}
|
|
210
|
-
|
|
210
|
+
rbql.query_csv(user_query, 'input.csv', ',', 'quoted', 'output.csv', ',', 'quoted', 'utf-8', warnings).then(success_handler).catch(error_handler);
|
|
211
211
|
```
|
|
212
212
|
|
|
213
213
|
|
|
@@ -267,6 +267,7 @@ $ rbql-js --input input.csv --output result.csv
|
|
|
267
267
|
* GROUP BY
|
|
268
268
|
* TOP _N_
|
|
269
269
|
* LIMIT _N_
|
|
270
|
+
* AS
|
|
270
271
|
|
|
271
272
|
All keywords have the same meaning as in SQL queries. You can check them [online](https://www.w3schools.com/sql/default.asp)
|
|
272
273
|
|
|
@@ -306,7 +307,7 @@ _COUNT_, _ARRAY_AGG_, _MIN_, _MAX_, _SUM_, _AVG_, _VARIANCE_, _MEDIAN_
|
|
|
306
307
|
Limitation: aggregate functions inside JavaScript expressions are not supported. Although you can use expressions inside aggregate functions.
|
|
307
308
|
E.g. `MAX(float(a1) / 1000)` - valid; `MAX(a1) / 1000` - invalid.
|
|
308
309
|
There is a workaround for the limitation above for _ARRAY_AGG_ function which supports an optional parameter - a callback function that can do something with the aggregated array. Example:
|
|
309
|
-
`
|
|
310
|
+
`SELECT a2, ARRAY_AGG(a1, v => v.sort().slice(0, 5)) GROUP BY a2`
|
|
310
311
|
|
|
311
312
|
|
|
312
313
|
### JOIN statements
|
|
@@ -346,14 +347,15 @@ You can define custom functions and/or import libraries in a special file: `~/.r
|
|
|
346
347
|
|
|
347
348
|
## Examples of RBQL queries
|
|
348
349
|
|
|
349
|
-
* `
|
|
350
|
-
* `
|
|
351
|
-
* `
|
|
352
|
-
* `
|
|
353
|
-
* `
|
|
354
|
-
* `
|
|
355
|
-
* `
|
|
356
|
-
* `
|
|
350
|
+
* `SELECT TOP 100 a1, a2 * 10, a4.length WHERE a1 == "Buy" ORDER BY parseInt(a2) DESC`
|
|
351
|
+
* `SELECT a.id, a.weight / 1000 AS weight_kg`
|
|
352
|
+
* `SELECT * ORDER BY Math.random()` - random sort
|
|
353
|
+
* `SELECT TOP 20 a.vehicle_price.length / 10, a2 WHERE parseInt(a.vehicle_price) < 500 && ["car", "plane", "boat"].indexOf(a['Vehicle type']) > -1 limit 20` - referencing columns by names from header
|
|
354
|
+
* `UPDATE SET a3 = 'NPC' WHERE a3.indexOf('Non-playable character') != -1`
|
|
355
|
+
* `SELECT NR, *` - enumerate records, NR is 1-based
|
|
356
|
+
* `SELECT a1, b1, b2 INNER JOIN ./countries.txt ON a2 == b1 ORDER BY a1, a3` - example of join query
|
|
357
|
+
* `SELECT MAX(a1), MIN(a1) WHERE a.Name != 'John' GROUP BY a2, a3` - example of aggregate query
|
|
358
|
+
* `SELECT ...a1.split(':')` - Using JS "destructuring assignment" syntax to split one column into many. Do not try this with other SQL engines!
|
|
357
359
|
|
|
358
360
|
|
|
359
361
|
### References
|
package/cli_parser.js
CHANGED
|
File without changes
|
package/csv_utils.js
CHANGED
|
@@ -39,6 +39,7 @@ function extract_next_field(src, dlm, preserve_quotes_and_whitespaces, allow_ext
|
|
|
39
39
|
|
|
40
40
|
|
|
41
41
|
function split_quoted_str(src, dlm, preserve_quotes_and_whitespaces=false) {
|
|
42
|
+
// This function is newline-agnostic i.e. it can also split records with multiline fields.
|
|
42
43
|
if (src.indexOf('"') == -1) // Optimization for most common case
|
|
43
44
|
return [src.split(dlm), false];
|
|
44
45
|
var result = [];
|
|
@@ -116,6 +117,41 @@ function smart_split(src, dlm, policy, preserve_quotes_and_whitespaces) {
|
|
|
116
117
|
}
|
|
117
118
|
|
|
118
119
|
|
|
120
|
+
class MultilineRecordAggregator {
|
|
121
|
+
constructor(comment_prefix) {
|
|
122
|
+
this.comment_prefix = comment_prefix;
|
|
123
|
+
this.reset();
|
|
124
|
+
}
|
|
125
|
+
add_line(line_text) {
|
|
126
|
+
if (this.has_full_record || this.has_comment_line) {
|
|
127
|
+
throw new Error('Invalid usage - record aggregator must be reset before adding new lines');
|
|
128
|
+
}
|
|
129
|
+
if (this.comment_prefix && this.rfc_line_buffer.length == 0 && line_text.startsWith(this.comment_prefix)) {
|
|
130
|
+
this.has_comment_line = true;
|
|
131
|
+
return false;
|
|
132
|
+
}
|
|
133
|
+
let match_list = line_text.match(/"/g);
|
|
134
|
+
let has_unbalanced_double_quote = match_list && match_list.length % 2 == 1;
|
|
135
|
+
this.rfc_line_buffer.push(line_text);
|
|
136
|
+
this.has_full_record = (!has_unbalanced_double_quote && this.rfc_line_buffer.length == 1) || (has_unbalanced_double_quote && this.rfc_line_buffer.length > 1);
|
|
137
|
+
return this.has_full_record;
|
|
138
|
+
}
|
|
139
|
+
is_inside_multiline_record() {
|
|
140
|
+
return this.rfc_line_buffer.length && !this.has_full_record;
|
|
141
|
+
}
|
|
142
|
+
get_full_line(line_separator) {
|
|
143
|
+
return this.rfc_line_buffer.join(line_separator);
|
|
144
|
+
}
|
|
145
|
+
get_num_lines_in_record() {
|
|
146
|
+
return this.rfc_line_buffer.length;
|
|
147
|
+
}
|
|
148
|
+
reset() {
|
|
149
|
+
this.rfc_line_buffer = [];
|
|
150
|
+
this.has_full_record = false;
|
|
151
|
+
this.has_comment_line = false;
|
|
152
|
+
}
|
|
153
|
+
}
|
|
154
|
+
|
|
119
155
|
|
|
120
156
|
module.exports.split_quoted_str = split_quoted_str;
|
|
121
157
|
module.exports.split_whitespace_separated_str = split_whitespace_separated_str;
|
|
@@ -125,3 +161,4 @@ module.exports.rfc_quote_field = rfc_quote_field;
|
|
|
125
161
|
module.exports.unquote_field = unquote_field;
|
|
126
162
|
module.exports.unquote_fields = unquote_fields;
|
|
127
163
|
module.exports.split_lines = split_lines;
|
|
164
|
+
module.exports.MultilineRecordAggregator = MultilineRecordAggregator;
|
package/index.js
CHANGED
|
File without changes
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "rbql",
|
|
3
|
-
"version": "0.25.0",
|
|
3
|
+
"version": "0.26.0",
|
|
4
4
|
"description": "Rainbow Query Language",
|
|
5
5
|
"keywords": ["CSV", "TSV", "spreadsheet", "SQL", "SQL-like", "transpiler", "CLI", "command-line", "library", "browser", "Node", "select", "update", "join"],
|
|
6
6
|
"scripts": {
|
package/rbql.js
CHANGED
|
@@ -70,7 +70,7 @@ var query_context = null; // Needs to be global for MIN(), MAX(), etc functions.
|
|
|
70
70
|
|
|
71
71
|
|
|
72
72
|
const wrong_aggregation_usage_error = 'Usage of RBQL aggregation functions inside JavaScript expressions is not allowed, see the docs';
|
|
73
|
-
const RBQL_VERSION = '0.25.0';
|
|
73
|
+
const RBQL_VERSION = '0.26.0';
|
|
74
74
|
|
|
75
75
|
|
|
76
76
|
function check_if_brackets_match(opening_bracket, closing_bracket) {
|
|
@@ -128,28 +128,32 @@ function column_info_from_text_span(text_span, string_literals) {
|
|
|
128
128
|
let attribute_match = /^([ab])\.([_a-zA-Z][_a-zA-Z0-9]*)$/.exec(text_span);
|
|
129
129
|
let subscript_int_match = /^([ab])\[([0-9]+)\]$/.exec(text_span);
|
|
130
130
|
let subscript_str_match = /^([ab])\[___RBQL_STRING_LITERAL([0-9]+)___\]$/.exec(text_span);
|
|
131
|
+
let as_alias_match = /^(.*) (as|AS) +([a-zA-Z][a-zA-Z0-9_]*) *$/.exec(text_span);
|
|
132
|
+
if (as_alias_match !== null) {
|
|
133
|
+
return {table_name: null, column_index: null, column_name: as_alias_match[3], is_star: false, is_alias: true};
|
|
134
|
+
}
|
|
131
135
|
if (simple_var_match !== null) {
|
|
132
136
|
if (text_span == rbql_star_marker)
|
|
133
|
-
return {table_name: null, column_index: null, column_name: null, is_star: true};
|
|
137
|
+
return {table_name: null, column_index: null, column_name: null, is_star: true, is_alias: false};
|
|
134
138
|
if (text_span.startsWith('___RBQL_STRING_LITERAL'))
|
|
135
139
|
return null;
|
|
136
140
|
let match = /^([ab])([0-9]+)$/.exec(text_span);
|
|
137
141
|
if (match !== null) {
|
|
138
|
-
return {table_name: match[1], column_index: parseInt(match[2]) - 1, column_name: null, is_star: false};
|
|
142
|
+
return {table_name: match[1], column_index: parseInt(match[2]) - 1, column_name: null, is_star: false, is_alias: false};
|
|
139
143
|
}
|
|
140
144
|
// Some examples for this branch: NR, NF
|
|
141
|
-
return {table_name: null, column_index: null, column_name: text_span, is_star: false};
|
|
145
|
+
return {table_name: null, column_index: null, column_name: text_span, is_star: false, is_alias: false};
|
|
142
146
|
} else if (attribute_match !== null) {
|
|
143
147
|
let table_name = attribute_match[1];
|
|
144
148
|
let column_name = attribute_match[2];
|
|
145
149
|
if (column_name == rbql_star_marker) {
|
|
146
|
-
return {table_name: table_name, column_index: null, column_name: null, is_star: true};
|
|
150
|
+
return {table_name: table_name, column_index: null, column_name: null, is_star: true, is_alias: false};
|
|
147
151
|
}
|
|
148
|
-
return {table_name: null, column_index: null, column_name: column_name, is_star: false};
|
|
152
|
+
return {table_name: null, column_index: null, column_name: column_name, is_star: false, is_alias: false};
|
|
149
153
|
} else if (subscript_int_match != null) {
|
|
150
154
|
let table_name = subscript_int_match[1];
|
|
151
155
|
let column_index = parseInt(subscript_int_match[2]) - 1;
|
|
152
|
-
return {table_name: table_name, column_index: column_index, column_name: null, is_star: false};
|
|
156
|
+
return {table_name: table_name, column_index: column_index, column_name: null, is_star: false, is_alias: false};
|
|
153
157
|
} else if (subscript_str_match != null) {
|
|
154
158
|
let table_name = subscript_str_match[1];
|
|
155
159
|
let replaced_string_literal_id = subscript_str_match[2];
|
|
@@ -157,7 +161,7 @@ function column_info_from_text_span(text_span, string_literals) {
|
|
|
157
161
|
let quoted_column_name = string_literals[replaced_string_literal_id];
|
|
158
162
|
let unquoted_column_name = unquote_string(quoted_column_name);
|
|
159
163
|
if (unquoted_column_name !== null && unquoted_column_name !== undefined) {
|
|
160
|
-
return {table_name: null, column_index: null, column_name: unquoted_column_name, is_star: false};
|
|
164
|
+
return {table_name: null, column_index: null, column_name: unquoted_column_name, is_star: false, is_alias: false};
|
|
161
165
|
}
|
|
162
166
|
}
|
|
163
167
|
}
|
|
@@ -1328,9 +1332,11 @@ function translate_update_expression(update_expression, input_variables_map, str
|
|
|
1328
1332
|
|
|
1329
1333
|
|
|
1330
1334
|
function translate_select_expression(select_expression) {
|
|
1331
|
-
let
|
|
1332
|
-
let
|
|
1333
|
-
let
|
|
1335
|
+
let as_alias_replacement_regexp = / +(AS|as) +([a-zA-Z][a-zA-Z0-9_]*) *(?=$|,)/g;
|
|
1336
|
+
let expression_without_counting_stars = replace_star_count(select_expression);
|
|
1337
|
+
let expression_without_as_column_alias = expression_without_counting_stars.replace(as_alias_replacement_regexp, '');
|
|
1338
|
+
let translated = str_strip(replace_star_vars(expression_without_as_column_alias));
|
|
1339
|
+
let translated_for_header = str_strip(replace_star_vars_for_header_parsing(expression_without_counting_stars));
|
|
1334
1340
|
if (!translated.length)
|
|
1335
1341
|
throw new RbqlParsingError('"SELECT" expression is empty');
|
|
1336
1342
|
return [`[].concat([${translated}])`, translated_for_header];
|
|
@@ -1571,12 +1577,21 @@ function remove_redundant_table_name(query_text) {
|
|
|
1571
1577
|
|
|
1572
1578
|
|
|
1573
1579
|
function select_output_header(input_header, join_header, query_column_infos) {
|
|
1574
|
-
if (input_header === null
|
|
1580
|
+
if (input_header === null) {
|
|
1581
|
+
assert(join_header === null);
|
|
1582
|
+
}
|
|
1583
|
+
if (input_header === null) {
|
|
1584
|
+
for (let qci of query_column_infos) {
|
|
1585
|
+
if (qci !== null && qci.is_alias) {
|
|
1586
|
+
throw new RbqlParsingError(`Specifying column alias "AS ${qci.column_name}" is not allowed if input table has no header`);
|
|
1587
|
+
}
|
|
1588
|
+
}
|
|
1575
1589
|
return null;
|
|
1576
|
-
|
|
1577
|
-
|
|
1578
|
-
|
|
1590
|
+
}
|
|
1591
|
+
if (join_header === null) {
|
|
1592
|
+
// This means there is no JOIN table.
|
|
1579
1593
|
join_header = [];
|
|
1594
|
+
}
|
|
1580
1595
|
let output_header = [];
|
|
1581
1596
|
for (let qci of query_column_infos) {
|
|
1582
1597
|
// TODO refactor this and python version: extract this code into a function instead to always return something
|
|
@@ -1608,20 +1623,20 @@ function select_output_header(input_header, join_header, query_column_infos) {
|
|
|
1608
1623
|
}
|
|
1609
1624
|
|
|
1610
1625
|
|
|
1611
|
-
function
|
|
1612
|
-
let
|
|
1613
|
-
|
|
1614
|
-
for (let i = 0; i < keys.length; i++) {
|
|
1615
|
-
let key = keys[i];
|
|
1616
|
-
let record_id = inconsistent_records_info[key];
|
|
1617
|
-
entries.push([record_id, key]);
|
|
1618
|
-
}
|
|
1619
|
-
entries.sort(function(a, b) { return a[0] - b[0]; });
|
|
1626
|
+
function sample_first_two_inconsistent_records(inconsistent_records_info) {
|
|
1627
|
+
let entries = Array.from(inconsistent_records_info.entries());
|
|
1628
|
+
entries.sort(function(a, b) { return a[1] - b[1]; });
|
|
1620
1629
|
assert(entries.length > 1);
|
|
1621
|
-
let [
|
|
1622
|
-
let [
|
|
1630
|
+
let [num_fields_1, record_num_1] = entries[0];
|
|
1631
|
+
let [num_fields_2, record_num_2] = entries[1];
|
|
1632
|
+
return [record_num_1, num_fields_1, record_num_2, num_fields_2];
|
|
1633
|
+
}
|
|
1634
|
+
|
|
1635
|
+
|
|
1636
|
+
function make_inconsistent_num_fields_warning(table_name, inconsistent_records_info) {
|
|
1637
|
+
let [record_num_1, num_fields_1, record_num_2, num_fields_2] = sample_first_two_inconsistent_records(inconsistent_records_info);
|
|
1623
1638
|
let warn_msg = `Number of fields in "${table_name}" table is not consistent: `;
|
|
1624
|
-
warn_msg += `e.g. record ${
|
|
1639
|
+
warn_msg += `e.g. record ${record_num_1} -> ${num_fields_1} fields, record ${record_num_2} -> ${num_fields_2} fields`;
|
|
1625
1640
|
return warn_msg;
|
|
1626
1641
|
}
|
|
1627
1642
|
|
|
@@ -1691,7 +1706,7 @@ class TableIterator extends RBQLInputIterator {
|
|
|
1691
1706
|
this.normalize_column_names = normalize_column_names;
|
|
1692
1707
|
this.variable_prefix = variable_prefix;
|
|
1693
1708
|
this.nr = 0;
|
|
1694
|
-
this.fields_info = new
|
|
1709
|
+
this.fields_info = new Map();
|
|
1695
1710
|
this.stopped = false;
|
|
1696
1711
|
}
|
|
1697
1712
|
|
|
@@ -1727,13 +1742,13 @@ class TableIterator extends RBQLInputIterator {
|
|
|
1727
1742
|
let record = this.table[this.nr];
|
|
1728
1743
|
this.nr += 1;
|
|
1729
1744
|
let num_fields = record.length;
|
|
1730
|
-
if (!this.fields_info.
|
|
1731
|
-
this.fields_info
|
|
1745
|
+
if (!this.fields_info.has(num_fields))
|
|
1746
|
+
this.fields_info.set(num_fields, this.nr);
|
|
1732
1747
|
return record;
|
|
1733
1748
|
};
|
|
1734
1749
|
|
|
1735
1750
|
get_warnings() {
|
|
1736
|
-
if (
|
|
1751
|
+
if (this.fields_info.size > 1)
|
|
1737
1752
|
return [make_inconsistent_num_fields_warning('input', this.fields_info)];
|
|
1738
1753
|
return [];
|
|
1739
1754
|
};
|
|
@@ -1799,6 +1814,8 @@ async function shallow_parse_input_query(query_text, input_iterator, join_tables
|
|
|
1799
1814
|
query_context.aggregation_key_expression = '[' + combine_string_literals(rb_actions[GROUP_BY]['text'], string_literals) + ']';
|
|
1800
1815
|
}
|
|
1801
1816
|
|
|
1817
|
+
|
|
1818
|
+
let input_header = await input_iterator.get_header();
|
|
1802
1819
|
let join_variables_map = null;
|
|
1803
1820
|
let join_header = null;
|
|
1804
1821
|
if (rb_actions.hasOwnProperty(JOIN)) {
|
|
@@ -1813,6 +1830,12 @@ async function shallow_parse_input_query(query_text, input_iterator, join_tables
|
|
|
1813
1830
|
}
|
|
1814
1831
|
join_variables_map = await join_record_iterator.get_variables_map(query_text);
|
|
1815
1832
|
join_header = await join_record_iterator.get_header();
|
|
1833
|
+
if (input_header === null && join_header !== null) {
|
|
1834
|
+
throw new RbqlIOHandlingError('Inconsistent modes: Input table doesn\'t have a header while the Join table has a header');
|
|
1835
|
+
}
|
|
1836
|
+
if (input_header !== null && join_header === null) {
|
|
1837
|
+
throw new RbqlIOHandlingError('Inconsistent modes: Input table has a header while the Join table doesn\'t have a header');
|
|
1838
|
+
}
|
|
1816
1839
|
let [lhs_variables, rhs_indices] = resolve_join_variables(input_variables_map, join_variables_map, variable_pairs, string_literals);
|
|
1817
1840
|
let sql_join_type = {'JOIN': InnerJoiner, 'INNER JOIN': InnerJoiner, 'LEFT JOIN': LeftJoiner, 'LEFT OUTER JOIN': LeftJoiner, 'STRICT LEFT JOIN': StrictLeftJoiner}[rb_actions[JOIN]['join_subtype']];
|
|
1818
1841
|
query_context.lhs_join_var_expression = lhs_variables.length == 1 ? lhs_variables[0] : 'JSON.stringify([' + lhs_variables.join(',') + '])';
|
|
@@ -1830,7 +1853,6 @@ async function shallow_parse_input_query(query_text, input_iterator, join_tables
|
|
|
1830
1853
|
query_context.where_expression = combine_string_literals(where_expression, string_literals);
|
|
1831
1854
|
}
|
|
1832
1855
|
|
|
1833
|
-
let input_header = await input_iterator.get_header();
|
|
1834
1856
|
if (rb_actions.hasOwnProperty(UPDATE)) {
|
|
1835
1857
|
var update_expression = translate_update_expression(rb_actions[UPDATE]['text'], input_variables_map, string_literals, ' '.repeat(8));
|
|
1836
1858
|
query_context.update_expressions = combine_string_literals(update_expression, string_literals);
|
|
@@ -1840,13 +1862,16 @@ async function shallow_parse_input_query(query_text, input_iterator, join_tables
|
|
|
1840
1862
|
if (rb_actions.hasOwnProperty(SELECT)) {
|
|
1841
1863
|
query_context.top_count = find_top(rb_actions);
|
|
1842
1864
|
if (rb_actions.hasOwnProperty(EXCEPT)) {
|
|
1865
|
+
if (rb_actions.hasOwnProperty(JOIN)) {
|
|
1866
|
+
throw new RbqlParsingError('EXCEPT and JOIN are not allowed in the same query');
|
|
1867
|
+
}
|
|
1843
1868
|
let [output_header, select_expression] = translate_except_expression(rb_actions[EXCEPT]['text'], input_variables_map, string_literals, input_header);
|
|
1844
1869
|
query_context.select_expression = select_expression;
|
|
1845
1870
|
query_context.writer.set_header(output_header);
|
|
1846
1871
|
} else {
|
|
1847
|
-
let [select_expression,
|
|
1872
|
+
let [select_expression, select_expression_for_header] = translate_select_expression(rb_actions[SELECT]['text']);
|
|
1848
1873
|
query_context.select_expression = combine_string_literals(select_expression, string_literals);
|
|
1849
|
-
let column_infos = adhoc_parse_select_expression_to_column_infos(
|
|
1874
|
+
let column_infos = adhoc_parse_select_expression_to_column_infos(select_expression_for_header, string_literals);
|
|
1850
1875
|
let output_header = select_output_header(input_header, join_header, column_infos);
|
|
1851
1876
|
query_context.writer.set_header(output_header);
|
|
1852
1877
|
}
|
|
@@ -1947,5 +1972,6 @@ exports.adhoc_parse_select_expression_to_column_infos = adhoc_parse_select_expre
|
|
|
1947
1972
|
exports.replace_star_count = replace_star_count;
|
|
1948
1973
|
exports.replace_star_vars_for_header_parsing = replace_star_vars_for_header_parsing;
|
|
1949
1974
|
exports.select_output_header = select_output_header;
|
|
1975
|
+
exports.sample_first_two_inconsistent_records = sample_first_two_inconsistent_records;
|
|
1950
1976
|
|
|
1951
1977
|
}(typeof exports === 'undefined' ? this.rbql = {} : exports));
|
package/rbql_csv.js
CHANGED
|
@@ -62,19 +62,9 @@ function remove_utf8_bom(line, assumed_source_encoding) {
|
|
|
62
62
|
|
|
63
63
|
|
|
64
64
|
function make_inconsistent_num_fields_warning(table_name, inconsistent_records_info) {
|
|
65
|
-
let
|
|
66
|
-
let entries = [];
|
|
67
|
-
for (let i = 0; i < keys.length; i++) {
|
|
68
|
-
let key = keys[i];
|
|
69
|
-
let record_id = inconsistent_records_info[key];
|
|
70
|
-
entries.push([record_id, key]);
|
|
71
|
-
}
|
|
72
|
-
entries.sort(function(a, b) { return a[0] - b[0]; });
|
|
73
|
-
assert(entries.length > 1);
|
|
74
|
-
let [record_1, num_fields_1] = entries[0];
|
|
75
|
-
let [record_2, num_fields_2] = entries[1];
|
|
65
|
+
let [record_num_1, num_fields_1, record_num_2, num_fields_2] = rbql.sample_first_two_inconsistent_records(inconsistent_records_info);
|
|
76
66
|
let warn_msg = `Number of fields in "${table_name}" table is not consistent: `;
|
|
77
|
-
warn_msg += `e.g. record ${
|
|
67
|
+
warn_msg += `e.g. record ${record_num_1} -> ${num_fields_1} fields, record ${record_num_2} -> ${num_fields_2} fields`;
|
|
78
68
|
return warn_msg;
|
|
79
69
|
}
|
|
80
70
|
|
|
@@ -182,7 +172,7 @@ class CSVRecordIterator extends rbql.RBQLInputIterator {
|
|
|
182
172
|
|
|
183
173
|
this.table_name = table_name;
|
|
184
174
|
this.variable_prefix = variable_prefix;
|
|
185
|
-
this.comment_prefix =
|
|
175
|
+
this.comment_prefix = comment_prefix;
|
|
186
176
|
|
|
187
177
|
this.decoder = null;
|
|
188
178
|
if (encoding == 'utf-8' && this.csv_path === null) {
|
|
@@ -204,11 +194,11 @@ class CSVRecordIterator extends rbql.RBQLInputIterator {
|
|
|
204
194
|
this.utf8_bom_removed = false; // BOM doesn't get automatically removed by the decoder when utf-8 file is treated as latin-1
|
|
205
195
|
this.first_defective_line = null;
|
|
206
196
|
|
|
207
|
-
this.fields_info = new
|
|
197
|
+
this.fields_info = new Map();
|
|
208
198
|
this.NR = 0; // Record number
|
|
209
199
|
this.NL = 0; // Line number (NL != NR when the CSV file has comments or multiline fields)
|
|
210
200
|
|
|
211
|
-
this.
|
|
201
|
+
this.line_aggregator = new csv_utils.MultilineRecordAggregator(comment_prefix);
|
|
212
202
|
|
|
213
203
|
this.partially_decoded_line = '';
|
|
214
204
|
this.partially_decoded_line_ends_with_cr = false;
|
|
@@ -222,7 +212,7 @@ class CSVRecordIterator extends rbql.RBQLInputIterator {
|
|
|
222
212
|
|
|
223
213
|
this.produced_records_queue = new RecordQueue();
|
|
224
214
|
|
|
225
|
-
this.process_line_polymorphic = policy == 'quoted_rfc' ? this.process_partial_rfc_record_line : this.
|
|
215
|
+
this.process_line_polymorphic = policy == 'quoted_rfc' ? this.process_partial_rfc_record_line : this.process_record_line_simple;
|
|
226
216
|
}
|
|
227
217
|
|
|
228
218
|
|
|
@@ -350,9 +340,14 @@ class CSVRecordIterator extends rbql.RBQLInputIterator {
|
|
|
350
340
|
};
|
|
351
341
|
|
|
352
342
|
|
|
353
|
-
|
|
354
|
-
if (this.comment_prefix
|
|
343
|
+
process_record_line_simple(line) {
|
|
344
|
+
if (this.comment_prefix && line.startsWith(this.comment_prefix))
|
|
355
345
|
return; // Just skip the line
|
|
346
|
+
this.process_record_line(line);
|
|
347
|
+
}
|
|
348
|
+
|
|
349
|
+
|
|
350
|
+
process_record_line(line) {
|
|
356
351
|
this.NR += 1;
|
|
357
352
|
var [record, warning] = csv_utils.smart_split(line, this.delim, this.policy, false);
|
|
358
353
|
if (warning) {
|
|
@@ -363,29 +358,20 @@ class CSVRecordIterator extends rbql.RBQLInputIterator {
|
|
|
363
358
|
}
|
|
364
359
|
}
|
|
365
360
|
let num_fields = record.length;
|
|
366
|
-
if (!this.fields_info.
|
|
367
|
-
this.fields_info
|
|
361
|
+
if (!this.fields_info.has(num_fields))
|
|
362
|
+
this.fields_info.set(num_fields, this.NR);
|
|
368
363
|
this.produced_records_queue.enqueue(record);
|
|
369
364
|
this.try_resolve_next_record();
|
|
370
365
|
};
|
|
371
366
|
|
|
372
367
|
|
|
373
368
|
process_partial_rfc_record_line(line) {
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
this.
|
|
380
|
-
} else if (this.rfc_line_buffer.length == 0 && has_unbalanced_double_quote) {
|
|
381
|
-
this.rfc_line_buffer.push(line);
|
|
382
|
-
} else if (!has_unbalanced_double_quote) {
|
|
383
|
-
this.rfc_line_buffer.push(line);
|
|
384
|
-
} else {
|
|
385
|
-
this.rfc_line_buffer.push(line);
|
|
386
|
-
let multiline_row = this.rfc_line_buffer.join('\n');
|
|
387
|
-
this.rfc_line_buffer = [];
|
|
388
|
-
this.process_record_line(multiline_row);
|
|
369
|
+
this.line_aggregator.add_line(line);
|
|
370
|
+
if (this.line_aggregator.has_comment_line) {
|
|
371
|
+
this.line_aggregator.reset();
|
|
372
|
+
} else if (this.line_aggregator.has_full_record) {
|
|
373
|
+
this.process_record_line(this.line_aggregator.get_full_line('\n'));
|
|
374
|
+
this.line_aggregator.reset();
|
|
389
375
|
}
|
|
390
376
|
};
|
|
391
377
|
|
|
@@ -432,13 +418,13 @@ class CSVRecordIterator extends rbql.RBQLInputIterator {
|
|
|
432
418
|
};
|
|
433
419
|
|
|
434
420
|
|
|
435
|
-
process_data_bulk(
|
|
436
|
-
let decoded_string =
|
|
421
|
+
process_data_bulk(data_blob) {
|
|
422
|
+
let decoded_string = data_blob.toString(this.encoding);
|
|
437
423
|
if (this.encoding == 'utf-8') {
|
|
438
424
|
// Using hacky comparison method from here: https://stackoverflow.com/a/32279283/2898283
|
|
439
425
|
// TODO get rid of this once TextDecoder is really fixed or when alternative method of reliable decoding appears
|
|
440
426
|
let control_buffer = Buffer.from(decoded_string, 'utf-8');
|
|
441
|
-
if (Buffer.compare(
|
|
427
|
+
if (Buffer.compare(data_blob, control_buffer) != 0) {
|
|
442
428
|
this.store_or_propagate_exception(new RbqlIOHandlingError(utf_decoding_error));
|
|
443
429
|
return;
|
|
444
430
|
}
|
|
@@ -449,8 +435,8 @@ class CSVRecordIterator extends rbql.RBQLInputIterator {
|
|
|
449
435
|
for (let i = 0; i < lines.length; i++) {
|
|
450
436
|
this.process_line(lines[i]);
|
|
451
437
|
}
|
|
452
|
-
if (this.
|
|
453
|
-
this.process_record_line(this.
|
|
438
|
+
if (this.line_aggregator.is_inside_multiline_record()) {
|
|
439
|
+
this.process_record_line(this.line_aggregator.get_full_line('\n'));
|
|
454
440
|
}
|
|
455
441
|
this.input_exhausted = true;
|
|
456
442
|
this.try_resolve_next_record(); // Should be a NOOP here?
|
|
@@ -464,8 +450,8 @@ class CSVRecordIterator extends rbql.RBQLInputIterator {
|
|
|
464
450
|
this.partially_decoded_line = '';
|
|
465
451
|
this.process_line(last_line);
|
|
466
452
|
}
|
|
467
|
-
if (this.
|
|
468
|
-
this.process_record_line(this.
|
|
453
|
+
if (this.line_aggregator.is_inside_multiline_record()) {
|
|
454
|
+
this.process_record_line(this.line_aggregator.get_full_line('\n'));
|
|
469
455
|
}
|
|
470
456
|
this.try_resolve_next_record();
|
|
471
457
|
};
|
|
@@ -487,11 +473,11 @@ class CSVRecordIterator extends rbql.RBQLInputIterator {
|
|
|
487
473
|
} else {
|
|
488
474
|
let parent_iterator = this;
|
|
489
475
|
return new Promise(function(resolve, reject) {
|
|
490
|
-
fs.readFile(parent_iterator.csv_path, (err,
|
|
476
|
+
fs.readFile(parent_iterator.csv_path, (err, data_blob) => {
|
|
491
477
|
if (err) {
|
|
492
478
|
reject(err);
|
|
493
479
|
} else {
|
|
494
|
-
parent_iterator.process_data_bulk(
|
|
480
|
+
parent_iterator.process_data_bulk(data_blob);
|
|
495
481
|
resolve();
|
|
496
482
|
}
|
|
497
483
|
});
|
|
@@ -506,8 +492,8 @@ class CSVRecordIterator extends rbql.RBQLInputIterator {
|
|
|
506
492
|
result.push(`Inconsistent double quote escaping in ${this.table_name} table. E.g. at line ${this.first_defective_line}`);
|
|
507
493
|
if (this.utf8_bom_removed)
|
|
508
494
|
result.push(`UTF-8 Byte Order Mark (BOM) was found and skipped in ${this.table_name} table`);
|
|
509
|
-
if (
|
|
510
|
-
result.push(make_inconsistent_num_fields_warning(
|
|
495
|
+
if (this.fields_info.size > 1)
|
|
496
|
+
result.push(make_inconsistent_num_fields_warning(this.table_name, this.fields_info));
|
|
511
497
|
return result;
|
|
512
498
|
};
|
|
513
499
|
}
|
|
@@ -710,7 +696,6 @@ async function query_csv(query_text, input_path, input_delim, input_policy, outp
|
|
|
710
696
|
input_stream = input_path === null ? process.stdin : fs.createReadStream(input_path);
|
|
711
697
|
}
|
|
712
698
|
let [output_stream, close_output_on_finish] = output_path === null ? [process.stdout, false] : [fs.createWriteStream(output_path), true];
|
|
713
|
-
// FIXME add on(error) handler to avoid async errors, see https://github.com/nodejs/node-v0.x-archive/issues/406
|
|
714
699
|
if (input_delim == '"' && input_policy == 'quoted')
|
|
715
700
|
throw new RbqlIOHandlingError('Double quote delimiter is incompatible with "quoted" policy');
|
|
716
701
|
if (csv_encoding == 'latin-1')
|