rbql 0.25.0 → 0.26.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/.eslintrc.json ADDED
@@ -0,0 +1,26 @@
1
+ {
2
+ "env": {
3
+ "browser": false,
4
+ "commonjs": true,
5
+ "es6": true,
6
+ "node": true
7
+ },
8
+ "parserOptions": {
9
+ "ecmaFeatures": {
10
+ "jsx": true
11
+ },
12
+ "sourceType": "module",
13
+ "ecmaVersion": 2018
14
+ },
15
+ "rules": {
16
+ "no-const-assign": "warn",
17
+ "no-this-before-super": "warn",
18
+ "no-undef": "warn",
19
+ "semi": [2, "always"],
20
+ "no-unreachable": "warn",
21
+ "no-unused-vars": "warn",
22
+ "constructor-super": "warn",
23
+ "no-trailing-spaces": "error",
24
+ "valid-typeof": "warn"
25
+ }
26
+ }
package/README.md CHANGED
@@ -196,7 +196,7 @@ rbql.query_table(user_query, input_table, output_table, warnings).then(success_h
196
196
 
197
197
  #### Example of query_csv() usage:
198
198
  ```
199
- const rbql_csv = require('rbql_csv');
199
+ const rbql = require('rbql');
200
200
  let user_query = 'SELECT a1, parseInt(a2) % 1000 WHERE a3 != "USA" LIMIT 5';
201
201
  let error_handler = function(exception) {
202
202
  console.log('Error: ' + String(exception));
@@ -207,7 +207,7 @@ let success_handler = function() {
207
207
  console.log('warnings: ' + JSON.stringify(warnings));
208
208
  console.log('output table: output.csv');
209
209
  }
210
- rbql_csv.query_csv(user_query, 'input.csv', ',', 'quoted', 'output.csv', ',', 'quoted', 'utf-8', warnings).then(success_handler).catch(error_handler);
210
+ rbql.query_csv(user_query, 'input.csv', ',', 'quoted', 'output.csv', ',', 'quoted', 'utf-8', warnings).then(success_handler).catch(error_handler);
211
211
  ```
212
212
 
213
213
 
@@ -267,6 +267,7 @@ $ rbql-js --input input.csv --output result.csv
267
267
  * GROUP BY
268
268
  * TOP _N_
269
269
  * LIMIT _N_
270
+ * AS
270
271
 
271
272
  All keywords have the same meaning as in SQL queries. You can check them [online](https://www.w3schools.com/sql/default.asp)
272
273
 
@@ -306,7 +307,7 @@ _COUNT_, _ARRAY_AGG_, _MIN_, _MAX_, _SUM_, _AVG_, _VARIANCE_, _MEDIAN_
306
307
  Limitation: aggregate functions inside JavaScript expressions are not supported, although you can use expressions inside aggregate functions.
307
308
  E.g. `MAX(parseFloat(a1) / 1000)` - valid; `MAX(a1) / 1000` - invalid.
308
309
  There is a workaround for the limitation above for _ARRAY_AGG_ function which supports an optional parameter - a callback function that can do something with the aggregated array. Example:
309
- `select a2, ARRAY_AGG(a1, v => v.sort().slice(0, 5)) group by a2`
310
+ `SELECT a2, ARRAY_AGG(a1, v => v.sort().slice(0, 5)) GROUP BY a2`
310
311
 
311
312
 
312
313
  ### JOIN statements
@@ -346,14 +347,15 @@ You can define custom functions and/or import libraries in a special file: `~/.r
346
347
 
347
348
  ## Examples of RBQL queries
348
349
 
349
- * `select top 100 a1, a2 * 10, a4.length where a1 == "Buy" order by parseInt(a2) desc`
350
- * `select * order by Math.random()` - random sort
351
- * `select top 20 a.vehicle_price.length / 10, a2 where parseInt(a.vehicle_price) < 500 && ["car", "plane", "boat"].indexOf(a['Vehicle type']) > -1 limit 20` - referencing columns by names from header
352
- * `update set a3 = 'NPC' where a3.indexOf('Non-playable character') != -1`
353
- * `select NR, *` - enumerate records, NR is 1-based
354
- * `select a1, b1, b2 inner join ./countries.txt on a2 == b1 order by a1, a3` - example of join query
355
- * `select MAX(a1), MIN(a1) where a.Name != 'John' group by a2, a3` - example of aggregate query
356
- * `select ...a1.split(':')` - Using JS "destructuring assignment" syntax to split one column into many. Do not try this with other SQL engines!
350
+ * `SELECT TOP 100 a1, a2 * 10, a4.length WHERE a1 == "Buy" ORDER BY parseInt(a2) DESC`
351
+ * `SELECT a.id, a.weight / 1000 AS weight_kg`
352
+ * `SELECT * ORDER BY Math.random()` - random sort
353
+ * `SELECT TOP 20 a.vehicle_price.length / 10, a2 WHERE parseInt(a.vehicle_price) < 500 && ["car", "plane", "boat"].indexOf(a['Vehicle type']) > -1 LIMIT 20` - referencing columns by names from header
354
+ * `UPDATE SET a3 = 'NPC' WHERE a3.indexOf('Non-playable character') != -1`
355
+ * `SELECT NR, *` - enumerate records, NR is 1-based
356
+ * `SELECT a1, b1, b2 INNER JOIN ./countries.txt ON a2 == b1 ORDER BY a1, a3` - example of join query
357
+ * `SELECT MAX(a1), MIN(a1) WHERE a.Name != 'John' GROUP BY a2, a3` - example of aggregate query
358
+ * `SELECT ...a1.split(':')` - Using JS "spread" syntax to split one column into many. Do not try this with other SQL engines!
357
359
 
358
360
 
359
361
  ### References
package/cli_parser.js CHANGED
File without changes
package/csv_utils.js CHANGED
@@ -39,6 +39,7 @@ function extract_next_field(src, dlm, preserve_quotes_and_whitespaces, allow_ext
39
39
 
40
40
 
41
41
  function split_quoted_str(src, dlm, preserve_quotes_and_whitespaces=false) {
42
+ // This function is newline-agnostic i.e. it can also split records with multiline fields.
42
43
  if (src.indexOf('"') == -1) // Optimization for most common case
43
44
  return [src.split(dlm), false];
44
45
  var result = [];
@@ -116,6 +117,41 @@ function smart_split(src, dlm, policy, preserve_quotes_and_whitespaces) {
116
117
  }
117
118
 
118
119
 
120
+ class MultilineRecordAggregator {
121
+ constructor(comment_prefix) {
122
+ this.comment_prefix = comment_prefix;
123
+ this.reset();
124
+ }
125
+ add_line(line_text) {
126
+ if (this.has_full_record || this.has_comment_line) {
127
+ throw new Error('Invalid usage - record aggregator must be reset before adding new lines');
128
+ }
129
+ if (this.comment_prefix && this.rfc_line_buffer.length == 0 && line_text.startsWith(this.comment_prefix)) {
130
+ this.has_comment_line = true;
131
+ return false;
132
+ }
133
+ let match_list = line_text.match(/"/g);
134
+ let has_unbalanced_double_quote = match_list && match_list.length % 2 == 1;
135
+ this.rfc_line_buffer.push(line_text);
136
+ this.has_full_record = (!has_unbalanced_double_quote && this.rfc_line_buffer.length == 1) || (has_unbalanced_double_quote && this.rfc_line_buffer.length > 1);
137
+ return this.has_full_record;
138
+ }
139
+ is_inside_multiline_record() {
140
+ return this.rfc_line_buffer.length && !this.has_full_record;
141
+ }
142
+ get_full_line(line_separator) {
143
+ return this.rfc_line_buffer.join(line_separator);
144
+ }
145
+ get_num_lines_in_record() {
146
+ return this.rfc_line_buffer.length;
147
+ }
148
+ reset() {
149
+ this.rfc_line_buffer = [];
150
+ this.has_full_record = false;
151
+ this.has_comment_line = false;
152
+ }
153
+ }
154
+
119
155
 
120
156
  module.exports.split_quoted_str = split_quoted_str;
121
157
  module.exports.split_whitespace_separated_str = split_whitespace_separated_str;
@@ -125,3 +161,4 @@ module.exports.rfc_quote_field = rfc_quote_field;
125
161
  module.exports.unquote_field = unquote_field;
126
162
  module.exports.unquote_fields = unquote_fields;
127
163
  module.exports.split_lines = split_lines;
164
+ module.exports.MultilineRecordAggregator = MultilineRecordAggregator;
package/index.js CHANGED
File without changes
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "rbql",
3
- "version": "0.25.0",
3
+ "version": "0.26.0",
4
4
  "description": "Rainbow Query Language",
5
5
  "keywords": ["CSV", "TSV", "spreadsheet", "SQL", "SQL-like", "transpiler", "CLI", "command-line", "library", "browser", "Node", "select", "update", "join"],
6
6
  "scripts": {
package/rbql.js CHANGED
@@ -70,7 +70,7 @@ var query_context = null; // Needs to be global for MIN(), MAX(), etc functions.
70
70
 
71
71
 
72
72
  const wrong_aggregation_usage_error = 'Usage of RBQL aggregation functions inside JavaScript expressions is not allowed, see the docs';
73
- const RBQL_VERSION = '0.25.0';
73
+ const RBQL_VERSION = '0.26.0';
74
74
 
75
75
 
76
76
  function check_if_brackets_match(opening_bracket, closing_bracket) {
@@ -128,28 +128,32 @@ function column_info_from_text_span(text_span, string_literals) {
128
128
  let attribute_match = /^([ab])\.([_a-zA-Z][_a-zA-Z0-9]*)$/.exec(text_span);
129
129
  let subscript_int_match = /^([ab])\[([0-9]+)\]$/.exec(text_span);
130
130
  let subscript_str_match = /^([ab])\[___RBQL_STRING_LITERAL([0-9]+)___\]$/.exec(text_span);
131
+ let as_alias_match = /^(.*) (as|AS) +([a-zA-Z][a-zA-Z0-9_]*) *$/.exec(text_span);
132
+ if (as_alias_match !== null) {
133
+ return {table_name: null, column_index: null, column_name: as_alias_match[3], is_star: false, is_alias: true};
134
+ }
131
135
  if (simple_var_match !== null) {
132
136
  if (text_span == rbql_star_marker)
133
- return {table_name: null, column_index: null, column_name: null, is_star: true};
137
+ return {table_name: null, column_index: null, column_name: null, is_star: true, is_alias: false};
134
138
  if (text_span.startsWith('___RBQL_STRING_LITERAL'))
135
139
  return null;
136
140
  let match = /^([ab])([0-9]+)$/.exec(text_span);
137
141
  if (match !== null) {
138
- return {table_name: match[1], column_index: parseInt(match[2]) - 1, column_name: null, is_star: false};
142
+ return {table_name: match[1], column_index: parseInt(match[2]) - 1, column_name: null, is_star: false, is_alias: false};
139
143
  }
140
144
  // Some examples for this branch: NR, NF
141
- return {table_name: null, column_index: null, column_name: text_span, is_star: false};
145
+ return {table_name: null, column_index: null, column_name: text_span, is_star: false, is_alias: false};
142
146
  } else if (attribute_match !== null) {
143
147
  let table_name = attribute_match[1];
144
148
  let column_name = attribute_match[2];
145
149
  if (column_name == rbql_star_marker) {
146
- return {table_name: table_name, column_index: null, column_name: null, is_star: true};
150
+ return {table_name: table_name, column_index: null, column_name: null, is_star: true, is_alias: false};
147
151
  }
148
- return {table_name: null, column_index: null, column_name: column_name, is_star: false};
152
+ return {table_name: null, column_index: null, column_name: column_name, is_star: false, is_alias: false};
149
153
  } else if (subscript_int_match != null) {
150
154
  let table_name = subscript_int_match[1];
151
155
  let column_index = parseInt(subscript_int_match[2]) - 1;
152
- return {table_name: table_name, column_index: column_index, column_name: null, is_star: false};
156
+ return {table_name: table_name, column_index: column_index, column_name: null, is_star: false, is_alias: false};
153
157
  } else if (subscript_str_match != null) {
154
158
  let table_name = subscript_str_match[1];
155
159
  let replaced_string_literal_id = subscript_str_match[2];
@@ -157,7 +161,7 @@ function column_info_from_text_span(text_span, string_literals) {
157
161
  let quoted_column_name = string_literals[replaced_string_literal_id];
158
162
  let unquoted_column_name = unquote_string(quoted_column_name);
159
163
  if (unquoted_column_name !== null && unquoted_column_name !== undefined) {
160
- return {table_name: null, column_index: null, column_name: unquoted_column_name, is_star: false};
164
+ return {table_name: null, column_index: null, column_name: unquoted_column_name, is_star: false, is_alias: false};
161
165
  }
162
166
  }
163
167
  }
@@ -1328,9 +1332,11 @@ function translate_update_expression(update_expression, input_variables_map, str
1328
1332
 
1329
1333
 
1330
1334
  function translate_select_expression(select_expression) {
1331
- let expression_without_stars = replace_star_count(select_expression);
1332
- let translated = str_strip(replace_star_vars(expression_without_stars));
1333
- let translated_for_header = str_strip(replace_star_vars_for_header_parsing(expression_without_stars));
1335
+ let as_alias_replacement_regexp = / +(AS|as) +([a-zA-Z][a-zA-Z0-9_]*) *(?=$|,)/g;
1336
+ let expression_without_counting_stars = replace_star_count(select_expression);
1337
+ let expression_without_as_column_alias = expression_without_counting_stars.replace(as_alias_replacement_regexp, '');
1338
+ let translated = str_strip(replace_star_vars(expression_without_as_column_alias));
1339
+ let translated_for_header = str_strip(replace_star_vars_for_header_parsing(expression_without_counting_stars));
1334
1340
  if (!translated.length)
1335
1341
  throw new RbqlParsingError('"SELECT" expression is empty');
1336
1342
  return [`[].concat([${translated}])`, translated_for_header];
@@ -1571,12 +1577,21 @@ function remove_redundant_table_name(query_text) {
1571
1577
 
1572
1578
 
1573
1579
  function select_output_header(input_header, join_header, query_column_infos) {
1574
- if (input_header === null && join_header === null)
1580
+ if (input_header === null) {
1581
+ assert(join_header === null);
1582
+ }
1583
+ if (input_header === null) {
1584
+ for (let qci of query_column_infos) {
1585
+ if (qci !== null && qci.is_alias) {
1586
+ throw new RbqlParsingError(`Specifying column alias "AS ${qci.column_name}" is not allowed if input table has no header`);
1587
+ }
1588
+ }
1575
1589
  return null;
1576
- if (input_header === null)
1577
- input_header = [];
1578
- if (join_header === null)
1590
+ }
1591
+ if (join_header === null) {
1592
+ // This means there is no JOIN table.
1579
1593
  join_header = [];
1594
+ }
1580
1595
  let output_header = [];
1581
1596
  for (let qci of query_column_infos) {
1582
1597
  // TODO refactor this and python version: extract this code into a function instead to always return something
@@ -1608,20 +1623,20 @@ function select_output_header(input_header, join_header, query_column_infos) {
1608
1623
  }
1609
1624
 
1610
1625
 
1611
- function make_inconsistent_num_fields_warning(table_name, inconsistent_records_info) {
1612
- let keys = Object.keys(inconsistent_records_info);
1613
- let entries = [];
1614
- for (let i = 0; i < keys.length; i++) {
1615
- let key = keys[i];
1616
- let record_id = inconsistent_records_info[key];
1617
- entries.push([record_id, key]);
1618
- }
1619
- entries.sort(function(a, b) { return a[0] - b[0]; });
1626
+ function sample_first_two_inconsistent_records(inconsistent_records_info) {
1627
+ let entries = Array.from(inconsistent_records_info.entries());
1628
+ entries.sort(function(a, b) { return a[1] - b[1]; });
1620
1629
  assert(entries.length > 1);
1621
- let [record_1, num_fields_1] = entries[0];
1622
- let [record_2, num_fields_2] = entries[1];
1630
+ let [num_fields_1, record_num_1] = entries[0];
1631
+ let [num_fields_2, record_num_2] = entries[1];
1632
+ return [record_num_1, num_fields_1, record_num_2, num_fields_2];
1633
+ }
1634
+
1635
+
1636
+ function make_inconsistent_num_fields_warning(table_name, inconsistent_records_info) {
1637
+ let [record_num_1, num_fields_1, record_num_2, num_fields_2] = sample_first_two_inconsistent_records(inconsistent_records_info);
1623
1638
  let warn_msg = `Number of fields in "${table_name}" table is not consistent: `;
1624
- warn_msg += `e.g. record ${record_1} -> ${num_fields_1} fields, record ${record_2} -> ${num_fields_2} fields`;
1639
+ warn_msg += `e.g. record ${record_num_1} -> ${num_fields_1} fields, record ${record_num_2} -> ${num_fields_2} fields`;
1625
1640
  return warn_msg;
1626
1641
  }
1627
1642
 
@@ -1691,7 +1706,7 @@ class TableIterator extends RBQLInputIterator {
1691
1706
  this.normalize_column_names = normalize_column_names;
1692
1707
  this.variable_prefix = variable_prefix;
1693
1708
  this.nr = 0;
1694
- this.fields_info = new Object();
1709
+ this.fields_info = new Map();
1695
1710
  this.stopped = false;
1696
1711
  }
1697
1712
 
@@ -1727,13 +1742,13 @@ class TableIterator extends RBQLInputIterator {
1727
1742
  let record = this.table[this.nr];
1728
1743
  this.nr += 1;
1729
1744
  let num_fields = record.length;
1730
- if (!this.fields_info.hasOwnProperty(num_fields))
1731
- this.fields_info[num_fields] = this.nr;
1745
+ if (!this.fields_info.has(num_fields))
1746
+ this.fields_info.set(num_fields, this.nr);
1732
1747
  return record;
1733
1748
  };
1734
1749
 
1735
1750
  get_warnings() {
1736
- if (Object.keys(this.fields_info).length > 1)
1751
+ if (this.fields_info.size > 1)
1737
1752
  return [make_inconsistent_num_fields_warning('input', this.fields_info)];
1738
1753
  return [];
1739
1754
  };
@@ -1799,6 +1814,8 @@ async function shallow_parse_input_query(query_text, input_iterator, join_tables
1799
1814
  query_context.aggregation_key_expression = '[' + combine_string_literals(rb_actions[GROUP_BY]['text'], string_literals) + ']';
1800
1815
  }
1801
1816
 
1817
+
1818
+ let input_header = await input_iterator.get_header();
1802
1819
  let join_variables_map = null;
1803
1820
  let join_header = null;
1804
1821
  if (rb_actions.hasOwnProperty(JOIN)) {
@@ -1813,6 +1830,12 @@ async function shallow_parse_input_query(query_text, input_iterator, join_tables
1813
1830
  }
1814
1831
  join_variables_map = await join_record_iterator.get_variables_map(query_text);
1815
1832
  join_header = await join_record_iterator.get_header();
1833
+ if (input_header === null && join_header !== null) {
1834
+ throw new RbqlIOHandlingError('Inconsistent modes: Input table doesn\'t have a header while the Join table has a header');
1835
+ }
1836
+ if (input_header !== null && join_header === null) {
1837
+ throw new RbqlIOHandlingError('Inconsistent modes: Input table has a header while the Join table doesn\'t have a header');
1838
+ }
1816
1839
  let [lhs_variables, rhs_indices] = resolve_join_variables(input_variables_map, join_variables_map, variable_pairs, string_literals);
1817
1840
  let sql_join_type = {'JOIN': InnerJoiner, 'INNER JOIN': InnerJoiner, 'LEFT JOIN': LeftJoiner, 'LEFT OUTER JOIN': LeftJoiner, 'STRICT LEFT JOIN': StrictLeftJoiner}[rb_actions[JOIN]['join_subtype']];
1818
1841
  query_context.lhs_join_var_expression = lhs_variables.length == 1 ? lhs_variables[0] : 'JSON.stringify([' + lhs_variables.join(',') + '])';
@@ -1830,7 +1853,6 @@ async function shallow_parse_input_query(query_text, input_iterator, join_tables
1830
1853
  query_context.where_expression = combine_string_literals(where_expression, string_literals);
1831
1854
  }
1832
1855
 
1833
- let input_header = await input_iterator.get_header();
1834
1856
  if (rb_actions.hasOwnProperty(UPDATE)) {
1835
1857
  var update_expression = translate_update_expression(rb_actions[UPDATE]['text'], input_variables_map, string_literals, ' '.repeat(8));
1836
1858
  query_context.update_expressions = combine_string_literals(update_expression, string_literals);
@@ -1840,13 +1862,16 @@ async function shallow_parse_input_query(query_text, input_iterator, join_tables
1840
1862
  if (rb_actions.hasOwnProperty(SELECT)) {
1841
1863
  query_context.top_count = find_top(rb_actions);
1842
1864
  if (rb_actions.hasOwnProperty(EXCEPT)) {
1865
+ if (rb_actions.hasOwnProperty(JOIN)) {
1866
+ throw new RbqlParsingError('EXCEPT and JOIN are not allowed in the same query');
1867
+ }
1843
1868
  let [output_header, select_expression] = translate_except_expression(rb_actions[EXCEPT]['text'], input_variables_map, string_literals, input_header);
1844
1869
  query_context.select_expression = select_expression;
1845
1870
  query_context.writer.set_header(output_header);
1846
1871
  } else {
1847
- let [select_expression, select_expression_for_ast] = translate_select_expression(rb_actions[SELECT]['text']);
1872
+ let [select_expression, select_expression_for_header] = translate_select_expression(rb_actions[SELECT]['text']);
1848
1873
  query_context.select_expression = combine_string_literals(select_expression, string_literals);
1849
- let column_infos = adhoc_parse_select_expression_to_column_infos(select_expression_for_ast, string_literals);
1874
+ let column_infos = adhoc_parse_select_expression_to_column_infos(select_expression_for_header, string_literals);
1850
1875
  let output_header = select_output_header(input_header, join_header, column_infos);
1851
1876
  query_context.writer.set_header(output_header);
1852
1877
  }
@@ -1947,5 +1972,6 @@ exports.adhoc_parse_select_expression_to_column_infos = adhoc_parse_select_expre
1947
1972
  exports.replace_star_count = replace_star_count;
1948
1973
  exports.replace_star_vars_for_header_parsing = replace_star_vars_for_header_parsing;
1949
1974
  exports.select_output_header = select_output_header;
1975
+ exports.sample_first_two_inconsistent_records = sample_first_two_inconsistent_records;
1950
1976
 
1951
1977
  }(typeof exports === 'undefined' ? this.rbql = {} : exports));
package/rbql_csv.js CHANGED
@@ -62,19 +62,9 @@ function remove_utf8_bom(line, assumed_source_encoding) {
62
62
 
63
63
 
64
64
  function make_inconsistent_num_fields_warning(table_name, inconsistent_records_info) {
65
- let keys = Object.keys(inconsistent_records_info);
66
- let entries = [];
67
- for (let i = 0; i < keys.length; i++) {
68
- let key = keys[i];
69
- let record_id = inconsistent_records_info[key];
70
- entries.push([record_id, key]);
71
- }
72
- entries.sort(function(a, b) { return a[0] - b[0]; });
73
- assert(entries.length > 1);
74
- let [record_1, num_fields_1] = entries[0];
75
- let [record_2, num_fields_2] = entries[1];
65
+ let [record_num_1, num_fields_1, record_num_2, num_fields_2] = rbql.sample_first_two_inconsistent_records(inconsistent_records_info);
76
66
  let warn_msg = `Number of fields in "${table_name}" table is not consistent: `;
77
- warn_msg += `e.g. record ${record_1} -> ${num_fields_1} fields, record ${record_2} -> ${num_fields_2} fields`;
67
+ warn_msg += `e.g. record ${record_num_1} -> ${num_fields_1} fields, record ${record_num_2} -> ${num_fields_2} fields`;
78
68
  return warn_msg;
79
69
  }
80
70
 
@@ -182,7 +172,7 @@ class CSVRecordIterator extends rbql.RBQLInputIterator {
182
172
 
183
173
  this.table_name = table_name;
184
174
  this.variable_prefix = variable_prefix;
185
- this.comment_prefix = (comment_prefix !== null && comment_prefix.length) ? comment_prefix : null;
175
+ this.comment_prefix = comment_prefix;
186
176
 
187
177
  this.decoder = null;
188
178
  if (encoding == 'utf-8' && this.csv_path === null) {
@@ -204,11 +194,11 @@ class CSVRecordIterator extends rbql.RBQLInputIterator {
204
194
  this.utf8_bom_removed = false; // BOM doesn't get automatically removed by the decoder when utf-8 file is treated as latin-1
205
195
  this.first_defective_line = null;
206
196
 
207
- this.fields_info = new Object();
197
+ this.fields_info = new Map();
208
198
  this.NR = 0; // Record number
209
199
  this.NL = 0; // Line number (NL != NR when the CSV file has comments or multiline fields)
210
200
 
211
- this.rfc_line_buffer = [];
201
+ this.line_aggregator = new csv_utils.MultilineRecordAggregator(comment_prefix);
212
202
 
213
203
  this.partially_decoded_line = '';
214
204
  this.partially_decoded_line_ends_with_cr = false;
@@ -222,7 +212,7 @@ class CSVRecordIterator extends rbql.RBQLInputIterator {
222
212
 
223
213
  this.produced_records_queue = new RecordQueue();
224
214
 
225
- this.process_line_polymorphic = policy == 'quoted_rfc' ? this.process_partial_rfc_record_line : this.process_record_line;
215
+ this.process_line_polymorphic = policy == 'quoted_rfc' ? this.process_partial_rfc_record_line : this.process_record_line_simple;
226
216
  }
227
217
 
228
218
 
@@ -350,9 +340,14 @@ class CSVRecordIterator extends rbql.RBQLInputIterator {
350
340
  };
351
341
 
352
342
 
353
- process_record_line(line) {
354
- if (this.comment_prefix !== null && line.startsWith(this.comment_prefix))
343
+ process_record_line_simple(line) {
344
+ if (this.comment_prefix && line.startsWith(this.comment_prefix))
355
345
  return; // Just skip the line
346
+ this.process_record_line(line);
347
+ }
348
+
349
+
350
+ process_record_line(line) {
356
351
  this.NR += 1;
357
352
  var [record, warning] = csv_utils.smart_split(line, this.delim, this.policy, false);
358
353
  if (warning) {
@@ -363,29 +358,20 @@ class CSVRecordIterator extends rbql.RBQLInputIterator {
363
358
  }
364
359
  }
365
360
  let num_fields = record.length;
366
- if (!this.fields_info.hasOwnProperty(num_fields))
367
- this.fields_info[num_fields] = this.NR;
361
+ if (!this.fields_info.has(num_fields))
362
+ this.fields_info.set(num_fields, this.NR);
368
363
  this.produced_records_queue.enqueue(record);
369
364
  this.try_resolve_next_record();
370
365
  };
371
366
 
372
367
 
373
368
  process_partial_rfc_record_line(line) {
374
- if (this.comment_prefix !== null && this.rfc_line_buffer.length == 0 && line.startsWith(this.comment_prefix))
375
- return; // Just skip the line
376
- let match_list = line.match(/"/g);
377
- let has_unbalanced_double_quote = match_list && match_list.length % 2 == 1;
378
- if (this.rfc_line_buffer.length == 0 && !has_unbalanced_double_quote) {
379
- this.process_record_line(line);
380
- } else if (this.rfc_line_buffer.length == 0 && has_unbalanced_double_quote) {
381
- this.rfc_line_buffer.push(line);
382
- } else if (!has_unbalanced_double_quote) {
383
- this.rfc_line_buffer.push(line);
384
- } else {
385
- this.rfc_line_buffer.push(line);
386
- let multiline_row = this.rfc_line_buffer.join('\n');
387
- this.rfc_line_buffer = [];
388
- this.process_record_line(multiline_row);
369
+ this.line_aggregator.add_line(line);
370
+ if (this.line_aggregator.has_comment_line) {
371
+ this.line_aggregator.reset();
372
+ } else if (this.line_aggregator.has_full_record) {
373
+ this.process_record_line(this.line_aggregator.get_full_line('\n'));
374
+ this.line_aggregator.reset();
389
375
  }
390
376
  };
391
377
 
@@ -432,13 +418,13 @@ class CSVRecordIterator extends rbql.RBQLInputIterator {
432
418
  };
433
419
 
434
420
 
435
- process_data_bulk(data_chunk) {
436
- let decoded_string = data_chunk.toString(this.encoding);
421
+ process_data_bulk(data_blob) {
422
+ let decoded_string = data_blob.toString(this.encoding);
437
423
  if (this.encoding == 'utf-8') {
438
424
  // Using hacky comparison method from here: https://stackoverflow.com/a/32279283/2898283
439
425
  // TODO get rid of this once TextDecoder is really fixed or when alternative method of reliable decoding appears
440
426
  let control_buffer = Buffer.from(decoded_string, 'utf-8');
441
- if (Buffer.compare(data_chunk, control_buffer) != 0) {
427
+ if (Buffer.compare(data_blob, control_buffer) != 0) {
442
428
  this.store_or_propagate_exception(new RbqlIOHandlingError(utf_decoding_error));
443
429
  return;
444
430
  }
@@ -449,8 +435,8 @@ class CSVRecordIterator extends rbql.RBQLInputIterator {
449
435
  for (let i = 0; i < lines.length; i++) {
450
436
  this.process_line(lines[i]);
451
437
  }
452
- if (this.rfc_line_buffer.length > 0) {
453
- this.process_record_line(this.rfc_line_buffer.join('\n'));
438
+ if (this.line_aggregator.is_inside_multiline_record()) {
439
+ this.process_record_line(this.line_aggregator.get_full_line('\n'));
454
440
  }
455
441
  this.input_exhausted = true;
456
442
  this.try_resolve_next_record(); // Should be a NOOP here?
@@ -464,8 +450,8 @@ class CSVRecordIterator extends rbql.RBQLInputIterator {
464
450
  this.partially_decoded_line = '';
465
451
  this.process_line(last_line);
466
452
  }
467
- if (this.rfc_line_buffer.length > 0) {
468
- this.process_record_line(this.rfc_line_buffer.join('\n'));
453
+ if (this.line_aggregator.is_inside_multiline_record()) {
454
+ this.process_record_line(this.line_aggregator.get_full_line('\n'));
469
455
  }
470
456
  this.try_resolve_next_record();
471
457
  };
@@ -487,11 +473,11 @@ class CSVRecordIterator extends rbql.RBQLInputIterator {
487
473
  } else {
488
474
  let parent_iterator = this;
489
475
  return new Promise(function(resolve, reject) {
490
- fs.readFile(parent_iterator.csv_path, (err, data_chunk) => {
476
+ fs.readFile(parent_iterator.csv_path, (err, data_blob) => {
491
477
  if (err) {
492
478
  reject(err);
493
479
  } else {
494
- parent_iterator.process_data_bulk(data_chunk);
480
+ parent_iterator.process_data_bulk(data_blob);
495
481
  resolve();
496
482
  }
497
483
  });
@@ -506,8 +492,8 @@ class CSVRecordIterator extends rbql.RBQLInputIterator {
506
492
  result.push(`Inconsistent double quote escaping in ${this.table_name} table. E.g. at line ${this.first_defective_line}`);
507
493
  if (this.utf8_bom_removed)
508
494
  result.push(`UTF-8 Byte Order Mark (BOM) was found and skipped in ${this.table_name} table`);
509
- if (Object.keys(this.fields_info).length > 1)
510
- result.push(make_inconsistent_num_fields_warning('input', this.fields_info));
495
+ if (this.fields_info.size > 1)
496
+ result.push(make_inconsistent_num_fields_warning(this.table_name, this.fields_info));
511
497
  return result;
512
498
  };
513
499
  }
@@ -710,7 +696,6 @@ async function query_csv(query_text, input_path, input_delim, input_policy, outp
710
696
  input_stream = input_path === null ? process.stdin : fs.createReadStream(input_path);
711
697
  }
712
698
  let [output_stream, close_output_on_finish] = output_path === null ? [process.stdout, false] : [fs.createWriteStream(output_path), true];
713
- // FIXME add on(error) handler to avoid async errors, see https://github.com/nodejs/node-v0.x-archive/issues/406
714
699
  if (input_delim == '"' && input_policy == 'quoted')
715
700
  throw new RbqlIOHandlingError('Double quote delimiter is incompatible with "quoted" policy');
716
701
  if (csv_encoding == 'latin-1')