npm - rbql - Versions diffs - 0.28.0 → 0.30.0 - Mend

rbql 0.28.0 → 0.30.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

package/.eslintrc.json CHANGED Viewed

File without changes

package/DEV_README.md CHANGED Viewed

@@ -1,4 +1,8 @@
 # Publishing the package to npm
-1. Run `git clean -fd` just in case.
-2. Run `npm publish`.
+1. Update the version in package.json. Make sure it is in sync with the JS version used in the unit tests - run the unit tests to check. The Python and JS versions don't have to be in sync!
+2. Run `git clean -fd` just in case.
+3. Run `npm publish`.
+Note: you need to be authorized in order to publish, so on a new system you might need to run `npm adduser` first.
+If you run `npm publish` without authorizing, it will prompt you to do it anyway, so it is no big deal.

package/README.md CHANGED Viewed

@@ -219,11 +219,14 @@ You can also check rbql-js cli app code as a usage example: [rbql-js cli source
 ### Installation:
-To use RBQL as CLI app you need to install it in global (-g) mode:
+To use RBQL as CLI app you can install it in global (-g) mode:
 ```
 $ npm install -g rbql
 ```
+RBQL can also be installed locally with `$ npm install rbql`, but then you would have to run it with `$ npx rbql-js ...` instead of `$ rbql-js ...`.
 ### Usage (non-interactive mode):
 ```
@@ -339,6 +342,13 @@ But it is also possible to override this selection directly in the query by addi
 Example: `select top 5 NR, * with (header)`
+### Pipe syntax for query chaining
+You can chain consecutive queries via pipe `|` syntax. Example:
+```
+SELECT a2 AS region, count(*) AS cnt GROUP BY a2 | SELECT * ORDER BY a.cnt DESC
+```
 ### User Defined Functions (UDF)
 RBQL supports User Defined Functions

package/cli_parser.js CHANGED Viewed

File without changes

package/cli_rbql.js CHANGED Viewed

@@ -16,6 +16,7 @@ var interactive_mode = false;
 // TODO implement colored output like in Python version
 // TODO implement query history like in Python version. "readline" modules allows to do that, see "completer" parameter.
+// TODO switch to built-in node util parseArgs module (added in 2022)
 // FIXME test readline on Win: disable interactive mode?
@@ -133,9 +134,9 @@ async function sample_lines(table_path) {
 }
-async function sample_records(table_path, encoding, delim, policy, comment_prefix, trim_whitespaces) {
+async function sample_records(table_path, encoding, delim, policy, comment_prefix, trim_whitespaces, comment_regex) {
     let table_stream = fs.createReadStream(table_path);
-    let sampling_iterator = new rbql_csv.CSVRecordIterator(table_stream, null, encoding, delim, policy, /*has_header=*/false, comment_prefix, 'input', 'a', trim_whitespaces);
+    let sampling_iterator = new rbql_csv.CSVRecordIterator(table_stream, null, encoding, delim, policy, /*has_header=*/false, comment_prefix, 'input', 'a', trim_whitespaces, comment_regex);
     let sampled_records = await sampling_iterator.get_all_records(10);
     let warnings = sampling_iterator.get_warnings();
     return [sampled_records, warnings];
@@ -183,7 +184,7 @@ async function handle_query_success(warnings, output_path, encoding, delim, poli
             }
         }
         if (interactive_mode) {
-            let [records, _warnings] = await sample_records(output_path, encoding, delim, policy, /*comment_prefix=*/null, /*trim_whitespaces=*/false);
+            let [records, _warnings] = await sample_records(output_path, encoding, delim, policy, /*comment_prefix=*/null, /*trim_whitespaces=*/false, /*comment_regex=*/null);
             console.log('\nOutput table preview:');
             console.log('====================================');
             print_colorized(records, delim, false, false);
@@ -210,6 +211,7 @@ async function run_with_js(args) {
     var csv_encoding = args['encoding'];
     var with_headers = args['with-headers'];
     var comment_prefix = args['comment-prefix'];
+    var comment_regex = args['comment-regex'];
     var trim_whitespaces = args['trim-spaces'];
     var output_delim = get_default(args, 'out-delim', null);
     var output_policy = get_default(args, 'out-policy', null);
@@ -231,7 +233,7 @@ async function run_with_js(args) {
         // * This is CLI so no way we are in the Electron environment which can't use the TextDecoder
         // * Streaming mode works a little faster (since we don't need to do the manual validation)
         // TODO check if the current node installation doesn't have ICU enabled (which is typically provided by Node.js by default, see https://nodejs.org/api/intl.html) and report a user-friendly error with an option to use latin-1 encoding or switch the interpreter
-        await rbql_csv.query_csv(query, input_path, delim, policy, output_path, output_delim, output_policy, csv_encoding, warnings, with_headers, comment_prefix, user_init_code, {'trim_whitespaces': trim_whitespaces});
+        await rbql_csv.query_csv(query, input_path, delim, policy, output_path, output_delim, output_policy, csv_encoding, warnings, with_headers, comment_prefix, user_init_code, {'trim_whitespaces': trim_whitespaces, 'comment_regex': comment_regex});
         await handle_query_success(warnings, output_path, csv_encoding, output_delim, output_policy);
         return true;
     } catch (e) {
@@ -251,8 +253,8 @@ function get_default_output_path(input_path, delim) {
 }
-async function show_preview(input_path, encoding, delim, policy, with_headers, comment_prefix, trim_whitespaces) {
-    let [records, warnings] = await sample_records(input_path, encoding, delim, policy, comment_prefix, trim_whitespaces);
+async function show_preview(input_path, encoding, delim, policy, with_headers, comment_prefix, trim_whitespaces, comment_regex) {
+    let [records, warnings] = await sample_records(input_path, encoding, delim, policy, comment_prefix, trim_whitespaces, comment_regex);
     console.log('Input table preview:');
     console.log('====================================');
     print_colorized(records, delim, true, with_headers);
@@ -281,7 +283,7 @@ async function run_interactive_loop(args) {
         if (!delim)
             throw new GenericError('Unable to autodetect table delimiter. Provide column separator explicitly with "--delim" option');
     }
-    await show_preview(input_path, args['encoding'], delim, policy, args['with-headers'], args['comment-prefix'], args['trim-spaces']);
+    await show_preview(input_path, args['encoding'], delim, policy, args['with-headers'], args['comment-prefix'], args['trim-spaces'], args['comment-regex']);
     args.delim = delim;
     args.policy = policy;
     if (!args.output) {
@@ -367,7 +369,8 @@ function main() {
         '--delim': {'help': 'Delimiter character or multicharacter string, e.g. "," or "###". Can be autodetected in interactive mode', 'metavar': 'DELIM'},
         '--policy': {'help': 'Split policy, see the explanation below. Supported values: "simple", "quoted", "quoted_rfc", "whitespace", "monocolumn". Can be autodetected in interactive mode', 'metavar': 'POLICY'},
         '--with-headers': {'boolean': true, 'help': 'Indicates that input (and join) table has header'},
-        '--comment-prefix': {'help': 'Ignore lines in input and join tables that start with the comment PREFIX, e.g. "#" or ">>"', 'metavar': 'PREFIX'},
+        '--comment-prefix': {'help': 'Ignore lines in input and join tables that start with the comment PREFIX, e.g. "#"', 'metavar': 'PREFIX'},
+        '--comment-regex': {'help': 'Ignore lines in input and join tables that contain the comment REGEX.', 'metavar': 'REGEX'},
         '--encoding': {'default': 'utf-8', 'help': 'Manually set csv encoding', 'metavar': 'ENCODING'},
         '--trim-spaces': {'boolean': true, 'help': 'Trim leading and trailing spaces from fields'},
         '--out-format': {'default': 'input', 'help': 'Output format. Supported values: ' + out_format_names.map(v => `"${v}"`).join(', '), 'metavar': 'FORMAT'},

package/csv_utils.js CHANGED Viewed

@@ -106,20 +106,30 @@ function split_whitespace_separated_str(src, preserve_whitespaces=false) {
 }
+function get_polymorphic_split_function(dlm, policy, preserve_quotes_and_whitespaces) {
+    // TODO consider moving this function to rbql_csv.js
+    if (policy === 'simple') {
+        return (src) => [src.split(dlm), false];
+    } else if (policy === 'whitespace') {
+        return (src) => [split_whitespace_separated_str(src, preserve_quotes_and_whitespaces), false];
+    } else if (policy === 'monocolumn') {
+        return (src) => [[src], false];
+    } else if (policy === 'quoted' || policy === 'quoted_rfc') {
+        return (src) => split_quoted_str(src, dlm, preserve_quotes_and_whitespaces);
+    } else {
+        throw new Error(`Unsupported splitting policy: ${policy}`);
+    }
+}
 function smart_split(src, dlm, policy, preserve_quotes_and_whitespaces) {
-    if (policy === 'simple')
-        return [src.split(dlm), false];
-    if (policy === 'whitespace')
-        return [split_whitespace_separated_str(src, preserve_quotes_and_whitespaces), false];
-    if (policy === 'monocolumn')
-        return [[src], false];
-    return split_quoted_str(src, dlm, preserve_quotes_and_whitespaces);
+    return get_polymorphic_split_function(dlm, policy, preserve_quotes_and_whitespaces)(src);
 }
 class MultilineRecordAggregator {
-    constructor(comment_prefix) {
+    constructor(comment_prefix, comment_regex) {
         this.comment_prefix = comment_prefix;
+        this.comment_regex = comment_regex;
         this.reset();
     }
     add_line(line_text) {
@@ -130,6 +140,10 @@ class MultilineRecordAggregator {
             this.has_comment_line = true;
             return false;
         }
+        if (this.comment_regex && this.rfc_line_buffer.length == 0 && line_text.search(this.comment_regex) != -1) {
+            this.has_comment_line = true;
+            return false;
+        }
         let match_list = line_text.match(/"/g);
         let has_unbalanced_double_quote = match_list && match_list.length % 2 == 1;
         this.rfc_line_buffer.push(line_text);
@@ -156,6 +170,7 @@ class MultilineRecordAggregator {
 module.exports.split_quoted_str = split_quoted_str;
 module.exports.split_whitespace_separated_str = split_whitespace_separated_str;
 module.exports.smart_split = smart_split;
+module.exports.get_polymorphic_split_function = get_polymorphic_split_function;
 module.exports.quote_field = quote_field;
 module.exports.rfc_quote_field = rfc_quote_field;
 module.exports.unquote_field = unquote_field;

package/index.js CHANGED Viewed

File without changes

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "rbql",
-  "version": "0.28.0",
+  "version": "0.30.0",
   "description": "Rainbow Query Language",
   "keywords": ["CSV", "TSV", "spreadsheet", "SQL", "SQL-like", "transpiler", "CLI", "command-line", "library", "browser", "Node", "select", "update", "join"],
   "scripts": {

package/rbql.js CHANGED Viewed

@@ -70,7 +70,7 @@ var query_context = null; // Needs to be global for MIN(), MAX(), etc functions.
 const wrong_aggregation_usage_error = 'Usage of RBQL aggregation functions inside JavaScript expressions is not allowed, see the docs';
-const RBQL_VERSION = '0.27.0';
+const RBQL_VERSION = '0.30.0';
 function check_if_brackets_match(opening_bracket, closing_bracket) {
@@ -1796,6 +1796,7 @@ class TableWriter extends RBQLOutputWriter {
         super();
         this.table = external_table;
         this.header = null;
+        this.finished = false;
     }
     async write(fields) {
@@ -1806,6 +1807,33 @@ class TableWriter extends RBQLOutputWriter {
     set_header(header) {
         this.header = header;
     }
+    async finish() {
+        this.finished = true;
+    }
+}
+class TablePipe {
+    constructor() {
+        this.table = [];
+        this.writer = new TableWriter(this.table);
+        this.iterator = null;
+    }
+    get_writer() {
+        return this.writer;
+    }
+    get_iterator() {
+        if (!this.writer.finished) {
+            throw new RbqlIOHandlingError("Trying to read from non-thread-safe table pipe while not finishing writing yet");
+        }
+        if (this.iterator === null) {
+            this.iterator = new TableIterator(this.table, this.writer.header);
+        }
+        return this.iterator;
+    }
 }
@@ -1925,7 +1953,12 @@ async function shallow_parse_input_query(query_text, input_iterator, join_tables
 }
-async function query(query_text, input_iterator, output_writer, output_warnings, join_tables_registry=null, user_init_code='') {
+function split_query_to_stages(query_text) {
+    return query_text.split(/\|[>]?[ ]*(?=(?:select|update)[ ])/i);
+}
+async function staged_query(query_text, input_iterator, output_writer, output_warnings, join_tables_registry, user_init_code) {
     query_context = new RBQLContext(query_text, input_iterator, output_writer, user_init_code);
     await shallow_parse_input_query(query_text, input_iterator, join_tables_registry, query_context);
     await compile_and_run(query_context);
@@ -1937,6 +1970,20 @@ async function query(query_text, input_iterator, output_writer, output_warnings,
 }
+async function query(query_text, input_iterator, output_writer, output_warnings, join_tables_registry=null, user_init_code='') {
+    let query_stages = split_query_to_stages(query_text);
+    let previous_pipe = null;
+    for (let i = 0; i < query_stages.length; i++) {
+        let query_stage_text = query_stages[i];
+        let output_pipe = i + 1 < query_stages.length ? new TablePipe() : null;
+        let stage_iterator = previous_pipe === null ? input_iterator : previous_pipe.get_iterator();
+        let stage_writer = output_pipe === null ? output_writer : output_pipe.get_writer();
+        await staged_query(query_stage_text, stage_iterator, stage_writer, output_warnings, join_tables_registry, user_init_code);
+        previous_pipe = output_pipe;
+    }
+}
 async function query_table(query_text, input_table, output_table, output_warnings, join_table=null, input_column_names=null, join_column_names=null, output_column_names=null, normalize_column_names=true, user_init_code='') {
     if (!normalize_column_names && input_column_names !== null && join_column_names !== null)
         ensure_no_ambiguous_variables(query_text, input_column_names, join_column_names);

package/rbql_csv.js CHANGED Viewed

@@ -14,9 +14,6 @@ class RbqlIOHandlingError extends Error {}
 class AssertionError extends Error {}
-// TODO performance improvement: replace smart_split() with polymorphic_split()
 function assert(condition, message=null) {
     if (!condition) {
         if (!message) {
@@ -156,7 +153,7 @@ class CSVRecordIterator extends rbql.RBQLInputIterator {
     // CSVRecordIterator implements a typical async producer-consumer model with an internal buffer:
     // get_record() - consumer
     // stream.on('data') - producer
-    constructor(stream, csv_path, encoding, delim, policy, has_header=false, comment_prefix=null, table_name='input', variable_prefix='a', trim_whitespaces=false) {
+    constructor(stream, csv_path, encoding, delim, policy, has_header=false, comment_prefix=null, table_name='input', variable_prefix='a', trim_whitespaces=false, comment_regex=null) {
         super();
         this.stream = stream;
         this.csv_path = csv_path;
@@ -173,6 +170,7 @@ class CSVRecordIterator extends rbql.RBQLInputIterator {
         this.table_name = table_name;
         this.variable_prefix = variable_prefix;
         this.comment_prefix = comment_prefix;
+        this.comment_regex = comment_regex;
         this.trim_whitespaces = trim_whitespaces;
         this.decoder = null;
@@ -199,7 +197,7 @@ class CSVRecordIterator extends rbql.RBQLInputIterator {
         this.NR = 0; // Record number
         this.NL = 0; // Line number (NL != NR when the CSV file has comments or multiline fields)
-        this.line_aggregator = new csv_utils.MultilineRecordAggregator(comment_prefix);
+        this.line_aggregator = new csv_utils.MultilineRecordAggregator(comment_prefix, comment_regex);
         this.partially_decoded_line = '';
         this.partially_decoded_line_ends_with_cr = false;
@@ -214,6 +212,8 @@ class CSVRecordIterator extends rbql.RBQLInputIterator {
         this.produced_records_queue = new RecordQueue();
         this.process_line_polymorphic = policy == 'quoted_rfc' ? this.process_partial_rfc_record_line : this.process_record_line_simple;
+        this.polymorphic_split = csv_utils.get_polymorphic_split_function(this.delim, this.policy, false);
     }
@@ -344,13 +344,15 @@ class CSVRecordIterator extends rbql.RBQLInputIterator {
     process_record_line_simple(line) {
         if (this.comment_prefix && line.startsWith(this.comment_prefix))
             return; // Just skip the line
+        if (this.comment_regex && line.search(this.comment_regex) != -1)
+            return; // Just skip the line
         this.process_record_line(line);
     }
     process_record_line(line) {
         this.NR += 1;
-        var [record, warning] = csv_utils.smart_split(line, this.delim, this.policy, false);
+        var [record, warning] = this.polymorphic_split(line);
         if (this.trim_whitespaces) {
             record = record.map((v) => v.trim());
         }
@@ -661,32 +663,34 @@ class FileSystemCSVRegistry extends rbql.RBQLTableRegistry {
         this.encoding = encoding;
         this.has_header = has_header;
         this.comment_prefix = comment_prefix;
-        this.stream = null;
-        this.record_iterator = null;
         this.options = options;
-        this.bulk_input_path = null;
-        this.table_path = null;
+        this.active_join_files = [];
     }
     get_iterator_by_table_id(table_id) {
-        this.table_path = find_table_path(this.input_file_dir, table_id);
-        if (this.table_path === null) {
+        let stream = null;
+        let table_path = find_table_path(this.input_file_dir, table_id);
+        if (table_path === null) {
             throw new RbqlIOHandlingError(`Unable to find join table "${table_id}"`);
         }
+        let bulk_input_path = null;
         if (this.options && this.options['bulk_read']) {
-            this.bulk_input_path = this.table_path;
+            bulk_input_path = table_path;
         } else {
-            this.stream = fs.createReadStream(this.table_path);
+            stream = fs.createReadStream(table_path);
         }
         let trim_whitespaces = this.options && this.options['trim_whitespaces'] ? true : false;
-        this.record_iterator = new CSVRecordIterator(this.stream, this.bulk_input_path, this.encoding, this.delim, this.policy, this.has_header, this.comment_prefix, table_id, 'b', trim_whitespaces);
-        return this.record_iterator;
+        let comment_regex = this.options && this.options.hasOwnProperty('comment_regex') ? this.options['comment_regex'] : null;
+        let record_iterator = new CSVRecordIterator(stream, bulk_input_path, this.encoding, this.delim, this.policy, this.has_header, this.comment_prefix, table_id, 'b', trim_whitespaces, comment_regex);
+        this.active_join_files.push({'table_path': table_path, 'input_stream': stream, 'record_iterator': record_iterator});
+        return record_iterator;
     };
     get_warnings(output_warnings) {
-        if (this.record_iterator && this.has_header) {
-            output_warnings.push(`The first record in JOIN file ${path.basename(this.table_path)} was also treated as header (and skipped)`);
+        if (this.has_header) {
+            for (let active_join_file of this.active_join_files) {
+                output_warnings.push(`The first record in JOIN file ${path.basename(active_join_file.table_path)} was also treated as header (and skipped)`);
+            }
         }
     }
 }
@@ -701,6 +705,7 @@ async function query_csv(query_text, input_path, input_delim, input_policy, outp
         input_stream = input_path === null ? process.stdin : fs.createReadStream(input_path);
     }
     let trim_whitespaces = options && options['trim_whitespaces'] ? true : false;
+    let comment_regex = options && options.hasOwnProperty('comment_regex') ? options['comment_regex'] : null;
     let [output_stream, close_output_on_finish] = output_path === null ? [process.stdout, false] : [fs.createWriteStream(output_path), true];
     if (input_delim == '"' && input_policy == 'quoted')
         throw new RbqlIOHandlingError('Double quote delimiter is incompatible with "quoted" policy');
@@ -717,7 +722,7 @@ async function query_csv(query_text, input_path, input_delim, input_policy, outp
     }
     let input_file_dir = input_path ? path.dirname(input_path) : null;
     let join_tables_registry = new FileSystemCSVRegistry(input_file_dir, input_delim, input_policy, csv_encoding, with_headers, comment_prefix, options);
-    let input_iterator = new CSVRecordIterator(input_stream, bulk_input_path, csv_encoding, input_delim, input_policy, with_headers, comment_prefix, 'input', 'a', trim_whitespaces);
+    let input_iterator = new CSVRecordIterator(input_stream, bulk_input_path, csv_encoding, input_delim, input_policy, with_headers, comment_prefix, 'input', 'a', trim_whitespaces, comment_regex);
     let output_writer = new CSVWriter(output_stream, close_output_on_finish, csv_encoding, output_delim, output_policy);
     await rbql.query(query_text, input_iterator, output_writer, output_warnings, join_tables_registry, user_init_code);