npm - rbql - Versions diffs - 0.27.0 → 0.29.0 - Mend

rbql 0.27.0 → 0.29.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/DEV_README.md ADDED Viewed

@@ -0,0 +1,8 @@
+# Publishing the package to npm
+1. Update the version in package.json. Make sure it is synced with the JS version in the unit tests - run the unit tests. The Python and JS versions don't have to be in sync!
+2. Run `git clean -fd` just in case.
+3. Run `npm publish`.
+Note: you need to be authorized in order to publish, so on a new system you might need to run `npm adduser` first;
+if you run `npm publish` without being authorized, it will prompt you to do so anyway, so it's no big deal.

package/README.md CHANGED Viewed

@@ -219,11 +219,14 @@ You can also check rbql-js cli app code as a usage example: [rbql-js cli source
 ### Installation:
-To use RBQL as CLI app you need to install it in global (-g) mode:
+To use RBQL as CLI app you can install it in global (-g) mode:
 ```
 $ npm install -g rbql
 ```
+RBQL can also be installed locally with `$ npm install rbql`, but then you would have to run it with `$ npx rbql-js ...` instead of `$ rbql-js ...`.
 ### Usage (non-interactive mode):
 ```
@@ -302,7 +305,7 @@ _UPDATE_ query produces a new table where original values are replaced according
 ### Aggregate functions and queries
 RBQL supports the following aggregate functions, which can also be used with _GROUP BY_ keyword:
-_COUNT_, _ARRAY_AGG_, _MIN_, _MAX_, _SUM_, _AVG_, _VARIANCE_, _MEDIAN_
+_COUNT_, _ARRAY_AGG_, _MIN_, _MAX_, _ANY_VALUE_, _SUM_, _AVG_, _VARIANCE_, _MEDIAN_
 Limitation: aggregate functions inside JavaScript expressions are not supported, although you can use expressions inside aggregate functions.
 E.g. `MAX(float(a1) / 1000)` - valid; `MAX(a1) / 1000` - invalid.
@@ -339,6 +342,13 @@ But it is also possible to override this selection directly in the query by addi
 Example: `select top 5 NR, * with (header)`
+### Pipe syntax for query chaining
+You can chain consecutive queries via pipe `|` syntax. Example:
+```
+SELECT a2 AS region, count(*) AS cnt GROUP BY a2 | SELECT * ORDER BY a.cnt DESC
+```
 ### User Defined Functions (UDF)
 RBQL supports User Defined Functions

package/cli_rbql.js CHANGED Viewed

@@ -16,6 +16,7 @@ var interactive_mode = false;
 // TODO implement colored output like in Python version
 // TODO implement query history like in Python version. The "readline" module allows to do that, see "completer" parameter.
+// TODO switch to built-in node util parseArgs module (added in 2022)
 // FIXME test readline on Win: disable interactive mode?
@@ -133,9 +134,9 @@ async function sample_lines(table_path) {
 }
-async function sample_records(table_path, encoding, delim, policy) {
+async function sample_records(table_path, encoding, delim, policy, comment_prefix, trim_whitespaces, comment_regex) {
     let table_stream = fs.createReadStream(table_path);
-    let sampling_iterator = new rbql_csv.CSVRecordIterator(table_stream, null, encoding, delim, policy);
+    let sampling_iterator = new rbql_csv.CSVRecordIterator(table_stream, null, encoding, delim, policy, /*has_header=*/false, comment_prefix, 'input', 'a', trim_whitespaces, comment_regex);
     let sampled_records = await sampling_iterator.get_all_records(10);
     let warnings = sampling_iterator.get_warnings();
     return [sampled_records, warnings];
@@ -183,7 +184,7 @@ async function handle_query_success(warnings, output_path, encoding, delim, poli
             }
         }
         if (interactive_mode) {
-            let [records, _warnings] = await sample_records(output_path, encoding, delim, policy);
+            let [records, _warnings] = await sample_records(output_path, encoding, delim, policy, /*comment_prefix=*/null, /*trim_whitespaces=*/false, /*comment_regex=*/null);
             console.log('\nOutput table preview:');
             console.log('====================================');
             print_colorized(records, delim, false, false);
@@ -210,6 +211,8 @@ async function run_with_js(args) {
     var csv_encoding = args['encoding'];
     var with_headers = args['with-headers'];
     var comment_prefix = args['comment-prefix'];
+    var comment_regex = args['comment-regex'];
+    var trim_whitespaces = args['trim-spaces'];
     var output_delim = get_default(args, 'out-delim', null);
     var output_policy = get_default(args, 'out-policy', null);
     let init_source_file = get_default(args, 'init-source-file', null);
@@ -230,7 +233,7 @@ async function run_with_js(args) {
         // * This is CLI so no way we are in the Electron environment which can't use the TextDecoder
         // * Streaming mode works a little faster (since we don't need to do the manual validation)
         // TODO check if the current node installation doesn't have ICU enabled (which is typically provided by Node.js by default, see https://nodejs.org/api/intl.html) and report a user-friendly error with an option to use latin-1 encoding or switch the interpreter
-        await rbql_csv.query_csv(query, input_path, delim, policy, output_path, output_delim, output_policy, csv_encoding, warnings, with_headers, comment_prefix, user_init_code/*, {'bulk_read': true}*/);
+        await rbql_csv.query_csv(query, input_path, delim, policy, output_path, output_delim, output_policy, csv_encoding, warnings, with_headers, comment_prefix, user_init_code, {'trim_whitespaces': trim_whitespaces, 'comment_regex': comment_regex});
         await handle_query_success(warnings, output_path, csv_encoding, output_delim, output_policy);
         return true;
     } catch (e) {
@@ -250,8 +253,8 @@ function get_default_output_path(input_path, delim) {
 }
-async function show_preview(input_path, encoding, delim, policy, with_headers) {
-    let [records, warnings] = await sample_records(input_path, encoding, delim, policy);
+async function show_preview(input_path, encoding, delim, policy, with_headers, comment_prefix, trim_whitespaces, comment_regex) {
+    let [records, warnings] = await sample_records(input_path, encoding, delim, policy, comment_prefix, trim_whitespaces, comment_regex);
     console.log('Input table preview:');
     console.log('====================================');
     print_colorized(records, delim, true, with_headers);
@@ -280,7 +283,7 @@ async function run_interactive_loop(args) {
         if (!delim)
             throw new GenericError('Unable to autodetect table delimiter. Provide column separator explicitly with "--delim" option');
     }
-    await show_preview(input_path, args['encoding'], delim, policy, args['with-headers']);
+    await show_preview(input_path, args['encoding'], delim, policy, args['with-headers'], args['comment-prefix'], args['trim-spaces'], args['comment-regex']);
     args.delim = delim;
     args.policy = policy;
     if (!args.output) {
@@ -366,8 +369,10 @@ function main() {
         '--delim': {'help': 'Delimiter character or multicharacter string, e.g. "," or "###". Can be autodetected in interactive mode', 'metavar': 'DELIM'},
         '--policy': {'help': 'Split policy, see the explanation below. Supported values: "simple", "quoted", "quoted_rfc", "whitespace", "monocolumn". Can be autodetected in interactive mode', 'metavar': 'POLICY'},
         '--with-headers': {'boolean': true, 'help': 'Indicates that input (and join) table has header'},
-        '--comment-prefix': {'help': 'Ignore lines in input and join tables that start with the comment PREFIX, e.g. "#" or ">>"', 'metavar': 'PREFIX'},
+        '--comment-prefix': {'help': 'Ignore lines in input and join tables that start with the comment PREFIX, e.g. "#"', 'metavar': 'PREFIX'},
+        '--comment-regex': {'help': 'Ignore lines in input and join tables that contain the comment REGEX.', 'metavar': 'REGEX'},
         '--encoding': {'default': 'utf-8', 'help': 'Manually set csv encoding', 'metavar': 'ENCODING'},
+        '--trim-spaces': {'boolean': true, 'help': 'Trim leading and trailing spaces from fields'},
         '--out-format': {'default': 'input', 'help': 'Output format. Supported values: ' + out_format_names.map(v => `"${v}"`).join(', '), 'metavar': 'FORMAT'},
         '--out-delim': {'help': 'Output delim. Use with "out-policy". Overrides out-format', 'metavar': 'DELIM'},
         '--out-policy': {'help': 'Output policy. Use with "out-delim". Overrides out-format', 'metavar': 'POLICY'},

package/csv_utils.js CHANGED Viewed

@@ -118,8 +118,9 @@ function smart_split(src, dlm, policy, preserve_quotes_and_whitespaces) {
 class MultilineRecordAggregator {
-    constructor(comment_prefix) {
+    constructor(comment_prefix, comment_regex) {
         this.comment_prefix = comment_prefix;
+        this.comment_regex = comment_regex;
         this.reset();
     }
     add_line(line_text) {
@@ -130,6 +131,10 @@ class MultilineRecordAggregator {
             this.has_comment_line = true;
             return false;
         }
+        if (this.comment_regex && this.rfc_line_buffer.length == 0 && line_text.search(this.comment_regex) != -1) {
+            this.has_comment_line = true;
+            return false;
+        }
         let match_list = line_text.match(/"/g);
         let has_unbalanced_double_quote = match_list && match_list.length % 2 == 1;
         this.rfc_line_buffer.push(line_text);

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "rbql",
-  "version": "0.27.0",
+  "version": "0.29.0",
   "description": "Rainbow Query Language",
   "keywords": ["CSV", "TSV", "spreadsheet", "SQL", "SQL-like", "transpiler", "CLI", "command-line", "library", "browser", "Node", "select", "update", "join"],
   "scripts": {

package/rbql.js CHANGED Viewed

@@ -70,7 +70,7 @@ var query_context = null; // Needs to be global for MIN(), MAX(), etc functions.
 const wrong_aggregation_usage_error = 'Usage of RBQL aggregation functions inside JavaScript expressions is not allowed, see the docs';
-const RBQL_VERSION = '0.27.0';
+const RBQL_VERSION = '0.29.0';
 function check_if_brackets_match(opening_bracket, closing_bracket) {
@@ -289,6 +289,24 @@ function parse_number(val) {
 }
+class AnyValueAggregator {
+    constructor() {
+        this.stats = new Map();
+    }
+    increment(key, val) {
+        var cur_aggr = this.stats.get(key);
+        if (cur_aggr === undefined) {
+            this.stats.set(key, val);
+        }
+    }
+    get_final(key) {
+        return this.stats.get(key);
+    }
+}
 class MinAggregator {
     constructor() {
         this.stats = new Map();
@@ -310,7 +328,6 @@ class MinAggregator {
 }
 class MaxAggregator {
     constructor() {
         this.stats = new Map();
@@ -514,6 +531,11 @@ function init_aggregator(generator_name, val, post_proc=null) {
     return res;
 }
+function ANY_VALUE(val) {
+    return query_context.aggregation_stage < 2 ? init_aggregator(AnyValueAggregator, val) : val;
+}
+const any_value = ANY_VALUE;
+const Any_value = ANY_VALUE;
 function MIN(val) {
     return query_context.aggregation_stage < 2 ? init_aggregator(MinAggregator, val) : val;
@@ -521,7 +543,6 @@ function MIN(val) {
 const min = MIN;
 const Min = MIN;
 function MAX(val) {
     return query_context.aggregation_stage < 2 ? init_aggregator(MaxAggregator, val) : val;
 }
@@ -1775,6 +1796,7 @@ class TableWriter extends RBQLOutputWriter {
         super();
         this.table = external_table;
         this.header = null;
+        this.finished = false;
     }
     async write(fields) {
@@ -1785,6 +1807,33 @@ class TableWriter extends RBQLOutputWriter {
     set_header(header) {
         this.header = header;
     }
+    async finish() {
+        this.finished = true;
+    }
+}
+class TablePipe {
+    constructor() {
+        this.table = [];
+        this.writer = new TableWriter(this.table);
+        this.iterator = null;
+    }
+    get_writer() {
+        return this.writer;
+    }
+    get_iterator() {
+        if (!this.writer.finished) {
+            throw new RbqlIOHandlingError("Trying to read from non-thread-safe table pipe while not finishing writing yet");
+        }
+        if (this.iterator === null) {
+            this.iterator = new TableIterator(this.table, this.writer.header);
+        }
+        return this.iterator;
+    }
 }
@@ -1823,6 +1872,7 @@ async function shallow_parse_input_query(query_text, input_iterator, join_tables
         if (rb_actions.hasOwnProperty(ORDER_BY) || rb_actions.hasOwnProperty(UPDATE))
             throw new RbqlParsingError('"ORDER BY", "UPDATE" and "DISTINCT" keywords are not allowed in aggregate queries');
         query_context.aggregation_key_expression = '[' + combine_string_literals(rb_actions[GROUP_BY]['text'], string_literals) + ']';
+        query_context.aggregation_stage = 1;
     }
@@ -1903,7 +1953,12 @@ async function shallow_parse_input_query(query_text, input_iterator, join_tables
 }
-async function query(query_text, input_iterator, output_writer, output_warnings, join_tables_registry=null, user_init_code='') {
+function split_query_to_stages(query_text) {
+    return query_text.split(/\|[>]?[ ]*(?=(?:select|update)[ ])/i);
+}
+async function staged_query(query_text, input_iterator, output_writer, output_warnings, join_tables_registry, user_init_code) {
     query_context = new RBQLContext(query_text, input_iterator, output_writer, user_init_code);
     await shallow_parse_input_query(query_text, input_iterator, join_tables_registry, query_context);
     await compile_and_run(query_context);
@@ -1915,6 +1970,20 @@ async function query(query_text, input_iterator, output_writer, output_warnings,
 }
+async function query(query_text, input_iterator, output_writer, output_warnings, join_tables_registry=null, user_init_code='') {
+    let query_stages = split_query_to_stages(query_text);
+    let previous_pipe = null;
+    for (let i = 0; i < query_stages.length; i++) {
+        let query_stage_text = query_stages[i];
+        let output_pipe = i + 1 < query_stages.length ? new TablePipe() : null;
+        let stage_iterator = previous_pipe === null ? input_iterator : previous_pipe.get_iterator();
+        let stage_writer = output_pipe === null ? output_writer : output_pipe.get_writer();
+        await staged_query(query_stage_text, stage_iterator, stage_writer, output_warnings, join_tables_registry, user_init_code);
+        previous_pipe = output_pipe;
+    }
+}
 async function query_table(query_text, input_table, output_table, output_warnings, join_table=null, input_column_names=null, join_column_names=null, output_column_names=null, normalize_column_names=true, user_init_code='') {
     if (!normalize_column_names && input_column_names !== null && join_column_names !== null)
         ensure_no_ambiguous_variables(query_text, input_column_names, join_column_names);

package/rbql_csv.js CHANGED Viewed

@@ -156,7 +156,7 @@ class CSVRecordIterator extends rbql.RBQLInputIterator {
     // CSVRecordIterator implements a typical async producer-consumer model with an internal buffer:
     // get_record() - consumer
     // stream.on('data') - producer
-    constructor(stream, csv_path, encoding, delim, policy, has_header=false, comment_prefix=null, table_name='input', variable_prefix='a') {
+    constructor(stream, csv_path, encoding, delim, policy, has_header=false, comment_prefix=null, table_name='input', variable_prefix='a', trim_whitespaces=false, comment_regex=null) {
         super();
         this.stream = stream;
         this.csv_path = csv_path;
@@ -173,6 +173,8 @@ class CSVRecordIterator extends rbql.RBQLInputIterator {
         this.table_name = table_name;
         this.variable_prefix = variable_prefix;
         this.comment_prefix = comment_prefix;
+        this.comment_regex = comment_regex;
+        this.trim_whitespaces = trim_whitespaces;
         this.decoder = null;
         if (encoding == 'utf-8' && this.csv_path === null) {
@@ -198,7 +200,7 @@ class CSVRecordIterator extends rbql.RBQLInputIterator {
         this.NR = 0; // Record number
         this.NL = 0; // Line number (NL != NR when the CSV file has comments or multiline fields)
-        this.line_aggregator = new csv_utils.MultilineRecordAggregator(comment_prefix);
+        this.line_aggregator = new csv_utils.MultilineRecordAggregator(comment_prefix, comment_regex);
         this.partially_decoded_line = '';
         this.partially_decoded_line_ends_with_cr = false;
@@ -343,6 +345,8 @@ class CSVRecordIterator extends rbql.RBQLInputIterator {
     process_record_line_simple(line) {
         if (this.comment_prefix && line.startsWith(this.comment_prefix))
             return; // Just skip the line
+        if (this.comment_regex && line.search(this.comment_regex) != -1)
+            return; // Just skip the line
         this.process_record_line(line);
     }
@@ -350,6 +354,9 @@ class CSVRecordIterator extends rbql.RBQLInputIterator {
     process_record_line(line) {
         this.NR += 1;
         var [record, warning] = csv_utils.smart_split(line, this.delim, this.policy, false);
+        if (this.trim_whitespaces) {
+            record = record.map((v) => v.trim());
+        }
         if (warning) {
             if (this.first_defective_line === null) {
                 this.first_defective_line = this.NL;
@@ -675,7 +682,9 @@ class FileSystemCSVRegistry extends rbql.RBQLTableRegistry {
         } else {
             this.stream = fs.createReadStream(this.table_path);
         }
-        this.record_iterator = new CSVRecordIterator(this.stream, this.bulk_input_path, this.encoding, this.delim, this.policy, this.has_header, this.comment_prefix, table_id, 'b');
+        let trim_whitespaces = this.options && this.options['trim_whitespaces'] ? true : false;
+        let comment_regex = this.options && this.options.hasOwnProperty('comment_regex') ? this.options['comment_regex'] : null;
+        this.record_iterator = new CSVRecordIterator(this.stream, this.bulk_input_path, this.encoding, this.delim, this.policy, this.has_header, this.comment_prefix, table_id, 'b', trim_whitespaces, comment_regex);
         return this.record_iterator;
     };
@@ -695,6 +704,8 @@ async function query_csv(query_text, input_path, input_delim, input_policy, outp
     } else {
         input_stream = input_path === null ? process.stdin : fs.createReadStream(input_path);
     }
+    let trim_whitespaces = options && options['trim_whitespaces'] ? true : false;
+    let comment_regex = options && options.hasOwnProperty('comment_regex') ? options['comment_regex'] : null;
     let [output_stream, close_output_on_finish] = output_path === null ? [process.stdout, false] : [fs.createWriteStream(output_path), true];
     if (input_delim == '"' && input_policy == 'quoted')
         throw new RbqlIOHandlingError('Double quote delimiter is incompatible with "quoted" policy');
@@ -711,7 +722,7 @@ async function query_csv(query_text, input_path, input_delim, input_policy, outp
     }
     let input_file_dir = input_path ? path.dirname(input_path) : null;
     let join_tables_registry = new FileSystemCSVRegistry(input_file_dir, input_delim, input_policy, csv_encoding, with_headers, comment_prefix, options);
-    let input_iterator = new CSVRecordIterator(input_stream, bulk_input_path, csv_encoding, input_delim, input_policy, with_headers, comment_prefix);
+    let input_iterator = new CSVRecordIterator(input_stream, bulk_input_path, csv_encoding, input_delim, input_policy, with_headers, comment_prefix, 'input', 'a', trim_whitespaces, comment_regex);
     let output_writer = new CSVWriter(output_stream, close_output_on_finish, csv_encoding, output_delim, output_policy);
     await rbql.query(query_text, input_iterator, output_writer, output_warnings, join_tables_registry, user_init_code);