@malloydata/malloy-filter 0.0.237-dev250221201621

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. package/README.md +81 -0
  2. package/SAMPLES.md +381 -0
  3. package/SERIALIZE_SAMPLES.md +300 -0
  4. package/dist/a_simple_parser.d.ts +1 -0
  5. package/dist/a_simple_parser.js +20 -0
  6. package/dist/a_simple_parser.js.map +1 -0
  7. package/dist/a_simple_serializer.d.ts +1 -0
  8. package/dist/a_simple_serializer.js +31 -0
  9. package/dist/a_simple_serializer.js.map +1 -0
  10. package/dist/base_parser.d.ts +13 -0
  11. package/dist/base_parser.js +33 -0
  12. package/dist/base_parser.js.map +1 -0
  13. package/dist/base_serializer.d.ts +6 -0
  14. package/dist/base_serializer.js +11 -0
  15. package/dist/base_serializer.js.map +1 -0
  16. package/dist/boolean_parser.d.ts +7 -0
  17. package/dist/boolean_parser.js +59 -0
  18. package/dist/boolean_parser.js.map +1 -0
  19. package/dist/boolean_serializer.d.ts +8 -0
  20. package/dist/boolean_serializer.js +31 -0
  21. package/dist/boolean_serializer.js.map +1 -0
  22. package/dist/clause_types.d.ts +70 -0
  23. package/dist/clause_types.js +3 -0
  24. package/dist/clause_types.js.map +1 -0
  25. package/dist/date_parser.d.ts +22 -0
  26. package/dist/date_parser.js +315 -0
  27. package/dist/date_parser.js.map +1 -0
  28. package/dist/date_serializer.d.ts +10 -0
  29. package/dist/date_serializer.js +100 -0
  30. package/dist/date_serializer.js.map +1 -0
  31. package/dist/filter_parser.d.ts +12 -0
  32. package/dist/filter_parser.js +66 -0
  33. package/dist/filter_parser.js.map +1 -0
  34. package/dist/filter_serializer.d.ts +13 -0
  35. package/dist/filter_serializer.js +43 -0
  36. package/dist/filter_serializer.js.map +1 -0
  37. package/dist/filter_types.d.ts +10 -0
  38. package/dist/filter_types.js +3 -0
  39. package/dist/filter_types.js.map +1 -0
  40. package/dist/generate_samples.d.ts +1 -0
  41. package/dist/generate_samples.js +344 -0
  42. package/dist/generate_samples.js.map +1 -0
  43. package/dist/number_parser.d.ts +20 -0
  44. package/dist/number_parser.js +275 -0
  45. package/dist/number_parser.js.map +1 -0
  46. package/dist/number_serializer.d.ts +11 -0
  47. package/dist/number_serializer.js +76 -0
  48. package/dist/number_serializer.js.map +1 -0
  49. package/dist/string_parser.d.ts +18 -0
  50. package/dist/string_parser.js +198 -0
  51. package/dist/string_parser.js.map +1 -0
  52. package/dist/string_serializer.d.ts +11 -0
  53. package/dist/string_serializer.js +77 -0
  54. package/dist/string_serializer.js.map +1 -0
  55. package/dist/token_types.d.ts +7 -0
  56. package/dist/token_types.js +3 -0
  57. package/dist/token_types.js.map +1 -0
  58. package/dist/tokenizer.d.ts +52 -0
  59. package/dist/tokenizer.js +263 -0
  60. package/dist/tokenizer.js.map +1 -0
  61. package/dist/tokenizer.spec.d.ts +1 -0
  62. package/dist/tokenizer.spec.js +255 -0
  63. package/dist/tokenizer.spec.js.map +1 -0
  64. package/jest.config.js +3 -0
  65. package/package.json +21 -0
  66. package/src/DEVELOPING.md +26 -0
  67. package/src/a_simple_parser.ts +22 -0
  68. package/src/a_simple_serializer.ts +40 -0
  69. package/src/base_parser.ts +45 -0
  70. package/src/base_serializer.ts +9 -0
  71. package/src/boolean_parser.ts +60 -0
  72. package/src/boolean_serializer.ts +32 -0
  73. package/src/clause_types.ts +160 -0
  74. package/src/date_parser.ts +413 -0
  75. package/src/date_serializer.ts +114 -0
  76. package/src/filter_parser.ts +68 -0
  77. package/src/filter_serializer.ts +49 -0
  78. package/src/filter_types.ts +12 -0
  79. package/src/generate_samples.ts +387 -0
  80. package/src/number_parser.ts +308 -0
  81. package/src/number_serializer.ts +96 -0
  82. package/src/string_parser.ts +193 -0
  83. package/src/string_serializer.ts +87 -0
  84. package/src/token_types.ts +7 -0
  85. package/src/tokenizer.spec.ts +273 -0
  86. package/src/tokenizer.ts +320 -0
  87. package/tsconfig.json +14 -0
@@ -0,0 +1,96 @@
1
+ import {
2
+ NumberCondition,
3
+ NumberRange,
4
+ NumberOperator,
5
+ NumberRangeOperator,
6
+ Clause,
7
+ } from './clause_types';
8
+ import {BaseSerializer} from './base_serializer';
9
+
10
+ export class NumberSerializer extends BaseSerializer {
11
+ constructor(clauses: Clause[]) {
12
+ super(clauses);
13
+ }
14
+
15
+ public serialize(): string {
16
+ const result = NumberSerializer.clauseToString(this.clauses);
17
+ return result.trim().replace(/,$/, '');
18
+ }
19
+
20
+ // NumberOperator = '<=' | '>=' | '!=' | '=' | '>' | '<';
21
+ private static numberConditionToString(
22
+ operator: NumberOperator,
23
+ value: number | null
24
+ ): string {
25
+ if (value === null) {
26
+ return operator === '=' ? 'NULL' : '-NULL';
27
+ }
28
+ const operatorString = operator === '=' ? '' : operator; // Remove operator for eg "5, 7, 9"
29
+ return operatorString + value;
30
+ }
31
+
32
+ private static getNegatedType(
33
+ operator: NumberRangeOperator
34
+ ): NumberRangeOperator {
35
+ switch (operator) {
36
+ case '<':
37
+ return '>=';
38
+ case '<=':
39
+ return '>';
40
+ case '>':
41
+ return '<=';
42
+ case '>=':
43
+ return '<';
44
+ }
45
+ }
46
+
47
+ private static isNumberOperator(value: string): value is NumberOperator {
48
+ return ['<=', '>=', '!=', '=', '>', '<'].includes(value);
49
+ }
50
+
51
+ private static rangeToString(clause: NumberRange): string {
52
+ const negated: string =
53
+ clause.startOperator === '<' || clause.startOperator === '<=' ? '!=' : '';
54
+ const startOperator = negated
55
+ ? NumberSerializer.getNegatedType(clause.startOperator)
56
+ : clause.startOperator;
57
+ const endOperator = negated
58
+ ? NumberSerializer.getNegatedType(clause.endOperator)
59
+ : clause.endOperator;
60
+ const leftBracket: string = startOperator === '>' ? '(' : '[';
61
+ const rightBracket: string = endOperator === '<' ? ')' : ']';
62
+ return (
63
+ negated +
64
+ leftBracket +
65
+ clause.startValue +
66
+ ', ' +
67
+ clause.endValue +
68
+ rightBracket
69
+ );
70
+ }
71
+
72
+ private static clauseToString(clauses: Clause[]): string {
73
+ let result = '';
74
+ for (const clause of clauses) {
75
+ if ('operator' in clause && clause.operator === 'range') {
76
+ result += NumberSerializer.rangeToString(clause);
77
+ result += ', ';
78
+ } else if (
79
+ 'operator' in clause &&
80
+ NumberSerializer.isNumberOperator(clause.operator)
81
+ ) {
82
+ const numberClause: NumberCondition = clause as NumberCondition;
83
+ for (const value of numberClause.values) {
84
+ result += NumberSerializer.numberConditionToString(
85
+ numberClause.operator,
86
+ value
87
+ );
88
+ result += ', ';
89
+ }
90
+ } else {
91
+ throw new Error('Invalid number clause ' + JSON.stringify(clause));
92
+ }
93
+ }
94
+ return result;
95
+ }
96
+ }
@@ -0,0 +1,193 @@
1
+ import {SpecialToken, Tokenizer, TokenizerParams} from './tokenizer';
2
+ import {StringCondition, StringOperator, QuoteType} from './clause_types';
3
+ import {BaseParser} from './base_parser';
4
+ import {FilterParserResponse, FilterError} from './filter_types';
5
+
6
/**
 * Parses a comma-separated string-filter expression (e.g. `abc%, -null`)
 * into StringCondition clauses. Supports negation (`-` prefix), `%`
 * wildcards at either end, and the special words NULL/EMPTY/-NULL/-EMPTY.
 */
export class StringParser extends BaseParser {
  // An unescaped '%' anywhere in the word.
  private static readonly percentRegex: RegExp = /(?<!\\)%/;
  // An unescaped '_' (LIKE-style single-character wildcard).
  private static readonly underscoreRegex: RegExp = /(?<!\\)_/;
  // '%' as the first character (start anchors cannot be escaped-checked).
  private static readonly percentStartRegex: RegExp = /^%/;
  // An unescaped '%' as the last character.
  private static readonly percentEndRegex: RegExp = /(?<!\\)%$/;
  // Leading '-' marks a negated term; group 1 captures the rest.
  private static readonly negatedStartRegex: RegExp = /^-(.+)$/;
  // A lone backslash (not part of '\\'); used to strip escape characters.
  private static readonly singleBackslashRegex: RegExp = /(?<!\\)\\(?!\\)/g;

  constructor(input: string) {
    super(input);
  }

  /**
   * Tokenizes the raw input into `this.tokens`, splitting on commas and
   * recognizing the NULL/EMPTY special words case-insensitively.
   */
  private tokenize(): void {
    const specialSubstrings: SpecialToken[] = [{type: ',', value: ','}];
    const specialWords: SpecialToken[] = [
      {type: 'NULL', value: 'null', ignoreCase: true},
      {type: 'EMPTY', value: 'empty', ignoreCase: true},
      {type: 'NOTNULL', value: '-null', ignoreCase: true},
      {type: 'NOTEMPTY', value: '-empty', ignoreCase: true},
    ];
    const params: TokenizerParams = {
      trimWordWhitespace: true,
      combineAdjacentWords: true,
      specialSubstrings,
      specialWords: specialWords,
    };

    const tokenizer = new Tokenizer(this.inputString, params);
    this.tokens = tokenizer.parse();
    // Second pass: re-label word tokens that exactly match a special word.
    this.tokens = Tokenizer.convertSpecialWords(this.tokens, specialWords);
  }

  /**
   * Parses the input into clauses plus any errors.
   * Consecutive clauses with the same operator are merged by groupClauses.
   * @returns clauses and a (possibly empty) list of FilterErrors.
   */
  public parse(): FilterParserResponse {
    this.index = 0;
    this.tokenize();
    const clauses: StringCondition[] = [];
    const errors: FilterError[] = [];
    while (this.index < this.tokens.length) {
      const token = this.getNext();
      if (token.type === ',') {
        // Commas only separate terms; they produce no clause.
        this.index++;
      } else if (token.type === 'NULL') {
        clauses.push({operator: '=', values: [null]});
        this.index++;
      } else if (token.type === 'EMPTY') {
        clauses.push({operator: 'EMPTY', values: [null]});
        this.index++;
      } else if (token.type === 'NOTNULL') {
        clauses.push({operator: '!=', values: [null]});
        this.index++;
      } else if (token.type === 'NOTEMPTY') {
        clauses.push({operator: 'NOTEMPTY', values: [null]});
        this.index++;
      } else if (this.checkSimpleWord(clauses)) {
        this.index++;
      } else {
        // Not a recognized term: record an error spanning the token.
        errors.push({
          message: 'Invalid expression',
          startIndex: token.startIndex,
          endIndex: token.endIndex,
        });
        this.index++;
      }
    }
    return {clauses: StringParser.groupClauses(clauses), errors};
  }

  /**
   * Scans `str` and reports every quote style it contains (single, double,
   * backtick, triple, and backslash-escaped variants).
   * NOTE(review): currently unused — the call site in checkSimpleWord is
   * commented out.
   */
  private static findQuotes(str: string): QuoteType[] {
    const quotes: Set<QuoteType> = new Set();
    let i = 0;

    while (i < str.length) {
      // Check for triple quotes first to avoid false positives
      if (str.slice(i, i + 3) === "'''") {
        quotes.add('TRIPLESINGLE');
        i += 3;
      } else if (str.slice(i, i + 3) === '"""') {
        quotes.add('TRIPLEDOUBLE');
        i += 3;
      } else if (str[i] === '\\') {
        // Check for escaped quotes
        if (i + 1 < str.length) {
          switch (str[i + 1]) {
            case "'":
              quotes.add('ESCAPEDSINGLE');
              break;
            case '"':
              quotes.add('ESCAPEDDOUBLE');
              break;
            case '`':
              quotes.add('ESCAPEDBACKTICK');
              break;
          }
          i += 2;
        } else {
          i++;
        }
      } else {
        // Check for single quotes
        switch (str[i]) {
          case "'":
            quotes.add('SINGLE');
            break;
          case '"':
            quotes.add('DOUBLE');
            break;
          case '`':
            quotes.add('BACKTICK');
            break;
        }
        i++;
      }
    }
    return Array.from(quotes);
  }

  /**
   * Merges runs of adjacent clauses that share an operator into one clause
   * with combined values, e.g. [=a, =b] -> [= (a, b)].
   * Mutates the `values` array of the surviving clause objects.
   */
  private static groupClauses(clauses: StringCondition[]): StringCondition[] {
    if (clauses.length < 2) {
      return clauses;
    }
    let previous: StringCondition = clauses[0];
    const outputs: StringCondition[] = [previous];
    for (let i = 1; i < clauses.length; i++) {
      if (previous.operator === clauses[i].operator) {
        previous.values.push(...clauses[i].values);
      } else {
        previous = clauses[i];
        outputs.push(previous);
      }
    }
    return outputs;
  }

  /** True when an unescaped '%' appears strictly inside the word. */
  private static percentInMiddle(word: string): boolean {
    if (word.length < 3) return false;
    word = word.substring(1, word.length - 1);
    return StringParser.percentRegex.test(word);
  }

  /** Strips single (escape) backslashes, leaving doubled backslashes intact. */
  private static removeBackslashes(word: string): string {
    // Defensive reset: the regex is /g, so lastIndex is stateful.
    StringParser.singleBackslashRegex.lastIndex = 0;
    return word.replace(StringParser.singleBackslashRegex, _match => '');
  }

  /**
   * Tries to interpret the current token as a plain word term and, if so,
   * appends the resulting clause. The operator is chosen from the word's
   * wildcard placement: %w% -> contains, w% -> starts, %w -> ends,
   * '_' or mid-word '%' -> LIKE match (~), otherwise equality.
   * A leading '-' negates whichever operator is chosen.
   * @returns true when a clause was appended (caller then advances).
   */
  private checkSimpleWord(clauses: StringCondition[]): boolean {
    const token = this.getNext();
    if (token.type !== 'word') {
      return false;
    }
    const negatedMatch = StringParser.negatedStartRegex.exec(token.value);
    let word = negatedMatch ? negatedMatch[1] : token.value;

    const isPercentStart = StringParser.percentStartRegex.test(word);
    const isPercentEnd = StringParser.percentEndRegex.test(word);
    const isPercentBoth = isPercentStart && isPercentEnd;
    const isUnderscore = StringParser.underscoreRegex.test(word);
    const isPercentMiddle = StringParser.percentInMiddle(word);

    let operator: StringOperator = negatedMatch ? '!=' : '=';
    if (isUnderscore || isPercentMiddle || (isPercentBoth && word.length < 3)) {
      // Word is itself a LIKE pattern; keep wildcards as-is.
      // (isPercentBoth with length < 3 covers the degenerate '%' / '%%'.)
      operator = negatedMatch ? '!~' : '~';
    } else if (isPercentBoth && word.length > 2) {
      operator = negatedMatch ? 'notContains' : 'contains';
      word = word.substring(1, word.length - 1);
      word = StringParser.removeBackslashes(word);
    } else if (isPercentStart) {
      operator = negatedMatch ? 'notEnds' : 'ends';
      word = word.substring(1, word.length);
      word = StringParser.removeBackslashes(word);
    } else if (isPercentEnd) {
      operator = negatedMatch ? 'notStarts' : 'starts';
      word = word.substring(0, word.length - 1);
      word = StringParser.removeBackslashes(word);
    } else {
      // = or !=
      word = StringParser.removeBackslashes(word);
    }
    if (word.length === 0) {
      // e.g. input was just '-' or a lone wildcard that stripped to nothing.
      return false;
    }

    const clause: StringCondition = {operator: operator, values: [word]};
    //const quotes: QuoteType[] = StringParser.findQuotes(word);
    //if (quotes.length > 0) { clause.quotes = quotes; }
    clauses.push(clause);
    return true;
  }
}
@@ -0,0 +1,87 @@
1
+ import {StringCondition, StringOperator, Clause} from './clause_types';
2
+ import {BaseSerializer} from './base_serializer';
3
+
4
+ export class StringSerializer extends BaseSerializer {
5
+ constructor(clauses: Clause[]) {
6
+ super(clauses);
7
+ }
8
+
9
+ public serialize(): string {
10
+ const result = StringSerializer.clauseToString(this.clauses);
11
+ return result.trim().replace(/,$/, '');
12
+ }
13
+
14
+ private static isNegated(operator: StringOperator): boolean {
15
+ return (
16
+ operator === 'NOTEMPTY' ||
17
+ operator === '!~' ||
18
+ operator === '!=' ||
19
+ operator === 'notStarts' ||
20
+ operator === 'notEnds' ||
21
+ operator === 'notContains'
22
+ );
23
+ }
24
+
25
+ private static escapeSpecialCharacters(input: string): string {
26
+ return input.replace(/[,\\]/g, match => `\\${match}`);
27
+ }
28
+
29
+ private static escapeWildcardCharacters(input: string): string {
30
+ return input.replace(/[_%]/g, match => `\\${match}`);
31
+ }
32
+
33
+ // export type StringOperator = 'EMPTY' | 'NOTEMPTY' | 'starts' | 'ends' | 'contains' | 'notStarts' |
34
+ // 'notEnds' | 'notContains' | '~' | '=' | '!~' | '!=';
35
+ private static stringConditionToString(
36
+ operator: StringOperator,
37
+ value: string | null
38
+ ): string {
39
+ if (operator === 'EMPTY') {
40
+ return 'EMPTY';
41
+ } else if (operator === 'NOTEMPTY') {
42
+ return '-EMPTY';
43
+ }
44
+
45
+ const negated: boolean = StringSerializer.isNegated(operator);
46
+ if (value === null) {
47
+ return negated ? '-NULL' : 'NULL';
48
+ }
49
+ if (value === 'NULL' || value === '-NULL') {
50
+ return (negated ? '-' : '') + '\\' + value;
51
+ }
52
+
53
+ value = StringSerializer.escapeSpecialCharacters(value);
54
+ if (operator === 'starts' || operator === 'notStarts') {
55
+ value = StringSerializer.escapeWildcardCharacters(value);
56
+ return (negated ? '-' : '') + value + '%';
57
+ } else if (operator === 'ends' || operator === 'notEnds') {
58
+ value = StringSerializer.escapeWildcardCharacters(value);
59
+ return (negated ? '-' : '') + '%' + value;
60
+ } else if (operator === 'contains' || operator === 'notContains') {
61
+ value = StringSerializer.escapeWildcardCharacters(value);
62
+ return (negated ? '-' : '') + '%' + value + '%';
63
+ } else if (operator === '=' || operator === '!=') {
64
+ value = StringSerializer.escapeWildcardCharacters(value);
65
+ return (negated ? '-' : '') + value;
66
+ }
67
+
68
+ return (negated ? '-' : '') + value;
69
+ }
70
+
71
+ private static clauseToString(clauses: Clause[]): string {
72
+ let result = '';
73
+ for (const genericClause of clauses) {
74
+ const clause: StringCondition = genericClause as StringCondition;
75
+ for (const value of clause.values) {
76
+ const word = StringSerializer.stringConditionToString(
77
+ clause.operator,
78
+ value
79
+ );
80
+ if (word) {
81
+ result += word + ', ';
82
+ }
83
+ }
84
+ }
85
+ return result;
86
+ }
87
+ }
@@ -0,0 +1,7 @@
1
/**
 * A single lexical token produced by the Tokenizer.
 */
export interface Token {
  type: string; // Token category, e.g. 'word', ',', or a special-token type.
  value: string; // The (possibly normalized) text of the token.
  startIndex: number; // The start index of this token in the original string.
  endIndex: number; // The end index of this token in the original string.
  values?: Token[]; // Merged tokens can contain tokens. Otherwise undefined.
}
@@ -0,0 +1,273 @@
1
+ import {Tokenizer, SpecialToken, TokenizerParams} from './tokenizer';
2
+ import {Token} from './token_types';
3
+
4
+ function makeParams(): TokenizerParams {
5
+ const specialSubstrings: SpecialToken[] = [
6
+ {type: ',', value: ','},
7
+ {type: 'VARIABLE', value: /^\$\{[^}]+\}/},
8
+ ];
9
+ const specialWords: SpecialToken[] = [
10
+ {type: 'NULL', value: 'null', ignoreCase: true},
11
+ {type: 'EMPTY', value: 'empty', ignoreCase: true},
12
+ {type: 'NOTNULL', value: '-null', ignoreCase: true},
13
+ {type: 'NOTEMPTY', value: '-empty', ignoreCase: true},
14
+ {
15
+ type: 'DAYOFWEEK',
16
+ value: /^(monday|tuesday|wednesday|thursday|friday|saturday|sunday)$/i,
17
+ ignoreCase: true,
18
+ },
19
+ {type: 'STATE', value: /^(California|Washington)$/i},
20
+ {type: 'DATE', value: /^\d\d\d\d-\d\d-\d\d$/},
21
+ {type: 'DATE', value: /^\d\d\d\d-\d\d$/},
22
+ {type: 'DATE', value: /^\d\d\d\d$/},
23
+ ];
24
+ return {
25
+ splitOnWhitespace: true,
26
+ trimWordWhitespace: true,
27
+ specialSubstrings,
28
+ specialWords,
29
+ };
30
+ }
31
+
32
// Unit tests for Tokenizer.parse plus the static mergeTypes/matchTypes
// helpers. All expected tokens carry exact start/end indices into the input.
describe('Tokenizer', () => {
  // Shorthand Token factory for the expectation arrays below.
  const makeToken = (
    type: string,
    value: string,
    startIndex: number,
    endIndex: number
  ): Token => ({type, value, startIndex, endIndex});
  it('should tokenize a simple string', () => {
    const input = 'hello world';
    const expectedTokens = [
      makeToken('word', 'hello', 0, 5),
      makeToken('word', 'world', 6, 11),
    ];
    expect(new Tokenizer(input, makeParams()).parse()).toEqual(expectedTokens);
  });
  it('should split adjacent characters when no whitespace', () => {
    // Note: ']' is NOT special here, so 'hello]' stays one word.
    const specialSubstrings: SpecialToken[] = [
      {type: ',', value: ','},
      {type: 'exclamation', value: '!'},
      {type: 'bracket', value: '['},
    ];
    const input = '[hello],big,world!';
    const expectedTokens = [
      makeToken('bracket', '[', 0, 1),
      makeToken('word', 'hello]', 1, 7),
      makeToken(',', ',', 7, 8),
      makeToken('word', 'big', 8, 11),
      makeToken(',', ',', 11, 12),
      makeToken('word', 'world', 12, 17),
      makeToken('exclamation', '!', 17, 18),
    ];
    expect(
      new Tokenizer(input, {...makeParams(), specialSubstrings}).parse()
    ).toEqual(expectedTokens);
  });
  it('should match special tokens', () => {
    // Quoted or adjoined text (-'NULL") must NOT match the special words.
    const input = 'hello NULL world,-Null,-\'NULL" ,NULL, NULL , ';
    const expectedTokens = [
      makeToken('word', 'hello', 0, 5),
      makeToken('NULL', 'NULL', 6, 10),
      makeToken('word', 'world', 11, 16),
      makeToken(',', ',', 16, 17),
      makeToken('NOTNULL', '-NULL', 17, 22),
      makeToken(',', ',', 22, 23),
      makeToken('word', '-\'NULL"', 23, 30),
      makeToken(',', ',', 31, 32),
      makeToken('NULL', 'NULL', 32, 36),
      makeToken(',', ',', 36, 37),
      makeToken('NULL', 'NULL', 38, 42),
      makeToken(',', ',', 43, 44),
    ];
    const params = makeParams();
    expect(new Tokenizer(input, params).parse()).toEqual(expectedTokens);
  });
  it('should not combine adjacent words', () => {
    const input = 'ABC DEF';
    const expectedTokens = [
      makeToken('word', 'ABC', 0, 3),
      makeToken('word', 'DEF', 4, 7),
    ];
    const params = makeParams();
    expect(new Tokenizer(input, params).parse()).toEqual(expectedTokens);
  });
  it('should combine adjacent words when combineAdjacentWords', () => {
    const input = 'ABC DEF';
    const expectedTokens = [makeToken('word', 'ABCDEF', 0, 7)];
    const params = {...makeParams(), combineAdjacentWords: true};
    expect(new Tokenizer(input, params).parse()).toEqual(expectedTokens);
  });
  it('escaping should prevent special token matching', () => {
    const input = 'N\\ULL';
    const expectedTokens = [makeToken('word', 'N\\ULL', 0, 5)];
    expect(new Tokenizer(input, makeParams()).parse()).toEqual(expectedTokens);
  });
  it('should match escaped characters', () => {
    const input = 'hello \\n world';
    const expectedTokens = [
      makeToken('word', 'hello', 0, 5),
      makeToken('word', '\\n', 6, 8),
      makeToken('word', 'world', 9, 14),
    ];
    expect(new Tokenizer(input, makeParams()).parse()).toEqual(expectedTokens);
  });
  it('should preserve all escaped characters', () => {
    // Escaped '$' and ',' suppress the VARIABLE and comma substrings.
    const input = "he'llo \\t \\${w}or\\,ld";
    const expectedTokens = [
      makeToken('word', "he'llo", 0, 6),
      makeToken('word', '\\t', 7, 9),
      makeToken('word', '\\${w}or\\,ld', 10, 21),
    ];
    expect(new Tokenizer(input, makeParams()).parse()).toEqual(expectedTokens);
  });
  it('should match regexp and capitalize special matches', () => {
    // DAYOFWEEK has ignoreCase, so matches are upper-cased in the output.
    const input =
      "hello tuesDAY,ttuesday, tuesdayy ,Tuesday , ttuesday, 'TUESday' ";
    const expectedTokens = [
      makeToken('word', 'hello', 0, 5),
      makeToken('DAYOFWEEK', 'TUESDAY', 6, 13),
      makeToken(',', ',', 13, 14),
      makeToken('word', 'ttuesday', 14, 22),
      makeToken(',', ',', 22, 23),
      makeToken('word', 'tuesdayy', 24, 32),
      makeToken(',', ',', 33, 34),
      makeToken('DAYOFWEEK', 'TUESDAY', 34, 41),
      makeToken(',', ',', 42, 43),
      makeToken('word', 'ttuesday', 44, 52),
      makeToken(',', ',', 52, 53),
      makeToken('word', "'TUESday'", 54, 63),
    ];
    expect(new Tokenizer(input, makeParams()).parse()).toEqual(expectedTokens);
  });
  it('should match regexp and not capitalize', () => {
    // STATE has no ignoreCase flag (only an /i regex), so the original
    // casing of the match is preserved.
    const input = 'Washington, Washingo,washington,wWashington ';
    const expectedTokens = [
      makeToken('STATE', 'Washington', 0, 10),
      makeToken(',', ',', 10, 11),
      makeToken('word', 'Washingo', 12, 20),
      makeToken(',', ',', 20, 21),
      makeToken('STATE', 'washington', 21, 31),
      makeToken(',', ',', 31, 32),
      makeToken('word', 'wWashington', 32, 43),
    ];
    expect(new Tokenizer(input, makeParams()).parse()).toEqual(expectedTokens);
  });
  it('should match substring regexp', () => {
    // VARIABLE matches mid-word, splitting 'aa${var2}bb' into three tokens.
    const input = 'hello \\${var1},aa${var2}bb, cc${var3} dd';
    const expectedTokens = [
      makeToken('word', 'hello', 0, 5),
      makeToken('word', '\\${var1}', 7, 15),
      makeToken(',', ',', 15, 16),
      makeToken('word', 'aa', 16, 18),
      makeToken('VARIABLE', '${var2}', 18, 25),
      makeToken('word', 'bb', 25, 27),
      makeToken(',', ',', 27, 28),
      makeToken('word', 'cc', 29, 31),
      makeToken('VARIABLE', '${var3}', 31, 38),
      makeToken('word', 'dd', 39, 41),
    ];
    expect(new Tokenizer(input, makeParams()).parse()).toEqual(expectedTokens);
  });

  describe('mergeTypes', () => {
    it('should return an empty array when input is empty', () => {
      const result = Tokenizer.mergeTypes('', [], 'merged');
      expect(result).toEqual([]);
    });

    it('should not merge tokens when no match is found', () => {
      const tokens: Token[] = [
        {type: 'x', value: '1', startIndex: 0, endIndex: 1},
        {type: 'y', value: '2', startIndex: 1, endIndex: 2},
      ];
      const result = Tokenizer.mergeTypes('a|b', tokens, 'merged');
      expect(result).toEqual(tokens);
    });

    it('should merge tokens when a single match is found', () => {
      // Merged token spans the constituents and keeps them in `values`.
      const tokens: Token[] = [
        {type: 'a', value: '1', startIndex: 0, endIndex: 1},
        {type: 'b', value: '2', startIndex: 1, endIndex: 2},
      ];
      const expected: Token[] = [
        {
          type: 'merged',
          value: '',
          values: [tokens[0], tokens[1]],
          startIndex: 0,
          endIndex: 2,
        },
      ];
      const result = Tokenizer.mergeTypes('a|b', tokens, 'merged');
      expect(result).toEqual(expected);
    });

    it('should merge multiple matches', () => {
      const tokens: Token[] = [
        {type: 'a', value: '1', startIndex: 0, endIndex: 1},
        {type: 'b', value: '2', startIndex: 1, endIndex: 2},
        {type: 'a', value: '3', startIndex: 2, endIndex: 3},
        {type: 'b', value: '4', startIndex: 3, endIndex: 4},
      ];
      const expected: Token[] = [
        {
          type: 'merged',
          value: '',
          values: [tokens[0], tokens[1]],
          startIndex: 0,
          endIndex: 2,
        },
        {
          type: 'merged',
          value: '',
          values: [tokens[2], tokens[3]],
          startIndex: 2,
          endIndex: 4,
        },
      ];
      const result = Tokenizer.mergeTypes('a|b', tokens, 'merged');
      expect(result).toEqual(expected);
    });

    it('should not merge partial matches', () => {
      // Pattern 'a|b|c' needs three tokens; only two are present.
      const tokens: Token[] = [
        {type: 'a', value: '1', startIndex: 0, endIndex: 1},
        {type: 'b', value: '2', startIndex: 1, endIndex: 2},
      ];
      const result = Tokenizer.mergeTypes('a|b|c', tokens, 'merged');
      expect(result).toEqual(tokens);
    });
  });

  describe('matchTypes', () => {
    it('should return tokens when types match', () => {
      const tokens = [makeToken('a', '1', 0, 1), makeToken('b', '2', 1, 2)];
      expect(Tokenizer.matchTypes('a|b', tokens, 0)).toEqual(tokens);
    });

    it('should return undefined when types do not match', () => {
      const tokens = [makeToken('x', '1', 0, 1), makeToken('y', '2', 1, 2)];
      expect(Tokenizer.matchTypes('a|b', tokens, 0)).toBeUndefined();
    });

    it('should return undefined when index is out of range', () => {
      const tokens = [makeToken('a', '1', 0, 1)];
      expect(Tokenizer.matchTypes('a|b', tokens, 1)).toBeUndefined();
    });

    it('should return all matching tokens when multiple types match', () => {
      const tokens = [
        makeToken('a', '1', 0, 1),
        makeToken('b', '2', 1, 2),
        makeToken('c', '3', 2, 3),
      ];
      expect(Tokenizer.matchTypes('a|b|c', tokens, 0)).toEqual(tokens);
    });

    // NOTE(review): the title says "return only matching tokens" but the
    // expectation is toBeUndefined — partial matches yield undefined, so
    // the title appears stale; confirm and rename if so.
    it('should return only matching tokens when partial match occurs', () => {
      const tokens = [makeToken('a', '1', 0, 1), makeToken('b', '2', 1, 2)];
      expect(Tokenizer.matchTypes('a|b|c', tokens, 0)).toBeUndefined();
    });
  });
});