@atomic-ehr/fhirpath 0.0.1-canary.0c6931e.20250727185306

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. package/README.md +473 -0
  2. package/dist/index.d.ts +462 -0
  3. package/dist/index.js +10307 -0
  4. package/dist/index.js.map +1 -0
  5. package/package.json +58 -0
  6. package/src/analyzer/analyzer.ts +499 -0
  7. package/src/analyzer/model-provider.ts +244 -0
  8. package/src/analyzer/schemas/index.ts +2 -0
  9. package/src/analyzer/schemas/types.ts +40 -0
  10. package/src/analyzer/types.ts +142 -0
  11. package/src/api/builder.ts +157 -0
  12. package/src/api/errors.ts +145 -0
  13. package/src/api/expression.ts +156 -0
  14. package/src/api/index.ts +122 -0
  15. package/src/api/inspect.ts +99 -0
  16. package/src/api/registry.ts +128 -0
  17. package/src/api/types.ts +210 -0
  18. package/src/compiler/compiler.ts +546 -0
  19. package/src/compiler/index.ts +2 -0
  20. package/src/compiler/prototype-context-adapter.ts +99 -0
  21. package/src/compiler/types.ts +24 -0
  22. package/src/index.ts +107 -0
  23. package/src/interpreter/README.md +78 -0
  24. package/src/interpreter/interpreter.ts +475 -0
  25. package/src/interpreter/types.ts +108 -0
  26. package/src/lexer/char-tables.ts +37 -0
  27. package/src/lexer/errors.ts +31 -0
  28. package/src/lexer/index.ts +5 -0
  29. package/src/lexer/lexer.ts +745 -0
  30. package/src/lexer/token.ts +104 -0
  31. package/src/lexer2/index.md +232 -0
  32. package/src/lexer2/index.perf.test.ts +68 -0
  33. package/src/lexer2/index.test.ts +549 -0
  34. package/src/lexer2/index.ts +1251 -0
  35. package/src/lexer2/notes.md +173 -0
  36. package/src/lexer2/optimization-summary.md +718 -0
  37. package/src/parser/ast-factory.ts +220 -0
  38. package/src/parser/ast.ts +144 -0
  39. package/src/parser/collection-parser.ts +89 -0
  40. package/src/parser/diagnostic-messages.ts +216 -0
  41. package/src/parser/diagnostics.ts +85 -0
  42. package/src/parser/error-reporter.ts +230 -0
  43. package/src/parser/index.ts +3 -0
  44. package/src/parser/literal-parser.ts +103 -0
  45. package/src/parser/parse-error.ts +16 -0
  46. package/src/parser/parser-error-factory.ts +141 -0
  47. package/src/parser/parser-state.ts +134 -0
  48. package/src/parser/parser.ts +1272 -0
  49. package/src/parser/pprint.ts +169 -0
  50. package/src/parser/precedence-manager.ts +64 -0
  51. package/src/parser/source-mapper.ts +248 -0
  52. package/src/parser/special-constructs.ts +142 -0
  53. package/src/parser/token-navigator.ts +110 -0
  54. package/src/parser/types.ts +60 -0
  55. package/src/parser2/index.md +177 -0
  56. package/src/parser2/index.perf.test.ts +184 -0
  57. package/src/parser2/index.test.ts +305 -0
  58. package/src/parser2/index.ts +578 -0
  59. package/src/parser2/optimization-summary.md +176 -0
  60. package/src/registry/default-analyzers.ts +257 -0
  61. package/src/registry/default-compilers.ts +31 -0
  62. package/src/registry/index.ts +96 -0
  63. package/src/registry/operations/arithmetic.ts +506 -0
  64. package/src/registry/operations/collection.ts +425 -0
  65. package/src/registry/operations/comparison.ts +432 -0
  66. package/src/registry/operations/existence.ts +703 -0
  67. package/src/registry/operations/filtering.ts +358 -0
  68. package/src/registry/operations/literals.ts +341 -0
  69. package/src/registry/operations/logical.ts +439 -0
  70. package/src/registry/operations/math.ts +128 -0
  71. package/src/registry/operations/membership.ts +132 -0
  72. package/src/registry/operations/navigation.ts +52 -0
  73. package/src/registry/operations/string.ts +507 -0
  74. package/src/registry/operations/subsetting.ts +174 -0
  75. package/src/registry/operations/type-checking.ts +162 -0
  76. package/src/registry/operations/type-conversion.ts +404 -0
  77. package/src/registry/operations/type-operators.ts +308 -0
  78. package/src/registry/operations/utility.ts +644 -0
  79. package/src/registry/registry.ts +146 -0
  80. package/src/registry/types.ts +161 -0
  81. package/src/registry/utils/evaluation-helpers.ts +93 -0
  82. package/src/registry/utils/index.ts +3 -0
  83. package/src/registry/utils/type-system.ts +173 -0
  84. package/src/runtime/context.ts +158 -0
  85. package/src/runtime/debug-context.ts +135 -0
@@ -0,0 +1,104 @@
1
/**
 * Every kind of token the lexer can emit.
 *
 * This is a string enum: the runtime value of each member is its own name.
 */
export enum TokenType {
  // ---- Literals ----

  /** Generic literal token for registry-based literals. */
  LITERAL = 'LITERAL',
  /** `{}` (`nullLiteral` in the grammar). */
  NULL = 'NULL',
  /** `true` */
  TRUE = 'TRUE',
  /** `false` */
  FALSE = 'FALSE',
  /** `'string value'` */
  STRING = 'STRING',
  /** `123`, `45.67`, `0123` (leading zeros are allowed). */
  NUMBER = 'NUMBER',
  /** `@2024`, `@2024-01`, `@2024-01-15` */
  DATE = 'DATE',
  /** `@2024-01-15T10:30:00Z` */
  DATETIME = 'DATETIME',
  /** `@T14:30:00` */
  TIME = 'TIME',

  // ---- Identifiers ----

  /** `[A-Za-z_][A-Za-z0-9_]*` */
  IDENTIFIER = 'IDENTIFIER',
  /** An identifier wrapped in backticks. */
  DELIMITED_IDENTIFIER = 'DELIMITED_IDENTIFIER',

  // ---- Special variables ----

  /** `$this` */
  THIS = 'THIS',
  /** `$index` */
  INDEX = 'INDEX',
  /** `$total` */
  TOTAL = 'TOTAL',

  // ---- Environment variables ----

  /** `%context`, or a `%` followed by a backtick-delimited name such as `vs-name`. */
  ENV_VAR = 'ENV_VAR',

  // ---- Operators (by precedence) ----

  /** `.` */
  DOT = 'DOT',
  /** `[` */
  LBRACKET = 'LBRACKET',
  /** `]` */
  RBRACKET = 'RBRACKET',
  /** `(` */
  LPAREN = 'LPAREN',
  /** `)` */
  RPAREN = 'RPAREN',

  // ---- Arithmetic ----

  /** `+` */
  PLUS = 'PLUS',
  /** `-` */
  MINUS = 'MINUS',
  /** `*` */
  STAR = 'STAR',
  /** `/` */
  SLASH = 'SLASH',
  /** `div` */
  DIV = 'DIV',
  /** `mod` */
  MOD = 'MOD',
  /** `&` */
  CONCAT = 'CONCAT',

  // ---- Type operators ----

  /** `is` */
  IS = 'IS',
  /** `as` */
  AS = 'AS',

  // ---- Union ----

  /** `|` */
  PIPE = 'PIPE',

  // ---- Comparison ----

  /** `<` */
  LT = 'LT',
  /** `<=` */
  LTE = 'LTE',
  /** `>` */
  GT = 'GT',
  /** `>=` */
  GTE = 'GTE',
  /** `=` */
  EQ = 'EQ',
  /** `!=` */
  NEQ = 'NEQ',
  /** `~` */
  EQUIV = 'EQUIV',
  /** `!~` */
  NEQUIV = 'NEQUIV',

  // ---- Membership ----

  /** `in` */
  IN = 'IN',
  /** `contains` */
  CONTAINS = 'CONTAINS',

  // ---- Boolean ----

  /** `and` */
  AND = 'AND',
  /** `or` */
  OR = 'OR',
  /** `xor` */
  XOR = 'XOR',
  /** `implies` */
  IMPLIES = 'IMPLIES',
  /** `not` */
  NOT = 'NOT',

  // ---- Collection ----

  /** `{` */
  LBRACE = 'LBRACE',
  /** `}` */
  RBRACE = 'RBRACE',

  // ---- Other ----

  /** `,` */
  COMMA = 'COMMA',
  /** End-of-input marker. */
  EOF = 'EOF',

  // ---- Units (for quantities) ----

  /** `year`, `month`, `'mg'`, etc. */
  UNIT = 'UNIT',

  // ---- Trivia tokens (when preserving whitespace/comments) ----

  /** Whitespace. */
  WS = 'WS',
  /** Multi-line comment delimited by slash-star and star-slash. */
  COMMENT = 'COMMENT',
  /** Single-line comment introduced by a double slash. */
  LINE_COMMENT = 'LINE_COMMENT',
}
85
+
86
/**
 * A location in the lexer's input, expressed both as a line/column pair and
 * as a single offset.
 */
export interface Position {
  /** Line number of the location. */
  line: number;
  /** Column number of the location within its line. */
  column: number;
  /** Offset of the location from the start of the input. */
  offset: number;
}
91
+
92
/**
 * Channel a token is assigned to, separating regular tokens from trivia.
 */
export enum Channel {
  /** Regular tokens. */
  DEFAULT = 0,
  /** Whitespace and comments. */
  HIDDEN = 1,
}
96
+
97
/**
 * A single lexical token.
 */
export interface Token {
  /** Kind of token. */
  type: TokenType;
  /** Text value carried by the token. */
  value: string;
  /** Location of the token in the input. */
  position: Position;
  /** Optional channel, used for trivia (whitespace and comments). */
  channel?: Channel;
  /**
   * Operation from the registry.
   * Typed as `any` to avoid a circular dependency on the registry types.
   */
  operation?: any;
  /** Parsed literal value, set for `LITERAL` tokens. */
  literalValue?: any;
}
@@ -0,0 +1,232 @@
1
+ # Lexer2 Overview
2
+
3
+ ## Introduction
4
+
5
+ Lexer2 is a high-performance manual lexer for the FHIRPath expression language, designed as a drop-in replacement for the ANTLR-based lexer. It achieves ~2.2M expressions/second, representing a 49% improvement over the initial implementation through systematic optimizations.
6
+
7
+ ## Algorithm Overview
8
+
9
+ The lexer uses a single-pass, character-by-character scanning approach with the following key components:
10
+
11
+ ### 1. Character Classification via Lookup Tables
12
+
13
+ Instead of function calls for character classification, we use pre-computed lookup tables:
14
+
15
+ ```typescript
16
+ // src/lexer2/index.ts:184-203
17
+ const IS_DIGIT = new Uint8Array(256);
18
+ const IS_LETTER = new Uint8Array(256);
19
+ const IS_LETTER_OR_DIGIT = new Uint8Array(256);
20
+ const IS_HEX_DIGIT = new Uint8Array(256);
21
+ ```
22
+
23
+ These 256-byte arrays provide O(1) character classification with excellent cache locality.
24
+
25
+ ### 2. Switch-Based Token Dispatch
26
+
27
+ The main tokenization logic uses a switch statement on the first character for efficient dispatch:
28
+
29
+ ```typescript
30
+ // src/lexer2/index.ts:786-906
31
+ switch (firstChar) {
32
+ case "'": return this.readString();
33
+ case '`': return this.readDelimitedIdentifier();
34
+ case '@': return this.readDateTime();
35
+ case '$': return this.readSpecialIdentifier();
36
+ // ... single-character operators
37
+ default:
38
+ if (IS_DIGIT[firstCharCode]) return this.readNumber();
39
+ if (IS_LETTER[firstCharCode]) return this.readIdentifierOrKeyword();
40
+ }
41
+ ```
42
+
43
+ ### 3. Optimized Keyword Recognition
44
+
45
+ Keywords are recognized using nested switches on string length first, then value:
46
+
47
+ ```typescript
48
+ // src/lexer2/index.ts:662-721
49
+ switch (length) {
50
+ case 2:
51
+ switch (value) {
52
+ case 'as': type = TokenType.AS; break;
53
+ case 'in': type = TokenType.IN; break;
54
+ // ...
55
+ }
56
+ break;
57
+ case 3:
58
+ switch (value) {
59
+ case 'div': type = TokenType.DIV; break;
60
+ case 'mod': type = TokenType.MOD; break;
61
+ // ...
62
+ }
63
+ break;
64
+ // ...
65
+ }
66
+ ```
67
+
68
+ Switching on length first rejects most identifiers before any string comparison is made, and the nested switches compile to efficient jump tables.
69
+
70
+ ## Design Decisions
71
+
72
+ ### 1. Token Representation
73
+
74
+ After extensive benchmarking (see [optimization-summary.md](./optimization-summary.md#token-representation-benchmarks)), we use plain object literals:
75
+
76
+ ```typescript
77
+ // src/lexer2/index.ts:159-165
78
+ export interface Token {
79
+ type: TokenType; // numeric enum
80
+ start: number; // position in input string
81
+ end: number; // end position
82
+ line: number; // line number for error reporting
83
+ column: number; // column number for error reporting
84
+ }
85
+ ```
86
+
87
+ Key findings:
88
+ - Object literals are 78% faster than classes
89
+ - Arrays are 244% slower despite lower memory usage
90
+ - V8 optimizes object literals with consistent shapes via hidden classes
91
+
92
+ ### 2. Numeric Enums for Token Types
93
+
94
+ Token types use numeric enums for better performance:
95
+
96
+ ```typescript
97
+ // src/lexer2/index.ts:1-82
98
+ export enum TokenType {
99
+ NULL, // 0
100
+ BOOLEAN, // 1
101
+ STRING, // 2
102
+ NUMBER, // 3
103
+ // ...
104
+ }
105
+ ```
106
+
107
+ Benefits:
108
+ - 2.5% performance improvement over string enums
109
+ - Smaller memory footprint (4 bytes vs string length)
110
+ - Better switch statement optimization
111
+
112
+ Debug support is maintained via helper functions:
113
+ - `tokenTypeToString()` - converts numeric type to string
114
+ - `debugTokens()` - human-readable token output
115
+
116
+ ### 3. Position Tracking Strategy
117
+
118
+ The lexer tracks both:
119
+ - **Character positions** (`start`, `end`) - for substring extraction
120
+ - **Line/column information** - for error reporting
121
+
122
+ ```typescript
123
+ // src/lexer2/index.ts:241-257
124
+ private advance(): string {
125
+ const char = this.input[this.position] || '';
126
+ this.position++;
127
+
128
+ if (char === '\n') {
129
+ this.line++;
130
+ this.column = 1;
131
+ } else {
132
+ this.column++;
133
+ }
134
+
135
+ return char;
136
+ }
137
+ ```
138
+
139
+ This dual tracking adds ~4.2% overhead but provides essential debugging information.
140
+
141
+ ### 4. Whitespace Handling Optimization
142
+
143
+ Whitespace recognition uses a character code switch for efficiency:
144
+
145
+ ```typescript
146
+ // src/lexer2/index.ts:268-283
147
+ switch (charCode) {
148
+ case 32: // ' ' (space)
149
+ case 9: // '\t' (tab)
150
+ this.position++;
151
+ this.column++;
152
+ break;
153
+ case 13: // '\r' (carriage return)
154
+ this.position++;
155
+ break;
156
+ case 10: // '\n' (line feed)
157
+ this.position++;
158
+ this.line++;
159
+ this.column = 1;
160
+ break;
161
+ default:
162
+ // Not whitespace
163
+ }
164
+ ```
165
+
166
+ ### 5. Inline Hot Functions
167
+
168
+ Critical path functions are inlined to reduce call overhead:
169
+ - Character classification uses lookup tables directly
170
+ - Digit reading loops are fully inlined
171
+ - No separate utility functions in hot paths
172
+
173
+ ## Performance Characteristics
174
+
175
+ Current performance: **~2,200K expressions/second**
176
+
177
+ ### Optimization Timeline:
178
+ 1. Initial implementation: ~1,477K expr/sec
179
+ 2. Lookup tables: ~1,546K expr/sec (+4.7%)
180
+ 3. Switch-based keywords: ~2,192K expr/sec (+42%)
181
+ 4. Character code switches: ~2,240K expr/sec (+2.2%)
182
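+ 5. Numeric enums: ~2,200K expr/sec (+2.5% from numeric enums; the net figure is lower than step 4 because it also includes the ~4.2% overhead of line/column position tracking)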
183
+
184
+ ### Failed Optimizations:
185
+ - **Reusable tokens**: 11% performance degradation due to method call overhead
186
+ - **Character arrays**: Slower than string indexing in modern V8
187
+ - **Object pooling**: V8's allocation is already highly optimized
188
+
189
+ ## Future Optimization Opportunities
190
+
191
+ Based on profiling, remaining optimization opportunities include:
192
+
193
+ 1. **Optimize readSpecialIdentifier** (est. 2-3% improvement)
194
+ - Remove the substring call at line 734 of `src/lexer2/index.ts`
195
+ - Use direct character comparison
196
+
197
+ 2. **Optimize readDateTime/readTimeFormat** (est. 1-2% improvement)
198
+ - Reduce redundant charCode lookups
199
+ - Cache frequently accessed positions
200
+
201
+ 3. **Whitespace lookup table** (est. 0.5-1% improvement)
202
+ - Replace switch with lookup table
203
+ - May improve branch prediction
204
+
205
+ ## Usage Example
206
+
207
+ ```typescript
208
+ import { Lexer, TokenType, tokenTypeToString } from './index';
209
+
210
+ const lexer = new Lexer("Patient.name.where(use = 'official')");
211
+ const tokens = lexer.tokenize();
212
+
213
+ // Process tokens
214
+ for (const token of tokens) {
215
+ const value = lexer.getTokenValue(token);
216
+ const type = tokenTypeToString(token.type);
217
+ console.log(`${type}(${value}) at ${token.line}:${token.column}`);
218
+ }
219
+
220
+ // Debug output
221
+ console.log(lexer.debugTokens());
222
+ ```
223
+
224
+ ## Architecture Integration
225
+
226
+ The lexer integrates with the FHIRPath parser by:
227
+ 1. Providing a token stream via `tokenize()`
228
+ 2. Supporting position information for error reporting
229
+ 3. Maintaining compatibility with the existing Token interface
230
+ 4. Offering configurable whitespace/comment handling
231
+
232
+ See [parser integration](../parser/parser.ts) for usage in the parsing pipeline.
@@ -0,0 +1,68 @@
1
+ import { describe, it } from 'bun:test';
2
+ import { Lexer } from './index';
3
+ import * as fs from 'fs';
4
+ import * as path from 'path';
5
+
6
// Benchmark suite: a single case that runs the fixture benchmark with trivia
// preservation switched off.
describe('Lexer Performance', () => {
  it('measures lexer performance on fixture expressions', () => {
    const preserveTrivia = false;
    runPerformanceTest(preserveTrivia);
  });
});
11
+
12
/**
 * Benchmarks the lexer against the expressions stored in the JSON fixtures.
 *
 * Every `*.json` file directly under `<cwd>/test/fixtures` must contain a JSON
 * array. Each non-empty string in that array is tokenized once as a warm-up
 * and then `iterations` more times under a timer. Entries that are not
 * non-empty strings are skipped. Aggregate results are printed to the console.
 *
 * @param preserveTrivia - Forwarded as the `preserveTrivia` lexer option for
 *   every run.
 * @throws Error if a fixture file does not contain a JSON array.
 */
function runPerformanceTest(preserveTrivia: boolean): void {
  const fixturesPath = path.join(process.cwd(), 'test', 'fixtures');
  const iterations = 10000;

  // Read all fixture files
  const fixtureFiles = fs.readdirSync(fixturesPath)
    .filter(file => file.endsWith('.json'))
    .map(file => ({
      name: file,
      path: path.join(fixturesPath, file)
    }));

  console.log(`\nRunning lexer performance tests with ${iterations} iterations per expression`);
  console.log(`Trivia preservation: ${preserveTrivia ? 'ENABLED' : 'DISABLED'}\n`);

  let totalExpressions = 0;
  let totalIterations = 0;
  let totalTime = 0;

  for (const fixture of fixtureFiles) {
    console.log(`Processing ${fixture.name}...`);

    const content = fs.readFileSync(fixture.path, 'utf-8');

    // Validate the shape instead of trusting the file: a non-array fixture
    // would otherwise fail obscurely (or iterate the characters of a string).
    const parsed: unknown = JSON.parse(content);
    if (!Array.isArray(parsed)) {
      throw new Error(`Fixture ${fixture.name} must contain a JSON array of expressions`);
    }

    for (const expression of parsed) {
      if (typeof expression !== 'string' || expression.length === 0) continue;

      // Warm up run
      const warmupLexer = new Lexer(expression, { preserveTrivia });
      warmupLexer.tokenize();

      // Measure total time for all iterations
      const start = performance.now();
      for (let j = 0; j < iterations; j++) {
        const lexer = new Lexer(expression, { preserveTrivia });
        lexer.tokenize();
      }
      const end = performance.now();

      totalTime += (end - start);
      totalExpressions++;
      totalIterations += iterations;
    }
  }

  // Nothing was measured: bail out rather than dividing by zero and
  // reporting NaN in the summary.
  if (totalIterations === 0) {
    console.log('\nNo expressions found in fixtures; nothing to measure.');
    return;
  }

  // Average over every timed tokenize() call, not over distinct expressions.
  const avgTimePerExpression = totalTime / totalIterations;

  console.log('\n' + '='.repeat(50));
  console.log('RESULTS');
  console.log('='.repeat(50));
  console.log(`Total expressions: ${totalExpressions}`);
  console.log(`Total iterations: ${totalIterations}`);
  console.log(`Total time: ${(totalTime / 1000).toFixed(2)}s`);
  console.log(`Time per expression: ${avgTimePerExpression.toFixed(4)}ms`);
  console.log(`Expressions per second: ${(1000 / avgTimePerExpression).toFixed(0)}`);
}