@chr33s/pdf-dfa 5.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. package/README.md +91 -0
  2. package/dfa.d.ts +44 -0
  3. package/dist/compile.d.ts +6 -0
  4. package/dist/compile.js +22 -0
  5. package/dist/compile.js.map +1 -0
  6. package/dist/dfa.d.ts +16 -0
  7. package/dist/dfa.js +81 -0
  8. package/dist/dfa.js.map +1 -0
  9. package/dist/grammar.d.ts +11 -0
  10. package/dist/grammar.js +1266 -0
  11. package/dist/grammar.js.map +1 -0
  12. package/dist/index.d.ts +3 -0
  13. package/dist/index.js +4 -0
  14. package/dist/index.js.map +1 -0
  15. package/dist/nodes.d.ts +113 -0
  16. package/dist/nodes.js +241 -0
  17. package/dist/nodes.js.map +1 -0
  18. package/dist/state-machine.d.ts +29 -0
  19. package/dist/state-machine.js +71 -0
  20. package/dist/state-machine.js.map +1 -0
  21. package/dist/symbol-table.d.ts +17 -0
  22. package/dist/symbol-table.js +64 -0
  23. package/dist/symbol-table.js.map +1 -0
  24. package/dist/utils.d.ts +12 -0
  25. package/dist/utils.js +34 -0
  26. package/dist/utils.js.map +1 -0
  27. package/package.json +41 -0
  28. package/scripts/build-grammar.ts +33 -0
  29. package/src/compile.ts +31 -0
  30. package/src/dfa.ts +104 -0
  31. package/src/grammar.js +1312 -0
  32. package/src/grammar.peg +72 -0
  33. package/src/index.ts +9 -0
  34. package/src/nodes.ts +308 -0
  35. package/src/state-machine.ts +94 -0
  36. package/src/symbol-table.ts +78 -0
  37. package/src/utils.ts +38 -0
  38. package/test/compile.test.ts +131 -0
  39. package/test/dfa.test.ts +87 -0
  40. package/test/nodes.test.ts +324 -0
  41. package/test/parse-build.test.ts +50 -0
  42. package/test/state-machine.test.ts +132 -0
  43. package/test/symbol-table.test.ts +69 -0
  44. package/test/utils.test.ts +108 -0
  45. package/tsconfig.json +16 -0
  46. package/tsconfig.test.json +8 -0
  47. package/tsconfig.typecheck.json +16 -0
  48. package/vitest.config.ts +8 -0
@@ -0,0 +1,72 @@
1
// Grammar for the DFA description language.
//
// Initializer: the AST node constructors are not imported here; the caller
// must inject them through `options.nodes`. They are referred to as `n` in
// the rule actions below.
{
  const n = options?.nodes;
  if (!n) {
    throw new Error('The parser expects a nodes module via options.nodes.');
  }
}

// Start rule: one or more statements.
rules
  = statement+

// A statement followed by optional whitespace.
statement
  = s:statement_type _ { return s }

statement_type
  = assignment
  / comment

// `# text` up to the end of the line. The terminator may be a line break or
// the end of input, so a comment on the very last line does not need a
// trailing newline.
comment
  = '#' v:[^\r\n]* ([\r\n] / !.) { return new n.Comment(v.join('')) }

// `name = expr;`
assignment
  = v:variable _ '=' _ e:expr _ ';' { return new n.Assignment(v, e) }

variable
  = v:name { return new n.Variable(v) }

expr
  = alternation

// `a | b` (right-recursive).
alternation
  = a:concatenation _ '|' _ b:alternation { return new n.Alternation(a, b) }
  / concatenation

// `a b` (right-recursive, separated by optional whitespace).
concatenation
  = a:repeat _ b:concatenation { return new n.Concatenation(a, b) }
  / repeat

// Tagged expressions (`tag:expr`) and postfix repetition operators.
// A tag is appended after the expression it labels.
repeat
  = t:name ':' e:repeat { return new n.Concatenation(e, new n.Tag(t)) }
  / t:term '*' { return new n.Repeat(t, '*') }
  / t:term '?' { return new n.Repeat(t, '?') }
  / t:term '+' { return new n.Repeat(t, '+') }
  / t:term '{' m:number '}' { return n.buildRepetition(t, m, m) }
  / t:term '{' min:number ',' '}' { return n.buildRepetition(t, min, Infinity) }
  / t:term '{' ',' max:number '}' { return n.buildRepetition(t, 0, max) }
  / t:term '{' min:number ',' max:number '}' { return n.buildRepetition(t, min, max) }
  / term

term
  = variable
  / x:number { return new n.Literal(x) }
  / '(' e:expr ')' { return e }

// NOTE(review): no other rule in this grammar references `repetition`, and its
// action calls buildRepetition without arguments. It is kept unchanged in case
// it is used as an explicit start rule elsewhere; confirm and remove if not.
repetition
  = '{' number '}' { return n.buildRepetition() }

name
  = a:name_start_char b:name_char* { return a + b.join('') }

name_start_char
  = "_"
  / [a-zA-Z]

name_char
  = name_start_char
  / [0-9]

// Decimal integer literal.
number
  = num:[0-9]+ { return parseInt(num.join(''), 10) }

// Optional whitespace.
_
  = [ \t\r\n]*
package/src/index.ts ADDED
@@ -0,0 +1,9 @@
1
+ export { build, default as compile, parse, type ExternalSymbols } from "./compile.js";
2
+ export * from "./nodes.js";
3
+ export {
4
+ FAIL_STATE,
5
+ INITIAL_STATE,
6
+ default as StateMachine,
7
+ type Match,
8
+ type StateMachineConfig,
9
+ } from "./state-machine.js";
package/src/nodes.ts ADDED
@@ -0,0 +1,308 @@
1
+ import { addAll, union } from "./utils.js";
2
+
3
/** Postfix operators understood by `Repeat`. */
export type RepeatOperator = "*" | "?" | "+";

/** Leaf node kinds that are stored in firstpos / lastpos / followpos sets. */
export type PositionNode = Literal | Tag | EndMarker;

/** Every node kind that may appear inside an expression tree. */
export type ExpressionNode =
  | Alternation
  | Concatenation
  | Repeat
  | PositionNode
  | Variable;
15
+
16
/**
 * Base AST node.
 *
 * Every node owns a `followpos` set. The set is defined as a non-enumerable
 * property so that key-based walks over a node's own properties (such as
 * `calcFollowpos` below) only see the node's structural fields.
 */
export class Node {
  // Declared only for the type checker; the property itself is created by
  // Object.defineProperty in the constructor so that it is non-enumerable.
  declare readonly followpos: Set<PositionNode>;

  constructor() {
    Object.defineProperty(this, "followpos", {
      value: new Set<PositionNode>(),
      enumerable: false,
      writable: false,
      configurable: false,
    });
  }

  /**
   * Recursively invokes `calcFollowpos` on every own enumerable property
   * whose value is itself a `Node`. Subclasses extend this to fill in
   * their own followpos relations.
   */
  calcFollowpos(): void {
    for (const value of Object.values(this)) {
      if (value instanceof Node) {
        value.calcFollowpos();
      }
    }
  }
}
39
+
40
/**
 * A reference to a variable by name.
 *
 * A Variable is only a placeholder: reading `nullable`, `firstpos` or
 * `lastpos` throws, because the reference has to be replaced by the
 * expression it names before any of those can be evaluated.
 */
export class Variable extends Node {
  readonly name: string;

  constructor(name: string) {
    super();
    this.name = name;
  }

  /** Returns a fresh Variable carrying the same name. */
  copy(): Variable {
    return new Variable(this.name);
  }

  get nullable(): boolean {
    throw unresolvedVariableError();
  }

  get firstpos(): Set<PositionNode> {
    throw unresolvedVariableError();
  }

  get lastpos(): Set<PositionNode> {
    throw unresolvedVariableError();
  }
}

/** Builds the error thrown when an unresolved Variable is evaluated. */
function unresolvedVariableError(): Error {
  return new Error("Variable nodes must be resolved before evaluation");
}
67
+
68
/**
 * A comment statement; `value` holds the comment text it was constructed with.
 */
export class Comment extends Node {
  readonly value: string;

  constructor(value: string) {
    super();

    this.value = value;
  }
}
79
+
80
/**
 * An assignment statement of the form `variable = expression;`.
 *
 * `expression` is intentionally not readonly so that it can be replaced
 * after construction.
 */
export class Assignment extends Node {
  readonly variable: Variable;
  expression: ExpressionNode;

  constructor(variable: Variable, expression: ExpressionNode) {
    super();

    this.variable = variable;
    this.expression = expression;
  }
}
94
+
95
/**
 * A choice between two sub-expressions, e.g. `a | b`.
 */
export class Alternation extends Node {
  readonly a: ExpressionNode;
  readonly b: ExpressionNode;

  constructor(a: ExpressionNode, b: ExpressionNode) {
    super();
    this.a = a;
    this.b = b;
  }

  /** Nullable as soon as either branch is nullable. */
  get nullable(): boolean {
    const { a, b } = this;
    return a.nullable || b.nullable;
  }

  /** Union of both branches' first positions. */
  get firstpos(): Set<PositionNode> {
    const left = this.a.firstpos;
    const right = this.b.firstpos;
    return union(left, right);
  }

  /** Union of both branches' last positions. */
  get lastpos(): Set<PositionNode> {
    const left = this.a.lastpos;
    const right = this.b.lastpos;
    return union(left, right);
  }

  /** Returns a new Alternation built from copies of both branches. */
  copy(): Alternation {
    const left = this.a.copy();
    const right = this.b.copy();
    return new Alternation(left, right);
  }
}
125
+
126
/**
 * A sequence of two sub-expressions, e.g. `a b`. Longer chains such as
 * `a b c` are expressed by nesting Concatenation nodes.
 */
export class Concatenation extends Node {
  readonly a: ExpressionNode;
  readonly b: ExpressionNode;

  constructor(a: ExpressionNode, b: ExpressionNode) {
    super();
    this.a = a;
    this.b = b;
  }

  /** Nullable only when both halves are nullable. */
  get nullable(): boolean {
    const { a, b } = this;
    return a.nullable && b.nullable;
  }

  /** First positions of `a`, extended by those of `b` when `a` is nullable. */
  get firstpos(): Set<PositionNode> {
    const head = this.a.firstpos;
    return this.a.nullable ? union(head, this.b.firstpos) : head;
  }

  /** Last positions of `b`, extended by those of `a` when `b` is nullable. */
  get lastpos(): Set<PositionNode> {
    const tail = this.b.lastpos;
    return this.b.nullable ? union(tail, this.a.lastpos) : tail;
  }

  /**
   * After visiting the children, records that every first position of `b`
   * may follow each last position of `a`.
   */
  calcFollowpos(): void {
    super.calcFollowpos();
    this.a.lastpos.forEach((position) => {
      addAll(position.followpos, this.b.firstpos);
    });
  }

  /** Returns a new Concatenation built from copies of both halves. */
  copy(): Concatenation {
    const left = this.a.copy();
    const right = this.b.copy();
    return new Concatenation(left, right);
  }
}
173
+
174
/**
 * A sub-expression with a postfix repetition operator,
 * e.g. `a+`, `b*`, or `c?`.
 */
export class Repeat extends Node {
  readonly expression: ExpressionNode;
  readonly op: RepeatOperator;

  constructor(expression: ExpressionNode, op: RepeatOperator) {
    super();
    this.expression = expression;
    this.op = op;
  }

  /** `*` and `?` are nullable; any other operator is not. */
  get nullable(): boolean {
    switch (this.op) {
      case "*":
      case "?":
        return true;
      default:
        return false;
    }
  }

  /** Same as the first positions of the repeated expression. */
  get firstpos(): Set<PositionNode> {
    return this.expression.firstpos;
  }

  /** Same as the last positions of the repeated expression. */
  get lastpos(): Set<PositionNode> {
    return this.expression.lastpos;
  }

  /**
   * After visiting the children, and only for the looping operators `*` and
   * `+`, records that the expression's first positions may follow each of
   * its last positions.
   */
  calcFollowpos(): void {
    super.calcFollowpos();

    const loops = this.op === "*" || this.op === "+";
    if (!loops) {
      return;
    }

    for (const position of this.lastpos) {
      addAll(position.followpos, this.firstpos);
    }
  }

  /** Returns a new Repeat over a copy of the expression, same operator. */
  copy(): Repeat {
    return new Repeat(this.expression.copy(), this.op);
  }
}
213
+
214
/**
 * Expands a counted repetition of `expression` into Concatenation / Repeat
 * nodes: `min` mandatory copies, followed by either one `*` repeat (when
 * `max` is Infinity) or `max - min` optional (`?`) copies.
 *
 * The input expression is never reused; every occurrence is a `copy()`.
 *
 * NOTE(review): when nothing is generated (i.e. `max` is 0) a plain copy of
 * `expression` is returned, so a zero-count repetition behaves like a single
 * occurrence. Confirm this fallback is intended.
 *
 * @param expression - expression to repeat
 * @param min - minimum number of occurrences (default 0)
 * @param max - maximum number of occurrences (default Infinity)
 * @throws Error when `min` is negative or greater than `max`
 */
export function buildRepetition(
  expression: ExpressionNode,
  min = 0,
  max = Infinity,
): ExpressionNode {
  const invalidRange = min < 0 || min > max;
  if (invalidRange) {
    throw new Error(`Invalid repetition range: ${min} ${max}`);
  }

  let chain: ExpressionNode | null = null;

  // Mandatory occurrences.
  let emitted = 0;
  while (emitted < min) {
    chain = concat(chain, expression.copy());
    emitted++;
  }

  // Tail: bounded ranges get optional copies, unbounded ones a single `*`.
  if (max !== Infinity) {
    for (let extra = min; extra < max; extra++) {
      chain = concat(chain, new Repeat(expression.copy(), "?"));
    }
  } else {
    chain = concat(chain, new Repeat(expression.copy(), "*"));
  }

  if (chain === null) {
    return expression.copy();
  }

  return chain;
}
238
+
239
/**
 * Appends `b` to an optional existing chain: returns `b` unchanged when
 * there is no chain yet, otherwise a Concatenation of the two.
 */
function concat(a: ExpressionNode | null, b: ExpressionNode): ExpressionNode {
  return a ? new Concatenation(a, b) : b;
}
246
+
247
/**
 * Shared behaviour of leaf nodes: a leaf is not nullable, and it is both
 * its own only first position and its own only last position.
 */
class Leaf extends Node {
  get nullable(): boolean {
    return false;
  }

  /** A fresh single-element set containing this leaf. */
  get firstpos(): Set<PositionNode> {
    const self = this as PositionNode;
    return new Set([self]);
  }

  /** A fresh single-element set containing this leaf. */
  get lastpos(): Set<PositionNode> {
    const self = this as PositionNode;
    return new Set([self]);
  }

  /** Default copy: returns this same instance (no new node is created). */
  copy(): this {
    return this;
  }
}
267
+
268
/**
 * A leaf holding a literal numeric value.
 */
export class Literal extends Leaf {
  readonly value: number;

  constructor(value: number) {
    super();

    this.value = value;
  }

  /** Returns a new Literal with the same value. */
  copy(): this {
    const duplicate = new Literal(this.value);
    return duplicate as this;
  }
}
283
+
284
/**
 * Leaf that marks the end of an expression. It adds no members of its own
 * and uses the Leaf behaviour unchanged.
 */
export class EndMarker extends Leaf {
}
288
+
289
/**
 * A named tag leaf, e.g. the `a` in `a:(a b)`.
 *
 * Unlike other leaves a Tag is nullable.
 */
export class Tag extends Leaf {
  readonly name: string;

  constructor(name: string) {
    super();

    this.name = name;
  }

  /** Returns a new Tag with the same name. */
  copy(): this {
    const duplicate = new Tag(this.name);
    return duplicate as this;
  }

  get nullable(): boolean {
    return true;
  }
}
@@ -0,0 +1,94 @@
1
/** State number used when there is no valid transition. */
export const FAIL_STATE = 0;

/** State number in which matching starts. */
export const INITIAL_STATE = 1;
3
+
4
/** A single match: start index, end index, and the tags attached to it. */
export type Match = [start: number, end: number, tags: string[]];

/** Tables describing an automaton, as consumed by `StateMachine`. */
export interface StateMachineConfig {
  /** Transition table, indexed as `stateTable[state][symbol]`. */
  stateTable: number[][];

  /** Per-state flag, indexed by state number. */
  accepting: boolean[];

  /** Per-state tag names, indexed by state number. */
  tags: string[][];
}
11
+
12
/**
 * A deterministic finite automaton driven by the tables of a
 * `StateMachineConfig`. It scans a sequence of numeric symbols and reports
 * the runs that match, similar to a regular expression.
 */
export default class StateMachine {
  readonly stateTable: number[][];
  readonly accepting: boolean[];
  readonly tags: string[][];

  constructor(dfa: StateMachineConfig) {
    const { stateTable, accepting, tags } = dfa;
    this.stateTable = stateTable;
    this.accepting = accepting;
    this.tags = tags;
  }

  /**
   * Returns an iterable that lazily yields the matches found in `input`.
   * Each match is `[startIndex, endIndex, tags]`; `endIndex` is inclusive.
   * Every iteration of the returned object scans the input from the start.
   */
  match(input: readonly number[]): Iterable<Match> {
    const { stateTable, accepting, tags } = this;

    // A missing row or column counts as a transition to FAIL_STATE.
    const next = (from: number, symbol: number): number =>
      stateTable[from]?.[symbol] ?? FAIL_STATE;

    function* scan(): Iterator<Match> {
      let current = INITIAL_STATE;
      let previous = INITIAL_STATE;
      let runStart: number | null = null;
      let acceptedAt: number | null = null;

      for (let index = 0; index < input.length; index += 1) {
        const symbol = input[index];

        previous = current;
        current = next(current, symbol);

        if (current === FAIL_STATE) {
          // The run just ended: report it if an accepting state was reached
          // at or after its start.
          if (runStart != null && acceptedAt != null && acceptedAt >= runStart) {
            yield [runStart, acceptedAt, tags[previous] ?? []];
          }

          // Retry the same symbol as if starting over from the initial state.
          current = next(INITIAL_STATE, symbol);
          runStart = null;
        }

        if (current === FAIL_STATE) {
          // Still no transition: the accepting table is consulted for this
          // state as well, then scanning restarts from the initial state.
          if (accepting[current]) {
            acceptedAt = index;
          }
          current = INITIAL_STATE;
          continue;
        }

        // Inside a run: open it if necessary and remember the most recent
        // index at which an accepting state was seen.
        if (runStart == null) {
          runStart = index;
        }
        if (accepting[current]) {
          acceptedAt = index;
        }
      }

      // Report a run that is still open when the input ends.
      if (runStart != null && acceptedAt != null && acceptedAt >= runStart) {
        yield [runStart, acceptedAt, tags[current] ?? []];
      }
    }

    return { [Symbol.iterator]: scan };
  }

  /**
   * Runs `match` over `input` and, for every tag of every match, calls the
   * function stored under that tag name in `actions` (if there is one) with
   * the start index, the inclusive end index, and the matched slice of input.
   */
  apply(
    input: readonly number[],
    actions: Record<string, (start: number, end: number, slice: number[]) => void>,
  ): void {
    for (const [start, end, tagList] of this.match(input)) {
      for (const tag of tagList) {
        const handler = actions[tag];
        if (typeof handler !== "function") {
          continue;
        }

        handler(start, end, input.slice(start, end + 1));
      }
    }
  }
}
@@ -0,0 +1,78 @@
1
+ import { Assignment, ExpressionNode, Literal, Node, Variable } from "./nodes.js";
2
+
3
/** Statement type accepted by the symbol table. */
type SymbolTableStatement = Node;

/** Externally supplied symbols: name to numeric value. */
type ExternalSymbolMap = Record<string, number>;
5
+
6
/**
 * Processes a list of statements into a symbol table.
 *
 * - `variables` maps each assigned (or external) name to its expression with
 *   all variable references already substituted.
 * - `symbols` maps names whose expression is a single Literal to its value.
 * - `size` is incremented once per external symbol and once per assignment
 *   that resolves to a Literal.
 * - `main` is the resolved expression assigned to the variable `main`.
 *
 * @throws Error when no `main` variable is assigned, or when an expression
 *   references a name that has not been declared yet.
 */
export default class SymbolTable {
  readonly variables: Record<string, ExpressionNode>;
  readonly symbols: Record<string, number>;
  main: ExpressionNode;
  size: number;

  constructor(statements: SymbolTableStatement[], externalSymbols: ExternalSymbolMap = {}) {
    this.variables = {};
    this.symbols = {};
    this.size = 0;

    this.addExternalSymbols(externalSymbols);
    this.main = this.process(statements);
  }

  /** Registers every external symbol as a Literal variable. */
  private addExternalSymbols(externalSymbols: ExternalSymbolMap): void {
    for (const [key, value] of Object.entries(externalSymbols)) {
      this.variables[key] = new Literal(value);
      this.symbols[key] = value;
      this.size++;
    }
  }

  /**
   * Resolves every Assignment in order (statements of other kinds are
   * ignored), rewriting the assignment's expression in place, and returns
   * the expression bound to `main`.
   */
  private process(statements: SymbolTableStatement[]): ExpressionNode {
    for (const statement of statements) {
      if (!(statement instanceof Assignment)) {
        continue;
      }

      const resolved = this.processExpression(statement.expression);
      this.variables[statement.variable.name] = resolved;
      statement.expression = resolved;

      if (resolved instanceof Literal) {
        this.symbols[statement.variable.name] = resolved.value;
        this.size++;
      }
    }

    const main = this.lookup("main");
    if (!main) {
      throw new Error("No main variable declaration found");
    }

    return main;
  }

  /**
   * Returns the expression declared under `name`, or undefined.
   *
   * Only own properties count: `variables` is a plain object, so a bare
   * index would also find inherited members such as `toString` or
   * `constructor` and treat them as declared variables.
   */
  private lookup(name: string): ExpressionNode | undefined {
    return Object.prototype.hasOwnProperty.call(this.variables, name)
      ? this.variables[name]
      : undefined;
  }

  /**
   * Resolves variable references inside `expr`. Child nodes are replaced in
   * place; a Variable node itself is replaced by a copy of the expression it
   * names.
   */
  private processExpression(expr: ExpressionNode): ExpressionNode {
    // Process children
    const record = expr as unknown as Record<string, unknown>;
    for (const key of Object.keys(expr)) {
      const value = record[key];
      if (value instanceof Node) {
        record[key] = this.processExpression(value as ExpressionNode);
      }
    }

    // Replace variable references with their values
    if (expr instanceof Variable) {
      const value = this.lookup(expr.name);
      if (value == null) {
        throw new Error(`Undeclared identifier ${expr.name}`);
      }

      return this.processExpression(value.copy());
    }

    return expr;
  }
}
package/src/utils.ts ADDED
@@ -0,0 +1,38 @@
1
/**
 * Builds a new set holding every element of `a` followed by every element
 * of `b`. Neither input is modified.
 */
export function union<T>(a: Set<T>, b: Iterable<T>): Set<T> {
  return new Set<T>([...a, ...b]);
}
9
+
10
/**
 * Inserts every item produced by `source` into `target`, mutating `target`
 * in place.
 */
export function addAll<T>(target: Set<T>, source: Iterable<T>): void {
  for (const element of source) target.add(element);
}
18
+
19
/**
 * Tells whether two sets contain exactly the same elements.
 * The same set instance is always equal to itself.
 */
export function equal<T>(a: Set<T>, b: Set<T>): boolean {
  if (a === b) {
    return true;
  }

  return a.size === b.size && [...a].every((item) => b.has(item));
}