@chr33s/pdf-dfa 5.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. package/README.md +91 -0
  2. package/dfa.d.ts +44 -0
  3. package/dist/compile.d.ts +6 -0
  4. package/dist/compile.js +22 -0
  5. package/dist/compile.js.map +1 -0
  6. package/dist/dfa.d.ts +16 -0
  7. package/dist/dfa.js +81 -0
  8. package/dist/dfa.js.map +1 -0
  9. package/dist/grammar.d.ts +11 -0
  10. package/dist/grammar.js +1266 -0
  11. package/dist/grammar.js.map +1 -0
  12. package/dist/index.d.ts +3 -0
  13. package/dist/index.js +4 -0
  14. package/dist/index.js.map +1 -0
  15. package/dist/nodes.d.ts +113 -0
  16. package/dist/nodes.js +241 -0
  17. package/dist/nodes.js.map +1 -0
  18. package/dist/state-machine.d.ts +29 -0
  19. package/dist/state-machine.js +71 -0
  20. package/dist/state-machine.js.map +1 -0
  21. package/dist/symbol-table.d.ts +17 -0
  22. package/dist/symbol-table.js +64 -0
  23. package/dist/symbol-table.js.map +1 -0
  24. package/dist/utils.d.ts +12 -0
  25. package/dist/utils.js +34 -0
  26. package/dist/utils.js.map +1 -0
  27. package/package.json +41 -0
  28. package/scripts/build-grammar.ts +33 -0
  29. package/src/compile.ts +31 -0
  30. package/src/dfa.ts +104 -0
  31. package/src/grammar.js +1312 -0
  32. package/src/grammar.peg +72 -0
  33. package/src/index.ts +9 -0
  34. package/src/nodes.ts +308 -0
  35. package/src/state-machine.ts +94 -0
  36. package/src/symbol-table.ts +78 -0
  37. package/src/utils.ts +38 -0
  38. package/test/compile.test.ts +131 -0
  39. package/test/dfa.test.ts +87 -0
  40. package/test/nodes.test.ts +324 -0
  41. package/test/parse-build.test.ts +50 -0
  42. package/test/state-machine.test.ts +132 -0
  43. package/test/symbol-table.test.ts +69 -0
  44. package/test/utils.test.ts +108 -0
  45. package/tsconfig.json +16 -0
  46. package/tsconfig.test.json +8 -0
  47. package/tsconfig.typecheck.json +16 -0
  48. package/vitest.config.ts +8 -0
package/README.md ADDED
@@ -0,0 +1,91 @@
1
+ # @chr33s/pdf-dfa
2
+
3
+ > Deterministic finite automata compiler packaged as modern ES modules.
4
+
5
+ Distributed as native ES modules with NodeNext resolution (use Node.js 18+ or a modern bundler).
6
+
7
+ `@chr33s/pdf-dfa` lives in the [`chr33s/pdf`](https://github.com/chr33s/pdf) monorepo and provides native ES modules with TypeScript declarations.
8
+
9
+ ## Overview
10
+
11
+ Compiles a regular-expression-like syntax into fast deterministic finite automata.
12
+ Useful for pattern matching against non-string sequences.
13
+
14
+ ## Example
15
+
16
+ This example matches [Hangul](https://en.wikipedia.org/wiki/Hangul) syllables. The symbols defined in the machine are Unicode character categories which could be mapped from code points.
17
+
18
+ Machine definition:
19
+
20
+ ```coffeescript
21
+ # define symbols
22
+ X = 0; # Other character
23
+ L = 1; # Leading consonant
24
+ V = 2; # Medial vowel
25
+ T = 3; # Trailing consonant
26
+ LV = 4; # Composed <LV> syllable
27
+ LVT = 5; # Composed <LVT> syllable
28
+ M = 6; # Tone mark
29
+
30
+ # define variables
31
+ decomposed = L V T?;
32
+ partial = LV T?;
33
+ composed = LVT;
34
+
35
+ # define main state machine pattern
36
+ main = (decomposed | partial | composed) M?;
37
+ ```
38
+
39
+ Visualized, the machine looks like this (double circles are accepting states):
40
+
41
+ ![dfa](https://cloud.githubusercontent.com/assets/19409/19143719/8fbc6a12-8b5a-11e6-868d-99621644d094.png)
42
+
43
+ Compiling and using the machine:
44
+
45
+ ```javascript
46
+ import { compile } from "@chr33s/pdf-dfa";
47
+ import fs from "node:fs";
48
+
49
+ let stateMachine = compile(fs.readFileSync('hangul.machine', 'utf8'));
50
+
51
+ // find matches
52
+ for (let [startIndex, endIndex] of stateMachine.match([0, 1, 2, 3, 0, 4, 6])) {
53
+ console.log('match:', startIndex, endIndex);
54
+ }
55
+ ```
56
+
57
+ Output:
58
+ ```
59
+ match: 1 3
60
+ match: 5 6
61
+ ```
62
+
63
+ ## Syntax
64
+
65
+ A state machine file contains a list of assignment statements. Comments are also allowed
66
+ and are started with the `#` character. Each statement is an assignment of a variable name
67
+ to a value or expression. Assigning a variable to a number produces a symbol, which is
68
+ added to the state machine's alphabet. Assigning a variable to an expression allows
69
+ for substitutions into later expressions. The special `main` variable should always be
70
+ assigned to at the end of the file, and is the final expression that will be compiled.
71
+
72
+ A subset of common regular expression syntax is supported. A list of operators and their
73
+ precedence is below. Operators with the same precedence are evaluated left to right.
74
+
75
+ | Precedence | Syntax | Type | Meaning |
76
+ | ---------- | ---------- | --------------| ------------------------------------------ |
77
+ | 1 | `a \| b` | Alternation | Matches either `a` or `b` |
78
+ | 2 | `a b` | Concatenation | Matches `a` followed by `b` |
79
+ | 3 | `a*` | Repetition | Matches zero or more occurrences of `a` |
80
+ | 3 | `a+` | Repetition | Matches one or more occurrences of `a` |
81
+ | 3 | `a?` | Optional | Matches zero or one occurrence of `a` |
82
+ | 3 | `a{n}` | Repetition | Matches exactly n occurrences of `a` |
83
+ | 3 | `a{n,}` | Repetition | Matches n or more occurrences of `a` |
84
+ | 3 | `a{,n}` | Repetition | Matches up to n occurrences of `a` |
85
+ | 3 | `a{n,m}` | Repetition | Matches n to m occurrences of `a` |
86
+ | 4 | `t:<expr>` | Tag | Tags the following expression with tag `t` |
87
+ | 5 | `(<expr>)` | Grouping | Groups an expression |
88
+
89
+ ## License
90
+
91
+ MIT
package/dfa.d.ts ADDED
@@ -0,0 +1,44 @@
1
+ declare module "peggy" { // Ambient typings for the "peggy" module; only the members declared below are typed.
2
+ export type OutputFormat = "parser" | "source"; // allowed values for GenerateOptions.output
3
+ export type ModuleFormat = "es" | "commonjs" | "umd"; // allowed values for GenerateOptions.format
4
+
5
+ export interface GenerateOptions { // every option is optional
6
+ allowedStartRules?: string[];
7
+ cache?: boolean;
8
+ format?: ModuleFormat;
9
+ output?: OutputFormat;
10
+ }
11
+
12
+ export interface PeggyParser {
13
+ parse<T = unknown>(input: string, options?: unknown): T; // the result type is chosen by the caller via T and defaults to unknown
14
+ }
15
+
16
+ export interface PeggyApi {
17
+ generate(grammar: string, options?: GenerateOptions): string | PeggyParser; // NOTE(review): presumably a string when output is "source" and a parser otherwise - confirm against the peggy docs
18
+ }
19
+
20
+ const peggy: PeggyApi;
21
+ export default peggy; // the API object is the module's default export
22
+ }
23
+
24
+ declare module "graphviz" { // Ambient typings for the "graphviz" module; only the members declared below are typed.
25
+ export interface GraphAttributeMap { // free-form attribute bag accepted by addNode/addEdge
26
+ [key: string]: string | number | undefined;
27
+ }
28
+
29
+ export interface Edge {
30
+ set(attribute: string, value: string): Edge; // returns an Edge, unlike Node.set and Graph.set which return void
31
+ }
32
+
33
+ export interface Node {
34
+ set(attribute: string, value: string): void;
35
+ }
36
+
37
+ export interface Graph {
38
+ set(attribute: string, value: string): void;
39
+ addNode(id: string, attributes?: GraphAttributeMap): Node; // attributes are optional
40
+ addEdge(from: Node, to: Node, attributes?: GraphAttributeMap): Edge; // endpoints are Node objects, not ids
41
+ }
42
+
43
+ export function digraph(id: string): Graph; // the only factory function declared here
44
+ }
@@ -0,0 +1,6 @@
1
+ import StateMachine from "./state-machine.js";
2
+ import SymbolTable from "./symbol-table.js";
3
+ export type ExternalSymbols = Record<string, number>; // name-to-number map accepted as the optional second argument of parse() and compile()
4
+ export declare function parse(source: string, externalSymbols?: ExternalSymbols): SymbolTable; // turns machine source text into a SymbolTable
5
+ export declare function build(symbolTable: SymbolTable): StateMachine; // turns a SymbolTable into a StateMachine
6
+ export default function compile(source: string, externalSymbols?: ExternalSymbols): StateMachine; // default export: source text straight to a StateMachine
@@ -0,0 +1,22 @@
1
+ import buildDFA from "./dfa.js";
2
+ import { parse as parseGrammar } from "./grammar.js";
3
+ import * as nodes from "./nodes.js";
4
+ import StateMachine from "./state-machine.js";
5
+ import SymbolTable from "./symbol-table.js";
6
+ export function parse(source, externalSymbols = {}) { // Parses machine source text and wraps the result in a SymbolTable.
7
+ const ast = parseGrammar(source, { nodes }); // the imported nodes namespace is passed to the grammar parser as an option
8
+ return new SymbolTable(ast, externalSymbols); // externalSymbols is {} when the caller omits it
9
+ }
10
+ export function build(symbolTable) { // Builds a StateMachine from the DFA of symbolTable.main.
11
+ const states = buildDFA(symbolTable.main, symbolTable.size); // symbolTable.size is passed as buildDFA's numSymbols argument
12
+ const config = { // three parallel arrays, each indexed by state
13
+ stateTable: states.map((state) => Array.from(state.transitions)), // each transition row copied into a plain array
14
+ accepting: states.map((state) => state.accepting),
15
+ tags: states.map((state) => Array.from(state.tags)), // each tag Set copied into a plain array
16
+ };
17
+ return new StateMachine(config);
18
+ }
19
+ export default function compile(source, externalSymbols = {}) { // Convenience entry point: parse() followed by build().
20
+ return build(parse(source, externalSymbols));
21
+ }
22
+ //# sourceMappingURL=compile.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"compile.js","sourceRoot":"","sources":["../src/compile.ts"],"names":[],"mappings":"AAAA,OAAO,QAAsB,MAAM,UAAU,CAAC;AAC9C,OAAO,EAAE,KAAK,IAAI,YAAY,EAAE,MAAM,cAAc,CAAC;AACrD,OAAO,KAAK,KAAK,MAAM,YAAY,CAAC;AACpC,OAAO,YAAoC,MAAM,oBAAoB,CAAC;AACtE,OAAO,WAAW,MAAM,mBAAmB,CAAC;AAI5C,MAAM,UAAU,KAAK,CAAC,MAAc,EAAE,kBAAmC,EAAE;IACzE,MAAM,GAAG,GAAG,YAAY,CAAC,MAAM,EAAE,EAAE,KAAK,EAAE,CAAC,CAAC;IAC5C,OAAO,IAAI,WAAW,CAAC,GAAG,EAAE,eAAe,CAAC,CAAC;AAC/C,CAAC;AAED,MAAM,UAAU,KAAK,CAAC,WAAwB;IAC5C,MAAM,MAAM,GAAe,QAAQ,CAAC,WAAW,CAAC,IAAI,EAAE,WAAW,CAAC,IAAI,CAAC,CAAC;IAExE,MAAM,MAAM,GAAuB;QACjC,UAAU,EAAE,MAAM,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,WAAW,CAAC,CAAC;QAChE,SAAS,EAAE,MAAM,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,CAAC,SAAS,CAAC;QACjD,IAAI,EAAE,MAAM,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;KACpD,CAAC;IAEF,OAAO,IAAI,YAAY,CAAC,MAAM,CAAC,CAAC;AAClC,CAAC;AAED,MAAM,CAAC,OAAO,UAAU,OAAO,CAC7B,MAAc,EACd,kBAAmC,EAAE;IAErC,OAAO,KAAK,CAAC,KAAK,CAAC,MAAM,EAAE,eAAe,CAAC,CAAC,CAAC;AAC/C,CAAC"}
package/dist/dfa.d.ts ADDED
@@ -0,0 +1,16 @@
1
+ import { ExpressionNode, PositionNode } from "./nodes.js";
2
+ export interface DFAState { // element type of the array returned by buildDFA
3
+ positions: Set<PositionNode>; // the set of position nodes that identifies this state
4
+ transitions: Uint16Array; // one entry per input symbol, holding an index into the array returned by buildDFA
5
+ accepting: boolean;
6
+ marked: boolean; // bookkeeping flag
7
+ tags: Set<string>; // tag names attached to this state
8
+ }
9
+ /**
10
+ * This is an implementation of the direct regular expression to DFA algorithm described
11
+ * in section 3.9.5 of "Compilers: Principles, Techniques, and Tools" by Aho,
12
+ * Lam, Sethi, and Ullman. http://dragonbook.stanford.edu
13
+ * There is a PDF of the book here:
14
+ * http://www.informatik.uni-bremen.de/agbkb/lehre/ccfl/Material/ALSUdragonbook.pdf
15
+ */
16
+ export default function buildDFA(root: ExpressionNode, numSymbols: number): DFAState[];
package/dist/dfa.js ADDED
@@ -0,0 +1,81 @@
1
+ import { Concatenation, EndMarker, Literal, Tag } from "./nodes.js";
2
+ import { addAll, equal } from "./utils.js";
3
+ const END_MARKER = new EndMarker();
4
+ /**
5
+ * This is an implementation of the direct regular expression to DFA algorithm described
6
+ * in section 3.9.5 of "Compilers: Principles, Techniques, and Tools" by Aho,
7
+ * Lam, Sethi, and Ullman. http://dragonbook.stanford.edu
8
+ * There is a PDF of the book here:
9
+ * http://www.informatik.uni-bremen.de/agbkb/lehre/ccfl/Material/ALSUdragonbook.pdf
10
+ */
11
+ export default function buildDFA(root, numSymbols) { // Returns the list of DFA states; index 0 is the fail state and index 1 the initial state.
12
+ const augmentedRoot = new Concatenation(root, END_MARKER); // root followed by the shared END_MARKER; State uses that marker to detect accepting states
13
+ augmentedRoot.calcFollowpos();
14
+ const failState = new State(new Set(), numSymbols); // state with an empty position set
15
+ const initialState = new State(augmentedRoot.firstpos, numSymbols);
16
+ const dstates = [failState, initialState];
17
+ // while there is an unmarked state S in dstates
18
+ while (true) {
19
+ let s = null;
20
+ for (let j = 1; j < dstates.length; j++) { // the scan starts at 1, so the fail state at index 0 is never expanded
21
+ if (!dstates[j].marked) {
22
+ s = dstates[j];
23
+ break;
24
+ }
25
+ }
26
+ if (s == null) { // no unmarked state left: construction is finished
27
+ break;
28
+ }
29
+ // mark S
30
+ s.marked = true;
31
+ // for each input symbol a
32
+ for (let a = 0; a < numSymbols; a++) {
33
+ // let U be the union of followpos(p) for all
34
+ // p in S that correspond to a
35
+ const u = new Set();
36
+ for (const p of s.positions) {
37
+ if (p instanceof Literal && p.value === a) { // only Literal positions whose value equals the symbol contribute
38
+ addAll(u, p.followpos);
39
+ }
40
+ }
41
+ if (u.size === 0) { // nothing follows on this symbol: transitions[a] keeps its initial 0, the fail state's index
42
+ continue;
43
+ }
44
+ // if U is not in dstates
45
+ let ux = -1; // index of the state whose position set equals U, or -1 if none exists yet
46
+ for (let i = 0; i < dstates.length; i++) { // linear scan over every state created so far
47
+ if (equal(u, dstates[i].positions)) {
48
+ ux = i;
49
+ break;
50
+ }
51
+ }
52
+ if (ux === -1) {
53
+ // Add U as an unmarked state to dstates
54
+ dstates.push(new State(u, numSymbols));
55
+ ux = dstates.length - 1; // the new state is the last element
56
+ }
57
+ s.transitions[a] = ux;
58
+ }
59
+ }
60
+ return dstates;
61
+ }
62
+ class State { // One DFA state: a position set plus its transition row, accepting flag and tags.
63
+ positions;
64
+ transitions;
65
+ accepting;
66
+ marked;
67
+ tags;
68
+ constructor(positions, len) { // positions: set of position nodes; len: number of entries in the transition row
69
+ this.positions = positions; // stored by reference, not copied
70
+ this.transitions = new Uint16Array(len); // zero-filled, so every symbol initially targets state index 0. NOTE(review): 16-bit entries cannot hold indices above 65535 - confirm machines stay below that
71
+ this.accepting = positions.has(END_MARKER); // accepting exactly when the shared END_MARKER is among the positions
72
+ this.marked = false; // buildDFA sets this to true when it expands the state
73
+ this.tags = new Set();
74
+ for (const pos of positions) {
75
+ if (pos instanceof Tag) {
76
+ this.tags.add(pos.name); // collect the name of every Tag position in this state
77
+ }
78
+ }
79
+ }
80
+ }
81
+ //# sourceMappingURL=dfa.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"dfa.js","sourceRoot":"","sources":["../src/dfa.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,aAAa,EAAE,SAAS,EAAkB,OAAO,EAAgB,GAAG,EAAE,MAAM,YAAY,CAAC;AAClG,OAAO,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,YAAY,CAAC;AAE3C,MAAM,UAAU,GAAG,IAAI,SAAS,EAAE,CAAC;AAUnC;;;;;;GAMG;AACH,MAAM,CAAC,OAAO,UAAU,QAAQ,CAAC,IAAoB,EAAE,UAAkB;IACvE,MAAM,aAAa,GAAG,IAAI,aAAa,CAAC,IAAI,EAAE,UAAU,CAAC,CAAC;IAC1D,aAAa,CAAC,aAAa,EAAE,CAAC;IAE9B,MAAM,SAAS,GAAG,IAAI,KAAK,CAAC,IAAI,GAAG,EAAgB,EAAE,UAAU,CAAC,CAAC;IACjE,MAAM,YAAY,GAAG,IAAI,KAAK,CAAC,aAAa,CAAC,QAAQ,EAAE,UAAU,CAAC,CAAC;IACnE,MAAM,OAAO,GAAY,CAAC,SAAS,EAAE,YAAY,CAAC,CAAC;IAEnD,gDAAgD;IAChD,OAAO,IAAI,EAAE,CAAC;QACZ,IAAI,CAAC,GAAiB,IAAI,CAAC;QAE3B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACxC,IAAI,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC;gBACvB,CAAC,GAAG,OAAO,CAAC,CAAC,CAAC,CAAC;gBACf,MAAM;YACR,CAAC;QACH,CAAC;QAED,IAAI,CAAC,IAAI,IAAI,EAAE,CAAC;YACd,MAAM;QACR,CAAC;QAED,SAAS;QACT,CAAC,CAAC,MAAM,GAAG,IAAI,CAAC;QAEhB,0BAA0B;QAC1B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,UAAU,EAAE,CAAC,EAAE,EAAE,CAAC;YACpC,6CAA6C;YAC7C,+BAA+B;YAC/B,MAAM,CAAC,GAAG,IAAI,GAAG,EAAgB,CAAC;YAClC,KAAK,MAAM,CAAC,IAAI,CAAC,CAAC,SAAS,EAAE,CAAC;gBAC5B,IAAI,CAAC,YAAY,OAAO,IAAI,CAAC,CAAC,KAAK,KAAK,CAAC,EAAE,CAAC;oBAC1C,MAAM,CAAC,CAAC,EAAE,CAAC,CAAC,SAAS,CAAC,CAAC;gBACzB,CAAC;YACH,CAAC;YAED,IAAI,CAAC,CAAC,IAAI,KAAK,CAAC,EAAE,CAAC;gBACjB,SAAS;YACX,CAAC;YAED,yBAAyB;YACzB,IAAI,EAAE,GAAG,CAAC,CAAC,CAAC;YACZ,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBACxC,IAAI,KAAK,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,EAAE,CAAC;oBACnC,EAAE,GAAG,CAAC,CAAC;oBACP,MAAM;gBACR,CAAC;YACH,CAAC;YAED,IAAI,EAAE,KAAK,CAAC,CAAC,EAAE,CAAC;gBACd,wCAAwC;gBACxC,OAAO,CAAC,IAAI,CAAC,IAAI,KAAK,CAAC,CAAC,EAAE,UAAU,CAAC,CAAC,CAAC;gBACvC,EAAE,GAAG,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC;YAC1B,CAAC;YAED,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,GAAG,EAAE,CAAC;QACxB,CAAC;IACH,CAAC;IAED,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,MAAM,KAAK;IACA,SAAS,CAAo
B;IAC7B,WAAW,CAAc;IACzB,SAAS,CAAU;IAC5B,MAAM,CAAU;IACP,IAAI,CAAc;IAE3B,YAAY,SAA4B,EAAE,GAAW;QACnD,IAAI,CAAC,SAAS,GAAG,SAAS,CAAC;QAC3B,IAAI,CAAC,WAAW,GAAG,IAAI,WAAW,CAAC,GAAG,CAAC,CAAC;QACxC,IAAI,CAAC,SAAS,GAAG,SAAS,CAAC,GAAG,CAAC,UAAU,CAAC,CAAC;QAC3C,IAAI,CAAC,MAAM,GAAG,KAAK,CAAC;QACpB,IAAI,CAAC,IAAI,GAAG,IAAI,GAAG,EAAU,CAAC;QAE9B,KAAK,MAAM,GAAG,IAAI,SAAS,EAAE,CAAC;YAC5B,IAAI,GAAG,YAAY,GAAG,EAAE,CAAC;gBACvB,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;YAC1B,CAAC;QACH,CAAC;IACH,CAAC;CACF"}
@@ -0,0 +1,11 @@
1
+ declare const peg$allowedStartRules: string[];
2
+ declare class peg$SyntaxError extends SyntaxError { // Error class exported from this module as SyntaxError; every member below is typed any.
3
+ static buildMessage(expected: any, found: any): string;
4
+ constructor(message: any, expected: any, found: any, location: any);
5
+ expected: any; // NOTE(review): presumably what the parser expected at the failure point - confirm against the generated grammar.js
6
+ found: any; // NOTE(review): presumably what the parser actually encountered - confirm against the generated grammar.js
7
+ location: any;
8
+ format(sources: any): string;
9
+ }
10
+ declare function peg$parse(input: any, options: any): any;
11
+ export { peg$allowedStartRules as StartRules, peg$SyntaxError as SyntaxError, peg$parse as parse };