@wdprlib/parser 3.1.2 → 3.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. package/dist/index.cjs +295 -118
  2. package/dist/index.js +272 -95
  3. package/package.json +5 -3
  4. package/src/index.ts +163 -0
  5. package/src/lexer/index.ts +20 -0
  6. package/src/lexer/lexer.ts +687 -0
  7. package/src/lexer/tokens.ts +141 -0
  8. package/src/parser/constants.ts +173 -0
  9. package/src/parser/depth.ts +251 -0
  10. package/src/parser/index.ts +18 -0
  11. package/src/parser/parse.ts +315 -0
  12. package/src/parser/postprocess/divAdjacentParagraph.ts +76 -0
  13. package/src/parser/postprocess/index.ts +15 -0
  14. package/src/parser/postprocess/spanStrip.ts +697 -0
  15. package/src/parser/preprocess/expr.ts +265 -0
  16. package/src/parser/preprocess/index.ts +38 -0
  17. package/src/parser/preprocess/typography.ts +67 -0
  18. package/src/parser/preprocess/utils.ts +250 -0
  19. package/src/parser/preprocess/whitespace.ts +111 -0
  20. package/src/parser/rules/block/align.ts +282 -0
  21. package/src/parser/rules/block/bibliography.ts +359 -0
  22. package/src/parser/rules/block/block-list.ts +689 -0
  23. package/src/parser/rules/block/blockquote.ts +238 -0
  24. package/src/parser/rules/block/center.ts +87 -0
  25. package/src/parser/rules/block/clear-float.ts +75 -0
  26. package/src/parser/rules/block/code.ts +187 -0
  27. package/src/parser/rules/block/collapsible.ts +337 -0
  28. package/src/parser/rules/block/comment.ts +73 -0
  29. package/src/parser/rules/block/content-separator.ts +79 -0
  30. package/src/parser/rules/block/definition-list.ts +270 -0
  31. package/src/parser/rules/block/div.ts +400 -0
  32. package/src/parser/rules/block/embed-block.ts +153 -0
  33. package/src/parser/rules/block/footnoteblock.ts +200 -0
  34. package/src/parser/rules/block/heading.ts +142 -0
  35. package/src/parser/rules/block/horizontal-rule.ts +61 -0
  36. package/src/parser/rules/block/html.ts +222 -0
  37. package/src/parser/rules/block/iframe.ts +239 -0
  38. package/src/parser/rules/block/iftags.ts +150 -0
  39. package/src/parser/rules/block/include.ts +179 -0
  40. package/src/parser/rules/block/index.ts +127 -0
  41. package/src/parser/rules/block/list.ts +244 -0
  42. package/src/parser/rules/block/math.ts +183 -0
  43. package/src/parser/rules/block/module/backlinks/index.ts +31 -0
  44. package/src/parser/rules/block/module/backlinks/types.ts +21 -0
  45. package/src/parser/rules/block/module/categories/index.ts +34 -0
  46. package/src/parser/rules/block/module/categories/types.ts +21 -0
  47. package/src/parser/rules/block/module/css/index.ts +37 -0
  48. package/src/parser/rules/block/module/iftags/condition.ts +109 -0
  49. package/src/parser/rules/block/module/iftags/index.ts +26 -0
  50. package/src/parser/rules/block/module/iftags/preprocess.ts +140 -0
  51. package/src/parser/rules/block/module/iftags/resolve.ts +73 -0
  52. package/src/parser/rules/block/module/iftags/types.ts +63 -0
  53. package/src/parser/rules/block/module/include/index.ts +20 -0
  54. package/src/parser/rules/block/module/include/resolve.ts +556 -0
  55. package/src/parser/rules/block/module/index.ts +122 -0
  56. package/src/parser/rules/block/module/join/index.ts +34 -0
  57. package/src/parser/rules/block/module/join/types.ts +23 -0
  58. package/src/parser/rules/block/module/listpages/compiler.ts +453 -0
  59. package/src/parser/rules/block/module/listpages/extract.ts +410 -0
  60. package/src/parser/rules/block/module/listpages/index.ts +83 -0
  61. package/src/parser/rules/block/module/listpages/normalize.ts +390 -0
  62. package/src/parser/rules/block/module/listpages/parser.ts +106 -0
  63. package/src/parser/rules/block/module/listpages/resolve.ts +130 -0
  64. package/src/parser/rules/block/module/listpages/types.ts +513 -0
  65. package/src/parser/rules/block/module/listpages/url-resolver.ts +186 -0
  66. package/src/parser/rules/block/module/listusers/compiler.ts +77 -0
  67. package/src/parser/rules/block/module/listusers/extract.ts +45 -0
  68. package/src/parser/rules/block/module/listusers/index.ts +36 -0
  69. package/src/parser/rules/block/module/listusers/parser.ts +54 -0
  70. package/src/parser/rules/block/module/listusers/resolve.ts +58 -0
  71. package/src/parser/rules/block/module/listusers/types.ts +93 -0
  72. package/src/parser/rules/block/module/mapping.ts +61 -0
  73. package/src/parser/rules/block/module/page-tree/index.ts +38 -0
  74. package/src/parser/rules/block/module/page-tree/types.ts +29 -0
  75. package/src/parser/rules/block/module/rate/index.ts +28 -0
  76. package/src/parser/rules/block/module/rate/types.ts +19 -0
  77. package/src/parser/rules/block/module/resolve.ts +411 -0
  78. package/src/parser/rules/block/module/types-common.ts +59 -0
  79. package/src/parser/rules/block/module/types.ts +61 -0
  80. package/src/parser/rules/block/module/utils.ts +43 -0
  81. package/src/parser/rules/block/module/walk.ts +380 -0
  82. package/src/parser/rules/block/module.ts +164 -0
  83. package/src/parser/rules/block/orphan-li.ts +177 -0
  84. package/src/parser/rules/block/paragraph.ts +157 -0
  85. package/src/parser/rules/block/table-block.ts +726 -0
  86. package/src/parser/rules/block/table.ts +441 -0
  87. package/src/parser/rules/block/tabview.ts +331 -0
  88. package/src/parser/rules/block/toc.ts +129 -0
  89. package/src/parser/rules/block/utils.ts +615 -0
  90. package/src/parser/rules/index.ts +49 -0
  91. package/src/parser/rules/inline/anchor-name.ts +154 -0
  92. package/src/parser/rules/inline/anchor.ts +327 -0
  93. package/src/parser/rules/inline/bibcite.ts +153 -0
  94. package/src/parser/rules/inline/bold.ts +86 -0
  95. package/src/parser/rules/inline/color.ts +140 -0
  96. package/src/parser/rules/inline/comment.ts +90 -0
  97. package/src/parser/rules/inline/equation-ref.ts +115 -0
  98. package/src/parser/rules/inline/expr.ts +526 -0
  99. package/src/parser/rules/inline/footnote.ts +223 -0
  100. package/src/parser/rules/inline/guillemet.ts +64 -0
  101. package/src/parser/rules/inline/html.ts +132 -0
  102. package/src/parser/rules/inline/image.ts +328 -0
  103. package/src/parser/rules/inline/index.ts +150 -0
  104. package/src/parser/rules/inline/italic.ts +74 -0
  105. package/src/parser/rules/inline/line-break.ts +326 -0
  106. package/src/parser/rules/inline/link-anchor.ts +147 -0
  107. package/src/parser/rules/inline/link-single.ts +164 -0
  108. package/src/parser/rules/inline/link-star.ts +134 -0
  109. package/src/parser/rules/inline/link-triple.ts +267 -0
  110. package/src/parser/rules/inline/math-inline.ts +126 -0
  111. package/src/parser/rules/inline/monospace.ts +78 -0
  112. package/src/parser/rules/inline/raw.ts +262 -0
  113. package/src/parser/rules/inline/size.ts +244 -0
  114. package/src/parser/rules/inline/span.ts +424 -0
  115. package/src/parser/rules/inline/strikethrough.ts +115 -0
  116. package/src/parser/rules/inline/subscript.ts +84 -0
  117. package/src/parser/rules/inline/superscript.ts +84 -0
  118. package/src/parser/rules/inline/text.ts +84 -0
  119. package/src/parser/rules/inline/underline.ts +127 -0
  120. package/src/parser/rules/inline/user.ts +147 -0
  121. package/src/parser/rules/inline/utils.ts +344 -0
  122. package/src/parser/rules/types.ts +252 -0
  123. package/src/parser/rules/utils.ts +155 -0
  124. package/src/parser/toc.ts +130 -0
@@ -0,0 +1,441 @@
1
+ /**
2
+ *
3
+ * Block rule for Wikidot pipe-syntax tables.
4
+ *
5
+ * Wikidot tables are written using `||` delimiters at the start of a line:
6
+ *
7
+ * ```
8
+ * || Cell 1 || Cell 2 ||
9
+ * || Cell 3 || Cell 4 ||
10
+ * ```
11
+ *
12
+ * Cell variants:
13
+ * - `||` -- normal cell (`<td>`)
14
+ * - `||~` -- header cell (`<th>`)
15
+ * - `||<` -- left-aligned cell
16
+ * - `||>` -- right-aligned cell (TABLE_RIGHT)
17
+ * - `||=` -- center-aligned cell
18
+ *
19
+ * Colspan is achieved by using multiple consecutive `||` before content:
20
+ * `||||` = colspan 2, `||||||` = colspan 3, etc.
21
+ *
22
+ * Key Wikidot behaviour:
23
+ * - Cells MUST be terminated by another `||` (or variant). Unterminated
24
+ * cells (reaching end of line without a closing `||`) are discarded.
25
+ * - If all cells in a row are unterminated, one empty cell is kept.
26
+ * - Content within cells supports inline markup (bold, links, etc.).
27
+ * - Leading and trailing whitespace in cell content is trimmed.
28
+ *
29
+ * The table element carries `_source: "pipe"` in its attributes to
30
+ * distinguish it from block-syntax tables (`[[table]]`).
31
+ *
32
+ * @module
33
+ */
34
+ import type { Element, TableData, TableRow, TableCell, Alignment } from "@wdprlib/ast";
35
+ import type { BlockRule, ParseContext, RuleResult } from "../types";
36
+ import { currentToken } from "../types";
37
+ import type { TokenType } from "../../../lexer/tokens";
38
+ import { canApplyInlineRule } from "../inline/utils";
39
+
40
/**
 * Token types that act as pipe-table column delimiters: the plain `||`
 * marker together with its header (`||~`) and alignment (`||<`, `||=`,
 * `||>`) variants. A delimiter both opens a cell and closes the previous one.
 */
const TABLE_COL_TOKENS: TokenType[] = [
  "TABLE_MARKER",
  "TABLE_HEADER",
  "TABLE_LEFT",
  "TABLE_CENTER",
  "TABLE_RIGHT",
];
48
+
49
/**
 * Reports whether a token type is one of the pipe-table column delimiters
 * listed in `TABLE_COL_TOKENS`.
 *
 * @param type - Token type to classify.
 * @returns `true` when the type opens a table cell or terminates the previous one.
 */
function isTableColToken(type: TokenType): boolean {
  return TABLE_COL_TOKENS.some((candidate) => candidate === type);
}
58
+
59
/**
 * Properties of a table cell derived from the run of column delimiter
 * tokens that opens it.
 */
interface CellStart {
  /** `true` when the cell was opened with the header delimiter (`||~`). */
  header: boolean;
  /** Number of delimiter tokens in the opening run; each one adds a column to the span. */
  colspan: number;
  /** Alignment requested by a styled delimiter (`||<`, `||=`, `||>`); absent otherwise. */
  align?: Alignment;
}
71
+
72
/**
 * Block rule that recognises pipe-syntax tables.
 *
 * How a table is assembled:
 * 1. The current token must be a table column token flagged as line start;
 *    otherwise the rule reports failure.
 * 2. Every following line that also begins with a column token is handed to
 *    `parseTableRow()` and contributes one row.
 * 3. The first token that is not a line-start column token ends the table.
 * 4. A single `table` element is emitted, tagged with `_source: "pipe"`.
 */
export const tableRule: BlockRule = {
  name: "table",
  startTokens: ["TABLE_MARKER", "TABLE_HEADER", "TABLE_LEFT", "TABLE_CENTER", "TABLE_RIGHT"],
  requiresLineStart: true,

  parse(ctx: ParseContext): RuleResult<Element> {
    const opener = currentToken(ctx);
    if (!(opener.lineStart && isTableColToken(opener.type))) {
      return { success: false };
    }

    const rows: TableRow[] = [];
    let cursor = ctx.pos;

    // Collect one row per consecutive line that starts with a delimiter.
    while (cursor < ctx.tokens.length) {
      const lead = ctx.tokens[cursor];
      const startsRow = Boolean(lead && lead.lineStart && isTableColToken(lead.type));
      if (!startsRow) {
        break;
      }

      const parsedRow = parseTableRow(ctx, cursor);
      rows.push(parsedRow.row);
      cursor += parsedRow.consumed;
    }

    const data: TableData = {
      attributes: { _source: "pipe" },
      rows,
    };

    return {
      success: true,
      elements: [{ element: "table", data }],
      // Total tokens consumed is how far the cursor moved from the rule's start.
      consumed: cursor - ctx.pos,
    };
  },
};
130
+
131
/**
 * Reads the run of delimiter tokens that opens a cell and derives the cell's
 * header flag, alignment and colspan from it.
 *
 * Every delimiter token in the run adds one to the colspan. Plain `||`
 * markers may repeat; the first styled delimiter (`||~`, `||<`, `||=`,
 * `||>`) is counted, sets its property, and ends the run.
 *
 * @param ctx - Parse context; only `ctx.tokens` is read.
 * @param startPos - Token index where the delimiter run is expected to begin.
 * @returns The cell properties and the number of delimiter tokens consumed,
 *   or `null` when the token at `startPos` is not a delimiter.
 */
function parseCellStart(
  ctx: ParseContext,
  startPos: number,
): { cellStart: CellStart; consumed: number } | null {
  let cursor = startPos;
  let colspan = 0;
  let header = false;
  let align: Alignment | undefined;

  while (cursor < ctx.tokens.length) {
    const type = ctx.tokens[cursor]?.type;
    const isPlainMarker = type === "TABLE_MARKER";
    const isStyledMarker =
      type === "TABLE_HEADER" ||
      type === "TABLE_LEFT" ||
      type === "TABLE_CENTER" ||
      type === "TABLE_RIGHT";

    // Anything that is not a delimiter (or a missing token) ends the run.
    if (!isPlainMarker && !isStyledMarker) {
      break;
    }

    colspan++;
    cursor++;

    // Plain markers may stack up to widen the span; keep scanning.
    if (isPlainMarker) {
      continue;
    }

    // A styled delimiter records its property and closes the run.
    if (type === "TABLE_HEADER") {
      header = true;
    } else if (type === "TABLE_LEFT") {
      align = "left";
    } else if (type === "TABLE_CENTER") {
      align = "center";
    } else {
      align = "right";
    }
    break;
  }

  if (colspan === 0) {
    return null;
  }

  return {
    cellStart: { align, header, colspan },
    consumed: cursor - startPos,
  };
}
206
+
207
/**
 * Parses one table row, i.e. a single line of delimiter-separated cells.
 *
 * Cells are read until the line ends (NEWLINE, EOF, or no more tokens) or
 * until something other than a cell delimiter is found where a cell should
 * start. A cell is kept only when `parseTableCell()` reports that it was
 * closed by a following delimiter. If no cell survives, a single empty,
 * non-header cell with span 1 is used as a placeholder. A NEWLINE directly
 * after the row is consumed as part of it.
 *
 * @param ctx - Parse context.
 * @param startPos - Token index of the row's first cell delimiter.
 * @returns The parsed row and the number of tokens it consumed.
 */
function parseTableRow(ctx: ParseContext, startPos: number): { row: TableRow; consumed: number } {
  /** True when the token at `index` is missing or ends the current line. */
  const atLineEnd = (index: number): boolean => {
    const candidate = ctx.tokens[index];
    return !candidate || candidate.type === "NEWLINE" || candidate.type === "EOF";
  };

  const cells: TableCell[] = [];
  let cursor = startPos;

  while (cursor < ctx.tokens.length && !atLineEnd(cursor)) {
    const opener = parseCellStart(ctx, cursor);
    if (!opener) {
      // The next token does not open a cell, so the row is over.
      break;
    }
    cursor += opener.consumed;

    // A delimiter run at the very end of the line only closes the previous cell.
    if (atLineEnd(cursor)) {
      break;
    }

    const parsed = parseTableCell(ctx, cursor, opener.cellStart);
    // Wikidot behaviour: a cell without a closing delimiter is dropped.
    if (parsed.terminatedProperly) {
      cells.push(parsed.cell);
    }
    cursor += parsed.consumed;
  }

  // The line terminator belongs to the row.
  if (ctx.tokens[cursor]?.type === "NEWLINE") {
    cursor++;
  }

  // Wikidot behaviour: a row such as "|| Missing end" still yields one empty cell.
  if (cells.length === 0) {
    cells.push({
      header: false,
      "column-span": 1,
      align: null,
      attributes: {},
      elements: [],
    });
  }

  return {
    row: { attributes: {}, cells },
    consumed: cursor - startPos,
  };
}
283
+
284
/**
 * Parses the content of a single table cell.
 *
 * Inline content is collected until the next table column token or the end
 * of the line. A cell that is not closed by a column token is reported with
 * `terminatedProperly: false`, empty content and no alignment, matching
 * Wikidot behaviour.
 *
 * The EOF token is never consumed, and an inline rule that reports success
 * without consuming any token is ignored so the scan always makes progress.
 *
 * @param ctx - Parse context (`tokens` and `inlineRules` are read).
 * @param startPos - Token index just after the cell-start delimiter run.
 * @param cellStart - Properties derived from the cell-start delimiter run.
 * @returns The parsed cell, the number of tokens consumed, and whether the
 *   cell was closed by a column token.
 */
function parseTableCell(
  ctx: ParseContext,
  startPos: number,
  cellStart: CellStart,
): { cell: TableCell; consumed: number; terminatedProperly: boolean } {
  const { tokens, inlineRules } = ctx;
  const children: Element[] = [];
  let pos = startPos;

  // Whitespace directly after the delimiter never becomes cell content.
  while (tokens[pos]?.type === "WHITESPACE") {
    pos++;
  }

  // Collect inline content until the next column token or the end of the line.
  while (pos < tokens.length) {
    const token = tokens[pos];
    if (!token || token.type === "NEWLINE" || token.type === "EOF") {
      break;
    }
    if (isTableColToken(token.type)) {
      break;
    }

    if (token.type === "WHITESPACE") {
      // Underscore line-break pattern: WHITESPACE + UNDERSCORE + NEWLINE/EOF.
      // Wikidot turns " _\n" into <br /> before table parsing, which lets the
      // cell content continue on the next line.
      const afterUnderscore = tokens[pos + 2];
      if (
        tokens[pos + 1]?.type === "UNDERSCORE" &&
        afterUnderscore &&
        (afterUnderscore.type === "NEWLINE" || afterUnderscore.type === "EOF")
      ) {
        children.push({ element: "line-break" });
        // Swallow the NEWLINE so the cell continues, but leave an EOF
        // sentinel in place for the caller, as every other path here does.
        pos += afterUnderscore.type === "NEWLINE" ? 3 : 2;
        continue;
      }

      // Interior whitespace is kept as text; the edges are trimmed at the end.
      children.push({ element: "text", data: token.value });
      pos++;
      continue;
    }

    // Try each applicable inline rule at the current position.
    const inlineCtx: ParseContext = { ...ctx, pos };
    let matched = false;

    for (const rule of inlineRules) {
      if (!canApplyInlineRule(rule, token)) {
        continue;
      }
      const result = rule.parse(inlineCtx);
      // A zero-width success would leave `pos` unchanged and loop forever,
      // so it is treated as "no match".
      if (result.success && result.consumed > 0) {
        children.push(...result.elements);
        pos += result.consumed;
        matched = true;
        break;
      }
    }

    if (!matched) {
      // No rule matched: keep the token verbatim as text.
      children.push({ element: "text", data: token.value });
      pos++;
    }
  }

  // The cell only counts when a column token follows it.
  // Wikidot behaviour: cells without proper termination have empty content.
  const terminator = tokens[pos];
  const terminatedProperly = terminator ? isTableColToken(terminator.type) : false;

  return {
    cell: {
      header: cellStart.header,
      "column-span": cellStart.colspan,
      align: terminatedProperly ? (cellStart.align ?? null) : null,
      attributes: {},
      elements: terminatedProperly ? trimElements(children) : [],
    },
    consumed: pos - startPos,
    terminatedProperly,
  };
}
395
+
396
/**
 * Strips whitespace from both edges of a list of elements.
 *
 * Text elements at an edge that consist only of whitespace are removed. The
 * first remaining text element at each edge is replaced by a copy with its
 * outer whitespace trimmed (e.g. a leading `" foo"` becomes `"foo"`). The
 * first non-text element met at an edge stops the trimming on that side.
 * The input array is not modified.
 *
 * @param elements - Elements to trim.
 * @returns A new array with the edge whitespace removed.
 */
function trimElements(elements: Element[]): Element[] {
  const trimmed = elements.slice();

  // Front edge: drop blank text nodes, then left-trim the first real one.
  for (;;) {
    const head = trimmed[0];
    if (head?.element !== "text" || typeof head.data !== "string") {
      break;
    }
    const text = head.data.trimStart();
    if (text !== "") {
      trimmed[0] = { element: "text", data: text };
      break;
    }
    trimmed.shift();
  }

  // Back edge: same procedure, working from the end of the list.
  for (;;) {
    const tailIndex = trimmed.length - 1;
    const tail = trimmed[tailIndex];
    if (tail?.element !== "text" || typeof tail.data !== "string") {
      break;
    }
    const text = tail.data.trimEnd();
    if (text !== "") {
      trimmed[tailIndex] = { element: "text", data: text };
      break;
    }
    trimmed.pop();
  }

  return trimmed;
}