lark-docx2md 0.5.3-beta.1 → 0.5.3-beta.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -57,6 +57,7 @@ npx -y lark-docx2md@latest download <url>
57
57
  > - 非 agent 模式下 `--wb-format yaml` 时:`--wb-image-mode` 强制为 `online`。
58
58
  > - `--filter-title`:按标题文本精确匹配(忽略前后空格),收集该标题及其所有子级块,遇到同级或更高级标题时停止。同名标题取首个;未匹配时错误信息附全文标题 yaml 清单。
59
59
  > - `--filter-title-block-id`:按 heading 块 id 严格相等匹配,适用于同名标题或脚本化场景;通常先用 `get-titles` 查出目标 `blockId` 再传入。与 `--filter-title` 互斥。
60
+ > - **命中深层标题时自动注入父级标题(仅 heading 块本身)**:两个过滤参数均会按文档顺序补齐包含路径上的顶层→该标题的所有祖先标题,以保留章节层级上下文;不会引入旁支兄弟或伪造跳级。
60
61
 
61
62
  ## 子命令:`get-titles`
62
63
 
@@ -100,7 +101,7 @@ npx -y lark-docx2md@latest get-titles --agent <url>
100
101
  | Callout | 高亮块 | `>[!TIP]` + 子块 |
101
102
  | Divider | 分割线 | `---` |
102
103
  | Image | 图片 | `![图片](url)` |
103
- | Table / TableCell | 表格 | `<table>` HTML(支持合并单元格) |
104
+ | Table / TableCell | 表格 | GFM 管道表格(合并单元格按值展开) |
104
105
  | QuoteContainer | 引用容器 | `> 子块内容` |
105
106
  | Grid / GridColumn | 分栏布局 | 展平为子块内容 |
106
107
  | Sheet | 电子表格 | GFM 表格(合并单元格自动展开) |
package/dist/cli.js CHANGED
@@ -1,5 +1,5 @@
1
1
  #!/usr/bin/env node
2
- import { a as setLogLevel, n as buildTitleTree, o as serializeYaml, r as getTitles, t as convert } from "./converter-DLMAssSI.js";
2
+ import { a as setLogLevel, n as buildTitleTree, o as serializeYaml, r as getTitles, t as convert } from "./converter-C2dAcaTO.js";
3
3
  import * as fs from "node:fs";
4
4
  import * as path from "node:path";
5
5
  import { Command } from "commander";
@@ -657,6 +657,75 @@ function registerBuiltinParsers(parser) {
657
657
  parser.register(sheetBlockParser);
658
658
  }
659
659
  //#endregion
660
+ //#region src/sheet/index.ts
661
/**
 * Escape a value so it can sit inside a single GFM pipe-table cell.
 *
 * - `|` is backslash-escaped so it is not read as a column separator.
 * - Every line break (`\r\n`, bare `\r`, or `\n`) becomes `<br>`, because a
 *   table row must stay on one physical line.
 *
 * Non-string input is coerced with `String()`; `null`/`undefined` yield "".
 *
 * @param {unknown} s - Raw cell content.
 * @returns {string} Text that is safe to place between two `|` delimiters.
 */
const escapeCell = (s) => String(s ?? "").replace(/\|/g, "\\|").replace(/\r\n|\r|\n/g, "<br>");
662
/**
 * Convert one sheet cell value into inline Markdown for a pipe-table cell.
 *
 * Accepted shapes:
 * - `null` / `undefined`  -> empty string
 * - primitives            -> stringified and escaped
 * - arrays                -> each element converted and concatenated
 * - objects with a `type` -> rendered per type (`text`, `url`, `mentionUser`,
 *   `formula`); any other object falls back to its `text` field, if present.
 *
 * Every textual field is coerced with `String()` before escaping, so numeric
 * or boolean payloads do not raise a TypeError inside `escapeCell`.
 *
 * @param {unknown} cell - Raw cell value.
 * @returns {string} Markdown fragment that is safe inside a table cell.
 */
function cellToMd(cell) {
  if (cell == null) return "";
  if (typeof cell !== "object") return escapeCell(String(cell));
  if (Array.isArray(cell)) return cell.map((part) => cellToMd(part)).join("");

  // Single coercion point shared by every branch below.
  const toText = (value) => escapeCell(String(value ?? ""));

  switch (cell.type) {
    case "text":
      return toText(cell.text);
    case "url": {
      // A raw `|` in the link target would end the table cell early;
      // percent-encode it rather than backslash-escape it.
      const target = String(cell.link ?? "").replace(/\|/g, "%7C");
      return `[${toText(cell.text ?? cell.link)}](${target})`;
    }
    case "mentionUser":
      return `@${toText(cell.name ?? cell.textArr?.join(""))}`;
    case "formula":
      return `\`${toText(cell.text)}\``;
    default:
      return cell.text != null ? toText(cell.text) : "";
  }
}
677
/**
 * Expand merged ranges in a cell grid by copying the top-left value of each
 * merge into every other cell the merge covers.
 *
 * The input grid is not mutated: each row is shallow-copied first. A nullish
 * row is treated as an empty row and a nullish `merges` as "no merges".
 *
 * @param {Array<Array<unknown>>} rows - Grid of cell values, row-major.
 * @param {Array<object>} merges - Merge descriptors carrying
 *   `start_row_index`, `end_row_index`, `start_column_index` and
 *   `end_column_index`. End indexes are iterated inclusively; a missing start
 *   defaults to 0 and a missing end defaults to the matching start.
 * @returns {Array<Array<unknown>>} A new grid with merged areas filled in.
 */
function expandMerges(rows, merges) {
  const grid = rows.map((row) => (row ?? []).slice());
  for (const m of merges ?? []) {
    const r0 = m.start_row_index ?? 0;
    const r1 = m.end_row_index ?? r0;
    const c0 = m.start_column_index ?? 0;
    const c1 = m.end_column_index ?? c0;
    // Value of the merge's anchor (top-left) cell; "" when that cell is absent.
    const anchor = grid[r0]?.[c0] ?? "";
    for (let r = r0; r <= r1; r++) {
      // A merge may reach below the rows present in the input.
      if (!grid[r]) grid[r] = [];
      for (let c = c0; c <= c1; c++) {
        if (r === r0 && c === c0) continue; // keep the anchor cell untouched
        grid[r][c] = anchor;
      }
    }
  }
  return grid;
}
695
/** A cell counts as empty when it is `null`, `undefined`, or the empty string. */
const isEmptyCell = (v) => v == null || v === "";

/**
 * Drop trailing empty rows and trailing empty columns from a cell grid.
 *
 * The last kept row is the last row holding at least one non-empty cell. The
 * last kept column is the right-most non-empty cell across the kept rows.
 * Leading or interior empty rows and columns are preserved, and a nullish row
 * is treated as an empty row.
 *
 * @param {Array<Array<unknown>>} rows - Grid of cell values, row-major.
 * @returns {Array<Array<unknown>>} A new grid; `[]` when every cell is empty.
 */
function trimTrailingEmpty(rows) {
  let lastRow = -1;
  let lastCol = -1;
  for (let r = 0; r < rows.length; r++) {
    const row = rows[r] ?? [];
    // Walk back from the right edge to this row's last non-empty cell.
    let c = row.length - 1;
    while (c >= 0 && isEmptyCell(row[c])) c--;
    if (c < 0) continue; // fully empty row: affects neither bound
    lastRow = r;
    if (c > lastCol) lastCol = c;
  }
  // lastRow >= 0 implies lastCol >= 0, so one emptiness check is enough.
  if (lastRow < 0) return [];
  return rows.slice(0, lastRow + 1).map((row) => (row ?? []).slice(0, lastCol + 1));
}
709
/**
 * Convert a 1-based column number to spreadsheet column letters
 * (1 -> "A", 26 -> "Z", 27 -> "AA", ...).
 *
 * @param {number} n - 1-based column number.
 * @returns {string} Column letters; "" when `n` is not greater than 0.
 */
function columnIndexToLetter(n) {
  const letters = [];
  for (let rest = n; rest > 0; rest = Math.floor(rest / 26)) {
    // Shift to 0-based first: the lettering has no zero digit.
    rest -= 1;
    letters.unshift(String.fromCharCode(65 + (rest % 26)));
  }
  return letters.join("");
}
718
/**
 * Render a grid of already-escaped cell strings as a GFM pipe table.
 *
 * The first row becomes the header. The header and delimiter rows are sized
 * to the widest row in the grid, because GFM discards body cells beyond the
 * header's column count. This also means an empty first row followed by
 * populated rows is rendered instead of being reported as an empty table.
 * Body rows are emitted as given; GFM pads rows that are shorter than the
 * header.
 *
 * @param {Array<Array<string>>} rows - Row-major grid; cells must already be
 *   escaped for table use.
 * @returns {string} The Markdown table followed by a blank line, or the
 *   empty-table placeholder when the grid has no columns at all.
 */
function renderMarkdownTable(rows) {
  const width = rows.reduce((max, row) => Math.max(max, row?.length ?? 0), 0);
  if (width === 0) return "_(空表)_\n\n";

  const [head, ...body] = rows;
  // Pad the header with empty cells up to the widest row.
  const headCells = Array.from({ length: width }, (_, i) => head?.[i] ?? "");

  let out = `| ${headCells.join(" | ")} |\n`;
  out += `| ${headCells.map(() => "---").join(" | ")} |\n`;
  for (const row of body) out += `| ${(row ?? []).join(" | ")} |\n`;
  out += "\n";
  return out;
}
728
+ //#endregion
660
729
  //#region src/md-ast/serializer.ts
661
730
  var MdSerializer = class {
662
731
  constructor() {
@@ -821,22 +890,38 @@ const tableSerializer = {
821
890
  type: "table",
822
891
  serialize(node, ctx) {
823
892
  if (node.type !== "table") return "";
824
- let buf = "<table>\n";
825
- for (const row of node.rows) {
826
- buf += "<tr>\n";
827
- for (const cell of row.cells) {
828
- let attrs = "";
829
- if (cell.rowSpan && cell.rowSpan > 1) attrs += ` rowspan="${cell.rowSpan}"`;
830
- if (cell.colSpan && cell.colSpan > 1) attrs += ` colspan="${cell.colSpan}"`;
831
- const content = ctx.serializeInline(cell.content);
832
- buf += `<td${attrs}>${content}</td>`;
893
+ return renderMarkdownTable(rebuildGrid(node.rows, ctx));
894
+ }
895
+ };
896
+ /**
897
+ * 基于 parser 产出的稀疏 rows(被合并覆盖的格已被过滤,顶格包含 rowSpan/colSpan)
898
+ * 还原为稠密二维字符串网格:Markdown 不支持合并,按"复制左上角值"展开。
899
+ */
900
+ function rebuildGrid(rows, ctx) {
901
+ const occupied = /* @__PURE__ */ new Set();
902
+ const grid = [];
903
+ for (let r = 0; r < rows.length; r++) {
904
+ if (!grid[r]) grid[r] = [];
905
+ let c = 0;
906
+ for (const cell of rows[r].cells) {
907
+ while (occupied.has(`${r}-${c}`)) c++;
908
+ const value = escapeCell(ctx.serializeInline(cell.content));
909
+ const rs = cell.rowSpan ?? 1;
910
+ const cs = cell.colSpan ?? 1;
911
+ for (let rr = r; rr < r + rs; rr++) {
912
+ if (!grid[rr]) grid[rr] = [];
913
+ for (let cc = c; cc < c + cs; cc++) {
914
+ grid[rr][cc] = value;
915
+ if (rr !== r || cc !== c) occupied.add(`${rr}-${cc}`);
916
+ }
833
917
  }
834
- buf += "</tr>\n";
918
+ c += cs;
835
919
  }
836
- buf += "</table>\n";
837
- return buf;
838
920
  }
839
- };
921
+ const maxCols = grid.reduce((m, row) => Math.max(m, row.length), 0);
922
+ for (const row of grid) for (let i = 0; i < maxCols; i++) if (row[i] === void 0) row[i] = "";
923
+ return grid;
924
+ }
840
925
  const gridSerializer = {
841
926
  type: "grid",
842
927
  serialize(node, ctx) {
@@ -862,19 +947,7 @@ const sheetResolvedSerializer = {
862
947
  out += `> ${s.error}\n\n`;
863
948
  continue;
864
949
  }
865
- if (!s.rows.length) {
866
- out += "_(空表)_\n\n";
867
- continue;
868
- }
869
- const [head, ...body] = s.rows;
870
- if (!head) {
871
- out += "_(空表)_\n\n";
872
- continue;
873
- }
874
- out += `| ${head.join(" | ")} |\n`;
875
- out += `| ${head.map(() => "---").join(" | ")} |\n`;
876
- for (const r of body) out += `| ${r.join(" | ")} |\n`;
877
- out += "\n";
950
+ out += renderMarkdownTable(s.rows);
878
951
  }
879
952
  return out;
880
953
  }
@@ -2587,65 +2660,6 @@ function collectImageTokens(nodes, out) {
2587
2660
  }
2588
2661
  }
2589
2662
  //#endregion
2590
- //#region src/sheet/index.ts
2591
- const escapeCell = (s) => s.replace(/\|/g, "\\|").replace(/\n/g, "<br>");
2592
- function cellToMd(cell) {
2593
- if (cell == null) return "";
2594
- if (typeof cell !== "object") return escapeCell(String(cell));
2595
- if (Array.isArray(cell)) return cell.map(cellToMd).join("");
2596
- const o = cell;
2597
- switch (o.type) {
2598
- case "text": return escapeCell(String(o.text ?? ""));
2599
- case "url": return `[${escapeCell(o.text ?? o.link ?? "")}](${o.link ?? ""})`;
2600
- case "mentionUser": return `@${escapeCell(o.name ?? o.textArr?.join("") ?? "")}`;
2601
- case "formula": return `\`${escapeCell(o.text ?? "")}\``;
2602
- default:
2603
- if (o.text != null) return escapeCell(String(o.text));
2604
- return "";
2605
- }
2606
- }
2607
- function expandMerges(rows, merges) {
2608
- const grid = rows.map((r) => r.slice());
2609
- for (const m of merges) {
2610
- const r0 = m.start_row_index ?? 0;
2611
- const r1 = m.end_row_index ?? r0;
2612
- const c0 = m.start_column_index ?? 0;
2613
- const c1 = m.end_column_index ?? c0;
2614
- const v = grid[r0]?.[c0] ?? "";
2615
- for (let r = r0; r <= r1; r++) {
2616
- if (!grid[r]) grid[r] = [];
2617
- for (let c = c0; c <= c1; c++) {
2618
- if (r === r0 && c === c0) continue;
2619
- grid[r][c] = v;
2620
- }
2621
- }
2622
- }
2623
- return grid;
2624
- }
2625
- const isEmptyCell = (v) => v == null || v === "";
2626
- function trimTrailingEmpty(rows) {
2627
- let lastRow = -1;
2628
- for (let r = 0; r < rows.length; r++) if ((rows[r] ?? []).some((c) => !isEmptyCell(c))) lastRow = r;
2629
- if (lastRow < 0) return [];
2630
- const trimmedRows = rows.slice(0, lastRow + 1);
2631
- let lastCol = -1;
2632
- for (const row of trimmedRows) for (let c = (row?.length ?? 0) - 1; c >= 0; c--) if (!isEmptyCell(row[c])) {
2633
- if (c > lastCol) lastCol = c;
2634
- break;
2635
- }
2636
- if (lastCol < 0) return trimmedRows.map(() => []);
2637
- return trimmedRows.map((r) => (r ?? []).slice(0, lastCol + 1));
2638
- }
2639
- function columnIndexToLetter(n) {
2640
- let result = "";
2641
- while (n > 0) {
2642
- n--;
2643
- result = String.fromCharCode(65 + n % 26) + result;
2644
- n = Math.floor(n / 26);
2645
- }
2646
- return result;
2647
- }
2648
- //#endregion
2649
2663
  //#region src/logger.ts
2650
2664
  const COLORS = {
2651
2665
  [LoggerLevel.fatal]: "\x1B[35m",
@@ -3013,7 +3027,9 @@ function toHeadingInfo(block) {
3013
3027
  *
3014
3028
  * 复用要点:
3015
3029
  * - page 节点(block_type=1)始终保留
3016
- * - scanning 阶段把所有 heading 推入 availableHeadings
3030
+ * - scanning 阶段把所有 heading 推入 availableHeadings,并以栈式回溯维护祖先链
3031
+ * - 命中时把祖先 heading(level < 命中 level)按文档顺序注入 collected,
3032
+ * 作为「仅标题」上下文(heading 的非命中兄弟内容不在 blockMap 中,Parser 会自动跳过)
3017
3033
  * - collecting 阶段遇到同级或更高级标题终止
3018
3034
  */
3019
3035
  function createHeadingMatchFilter(match) {
@@ -3021,6 +3037,7 @@ function createHeadingMatchFilter(match) {
3021
3037
  let matchedLevel = 0;
3022
3038
  const collected = [];
3023
3039
  const seen = [];
3040
+ const ancestorStack = [];
3024
3041
  function pageHandler(blocks) {
3025
3042
  for (const block of blocks) {
3026
3043
  if (block.block_type === 1) {
@@ -3031,12 +3048,17 @@ function createHeadingMatchFilter(match) {
3031
3048
  case "scanning": {
3032
3049
  const info = toHeadingInfo(block);
3033
3050
  if (info) {
3051
+ while (ancestorStack.length > 0 && ancestorStack[ancestorStack.length - 1].info.level >= info.level) ancestorStack.pop();
3034
3052
  seen.push(info);
3035
3053
  if (match(block, info)) {
3054
+ for (const entry of ancestorStack) collected.push(entry.block);
3036
3055
  state = "collecting";
3037
3056
  matchedLevel = info.level;
3038
3057
  collected.push(block);
3039
- }
3058
+ } else ancestorStack.push({
3059
+ info,
3060
+ block
3061
+ });
3040
3062
  }
3041
3063
  break;
3042
3064
  }
@@ -3240,4 +3262,4 @@ function buildFilterErrorMessage(opts, result, url, docToken) {
3240
3262
  //#endregion
3241
3263
  export { setLogLevel as a, parseWikiUrl as i, buildTitleTree as n, serializeYaml as o, getTitles as r, convert as t };
3242
3264
 
3243
- //# sourceMappingURL=converter-DLMAssSI.js.map
3265
+ //# sourceMappingURL=converter-C2dAcaTO.js.map