lark-docx2md 0.5.3-beta.1 → 0.5.3-beta.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md
CHANGED
|
@@ -57,6 +57,7 @@ npx -y lark-docx2md@latest download <url>
|
|
|
57
57
|
> - 非 agent 模式下 `--wb-format yaml` 时:`--wb-image-mode` 强制为 `online`。
|
|
58
58
|
> - `--filter-title`:按标题文本精确匹配(忽略前后空格),收集该标题及其所有子级块,遇到同级或更高级标题时停止。同名标题取首个;未匹配时错误信息附全文标题 yaml 清单。
|
|
59
59
|
> - `--filter-title-block-id`:按 heading 块 id 严格相等匹配,适用于同名标题或脚本化场景;通常先用 `get-titles` 查出目标 `blockId` 再传入。与 `--filter-title` 互斥。
|
|
60
|
+
> - **命中深层标题时自动注入父级标题(仅 heading 块本身)**:两个过滤参数都会按文档顺序,补齐从顶层标题到命中标题这条路径上的所有祖先标题,以保留章节层级上下文;不会引入旁支的兄弟标题,也不会伪造被跳过的层级。
|
|
60
61
|
|
|
61
62
|
## 子命令:`get-titles`
|
|
62
63
|
|
|
@@ -100,7 +101,7 @@ npx -y lark-docx2md@latest get-titles --agent <url>
|
|
|
100
101
|
| Callout | 高亮块 | `>[!TIP]` + 子块 |
|
|
101
102
|
| Divider | 分割线 | `---` |
|
|
102
103
|
| Image | 图片 | `` |
|
|
103
|
-
| Table / TableCell | 表格 |
|
|
104
|
+
| Table / TableCell | 表格 | GFM 管道表格(合并单元格按值展开) |
|
|
104
105
|
| QuoteContainer | 引用容器 | `> 子块内容` |
|
|
105
106
|
| Grid / GridColumn | 分栏布局 | 展平为子块内容 |
|
|
106
107
|
| Sheet | 电子表格 | GFM 表格(合并单元格自动展开) |
|
package/dist/cli.js
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
|
-
import { a as setLogLevel, n as buildTitleTree, o as serializeYaml, r as getTitles, t as convert } from "./converter-
|
|
2
|
+
import { a as setLogLevel, n as buildTitleTree, o as serializeYaml, r as getTitles, t as convert } from "./converter-C2dAcaTO.js";
|
|
3
3
|
import * as fs from "node:fs";
|
|
4
4
|
import * as path from "node:path";
|
|
5
5
|
import { Command } from "commander";
|
|
@@ -657,6 +657,75 @@ function registerBuiltinParsers(parser) {
|
|
|
657
657
|
parser.register(sheetBlockParser);
|
|
658
658
|
}
|
|
659
659
|
//#endregion
|
|
660
|
+
//#region src/sheet/index.ts
|
|
661
|
+
/**
 * Escape a value so it can be placed inside a single GFM pipe-table cell.
 *
 * - `|` becomes `\|` so it is not read as a column separator.
 * - Every line break (`\r\n`, lone `\r`, or `\n`) becomes `<br>`, because a
 *   table row must stay on one physical line.
 *
 * Non-string input is coerced with `String()`; `null`/`undefined` yield `""`.
 *
 * @param {*} s - Raw cell text.
 * @returns {string} Text that is safe to join with ` | `.
 */
const escapeCell = (s) => String(s ?? "").replace(/\|/g, "\\|").replace(/\r\n?|\n/g, "<br>");
|
|
662
|
+
/**
 * Convert one sheet cell value into inline Markdown for a pipe-table cell.
 *
 * Accepted shapes, as handled below:
 * - `null` / `undefined`          -> empty string
 * - any primitive                 -> its string form, escaped
 * - an array                      -> each element converted and concatenated
 * - an object with a `type` field -> rendered according to that type
 *
 * Every text field is coerced with `String()` before escaping, so a numeric
 * or boolean `text`/`name` no longer throws inside `escapeCell`.
 *
 * @param {*} cell - Raw cell value.
 * @returns {string} Markdown fragment for the cell.
 */
function cellToMd(cell) {
  if (cell == null) return "";
  if (typeof cell !== "object") return escapeCell(String(cell));
  if (Array.isArray(cell)) return cell.map((part) => cellToMd(part)).join("");
  const seg = cell;
  switch (seg.type) {
    case "text":
      return escapeCell(String(seg.text ?? ""));
    case "url": {
      // The label falls back to the link itself when no display text is given.
      const label = escapeCell(String(seg.text ?? seg.link ?? ""));
      // A raw pipe in the target would be read as a column separator;
      // percent-encoding keeps the URL equivalent.
      const target = String(seg.link ?? "").replace(/\|/g, "%7C");
      return `[${label}](${target})`;
    }
    case "mentionUser":
      return `@${escapeCell(String(seg.name ?? seg.textArr?.join("") ?? ""))}`;
    case "formula":
      return `\`${escapeCell(String(seg.text ?? ""))}\``;
    default:
      // Unknown segment types: keep their text if present, otherwise drop them.
      return seg.text != null ? escapeCell(String(seg.text)) : "";
  }
}
|
|
677
|
+
/**
 * Expand merged ranges by copying each range's top-left value into every
 * other cell of the range (Markdown tables cannot express merged cells).
 *
 * The input is not mutated: every row is shallow-copied first. Missing rows
 * (`null`/`undefined`) are treated as empty rows, and a nullish `merges`
 * list is treated as "no merges".
 *
 * @param {Array<Array<*>|null|undefined>} rows - Grid of cell values.
 * @param {Array<object>|null|undefined} merges - Ranges described by
 *   `start_row_index`, `end_row_index`, `start_column_index` and
 *   `end_column_index`. End indexes are inclusive. A missing start defaults
 *   to 0 and a missing end defaults to its start.
 * @returns {Array<Array<*>>} A new grid with merged ranges filled in.
 */
function expandMerges(rows, merges) {
  const grid = (rows ?? []).map((row) => (row ?? []).slice());
  for (const merge of merges ?? []) {
    const top = merge.start_row_index ?? 0;
    const bottom = merge.end_row_index ?? top;
    const left = merge.start_column_index ?? 0;
    const right = merge.end_column_index ?? left;
    // Value of the anchor (top-left) cell; an absent anchor counts as "".
    const anchor = grid[top]?.[left] ?? "";
    for (let r = top; r <= bottom; r++) {
      // A range may reach below the last row that currently exists.
      if (!grid[r]) grid[r] = [];
      for (let c = left; c <= right; c++) {
        if (r === top && c === left) continue;
        grid[r][c] = anchor;
      }
    }
  }
  return grid;
}
|
|
695
|
+
/**
 * Tell whether a cell holds nothing: `null`, `undefined`, or the empty string.
 * Other falsy values such as `0` and `false` count as content.
 *
 * @param {*} v - Cell value.
 * @returns {boolean} True when the cell is empty.
 */
const isEmptyCell = (v) => v === "" || v === null || v === void 0;
|
|
696
|
+
/**
 * Drop trailing all-empty rows and trailing all-empty columns from a grid.
 *
 * A single scan records the last row that has any content and the right-most
 * column that has content in any row. Rows after the last content row are
 * empty by definition, so they cannot affect the column bound.
 *
 * @param {Array<Array<*>|null|undefined>} rows - Grid of cell values; missing
 *   rows are treated as empty.
 * @returns {Array<Array<*>>} A new grid cut to the content bounding box, or
 *   `[]` when no cell has content.
 */
function trimTrailingEmpty(rows) {
  let lastRow = -1;
  let lastCol = -1;
  for (let r = 0; r < rows.length; r++) {
    const row = rows[r] ?? [];
    // Walk from the right to find this row's last non-empty column.
    let c = row.length - 1;
    while (c >= 0 && isEmptyCell(row[c])) c--;
    if (c < 0) continue;
    lastRow = r;
    if (c > lastCol) lastCol = c;
  }
  // No content anywhere. Once lastRow >= 0, lastCol is necessarily >= 0 too.
  if (lastRow < 0) return [];
  return rows.slice(0, lastRow + 1).map((row) => (row ?? []).slice(0, lastCol + 1));
}
|
|
709
|
+
/**
 * Convert a 1-based column number to spreadsheet column letters
 * (1 -> "A", 26 -> "Z", 27 -> "AA").
 *
 * @param {number} n - 1-based column number.
 * @returns {string} Column letters, or "" when `n` is not greater than 0.
 */
function columnIndexToLetter(n) {
  const letters = [];
  // Bijective base-26: shift to 0-based before taking each digit.
  for (let rest = n; rest > 0; rest = Math.floor((rest - 1) / 26)) {
    letters.unshift(String.fromCharCode(65 + (rest - 1) % 26));
  }
  return letters.join("");
}
|
|
718
|
+
/**
 * Render a grid of already-escaped cell strings as a GFM pipe table.
 *
 * The first row becomes the header. Every row is padded with empty cells to
 * the width of the widest row. GFM ignores body cells beyond the header
 * width, so a ragged grid would otherwise lose data when rendered. Missing
 * rows and `null`/`undefined` cells render as empty cells.
 *
 * @param {Array<Array<*>|null|undefined>} rows - Grid; row 0 is the header.
 * @returns {string} Table text followed by a blank line, or the
 *   empty-table placeholder when the grid has no rows or no columns.
 */
function renderMarkdownTable(rows) {
  const emptyTable = "_(空表)_\n\n";
  if (!rows.length) return emptyTable;
  const width = Array.from(rows, (row) => row?.length ?? 0).reduce((max, len) => Math.max(max, len), 0);
  if (width === 0) return emptyTable;
  const toLine = (cells) => `| ${cells.join(" | ")} |\n`;
  // Array.from (unlike map) also visits holes, so sparse grids get padded too.
  const [head, ...body] = Array.from(rows, (row) => Array.from({ length: width }, (_, i) => row?.[i] ?? ""));
  let out = toLine(head);
  out += toLine(head.map(() => "---"));
  for (const row of body) out += toLine(row);
  out += "\n";
  return out;
}
|
|
728
|
+
//#endregion
|
|
660
729
|
//#region src/md-ast/serializer.ts
|
|
661
730
|
var MdSerializer = class {
|
|
662
731
|
constructor() {
|
|
@@ -821,22 +890,38 @@ const tableSerializer = {
|
|
|
821
890
|
type: "table",
|
|
822
891
|
serialize(node, ctx) {
|
|
823
892
|
if (node.type !== "table") return "";
|
|
824
|
-
|
|
825
|
-
|
|
826
|
-
|
|
827
|
-
|
|
828
|
-
|
|
829
|
-
|
|
830
|
-
|
|
831
|
-
|
|
832
|
-
|
|
893
|
+
return renderMarkdownTable(rebuildGrid(node.rows, ctx));
|
|
894
|
+
}
|
|
895
|
+
};
|
|
896
|
+
/**
 * Rebuild a dense 2-D grid of cell strings from the parser's sparse rows.
 *
 * In the sparse form, cells covered by a merge have already been filtered
 * out and the top-left cell of each merge carries `rowSpan` / `colSpan`.
 * Markdown tables cannot express merged cells, so each merge is expanded by
 * copying the top-left value into every cell it covers.
 *
 * Spans are clamped to at least 1. A span of 0 or a non-numeric span would
 * otherwise drop the cell's content and leave the column cursor unmoved.
 *
 * @param {Array<{cells: Array<object>}>} rows - Sparse rows; each cell has
 *   `content` and optional `rowSpan` / `colSpan`.
 * @param {object} ctx - Serializer context; only `ctx.serializeInline` is
 *   used, to turn a cell's content into inline text.
 * @returns {string[][]} Rectangular grid of escaped cell strings.
 */
function rebuildGrid(rows, ctx) {
  // "row-col" keys of positions already filled by a merge from an earlier cell.
  const covered = new Set();
  const grid = [];
  const spanOf = (n) => Math.max(1, Math.trunc(Number(n)) || 1);
  for (let r = 0; r < rows.length; r++) {
    if (!grid[r]) grid[r] = [];
    let c = 0;
    for (const cell of rows[r].cells) {
      // Skip columns claimed by a merge that started in an earlier row.
      while (covered.has(`${r}-${c}`)) c++;
      const value = escapeCell(ctx.serializeInline(cell.content));
      const rowSpan = spanOf(cell.rowSpan);
      const colSpan = spanOf(cell.colSpan);
      for (let rr = r; rr < r + rowSpan; rr++) {
        if (!grid[rr]) grid[rr] = [];
        for (let cc = c; cc < c + colSpan; cc++) {
          grid[rr][cc] = value;
          if (rr !== r || cc !== c) covered.add(`${rr}-${cc}`);
        }
      }
      c += colSpan;
    }
  }
  // Pad every row to the widest row so the grid is rectangular.
  const maxCols = grid.reduce((widest, row) => Math.max(widest, row.length), 0);
  for (const row of grid) {
    for (let i = 0; i < maxCols; i++) {
      if (row[i] === void 0) row[i] = "";
    }
  }
  return grid;
}
|
|
840
925
|
const gridSerializer = {
|
|
841
926
|
type: "grid",
|
|
842
927
|
serialize(node, ctx) {
|
|
@@ -862,19 +947,7 @@ const sheetResolvedSerializer = {
|
|
|
862
947
|
out += `> ${s.error}\n\n`;
|
|
863
948
|
continue;
|
|
864
949
|
}
|
|
865
|
-
|
|
866
|
-
out += "_(空表)_\n\n";
|
|
867
|
-
continue;
|
|
868
|
-
}
|
|
869
|
-
const [head, ...body] = s.rows;
|
|
870
|
-
if (!head) {
|
|
871
|
-
out += "_(空表)_\n\n";
|
|
872
|
-
continue;
|
|
873
|
-
}
|
|
874
|
-
out += `| ${head.join(" | ")} |\n`;
|
|
875
|
-
out += `| ${head.map(() => "---").join(" | ")} |\n`;
|
|
876
|
-
for (const r of body) out += `| ${r.join(" | ")} |\n`;
|
|
877
|
-
out += "\n";
|
|
950
|
+
out += renderMarkdownTable(s.rows);
|
|
878
951
|
}
|
|
879
952
|
return out;
|
|
880
953
|
}
|
|
@@ -2587,65 +2660,6 @@ function collectImageTokens(nodes, out) {
|
|
|
2587
2660
|
}
|
|
2588
2661
|
}
|
|
2589
2662
|
//#endregion
|
|
2590
|
-
//#region src/sheet/index.ts
|
|
2591
|
-
const escapeCell = (s) => s.replace(/\|/g, "\\|").replace(/\n/g, "<br>");
|
|
2592
|
-
function cellToMd(cell) {
|
|
2593
|
-
if (cell == null) return "";
|
|
2594
|
-
if (typeof cell !== "object") return escapeCell(String(cell));
|
|
2595
|
-
if (Array.isArray(cell)) return cell.map(cellToMd).join("");
|
|
2596
|
-
const o = cell;
|
|
2597
|
-
switch (o.type) {
|
|
2598
|
-
case "text": return escapeCell(String(o.text ?? ""));
|
|
2599
|
-
case "url": return `[${escapeCell(o.text ?? o.link ?? "")}](${o.link ?? ""})`;
|
|
2600
|
-
case "mentionUser": return `@${escapeCell(o.name ?? o.textArr?.join("") ?? "")}`;
|
|
2601
|
-
case "formula": return `\`${escapeCell(o.text ?? "")}\``;
|
|
2602
|
-
default:
|
|
2603
|
-
if (o.text != null) return escapeCell(String(o.text));
|
|
2604
|
-
return "";
|
|
2605
|
-
}
|
|
2606
|
-
}
|
|
2607
|
-
function expandMerges(rows, merges) {
|
|
2608
|
-
const grid = rows.map((r) => r.slice());
|
|
2609
|
-
for (const m of merges) {
|
|
2610
|
-
const r0 = m.start_row_index ?? 0;
|
|
2611
|
-
const r1 = m.end_row_index ?? r0;
|
|
2612
|
-
const c0 = m.start_column_index ?? 0;
|
|
2613
|
-
const c1 = m.end_column_index ?? c0;
|
|
2614
|
-
const v = grid[r0]?.[c0] ?? "";
|
|
2615
|
-
for (let r = r0; r <= r1; r++) {
|
|
2616
|
-
if (!grid[r]) grid[r] = [];
|
|
2617
|
-
for (let c = c0; c <= c1; c++) {
|
|
2618
|
-
if (r === r0 && c === c0) continue;
|
|
2619
|
-
grid[r][c] = v;
|
|
2620
|
-
}
|
|
2621
|
-
}
|
|
2622
|
-
}
|
|
2623
|
-
return grid;
|
|
2624
|
-
}
|
|
2625
|
-
const isEmptyCell = (v) => v == null || v === "";
|
|
2626
|
-
function trimTrailingEmpty(rows) {
|
|
2627
|
-
let lastRow = -1;
|
|
2628
|
-
for (let r = 0; r < rows.length; r++) if ((rows[r] ?? []).some((c) => !isEmptyCell(c))) lastRow = r;
|
|
2629
|
-
if (lastRow < 0) return [];
|
|
2630
|
-
const trimmedRows = rows.slice(0, lastRow + 1);
|
|
2631
|
-
let lastCol = -1;
|
|
2632
|
-
for (const row of trimmedRows) for (let c = (row?.length ?? 0) - 1; c >= 0; c--) if (!isEmptyCell(row[c])) {
|
|
2633
|
-
if (c > lastCol) lastCol = c;
|
|
2634
|
-
break;
|
|
2635
|
-
}
|
|
2636
|
-
if (lastCol < 0) return trimmedRows.map(() => []);
|
|
2637
|
-
return trimmedRows.map((r) => (r ?? []).slice(0, lastCol + 1));
|
|
2638
|
-
}
|
|
2639
|
-
function columnIndexToLetter(n) {
|
|
2640
|
-
let result = "";
|
|
2641
|
-
while (n > 0) {
|
|
2642
|
-
n--;
|
|
2643
|
-
result = String.fromCharCode(65 + n % 26) + result;
|
|
2644
|
-
n = Math.floor(n / 26);
|
|
2645
|
-
}
|
|
2646
|
-
return result;
|
|
2647
|
-
}
|
|
2648
|
-
//#endregion
|
|
2649
2663
|
//#region src/logger.ts
|
|
2650
2664
|
const COLORS = {
|
|
2651
2665
|
[LoggerLevel.fatal]: "\x1B[35m",
|
|
@@ -3013,7 +3027,9 @@ function toHeadingInfo(block) {
|
|
|
3013
3027
|
*
|
|
3014
3028
|
* 复用要点:
|
|
3015
3029
|
* - page 节点(block_type=1)始终保留
|
|
3016
|
-
* - scanning 阶段把所有 heading 推入 availableHeadings
|
|
3030
|
+
* - scanning 阶段把所有 heading 推入 availableHeadings,并以栈式回溯维护祖先链
|
|
3031
|
+
* - 命中时把祖先 heading(level < 命中 level)按文档顺序注入 collected,
|
|
3032
|
+
* 作为「仅标题」上下文(heading 的非命中兄弟内容不在 blockMap 中,Parser 会自动跳过)
|
|
3017
3033
|
* - collecting 阶段遇到同级或更高级标题终止
|
|
3018
3034
|
*/
|
|
3019
3035
|
function createHeadingMatchFilter(match) {
|
|
@@ -3021,6 +3037,7 @@ function createHeadingMatchFilter(match) {
|
|
|
3021
3037
|
let matchedLevel = 0;
|
|
3022
3038
|
const collected = [];
|
|
3023
3039
|
const seen = [];
|
|
3040
|
+
const ancestorStack = [];
|
|
3024
3041
|
function pageHandler(blocks) {
|
|
3025
3042
|
for (const block of blocks) {
|
|
3026
3043
|
if (block.block_type === 1) {
|
|
@@ -3031,12 +3048,17 @@ function createHeadingMatchFilter(match) {
|
|
|
3031
3048
|
case "scanning": {
|
|
3032
3049
|
const info = toHeadingInfo(block);
|
|
3033
3050
|
if (info) {
|
|
3051
|
+
while (ancestorStack.length > 0 && ancestorStack[ancestorStack.length - 1].info.level >= info.level) ancestorStack.pop();
|
|
3034
3052
|
seen.push(info);
|
|
3035
3053
|
if (match(block, info)) {
|
|
3054
|
+
for (const entry of ancestorStack) collected.push(entry.block);
|
|
3036
3055
|
state = "collecting";
|
|
3037
3056
|
matchedLevel = info.level;
|
|
3038
3057
|
collected.push(block);
|
|
3039
|
-
}
|
|
3058
|
+
} else ancestorStack.push({
|
|
3059
|
+
info,
|
|
3060
|
+
block
|
|
3061
|
+
});
|
|
3040
3062
|
}
|
|
3041
3063
|
break;
|
|
3042
3064
|
}
|
|
@@ -3240,4 +3262,4 @@ function buildFilterErrorMessage(opts, result, url, docToken) {
|
|
|
3240
3262
|
//#endregion
|
|
3241
3263
|
export { setLogLevel as a, parseWikiUrl as i, buildTitleTree as n, serializeYaml as o, getTitles as r, convert as t };
|
|
3242
3264
|
|
|
3243
|
-
//# sourceMappingURL=converter-
|
|
3265
|
+
//# sourceMappingURL=converter-C2dAcaTO.js.map
|