lark-docx2md 0.2.0 → 0.2.1-beta.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -90,6 +90,37 @@ function createClient(appId, appSecret, loggerLevel = LoggerLevel.warn) {
90
90
  await resp.writeFile(filename);
91
91
  return filename;
92
92
  }
93
+ async function getSpreadsheetInfo(token) {
94
+ const data = await call("getSpreadsheetInfo", () => client.sheets.v3.spreadsheet.get({ path: { spreadsheet_token: token } }));
95
+ const spreadsheet = data.spreadsheet ?? data;
96
+ return {
97
+ title: spreadsheet.title ?? "",
98
+ url: spreadsheet.url
99
+ };
100
+ }
101
+ async function listSheets(token) {
102
+ return (await call("listSheets", () => client.sheets.v3.spreadsheetSheet.query({ path: { spreadsheet_token: token } }))).sheets ?? [];
103
+ }
104
+ async function getSheetMeta(token, sheetId) {
105
+ const data = await call("getSheetMeta", () => client.sheets.v3.spreadsheetSheet.get({ path: {
106
+ spreadsheet_token: token,
107
+ sheet_id: sheetId
108
+ } }));
109
+ return data.sheet ?? data;
110
+ }
111
+ async function readSheetValues(token, range) {
112
+ const resp = await client.request({
113
+ url: `https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/${token}/values/${encodeURIComponent(range)}`,
114
+ method: "GET",
115
+ params: {
116
+ valueRenderOption: "UnformattedValue",
117
+ dateTimeRenderOption: "FormattedString"
118
+ },
119
+ headers: { "Content-Type": "application/json; charset=utf-8" }
120
+ });
121
+ if (resp?.code !== 0) throw new Error(`readSheetValues failed: [${resp?.code}] ${resp?.msg}`);
122
+ return resp.data?.valueRange?.values ?? [];
123
+ }
93
124
  return {
94
125
  getWikiNodeInfo,
95
126
  getDocxDocument,
@@ -97,7 +128,11 @@ function createClient(appId, appSecret, loggerLevel = LoggerLevel.warn) {
97
128
  downloadImage,
98
129
  batchGetTmpDownloadUrl,
99
130
  getWhiteboardNodes,
100
- downloadWhiteboardAsImage
131
+ downloadWhiteboardAsImage,
132
+ getSpreadsheetInfo,
133
+ listSheets,
134
+ getSheetMeta,
135
+ readSheetValues
101
136
  };
102
137
  }
103
138
  //#endregion
@@ -194,37 +229,36 @@ function parseElement(e, inline) {
194
229
  }
195
230
  function parseTextRun(tr) {
196
231
  const s = tr.text_element_style;
197
- const textNode = {
232
+ let node = s?.inline_code ? {
233
+ type: "inlineCode",
234
+ content: tr.content
235
+ } : {
198
236
  type: "text",
199
237
  content: tr.content
200
238
  };
201
- if (!s) return textNode;
202
- if (s.bold) return {
203
- type: "bold",
204
- children: [textNode]
239
+ if (!s) return node;
240
+ if (s.link) node = {
241
+ type: "link",
242
+ url: decodeURIComponent(s.link.url),
243
+ children: [node]
205
244
  };
206
- if (s.italic) return {
207
- type: "italic",
208
- children: [textNode]
245
+ if (s.underline) node = {
246
+ type: "underline",
247
+ children: [node]
209
248
  };
210
- if (s.strikethrough) return {
249
+ if (s.strikethrough) node = {
211
250
  type: "strikethrough",
212
- children: [textNode]
251
+ children: [node]
213
252
  };
214
- if (s.underline) return {
215
- type: "underline",
216
- children: [textNode]
217
- };
218
- if (s.inline_code) return {
219
- type: "inlineCode",
220
- content: tr.content
253
+ if (s.italic) node = {
254
+ type: "italic",
255
+ children: [node]
221
256
  };
222
- if (s.link) return {
223
- type: "link",
224
- url: decodeURIComponent(s.link.url),
225
- children: [textNode]
257
+ if (s.bold) node = {
258
+ type: "bold",
259
+ children: [node]
226
260
  };
227
- return textNode;
261
+ return node;
228
262
  }
229
263
  //#endregion
230
264
  //#region src/md-ast/parsers/page.ts
@@ -562,6 +596,17 @@ const whiteboardParser = {
562
596
  }
563
597
  };
564
598
  //#endregion
599
+ //#region src/md-ast/parsers/sheet.ts
600
+ const sheetBlockParser = {
601
+ blockType: 30,
602
+ parse(block, _ctx) {
603
+ return {
604
+ type: "sheet",
605
+ token: block.sheet?.token ?? ""
606
+ };
607
+ }
608
+ };
609
+ //#endregion
565
610
  //#region src/md-ast/parsers/index.ts
566
611
  function registerBuiltinParsers(parser) {
567
612
  parser.register(pageParser);
@@ -588,6 +633,7 @@ function registerBuiltinParsers(parser) {
588
633
  parser.register(tableParser);
589
634
  parser.register(quoteContainerParser);
590
635
  parser.register(whiteboardParser);
636
+ parser.register(sheetBlockParser);
591
637
  }
592
638
  //#endregion
593
639
  //#region src/md-ast/serializer.ts
@@ -783,6 +829,34 @@ const htmlSerializer = {
783
829
  return node.content;
784
830
  }
785
831
  };
832
+ const sheetResolvedSerializer = {
833
+ type: "sheetResolved",
834
+ serialize(node) {
835
+ if (node.type !== "sheetResolved") return "";
836
+ let out = "";
837
+ for (const s of node.sheets) {
838
+ out += `## 工作表:${s.title}\n\n`;
839
+ if (s.error) {
840
+ out += `> ${s.error}\n\n`;
841
+ continue;
842
+ }
843
+ if (!s.rows.length) {
844
+ out += "_(空表)_\n\n";
845
+ continue;
846
+ }
847
+ const [head, ...body] = s.rows;
848
+ if (!head) {
849
+ out += "_(空表)_\n\n";
850
+ continue;
851
+ }
852
+ out += `| ${head.join(" | ")} |\n`;
853
+ out += `| ${head.map(() => "---").join(" | ")} |\n`;
854
+ for (const r of body) out += `| ${r.join(" | ")} |\n`;
855
+ out += "\n";
856
+ }
857
+ return out;
858
+ }
859
+ };
786
860
  function registerBuiltinSerializers(serializer) {
787
861
  serializer.register(pageSerializer);
788
862
  serializer.register(headingSerializer);
@@ -799,6 +873,7 @@ function registerBuiltinSerializers(serializer) {
799
873
  serializer.register(tableSerializer);
800
874
  serializer.register(gridSerializer);
801
875
  serializer.register(htmlSerializer);
876
+ serializer.register(sheetResolvedSerializer);
802
877
  }
803
878
  //#endregion
804
879
  //#region src/whiteboard/utils.ts
@@ -2490,6 +2565,65 @@ function collectImageTokens$1(nodes, out) {
2490
2565
  }
2491
2566
  }
2492
2567
  //#endregion
2568
+ //#region src/sheet/index.ts
2569
+ const escapeCell = (s) => s.replace(/\|/g, "\\|").replace(/\n/g, "<br>");
2570
+ function cellToMd(cell) {
2571
+ if (cell == null) return "";
2572
+ if (typeof cell !== "object") return escapeCell(String(cell));
2573
+ if (Array.isArray(cell)) return cell.map(cellToMd).join("");
2574
+ const o = cell;
2575
+ switch (o.type) {
2576
+ case "text": return escapeCell(String(o.text ?? ""));
2577
+ case "url": return `[${escapeCell(o.text ?? o.link ?? "")}](${o.link ?? ""})`;
2578
+ case "mentionUser": return `@${escapeCell(o.name ?? o.textArr?.join("") ?? "")}`;
2579
+ case "formula": return `\`${escapeCell(o.text ?? "")}\``;
2580
+ default:
2581
+ if (o.text != null) return escapeCell(String(o.text));
2582
+ return "";
2583
+ }
2584
+ }
2585
+ function expandMerges(rows, merges) {
2586
+ const grid = rows.map((r) => r.slice());
2587
+ for (const m of merges) {
2588
+ const r0 = m.start_row_index ?? 0;
2589
+ const r1 = m.end_row_index ?? r0;
2590
+ const c0 = m.start_column_index ?? 0;
2591
+ const c1 = m.end_column_index ?? c0;
2592
+ const v = grid[r0]?.[c0] ?? "";
2593
+ for (let r = r0; r <= r1; r++) {
2594
+ if (!grid[r]) grid[r] = [];
2595
+ for (let c = c0; c <= c1; c++) {
2596
+ if (r === r0 && c === c0) continue;
2597
+ grid[r][c] = v;
2598
+ }
2599
+ }
2600
+ }
2601
+ return grid;
2602
+ }
2603
+ const isEmptyCell = (v) => v == null || v === "";
2604
+ function trimTrailingEmpty(rows) {
2605
+ let lastRow = -1;
2606
+ for (let r = 0; r < rows.length; r++) if ((rows[r] ?? []).some((c) => !isEmptyCell(c))) lastRow = r;
2607
+ if (lastRow < 0) return [];
2608
+ const trimmedRows = rows.slice(0, lastRow + 1);
2609
+ let lastCol = -1;
2610
+ for (const row of trimmedRows) for (let c = (row?.length ?? 0) - 1; c >= 0; c--) if (!isEmptyCell(row[c])) {
2611
+ if (c > lastCol) lastCol = c;
2612
+ break;
2613
+ }
2614
+ if (lastCol < 0) return trimmedRows.map(() => []);
2615
+ return trimmedRows.map((r) => (r ?? []).slice(0, lastCol + 1));
2616
+ }
2617
+ function columnIndexToLetter(n) {
2618
+ let result = "";
2619
+ while (n > 0) {
2620
+ n--;
2621
+ result = String.fromCharCode(65 + n % 26) + result;
2622
+ n = Math.floor(n / 26);
2623
+ }
2624
+ return result;
2625
+ }
2626
+ //#endregion
2493
2627
  //#region src/logger.ts
2494
2628
  const COLORS = {
2495
2629
  [LoggerLevel.fatal]: "\x1B[35m",
@@ -2540,13 +2674,14 @@ var MdTransformer = class {
2540
2674
  async transform(ast) {
2541
2675
  const imageTokens = collectImageTokens(ast);
2542
2676
  const whiteboardTokens = collectWhiteboardTokens(ast);
2543
- replaceInAst(ast, await this.resolveImages(imageTokens), await this.resolveWhiteboards(whiteboardTokens));
2677
+ const sheetTokens = collectSheetTokens(ast);
2678
+ replaceInAst(ast, await this.resolveImages(imageTokens), await this.resolveWhiteboards(whiteboardTokens), await this.resolveSheets(sheetTokens));
2544
2679
  }
2545
2680
  async resolveImages(tokens) {
2546
2681
  const map = /* @__PURE__ */ new Map();
2547
2682
  const uniqueTokens = [...new Set(tokens)];
2548
2683
  if (uniqueTokens.length === 0) return map;
2549
- if (this.opts.imageMode === "online" || this.opts.agent) for (let i = 0; i < uniqueTokens.length; i += 5) {
2684
+ if (this.opts.imageMode === "online" || this.opts.agent === true) for (let i = 0; i < uniqueTokens.length; i += 5) {
2550
2685
  const batch = uniqueTokens.slice(i, i + 5);
2551
2686
  const urlMap = await this.client.batchGetTmpDownloadUrl(batch);
2552
2687
  for (const token of batch) {
@@ -2679,6 +2814,70 @@ var MdTransformer = class {
2679
2814
  }
2680
2815
  return yamlContent;
2681
2816
  }
2817
+ async resolveSheets(tokens) {
2818
+ const map = /* @__PURE__ */ new Map();
2819
+ const uniqueTokens = [...new Set(tokens)];
2820
+ if (uniqueTokens.length === 0) return map;
2821
+ for (const token of uniqueTokens) {
2822
+ if (!token) continue;
2823
+ try {
2824
+ const info = await this.client.getSpreadsheetInfo(token);
2825
+ const list = await this.client.listSheets(token);
2826
+ const resolved = [];
2827
+ for (const s of list) {
2828
+ if (s.hidden) continue;
2829
+ if (s.resource_type && s.resource_type !== "sheet") {
2830
+ resolved.push({
2831
+ title: s.title ?? "",
2832
+ kind: "bitable",
2833
+ rows: [],
2834
+ error: `非网格表(${s.resource_type}),已跳过`
2835
+ });
2836
+ continue;
2837
+ }
2838
+ try {
2839
+ const meta = await this.client.getSheetMeta(token, s.sheet_id);
2840
+ const { row_count = 0, column_count = 0 } = meta.grid_properties ?? {};
2841
+ if (!row_count || !column_count) {
2842
+ resolved.push({
2843
+ title: s.title ?? "",
2844
+ kind: "grid",
2845
+ rows: []
2846
+ });
2847
+ continue;
2848
+ }
2849
+ const endCol = columnIndexToLetter(column_count);
2850
+ const trimmed = trimTrailingEmpty(expandMerges((await this.client.readSheetValues(token, `${s.sheet_id}!A1:${endCol}${row_count}`) ?? []).map((row) => row.map(cellToMd)), meta.merges ?? []));
2851
+ resolved.push({
2852
+ title: s.title ?? "",
2853
+ kind: "grid",
2854
+ rows: trimmed
2855
+ });
2856
+ } catch (e) {
2857
+ resolved.push({
2858
+ title: s.title ?? "",
2859
+ kind: "grid",
2860
+ rows: [],
2861
+ error: `读取失败:${e.message}`
2862
+ });
2863
+ }
2864
+ }
2865
+ map.set(token, {
2866
+ type: "sheetResolved",
2867
+ title: info.title ?? "",
2868
+ sheets: resolved
2869
+ });
2870
+ } catch (e) {
2871
+ logger$1.warn(`Failed to render sheet ${token}:`, e.message);
2872
+ map.set(token, {
2873
+ type: "sheetResolved",
2874
+ title: "",
2875
+ sheets: []
2876
+ });
2877
+ }
2878
+ }
2879
+ return map;
2880
+ }
2682
2881
  };
2683
2882
  function collectImageTokens(node) {
2684
2883
  const tokens = [];
@@ -2694,7 +2893,14 @@ function collectWhiteboardTokens(node) {
2694
2893
  });
2695
2894
  return tokens;
2696
2895
  }
2697
- function replaceInAst(node, imageMap, whiteboardMap) {
2896
+ function collectSheetTokens(node) {
2897
+ const tokens = [];
2898
+ traverseBlockAst(node, (n) => {
2899
+ if (n.type === "sheet") tokens.push(n.token);
2900
+ });
2901
+ return tokens;
2902
+ }
2903
+ function replaceInAst(node, imageMap, whiteboardMap, sheetMap) {
2698
2904
  if (node.type === "image") {
2699
2905
  const newSrc = imageMap.get(node.src);
2700
2906
  if (newSrc) node.src = newSrc;
@@ -2711,11 +2917,18 @@ function replaceInAst(node, imageMap, whiteboardMap) {
2711
2917
  continue;
2712
2918
  }
2713
2919
  }
2920
+ if (child.type === "sheet") {
2921
+ const replacement = sheetMap.get(child.token);
2922
+ if (replacement) {
2923
+ children[i] = replacement;
2924
+ continue;
2925
+ }
2926
+ }
2714
2927
  if (child.type === "image") {
2715
2928
  const newSrc = imageMap.get(child.src);
2716
2929
  if (newSrc) child.src = newSrc;
2717
2930
  }
2718
- replaceInAst(child, imageMap, whiteboardMap);
2931
+ replaceInAst(child, imageMap, whiteboardMap, sheetMap);
2719
2932
  }
2720
2933
  }
2721
2934
  function traverseBlockAst(node, visitor) {
@@ -2730,7 +2943,7 @@ function hasBlockChildren(node) {
2730
2943
  //#region src/converter.ts
2731
2944
  const logger = createLogger("converter");
2732
2945
  function parseWikiUrl(url) {
2733
- const m = url.match(/^https:\/\/[\w.-]+\/(docs|docx|wiki)\/([a-zA-Z0-9]+)/);
2946
+ const m = url.match(/^https:\/\/[\w.-]+\/(docs|docx|wiki|sheets)\/([a-zA-Z0-9]+)/);
2734
2947
  if (!m) throw new Error("Invalid feishu document URL");
2735
2948
  return {
2736
2949
  docType: m[1],
@@ -2743,24 +2956,41 @@ async function convert(opts) {
2743
2956
  const sdkLoggerLevel = opts.agent ? LoggerLevel.error : LoggerLevel.warn;
2744
2957
  const client = createClient(opts.appId, opts.appSecret, sdkLoggerLevel);
2745
2958
  let docToken = rawToken;
2959
+ let objType = docType;
2746
2960
  if (docType === "wiki") {
2747
- docToken = (await client.getWikiNodeInfo(docToken)).obj_token;
2748
- logger.info("Resolved docx token:", docToken);
2749
- }
2750
- const doc = await client.getDocxDocument(docToken);
2751
- const blocks = await client.getDocxBlocks(docToken);
2752
- logger.info(`Fetched ${blocks.length} blocks`);
2753
- const parser = new Parser();
2754
- registerBuiltinParsers(parser);
2755
- const ast = parser.parse(doc, blocks);
2961
+ const node = await client.getWikiNodeInfo(docToken);
2962
+ docToken = node.obj_token;
2963
+ objType = node.obj_type ?? "docx";
2964
+ logger.info("Resolved wiki node:", objType, docToken);
2965
+ } else if (docType === "sheets") objType = "sheet";
2966
+ let ast;
2967
+ if (objType === "sheet") ast = {
2968
+ type: "page",
2969
+ title: [{
2970
+ type: "text",
2971
+ content: (await client.getSpreadsheetInfo(docToken)).title ?? ""
2972
+ }],
2973
+ children: [{
2974
+ type: "sheet",
2975
+ token: docToken
2976
+ }]
2977
+ };
2978
+ else {
2979
+ const doc = await client.getDocxDocument(docToken);
2980
+ const blocks = await client.getDocxBlocks(docToken);
2981
+ logger.info(`Fetched ${blocks.length} blocks`);
2982
+ const parser = new Parser();
2983
+ registerBuiltinParsers(parser);
2984
+ ast = parser.parse(doc, blocks);
2985
+ }
2756
2986
  await new MdTransformer(client, opts).transform(ast);
2757
2987
  const serializer = new MdSerializer();
2758
2988
  registerBuiltinSerializers(serializer);
2759
2989
  const markdown = serializer.serialize(ast);
2760
2990
  let filePath;
2761
- if (!opts.agent) {
2991
+ if (!opts.agent || opts.agent === "local") {
2762
2992
  fs.mkdirSync(opts.output, { recursive: true });
2763
- filePath = path.join(opts.output, `${docToken}.md`);
2993
+ filePath = path.resolve(opts.output, `${docToken}.md`);
2764
2994
  fs.writeFileSync(filePath, markdown);
2765
2995
  logger.info("Downloaded markdown file to", filePath);
2766
2996
  }
@@ -2773,4 +3003,4 @@ async function convert(opts) {
2773
3003
  //#endregion
2774
3004
  export { parseWikiUrl as n, setLogLevel as r, convert as t };
2775
3005
 
2776
- //# sourceMappingURL=converter-ByfbJV0N.js.map
3006
+ //# sourceMappingURL=converter-FwY1m1jm.js.map