@cesarandreslopez/occ 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +58 -28
- package/dist/bin/occ.d.ts +2 -0
- package/{bin → dist/bin}/occ.js +1 -0
- package/dist/bin/occ.js.map +1 -0
- package/dist/src/cli.d.ts +1 -0
- package/dist/src/cli.js +184 -0
- package/dist/src/cli.js.map +1 -0
- package/dist/src/markdown/convert.d.ts +2 -0
- package/dist/src/markdown/convert.js +117 -0
- package/dist/src/markdown/convert.js.map +1 -0
- package/dist/src/output/json.d.ts +4 -0
- package/dist/src/output/json.js +42 -0
- package/dist/src/output/json.js.map +1 -0
- package/dist/src/output/tabular.d.ts +12 -0
- package/dist/src/output/tabular.js +238 -0
- package/dist/src/output/tabular.js.map +1 -0
- package/dist/src/output/tree.d.ts +11 -0
- package/dist/src/output/tree.js +79 -0
- package/dist/src/output/tree.js.map +1 -0
- package/dist/src/parsers/docx.d.ts +2 -0
- package/dist/src/parsers/docx.js +14 -0
- package/dist/src/parsers/docx.js.map +1 -0
- package/dist/src/parsers/index.d.ts +4 -0
- package/dist/src/parsers/index.js +65 -0
- package/dist/src/parsers/index.js.map +1 -0
- package/dist/src/parsers/odf.d.ts +2 -0
- package/dist/src/parsers/odf.js +54 -0
- package/dist/src/parsers/odf.js.map +1 -0
- package/dist/src/parsers/pdf.d.ts +2 -0
- package/dist/src/parsers/pdf.js +43 -0
- package/dist/src/parsers/pdf.js.map +1 -0
- package/dist/src/parsers/pptx.d.ts +2 -0
- package/dist/src/parsers/pptx.js +19 -0
- package/dist/src/parsers/pptx.js.map +1 -0
- package/dist/src/parsers/xlsx.d.ts +2 -0
- package/dist/src/parsers/xlsx.js +21 -0
- package/dist/src/parsers/xlsx.js.map +1 -0
- package/dist/src/progress.d.ts +10 -0
- package/dist/src/progress.js +38 -0
- package/dist/src/progress.js.map +1 -0
- package/dist/src/scc.d.ts +28 -0
- package/dist/src/scc.js +83 -0
- package/dist/src/scc.js.map +1 -0
- package/dist/src/stats.d.ts +30 -0
- package/dist/src/stats.js +88 -0
- package/dist/src/stats.js.map +1 -0
- package/dist/src/structure/extract.d.ts +7 -0
- package/dist/src/structure/extract.js +176 -0
- package/dist/src/structure/extract.js.map +1 -0
- package/dist/src/structure/index.d.ts +3 -0
- package/dist/src/structure/index.js +3 -0
- package/dist/src/structure/index.js.map +1 -0
- package/dist/src/structure/types.d.ts +29 -0
- package/dist/src/structure/types.js +72 -0
- package/dist/src/structure/types.js.map +1 -0
- package/dist/src/types.d.ts +20 -0
- package/dist/src/types.js +2 -0
- package/dist/src/types.js.map +1 -0
- package/dist/src/utils.d.ts +9 -0
- package/dist/src/utils.js +37 -0
- package/dist/src/utils.js.map +1 -0
- package/dist/src/walker.d.ts +13 -0
- package/dist/src/walker.js +59 -0
- package/dist/src/walker.js.map +1 -0
- package/package.json +13 -6
- package/scripts/postinstall.js +28 -1
- package/src/cli.js +0 -126
- package/src/output/json.js +0 -37
- package/src/output/tabular.js +0 -197
- package/src/parsers/docx.js +0 -23
- package/src/parsers/index.js +0 -72
- package/src/parsers/odf.js +0 -85
- package/src/parsers/pdf.js +0 -56
- package/src/parsers/pptx.js +0 -32
- package/src/parsers/xlsx.js +0 -31
- package/src/progress.js +0 -45
- package/src/scc.js +0 -94
- package/src/stats.js +0 -143
- package/src/utils.js +0 -35
- package/src/walker.js +0 -86
package/README.md
CHANGED
|
@@ -19,6 +19,7 @@ OCC scans directories for office documents (DOCX, XLSX, PPTX, PDF, ODT, ODS, ODP
|
|
|
19
19
|
|
|
20
20
|
- **Office document metrics** — words, pages, paragraphs, slides, sheets, rows, cells
|
|
21
21
|
- **Seven formats supported** — DOCX, XLSX, PPTX, PDF, ODT, ODS, ODP
|
|
22
|
+
- **Document structure extraction** — `--structure` parses heading hierarchy into a navigable tree with dotted section codes (1, 1.1, 1.2, ...)
|
|
22
23
|
- **Code metrics via scc** — auto-detects code files and integrates scc output
|
|
23
24
|
- **Multiple output modes** — grouped by type, per-file breakdown, or JSON
|
|
24
25
|
- **CI-friendly** — ASCII-only, no-color mode for pipelines
|
|
@@ -46,6 +47,7 @@ npx @cesarandreslopez/occ docs/ reports/
|
|
|
46
47
|
```bash
|
|
47
48
|
git clone https://github.com/cesarandreslopez/occ.git && cd occ
|
|
48
49
|
npm install
|
|
50
|
+
npm run build
|
|
49
51
|
npm start
|
|
50
52
|
```
|
|
51
53
|
|
|
@@ -64,6 +66,12 @@ occ --by-file docs/
|
|
|
64
66
|
# JSON output
|
|
65
67
|
occ --format json docs/
|
|
66
68
|
|
|
69
|
+
# Extract document structure (heading hierarchy)
|
|
70
|
+
occ --structure docs/
|
|
71
|
+
|
|
72
|
+
# Structure as JSON
|
|
73
|
+
occ --structure --format json docs/
|
|
74
|
+
|
|
67
75
|
# Only specific formats
|
|
68
76
|
occ --include-ext pdf,docx docs/
|
|
69
77
|
|
|
@@ -77,37 +85,55 @@ occ --ci docs/
|
|
|
77
85
|
## Example Output
|
|
78
86
|
|
|
79
87
|
```
|
|
80
|
-
-- Documents
|
|
81
|
-
Format
|
|
82
|
-
|
|
83
|
-
Word
|
|
84
|
-
PDF
|
|
85
|
-
Excel
|
|
86
|
-
|
|
87
|
-
Total
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
88
|
+
-- Documents ---------------------------------------------------------------
|
|
89
|
+
Format Files Words Pages Details Size
|
|
90
|
+
----------------------------------------------------------------------------
|
|
91
|
+
Word 12 34,210 137 1,203 paras 1.2 MB
|
|
92
|
+
PDF 8 22,540 64 4.5 MB
|
|
93
|
+
Excel 3 12 sheets 890 KB
|
|
94
|
+
----------------------------------------------------------------------------
|
|
95
|
+
Total 23 56,750 201 1,203 paras 6.5 MB
|
|
96
|
+
|
|
97
|
+
-- Code (via scc) ----------------------------------------------------------
|
|
98
|
+
Language Files Lines Blanks Comments Code
|
|
99
|
+
----------------------------------------------------------------------------
|
|
100
|
+
JavaScript 15 2340 180 320 1840
|
|
101
|
+
Python 8 1200 90 150 960
|
|
102
|
+
----------------------------------------------------------------------------
|
|
103
|
+
Total 23 3540 270 470 2800
|
|
104
|
+
|
|
105
|
+
Scanned 23 documents (56,750 words, 201 pages) in 120ms
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
### Structure Output (`--structure`)
|
|
109
|
+
|
|
110
|
+
```
|
|
111
|
+
-- Structure: report.docx --------------------------------------------------
|
|
112
|
+
1 Executive Summary
|
|
113
|
+
1.1 Background ......................................... p.1
|
|
114
|
+
1.2 Key Findings ....................................... p.1-2
|
|
115
|
+
2 Methodology
|
|
116
|
+
2.1 Data Collection .................................... p.3
|
|
117
|
+
2.2 Analysis Framework ................................. p.4
|
|
118
|
+
2.2.1 Quantitative Methods ........................... p.4
|
|
119
|
+
2.2.2 Qualitative Methods ............................ p.5
|
|
120
|
+
3 Results ................................................ p.6-8
|
|
121
|
+
4 Conclusions ............................................ p.9
|
|
122
|
+
|
|
123
|
+
4 sections, 10 nodes, max depth 3
|
|
98
124
|
```
|
|
99
125
|
|
|
100
126
|
## Supported Formats
|
|
101
127
|
|
|
102
|
-
| Format | Extension | Metrics |
|
|
103
|
-
|--------|-----------|---------|
|
|
104
|
-
| Word | `.docx` | words, pages*, paragraphs |
|
|
105
|
-
| PDF | `.pdf` | words, pages |
|
|
106
|
-
| Excel | `.xlsx` | sheets, rows, cells |
|
|
107
|
-
| PowerPoint | `.pptx` | words, slides |
|
|
108
|
-
| ODT | `.odt` | words, pages*, paragraphs |
|
|
109
|
-
| ODS | `.ods` | sheets, rows, cells |
|
|
110
|
-
| ODP | `.odp` | words, slides |
|
|
128
|
+
| Format | Extension | Metrics | Structure |
|
|
129
|
+
|--------|-----------|---------|-----------|
|
|
130
|
+
| Word | `.docx` | words, pages*, paragraphs | Yes |
|
|
131
|
+
| PDF | `.pdf` | words, pages | Yes (with page mapping) |
|
|
132
|
+
| Excel | `.xlsx` | sheets, rows, cells | — |
|
|
133
|
+
| PowerPoint | `.pptx` | words, slides | Yes (slide headers) |
|
|
134
|
+
| ODT | `.odt` | words, pages*, paragraphs | Yes (best-effort) |
|
|
135
|
+
| ODS | `.ods` | sheets, rows, cells | — |
|
|
136
|
+
| ODP | `.odp` | words, slides | Yes (slide headers) |
|
|
111
137
|
|
|
112
138
|
\* Pages for Word/ODT are estimated at 250 words/page.
|
|
113
139
|
|
|
@@ -117,6 +143,7 @@ Total 23 3540 270 470 2800
|
|
|
117
143
|
|------|-------------|---------|
|
|
118
144
|
| `--by-file` / `-f` | Row per file | grouped by type |
|
|
119
145
|
| `--format <type>` | `tabular` or `json` | `tabular` |
|
|
146
|
+
| `--structure` | Extract and display document heading hierarchy | off |
|
|
120
147
|
| `--include-ext <exts>` | Comma-separated extensions | all supported |
|
|
121
148
|
| `--exclude-ext <exts>` | Comma-separated to skip | none |
|
|
122
149
|
| `--exclude-dir <dirs>` | Directories to skip | `node_modules,.git` |
|
|
@@ -151,12 +178,15 @@ Tools like `scc`, `cloc`, and `tokei` give you instant visibility into codebases
|
|
|
151
178
|
|
|
152
179
|
- **Context budgeting** — LLMs have finite context windows. OCC's word and page counts let agents estimate how much of a document set they can ingest before hitting token limits
|
|
153
180
|
- **Prioritization** — an agent deciding which documents to read can use OCC's JSON output to rank files by size, word count, or type, focusing on the most relevant content first
|
|
181
|
+
- **RAG chunk mapping** — `--structure --format json` outputs heading trees with character offsets, enabling chunk-to-section mapping, scoped retrieval, and citation paths in RAG pipelines
|
|
154
182
|
- **Repository mapping** — agents exploring an unfamiliar codebase can run `occ --format json` to build a structured inventory of all non-code content alongside `scc` code metrics
|
|
155
183
|
- **Pipeline integration** — JSON output pipes directly into agent toolchains for automated document analysis, summarization, or compliance checking
|
|
156
184
|
|
|
157
185
|
## How It Works
|
|
158
186
|
|
|
159
|
-
OCC uses [fast-glob](https://github.com/mrmlnc/fast-glob) for file discovery, dispatches to format-specific parsers (mammoth for DOCX, pdf-parse for PDF, SheetJS for XLSX, JSZip + officeparser for PPTX/ODF), aggregates metrics, and renders output via cli-table3. For code metrics, it shells out to a vendored [scc](https://github.com/boyter/scc) binary (auto-downloaded during `npm install`, with PATH fallback).
|
|
187
|
+
OCC is written in TypeScript and uses [fast-glob](https://github.com/mrmlnc/fast-glob) for file discovery, dispatches to format-specific parsers (mammoth for DOCX, pdf-parse for PDF, SheetJS for XLSX, JSZip + officeparser for PPTX/ODF), aggregates metrics, and renders output via cli-table3. For code metrics, it shells out to a vendored [scc](https://github.com/boyter/scc) binary (auto-downloaded during `npm install`, with PATH fallback).
|
|
188
|
+
|
|
189
|
+
For structure extraction (`--structure`), documents are first converted to markdown (mammoth + [turndown](https://github.com/mixmark-io/turndown) for DOCX, pdf-parse with page markers for PDF), then headers are extracted and assembled into a tree with dotted section codes.
|
|
160
190
|
|
|
161
191
|
## Contributing
|
|
162
192
|
|
package/{bin → dist/bin}/occ.js
RENAMED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"occ.js","sourceRoot":"","sources":["../../bin/occ.ts"],"names":[],"mappings":";AACA,OAAO,EAAE,GAAG,EAAE,MAAM,eAAe,CAAC;AACpC,GAAG,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
/**
 * Entry point of the `occ` command-line interface.
 *
 * @param argv - Argument vector handed to the command parser (the bin script passes `process.argv`).
 * @returns A promise that settles once parsing and the selected action have finished.
 */
export declare function run(argv: string[]): Promise<void>;
|
package/dist/src/cli.js
ADDED
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
import { Command, Option } from 'commander';
|
|
2
|
+
import { readFile, writeFile } from 'node:fs/promises';
|
|
3
|
+
import { fileURLToPath } from 'node:url';
|
|
4
|
+
import path from 'node:path';
|
|
5
|
+
import { findFiles } from './walker.js';
|
|
6
|
+
import { parseFiles } from './parsers/index.js';
|
|
7
|
+
import { aggregate } from './stats.js';
|
|
8
|
+
import { formatDocumentTable, formatSccTable, formatSummaryLine } from './output/tabular.js';
|
|
9
|
+
import { formatJson } from './output/json.js';
|
|
10
|
+
import { checkScc, runScc } from './scc.js';
|
|
11
|
+
import { createProgress } from './progress.js';
|
|
12
|
+
import { documentToMarkdown } from './markdown/convert.js';
|
|
13
|
+
import { extractFromMarkdown } from './structure/index.js';
|
|
14
|
+
import { formatStructureTree } from './output/tree.js';
|
|
15
|
+
import { getExtension } from './utils.js';
|
|
16
|
+
// Find package.json — works from both src/ (dev) and dist/src/ (built)
|
|
17
|
+
// Directory containing this module, derived from import.meta.url; used as the base for locating package.json.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
|
18
|
+
/**
 * Read and parse the package manifest that belongs to this CLI.
 *
 * Looks for `package.json` one directory above this module and, failing that,
 * two directories above. Any read or parse failure at a level simply moves on
 * to the next candidate.
 *
 * @returns The parsed manifest, or `{ version: '0.0.0' }` when no candidate could be loaded.
 */
async function loadPkg() {
    const parentHops = ['..', '../..'];
    let index = 0;
    while (index < parentHops.length) {
        const hop = parentHops[index];
        index += 1;
        try {
            const manifestPath = path.resolve(__dirname, hop, 'package.json');
            const raw = await readFile(manifestPath, 'utf8');
            return JSON.parse(raw);
        }
        catch {
            // Unreadable or unparsable at this level: fall through to the next candidate.
        }
    }
    return { version: '0.0.0' };
}
|
|
27
|
+
// Package manifest, loaded once via top-level await when this module is imported; its version feeds `--version`.
const pkg = await loadPkg();
|
|
28
|
+
/**
 * Define the `occ` command-line interface and run it against `argv`.
 *
 * All options are registered on a fresh commander `Command`; the action hands
 * the positional directories and parsed options to `execute`. Any failure
 * raised by `execute` is reported on stderr as `Error: <message>` and the
 * process exits with status 1.
 *
 * @param argv Argument vector passed to commander's `parseAsync`.
 * @returns Resolves once parsing and the action have completed.
 */
export async function run(argv) {
    const program = new Command();
    program
        .name('occ')
        .description('Office Cloc and Count — scc-style summary tables for office documents')
        .version(pkg.version)
        .argument('[directories...]', 'directories to scan', [])
        .option('-f, --by-file', 'show a row per file instead of grouped by type')
        .option('--format <type>', 'output format: tabular or json', 'tabular')
        .option('--include-ext <exts>', 'comma-separated extensions to include')
        .option('--exclude-ext <exts>', 'comma-separated extensions to exclude')
        .option('--exclude-dir <dirs>', 'directories to skip (comma-separated)', 'node_modules,.git')
        .option('--no-gitignore', 'disable .gitignore respect')
        .addOption(new Option('--sort <col>', 'sort by: files, name, words, size').choices(['files', 'name', 'words', 'size']).default('files'))
        .option('-o, --output <file>', 'write output to file')
        .option('--ci', 'ASCII-only output, no colors')
        .option('--large-file-limit <mb>', 'skip files over this size in MB', '50')
        .option('--no-code', 'skip scc code analysis')
        .option('--structure', 'extract and display document structure')
        .action(async (directories, opts) => {
            try {
                await execute(directories, opts);
            }
            catch (err) {
                // Anything can be thrown, not only Error instances. Reading `.message`
                // blindly would print "undefined" for a thrown string and would itself
                // throw for a thrown null/undefined, so fall back to String(err).
                const message = err != null && err.message !== undefined ? err.message : String(err);
                process.stderr.write(`Error: ${message}\n`);
                process.exit(1);
            }
        });
    await program.parseAsync(argv);
}
|
|
59
|
+
/**
 * Parse and validate the `--large-file-limit` option.
 *
 * @param value Raw option text (a size in MB).
 * @returns The limit as a positive, finite number.
 * @throws {Error} When the text does not parse to a positive finite number.
 */
function validateLargeFileLimit(value) {
    const n = parseFloat(value);
    // Number.isFinite rejects NaN and also +/-Infinity, which parseFloat produces
    // for inputs such as "Infinity" or "1e999" and which is not a usable size limit.
    if (!Number.isFinite(n) || n <= 0) {
        throw new Error(`Invalid --large-file-limit value: "${value}" (must be a positive number)`);
    }
    return n;
}
|
|
66
|
+
// Extensions for which structure extraction is attempted when --structure is set; files with other extensions are filtered out beforehand.
const STRUCTURABLE_EXTS = new Set(['docx', 'pdf', 'pptx', 'odt', 'odp']);
|
|
67
|
+
/**
 * Convert each file to markdown and extract its heading structure, working
 * through the list in fixed-size batches.
 *
 * Every file in a batch is converted concurrently. A file whose conversion
 * rejects, or for which no markdown is produced, is left out of the result
 * (best-effort), but progress is still reported for it.
 *
 * @param files Entries exposing a `path` property.
 * @param concurrency Maximum number of files processed at once. Values below 1
 *   or NaN are treated as 1; fractional values are rounded down.
 * @param onProgress Optional callback invoked as `(1, path)` after each file settles.
 * @returns One `{ file, structure, markdown }` record per successfully processed file.
 */
async function extractStructures(files, concurrency, onProgress) {
    // The batch size is the loop step: 0 or a negative step would never advance
    // (infinite loop) and NaN would skip every file, so clamp to at least 1.
    const batchSize = concurrency >= 1 ? Math.floor(concurrency) : 1;
    const results = [];
    for (let i = 0; i < files.length; i += batchSize) {
        const batch = files.slice(i, i + batchSize);
        // allSettled keeps one failing document from aborting the whole batch.
        const batchResults = await Promise.allSettled(batch.map(async (f) => {
            const markdown = await documentToMarkdown(f.path);
            if (markdown == null)
                return null;
            const structure = extractFromMarkdown(markdown);
            return { file: f.path, structure, markdown };
        }));
        for (let j = 0; j < batchResults.length; j++) {
            const r = batchResults[j];
            if (r.status === 'fulfilled' && r.value) {
                results.push(r.value);
            }
            if (onProgress)
                onProgress(1, batch[j]?.path);
        }
    }
    return results;
}
|
|
89
|
+
/**
 * Run one complete scan: discover documents, parse them, aggregate metrics,
 * optionally collect scc code metrics and document structures, then render
 * the report to a file or stdout.
 *
 * @param directories Directories passed on the command line.
 * @param opts Parsed command-line options.
 * @returns Resolves after the report has been written.
 */
async function execute(directories, opts) {
    const startedAt = Date.now();

    let excludeDirs;
    if (opts.excludeDir) {
        excludeDirs = opts.excludeDir.split(',').map(entry => entry.trim());
    }
    else {
        excludeDirs = ['node_modules', '.git'];
    }

    // scc is only looked up when code analysis has not been switched off.
    const wantsCode = opts.code !== false;
    const sccBinary = wantsCode ? await checkScc() : null;

    // Discover office documents.
    const discovery = await findFiles(directories, {
        includeExt: opts.includeExt,
        excludeExt: opts.excludeExt,
        excludeDir: excludeDirs,
        noGitignore: !opts.gitignore,
        largeFileLimit: validateLargeFileLimit(opts.largeFileLimit),
    });
    const { files, skipped } = discovery;

    // Progress goes to stderr and is never shown for JSON output.
    const showProgress = opts.format === 'json' ? false : process.stderr.isTTY;
    const forwardTo = (bar) => (inc, detail) => bar.update(inc, detail);

    let parsed = [];
    if (files.length > 0) {
        const bar = createProgress({ total: files.length, label: 'Parsing', enabled: showProgress });
        parsed = await parseFiles(files, 10, forwardTo(bar));
        bar.done();
    }

    const stats = aggregate(parsed, { byFile: opts.byFile, sort: opts.sort });

    let sccData = null;
    if (wantsCode) {
        if (showProgress) {
            process.stderr.write('\rAnalyzing code with scc...');
        }
        sccData = await runScc(sccBinary, directories, {
            byFile: opts.byFile,
            excludeDir: excludeDirs,
            sort: opts.sort,
            ci: opts.ci,
            noGitignore: !opts.gitignore,
        });
        if (showProgress) {
            // Blank out the status line that was written above.
            const width = process.stderr.columns || 80;
            process.stderr.write(`\r${' '.repeat(width)}\r`);
        }
    }

    // Structure extraction
    let structureResults = [];
    if (opts.structure) {
        const candidates = files.filter(entry => STRUCTURABLE_EXTS.has(getExtension(entry.path)));
        if (candidates.length > 0) {
            const bar = createProgress({ total: candidates.length, label: 'Extracting structure', enabled: showProgress });
            structureResults = await extractStructures(candidates, 10, forwardTo(bar));
            bar.done();
        }
    }

    // Format output
    let output;
    if (opts.format === 'json') {
        output = formatJson(stats, sccData, opts.structure ? structureResults : undefined);
    }
    else {
        const sections = [];
        if (files.length !== 0 || (sccData && sccData.length !== 0)) {
            if (files.length > 0) {
                sections.push(formatDocumentTable(stats, { ci: opts.ci }));
            }
            if (sccData && sccData.length > 0) {
                sections.push(formatSccTable(sccData, { ci: opts.ci, byFile: opts.byFile }));
            }
            // Structure trees, one per document
            for (const tree of structureResults) {
                sections.push(formatStructureTree(tree, { ci: opts.ci }));
            }
            const elapsed = Date.now() - startedAt;
            const summary = formatSummaryLine(stats, sccData, elapsed, { ci: opts.ci });
            if (summary) {
                sections.push(summary);
            }
        }
        else {
            sections.push('No files found.');
        }
        if (skipped.length > 0) {
            sections.push(`\n${skipped.length} file(s) skipped (use --large-file-limit to adjust)`);
        }
        output = `${sections.join('\n')}\n`;
    }

    if (opts.output) {
        await writeFile(opts.output, output);
        return;
    }
    process.stdout.write(output);
}
|
|
184
|
+
//# sourceMappingURL=cli.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cli.js","sourceRoot":"","sources":["../../src/cli.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,EAAE,MAAM,WAAW,CAAC;AAC5C,OAAO,EAAE,QAAQ,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAC;AACvD,OAAO,EAAE,aAAa,EAAE,MAAM,UAAU,CAAC;AACzC,OAAO,IAAI,MAAM,WAAW,CAAC;AAC7B,OAAO,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AACxC,OAAO,EAAE,UAAU,EAAE,MAAM,oBAAoB,CAAC;AAChD,OAAO,EAAE,SAAS,EAAE,MAAM,YAAY,CAAC;AACvC,OAAO,EAAE,mBAAmB,EAAE,cAAc,EAAE,iBAAiB,EAAE,MAAM,qBAAqB,CAAC;AAC7F,OAAO,EAAE,UAAU,EAAE,MAAM,kBAAkB,CAAC;AAC9C,OAAO,EAAE,QAAQ,EAAE,MAAM,EAAE,MAAM,UAAU,CAAC;AAC5C,OAAO,EAAE,cAAc,EAAE,MAAM,eAAe,CAAC;AAC/C,OAAO,EAAE,kBAAkB,EAAE,MAAM,uBAAuB,CAAC;AAC3D,OAAO,EAAE,mBAAmB,EAAE,MAAM,sBAAsB,CAAC;AAC3D,OAAO,EAAE,mBAAmB,EAAuB,MAAM,kBAAkB,CAAC;AAK5E,OAAO,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC;AAiB1C,uEAAuE;AACvE,MAAM,SAAS,GAAG,IAAI,CAAC,OAAO,CAAC,aAAa,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC;AAC/D,KAAK,UAAU,OAAO;IACpB,KAAK,MAAM,GAAG,IAAI,CAAC,IAAI,EAAE,OAAO,CAAC,EAAE,CAAC;QAClC,IAAI,CAAC;YAAC,OAAO,IAAI,CAAC,KAAK,CAAC,MAAM,QAAQ,CAAC,IAAI,CAAC,OAAO,CAAC,SAAS,EAAE,GAAG,EAAE,cAAc,CAAC,EAAE,MAAM,CAAC,CAAC,CAAC;QAAC,CAAC;QAChG,MAAM,CAAC,CAAC,cAAc,CAAC,CAAC;IAC1B,CAAC;IACD,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,CAAC;AAC9B,CAAC;AACD,MAAM,GAAG,GAAG,MAAM,OAAO,EAAE,CAAC;AAE5B,MAAM,CAAC,KAAK,UAAU,GAAG,CAAC,IAAc;IACtC,MAAM,OAAO,GAAG,IAAI,OAAO,EAAE,CAAC;IAE9B,OAAO;SACJ,IAAI,CAAC,KAAK,CAAC;SACX,WAAW,CAAC,uEAAuE,CAAC;SACpF,OAAO,CAAC,GAAG,CAAC,OAAO,CAAC;SACpB,QAAQ,CAAC,kBAAkB,EAAE,qBAAqB,EAAE,EAAE,CAAC;SACvD,MAAM,CAAC,eAAe,EAAE,gDAAgD,CAAC;SACzE,MAAM,CAAC,iBAAiB,EAAE,gCAAgC,EAAE,SAAS,CAAC;SACtE,MAAM,CAAC,sBAAsB,EAAE,uCAAuC,CAAC;SACvE,MAAM,CAAC,sBAAsB,EAAE,uCAAuC,CAAC;SACvE,MAAM,CAAC,sBAAsB,EAAE,uCAAuC,EAAE,mBAAmB,CAAC;SAC5F,MAAM,CAAC,gBAAgB,EAAE,4BAA4B,CAAC;SACtD,SAAS,CAAC,IAAI,MAAM,CAAC,cAAc,EAAE,mCAAmC,CAAC,CAAC,OAAO,CAAC,CAAC,OAAO,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;SACvI,MAAM,CAAC,qBAAqB,EAAE,sBAAsB,CAAC;SACrD,MAAM,CAAC,MAAM,EAAE,8BAA8B,CAAC;SAC9C,MAAM,CAAC,yBAAyB,EAAE,iCAAiC
,EAAE,IAAI,CAAC;SAC1E,MAAM,CAAC,WAAW,EAAE,wBAAwB,CAAC;SAC7C,MAAM,CAAC,aAAa,EAAE,wCAAwC,CAAC;SAC/D,MAAM,CAAC,KAAK,EAAE,WAAqB,EAAE,IAAgB,EAAE,EAAE;QACxD,IAAI,CAAC;YACH,MAAM,OAAO,CAAC,WAAW,EAAE,IAAI,CAAC,CAAC;QACnC,CAAC;QAAC,OAAO,GAAY,EAAE,CAAC;YACtB,MAAM,KAAK,GAAG,GAAY,CAAC;YAC3B,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,UAAU,KAAK,CAAC,OAAO,IAAI,CAAC,CAAC;YAClD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClB,CAAC;IACH,CAAC,CAAC,CAAC;IAEL,MAAM,OAAO,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC;AACjC,CAAC;AAED,SAAS,sBAAsB,CAAC,KAAa;IAC3C,MAAM,CAAC,GAAG,UAAU,CAAC,KAAK,CAAC,CAAC;IAC5B,IAAI,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;QAC9B,MAAM,IAAI,KAAK,CAAC,sCAAsC,KAAK,+BAA+B,CAAC,CAAC;IAC9F,CAAC;IACD,OAAO,CAAC,CAAC;AACX,CAAC;AAED,MAAM,iBAAiB,GAAG,IAAI,GAAG,CAAC,CAAC,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,KAAK,EAAE,KAAK,CAAC,CAAC,CAAC;AAEzE,KAAK,UAAU,iBAAiB,CAC9B,KAAkB,EAClB,WAAmB,EACnB,UAAmD;IAEnD,MAAM,OAAO,GAAsB,EAAE,CAAC;IAEtC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,IAAI,WAAW,EAAE,CAAC;QACnD,MAAM,KAAK,GAAG,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,GAAG,WAAW,CAAC,CAAC;QAC9C,MAAM,YAAY,GAAG,MAAM,OAAO,CAAC,UAAU,CAC3C,KAAK,CAAC,GAAG,CAAC,KAAK,EAAE,CAAC,EAAE,EAAE;YACpB,MAAM,QAAQ,GAAG,MAAM,kBAAkB,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;YAClD,IAAI,QAAQ,IAAI,IAAI;gBAAE,OAAO,IAAI,CAAC;YAClC,MAAM,SAAS,GAAG,mBAAmB,CAAC,QAAQ,CAAC,CAAC;YAChD,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,SAAS,EAAE,QAAQ,EAAqB,CAAC;QAClE,CAAC,CAAC,CACH,CAAC;QACF,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,YAAY,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC7C,MAAM,CAAC,GAAG,YAAY,CAAC,CAAC,CAAC,CAAC;YAC1B,IAAI,CAAC,CAAC,MAAM,KAAK,WAAW,IAAI,CAAC,CAAC,KAAK,EAAE,CAAC;gBACxC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC;YACxB,CAAC;YACD,IAAI,UAAU;gBAAE,UAAU,CAAC,CAAC,EAAE,KAAK,CAAC,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC;QAChD,CAAC;IACH,CAAC;IAED,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,KAAK,UAAU,OAAO,CAAC,WAAqB,EAAE,IAAgB;IAC5D,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IAC7B,MAAM,WAAW,GAAG,IAAI,CAAC,UAAU;QACjC,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GA
AG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;QAC/C,CAAC,CAAC,CAAC,cAAc,EAAE,MAAM,CAAC,CAAC;IAE7B,MAAM,WAAW,GAAG,IAAI,CAAC,IAAI,KAAK,KAAK,CAAC;IAExC,IAAI,SAAS,GAAkB,IAAI,CAAC;IACpC,IAAI,WAAW,EAAE,CAAC;QAChB,SAAS,GAAG,MAAM,QAAQ,EAAE,CAAC;IAC/B,CAAC;IAED,kCAAkC;IAClC,MAAM,EAAE,KAAK,EAAE,OAAO,EAAE,GAAG,MAAM,SAAS,CAAC,WAAW,EAAE;QACtD,UAAU,EAAE,IAAI,CAAC,UAAU;QAC3B,UAAU,EAAE,IAAI,CAAC,UAAU;QAC3B,UAAU,EAAE,WAAW;QACvB,WAAW,EAAE,CAAC,IAAI,CAAC,SAAS;QAC5B,cAAc,EAAE,sBAAsB,CAAC,IAAI,CAAC,cAAc,CAAC;KAC5D,CAAC,CAAC;IAEH,MAAM,YAAY,GAAG,IAAI,CAAC,MAAM,KAAK,MAAM,IAAI,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC;IACpE,IAAI,OAAO,GAAkB,EAAE,CAAC;IAChC,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACrB,MAAM,QAAQ,GAAG,cAAc,CAAC,EAAE,KAAK,EAAE,KAAK,CAAC,MAAM,EAAE,KAAK,EAAE,SAAS,EAAE,OAAO,EAAE,YAAY,EAAE,CAAC,CAAC;QAClG,OAAO,GAAG,MAAM,UAAU,CAAC,KAAK,EAAE,EAAE,EAAE,CAAC,GAAG,EAAE,MAAM,EAAE,EAAE,CAAC,QAAQ,CAAC,MAAM,CAAC,GAAG,EAAE,MAAM,CAAC,CAAC,CAAC;QACrF,QAAQ,CAAC,IAAI,EAAE,CAAC;IAClB,CAAC;IAED,MAAM,KAAK,GAAG,SAAS,CAAC,OAAO,EAAE;QAC/B,MAAM,EAAE,IAAI,CAAC,MAAM;QACnB,IAAI,EAAE,IAAI,CAAC,IAAI;KAChB,CAAC,CAAC;IAEH,IAAI,OAAO,GAAyB,IAAI,CAAC;IACzC,IAAI,WAAW,EAAE,CAAC;QAChB,IAAI,YAAY;YAAE,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,8BAA8B,CAAC,CAAC;QACvE,OAAO,GAAG,MAAM,MAAM,CAAC,SAAS,EAAE,WAAW,EAAE;YAC7C,MAAM,EAAE,IAAI,CAAC,MAAM;YACnB,UAAU,EAAE,WAAW;YACvB,IAAI,EAAE,IAAI,CAAC,IAAI;YACf,EAAE,EAAE,IAAI,CAAC,EAAE;YACX,WAAW,EAAE,CAAC,IAAI,CAAC,SAAS;SAC7B,CAAC,CAAC;QACH,IAAI,YAAY,EAAE,CAAC;YACjB,MAAM,IAAI,GAAG,OAAO,CAAC,MAAM,CAAC,OAAO,IAAI,EAAE,CAAC;YAC1C,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,IAAI,GAAG,GAAG,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC,CAAC;QACvD,CAAC;IACH,CAAC;IAED,uBAAuB;IACvB,IAAI,gBAAgB,GAAsB,EAAE,CAAC;IAC7C,IAAI,IAAI,CAAC,SAAS,EAAE,CAAC;QACnB,MAAM,iBAAiB,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,iBAAiB,CAAC,GAAG,CAAC,YAAY,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACzF,IAAI,iBAAiB,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACjC,MAAM,QAAQ,GAAG,cAAc,CAAC,EAAE,KAAK,EAAE,iBAAiB,CAAC,MAAM,EAAE,KAAK,EAAE,sBAAsB,EAAE,OAAO,EAAE,YAAY,EAAE,C
AAC,CAAC;YAC3H,gBAAgB,GAAG,MAAM,iBAAiB,CAAC,iBAAiB,EAAE,EAAE,EAAE,CAAC,GAAG,EAAE,MAAM,EAAE,EAAE,CAAC,QAAQ,CAAC,MAAM,CAAC,GAAG,EAAE,MAAM,CAAC,CAAC,CAAC;YACjH,QAAQ,CAAC,IAAI,EAAE,CAAC;QAClB,CAAC;IACH,CAAC;IAED,gBAAgB;IAChB,IAAI,MAAc,CAAC;IACnB,IAAI,IAAI,CAAC,MAAM,KAAK,MAAM,EAAE,CAAC;QAC3B,MAAM,GAAG,UAAU,CAAC,KAAK,EAAE,OAAO,EAAE,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC,gBAAgB,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC;IACrF,CAAC;SAAM,CAAC;QACN,MAAM,KAAK,GAAa,EAAE,CAAC;QAE3B,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC,IAAI,CAAC,CAAC,OAAO,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC,CAAC,EAAE,CAAC;YAC7D,KAAK,CAAC,IAAI,CAAC,iBAAiB,CAAC,CAAC;QAChC,CAAC;aAAM,CAAC;YACN,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACrB,KAAK,CAAC,IAAI,CAAC,mBAAmB,CAAC,KAAK,EAAE,EAAE,EAAE,EAAE,IAAI,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC;YAC1D,CAAC;YAED,IAAI,OAAO,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAClC,KAAK,CAAC,IAAI,CAAC,cAAc,CAAC,OAAO,EAAE,EAAE,EAAE,EAAE,IAAI,CAAC,EAAE,EAAE,MAAM,EAAE,IAAI,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;YAC5E,CAAC;YAED,kBAAkB;YAClB,IAAI,gBAAgB,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAChC,KAAK,MAAM,EAAE,IAAI,gBAAgB,EAAE,CAAC;oBAClC,KAAK,CAAC,IAAI,CAAC,mBAAmB,CAAC,EAAE,EAAE,EAAE,EAAE,EAAE,IAAI,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC;gBACvD,CAAC;YACH,CAAC;YAED,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;YACvC,MAAM,OAAO,GAAG,iBAAiB,CAAC,KAAK,EAAE,OAAO,EAAE,OAAO,EAAE,EAAE,EAAE,EAAE,IAAI,CAAC,EAAE,EAAE,CAAC,CAAC;YAC5E,IAAI,OAAO;gBAAE,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QACnC,CAAC;QAED,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACvB,KAAK,CAAC,IAAI,CAAC,KAAK,OAAO,CAAC,MAAM,qDAAqD,CAAC,CAAC;QACvF,CAAC;QAED,MAAM,GAAG,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC;IACnC,CAAC;IAED,IAAI,IAAI,CAAC,MAAM,EAAE,CAAC;QAChB,MAAM,SAAS,CAAC,IAAI,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACvC,CAAC;SAAM,CAAC;QACN,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;IAC/B,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
import { readFile } from 'node:fs/promises';
|
|
2
|
+
import mammoth from 'mammoth';
|
|
3
|
+
import pdf from 'pdf-parse';
|
|
4
|
+
import JSZip from 'jszip';
|
|
5
|
+
import officeparser from 'officeparser';
|
|
6
|
+
import TurndownService from 'turndown';
|
|
7
|
+
import { getExtension } from '../utils.js';
|
|
8
|
+
// Shared HTML-to-markdown converter; 'atx' heading style emits headings as `#`-prefixed lines.
const turndown = new TurndownService({ headingStyle: 'atx' });
|
|
9
|
+
/** Convert a DOCX file to markdown via mammoth → HTML → turndown */
|
|
10
|
+
/**
 * Turn a DOCX file into markdown by rendering it to HTML with mammoth and
 * feeding that HTML to the shared turndown converter.
 *
 * @param filePath Path of the DOCX file.
 * @returns The markdown text, or an empty string when the HTML is empty or whitespace only.
 */
async function docxToMarkdown(filePath) {
    const conversion = await mammoth.convertToHtml({ path: filePath });
    const markup = conversion.value ? conversion.value : '';
    return markup.trim() ? turndown.turndown(markup) : '';
}
|
|
17
|
+
/** Convert a PDF to markdown with [Page N] markers */
|
|
18
|
+
/** Number of pdfToMarkdown calls that currently rely on the console.log filter. */
let pdfLogFilterUsers = 0;
/** console.log as it was before the filter was installed; null while no filter is active. */
let pdfLogBeforeFilter = null;

/** True when a console.log call carries one of the prefixes treated as pdf.js noise. */
function isPdfJsNoise(args) {
    const first = args[0];
    return typeof first === 'string'
        && (first.startsWith('Warning: ') || first.startsWith('Info: ') || first.startsWith('Deprecated API usage: '));
}

/**
 * Convert a PDF to text in which every page starts with a `[Page N]` marker line.
 *
 * While the PDF is being parsed, console.log is filtered so that messages
 * starting with "Warning: ", "Info: " or "Deprecated API usage: " are dropped.
 * The filter is reference-counted: several conversions may overlap (callers
 * run them in concurrent batches), and having each call save/restore
 * console.log on its own can restore in the wrong order and leave the filter
 * installed for good. Here the first active call installs it and the last one
 * to finish removes it.
 *
 * @param filePath Path of the PDF file.
 * @returns The text reported by the PDF parser.
 */
async function pdfToMarkdown(filePath) {
    const buffer = await readFile(filePath);
    if (pdfLogFilterUsers === 0) {
        const passthrough = console.log;
        pdfLogBeforeFilter = passthrough;
        console.log = (...args) => {
            if (isPdfJsNoise(args)) {
                return;
            }
            passthrough.apply(console, args);
        };
    }
    pdfLogFilterUsers += 1;
    let data;
    try {
        data = await pdf(buffer, {
            // Custom page renderer: prefix each page's joined text items with its 1-based page number.
            pagerender: async (pageData) => {
                const textContent = await pageData.getTextContent();
                const strings = textContent.items.map(item => item.str);
                return `[Page ${pageData.pageIndex + 1}]\n${strings.join(' ')}`;
            },
        });
    }
    finally {
        pdfLogFilterUsers -= 1;
        if (pdfLogFilterUsers === 0) {
            console.log = pdfLogBeforeFilter;
            pdfLogBeforeFilter = null;
        }
    }
    return data.text;
}
|
|
43
|
+
/** Convert a PPTX to markdown with slide headers */
|
|
44
|
+
/**
 * Convert a PPTX file to markdown in which every slide is introduced by a
 * `# Slide N` heading.
 *
 * The slide count comes from the `ppt/slides/slideN.xml` entries in the
 * archive. The extracted text is then cut into equally sized runs of lines,
 * one per slide. This is a rough split, not real slide boundaries.
 *
 * @param filePath Path of the PPTX file.
 * @returns Markdown text with one heading per slide.
 */
async function pptxToMarkdown(filePath) {
    const buffer = await readFile(filePath);
    const zip = await JSZip.loadAsync(buffer);

    // Collect the slide entries and order them by slide number.
    const slideEntryPattern = /^ppt\/slides\/slide\d+\.xml$/;
    const slideNumberOf = (entry) => parseInt(entry.match(/slide(\d+)/)?.[1] || '0', 10);
    const slideFiles = [];
    for (const entry of Object.keys(zip.files)) {
        if (slideEntryPattern.test(entry)) {
            slideFiles.push(entry);
        }
    }
    slideFiles.sort((left, right) => slideNumberOf(left) - slideNumberOf(right));

    // Whole-document text via officeparser
    const fullText = await officeparser.parseOffice(buffer);
    const slideCount = slideFiles.length;
    if (slideCount <= 1) {
        return `# Slide 1\n\n${fullText}`;
    }

    // Distribute the lines evenly over the slides.
    const lines = fullText.split('\n');
    const chunkSize = Math.max(1, Math.ceil(lines.length / slideCount));
    const sections = Array.from({ length: slideCount }, (_, slideIndex) => {
        const from = slideIndex * chunkSize;
        const to = Math.min((slideIndex + 1) * chunkSize, lines.length);
        const body = lines.slice(from, to).join('\n').trim();
        return `# Slide ${slideIndex + 1}\n\n${body}`;
    });
    return sections.join('\n\n');
}
|
|
73
|
+
/** Convert an ODT file to markdown (best-effort heading detection) */
|
|
74
|
+
/**
 * Extract the text of an ODT file with officeparser and return it unchanged;
 * no heading markup is added here.
 *
 * @param filePath Path of the ODT file.
 * @returns The text produced by officeparser.
 */
async function odtToMarkdown(filePath) {
    return await officeparser.parseOffice(filePath);
}
|
|
78
|
+
/** Convert an ODP file to markdown with slide headers */
|
|
79
|
+
/**
 * Convert an ODP file to markdown in which every slide is introduced by a
 * `# Slide N` heading.
 *
 * Slides are counted as occurrences of `<draw:page ` in `content.xml`. The
 * extracted text is then cut into equally sized runs of lines, one per slide.
 * This is a rough split, not real slide boundaries.
 *
 * @param filePath Path of the ODP file.
 * @returns Markdown text with one heading per slide, or an empty string when
 *   `content.xml` is missing or empty.
 */
async function odpToMarkdown(filePath) {
    const buffer = await readFile(filePath);
    const zip = await JSZip.loadAsync(buffer);
    const contentXml = await zip.file('content.xml')?.async('text');
    if (!contentXml) {
        return '';
    }

    const pageTags = contentXml.match(/<draw:page /g) || [];
    const slides = pageTags.length;
    const text = await officeparser.parseOffice(buffer);
    if (slides <= 1) {
        return `# Slide 1\n\n${text}`;
    }

    // Distribute the lines evenly over the slides.
    const lines = text.split('\n');
    const chunkSize = Math.max(1, Math.ceil(lines.length / slides));
    const sections = [];
    let slideNumber = 1;
    while (slideNumber <= slides) {
        const from = (slideNumber - 1) * chunkSize;
        const to = Math.min(slideNumber * chunkSize, lines.length);
        const body = lines.slice(from, to).join('\n').trim();
        sections.push(`# Slide ${slideNumber}\n\n${body}`);
        slideNumber += 1;
    }
    return sections.join('\n\n');
}
|
|
101
|
+
/**
 * Convert a document to markdown.
 *
 * The converter is chosen from the file extension reported by `getExtension`.
 * Extensions without a converter (including xlsx and ods) resolve to null.
 *
 * @param {string} filePath Path to the document.
 * @returns {Promise<string | null>} The converter's output, or null when the
 *   format is not supported.
 */
export async function documentToMarkdown(filePath) {
    // A Map (rather than a plain object) so that extensions such as
    // "constructor" can never resolve to an inherited property.
    const converters = new Map([
        ['docx', docxToMarkdown],
        ['pdf', pdfToMarkdown],
        ['pptx', pptxToMarkdown],
        ['odt', odtToMarkdown],
        ['odp', odpToMarkdown],
    ]);
    const convert = converters.get(getExtension(filePath));
    if (!convert) {
        return null;
    }
    return convert(filePath);
}
|
|
117
|
+
//# sourceMappingURL=convert.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"convert.js","sourceRoot":"","sources":["../../../src/markdown/convert.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,MAAM,kBAAkB,CAAC;AAC5C,OAAO,OAAO,MAAM,SAAS,CAAC;AAC9B,OAAO,GAAG,MAAM,WAAW,CAAC;AAC5B,OAAO,KAAK,MAAM,OAAO,CAAC;AAC1B,OAAO,YAAY,MAAM,cAAc,CAAC;AACxC,OAAO,eAAe,MAAM,UAAU,CAAC;AACvC,OAAO,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAE3C,MAAM,QAAQ,GAAG,IAAI,eAAe,CAAC,EAAE,YAAY,EAAE,KAAK,EAAE,CAAC,CAAC;AAE9D,oEAAoE;AACpE,KAAK,UAAU,cAAc,CAAC,QAAgB;IAC5C,MAAM,MAAM,GAAG,MAAM,OAAO,CAAC,aAAa,CAAC,EAAE,IAAI,EAAE,QAAQ,EAAE,CAAC,CAAC;IAC/D,MAAM,IAAI,GAAG,MAAM,CAAC,KAAK,IAAI,EAAE,CAAC;IAChC,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE;QAAE,OAAO,EAAE,CAAC;IAC5B,OAAO,QAAQ,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;AACjC,CAAC;AAED,sDAAsD;AACtD,KAAK,UAAU,aAAa,CAAC,QAAgB;IAC3C,MAAM,MAAM,GAAG,MAAM,QAAQ,CAAC,QAAQ,CAAC,CAAC;IAExC,2BAA2B;IAC3B,MAAM,WAAW,GAAG,OAAO,CAAC,GAAG,CAAC;IAChC,OAAO,CAAC,GAAG,GAAG,CAAC,GAAG,IAAe,EAAE,EAAE;QACnC,IAAI,OAAO,IAAI,CAAC,CAAC,CAAC,KAAK,QAAQ,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,WAAW,CAAC,IAAI,IAAI,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,QAAQ,CAAC,IAAI,IAAI,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,wBAAwB,CAAC,CAAC,EAAE,CAAC;YACrJ,OAAO;QACT,CAAC;QACD,WAAW,CAAC,KAAK,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC;IACnC,CAAC,CAAC;IAEF,IAAI,IAAwC,CAAC;IAC7C,IAAI,CAAC;QACH,IAAI,GAAG,MAAM,GAAG,CAAC,MAAM,EAAE;YACvB,UAAU,EAAE,KAAK,EAAE,QAAuE,EAAE,EAAE;gBAC5F,MAAM,WAAW,GAAG,MAAM,QAAQ,CAAC,cAAc,EAAuC,CAAC;gBACzF,MAAM,OAAO,GAAG,WAAW,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;gBACxD,OAAO,SAAS,QAAQ,CAAC,SAAS,GAAG,CAAC,MAAM,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC;YAClE,CAAC;SACF,CAAC,CAAC;IACL,CAAC;YAAS,CAAC;QACT,OAAO,CAAC,GAAG,GAAG,WAAW,CAAC;IAC5B,CAAC;IAED,OAAO,IAAI,CAAC,IAAI,CAAC;AACnB,CAAC;AAED,oDAAoD;AACpD,KAAK,UAAU,cAAc,CAAC,QAAgB;IAC5C,MAAM,MAAM,GAAG,MAAM,QAAQ,CAAC,QAAQ,CAAC,CAAC;IACxC,MAAM,GAAG,GAAG,MAAM,KAAK,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;IAE1C,uCAAuC;IACvC,MAAM,UAAU,GAAG,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC;SACtC,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC,8BAA8B,CAAC,IAAI,CAAC,
IAAI,CAAC,CAAC;SACzD,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;QACb,MAAM,IAAI,GAAG,QAAQ,CAAC,CAAC,CAAC,KAAK,CAAC,YAAY,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,GAAG,EAAE,EAAE,CAAC,CAAC;QAC7D,MAAM,IAAI,GAAG,QAAQ,CAAC,CAAC,CAAC,KAAK,CAAC,YAAY,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,GAAG,EAAE,EAAE,CAAC,CAAC;QAC7D,OAAO,IAAI,GAAG,IAAI,CAAC;IACrB,CAAC,CAAC,CAAC;IAEL,iCAAiC;IACjC,MAAM,QAAQ,GAAG,MAAM,YAAY,CAAC,WAAW,CAAC,MAAM,CAAsB,CAAC;IAC7E,MAAM,UAAU,GAAG,UAAU,CAAC,MAAM,CAAC;IAErC,IAAI,UAAU,IAAI,CAAC,EAAE,CAAC;QACpB,OAAO,gBAAgB,QAAQ,EAAE,CAAC;IACpC,CAAC;IAED,oCAAoC;IACpC,MAAM,KAAK,GAAG,QAAQ,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IACnC,MAAM,aAAa,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,MAAM,GAAG,UAAU,CAAC,CAAC,CAAC;IACxE,MAAM,KAAK,GAAa,EAAE,CAAC;IAE3B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,UAAU,EAAE,CAAC,EAAE,EAAE,CAAC;QACpC,MAAM,KAAK,GAAG,CAAC,GAAG,aAAa,CAAC;QAChC,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,aAAa,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;QAC5D,MAAM,SAAS,GAAG,KAAK,CAAC,KAAK,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC;QAC5D,KAAK,CAAC,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,OAAO,SAAS,EAAE,CAAC,CAAC;IACjD,CAAC;IAED,OAAO,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;AAC5B,CAAC;AAED,sEAAsE;AACtE,KAAK,UAAU,aAAa,CAAC,QAAgB;IAC3C,MAAM,IAAI,GAAG,MAAM,YAAY,CAAC,WAAW,CAAC,QAAQ,CAAsB,CAAC;IAC3E,OAAO,IAAI,CAAC;AACd,CAAC;AAED,yDAAyD;AACzD,KAAK,UAAU,aAAa,CAAC,QAAgB;IAC3C,MAAM,MAAM,GAAG,MAAM,QAAQ,CAAC,QAAQ,CAAC,CAAC;IACxC,MAAM,GAAG,GAAG,MAAM,KAAK,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;IAC1C,MAAM,UAAU,GAAG,MAAM,GAAG,CAAC,IAAI,CAAC,aAAa,CAAC,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;IAChE,IAAI,CAAC,UAAU;QAAE,OAAO,EAAE,CAAC;IAE3B,MAAM,MAAM,GAAG,CAAC,UAAU,CAAC,KAAK,CAAC,cAAc,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC;IAC/D,MAAM,IAAI,GAAG,MAAM,YAAY,CAAC,WAAW,CAAC,MAAM,CAAsB,CAAC;IAEzE,IAAI,MAAM,IAAI,CAAC,EAAE,CAAC;QAChB,OAAO,gBAAgB,IAAI,EAAE,CAAC;IAChC,CAAC;IAED,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAC/B,MAAM,aAAa,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,MAAM,GAAG,
MAAM,CAAC,CAAC,CAAC;IACpE,MAAM,KAAK,GAAa,EAAE,CAAC;IAE3B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAChC,MAAM,KAAK,GAAG,CAAC,GAAG,aAAa,CAAC;QAChC,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,aAAa,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;QAC5D,MAAM,SAAS,GAAG,KAAK,CAAC,KAAK,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC;QAC5D,KAAK,CAAC,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,OAAO,SAAS,EAAE,CAAC,CAAC;IACjD,CAAC;IAED,OAAO,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;AAC5B,CAAC;AAED,wFAAwF;AACxF,MAAM,CAAC,KAAK,UAAU,kBAAkB,CAAC,QAAgB;IACvD,MAAM,GAAG,GAAG,YAAY,CAAC,QAAQ,CAAC,CAAC;IAEnC,QAAQ,GAAG,EAAE,CAAC;QACZ,KAAK,MAAM,CAAC,CAAC,OAAO,cAAc,CAAC,QAAQ,CAAC,CAAC;QAC7C,KAAK,KAAK,CAAC,CAAC,OAAO,aAAa,CAAC,QAAQ,CAAC,CAAC;QAC3C,KAAK,MAAM,CAAC,CAAC,OAAO,cAAc,CAAC,QAAQ,CAAC,CAAC;QAC7C,KAAK,KAAK,CAAC,CAAC,OAAO,aAAa,CAAC,QAAQ,CAAC,CAAC;QAC3C,KAAK,KAAK,CAAC,CAAC,OAAO,aAAa,CAAC,QAAQ,CAAC,CAAC;QAC3C,KAAK,MAAM,CAAC;QACZ,KAAK,KAAK;YACR,OAAO,IAAI,CAAC;QACd;YACE,OAAO,IAAI,CAAC;IAChB,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
import type { AggregateResult } from '../stats.js';
|
|
2
|
+
import type { SccLanguage } from '../scc.js';
|
|
3
|
+
import type { StructureResult } from './tree.js';
|
|
4
|
+
/**
 * Serializes aggregate document statistics to a JSON string indented by two spaces.
 *
 * The output always has a `documents` object with `files` (one entry per row)
 * and `totals`. A `code` key is added when `sccData` is a non-empty array, and
 * a `structures` key when `structureResults` is a non-empty array.
 */
export declare function formatJson(stats: AggregateResult, sccData?: SccLanguage[] | null, structureResults?: StructureResult[]): string;
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
import { METRIC_FIELDS, hasKey } from '../utils.js';
|
|
2
|
+
import { formatStructureJson } from './tree.js';
|
|
3
|
+
/**
 * Serialize aggregate document statistics as JSON indented by two spaces.
 *
 * @param stats Aggregate result providing `columns`, `rows` and `totals`.
 * @param sccData Optional code statistics; emitted under `code` when non-empty.
 * @param structureResults Optional structure results; emitted under
 *   `structures` (via `formatStructureJson`) when non-empty.
 * @returns {string} The JSON text.
 */
export function formatJson(stats, sccData = null, structureResults) {
    const enabledColumns = stats.columns;
    // Per-row record: identity fields first, then the metrics the row itself
    // flags as present, then size. Key insertion order defines output order.
    const serializeRow = (row) => {
        const record = { type: row.fileType };
        if (row.fileName) {
            record.name = row.fileName;
        }
        if (row.filePath) {
            record.path = row.filePath;
        }
        record.count = row.files;
        for (const metric of METRIC_FIELDS) {
            if (!row[hasKey(metric)]) {
                continue;
            }
            // Falsy metric values are reported as 0.
            record[metric] = row[metric] || 0;
        }
        record.size = row.size;
        return record;
    };
    // Totals record: a metric is included when its column is enabled for the
    // result as a whole, and its value is copied through unchanged.
    const serializeTotals = (totals) => {
        const record = { files: totals.files };
        for (const metric of METRIC_FIELDS) {
            if (enabledColumns[hasKey(metric)]) {
                record[metric] = totals[metric];
            }
        }
        record.size = totals.size;
        return record;
    };
    const documents = {
        files: stats.rows.map((row) => serializeRow(row)),
        totals: serializeTotals(stats.totals),
    };
    const payload = { documents };
    if (sccData && sccData.length > 0) {
        payload.code = sccData;
    }
    if (structureResults && structureResults.length > 0) {
        payload.structures = formatStructureJson(structureResults);
    }
    return JSON.stringify(payload, null, 2);
}
|
|
42
|
+
//# sourceMappingURL=json.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"json.js","sourceRoot":"","sources":["../../../src/output/json.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,aAAa,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AACpD,OAAO,EAAE,mBAAmB,EAAE,MAAM,WAAW,CAAC;AAKhD,MAAM,UAAU,UAAU,CACxB,KAAsB,EACtB,UAAgC,IAAI,EACpC,gBAAoC;IAEpC,MAAM,EAAE,OAAO,EAAE,GAAG,KAAK,CAAC;IAE1B,MAAM,MAAM,GAAG,CAAC,CAAW,EAAE,EAAE;QAC7B,MAAM,KAAK,GAA4B;YACrC,IAAI,EAAE,CAAC,CAAC,QAAQ;YAChB,GAAG,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;YAC3C,GAAG,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;YAC3C,KAAK,EAAE,CAAC,CAAC,KAAK;SACf,CAAC;QACF,KAAK,MAAM,CAAC,IAAI,aAAa,EAAE,CAAC;YAC9B,IAAI,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC;gBAAE,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;QACzC,CAAC;QACD,KAAK,CAAC,IAAI,GAAG,CAAC,CAAC,IAAI,CAAC;QACpB,OAAO,KAAK,CAAC;IACf,CAAC,CAAC;IAEF,MAAM,SAAS,GAAG,CAAC,CAAW,EAAE,EAAE;QAChC,MAAM,KAAK,GAA4B,EAAE,KAAK,EAAE,CAAC,CAAC,KAAK,EAAE,CAAC;QAC1D,KAAK,MAAM,CAAC,IAAI,aAAa,EAAE,CAAC;YAC9B,IAAI,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC;gBAAE,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;QAC1C,CAAC;QACD,KAAK,CAAC,IAAI,GAAG,CAAC,CAAC,IAAI,CAAC;QACpB,OAAO,KAAK,CAAC;IACf,CAAC,CAAC;IAEF,MAAM,MAAM,GAA4B;QACtC,SAAS,EAAE;YACT,KAAK,EAAE,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC;YAC7B,MAAM,EAAE,SAAS,CAAC,KAAK,CAAC,MAAM,CAAC;SAChC;KACF,CAAC;IAEF,IAAI,OAAO,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAClC,MAAM,CAAC,IAAI,GAAG,OAAO,CAAC;IACxB,CAAC;IAED,IAAI,gBAAgB,IAAI,gBAAgB,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACpD,MAAM,CAAC,UAAU,GAAG,mBAAmB,CAAC,gBAAgB,CAAC,CAAC;IAC5D,CAAC;IAED,OAAO,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC;AACzC,CAAC"}
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
import type { AggregateResult } from '../stats.js';
|
|
2
|
+
import type { SccLanguage } from '../scc.js';
|
|
3
|
+
/** Optional settings accepted as the `options` argument of the table formatters declared in this file. */
export interface TableOptions {
|
|
4
|
+
// NOTE(review): presumably switches to a plainer output style for CI logs; the implementation is not visible here — confirm.
ci?: boolean;
|
|
5
|
+
// NOTE(review): presumably requests one row per file instead of grouped rows — confirm against the implementation.
byFile?: boolean;
|
|
6
|
+
}
|
|
7
|
+
/** Builds a string from aggregate document statistics; presumably the rendered documents table (implementation not visible here). */
export declare function formatDocumentTable(stats: AggregateResult, options?: TableOptions): string;
|
|
8
|
+
/** Builds a string from per-language scc data; presumably the rendered code table (implementation not visible here). */
export declare function formatSccTable(sccData: SccLanguage[], options?: TableOptions): string;
|
|
9
|
+
/** Builds a one-line summary string from the document stats, optional scc data (may be null) and an elapsed value; the unit of `elapsed` is not visible here. */
export declare function formatSummaryLine(stats: AggregateResult, sccData: SccLanguage[] | null, elapsed: number, options?: TableOptions): string;
|
|
10
|
+
/** Returns a string derived from `str`; presumably with ANSI escape sequences removed — confirm against the implementation. */
export declare function stripAnsi(str: string): string;
|
|
11
|
+
/** Builds a section header string for `title` at the given `width`; `ci` is optional. */
export declare function sectionHeader(title: string, width: number, ci?: boolean): string;
|
|
12
|
+
/** Returns a string-to-string record selected by `ci`; presumably the table border characters — confirm against the implementation. */
export declare function tableChars(ci: boolean): Record<string, string>;
|