@cesarandreslopez/occ 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -77,24 +77,24 @@ occ --ci docs/
77
77
  ## Example Output
78
78
 
79
79
  ```
80
- -- Documents -----------------------------------------------------------
81
- Format Files Words Pages Extra Size
82
- --------------------------------------------------------------------
83
- Word 12 34,210 137 1,203 paras 1.2 MB
84
- PDF 8 22,540 64 4.5 MB
85
- Excel 3 12 sheets 890 KB
86
- --------------------------------------------------------------------
87
- Total 23 56,750 201 1,203 paras 6.5 MB
88
- --------------------------------------------------------------------
89
-
90
- -- Code (via scc) ------------------------------------------------------
91
- Language Files Lines Blanks Comments Code
92
- --------------------------------------------------------------------
93
- JavaScript 15 2340 180 320 1840
94
- Python 8 1200 90 150 960
95
- --------------------------------------------------------------------
96
- Total 23 3540 270 470 2800
97
- --------------------------------------------------------------------
80
+ -- Documents ---------------------------------------------------------------
81
+ Format Files Words Pages Details Size
82
+ ----------------------------------------------------------------------------
83
+ Word 12 34,210 137 1,203 paras 1.2 MB
84
+ PDF 8 22,540 64 4.5 MB
85
+ Excel 3 12 sheets 890 KB
86
+ ----------------------------------------------------------------------------
87
+ Total 23 56,750 201 1,203 paras 6.5 MB
88
+
89
+ -- Code (via scc) ----------------------------------------------------------
90
+ Language Files Lines Blanks Comments Code
91
+ ----------------------------------------------------------------------------
92
+ JavaScript 15 2340 180 320 1840
93
+ Python 8 1200 90 150 960
94
+ ----------------------------------------------------------------------------
95
+ Total 23 3540 270 470 2800
96
+
97
+ Scanned 23 documents (56,750 words, 201 pages) in 120ms
98
98
  ```
99
99
 
100
100
  ## Supported Formats
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@cesarandreslopez/occ",
3
- "version": "0.1.1",
3
+ "version": "0.1.2",
4
4
  "description": "Office Cloc and Count — scc-style summary tables for office documents",
5
5
  "type": "module",
6
6
  "bin": {
@@ -3,7 +3,8 @@
3
3
  // Downloads the scc binary for the current platform during npm install.
4
4
  // Falls back gracefully — if download fails, occ will look for scc on PATH.
5
5
 
6
- import { createWriteStream, existsSync, mkdirSync, chmodSync, unlinkSync } from 'node:fs';
6
+ import { createWriteStream, readFileSync, existsSync, mkdirSync, chmodSync, unlinkSync } from 'node:fs';
7
+ import { createHash } from 'node:crypto';
7
8
  import { pipeline } from 'node:stream/promises';
8
9
  import { execFile } from 'node:child_process';
9
10
  import { promisify } from 'node:util';
@@ -27,6 +28,18 @@ const ARCH_MAP = {
27
28
  ia32: 'i386',
28
29
  };
29
30
 
31
+ // SHA-256 checksums from the official scc v3.7.0 release
32
+ const CHECKSUMS = {
33
+ 'scc_Darwin_arm64.tar.gz': '376cbae670be59ee64f398de20e0694ec434bf8a9b842642952b0ab0be5f3961',
34
+ 'scc_Darwin_x86_64.tar.gz': 'c3f7457856b9169ccb3c1dd14198e67f730bee065f24d9051bf52cdc2a719ecc',
35
+ 'scc_Linux_arm64.tar.gz': 'dcb05c6e993bb2d8d2da4765ff018f2e752325dd205a41698929c55e4123575d',
36
+ 'scc_Linux_i386.tar.gz': '1de91dae8a927ac2063a99b520d9a474644db6827fe6f85e3d8f87a1def3b14d',
37
+ 'scc_Linux_x86_64.tar.gz': '3d9d65b00ca874c2b29151abe7e1480736f5229edc3ce8e4b2791460cdfabf5a',
38
+ 'scc_Windows_arm64.zip': 'fd114614c10382c9ed2e32d5455cc4b51960a9f71691c5c1ca42b31adea5b84d',
39
+ 'scc_Windows_i386.zip': '7b887022c37dc79e79ae51897030a6ff2515ab7b124e7b2aabcb0fba15412b05',
40
+ 'scc_Windows_x86_64.zip': '97abf9d55d4b79d3310536d576ccbdf5017aeb425780e850336120b6e67622e1',
41
+ };
42
+
30
43
  function getAssetName() {
31
44
  const platform = PLATFORM_MAP[process.platform];
32
45
  const arch = ARCH_MAP[process.arch];
@@ -53,6 +66,19 @@ async function download(url, dest) {
53
66
  await pipeline(res.body, createWriteStream(dest));
54
67
  }
55
68
 
69
+ function verifyChecksum(filePath, assetName) {
70
+ const expected = CHECKSUMS[assetName];
71
+ if (!expected) {
72
+ console.warn(`occ: No checksum available for ${assetName}, skipping verification`);
73
+ return;
74
+ }
75
+ const data = readFileSync(filePath);
76
+ const actual = createHash('sha256').update(data).digest('hex');
77
+ if (actual !== expected) {
78
+ throw new Error(`Checksum mismatch for ${assetName}\n Expected: ${expected}\n Got: ${actual}`);
79
+ }
80
+ }
81
+
56
82
  async function extract(archive, destDir) {
57
83
  if (archive.endsWith('.tar.gz')) {
58
84
  await execFileAsync('tar', ['xzf', archive, '-C', destDir]);
@@ -92,6 +118,7 @@ async function main() {
92
118
  try {
93
119
  console.log(`Downloading scc v${SCC_VERSION} for ${process.platform}-${process.arch}...`);
94
120
  await download(url, archivePath);
121
+ verifyChecksum(archivePath, assetName);
95
122
 
96
123
  console.log('Extracting...');
97
124
  await extract(archivePath, vendorDir);
package/src/cli.js CHANGED
@@ -1,20 +1,22 @@
1
- import { Command } from 'commander';
2
- import { writeFile } from 'node:fs/promises';
1
+ import { Command, Option } from 'commander';
2
+ import { readFile, writeFile } from 'node:fs/promises';
3
3
  import { findFiles } from './walker.js';
4
4
  import { parseFiles } from './parsers/index.js';
5
5
  import { aggregate } from './stats.js';
6
- import { formatDocumentTable, formatSccTable } from './output/tabular.js';
6
+ import { formatDocumentTable, formatSccTable, formatSummaryLine } from './output/tabular.js';
7
7
  import { formatJson } from './output/json.js';
8
8
  import { checkScc, runScc } from './scc.js';
9
9
  import { createProgress } from './progress.js';
10
10
 
11
+ const pkg = JSON.parse(await readFile(new URL('../package.json', import.meta.url), 'utf8'));
12
+
11
13
  export async function run(argv) {
12
14
  const program = new Command();
13
15
 
14
16
  program
15
17
  .name('occ')
16
18
  .description('Office Cloc and Count — scc-style summary tables for office documents')
17
- .version('0.1.0')
19
+ .version(pkg.version)
18
20
  .argument('[directories...]', 'directories to scan', [])
19
21
  .option('-f, --by-file', 'show a row per file instead of grouped by type')
20
22
  .option('--format <type>', 'output format: tabular or json', 'tabular')
@@ -22,7 +24,7 @@ export async function run(argv) {
22
24
  .option('--exclude-ext <exts>', 'comma-separated extensions to exclude')
23
25
  .option('--exclude-dir <dirs>', 'directories to skip (comma-separated)', 'node_modules,.git')
24
26
  .option('--no-gitignore', 'disable .gitignore respect')
25
- .option('--sort <col>', 'sort by: files, name, words, size', 'files')
27
+ .addOption(new Option('--sort <col>', 'sort by: files, name, words, size').choices(['files', 'name', 'words', 'size']).default('files'))
26
28
  .option('-o, --output <file>', 'write output to file')
27
29
  .option('--ci', 'ASCII-only output, no colors')
28
30
  .option('--large-file-limit <mb>', 'skip files over this size in MB', '50')
@@ -39,14 +41,25 @@ export async function run(argv) {
39
41
  await program.parseAsync(argv);
40
42
  }
41
43
 
44
+ function validateLargeFileLimit(value) {
45
+ const n = parseFloat(value);
46
+ if (Number.isNaN(n) || n <= 0) {
47
+ throw new Error(`Invalid --large-file-limit value: "${value}" (must be a positive number)`);
48
+ }
49
+ return n;
50
+ }
51
+
42
52
  async function execute(directories, opts) {
53
+ const startTime = Date.now();
43
54
  const excludeDirs = opts.excludeDir
44
55
  ? opts.excludeDir.split(',').map(d => d.trim())
45
56
  : ['node_modules', '.git'];
46
57
 
47
- // Check scc availability (unless --no-code)
48
- if (opts.code !== false) {
49
- await checkScc();
58
+ const includeCode = opts.code !== false;
59
+
60
+ let sccBinary = null;
61
+ if (includeCode) {
62
+ sccBinary = await checkScc();
50
63
  }
51
64
 
52
65
  // Find and parse office documents
@@ -55,7 +68,7 @@ async function execute(directories, opts) {
55
68
  excludeExt: opts.excludeExt,
56
69
  excludeDir: excludeDirs,
57
70
  noGitignore: !opts.gitignore,
58
- largeFileLimit: parseFloat(opts.largeFileLimit),
71
+ largeFileLimit: validateLargeFileLimit(opts.largeFileLimit),
59
72
  });
60
73
 
61
74
  const showProgress = opts.format !== 'json' && process.stderr.isTTY;
@@ -71,11 +84,10 @@ async function execute(directories, opts) {
71
84
  sort: opts.sort,
72
85
  });
73
86
 
74
- // Run scc for code files
75
87
  let sccData = null;
76
- if (opts.code !== false) {
88
+ if (includeCode) {
77
89
  if (showProgress) process.stderr.write('\rAnalyzing code with scc...');
78
- sccData = await runScc(directories, {
90
+ sccData = await runScc(sccBinary, directories, {
79
91
  byFile: opts.byFile,
80
92
  excludeDir: excludeDirs,
81
93
  sort: opts.sort,
@@ -106,9 +118,9 @@ async function execute(directories, opts) {
106
118
  parts.push(formatSccTable(sccData, { ci: opts.ci, byFile: opts.byFile }));
107
119
  }
108
120
 
109
- if (files.length === 0) {
110
- parts.unshift('\nNo office documents found.');
111
- }
121
+ const elapsed = Date.now() - startTime;
122
+ const summary = formatSummaryLine(stats, sccData, elapsed, { ci: opts.ci });
123
+ if (summary) parts.push(summary);
112
124
  }
113
125
 
114
126
  if (skipped.length > 0) {
@@ -1,31 +1,35 @@
1
+ import { METRIC_FIELDS, hasKey } from '../utils.js';
2
+
1
3
  export function formatJson(stats, sccData = null) {
4
+ const { columns } = stats;
5
+
6
+ const mapRow = (r) => {
7
+ const entry = {
8
+ type: r.fileType,
9
+ ...(r.fileName ? { name: r.fileName } : {}),
10
+ ...(r.filePath ? { path: r.filePath } : {}),
11
+ count: r.files,
12
+ };
13
+ for (const f of METRIC_FIELDS) {
14
+ if (r[hasKey(f)]) entry[f] = r[f] || 0;
15
+ }
16
+ entry.size = r.size;
17
+ return entry;
18
+ };
19
+
20
+ const mapTotals = (t) => {
21
+ const entry = { files: t.files };
22
+ for (const f of METRIC_FIELDS) {
23
+ if (columns[hasKey(f)]) entry[f] = t[f];
24
+ }
25
+ entry.size = t.size;
26
+ return entry;
27
+ };
28
+
2
29
  const output = {
3
30
  documents: {
4
- files: stats.rows.map(r => ({
5
- type: r.fileType,
6
- ...(r.fileName ? { name: r.fileName } : {}),
7
- ...(r.filePath ? { path: r.filePath } : {}),
8
- count: r.files,
9
- words: r.words || 0,
10
- pages: r.pages || 0,
11
- paragraphs: r.paragraphs || 0,
12
- sheets: r.sheets || 0,
13
- rows: r.rows || 0,
14
- cells: r.cells || 0,
15
- slides: r.slides || 0,
16
- size: r.size,
17
- })),
18
- totals: {
19
- files: stats.totals.files,
20
- words: stats.totals.words,
21
- pages: stats.totals.pages,
22
- paragraphs: stats.totals.paragraphs,
23
- sheets: stats.totals.sheets,
24
- rows: stats.totals.rows,
25
- cells: stats.totals.cells,
26
- slides: stats.totals.slides,
27
- size: stats.totals.size,
28
- },
31
+ files: stats.rows.map(mapRow),
32
+ totals: mapTotals(stats.totals),
29
33
  },
30
34
  };
31
35
 
@@ -6,25 +6,30 @@ export function formatDocumentTable(stats, options = {}) {
6
6
  const { ci = false } = options;
7
7
  const c = ci ? noColor : colorize;
8
8
 
9
- const headers = buildHeaders(stats.columns, stats.mode === 'by-file', c);
9
+ const isByFile = stats.mode === 'by-file';
10
+ const headers = buildHeaders(stats.columns, isByFile, c);
11
+ const colAligns = buildColAligns(stats.columns, isByFile);
10
12
  const table = new Table({
11
13
  head: headers.map(h => h.label),
12
- chars: ci ? asciiChars() : unicodeChars(),
14
+ chars: tableChars(ci),
13
15
  style: { head: [], border: [] },
16
+ colAligns,
14
17
  });
15
18
 
16
19
  for (const row of stats.rows) {
17
- table.push(buildRow(row, stats.columns, stats.mode === 'by-file', c));
20
+ table.push(buildRow(row, stats.columns, isByFile, c));
18
21
  }
19
22
 
20
- // Totals row
21
- const isByFile = stats.mode === 'by-file';
22
23
  table.push(buildRow(stats.totals, stats.columns, isByFile, c, true));
23
24
 
25
+ const tableStr = addSeparators(table.toString(), ci ? '-' : '─');
26
+
27
+ const tableWidth = stripAnsi(tableStr.split('\n')[0]).length;
28
+
24
29
  const lines = [];
25
30
  lines.push('');
26
- lines.push(c.header(`── Documents ${'─'.repeat(56)}`));
27
- lines.push(table.toString());
31
+ lines.push(c.header(sectionHeader('Documents', tableWidth, ci)));
32
+ lines.push(tableStr);
28
33
 
29
34
  // Footnotes
30
35
  const hasEstimatedPages = stats.rows.some(r =>
@@ -52,7 +57,7 @@ export function formatSccTable(sccData, options = {}) {
52
57
  c.headerCell('Comments'),
53
58
  c.headerCell('Code'),
54
59
  ],
55
- chars: ci ? asciiChars() : unicodeChars(),
60
+ chars: tableChars(ci),
56
61
  style: { head: [], border: [] },
57
62
  colAligns: ['left', 'right', 'right', 'right', 'right', 'right'],
58
63
  });
@@ -66,8 +71,8 @@ export function formatSccTable(sccData, options = {}) {
66
71
  c.type(file.Filename || file.Location || ''),
67
72
  formatNumber(1),
68
73
  c.number(formatNumber(file.Lines)),
69
- formatNumber(file.Blank),
70
- formatNumber(file.Comment),
74
+ c.number(formatNumber(file.Blank)),
75
+ c.number(formatNumber(file.Comment)),
71
76
  c.number(formatNumber(file.Code)),
72
77
  ]);
73
78
  }
@@ -76,8 +81,8 @@ export function formatSccTable(sccData, options = {}) {
76
81
  c.type(lang.Name),
77
82
  formatNumber(lang.Count),
78
83
  c.number(formatNumber(lang.Lines)),
79
- formatNumber(lang.Blank),
80
- formatNumber(lang.Comment),
84
+ c.number(formatNumber(lang.Blank)),
85
+ c.number(formatNumber(lang.Comment)),
81
86
  c.number(formatNumber(lang.Code)),
82
87
  ]);
83
88
  }
@@ -97,30 +102,117 @@ export function formatSccTable(sccData, options = {}) {
97
102
  c.total(formatNumber(totalCode)),
98
103
  ]);
99
104
 
105
+ const tableStr = addSeparators(table.toString(), ci ? '-' : '─');
106
+ const tableWidth = stripAnsi(tableStr.split('\n')[0]).length;
107
+
100
108
  const lines = [];
101
109
  lines.push('');
102
- lines.push(c.header(`── Code (via scc) ${'─'.repeat(51)}`));
103
- lines.push(table.toString());
110
+ lines.push(c.header(sectionHeader('Code (via scc)', tableWidth, ci)));
111
+ lines.push(tableStr);
104
112
 
105
113
  return lines.join('\n');
106
114
  }
107
115
 
116
+ export function formatSummaryLine(stats, sccData, elapsed, options = {}) {
117
+ const { ci = false } = options;
118
+ const c = ci ? noColor : colorize;
119
+
120
+ const parts = [];
121
+ if (stats && stats.totals.files > 0) {
122
+ let docPart = `${stats.totals.files} document${stats.totals.files !== 1 ? 's' : ''}`;
123
+ const details = [];
124
+ if (stats.totals.words > 0) details.push(`${formatNumber(stats.totals.words)} word${stats.totals.words !== 1 ? 's' : ''}`);
125
+ if (stats.totals.pages > 0) details.push(`${formatNumber(stats.totals.pages)} page${stats.totals.pages !== 1 ? 's' : ''}`);
126
+ if (details.length > 0) docPart += ` (${details.join(', ')})`;
127
+ parts.push(docPart);
128
+ }
129
+ if (sccData && sccData.length > 0) {
130
+ const totalCode = sccData.reduce((sum, l) => sum + (l.Code || 0), 0);
131
+ parts.push(`${formatNumber(totalCode)} lines of code`);
132
+ }
133
+
134
+ if (parts.length === 0) return '';
135
+
136
+ const time = elapsed >= 1000
137
+ ? `${(elapsed / 1000).toFixed(1)}s`
138
+ : `${elapsed}ms`;
139
+
140
+ return '\n' + c.dim(`Scanned ${parts.join(', ')} in ${time}`) + '\n';
141
+ }
142
+
143
+ /**
144
+ * Post-process table string to insert separator lines after the header row
145
+ * and before the totals row (last data row).
146
+ *
147
+ * Table layout from cli-table3 (with empty mid chars):
148
+ * line 0: top border
149
+ * line 1: header row
150
+ * lines 2..N-2: data rows
151
+ * line N-1: totals row
152
+ * line N: bottom border
153
+ */
154
+ function addSeparators(tableStr, char) {
155
+ const lines = tableStr.split('\n');
156
+ if (lines.length < 4) return tableStr;
157
+
158
+ // Use header row width — top border is narrower due to single-char top-mid vs 2-char middle
159
+ const width = stripAnsi(lines[1]).length;
160
+ const sep = char.repeat(width);
161
+
162
+ const result = [];
163
+ // Skip lines[0] (top border) — section header already serves as delimiter
164
+ result.push(lines[1]); // header row
165
+ result.push(sep); // header separator
166
+
167
+ // Data rows (everything except first 2 and last 2)
168
+ for (let i = 2; i < lines.length - 2; i++) {
169
+ result.push(lines[i]);
170
+ }
171
+
172
+ result.push(sep); // totals separator
173
+ result.push(lines[lines.length - 2]); // totals row
174
+ // Skip bottom border — totals row is the natural end
175
+
176
+ return result.join('\n');
177
+ }
178
+
179
+ function stripAnsi(str) {
180
+ return str.replace(/\x1b\[[0-9;]*m/g, '');
181
+ }
182
+
183
+ function sectionHeader(title, width, ci = false) {
184
+ const dash = ci ? '-' : '─';
185
+ const prefix = `${dash}${dash} ${title} `;
186
+ const padLen = Math.max(0, width - prefix.length);
187
+ return prefix + dash.repeat(padLen);
188
+ }
189
+
190
+ function hasExtraColumns(columns) {
191
+ return columns.hasParagraphs || columns.hasSheets || columns.hasSlides ||
192
+ columns.hasRows || columns.hasCells;
193
+ }
194
+
108
195
  function buildHeaders(columns, byFile, c) {
109
196
  const headers = [];
110
197
  headers.push({ key: 'format', label: c.headerCell(byFile ? 'File' : 'Format') });
111
198
  if (!byFile) headers.push({ key: 'files', label: c.headerCell('Files') });
112
199
  if (columns.hasWords) headers.push({ key: 'words', label: c.headerCell('Words') });
113
200
  if (columns.hasPages) headers.push({ key: 'pages', label: c.headerCell('Pages') });
114
-
115
- // Extra column for type-specific metrics
116
- const hasExtra = columns.hasParagraphs || columns.hasSheets || columns.hasSlides ||
117
- columns.hasRows || columns.hasCells;
118
- if (hasExtra) headers.push({ key: 'extra', label: c.headerCell('Extra') });
119
-
201
+ if (hasExtraColumns(columns)) headers.push({ key: 'extra', label: c.headerCell('Details') });
120
202
  headers.push({ key: 'size', label: c.headerCell('Size') });
121
203
  return headers;
122
204
  }
123
205
 
206
+ function buildColAligns(columns, byFile) {
207
+ const aligns = ['left']; // Format/File
208
+ if (!byFile) aligns.push('right'); // Files
209
+ if (columns.hasWords) aligns.push('right');
210
+ if (columns.hasPages) aligns.push('right');
211
+ if (hasExtraColumns(columns)) aligns.push('right');
212
+ aligns.push('right'); // Size
213
+ return aligns;
214
+ }
215
+
124
216
  function buildRow(row, columns, byFile, c, isTotal = false) {
125
217
  const fmt = isTotal ? c.total : (v) => v;
126
218
  const fmtType = isTotal ? c.total : c.type;
@@ -142,9 +234,7 @@ function buildRow(row, columns, byFile, c, isTotal = false) {
142
234
  if (columns.hasWords) cells.push(fmtNum(row.words ? formatNumber(row.words) : ''));
143
235
  if (columns.hasPages) cells.push(fmtNum(row.pages ? formatNumber(row.pages) : ''));
144
236
 
145
- const hasExtra = columns.hasParagraphs || columns.hasSheets || columns.hasSlides ||
146
- columns.hasRows || columns.hasCells;
147
- if (hasExtra) {
237
+ if (hasExtraColumns(columns)) {
148
238
  const parts = [];
149
239
  if (row.paragraphs) parts.push(`${formatNumber(row.paragraphs)} paras`);
150
240
  if (row.sheets) parts.push(`${formatNumber(row.sheets)} sheets`);
@@ -158,21 +248,13 @@ function buildRow(row, columns, byFile, c, isTotal = false) {
158
248
  return cells;
159
249
  }
160
250
 
161
- function unicodeChars() {
251
+ function tableChars(ci) {
252
+ const ch = ci ? '-' : '─';
162
253
  return {
163
- top: '─', 'top-mid': '─', 'top-left': '─', 'top-right': '─',
164
- bottom: '─', 'bottom-mid': '─', 'bottom-left': '─', 'bottom-right': '─',
165
- left: ' ', 'left-mid': '', mid: '', 'mid-mid': '',
166
- right: ' ', 'right-mid': '', middle: ' ',
167
- };
168
- }
169
-
170
- function asciiChars() {
171
- return {
172
- top: '-', 'top-mid': '-', 'top-left': '-', 'top-right': '-',
173
- bottom: '-', 'bottom-mid': '-', 'bottom-left': '-', 'bottom-right': '-',
174
- left: ' ', 'left-mid': '-', mid: '-', 'mid-mid': '-',
175
- right: ' ', 'right-mid': '-', middle: ' ',
254
+ top: ch, 'top-mid': ch, 'top-left': ch, 'top-right': ch,
255
+ bottom: ch, 'bottom-mid': ch, 'bottom-left': ch, 'bottom-right': ch,
256
+ left: ' ', 'left-mid': '', mid: '', 'mid-mid': '',
257
+ right: ' ', 'right-mid': '', middle: ' ',
176
258
  };
177
259
  }
178
260
 
@@ -186,12 +268,5 @@ const colorize = {
186
268
  dim: (s) => chalk.dim(s),
187
269
  };
188
270
 
189
- const noColor = {
190
- header: (s) => s,
191
- headerCell: (s) => s,
192
- type: (s) => s,
193
- number: (s) => s,
194
- total: (s) => s,
195
- error: (s) => s,
196
- dim: (s) => s,
197
- };
271
+ const identity = (s) => s;
272
+ const noColor = Object.fromEntries(Object.keys(colorize).map(k => [k, identity]));
@@ -10,14 +10,6 @@ export async function parseDocx(filePath) {
10
10
 
11
11
  return {
12
12
  fileType: 'Word',
13
- metrics: {
14
- words,
15
- pages,
16
- paragraphs,
17
- sheets: null,
18
- rows: null,
19
- cells: null,
20
- slides: null,
21
- },
13
+ metrics: { words, pages, paragraphs },
22
14
  };
23
15
  }
@@ -15,18 +15,22 @@ const PARSER_MAP = {
15
15
  odp: parseOdf,
16
16
  };
17
17
 
18
+ function failureResult(filePath, size, ext) {
19
+ return {
20
+ filePath,
21
+ size,
22
+ success: false,
23
+ fileType: EXTENSION_TO_TYPE[ext] || ext.toUpperCase(),
24
+ metrics: null,
25
+ };
26
+ }
27
+
18
28
  export async function parseFile(filePath, size) {
19
29
  const ext = getExtension(filePath);
20
30
  const parser = PARSER_MAP[ext];
21
31
 
22
32
  if (!parser) {
23
- return {
24
- filePath,
25
- size,
26
- success: false,
27
- fileType: EXTENSION_TO_TYPE[ext] || ext.toUpperCase(),
28
- metrics: null,
29
- };
33
+ return failureResult(filePath, size, ext);
30
34
  }
31
35
 
32
36
  try {
@@ -39,13 +43,7 @@ export async function parseFile(filePath, size) {
39
43
  metrics: result.metrics,
40
44
  };
41
45
  } catch {
42
- return {
43
- filePath,
44
- size,
45
- success: false,
46
- fileType: EXTENSION_TO_TYPE[ext] || ext.toUpperCase(),
47
- metrics: null,
48
- };
46
+ return failureResult(filePath, size, ext);
49
47
  }
50
48
  }
51
49
 
@@ -5,11 +5,12 @@ import { countWords, getExtension } from '../utils.js';
5
5
 
6
6
  export async function parseOdf(filePath) {
7
7
  const ext = getExtension(filePath);
8
- const buffer = await readFile(filePath);
9
8
 
10
9
  if (ext === 'odt') return parseOdt(filePath);
11
- if (ext === 'ods') return parseOds(filePath, buffer);
12
- if (ext === 'odp') return parseOdp(filePath, buffer);
10
+
11
+ const buffer = await readFile(filePath);
12
+ if (ext === 'ods') return parseOds(buffer);
13
+ if (ext === 'odp') return parseOdp(buffer);
13
14
 
14
15
  throw new Error(`Unsupported ODF format: ${ext}`);
15
16
  }
@@ -22,19 +23,11 @@ async function parseOdt(filePath) {
22
23
 
23
24
  return {
24
25
  fileType: 'ODT',
25
- metrics: {
26
- words,
27
- pages,
28
- paragraphs,
29
- sheets: null,
30
- rows: null,
31
- cells: null,
32
- slides: null,
33
- },
26
+ metrics: { words, pages, paragraphs },
34
27
  };
35
28
  }
36
29
 
37
- async function parseOds(filePath, buffer) {
30
+ async function parseOds(buffer) {
38
31
  const zip = await JSZip.loadAsync(buffer);
39
32
  const contentXml = await zip.file('content.xml')?.async('text');
40
33
  if (!contentXml) throw new Error('No content.xml found in ODS');
@@ -42,44 +35,28 @@ async function parseOds(filePath, buffer) {
42
35
  const sheets = (contentXml.match(/<table:table /g) || []).length;
43
36
  const rows = (contentXml.match(/<table:table-row/g) || []).length;
44
37
 
45
- // Use officeparser for cell text count
46
- const text = await officeparser.parseOffice(filePath);
38
+ // Use officeparser with buffer to avoid re-reading from disk
39
+ const text = await officeparser.parseOffice(buffer);
47
40
  const cells = text.split(/\n/).filter(s => s.trim().length > 0).length;
48
41
 
49
42
  return {
50
43
  fileType: 'ODS',
51
- metrics: {
52
- words: null,
53
- pages: null,
54
- paragraphs: null,
55
- sheets,
56
- rows,
57
- cells,
58
- slides: null,
59
- },
44
+ metrics: { sheets, rows, cells },
60
45
  };
61
46
  }
62
47
 
63
- async function parseOdp(filePath, buffer) {
48
+ async function parseOdp(buffer) {
64
49
  const zip = await JSZip.loadAsync(buffer);
65
50
  const contentXml = await zip.file('content.xml')?.async('text');
66
51
  if (!contentXml) throw new Error('No content.xml found in ODP');
67
52
 
68
53
  const slides = (contentXml.match(/<draw:page /g) || []).length;
69
54
 
70
- const text = await officeparser.parseOffice(filePath);
55
+ const text = await officeparser.parseOffice(buffer);
71
56
  const words = countWords(text);
72
57
 
73
58
  return {
74
59
  fileType: 'ODP',
75
- metrics: {
76
- words,
77
- pages: null,
78
- paragraphs: null,
79
- sheets: null,
80
- rows: null,
81
- cells: null,
82
- slides,
83
- },
60
+ metrics: { words, slides },
84
61
  };
85
62
  }
@@ -25,6 +25,7 @@ function beginSuppression() {
25
25
  function endSuppression() {
26
26
  if (--suppressionDepth === 0) {
27
27
  console.log = originalLog;
28
+ capturedWarnings.length = 0;
28
29
  }
29
30
  }
30
31
 
@@ -43,14 +44,6 @@ export async function parsePdf(filePath) {
43
44
 
44
45
  return {
45
46
  fileType: 'PDF',
46
- metrics: {
47
- words,
48
- pages: data.numpages,
49
- paragraphs: null,
50
- sheets: null,
51
- rows: null,
52
- cells: null,
53
- slides: null,
54
- },
47
+ metrics: { words, pages: data.numpages },
55
48
  };
56
49
  }
@@ -13,20 +13,12 @@ export async function parsePptx(filePath) {
13
13
  );
14
14
  const slides = slideFiles.length;
15
15
 
16
- // Extract text via officeparser
17
- const text = await officeparser.parseOffice(filePath);
16
+ // Extract text via officeparser (reuse buffer to avoid re-reading)
17
+ const text = await officeparser.parseOffice(buffer);
18
18
  const words = countWords(text);
19
19
 
20
20
  return {
21
21
  fileType: 'PowerPoint',
22
- metrics: {
23
- words,
24
- pages: null,
25
- paragraphs: null,
26
- sheets: null,
27
- rows: null,
28
- cells: null,
29
- slides,
30
- },
22
+ metrics: { words, slides },
31
23
  };
32
24
  }
@@ -18,14 +18,6 @@ export async function parseXlsx(filePath) {
18
18
 
19
19
  return {
20
20
  fileType: 'Excel',
21
- metrics: {
22
- words: null,
23
- pages: null,
24
- paragraphs: null,
25
- sheets,
26
- rows,
27
- cells,
28
- slides: null,
29
- },
21
+ metrics: { sheets, rows, cells },
30
22
  };
31
23
  }
package/src/scc.js CHANGED
@@ -28,19 +28,18 @@ async function findScc() {
28
28
  }
29
29
  }
30
30
 
31
- let sccBinary = null;
32
-
33
31
  export async function checkScc() {
34
- sccBinary = await findScc();
35
- if (!sccBinary) {
32
+ const binary = await findScc();
33
+ if (!binary) {
36
34
  throw new Error(
37
35
  'scc is required but not found.\n' +
38
36
  'Run "npm install" to auto-download it, or install manually from https://github.com/boyter/scc'
39
37
  );
40
38
  }
39
+ return binary;
41
40
  }
42
41
 
43
- export async function runScc(directories, options = {}) {
42
+ export async function runScc(sccBinary, directories, options = {}) {
44
43
  const {
45
44
  byFile = false,
46
45
  excludeDir = [],
@@ -49,7 +48,6 @@ export async function runScc(directories, options = {}) {
49
48
  noGitignore = false,
50
49
  } = options;
51
50
 
52
- if (!sccBinary) sccBinary = await findScc();
53
51
  if (!sccBinary) return [];
54
52
 
55
53
  const args = ['--format', 'json'];
package/src/stats.js CHANGED
@@ -1,5 +1,7 @@
1
1
  import path from 'node:path';
2
- import { EXTENSION_TO_TYPE } from './utils.js';
2
+ import { METRIC_FIELDS, hasKey } from './utils.js';
3
+
4
+ const SUM_FIELDS = ['files', ...METRIC_FIELDS, 'size'];
3
5
 
4
6
  export function aggregate(results, options = {}) {
5
7
  const { byFile = false, sort = 'files' } = options;
@@ -16,25 +18,9 @@ function aggregateByType(results, sort) {
16
18
  for (const r of results) {
17
19
  const key = r.success ? r.fileType : 'Unreadable';
18
20
  if (!groups[key]) {
19
- groups[key] = {
20
- fileType: key,
21
- files: 0,
22
- words: 0,
23
- pages: 0,
24
- paragraphs: 0,
25
- sheets: 0,
26
- rows: 0,
27
- cells: 0,
28
- slides: 0,
29
- size: 0,
30
- hasWords: false,
31
- hasPages: false,
32
- hasParagraphs: false,
33
- hasSheets: false,
34
- hasRows: false,
35
- hasCells: false,
36
- hasSlides: false,
37
- };
21
+ const g = { fileType: key, files: 0, size: 0 };
22
+ for (const f of METRIC_FIELDS) { g[f] = 0; g[hasKey(f)] = false; }
23
+ groups[key] = g;
38
24
  }
39
25
  const g = groups[key];
40
26
  g.files++;
@@ -42,54 +28,39 @@ function aggregateByType(results, sort) {
42
28
 
43
29
  if (r.success && r.metrics) {
44
30
  const m = r.metrics;
45
- if (m.words != null) { g.words += m.words; g.hasWords = true; }
46
- if (m.pages != null) { g.pages += m.pages; g.hasPages = true; }
47
- if (m.paragraphs != null) { g.paragraphs += m.paragraphs; g.hasParagraphs = true; }
48
- if (m.sheets != null) { g.sheets += m.sheets; g.hasSheets = true; }
49
- if (m.rows != null) { g.rows += m.rows; g.hasRows = true; }
50
- if (m.cells != null) { g.cells += m.cells; g.hasCells = true; }
51
- if (m.slides != null) { g.slides += m.slides; g.hasSlides = true; }
31
+ for (const f of METRIC_FIELDS) {
32
+ if (m[f] != null) { g[f] += m[f]; g[hasKey(f)] = true; }
33
+ }
52
34
  }
53
35
  }
54
36
 
55
- const rows = Object.values(groups);
56
- sortRows(rows, sort);
57
-
58
- const totals = computeTotals(rows);
59
- const columns = detectColumns(rows);
60
-
61
- return { rows, totals, columns, mode: 'grouped' };
37
+ return finalize(Object.values(groups), sort, 'grouped');
62
38
  }
63
39
 
64
40
  function aggregateByFile(results, sort) {
65
- const rows = results.map(r => ({
66
- fileType: r.success ? r.fileType : 'Unreadable',
67
- fileName: path.basename(r.filePath),
68
- filePath: r.filePath,
69
- files: 1,
70
- words: r.metrics?.words || 0,
71
- pages: r.metrics?.pages || 0,
72
- paragraphs: r.metrics?.paragraphs || 0,
73
- sheets: r.metrics?.sheets || 0,
74
- rows: r.metrics?.rows || 0,
75
- cells: r.metrics?.cells || 0,
76
- slides: r.metrics?.slides || 0,
77
- size: r.size || 0,
78
- hasWords: r.metrics?.words != null,
79
- hasPages: r.metrics?.pages != null,
80
- hasParagraphs: r.metrics?.paragraphs != null,
81
- hasSheets: r.metrics?.sheets != null,
82
- hasRows: r.metrics?.rows != null,
83
- hasCells: r.metrics?.cells != null,
84
- hasSlides: r.metrics?.slides != null,
85
- }));
41
+ const rows = results.map(r => {
42
+ const row = {
43
+ fileType: r.success ? r.fileType : 'Unreadable',
44
+ fileName: path.basename(r.filePath),
45
+ filePath: r.filePath,
46
+ files: 1,
47
+ size: r.size || 0,
48
+ };
49
+ for (const f of METRIC_FIELDS) {
50
+ row[f] = r.metrics?.[f] || 0;
51
+ row[hasKey(f)] = r.metrics?.[f] != null;
52
+ }
53
+ return row;
54
+ });
86
55
 
87
- sortRows(rows, sort);
56
+ return finalize(rows, sort, 'by-file');
57
+ }
88
58
 
59
+ function finalize(rows, sort, mode) {
60
+ sortRows(rows, sort);
89
61
  const totals = computeTotals(rows);
90
62
  const columns = detectColumns(rows);
91
-
92
- return { rows, totals, columns, mode: 'by-file' };
63
+ return { rows, totals, columns, mode };
93
64
  }
94
65
 
95
66
  function sortRows(rows, sort) {
@@ -104,40 +75,18 @@ function sortRows(rows, sort) {
104
75
  }
105
76
 
106
77
  function computeTotals(rows) {
107
- const totals = {
108
- fileType: 'Total',
109
- files: 0,
110
- words: 0,
111
- pages: 0,
112
- paragraphs: 0,
113
- sheets: 0,
114
- rows: 0,
115
- cells: 0,
116
- slides: 0,
117
- size: 0,
118
- };
78
+ const totals = { fileType: 'Total' };
79
+ for (const f of SUM_FIELDS) totals[f] = 0;
119
80
  for (const r of rows) {
120
- totals.files += r.files;
121
- totals.words += r.words;
122
- totals.pages += r.pages;
123
- totals.paragraphs += r.paragraphs;
124
- totals.sheets += r.sheets;
125
- totals.rows += r.rows;
126
- totals.cells += r.cells;
127
- totals.slides += r.slides;
128
- totals.size += r.size;
81
+ for (const f of SUM_FIELDS) totals[f] += r[f];
129
82
  }
130
83
  return totals;
131
84
  }
132
85
 
133
86
  function detectColumns(rows) {
134
- return {
135
- hasWords: rows.some(r => r.hasWords),
136
- hasPages: rows.some(r => r.hasPages),
137
- hasParagraphs: rows.some(r => r.hasParagraphs),
138
- hasSheets: rows.some(r => r.hasSheets),
139
- hasRows: rows.some(r => r.hasRows),
140
- hasCells: rows.some(r => r.hasCells),
141
- hasSlides: rows.some(r => r.hasSlides),
142
- };
87
+ const columns = {};
88
+ for (const f of METRIC_FIELDS) {
89
+ columns[hasKey(f)] = rows.some(r => r[hasKey(f)]);
90
+ }
91
+ return columns;
143
92
  }
package/src/utils.js CHANGED
@@ -22,8 +22,6 @@ export function getExtension(filePath) {
22
22
  return path.extname(filePath).toLowerCase().replace('.', '');
23
23
  }
24
24
 
25
- export const OFFICE_EXTENSIONS = ['docx', 'xlsx', 'pptx', 'pdf', 'odt', 'ods', 'odp'];
26
-
27
25
  export const EXTENSION_TO_TYPE = {
28
26
  docx: 'Word',
29
27
  pdf: 'PDF',
@@ -33,3 +31,11 @@ export const EXTENSION_TO_TYPE = {
33
31
  ods: 'ODS',
34
32
  odp: 'ODP',
35
33
  };
34
+
35
+ export const OFFICE_EXTENSIONS = Object.keys(EXTENSION_TO_TYPE);
36
+
37
+ export const METRIC_FIELDS = ['words', 'pages', 'paragraphs', 'sheets', 'rows', 'cells', 'slides'];
38
+
39
+ export function hasKey(field) {
40
+ return `has${field[0].toUpperCase()}${field.slice(1)}`;
41
+ }
package/src/walker.js CHANGED
@@ -30,55 +30,40 @@ export async function findFiles(directories, options = {}) {
30
30
 
31
31
  const ignore = excludeDir.map(d => `**/${d}/**`);
32
32
 
33
- const paths = await fg(pattern, {
34
- cwd: undefined,
35
- absolute: true,
36
- ignore,
37
- dot: false,
38
- onlyFiles: true,
39
- followSymbolicLinks: false,
40
- ...(directories.length > 0 ? {} : { cwd: process.cwd() }),
41
- });
42
-
43
- // If directories specified, search each one
44
- let allPaths = [];
45
- if (directories.length > 0) {
46
- for (const dir of directories) {
47
- const found = await fg(pattern, {
48
- cwd: dir,
49
- absolute: true,
50
- ignore,
51
- dot: false,
52
- onlyFiles: true,
53
- followSymbolicLinks: false,
54
- });
55
- allPaths.push(...found);
56
- }
57
- } else {
58
- allPaths = await fg(pattern, {
59
- cwd: process.cwd(),
33
+ const dirs = directories.length > 0 ? directories : [process.cwd()];
34
+ const allPaths = [];
35
+ for (const dir of dirs) {
36
+ const found = await fg(pattern, {
37
+ cwd: dir,
60
38
  absolute: true,
61
39
  ignore,
62
40
  dot: false,
63
41
  onlyFiles: true,
64
42
  followSymbolicLinks: false,
65
43
  });
44
+ allPaths.push(...found);
66
45
  }
67
46
 
68
47
  const limitBytes = largeFileLimit * 1024 * 1024;
69
48
  const files = [];
70
49
  const skipped = [];
71
50
 
72
- for (const p of allPaths) {
73
- try {
74
- const s = await stat(p);
75
- if (s.size > limitBytes) {
76
- skipped.push({ path: p, reason: `Exceeds ${largeFileLimit}MB limit`, size: s.size });
51
+ // Batch stat calls for better throughput on large directories
52
+ const BATCH_SIZE = 50;
53
+ for (let i = 0; i < allPaths.length; i += BATCH_SIZE) {
54
+ const batch = allPaths.slice(i, i + BATCH_SIZE);
55
+ const results = await Promise.allSettled(batch.map(p => stat(p)));
56
+ for (let j = 0; j < results.length; j++) {
57
+ const p = batch[j];
58
+ const r = results[j];
59
+ if (r.status === 'rejected') {
60
+ const err = r.reason;
61
+ skipped.push({ path: p, reason: err.code === 'EACCES' ? 'Permission denied' : err.message, size: 0 });
62
+ } else if (r.value.size > limitBytes) {
63
+ skipped.push({ path: p, reason: `Exceeds ${largeFileLimit}MB limit`, size: r.value.size });
77
64
  } else {
78
- files.push({ path: p, size: s.size });
65
+ files.push({ path: p, size: r.value.size });
79
66
  }
80
- } catch (err) {
81
- skipped.push({ path: p, reason: err.code === 'EACCES' ? 'Permission denied' : err.message, size: 0 });
82
67
  }
83
68
  }
84
69