toksize 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. package/README.md +28 -10
  2. package/dist/cli.js +292 -15
  3. package/package.json +2 -2
package/README.md CHANGED
@@ -3,7 +3,6 @@
3
3
  > Know what's eating your context window.
4
4
 
5
5
  [![npm version](https://img.shields.io/npm/v/toksize.svg)](https://www.npmjs.com/package/toksize)
6
- [![CI](https://github.com/toksize/toksize/actions/workflows/ci.yml/badge.svg)](https://github.com/toksize/toksize/actions/workflows/ci.yml)
7
6
  [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](./LICENSE)
8
7
 
9
8
  A CLI that scans a project directory and reports LLM tokens per file, per folder, and for the whole tree. Think `ncdu` but for tokens instead of disk.
@@ -49,14 +48,18 @@ toksize
49
48
  # Scan a specific path
50
49
  toksize ./src
51
50
 
51
+ # Target a specific model (aliases like `claude`, `gpt`, `gemini` work too)
52
+ toksize --model claude-opus-4.6
53
+ toksize --model gpt-4o
54
+
55
+ # See which models are supported
56
+ toksize models
57
+
52
58
  # Only TypeScript files, show top 10
53
59
  toksize --ext ts,tsx --top 10
54
60
 
55
61
  # Export JSON for post-processing
56
62
  toksize --format json --output report.json
57
-
58
- # GPT-4o tokenizer
59
- toksize --encoding o200k_base
60
63
  ```
61
64
 
62
65
  ## Options
@@ -64,6 +67,7 @@ toksize --encoding o200k_base
64
67
  | Flag | Description | Default |
65
68
  |------|-------------|---------|
66
69
  | `--format <fmt>` | `tree`, `json`, `csv`, or `table` | `tree` |
70
+ | `--model <id>` | Target model (overrides `--encoding`). See below. | — |
67
71
  | `--encoding <enc>` | `cl100k_base` or `o200k_base` | `cl100k_base` |
68
72
  | `--ext <list>` | Comma-separated extensions to include | *(all)* |
69
73
  | `--depth <n>` | Max recursion depth | unlimited |
@@ -94,14 +98,28 @@ Layered, applied in order:
94
98
 
95
99
  Use `--show-skipped` to see what was dropped.
96
100
 
97
- ## Encodings
101
+ ## Models
102
+
103
+ Pick your target with `--model`. Run `toksize models` for the full list. Aliases such as `claude`, `opus`, `sonnet`, `haiku`, `gpt`, `gemini`, `llama`, `mistral`, `deepseek`, and `grok` resolve to the latest flagship per provider.
104
+
105
+ | Provider | Models | Accuracy |
106
+ |----------|--------|----------|
107
+ | OpenAI | `gpt-4o`, `gpt-4o-mini`, `o1`, `o1-mini`, `o3`, `o3-mini`, `gpt-4`, `gpt-4-turbo`, `gpt-3.5-turbo` | Exact |
108
+ | Anthropic | `claude-opus-4.6`, `claude-sonnet-4.5`, `claude-haiku-4`, `claude-3.5-sonnet`, `claude-3-opus` | Approx (±10-15%) |
109
+ | Google | `gemini-2.5-pro`, `gemini-2.0-flash`, `gemini-1.5-pro` | Approx (±10-15%) |
110
+ | Meta | `llama-4`, `llama-3.3`, `llama-3.1` | Approx (±10-15%) |
111
+ | Mistral | `mistral-large`, `mistral-small` | Approx (±10-15%) |
112
+ | DeepSeek | `deepseek-v3`, `deepseek-r1` | Approx (±10%) |
113
+ | xAI | `grok-3`, `grok-2` | Approx (±10-15%) |
114
+
115
+ ### Encodings
116
+
117
+ toksize counts locally using [`js-tiktoken`](https://github.com/dqbd/tiktoken) (Wasm, no native build, no network). Two encodings ship:
98
118
 
99
- | Encoding | Models |
100
- |----------|--------|
101
- | `cl100k_base` | GPT-4, GPT-3.5-turbo. Reasonable approximation for Claude too. |
102
- | `o200k_base` | GPT-4o family. |
119
+ - `cl100k_base` — GPT-4 family + closest proxy for most non-OpenAI models.
120
+ - `o200k_base` — GPT-4o, o1, o3. Closer proxy for Gemini.
103
121
 
104
- toksize does not call any API. Counts are computed locally with [`js-tiktoken`](https://github.com/dqbd/tiktoken), which ships a Wasm tokenizer no native build step, no network.
122
+ Non-OpenAI counts are approximations: the tokenizer is not the model's native one, so expect ±10-15% drift depending on content (code compresses better than prose on every tokenizer). Use `--model` to make that explicit in the output.
105
123
 
106
124
  ## Programmatic API
107
125
 
package/dist/cli.js CHANGED
@@ -278,12 +278,15 @@ function renderCsv(root) {
278
278
  }
279
279
 
280
280
  // src/output/json.ts
281
- function renderJson(root, encoding) {
281
+ function renderJson(root, encoding, opts = {}) {
282
282
  const report = {
283
283
  encoding,
284
284
  totalTokens: root.tokens,
285
285
  root
286
286
  };
287
+ if (opts.modelId && opts.modelLabel !== void 0 && opts.modelExact !== void 0) {
288
+ report.model = { id: opts.modelId, label: opts.modelLabel, exact: opts.modelExact };
289
+ }
287
290
  return JSON.stringify(report, null, 2);
288
291
  }
289
292
 
@@ -300,7 +303,8 @@ function renderTable(root, encoding, opts = { useColor: true, topN: 20 }) {
300
303
  const tokenWidth = Math.max(6, ...top.map((f) => fmtNum(f.tokens).length));
301
304
  const pctWidth = 6;
302
305
  const lines = [];
303
- const header = `toksize \u2014 ${fmtNum(root.tokens)} tokens (${encoding})`;
306
+ const suffix = opts.modelLabel ? `${opts.modelLabel}, ${encoding}${opts.modelExact ? "" : " ~approx"}` : encoding;
307
+ const header = `toksize \u2014 ${fmtNum(root.tokens)} tokens (${suffix})`;
304
308
  lines.push(opts.useColor ? chalk.bold(header) : header);
305
309
  lines.push("");
306
310
  const titleRow = `${"PATH".padEnd(pathWidth)} ${"TOKENS".padStart(tokenWidth)} ${"PCT".padStart(pctWidth)}`;
@@ -343,8 +347,13 @@ function renderNode(node, prefix, maxSibling, lines, opts) {
343
347
  }
344
348
  function renderTree(root, encoding, opts = { useColor: true, topN: 5 }) {
345
349
  const lines = [];
346
- const title = `toksize \u2014 ${fmtNum2(root.tokens)} tokens (${encoding})`;
350
+ const suffix = opts.modelLabel ? `${opts.modelLabel}, ${encoding}${opts.modelExact ? "" : " ~approx"}` : encoding;
351
+ const title = `toksize \u2014 ${fmtNum2(root.tokens)} tokens (${suffix})`;
347
352
  lines.push(opts.useColor ? chalk2.bold(title) : title);
353
+ if (opts.modelLabel && opts.modelExact === false) {
354
+ const note = "Approximate count. Non-native tokenizer; expect \xB110-15% drift.";
355
+ lines.push(opts.useColor ? chalk2.dim(note) : note);
356
+ }
348
357
  lines.push("");
349
358
  const maxSibling = root.children.reduce((m, c) => Math.max(m, c.tokens), 0);
350
359
  for (const child of root.children) {
@@ -384,20 +393,25 @@ async function countAll(files, encoding, skipped) {
384
393
  return results;
385
394
  }
386
395
  function render(root, input) {
396
+ const { modelId, modelLabel, modelExact } = input.options;
387
397
  switch (input.format) {
388
398
  case "json":
389
- return renderJson(root, input.options.encoding);
399
+ return renderJson(root, input.options.encoding, { modelId, modelLabel, modelExact });
390
400
  case "csv":
391
401
  return renderCsv(root);
392
402
  case "table":
393
403
  return renderTable(root, input.options.encoding, {
394
404
  useColor: input.useColor,
395
- topN: input.top
405
+ topN: input.top,
406
+ modelLabel,
407
+ modelExact
396
408
  });
397
409
  case "tree":
398
410
  return renderTree(root, input.options.encoding, {
399
411
  useColor: input.useColor,
400
- topN: input.top
412
+ topN: input.top,
413
+ modelLabel,
414
+ modelExact
401
415
  });
402
416
  default: {
403
417
  const exhaustive = input.format;
@@ -429,6 +443,213 @@ async function runScan(input) {
429
443
  return { stdout, skipped, root };
430
444
  }
431
445
 
446
+ // src/core/models.ts
447
+ var MODELS = {
448
+ // OpenAI — exact
449
+ "gpt-4o": {
450
+ id: "gpt-4o",
451
+ label: "GPT-4o",
452
+ provider: "openai",
453
+ encoding: "o200k_base",
454
+ exact: true
455
+ },
456
+ "gpt-4o-mini": {
457
+ id: "gpt-4o-mini",
458
+ label: "GPT-4o mini",
459
+ provider: "openai",
460
+ encoding: "o200k_base",
461
+ exact: true
462
+ },
463
+ o1: { id: "o1", label: "o1", provider: "openai", encoding: "o200k_base", exact: true },
464
+ "o1-mini": {
465
+ id: "o1-mini",
466
+ label: "o1 mini",
467
+ provider: "openai",
468
+ encoding: "o200k_base",
469
+ exact: true
470
+ },
471
+ o3: { id: "o3", label: "o3", provider: "openai", encoding: "o200k_base", exact: true },
472
+ "o3-mini": {
473
+ id: "o3-mini",
474
+ label: "o3 mini",
475
+ provider: "openai",
476
+ encoding: "o200k_base",
477
+ exact: true
478
+ },
479
+ "gpt-4": {
480
+ id: "gpt-4",
481
+ label: "GPT-4",
482
+ provider: "openai",
483
+ encoding: "cl100k_base",
484
+ exact: true
485
+ },
486
+ "gpt-4-turbo": {
487
+ id: "gpt-4-turbo",
488
+ label: "GPT-4 Turbo",
489
+ provider: "openai",
490
+ encoding: "cl100k_base",
491
+ exact: true
492
+ },
493
+ "gpt-3.5-turbo": {
494
+ id: "gpt-3.5-turbo",
495
+ label: "GPT-3.5 Turbo",
496
+ provider: "openai",
497
+ encoding: "cl100k_base",
498
+ exact: true
499
+ },
500
+ // Anthropic — approx
501
+ "claude-opus-4.6": {
502
+ id: "claude-opus-4.6",
503
+ label: "Claude Opus 4.6",
504
+ provider: "anthropic",
505
+ encoding: "cl100k_base",
506
+ exact: false
507
+ },
508
+ "claude-sonnet-4.5": {
509
+ id: "claude-sonnet-4.5",
510
+ label: "Claude Sonnet 4.5",
511
+ provider: "anthropic",
512
+ encoding: "cl100k_base",
513
+ exact: false
514
+ },
515
+ "claude-haiku-4": {
516
+ id: "claude-haiku-4",
517
+ label: "Claude Haiku 4",
518
+ provider: "anthropic",
519
+ encoding: "cl100k_base",
520
+ exact: false
521
+ },
522
+ "claude-3.5-sonnet": {
523
+ id: "claude-3.5-sonnet",
524
+ label: "Claude 3.5 Sonnet",
525
+ provider: "anthropic",
526
+ encoding: "cl100k_base",
527
+ exact: false
528
+ },
529
+ "claude-3-opus": {
530
+ id: "claude-3-opus",
531
+ label: "Claude 3 Opus",
532
+ provider: "anthropic",
533
+ encoding: "cl100k_base",
534
+ exact: false
535
+ },
536
+ // Google — approx
537
+ "gemini-2.5-pro": {
538
+ id: "gemini-2.5-pro",
539
+ label: "Gemini 2.5 Pro",
540
+ provider: "google",
541
+ encoding: "o200k_base",
542
+ exact: false
543
+ },
544
+ "gemini-2.0-flash": {
545
+ id: "gemini-2.0-flash",
546
+ label: "Gemini 2.0 Flash",
547
+ provider: "google",
548
+ encoding: "o200k_base",
549
+ exact: false
550
+ },
551
+ "gemini-1.5-pro": {
552
+ id: "gemini-1.5-pro",
553
+ label: "Gemini 1.5 Pro",
554
+ provider: "google",
555
+ encoding: "o200k_base",
556
+ exact: false
557
+ },
558
+ // Meta — approx
559
+ "llama-4": {
560
+ id: "llama-4",
561
+ label: "Llama 4",
562
+ provider: "meta",
563
+ encoding: "cl100k_base",
564
+ exact: false
565
+ },
566
+ "llama-3.3": {
567
+ id: "llama-3.3",
568
+ label: "Llama 3.3",
569
+ provider: "meta",
570
+ encoding: "cl100k_base",
571
+ exact: false
572
+ },
573
+ "llama-3.1": {
574
+ id: "llama-3.1",
575
+ label: "Llama 3.1",
576
+ provider: "meta",
577
+ encoding: "cl100k_base",
578
+ exact: false
579
+ },
580
+ // Mistral — approx
581
+ "mistral-large": {
582
+ id: "mistral-large",
583
+ label: "Mistral Large",
584
+ provider: "mistral",
585
+ encoding: "cl100k_base",
586
+ exact: false
587
+ },
588
+ "mistral-small": {
589
+ id: "mistral-small",
590
+ label: "Mistral Small",
591
+ provider: "mistral",
592
+ encoding: "cl100k_base",
593
+ exact: false
594
+ },
595
+ // DeepSeek — approx
596
+ "deepseek-v3": {
597
+ id: "deepseek-v3",
598
+ label: "DeepSeek V3",
599
+ provider: "deepseek",
600
+ encoding: "cl100k_base",
601
+ exact: false
602
+ },
603
+ "deepseek-r1": {
604
+ id: "deepseek-r1",
605
+ label: "DeepSeek R1",
606
+ provider: "deepseek",
607
+ encoding: "cl100k_base",
608
+ exact: false
609
+ },
610
+ // xAI — approx
611
+ "grok-3": {
612
+ id: "grok-3",
613
+ label: "Grok 3",
614
+ provider: "xai",
615
+ encoding: "cl100k_base",
616
+ exact: false
617
+ },
618
+ "grok-2": {
619
+ id: "grok-2",
620
+ label: "Grok 2",
621
+ provider: "xai",
622
+ encoding: "cl100k_base",
623
+ exact: false
624
+ }
625
+ };
626
+ var MODEL_ALIASES = {
627
+ // Generic aliases pointing at the latest flagship per provider
628
+ claude: "claude-opus-4.6",
629
+ opus: "claude-opus-4.6",
630
+ sonnet: "claude-sonnet-4.5",
631
+ haiku: "claude-haiku-4",
632
+ gpt: "gpt-4o",
633
+ "gpt-4o-latest": "gpt-4o",
634
+ gemini: "gemini-2.5-pro",
635
+ llama: "llama-4",
636
+ mistral: "mistral-large",
637
+ deepseek: "deepseek-v3",
638
+ grok: "grok-3"
639
+ };
640
+ function resolveModel(name) {
641
+ const key = name.trim().toLowerCase();
642
+ const aliasTarget = MODEL_ALIASES[key];
643
+ if (aliasTarget) return MODELS[aliasTarget];
644
+ return MODELS[key];
645
+ }
646
+ function listModels() {
647
+ return Object.values(MODELS).sort((a, b) => {
648
+ if (a.provider !== b.provider) return a.provider.localeCompare(b.provider);
649
+ return a.id.localeCompare(b.id);
650
+ });
651
+ }
652
+
432
653
  // src/cli.ts
433
654
  async function readVersion() {
434
655
  try {
@@ -445,8 +666,37 @@ function splitList(value) {
445
666
  if (!value) return [];
446
667
  return value.split(",").map((s) => s.trim().replace(/^\./, "").toLowerCase()).filter((s) => s.length > 0);
447
668
  }
669
+ function printModels(useColor) {
670
+ const rows = listModels();
671
+ const widths = {
672
+ id: Math.max(8, ...rows.map((r) => r.id.length)),
673
+ provider: Math.max(8, ...rows.map((r) => r.provider.length)),
674
+ encoding: Math.max(8, ...rows.map((r) => r.encoding.length))
675
+ };
676
+ const header = `${"MODEL".padEnd(widths.id)} ${"PROVIDER".padEnd(widths.provider)} ${"ENCODING".padEnd(widths.encoding)} ACCURACY`;
677
+ const out = [];
678
+ out.push(useColor ? chalk3.bold(header) : header);
679
+ out.push("-".repeat(header.length));
680
+ for (const m of rows) {
681
+ const accuracy = m.exact ? "exact" : "approx";
682
+ const row = `${m.id.padEnd(widths.id)} ${m.provider.padEnd(widths.provider)} ${m.encoding.padEnd(widths.encoding)} ${accuracy}`;
683
+ out.push(useColor && !m.exact ? chalk3.gray(row) : row);
684
+ }
685
+ out.push("");
686
+ out.push(
687
+ useColor ? chalk3.dim(
688
+ "Aliases: claude, opus, sonnet, haiku, gpt, gemini, llama, mistral, deepseek, grok"
689
+ ) : "Aliases: claude, opus, sonnet, haiku, gpt, gemini, llama, mistral, deepseek, grok"
690
+ );
691
+ process.stdout.write(`${out.join("\n")}
692
+ `);
693
+ }
448
694
  async function main(argv = process.argv.slice(2)) {
449
695
  const version = await readVersion();
696
+ if (argv[0] === "models") {
697
+ printModels(process.stdout.isTTY === true);
698
+ return;
699
+ }
450
700
  const argv0 = cli(
451
701
  {
452
702
  name: "toksize",
@@ -458,6 +708,11 @@ async function main(argv = process.argv.slice(2)) {
458
708
  description: "Output format: tree | json | csv | table",
459
709
  default: "tree"
460
710
  },
711
+ model: {
712
+ type: String,
713
+ description: "Target model (overrides --encoding). Run `toksize models` for the list.",
714
+ default: ""
715
+ },
461
716
  encoding: {
462
717
  type: String,
463
718
  description: "Tokenizer encoding: cl100k_base | o200k_base",
@@ -509,9 +764,10 @@ async function main(argv = process.argv.slice(2)) {
509
764
  examples: [
510
765
  "toksize",
511
766
  "toksize ./src",
767
+ "toksize --model claude-opus-4.6",
768
+ "toksize --model gpt-4o --top 10",
512
769
  "toksize --format json --output report.json",
513
- "toksize --ext ts,tsx --top 10",
514
- "toksize --encoding o200k_base"
770
+ "toksize models"
515
771
  ]
516
772
  }
517
773
  },
@@ -525,19 +781,40 @@ async function main(argv = process.argv.slice(2)) {
525
781
  "BAD_FORMAT"
526
782
  );
527
783
  }
528
- const encodingName = argv0.flags.encoding;
529
- if (!isSupportedEncoding(encodingName)) {
530
- throw new ToksizeError(
531
- `Invalid --encoding "${encodingName}". Use cl100k_base or o200k_base.`,
532
- "BAD_ENCODING"
533
- );
784
+ let encoding;
785
+ let modelId;
786
+ let modelLabel;
787
+ let modelExact;
788
+ if (argv0.flags.model) {
789
+ const info = resolveModel(argv0.flags.model);
790
+ if (!info) {
791
+ throw new ToksizeError(
792
+ `Unknown --model "${argv0.flags.model}". Run \`toksize models\` to see the list.`,
793
+ "BAD_MODEL"
794
+ );
795
+ }
796
+ encoding = info.encoding;
797
+ modelId = info.id;
798
+ modelLabel = info.label;
799
+ modelExact = info.exact;
800
+ } else {
801
+ const encodingName = argv0.flags.encoding;
802
+ if (!isSupportedEncoding(encodingName)) {
803
+ throw new ToksizeError(
804
+ `Invalid --encoding "${encodingName}". Use cl100k_base or o200k_base.`,
805
+ "BAD_ENCODING"
806
+ );
807
+ }
808
+ encoding = encodingName;
534
809
  }
535
- const encoding = encodingName;
536
810
  const rawPath = argv0._.path ?? ".";
537
811
  const useColor = !argv0.flags.noColor && process.stdout.isTTY === true && format !== "json" && format !== "csv";
538
812
  const options = {
539
813
  root: rawPath,
540
814
  encoding,
815
+ modelId,
816
+ modelLabel,
817
+ modelExact,
541
818
  depth: Number.isFinite(argv0.flags.depth) ? argv0.flags.depth : Number.POSITIVE_INFINITY,
542
819
  extensions: splitList(argv0.flags.ext),
543
820
  excludes: argv0.flags.exclude,
package/package.json CHANGED
@@ -1,10 +1,10 @@
1
1
  {
2
2
  "name": "toksize",
3
- "version": "0.1.0",
3
+ "version": "0.2.0",
4
4
  "description": "Know what's eating your context window. Token counter for your codebase.",
5
5
  "type": "module",
6
6
  "bin": {
7
- "toksize": "./dist/cli.js"
7
+ "toksize": "dist/cli.js"
8
8
  },
9
9
  "files": ["dist"],
10
10
  "engines": {