toksize 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. package/README.md +28 -10
  2. package/dist/cli.js +292 -15
  3. package/package.json +2 -2
package/README.md CHANGED
@@ -3,7 +3,6 @@
3
3
  > Know what's eating your context window.
4
4
 
5
5
  [![npm version](https://img.shields.io/npm/v/toksize.svg)](https://www.npmjs.com/package/toksize)
6
- [![CI](https://github.com/toksize/toksize/actions/workflows/ci.yml/badge.svg)](https://github.com/toksize/toksize/actions/workflows/ci.yml)
7
6
  [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](./LICENSE)
8
7
 
9
8
  A CLI that scans a project directory and reports LLM tokens per file, per folder, and for the whole tree. Think `ncdu` but for tokens instead of disk.
@@ -49,14 +48,18 @@ toksize
49
48
  # Scan a specific path
50
49
  toksize ./src
51
50
 
51
+ # Target a specific model (aliases like `claude`, `gpt`, `gemini` work too)
52
+ toksize --model claude-opus-4.6
53
+ toksize --model gpt-4o
54
+
55
+ # See which models are supported
56
+ toksize models
57
+
52
58
  # Only TypeScript files, show top 10
53
59
  toksize --ext ts,tsx --top 10
54
60
 
55
61
  # Export JSON for post-processing
56
62
  toksize --format json --output report.json
57
-
58
- # GPT-4o tokenizer
59
- toksize --encoding o200k_base
60
63
  ```
61
64
 
62
65
  ## Options
@@ -64,6 +67,7 @@ toksize --encoding o200k_base
64
67
  | Flag | Description | Default |
65
68
  |------|-------------|---------|
66
69
  | `--format <fmt>` | `tree`, `json`, `csv`, or `table` | `tree` |
70
+ | `--model <id>` | Target model (overrides `--encoding`). See below. | — |
67
71
  | `--encoding <enc>` | `cl100k_base` or `o200k_base` | `cl100k_base` |
68
72
  | `--ext <list>` | Comma-separated extensions to include | *(all)* |
69
73
  | `--depth <n>` | Max recursion depth | unlimited |
@@ -94,14 +98,28 @@ Layered, applied in order:
94
98
 
95
99
  Use `--show-skipped` to see what was dropped.
96
100
 
97
- ## Encodings
101
+ ## Models
102
+
103
+ Pick your target with `--model`. Run `toksize models` for the full list. Aliases such as `claude`, `opus`, `sonnet`, `haiku`, `gpt`, `gemini`, `llama`, `mistral`, `deepseek`, and `grok` resolve to the latest flagship per provider.
104
+
105
+ | Provider | Models | Accuracy |
106
+ |----------|--------|----------|
107
+ | OpenAI | `gpt-4o`, `gpt-4o-mini`, `o1`, `o1-mini`, `o3`, `o3-mini`, `gpt-4`, `gpt-4-turbo`, `gpt-3.5-turbo` | Exact |
108
+ | Anthropic | `claude-opus-4.6`, `claude-sonnet-4.5`, `claude-haiku-4`, `claude-3.5-sonnet`, `claude-3-opus` | Approx (±10-15%) |
109
+ | Google | `gemini-2.5-pro`, `gemini-2.0-flash`, `gemini-1.5-pro` | Approx (±10-15%) |
110
+ | Meta | `llama-4`, `llama-3.3`, `llama-3.1` | Approx (±10-15%) |
111
+ | Mistral | `mistral-large`, `mistral-small` | Approx (±10-15%) |
112
+ | DeepSeek | `deepseek-v3`, `deepseek-r1` | Approx (±10%) |
113
+ | xAI | `grok-3`, `grok-2` | Approx (±10-15%) |
114
+
115
+ ### Encodings
116
+
117
+ toksize counts locally using [`js-tiktoken`](https://github.com/dqbd/tiktoken) (Wasm, no native build, no network). Two encodings ship:
98
118
 
99
- | Encoding | Models |
100
- |----------|--------|
101
- | `cl100k_base` | GPT-4, GPT-3.5-turbo. Reasonable approximation for Claude too. |
102
- | `o200k_base` | GPT-4o family. |
119
+ - `cl100k_base` — GPT-4 family + closest proxy for most non-OpenAI models.
120
+ - `o200k_base` — GPT-4o, o1, o3. Closer proxy for Gemini.
103
121
 
104
- toksize does not call any API. Counts are computed locally with [`js-tiktoken`](https://github.com/dqbd/tiktoken), which ships a Wasm tokenizer no native build step, no network.
122
+ Non-OpenAI counts are approximations: the tokenizer is not the model's native one, so expect ±10-15% drift depending on content (code compresses better than prose on every tokenizer). Use `--model` to make that explicit in the output.
105
123
 
106
124
  ## Programmatic API
107
125
 
package/dist/cli.js CHANGED
@@ -278,12 +278,15 @@ function renderCsv(root) {
278
278
  }
279
279
 
280
280
  // src/output/json.ts
281
- function renderJson(root, encoding) {
281
+ function renderJson(root, encoding, opts = {}) {
282
282
  const report = {
283
283
  encoding,
284
284
  totalTokens: root.tokens,
285
285
  root
286
286
  };
287
+ if (opts.modelId && opts.modelLabel !== void 0 && opts.modelExact !== void 0) {
288
+ report.model = { id: opts.modelId, label: opts.modelLabel, exact: opts.modelExact };
289
+ }
287
290
  return JSON.stringify(report, null, 2);
288
291
  }
289
292
 
@@ -300,7 +303,8 @@ function renderTable(root, encoding, opts = { useColor: true, topN: 20 }) {
300
303
  const tokenWidth = Math.max(6, ...top.map((f) => fmtNum(f.tokens).length));
301
304
  const pctWidth = 6;
302
305
  const lines = [];
303
- const header = `toksize \u2014 ${fmtNum(root.tokens)} tokens (${encoding})`;
306
+ const suffix = opts.modelLabel ? `${opts.modelLabel}, ${encoding}${opts.modelExact ? "" : " ~approx"}` : encoding;
307
+ const header = `toksize \u2014 ${fmtNum(root.tokens)} tokens (${suffix})`;
304
308
  lines.push(opts.useColor ? chalk.bold(header) : header);
305
309
  lines.push("");
306
310
  const titleRow = `${"PATH".padEnd(pathWidth)} ${"TOKENS".padStart(tokenWidth)} ${"PCT".padStart(pctWidth)}`;
@@ -343,8 +347,13 @@ function renderNode(node, prefix, maxSibling, lines, opts) {
343
347
  }
344
348
  function renderTree(root, encoding, opts = { useColor: true, topN: 5 }) {
345
349
  const lines = [];
346
- const title = `toksize \u2014 ${fmtNum2(root.tokens)} tokens (${encoding})`;
350
+ const suffix = opts.modelLabel ? `${opts.modelLabel}, ${encoding}${opts.modelExact ? "" : " ~approx"}` : encoding;
351
+ const title = `toksize \u2014 ${fmtNum2(root.tokens)} tokens (${suffix})`;
347
352
  lines.push(opts.useColor ? chalk2.bold(title) : title);
353
+ if (opts.modelLabel && opts.modelExact === false) {
354
+ const note = "Approximate count. Non-native tokenizer; expect \xB110-15% drift.";
355
+ lines.push(opts.useColor ? chalk2.dim(note) : note);
356
+ }
348
357
  lines.push("");
349
358
  const maxSibling = root.children.reduce((m, c) => Math.max(m, c.tokens), 0);
350
359
  for (const child of root.children) {
@@ -384,20 +393,25 @@ async function countAll(files, encoding, skipped) {
384
393
  return results;
385
394
  }
386
395
  function render(root, input) {
396
+ const { modelId, modelLabel, modelExact } = input.options;
387
397
  switch (input.format) {
388
398
  case "json":
389
- return renderJson(root, input.options.encoding);
399
+ return renderJson(root, input.options.encoding, { modelId, modelLabel, modelExact });
390
400
  case "csv":
391
401
  return renderCsv(root);
392
402
  case "table":
393
403
  return renderTable(root, input.options.encoding, {
394
404
  useColor: input.useColor,
395
- topN: input.top
405
+ topN: input.top,
406
+ modelLabel,
407
+ modelExact
396
408
  });
397
409
  case "tree":
398
410
  return renderTree(root, input.options.encoding, {
399
411
  useColor: input.useColor,
400
- topN: input.top
412
+ topN: input.top,
413
+ modelLabel,
414
+ modelExact
401
415
  });
402
416
  default: {
403
417
  const exhaustive = input.format;
@@ -429,6 +443,213 @@ async function runScan(input) {
429
443
  return { stdout, skipped, root };
430
444
  }
431
445
 
446
+ // src/core/models.ts
447
+ var MODELS = {
448
+ // OpenAI — exact
449
+ "gpt-4o": {
450
+ id: "gpt-4o",
451
+ label: "GPT-4o",
452
+ provider: "openai",
453
+ encoding: "o200k_base",
454
+ exact: true
455
+ },
456
+ "gpt-4o-mini": {
457
+ id: "gpt-4o-mini",
458
+ label: "GPT-4o mini",
459
+ provider: "openai",
460
+ encoding: "o200k_base",
461
+ exact: true
462
+ },
463
+ o1: { id: "o1", label: "o1", provider: "openai", encoding: "o200k_base", exact: true },
464
+ "o1-mini": {
465
+ id: "o1-mini",
466
+ label: "o1 mini",
467
+ provider: "openai",
468
+ encoding: "o200k_base",
469
+ exact: true
470
+ },
471
+ o3: { id: "o3", label: "o3", provider: "openai", encoding: "o200k_base", exact: true },
472
+ "o3-mini": {
473
+ id: "o3-mini",
474
+ label: "o3 mini",
475
+ provider: "openai",
476
+ encoding: "o200k_base",
477
+ exact: true
478
+ },
479
+ "gpt-4": {
480
+ id: "gpt-4",
481
+ label: "GPT-4",
482
+ provider: "openai",
483
+ encoding: "cl100k_base",
484
+ exact: true
485
+ },
486
+ "gpt-4-turbo": {
487
+ id: "gpt-4-turbo",
488
+ label: "GPT-4 Turbo",
489
+ provider: "openai",
490
+ encoding: "cl100k_base",
491
+ exact: true
492
+ },
493
+ "gpt-3.5-turbo": {
494
+ id: "gpt-3.5-turbo",
495
+ label: "GPT-3.5 Turbo",
496
+ provider: "openai",
497
+ encoding: "cl100k_base",
498
+ exact: true
499
+ },
500
+ // Anthropic — approx
501
+ "claude-opus-4.6": {
502
+ id: "claude-opus-4.6",
503
+ label: "Claude Opus 4.6",
504
+ provider: "anthropic",
505
+ encoding: "cl100k_base",
506
+ exact: false
507
+ },
508
+ "claude-sonnet-4.5": {
509
+ id: "claude-sonnet-4.5",
510
+ label: "Claude Sonnet 4.5",
511
+ provider: "anthropic",
512
+ encoding: "cl100k_base",
513
+ exact: false
514
+ },
515
+ "claude-haiku-4": {
516
+ id: "claude-haiku-4",
517
+ label: "Claude Haiku 4",
518
+ provider: "anthropic",
519
+ encoding: "cl100k_base",
520
+ exact: false
521
+ },
522
+ "claude-3.5-sonnet": {
523
+ id: "claude-3.5-sonnet",
524
+ label: "Claude 3.5 Sonnet",
525
+ provider: "anthropic",
526
+ encoding: "cl100k_base",
527
+ exact: false
528
+ },
529
+ "claude-3-opus": {
530
+ id: "claude-3-opus",
531
+ label: "Claude 3 Opus",
532
+ provider: "anthropic",
533
+ encoding: "cl100k_base",
534
+ exact: false
535
+ },
536
+ // Google — approx
537
+ "gemini-2.5-pro": {
538
+ id: "gemini-2.5-pro",
539
+ label: "Gemini 2.5 Pro",
540
+ provider: "google",
541
+ encoding: "o200k_base",
542
+ exact: false
543
+ },
544
+ "gemini-2.0-flash": {
545
+ id: "gemini-2.0-flash",
546
+ label: "Gemini 2.0 Flash",
547
+ provider: "google",
548
+ encoding: "o200k_base",
549
+ exact: false
550
+ },
551
+ "gemini-1.5-pro": {
552
+ id: "gemini-1.5-pro",
553
+ label: "Gemini 1.5 Pro",
554
+ provider: "google",
555
+ encoding: "o200k_base",
556
+ exact: false
557
+ },
558
+ // Meta — approx
559
+ "llama-4": {
560
+ id: "llama-4",
561
+ label: "Llama 4",
562
+ provider: "meta",
563
+ encoding: "cl100k_base",
564
+ exact: false
565
+ },
566
+ "llama-3.3": {
567
+ id: "llama-3.3",
568
+ label: "Llama 3.3",
569
+ provider: "meta",
570
+ encoding: "cl100k_base",
571
+ exact: false
572
+ },
573
+ "llama-3.1": {
574
+ id: "llama-3.1",
575
+ label: "Llama 3.1",
576
+ provider: "meta",
577
+ encoding: "cl100k_base",
578
+ exact: false
579
+ },
580
+ // Mistral — approx
581
+ "mistral-large": {
582
+ id: "mistral-large",
583
+ label: "Mistral Large",
584
+ provider: "mistral",
585
+ encoding: "cl100k_base",
586
+ exact: false
587
+ },
588
+ "mistral-small": {
589
+ id: "mistral-small",
590
+ label: "Mistral Small",
591
+ provider: "mistral",
592
+ encoding: "cl100k_base",
593
+ exact: false
594
+ },
595
+ // DeepSeek — approx
596
+ "deepseek-v3": {
597
+ id: "deepseek-v3",
598
+ label: "DeepSeek V3",
599
+ provider: "deepseek",
600
+ encoding: "cl100k_base",
601
+ exact: false
602
+ },
603
+ "deepseek-r1": {
604
+ id: "deepseek-r1",
605
+ label: "DeepSeek R1",
606
+ provider: "deepseek",
607
+ encoding: "cl100k_base",
608
+ exact: false
609
+ },
610
+ // xAI — approx
611
+ "grok-3": {
612
+ id: "grok-3",
613
+ label: "Grok 3",
614
+ provider: "xai",
615
+ encoding: "cl100k_base",
616
+ exact: false
617
+ },
618
+ "grok-2": {
619
+ id: "grok-2",
620
+ label: "Grok 2",
621
+ provider: "xai",
622
+ encoding: "cl100k_base",
623
+ exact: false
624
+ }
625
+ };
626
+ var MODEL_ALIASES = {
627
+ // Generic aliases pointing at the latest flagship per provider
628
+ claude: "claude-opus-4.6",
629
+ opus: "claude-opus-4.6",
630
+ sonnet: "claude-sonnet-4.5",
631
+ haiku: "claude-haiku-4",
632
+ gpt: "gpt-4o",
633
+ "gpt-4o-latest": "gpt-4o",
634
+ gemini: "gemini-2.5-pro",
635
+ llama: "llama-4",
636
+ mistral: "mistral-large",
637
+ deepseek: "deepseek-v3",
638
+ grok: "grok-3"
639
+ };
640
+ function resolveModel(name) {
641
+ const key = name.trim().toLowerCase();
642
+ const aliasTarget = MODEL_ALIASES[key];
643
+ if (aliasTarget) return MODELS[aliasTarget];
644
+ return MODELS[key];
645
+ }
646
+ function listModels() {
647
+ return Object.values(MODELS).sort((a, b) => {
648
+ if (a.provider !== b.provider) return a.provider.localeCompare(b.provider);
649
+ return a.id.localeCompare(b.id);
650
+ });
651
+ }
652
+
432
653
  // src/cli.ts
433
654
  async function readVersion() {
434
655
  try {
@@ -445,8 +666,37 @@ function splitList(value) {
445
666
  if (!value) return [];
446
667
  return value.split(",").map((s) => s.trim().replace(/^\./, "").toLowerCase()).filter((s) => s.length > 0);
447
668
  }
669
+ function printModels(useColor) {
670
+ const rows = listModels();
671
+ const widths = {
672
+ id: Math.max(8, ...rows.map((r) => r.id.length)),
673
+ provider: Math.max(8, ...rows.map((r) => r.provider.length)),
674
+ encoding: Math.max(8, ...rows.map((r) => r.encoding.length))
675
+ };
676
+ const header = `${"MODEL".padEnd(widths.id)} ${"PROVIDER".padEnd(widths.provider)} ${"ENCODING".padEnd(widths.encoding)} ACCURACY`;
677
+ const out = [];
678
+ out.push(useColor ? chalk3.bold(header) : header);
679
+ out.push("-".repeat(header.length));
680
+ for (const m of rows) {
681
+ const accuracy = m.exact ? "exact" : "approx";
682
+ const row = `${m.id.padEnd(widths.id)} ${m.provider.padEnd(widths.provider)} ${m.encoding.padEnd(widths.encoding)} ${accuracy}`;
683
+ out.push(useColor && !m.exact ? chalk3.gray(row) : row);
684
+ }
685
+ out.push("");
686
+ out.push(
687
+ useColor ? chalk3.dim(
688
+ "Aliases: claude, opus, sonnet, haiku, gpt, gemini, llama, mistral, deepseek, grok"
689
+ ) : "Aliases: claude, opus, sonnet, haiku, gpt, gemini, llama, mistral, deepseek, grok"
690
+ );
691
+ process.stdout.write(`${out.join("\n")}
692
+ `);
693
+ }
448
694
  async function main(argv = process.argv.slice(2)) {
449
695
  const version = await readVersion();
696
+ if (argv[0] === "models") {
697
+ printModels(process.stdout.isTTY === true);
698
+ return;
699
+ }
450
700
  const argv0 = cli(
451
701
  {
452
702
  name: "toksize",
@@ -458,6 +708,11 @@ async function main(argv = process.argv.slice(2)) {
458
708
  description: "Output format: tree | json | csv | table",
459
709
  default: "tree"
460
710
  },
711
+ model: {
712
+ type: String,
713
+ description: "Target model (overrides --encoding). Run `toksize models` for the list.",
714
+ default: ""
715
+ },
461
716
  encoding: {
462
717
  type: String,
463
718
  description: "Tokenizer encoding: cl100k_base | o200k_base",
@@ -509,9 +764,10 @@ async function main(argv = process.argv.slice(2)) {
509
764
  examples: [
510
765
  "toksize",
511
766
  "toksize ./src",
767
+ "toksize --model claude-opus-4.6",
768
+ "toksize --model gpt-4o --top 10",
512
769
  "toksize --format json --output report.json",
513
- "toksize --ext ts,tsx --top 10",
514
- "toksize --encoding o200k_base"
770
+ "toksize models"
515
771
  ]
516
772
  }
517
773
  },
@@ -525,19 +781,40 @@ async function main(argv = process.argv.slice(2)) {
525
781
  "BAD_FORMAT"
526
782
  );
527
783
  }
528
- const encodingName = argv0.flags.encoding;
529
- if (!isSupportedEncoding(encodingName)) {
530
- throw new ToksizeError(
531
- `Invalid --encoding "${encodingName}". Use cl100k_base or o200k_base.`,
532
- "BAD_ENCODING"
533
- );
784
+ let encoding;
785
+ let modelId;
786
+ let modelLabel;
787
+ let modelExact;
788
+ if (argv0.flags.model) {
789
+ const info = resolveModel(argv0.flags.model);
790
+ if (!info) {
791
+ throw new ToksizeError(
792
+ `Unknown --model "${argv0.flags.model}". Run \`toksize models\` to see the list.`,
793
+ "BAD_MODEL"
794
+ );
795
+ }
796
+ encoding = info.encoding;
797
+ modelId = info.id;
798
+ modelLabel = info.label;
799
+ modelExact = info.exact;
800
+ } else {
801
+ const encodingName = argv0.flags.encoding;
802
+ if (!isSupportedEncoding(encodingName)) {
803
+ throw new ToksizeError(
804
+ `Invalid --encoding "${encodingName}". Use cl100k_base or o200k_base.`,
805
+ "BAD_ENCODING"
806
+ );
807
+ }
808
+ encoding = encodingName;
534
809
  }
535
- const encoding = encodingName;
536
810
  const rawPath = argv0._.path ?? ".";
537
811
  const useColor = !argv0.flags.noColor && process.stdout.isTTY === true && format !== "json" && format !== "csv";
538
812
  const options = {
539
813
  root: rawPath,
540
814
  encoding,
815
+ modelId,
816
+ modelLabel,
817
+ modelExact,
541
818
  depth: Number.isFinite(argv0.flags.depth) ? argv0.flags.depth : Number.POSITIVE_INFINITY,
542
819
  extensions: splitList(argv0.flags.ext),
543
820
  excludes: argv0.flags.exclude,
package/package.json CHANGED
@@ -1,10 +1,10 @@
1
1
  {
2
2
  "name": "toksize",
3
- "version": "0.1.0",
3
+ "version": "0.2.0",
4
4
  "description": "Know what's eating your context window. Token counter for your codebase.",
5
5
  "type": "module",
6
6
  "bin": {
7
- "toksize": "./dist/cli.js"
7
+ "toksize": "dist/cli.js"
8
8
  },
9
9
  "files": ["dist"],
10
10
  "engines": {