tokstat 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +84 -0
- package/dist/index.js +921 -0
- package/package.json +50 -0
package/README.md
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
# tokstat
|
|
2
|
+
|
|
3
|
+
A beautiful, interactive explorer for the token economics of your LLM-generated JSON.
|
|
4
|
+
|
|
5
|
+
Point it at a corpus of structured outputs and see exactly where your tokens — and dollars — are going. Treemap, sunburst, circle pack, or icicle chart. Click into any field to drill down. Animated transitions between views. Every interaction designed to make schema auditing something you want to do, not something you have to.
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
npx tokstat ./data/**/*.json
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Bakeoff
|
|
12
|
+
|
|
13
|
+
This project is currently a bakeoff between two AI coding agents building the same tool from the same spec:
|
|
14
|
+
|
|
15
|
+
| Branch | Agent | Model |
|
|
16
|
+
|--------|-------|-------|
|
|
17
|
+
| [`claude/tokstat`](https://github.com/TomNeyland/tokstat/tree/claude/tokstat) | Claude Code | Opus 4.6 |
|
|
18
|
+
| [`codex/tokstat`](https://github.com/TomNeyland/tokstat/tree/codex/tokstat) | Codex | GPT 5.3 (high) |
|
|
19
|
+
|
|
20
|
+
Both start from the same design system, specs, and fixtures. Same requirements, different implementations.
|
|
21
|
+
|
|
22
|
+
**Claude Code (Opus 4.6):**
|
|
23
|
+
|
|
24
|
+

|
|
25
|
+
|
|
26
|
+
**Codex (GPT 5.3 high):**
|
|
27
|
+
|
|
28
|
+

|
|
29
|
+
|
|
30
|
+
**Live demo:** [tomneyland.github.io/tokstat](https://tomneyland.github.io/tokstat/)
|
|
31
|
+
|
|
32
|
+
## What it does
|
|
33
|
+
|
|
34
|
+
You're running structured generation (OpenAI, Anthropic, Gemini) with JSON schemas. You're spending hundreds or thousands of dollars per run. You have no idea which fields cost what, which optional fields are rarely populated, or whether your 200-character field names are bleeding you dry.
|
|
35
|
+
|
|
36
|
+
tokstat reads your generated JSON files, walks the schema hierarchy, tokenizes every field name and value, and gives you an interactive visualization of where the weight is.
|
|
37
|
+
|
|
38
|
+
**Per-field analytics:**
|
|
39
|
+
- Token count: avg, min, max, p50, p95 across your corpus
|
|
40
|
+
- Fill rate for optional/nullable fields
|
|
41
|
+
- Schema overhead (field names, braces, brackets, colons, commas) vs. value payload
|
|
42
|
+
- Estimated cost per field per model/provider
|
|
43
|
+
|
|
44
|
+
**Multiple visualization modes:**
|
|
45
|
+
- **Treemap** — where is the money going (relative area = relative cost)
|
|
46
|
+
- **Sunburst** — drill into nested structure radially
|
|
47
|
+
- **Circle pack** — spot outliers and clustering
|
|
48
|
+
- **Icicle** — linear depth exploration
|
|
49
|
+
|
|
50
|
+
Animated transitions between views. Click any node to zoom into that subtree.
|
|
51
|
+
|
|
52
|
+
## Install
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
npm install -g tokstat
|
|
56
|
+
# or
|
|
57
|
+
npx tokstat ./path/to/files/**/*.json
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## Usage
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
# Point at JSON files
|
|
64
|
+
tokstat ./outputs/**/*.json
|
|
65
|
+
|
|
66
|
+
# Specify model for cost estimation
|
|
67
|
+
tokstat ./outputs/**/*.json --model gpt-4o
|
|
68
|
+
|
|
69
|
+
# JSON output for LLM consumption / CI pipelines
|
|
70
|
+
tokstat ./outputs/**/*.json --format json
|
|
71
|
+
|
|
72
|
+
# LLM-optimized context output
|
|
73
|
+
tokstat ./outputs/**/*.json --format llm
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## Stack
|
|
77
|
+
|
|
78
|
+
- **Analysis**: Node/Bun — file walking, schema inference, tokenization (js-tiktoken)
|
|
79
|
+
- **Visualization**: Svelte 5 + D3 layouts — reactive rendering with hand-crafted SVG
|
|
80
|
+
- **Distribution**: npm package, opens local browser. No Electron, no desktop wrapper.
|
|
81
|
+
|
|
82
|
+
## License
|
|
83
|
+
|
|
84
|
+
MIT
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,921 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
// src/cli/index.ts
|
|
4
|
+
import { Command } from "commander";
|
|
5
|
+
import { writeFileSync } from "fs";
|
|
6
|
+
import open2 from "open";
|
|
7
|
+
|
|
8
|
+
// src/engine/pipeline.ts
|
|
9
|
+
import fg from "fast-glob";
|
|
10
|
+
import { readFileSync } from "fs";
|
|
11
|
+
import { resolve } from "path";
|
|
12
|
+
|
|
13
|
+
// src/engine/schemaInference.ts
|
|
14
|
+
/**
 * Classifies a parsed JSON value as one of the JSON type names.
 * @param {*} value - Any value produced by JSON.parse.
 * @returns {"null"|"array"|"string"|"number"|"boolean"|"object"} the JSON type.
 * @throws {Error} for values JSON.parse never produces (e.g. undefined, functions).
 */
function getJsonType(value) {
  if (value === null) return "null";
  if (Array.isArray(value)) return "array";
  const t = typeof value;
  switch (t) {
    case "string":
      return "string";
    case "number":
      return "number";
    case "boolean":
      return "boolean";
    case "object":
      return "object";
    default:
      throw new Error(`Unexpected JSON value type: ${t}`);
  }
}

/**
 * Creates an empty schema node for the inference tree.
 * @param {string} name - Field name ("root" for the document root, "[]" for array items).
 * @param {string} path - Dotted path from the root, e.g. "root.user.tags[]".
 * @param {number} depth - Nesting depth (root = 0).
 * @returns {object} a fresh node with zeroed counters and no children.
 */
function createNode(name, path, depth) {
  const node = {
    name,
    path,
    depth,
    type: "null", // refined later by resolveTypes
    observed_types: new Set(),
    instance_count: 0, // how many times this position was encountered (incl. nulls)
    present_count: 0, // how many times it held a non-null value
    array_item_counts: [], // per-occurrence item counts when this node is an array
    children: new Map()
  };
  return node;
}

/**
 * Folds a single observed value into a schema node, recursing into
 * object properties and array items. Null observations only record the
 * "null" type; they do not create or descend into children.
 */
function mergeValue(node, value) {
  node.instance_count++;
  if (value === null) {
    node.observed_types.add("null");
    return;
  }
  node.present_count++;
  const kind = getJsonType(value);
  node.observed_types.add(kind);
  if (kind === "object") {
    for (const [key, childValue] of Object.entries(value)) {
      let child = node.children.get(key);
      if (!child) {
        child = createNode(key, `${node.path}.${key}`, node.depth + 1);
        node.children.set(key, child);
      }
      mergeValue(child, childValue);
    }
  } else if (kind === "array") {
    node.array_item_counts.push(value.length);
    // All items of an array are merged into a single synthetic "[]" child.
    let itemNode = node.children.get("[]");
    if (!itemNode) {
      itemNode = createNode("[]", `${node.path}[]`, node.depth + 1);
      node.children.set("[]", itemNode);
    }
    value.forEach((item) => mergeValue(itemNode, item));
  }
}

/**
 * Picks each node's final type: the first non-null type observed
 * (Set preserves insertion order), falling back to "null" when only
 * nulls were ever seen.
 */
function resolveTypes(node) {
  const nonNull = [...node.observed_types].filter((t) => t !== "null");
  if (nonNull.length > 0) {
    node.type = nonNull[0];
  } else if (node.observed_types.has("null")) {
    node.type = "null";
  }
  node.children.forEach((child) => resolveTypes(child));
}

/**
 * Raises each object child's instance_count to its parent's present_count
 * so that keys absent from some documents still count those documents as
 * instances (fill_rate then reflects absence as well as explicit nulls).
 * Array children are recursed into without adjustment: item counts
 * legitimately differ from the parent's occurrence count.
 */
function fixInstanceCounts(node) {
  if (node.type === "object") {
    for (const child of node.children.values()) {
      child.instance_count = Math.max(child.instance_count, node.present_count);
      fixInstanceCounts(child);
    }
  } else if (node.type === "array") {
    node.children.forEach((child) => fixInstanceCounts(child));
  }
}

/**
 * Infers a merged schema tree from a corpus of parsed JSON documents.
 * @param {Array<*>} documents - Parsed JSON documents.
 * @returns {object} the root schema node.
 */
function inferSchema(documents) {
  const root = createNode("root", "root", 0);
  documents.forEach((doc) => mergeValue(root, doc));
  resolveTypes(root);
  fixInstanceCounts(root);
  return root;
}
|
|
102
|
+
|
|
103
|
+
// src/engine/tokenization.ts
|
|
104
|
+
import { getEncoding as getTiktokenEncoding } from "js-tiktoken";
|
|
105
|
+
// Serializes `value` to compact JSON while building a parallel per-character
// attribution map: for every character of the output, which schema path
// produced it and what role it plays. Categories:
//   "structural" - braces, brackets, commas (charged to the container's path)
//   "key"        - a quoted field name plus its colon (charged to the child's path)
//   "value"      - a serialized primitive payload
//   "null_value" - the literal `null` for an explicitly-null field
// The emitted string matches JSON.stringify(value) (no whitespace), so character
// offsets line up 1:1 with `map` entries when the same string is tokenized.
function buildOffsetMap(value, node) {
  const chars = [];
  const map = [];
  // Appends `s` one character at a time so each character gets its own map entry.
  function emit(s, path, category) {
    for (const ch of s) {
      chars.push(ch);
      map.push({ path, category });
    }
  }
  function walk(val, schemaNode) {
    if (val === null) {
      emit("null", schemaNode.path, "null_value");
      return;
    }
    if (Array.isArray(val)) {
      emit("[", schemaNode.path, "structural");
      // All array items share the synthetic "[]" child of the schema node.
      const itemNode = schemaNode.children.get("[]");
      for (let i = 0; i < val.length; i++) {
        if (i > 0) emit(",", schemaNode.path, "structural");
        walk(val[i], itemNode);
      }
      emit("]", schemaNode.path, "structural");
      return;
    }
    if (typeof val === "object") {
      emit("{", schemaNode.path, "structural");
      const keys = Object.keys(val);
      for (let i = 0; i < keys.length; i++) {
        const key = keys[i];
        if (i > 0) emit(",", schemaNode.path, "structural");
        // NOTE(review): the schema is inferred from this same corpus, so every
        // key is expected to have a child node; there is no guard if it doesn't.
        const childNode = schemaNode.children.get(key);
        // The key text and its colon count as "key" overhead on the child path.
        emit(JSON.stringify(key), childNode.path, "key");
        emit(":", childNode.path, "key");
        const childValue = val[key];
        walk(childValue, childNode);
      }
      emit("}", schemaNode.path, "structural");
      return;
    }
    // Primitive (string/number/boolean): serialize and attribute as payload.
    const serialized = JSON.stringify(val);
    emit(serialized, schemaNode.path, "value");
  }
  walk(value, node);
  return { json: chars.join(""), map };
}
// Tokenizes one parsed document and attributes each token to a schema path and
// token category by majority vote over the characters the token spans.
// Assumes decoding each token id reproduces exactly the characters it was
// encoded from, so `charPos` stays aligned with the offset map — presumably
// true for tiktoken encodings on JSON text; TODO confirm for exotic Unicode.
// Returns Map<path, {total, schema_overhead, value_payload, null_waste}>
// of per-path token counts for this single document.
function tokenizeFile(parsed, schema, encoding) {
  const { json, map } = buildOffsetMap(parsed, schema);
  const tokens = encoding.encode(json);
  const accumulators = /* @__PURE__ */ new Map();
  // Lazily creates the per-path counter record.
  function ensureAccumulator(path) {
    let acc = accumulators.get(path);
    if (!acc) {
      acc = { schema_overhead: 0, value_payload: 0, null_waste: 0 };
      accumulators.set(path, acc);
    }
    return acc;
  }
  let charPos = 0;
  for (const tokenId of tokens) {
    const tokenText = encoding.decode([tokenId]);
    const tokenLen = tokenText.length;
    // A token may straddle a key/value or value/structural boundary; each
    // (path, category) pair "votes" once per character the token covers.
    const votes = /* @__PURE__ */ new Map();
    for (let i = 0; i < tokenLen && charPos + i < map.length; i++) {
      const entry = map[charPos + i];
      const key = `${entry.path}|${entry.category}`;
      const existing = votes.get(key);
      if (existing) {
        existing.count++;
      } else {
        votes.set(key, { path: entry.path, category: entry.category, count: 1 });
      }
    }
    // The pair covering the most characters wins the whole token
    // (ties resolve to the first-seen pair, i.e. the earliest characters).
    let winner = null;
    let maxCount = 0;
    for (const vote of votes.values()) {
      if (vote.count > maxCount) {
        maxCount = vote.count;
        winner = { path: vote.path, category: vote.category };
      }
    }
    if (winner) {
      const acc = ensureAccumulator(winner.path);
      // Keys and punctuation are schema overhead; nulls are tracked separately
      // so the "null tax" insight can price them; everything else is payload.
      if (winner.category === "key" || winner.category === "structural") {
        acc.schema_overhead++;
      } else if (winner.category === "null_value") {
        acc.null_waste++;
      } else {
        acc.value_payload++;
      }
    }
    charPos += tokenLen;
  }
  // Materialize totals alongside the per-category counts.
  const result = /* @__PURE__ */ new Map();
  for (const [path, acc] of accumulators) {
    result.set(path, {
      total: acc.schema_overhead + acc.value_payload + acc.null_waste,
      schema_overhead: acc.schema_overhead,
      value_payload: acc.value_payload,
      null_waste: acc.null_waste
    });
  }
  return result;
}
// Single-entry module-level cache: loading a tiktoken encoding is expensive,
// and a run only ever uses one tokenizer at a time.
var cachedEncoding = null;
var cachedEncodingName = null;
// Returns the (cached) js-tiktoken encoding for the given tokenizer name,
// e.g. "o200k_base".
function getEncoding(tokenizer) {
  if (cachedEncoding && cachedEncodingName === tokenizer) {
    return cachedEncoding;
  }
  cachedEncoding = getTiktokenEncoding(tokenizer);
  cachedEncodingName = tokenizer;
  return cachedEncoding;
}
// Collects the raw primitive leaf values of one document, grouped by schema
// path (Map<path, value[]>). Nulls are skipped; objects and arrays are
// recursed into rather than recorded. Keys absent from the schema are ignored.
function collectValues(parsed, schema) {
  const result = /* @__PURE__ */ new Map();
  // Lazily creates the per-path value bucket.
  function ensure(path) {
    let arr = result.get(path);
    if (!arr) {
      arr = [];
      result.set(path, arr);
    }
    return arr;
  }
  function walk(val, node) {
    if (val === null) return;
    if (Array.isArray(val)) {
      const itemNode = node.children.get("[]");
      for (const item of val) {
        walk(item, itemNode);
      }
      return;
    }
    if (typeof val === "object") {
      const obj = val;
      for (const [key, childVal] of Object.entries(obj)) {
        const childNode = node.children.get(key);
        if (childNode) {
          walk(childVal, childNode);
        }
      }
      return;
    }
    ensure(node.path).push(val);
  }
  walk(parsed, schema);
  return result;
}
|
|
252
|
+
|
|
253
|
+
// src/engine/aggregation.ts
|
|
254
|
+
/**
 * Computes summary statistics over a list of numbers.
 * @param {number[]} values - Samples (one per file, typically).
 * @returns {{avg:number,min:number,max:number,p50:number,p95:number}}
 *          All-zero stats for an empty input.
 */
function computeStats(values) {
  if (values.length === 0) {
    return { avg: 0, min: 0, max: 0, p50: 0, p95: 0 };
  }
  const sorted = [...values].sort((a, b) => a - b); // copy: sort mutates
  const sum = sorted.reduce((a, b) => a + b, 0);
  return {
    avg: sum / sorted.length,
    min: sorted[0],
    max: sorted[sorted.length - 1],
    p50: percentile(sorted, 50),
    p95: percentile(sorted, 95)
  };
}
/**
 * Linear-interpolated percentile over an ascending pre-sorted array.
 * @param {number[]} sorted - Non-empty ascending array.
 * @param {number} p - Percentile in [0, 100].
 * @returns {number}
 */
function percentile(sorted, p) {
  if (sorted.length === 1) return sorted[0];
  const idx = p / 100 * (sorted.length - 1);
  const lower = Math.floor(idx);
  const upper = Math.ceil(idx);
  if (lower === upper) return sorted[lower];
  return sorted[lower] + (sorted[upper] - sorted[lower]) * (idx - lower);
}
/**
 * Merges per-file value collections into a single Map<path, value[]> and
 * aggregates the schema tree into the final analysis tree.
 * @param {object} schema - Root schema node from inferSchema.
 * @param {Map[]} perFileTokens - One token map per file (from tokenizeFile).
 * @param {Map[]} perFileValues - One value map per file (from collectValues).
 * @param {number} sampleCount - Max example values retained per node.
 * @returns {object} aggregated analysis tree.
 */
function aggregate(schema, perFileTokens, perFileValues, sampleCount) {
  const allValues = /* @__PURE__ */ new Map();
  for (const fileValues of perFileValues) {
    for (const [path, values] of fileValues) {
      let arr = allValues.get(path);
      if (!arr) {
        arr = [];
        allValues.set(path, arr);
      }
      // Bug fix: `arr.push(...values)` passes every element as a call
      // argument, which throws a RangeError once a single path accumulates
      // more values than the engine's argument limit (~65k-125k in V8).
      // Append element-by-element instead; behavior is otherwise identical.
      for (const v of values) {
        arr.push(v);
      }
    }
  }
  return aggregateNode(schema, perFileTokens, allValues, sampleCount);
}
/**
 * Recursively builds one node of the analysis tree: per-file subtree token
 * stats, fill rate, array/string stats, sampled examples, and children.
 * Cost fields are zeroed here and filled in by the cost calculation stage.
 */
function aggregateNode(schema, perFileTokens, allValues, sampleCount) {
  const path = schema.path;
  // Subtree token counts, one sample per file, split by category.
  const fileTotals = [];
  const fileOverheads = [];
  const filePayloads = [];
  const fileNullWastes = [];
  for (const fileTokenMap of perFileTokens) {
    const subtreeTokens = sumSubtree(schema, fileTokenMap);
    fileTotals.push(subtreeTokens.total);
    fileOverheads.push(subtreeTokens.schema_overhead);
    filePayloads.push(subtreeTokens.value_payload);
    fileNullWastes.push(subtreeTokens.null_waste);
  }
  const totalStats = computeStats(fileTotals);
  const avgOverhead = avg(fileOverheads);
  const avgPayload = avg(filePayloads);
  const avgNullWaste = avg(fileNullWastes);
  // Fraction of instances where the field held a non-null value
  // (absent keys count as unfilled, thanks to fixInstanceCounts).
  const fillRate = schema.instance_count > 0 ? schema.present_count / schema.instance_count : 0;
  let arrayStats = null;
  if (schema.type === "array" && schema.array_item_counts.length > 0) {
    const counts = schema.array_item_counts;
    const sorted = [...counts].sort((a, b) => a - b);
    arrayStats = {
      avg_items: avg(counts),
      min_items: sorted[0],
      max_items: sorted[sorted.length - 1],
      p95_items: percentile(sorted, 95)
    };
  }
  let stringStats = null;
  const values = allValues.get(path);
  if (schema.type === "string" && values && values.length > 0) {
    // Mixed-type fields may carry non-strings at this path; ignore those.
    const stringValues = values.filter((v) => typeof v === "string");
    if (stringValues.length > 0) {
      const uniqueSet = new Set(stringValues);
      const totalLength = stringValues.reduce((sum, s) => sum + s.length, 0);
      stringStats = {
        avg_length: totalLength / stringValues.length,
        value_diversity: uniqueSet.size / stringValues.length,
        unique_count: uniqueSet.size
      };
    }
  }
  const examples = sampleValues(values, sampleCount);
  const children = [];
  for (const childSchema of schema.children.values()) {
    children.push(aggregateNode(childSchema, perFileTokens, allValues, sampleCount));
  }
  return {
    name: schema.name,
    path: schema.path,
    depth: schema.depth,
    type: schema.type,
    tokens: {
      total: totalStats,
      schema_overhead: avgOverhead,
      value_payload: avgPayload,
      null_waste: avgNullWaste
    },
    fill_rate: fillRate,
    instance_count: schema.instance_count,
    array_stats: arrayStats,
    string_stats: stringStats,
    examples,
    children,
    cost: {
      per_instance: 0,
      // filled in by cost calculation stage
      total_corpus: 0
    }
  };
}
/**
 * Sums one file's token counts over a schema node and all descendants.
 * Paths missing from the file's map contribute zero.
 */
function sumSubtree(schema, fileTokenMap) {
  const self = fileTokenMap.get(schema.path);
  let total = self ? self.total : 0;
  let schema_overhead = self ? self.schema_overhead : 0;
  let value_payload = self ? self.value_payload : 0;
  let null_waste = self ? self.null_waste : 0;
  for (const child of schema.children.values()) {
    const childTokens = sumSubtree(child, fileTokenMap);
    total += childTokens.total;
    schema_overhead += childTokens.schema_overhead;
    value_payload += childTokens.value_payload;
    null_waste += childTokens.null_waste;
  }
  return { total, schema_overhead, value_payload, null_waste };
}
/**
 * Arithmetic mean; 0 for an empty list.
 */
function avg(values) {
  if (values.length === 0) return 0;
  return values.reduce((a, b) => a + b, 0) / values.length;
}
/**
 * Uniform random sample of up to maxCount values (reservoir sampling,
 * Algorithm R). Returns a copy; never mutates the input. Non-deterministic
 * when values.length > maxCount (uses Math.random).
 */
function sampleValues(values, maxCount) {
  if (!values || values.length === 0) return [];
  if (values.length <= maxCount) return [...values];
  const reservoir = values.slice(0, maxCount);
  for (let i = maxCount; i < values.length; i++) {
    const j = Math.floor(Math.random() * (i + 1));
    if (j < maxCount) {
      reservoir[j] = values[i];
    }
  }
  return reservoir;
}
|
|
393
|
+
|
|
394
|
+
// src/engine/insights.ts
|
|
395
|
+
// Runs every insight detector over each node of the aggregated tree and
// returns the findings sorted by estimated token savings, largest first.
// `pricePerToken` is the model's output price in USD per single token.
function detectInsights(tree, pricePerToken) {
  const insights = [];
  walkTree(tree, (node) => {
    detectNullTax(node, pricePerToken, insights);
    detectHollowObject(node, pricePerToken, insights);
    detectArrayRepetitionTax(node, pricePerToken, insights);
    detectBoilerplate(node, pricePerToken, insights);
    detectLengthVariance(node, pricePerToken, insights);
  });
  insights.sort((a, b) => b.savings_tokens - a.savings_tokens);
  return insights;
}
// Pre-order depth-first traversal of an aggregated tree, calling fn on
// every node.
function walkTree(node, fn) {
  fn(node);
  for (const child of node.children) {
    walkTree(child, fn);
  }
}
// Flags fields that are null (or absent) in more than half their instances:
// each empty instance still pays for the key, punctuation, and `null` literal.
function detectNullTax(node, pricePerToken, insights) {
  if (node.fill_rate >= 0.5) return;
  if (node.instance_count === 0) return;
  // Root and array-item nodes can't be made optional; skip them.
  if (node.name === "root" || node.name === "[]") return;
  const nullPct = Math.round((1 - node.fill_rate) * 100);
  // Overhead saved only on the empty fraction, plus all null-literal tokens.
  const savingsPerInstance = node.tokens.schema_overhead * (1 - node.fill_rate) + node.tokens.null_waste;
  if (savingsPerInstance < 1) return; // below one token/instance: not worth reporting
  const savingsUsdPer10k = savingsPerInstance * pricePerToken * 1e4;
  const severity = savingsPerInstance > 20 ? "high" : savingsPerInstance > 5 ? "medium" : "low";
  insights.push({
    type: "null_tax",
    path: node.path,
    severity,
    message: `${node.name} is null ${nullPct}% of the time. Making it optional saves ${Math.round(savingsPerInstance)} tok/instance.`,
    detail: `This field exists in the schema but is null in ${nullPct}% of instances. Each null instance still costs ${node.tokens.schema_overhead.toFixed(1)} tokens in structural overhead plus ${node.tokens.null_waste.toFixed(1)} tokens for the null literal. Making it optional would eliminate these costs when the field has no value.`,
    savings_tokens: savingsPerInstance,
    savings_usd_per_10k: savingsUsdPer10k
  });
}
// Flags objects whose token cost is dominated (>70%) by structure — field
// names, braces, colons, commas — rather than actual values.
function detectHollowObject(node, pricePerToken, insights) {
  if (node.type !== "object") return;
  if (node.tokens.total.avg < 5) return; // too small to matter
  const overheadRatio = node.tokens.schema_overhead / node.tokens.total.avg;
  if (overheadRatio <= 0.7) return;
  const overheadPct = Math.round(overheadRatio * 100);
  const overheadTokens = Math.round(node.tokens.schema_overhead);
  const totalTokens = Math.round(node.tokens.total.avg);
  // Heuristic: assume flattening could recover ~30% of the overhead.
  const savingsPerInstance = node.tokens.schema_overhead * 0.3;
  const severity = overheadRatio > 0.85 ? "high" : overheadRatio > 0.75 ? "medium" : "low";
  insights.push({
    type: "hollow_object",
    path: node.path,
    severity,
    message: `${node.name} is ${overheadPct}% structural overhead. ${overheadTokens} of ${totalTokens} tokens are field names and braces.`,
    detail: `This object's structural elements (field names, braces, colons, commas) consume ${overheadPct}% of its total token cost. The actual value payload is only ${totalTokens - overheadTokens} tokens. Consider flattening or restructuring to reduce overhead.`,
    savings_tokens: savingsPerInstance,
    savings_usd_per_10k: savingsPerInstance * pricePerToken * 1e4
  });
}
// Flags arrays of objects where the same field names are re-serialized for
// every item; a header+rows layout would pay for the keys only once.
function detectArrayRepetitionTax(node, pricePerToken, insights) {
  if (node.type !== "array") return;
  if (!node.array_stats) return;
  if (node.array_stats.avg_items <= 1) return; // no repetition with <=1 item
  const itemChildren = node.children.filter((c) => c.name === "[]");
  if (itemChildren.length === 0) return;
  const itemNode = itemChildren[0];
  // Average key/punctuation cost of a single item, summed over its fields.
  const perItemKeyCost = itemNode.children.reduce(
    (sum, child) => sum + child.tokens.schema_overhead,
    0
  );
  if (perItemKeyCost < 1) return;
  // Every item after the first repeats the key set.
  const repetitionTax = perItemKeyCost * (node.array_stats.avg_items - 1);
  const avgItems = node.array_stats.avg_items;
  const severity = repetitionTax > 50 ? "high" : repetitionTax > 15 ? "medium" : "low";
  insights.push({
    type: "array_repetition_tax",
    path: node.path,
    severity,
    message: `Field names in ${node.name} repeat ${avgItems.toFixed(1)}x per instance, costing ${Math.round(repetitionTax)} tokens in repetition.`,
    detail: `Each item in this array repeats ${itemNode.children.length} field names, costing ~${perItemKeyCost.toFixed(1)} tokens per item. With an average of ${avgItems.toFixed(1)} items, the first item's field names are repeated ${(avgItems - 1).toFixed(1)} additional times. A header+values format would eliminate this repetition.`,
    savings_tokens: repetitionTax,
    savings_usd_per_10k: repetitionTax * pricePerToken * 1e4
  });
}
// Flags well-populated string fields with very low value diversity (<10%
// unique) — near-constant values that an enum or code could replace.
function detectBoilerplate(node, pricePerToken, insights) {
  if (node.type !== "string") return;
  if (!node.string_stats) return;
  if (node.fill_rate <= 0.5) return; // mostly-empty fields are the null-tax insight's job
  if (node.string_stats.value_diversity >= 0.1) return;
  const uniqueCount = node.string_stats.unique_count;
  const totalInstances = node.instance_count;
  // Heuristic: an enum could recover ~70% of the payload tokens.
  const savingsPerInstance = node.tokens.value_payload * 0.7;
  const severity = savingsPerInstance > 10 ? "high" : savingsPerInstance > 3 ? "medium" : "low";
  insights.push({
    type: "boilerplate",
    path: node.path,
    severity,
    message: `${node.name} has ${uniqueCount} unique values across ${totalInstances} instances. Consider replacing with an enum.`,
    detail: `This string field has very low value diversity (${(node.string_stats.value_diversity * 100).toFixed(1)}%). Only ${uniqueCount} distinct values appear across ${totalInstances} instances. The repetitive content costs ~${node.tokens.value_payload.toFixed(1)} tokens per instance. Replacing with an enum or shorter values would significantly reduce cost.`,
    savings_tokens: savingsPerInstance,
    savings_usd_per_10k: savingsPerInstance * pricePerToken * 1e4
  });
}
// Flags string fields whose p95 token cost exceeds 5x their median — long-tail
// outliers that length guidance in the schema description could rein in.
function detectLengthVariance(node, pricePerToken, insights) {
  if (node.type !== "string") return;
  if (node.tokens.total.p50 === 0) return; // avoid division by zero
  const ratio = node.tokens.total.p95 / node.tokens.total.p50;
  if (ratio <= 5) return;
  const p50 = Math.round(node.tokens.total.p50);
  const p95 = Math.round(node.tokens.total.p95);
  // Heuristic: only ~5% of instances sit in the tail, so prorate the spread.
  const savingsPerInstance = (node.tokens.total.p95 - node.tokens.total.p50) * 0.05;
  const severity = ratio > 20 ? "high" : ratio > 10 ? "medium" : "low";
  insights.push({
    type: "length_variance",
    path: node.path,
    severity,
    message: `${node.name} length varies ${ratio.toFixed(0)}x (p50: ${p50} tok, p95: ${p95} tok). Consider adding length guidance.`,
    detail: `This string field has high length variance with a ${ratio.toFixed(1)}x spread between median and 95th percentile. The median instance costs ${p50} tokens but the 95th percentile costs ${p95} tokens. Adding max_length guidance in your schema description would reduce outlier costs.`,
    savings_tokens: savingsPerInstance,
    savings_usd_per_10k: savingsPerInstance * pricePerToken * 1e4
  });
}
|
|
515
|
+
|
|
516
|
+
// src/engine/costCalculation.ts
|
|
517
|
+
/**
 * Fills in the `cost` field of every node in the aggregated tree.
 * @param {object} tree - Root of the aggregated analysis tree.
 * @param {{output_per_1m: number}} pricing - Model pricing (USD per 1M output tokens).
 * @param {number} fileCount - Number of documents in the corpus.
 */
function calculateCosts(tree, pricing, fileCount) {
  walkAndComputeCost(tree, pricing.output_per_1m / 1e6, fileCount);
}
/**
 * Recursively assigns per-instance and whole-corpus USD costs to a node
 * and its children, based on the node's average subtree token count.
 */
function walkAndComputeCost(node, pricePerToken, fileCount) {
  const perInstance = node.tokens.total.avg * pricePerToken;
  node.cost = {
    per_instance: perInstance,
    total_corpus: perInstance * fileCount
  };
  node.children.forEach((child) => walkAndComputeCost(child, pricePerToken, fileCount));
}
|
|
530
|
+
|
|
531
|
+
// src/engine/pricing.ts
|
|
532
|
+
// Built-in model pricing table: USD per 1M output tokens plus the tiktoken
// encoding used to count tokens for that model.
var MODELS = {
  "gpt-4o": {
    model_id: "gpt-4o",
    provider: "openai",
    output_per_1m: 10,
    tokenizer: "o200k_base"
  },
  "gpt-4o-mini": {
    model_id: "gpt-4o-mini",
    provider: "openai",
    output_per_1m: 0.6,
    tokenizer: "o200k_base"
  },
  "claude-sonnet-4-5": {
    model_id: "claude-sonnet-4-5",
    provider: "anthropic",
    output_per_1m: 15,
    // Anthropic uses its own tokenizer but o200k_base is a reasonable proxy
    tokenizer: "o200k_base"
  },
  "claude-haiku-4-5": {
    model_id: "claude-haiku-4-5",
    provider: "anthropic",
    output_per_1m: 5,
    tokenizer: "o200k_base"
  }
};
/**
 * Looks up pricing for a model id.
 * @param {string} modelId - One of the keys of MODELS.
 * @returns {object} the pricing entry.
 * @throws {Error} listing the known models when modelId is unrecognized.
 */
function getModelPricing(modelId) {
  const entry = MODELS[modelId] ?? null;
  if (entry === null) {
    throw new Error(`Unknown model: "${modelId}". Available: ${Object.keys(MODELS).join(", ")}`);
  }
  return entry;
}
|
|
566
|
+
|
|
567
|
+
// src/engine/cohorts.ts
|
|
568
|
+
/**
 * JSON type classifier used for cohort fingerprinting. Unlike the schema
 * inference variant, it never throws: unrecognized values fall back to
 * "string" so fingerprinting stays total.
 */
function getJsonType2(value) {
  if (value === null) return "null";
  if (Array.isArray(value)) return "array";
  const t = typeof value;
  switch (t) {
    case "string":
    case "number":
    case "boolean":
    case "object":
      return t;
    default:
      return "string";
  }
}
/**
 * Produces a structural fingerprint for one document: the sorted top-level
 * key set joined by "|" for plain objects, or "_root:<type>" for anything
 * that isn't a plain object (primitives, arrays, null).
 */
function fingerprint(doc) {
  const isPlainObject = typeof doc === "object" && doc !== null && !Array.isArray(doc);
  if (!isPlainObject) {
    return `_root:${getJsonType2(doc)}`;
  }
  return Object.keys(doc).sort().join("|");
}
/**
 * Groups documents by structural fingerprint into cohorts, sorted largest
 * first. Each cohort records the indices of its member documents.
 */
function detectCohorts(documents) {
  const groups = /* @__PURE__ */ new Map();
  documents.forEach((doc, index) => {
    const fp = fingerprint(doc);
    const bucket = groups.get(fp);
    if (bucket) {
      bucket.push(index);
    } else {
      groups.set(fp, [index]);
    }
  });
  const cohorts = [...groups].map(([id, file_indices]) => ({
    id,
    label: generateCohortLabel(id, file_indices.length),
    file_count: file_indices.length,
    file_indices
  }));
  return cohorts.sort((a, b) => b.file_count - a.file_count);
}
/**
 * Renders a short human-readable label from a fingerprint: up to three key
 * names, then "+N" for the remainder. The ":<type>" suffix of non-object
 * fingerprints is stripped.
 */
function generateCohortLabel(fp, _fileCount) {
  const keys = fp.split("|").map((part) => part.split(":")[0]);
  if (keys.length <= 3) {
    return keys.join(", ");
  }
  return `${keys.slice(0, 3).join(", ")} +${keys.length - 3}`;
}
|
|
612
|
+
|
|
613
|
+
// src/engine/pipeline.ts
|
|
614
|
+
// Core analysis pass: infer a unified schema over the documents, tokenize each
// document against it, aggregate per-field token stats into a tree, then
// attach costs and build the headline summary.
// Params: documents — parsed JSON docs; pricing — model pricing record with
// output_per_1m/model_id/tokenizer; encoding — tokenizer encoding handle;
// glob2 — the glob string echoed into the summary; sampleValues2 — max example
// values retained per field.
// Returns the "tokstat/v1" report object { schema, summary, tree, insights }.
function analyzeParsedDocuments(documents, pricing, encoding, glob2, sampleValues2) {
  const schema = inferSchema(documents);
  const perFileTokens = [];
  const perFileValues = [];
  for (const doc of documents) {
    perFileTokens.push(tokenizeFile(doc, schema, encoding));
    perFileValues.push(collectValues(doc, schema));
  }
  const tree = aggregate(schema, perFileTokens, perFileValues, sampleValues2);
  // NOTE: calculateCosts mutates `tree` in place; the reads of tree.tokens
  // below depend on running it first.
  calculateCosts(tree, pricing, documents.length);
  const pricePerToken = pricing.output_per_1m / 1e6; // USD per single output token
  const insights = detectInsights(tree, pricePerToken);
  const avgTokens = tree.tokens.total.avg;
  const costPerInstance = avgTokens * pricePerToken;
  const summary = {
    file_count: documents.length,
    glob: glob2,
    model: pricing.model_id,
    tokenizer: pricing.tokenizer,
    output_price_per_1m: pricing.output_per_1m,
    // Corpus totals are reconstructed from the per-instance average.
    corpus_total_tokens: avgTokens * documents.length,
    corpus_total_cost: costPerInstance * documents.length,
    avg_tokens_per_instance: avgTokens,
    cost_per_instance: costPerInstance,
    // Ratios are guarded against division by zero for an empty/degenerate corpus.
    overhead_ratio: avgTokens > 0 ? tree.tokens.schema_overhead / avgTokens : 0,
    null_waste_ratio: avgTokens > 0 ? tree.tokens.null_waste / avgTokens : 0,
    // Cost projections at common generation volumes (1K/10K/100K/1M instances).
    cost_at_1k: costPerInstance * 1e3,
    cost_at_10k: costPerInstance * 1e4,
    cost_at_100k: costPerInstance * 1e5,
    cost_at_1m: costPerInstance * 1e6,
    top_insights: insights.slice(0, 5)
  };
  return { schema: "tokstat/v1", summary, tree, insights };
}
|
|
648
|
+
function runPipeline(options2) {
  // End-to-end single-cohort run: resolve pricing, load the corpus, pick the
  // tokenizer encoding, and hand everything to the analysis pass.
  const { pricing, tokenizerName } = resolvePricing(options2);
  const documents = readFiles(options2.glob).map((file) => file.parsed);
  const encoding = getEncoding(tokenizerName);
  return analyzeParsedDocuments(documents, pricing, encoding, options2.glob, options2.sampleValues);
}
|
|
655
|
+
function runCohortedPipeline(options2) {
  // Same setup as runPipeline, but additionally splits the corpus into schema
  // cohorts and analyzes each cohort's documents separately.
  const { pricing, tokenizerName } = resolvePricing(options2);
  const documents = readFiles(options2.glob).map((file) => file.parsed);
  const encoding = getEncoding(tokenizerName);
  const cohorts = detectCohorts(documents);
  const combined = analyzeParsedDocuments(documents, pricing, encoding, options2.glob, options2.sampleValues);
  const per_cohort = {};
  for (const cohort of cohorts) {
    const subset = cohort.file_indices.map((idx) => documents[idx]);
    const cohortGlob = `${options2.glob} [${cohort.label}]`;
    per_cohort[cohort.id] = analyzeParsedDocuments(subset, pricing, encoding, cohortGlob, options2.sampleValues);
  }
  return { schema: "tokstat/v1", cohorts, combined, per_cohort };
}
|
|
675
|
+
// Resolve output-token pricing and the tokenizer encoding name for a run.
// Precedence: an explicit --tokenizer wins; with --cost-per-1k (no catalog
// lookup) auto falls back to "o200k_base"; otherwise the model catalog decides.
// Fix: the catalog was previously consulted up to twice (once for the
// tokenizer, once for pricing) and tokenizerName was redundantly re-derived;
// it is provably identical to the resolved tokenizer in every branch.
// Returns { pricing, tokenizerName }.
function resolvePricing(options2) {
  // Look up the catalog entry at most once, and only when no custom price
  // overrides it.
  const catalog = options2.costPer1k ? null : getModelPricing(options2.model);
  const tokenizerName = options2.tokenizer !== "auto"
    ? options2.tokenizer
    : catalog
      ? catalog.tokenizer
      : "o200k_base";
  const pricing = options2.costPer1k
    ? {
        model_id: options2.model,
        provider: "custom",
        output_per_1m: options2.costPer1k * 1e3,
        tokenizer: tokenizerName
      }
    : catalog;
  return { pricing, tokenizerName };
}
|
|
686
|
+
// Expand the glob, read each matched file, and parse it as JSON.
// Paths are sorted so document order (and therefore cohort ids and averages)
// is deterministic across runs.
// Returns [{ path, parsed }]; throws if nothing matches or a file is not JSON.
function readFiles(glob2) {
  const paths = fg.sync(glob2).sort();
  if (paths.length === 0) {
    throw new Error(`No files matched glob: "${glob2}"`);
  }
  const files = [];
  for (const filePath of paths) {
    const absPath = resolve(filePath);
    const content = readFileSync(absPath, "utf-8");
    let parsed;
    try {
      parsed = JSON.parse(content);
    } catch (err) {
      // Fix: a malformed file previously surfaced as a bare SyntaxError with
      // no hint of WHICH of the matched files was broken.
      throw new Error(`Failed to parse JSON in ${absPath}`, { cause: err });
    }
    files.push({ path: absPath, parsed });
  }
  return files;
}
|
|
700
|
+
|
|
701
|
+
// src/formatters/jsonFormatter.ts
|
|
702
|
+
// Serialize the full analysis payload as human-readable JSON.
function formatJson(output2) {
  const INDENT_WIDTH = 2;
  return JSON.stringify(output2, null, INDENT_WIDTH);
}
|
|
705
|
+
|
|
706
|
+
// src/formatters/llmFormatter.ts
|
|
707
|
+
// Render the analysis as a compact plain-text report meant to be pasted into
// an LLM prompt: headline, scale projections, then capped lists of savings,
// overhead hotspots, high-waste fields, and boilerplate notes.
function formatLlm(output2) {
  const { summary, tree, insights } = output2;
  const report = [];

  report.push(`tokstat analysis: ${summary.file_count} files, ${summary.model} (${summary.tokenizer})`);
  report.push("");
  report.push(`HEADLINE: $${summary.cost_per_instance.toFixed(4)}/instance, ${Math.round(summary.overhead_ratio * 100)}% schema overhead, ${Math.round(summary.null_waste_ratio * 100)}% null waste`);
  report.push("");
  report.push("SCALE:");
  report.push(` 1K: $${summary.cost_at_1k.toFixed(2)}`);
  report.push(` 10K: $${summary.cost_at_10k.toFixed(2)}`);
  report.push(` 100K: $${summary.cost_at_100k.toFixed(2)}`);
  report.push(` 1M: $${summary.cost_at_1m.toFixed(2)}`);
  report.push("");

  if (insights.length > 0) {
    report.push("TOP SAVINGS:");
    insights.slice(0, 5).forEach((insight, rank) => {
      report.push(` ${rank + 1}. ${formatInsightLine(insight)}`);
    });
    report.push("");
  }

  const hotspots = findOverheadHotspots(tree);
  if (hotspots.length > 0) {
    report.push("SCHEMA OVERHEAD HOTSPOTS:");
    for (const hotspot of hotspots.slice(0, 5)) {
      report.push(` ${hotspot}`);
    }
    report.push("");
  }

  const wasteFields = findHighWasteFields(tree);
  if (wasteFields.length > 0) {
    report.push("HIGH WASTE (low fill, high cost):");
    for (const field of wasteFields.slice(0, 5)) {
      report.push(` ${field}`);
    }
    report.push("");
  }

  const boilerplate = insights.filter((entry) => entry.type === "boilerplate");
  if (boilerplate.length > 0) {
    report.push("BOILERPLATE:");
    for (const insight of boilerplate.slice(0, 3)) {
      report.push(` ${insight.message}`);
    }
    report.push("");
  }

  return report.join("\n");
}
|
|
759
|
+
// One-line rendering of an insight: path, first sentence of the message, and
// rounded per-instance / per-10K savings.
function formatInsightLine(insight) {
  // Keep only the text before the first "." (the message's first sentence).
  const headline = insight.message.split(".")[0];
  const tokenSavings = Math.round(insight.savings_tokens);
  const usdSavings = insight.savings_usd_per_10k.toFixed(2);
  return `${insight.path} \u2014 ${headline}, saves ${tokenSavings} tok/inst ($${usdSavings}/10K)`;
}
|
|
764
|
+
// Walk the tree and collect nodes whose token spend is dominated by structural
// overhead (repeated key names), sorted by estimated overhead, largest first.
function findOverheadHotspots(tree) {
  const found = [];

  const inspect = (node) => {
    // Arrays of objects re-emit every field name once per item.
    const stats = node.array_stats;
    if (node.type === "array" && stats && stats.avg_items > 1) {
      const itemNode = node.children.find((child) => child.name === "[]");
      if (itemNode) {
        const fieldCount = itemNode.children.length;
        const keyEmissions = fieldCount * stats.avg_items;
        found.push({
          text: `${node.path} items: ${fieldCount} field names x ${stats.avg_items.toFixed(1)} avg items = ${keyEmissions.toFixed(1)} key emissions/inst`,
          overhead: keyEmissions
        });
      }
    }
    // Objects where over 60% of tokens go to structure rather than values.
    if (node.type === "object" && node.tokens.total.avg > 0) {
      const ratio = node.tokens.schema_overhead / node.tokens.total.avg;
      if (ratio > 0.6 && node.children.length > 2) {
        found.push({
          text: `${node.path} object: ${node.children.length} field names, ${Math.round(ratio * 100)}% overhead ratio`,
          overhead: node.tokens.schema_overhead
        });
      }
    }
    node.children.forEach(inspect);
  };

  inspect(tree);
  return found.sort((a, b) => b.overhead - a.overhead).map((entry) => entry.text);
}
|
|
795
|
+
// Flag partially-filled fields: present in fewer than half the documents but
// still costing tokens on average. Sorted by estimated wasted tokens, desc.
function findHighWasteFields(tree) {
  const flagged = [];

  const scan = (node) => {
    const partiallyFilled = node.fill_rate < 0.5 && node.fill_rate > 0;
    if (partiallyFilled && node.tokens.total.avg > 0 && node.name !== "root") {
      flagged.push({
        text: `${node.path} \u2014 ${Math.round(node.tokens.total.avg)} tok avg, ${Math.round(node.fill_rate * 100)}% fill`,
        // Expected tokens spent on instances where the field is absent/unused.
        waste: node.tokens.total.avg * (1 - node.fill_rate)
      });
    }
    node.children.forEach(scan);
  };

  scan(tree);
  return flagged.sort((a, b) => b.waste - a.waste).map((entry) => entry.text);
}
|
|
814
|
+
|
|
815
|
+
// src/cli/server.ts
|
|
816
|
+
import { createServer } from "http";
|
|
817
|
+
import open from "open";
|
|
818
|
+
var APP_URL = "https://tomneyland.github.io/tokstat/other/index.html";

// Fetch the hosted viewer app, inject the analysis payload as an inline JSON
// script tag, and serve the result on http://localhost:<port>. Blocks until
// the process receives SIGINT/SIGTERM.
// Params: data — cohorted analysis payload; port — TCP port; autoOpen — open
// the browser after the server starts listening.
async function startServer(data, port, autoOpen) {
  const resp = await fetch(APP_URL);
  // Fix: a CDN 404/500 error page was previously served verbatim as the app.
  if (!resp.ok) {
    throw new Error(`Failed to fetch viewer app from ${APP_URL} (HTTP ${resp.status})`);
  }
  const html = await resp.text();
  const dataScript = `<script id="tokstat-data" type="application/json">${JSON.stringify(data)}</script>`;
  const injectedHtml = html.replace("</head>", `${dataScript}\n</head>`);
  const server = createServer((_req, res) => {
    // Every request gets the same pre-rendered page; the viewer is a SPA.
    res.writeHead(200, { "Content-Type": "text/html; charset=utf-8" });
    res.end(injectedHtml);
  });
  server.listen(port, () => {
    const url = `http://localhost:${port}`;
    console.log(`\n Report available at ${url}`);
    console.log("  Press Ctrl+C to exit\n");
    if (autoOpen) open(url);
  });
  process.on("SIGINT", () => {
    server.close();
    process.exit(0);
  });
  process.on("SIGTERM", () => {
    server.close();
    process.exit(0);
  });
  // Never resolves: keeps the async entry point (and the process) alive until
  // one of the signal handlers above exits.
  await new Promise(() => {
  });
}
|
|
847
|
+
|
|
848
|
+
// src/cli/index.ts
|
|
849
|
+
// CLI entry point (src/cli/index.ts): parse flags, run the analysis pipeline,
// and emit the report in the requested format.
var APP_URL2 = "https://tomneyland.github.io/tokstat/other/";
// Declarative option parsing; defaults mirror the README examples.
var program = new Command().name("tokstat").description("Visualize per-field token costs across a corpus of LLM-generated JSON").argument("[glob]", "Glob pattern for JSON files").option("--model <model>", "Model for cost estimation", "gpt-4o").option("--format <fmt>", "Output format: interactive, json, llm", "interactive").option("--tokenizer <enc>", "Tokenizer encoding", "auto").option("--out <path>", "Write to file instead of stdout/browser").option("--port <port>", "Dev server port", "3742").option("--no-open", "Don't auto-open browser").option("--cost-per-1k <n>", "Custom output token price per 1K tokens").option("--sample-values <n>", "Example values per field", "5").parse(process.argv);
var opts = program.opts();
var glob = program.args[0];
// No glob: just open the hosted viewer (it accepts pasted/uploaded data).
if (!glob) {
  console.log(` Opening ${APP_URL2}`);
  await open2(APP_URL2);
  process.exit(0);
}
// Normalize raw commander strings into a typed options record.
var options = {
  glob,
  model: opts.model,
  format: opts.format,
  tokenizer: opts.tokenizer,
  out: opts.out ?? null,
  port: parseInt(opts.port, 10),
  // commander maps --no-open to opts.open === false.
  noOpen: opts.open === false,
  costPer1k: opts.costPer1k ? parseFloat(opts.costPer1k) : null,
  sampleValues: parseInt(opts.sampleValues, 10)
};
// For machine-readable formats, suppress progress chatter entirely; other
// progress always goes to stderr so stdout stays pipeable.
var isQuiet = options.format === "json" || options.format === "llm";
function log(msg) {
  if (!isQuiet) {
    process.stderr.write(msg + "\n");
  }
}
log(` Analyzing ${options.glob}...`);
var output = runPipeline(options);
log(` ${output.summary.file_count} files, ${Math.round(output.summary.avg_tokens_per_instance)} avg tokens/instance`);
log(` ${Math.round(output.summary.overhead_ratio * 100)}% schema overhead, ${Math.round(output.summary.null_waste_ratio * 100)}% null waste`);
log(` ${output.insights.length} insights detected`);
switch (options.format) {
  case "json": {
    const formatted = formatJson(output);
    if (options.out) {
      writeFileSync(options.out, formatted, "utf-8");
      console.log(`Output written to ${options.out}`);
    } else {
      console.log(formatted);
    }
    break;
  }
  case "llm": {
    const formatted = formatLlm(output);
    if (options.out) {
      writeFileSync(options.out, formatted, "utf-8");
      console.log(`Output written to ${options.out}`);
    } else {
      console.log(formatted);
    }
    break;
  }
  case "interactive": {
    // Interactive mode re-runs the pipeline with per-schema-cohort breakdowns.
    const cohortedOutput = runCohortedPipeline(options);
    log(` ${cohortedOutput.cohorts.length} schema cohort(s) detected`);
    if (options.out) {
      // --out: bake the data into a standalone HTML file instead of serving.
      // NOTE(review): this duplicates the fetch/inject logic in startServer.
      log(" Building self-contained HTML...");
      const resp = await fetch("https://tomneyland.github.io/tokstat/other/index.html");
      const html = await resp.text();
      const dataScript = `<script id="tokstat-data" type="application/json">${JSON.stringify(cohortedOutput)}</script>`;
      const injectedHtml = html.replace("</head>", `${dataScript}
</head>`);
      writeFileSync(options.out, injectedHtml, "utf-8");
      log(` Written to ${options.out}`);
    } else {
      // Serves until Ctrl+C; this await never resolves.
      log(" Starting visualization server...");
      await startServer(cohortedOutput, options.port, !options.noOpen);
    }
    break;
  }
  default:
    throw new Error(`Unknown format: ${options.format}`);
}
|
package/package.json
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "tokstat",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "Visualize per-field token costs across a corpus of LLM-generated JSON",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"bin": {
|
|
7
|
+
"tokstat": "dist/index.js"
|
|
8
|
+
},
|
|
9
|
+
"files": [
|
|
10
|
+
"dist/",
|
|
11
|
+
"README.md"
|
|
12
|
+
],
|
|
13
|
+
"engines": {
|
|
14
|
+
"node": ">=18"
|
|
15
|
+
},
|
|
16
|
+
"scripts": {
|
|
17
|
+
"dev": "vite",
|
|
18
|
+
"build": "vite build",
|
|
19
|
+
"build:cli": "tsup",
|
|
20
|
+
"preview": "vite preview",
|
|
21
|
+
"check": "svelte-check --tsconfig ./tsconfig.app.json && tsc -p tsconfig.node.json",
|
|
22
|
+
"test": "vitest run",
|
|
23
|
+
"test:watch": "vitest",
|
|
24
|
+
"prepublishOnly": "tsup"
|
|
25
|
+
},
|
|
26
|
+
"dependencies": {
|
|
27
|
+
"commander": "^14.0.3",
|
|
28
|
+
"fast-glob": "^3.3.3",
|
|
29
|
+
"js-tiktoken": "^1.0.21",
|
|
30
|
+
"open": "^11.0.0"
|
|
31
|
+
},
|
|
32
|
+
"devDependencies": {
|
|
33
|
+
"@sveltejs/vite-plugin-svelte": "^6.2.1",
|
|
34
|
+
"@tsconfig/svelte": "^5.0.6",
|
|
35
|
+
"@types/d3-hierarchy": "^3.1.7",
|
|
36
|
+
"@types/d3-interpolate": "^3.0.4",
|
|
37
|
+
"@types/d3-scale": "^4.0.9",
|
|
38
|
+
"@types/node": "^24.10.1",
|
|
39
|
+
"d3-hierarchy": "^3.1.2",
|
|
40
|
+
"d3-interpolate": "^3.0.1",
|
|
41
|
+
"d3-scale": "^4.0.2",
|
|
42
|
+
"svelte": "^5.45.2",
|
|
43
|
+
"svelte-check": "^4.3.4",
|
|
44
|
+
"tsup": "^8.5.1",
|
|
45
|
+
"typescript": "~5.9.3",
|
|
46
|
+
"vite": "^7.3.1",
|
|
47
|
+
"vite-plugin-singlefile": "^2.3.0",
|
|
48
|
+
"vitest": "^4.0.18"
|
|
49
|
+
}
|
|
50
|
+
}
|