PyPI - class1 - Versions diffs - 0.1.0__py3-none-any.whl - Mend

class1 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (72) hide show

blue_book/actuals.py +126 -0
blue_book/actuals_history.py +503 -0
blue_book/allocation.py +31 -0
blue_book/calibration_data.py +544 -0
blue_book/estimate_store.py +101 -0
blue_book/focus.py +143 -0
blue_book/footprint_actuals.py +93 -0
blue_book/ingest.py +61 -0
blue_book/ingest_core.py +184 -0
blue_book/mine.py +171 -0
blue_book/opendata.py +180 -0
blue_book/otel_receiver.py +260 -0
blue_book/search_first.py +28 -0
blue_book/usage_openai.py +57 -0
class1-0.1.0.dist-info/METADATA +135 -0
class1-0.1.0.dist-info/RECORD +72 -0
class1-0.1.0.dist-info/WHEEL +5 -0
class1-0.1.0.dist-info/entry_points.txt +2 -0
class1-0.1.0.dist-info/top_level.txt +4 -0
cost_engine/__init__.py +97 -0
cost_engine/aliases.py +53 -0
cost_engine/autopoiesis.py +109 -0
cost_engine/basis.py +118 -0
cost_engine/basis_of_estimate.py +65 -0
cost_engine/budget.py +101 -0
cost_engine/calibration.py +136 -0
cost_engine/calibrator.py +91 -0
cost_engine/capability.py +130 -0
cost_engine/capability_data.py +43 -0
cost_engine/capex.py +90 -0
cost_engine/classification.py +45 -0
cost_engine/cloud_cost.py +78 -0
cost_engine/commitment.py +41 -0
cost_engine/contingency.py +26 -0
cost_engine/distributions.py +28 -0
cost_engine/energy.py +92 -0
cost_engine/escalation.py +40 -0
cost_engine/estimate_decay.py +38 -0
cost_engine/evidence.py +217 -0
cost_engine/grades_real.py +123 -0
cost_engine/mcp_overhead.py +80 -0
cost_engine/monte_carlo.py +107 -0
cost_engine/prices.py +89 -0
cost_engine/pricing_loader.py +35 -0
cost_engine/recommend.py +65 -0
cost_engine/report.py +69 -0
cost_engine/scenario.py +106 -0
cost_engine/self_cost.py +27 -0
cost_engine/structured_price.py +162 -0
snapshots/__init__.py +0 -0
snapshots/actuals_index.json +738 -0
snapshots/actuarial_table_real.json +2132 -0
snapshots/autobuild_runs.json +176 -0
snapshots/capability.json +280 -0
snapshots/cloud_price_index.json +219694 -0
snapshots/estimates.json +54 -0
snapshots/footprint_basis.json +33 -0
snapshots/grid_intensity.json +31 -0
snapshots/price_index.json +45810 -0
snapshots/pricing.json +49088 -0
snapshots/pricing_structure.json +10193 -0
snapshots/spec_sheet.json +419463 -0
snapshots/water_basis.json +17 -0
takeoff/__init__.py +14 -0
takeoff/estimate_pr.py +423 -0
takeoff/license.py +91 -0
takeoff/pilot.py +68 -0
takeoff/policy.py +96 -0
takeoff/post_pr.py +67 -0
takeoff/scan.py +354 -0
takeoff/scan_treesitter.py +173 -0
takeoff/translate.py +92 -0

blue_book/actuals.py ADDED Viewed

@@ -0,0 +1,126 @@
+"""FinOps actuals — FOCUS-IN + close the calibration loop.
+The FinOps-canonical source of ACTUAL spend is a FOCUS dataset (FinOps Open Cost & Usage
+Specification): a cloud/billing export, a FinOps platform (Vantage / CloudZero / Finout) export,
+or the provider usage/costs API mapped to FOCUS. blue_book already EXPORTS FOCUS (focus.py); this is
+the inverse — read a FOCUS dataset, aggregate the FinOps `EffectiveCost` per workload/month into the
+monthly ACTUAL, and feed it (paired with the prior ESTIMATE) into the ActuarialTable so the estimate
+class rises from 5 (a guess) toward validated. Pure/offline: the REAL FOCUS rows are the user's data.
+"""
+from __future__ import annotations
+import csv
+from pathlib import Path
+from cost_engine.calibration import ActuarialTable  # NOTE: layer violation — blue_book imports cost_engine. Tracked as tech debt.
+def read_focus_csv(path: str | Path) -> list[dict]:
+    """Read a FOCUS-format CSV (as exported by focus.export_to_csv or any FinOps tool / cloud billing).
+    Uses utf-8-SIG: real exports (verified on Microsoft's Azure EA FOCUS sample) carry a UTF-8 BOM that
+    would otherwise corrupt the first column name (\\ufeffBilledCost)."""
+    with Path(path).open(newline="", encoding="utf-8-sig") as f:
+        return list(csv.DictReader(f))
+def monthly_actual(focus_rows: list[dict], workflow: str | None = None, month: str | None = None,
+                   cost_col: str = "EffectiveCost", tag_col: str = "x_workflow_name") -> float:
+    """The ACTUAL monthly spend = sum of FinOps EffectiveCost over FOCUS rows. Optionally scope to a
+    workload (the `tag_col` column == `workflow` — x_workflow_name for our LLM exports, or ServiceName/
+    ResourceId for cloud FOCUS) and/or a month (YYYY-MM prefix of ChargePeriodStart). EffectiveCost is
+    the post-discount FinOps cost — the right number to validate an estimate against."""
+    total = 0.0
+    for r in focus_rows:
+        if workflow is not None and (r.get(tag_col) or "") != workflow:
+            continue
+        if month is not None and not str(r.get("ChargePeriodStart", "")).startswith(month):
+            continue
+        try:
+            total += float(r.get(cost_col) or 0.0)
+        except (TypeError, ValueError):
+            continue
+    return total
def record_actual(table: ActuarialTable, workflow: str, estimate: dict, focus_rows: list[dict], *,
                  month: str | None = None, dominant_driver: str = "output length",
                  tag_col: str = "x_workflow_name") -> float:
    """Pair a prior estimate with the actual spend found in a FOCUS dataset and record both.

    The actual is ``monthly_actual`` over ``focus_rows`` scoped to ``workflow`` (matched on
    ``tag_col``) and, optionally, ``month``. It is then handed to ``table.add`` together with
    the workflow name, the estimate and ``dominant_driver``.

    Returns:
        The actual spend that was recorded.
    """
    observed_spend = monthly_actual(focus_rows, workflow=workflow, month=month, tag_col=tag_col)
    table.add(workflow, estimate, observed_spend, dominant_driver)
    return observed_spend
+def variance_waterfall(estimate: dict, actual: float, driver_elasticities: dict[str, float] | None = None) -> dict[str, float]:
+    """Isolate the cost variance (Actual - Expected) into a waterfall of risk drivers.
+    If actual exceeds expected, the variance is distributed proportionally
+    across the drivers based on their simulated elasticities. This bridges
+    the gap between 'we missed the budget' and 'here is exactly why'.
+    """
+    driver_elasticities = driver_elasticities or {}
+    expected = float(estimate.get("expected", 0.0))
+    total_variance = actual - expected
+    waterfall = {}
+    if total_variance == 0 or not driver_elasticities:
+        waterfall["unexplained"] = total_variance
+        return waterfall
+    total_elasticity = sum(driver_elasticities.values())
+    if total_elasticity == 0:
+        waterfall["unexplained"] = total_variance
+        return waterfall
+    explained = 0.0
+    for driver, elasticity in driver_elasticities.items():
+        # Using abs(elasticity) in case negative correlation exists but we allocate magnitude
+        weight = abs(elasticity) / sum(abs(v) for v in driver_elasticities.values())
+        impact = total_variance * weight
+        waterfall[driver] = impact
+        explained += impact
+    remainder = total_variance - explained
+    if abs(remainder) > 0.01:
+        waterfall["remainder"] = remainder
+    return waterfall
def _load_estimate(spec: str) -> dict:
    """Resolve the --estimate argument: a JSON file if `spec` names an existing path, else inline JSON."""
    import json

    try:
        is_path = Path(spec).exists()
    except OSError:
        # Inline JSON longer than the filesystem's name limit makes the existence probe itself
        # fail (ENAMETOOLONG); such a string cannot be a file, so parse it inline.
        is_path = False
    return json.loads(Path(spec).read_text() if is_path else spec)
def _main(argv=None) -> int:
    """CLI: record a real ACTUAL (from a FOCUS export) against a stored ESTIMATE -> persist the loop.
      python -m blue_book.actuals --focus bill.csv --workflow support_agent \\
          --estimate '{"expected":18,"p50":16,"p90":30}' [--month 2026-06]

    Loads the table file if it exists (otherwise starts an empty ActuarialTable), records the
    actual, saves the table back to the same path and prints a one-line summary.

    Args:
        argv: Argument list for argparse; None means use the process arguments.

    Returns:
        0, used as the process exit status.
    """
    import argparse
    from cost_engine.calibration import ActuarialTable, load_table, save_table  # NOTE: layer violation — blue_book imports cost_engine. Tracked as tech debt.
    ap = argparse.ArgumentParser(description="Close the calibration loop: a FOCUS actual vs a prior estimate.")
    ap.add_argument("--focus", required=True, help="FOCUS CSV (FinOps export / cloud billing / provider-usage->FOCUS)")
    ap.add_argument("--workflow", required=True, help="x_workflow_name to scope the actual to")
    ap.add_argument("--estimate", required=True, help="JSON (file path or inline) with expected/p50/p90")
    ap.add_argument("--month", default=None, help="YYYY-MM to scope the actual (default: all)")
    ap.add_argument("--table", default="snapshots/actuarial_table.json", help="persisted ActuarialTable")
    ap.add_argument("--driver", default="output length")
    ap.add_argument("--scope-col", default="x_workflow_name",
                    help="FOCUS column to scope by (x_workflow_name for LLM; ServiceName/ResourceId for cloud)")
    a = ap.parse_args(argv)
    est = _load_estimate(a.estimate)
    table = load_table(a.table) if Path(a.table).exists() else ActuarialTable()
    actual = record_actual(table, a.workflow, est, read_focus_csv(a.focus),
                           month=a.month, dominant_driver=a.driver, tag_col=a.scope_col)
    save_table(table, a.table)
    cls = table.estimate_class(a.workflow)
    # NOTE(review): the verdict is read through the table's private _scoped() accessor.
    print(f"actual ${actual:,.2f} recorded for '{a.workflow}' (month={a.month or 'all'}) -> "
          f"n_actuals={table.n_actuals(a.workflow)}, class={cls.label}, "
          f"verdict={table._scoped(a.workflow)[-1].verdict}")
    return 0
if __name__ == "__main__":
    raise SystemExit(_main())

blue_book/actuals_history.py ADDED Viewed

@@ -0,0 +1,503 @@
+"""HISTORICAL ACTUALS corpus — real FinOps/FOCUS spend, the auditable basis for the COST side.
+Prices are one half of the basis (the RATE). ACTUALS are the other (what was really BILLED), and
+they are themselves historical data — a dated FOCUS dataset is the spend-equivalent of the Token
+Price Index. This is the medallion ingest for actuals, mirroring `prices/history.py`:
+  catalog (SOURCES, each WIRED / CATALOG_ONLY / REJECTED with license + verdict)
+    -> fetch each WIRED dataset to snapshots/bronze/actuals/<name>/ (pinned to a commit sha, sha256-stamped)
+    -> aggregate ALL bronze FOCUS rows into snapshots/actuals_index.json
+       (real monthly EffectiveCost by provider / ServiceCategory / month)
+Most public FOCUS data is CLOUD infrastructure — Compute, Storage, Networking, Databases — plus an
+"AI and Machine Learning" ServiceCategory. That is exactly the non-token, fully-loaded FinOps stack
+an LLM application ALSO pays: the agent-loop compute, the vector-DB hosting, the egress. ABC7D's
+token cost comes from the price DB; its CLOUD cost comes from THIS corpus. The index is the grounded
+basis that replaces the hand-set `orchestration_cost_per_call` / `fixed_monthly_usd` priors.
+Offline discipline: `build_actuals_index` is pure (reads already-fetched bronze; the testable core).
+`fetch_source` is the only network step (resolves a commit sha, downloads raw at that sha) — same
+provenance contract as the price history.
+"""
+from __future__ import annotations
+import csv
+import hashlib
+import json
+import subprocess
+from collections import defaultdict
+from dataclasses import dataclass, field
+from datetime import date
+from pathlib import Path
+BRONZE = Path("snapshots/bronze/actuals")
+INDEX = Path("snapshots/actuals_index.json")
+# FOCUS canonical columns we aggregate on (the spec names; read tolerantly, see _col).
+_COST_COLS = ("EffectiveCost", "BilledCost", "ContractedCost", "ListCost")
+_AI_CATEGORY = "AI and Machine Learning"   # the FOCUS ServiceCategory closest to LLM/AI infra
@dataclass(frozen=True)
class ActualsSource:
    """Catalog entry describing one public actuals dataset (immutable).

    Attributes:
        name: Catalog identifier of the dataset.
        kind: ``focus_native`` (already FOCUS, directly ingestable), ``provider_native`` (a raw
            cloud export that must be converted to FOCUS first) or ``api`` (a usage API).
        license: License the data is published under.
        verdict: ``WIRED``, ``CATALOG_ONLY`` or ``REJECTED``.
        repo: GitHub ``owner/name`` used for sha resolution and raw download; empty if none.
        branch: Branch the file is resolved against.
        path: File path inside the repo.
        note: Free-text description of the dataset.
    """
    name: str
    kind: str
    license: str
    verdict: str
    repo: str = ""
    branch: str = "main"
    path: str = ""
    note: str = ""
# Catalog of every known actuals dataset, keyed by the dataset's own `name`.
SOURCES: dict[str, ActualsSource] = {
    source.name: source
    for source in (
        # --- WIRED: FOCUS-native, bulk-ingested into the actuals index ------------------------------
        ActualsSource(
            name="focus_validator_10000", kind="focus_native", license="MIT", verdict="WIRED",
            repo="finopsfoundation/focus_validator", branch="main",
            path="tests/samples/focus_sample_10000.csv",
            note="10,000-row multi-cloud FOCUS 1.0 sample (AWS/Microsoft/Oracle, 2024-09). Full 44-col "
                 "spec incl. commitment-discount + an 'AI and Machine Learning' category."),
        ActualsSource(
            name="focus_sample_100000", kind="focus_native", license="MIT", verdict="WIRED",
            repo="FinOps-Open-Cost-and-Usage-Spec/FOCUS-Sample-Data", branch="main",
            path="FOCUS-1.0/focus_sample_100000.csv.gz",
            note="100,000-row multi-cloud FOCUS 1.0 sample (AWS/Microsoft/Oracle/GCP, 2024-09). "
                 "Same spec as the 10K validator set, 10x the rows + GCP representation."),
        # --- WIRED provider-native: converted in-process to FOCUS via _convert_* functions -----------
        ActualsSource(
            name="focus_converter_aws_cur", kind="provider_native", license="MIT", verdict="WIRED",
            repo="finopsfoundation/focus_converters", branch="dev",
            path="focus_converter_base/tests/provider_config_tests/aws/sample-anonymous-aws-export-dataset.csv",
            note="Real anonymized AWS CUR (~1.3K rows). Converted to FOCUS in-process via _convert_aws_cur."),
        ActualsSource(
            name="focus_converter_azure_ea", kind="provider_native", license="MIT", verdict="WIRED",
            repo="finopsfoundation/focus_converters", branch="dev",
            path="focus_converter_base/tests/provider_config_tests/azure/sample-anonymous-ea-export-dataset.csv",
            note="Real anonymized Azure EA export (~27 rows). Converted to FOCUS via _convert_azure_ea."),
        ActualsSource(
            name="focus_converter_oci", kind="provider_native", license="MIT", verdict="WIRED",
            repo="finopsfoundation/focus_converters", branch="dev",
            path="focus_converter_base/tests/provider_config_tests/oci/reports_cost-csv_0000000030000269.csv",
            note="Real Oracle Cloud cost report (~506 rows). Converted to FOCUS via _convert_oci."),
        # --- WIRED Kaggle/HuggingFace: real billing data, provider-native ---
        ActualsSource(
            name="kaggle_azure_subscription", kind="provider_native", license="CC0 1.0", verdict="WIRED",
            note="93K-row anonymized Azure subscription costs (2022-12, MeterCategory-level). "
                 "Kaggle carrucciu/azure-costs. Converted via _convert_azure_subscription."),
        ActualsSource(
            name="kaggle_gcp_billing", kind="provider_native", license="Apache 2.0", verdict="WIRED",
            note="124K-row GCP billing data (2022-2023, 25 services). HuggingFace sairamn/gcp-cloud-billing-cost. "
                 "Converted via _convert_gcp_billing."),
        ActualsSource(
            name="kaggle_azure_org_expenses", kind="provider_native", license="CC0 1.0", verdict="WIRED",
            note="89-row Azure org expenses (2023-2024, monthly by service). "
                 "Kaggle rishi2123/oragnizations-expenses-2023-2024. Converted via _convert_azure_org_expenses."),
        # --- CATALOG_ONLY: needs API key or manual setup ---
        ActualsSource(
            name="openai_usage_api", kind="api", license="provider data (your account)",
            verdict="CATALOG_ONLY",
            note="The real LLM-actuals source. blue_book/usage_openai.fetch_openai_usage needs OPENAI_ADMIN_KEY; "
                 "maps Usage API buckets -> priced CanonicalEvents -> FOCUS via focus.py."),
    )
}
+def _col(row: dict, name: str) -> str | None:
+    """FOCUS column lookup tolerant of case (exports vary)."""
+    if name in row:
+        return row[name]
+    low = name.lower()
+    for k in row:
+        if k.lower() == low:
+            return row[k]
+    return None
+def _num(v) -> float:
+    try:
+        return float(v)
+    except (TypeError, ValueError):
+        return 0.0
@dataclass
class Provenance:
    """Record of one download: where it came from, its digest and size, and when it was fetched.

    Attributes:
        name: Catalog name of the source.
        url: URL the bytes were downloaded from.
        sha256: Hex sha256 digest stamped on the download.
        rows: Row count recorded for the download.
        license: License of the source.
        fetched: ISO date of the fetch; defaults to today's date when the record is created.
    """
    name: str
    url: str
    sha256: str
    rows: int
    license: str
    fetched: str = field(default_factory=lambda: date.today().isoformat())
+def _latest_sha(repo: str, branch: str, path: str) -> str:
+    """Resolve the file's latest commit sha (pinned, reproducible download). '' if unavailable."""
+    out = subprocess.run(
+        ["gh", "api", f"repos/{repo}/commits", "-X", "GET",
+         "-f", f"sha={branch}", "-f", f"path={path}", "-f", "per_page=1"],
+        capture_output=True, text=True, check=False).stdout
+    try:
+        return json.loads(out)[0]["sha"]
+    except (json.JSONDecodeError, IndexError, KeyError):
+        return ""
def fetch_source(src: ActualsSource, root: Path = BRONZE) -> Provenance | None:
    """Download a WIRED focus_native source into bronze and stamp its provenance.

    The file is fetched from raw.githubusercontent.com at the latest commit sha that touched
    it (falling back to the branch name when the sha cannot be resolved). A gzip-compressed
    source (``*.gz``) is decompressed before being stored, so the bronze file is a plain
    ``.csv`` that the index builder's ``*.csv`` glob picks up and whose rows can be counted.

    Args:
        src: Catalog entry to fetch.
        root: Bronze root; the file lands in ``root/<src.name>/``.

    Returns:
        The Provenance written to ``root/<src.name>/provenance.json``, or None when nothing
        was fetched: the source is not a WIRED focus_native dataset, or the download failed
        (curl missing, non-zero exit, HTTP error, empty body, corrupt gzip). On failure no
        file is written, so existing bronze data is left untouched.

    The recorded sha256 is the digest of the bytes as downloaded (before decompression), so
    it can be checked against the upstream artifact. The row count is newline-based and
    therefore approximate for files with quoted multi-line cells.
    """
    import gzip  # function-scope imports: not provided by the module's import block
    import zlib

    if src.kind != "focus_native" or src.verdict != "WIRED":
        return None
    sha = _latest_sha(src.repo, src.branch, src.path) or src.branch
    url = f"https://raw.githubusercontent.com/{src.repo}/{sha}/{src.path}"
    try:
        # -f makes HTTP errors a non-zero exit instead of returning the error page as data.
        done = subprocess.run(["curl", "-sS", "-f", "-L", url], capture_output=True, check=False)
    except OSError:
        return None
    data = done.stdout
    if done.returncode != 0 or not data:
        return None
    file_name = Path(src.path).name
    payload = data
    if file_name.endswith(".gz"):
        try:
            payload = gzip.decompress(data)
        except (OSError, EOFError, zlib.error):
            return None
        file_name = file_name[: -len(".gz")]
    dest = root / src.name / file_name
    dest.parent.mkdir(parents=True, exist_ok=True)
    dest.write_bytes(payload)
    rows = max(0, payload.count(b"\n") - 1)  # minus the header line
    prov = Provenance(src.name, url, hashlib.sha256(data).hexdigest(), rows, src.license)
    (dest.parent / "provenance.json").write_text(json.dumps(prov.__dict__, indent=2))
    return prov
+# ---------------------------------------------------------------------------
+# Provider-native → FOCUS converters (lightweight, in-process)
+# ---------------------------------------------------------------------------
+# Each converter reads provider-native CSV rows and returns FOCUS-shaped dicts
+# with the columns aggregate_focus_rows needs: ServiceCategory, EffectiveCost,
+# BilledCost, ProviderName, ChargePeriodStart.  Minimal — just enough for the
+# cost-basis rollup, not a full FOCUS export.
+_AWS_PRODUCT_CATEGORY: dict[str, str] = {
+    "AmazonEC2": "Compute", "AWSLambda": "Compute", "AmazonECS": "Compute",
+    "AmazonEKS": "Compute", "ElasticMapReduce": "Compute", "AmazonLightsail": "Compute",
+    "AmazonS3": "Storage", "AmazonEFS": "Storage", "AmazonGlacier": "Storage",
+    "AmazonRDS": "Databases", "AmazonDynamoDB": "Databases", "AmazonRedshift": "Databases",
+    "AmazonElastiCache": "Databases", "AmazonNeptune": "Databases",
+    "AWSDataTransfer": "Networking", "AmazonVPC": "Networking", "AmazonCloudFront": "Networking",
+    "AmazonRoute53": "Networking", "ElasticLoadBalancing": "Networking",
+    "AmazonSageMaker": "AI and Machine Learning", "AmazonBedrock": "AI and Machine Learning",
+    "AmazonComprehend": "AI and Machine Learning", "AmazonRekognition": "AI and Machine Learning",
+    "AWSCloudTrail": "Management and Governance", "AmazonCloudWatch": "Management and Governance",
+    "AWSConfig": "Management and Governance", "AWSSystemsManager": "Management and Governance",
+    "awskms": "Security", "AWSSecretsManager": "Security", "AmazonGuardDuty": "Security",
+    "AmazonSNS": "Integration", "AWSQueueService": "Integration", "AmazonStates": "Integration",
+    "AWSGlue": "Analytics", "AmazonAthena": "Analytics", "AmazonKinesis": "Analytics",
+}
def _convert_aws_cur(rows: list[dict]) -> list[dict]:
    """Map AWS CUR line items onto the minimal FOCUS shape used by the cost rollup.

    EffectiveCost precedence: the savings-plan effective cost when non-zero, else the
    reservation effective cost when non-zero, else the unblended cost. Product codes missing
    from the category table are filed under "Other".
    """
    def _to_focus(line_item: dict) -> dict:
        unblended = _num(line_item.get("lineItem/UnblendedCost", 0))
        reserved = _num(line_item.get("reservation/EffectiveCost", ""))
        savings_plan = _num(line_item.get("savingsPlan/SavingsPlanEffectiveCost", ""))
        effective = savings_plan or reserved or unblended
        product_code = line_item.get("lineItem/ProductCode", "")
        return {
            "ServiceCategory": _AWS_PRODUCT_CATEGORY.get(product_code, "Other"),
            "EffectiveCost": str(effective),
            "BilledCost": str(_num(line_item.get("lineItem/BlendedCost", 0))),
            "ContractedCost": "0",
            "ListCost": str(_num(line_item.get("pricing/publicOnDemandCost", 0))),
            "ProviderName": "AWS",
            "ChargePeriodStart": line_item.get("lineItem/UsageStartDate", ""),
        }

    return [_to_focus(line_item) for line_item in rows]
+_AZURE_FAMILY_CATEGORY: dict[str, str] = {
+    "Compute": "Compute", "Networking": "Networking", "Storage": "Storage",
+    "Databases": "Databases", "Analytics": "Analytics",
+    "AI + Machine Learning": "AI and Machine Learning",
+    "Internet of Things": "Integration", "Integration": "Integration",
+    "Security": "Security", "Identity": "Identity",
+    "Management and Governance": "Management and Governance",
+    "Developer Tools": "Developer Tools", "Web": "Web",
+}
def _convert_azure_ea(rows: list[dict]) -> list[dict]:
    """Map Azure EA export rows onto the minimal FOCUS shape used by the cost rollup.

    CostInBillingCurrency feeds both EffectiveCost and BilledCost; ContractedCost is
    UnitPrice x Quantity and ListCost is PayGPrice x Quantity. Service families missing from
    the category table are filed under "Other".
    """
    def _to_focus(record: dict) -> dict:
        billed = _num(record.get("CostInBillingCurrency", 0))
        quantity = _num(record.get("Quantity", 0))
        contracted = _num(record.get("UnitPrice", 0)) * quantity
        listed = _num(record.get("PayGPrice", 0)) * quantity
        return {
            "ServiceCategory": _AZURE_FAMILY_CATEGORY.get(record.get("ServiceFamily", ""), "Other"),
            "EffectiveCost": str(billed),
            "BilledCost": str(billed),
            "ContractedCost": str(contracted),
            "ListCost": str(listed),
            "ProviderName": "Microsoft",
            "ChargePeriodStart": record.get("Date", ""),
        }

    return [_to_focus(record) for record in rows]
+_OCI_SERVICE_CATEGORY: dict[str, str] = {
+    "COMPUTE": "Compute", "CONTAINER": "Compute", "FUNCTIONS": "Compute",
+    "BLOCK_STORAGE": "Storage", "OBJECTSTORE": "Storage", "FILE_STORAGE": "Storage",
+    "DATABASE": "Databases", "MYSQL": "Databases", "NOSQL": "Databases",
+    "POSTGRESQL": "Databases", "AUTONOMOUS_DATABASE": "Databases",
+    "NETWORK": "Networking", "VCN_FLOW_LOGS": "Networking", "LOAD_BALANCER": "Networking",
+    "ORACLE_STREAMING_SERVICE": "Integration", "ORACLE_INTEGRATION": "Integration",
+    "TELEMETRY": "Management and Governance", "MONITORING": "Management and Governance",
+    "DATA_SCIENCE": "AI and Machine Learning", "AI_SERVICES": "AI and Machine Learning",
+}
def _convert_oci(rows: list[dict]) -> list[dict]:
    """Map OCI cost-csv rows onto the minimal FOCUS shape used by the cost rollup.

    Effective and billed cost are both cost/myCost + cost/myCostOverage; contracted and list
    cost are not available and are emitted as "0". Services missing from the category table
    are filed under "Other".
    """
    def _to_focus(record: dict) -> dict:
        charged = _num(record.get("cost/myCost", 0)) + _num(record.get("cost/myCostOverage", 0))
        return {
            "ServiceCategory": _OCI_SERVICE_CATEGORY.get(record.get("product/service", ""), "Other"),
            "EffectiveCost": str(charged),
            "BilledCost": str(charged),
            "ContractedCost": "0",
            "ListCost": "0",
            "ProviderName": "Oracle",
            "ChargePeriodStart": record.get("lineItem/intervalUsageStart", ""),
        }

    return [_to_focus(record) for record in rows]
+# --- Kaggle / HuggingFace provider-native converters ----------------------------------------
+_AZURE_METER_CATEGORY: dict[str, str] = {
+    "Virtual Machines": "Compute", "Container Instances": "Compute",
+    "Azure App Service": "Compute", "Functions": "Compute",
+    "Storage": "Storage", "Backup": "Storage",
+    "Azure Database for MySQL": "Databases", "SQL Database": "Databases",
+    "Azure Cosmos DB": "Databases", "Azure Database for PostgreSQL": "Databases",
+    "Virtual Network": "Networking", "Bandwidth": "Networking",
+    "Load Balancer": "Networking", "Azure Front Door Service": "Networking",
+    "Azure DNS": "Networking", "ExpressRoute": "Networking", "VPN Gateway": "Networking",
+    "Advanced Threat Protection": "Security", "Security Center": "Security",
+    "Advanced Data Security": "Security", "Azure Active Directory": "Identity",
+    "Log Analytics": "Management and Governance", "Azure Monitor": "Management and Governance",
+    "Logic Apps": "Integration", "Service Bus": "Integration", "Event Hubs": "Integration",
+    "Container Registry": "Compute", "Azure Kubernetes Service": "Compute",
+    "Azure Machine Learning": "AI and Machine Learning",
+    "Cognitive Services": "AI and Machine Learning",
+    "Azure Data Factory v2": "Analytics", "Azure Synapse Analytics": "Analytics",
+}
def _convert_azure_subscription(rows: list[dict]) -> list[dict]:
    """Map Azure subscription cost rows (MeterCategory-level) onto the minimal FOCUS shape.

    CostInBillingCurrency feeds both EffectiveCost and BilledCost; contracted and list cost
    are emitted as "0". Meter categories missing from the category table are filed under
    "Other".
    """
    def _to_focus(record: dict) -> dict:
        charged = str(_num(record.get("CostInBillingCurrency", 0)))
        return {
            "ServiceCategory": _AZURE_METER_CATEGORY.get(record.get("MeterCategory", ""), "Other"),
            "EffectiveCost": charged,
            "BilledCost": charged,
            "ContractedCost": "0",
            "ListCost": "0",
            "ProviderName": "Microsoft",
            "ChargePeriodStart": record.get("Date", ""),
        }

    return [_to_focus(record) for record in rows]
+_GCP_SERVICE_CATEGORY: dict[str, str] = {
+    "Compute Engine": "Compute", "Cloud Run": "Compute", "Cloud Functions": "Compute",
+    "App Engine": "Compute", "Google Kubernetes Engine": "Compute",
+    "Cloud Storage": "Storage",
+    "Cloud SQL": "Databases", "Cloud Spanner": "Databases", "Cloud Bigtable": "Databases",
+    "Cloud Memorystore": "Databases", "Firestore": "Databases",
+    "Cloud CDN": "Networking", "Cloud NAT": "Networking", "Cloud DNS": "Networking",
+    "Cloud Interconnect": "Networking", "Cloud VPN": "Networking",
+    "Cloud Armor": "Security", "Secret Manager": "Security",
+    "Cloud Pub/Sub": "Integration", "Cloud Tasks": "Integration",
+    "Cloud Scheduler": "Integration", "Cloud Composer": "Integration",
+    "BigQuery": "Analytics", "Cloud Dataproc": "Analytics", "Cloud Dataflow": "Analytics",
+    "Vertex AI": "AI and Machine Learning", "Dialogflow": "AI and Machine Learning",
+    "Cloud Vision API": "AI and Machine Learning", "Cloud Natural Language": "AI and Machine Learning",
+    "Cloud Build": "Developer Tools", "Artifact Registry": "Developer Tools",
+    "Cloud Logging": "Management and Governance", "Cloud Monitoring": "Management and Governance",
+}
def _convert_gcp_billing(rows: list[dict]) -> list[dict]:
    """Map GCP billing rows onto the minimal FOCUS shape used by the cost rollup.

    EffectiveCost comes from "Unrounded Cost ($)" and BilledCost from "Rounded Cost ($)";
    contracted and list cost are emitted as "0". Services missing from the category table are
    filed under "Other".
    """
    def _to_focus(record: dict) -> dict:
        unrounded = _num(record.get("Unrounded Cost ($)", 0))
        rounded = _num(record.get("Rounded Cost ($)", 0))
        return {
            "ServiceCategory": _GCP_SERVICE_CATEGORY.get(record.get("Service Name", ""), "Other"),
            "EffectiveCost": str(unrounded),
            "BilledCost": str(rounded),
            "ContractedCost": "0",
            "ListCost": "0",
            "ProviderName": "Google Cloud",
            "ChargePeriodStart": record.get("Usage Start Date", ""),
        }

    return [_to_focus(record) for record in rows]
+_AZURE_SERVICE_CATEGORY_SIMPLE: dict[str, str] = {
+    "Virtual Machines": "Compute", "Azure App Service": "Compute",
+    "Storage": "Storage", "Automation": "Management and Governance",
+    "Azure DNS": "Networking", "Bandwidth": "Networking",
+    "Azure Active Directory Domain Services": "Identity",
+    "Azure Monitor": "Management and Governance",
+}
def _convert_azure_org_expenses(rows: list[dict]) -> list[dict]:
    """Map simple Azure org-expense rows (ServiceName / CostUSD / UsageDate) onto the FOCUS shape.

    CostUSD feeds both EffectiveCost and BilledCost; contracted and list cost are emitted as
    "0". Service names missing from the category table are filed under "Other".
    """
    def _to_focus(record: dict) -> dict:
        charged = str(_num(record.get("CostUSD", 0)))
        service = record.get("ServiceName", "")
        return {
            "ServiceCategory": _AZURE_SERVICE_CATEGORY_SIMPLE.get(service, "Other"),
            "EffectiveCost": charged,
            "BilledCost": charged,
            "ContractedCost": "0",
            "ListCost": "0",
            "ProviderName": "Microsoft",
            "ChargePeriodStart": record.get("UsageDate", ""),
        }

    return [_to_focus(record) for record in rows]
# Source name -> converter that turns that source's provider-native rows into FOCUS-shaped
# rows. Sources without an entry are read as FOCUS directly.
_PROVIDER_CONVERTERS: dict[str, callable] = dict(
    focus_converter_aws_cur=_convert_aws_cur,
    focus_converter_azure_ea=_convert_azure_ea,
    focus_converter_oci=_convert_oci,
    kaggle_azure_subscription=_convert_azure_subscription,
    kaggle_gcp_billing=_convert_gcp_billing,
    kaggle_azure_org_expenses=_convert_azure_org_expenses,
)
def aggregate_focus_rows(rows: list[dict]) -> dict:
    """Roll FOCUS rows up into a cost summary.

    Returns a dict with:
      rows            - number of input rows
      totals          - sum of each cost column in _COST_COLS (empty when there are no rows)
      by_category     - per ServiceCategory: summed EffectiveCost ("effective") and row count
      by_provider     - per ProviderName: summed EffectiveCost
      months          - sorted YYYY-MM prefixes of ChargePeriodStart seen in the rows
      ai_ml_effective - EffectiveCost of the AI/ML category
      cloud_effective - total EffectiveCost minus the AI/ML share
    Rows lacking a category / provider are filed under "(uncategorized)" / "(unknown)".
    All amounts are rounded to 6 decimals.
    """
    cost_totals: dict[str, float] = defaultdict(float)
    category_stats: dict[str, dict] = defaultdict(lambda: defaultdict(float))
    provider_stats: dict[str, dict] = defaultdict(lambda: defaultdict(float))
    seen_months: set[str] = set()
    for record in rows:
        effective = _num(_col(record, "EffectiveCost"))
        for column in _COST_COLS:
            cost_totals[column] += _num(_col(record, column))
        bucket = category_stats[_col(record, "ServiceCategory") or "(uncategorized)"]
        bucket["effective"] += effective
        bucket["rows"] += 1
        provider_stats[_col(record, "ProviderName") or "(unknown)"]["effective"] += effective
        period = str(_col(record, "ChargePeriodStart") or "")[:7]
        if period:
            seen_months.add(period)
    ai_effective = category_stats.get(_AI_CATEGORY, {}).get("effective", 0.0)
    summary = {
        "rows": len(rows),
        "totals": {column: round(amount, 6) for column, amount in cost_totals.items()},
        "by_category": {
            category: {metric: round(amount, 6) for metric, amount in stats.items()}
            for category, stats in category_stats.items()
        },
        "by_provider": {
            provider: round(stats["effective"], 6) for provider, stats in provider_stats.items()
        },
        "months": sorted(seen_months),
        "ai_ml_effective": round(ai_effective, 6),
    }
    # Computed after "totals" so an empty input still reports totals == {}.
    summary["cloud_effective"] = round(cost_totals["EffectiveCost"] - ai_effective, 6)
    return summary
def build_actuals_index(root: Path = BRONZE, out: Path = INDEX) -> dict:
    """Aggregate every WIRED dataset found under `root` and write the actuals index to `out`.

    For each WIRED catalog entry that has at least one ``*.csv`` in ``root/<name>/``, all its
    CSVs are read (BOM-tolerant), passed through the source's converter when one is
    registered, and summarised with ``aggregate_focus_rows``. The index holds the generation
    date, the full catalog, the per-source summaries (with license, kind and any stored
    provenance) and a cross-source rollup. No network access happens here.

    Returns:
        The index dict that was written as JSON.
    """
    per_source: dict[str, dict] = {}
    category_rollup: dict[str, float] = defaultdict(float)
    total_effective = 0.0
    ai_effective = 0.0
    for source in SOURCES.values():
        if source.verdict != "WIRED":
            continue
        bronze_dir = root / source.name
        if not bronze_dir.exists():
            continue
        csv_files = sorted(bronze_dir.glob("*.csv"))
        if not csv_files:
            continue
        records: list[dict] = []
        for csv_file in csv_files:
            with csv_file.open(newline="", encoding="utf-8-sig") as handle:
                records.extend(csv.DictReader(handle))
        # Provider-native sources have a registered converter; FOCUS-native rows pass through.
        to_focus = _PROVIDER_CONVERTERS.get(source.name)
        focus_rows = records if to_focus is None else to_focus(records)
        summary = aggregate_focus_rows(focus_rows)
        provenance_path = bronze_dir / "provenance.json"
        if provenance_path.exists():
            provenance = json.loads(provenance_path.read_text())
        else:
            provenance = {}
        per_source[source.name] = {"license": source.license, "kind": source.kind,
                                   "provenance": provenance, **summary}
        for category, stats in summary["by_category"].items():
            category_rollup[category] += stats.get("effective", 0.0)
        total_effective += summary["totals"].get("EffectiveCost", 0.0)
        ai_effective += summary["ai_ml_effective"]
    catalog = {
        name: {"verdict": entry.verdict, "kind": entry.kind, "license": entry.license,
               "note": entry.note}
        for name, entry in SOURCES.items()
    }
    # Largest category first.
    ranked_categories = sorted(category_rollup.items(), key=lambda item: -item[1])
    index = {
        "generated": date.today().isoformat(),
        "catalog": catalog,
        "sources": per_source,
        "rollup": {
            "by_category": {category: round(amount, 6) for category, amount in ranked_categories},
            "total_effective": round(total_effective, 6),
            "ai_ml_effective": round(ai_effective, 6),
            "cloud_effective": round(total_effective - ai_effective, 6),
        },
    }
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(json.dumps(index, indent=2))
    return index
def _main(argv=None) -> int:
    """CLI entry: optionally fetch every source, rebuild the actuals index, print a summary.

    With --no-fetch the download step is skipped and the index is rebuilt from whatever is
    already in bronze. Returns 0, used as the process exit status.
    """
    import argparse
    parser = argparse.ArgumentParser(description="Build the historical ACTUALS corpus (real cloud + AI/ML spend).")
    parser.add_argument("--no-fetch", action="store_true", help="skip download; rebuild index from existing bronze")
    options = parser.parse_args(argv)
    if not options.no_fetch:
        for source in SOURCES.values():
            provenance = fetch_source(source)
            if not provenance:
                continue
            print(f"fetched {source.name}: {provenance.rows} rows, sha256={provenance.sha256[:12]}… ({source.license})")
    index = build_actuals_index()
    rollup = index["rollup"]
    print(f"\nactuals_index.json — {len(index['sources'])} source(s), ${rollup['total_effective']:,.2f} effective")
    print(f"  cloud infra: ${rollup['cloud_effective']:,.2f}   AI/ML: ${rollup['ai_ml_effective']:,.2f}")
    print("  top categories:")
    top_six = list(rollup["by_category"].items())[:6]
    for category, amount in top_six:
        print(f"    {category:<28} ${amount:,.2f}")
    return 0
if __name__ == "__main__":
    raise SystemExit(_main())

blue_book/allocation.py ADDED Viewed

@@ -0,0 +1,31 @@
+"""FinOps cost allocation / showback — attribute effective cost to an owner dimension.
+"Unallocated cost is unmanaged cost." Pure: groups priced events (CanonicalEvent objects OR the
+ledger's row dicts) by a tag — repo_slug / organization_slug / workflow_name / environment — so cost
+is shown back to the team/feature/tenant that incurred it. The ingest already captures these tags;
+this turns them into a showback table.
+"""
+from __future__ import annotations
+from collections import defaultdict
+def _get(event, key, default=None):
+    return event.get(key, default) if isinstance(event, dict) else getattr(event, key, default)
def allocate(events, dimension: str = "repo_slug") -> dict[str, float]:
    """Total `effective_cost_usd` per value of `dimension`, largest total first.

    Events may be dicts or objects. An event with no (or an empty) `dimension` value is
    counted under '(unallocated)'; a missing or empty cost counts as 0.
    """
    def _largest_first(entry):
        return -entry[1]

    by_owner: dict[str, float] = defaultdict(float)
    for event in events:
        owner = _get(event, dimension) or "(unallocated)"
        cost = _get(event, "effective_cost_usd", 0.0) or 0.0
        by_owner[owner] += float(cost)
    return dict(sorted(by_owner.items(), key=_largest_first))
def unallocated_fraction(events, dimension: str = "repo_slug") -> float:
    """Share of the total effective cost that carries no `dimension` tag (0.0 when the total is zero)."""
    by_owner = allocate(events, dimension)
    grand_total = sum(by_owner.values())
    if not grand_total:
        return 0.0
    return by_owner.get("(unallocated)", 0.0) / grand_total