PyPI - bobframes - Versions diffs - 0.1.0__py3-none-any.whl - Mend

bobframes 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (130) hide show

bobframes/lint.py ADDED Viewed

@@ -0,0 +1,114 @@
+"""Banned-token lint for HTML/markdown chrome.
+Catches LLM-filler vocabulary and label scaffolding that the previous
+iteration of the pipeline kept generating. Applies only to chrome around
+data tables, not to CSV cell contents.
+Run via `python -m bobframes.lint <file...>` or imported by run.py.
+"""
+from __future__ import annotations
+import re
+import sys
+from html.parser import HTMLParser
+# Table of (compiled pattern, label) pairs. A file is flagged wherever one of
+# the patterns matches; the label is what gets printed next to the hit.
+# Patterns compiled without re.I are case-sensitive.
+BANNED = [
+    (re.compile(r'[—–]'),                                           'em/en dash anywhere'),
+    (re.compile(r'[…]'),                                                 'ellipsis unicode'),
+    (re.compile(r'[“”‘’]'),                               'curly quote'),
+    (re.compile(r'[✓✅↑↓·×⏳→←⚠✨]'), 'decorative unicode'),
+    (re.compile(r'\bcaps\b'),                                                 'shorthand caps'),
+    (re.compile(r'\bcap\b(?![A-Za-z])'),                                      'shorthand cap'),
+    (re.compile(r'\b(comprehensive|leverage|robust|polished|sleek|seamless)\b', re.I), 'LLM filler vocabulary'),
+    (re.compile(r'\b(overview|insights?|breakdown of|deep dive|key findings)\b', re.I), 'report-prose noun'),
+    (re.compile(r'\b(this (report|chart|table|section) shows|as (you can )?see|as shown|the following|let us|we (can )?see|note that|please note|observe that)\b', re.I), 'reader-address phrase'),
+    (re.compile(r'\b(highlights?|takeaways?|notable|noteworthy|significant|interesting)\b', re.I), 'editorial verb'),
+    (re.compile(r'\b(in conclusion|to summarize|in summary|overall)\b', re.I), 'summary opener'),
+    (re.compile(r'\bN/A\b'),                                                  'NA filler'),
+    (re.compile(r'ranks remaining work', re.I),                               'LLM filler phrase'),
+    (re.compile(r'\*\*(What to do|Why this matters|Verify|Effort|Impact|Detail|Fix|Severity|Title):\*\*'), 'label scaffolding'),
+    (re.compile(r'\betc\.'),                                                  'filler etc.'),
+]
+class _HtmlTextExtractor(HTMLParser):
+    """Collects text outside <table>, <script>, <style> ranges.
+    Each entry is (lineno, text).
+    """
+    _SKIP_TAGS = {'table', 'script', 'style'}
+    def __init__(self) -> None:
+        super().__init__(convert_charrefs=True)
+        self._skip_depth: dict[str, int] = {t: 0 for t in self._SKIP_TAGS}
+        self.chunks: list[tuple[int, str]] = []
+    def handle_starttag(self, tag, attrs):
+        t = tag.lower()
+        if t in self._SKIP_TAGS:
+            self._skip_depth[t] += 1
+    def handle_endtag(self, tag):
+        t = tag.lower()
+        if t in self._SKIP_TAGS and self._skip_depth[t] > 0:
+            self._skip_depth[t] -= 1
+    def handle_data(self, data):
+        if any(v > 0 for v in self._skip_depth.values()):
+            return
+        if not data.strip():
+            return
+        line, _col = self.getpos()
+        self.chunks.append((line, data))
+def lint_html(path: str) -> list[tuple[int, str, str]]:
+    """Return list of (lineno, pattern_label, snippet) for any banned matches."""
+    with open(path, 'r', encoding='utf-8') as f:
+        body = f.read()
+    extractor = _HtmlTextExtractor()
+    extractor.feed(body)
+    hits: list[tuple[int, str, str]] = []
+    for lineno, text in extractor.chunks:
+        for rx, label in BANNED:
+            m = rx.search(text)
+            if m:
+                snippet = text.strip()[:80]
+                hits.append((lineno, label, snippet))
+    return hits
+def lint_markdown(path: str) -> list[tuple[int, str, str]]:
+    hits: list[tuple[int, str, str]] = []
+    with open(path, 'r', encoding='utf-8') as f:
+        for lineno, line in enumerate(f, start=1):
+            for rx, label in BANNED:
+                if rx.search(line):
+                    hits.append((lineno, label, line.rstrip()[:80]))
+    return hits
+def lint_file(path: str) -> list[tuple[int, str, str]]:
+    lower = path.lower()
+    if lower.endswith('.md'):
+        return lint_markdown(path)
+    return lint_html(path)
+def main(argv: list[str]) -> int:
+    if not argv:
+        print('usage: lint.py <file...>', file=sys.stderr)
+        return 2
+    total = 0
+    for path in argv:
+        hits = lint_file(path)
+        for lineno, label, snippet in hits:
+            print(f'{path}:{lineno}: [{label}] {snippet}', file=sys.stderr)
+            total += 1
+    return 2 if total else 0
+if __name__ == '__main__':
+    sys.exit(main(sys.argv[1:]))

bobframes/manifest.py ADDED Viewed

@@ -0,0 +1,127 @@
+"""Per-drop _manifest.json writer.
+Records schema version, build timestamp, per-capture replay status, row
+counts per table, and rotated-dir name (if a previous _analysis_out was
+rotated during this run).
+"""
+from __future__ import annotations
+import datetime as _dt
+import json
+import os
+import platform
+import subprocess
+from typing import Any
+from . import qrd_harness, rdcmd, schemas
+from ._version import __version__
+def now_iso() -> str:
+    """Single source of truth for timestamps: always UTC, second precision."""
+    return _dt.datetime.now(_dt.timezone.utc).replace(microsecond=0).isoformat()
+def _tool_version(path: str) -> str:
+    """Best-effort `<tool> --version` first line. Never raises."""
+    try:
+        p = subprocess.run([path, '--version'], capture_output=True, text=True, timeout=15)
+        out = (p.stdout or p.stderr or '').strip()
+        return out.splitlines()[0].strip() if out else 'unknown'
+    except Exception:
+        return 'unknown'
+def gather_tool_versions() -> dict[str, str]:
+    """Record renderdoccmd / qrenderdoc versions at ingest (G-6). Best-effort."""
+    versions: dict[str, str] = {}
+    try:
+        versions['renderdoccmd'] = _tool_version(rdcmd.find_renderdoccmd())
+    except Exception:
+        versions['renderdoccmd'] = 'unknown'
+    try:
+        versions['qrenderdoc'] = _tool_version(qrd_harness.find_qrenderdoc())
+    except Exception:
+        versions['qrenderdoc'] = 'unknown'
+    return versions
+def gather_host_info() -> dict[str, str]:
+    """Record GPU/driver/CPU/OS + bobframes version at ingest (G-7). Best-effort."""
+    gpu, driver = 'unknown', 'unknown'
+    try:
+        ps = subprocess.run(
+            ['powershell', '-NoProfile', '-Command',
+             'Get-CimInstance Win32_VideoController | '
+             'Select-Object -First 1 -Property Name,DriverVersion | ConvertTo-Json -Compress'],
+            capture_output=True, text=True, timeout=20,
+        )
+        if ps.returncode == 0 and ps.stdout.strip():
+            obj = json.loads(ps.stdout)
+            if isinstance(obj, list):
+                obj = obj[0] if obj else {}
+            gpu = (obj.get('Name') or 'unknown').strip()
+            driver = (obj.get('DriverVersion') or 'unknown').strip()
+    except Exception:
+        pass
+    return {
+        'gpu': gpu,
+        'gpu_driver': driver,
+        'cpu': platform.processor() or 'unknown',
+        'os': platform.platform(),
+        'bobframes': __version__,
+    }
+def build_manifest(
+    *,
+    area: str,
+    drop_date: str,
+    drop_label: str,
+    captures: list[str],
+    capture_status: dict[str, str],
+    row_counts: dict[str, int],
+    rotated_from: str | None,
+    build_timestamp: str | None = None,
+    tool_versions: dict[str, str] | None = None,
+    host_info: dict[str, str] | None = None,
+) -> dict[str, Any]:
+    return {
+        'schema_version': schemas.SCHEMA_VERSION,
+        'build_timestamp': build_timestamp or now_iso(),
+        'area': area,
+        'drop_date': drop_date,
+        'drop_label': drop_label,
+        'captures': sorted(captures, key=lambda s: (len(s), s)),
+        'capture_status': dict(capture_status),
+        'row_counts': dict(row_counts),
+        'tool_versions': dict(tool_versions or {}),
+        'host_info': dict(host_info or {}),
+        'rotated_from': rotated_from,
+    }
+def write_manifest(out_dir: str, manifest: dict[str, Any]) -> str:
+    """Atomically write _manifest.json (tmp + os.replace) so a crash mid-write
+    never leaves a partial file the catalog would silently skip (R-1)."""
+    path = os.path.join(out_dir, '_manifest.json')
+    tmp = path + '.tmp'
+    try:
+        with open(tmp, 'w', encoding='utf-8') as f:
+            json.dump(manifest, f, indent=2, sort_keys=False)
+            f.write('\n')
+        os.replace(tmp, path)
+    except BaseException:
+        try:
+            os.remove(tmp)
+        except OSError:
+            pass
+        raise
+    return path
+def read_manifest(out_dir: str) -> dict[str, Any]:
+    path = os.path.join(out_dir, '_manifest.json')
+    with open(path, 'r', encoding='utf-8') as f:
+        return json.load(f)

bobframes/parquetize.py ADDED Viewed

@@ -0,0 +1,282 @@
+"""Merge per-capture CSV fragments into drop-level CSV + Parquet pairs.
+For each table in schemas.TABLES:
+  1. Read every _stage/<capture>/<table>.csv that exists.
+  2. Verify the CSV header equals schemas.<TABLE>_COLS exactly (no drift).
+  3. Concatenate (preserving capture order).
+  4. Compute stable_key for entity tables.
+  5. Coerce dtypes via schemas.infer_dtype.
+  6. Write _analysis_out.tmp/<table>.parquet (snappy) and <table>.csv.
+Also copies non-tabular sidecars (shader_src/, histogram/, jsonl) from the
+stage tree into _analysis_out.tmp/.
+"""
+from __future__ import annotations
+import csv
+import json
+import os
+import shutil
+from typing import Iterable
+import pyarrow as pa
+import pyarrow.csv as pacsv
+import pyarrow.parquet as papq
+from . import schemas, stable_keys
+def _list_stage_dirs(stage_root: str) -> list[str]:
+    if not os.path.isdir(stage_root):
+        return []
+    names = []
+    for entry in os.listdir(stage_root):
+        full = os.path.join(stage_root, entry)
+        if os.path.isdir(full):
+            names.append(entry)
+    names.sort(key=lambda s: (len(s), s))
+    return names
+def _read_csv_compat(path: str, expected_cols: tuple[str, ...]) -> tuple[list[list[str]], list[int | None]]:
+    """Read CSV, return rows ordered into the expected_cols positions.
+    Any expected column missing from the CSV header is filled with empty
+    strings (post-merge derives populate them later). Extra columns in the
+    CSV are ignored. Reorders columns as needed to match expected order.
+    Returns (rows, position_map) where rows[i][j] is the value for
+    expected_cols[j]. position_map records which CSV column index maps to
+    each expected column (None if not present).
+    """
+    with open(path, 'r', encoding='utf-8', newline='') as f:
+        reader = csv.reader(f)
+        try:
+            header = next(reader)
+        except StopIteration:
+            return [], []
+        idx_for = {c: i for i, c in enumerate(header)}
+        pos_map: list[int | None] = [idx_for.get(c) for c in expected_cols]
+        out_rows: list[list[str]] = []
+        for raw in reader:
+            row: list[str] = []
+            for p in pos_map:
+                if p is None or p >= len(raw):
+                    row.append('')
+                else:
+                    row.append(raw[p])
+            out_rows.append(row)
+        return out_rows, pos_map
+def _cast_value(v: str, dtype: str):
+    if v == '' or v is None:
+        if dtype == 'int':   return 0
+        if dtype == 'float': return 0.0
+        if dtype == 'bool':  return False
+        return ''
+    try:
+        if dtype == 'int':
+            try: return int(v)
+            except (ValueError, TypeError): return int(float(v))
+        if dtype == 'float': return float(v)
+        if dtype == 'bool':  return v not in ('0', '', 'False', 'false')
+    except (ValueError, TypeError):
+        if dtype == 'int':   return 0
+        if dtype == 'float': return 0.0
+        if dtype == 'bool':  return False
+    return v
+def _as_int(v) -> int:
+    try:
+        return int(v) if v not in ('', None) else 0
+    except (ValueError, TypeError):
+        return 0
+def _apply_stable_key(table_stem: str, columns: dict[str, list]) -> None:
+    """For entity tables, fill the stable_key column from row content.
+    Called BEFORE dtype coercion; all column values are still strings here.
+    Numeric inputs are cast via _as_int.
+    """
+    n = len(next(iter(columns.values())))
+    if 'stable_key' not in columns:
+        return
+    keys: list[str] = ['' for _ in range(n)]
+    if table_stem == 'shaders':
+        for i in range(n):
+            keys[i] = (columns.get('src_hash') or [''] * n)[i] or ''
+    elif table_stem in ('textures', 'render_targets'):
+        for i in range(n):
+            keys[i] = stable_keys.texture_key(
+                (columns.get('label') or [''] * n)[i],
+                (columns.get('format') or [''] * n)[i],
+                _as_int((columns.get('width') or [''] * n)[i]),
+                _as_int((columns.get('height') or [''] * n)[i]),
+                _as_int((columns.get('depth') or [''] * n)[i]),
+                _as_int((columns.get('mip_levels') or [''] * n)[i]),
+                _as_int((columns.get('sample_count') or [''] * n)[i]),
+            )
+    elif table_stem == 'samplers':
+        for i in range(n):
+            keys[i] = stable_keys.sampler_key(
+                (columns.get('min_filter') or [''] * n)[i],
+                (columns.get('mag_filter') or [''] * n)[i],
+                (columns.get('wrap_s') or [''] * n)[i],
+                (columns.get('wrap_t') or [''] * n)[i],
+                (columns.get('wrap_r') or [''] * n)[i],
+                _as_int((columns.get('max_anisotropy') or [''] * n)[i]),
+                (columns.get('compare_mode') or [''] * n)[i],
+                (columns.get('compare_func') or [''] * n)[i],
+            )
+    elif table_stem == 'buffers':
+        for i in range(n):
+            tgts = (columns.get('target_history') or [''] * n)[i]
+            first_target = (tgts.split(';')[0] if tgts else '')
+            keys[i] = stable_keys.buffer_key(
+                (columns.get('usage_hint') or [''] * n)[i],
+                _as_int((columns.get('allocated_size_bytes') or [''] * n)[i]),
+                first_target,
+            )
+    elif table_stem == 'programs':
+        for i in range(n):
+            ids = (columns.get('attached_shader_ids') or [''] * n)[i]
+            id_list = [x for x in ids.split(';') if x] if ids else []
+            if id_list:
+                keys[i] = stable_keys.program_key(id_list)
+    elif table_stem == 'fbos':
+        for i in range(n):
+            rid = (columns.get('resource_id') or [''] * n)[i] or ''
+            keys[i] = stable_keys.fbo_key([rid]) if rid and rid != '0' else ''
+    columns['stable_key'] = keys
+def _build_table(table_stem: str, stage_root: str) -> tuple[pa.Table | None, int]:
+    """Return (pa.Table or None, row_count). None if no fragments existed."""
+    expected_cols = schemas.expected_columns(table_stem)
+    captures = _list_stage_dirs(stage_root)
+    columns: dict[str, list] = {c: [] for c in expected_cols}
+    found_any = False
+    for capture in captures:
+        path = os.path.join(stage_root, capture, f'{table_stem}.csv')
+        if not os.path.exists(path):
+            continue
+        found_any = True
+        rows, _pos = _read_csv_compat(path, expected_cols)
+        for row in rows:
+            for i, col in enumerate(expected_cols):
+                columns[col].append(row[i])
+    if not found_any:
+        return None, 0
+    n_rows = len(columns[expected_cols[0]])
+    if schemas.is_entity_table(table_stem):
+        _apply_stable_key(table_stem, columns)
+    arrays: dict[str, pa.Array] = {}
+    for col in expected_cols:
+        dtype = schemas.infer_dtype(col)
+        raw = columns[col]
+        if dtype == 'int':
+            arrays[col] = pa.array([_cast_value(v, 'int') for v in raw], type=pa.int64())
+        elif dtype == 'float':
+            arrays[col] = pa.array([_cast_value(v, 'float') for v in raw], type=pa.float64())
+        elif dtype == 'bool':
+            arrays[col] = pa.array([_cast_value(v, 'bool') for v in raw], type=pa.bool_())
+        else:
+            arrays[col] = pa.array(raw, type=pa.string())
+    return pa.table(arrays), n_rows
+def _write_pair(table: pa.Table, out_dir: str, name: str) -> None:
+    """Stage Parquet+CSV to .tmp, then atomically rename both. If either write
+    fails, roll back both tmps so a half-written pair is never committed (R-2)."""
+    pq_path = os.path.join(out_dir, f'{name}.parquet')
+    csv_path = os.path.join(out_dir, f'{name}.csv')
+    pq_tmp = pq_path + '.tmp'
+    csv_tmp = csv_path + '.tmp'
+    try:
+        papq.write_table(table, pq_tmp, compression='snappy')
+        pacsv.write_csv(table, csv_tmp)
+    except BaseException:
+        for t in (pq_tmp, csv_tmp):
+            try:
+                os.remove(t)
+            except OSError:
+                pass
+        raise
+    os.replace(pq_tmp, pq_path)
+    os.replace(csv_tmp, csv_path)
+def _copy_sidecars(stage_root: str, out_dir: str) -> None:
+    """Copy shader_src/, histogram/ and jsonl sidecars from stage to out."""
+    captures = _list_stage_dirs(stage_root)
+    shader_src_dst = os.path.join(out_dir, 'shader_src')
+    histogram_dst = os.path.join(out_dir, 'histogram')
+    os.makedirs(shader_src_dst, exist_ok=True)
+    os.makedirs(histogram_dst, exist_ok=True)
+    # jsonl merging across captures
+    fm_path = os.path.join(out_dir, 'frame_metadata.jsonl')
+    up_path = os.path.join(out_dir, 'uniforms_per_pass.jsonl')
+    fm_lines: list[str] = []
+    up_lines: list[str] = []
+    for capture in captures:
+        cap_dir = os.path.join(stage_root, capture)
+        src = os.path.join(cap_dir, 'shader_src')
+        if os.path.isdir(src):
+            for f in os.listdir(src):
+                shutil.copy2(os.path.join(src, f), os.path.join(shader_src_dst, f))
+        hist = os.path.join(cap_dir, 'histogram')
+        if os.path.isdir(hist):
+            for f in os.listdir(hist):
+                shutil.copy2(os.path.join(hist, f), os.path.join(histogram_dst, f))
+        fm = os.path.join(cap_dir, 'frame_metadata.json')
+        if os.path.exists(fm):
+            with open(fm, 'r', encoding='utf-8') as f:
+                obj = json.load(f)
+            fm_lines.append(json.dumps(obj))
+        up = os.path.join(cap_dir, 'uniforms_per_pass.jsonl')
+        if os.path.exists(up):
+            with open(up, 'r', encoding='utf-8') as f:
+                up_lines.append(f.read().rstrip('\n'))
+    if fm_lines:
+        with open(fm_path, 'w', encoding='utf-8') as f:
+            f.write('\n'.join(fm_lines) + '\n')
+    if up_lines:
+        with open(up_path, 'w', encoding='utf-8') as f:
+            f.write('\n'.join(up_lines) + '\n')
+def merge_drop(stage_root: str, out_dir: str) -> dict[str, int]:
+    """Merge all stage CSVs into out_dir as Parquet+CSV pairs. Returns row counts."""
+    os.makedirs(out_dir, exist_ok=True)
+    row_counts: dict[str, int] = {}
+    for table_stem in schemas.TABLES:
+        tbl, n_rows = _build_table(table_stem, stage_root)
+        if tbl is None:
+            row_counts[table_stem] = 0
+            continue
+        _write_pair(tbl, out_dir, table_stem)
+        row_counts[table_stem] = n_rows
+    _copy_sidecars(stage_root, out_dir)
+    return row_counts

bobframes/parsers/__init__.py ADDED Viewed

File without changes

bobframes/parsers/derive_program_transitions.py ADDED Viewed

@@ -0,0 +1,73 @@
+"""Derive program_transitions.parquet from draws.parquet.
+Walk draws in event_id order; emit (from_program_id, to_program_id, count)
+aggregated across the whole drop. Per (area, drop_date, drop_label, capture).
+"""
+from __future__ import annotations
+import os
+from collections import Counter
+import pyarrow as pa
+import pyarrow.parquet as papq
+import pyarrow.csv as pacsv
+from .. import schemas
+def derive(out_dir: str) -> int:
+    draws_pq = os.path.join(out_dir, 'draws.parquet')
+    if not os.path.exists(draws_pq):
+        return 0
+    t = papq.read_table(draws_pq, columns=list(schemas.ID_COLS) + ['event_id', 'program_id'])
+    n = t.num_rows
+    if n == 0:
+        return 0
+    # Group by (area, drop_date, drop_label, capture) and walk in event_id order.
+    cols = {c: t.column(c).to_pylist() for c in t.column_names}
+    groups: dict[tuple, list[tuple[int, int]]] = {}
+    for i in range(n):
+        key = (cols['area'][i], cols['drop_date'][i], cols['drop_label'][i], cols['capture'][i])
+        groups.setdefault(key, []).append((cols['event_id'][i], cols['program_id'][i]))
+    out_rows: dict[tuple, dict] = {}
+    for key, draws_for_capture in groups.items():
+        draws_for_capture.sort(key=lambda x: x[0])
+        prev = 0
+        counter: Counter = Counter()
+        for ev, pid in draws_for_capture:
+            if prev and pid and prev != pid:
+                counter[(prev, pid)] += 1
+            prev = pid
+        for (a, b), c in counter.items():
+            out_rows[(key, a, b)] = {
+                'area': key[0], 'drop_date': key[1], 'drop_label': key[2], 'capture': key[3],
+                'from_program_id': a, 'to_program_id': b, 'count': c,
+            }
+    cols_out = list(schemas.PROG_TRANS_COLS)
+    arrays: dict[str, pa.Array] = {}
+    for c in cols_out:
+        vs = [r[c] for r in out_rows.values()]
+        dt = schemas.infer_dtype(c)
+        if dt == 'int':
+            arrays[c] = pa.array(vs, type=pa.int64())
+        else:
+            arrays[c] = pa.array(vs, type=pa.string())
+    table = pa.table(arrays)
+    papq.write_table(table, os.path.join(out_dir, 'program_transitions.parquet'),
+                     compression='snappy')
+    pacsv.write_csv(table, os.path.join(out_dir, 'program_transitions.csv'))
+    return table.num_rows
+if __name__ == '__main__':
+    import sys
+    if len(sys.argv) != 2:
+        print('usage: derive_program_transitions.py <out_dir>', file=sys.stderr)
+        sys.exit(2)
+    print(f'wrote {derive(sys.argv[1])} program_transitions rows')