PyPI - sotellme - Versions diffs - 0.1.0__tar.gz - Mend

sotellme 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (108) hide show

sotellme-0.1.0/.gitignore +49 -0
sotellme-0.1.0/.python-version +1 -0
sotellme-0.1.0/LICENSE +21 -0
sotellme-0.1.0/PKG-INFO +217 -0
sotellme-0.1.0/README.md +180 -0
sotellme-0.1.0/evals/assessor_cases.json +119 -0
sotellme-0.1.0/evals/coach_cases.json +80 -0
sotellme-0.1.0/evals/director_cases.json +585 -0
sotellme-0.1.0/evals/grader_cases.json +292 -0
sotellme-0.1.0/evals/guardrail_cases.json +107 -0
sotellme-0.1.0/evals/interviewer_cases.json +218 -0
sotellme-0.1.0/evals/personas/junior-rambling.json +11 -0
sotellme-0.1.0/evals/personas/junior-thin.json +11 -0
sotellme-0.1.0/evals/personas/mid-blurred-ownership.json +11 -0
sotellme-0.1.0/evals/personas/mid-offtopic.json +11 -0
sotellme-0.1.0/evals/personas/senior-bluffer.json +11 -0
sotellme-0.1.0/evals/personas/senior-strong.json +12 -0
sotellme-0.1.0/evals/personas/staff-injection.json +11 -0
sotellme-0.1.0/evals/personas/staff-terse.json +11 -0
sotellme-0.1.0/evals/profile_parser_cases.json +41 -0
sotellme-0.1.0/evals/role_context_cases.json +48 -0
sotellme-0.1.0/pyproject.toml +84 -0
sotellme-0.1.0/scripts/evals.py +89 -0
sotellme-0.1.0/scripts/prepare_package.py +31 -0
sotellme-0.1.0/scripts/release_version.py +59 -0
sotellme-0.1.0/scripts/simulate.py +135 -0
sotellme-0.1.0/scripts/smoke_session.py +178 -0
sotellme-0.1.0/src/sotellme/__init__.py +1 -0
sotellme-0.1.0/src/sotellme/__main__.py +5 -0
sotellme-0.1.0/src/sotellme/assessor.py +69 -0
sotellme-0.1.0/src/sotellme/budget.py +94 -0
sotellme-0.1.0/src/sotellme/caching.py +27 -0
sotellme-0.1.0/src/sotellme/catalog.py +71 -0
sotellme-0.1.0/src/sotellme/cli.py +531 -0
sotellme-0.1.0/src/sotellme/coach.py +114 -0
sotellme-0.1.0/src/sotellme/config.py +122 -0
sotellme-0.1.0/src/sotellme/coverage.py +22 -0
sotellme-0.1.0/src/sotellme/director.py +146 -0
sotellme-0.1.0/src/sotellme/engine.py +463 -0
sotellme-0.1.0/src/sotellme/eval_datasets.py +473 -0
sotellme-0.1.0/src/sotellme/extraction.py +42 -0
sotellme-0.1.0/src/sotellme/fetch.py +229 -0
sotellme-0.1.0/src/sotellme/grader.py +100 -0
sotellme-0.1.0/src/sotellme/guardrail.py +66 -0
sotellme-0.1.0/src/sotellme/interviewer.py +101 -0
sotellme-0.1.0/src/sotellme/judge.py +120 -0
sotellme-0.1.0/src/sotellme/models.toml +68 -0
sotellme-0.1.0/src/sotellme/personas.py +46 -0
sotellme-0.1.0/src/sotellme/posting.py +30 -0
sotellme-0.1.0/src/sotellme/pricing.py +147 -0
sotellme-0.1.0/src/sotellme/profile.py +61 -0
sotellme-0.1.0/src/sotellme/prompts.py +973 -0
sotellme-0.1.0/src/sotellme/py.typed +0 -0
sotellme-0.1.0/src/sotellme/report.py +55 -0
sotellme-0.1.0/src/sotellme/research.py +59 -0
sotellme-0.1.0/src/sotellme/role.py +102 -0
sotellme-0.1.0/src/sotellme/sim_datasets.py +146 -0
sotellme-0.1.0/src/sotellme/simulation.py +413 -0
sotellme-0.1.0/src/sotellme/simulator.py +36 -0
sotellme-0.1.0/src/sotellme/tracing.py +21 -0
sotellme-0.1.0/src/sotellme/voice.py +33 -0
sotellme-0.1.0/src/sotellme/web.py +492 -0
sotellme-0.1.0/tests/fixtures/synthetic_cv.md +32 -0
sotellme-0.1.0/tests/fixtures/synthetic_cv.pdf +63 -0
sotellme-0.1.0/tests/fixtures/synthetic_cv.txt +30 -0
sotellme-0.1.0/tests/pdf_fixture.py +34 -0
sotellme-0.1.0/tests/stubs.py +88 -0
sotellme-0.1.0/tests/test_assessor.py +52 -0
sotellme-0.1.0/tests/test_budget.py +144 -0
sotellme-0.1.0/tests/test_caching.py +157 -0
sotellme-0.1.0/tests/test_catalog.py +91 -0
sotellme-0.1.0/tests/test_cli.py +369 -0
sotellme-0.1.0/tests/test_coach.py +90 -0
sotellme-0.1.0/tests/test_config.py +180 -0
sotellme-0.1.0/tests/test_director.py +126 -0
sotellme-0.1.0/tests/test_engine.py +819 -0
sotellme-0.1.0/tests/test_envelope.py +53 -0
sotellme-0.1.0/tests/test_eval_datasets.py +299 -0
sotellme-0.1.0/tests/test_extraction.py +59 -0
sotellme-0.1.0/tests/test_fetch.py +245 -0
sotellme-0.1.0/tests/test_firewall.py +141 -0
sotellme-0.1.0/tests/test_grader.py +137 -0
sotellme-0.1.0/tests/test_guardrail.py +76 -0
sotellme-0.1.0/tests/test_guardrail_evals.py +37 -0
sotellme-0.1.0/tests/test_injection.py +66 -0
sotellme-0.1.0/tests/test_interviewer.py +151 -0
sotellme-0.1.0/tests/test_judge.py +120 -0
sotellme-0.1.0/tests/test_level_access.py +62 -0
sotellme-0.1.0/tests/test_package.py +5 -0
sotellme-0.1.0/tests/test_personas.py +98 -0
sotellme-0.1.0/tests/test_posting.py +55 -0
sotellme-0.1.0/tests/test_pricing.py +176 -0
sotellme-0.1.0/tests/test_profile.py +84 -0
sotellme-0.1.0/tests/test_prompts.py +463 -0
sotellme-0.1.0/tests/test_release_version.py +68 -0
sotellme-0.1.0/tests/test_report.py +118 -0
sotellme-0.1.0/tests/test_research.py +102 -0
sotellme-0.1.0/tests/test_restart.py +195 -0
sotellme-0.1.0/tests/test_role.py +116 -0
sotellme-0.1.0/tests/test_secret_isolation.py +201 -0
sotellme-0.1.0/tests/test_sim_datasets.py +37 -0
sotellme-0.1.0/tests/test_simulation.py +363 -0
sotellme-0.1.0/tests/test_simulator.py +83 -0
sotellme-0.1.0/tests/test_tracing.py +24 -0
sotellme-0.1.0/tests/test_voice.py +40 -0
sotellme-0.1.0/tests/test_web.py +244 -0
sotellme-0.1.0/tests/voice.py +3 -0
sotellme-0.1.0/uv.lock +2722 -0

sotellme-0.1.0/.gitignore ADDED Viewed

@@ -0,0 +1,49 @@
+# Knowledge base — licensed course material, prompt-distillation source (never committed).
+/how-to-answer/
+/how-to-interview/
+.claude
+CLAUDE.md
+# Working docs
+plans
+scratch/
+issues/
+learn/
+# Secrets & environment
+.env
+.env.*
+!.env.example
+*.local
+# Langfuse (self-hosted) data
+langfuse/
+*.db
+# Package build files staged from the repo root (see backend/scripts/prepare_package.py)
+backend/README.md
+backend/LICENSE
+# Python
+__pycache__/
+*.py[cod]
+.venv/
+.mypy_cache/
+.pytest_cache/
+.ruff_cache/
+dist/
+# Local data (real CVs, session artifacts — PII, never committed)
+data/
+# Generated session reports (may carry real-CV PII) and simulated-eval session artifacts
+sotellme-report-*.md
+backend/evals/sessions/
+# Feature-review scaffolding (ephemeral, branch-scoped)
+reviews/
+# OS / editor
+.DS_Store

sotellme-0.1.0/.python-version ADDED Viewed

@@ -0,0 +1 @@
+3.12

sotellme-0.1.0/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 Srdjan Coric
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

sotellme-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,217 @@
+Metadata-Version: 2.4
+Name: sotellme
+Version: 0.1.0
+Summary: A behavioral-interview simulator and coach that runs in your terminal.
+Project-URL: Homepage, https://github.com/SrdjanCoric/sotellme
+Project-URL: Repository, https://github.com/SrdjanCoric/sotellme
+Project-URL: Issues, https://github.com/SrdjanCoric/sotellme/issues
+Author-email: Srdjan Coric <srdjan.coric1984@gmail.com>
+License-Expression: MIT
+License-File: LICENSE
+Keywords: behavioral,cli,coaching,interview,langgraph,llm
+Classifier: Development Status :: 4 - Beta
+Classifier: Environment :: Console
+Classifier: Intended Audience :: End Users/Desktop
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Education
+Classifier: Topic :: Utilities
+Requires-Python: >=3.12
+Requires-Dist: httpx>=0.27
+Requires-Dist: langchain-anthropic>=1.4.5
+Requires-Dist: langchain-google-genai>=4.2.5
+Requires-Dist: langchain-openai>=1.3.0
+Requires-Dist: langchain>=1.3.8
+Requires-Dist: langgraph-checkpoint-sqlite>=3.1.0
+Requires-Dist: langgraph>=1.2.4
+Requires-Dist: prompt-toolkit>=3.0
+Requires-Dist: pydantic>=2.13.4
+Requires-Dist: pypdf>=6.13.2
+Requires-Dist: rich>=15.0.0
+Provides-Extra: tracing
+Requires-Dist: langfuse>=3.0; extra == 'tracing'
+Provides-Extra: web
+Requires-Dist: streamlit>=1.40; extra == 'web'
+Description-Content-Type: text/markdown
+# sotellme
+A mock behavioral interviewer that runs in your terminal, built from your CV and the
+job you're actually chasing.
+## Why I built it
+I built this because behavioral interviews are where good candidates trip up. The
+questions sound easy, so most people wing them, and the usual prep ("tell me about a
+time you failed") is too generic to help with the specific job in front of you.
+sotellme makes the practice specific. You give it your CV and a job posting, it reads
+up on the company, and then it interviews you against all three at once, so when it
+asks why you want the role it can name the product you'd actually be building. At the
+end it grades every answer and walks you through the weak ones: what went wrong, and
+what to say instead.
+## What it does
+You give it your CV and the job you're chasing, and it interviews you against both, plus
+a short brief it builds on the company from a handful of public pages. It runs the
+session the way a real interviewer would: it opens on who you are, digs into your biggest
+piece of work, picks the stories that fit the role, and chases the interesting thread in
+your last answer rather than marching through a checklist. Most sessions run 8 to 14
+questions. When you're done, the smart model reads the whole transcript and scores every
+answer on STAR structure, specificity, and ownership against your target level, then
+writes you a Markdown report: a scorecard that names what's weak, a fix for each soft
+answer, and a short study plan. It also tells you what the run cost.
+## Quickstart
+Set one provider key first (see [Configuration](#configuration)):
+```sh
+export ANTHROPIC_API_KEY=...   # or GOOGLE_API_KEY, or OPENAI_API_KEY
+```
+The easiest way in is the local web app. Pull in the web extra and launch it:
+```sh
+uvx --from "sotellme[web]" sotellme web
+```
+It opens in your browser: upload your CV, paste a posting or drop in a link, run the
+interview as a chat, and read the report on the page, with a button to save it as
+Markdown. Everything runs locally on your own key.
+If you'd rather stay in the terminal, run the interview straight from
+[`uvx`](https://docs.astral.sh/uv/), no clone needed:
+```sh
+uvx sotellme interview --cv path/to/cv.pdf --job https://jobs.example.com/senior-backend
+```
+`--job` takes a link, a file (PDF, markdown, or text), or pasted posting text, and it's
+optional; without it the interview runs on a default competency set with no company
+research to ground it. For a link the tool prefers the page's embedded `JobPosting` data
+and falls back to the visible text, and Workable postings are read through their public
+API. Pages that only render with JavaScript can't be read, but pasting the text always
+works.
+Answers are multi-line with real line editing (Home, End, arrow keys, word jumps). Enter
+starts a new line; Esc then Enter sends, or put `/done` on its own line.
+### Commands
+| Command | What it does |
+| --- | --- |
+| `sotellme interview --cv <path> [--job <link\|file\|text>]` | Start a new interview session. |
+| `sotellme resume` | Pick up the latest interrupted session. |
+| `sotellme reports` | List the coaching reports in this directory, newest first. |
+| `sotellme grade <transcript.json> --level <junior\|mid\|senior\|staff>` | Grade a transcript you already have (a JSON list of `{question, answer}` pairs) without running a live interview. |
+| `sotellme web` | Launch the local web UI in your browser (needs the `web` extra). |
+`interview`, `resume`, and `grade` also take `--provider`, `--fast-model`, and
+`--smart-model` to override the model picks.
+## Privacy and limits
+Your transcripts and session state stay on your machine. The only things that leave it
+are API calls to whichever provider you picked, plus plain HTTP GETs to public pages: one
+for a `--job` link, and up to six more for the company brief. Those fetches are capped per
+session, truncated per page, and refused for localhost and private addresses. Your API key
+is read only by the code that calls the provider and never goes into a prompt, so no
+hostile page or posting can talk the model into leaking it (`tests/test_fetch.py`,
+`tests/test_secret_isolation.py`, `tests/test_injection.py`).
+A cap on questions, a guaranteed closing question, a ceiling on web fetches, and a token
+budget that ends a long session early are all plain code, and they're unit-tested. The
+tool also screens what you type before it reaches the interview, so going off-topic nudges
+you back and a second off-topic reply in a row wraps the session up. Either way the real
+answers you gave still get graded.
+## Configuration
+There's no account and no server. Pick a provider with `SOTELLME_PROVIDER` (or
+`--provider`, or the dropdown in the web app) and set its key:
+| Provider       | Key variable        | Default models (fast / smart)             |
+| -------------- | ------------------- | ----------------------------------------- |
+| `google_genai` | `GOOGLE_API_KEY`    | gemini-3.5-flash / gemini-3.1-pro-preview |
+| `anthropic`    | `ANTHROPIC_API_KEY` | claude-sonnet-4-6 / claude-opus-4-8       |
+| `openai`       | `OPENAI_API_KEY`    | gpt-5.4-mini / gpt-5.5                    |
+The fast slot runs the interview side (CV parser, company researcher, answer assessor,
+interviewer); the smart slot runs the director that makes every probe-or-move-on call,
+plus the end-of-session grader and coach. In the CLI you set those two slots with
+`SOTELLME_FAST_MODEL` / `SOTELLME_SMART_MODEL` or the matching flags. The web app goes
+finer: its Advanced section pins a model to each step on its own, so you can put a cheap
+one on the company research and a stronger one on the questions and the grading, and mix
+providers once you've set more than one key. The eval suites run against `google_genai`
+with an `anthropic` judge, which is the combo I'd reach for.
+The CLI and the web app draw their choices from the same catalog, which ships the per-provider defaults in
+the table above. To change what's on offer, write a `~/.sotellme/models.toml` listing the
+models you want and the default for each provider, and that's what the web app's dropdowns
+show. The file holds model names plus the per-model prices behind the cost
+estimates (including the reduced rate for cached input), so you can correct a rate that's
+drifted; your API keys stay in the environment.
+The session has a token budget, 400,000 by default, that ends the interview early if a run
+goes long and keeps a reserved share back to grade and coach what you gave. Change it with
+`SOTELLME_TOKEN_BUDGET`.
+## Development
+Requires Python 3.12+, managed with `uv`. The package takes its long description from the
+repo's `README.md`, so stage that and the license into `backend/` once before the first
+sync:
+```sh
+cd backend
+python3 scripts/prepare_package.py
+uv sync
+uv run ruff check . && uv run mypy && uv run pytest
+```
+The deterministic suite runs without any API keys, and it's the whole CI gate.
+The judgment agents (grader, coach, assessor, role builder, profile parser) are tuned
+separately in Langfuse. Stand up a local instance, export `LANGFUSE_PUBLIC_KEY`,
+`LANGFUSE_SECRET_KEY`, and `LANGFUSE_HOST`, then sync the committed cases and run one
+agent over its dataset:
+```sh
+uv sync --extra tracing
+uv run python scripts/evals.py upload
+uv run python scripts/evals.py run grader --limit 2   # small calibration run first
+uv run python scripts/evals.py run grader
+```
+Each run lands in Langfuse with a deterministic score per case, so you can read the
+outputs, edit a prompt, run it again, and compare the two runs side by side. It also
+prints the run's token count and estimated cost per model, priced from `models.toml`, so
+you can size a full run from a `--limit` sample before committing to it. Only the
+synthetic `evals/*.json` cases ever go in, and Langfuse stays off unless its env vars are
+set, for evals and for live-session tracing alike.
+The questions the system asks get their own eval. `scripts/simulate.py` runs a full
+interview against a synthetic candidate: the real interviewer and director loop ask, while
+a candidate-simulator answers in character from a persona under `evals/personas/`. The
+personas span every level from junior to staff and a mix of answering styles: complete
+STAR stories, thin answers, blurred ownership, off-topic drift, confident bluffing, and
+injection attempts, so a run also exercises the guardrail and how the loop recovers. An
+LLM judge on the smart slot scores each question on relevance, whether it probes the
+flagged gap, level-appropriateness, whether it leads the candidate, and follow-up
+discipline, plus a coverage verdict for the session.
+```sh
+uv run python scripts/simulate.py upload
+uv run python scripts/simulate.py run --persona senior-strong --persona junior-thin
+uv run python scripts/simulate.py run
+```
+Before a run it estimates the cost across the chosen personas and the judge passes and
+asks for confirmation first on anything over $3.50; pass `--yes` to skip the prompt in a script. Each
+persona is a Langfuse dataset item tagged with its level and answer mix, so the
+question-quality scores compare run to run and slice by both, and the session transcripts
+land under `evals/sessions/`. The personas are synthetic, under the same PII rule as everything
+else.

sotellme-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,180 @@
+# sotellme
+A mock behavioral interviewer that runs in your terminal, built from your CV and the
+job you're actually chasing.
+## Why I built it
+I built this because behavioral interviews are where good candidates trip up. The
+questions sound easy, so most people wing them, and the usual prep ("tell me about a
+time you failed") is too generic to help with the specific job in front of you.
+sotellme makes the practice specific. You give it your CV and a job posting, it reads
+up on the company, and then it interviews you against all three at once, so when it
+asks why you want the role it can name the product you'd actually be building. At the
+end it grades every answer and walks you through the weak ones: what went wrong, and
+what to say instead.
+## What it does
+You give it your CV and the job you're chasing, and it interviews you against both, plus
+a short brief it builds on the company from a handful of public pages. It runs the
+session the way a real interviewer would: it opens on who you are, digs into your biggest
+piece of work, picks the stories that fit the role, and chases the interesting thread in
+your last answer rather than marching through a checklist. Most sessions run 8 to 14
+questions. When you're done, the smart model reads the whole transcript and scores every
+answer on STAR structure, specificity, and ownership against your target level, then
+writes you a Markdown report: a scorecard that names what's weak, a fix for each soft
+answer, and a short study plan. It also tells you what the run cost.
+## Quickstart
+Set one provider key first (see [Configuration](#configuration)):
+```sh
+export ANTHROPIC_API_KEY=...   # or GOOGLE_API_KEY, or OPENAI_API_KEY
+```
+The easiest way in is the local web app. Pull in the web extra and launch it:
+```sh
+uvx --from "sotellme[web]" sotellme web
+```
+It opens in your browser: upload your CV, paste a posting or drop in a link, run the
+interview as a chat, and read the report on the page, with a button to save it as
+Markdown. Everything runs locally on your own key.
+If you'd rather stay in the terminal, run the interview straight from
+[`uvx`](https://docs.astral.sh/uv/), no clone needed:
+```sh
+uvx sotellme interview --cv path/to/cv.pdf --job https://jobs.example.com/senior-backend
+```
+`--job` takes a link, a file (PDF, markdown, or text), or pasted posting text, and it's
+optional; without it the interview runs on a default competency set with no company
+research to ground it. For a link the tool prefers the page's embedded `JobPosting` data
+and falls back to the visible text, and Workable postings are read through their public
+API. Pages that only render with JavaScript can't be read, but pasting the text always
+works.
+Answers are multi-line with real line editing (Home, End, arrow keys, word jumps). Enter
+starts a new line; Esc then Enter sends, or put `/done` on its own line.
+### Commands
+| Command | What it does |
+| --- | --- |
+| `sotellme interview --cv <path> [--job <link\|file\|text>]` | Start a new interview session. |
+| `sotellme resume` | Pick up the latest interrupted session. |
+| `sotellme reports` | List the coaching reports in this directory, newest first. |
+| `sotellme grade <transcript.json> --level <junior\|mid\|senior\|staff>` | Grade a transcript you already have (a JSON list of `{question, answer}` pairs) without running a live interview. |
+| `sotellme web` | Launch the local web UI in your browser (needs the `web` extra). |
+`interview`, `resume`, and `grade` also take `--provider`, `--fast-model`, and
+`--smart-model` to override the model picks.
+## Privacy and limits
+Your transcripts and session state stay on your machine. The only things that leave it
+are API calls to whichever provider you picked, plus plain HTTP GETs to public pages: one
+for a `--job` link, and up to six more for the company brief. Those fetches are capped per
+session, truncated per page, and refused for localhost and private addresses. Your API key
+is read only by the code that calls the provider and never goes into a prompt, so no
+hostile page or posting can talk the model into leaking it (`tests/test_fetch.py`,
+`tests/test_secret_isolation.py`, `tests/test_injection.py`).
+A cap on questions, a guaranteed closing question, a ceiling on web fetches, and a token
+budget that ends a long session early are all plain code, and they're unit-tested. The
+tool also screens what you type before it reaches the interview, so going off-topic nudges
+you back and a second off-topic reply in a row wraps the session up. Either way the real
+answers you gave still get graded.
+## Configuration
+There's no account and no server. Pick a provider with `SOTELLME_PROVIDER` (or
+`--provider`, or the dropdown in the web app) and set its key:
+| Provider       | Key variable        | Default models (fast / smart)             |
+| -------------- | ------------------- | ----------------------------------------- |
+| `google_genai` | `GOOGLE_API_KEY`    | gemini-3.5-flash / gemini-3.1-pro-preview |
+| `anthropic`    | `ANTHROPIC_API_KEY` | claude-sonnet-4-6 / claude-opus-4-8       |
+| `openai`       | `OPENAI_API_KEY`    | gpt-5.4-mini / gpt-5.5                    |
+The fast slot runs the interview side (CV parser, company researcher, answer assessor,
+interviewer); the smart slot runs the director that makes every probe-or-move-on call,
+plus the end-of-session grader and coach. In the CLI you set those two slots with
+`SOTELLME_FAST_MODEL` / `SOTELLME_SMART_MODEL` or the matching flags. The web app goes
+finer: its Advanced section pins a model to each step on its own, so you can put a cheap
+one on the company research and a stronger one on the questions and the grading, and mix
+providers once you've set more than one key. The eval suites run against `google_genai`
+with an `anthropic` judge, which is the combo I'd reach for.
+The CLI and the web app draw their choices from the same catalog, which ships the per-provider defaults in
+the table above. To change what's on offer, write a `~/.sotellme/models.toml` listing the
+models you want and the default for each provider, and that's what the web app's dropdowns
+show. The file holds model names plus the per-model prices behind the cost
+estimates (including the reduced rate for cached input), so you can correct a rate that's
+drifted; your API keys stay in the environment.
+The session has a token budget, 400,000 by default, that ends the interview early if a run
+goes long and keeps a reserved share back to grade and coach what you gave. Change it with
+`SOTELLME_TOKEN_BUDGET`.
+## Development
+Requires Python 3.12+, managed with `uv`. The package takes its long description from the
+repo's `README.md`, so stage that and the license into `backend/` once before the first
+sync:
+```sh
+cd backend
+python3 scripts/prepare_package.py
+uv sync
+uv run ruff check . && uv run mypy && uv run pytest
+```
+The deterministic suite runs without any API keys, and it's the whole CI gate.
+The judgment agents (grader, coach, assessor, role builder, profile parser) are tuned
+separately in Langfuse. Stand up a local instance, export `LANGFUSE_PUBLIC_KEY`,
+`LANGFUSE_SECRET_KEY`, and `LANGFUSE_HOST`, then sync the committed cases and run one
+agent over its dataset:
+```sh
+uv sync --extra tracing
+uv run python scripts/evals.py upload
+uv run python scripts/evals.py run grader --limit 2   # small calibration run first
+uv run python scripts/evals.py run grader
+```
+Each run lands in Langfuse with a deterministic score per case, so you can read the
+outputs, edit a prompt, run it again, and compare the two runs side by side. It also
+prints the run's token count and estimated cost per model, priced from `models.toml`, so
+you can size a full run from a `--limit` sample before committing to it. Only the
+synthetic `evals/*.json` cases ever go in, and Langfuse stays off unless its env vars are
+set, for evals and for live-session tracing alike.
+The questions the system asks get their own eval. `scripts/simulate.py` runs a full
+interview against a synthetic candidate: the real interviewer and director loop ask, while
+a candidate-simulator answers in character from a persona under `evals/personas/`. The
+personas span every level from junior to staff and a mix of answering styles: complete
+STAR stories, thin answers, blurred ownership, off-topic drift, confident bluffing, and
+injection attempts, so a run also exercises the guardrail and how the loop recovers. An
+LLM judge on the smart slot scores each question on relevance, whether it probes the
+flagged gap, level-appropriateness, whether it leads the candidate, and follow-up
+discipline, plus a coverage verdict for the session.
+```sh
+uv run python scripts/simulate.py upload
+uv run python scripts/simulate.py run --persona senior-strong --persona junior-thin
+uv run python scripts/simulate.py run
+```
+Before a run it estimates the cost across the chosen personas and the judge passes and
+asks for confirmation first on anything over $3.50; pass `--yes` to skip the prompt in a script. Each
+persona is a Langfuse dataset item tagged with its level and answer mix, so the
+question-quality scores compare run to run and slice by both, and the session transcripts
+land under `evals/sessions/`. The personas are synthetic, under the same PII rule as everything
+else.

sotellme-0.1.0/evals/assessor_cases.json ADDED Viewed

@@ -0,0 +1,119 @@
+{
+ "description": "Answers the per-answer assessor must read correctly. Each case is the latest answer on a named topic, in the synthetic candidate's voice. STAR evidence flags and signal sufficiency are exact-match on the flags a case names; claim_substrings, when present, must each appear (case-insensitive) somewhere in the claims worth chasing. The five STAR cases carry over from the retired StarFlagger evals; sufficiency and claims are the Phase 4a additions. Ongoing, uneventful work (reviewing generated code, steering a tool) holds enough signal on a concrete account of how the candidate operates even with no incident or number, while a buzzword answer that names no real practice does not. Synthetic data is a stopgap; see plans/decisions/evals-and-observability.md.",
+ "cases": [
+  {
+   "name": "complete-and-quantified",
+   "topic": "the dashboard latency work at Helioscope",
+   "answer": "At Helioscope our ingestion pipeline ran as a nightly batch, so client dashboards were always hours stale. I was asked to get data latency under two minutes before a big renewal. I led the migration to a streaming pipeline on Kafka, rewrote the Python consumers, and ran the old and new paths in parallel for two weeks to prove parity. Data latency dropped from 4 hours to 90 seconds and we kept the client.",
+   "expected": {
+    "situation": true,
+    "task": true,
+    "action": true,
+    "result": true,
+    "quantified_result": true,
+    "sufficient_signal": true
+   }
+  },
+  {
+   "name": "missing-result",
+   "topic": "the berth scheduling work at Dunav Logistics",
+   "answer": "At Dunav Logistics, ships were racking up demurrage fees because berth assignments were done by hand in a spreadsheet. My job was to automate the scheduling. I built a berth-scheduling service in Go that matched vessels to berths by draft and arrival window.",
+   "expected": {
+    "situation": true,
+    "task": true,
+    "action": true,
+    "result": false,
+    "quantified_result": false,
+    "sufficient_signal": false
+   }
+  },
+  {
+   "name": "vague-unquantified-result",
+   "topic": "the berth scheduling work at Dunav Logistics",
+   "answer": "At Dunav Logistics, ships were racking up demurrage fees because berth assignments were done by hand in a spreadsheet. My job was to automate the scheduling. I built a berth-scheduling service in Go. After it shipped, the fees came down a lot and the operations team was much happier.",
+   "expected": {
+    "situation": true,
+    "task": true,
+    "action": true,
+    "result": true,
+    "quantified_result": false,
+    "sufficient_signal": false
+   }
+  },
+  {
+   "name": "action-only",
+   "topic": "the caching work",
+   "answer": "I rewrote the consumers and added a Redis cache in front of the database.",
+   "expected": {
+    "situation": false,
+    "task": false,
+    "action": true,
+    "result": false,
+    "quantified_result": false,
+    "sufficient_signal": false
+   }
+  },
+  {
+   "name": "context-without-action",
+   "topic": "the on-call incident load at Dunav",
+   "answer": "Our tracking platform at Dunav kept paging us at night, and management wanted the incident count brought down before the peak shipping season.",
+   "expected": {
+    "situation": true,
+    "task": true,
+    "action": false,
+    "result": false,
+    "quantified_result": false,
+    "sufficient_signal": false
+   }
+  },
+  {
+   "name": "impact-claim-worth-chasing",
+   "topic": "their background",
+   "answer": "I've spent the last four years at Helioscope on the data platform team. The thing I'm proudest of is cutting our cloud bill by 60 percent in one quarter while the platform kept growing.",
+   "expected": {
+    "situation": true,
+    "task": false,
+    "action": false,
+    "result": true,
+    "quantified_result": true,
+    "sufficient_signal": false
+   },
+   "claim_substrings": [
+    "60 percent"
+   ]
+  },
+  {
+   "name": "broad-topic-one-complete-story-suffices",
+   "topic": "the candidate's background and the thread running through their work",
+   "answer": "The thread is ownership, honestly. The clearest example: at Helioscope our support team hand-triaged every incoming ticket, and renewals were slipping because first responses took hours. I proposed routing tickets with a classifier, got buy-in, and built the service myself over a quarter. It now routes 80 percent of tickets automatically and first response went from six hours to forty minutes.",
+   "expected": {
+    "situation": true,
+    "task": true,
+    "action": true,
+    "result": true,
+    "quantified_result": true,
+    "sufficient_signal": true
+   }
+  },
+  {
+   "name": "ongoing-review-work-is-sufficient-without-an-event",
+   "topic": "how they keep AI-generated code from degrading the codebase",
+   "answer": "I never merge what an assistant writes on trust. I read the whole diff and I'm watching for a few specific things: APIs it invented that don't exist, error handling it quietly dropped, and tests that assert nothing. When I catch one I push back in the same session and have it redo that slice instead of patching over it, and I keep each change small so a bad pattern can't hide in a big diff.",
+   "expected": {
+    "action": true,
+    "result": false,
+    "quantified_result": false,
+    "sufficient_signal": true
+   }
+  },
+  {
+   "name": "buzzword-process-answer-is-not-sufficient",
+   "topic": "how they keep AI-generated code from degrading the codebase",
+   "answer": "I make sure to follow best practices when I use these tools. I keep the architecture clean and the code maintainable, and I always validate the output to a high standard so quality stays where it needs to be.",
+   "expected": {
+    "action": false,
+    "sufficient_signal": false
+   }
+  }
+ ]
+}

sotellme-0.1.0/evals/coach_cases.json ADDED Viewed

@@ -0,0 +1,80 @@
+{
+ "description": "Synthetic coaching cases (stopgap until volunteered sessions land). Each case feeds the coach a transcript plus an authored grade with one planted weakness, and the judge checks the coach's fix is tied to that named gap rather than generic filler.",
+ "cases": [
+  {
+   "name": "missing-quantified-result-senior",
+   "target_level": "senior",
+   "transcript": [
+    {
+     "question": "Tell me about a project you're proud of.",
+     "answer": "At Northwind our checkout service kept timing out under load before big sales. I owned the fix end to end: I profiled the hot path, found the synchronous inventory call, and rewrote it as an async batch with a fallback cache. I rolled it out behind a flag and watched it through two sale weekends."
+    }
+   ],
+   "grade": {
+    "scores": [
+     {
+      "question": "Tell me about a project you're proud of.",
+      "rationale": "Clear end-to-end ownership at the senior level and concrete actions, but the story stops before any measured outcome.",
+      "star": {"situation": true, "task": true, "action": true, "result": false, "quantified_result": false},
+      "specificity": "high",
+      "ownership": "clear",
+      "weak_or_missing": ["result", "quantified_result"],
+      "gap": "The story never says how the checkout fix turned out; no latency or error-rate change is stated.",
+      "score": 3
+     }
+    ]
+   },
+   "gap_summary": "The answer describes the work in detail but never states the outcome: no number for the latency, timeout rate, or sales impact after the change."
+  },
+  {
+   "name": "blurred-ownership-we-throughout-senior",
+   "target_level": "senior",
+   "transcript": [
+    {
+     "question": "Tell me about a hard technical decision you were part of.",
+     "answer": "We were drowning in flaky integration tests, so we decided to split the suite and run it in parallel. We containerised the dependencies, we added retries for the genuinely network-bound cases, and we got the suite back under ten minutes. The whole team was a lot happier after that."
+    }
+   ],
+   "grade": {
+    "scores": [
+     {
+      "question": "Tell me about a hard technical decision you were part of.",
+      "rationale": "A real outcome with a number, but it is all 'we'; nothing marks what this candidate personally drove, which a senior answer needs.",
+      "star": {"situation": true, "task": true, "action": true, "result": true, "quantified_result": true},
+      "specificity": "high",
+      "ownership": "unclear",
+      "weak_or_missing": [],
+      "gap": "The answer is all 'we' with no personal contribution visible; it never says what the candidate themselves decided or built.",
+      "score": 3
+     }
+    ]
+   },
+   "gap_summary": "Every action is credited to the team as 'we'; the answer never separates out what this candidate personally decided, drove, or built."
+  },
+  {
+   "name": "vague-low-specificity-mid",
+   "target_level": "mid",
+   "transcript": [
+    {
+     "question": "Walk me through a time you improved something on your team.",
+     "answer": "I noticed our process wasn't great, so I made some changes that helped a lot. Things got way better and smoother, people were happier, and overall it was a big improvement for everyone involved."
+    }
+   ],
+   "grade": {
+    "scores": [
+     {
+      "question": "Walk me through a time you improved something on your team.",
+      "rationale": "Nothing concrete: no named process, no specific change, no measurable outcome. Pure vague betterment.",
+      "star": {"situation": false, "task": false, "action": true, "result": true, "quantified_result": false},
+      "specificity": "low",
+      "ownership": "clear",
+      "weak_or_missing": ["situation", "task", "quantified_result"],
+      "gap": "Nothing in the answer is concrete: it never names the process, the change, or any measurable result.",
+      "score": 2
+     }
+    ]
+   },
+   "gap_summary": "The answer leans entirely on vague words ('helped a lot', 'way better', 'big improvement') and never names the process, the specific change made, or a concrete result."
+  }
+ ]
+}