sotellme 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108)
  1. sotellme-0.1.0/.gitignore +49 -0
  2. sotellme-0.1.0/.python-version +1 -0
  3. sotellme-0.1.0/LICENSE +21 -0
  4. sotellme-0.1.0/PKG-INFO +217 -0
  5. sotellme-0.1.0/README.md +180 -0
  6. sotellme-0.1.0/evals/assessor_cases.json +119 -0
  7. sotellme-0.1.0/evals/coach_cases.json +80 -0
  8. sotellme-0.1.0/evals/director_cases.json +585 -0
  9. sotellme-0.1.0/evals/grader_cases.json +292 -0
  10. sotellme-0.1.0/evals/guardrail_cases.json +107 -0
  11. sotellme-0.1.0/evals/interviewer_cases.json +218 -0
  12. sotellme-0.1.0/evals/personas/junior-rambling.json +11 -0
  13. sotellme-0.1.0/evals/personas/junior-thin.json +11 -0
  14. sotellme-0.1.0/evals/personas/mid-blurred-ownership.json +11 -0
  15. sotellme-0.1.0/evals/personas/mid-offtopic.json +11 -0
  16. sotellme-0.1.0/evals/personas/senior-bluffer.json +11 -0
  17. sotellme-0.1.0/evals/personas/senior-strong.json +12 -0
  18. sotellme-0.1.0/evals/personas/staff-injection.json +11 -0
  19. sotellme-0.1.0/evals/personas/staff-terse.json +11 -0
  20. sotellme-0.1.0/evals/profile_parser_cases.json +41 -0
  21. sotellme-0.1.0/evals/role_context_cases.json +48 -0
  22. sotellme-0.1.0/pyproject.toml +84 -0
  23. sotellme-0.1.0/scripts/evals.py +89 -0
  24. sotellme-0.1.0/scripts/prepare_package.py +31 -0
  25. sotellme-0.1.0/scripts/release_version.py +59 -0
  26. sotellme-0.1.0/scripts/simulate.py +135 -0
  27. sotellme-0.1.0/scripts/smoke_session.py +178 -0
  28. sotellme-0.1.0/src/sotellme/__init__.py +1 -0
  29. sotellme-0.1.0/src/sotellme/__main__.py +5 -0
  30. sotellme-0.1.0/src/sotellme/assessor.py +69 -0
  31. sotellme-0.1.0/src/sotellme/budget.py +94 -0
  32. sotellme-0.1.0/src/sotellme/caching.py +27 -0
  33. sotellme-0.1.0/src/sotellme/catalog.py +71 -0
  34. sotellme-0.1.0/src/sotellme/cli.py +531 -0
  35. sotellme-0.1.0/src/sotellme/coach.py +114 -0
  36. sotellme-0.1.0/src/sotellme/config.py +122 -0
  37. sotellme-0.1.0/src/sotellme/coverage.py +22 -0
  38. sotellme-0.1.0/src/sotellme/director.py +146 -0
  39. sotellme-0.1.0/src/sotellme/engine.py +463 -0
  40. sotellme-0.1.0/src/sotellme/eval_datasets.py +473 -0
  41. sotellme-0.1.0/src/sotellme/extraction.py +42 -0
  42. sotellme-0.1.0/src/sotellme/fetch.py +229 -0
  43. sotellme-0.1.0/src/sotellme/grader.py +100 -0
  44. sotellme-0.1.0/src/sotellme/guardrail.py +66 -0
  45. sotellme-0.1.0/src/sotellme/interviewer.py +101 -0
  46. sotellme-0.1.0/src/sotellme/judge.py +120 -0
  47. sotellme-0.1.0/src/sotellme/models.toml +68 -0
  48. sotellme-0.1.0/src/sotellme/personas.py +46 -0
  49. sotellme-0.1.0/src/sotellme/posting.py +30 -0
  50. sotellme-0.1.0/src/sotellme/pricing.py +147 -0
  51. sotellme-0.1.0/src/sotellme/profile.py +61 -0
  52. sotellme-0.1.0/src/sotellme/prompts.py +973 -0
  53. sotellme-0.1.0/src/sotellme/py.typed +0 -0
  54. sotellme-0.1.0/src/sotellme/report.py +55 -0
  55. sotellme-0.1.0/src/sotellme/research.py +59 -0
  56. sotellme-0.1.0/src/sotellme/role.py +102 -0
  57. sotellme-0.1.0/src/sotellme/sim_datasets.py +146 -0
  58. sotellme-0.1.0/src/sotellme/simulation.py +413 -0
  59. sotellme-0.1.0/src/sotellme/simulator.py +36 -0
  60. sotellme-0.1.0/src/sotellme/tracing.py +21 -0
  61. sotellme-0.1.0/src/sotellme/voice.py +33 -0
  62. sotellme-0.1.0/src/sotellme/web.py +492 -0
  63. sotellme-0.1.0/tests/fixtures/synthetic_cv.md +32 -0
  64. sotellme-0.1.0/tests/fixtures/synthetic_cv.pdf +63 -0
  65. sotellme-0.1.0/tests/fixtures/synthetic_cv.txt +30 -0
  66. sotellme-0.1.0/tests/pdf_fixture.py +34 -0
  67. sotellme-0.1.0/tests/stubs.py +88 -0
  68. sotellme-0.1.0/tests/test_assessor.py +52 -0
  69. sotellme-0.1.0/tests/test_budget.py +144 -0
  70. sotellme-0.1.0/tests/test_caching.py +157 -0
  71. sotellme-0.1.0/tests/test_catalog.py +91 -0
  72. sotellme-0.1.0/tests/test_cli.py +369 -0
  73. sotellme-0.1.0/tests/test_coach.py +90 -0
  74. sotellme-0.1.0/tests/test_config.py +180 -0
  75. sotellme-0.1.0/tests/test_director.py +126 -0
  76. sotellme-0.1.0/tests/test_engine.py +819 -0
  77. sotellme-0.1.0/tests/test_envelope.py +53 -0
  78. sotellme-0.1.0/tests/test_eval_datasets.py +299 -0
  79. sotellme-0.1.0/tests/test_extraction.py +59 -0
  80. sotellme-0.1.0/tests/test_fetch.py +245 -0
  81. sotellme-0.1.0/tests/test_firewall.py +141 -0
  82. sotellme-0.1.0/tests/test_grader.py +137 -0
  83. sotellme-0.1.0/tests/test_guardrail.py +76 -0
  84. sotellme-0.1.0/tests/test_guardrail_evals.py +37 -0
  85. sotellme-0.1.0/tests/test_injection.py +66 -0
  86. sotellme-0.1.0/tests/test_interviewer.py +151 -0
  87. sotellme-0.1.0/tests/test_judge.py +120 -0
  88. sotellme-0.1.0/tests/test_level_access.py +62 -0
  89. sotellme-0.1.0/tests/test_package.py +5 -0
  90. sotellme-0.1.0/tests/test_personas.py +98 -0
  91. sotellme-0.1.0/tests/test_posting.py +55 -0
  92. sotellme-0.1.0/tests/test_pricing.py +176 -0
  93. sotellme-0.1.0/tests/test_profile.py +84 -0
  94. sotellme-0.1.0/tests/test_prompts.py +463 -0
  95. sotellme-0.1.0/tests/test_release_version.py +68 -0
  96. sotellme-0.1.0/tests/test_report.py +118 -0
  97. sotellme-0.1.0/tests/test_research.py +102 -0
  98. sotellme-0.1.0/tests/test_restart.py +195 -0
  99. sotellme-0.1.0/tests/test_role.py +116 -0
  100. sotellme-0.1.0/tests/test_secret_isolation.py +201 -0
  101. sotellme-0.1.0/tests/test_sim_datasets.py +37 -0
  102. sotellme-0.1.0/tests/test_simulation.py +363 -0
  103. sotellme-0.1.0/tests/test_simulator.py +83 -0
  104. sotellme-0.1.0/tests/test_tracing.py +24 -0
  105. sotellme-0.1.0/tests/test_voice.py +40 -0
  106. sotellme-0.1.0/tests/test_web.py +244 -0
  107. sotellme-0.1.0/tests/voice.py +3 -0
  108. sotellme-0.1.0/uv.lock +2722 -0
@@ -0,0 +1,49 @@
1
+ # Knowledge base — licensed course material, prompt-distillation source (never committed).
2
+ /how-to-answer/
3
+ /how-to-interview/
4
+
5
+ .claude
6
+
7
+ CLAUDE.md
8
+
9
+ # Working docs
10
+ plans
11
+ scratch/
12
+ issues/
13
+ learn/
14
+
15
+ # Secrets & environment
16
+ .env
17
+ .env.*
18
+ !.env.example
19
+ *.local
20
+
21
+ # Langfuse (self-hosted) data
22
+ langfuse/
23
+ *.db
24
+
25
+ # Package build files staged from the repo root (see backend/scripts/prepare_package.py)
26
+ backend/README.md
27
+ backend/LICENSE
28
+
29
+ # Python
30
+ __pycache__/
31
+ *.py[cod]
32
+ .venv/
33
+ .mypy_cache/
34
+ .pytest_cache/
35
+ .ruff_cache/
36
+ dist/
37
+
38
+ # Local data (real CVs, session artifacts — PII, never committed)
39
+ data/
40
+
41
+ # Generated session reports (may carry real-CV PII) and simulated-eval session artifacts
42
+ sotellme-report-*.md
43
+ backend/evals/sessions/
44
+
45
+ # Feature-review scaffolding (ephemeral, branch-scoped)
46
+ reviews/
47
+
48
+ # OS / editor
49
+ .DS_Store
@@ -0,0 +1 @@
1
+ 3.12
sotellme-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Srdjan Coric
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,217 @@
1
+ Metadata-Version: 2.4
2
+ Name: sotellme
3
+ Version: 0.1.0
4
+ Summary: A behavioral-interview simulator and coach that runs in your terminal.
5
+ Project-URL: Homepage, https://github.com/SrdjanCoric/sotellme
6
+ Project-URL: Repository, https://github.com/SrdjanCoric/sotellme
7
+ Project-URL: Issues, https://github.com/SrdjanCoric/sotellme/issues
8
+ Author-email: Srdjan Coric <srdjan.coric1984@gmail.com>
9
+ License-Expression: MIT
10
+ License-File: LICENSE
11
+ Keywords: behavioral,cli,coaching,interview,langgraph,llm
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Environment :: Console
14
+ Classifier: Intended Audience :: End Users/Desktop
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Education
19
+ Classifier: Topic :: Utilities
20
+ Requires-Python: >=3.12
21
+ Requires-Dist: httpx>=0.27
22
+ Requires-Dist: langchain-anthropic>=1.4.5
23
+ Requires-Dist: langchain-google-genai>=4.2.5
24
+ Requires-Dist: langchain-openai>=1.3.0
25
+ Requires-Dist: langchain>=1.3.8
26
+ Requires-Dist: langgraph-checkpoint-sqlite>=3.1.0
27
+ Requires-Dist: langgraph>=1.2.4
28
+ Requires-Dist: prompt-toolkit>=3.0
29
+ Requires-Dist: pydantic>=2.13.4
30
+ Requires-Dist: pypdf>=6.13.2
31
+ Requires-Dist: rich>=15.0.0
32
+ Provides-Extra: tracing
33
+ Requires-Dist: langfuse>=3.0; extra == 'tracing'
34
+ Provides-Extra: web
35
+ Requires-Dist: streamlit>=1.40; extra == 'web'
36
+ Description-Content-Type: text/markdown
37
+
38
+ # sotellme
39
+
40
+ A mock behavioral interviewer that runs in your terminal, built from your CV and the
41
+ job you're actually chasing.
42
+
43
+ ## Why I built it
44
+
45
+ I built this because behavioral interviews are where good candidates trip up. The
46
+ questions sound easy, so most people wing them, and the usual prep ("tell me about a
47
+ time you failed") is too generic to help with the specific job in front of you.
48
+
49
+ sotellme makes the practice specific. You give it your CV and a job posting, it reads
50
+ up on the company, and then it interviews you against all three at once, so when it
51
+ asks why you want the role it can name the product you'd actually be building. At the
52
+ end it grades every answer and walks you through the weak ones: what went wrong, and
53
+ what to say instead.
54
+
55
+ ## What it does
56
+
57
+ You give it your CV and the job you're chasing, and it interviews you against both, plus
58
+ a short brief it builds on the company from a handful of public pages. It runs the
59
+ session the way a real interviewer would: it opens on who you are, digs into your biggest
60
+ piece of work, picks the stories that fit the role, and chases the interesting thread in
61
+ your last answer rather than marching through a checklist. Most sessions run 8 to 14
62
+ questions. When you're done, the smart model reads the whole transcript and scores every
63
+ answer on STAR structure, specificity, and ownership against your target level, then
64
+ writes you a Markdown report: a scorecard that names what's weak, a fix for each soft
65
+ answer, and a short study plan. It also tells you what the run cost.
66
+
67
+ ## Quickstart
68
+
69
+ Set one provider key first (see [Configuration](#configuration)):
70
+
71
+ ```sh
72
+ export ANTHROPIC_API_KEY=... # or GOOGLE_API_KEY, or OPENAI_API_KEY
73
+ ```
74
+
75
+ The easiest way in is the local web app. Pull in the web extra and launch it:
76
+
77
+ ```sh
78
+ uvx --from "sotellme[web]" sotellme web
79
+ ```
80
+
81
+ It opens in your browser: upload your CV, paste a posting or drop in a link, run the
82
+ interview as a chat, and read the report on the page, with a button to save it as
83
+ Markdown. Everything runs locally on your own key.
84
+
85
+ If you'd rather stay in the terminal, run the interview straight from
86
+ [`uvx`](https://docs.astral.sh/uv/), no clone needed:
87
+
88
+ ```sh
89
+ uvx sotellme interview --cv path/to/cv.pdf --job https://jobs.example.com/senior-backend
90
+ ```
91
+
92
+ `--job` takes a link, a file (PDF, markdown, or text), or pasted posting text, and it's
93
+ optional; without it the interview runs on a default competency set with no company
94
+ research to ground it. For a link the tool prefers the page's embedded `JobPosting` data
95
+ and falls back to the visible text, and Workable postings are read through their public
96
+ API. Pages that only render with JavaScript can't be read, and pasting the text always
97
+ works.
98
+
99
+ Answers are multi-line with real line editing (Home, End, arrow keys, word jumps). Enter
100
+ starts a new line; Esc then Enter sends, or put `/done` on its own line.
101
+
102
+ ### Commands
103
+
104
+ | Command | What it does |
105
+ | --- | --- |
106
+ | `sotellme interview --cv <path> [--job <link\|file\|text>]` | Start a new interview session. |
107
+ | `sotellme resume` | Pick up the latest interrupted session. |
108
+ | `sotellme reports` | List the coaching reports in this directory, newest first. |
109
+ | `sotellme grade <transcript.json> --level <junior\|mid\|senior\|staff>` | Grade a transcript you already have (a JSON list of `{question, answer}` pairs) without running a live interview. |
110
+ | `sotellme web` | Launch the local web UI in your browser (needs the `web` extra). |
111
+
112
+ `interview`, `resume`, and `grade` also take `--provider`, `--fast-model`, and
113
+ `--smart-model` to override the model picks.
114
+
115
+ ## Privacy and limits
116
+
117
+ Your transcripts and session state stay on your machine. The only things that leave it
118
+ are API calls to whichever provider you picked, plus plain HTTP GETs to public pages: one
119
+ for a `--job` link, and up to six more for the company brief. Those fetches are capped per
120
+ session, truncated per page, and refused for localhost and private addresses. Your API key
121
+ is read only by the code that calls the provider and never goes into a prompt, so no
122
+ hostile page or posting can talk the model into leaking it (`tests/test_fetch.py`,
123
+ `tests/test_secret_isolation.py`, `tests/test_injection.py`).
124
+
125
+ A cap on questions, a guaranteed closing question, a ceiling on web fetches, and a token
126
+ budget that ends a long session early are all plain code, and they're unit-tested. The
127
+ tool also screens what you type before it reaches the interview, so going off-topic nudges
128
+ you back and a second off-topic reply in a row wraps the session up. Either way the real
129
+ answers you gave still get graded.
130
+
131
+ ## Configuration
132
+
133
+ There's no account and no server. Pick a provider with `SOTELLME_PROVIDER` (or
134
+ `--provider`, or the dropdown in the web app) and set its key:
135
+
136
+ | Provider | Key variable | Default models (fast / smart) |
137
+ | -------------- | ------------------- | ----------------------------------------- |
138
+ | `google_genai` | `GOOGLE_API_KEY` | gemini-3.5-flash / gemini-3.1-pro-preview |
139
+ | `anthropic` | `ANTHROPIC_API_KEY` | claude-sonnet-4-6 / claude-opus-4-8 |
140
+ | `openai` | `OPENAI_API_KEY` | gpt-5.4-mini / gpt-5.5 |
141
+
142
+ The fast slot runs the interview side (CV parser, company researcher, answer assessor,
143
+ interviewer); the smart slot runs the director that makes every probe-or-move-on call,
144
+ plus the end-of-session grader and coach. In the CLI you set those two slots with
145
+ `SOTELLME_FAST_MODEL` / `SOTELLME_SMART_MODEL` or the matching flags. The web app goes
146
+ finer: its Advanced section pins a model to each step on its own, so you can put a cheap
147
+ one on the company research and a stronger one on the questions and the grading, and mix
148
+ providers once you've set more than one key. The eval suites run against `google_genai`
149
+ with an `anthropic` judge, which is the combo I'd reach for.
150
+
151
+ The CLI and the web app both draw their choices from the same catalog, which ships the per-provider defaults in
152
+ the table above. To change what's on offer, write a `~/.sotellme/models.toml` listing the
153
+ models you want and the default for each provider, and that's what the web app's dropdowns
154
+ show. The file holds model names plus the per-model prices behind the cost
155
+ estimates (including the reduced rate for cached input), so you can correct a rate that's
156
+ drifted; your API keys stay in the environment.
157
+
158
+ The session has a token budget, 400,000 by default, that ends the interview early if a run
159
+ goes long and keeps a reserved share back to grade and coach what you gave. Change it with
160
+ `SOTELLME_TOKEN_BUDGET`.
161
+
162
+ ## Development
163
+
164
+ Requires Python 3.12+, managed with `uv`. The package takes its long description from the
165
+ repo's `README.md`, so stage that and the license into `backend/` once before the first
166
+ sync:
167
+
168
+ ```sh
169
+ cd backend
170
+ python3 scripts/prepare_package.py
171
+ uv sync
172
+ uv run ruff check . && uv run mypy && uv run pytest
173
+ ```
174
+
175
+ The deterministic suite runs without any API keys, and it's the whole CI gate.
176
+
177
+ The judgment agents (grader, coach, assessor, role builder, profile parser) are tuned
178
+ separately in Langfuse. Stand up a local instance, export `LANGFUSE_PUBLIC_KEY`,
179
+ `LANGFUSE_SECRET_KEY`, and `LANGFUSE_HOST`, then sync the committed cases and run one
180
+ agent over its dataset:
181
+
182
+ ```sh
183
+ uv sync --extra tracing
184
+ uv run python scripts/evals.py upload
185
+ uv run python scripts/evals.py run grader --limit 2 # small calibration run first
186
+ uv run python scripts/evals.py run grader
187
+ ```
188
+
189
+ Each run lands in Langfuse with a deterministic score per case, so you can read the
190
+ outputs, edit a prompt, run it again, and compare the two runs side by side. It also
191
+ prints the run's token count and estimated cost per model, priced from `models.toml`, so
192
+ you can size a full run from a `--limit` sample before committing to it. Only the
193
+ synthetic `evals/*.json` cases ever go in, and Langfuse stays off unless its env vars are
194
+ set, for evals and for live-session tracing alike.
195
+
196
+ The questions the system asks get their own eval. `scripts/simulate.py` runs a full
197
+ interview against a synthetic candidate: the real interviewer and director loop ask, while
198
+ a candidate-simulator answers in character from a persona under `evals/personas/`. The
199
+ personas span every level from junior to staff and a mix of answering styles: complete
200
+ STAR stories, thin answers, blurred ownership, off-topic drift, confident bluffing, and
201
+ injection attempts, so a run also exercises the guardrail and how the loop recovers. An
202
+ LLM judge on the smart slot scores each question on relevance, whether it probes the
203
+ flagged gap, level-appropriateness, whether it leads the candidate, and follow-up
204
+ discipline, plus a coverage verdict for the session.
205
+
206
+ ```sh
207
+ uv run python scripts/simulate.py upload
208
+ uv run python scripts/simulate.py run --persona senior-strong --persona junior-thin
209
+ uv run python scripts/simulate.py run
210
+ ```
211
+
212
+ Before a run it estimates the cost across the chosen personas and the judge passes and
213
+ asks first for anything over $3.50; pass `--yes` to skip the prompt in a script. Each
214
+ persona is a Langfuse dataset item tagged with its level and answer mix, so the
215
+ question-quality scores compare run to run and slice by both, and the session transcripts
216
+ land under `evals/sessions/`. The personas are synthetic, the same PII rule as everything
217
+ else.
@@ -0,0 +1,180 @@
1
+ # sotellme
2
+
3
+ A mock behavioral interviewer that runs in your terminal, built from your CV and the
4
+ job you're actually chasing.
5
+
6
+ ## Why I built it
7
+
8
+ I built this because behavioral interviews are where good candidates trip up. The
9
+ questions sound easy, so most people wing them, and the usual prep ("tell me about a
10
+ time you failed") is too generic to help with the specific job in front of you.
11
+
12
+ sotellme makes the practice specific. You give it your CV and a job posting, it reads
13
+ up on the company, and then it interviews you against all three at once, so when it
14
+ asks why you want the role it can name the product you'd actually be building. At the
15
+ end it grades every answer and walks you through the weak ones: what went wrong, and
16
+ what to say instead.
17
+
18
+ ## What it does
19
+
20
+ You give it your CV and the job you're chasing, and it interviews you against both, plus
21
+ a short brief it builds on the company from a handful of public pages. It runs the
22
+ session the way a real interviewer would: it opens on who you are, digs into your biggest
23
+ piece of work, picks the stories that fit the role, and chases the interesting thread in
24
+ your last answer rather than marching through a checklist. Most sessions run 8 to 14
25
+ questions. When you're done, the smart model reads the whole transcript and scores every
26
+ answer on STAR structure, specificity, and ownership against your target level, then
27
+ writes you a Markdown report: a scorecard that names what's weak, a fix for each soft
28
+ answer, and a short study plan. It also tells you what the run cost.
29
+
30
+ ## Quickstart
31
+
32
+ Set one provider key first (see [Configuration](#configuration)):
33
+
34
+ ```sh
35
+ export ANTHROPIC_API_KEY=... # or GOOGLE_API_KEY, or OPENAI_API_KEY
36
+ ```
37
+
38
+ The easiest way in is the local web app. Pull in the web extra and launch it:
39
+
40
+ ```sh
41
+ uvx --from "sotellme[web]" sotellme web
42
+ ```
43
+
44
+ It opens in your browser: upload your CV, paste a posting or drop in a link, run the
45
+ interview as a chat, and read the report on the page, with a button to save it as
46
+ Markdown. Everything runs locally on your own key.
47
+
48
+ If you'd rather stay in the terminal, run the interview straight from
49
+ [`uvx`](https://docs.astral.sh/uv/), no clone needed:
50
+
51
+ ```sh
52
+ uvx sotellme interview --cv path/to/cv.pdf --job https://jobs.example.com/senior-backend
53
+ ```
54
+
55
+ `--job` takes a link, a file (PDF, markdown, or text), or pasted posting text, and it's
56
+ optional; without it the interview runs on a default competency set with no company
57
+ research to ground it. For a link the tool prefers the page's embedded `JobPosting` data
58
+ and falls back to the visible text, and Workable postings are read through their public
59
+ API. Pages that only render with JavaScript can't be read, and pasting the text always
60
+ works.
61
+
62
+ Answers are multi-line with real line editing (Home, End, arrow keys, word jumps). Enter
63
+ starts a new line; Esc then Enter sends, or put `/done` on its own line.
64
+
65
+ ### Commands
66
+
67
+ | Command | What it does |
68
+ | --- | --- |
69
+ | `sotellme interview --cv <path> [--job <link\|file\|text>]` | Start a new interview session. |
70
+ | `sotellme resume` | Pick up the latest interrupted session. |
71
+ | `sotellme reports` | List the coaching reports in this directory, newest first. |
72
+ | `sotellme grade <transcript.json> --level <junior\|mid\|senior\|staff>` | Grade a transcript you already have (a JSON list of `{question, answer}` pairs) without running a live interview. |
73
+ | `sotellme web` | Launch the local web UI in your browser (needs the `web` extra). |
74
+
75
+ `interview`, `resume`, and `grade` also take `--provider`, `--fast-model`, and
76
+ `--smart-model` to override the model picks.
77
+
78
+ ## Privacy and limits
79
+
80
+ Your transcripts and session state stay on your machine. The only things that leave it
81
+ are API calls to whichever provider you picked, plus plain HTTP GETs to public pages: one
82
+ for a `--job` link, and up to six more for the company brief. Those fetches are capped per
83
+ session, truncated per page, and refused for localhost and private addresses. Your API key
84
+ is read only by the code that calls the provider and never goes into a prompt, so no
85
+ hostile page or posting can talk the model into leaking it (`tests/test_fetch.py`,
86
+ `tests/test_secret_isolation.py`, `tests/test_injection.py`).
87
+
88
+ A cap on questions, a guaranteed closing question, a ceiling on web fetches, and a token
89
+ budget that ends a long session early are all plain code, and they're unit-tested. The
90
+ tool also screens what you type before it reaches the interview, so going off-topic nudges
91
+ you back and a second off-topic reply in a row wraps the session up. Either way the real
92
+ answers you gave still get graded.
93
+
94
+ ## Configuration
95
+
96
+ There's no account and no server. Pick a provider with `SOTELLME_PROVIDER` (or
97
+ `--provider`, or the dropdown in the web app) and set its key:
98
+
99
+ | Provider | Key variable | Default models (fast / smart) |
100
+ | -------------- | ------------------- | ----------------------------------------- |
101
+ | `google_genai` | `GOOGLE_API_KEY` | gemini-3.5-flash / gemini-3.1-pro-preview |
102
+ | `anthropic` | `ANTHROPIC_API_KEY` | claude-sonnet-4-6 / claude-opus-4-8 |
103
+ | `openai` | `OPENAI_API_KEY` | gpt-5.4-mini / gpt-5.5 |
104
+
105
+ The fast slot runs the interview side (CV parser, company researcher, answer assessor,
106
+ interviewer); the smart slot runs the director that makes every probe-or-move-on call,
107
+ plus the end-of-session grader and coach. In the CLI you set those two slots with
108
+ `SOTELLME_FAST_MODEL` / `SOTELLME_SMART_MODEL` or the matching flags. The web app goes
109
+ finer: its Advanced section pins a model to each step on its own, so you can put a cheap
110
+ one on the company research and a stronger one on the questions and the grading, and mix
111
+ providers once you've set more than one key. The eval suites run against `google_genai`
112
+ with an `anthropic` judge, which is the combo I'd reach for.
113
+
114
+ The CLI and the web app both draw their choices from the same catalog, which ships the per-provider defaults in
115
+ the table above. To change what's on offer, write a `~/.sotellme/models.toml` listing the
116
+ models you want and the default for each provider, and that's what the web app's dropdowns
117
+ show. The file holds model names plus the per-model prices behind the cost
118
+ estimates (including the reduced rate for cached input), so you can correct a rate that's
119
+ drifted; your API keys stay in the environment.
120
+
121
+ The session has a token budget, 400,000 by default, that ends the interview early if a run
122
+ goes long and keeps a reserved share back to grade and coach what you gave. Change it with
123
+ `SOTELLME_TOKEN_BUDGET`.
124
+
125
+ ## Development
126
+
127
+ Requires Python 3.12+, managed with `uv`. The package takes its long description from the
128
+ repo's `README.md`, so stage that and the license into `backend/` once before the first
129
+ sync:
130
+
131
+ ```sh
132
+ cd backend
133
+ python3 scripts/prepare_package.py
134
+ uv sync
135
+ uv run ruff check . && uv run mypy && uv run pytest
136
+ ```
137
+
138
+ The deterministic suite runs without any API keys, and it's the whole CI gate.
139
+
140
+ The judgment agents (grader, coach, assessor, role builder, profile parser) are tuned
141
+ separately in Langfuse. Stand up a local instance, export `LANGFUSE_PUBLIC_KEY`,
142
+ `LANGFUSE_SECRET_KEY`, and `LANGFUSE_HOST`, then sync the committed cases and run one
143
+ agent over its dataset:
144
+
145
+ ```sh
146
+ uv sync --extra tracing
147
+ uv run python scripts/evals.py upload
148
+ uv run python scripts/evals.py run grader --limit 2 # small calibration run first
149
+ uv run python scripts/evals.py run grader
150
+ ```
151
+
152
+ Each run lands in Langfuse with a deterministic score per case, so you can read the
153
+ outputs, edit a prompt, run it again, and compare the two runs side by side. It also
154
+ prints the run's token count and estimated cost per model, priced from `models.toml`, so
155
+ you can size a full run from a `--limit` sample before committing to it. Only the
156
+ synthetic `evals/*.json` cases ever go in, and Langfuse stays off unless its env vars are
157
+ set, for evals and for live-session tracing alike.
158
+
159
+ The questions the system asks get their own eval. `scripts/simulate.py` runs a full
160
+ interview against a synthetic candidate: the real interviewer and director loop ask, while
161
+ a candidate-simulator answers in character from a persona under `evals/personas/`. The
162
+ personas span every level from junior to staff and a mix of answering styles: complete
163
+ STAR stories, thin answers, blurred ownership, off-topic drift, confident bluffing, and
164
+ injection attempts, so a run also exercises the guardrail and how the loop recovers. An
165
+ LLM judge on the smart slot scores each question on relevance, whether it probes the
166
+ flagged gap, level-appropriateness, whether it leads the candidate, and follow-up
167
+ discipline, plus a coverage verdict for the session.
168
+
169
+ ```sh
170
+ uv run python scripts/simulate.py upload
171
+ uv run python scripts/simulate.py run --persona senior-strong --persona junior-thin
172
+ uv run python scripts/simulate.py run
173
+ ```
174
+
175
+ Before a run it estimates the cost across the chosen personas and the judge passes and
176
+ asks first for anything over $3.50; pass `--yes` to skip the prompt in a script. Each
177
+ persona is a Langfuse dataset item tagged with its level and answer mix, so the
178
+ question-quality scores compare run to run and slice by both, and the session transcripts
179
+ land under `evals/sessions/`. The personas are synthetic, the same PII rule as everything
180
+ else.
@@ -0,0 +1,119 @@
1
+ {
2
+ "description": "Answers the per-answer assessor must read correctly. Each case is the latest answer on a named topic, in the synthetic candidate's voice. STAR evidence flags and signal sufficiency are exact-match on the flags a case names; claim_substrings, when present, must each appear (case-insensitive) somewhere in the claims worth chasing. The five STAR cases carry over from the retired StarFlagger evals; sufficiency and claims are the Phase 4a additions. Ongoing, uneventful work (reviewing generated code, steering a tool) holds enough signal on a concrete account of how the candidate operates even with no incident or number, while a buzzword answer that names no real practice does not. Synthetic data is a stopgap; see plans/decisions/evals-and-observability.md.",
3
+ "cases": [
4
+ {
5
+ "name": "complete-and-quantified",
6
+ "topic": "the dashboard latency work at Helioscope",
7
+ "answer": "At Helioscope our ingestion pipeline ran as a nightly batch, so client dashboards were always hours stale. I was asked to get data latency under two minutes before a big renewal. I led the migration to a streaming pipeline on Kafka, rewrote the Python consumers, and ran the old and new paths in parallel for two weeks to prove parity. Data latency dropped from 4 hours to 90 seconds and we kept the client.",
8
+ "expected": {
9
+ "situation": true,
10
+ "task": true,
11
+ "action": true,
12
+ "result": true,
13
+ "quantified_result": true,
14
+ "sufficient_signal": true
15
+ }
16
+ },
17
+ {
18
+ "name": "missing-result",
19
+ "topic": "the berth scheduling work at Dunav Logistics",
20
+ "answer": "At Dunav Logistics, ships were racking up demurrage fees because berth assignments were done by hand in a spreadsheet. My job was to automate the scheduling. I built a berth-scheduling service in Go that matched vessels to berths by draft and arrival window.",
21
+ "expected": {
22
+ "situation": true,
23
+ "task": true,
24
+ "action": true,
25
+ "result": false,
26
+ "quantified_result": false,
27
+ "sufficient_signal": false
28
+ }
29
+ },
30
+ {
31
+ "name": "vague-unquantified-result",
32
+ "topic": "the berth scheduling work at Dunav Logistics",
33
+ "answer": "At Dunav Logistics, ships were racking up demurrage fees because berth assignments were done by hand in a spreadsheet. My job was to automate the scheduling. I built a berth-scheduling service in Go. After it shipped, the fees came down a lot and the operations team was much happier.",
34
+ "expected": {
35
+ "situation": true,
36
+ "task": true,
37
+ "action": true,
38
+ "result": true,
39
+ "quantified_result": false,
40
+ "sufficient_signal": false
41
+ }
42
+ },
43
+ {
44
+ "name": "action-only",
45
+ "topic": "the caching work",
46
+ "answer": "I rewrote the consumers and added a Redis cache in front of the database.",
47
+ "expected": {
48
+ "situation": false,
49
+ "task": false,
50
+ "action": true,
51
+ "result": false,
52
+ "quantified_result": false,
53
+ "sufficient_signal": false
54
+ }
55
+ },
56
+ {
57
+ "name": "context-without-action",
58
+ "topic": "the on-call incident load at Dunav",
59
+ "answer": "Our tracking platform at Dunav kept paging us at night, and management wanted the incident count brought down before the peak shipping season.",
60
+ "expected": {
61
+ "situation": true,
62
+ "task": true,
63
+ "action": false,
64
+ "result": false,
65
+ "quantified_result": false,
66
+ "sufficient_signal": false
67
+ }
68
+ },
69
+ {
70
+ "name": "impact-claim-worth-chasing",
71
+ "topic": "their background",
72
+ "answer": "I've spent the last four years at Helioscope on the data platform team. The thing I'm proudest of is cutting our cloud bill by 60 percent in one quarter while the platform kept growing.",
73
+ "expected": {
74
+ "situation": true,
75
+ "task": false,
76
+ "action": false,
77
+ "result": true,
78
+ "quantified_result": true,
79
+ "sufficient_signal": false
80
+ },
81
+ "claim_substrings": [
82
+ "60 percent"
83
+ ]
84
+ },
85
+ {
86
+ "name": "broad-topic-one-complete-story-suffices",
87
+ "topic": "the candidate's background and the thread running through their work",
88
+ "answer": "The thread is ownership, honestly. The clearest example: at Helioscope our support team hand-triaged every incoming ticket, and renewals were slipping because first responses took hours. I proposed routing tickets with a classifier, got buy-in, and built the service myself over a quarter. It now routes 80 percent of tickets automatically and first response went from six hours to forty minutes.",
89
+ "expected": {
90
+ "situation": true,
91
+ "task": true,
92
+ "action": true,
93
+ "result": true,
94
+ "quantified_result": true,
95
+ "sufficient_signal": true
96
+ }
97
+ },
98
+ {
99
+ "name": "ongoing-review-work-is-sufficient-without-an-event",
100
+ "topic": "how they keep AI-generated code from degrading the codebase",
101
+ "answer": "I never merge what an assistant writes on trust. I read the whole diff and I'm watching for a few specific things: APIs it invented that don't exist, error handling it quietly dropped, and tests that assert nothing. When I catch one I push back in the same session and have it redo that slice instead of patching over it, and I keep each change small so a bad pattern can't hide in a big diff.",
102
+ "expected": {
103
+ "action": true,
104
+ "result": false,
105
+ "quantified_result": false,
106
+ "sufficient_signal": true
107
+ }
108
+ },
109
+ {
110
+ "name": "buzzword-process-answer-is-not-sufficient",
111
+ "topic": "how they keep AI-generated code from degrading the codebase",
112
+ "answer": "I make sure to follow best practices when I use these tools. I keep the architecture clean and the code maintainable, and I always validate the output to a high standard so quality stays where it needs to be.",
113
+ "expected": {
114
+ "action": false,
115
+ "sufficient_signal": false
116
+ }
117
+ }
118
+ ]
119
+ }
@@ -0,0 +1,80 @@
1
+ {
2
+ "description": "Synthetic coaching cases (stopgap until volunteered sessions land). Each case feeds the coach a transcript plus an authored grade with one planted weakness, and the judge checks the coach's fix is tied to that named gap rather than generic filler.",
3
+ "cases": [
4
+ {
5
+ "name": "missing-quantified-result-senior",
6
+ "target_level": "senior",
7
+ "transcript": [
8
+ {
9
+ "question": "Tell me about a project you're proud of.",
10
+ "answer": "At Northwind our checkout service kept timing out under load before big sales. I owned the fix end to end: I profiled the hot path, found the synchronous inventory call, and rewrote it as an async batch with a fallback cache. I rolled it out behind a flag and watched it through two sale weekends."
11
+ }
12
+ ],
13
+ "grade": {
14
+ "scores": [
15
+ {
16
+ "question": "Tell me about a project you're proud of.",
17
+ "rationale": "Clear end-to-end ownership at senior, concrete actions, but the story stops before any measured outcome.",
18
+ "star": {"situation": true, "task": true, "action": true, "result": false, "quantified_result": false},
19
+ "specificity": "high",
20
+ "ownership": "clear",
21
+ "weak_or_missing": ["result", "quantified_result"],
22
+ "gap": "The story never says how the checkout fix turned out; no latency or error-rate change is stated.",
23
+ "score": 3
24
+ }
25
+ ]
26
+ },
27
+ "gap_summary": "The answer describes the work in detail but never states the outcome: no number for the latency, timeout rate, or sales impact after the change."
28
+ },
29
+ {
30
+ "name": "blurred-ownership-we-throughout-senior",
31
+ "target_level": "senior",
32
+ "transcript": [
33
+ {
34
+ "question": "Tell me about a hard technical decision you were part of.",
35
+ "answer": "We were drowning in flaky integration tests, so we decided to split the suite and run it in parallel. We containerised the dependencies, we added retries for the genuinely network-bound cases, and we got the suite back under ten minutes. The whole team was a lot happier after that."
36
+ }
37
+ ],
38
+ "grade": {
39
+ "scores": [
40
+ {
41
+ "question": "Tell me about a hard technical decision you were part of.",
42
+ "rationale": "A real outcome with a number, but it is all 'we'; nothing marks what this candidate personally drove, which a senior answer needs.",
43
+ "star": {"situation": true, "task": true, "action": true, "result": true, "quantified_result": true},
44
+ "specificity": "high",
45
+ "ownership": "unclear",
46
+ "weak_or_missing": [],
47
+ "gap": "The answer is all 'we' with no personal contribution visible; it never says what the candidate themselves decided or built.",
48
+ "score": 3
49
+ }
50
+ ]
51
+ },
52
+ "gap_summary": "Every action is credited to the team as 'we'; the answer never separates out what this candidate personally decided, drove, or built."
53
+ },
54
+ {
55
+ "name": "vague-low-specificity-mid",
56
+ "target_level": "mid",
57
+ "transcript": [
58
+ {
59
+ "question": "Walk me through a time you improved something on your team.",
60
+ "answer": "I noticed our process wasn't great, so I made some changes that helped a lot. Things got way better and smoother, people were happier, and overall it was a big improvement for everyone involved."
61
+ }
62
+ ],
63
+ "grade": {
64
+ "scores": [
65
+ {
66
+ "question": "Walk me through a time you improved something on your team.",
67
+ "rationale": "Nothing concrete: no named process, no specific change, no real outcome. Pure vague betterment.",
68
+ "star": {"situation": false, "task": false, "action": true, "result": true, "quantified_result": false},
69
+ "specificity": "low",
70
+ "ownership": "clear",
71
+ "weak_or_missing": ["situation", "task", "quantified_result"],
72
+ "gap": "Nothing in the answer is concrete: it never names the process, the change, or any measurable result.",
73
+ "score": 2
74
+ }
75
+ ]
76
+ },
77
+ "gap_summary": "The answer leans entirely on vague words ('helped a lot', 'way better', 'big improvement') and never names the process, the specific change made, or a concrete result."
78
+ }
79
+ ]
80
+ }