kelam-core 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kelam_core-0.1.0/PKG-INFO +7 -0
- kelam_core-0.1.0/kelam_core/__init__.py +7 -0
- kelam_core-0.1.0/kelam_core/metrics.py +149 -0
- kelam_core-0.1.0/kelam_core/registry.py +134 -0
- kelam_core-0.1.0/kelam_core/schemas.py +310 -0
- kelam_core-0.1.0/kelam_core/translation.py +60 -0
- kelam_core-0.1.0/kelam_core.egg-info/PKG-INFO +7 -0
- kelam_core-0.1.0/kelam_core.egg-info/SOURCES.txt +11 -0
- kelam_core-0.1.0/kelam_core.egg-info/dependency_links.txt +1 -0
- kelam_core-0.1.0/kelam_core.egg-info/requires.txt +2 -0
- kelam_core-0.1.0/kelam_core.egg-info/top_level.txt +1 -0
- kelam_core-0.1.0/pyproject.toml +17 -0
- kelam_core-0.1.0/setup.cfg +4 -0
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
"""kelam-core — the shared contract both the CLI and the backend depend on.
|
|
2
|
+
|
|
3
|
+
Pure-python, light deps (pydantic + pyyaml): agent schemas, the agent-folder<->record
|
|
4
|
+
translation used by pull/push, the model registry the linter validates against, and the
|
|
5
|
+
derived call metrics used by export/stats. Keeping these here is what stops the CLI and the
|
|
6
|
+
backend from drifting on the push/pull or metrics contracts.
|
|
7
|
+
"""
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
"""Derived call metrics — pure functions over CallLog dicts.
|
|
2
|
+
|
|
3
|
+
The CallLog stores what happened (transcript, timestamps, status); these helpers derive
|
|
4
|
+
the numbers an analysis or dashboard wants (turn counts, word counts, talk ratio,
|
|
5
|
+
durations) so every export consumer gets the same definitions. Everything here is pure
|
|
6
|
+
and dependency-free: the service enriches API responses with it, the CLI summarizes
|
|
7
|
+
with it, and tests exercise it without AWS.
|
|
8
|
+
|
|
9
|
+
Per-turn latency comes from the worker's `turn_metrics` rows (one per user turn:
|
|
10
|
+
eou_delay + llm_ttft + tts_ttfb, seconds); calls recorded before capture existed
|
|
11
|
+
simply have no rows, and every latency metric degrades to None for them.
|
|
12
|
+
"""
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
from collections import Counter
|
|
16
|
+
from datetime import datetime
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
AGENT_ROLES = ("assistant", "agent")
|
|
20
|
+
|
|
21
|
+
# Flat per-call row for tabular consumers (CSV export). Transcript text is excluded —
|
|
22
|
+
# tables get the numbers, jsonl/json get the full conversation.
|
|
23
|
+
CSV_FIELDS = [
|
|
24
|
+
"call_id", "agent_id", "workspace", "direction", "status",
|
|
25
|
+
"from_number", "to_number", "started_at", "ended_at", "duration_seconds",
|
|
26
|
+
"user_turns", "agent_turns", "tool_calls", "user_words", "agent_words",
|
|
27
|
+
"agent_talk_ratio", "words_per_minute",
|
|
28
|
+
"response_latency_p50", "response_latency_p95",
|
|
29
|
+
]
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _parse_ts(value) -> datetime | None:
|
|
33
|
+
if not value:
|
|
34
|
+
return None
|
|
35
|
+
try:
|
|
36
|
+
return datetime.fromisoformat(str(value))
|
|
37
|
+
except ValueError:
|
|
38
|
+
return None
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _word_count(text: str) -> int:
|
|
42
|
+
return len((text or "").split())
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def call_duration_seconds(call: dict) -> float | None:
|
|
46
|
+
"""Stored duration if present, else derived from started_at/ended_at."""
|
|
47
|
+
if call.get("duration_seconds") is not None:
|
|
48
|
+
return call["duration_seconds"]
|
|
49
|
+
started, ended = _parse_ts(call.get("started_at")), _parse_ts(call.get("ended_at"))
|
|
50
|
+
if started and ended and started.tzinfo == ended.tzinfo:
|
|
51
|
+
return max((ended - started).total_seconds(), 0.0)
|
|
52
|
+
return None
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def turn_latencies(call: dict) -> list[float]:
    """Response-latency samples for a call, one per measurable turn (seconds).

    Each sample is eou_delay + llm_ttft + tts_ttfb. A ``turn_metrics`` row is
    skipped unless both llm_ttft and tts_ttfb are present; a missing eou_delay
    contributes 0. The result is empty when the call has no ``turn_metrics``.
    """
    rows = call.get("turn_metrics") or []
    return [
        (row.get("eou_delay") or 0.0) + row["llm_ttft"] + row["tts_ttfb"]
        for row in rows
        if row.get("llm_ttft") is not None and row.get("tts_ttfb") is not None
    ]
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def call_metrics(call: dict) -> dict:
    """Derive the per-call numbers from a CallLog-shaped dict.

    Tool activity appears in the transcript as paired segments ("→ called f(...)"
    followed by "← f returned: ..."); only the "→" segments are counted, so
    ``tool_calls`` is the number of invocations rather than of segments.

    Ratio, rate and latency entries are None when their denominator or samples
    are missing (no spoken words, no/zero duration, no latency rows).
    """
    segments = call.get("transcript") or []

    user_segments = []
    agent_segments = []
    invocations = 0
    for seg in segments:
        role = seg.get("role")
        if role == "user":
            user_segments.append(seg)
        if role in AGENT_ROLES:
            agent_segments.append(seg)
        if role == "tool" and (seg.get("text") or "").startswith("→"):
            invocations += 1

    user_words = sum(_word_count(seg.get("text", "")) for seg in user_segments)
    agent_words = sum(_word_count(seg.get("text", "")) for seg in agent_segments)
    spoken = user_words + agent_words
    duration = call_duration_seconds(call)
    latencies = sorted(turn_latencies(call))

    talk_ratio = round(agent_words / spoken, 3) if spoken else None
    words_per_minute = round(spoken / (duration / 60), 1) if duration else None
    if latencies:
        latency_p50 = round(_percentile(latencies, 50), 3)
        latency_p95 = round(_percentile(latencies, 95), 3)
    else:
        latency_p50 = latency_p95 = None

    return {
        "duration_seconds": duration,
        "user_turns": len(user_segments),
        "agent_turns": len(agent_segments),
        "tool_calls": invocations,
        "user_words": user_words,
        "agent_words": agent_words,
        "agent_talk_ratio": talk_ratio,
        "words_per_minute": words_per_minute,
        "response_latency_p50": latency_p50,
        "response_latency_p95": latency_p95,
    }
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def with_metrics(call: dict) -> dict:
    """Return a shallow copy of ``call`` with its derived ``metrics`` sub-dict set.

    This is the export enrichment; the input dict is not modified.
    """
    enriched = dict(call)
    enriched["metrics"] = call_metrics(call)
    return enriched
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def flatten_call(call: dict) -> dict:
    """Build the flat CSV row for one call.

    The row has exactly the CSV_FIELDS keys, in that order. A value comes from
    the call's metrics when the metrics define that field, otherwise from the
    call itself, otherwise None. Metrics are taken from ``call["metrics"]`` when
    present and non-empty, else computed. The transcript is never included.
    """
    metrics = call.get("metrics") or call_metrics(call)
    flat = {}
    for field in CSV_FIELDS:
        if field in metrics:
            flat[field] = metrics[field]
        else:
            flat[field] = call.get(field)
    return flat
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def _percentile(sorted_values: list[float], p: float) -> float:
|
|
109
|
+
"""Nearest-rank percentile on an already-sorted list (small-N friendly)."""
|
|
110
|
+
if not sorted_values:
|
|
111
|
+
return 0.0
|
|
112
|
+
idx = min(int(round(p / 100 * (len(sorted_values) - 1))), len(sorted_values) - 1)
|
|
113
|
+
return sorted_values[idx]
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def summarize(calls: list[dict]) -> dict:
    """Aggregate stats over a set of calls — what `kelam stats` prints.

    Returns counts by status/direction/agent (calls missing the key are grouped
    under "?"), the smallest and largest truthy ``started_at`` values, duration
    statistics over calls whose duration is known, summed turn counts, and
    response-latency percentiles pooled across every call's turns.
    """
    per_call = [call_metrics(c) for c in calls]

    durations = [m["duration_seconds"] for m in per_call if m["duration_seconds"] is not None]
    durations.sort()

    latencies = []
    for c in calls:
        latencies.extend(turn_latencies(c))
    latencies.sort()

    start_times = sorted(filter(None, (c.get("started_at") for c in calls)))

    def tally(key: str) -> dict:
        # Frequency table of one call field.
        return dict(Counter(c.get(key, "?") for c in calls))

    def total_of(name: str):
        # Sum of one per-call metric across all calls.
        return sum(m[name] for m in per_call)

    if durations:
        duration_avg = round(sum(durations) / len(durations), 1)
        duration_p50 = _percentile(durations, 50)
        duration_p95 = _percentile(durations, 95)
        duration_max = durations[-1]
    else:
        duration_avg = duration_p50 = duration_p95 = duration_max = None

    if latencies:
        latency_p50 = round(_percentile(latencies, 50), 3)
        latency_p95 = round(_percentile(latencies, 95), 3)
    else:
        latency_p50 = latency_p95 = None

    return {
        "calls": len(calls),
        "by_status": tally("status"),
        "by_direction": tally("direction"),
        "by_agent": tally("agent_id"),
        "first_call": start_times[0] if start_times else None,
        "last_call": start_times[-1] if start_times else None,
        "duration_seconds": {
            "total": round(sum(durations), 1),
            "avg": duration_avg,
            "p50": duration_p50,
            "p95": duration_p95,
            "max": duration_max,
        },
        "turns": {
            "user": total_of("user_turns"),
            "agent": total_of("agent_turns"),
            "tool_calls": total_of("tool_calls"),
        },
        # Per-turn response latency pooled over all calls; turns_measured is 0 and
        # both percentiles are None when no call carries latency rows.
        "response_latency_seconds": {
            "turns_measured": len(latencies),
            "p50": latency_p50,
            "p95": latency_p95,
        },
    }
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
"""Curated registry of supported models — the single source of truth.
|
|
2
|
+
|
|
3
|
+
The linter validates agent.yaml against this, and (later) the CLI surfaces it.
|
|
4
|
+
Adding a model to an existing provider = one entry here. Adding a NEW LLM provider
|
|
5
|
+
also needs a branch in the worker's `_build_llm` (the two are kept in sync by tests).
|
|
6
|
+
LLM ids are LiteLLM-style `provider/model`; STT/TTS ids map to LiveKit plugin classes.
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
LLM_MODELS: dict[str, str] = {
|
|
11
|
+
"anthropic/claude-haiku-4-5": "Fastest, lowest latency — default for voice",
|
|
12
|
+
"anthropic/claude-sonnet-4-6": "Balanced quality and speed",
|
|
13
|
+
"anthropic/claude-opus-4-8": "Most capable, highest latency",
|
|
14
|
+
"openai/gpt-4o": "OpenAI flagship — strong quality",
|
|
15
|
+
"openai/gpt-4o-mini": "OpenAI small — fast and cheap",
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
STT_MODELS: dict[str, str] = {
|
|
19
|
+
"deepgram/nova-3": "Recommended",
|
|
20
|
+
"deepgram/nova-2": "Older, still fine",
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
TTS_MODELS: dict[str, str] = {
|
|
24
|
+
"elevenlabs/eleven_flash_v2_5": "Lowest latency",
|
|
25
|
+
"elevenlabs/eleven_turbo_v2_5": "Low latency, a bit richer — default",
|
|
26
|
+
"elevenlabs/eleven_multilingual_v2": "Best quality, higher latency",
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
TURN_DETECTION: dict[str, str] = {
|
|
30
|
+
"vad": "Fastest, silence-based (default)",
|
|
31
|
+
"model": "Most accurate, LiveKit's audio turn detector (reads the caller's audio)",
|
|
32
|
+
"stt": "Uses the transcriber's end-of-speech signal",
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
# LiveKit enhanced noise cancellation, applied to the caller's audio before STT.
|
|
36
|
+
# Requires LiveKit Cloud transport (we're on it); ids map to plugin classes in the worker.
|
|
37
|
+
NOISE_CANCELLATION: dict[str, str] = {
|
|
38
|
+
"nc": "Remove environmental noise (traffic, fans, music) — no extra cost (default)",
|
|
39
|
+
"bvc_telephony": "Also suppress competing background voices, tuned for phone calls — extra cost",
|
|
40
|
+
"off": "No filtering — pass the raw caller audio through",
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
DEFAULT_LLM = "anthropic/claude-haiku-4-5"
|
|
44
|
+
DEFAULT_STT = "deepgram/nova-3"
|
|
45
|
+
DEFAULT_TTS = "elevenlabs/eleven_turbo_v2_5"
|
|
46
|
+
DEFAULT_VOICE_ID = "21m00Tcm4TlvDq8ikWAM" # a known-good ElevenLabs voice
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
# ---- languages ----
|
|
50
|
+
# Source of truth for what the linter accepts. STT codes are BCP-47 as Deepgram documents
|
|
51
|
+
# them (https://developers.deepgram.com/docs/models-languages-overview); "multi" is
|
|
52
|
+
# Deepgram's code-switching mode. TTS codes are the base languages each ElevenLabs model
|
|
53
|
+
# speaks. The LLMs are multilingual, so language support is decided by STT/TTS alone.
|
|
54
|
+
|
|
55
|
+
_NOVA_2_LANGUAGES = {
|
|
56
|
+
"multi", # code-switching (en + es only on nova-2)
|
|
57
|
+
"bg", "ca", "cs", "da", "da-DK", "de", "de-CH", "el", "et", "fi", "fr", "fr-CA",
|
|
58
|
+
"hi", "hu", "id", "it", "ja", "ko", "ko-KR", "lt", "lv", "ms", "nl", "nl-BE",
|
|
59
|
+
"no", "pl", "pt", "pt-BR", "pt-PT", "ro", "ru", "sk", "sv", "sv-SE",
|
|
60
|
+
"th", "th-TH", "tr", "uk", "vi",
|
|
61
|
+
"en", "en-AU", "en-GB", "en-IN", "en-NZ", "en-US",
|
|
62
|
+
"es", "es-419",
|
|
63
|
+
"zh", "zh-CN", "zh-Hans", "zh-Hant", "zh-HK", "zh-TW",
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
STT_LANGUAGES: dict[str, set[str]] = {
|
|
67
|
+
# nova-3 = nova-2's coverage plus the languages below; its "multi" mode
|
|
68
|
+
# code-switches across en/es/fr/de/hi/ru/pt/ja/it/nl.
|
|
69
|
+
"deepgram/nova-3": _NOVA_2_LANGUAGES | {
|
|
70
|
+
"ar", "be", "bn", "bs", "fa", "gu", "gu-IN", "he", "hr", "kn", "mk",
|
|
71
|
+
"mr", "sl", "sr", "ta", "te", "tl", "ur",
|
|
72
|
+
},
|
|
73
|
+
"deepgram/nova-2": _NOVA_2_LANGUAGES,
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
# Every curated ElevenLabs model is multilingual; v2.5 adds hu/no/vi over multilingual_v2.
|
|
77
|
+
_ELEVEN_V2_LANGUAGES = {
|
|
78
|
+
"ar", "bg", "cs", "da", "de", "el", "en", "es", "fi", "fil", "fr", "hi", "hr",
|
|
79
|
+
"id", "it", "ja", "ko", "ms", "nl", "pl", "pt", "ro", "ru", "sk", "sv", "ta",
|
|
80
|
+
"tr", "uk", "zh",
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
TTS_LANGUAGES: dict[str, set[str]] = {
|
|
84
|
+
"elevenlabs/eleven_flash_v2_5": _ELEVEN_V2_LANGUAGES | {"hu", "no", "vi"},
|
|
85
|
+
"elevenlabs/eleven_turbo_v2_5": _ELEVEN_V2_LANGUAGES | {"hu", "no", "vi"},
|
|
86
|
+
"elevenlabs/eleven_multilingual_v2": _ELEVEN_V2_LANGUAGES,
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
# Models that accept an explicit language code at synthesis time (ISO 639, e.g. "es").
|
|
90
|
+
# multilingual_v2 does not — it infers the language from the text.
|
|
91
|
+
TTS_LANGUAGE_ENFORCEABLE = {
|
|
92
|
+
"elevenlabs/eleven_flash_v2_5",
|
|
93
|
+
"elevenlabs/eleven_turbo_v2_5",
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
# Providers don't agree on every base code (Deepgram says Tagalog, ElevenLabs Filipino).
|
|
97
|
+
_TTS_LANGUAGE_ALIASES = {"tl": "fil"}
|
|
98
|
+
|
|
99
|
+
# Display names for prompts and linter messages (union of the sets above).
|
|
100
|
+
LANGUAGE_NAMES: dict[str, str] = {
|
|
101
|
+
"ar": "Arabic", "be": "Belarusian", "bg": "Bulgarian", "bn": "Bengali",
|
|
102
|
+
"bs": "Bosnian", "ca": "Catalan", "cs": "Czech", "da": "Danish", "de": "German",
|
|
103
|
+
"el": "Greek", "en": "English", "es": "Spanish", "et": "Estonian", "fa": "Farsi",
|
|
104
|
+
"fi": "Finnish", "fil": "Filipino", "fr": "French", "gu": "Gujarati", "he": "Hebrew",
|
|
105
|
+
"hi": "Hindi", "hr": "Croatian", "hu": "Hungarian", "id": "Indonesian",
|
|
106
|
+
"it": "Italian", "ja": "Japanese", "kn": "Kannada", "ko": "Korean",
|
|
107
|
+
"lt": "Lithuanian", "lv": "Latvian", "mk": "Macedonian", "mr": "Marathi",
|
|
108
|
+
"ms": "Malay", "nl": "Dutch", "no": "Norwegian", "pl": "Polish", "pt": "Portuguese",
|
|
109
|
+
"ro": "Romanian", "ru": "Russian", "sk": "Slovak", "sl": "Slovenian",
|
|
110
|
+
"sr": "Serbian", "sv": "Swedish", "ta": "Tamil", "te": "Telugu", "th": "Thai",
|
|
111
|
+
"tl": "Tagalog", "tr": "Turkish", "uk": "Ukrainian", "ur": "Urdu",
|
|
112
|
+
"vi": "Vietnamese", "zh": "Chinese",
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def split(model_id: str) -> tuple[str, str]:
    """Split a `provider/model` id at its first slash.

    `anthropic/claude-haiku-4-5` -> ('anthropic', 'claude-haiku-4-5'). An id
    without a slash yields (id, ''); later slashes stay in the model part.
    """
    parts = model_id.split("/", 1)
    provider = parts[0]
    model = parts[1] if len(parts) == 2 else ""
    return provider, model
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def base_language(code: str) -> str:
    """Lower-cased primary subtag of a language code: `fr-CA` -> `fr`.

    The agent-level language and regional STT codes share this base. A code
    without a hyphen is returned lower-cased as a whole.
    """
    primary = code.split("-", 1)[0]
    return primary.lower()
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def tts_language(code: str) -> str:
    """The ElevenLabs base code for an agent language.

    Reduces ``code`` to its base language, then applies the provider alias table
    (e.g. `tl` -> `fil`); codes without an alias are returned as the plain base.
    """
    base = base_language(code)
    if base in _TTS_LANGUAGE_ALIASES:
        return _TTS_LANGUAGE_ALIASES[base]
    return base
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def language_name(code: str) -> str:
    """Display name for a language code, looked up by its base language.

    Falls back to the original ``code`` (unchanged) when the base language has
    no entry in LANGUAGE_NAMES.
    """
    base = base_language(code)
    try:
        return LANGUAGE_NAMES[base]
    except KeyError:
        return code
|
|
@@ -0,0 +1,310 @@
|
|
|
1
|
+
"""Pydantic models for an agent and its call logs.
|
|
2
|
+
|
|
3
|
+
`AgentConfig` is the `agent.yaml` contract (validated against the model registry).
|
|
4
|
+
Scenarios and tools are stored separately (one record each); see translation.py.
|
|
5
|
+
CallLog requires both transcript and recording_url (a finalized call always has both).
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from datetime import datetime
|
|
10
|
+
from typing import Literal
|
|
11
|
+
|
|
12
|
+
import yaml
|
|
13
|
+
from pydantic import BaseModel, Field, field_validator, model_validator
|
|
14
|
+
|
|
15
|
+
from kelam_core import registry
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class LLMConfig(BaseModel):
    # LLM settings block of an agent config. `model` must be an id listed in
    # registry.LLM_MODELS; the numeric fields are range-checked by their Field bounds.
    model: str = registry.DEFAULT_LLM
    temperature: float = Field(default=0.4, ge=0.0, le=1.0)
    max_tokens: int = Field(default=300, ge=64, le=4096)

    @field_validator("model")
    @classmethod
    def _known(cls, value: str) -> str:
        """Accept only model ids present in the curated LLM registry."""
        if value in registry.LLM_MODELS:
            return value
        raise ValueError(f"unknown LLM model {value!r}; choices: {sorted(registry.LLM_MODELS)}")
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class STTConfig(BaseModel):
    # Speech-to-text settings block of an agent config. `model` must be an id listed
    # in registry.STT_MODELS; `language` defaults to US English.
    model: str = registry.DEFAULT_STT
    language: str = "en-US"

    @field_validator("model")
    @classmethod
    def _known(cls, value: str) -> str:
        """Accept only model ids present in the curated STT registry."""
        if value in registry.STT_MODELS:
            return value
        raise ValueError(f"unknown STT model {value!r}; choices: {sorted(registry.STT_MODELS)}")
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class VoiceSettingsConfig(BaseModel):
    """ElevenLabs voice tuning, applied only when this block is present (omit it to use the
    model's defaults). Lower `stability` = more expressive/variable delivery; higher `style`
    = more emotive; `use_speaker_boost` adds presence. These are what make a stock voice sound
    natural vs. flat — the defaults are ElevenLabs' own, which read fairly robotic."""

    # Each ratio is validated to lie in [0.0, 1.0].
    stability: float = Field(default=0.5, ge=0.0, le=1.0)
    similarity_boost: float = Field(default=0.75, ge=0.0, le=1.0)
    style: float = Field(default=0.0, ge=0.0, le=1.0)
    use_speaker_boost: bool = True
    # Validated to lie in [0.7, 1.2]; the default is 1.0.
    speed: float = Field(default=1.0, ge=0.7, le=1.2)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class TTSConfig(BaseModel):
    # Text-to-speech settings block of an agent config. `model` must be an id listed
    # in registry.TTS_MODELS.
    model: str = registry.DEFAULT_TTS
    voice_id: str = registry.DEFAULT_VOICE_ID
    # Optional ElevenLabs voice tuning; leaving it None keeps the model's default settings.
    voice_settings: VoiceSettingsConfig | None = None

    @field_validator("model")
    @classmethod
    def _known(cls, value: str) -> str:
        """Accept only model ids present in the curated TTS registry."""
        if value in registry.TTS_MODELS:
            return value
        raise ValueError(f"unknown TTS model {value!r}; choices: {sorted(registry.TTS_MODELS)}")
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class TurnConfig(BaseModel):
    # Turn-taking settings. `detection` is restricted to the three literal modes below
    # (the same keys as registry.TURN_DETECTION).
    detection: Literal["vad", "model", "stt"] = "vad"
    endpointing_min_ms: int = Field(default=500, ge=0, le=5000)
    preemptive_generation: bool = True
    interruptible: bool = True
    # Protection against false barge-in caused by background noise.
    # min_interruption_words: how many *transcribed* words the caller must produce
    # before they count as interrupting. At the default of 1, a cough or a TV (no real
    # words) no longer cuts the agent off while a genuine "stop"/"wait" still does;
    # 0 gives hair-trigger barge-in.
    # min_interruption_ms: the minimum speech duration that counts.
    # Both pair with the noise_cancellation knob.
    min_interruption_words: int = Field(default=1, ge=0, le=10)
    min_interruption_ms: int = Field(default=500, ge=0, le=5000)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
class GreetingConfig(BaseModel):
    # The line spoken live when the call starts. Leave it empty to have the agent
    # wait for the caller to speak first.
    text: str = ""
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
class PhoneConfig(BaseModel):
    # Expresses intent only: which call directions the agent should handle. The
    # number(s) actually assigned are server-managed (see AgentRecord).
    inbound: bool = True
    outbound: bool = True
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
class EndCallConfig(BaseModel):
    """Opt-in: let the agent hang up the call itself once it decides the conversation is
    done. Maps onto LiveKit's prebuilt EndCallTool (an `end_call` function tool the LLM can
    invoke). Default off — an agent with no `end_call` block, or `enabled: false`, never
    gets the tool and so cannot terminate its own call (only the peer hanging up ends it)."""

    enabled: bool = False
    # Steers WHEN the agent ends the call: appended to the tool description (LiveKit
    # `extra_description`), e.g. "only after the caller confirms they need nothing else".
    # An empty string keeps the generic default.
    instructions: str = ""
    # What the agent says immediately before hanging up (LiveKit `end_instructions`).
    goodbye: str = "say goodbye to the user"
    # Whether ending the call deletes the room, disconnecting everyone including the
    # SIP/phone leg (LiveKit `delete_room`). True is right for a 1:1 phone/web call,
    # because it actually drops the line.
    delete_room: bool = True
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
class ComplianceConfig(BaseModel):
    """HIPAA/BAA gate. Declaring `hipaa: true` marks the agent as handling PHI. Until
    `baa_attested: true`, `kelam verify`/`deploy` refuse to ship it — you must have BAAs
    signed with every provider in the call path first (Twilio, LiveKit, Deepgram,
    ElevenLabs, Anthropic, AWS). When `hipaa: true`, call recording defaults OFF (the
    worker skips egress) so PHI isn't captured to S3 without a BAA.

    This is a guard against accidentally routing real PHI before BAAs exist, plus a safe
    default — it does NOT by itself make the agent HIPAA-compliant."""

    # Both flags default to off: an agent declares PHI handling, then separately attests
    # that the BAAs are in place.
    hipaa: bool = False
    baa_attested: bool = False
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
# Providers in a call's data path that would need a signed BAA before real PHI flows.
|
|
127
|
+
# LiveKit is in the path even though we call LLM/STT/TTS directly: it carries the audio
|
|
128
|
+
# (SIP/WebRTC transport), runs the turn detector over transcript text (inference.TurnDetector,
|
|
129
|
+
# LiveKit Inference), and performs egress uploads from its own infra.
|
|
130
|
+
_PHI_PROVIDERS = "Twilio, LiveKit, Deepgram, ElevenLabs, Anthropic, AWS"
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
class AgentConfig(BaseModel):
    """The agent.yaml contract — pure user intent. Server facts live in AgentRecord."""

    name: str
    description: str = ""
    # The call's language (BCP-47, e.g. "es", "fr-CA"). One knob: it derives stt.language,
    # is enforced on TTS where supported, and steers the prompt + turn detector. Greeting
    # and scenarios should be written in this language. Fixed per agent — no mid-call
    # switching (use stt.language: multi for code-switching callers).
    language: str = "en"
    # Enhanced noise cancellation on the caller's audio before STT, for callers in noisy
    # places. "nc" removes environmental noise at no extra cost; "bvc_telephony" also
    # suppresses competing background voices (extra cost); "off" passes raw audio. See
    # registry.NOISE_CANCELLATION. Requires LiveKit Cloud transport (we run on it).
    noise_cancellation: Literal["nc", "bvc_telephony", "off"] = "nc"
    # Every nested block is optional in agent.yaml and falls back to that block's defaults.
    llm: LLMConfig = Field(default_factory=LLMConfig)
    stt: STTConfig = Field(default_factory=STTConfig)
    tts: TTSConfig = Field(default_factory=TTSConfig)
    turn: TurnConfig = Field(default_factory=TurnConfig)
    greeting: GreetingConfig = Field(default_factory=GreetingConfig)
    phone: PhoneConfig = Field(default_factory=PhoneConfig)
    end_call: EndCallConfig = Field(default_factory=EndCallConfig)
    compliance: ComplianceConfig = Field(default_factory=ComplianceConfig)

    def compliance_issues(self) -> list[str]:
        """Blocking compliance problems (empty = ok). The HIPAA/BAA gate: an agent that
        declares PHI handling must not ship until BAAs are attested. Surfaced by
        `kelam verify` and enforced by `deploy` (which refuses to assemble the runtime).

        Returns:
            A one-element list with the blocking message when ``compliance.hipaa`` is
            true and ``compliance.baa_attested`` is false; otherwise an empty list.
        """
        # NOTE(review): the provider list in the message is the static _PHI_PROVIDERS
        # string; it does not vary with the configured llm/stt/tts models — confirm that
        # is intended for agents whose LLM provider is not in that list.
        if self.compliance.hipaa and not self.compliance.baa_attested:
            return [
                "compliance: hipaa=true but baa_attested=false — do not deploy until BAAs "
                f"are signed with every provider in the call path ({_PHI_PROVIDERS}), then "
                "set compliance.baa_attested: true"
            ]
        return []

    @model_validator(mode="after")
    def _languages_consistent(self) -> "AgentConfig":
        """Derive stt.language from the agent language and reject combinations the
        pipeline can't actually run — this is what makes `kelam verify` catch them
        instead of the call failing live.

        Raises:
            ValueError: when the agent language (or an explicit stt.language) is not
                supported by the STT model, when an explicit stt.language disagrees
                with the agent language, or when the TTS model does not speak the
                agent language.
        """
        stt_langs = registry.STT_LANGUAGES[self.stt.model]
        # An English stt.language under a non-English agent language is treated as unset,
        # not a conflict: every pulled agent.yaml carries the normalized English default,
        # so "pull, set language: es, push" must derive rather than reject.
        stale_english_default = (
            registry.base_language(self.stt.language) == "en"
            and registry.base_language(self.language) != "en"
        )
        if "language" not in self.stt.model_fields_set or stale_english_default:
            if registry.base_language(self.language) != "en":
                # exact tag if the STT model knows it, else its base language
                for candidate in (self.language, registry.base_language(self.language)):
                    if candidate in stt_langs:
                        self.stt.language = candidate
                        break
                else:
                    # for/else: neither the exact tag nor the base language matched.
                    raise ValueError(
                        f"language {self.language!r} is not supported by {self.stt.model}; "
                        f"supported: {sorted(stt_langs)}"
                    )
        elif self.stt.language not in stt_langs:
            raise ValueError(
                f"stt.language {self.stt.language!r} is not supported by {self.stt.model}; "
                f"supported: {sorted(stt_langs)}"
            )
        elif (self.stt.language != "multi"
              and registry.base_language(self.stt.language) != registry.base_language(self.language)):
            raise ValueError(
                f"stt.language {self.stt.language!r} does not match the agent language "
                f"{self.language!r} — set language alone (stt.language is derived from it), "
                "or use stt.language: multi for code-switching"
            )
        # The TTS side is checked on the provider's own base code (aliases applied).
        tts_lang = registry.tts_language(self.language)
        if tts_lang not in registry.TTS_LANGUAGES[self.tts.model]:
            speakers = [m for m, langs in registry.TTS_LANGUAGES.items() if tts_lang in langs]
            hint = f"models that can: {speakers}" if speakers else "no curated TTS model can"
            raise ValueError(
                f"{registry.language_name(self.language)} ({self.language!r}) is not spoken "
                f"by {self.tts.model}; {hint}"
            )
        return self

    def to_yaml(self) -> str:
        """Serialize the config to YAML text, keeping fields in declaration order."""
        return yaml.safe_dump(self.model_dump(), sort_keys=False)

    @classmethod
    def from_yaml(cls, text: str) -> "AgentConfig":
        """Parse and validate agent.yaml text.

        An empty document is validated as an empty mapping, so it fails on the
        missing required fields rather than on the parse.

        Raises:
            pydantic.ValidationError: when the document does not satisfy the schema,
                including when its top level is not a mapping.
        """
        # model_validate (rather than cls(**data)) reports a non-mapping document as a
        # validation error instead of a bare TypeError from ** unpacking.
        return cls.model_validate(yaml.safe_load(text) or {})
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
class Scenario(BaseModel):
    # A named scenario; `body` holds its text and defaults to empty.
    name: str
    body: str = ""
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
class TranscriptSegment(BaseModel):
    # One transcript entry: the speaker's role (restricted to the literals below) and
    # the text of the segment.
    role: Literal["user", "assistant", "agent", "system", "tool"]
    text: str
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
class TurnLatency(BaseModel):
    """Per-turn pipeline latency captured by the worker from LiveKit metrics events,
    keyed by the turn's speech_id. Parts are seconds; a part is None when that stage
    didn't report for the turn (e.g. EOU metrics aren't emitted under stt-based turn
    detection, and an interrupted turn may never reach TTS)."""

    speech_id: str
    # Each stage is optional and defaults to None.
    eou_delay: float | None = None
    llm_ttft: float | None = None
    tts_ttfb: float | None = None
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
class CallLog(BaseModel):
    # One call record. call_id, agent_id, workspace, direction, started_at, transcript
    # and recording_url are required; everything else has a default.
    call_id: str
    agent_id: str
    workspace: str
    # "web" means a browser test call (WebRTC, no phone number) placed from the test page.
    direction: Literal["inbound", "outbound", "web"]
    from_number: str | None = None
    to_number: str | None = None
    started_at: datetime
    ended_at: datetime | None = None
    duration_seconds: float | None = None
    status: Literal["completed", "no_answer", "failed", "busy", "in_progress"] = "in_progress"
    transcript: list[TranscriptSegment]
    recording_url: str
    # A short-lived presigned HTTPS link to the recording, minted on read from
    # recording_url (issue #61). It is derived and never persisted, so it is absent on
    # stored records and on calls that have no recording yet (in_progress / failed dials).
    recording_playback_url: str | None = None
    # Kept optional so that records written before the worker captured latency still
    # validate.
    turn_metrics: list[TurnLatency] | None = None
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
class MessageLog(BaseModel):
    """One message (SMS/MMS today; WhatsApp-ready), either direction. A thread is keyed
    by (agent_id, peer_number); peer_number is the external party — to_number when we
    send, from_number when they do. Statuses are the provider's lifecycle vocabulary
    (Twilio's set)."""

    message_id: str
    agent_id: str
    workspace: str
    direction: Literal["inbound", "outbound"]
    from_number: str
    to_number: str
    peer_number: str
    body: str = ""
    # MMS attachments (images etc.) by URL. A message must carry a body or media — at
    # least one of them; it does not need both (enforced by _has_content).
    media_urls: list[str] = Field(default_factory=list)
    # Delivery channel. SMS today; "whatsapp" is wired through the connector contract so
    # WhatsApp slots in without a data migration (existing rows default to "sms"). A reply
    # must go out on the same channel it arrived on.
    channel: Literal["sms", "whatsapp"] = "sms"
    status: Literal[
        "queued", "accepted", "sending", "sent", "delivered",
        "undelivered", "failed", "received",
    ] = "queued"
    provider: str = "twilio"
    provider_sid: str | None = None
    created_at: datetime
    error: str | None = None

    @model_validator(mode="after")
    def _has_content(self) -> "MessageLog":
        """Reject a message that has neither a body nor any media URL."""
        if self.body or self.media_urls:
            return self
        raise ValueError("message needs a body or at least one media URL")
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
class AgentRecord(BaseModel):
    """Facts about an agent that the server manages.

    Written to .agentmeta.json on pull and never edited.
    """

    # Identity of the agent and the workspace it belongs to.
    agent_id: str
    workspace: str
    version: str
    # Defaults to a fresh empty list per instance.
    phone_numbers: list[str] = Field(default_factory=list)
    created_at: datetime
    updated_at: datetime
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
"""The translation layer: agent folder <-> JSON records.
|
|
2
|
+
|
|
3
|
+
This is the pull/push contract. `folder_to_records` runs on push (CLI side),
|
|
4
|
+
`records_to_folder` runs on pull. The records dict is what the DynamoDB repository
|
|
5
|
+
stores (one item per piece). Tools are stored as raw code text; their metadata is
|
|
6
|
+
derived by importing through kelam.tools (the one shared loader) — never re-parsed.
|
|
7
|
+
|
|
8
|
+
Records shape:
|
|
9
|
+
{
|
|
10
|
+
"config": {<AgentConfig fields>},
|
|
11
|
+
"scenarios": {"<name>": "<markdown body>", ...},
|
|
12
|
+
"tools": {"<filename.py>": "<code text>", ...},
|
|
13
|
+
}
|
|
14
|
+
"""
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
|
|
19
|
+
from kelam_core.schemas import AgentConfig
|
|
20
|
+
|
|
21
|
+
CONFIG_FILE = "agent.yaml"
|
|
22
|
+
SCENARIOS_DIR = "scenarios"
|
|
23
|
+
TOOLS_DIR = "tools"
|
|
24
|
+
_INGEST = {CONFIG_FILE} # plus scenarios/*.md and tools/*.py
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def folder_to_records(src: str | Path) -> dict:
    """Read a local agent folder and return its JSON records (the push direction).

    The config file is parsed through AgentConfig.from_yaml, so an invalid config fails
    here. Scenarios are keyed by file stem (no ".md"); tools are keyed by full filename
    and kept as raw code text.
    """
    root = Path(src)

    yaml_text = (root / CONFIG_FILE).read_text(encoding="utf-8")
    config = AgentConfig.from_yaml(yaml_text)

    # Sorted so the resulting dicts have a stable, name-ordered key order.
    scenarios = {}
    for md_path in sorted(root.joinpath(SCENARIOS_DIR).glob("*.md")):
        scenarios[md_path.stem] = md_path.read_text(encoding="utf-8")

    tools = {}
    for py_path in sorted(root.joinpath(TOOLS_DIR).glob("*.py")):
        tools[py_path.name] = py_path.read_text(encoding="utf-8")

    return {
        "config": config.model_dump(),
        "scenarios": scenarios,
        "tools": tools,
    }
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def records_to_folder(records: dict, dest: str | Path) -> Path:
|
|
43
|
+
"""Materialize JSON records into a local agent folder (pull)."""
|
|
44
|
+
dest = Path(dest)
|
|
45
|
+
dest.mkdir(parents=True, exist_ok=True)
|
|
46
|
+
|
|
47
|
+
config = AgentConfig(**records["config"])
|
|
48
|
+
(dest / CONFIG_FILE).write_text(config.to_yaml(), encoding="utf-8")
|
|
49
|
+
|
|
50
|
+
sdir = dest / SCENARIOS_DIR
|
|
51
|
+
sdir.mkdir(exist_ok=True)
|
|
52
|
+
for name, body in (records.get("scenarios") or {}).items():
|
|
53
|
+
(sdir / f"{name}.md").write_text(body, encoding="utf-8")
|
|
54
|
+
|
|
55
|
+
tdir = dest / TOOLS_DIR
|
|
56
|
+
tdir.mkdir(exist_ok=True)
|
|
57
|
+
for filename, code in (records.get("tools") or {}).items():
|
|
58
|
+
(tdir / filename).write_text(code, encoding="utf-8")
|
|
59
|
+
|
|
60
|
+
return dest
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
pyproject.toml
|
|
2
|
+
kelam_core/__init__.py
|
|
3
|
+
kelam_core/metrics.py
|
|
4
|
+
kelam_core/registry.py
|
|
5
|
+
kelam_core/schemas.py
|
|
6
|
+
kelam_core/translation.py
|
|
7
|
+
kelam_core.egg-info/PKG-INFO
|
|
8
|
+
kelam_core.egg-info/SOURCES.txt
|
|
9
|
+
kelam_core.egg-info/dependency_links.txt
|
|
10
|
+
kelam_core.egg-info/requires.txt
|
|
11
|
+
kelam_core.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
kelam_core
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "kelam-core"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Shared Kelam contracts: agent schemas, folder<->record translation, model registry, call metrics."
|
|
5
|
+
requires-python = ">=3.11"
|
|
6
|
+
# Capped at the next major per the repo dep policy (issue #40).
|
|
7
|
+
dependencies = [
|
|
8
|
+
"pydantic>=2.7,<3",
|
|
9
|
+
"pyyaml>=6.0,<7",
|
|
10
|
+
]
|
|
11
|
+
|
|
12
|
+
[build-system]
|
|
13
|
+
requires = ["setuptools>=68"]
|
|
14
|
+
build-backend = "setuptools.build_meta"
|
|
15
|
+
|
|
16
|
+
[tool.setuptools.packages.find]
|
|
17
|
+
include = ["kelam_core*"]
|