learnx-cli 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- learnx_cli-0.3.0.dist-info/METADATA +240 -0
- learnx_cli-0.3.0.dist-info/RECORD +131 -0
- learnx_cli-0.3.0.dist-info/WHEEL +4 -0
- learnx_cli-0.3.0.dist-info/entry_points.txt +2 -0
- tutor/.env copy.example +4 -0
- tutor/__init__.py +0 -0
- tutor/__main__.py +4 -0
- tutor/assets/__init__.py +5 -0
- tutor/assets/html/fonts/Inter-Bold.woff2 +0 -0
- tutor/assets/html/fonts/Inter-Regular.woff2 +0 -0
- tutor/assets/html/fonts/Inter-SemiBold.woff2 +0 -0
- tutor/assets/html/fonts/JetBrainsMono-Regular.woff2 +0 -0
- tutor/assets/html/highlight-java.min.js +2 -0
- tutor/assets/html/highlight-javascript.min.js +2 -0
- tutor/assets/html/highlight-python.min.js +2 -0
- tutor/assets/html/highlight.min.js +17 -0
- tutor/assets/html/mermaid.min.js +31 -0
- tutor/assets/html/slide_base.css +464 -0
- tutor/assets/html/theme-learnx-dark.css +12 -0
- tutor/audio/__init__.py +0 -0
- tutor/audio/audio_builder.py +143 -0
- tutor/audio/sanitizer.py +9 -0
- tutor/audio/tts_renderer.py +54 -0
- tutor/cli/__init__.py +0 -0
- tutor/cli/commands.py +391 -0
- tutor/cli/logo.py +21 -0
- tutor/cli/playback_commands.py +239 -0
- tutor/cli/shell.py +91 -0
- tutor/cli/shell_context.py +18 -0
- tutor/cli/theme.py +39 -0
- tutor/cli/video_commands.py +123 -0
- tutor/config.py +122 -0
- tutor/conftest.py +5 -0
- tutor/constants.py +82 -0
- tutor/exceptions.py +26 -0
- tutor/generation/__init__.py +0 -0
- tutor/generation/assembler.py +81 -0
- tutor/generation/curriculum.py +97 -0
- tutor/generation/dialogue.py +172 -0
- tutor/generation/narrator.py +122 -0
- tutor/generation/segment_parser.py +223 -0
- tutor/generation/segment_planner.py +200 -0
- tutor/generation/visual_planner.py +205 -0
- tutor/infra/__init__.py +0 -0
- tutor/infra/llm.py +152 -0
- tutor/ingestion/__init__.py +0 -0
- tutor/ingestion/chunker.py +171 -0
- tutor/ingestion/doc_analyzer.py +41 -0
- tutor/ingestion/parse_content.py +19 -0
- tutor/ingestion/summarizer.py +51 -0
- tutor/inspector.py +117 -0
- tutor/llm_config.toml +58 -0
- tutor/models.py +147 -0
- tutor/player/__init__.py +0 -0
- tutor/player/input_handler.py +45 -0
- tutor/player/player.py +308 -0
- tutor/player/player_display.py +117 -0
- tutor/prompts/curriculum.txt +67 -0
- tutor/prompts/dialogue.txt +62 -0
- tutor/prompts/narrate.txt +34 -0
- tutor/prompts/qa.txt +17 -0
- tutor/prompts/summarize.txt +9 -0
- tutor/prompts/visual.txt +60 -0
- tutor/prompts/visual_v3.txt +91 -0
- tutor/qa/__init__.py +0 -0
- tutor/qa/qa.py +105 -0
- tutor/requirements-dev.txt +2 -0
- tutor/requirements.txt +12 -0
- tutor/sample_docs/headingless_large.md +1 -0
- tutor/sample_docs/headingless_test.md +1 -0
- tutor/sample_docs/java-basics.md +78 -0
- tutor/tests/__init__.py +0 -0
- tutor/tests/audio/__init__.py +0 -0
- tutor/tests/audio/test_audio_builder.py +106 -0
- tutor/tests/audio/test_sanitizer.py +41 -0
- tutor/tests/cli/__init__.py +0 -0
- tutor/tests/cli/test_commands.py +67 -0
- tutor/tests/cli/test_video_commands.py +190 -0
- tutor/tests/e2e/README.md +61 -0
- tutor/tests/e2e/__init__.py +0 -0
- tutor/tests/e2e/conftest.py +117 -0
- tutor/tests/e2e/fixtures/README.md +17 -0
- tutor/tests/e2e/fixtures/sample.md +13 -0
- tutor/tests/e2e/test_audio_quality.py +40 -0
- tutor/tests/e2e/test_av_sync.py +56 -0
- tutor/tests/e2e/test_pipeline_smoke.py +37 -0
- tutor/tests/e2e/test_slide_render.py +72 -0
- tutor/tests/e2e/test_video_streams.py +104 -0
- tutor/tests/generation/__init__.py +0 -0
- tutor/tests/generation/conftest.py +134 -0
- tutor/tests/generation/test_assembler.py +64 -0
- tutor/tests/generation/test_curriculum.py +107 -0
- tutor/tests/generation/test_narrator.py +165 -0
- tutor/tests/generation/test_segment_edge_cases.py +280 -0
- tutor/tests/generation/test_segment_planner.py +324 -0
- tutor/tests/generation/test_visual_planner.py +319 -0
- tutor/tests/ingestion/__init__.py +0 -0
- tutor/tests/ingestion/test_chunker.py +94 -0
- tutor/tests/ingestion/test_doc_analyzer.py +51 -0
- tutor/tests/player/__init__.py +0 -0
- tutor/tests/player/test_player_states.py +88 -0
- tutor/tests/test_assets.py +39 -0
- tutor/tests/test_models_visual.py +180 -0
- tutor/tests/visual/__init__.py +0 -0
- tutor/tests/visual/test_beat_timer.py +321 -0
- tutor/tests/visual/test_pipeline_integration.py +178 -0
- tutor/tests/visual/test_slide_renderer.py +298 -0
- tutor/tests/visual/test_subtitle_writer.py +165 -0
- tutor/tests/visual/test_video_assembler.py +108 -0
- tutor/tests/visual/test_visual_pipeline.py +270 -0
- tutor/tutor.py +365 -0
- tutor/visual/__init__.py +213 -0
- tutor/visual/beat_timer.py +222 -0
- tutor/visual/slide_renderer.py +236 -0
- tutor/visual/subtitle_writer.py +187 -0
- tutor/visual/templates/_base.html.j2 +40 -0
- tutor/visual/templates/analogy.html.j2 +21 -0
- tutor/visual/templates/callout.html.j2 +10 -0
- tutor/visual/templates/code_example.html.j2 +12 -0
- tutor/visual/templates/comparison.html.j2 +28 -0
- tutor/visual/templates/decision_guide.html.j2 +37 -0
- tutor/visual/templates/definition.html.j2 +13 -0
- tutor/visual/templates/diagram.html.j2 +11 -0
- tutor/visual/templates/hook_question.html.j2 +17 -0
- tutor/visual/templates/key_insight.html.j2 +9 -0
- tutor/visual/templates/memory_hook.html.j2 +7 -0
- tutor/visual/templates/outro.html.j2 +16 -0
- tutor/visual/templates/question_prompt.html.j2 +13 -0
- tutor/visual/templates/step_sequence.html.j2 +14 -0
- tutor/visual/templates/title_card.html.j2 +12 -0
- tutor/visual/video_assembler.py +299 -0

tutor/ingestion/chunker.py
ADDED

@@ -0,0 +1,171 @@

import logging
import re

from tutor.constants import (
    MAX_CHUNK_TOKENS,
    MIN_CHUNK_TOKENS,
    STRATEGY_C_OVERLAP_TOKENS,
    STRATEGY_C_WINDOW_TOKENS,
)
from tutor.ingestion import parse_content
from tutor.models import Chunk, DocProfile

log = logging.getLogger(__name__)


def chunk(text: str, profile: DocProfile) -> list[Chunk]:
    if profile.strategy == "A":
        chunks = _strategy_a(text)
    elif profile.strategy == "B":
        chunks = _strategy_b(text)
    else:
        chunks = _strategy_c(text)
    return _apply_quality_rules(chunks)


def _slugify(heading: str) -> str:
    return re.sub(r"[^a-z0-9]+", "_", heading.lower()).strip("_")


def _estimate_tokens(text: str) -> int:
    return int(len(text.split()) * 1.3)


def _strategy_a(text: str) -> list[Chunk]:
    return [
        Chunk(
            chunk_id="full_doc",
            breadcrumb="Full Document",
            heading="Full Document",
            level=0,
            token_count=_estimate_tokens(text),
            text=text,
        )
    ]


def _strategy_b(text: str) -> list[Chunk]:
    sections = re.split(r"\n(?=## )", text)
    sections = [s for s in sections if s.strip()]

    if len(sections) < 2:
        log.warning(
            "Document has no headings — falling back to Strategy C (sliding window). "
            "Consider adding ## headings to improve chunk quality."
        )
        return _strategy_c(text)

    chunks: list[Chunk] = []
    for section in sections:
        lines = section.strip().split("\n")
        heading_line = lines[0].lstrip("#").strip()
        chunks.extend(_split_section(section, heading_line, parent_heading=None))

    return chunks


def _split_section(section: str, heading: str, parent_heading: str | None) -> list[Chunk]:
    token_count = _estimate_tokens(section)

    if token_count <= MAX_CHUNK_TOKENS:
        prefix = f"## {parent_heading}\n\n" if parent_heading else ""
        return [
            Chunk(
                chunk_id=_slugify(heading),
                breadcrumb=f"{parent_heading} > {heading}" if parent_heading else heading,
                heading=heading,
                level=2,
                token_count=token_count,
                text=prefix + section,
            )
        ]

    subsections = re.split(r"\n(?=### )", section)
    if len(subsections) < 2:
        prefix = f"## {parent_heading}\n\n" if parent_heading else ""
        return [
            Chunk(
                chunk_id=_slugify(heading),
                breadcrumb=heading,
                heading=heading,
                level=2,
                token_count=token_count,
                text=prefix + section,
            )
        ]

    result: list[Chunk] = []
    for sub in subsections:
        sub_lines = sub.strip().split("\n")
        sub_heading = sub_lines[0].lstrip("#").strip()
        prefix = f"## {heading}\n\n"
        result.append(
            Chunk(
                chunk_id=_slugify(f"{heading}_{sub_heading}"),
                breadcrumb=f"{heading} > {sub_heading}",
                heading=sub_heading,
                level=3,
                token_count=_estimate_tokens(sub),
                text=prefix + sub,
            )
        )
    return result


def _strategy_c(text: str) -> list[Chunk]:
    word_window = int(STRATEGY_C_WINDOW_TOKENS / 1.3)
    word_overlap = int(STRATEGY_C_OVERLAP_TOKENS / 1.3)

    words = text.split()
    chunks: list[Chunk] = []
    start = 0
    idx = 0

    while start < len(words):
        end = min(start + word_window, len(words))
        window_words = words[start:end]

        if end < len(words):
            window_text = " ".join(window_words)
            last_period = window_text.rfind(". ")
            if last_period > len(window_text) // 2:
                window_text = window_text[: last_period + 1]
                window_words = window_text.split()

        chunk_text = " ".join(window_words)
        token_count = int(len(window_words) * 1.3)

        chunks.append(
            Chunk(
                chunk_id=f"window_{idx:03d}",
                breadcrumb=f"Window {idx + 1}",
                heading=f"Window {idx + 1}",
                level=0,
                token_count=token_count,
                text=chunk_text,
                has_code=False,
                overlapping=(idx > 0),
            )
        )

        idx += 1
        step = word_window - word_overlap
        start += max(step, 1)

    return chunks


def _apply_quality_rules(chunks: list[Chunk]) -> list[Chunk]:
    merged: list[Chunk] = []
    for c in chunks:
        if c.token_count < MIN_CHUNK_TOKENS and merged:
            prev = merged[-1]
            prev.text += "\n\n" + c.text
            prev.token_count = _estimate_tokens(prev.text)
        else:
            merged.append(c)

    for c in merged:
        parse_content.enrich(c)

    return merged
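Illustrative usage only, assuming the learnx-cli package is installed: the DocProfile below is built by hand purely to show the strategy dispatch, whereas the package normally gets it from doc_analyzer.analyze().

from tutor.ingestion.chunker import chunk
from tutor.models import DocProfile

text = (
    "## Intro\n\nJava is **statically typed**.\n\n"
    "## Loops\n\nUse a `for` loop when the iteration count is known."
)
profile = DocProfile(
    filepath="notes.md",               # placeholder path, not a file from this package
    raw_bytes=len(text.encode("utf-8")),
    estimated_tokens=int(len(text.split()) * 1.3),
    strategy="B",                      # Strategy B splits on "## " headings
    section_count=2,
    has_code_blocks=False,
    language_hint="general",
)

for c in chunk(text, profile):
    print(c.chunk_id, c.token_count, c.key_terms)

Note that _apply_quality_rules() may merge sections smaller than MIN_CHUNK_TOKENS into their predecessor, so short inputs like this one can come back as a single chunk.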
tutor/ingestion/doc_analyzer.py
ADDED

@@ -0,0 +1,41 @@

import re
from pathlib import Path
from typing import Literal

from tutor.constants import STRATEGY_A_TOKEN_LIMIT, STRATEGY_B_TOKEN_LIMIT
from tutor.exceptions import IngestionError
from tutor.models import DocProfile


def analyze(filepath: str) -> DocProfile:
    path = Path(filepath)
    try:
        text = path.read_text(encoding="utf-8")
    except OSError as e:
        raise IngestionError(f"Cannot read file: {filepath}") from e

    raw_bytes = path.stat().st_size
    word_count = len(text.split())
    estimated_tokens = int(word_count * 1.3)

    strategy: Literal["A", "B", "C"]
    if estimated_tokens <= STRATEGY_A_TOKEN_LIMIT:
        strategy = "A"
    elif estimated_tokens <= STRATEGY_B_TOKEN_LIMIT:
        strategy = "B"
    else:
        strategy = "C"

    section_count = len(re.findall(r"^#{1,3}\s", text, re.MULTILINE))
    has_code_blocks = "```" in text
    language_hint = "java" if "```java" in text.lower() else "general"

    return DocProfile(
        filepath=filepath,
        raw_bytes=raw_bytes,
        estimated_tokens=estimated_tokens,
        strategy=strategy,
        section_count=section_count,
        has_code_blocks=has_code_blocks,
        language_hint=language_hint,
    )
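A minimal sketch of the ingestion hand-off, with "sample.md" as a placeholder path: analyze() picks the strategy from the estimated token count, and the same text is then passed to the chunker.

from pathlib import Path

from tutor.ingestion import chunker, doc_analyzer

profile = doc_analyzer.analyze("sample.md")   # raises IngestionError if the file can't be read
text = Path("sample.md").read_text(encoding="utf-8")
chunks = chunker.chunk(text, profile)
print(profile.strategy, profile.estimated_tokens, len(chunks))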
tutor/ingestion/parse_content.py
ADDED

@@ -0,0 +1,19 @@

import re

from tutor.models import Chunk


def enrich(chunk: Chunk) -> Chunk:
    chunk.has_code = "```" in chunk.text

    raw_terms = re.findall(r"\*\*(.+?)\*\*|`(.+?)`", chunk.text)
    seen: set[str] = set()
    key_terms: list[str] = []
    for bold, code in raw_terms:
        term = bold or code
        if term and term not in seen:
            seen.add(term)
            key_terms.append(term)

    chunk.key_terms = key_terms
    return chunk
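A quick sketch of what enrich() extracts, using an ad-hoc Chunk whose field values are chosen only for illustration: bold and backtick spans become key_terms, and has_code flips on a ``` fence.

from tutor.ingestion.parse_content import enrich
from tutor.models import Chunk

c = Chunk(
    chunk_id="demo",
    breadcrumb="Demo",
    heading="Demo",
    level=2,
    token_count=15,
    text="Use a **HashMap** when you need `O(1)` lookup by key.",
)
enrich(c)
print(c.has_code)   # False: no ``` fence in this text
print(c.key_terms)  # ['HashMap', 'O(1)']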
tutor/ingestion/summarizer.py
ADDED

@@ -0,0 +1,51 @@

import hashlib
import logging
from pathlib import Path

from tutor.constants import PROMPT_VERSION, SUMMARY_CACHE_DIR
from tutor.infra.llm import LIMITS, LLMFn, load_prompt
from tutor.models import Chunk

log = logging.getLogger(__name__)


def summarize_all(
    chunks: list[Chunk],
    llm_fn: LLMFn,
    cache_dir: str = SUMMARY_CACHE_DIR,
) -> list[Chunk]:
    cache_path = Path(cache_dir)
    cache_path.mkdir(parents=True, exist_ok=True)

    prompt_text = load_prompt("summarize.txt")

    for c in chunks:
        if c.chunk_id == "full_doc":
            c.summary = c.text[:500]
            continue

        cache_key = hashlib.md5((c.text + PROMPT_VERSION).encode()).hexdigest()
        cache_file = cache_path / f"{cache_key}.summary.txt"

        if cache_file.exists():
            c.summary = cache_file.read_text(encoding="utf-8")
            log.debug("Cache hit for chunk %s", c.chunk_id)
            continue

        log.info("Summarizing chunk %s (%d tokens)", c.chunk_id, c.token_count)
        chunk_text = _truncate_to_tokens(c.text, LIMITS["max_summarize_input_tokens"])
        messages = [
            {"role": "system", "content": prompt_text},
            {"role": "user", "content": chunk_text},
        ]
        summary = llm_fn(messages, call_type="summarize")
        cache_file.write_text(summary, encoding="utf-8")
        c.summary = summary

    return chunks


def _truncate_to_tokens(text: str, max_tokens: int) -> str:
    max_words = int(max_tokens / 1.3)
    words = text.split()
    return " ".join(words[:max_words]) if len(words) > max_words else text
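A sketch of driving summarize_all() with a stub in place of the real LLM callable. The function still imports LIMITS and load_prompt from tutor.infra.llm (not shown in this excerpt), so this assumes the installed package can find its prompt and config files; the chunk text and the fake summary are invented for illustration, and cache_dir points at a temp directory so nothing lands in the default cache.

import tempfile

from tutor.ingestion.summarizer import summarize_all
from tutor.models import Chunk

chunks = [
    Chunk(
        chunk_id="generics",
        breadcrumb="Generics",
        heading="Generics",
        level=2,
        token_count=14,
        text="Generics let the compiler check element types at compile time.",
    )
]


def fake_llm(messages, call_type):
    # Stand-in for the real LLM callable: echo a prefix of the user content.
    return f"[{call_type}] " + messages[-1]["content"][:60]


with tempfile.TemporaryDirectory() as tmp:
    for c in summarize_all(chunks, fake_llm, cache_dir=tmp):
        print(c.chunk_id, "->", c.summary)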
tutor/inspector.py
ADDED

@@ -0,0 +1,117 @@

from tutor.constants import WPM
from tutor.models import Chunk, DocProfile, TeachingUnit


def report_ingestion(profile: DocProfile, chunks: list[Chunk]) -> None:
    print("\n=== Ingestion Report ===")
    print(f"File: {profile.filepath}")
    print(f"Raw size: {profile.raw_bytes:,} bytes")
    print(f"Estimated tokens: {profile.estimated_tokens:,}")
    print(f"Strategy: {profile.strategy}")
    print(f"Sections found: {profile.section_count}")
    print(f"Chunks created: {len(chunks)}")

    if chunks:
        avg = sum(c.token_count for c in chunks) // len(chunks)
        largest = max(chunks, key=lambda c: c.token_count)
        code_count = sum(1 for c in chunks if c.has_code)
        print(f"  Avg chunk size: {avg} tokens")
        print(f"  Largest chunk: {largest.token_count} tokens ({largest.chunk_id})")
        print(f"  Chunks with code: {code_count}/{len(chunks)}")

    print("\n=== Chunk Map ===")
    print(f"{'ID':<25} {'Heading':<35} {'Tokens':>7} {'Code'}")
    print("-" * 75)
    for c in chunks:
        code_flag = "yes" if c.has_code else "no"
        print(f"{c.chunk_id:<25} {c.heading:<35} {c.token_count:>7} {code_flag}")

    _report_warnings(chunks)
    _report_orphans(chunks)


def _report_warnings(chunks: list[Chunk]) -> None:
    from tutor.constants import MAX_CHUNK_TOKENS

    warnings = []
    for c in chunks:
        if c.token_count > MAX_CHUNK_TOKENS and c.has_code:
            warnings.append(
                f"! {c.chunk_id} — code block preserved intact at {c.token_count} tokens (correct behavior)."
            )
        elif c.token_count > MAX_CHUNK_TOKENS:
            warnings.append(
                f"! {c.chunk_id} — {c.token_count} tokens, may produce shallow dialogue."
            )

    if warnings:
        print("\n=== Chunk Quality Warnings ===")
        for w in warnings:
            print(w)


def _report_orphans(chunks: list[Chunk]) -> None:
    orphans = [c for c in chunks if c.token_count < 200]
    if orphans:
        print("\n=== Orphan Risk ===")
        for c in orphans:
            print(
                f"  {c.chunk_id} ({c.token_count} tokens) — small section, may be skipped by planner"
            )


def report_curriculum(
    units: list[TeachingUnit],
    chunks: list[Chunk],
    duration_min: int,
) -> None:
    print("\n=== Duration Plan ===")
    print(f"Target duration: {duration_min} min")
    print(f"Word budget: {duration_min * WPM} words (@ {WPM} WPM)")
    print("Silence overhead: ~1m 20s")

    print("\n=== Teaching Units ===")
    header = f"{'':45} {'Complexity':>10} {'Words':>7} {'Est. time'}"
    print(header)
    print("-" * 80)

    intro_words = 100
    intro_secs = intro_words * 60 // WPM
    print(f"{'Intro':<45} {'—':>10} {intro_words:>7} {_fmt_time(intro_secs)}")

    total_words = intro_words
    total_secs = intro_secs

    for u in units:
        secs = u.word_budget * 60 // WPM
        label = f'Unit {u.unit} "{u.concept}"'
        print(f"{label:<45} {u.complexity:>10} {u.word_budget:>7} {_fmt_time(secs)}")
        total_words += u.word_budget
        total_secs += secs

    outro_words = 80
    outro_secs = outro_words * 60 // WPM
    print(f"{'Outro (memory hook recap)':<45} {'—':>10} {outro_words:>7} {_fmt_time(outro_secs)}")
    total_words += outro_words
    total_secs += outro_secs + 80  # silence overhead

    print("-" * 80)
    print(f"{'Total':<45} {'':>10} {total_words:>7} {_fmt_time(total_secs)}")

    used_ids = {sid for u in units for sid in u.source_sections}
    used = sum(1 for c in chunks if c.chunk_id in used_ids)
    pct = used / len(chunks) * 100 if chunks else 0
    skipped = [c.chunk_id for c in chunks if c.chunk_id not in used_ids]

    print("\n=== Coverage ===")
    print(f"Sections used: {used}/{len(chunks)} ({pct:.1f}%)")
    if skipped:
        print(f"Sections skipped: {', '.join(skipped[:8])}")
        if len(skipped) > 8:
            print(f"  ... and {len(skipped) - 8} more")


def _fmt_time(seconds: int) -> str:
    m = seconds // 60
    s = seconds % 60
    return f"{m}m {s:02d}s"
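Illustrative wiring of the ingestion report, with "java-basics.md" standing in for any input file; the actual CLI plumbing lives elsewhere in the package and is not shown in this excerpt.

from pathlib import Path

from tutor import inspector
from tutor.ingestion import chunker, doc_analyzer

profile = doc_analyzer.analyze("java-basics.md")
text = Path("java-basics.md").read_text(encoding="utf-8")
chunks = chunker.chunk(text, profile)

inspector.report_ingestion(profile, chunks)
# inspector.report_curriculum(units, chunks, duration_min) would follow once the
# curriculum planner has produced TeachingUnit objects.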
tutor/llm_config.toml
ADDED

@@ -0,0 +1,58 @@

# ============================================================
# LearnX LLM configuration
# Edit this file to change models or token budgets.
# No Python code changes required.
# ============================================================

# ── Model selection ─────────────────────────────────────────
# One model name per call_type per provider.
# call_types: curriculum | dialogue | summarize | qa | visual | segments

[providers.groq]
curriculum = "llama-3.3-70b-versatile"
dialogue = "llama-3.3-70b-versatile"
summarize = "llama-3.1-8b-instant"
qa = "llama-3.1-8b-instant"
visual = "llama-3.3-70b-versatile"  # 70b needed for reliable DOT output
segments = "llama-3.3-70b-versatile"

[providers.openrouter]
curriculum = "poolside/laguna-xs.2:free"
dialogue = "poolside/laguna-xs.2:free"
summarize = "poolside/laguna-xs.2:free"
qa = "poolside/laguna-xs.2:free"
visual = "poolside/laguna-xs.2:free"
segments = "poolside/laguna-xs.2:free"

# ── Response token caps ──────────────────────────────────────
# Max tokens the model may generate per call type.
# Together with the input prompt they must stay under the
# provider's per-request token limit (Groq free tier: 6 000).

[max_tokens]
curriculum = 2000
dialogue = 2000
summarize = 400
qa = 600
visual = 1200
segments = 2000

# ── Input size limits ────────────────────────────────────────
# max_source_tokens: source text sent with each dialogue prompt
# max_summarize_input_tokens: chunk text sent to the summariser
#
# dialogue uses llama-3.3-70b-versatile (128k context on Groq) or owl-alpha (OpenRouter).
# 8000 source + ~1200 prompt overhead + 2000 response = ~11200 tokens — fine for both.
# max_summarize_input_tokens kept at 3000; those calls send much smaller chunks.

[limits]
max_source_tokens = 8000
max_summarize_input_tokens = 3000
max_visual_source_tokens = 800

# ── Call behaviour ───────────────────────────────────────────

[llm]
temperature = 0.7
retry_count = 3      # attempts before giving up
retry_delay_s = 2.0  # seconds between retries
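Since this is plain TOML, it can be inspected with the standard library (Python 3.11+). This is only an illustrative read of a local copy; the package's own loading presumably happens in tutor/infra/llm.py, which imports LIMITS used by summarizer.py above, and the relative path here assumes a source checkout.

import tomllib
from pathlib import Path

cfg = tomllib.loads(Path("tutor/llm_config.toml").read_text(encoding="utf-8"))
print(cfg["providers"]["groq"]["dialogue"])  # "llama-3.3-70b-versatile"
print(cfg["max_tokens"]["summarize"])        # 400
print(cfg["limits"]["max_source_tokens"])    # 8000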
tutor/models.py
ADDED

@@ -0,0 +1,147 @@

from __future__ import annotations

from dataclasses import dataclass, field
from typing import Literal


@dataclass
class DocProfile:
    filepath: str
    raw_bytes: int
    estimated_tokens: int
    strategy: Literal["A", "B", "C"]
    section_count: int
    has_code_blocks: bool
    language_hint: str


@dataclass
class Chunk:
    chunk_id: str
    breadcrumb: str
    heading: str
    level: int
    token_count: int
    text: str
    has_code: bool = False
    summary: str = ""
    overlapping: bool = False
    key_terms: list[str] = field(default_factory=list)


@dataclass
class TeachingUnit:
    unit: int
    concept: str
    source_sections: list[str]
    complexity: int  # 1 | 2 | 3
    word_budget: int
    key_facts: list[str]
    common_misconception: str
    good_analogy: str
    question_style: str
    memory_hook: str
    prerequisite_concepts: list[str] = field(default_factory=list)
    js_contrast: str = ""
    production_relevance: str = ""


@dataclass
class DialogueLine:
    speaker: str  # "ALEX" | "MAYA" | "SAM"
    text: str
    unit_number: int  # 0 = intro, 1+ = unit, -1 = outro


@dataclass
class RenderedSegment:
    line: DialogueLine
    audio_path: str
    duration_ms: int


@dataclass
class QAExchange:
    id: int
    unit_number: int
    unit_concept: str
    position_seconds: int
    question: str
    answer: str
    source_sections: list[str]
    timestamp: str


@dataclass
class SessionLog:
    source_file: str
    session_start: str
    format: str
    duration_minutes: int
    exchanges: list[QAExchange] = field(default_factory=list)


@dataclass
class TimingEntry:
    line_index: int  # 0-based within the unit
    speaker: str  # "ALEX" | "MAYA" | "SAM"
    text: str  # dialogue line text — for cross-referencing only
    start_ms: int  # offset from unit MP3 start, in milliseconds
    end_ms: int  # exclusive end; end_ms - start_ms == len(audio) in ms


VALID_VISUAL_TYPES: frozenset[str] = frozenset(
    {
        "hook_question",
        "definition",
        "analogy",
        "comparison",
        "code_example",
        "diagram",
        "question_prompt",
        "decision_guide",
        "key_insight",
        "memory_hook",
        "step_sequence",
        "callout",
    }
)


@dataclass
class SlideSegment:
    unit_index: int
    segment_index: int
    lines_start: int
    lines_end: int
    visual_type: str
    title: str
    body: str | None
    code: str | None
    language: str | None
    mermaid: str | None
    left: str | None
    right: str | None
    rows: list | None
    png_path: str = ""


@dataclass
class VisualSpec:
    unit_index: int
    slide_type: str  # "title_card" | "unit" | "outro"
    concept: str = ""
    hook_question: str = ""
    key_points: list[str] = field(default_factory=list)
    code_snippet: str | None = None
    diagram_type: str = "none"
    diagram_spec: str | dict[str, object] | None = None
    memory_hook: str = ""
    analogy: str = ""
    # title_card fields
    title: str = ""
    subtitle: str = ""
    doc_source: str = ""
    # outro fields
    memory_hooks: list[str] = field(default_factory=list)
    session_stats: str = ""
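Example of guarding a SlideSegment's visual_type against VALID_VISUAL_TYPES before rendering; the field values and the ValueError are illustrative, not necessarily how segment_parser.py reacts to an unknown type.

from tutor.models import VALID_VISUAL_TYPES, SlideSegment

seg = SlideSegment(
    unit_index=1,
    segment_index=0,
    lines_start=0,
    lines_end=3,
    visual_type="definition",
    title="What is a class?",
    body="A class bundles state and behaviour.",
    code=None,
    language=None,
    mermaid=None,
    left=None,
    right=None,
    rows=None,
)

if seg.visual_type not in VALID_VISUAL_TYPES:
    raise ValueError(f"unknown visual type: {seg.visual_type}")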
tutor/player/__init__.py
ADDED
File without changes
tutor/player/input_handler.py
ADDED

@@ -0,0 +1,45 @@

import logging
import sys

log = logging.getLogger(__name__)


def get_key() -> str | None:
    """Return the pressed key as a string, or None if no key is available."""
    if sys.platform == "win32":
        return _get_key_windows()
    return _get_key_unix()


def _get_key_windows() -> str | None:
    import msvcrt

    if msvcrt.kbhit():
        raw = msvcrt.getch()
        try:
            return raw.decode("utf-8")
        except UnicodeDecodeError:
            return None
    return None


def _get_key_unix() -> str | None:
    try:
        import threading

        import readchar

        result: list[str | None] = [None]

        def _read() -> None:
            result[0] = readchar.readchar()

        t = threading.Thread(target=_read, daemon=True)
        t.start()
        t.join(timeout=0.05)
        return result[0]
    except ImportError:
        log.warning(
            "readchar not installed — keyboard input unavailable on non-Windows. pip install readchar"
        )
        return None