biblicus 0.13.0__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1624 @@
"""
Markov analysis backend for Biblicus.
"""

from __future__ import annotations

import json
import math
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Sequence, Tuple

from pydantic import BaseModel

from ..ai.embeddings import generate_embeddings_batch
from ..ai.llm import generate_completion
from ..context import (
    ContextPack,
    ContextPackPolicy,
    TokenBudget,
    build_context_pack,
    fit_context_pack_to_token_budget,
)
from ..corpus import Corpus
from ..models import Evidence, ExtractionRunReference, QueryBudget, RetrievalResult
from ..retrieval import hash_text
from ..text.annotate import TextAnnotateRequest, apply_text_annotate
from ..text.extract import TextExtractRequest, apply_text_extract
from ..time import utc_now_iso
from .base import CorpusAnalysisBackend
from .models import (
    AnalysisRecipeManifest,
    AnalysisRunInput,
    AnalysisRunManifest,
    MarkovAnalysisArtifactsGraphVizConfig,
    MarkovAnalysisDecodedPath,
    MarkovAnalysisModelFamily,
    MarkovAnalysisObservation,
    MarkovAnalysisObservationsEncoder,
    MarkovAnalysisOutput,
    MarkovAnalysisRecipeConfig,
    MarkovAnalysisReport,
    MarkovAnalysisSegment,
    MarkovAnalysisSegmentationMethod,
    MarkovAnalysisStageStatus,
    MarkovAnalysisState,
    MarkovAnalysisTextCollectionReport,
    MarkovAnalysisTextSourceConfig,
    MarkovAnalysisTransition,
    TopicModelingReport,
)
from .topic_modeling import TopicModelingDocument, run_topic_modeling_for_documents


class MarkovStateName(BaseModel):
    """
    Structured response for a single state name.

    :ivar state_id: State identifier.
    :vartype state_id: int
    :ivar name: Short noun-phrase name for the state.
    :vartype name: str
    """

    state_id: int
    name: str


class MarkovStateNamingResponse(BaseModel):
    """
    Structured response for state naming.

    :ivar state_names: State name assignments.
    :vartype state_names: list[MarkovStateName]
    :ivar start_state_id: Optional state id representing the start state.
    :vartype start_state_id: int or None
    :ivar end_state_id: Optional state id representing the end state.
    :vartype end_state_id: int or None
    :ivar disconnection_state_id: Optional state id representing a disconnection state.
    :vartype disconnection_state_id: int or None
    """

    state_names: List[MarkovStateName]
    start_state_id: Optional[int] = None
    end_state_id: Optional[int] = None
    disconnection_state_id: Optional[int] = None


@dataclass
class _Document:
    item_id: str
    text: str


class MarkovBackend(CorpusAnalysisBackend):
    """
    Markov analysis backend.

    :ivar analysis_id: Backend identifier.
    :vartype analysis_id: str
    """

    analysis_id = "markov"

    def run_analysis(
        self,
        corpus: Corpus,
        *,
        recipe_name: str,
        config: Dict[str, object],
        extraction_run: ExtractionRunReference,
    ) -> BaseModel:
        """
        Run Markov analysis for a corpus.

        :param corpus: Corpus to analyze.
        :type corpus: Corpus
        :param recipe_name: Human-readable recipe name.
        :type recipe_name: str
        :param config: Analysis configuration values.
        :type config: dict[str, object]
        :param extraction_run: Extraction run reference for text inputs.
        :type extraction_run: biblicus.models.ExtractionRunReference
        :return: Markov analysis output model.
        :rtype: pydantic.BaseModel
        """
        parsed_config = (
            config
            if isinstance(config, MarkovAnalysisRecipeConfig)
            else MarkovAnalysisRecipeConfig.model_validate(config)
        )
        return _run_markov(
            corpus=corpus,
            recipe_name=recipe_name,
            config=parsed_config,
            extraction_run=extraction_run,
        )

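# Illustrative usage sketch (not part of the packaged file): invoking the backend with a
# plain dict config, which run_analysis validates into MarkovAnalysisRecipeConfig. The
# corpus, extraction-run, and config values here are hypothetical placeholders; the real
# config schema lives in MarkovAnalysisRecipeConfig.
#
#     backend = MarkovBackend()
#     output = backend.run_analysis(
#         corpus=my_corpus,  # a Corpus instance
#         recipe_name="call-phases",
#         config={"model": {"family": "categorical", "n_states": 6}},
#         extraction_run=my_extraction_run,  # an ExtractionRunReference
#     )
#     print(output.report.states)
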
def _run_markov(
    *,
    corpus: Corpus,
    recipe_name: str,
    config: MarkovAnalysisRecipeConfig,
    extraction_run: ExtractionRunReference,
) -> MarkovAnalysisOutput:
    recipe = _create_recipe_manifest(name=recipe_name, config=config)
    catalog = corpus.load_catalog()
    run_id = _analysis_run_id(
        recipe_id=recipe.recipe_id,
        extraction_run=extraction_run,
        catalog_generated_at=catalog.generated_at,
    )
    run_manifest = AnalysisRunManifest(
        run_id=run_id,
        recipe=recipe,
        corpus_uri=catalog.corpus_uri,
        catalog_generated_at=catalog.generated_at,
        created_at=utc_now_iso(),
        input=AnalysisRunInput(extraction_run=extraction_run),
        artifact_paths=[],
        stats={},
    )
    run_dir = corpus.analysis_run_dir(analysis_id=MarkovBackend.analysis_id, run_id=run_id)
    run_dir.mkdir(parents=True, exist_ok=True)

    documents, text_report = _collect_documents(
        corpus=corpus,
        extraction_run=extraction_run,
        config=config.text_source,
    )
    segments = _segment_documents(documents=documents, config=config)
    observations = _build_observations(segments=segments, config=config)
    observations, topic_report = _apply_topic_modeling(observations=observations, config=config)
    observation_matrix, lengths = _encode_observations(observations=observations, config=config)

    predicted_states, transitions, state_count = _fit_and_decode(
        observations=observation_matrix,
        lengths=lengths,
        config=config,
    )

    decoded_paths = _group_decoded_paths(segments=segments, predicted_states=predicted_states)
    states = _build_states(
        segments=segments,
        predicted_states=predicted_states,
        n_states=state_count,
        max_exemplars=config.report.max_state_exemplars,
    )
    states = _assign_state_names(
        states=states,
        decoded_paths=decoded_paths,
        config=config,
    )

    artifact_paths: List[str] = [
        "output.json",
        "segments.jsonl",
        "observations.jsonl",
        "transitions.json",
    ]
    _write_segments(run_dir=run_dir, segments=segments)
    _write_observations(run_dir=run_dir, observations=observations)
    _write_transitions_json(run_dir=run_dir, transitions=transitions)
    if topic_report is not None:
        _write_topic_modeling_report(run_dir=run_dir, report=topic_report)
        _write_topic_assignments(run_dir=run_dir, observations=observations)
        artifact_paths.extend(["topic_modeling.json", "topic_assignments.jsonl"])

    if config.artifacts.graphviz.enabled:
        _write_graphviz(
            run_dir=run_dir,
            transitions=transitions,
            graphviz=config.artifacts.graphviz,
            states=states,
            decoded_paths=decoded_paths,
        )
        artifact_paths.append("transitions.dot")

    warnings = list(text_report.warnings)
    errors = list(text_report.errors)
    if topic_report is not None:
        warnings.extend(topic_report.warnings)
        errors.extend(topic_report.errors)

    report = MarkovAnalysisReport(
        text_collection=text_report,
        status=MarkovAnalysisStageStatus.COMPLETE,
        states=states,
        transitions=transitions,
        decoded_paths=decoded_paths,
        topic_modeling=topic_report,
        warnings=warnings,
        errors=errors,
    )

    run_stats = {
        "items": len({doc.item_id for doc in documents}),
        "segments": len(segments),
        "states": len(states),
        "transitions": len(transitions),
    }
    if topic_report is not None:
        run_stats["topics"] = len(topic_report.topics)
    run_manifest = run_manifest.model_copy(
        update={"artifact_paths": artifact_paths, "stats": run_stats}
    )
    _write_analysis_run_manifest(run_dir=run_dir, manifest=run_manifest)

    output = MarkovAnalysisOutput(
        analysis_id=MarkovBackend.analysis_id,
        generated_at=utc_now_iso(),
        run=run_manifest,
        report=report,
    )
    (run_dir / "output.json").write_text(output.model_dump_json(indent=2) + "\n", encoding="utf-8")
    return output


def _create_recipe_manifest(
    *, name: str, config: MarkovAnalysisRecipeConfig
) -> AnalysisRecipeManifest:
    recipe_payload = json.dumps(
        {
            "analysis_id": MarkovBackend.analysis_id,
            "name": name,
            "config": config.model_dump(),
        },
        sort_keys=True,
    )
    recipe_id = hash_text(recipe_payload)
    return AnalysisRecipeManifest(
        recipe_id=recipe_id,
        analysis_id=MarkovBackend.analysis_id,
        name=name,
        created_at=utc_now_iso(),
        config=config.model_dump(),
    )


def _analysis_run_id(
    *, recipe_id: str, extraction_run: ExtractionRunReference, catalog_generated_at: str
) -> str:
    run_seed = f"{recipe_id}:{extraction_run.as_string()}:{catalog_generated_at}"
    return hash_text(run_seed)


def _collect_documents(
    *,
    corpus: Corpus,
    extraction_run: ExtractionRunReference,
    config: MarkovAnalysisTextSourceConfig,
) -> Tuple[List[_Document], MarkovAnalysisTextCollectionReport]:
    manifest = corpus.load_extraction_run_manifest(
        extractor_id=extraction_run.extractor_id,
        run_id=extraction_run.run_id,
    )
    warnings: List[str] = []
    errors: List[str] = []
    documents: List[_Document] = []
    skipped_items = 0
    empty_texts = 0

    run_root = corpus.extraction_run_dir(
        extractor_id=extraction_run.extractor_id,
        run_id=extraction_run.run_id,
    )
    for item_result in manifest.items:
        if item_result.status != "extracted" or item_result.final_text_relpath is None:
            skipped_items += 1
            continue
        text_path = run_root / item_result.final_text_relpath
        text_value = text_path.read_text(encoding="utf-8").strip()
        if not text_value:
            empty_texts += 1
            continue
        if config.min_text_characters is not None and len(text_value) < config.min_text_characters:
            skipped_items += 1
            continue
        documents.append(_Document(item_id=item_result.item_id, text=text_value))

    if config.sample_size is not None and len(documents) > config.sample_size:
        documents = documents[: config.sample_size]
        warnings.append("Text collection truncated to sample_size")

    report = MarkovAnalysisTextCollectionReport(
        status=MarkovAnalysisStageStatus.COMPLETE,
        source_items=len(manifest.items),
        documents=len(documents),
        sample_size=config.sample_size,
        min_text_characters=config.min_text_characters,
        empty_texts=empty_texts,
        skipped_items=skipped_items,
        warnings=warnings,
        errors=errors,
    )
    if not documents:
        report = report.model_copy(update={"status": MarkovAnalysisStageStatus.FAILED})
        raise ValueError("Markov analysis requires at least one extracted text document")
    return documents, report


def _segment_documents(
    *, documents: Sequence[_Document], config: MarkovAnalysisRecipeConfig
) -> List[MarkovAnalysisSegment]:
    segments: List[MarkovAnalysisSegment] = []
    method = config.segmentation.method
    for document in documents:
        if method == MarkovAnalysisSegmentationMethod.SENTENCE:
            segments.extend(_sentence_segments(item_id=document.item_id, text=document.text))
            continue
        if method == MarkovAnalysisSegmentationMethod.FIXED_WINDOW:
            segments.extend(
                _fixed_window_segments(
                    item_id=document.item_id,
                    text=document.text,
                    max_characters=config.segmentation.fixed_window.max_characters,
                    overlap_characters=config.segmentation.fixed_window.overlap_characters,
                )
            )
            continue
        if method == MarkovAnalysisSegmentationMethod.LLM:
            segments.extend(
                _llm_segments(item_id=document.item_id, text=document.text, config=config)
            )
            continue
        if method == MarkovAnalysisSegmentationMethod.SPAN_MARKUP:
            segments.extend(
                _span_markup_segments(item_id=document.item_id, text=document.text, config=config)
            )
            continue
        raise ValueError(f"Unsupported segmentation method: {method}")
    if not segments:
        raise ValueError("Markov analysis produced no segments")
    return _add_boundary_segments(segments=segments)


def _add_boundary_segments(
    *, segments: Sequence[MarkovAnalysisSegment]
) -> List[MarkovAnalysisSegment]:
    """
    Add synthetic START/END boundary segments for each item sequence.

    This is a deterministic, programmatic boundary signal that keeps the LLM
    segmentation focused only on natural text phases. We insert:

    - a leading START segment per item
    - a trailing END segment per item

    These boundaries are added after segmentation for all methods (sentence,
    fixed-window, llm, span-markup) so the model never has to edit or reason
    about them during extraction.

    :param segments: Ordered segments grouped by item_id.
    :type segments: Sequence[MarkovAnalysisSegment]
    :return: Segments with START/END boundaries per item.
    :rtype: list[MarkovAnalysisSegment]
    """
    if not segments:
        return []
    enriched: List[MarkovAnalysisSegment] = []
    current_item: Optional[str] = None
    buffer: List[MarkovAnalysisSegment] = []

    def flush() -> None:
        item_id = buffer[0].item_id
        index = 1
        enriched.append(MarkovAnalysisSegment(item_id=item_id, segment_index=index, text="START"))
        for segment in buffer:
            index += 1
            enriched.append(
                MarkovAnalysisSegment(item_id=item_id, segment_index=index, text=segment.text)
            )
        index += 1
        enriched.append(MarkovAnalysisSegment(item_id=item_id, segment_index=index, text="END"))

    for segment in segments:
        if current_item is None:
            current_item = segment.item_id
        if segment.item_id != current_item:
            flush()
            buffer = []
            current_item = segment.item_id
        buffer.append(segment)
    flush()
    return enriched

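# Worked sketch of the boundary insertion above (illustrative values): two items whose
# segments arrive in order produce per-item START/END wrappers with reindexed segments.
#
#     input  (item_id, text): ("a", "hi"), ("a", "bye"), ("b", "yo")
#     output (item_id, segment_index, text):
#         ("a", 1, "START"), ("a", 2, "hi"), ("a", 3, "bye"), ("a", 4, "END"),
#         ("b", 1, "START"), ("b", 2, "yo"), ("b", 3, "END")
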

_SENTENCE_SPLIT = re.compile(r"(?<=[.!?])\s+")


def _sentence_segments(*, item_id: str, text: str) -> List[MarkovAnalysisSegment]:
    tokens = [token.strip() for token in _SENTENCE_SPLIT.split(text) if token.strip()]
    segments: List[MarkovAnalysisSegment] = []
    for index, token in enumerate(tokens, start=1):
        segments.append(
            MarkovAnalysisSegment(
                item_id=item_id,
                segment_index=index,
                text=token,
            )
        )
    return segments

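# The lookbehind split keeps terminal punctuation attached to each sentence. Illustrative:
#
#     >>> _SENTENCE_SPLIT.split("One. Two!  Three?")
#     ['One.', 'Two!', 'Three?']
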

def _fixed_window_segments(
    *, item_id: str, text: str, max_characters: int, overlap_characters: int
) -> List[MarkovAnalysisSegment]:
    segments: List[MarkovAnalysisSegment] = []
    if max_characters <= 0:
        raise ValueError("fixed_window.max_characters must be positive")
    if overlap_characters < 0:
        raise ValueError("fixed_window.overlap_characters must be non-negative")
    if overlap_characters >= max_characters:
        raise ValueError("fixed_window.overlap_characters must be smaller than max_characters")

    start = 0
    index = 1
    while start < len(text):
        end = min(len(text), start + max_characters)
        chunk = text[start:end].strip()
        if chunk:
            segments.append(MarkovAnalysisSegment(item_id=item_id, segment_index=index, text=chunk))
            index += 1
        if end >= len(text):
            break
        start = max(0, end - overlap_characters)
    return segments

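# Window arithmetic sketch: each window advances by max_characters - overlap_characters.
# With max_characters=10 and overlap_characters=3 over a 24-character text, the windows
# cover [0:10], [7:17], and [14:24]; the loop stops once a window reaches the end of the
# text.
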

def _llm_segments(
    *, item_id: str, text: str, config: MarkovAnalysisRecipeConfig
) -> List[MarkovAnalysisSegment]:
    llm_config = config.segmentation.llm
    if llm_config is None:
        raise ValueError("segmentation.llm is required when segmentation.method is 'llm'")
    prompt = llm_config.prompt_template.format(text=text)
    response_text = generate_completion(
        client=llm_config.client,
        system_prompt=llm_config.system_prompt,
        user_prompt=prompt,
    ).strip()
    if llm_config.client.response_format == "json_object":
        payload = _parse_json_object(response_text, error_label="LLM segmentation")
        segments_payload = payload.get("segments")
        if not isinstance(segments_payload, list):
            raise ValueError("LLM segmentation must return a JSON object with a 'segments' list")
    else:
        segments_payload = _parse_json_list(response_text, error_label="LLM segmentation")
    segments: List[MarkovAnalysisSegment] = []
    for index, value in enumerate(segments_payload, start=1):
        segment_text = str(value).strip()
        if not segment_text:
            continue
        segments.append(
            MarkovAnalysisSegment(item_id=item_id, segment_index=index, text=segment_text)
        )
    return segments


def _span_markup_segments(
    *, item_id: str, text: str, config: MarkovAnalysisRecipeConfig
) -> List[MarkovAnalysisSegment]:
    markup_config = config.segmentation.span_markup
    if markup_config is None:
        raise ValueError(
            "segmentation.span_markup is required when segmentation.method is 'span_markup'"
        )
    label_attribute = markup_config.label_attribute
    prepend_label = markup_config.prepend_label
    if label_attribute is not None or prepend_label:
        request = TextAnnotateRequest(
            text=text,
            client=markup_config.client,
            prompt_template=markup_config.prompt_template,
            system_prompt=markup_config.system_prompt,
            allowed_attributes=[label_attribute] if label_attribute else None,
            max_rounds=markup_config.max_rounds,
            max_edits_per_round=markup_config.max_edits_per_round,
        )
        result = apply_text_annotate(request)
    else:
        request = TextExtractRequest(
            text=text,
            client=markup_config.client,
            prompt_template=markup_config.prompt_template,
            system_prompt=markup_config.system_prompt,
            max_rounds=markup_config.max_rounds,
            max_edits_per_round=markup_config.max_edits_per_round,
        )
        result = apply_text_extract(request)
    segment_payloads: List[Dict[str, object]] = []
    for index, span in enumerate(result.spans, start=1):
        segment_body = str(span.text).strip()
        if not segment_body:
            continue
        segment_text = segment_body
        if prepend_label:
            if label_attribute is None:
                raise ValueError(
                    "segmentation.span_markup.label_attribute is required when "
                    "segmentation.span_markup.prepend_label is true"
                )
            label_value = str(span.attributes.get(label_attribute, "")).strip()
            if not label_value:
                raise ValueError(f"Span {index} missing label attribute '{label_attribute}'")
            segment_text = f"{label_value}\n{segment_body}"
        segment_payloads.append(
            {"segment_index": index, "body": segment_body, "text": segment_text}
        )
    segments: List[MarkovAnalysisSegment] = []
    for payload in segment_payloads:
        segments.append(
            MarkovAnalysisSegment(
                item_id=item_id,
                segment_index=int(payload["segment_index"]),
                text=str(payload["text"]),
            )
        )
    return segments


def _verify_end_label(
    *, text: str, config: MarkovAnalysisRecipeConfig
) -> Optional[Dict[str, object]]:
    markup_config = config.segmentation.span_markup
    if markup_config is None or markup_config.end_label_verifier is None:
        return None
    verifier = markup_config.end_label_verifier
    system_prompt = verifier.system_prompt.replace("{text}", text)
    user_prompt = verifier.prompt_template.replace("{text}", text)
    response_text = generate_completion(
        client=verifier.client,
        system_prompt=system_prompt,
        user_prompt=user_prompt,
    ).strip()
    payload = _parse_json_object(response_text, error_label="End label verifier")
    return {
        "is_end": bool(payload.get("is_end")),
        "reason": payload.get("reason"),
    }


def _apply_start_end_labels(
    *,
    item_id: str,
    payloads: Sequence[Dict[str, object]],
    config: MarkovAnalysisRecipeConfig,
) -> List[MarkovAnalysisSegment]:
    markup_config = config.segmentation.span_markup
    if markup_config is None:
        raise ValueError("segmentation.span_markup is required for start/end labels")
    segments: List[MarkovAnalysisSegment] = []
    for payload in payloads:
        segment_text = str(payload.get("text") or payload.get("body") or "").strip()
        if not segment_text:
            continue
        segments.append(
            MarkovAnalysisSegment(
                item_id=item_id,
                segment_index=int(payload.get("segment_index") or len(segments) + 1),
                text=segment_text,
            )
        )
    if not segments:
        return segments
    if markup_config.start_label_value:
        segments[0] = segments[0].model_copy(
            update={"text": f"{markup_config.start_label_value}\n{segments[0].text}"}
        )
    if markup_config.end_label_value:
        decision = _verify_end_label(text=segments[-1].text, config=config)
        if decision and decision.get("is_end"):
            segments[-1] = segments[-1].model_copy(
                update={"text": f"{markup_config.end_label_value}\n{segments[-1].text}"}
            )
        elif decision and not decision.get("is_end") and markup_config.end_reject_label_value:
            reason = decision.get("reason")
            prefix = markup_config.end_reject_label_value
            if reason:
                prefix = f"{prefix}\n{markup_config.end_reject_reason_prefix}: {reason}"
            segments[-1] = segments[-1].model_copy(
                update={"text": f"{prefix}\n{segments[-1].text}"}
            )
    return segments


def _parse_json_list(raw: str, *, error_label: str) -> List[object]:
    cleaned = str(raw or "").strip()
    if not cleaned:
        raise ValueError(f"{error_label} returned empty output")
    try:
        data = json.loads(cleaned)
    except json.JSONDecodeError as exc:
        raise ValueError(f"{error_label} returned invalid JSON") from exc
    if not isinstance(data, list):
        raise ValueError(f"{error_label} must return a JSON list")
    return list(data)


def _parse_json_object(raw: str, *, error_label: str) -> Dict[str, object]:
    cleaned = str(raw or "").strip()
    if not cleaned:
        raise ValueError(f"{error_label} returned empty output")
    try:
        data = json.loads(cleaned)
    except json.JSONDecodeError as exc:
        raise ValueError(f"{error_label} returned invalid JSON") from exc
    if not isinstance(data, dict):
        raise ValueError(f"{error_label} must return a JSON object")
    return dict(data)


def _sequence_lengths(segments: Sequence[MarkovAnalysisSegment]) -> List[int]:
    lengths: List[int] = []
    current_item: str = ""
    current_length = 0
    for segment in segments:
        if not current_item:
            current_item = segment.item_id
            current_length = 0
        elif segment.item_id != current_item:
            lengths.append(current_length)
            current_item = segment.item_id
            current_length = 0
        current_length += 1
    if current_item:
        lengths.append(current_length)
    return lengths

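# Illustrative: contiguous runs of the same item_id are counted as one sequence, so
# segments for items ["a", "a", "b", "b", "b"] yield lengths [2, 3]. This matches the
# `lengths` argument that hmmlearn expects for concatenated sequences.
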

def _build_observations(
    *, segments: Sequence[MarkovAnalysisSegment], config: MarkovAnalysisRecipeConfig
) -> List[MarkovAnalysisObservation]:
    observations: List[MarkovAnalysisObservation] = []
    for segment in segments:
        observations.append(
            MarkovAnalysisObservation(
                item_id=segment.item_id,
                segment_index=segment.segment_index,
                segment_text=segment.text,
            )
        )

    if config.llm_observations.enabled:
        llm = config.llm_observations
        assert llm.client is not None and llm.prompt_template is not None
        for index, observation in enumerate(observations):
            prompt = llm.prompt_template.format(segment=observation.segment_text)
            response_text = generate_completion(
                client=llm.client,
                system_prompt=llm.system_prompt,
                user_prompt=prompt,
            ).strip()
            payload = _parse_json_object(response_text, error_label="LLM observations")
            label = payload.get("label")
            confidence = payload.get("label_confidence")
            summary = payload.get("summary")
            observations[index] = observation.model_copy(
                update={
                    "llm_label": str(label).strip() if label is not None else None,
                    "llm_label_confidence": float(confidence) if confidence is not None else None,
                    "llm_summary": str(summary).strip() if summary is not None else None,
                }
            )

    if config.embeddings.enabled:
        embedding_config = config.embeddings
        assert embedding_config.client is not None
        embed_texts: List[str] = []
        for observation in observations:
            if embedding_config.text_source == "segment_text":
                embed_texts.append(observation.segment_text)
            else:
                if not observation.llm_summary:
                    raise ValueError(
                        "embeddings.text_source is 'llm_summary' but llm_summary is missing"
                    )
                embed_texts.append(observation.llm_summary)
        vectors = generate_embeddings_batch(client=embedding_config.client, texts=embed_texts)
        updated: List[MarkovAnalysisObservation] = []
        for observation, vector in zip(observations, vectors):
            updated.append(observation.model_copy(update={"embedding": vector}))
        observations = updated

    return observations

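# The per-segment LLM enrichment above expects a JSON object response. A response such as
# the following (illustrative) populates llm_label, llm_label_confidence, and llm_summary:
#
#     {"label": "greeting", "label_confidence": 0.92, "summary": "Caller says hello."}
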

def _topic_document_id(*, item_id: str, segment_index: int) -> str:
    return f"{item_id}:{segment_index}"


def _apply_topic_modeling(
    *,
    observations: Sequence[MarkovAnalysisObservation],
    config: MarkovAnalysisRecipeConfig,
) -> Tuple[List[MarkovAnalysisObservation], Optional[TopicModelingReport]]:
    topic_config = config.topic_modeling
    if not topic_config.enabled:
        return list(observations), None
    if topic_config.recipe is None:
        raise ValueError("topic_modeling.recipe is required when topic_modeling.enabled is true")

    documents: List[TopicModelingDocument] = []
    for observation in observations:
        if observation.segment_text in {"START", "END"}:
            continue
        documents.append(
            TopicModelingDocument(
                document_id=_topic_document_id(
                    item_id=observation.item_id,
                    segment_index=observation.segment_index,
                ),
                source_item_id=observation.item_id,
                text=observation.segment_text,
            )
        )

    if not documents:
        raise ValueError("Topic modeling requires at least one non-boundary segment")

    report = run_topic_modeling_for_documents(
        documents=documents,
        config=topic_config.recipe,
    )

    topic_lookup: Dict[str, Tuple[int, str]] = {}
    for topic in report.topics:
        label = str(topic.label or "").strip()
        for document_id in topic.document_ids:
            topic_lookup[str(document_id)] = (int(topic.topic_id), label)

    updated: List[MarkovAnalysisObservation] = []
    for observation in observations:
        if observation.segment_text in {"START", "END"}:
            updated.append(
                observation.model_copy(
                    update={
                        "topic_id": None,
                        "topic_label": observation.segment_text,
                    }
                )
            )
            continue
        document_id = _topic_document_id(
            item_id=observation.item_id, segment_index=observation.segment_index
        )
        assignment = topic_lookup.get(document_id)
        if assignment is None:
            raise ValueError(
                f"Topic modeling did not return an assignment for segment {document_id}"
            )
        topic_id, topic_label = assignment
        updated.append(
            observation.model_copy(update={"topic_id": topic_id, "topic_label": topic_label})
        )
    return updated, report


def _encode_observations(
    *, observations: Sequence[MarkovAnalysisObservation], config: MarkovAnalysisRecipeConfig
) -> Tuple[object, List[int]]:
    lengths = _sequence_lengths(
        [
            MarkovAnalysisSegment(
                item_id=observation.item_id,
                segment_index=observation.segment_index,
                text=observation.segment_text,
            )
            for observation in observations
        ]
    )

    if config.model.family == MarkovAnalysisModelFamily.CATEGORICAL:
        labels: List[str] = []
        for observation in observations:
            label = getattr(observation, config.observations.categorical_source, None)
            if label is None:
                raise ValueError(
                    "Categorical Markov models require categorical labels for all segments"
                )
            labels.append(str(label))
        vocabulary = {label: idx for idx, label in enumerate(sorted(set(labels)))}
        encoded = [vocabulary[label] for label in labels]
        return encoded, lengths

    encoder = config.observations.encoder
    if encoder == MarkovAnalysisObservationsEncoder.TFIDF:
        texts: List[str] = []
        for observation in observations:
            if config.observations.text_source == "segment_text":
                texts.append(observation.segment_text)
            else:
                texts.append(observation.llm_summary or "")
        return (
            _tfidf_encode(
                texts=texts,
                max_features=config.observations.tfidf.max_features,
                ngram_range=tuple(config.observations.tfidf.ngram_range),
            ),
            lengths,
        )
    if encoder == MarkovAnalysisObservationsEncoder.EMBEDDING:
        matrix: List[List[float]] = []
        for observation in observations:
            if observation.embedding is None:
                raise ValueError("Embedding observations require embeddings.enabled true")
            matrix.append([float(value) for value in observation.embedding])
        return matrix, lengths
    if encoder == MarkovAnalysisObservationsEncoder.HYBRID:
        labels = [
            str(getattr(observation, config.observations.categorical_source, "") or "")
            for observation in observations
        ]
        vocabulary = {label: idx for idx, label in enumerate(sorted(set(labels)))}
        one_hot_size = len(vocabulary)
        hybrid_matrix: List[List[float]] = []
        for observation in observations:
            if observation.embedding is None:
                raise ValueError("Hybrid observations require embeddings.enabled true")
            vector: List[float] = [float(value) for value in observation.embedding]
            numeric_value = getattr(observation, config.observations.numeric_source, None)
            confidence = float(numeric_value) if numeric_value is not None else 0.0
            vector.append(confidence)
            one_hot = [0.0 for _ in range(one_hot_size)]
            label_value = str(
                getattr(observation, config.observations.categorical_source, "") or ""
            )
            idx = vocabulary[label_value]
            one_hot[idx] = 1.0
            vector.extend(one_hot)
            hybrid_matrix.append(vector)
        return hybrid_matrix, lengths
    raise ValueError(f"Unsupported observations encoder: {encoder}")

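# Hybrid feature layout sketch: for an embedding of dimension d and a label vocabulary of
# size k, each row is [e_1 .. e_d, confidence, one_hot_1 .. one_hot_k]. With d=3, k=2,
# the label at vocabulary index 0, and confidence 0.9, a row would look like
# (illustrative values):
#
#     [0.12, -0.40, 0.88, 0.9, 1.0, 0.0]
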

def _tokenize(text: str) -> List[str]:
    return [token for token in re.split(r"[^A-Za-z0-9]+", text.lower()) if token]


def _tfidf_encode(
    *, texts: Sequence[str], max_features: int, ngram_range: Tuple[int, int]
) -> List[List[float]]:
    if max_features <= 0:
        raise ValueError("tfidf.max_features must be positive")
    min_n, max_n = ngram_range
    if min_n <= 0 or max_n < min_n:
        raise ValueError("tfidf.ngram_range is invalid")

    documents: List[List[str]] = []
    for text in texts:
        tokens = _tokenize(text)
        ngrams: List[str] = []
        for n in range(min_n, max_n + 1):
            for idx in range(0, max(0, len(tokens) - n + 1)):
                ngrams.append(" ".join(tokens[idx : idx + n]))
        documents.append(ngrams)

    df: Dict[str, int] = {}
    for doc in documents:
        for term in set(doc):
            df[term] = df.get(term, 0) + 1

    sorted_terms = sorted(df.items(), key=lambda item: (-item[1], item[0]))
    vocabulary = [term for term, _ in sorted_terms[:max_features]]
    index = {term: idx for idx, term in enumerate(vocabulary)}

    n_docs = max(1, len(documents))
    # Smoothed document-frequency ratio; unlike conventional TF-IDF, no logarithm is applied.
    idf: List[float] = []
    for term in vocabulary:
        count = df.get(term, 0)
        idf.append(float((n_docs + 1) / (count + 1)))

    vectors: List[List[float]] = []
    for doc in documents:
        tf: Dict[int, int] = {}
        for term in doc:
            term_idx = index.get(term)
            if term_idx is None:
                continue
            tf[term_idx] = tf.get(term_idx, 0) + 1
        length = sum(tf.values()) or 1
        vector = [0.0 for _ in vocabulary]
        for term_idx, count in tf.items():
            vector[term_idx] = (float(count) / float(length)) * idf[term_idx]
        vectors.append(vector)
    return vectors

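# Worked example of the weighting above (unigrams, two documents):
#
#     texts = ["hello world", "hello hello"]
#     df: hello=2, world=1; n_docs=2
#     idf: hello=(2+1)/(2+1)=1.0, world=(2+1)/(1+1)=1.5
#     "hello world" -> [0.5*1.0, 0.5*1.5] = [0.5, 0.75]
#     "hello hello" -> [1.0*1.0, 0.0]     = [1.0, 0.0]
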

def _fit_and_decode(
    *, observations: object, lengths: List[int], config: MarkovAnalysisRecipeConfig
) -> Tuple[List[int], List[MarkovAnalysisTransition], int]:
    def normalize_startprob(values: Sequence[float]) -> List[float]:
        cleaned = [float(value) if math.isfinite(float(value)) else 0.0 for value in values]
        total = sum(cleaned)
        if total <= 0.0:
            return [1.0 / float(len(cleaned)) for _ in cleaned]
        return [value / total for value in cleaned]

    def normalize_transmat(matrix: Sequence[Sequence[float]]) -> List[List[float]]:
        normalized: List[List[float]] = []
        size = len(matrix)
        for row in matrix:
            cleaned = [float(value) if math.isfinite(float(value)) else 0.0 for value in row]
            total = sum(cleaned)
            if total <= 0.0:
                normalized.append([1.0 / float(size) for _ in cleaned])
            else:
                normalized.append([value / total for value in cleaned])
        return normalized

    family = config.model.family
    try:
        from hmmlearn.hmm import CategoricalHMM, GaussianHMM
    except ImportError as import_error:
        raise ValueError(
            "Markov analysis requires an optional dependency. "
            'Install it with pip install "biblicus[markov-analysis]".'
        ) from import_error

    try:
        import numpy as np
    except ImportError:
        np = None

    def apply_normalized_params(model: object) -> None:
        # Replace any non-finite or unnormalized fitted probabilities with clean rows.
        if hasattr(model, "startprob_"):
            startprob = normalize_startprob(model.startprob_)
            model.startprob_ = np.asarray(startprob, dtype=float) if np is not None else startprob
        if hasattr(model, "transmat_"):
            cleaned_transmat = normalize_transmat(model.transmat_)
            model.transmat_ = (
                np.asarray(cleaned_transmat, dtype=float) if np is not None else cleaned_transmat
            )

    if family == MarkovAnalysisModelFamily.CATEGORICAL:
        encoded = list(observations)  # type: ignore[arg-type]
        X: object = [[int(value)] for value in encoded]
        if np is not None:
            X = np.asarray(X, dtype=int)
        model = CategoricalHMM(n_components=config.model.n_states)
    else:
        matrix = list(observations)  # type: ignore[arg-type]
        X = matrix
        if np is not None:
            X = np.asarray(matrix, dtype=float)
        model = GaussianHMM(n_components=config.model.n_states)
    model.fit(X, lengths=lengths)
    apply_normalized_params(model)
    predicted = list(model.predict(X, lengths=lengths))

    transitions: List[MarkovAnalysisTransition] = []
    transmat = getattr(model, "transmat_", None)
    if transmat is not None:
        for from_state in range(len(transmat)):
            row = transmat[from_state]
            for to_state in range(len(row)):
                weight = float(row[to_state])
                if weight <= 0.0:
                    continue
                transitions.append(
                    MarkovAnalysisTransition(
                        from_state=from_state,
                        to_state=to_state,
                        weight=weight,
                    )
                )
    else:
        transitions = _transitions_from_sequence(predicted)

    return predicted, transitions, config.model.n_states


def _transitions_from_sequence(states: Sequence[int]) -> List[MarkovAnalysisTransition]:
    counts: Dict[Tuple[int, int], int] = {}
    totals: Dict[int, int] = {}
    for prev, nxt in zip(states, states[1:]):
        counts[(prev, nxt)] = counts.get((prev, nxt), 0) + 1
        totals[prev] = totals.get(prev, 0) + 1
    transitions: List[MarkovAnalysisTransition] = []
    for (prev, nxt), count in sorted(counts.items()):
        denom = max(1, totals.get(prev, 0))
        transitions.append(
            MarkovAnalysisTransition(
                from_state=prev, to_state=nxt, weight=float(count) / float(denom)
            )
        )
    return transitions

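# Worked example: the state sequence [0, 1, 0, 1, 1] yields pair counts (0->1): 2,
# (1->0): 1, and (1->1): 1, so the weights are 0->1 = 2/2 = 1.0, 1->0 = 1/2 = 0.5,
# and 1->1 = 1/2 = 0.5.
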

def _group_decoded_paths(
    *, segments: Sequence[MarkovAnalysisSegment], predicted_states: Sequence[int]
) -> List[MarkovAnalysisDecodedPath]:
    paths: Dict[str, List[int]] = {}
    for segment, state in zip(segments, predicted_states):
        paths.setdefault(segment.item_id, []).append(int(state))
    return [
        MarkovAnalysisDecodedPath(item_id=item_id, state_sequence=sequence)
        for item_id, sequence in sorted(paths.items())
    ]


def _build_states(
    *,
    segments: Sequence[MarkovAnalysisSegment],
    predicted_states: Sequence[int],
    n_states: int,
    max_exemplars: int,
) -> List[MarkovAnalysisState]:
    exemplars: Dict[int, List[str]] = {idx: [] for idx in range(n_states)}
    for segment, state in zip(segments, predicted_states):
        exemplar_list = exemplars.get(int(state))
        if exemplar_list is None:
            continue
        boundary_token = str(segment.text).strip().upper()
        if boundary_token in {"START", "END"} and boundary_token not in exemplar_list:
            if max_exemplars > 0 and len(exemplar_list) >= max_exemplars:
                exemplar_list[-1] = boundary_token
                continue
            exemplar_list.append(boundary_token)
            continue
        if len(exemplar_list) >= max_exemplars:
            continue
        exemplar_list.append(segment.text)
    states: List[MarkovAnalysisState] = []
    for state_id in range(n_states):
        states.append(
            MarkovAnalysisState(
                state_id=state_id,
                label=None,
                exemplars=exemplars.get(state_id, []),
            )
        )
    return states


def _state_naming_context_pack(
    *,
    states: Sequence[MarkovAnalysisState],
    config: MarkovAnalysisRecipeConfig,
    position_stats: Optional[Dict[int, Dict[str, float]]] = None,
) -> Tuple[ContextPack, ContextPackPolicy]:
    naming = config.report.state_naming
    if naming is None or not naming.enabled:
        return ContextPack(text="", evidence_count=0, blocks=[]), ContextPackPolicy()
    evidence: List[Evidence] = []
    rank = 1
    for state in states:
        stats = (position_stats or {}).get(state.state_id)
        if stats:
            after_start = stats.get("after_start_pct", 0.0) * 100.0
            before_end = stats.get("before_end_pct", 0.0) * 100.0
            avg_position = stats.get("avg_position_pct", 0.0) * 100.0
            hint_text = (
                "Position hints:\n"
                f"- After START: {after_start:.1f}% of transitions from START\n"
                f"- Before END: {before_end:.1f}% of transitions to END\n"
                f"- Average position: {avg_position:.1f}% of call length"
            )
            evidence.append(
                Evidence(
                    item_id=f"state-{state.state_id}",
                    source_uri=None,
                    media_type="text/plain",
                    score=1.0,
                    rank=rank,
                    text=f"State {state.state_id}:\n{hint_text}",
                    stage="state-naming",
                    stage_scores=None,
                    recipe_id="state-naming",
                    run_id="state-naming",
                    hash=None,
                )
            )
            rank += 1
        exemplars = list(state.exemplars)[: naming.max_exemplars_per_state]
        for index, exemplar in enumerate(exemplars, start=1):
            text = f"State {state.state_id} exemplar {index}:\n{exemplar}"
            evidence.append(
                Evidence(
                    item_id=f"state-{state.state_id}",
                    source_uri=None,
                    media_type="text/plain",
                    score=1.0,
                    rank=rank,
                    text=text,
                    stage="state-naming",
                    stage_scores=None,
                    recipe_id="state-naming",
                    run_id="state-naming",
                    hash=None,
                )
            )
            rank += 1
    retrieval_result = RetrievalResult(
        query_text="state-naming",
        budget=QueryBudget(max_total_items=max(len(evidence), 1)),
        run_id="state-naming",
        recipe_id="state-naming",
        backend_id="state-naming",
        generated_at=utc_now_iso(),
        evidence=evidence,
        stats={},
    )
    policy = ContextPackPolicy(join_with="\n\n", ordering="rank", include_metadata=False)
    context_pack = build_context_pack(retrieval_result, policy=policy)
    fitted_pack = fit_context_pack_to_token_budget(
        context_pack,
        policy=policy,
        token_budget=TokenBudget(max_tokens=naming.token_budget),
    )
    return fitted_pack, policy


def _validate_state_names(
    *,
    response: MarkovStateNamingResponse,
    state_ids: Sequence[int],
    max_name_words: int,
) -> Dict[int, str]:
    def require_short_noun_phrase(name: str, max_words: int) -> None:
        raw_name = str(name).strip()
        tokens = [token for token in raw_name.split() if token]
        word_count = len(tokens)
        if word_count == 0 or word_count > max_words:
            raise ValueError("State names must be short noun phrases")
        if any(symbol in raw_name for symbol in (".", "!", "?", ":", ";")):
            raise ValueError("State names must be short noun phrases without sentence punctuation")
        lower_tokens = [token.lower() for token in tokens]
        if lower_tokens[0] in ("to", "please"):
            raise ValueError("State names must be short noun phrases")
        forbidden_auxiliaries = {
            "am",
            "are",
            "be",
            "been",
            "being",
            "is",
            "was",
            "were",
            "can",
            "could",
            "do",
            "does",
            "did",
            "doing",
            "have",
            "has",
            "had",
            "having",
            "may",
            "might",
            "must",
            "shall",
            "should",
            "will",
            "would",
        }
        if any(token in forbidden_auxiliaries for token in lower_tokens):
            raise ValueError("State names must be short noun phrases without verbs")

    names: Dict[int, str] = {}
    seen_names: Dict[str, int] = {}
    for entry in response.state_names:
        raw_name = str(entry.name).strip()
        require_short_noun_phrase(raw_name, max_name_words)
        if entry.state_id in names:
            raise ValueError("State naming response contains duplicate state_id values")
        normalized = raw_name.lower()
        if normalized in seen_names:
            raise ValueError("State naming response contains duplicate state names")
        names[entry.state_id] = raw_name
        seen_names[normalized] = entry.state_id
    missing = [state_id for state_id in state_ids if state_id not in names]
    if missing:
        raise ValueError("State naming response missing required state_id values")
    return names

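# Illustrative outcomes of the validation above, with max_name_words=4:
#
#     "Account Verification"       -> accepted
#     "Please verify the account"  -> rejected (imperative lead-in "please")
#     "Agent is closing the call"  -> rejected (over the word limit; auxiliary "is")
#     "Closing."                   -> rejected (sentence punctuation)
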

def _assign_state_names(
    *,
    states: Sequence[MarkovAnalysisState],
    decoded_paths: Sequence[MarkovAnalysisDecodedPath],
    config: MarkovAnalysisRecipeConfig,
) -> List[MarkovAnalysisState]:
    naming = config.report.state_naming
    if naming is None or not naming.enabled:
        return list(states)
    if naming.client is None:
        raise ValueError("report.state_naming.client is required when enabled")
    if not states:
        return list(states)
    start_state_id = _select_boundary_state_id(states=states, boundary_label="START")
    end_state_id = _select_boundary_state_id(states=states, boundary_label="END")
    sanitized_states = _strip_boundary_exemplars(
        states=states,
        boundary_label="START",
        allowed_state_id=start_state_id,
    )
    sanitized_states = _strip_boundary_exemplars(
        states=sanitized_states,
        boundary_label="END",
        allowed_state_id=end_state_id,
    )
    naming_states = [
        state for state in sanitized_states if state.state_id not in {start_state_id, end_state_id}
    ]
    if not naming_states:
        return _apply_boundary_labels(
            states=sanitized_states,
            start_state_id=start_state_id,
            end_state_id=end_state_id,
        )
    state_ids = [state.state_id for state in naming_states]
    position_stats = _compute_state_position_stats(
        decoded_paths=decoded_paths,
        start_state_id=start_state_id,
        end_state_id=end_state_id,
    )
    context_pack, _policy = _state_naming_context_pack(
        states=naming_states,
        config=config,
        position_stats=position_stats,
    )
    system_prompt = str(naming.system_prompt or "").format(context_pack=context_pack.text)
    user_prompt = str(naming.prompt_template or "").format(
        state_ids=", ".join(str(state_id) for state_id in state_ids),
        state_count=len(state_ids),
    )
    last_error: Optional[str] = None
    for _attempt in range(naming.max_retries + 1):
        if last_error is not None:
            user_prompt = (
                f"{user_prompt}\n\nPrevious response:\n{last_error}\n\n"
                "Fix the issues and return only JSON."
            )
        response_text = generate_completion(
            client=naming.client,
            system_prompt=system_prompt,
            user_prompt=user_prompt,
        ).strip()
        try:
            # Parse and schema-validate inside the retry loop so a malformed or
            # off-schema response is retried instead of aborting the run.
            payload = _parse_json_object(response_text, error_label="Markov state naming")
            response = MarkovStateNamingResponse.model_validate(payload)
            names = _validate_state_names(
                response=response,
                state_ids=state_ids,
                max_name_words=naming.max_name_words,
            )
        except ValueError as exc:
            last_error = f"{response_text}\n\nError: {exc}"
            continue
        updated_states: List[MarkovAnalysisState] = []
        for state in sanitized_states:
            if start_state_id is not None and state.state_id == start_state_id:
                updated_states.append(state.model_copy(update={"label": "START"}))
                continue
            if end_state_id is not None and state.state_id == end_state_id:
                updated_states.append(state.model_copy(update={"label": "END"}))
                continue
            base_label = names.get(state.state_id)
            if base_label is None:
                updated_states.append(state)
                continue
            updated_states.append(state.model_copy(update={"label": base_label}))
        return updated_states
    error_text = last_error or "unknown error"
    raise ValueError(f"Markov state naming failed after retries: {error_text}")


def _select_boundary_state_id(
    *, states: Sequence[MarkovAnalysisState], boundary_label: str
) -> Optional[int]:
    candidates: List[Tuple[int, int, int]] = []
    normalized_label = boundary_label.strip().upper()
    for state in states:
        exemplars = [str(exemplar).strip().upper() for exemplar in (state.exemplars or [])]
        match_count = sum(1 for exemplar in exemplars if exemplar == normalized_label)
        if match_count:
            candidates.append((match_count, len(exemplars), state.state_id))
    if not candidates:
        return None
    candidates.sort(reverse=True)
    return candidates[0][2]


def _strip_boundary_exemplars(
    *,
    states: Sequence[MarkovAnalysisState],
    boundary_label: str,
    allowed_state_id: Optional[int],
) -> List[MarkovAnalysisState]:
    normalized_label = boundary_label.strip().upper()
    updated_states: List[MarkovAnalysisState] = []
    for state in states:
        exemplars = list(state.exemplars or [])
        if allowed_state_id is None or state.state_id != allowed_state_id:
            exemplars = [
                exemplar
                for exemplar in exemplars
                if str(exemplar).strip().upper() != normalized_label
            ]
        updated_states.append(state.model_copy(update={"exemplars": exemplars}))
    return updated_states


def _apply_boundary_labels(
    *,
    states: Sequence[MarkovAnalysisState],
    start_state_id: Optional[int],
    end_state_id: Optional[int],
) -> List[MarkovAnalysisState]:
    updated_states: List[MarkovAnalysisState] = []
    for state in states:
        if start_state_id is not None and state.state_id == start_state_id:
            updated_states.append(state.model_copy(update={"label": "START"}))
            continue
        if end_state_id is not None and state.state_id == end_state_id:
            updated_states.append(state.model_copy(update={"label": "END"}))
            continue
        updated_states.append(state)
    return updated_states


def _compute_state_position_stats(
    *,
    decoded_paths: Sequence[MarkovAnalysisDecodedPath],
    start_state_id: Optional[int],
    end_state_id: Optional[int],
) -> Dict[int, Dict[str, float]]:
    after_start_counts: Dict[int, int] = {}
    before_end_counts: Dict[int, int] = {}
    avg_position_sums: Dict[int, float] = {}
    avg_position_counts: Dict[int, int] = {}
    total_after_start = 0
    total_before_end = 0

    for path in decoded_paths:
        sequence = list(path.state_sequence)
        if len(sequence) < 2:
            continue
        last_index = max(1, len(sequence) - 1)
        for index, state_id in enumerate(sequence):
            if state_id in {start_state_id, end_state_id}:
                continue
            avg_position_sums[state_id] = avg_position_sums.get(state_id, 0.0) + (
                float(index) / float(last_index)
            )
            avg_position_counts[state_id] = avg_position_counts.get(state_id, 0) + 1
        for from_state, to_state in zip(sequence, sequence[1:]):
            if start_state_id is not None and from_state == start_state_id:
                total_after_start += 1
                after_start_counts[to_state] = after_start_counts.get(to_state, 0) + 1
            if end_state_id is not None and to_state == end_state_id:
                total_before_end += 1
                before_end_counts[from_state] = before_end_counts.get(from_state, 0) + 1

    stats: Dict[int, Dict[str, float]] = {}
    state_ids = set(avg_position_counts) | set(after_start_counts) | set(before_end_counts)
    for state_id in state_ids:
        avg_count = avg_position_counts.get(state_id, 0)
        stats[state_id] = {
            "after_start_pct": (
                after_start_counts.get(state_id, 0) / total_after_start
                if total_after_start
                else 0.0
            ),
            "before_end_pct": (
                before_end_counts.get(state_id, 0) / total_before_end if total_before_end else 0.0
            ),
            "avg_position_pct": (
                avg_position_sums.get(state_id, 0.0) / avg_count if avg_count else 0.0
            ),
        }
    return stats
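
# Worked example: with start_state_id=0, end_state_id=4, and one decoded path
# [0, 2, 3, 2, 4], state 2 follows START in 100% of START transitions, precedes END in
# 100% of END transitions, and sits at an average of 50% of the path (indices 1 and 3
# out of 4). State 3 gets after_start_pct=0.0, before_end_pct=0.0, avg_position_pct=0.5.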


def _write_analysis_run_manifest(*, run_dir: Path, manifest: AnalysisRunManifest) -> None:
    (run_dir / "manifest.json").write_text(
        manifest.model_dump_json(indent=2) + "\n", encoding="utf-8"
    )


def _write_segments(*, run_dir: Path, segments: Sequence[MarkovAnalysisSegment]) -> None:
    lines = [segment.model_dump_json() for segment in segments]
    (run_dir / "segments.jsonl").write_text("\n".join(lines) + "\n", encoding="utf-8")


def _write_observations(
    *, run_dir: Path, observations: Sequence[MarkovAnalysisObservation]
) -> None:
    lines = [observation.model_dump_json() for observation in observations]
    (run_dir / "observations.jsonl").write_text("\n".join(lines) + "\n", encoding="utf-8")


def _write_transitions_json(
    *, run_dir: Path, transitions: Sequence[MarkovAnalysisTransition]
) -> None:
    payload = [transition.model_dump() for transition in transitions]
    (run_dir / "transitions.json").write_text(
        json.dumps(payload, indent=2) + "\n", encoding="utf-8"
    )


def _write_topic_modeling_report(*, run_dir: Path, report: TopicModelingReport) -> None:
    (run_dir / "topic_modeling.json").write_text(
        report.model_dump_json(indent=2) + "\n", encoding="utf-8"
    )


def _write_topic_assignments(
    *,
    run_dir: Path,
    observations: Sequence[MarkovAnalysisObservation],
) -> None:
    lines: List[str] = []
    for observation in observations:
        payload = {
            "item_id": observation.item_id,
            "segment_index": observation.segment_index,
            "segment_text": observation.segment_text,
            "topic_id": observation.topic_id,
            "topic_label": observation.topic_label,
        }
        lines.append(json.dumps(payload, ensure_ascii=True))
    (run_dir / "topic_assignments.jsonl").write_text("\n".join(lines) + "\n", encoding="utf-8")


def _write_graphviz(
    *,
    run_dir: Path,
    transitions: Sequence[MarkovAnalysisTransition],
    graphviz: MarkovAnalysisArtifactsGraphVizConfig,
    states: Sequence[MarkovAnalysisState],
    decoded_paths: Sequence[MarkovAnalysisDecodedPath],
) -> None:
    """
    Write GraphViz transition output for Markov analysis.

    The exported edge labels are meant for humans. When decoded paths are
    available, each edge label is the empirical transition percentage derived
    from those paths, rendered as ``X.Y%``: transitions out of each state are
    counted across all decoded sequences and the edge weight is the observed
    fraction for that edge. When a source state never appears in the decoded
    paths, the label falls back to the model transition probability from the
    fitted HMM.

    Edges are filtered by ``graphviz.min_edge_weight`` using the same weight
    that is rendered (empirical when available, otherwise the model weight),
    and edges present in the model but never observed in the decoded paths are
    dropped. This keeps the visualization faithful to observed sequences
    instead of solely model priors.

    :param run_dir: Directory where the ``transitions.dot`` file is written.
    :type run_dir: pathlib.Path
    :param transitions: Markov transition edges with model weights.
    :type transitions: Sequence[MarkovAnalysisTransition]
    :param graphviz: GraphViz export configuration.
    :type graphviz: MarkovAnalysisArtifactsGraphVizConfig
    :param states: Markov states with labels and exemplars.
    :type states: Sequence[MarkovAnalysisState]
    :param decoded_paths: Per-item decoded state sequences.
    :type decoded_paths: Sequence[MarkovAnalysisDecodedPath]
    :return: None. Writes ``transitions.dot`` to ``run_dir``.
    :rtype: None
    """

    def infer_state_id_by_label(
        labels: Dict[int, str],
        keywords: Tuple[str, ...],
        exemplars_by_state: Dict[int, int],
    ) -> Optional[int]:
        matches: List[int] = []
        for state_id, label in labels.items():
            normalized = label.lower()
            if any(keyword in normalized for keyword in keywords):
                matches.append(state_id)
        if not matches:
            return None
        if len(matches) == 1:
            return matches[0]
        return max(matches, key=lambda state_id: exemplars_by_state.get(state_id, 0))

    lines: List[str] = []
    lines.append("digraph markov {")
    rankdir = str(graphviz.rankdir or "LR").upper()
    lines.append(f' rankdir="{rankdir}";')
    label_by_state: Dict[int, str] = {}
    exemplars_by_state: Dict[int, int] = {}
    exemplar_start: Dict[int, bool] = {}
    exemplar_end: Dict[int, bool] = {}
    for state in states:
        label_by_state[state.state_id] = str(state.label or "")
        exemplars_by_state[state.state_id] = len(state.exemplars or [])
        base_label = str(state.label or str(state.state_id))
        exemplar_start[state.state_id] = any(
            str(exemplar).strip().upper() == "START" for exemplar in (state.exemplars or [])
        )
        exemplar_end[state.state_id] = any(
            str(exemplar).strip().upper() == "END" for exemplar in (state.exemplars or [])
        )
        label = base_label
        safe_label = label.replace('"', '\\"')
        lines.append(f' {state.state_id} [label="{safe_label}"];')
    start_state_id = graphviz.start_state_id
    end_state_id = graphviz.end_state_id
    if start_state_id is None:
        matching = [state_id for state_id, has in exemplar_start.items() if has]
        if matching:
            start_state_id = max(matching, key=lambda state_id: exemplars_by_state.get(state_id, 0))
        else:
            start_state_id = infer_state_id_by_label(
                label_by_state,
                ("start", "greeting", "opening"),
                exemplars_by_state,
            )
    if end_state_id is None:
        matching = [state_id for state_id, has in exemplar_end.items() if has]
        if matching:
            end_state_id = max(matching, key=lambda state_id: exemplars_by_state.get(state_id, 0))
        else:
            end_state_id = infer_state_id_by_label(
                label_by_state,
                ("end", "closing", "goodbye", "wrap-up"),
                exemplars_by_state,
            )
    if start_state_id is not None:
        lines.append(f" {{ rank=min; {start_state_id}; }}")
        lines.append(
            f' {start_state_id} [shape="ellipse", peripheries=2, style="bold", color="#2b8a3e"];'
        )
    if end_state_id is not None:
        lines.append(f" {{ rank=max; {end_state_id}; }}")
        lines.append(f' {end_state_id} [shape="ellipse", peripheries=2, color="#b42318"];')
    observed_counts: Dict[Tuple[int, int], int] = {}
    observed_totals_by_state: Dict[int, int] = {}
    for path in decoded_paths:
        sequence = list(path.state_sequence)
        for from_state, to_state in zip(sequence, sequence[1:]):
            observed_counts[(from_state, to_state)] = (
                observed_counts.get((from_state, to_state), 0) + 1
            )
            observed_totals_by_state[from_state] = observed_totals_by_state.get(from_state, 0) + 1

    for transition in transitions:
        if end_state_id is not None and transition.from_state == end_state_id:
            continue
        observed_count = observed_counts.get((transition.from_state, transition.to_state), 0)
        observed_total = observed_totals_by_state.get(transition.from_state, 0)
        observed_weight = observed_count / observed_total if observed_total else transition.weight
        if observed_total and observed_count == 0:
            continue
        if observed_weight < graphviz.min_edge_weight:
            continue
        label = f"{observed_weight * 100.0:.1f}%"
        lines.append(f' {transition.from_state} -> {transition.to_state} [label="{label}"];')
    lines.append("}")
    (run_dir / "transitions.dot").write_text("\n".join(lines) + "\n", encoding="utf-8")
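
# Illustrative transitions.dot output for a three-state model (hypothetical labels and
# weights), assuming START and END states were detected:
#
#     digraph markov {
#      rankdir="LR";
#      0 [label="START"];
#      1 [label="Account Verification"];
#      2 [label="END"];
#      { rank=min; 0; }
#      0 [shape="ellipse", peripheries=2, style="bold", color="#2b8a3e"];
#      { rank=max; 2; }
#      2 [shape="ellipse", peripheries=2, color="#b42318"];
#      0 -> 1 [label="100.0%"];
#      1 -> 2 [label="87.5%"];
#     }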