biblicus 0.13.0__py3-none-any.whl → 0.15.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +1 -1
- biblicus/_vendor/dotyaml/__init__.py +2 -2
- biblicus/_vendor/dotyaml/loader.py +40 -1
- biblicus/ai/__init__.py +39 -0
- biblicus/ai/embeddings.py +114 -0
- biblicus/ai/llm.py +138 -0
- biblicus/ai/models.py +226 -0
- biblicus/analysis/__init__.py +5 -2
- biblicus/analysis/markov.py +1624 -0
- biblicus/analysis/models.py +754 -1
- biblicus/analysis/topic_modeling.py +98 -19
- biblicus/backends/hybrid.py +6 -1
- biblicus/backends/sqlite_full_text_search.py +4 -2
- biblicus/cli.py +118 -23
- biblicus/context.py +2 -2
- biblicus/recipes.py +136 -0
- biblicus/text/__init__.py +43 -0
- biblicus/text/annotate.py +222 -0
- biblicus/text/extract.py +210 -0
- biblicus/text/link.py +519 -0
- biblicus/text/markup.py +200 -0
- biblicus/text/models.py +319 -0
- biblicus/text/prompts.py +113 -0
- biblicus/text/redact.py +229 -0
- biblicus/text/slice.py +155 -0
- biblicus/text/tool_loop.py +334 -0
- {biblicus-0.13.0.dist-info → biblicus-0.15.0.dist-info}/METADATA +90 -26
- {biblicus-0.13.0.dist-info → biblicus-0.15.0.dist-info}/RECORD +32 -17
- biblicus/analysis/llm.py +0 -106
- {biblicus-0.13.0.dist-info → biblicus-0.15.0.dist-info}/WHEEL +0 -0
- {biblicus-0.13.0.dist-info → biblicus-0.15.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.13.0.dist-info → biblicus-0.15.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.13.0.dist-info → biblicus-0.15.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,1624 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Markov analysis backend for Biblicus.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import json
|
|
8
|
+
import math
|
|
9
|
+
import re
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Dict, List, Optional, Sequence, Tuple
|
|
13
|
+
|
|
14
|
+
from pydantic import BaseModel
|
|
15
|
+
|
|
16
|
+
from ..ai.embeddings import generate_embeddings_batch
|
|
17
|
+
from ..ai.llm import generate_completion
|
|
18
|
+
from ..context import (
|
|
19
|
+
ContextPack,
|
|
20
|
+
ContextPackPolicy,
|
|
21
|
+
TokenBudget,
|
|
22
|
+
build_context_pack,
|
|
23
|
+
fit_context_pack_to_token_budget,
|
|
24
|
+
)
|
|
25
|
+
from ..corpus import Corpus
|
|
26
|
+
from ..models import Evidence, ExtractionRunReference, QueryBudget, RetrievalResult
|
|
27
|
+
from ..retrieval import hash_text
|
|
28
|
+
from ..text.annotate import TextAnnotateRequest, apply_text_annotate
|
|
29
|
+
from ..text.extract import TextExtractRequest, apply_text_extract
|
|
30
|
+
from ..time import utc_now_iso
|
|
31
|
+
from .base import CorpusAnalysisBackend
|
|
32
|
+
from .models import (
|
|
33
|
+
AnalysisRecipeManifest,
|
|
34
|
+
AnalysisRunInput,
|
|
35
|
+
AnalysisRunManifest,
|
|
36
|
+
MarkovAnalysisArtifactsGraphVizConfig,
|
|
37
|
+
MarkovAnalysisDecodedPath,
|
|
38
|
+
MarkovAnalysisModelFamily,
|
|
39
|
+
MarkovAnalysisObservation,
|
|
40
|
+
MarkovAnalysisObservationsEncoder,
|
|
41
|
+
MarkovAnalysisOutput,
|
|
42
|
+
MarkovAnalysisRecipeConfig,
|
|
43
|
+
MarkovAnalysisReport,
|
|
44
|
+
MarkovAnalysisSegment,
|
|
45
|
+
MarkovAnalysisSegmentationMethod,
|
|
46
|
+
MarkovAnalysisStageStatus,
|
|
47
|
+
MarkovAnalysisState,
|
|
48
|
+
MarkovAnalysisTextCollectionReport,
|
|
49
|
+
MarkovAnalysisTextSourceConfig,
|
|
50
|
+
MarkovAnalysisTransition,
|
|
51
|
+
TopicModelingReport,
|
|
52
|
+
)
|
|
53
|
+
from .topic_modeling import TopicModelingDocument, run_topic_modeling_for_documents
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class MarkovStateName(BaseModel):
    """
    Structured response for a single state name.

    :ivar state_id: State identifier.
    :vartype state_id: int
    :ivar name: Short noun-phrase name for the state.
    :vartype name: str
    """

    # Identifier of the decoded Markov state this name applies to.
    state_id: int
    # Short human-readable noun phrase naming the state.
    name: str
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class MarkovStateNamingResponse(BaseModel):
    """
    Structured response for state naming.

    :ivar state_names: State name assignments.
    :vartype state_names: list[MarkovStateName]
    :ivar start_state_id: Optional state id representing the start state.
    :vartype start_state_id: int or None
    :ivar end_state_id: Optional state id representing the end state.
    :vartype end_state_id: int or None
    :ivar disconnection_state_id: Optional state id representing a disconnection state.
    :vartype disconnection_state_id: int or None
    """

    # One entry per named state; states not listed here remain unnamed.
    state_names: List[MarkovStateName]
    # The three ids below are optional role markers; None means the model
    # did not identify a state playing that role.
    start_state_id: Optional[int] = None
    end_state_id: Optional[int] = None
    disconnection_state_id: Optional[int] = None
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
@dataclass
class _Document:
    """
    Internal pairing of a corpus item id with its extracted text.

    Used only within this module while collecting and segmenting documents.
    """

    # Corpus item identifier the text was extracted from.
    item_id: str
    # Extracted text (stripped and non-empty by the time it is stored here;
    # see _collect_documents).
    text: str
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
class MarkovBackend(CorpusAnalysisBackend):
    """
    Markov analysis backend.

    :ivar analysis_id: Backend identifier.
    :vartype analysis_id: str
    """

    analysis_id = "markov"

    def run_analysis(
        self,
        corpus: Corpus,
        *,
        recipe_name: str,
        config: Dict[str, object],
        extraction_run: ExtractionRunReference,
    ) -> BaseModel:
        """
        Run Markov analysis for a corpus.

        :param corpus: Corpus to analyze.
        :type corpus: Corpus
        :param recipe_name: Human-readable recipe name.
        :type recipe_name: str
        :param config: Analysis configuration values.
        :type config: dict[str, object]
        :param extraction_run: Extraction run reference for text inputs.
        :type extraction_run: biblicus.models.ExtractionRunReference
        :return: Markov analysis output model.
        :rtype: pydantic.BaseModel
        """
        # Accept either an already-parsed recipe config or a raw mapping.
        if isinstance(config, MarkovAnalysisRecipeConfig):
            validated_config = config
        else:
            validated_config = MarkovAnalysisRecipeConfig.model_validate(config)
        return _run_markov(
            corpus=corpus,
            recipe_name=recipe_name,
            config=validated_config,
            extraction_run=extraction_run,
        )
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def _run_markov(
    *,
    corpus: Corpus,
    recipe_name: str,
    config: MarkovAnalysisRecipeConfig,
    extraction_run: ExtractionRunReference,
) -> MarkovAnalysisOutput:
    """
    Execute the full Markov analysis pipeline for a corpus and persist artifacts.

    Stages: collect extracted texts, segment them, build (optionally
    LLM/embedding-enriched) observations, optionally run topic modeling,
    encode and fit/decode the Markov model, then assemble and write the
    report, artifacts, and run manifest under the analysis run directory.

    :param corpus: Corpus to analyze.
    :type corpus: Corpus
    :param recipe_name: Human-readable recipe name.
    :type recipe_name: str
    :param config: Parsed analysis configuration.
    :type config: MarkovAnalysisRecipeConfig
    :param extraction_run: Extraction run reference for text inputs.
    :type extraction_run: biblicus.models.ExtractionRunReference
    :return: Markov analysis output model (also written to ``output.json``).
    :rtype: MarkovAnalysisOutput
    """
    # Recipe + run identity: the run id is a deterministic hash of recipe,
    # extraction run, and catalog timestamp (see _analysis_run_id).
    recipe = _create_recipe_manifest(name=recipe_name, config=config)
    catalog = corpus.load_catalog()
    run_id = _analysis_run_id(
        recipe_id=recipe.recipe_id,
        extraction_run=extraction_run,
        catalog_generated_at=catalog.generated_at,
    )
    # Manifest is created with empty artifact_paths/stats and filled in via
    # model_copy once the pipeline has run.
    run_manifest = AnalysisRunManifest(
        run_id=run_id,
        recipe=recipe,
        corpus_uri=catalog.corpus_uri,
        catalog_generated_at=catalog.generated_at,
        created_at=utc_now_iso(),
        input=AnalysisRunInput(extraction_run=extraction_run),
        artifact_paths=[],
        stats={},
    )
    run_dir = corpus.analysis_run_dir(analysis_id=MarkovBackend.analysis_id, run_id=run_id)
    run_dir.mkdir(parents=True, exist_ok=True)

    # Stage 1: gather extracted texts and segment them per config.
    documents, text_report = _collect_documents(
        corpus=corpus,
        extraction_run=extraction_run,
        config=config.text_source,
    )
    segments = _segment_documents(documents=documents, config=config)
    observations = _build_observations(segments=segments, config=config)
    observations, topic_report = _apply_topic_modeling(observations=observations, config=config)
    observation_matrix, lengths = _encode_observations(observations=observations, config=config)

    # Stage 2: fit the Markov model and decode per-segment states.
    predicted_states, transitions, state_count = _fit_and_decode(
        observations=observation_matrix,
        lengths=lengths,
        config=config,
    )

    # Stage 3: derive per-item decoded paths and named states for the report.
    decoded_paths = _group_decoded_paths(segments=segments, predicted_states=predicted_states)
    states = _build_states(
        segments=segments,
        predicted_states=predicted_states,
        n_states=state_count,
        max_exemplars=config.report.max_state_exemplars,
    )
    states = _assign_state_names(
        states=states,
        decoded_paths=decoded_paths,
        config=config,
    )

    # Stage 4: write artifacts; artifact_paths tracks what actually got written.
    artifact_paths: List[str] = [
        "output.json",
        "segments.jsonl",
        "observations.jsonl",
        "transitions.json",
    ]
    _write_segments(run_dir=run_dir, segments=segments)
    _write_observations(run_dir=run_dir, observations=observations)
    _write_transitions_json(run_dir=run_dir, transitions=transitions)
    if topic_report is not None:
        _write_topic_modeling_report(run_dir=run_dir, report=topic_report)
        _write_topic_assignments(run_dir=run_dir, observations=observations)
        artifact_paths.extend(["topic_modeling.json", "topic_assignments.jsonl"])

    if config.artifacts.graphviz.enabled:
        _write_graphviz(
            run_dir=run_dir,
            transitions=transitions,
            graphviz=config.artifacts.graphviz,
            states=states,
            decoded_paths=decoded_paths,
        )
        artifact_paths.append("transitions.dot")

    # Merge warnings/errors from the text-collection and topic-modeling stages.
    warnings = list(text_report.warnings)
    errors = list(text_report.errors)
    if topic_report is not None:
        warnings.extend(topic_report.warnings)
        errors.extend(topic_report.errors)

    report = MarkovAnalysisReport(
        text_collection=text_report,
        status=MarkovAnalysisStageStatus.COMPLETE,
        states=states,
        transitions=transitions,
        decoded_paths=decoded_paths,
        topic_modeling=topic_report,
        warnings=warnings,
        errors=errors,
    )

    run_stats = {
        "items": len({doc.item_id for doc in documents}),
        "segments": len(segments),
        "states": len(states),
        "transitions": len(transitions),
    }
    if topic_report is not None:
        run_stats["topics"] = len(topic_report.topics)
    run_manifest = run_manifest.model_copy(
        update={"artifact_paths": artifact_paths, "stats": run_stats}
    )
    _write_analysis_run_manifest(run_dir=run_dir, manifest=run_manifest)

    output = MarkovAnalysisOutput(
        analysis_id=MarkovBackend.analysis_id,
        generated_at=utc_now_iso(),
        run=run_manifest,
        report=report,
    )
    (run_dir / "output.json").write_text(output.model_dump_json(indent=2) + "\n", encoding="utf-8")
    return output
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
def _create_recipe_manifest(
    *, name: str, config: MarkovAnalysisRecipeConfig
) -> AnalysisRecipeManifest:
    """
    Build a recipe manifest whose id is a stable content hash of the recipe.

    :param name: Human-readable recipe name.
    :type name: str
    :param config: Parsed analysis configuration.
    :type config: MarkovAnalysisRecipeConfig
    :return: Recipe manifest with a deterministic ``recipe_id``.
    :rtype: AnalysisRecipeManifest
    """
    # sort_keys makes the serialized payload (and therefore the hash) stable
    # across dict orderings.
    payload = {
        "analysis_id": MarkovBackend.analysis_id,
        "name": name,
        "config": config.model_dump(),
    }
    serialized_payload = json.dumps(payload, sort_keys=True)
    return AnalysisRecipeManifest(
        recipe_id=hash_text(serialized_payload),
        analysis_id=MarkovBackend.analysis_id,
        name=name,
        created_at=utc_now_iso(),
        config=config.model_dump(),
    )
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
def _analysis_run_id(
    *, recipe_id: str, extraction_run: ExtractionRunReference, catalog_generated_at: str
) -> str:
    """
    Derive a deterministic run id from recipe, extraction run, and catalog time.

    :param recipe_id: Content hash of the recipe.
    :type recipe_id: str
    :param extraction_run: Extraction run reference.
    :type extraction_run: biblicus.models.ExtractionRunReference
    :param catalog_generated_at: Catalog generation timestamp.
    :type catalog_generated_at: str
    :return: Hash of the combined seed string.
    :rtype: str
    """
    seed_parts = (recipe_id, extraction_run.as_string(), catalog_generated_at)
    return hash_text(":".join(seed_parts))
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def _collect_documents(
    *,
    corpus: Corpus,
    extraction_run: ExtractionRunReference,
    config: MarkovAnalysisTextSourceConfig,
) -> Tuple[List[_Document], MarkovAnalysisTextCollectionReport]:
    """
    Load extracted texts for an extraction run and filter them per config.

    Items that were not extracted, have no text path, strip to empty, or are
    shorter than ``min_text_characters`` are skipped; the surviving documents
    are optionally truncated to ``sample_size``.

    :param corpus: Corpus providing extraction run storage.
    :type corpus: Corpus
    :param extraction_run: Extraction run to read texts from.
    :type extraction_run: biblicus.models.ExtractionRunReference
    :param config: Text-source filtering configuration.
    :type config: MarkovAnalysisTextSourceConfig
    :return: Collected documents and a collection report.
    :rtype: tuple[list[_Document], MarkovAnalysisTextCollectionReport]
    :raises ValueError: If no usable document remains after filtering.
    """
    manifest = corpus.load_extraction_run_manifest(
        extractor_id=extraction_run.extractor_id,
        run_id=extraction_run.run_id,
    )
    warnings: List[str] = []
    errors: List[str] = []
    documents: List[_Document] = []
    skipped_items = 0
    empty_texts = 0

    run_root = corpus.extraction_run_dir(
        extractor_id=extraction_run.extractor_id,
        run_id=extraction_run.run_id,
    )
    for item_result in manifest.items:
        if item_result.status != "extracted" or item_result.final_text_relpath is None:
            skipped_items += 1
            continue
        text_path = run_root / item_result.final_text_relpath
        text_value = text_path.read_text(encoding="utf-8").strip()
        if not text_value:
            empty_texts += 1
            continue
        if config.min_text_characters is not None and len(text_value) < config.min_text_characters:
            skipped_items += 1
            continue
        documents.append(_Document(item_id=item_result.item_id, text=text_value))

    if config.sample_size is not None and len(documents) > config.sample_size:
        documents = documents[: config.sample_size]
        warnings.append("Text collection truncated to sample_size")

    # Fix: the previous version built a COMPLETE report, then rebound it to a
    # FAILED copy immediately before raising — the FAILED report was discarded
    # (dead assignment). Build the report with the correct status up front.
    status = MarkovAnalysisStageStatus.COMPLETE if documents else MarkovAnalysisStageStatus.FAILED
    report = MarkovAnalysisTextCollectionReport(
        status=status,
        source_items=len(manifest.items),
        documents=len(documents),
        sample_size=config.sample_size,
        min_text_characters=config.min_text_characters,
        empty_texts=empty_texts,
        skipped_items=skipped_items,
        warnings=warnings,
        errors=errors,
    )
    if not documents:
        raise ValueError("Markov analysis requires at least one extracted text document")
    return documents, report
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
def _segment_documents(
    *, documents: Sequence[_Document], config: MarkovAnalysisRecipeConfig
) -> List[MarkovAnalysisSegment]:
    """
    Split each document into ordered segments using the configured method.

    :param documents: Documents to segment.
    :type documents: Sequence[_Document]
    :param config: Analysis configuration; ``segmentation.method`` selects the strategy.
    :type config: MarkovAnalysisRecipeConfig
    :return: Segments for all documents, wrapped with START/END boundaries.
    :rtype: list[MarkovAnalysisSegment]
    :raises ValueError: If the method is unsupported or no segments are produced.
    """
    method = config.segmentation.method
    collected: List[MarkovAnalysisSegment] = []
    for document in documents:
        if method == MarkovAnalysisSegmentationMethod.SENTENCE:
            produced = _sentence_segments(item_id=document.item_id, text=document.text)
        elif method == MarkovAnalysisSegmentationMethod.FIXED_WINDOW:
            window = config.segmentation.fixed_window
            produced = _fixed_window_segments(
                item_id=document.item_id,
                text=document.text,
                max_characters=window.max_characters,
                overlap_characters=window.overlap_characters,
            )
        elif method == MarkovAnalysisSegmentationMethod.LLM:
            produced = _llm_segments(item_id=document.item_id, text=document.text, config=config)
        elif method == MarkovAnalysisSegmentationMethod.SPAN_MARKUP:
            produced = _span_markup_segments(
                item_id=document.item_id, text=document.text, config=config
            )
        else:
            raise ValueError(f"Unsupported segmentation method: {method}")
        collected.extend(produced)
    if not collected:
        raise ValueError("Markov analysis produced no segments")
    # Boundary segments are added programmatically for every method.
    return _add_boundary_segments(segments=collected)
|
|
377
|
+
|
|
378
|
+
|
|
379
|
+
def _add_boundary_segments(
    *, segments: Sequence[MarkovAnalysisSegment]
) -> List[MarkovAnalysisSegment]:
    """
    Add synthetic START/END boundary segments around each item's segment run.

    This is a deterministic, programmatic boundary signal that keeps LLM
    segmentation focused only on natural text phases: a leading START segment
    and a trailing END segment are inserted per item, and all segments are
    re-indexed from 1 within the run. Applied after segmentation for every
    method (sentence, fixed-window, llm, span-markup) so the model never has
    to edit or reason about the boundaries during extraction.

    :param segments: Ordered segments grouped by item_id.
    :type segments: Sequence[MarkovAnalysisSegment]
    :return: Segments with START/END boundaries per item.
    :rtype: list[MarkovAnalysisSegment]
    """
    if not segments:
        return []

    # Group into contiguous runs sharing an item_id, preserving order. A
    # non-contiguous reappearance of an item_id starts a new run, matching
    # the sequential grouping semantics used throughout this module.
    runs: List[List[MarkovAnalysisSegment]] = []
    for segment in segments:
        if runs and runs[-1][0].item_id == segment.item_id:
            runs[-1].append(segment)
        else:
            runs.append([segment])

    wrapped: List[MarkovAnalysisSegment] = []
    for run in runs:
        item_id = run[0].item_id
        wrapped.append(MarkovAnalysisSegment(item_id=item_id, segment_index=1, text="START"))
        position = 1
        for segment in run:
            position += 1
            wrapped.append(
                MarkovAnalysisSegment(item_id=item_id, segment_index=position, text=segment.text)
            )
        wrapped.append(
            MarkovAnalysisSegment(item_id=item_id, segment_index=position + 1, text="END")
        )
    return wrapped
|
|
427
|
+
|
|
428
|
+
|
|
429
|
+
_SENTENCE_SPLIT = re.compile(r"(?<=[.!?])\s+")
|
|
430
|
+
|
|
431
|
+
|
|
432
|
+
def _sentence_segments(*, item_id: str, text: str) -> List[MarkovAnalysisSegment]:
    """
    Split text into one segment per sentence using punctuation boundaries.

    :param item_id: Item identifier for the produced segments.
    :type item_id: str
    :param text: Document text to split.
    :type text: str
    :return: Sentence segments indexed from 1; empty sentences are dropped.
    :rtype: list[MarkovAnalysisSegment]
    """
    sentences = [piece.strip() for piece in _SENTENCE_SPLIT.split(text) if piece.strip()]
    return [
        MarkovAnalysisSegment(item_id=item_id, segment_index=position, text=sentence)
        for position, sentence in enumerate(sentences, start=1)
    ]
|
|
444
|
+
|
|
445
|
+
|
|
446
|
+
def _fixed_window_segments(
    *, item_id: str, text: str, max_characters: int, overlap_characters: int
) -> List[MarkovAnalysisSegment]:
    """
    Split text into overlapping fixed-size character windows.

    :param item_id: Item identifier for the produced segments.
    :type item_id: str
    :param text: Document text to split.
    :type text: str
    :param max_characters: Maximum window size in characters (must be positive).
    :type max_characters: int
    :param overlap_characters: Overlap between consecutive windows
        (non-negative and smaller than ``max_characters``).
    :type overlap_characters: int
    :return: Window segments indexed from 1; whitespace-only windows are dropped.
    :rtype: list[MarkovAnalysisSegment]
    :raises ValueError: If the window parameters are invalid.
    """
    if max_characters <= 0:
        raise ValueError("fixed_window.max_characters must be positive")
    if overlap_characters < 0:
        raise ValueError("fixed_window.overlap_characters must be non-negative")
    if overlap_characters >= max_characters:
        raise ValueError("fixed_window.overlap_characters must be smaller than max_characters")

    windows: List[MarkovAnalysisSegment] = []
    text_length = len(text)
    cursor = 0
    next_index = 1
    while cursor < text_length:
        window_end = min(text_length, cursor + max_characters)
        piece = text[cursor:window_end].strip()
        if piece:
            windows.append(
                MarkovAnalysisSegment(item_id=item_id, segment_index=next_index, text=piece)
            )
            next_index += 1
        if window_end >= text_length:
            break
        # Overlap < window size guarantees forward progress here.
        cursor = max(0, window_end - overlap_characters)
    return windows
|
|
469
|
+
|
|
470
|
+
|
|
471
|
+
def _llm_segments(
    *, item_id: str, text: str, config: MarkovAnalysisRecipeConfig
) -> List[MarkovAnalysisSegment]:
    """
    Ask an LLM to segment the text and parse its JSON response into segments.

    Depending on the client's response format, the response is either a JSON
    object with a ``segments`` list or a bare JSON list of segment strings.

    :param item_id: Item identifier for the produced segments.
    :type item_id: str
    :param text: Document text to segment.
    :type text: str
    :param config: Analysis configuration; ``segmentation.llm`` is required.
    :type config: MarkovAnalysisRecipeConfig
    :return: Segments indexed by their position in the model's response.
    :rtype: list[MarkovAnalysisSegment]
    :raises ValueError: If the llm config is missing or the response is malformed.
    """
    llm_config = config.segmentation.llm
    if llm_config is None:
        raise ValueError("segmentation.llm is required when segmentation.method is 'llm'")
    response_text = generate_completion(
        client=llm_config.client,
        system_prompt=llm_config.system_prompt,
        user_prompt=llm_config.prompt_template.format(text=text),
    ).strip()
    if llm_config.client.response_format == "json_object":
        payload = _parse_json_object(response_text, error_label="LLM segmentation")
        raw_segments = payload.get("segments")
        if not isinstance(raw_segments, list):
            raise ValueError("LLM segmentation must return a JSON object with a 'segments' list")
    else:
        raw_segments = _parse_json_list(response_text, error_label="LLM segmentation")
    parsed: List[MarkovAnalysisSegment] = []
    # Keep the response position as segment_index even when blanks are skipped.
    for position, entry in enumerate(raw_segments, start=1):
        cleaned = str(entry).strip()
        if not cleaned:
            continue
        parsed.append(
            MarkovAnalysisSegment(item_id=item_id, segment_index=position, text=cleaned)
        )
    return parsed
|
|
499
|
+
|
|
500
|
+
|
|
501
|
+
def _span_markup_segments(
    *, item_id: str, text: str, config: MarkovAnalysisRecipeConfig
) -> List[MarkovAnalysisSegment]:
    """
    Segment text via LLM span markup: one segment per marked-up span.

    When a label attribute is configured or labels must be prepended, the
    annotate flow is used so spans carry attributes; otherwise the plain
    extract flow is used.

    :param item_id: Item identifier for the produced segments.
    :type item_id: str
    :param text: Document text to segment.
    :type text: str
    :param config: Analysis configuration; ``segmentation.span_markup`` is required.
    :type config: MarkovAnalysisRecipeConfig
    :return: Segments in span order; whitespace-only spans are dropped.
    :rtype: list[MarkovAnalysisSegment]
    :raises ValueError: If the span_markup config is missing, or a span lacks
        the required label attribute when ``prepend_label`` is true.
    """
    markup_config = config.segmentation.span_markup
    if markup_config is None:
        raise ValueError(
            "segmentation.span_markup is required when segmentation.method is 'span_markup'"
        )
    label_attribute = markup_config.label_attribute
    prepend_label = markup_config.prepend_label
    if label_attribute is not None or prepend_label:
        # Annotate flow: spans carry attributes (restricted to the label
        # attribute when one is configured).
        request = TextAnnotateRequest(
            text=text,
            client=markup_config.client,
            prompt_template=markup_config.prompt_template,
            system_prompt=markup_config.system_prompt,
            allowed_attributes=[label_attribute] if label_attribute else None,
            max_rounds=markup_config.max_rounds,
            max_edits_per_round=markup_config.max_edits_per_round,
        )
        result = apply_text_annotate(request)
    else:
        # Extract flow: plain spans without attributes.
        request = TextExtractRequest(
            text=text,
            client=markup_config.client,
            prompt_template=markup_config.prompt_template,
            system_prompt=markup_config.system_prompt,
            max_rounds=markup_config.max_rounds,
            max_edits_per_round=markup_config.max_edits_per_round,
        )
        result = apply_text_extract(request)
    # Stage payloads first, keeping the original span position as the index
    # even when empty spans are skipped.
    segment_payloads: List[Dict[str, object]] = []
    for index, span in enumerate(result.spans, start=1):
        segment_body = str(span.text).strip()
        if not segment_body:
            continue
        segment_text = segment_body
        if prepend_label:
            if label_attribute is None:
                raise ValueError(
                    "segmentation.span_markup.label_attribute is required when "
                    "segmentation.span_markup.prepend_label is true"
                )
            label_value = str(span.attributes.get(label_attribute, "")).strip()
            if not label_value:
                raise ValueError(f"Span {index} missing label attribute '{label_attribute}'")
            # Label goes on its own line ahead of the span body.
            segment_text = f"{label_value}\n{segment_body}"
        segment_payloads.append(
            {"segment_index": index, "body": segment_body, "text": segment_text}
        )
    segments: List[MarkovAnalysisSegment] = []
    for payload in segment_payloads:
        segments.append(
            MarkovAnalysisSegment(
                item_id=item_id,
                segment_index=int(payload["segment_index"]),
                text=str(payload["text"]),
            )
        )
    return segments
|
|
561
|
+
|
|
562
|
+
|
|
563
|
+
def _verify_end_label(
    *, text: str, config: MarkovAnalysisRecipeConfig
) -> Optional[Dict[str, object]]:
    """
    Ask the configured verifier LLM whether a segment is a true END segment.

    :param text: Candidate end-segment text, substituted for ``{text}`` in the
        verifier prompts.
    :type text: str
    :param config: Analysis configuration.
    :type config: MarkovAnalysisRecipeConfig
    :return: ``{"is_end": bool, "reason": ...}`` or None when no verifier is
        configured.
    :rtype: dict[str, object] or None
    :raises ValueError: If the verifier response is not a JSON object.
    """
    markup_config = config.segmentation.span_markup
    if markup_config is None or markup_config.end_label_verifier is None:
        return None
    verifier = markup_config.end_label_verifier
    # Plain .replace (not .format) so braces in the text itself are safe.
    response_text = generate_completion(
        client=verifier.client,
        system_prompt=verifier.system_prompt.replace("{text}", text),
        user_prompt=verifier.prompt_template.replace("{text}", text),
    ).strip()
    payload = _parse_json_object(response_text, error_label="End label verifier")
    return {"is_end": bool(payload.get("is_end")), "reason": payload.get("reason")}
|
|
582
|
+
|
|
583
|
+
|
|
584
|
+
def _apply_start_end_labels(
    *,
    item_id: str,
    payloads: Sequence[Dict[str, object]],
    config: MarkovAnalysisRecipeConfig,
) -> List[MarkovAnalysisSegment]:
    """
    Build segments from payload dicts and prefix configured start/end labels.

    The first segment gets ``start_label_value`` prepended when configured.
    The last segment is checked by the optional end-label verifier: on
    acceptance ``end_label_value`` is prepended; on rejection
    ``end_reject_label_value`` (plus a reason line when available) is
    prepended instead.

    :param item_id: Item identifier for the produced segments.
    :type item_id: str
    :param payloads: Segment payload dicts with ``text``/``body`` and
        optional ``segment_index`` keys.
    :type payloads: Sequence[dict[str, object]]
    :param config: Analysis configuration; ``segmentation.span_markup`` is required.
    :type config: MarkovAnalysisRecipeConfig
    :return: Segments with label prefixes applied.
    :rtype: list[MarkovAnalysisSegment]
    :raises ValueError: If the span_markup config is missing.
    """
    markup_config = config.segmentation.span_markup
    if markup_config is None:
        raise ValueError("segmentation.span_markup is required for start/end labels")
    segments: List[MarkovAnalysisSegment] = []
    for payload in payloads:
        # Prefer the prefixed "text" but fall back to the raw "body".
        segment_text = str(payload.get("text") or payload.get("body") or "").strip()
        if not segment_text:
            continue
        segments.append(
            MarkovAnalysisSegment(
                item_id=item_id,
                # Fall back to a 1-based running position when the payload
                # carries no explicit index.
                segment_index=int(payload.get("segment_index") or len(segments) + 1),
                text=segment_text,
            )
        )
    if not segments:
        return segments
    if markup_config.start_label_value:
        segments[0] = segments[0].model_copy(
            update={"text": f"{markup_config.start_label_value}\n{segments[0].text}"}
        )
    if markup_config.end_label_value:
        decision = _verify_end_label(text=segments[-1].text, config=config)
        if decision and decision.get("is_end"):
            segments[-1] = segments[-1].model_copy(
                update={"text": f"{markup_config.end_label_value}\n{segments[-1].text}"}
            )
        elif decision and not decision.get("is_end") and markup_config.end_reject_label_value:
            # Verifier rejected the ending: mark it, including the reason when given.
            reason = decision.get("reason")
            prefix = markup_config.end_reject_label_value
            if reason:
                prefix = f"{prefix}\n{markup_config.end_reject_reason_prefix}: {reason}"
            segments[-1] = segments[-1].model_copy(
                update={"text": f"{prefix}\n{segments[-1].text}"}
            )
    return segments
|
|
626
|
+
|
|
627
|
+
|
|
628
|
+
def _parse_json_list(raw: str, *, error_label: str) -> List[object]:
|
|
629
|
+
cleaned = str(raw or "").strip()
|
|
630
|
+
if not cleaned:
|
|
631
|
+
raise ValueError(f"{error_label} returned empty output")
|
|
632
|
+
try:
|
|
633
|
+
data = json.loads(cleaned)
|
|
634
|
+
except json.JSONDecodeError as exc:
|
|
635
|
+
raise ValueError(f"{error_label} returned invalid JSON") from exc
|
|
636
|
+
if not isinstance(data, list):
|
|
637
|
+
raise ValueError(f"{error_label} must return a JSON list")
|
|
638
|
+
return list(data)
|
|
639
|
+
|
|
640
|
+
|
|
641
|
+
def _parse_json_object(raw: str, *, error_label: str) -> Dict[str, object]:
|
|
642
|
+
cleaned = str(raw or "").strip()
|
|
643
|
+
if not cleaned:
|
|
644
|
+
raise ValueError(f"{error_label} returned empty output")
|
|
645
|
+
try:
|
|
646
|
+
data = json.loads(cleaned)
|
|
647
|
+
except json.JSONDecodeError as exc:
|
|
648
|
+
raise ValueError(f"{error_label} returned invalid JSON") from exc
|
|
649
|
+
if not isinstance(data, dict):
|
|
650
|
+
raise ValueError(f"{error_label} must return a JSON object")
|
|
651
|
+
return dict(data)
|
|
652
|
+
|
|
653
|
+
|
|
654
|
+
def _sequence_lengths(segments: Sequence[MarkovAnalysisSegment]) -> List[int]:
|
|
655
|
+
lengths: List[int] = []
|
|
656
|
+
current_item: str = ""
|
|
657
|
+
current_length = 0
|
|
658
|
+
for segment in segments:
|
|
659
|
+
if not current_item:
|
|
660
|
+
current_item = segment.item_id
|
|
661
|
+
current_length = 0
|
|
662
|
+
elif segment.item_id != current_item:
|
|
663
|
+
lengths.append(current_length)
|
|
664
|
+
current_item = segment.item_id
|
|
665
|
+
current_length = 0
|
|
666
|
+
current_length += 1
|
|
667
|
+
if current_item:
|
|
668
|
+
lengths.append(current_length)
|
|
669
|
+
return lengths
|
|
670
|
+
|
|
671
|
+
|
|
672
|
+
def _build_observations(
    *, segments: Sequence[MarkovAnalysisSegment], config: MarkovAnalysisRecipeConfig
) -> List[MarkovAnalysisObservation]:
    """
    Build one observation per segment, optionally enriched with LLM labels
    and embedding vectors.

    :param segments: Ordered segments to observe.
    :type segments: Sequence[MarkovAnalysisSegment]
    :param config: Analysis configuration controlling LLM and embedding enrichment.
    :type config: MarkovAnalysisRecipeConfig
    :return: Observations aligned one-to-one with the input segments.
    :rtype: list[MarkovAnalysisObservation]
    :raises ValueError: If enrichment is enabled but misconfigured, or LLM
        output cannot be parsed.
    """
    observations: List[MarkovAnalysisObservation] = [
        MarkovAnalysisObservation(
            item_id=segment.item_id,
            segment_index=segment.segment_index,
            segment_text=segment.text,
        )
        for segment in segments
    ]

    if config.llm_observations.enabled:
        llm = config.llm_observations
        # Fix: raise instead of assert — assertions are stripped under
        # ``python -O``, which would turn a config error into an opaque
        # AttributeError/TypeError further down.
        if llm.client is None or llm.prompt_template is None:
            raise ValueError(
                "llm_observations.client and llm_observations.prompt_template are "
                "required when llm_observations.enabled is true"
            )
        for index, observation in enumerate(observations):
            prompt = llm.prompt_template.format(segment=observation.segment_text)
            response_text = generate_completion(
                client=llm.client,
                system_prompt=llm.system_prompt,
                user_prompt=prompt,
            ).strip()
            payload = _parse_json_object(response_text, error_label="LLM observations")
            label = payload.get("label")
            confidence = payload.get("label_confidence")
            summary = payload.get("summary")
            observations[index] = observation.model_copy(
                update={
                    "llm_label": str(label).strip() if label is not None else None,
                    "llm_label_confidence": float(confidence) if confidence is not None else None,
                    "llm_summary": str(summary).strip() if summary is not None else None,
                }
            )

    if config.embeddings.enabled:
        embedding_config = config.embeddings
        # Same fix as above: explicit validation instead of assert.
        if embedding_config.client is None:
            raise ValueError("embeddings.client is required when embeddings.enabled is true")
        embed_texts: List[str] = []
        for observation in observations:
            if embedding_config.text_source == "segment_text":
                embed_texts.append(observation.segment_text)
            else:
                if not observation.llm_summary:
                    raise ValueError(
                        "embeddings.text_source is 'llm_summary' but llm_summary is missing"
                    )
                embed_texts.append(observation.llm_summary)
        vectors = generate_embeddings_batch(client=embedding_config.client, texts=embed_texts)
        observations = [
            observation.model_copy(update={"embedding": vector})
            for observation, vector in zip(observations, vectors)
        ]

    return observations
|
|
727
|
+
|
|
728
|
+
|
|
729
|
+
def _topic_document_id(*, item_id: str, segment_index: int) -> str:
|
|
730
|
+
return f"{item_id}:{segment_index}"
|
|
731
|
+
|
|
732
|
+
|
|
733
|
+
def _apply_topic_modeling(
    *,
    observations: Sequence[MarkovAnalysisObservation],
    config: MarkovAnalysisRecipeConfig,
) -> Tuple[List[MarkovAnalysisObservation], Optional[TopicModelingReport]]:
    """Assign a topic id/label to every non-boundary observation.

    Returns the (possibly updated) observations together with the topic
    modeling report, or ``(observations, None)`` when topic modeling is
    disabled in *config*.

    Raises:
        ValueError: when topic modeling is enabled without a recipe, when no
            non-boundary segments exist, or when the report omits a segment.
    """
    topic_config = config.topic_modeling
    if not topic_config.enabled:
        return list(observations), None
    if topic_config.recipe is None:
        raise ValueError("topic_modeling.recipe is required when topic_modeling.enabled is true")

    # Build one topic-modeling document per real segment; synthetic
    # START/END boundary segments are excluded from modeling.
    documents: List[TopicModelingDocument] = []
    for observation in observations:
        if observation.segment_text in {"START", "END"}:
            continue
        documents.append(
            TopicModelingDocument(
                document_id=_topic_document_id(
                    item_id=observation.item_id,
                    segment_index=observation.segment_index,
                ),
                source_item_id=observation.item_id,
                text=observation.segment_text,
            )
        )

    if not documents:
        raise ValueError("Topic modeling requires at least one non-boundary segment")

    report = run_topic_modeling_for_documents(
        documents=documents,
        config=topic_config.recipe,
    )

    # Invert the report: map each document id back to its (topic_id, label).
    topic_lookup: Dict[str, Tuple[int, str]] = {}
    for topic in report.topics:
        label = str(topic.label or "").strip()
        for document_id in topic.document_ids:
            topic_lookup[str(document_id)] = (int(topic.topic_id), label)

    updated: List[MarkovAnalysisObservation] = []
    for observation in observations:
        if observation.segment_text in {"START", "END"}:
            # Boundary segments carry no topic id; the boundary token itself
            # doubles as the topic label for reporting.
            updated.append(
                observation.model_copy(
                    update={
                        "topic_id": None,
                        "topic_label": observation.segment_text,
                    }
                )
            )
            continue
        document_id = _topic_document_id(
            item_id=observation.item_id, segment_index=observation.segment_index
        )
        assignment = topic_lookup.get(document_id)
        if assignment is None:
            # Every non-boundary segment was submitted, so a missing
            # assignment indicates a broken topic-modeling backend.
            raise ValueError(
                f"Topic modeling did not return an assignment for segment {document_id}"
            )
        topic_id, topic_label = assignment
        updated.append(
            observation.model_copy(update={"topic_id": topic_id, "topic_label": topic_label})
        )
    return updated, report
+
|
|
800
|
+
def _encode_observations(
    *, observations: Sequence[MarkovAnalysisObservation], config: MarkovAnalysisRecipeConfig
) -> Tuple[object, List[int]]:
    """Encode observations into model inputs plus per-item sequence lengths.

    Categorical model families get a flat list of integer label ids; other
    families get a feature matrix produced by the configured encoder (tfidf,
    embedding, or hybrid). The second return value is the per-item sequence
    lengths used to split the flat input back into sequences for the HMM.

    Raises:
        ValueError: when required labels or embeddings are missing, or the
            encoder is not recognized.
    """
    # Lengths are derived from lightweight segment shells so the same helper
    # used for raw segments works for observations too.
    lengths = _sequence_lengths(
        [
            MarkovAnalysisSegment(
                item_id=observation.item_id,
                segment_index=observation.segment_index,
                text=observation.segment_text,
            )
            for observation in observations
        ]
    )

    if config.model.family == MarkovAnalysisModelFamily.CATEGORICAL:
        # Categorical family: map each configured label to a dense integer id.
        labels: List[str] = []
        for observation in observations:
            label = getattr(observation, config.observations.categorical_source, None)
            if label is None:
                raise ValueError(
                    "Categorical Markov models require categorical labels for all segments"
                )
            labels.append(str(label))
        # Sorted vocabulary keeps the encoding deterministic across runs.
        vocabulary = {label: idx for idx, label in enumerate(sorted(set(labels)))}
        encoded = [vocabulary[label] for label in labels]
        return encoded, lengths

    encoder = config.observations.encoder
    if encoder == MarkovAnalysisObservationsEncoder.TFIDF:
        texts: List[str] = []
        for observation in observations:
            if config.observations.text_source == "segment_text":
                texts.append(observation.segment_text)
            else:
                # llm_summary source: a missing summary degrades to empty text.
                texts.append(observation.llm_summary or "")
        return (
            _tfidf_encode(
                texts=texts,
                max_features=config.observations.tfidf.max_features,
                ngram_range=tuple(config.observations.tfidf.ngram_range),
            ),
            lengths,
        )
    if encoder == MarkovAnalysisObservationsEncoder.EMBEDDING:
        matrix: List[List[float]] = []
        for observation in observations:
            if observation.embedding is None:
                raise ValueError("Embedding observations require embeddings.enabled true")
            matrix.append([float(value) for value in observation.embedding])
        return matrix, lengths
    if encoder == MarkovAnalysisObservationsEncoder.HYBRID:
        # Hybrid: embedding vector + numeric confidence + one-hot categorical label.
        labels = [
            str(getattr(observation, config.observations.categorical_source, "") or "")
            for observation in observations
        ]
        vocabulary = {label: idx for idx, label in enumerate(sorted(set(labels)))}
        one_hot_size = len(vocabulary)
        matrix: List[List[float]] = []
        for observation in observations:
            if observation.embedding is None:
                raise ValueError("Hybrid observations require embeddings.enabled true")
            vector: List[float] = [float(value) for value in observation.embedding]
            numeric_value = getattr(observation, config.observations.numeric_source, None)
            # Missing numeric source defaults to zero confidence.
            confidence = float(numeric_value) if numeric_value is not None else 0.0
            vector.append(confidence)
            one_hot = [0.0 for _ in range(one_hot_size)]
            label_value = str(
                getattr(observation, config.observations.categorical_source, "") or ""
            )
            idx = vocabulary[label_value]
            one_hot[idx] = 1.0
            vector.extend(one_hot)
            matrix.append(vector)
        return matrix, lengths
    raise ValueError(f"Unsupported observations encoder: {encoder}")
+
|
|
877
|
+
def _tokenize(text: str) -> List[str]:
|
|
878
|
+
return [token for token in re.split(r"[^A-Za-z0-9]+", text.lower()) if token]
|
|
879
|
+
|
|
880
|
+
|
|
881
|
+
def _tfidf_encode(
    *, texts: Sequence[str], max_features: int, ngram_range: Tuple[int, int]
) -> List[List[float]]:
    """Encode *texts* as dense TF-IDF vectors over the top n-gram vocabulary.

    The vocabulary keeps the ``max_features`` most document-frequent n-grams
    (ties broken alphabetically). IDF uses the smoothed ratio
    ``(n_docs + 1) / (df + 1)`` and TF is normalized by document length.
    """
    if max_features <= 0:
        raise ValueError("tfidf.max_features must be positive")
    min_n, max_n = ngram_range
    if min_n <= 0 or max_n < min_n:
        raise ValueError("tfidf.ngram_range is invalid")

    # Expand every text into its n-grams for each requested size.
    documents: List[List[str]] = []
    for text in texts:
        tokens = _tokenize(text)
        grams = [
            " ".join(tokens[start : start + size])
            for size in range(min_n, max_n + 1)
            for start in range(max(0, len(tokens) - size + 1))
        ]
        documents.append(grams)

    # Document frequency per term.
    document_frequency: Dict[str, int] = {}
    for grams in documents:
        for term in set(grams):
            document_frequency[term] = document_frequency.get(term, 0) + 1

    # Most-frequent-first vocabulary, alphabetical tiebreak, truncated.
    ranked = sorted(document_frequency.items(), key=lambda pair: (-pair[1], pair[0]))
    vocabulary = [term for term, _count in ranked[:max_features]]
    term_to_index = {term: position for position, term in enumerate(vocabulary)}

    total_documents = max(1, len(documents))
    inverse_frequency = [
        float((total_documents + 1) / (document_frequency.get(term, 0) + 1))
        for term in vocabulary
    ]

    vectors: List[List[float]] = []
    for grams in documents:
        term_counts: Dict[int, int] = {}
        for term in grams:
            position = term_to_index.get(term)
            if position is None:
                continue
            term_counts[position] = term_counts.get(position, 0) + 1
        normalizer = sum(term_counts.values()) or 1
        row = [0.0] * len(vocabulary)
        for position, count in term_counts.items():
            row[position] = (float(count) / float(normalizer)) * inverse_frequency[position]
        vectors.append(row)
    return vectors
+
|
|
930
|
+
def _fit_and_decode(
    *, observations: object, lengths: List[int], config: MarkovAnalysisRecipeConfig
) -> Tuple[List[int], List[MarkovAnalysisTransition], int]:
    """Fit an HMM on the encoded observations and decode one state per row.

    Returns the decoded state sequence, the learned (or empirically derived)
    transitions, and the configured number of states.

    Raises:
        ValueError: when the optional ``hmmlearn`` dependency is missing.
    """

    def normalize_startprob(values: Sequence[float]) -> List[float]:
        # Replace NaN/inf with zero, then renormalize; fall back to a
        # uniform distribution when everything was zeroed out.
        cleaned = [float(value) if math.isfinite(float(value)) else 0.0 for value in values]
        total = sum(cleaned)
        if total <= 0.0:
            return [1.0 / float(len(cleaned)) for _ in cleaned]
        return [value / total for value in cleaned]

    def normalize_transmat(matrix: Sequence[Sequence[float]]) -> List[List[float]]:
        # Row-wise version of normalize_startprob for the transition matrix.
        normalized: List[List[float]] = []
        size = len(matrix)
        for row in matrix:
            cleaned = [float(value) if math.isfinite(float(value)) else 0.0 for value in row]
            total = sum(cleaned)
            if total <= 0.0:
                normalized.append([1.0 / float(size) for _ in cleaned])
            else:
                normalized.append([value / total for value in cleaned])
        return normalized

    family = config.model.family
    try:
        from hmmlearn.hmm import CategoricalHMM, GaussianHMM
    except ImportError as import_error:
        raise ValueError(
            "Markov analysis requires an optional dependency. "
            'Install it with pip install "biblicus[markov-analysis]".'
        ) from import_error

    if family == MarkovAnalysisModelFamily.CATEGORICAL:
        # Categorical path: observations are integer label ids; hmmlearn
        # expects a column vector of symbols.
        encoded = list(observations)  # type: ignore[arg-type]
        X: object = [[int(value)] for value in encoded]
        try:
            import numpy as np

            X = np.asarray(X, dtype=int)
        except ImportError:
            # numpy is optional here; hmmlearn may still accept lists.
            pass
        model = CategoricalHMM(n_components=config.model.n_states)
        model.fit(X, lengths=lengths)
        # Sanitize fitted probabilities (NaN/inf can appear on degenerate
        # fits) before decoding.
        if hasattr(model, "startprob_"):
            startprob = normalize_startprob(model.startprob_)
            try:
                import numpy as np

                model.startprob_ = np.asarray(startprob, dtype=float)
            except ImportError:
                model.startprob_ = startprob
        if hasattr(model, "transmat_"):
            transmat = normalize_transmat(model.transmat_)
            try:
                import numpy as np

                model.transmat_ = np.asarray(transmat, dtype=float)
            except ImportError:
                model.transmat_ = transmat
        predicted = list(model.predict(X, lengths=lengths))
    else:
        # Continuous path: observations are a dense feature matrix.
        matrix = list(observations)  # type: ignore[arg-type]
        X = matrix
        try:
            import numpy as np

            X = np.asarray(matrix, dtype=float)
        except ImportError:
            pass
        model = GaussianHMM(n_components=config.model.n_states)
        model.fit(X, lengths=lengths)
        if hasattr(model, "startprob_"):
            startprob = normalize_startprob(model.startprob_)
            try:
                import numpy as np

                model.startprob_ = np.asarray(startprob, dtype=float)
            except ImportError:
                model.startprob_ = startprob
        if hasattr(model, "transmat_"):
            transmat = normalize_transmat(model.transmat_)
            try:
                import numpy as np

                model.transmat_ = np.asarray(transmat, dtype=float)
            except ImportError:
                model.transmat_ = transmat
        predicted = list(model.predict(X, lengths=lengths))

    # Prefer the model's transition matrix; keep only positive-weight edges.
    transitions: List[MarkovAnalysisTransition] = []
    transmat = getattr(model, "transmat_", None)
    if transmat is not None:
        for from_state in range(len(transmat)):
            row = transmat[from_state]
            for to_state in range(len(row)):
                weight = float(row[to_state])
                if weight <= 0.0:
                    continue
                transitions.append(
                    MarkovAnalysisTransition(
                        from_state=from_state,
                        to_state=to_state,
                        weight=weight,
                    )
                )
    else:
        # Fall back to empirical transitions from the decoded sequence.
        transitions = _transitions_from_sequence(predicted)

    return predicted, transitions, config.model.n_states
|
|
1040
|
+
def _transitions_from_sequence(states: Sequence[int]) -> List[MarkovAnalysisTransition]:
    """Derive empirical transition weights from one flat decoded state sequence.

    Each edge weight is the observed count of that transition divided by the
    total number of transitions leaving the source state.
    """
    pair_counts: Dict[Tuple[int, int], int] = {}
    outgoing_totals: Dict[int, int] = {}
    for current_state, next_state in zip(states, states[1:]):
        edge = (current_state, next_state)
        pair_counts[edge] = pair_counts.get(edge, 0) + 1
        outgoing_totals[current_state] = outgoing_totals.get(current_state, 0) + 1
    result: List[MarkovAnalysisTransition] = []
    for (source, target), count in sorted(pair_counts.items()):
        outgoing = max(1, outgoing_totals.get(source, 0))
        result.append(
            MarkovAnalysisTransition(
                from_state=source, to_state=target, weight=float(count) / float(outgoing)
            )
        )
    return result
|
|
1057
|
+
def _group_decoded_paths(
    *, segments: Sequence[MarkovAnalysisSegment], predicted_states: Sequence[int]
) -> List[MarkovAnalysisDecodedPath]:
    """Regroup the flat decoded state sequence into one path per item id."""
    grouped: Dict[str, List[int]] = {}
    for segment, decoded_state in zip(segments, predicted_states):
        grouped.setdefault(segment.item_id, []).append(int(decoded_state))
    paths: List[MarkovAnalysisDecodedPath] = []
    for item_id in sorted(grouped):
        paths.append(
            MarkovAnalysisDecodedPath(item_id=item_id, state_sequence=grouped[item_id])
        )
    return paths
|
|
1069
|
+
def _build_states(
    *,
    segments: Sequence[MarkovAnalysisSegment],
    predicted_states: Sequence[int],
    n_states: int,
    max_exemplars: int,
) -> List[MarkovAnalysisState]:
    """Collect up to *max_exemplars* exemplar texts for each decoded state.

    Boundary tokens ("START"/"END") are privileged: each is recorded at most
    once per state and, when the exemplar list is already full, overwrites the
    last slot rather than being dropped. All states get an (unlabeled)
    ``MarkovAnalysisState``, even those with no exemplars.
    """
    exemplars: Dict[int, List[str]] = {idx: [] for idx in range(n_states)}
    for segment, state in zip(segments, predicted_states):
        exemplar_list = exemplars.get(int(state))
        if exemplar_list is None:
            # Decoded state outside [0, n_states) — ignore defensively.
            continue
        boundary_token = str(segment.text).strip().upper()
        if boundary_token in {"START", "END"} and boundary_token not in exemplar_list:
            # Boundary tokens always make it into the list (dedup'd).
            # NOTE(review): when max_exemplars == 0 this still appends the
            # boundary token — presumably intentional so boundary states stay
            # detectable; confirm against callers.
            if max_exemplars > 0 and len(exemplar_list) >= max_exemplars:
                exemplar_list[-1] = boundary_token
                continue
            exemplar_list.append(boundary_token)
            continue
        if len(exemplar_list) >= max_exemplars:
            continue
        exemplar_list.append(segment.text)
    states: List[MarkovAnalysisState] = []
    for state_id in range(n_states):
        states.append(
            MarkovAnalysisState(
                state_id=state_id,
                label=None,
                exemplars=exemplars.get(state_id, []),
            )
        )
    return states
|
+
|
|
1103
|
+
def _state_naming_context_pack(
    *,
    states: Sequence[MarkovAnalysisState],
    config: MarkovAnalysisRecipeConfig,
    position_stats: Optional[Dict[int, Dict[str, float]]] = None,
) -> Tuple[ContextPack, ContextPackPolicy]:
    """Build the LLM context pack used to name Markov states.

    For each state this emits one optional position-hint evidence block
    (derived from *position_stats*) followed by up to
    ``max_exemplars_per_state`` exemplar blocks. The pack is trimmed to the
    configured token budget. Returns an empty pack when naming is disabled.
    """
    naming = config.report.state_naming
    if naming is None or not naming.enabled:
        return ContextPack(text="", evidence_count=0, blocks=[]), ContextPackPolicy()
    evidence: List[Evidence] = []
    rank = 1  # Global rank across all states keeps pack ordering stable.
    for state in states:
        stats = (position_stats or {}).get(state.state_id)
        if stats:
            # Render position statistics as human-readable percentages.
            after_start = stats.get("after_start_pct", 0.0) * 100.0
            before_end = stats.get("before_end_pct", 0.0) * 100.0
            avg_position = stats.get("avg_position_pct", 0.0) * 100.0
            hint_text = (
                "Position hints:\n"
                f"- After START: {after_start:.1f}% of transitions from START\n"
                f"- Before END: {before_end:.1f}% of transitions to END\n"
                f"- Average position: {avg_position:.1f}% of call length"
            )
            evidence.append(
                Evidence(
                    item_id=f"state-{state.state_id}",
                    source_uri=None,
                    media_type="text/plain",
                    score=1.0,
                    rank=rank,
                    text=f"State {state.state_id}:\n{hint_text}",
                    stage="state-naming",
                    stage_scores=None,
                    recipe_id="state-naming",
                    run_id="state-naming",
                    hash=None,
                )
            )
            rank += 1
        exemplars = list(state.exemplars)[: naming.max_exemplars_per_state]
        for index, exemplar in enumerate(exemplars, start=1):
            text = f"State {state.state_id} exemplar {index}:\n{exemplar}"
            evidence.append(
                Evidence(
                    item_id=f"state-{state.state_id}",
                    source_uri=None,
                    media_type="text/plain",
                    score=1.0,
                    rank=rank,
                    text=text,
                    stage="state-naming",
                    stage_scores=None,
                    recipe_id="state-naming",
                    run_id="state-naming",
                    hash=None,
                )
            )
            rank += 1
    # Wrap the synthetic evidence in a retrieval result so the standard
    # context-pack builder and token-budget fitter can be reused.
    retrieval_result = RetrievalResult(
        query_text="state-naming",
        budget=QueryBudget(max_total_items=max(len(evidence), 1)),
        run_id="state-naming",
        recipe_id="state-naming",
        backend_id="state-naming",
        generated_at=utc_now_iso(),
        evidence=evidence,
        stats={},
    )
    policy = ContextPackPolicy(join_with="\n\n", ordering="rank", include_metadata=False)
    context_pack = build_context_pack(retrieval_result, policy=policy)
    fitted_pack = fit_context_pack_to_token_budget(
        context_pack,
        policy=policy,
        token_budget=TokenBudget(max_tokens=naming.token_budget),
    )
    return fitted_pack, policy
1179
|
+
|
|
1180
|
+
|
|
1181
|
+
def _validate_state_names(
|
|
1182
|
+
*,
|
|
1183
|
+
response: MarkovStateNamingResponse,
|
|
1184
|
+
state_ids: Sequence[int],
|
|
1185
|
+
max_name_words: int,
|
|
1186
|
+
) -> Dict[int, str]:
|
|
1187
|
+
def require_short_noun_phrase(name: str, max_words: int) -> None:
|
|
1188
|
+
raw_name = str(name).strip()
|
|
1189
|
+
tokens = [token for token in raw_name.split() if token]
|
|
1190
|
+
word_count = len(tokens)
|
|
1191
|
+
if word_count == 0 or word_count > max_words:
|
|
1192
|
+
raise ValueError("State names must be short noun phrases")
|
|
1193
|
+
if any(symbol in raw_name for symbol in (".", "!", "?", ":", ";")):
|
|
1194
|
+
raise ValueError("State names must be short noun phrases without sentence punctuation")
|
|
1195
|
+
lower_tokens = [token.lower() for token in tokens]
|
|
1196
|
+
if lower_tokens[0] in ("to", "please"):
|
|
1197
|
+
raise ValueError("State names must be short noun phrases")
|
|
1198
|
+
forbidden_auxiliaries = {
|
|
1199
|
+
"am",
|
|
1200
|
+
"are",
|
|
1201
|
+
"be",
|
|
1202
|
+
"been",
|
|
1203
|
+
"being",
|
|
1204
|
+
"is",
|
|
1205
|
+
"was",
|
|
1206
|
+
"were",
|
|
1207
|
+
"can",
|
|
1208
|
+
"could",
|
|
1209
|
+
"do",
|
|
1210
|
+
"does",
|
|
1211
|
+
"did",
|
|
1212
|
+
"doing",
|
|
1213
|
+
"have",
|
|
1214
|
+
"has",
|
|
1215
|
+
"had",
|
|
1216
|
+
"having",
|
|
1217
|
+
"may",
|
|
1218
|
+
"might",
|
|
1219
|
+
"must",
|
|
1220
|
+
"shall",
|
|
1221
|
+
"should",
|
|
1222
|
+
"will",
|
|
1223
|
+
"would",
|
|
1224
|
+
}
|
|
1225
|
+
if any(token in forbidden_auxiliaries for token in lower_tokens):
|
|
1226
|
+
raise ValueError("State names must be short noun phrases without verbs")
|
|
1227
|
+
|
|
1228
|
+
names: Dict[int, str] = {}
|
|
1229
|
+
seen_names: Dict[str, int] = {}
|
|
1230
|
+
for entry in response.state_names:
|
|
1231
|
+
raw_name = str(entry.name).strip()
|
|
1232
|
+
require_short_noun_phrase(raw_name, max_name_words)
|
|
1233
|
+
if entry.state_id in names:
|
|
1234
|
+
raise ValueError("State naming response contains duplicate state_id values")
|
|
1235
|
+
normalized = raw_name.lower()
|
|
1236
|
+
if normalized in seen_names:
|
|
1237
|
+
raise ValueError("State naming response contains duplicate state names")
|
|
1238
|
+
names[entry.state_id] = raw_name
|
|
1239
|
+
seen_names[normalized] = entry.state_id
|
|
1240
|
+
missing = [state_id for state_id in state_ids if state_id not in names]
|
|
1241
|
+
if missing:
|
|
1242
|
+
raise ValueError("State naming response missing required state_id values")
|
|
1243
|
+
return names
|
|
1244
|
+
|
|
1245
|
+
|
|
1246
|
+
def _assign_state_names(
    *,
    states: Sequence[MarkovAnalysisState],
    decoded_paths: Sequence[MarkovAnalysisDecodedPath],
    config: MarkovAnalysisRecipeConfig,
) -> List[MarkovAnalysisState]:
    """Name non-boundary states via the configured LLM, with validated retries.

    Boundary states (those owning the START/END exemplar tokens) get fixed
    labels and are excluded from LLM naming. The LLM response is parsed,
    schema-validated, and checked against the noun-phrase rules; failures are
    fed back to the model for up to ``max_retries`` additional attempts.

    Returns the states with labels applied, or the input unchanged when
    naming is disabled.

    Raises:
        ValueError: when naming is enabled without a client, or when every
            attempt produces an invalid response.
    """
    naming = config.report.state_naming
    if naming is None or not naming.enabled:
        return list(states)
    if naming.client is None:
        raise ValueError("report.state_naming.client is required when enabled")
    if not states:
        return list(states)
    # Identify boundary states and strip their tokens from everyone else's
    # exemplars so the LLM never sees synthetic START/END text.
    start_state_id = _select_boundary_state_id(states=states, boundary_label="START")
    end_state_id = _select_boundary_state_id(states=states, boundary_label="END")
    sanitized_states = _strip_boundary_exemplars(
        states=states,
        boundary_label="START",
        allowed_state_id=start_state_id,
    )
    sanitized_states = _strip_boundary_exemplars(
        states=sanitized_states,
        boundary_label="END",
        allowed_state_id=end_state_id,
    )
    naming_states = [
        state for state in sanitized_states if state.state_id not in {start_state_id, end_state_id}
    ]
    if not naming_states:
        # Nothing to name; just label the boundary states.
        return _apply_boundary_labels(
            states=sanitized_states,
            start_state_id=start_state_id,
            end_state_id=end_state_id,
        )
    state_ids = [state.state_id for state in naming_states]
    position_stats = _compute_state_position_stats(
        decoded_paths=decoded_paths,
        start_state_id=start_state_id,
        end_state_id=end_state_id,
    )
    context_pack, _policy = _state_naming_context_pack(
        states=naming_states,
        config=config,
        position_stats=position_stats,
    )
    system_prompt = str(naming.system_prompt or "").format(context_pack=context_pack.text)
    user_prompt = str(naming.prompt_template or "").format(
        state_ids=", ".join(str(state_id) for state_id in state_ids),
        state_count=len(state_ids),
    )
    last_error: Optional[str] = None
    for _attempt in range(naming.max_retries + 1):
        if last_error is not None:
            user_prompt = f"{user_prompt}\n\nPrevious response:\n{last_error}\n\nFix the issues and return only JSON."
        response_text = generate_completion(
            client=naming.client,
            system_prompt=system_prompt,
            user_prompt=user_prompt,
        ).strip()
        try:
            # Parsing and schema validation happen inside the retry loop so a
            # malformed LLM reply (bad JSON or wrong shape) consumes a retry
            # with the error fed back, instead of aborting the whole run.
            # (pydantic ValidationError subclasses ValueError.)
            payload = _parse_json_object(response_text, error_label="Markov state naming")
            response = MarkovStateNamingResponse.model_validate(payload)
            names = _validate_state_names(
                response=response,
                state_ids=state_ids,
                max_name_words=naming.max_name_words,
            )
        except ValueError as exc:
            last_error = f"{response_text}\n\nError: {exc}"
            continue
        updated_states: List[MarkovAnalysisState] = []
        for state in sanitized_states:
            if start_state_id is not None and state.state_id == start_state_id:
                updated_states.append(state.model_copy(update={"label": "START"}))
                continue
            if end_state_id is not None and state.state_id == end_state_id:
                updated_states.append(state.model_copy(update={"label": "END"}))
                continue
            base_label = names.get(state.state_id)
            if base_label is None:
                updated_states.append(state)
                continue
            updated_states.append(state.model_copy(update={"label": base_label}))
        return updated_states
    error_text = last_error or "unknown error"
    raise ValueError(f"Markov state naming failed after retries: {error_text}")
+
|
|
1333
|
+
|
|
1334
|
+
def _select_boundary_state_id(
|
|
1335
|
+
*, states: Sequence[MarkovAnalysisState], boundary_label: str
|
|
1336
|
+
) -> Optional[int]:
|
|
1337
|
+
candidates: List[Tuple[int, int, int]] = []
|
|
1338
|
+
normalized_label = boundary_label.strip().upper()
|
|
1339
|
+
for state in states:
|
|
1340
|
+
exemplars = [str(exemplar).strip().upper() for exemplar in (state.exemplars or [])]
|
|
1341
|
+
match_count = sum(1 for exemplar in exemplars if exemplar == normalized_label)
|
|
1342
|
+
if match_count:
|
|
1343
|
+
candidates.append((match_count, len(exemplars), state.state_id))
|
|
1344
|
+
if not candidates:
|
|
1345
|
+
return None
|
|
1346
|
+
candidates.sort(reverse=True)
|
|
1347
|
+
return candidates[0][2]
|
|
1348
|
+
|
|
1349
|
+
|
|
1350
|
+
def _strip_boundary_exemplars(
    *,
    states: Sequence[MarkovAnalysisState],
    boundary_label: str,
    allowed_state_id: Optional[int],
) -> List[MarkovAnalysisState]:
    """Remove a boundary token from every state's exemplars except its owner's.

    The state identified by *allowed_state_id* keeps the token; all other
    states have case-insensitive matches filtered out. Every state is
    returned as a copy with its exemplar list rebuilt.
    """
    target = boundary_label.strip().upper()
    stripped: List[MarkovAnalysisState] = []
    for state in states:
        kept = list(state.exemplars or [])
        is_owner = allowed_state_id is not None and state.state_id == allowed_state_id
        if not is_owner:
            kept = [value for value in kept if str(value).strip().upper() != target]
        stripped.append(state.model_copy(update={"exemplars": kept}))
    return stripped
1368
|
+
|
|
1369
|
+
|
|
1370
|
+
def _apply_boundary_labels(
    *,
    states: Sequence[MarkovAnalysisState],
    start_state_id: Optional[int],
    end_state_id: Optional[int],
) -> List[MarkovAnalysisState]:
    """Label the boundary states START/END; all other states pass through unchanged."""
    # END first, then START, so START wins if both ids point at the same state
    # (matching the original if-chain precedence).
    boundary_labels: Dict[int, str] = {}
    if end_state_id is not None:
        boundary_labels[end_state_id] = "END"
    if start_state_id is not None:
        boundary_labels[start_state_id] = "START"
    relabeled: List[MarkovAnalysisState] = []
    for state in states:
        label = boundary_labels.get(state.state_id)
        if label is None:
            relabeled.append(state)
        else:
            relabeled.append(state.model_copy(update={"label": label}))
    return relabeled
1386
|
+
|
|
1387
|
+
|
|
1388
|
+
def _compute_state_position_stats(
|
|
1389
|
+
*,
|
|
1390
|
+
decoded_paths: Sequence[MarkovAnalysisDecodedPath],
|
|
1391
|
+
start_state_id: Optional[int],
|
|
1392
|
+
end_state_id: Optional[int],
|
|
1393
|
+
) -> Dict[int, Dict[str, float]]:
|
|
1394
|
+
after_start_counts: Dict[int, int] = {}
|
|
1395
|
+
before_end_counts: Dict[int, int] = {}
|
|
1396
|
+
avg_position_sums: Dict[int, float] = {}
|
|
1397
|
+
avg_position_counts: Dict[int, int] = {}
|
|
1398
|
+
total_after_start = 0
|
|
1399
|
+
total_before_end = 0
|
|
1400
|
+
|
|
1401
|
+
for path in decoded_paths:
|
|
1402
|
+
sequence = list(path.state_sequence)
|
|
1403
|
+
if len(sequence) < 2:
|
|
1404
|
+
continue
|
|
1405
|
+
last_index = max(1, len(sequence) - 1)
|
|
1406
|
+
for index, state_id in enumerate(sequence):
|
|
1407
|
+
if state_id in {start_state_id, end_state_id}:
|
|
1408
|
+
continue
|
|
1409
|
+
avg_position_sums[state_id] = avg_position_sums.get(state_id, 0.0) + (
|
|
1410
|
+
float(index) / float(last_index)
|
|
1411
|
+
)
|
|
1412
|
+
avg_position_counts[state_id] = avg_position_counts.get(state_id, 0) + 1
|
|
1413
|
+
for from_state, to_state in zip(sequence, sequence[1:]):
|
|
1414
|
+
if start_state_id is not None and from_state == start_state_id:
|
|
1415
|
+
total_after_start += 1
|
|
1416
|
+
after_start_counts[to_state] = after_start_counts.get(to_state, 0) + 1
|
|
1417
|
+
if end_state_id is not None and to_state == end_state_id:
|
|
1418
|
+
total_before_end += 1
|
|
1419
|
+
before_end_counts[from_state] = before_end_counts.get(from_state, 0) + 1
|
|
1420
|
+
|
|
1421
|
+
stats: Dict[int, Dict[str, float]] = {}
|
|
1422
|
+
state_ids = set(avg_position_counts) | set(after_start_counts) | set(before_end_counts)
|
|
1423
|
+
for state_id in state_ids:
|
|
1424
|
+
avg_count = avg_position_counts.get(state_id, 0)
|
|
1425
|
+
stats[state_id] = {
|
|
1426
|
+
"after_start_pct": (
|
|
1427
|
+
after_start_counts.get(state_id, 0) / total_after_start
|
|
1428
|
+
if total_after_start
|
|
1429
|
+
else 0.0
|
|
1430
|
+
),
|
|
1431
|
+
"before_end_pct": (
|
|
1432
|
+
before_end_counts.get(state_id, 0) / total_before_end if total_before_end else 0.0
|
|
1433
|
+
),
|
|
1434
|
+
"avg_position_pct": (
|
|
1435
|
+
avg_position_sums.get(state_id, 0.0) / avg_count if avg_count else 0.0
|
|
1436
|
+
),
|
|
1437
|
+
}
|
|
1438
|
+
return stats
|
|
1439
|
+
|
|
1440
|
+
|
|
1441
|
+
def _write_analysis_run_manifest(*, run_dir: Path, manifest: AnalysisRunManifest) -> None:
|
|
1442
|
+
(run_dir / "manifest.json").write_text(
|
|
1443
|
+
manifest.model_dump_json(indent=2) + "\n", encoding="utf-8"
|
|
1444
|
+
)
|
|
1445
|
+
|
|
1446
|
+
|
|
1447
|
+
def _write_segments(*, run_dir: Path, segments: Sequence[MarkovAnalysisSegment]) -> None:
|
|
1448
|
+
lines = [segment.model_dump_json() for segment in segments]
|
|
1449
|
+
(run_dir / "segments.jsonl").write_text("\n".join(lines) + "\n", encoding="utf-8")
|
|
1450
|
+
|
|
1451
|
+
|
|
1452
|
+
def _write_observations(
|
|
1453
|
+
*, run_dir: Path, observations: Sequence[MarkovAnalysisObservation]
|
|
1454
|
+
) -> None:
|
|
1455
|
+
lines = [observation.model_dump_json() for observation in observations]
|
|
1456
|
+
(run_dir / "observations.jsonl").write_text("\n".join(lines) + "\n", encoding="utf-8")
|
|
1457
|
+
|
|
1458
|
+
|
|
1459
|
+
def _write_transitions_json(
|
|
1460
|
+
*, run_dir: Path, transitions: Sequence[MarkovAnalysisTransition]
|
|
1461
|
+
) -> None:
|
|
1462
|
+
payload = [transition.model_dump() for transition in transitions]
|
|
1463
|
+
(run_dir / "transitions.json").write_text(
|
|
1464
|
+
json.dumps(payload, indent=2) + "\n", encoding="utf-8"
|
|
1465
|
+
)
|
|
1466
|
+
|
|
1467
|
+
|
|
1468
|
+
def _write_topic_modeling_report(*, run_dir: Path, report: TopicModelingReport) -> None:
|
|
1469
|
+
(run_dir / "topic_modeling.json").write_text(
|
|
1470
|
+
report.model_dump_json(indent=2) + "\n", encoding="utf-8"
|
|
1471
|
+
)
|
|
1472
|
+
|
|
1473
|
+
|
|
1474
|
+
def _write_topic_assignments(
|
|
1475
|
+
*,
|
|
1476
|
+
run_dir: Path,
|
|
1477
|
+
observations: Sequence[MarkovAnalysisObservation],
|
|
1478
|
+
) -> None:
|
|
1479
|
+
lines: List[str] = []
|
|
1480
|
+
for observation in observations:
|
|
1481
|
+
payload = {
|
|
1482
|
+
"item_id": observation.item_id,
|
|
1483
|
+
"segment_index": observation.segment_index,
|
|
1484
|
+
"segment_text": observation.segment_text,
|
|
1485
|
+
"topic_id": observation.topic_id,
|
|
1486
|
+
"topic_label": observation.topic_label,
|
|
1487
|
+
}
|
|
1488
|
+
lines.append(json.dumps(payload, ensure_ascii=True))
|
|
1489
|
+
(run_dir / "topic_assignments.jsonl").write_text("\n".join(lines) + "\n", encoding="utf-8")
|
|
1490
|
+
|
|
1491
|
+
|
|
1492
|
+
def _write_graphviz(
|
|
1493
|
+
*,
|
|
1494
|
+
run_dir: Path,
|
|
1495
|
+
transitions: Sequence[MarkovAnalysisTransition],
|
|
1496
|
+
graphviz: MarkovAnalysisArtifactsGraphVizConfig,
|
|
1497
|
+
states: Sequence[MarkovAnalysisState],
|
|
1498
|
+
decoded_paths: Sequence[MarkovAnalysisDecodedPath],
|
|
1499
|
+
) -> None:
|
|
1500
|
+
"""
|
|
1501
|
+
Write GraphViz transition output for Markov analysis.
|
|
1502
|
+
|
|
1503
|
+
The exported edge labels are meant for humans, so they include two values
|
|
1504
|
+
when decoded paths are available:
|
|
1505
|
+
|
|
1506
|
+
1) The empirical transition percentage derived from decoded paths, rendered
|
|
1507
|
+
as ``X.Y% (a/b)``, where ``a`` is the observed transition count and ``b``
|
|
1508
|
+
is the total number of transitions across all decoded sequences.
|
|
1509
|
+
2) The model transition probability, rendered as ``model Z.W%``. This is
|
|
1510
|
+
the HMM transition weight for the same edge.
|
|
1511
|
+
|
|
1512
|
+
The empirical value makes it clear how many transitions were actually observed,
|
|
1513
|
+
while the model value shows what the fitted HMM believes. When no decoded paths
|
|
1514
|
+
are available, the label falls back to the model percentage only.
|
|
1515
|
+
|
|
1516
|
+
Edges are filtered by ``graphviz.min_edge_weight`` using the empirical weight
|
|
1517
|
+
when possible; otherwise the model weight is used. This keeps the visualization
|
|
1518
|
+
faithful to observed sequences instead of solely model priors.
|
|
1519
|
+
|
|
1520
|
+
:param run_dir: Directory where the ``transitions.dot`` file is written.
|
|
1521
|
+
:type run_dir: pathlib.Path
|
|
1522
|
+
:param transitions: Markov transition edges with model weights.
|
|
1523
|
+
:type transitions: Sequence[MarkovAnalysisTransition]
|
|
1524
|
+
:param graphviz: GraphViz export configuration.
|
|
1525
|
+
:type graphviz: MarkovAnalysisArtifactsGraphVizConfig
|
|
1526
|
+
:param states: Markov states with labels and exemplars.
|
|
1527
|
+
:type states: Sequence[MarkovAnalysisState]
|
|
1528
|
+
:param decoded_paths: Per-item decoded state sequences.
|
|
1529
|
+
:type decoded_paths: Sequence[MarkovAnalysisDecodedPath]
|
|
1530
|
+
:return: None. Writes ``transitions.dot`` to ``run_dir``.
|
|
1531
|
+
:rtype: None
|
|
1532
|
+
"""
|
|
1533
|
+
|
|
1534
|
+
def infer_state_id_by_label(
|
|
1535
|
+
labels: Dict[int, str],
|
|
1536
|
+
keywords: Tuple[str, ...],
|
|
1537
|
+
exemplars_by_state: Dict[int, int],
|
|
1538
|
+
) -> Optional[int]:
|
|
1539
|
+
matches: List[int] = []
|
|
1540
|
+
for state_id, label in labels.items():
|
|
1541
|
+
normalized = label.lower()
|
|
1542
|
+
if any(keyword in normalized for keyword in keywords):
|
|
1543
|
+
matches.append(state_id)
|
|
1544
|
+
if not matches:
|
|
1545
|
+
return None
|
|
1546
|
+
if len(matches) == 1:
|
|
1547
|
+
return matches[0]
|
|
1548
|
+
return max(matches, key=lambda state_id: exemplars_by_state.get(state_id, 0))
|
|
1549
|
+
|
|
1550
|
+
lines: List[str] = []
|
|
1551
|
+
lines.append("digraph markov {")
|
|
1552
|
+
rankdir = str(graphviz.rankdir or "LR").upper()
|
|
1553
|
+
lines.append(f' rankdir="{rankdir}";')
|
|
1554
|
+
label_by_state: Dict[int, str] = {}
|
|
1555
|
+
exemplars_by_state: Dict[int, int] = {}
|
|
1556
|
+
exemplar_start: Dict[int, bool] = {}
|
|
1557
|
+
exemplar_end: Dict[int, bool] = {}
|
|
1558
|
+
for state in states:
|
|
1559
|
+
label_by_state[state.state_id] = str(state.label or "")
|
|
1560
|
+
exemplars_by_state[state.state_id] = len(state.exemplars or [])
|
|
1561
|
+
base_label = str(state.label or str(state.state_id))
|
|
1562
|
+
exemplar_start[state.state_id] = any(
|
|
1563
|
+
str(exemplar).strip().upper() == "START" for exemplar in (state.exemplars or [])
|
|
1564
|
+
)
|
|
1565
|
+
exemplar_end[state.state_id] = any(
|
|
1566
|
+
str(exemplar).strip().upper() == "END" for exemplar in (state.exemplars or [])
|
|
1567
|
+
)
|
|
1568
|
+
label = base_label
|
|
1569
|
+
safe_label = label.replace('"', '\\"')
|
|
1570
|
+
lines.append(f' {state.state_id} [label="{safe_label}"];')
|
|
1571
|
+
start_state_id = graphviz.start_state_id
|
|
1572
|
+
end_state_id = graphviz.end_state_id
|
|
1573
|
+
if start_state_id is None:
|
|
1574
|
+
matching = [state_id for state_id, has in exemplar_start.items() if has]
|
|
1575
|
+
if matching:
|
|
1576
|
+
start_state_id = max(matching, key=lambda state_id: exemplars_by_state.get(state_id, 0))
|
|
1577
|
+
else:
|
|
1578
|
+
start_state_id = infer_state_id_by_label(
|
|
1579
|
+
label_by_state,
|
|
1580
|
+
("start", "greeting", "opening"),
|
|
1581
|
+
exemplars_by_state,
|
|
1582
|
+
)
|
|
1583
|
+
if end_state_id is None:
|
|
1584
|
+
matching = [state_id for state_id, has in exemplar_end.items() if has]
|
|
1585
|
+
if matching:
|
|
1586
|
+
end_state_id = max(matching, key=lambda state_id: exemplars_by_state.get(state_id, 0))
|
|
1587
|
+
else:
|
|
1588
|
+
end_state_id = infer_state_id_by_label(
|
|
1589
|
+
label_by_state,
|
|
1590
|
+
("end", "closing", "goodbye", "wrap-up"),
|
|
1591
|
+
exemplars_by_state,
|
|
1592
|
+
)
|
|
1593
|
+
if start_state_id is not None:
|
|
1594
|
+
lines.append(f" {{ rank=min; {start_state_id}; }}")
|
|
1595
|
+
lines.append(
|
|
1596
|
+
f' {start_state_id} [shape="ellipse", peripheries=2, style="bold", color="#2b8a3e"];'
|
|
1597
|
+
)
|
|
1598
|
+
if end_state_id is not None:
|
|
1599
|
+
lines.append(f" {{ rank=max; {end_state_id}; }}")
|
|
1600
|
+
lines.append(f' {end_state_id} [shape="ellipse", peripheries=2, color="#b42318"];')
|
|
1601
|
+
observed_counts: Dict[Tuple[int, int], int] = {}
|
|
1602
|
+
observed_totals_by_state: Dict[int, int] = {}
|
|
1603
|
+
for path in decoded_paths:
|
|
1604
|
+
sequence = list(path.state_sequence)
|
|
1605
|
+
for from_state, to_state in zip(sequence, sequence[1:]):
|
|
1606
|
+
observed_counts[(from_state, to_state)] = (
|
|
1607
|
+
observed_counts.get((from_state, to_state), 0) + 1
|
|
1608
|
+
)
|
|
1609
|
+
observed_totals_by_state[from_state] = observed_totals_by_state.get(from_state, 0) + 1
|
|
1610
|
+
|
|
1611
|
+
for transition in transitions:
|
|
1612
|
+
if end_state_id is not None and transition.from_state == end_state_id:
|
|
1613
|
+
continue
|
|
1614
|
+
observed_count = observed_counts.get((transition.from_state, transition.to_state), 0)
|
|
1615
|
+
observed_total = observed_totals_by_state.get(transition.from_state, 0)
|
|
1616
|
+
observed_weight = observed_count / observed_total if observed_total else transition.weight
|
|
1617
|
+
if observed_total and observed_count == 0:
|
|
1618
|
+
continue
|
|
1619
|
+
if observed_weight < graphviz.min_edge_weight:
|
|
1620
|
+
continue
|
|
1621
|
+
label = f"{observed_weight * 100.0:.1f}%"
|
|
1622
|
+
lines.append(f' {transition.from_state} -> {transition.to_state} [label="{label}"];')
|
|
1623
|
+
lines.append("}")
|
|
1624
|
+
(run_dir / "transitions.dot").write_text("\n".join(lines) + "\n", encoding="utf-8")
|