biblicus 0.14.0__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,9 +9,9 @@ from typing import Any, Dict, List, Optional
9
9
 
10
10
  from pydantic import Field, field_validator, model_validator
11
11
 
12
+ from ..ai.models import EmbeddingsClientConfig, LlmClientConfig
12
13
  from ..constants import ANALYSIS_SCHEMA_VERSION
13
14
  from ..models import ExtractionRunReference
14
- from .llm import LlmClientConfig
15
15
  from .schema import AnalysisSchemaModel
16
16
 
17
17
 
@@ -775,3 +775,756 @@ class TopicModelingOutput(AnalysisSchemaModel):
775
775
  generated_at: str
776
776
  run: AnalysisRunManifest
777
777
  report: TopicModelingReport
778
+
779
+
780
class MarkovAnalysisStageStatus(str, Enum):
    """
    Closed set of status identifiers reported by Markov analysis stages.

    Members mix in ``str``, so each member compares equal to its raw string value.
    """

    SKIPPED = "skipped"
    COMPLETE = "complete"
    FAILED = "failed"
788
+
789
+
790
class MarkovAnalysisSegmentationMethod(str, Enum):
    """
    Closed set of segmentation method identifiers accepted by Markov analysis.

    Members mix in ``str``, so each member compares equal to its raw string value.
    """

    SENTENCE = "sentence"
    FIXED_WINDOW = "fixed_window"
    LLM = "llm"
    SPAN_MARKUP = "span_markup"
799
+
800
+
801
class MarkovAnalysisLlmSegmentationConfig(AnalysisSchemaModel):
    """
    Settings for the provider-backed ``llm`` segmentation method.

    :ivar client: LLM client configuration.
    :vartype client: biblicus.ai.models.LlmClientConfig
    :ivar prompt_template: Non-empty prompt template, expected to contain ``{text}``.
    :vartype prompt_template: str
    :ivar system_prompt: Optional system prompt.
    :vartype system_prompt: str or None
    """

    client: LlmClientConfig
    # NOTE(review): only non-emptiness is enforced here; the ``{text}`` placeholder
    # is documented but not validated by this model -- confirm the consumer checks it.
    prompt_template: str = Field(min_length=1)
    system_prompt: Optional[str] = Field(default=None)
816
+
817
+
818
class MarkovAnalysisSpanMarkupSegmentationConfig(AnalysisSchemaModel):
    """
    Settings for the provider-backed ``span_markup`` segmentation method.

    :ivar client: LLM client configuration.
    :vartype client: biblicus.ai.models.LlmClientConfig
    :ivar prompt_template: Prompt template describing what to return (must not include ``{text}``).
    :vartype prompt_template: str
    :ivar system_prompt: System prompt; must contain ``{text}``.
    :vartype system_prompt: str
    :ivar max_rounds: Maximum number of edit rounds (at least 1).
    :vartype max_rounds: int
    :ivar max_edits_per_round: Maximum edits per round (at least 1).
    :vartype max_edits_per_round: int
    :ivar label_attribute: Optional attribute name used to extract segment labels.
    :vartype label_attribute: str or None
    :ivar prepend_label: Whether to prepend the label and a newline to segment text.
    :vartype prepend_label: bool
    :ivar start_label_value: Optional marker prepended to the first segment.
    :vartype start_label_value: str or None
    :ivar end_label_value: Optional marker prepended to the last segment when verified.
    :vartype end_label_value: str or None
    :ivar end_label_verifier: Optional LLM verifier for end-label assignment.
    :vartype end_label_verifier: MarkovAnalysisSpanMarkupEndLabelVerifierConfig or None
    :ivar end_reject_label_value: Optional marker prepended when the verifier rejects an end label.
    :vartype end_reject_label_value: str or None
    :ivar end_reject_reason_prefix: Prefix used for the verifier explanation line.
    :vartype end_reject_reason_prefix: str
    """

    client: LlmClientConfig
    prompt_template: str = Field(min_length=1)
    system_prompt: str = Field(min_length=1)
    max_rounds: int = Field(default=6, ge=1)
    max_edits_per_round: int = Field(default=500, ge=1)
    label_attribute: Optional[str] = Field(default=None, min_length=1)
    prepend_label: bool = False
    start_label_value: Optional[str] = Field(default=None, min_length=1)
    end_label_value: Optional[str] = Field(default=None, min_length=1)
    end_label_verifier: Optional["MarkovAnalysisSpanMarkupEndLabelVerifierConfig"] = None
    end_reject_label_value: Optional[str] = Field(default=None, min_length=1)
    end_reject_reason_prefix: str = Field(default="disconnection_reason", min_length=1)

    @model_validator(mode="after")
    def _validate_prompt_template(self) -> "MarkovAnalysisSpanMarkupSegmentationConfig":
        """
        Enforce placeholder placement and cross-field requirements.

        :return: The validated model.
        :rtype: MarkovAnalysisSpanMarkupSegmentationConfig
        :raises ValueError: For the first rule below that is violated.
        """
        placeholder = "{text}"
        verifier_missing = self.end_label_verifier is None
        # Ordered (violated, message) pairs: only the first violated rule is reported.
        rules = (
            (
                placeholder not in self.system_prompt,
                "segmentation.span_markup.system_prompt must include {text}",
            ),
            (
                placeholder in self.prompt_template,
                "segmentation.span_markup.prompt_template must not include {text}",
            ),
            (
                self.prepend_label and not self.label_attribute,
                "segmentation.span_markup.label_attribute is required when "
                "segmentation.span_markup.prepend_label is true",
            ),
            (
                self.end_label_value is not None and verifier_missing,
                "segmentation.span_markup.end_label_verifier is required when "
                "segmentation.span_markup.end_label_value is set",
            ),
            (
                self.end_reject_label_value is not None and verifier_missing,
                "segmentation.span_markup.end_label_verifier is required when "
                "segmentation.span_markup.end_reject_label_value is set",
            ),
        )
        for violated, message in rules:
            if violated:
                raise ValueError(message)
        return self
883
+
884
+
885
class MarkovAnalysisSpanMarkupEndLabelVerifierConfig(AnalysisSchemaModel):
    """
    Settings for the LLM verifier that decides end-label assignment.

    :ivar client: LLM client configuration.
    :vartype client: biblicus.ai.models.LlmClientConfig
    :ivar system_prompt: System prompt; must contain ``{text}``.
    :vartype system_prompt: str
    :ivar prompt_template: Prompt template for the verifier (must not include ``{text}``).
    :vartype prompt_template: str
    """

    client: LlmClientConfig
    system_prompt: str = Field(min_length=1)
    prompt_template: str = Field(min_length=1)

    @model_validator(mode="after")
    def _validate_prompt_template(
        self,
    ) -> "MarkovAnalysisSpanMarkupEndLabelVerifierConfig":
        """
        Require ``{text}`` in the system prompt and forbid it in the prompt template.

        :return: The validated model.
        :rtype: MarkovAnalysisSpanMarkupEndLabelVerifierConfig
        :raises ValueError: If either placeholder rule is violated.
        """
        placeholder = "{text}"
        system_has_placeholder = placeholder in self.system_prompt
        if not system_has_placeholder:
            raise ValueError(
                "segmentation.span_markup.end_label_verifier.system_prompt must include {text}"
            )
        template_has_placeholder = placeholder in self.prompt_template
        if template_has_placeholder:
            raise ValueError(
                "segmentation.span_markup.end_label_verifier.prompt_template must not include {text}"
            )
        return self
914
+
915
+
916
class MarkovAnalysisTextSourceConfig(AnalysisSchemaModel):
    """
    Controls which extracted texts feed Markov analysis.

    Both settings are optional; when given, each must be at least 1.

    :ivar sample_size: Optional cap on number of documents included.
    :vartype sample_size: int or None
    :ivar min_text_characters: Optional minimum extracted text length.
    :vartype min_text_characters: int or None
    """

    sample_size: Optional[int] = Field(default=None, ge=1)
    min_text_characters: Optional[int] = Field(default=None, ge=1)
928
+
929
+
930
class MarkovAnalysisFixedWindowSegmentationConfig(AnalysisSchemaModel):
    """
    Fixed window segmentation configuration.

    :ivar max_characters: Maximum segment size in characters (at least 1).
    :vartype max_characters: int
    :ivar overlap_characters: Overlap between consecutive segments; must be
        non-negative and strictly smaller than ``max_characters``.
    :vartype overlap_characters: int
    """

    max_characters: int = Field(default=800, ge=1)
    overlap_characters: int = Field(default=0, ge=0)

    @model_validator(mode="after")
    def _validate_overlap(self) -> "MarkovAnalysisFixedWindowSegmentationConfig":
        """
        Reject overlaps that leave no room for the window to advance.

        :return: The validated model.
        :rtype: MarkovAnalysisFixedWindowSegmentationConfig
        :raises ValueError: If ``overlap_characters`` is not smaller than ``max_characters``.
        """
        # An overlap as large as the window means consecutive windows would start
        # at the same (or an earlier) offset, so the configuration is meaningless.
        if self.overlap_characters >= self.max_characters:
            raise ValueError(
                "segmentation.fixed_window.overlap_characters must be smaller than "
                "segmentation.fixed_window.max_characters"
            )
        return self
942
+
943
+
944
class MarkovAnalysisSegmentationConfig(AnalysisSchemaModel):
    """
    Selects the segmentation method and carries the per-method settings.

    :ivar method: Segmentation method identifier (defaults to ``sentence``).
    :vartype method: MarkovAnalysisSegmentationMethod
    :ivar fixed_window: Fixed window settings for ``fixed_window`` method.
    :vartype fixed_window: MarkovAnalysisFixedWindowSegmentationConfig
    :ivar llm: Provider-backed settings, required for the ``llm`` method.
    :vartype llm: MarkovAnalysisLlmSegmentationConfig or None
    :ivar span_markup: Text extract settings, required for the ``span_markup`` method.
    :vartype span_markup: MarkovAnalysisSpanMarkupSegmentationConfig or None
    """

    method: MarkovAnalysisSegmentationMethod = Field(
        default=MarkovAnalysisSegmentationMethod.SENTENCE
    )
    fixed_window: MarkovAnalysisFixedWindowSegmentationConfig = Field(
        default_factory=MarkovAnalysisFixedWindowSegmentationConfig
    )
    llm: Optional[MarkovAnalysisLlmSegmentationConfig] = None
    span_markup: Optional[MarkovAnalysisSpanMarkupSegmentationConfig] = None

    @field_validator("method", mode="before")
    @classmethod
    def _parse_method(cls, value: object) -> MarkovAnalysisSegmentationMethod:
        """
        Coerce a raw value into a segmentation method member.

        :param value: Raw input for ``method``.
        :type value: object
        :return: The matching enum member.
        :rtype: MarkovAnalysisSegmentationMethod
        :raises ValueError: If the value is not a string or is not a known method.
        """
        # Enum members are themselves ``str`` instances, and calling the enum
        # with a member returns that member, so one branch covers both inputs.
        if not isinstance(value, str):
            raise ValueError(
                "segmentation.method must be a string or MarkovAnalysisSegmentationMethod"
            )
        return MarkovAnalysisSegmentationMethod(value)

    @model_validator(mode="after")
    def _validate_requirements(self) -> "MarkovAnalysisSegmentationConfig":
        """
        Require the settings block that matches the selected method.

        :return: The validated model.
        :rtype: MarkovAnalysisSegmentationConfig
        :raises ValueError: If ``llm`` or ``span_markup`` is selected without its settings.
        """
        chosen = self.method
        if chosen == MarkovAnalysisSegmentationMethod.LLM:
            if self.llm is None:
                raise ValueError("segmentation.llm is required when segmentation.method is 'llm'")
        elif chosen == MarkovAnalysisSegmentationMethod.SPAN_MARKUP:
            if self.span_markup is None:
                raise ValueError(
                    "segmentation.span_markup is required when segmentation.method is 'span_markup'"
                )
        return self
983
+
984
+
985
class MarkovAnalysisLlmObservationsConfig(AnalysisSchemaModel):
    """
    Settings for provider-backed observation extraction.

    ``client`` and ``prompt_template`` are only required when ``enabled`` is true.

    :ivar enabled: Whether to enable provider-backed observation extraction.
    :vartype enabled: bool
    :ivar client: LLM client configuration.
    :vartype client: biblicus.ai.models.LlmClientConfig or None
    :ivar prompt_template: Prompt template, expected to contain ``{segment}``.
    :vartype prompt_template: str or None
    :ivar system_prompt: Optional system prompt.
    :vartype system_prompt: str or None
    """

    enabled: bool = Field(default=False)
    client: Optional[LlmClientConfig] = None
    # NOTE(review): the ``{segment}`` placeholder is documented but not validated
    # by this model -- confirm the consumer checks it.
    prompt_template: Optional[str] = None
    system_prompt: Optional[str] = None

    @model_validator(mode="after")
    def _validate_requirements(self) -> "MarkovAnalysisLlmObservationsConfig":
        """
        Require a client and a non-empty prompt template when enabled.

        :return: The validated model.
        :rtype: MarkovAnalysisLlmObservationsConfig
        :raises ValueError: If enabled without a client or prompt template.
        """
        if self.enabled:
            if self.client is None:
                raise ValueError(
                    "llm_observations.client is required when llm_observations.enabled is true"
                )
            # Falsy covers both a missing template and an empty string.
            if not self.prompt_template:
                raise ValueError(
                    "llm_observations.prompt_template is required when llm_observations.enabled is true"
                )
        return self
1017
+
1018
+
1019
class MarkovAnalysisEmbeddingsConfig(AnalysisSchemaModel):
    """
    Settings for provider-backed embeddings.

    The client requirement and the ``text_source`` whitelist are only checked
    when ``enabled`` is true.

    :ivar enabled: Whether to generate embeddings.
    :vartype enabled: bool
    :ivar client: Embeddings client configuration.
    :vartype client: biblicus.ai.models.EmbeddingsClientConfig or None
    :ivar text_source: Which text field to embed (``segment_text`` or ``llm_summary``).
    :vartype text_source: str
    """

    enabled: bool = Field(default=False)
    client: Optional[EmbeddingsClientConfig] = None
    text_source: str = Field(default="segment_text", min_length=1)

    @model_validator(mode="after")
    def _validate_requirements(self) -> "MarkovAnalysisEmbeddingsConfig":
        """
        Require a client and a recognised text source when enabled.

        :return: The validated model.
        :rtype: MarkovAnalysisEmbeddingsConfig
        :raises ValueError: If enabled without a client or with an unknown text source.
        """
        if self.enabled:
            if self.client is None:
                raise ValueError("embeddings.client is required when embeddings.enabled is true")
            source_is_known = self.text_source in ("segment_text", "llm_summary")
            if not source_is_known:
                raise ValueError("embeddings.text_source must be 'segment_text' or 'llm_summary'")
        return self
1044
+
1045
+
1046
class MarkovAnalysisTopicModelingConfig(AnalysisSchemaModel):
    """
    Settings for running topic modeling over Markov analysis segments.

    :ivar enabled: Whether to run topic modeling on segments.
    :vartype enabled: bool
    :ivar recipe: Topic modeling recipe applied to segments; required when enabled.
    :vartype recipe: TopicModelingRecipeConfig or None
    """

    enabled: bool = Field(default=False)
    recipe: Optional["TopicModelingRecipeConfig"] = None

    @model_validator(mode="after")
    def _validate_requirements(self) -> "MarkovAnalysisTopicModelingConfig":
        """
        Require a recipe when enabled and restrict its LLM extraction method.

        :return: The validated model.
        :rtype: MarkovAnalysisTopicModelingConfig
        :raises ValueError: If enabled without a recipe, or if the recipe enables
            LLM extraction with a method other than ``single``.
        """
        if self.enabled:
            recipe = self.recipe
            if recipe is None:
                raise ValueError(
                    "topic_modeling.recipe is required when topic_modeling.enabled is true"
                )
            extraction = recipe.llm_extraction
            uses_other_method = extraction.method != TopicModelingLlmExtractionMethod.SINGLE
            if extraction.enabled and uses_other_method:
                raise ValueError(
                    "topic_modeling.recipe.llm_extraction.method must be 'single' for Markov topic modeling"
                )
        return self
1074
+
1075
+
1076
class MarkovAnalysisObservationsEncoder(str, Enum):
    """
    Closed set of observation encoder identifiers.

    Members mix in ``str``, so each member compares equal to its raw string value.
    """

    TFIDF = "tfidf"
    EMBEDDING = "embedding"
    HYBRID = "hybrid"
1084
+
1085
+
1086
class MarkovAnalysisTfidfObservationConfig(AnalysisSchemaModel):
    """
    TF-IDF encoder configuration for local observations.

    :ivar max_features: Maximum vocabulary size (at least 1).
    :vartype max_features: int
    :ivar ngram_range: Inclusive n-gram range as ``[minimum, maximum]`` with
        ``1 <= minimum <= maximum``.
    :vartype ngram_range: list[int]
    """

    max_features: int = Field(default=2000, ge=1)
    ngram_range: List[int] = Field(default_factory=lambda: [1, 2])

    @field_validator("ngram_range", mode="before")
    @classmethod
    def _validate_ngram_range(cls, value: object) -> object:
        """
        Check that ``ngram_range`` is a well-formed inclusive pair of integers.

        :param value: Raw input for ``ngram_range``.
        :type value: object
        :return: ``None`` unchanged (left for the field type to reject), otherwise
            the pair as a list.
        :rtype: object
        :raises ValueError: If the value is not a two-item list or tuple of
            integers, or the bounds do not form a valid range.
        """
        if value is None:
            return value
        # Tuples are accepted alongside lists so a ``(1, 2)`` pair built in code
        # is not rejected; the result is always normalised to a list.
        if not isinstance(value, (list, tuple)) or len(value) != 2:
            raise ValueError("tfidf.ngram_range must be a list of two integers")
        # ``bool`` is a subclass of ``int``; without the explicit check
        # ``[True, 2]`` would slip through as ``[1, 2]``.
        if any(isinstance(item, bool) or not isinstance(item, int) for item in value):
            raise ValueError("tfidf.ngram_range must be a list of two integers")
        lower, upper = value
        if lower < 1 or upper < lower:
            raise ValueError("tfidf.ngram_range must be a valid inclusive range")
        return list(value)
1111
+
1112
+
1113
class MarkovAnalysisObservationsConfig(AnalysisSchemaModel):
    """
    Describes how segments are turned into model observations.

    :ivar encoder: Observation encoder identifier (defaults to ``tfidf``).
    :vartype encoder: MarkovAnalysisObservationsEncoder
    :ivar tfidf: TF-IDF encoder settings.
    :vartype tfidf: MarkovAnalysisTfidfObservationConfig
    :ivar text_source: Which text field to encode for ``tfidf`` (``segment_text`` or ``llm_summary``).
    :vartype text_source: str
    :ivar categorical_source: Which field provides categorical labels for hybrid/categorical use.
    :vartype categorical_source: str
    :ivar numeric_source: Which field provides a numeric scalar feature for hybrid use.
    :vartype numeric_source: str
    """

    encoder: MarkovAnalysisObservationsEncoder = Field(
        default=MarkovAnalysisObservationsEncoder.TFIDF
    )
    tfidf: MarkovAnalysisTfidfObservationConfig = Field(
        default_factory=MarkovAnalysisTfidfObservationConfig
    )
    text_source: str = Field(default="segment_text", min_length=1)
    categorical_source: str = Field(default="llm_label", min_length=1)
    numeric_source: str = Field(default="llm_label_confidence", min_length=1)

    @field_validator("encoder", mode="before")
    @classmethod
    def _parse_encoder(cls, value: object) -> MarkovAnalysisObservationsEncoder:
        """
        Coerce a raw value into an observation encoder member.

        :param value: Raw input for ``encoder``.
        :type value: object
        :return: The matching enum member.
        :rtype: MarkovAnalysisObservationsEncoder
        :raises ValueError: If the value is not a string or is not a known encoder.
        """
        # Enum members are themselves ``str`` instances, and calling the enum
        # with a member returns that member, so one branch covers both inputs.
        if not isinstance(value, str):
            raise ValueError(
                "observations.encoder must be a string or MarkovAnalysisObservationsEncoder"
            )
        return MarkovAnalysisObservationsEncoder(value)

    @model_validator(mode="after")
    def _validate_sources(self) -> "MarkovAnalysisObservationsConfig":
        """
        Restrict ``text_source`` to the two supported field names.

        :return: The validated model.
        :rtype: MarkovAnalysisObservationsConfig
        :raises ValueError: If ``text_source`` is not a supported value.
        """
        if self.text_source in ("segment_text", "llm_summary"):
            return self
        raise ValueError("observations.text_source must be 'segment_text' or 'llm_summary'")
1155
+
1156
+
1157
class MarkovAnalysisModelFamily(str, Enum):
    """
    Closed set of Markov model family identifiers.

    Members mix in ``str``, so each member compares equal to its raw string value.
    """

    GAUSSIAN = "gaussian"
    CATEGORICAL = "categorical"
1164
+
1165
+
1166
class MarkovAnalysisModelConfig(AnalysisSchemaModel):
    """
    Settings for the Markov model itself.

    :ivar family: Model family identifier (defaults to ``gaussian``).
    :vartype family: MarkovAnalysisModelFamily
    :ivar n_states: Number of hidden states to learn (at least 1).
    :vartype n_states: int
    """

    family: MarkovAnalysisModelFamily = Field(default=MarkovAnalysisModelFamily.GAUSSIAN)
    n_states: int = Field(default=8, ge=1)

    @field_validator("family", mode="before")
    @classmethod
    def _parse_family(cls, value: object) -> MarkovAnalysisModelFamily:
        """
        Coerce a raw value into a model family member.

        :param value: Raw input for ``family``.
        :type value: object
        :return: The matching enum member.
        :rtype: MarkovAnalysisModelFamily
        :raises ValueError: If the value is not a string or is not a known family.
        """
        # Enum members are themselves ``str`` instances, and calling the enum
        # with a member returns that member, so one branch covers both inputs.
        if not isinstance(value, str):
            raise ValueError("model.family must be a string or MarkovAnalysisModelFamily")
        return MarkovAnalysisModelFamily(value)
1187
+
1188
+
1189
class MarkovAnalysisArtifactsGraphVizConfig(AnalysisSchemaModel):
    """
    GraphViz export configuration.

    :ivar enabled: Whether to write GraphViz transitions output.
    :vartype enabled: bool
    :ivar rankdir: GraphViz rank direction (e.g., LR or TB).
    :vartype rankdir: str
    :ivar min_edge_weight: Minimum edge weight to include in GraphViz output (non-negative).
    :vartype min_edge_weight: float
    :ivar start_state_id: Optional non-negative state id to pin at the start of the layout.
    :vartype start_state_id: int or None
    :ivar end_state_id: Optional non-negative state id to pin at the end of the layout.
    :vartype end_state_id: int or None
    """

    enabled: bool = Field(default=False)
    rankdir: str = Field(default="LR", min_length=1)
    min_edge_weight: float = Field(default=0.0, ge=0.0)
    # State identifiers are constrained to ``ge=0`` on MarkovAnalysisState, so a
    # negative pin could never refer to a real state; reject it at load time.
    start_state_id: Optional[int] = Field(default=None, ge=0)
    end_state_id: Optional[int] = Field(default=None, ge=0)
1210
+
1211
+
1212
class MarkovAnalysisArtifactsConfig(AnalysisSchemaModel):
    """
    Groups the artifact export settings for Markov analysis.

    :ivar graphviz: GraphViz export settings; a default instance is created when omitted.
    :vartype graphviz: MarkovAnalysisArtifactsGraphVizConfig
    """

    graphviz: MarkovAnalysisArtifactsGraphVizConfig = Field(
        default_factory=MarkovAnalysisArtifactsGraphVizConfig,
    )
1223
+
1224
+
1225
class MarkovAnalysisReportConfig(AnalysisSchemaModel):
    """
    Settings that shape the Markov analysis report.

    :ivar max_state_exemplars: Maximum exemplar segments stored per state (zero or more).
    :vartype max_state_exemplars: int
    :ivar state_naming: Optional provider-backed state naming configuration.
    :vartype state_naming: MarkovAnalysisStateNamingConfig or None
    """

    max_state_exemplars: int = Field(default=5, ge=0)
    # Quoted forward reference: the naming config class is declared later in the module.
    state_naming: Optional["MarkovAnalysisStateNamingConfig"] = None
1237
+
1238
+
1239
class MarkovAnalysisStateNamingConfig(AnalysisSchemaModel):
    """
    Settings for provider-backed naming of Markov states.

    The client and both prompts are only required when ``enabled`` is true.

    :ivar enabled: Whether state naming is enabled.
    :vartype enabled: bool
    :ivar client: LLM client configuration.
    :vartype client: biblicus.ai.models.LlmClientConfig or None
    :ivar system_prompt: System prompt containing the ``{context_pack}`` placeholder.
    :vartype system_prompt: str or None
    :ivar prompt_template: User prompt template for naming (must not contain ``{context_pack}``).
    :vartype prompt_template: str or None
    :ivar token_budget: Maximum tokens for the context pack text.
    :vartype token_budget: int
    :ivar max_exemplars_per_state: Maximum exemplars per state in the context pack.
    :vartype max_exemplars_per_state: int
    :ivar max_name_words: Maximum words allowed in each state name (short noun phrase).
    :vartype max_name_words: int
    :ivar max_retries: Maximum retries when the naming response is invalid.
    :vartype max_retries: int
    """

    enabled: bool = False
    client: Optional[LlmClientConfig] = None
    system_prompt: Optional[str] = None
    prompt_template: Optional[str] = None
    token_budget: int = Field(default=256, ge=1)
    max_exemplars_per_state: int = Field(default=3, ge=1)
    max_name_words: int = Field(default=4, ge=1)
    max_retries: int = Field(default=1, ge=0)

    @model_validator(mode="after")
    def _validate_state_naming(self) -> "MarkovAnalysisStateNamingConfig":
        """
        Check the client, prompts and placeholder placement when enabled.

        :return: The validated model.
        :rtype: MarkovAnalysisStateNamingConfig
        :raises ValueError: If enabled and the client or a prompt is missing or
            blank, or the ``{context_pack}`` placeholder is misplaced.
        """
        if self.enabled:
            placeholder = "{context_pack}"
            if self.client is None:
                raise ValueError("report.state_naming.client is required when enabled")
            # A missing prompt and a whitespace-only prompt are treated alike.
            system_text = self.system_prompt or ""
            if not system_text.strip():
                raise ValueError("report.state_naming.system_prompt is required when enabled")
            if placeholder not in system_text:
                raise ValueError(
                    'report.state_naming.system_prompt must include the "{context_pack}" placeholder'
                )
            template_text = self.prompt_template or ""
            if not template_text.strip():
                raise ValueError("report.state_naming.prompt_template is required when enabled")
            if placeholder in template_text:
                raise ValueError(
                    'report.state_naming.prompt_template must not include "{context_pack}"'
                )
        return self
1289
+
1290
+
1291
class MarkovAnalysisRecipeConfig(AnalysisSchemaModel):
    """
    Top-level recipe for a Markov analysis run.

    Every section falls back to a default-constructed instance when omitted.

    :ivar schema_version: Analysis schema version; must equal ``ANALYSIS_SCHEMA_VERSION``.
    :vartype schema_version: int
    :ivar text_source: Text source configuration.
    :vartype text_source: MarkovAnalysisTextSourceConfig
    :ivar segmentation: Segmentation configuration.
    :vartype segmentation: MarkovAnalysisSegmentationConfig
    :ivar observations: Observation encoder configuration.
    :vartype observations: MarkovAnalysisObservationsConfig
    :ivar model: Markov model configuration.
    :vartype model: MarkovAnalysisModelConfig
    :ivar topic_modeling: Topic modeling configuration.
    :vartype topic_modeling: MarkovAnalysisTopicModelingConfig
    :ivar llm_observations: Provider-backed observation extraction configuration.
    :vartype llm_observations: MarkovAnalysisLlmObservationsConfig
    :ivar embeddings: Provider-backed embeddings configuration.
    :vartype embeddings: MarkovAnalysisEmbeddingsConfig
    :ivar artifacts: Artifact configuration.
    :vartype artifacts: MarkovAnalysisArtifactsConfig
    :ivar report: Report configuration.
    :vartype report: MarkovAnalysisReportConfig
    """

    schema_version: int = Field(default=ANALYSIS_SCHEMA_VERSION, ge=1)
    text_source: MarkovAnalysisTextSourceConfig = Field(
        default_factory=MarkovAnalysisTextSourceConfig,
    )
    segmentation: MarkovAnalysisSegmentationConfig = Field(
        default_factory=MarkovAnalysisSegmentationConfig,
    )
    observations: MarkovAnalysisObservationsConfig = Field(
        default_factory=MarkovAnalysisObservationsConfig,
    )
    model: MarkovAnalysisModelConfig = Field(default_factory=MarkovAnalysisModelConfig)
    topic_modeling: MarkovAnalysisTopicModelingConfig = Field(
        default_factory=MarkovAnalysisTopicModelingConfig,
    )
    llm_observations: MarkovAnalysisLlmObservationsConfig = Field(
        default_factory=MarkovAnalysisLlmObservationsConfig,
    )
    embeddings: MarkovAnalysisEmbeddingsConfig = Field(
        default_factory=MarkovAnalysisEmbeddingsConfig,
    )
    artifacts: MarkovAnalysisArtifactsConfig = Field(default_factory=MarkovAnalysisArtifactsConfig)
    report: MarkovAnalysisReportConfig = Field(default_factory=MarkovAnalysisReportConfig)

    @model_validator(mode="after")
    def _validate_schema_version(self) -> "MarkovAnalysisRecipeConfig":
        """
        Accept only the schema version this module was written for.

        :return: The validated model.
        :rtype: MarkovAnalysisRecipeConfig
        :raises ValueError: If ``schema_version`` differs from ``ANALYSIS_SCHEMA_VERSION``.
        """
        if self.schema_version == ANALYSIS_SCHEMA_VERSION:
            return self
        raise ValueError(f"Unsupported analysis schema version: {self.schema_version}")
1341
+
1342
+
1343
class MarkovAnalysisTextCollectionReport(AnalysisSchemaModel):
    """
    Outcome of the text collection stage of Markov analysis.

    All counters are required and must be zero or greater.

    :ivar status: Stage status.
    :vartype status: MarkovAnalysisStageStatus
    :ivar source_items: Count of items in extraction run.
    :vartype source_items: int
    :ivar documents: Count of documents included.
    :vartype documents: int
    :ivar sample_size: Sample size applied.
    :vartype sample_size: int or None
    :ivar min_text_characters: Minimum length filter applied.
    :vartype min_text_characters: int or None
    :ivar empty_texts: Count of empty extracted texts.
    :vartype empty_texts: int
    :ivar skipped_items: Count of items skipped for missing/invalid text.
    :vartype skipped_items: int
    :ivar warnings: Warning messages.
    :vartype warnings: list[str]
    :ivar errors: Error messages.
    :vartype errors: list[str]
    """

    status: MarkovAnalysisStageStatus

    # Required counters.
    source_items: int = Field(ge=0)
    documents: int = Field(ge=0)

    # Filters echoed from the recipe; ``None`` when not set.
    sample_size: Optional[int] = None
    min_text_characters: Optional[int] = None

    # Required counters.
    empty_texts: int = Field(ge=0)
    skipped_items: int = Field(ge=0)

    # Message lists default to empty.
    warnings: List[str] = Field(default_factory=list)
    errors: List[str] = Field(default_factory=list)
1376
+
1377
+
1378
class MarkovAnalysisSegment(AnalysisSchemaModel):
    """
    One text segment cut from a source item.

    :ivar item_id: Source item identifier (non-empty).
    :vartype item_id: str
    :ivar segment_index: One-based segment index within the item.
    :vartype segment_index: int
    :ivar text: Segment text (non-empty).
    :vartype text: str
    """

    item_id: str = Field(min_length=1)
    # ``ge=1`` enforces the one-based indexing.
    segment_index: int = Field(ge=1)
    text: str = Field(min_length=1)
1393
+
1394
+
1395
class MarkovAnalysisObservation(AnalysisSchemaModel):
    """
    Everything recorded about one segment before modeling.

    Only the segment identity and text are required; every enrichment field
    defaults to ``None``.

    :ivar item_id: Source item identifier (non-empty).
    :vartype item_id: str
    :ivar segment_index: One-based segment index within the item.
    :vartype segment_index: int
    :ivar segment_text: Segment text (non-empty).
    :vartype segment_text: str
    :ivar llm_label: Optional provider-proposed label.
    :vartype llm_label: str or None
    :ivar llm_label_confidence: Optional provider-proposed confidence, within ``[0.0, 1.0]``.
    :vartype llm_label_confidence: float or None
    :ivar llm_summary: Optional provider-proposed summary.
    :vartype llm_summary: str or None
    :ivar topic_id: Optional topic identifier from topic modeling.
    :vartype topic_id: int or None
    :ivar topic_label: Optional topic label from topic modeling.
    :vartype topic_label: str or None
    :ivar embedding: Optional embedding vector for the configured embedding text source.
    :vartype embedding: list[float] or None
    """

    # Segment identity and text.
    item_id: str = Field(min_length=1)
    segment_index: int = Field(ge=1)
    segment_text: str = Field(min_length=1)

    # Provider-proposed enrichment.
    llm_label: Optional[str] = None
    llm_label_confidence: Optional[float] = Field(default=None, ge=0.0, le=1.0)
    llm_summary: Optional[str] = None

    # Topic modeling enrichment.
    topic_id: Optional[int] = None
    topic_label: Optional[str] = None

    # Embedding enrichment.
    embedding: Optional[List[float]] = None
1428
+
1429
+
1430
class MarkovAnalysisState(AnalysisSchemaModel):
    """
    One state reported by Markov analysis.

    :ivar state_id: State identifier (zero or greater).
    :vartype state_id: int
    :ivar label: Optional human-readable label.
    :vartype label: str or None
    :ivar exemplars: Example segments representative of the state; empty by default.
    :vartype exemplars: list[str]
    """

    state_id: int = Field(ge=0)
    label: Optional[str] = None
    exemplars: List[str] = Field(default_factory=list)
1445
+
1446
+
1447
class MarkovAnalysisTransition(AnalysisSchemaModel):
    """
    A weighted, directed edge from one state to another.

    :ivar from_state: Source state identifier (zero or greater).
    :vartype from_state: int
    :ivar to_state: Destination state identifier (zero or greater).
    :vartype to_state: int
    :ivar weight: Transition weight.
    :vartype weight: float
    """

    from_state: int = Field(ge=0)
    to_state: int = Field(ge=0)
    # No range constraint is placed on the weight by this schema.
    weight: float
1462
+
1463
+
1464
class MarkovAnalysisDecodedPath(AnalysisSchemaModel):
    """
    The decoded sequence of states for one source item.

    :ivar item_id: Source item identifier (non-empty).
    :vartype item_id: str
    :ivar state_sequence: Most likely state sequence over segments; empty by default.
    :vartype state_sequence: list[int]
    """

    item_id: str = Field(min_length=1)
    state_sequence: List[int] = Field(default_factory=list)
1476
+
1477
+
1478
class MarkovAnalysisReport(AnalysisSchemaModel):
    """
    Full report payload produced by Markov analysis.

    Only ``text_collection`` and ``status`` are required; every list defaults to empty.

    :ivar text_collection: Text collection report.
    :vartype text_collection: MarkovAnalysisTextCollectionReport
    :ivar status: Overall analysis status.
    :vartype status: MarkovAnalysisStageStatus
    :ivar states: State records.
    :vartype states: list[MarkovAnalysisState]
    :ivar transitions: Transition edges.
    :vartype transitions: list[MarkovAnalysisTransition]
    :ivar decoded_paths: Per-item decoded paths.
    :vartype decoded_paths: list[MarkovAnalysisDecodedPath]
    :ivar topic_modeling: Optional topic modeling report for segment topics.
    :vartype topic_modeling: TopicModelingReport or None
    :ivar warnings: Warning messages.
    :vartype warnings: list[str]
    :ivar errors: Error messages.
    :vartype errors: list[str]
    """

    # Required sections.
    text_collection: MarkovAnalysisTextCollectionReport
    status: MarkovAnalysisStageStatus

    # Model results.
    states: List[MarkovAnalysisState] = Field(default_factory=list)
    transitions: List[MarkovAnalysisTransition] = Field(default_factory=list)
    decoded_paths: List[MarkovAnalysisDecodedPath] = Field(default_factory=list)

    # Optional topic modeling section.
    topic_modeling: Optional[TopicModelingReport] = None

    # Messages.
    warnings: List[str] = Field(default_factory=list)
    errors: List[str] = Field(default_factory=list)
1508
+
1509
+
1510
class MarkovAnalysisOutput(AnalysisSchemaModel):
    """
    Envelope that bundles a Markov analysis report with its run metadata.

    :ivar schema_version: Analysis schema version (defaults to ``ANALYSIS_SCHEMA_VERSION``).
    :vartype schema_version: int
    :ivar analysis_id: Analysis backend identifier.
    :vartype analysis_id: str
    :ivar generated_at: ISO 8601 (International Organization for Standardization)
        timestamp for output creation.
    :vartype generated_at: str
    :ivar run: Analysis run manifest.
    :vartype run: AnalysisRunManifest
    :ivar report: Markov analysis report data.
    :vartype report: MarkovAnalysisReport
    """

    schema_version: int = Field(default=ANALYSIS_SCHEMA_VERSION, ge=1)
    analysis_id: str
    # Stored as a plain string; this schema does not parse or validate the timestamp.
    generated_at: str
    run: AnalysisRunManifest
    report: MarkovAnalysisReport