biblicus 0.14.0__py3-none-any.whl → 0.15.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblicus/__init__.py +1 -1
- biblicus/_vendor/dotyaml/__init__.py +2 -2
- biblicus/_vendor/dotyaml/loader.py +40 -1
- biblicus/ai/__init__.py +39 -0
- biblicus/ai/embeddings.py +114 -0
- biblicus/ai/llm.py +138 -0
- biblicus/ai/models.py +226 -0
- biblicus/analysis/__init__.py +5 -2
- biblicus/analysis/markov.py +1624 -0
- biblicus/analysis/models.py +754 -1
- biblicus/analysis/topic_modeling.py +98 -19
- biblicus/backends/sqlite_full_text_search.py +4 -2
- biblicus/cli.py +118 -23
- biblicus/recipes.py +136 -0
- biblicus/text/__init__.py +43 -0
- biblicus/text/annotate.py +222 -0
- biblicus/text/extract.py +210 -0
- biblicus/text/link.py +519 -0
- biblicus/text/markup.py +200 -0
- biblicus/text/models.py +319 -0
- biblicus/text/prompts.py +113 -0
- biblicus/text/redact.py +229 -0
- biblicus/text/slice.py +155 -0
- biblicus/text/tool_loop.py +334 -0
- {biblicus-0.14.0.dist-info → biblicus-0.15.0.dist-info}/METADATA +88 -25
- {biblicus-0.14.0.dist-info → biblicus-0.15.0.dist-info}/RECORD +30 -15
- biblicus/analysis/llm.py +0 -106
- {biblicus-0.14.0.dist-info → biblicus-0.15.0.dist-info}/WHEEL +0 -0
- {biblicus-0.14.0.dist-info → biblicus-0.15.0.dist-info}/entry_points.txt +0 -0
- {biblicus-0.14.0.dist-info → biblicus-0.15.0.dist-info}/licenses/LICENSE +0 -0
- {biblicus-0.14.0.dist-info → biblicus-0.15.0.dist-info}/top_level.txt +0 -0
biblicus/analysis/models.py
CHANGED
|
@@ -9,9 +9,9 @@ from typing import Any, Dict, List, Optional
|
|
|
9
9
|
|
|
10
10
|
from pydantic import Field, field_validator, model_validator
|
|
11
11
|
|
|
12
|
+
from ..ai.models import EmbeddingsClientConfig, LlmClientConfig
|
|
12
13
|
from ..constants import ANALYSIS_SCHEMA_VERSION
|
|
13
14
|
from ..models import ExtractionRunReference
|
|
14
|
-
from .llm import LlmClientConfig
|
|
15
15
|
from .schema import AnalysisSchemaModel
|
|
16
16
|
|
|
17
17
|
|
|
@@ -775,3 +775,756 @@ class TopicModelingOutput(AnalysisSchemaModel):
|
|
|
775
775
|
generated_at: str
|
|
776
776
|
run: AnalysisRunManifest
|
|
777
777
|
report: TopicModelingReport
|
|
778
|
+
|
|
779
|
+
|
|
780
|
+
class MarkovAnalysisStageStatus(str, Enum):
    """
    Status reported for a Markov analysis stage.

    Each member is also a ``str``, so it compares equal to its raw string value.
    """

    SKIPPED = "skipped"
    COMPLETE = "complete"
    FAILED = "failed"
|
|
788
|
+
|
|
789
|
+
|
|
790
|
+
class MarkovAnalysisSegmentationMethod(str, Enum):
    """
    Identifiers for the segmentation methods available to Markov analysis.

    Each member is also a ``str``, so it compares equal to its raw string value.
    """

    SENTENCE = "sentence"
    FIXED_WINDOW = "fixed_window"
    LLM = "llm"
    SPAN_MARKUP = "span_markup"
|
|
799
|
+
|
|
800
|
+
|
|
801
|
+
class MarkovAnalysisLlmSegmentationConfig(AnalysisSchemaModel):
    """
    Settings for segmentation performed by an LLM provider.

    :ivar client: LLM client configuration.
    :vartype client: biblicus.ai.models.LlmClientConfig
    :ivar prompt_template: Non-empty prompt template; documented as containing ``{text}``
        (this model does not check for the placeholder).
    :vartype prompt_template: str
    :ivar system_prompt: Optional system prompt.
    :vartype system_prompt: str or None
    """

    client: LlmClientConfig
    prompt_template: str = Field(min_length=1)
    system_prompt: Optional[str] = None
|
|
816
|
+
|
|
817
|
+
|
|
818
|
+
class MarkovAnalysisSpanMarkupSegmentationConfig(AnalysisSchemaModel):
    """
    Settings for span-markup segmentation performed by an LLM provider.

    :ivar client: LLM client configuration.
    :vartype client: biblicus.ai.models.LlmClientConfig
    :ivar prompt_template: Non-empty prompt describing what to return; must not contain ``{text}``.
    :vartype prompt_template: str
    :ivar system_prompt: Non-empty system prompt; must contain ``{text}``.
    :vartype system_prompt: str
    :ivar max_rounds: Maximum number of edit rounds (at least 1).
    :vartype max_rounds: int
    :ivar max_edits_per_round: Maximum edits per round (at least 1).
    :vartype max_edits_per_round: int
    :ivar label_attribute: Optional attribute name used to extract segment labels.
    :vartype label_attribute: str or None
    :ivar prepend_label: Whether to prepend the label and a newline to segment text.
        Requires ``label_attribute``.
    :vartype prepend_label: bool
    :ivar start_label_value: Optional marker prepended to the first segment.
    :vartype start_label_value: str or None
    :ivar end_label_value: Optional marker prepended to the last segment when verified.
        Requires ``end_label_verifier``.
    :vartype end_label_value: str or None
    :ivar end_label_verifier: Optional LLM verifier for end-label assignment.
    :vartype end_label_verifier: MarkovAnalysisSpanMarkupEndLabelVerifierConfig or None
    :ivar end_reject_label_value: Optional marker prepended when the verifier rejects an end label.
        Requires ``end_label_verifier``.
    :vartype end_reject_label_value: str or None
    :ivar end_reject_reason_prefix: Prefix used for the verifier explanation line.
    :vartype end_reject_reason_prefix: str
    """

    client: LlmClientConfig
    prompt_template: str = Field(min_length=1)
    system_prompt: str = Field(min_length=1)
    max_rounds: int = Field(default=6, ge=1)
    max_edits_per_round: int = Field(default=500, ge=1)
    label_attribute: Optional[str] = Field(default=None, min_length=1)
    prepend_label: bool = False
    start_label_value: Optional[str] = Field(default=None, min_length=1)
    end_label_value: Optional[str] = Field(default=None, min_length=1)
    end_label_verifier: Optional["MarkovAnalysisSpanMarkupEndLabelVerifierConfig"] = None
    end_reject_label_value: Optional[str] = Field(default=None, min_length=1)
    end_reject_reason_prefix: str = Field(default="disconnection_reason", min_length=1)

    @model_validator(mode="after")
    def _validate_prompt_template(self) -> "MarkovAnalysisSpanMarkupSegmentationConfig":
        """
        Enforce placeholder placement and the dependencies between optional fields.

        :return: The validated model instance.
        :rtype: MarkovAnalysisSpanMarkupSegmentationConfig
        :raises ValueError: If a placeholder rule or a field dependency is violated.
        """
        placeholder = "{text}"

        # The text placeholder belongs in the system prompt only.
        if placeholder not in self.system_prompt:
            raise ValueError("segmentation.span_markup.system_prompt must include {text}")
        if placeholder in self.prompt_template:
            raise ValueError("segmentation.span_markup.prompt_template must not include {text}")

        # A label can only be prepended when an attribute to read it from is named.
        if self.prepend_label and not self.label_attribute:
            raise ValueError(
                "segmentation.span_markup.label_attribute is required when "
                "segmentation.span_markup.prepend_label is true"
            )

        # Both end-label markers depend on a verifier being configured.
        verifier_missing = self.end_label_verifier is None
        if verifier_missing and self.end_label_value is not None:
            raise ValueError(
                "segmentation.span_markup.end_label_verifier is required when "
                "segmentation.span_markup.end_label_value is set"
            )
        if verifier_missing and self.end_reject_label_value is not None:
            raise ValueError(
                "segmentation.span_markup.end_label_verifier is required when "
                "segmentation.span_markup.end_reject_label_value is set"
            )
        return self
|
|
883
|
+
|
|
884
|
+
|
|
885
|
+
class MarkovAnalysisSpanMarkupEndLabelVerifierConfig(AnalysisSchemaModel):
    """
    Settings for the LLM verifier that decides end-label assignment.

    :ivar client: LLM client configuration.
    :vartype client: biblicus.ai.models.LlmClientConfig
    :ivar system_prompt: Non-empty system prompt; must contain ``{text}``.
    :vartype system_prompt: str
    :ivar prompt_template: Non-empty verifier prompt; must not contain ``{text}``.
    :vartype prompt_template: str
    """

    client: LlmClientConfig
    system_prompt: str = Field(min_length=1)
    prompt_template: str = Field(min_length=1)

    @model_validator(mode="after")
    def _validate_prompt_template(
        self,
    ) -> "MarkovAnalysisSpanMarkupEndLabelVerifierConfig":
        """
        Require ``{text}`` in the system prompt and forbid it in the prompt template.

        :return: The validated model instance.
        :rtype: MarkovAnalysisSpanMarkupEndLabelVerifierConfig
        :raises ValueError: If the placeholder is missing or misplaced.
        """
        placeholder = "{text}"
        has_placeholder_in_system = placeholder in self.system_prompt
        if not has_placeholder_in_system:
            raise ValueError(
                "segmentation.span_markup.end_label_verifier.system_prompt must include {text}"
            )
        if placeholder in self.prompt_template:
            raise ValueError(
                "segmentation.span_markup.end_label_verifier.prompt_template must not include {text}"
            )
        return self
|
|
914
|
+
|
|
915
|
+
|
|
916
|
+
class MarkovAnalysisTextSourceConfig(AnalysisSchemaModel):
    """
    Controls which extracted texts feed Markov analysis.

    :ivar sample_size: Optional cap on the number of documents included (at least 1 when set).
    :vartype sample_size: int or None
    :ivar min_text_characters: Optional minimum extracted text length (at least 1 when set).
    :vartype min_text_characters: int or None
    """

    sample_size: Optional[int] = Field(default=None, ge=1)
    min_text_characters: Optional[int] = Field(default=None, ge=1)
|
|
928
|
+
|
|
929
|
+
|
|
930
|
+
class MarkovAnalysisFixedWindowSegmentationConfig(AnalysisSchemaModel):
    """
    Settings for fixed-window segmentation.

    :ivar max_characters: Maximum segment size in characters (at least 1).
    :vartype max_characters: int
    :ivar overlap_characters: Overlap between consecutive segments (zero or more).
    :vartype overlap_characters: int
    """

    max_characters: int = Field(default=800, ge=1)
    overlap_characters: int = Field(default=0, ge=0)
|
|
942
|
+
|
|
943
|
+
|
|
944
|
+
class MarkovAnalysisSegmentationConfig(AnalysisSchemaModel):
    """
    Selects the segmentation method and carries each method's settings.

    :ivar method: Segmentation method identifier.
    :vartype method: MarkovAnalysisSegmentationMethod
    :ivar fixed_window: Fixed window settings for the ``fixed_window`` method.
    :vartype fixed_window: MarkovAnalysisFixedWindowSegmentationConfig
    :ivar llm: Provider settings; required when ``method`` is ``llm``.
    :vartype llm: MarkovAnalysisLlmSegmentationConfig or None
    :ivar span_markup: Text extract settings; required when ``method`` is ``span_markup``.
    :vartype span_markup: MarkovAnalysisSpanMarkupSegmentationConfig or None
    """

    method: MarkovAnalysisSegmentationMethod = Field(
        default=MarkovAnalysisSegmentationMethod.SENTENCE
    )
    fixed_window: MarkovAnalysisFixedWindowSegmentationConfig = Field(
        default_factory=MarkovAnalysisFixedWindowSegmentationConfig
    )
    llm: Optional[MarkovAnalysisLlmSegmentationConfig] = None
    span_markup: Optional[MarkovAnalysisSpanMarkupSegmentationConfig] = None

    @field_validator("method", mode="before")
    @classmethod
    def _parse_method(cls, value: object) -> MarkovAnalysisSegmentationMethod:
        """
        Coerce the raw ``method`` input into an enum member.

        :param value: Raw input supplied for ``method``.
        :type value: object
        :return: The matching segmentation method.
        :rtype: MarkovAnalysisSegmentationMethod
        :raises ValueError: If the input is not a string or does not name a known method.
        """
        if isinstance(value, MarkovAnalysisSegmentationMethod):
            return value
        if not isinstance(value, str):
            raise ValueError("segmentation.method must be a string or MarkovAnalysisSegmentationMethod")
        return MarkovAnalysisSegmentationMethod(value)

    @model_validator(mode="after")
    def _validate_requirements(self) -> "MarkovAnalysisSegmentationConfig":
        """
        Require the settings block that matches the selected method.

        :return: The validated model instance.
        :rtype: MarkovAnalysisSegmentationConfig
        :raises ValueError: If the selected method has no accompanying settings.
        """
        selected = self.method
        if self.llm is None and selected == MarkovAnalysisSegmentationMethod.LLM:
            raise ValueError("segmentation.llm is required when segmentation.method is 'llm'")
        if self.span_markup is None and selected == MarkovAnalysisSegmentationMethod.SPAN_MARKUP:
            raise ValueError(
                "segmentation.span_markup is required when segmentation.method is 'span_markup'"
            )
        return self
|
|
983
|
+
|
|
984
|
+
|
|
985
|
+
class MarkovAnalysisLlmObservationsConfig(AnalysisSchemaModel):
    """
    Settings for observation extraction performed by an LLM provider.

    :ivar enabled: Whether provider-backed observation extraction is enabled.
    :vartype enabled: bool
    :ivar client: LLM client configuration; required when enabled.
    :vartype client: biblicus.ai.models.LlmClientConfig or None
    :ivar prompt_template: Prompt template documented as containing ``{segment}``;
        required (non-empty) when enabled.
    :vartype prompt_template: str or None
    :ivar system_prompt: Optional system prompt.
    :vartype system_prompt: str or None
    """

    enabled: bool = Field(default=False)
    client: Optional[LlmClientConfig] = None
    prompt_template: Optional[str] = None
    system_prompt: Optional[str] = None

    @model_validator(mode="after")
    def _validate_requirements(self) -> "MarkovAnalysisLlmObservationsConfig":
        """
        Require a client and a prompt template once the feature is enabled.

        :return: The validated model instance.
        :rtype: MarkovAnalysisLlmObservationsConfig
        :raises ValueError: If enabled without a client or a prompt template.
        """
        if self.enabled:
            if self.client is None:
                raise ValueError(
                    "llm_observations.client is required when llm_observations.enabled is true"
                )
            if not self.prompt_template:
                raise ValueError(
                    "llm_observations.prompt_template is required when llm_observations.enabled is true"
                )
        return self
|
|
1017
|
+
|
|
1018
|
+
|
|
1019
|
+
class MarkovAnalysisEmbeddingsConfig(AnalysisSchemaModel):
    """
    Settings for embeddings generated by a provider.

    :ivar enabled: Whether to generate embeddings.
    :vartype enabled: bool
    :ivar client: Embeddings client configuration; required when enabled.
    :vartype client: biblicus.ai.models.EmbeddingsClientConfig or None
    :ivar text_source: Which text field to embed (``segment_text`` or ``llm_summary``).
        The allowed values are only checked when enabled.
    :vartype text_source: str
    """

    enabled: bool = Field(default=False)
    client: Optional[EmbeddingsClientConfig] = None
    text_source: str = Field(default="segment_text", min_length=1)

    @model_validator(mode="after")
    def _validate_requirements(self) -> "MarkovAnalysisEmbeddingsConfig":
        """
        Require a client and a recognised text source once embeddings are enabled.

        :return: The validated model instance.
        :rtype: MarkovAnalysisEmbeddingsConfig
        :raises ValueError: If enabled without a client or with an unknown text source.
        """
        if self.enabled:
            if self.client is None:
                raise ValueError("embeddings.client is required when embeddings.enabled is true")
            allowed_sources = ("segment_text", "llm_summary")
            if self.text_source not in allowed_sources:
                raise ValueError("embeddings.text_source must be 'segment_text' or 'llm_summary'")
        return self
|
|
1044
|
+
|
|
1045
|
+
|
|
1046
|
+
class MarkovAnalysisTopicModelingConfig(AnalysisSchemaModel):
    """
    Settings for running topic modeling over Markov analysis segments.

    :ivar enabled: Whether to run topic modeling on segments.
    :vartype enabled: bool
    :ivar recipe: Topic modeling recipe applied to segments; required when enabled.
    :vartype recipe: TopicModelingRecipeConfig or None
    """

    enabled: bool = Field(default=False)
    recipe: Optional["TopicModelingRecipeConfig"] = None

    @model_validator(mode="after")
    def _validate_requirements(self) -> "MarkovAnalysisTopicModelingConfig":
        """
        Require a recipe when enabled and restrict its LLM extraction method.

        :return: The validated model instance.
        :rtype: MarkovAnalysisTopicModelingConfig
        :raises ValueError: If enabled without a recipe, or if the recipe enables LLM
            extraction with a method other than ``single``.
        """
        if not self.enabled:
            return self

        recipe = self.recipe
        if recipe is None:
            raise ValueError(
                "topic_modeling.recipe is required when topic_modeling.enabled is true"
            )

        extraction = recipe.llm_extraction
        uses_unsupported_method = (
            extraction.enabled and extraction.method != TopicModelingLlmExtractionMethod.SINGLE
        )
        if uses_unsupported_method:
            raise ValueError(
                "topic_modeling.recipe.llm_extraction.method must be 'single' for Markov topic modeling"
            )
        return self
|
|
1074
|
+
|
|
1075
|
+
|
|
1076
|
+
class MarkovAnalysisObservationsEncoder(str, Enum):
    """
    Identifiers for the available observation encoders.

    Each member is also a ``str``, so it compares equal to its raw string value.
    """

    TFIDF = "tfidf"
    EMBEDDING = "embedding"
    HYBRID = "hybrid"
|
|
1084
|
+
|
|
1085
|
+
|
|
1086
|
+
class MarkovAnalysisTfidfObservationConfig(AnalysisSchemaModel):
    """
    TF-IDF encoder configuration for local observations.

    :ivar max_features: Maximum vocabulary size (at least 1).
    :vartype max_features: int
    :ivar ngram_range: Inclusive n-gram range as ``[minimum, maximum]``.
    :vartype ngram_range: list[int]
    """

    max_features: int = Field(default=2000, ge=1)
    ngram_range: List[int] = Field(default_factory=lambda: [1, 2])

    @field_validator("ngram_range", mode="before")
    @classmethod
    def _validate_ngram_range(cls, value: object) -> object:
        """
        Check the raw ``ngram_range`` input before field validation runs.

        A two-item tuple is accepted as well as a list and is normalised to a list.

        :param value: Raw input supplied for ``ngram_range``.
        :type value: object
        :return: ``None`` unchanged, otherwise the range as a two-item list.
        :rtype: object
        :raises ValueError: If the input is not two integers forming a valid inclusive range.
        """
        if value is None:
            # Left for the field's own type validation to handle.
            return value
        if not isinstance(value, (list, tuple)) or len(value) != 2:
            raise ValueError("tfidf.ngram_range must be a list of two integers")
        # bool is a subclass of int, so it has to be excluded explicitly.
        if any(isinstance(item, bool) or not isinstance(item, int) for item in value):
            raise ValueError("tfidf.ngram_range must be a list of two integers")
        lower, upper = value
        if lower < 1 or upper < lower:
            raise ValueError("tfidf.ngram_range must be a valid inclusive range")
        return [lower, upper]
|
|
1111
|
+
|
|
1112
|
+
|
|
1113
|
+
class MarkovAnalysisObservationsConfig(AnalysisSchemaModel):
    """
    Describes how segments are turned into observations for Markov analysis.

    :ivar encoder: Observation encoder identifier.
    :vartype encoder: MarkovAnalysisObservationsEncoder
    :ivar tfidf: TF-IDF encoder settings.
    :vartype tfidf: MarkovAnalysisTfidfObservationConfig
    :ivar text_source: Which text field to encode for ``tfidf`` (``segment_text`` or ``llm_summary``).
    :vartype text_source: str
    :ivar categorical_source: Which field provides categorical labels for hybrid/categorical use.
    :vartype categorical_source: str
    :ivar numeric_source: Which field provides a numeric scalar feature for hybrid use.
    :vartype numeric_source: str
    """

    encoder: MarkovAnalysisObservationsEncoder = Field(
        default=MarkovAnalysisObservationsEncoder.TFIDF
    )
    tfidf: MarkovAnalysisTfidfObservationConfig = Field(
        default_factory=MarkovAnalysisTfidfObservationConfig
    )
    text_source: str = Field(default="segment_text", min_length=1)
    categorical_source: str = Field(default="llm_label", min_length=1)
    numeric_source: str = Field(default="llm_label_confidence", min_length=1)

    @field_validator("encoder", mode="before")
    @classmethod
    def _parse_encoder(cls, value: object) -> MarkovAnalysisObservationsEncoder:
        """
        Coerce the raw ``encoder`` input into an enum member.

        :param value: Raw input supplied for ``encoder``.
        :type value: object
        :return: The matching observation encoder.
        :rtype: MarkovAnalysisObservationsEncoder
        :raises ValueError: If the input is not a string or does not name a known encoder.
        """
        if isinstance(value, MarkovAnalysisObservationsEncoder):
            return value
        if not isinstance(value, str):
            raise ValueError(
                "observations.encoder must be a string or MarkovAnalysisObservationsEncoder"
            )
        return MarkovAnalysisObservationsEncoder(value)

    @model_validator(mode="after")
    def _validate_sources(self) -> "MarkovAnalysisObservationsConfig":
        """
        Restrict ``text_source`` to the supported field names.

        :return: The validated model instance.
        :rtype: MarkovAnalysisObservationsConfig
        :raises ValueError: If ``text_source`` is not a supported value.
        """
        allowed_sources = ("segment_text", "llm_summary")
        if self.text_source in allowed_sources:
            return self
        raise ValueError("observations.text_source must be 'segment_text' or 'llm_summary'")
|
|
1155
|
+
|
|
1156
|
+
|
|
1157
|
+
class MarkovAnalysisModelFamily(str, Enum):
    """
    Identifiers for the supported Markov model families.

    Each member is also a ``str``, so it compares equal to its raw string value.
    """

    GAUSSIAN = "gaussian"
    CATEGORICAL = "categorical"
|
|
1164
|
+
|
|
1165
|
+
|
|
1166
|
+
class MarkovAnalysisModelConfig(AnalysisSchemaModel):
    """
    Settings for the Markov model itself.

    :ivar family: Model family identifier.
    :vartype family: MarkovAnalysisModelFamily
    :ivar n_states: Number of hidden states to learn (at least 1).
    :vartype n_states: int
    """

    family: MarkovAnalysisModelFamily = Field(default=MarkovAnalysisModelFamily.GAUSSIAN)
    n_states: int = Field(default=8, ge=1)

    @field_validator("family", mode="before")
    @classmethod
    def _parse_family(cls, value: object) -> MarkovAnalysisModelFamily:
        """
        Coerce the raw ``family`` input into an enum member.

        :param value: Raw input supplied for ``family``.
        :type value: object
        :return: The matching model family.
        :rtype: MarkovAnalysisModelFamily
        :raises ValueError: If the input is not a string or does not name a known family.
        """
        if isinstance(value, MarkovAnalysisModelFamily):
            return value
        if not isinstance(value, str):
            raise ValueError("model.family must be a string or MarkovAnalysisModelFamily")
        return MarkovAnalysisModelFamily(value)
|
|
1187
|
+
|
|
1188
|
+
|
|
1189
|
+
class MarkovAnalysisArtifactsGraphVizConfig(AnalysisSchemaModel):
    """
    Settings for the GraphViz export.

    :ivar enabled: Whether to write GraphViz transitions output.
    :vartype enabled: bool
    :ivar rankdir: GraphViz rank direction (e.g., LR or TB); any non-empty string is accepted here.
    :vartype rankdir: str
    :ivar min_edge_weight: Minimum edge weight to include in GraphViz output (zero or more).
    :vartype min_edge_weight: float
    :ivar start_state_id: Optional state id to pin at the start of the layout.
    :vartype start_state_id: int or None
    :ivar end_state_id: Optional state id to pin at the end of the layout.
    :vartype end_state_id: int or None
    """

    enabled: bool = Field(default=False)
    rankdir: str = Field(default="LR", min_length=1)
    min_edge_weight: float = Field(default=0.0, ge=0.0)
    start_state_id: Optional[int] = None
    end_state_id: Optional[int] = None
|
|
1210
|
+
|
|
1211
|
+
|
|
1212
|
+
class MarkovAnalysisArtifactsConfig(AnalysisSchemaModel):
    """
    Groups the artifact export settings for Markov analysis.

    :ivar graphviz: GraphViz export settings; defaults to a fresh default configuration.
    :vartype graphviz: MarkovAnalysisArtifactsGraphVizConfig
    """

    graphviz: MarkovAnalysisArtifactsGraphVizConfig = Field(
        default_factory=MarkovAnalysisArtifactsGraphVizConfig
    )
|
|
1223
|
+
|
|
1224
|
+
|
|
1225
|
+
class MarkovAnalysisReportConfig(AnalysisSchemaModel):
    """
    Settings that shape the Markov analysis report.

    :ivar max_state_exemplars: Maximum exemplar segments stored per state (zero or more).
    :vartype max_state_exemplars: int
    :ivar state_naming: Optional provider-backed state naming configuration.
    :vartype state_naming: MarkovAnalysisStateNamingConfig or None
    """

    max_state_exemplars: int = Field(default=5, ge=0)
    state_naming: Optional["MarkovAnalysisStateNamingConfig"] = None
|
|
1237
|
+
|
|
1238
|
+
|
|
1239
|
+
class MarkovAnalysisStateNamingConfig(AnalysisSchemaModel):
    """
    Settings for naming Markov states with an LLM provider.

    :ivar enabled: Whether state naming is enabled.
    :vartype enabled: bool
    :ivar client: LLM client configuration; required when enabled.
    :vartype client: biblicus.ai.models.LlmClientConfig or None
    :ivar system_prompt: System prompt; when enabled it must be non-blank and contain
        ``{context_pack}``.
    :vartype system_prompt: str or None
    :ivar prompt_template: User prompt template for naming; when enabled it must be non-blank
        and must not contain ``{context_pack}``.
    :vartype prompt_template: str or None
    :ivar token_budget: Maximum tokens for the context pack text (at least 1).
    :vartype token_budget: int
    :ivar max_exemplars_per_state: Maximum exemplars per state in the context pack (at least 1).
    :vartype max_exemplars_per_state: int
    :ivar max_name_words: Maximum words allowed in each state name (short noun phrase).
    :vartype max_name_words: int
    :ivar max_retries: Maximum retries when the naming response is invalid (zero or more).
    :vartype max_retries: int
    """

    enabled: bool = False
    client: Optional[LlmClientConfig] = None
    system_prompt: Optional[str] = None
    prompt_template: Optional[str] = None
    token_budget: int = Field(default=256, ge=1)
    max_exemplars_per_state: int = Field(default=3, ge=1)
    max_name_words: int = Field(default=4, ge=1)
    max_retries: int = Field(default=1, ge=0)

    @model_validator(mode="after")
    def _validate_state_naming(self) -> "MarkovAnalysisStateNamingConfig":
        """
        Check the client and both prompts once state naming is enabled.

        :return: The validated model instance.
        :rtype: MarkovAnalysisStateNamingConfig
        :raises ValueError: If enabled without a client, with a blank prompt, or with the
            ``{context_pack}`` placeholder missing or misplaced.
        """
        if not self.enabled:
            return self

        placeholder = "{context_pack}"
        if self.client is None:
            raise ValueError("report.state_naming.client is required when enabled")

        system_prompt = self.system_prompt
        if system_prompt is None or not str(system_prompt).strip():
            raise ValueError("report.state_naming.system_prompt is required when enabled")
        if placeholder not in system_prompt:
            raise ValueError(
                'report.state_naming.system_prompt must include the "{context_pack}" placeholder'
            )

        user_prompt = self.prompt_template
        if user_prompt is None or not str(user_prompt).strip():
            raise ValueError("report.state_naming.prompt_template is required when enabled")
        if placeholder in user_prompt:
            raise ValueError(
                'report.state_naming.prompt_template must not include "{context_pack}"'
            )
        return self
|
|
1289
|
+
|
|
1290
|
+
|
|
1291
|
+
class MarkovAnalysisRecipeConfig(AnalysisSchemaModel):
    """
    Top-level recipe for a Markov analysis run.

    Every section defaults to that section's own default configuration.

    :ivar schema_version: Analysis schema version; must equal ``ANALYSIS_SCHEMA_VERSION``.
    :vartype schema_version: int
    :ivar text_source: Text source configuration.
    :vartype text_source: MarkovAnalysisTextSourceConfig
    :ivar segmentation: Segmentation configuration.
    :vartype segmentation: MarkovAnalysisSegmentationConfig
    :ivar observations: Observation encoder configuration.
    :vartype observations: MarkovAnalysisObservationsConfig
    :ivar model: Markov model configuration.
    :vartype model: MarkovAnalysisModelConfig
    :ivar topic_modeling: Topic modeling configuration.
    :vartype topic_modeling: MarkovAnalysisTopicModelingConfig
    :ivar llm_observations: Provider-backed observation extraction configuration.
    :vartype llm_observations: MarkovAnalysisLlmObservationsConfig
    :ivar embeddings: Provider-backed embeddings configuration.
    :vartype embeddings: MarkovAnalysisEmbeddingsConfig
    :ivar artifacts: Artifact configuration.
    :vartype artifacts: MarkovAnalysisArtifactsConfig
    :ivar report: Report configuration.
    :vartype report: MarkovAnalysisReportConfig
    """

    schema_version: int = Field(default=ANALYSIS_SCHEMA_VERSION, ge=1)
    text_source: MarkovAnalysisTextSourceConfig = Field(
        default_factory=MarkovAnalysisTextSourceConfig
    )
    segmentation: MarkovAnalysisSegmentationConfig = Field(
        default_factory=MarkovAnalysisSegmentationConfig
    )
    observations: MarkovAnalysisObservationsConfig = Field(
        default_factory=MarkovAnalysisObservationsConfig
    )
    model: MarkovAnalysisModelConfig = Field(default_factory=MarkovAnalysisModelConfig)
    topic_modeling: MarkovAnalysisTopicModelingConfig = Field(
        default_factory=MarkovAnalysisTopicModelingConfig
    )
    llm_observations: MarkovAnalysisLlmObservationsConfig = Field(
        default_factory=MarkovAnalysisLlmObservationsConfig
    )
    embeddings: MarkovAnalysisEmbeddingsConfig = Field(
        default_factory=MarkovAnalysisEmbeddingsConfig
    )
    artifacts: MarkovAnalysisArtifactsConfig = Field(default_factory=MarkovAnalysisArtifactsConfig)
    report: MarkovAnalysisReportConfig = Field(default_factory=MarkovAnalysisReportConfig)

    @model_validator(mode="after")
    def _validate_schema_version(self) -> "MarkovAnalysisRecipeConfig":
        """
        Reject recipes written for a different analysis schema version.

        :return: The validated model instance.
        :rtype: MarkovAnalysisRecipeConfig
        :raises ValueError: If ``schema_version`` differs from ``ANALYSIS_SCHEMA_VERSION``.
        """
        if self.schema_version == ANALYSIS_SCHEMA_VERSION:
            return self
        raise ValueError(f"Unsupported analysis schema version: {self.schema_version}")
|
|
1341
|
+
|
|
1342
|
+
|
|
1343
|
+
class MarkovAnalysisTextCollectionReport(AnalysisSchemaModel):
    """
    Results of the text collection stage of Markov analysis.

    All counts are constrained to be zero or more.

    :ivar status: Stage status.
    :vartype status: MarkovAnalysisStageStatus
    :ivar source_items: Count of items in the extraction run.
    :vartype source_items: int
    :ivar documents: Count of documents included.
    :vartype documents: int
    :ivar sample_size: Sample size applied, if any.
    :vartype sample_size: int or None
    :ivar min_text_characters: Minimum length filter applied, if any.
    :vartype min_text_characters: int or None
    :ivar empty_texts: Count of empty extracted texts.
    :vartype empty_texts: int
    :ivar skipped_items: Count of items skipped for missing/invalid text.
    :vartype skipped_items: int
    :ivar warnings: Warning messages; defaults to an empty list.
    :vartype warnings: list[str]
    :ivar errors: Error messages; defaults to an empty list.
    :vartype errors: list[str]
    """

    status: MarkovAnalysisStageStatus
    source_items: int = Field(ge=0)
    documents: int = Field(ge=0)
    sample_size: Optional[int] = None
    min_text_characters: Optional[int] = None
    empty_texts: int = Field(ge=0)
    skipped_items: int = Field(ge=0)
    warnings: List[str] = Field(default_factory=list)
    errors: List[str] = Field(default_factory=list)
|
|
1376
|
+
|
|
1377
|
+
|
|
1378
|
+
class MarkovAnalysisSegment(AnalysisSchemaModel):
    """
    A single text segment produced for Markov analysis.

    :ivar item_id: Source item identifier (non-empty).
    :vartype item_id: str
    :ivar segment_index: One-based segment index within the item.
    :vartype segment_index: int
    :ivar text: Segment text (non-empty).
    :vartype text: str
    """

    item_id: str = Field(min_length=1)
    segment_index: int = Field(ge=1)
    text: str = Field(min_length=1)
|
|
1393
|
+
|
|
1394
|
+
|
|
1395
|
+
class MarkovAnalysisObservation(AnalysisSchemaModel):
    """
    Observation attached to one segment of a source item.

    The identifying fields are required; every enrichment field is optional
    and defaults to ``None``.

    :ivar item_id: Identifier of the source item; must be a non-empty string.
    :vartype item_id: str
    :ivar segment_index: Position of the segment within its item, counted from one.
    :vartype segment_index: int
    :ivar segment_text: Text of the segment; must be a non-empty string.
    :vartype segment_text: str
    :ivar llm_label: Label proposed by the provider, when available.
    :vartype llm_label: str or None
    :ivar llm_label_confidence: Confidence proposed by the provider, when
        available; validated to lie between 0.0 and 1.0 inclusive.
    :vartype llm_label_confidence: float or None
    :ivar llm_summary: Summary proposed by the provider, when available.
    :vartype llm_summary: str or None
    :ivar topic_id: Topic identifier produced by topic modeling, when available.
    :vartype topic_id: int or None
    :ivar topic_label: Topic label produced by topic modeling, when available.
    :vartype topic_label: str or None
    :ivar embedding: Embedding vector for the configured embedding text source,
        when available.
    :vartype embedding: list[float] or None
    """

    # Required identification of the observed segment.
    item_id: str = Field(..., min_length=1)
    segment_index: int = Field(..., ge=1)
    segment_text: str = Field(..., min_length=1)

    # Optional provider-proposed annotations.
    llm_label: Optional[str] = Field(default=None)
    llm_label_confidence: Optional[float] = Field(default=None, ge=0.0, le=1.0)
    llm_summary: Optional[str] = Field(default=None)

    # Optional topic-modeling and embedding data.
    topic_id: Optional[int] = Field(default=None)
    topic_label: Optional[str] = Field(default=None)
    embedding: Optional[List[float]] = Field(default=None)
|
|
1428
|
+
|
|
1429
|
+
|
|
1430
|
+
class MarkovAnalysisState(AnalysisSchemaModel):
    """
    Description of a single state in a Markov analysis.

    :ivar state_id: Identifier of the state; must be zero or greater.
    :vartype state_id: int
    :ivar label: Human-readable label for the state, if one is set.
    :vartype label: str or None
    :ivar exemplars: Example segments that are representative of the state;
        defaults to an empty list.
    :vartype exemplars: list[str]
    """

    state_id: int = Field(..., ge=0)
    label: Optional[str] = Field(default=None)
    # default_factory gives each instance its own list.
    exemplars: List[str] = Field(default_factory=list)
|
|
1445
|
+
|
|
1446
|
+
|
|
1447
|
+
class MarkovAnalysisTransition(AnalysisSchemaModel):
    """
    A directed edge from one state to another.

    :ivar from_state: Identifier of the state the edge leaves; zero or greater.
    :vartype from_state: int
    :ivar to_state: Identifier of the state the edge enters; zero or greater.
    :vartype to_state: int
    :ivar weight: Weight of the transition. No range constraint is applied here.
    :vartype weight: float
    """

    from_state: int = Field(..., ge=0)
    to_state: int = Field(..., ge=0)
    weight: float
|
|
1462
|
+
|
|
1463
|
+
|
|
1464
|
+
class MarkovAnalysisDecodedPath(AnalysisSchemaModel):
    """
    The decoded sequence of states for one source item.

    :ivar item_id: Identifier of the source item; must be a non-empty string.
    :vartype item_id: str
    :ivar state_sequence: Most likely sequence of states over the item's
        segments; defaults to an empty list.
    :vartype state_sequence: list[int]
    """

    item_id: str = Field(..., min_length=1)
    # default_factory gives each instance its own list.
    state_sequence: List[int] = Field(default_factory=list)
|
|
1476
|
+
|
|
1477
|
+
|
|
1478
|
+
class MarkovAnalysisReport(AnalysisSchemaModel):
    """
    Report payload produced by a Markov analysis.

    ``text_collection`` and ``status`` are required. Every list field defaults
    to an empty list and ``topic_modeling`` defaults to ``None``.

    :ivar text_collection: Report for the text collection stage.
    :vartype text_collection: MarkovAnalysisTextCollectionReport
    :ivar status: Status of the analysis as a whole.
    :vartype status: MarkovAnalysisStageStatus
    :ivar states: Records describing the states.
    :vartype states: list[MarkovAnalysisState]
    :ivar transitions: Edges between states.
    :vartype transitions: list[MarkovAnalysisTransition]
    :ivar decoded_paths: Decoded paths, one per item.
    :vartype decoded_paths: list[MarkovAnalysisDecodedPath]
    :ivar topic_modeling: Topic modeling report for segment topics, if present.
    :vartype topic_modeling: TopicModelingReport or None
    :ivar warnings: Warning messages.
    :vartype warnings: list[str]
    :ivar errors: Error messages.
    :vartype errors: list[str]
    """

    # Required stage report and overall status.
    text_collection: MarkovAnalysisTextCollectionReport
    status: MarkovAnalysisStageStatus

    # Model structure and per-item results; each defaults to a fresh empty list.
    states: List[MarkovAnalysisState] = Field(default_factory=list)
    transitions: List[MarkovAnalysisTransition] = Field(default_factory=list)
    decoded_paths: List[MarkovAnalysisDecodedPath] = Field(default_factory=list)

    topic_modeling: Optional[TopicModelingReport] = Field(default=None)

    # Diagnostics.
    warnings: List[str] = Field(default_factory=list)
    errors: List[str] = Field(default_factory=list)
|
|
1508
|
+
|
|
1509
|
+
|
|
1510
|
+
class MarkovAnalysisOutput(AnalysisSchemaModel):
    """
    Top-level bundle written as the output of a Markov analysis.

    :ivar schema_version: Version of the analysis schema; at least 1, defaulting
        to ``ANALYSIS_SCHEMA_VERSION``.
    :vartype schema_version: int
    :ivar analysis_id: Identifier of the analysis backend.
    :vartype analysis_id: str
    :ivar generated_at: ISO 8601 timestamp recording when the output was created.
    :vartype generated_at: str
    :ivar run: Manifest of the analysis run.
    :vartype run: AnalysisRunManifest
    :ivar report: Report data of the Markov analysis.
    :vartype report: MarkovAnalysisReport
    """

    schema_version: int = Field(default=ANALYSIS_SCHEMA_VERSION, ge=1)

    # The remaining fields are required and carry no extra constraints.
    analysis_id: str
    generated_at: str
    run: AnalysisRunManifest
    report: MarkovAnalysisReport
|