biblicus 0.6.0__py3-none-any.whl → 0.8.0__py3-none-any.whl

@@ -0,0 +1,512 @@
+ """
+ Pydantic models for analysis pipelines.
+ """
+
+ from __future__ import annotations
+
+ from enum import Enum
+ from typing import Any, Dict, List, Optional
+
+ from pydantic import Field, field_validator, model_validator
+
+ from ..constants import ANALYSIS_SCHEMA_VERSION
+ from ..models import ExtractionRunReference
+ from .llm import LlmClientConfig
+ from .schema import AnalysisSchemaModel
+
+
+ class AnalysisRecipeManifest(AnalysisSchemaModel):
+     """
+     Reproducible configuration for an analysis pipeline.
+
+     :ivar recipe_id: Deterministic recipe identifier.
+     :vartype recipe_id: str
+     :ivar analysis_id: Analysis backend identifier.
+     :vartype analysis_id: str
+     :ivar name: Human-readable recipe name.
+     :vartype name: str
+     :ivar created_at: ISO 8601 timestamp for recipe creation.
+     :vartype created_at: str
+     :ivar config: Analysis-specific configuration values.
+     :vartype config: dict[str, Any]
+     :ivar description: Optional human description.
+     :vartype description: str or None
+     """
+
+     recipe_id: str
+     analysis_id: str
+     name: str
+     created_at: str
+     config: Dict[str, Any] = Field(default_factory=dict)
+     description: Optional[str] = None
+
+
+ class AnalysisRunInput(AnalysisSchemaModel):
+     """
+     Inputs required to execute an analysis run.
+
+     :ivar extraction_run: Extraction run reference for analysis inputs.
+     :vartype extraction_run: biblicus.models.ExtractionRunReference
+     """
+
+     extraction_run: ExtractionRunReference
+
+
+ class AnalysisRunManifest(AnalysisSchemaModel):
+     """
+     Immutable record of an analysis run.
+
+     :ivar run_id: Unique run identifier.
+     :vartype run_id: str
+     :ivar recipe: Recipe manifest for this run.
+     :vartype recipe: AnalysisRecipeManifest
+     :ivar corpus_uri: Canonical URI for the corpus root.
+     :vartype corpus_uri: str
+     :ivar catalog_generated_at: Catalog timestamp used for the run.
+     :vartype catalog_generated_at: str
+     :ivar created_at: ISO 8601 timestamp for run creation.
+     :vartype created_at: str
+     :ivar input: Inputs used for this analysis run.
+     :vartype input: AnalysisRunInput
+     :ivar artifact_paths: Relative paths to materialized artifacts.
+     :vartype artifact_paths: list[str]
+     :ivar stats: Analysis-specific run statistics.
+     :vartype stats: dict[str, Any]
+     """
+
+     run_id: str
+     recipe: AnalysisRecipeManifest
+     corpus_uri: str
+     catalog_generated_at: str
+     created_at: str
+     input: AnalysisRunInput
+     artifact_paths: List[str] = Field(default_factory=list)
+     stats: Dict[str, Any] = Field(default_factory=dict)
+
+
+ class TopicModelingTextSourceConfig(AnalysisSchemaModel):
+     """
+     Configuration for text collection within topic modeling.
+
+     :ivar sample_size: Optional sample size for text collection.
+     :vartype sample_size: int or None
+     :ivar min_text_characters: Optional minimum character count for text inclusion.
+     :vartype min_text_characters: int or None
+     """
+
+     sample_size: Optional[int] = Field(default=None, ge=1)
+     min_text_characters: Optional[int] = Field(default=None, ge=1)
+
+
+ class TopicModelingLlmExtractionMethod(str, Enum):
+     """
+     LLM extraction method identifiers.
+     """
+
+     SINGLE = "single"
+     ITEMIZE = "itemize"
+
+
+ class TopicModelingLlmExtractionConfig(AnalysisSchemaModel):
+     """
+     Configuration for LLM-based extraction within topic modeling.
+
+     :ivar enabled: Whether LLM extraction is enabled.
+     :vartype enabled: bool
+     :ivar method: Extraction method, single or itemize.
+     :vartype method: TopicModelingLlmExtractionMethod
+     :ivar client: LLM client configuration.
+     :vartype client: LlmClientConfig or None
+     :ivar prompt_template: Prompt template containing the {text} placeholder.
+     :vartype prompt_template: str or None
+     :ivar system_prompt: Optional system prompt.
+     :vartype system_prompt: str or None
+     """
+
+     enabled: bool = Field(default=False)
+     method: TopicModelingLlmExtractionMethod = Field(default=TopicModelingLlmExtractionMethod.SINGLE)
+     client: Optional[LlmClientConfig] = None
+     prompt_template: Optional[str] = None
+     system_prompt: Optional[str] = None
+
+     @field_validator("method", mode="before")
+     @classmethod
+     def _parse_method(cls, value: object) -> TopicModelingLlmExtractionMethod:
+         if isinstance(value, TopicModelingLlmExtractionMethod):
+             return value
+         if isinstance(value, str):
+             return TopicModelingLlmExtractionMethod(value)
+         raise ValueError("llm_extraction.method must be a string or TopicModelingLlmExtractionMethod")
+
+     @model_validator(mode="after")
+     def _validate_requirements(self) -> "TopicModelingLlmExtractionConfig":
+         if not self.enabled:
+             return self
+         if self.client is None:
+             raise ValueError("llm_extraction.client is required when enabled")
+         if self.prompt_template is None:
+             raise ValueError("llm_extraction.prompt_template is required when enabled")
+         if "{text}" not in self.prompt_template:
+             raise ValueError("llm_extraction.prompt_template must include {text}")
+         return self
+
+
+ class TopicModelingLexicalProcessingConfig(AnalysisSchemaModel):
+     """
+     Configuration for lexical processing within topic modeling.
+
+     :ivar enabled: Whether lexical processing is enabled.
+     :vartype enabled: bool
+     :ivar lowercase: Whether to lowercase text.
+     :vartype lowercase: bool
+     :ivar strip_punctuation: Whether to remove punctuation.
+     :vartype strip_punctuation: bool
+     :ivar collapse_whitespace: Whether to normalize whitespace.
+     :vartype collapse_whitespace: bool
+     """
+
+     enabled: bool = Field(default=False)
+     lowercase: bool = Field(default=True)
+     strip_punctuation: bool = Field(default=False)
+     collapse_whitespace: bool = Field(default=True)
+
+
+ class TopicModelingBerTopicConfig(AnalysisSchemaModel):
+     """
+     Configuration for BERTopic analysis.
+
+     :ivar parameters: Parameters forwarded to the BERTopic constructor.
+     :vartype parameters: dict[str, Any]
+     """
+
+     parameters: Dict[str, Any] = Field(default_factory=dict)
+
+
+ class TopicModelingLlmFineTuningConfig(AnalysisSchemaModel):
+     """
+     Configuration for LLM-based topic labeling.
+
+     :ivar enabled: Whether LLM topic labeling is enabled.
+     :vartype enabled: bool
+     :ivar client: LLM client configuration.
+     :vartype client: LlmClientConfig or None
+     :ivar prompt_template: Prompt template containing {keywords} and {documents} placeholders.
+     :vartype prompt_template: str or None
+     :ivar system_prompt: Optional system prompt.
+     :vartype system_prompt: str or None
+     :ivar max_keywords: Maximum number of keywords to include in prompts.
+     :vartype max_keywords: int
+     :ivar max_documents: Maximum number of documents to include in prompts.
+     :vartype max_documents: int
+     """
+
+     enabled: bool = Field(default=False)
+     client: Optional[LlmClientConfig] = None
+     prompt_template: Optional[str] = None
+     system_prompt: Optional[str] = None
+     max_keywords: int = Field(default=8, ge=1)
+     max_documents: int = Field(default=5, ge=1)
+
+     @model_validator(mode="after")
+     def _validate_requirements(self) -> "TopicModelingLlmFineTuningConfig":
+         if not self.enabled:
+             return self
+         if self.client is None:
+             raise ValueError("llm_fine_tuning.client is required when enabled")
+         if self.prompt_template is None:
+             raise ValueError("llm_fine_tuning.prompt_template is required when enabled")
+         if "{keywords}" not in self.prompt_template or "{documents}" not in self.prompt_template:
+             raise ValueError(
+                 "llm_fine_tuning.prompt_template must include {keywords} and {documents}"
+             )
+         return self
+
+
+ class TopicModelingRecipeConfig(AnalysisSchemaModel):
+     """
+     Recipe configuration for topic modeling analysis.
+
+     :ivar schema_version: Analysis schema version.
+     :vartype schema_version: int
+     :ivar text_source: Text collection configuration.
+     :vartype text_source: TopicModelingTextSourceConfig
+     :ivar llm_extraction: LLM extraction configuration.
+     :vartype llm_extraction: TopicModelingLlmExtractionConfig
+     :ivar lexical_processing: Lexical processing configuration.
+     :vartype lexical_processing: TopicModelingLexicalProcessingConfig
+     :ivar bertopic_analysis: BERTopic configuration.
+     :vartype bertopic_analysis: TopicModelingBerTopicConfig
+     :ivar llm_fine_tuning: LLM fine-tuning configuration.
+     :vartype llm_fine_tuning: TopicModelingLlmFineTuningConfig
+     """
+
+     schema_version: int = Field(default=ANALYSIS_SCHEMA_VERSION, ge=1)
+     text_source: TopicModelingTextSourceConfig = Field(default_factory=TopicModelingTextSourceConfig)
+     llm_extraction: TopicModelingLlmExtractionConfig = Field(
+         default_factory=TopicModelingLlmExtractionConfig
+     )
+     lexical_processing: TopicModelingLexicalProcessingConfig = Field(
+         default_factory=TopicModelingLexicalProcessingConfig
+     )
+     bertopic_analysis: TopicModelingBerTopicConfig = Field(
+         default_factory=TopicModelingBerTopicConfig
+     )
+     llm_fine_tuning: TopicModelingLlmFineTuningConfig = Field(
+         default_factory=TopicModelingLlmFineTuningConfig
+     )
+
+     @model_validator(mode="after")
+     def _validate_schema_version(self) -> "TopicModelingRecipeConfig":
+         if self.schema_version != ANALYSIS_SCHEMA_VERSION:
+             raise ValueError(f"Unsupported analysis schema version: {self.schema_version}")
+         return self
+
+
+ class TopicModelingStageStatus(str, Enum):
+     """
+     Stage status values for topic modeling.
+     """
+
+     COMPLETE = "complete"
+     SKIPPED = "skipped"
+     FAILED = "failed"
+
+
+ class TopicModelingTextCollectionReport(AnalysisSchemaModel):
+     """
+     Report for the text collection stage.
+
+     :ivar status: Stage status.
+     :vartype status: TopicModelingStageStatus
+     :ivar source_items: Count of source items inspected.
+     :vartype source_items: int
+     :ivar documents: Count of documents produced.
+     :vartype documents: int
+     :ivar sample_size: Optional sample size.
+     :vartype sample_size: int or None
+     :ivar min_text_characters: Optional minimum character threshold.
+     :vartype min_text_characters: int or None
+     :ivar empty_texts: Count of empty text inputs.
+     :vartype empty_texts: int
+     :ivar skipped_items: Count of skipped items.
+     :vartype skipped_items: int
+     :ivar warnings: Warning messages.
+     :vartype warnings: list[str]
+     :ivar errors: Error messages.
+     :vartype errors: list[str]
+     """
+
+     status: TopicModelingStageStatus
+     source_items: int = Field(ge=0)
+     documents: int = Field(ge=0)
+     sample_size: Optional[int] = None
+     min_text_characters: Optional[int] = None
+     empty_texts: int = Field(ge=0)
+     skipped_items: int = Field(ge=0)
+     warnings: List[str] = Field(default_factory=list)
+     errors: List[str] = Field(default_factory=list)
+
+
+ class TopicModelingLlmExtractionReport(AnalysisSchemaModel):
+     """
+     Report for the LLM extraction stage.
+
+     :ivar status: Stage status.
+     :vartype status: TopicModelingStageStatus
+     :ivar method: Extraction method used.
+     :vartype method: TopicModelingLlmExtractionMethod
+     :ivar input_documents: Count of input documents.
+     :vartype input_documents: int
+     :ivar output_documents: Count of output documents.
+     :vartype output_documents: int
+     :ivar warnings: Warning messages.
+     :vartype warnings: list[str]
+     :ivar errors: Error messages.
+     :vartype errors: list[str]
+     """
+
+     status: TopicModelingStageStatus
+     method: TopicModelingLlmExtractionMethod
+     input_documents: int = Field(ge=0)
+     output_documents: int = Field(ge=0)
+     warnings: List[str] = Field(default_factory=list)
+     errors: List[str] = Field(default_factory=list)
+
+
+ class TopicModelingLexicalProcessingReport(AnalysisSchemaModel):
+     """
+     Report for the lexical processing stage.
+
+     :ivar status: Stage status.
+     :vartype status: TopicModelingStageStatus
+     :ivar input_documents: Count of input documents.
+     :vartype input_documents: int
+     :ivar output_documents: Count of output documents.
+     :vartype output_documents: int
+     :ivar lowercase: Whether lowercase normalization was applied.
+     :vartype lowercase: bool
+     :ivar strip_punctuation: Whether punctuation was removed.
+     :vartype strip_punctuation: bool
+     :ivar collapse_whitespace: Whether whitespace was normalized.
+     :vartype collapse_whitespace: bool
+     """
+
+     status: TopicModelingStageStatus
+     input_documents: int = Field(ge=0)
+     output_documents: int = Field(ge=0)
+     lowercase: bool
+     strip_punctuation: bool
+     collapse_whitespace: bool
+
+
+ class TopicModelingBerTopicReport(AnalysisSchemaModel):
+     """
+     Report for the BERTopic analysis stage.
+
+     :ivar status: Stage status.
+     :vartype status: TopicModelingStageStatus
+     :ivar topic_count: Count of topics discovered.
+     :vartype topic_count: int
+     :ivar document_count: Count of documents analyzed.
+     :vartype document_count: int
+     :ivar parameters: BERTopic configuration parameters.
+     :vartype parameters: dict[str, Any]
+     :ivar warnings: Warning messages.
+     :vartype warnings: list[str]
+     :ivar errors: Error messages.
+     :vartype errors: list[str]
+     """
+
+     status: TopicModelingStageStatus
+     topic_count: int = Field(ge=0)
+     document_count: int = Field(ge=0)
+     parameters: Dict[str, Any] = Field(default_factory=dict)
+     warnings: List[str] = Field(default_factory=list)
+     errors: List[str] = Field(default_factory=list)
+
+
+ class TopicModelingLlmFineTuningReport(AnalysisSchemaModel):
+     """
+     Report for the LLM fine-tuning stage.
+
+     :ivar status: Stage status.
+     :vartype status: TopicModelingStageStatus
+     :ivar topics_labeled: Count of topics labeled.
+     :vartype topics_labeled: int
+     :ivar warnings: Warning messages.
+     :vartype warnings: list[str]
+     :ivar errors: Error messages.
+     :vartype errors: list[str]
+     """
+
+     status: TopicModelingStageStatus
+     topics_labeled: int = Field(ge=0)
+     warnings: List[str] = Field(default_factory=list)
+     errors: List[str] = Field(default_factory=list)
+
+
+ class TopicModelingLabelSource(str, Enum):
+     """
+     Source identifiers for topic labels.
+     """
+
+     BERTOPIC = "bertopic"
+     LLM = "llm"
+
+
+ class TopicModelingKeyword(AnalysisSchemaModel):
+     """
+     Keyword entry for a topic.
+
+     :ivar keyword: Keyword or phrase.
+     :vartype keyword: str
+     :ivar score: Keyword relevance score.
+     :vartype score: float
+     """
+
+     keyword: str
+     score: float
+
+
+ class TopicModelingTopic(AnalysisSchemaModel):
+     """
+     Topic output record.
+
+     :ivar topic_id: Topic identifier.
+     :vartype topic_id: int
+     :ivar label: Human-readable topic label.
+     :vartype label: str
+     :ivar label_source: Source for the label.
+     :vartype label_source: TopicModelingLabelSource
+     :ivar keywords: Topic keywords with scores.
+     :vartype keywords: list[TopicModelingKeyword]
+     :ivar document_count: Number of documents in the topic.
+     :vartype document_count: int
+     :ivar document_examples: Example document texts.
+     :vartype document_examples: list[str]
+     :ivar document_ids: Document identifiers for the topic.
+     :vartype document_ids: list[str]
+     """
+
+     topic_id: int
+     label: str
+     label_source: TopicModelingLabelSource
+     keywords: List[TopicModelingKeyword] = Field(default_factory=list)
+     document_count: int = Field(ge=0)
+     document_examples: List[str] = Field(default_factory=list)
+     document_ids: List[str] = Field(default_factory=list)
+
+
+ class TopicModelingReport(AnalysisSchemaModel):
+     """
+     Report for topic modeling analysis.
+
+     :ivar text_collection: Text collection report.
+     :vartype text_collection: TopicModelingTextCollectionReport
+     :ivar llm_extraction: LLM extraction report.
+     :vartype llm_extraction: TopicModelingLlmExtractionReport
+     :ivar lexical_processing: Lexical processing report.
+     :vartype lexical_processing: TopicModelingLexicalProcessingReport
+     :ivar bertopic_analysis: BERTopic analysis report.
+     :vartype bertopic_analysis: TopicModelingBerTopicReport
+     :ivar llm_fine_tuning: LLM fine-tuning report.
+     :vartype llm_fine_tuning: TopicModelingLlmFineTuningReport
+     :ivar topics: Topic output list.
+     :vartype topics: list[TopicModelingTopic]
+     :ivar warnings: Warning messages.
+     :vartype warnings: list[str]
+     :ivar errors: Error messages.
+     :vartype errors: list[str]
+     """
+
+     text_collection: TopicModelingTextCollectionReport
+     llm_extraction: TopicModelingLlmExtractionReport
+     lexical_processing: TopicModelingLexicalProcessingReport
+     bertopic_analysis: TopicModelingBerTopicReport
+     llm_fine_tuning: TopicModelingLlmFineTuningReport
+     topics: List[TopicModelingTopic] = Field(default_factory=list)
+     warnings: List[str] = Field(default_factory=list)
+     errors: List[str] = Field(default_factory=list)
+
+
+ class TopicModelingOutput(AnalysisSchemaModel):
+     """
+     Output bundle for topic modeling analysis.
+
+     :ivar schema_version: Analysis schema version.
+     :vartype schema_version: int
+     :ivar analysis_id: Analysis backend identifier.
+     :vartype analysis_id: str
+     :ivar generated_at: ISO 8601 timestamp for output creation.
+     :vartype generated_at: str
+     :ivar run: Analysis run manifest.
+     :vartype run: AnalysisRunManifest
+     :ivar report: Topic modeling report data.
+     :vartype report: TopicModelingReport
+     """
+
+     schema_version: int = Field(default=ANALYSIS_SCHEMA_VERSION, ge=1)
+     analysis_id: str
+     generated_at: str
+     run: AnalysisRunManifest
+     report: TopicModelingReport
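
The recipe models above are mostly about validation: the nested llm_extraction and llm_fine_tuning configs only require a client and a prompt template once they are enabled, and the template must carry the expected placeholders. A minimal sketch of that behavior follows; the import path is assumed (the diff does not name the module file), and only fields shown in this diff are used.

    from pydantic import ValidationError

    # Assumed import path for the models added in this hunk.
    from biblicus.analysis.pipelines import (
        TopicModelingLlmExtractionConfig,
        TopicModelingRecipeConfig,
    )

    # Defaults validate: every optional stage is disabled and schema_version
    # already matches ANALYSIS_SCHEMA_VERSION.
    recipe = TopicModelingRecipeConfig()
    assert recipe.llm_extraction.enabled is False

    # Enabling extraction without a client or a {text} prompt template is
    # rejected by the model_validator on TopicModelingLlmExtractionConfig.
    try:
        TopicModelingLlmExtractionConfig(enabled=True)
    except ValidationError as exc:
        print(exc)  # reports that llm_extraction.client is required when enabled
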
@@ -0,0 +1,18 @@
+ """
+ Shared schema utilities for analysis models.
+ """
+
+ from __future__ import annotations
+
+ from pydantic import BaseModel, ConfigDict
+
+
+ class AnalysisSchemaModel(BaseModel):
+     """
+     Base model for analysis schemas with strict validation.
+
+     :ivar model_config: Pydantic configuration for strict schema enforcement.
+     :vartype model_config: pydantic.ConfigDict
+     """
+
+     model_config = ConfigDict(extra="forbid", strict=True)
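
Because every model in the first hunk subclasses AnalysisSchemaModel, the extra="forbid", strict=True configuration means unknown keys are rejected and types are not coerced. A small illustrative sketch, assuming the module path (the subclass name below is hypothetical, not from the package):

    from pydantic import ValidationError

    from biblicus.analysis.schema import AnalysisSchemaModel  # assumed path


    class ExampleModel(AnalysisSchemaModel):
        name: str


    ExampleModel(name="ok")  # validates

    try:
        ExampleModel(name="ok", unexpected=1)  # extra="forbid" rejects unknown keys
    except ValidationError as exc:
        print(exc)

    try:
        ExampleModel(name=123)  # strict=True refuses to coerce int to str
    except ValidationError as exc:
        print(exc)
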