biblicus-0.6.0-py3-none-any.whl → biblicus-0.8.0-py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
@@ -0,0 +1,561 @@
+"""
+Topic modeling analysis backend for Biblicus.
+"""
+
+from __future__ import annotations
+
+import json
+import re
+import string
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Dict, List, Tuple
+
+from pydantic import BaseModel
+
+from ..corpus import Corpus
+from ..models import ExtractionRunReference
+from ..retrieval import hash_text
+from ..time import utc_now_iso
+from .base import CorpusAnalysisBackend
+from .llm import generate_completion
+from .models import (
+    AnalysisRecipeManifest,
+    AnalysisRunInput,
+    AnalysisRunManifest,
+    TopicModelingBerTopicConfig,
+    TopicModelingBerTopicReport,
+    TopicModelingKeyword,
+    TopicModelingLabelSource,
+    TopicModelingLexicalProcessingConfig,
+    TopicModelingLexicalProcessingReport,
+    TopicModelingLlmExtractionConfig,
+    TopicModelingLlmExtractionMethod,
+    TopicModelingLlmExtractionReport,
+    TopicModelingLlmFineTuningConfig,
+    TopicModelingLlmFineTuningReport,
+    TopicModelingOutput,
+    TopicModelingRecipeConfig,
+    TopicModelingReport,
+    TopicModelingStageStatus,
+    TopicModelingTextCollectionReport,
+    TopicModelingTextSourceConfig,
+    TopicModelingTopic,
+)
+
+
+@dataclass
+class _TopicDocument:
+    document_id: str
+    source_item_id: str
+    text: str
+
+
+class TopicModelingBackend(CorpusAnalysisBackend):
+    """
+    Topic modeling analysis backend built on BERTopic.
+
+    :ivar analysis_id: Backend identifier.
+    :vartype analysis_id: str
+    """
+
+    analysis_id = "topic-modeling"
+
+    def run_analysis(
+        self,
+        corpus: Corpus,
+        *,
+        recipe_name: str,
+        config: Dict[str, object],
+        extraction_run: ExtractionRunReference,
+    ) -> BaseModel:
+        """
+        Run the topic modeling analysis pipeline.
+
+        :param corpus: Corpus to analyze.
+        :type corpus: Corpus
+        :param recipe_name: Human-readable recipe name.
+        :type recipe_name: str
+        :param config: Analysis configuration values.
+        :type config: dict[str, object]
+        :param extraction_run: Extraction run reference for text inputs.
+        :type extraction_run: biblicus.models.ExtractionRunReference
+        :return: Topic modeling output model.
+        :rtype: pydantic.BaseModel
+        """
+        parsed_config = (
+            config
+            if isinstance(config, TopicModelingRecipeConfig)
+            else TopicModelingRecipeConfig.model_validate(config)
+        )
+        return _run_topic_modeling(
+            corpus=corpus,
+            recipe_name=recipe_name,
+            config=parsed_config,
+            extraction_run=extraction_run,
+        )
+
+
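For orientation, a minimal invocation sketch follows. Only TopicModelingBackend, the run_analysis signature, and the config field names are taken from the code in this diff; the corpus and extraction_run objects are assumed to exist already, and whether every omitted config field has a default is not visible here.

    # Hedged usage sketch; corpus and extraction_run are assumed to be an
    # already-opened Corpus and a valid ExtractionRunReference.
    backend = TopicModelingBackend()
    output = backend.run_analysis(
        corpus,
        recipe_name="themes-v1",  # hypothetical recipe name
        config={
            "text_source": {"min_text_characters": 200, "sample_size": None},
            "llm_extraction": {"enabled": False},
            "lexical_processing": {
                "enabled": True,
                "lowercase": True,
                "strip_punctuation": True,
                "collapse_whitespace": True,
            },
            "bertopic_analysis": {"parameters": {"min_topic_size": 5}},
            "llm_fine_tuning": {"enabled": False},
        },
        extraction_run=extraction_run,
    )
    for topic in output.report.topics:
        print(topic.topic_id, topic.label)
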
+def _run_topic_modeling(
+    *,
+    corpus: Corpus,
+    recipe_name: str,
+    config: TopicModelingRecipeConfig,
+    extraction_run: ExtractionRunReference,
+) -> TopicModelingOutput:
+    recipe = _create_recipe_manifest(name=recipe_name, config=config)
+    catalog = corpus.load_catalog()
+    run_id = _analysis_run_id(
+        recipe_id=recipe.recipe_id,
+        extraction_run=extraction_run,
+        catalog_generated_at=catalog.generated_at,
+    )
+    run_manifest = AnalysisRunManifest(
+        run_id=run_id,
+        recipe=recipe,
+        corpus_uri=catalog.corpus_uri,
+        catalog_generated_at=catalog.generated_at,
+        created_at=utc_now_iso(),
+        input=AnalysisRunInput(extraction_run=extraction_run),
+        artifact_paths=[],
+        stats={},
+    )
+    run_dir = corpus.analysis_run_dir(analysis_id=TopicModelingBackend.analysis_id, run_id=run_id)
+    output_path = run_dir / "output.json"
+
+    run_dir.mkdir(parents=True, exist_ok=True)
+
+    documents, text_report = _collect_documents(
+        corpus=corpus,
+        extraction_run=extraction_run,
+        config=config.text_source,
+    )
+
+    llm_extraction_report, extracted_documents = _apply_llm_extraction(
+        documents=documents,
+        config=config.llm_extraction,
+    )
+
+    lexical_report, lexical_documents = _apply_lexical_processing(
+        documents=extracted_documents,
+        config=config.lexical_processing,
+    )
+
+    bertopic_report, topics = _run_bertopic(
+        documents=lexical_documents,
+        config=config.bertopic_analysis,
+    )
+
+    fine_tuning_report, labeled_topics = _apply_llm_fine_tuning(
+        topics=topics,
+        documents=lexical_documents,
+        config=config.llm_fine_tuning,
+    )
+
+    report = TopicModelingReport(
+        text_collection=text_report,
+        llm_extraction=llm_extraction_report,
+        lexical_processing=lexical_report,
+        bertopic_analysis=bertopic_report,
+        llm_fine_tuning=fine_tuning_report,
+        topics=labeled_topics,
+        warnings=(
+            text_report.warnings
+            + llm_extraction_report.warnings
+            + bertopic_report.warnings
+            + fine_tuning_report.warnings
+        ),
+        errors=text_report.errors
+        + llm_extraction_report.errors
+        + bertopic_report.errors
+        + fine_tuning_report.errors,
+    )
+
+    run_stats = {
+        "documents": bertopic_report.document_count,
+        "topics": bertopic_report.topic_count,
+    }
+    run_manifest = run_manifest.model_copy(
+        update={"artifact_paths": ["output.json"], "stats": run_stats}
+    )
+    _write_analysis_run_manifest(run_dir=run_dir, manifest=run_manifest)
+
+    output = TopicModelingOutput(
+        analysis_id=TopicModelingBackend.analysis_id,
+        generated_at=utc_now_iso(),
+        run=run_manifest,
+        report=report,
+    )
+    _write_topic_modeling_output(path=output_path, output=output)
+    return output
+
+
+def _create_recipe_manifest(
+    *, name: str, config: TopicModelingRecipeConfig
+) -> AnalysisRecipeManifest:
+    recipe_payload = json.dumps(
+        {
+            "analysis_id": TopicModelingBackend.analysis_id,
+            "name": name,
+            "config": config.model_dump(),
+        },
+        sort_keys=True,
+    )
+    recipe_id = hash_text(recipe_payload)
+    return AnalysisRecipeManifest(
+        recipe_id=recipe_id,
+        analysis_id=TopicModelingBackend.analysis_id,
+        name=name,
+        created_at=utc_now_iso(),
+        config=config.model_dump(),
+    )
+
+
+def _analysis_run_id(
+    *,
+    recipe_id: str,
+    extraction_run: ExtractionRunReference,
+    catalog_generated_at: str,
+) -> str:
+    run_seed = f"{recipe_id}:{extraction_run.as_string()}:{catalog_generated_at}"
+    return hash_text(run_seed)
+
+
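Both identifiers are content hashes, so a recipe re-run over unchanged inputs resolves to the same run directory. A self-contained sketch of that property, assuming for illustration that hash_text is a SHA-256 hex digest (the actual helper in biblicus.retrieval may differ):

    import hashlib

    def hash_text(text: str) -> str:
        # Stand-in for biblicus.retrieval.hash_text; the real digest may differ.
        return hashlib.sha256(text.encode("utf-8")).hexdigest()

    # recipe hash : extraction run reference : catalog timestamp (values hypothetical)
    seed_a = "9c1d2e:plain-text/run-0001:2024-05-01T12:00:00Z"
    seed_b = "9c1d2e:plain-text/run-0002:2024-05-01T12:00:00Z"
    assert hash_text(seed_a) == hash_text(seed_a)  # same inputs -> same run id
    assert hash_text(seed_a) != hash_text(seed_b)  # any changed input -> new run id
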
+def _collect_documents(
+    *,
+    corpus: Corpus,
+    extraction_run: ExtractionRunReference,
+    config: TopicModelingTextSourceConfig,
+) -> Tuple[List[_TopicDocument], TopicModelingTextCollectionReport]:
+    manifest = corpus.load_extraction_run_manifest(
+        extractor_id=extraction_run.extractor_id,
+        run_id=extraction_run.run_id,
+    )
+    warnings: List[str] = []
+    errors: List[str] = []
+    documents: List[_TopicDocument] = []
+    skipped_items = 0
+    empty_texts = 0
+
+    for item_result in manifest.items:
+        if item_result.status != "extracted" or item_result.final_text_relpath is None:
+            skipped_items += 1
+            continue
+        text_path = (
+            corpus.extraction_run_dir(
+                extractor_id=extraction_run.extractor_id,
+                run_id=extraction_run.run_id,
+            )
+            / item_result.final_text_relpath
+        )
+        text_value = text_path.read_text(encoding="utf-8").strip()
+        if not text_value:
+            empty_texts += 1
+            continue
+        if config.min_text_characters is not None and len(text_value) < config.min_text_characters:
+            skipped_items += 1
+            continue
+        documents.append(
+            _TopicDocument(
+                document_id=item_result.item_id,
+                source_item_id=item_result.item_id,
+                text=text_value,
+            )
+        )
+
+    if config.sample_size is not None and len(documents) > config.sample_size:
+        documents = documents[: config.sample_size]
+        warnings.append("Text collection truncated to sample_size")
+
+    report = TopicModelingTextCollectionReport(
+        status=TopicModelingStageStatus.COMPLETE,
+        source_items=len(manifest.items),
+        documents=len(documents),
+        sample_size=config.sample_size,
+        min_text_characters=config.min_text_characters,
+        empty_texts=empty_texts,
+        skipped_items=skipped_items,
+        warnings=warnings,
+        errors=errors,
+    )
+    if not documents:
+        report = report.model_copy(update={"status": TopicModelingStageStatus.FAILED})
+        raise ValueError("Topic modeling requires at least one extracted text document")
+    return documents, report
+
+
+def _apply_llm_extraction(
+    *,
+    documents: List[_TopicDocument],
+    config: TopicModelingLlmExtractionConfig,
+) -> Tuple[TopicModelingLlmExtractionReport, List[_TopicDocument]]:
+    if not config.enabled:
+        report = TopicModelingLlmExtractionReport(
+            status=TopicModelingStageStatus.SKIPPED,
+            method=config.method,
+            input_documents=len(documents),
+            output_documents=len(documents),
+            warnings=[],
+            errors=[],
+        )
+        return report, list(documents)
+
+    extracted_documents: List[_TopicDocument] = []
+    errors: List[str] = []
+
+    for document in documents:
+        prompt = config.prompt_template.format(text=document.text)
+        response_text = generate_completion(
+            client=config.client,
+            system_prompt=config.system_prompt,
+            user_prompt=prompt,
+        ).strip()
+        if config.method == TopicModelingLlmExtractionMethod.SINGLE:
+            if not response_text:
+                errors.append(f"LLM extraction returned empty output for {document.document_id}")
+                continue
+            extracted_documents.append(
+                _TopicDocument(
+                    document_id=document.document_id,
+                    source_item_id=document.source_item_id,
+                    text=response_text,
+                )
+            )
+            continue
+        items = _parse_itemized_response(response_text)
+        if not items:
+            errors.append(f"LLM itemization returned no items for {document.document_id}")
+            continue
+        for index, item_text in enumerate(items, start=1):
+            extracted_documents.append(
+                _TopicDocument(
+                    document_id=f"{document.document_id}:{index}",
+                    source_item_id=document.source_item_id,
+                    text=item_text,
+                )
+            )
+
+    report = TopicModelingLlmExtractionReport(
+        status=TopicModelingStageStatus.COMPLETE,
+        method=config.method,
+        input_documents=len(documents),
+        output_documents=len(extracted_documents),
+        warnings=[],
+        errors=errors,
+    )
+    if not extracted_documents:
+        report = report.model_copy(update={"status": TopicModelingStageStatus.FAILED})
+        raise ValueError("LLM extraction produced no usable documents")
+    return report, extracted_documents
+
+
+def _parse_itemized_response(response_text: str) -> List[str]:
+    cleaned = response_text.strip()
+    if not cleaned:
+        return []
+    try:
+        data = json.loads(cleaned)
+    except json.JSONDecodeError:
+        unescaped = cleaned.replace('\\"', '"')
+        try:
+            data = json.loads(unescaped)
+        except json.JSONDecodeError:
+            return []
+    if isinstance(data, str):
+        # Some models double-encode the list as a JSON string; decode once more.
+        try:
+            data = json.loads(data)
+        except json.JSONDecodeError:
+            return []
+    if not isinstance(data, list):
+        return []
+    items: List[str] = []
+    for entry in data:
+        if not isinstance(entry, str):
+            continue
+        entry_text = entry.strip()
+        if entry_text:
+            items.append(entry_text)
+    return items
+
+
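Illustrative inputs and outputs for the parser above; the string values stand in for hypothetical LLM responses:

    _parse_itemized_response('["storms", "harvest"]')          # -> ["storms", "harvest"]
    _parse_itemized_response('[\\"storms\\", \\"harvest\\"]')  # escaped quotes -> ["storms", "harvest"]
    _parse_itemized_response('"[\\"storms\\"]"')               # double-encoded list -> ["storms"]
    _parse_itemized_response('{"not": "a list"}')              # non-list JSON -> []
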
+def _apply_lexical_processing(
+    *,
+    documents: List[_TopicDocument],
+    config: TopicModelingLexicalProcessingConfig,
+) -> Tuple[TopicModelingLexicalProcessingReport, List[_TopicDocument]]:
+    if not config.enabled:
+        report = TopicModelingLexicalProcessingReport(
+            status=TopicModelingStageStatus.SKIPPED,
+            input_documents=len(documents),
+            output_documents=len(documents),
+            lowercase=config.lowercase,
+            strip_punctuation=config.strip_punctuation,
+            collapse_whitespace=config.collapse_whitespace,
+        )
+        return report, list(documents)
+
+    processed: List[_TopicDocument] = []
+    for document in documents:
+        text_value = document.text
+        if config.lowercase:
+            text_value = text_value.lower()
+        if config.strip_punctuation:
+            text_value = text_value.translate(str.maketrans("", "", string.punctuation))
+        if config.collapse_whitespace:
+            text_value = re.sub(r"\s+", " ", text_value).strip()
+        processed.append(
+            _TopicDocument(
+                document_id=document.document_id,
+                source_item_id=document.source_item_id,
+                text=text_value,
+            )
+        )
+
+    report = TopicModelingLexicalProcessingReport(
+        status=TopicModelingStageStatus.COMPLETE,
+        input_documents=len(documents),
+        output_documents=len(processed),
+        lowercase=config.lowercase,
+        strip_punctuation=config.strip_punctuation,
+        collapse_whitespace=config.collapse_whitespace,
+    )
+    return report, processed
+
+
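The three switches above compose from the standard library alone; a self-contained example on a made-up line:

    import re
    import string

    text = "  The LORD's  mercy,   endures!  "
    text = text.lower()                                               # lowercase
    text = text.translate(str.maketrans("", "", string.punctuation))  # strip punctuation
    text = re.sub(r"\s+", " ", text).strip()                          # collapse whitespace
    print(text)  # -> "the lords mercy endures"
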
+def _run_bertopic(
+    *,
+    documents: List[_TopicDocument],
+    config: TopicModelingBerTopicConfig,
+) -> Tuple[TopicModelingBerTopicReport, List[TopicModelingTopic]]:
+    try:
+        from bertopic import BERTopic
+    except ImportError as import_error:
+        raise ValueError(
+            "BERTopic analysis requires an optional dependency. "
+            'Install it with pip install "biblicus[topic-modeling]".'
+        ) from import_error
+
+    topic_model = BERTopic(**config.parameters)
+    texts = [document.text for document in documents]
+    assignments, _ = topic_model.fit_transform(texts)
+    assignment_list = list(assignments)
+    topic_ids = sorted({int(topic_id) for topic_id in assignment_list})
+    topics: List[TopicModelingTopic] = []
+    topic_documents = _group_documents_by_topic(documents, assignment_list)
+
+    for topic_id in topic_ids:
+        keywords = _resolve_topic_keywords(topic_model=topic_model, topic_id=topic_id)
+        label = keywords[0].keyword if keywords else f"Topic {topic_id}"
+        doc_entries = topic_documents.get(topic_id, [])
+        topics.append(
+            TopicModelingTopic(
+                topic_id=topic_id,
+                label=label,
+                label_source=TopicModelingLabelSource.BERTOPIC,
+                keywords=keywords,
+                document_count=len(doc_entries),
+                document_examples=[doc.text for doc in doc_entries[:3]],
+                document_ids=[doc.document_id for doc in doc_entries],
+            )
+        )
+
+    report = TopicModelingBerTopicReport(
+        status=TopicModelingStageStatus.COMPLETE,
+        topic_count=len(topics),
+        document_count=len(documents),
+        parameters=dict(config.parameters),
+        warnings=[],
+        errors=[],
+    )
+    return report, topics
+
+
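A behavioral note on the loop above: BERTopic conventionally assigns outlier documents to topic id -1, and nothing here filters it out, so when outliers exist the -1 bucket is reported as an ordinary topic whose label falls back to its top keyword (or "Topic -1" when no keywords are returned).
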
+def _group_documents_by_topic(
+    documents: List[_TopicDocument], assignments: List[int]
+) -> Dict[int, List[_TopicDocument]]:
+    grouped: Dict[int, List[_TopicDocument]] = {}
+    for index, topic_id in enumerate(assignments):
+        grouped.setdefault(int(topic_id), []).append(documents[index])
+    return grouped
+
+
+def _resolve_topic_keywords(
+    *, topic_model: Any, topic_id: int
+) -> List[TopicModelingKeyword]:
+    raw_keywords = topic_model.get_topic(topic_id) or []
+    return [
+        TopicModelingKeyword(keyword=str(entry[0]), score=float(entry[1]))
+        for entry in raw_keywords
+    ]
+
+
+def _apply_llm_fine_tuning(
+    *,
+    topics: List[TopicModelingTopic],
+    documents: List[_TopicDocument],
+    config: TopicModelingLlmFineTuningConfig,
+) -> Tuple[TopicModelingLlmFineTuningReport, List[TopicModelingTopic]]:
+    if not config.enabled:
+        report = TopicModelingLlmFineTuningReport(
+            status=TopicModelingStageStatus.SKIPPED,
+            topics_labeled=0,
+            warnings=[],
+            errors=[],
+        )
+        return report, topics
+
+    labeled_topics: List[TopicModelingTopic] = []
+    errors: List[str] = []
+    labeled_count = 0
+    topic_documents = {doc.document_id: doc for doc in documents}
+
+    for topic in topics:
+        keyword_text = ", ".join(
+            [keyword.keyword for keyword in topic.keywords[: config.max_keywords]]
+        )
+        selected_documents = []
+        for doc_id in topic.document_ids[: config.max_documents]:
+            doc = topic_documents.get(doc_id)
+            if doc is not None:
+                selected_documents.append(doc.text)
+        documents_text = "\n".join(selected_documents)
+        prompt = config.prompt_template.format(
+            keywords=keyword_text,
+            documents=documents_text,
+        )
+        label_text = generate_completion(
+            client=config.client,
+            system_prompt=config.system_prompt,
+            user_prompt=prompt,
+        ).strip()
+        if label_text:
+            labeled_topics.append(
+                topic.model_copy(
+                    update={
+                        "label": label_text,
+                        "label_source": TopicModelingLabelSource.LLM,
+                    }
+                )
+            )
+            labeled_count += 1
+        else:
+            errors.append(f"LLM fine-tuning returned empty label for topic {topic.topic_id}")
+            labeled_topics.append(topic)
+
+    report = TopicModelingLlmFineTuningReport(
+        status=TopicModelingStageStatus.COMPLETE,
+        topics_labeled=labeled_count,
+        warnings=[],
+        errors=errors,
+    )
+    return report, labeled_topics
+
+
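The labeling prompt built above is plain str.format substitution over {keywords} and {documents} placeholders. A standalone illustration, with an entirely hypothetical template:

    template = (
        "Propose a concise topic label.\n"
        "Keywords: {keywords}\n"
        "Example documents:\n{documents}"
    )
    prompt = template.format(
        keywords="bread, wine, table",
        documents="they broke bread together\nthe cup was passed around",
    )
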
+def _write_analysis_run_manifest(*, run_dir: Path, manifest: AnalysisRunManifest) -> None:
+    manifest_path = run_dir / "manifest.json"
+    manifest_path.write_text(manifest.model_dump_json(indent=2) + "\n", encoding="utf-8")
+
+
+def _write_topic_modeling_output(*, path: Path, output: TopicModelingOutput) -> None:
+    path.write_text(output.model_dump_json(indent=2) + "\n", encoding="utf-8")
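
A completed run leaves two artifacts in the run directory: manifest.json (the AnalysisRunManifest, whose artifact_paths lists output.json and whose stats records the document and topic counts) and output.json (the serialized TopicModelingOutput). A sketch of reading the topics back; the run-directory path is hypothetical, since the corpus directory layout is not part of this diff:

    import json
    from pathlib import Path

    run_dir = Path("corpus/analysis/topic-modeling/<run-id>")  # hypothetical layout
    output = json.loads((run_dir / "output.json").read_text(encoding="utf-8"))
    for topic in output["report"]["topics"]:
        print(topic["topic_id"], topic["label"], topic["document_count"])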