biblicus 0.8.0__tar.gz → 0.10.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {biblicus-0.8.0/src/biblicus.egg-info → biblicus-0.10.0}/PKG-INFO +17 -10
- {biblicus-0.8.0 → biblicus-0.10.0}/README.md +14 -9
- biblicus-0.10.0/docs/ANALYSIS.md +47 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/docs/DEMOS.md +20 -31
- biblicus-0.10.0/docs/PROFILING.md +98 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/docs/ROADMAP.md +10 -54
- {biblicus-0.8.0 → biblicus-0.10.0}/docs/TESTING.md +1 -1
- biblicus-0.10.0/docs/TOPIC_MODELING.md +159 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/docs/conf.py +5 -8
- {biblicus-0.8.0 → biblicus-0.10.0}/docs/index.rst +2 -0
- biblicus-0.10.0/features/analysis_schema.feature +110 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/environment.py +29 -5
- biblicus-0.10.0/features/profiling.feature +150 -0
- biblicus-0.10.0/features/steps/analysis_steps.py +389 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/cli_steps.py +13 -7
- {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/crawl_steps.py +6 -2
- {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/deepgram_steps.py +3 -11
- {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/docling_steps.py +2 -6
- {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/evidence_processing_steps.py +0 -1
- {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/extraction_run_lifecycle_steps.py +6 -2
- {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/extraction_steps.py +25 -6
- {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/inference_steps.py +12 -6
- {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/markitdown_steps.py +1 -3
- {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/openai_steps.py +3 -1
- {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/paddleocr_mock_steps.py +0 -1
- {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/paddleocr_vl_steps.py +17 -19
- {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/paddleocr_vl_unit_steps.py +10 -9
- biblicus-0.10.0/features/steps/profiling_steps.py +205 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/requests_mock_steps.py +32 -13
- {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/topic_modeling_steps.py +98 -7
- {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/user_config_steps.py +6 -7
- {biblicus-0.8.0 → biblicus-0.10.0}/features/topic_modeling.feature +170 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/pyproject.toml +5 -1
- biblicus-0.10.0/scripts/download_ag_news.py +150 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/scripts/download_audio_samples.py +9 -5
- {biblicus-0.8.0 → biblicus-0.10.0}/scripts/download_image_samples.py +0 -5
- {biblicus-0.8.0 → biblicus-0.10.0}/scripts/download_mixed_samples.py +0 -6
- {biblicus-0.8.0 → biblicus-0.10.0}/scripts/download_pdf_samples.py +0 -5
- {biblicus-0.8.0 → biblicus-0.10.0}/scripts/download_wikipedia.py +1 -5
- biblicus-0.10.0/scripts/profiling_demo.py +212 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/scripts/readme_end_to_end_demo.py +0 -1
- {biblicus-0.8.0 → biblicus-0.10.0}/scripts/test.py +0 -4
- {biblicus-0.8.0 → biblicus-0.10.0}/scripts/topic_modeling_integration.py +76 -14
- {biblicus-0.8.0 → biblicus-0.10.0}/scripts/wikipedia_rag_demo.py +3 -8
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/__init__.py +1 -1
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/_vendor/dotyaml/__init__.py +0 -1
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/_vendor/dotyaml/interpolation.py +0 -1
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/_vendor/dotyaml/loader.py +0 -1
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/_vendor/dotyaml/transformer.py +0 -1
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/analysis/__init__.py +2 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/analysis/models.py +268 -3
- biblicus-0.10.0/src/biblicus/analysis/profiling.py +337 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/analysis/topic_modeling.py +28 -7
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/backends/sqlite_full_text_search.py +2 -4
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/cli.py +83 -4
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/corpus.py +9 -3
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/evidence_processing.py +4 -2
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/extraction.py +3 -1
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/extractors/markitdown_text.py +1 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/extractors/paddleocr_vl_text.py +1 -3
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/user_config.py +2 -6
- {biblicus-0.8.0 → biblicus-0.10.0/src/biblicus.egg-info}/PKG-INFO +17 -10
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus.egg-info/SOURCES.txt +7 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus.egg-info/requires.txt +3 -0
- biblicus-0.8.0/docs/TOPIC_MODELING.md +0 -82
- biblicus-0.8.0/features/analysis_schema.feature +0 -36
- biblicus-0.8.0/features/steps/analysis_steps.py +0 -194
- {biblicus-0.8.0 → biblicus-0.10.0}/LICENSE +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/MANIFEST.in +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/THIRD_PARTY_NOTICES.md +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/datasets/wikipedia_mini.json +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/docs/ARCHITECTURE.md +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/docs/BACKENDS.md +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/docs/CONTEXT_PACK.md +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/docs/CORPUS.md +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/docs/CORPUS_DESIGN.md +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/docs/EXTRACTION.md +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/docs/FEATURE_INDEX.md +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/docs/KNOWLEDGE_BASE.md +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/docs/STT.md +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/docs/USER_CONFIGURATION.md +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/docs/api.rst +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/docs/backends/index.md +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/docs/backends/scan.md +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/docs/backends/sqlite-full-text-search.md +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/docs/extractors/index.md +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/docs/extractors/ocr/index.md +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/docs/extractors/ocr/paddleocr-vl.md +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/docs/extractors/ocr/rapidocr.md +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/docs/extractors/pipeline-utilities/index.md +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/docs/extractors/pipeline-utilities/pipeline.md +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/docs/extractors/pipeline-utilities/select-longest.md +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/docs/extractors/pipeline-utilities/select-override.md +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/docs/extractors/pipeline-utilities/select-smart-override.md +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/docs/extractors/pipeline-utilities/select-text.md +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/docs/extractors/speech-to-text/deepgram.md +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/docs/extractors/speech-to-text/index.md +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/docs/extractors/speech-to-text/openai.md +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/docs/extractors/text-document/index.md +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/docs/extractors/text-document/markitdown.md +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/docs/extractors/text-document/metadata.md +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/docs/extractors/text-document/pass-through.md +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/docs/extractors/text-document/pdf.md +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/docs/extractors/text-document/unstructured.md +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/docs/extractors/vlm-document/docling-granite.md +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/docs/extractors/vlm-document/docling-smol.md +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/docs/extractors/vlm-document/index.md +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/backend_validation.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/biblicus_corpus.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/cli_entrypoint.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/cli_parsing.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/cli_step_spec_parsing.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/content_sniffing.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/context_pack.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/context_pack_cli.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/corpus_edge_cases.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/corpus_identity.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/corpus_purge.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/crawl.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/docling_granite_extractor.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/docling_smol_extractor.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/error_cases.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/evaluation.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/evidence_processing.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/extraction_error_handling.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/extraction_run_lifecycle.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/extraction_selection.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/extraction_selection_longest.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/extractor_pipeline.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/extractor_validation.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/frontmatter.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/hook_config_validation.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/hook_error_handling.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/import_tree.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/inference_backend.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/ingest_sources.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/integration_audio_samples.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/integration_image_samples.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/integration_mixed_corpus.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/integration_mixed_extraction.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/integration_ocr_image_extraction.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/integration_pdf_retrieval.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/integration_pdf_samples.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/integration_unstructured_extraction.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/integration_wikipedia.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/knowledge_base.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/lifecycle_hooks.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/markitdown_extractor.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/model_validation.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/ocr_extractor.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/paddleocr_vl_extractor.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/paddleocr_vl_parse_api_response.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/pdf_text_extraction.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/python_api.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/python_hook_logging.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/query_processing.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/recipe_file_extraction.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/retrieval_budget.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/retrieval_scan.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/retrieval_sqlite_full_text_search.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/retrieval_uses_extraction_run.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/retrieval_utilities.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/select_override.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/smart_override_selection.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/source_loading.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/backend_steps.py +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/cli_parsing_steps.py +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/context_pack_steps.py +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/extractor_steps.py +1 -1
- {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/frontmatter_steps.py +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/knowledge_base_steps.py +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/model_steps.py +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/pdf_steps.py +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/python_api_steps.py +1 -1
- {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/rapidocr_steps.py +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/retrieval_steps.py +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/stt_deepgram_steps.py +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/stt_steps.py +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/unstructured_steps.py +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/streaming_ingest.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/stt_deepgram_extractor.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/stt_extractor.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/text_extraction_runs.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/token_budget.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/unstructured_extractor.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/features/user_config.feature +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/setup.cfg +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/__main__.py +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/analysis/base.py +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/analysis/llm.py +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/analysis/schema.py +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/backends/__init__.py +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/backends/base.py +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/backends/scan.py +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/constants.py +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/context.py +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/crawl.py +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/errors.py +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/evaluation.py +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/extractors/__init__.py +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/extractors/base.py +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/extractors/deepgram_stt.py +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/extractors/docling_granite_text.py +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/extractors/docling_smol_text.py +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/extractors/metadata_text.py +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/extractors/openai_stt.py +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/extractors/pass_through_text.py +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/extractors/pdf_text.py +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/extractors/pipeline.py +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/extractors/rapidocr_text.py +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/extractors/select_longest_text.py +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/extractors/select_override.py +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/extractors/select_smart_override.py +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/extractors/select_text.py +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/extractors/unstructured_text.py +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/frontmatter.py +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/hook_logging.py +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/hook_manager.py +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/hooks.py +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/ignore.py +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/inference.py +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/knowledge_base.py +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/models.py +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/retrieval.py +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/sources.py +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/time.py +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/uris.py +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus.egg-info/dependency_links.txt +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus.egg-info/entry_points.txt +0 -0
- {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: biblicus
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.10.0
|
|
4
4
|
Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
|
|
5
5
|
License: MIT
|
|
6
6
|
Requires-Python: >=3.9
|
|
@@ -40,6 +40,8 @@ Provides-Extra: docling-mlx
|
|
|
40
40
|
Requires-Dist: docling[mlx-vlm]>=2.0.0; extra == "docling-mlx"
|
|
41
41
|
Provides-Extra: topic-modeling
|
|
42
42
|
Requires-Dist: bertopic>=0.15.0; extra == "topic-modeling"
|
|
43
|
+
Provides-Extra: datasets
|
|
44
|
+
Requires-Dist: datasets>=2.18.0; extra == "datasets"
|
|
43
45
|
Dynamic: license-file
|
|
44
46
|
|
|
45
47
|
# Biblicus
|
|
@@ -529,10 +531,13 @@ For detailed documentation on all extractors, see the [Extractor Reference][extr
|
|
|
529
531
|
|
|
530
532
|
## Topic modeling analysis
|
|
531
533
|
|
|
532
|
-
Biblicus can run analysis pipelines on extracted text without changing the raw corpus.
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
JavaScript Object Notation.
|
|
534
|
+
Biblicus can run analysis pipelines on extracted text without changing the raw corpus. Profiling and topic modeling
|
|
535
|
+
are the first analysis backends. Profiling summarizes corpus composition and extraction coverage. Topic modeling reads
|
|
536
|
+
an extraction run, optionally applies an LLM-driven extraction pass, applies lexical processing, runs BERTopic, and
|
|
537
|
+
optionally applies an LLM fine-tuning pass to label topics. The output is structured JavaScript Object Notation.
|
|
538
|
+
|
|
539
|
+
See `docs/ANALYSIS.md` for the analysis pipeline overview, `docs/PROFILING.md` for profiling, and
|
|
540
|
+
`docs/TOPIC_MODELING.md` for topic modeling details.
|
|
536
541
|
|
|
537
542
|
Run a topic analysis using a recipe file:
|
|
538
543
|
|
|
@@ -564,26 +569,28 @@ bertopic_analysis:
|
|
|
564
569
|
parameters:
|
|
565
570
|
min_topic_size: 8
|
|
566
571
|
nr_topics: 10
|
|
572
|
+
vectorizer:
|
|
573
|
+
ngram_range: [1, 2]
|
|
574
|
+
stop_words: english
|
|
567
575
|
llm_fine_tuning:
|
|
568
576
|
enabled: false
|
|
569
577
|
```
|
|
570
578
|
|
|
571
579
|
LLM extraction and fine-tuning require `biblicus[openai]` and a configured OpenAI API key.
|
|
572
580
|
Recipe files are validated strictly against the topic modeling schema, so type mismatches or unknown fields are errors.
|
|
581
|
+
AG News integration runs require `biblicus[datasets]` in addition to `biblicus[topic-modeling]`.
|
|
573
582
|
|
|
574
|
-
For a repeatable, real-world integration run that downloads
|
|
583
|
+
For a repeatable, real-world integration run that downloads AG News and executes topic modeling, use:
|
|
575
584
|
|
|
576
585
|
```
|
|
577
|
-
python3 scripts/topic_modeling_integration.py --corpus corpora/
|
|
586
|
+
python3 scripts/topic_modeling_integration.py --corpus corpora/ag_news_demo --force
|
|
578
587
|
```
|
|
579
588
|
|
|
580
589
|
See `docs/TOPIC_MODELING.md` for parameter examples and per-topic output behavior.
|
|
581
590
|
|
|
582
591
|
## Integration corpus and evaluation dataset
|
|
583
592
|
|
|
584
|
-
Use `scripts/
|
|
585
|
-
|
|
586
|
-
The dataset file `datasets/wikipedia_mini.json` provides a small evaluation set that matches the integration corpus.
|
|
593
|
+
Use `scripts/download_ag_news.py` to download the AG News dataset when running topic modeling demos. The repository does not include that content.
|
|
587
594
|
|
|
588
595
|
Use `scripts/download_pdf_samples.py` to download a small Portable Document Format integration corpus when running tests or demos. The repository does not include that content.
|
|
589
596
|
|
|
@@ -485,10 +485,13 @@ For detailed documentation on all extractors, see the [Extractor Reference][extr
|
|
|
485
485
|
|
|
486
486
|
## Topic modeling analysis
|
|
487
487
|
|
|
488
|
-
Biblicus can run analysis pipelines on extracted text without changing the raw corpus.
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
JavaScript Object Notation.
|
|
488
|
+
Biblicus can run analysis pipelines on extracted text without changing the raw corpus. Profiling and topic modeling
|
|
489
|
+
are the first analysis backends. Profiling summarizes corpus composition and extraction coverage. Topic modeling reads
|
|
490
|
+
an extraction run, optionally applies an LLM-driven extraction pass, applies lexical processing, runs BERTopic, and
|
|
491
|
+
optionally applies an LLM fine-tuning pass to label topics. The output is structured JavaScript Object Notation.
|
|
492
|
+
|
|
493
|
+
See `docs/ANALYSIS.md` for the analysis pipeline overview, `docs/PROFILING.md` for profiling, and
|
|
494
|
+
`docs/TOPIC_MODELING.md` for topic modeling details.
|
|
492
495
|
|
|
493
496
|
Run a topic analysis using a recipe file:
|
|
494
497
|
|
|
@@ -520,26 +523,28 @@ bertopic_analysis:
|
|
|
520
523
|
parameters:
|
|
521
524
|
min_topic_size: 8
|
|
522
525
|
nr_topics: 10
|
|
526
|
+
vectorizer:
|
|
527
|
+
ngram_range: [1, 2]
|
|
528
|
+
stop_words: english
|
|
523
529
|
llm_fine_tuning:
|
|
524
530
|
enabled: false
|
|
525
531
|
```
|
|
526
532
|
|
|
527
533
|
LLM extraction and fine-tuning require `biblicus[openai]` and a configured OpenAI API key.
|
|
528
534
|
Recipe files are validated strictly against the topic modeling schema, so type mismatches or unknown fields are errors.
|
|
535
|
+
AG News integration runs require `biblicus[datasets]` in addition to `biblicus[topic-modeling]`.
|
|
529
536
|
|
|
530
|
-
For a repeatable, real-world integration run that downloads
|
|
537
|
+
For a repeatable, real-world integration run that downloads AG News and executes topic modeling, use:
|
|
531
538
|
|
|
532
539
|
```
|
|
533
|
-
python3 scripts/topic_modeling_integration.py --corpus corpora/
|
|
540
|
+
python3 scripts/topic_modeling_integration.py --corpus corpora/ag_news_demo --force
|
|
534
541
|
```
|
|
535
542
|
|
|
536
543
|
See `docs/TOPIC_MODELING.md` for parameter examples and per-topic output behavior.
|
|
537
544
|
|
|
538
545
|
## Integration corpus and evaluation dataset
|
|
539
546
|
|
|
540
|
-
Use `scripts/
|
|
541
|
-
|
|
542
|
-
The dataset file `datasets/wikipedia_mini.json` provides a small evaluation set that matches the integration corpus.
|
|
547
|
+
Use `scripts/download_ag_news.py` to download the AG News dataset when running topic modeling demos. The repository does not include that content.
|
|
543
548
|
|
|
544
549
|
Use `scripts/download_pdf_samples.py` to download a small Portable Document Format integration corpus when running tests or demos. The repository does not include that content.
|
|
545
550
|
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# Corpus analysis
|
|
2
|
+
|
|
3
|
+
Biblicus supports analysis backends that run on extracted text artifacts without changing the raw corpus. Analysis is a
|
|
4
|
+
pluggable phase that reads an extraction run, produces structured output, and stores artifacts under the corpus runs
|
|
5
|
+
folder. Each analysis backend declares its own configuration schema and output contract, and all schemas are validated
|
|
6
|
+
strictly.
|
|
7
|
+
|
|
8
|
+
## How analysis runs work
|
|
9
|
+
|
|
10
|
+
- Analysis runs are tied to a corpus state via the extraction run reference.
|
|
11
|
+
- The analysis output is written under `.biblicus/runs/analysis/<analysis-id>/<run_id>/`.
|
|
12
|
+
- Analysis is reproducible when you supply the same extraction run and corpus catalog state.
|
|
13
|
+
- Analysis configuration is stored as a recipe manifest in the run metadata.
|
|
14
|
+
|
|
15
|
+
If you omit the extraction run, Biblicus uses the most recent extraction run and emits a reproducibility warning. For
|
|
16
|
+
repeatable analysis runs, always pass the extraction run reference explicitly.
|
|
17
|
+
|
|
18
|
+
## Pluggable analysis backends
|
|
19
|
+
|
|
20
|
+
Analysis backends implement the `CorpusAnalysisBackend` interface and are registered under `biblicus.analysis`.
|
|
21
|
+
A backend receives the corpus, a recipe name, a configuration mapping, and an extraction run reference. It returns a
|
|
22
|
+
Pydantic model that is serialized to JavaScript Object Notation for storage.
|
|
23
|
+
|
|
24
|
+
## Topic modeling
|
|
25
|
+
|
|
26
|
+
Topic modeling is the first analysis backend. It uses BERTopic to cluster extracted text, produces per-topic evidence,
|
|
27
|
+
and optionally labels topics using an LLM. See `docs/TOPIC_MODELING.md` for detailed configuration and examples.
|
|
28
|
+
|
|
29
|
+
The integration demo script is a working reference you can use as a starting point:
|
|
30
|
+
|
|
31
|
+
```
|
|
32
|
+
python3 scripts/topic_modeling_integration.py --corpus corpora/ag_news_demo --force
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
The command prints the analysis run identifier and the output path. Open the resulting `output.json` to inspect per-topic
|
|
36
|
+
labels, keywords, and document examples.
|
|
37
|
+
|
|
38
|
+
## Profiling analysis
|
|
39
|
+
|
|
40
|
+
Profiling is the baseline analysis backend. It summarizes corpus composition and extraction coverage using
|
|
41
|
+
deterministic counts and distribution metrics. See `docs/PROFILING.md` for the full reference and working demo.
|
|
42
|
+
|
|
43
|
+
Run profiling from the CLI:
|
|
44
|
+
|
|
45
|
+
```
|
|
46
|
+
biblicus analyze profile --corpus corpora/example --extraction-run pipeline:RUN_ID
|
|
47
|
+
```
|
|
@@ -187,19 +187,26 @@ The output includes a `run_id` you can reuse when building a retrieval backend.
|
|
|
187
187
|
|
|
188
188
|
### Topic modeling integration run
|
|
189
189
|
|
|
190
|
-
Use the integration script to download
|
|
190
|
+
Use the integration script to download AG News, run extraction, and run topic modeling with a single command.
|
|
191
|
+
Install optional dependencies first:
|
|
191
192
|
|
|
192
193
|
```
|
|
193
|
-
python3
|
|
194
|
+
python3 -m pip install "biblicus[datasets,topic-modeling]"
|
|
194
195
|
```
|
|
195
196
|
|
|
196
|
-
|
|
197
|
+
```
|
|
198
|
+
python3 scripts/topic_modeling_integration.py --corpus corpora/ag_news_demo --force
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
Run with a larger corpus and a higher topic count:
|
|
197
202
|
|
|
198
203
|
```
|
|
199
204
|
python3 scripts/topic_modeling_integration.py \
|
|
200
|
-
--corpus corpora/
|
|
205
|
+
--corpus corpora/ag_news_demo \
|
|
201
206
|
--force \
|
|
202
|
-
--limit
|
|
207
|
+
--limit 10000 \
|
|
208
|
+
--vectorizer-ngram-min 1 \
|
|
209
|
+
--vectorizer-ngram-max 2 \
|
|
203
210
|
--bertopic-param nr_topics=8 \
|
|
204
211
|
--bertopic-param min_topic_size=2
|
|
205
212
|
```
|
|
@@ -207,6 +214,14 @@ python3 scripts/topic_modeling_integration.py \
|
|
|
207
214
|
The command prints the analysis run identifier and the output path. Open the `output.json` file to inspect per-topic labels,
|
|
208
215
|
keywords, and document examples.
|
|
209
216
|
|
|
217
|
+
### Profiling analysis demo
|
|
218
|
+
|
|
219
|
+
The profiling demo downloads AG News, runs extraction, and produces a profiling report.
|
|
220
|
+
|
|
221
|
+
```
|
|
222
|
+
python3 scripts/profiling_demo.py --corpus corpora/profiling_demo --force
|
|
223
|
+
```
|
|
224
|
+
|
|
210
225
|
### Select extracted text within a pipeline
|
|
211
226
|
|
|
212
227
|
When you want an explicit choice among multiple extraction outputs, add a selection extractor step at the end of the pipeline.
|
|
@@ -243,15 +258,6 @@ python3 -m biblicus build --corpus corpora/pdf_samples --backend sqlite-full-tex
|
|
|
243
258
|
python3 -m biblicus query --corpus corpora/pdf_samples --query "Dummy PDF file"
|
|
244
259
|
```
|
|
245
260
|
|
|
246
|
-
### Wikipedia retrieval demo (Python)
|
|
247
|
-
|
|
248
|
-
This example downloads a few Wikipedia summaries about retrieval and knowledge bases, builds an extraction run, creates a local full text index, and returns evidence plus a context pack.
|
|
249
|
-
|
|
250
|
-
```
|
|
251
|
-
rm -rf corpora/wikipedia_rag_demo
|
|
252
|
-
python3 scripts/wikipedia_rag_demo.py --corpus corpora/wikipedia_rag_demo --force
|
|
253
|
-
```
|
|
254
|
-
|
|
255
261
|
### MarkItDown extraction demo (Python 3.10+)
|
|
256
262
|
|
|
257
263
|
MarkItDown requires Python 3.10 or higher. This example uses the `py311` conda environment to run the extractor over the mixed sample corpus.
|
|
@@ -374,23 +380,6 @@ python3 -m biblicus build --corpus corpora/demo --backend sqlite-full-text-searc
|
|
|
374
380
|
python3 -m biblicus query --corpus corpora/demo --query "tiny"
|
|
375
381
|
```
|
|
376
382
|
|
|
377
|
-
### Evaluate a run against a dataset
|
|
378
|
-
|
|
379
|
-
The repository includes a small dataset that matches the Wikipedia integration corpus.
|
|
380
|
-
|
|
381
|
-
```
|
|
382
|
-
python3 -m biblicus eval --corpus corpora/demo --dataset datasets/wikipedia_mini.json
|
|
383
|
-
```
|
|
384
|
-
|
|
385
|
-
If you want the matching corpus content, download it first into a separate corpus.
|
|
386
|
-
|
|
387
|
-
```
|
|
388
|
-
rm -rf corpora/wikipedia
|
|
389
|
-
python3 scripts/download_wikipedia.py --corpus corpora/wikipedia --limit 5 --force
|
|
390
|
-
python3 -m biblicus build --corpus corpora/wikipedia --backend sqlite-full-text-search
|
|
391
|
-
python3 -m biblicus eval --corpus corpora/wikipedia --dataset datasets/wikipedia_mini.json
|
|
392
|
-
```
|
|
393
|
-
|
|
394
383
|
### Run the test suite and view coverage
|
|
395
384
|
|
|
396
385
|
```
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
# Corpus profiling analysis
|
|
2
|
+
|
|
3
|
+
Biblicus provides a profiling analysis backend that summarizes corpus contents using deterministic counts and
|
|
4
|
+
coverage metrics. Profiling is intended as a fast, local baseline before heavier analysis such as topic modeling.
|
|
5
|
+
|
|
6
|
+
## What profiling does
|
|
7
|
+
|
|
8
|
+
The profiling analysis reports:
|
|
9
|
+
|
|
10
|
+
- Total item count and media type distribution
|
|
11
|
+
- Extracted text coverage (present, empty, missing)
|
|
12
|
+
- Size and length distributions with percentiles
|
|
13
|
+
- Tag coverage and top tags
|
|
14
|
+
|
|
15
|
+
The output is structured JSON that can be stored, versioned, and compared across runs.
|
|
16
|
+
|
|
17
|
+
## Run profiling from the CLI
|
|
18
|
+
|
|
19
|
+
```
|
|
20
|
+
biblicus analyze profile --corpus corpora/example --extraction-run pipeline:RUN_ID
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
If you omit `--extraction-run`, Biblicus uses the latest extraction run and emits a reproducibility warning.
|
|
24
|
+
|
|
25
|
+
To customize profiling metrics, pass a recipe file:
|
|
26
|
+
|
|
27
|
+
```
|
|
28
|
+
biblicus analyze profile --corpus corpora/example --recipe recipes/profiling.yml --extraction-run pipeline:RUN_ID
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
### Profiling recipe configuration
|
|
32
|
+
|
|
33
|
+
Profiling recipes use the analysis schema version and accept these fields:
|
|
34
|
+
|
|
35
|
+
- `schema_version`: analysis schema version, currently `1`
|
|
36
|
+
- `sample_size`: optional cap for distribution calculations
|
|
37
|
+
- `min_text_characters`: minimum extracted text length for inclusion
|
|
38
|
+
- `percentiles`: percentiles to compute for size and length distributions
|
|
39
|
+
- `top_tag_count`: maximum number of tags to list in `top_tags`
|
|
40
|
+
- `tag_filters`: optional list of tags to include in tag coverage metrics
|
|
41
|
+
|
|
42
|
+
Example recipe:
|
|
43
|
+
|
|
44
|
+
```
|
|
45
|
+
schema_version: 1
|
|
46
|
+
sample_size: 500
|
|
47
|
+
min_text_characters: 50
|
|
48
|
+
percentiles: [50, 90, 99]
|
|
49
|
+
top_tag_count: 10
|
|
50
|
+
tag_filters: ["ag_news", "label:World"]
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Run profiling from Python
|
|
54
|
+
|
|
55
|
+
```
|
|
56
|
+
from pathlib import Path
|
|
57
|
+
|
|
58
|
+
from biblicus.analysis import get_analysis_backend
|
|
59
|
+
from biblicus.corpus import Corpus
|
|
60
|
+
from biblicus.models import ExtractionRunReference
|
|
61
|
+
|
|
62
|
+
corpus = Corpus.open(Path("corpora/example"))
|
|
63
|
+
backend = get_analysis_backend("profiling")
|
|
64
|
+
output = backend.run_analysis(
|
|
65
|
+
corpus,
|
|
66
|
+
recipe_name="default",
|
|
67
|
+
config={
|
|
68
|
+
"schema_version": 1,
|
|
69
|
+
"sample_size": 500,
|
|
70
|
+
"min_text_characters": 50,
|
|
71
|
+
"percentiles": [50, 90, 99],
|
|
72
|
+
"top_tag_count": 10,
|
|
73
|
+
"tag_filters": ["ag_news"],
|
|
74
|
+
},
|
|
75
|
+
extraction_run=ExtractionRunReference(
|
|
76
|
+
extractor_id="pipeline",
|
|
77
|
+
run_id="RUN_ID",
|
|
78
|
+
),
|
|
79
|
+
)
|
|
80
|
+
print(output.model_dump())
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
## Output location
|
|
84
|
+
|
|
85
|
+
Profiling output is stored under:
|
|
86
|
+
|
|
87
|
+
```
|
|
88
|
+
.biblicus/runs/analysis/profiling/<run_id>/output.json
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
## Working demo
|
|
92
|
+
|
|
93
|
+
A runnable demo is provided in `scripts/profiling_demo.py`. It downloads a corpus, runs extraction, and executes the
|
|
94
|
+
profiling analysis so you can inspect the output:
|
|
95
|
+
|
|
96
|
+
```
|
|
97
|
+
python3 scripts/profiling_demo.py --corpus corpora/profiling_demo --force
|
|
98
|
+
```
|
|
@@ -46,23 +46,20 @@ Acceptance checks:
|
|
|
46
46
|
- Behavior specifications cover policy selection and budgeting behaviors.
|
|
47
47
|
- Example outputs show how context packs differ across policies.
|
|
48
48
|
|
|
49
|
-
## Next: extraction
|
|
49
|
+
## Next: extraction evaluation harness
|
|
50
50
|
|
|
51
|
-
Goal:
|
|
51
|
+
Goal: compare extraction approaches in a way that is measurable, repeatable, and useful for practical engineering decisions.
|
|
52
52
|
|
|
53
53
|
Deliverables:
|
|
54
54
|
|
|
55
|
-
-
|
|
56
|
-
-
|
|
57
|
-
- A
|
|
58
|
-
- A consistent output contract that captures text plus optional confidence and per-page metadata.
|
|
59
|
-
- A selector policy for choosing between multiple extractor outputs in a pipeline.
|
|
60
|
-
- A shared evaluation harness for extraction backends using the same corpus and dataset.
|
|
55
|
+
- Dataset authoring workflow for extraction ground truth (for example: expected transcripts and expected OCR text).
|
|
56
|
+
- Evaluation metrics for accuracy, speed, and cost, including processable fraction for a given extractor recipe.
|
|
57
|
+
- A report format that can compare multiple extraction recipes against the same corpus and dataset.
|
|
61
58
|
|
|
62
59
|
Acceptance checks:
|
|
63
60
|
|
|
64
|
-
-
|
|
65
|
-
-
|
|
61
|
+
- Evaluation results are stable and reproducible for the same corpus and dataset inputs.
|
|
62
|
+
- Reports make it clear when an extractor fails to process an item versus producing empty output.
|
|
66
63
|
|
|
67
64
|
## Next: corpus analysis tools
|
|
68
65
|
|
|
@@ -70,41 +67,15 @@ Goal: provide lightweight analysis utilities that summarize corpus themes and gu
|
|
|
70
67
|
|
|
71
68
|
Deliverables:
|
|
72
69
|
|
|
73
|
-
-
|
|
74
|
-
-
|
|
75
|
-
- A way to compare
|
|
70
|
+
- Basic data profiling reports (counts, media types, size distributions, tag coverage).
|
|
71
|
+
- Hidden Markov modeling analysis for sequence-driven corpora.
|
|
72
|
+
- A way to compare analysis outputs across corpora or corpus snapshots.
|
|
76
73
|
|
|
77
74
|
Acceptance checks:
|
|
78
75
|
|
|
79
76
|
- Analysis is reproducible for the same corpus state.
|
|
80
77
|
- Reports are exportable and readable without custom tooling.
|
|
81
78
|
|
|
82
|
-
### Candidate backend ecosystem (for planning and evaluation)
|
|
83
|
-
|
|
84
|
-
Document understanding and OCR blur together at the interface level in Biblicus, so the roadmap treats them as extractor candidates with the same input/output contract.
|
|
85
|
-
|
|
86
|
-
Docling family candidates:
|
|
87
|
-
|
|
88
|
-
- Docling (document understanding with structured outputs)
|
|
89
|
-
- docling-ocr (OCR component in the Docling ecosystem)
|
|
90
|
-
|
|
91
|
-
General-purpose extraction candidates:
|
|
92
|
-
|
|
93
|
-
- Unstructured (element-oriented extraction for many formats)
|
|
94
|
-
- MarkItDown (lightweight conversion to Markdown)
|
|
95
|
-
- Kreuzberg (speed-focused extraction for bulk workflows)
|
|
96
|
-
- ExtractThinker (schema-driven extraction using Pydantic contracts)
|
|
97
|
-
|
|
98
|
-
Ecosystem adapters:
|
|
99
|
-
|
|
100
|
-
- LangChain document loaders (uniform loader interface across many sources)
|
|
101
|
-
|
|
102
|
-
### Guidance for choosing early targets
|
|
103
|
-
|
|
104
|
-
- If you need layout and table understanding, prioritize Docling and docling-ocr.
|
|
105
|
-
- If you need speed and simplicity, prioritize MarkItDown or Kreuzberg.
|
|
106
|
-
- If you need schema-first extraction, prioritize ExtractThinker layered on an OCR or document extractor.
|
|
107
|
-
|
|
108
79
|
## Later: alternate backends and hosting modes
|
|
109
80
|
|
|
110
81
|
Goal: broaden the backend surface while keeping the core predictable.
|
|
@@ -138,18 +109,3 @@ Acceptance checks:
|
|
|
138
109
|
|
|
139
110
|
- Behavior specifications cover ingestion, listing, and reindexing in memory.
|
|
140
111
|
- Retrieval and extraction can operate on the in-memory corpus without special casing.
|
|
141
|
-
|
|
142
|
-
### Extractor datasets and evaluation harness
|
|
143
|
-
|
|
144
|
-
Goal: compare extraction approaches in a way that is measurable, repeatable, and useful for practical engineering decisions.
|
|
145
|
-
|
|
146
|
-
Deliverables:
|
|
147
|
-
|
|
148
|
-
- Dataset authoring workflow for extraction ground truth (for example: expected transcripts and expected optical character recognition text).
|
|
149
|
-
- Evaluation metrics for accuracy, speed, and cost, including “processable fraction” for a given extractor recipe.
|
|
150
|
-
- A report format that can compare multiple extraction recipes against the same corpus and dataset.
|
|
151
|
-
|
|
152
|
-
Acceptance checks:
|
|
153
|
-
|
|
154
|
-
- Evaluation results are stable and reproducible for the same corpus and dataset inputs.
|
|
155
|
-
- Reports make it clear when an extractor fails to process an item versus producing empty output.
|
|
@@ -36,7 +36,7 @@ Integration scenarios are tagged `@integration`.
|
|
|
36
36
|
|
|
37
37
|
The repository does not include downloaded content. Integration scripts download content into a corpus path you choose and then ingest it for a test run.
|
|
38
38
|
|
|
39
|
-
-
|
|
39
|
+
- AG News dataset: `scripts/download_ag_news.py`
|
|
40
40
|
- Portable Document Format samples: `scripts/download_pdf_samples.py`
|
|
41
41
|
- Image samples: `scripts/download_image_samples.py`
|
|
42
42
|
- Mixed modality samples: `scripts/download_mixed_samples.py`
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
# Topic modeling
|
|
2
|
+
|
|
3
|
+
Biblicus provides a topic modeling analysis backend that reads extracted text artifacts, optionally applies an LLM
|
|
4
|
+
extraction pass, applies lexical processing, runs BERTopic, and optionally applies an LLM fine-tuning pass for
|
|
5
|
+
labels. The output is structured JavaScript Object Notation with explicit per-topic evidence.
|
|
6
|
+
|
|
7
|
+
## What topic modeling does
|
|
8
|
+
|
|
9
|
+
Topic modeling groups documents into clusters based on shared terms or phrases, then surfaces representative
|
|
10
|
+
keywords for each cluster. It is a fast way to summarize large corpora, identify dominant themes, and spot outliers
|
|
11
|
+
without manual labeling. Topic modeling is not a classifier; it is an exploratory tool that produces evidence that can
|
|
12
|
+
be inspected or reviewed by humans.
|
|
13
|
+
|
|
14
|
+
## About BERTopic
|
|
15
|
+
|
|
16
|
+
BERTopic combines document embeddings with clustering and a class-based term frequency approach to extract topic
|
|
17
|
+
keywords. Biblicus supports BERTopic as an optional dependency and forwards its configuration parameters directly to
|
|
18
|
+
the BERTopic constructor. This allows you to tune clustering behavior while keeping the output in a consistent
|
|
19
|
+
schema.
|
|
20
|
+
|
|
21
|
+
## Pipeline stages
|
|
22
|
+
|
|
23
|
+
- Text collection reads extracted text artifacts from an extraction run.
|
|
24
|
+
- LLM extraction optionally transforms each document into one or more analysis documents.
|
|
25
|
+
- Lexical processing optionally normalizes text before BERTopic.
|
|
26
|
+
- BERTopic produces topic assignments and keyword weights.
|
|
27
|
+
- LLM fine-tuning optionally replaces topic labels based on sampled documents.
|
|
28
|
+
|
|
29
|
+
## Output structure
|
|
30
|
+
|
|
31
|
+
Topic modeling writes a single `output.json` file under the analysis run directory. The output contains:
|
|
32
|
+
|
|
33
|
+
- `run.run_id` and `run.stats` for reproducible tracking.
|
|
34
|
+
- `report.topics` with the modeled topics.
|
|
35
|
+
- `report.text_collection`, `report.llm_extraction`, `report.lexical_processing`, `report.bertopic_analysis`,
|
|
36
|
+
and `report.llm_fine_tuning` describing each pipeline stage.
|
|
37
|
+
|
|
38
|
+
Each topic record includes:
|
|
39
|
+
|
|
40
|
+
- `topic_id`: The BERTopic topic identifier. The outlier topic uses `-1`.
|
|
41
|
+
- `label`: The human-readable label.
|
|
42
|
+
- `label_source`: `bertopic` or `llm` depending on the stage that set the label.
|
|
43
|
+
- `keywords`: Keyword list with weights.
|
|
44
|
+
- `document_count`: Number of documents assigned to the topic.
|
|
45
|
+
- `document_ids`: Item identifiers for the assigned documents.
|
|
46
|
+
- `document_examples`: Sampled document text used for inspection.
|
|
47
|
+
|
|
48
|
+
Per-topic behavior is determined by the BERTopic assignments and the optional fine-tuning stage. The lexical
|
|
49
|
+
processing flags can substantially change tokenization and therefore the resulting topic labels. The outlier
|
|
50
|
+
`topic_id` `-1` indicates documents that BERTopic could not confidently assign to a cluster.
|
|
51
|
+
|
|
52
|
+
## Configuration reference
|
|
53
|
+
|
|
54
|
+
Topic modeling recipes use a strict schema. Unknown fields or type mismatches are errors.
|
|
55
|
+
|
|
56
|
+
### Text source
|
|
57
|
+
|
|
58
|
+
- `text_source.sample_size`: Limit the number of documents used for analysis.
|
|
59
|
+
- `text_source.min_text_characters`: Drop documents shorter than this count.
|
|
60
|
+
|
|
61
|
+
### LLM extraction
|
|
62
|
+
|
|
63
|
+
- `llm_extraction.enabled`: Enable the LLM extraction stage.
|
|
64
|
+
- `llm_extraction.method`: `single` or `itemize` to control whether an input maps to one or many documents.
|
|
65
|
+
- `llm_extraction.client`: LLM client configuration (requires `biblicus[openai]`).
|
|
66
|
+
- `llm_extraction.prompt_template`: Prompt template for the extraction stage.
|
|
67
|
+
- `llm_extraction.system_prompt`: Optional system prompt.
|
|
68
|
+
|
|
69
|
+
### Lexical processing
|
|
70
|
+
|
|
71
|
+
- `lexical_processing.enabled`: Enable normalization.
|
|
72
|
+
- `lexical_processing.lowercase`: Lowercase text before tokenization.
|
|
73
|
+
- `lexical_processing.strip_punctuation`: Remove punctuation before tokenization.
|
|
74
|
+
- `lexical_processing.collapse_whitespace`: Normalize repeated whitespace.
|
|
75
|
+
|
|
76
|
+
### BERTopic configuration
|
|
77
|
+
|
|
78
|
+
- `bertopic_analysis.parameters`: Mapping of BERTopic constructor parameters.
|
|
79
|
+
- `bertopic_analysis.vectorizer.ngram_range`: Inclusive n-gram range (for example `[1, 2]`).
|
|
80
|
+
- `bertopic_analysis.vectorizer.stop_words`: `english` or a list of stop words. Set to `null` to disable.
|
|
81
|
+
|
|
82
|
+
### LLM fine-tuning
|
|
83
|
+
|
|
84
|
+
- `llm_fine_tuning.enabled`: Enable LLM topic labeling.
|
|
85
|
+
- `llm_fine_tuning.client`: LLM client configuration.
|
|
86
|
+
- `llm_fine_tuning.prompt_template`: Prompt template containing `{keywords}` and `{documents}`.
|
|
87
|
+
- `llm_fine_tuning.system_prompt`: Optional system prompt.
|
|
88
|
+
- `llm_fine_tuning.max_keywords`: Maximum keywords included per prompt.
|
|
89
|
+
- `llm_fine_tuning.max_documents`: Maximum documents included per prompt.
|
|
90
|
+
|
|
91
|
+
## Vectorizer configuration
|
|
92
|
+
|
|
93
|
+
Biblicus forwards BERTopic configuration through `bertopic_analysis.parameters` and exposes vectorizer settings
|
|
94
|
+
through `bertopic_analysis.vectorizer`. To include bigrams, set `ngram_range` to `[1, 2]`. To remove stop words,
|
|
95
|
+
set `stop_words` to `english` or a list.
|
|
96
|
+
|
|
97
|
+
```yaml
|
|
98
|
+
bertopic_analysis:
|
|
99
|
+
parameters:
|
|
100
|
+
min_topic_size: 10
|
|
101
|
+
nr_topics: 12
|
|
102
|
+
vectorizer:
|
|
103
|
+
ngram_range: [1, 2]
|
|
104
|
+
stop_words: english
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
## Repeatable integration script
|
|
108
|
+
|
|
109
|
+
The integration script downloads AG News, runs extraction, and then runs topic modeling with the selected
|
|
110
|
+
parameters. It prints a summary with the analysis run identifier and the output path.
|
|
111
|
+
|
|
112
|
+
```
|
|
113
|
+
python3 scripts/topic_modeling_integration.py --corpus corpora/ag_news_demo --force
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
### Example: raise topic count
|
|
117
|
+
|
|
118
|
+
```
|
|
119
|
+
python3 scripts/topic_modeling_integration.py \
|
|
120
|
+
--corpus corpora/ag_news_demo \
|
|
121
|
+
--force \
|
|
122
|
+
--limit 10000 \
|
|
123
|
+
--vectorizer-ngram-min 1 \
|
|
124
|
+
--vectorizer-ngram-max 2 \
|
|
125
|
+
--bertopic-param nr_topics=8 \
|
|
126
|
+
--bertopic-param min_topic_size=2
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
### Example: disable lexical processing and restrict inputs
|
|
130
|
+
|
|
131
|
+
```
|
|
132
|
+
python3 scripts/topic_modeling_integration.py \
|
|
133
|
+
--corpus corpora/ag_news_demo \
|
|
134
|
+
--force \
|
|
135
|
+
--sample-size 200 \
|
|
136
|
+
--min-text-characters 200 \
|
|
137
|
+
--no-lexical-enabled
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
### Example: keep lexical processing but preserve punctuation
|
|
141
|
+
|
|
142
|
+
```
|
|
143
|
+
python3 scripts/topic_modeling_integration.py \
|
|
144
|
+
--corpus corpora/ag_news_demo \
|
|
145
|
+
--force \
|
|
146
|
+
--no-lexical-strip-punctuation
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
BERTopic parameters are passed directly to the constructor. Use repeated `--bertopic-param key=value` pairs for
|
|
150
|
+
multiple parameters. Values that look like JSON objects or arrays are parsed as JSON.
|
|
151
|
+
|
|
152
|
+
The integration script requires at least 16 documents to avoid BERTopic default UMAP errors. Increase `--limit` or
|
|
153
|
+
use a larger corpus if you receive a small-corpus error.
|
|
154
|
+
|
|
155
|
+
AG News downloads require the `datasets` dependency. Install with:
|
|
156
|
+
|
|
157
|
+
```
|
|
158
|
+
python3 -m pip install "biblicus[datasets,topic-modeling]"
|
|
159
|
+
```
|
|
@@ -4,8 +4,13 @@ Sphinx configuration for Biblicus documentation.
|
|
|
4
4
|
|
|
5
5
|
from __future__ import annotations
|
|
6
6
|
|
|
7
|
+
import os
|
|
8
|
+
import sys
|
|
7
9
|
from pathlib import Path
|
|
8
10
|
|
|
11
|
+
from pygments.lexers.special import TextLexer
|
|
12
|
+
from sphinx.highlighting import lexers
|
|
13
|
+
|
|
9
14
|
PROJECT_ROOT = Path(__file__).resolve().parent.parent
|
|
10
15
|
SOURCE_ROOT = PROJECT_ROOT / "src"
|
|
11
16
|
|
|
@@ -31,8 +36,6 @@ html_theme_options = {
|
|
|
31
36
|
}
|
|
32
37
|
|
|
33
38
|
# ReadTheDocs integration - canonical URL for SEO
|
|
34
|
-
import os
|
|
35
|
-
|
|
36
39
|
if os.environ.get("READTHEDOCS"):
|
|
37
40
|
rtd_version = os.environ.get("READTHEDOCS_VERSION", "latest")
|
|
38
41
|
rtd_project = os.environ.get("READTHEDOCS_PROJECT", "biblicus")
|
|
@@ -44,12 +47,6 @@ source_suffix = {
|
|
|
44
47
|
}
|
|
45
48
|
|
|
46
49
|
suppress_warnings = ["misc.highlighting_failure"]
|
|
47
|
-
|
|
48
|
-
import sys
|
|
49
|
-
|
|
50
50
|
sys.path.insert(0, str(SOURCE_ROOT))
|
|
51
51
|
|
|
52
|
-
from pygments.lexers.special import TextLexer
|
|
53
|
-
from sphinx.highlighting import lexers
|
|
54
|
-
|
|
55
52
|
lexers["mermaid"] = TextLexer()
|