biblicus 0.6.0__tar.gz → 0.8.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {biblicus-0.6.0 → biblicus-0.8.0}/PKG-INFO +120 -5
- biblicus-0.6.0/src/biblicus.egg-info/PKG-INFO → biblicus-0.8.0/README.md +104 -33
- {biblicus-0.6.0 → biblicus-0.8.0}/docs/BACKENDS.md +2 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/docs/DEMOS.md +41 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/docs/EXTRACTION.md +66 -74
- {biblicus-0.6.0 → biblicus-0.8.0}/docs/FEATURE_INDEX.md +2 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/docs/ROADMAP.md +15 -0
- biblicus-0.8.0/docs/STT.md +89 -0
- biblicus-0.8.0/docs/TOPIC_MODELING.md +82 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/docs/USER_CONFIGURATION.md +13 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/docs/api.rst +11 -0
- biblicus-0.8.0/docs/backends/index.md +242 -0
- biblicus-0.8.0/docs/backends/scan.md +327 -0
- biblicus-0.8.0/docs/backends/sqlite-full-text-search.md +487 -0
- biblicus-0.8.0/docs/extractors/index.md +135 -0
- biblicus-0.8.0/docs/extractors/ocr/index.md +141 -0
- biblicus-0.8.0/docs/extractors/ocr/paddleocr-vl.md +456 -0
- biblicus-0.8.0/docs/extractors/ocr/rapidocr.md +359 -0
- biblicus-0.8.0/docs/extractors/pipeline-utilities/index.md +234 -0
- biblicus-0.8.0/docs/extractors/pipeline-utilities/pipeline.md +542 -0
- biblicus-0.8.0/docs/extractors/pipeline-utilities/select-longest.md +404 -0
- biblicus-0.8.0/docs/extractors/pipeline-utilities/select-override.md +402 -0
- biblicus-0.8.0/docs/extractors/pipeline-utilities/select-smart-override.md +472 -0
- biblicus-0.8.0/docs/extractors/pipeline-utilities/select-text.md +339 -0
- biblicus-0.8.0/docs/extractors/speech-to-text/deepgram.md +482 -0
- biblicus-0.8.0/docs/extractors/speech-to-text/index.md +158 -0
- biblicus-0.8.0/docs/extractors/speech-to-text/openai.md +449 -0
- biblicus-0.8.0/docs/extractors/text-document/index.md +107 -0
- biblicus-0.8.0/docs/extractors/text-document/markitdown.md +394 -0
- biblicus-0.8.0/docs/extractors/text-document/metadata.md +335 -0
- biblicus-0.8.0/docs/extractors/text-document/pass-through.md +253 -0
- biblicus-0.8.0/docs/extractors/text-document/pdf.md +339 -0
- biblicus-0.8.0/docs/extractors/text-document/unstructured.md +405 -0
- biblicus-0.8.0/docs/extractors/vlm-document/docling-granite.md +311 -0
- biblicus-0.8.0/docs/extractors/vlm-document/docling-smol.md +269 -0
- biblicus-0.8.0/docs/extractors/vlm-document/index.md +229 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/docs/index.rst +3 -0
- biblicus-0.8.0/features/analysis_schema.feature +36 -0
- biblicus-0.8.0/features/cli_step_spec_parsing.feature +41 -0
- biblicus-0.8.0/features/docling_granite_extractor.feature +202 -0
- biblicus-0.8.0/features/docling_smol_extractor.feature +202 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/environment.py +139 -0
- biblicus-0.8.0/features/inference_backend.feature +117 -0
- biblicus-0.8.0/features/markitdown_extractor.feature +99 -0
- biblicus-0.8.0/features/paddleocr_vl_extractor.feature +299 -0
- biblicus-0.8.0/features/paddleocr_vl_parse_api_response.feature +18 -0
- biblicus-0.8.0/features/recipe_file_extraction.feature +35 -0
- biblicus-0.8.0/features/select_override.feature +126 -0
- biblicus-0.8.0/features/smart_override_selection.feature +406 -0
- biblicus-0.8.0/features/steps/analysis_steps.py +194 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/steps/cli_steps.py +11 -1
- biblicus-0.8.0/features/steps/deepgram_steps.py +222 -0
- biblicus-0.8.0/features/steps/docling_steps.py +360 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/steps/extraction_steps.py +132 -3
- biblicus-0.8.0/features/steps/inference_steps.py +63 -0
- biblicus-0.8.0/features/steps/markitdown_steps.py +173 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/steps/openai_steps.py +76 -0
- biblicus-0.8.0/features/steps/paddleocr_mock_steps.py +48 -0
- biblicus-0.8.0/features/steps/paddleocr_vl_steps.py +196 -0
- biblicus-0.8.0/features/steps/paddleocr_vl_unit_steps.py +108 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/steps/rapidocr_steps.py +2 -2
- biblicus-0.8.0/features/steps/requests_mock_steps.py +158 -0
- biblicus-0.8.0/features/steps/stt_deepgram_steps.py +93 -0
- biblicus-0.8.0/features/steps/topic_modeling_steps.py +231 -0
- biblicus-0.8.0/features/steps/user_config_steps.py +183 -0
- biblicus-0.8.0/features/stt_deepgram_extractor.feature +142 -0
- biblicus-0.8.0/features/topic_modeling.feature +908 -0
- biblicus-0.8.0/features/user_config.feature +85 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/pyproject.toml +23 -1
- {biblicus-0.6.0 → biblicus-0.8.0}/scripts/download_wikipedia.py +15 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/scripts/test.py +15 -4
- biblicus-0.8.0/scripts/topic_modeling_integration.py +257 -0
- biblicus-0.8.0/scripts/wikipedia_rag_demo.py +212 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/src/biblicus/__init__.py +1 -1
- biblicus-0.8.0/src/biblicus/analysis/__init__.py +40 -0
- biblicus-0.8.0/src/biblicus/analysis/base.py +49 -0
- biblicus-0.8.0/src/biblicus/analysis/llm.py +106 -0
- biblicus-0.8.0/src/biblicus/analysis/models.py +512 -0
- biblicus-0.8.0/src/biblicus/analysis/schema.py +18 -0
- biblicus-0.8.0/src/biblicus/analysis/topic_modeling.py +561 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/src/biblicus/cli.py +160 -11
- {biblicus-0.6.0 → biblicus-0.8.0}/src/biblicus/constants.py +2 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/src/biblicus/corpus.py +42 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/src/biblicus/extraction.py +5 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/src/biblicus/extractors/__init__.py +14 -0
- biblicus-0.8.0/src/biblicus/extractors/deepgram_stt.py +166 -0
- biblicus-0.8.0/src/biblicus/extractors/docling_granite_text.py +188 -0
- biblicus-0.8.0/src/biblicus/extractors/docling_smol_text.py +188 -0
- biblicus-0.8.0/src/biblicus/extractors/markitdown_text.py +128 -0
- biblicus-0.8.0/src/biblicus/extractors/paddleocr_vl_text.py +305 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/src/biblicus/extractors/rapidocr_text.py +8 -1
- biblicus-0.8.0/src/biblicus/extractors/select_override.py +121 -0
- biblicus-0.8.0/src/biblicus/extractors/select_smart_override.py +187 -0
- biblicus-0.8.0/src/biblicus/inference.py +104 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/src/biblicus/models.py +6 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/src/biblicus/user_config.py +76 -0
- biblicus-0.6.0/README.md → biblicus-0.8.0/src/biblicus.egg-info/PKG-INFO +148 -4
- {biblicus-0.6.0 → biblicus-0.8.0}/src/biblicus.egg-info/SOURCES.txt +67 -0
- biblicus-0.8.0/src/biblicus.egg-info/requires.txt +46 -0
- biblicus-0.6.0/features/steps/user_config_steps.py +0 -47
- biblicus-0.6.0/features/user_config.feature +0 -39
- biblicus-0.6.0/src/biblicus.egg-info/requires.txt +0 -23
- {biblicus-0.6.0 → biblicus-0.8.0}/LICENSE +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/MANIFEST.in +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/THIRD_PARTY_NOTICES.md +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/datasets/wikipedia_mini.json +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/docs/ARCHITECTURE.md +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/docs/CONTEXT_PACK.md +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/docs/CORPUS.md +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/docs/CORPUS_DESIGN.md +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/docs/KNOWLEDGE_BASE.md +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/docs/TESTING.md +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/docs/conf.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/backend_validation.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/biblicus_corpus.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/cli_entrypoint.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/cli_parsing.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/content_sniffing.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/context_pack.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/context_pack_cli.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/corpus_edge_cases.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/corpus_identity.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/corpus_purge.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/crawl.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/error_cases.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/evaluation.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/evidence_processing.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/extraction_error_handling.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/extraction_run_lifecycle.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/extraction_selection.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/extraction_selection_longest.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/extractor_pipeline.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/extractor_validation.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/frontmatter.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/hook_config_validation.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/hook_error_handling.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/import_tree.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/ingest_sources.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/integration_audio_samples.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/integration_image_samples.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/integration_mixed_corpus.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/integration_mixed_extraction.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/integration_ocr_image_extraction.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/integration_pdf_retrieval.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/integration_pdf_samples.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/integration_unstructured_extraction.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/integration_wikipedia.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/knowledge_base.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/lifecycle_hooks.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/model_validation.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/ocr_extractor.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/pdf_text_extraction.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/python_api.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/python_hook_logging.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/query_processing.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/retrieval_budget.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/retrieval_scan.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/retrieval_sqlite_full_text_search.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/retrieval_uses_extraction_run.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/retrieval_utilities.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/source_loading.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/steps/backend_steps.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/steps/cli_parsing_steps.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/steps/context_pack_steps.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/steps/crawl_steps.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/steps/evidence_processing_steps.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/steps/extraction_run_lifecycle_steps.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/steps/extractor_steps.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/steps/frontmatter_steps.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/steps/knowledge_base_steps.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/steps/model_steps.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/steps/pdf_steps.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/steps/python_api_steps.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/steps/retrieval_steps.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/steps/stt_steps.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/steps/unstructured_steps.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/streaming_ingest.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/stt_extractor.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/text_extraction_runs.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/token_budget.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/features/unstructured_extractor.feature +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/scripts/download_audio_samples.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/scripts/download_image_samples.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/scripts/download_mixed_samples.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/scripts/download_pdf_samples.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/scripts/readme_end_to_end_demo.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/setup.cfg +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/src/biblicus/__main__.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/src/biblicus/_vendor/dotyaml/__init__.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/src/biblicus/_vendor/dotyaml/interpolation.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/src/biblicus/_vendor/dotyaml/loader.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/src/biblicus/_vendor/dotyaml/transformer.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/src/biblicus/backends/__init__.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/src/biblicus/backends/base.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/src/biblicus/backends/scan.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/src/biblicus/backends/sqlite_full_text_search.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/src/biblicus/context.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/src/biblicus/crawl.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/src/biblicus/errors.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/src/biblicus/evaluation.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/src/biblicus/evidence_processing.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/src/biblicus/extractors/base.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/src/biblicus/extractors/metadata_text.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/src/biblicus/extractors/openai_stt.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/src/biblicus/extractors/pass_through_text.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/src/biblicus/extractors/pdf_text.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/src/biblicus/extractors/pipeline.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/src/biblicus/extractors/select_longest_text.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/src/biblicus/extractors/select_text.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/src/biblicus/extractors/unstructured_text.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/src/biblicus/frontmatter.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/src/biblicus/hook_logging.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/src/biblicus/hook_manager.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/src/biblicus/hooks.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/src/biblicus/ignore.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/src/biblicus/knowledge_base.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/src/biblicus/retrieval.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/src/biblicus/sources.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/src/biblicus/time.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/src/biblicus/uris.py +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/src/biblicus.egg-info/dependency_links.txt +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/src/biblicus.egg-info/entry_points.txt +0 -0
- {biblicus-0.6.0 → biblicus-0.8.0}/src/biblicus.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: biblicus
|
|
3
|
-
Version: 0.6.0
|
|
3
|
+
Version: 0.8.0
|
|
4
4
|
Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
|
|
5
5
|
License: MIT
|
|
6
6
|
Requires-Python: >=3.9
|
|
@@ -25,6 +25,21 @@ Requires-Dist: unstructured>=0.12.0; extra == "unstructured"
|
|
|
25
25
|
Requires-Dist: python-docx>=1.1.0; extra == "unstructured"
|
|
26
26
|
Provides-Extra: ocr
|
|
27
27
|
Requires-Dist: rapidocr-onnxruntime>=1.3.0; extra == "ocr"
|
|
28
|
+
Provides-Extra: paddleocr
|
|
29
|
+
Requires-Dist: paddleocr>=2.7.0; extra == "paddleocr"
|
|
30
|
+
Requires-Dist: paddlepaddle>=2.5.0; extra == "paddleocr"
|
|
31
|
+
Requires-Dist: huggingface_hub>=0.20.0; extra == "paddleocr"
|
|
32
|
+
Requires-Dist: requests>=2.28.0; extra == "paddleocr"
|
|
33
|
+
Provides-Extra: markitdown
|
|
34
|
+
Requires-Dist: markitdown[all]>=0.1.0; python_version >= "3.10" and extra == "markitdown"
|
|
35
|
+
Provides-Extra: deepgram
|
|
36
|
+
Requires-Dist: deepgram-sdk>=3.0; extra == "deepgram"
|
|
37
|
+
Provides-Extra: docling
|
|
38
|
+
Requires-Dist: docling[vlm]>=2.0.0; extra == "docling"
|
|
39
|
+
Provides-Extra: docling-mlx
|
|
40
|
+
Requires-Dist: docling[mlx-vlm]>=2.0.0; extra == "docling-mlx"
|
|
41
|
+
Provides-Extra: topic-modeling
|
|
42
|
+
Requires-Dist: bertopic>=0.15.0; extra == "topic-modeling"
|
|
28
43
|
Dynamic: license-file
|
|
29
44
|
|
|
30
45
|
# Biblicus
|
|
@@ -67,7 +82,7 @@ If you want to run a real, executable version of this story, use `scripts/readme
|
|
|
67
82
|
This simplified sequence diagram shows the same idea at a high level.
|
|
68
83
|
|
|
69
84
|
```mermaid
|
|
70
|
-
%%{init: {"theme": "base", "themeVariables": {"primaryColor": "#f3e5f5", "primaryTextColor": "#111111", "primaryBorderColor": "#8e24aa", "lineColor": "#90a4ae", "secondaryColor": "#eceff1", "tertiaryColor": "#ffffff", "noteBkgColor": "#ffffff", "noteTextColor": "#111111", "actorBkg": "#f3e5f5", "actorBorder": "#8e24aa", "actorTextColor": "#111111"}}}%%
|
|
85
|
+
%%{init: {"theme": "base", "themeVariables": {"background": "#ffffff", "primaryColor": "#f3e5f5", "primaryTextColor": "#111111", "primaryBorderColor": "#8e24aa", "lineColor": "#90a4ae", "secondaryColor": "#eceff1", "tertiaryColor": "#ffffff", "noteBkgColor": "#ffffff", "noteTextColor": "#111111", "actorBkg": "#f3e5f5", "actorBorder": "#8e24aa", "actorTextColor": "#111111"}}}%%
|
|
71
86
|
sequenceDiagram
|
|
72
87
|
participant App as Your assistant code
|
|
73
88
|
participant KB as Knowledge base
|
|
@@ -106,7 +121,7 @@ In a coding assistant, retrieval is often triggered by what the user is doing ri
|
|
|
106
121
|
This diagram shows two sequential Biblicus calls. They are shown separately to make the boundaries explicit: retrieval returns evidence, and context pack building consumes evidence.
|
|
107
122
|
|
|
108
123
|
```mermaid
|
|
109
|
-
%%{init: {"theme": "base", "themeVariables": {"primaryColor": "#f3e5f5", "primaryTextColor": "#111111", "primaryBorderColor": "#8e24aa", "lineColor": "#90a4ae", "secondaryColor": "#eceff1", "tertiaryColor": "#ffffff", "noteBkgColor": "#ffffff", "noteTextColor": "#111111", "actorBkg": "#f3e5f5", "actorBorder": "#8e24aa", "actorTextColor": "#111111"}}}%%
|
|
124
|
+
%%{init: {"theme": "base", "themeVariables": {"background": "#ffffff", "primaryColor": "#f3e5f5", "primaryTextColor": "#111111", "primaryBorderColor": "#8e24aa", "lineColor": "#90a4ae", "secondaryColor": "#eceff1", "tertiaryColor": "#ffffff", "noteBkgColor": "#ffffff", "noteTextColor": "#111111", "actorBkg": "#f3e5f5", "actorBorder": "#8e24aa", "actorTextColor": "#111111"}}}%%
|
|
110
125
|
sequenceDiagram
|
|
111
126
|
participant User
|
|
112
127
|
participant App as Your assistant code
|
|
@@ -158,8 +173,14 @@ python3 -m pip install biblicus
|
|
|
158
173
|
Some extractors are optional so the base install stays small.
|
|
159
174
|
|
|
160
175
|
- Optical character recognition for images: `python3 -m pip install "biblicus[ocr]"`
|
|
161
|
-
-
|
|
176
|
+
- Advanced optical character recognition with PaddleOCR: `python3 -m pip install "biblicus[paddleocr]"`
|
|
177
|
+
- Document understanding with Docling VLM: `python3 -m pip install "biblicus[docling]"`
|
|
178
|
+
- Document understanding with Docling VLM and MLX acceleration: `python3 -m pip install "biblicus[docling-mlx]"`
|
|
179
|
+
- Speech to text transcription with OpenAI: `python3 -m pip install "biblicus[openai]"` (requires an OpenAI API key in `~/.biblicus/config.yml` or `./.biblicus/config.yml`)
|
|
180
|
+
- Speech to text transcription with Deepgram: `python3 -m pip install "biblicus[deepgram]"` (requires a Deepgram API key in `~/.biblicus/config.yml` or `./.biblicus/config.yml`)
|
|
162
181
|
- Broad document parsing fallback: `python3 -m pip install "biblicus[unstructured]"`
|
|
182
|
+
- MarkItDown document conversion (requires Python 3.10 or higher): `python3 -m pip install "biblicus[markitdown]"`
|
|
183
|
+
- Topic modeling analysis with BERTopic: `python3 -m pip install "biblicus[topic-modeling]"`
|
|
163
184
|
|
|
164
185
|
## Quick start
|
|
165
186
|
|
|
@@ -417,6 +438,7 @@ The documents below follow the pipeline from raw items to model context:
|
|
|
417
438
|
|
|
418
439
|
- [Corpus][corpus]
|
|
419
440
|
- [Text extraction][text-extraction]
|
|
441
|
+
- [Speech to text][speech-to-text]
|
|
420
442
|
- [Knowledge base][knowledge-base]
|
|
421
443
|
- [Backends][backends]
|
|
422
444
|
- [Context packs][context-packs]
|
|
@@ -465,7 +487,97 @@ corpus/
|
|
|
465
487
|
Two backends are included.
|
|
466
488
|
|
|
467
489
|
- `scan` is a minimal baseline that scans raw items directly.
|
|
468
|
-
- `sqlite-full-text-search` is a practical baseline that builds a full text search index in
|
|
490
|
+
- `sqlite-full-text-search` is a practical baseline that builds a full text search index in SQLite.
|
|
491
|
+
|
|
492
|
+
For detailed documentation including configuration options, performance characteristics, and usage examples, see the [Backend Reference][backend-reference].
|
|
493
|
+
|
|
494
|
+
## Extraction backends
|
|
495
|
+
|
|
496
|
+
These extractors are built in. Optional ones require extra dependencies. See [text extraction documentation][text-extraction] for details.
|
|
497
|
+
|
|
498
|
+
### Text and document extraction
|
|
499
|
+
|
|
500
|
+
- [`pass-through-text`](docs/extractors/text-document/pass-through.md) reads text items and strips Markdown front matter.
|
|
501
|
+
- [`metadata-text`](docs/extractors/text-document/metadata.md) turns catalog metadata into a small text artifact.
|
|
502
|
+
- [`pdf-text`](docs/extractors/text-document/pdf.md) extracts text from Portable Document Format items with `pypdf`.
|
|
503
|
+
- [`unstructured`](docs/extractors/text-document/unstructured.md) provides broad document parsing (optional).
|
|
504
|
+
- [`markitdown`](docs/extractors/text-document/markitdown.md) converts many formats into Markdown-like text (optional).
|
|
505
|
+
|
|
506
|
+
### Optical character recognition
|
|
507
|
+
|
|
508
|
+
- [`ocr-rapidocr`](docs/extractors/ocr/rapidocr.md) does optical character recognition on images (optional).
|
|
509
|
+
- [`ocr-paddleocr-vl`](docs/extractors/ocr/paddleocr-vl.md) does advanced optical character recognition with PaddleOCR vision-language model (optional).
|
|
510
|
+
|
|
511
|
+
### Vision-language models
|
|
512
|
+
|
|
513
|
+
- [`docling-smol`](docs/extractors/vlm-document/docling-smol.md) uses the SmolDocling-256M vision-language model for fast document understanding (optional).
|
|
514
|
+
- [`docling-granite`](docs/extractors/vlm-document/docling-granite.md) uses the Granite Docling-258M vision-language model for high-accuracy extraction (optional).
|
|
515
|
+
|
|
516
|
+
### Speech to text
|
|
517
|
+
|
|
518
|
+
- [`stt-openai`](docs/extractors/speech-to-text/openai.md) performs speech to text on audio using OpenAI (optional).
|
|
519
|
+
- [`stt-deepgram`](docs/extractors/speech-to-text/deepgram.md) performs speech to text on audio using Deepgram (optional).
|
|
520
|
+
|
|
521
|
+
### Pipeline utilities
|
|
522
|
+
|
|
523
|
+
- [`select-text`](docs/extractors/pipeline-utilities/select-text.md) chooses one prior extraction result in a pipeline.
|
|
524
|
+
- [`select-longest-text`](docs/extractors/pipeline-utilities/select-longest.md) chooses the longest prior extraction result.
|
|
525
|
+
- [`select-override`](docs/extractors/pipeline-utilities/select-override.md) chooses the last extraction result for matching media types in a pipeline.
|
|
526
|
+
- [`select-smart-override`](docs/extractors/pipeline-utilities/select-smart-override.md) intelligently chooses between extraction results based on confidence and content quality.
|
|
527
|
+
|
|
528
|
+
For detailed documentation on all extractors, see the [Extractor Reference][extractor-reference].
|
|
529
|
+
|
|
530
|
+
## Topic modeling analysis
|
|
531
|
+
|
|
532
|
+
Biblicus can run analysis pipelines on extracted text without changing the raw corpus. Topic modeling is the first
|
|
533
|
+
analysis backend. It reads an extraction run, optionally applies an LLM-driven extraction pass, applies lexical
|
|
534
|
+
processing, runs BERTopic, and optionally applies an LLM fine-tuning pass to label topics. The output is structured
|
|
535
|
+
JavaScript Object Notation.
|
|
536
|
+
|
|
537
|
+
Run a topic analysis using a recipe file:
|
|
538
|
+
|
|
539
|
+
```
|
|
540
|
+
biblicus analyze topics --corpus corpora/example --recipe recipes/topic-modeling.yml --extraction-run pipeline:<run_id>
|
|
541
|
+
```
|
|
542
|
+
|
|
543
|
+
If `--extraction-run` is omitted, Biblicus uses the most recent extraction run and emits a warning about
|
|
544
|
+
reproducibility. The analysis output is stored under:
|
|
545
|
+
|
|
546
|
+
```
|
|
547
|
+
.biblicus/runs/analysis/topic-modeling/<run_id>/output.json
|
|
548
|
+
```
|
|
549
|
+
|
|
550
|
+
Minimal recipe example:
|
|
551
|
+
|
|
552
|
+
```yaml
|
|
553
|
+
schema_version: 1
|
|
554
|
+
text_source:
|
|
555
|
+
sample_size: 200
|
|
556
|
+
llm_extraction:
|
|
557
|
+
enabled: false
|
|
558
|
+
lexical_processing:
|
|
559
|
+
enabled: true
|
|
560
|
+
lowercase: true
|
|
561
|
+
strip_punctuation: false
|
|
562
|
+
collapse_whitespace: true
|
|
563
|
+
bertopic_analysis:
|
|
564
|
+
parameters:
|
|
565
|
+
min_topic_size: 8
|
|
566
|
+
nr_topics: 10
|
|
567
|
+
llm_fine_tuning:
|
|
568
|
+
enabled: false
|
|
569
|
+
```
|
|
570
|
+
|
|
571
|
+
LLM extraction and fine-tuning require `biblicus[openai]` and a configured OpenAI API key.
|
|
572
|
+
Recipe files are validated strictly against the topic modeling schema, so type mismatches or unknown fields are errors.
|
|
573
|
+
|
|
574
|
+
For a repeatable, real-world integration run that downloads a Wikipedia corpus and executes topic modeling, use:
|
|
575
|
+
|
|
576
|
+
```
|
|
577
|
+
python3 scripts/topic_modeling_integration.py --corpus corpora/wiki_demo --force
|
|
578
|
+
```
|
|
579
|
+
|
|
580
|
+
See `docs/TOPIC_MODELING.md` for parameter examples and per-topic output behavior.
|
|
469
581
|
|
|
470
582
|
## Integration corpus and evaluation dataset
|
|
471
583
|
|
|
@@ -522,6 +634,9 @@ License terms are in `LICENSE`.
|
|
|
522
634
|
[corpus]: docs/CORPUS.md
|
|
523
635
|
[knowledge-base]: docs/KNOWLEDGE_BASE.md
|
|
524
636
|
[text-extraction]: docs/EXTRACTION.md
|
|
637
|
+
[extractor-reference]: docs/extractors/index.md
|
|
638
|
+
[backend-reference]: docs/backends/index.md
|
|
639
|
+
[speech-to-text]: docs/STT.md
|
|
525
640
|
[user-configuration]: docs/USER_CONFIGURATION.md
|
|
526
641
|
[backends]: docs/BACKENDS.md
|
|
527
642
|
[context-packs]: docs/CONTEXT_PACK.md
|
|
@@ -1,32 +1,3 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: biblicus
|
|
3
|
-
Version: 0.6.0
|
|
4
|
-
Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
|
|
5
|
-
License: MIT
|
|
6
|
-
Requires-Python: >=3.9
|
|
7
|
-
Description-Content-Type: text/markdown
|
|
8
|
-
License-File: LICENSE
|
|
9
|
-
Requires-Dist: pydantic>=2.0
|
|
10
|
-
Requires-Dist: PyYAML>=6.0
|
|
11
|
-
Requires-Dist: pypdf>=4.0
|
|
12
|
-
Provides-Extra: dev
|
|
13
|
-
Requires-Dist: behave>=1.2.6; extra == "dev"
|
|
14
|
-
Requires-Dist: coverage[toml]>=7.0; extra == "dev"
|
|
15
|
-
Requires-Dist: sphinx>=7.0; extra == "dev"
|
|
16
|
-
Requires-Dist: myst-parser>=2.0; extra == "dev"
|
|
17
|
-
Requires-Dist: sphinx_rtd_theme>=2.0; extra == "dev"
|
|
18
|
-
Requires-Dist: ruff>=0.4.0; extra == "dev"
|
|
19
|
-
Requires-Dist: black>=24.0; extra == "dev"
|
|
20
|
-
Requires-Dist: python-semantic-release>=9.0.0; extra == "dev"
|
|
21
|
-
Provides-Extra: openai
|
|
22
|
-
Requires-Dist: openai>=1.0; extra == "openai"
|
|
23
|
-
Provides-Extra: unstructured
|
|
24
|
-
Requires-Dist: unstructured>=0.12.0; extra == "unstructured"
|
|
25
|
-
Requires-Dist: python-docx>=1.1.0; extra == "unstructured"
|
|
26
|
-
Provides-Extra: ocr
|
|
27
|
-
Requires-Dist: rapidocr-onnxruntime>=1.3.0; extra == "ocr"
|
|
28
|
-
Dynamic: license-file
|
|
29
|
-
|
|
30
1
|
# Biblicus
|
|
31
2
|
|
|
32
3
|
![Continuous integration][continuous-integration-badge]
|
|
@@ -67,7 +38,7 @@ If you want to run a real, executable version of this story, use `scripts/readme
|
|
|
67
38
|
This simplified sequence diagram shows the same idea at a high level.
|
|
68
39
|
|
|
69
40
|
```mermaid
|
|
70
|
-
%%{init: {"theme": "base", "themeVariables": {"primaryColor": "#f3e5f5", "primaryTextColor": "#111111", "primaryBorderColor": "#8e24aa", "lineColor": "#90a4ae", "secondaryColor": "#eceff1", "tertiaryColor": "#ffffff", "noteBkgColor": "#ffffff", "noteTextColor": "#111111", "actorBkg": "#f3e5f5", "actorBorder": "#8e24aa", "actorTextColor": "#111111"}}}%%
|
|
41
|
+
%%{init: {"theme": "base", "themeVariables": {"background": "#ffffff", "primaryColor": "#f3e5f5", "primaryTextColor": "#111111", "primaryBorderColor": "#8e24aa", "lineColor": "#90a4ae", "secondaryColor": "#eceff1", "tertiaryColor": "#ffffff", "noteBkgColor": "#ffffff", "noteTextColor": "#111111", "actorBkg": "#f3e5f5", "actorBorder": "#8e24aa", "actorTextColor": "#111111"}}}%%
|
|
71
42
|
sequenceDiagram
|
|
72
43
|
participant App as Your assistant code
|
|
73
44
|
participant KB as Knowledge base
|
|
@@ -106,7 +77,7 @@ In a coding assistant, retrieval is often triggered by what the user is doing ri
|
|
|
106
77
|
This diagram shows two sequential Biblicus calls. They are shown separately to make the boundaries explicit: retrieval returns evidence, and context pack building consumes evidence.
|
|
107
78
|
|
|
108
79
|
```mermaid
|
|
109
|
-
%%{init: {"theme": "base", "themeVariables": {"primaryColor": "#f3e5f5", "primaryTextColor": "#111111", "primaryBorderColor": "#8e24aa", "lineColor": "#90a4ae", "secondaryColor": "#eceff1", "tertiaryColor": "#ffffff", "noteBkgColor": "#ffffff", "noteTextColor": "#111111", "actorBkg": "#f3e5f5", "actorBorder": "#8e24aa", "actorTextColor": "#111111"}}}%%
|
|
80
|
+
%%{init: {"theme": "base", "themeVariables": {"background": "#ffffff", "primaryColor": "#f3e5f5", "primaryTextColor": "#111111", "primaryBorderColor": "#8e24aa", "lineColor": "#90a4ae", "secondaryColor": "#eceff1", "tertiaryColor": "#ffffff", "noteBkgColor": "#ffffff", "noteTextColor": "#111111", "actorBkg": "#f3e5f5", "actorBorder": "#8e24aa", "actorTextColor": "#111111"}}}%%
|
|
110
81
|
sequenceDiagram
|
|
111
82
|
participant User
|
|
112
83
|
participant App as Your assistant code
|
|
@@ -158,8 +129,14 @@ python3 -m pip install biblicus
|
|
|
158
129
|
Some extractors are optional so the base install stays small.
|
|
159
130
|
|
|
160
131
|
- Optical character recognition for images: `python3 -m pip install "biblicus[ocr]"`
|
|
161
|
-
-
|
|
132
|
+
- Advanced optical character recognition with PaddleOCR: `python3 -m pip install "biblicus[paddleocr]"`
|
|
133
|
+
- Document understanding with Docling VLM: `python3 -m pip install "biblicus[docling]"`
|
|
134
|
+
- Document understanding with Docling VLM and MLX acceleration: `python3 -m pip install "biblicus[docling-mlx]"`
|
|
135
|
+
- Speech to text transcription with OpenAI: `python3 -m pip install "biblicus[openai]"` (requires an OpenAI API key in `~/.biblicus/config.yml` or `./.biblicus/config.yml`)
|
|
136
|
+
- Speech to text transcription with Deepgram: `python3 -m pip install "biblicus[deepgram]"` (requires a Deepgram API key in `~/.biblicus/config.yml` or `./.biblicus/config.yml`)
|
|
162
137
|
- Broad document parsing fallback: `python3 -m pip install "biblicus[unstructured]"`
|
|
138
|
+
- MarkItDown document conversion (requires Python 3.10 or higher): `python3 -m pip install "biblicus[markitdown]"`
|
|
139
|
+
- Topic modeling analysis with BERTopic: `python3 -m pip install "biblicus[topic-modeling]"`
|
|
163
140
|
|
|
164
141
|
## Quick start
|
|
165
142
|
|
|
@@ -417,6 +394,7 @@ The documents below follow the pipeline from raw items to model context:
|
|
|
417
394
|
|
|
418
395
|
- [Corpus][corpus]
|
|
419
396
|
- [Text extraction][text-extraction]
|
|
397
|
+
- [Speech to text][speech-to-text]
|
|
420
398
|
- [Knowledge base][knowledge-base]
|
|
421
399
|
- [Backends][backends]
|
|
422
400
|
- [Context packs][context-packs]
|
|
@@ -465,7 +443,97 @@ corpus/
|
|
|
465
443
|
Two backends are included.
|
|
466
444
|
|
|
467
445
|
- `scan` is a minimal baseline that scans raw items directly.
|
|
468
|
-
- `sqlite-full-text-search` is a practical baseline that builds a full text search index in
|
|
446
|
+
- `sqlite-full-text-search` is a practical baseline that builds a full text search index in SQLite.
|
|
447
|
+
|
|
448
|
+
For detailed documentation including configuration options, performance characteristics, and usage examples, see the [Backend Reference][backend-reference].
|
|
449
|
+
|
|
450
|
+
## Extraction backends
|
|
451
|
+
|
|
452
|
+
These extractors are built in. Optional ones require extra dependencies. See [text extraction documentation][text-extraction] for details.
|
|
453
|
+
|
|
454
|
+
### Text and document extraction
|
|
455
|
+
|
|
456
|
+
- [`pass-through-text`](docs/extractors/text-document/pass-through.md) reads text items and strips Markdown front matter.
|
|
457
|
+
- [`metadata-text`](docs/extractors/text-document/metadata.md) turns catalog metadata into a small text artifact.
|
|
458
|
+
- [`pdf-text`](docs/extractors/text-document/pdf.md) extracts text from Portable Document Format items with `pypdf`.
|
|
459
|
+
- [`unstructured`](docs/extractors/text-document/unstructured.md) provides broad document parsing (optional).
|
|
460
|
+
- [`markitdown`](docs/extractors/text-document/markitdown.md) converts many formats into Markdown-like text (optional).
|
|
461
|
+
|
|
462
|
+
### Optical character recognition
|
|
463
|
+
|
|
464
|
+
- [`ocr-rapidocr`](docs/extractors/ocr/rapidocr.md) does optical character recognition on images (optional).
|
|
465
|
+
- [`ocr-paddleocr-vl`](docs/extractors/ocr/paddleocr-vl.md) does advanced optical character recognition with the PaddleOCR vision-language model (optional).
|
|
466
|
+
|
|
467
|
+
### Vision-language models
|
|
468
|
+
|
|
469
|
+
- [`docling-smol`](docs/extractors/vlm-document/docling-smol.md) uses the SmolDocling-256M vision-language model for fast document understanding (optional).
|
|
470
|
+
- [`docling-granite`](docs/extractors/vlm-document/docling-granite.md) uses the Granite Docling-258M vision-language model for high-accuracy extraction (optional).
|
|
471
|
+
|
|
472
|
+
### Speech to text
|
|
473
|
+
|
|
474
|
+
- [`stt-openai`](docs/extractors/speech-to-text/openai.md) performs speech to text on audio using OpenAI (optional).
|
|
475
|
+
- [`stt-deepgram`](docs/extractors/speech-to-text/deepgram.md) performs speech to text on audio using Deepgram (optional).
|
|
476
|
+
|
|
477
|
+
### Pipeline utilities
|
|
478
|
+
|
|
479
|
+
- [`select-text`](docs/extractors/pipeline-utilities/select-text.md) chooses one prior extraction result in a pipeline.
|
|
480
|
+
- [`select-longest-text`](docs/extractors/pipeline-utilities/select-longest.md) chooses the longest prior extraction result.
|
|
481
|
+
- [`select-override`](docs/extractors/pipeline-utilities/select-override.md) chooses the last extraction result for matching media types in a pipeline.
|
|
482
|
+
- [`select-smart-override`](docs/extractors/pipeline-utilities/select-smart-override.md) intelligently chooses between extraction results based on confidence and content quality.
|
|
483
|
+
|
|
484
|
+
For detailed documentation on all extractors, see the [Extractor Reference][extractor-reference].
|
|
485
|
+
|
|
486
|
+
## Topic modeling analysis
|
|
487
|
+
|
|
488
|
+
Biblicus can run analysis pipelines on extracted text without changing the raw corpus. Topic modeling is the first
|
|
489
|
+
analysis backend. It reads an extraction run, optionally applies an LLM-driven extraction pass, applies lexical
|
|
490
|
+
processing, runs BERTopic, and optionally applies an LLM fine-tuning pass to label topics. The output is structured
|
|
491
|
+
JavaScript Object Notation.
|
|
492
|
+
|
|
493
|
+
Run a topic analysis using a recipe file:
|
|
494
|
+
|
|
495
|
+
```
|
|
496
|
+
biblicus analyze topics --corpus corpora/example --recipe recipes/topic-modeling.yml --extraction-run pipeline:<run_id>
|
|
497
|
+
```
|
|
498
|
+
|
|
499
|
+
If `--extraction-run` is omitted, Biblicus uses the most recent extraction run and emits a warning about
|
|
500
|
+
reproducibility. The analysis output is stored under:
|
|
501
|
+
|
|
502
|
+
```
|
|
503
|
+
.biblicus/runs/analysis/topic-modeling/<run_id>/output.json
|
|
504
|
+
```
|
|
505
|
+
|
|
506
|
+
Minimal recipe example:
|
|
507
|
+
|
|
508
|
+
```yaml
|
|
509
|
+
schema_version: 1
|
|
510
|
+
text_source:
|
|
511
|
+
sample_size: 200
|
|
512
|
+
llm_extraction:
|
|
513
|
+
enabled: false
|
|
514
|
+
lexical_processing:
|
|
515
|
+
enabled: true
|
|
516
|
+
lowercase: true
|
|
517
|
+
strip_punctuation: false
|
|
518
|
+
collapse_whitespace: true
|
|
519
|
+
bertopic_analysis:
|
|
520
|
+
parameters:
|
|
521
|
+
min_topic_size: 8
|
|
522
|
+
nr_topics: 10
|
|
523
|
+
llm_fine_tuning:
|
|
524
|
+
enabled: false
|
|
525
|
+
```
|
|
526
|
+
|
|
527
|
+
LLM extraction and fine-tuning require `biblicus[openai]` and a configured OpenAI API key.
|
|
528
|
+
Recipe files are validated strictly against the topic modeling schema, so type mismatches or unknown fields are errors.
|
|
529
|
+
|
|
530
|
+
For a repeatable, real-world integration run that downloads a Wikipedia corpus and executes topic modeling, use:
|
|
531
|
+
|
|
532
|
+
```
|
|
533
|
+
python3 scripts/topic_modeling_integration.py --corpus corpora/wiki_demo --force
|
|
534
|
+
```
|
|
535
|
+
|
|
536
|
+
See `docs/TOPIC_MODELING.md` for parameter examples and per-topic output behavior.
|
|
469
537
|
|
|
470
538
|
## Integration corpus and evaluation dataset
|
|
471
539
|
|
|
@@ -522,6 +590,9 @@ License terms are in `LICENSE`.
|
|
|
522
590
|
[corpus]: docs/CORPUS.md
|
|
523
591
|
[knowledge-base]: docs/KNOWLEDGE_BASE.md
|
|
524
592
|
[text-extraction]: docs/EXTRACTION.md
|
|
593
|
+
[extractor-reference]: docs/extractors/index.md
|
|
594
|
+
[backend-reference]: docs/backends/index.md
|
|
595
|
+
[speech-to-text]: docs/STT.md
|
|
525
596
|
[user-configuration]: docs/USER_CONFIGURATION.md
|
|
526
597
|
[backends]: docs/BACKENDS.md
|
|
527
598
|
[context-packs]: docs/CONTEXT_PACK.md
|
|
@@ -3,6 +3,8 @@
|
|
|
3
3
|
Backends are pluggable engines that implement a small, stable interface.
|
|
4
4
|
The goal is to make new retrieval ideas easy to test without reshaping the corpus.
|
|
5
5
|
|
|
6
|
+
For user documentation on available backends, see the [Backend Reference](backends/index.md).
|
|
7
|
+
|
|
6
8
|
## Backend contract
|
|
7
9
|
|
|
8
10
|
Backends implement two operations:
|
|
@@ -185,6 +185,28 @@ python3 -m biblicus extract build --corpus corpora/demo --step pass-through-text
|
|
|
185
185
|
|
|
186
186
|
The output includes a `run_id` you can reuse when building a retrieval backend.
|
|
187
187
|
|
|
188
|
+
### Topic modeling integration run
|
|
189
|
+
|
|
190
|
+
Use the integration script to download a Wikipedia corpus, run extraction, and run topic modeling with a single command.
|
|
191
|
+
|
|
192
|
+
```
|
|
193
|
+
python3 scripts/topic_modeling_integration.py --corpus corpora/wiki_demo --force
|
|
194
|
+
```
|
|
195
|
+
|
|
196
|
+
Run with a smaller corpus and a higher topic count:
|
|
197
|
+
|
|
198
|
+
```
|
|
199
|
+
python3 scripts/topic_modeling_integration.py \
|
|
200
|
+
--corpus corpora/wiki_demo \
|
|
201
|
+
--force \
|
|
202
|
+
--limit 20 \
|
|
203
|
+
--bertopic-param nr_topics=8 \
|
|
204
|
+
--bertopic-param min_topic_size=2
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
The command prints the analysis run identifier and the output path. Open the `output.json` file to inspect per-topic labels,
|
|
208
|
+
keywords, and document examples.
|
|
209
|
+
|
|
188
210
|
### Select extracted text within a pipeline
|
|
189
211
|
|
|
190
212
|
When you want an explicit choice among multiple extraction outputs, add a selection extractor step at the end of the pipeline.
|
|
@@ -221,6 +243,25 @@ python3 -m biblicus build --corpus corpora/pdf_samples --backend sqlite-full-tex
|
|
|
221
243
|
python3 -m biblicus query --corpus corpora/pdf_samples --query "Dummy PDF file"
|
|
222
244
|
```
|
|
223
245
|
|
|
246
|
+
### Wikipedia retrieval demo (Python)
|
|
247
|
+
|
|
248
|
+
This example downloads a few Wikipedia summaries about retrieval and knowledge bases, builds an extraction run, creates a local full text index, and returns evidence plus a context pack.
|
|
249
|
+
|
|
250
|
+
```
|
|
251
|
+
rm -rf corpora/wikipedia_rag_demo
|
|
252
|
+
python3 scripts/wikipedia_rag_demo.py --corpus corpora/wikipedia_rag_demo --force
|
|
253
|
+
```
|
|
254
|
+
|
|
255
|
+
### MarkItDown extraction demo (Python 3.10+)
|
|
256
|
+
|
|
257
|
+
MarkItDown requires Python 3.10 or higher. This example uses the `py311` conda environment to run the extractor over the mixed sample corpus.
|
|
258
|
+
|
|
259
|
+
```
|
|
260
|
+
conda run -n py311 python -m pip install -e . "markitdown[all]"
|
|
261
|
+
conda run -n py311 python scripts/download_mixed_samples.py --corpus corpora/markitdown_demo_py311 --force
|
|
262
|
+
conda run -n py311 python -m biblicus extract build --corpus corpora/markitdown_demo_py311 --step markitdown
|
|
263
|
+
```
|
|
264
|
+
|
|
224
265
|
### Mixed modality integration corpus
|
|
225
266
|
|
|
226
267
|
This example assembles a tiny mixed corpus with a Markdown note, a Hypertext Markup Language page, an image, a Portable Document Format file with extractable text, and a generated Portable Document Format file with no extractable text.
|
|
@@ -1,9 +1,11 @@
|
|
|
1
|
-
# Text
|
|
1
|
+
# Text Extraction Pipeline
|
|
2
2
|
|
|
3
3
|
Text extraction is a separate pipeline stage that produces derived text artifacts under a corpus.
|
|
4
4
|
|
|
5
5
|
This separation matters because it lets you combine extraction choices and retrieval backends independently.
|
|
6
6
|
|
|
7
|
+
For detailed documentation on specific extractors, see [Extractor Reference](extractors/index.md).
|
|
8
|
+
|
|
7
9
|
## What extraction produces
|
|
8
10
|
|
|
9
11
|
An extraction run produces:
|
|
@@ -31,78 +33,42 @@ corpus/
|
|
|
31
33
|
<item id>.txt
|
|
32
34
|
```
|
|
33
35
|
|
|
34
|
-
##
|
|
35
|
-
|
|
36
|
-
Version zero includes a small set of deterministic extractors.
|
|
37
|
-
|
|
38
|
-
`pass-through-text`
|
|
39
|
-
|
|
40
|
-
- Reads text items and returns their content
|
|
41
|
-
- For Markdown items, it strips YAML front matter and returns only the body
|
|
42
|
-
- Skips non text items
|
|
43
|
-
|
|
44
|
-
`metadata-text`
|
|
45
|
-
|
|
46
|
-
- Builds a small text representation from catalog metadata
|
|
47
|
-
- This is useful when you have a non text item with meaningful tags or a title
|
|
48
|
-
|
|
49
|
-
`pdf-text`
|
|
50
|
-
|
|
51
|
-
- Attempts to extract text from Portable Document Format items
|
|
52
|
-
- Skips items that are not Portable Document Format
|
|
53
|
-
- Uses the `pypdf` library
|
|
54
|
-
- Produces empty output for scanned Portable Document Format files that contain no extractable text without optical character recognition
|
|
55
|
-
|
|
56
|
-
`select-text`
|
|
36
|
+
## Available Extractors
|
|
57
37
|
|
|
58
|
-
|
|
59
|
-
- This is used when you have multiple pipeline steps that can produce usable text for the same items and you want one chosen result
|
|
60
|
-
- Records which step supplied the selected text
|
|
38
|
+
Biblicus provides 16 built-in extractors organized by category:
|
|
61
39
|
|
|
62
|
-
|
|
40
|
+
### Text & Document Processing
|
|
63
41
|
|
|
64
|
-
-
|
|
65
|
-
-
|
|
66
|
-
-
|
|
42
|
+
- [`pass-through-text`](extractors/text-document/pass-through.md) - Direct text file reading
|
|
43
|
+
- [`metadata-text`](extractors/text-document/metadata.md) - Text from item metadata
|
|
44
|
+
- [`pdf-text`](extractors/text-document/pdf.md) - PDF text extraction using pypdf
|
|
45
|
+
- [`markitdown`](extractors/text-document/markitdown.md) - Office documents via MarkItDown
|
|
46
|
+
- [`unstructured`](extractors/text-document/unstructured.md) - Universal document parsing
|
|
67
47
|
|
|
68
|
-
|
|
48
|
+
### Optical Character Recognition
|
|
69
49
|
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
```
|
|
73
|
-
|
|
74
|
-
`ocr-rapidocr`
|
|
50
|
+
- [`ocr-rapidocr`](extractors/ocr/rapidocr.md) - Fast ONNX-based OCR
|
|
51
|
+
- [`ocr-paddleocr-vl`](extractors/ocr/paddleocr-vl.md) - Advanced OCR with VL model
|
|
75
52
|
|
|
76
|
-
-
|
|
77
|
-
- Backed by the optional `rapidocr-onnxruntime` dependency
|
|
78
|
-
- Intended as a practical default when you need text from images without running a service
|
|
53
|
+
### Vision-Language Models
|
|
79
54
|
|
|
80
|
-
|
|
55
|
+
- [`docling-smol`](extractors/vlm-document/docling-smol.md) - SmolDocling-256M for fast document processing
|
|
56
|
+
- [`docling-granite`](extractors/vlm-document/docling-granite.md) - Granite Docling-258M for high-accuracy extraction
|
|
81
57
|
|
|
82
|
-
|
|
83
|
-
python3 -m pip install "biblicus[ocr]"
|
|
84
|
-
```
|
|
58
|
+
### Speech-to-Text
|
|
85
59
|
|
|
86
|
-
`stt-openai`
|
|
60
|
+
- [`stt-openai`](extractors/speech-to-text/openai.md) - OpenAI Whisper API
|
|
61
|
+
- [`stt-deepgram`](extractors/speech-to-text/deepgram.md) - Deepgram Nova-3 API
|
|
87
62
|
|
|
88
|
-
|
|
89
|
-
- Backed by the optional `openai` dependency
|
|
90
|
-
- Requires an OpenAI API key (from `OPENAI_API_KEY` or the user configuration file)
|
|
91
|
-
|
|
92
|
-
To install:
|
|
93
|
-
|
|
94
|
-
```
|
|
95
|
-
python3 -m pip install "biblicus[openai]"
|
|
96
|
-
```
|
|
63
|
+
### Pipeline Utilities
|
|
97
64
|
|
|
98
|
-
|
|
65
|
+
- [`select-text`](extractors/pipeline-utilities/select-text.md) - First successful extractor
|
|
66
|
+
- [`select-longest-text`](extractors/pipeline-utilities/select-longest.md) - Longest output selection
|
|
67
|
+
- [`select-override`](extractors/pipeline-utilities/select-override.md) - Per-item override by ID
|
|
68
|
+
- [`select-smart-override`](extractors/pipeline-utilities/select-smart-override.md) - Media type-based routing
|
|
69
|
+
- [`pipeline`](extractors/pipeline-utilities/pipeline.md) - Multi-step extraction workflow
|
|
99
70
|
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
```
|
|
103
|
-
openai:
|
|
104
|
-
api_key: YOUR_KEY_HERE
|
|
105
|
-
```
|
|
71
|
+
For detailed documentation including configuration options, usage examples, and best practices, see the [Extractor Reference](extractors/index.md).
|
|
106
72
|
|
|
107
73
|
## How selection chooses text
|
|
108
74
|
|
|
@@ -110,12 +76,12 @@ The `select-text` extractor does not attempt to judge extraction quality. It cho
|
|
|
110
76
|
|
|
111
77
|
Usable means non-empty after stripping whitespace.
|
|
112
78
|
|
|
113
|
-
This means selection does not automatically choose the longest extracted text or the extraction with the most content. If you want a scoring rule such as choose the longest extracted text,
|
|
79
|
+
This means selection does not automatically choose the longest extracted text or the extraction with the most content. If you want a scoring rule such as "choose the longest extracted text", use the [`select-longest-text`](extractors/pipeline-utilities/select-longest.md) extractor instead.
|
|
114
80
|
|
|
115
|
-
|
|
81
|
+
Other selection strategies include:
|
|
116
82
|
|
|
117
|
-
-
|
|
118
|
-
-
|
|
83
|
+
- [`select-override`](extractors/pipeline-utilities/select-override.md) - Override extraction for specific items by ID
|
|
84
|
+
- [`select-smart-override`](extractors/pipeline-utilities/select-smart-override.md) - Route items based on media type patterns
|
|
119
85
|
|
|
120
86
|
## Pipeline extractor
|
|
121
87
|
|
|
@@ -125,6 +91,8 @@ The pipeline runs every step in order and records all step outputs. Each step re
|
|
|
125
91
|
|
|
126
92
|
This lets you build explicit extraction policies while keeping every step outcome available for comparison and metrics.
|
|
127
93
|
|
|
94
|
+
For details, see the [`pipeline` extractor documentation](extractors/pipeline-utilities/pipeline.md).
|
|
95
|
+
|
|
128
96
|
## Complementary versus competing extractors
|
|
129
97
|
|
|
130
98
|
The pipeline is designed for complementary steps that do not overlap much in what they handle.
|
|
@@ -148,9 +116,9 @@ python3 -m biblicus init corpora/extraction-demo
|
|
|
148
116
|
printf 'x' > /tmp/image.png
|
|
149
117
|
python3 -m biblicus ingest --corpus corpora/extraction-demo /tmp/image.png --tag extracted
|
|
150
118
|
|
|
151
|
-
python3 -m biblicus extract build --corpus corpora/extraction-demo
|
|
152
|
-
--step pass-through-text
|
|
153
|
-
--step pdf-text
|
|
119
|
+
python3 -m biblicus extract build --corpus corpora/extraction-demo \
|
|
120
|
+
--step pass-through-text \
|
|
121
|
+
--step pdf-text \
|
|
154
122
|
--step metadata-text
|
|
155
123
|
```
|
|
156
124
|
|
|
@@ -161,14 +129,38 @@ The extracted text for the image comes from the `metadata-text` step because the
|
|
|
161
129
|
Selection is a pipeline step that chooses extracted text from previous pipeline steps. Selection is just another extractor in the pipeline, and it decides which prior output to carry forward.
|
|
162
130
|
|
|
163
131
|
```
|
|
164
|
-
python3 -m biblicus extract build --corpus corpora/extraction-demo
|
|
165
|
-
--step pass-through-text
|
|
166
|
-
--step metadata-text
|
|
132
|
+
python3 -m biblicus extract build --corpus corpora/extraction-demo \
|
|
133
|
+
--step pass-through-text \
|
|
134
|
+
--step metadata-text \
|
|
167
135
|
--step select-text
|
|
168
136
|
```
|
|
169
137
|
|
|
170
138
|
The pipeline run produces one extraction run under `pipeline`. You can point retrieval backends at that run.
|
|
171
139
|
|
|
140
|
+
## Example: PDF with OCR fallback
|
|
141
|
+
|
|
142
|
+
Try text extraction first, then fall back to OCR for scanned documents:
|
|
143
|
+
|
|
144
|
+
```
|
|
145
|
+
python3 -m biblicus extract build --corpus corpora/extraction-demo \
|
|
146
|
+
--step pdf-text \
|
|
147
|
+
--step ocr-rapidocr \
|
|
148
|
+
--step select-text
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
This pipeline runs `pdf-text` first, which yields text for PDFs with text layers, then runs `ocr-rapidocr` to cover scanned PDFs, and finally uses `select-text` to pick the first usable (non-empty) result.
|
|
152
|
+
|
|
153
|
+
## Example: VLM for complex documents
|
|
154
|
+
|
|
155
|
+
Use vision-language models for documents with complex layouts:
|
|
156
|
+
|
|
157
|
+
```
|
|
158
|
+
python3 -m biblicus extract build --corpus corpora/extraction-demo \
|
|
159
|
+
--step docling-granite
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
The `docling-granite` extractor uses IBM Research's Granite Docling-258M VLM for high-accuracy extraction of tables, code blocks, and equations.
|
|
163
|
+
|
|
172
164
|
## Inspecting and deleting extraction runs
|
|
173
165
|
|
|
174
166
|
Extraction runs are stored under the corpus and can be listed and inspected.
|
|
@@ -181,8 +173,8 @@ python3 -m biblicus extract show --corpus corpora/extraction-demo --run pipeline
|
|
|
181
173
|
Deletion is explicit and requires typing the exact run reference as confirmation:
|
|
182
174
|
|
|
183
175
|
```
|
|
184
|
-
python3 -m biblicus extract delete --corpus corpora/extraction-demo
|
|
185
|
-
--run pipeline:EXTRACTION_RUN_ID
|
|
176
|
+
python3 -m biblicus extract delete --corpus corpora/extraction-demo \
|
|
177
|
+
--run pipeline:EXTRACTION_RUN_ID \
|
|
186
178
|
--confirm pipeline:EXTRACTION_RUN_ID
|
|
187
179
|
```
|
|
188
180
|
|
|
@@ -191,7 +183,7 @@ python3 -m biblicus extract delete --corpus corpora/extraction-demo \\
|
|
|
191
183
|
Retrieval backends can build and query using a selected extraction run. This is configured by passing `extraction_run=extractor_id:run_id` to the backend build command.
|
|
192
184
|
|
|
193
185
|
```
|
|
194
|
-
python3 -m biblicus build --corpus corpora/extraction-demo --backend sqlite-full-text-search
|
|
186
|
+
python3 -m biblicus build --corpus corpora/extraction-demo --backend sqlite-full-text-search \
|
|
195
187
|
--config extraction_run=pipeline:EXTRACTION_RUN_ID
|
|
196
188
|
python3 -m biblicus query --corpus corpora/extraction-demo --query extracted
|
|
197
189
|
```
|
|
@@ -123,6 +123,7 @@ What it does:
|
|
|
123
123
|
- Includes a Portable Document Format text extractor plugin.
|
|
124
124
|
- Includes a speech to text extractor plugin for audio items.
|
|
125
125
|
- Includes a selection extractor step for choosing extracted text within a pipeline.
|
|
126
|
+
- Includes a MarkItDown extractor plugin for document conversion.
|
|
126
127
|
|
|
127
128
|
Documentation:
|
|
128
129
|
|
|
@@ -139,6 +140,7 @@ Behavior specifications:
|
|
|
139
140
|
- `features/ocr_extractor.feature`
|
|
140
141
|
- `features/stt_extractor.feature`
|
|
141
142
|
- `features/unstructured_extractor.feature`
|
|
143
|
+
- `features/markitdown_extractor.feature`
|
|
142
144
|
- `features/integration_unstructured_extraction.feature`
|
|
143
145
|
|
|
144
146
|
Primary implementation:
|