biblicus 0.7.0__tar.gz → 0.8.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {biblicus-0.7.0 → biblicus-0.8.0}/PKG-INFO +111 -13
- biblicus-0.7.0/src/biblicus.egg-info/PKG-INFO → biblicus-0.8.0/README.md +97 -43
- {biblicus-0.7.0 → biblicus-0.8.0}/docs/BACKENDS.md +2 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/docs/DEMOS.md +22 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/docs/EXTRACTION.md +66 -95
- biblicus-0.8.0/docs/STT.md +89 -0
- biblicus-0.8.0/docs/TOPIC_MODELING.md +82 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/docs/USER_CONFIGURATION.md +13 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/docs/api.rst +11 -0
- biblicus-0.8.0/docs/backends/index.md +242 -0
- biblicus-0.8.0/docs/backends/scan.md +327 -0
- biblicus-0.8.0/docs/backends/sqlite-full-text-search.md +487 -0
- biblicus-0.8.0/docs/extractors/index.md +135 -0
- biblicus-0.8.0/docs/extractors/ocr/index.md +141 -0
- biblicus-0.8.0/docs/extractors/ocr/paddleocr-vl.md +456 -0
- biblicus-0.8.0/docs/extractors/ocr/rapidocr.md +359 -0
- biblicus-0.8.0/docs/extractors/pipeline-utilities/index.md +234 -0
- biblicus-0.8.0/docs/extractors/pipeline-utilities/pipeline.md +542 -0
- biblicus-0.8.0/docs/extractors/pipeline-utilities/select-longest.md +404 -0
- biblicus-0.8.0/docs/extractors/pipeline-utilities/select-override.md +402 -0
- biblicus-0.8.0/docs/extractors/pipeline-utilities/select-smart-override.md +472 -0
- biblicus-0.8.0/docs/extractors/pipeline-utilities/select-text.md +339 -0
- biblicus-0.8.0/docs/extractors/speech-to-text/deepgram.md +482 -0
- biblicus-0.8.0/docs/extractors/speech-to-text/index.md +158 -0
- biblicus-0.8.0/docs/extractors/speech-to-text/openai.md +449 -0
- biblicus-0.8.0/docs/extractors/text-document/index.md +107 -0
- biblicus-0.8.0/docs/extractors/text-document/markitdown.md +394 -0
- biblicus-0.8.0/docs/extractors/text-document/metadata.md +335 -0
- biblicus-0.8.0/docs/extractors/text-document/pass-through.md +253 -0
- biblicus-0.8.0/docs/extractors/text-document/pdf.md +339 -0
- biblicus-0.8.0/docs/extractors/text-document/unstructured.md +405 -0
- biblicus-0.8.0/docs/extractors/vlm-document/docling-granite.md +311 -0
- biblicus-0.8.0/docs/extractors/vlm-document/docling-smol.md +269 -0
- biblicus-0.8.0/docs/extractors/vlm-document/index.md +229 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/docs/index.rst +3 -0
- biblicus-0.8.0/features/analysis_schema.feature +36 -0
- biblicus-0.8.0/features/cli_step_spec_parsing.feature +41 -0
- biblicus-0.8.0/features/docling_granite_extractor.feature +202 -0
- biblicus-0.8.0/features/docling_smol_extractor.feature +202 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/environment.py +113 -0
- biblicus-0.8.0/features/inference_backend.feature +117 -0
- biblicus-0.8.0/features/paddleocr_vl_extractor.feature +299 -0
- biblicus-0.8.0/features/paddleocr_vl_parse_api_response.feature +18 -0
- biblicus-0.8.0/features/recipe_file_extraction.feature +35 -0
- biblicus-0.8.0/features/select_override.feature +126 -0
- biblicus-0.8.0/features/smart_override_selection.feature +406 -0
- biblicus-0.8.0/features/steps/analysis_steps.py +194 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/steps/cli_steps.py +11 -1
- biblicus-0.8.0/features/steps/deepgram_steps.py +222 -0
- biblicus-0.8.0/features/steps/docling_steps.py +360 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/steps/extraction_steps.py +132 -3
- biblicus-0.8.0/features/steps/inference_steps.py +63 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/steps/openai_steps.py +76 -0
- biblicus-0.8.0/features/steps/paddleocr_mock_steps.py +48 -0
- biblicus-0.8.0/features/steps/paddleocr_vl_steps.py +196 -0
- biblicus-0.8.0/features/steps/paddleocr_vl_unit_steps.py +108 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/steps/rapidocr_steps.py +2 -2
- biblicus-0.8.0/features/steps/requests_mock_steps.py +158 -0
- biblicus-0.8.0/features/steps/stt_deepgram_steps.py +93 -0
- biblicus-0.8.0/features/steps/topic_modeling_steps.py +231 -0
- biblicus-0.8.0/features/steps/user_config_steps.py +183 -0
- biblicus-0.8.0/features/stt_deepgram_extractor.feature +142 -0
- biblicus-0.8.0/features/topic_modeling.feature +908 -0
- biblicus-0.8.0/features/user_config.feature +85 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/pyproject.toml +19 -1
- {biblicus-0.7.0 → biblicus-0.8.0}/scripts/download_wikipedia.py +15 -0
- biblicus-0.8.0/scripts/topic_modeling_integration.py +257 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/src/biblicus/__init__.py +1 -1
- biblicus-0.8.0/src/biblicus/analysis/__init__.py +40 -0
- biblicus-0.8.0/src/biblicus/analysis/base.py +49 -0
- biblicus-0.8.0/src/biblicus/analysis/llm.py +106 -0
- biblicus-0.8.0/src/biblicus/analysis/models.py +512 -0
- biblicus-0.8.0/src/biblicus/analysis/schema.py +18 -0
- biblicus-0.8.0/src/biblicus/analysis/topic_modeling.py +561 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/src/biblicus/cli.py +160 -11
- {biblicus-0.7.0 → biblicus-0.8.0}/src/biblicus/constants.py +2 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/src/biblicus/corpus.py +42 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/src/biblicus/extraction.py +5 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/src/biblicus/extractors/__init__.py +12 -0
- biblicus-0.8.0/src/biblicus/extractors/deepgram_stt.py +166 -0
- biblicus-0.8.0/src/biblicus/extractors/docling_granite_text.py +188 -0
- biblicus-0.8.0/src/biblicus/extractors/docling_smol_text.py +188 -0
- biblicus-0.8.0/src/biblicus/extractors/paddleocr_vl_text.py +305 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/src/biblicus/extractors/rapidocr_text.py +8 -1
- biblicus-0.8.0/src/biblicus/extractors/select_override.py +121 -0
- biblicus-0.8.0/src/biblicus/extractors/select_smart_override.py +187 -0
- biblicus-0.8.0/src/biblicus/inference.py +104 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/src/biblicus/models.py +6 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/src/biblicus/user_config.py +76 -0
- biblicus-0.7.0/README.md → biblicus-0.8.0/src/biblicus.egg-info/PKG-INFO +141 -12
- {biblicus-0.7.0 → biblicus-0.8.0}/src/biblicus.egg-info/SOURCES.txt +63 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/src/biblicus.egg-info/requires.txt +18 -0
- biblicus-0.7.0/features/steps/user_config_steps.py +0 -47
- biblicus-0.7.0/features/user_config.feature +0 -39
- {biblicus-0.7.0 → biblicus-0.8.0}/LICENSE +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/MANIFEST.in +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/THIRD_PARTY_NOTICES.md +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/datasets/wikipedia_mini.json +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/docs/ARCHITECTURE.md +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/docs/CONTEXT_PACK.md +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/docs/CORPUS.md +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/docs/CORPUS_DESIGN.md +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/docs/FEATURE_INDEX.md +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/docs/KNOWLEDGE_BASE.md +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/docs/ROADMAP.md +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/docs/TESTING.md +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/docs/conf.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/backend_validation.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/biblicus_corpus.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/cli_entrypoint.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/cli_parsing.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/content_sniffing.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/context_pack.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/context_pack_cli.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/corpus_edge_cases.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/corpus_identity.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/corpus_purge.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/crawl.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/error_cases.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/evaluation.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/evidence_processing.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/extraction_error_handling.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/extraction_run_lifecycle.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/extraction_selection.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/extraction_selection_longest.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/extractor_pipeline.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/extractor_validation.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/frontmatter.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/hook_config_validation.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/hook_error_handling.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/import_tree.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/ingest_sources.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/integration_audio_samples.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/integration_image_samples.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/integration_mixed_corpus.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/integration_mixed_extraction.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/integration_ocr_image_extraction.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/integration_pdf_retrieval.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/integration_pdf_samples.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/integration_unstructured_extraction.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/integration_wikipedia.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/knowledge_base.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/lifecycle_hooks.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/markitdown_extractor.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/model_validation.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/ocr_extractor.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/pdf_text_extraction.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/python_api.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/python_hook_logging.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/query_processing.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/retrieval_budget.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/retrieval_scan.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/retrieval_sqlite_full_text_search.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/retrieval_uses_extraction_run.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/retrieval_utilities.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/source_loading.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/steps/backend_steps.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/steps/cli_parsing_steps.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/steps/context_pack_steps.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/steps/crawl_steps.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/steps/evidence_processing_steps.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/steps/extraction_run_lifecycle_steps.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/steps/extractor_steps.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/steps/frontmatter_steps.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/steps/knowledge_base_steps.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/steps/markitdown_steps.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/steps/model_steps.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/steps/pdf_steps.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/steps/python_api_steps.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/steps/retrieval_steps.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/steps/stt_steps.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/steps/unstructured_steps.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/streaming_ingest.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/stt_extractor.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/text_extraction_runs.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/token_budget.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/features/unstructured_extractor.feature +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/scripts/download_audio_samples.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/scripts/download_image_samples.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/scripts/download_mixed_samples.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/scripts/download_pdf_samples.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/scripts/readme_end_to_end_demo.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/scripts/test.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/scripts/wikipedia_rag_demo.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/setup.cfg +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/src/biblicus/__main__.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/src/biblicus/_vendor/dotyaml/__init__.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/src/biblicus/_vendor/dotyaml/interpolation.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/src/biblicus/_vendor/dotyaml/loader.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/src/biblicus/_vendor/dotyaml/transformer.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/src/biblicus/backends/__init__.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/src/biblicus/backends/base.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/src/biblicus/backends/scan.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/src/biblicus/backends/sqlite_full_text_search.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/src/biblicus/context.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/src/biblicus/crawl.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/src/biblicus/errors.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/src/biblicus/evaluation.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/src/biblicus/evidence_processing.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/src/biblicus/extractors/base.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/src/biblicus/extractors/markitdown_text.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/src/biblicus/extractors/metadata_text.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/src/biblicus/extractors/openai_stt.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/src/biblicus/extractors/pass_through_text.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/src/biblicus/extractors/pdf_text.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/src/biblicus/extractors/pipeline.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/src/biblicus/extractors/select_longest_text.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/src/biblicus/extractors/select_text.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/src/biblicus/extractors/unstructured_text.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/src/biblicus/frontmatter.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/src/biblicus/hook_logging.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/src/biblicus/hook_manager.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/src/biblicus/hooks.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/src/biblicus/ignore.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/src/biblicus/knowledge_base.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/src/biblicus/retrieval.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/src/biblicus/sources.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/src/biblicus/time.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/src/biblicus/uris.py +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/src/biblicus.egg-info/dependency_links.txt +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/src/biblicus.egg-info/entry_points.txt +0 -0
- {biblicus-0.7.0 → biblicus-0.8.0}/src/biblicus.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: biblicus
|
|
3
|
-
Version: 0.7.0
|
|
3
|
+
Version: 0.8.0
|
|
4
4
|
Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
|
|
5
5
|
License: MIT
|
|
6
6
|
Requires-Python: >=3.9
|
|
@@ -25,8 +25,21 @@ Requires-Dist: unstructured>=0.12.0; extra == "unstructured"
|
|
|
25
25
|
Requires-Dist: python-docx>=1.1.0; extra == "unstructured"
|
|
26
26
|
Provides-Extra: ocr
|
|
27
27
|
Requires-Dist: rapidocr-onnxruntime>=1.3.0; extra == "ocr"
|
|
28
|
+
Provides-Extra: paddleocr
|
|
29
|
+
Requires-Dist: paddleocr>=2.7.0; extra == "paddleocr"
|
|
30
|
+
Requires-Dist: paddlepaddle>=2.5.0; extra == "paddleocr"
|
|
31
|
+
Requires-Dist: huggingface_hub>=0.20.0; extra == "paddleocr"
|
|
32
|
+
Requires-Dist: requests>=2.28.0; extra == "paddleocr"
|
|
28
33
|
Provides-Extra: markitdown
|
|
29
34
|
Requires-Dist: markitdown[all]>=0.1.0; python_version >= "3.10" and extra == "markitdown"
|
|
35
|
+
Provides-Extra: deepgram
|
|
36
|
+
Requires-Dist: deepgram-sdk>=3.0; extra == "deepgram"
|
|
37
|
+
Provides-Extra: docling
|
|
38
|
+
Requires-Dist: docling[vlm]>=2.0.0; extra == "docling"
|
|
39
|
+
Provides-Extra: docling-mlx
|
|
40
|
+
Requires-Dist: docling[mlx-vlm]>=2.0.0; extra == "docling-mlx"
|
|
41
|
+
Provides-Extra: topic-modeling
|
|
42
|
+
Requires-Dist: bertopic>=0.15.0; extra == "topic-modeling"
|
|
30
43
|
Dynamic: license-file
|
|
31
44
|
|
|
32
45
|
# Biblicus
|
|
@@ -160,9 +173,14 @@ python3 -m pip install biblicus
|
|
|
160
173
|
Some extractors are optional so the base install stays small.
|
|
161
174
|
|
|
162
175
|
- Optical character recognition for images: `python3 -m pip install "biblicus[ocr]"`
|
|
163
|
-
-
|
|
176
|
+
- Advanced optical character recognition with PaddleOCR: `python3 -m pip install "biblicus[paddleocr]"`
|
|
177
|
+
- Document understanding with Docling VLM: `python3 -m pip install "biblicus[docling]"`
|
|
178
|
+
- Document understanding with Docling VLM and MLX acceleration: `python3 -m pip install "biblicus[docling-mlx]"`
|
|
179
|
+
- Speech to text transcription with OpenAI: `python3 -m pip install "biblicus[openai]"` (requires an OpenAI API key in `~/.biblicus/config.yml` or `./.biblicus/config.yml`)
|
|
180
|
+
- Speech to text transcription with Deepgram: `python3 -m pip install "biblicus[deepgram]"` (requires a Deepgram API key in `~/.biblicus/config.yml` or `./.biblicus/config.yml`)
|
|
164
181
|
- Broad document parsing fallback: `python3 -m pip install "biblicus[unstructured]"`
|
|
165
182
|
- MarkItDown document conversion (requires Python 3.10 or higher): `python3 -m pip install "biblicus[markitdown]"`
|
|
183
|
+
- Topic modeling analysis with BERTopic: `python3 -m pip install "biblicus[topic-modeling]"`
|
|
166
184
|
|
|
167
185
|
## Quick start
|
|
168
186
|
|
|
@@ -420,6 +438,7 @@ The documents below follow the pipeline from raw items to model context:
|
|
|
420
438
|
|
|
421
439
|
- [Corpus][corpus]
|
|
422
440
|
- [Text extraction][text-extraction]
|
|
441
|
+
- [Speech to text][speech-to-text]
|
|
423
442
|
- [Knowledge base][knowledge-base]
|
|
424
443
|
- [Backends][backends]
|
|
425
444
|
- [Context packs][context-packs]
|
|
@@ -468,21 +487,97 @@ corpus/
|
|
|
468
487
|
Two backends are included.
|
|
469
488
|
|
|
470
489
|
- `scan` is a minimal baseline that scans raw items directly.
|
|
471
|
-
- `sqlite-full-text-search` is a practical baseline that builds a full text search index in
|
|
490
|
+
- `sqlite-full-text-search` is a practical baseline that builds a full text search index in SQLite.
|
|
491
|
+
|
|
492
|
+
For detailed documentation including configuration options, performance characteristics, and usage examples, see the [Backend Reference][backend-reference].
|
|
472
493
|
|
|
473
494
|
## Extraction backends
|
|
474
495
|
|
|
475
|
-
These extractors are built in. Optional ones require extra dependencies.
|
|
496
|
+
These extractors are built in. Optional ones require extra dependencies. See [text extraction documentation][text-extraction] for details.
|
|
497
|
+
|
|
498
|
+
### Text and document extraction
|
|
499
|
+
|
|
500
|
+
- [`pass-through-text`](docs/extractors/text-document/pass-through.md) reads text items and strips Markdown front matter.
|
|
501
|
+
- [`metadata-text`](docs/extractors/text-document/metadata.md) turns catalog metadata into a small text artifact.
|
|
502
|
+
- [`pdf-text`](docs/extractors/text-document/pdf.md) extracts text from Portable Document Format items with `pypdf`.
|
|
503
|
+
- [`unstructured`](docs/extractors/text-document/unstructured.md) provides broad document parsing (optional).
|
|
504
|
+
- [`markitdown`](docs/extractors/text-document/markitdown.md) converts many formats into Markdown-like text (optional).
|
|
505
|
+
|
|
506
|
+
### Optical character recognition
|
|
507
|
+
|
|
508
|
+
- [`ocr-rapidocr`](docs/extractors/ocr/rapidocr.md) does optical character recognition on images (optional).
|
|
509
|
+
- [`ocr-paddleocr-vl`](docs/extractors/ocr/paddleocr-vl.md) does advanced optical character recognition with PaddleOCR vision-language model (optional).
|
|
510
|
+
|
|
511
|
+
### Vision-language models
|
|
512
|
+
|
|
513
|
+
- [`docling-smol`](docs/extractors/vlm-document/docling-smol.md) uses the SmolDocling-256M vision-language model for fast document understanding (optional).
|
|
514
|
+
- [`docling-granite`](docs/extractors/vlm-document/docling-granite.md) uses the Granite Docling-258M vision-language model for high-accuracy extraction (optional).
|
|
515
|
+
|
|
516
|
+
### Speech to text
|
|
517
|
+
|
|
518
|
+
- [`stt-openai`](docs/extractors/speech-to-text/openai.md) performs speech to text on audio using OpenAI (optional).
|
|
519
|
+
- [`stt-deepgram`](docs/extractors/speech-to-text/deepgram.md) performs speech to text on audio using Deepgram (optional).
|
|
520
|
+
|
|
521
|
+
### Pipeline utilities
|
|
522
|
+
|
|
523
|
+
- [`select-text`](docs/extractors/pipeline-utilities/select-text.md) chooses one prior extraction result in a pipeline.
|
|
524
|
+
- [`select-longest-text`](docs/extractors/pipeline-utilities/select-longest.md) chooses the longest prior extraction result.
|
|
525
|
+
- [`select-override`](docs/extractors/pipeline-utilities/select-override.md) chooses the last extraction result for matching media types in a pipeline.
|
|
526
|
+
- [`select-smart-override`](docs/extractors/pipeline-utilities/select-smart-override.md) intelligently chooses between extraction results based on confidence and content quality.
|
|
527
|
+
|
|
528
|
+
For detailed documentation on all extractors, see the [Extractor Reference][extractor-reference].
|
|
529
|
+
|
|
530
|
+
## Topic modeling analysis
|
|
531
|
+
|
|
532
|
+
Biblicus can run analysis pipelines on extracted text without changing the raw corpus. Topic modeling is the first
|
|
533
|
+
analysis backend. It reads an extraction run, optionally applies an LLM-driven extraction pass, applies lexical
|
|
534
|
+
processing, runs BERTopic, and optionally applies an LLM fine-tuning pass to label topics. The output is structured
|
|
535
|
+
JavaScript Object Notation.
|
|
536
|
+
|
|
537
|
+
Run a topic analysis using a recipe file:
|
|
538
|
+
|
|
539
|
+
```
|
|
540
|
+
biblicus analyze topics --corpus corpora/example --recipe recipes/topic-modeling.yml --extraction-run pipeline:<run_id>
|
|
541
|
+
```
|
|
542
|
+
|
|
543
|
+
If `--extraction-run` is omitted, Biblicus uses the most recent extraction run and emits a warning about
|
|
544
|
+
reproducibility. The analysis output is stored under:
|
|
545
|
+
|
|
546
|
+
```
|
|
547
|
+
.biblicus/runs/analysis/topic-modeling/<run_id>/output.json
|
|
548
|
+
```
|
|
549
|
+
|
|
550
|
+
Minimal recipe example:
|
|
551
|
+
|
|
552
|
+
```yaml
|
|
553
|
+
schema_version: 1
|
|
554
|
+
text_source:
|
|
555
|
+
sample_size: 200
|
|
556
|
+
llm_extraction:
|
|
557
|
+
enabled: false
|
|
558
|
+
lexical_processing:
|
|
559
|
+
enabled: true
|
|
560
|
+
lowercase: true
|
|
561
|
+
strip_punctuation: false
|
|
562
|
+
collapse_whitespace: true
|
|
563
|
+
bertopic_analysis:
|
|
564
|
+
parameters:
|
|
565
|
+
min_topic_size: 8
|
|
566
|
+
nr_topics: 10
|
|
567
|
+
llm_fine_tuning:
|
|
568
|
+
enabled: false
|
|
569
|
+
```
|
|
570
|
+
|
|
571
|
+
LLM extraction and fine-tuning require `biblicus[openai]` and a configured OpenAI API key.
|
|
572
|
+
Recipe files are validated strictly against the topic modeling schema, so type mismatches or unknown fields are errors.
|
|
573
|
+
|
|
574
|
+
For a repeatable, real-world integration run that downloads a Wikipedia corpus and executes topic modeling, use:
|
|
575
|
+
|
|
576
|
+
```
|
|
577
|
+
python3 scripts/topic_modeling_integration.py --corpus corpora/wiki_demo --force
|
|
578
|
+
```
|
|
476
579
|
|
|
477
|
-
|
|
478
|
-
- `metadata-text` turns catalog metadata into a small text artifact.
|
|
479
|
-
- `pdf-text` extracts text from Portable Document Format items with `pypdf`.
|
|
480
|
-
- `select-text` chooses one prior extraction result in a pipeline.
|
|
481
|
-
- `select-longest-text` chooses the longest prior extraction result.
|
|
482
|
-
- `ocr-rapidocr` does optical character recognition on images (optional).
|
|
483
|
-
- `stt-openai` performs speech to text on audio (optional).
|
|
484
|
-
- `unstructured` provides broad document parsing (optional).
|
|
485
|
-
- `markitdown` converts many formats into Markdown-like text (optional).
|
|
580
|
+
See `docs/TOPIC_MODELING.md` for parameter examples and per-topic output behavior.
|
|
486
581
|
|
|
487
582
|
## Integration corpus and evaluation dataset
|
|
488
583
|
|
|
@@ -539,6 +634,9 @@ License terms are in `LICENSE`.
|
|
|
539
634
|
[corpus]: docs/CORPUS.md
|
|
540
635
|
[knowledge-base]: docs/KNOWLEDGE_BASE.md
|
|
541
636
|
[text-extraction]: docs/EXTRACTION.md
|
|
637
|
+
[extractor-reference]: docs/extractors/index.md
|
|
638
|
+
[backend-reference]: docs/backends/index.md
|
|
639
|
+
[speech-to-text]: docs/STT.md
|
|
542
640
|
[user-configuration]: docs/USER_CONFIGURATION.md
|
|
543
641
|
[backends]: docs/BACKENDS.md
|
|
544
642
|
[context-packs]: docs/CONTEXT_PACK.md
|
|
@@ -1,34 +1,3 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: biblicus
|
|
3
|
-
Version: 0.7.0
|
|
4
|
-
Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
|
|
5
|
-
License: MIT
|
|
6
|
-
Requires-Python: >=3.9
|
|
7
|
-
Description-Content-Type: text/markdown
|
|
8
|
-
License-File: LICENSE
|
|
9
|
-
Requires-Dist: pydantic>=2.0
|
|
10
|
-
Requires-Dist: PyYAML>=6.0
|
|
11
|
-
Requires-Dist: pypdf>=4.0
|
|
12
|
-
Provides-Extra: dev
|
|
13
|
-
Requires-Dist: behave>=1.2.6; extra == "dev"
|
|
14
|
-
Requires-Dist: coverage[toml]>=7.0; extra == "dev"
|
|
15
|
-
Requires-Dist: sphinx>=7.0; extra == "dev"
|
|
16
|
-
Requires-Dist: myst-parser>=2.0; extra == "dev"
|
|
17
|
-
Requires-Dist: sphinx_rtd_theme>=2.0; extra == "dev"
|
|
18
|
-
Requires-Dist: ruff>=0.4.0; extra == "dev"
|
|
19
|
-
Requires-Dist: black>=24.0; extra == "dev"
|
|
20
|
-
Requires-Dist: python-semantic-release>=9.0.0; extra == "dev"
|
|
21
|
-
Provides-Extra: openai
|
|
22
|
-
Requires-Dist: openai>=1.0; extra == "openai"
|
|
23
|
-
Provides-Extra: unstructured
|
|
24
|
-
Requires-Dist: unstructured>=0.12.0; extra == "unstructured"
|
|
25
|
-
Requires-Dist: python-docx>=1.1.0; extra == "unstructured"
|
|
26
|
-
Provides-Extra: ocr
|
|
27
|
-
Requires-Dist: rapidocr-onnxruntime>=1.3.0; extra == "ocr"
|
|
28
|
-
Provides-Extra: markitdown
|
|
29
|
-
Requires-Dist: markitdown[all]>=0.1.0; python_version >= "3.10" and extra == "markitdown"
|
|
30
|
-
Dynamic: license-file
|
|
31
|
-
|
|
32
1
|
# Biblicus
|
|
33
2
|
|
|
34
3
|
![Continuous integration][continuous-integration-badge]
|
|
@@ -160,9 +129,14 @@ python3 -m pip install biblicus
|
|
|
160
129
|
Some extractors are optional so the base install stays small.
|
|
161
130
|
|
|
162
131
|
- Optical character recognition for images: `python3 -m pip install "biblicus[ocr]"`
|
|
163
|
-
-
|
|
132
|
+
- Advanced optical character recognition with PaddleOCR: `python3 -m pip install "biblicus[paddleocr]"`
|
|
133
|
+
- Document understanding with Docling VLM: `python3 -m pip install "biblicus[docling]"`
|
|
134
|
+
- Document understanding with Docling VLM and MLX acceleration: `python3 -m pip install "biblicus[docling-mlx]"`
|
|
135
|
+
- Speech to text transcription with OpenAI: `python3 -m pip install "biblicus[openai]"` (requires an OpenAI API key in `~/.biblicus/config.yml` or `./.biblicus/config.yml`)
|
|
136
|
+
- Speech to text transcription with Deepgram: `python3 -m pip install "biblicus[deepgram]"` (requires a Deepgram API key in `~/.biblicus/config.yml` or `./.biblicus/config.yml`)
|
|
164
137
|
- Broad document parsing fallback: `python3 -m pip install "biblicus[unstructured]"`
|
|
165
138
|
- MarkItDown document conversion (requires Python 3.10 or higher): `python3 -m pip install "biblicus[markitdown]"`
|
|
139
|
+
- Topic modeling analysis with BERTopic: `python3 -m pip install "biblicus[topic-modeling]"`
|
|
166
140
|
|
|
167
141
|
## Quick start
|
|
168
142
|
|
|
@@ -420,6 +394,7 @@ The documents below follow the pipeline from raw items to model context:
|
|
|
420
394
|
|
|
421
395
|
- [Corpus][corpus]
|
|
422
396
|
- [Text extraction][text-extraction]
|
|
397
|
+
- [Speech to text][speech-to-text]
|
|
423
398
|
- [Knowledge base][knowledge-base]
|
|
424
399
|
- [Backends][backends]
|
|
425
400
|
- [Context packs][context-packs]
|
|
@@ -468,21 +443,97 @@ corpus/
|
|
|
468
443
|
Two backends are included.
|
|
469
444
|
|
|
470
445
|
- `scan` is a minimal baseline that scans raw items directly.
|
|
471
|
-
- `sqlite-full-text-search` is a practical baseline that builds a full text search index in
|
|
446
|
+
- `sqlite-full-text-search` is a practical baseline that builds a full text search index in SQLite.
|
|
447
|
+
|
|
448
|
+
For detailed documentation including configuration options, performance characteristics, and usage examples, see the [Backend Reference][backend-reference].
|
|
472
449
|
|
|
473
450
|
## Extraction backends
|
|
474
451
|
|
|
475
|
-
These extractors are built in. Optional ones require extra dependencies.
|
|
452
|
+
These extractors are built in. Optional ones require extra dependencies. See [text extraction documentation][text-extraction] for details.
|
|
453
|
+
|
|
454
|
+
### Text and document extraction
|
|
455
|
+
|
|
456
|
+
- [`pass-through-text`](docs/extractors/text-document/pass-through.md) reads text items and strips Markdown front matter.
|
|
457
|
+
- [`metadata-text`](docs/extractors/text-document/metadata.md) turns catalog metadata into a small text artifact.
|
|
458
|
+
- [`pdf-text`](docs/extractors/text-document/pdf.md) extracts text from Portable Document Format items with `pypdf`.
|
|
459
|
+
- [`unstructured`](docs/extractors/text-document/unstructured.md) provides broad document parsing (optional).
|
|
460
|
+
- [`markitdown`](docs/extractors/text-document/markitdown.md) converts many formats into Markdown-like text (optional).
|
|
461
|
+
|
|
462
|
+
### Optical character recognition
|
|
463
|
+
|
|
464
|
+
- [`ocr-rapidocr`](docs/extractors/ocr/rapidocr.md) does optical character recognition on images (optional).
|
|
465
|
+
- [`ocr-paddleocr-vl`](docs/extractors/ocr/paddleocr-vl.md) does advanced optical character recognition with the PaddleOCR vision-language model (optional).
|
|
466
|
+
|
|
467
|
+
### Vision-language models
|
|
468
|
+
|
|
469
|
+
- [`docling-smol`](docs/extractors/vlm-document/docling-smol.md) uses the SmolDocling-256M vision-language model for fast document understanding (optional).
|
|
470
|
+
- [`docling-granite`](docs/extractors/vlm-document/docling-granite.md) uses the Granite Docling-258M vision-language model for high-accuracy extraction (optional).
|
|
471
|
+
|
|
472
|
+
### Speech to text
|
|
473
|
+
|
|
474
|
+
- [`stt-openai`](docs/extractors/speech-to-text/openai.md) performs speech to text on audio using OpenAI (optional).
|
|
475
|
+
- [`stt-deepgram`](docs/extractors/speech-to-text/deepgram.md) performs speech to text on audio using Deepgram (optional).
|
|
476
|
+
|
|
477
|
+
### Pipeline utilities
|
|
478
|
+
|
|
479
|
+
- [`select-text`](docs/extractors/pipeline-utilities/select-text.md) chooses one prior extraction result in a pipeline.
|
|
480
|
+
- [`select-longest-text`](docs/extractors/pipeline-utilities/select-longest.md) chooses the longest prior extraction result.
|
|
481
|
+
- [`select-override`](docs/extractors/pipeline-utilities/select-override.md) chooses the last extraction result for matching media types in a pipeline.
|
|
482
|
+
- [`select-smart-override`](docs/extractors/pipeline-utilities/select-smart-override.md) intelligently chooses between extraction results based on confidence and content quality.
|
|
483
|
+
|
|
484
|
+
For detailed documentation on all extractors, see the [Extractor Reference][extractor-reference].
|
|
485
|
+
|
|
486
|
+
## Topic modeling analysis
|
|
487
|
+
|
|
488
|
+
Biblicus can run analysis pipelines on extracted text without changing the raw corpus. Topic modeling is the first
|
|
489
|
+
analysis backend. It reads an extraction run, optionally applies an LLM-driven extraction pass, applies lexical
|
|
490
|
+
processing, runs BERTopic, and optionally applies an LLM fine-tuning pass to label topics. The output is structured
|
|
491
|
+
JavaScript Object Notation.
|
|
492
|
+
|
|
493
|
+
Run a topic analysis using a recipe file:
|
|
494
|
+
|
|
495
|
+
```
|
|
496
|
+
biblicus analyze topics --corpus corpora/example --recipe recipes/topic-modeling.yml --extraction-run pipeline:<run_id>
|
|
497
|
+
```
|
|
498
|
+
|
|
499
|
+
If `--extraction-run` is omitted, Biblicus uses the most recent extraction run and emits a warning about
|
|
500
|
+
reproducibility. The analysis output is stored under:
|
|
501
|
+
|
|
502
|
+
```
|
|
503
|
+
.biblicus/runs/analysis/topic-modeling/<run_id>/output.json
|
|
504
|
+
```
|
|
505
|
+
|
|
506
|
+
Minimal recipe example:
|
|
507
|
+
|
|
508
|
+
```yaml
|
|
509
|
+
schema_version: 1
|
|
510
|
+
text_source:
|
|
511
|
+
sample_size: 200
|
|
512
|
+
llm_extraction:
|
|
513
|
+
enabled: false
|
|
514
|
+
lexical_processing:
|
|
515
|
+
enabled: true
|
|
516
|
+
lowercase: true
|
|
517
|
+
strip_punctuation: false
|
|
518
|
+
collapse_whitespace: true
|
|
519
|
+
bertopic_analysis:
|
|
520
|
+
parameters:
|
|
521
|
+
min_topic_size: 8
|
|
522
|
+
nr_topics: 10
|
|
523
|
+
llm_fine_tuning:
|
|
524
|
+
enabled: false
|
|
525
|
+
```
|
|
526
|
+
|
|
527
|
+
LLM extraction and fine-tuning require `biblicus[openai]` and a configured OpenAI API key.
|
|
528
|
+
Recipe files are validated strictly against the topic modeling schema, so type mismatches or unknown fields are errors.
|
|
529
|
+
|
|
530
|
+
For a repeatable, real-world integration run that downloads a Wikipedia corpus and executes topic modeling, use:
|
|
531
|
+
|
|
532
|
+
```
|
|
533
|
+
python3 scripts/topic_modeling_integration.py --corpus corpora/wiki_demo --force
|
|
534
|
+
```
|
|
476
535
|
|
|
477
|
-
|
|
478
|
-
- `metadata-text` turns catalog metadata into a small text artifact.
|
|
479
|
-
- `pdf-text` extracts text from Portable Document Format items with `pypdf`.
|
|
480
|
-
- `select-text` chooses one prior extraction result in a pipeline.
|
|
481
|
-
- `select-longest-text` chooses the longest prior extraction result.
|
|
482
|
-
- `ocr-rapidocr` does optical character recognition on images (optional).
|
|
483
|
-
- `stt-openai` performs speech to text on audio (optional).
|
|
484
|
-
- `unstructured` provides broad document parsing (optional).
|
|
485
|
-
- `markitdown` converts many formats into Markdown-like text (optional).
|
|
536
|
+
See `docs/TOPIC_MODELING.md` for parameter examples and per-topic output behavior.
|
|
486
537
|
|
|
487
538
|
## Integration corpus and evaluation dataset
|
|
488
539
|
|
|
@@ -539,6 +590,9 @@ License terms are in `LICENSE`.
|
|
|
539
590
|
[corpus]: docs/CORPUS.md
|
|
540
591
|
[knowledge-base]: docs/KNOWLEDGE_BASE.md
|
|
541
592
|
[text-extraction]: docs/EXTRACTION.md
|
|
593
|
+
[extractor-reference]: docs/extractors/index.md
|
|
594
|
+
[backend-reference]: docs/backends/index.md
|
|
595
|
+
[speech-to-text]: docs/STT.md
|
|
542
596
|
[user-configuration]: docs/USER_CONFIGURATION.md
|
|
543
597
|
[backends]: docs/BACKENDS.md
|
|
544
598
|
[context-packs]: docs/CONTEXT_PACK.md
|
|
@@ -3,6 +3,8 @@
|
|
|
3
3
|
Backends are pluggable engines that implement a small, stable interface.
|
|
4
4
|
The goal is to make new retrieval ideas easy to test without reshaping the corpus.
|
|
5
5
|
|
|
6
|
+
For user documentation on available backends, see the [Backend Reference](backends/index.md).
|
|
7
|
+
|
|
6
8
|
## Backend contract
|
|
7
9
|
|
|
8
10
|
Backends implement two operations:
|
|
@@ -185,6 +185,28 @@ python3 -m biblicus extract build --corpus corpora/demo --step pass-through-text
|
|
|
185
185
|
|
|
186
186
|
The output includes a `run_id` you can reuse when building a retrieval backend.
|
|
187
187
|
|
|
188
|
+
### Topic modeling integration run
|
|
189
|
+
|
|
190
|
+
Use the integration script to download a Wikipedia corpus, run extraction, and run topic modeling with a single command.
|
|
191
|
+
|
|
192
|
+
```
|
|
193
|
+
python3 scripts/topic_modeling_integration.py --corpus corpora/wiki_demo --force
|
|
194
|
+
```
|
|
195
|
+
|
|
196
|
+
Run with a smaller corpus and a higher topic count:
|
|
197
|
+
|
|
198
|
+
```
|
|
199
|
+
python3 scripts/topic_modeling_integration.py \
|
|
200
|
+
--corpus corpora/wiki_demo \
|
|
201
|
+
--force \
|
|
202
|
+
--limit 20 \
|
|
203
|
+
--bertopic-param nr_topics=8 \
|
|
204
|
+
--bertopic-param min_topic_size=2
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
The command prints the analysis run identifier and the output path. Open the `output.json` file to inspect per-topic labels,
|
|
208
|
+
keywords, and document examples.
|
|
209
|
+
|
|
188
210
|
### Select extracted text within a pipeline
|
|
189
211
|
|
|
190
212
|
When you want an explicit choice among multiple extraction outputs, add a selection extractor step at the end of the pipeline.
|
|
@@ -1,9 +1,11 @@
|
|
|
1
|
-
# Text
|
|
1
|
+
# Text Extraction Pipeline
|
|
2
2
|
|
|
3
3
|
Text extraction is a separate pipeline stage that produces derived text artifacts under a corpus.
|
|
4
4
|
|
|
5
5
|
This separation matters because it lets you combine extraction choices and retrieval backends independently.
|
|
6
6
|
|
|
7
|
+
For detailed documentation on specific extractors, see the [Extractor Reference](extractors/index.md).
|
|
8
|
+
|
|
7
9
|
## What extraction produces
|
|
8
10
|
|
|
9
11
|
An extraction run produces:
|
|
@@ -31,99 +33,42 @@ corpus/
|
|
|
31
33
|
<item id>.txt
|
|
32
34
|
```
|
|
33
35
|
|
|
34
|
-
##
|
|
35
|
-
|
|
36
|
-
Version zero includes a small set of deterministic extractors.
|
|
37
|
-
|
|
38
|
-
`pass-through-text`
|
|
39
|
-
|
|
40
|
-
- Reads text items and returns their content
|
|
41
|
-
- For Markdown items, it strips YAML front matter and returns only the body
|
|
42
|
-
- Skips non text items
|
|
43
|
-
|
|
44
|
-
`metadata-text`
|
|
45
|
-
|
|
46
|
-
- Builds a small text representation from catalog metadata
|
|
47
|
-
- This is useful when you have a non text item with meaningful tags or a title
|
|
36
|
+
## Available Extractors
|
|
48
37
|
|
|
49
|
-
|
|
38
|
+
Biblicus provides 16 built-in extractors organized by category:
|
|
50
39
|
|
|
51
|
-
|
|
52
|
-
- Skips items that are not Portable Document Format
|
|
53
|
-
- Uses the `pypdf` library
|
|
54
|
-
- Produces empty output for scanned Portable Document Format files that contain no extractable text without optical character recognition
|
|
40
|
+
### Text & Document Processing
|
|
55
41
|
|
|
56
|
-
`
|
|
42
|
+
- [`pass-through-text`](extractors/text-document/pass-through.md) - Direct text file reading
|
|
43
|
+
- [`metadata-text`](extractors/text-document/metadata.md) - Text from item metadata
|
|
44
|
+
- [`pdf-text`](extractors/text-document/pdf.md) - PDF text extraction using pypdf
|
|
45
|
+
- [`markitdown`](extractors/text-document/markitdown.md) - Office documents via MarkItDown
|
|
46
|
+
- [`unstructured`](extractors/text-document/unstructured.md) - Universal document parsing
|
|
57
47
|
|
|
58
|
-
|
|
59
|
-
- This is used when you have multiple pipeline steps that can produce usable text for the same items and you want one chosen result
|
|
60
|
-
- Records which step supplied the selected text
|
|
48
|
+
### Optical Character Recognition
|
|
61
49
|
|
|
62
|
-
`
|
|
50
|
+
- [`ocr-rapidocr`](extractors/ocr/rapidocr.md) - Fast ONNX-based OCR
|
|
51
|
+
- [`ocr-paddleocr-vl`](extractors/ocr/paddleocr-vl.md) - Advanced OCR with VL model
|
|
63
52
|
|
|
64
|
-
-
|
|
65
|
-
- Intended as a last-resort extractor for non-text items when more specific extractors cannot produce usable text
|
|
66
|
-
- Skips items that are already text so the pass-through extractor remains the canonical choice for text items
|
|
53
|
+
### Vision-Language Models
|
|
67
54
|
|
|
68
|
-
|
|
55
|
+
- [`docling-smol`](extractors/vlm-document/docling-smol.md) - SmolDocling-256M for fast document processing
|
|
56
|
+
- [`docling-granite`](extractors/vlm-document/docling-granite.md) - Granite Docling-258M for high-accuracy extraction
|
|
69
57
|
|
|
70
|
-
|
|
71
|
-
python3 -m pip install "biblicus[unstructured]"
|
|
72
|
-
```
|
|
73
|
-
|
|
74
|
-
`markitdown`
|
|
58
|
+
### Speech-to-Text
|
|
75
59
|
|
|
76
|
-
-
|
|
77
|
-
-
|
|
78
|
-
- Requires Python 3.10 or higher
|
|
79
|
-
- Skips items that are already text so the pass-through extractor remains the canonical choice for text items
|
|
80
|
-
- This means it will not process `text/html` or other text media types unless that policy changes
|
|
81
|
-
|
|
82
|
-
To install:
|
|
83
|
-
|
|
84
|
-
```
|
|
85
|
-
python3 -m pip install "biblicus[markitdown]"
|
|
86
|
-
```
|
|
87
|
-
|
|
88
|
-
Example:
|
|
89
|
-
|
|
90
|
-
```
|
|
91
|
-
python3 -m biblicus extract build --corpus corpora/extraction-demo \\
|
|
92
|
-
--step markitdown
|
|
93
|
-
```
|
|
94
|
-
|
|
95
|
-
`ocr-rapidocr`
|
|
96
|
-
|
|
97
|
-
- Optical character recognition for image items
|
|
98
|
-
- Backed by the optional `rapidocr-onnxruntime` dependency
|
|
99
|
-
- Intended as a practical default when you need text from images without running a service
|
|
100
|
-
|
|
101
|
-
To install:
|
|
102
|
-
|
|
103
|
-
```
|
|
104
|
-
python3 -m pip install "biblicus[ocr]"
|
|
105
|
-
```
|
|
60
|
+
- [`stt-openai`](extractors/speech-to-text/openai.md) - OpenAI Whisper API
|
|
61
|
+
- [`stt-deepgram`](extractors/speech-to-text/deepgram.md) - Deepgram Nova-3 API
|
|
106
62
|
|
|
107
|
-
|
|
63
|
+
### Pipeline Utilities
|
|
108
64
|
|
|
109
|
-
-
|
|
110
|
-
-
|
|
111
|
-
-
|
|
65
|
+
- [`select-text`](extractors/pipeline-utilities/select-text.md) - First successful extractor
|
|
66
|
+
- [`select-longest-text`](extractors/pipeline-utilities/select-longest.md) - Longest output selection
|
|
67
|
+
- [`select-override`](extractors/pipeline-utilities/select-override.md) - Per-item override by ID
|
|
68
|
+
- [`select-smart-override`](extractors/pipeline-utilities/select-smart-override.md) - Media type-based routing
|
|
69
|
+
- [`pipeline`](extractors/pipeline-utilities/pipeline.md) - Multi-step extraction workflow
|
|
112
70
|
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
```
|
|
116
|
-
python3 -m pip install "biblicus[openai]"
|
|
117
|
-
```
|
|
118
|
-
|
|
119
|
-
To configure:
|
|
120
|
-
|
|
121
|
-
- Create `~/.biblicus/config.yml` or `./.biblicus/config.yml` with:
|
|
122
|
-
|
|
123
|
-
```
|
|
124
|
-
openai:
|
|
125
|
-
api_key: YOUR_KEY_HERE
|
|
126
|
-
```
|
|
71
|
+
For detailed documentation including configuration options, usage examples, and best practices, see the [Extractor Reference](extractors/index.md).
|
|
127
72
|
|
|
128
73
|
## How selection chooses text
|
|
129
74
|
|
|
@@ -131,12 +76,12 @@ The `select-text` extractor does not attempt to judge extraction quality. It cho
|
|
|
131
76
|
|
|
132
77
|
Usable means non-empty after stripping whitespace.
|
|
133
78
|
|
|
134
|
-
This means selection does not automatically choose the longest extracted text or the extraction with the most content. If you want a scoring rule such as choose the longest extracted text,
|
|
79
|
+
This means selection does not automatically choose the longest extracted text or the extraction with the most content. If you want a scoring rule, such as choosing the longest extracted text, use the [`select-longest-text`](extractors/pipeline-utilities/select-longest.md) extractor instead.
|
|
135
80
|
|
|
136
|
-
|
|
81
|
+
Other selection strategies include:
|
|
137
82
|
|
|
138
|
-
-
|
|
139
|
-
-
|
|
83
|
+
- [`select-override`](extractors/pipeline-utilities/select-override.md) - Override extraction for specific items by ID
|
|
84
|
+
- [`select-smart-override`](extractors/pipeline-utilities/select-smart-override.md) - Route items based on media type patterns
|
|
140
85
|
|
|
141
86
|
## Pipeline extractor
|
|
142
87
|
|
|
@@ -146,6 +91,8 @@ The pipeline runs every step in order and records all step outputs. Each step re
|
|
|
146
91
|
|
|
147
92
|
This lets you build explicit extraction policies while keeping every step outcome available for comparison and metrics.
|
|
148
93
|
|
|
94
|
+
For details, see the [`pipeline` extractor documentation](extractors/pipeline-utilities/pipeline.md).
|
|
95
|
+
|
|
149
96
|
## Complementary versus competing extractors
|
|
150
97
|
|
|
151
98
|
The pipeline is designed for complementary steps that do not overlap much in what they handle.
|
|
@@ -169,9 +116,9 @@ python3 -m biblicus init corpora/extraction-demo
|
|
|
169
116
|
printf 'x' > /tmp/image.png
|
|
170
117
|
python3 -m biblicus ingest --corpus corpora/extraction-demo /tmp/image.png --tag extracted
|
|
171
118
|
|
|
172
|
-
python3 -m biblicus extract build --corpus corpora/extraction-demo
|
|
173
|
-
--step pass-through-text
|
|
174
|
-
--step pdf-text
|
|
119
|
+
python3 -m biblicus extract build --corpus corpora/extraction-demo \
|
|
120
|
+
--step pass-through-text \
|
|
121
|
+
--step pdf-text \
|
|
175
122
|
--step metadata-text
|
|
176
123
|
```
|
|
177
124
|
|
|
@@ -182,14 +129,38 @@ The extracted text for the image comes from the `metadata-text` step because the
|
|
|
182
129
|
Selection is a pipeline step that chooses extracted text from previous pipeline steps. Selection is just another extractor in the pipeline, and it decides which prior output to carry forward.
|
|
183
130
|
|
|
184
131
|
```
|
|
185
|
-
python3 -m biblicus extract build --corpus corpora/extraction-demo
|
|
186
|
-
--step pass-through-text
|
|
187
|
-
--step metadata-text
|
|
132
|
+
python3 -m biblicus extract build --corpus corpora/extraction-demo \
|
|
133
|
+
--step pass-through-text \
|
|
134
|
+
--step metadata-text \
|
|
188
135
|
--step select-text
|
|
189
136
|
```
|
|
190
137
|
|
|
191
138
|
The pipeline run produces one extraction run under `pipeline`. You can point retrieval backends at that run.
|
|
192
139
|
|
|
140
|
+
## Example: PDF with OCR fallback
|
|
141
|
+
|
|
142
|
+
Try text extraction first, then fall back to OCR for scanned documents:
|
|
143
|
+
|
|
144
|
+
```
|
|
145
|
+
python3 -m biblicus extract build --corpus corpora/extraction-demo \
|
|
146
|
+
--step pdf-text \
|
|
147
|
+
--step ocr-rapidocr \
|
|
148
|
+
--step select-text
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
This pipeline runs `pdf-text` first to handle PDFs with text layers, runs `ocr-rapidocr` as the fallback for scanned PDFs, and uses `select-text` to pick the first usable result.
|
|
152
|
+
|
|
153
|
+
## Example: VLM for complex documents
|
|
154
|
+
|
|
155
|
+
Use vision-language models for documents with complex layouts:
|
|
156
|
+
|
|
157
|
+
```
|
|
158
|
+
python3 -m biblicus extract build --corpus corpora/extraction-demo \
|
|
159
|
+
--step docling-granite
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
The `docling-granite` extractor uses IBM Research's Granite Docling-258M VLM for high-accuracy extraction of tables, code blocks, and equations.
|
|
163
|
+
|
|
193
164
|
## Inspecting and deleting extraction runs
|
|
194
165
|
|
|
195
166
|
Extraction runs are stored under the corpus and can be listed and inspected.
|
|
@@ -202,8 +173,8 @@ python3 -m biblicus extract show --corpus corpora/extraction-demo --run pipeline
|
|
|
202
173
|
Deletion is explicit and requires typing the exact run reference as confirmation:
|
|
203
174
|
|
|
204
175
|
```
|
|
205
|
-
python3 -m biblicus extract delete --corpus corpora/extraction-demo
|
|
206
|
-
--run pipeline:EXTRACTION_RUN_ID
|
|
176
|
+
python3 -m biblicus extract delete --corpus corpora/extraction-demo \
|
|
177
|
+
--run pipeline:EXTRACTION_RUN_ID \
|
|
207
178
|
--confirm pipeline:EXTRACTION_RUN_ID
|
|
208
179
|
```
|
|
209
180
|
|
|
@@ -212,7 +183,7 @@ python3 -m biblicus extract delete --corpus corpora/extraction-demo \\
|
|
|
212
183
|
Retrieval backends can build and query using a selected extraction run. This is configured by passing `extraction_run=extractor_id:run_id` to the backend build command.
|
|
213
184
|
|
|
214
185
|
```
|
|
215
|
-
python3 -m biblicus build --corpus corpora/extraction-demo --backend sqlite-full-text-search
|
|
186
|
+
python3 -m biblicus build --corpus corpora/extraction-demo --backend sqlite-full-text-search \
|
|
216
187
|
--config extraction_run=pipeline:EXTRACTION_RUN_ID
|
|
217
188
|
python3 -m biblicus query --corpus corpora/extraction-demo --query extracted
|
|
218
189
|
```
|