biblicus 1.0.0__tar.gz → 1.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {biblicus-1.0.0/src/biblicus.egg-info → biblicus-1.1.1}/PKG-INFO +52 -43
- {biblicus-1.0.0 → biblicus-1.1.1}/README.md +51 -42
- {biblicus-1.0.0 → biblicus-1.1.1}/docs/CHUNKING.md +1 -1
- {biblicus-1.0.0 → biblicus-1.1.1}/docs/CORPUS.md +2 -2
- {biblicus-1.0.0 → biblicus-1.1.1}/docs/PROFILING.md +17 -17
- biblicus-1.0.0/docs/ANALYSIS.md → biblicus-1.1.1/docs/analysis.md +28 -28
- biblicus-1.1.1/docs/architecture.md +107 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/docs/backends/embedding-index-file.md +2 -2
- {biblicus-1.0.0 → biblicus-1.1.1}/docs/backends/embedding-index-inmemory.md +2 -2
- {biblicus-1.0.0 → biblicus-1.1.1}/docs/backends/index.md +20 -20
- {biblicus-1.0.0 → biblicus-1.1.1}/docs/backends/scan.md +21 -21
- {biblicus-1.0.0 → biblicus-1.1.1}/docs/backends/sqlite-full-text-search.md +22 -22
- {biblicus-1.0.0 → biblicus-1.1.1}/docs/backends/tf-vector.md +5 -5
- biblicus-1.0.0/docs/BACKENDS.md → biblicus-1.1.1/docs/backends.md +7 -7
- {biblicus-1.0.0 → biblicus-1.1.1}/docs/conf.py +3 -1
- biblicus-1.0.0/docs/CONTEXT_ENGINE.md → biblicus-1.1.1/docs/context-engine.md +28 -3
- biblicus-1.0.0/docs/CONTEXT_PACK.md → biblicus-1.1.1/docs/context-pack.md +1 -1
- biblicus-1.0.0/docs/CORPUS_DESIGN.md → biblicus-1.1.1/docs/corpus-design.md +13 -13
- biblicus-1.0.0/docs/DEMOS.md → biblicus-1.1.1/docs/demos.md +44 -131
- biblicus-1.1.1/docs/embedding-retrieval.md +68 -0
- biblicus-1.0.0/docs/EXTRACTION_EVALUATION.md → biblicus-1.1.1/docs/extraction-evaluation.md +13 -13
- biblicus-1.0.0/docs/EXTRACTION.md → biblicus-1.1.1/docs/extraction.md +19 -19
- {biblicus-1.0.0 → biblicus-1.1.1}/docs/extractors/index.md +1 -1
- {biblicus-1.0.0 → biblicus-1.1.1}/docs/extractors/ocr/paddleocr-vl.md +4 -4
- {biblicus-1.0.0 → biblicus-1.1.1}/docs/extractors/ocr/rapidocr.md +3 -3
- {biblicus-1.0.0 → biblicus-1.1.1}/docs/extractors/pipeline-utilities/pipeline.md +8 -8
- {biblicus-1.0.0 → biblicus-1.1.1}/docs/extractors/pipeline-utilities/select-longest.md +4 -4
- {biblicus-1.0.0 → biblicus-1.1.1}/docs/extractors/pipeline-utilities/select-override.md +4 -4
- {biblicus-1.0.0 → biblicus-1.1.1}/docs/extractors/pipeline-utilities/select-smart-override.md +4 -4
- {biblicus-1.0.0 → biblicus-1.1.1}/docs/extractors/pipeline-utilities/select-text.md +4 -4
- {biblicus-1.0.0 → biblicus-1.1.1}/docs/extractors/speech-to-text/deepgram.md +4 -4
- {biblicus-1.0.0 → biblicus-1.1.1}/docs/extractors/speech-to-text/openai.md +4 -4
- {biblicus-1.0.0 → biblicus-1.1.1}/docs/extractors/text-document/markitdown.md +3 -3
- {biblicus-1.0.0 → biblicus-1.1.1}/docs/extractors/text-document/metadata.md +4 -4
- {biblicus-1.0.0 → biblicus-1.1.1}/docs/extractors/text-document/pass-through.md +5 -5
- {biblicus-1.0.0 → biblicus-1.1.1}/docs/extractors/text-document/pdf.md +3 -3
- {biblicus-1.0.0 → biblicus-1.1.1}/docs/extractors/text-document/unstructured.md +3 -3
- {biblicus-1.0.0 → biblicus-1.1.1}/docs/extractors/vlm-document/docling-granite.md +4 -4
- {biblicus-1.0.0 → biblicus-1.1.1}/docs/extractors/vlm-document/docling-smol.md +4 -4
- biblicus-1.0.0/docs/FEATURE_INDEX.md → biblicus-1.1.1/docs/feature-index.md +29 -29
- {biblicus-1.0.0 → biblicus-1.1.1}/docs/index.rst +39 -42
- biblicus-1.0.0/docs/KNOWLEDGE_BASE.md → biblicus-1.1.1/docs/knowledge-base.md +5 -5
- biblicus-1.0.0/docs/MARKOV_ANALYSIS.md → biblicus-1.1.1/docs/markov-analysis.md +27 -22
- biblicus-1.0.0/docs/RETRIEVAL_EVALUATION.md → biblicus-1.1.1/docs/retrieval-evaluation.md +15 -15
- biblicus-1.0.0/docs/RETRIEVAL_QUALITY.md → biblicus-1.1.1/docs/retrieval-quality.md +5 -5
- biblicus-1.0.0/docs/RETRIEVAL.md → biblicus-1.1.1/docs/retrieval.md +12 -12
- biblicus-1.0.0/docs/ROADMAP.md → biblicus-1.1.1/docs/roadmap.md +3 -3
- biblicus-1.0.0/docs/TEXT_ANNOTATE.md → biblicus-1.1.1/docs/text-annotate.md +39 -9
- biblicus-1.0.0/docs/TEXT_EXTRACT.md → biblicus-1.1.1/docs/text-extract.md +105 -55
- biblicus-1.0.0/docs/TEXT_LINK.md → biblicus-1.1.1/docs/text-link.md +18 -8
- biblicus-1.0.0/docs/TEXT_REDACT.md → biblicus-1.1.1/docs/text-redact.md +28 -13
- biblicus-1.0.0/docs/TEXT_SLICE.md → biblicus-1.1.1/docs/text-slice.md +44 -24
- biblicus-1.1.1/docs/text-utilities.md +414 -0
- biblicus-1.0.0/docs/TOPIC_MODELING.md → biblicus-1.1.1/docs/topic-modeling.md +13 -13
- {biblicus-1.0.0 → biblicus-1.1.1}/docs/use_cases/sequence_markov.md +7 -7
- {biblicus-1.0.0 → biblicus-1.1.1}/docs/use_cases/text_folder_search.md +1 -1
- biblicus-1.0.0/docs/UTILITIES.md → biblicus-1.1.1/docs/utilities.md +3 -3
- {biblicus-1.0.0 → biblicus-1.1.1}/features/89_context_engine_internal_branches.feature +21 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/90_embedding_index_evidence_fallback.feature +2 -2
- {biblicus-1.0.0 → biblicus-1.1.1}/features/analysis_schema.feature +6 -6
- biblicus-1.1.1/features/backend_validation.feature +14 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/cli_entrypoint.feature +1 -1
- {biblicus-1.0.0 → biblicus-1.1.1}/features/cli_step_spec_parsing.feature +5 -5
- biblicus-1.1.1/features/context_engine_retrieval_internal_branches.feature +6 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/context_engine_retrieve_context_pack.feature +10 -10
- {biblicus-1.0.0 → biblicus-1.1.1}/features/context_pack_cli.feature +5 -5
- {biblicus-1.0.0 → biblicus-1.1.1}/features/corpus_edge_cases.feature +3 -3
- {biblicus-1.0.0 → biblicus-1.1.1}/features/corpus_purge.feature +4 -4
- {biblicus-1.0.0 → biblicus-1.1.1}/features/docling_granite_extractor.feature +36 -36
- {biblicus-1.0.0 → biblicus-1.1.1}/features/docling_smol_extractor.feature +36 -36
- {biblicus-1.0.0 → biblicus-1.1.1}/features/embedding_retrieval.feature +47 -47
- {biblicus-1.0.0 → biblicus-1.1.1}/features/error_cases.feature +36 -36
- {biblicus-1.0.0 → biblicus-1.1.1}/features/evaluation.feature +13 -13
- {biblicus-1.0.0 → biblicus-1.1.1}/features/extraction_error_handling.feature +10 -10
- {biblicus-1.0.0 → biblicus-1.1.1}/features/extraction_evaluation.feature +28 -28
- {biblicus-1.0.0 → biblicus-1.1.1}/features/extraction_evaluation_lab.feature +1 -1
- biblicus-1.1.1/features/extraction_run_lifecycle.feature +117 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/extraction_selection.feature +8 -8
- {biblicus-1.0.0 → biblicus-1.1.1}/features/extraction_selection_longest.feature +7 -7
- {biblicus-1.0.0 → biblicus-1.1.1}/features/extractor_pipeline.feature +15 -15
- {biblicus-1.0.0 → biblicus-1.1.1}/features/import_tree.feature +3 -3
- {biblicus-1.0.0 → biblicus-1.1.1}/features/inference_backend.feature +12 -12
- {biblicus-1.0.0 → biblicus-1.1.1}/features/integration_audio_samples.feature +2 -2
- {biblicus-1.0.0 → biblicus-1.1.1}/features/integration_mixed_extraction.feature +6 -6
- {biblicus-1.0.0 → biblicus-1.1.1}/features/integration_ocr_image_extraction.feature +4 -4
- {biblicus-1.0.0 → biblicus-1.1.1}/features/integration_pdf_retrieval.feature +3 -3
- {biblicus-1.0.0 → biblicus-1.1.1}/features/integration_text_annotate.feature +2 -2
- {biblicus-1.0.0 → biblicus-1.1.1}/features/integration_text_extract.feature +2 -2
- {biblicus-1.0.0 → biblicus-1.1.1}/features/integration_unstructured_extraction.feature +2 -2
- {biblicus-1.0.0 → biblicus-1.1.1}/features/integration_use_cases.feature +1 -1
- {biblicus-1.0.0 → biblicus-1.1.1}/features/integration_use_cases_sequence_markov.feature +2 -2
- {biblicus-1.0.0 → biblicus-1.1.1}/features/markitdown_extractor.feature +24 -24
- {biblicus-1.0.0 → biblicus-1.1.1}/features/markov_analysis.feature +4 -4
- {biblicus-1.0.0 → biblicus-1.1.1}/features/markov_analysis_categorical.feature +3 -3
- {biblicus-1.0.0 → biblicus-1.1.1}/features/markov_analysis_llm.feature +4 -4
- {biblicus-1.0.0 → biblicus-1.1.1}/features/markov_analysis_topic_modeling.feature +4 -4
- {biblicus-1.0.0 → biblicus-1.1.1}/features/markov_analysis_variants.feature +70 -70
- {biblicus-1.0.0 → biblicus-1.1.1}/features/markov_internal_branches.feature +8 -8
- {biblicus-1.0.0 → biblicus-1.1.1}/features/markov_schema.feature +39 -39
- {biblicus-1.0.0 → biblicus-1.1.1}/features/ocr_extractor.feature +9 -9
- {biblicus-1.0.0 → biblicus-1.1.1}/features/paddleocr_vl_extractor.feature +32 -32
- {biblicus-1.0.0 → biblicus-1.1.1}/features/pdf_text_extraction.feature +13 -13
- {biblicus-1.0.0 → biblicus-1.1.1}/features/profiling.feature +35 -35
- {biblicus-1.0.0 → biblicus-1.1.1}/features/profiling_config_overrides.feature +4 -4
- {biblicus-1.0.0 → biblicus-1.1.1}/features/query_processing.feature +2 -2
- {biblicus-1.0.0 → biblicus-1.1.1}/features/recipe_cascading.feature +12 -12
- biblicus-1.1.1/features/recipe_file_extraction.feature +35 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/recipe_utilities.feature +2 -2
- biblicus-1.1.1/features/retrieval_build_recipes.feature +19 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/retrieval_evaluation_lab.feature +1 -1
- {biblicus-1.0.0 → biblicus-1.1.1}/features/retrieval_quality.feature +37 -37
- {biblicus-1.0.0 → biblicus-1.1.1}/features/retrieval_scan.feature +14 -14
- {biblicus-1.0.0 → biblicus-1.1.1}/features/retrieval_sqlite_full_text_search.feature +12 -12
- {biblicus-1.0.0 → biblicus-1.1.1}/features/retrieval_uses_extraction_run.feature +28 -28
- {biblicus-1.0.0 → biblicus-1.1.1}/features/retrieval_utilities.feature +5 -5
- {biblicus-1.0.0 → biblicus-1.1.1}/features/select_override.feature +10 -10
- {biblicus-1.0.0 → biblicus-1.1.1}/features/smart_override_selection.feature +27 -27
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/analysis_steps.py +28 -25
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/backend_steps.py +47 -40
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/cli_steps.py +11 -11
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/context_engine_full_paths_steps.py +8 -8
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/context_engine_internal_steps.py +200 -1
- biblicus-1.1.1/features/steps/context_engine_retrieval_internal_steps.py +114 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/context_engine_retrieve_context_pack_steps.py +24 -22
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/context_pack_steps.py +20 -20
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/docling_steps.py +6 -6
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/embedding_index_evidence_steps.py +25 -24
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/embedding_index_internal_steps.py +1 -1
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/embedding_retrieval_coverage_steps.py +42 -32
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/extraction_evaluation_lab_steps.py +1 -1
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/extraction_evaluation_steps.py +7 -7
- biblicus-1.1.1/features/steps/extraction_run_lifecycle_steps.py +156 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/extraction_steps.py +241 -193
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/extractor_steps.py +2 -2
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/markov_embeddings_error_steps.py +3 -3
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/markov_internal_steps.py +49 -49
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/markov_schema_steps.py +143 -111
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/markov_steps.py +69 -64
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/model_steps.py +2 -2
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/paddleocr_vl_steps.py +5 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/profiling_steps.py +82 -37
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/recipe_steps.py +5 -1
- biblicus-1.1.1/features/steps/retrieval_build_recipe_steps.py +66 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/retrieval_evaluation_lab_steps.py +3 -1
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/retrieval_quality_steps.py +28 -23
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/retrieval_steps.py +104 -76
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/text_annotate_steps.py +4 -2
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/text_extract_steps.py +24 -12
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/text_link_steps.py +4 -2
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/text_redact_steps.py +4 -2
- biblicus-1.1.1/features/steps/text_tool_loop_steps.py +138 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/tf_vector_internal_steps.py +1 -1
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/topic_modeling_steps.py +46 -34
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/use_cases_steps.py +3 -3
- {biblicus-1.0.0 → biblicus-1.1.1}/features/stt_deepgram_extractor.feature +13 -13
- {biblicus-1.0.0 → biblicus-1.1.1}/features/stt_extractor.feature +14 -14
- {biblicus-1.0.0 → biblicus-1.1.1}/features/text_extraction_runs.feature +29 -29
- {biblicus-1.0.0 → biblicus-1.1.1}/features/text_utilities.feature +26 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/topic_modeling.feature +117 -117
- {biblicus-1.0.0 → biblicus-1.1.1}/features/unstructured_extractor.feature +15 -15
- {biblicus-1.0.0 → biblicus-1.1.1}/features/use_cases.feature +3 -3
- {biblicus-1.0.0 → biblicus-1.1.1}/features/user_config.feature +2 -2
- {biblicus-1.0.0 → biblicus-1.1.1}/pyproject.toml +1 -1
- {biblicus-1.0.0 → biblicus-1.1.1}/scripts/extraction_evaluation_demo.py +12 -12
- {biblicus-1.0.0 → biblicus-1.1.1}/scripts/extraction_evaluation_lab.py +12 -12
- {biblicus-1.0.0 → biblicus-1.1.1}/scripts/markov_analysis_demo.py +77 -71
- {biblicus-1.0.0 → biblicus-1.1.1}/scripts/markov_cached_segments_demo.py +88 -76
- {biblicus-1.0.0 → biblicus-1.1.1}/scripts/markov_run_report.py +8 -8
- {biblicus-1.0.0 → biblicus-1.1.1}/scripts/profiling_demo.py +22 -22
- {biblicus-1.0.0 → biblicus-1.1.1}/scripts/readme_end_to_end_demo.py +11 -7
- {biblicus-1.0.0 → biblicus-1.1.1}/scripts/retrieval_evaluation_lab.py +20 -20
- {biblicus-1.0.0 → biblicus-1.1.1}/scripts/topic_modeling_integration.py +28 -28
- {biblicus-1.0.0 → biblicus-1.1.1}/scripts/use_cases/notes_to_context_pack_demo.py +10 -6
- {biblicus-1.0.0 → biblicus-1.1.1}/scripts/use_cases/sequence_markov_demo.py +37 -31
- {biblicus-1.0.0 → biblicus-1.1.1}/scripts/use_cases/text_folder_search_demo.py +14 -14
- {biblicus-1.0.0 → biblicus-1.1.1}/scripts/wikipedia_rag_demo.py +13 -13
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/__init__.py +5 -5
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/analysis/__init__.py +1 -1
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/analysis/base.py +10 -10
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/analysis/markov.py +78 -68
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/analysis/models.py +47 -47
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/analysis/profiling.py +58 -48
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/analysis/topic_modeling.py +56 -51
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/cli.py +224 -177
- biblicus-1.0.0/src/biblicus/recipes.py → biblicus-1.1.1/src/biblicus/configuration.py +14 -14
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/constants.py +2 -2
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/context_engine/assembler.py +49 -19
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/context_engine/retrieval.py +46 -42
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/corpus.py +116 -108
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/errors.py +3 -3
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/evaluation.py +27 -25
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/extraction.py +103 -98
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/extraction_evaluation.py +26 -26
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/extractors/deepgram_stt.py +7 -7
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/extractors/docling_granite_text.py +11 -11
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/extractors/docling_smol_text.py +11 -11
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/extractors/markitdown_text.py +4 -4
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/extractors/openai_stt.py +7 -7
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/extractors/paddleocr_vl_text.py +20 -18
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/extractors/pipeline.py +8 -8
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/extractors/rapidocr_text.py +3 -3
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/extractors/unstructured_text.py +3 -3
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/hooks.py +4 -4
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/knowledge_base.py +33 -31
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/models.py +78 -78
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/retrieval.py +47 -40
- biblicus-1.1.1/src/biblicus/retrievers/__init__.py +50 -0
- biblicus-1.1.1/src/biblicus/retrievers/base.py +65 -0
- {biblicus-1.0.0/src/biblicus/backends → biblicus-1.1.1/src/biblicus/retrievers}/embedding_index_common.py +44 -41
- {biblicus-1.0.0/src/biblicus/backends → biblicus-1.1.1/src/biblicus/retrievers}/embedding_index_file.py +87 -58
- {biblicus-1.0.0/src/biblicus/backends → biblicus-1.1.1/src/biblicus/retrievers}/embedding_index_inmemory.py +88 -59
- biblicus-1.1.1/src/biblicus/retrievers/hybrid.py +301 -0
- {biblicus-1.0.0/src/biblicus/backends → biblicus-1.1.1/src/biblicus/retrievers}/scan.py +83 -73
- {biblicus-1.0.0/src/biblicus/backends → biblicus-1.1.1/src/biblicus/retrievers}/sqlite_full_text_search.py +115 -101
- {biblicus-1.0.0/src/biblicus/backends → biblicus-1.1.1/src/biblicus/retrievers}/tf_vector.py +87 -77
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/text/prompts.py +16 -8
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/text/tool_loop.py +63 -5
- {biblicus-1.0.0 → biblicus-1.1.1/src/biblicus.egg-info}/PKG-INFO +52 -43
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus.egg-info/SOURCES.txt +42 -43
- biblicus-1.1.1/tests/test_text_extract_tool_calls.py +110 -0
- biblicus-1.1.1/tests/test_text_utility_tool_calls.py +314 -0
- biblicus-1.1.1/tests/test_tool_loop_safeguards.py +171 -0
- biblicus-1.0.0/docs/ARCHITECTURE.md +0 -46
- biblicus-1.0.0/docs/ARCHITECTURE_DETAIL.md +0 -267
- biblicus-1.0.0/docs/EMBEDDING_RETRIEVAL.md +0 -57
- biblicus-1.0.0/docs/PR_FAQ_CONTEXT_ENGINE.md +0 -43
- biblicus-1.0.0/docs/PR_FAQ_EMBEDDING_RETRIEVAL.md +0 -105
- biblicus-1.0.0/docs/PR_FAQ_TEXT_ANNOTATE.md +0 -118
- biblicus-1.0.0/docs/TEXT_UTILITIES.md +0 -137
- biblicus-1.0.0/features/backend_validation.feature +0 -14
- biblicus-1.0.0/features/context_engine_retrieval_internal_branches.feature +0 -6
- biblicus-1.0.0/features/extraction_run_lifecycle.feature +0 -117
- biblicus-1.0.0/features/recipe_file_extraction.feature +0 -35
- biblicus-1.0.0/features/retrieval_build_recipes.feature +0 -19
- biblicus-1.0.0/features/steps/context_engine_retrieval_internal_steps.py +0 -113
- biblicus-1.0.0/features/steps/extraction_run_lifecycle_steps.py +0 -152
- biblicus-1.0.0/features/steps/retrieval_build_recipe_steps.py +0 -64
- biblicus-1.0.0/features/steps/text_tool_loop_steps.py +0 -36
- biblicus-1.0.0/src/biblicus/backends/__init__.py +0 -50
- biblicus-1.0.0/src/biblicus/backends/base.py +0 -65
- biblicus-1.0.0/src/biblicus/backends/hybrid.py +0 -292
- {biblicus-1.0.0 → biblicus-1.1.1}/LICENSE +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/MANIFEST.in +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/THIRD_PARTY_NOTICES.md +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/datasets/extraction_lab/labels.json +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/datasets/retrieval_lab/labels.json +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/datasets/wikipedia_mini.json +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/docs/STT.md +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/docs/TESTING.md +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/docs/api.rst +0 -0
- /biblicus-1.0.0/docs/CONTEXT_ENGINE_DEMO.md → /biblicus-1.1.1/docs/context-engine-demo.md +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/docs/extractors/ocr/index.md +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/docs/extractors/pipeline-utilities/index.md +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/docs/extractors/speech-to-text/index.md +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/docs/extractors/text-document/index.md +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/docs/extractors/vlm-document/index.md +0 -0
- /biblicus-1.0.0/docs/USE_CASES.md → /biblicus-1.1.1/docs/use-cases.md +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/docs/use_cases/notes_to_context_pack.md +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/docs/use_cases/text_redact.md +0 -0
- /biblicus-1.0.0/docs/USER_CONFIGURATION.md → /biblicus-1.1.1/docs/user-configuration.md +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/70_context_retriever.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/71_context_compaction.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/72_context_history_compaction.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/73_context_nested_compaction.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/74_context_regeneration.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/75_context_default_regeneration.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/76_context_pack_budget_weights.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/77_context_default_pack_priority.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/78_context_default_pack_weights.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/79_context_nested_context_packs.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/80_context_nested_pack_budget_cap.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/81_context_nested_regeneration.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/82_context_explicit_regeneration.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/83_context_explicit_pack_priority.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/84_context_explicit_pack_weights.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/85_context_expansion.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/86_context_engine_errors.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/87_context_compactor_strategies.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/88_context_engine_model_validation.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/91_tf_vector_internal_branches.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/93_context_engine_full_paths.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/ai_llm.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/ai_models.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/biblicus_corpus.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/cli_parsing.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/content_sniffing.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/context_pack.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/context_pack_policies.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/corpus_identity.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/corpus_internal_branches.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/crawl.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/embedding_index_internal_branches.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/embeddings.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/environment.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/evidence_processing.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/extractor_validation.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/frontmatter.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/hook_config_validation.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/hook_error_handling.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/hook_logging_internal_branches.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/ingest_namespacing.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/ingest_sources.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/integration_image_samples.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/integration_mixed_corpus.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/integration_pdf_samples.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/integration_text_link.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/integration_text_redact.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/integration_text_slice.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/integration_wikipedia.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/knowledge_base.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/lifecycle_hooks.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/markov_embeddings_errors.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/markov_start_end_labels.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/model_validation.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/paddleocr_vl_parse_api_response.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/python_api.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/python_hook_logging.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/retrieval_budget.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/select_override_defaults.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/source_helper_internal_branches.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/source_loading.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/ai_llm_steps.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/ai_models_steps.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/cli_parsing_steps.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/context_compaction_steps.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/context_compactor_steps.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/context_default_pack_priority_steps.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/context_default_pack_weights_steps.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/context_default_regeneration_steps.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/context_engine_error_steps.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/context_engine_model_steps.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/context_engine_registry.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/context_engine_retriever.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/context_expansion_steps.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/context_explicit_pack_priority_steps.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/context_explicit_pack_weights_steps.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/context_explicit_regeneration_steps.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/context_history_compaction_steps.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/context_nested_compaction_steps.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/context_nested_context_packs_steps.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/context_nested_pack_budget_cap_steps.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/context_nested_regeneration_steps.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/context_pack_budget_steps.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/context_regeneration_steps.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/context_retriever_steps.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/corpus_internal_steps.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/crawl_steps.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/deepgram_steps.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/embeddings_steps.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/evidence_processing_steps.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/frontmatter_steps.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/hook_logging_steps.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/inference_steps.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/knowledge_base_steps.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/markitdown_steps.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/markov_start_end_steps.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/openai_steps.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/paddleocr_mock_steps.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/paddleocr_vl_unit_steps.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/pdf_steps.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/python_api_steps.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/rapidocr_steps.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/requests_mock_steps.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/select_override_defaults_steps.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/source_helper_steps.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/stt_deepgram_steps.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/stt_steps.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/text_internal_steps.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/text_link_internal_steps.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/text_mock_steps.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/text_slice_steps.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/unstructured_steps.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/user_config_steps.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/steps/wikitext_steps.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/streaming_ingest.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/text_annotate.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/text_extract.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/text_internal_branches.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/text_link.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/text_link_internal_branches.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/text_mock.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/text_redact.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/text_slice.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/features/token_budget.feature +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/scripts/demo_context_engine.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/scripts/download_ag_news.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/scripts/download_audio_samples.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/scripts/download_image_samples.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/scripts/download_mixed_samples.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/scripts/download_pdf_samples.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/scripts/download_wikipedia.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/scripts/test.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/scripts/use_cases/text_redact_demo.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/setup.cfg +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/__main__.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/_vendor/dotyaml/__init__.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/_vendor/dotyaml/interpolation.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/_vendor/dotyaml/loader.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/_vendor/dotyaml/transformer.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/ai/__init__.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/ai/embeddings.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/ai/llm.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/ai/models.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/analysis/schema.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/chunking.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/context.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/context_engine/__init__.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/context_engine/compaction.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/context_engine/models.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/crawl.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/embedding_providers.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/evidence_processing.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/extractors/__init__.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/extractors/base.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/extractors/metadata_text.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/extractors/pass_through_text.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/extractors/pdf_text.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/extractors/select_longest_text.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/extractors/select_override.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/extractors/select_smart_override.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/extractors/select_text.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/frontmatter.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/hook_logging.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/hook_manager.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/ignore.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/inference.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/sources.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/text/__init__.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/text/annotate.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/text/extract.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/text/link.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/text/markup.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/text/models.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/text/redact.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/text/slice.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/time.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/uris.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus/user_config.py +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus.egg-info/dependency_links.txt +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus.egg-info/entry_points.txt +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus.egg-info/requires.txt +0 -0
- {biblicus-1.0.0 → biblicus-1.1.1}/src/biblicus.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: biblicus
|
|
3
|
-
Version: 1.0.0
|
|
3
|
+
Version: 1.1.1
|
|
4
4
|
Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
|
|
5
5
|
License: MIT
|
|
6
6
|
Requires-Python: >=3.9
|
|
@@ -80,10 +80,10 @@ See [retrieval augmented generation overview] for a short introduction to the id
|
|
|
80
80
|
## Analysis highlights
|
|
81
81
|
|
|
82
82
|
- `biblicus analyze markov` learns a directed, weighted state transition graph over segmented text.
|
|
83
|
-
- YAML
|
|
83
|
+
- YAML configurations support cascading composition plus dotted `--config key=value` overrides.
|
|
84
84
|
- Text extract splits long texts with an LLM by inserting XML tags in-place for structured spans.
|
|
85
|
-
- See `docs/
|
|
86
|
-
- See `docs/
|
|
85
|
+
- See `docs/markov-analysis.md` for Markov analysis details and runnable demos.
|
|
86
|
+
- See `docs/text-extract.md` for the text extract utility and examples.
|
|
87
87
|
|
|
88
88
|
## Start with a knowledge base
|
|
89
89
|
|
|
@@ -167,7 +167,7 @@ sequenceDiagram
|
|
|
167
167
|
|
|
168
168
|
- You can ingest raw material once, then try many retrieval approaches over time.
|
|
169
169
|
- You can keep raw files readable and portable, without locking your data inside a database.
|
|
170
|
-
- You can evaluate retrieval
|
|
170
|
+
- You can evaluate retrieval snapshots against shared datasets and compare backends using the same corpus.
|
|
171
171
|
|
|
172
172
|
## Typical flow
|
|
173
173
|
|
|
@@ -176,7 +176,7 @@ sequenceDiagram
|
|
|
176
176
|
- Crawl a website section into corpus items when you want a repeatable “import from the web” workflow.
|
|
177
177
|
- Run extraction when you want derived text artifacts from non-text sources.
|
|
178
178
|
- Reindex to refresh the catalog after edits.
|
|
179
|
-
- Build a retrieval
|
|
179
|
+
- Build a retrieval snapshot with a backend.
|
|
180
180
|
- Query the run to collect evidence and evaluate it with datasets.
|
|
181
181
|
|
|
182
182
|
## Install
|
|
@@ -292,7 +292,7 @@ for note_title, note_text in notes:
|
|
|
292
292
|
corpus.ingest_note(note_text, title=note_title, tags=["memory"])
|
|
293
293
|
|
|
294
294
|
backend = get_backend("scan")
|
|
295
|
-
run = backend.build_run(corpus,
|
|
295
|
+
run = backend.build_run(corpus, configuration_name="Story demo", config={})
|
|
296
296
|
budget = QueryBudget(max_total_items=5, maximum_total_characters=2000, max_items_per_source=None)
|
|
297
297
|
result = backend.query(
|
|
298
298
|
corpus,
|
|
@@ -336,8 +336,8 @@ Example output:
|
|
|
336
336
|
"maximum_total_characters": 2000,
|
|
337
337
|
"max_items_per_source": null
|
|
338
338
|
},
|
|
339
|
-
"
|
|
340
|
-
"
|
|
339
|
+
"snapshot_id": "RUN_ID",
|
|
340
|
+
"configuration_id": "RECIPE_ID",
|
|
341
341
|
"backend_id": "scan",
|
|
342
342
|
"generated_at": "2026-01-29T00:00:00.000000Z",
|
|
343
343
|
"evidence": [
|
|
@@ -352,8 +352,8 @@ Example output:
|
|
|
352
352
|
"span_start": null,
|
|
353
353
|
"span_end": null,
|
|
354
354
|
"stage": "scan",
|
|
355
|
-
"
|
|
356
|
-
"
|
|
355
|
+
"configuration_id": "RECIPE_ID",
|
|
356
|
+
"snapshot_id": "RUN_ID",
|
|
357
357
|
"hash": null
|
|
358
358
|
}
|
|
359
359
|
],
|
|
@@ -422,7 +422,7 @@ flowchart TB
|
|
|
422
422
|
|
|
423
423
|
subgraph RowExtraction[Pluggable: extraction pipeline]
|
|
424
424
|
direction TB
|
|
425
|
-
Catalog --> Extract[Extract pipeline] --> ExtractedText[Extracted text artifacts] --> ExtractionRun[Extraction
|
|
425
|
+
Catalog --> Extract[Extract pipeline] --> ExtractedText[Extracted text artifacts] --> ExtractionRun[Extraction snapshot manifest]
|
|
426
426
|
end
|
|
427
427
|
|
|
428
428
|
subgraph RowRetrieval[Pluggable: retrieval backend]
|
|
@@ -484,7 +484,7 @@ From Python, the same flow is available through the Corpus class and backend int
|
|
|
484
484
|
- Ingest notes with `Corpus.ingest_note`.
|
|
485
485
|
- Ingest files or web addresses with `Corpus.ingest_source`.
|
|
486
486
|
- List items with `Corpus.list_items`.
|
|
487
|
-
- Build a retrieval
|
|
487
|
+
- Build a retrieval snapshot with `get_backend` and `backend.build_run`.
|
|
488
488
|
- Query a run with `backend.query`.
|
|
489
489
|
- Evaluate with `evaluate_run`.
|
|
490
490
|
|
|
@@ -530,13 +530,13 @@ corpus/
|
|
|
530
530
|
runs/
|
|
531
531
|
extraction/
|
|
532
532
|
pipeline/
|
|
533
|
-
<
|
|
533
|
+
<snapshot id>/
|
|
534
534
|
manifest.json
|
|
535
535
|
text/
|
|
536
536
|
<item id>.txt
|
|
537
537
|
retrieval/
|
|
538
538
|
<backend id>/
|
|
539
|
-
<
|
|
539
|
+
<snapshot id>/
|
|
540
540
|
manifest.json
|
|
541
541
|
```
|
|
542
542
|
|
|
@@ -552,9 +552,9 @@ For detailed documentation including configuration options, performance characte
|
|
|
552
552
|
|
|
553
553
|
## Retrieval documentation
|
|
554
554
|
|
|
555
|
-
For the retrieval pipeline overview and
|
|
556
|
-
(tuned lexical baseline, reranking, hybrid retrieval), see `docs/
|
|
557
|
-
and dataset formats, see `docs/
|
|
555
|
+
For the retrieval pipeline overview and snapshot artifacts, see `docs/retrieval.md`. For retrieval quality upgrades
|
|
556
|
+
(tuned lexical baseline, reranking, hybrid retrieval), see `docs/retrieval-quality.md`. For evaluation workflows
|
|
557
|
+
and dataset formats, see `docs/retrieval-evaluation.md`. For a runnable walkthrough, use the retrieval evaluation lab
|
|
558
558
|
script (`scripts/retrieval_evaluation_lab.py`).
|
|
559
559
|
|
|
560
560
|
## Extraction backends
|
|
@@ -594,7 +594,7 @@ These extractors are built in. Optional ones require extra dependencies. See [te
|
|
|
594
594
|
For detailed documentation on all extractors, see the [Extractor Reference][extractor-reference].
|
|
595
595
|
|
|
596
596
|
For extraction evaluation workflows, dataset formats, and report interpretation, see
|
|
597
|
-
`docs/
|
|
597
|
+
`docs/extraction-evaluation.md`.
|
|
598
598
|
|
|
599
599
|
## Text extract utility
|
|
600
600
|
|
|
@@ -602,39 +602,39 @@ Text extract is a reusable analysis utility that lets a model insert XML tags in
|
|
|
602
602
|
entire document. It returns structured spans and the marked-up text, and it is used as a segmentation option in Markov
|
|
603
603
|
analysis.
|
|
604
604
|
|
|
605
|
-
See `docs/
|
|
605
|
+
See `docs/text-extract.md` for the utility API and examples, and `docs/markov-analysis.md` for the Markov integration.
|
|
606
606
|
|
|
607
607
|
## Text slice utility
|
|
608
608
|
|
|
609
609
|
Text slice is a reusable analysis utility that lets a model insert `<slice/>` markers into a long text without
|
|
610
610
|
re-emitting the entire document. It returns ordered slices and the marked-up text for auditing and reuse.
|
|
611
611
|
|
|
612
|
-
See `docs/
|
|
612
|
+
See `docs/text-slice.md` for the utility API and examples.
|
|
613
613
|
|
|
614
614
|
## Topic modeling analysis
|
|
615
615
|
|
|
616
616
|
Biblicus can run analysis pipelines on extracted text without changing the raw corpus. Profiling and topic modeling
|
|
617
617
|
are the first analysis backends. Profiling summarizes corpus composition and extraction coverage. Topic modeling reads
|
|
618
|
-
an extraction
|
|
618
|
+
an extraction snapshot, optionally applies an LLM-driven extraction pass, applies lexical processing, runs BERTopic, and
|
|
619
619
|
optionally applies an LLM fine-tuning pass to label topics. The output is structured JavaScript Object Notation.
|
|
620
620
|
|
|
621
|
-
See `docs/
|
|
622
|
-
`docs/
|
|
621
|
+
See `docs/analysis.md` for the analysis pipeline overview, `docs/profiling.md` for profiling, and
|
|
622
|
+
`docs/topic-modeling.md` for topic modeling details.
|
|
623
623
|
|
|
624
|
-
Run a topic analysis using a
|
|
624
|
+
Run a topic analysis using a configuration file:
|
|
625
625
|
|
|
626
626
|
```
|
|
627
|
-
biblicus analyze topics --corpus corpora/example --
|
|
627
|
+
biblicus analyze topics --corpus corpora/example --configuration configurations/topic-modeling.yml --extraction-run pipeline:<snapshot_id>
|
|
628
628
|
```
|
|
629
629
|
|
|
630
|
-
If `--extraction-run` is omitted, Biblicus uses the most recent extraction
|
|
630
|
+
If `--extraction-run` is omitted, Biblicus uses the most recent extraction snapshot and emits a warning about
|
|
631
631
|
reproducibility. The analysis output is stored under:
|
|
632
632
|
|
|
633
633
|
```
|
|
634
|
-
.biblicus/runs/analysis/topic-modeling/<
|
|
634
|
+
.biblicus/runs/analysis/topic-modeling/<snapshot_id>/output.json
|
|
635
635
|
```
|
|
636
636
|
|
|
637
|
-
Minimal
|
|
637
|
+
Minimal configuration example:
|
|
638
638
|
|
|
639
639
|
```yaml
|
|
640
640
|
schema_version: 1
|
|
@@ -659,7 +659,7 @@ llm_fine_tuning:
|
|
|
659
659
|
```
|
|
660
660
|
|
|
661
661
|
LLM extraction and fine-tuning require `biblicus[openai]` and a configured OpenAI API key.
|
|
662
|
-
|
|
662
|
+
Configuration files are validated strictly against the topic modeling schema, so type mismatches or unknown fields are errors.
|
|
663
663
|
AG News integration runs require `biblicus[datasets]` in addition to `biblicus[topic-modeling]`.
|
|
664
664
|
|
|
665
665
|
For a repeatable, real-world integration run that downloads AG News and executes topic modeling, use:
|
|
@@ -668,7 +668,7 @@ For a repeatable, real-world integration run that downloads AG News and executes
|
|
|
668
668
|
python scripts/topic_modeling_integration.py --corpus corpora/ag_news_demo --force
|
|
669
669
|
```
|
|
670
670
|
|
|
671
|
-
See `docs/
|
|
671
|
+
See `docs/topic-modeling.md` for parameter examples and per-topic output behavior.
|
|
672
672
|
|
|
673
673
|
## Integration corpus and evaluation dataset
|
|
674
674
|
|
|
@@ -712,25 +712,34 @@ Build the documentation:
|
|
|
712
712
|
python -m sphinx -b html docs docs/_build/html
|
|
713
713
|
```
|
|
714
714
|
|
|
715
|
+
Preview the documentation locally:
|
|
716
|
+
|
|
717
|
+
```
|
|
718
|
+
cd docs/_build/html
|
|
719
|
+
python -m http.server
|
|
720
|
+
```
|
|
721
|
+
|
|
722
|
+
Open `http://localhost:8000` in your browser.
|
|
723
|
+
|
|
715
724
|
## License
|
|
716
725
|
|
|
717
726
|
License terms are in `LICENSE`.
|
|
718
727
|
|
|
719
728
|
[retrieval augmented generation overview]: https://en.wikipedia.org/wiki/Retrieval-augmented_generation
|
|
720
|
-
[architecture]: docs/
|
|
721
|
-
[roadmap]: docs/
|
|
722
|
-
[feature-index]: docs/
|
|
723
|
-
[corpus]: docs/
|
|
724
|
-
[knowledge-base]: docs/
|
|
725
|
-
[text-extraction]: docs/
|
|
729
|
+
[architecture]: docs/architecture.md
|
|
730
|
+
[roadmap]: docs/roadmap.md
|
|
731
|
+
[feature-index]: docs/feature-index.md
|
|
732
|
+
[corpus]: docs/corpus.md
|
|
733
|
+
[knowledge-base]: docs/knowledge-base.md
|
|
734
|
+
[text-extraction]: docs/extraction.md
|
|
726
735
|
[extractor-reference]: docs/extractors/index.md
|
|
727
736
|
[backend-reference]: docs/backends/index.md
|
|
728
|
-
[speech-to-text]: docs/
|
|
729
|
-
[user-configuration]: docs/
|
|
730
|
-
[backends]: docs/
|
|
731
|
-
[context-packs]: docs/
|
|
732
|
-
[demos]: docs/
|
|
733
|
-
[testing]: docs/
|
|
737
|
+
[speech-to-text]: docs/stt.md
|
|
738
|
+
[user-configuration]: docs/user-configuration.md
|
|
739
|
+
[backends]: docs/backends.md
|
|
740
|
+
[context-packs]: docs/context-pack.md
|
|
741
|
+
[demos]: docs/demos.md
|
|
742
|
+
[testing]: docs/testing.md
|
|
734
743
|
|
|
735
744
|
[continuous-integration-badge]: https://github.com/AnthusAI/Biblicus/actions/workflows/ci.yml/badge.svg?branch=main
|
|
736
745
|
[coverage-badge]: https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/AnthusAI/Biblicus/main/coverage_badge.json
|
|
@@ -26,10 +26,10 @@ See [retrieval augmented generation overview] for a short introduction to the id
|
|
|
26
26
|
## Analysis highlights
|
|
27
27
|
|
|
28
28
|
- `biblicus analyze markov` learns a directed, weighted state transition graph over segmented text.
|
|
29
|
-
- YAML
|
|
29
|
+
- YAML configurations support cascading composition plus dotted `--config key=value` overrides.
|
|
30
30
|
- Text extract splits long texts with an LLM by inserting XML tags in-place for structured spans.
|
|
31
|
-
- See `docs/
|
|
32
|
-
- See `docs/
|
|
31
|
+
- See `docs/markov-analysis.md` for Markov analysis details and runnable demos.
|
|
32
|
+
- See `docs/text-extract.md` for the text extract utility and examples.
|
|
33
33
|
|
|
34
34
|
## Start with a knowledge base
|
|
35
35
|
|
|
@@ -113,7 +113,7 @@ sequenceDiagram
|
|
|
113
113
|
|
|
114
114
|
- You can ingest raw material once, then try many retrieval approaches over time.
|
|
115
115
|
- You can keep raw files readable and portable, without locking your data inside a database.
|
|
116
|
-
- You can evaluate retrieval
|
|
116
|
+
- You can evaluate retrieval snapshots against shared datasets and compare backends using the same corpus.
|
|
117
117
|
|
|
118
118
|
## Typical flow
|
|
119
119
|
|
|
@@ -122,7 +122,7 @@ sequenceDiagram
|
|
|
122
122
|
- Crawl a website section into corpus items when you want a repeatable “import from the web” workflow.
|
|
123
123
|
- Run extraction when you want derived text artifacts from non-text sources.
|
|
124
124
|
- Reindex to refresh the catalog after edits.
|
|
125
|
-
- Build a retrieval
|
|
125
|
+
- Build a retrieval snapshot with a backend.
|
|
126
126
|
- Query the run to collect evidence and evaluate it with datasets.
|
|
127
127
|
|
|
128
128
|
## Install
|
|
@@ -238,7 +238,7 @@ for note_title, note_text in notes:
|
|
|
238
238
|
corpus.ingest_note(note_text, title=note_title, tags=["memory"])
|
|
239
239
|
|
|
240
240
|
backend = get_backend("scan")
|
|
241
|
-
run = backend.build_run(corpus,
|
|
241
|
+
run = backend.build_run(corpus, configuration_name="Story demo", config={})
|
|
242
242
|
budget = QueryBudget(max_total_items=5, maximum_total_characters=2000, max_items_per_source=None)
|
|
243
243
|
result = backend.query(
|
|
244
244
|
corpus,
|
|
@@ -282,8 +282,8 @@ Example output:
|
|
|
282
282
|
"maximum_total_characters": 2000,
|
|
283
283
|
"max_items_per_source": null
|
|
284
284
|
},
|
|
285
|
-
"
|
|
286
|
-
"
|
|
285
|
+
"snapshot_id": "RUN_ID",
|
|
286
|
+
"configuration_id": "RECIPE_ID",
|
|
287
287
|
"backend_id": "scan",
|
|
288
288
|
"generated_at": "2026-01-29T00:00:00.000000Z",
|
|
289
289
|
"evidence": [
|
|
@@ -298,8 +298,8 @@ Example output:
|
|
|
298
298
|
"span_start": null,
|
|
299
299
|
"span_end": null,
|
|
300
300
|
"stage": "scan",
|
|
301
|
-
"
|
|
302
|
-
"
|
|
301
|
+
"configuration_id": "RECIPE_ID",
|
|
302
|
+
"snapshot_id": "RUN_ID",
|
|
303
303
|
"hash": null
|
|
304
304
|
}
|
|
305
305
|
],
|
|
@@ -368,7 +368,7 @@ flowchart TB
|
|
|
368
368
|
|
|
369
369
|
subgraph RowExtraction[Pluggable: extraction pipeline]
|
|
370
370
|
direction TB
|
|
371
|
-
Catalog --> Extract[Extract pipeline] --> ExtractedText[Extracted text artifacts] --> ExtractionRun[Extraction
|
|
371
|
+
Catalog --> Extract[Extract pipeline] --> ExtractedText[Extracted text artifacts] --> ExtractionRun[Extraction snapshot manifest]
|
|
372
372
|
end
|
|
373
373
|
|
|
374
374
|
subgraph RowRetrieval[Pluggable: retrieval backend]
|
|
@@ -430,7 +430,7 @@ From Python, the same flow is available through the Corpus class and backend int
|
|
|
430
430
|
- Ingest notes with `Corpus.ingest_note`.
|
|
431
431
|
- Ingest files or web addresses with `Corpus.ingest_source`.
|
|
432
432
|
- List items with `Corpus.list_items`.
|
|
433
|
-
- Build a retrieval
|
|
433
|
+
- Build a retrieval snapshot with `get_backend` and `backend.build_run`.
|
|
434
434
|
- Query a run with `backend.query`.
|
|
435
435
|
- Evaluate with `evaluate_run`.
|
|
436
436
|
|
|
@@ -476,13 +476,13 @@ corpus/
|
|
|
476
476
|
runs/
|
|
477
477
|
extraction/
|
|
478
478
|
pipeline/
|
|
479
|
-
<
|
|
479
|
+
<snapshot id>/
|
|
480
480
|
manifest.json
|
|
481
481
|
text/
|
|
482
482
|
<item id>.txt
|
|
483
483
|
retrieval/
|
|
484
484
|
<backend id>/
|
|
485
|
-
<
|
|
485
|
+
<snapshot id>/
|
|
486
486
|
manifest.json
|
|
487
487
|
```
|
|
488
488
|
|
|
@@ -498,9 +498,9 @@ For detailed documentation including configuration options, performance characte
|
|
|
498
498
|
|
|
499
499
|
## Retrieval documentation
|
|
500
500
|
|
|
501
|
-
For the retrieval pipeline overview and
|
|
502
|
-
(tuned lexical baseline, reranking, hybrid retrieval), see `docs/
|
|
503
|
-
and dataset formats, see `docs/
|
|
501
|
+
For the retrieval pipeline overview and snapshot artifacts, see `docs/retrieval.md`. For retrieval quality upgrades
|
|
502
|
+
(tuned lexical baseline, reranking, hybrid retrieval), see `docs/retrieval-quality.md`. For evaluation workflows
|
|
503
|
+
and dataset formats, see `docs/retrieval-evaluation.md`. For a runnable walkthrough, use the retrieval evaluation lab
|
|
504
504
|
script (`scripts/retrieval_evaluation_lab.py`).
|
|
505
505
|
|
|
506
506
|
## Extraction backends
|
|
@@ -540,7 +540,7 @@ These extractors are built in. Optional ones require extra dependencies. See [te
|
|
|
540
540
|
For detailed documentation on all extractors, see the [Extractor Reference][extractor-reference].
|
|
541
541
|
|
|
542
542
|
For extraction evaluation workflows, dataset formats, and report interpretation, see
|
|
543
|
-
`docs/
|
|
543
|
+
`docs/extraction-evaluation.md`.
|
|
544
544
|
|
|
545
545
|
## Text extract utility
|
|
546
546
|
|
|
@@ -548,39 +548,39 @@ Text extract is a reusable analysis utility that lets a model insert XML tags in
|
|
|
548
548
|
entire document. It returns structured spans and the marked-up text, and it is used as a segmentation option in Markov
|
|
549
549
|
analysis.
|
|
550
550
|
|
|
551
|
-
See `docs/
|
|
551
|
+
See `docs/text-extract.md` for the utility API and examples, and `docs/markov-analysis.md` for the Markov integration.
|
|
552
552
|
|
|
553
553
|
## Text slice utility
|
|
554
554
|
|
|
555
555
|
Text slice is a reusable analysis utility that lets a model insert `<slice/>` markers into a long text without
|
|
556
556
|
re-emitting the entire document. It returns ordered slices and the marked-up text for auditing and reuse.
|
|
557
557
|
|
|
558
|
-
See `docs/
|
|
558
|
+
See `docs/text-slice.md` for the utility API and examples.
|
|
559
559
|
|
|
560
560
|
## Topic modeling analysis
|
|
561
561
|
|
|
562
562
|
Biblicus can run analysis pipelines on extracted text without changing the raw corpus. Profiling and topic modeling
|
|
563
563
|
are the first analysis backends. Profiling summarizes corpus composition and extraction coverage. Topic modeling reads
|
|
564
|
-
an extraction
|
|
564
|
+
an extraction snapshot, optionally applies an LLM-driven extraction pass, applies lexical processing, runs BERTopic, and
|
|
565
565
|
optionally applies an LLM fine-tuning pass to label topics. The output is structured JavaScript Object Notation.
|
|
566
566
|
|
|
567
|
-
See `docs/
|
|
568
|
-
`docs/
|
|
567
|
+
See `docs/analysis.md` for the analysis pipeline overview, `docs/profiling.md` for profiling, and
|
|
568
|
+
`docs/topic-modeling.md` for topic modeling details.
|
|
569
569
|
|
|
570
|
-
Run a topic analysis using a
|
|
570
|
+
Run a topic analysis using a configuration file:
|
|
571
571
|
|
|
572
572
|
```
|
|
573
|
-
biblicus analyze topics --corpus corpora/example --
|
|
573
|
+
biblicus analyze topics --corpus corpora/example --configuration configurations/topic-modeling.yml --extraction-run pipeline:<snapshot_id>
|
|
574
574
|
```
|
|
575
575
|
|
|
576
|
-
If `--extraction-run` is omitted, Biblicus uses the most recent extraction
|
|
576
|
+
If `--extraction-run` is omitted, Biblicus uses the most recent extraction snapshot and emits a warning about
|
|
577
577
|
reproducibility. The analysis output is stored under:
|
|
578
578
|
|
|
579
579
|
```
|
|
580
|
-
.biblicus/runs/analysis/topic-modeling/<
|
|
580
|
+
.biblicus/runs/analysis/topic-modeling/<snapshot_id>/output.json
|
|
581
581
|
```
|
|
582
582
|
|
|
583
|
-
Minimal
|
|
583
|
+
Minimal configuration example:
|
|
584
584
|
|
|
585
585
|
```yaml
|
|
586
586
|
schema_version: 1
|
|
@@ -605,7 +605,7 @@ llm_fine_tuning:
|
|
|
605
605
|
```
|
|
606
606
|
|
|
607
607
|
LLM extraction and fine-tuning require `biblicus[openai]` and a configured OpenAI API key.
|
|
608
|
-
|
|
608
|
+
Configuration files are validated strictly against the topic modeling schema, so type mismatches or unknown fields are errors.
|
|
609
609
|
AG News integration runs require `biblicus[datasets]` in addition to `biblicus[topic-modeling]`.
|
|
610
610
|
|
|
611
611
|
For a repeatable, real-world integration run that downloads AG News and executes topic modeling, use:
|
|
@@ -614,7 +614,7 @@ For a repeatable, real-world integration run that downloads AG News and executes
|
|
|
614
614
|
python scripts/topic_modeling_integration.py --corpus corpora/ag_news_demo --force
|
|
615
615
|
```
|
|
616
616
|
|
|
617
|
-
See `docs/
|
|
617
|
+
See `docs/topic-modeling.md` for parameter examples and per-topic output behavior.
|
|
618
618
|
|
|
619
619
|
## Integration corpus and evaluation dataset
|
|
620
620
|
|
|
@@ -658,25 +658,34 @@ Build the documentation:
|
|
|
658
658
|
python -m sphinx -b html docs docs/_build/html
|
|
659
659
|
```
|
|
660
660
|
|
|
661
|
+
Preview the documentation locally:
|
|
662
|
+
|
|
663
|
+
```
|
|
664
|
+
cd docs/_build/html
|
|
665
|
+
python -m http.server
|
|
666
|
+
```
|
|
667
|
+
|
|
668
|
+
Open `http://localhost:8000` in your browser.
|
|
669
|
+
|
|
661
670
|
## License
|
|
662
671
|
|
|
663
672
|
License terms are in `LICENSE`.
|
|
664
673
|
|
|
665
674
|
[retrieval augmented generation overview]: https://en.wikipedia.org/wiki/Retrieval-augmented_generation
|
|
666
|
-
[architecture]: docs/
|
|
667
|
-
[roadmap]: docs/
|
|
668
|
-
[feature-index]: docs/
|
|
669
|
-
[corpus]: docs/
|
|
670
|
-
[knowledge-base]: docs/
|
|
671
|
-
[text-extraction]: docs/
|
|
675
|
+
[architecture]: docs/architecture.md
|
|
676
|
+
[roadmap]: docs/roadmap.md
|
|
677
|
+
[feature-index]: docs/feature-index.md
|
|
678
|
+
[corpus]: docs/corpus.md
|
|
679
|
+
[knowledge-base]: docs/knowledge-base.md
|
|
680
|
+
[text-extraction]: docs/extraction.md
|
|
672
681
|
[extractor-reference]: docs/extractors/index.md
|
|
673
682
|
[backend-reference]: docs/backends/index.md
|
|
674
|
-
[speech-to-text]: docs/
|
|
675
|
-
[user-configuration]: docs/
|
|
676
|
-
[backends]: docs/
|
|
677
|
-
[context-packs]: docs/
|
|
678
|
-
[demos]: docs/
|
|
679
|
-
[testing]: docs/
|
|
683
|
+
[speech-to-text]: docs/stt.md
|
|
684
|
+
[user-configuration]: docs/user-configuration.md
|
|
685
|
+
[backends]: docs/backends.md
|
|
686
|
+
[context-packs]: docs/context-pack.md
|
|
687
|
+
[demos]: docs/demos.md
|
|
688
|
+
[testing]: docs/testing.md
|
|
680
689
|
|
|
681
690
|
[continuous-integration-badge]: https://github.com/AnthusAI/Biblicus/actions/workflows/ci.yml/badge.svg?branch=main
|
|
682
691
|
[coverage-badge]: https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/AnthusAI/Biblicus/main/coverage_badge.json
|
|
@@ -8,7 +8,7 @@ returns evidence with chunk boundaries so you can trace results back to the orig
|
|
|
8
8
|
|
|
9
9
|
## Chunkers are pluggable
|
|
10
10
|
|
|
11
|
-
Chunking is a pluggable interface selected by identifier in a retrieval
|
|
11
|
+
Chunking is a pluggable interface selected by identifier in a retrieval configuration:
|
|
12
12
|
|
|
13
13
|
- `chunker_id`
|
|
14
14
|
- `chunker_config` (Pydantic validated; `extra="forbid"`)
|
|
@@ -18,7 +18,7 @@ corpus/
|
|
|
18
18
|
config.json
|
|
19
19
|
catalog.json
|
|
20
20
|
runs/
|
|
21
|
-
<
|
|
21
|
+
<snapshot manifests and artifacts>
|
|
22
22
|
```
|
|
23
23
|
|
|
24
24
|
## Core concepts
|
|
@@ -137,7 +137,7 @@ python -m biblicus reindex --corpus corpora/example
|
|
|
137
137
|
## Reproducibility checklist
|
|
138
138
|
|
|
139
139
|
- Keep raw files and sidecars in source control or backed up as immutable inputs.
|
|
140
|
-
- Record the catalog timestamp when comparing
|
|
140
|
+
- Record the catalog timestamp when comparing snapshot outputs.
|
|
141
141
|
- Prefer `import-tree` for reproducible ingest of existing folder structures.
|
|
142
142
|
|
|
143
143
|
## Common pitfalls
|
|
@@ -20,22 +20,22 @@ The output is structured JSON that can be stored, versioned, and compared across
|
|
|
20
20
|
biblicus analyze profile --corpus corpora/example --extraction-run pipeline:RUN_ID
|
|
21
21
|
```
|
|
22
22
|
|
|
23
|
-
If you omit `--extraction-run`, Biblicus uses the latest extraction
|
|
23
|
+
If you omit `--extraction-run`, Biblicus uses the latest extraction snapshot and emits a reproducibility warning.
|
|
24
24
|
|
|
25
|
-
To customize profiling metrics, pass a
|
|
25
|
+
To customize profiling metrics, pass a configuration file:
|
|
26
26
|
|
|
27
27
|
```
|
|
28
|
-
biblicus analyze profile --corpus corpora/example --
|
|
28
|
+
biblicus analyze profile --corpus corpora/example --configuration configurations/profiling.yml --extraction-run pipeline:RUN_ID
|
|
29
29
|
```
|
|
30
30
|
|
|
31
|
-
Profiling
|
|
31
|
+
Profiling configurations support cascading composition. Pass multiple `--configuration` files; later configurations override earlier configurations
|
|
32
32
|
via a deep merge:
|
|
33
33
|
|
|
34
34
|
```
|
|
35
35
|
biblicus analyze profile \
|
|
36
36
|
--corpus corpora/example \
|
|
37
|
-
--
|
|
38
|
-
--
|
|
37
|
+
--configuration configurations/profiling/base.yml \
|
|
38
|
+
--configuration configurations/profiling/strict.yml \
|
|
39
39
|
--extraction-run pipeline:RUN_ID
|
|
40
40
|
```
|
|
41
41
|
|
|
@@ -44,14 +44,14 @@ To override the composed configuration view from the command line, use `--config
|
|
|
44
44
|
```
|
|
45
45
|
biblicus analyze profile \
|
|
46
46
|
--corpus corpora/example \
|
|
47
|
-
--
|
|
47
|
+
--configuration configurations/profiling/base.yml \
|
|
48
48
|
--config sample_size=200 \
|
|
49
49
|
--extraction-run pipeline:RUN_ID
|
|
50
50
|
```
|
|
51
51
|
|
|
52
|
-
### Profiling
|
|
52
|
+
### Profiling configuration configuration
|
|
53
53
|
|
|
54
|
-
Profiling
|
|
54
|
+
Profiling configurations use the analysis schema version and accept these fields:
|
|
55
55
|
|
|
56
56
|
- `schema_version`: analysis schema version, currently `1`
|
|
57
57
|
- `sample_size`: optional cap for distribution calculations
|
|
@@ -60,7 +60,7 @@ Profiling recipes use the analysis schema version and accept these fields:
|
|
|
60
60
|
- `top_tag_count`: maximum number of tags to list in `top_tags`
|
|
61
61
|
- `tag_filters`: optional list of tags to include in tag coverage metrics
|
|
62
62
|
|
|
63
|
-
Example
|
|
63
|
+
Example configuration:
|
|
64
64
|
|
|
65
65
|
```
|
|
66
66
|
schema_version: 1
|
|
@@ -84,7 +84,7 @@ corpus = Corpus.open(Path("corpora/example"))
|
|
|
84
84
|
backend = get_analysis_backend("profiling")
|
|
85
85
|
output = backend.run_analysis(
|
|
86
86
|
corpus,
|
|
87
|
-
|
|
87
|
+
configuration_name="default",
|
|
88
88
|
config={
|
|
89
89
|
"schema_version": 1,
|
|
90
90
|
"sample_size": 500,
|
|
@@ -93,9 +93,9 @@ output = backend.run_analysis(
|
|
|
93
93
|
"top_tag_count": 10,
|
|
94
94
|
"tag_filters": ["ag_news"],
|
|
95
95
|
},
|
|
96
|
-
|
|
96
|
+
extraction_snapshot=ExtractionRunReference(
|
|
97
97
|
extractor_id="pipeline",
|
|
98
|
-
|
|
98
|
+
snapshot_id="RUN_ID",
|
|
99
99
|
),
|
|
100
100
|
)
|
|
101
101
|
print(output.model_dump())
|
|
@@ -106,7 +106,7 @@ print(output.model_dump())
|
|
|
106
106
|
Profiling output is stored under:
|
|
107
107
|
|
|
108
108
|
```
|
|
109
|
-
.biblicus/runs/analysis/profiling/<
|
|
109
|
+
.biblicus/runs/analysis/profiling/<snapshot_id>/output.json
|
|
110
110
|
```
|
|
111
111
|
|
|
112
112
|
## Reading the report
|
|
@@ -138,17 +138,17 @@ through extraction and how much was missing or empty.
|
|
|
138
138
|
|
|
139
139
|
## Comparing profiling runs
|
|
140
140
|
|
|
141
|
-
Use the same extraction
|
|
141
|
+
Use the same extraction snapshot and configuration configuration whenever you compare profiling outputs:
|
|
142
142
|
|
|
143
143
|
1) Run profiling on two corpus snapshots.
|
|
144
144
|
2) Compare `raw_items.total_items`, media type counts, and tag coverage.
|
|
145
145
|
3) Compare `extracted_text` coverage to spot extraction regressions.
|
|
146
146
|
|
|
147
|
-
Record the
|
|
147
|
+
Record the snapshot identifiers and catalog timestamps so you can trace differences later.
|
|
148
148
|
|
|
149
149
|
## Common pitfalls
|
|
150
150
|
|
|
151
|
-
- Profiling without specifying an extraction
|
|
151
|
+
- Profiling without specifying an extraction snapshot, which makes comparisons harder to reproduce.
|
|
152
152
|
- Comparing runs with different `sample_size` or `min_text_characters` settings.
|
|
153
153
|
- Interpreting tag counts without noting the `tag_filters` applied.
|
|
154
154
|
|