biblicus 0.13.0__tar.gz → 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {biblicus-0.13.0/src/biblicus.egg-info → biblicus-1.0.0}/PKG-INFO +103 -31
- {biblicus-0.13.0 → biblicus-1.0.0}/README.md +94 -30
- biblicus-1.0.0/datasets/retrieval_lab/labels.json +25 -0
- biblicus-1.0.0/docs/ANALYSIS.md +143 -0
- biblicus-1.0.0/docs/ARCHITECTURE.md +46 -0
- biblicus-1.0.0/docs/ARCHITECTURE_DETAIL.md +267 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/docs/BACKENDS.md +25 -1
- biblicus-1.0.0/docs/CHUNKING.md +69 -0
- biblicus-1.0.0/docs/CONTEXT_ENGINE.md +120 -0
- biblicus-1.0.0/docs/CONTEXT_ENGINE_DEMO.md +96 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/docs/CONTEXT_PACK.md +58 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/docs/CORPUS.md +49 -10
- {biblicus-0.13.0 → biblicus-1.0.0}/docs/CORPUS_DESIGN.md +18 -5
- {biblicus-0.13.0 → biblicus-1.0.0}/docs/DEMOS.md +85 -48
- biblicus-1.0.0/docs/EMBEDDING_RETRIEVAL.md +57 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/docs/EXTRACTION.md +46 -11
- {biblicus-0.13.0 → biblicus-1.0.0}/docs/EXTRACTION_EVALUATION.md +33 -3
- {biblicus-0.13.0 → biblicus-1.0.0}/docs/FEATURE_INDEX.md +199 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/docs/KNOWLEDGE_BASE.md +20 -1
- biblicus-1.0.0/docs/MARKOV_ANALYSIS.md +262 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/docs/PROFILING.md +65 -1
- biblicus-1.0.0/docs/PR_FAQ_CONTEXT_ENGINE.md +43 -0
- biblicus-1.0.0/docs/PR_FAQ_EMBEDDING_RETRIEVAL.md +105 -0
- biblicus-1.0.0/docs/PR_FAQ_TEXT_ANNOTATE.md +118 -0
- biblicus-1.0.0/docs/RETRIEVAL.md +123 -0
- biblicus-1.0.0/docs/RETRIEVAL_EVALUATION.md +218 -0
- biblicus-1.0.0/docs/RETRIEVAL_QUALITY.md +112 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/docs/ROADMAP.md +42 -14
- {biblicus-0.13.0 → biblicus-1.0.0}/docs/STT.md +4 -4
- {biblicus-0.13.0 → biblicus-1.0.0}/docs/TESTING.md +15 -4
- biblicus-1.0.0/docs/TEXT_ANNOTATE.md +119 -0
- biblicus-1.0.0/docs/TEXT_EXTRACT.md +671 -0
- biblicus-1.0.0/docs/TEXT_LINK.md +124 -0
- biblicus-1.0.0/docs/TEXT_REDACT.md +170 -0
- biblicus-1.0.0/docs/TEXT_SLICE.md +319 -0
- biblicus-1.0.0/docs/TEXT_UTILITIES.md +137 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/docs/TOPIC_MODELING.md +78 -5
- {biblicus-0.13.0 → biblicus-1.0.0}/docs/USER_CONFIGURATION.md +11 -0
- biblicus-1.0.0/docs/USE_CASES.md +37 -0
- biblicus-1.0.0/docs/UTILITIES.md +23 -0
- biblicus-1.0.0/docs/backends/embedding-index-file.md +34 -0
- biblicus-1.0.0/docs/backends/embedding-index-inmemory.md +34 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/docs/backends/index.md +53 -4
- biblicus-1.0.0/docs/backends/tf-vector.md +59 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/docs/conf.py +2 -1
- {biblicus-0.13.0 → biblicus-1.0.0}/docs/extractors/index.md +12 -1
- {biblicus-0.13.0 → biblicus-1.0.0}/docs/extractors/ocr/index.md +8 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/docs/extractors/pipeline-utilities/index.md +11 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/docs/extractors/speech-to-text/index.md +8 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/docs/extractors/text-document/index.md +11 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/docs/extractors/vlm-document/index.md +8 -0
- biblicus-1.0.0/docs/index.rst +223 -0
- biblicus-1.0.0/docs/use_cases/notes_to_context_pack.md +48 -0
- biblicus-1.0.0/docs/use_cases/sequence_markov.md +82 -0
- biblicus-1.0.0/docs/use_cases/text_folder_search.md +39 -0
- biblicus-1.0.0/docs/use_cases/text_redact.md +50 -0
- biblicus-1.0.0/features/70_context_retriever.feature +12 -0
- biblicus-1.0.0/features/71_context_compaction.feature +22 -0
- biblicus-1.0.0/features/72_context_history_compaction.feature +9 -0
- biblicus-1.0.0/features/73_context_nested_compaction.feature +9 -0
- biblicus-1.0.0/features/74_context_regeneration.feature +9 -0
- biblicus-1.0.0/features/75_context_default_regeneration.feature +9 -0
- biblicus-1.0.0/features/76_context_pack_budget_weights.feature +9 -0
- biblicus-1.0.0/features/77_context_default_pack_priority.feature +10 -0
- biblicus-1.0.0/features/78_context_default_pack_weights.feature +9 -0
- biblicus-1.0.0/features/79_context_nested_context_packs.feature +9 -0
- biblicus-1.0.0/features/80_context_nested_pack_budget_cap.feature +9 -0
- biblicus-1.0.0/features/81_context_nested_regeneration.feature +9 -0
- biblicus-1.0.0/features/82_context_explicit_regeneration.feature +9 -0
- biblicus-1.0.0/features/83_context_explicit_pack_priority.feature +9 -0
- biblicus-1.0.0/features/84_context_explicit_pack_weights.feature +9 -0
- biblicus-1.0.0/features/85_context_expansion.feature +10 -0
- biblicus-1.0.0/features/86_context_engine_errors.feature +24 -0
- biblicus-1.0.0/features/87_context_compactor_strategies.feature +22 -0
- biblicus-1.0.0/features/88_context_engine_model_validation.feature +64 -0
- biblicus-1.0.0/features/89_context_engine_internal_branches.feature +47 -0
- biblicus-1.0.0/features/90_embedding_index_evidence_fallback.feature +10 -0
- biblicus-1.0.0/features/91_tf_vector_internal_branches.feature +10 -0
- biblicus-1.0.0/features/93_context_engine_full_paths.feature +6 -0
- biblicus-1.0.0/features/ai_llm.feature +25 -0
- biblicus-1.0.0/features/ai_models.feature +74 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/analysis_schema.feature +1 -1
- {biblicus-0.13.0 → biblicus-1.0.0}/features/biblicus_corpus.feature +1 -1
- {biblicus-0.13.0 → biblicus-1.0.0}/features/cli_parsing.feature +26 -0
- biblicus-1.0.0/features/context_engine_retrieval_internal_branches.feature +6 -0
- biblicus-1.0.0/features/context_engine_retrieve_context_pack.feature +38 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/context_pack_policies.feature +40 -0
- biblicus-1.0.0/features/corpus_internal_branches.feature +53 -0
- biblicus-1.0.0/features/embedding_index_internal_branches.feature +22 -0
- biblicus-1.0.0/features/embedding_retrieval.feature +341 -0
- biblicus-1.0.0/features/embeddings.feature +39 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/environment.py +64 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/error_cases.feature +2 -2
- {biblicus-0.13.0 → biblicus-1.0.0}/features/evaluation.feature +5 -5
- biblicus-1.0.0/features/hook_logging_internal_branches.feature +6 -0
- biblicus-1.0.0/features/ingest_namespacing.feature +43 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/integration_ocr_image_extraction.feature +4 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/integration_pdf_retrieval.feature +1 -1
- biblicus-1.0.0/features/integration_text_annotate.feature +22 -0
- biblicus-1.0.0/features/integration_text_extract.feature +69 -0
- biblicus-1.0.0/features/integration_text_link.feature +25 -0
- biblicus-1.0.0/features/integration_text_redact.feature +31 -0
- biblicus-1.0.0/features/integration_text_slice.feature +27 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/integration_unstructured_extraction.feature +1 -0
- biblicus-1.0.0/features/integration_use_cases.feature +10 -0
- biblicus-1.0.0/features/integration_use_cases_sequence_markov.feature +14 -0
- biblicus-1.0.0/features/markov_analysis.feature +36 -0
- biblicus-1.0.0/features/markov_analysis_categorical.feature +42 -0
- biblicus-1.0.0/features/markov_analysis_llm.feature +65 -0
- biblicus-1.0.0/features/markov_analysis_topic_modeling.feature +40 -0
- biblicus-1.0.0/features/markov_analysis_variants.feature +559 -0
- biblicus-1.0.0/features/markov_embeddings_errors.feature +13 -0
- biblicus-1.0.0/features/markov_internal_branches.feature +297 -0
- biblicus-1.0.0/features/markov_schema.feature +161 -0
- biblicus-1.0.0/features/markov_start_end_labels.feature +10 -0
- biblicus-1.0.0/features/profiling_config_overrides.feature +16 -0
- biblicus-1.0.0/features/recipe_cascading.feature +63 -0
- biblicus-1.0.0/features/recipe_utilities.feature +77 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/retrieval_budget.feature +4 -0
- biblicus-1.0.0/features/retrieval_build_recipes.feature +19 -0
- biblicus-1.0.0/features/retrieval_evaluation_lab.feature +10 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/retrieval_quality.feature +20 -20
- {biblicus-0.13.0 → biblicus-1.0.0}/features/retrieval_scan.feature +6 -18
- {biblicus-0.13.0 → biblicus-1.0.0}/features/retrieval_sqlite_full_text_search.feature +1 -1
- {biblicus-0.13.0 → biblicus-1.0.0}/features/retrieval_uses_extraction_run.feature +5 -5
- biblicus-1.0.0/features/select_override_defaults.feature +14 -0
- biblicus-1.0.0/features/source_helper_internal_branches.feature +22 -0
- biblicus-1.0.0/features/steps/ai_llm_steps.py +44 -0
- biblicus-1.0.0/features/steps/ai_models_steps.py +181 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/analysis_steps.py +8 -6
- {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/backend_steps.py +1 -1
- {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/cli_parsing_steps.py +16 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/cli_steps.py +73 -7
- biblicus-1.0.0/features/steps/context_compaction_steps.py +139 -0
- biblicus-1.0.0/features/steps/context_compactor_steps.py +28 -0
- biblicus-1.0.0/features/steps/context_default_pack_priority_steps.py +98 -0
- biblicus-1.0.0/features/steps/context_default_pack_weights_steps.py +91 -0
- biblicus-1.0.0/features/steps/context_default_regeneration_steps.py +69 -0
- biblicus-1.0.0/features/steps/context_engine_error_steps.py +111 -0
- biblicus-1.0.0/features/steps/context_engine_full_paths_steps.py +696 -0
- biblicus-1.0.0/features/steps/context_engine_internal_steps.py +322 -0
- biblicus-1.0.0/features/steps/context_engine_model_steps.py +144 -0
- biblicus-1.0.0/features/steps/context_engine_registry.py +123 -0
- biblicus-1.0.0/features/steps/context_engine_retrieval_internal_steps.py +113 -0
- biblicus-1.0.0/features/steps/context_engine_retrieve_context_pack_steps.py +129 -0
- biblicus-1.0.0/features/steps/context_engine_retriever.py +104 -0
- biblicus-1.0.0/features/steps/context_expansion_steps.py +79 -0
- biblicus-1.0.0/features/steps/context_explicit_pack_priority_steps.py +94 -0
- biblicus-1.0.0/features/steps/context_explicit_pack_weights_steps.py +83 -0
- biblicus-1.0.0/features/steps/context_explicit_regeneration_steps.py +84 -0
- biblicus-1.0.0/features/steps/context_history_compaction_steps.py +46 -0
- biblicus-1.0.0/features/steps/context_nested_compaction_steps.py +50 -0
- biblicus-1.0.0/features/steps/context_nested_context_packs_steps.py +74 -0
- biblicus-1.0.0/features/steps/context_nested_pack_budget_cap_steps.py +84 -0
- biblicus-1.0.0/features/steps/context_nested_regeneration_steps.py +91 -0
- biblicus-1.0.0/features/steps/context_pack_budget_steps.py +81 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/context_pack_steps.py +54 -0
- biblicus-1.0.0/features/steps/context_regeneration_steps.py +73 -0
- biblicus-1.0.0/features/steps/context_retriever_steps.py +68 -0
- biblicus-1.0.0/features/steps/corpus_internal_steps.py +190 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/docling_steps.py +7 -0
- biblicus-1.0.0/features/steps/embedding_index_evidence_steps.py +150 -0
- biblicus-1.0.0/features/steps/embedding_index_internal_steps.py +34 -0
- biblicus-1.0.0/features/steps/embedding_retrieval_coverage_steps.py +453 -0
- biblicus-1.0.0/features/steps/embeddings_steps.py +122 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/extraction_steps.py +20 -0
- biblicus-1.0.0/features/steps/hook_logging_steps.py +13 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/markitdown_steps.py +7 -0
- biblicus-1.0.0/features/steps/markov_embeddings_error_steps.py +69 -0
- biblicus-1.0.0/features/steps/markov_internal_steps.py +1933 -0
- biblicus-1.0.0/features/steps/markov_schema_steps.py +729 -0
- biblicus-1.0.0/features/steps/markov_start_end_steps.py +38 -0
- biblicus-1.0.0/features/steps/markov_steps.py +451 -0
- biblicus-1.0.0/features/steps/openai_steps.py +735 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/paddleocr_vl_steps.py +7 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/profiling_steps.py +74 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/rapidocr_steps.py +7 -0
- biblicus-1.0.0/features/steps/recipe_steps.py +96 -0
- biblicus-1.0.0/features/steps/retrieval_build_recipe_steps.py +64 -0
- biblicus-1.0.0/features/steps/retrieval_evaluation_lab_steps.py +77 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/retrieval_quality_steps.py +3 -3
- {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/retrieval_steps.py +87 -4
- biblicus-1.0.0/features/steps/select_override_defaults_steps.py +21 -0
- biblicus-1.0.0/features/steps/source_helper_steps.py +35 -0
- biblicus-1.0.0/features/steps/text_annotate_steps.py +477 -0
- biblicus-1.0.0/features/steps/text_extract_steps.py +480 -0
- biblicus-1.0.0/features/steps/text_internal_steps.py +64 -0
- biblicus-1.0.0/features/steps/text_link_internal_steps.py +411 -0
- biblicus-1.0.0/features/steps/text_link_steps.py +494 -0
- biblicus-1.0.0/features/steps/text_mock_steps.py +199 -0
- biblicus-1.0.0/features/steps/text_redact_steps.py +509 -0
- biblicus-1.0.0/features/steps/text_slice_steps.py +433 -0
- biblicus-1.0.0/features/steps/text_tool_loop_steps.py +36 -0
- biblicus-1.0.0/features/steps/tf_vector_internal_steps.py +14 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/topic_modeling_steps.py +45 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/unstructured_steps.py +7 -0
- biblicus-1.0.0/features/steps/use_cases_steps.py +139 -0
- biblicus-1.0.0/features/steps/wikitext_steps.py +31 -0
- biblicus-1.0.0/features/text_annotate.feature +227 -0
- biblicus-1.0.0/features/text_extract.feature +226 -0
- biblicus-1.0.0/features/text_internal_branches.feature +52 -0
- biblicus-1.0.0/features/text_link.feature +146 -0
- biblicus-1.0.0/features/text_link_internal_branches.feature +114 -0
- biblicus-1.0.0/features/text_mock.feature +86 -0
- biblicus-1.0.0/features/text_redact.feature +135 -0
- biblicus-1.0.0/features/text_slice.feature +135 -0
- biblicus-1.0.0/features/text_utilities.feature +51 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/topic_modeling.feature +3 -3
- biblicus-1.0.0/features/use_cases.feature +21 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/pyproject.toml +12 -1
- biblicus-1.0.0/scripts/demo_context_engine.py +328 -0
- biblicus-1.0.0/scripts/markov_analysis_demo.py +279 -0
- biblicus-1.0.0/scripts/markov_cached_segments_demo.py +603 -0
- biblicus-1.0.0/scripts/markov_run_report.py +243 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/scripts/readme_end_to_end_demo.py +1 -1
- biblicus-1.0.0/scripts/retrieval_evaluation_lab.py +284 -0
- biblicus-1.0.0/scripts/use_cases/notes_to_context_pack_demo.py +121 -0
- biblicus-1.0.0/scripts/use_cases/sequence_markov_demo.py +189 -0
- biblicus-1.0.0/scripts/use_cases/text_folder_search_demo.py +132 -0
- biblicus-1.0.0/scripts/use_cases/text_redact_demo.py +116 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/scripts/wikipedia_rag_demo.py +1 -1
- biblicus-1.0.0/src/biblicus/__init__.py +50 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/_vendor/dotyaml/__init__.py +2 -2
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/_vendor/dotyaml/loader.py +40 -1
- biblicus-1.0.0/src/biblicus/ai/__init__.py +39 -0
- biblicus-1.0.0/src/biblicus/ai/embeddings.py +114 -0
- biblicus-1.0.0/src/biblicus/ai/llm.py +138 -0
- biblicus-1.0.0/src/biblicus/ai/models.py +226 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/analysis/__init__.py +5 -2
- biblicus-1.0.0/src/biblicus/analysis/markov.py +1656 -0
- biblicus-1.0.0/src/biblicus/analysis/models.py +1530 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/analysis/topic_modeling.py +98 -19
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/backends/__init__.py +6 -2
- biblicus-1.0.0/src/biblicus/backends/embedding_index_common.py +334 -0
- biblicus-1.0.0/src/biblicus/backends/embedding_index_file.py +272 -0
- biblicus-1.0.0/src/biblicus/backends/embedding_index_inmemory.py +270 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/backends/hybrid.py +14 -6
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/backends/scan.py +1 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/backends/sqlite_full_text_search.py +5 -3
- biblicus-0.13.0/src/biblicus/backends/vector.py → biblicus-1.0.0/src/biblicus/backends/tf_vector.py +28 -35
- biblicus-1.0.0/src/biblicus/chunking.py +396 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/cli.py +193 -48
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/context.py +29 -14
- biblicus-1.0.0/src/biblicus/context_engine/__init__.py +53 -0
- biblicus-1.0.0/src/biblicus/context_engine/assembler.py +1060 -0
- biblicus-1.0.0/src/biblicus/context_engine/compaction.py +110 -0
- biblicus-1.0.0/src/biblicus/context_engine/models.py +423 -0
- biblicus-1.0.0/src/biblicus/context_engine/retrieval.py +129 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/corpus.py +117 -16
- biblicus-1.0.0/src/biblicus/embedding_providers.py +122 -0
- biblicus-1.0.0/src/biblicus/errors.py +39 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/frontmatter.py +2 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/knowledge_base.py +1 -1
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/models.py +15 -3
- biblicus-1.0.0/src/biblicus/recipes.py +136 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/retrieval.py +7 -2
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/sources.py +46 -11
- biblicus-1.0.0/src/biblicus/text/__init__.py +43 -0
- biblicus-1.0.0/src/biblicus/text/annotate.py +222 -0
- biblicus-1.0.0/src/biblicus/text/extract.py +210 -0
- biblicus-1.0.0/src/biblicus/text/link.py +525 -0
- biblicus-1.0.0/src/biblicus/text/markup.py +200 -0
- biblicus-1.0.0/src/biblicus/text/models.py +319 -0
- biblicus-1.0.0/src/biblicus/text/prompts.py +115 -0
- biblicus-1.0.0/src/biblicus/text/redact.py +229 -0
- biblicus-1.0.0/src/biblicus/text/slice.py +155 -0
- biblicus-1.0.0/src/biblicus/text/tool_loop.py +334 -0
- {biblicus-0.13.0 → biblicus-1.0.0/src/biblicus.egg-info}/PKG-INFO +103 -31
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus.egg-info/SOURCES.txt +183 -4
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus.egg-info/requires.txt +10 -0
- biblicus-0.13.0/docs/ANALYSIS.md +0 -47
- biblicus-0.13.0/docs/ARCHITECTURE.md +0 -180
- biblicus-0.13.0/docs/RETRIEVAL.md +0 -47
- biblicus-0.13.0/docs/RETRIEVAL_EVALUATION.md +0 -74
- biblicus-0.13.0/docs/RETRIEVAL_QUALITY.md +0 -42
- biblicus-0.13.0/docs/backends/vector.md +0 -59
- biblicus-0.13.0/docs/index.rst +0 -33
- biblicus-0.13.0/features/steps/openai_steps.py +0 -314
- biblicus-0.13.0/src/biblicus/__init__.py +0 -30
- biblicus-0.13.0/src/biblicus/analysis/llm.py +0 -106
- biblicus-0.13.0/src/biblicus/analysis/models.py +0 -777
- biblicus-0.13.0/src/biblicus/errors.py +0 -15
- {biblicus-0.13.0 → biblicus-1.0.0}/LICENSE +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/MANIFEST.in +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/THIRD_PARTY_NOTICES.md +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/datasets/extraction_lab/labels.json +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/datasets/wikipedia_mini.json +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/docs/api.rst +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/docs/backends/scan.md +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/docs/backends/sqlite-full-text-search.md +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/docs/extractors/ocr/paddleocr-vl.md +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/docs/extractors/ocr/rapidocr.md +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/docs/extractors/pipeline-utilities/pipeline.md +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/docs/extractors/pipeline-utilities/select-longest.md +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/docs/extractors/pipeline-utilities/select-override.md +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/docs/extractors/pipeline-utilities/select-smart-override.md +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/docs/extractors/pipeline-utilities/select-text.md +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/docs/extractors/speech-to-text/deepgram.md +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/docs/extractors/speech-to-text/openai.md +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/docs/extractors/text-document/markitdown.md +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/docs/extractors/text-document/metadata.md +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/docs/extractors/text-document/pass-through.md +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/docs/extractors/text-document/pdf.md +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/docs/extractors/text-document/unstructured.md +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/docs/extractors/vlm-document/docling-granite.md +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/docs/extractors/vlm-document/docling-smol.md +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/backend_validation.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/cli_entrypoint.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/cli_step_spec_parsing.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/content_sniffing.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/context_pack.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/context_pack_cli.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/corpus_edge_cases.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/corpus_identity.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/corpus_purge.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/crawl.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/docling_granite_extractor.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/docling_smol_extractor.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/evidence_processing.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/extraction_error_handling.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/extraction_evaluation.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/extraction_evaluation_lab.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/extraction_run_lifecycle.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/extraction_selection.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/extraction_selection_longest.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/extractor_pipeline.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/extractor_validation.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/frontmatter.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/hook_config_validation.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/hook_error_handling.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/import_tree.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/inference_backend.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/ingest_sources.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/integration_audio_samples.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/integration_image_samples.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/integration_mixed_corpus.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/integration_mixed_extraction.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/integration_pdf_samples.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/integration_wikipedia.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/knowledge_base.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/lifecycle_hooks.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/markitdown_extractor.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/model_validation.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/ocr_extractor.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/paddleocr_vl_extractor.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/paddleocr_vl_parse_api_response.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/pdf_text_extraction.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/profiling.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/python_api.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/python_hook_logging.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/query_processing.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/recipe_file_extraction.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/retrieval_utilities.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/select_override.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/smart_override_selection.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/source_loading.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/crawl_steps.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/deepgram_steps.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/evidence_processing_steps.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/extraction_evaluation_lab_steps.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/extraction_evaluation_steps.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/extraction_run_lifecycle_steps.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/extractor_steps.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/frontmatter_steps.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/inference_steps.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/knowledge_base_steps.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/model_steps.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/paddleocr_mock_steps.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/paddleocr_vl_unit_steps.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/pdf_steps.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/python_api_steps.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/requests_mock_steps.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/stt_deepgram_steps.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/stt_steps.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/user_config_steps.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/streaming_ingest.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/stt_deepgram_extractor.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/stt_extractor.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/text_extraction_runs.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/token_budget.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/unstructured_extractor.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/features/user_config.feature +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/scripts/download_ag_news.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/scripts/download_audio_samples.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/scripts/download_image_samples.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/scripts/download_mixed_samples.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/scripts/download_pdf_samples.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/scripts/download_wikipedia.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/scripts/extraction_evaluation_demo.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/scripts/extraction_evaluation_lab.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/scripts/profiling_demo.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/scripts/test.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/scripts/topic_modeling_integration.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/setup.cfg +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/__main__.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/_vendor/dotyaml/interpolation.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/_vendor/dotyaml/transformer.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/analysis/base.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/analysis/profiling.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/analysis/schema.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/backends/base.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/constants.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/crawl.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/evaluation.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/evidence_processing.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/extraction.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/extraction_evaluation.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/extractors/__init__.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/extractors/base.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/extractors/deepgram_stt.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/extractors/docling_granite_text.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/extractors/docling_smol_text.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/extractors/markitdown_text.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/extractors/metadata_text.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/extractors/openai_stt.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/extractors/paddleocr_vl_text.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/extractors/pass_through_text.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/extractors/pdf_text.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/extractors/pipeline.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/extractors/rapidocr_text.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/extractors/select_longest_text.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/extractors/select_override.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/extractors/select_smart_override.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/extractors/select_text.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/extractors/unstructured_text.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/hook_logging.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/hook_manager.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/hooks.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/ignore.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/inference.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/time.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/uris.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/user_config.py +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus.egg-info/dependency_links.txt +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus.egg-info/entry_points.txt +0 -0
- {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: biblicus
|
|
3
|
-
Version: 0.13.0
|
|
3
|
+
Version: 1.0.0
|
|
4
4
|
Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
|
|
5
5
|
License: MIT
|
|
6
6
|
Requires-Python: >=3.9
|
|
@@ -9,6 +9,9 @@ License-File: LICENSE
|
|
|
9
9
|
Requires-Dist: pydantic>=2.0
|
|
10
10
|
Requires-Dist: PyYAML>=6.0
|
|
11
11
|
Requires-Dist: pypdf>=4.0
|
|
12
|
+
Requires-Dist: Jinja2>=3.1
|
|
13
|
+
Requires-Dist: dotyaml>=0.1.3
|
|
14
|
+
Requires-Dist: numpy>=1.24
|
|
12
15
|
Provides-Extra: dev
|
|
13
16
|
Requires-Dist: behave>=1.2.6; extra == "dev"
|
|
14
17
|
Requires-Dist: coverage[toml]>=7.0; extra == "dev"
|
|
@@ -18,6 +21,9 @@ Requires-Dist: sphinx_rtd_theme>=2.0; extra == "dev"
|
|
|
18
21
|
Requires-Dist: ruff>=0.4.0; extra == "dev"
|
|
19
22
|
Requires-Dist: black>=24.0; extra == "dev"
|
|
20
23
|
Requires-Dist: python-semantic-release>=9.0.0; extra == "dev"
|
|
24
|
+
Provides-Extra: dspy
|
|
25
|
+
Requires-Dist: dspy>=2.5; extra == "dspy"
|
|
26
|
+
Requires-Dist: litellm>=1.0; extra == "dspy"
|
|
21
27
|
Provides-Extra: openai
|
|
22
28
|
Requires-Dist: openai>=1.0; extra == "openai"
|
|
23
29
|
Provides-Extra: unstructured
|
|
@@ -40,6 +46,8 @@ Provides-Extra: docling-mlx
|
|
|
40
46
|
Requires-Dist: docling[mlx-vlm]>=2.0.0; extra == "docling-mlx"
|
|
41
47
|
Provides-Extra: topic-modeling
|
|
42
48
|
Requires-Dist: bertopic>=0.15.0; extra == "topic-modeling"
|
|
49
|
+
Provides-Extra: markov-analysis
|
|
50
|
+
Requires-Dist: hmmlearn>=0.3.0; extra == "markov-analysis"
|
|
43
51
|
Provides-Extra: datasets
|
|
44
52
|
Requires-Dist: datasets>=2.18.0; extra == "datasets"
|
|
45
53
|
Dynamic: license-file
|
|
@@ -50,18 +58,33 @@ Dynamic: license-file
|
|
|
50
58
|
![Coverage][coverage-badge]
|
|
51
59
|
![Documentation][documentation-badge]
|
|
52
60
|
|
|
53
|
-
|
|
54
|
-
|
|
61
|
+
<p>
|
|
62
|
+
<img
|
|
63
|
+
src="docs/_static/Biblicus-logo.png"
|
|
64
|
+
alt="Biblicus logo"
|
|
65
|
+
align="right"
|
|
66
|
+
width="216"
|
|
67
|
+
/>
|
|
68
|
+
Make your documents usable by your assistant, then decide later how you will search and retrieve them.
|
|
69
|
+
</p>
|
|
55
70
|
If you are building an assistant in Python, you probably have material you want it to use: notes, documents, web pages, and reference files. A common approach is retrieval augmented generation, where a system retrieves relevant material and uses it as evidence when generating a response.
|
|
56
71
|
|
|
57
72
|
The first practical problem is not retrieval. It is collection and care. You need a stable place to put raw items, you need a small amount of metadata so you can find them again, and you need a way to evolve your retrieval approach over time without rewriting ingestion.
|
|
58
73
|
|
|
59
|
-
|
|
74
|
+
Biblicus gives you a normal folder on disk to manage. In Biblicus documentation, that managed folder is called a *corpus* (plural: *corpora*). It stores each ingested item as a file, with optional metadata stored next to it. You can open and inspect the raw files directly. Any derived catalog or index can be rebuilt from the raw files.
|
|
60
75
|
|
|
61
76
|
It can be used alongside LangGraph, Tactus, Pydantic AI, any agent framework, or your own setup. Use it from Python or from the command line interface.
|
|
62
77
|
|
|
63
78
|
See [retrieval augmented generation overview] for a short introduction to the idea.
|
|
64
79
|
|
|
80
|
+
## Analysis highlights
|
|
81
|
+
|
|
82
|
+
- `biblicus analyze markov` learns a directed, weighted state transition graph over segmented text.
|
|
83
|
+
- YAML recipes support cascading composition plus dotted `--config key=value` overrides.
|
|
84
|
+
- Text extract splits long texts with an LLM by inserting XML tags in-place for structured spans.
|
|
85
|
+
- See `docs/MARKOV_ANALYSIS.md` for Markov analysis details and runnable demos.
|
|
86
|
+
- See `docs/TEXT_EXTRACT.md` for the text extract utility and examples.
|
|
87
|
+
|
|
65
88
|
## Start with a knowledge base
|
|
66
89
|
|
|
67
90
|
If you just want to hand a folder to your assistant and move on, use the high-level knowledge base interface. The folder can be nothing more than a handful of plain text files. You are not choosing a retrieval strategy yet. You are just collecting.
|
|
@@ -106,7 +129,7 @@ Think in three stages.
|
|
|
106
129
|
|
|
107
130
|
If you learn a few project words, the rest of the system becomes predictable.
|
|
108
131
|
|
|
109
|
-
- Corpus is the folder that holds raw items and their metadata.
|
|
132
|
+
- Corpus is the managed folder that holds raw items and their metadata.
|
|
110
133
|
- Item is the raw bytes plus optional metadata and source information.
|
|
111
134
|
- Catalog is the rebuildable index of the corpus.
|
|
112
135
|
- Extraction run is a recorded extraction build that produces text artifacts.
|
|
@@ -161,28 +184,28 @@ sequenceDiagram
|
|
|
161
184
|
This repository is a working Python package. Install it into a virtual environment from the repository root.
|
|
162
185
|
|
|
163
186
|
```
|
|
164
|
-
|
|
187
|
+
python -m pip install -e .
|
|
165
188
|
```
|
|
166
189
|
|
|
167
190
|
After the first release, you can install it from Python Package Index.
|
|
168
191
|
|
|
169
192
|
```
|
|
170
|
-
|
|
193
|
+
python -m pip install biblicus
|
|
171
194
|
```
|
|
172
195
|
|
|
173
196
|
### Optional extras
|
|
174
197
|
|
|
175
198
|
Some extractors are optional so the base install stays small.
|
|
176
199
|
|
|
177
|
-
- Optical character recognition for images: `
|
|
178
|
-
- Advanced optical character recognition with PaddleOCR: `
|
|
179
|
-
- Document understanding with Docling VLM: `
|
|
180
|
-
- Document understanding with Docling VLM and MLX acceleration: `
|
|
181
|
-
- Speech to text transcription with OpenAI: `
|
|
182
|
-
- Speech to text transcription with Deepgram: `
|
|
183
|
-
- Broad document parsing fallback: `
|
|
184
|
-
- MarkItDown document conversion (requires Python 3.10 or higher): `
|
|
185
|
-
- Topic modeling analysis with BERTopic: `
|
|
200
|
+
- Optical character recognition for images: `python -m pip install "biblicus[ocr]"`
|
|
201
|
+
- Advanced optical character recognition with PaddleOCR: `python -m pip install "biblicus[paddleocr]"`
|
|
202
|
+
- Document understanding with Docling VLM: `python -m pip install "biblicus[docling]"`
|
|
203
|
+
- Document understanding with Docling VLM and MLX acceleration: `python -m pip install "biblicus[docling-mlx]"`
|
|
204
|
+
- Speech to text transcription with OpenAI: `python -m pip install "biblicus[openai]"` (requires an OpenAI API key in `~/.biblicus/config.yml` or `./.biblicus/config.yml`)
|
|
205
|
+
- Speech to text transcription with Deepgram: `python -m pip install "biblicus[deepgram]"` (requires a Deepgram API key in `~/.biblicus/config.yml` or `./.biblicus/config.yml`)
|
|
206
|
+
- Broad document parsing fallback: `python -m pip install "biblicus[unstructured]"`
|
|
207
|
+
- MarkItDown document conversion (requires Python 3.10 or higher): `python -m pip install "biblicus[markitdown]"`
|
|
208
|
+
- Topic modeling analysis with BERTopic: `python -m pip install "biblicus[topic-modeling]"`
|
|
186
209
|
|
|
187
210
|
## Quick start
|
|
188
211
|
|
|
@@ -200,16 +223,49 @@ biblicus build --corpus corpora/example --backend scan
|
|
|
200
223
|
biblicus query --corpus corpora/example --query "note"
|
|
201
224
|
```
|
|
202
225
|
|
|
203
|
-
|
|
226
|
+
## Web Ingestion
|
|
227
|
+
|
|
228
|
+
Biblicus supports ingesting content directly from the web using two approaches.
|
|
229
|
+
|
|
230
|
+
### Ingest from URLs
|
|
204
231
|
|
|
232
|
+
Ingest individual documents or web pages from URLs. The `ingest` command automatically detects content types including PDF, HTML, Markdown, images, and audio:
|
|
233
|
+
|
|
234
|
+
```bash
|
|
235
|
+
# Ingest a document from a URL
|
|
236
|
+
biblicus ingest https://example.com/document.pdf --tags "research"
|
|
237
|
+
|
|
238
|
+
# Ingest a web page
|
|
239
|
+
biblicus ingest https://example.com/article.html --tags "article"
|
|
240
|
+
|
|
241
|
+
# Ingest with a corpus path specified
|
|
242
|
+
biblicus ingest --corpus corpora/example https://docs.example.com/guide.md --tags "documentation"
|
|
205
243
|
```
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
244
|
+
|
|
245
|
+
### Crawl Websites
|
|
246
|
+
|
|
247
|
+
Crawl entire website sections with automatic link discovery. The crawler follows links within the allowed prefix and stores discovered content:
|
|
248
|
+
|
|
249
|
+
```bash
|
|
250
|
+
# Crawl a documentation site
|
|
251
|
+
biblicus crawl \
|
|
252
|
+
--corpus corpora/example \
|
|
253
|
+
--root-url https://docs.example.com/ \
|
|
254
|
+
--allowed-prefix https://docs.example.com/ \
|
|
255
|
+
--max-items 100 \
|
|
256
|
+
--tags "documentation"
|
|
257
|
+
|
|
258
|
+
# Crawl a specific blog category
|
|
259
|
+
biblicus crawl \
|
|
260
|
+
--corpus corpora/example \
|
|
261
|
+
--root-url https://blog.example.com/category/tutorials/ \
|
|
262
|
+
--allowed-prefix https://blog.example.com/category/tutorials/ \
|
|
263
|
+
--max-items 50 \
|
|
264
|
+
--tags "tutorials,blog"
|
|
211
265
|
```
|
|
212
266
|
|
|
267
|
+
The `--allowed-prefix` parameter restricts the crawler to only follow links that start with the specified URL prefix, preventing it from crawling outside the intended scope. The crawler respects `.biblicusignore` rules and stores items under `raw/imports/crawl/` in your corpus.
|
|
268
|
+
|
|
213
269
|
## End-to-end example: lower-level control
|
|
214
270
|
|
|
215
271
|
The command-line interface returns JavaScript Object Notation by default. This makes it easy to use Biblicus in scripts and to treat retrieval as a deterministic, testable step.
|
|
@@ -237,7 +293,7 @@ for note_title, note_text in notes:
|
|
|
237
293
|
|
|
238
294
|
backend = get_backend("scan")
|
|
239
295
|
run = backend.build_run(corpus, recipe_name="Story demo", config={})
|
|
240
|
-
budget = QueryBudget(max_total_items=5,
|
|
296
|
+
budget = QueryBudget(max_total_items=5, maximum_total_characters=2000, max_items_per_source=None)
|
|
241
297
|
result = backend.query(
|
|
242
298
|
corpus,
|
|
243
299
|
run=run,
|
|
@@ -277,7 +333,7 @@ Example output:
|
|
|
277
333
|
"query_text": "Primary button style preference",
|
|
278
334
|
"budget": {
|
|
279
335
|
"max_total_items": 5,
|
|
280
|
-
"
|
|
336
|
+
"maximum_total_characters": 2000,
|
|
281
337
|
"max_items_per_source": null
|
|
282
338
|
},
|
|
283
339
|
"run_id": "RUN_ID",
|
|
@@ -490,7 +546,7 @@ Three backends are included.
|
|
|
490
546
|
|
|
491
547
|
- `scan` is a minimal baseline that scans raw items directly.
|
|
492
548
|
- `sqlite-full-text-search` is a practical baseline that builds a full text search index in SQLite.
|
|
493
|
-
- `vector` is a deterministic term-frequency vector baseline with cosine similarity scoring.
|
|
549
|
+
- `tf-vector` is a deterministic term-frequency vector baseline with cosine similarity scoring.
|
|
494
550
|
|
|
495
551
|
For detailed documentation including configuration options, performance characteristics, and usage examples, see the [Backend Reference][backend-reference].
|
|
496
552
|
|
|
@@ -498,7 +554,8 @@ For detailed documentation including configuration options, performance characte
|
|
|
498
554
|
|
|
499
555
|
For the retrieval pipeline overview and run artifacts, see `docs/RETRIEVAL.md`. For retrieval quality upgrades
|
|
500
556
|
(tuned lexical baseline, reranking, hybrid retrieval), see `docs/RETRIEVAL_QUALITY.md`. For evaluation workflows
|
|
501
|
-
and dataset formats, see `docs/RETRIEVAL_EVALUATION.md`.
|
|
557
|
+
and dataset formats, see `docs/RETRIEVAL_EVALUATION.md`. For a runnable walkthrough, use the retrieval evaluation lab
|
|
558
|
+
script (`scripts/retrieval_evaluation_lab.py`).
|
|
502
559
|
|
|
503
560
|
## Extraction backends
|
|
504
561
|
|
|
@@ -539,6 +596,21 @@ For detailed documentation on all extractors, see the [Extractor Reference][extr
|
|
|
539
596
|
For extraction evaluation workflows, dataset formats, and report interpretation, see
|
|
540
597
|
`docs/EXTRACTION_EVALUATION.md`.
|
|
541
598
|
|
|
599
|
+
## Text extract utility
|
|
600
|
+
|
|
601
|
+
Text extract is a reusable analysis utility that lets a model insert XML tags into a long text without re-emitting the
|
|
602
|
+
entire document. It returns structured spans and the marked-up text, and it is used as a segmentation option in Markov
|
|
603
|
+
analysis.
|
|
604
|
+
|
|
605
|
+
See `docs/TEXT_EXTRACT.md` for the utility API and examples, and `docs/MARKOV_ANALYSIS.md` for the Markov integration.
|
|
606
|
+
|
|
607
|
+
## Text slice utility
|
|
608
|
+
|
|
609
|
+
Text slice is a reusable analysis utility that lets a model insert `<slice/>` markers into a long text without
|
|
610
|
+
re-emitting the entire document. It returns ordered slices and the marked-up text for auditing and reuse.
|
|
611
|
+
|
|
612
|
+
See `docs/TEXT_SLICE.md` for the utility API and examples.
|
|
613
|
+
|
|
542
614
|
## Topic modeling analysis
|
|
543
615
|
|
|
544
616
|
Biblicus can run analysis pipelines on extracted text without changing the raw corpus. Profiling and topic modeling
|
|
@@ -593,7 +665,7 @@ AG News integration runs require `biblicus[datasets]` in addition to `biblicus[t
|
|
|
593
665
|
For a repeatable, real-world integration run that downloads AG News and executes topic modeling, use:
|
|
594
666
|
|
|
595
667
|
```
|
|
596
|
-
|
|
668
|
+
python scripts/topic_modeling_integration.py --corpus corpora/ag_news_demo --force
|
|
597
669
|
```
|
|
598
670
|
|
|
599
671
|
See `docs/TOPIC_MODELING.md` for parameter examples and per-topic output behavior.
|
|
@@ -607,13 +679,13 @@ Use `scripts/download_pdf_samples.py` to download a small Portable Document Form
|
|
|
607
679
|
## Tests and coverage
|
|
608
680
|
|
|
609
681
|
```
|
|
610
|
-
|
|
682
|
+
python scripts/test.py
|
|
611
683
|
```
|
|
612
684
|
|
|
613
685
|
To include integration scenarios that download public test data at runtime, run this command.
|
|
614
686
|
|
|
615
687
|
```
|
|
616
|
-
|
|
688
|
+
python scripts/test.py --integration
|
|
617
689
|
```
|
|
618
690
|
|
|
619
691
|
## Releases
|
|
@@ -631,13 +703,13 @@ Reference documentation is generated from Sphinx style docstrings.
|
|
|
631
703
|
Install development dependencies:
|
|
632
704
|
|
|
633
705
|
```
|
|
634
|
-
|
|
706
|
+
python -m pip install -e ".[dev]"
|
|
635
707
|
```
|
|
636
708
|
|
|
637
709
|
Build the documentation:
|
|
638
710
|
|
|
639
711
|
```
|
|
640
|
-
|
|
712
|
+
python -m sphinx -b html docs docs/_build/html
|
|
641
713
|
```
|
|
642
714
|
|
|
643
715
|
## License
|
|
@@ -4,18 +4,33 @@
|
|
|
4
4
|
![Coverage][coverage-badge]
|
|
5
5
|
![Documentation][documentation-badge]
|
|
6
6
|
|
|
7
|
-
|
|
8
|
-
|
|
7
|
+
<p>
|
|
8
|
+
<img
|
|
9
|
+
src="docs/_static/Biblicus-logo.png"
|
|
10
|
+
alt="Biblicus logo"
|
|
11
|
+
align="right"
|
|
12
|
+
width="216"
|
|
13
|
+
/>
|
|
14
|
+
Make your documents usable by your assistant, then decide later how you will search and retrieve them.
|
|
15
|
+
</p>
|
|
9
16
|
If you are building an assistant in Python, you probably have material you want it to use: notes, documents, web pages, and reference files. A common approach is retrieval augmented generation, where a system retrieves relevant material and uses it as evidence when generating a response.
|
|
10
17
|
|
|
11
18
|
The first practical problem is not retrieval. It is collection and care. You need a stable place to put raw items, you need a small amount of metadata so you can find them again, and you need a way to evolve your retrieval approach over time without rewriting ingestion.
|
|
12
19
|
|
|
13
|
-
|
|
20
|
+
Biblicus gives you a normal folder on disk to manage. In Biblicus documentation, that managed folder is called a *corpus* (plural: *corpora*). It stores each ingested item as a file, with optional metadata stored next to it. You can open and inspect the raw files directly. Any derived catalog or index can be rebuilt from the raw files.
|
|
14
21
|
|
|
15
22
|
It can be used alongside LangGraph, Tactus, Pydantic AI, any agent framework, or your own setup. Use it from Python or from the command line interface.
|
|
16
23
|
|
|
17
24
|
See [retrieval augmented generation overview] for a short introduction to the idea.
|
|
18
25
|
|
|
26
|
+
## Analysis highlights
|
|
27
|
+
|
|
28
|
+
- `biblicus analyze markov` learns a directed, weighted state transition graph over segmented text.
|
|
29
|
+
- YAML recipes support cascading composition plus dotted `--config key=value` overrides.
|
|
30
|
+
- Text extract splits long texts with an LLM by inserting XML tags in-place for structured spans.
|
|
31
|
+
- See `docs/MARKOV_ANALYSIS.md` for Markov analysis details and runnable demos.
|
|
32
|
+
- See `docs/TEXT_EXTRACT.md` for the text extract utility and examples.
|
|
33
|
+
|
|
19
34
|
## Start with a knowledge base
|
|
20
35
|
|
|
21
36
|
If you just want to hand a folder to your assistant and move on, use the high-level knowledge base interface. The folder can be nothing more than a handful of plain text files. You are not choosing a retrieval strategy yet. You are just collecting.
|
|
@@ -60,7 +75,7 @@ Think in three stages.
|
|
|
60
75
|
|
|
61
76
|
If you learn a few project words, the rest of the system becomes predictable.
|
|
62
77
|
|
|
63
|
-
- Corpus is the folder that holds raw items and their metadata.
|
|
78
|
+
- Corpus is the managed folder that holds raw items and their metadata.
|
|
64
79
|
- Item is the raw bytes plus optional metadata and source information.
|
|
65
80
|
- Catalog is the rebuildable index of the corpus.
|
|
66
81
|
- Extraction run is a recorded extraction build that produces text artifacts.
|
|
@@ -115,28 +130,28 @@ sequenceDiagram
|
|
|
115
130
|
This repository is a working Python package. Install it into a virtual environment from the repository root.
|
|
116
131
|
|
|
117
132
|
```
|
|
118
|
-
|
|
133
|
+
python -m pip install -e .
|
|
119
134
|
```
|
|
120
135
|
|
|
121
136
|
After the first release, you can install it from Python Package Index.
|
|
122
137
|
|
|
123
138
|
```
|
|
124
|
-
|
|
139
|
+
python -m pip install biblicus
|
|
125
140
|
```
|
|
126
141
|
|
|
127
142
|
### Optional extras
|
|
128
143
|
|
|
129
144
|
Some extractors are optional so the base install stays small.
|
|
130
145
|
|
|
131
|
-
- Optical character recognition for images: `
|
|
132
|
-
- Advanced optical character recognition with PaddleOCR: `
|
|
133
|
-
- Document understanding with Docling VLM: `
|
|
134
|
-
- Document understanding with Docling VLM and MLX acceleration: `
|
|
135
|
-
- Speech to text transcription with OpenAI: `
|
|
136
|
-
- Speech to text transcription with Deepgram: `
|
|
137
|
-
- Broad document parsing fallback: `
|
|
138
|
-
- MarkItDown document conversion (requires Python 3.10 or higher): `
|
|
139
|
-
- Topic modeling analysis with BERTopic: `
|
|
146
|
+
- Optical character recognition for images: `python -m pip install "biblicus[ocr]"`
|
|
147
|
+
- Advanced optical character recognition with PaddleOCR: `python -m pip install "biblicus[paddleocr]"`
|
|
148
|
+
- Document understanding with Docling VLM: `python -m pip install "biblicus[docling]"`
|
|
149
|
+
- Document understanding with Docling VLM and MLX acceleration: `python -m pip install "biblicus[docling-mlx]"`
|
|
150
|
+
- Speech to text transcription with OpenAI: `python -m pip install "biblicus[openai]"` (requires an OpenAI API key in `~/.biblicus/config.yml` or `./.biblicus/config.yml`)
|
|
151
|
+
- Speech to text transcription with Deepgram: `python -m pip install "biblicus[deepgram]"` (requires a Deepgram API key in `~/.biblicus/config.yml` or `./.biblicus/config.yml`)
|
|
152
|
+
- Broad document parsing fallback: `python -m pip install "biblicus[unstructured]"`
|
|
153
|
+
- MarkItDown document conversion (requires Python 3.10 or higher): `python -m pip install "biblicus[markitdown]"`
|
|
154
|
+
- Topic modeling analysis with BERTopic: `python -m pip install "biblicus[topic-modeling]"`
|
|
140
155
|
|
|
141
156
|
## Quick start
|
|
142
157
|
|
|
@@ -154,16 +169,49 @@ biblicus build --corpus corpora/example --backend scan
|
|
|
154
169
|
biblicus query --corpus corpora/example --query "note"
|
|
155
170
|
```
|
|
156
171
|
|
|
157
|
-
|
|
172
|
+
## Web Ingestion
|
|
173
|
+
|
|
174
|
+
Biblicus supports ingesting content directly from the web using two approaches.
|
|
175
|
+
|
|
176
|
+
### Ingest from URLs
|
|
158
177
|
|
|
178
|
+
Ingest individual documents or web pages from URLs. The `ingest` command automatically detects content types including PDF, HTML, Markdown, images, and audio:
|
|
179
|
+
|
|
180
|
+
```bash
|
|
181
|
+
# Ingest a document from a URL
|
|
182
|
+
biblicus ingest https://example.com/document.pdf --tags "research"
|
|
183
|
+
|
|
184
|
+
# Ingest a web page
|
|
185
|
+
biblicus ingest https://example.com/article.html --tags "article"
|
|
186
|
+
|
|
187
|
+
# Ingest with a corpus path specified
|
|
188
|
+
biblicus ingest --corpus corpora/example https://docs.example.com/guide.md --tags "documentation"
|
|
159
189
|
```
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
190
|
+
|
|
191
|
+
### Crawl Websites
|
|
192
|
+
|
|
193
|
+
Crawl entire website sections with automatic link discovery. The crawler follows links within the allowed prefix and stores discovered content:
|
|
194
|
+
|
|
195
|
+
```bash
|
|
196
|
+
# Crawl a documentation site
|
|
197
|
+
biblicus crawl \
|
|
198
|
+
--corpus corpora/example \
|
|
199
|
+
--root-url https://docs.example.com/ \
|
|
200
|
+
--allowed-prefix https://docs.example.com/ \
|
|
201
|
+
--max-items 100 \
|
|
202
|
+
--tags "documentation"
|
|
203
|
+
|
|
204
|
+
# Crawl a specific blog category
|
|
205
|
+
biblicus crawl \
|
|
206
|
+
--corpus corpora/example \
|
|
207
|
+
--root-url https://blog.example.com/category/tutorials/ \
|
|
208
|
+
--allowed-prefix https://blog.example.com/category/tutorials/ \
|
|
209
|
+
--max-items 50 \
|
|
210
|
+
--tags "tutorials,blog"
|
|
165
211
|
```
|
|
166
212
|
|
|
213
|
+
The `--allowed-prefix` parameter restricts the crawler to only follow links that start with the specified URL prefix, preventing it from crawling outside the intended scope. The crawler respects `.biblicusignore` rules and stores items under `raw/imports/crawl/` in your corpus.
|
|
214
|
+
|
|
167
215
|
## End-to-end example: lower-level control
|
|
168
216
|
|
|
169
217
|
The command-line interface returns JavaScript Object Notation by default. This makes it easy to use Biblicus in scripts and to treat retrieval as a deterministic, testable step.
|
|
@@ -191,7 +239,7 @@ for note_title, note_text in notes:
|
|
|
191
239
|
|
|
192
240
|
backend = get_backend("scan")
|
|
193
241
|
run = backend.build_run(corpus, recipe_name="Story demo", config={})
|
|
194
|
-
budget = QueryBudget(max_total_items=5,
|
|
242
|
+
budget = QueryBudget(max_total_items=5, maximum_total_characters=2000, max_items_per_source=None)
|
|
195
243
|
result = backend.query(
|
|
196
244
|
corpus,
|
|
197
245
|
run=run,
|
|
@@ -231,7 +279,7 @@ Example output:
|
|
|
231
279
|
"query_text": "Primary button style preference",
|
|
232
280
|
"budget": {
|
|
233
281
|
"max_total_items": 5,
|
|
234
|
-
"
|
|
282
|
+
"maximum_total_characters": 2000,
|
|
235
283
|
"max_items_per_source": null
|
|
236
284
|
},
|
|
237
285
|
"run_id": "RUN_ID",
|
|
@@ -444,7 +492,7 @@ Three backends are included.
|
|
|
444
492
|
|
|
445
493
|
- `scan` is a minimal baseline that scans raw items directly.
|
|
446
494
|
- `sqlite-full-text-search` is a practical baseline that builds a full text search index in SQLite.
|
|
447
|
-
- `vector` is a deterministic term-frequency vector baseline with cosine similarity scoring.
|
|
495
|
+
- `tf-vector` is a deterministic term-frequency vector baseline with cosine similarity scoring.
|
|
448
496
|
|
|
449
497
|
For detailed documentation including configuration options, performance characteristics, and usage examples, see the [Backend Reference][backend-reference].
|
|
450
498
|
|
|
@@ -452,7 +500,8 @@ For detailed documentation including configuration options, performance characte
|
|
|
452
500
|
|
|
453
501
|
For the retrieval pipeline overview and run artifacts, see `docs/RETRIEVAL.md`. For retrieval quality upgrades
|
|
454
502
|
(tuned lexical baseline, reranking, hybrid retrieval), see `docs/RETRIEVAL_QUALITY.md`. For evaluation workflows
|
|
455
|
-
and dataset formats, see `docs/RETRIEVAL_EVALUATION.md`.
|
|
503
|
+
and dataset formats, see `docs/RETRIEVAL_EVALUATION.md`. For a runnable walkthrough, use the retrieval evaluation lab
|
|
504
|
+
script (`scripts/retrieval_evaluation_lab.py`).
|
|
456
505
|
|
|
457
506
|
## Extraction backends
|
|
458
507
|
|
|
@@ -493,6 +542,21 @@ For detailed documentation on all extractors, see the [Extractor Reference][extr
|
|
|
493
542
|
For extraction evaluation workflows, dataset formats, and report interpretation, see
|
|
494
543
|
`docs/EXTRACTION_EVALUATION.md`.
|
|
495
544
|
|
|
545
|
+
## Text extract utility
|
|
546
|
+
|
|
547
|
+
Text extract is a reusable analysis utility that lets a model insert XML tags into a long text without re-emitting the
|
|
548
|
+
entire document. It returns structured spans and the marked-up text, and it is used as a segmentation option in Markov
|
|
549
|
+
analysis.
|
|
550
|
+
|
|
551
|
+
See `docs/TEXT_EXTRACT.md` for the utility API and examples, and `docs/MARKOV_ANALYSIS.md` for the Markov integration.
|
|
552
|
+
|
|
553
|
+
## Text slice utility
|
|
554
|
+
|
|
555
|
+
Text slice is a reusable analysis utility that lets a model insert `<slice/>` markers into a long text without
|
|
556
|
+
re-emitting the entire document. It returns ordered slices and the marked-up text for auditing and reuse.
|
|
557
|
+
|
|
558
|
+
See `docs/TEXT_SLICE.md` for the utility API and examples.
|
|
559
|
+
|
|
496
560
|
## Topic modeling analysis
|
|
497
561
|
|
|
498
562
|
Biblicus can run analysis pipelines on extracted text without changing the raw corpus. Profiling and topic modeling
|
|
@@ -547,7 +611,7 @@ AG News integration runs require `biblicus[datasets]` in addition to `biblicus[t
|
|
|
547
611
|
For a repeatable, real-world integration run that downloads AG News and executes topic modeling, use:
|
|
548
612
|
|
|
549
613
|
```
|
|
550
|
-
|
|
614
|
+
python scripts/topic_modeling_integration.py --corpus corpora/ag_news_demo --force
|
|
551
615
|
```
|
|
552
616
|
|
|
553
617
|
See `docs/TOPIC_MODELING.md` for parameter examples and per-topic output behavior.
|
|
@@ -561,13 +625,13 @@ Use `scripts/download_pdf_samples.py` to download a small Portable Document Form
|
|
|
561
625
|
## Tests and coverage
|
|
562
626
|
|
|
563
627
|
```
|
|
564
|
-
|
|
628
|
+
python scripts/test.py
|
|
565
629
|
```
|
|
566
630
|
|
|
567
631
|
To include integration scenarios that download public test data at runtime, run this command.
|
|
568
632
|
|
|
569
633
|
```
|
|
570
|
-
|
|
634
|
+
python scripts/test.py --integration
|
|
571
635
|
```
|
|
572
636
|
|
|
573
637
|
## Releases
|
|
@@ -585,13 +649,13 @@ Reference documentation is generated from Sphinx style docstrings.
|
|
|
585
649
|
Install development dependencies:
|
|
586
650
|
|
|
587
651
|
```
|
|
588
|
-
|
|
652
|
+
python -m pip install -e ".[dev]"
|
|
589
653
|
```
|
|
590
654
|
|
|
591
655
|
Build the documentation:
|
|
592
656
|
|
|
593
657
|
```
|
|
594
|
-
|
|
658
|
+
python -m sphinx -b html docs docs/_build/html
|
|
595
659
|
```
|
|
596
660
|
|
|
597
661
|
## License
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
{
|
|
2
|
+
"schema_version": 1,
|
|
3
|
+
"name": "retrieval-evaluation-lab",
|
|
4
|
+
"description": "Bundled labels for the retrieval evaluation lab.",
|
|
5
|
+
"queries": [
|
|
6
|
+
{
|
|
7
|
+
"query_id": "q1",
|
|
8
|
+
"query_text": "alpha unique",
|
|
9
|
+
"expected_filename": "alpha.txt",
|
|
10
|
+
"kind": "gold"
|
|
11
|
+
},
|
|
12
|
+
{
|
|
13
|
+
"query_id": "q2",
|
|
14
|
+
"query_text": "beta unique",
|
|
15
|
+
"expected_filename": "beta.txt",
|
|
16
|
+
"kind": "gold"
|
|
17
|
+
},
|
|
18
|
+
{
|
|
19
|
+
"query_id": "q3",
|
|
20
|
+
"query_text": "gamma unique",
|
|
21
|
+
"expected_filename": "gamma.txt",
|
|
22
|
+
"kind": "gold"
|
|
23
|
+
}
|
|
24
|
+
]
|
|
25
|
+
}
|