biblicus 0.9.0__tar.gz → 0.10.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {biblicus-0.9.0/src/biblicus.egg-info → biblicus-0.10.0}/PKG-INFO +7 -6
- {biblicus-0.9.0 → biblicus-0.10.0}/README.md +6 -5
- {biblicus-0.9.0 → biblicus-0.10.0}/docs/ANALYSIS.md +11 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/docs/DEMOS.md +8 -0
- biblicus-0.10.0/docs/PROFILING.md +98 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/docs/conf.py +5 -8
- {biblicus-0.9.0 → biblicus-0.10.0}/docs/index.rst +1 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/analysis_schema.feature +52 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/environment.py +3 -5
- biblicus-0.10.0/features/profiling.feature +150 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/analysis_steps.py +149 -9
- {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/cli_steps.py +13 -7
- {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/crawl_steps.py +6 -2
- {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/deepgram_steps.py +3 -11
- {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/docling_steps.py +2 -6
- {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/evidence_processing_steps.py +0 -1
- {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/extraction_run_lifecycle_steps.py +6 -2
- {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/extraction_steps.py +25 -6
- {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/inference_steps.py +12 -6
- {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/markitdown_steps.py +1 -3
- {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/openai_steps.py +3 -1
- {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/paddleocr_mock_steps.py +0 -1
- {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/paddleocr_vl_steps.py +17 -19
- {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/paddleocr_vl_unit_steps.py +10 -9
- biblicus-0.10.0/features/steps/profiling_steps.py +205 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/requests_mock_steps.py +32 -13
- {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/topic_modeling_steps.py +7 -3
- {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/user_config_steps.py +6 -7
- {biblicus-0.9.0 → biblicus-0.10.0}/pyproject.toml +2 -1
- {biblicus-0.9.0 → biblicus-0.10.0}/scripts/download_ag_news.py +1 -2
- {biblicus-0.9.0 → biblicus-0.10.0}/scripts/download_audio_samples.py +9 -5
- {biblicus-0.9.0 → biblicus-0.10.0}/scripts/download_image_samples.py +0 -5
- {biblicus-0.9.0 → biblicus-0.10.0}/scripts/download_mixed_samples.py +0 -6
- {biblicus-0.9.0 → biblicus-0.10.0}/scripts/download_pdf_samples.py +0 -5
- {biblicus-0.9.0 → biblicus-0.10.0}/scripts/download_wikipedia.py +1 -5
- biblicus-0.10.0/scripts/profiling_demo.py +212 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/scripts/readme_end_to_end_demo.py +0 -1
- {biblicus-0.9.0 → biblicus-0.10.0}/scripts/test.py +0 -4
- {biblicus-0.9.0 → biblicus-0.10.0}/scripts/topic_modeling_integration.py +15 -10
- {biblicus-0.9.0 → biblicus-0.10.0}/scripts/wikipedia_rag_demo.py +3 -8
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/__init__.py +1 -1
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/_vendor/dotyaml/__init__.py +0 -1
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/_vendor/dotyaml/interpolation.py +0 -1
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/_vendor/dotyaml/loader.py +0 -1
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/_vendor/dotyaml/transformer.py +0 -1
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/analysis/__init__.py +2 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/analysis/models.py +228 -5
- biblicus-0.10.0/src/biblicus/analysis/profiling.py +337 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/analysis/topic_modeling.py +3 -6
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/backends/sqlite_full_text_search.py +2 -4
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/cli.py +83 -4
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/corpus.py +9 -3
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/evidence_processing.py +4 -2
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/extraction.py +3 -1
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/extractors/markitdown_text.py +1 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/extractors/paddleocr_vl_text.py +1 -3
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/user_config.py +2 -6
- {biblicus-0.9.0 → biblicus-0.10.0/src/biblicus.egg-info}/PKG-INFO +7 -6
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus.egg-info/SOURCES.txt +5 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/LICENSE +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/MANIFEST.in +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/THIRD_PARTY_NOTICES.md +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/datasets/wikipedia_mini.json +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/docs/ARCHITECTURE.md +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/docs/BACKENDS.md +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/docs/CONTEXT_PACK.md +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/docs/CORPUS.md +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/docs/CORPUS_DESIGN.md +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/docs/EXTRACTION.md +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/docs/FEATURE_INDEX.md +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/docs/KNOWLEDGE_BASE.md +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/docs/ROADMAP.md +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/docs/STT.md +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/docs/TESTING.md +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/docs/TOPIC_MODELING.md +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/docs/USER_CONFIGURATION.md +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/docs/api.rst +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/docs/backends/index.md +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/docs/backends/scan.md +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/docs/backends/sqlite-full-text-search.md +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/docs/extractors/index.md +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/docs/extractors/ocr/index.md +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/docs/extractors/ocr/paddleocr-vl.md +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/docs/extractors/ocr/rapidocr.md +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/docs/extractors/pipeline-utilities/index.md +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/docs/extractors/pipeline-utilities/pipeline.md +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/docs/extractors/pipeline-utilities/select-longest.md +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/docs/extractors/pipeline-utilities/select-override.md +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/docs/extractors/pipeline-utilities/select-smart-override.md +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/docs/extractors/pipeline-utilities/select-text.md +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/docs/extractors/speech-to-text/deepgram.md +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/docs/extractors/speech-to-text/index.md +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/docs/extractors/speech-to-text/openai.md +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/docs/extractors/text-document/index.md +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/docs/extractors/text-document/markitdown.md +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/docs/extractors/text-document/metadata.md +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/docs/extractors/text-document/pass-through.md +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/docs/extractors/text-document/pdf.md +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/docs/extractors/text-document/unstructured.md +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/docs/extractors/vlm-document/docling-granite.md +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/docs/extractors/vlm-document/docling-smol.md +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/docs/extractors/vlm-document/index.md +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/backend_validation.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/biblicus_corpus.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/cli_entrypoint.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/cli_parsing.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/cli_step_spec_parsing.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/content_sniffing.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/context_pack.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/context_pack_cli.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/corpus_edge_cases.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/corpus_identity.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/corpus_purge.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/crawl.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/docling_granite_extractor.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/docling_smol_extractor.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/error_cases.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/evaluation.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/evidence_processing.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/extraction_error_handling.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/extraction_run_lifecycle.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/extraction_selection.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/extraction_selection_longest.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/extractor_pipeline.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/extractor_validation.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/frontmatter.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/hook_config_validation.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/hook_error_handling.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/import_tree.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/inference_backend.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/ingest_sources.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/integration_audio_samples.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/integration_image_samples.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/integration_mixed_corpus.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/integration_mixed_extraction.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/integration_ocr_image_extraction.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/integration_pdf_retrieval.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/integration_pdf_samples.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/integration_unstructured_extraction.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/integration_wikipedia.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/knowledge_base.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/lifecycle_hooks.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/markitdown_extractor.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/model_validation.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/ocr_extractor.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/paddleocr_vl_extractor.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/paddleocr_vl_parse_api_response.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/pdf_text_extraction.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/python_api.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/python_hook_logging.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/query_processing.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/recipe_file_extraction.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/retrieval_budget.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/retrieval_scan.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/retrieval_sqlite_full_text_search.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/retrieval_uses_extraction_run.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/retrieval_utilities.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/select_override.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/smart_override_selection.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/source_loading.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/backend_steps.py +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/cli_parsing_steps.py +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/context_pack_steps.py +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/extractor_steps.py +1 -1
- {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/frontmatter_steps.py +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/knowledge_base_steps.py +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/model_steps.py +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/pdf_steps.py +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/python_api_steps.py +1 -1
- {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/rapidocr_steps.py +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/retrieval_steps.py +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/stt_deepgram_steps.py +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/stt_steps.py +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/unstructured_steps.py +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/streaming_ingest.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/stt_deepgram_extractor.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/stt_extractor.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/text_extraction_runs.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/token_budget.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/topic_modeling.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/unstructured_extractor.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/features/user_config.feature +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/setup.cfg +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/__main__.py +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/analysis/base.py +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/analysis/llm.py +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/analysis/schema.py +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/backends/__init__.py +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/backends/base.py +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/backends/scan.py +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/constants.py +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/context.py +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/crawl.py +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/errors.py +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/evaluation.py +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/extractors/__init__.py +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/extractors/base.py +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/extractors/deepgram_stt.py +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/extractors/docling_granite_text.py +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/extractors/docling_smol_text.py +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/extractors/metadata_text.py +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/extractors/openai_stt.py +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/extractors/pass_through_text.py +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/extractors/pdf_text.py +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/extractors/pipeline.py +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/extractors/rapidocr_text.py +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/extractors/select_longest_text.py +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/extractors/select_override.py +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/extractors/select_smart_override.py +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/extractors/select_text.py +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/extractors/unstructured_text.py +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/frontmatter.py +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/hook_logging.py +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/hook_manager.py +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/hooks.py +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/ignore.py +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/inference.py +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/knowledge_base.py +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/models.py +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/retrieval.py +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/sources.py +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/time.py +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/uris.py +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus.egg-info/dependency_links.txt +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus.egg-info/entry_points.txt +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus.egg-info/requires.txt +0 -0
- {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: biblicus
|
|
3
|
-
Version: 0.9.0
|
|
3
|
+
Version: 0.10.0
|
|
4
4
|
Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
|
|
5
5
|
License: MIT
|
|
6
6
|
Requires-Python: >=3.9
|
|
@@ -531,12 +531,13 @@ For detailed documentation on all extractors, see the [Extractor Reference][extr
|
|
|
531
531
|
|
|
532
532
|
## Topic modeling analysis
|
|
533
533
|
|
|
534
|
-
Biblicus can run analysis pipelines on extracted text without changing the raw corpus.
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
JavaScript Object Notation.
|
|
534
|
+
Biblicus can run analysis pipelines on extracted text without changing the raw corpus. Profiling and topic modeling
|
|
535
|
+
are the first analysis backends. Profiling summarizes corpus composition and extraction coverage. Topic modeling reads
|
|
536
|
+
an extraction run, optionally applies an LLM-driven extraction pass, applies lexical processing, runs BERTopic, and
|
|
537
|
+
optionally applies an LLM fine-tuning pass to label topics. The output is structured JavaScript Object Notation.
|
|
538
538
|
|
|
539
|
-
See `docs/ANALYSIS.md` for the analysis pipeline overview
|
|
539
|
+
See `docs/ANALYSIS.md` for the analysis pipeline overview, `docs/PROFILING.md` for profiling, and
|
|
540
|
+
`docs/TOPIC_MODELING.md` for topic modeling details.
|
|
540
541
|
|
|
541
542
|
Run a topic analysis using a recipe file:
|
|
542
543
|
|
|
@@ -485,12 +485,13 @@ For detailed documentation on all extractors, see the [Extractor Reference][extr
|
|
|
485
485
|
|
|
486
486
|
## Topic modeling analysis
|
|
487
487
|
|
|
488
|
-
Biblicus can run analysis pipelines on extracted text without changing the raw corpus.
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
JavaScript Object Notation.
|
|
488
|
+
Biblicus can run analysis pipelines on extracted text without changing the raw corpus. Profiling and topic modeling
|
|
489
|
+
are the first analysis backends. Profiling summarizes corpus composition and extraction coverage. Topic modeling reads
|
|
490
|
+
an extraction run, optionally applies an LLM-driven extraction pass, applies lexical processing, runs BERTopic, and
|
|
491
|
+
optionally applies an LLM fine-tuning pass to label topics. The output is structured JavaScript Object Notation.
|
|
492
492
|
|
|
493
|
-
See `docs/ANALYSIS.md` for the analysis pipeline overview
|
|
493
|
+
See `docs/ANALYSIS.md` for the analysis pipeline overview, `docs/PROFILING.md` for profiling, and
|
|
494
|
+
`docs/TOPIC_MODELING.md` for topic modeling details.
|
|
494
495
|
|
|
495
496
|
Run a topic analysis using a recipe file:
|
|
496
497
|
|
|
@@ -34,3 +34,14 @@ python3 scripts/topic_modeling_integration.py --corpus corpora/ag_news_demo --fo
|
|
|
34
34
|
|
|
35
35
|
The command prints the analysis run identifier and the output path. Open the resulting `output.json` to inspect per-topic
|
|
36
36
|
labels, keywords, and document examples.
|
|
37
|
+
|
|
38
|
+
## Profiling analysis
|
|
39
|
+
|
|
40
|
+
Profiling is the baseline analysis backend. It summarizes corpus composition and extraction coverage using
|
|
41
|
+
deterministic counts and distribution metrics. See `docs/PROFILING.md` for the full reference and working demo.
|
|
42
|
+
|
|
43
|
+
Run profiling from the CLI:
|
|
44
|
+
|
|
45
|
+
```
|
|
46
|
+
biblicus analyze profile --corpus corpora/example --extraction-run pipeline:RUN_ID
|
|
47
|
+
```
|
|
@@ -214,6 +214,14 @@ python3 scripts/topic_modeling_integration.py \
|
|
|
214
214
|
The command prints the analysis run identifier and the output path. Open the `output.json` file to inspect per-topic labels,
|
|
215
215
|
keywords, and document examples.
|
|
216
216
|
|
|
217
|
+
### Profiling analysis demo
|
|
218
|
+
|
|
219
|
+
The profiling demo downloads AG News, runs extraction, and produces a profiling report.
|
|
220
|
+
|
|
221
|
+
```
|
|
222
|
+
python3 scripts/profiling_demo.py --corpus corpora/profiling_demo --force
|
|
223
|
+
```
|
|
224
|
+
|
|
217
225
|
### Select extracted text within a pipeline
|
|
218
226
|
|
|
219
227
|
When you want an explicit choice among multiple extraction outputs, add a selection extractor step at the end of the pipeline.
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
# Corpus profiling analysis
|
|
2
|
+
|
|
3
|
+
Biblicus provides a profiling analysis backend that summarizes corpus contents using deterministic counts and
|
|
4
|
+
coverage metrics. Profiling is intended as a fast, local baseline before heavier analysis such as topic modeling.
|
|
5
|
+
|
|
6
|
+
## What profiling does
|
|
7
|
+
|
|
8
|
+
The profiling analysis reports:
|
|
9
|
+
|
|
10
|
+
- Total item count and media type distribution
|
|
11
|
+
- Extracted text coverage (present, empty, missing)
|
|
12
|
+
- Size and length distributions with percentiles
|
|
13
|
+
- Tag coverage and top tags
|
|
14
|
+
|
|
15
|
+
The output is structured JSON that can be stored, versioned, and compared across runs.
|
|
16
|
+
|
|
17
|
+
## Run profiling from the CLI
|
|
18
|
+
|
|
19
|
+
```
|
|
20
|
+
biblicus analyze profile --corpus corpora/example --extraction-run pipeline:RUN_ID
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
If you omit `--extraction-run`, Biblicus uses the latest extraction run and emits a reproducibility warning.
|
|
24
|
+
|
|
25
|
+
To customize profiling metrics, pass a recipe file:
|
|
26
|
+
|
|
27
|
+
```
|
|
28
|
+
biblicus analyze profile --corpus corpora/example --recipe recipes/profiling.yml --extraction-run pipeline:RUN_ID
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
### Profiling recipe configuration
|
|
32
|
+
|
|
33
|
+
Profiling recipes use the analysis schema version and accept these fields:
|
|
34
|
+
|
|
35
|
+
- `schema_version`: analysis schema version, currently `1`
|
|
36
|
+
- `sample_size`: optional cap for distribution calculations
|
|
37
|
+
- `min_text_characters`: minimum extracted text length for inclusion
|
|
38
|
+
- `percentiles`: percentiles to compute for size and length distributions
|
|
39
|
+
- `top_tag_count`: maximum number of tags to list in `top_tags`
|
|
40
|
+
- `tag_filters`: optional list of tags to include in tag coverage metrics
|
|
41
|
+
|
|
42
|
+
Example recipe:
|
|
43
|
+
|
|
44
|
+
```
|
|
45
|
+
schema_version: 1
|
|
46
|
+
sample_size: 500
|
|
47
|
+
min_text_characters: 50
|
|
48
|
+
percentiles: [50, 90, 99]
|
|
49
|
+
top_tag_count: 10
|
|
50
|
+
tag_filters: ["ag_news", "label:World"]
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Run profiling from Python
|
|
54
|
+
|
|
55
|
+
```
|
|
56
|
+
from pathlib import Path
|
|
57
|
+
|
|
58
|
+
from biblicus.analysis import get_analysis_backend
|
|
59
|
+
from biblicus.corpus import Corpus
|
|
60
|
+
from biblicus.models import ExtractionRunReference
|
|
61
|
+
|
|
62
|
+
corpus = Corpus.open(Path("corpora/example"))
|
|
63
|
+
backend = get_analysis_backend("profiling")
|
|
64
|
+
output = backend.run_analysis(
|
|
65
|
+
corpus,
|
|
66
|
+
recipe_name="default",
|
|
67
|
+
config={
|
|
68
|
+
"schema_version": 1,
|
|
69
|
+
"sample_size": 500,
|
|
70
|
+
"min_text_characters": 50,
|
|
71
|
+
"percentiles": [50, 90, 99],
|
|
72
|
+
"top_tag_count": 10,
|
|
73
|
+
"tag_filters": ["ag_news"],
|
|
74
|
+
},
|
|
75
|
+
extraction_run=ExtractionRunReference(
|
|
76
|
+
extractor_id="pipeline",
|
|
77
|
+
run_id="RUN_ID",
|
|
78
|
+
),
|
|
79
|
+
)
|
|
80
|
+
print(output.model_dump())
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
## Output location
|
|
84
|
+
|
|
85
|
+
Profiling output is stored under:
|
|
86
|
+
|
|
87
|
+
```
|
|
88
|
+
.biblicus/runs/analysis/profiling/<run_id>/output.json
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
## Working demo
|
|
92
|
+
|
|
93
|
+
A runnable demo is provided in `scripts/profiling_demo.py`. It downloads a corpus, runs extraction, and executes the
|
|
94
|
+
profiling analysis so you can inspect the output:
|
|
95
|
+
|
|
96
|
+
```
|
|
97
|
+
python3 scripts/profiling_demo.py --corpus corpora/profiling_demo --force
|
|
98
|
+
```
|
|
@@ -4,8 +4,13 @@ Sphinx configuration for Biblicus documentation.
|
|
|
4
4
|
|
|
5
5
|
from __future__ import annotations
|
|
6
6
|
|
|
7
|
+
import os
|
|
8
|
+
import sys
|
|
7
9
|
from pathlib import Path
|
|
8
10
|
|
|
11
|
+
from pygments.lexers.special import TextLexer
|
|
12
|
+
from sphinx.highlighting import lexers
|
|
13
|
+
|
|
9
14
|
PROJECT_ROOT = Path(__file__).resolve().parent.parent
|
|
10
15
|
SOURCE_ROOT = PROJECT_ROOT / "src"
|
|
11
16
|
|
|
@@ -31,8 +36,6 @@ html_theme_options = {
|
|
|
31
36
|
}
|
|
32
37
|
|
|
33
38
|
# ReadTheDocs integration - canonical URL for SEO
|
|
34
|
-
import os
|
|
35
|
-
|
|
36
39
|
if os.environ.get("READTHEDOCS"):
|
|
37
40
|
rtd_version = os.environ.get("READTHEDOCS_VERSION", "latest")
|
|
38
41
|
rtd_project = os.environ.get("READTHEDOCS_PROJECT", "biblicus")
|
|
@@ -44,12 +47,6 @@ source_suffix = {
|
|
|
44
47
|
}
|
|
45
48
|
|
|
46
49
|
suppress_warnings = ["misc.highlighting_failure"]
|
|
47
|
-
|
|
48
|
-
import sys
|
|
49
|
-
|
|
50
50
|
sys.path.insert(0, str(SOURCE_ROOT))
|
|
51
51
|
|
|
52
|
-
from pygments.lexers.special import TextLexer
|
|
53
|
-
from sphinx.highlighting import lexers
|
|
54
|
-
|
|
55
52
|
lexers["mermaid"] = TextLexer()
|
|
@@ -56,3 +56,55 @@ Feature: Analysis schema validation
|
|
|
56
56
|
When I attempt to validate a vectorizer config with stop words "spanish"
|
|
57
57
|
Then a model validation error is raised
|
|
58
58
|
And the validation error mentions "vectorizer.stop_words must be"
|
|
59
|
+
|
|
60
|
+
Scenario: Profiling config rejects invalid sample size
|
|
61
|
+
When I attempt to validate a profiling config with sample size 0
|
|
62
|
+
Then a model validation error is raised
|
|
63
|
+
And the validation error mentions "sample_size"
|
|
64
|
+
|
|
65
|
+
Scenario: Profiling config rejects unsupported schema version
|
|
66
|
+
When I attempt to validate a profiling config with schema version 2
|
|
67
|
+
Then a model validation error is raised
|
|
68
|
+
And the validation error mentions "Unsupported analysis schema version"
|
|
69
|
+
|
|
70
|
+
Scenario: Profiling config rejects invalid percentiles
|
|
71
|
+
When I attempt to validate a profiling config with percentiles "0,101"
|
|
72
|
+
Then a model validation error is raised
|
|
73
|
+
And the validation error mentions "percentiles"
|
|
74
|
+
|
|
75
|
+
Scenario: Profiling config rejects empty percentiles
|
|
76
|
+
When I attempt to validate a profiling config with empty percentiles
|
|
77
|
+
Then a model validation error is raised
|
|
78
|
+
And the validation error mentions "percentiles"
|
|
79
|
+
|
|
80
|
+
Scenario: Profiling config rejects unsorted percentiles
|
|
81
|
+
When I attempt to validate a profiling config with percentiles "90,50"
|
|
82
|
+
Then a model validation error is raised
|
|
83
|
+
And the validation error mentions "percentiles"
|
|
84
|
+
|
|
85
|
+
Scenario: Profiling config rejects empty tag filters
|
|
86
|
+
When I attempt to validate a profiling config with tag filters "alpha,,beta"
|
|
87
|
+
Then a model validation error is raised
|
|
88
|
+
And the validation error mentions "tag_filters"
|
|
89
|
+
|
|
90
|
+
Scenario: Profiling config rejects non-list tag filters
|
|
91
|
+
When I attempt to validate a profiling config with tag filters string "alpha"
|
|
92
|
+
Then a model validation error is raised
|
|
93
|
+
And the validation error mentions "tag_filters"
|
|
94
|
+
|
|
95
|
+
Scenario: Profiling config accepts tag filters None
|
|
96
|
+
When I validate a profiling config with tag filters None
|
|
97
|
+
Then the profiling tag filters are absent
|
|
98
|
+
|
|
99
|
+
Scenario: Profiling config normalizes tag filters
|
|
100
|
+
When I validate a profiling config with tag filters list " alpha ,beta "
|
|
101
|
+
Then the profiling tag filters include "alpha"
|
|
102
|
+
And the profiling tag filters include "beta"
|
|
103
|
+
|
|
104
|
+
Scenario: Profiling ordering helper ignores missing items
|
|
105
|
+
When I order catalog items with missing entries
|
|
106
|
+
Then the ordered catalog item identifiers equal "a,c,b"
|
|
107
|
+
|
|
108
|
+
Scenario: Profiling percentile helper handles empty values
|
|
109
|
+
When I compute a profiling percentile on empty values
|
|
110
|
+
Then the profiling percentile value equals 0
|
|
@@ -17,7 +17,6 @@ def _repo_root() -> Path:
|
|
|
17
17
|
:return: Repository root path.
|
|
18
18
|
:rtype: Path
|
|
19
19
|
"""
|
|
20
|
-
|
|
21
20
|
return Path(__file__).resolve().parent.parent
|
|
22
21
|
|
|
23
22
|
|
|
@@ -32,7 +31,6 @@ def before_scenario(context, scenario) -> None:
|
|
|
32
31
|
:return: None.
|
|
33
32
|
:rtype: None
|
|
34
33
|
"""
|
|
35
|
-
|
|
36
34
|
import biblicus.__main__ as _biblicus_main
|
|
37
35
|
|
|
38
36
|
_ = _biblicus_main
|
|
@@ -74,7 +72,6 @@ def after_scenario(context, scenario) -> None:
|
|
|
74
72
|
:return: None.
|
|
75
73
|
:rtype: None
|
|
76
74
|
"""
|
|
77
|
-
|
|
78
75
|
if getattr(context, "httpd", None) is not None:
|
|
79
76
|
context.httpd.shutdown()
|
|
80
77
|
context.httpd.server_close()
|
|
@@ -221,7 +218,9 @@ def after_scenario(context, scenario) -> None:
|
|
|
221
218
|
context.fake_paddleocr_vl_behaviors.clear()
|
|
222
219
|
if getattr(context, "_fake_paddleocr_installed", False):
|
|
223
220
|
# Remove all paddle-related modules
|
|
224
|
-
paddle_module_names = [name for name in list(sys.modules.keys()) if "paddle" in name.lower()]
|
|
221
|
+
paddle_module_names = [
|
|
222
|
+
name for name in list(sys.modules.keys()) if "paddle" in name.lower()
|
|
223
|
+
]
|
|
225
224
|
for name in paddle_module_names:
|
|
226
225
|
sys.modules.pop(name, None)
|
|
227
226
|
# Restore original modules
|
|
@@ -345,7 +344,6 @@ def run_biblicus(
|
|
|
345
344
|
:return: Captured execution result.
|
|
346
345
|
:rtype: RunResult
|
|
347
346
|
"""
|
|
348
|
-
|
|
349
347
|
import contextlib
|
|
350
348
|
import io
|
|
351
349
|
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
Feature: Profiling analysis
|
|
2
|
+
Profiling analysis summarizes raw corpus composition and extracted text coverage.
|
|
3
|
+
|
|
4
|
+
Scenario: Profiling analysis reports raw and extracted counts
|
|
5
|
+
Given I initialized a corpus at "corpus"
|
|
6
|
+
And a binary file "blob.bin" exists
|
|
7
|
+
When I ingest the text "Alpha note" with title "Alpha" and tags "t" into corpus "corpus"
|
|
8
|
+
And I ingest the file "blob.bin" into corpus "corpus"
|
|
9
|
+
And I build a "pipeline" extraction run in corpus "corpus" with steps:
|
|
10
|
+
| extractor_id | config_json |
|
|
11
|
+
| pass-through-text | {} |
|
|
12
|
+
And I run a profiling analysis in corpus "corpus" using the latest extraction run
|
|
13
|
+
Then the profiling output includes raw item total 2
|
|
14
|
+
And the profiling output includes media type count "text/markdown" 1
|
|
15
|
+
And the profiling output includes media type count "application/octet-stream" 1
|
|
16
|
+
And the profiling output includes raw bytes distribution count 2
|
|
17
|
+
And the profiling output includes raw bytes percentiles 50,90,99
|
|
18
|
+
And the profiling output includes tagged items 1
|
|
19
|
+
And the profiling output includes untagged items 1
|
|
20
|
+
And the profiling output includes top tag "t" with count 1
|
|
21
|
+
And the profiling output includes extracted source items 2
|
|
22
|
+
And the profiling output includes extracted nonempty items 1
|
|
23
|
+
And the profiling output includes extracted empty items 0
|
|
24
|
+
And the profiling output includes extracted missing items 1
|
|
25
|
+
And the profiling output includes extracted text distribution count 1
|
|
26
|
+
And the profiling output includes extracted text percentiles 50,90,99
|
|
27
|
+
|
|
28
|
+
Scenario: Profiling analysis uses the latest extraction run when omitted
|
|
29
|
+
Given I initialized a corpus at "corpus"
|
|
30
|
+
And a binary file "blob.bin" exists
|
|
31
|
+
When I ingest the text "Alpha note" with title "Alpha" and tags "t" into corpus "corpus"
|
|
32
|
+
And I ingest the file "blob.bin" into corpus "corpus"
|
|
33
|
+
And I build a "pipeline" extraction run in corpus "corpus" with steps:
|
|
34
|
+
| extractor_id | config_json |
|
|
35
|
+
| pass-through-text | {} |
|
|
36
|
+
And I run a profiling analysis in corpus "corpus"
|
|
37
|
+
Then the command succeeds
|
|
38
|
+
And standard error includes "latest extraction run"
|
|
39
|
+
|
|
40
|
+
Scenario: Profiling analysis accepts a recipe file
|
|
41
|
+
Given I initialized a corpus at "corpus"
|
|
42
|
+
And a binary file "blob.bin" exists
|
|
43
|
+
When I ingest the text "Alpha note" with title "Alpha" and tags "t" into corpus "corpus"
|
|
44
|
+
And I ingest the file "blob.bin" into corpus "corpus"
|
|
45
|
+
And I build a "pipeline" extraction run in corpus "corpus" with steps:
|
|
46
|
+
| extractor_id | config_json |
|
|
47
|
+
| pass-through-text | {} |
|
|
48
|
+
And I create a profiling recipe file "profiling_recipe.yml" with:
|
|
49
|
+
"""
|
|
50
|
+
schema_version: 1
|
|
51
|
+
sample_size: 1
|
|
52
|
+
percentiles: [50]
|
|
53
|
+
top_tag_count: 1
|
|
54
|
+
"""
|
|
55
|
+
And I run a profiling analysis in corpus "corpus" using recipe "profiling_recipe.yml" and the latest extraction run
|
|
56
|
+
Then the profiling output includes raw bytes distribution count 1
|
|
57
|
+
And the profiling output includes raw bytes percentiles 50
|
|
58
|
+
And the profiling output includes top tag "t" with count 1
|
|
59
|
+
|
|
60
|
+
Scenario: Profiling analysis reports empty corpus distributions
|
|
61
|
+
Given I initialized a corpus at "corpus"
|
|
62
|
+
When I build a "pipeline" extraction run in corpus "corpus" with steps:
|
|
63
|
+
| extractor_id | config_json |
|
|
64
|
+
| pass-through-text | {} |
|
|
65
|
+
And I run a profiling analysis in corpus "corpus" using the latest extraction run
|
|
66
|
+
Then the profiling output includes raw item total 0
|
|
67
|
+
And the profiling output includes raw bytes distribution count 0
|
|
68
|
+
And the profiling output includes extracted source items 0
|
|
69
|
+
And the profiling output includes extracted text distribution count 0
|
|
70
|
+
|
|
71
|
+
Scenario: Profiling analysis counts empty extracted text
|
|
72
|
+
Given I initialized a corpus at "corpus"
|
|
73
|
+
When I ingest the text " " with title "Blank" and tags "t" into corpus "corpus"
|
|
74
|
+
And I build a "pipeline" extraction run in corpus "corpus" with steps:
|
|
75
|
+
| extractor_id | config_json |
|
|
76
|
+
| pass-through-text | {} |
|
|
77
|
+
And I run a profiling analysis in corpus "corpus" using the latest extraction run
|
|
78
|
+
Then the profiling output includes extracted nonempty items 0
|
|
79
|
+
And the profiling output includes extracted empty items 1
|
|
80
|
+
|
|
81
|
+
Scenario: Profiling analysis respects minimum text length
|
|
82
|
+
Given I initialized a corpus at "corpus"
|
|
83
|
+
When I ingest the text "short" with title "Short" and tags "t" into corpus "corpus"
|
|
84
|
+
And I build a "pipeline" extraction run in corpus "corpus" with steps:
|
|
85
|
+
| extractor_id | config_json |
|
|
86
|
+
| pass-through-text | {} |
|
|
87
|
+
And I create a profiling recipe file "profiling_min_text.yml" with:
|
|
88
|
+
"""
|
|
89
|
+
schema_version: 1
|
|
90
|
+
min_text_characters: 10
|
|
91
|
+
"""
|
|
92
|
+
And I run a profiling analysis in corpus "corpus" using recipe "profiling_min_text.yml" and the latest extraction run
|
|
93
|
+
Then the profiling output includes extracted nonempty items 0
|
|
94
|
+
And the profiling output includes extracted empty items 1
|
|
95
|
+
|
|
96
|
+
Scenario: Profiling analysis applies tag filters
|
|
97
|
+
Given I initialized a corpus at "corpus"
|
|
98
|
+
When I ingest the text "Alpha note" with title "Alpha" and tags "t" into corpus "corpus"
|
|
99
|
+
And I ingest the text "Beta note" with title "Beta" and tags "other" into corpus "corpus"
|
|
100
|
+
And I build a "pipeline" extraction run in corpus "corpus" with steps:
|
|
101
|
+
| extractor_id | config_json |
|
|
102
|
+
| pass-through-text | {} |
|
|
103
|
+
And I create a profiling recipe file "profiling_tags.yml" with:
|
|
104
|
+
"""
|
|
105
|
+
schema_version: 1
|
|
106
|
+
tag_filters: ["t"]
|
|
107
|
+
"""
|
|
108
|
+
And I run a profiling analysis in corpus "corpus" using recipe "profiling_tags.yml" and the latest extraction run
|
|
109
|
+
Then the profiling output includes top tag "t" with count 1
|
|
110
|
+
And the profiling output includes tagged items 1
|
|
111
|
+
And the profiling output includes untagged items 1
|
|
112
|
+
|
|
113
|
+
Scenario: Profiling analysis rejects missing recipe file
|
|
114
|
+
Given I initialized a corpus at "corpus"
|
|
115
|
+
When I run a profiling analysis in corpus "corpus" using recipe "missing.yml" without extraction run
|
|
116
|
+
Then the command fails with exit code 2
|
|
117
|
+
And standard error includes "Recipe file not found"
|
|
118
|
+
|
|
119
|
+
Scenario: Profiling analysis rejects non-mapping recipe
|
|
120
|
+
Given I initialized a corpus at "corpus"
|
|
121
|
+
When I create a profiling recipe file "profiling_invalid.yml" with:
|
|
122
|
+
"""
|
|
123
|
+
- not
|
|
124
|
+
- a
|
|
125
|
+
- mapping
|
|
126
|
+
"""
|
|
127
|
+
And I run a profiling analysis in corpus "corpus" using recipe "profiling_invalid.yml" without extraction run
|
|
128
|
+
Then the command fails with exit code 2
|
|
129
|
+
And standard error includes "Profiling recipe must be a mapping/object"
|
|
130
|
+
|
|
131
|
+
Scenario: Profiling analysis rejects invalid recipe values
|
|
132
|
+
Given I initialized a corpus at "corpus"
|
|
133
|
+
When I ingest the text "Alpha note" with title "Alpha" and tags "t" into corpus "corpus"
|
|
134
|
+
And I build a "pipeline" extraction run in corpus "corpus" with steps:
|
|
135
|
+
| extractor_id | config_json |
|
|
136
|
+
| pass-through-text | {} |
|
|
137
|
+
And I create a profiling recipe file "profiling_invalid_values.yml" with:
|
|
138
|
+
"""
|
|
139
|
+
schema_version: 1
|
|
140
|
+
percentiles: ["bad"]
|
|
141
|
+
"""
|
|
142
|
+
And I run a profiling analysis in corpus "corpus" using recipe "profiling_invalid_values.yml" and the latest extraction run
|
|
143
|
+
Then the command fails with exit code 2
|
|
144
|
+
And standard error includes "Invalid profiling recipe"
|
|
145
|
+
|
|
146
|
+
Scenario: Profiling analysis requires extraction run
|
|
147
|
+
Given I initialized a corpus at "corpus"
|
|
148
|
+
When I run a profiling analysis in corpus "corpus"
|
|
149
|
+
Then the command fails with exit code 2
|
|
150
|
+
And standard error includes "Profiling analysis requires an extraction run"
|
|
@@ -9,23 +9,25 @@ from biblicus.analysis import get_analysis_backend
|
|
|
9
9
|
from biblicus.analysis.base import CorpusAnalysisBackend
|
|
10
10
|
from biblicus.analysis.llm import LlmClientConfig, LlmProvider
|
|
11
11
|
from biblicus.analysis.models import (
|
|
12
|
+
ProfilingRecipeConfig,
|
|
13
|
+
TopicModelingKeyword,
|
|
14
|
+
TopicModelingLabelSource,
|
|
12
15
|
TopicModelingLlmExtractionConfig,
|
|
13
16
|
TopicModelingLlmExtractionMethod,
|
|
14
17
|
TopicModelingLlmFineTuningConfig,
|
|
15
|
-
TopicModelingKeyword,
|
|
16
|
-
TopicModelingLabelSource,
|
|
17
18
|
TopicModelingTopic,
|
|
18
19
|
TopicModelingVectorizerConfig,
|
|
19
20
|
)
|
|
21
|
+
from biblicus.analysis.profiling import _ordered_catalog_items, _percentile_value
|
|
20
22
|
from biblicus.analysis.topic_modeling import (
|
|
21
|
-
_TopicDocument,
|
|
22
23
|
_apply_llm_fine_tuning,
|
|
23
24
|
_parse_itemized_response,
|
|
25
|
+
_TopicDocument,
|
|
24
26
|
)
|
|
25
|
-
from biblicus.models import ExtractionRunReference
|
|
27
|
+
from biblicus.models import CatalogItem, ExtractionRunReference
|
|
26
28
|
from features.steps.openai_steps import (
|
|
27
|
-
_FakeOpenAiChatBehavior,
|
|
28
29
|
_ensure_fake_openai_chat_behaviors,
|
|
30
|
+
_FakeOpenAiChatBehavior,
|
|
29
31
|
_install_fake_openai_module,
|
|
30
32
|
)
|
|
31
33
|
|
|
@@ -163,9 +165,7 @@ def step_run_llm_fine_tuning_missing_documents(context) -> None:
|
|
|
163
165
|
document_ids=["missing"],
|
|
164
166
|
)
|
|
165
167
|
]
|
|
166
|
-
documents = [
|
|
167
|
-
_TopicDocument(document_id="present", source_item_id="present", text="Text")
|
|
168
|
-
]
|
|
168
|
+
documents = [_TopicDocument(document_id="present", source_item_id="present", text="Text")]
|
|
169
169
|
report, labeled_topics = _apply_llm_fine_tuning(
|
|
170
170
|
topics=topics,
|
|
171
171
|
documents=documents,
|
|
@@ -184,7 +184,7 @@ def step_fine_tuning_topics_labeled(context, count: int) -> None:
|
|
|
184
184
|
|
|
185
185
|
@when("I parse an itemized response JSON string")
|
|
186
186
|
def step_parse_itemized_response_json_string(context) -> None:
|
|
187
|
-
response_text = "
|
|
187
|
+
response_text = '"[\\"Alpha\\", \\"Beta\\"]"'
|
|
188
188
|
context.itemized_response = _parse_itemized_response(response_text)
|
|
189
189
|
|
|
190
190
|
|
|
@@ -247,3 +247,143 @@ def step_vectorizer_stop_words_equals(context, value: str) -> None:
|
|
|
247
247
|
def step_vectorizer_stop_words_absent(context) -> None:
|
|
248
248
|
model = context.last_model
|
|
249
249
|
assert model.stop_words is None
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
@when("I attempt to validate a profiling config with sample size {value:d}")
|
|
253
|
+
def step_validate_profiling_sample_size(context, value: int) -> None:
|
|
254
|
+
try:
|
|
255
|
+
ProfilingRecipeConfig(sample_size=value)
|
|
256
|
+
context.validation_error = None
|
|
257
|
+
except ValidationError as exc:
|
|
258
|
+
context.validation_error = exc
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
@when('I attempt to validate a profiling config with percentiles "{values}"')
|
|
262
|
+
def step_validate_profiling_percentiles(context, values: str) -> None:
|
|
263
|
+
try:
|
|
264
|
+
percentiles = [int(value.strip()) for value in values.split(",") if value.strip()]
|
|
265
|
+
ProfilingRecipeConfig(percentiles=percentiles)
|
|
266
|
+
context.validation_error = None
|
|
267
|
+
except ValidationError as exc:
|
|
268
|
+
context.validation_error = exc
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
@when('I attempt to validate a profiling config with tag filters "{values}"')
|
|
272
|
+
def step_validate_profiling_tag_filters(context, values: str) -> None:
|
|
273
|
+
try:
|
|
274
|
+
tags = [value.strip() for value in values.split(",")]
|
|
275
|
+
ProfilingRecipeConfig(tag_filters=tags)
|
|
276
|
+
context.validation_error = None
|
|
277
|
+
except ValidationError as exc:
|
|
278
|
+
context.validation_error = exc
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
@when("I attempt to validate a profiling config with schema version {value:d}")
|
|
282
|
+
def step_validate_profiling_schema_version(context, value: int) -> None:
|
|
283
|
+
try:
|
|
284
|
+
ProfilingRecipeConfig(schema_version=value)
|
|
285
|
+
context.validation_error = None
|
|
286
|
+
except ValidationError as exc:
|
|
287
|
+
context.validation_error = exc
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
@when("I attempt to validate a profiling config with empty percentiles")
|
|
291
|
+
def step_validate_profiling_empty_percentiles(context) -> None:
|
|
292
|
+
try:
|
|
293
|
+
ProfilingRecipeConfig(percentiles=[])
|
|
294
|
+
context.validation_error = None
|
|
295
|
+
except ValidationError as exc:
|
|
296
|
+
context.validation_error = exc
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
@when('I attempt to validate a profiling config with tag filters string "{value}"')
|
|
300
|
+
def step_validate_profiling_tag_filters_string(context, value: str) -> None:
|
|
301
|
+
try:
|
|
302
|
+
ProfilingRecipeConfig(tag_filters=value)
|
|
303
|
+
context.validation_error = None
|
|
304
|
+
except ValidationError as exc:
|
|
305
|
+
context.validation_error = exc
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
@when("I validate a profiling config with tag filters None")
|
|
309
|
+
def step_validate_profiling_tag_filters_none(context) -> None:
|
|
310
|
+
context.last_model = ProfilingRecipeConfig(tag_filters=None)
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
@when('I validate a profiling config with tag filters list "{values}"')
|
|
314
|
+
def step_validate_profiling_tag_filters_list(context, values: str) -> None:
|
|
315
|
+
tags = [value.strip() for value in values.split(",")]
|
|
316
|
+
context.last_model = ProfilingRecipeConfig(tag_filters=tags)
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
@then("the profiling tag filters are absent")
|
|
320
|
+
def step_profiling_tag_filters_absent(context) -> None:
|
|
321
|
+
model = context.last_model
|
|
322
|
+
assert model.tag_filters is None
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
@then('the profiling tag filters include "{value}"')
|
|
326
|
+
def step_profiling_tag_filters_include(context, value: str) -> None:
|
|
327
|
+
model = context.last_model
|
|
328
|
+
assert model.tag_filters is not None
|
|
329
|
+
assert value in model.tag_filters
|
|
330
|
+
|
|
331
|
+
|
|
332
|
+
@when("I order catalog items with missing entries")
|
|
333
|
+
def step_order_catalog_items_with_missing_entries(context) -> None:
|
|
334
|
+
items = {
|
|
335
|
+
"a": CatalogItem(
|
|
336
|
+
id="a",
|
|
337
|
+
relpath="raw/a.txt",
|
|
338
|
+
sha256="a",
|
|
339
|
+
bytes=1,
|
|
340
|
+
media_type="text/plain",
|
|
341
|
+
title=None,
|
|
342
|
+
tags=[],
|
|
343
|
+
metadata={},
|
|
344
|
+
created_at="2020-01-01T00:00:00Z",
|
|
345
|
+
source_uri=None,
|
|
346
|
+
),
|
|
347
|
+
"b": CatalogItem(
|
|
348
|
+
id="b",
|
|
349
|
+
relpath="raw/b.txt",
|
|
350
|
+
sha256="b",
|
|
351
|
+
bytes=2,
|
|
352
|
+
media_type="text/plain",
|
|
353
|
+
title=None,
|
|
354
|
+
tags=[],
|
|
355
|
+
metadata={},
|
|
356
|
+
created_at="2020-01-01T00:00:00Z",
|
|
357
|
+
source_uri=None,
|
|
358
|
+
),
|
|
359
|
+
"c": CatalogItem(
|
|
360
|
+
id="c",
|
|
361
|
+
relpath="raw/c.txt",
|
|
362
|
+
sha256="c",
|
|
363
|
+
bytes=3,
|
|
364
|
+
media_type="text/plain",
|
|
365
|
+
title=None,
|
|
366
|
+
tags=[],
|
|
367
|
+
metadata={},
|
|
368
|
+
created_at="2020-01-01T00:00:00Z",
|
|
369
|
+
source_uri=None,
|
|
370
|
+
),
|
|
371
|
+
}
|
|
372
|
+
ordered = _ordered_catalog_items(items, ["a", "missing", "c"])
|
|
373
|
+
context.ordered_catalog_ids = [item.id for item in ordered]
|
|
374
|
+
|
|
375
|
+
|
|
376
|
+
@then('the ordered catalog item identifiers equal "{values}"')
|
|
377
|
+
def step_ordered_catalog_item_identifiers_equal(context, values: str) -> None:
|
|
378
|
+
expected = [value.strip() for value in values.split(",") if value.strip()]
|
|
379
|
+
assert context.ordered_catalog_ids == expected
|
|
380
|
+
|
|
381
|
+
|
|
382
|
+
@when("I compute a profiling percentile on empty values")
|
|
383
|
+
def step_compute_profiling_percentile_empty(context) -> None:
|
|
384
|
+
context.percentile_value = _percentile_value([], 50)
|
|
385
|
+
|
|
386
|
+
|
|
387
|
+
@then("the profiling percentile value equals {value:d}")
|
|
388
|
+
def step_profiling_percentile_value_equals(context, value: int) -> None:
|
|
389
|
+
assert context.percentile_value == value
|