biblicus 0.9.0__tar.gz → 0.11.0__tar.gz

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (234)
  1. {biblicus-0.9.0/src/biblicus.egg-info → biblicus-0.11.0}/PKG-INFO +13 -6
  2. {biblicus-0.9.0 → biblicus-0.11.0}/README.md +12 -5
  3. {biblicus-0.9.0 → biblicus-0.11.0}/docs/ANALYSIS.md +11 -0
  4. {biblicus-0.9.0 → biblicus-0.11.0}/docs/ARCHITECTURE.md +4 -4
  5. {biblicus-0.9.0 → biblicus-0.11.0}/docs/CORPUS_DESIGN.md +2 -2
  6. {biblicus-0.9.0 → biblicus-0.11.0}/docs/DEMOS.md +11 -3
  7. biblicus-0.11.0/docs/PROFILING.md +98 -0
  8. biblicus-0.11.0/docs/RETRIEVAL.md +47 -0
  9. biblicus-0.11.0/docs/RETRIEVAL_EVALUATION.md +74 -0
  10. biblicus-0.11.0/docs/RETRIEVAL_QUALITY.md +42 -0
  11. {biblicus-0.9.0 → biblicus-0.11.0}/docs/ROADMAP.md +15 -1
  12. {biblicus-0.9.0 → biblicus-0.11.0}/docs/conf.py +5 -8
  13. {biblicus-0.9.0 → biblicus-0.11.0}/docs/extractors/text-document/pass-through.md +3 -3
  14. {biblicus-0.9.0 → biblicus-0.11.0}/docs/extractors/text-document/unstructured.md +1 -1
  15. {biblicus-0.9.0 → biblicus-0.11.0}/docs/index.rst +4 -0
  16. {biblicus-0.9.0 → biblicus-0.11.0}/features/analysis_schema.feature +52 -0
  17. {biblicus-0.9.0 → biblicus-0.11.0}/features/environment.py +3 -5
  18. biblicus-0.11.0/features/profiling.feature +150 -0
  19. biblicus-0.11.0/features/retrieval_quality.feature +253 -0
  20. {biblicus-0.9.0 → biblicus-0.11.0}/features/steps/analysis_steps.py +149 -9
  21. {biblicus-0.9.0 → biblicus-0.11.0}/features/steps/cli_steps.py +13 -7
  22. {biblicus-0.9.0 → biblicus-0.11.0}/features/steps/crawl_steps.py +6 -2
  23. {biblicus-0.9.0 → biblicus-0.11.0}/features/steps/deepgram_steps.py +3 -11
  24. {biblicus-0.9.0 → biblicus-0.11.0}/features/steps/docling_steps.py +2 -6
  25. {biblicus-0.9.0 → biblicus-0.11.0}/features/steps/evidence_processing_steps.py +0 -1
  26. {biblicus-0.9.0 → biblicus-0.11.0}/features/steps/extraction_run_lifecycle_steps.py +6 -2
  27. {biblicus-0.9.0 → biblicus-0.11.0}/features/steps/extraction_steps.py +25 -6
  28. {biblicus-0.9.0 → biblicus-0.11.0}/features/steps/inference_steps.py +12 -6
  29. {biblicus-0.9.0 → biblicus-0.11.0}/features/steps/markitdown_steps.py +1 -3
  30. {biblicus-0.9.0 → biblicus-0.11.0}/features/steps/openai_steps.py +3 -1
  31. {biblicus-0.9.0 → biblicus-0.11.0}/features/steps/paddleocr_mock_steps.py +0 -1
  32. {biblicus-0.9.0 → biblicus-0.11.0}/features/steps/paddleocr_vl_steps.py +17 -19
  33. {biblicus-0.9.0 → biblicus-0.11.0}/features/steps/paddleocr_vl_unit_steps.py +10 -9
  34. biblicus-0.11.0/features/steps/profiling_steps.py +205 -0
  35. {biblicus-0.9.0 → biblicus-0.11.0}/features/steps/requests_mock_steps.py +32 -13
  36. biblicus-0.11.0/features/steps/retrieval_quality_steps.py +186 -0
  37. {biblicus-0.9.0 → biblicus-0.11.0}/features/steps/retrieval_steps.py +10 -0
  38. {biblicus-0.9.0 → biblicus-0.11.0}/features/steps/topic_modeling_steps.py +7 -3
  39. {biblicus-0.9.0 → biblicus-0.11.0}/features/steps/user_config_steps.py +6 -7
  40. {biblicus-0.9.0 → biblicus-0.11.0}/pyproject.toml +2 -1
  41. {biblicus-0.9.0 → biblicus-0.11.0}/scripts/download_ag_news.py +1 -2
  42. {biblicus-0.9.0 → biblicus-0.11.0}/scripts/download_audio_samples.py +9 -5
  43. {biblicus-0.9.0 → biblicus-0.11.0}/scripts/download_image_samples.py +0 -5
  44. {biblicus-0.9.0 → biblicus-0.11.0}/scripts/download_mixed_samples.py +0 -6
  45. {biblicus-0.9.0 → biblicus-0.11.0}/scripts/download_pdf_samples.py +0 -5
  46. {biblicus-0.9.0 → biblicus-0.11.0}/scripts/download_wikipedia.py +1 -5
  47. biblicus-0.11.0/scripts/profiling_demo.py +212 -0
  48. {biblicus-0.9.0 → biblicus-0.11.0}/scripts/readme_end_to_end_demo.py +0 -1
  49. {biblicus-0.9.0 → biblicus-0.11.0}/scripts/test.py +0 -4
  50. {biblicus-0.9.0 → biblicus-0.11.0}/scripts/topic_modeling_integration.py +15 -10
  51. {biblicus-0.9.0 → biblicus-0.11.0}/scripts/wikipedia_rag_demo.py +3 -8
  52. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/__init__.py +1 -1
  53. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/_vendor/dotyaml/__init__.py +0 -1
  54. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/_vendor/dotyaml/interpolation.py +0 -1
  55. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/_vendor/dotyaml/loader.py +0 -1
  56. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/_vendor/dotyaml/transformer.py +0 -1
  57. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/analysis/__init__.py +2 -0
  58. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/analysis/models.py +228 -5
  59. biblicus-0.11.0/src/biblicus/analysis/profiling.py +337 -0
  60. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/analysis/topic_modeling.py +3 -6
  61. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/backends/__init__.py +4 -0
  62. biblicus-0.11.0/src/biblicus/backends/hybrid.py +284 -0
  63. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/backends/sqlite_full_text_search.py +266 -22
  64. biblicus-0.11.0/src/biblicus/backends/vector.py +460 -0
  65. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/cli.py +83 -4
  66. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/corpus.py +9 -3
  67. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/evidence_processing.py +4 -2
  68. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/extraction.py +3 -1
  69. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/extractors/markitdown_text.py +1 -0
  70. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/extractors/paddleocr_vl_text.py +1 -3
  71. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/models.py +3 -0
  72. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/user_config.py +2 -6
  73. {biblicus-0.9.0 → biblicus-0.11.0/src/biblicus.egg-info}/PKG-INFO +13 -6
  74. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus.egg-info/SOURCES.txt +12 -0
  75. {biblicus-0.9.0 → biblicus-0.11.0}/LICENSE +0 -0
  76. {biblicus-0.9.0 → biblicus-0.11.0}/MANIFEST.in +0 -0
  77. {biblicus-0.9.0 → biblicus-0.11.0}/THIRD_PARTY_NOTICES.md +0 -0
  78. {biblicus-0.9.0 → biblicus-0.11.0}/datasets/wikipedia_mini.json +0 -0
  79. {biblicus-0.9.0 → biblicus-0.11.0}/docs/BACKENDS.md +0 -0
  80. {biblicus-0.9.0 → biblicus-0.11.0}/docs/CONTEXT_PACK.md +0 -0
  81. {biblicus-0.9.0 → biblicus-0.11.0}/docs/CORPUS.md +0 -0
  82. {biblicus-0.9.0 → biblicus-0.11.0}/docs/EXTRACTION.md +0 -0
  83. {biblicus-0.9.0 → biblicus-0.11.0}/docs/FEATURE_INDEX.md +0 -0
  84. {biblicus-0.9.0 → biblicus-0.11.0}/docs/KNOWLEDGE_BASE.md +0 -0
  85. {biblicus-0.9.0 → biblicus-0.11.0}/docs/STT.md +0 -0
  86. {biblicus-0.9.0 → biblicus-0.11.0}/docs/TESTING.md +0 -0
  87. {biblicus-0.9.0 → biblicus-0.11.0}/docs/TOPIC_MODELING.md +0 -0
  88. {biblicus-0.9.0 → biblicus-0.11.0}/docs/USER_CONFIGURATION.md +0 -0
  89. {biblicus-0.9.0 → biblicus-0.11.0}/docs/api.rst +0 -0
  90. {biblicus-0.9.0 → biblicus-0.11.0}/docs/backends/index.md +0 -0
  91. {biblicus-0.9.0 → biblicus-0.11.0}/docs/backends/scan.md +0 -0
  92. {biblicus-0.9.0 → biblicus-0.11.0}/docs/backends/sqlite-full-text-search.md +0 -0
  93. {biblicus-0.9.0 → biblicus-0.11.0}/docs/extractors/index.md +0 -0
  94. {biblicus-0.9.0 → biblicus-0.11.0}/docs/extractors/ocr/index.md +0 -0
  95. {biblicus-0.9.0 → biblicus-0.11.0}/docs/extractors/ocr/paddleocr-vl.md +0 -0
  96. {biblicus-0.9.0 → biblicus-0.11.0}/docs/extractors/ocr/rapidocr.md +0 -0
  97. {biblicus-0.9.0 → biblicus-0.11.0}/docs/extractors/pipeline-utilities/index.md +0 -0
  98. {biblicus-0.9.0 → biblicus-0.11.0}/docs/extractors/pipeline-utilities/pipeline.md +0 -0
  99. {biblicus-0.9.0 → biblicus-0.11.0}/docs/extractors/pipeline-utilities/select-longest.md +0 -0
  100. {biblicus-0.9.0 → biblicus-0.11.0}/docs/extractors/pipeline-utilities/select-override.md +0 -0
  101. {biblicus-0.9.0 → biblicus-0.11.0}/docs/extractors/pipeline-utilities/select-smart-override.md +0 -0
  102. {biblicus-0.9.0 → biblicus-0.11.0}/docs/extractors/pipeline-utilities/select-text.md +0 -0
  103. {biblicus-0.9.0 → biblicus-0.11.0}/docs/extractors/speech-to-text/deepgram.md +0 -0
  104. {biblicus-0.9.0 → biblicus-0.11.0}/docs/extractors/speech-to-text/index.md +0 -0
  105. {biblicus-0.9.0 → biblicus-0.11.0}/docs/extractors/speech-to-text/openai.md +0 -0
  106. {biblicus-0.9.0 → biblicus-0.11.0}/docs/extractors/text-document/index.md +0 -0
  107. {biblicus-0.9.0 → biblicus-0.11.0}/docs/extractors/text-document/markitdown.md +0 -0
  108. {biblicus-0.9.0 → biblicus-0.11.0}/docs/extractors/text-document/metadata.md +0 -0
  109. {biblicus-0.9.0 → biblicus-0.11.0}/docs/extractors/text-document/pdf.md +0 -0
  110. {biblicus-0.9.0 → biblicus-0.11.0}/docs/extractors/vlm-document/docling-granite.md +0 -0
  111. {biblicus-0.9.0 → biblicus-0.11.0}/docs/extractors/vlm-document/docling-smol.md +0 -0
  112. {biblicus-0.9.0 → biblicus-0.11.0}/docs/extractors/vlm-document/index.md +0 -0
  113. {biblicus-0.9.0 → biblicus-0.11.0}/features/backend_validation.feature +0 -0
  114. {biblicus-0.9.0 → biblicus-0.11.0}/features/biblicus_corpus.feature +0 -0
  115. {biblicus-0.9.0 → biblicus-0.11.0}/features/cli_entrypoint.feature +0 -0
  116. {biblicus-0.9.0 → biblicus-0.11.0}/features/cli_parsing.feature +0 -0
  117. {biblicus-0.9.0 → biblicus-0.11.0}/features/cli_step_spec_parsing.feature +0 -0
  118. {biblicus-0.9.0 → biblicus-0.11.0}/features/content_sniffing.feature +0 -0
  119. {biblicus-0.9.0 → biblicus-0.11.0}/features/context_pack.feature +0 -0
  120. {biblicus-0.9.0 → biblicus-0.11.0}/features/context_pack_cli.feature +0 -0
  121. {biblicus-0.9.0 → biblicus-0.11.0}/features/corpus_edge_cases.feature +0 -0
  122. {biblicus-0.9.0 → biblicus-0.11.0}/features/corpus_identity.feature +0 -0
  123. {biblicus-0.9.0 → biblicus-0.11.0}/features/corpus_purge.feature +0 -0
  124. {biblicus-0.9.0 → biblicus-0.11.0}/features/crawl.feature +0 -0
  125. {biblicus-0.9.0 → biblicus-0.11.0}/features/docling_granite_extractor.feature +0 -0
  126. {biblicus-0.9.0 → biblicus-0.11.0}/features/docling_smol_extractor.feature +0 -0
  127. {biblicus-0.9.0 → biblicus-0.11.0}/features/error_cases.feature +0 -0
  128. {biblicus-0.9.0 → biblicus-0.11.0}/features/evaluation.feature +0 -0
  129. {biblicus-0.9.0 → biblicus-0.11.0}/features/evidence_processing.feature +0 -0
  130. {biblicus-0.9.0 → biblicus-0.11.0}/features/extraction_error_handling.feature +0 -0
  131. {biblicus-0.9.0 → biblicus-0.11.0}/features/extraction_run_lifecycle.feature +0 -0
  132. {biblicus-0.9.0 → biblicus-0.11.0}/features/extraction_selection.feature +0 -0
  133. {biblicus-0.9.0 → biblicus-0.11.0}/features/extraction_selection_longest.feature +0 -0
  134. {biblicus-0.9.0 → biblicus-0.11.0}/features/extractor_pipeline.feature +0 -0
  135. {biblicus-0.9.0 → biblicus-0.11.0}/features/extractor_validation.feature +0 -0
  136. {biblicus-0.9.0 → biblicus-0.11.0}/features/frontmatter.feature +0 -0
  137. {biblicus-0.9.0 → biblicus-0.11.0}/features/hook_config_validation.feature +0 -0
  138. {biblicus-0.9.0 → biblicus-0.11.0}/features/hook_error_handling.feature +0 -0
  139. {biblicus-0.9.0 → biblicus-0.11.0}/features/import_tree.feature +0 -0
  140. {biblicus-0.9.0 → biblicus-0.11.0}/features/inference_backend.feature +0 -0
  141. {biblicus-0.9.0 → biblicus-0.11.0}/features/ingest_sources.feature +0 -0
  142. {biblicus-0.9.0 → biblicus-0.11.0}/features/integration_audio_samples.feature +0 -0
  143. {biblicus-0.9.0 → biblicus-0.11.0}/features/integration_image_samples.feature +0 -0
  144. {biblicus-0.9.0 → biblicus-0.11.0}/features/integration_mixed_corpus.feature +0 -0
  145. {biblicus-0.9.0 → biblicus-0.11.0}/features/integration_mixed_extraction.feature +0 -0
  146. {biblicus-0.9.0 → biblicus-0.11.0}/features/integration_ocr_image_extraction.feature +0 -0
  147. {biblicus-0.9.0 → biblicus-0.11.0}/features/integration_pdf_retrieval.feature +0 -0
  148. {biblicus-0.9.0 → biblicus-0.11.0}/features/integration_pdf_samples.feature +0 -0
  149. {biblicus-0.9.0 → biblicus-0.11.0}/features/integration_unstructured_extraction.feature +0 -0
  150. {biblicus-0.9.0 → biblicus-0.11.0}/features/integration_wikipedia.feature +0 -0
  151. {biblicus-0.9.0 → biblicus-0.11.0}/features/knowledge_base.feature +0 -0
  152. {biblicus-0.9.0 → biblicus-0.11.0}/features/lifecycle_hooks.feature +0 -0
  153. {biblicus-0.9.0 → biblicus-0.11.0}/features/markitdown_extractor.feature +0 -0
  154. {biblicus-0.9.0 → biblicus-0.11.0}/features/model_validation.feature +0 -0
  155. {biblicus-0.9.0 → biblicus-0.11.0}/features/ocr_extractor.feature +0 -0
  156. {biblicus-0.9.0 → biblicus-0.11.0}/features/paddleocr_vl_extractor.feature +0 -0
  157. {biblicus-0.9.0 → biblicus-0.11.0}/features/paddleocr_vl_parse_api_response.feature +0 -0
  158. {biblicus-0.9.0 → biblicus-0.11.0}/features/pdf_text_extraction.feature +0 -0
  159. {biblicus-0.9.0 → biblicus-0.11.0}/features/python_api.feature +0 -0
  160. {biblicus-0.9.0 → biblicus-0.11.0}/features/python_hook_logging.feature +0 -0
  161. {biblicus-0.9.0 → biblicus-0.11.0}/features/query_processing.feature +0 -0
  162. {biblicus-0.9.0 → biblicus-0.11.0}/features/recipe_file_extraction.feature +0 -0
  163. {biblicus-0.9.0 → biblicus-0.11.0}/features/retrieval_budget.feature +0 -0
  164. {biblicus-0.9.0 → biblicus-0.11.0}/features/retrieval_scan.feature +0 -0
  165. {biblicus-0.9.0 → biblicus-0.11.0}/features/retrieval_sqlite_full_text_search.feature +0 -0
  166. {biblicus-0.9.0 → biblicus-0.11.0}/features/retrieval_uses_extraction_run.feature +0 -0
  167. {biblicus-0.9.0 → biblicus-0.11.0}/features/retrieval_utilities.feature +0 -0
  168. {biblicus-0.9.0 → biblicus-0.11.0}/features/select_override.feature +0 -0
  169. {biblicus-0.9.0 → biblicus-0.11.0}/features/smart_override_selection.feature +0 -0
  170. {biblicus-0.9.0 → biblicus-0.11.0}/features/source_loading.feature +0 -0
  171. {biblicus-0.9.0 → biblicus-0.11.0}/features/steps/backend_steps.py +0 -0
  172. {biblicus-0.9.0 → biblicus-0.11.0}/features/steps/cli_parsing_steps.py +0 -0
  173. {biblicus-0.9.0 → biblicus-0.11.0}/features/steps/context_pack_steps.py +0 -0
  174. {biblicus-0.9.0 → biblicus-0.11.0}/features/steps/extractor_steps.py +1 -1
  175. {biblicus-0.9.0 → biblicus-0.11.0}/features/steps/frontmatter_steps.py +0 -0
  176. {biblicus-0.9.0 → biblicus-0.11.0}/features/steps/knowledge_base_steps.py +0 -0
  177. {biblicus-0.9.0 → biblicus-0.11.0}/features/steps/model_steps.py +0 -0
  178. {biblicus-0.9.0 → biblicus-0.11.0}/features/steps/pdf_steps.py +0 -0
  179. {biblicus-0.9.0 → biblicus-0.11.0}/features/steps/python_api_steps.py +1 -1
  180. {biblicus-0.9.0 → biblicus-0.11.0}/features/steps/rapidocr_steps.py +0 -0
  181. {biblicus-0.9.0 → biblicus-0.11.0}/features/steps/stt_deepgram_steps.py +0 -0
  182. {biblicus-0.9.0 → biblicus-0.11.0}/features/steps/stt_steps.py +0 -0
  183. {biblicus-0.9.0 → biblicus-0.11.0}/features/steps/unstructured_steps.py +0 -0
  184. {biblicus-0.9.0 → biblicus-0.11.0}/features/streaming_ingest.feature +0 -0
  185. {biblicus-0.9.0 → biblicus-0.11.0}/features/stt_deepgram_extractor.feature +0 -0
  186. {biblicus-0.9.0 → biblicus-0.11.0}/features/stt_extractor.feature +0 -0
  187. {biblicus-0.9.0 → biblicus-0.11.0}/features/text_extraction_runs.feature +0 -0
  188. {biblicus-0.9.0 → biblicus-0.11.0}/features/token_budget.feature +0 -0
  189. {biblicus-0.9.0 → biblicus-0.11.0}/features/topic_modeling.feature +0 -0
  190. {biblicus-0.9.0 → biblicus-0.11.0}/features/unstructured_extractor.feature +0 -0
  191. {biblicus-0.9.0 → biblicus-0.11.0}/features/user_config.feature +0 -0
  192. {biblicus-0.9.0 → biblicus-0.11.0}/setup.cfg +0 -0
  193. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/__main__.py +0 -0
  194. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/analysis/base.py +0 -0
  195. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/analysis/llm.py +0 -0
  196. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/analysis/schema.py +0 -0
  197. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/backends/base.py +0 -0
  198. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/backends/scan.py +0 -0
  199. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/constants.py +0 -0
  200. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/context.py +0 -0
  201. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/crawl.py +0 -0
  202. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/errors.py +0 -0
  203. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/evaluation.py +0 -0
  204. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/extractors/__init__.py +0 -0
  205. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/extractors/base.py +0 -0
  206. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/extractors/deepgram_stt.py +0 -0
  207. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/extractors/docling_granite_text.py +0 -0
  208. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/extractors/docling_smol_text.py +0 -0
  209. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/extractors/metadata_text.py +0 -0
  210. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/extractors/openai_stt.py +0 -0
  211. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/extractors/pass_through_text.py +0 -0
  212. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/extractors/pdf_text.py +0 -0
  213. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/extractors/pipeline.py +0 -0
  214. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/extractors/rapidocr_text.py +0 -0
  215. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/extractors/select_longest_text.py +0 -0
  216. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/extractors/select_override.py +0 -0
  217. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/extractors/select_smart_override.py +0 -0
  218. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/extractors/select_text.py +0 -0
  219. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/extractors/unstructured_text.py +0 -0
  220. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/frontmatter.py +0 -0
  221. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/hook_logging.py +0 -0
  222. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/hook_manager.py +0 -0
  223. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/hooks.py +0 -0
  224. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/ignore.py +0 -0
  225. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/inference.py +0 -0
  226. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/knowledge_base.py +0 -0
  227. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/retrieval.py +0 -0
  228. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/sources.py +0 -0
  229. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/time.py +0 -0
  230. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus/uris.py +0 -0
  231. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus.egg-info/dependency_links.txt +0 -0
  232. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus.egg-info/entry_points.txt +0 -0
  233. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus.egg-info/requires.txt +0 -0
  234. {biblicus-0.9.0 → biblicus-0.11.0}/src/biblicus.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: biblicus
3
- Version: 0.9.0
3
+ Version: 0.11.0
4
4
  Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
5
5
  License: MIT
6
6
  Requires-Python: >=3.9
@@ -493,6 +493,12 @@ Two backends are included.
493
493
 
494
494
  For detailed documentation including configuration options, performance characteristics, and usage examples, see the [Backend Reference][backend-reference].
495
495
 
496
+ ## Retrieval documentation
497
+
498
+ For the retrieval pipeline overview and run artifacts, see `docs/RETRIEVAL.md`. For retrieval quality upgrades
499
+ (tuned lexical baseline, reranking, hybrid retrieval), see `docs/RETRIEVAL_QUALITY.md`. For evaluation workflows
500
+ and dataset formats, see `docs/RETRIEVAL_EVALUATION.md`.
501
+
496
502
  ## Extraction backends
497
503
 
498
504
  These extractors are built in. Optional ones require extra dependencies. See [text extraction documentation][text-extraction] for details.
@@ -531,12 +537,13 @@ For detailed documentation on all extractors, see the [Extractor Reference][extr
531
537
 
532
538
  ## Topic modeling analysis
533
539
 
534
- Biblicus can run analysis pipelines on extracted text without changing the raw corpus. Topic modeling is the first
535
- analysis backend. It reads an extraction run, optionally applies an LLM-driven extraction pass, applies lexical
536
- processing, runs BERTopic, and optionally applies an LLM fine-tuning pass to label topics. The output is structured
537
- JavaScript Object Notation.
540
+ Biblicus can run analysis pipelines on extracted text without changing the raw corpus. Profiling and topic modeling
541
+ are the first analysis backends. Profiling summarizes corpus composition and extraction coverage. Topic modeling reads
542
+ an extraction run, optionally applies an LLM-driven extraction pass, applies lexical processing, runs BERTopic, and
543
+ optionally applies an LLM fine-tuning pass to label topics. The output is structured JavaScript Object Notation.
538
544
 
539
- See `docs/ANALYSIS.md` for the analysis pipeline overview and `docs/TOPIC_MODELING.md` for topic modeling details.
545
+ See `docs/ANALYSIS.md` for the analysis pipeline overview, `docs/PROFILING.md` for profiling, and
546
+ `docs/TOPIC_MODELING.md` for topic modeling details.
540
547
 
541
548
  Run a topic analysis using a recipe file:
542
549
 
@@ -447,6 +447,12 @@ Two backends are included.
447
447
 
448
448
  For detailed documentation including configuration options, performance characteristics, and usage examples, see the [Backend Reference][backend-reference].
449
449
 
450
+ ## Retrieval documentation
451
+
452
+ For the retrieval pipeline overview and run artifacts, see `docs/RETRIEVAL.md`. For retrieval quality upgrades
453
+ (tuned lexical baseline, reranking, hybrid retrieval), see `docs/RETRIEVAL_QUALITY.md`. For evaluation workflows
454
+ and dataset formats, see `docs/RETRIEVAL_EVALUATION.md`.
455
+
450
456
  ## Extraction backends
451
457
 
452
458
  These extractors are built in. Optional ones require extra dependencies. See [text extraction documentation][text-extraction] for details.
@@ -485,12 +491,13 @@ For detailed documentation on all extractors, see the [Extractor Reference][extr
485
491
 
486
492
  ## Topic modeling analysis
487
493
 
488
- Biblicus can run analysis pipelines on extracted text without changing the raw corpus. Topic modeling is the first
489
- analysis backend. It reads an extraction run, optionally applies an LLM-driven extraction pass, applies lexical
490
- processing, runs BERTopic, and optionally applies an LLM fine-tuning pass to label topics. The output is structured
491
- JavaScript Object Notation.
494
+ Biblicus can run analysis pipelines on extracted text without changing the raw corpus. Profiling and topic modeling
495
+ are the first analysis backends. Profiling summarizes corpus composition and extraction coverage. Topic modeling reads
496
+ an extraction run, optionally applies an LLM-driven extraction pass, applies lexical processing, runs BERTopic, and
497
+ optionally applies an LLM fine-tuning pass to label topics. The output is structured JavaScript Object Notation.
492
498
 
493
- See `docs/ANALYSIS.md` for the analysis pipeline overview and `docs/TOPIC_MODELING.md` for topic modeling details.
499
+ See `docs/ANALYSIS.md` for the analysis pipeline overview, `docs/PROFILING.md` for profiling, and
500
+ `docs/TOPIC_MODELING.md` for topic modeling details.
494
501
 
495
502
  Run a topic analysis using a recipe file:
496
503
 
@@ -34,3 +34,14 @@ python3 scripts/topic_modeling_integration.py --corpus corpora/ag_news_demo --fo
34
34
 
35
35
  The command prints the analysis run identifier and the output path. Open the resulting `output.json` to inspect per-topic
36
36
  labels, keywords, and document examples.
37
+
38
+ ## Profiling analysis
39
+
40
+ Profiling is the baseline analysis backend. It summarizes corpus composition and extraction coverage using
41
+ deterministic counts and distribution metrics. See `docs/PROFILING.md` for the full reference and working demo.
42
+
43
+ Run profiling from the CLI:
44
+
45
+ ```
46
+ biblicus analyze profile --corpus corpora/example --extraction-run pipeline:RUN_ID
47
+ ```
@@ -88,11 +88,11 @@ Evidence is the canonical output of retrieval. Required fields:
88
88
  ### Integration boundary
89
89
 
90
90
  - Biblicus can integrate with Tactus as a **Model Context Protocol toolset**, for example with tool names such as `knowledge_base_ingest`, `knowledge_base_query`, and `knowledge_base_stats`.
91
- - We will **not** add a knowledge base or retrieval augmented generation language primitive in version zero. Revisit only if we need semantics that tools cannot express cleanly, such as enforceable policy boundaries, runtime managed durability, caching hooks, or guaranteed instrumentation.
91
+ - We do **not** add a knowledge base or retrieval augmented generation language primitive in version zero. Revisit only if we need semantics that tools cannot express cleanly, such as enforceable policy boundaries, runtime managed durability, caching hooks, or guaranteed instrumentation.
92
92
 
93
93
  ### Interface packaging
94
94
 
95
- - The knowledge base interface is a **small protocol and reference implementation**, including tool schemas and a reference Model Context Protocol server. We will not build a full managed service in version zero.
95
+ - The knowledge base interface is a **small protocol and reference implementation**, including tool schemas and a reference Model Context Protocol server. We do not build a full managed service in version zero.
96
96
 
97
97
  ### Corpus identity and layout
98
98
 
@@ -143,7 +143,7 @@ The interface stays the same; topology is configuration.
143
143
  - When a backend produces persisted materializations, Biblicus treats them as **versioned build runs** identified by `run_id` (rather than overwriting in place by default).
144
144
  - Manifests exist even for just-in-time backends (materializations may be empty).
145
145
  - Full directed acyclic graph lineage is not included in version zero; revisit only if needed.
146
- - Future (optional): define **shared materialization formats** (canonical chunk and embedding stores) so multiple backends can reuse intermediates when it makes sense; keep it opt-in.
146
+ - Optional: define **shared materialization formats** (canonical chunk and embedding stores) so multiple backends can reuse intermediates when it makes sense; keep it opt-in.
147
147
 
148
148
  ### Evaluation
149
149
 
@@ -156,7 +156,7 @@ The interface stays the same; topology is configuration.
156
156
  - The corpus catalog is **file-based** (committable, portable, backend-agnostic) so any backend/tool can consume it without requiring a database engine.
157
157
  - Canonical version zero format is a single JavaScript Object Notation file at `.biblicus/catalog.json`, written atomically (temporary file and rename) on updates.
158
158
  - The catalog includes `latest_run_id` and run manifests are stored at `.biblicus/runs/<run_id>.json`.
159
- - If this ever becomes a bottleneck at very large scales, we will **change the specification** (bump `schema_version`) rather than introduce multiple “supported” catalog storage modes.
159
+ - If this becomes a bottleneck at very large scales, we **change the specification** (bump `schema_version`) rather than introduce multiple “supported” catalog storage modes.
160
160
 
161
161
  ## Near-term deliverables
162
162
 
@@ -216,7 +216,7 @@ Version zero locked this as policy. A prune workflow was not implemented yet.
216
216
 
217
217
  Goal: retain derived artifacts from multiple implementations side by side so a user can compare results and switch between implementations without losing work.
218
218
 
219
- This decision applies to extraction plugins and retrieval backends, and to any future plugin type that produces derived artifacts.
219
+ This decision applies to extraction plugins and retrieval backends, and to any plugin type that produces derived artifacts.
220
220
 
221
221
  Option A: store artifacts under the corpus, partitioned by plugin type
222
222
 
@@ -369,7 +369,7 @@ Version zero implemented option A by writing structured log entries for hook exe
369
369
 
370
370
  ## Outcomes and remaining questions
371
371
 
372
- The hook protocol and hook logging policy above were implemented in version zero. This section records what was implemented, plus the questions that remain for future iterations.
372
+ The hook protocol and hook logging policy above were implemented in version zero. This section records what was implemented and the open questions tracked for later iterations.
373
373
 
374
374
  ### Hook contexts implemented in version zero
375
375
 
@@ -6,7 +6,7 @@ For the ordered plan of what to build next, see `docs/ROADMAP.md`.
6
6
 
7
7
  ## Diagram of the current system and the next layers
8
8
 
9
- Blue boxes are implemented now. Purple boxes are planned next layers that we can build and compare.
9
+ Blue boxes are implemented now. Purple boxes are layers that are not implemented yet; we can build and compare them.
10
10
 
11
11
  ```mermaid
12
12
  %%{init: {"flowchart": {"useMaxWidth": true, "nodeSpacing": 18, "rankSpacing": 22}}}%%
@@ -214,6 +214,14 @@ python3 scripts/topic_modeling_integration.py \
214
214
  The command prints the analysis run identifier and the output path. Open the `output.json` file to inspect per-topic labels,
215
215
  keywords, and document examples.
216
216
 
217
+ ### Profiling analysis demo
218
+
219
+ The profiling demo downloads AG News, runs extraction, and produces a profiling report.
220
+
221
+ ```
222
+ python3 scripts/profiling_demo.py --corpus corpora/profiling_demo --force
223
+ ```
224
+
217
225
  ### Select extracted text within a pipeline
218
226
 
219
227
  When you want an explicit choice among multiple extraction outputs, add a selection extractor step at the end of the pipeline.
@@ -225,7 +233,7 @@ python3 -m biblicus extract build --corpus corpora/demo \\
225
233
  --step select-text
226
234
  ```
227
235
 
228
- Copy the `run_id` from the JavaScript Object Notation output. You will use it as `EXTRACTION_RUN_ID` in the next command.
236
+ Copy the `run_id` from the JavaScript Object Notation output. Use it as `EXTRACTION_RUN_ID` in the next command.
229
237
 
230
238
  ```
231
239
  python3 -m biblicus build --corpus corpora/demo --backend sqlite-full-text-search \\
@@ -243,7 +251,7 @@ python3 scripts/download_pdf_samples.py --corpus corpora/pdf_samples --force
243
251
  python3 -m biblicus extract build --corpus corpora/pdf_samples --step pdf-text
244
252
  ```
245
253
 
246
- Copy the `run_id` from the JavaScript Object Notation output. You will use it as `PDF_EXTRACTION_RUN_ID` in the next command.
254
+ Copy the `run_id` from the JavaScript Object Notation output. Use it as `PDF_EXTRACTION_RUN_ID` in the next command.
247
255
 
248
256
  ```
249
257
  python3 -m biblicus build --corpus corpora/pdf_samples --backend sqlite-full-text-search --config extraction_run=pipeline:PDF_EXTRACTION_RUN_ID --config chunk_size=200 --config chunk_overlap=50 --config snippet_characters=120
@@ -0,0 +1,98 @@
1
+ # Corpus profiling analysis
2
+
3
+ Biblicus provides a profiling analysis backend that summarizes corpus contents using deterministic counts and
4
+ coverage metrics. Profiling is intended as a fast, local baseline before heavier analysis such as topic modeling.
5
+
6
+ ## What profiling does
7
+
8
+ The profiling analysis reports:
9
+
10
+ - Total item count and media type distribution
11
+ - Extracted text coverage (present, empty, missing)
12
+ - Size and length distributions with percentiles
13
+ - Tag coverage and top tags
14
+
15
+ The output is structured JavaScript Object Notation that can be stored, versioned, and compared across runs.
16
+
17
+ ## Run profiling from the CLI
18
+
19
+ ```
20
+ biblicus analyze profile --corpus corpora/example --extraction-run pipeline:RUN_ID
21
+ ```
22
+
23
+ If you omit `--extraction-run`, Biblicus uses the latest extraction run and emits a reproducibility warning.
24
+
25
+ To customize profiling metrics, pass a recipe file:
26
+
27
+ ```
28
+ biblicus analyze profile --corpus corpora/example --recipe recipes/profiling.yml --extraction-run pipeline:RUN_ID
29
+ ```
30
+
31
+ ### Profiling recipe configuration
32
+
33
+ Profiling recipes use the analysis schema version and accept these fields:
34
+
35
+ - `schema_version`: analysis schema version, currently `1`
36
+ - `sample_size`: optional cap for distribution calculations
37
+ - `min_text_characters`: minimum extracted text length for inclusion
38
+ - `percentiles`: percentiles to compute for size and length distributions
39
+ - `top_tag_count`: maximum number of tags to list in `top_tags`
40
+ - `tag_filters`: optional list of tags to include in tag coverage metrics
41
+
42
+ Example recipe:
43
+
44
+ ```
45
+ schema_version: 1
46
+ sample_size: 500
47
+ min_text_characters: 50
48
+ percentiles: [50, 90, 99]
49
+ top_tag_count: 10
50
+ tag_filters: ["ag_news", "label:World"]
51
+ ```
52
+
53
+ ## Run profiling from Python
54
+
55
+ ```
56
+ from pathlib import Path
57
+
58
+ from biblicus.analysis import get_analysis_backend
59
+ from biblicus.corpus import Corpus
60
+ from biblicus.models import ExtractionRunReference
61
+
62
+ corpus = Corpus.open(Path("corpora/example"))
63
+ backend = get_analysis_backend("profiling")
64
+ output = backend.run_analysis(
65
+ corpus,
66
+ recipe_name="default",
67
+ config={
68
+ "schema_version": 1,
69
+ "sample_size": 500,
70
+ "min_text_characters": 50,
71
+ "percentiles": [50, 90, 99],
72
+ "top_tag_count": 10,
73
+ "tag_filters": ["ag_news"],
74
+ },
75
+ extraction_run=ExtractionRunReference(
76
+ extractor_id="pipeline",
77
+ run_id="RUN_ID",
78
+ ),
79
+ )
80
+ print(output.model_dump())
81
+ ```
82
+
83
+ ## Output location
84
+
85
+ Profiling output is stored under:
86
+
87
+ ```
88
+ .biblicus/runs/analysis/profiling/<run_id>/output.json
89
+ ```
90
+
91
+ ## Working demo
92
+
93
+ A runnable demo is provided in `scripts/profiling_demo.py`. It downloads a corpus, runs extraction, and executes the
94
+ profiling analysis so you can inspect the output:
95
+
96
+ ```
97
+ python3 scripts/profiling_demo.py --corpus corpora/profiling_demo --force
98
+ ```
@@ -0,0 +1,47 @@
1
+ # Retrieval
2
+
3
+ Biblicus treats retrieval as a reproducible, explicit pipeline stage that transforms a corpus into structured evidence.
4
+ Retrieval is separated from extraction and context shaping so each can be evaluated independently and swapped without
5
+ rewriting ingestion.
6
+
7
+ ## Retrieval concepts
8
+
9
+ - **Backend**: a pluggable retrieval implementation that can build and query runs.
10
+ - **Run**: a recorded retrieval build for a corpus and extraction run.
11
+ - **Evidence**: structured output containing identifiers, provenance, and scores.
12
+ - **Stage**: an explicit step in a retrieval run, such as retrieve, rerank, or filter.
13
+
14
+ ## How retrieval runs work
15
+
16
+ 1) Ingest raw items into a corpus.
17
+ 2) Build an extraction run to produce text artifacts.
18
+ 3) Build a retrieval run with a backend, referencing the extraction run.
19
+ 4) Query the run to return evidence.
20
+
21
+ Retrieval runs are stored under:
22
+
23
+ ```
24
+ .biblicus/runs/retrieval/<backend_id>/<run_id>/
25
+ ```
26
+
27
+ ## Backends
28
+
29
+ See `docs/backends/index.md` for backend selection and configuration.
30
+
31
+ ## Evaluation
32
+
33
+ Retrieval runs are evaluated against datasets with explicit budgets. See `docs/RETRIEVAL_EVALUATION.md` for the
34
+ dataset format and workflow, `docs/FEATURE_INDEX.md` for the behavior specifications, and `docs/CONTEXT_PACK.md` for
35
+ how evidence feeds into context packs.
36
+
37
+ ## Why the separation matters
38
+
39
+ Keeping extraction and retrieval distinct makes it possible to:
40
+
41
+ - Reuse the same extracted artifacts across many retrieval backends.
42
+ - Compare backends against the same corpus and dataset inputs.
43
+ - Record and audit retrieval decisions without mixing in prompting or context formatting.
44
+
45
+ ## Retrieval quality
46
+
47
+ For retrieval quality upgrades, see `docs/RETRIEVAL_QUALITY.md`.
@@ -0,0 +1,74 @@
1
+ # Retrieval evaluation
2
+
3
+ Biblicus evaluates retrieval runs against deterministic datasets so quality comparisons are repeatable across backends
4
+ and corpora. Evaluations keep the evidence-first model intact by reporting per-query evidence alongside summary
5
+ metrics.
6
+
7
+ ## Dataset format
8
+
9
+ Retrieval datasets are stored as JavaScript Object Notation files with a strict schema:
10
+
11
+ ```json
12
+ {
13
+ "schema_version": 1,
14
+ "name": "example-dataset",
15
+ "description": "Small hand-labeled dataset for smoke tests.",
16
+ "queries": [
17
+ {
18
+ "query_id": "q-001",
19
+ "query_text": "alpha",
20
+ "expected_item_id": "item-id-123",
21
+ "kind": "gold"
22
+ }
23
+ ]
24
+ }
25
+ ```
26
+
27
+ Each query includes either an `expected_item_id` or an `expected_source_uri`. The `kind` field records whether the
28
+ query is hand-labeled (`gold`) or synthetic.
29
+
30
+ ## Running an evaluation
31
+
32
+ Use the command-line interface to evaluate a retrieval run against a dataset:
33
+
34
+ ```bash
35
+ biblicus eval --corpus corpora/example --run <run_id> --dataset datasets/retrieval.json \
36
+ --max-total-items 5 --max-total-characters 2000 --max-items-per-source 5
37
+ ```
38
+
39
+ If `--run` is omitted, the latest retrieval run is used. Evaluations are deterministic for the same corpus, run,
40
+ dataset, and budget.
41
+
42
+ ## Output
43
+
44
+ The evaluation output includes:
45
+
46
+ - Dataset metadata (name, description, query count).
47
+ - Run metadata (backend ID, run ID, evaluation timestamp).
48
+ - Metrics (hit rate, precision-at-k, mean reciprocal rank).
49
+ - System diagnostics (latency percentiles and index size).
50
+
51
+ The output is JavaScript Object Notation suitable for downstream reporting.
52
+
53
+ ## Python usage
54
+
55
+ ```python
56
+ from pathlib import Path
57
+
58
+ from biblicus.corpus import Corpus
59
+ from biblicus.evaluation import evaluate_run, load_dataset
60
+ from biblicus.models import QueryBudget
61
+
62
+ corpus = Corpus.open(Path("corpora/example"))
63
+ run = corpus.load_run("<run_id>")
64
+ dataset = load_dataset(Path("datasets/retrieval.json"))
65
+ budget = QueryBudget(max_total_items=5, max_total_characters=2000, max_items_per_source=5)
66
+ result = evaluate_run(corpus=corpus, run=run, dataset=dataset, budget=budget)
67
+ print(result.model_dump_json(indent=2))
68
+ ```
69
+
70
+ ## Design notes
71
+
72
+ - Evaluation is reproducible by construction: the run manifest, dataset, and budget fully determine the results.
73
+ - The evaluation workflow expects retrieval stages to remain explicit in the run artifacts.
74
+ - Reports are portable, so comparisons across backends and corpora are straightforward.
@@ -0,0 +1,42 @@
1
+ # Retrieval quality upgrades
2
+
3
+ This document describes the retrieval quality upgrades available in Biblicus. It is a reference for how retrieval
4
+ quality is expressed in runs and should be read alongside `docs/ROADMAP.md`.
5
+
6
+ ## Goals
7
+
8
+ - Improve relevance without losing determinism or reproducibility.
9
+ - Keep retrieval stages explicit and visible in run artifacts.
10
+ - Preserve the evidence-first output model.
11
+
12
+ ## Available upgrades
13
+
14
+ ### 1) Tuned lexical baseline
15
+
16
+ - BM25-style scoring with configurable parameters.
17
+ - N-gram range controls.
18
+ - Stop word strategy per backend.
19
+ - Field weighting (for example: title, body, metadata).
20
+
21
+ ### 2) Reranking stage
22
+
23
+ - Optional rerank step that re-scores top-N candidates.
24
+ - Deterministic scoring keeps rerank behavior reproducible.
25
+
26
+ ### 3) Hybrid retrieval
27
+
28
+ - Combine lexical and embedding signals.
29
+ - Expose fusion weights in the recipe schema.
30
+ - Emit stage-level scores and weights in evidence metadata.
31
+
32
+ ## Evaluation guidance
33
+
34
+ - Measure accuracy-at-k and compare runs on the same evaluation datasets.
35
+ - Run artifacts capture each stage and configuration for auditability.
36
+ - Deterministic settings remain available as the default baseline.
37
+
38
+ ## Non-goals
39
+
40
+ - Automated hyperparameter tuning.
41
+ - Hidden fallback stages that obscure retrieval behavior.
42
+ - UI-driven tuning in this phase.
@@ -31,6 +31,21 @@ Acceptance checks:
31
31
  - Dataset formats are versioned when they change.
32
32
  - Reports remain deterministic for the same inputs.
33
33
 
34
+ ## Next: retrieval quality upgrades
35
+
36
+ Goal: make retrieval relevance stronger while keeping deterministic baselines and clear evaluation.
37
+
38
+ Deliverables:
39
+
40
+ - A tuned lexical baseline (for example: BM25 configuration, n-grams, field weighting, stop word controls).
41
+ - A reranking stage that can refine top-N results with either a cross-encoder or a large language model reranker.
42
+ - A hybrid retrieval mode that combines lexical signals with embeddings and exposes weights explicitly.
43
+
44
+ Acceptance checks:
45
+
46
+ - Accuracy-at-k improves on the same evaluation datasets without regressions in determinism.
47
+ - Retrieval stages are explicitly recorded (retrieve, rerank, filter) in the output artifacts.
48
+
34
49
  ## Next: context pack policy surfaces
35
50
 
36
51
  Goal: make context shaping policies easier to evaluate and swap.
@@ -67,7 +82,6 @@ Goal: provide lightweight analysis utilities that summarize corpus themes and gu
67
82
 
68
83
  Deliverables:
69
84
 
70
- - Basic data profiling reports (counts, media types, size distributions, tag coverage).
71
85
  - Hidden Markov modeling analysis for sequence-driven corpora.
72
86
  - A way to compare analysis outputs across corpora or corpus snapshots.
73
87
 
@@ -4,8 +4,13 @@ Sphinx configuration for Biblicus documentation.
4
4
 
5
5
  from __future__ import annotations
6
6
 
7
+ import os
8
+ import sys
7
9
  from pathlib import Path
8
10
 
11
+ from pygments.lexers.special import TextLexer
12
+ from sphinx.highlighting import lexers
13
+
9
14
  PROJECT_ROOT = Path(__file__).resolve().parent.parent
10
15
  SOURCE_ROOT = PROJECT_ROOT / "src"
11
16
 
@@ -31,8 +36,6 @@ html_theme_options = {
31
36
  }
32
37
 
33
38
  # ReadTheDocs integration - canonical URL for SEO
34
- import os
35
-
36
39
  if os.environ.get("READTHEDOCS"):
37
40
  rtd_version = os.environ.get("READTHEDOCS_VERSION", "latest")
38
41
  rtd_project = os.environ.get("READTHEDOCS_PROJECT", "biblicus")
@@ -44,12 +47,6 @@ source_suffix = {
44
47
  }
45
48
 
46
49
  suppress_warnings = ["misc.highlighting_failure"]
47
-
48
- import sys
49
-
50
50
  sys.path.insert(0, str(SOURCE_ROOT))
51
51
 
52
- from pygments.lexers.special import TextLexer
53
- from sphinx.highlighting import lexers
54
-
55
52
  lexers["mermaid"] = TextLexer()
@@ -120,12 +120,12 @@ title: My Document
120
120
  tags: [note, draft]
121
121
  ---
122
122
 
123
- This is the body content that will be extracted.
123
+ This is the body content that is extracted.
124
124
  ```
125
125
 
126
126
  Output text:
127
127
  ```
128
- This is the body content that will be extracted.
128
+ This is the body content that is extracted.
129
129
  ```
130
130
 
131
131
  ### Mixed Format Pipeline
@@ -185,7 +185,7 @@ Non-text items are silently skipped (returns `None`). This allows the extractor
185
185
 
186
186
  ### Encoding Errors
187
187
 
188
- UTF-8 decoding errors will cause per-item failures recorded in `errored_items` but won't halt the entire extraction run.
188
+ UTF-8 decoding errors cause per-item failures recorded in `errored_items` but do not halt the entire extraction run.
189
189
 
190
190
  ### Missing Files
191
191
 
@@ -78,7 +78,7 @@ class UnstructuredExtractorConfig(BaseModel):
78
78
 
79
79
  ### Configuration Options
80
80
 
81
- This extractor currently accepts no configuration. Future versions may expose Unstructured library options.
81
+ This extractor currently accepts no configuration. Optional extensions may expose Unstructured library options.
82
82
 
83
83
  ## Usage
84
84
 
@@ -15,8 +15,12 @@ Contents
15
15
  KNOWLEDGE_BASE
16
16
  BACKENDS
17
17
  backends/index
18
+ RETRIEVAL
19
+ RETRIEVAL_QUALITY
20
+ RETRIEVAL_EVALUATION
18
21
  CONTEXT_PACK
19
22
  ANALYSIS
23
+ PROFILING
20
24
  TOPIC_MODELING
21
25
  DEMOS
22
26
  USER_CONFIGURATION
@@ -56,3 +56,55 @@ Feature: Analysis schema validation
56
56
  When I attempt to validate a vectorizer config with stop words "spanish"
57
57
  Then a model validation error is raised
58
58
  And the validation error mentions "vectorizer.stop_words must be"
59
+
60
+ Scenario: Profiling config rejects invalid sample size
61
+ When I attempt to validate a profiling config with sample size 0
62
+ Then a model validation error is raised
63
+ And the validation error mentions "sample_size"
64
+
65
+ Scenario: Profiling config rejects unsupported schema version
66
+ When I attempt to validate a profiling config with schema version 2
67
+ Then a model validation error is raised
68
+ And the validation error mentions "Unsupported analysis schema version"
69
+
70
+ Scenario: Profiling config rejects invalid percentiles
71
+ When I attempt to validate a profiling config with percentiles "0,101"
72
+ Then a model validation error is raised
73
+ And the validation error mentions "percentiles"
74
+
75
+ Scenario: Profiling config rejects empty percentiles
76
+ When I attempt to validate a profiling config with empty percentiles
77
+ Then a model validation error is raised
78
+ And the validation error mentions "percentiles"
79
+
80
+ Scenario: Profiling config rejects unsorted percentiles
81
+ When I attempt to validate a profiling config with percentiles "90,50"
82
+ Then a model validation error is raised
83
+ And the validation error mentions "percentiles"
84
+
85
+ Scenario: Profiling config rejects empty tag filters
86
+ When I attempt to validate a profiling config with tag filters "alpha,,beta"
87
+ Then a model validation error is raised
88
+ And the validation error mentions "tag_filters"
89
+
90
+ Scenario: Profiling config rejects non-list tag filters
91
+ When I attempt to validate a profiling config with tag filters string "alpha"
92
+ Then a model validation error is raised
93
+ And the validation error mentions "tag_filters"
94
+
95
+ Scenario: Profiling config accepts tag filters None
96
+ When I validate a profiling config with tag filters None
97
+ Then the profiling tag filters are absent
98
+
99
+ Scenario: Profiling config normalizes tag filters
100
+ When I validate a profiling config with tag filters list " alpha ,beta "
101
+ Then the profiling tag filters include "alpha"
102
+ And the profiling tag filters include "beta"
103
+
104
+ Scenario: Profiling ordering helper ignores missing items
105
+ When I order catalog items with missing entries
106
+ Then the ordered catalog item identifiers equal "a,c,b"
107
+
108
+ Scenario: Profiling percentile helper handles empty values
109
+ When I compute a profiling percentile on empty values
110
+ Then the profiling percentile value equals 0
@@ -17,7 +17,6 @@ def _repo_root() -> Path:
17
17
  :return: Repository root path.
18
18
  :rtype: Path
19
19
  """
20
-
21
20
  return Path(__file__).resolve().parent.parent
22
21
 
23
22
 
@@ -32,7 +31,6 @@ def before_scenario(context, scenario) -> None:
32
31
  :return: None.
33
32
  :rtype: None
34
33
  """
35
-
36
34
  import biblicus.__main__ as _biblicus_main
37
35
 
38
36
  _ = _biblicus_main
@@ -74,7 +72,6 @@ def after_scenario(context, scenario) -> None:
74
72
  :return: None.
75
73
  :rtype: None
76
74
  """
77
-
78
75
  if getattr(context, "httpd", None) is not None:
79
76
  context.httpd.shutdown()
80
77
  context.httpd.server_close()
@@ -221,7 +218,9 @@ def after_scenario(context, scenario) -> None:
221
218
  context.fake_paddleocr_vl_behaviors.clear()
222
219
  if getattr(context, "_fake_paddleocr_installed", False):
223
220
  # Remove all paddle-related modules
224
- paddle_module_names = [name for name in list(sys.modules.keys()) if "paddle" in name.lower()]
221
+ paddle_module_names = [
222
+ name for name in list(sys.modules.keys()) if "paddle" in name.lower()
223
+ ]
225
224
  for name in paddle_module_names:
226
225
  sys.modules.pop(name, None)
227
226
  # Restore original modules
@@ -345,7 +344,6 @@ def run_biblicus(
345
344
  :return: Captured execution result.
346
345
  :rtype: RunResult
347
346
  """
348
-
349
347
  import contextlib
350
348
  import io
351
349