biblicus 0.15.1__tar.gz → 0.16.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (353)
  1. {biblicus-0.15.1/src/biblicus.egg-info → biblicus-0.16.0}/PKG-INFO +2 -1
  2. {biblicus-0.15.1 → biblicus-0.16.0}/docs/BACKENDS.md +1 -1
  3. biblicus-0.16.0/docs/CHUNKING.md +69 -0
  4. biblicus-0.16.0/docs/EMBEDDING_RETRIEVAL.md +57 -0
  5. biblicus-0.16.0/docs/PR_FAQ_EMBEDDING_RETRIEVAL.md +105 -0
  6. {biblicus-0.15.1 → biblicus-0.16.0}/docs/RETRIEVAL.md +2 -1
  7. biblicus-0.16.0/docs/backends/embedding-index-file.md +34 -0
  8. biblicus-0.16.0/docs/backends/embedding-index-inmemory.md +34 -0
  9. {biblicus-0.15.1 → biblicus-0.16.0}/docs/backends/index.md +29 -5
  10. biblicus-0.16.0/docs/backends/tf-vector.md +59 -0
  11. {biblicus-0.15.1 → biblicus-0.16.0}/docs/conf.py +2 -1
  12. {biblicus-0.15.1 → biblicus-0.16.0}/features/cli_parsing.feature +26 -0
  13. biblicus-0.16.0/features/embedding_retrieval.feature +341 -0
  14. biblicus-0.16.0/features/markov_embeddings_errors.feature +13 -0
  15. biblicus-0.16.0/features/retrieval_build_recipes.feature +19 -0
  16. {biblicus-0.15.1 → biblicus-0.16.0}/features/retrieval_quality.feature +13 -13
  17. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/cli_parsing_steps.py +16 -0
  18. biblicus-0.16.0/features/steps/embedding_retrieval_coverage_steps.py +453 -0
  19. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/extraction_steps.py +20 -0
  20. biblicus-0.16.0/features/steps/markov_embeddings_error_steps.py +69 -0
  21. biblicus-0.16.0/features/steps/retrieval_build_recipe_steps.py +64 -0
  22. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/retrieval_quality_steps.py +2 -2
  23. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/retrieval_steps.py +63 -0
  24. biblicus-0.16.0/features/steps/wikitext_steps.py +31 -0
  25. {biblicus-0.15.1 → biblicus-0.16.0}/pyproject.toml +2 -1
  26. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/__init__.py +1 -1
  27. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/analysis/markov.py +35 -3
  28. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/backends/__init__.py +6 -2
  29. biblicus-0.16.0/src/biblicus/backends/embedding_index_common.py +301 -0
  30. biblicus-0.16.0/src/biblicus/backends/embedding_index_file.py +266 -0
  31. biblicus-0.16.0/src/biblicus/backends/embedding_index_inmemory.py +268 -0
  32. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/backends/hybrid.py +4 -2
  33. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/backends/sqlite_full_text_search.py +1 -1
  34. biblicus-0.15.1/src/biblicus/backends/vector.py → biblicus-0.16.0/src/biblicus/backends/tf_vector.py +11 -11
  35. biblicus-0.16.0/src/biblicus/chunking.py +396 -0
  36. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/cli.py +50 -10
  37. biblicus-0.16.0/src/biblicus/embedding_providers.py +122 -0
  38. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/frontmatter.py +2 -0
  39. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/models.py +9 -0
  40. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/retrieval.py +5 -0
  41. {biblicus-0.15.1 → biblicus-0.16.0/src/biblicus.egg-info}/PKG-INFO +2 -1
  42. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus.egg-info/SOURCES.txt +19 -2
  43. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus.egg-info/requires.txt +1 -0
  44. biblicus-0.15.1/docs/backends/vector.md +0 -59
  45. {biblicus-0.15.1 → biblicus-0.16.0}/LICENSE +0 -0
  46. {biblicus-0.15.1 → biblicus-0.16.0}/MANIFEST.in +0 -0
  47. {biblicus-0.15.1 → biblicus-0.16.0}/README.md +0 -0
  48. {biblicus-0.15.1 → biblicus-0.16.0}/THIRD_PARTY_NOTICES.md +0 -0
  49. {biblicus-0.15.1 → biblicus-0.16.0}/datasets/extraction_lab/labels.json +0 -0
  50. {biblicus-0.15.1 → biblicus-0.16.0}/datasets/retrieval_lab/labels.json +0 -0
  51. {biblicus-0.15.1 → biblicus-0.16.0}/datasets/wikipedia_mini.json +0 -0
  52. {biblicus-0.15.1 → biblicus-0.16.0}/docs/ANALYSIS.md +0 -0
  53. {biblicus-0.15.1 → biblicus-0.16.0}/docs/ARCHITECTURE.md +0 -0
  54. {biblicus-0.15.1 → biblicus-0.16.0}/docs/ARCHITECTURE_DETAIL.md +0 -0
  55. {biblicus-0.15.1 → biblicus-0.16.0}/docs/CONTEXT_PACK.md +0 -0
  56. {biblicus-0.15.1 → biblicus-0.16.0}/docs/CORPUS.md +0 -0
  57. {biblicus-0.15.1 → biblicus-0.16.0}/docs/CORPUS_DESIGN.md +0 -0
  58. {biblicus-0.15.1 → biblicus-0.16.0}/docs/DEMOS.md +0 -0
  59. {biblicus-0.15.1 → biblicus-0.16.0}/docs/EXTRACTION.md +0 -0
  60. {biblicus-0.15.1 → biblicus-0.16.0}/docs/EXTRACTION_EVALUATION.md +0 -0
  61. {biblicus-0.15.1 → biblicus-0.16.0}/docs/FEATURE_INDEX.md +0 -0
  62. {biblicus-0.15.1 → biblicus-0.16.0}/docs/KNOWLEDGE_BASE.md +0 -0
  63. {biblicus-0.15.1 → biblicus-0.16.0}/docs/MARKOV_ANALYSIS.md +0 -0
  64. {biblicus-0.15.1 → biblicus-0.16.0}/docs/PROFILING.md +0 -0
  65. {biblicus-0.15.1 → biblicus-0.16.0}/docs/PR_FAQ_TEXT_ANNOTATE.md +0 -0
  66. {biblicus-0.15.1 → biblicus-0.16.0}/docs/RETRIEVAL_EVALUATION.md +0 -0
  67. {biblicus-0.15.1 → biblicus-0.16.0}/docs/RETRIEVAL_QUALITY.md +0 -0
  68. {biblicus-0.15.1 → biblicus-0.16.0}/docs/ROADMAP.md +0 -0
  69. {biblicus-0.15.1 → biblicus-0.16.0}/docs/STT.md +0 -0
  70. {biblicus-0.15.1 → biblicus-0.16.0}/docs/TESTING.md +0 -0
  71. {biblicus-0.15.1 → biblicus-0.16.0}/docs/TEXT_ANNOTATE.md +0 -0
  72. {biblicus-0.15.1 → biblicus-0.16.0}/docs/TEXT_EXTRACT.md +0 -0
  73. {biblicus-0.15.1 → biblicus-0.16.0}/docs/TEXT_LINK.md +0 -0
  74. {biblicus-0.15.1 → biblicus-0.16.0}/docs/TEXT_REDACT.md +0 -0
  75. {biblicus-0.15.1 → biblicus-0.16.0}/docs/TEXT_SLICE.md +0 -0
  76. {biblicus-0.15.1 → biblicus-0.16.0}/docs/TEXT_UTILITIES.md +0 -0
  77. {biblicus-0.15.1 → biblicus-0.16.0}/docs/TOPIC_MODELING.md +0 -0
  78. {biblicus-0.15.1 → biblicus-0.16.0}/docs/USER_CONFIGURATION.md +0 -0
  79. {biblicus-0.15.1 → biblicus-0.16.0}/docs/USE_CASES.md +0 -0
  80. {biblicus-0.15.1 → biblicus-0.16.0}/docs/UTILITIES.md +0 -0
  81. {biblicus-0.15.1 → biblicus-0.16.0}/docs/api.rst +0 -0
  82. {biblicus-0.15.1 → biblicus-0.16.0}/docs/backends/scan.md +0 -0
  83. {biblicus-0.15.1 → biblicus-0.16.0}/docs/backends/sqlite-full-text-search.md +0 -0
  84. {biblicus-0.15.1 → biblicus-0.16.0}/docs/extractors/index.md +0 -0
  85. {biblicus-0.15.1 → biblicus-0.16.0}/docs/extractors/ocr/index.md +0 -0
  86. {biblicus-0.15.1 → biblicus-0.16.0}/docs/extractors/ocr/paddleocr-vl.md +0 -0
  87. {biblicus-0.15.1 → biblicus-0.16.0}/docs/extractors/ocr/rapidocr.md +0 -0
  88. {biblicus-0.15.1 → biblicus-0.16.0}/docs/extractors/pipeline-utilities/index.md +0 -0
  89. {biblicus-0.15.1 → biblicus-0.16.0}/docs/extractors/pipeline-utilities/pipeline.md +0 -0
  90. {biblicus-0.15.1 → biblicus-0.16.0}/docs/extractors/pipeline-utilities/select-longest.md +0 -0
  91. {biblicus-0.15.1 → biblicus-0.16.0}/docs/extractors/pipeline-utilities/select-override.md +0 -0
  92. {biblicus-0.15.1 → biblicus-0.16.0}/docs/extractors/pipeline-utilities/select-smart-override.md +0 -0
  93. {biblicus-0.15.1 → biblicus-0.16.0}/docs/extractors/pipeline-utilities/select-text.md +0 -0
  94. {biblicus-0.15.1 → biblicus-0.16.0}/docs/extractors/speech-to-text/deepgram.md +0 -0
  95. {biblicus-0.15.1 → biblicus-0.16.0}/docs/extractors/speech-to-text/index.md +0 -0
  96. {biblicus-0.15.1 → biblicus-0.16.0}/docs/extractors/speech-to-text/openai.md +0 -0
  97. {biblicus-0.15.1 → biblicus-0.16.0}/docs/extractors/text-document/index.md +0 -0
  98. {biblicus-0.15.1 → biblicus-0.16.0}/docs/extractors/text-document/markitdown.md +0 -0
  99. {biblicus-0.15.1 → biblicus-0.16.0}/docs/extractors/text-document/metadata.md +0 -0
  100. {biblicus-0.15.1 → biblicus-0.16.0}/docs/extractors/text-document/pass-through.md +0 -0
  101. {biblicus-0.15.1 → biblicus-0.16.0}/docs/extractors/text-document/pdf.md +0 -0
  102. {biblicus-0.15.1 → biblicus-0.16.0}/docs/extractors/text-document/unstructured.md +0 -0
  103. {biblicus-0.15.1 → biblicus-0.16.0}/docs/extractors/vlm-document/docling-granite.md +0 -0
  104. {biblicus-0.15.1 → biblicus-0.16.0}/docs/extractors/vlm-document/docling-smol.md +0 -0
  105. {biblicus-0.15.1 → biblicus-0.16.0}/docs/extractors/vlm-document/index.md +0 -0
  106. {biblicus-0.15.1 → biblicus-0.16.0}/docs/index.rst +0 -0
  107. {biblicus-0.15.1 → biblicus-0.16.0}/docs/use_cases/notes_to_context_pack.md +0 -0
  108. {biblicus-0.15.1 → biblicus-0.16.0}/docs/use_cases/sequence_markov.md +0 -0
  109. {biblicus-0.15.1 → biblicus-0.16.0}/docs/use_cases/text_folder_search.md +0 -0
  110. {biblicus-0.15.1 → biblicus-0.16.0}/docs/use_cases/text_redact.md +0 -0
  111. {biblicus-0.15.1 → biblicus-0.16.0}/features/ai_llm.feature +0 -0
  112. {biblicus-0.15.1 → biblicus-0.16.0}/features/ai_models.feature +0 -0
  113. {biblicus-0.15.1 → biblicus-0.16.0}/features/analysis_schema.feature +0 -0
  114. {biblicus-0.15.1 → biblicus-0.16.0}/features/backend_validation.feature +0 -0
  115. {biblicus-0.15.1 → biblicus-0.16.0}/features/biblicus_corpus.feature +0 -0
  116. {biblicus-0.15.1 → biblicus-0.16.0}/features/cli_entrypoint.feature +0 -0
  117. {biblicus-0.15.1 → biblicus-0.16.0}/features/cli_step_spec_parsing.feature +0 -0
  118. {biblicus-0.15.1 → biblicus-0.16.0}/features/content_sniffing.feature +0 -0
  119. {biblicus-0.15.1 → biblicus-0.16.0}/features/context_pack.feature +0 -0
  120. {biblicus-0.15.1 → biblicus-0.16.0}/features/context_pack_cli.feature +0 -0
  121. {biblicus-0.15.1 → biblicus-0.16.0}/features/context_pack_policies.feature +0 -0
  122. {biblicus-0.15.1 → biblicus-0.16.0}/features/corpus_edge_cases.feature +0 -0
  123. {biblicus-0.15.1 → biblicus-0.16.0}/features/corpus_identity.feature +0 -0
  124. {biblicus-0.15.1 → biblicus-0.16.0}/features/corpus_purge.feature +0 -0
  125. {biblicus-0.15.1 → biblicus-0.16.0}/features/crawl.feature +0 -0
  126. {biblicus-0.15.1 → biblicus-0.16.0}/features/docling_granite_extractor.feature +0 -0
  127. {biblicus-0.15.1 → biblicus-0.16.0}/features/docling_smol_extractor.feature +0 -0
  128. {biblicus-0.15.1 → biblicus-0.16.0}/features/embeddings.feature +0 -0
  129. {biblicus-0.15.1 → biblicus-0.16.0}/features/environment.py +0 -0
  130. {biblicus-0.15.1 → biblicus-0.16.0}/features/error_cases.feature +0 -0
  131. {biblicus-0.15.1 → biblicus-0.16.0}/features/evaluation.feature +0 -0
  132. {biblicus-0.15.1 → biblicus-0.16.0}/features/evidence_processing.feature +0 -0
  133. {biblicus-0.15.1 → biblicus-0.16.0}/features/extraction_error_handling.feature +0 -0
  134. {biblicus-0.15.1 → biblicus-0.16.0}/features/extraction_evaluation.feature +0 -0
  135. {biblicus-0.15.1 → biblicus-0.16.0}/features/extraction_evaluation_lab.feature +0 -0
  136. {biblicus-0.15.1 → biblicus-0.16.0}/features/extraction_run_lifecycle.feature +0 -0
  137. {biblicus-0.15.1 → biblicus-0.16.0}/features/extraction_selection.feature +0 -0
  138. {biblicus-0.15.1 → biblicus-0.16.0}/features/extraction_selection_longest.feature +0 -0
  139. {biblicus-0.15.1 → biblicus-0.16.0}/features/extractor_pipeline.feature +0 -0
  140. {biblicus-0.15.1 → biblicus-0.16.0}/features/extractor_validation.feature +0 -0
  141. {biblicus-0.15.1 → biblicus-0.16.0}/features/frontmatter.feature +0 -0
  142. {biblicus-0.15.1 → biblicus-0.16.0}/features/hook_config_validation.feature +0 -0
  143. {biblicus-0.15.1 → biblicus-0.16.0}/features/hook_error_handling.feature +0 -0
  144. {biblicus-0.15.1 → biblicus-0.16.0}/features/import_tree.feature +0 -0
  145. {biblicus-0.15.1 → biblicus-0.16.0}/features/inference_backend.feature +0 -0
  146. {biblicus-0.15.1 → biblicus-0.16.0}/features/ingest_sources.feature +0 -0
  147. {biblicus-0.15.1 → biblicus-0.16.0}/features/integration_audio_samples.feature +0 -0
  148. {biblicus-0.15.1 → biblicus-0.16.0}/features/integration_image_samples.feature +0 -0
  149. {biblicus-0.15.1 → biblicus-0.16.0}/features/integration_mixed_corpus.feature +0 -0
  150. {biblicus-0.15.1 → biblicus-0.16.0}/features/integration_mixed_extraction.feature +0 -0
  151. {biblicus-0.15.1 → biblicus-0.16.0}/features/integration_ocr_image_extraction.feature +0 -0
  152. {biblicus-0.15.1 → biblicus-0.16.0}/features/integration_pdf_retrieval.feature +0 -0
  153. {biblicus-0.15.1 → biblicus-0.16.0}/features/integration_pdf_samples.feature +0 -0
  154. {biblicus-0.15.1 → biblicus-0.16.0}/features/integration_text_annotate.feature +0 -0
  155. {biblicus-0.15.1 → biblicus-0.16.0}/features/integration_text_extract.feature +0 -0
  156. {biblicus-0.15.1 → biblicus-0.16.0}/features/integration_text_link.feature +0 -0
  157. {biblicus-0.15.1 → biblicus-0.16.0}/features/integration_text_redact.feature +0 -0
  158. {biblicus-0.15.1 → biblicus-0.16.0}/features/integration_text_slice.feature +0 -0
  159. {biblicus-0.15.1 → biblicus-0.16.0}/features/integration_unstructured_extraction.feature +0 -0
  160. {biblicus-0.15.1 → biblicus-0.16.0}/features/integration_use_cases.feature +0 -0
  161. {biblicus-0.15.1 → biblicus-0.16.0}/features/integration_use_cases_sequence_markov.feature +0 -0
  162. {biblicus-0.15.1 → biblicus-0.16.0}/features/integration_wikipedia.feature +0 -0
  163. {biblicus-0.15.1 → biblicus-0.16.0}/features/knowledge_base.feature +0 -0
  164. {biblicus-0.15.1 → biblicus-0.16.0}/features/lifecycle_hooks.feature +0 -0
  165. {biblicus-0.15.1 → biblicus-0.16.0}/features/markitdown_extractor.feature +0 -0
  166. {biblicus-0.15.1 → biblicus-0.16.0}/features/markov_analysis.feature +0 -0
  167. {biblicus-0.15.1 → biblicus-0.16.0}/features/markov_analysis_categorical.feature +0 -0
  168. {biblicus-0.15.1 → biblicus-0.16.0}/features/markov_analysis_llm.feature +0 -0
  169. {biblicus-0.15.1 → biblicus-0.16.0}/features/markov_analysis_topic_modeling.feature +0 -0
  170. {biblicus-0.15.1 → biblicus-0.16.0}/features/markov_analysis_variants.feature +0 -0
  171. {biblicus-0.15.1 → biblicus-0.16.0}/features/markov_internal_branches.feature +0 -0
  172. {biblicus-0.15.1 → biblicus-0.16.0}/features/markov_schema.feature +0 -0
  173. {biblicus-0.15.1 → biblicus-0.16.0}/features/markov_start_end_labels.feature +0 -0
  174. {biblicus-0.15.1 → biblicus-0.16.0}/features/model_validation.feature +0 -0
  175. {biblicus-0.15.1 → biblicus-0.16.0}/features/ocr_extractor.feature +0 -0
  176. {biblicus-0.15.1 → biblicus-0.16.0}/features/paddleocr_vl_extractor.feature +0 -0
  177. {biblicus-0.15.1 → biblicus-0.16.0}/features/paddleocr_vl_parse_api_response.feature +0 -0
  178. {biblicus-0.15.1 → biblicus-0.16.0}/features/pdf_text_extraction.feature +0 -0
  179. {biblicus-0.15.1 → biblicus-0.16.0}/features/profiling.feature +0 -0
  180. {biblicus-0.15.1 → biblicus-0.16.0}/features/profiling_config_overrides.feature +0 -0
  181. {biblicus-0.15.1 → biblicus-0.16.0}/features/python_api.feature +0 -0
  182. {biblicus-0.15.1 → biblicus-0.16.0}/features/python_hook_logging.feature +0 -0
  183. {biblicus-0.15.1 → biblicus-0.16.0}/features/query_processing.feature +0 -0
  184. {biblicus-0.15.1 → biblicus-0.16.0}/features/recipe_cascading.feature +0 -0
  185. {biblicus-0.15.1 → biblicus-0.16.0}/features/recipe_file_extraction.feature +0 -0
  186. {biblicus-0.15.1 → biblicus-0.16.0}/features/recipe_utilities.feature +0 -0
  187. {biblicus-0.15.1 → biblicus-0.16.0}/features/retrieval_budget.feature +0 -0
  188. {biblicus-0.15.1 → biblicus-0.16.0}/features/retrieval_evaluation_lab.feature +0 -0
  189. {biblicus-0.15.1 → biblicus-0.16.0}/features/retrieval_scan.feature +0 -0
  190. {biblicus-0.15.1 → biblicus-0.16.0}/features/retrieval_sqlite_full_text_search.feature +0 -0
  191. {biblicus-0.15.1 → biblicus-0.16.0}/features/retrieval_uses_extraction_run.feature +0 -0
  192. {biblicus-0.15.1 → biblicus-0.16.0}/features/retrieval_utilities.feature +0 -0
  193. {biblicus-0.15.1 → biblicus-0.16.0}/features/select_override.feature +0 -0
  194. {biblicus-0.15.1 → biblicus-0.16.0}/features/smart_override_selection.feature +0 -0
  195. {biblicus-0.15.1 → biblicus-0.16.0}/features/source_loading.feature +0 -0
  196. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/ai_llm_steps.py +0 -0
  197. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/ai_models_steps.py +0 -0
  198. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/analysis_steps.py +0 -0
  199. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/backend_steps.py +0 -0
  200. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/cli_steps.py +0 -0
  201. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/context_pack_steps.py +0 -0
  202. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/crawl_steps.py +0 -0
  203. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/deepgram_steps.py +0 -0
  204. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/docling_steps.py +0 -0
  205. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/embeddings_steps.py +0 -0
  206. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/evidence_processing_steps.py +0 -0
  207. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/extraction_evaluation_lab_steps.py +0 -0
  208. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/extraction_evaluation_steps.py +0 -0
  209. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/extraction_run_lifecycle_steps.py +0 -0
  210. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/extractor_steps.py +0 -0
  211. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/frontmatter_steps.py +0 -0
  212. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/inference_steps.py +0 -0
  213. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/knowledge_base_steps.py +0 -0
  214. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/markitdown_steps.py +0 -0
  215. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/markov_internal_steps.py +0 -0
  216. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/markov_schema_steps.py +0 -0
  217. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/markov_start_end_steps.py +0 -0
  218. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/markov_steps.py +0 -0
  219. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/model_steps.py +0 -0
  220. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/openai_steps.py +0 -0
  221. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/paddleocr_mock_steps.py +0 -0
  222. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/paddleocr_vl_steps.py +0 -0
  223. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/paddleocr_vl_unit_steps.py +0 -0
  224. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/pdf_steps.py +0 -0
  225. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/profiling_steps.py +0 -0
  226. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/python_api_steps.py +0 -0
  227. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/rapidocr_steps.py +0 -0
  228. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/recipe_steps.py +0 -0
  229. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/requests_mock_steps.py +0 -0
  230. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/retrieval_evaluation_lab_steps.py +0 -0
  231. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/stt_deepgram_steps.py +0 -0
  232. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/stt_steps.py +0 -0
  233. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/text_annotate_steps.py +0 -0
  234. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/text_extract_steps.py +0 -0
  235. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/text_internal_steps.py +0 -0
  236. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/text_link_internal_steps.py +0 -0
  237. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/text_link_steps.py +0 -0
  238. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/text_mock_steps.py +0 -0
  239. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/text_redact_steps.py +0 -0
  240. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/text_slice_steps.py +0 -0
  241. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/text_tool_loop_steps.py +0 -0
  242. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/topic_modeling_steps.py +0 -0
  243. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/unstructured_steps.py +0 -0
  244. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/use_cases_steps.py +0 -0
  245. {biblicus-0.15.1 → biblicus-0.16.0}/features/steps/user_config_steps.py +0 -0
  246. {biblicus-0.15.1 → biblicus-0.16.0}/features/streaming_ingest.feature +0 -0
  247. {biblicus-0.15.1 → biblicus-0.16.0}/features/stt_deepgram_extractor.feature +0 -0
  248. {biblicus-0.15.1 → biblicus-0.16.0}/features/stt_extractor.feature +0 -0
  249. {biblicus-0.15.1 → biblicus-0.16.0}/features/text_annotate.feature +0 -0
  250. {biblicus-0.15.1 → biblicus-0.16.0}/features/text_extract.feature +0 -0
  251. {biblicus-0.15.1 → biblicus-0.16.0}/features/text_extraction_runs.feature +0 -0
  252. {biblicus-0.15.1 → biblicus-0.16.0}/features/text_internal_branches.feature +0 -0
  253. {biblicus-0.15.1 → biblicus-0.16.0}/features/text_link.feature +0 -0
  254. {biblicus-0.15.1 → biblicus-0.16.0}/features/text_link_internal_branches.feature +0 -0
  255. {biblicus-0.15.1 → biblicus-0.16.0}/features/text_mock.feature +0 -0
  256. {biblicus-0.15.1 → biblicus-0.16.0}/features/text_redact.feature +0 -0
  257. {biblicus-0.15.1 → biblicus-0.16.0}/features/text_slice.feature +0 -0
  258. {biblicus-0.15.1 → biblicus-0.16.0}/features/text_utilities.feature +0 -0
  259. {biblicus-0.15.1 → biblicus-0.16.0}/features/token_budget.feature +0 -0
  260. {biblicus-0.15.1 → biblicus-0.16.0}/features/topic_modeling.feature +0 -0
  261. {biblicus-0.15.1 → biblicus-0.16.0}/features/unstructured_extractor.feature +0 -0
  262. {biblicus-0.15.1 → biblicus-0.16.0}/features/use_cases.feature +0 -0
  263. {biblicus-0.15.1 → biblicus-0.16.0}/features/user_config.feature +0 -0
  264. {biblicus-0.15.1 → biblicus-0.16.0}/scripts/download_ag_news.py +0 -0
  265. {biblicus-0.15.1 → biblicus-0.16.0}/scripts/download_audio_samples.py +0 -0
  266. {biblicus-0.15.1 → biblicus-0.16.0}/scripts/download_image_samples.py +0 -0
  267. {biblicus-0.15.1 → biblicus-0.16.0}/scripts/download_mixed_samples.py +0 -0
  268. {biblicus-0.15.1 → biblicus-0.16.0}/scripts/download_pdf_samples.py +0 -0
  269. {biblicus-0.15.1 → biblicus-0.16.0}/scripts/download_wikipedia.py +0 -0
  270. {biblicus-0.15.1 → biblicus-0.16.0}/scripts/extraction_evaluation_demo.py +0 -0
  271. {biblicus-0.15.1 → biblicus-0.16.0}/scripts/extraction_evaluation_lab.py +0 -0
  272. {biblicus-0.15.1 → biblicus-0.16.0}/scripts/markov_analysis_demo.py +0 -0
  273. {biblicus-0.15.1 → biblicus-0.16.0}/scripts/markov_cached_segments_demo.py +0 -0
  274. {biblicus-0.15.1 → biblicus-0.16.0}/scripts/markov_run_report.py +0 -0
  275. {biblicus-0.15.1 → biblicus-0.16.0}/scripts/profiling_demo.py +0 -0
  276. {biblicus-0.15.1 → biblicus-0.16.0}/scripts/readme_end_to_end_demo.py +0 -0
  277. {biblicus-0.15.1 → biblicus-0.16.0}/scripts/retrieval_evaluation_lab.py +0 -0
  278. {biblicus-0.15.1 → biblicus-0.16.0}/scripts/test.py +0 -0
  279. {biblicus-0.15.1 → biblicus-0.16.0}/scripts/topic_modeling_integration.py +0 -0
  280. {biblicus-0.15.1 → biblicus-0.16.0}/scripts/use_cases/notes_to_context_pack_demo.py +0 -0
  281. {biblicus-0.15.1 → biblicus-0.16.0}/scripts/use_cases/sequence_markov_demo.py +0 -0
  282. {biblicus-0.15.1 → biblicus-0.16.0}/scripts/use_cases/text_folder_search_demo.py +0 -0
  283. {biblicus-0.15.1 → biblicus-0.16.0}/scripts/use_cases/text_redact_demo.py +0 -0
  284. {biblicus-0.15.1 → biblicus-0.16.0}/scripts/wikipedia_rag_demo.py +0 -0
  285. {biblicus-0.15.1 → biblicus-0.16.0}/setup.cfg +0 -0
  286. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/__main__.py +0 -0
  287. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/_vendor/dotyaml/__init__.py +0 -0
  288. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/_vendor/dotyaml/interpolation.py +0 -0
  289. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/_vendor/dotyaml/loader.py +0 -0
  290. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/_vendor/dotyaml/transformer.py +0 -0
  291. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/ai/__init__.py +0 -0
  292. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/ai/embeddings.py +0 -0
  293. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/ai/llm.py +0 -0
  294. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/ai/models.py +0 -0
  295. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/analysis/__init__.py +0 -0
  296. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/analysis/base.py +0 -0
  297. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/analysis/models.py +0 -0
  298. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/analysis/profiling.py +0 -0
  299. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/analysis/schema.py +0 -0
  300. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/analysis/topic_modeling.py +0 -0
  301. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/backends/base.py +0 -0
  302. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/backends/scan.py +0 -0
  303. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/constants.py +0 -0
  304. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/context.py +0 -0
  305. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/corpus.py +0 -0
  306. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/crawl.py +0 -0
  307. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/errors.py +0 -0
  308. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/evaluation.py +0 -0
  309. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/evidence_processing.py +0 -0
  310. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/extraction.py +0 -0
  311. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/extraction_evaluation.py +0 -0
  312. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/extractors/__init__.py +0 -0
  313. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/extractors/base.py +0 -0
  314. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/extractors/deepgram_stt.py +0 -0
  315. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/extractors/docling_granite_text.py +0 -0
  316. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/extractors/docling_smol_text.py +0 -0
  317. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/extractors/markitdown_text.py +0 -0
  318. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/extractors/metadata_text.py +0 -0
  319. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/extractors/openai_stt.py +0 -0
  320. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/extractors/paddleocr_vl_text.py +0 -0
  321. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/extractors/pass_through_text.py +0 -0
  322. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/extractors/pdf_text.py +0 -0
  323. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/extractors/pipeline.py +0 -0
  324. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/extractors/rapidocr_text.py +0 -0
  325. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/extractors/select_longest_text.py +0 -0
  326. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/extractors/select_override.py +0 -0
  327. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/extractors/select_smart_override.py +0 -0
  328. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/extractors/select_text.py +0 -0
  329. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/extractors/unstructured_text.py +0 -0
  330. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/hook_logging.py +0 -0
  331. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/hook_manager.py +0 -0
  332. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/hooks.py +0 -0
  333. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/ignore.py +0 -0
  334. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/inference.py +0 -0
  335. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/knowledge_base.py +0 -0
  336. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/recipes.py +0 -0
  337. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/sources.py +0 -0
  338. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/text/__init__.py +0 -0
  339. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/text/annotate.py +0 -0
  340. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/text/extract.py +0 -0
  341. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/text/link.py +0 -0
  342. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/text/markup.py +0 -0
  343. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/text/models.py +0 -0
  344. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/text/prompts.py +0 -0
  345. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/text/redact.py +0 -0
  346. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/text/slice.py +0 -0
  347. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/text/tool_loop.py +0 -0
  348. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/time.py +0 -0
  349. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/uris.py +0 -0
  350. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus/user_config.py +0 -0
  351. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus.egg-info/dependency_links.txt +0 -0
  352. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus.egg-info/entry_points.txt +0 -0
  353. {biblicus-0.15.1 → biblicus-0.16.0}/src/biblicus.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: biblicus
3
- Version: 0.15.1
3
+ Version: 0.16.0
4
4
  Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
5
5
  License: MIT
6
6
  Requires-Python: >=3.9
@@ -11,6 +11,7 @@ Requires-Dist: PyYAML>=6.0
11
11
  Requires-Dist: pypdf>=4.0
12
12
  Requires-Dist: Jinja2>=3.1
13
13
  Requires-Dist: dotyaml>=0.1.3
14
+ Requires-Dist: numpy>=1.24
14
15
  Provides-Extra: dev
15
16
  Requires-Dist: behave>=1.2.6; extra == "dev"
16
17
  Requires-Dist: coverage[toml]>=7.0; extra == "dev"
@@ -61,4 +61,4 @@ See:
61
61
 
62
62
  - `biblicus.backends.scan.ScanBackend` (minimal baseline)
63
63
  - `biblicus.backends.sqlite_full_text_search.SqliteFullTextSearchBackend` (practical local backend)
64
- - `biblicus.backends.vector.VectorBackend` (term-frequency vector baseline)
64
+ - `biblicus.backends.vector.VectorBackend` (term-frequency vector baseline; `tf-vector`)
@@ -0,0 +1,69 @@
1
+ # Chunking
2
+
3
+ Embedding retrieval depends on chunking.
4
+
5
+ Most corpora contain documents that are too long to retrieve effectively as single “whole-document” units. Biblicus
6
+ therefore treats chunking as part of the retrieval indexing contract: embeddings are computed over chunks, and retrieval
7
+ returns evidence with chunk boundaries so you can trace results back to the original item text.
8
+
9
+ ## Chunkers are pluggable
10
+
11
+ Chunking is a pluggable interface selected by identifier in a retrieval recipe:
12
+
13
+ - `chunker_id`
14
+ - `chunker_config` (Pydantic validated; `extra="forbid"`)
15
+
16
+ There are no hidden fallbacks. If you select a chunker without providing required configuration, Biblicus fails with a
17
+ user-facing error that explains what is missing.
18
+
19
+ ## Chunk identifiers and provenance
20
+
21
+ Chunks must be addressable and reproducible:
22
+
23
+ - Each chunk has a stable `chunk_id`.
24
+ - Each chunk references a parent `item_id`.
25
+ - Each chunk records boundaries (for example `start_char` and `end_char`) and optional metadata.
26
+
27
+ Evidence produced by embedding retrieval references chunk provenance so downstream tooling can reconstruct context packs
28
+ without guessing.
29
+
30
+ ## Built-in chunking strategies
31
+
32
+ Biblicus provides multiple built-in chunking strategies so you can compare tradeoffs explicitly:
33
+
34
+ ### Fixed character window
35
+
36
+ Split text into windows of a fixed character length with optional overlap.
37
+
38
+ Typical parameters:
39
+
40
+ - `window_characters`
41
+ - `overlap_characters`
42
+
43
+ ### Paragraph chunking
44
+
45
+ Split text into paragraphs (blank-line delimited), with optional joining/splitting rules to avoid chunks that are too
46
+ small or too large.
47
+
48
+ Typical parameters:
49
+
50
+ - `max_characters`
51
+ - `join_short_paragraphs`
52
+
53
+ ### Fixed token window
54
+
55
+ Split text into windows of a fixed token length with optional overlap.
56
+
57
+ Token-based chunking depends on a tokenizer interface (see next section) so Biblicus can support multiple tokenization
58
+ strategies without locking into a single library.
59
+
60
+ ## Tokenization is pluggable too
61
+
62
+ Token-based chunkers depend on a second pluggable interface selected by identifier:
63
+
64
+ - `tokenizer_id`
65
+ - `tokenizer_config`
66
+
67
+ This keeps the surface configurable while avoiding implicit dependencies. If a token-based chunker is selected without
68
+ a tokenizer implementation configured, Biblicus fails fast with explicit guidance.
69
+
@@ -0,0 +1,57 @@
1
+ # Embedding Retrieval
2
+
3
+ Embedding retrieval turns a large collection of text into a reusable index that can be queried efficiently.
4
+
5
+ Biblicus supports embedding retrieval through backends that:
6
+
7
+ 1) chunk extracted text,
8
+ 2) compute embeddings for each chunk,
9
+ 3) build an index under the corpus as run artifacts, and
10
+ 4) return evidence with chunk provenance on query.
11
+
12
+ ## Concepts
13
+
14
+ - **Chunking**: splits extracted text into chunks, which are the unit of embedding and retrieval. See `docs/CHUNKING.md`.
15
+ - **Embedding provider**: a pluggable implementation that turns text into vectors.
16
+ - **Embedding index backend**: a retrieval backend that materializes vectors and supports similarity search.
17
+
18
+ ## A local, textbook embedding index
19
+
20
+ Biblicus provides two embedding index backends that avoid external services:
21
+
22
+ - `embedding-index-inmemory` for small demos with safety caps
23
+ - `embedding-index-file` for a file-backed, memory-mapped exact index
24
+
25
+ Both use exact cosine similarity. Exact search is an intentional choice: its results are easy to validate and compare.
26
+
27
+ ## Build and query
28
+
29
+ Embedding retrieval is a run-based workflow:
30
+
31
+ 1) ingest items
32
+ 2) extract text (or select an existing extraction run)
33
+ 3) build an embedding retrieval run (which materializes artifacts under the corpus)
34
+ 4) query the run and inspect evidence
35
+
36
+ Example build:
37
+
38
+ ```
39
+ python -m biblicus build --corpus corpora/example --backend embedding-index-file
40
+ ```
41
+
42
+ Example query:
43
+
44
+ ```
45
+ python -m biblicus query --corpus corpora/example --run embedding-index-file:RUN_ID --query "meaningful phrase"
46
+ ```
47
+
48
+ ## Evidence and provenance
49
+
50
+ Evidence returned by embedding retrieval includes:
51
+
52
+ - the parent `item_id` and `source_uri`
53
+ - a retrieval `score` and `rank`
54
+ - chunk provenance (boundaries and identifiers)
55
+
56
+ This allows downstream tooling (including context pack formatting) to remain evidence-first and reproducible.
57
+
@@ -0,0 +1,105 @@
1
+ # PR-FAQ (Draft): Embedding Retrieval and Chunking
2
+
3
+ ## Press release
4
+
5
+ Today we are adding true embedding-based retrieval to Biblicus, including a pluggable chunking surface and a
6
+ textbook-simple local index.
7
+
8
+ Biblicus already supports deterministic lexical retrieval and hybrid retrieval wiring. This feature adds retrieval
9
+ backends that build a reusable embedding index under the corpus, so retrieval is fast, repeatable, and evaluatable.
10
+ Chunking is treated as part of the indexing contract (not an afterthought): embeddings are computed over chunks, and
11
+ retrieval returns evidence with item-level provenance plus chunk boundaries.
12
+
13
+ ## FAQ
14
+
15
+ ### What problem does this solve?
16
+
17
+ - Provide a real embedding retrieval backend with an explicit build/query lifecycle.
18
+ - Make chunking a first-class, fully configurable pipeline stage for embedding retrieval.
19
+ - Establish a stable surface for swapping embedding providers without rewriting backends.
20
+
21
+ ### What will users be able to do?
22
+
23
+ - Build a retrieval run that materializes an embedding index under a corpus.
24
+ - Query that run and receive evidence-first outputs with stable identifiers and provenance.
25
+ - Choose a chunking strategy (and its configuration) per recipe, and compare results across recipes.
26
+
27
+ ### Why do we need chunking now?
28
+
29
+ Whole-document embeddings are misleading for most real corpora. Chunking is the practical unit of retrieval and must
30
+ be part of the indexing contract so that:
31
+
32
+ - embeddings are computed over well-defined text spans,
33
+ - evidence can cite exact spans,
34
+ - hybrid retrieval and evaluation can compare like-for-like.
35
+
36
+ ### How is chunking configured?
37
+
38
+ Chunking is a pluggable interface selected by identifier in the retrieval recipe:
39
+
40
+ - `chunker_id`
41
+ - `chunker_config` (Pydantic validated; `extra="forbid"`)
42
+
43
+ No fallbacks: if a recipe selects a chunker but its implementation or required configuration is missing, Biblicus
44
+ fails fast with a user-facing error that explains exactly what to install/configure.
45
+
46
+ ### What chunkers are provided in the initial implementation?
47
+
48
+ We provide multiple built-in chunkers from day 1, each with its own configuration model:
49
+
50
+ - Fixed character window chunking (size + overlap).
51
+ - Paragraph chunking (blank-line delimited, with configurable joining/splitting behavior).
52
+ - Fixed token window chunking, via a tokenizer interface (see below).
53
+
54
+ ### How is token-based chunking handled without locking in one tokenizer library?
55
+
56
+ Token-based chunking uses a separate pluggable interface (for example `Tokenizer` or `TokenCounter`), selected by id.
57
+
58
+ This keeps the chunker configurable while avoiding hidden dependencies. If a token-based chunker is selected without a
59
+ configured tokenizer implementation, Biblicus fails with explicit guidance.
60
+
61
+ ### What embedding index backends are included?
62
+
63
+ Two concrete backends are introduced to avoid external vector stores while still being “real” retrievers:
64
+
65
+ 1) In-memory exact index
66
+ - Intended for small corpora and demos.
67
+ - Enforces a hard safety cap (for example maximum vectors or bytes).
68
+
69
+ 2) File-backed exact index (NumPy-backed)
70
+ - Writes an embedding matrix and id mapping as run artifacts under the corpus.
71
+ - Queries by memory-mapping and scanning in batches so memory usage is bounded.
72
+
73
+ Both backends use exact cosine similarity. This is intentionally “textbook” behavior that is easy to validate and
74
+ compare. Approximate nearest neighbor indexes are explicitly out of scope for this slice.
75
+
76
+ ### How do embeddings get generated?
77
+
78
+ Embeddings are generated by a pluggable `EmbeddingProvider` interface (an abstract base class), selected by Pydantic
79
+ configuration in the recipe. Concrete implementations can wrap OpenAI, Bedrock, or other providers via Biblicus’s AI
80
+ provider wiring.
81
+
82
+ ### How does this fit the “derived artifacts live under the corpus” rule?
83
+
84
+ Embedding retrieval is implemented as a run + artifacts:
85
+
86
+ - `build_run(...)` materializes the embedding index and records artifact paths in a run manifest.
87
+ - `query(...)` loads artifacts from the run and returns evidence with stable chunk provenance.
88
+
89
+ ### How do we test this in CI?
90
+
91
+ - Behavior specs cover all success and failure behaviors and enforce 100% coverage.
92
+ - CI fetches and caches WikiText-2 raw (`wikitext-2-raw-v1`) to provide “real-ish” text at scale without committing the
93
+ dataset into the repository.
94
+
95
+ ### Dependencies
96
+
97
+ - `numpy` is required for the file-backed index backend (pip-only, no system services).
98
+ - `datasets` is used only for CI fixture fetching and is installed in CI.
99
+
100
+ ## Non-goals (this slice)
101
+
102
+ - External vector stores (Qdrant, pgvector, Pinecone, etc.).
103
+ - ANN indexing (FAISS/HNSW tuning).
104
+ - Backwards-compatible aliasing for renamed backend ids (for example, `vector` is now `tf-vector`).
105
+
@@ -54,7 +54,8 @@ Start with the simplest backend that answers your question:
54
54
 
55
55
  - `scan` for tiny corpora or sanity checks.
56
56
  - `sqlite-full-text-search` for a practical lexical baseline.
57
- - `vector` when you want deterministic term-frequency similarity without external dependencies.
57
+ - `tf-vector` when you want deterministic term-frequency similarity without external dependencies.
58
+ - `embedding-index-file` when you want embedding retrieval with a local, file-backed index.
58
59
 
59
60
  You can compare them with the same dataset and budget using the retrieval evaluation workflow.
60
61
 
@@ -0,0 +1,34 @@
1
+ # Embedding index (file-backed)
2
+
3
+ This backend builds an embedding index under a corpus and queries it using exact cosine similarity.
4
+
5
+ It is intended for larger corpora where you want a local, pip-installable workflow that does not depend on an external
6
+ vector database.
7
+
8
+ ## Backend ID
9
+
10
+ `embedding-index-file`
11
+
12
+ ## What it builds
13
+
14
+ This backend builds a retrieval run that materializes run artifacts under the corpus, for example:
15
+
16
+ - an embedding matrix stored as a NumPy array on disk
17
+ - an id mapping from chunk identifiers to embedding row offsets
18
+ - chunk records (text + boundaries + provenance)
19
+
20
+ Queries memory-map the embedding matrix and scan in batches so memory usage stays bounded, even when the index is larger
21
+ than available RAM.
22
+
23
+ ## Chunking
24
+
25
+ Embeddings are computed over chunks. Chunking is configured per recipe by selecting a chunker and its configuration.
26
+
27
+ Chunking is part of the index contract: evidence references chunk boundaries so you can trace retrieval outputs back to
28
+ the original item text.
29
+
30
+ ## Dependencies
31
+
32
+ - Requires `numpy`.
33
+ - Requires an embedding provider configuration.
34
+
@@ -0,0 +1,34 @@
1
+ # Embedding index (in-memory)
2
+
3
+ This backend builds an embedding index in memory and queries it using exact cosine similarity.
4
+
5
+ It is intended for textbook demos and small corpora where you want a “real” embedding retrieval loop without running an
6
+ external vector database.
7
+
8
+ ## Backend ID
9
+
10
+ `embedding-index-inmemory`
11
+
12
+ ## What it builds
13
+
14
+ This backend builds a retrieval run that materializes:
15
+
16
+ - chunk records (text + boundaries + provenance)
17
+ - embedding vectors for each chunk
18
+
19
+ All of this lives in memory while the process is running. For safety, the backend enforces explicit caps so a build does
20
+ not accidentally consume unbounded memory.
21
+
22
+ ## Chunking
23
+
24
+ Embeddings are computed over chunks. Chunking is configured per recipe by selecting a chunker and its configuration.
25
+
26
+ Chunking is part of the index contract: evidence references chunk boundaries so you can trace retrieval outputs back to
27
+ the original item text.
28
+
29
+ ## Dependencies
30
+
31
+ - Requires `numpy` and an embedding provider configuration.
32
+
33
+ This backend does not require a database or server.
34
+
@@ -8,7 +8,9 @@ Biblicus provides pluggable retrieval backends that implement different search a
8
8
 
9
9
  scan
10
10
  sqlite-full-text-search
11
- vector
11
+ tf-vector
12
+ embedding-index-inmemory
13
+ embedding-index-file
12
14
  ```
13
15
 
14
16
  ## Available Backends
@@ -33,16 +35,36 @@ Production-ready full-text search using SQLite FTS5 with BM25 ranking.
33
35
  - **Index**: SQLite database with FTS5 virtual tables
34
36
  - **Speed**: Fast with persistent index
35
37
 
36
- ### [vector](vector.md)
38
+ ### [tf-vector](tf-vector.md)
37
39
 
38
- Deterministic term-frequency vector retrieval with cosine similarity.
40
+ Deterministic term-frequency vector retrieval (vector space model baseline) with cosine similarity.
39
41
 
40
- - **Backend ID**: `vector`
42
+ - **Backend ID**: `tf-vector`
41
43
  - **Installation**: Included by default
42
44
  - **Best for**: Semantic-style baselines without embeddings
43
45
  - **Index**: None (scans and scores at query time)
44
46
  - **Speed**: Moderate for small corpora
45
47
 
48
+ ### [embedding-index-inmemory](embedding-index-inmemory.md)
49
+
50
+ Embedding-based retrieval with an in-memory exact cosine similarity index.
51
+
52
+ - **Backend ID**: `embedding-index-inmemory`
53
+ - **Installation**: Requires `numpy` and an embedding provider configuration
54
+ - **Best for**: Textbook demos and small corpora
55
+ - **Index**: In-memory embedding matrix
56
+ - **Speed**: Fast for small corpora; bounded by safety caps
57
+
58
+ ### [embedding-index-file](embedding-index-file.md)
59
+
60
+ Embedding-based retrieval with a file-backed exact cosine similarity index.
61
+
62
+ - **Backend ID**: `embedding-index-file`
63
+ - **Installation**: Requires `numpy` and an embedding provider configuration
64
+ - **Best for**: Larger corpora without running an external vector database
65
+ - **Index**: Memory-mapped embedding matrix + id mapping under the corpus
66
+ - **Speed**: Exact scan; bounded memory via batching
67
+
46
68
  ## Quick Start
47
69
 
48
70
  ### Installation
@@ -115,7 +137,9 @@ See `docs/RETRIEVAL_EVALUATION.md` for evaluation workflows and dataset formats.
115
137
  | Production applications | [sqlite-full-text-search](sqlite-full-text-search.md) | Fast queries with BM25 ranking |
116
138
  | Large corpora (>10,000 items) | [sqlite-full-text-search](sqlite-full-text-search.md) | Essential for performance |
117
139
  | Baseline comparisons | [scan](scan.md) | Simple reference implementation |
118
- | Term-frequency vector baseline | [vector](vector.md) | Deterministic cosine similarity |
140
+ | Term-frequency vector baseline | [tf-vector](tf-vector.md) | Deterministic cosine similarity |
141
+ | Embedding retrieval (in-memory) | [embedding-index-inmemory](embedding-index-inmemory.md) | Exact cosine similarity |
142
+ | Embedding retrieval (file-backed) | [embedding-index-file](embedding-index-file.md) | Exact cosine similarity, memory-mapped |
119
143
 
120
144
  ## Reproducibility checklist
121
145
 
@@ -0,0 +1,59 @@
1
+ # TF Vector backend
2
+
3
+ The TF Vector backend implements a deterministic vector space model baseline using term-frequency vectors and cosine
4
+ similarity. It builds no persistent index and scores items at query time. This makes it useful as a lightweight
5
+ “vector-style” baseline without dense embeddings or external services.
6
+
7
+ ## When to use it
8
+
9
+ - You want a minimal baseline to compare against lexical search.
10
+ - You want deterministic, inspectable similarity scoring.
11
+ - You are teaching retrieval concepts and want a small, runnable backend.
12
+
13
+ ## Backend ID
14
+
15
+ `tf-vector`
16
+
17
+ ## How it works
18
+
19
+ 1) Tokenize the query and each item into lowercase word tokens.
20
+ 2) Build term-frequency vectors.
21
+ 3) Compute cosine similarity between the query vector and each item vector.
22
+ 4) Return evidence ranked by similarity score.
23
+
24
+ ## Configuration
25
+
26
+ The backend accepts these configuration fields:
27
+
28
+ - `snippet_characters`: maximum characters to include in evidence snippets.
29
+ - `extraction_run`: optional extraction run reference (`extractor_id:run_id`).
30
+
31
+ Example recipe:
32
+
33
+ ```yaml
34
+ snippet_characters: 320
35
+ extraction_run: pipeline:RUN_ID
36
+ ```
37
+
38
+ ## Build a run
39
+
40
+ ```
41
+ python -m biblicus build --corpus corpora/example --backend tf-vector --config extraction_run=pipeline:RUN_ID
42
+ ```
43
+
44
+ This backend does not create artifacts beyond the run manifest.
45
+
46
+ ## Query a run
47
+
48
+ ```
49
+ python -m biblicus query --corpus corpora/example --run tf-vector:RUN_ID --query "semantic match"
50
+ ```
51
+
52
+ The evidence results include a `stage` value of `tf-vector` and similarity scores for each match.
53
+
54
+ ## What it is not
55
+
56
+ - This backend does not compute dense embeddings.
57
+ - It does not use approximate nearest neighbor indexing.
58
+ - It does not depend on external services.
59
+
@@ -25,7 +25,8 @@ extensions = [
25
25
  templates_path = ["_templates"]
26
26
  exclude_patterns = ["_build"]
27
27
  autodoc_typehints = "description"
28
- html_theme = "sphinx_rtd_theme"
28
+ html_theme = "biblicus_rtd"
29
+ html_theme_path = [Path(__file__).resolve().parent / "_themes"]
29
30
 
30
31
  html_theme_options = {
31
32
  "prev_next_buttons_location": "bottom",
@@ -20,6 +20,32 @@ Feature: Command-line interface parsing
20
20
  | mode=fast |
21
21
  Then the parsed config value "mode" is string "fast"
22
22
 
23
+ Scenario: Config pairs parse integers
24
+ When I parse config pairs:
25
+ | pair |
26
+ | max_docs=12 |
27
+ Then the parsed config value "max_docs" is int 12
28
+
29
+ Scenario: Config pairs parse JSON objects
30
+ When I parse config pairs:
31
+ | pair |
32
+ | vectorizer={"stop":["a"]} |
33
+ Then the parsed config value "vectorizer" has JSON key "stop" list item "a"
34
+
35
+ Scenario: Config pairs reject missing equals
36
+ When I attempt to parse config pairs:
37
+ | pair |
38
+ | justkey |
39
+ Then a config parsing error is raised
40
+ And the config parsing error mentions "key=value"
41
+
42
+ Scenario: Config pairs reject invalid JSON
43
+ When I attempt to parse config pairs:
44
+ | pair |
45
+ | config={bad |
46
+ Then a config parsing error is raised
47
+ And the config parsing error mentions "valid JSON"
48
+
23
49
  Scenario: Step specs reject empty strings
24
50
  When I attempt to parse an empty step spec
25
51
  Then a step spec parsing error is raised