biblicus 0.13.0__tar.gz → 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (436) hide show
  1. {biblicus-0.13.0/src/biblicus.egg-info → biblicus-1.0.0}/PKG-INFO +103 -31
  2. {biblicus-0.13.0 → biblicus-1.0.0}/README.md +94 -30
  3. biblicus-1.0.0/datasets/retrieval_lab/labels.json +25 -0
  4. biblicus-1.0.0/docs/ANALYSIS.md +143 -0
  5. biblicus-1.0.0/docs/ARCHITECTURE.md +46 -0
  6. biblicus-1.0.0/docs/ARCHITECTURE_DETAIL.md +267 -0
  7. {biblicus-0.13.0 → biblicus-1.0.0}/docs/BACKENDS.md +25 -1
  8. biblicus-1.0.0/docs/CHUNKING.md +69 -0
  9. biblicus-1.0.0/docs/CONTEXT_ENGINE.md +120 -0
  10. biblicus-1.0.0/docs/CONTEXT_ENGINE_DEMO.md +96 -0
  11. {biblicus-0.13.0 → biblicus-1.0.0}/docs/CONTEXT_PACK.md +58 -0
  12. {biblicus-0.13.0 → biblicus-1.0.0}/docs/CORPUS.md +49 -10
  13. {biblicus-0.13.0 → biblicus-1.0.0}/docs/CORPUS_DESIGN.md +18 -5
  14. {biblicus-0.13.0 → biblicus-1.0.0}/docs/DEMOS.md +85 -48
  15. biblicus-1.0.0/docs/EMBEDDING_RETRIEVAL.md +57 -0
  16. {biblicus-0.13.0 → biblicus-1.0.0}/docs/EXTRACTION.md +46 -11
  17. {biblicus-0.13.0 → biblicus-1.0.0}/docs/EXTRACTION_EVALUATION.md +33 -3
  18. {biblicus-0.13.0 → biblicus-1.0.0}/docs/FEATURE_INDEX.md +199 -0
  19. {biblicus-0.13.0 → biblicus-1.0.0}/docs/KNOWLEDGE_BASE.md +20 -1
  20. biblicus-1.0.0/docs/MARKOV_ANALYSIS.md +262 -0
  21. {biblicus-0.13.0 → biblicus-1.0.0}/docs/PROFILING.md +65 -1
  22. biblicus-1.0.0/docs/PR_FAQ_CONTEXT_ENGINE.md +43 -0
  23. biblicus-1.0.0/docs/PR_FAQ_EMBEDDING_RETRIEVAL.md +105 -0
  24. biblicus-1.0.0/docs/PR_FAQ_TEXT_ANNOTATE.md +118 -0
  25. biblicus-1.0.0/docs/RETRIEVAL.md +123 -0
  26. biblicus-1.0.0/docs/RETRIEVAL_EVALUATION.md +218 -0
  27. biblicus-1.0.0/docs/RETRIEVAL_QUALITY.md +112 -0
  28. {biblicus-0.13.0 → biblicus-1.0.0}/docs/ROADMAP.md +42 -14
  29. {biblicus-0.13.0 → biblicus-1.0.0}/docs/STT.md +4 -4
  30. {biblicus-0.13.0 → biblicus-1.0.0}/docs/TESTING.md +15 -4
  31. biblicus-1.0.0/docs/TEXT_ANNOTATE.md +119 -0
  32. biblicus-1.0.0/docs/TEXT_EXTRACT.md +671 -0
  33. biblicus-1.0.0/docs/TEXT_LINK.md +124 -0
  34. biblicus-1.0.0/docs/TEXT_REDACT.md +170 -0
  35. biblicus-1.0.0/docs/TEXT_SLICE.md +319 -0
  36. biblicus-1.0.0/docs/TEXT_UTILITIES.md +137 -0
  37. {biblicus-0.13.0 → biblicus-1.0.0}/docs/TOPIC_MODELING.md +78 -5
  38. {biblicus-0.13.0 → biblicus-1.0.0}/docs/USER_CONFIGURATION.md +11 -0
  39. biblicus-1.0.0/docs/USE_CASES.md +37 -0
  40. biblicus-1.0.0/docs/UTILITIES.md +23 -0
  41. biblicus-1.0.0/docs/backends/embedding-index-file.md +34 -0
  42. biblicus-1.0.0/docs/backends/embedding-index-inmemory.md +34 -0
  43. {biblicus-0.13.0 → biblicus-1.0.0}/docs/backends/index.md +53 -4
  44. biblicus-1.0.0/docs/backends/tf-vector.md +59 -0
  45. {biblicus-0.13.0 → biblicus-1.0.0}/docs/conf.py +2 -1
  46. {biblicus-0.13.0 → biblicus-1.0.0}/docs/extractors/index.md +12 -1
  47. {biblicus-0.13.0 → biblicus-1.0.0}/docs/extractors/ocr/index.md +8 -0
  48. {biblicus-0.13.0 → biblicus-1.0.0}/docs/extractors/pipeline-utilities/index.md +11 -0
  49. {biblicus-0.13.0 → biblicus-1.0.0}/docs/extractors/speech-to-text/index.md +8 -0
  50. {biblicus-0.13.0 → biblicus-1.0.0}/docs/extractors/text-document/index.md +11 -0
  51. {biblicus-0.13.0 → biblicus-1.0.0}/docs/extractors/vlm-document/index.md +8 -0
  52. biblicus-1.0.0/docs/index.rst +223 -0
  53. biblicus-1.0.0/docs/use_cases/notes_to_context_pack.md +48 -0
  54. biblicus-1.0.0/docs/use_cases/sequence_markov.md +82 -0
  55. biblicus-1.0.0/docs/use_cases/text_folder_search.md +39 -0
  56. biblicus-1.0.0/docs/use_cases/text_redact.md +50 -0
  57. biblicus-1.0.0/features/70_context_retriever.feature +12 -0
  58. biblicus-1.0.0/features/71_context_compaction.feature +22 -0
  59. biblicus-1.0.0/features/72_context_history_compaction.feature +9 -0
  60. biblicus-1.0.0/features/73_context_nested_compaction.feature +9 -0
  61. biblicus-1.0.0/features/74_context_regeneration.feature +9 -0
  62. biblicus-1.0.0/features/75_context_default_regeneration.feature +9 -0
  63. biblicus-1.0.0/features/76_context_pack_budget_weights.feature +9 -0
  64. biblicus-1.0.0/features/77_context_default_pack_priority.feature +10 -0
  65. biblicus-1.0.0/features/78_context_default_pack_weights.feature +9 -0
  66. biblicus-1.0.0/features/79_context_nested_context_packs.feature +9 -0
  67. biblicus-1.0.0/features/80_context_nested_pack_budget_cap.feature +9 -0
  68. biblicus-1.0.0/features/81_context_nested_regeneration.feature +9 -0
  69. biblicus-1.0.0/features/82_context_explicit_regeneration.feature +9 -0
  70. biblicus-1.0.0/features/83_context_explicit_pack_priority.feature +9 -0
  71. biblicus-1.0.0/features/84_context_explicit_pack_weights.feature +9 -0
  72. biblicus-1.0.0/features/85_context_expansion.feature +10 -0
  73. biblicus-1.0.0/features/86_context_engine_errors.feature +24 -0
  74. biblicus-1.0.0/features/87_context_compactor_strategies.feature +22 -0
  75. biblicus-1.0.0/features/88_context_engine_model_validation.feature +64 -0
  76. biblicus-1.0.0/features/89_context_engine_internal_branches.feature +47 -0
  77. biblicus-1.0.0/features/90_embedding_index_evidence_fallback.feature +10 -0
  78. biblicus-1.0.0/features/91_tf_vector_internal_branches.feature +10 -0
  79. biblicus-1.0.0/features/93_context_engine_full_paths.feature +6 -0
  80. biblicus-1.0.0/features/ai_llm.feature +25 -0
  81. biblicus-1.0.0/features/ai_models.feature +74 -0
  82. {biblicus-0.13.0 → biblicus-1.0.0}/features/analysis_schema.feature +1 -1
  83. {biblicus-0.13.0 → biblicus-1.0.0}/features/biblicus_corpus.feature +1 -1
  84. {biblicus-0.13.0 → biblicus-1.0.0}/features/cli_parsing.feature +26 -0
  85. biblicus-1.0.0/features/context_engine_retrieval_internal_branches.feature +6 -0
  86. biblicus-1.0.0/features/context_engine_retrieve_context_pack.feature +38 -0
  87. {biblicus-0.13.0 → biblicus-1.0.0}/features/context_pack_policies.feature +40 -0
  88. biblicus-1.0.0/features/corpus_internal_branches.feature +53 -0
  89. biblicus-1.0.0/features/embedding_index_internal_branches.feature +22 -0
  90. biblicus-1.0.0/features/embedding_retrieval.feature +341 -0
  91. biblicus-1.0.0/features/embeddings.feature +39 -0
  92. {biblicus-0.13.0 → biblicus-1.0.0}/features/environment.py +64 -0
  93. {biblicus-0.13.0 → biblicus-1.0.0}/features/error_cases.feature +2 -2
  94. {biblicus-0.13.0 → biblicus-1.0.0}/features/evaluation.feature +5 -5
  95. biblicus-1.0.0/features/hook_logging_internal_branches.feature +6 -0
  96. biblicus-1.0.0/features/ingest_namespacing.feature +43 -0
  97. {biblicus-0.13.0 → biblicus-1.0.0}/features/integration_ocr_image_extraction.feature +4 -0
  98. {biblicus-0.13.0 → biblicus-1.0.0}/features/integration_pdf_retrieval.feature +1 -1
  99. biblicus-1.0.0/features/integration_text_annotate.feature +22 -0
  100. biblicus-1.0.0/features/integration_text_extract.feature +69 -0
  101. biblicus-1.0.0/features/integration_text_link.feature +25 -0
  102. biblicus-1.0.0/features/integration_text_redact.feature +31 -0
  103. biblicus-1.0.0/features/integration_text_slice.feature +27 -0
  104. {biblicus-0.13.0 → biblicus-1.0.0}/features/integration_unstructured_extraction.feature +1 -0
  105. biblicus-1.0.0/features/integration_use_cases.feature +10 -0
  106. biblicus-1.0.0/features/integration_use_cases_sequence_markov.feature +14 -0
  107. biblicus-1.0.0/features/markov_analysis.feature +36 -0
  108. biblicus-1.0.0/features/markov_analysis_categorical.feature +42 -0
  109. biblicus-1.0.0/features/markov_analysis_llm.feature +65 -0
  110. biblicus-1.0.0/features/markov_analysis_topic_modeling.feature +40 -0
  111. biblicus-1.0.0/features/markov_analysis_variants.feature +559 -0
  112. biblicus-1.0.0/features/markov_embeddings_errors.feature +13 -0
  113. biblicus-1.0.0/features/markov_internal_branches.feature +297 -0
  114. biblicus-1.0.0/features/markov_schema.feature +161 -0
  115. biblicus-1.0.0/features/markov_start_end_labels.feature +10 -0
  116. biblicus-1.0.0/features/profiling_config_overrides.feature +16 -0
  117. biblicus-1.0.0/features/recipe_cascading.feature +63 -0
  118. biblicus-1.0.0/features/recipe_utilities.feature +77 -0
  119. {biblicus-0.13.0 → biblicus-1.0.0}/features/retrieval_budget.feature +4 -0
  120. biblicus-1.0.0/features/retrieval_build_recipes.feature +19 -0
  121. biblicus-1.0.0/features/retrieval_evaluation_lab.feature +10 -0
  122. {biblicus-0.13.0 → biblicus-1.0.0}/features/retrieval_quality.feature +20 -20
  123. {biblicus-0.13.0 → biblicus-1.0.0}/features/retrieval_scan.feature +6 -18
  124. {biblicus-0.13.0 → biblicus-1.0.0}/features/retrieval_sqlite_full_text_search.feature +1 -1
  125. {biblicus-0.13.0 → biblicus-1.0.0}/features/retrieval_uses_extraction_run.feature +5 -5
  126. biblicus-1.0.0/features/select_override_defaults.feature +14 -0
  127. biblicus-1.0.0/features/source_helper_internal_branches.feature +22 -0
  128. biblicus-1.0.0/features/steps/ai_llm_steps.py +44 -0
  129. biblicus-1.0.0/features/steps/ai_models_steps.py +181 -0
  130. {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/analysis_steps.py +8 -6
  131. {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/backend_steps.py +1 -1
  132. {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/cli_parsing_steps.py +16 -0
  133. {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/cli_steps.py +73 -7
  134. biblicus-1.0.0/features/steps/context_compaction_steps.py +139 -0
  135. biblicus-1.0.0/features/steps/context_compactor_steps.py +28 -0
  136. biblicus-1.0.0/features/steps/context_default_pack_priority_steps.py +98 -0
  137. biblicus-1.0.0/features/steps/context_default_pack_weights_steps.py +91 -0
  138. biblicus-1.0.0/features/steps/context_default_regeneration_steps.py +69 -0
  139. biblicus-1.0.0/features/steps/context_engine_error_steps.py +111 -0
  140. biblicus-1.0.0/features/steps/context_engine_full_paths_steps.py +696 -0
  141. biblicus-1.0.0/features/steps/context_engine_internal_steps.py +322 -0
  142. biblicus-1.0.0/features/steps/context_engine_model_steps.py +144 -0
  143. biblicus-1.0.0/features/steps/context_engine_registry.py +123 -0
  144. biblicus-1.0.0/features/steps/context_engine_retrieval_internal_steps.py +113 -0
  145. biblicus-1.0.0/features/steps/context_engine_retrieve_context_pack_steps.py +129 -0
  146. biblicus-1.0.0/features/steps/context_engine_retriever.py +104 -0
  147. biblicus-1.0.0/features/steps/context_expansion_steps.py +79 -0
  148. biblicus-1.0.0/features/steps/context_explicit_pack_priority_steps.py +94 -0
  149. biblicus-1.0.0/features/steps/context_explicit_pack_weights_steps.py +83 -0
  150. biblicus-1.0.0/features/steps/context_explicit_regeneration_steps.py +84 -0
  151. biblicus-1.0.0/features/steps/context_history_compaction_steps.py +46 -0
  152. biblicus-1.0.0/features/steps/context_nested_compaction_steps.py +50 -0
  153. biblicus-1.0.0/features/steps/context_nested_context_packs_steps.py +74 -0
  154. biblicus-1.0.0/features/steps/context_nested_pack_budget_cap_steps.py +84 -0
  155. biblicus-1.0.0/features/steps/context_nested_regeneration_steps.py +91 -0
  156. biblicus-1.0.0/features/steps/context_pack_budget_steps.py +81 -0
  157. {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/context_pack_steps.py +54 -0
  158. biblicus-1.0.0/features/steps/context_regeneration_steps.py +73 -0
  159. biblicus-1.0.0/features/steps/context_retriever_steps.py +68 -0
  160. biblicus-1.0.0/features/steps/corpus_internal_steps.py +190 -0
  161. {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/docling_steps.py +7 -0
  162. biblicus-1.0.0/features/steps/embedding_index_evidence_steps.py +150 -0
  163. biblicus-1.0.0/features/steps/embedding_index_internal_steps.py +34 -0
  164. biblicus-1.0.0/features/steps/embedding_retrieval_coverage_steps.py +453 -0
  165. biblicus-1.0.0/features/steps/embeddings_steps.py +122 -0
  166. {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/extraction_steps.py +20 -0
  167. biblicus-1.0.0/features/steps/hook_logging_steps.py +13 -0
  168. {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/markitdown_steps.py +7 -0
  169. biblicus-1.0.0/features/steps/markov_embeddings_error_steps.py +69 -0
  170. biblicus-1.0.0/features/steps/markov_internal_steps.py +1933 -0
  171. biblicus-1.0.0/features/steps/markov_schema_steps.py +729 -0
  172. biblicus-1.0.0/features/steps/markov_start_end_steps.py +38 -0
  173. biblicus-1.0.0/features/steps/markov_steps.py +451 -0
  174. biblicus-1.0.0/features/steps/openai_steps.py +735 -0
  175. {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/paddleocr_vl_steps.py +7 -0
  176. {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/profiling_steps.py +74 -0
  177. {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/rapidocr_steps.py +7 -0
  178. biblicus-1.0.0/features/steps/recipe_steps.py +96 -0
  179. biblicus-1.0.0/features/steps/retrieval_build_recipe_steps.py +64 -0
  180. biblicus-1.0.0/features/steps/retrieval_evaluation_lab_steps.py +77 -0
  181. {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/retrieval_quality_steps.py +3 -3
  182. {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/retrieval_steps.py +87 -4
  183. biblicus-1.0.0/features/steps/select_override_defaults_steps.py +21 -0
  184. biblicus-1.0.0/features/steps/source_helper_steps.py +35 -0
  185. biblicus-1.0.0/features/steps/text_annotate_steps.py +477 -0
  186. biblicus-1.0.0/features/steps/text_extract_steps.py +480 -0
  187. biblicus-1.0.0/features/steps/text_internal_steps.py +64 -0
  188. biblicus-1.0.0/features/steps/text_link_internal_steps.py +411 -0
  189. biblicus-1.0.0/features/steps/text_link_steps.py +494 -0
  190. biblicus-1.0.0/features/steps/text_mock_steps.py +199 -0
  191. biblicus-1.0.0/features/steps/text_redact_steps.py +509 -0
  192. biblicus-1.0.0/features/steps/text_slice_steps.py +433 -0
  193. biblicus-1.0.0/features/steps/text_tool_loop_steps.py +36 -0
  194. biblicus-1.0.0/features/steps/tf_vector_internal_steps.py +14 -0
  195. {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/topic_modeling_steps.py +45 -0
  196. {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/unstructured_steps.py +7 -0
  197. biblicus-1.0.0/features/steps/use_cases_steps.py +139 -0
  198. biblicus-1.0.0/features/steps/wikitext_steps.py +31 -0
  199. biblicus-1.0.0/features/text_annotate.feature +227 -0
  200. biblicus-1.0.0/features/text_extract.feature +226 -0
  201. biblicus-1.0.0/features/text_internal_branches.feature +52 -0
  202. biblicus-1.0.0/features/text_link.feature +146 -0
  203. biblicus-1.0.0/features/text_link_internal_branches.feature +114 -0
  204. biblicus-1.0.0/features/text_mock.feature +86 -0
  205. biblicus-1.0.0/features/text_redact.feature +135 -0
  206. biblicus-1.0.0/features/text_slice.feature +135 -0
  207. biblicus-1.0.0/features/text_utilities.feature +51 -0
  208. {biblicus-0.13.0 → biblicus-1.0.0}/features/topic_modeling.feature +3 -3
  209. biblicus-1.0.0/features/use_cases.feature +21 -0
  210. {biblicus-0.13.0 → biblicus-1.0.0}/pyproject.toml +12 -1
  211. biblicus-1.0.0/scripts/demo_context_engine.py +328 -0
  212. biblicus-1.0.0/scripts/markov_analysis_demo.py +279 -0
  213. biblicus-1.0.0/scripts/markov_cached_segments_demo.py +603 -0
  214. biblicus-1.0.0/scripts/markov_run_report.py +243 -0
  215. {biblicus-0.13.0 → biblicus-1.0.0}/scripts/readme_end_to_end_demo.py +1 -1
  216. biblicus-1.0.0/scripts/retrieval_evaluation_lab.py +284 -0
  217. biblicus-1.0.0/scripts/use_cases/notes_to_context_pack_demo.py +121 -0
  218. biblicus-1.0.0/scripts/use_cases/sequence_markov_demo.py +189 -0
  219. biblicus-1.0.0/scripts/use_cases/text_folder_search_demo.py +132 -0
  220. biblicus-1.0.0/scripts/use_cases/text_redact_demo.py +116 -0
  221. {biblicus-0.13.0 → biblicus-1.0.0}/scripts/wikipedia_rag_demo.py +1 -1
  222. biblicus-1.0.0/src/biblicus/__init__.py +50 -0
  223. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/_vendor/dotyaml/__init__.py +2 -2
  224. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/_vendor/dotyaml/loader.py +40 -1
  225. biblicus-1.0.0/src/biblicus/ai/__init__.py +39 -0
  226. biblicus-1.0.0/src/biblicus/ai/embeddings.py +114 -0
  227. biblicus-1.0.0/src/biblicus/ai/llm.py +138 -0
  228. biblicus-1.0.0/src/biblicus/ai/models.py +226 -0
  229. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/analysis/__init__.py +5 -2
  230. biblicus-1.0.0/src/biblicus/analysis/markov.py +1656 -0
  231. biblicus-1.0.0/src/biblicus/analysis/models.py +1530 -0
  232. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/analysis/topic_modeling.py +98 -19
  233. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/backends/__init__.py +6 -2
  234. biblicus-1.0.0/src/biblicus/backends/embedding_index_common.py +334 -0
  235. biblicus-1.0.0/src/biblicus/backends/embedding_index_file.py +272 -0
  236. biblicus-1.0.0/src/biblicus/backends/embedding_index_inmemory.py +270 -0
  237. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/backends/hybrid.py +14 -6
  238. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/backends/scan.py +1 -0
  239. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/backends/sqlite_full_text_search.py +5 -3
  240. biblicus-0.13.0/src/biblicus/backends/vector.py → biblicus-1.0.0/src/biblicus/backends/tf_vector.py +28 -35
  241. biblicus-1.0.0/src/biblicus/chunking.py +396 -0
  242. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/cli.py +193 -48
  243. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/context.py +29 -14
  244. biblicus-1.0.0/src/biblicus/context_engine/__init__.py +53 -0
  245. biblicus-1.0.0/src/biblicus/context_engine/assembler.py +1060 -0
  246. biblicus-1.0.0/src/biblicus/context_engine/compaction.py +110 -0
  247. biblicus-1.0.0/src/biblicus/context_engine/models.py +423 -0
  248. biblicus-1.0.0/src/biblicus/context_engine/retrieval.py +129 -0
  249. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/corpus.py +117 -16
  250. biblicus-1.0.0/src/biblicus/embedding_providers.py +122 -0
  251. biblicus-1.0.0/src/biblicus/errors.py +39 -0
  252. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/frontmatter.py +2 -0
  253. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/knowledge_base.py +1 -1
  254. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/models.py +15 -3
  255. biblicus-1.0.0/src/biblicus/recipes.py +136 -0
  256. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/retrieval.py +7 -2
  257. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/sources.py +46 -11
  258. biblicus-1.0.0/src/biblicus/text/__init__.py +43 -0
  259. biblicus-1.0.0/src/biblicus/text/annotate.py +222 -0
  260. biblicus-1.0.0/src/biblicus/text/extract.py +210 -0
  261. biblicus-1.0.0/src/biblicus/text/link.py +525 -0
  262. biblicus-1.0.0/src/biblicus/text/markup.py +200 -0
  263. biblicus-1.0.0/src/biblicus/text/models.py +319 -0
  264. biblicus-1.0.0/src/biblicus/text/prompts.py +115 -0
  265. biblicus-1.0.0/src/biblicus/text/redact.py +229 -0
  266. biblicus-1.0.0/src/biblicus/text/slice.py +155 -0
  267. biblicus-1.0.0/src/biblicus/text/tool_loop.py +334 -0
  268. {biblicus-0.13.0 → biblicus-1.0.0/src/biblicus.egg-info}/PKG-INFO +103 -31
  269. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus.egg-info/SOURCES.txt +183 -4
  270. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus.egg-info/requires.txt +10 -0
  271. biblicus-0.13.0/docs/ANALYSIS.md +0 -47
  272. biblicus-0.13.0/docs/ARCHITECTURE.md +0 -180
  273. biblicus-0.13.0/docs/RETRIEVAL.md +0 -47
  274. biblicus-0.13.0/docs/RETRIEVAL_EVALUATION.md +0 -74
  275. biblicus-0.13.0/docs/RETRIEVAL_QUALITY.md +0 -42
  276. biblicus-0.13.0/docs/backends/vector.md +0 -59
  277. biblicus-0.13.0/docs/index.rst +0 -33
  278. biblicus-0.13.0/features/steps/openai_steps.py +0 -314
  279. biblicus-0.13.0/src/biblicus/__init__.py +0 -30
  280. biblicus-0.13.0/src/biblicus/analysis/llm.py +0 -106
  281. biblicus-0.13.0/src/biblicus/analysis/models.py +0 -777
  282. biblicus-0.13.0/src/biblicus/errors.py +0 -15
  283. {biblicus-0.13.0 → biblicus-1.0.0}/LICENSE +0 -0
  284. {biblicus-0.13.0 → biblicus-1.0.0}/MANIFEST.in +0 -0
  285. {biblicus-0.13.0 → biblicus-1.0.0}/THIRD_PARTY_NOTICES.md +0 -0
  286. {biblicus-0.13.0 → biblicus-1.0.0}/datasets/extraction_lab/labels.json +0 -0
  287. {biblicus-0.13.0 → biblicus-1.0.0}/datasets/wikipedia_mini.json +0 -0
  288. {biblicus-0.13.0 → biblicus-1.0.0}/docs/api.rst +0 -0
  289. {biblicus-0.13.0 → biblicus-1.0.0}/docs/backends/scan.md +0 -0
  290. {biblicus-0.13.0 → biblicus-1.0.0}/docs/backends/sqlite-full-text-search.md +0 -0
  291. {biblicus-0.13.0 → biblicus-1.0.0}/docs/extractors/ocr/paddleocr-vl.md +0 -0
  292. {biblicus-0.13.0 → biblicus-1.0.0}/docs/extractors/ocr/rapidocr.md +0 -0
  293. {biblicus-0.13.0 → biblicus-1.0.0}/docs/extractors/pipeline-utilities/pipeline.md +0 -0
  294. {biblicus-0.13.0 → biblicus-1.0.0}/docs/extractors/pipeline-utilities/select-longest.md +0 -0
  295. {biblicus-0.13.0 → biblicus-1.0.0}/docs/extractors/pipeline-utilities/select-override.md +0 -0
  296. {biblicus-0.13.0 → biblicus-1.0.0}/docs/extractors/pipeline-utilities/select-smart-override.md +0 -0
  297. {biblicus-0.13.0 → biblicus-1.0.0}/docs/extractors/pipeline-utilities/select-text.md +0 -0
  298. {biblicus-0.13.0 → biblicus-1.0.0}/docs/extractors/speech-to-text/deepgram.md +0 -0
  299. {biblicus-0.13.0 → biblicus-1.0.0}/docs/extractors/speech-to-text/openai.md +0 -0
  300. {biblicus-0.13.0 → biblicus-1.0.0}/docs/extractors/text-document/markitdown.md +0 -0
  301. {biblicus-0.13.0 → biblicus-1.0.0}/docs/extractors/text-document/metadata.md +0 -0
  302. {biblicus-0.13.0 → biblicus-1.0.0}/docs/extractors/text-document/pass-through.md +0 -0
  303. {biblicus-0.13.0 → biblicus-1.0.0}/docs/extractors/text-document/pdf.md +0 -0
  304. {biblicus-0.13.0 → biblicus-1.0.0}/docs/extractors/text-document/unstructured.md +0 -0
  305. {biblicus-0.13.0 → biblicus-1.0.0}/docs/extractors/vlm-document/docling-granite.md +0 -0
  306. {biblicus-0.13.0 → biblicus-1.0.0}/docs/extractors/vlm-document/docling-smol.md +0 -0
  307. {biblicus-0.13.0 → biblicus-1.0.0}/features/backend_validation.feature +0 -0
  308. {biblicus-0.13.0 → biblicus-1.0.0}/features/cli_entrypoint.feature +0 -0
  309. {biblicus-0.13.0 → biblicus-1.0.0}/features/cli_step_spec_parsing.feature +0 -0
  310. {biblicus-0.13.0 → biblicus-1.0.0}/features/content_sniffing.feature +0 -0
  311. {biblicus-0.13.0 → biblicus-1.0.0}/features/context_pack.feature +0 -0
  312. {biblicus-0.13.0 → biblicus-1.0.0}/features/context_pack_cli.feature +0 -0
  313. {biblicus-0.13.0 → biblicus-1.0.0}/features/corpus_edge_cases.feature +0 -0
  314. {biblicus-0.13.0 → biblicus-1.0.0}/features/corpus_identity.feature +0 -0
  315. {biblicus-0.13.0 → biblicus-1.0.0}/features/corpus_purge.feature +0 -0
  316. {biblicus-0.13.0 → biblicus-1.0.0}/features/crawl.feature +0 -0
  317. {biblicus-0.13.0 → biblicus-1.0.0}/features/docling_granite_extractor.feature +0 -0
  318. {biblicus-0.13.0 → biblicus-1.0.0}/features/docling_smol_extractor.feature +0 -0
  319. {biblicus-0.13.0 → biblicus-1.0.0}/features/evidence_processing.feature +0 -0
  320. {biblicus-0.13.0 → biblicus-1.0.0}/features/extraction_error_handling.feature +0 -0
  321. {biblicus-0.13.0 → biblicus-1.0.0}/features/extraction_evaluation.feature +0 -0
  322. {biblicus-0.13.0 → biblicus-1.0.0}/features/extraction_evaluation_lab.feature +0 -0
  323. {biblicus-0.13.0 → biblicus-1.0.0}/features/extraction_run_lifecycle.feature +0 -0
  324. {biblicus-0.13.0 → biblicus-1.0.0}/features/extraction_selection.feature +0 -0
  325. {biblicus-0.13.0 → biblicus-1.0.0}/features/extraction_selection_longest.feature +0 -0
  326. {biblicus-0.13.0 → biblicus-1.0.0}/features/extractor_pipeline.feature +0 -0
  327. {biblicus-0.13.0 → biblicus-1.0.0}/features/extractor_validation.feature +0 -0
  328. {biblicus-0.13.0 → biblicus-1.0.0}/features/frontmatter.feature +0 -0
  329. {biblicus-0.13.0 → biblicus-1.0.0}/features/hook_config_validation.feature +0 -0
  330. {biblicus-0.13.0 → biblicus-1.0.0}/features/hook_error_handling.feature +0 -0
  331. {biblicus-0.13.0 → biblicus-1.0.0}/features/import_tree.feature +0 -0
  332. {biblicus-0.13.0 → biblicus-1.0.0}/features/inference_backend.feature +0 -0
  333. {biblicus-0.13.0 → biblicus-1.0.0}/features/ingest_sources.feature +0 -0
  334. {biblicus-0.13.0 → biblicus-1.0.0}/features/integration_audio_samples.feature +0 -0
  335. {biblicus-0.13.0 → biblicus-1.0.0}/features/integration_image_samples.feature +0 -0
  336. {biblicus-0.13.0 → biblicus-1.0.0}/features/integration_mixed_corpus.feature +0 -0
  337. {biblicus-0.13.0 → biblicus-1.0.0}/features/integration_mixed_extraction.feature +0 -0
  338. {biblicus-0.13.0 → biblicus-1.0.0}/features/integration_pdf_samples.feature +0 -0
  339. {biblicus-0.13.0 → biblicus-1.0.0}/features/integration_wikipedia.feature +0 -0
  340. {biblicus-0.13.0 → biblicus-1.0.0}/features/knowledge_base.feature +0 -0
  341. {biblicus-0.13.0 → biblicus-1.0.0}/features/lifecycle_hooks.feature +0 -0
  342. {biblicus-0.13.0 → biblicus-1.0.0}/features/markitdown_extractor.feature +0 -0
  343. {biblicus-0.13.0 → biblicus-1.0.0}/features/model_validation.feature +0 -0
  344. {biblicus-0.13.0 → biblicus-1.0.0}/features/ocr_extractor.feature +0 -0
  345. {biblicus-0.13.0 → biblicus-1.0.0}/features/paddleocr_vl_extractor.feature +0 -0
  346. {biblicus-0.13.0 → biblicus-1.0.0}/features/paddleocr_vl_parse_api_response.feature +0 -0
  347. {biblicus-0.13.0 → biblicus-1.0.0}/features/pdf_text_extraction.feature +0 -0
  348. {biblicus-0.13.0 → biblicus-1.0.0}/features/profiling.feature +0 -0
  349. {biblicus-0.13.0 → biblicus-1.0.0}/features/python_api.feature +0 -0
  350. {biblicus-0.13.0 → biblicus-1.0.0}/features/python_hook_logging.feature +0 -0
  351. {biblicus-0.13.0 → biblicus-1.0.0}/features/query_processing.feature +0 -0
  352. {biblicus-0.13.0 → biblicus-1.0.0}/features/recipe_file_extraction.feature +0 -0
  353. {biblicus-0.13.0 → biblicus-1.0.0}/features/retrieval_utilities.feature +0 -0
  354. {biblicus-0.13.0 → biblicus-1.0.0}/features/select_override.feature +0 -0
  355. {biblicus-0.13.0 → biblicus-1.0.0}/features/smart_override_selection.feature +0 -0
  356. {biblicus-0.13.0 → biblicus-1.0.0}/features/source_loading.feature +0 -0
  357. {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/crawl_steps.py +0 -0
  358. {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/deepgram_steps.py +0 -0
  359. {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/evidence_processing_steps.py +0 -0
  360. {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/extraction_evaluation_lab_steps.py +0 -0
  361. {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/extraction_evaluation_steps.py +0 -0
  362. {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/extraction_run_lifecycle_steps.py +0 -0
  363. {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/extractor_steps.py +0 -0
  364. {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/frontmatter_steps.py +0 -0
  365. {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/inference_steps.py +0 -0
  366. {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/knowledge_base_steps.py +0 -0
  367. {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/model_steps.py +0 -0
  368. {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/paddleocr_mock_steps.py +0 -0
  369. {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/paddleocr_vl_unit_steps.py +0 -0
  370. {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/pdf_steps.py +0 -0
  371. {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/python_api_steps.py +0 -0
  372. {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/requests_mock_steps.py +0 -0
  373. {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/stt_deepgram_steps.py +0 -0
  374. {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/stt_steps.py +0 -0
  375. {biblicus-0.13.0 → biblicus-1.0.0}/features/steps/user_config_steps.py +0 -0
  376. {biblicus-0.13.0 → biblicus-1.0.0}/features/streaming_ingest.feature +0 -0
  377. {biblicus-0.13.0 → biblicus-1.0.0}/features/stt_deepgram_extractor.feature +0 -0
  378. {biblicus-0.13.0 → biblicus-1.0.0}/features/stt_extractor.feature +0 -0
  379. {biblicus-0.13.0 → biblicus-1.0.0}/features/text_extraction_runs.feature +0 -0
  380. {biblicus-0.13.0 → biblicus-1.0.0}/features/token_budget.feature +0 -0
  381. {biblicus-0.13.0 → biblicus-1.0.0}/features/unstructured_extractor.feature +0 -0
  382. {biblicus-0.13.0 → biblicus-1.0.0}/features/user_config.feature +0 -0
  383. {biblicus-0.13.0 → biblicus-1.0.0}/scripts/download_ag_news.py +0 -0
  384. {biblicus-0.13.0 → biblicus-1.0.0}/scripts/download_audio_samples.py +0 -0
  385. {biblicus-0.13.0 → biblicus-1.0.0}/scripts/download_image_samples.py +0 -0
  386. {biblicus-0.13.0 → biblicus-1.0.0}/scripts/download_mixed_samples.py +0 -0
  387. {biblicus-0.13.0 → biblicus-1.0.0}/scripts/download_pdf_samples.py +0 -0
  388. {biblicus-0.13.0 → biblicus-1.0.0}/scripts/download_wikipedia.py +0 -0
  389. {biblicus-0.13.0 → biblicus-1.0.0}/scripts/extraction_evaluation_demo.py +0 -0
  390. {biblicus-0.13.0 → biblicus-1.0.0}/scripts/extraction_evaluation_lab.py +0 -0
  391. {biblicus-0.13.0 → biblicus-1.0.0}/scripts/profiling_demo.py +0 -0
  392. {biblicus-0.13.0 → biblicus-1.0.0}/scripts/test.py +0 -0
  393. {biblicus-0.13.0 → biblicus-1.0.0}/scripts/topic_modeling_integration.py +0 -0
  394. {biblicus-0.13.0 → biblicus-1.0.0}/setup.cfg +0 -0
  395. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/__main__.py +0 -0
  396. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/_vendor/dotyaml/interpolation.py +0 -0
  397. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/_vendor/dotyaml/transformer.py +0 -0
  398. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/analysis/base.py +0 -0
  399. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/analysis/profiling.py +0 -0
  400. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/analysis/schema.py +0 -0
  401. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/backends/base.py +0 -0
  402. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/constants.py +0 -0
  403. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/crawl.py +0 -0
  404. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/evaluation.py +0 -0
  405. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/evidence_processing.py +0 -0
  406. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/extraction.py +0 -0
  407. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/extraction_evaluation.py +0 -0
  408. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/extractors/__init__.py +0 -0
  409. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/extractors/base.py +0 -0
  410. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/extractors/deepgram_stt.py +0 -0
  411. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/extractors/docling_granite_text.py +0 -0
  412. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/extractors/docling_smol_text.py +0 -0
  413. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/extractors/markitdown_text.py +0 -0
  414. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/extractors/metadata_text.py +0 -0
  415. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/extractors/openai_stt.py +0 -0
  416. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/extractors/paddleocr_vl_text.py +0 -0
  417. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/extractors/pass_through_text.py +0 -0
  418. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/extractors/pdf_text.py +0 -0
  419. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/extractors/pipeline.py +0 -0
  420. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/extractors/rapidocr_text.py +0 -0
  421. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/extractors/select_longest_text.py +0 -0
  422. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/extractors/select_override.py +0 -0
  423. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/extractors/select_smart_override.py +0 -0
  424. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/extractors/select_text.py +0 -0
  425. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/extractors/unstructured_text.py +0 -0
  426. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/hook_logging.py +0 -0
  427. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/hook_manager.py +0 -0
  428. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/hooks.py +0 -0
  429. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/ignore.py +0 -0
  430. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/inference.py +0 -0
  431. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/time.py +0 -0
  432. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/uris.py +0 -0
  433. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus/user_config.py +0 -0
  434. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus.egg-info/dependency_links.txt +0 -0
  435. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus.egg-info/entry_points.txt +0 -0
  436. {biblicus-0.13.0 → biblicus-1.0.0}/src/biblicus.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: biblicus
3
- Version: 0.13.0
3
+ Version: 1.0.0
4
4
  Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
5
5
  License: MIT
6
6
  Requires-Python: >=3.9
@@ -9,6 +9,9 @@ License-File: LICENSE
9
9
  Requires-Dist: pydantic>=2.0
10
10
  Requires-Dist: PyYAML>=6.0
11
11
  Requires-Dist: pypdf>=4.0
12
+ Requires-Dist: Jinja2>=3.1
13
+ Requires-Dist: dotyaml>=0.1.3
14
+ Requires-Dist: numpy>=1.24
12
15
  Provides-Extra: dev
13
16
  Requires-Dist: behave>=1.2.6; extra == "dev"
14
17
  Requires-Dist: coverage[toml]>=7.0; extra == "dev"
@@ -18,6 +21,9 @@ Requires-Dist: sphinx_rtd_theme>=2.0; extra == "dev"
18
21
  Requires-Dist: ruff>=0.4.0; extra == "dev"
19
22
  Requires-Dist: black>=24.0; extra == "dev"
20
23
  Requires-Dist: python-semantic-release>=9.0.0; extra == "dev"
24
+ Provides-Extra: dspy
25
+ Requires-Dist: dspy>=2.5; extra == "dspy"
26
+ Requires-Dist: litellm>=1.0; extra == "dspy"
21
27
  Provides-Extra: openai
22
28
  Requires-Dist: openai>=1.0; extra == "openai"
23
29
  Provides-Extra: unstructured
@@ -40,6 +46,8 @@ Provides-Extra: docling-mlx
40
46
  Requires-Dist: docling[mlx-vlm]>=2.0.0; extra == "docling-mlx"
41
47
  Provides-Extra: topic-modeling
42
48
  Requires-Dist: bertopic>=0.15.0; extra == "topic-modeling"
49
+ Provides-Extra: markov-analysis
50
+ Requires-Dist: hmmlearn>=0.3.0; extra == "markov-analysis"
43
51
  Provides-Extra: datasets
44
52
  Requires-Dist: datasets>=2.18.0; extra == "datasets"
45
53
  Dynamic: license-file
@@ -50,18 +58,33 @@ Dynamic: license-file
50
58
  ![Coverage][coverage-badge]
51
59
  ![Documentation][documentation-badge]
52
60
 
53
- Make your documents usable by your assistant, then decide later how you will search and retrieve them.
54
-
61
+ <p>
62
+ <img
63
+ src="docs/_static/Biblicus-logo.png"
64
+ alt="Biblicus logo"
65
+ align="right"
66
+ width="216"
67
+ />
68
+ Make your documents usable by your assistant, then decide later how you will search and retrieve them.
69
+ </p>
55
70
  If you are building an assistant in Python, you probably have material you want it to use: notes, documents, web pages, and reference files. A common approach is retrieval augmented generation, where a system retrieves relevant material and uses it as evidence when generating a response.
56
71
 
57
72
  The first practical problem is not retrieval. It is collection and care. You need a stable place to put raw items, you need a small amount of metadata so you can find them again, and you need a way to evolve your retrieval approach over time without rewriting ingestion.
58
73
 
59
- This library gives you a corpus, which is a normal folder on disk. It stores each ingested item as a file, with optional metadata stored next to it. You can open and inspect the raw files directly. Any derived catalog or index can be rebuilt from the raw corpus.
74
+ Biblicus gives you a normal folder on disk to manage. In Biblicus documentation, that managed folder is called a *corpus* (plural: *corpora*). It stores each ingested item as a file, with optional metadata stored next to it. You can open and inspect the raw files directly. Any derived catalog or index can be rebuilt from the raw files.
60
75
 
61
76
  It can be used alongside LangGraph, Tactus, Pydantic AI, any agent framework, or your own setup. Use it from Python or from the command line interface.
62
77
 
63
78
  See [retrieval augmented generation overview] for a short introduction to the idea.
64
79
 
80
+ ## Analysis highlights
81
+
82
+ - `biblicus analyze markov` learns a directed, weighted state transition graph over segmented text.
83
+ - YAML recipes support cascading composition plus dotted `--config key=value` overrides.
84
+ - Text extract uses an LLM to split long texts by inserting XML tags in place, producing structured spans.
85
+ - See `docs/MARKOV_ANALYSIS.md` for Markov analysis details and runnable demos.
86
+ - See `docs/TEXT_EXTRACT.md` for the text extract utility and examples.
87
+
65
88
  ## Start with a knowledge base
66
89
 
67
90
  If you just want to hand a folder to your assistant and move on, use the high-level knowledge base interface. The folder can be nothing more than a handful of plain text files. You are not choosing a retrieval strategy yet. You are just collecting.
@@ -106,7 +129,7 @@ Think in three stages.
106
129
 
107
130
  If you learn a few project words, the rest of the system becomes predictable.
108
131
 
109
- - Corpus is the folder that holds raw items and their metadata.
132
+ - Corpus is the managed folder that holds raw items and their metadata.
110
133
  - Item is the raw bytes plus optional metadata and source information.
111
134
  - Catalog is the rebuildable index of the corpus.
112
135
  - Extraction run is a recorded extraction build that produces text artifacts.
@@ -161,28 +184,28 @@ sequenceDiagram
161
184
  This repository is a working Python package. Install it into a virtual environment from the repository root.
162
185
 
163
186
  ```
164
- python3 -m pip install -e .
187
+ python -m pip install -e .
165
188
  ```
166
189
 
167
190
  After the first release, you can install it from the Python Package Index.
168
191
 
169
192
  ```
170
- python3 -m pip install biblicus
193
+ python -m pip install biblicus
171
194
  ```
172
195
 
173
196
  ### Optional extras
174
197
 
175
198
  Some extractors are optional so the base install stays small.
176
199
 
177
- - Optical character recognition for images: `python3 -m pip install "biblicus[ocr]"`
178
- - Advanced optical character recognition with PaddleOCR: `python3 -m pip install "biblicus[paddleocr]"`
179
- - Document understanding with Docling VLM: `python3 -m pip install "biblicus[docling]"`
180
- - Document understanding with Docling VLM and MLX acceleration: `python3 -m pip install "biblicus[docling-mlx]"`
181
- - Speech to text transcription with OpenAI: `python3 -m pip install "biblicus[openai]"` (requires an OpenAI API key in `~/.biblicus/config.yml` or `./.biblicus/config.yml`)
182
- - Speech to text transcription with Deepgram: `python3 -m pip install "biblicus[deepgram]"` (requires a Deepgram API key in `~/.biblicus/config.yml` or `./.biblicus/config.yml`)
183
- - Broad document parsing fallback: `python3 -m pip install "biblicus[unstructured]"`
184
- - MarkItDown document conversion (requires Python 3.10 or higher): `python3 -m pip install "biblicus[markitdown]"`
185
- - Topic modeling analysis with BERTopic: `python3 -m pip install "biblicus[topic-modeling]"`
200
+ - Optical character recognition for images: `python -m pip install "biblicus[ocr]"`
201
+ - Advanced optical character recognition with PaddleOCR: `python -m pip install "biblicus[paddleocr]"`
202
+ - Document understanding with Docling VLM: `python -m pip install "biblicus[docling]"`
203
+ - Document understanding with Docling VLM and MLX acceleration: `python -m pip install "biblicus[docling-mlx]"`
204
+ - Speech to text transcription with OpenAI: `python -m pip install "biblicus[openai]"` (requires an OpenAI API key in `~/.biblicus/config.yml` or `./.biblicus/config.yml`)
205
+ - Speech to text transcription with Deepgram: `python -m pip install "biblicus[deepgram]"` (requires a Deepgram API key in `~/.biblicus/config.yml` or `./.biblicus/config.yml`)
206
+ - Broad document parsing fallback: `python -m pip install "biblicus[unstructured]"`
207
+ - MarkItDown document conversion (requires Python 3.10 or higher): `python -m pip install "biblicus[markitdown]"`
208
+ - Topic modeling analysis with BERTopic: `python -m pip install "biblicus[topic-modeling]"`
186
209
 
187
210
  ## Quick start
188
211
 
@@ -200,16 +223,49 @@ biblicus build --corpus corpora/example --backend scan
200
223
  biblicus query --corpus corpora/example --query "note"
201
224
  ```
202
225
 
203
- If you want to turn a website section into corpus items, crawl a root web address while restricting the crawl to an allowed prefix:
226
+ ## Web Ingestion
227
+
228
+ Biblicus supports ingesting content directly from the web using two approaches.
229
+
230
+ ### Ingest from URLs
204
231
 
232
+ Ingest individual documents or web pages from URLs. The `ingest` command automatically detects content types including PDF, HTML, Markdown, images, and audio:
233
+
234
+ ```bash
235
+ # Ingest a document from a URL
236
+ biblicus ingest https://example.com/document.pdf --tags "research"
237
+
238
+ # Ingest a web page
239
+ biblicus ingest https://example.com/article.html --tags "article"
240
+
241
+ # Ingest with a corpus path specified
242
+ biblicus ingest --corpus corpora/example https://docs.example.com/guide.md --tags "documentation"
205
243
  ```
206
- biblicus crawl --corpus corpora/example \\
207
- --root-url https://example.com/docs/index.html \\
208
- --allowed-prefix https://example.com/docs/ \\
209
- --max-items 50 \\
210
- --tag crawled
244
+
245
+ ### Crawl Websites
246
+
247
+ Crawl entire website sections with automatic link discovery. The crawler follows links within the allowed prefix and stores discovered content:
248
+
249
+ ```bash
250
+ # Crawl a documentation site
251
+ biblicus crawl \
252
+ --corpus corpora/example \
253
+ --root-url https://docs.example.com/ \
254
+ --allowed-prefix https://docs.example.com/ \
255
+ --max-items 100 \
256
+ --tags "documentation"
257
+
258
+ # Crawl a specific blog category
259
+ biblicus crawl \
260
+ --corpus corpora/example \
261
+ --root-url https://blog.example.com/category/tutorials/ \
262
+ --allowed-prefix https://blog.example.com/category/tutorials/ \
263
+ --max-items 50 \
264
+ --tags "tutorials,blog"
211
265
  ```
212
266
 
267
+ The `--allowed-prefix` parameter restricts the crawler to only follow links that start with the specified URL prefix, preventing it from crawling outside the intended scope. The crawler respects `.biblicusignore` rules and stores items under `raw/imports/crawl/` in your corpus.
268
+
213
269
  ## End-to-end example: lower-level control
214
270
 
215
271
  The command-line interface returns JavaScript Object Notation by default. This makes it easy to use Biblicus in scripts and to treat retrieval as a deterministic, testable step.
@@ -237,7 +293,7 @@ for note_title, note_text in notes:
237
293
 
238
294
  backend = get_backend("scan")
239
295
  run = backend.build_run(corpus, recipe_name="Story demo", config={})
240
- budget = QueryBudget(max_total_items=5, max_total_characters=2000, max_items_per_source=None)
296
+ budget = QueryBudget(max_total_items=5, maximum_total_characters=2000, max_items_per_source=None)
241
297
  result = backend.query(
242
298
  corpus,
243
299
  run=run,
@@ -277,7 +333,7 @@ Example output:
277
333
  "query_text": "Primary button style preference",
278
334
  "budget": {
279
335
  "max_total_items": 5,
280
- "max_total_characters": 2000,
336
+ "maximum_total_characters": 2000,
281
337
  "max_items_per_source": null
282
338
  },
283
339
  "run_id": "RUN_ID",
@@ -490,7 +546,7 @@ Three backends are included.
490
546
 
491
547
  - `scan` is a minimal baseline that scans raw items directly.
492
548
  - `sqlite-full-text-search` is a practical baseline that builds a full text search index in SQLite.
493
- - `vector` is a deterministic term-frequency vector baseline with cosine similarity scoring.
549
+ - `tf-vector` is a deterministic term-frequency vector baseline with cosine similarity scoring.
494
550
 
495
551
  For detailed documentation including configuration options, performance characteristics, and usage examples, see the [Backend Reference][backend-reference].
496
552
 
@@ -498,7 +554,8 @@ For detailed documentation including configuration options, performance characte
498
554
 
499
555
  For the retrieval pipeline overview and run artifacts, see `docs/RETRIEVAL.md`. For retrieval quality upgrades
500
556
  (tuned lexical baseline, reranking, hybrid retrieval), see `docs/RETRIEVAL_QUALITY.md`. For evaluation workflows
501
- and dataset formats, see `docs/RETRIEVAL_EVALUATION.md`.
557
+ and dataset formats, see `docs/RETRIEVAL_EVALUATION.md`. For a runnable walkthrough, use the retrieval evaluation lab
558
+ script (`scripts/retrieval_evaluation_lab.py`).
502
559
 
503
560
  ## Extraction backends
504
561
 
@@ -539,6 +596,21 @@ For detailed documentation on all extractors, see the [Extractor Reference][extr
539
596
  For extraction evaluation workflows, dataset formats, and report interpretation, see
540
597
  `docs/EXTRACTION_EVALUATION.md`.
541
598
 
599
+ ## Text extract utility
600
+
601
+ Text extract is a reusable analysis utility that lets a model insert XML tags into a long text without re-emitting the
602
+ entire document. It returns structured spans and the marked-up text, and it is used as a segmentation option in Markov
603
+ analysis.
604
+
605
+ See `docs/TEXT_EXTRACT.md` for the utility API and examples, and `docs/MARKOV_ANALYSIS.md` for the Markov integration.
606
+
607
+ ## Text slice utility
608
+
609
+ Text slice is a reusable analysis utility that lets a model insert `<slice/>` markers into a long text without
610
+ re-emitting the entire document. It returns ordered slices and the marked-up text for auditing and reuse.
611
+
612
+ See `docs/TEXT_SLICE.md` for the utility API and examples.
613
+
542
614
  ## Topic modeling analysis
543
615
 
544
616
  Biblicus can run analysis pipelines on extracted text without changing the raw corpus. Profiling and topic modeling
@@ -593,7 +665,7 @@ AG News integration runs require `biblicus[datasets]` in addition to `biblicus[t
593
665
  For a repeatable, real-world integration run that downloads AG News and executes topic modeling, use:
594
666
 
595
667
  ```
596
- python3 scripts/topic_modeling_integration.py --corpus corpora/ag_news_demo --force
668
+ python scripts/topic_modeling_integration.py --corpus corpora/ag_news_demo --force
597
669
  ```
598
670
 
599
671
  See `docs/TOPIC_MODELING.md` for parameter examples and per-topic output behavior.
@@ -607,13 +679,13 @@ Use `scripts/download_pdf_samples.py` to download a small Portable Document Form
607
679
  ## Tests and coverage
608
680
 
609
681
  ```
610
- python3 scripts/test.py
682
+ python scripts/test.py
611
683
  ```
612
684
 
613
685
  To include integration scenarios that download public test data at runtime, run this command.
614
686
 
615
687
  ```
616
- python3 scripts/test.py --integration
688
+ python scripts/test.py --integration
617
689
  ```
618
690
 
619
691
  ## Releases
@@ -631,13 +703,13 @@ Reference documentation is generated from Sphinx style docstrings.
631
703
  Install development dependencies:
632
704
 
633
705
  ```
634
- python3 -m pip install -e ".[dev]"
706
+ python -m pip install -e ".[dev]"
635
707
  ```
636
708
 
637
709
  Build the documentation:
638
710
 
639
711
  ```
640
- python3 -m sphinx -b html docs docs/_build/html
712
+ python -m sphinx -b html docs docs/_build/html
641
713
  ```
642
714
 
643
715
  ## License
@@ -4,18 +4,33 @@
4
4
  ![Coverage][coverage-badge]
5
5
  ![Documentation][documentation-badge]
6
6
 
7
- Make your documents usable by your assistant, then decide later how you will search and retrieve them.
8
-
7
+ <p>
8
+ <img
9
+ src="docs/_static/Biblicus-logo.png"
10
+ alt="Biblicus logo"
11
+ align="right"
12
+ width="216"
13
+ />
14
+ Make your documents usable by your assistant, then decide later how you will search and retrieve them.
15
+ </p>
9
16
  If you are building an assistant in Python, you probably have material you want it to use: notes, documents, web pages, and reference files. A common approach is retrieval augmented generation, where a system retrieves relevant material and uses it as evidence when generating a response.
10
17
 
11
18
  The first practical problem is not retrieval. It is collection and care. You need a stable place to put raw items, you need a small amount of metadata so you can find them again, and you need a way to evolve your retrieval approach over time without rewriting ingestion.
12
19
 
13
- This library gives you a corpus, which is a normal folder on disk. It stores each ingested item as a file, with optional metadata stored next to it. You can open and inspect the raw files directly. Any derived catalog or index can be rebuilt from the raw corpus.
20
+ Biblicus gives you a normal folder on disk to manage. In Biblicus documentation, that managed folder is called a *corpus* (plural: *corpora*). It stores each ingested item as a file, with optional metadata stored next to it. You can open and inspect the raw files directly. Any derived catalog or index can be rebuilt from the raw files.
14
21
 
15
22
  It can be used alongside LangGraph, Tactus, Pydantic AI, any agent framework, or your own setup. Use it from Python or from the command line interface.
16
23
 
17
24
  See [retrieval augmented generation overview] for a short introduction to the idea.
18
25
 
26
+ ## Analysis highlights
27
+
28
+ - `biblicus analyze markov` learns a directed, weighted state transition graph over segmented text.
29
+ - YAML recipes support cascading composition plus dotted `--config key=value` overrides.
30
+ - Text extract uses an LLM to split long texts by inserting XML tags in place, producing structured spans.
31
+ - See `docs/MARKOV_ANALYSIS.md` for Markov analysis details and runnable demos.
32
+ - See `docs/TEXT_EXTRACT.md` for the text extract utility and examples.
33
+
19
34
  ## Start with a knowledge base
20
35
 
21
36
  If you just want to hand a folder to your assistant and move on, use the high-level knowledge base interface. The folder can be nothing more than a handful of plain text files. You are not choosing a retrieval strategy yet. You are just collecting.
@@ -60,7 +75,7 @@ Think in three stages.
60
75
 
61
76
  If you learn a few project words, the rest of the system becomes predictable.
62
77
 
63
- - Corpus is the folder that holds raw items and their metadata.
78
+ - Corpus is the managed folder that holds raw items and their metadata.
64
79
  - Item is the raw bytes plus optional metadata and source information.
65
80
  - Catalog is the rebuildable index of the corpus.
66
81
  - Extraction run is a recorded extraction build that produces text artifacts.
@@ -115,28 +130,28 @@ sequenceDiagram
115
130
  This repository is a working Python package. Install it into a virtual environment from the repository root.
116
131
 
117
132
  ```
118
- python3 -m pip install -e .
133
+ python -m pip install -e .
119
134
  ```
120
135
 
121
136
  After the first release, you can install it from the Python Package Index.
122
137
 
123
138
  ```
124
- python3 -m pip install biblicus
139
+ python -m pip install biblicus
125
140
  ```
126
141
 
127
142
  ### Optional extras
128
143
 
129
144
  Some extractors are optional so the base install stays small.
130
145
 
131
- - Optical character recognition for images: `python3 -m pip install "biblicus[ocr]"`
132
- - Advanced optical character recognition with PaddleOCR: `python3 -m pip install "biblicus[paddleocr]"`
133
- - Document understanding with Docling VLM: `python3 -m pip install "biblicus[docling]"`
134
- - Document understanding with Docling VLM and MLX acceleration: `python3 -m pip install "biblicus[docling-mlx]"`
135
- - Speech to text transcription with OpenAI: `python3 -m pip install "biblicus[openai]"` (requires an OpenAI API key in `~/.biblicus/config.yml` or `./.biblicus/config.yml`)
136
- - Speech to text transcription with Deepgram: `python3 -m pip install "biblicus[deepgram]"` (requires a Deepgram API key in `~/.biblicus/config.yml` or `./.biblicus/config.yml`)
137
- - Broad document parsing fallback: `python3 -m pip install "biblicus[unstructured]"`
138
- - MarkItDown document conversion (requires Python 3.10 or higher): `python3 -m pip install "biblicus[markitdown]"`
139
- - Topic modeling analysis with BERTopic: `python3 -m pip install "biblicus[topic-modeling]"`
146
+ - Optical character recognition for images: `python -m pip install "biblicus[ocr]"`
147
+ - Advanced optical character recognition with PaddleOCR: `python -m pip install "biblicus[paddleocr]"`
148
+ - Document understanding with Docling VLM: `python -m pip install "biblicus[docling]"`
149
+ - Document understanding with Docling VLM and MLX acceleration: `python -m pip install "biblicus[docling-mlx]"`
150
+ - Speech to text transcription with OpenAI: `python -m pip install "biblicus[openai]"` (requires an OpenAI API key in `~/.biblicus/config.yml` or `./.biblicus/config.yml`)
151
+ - Speech to text transcription with Deepgram: `python -m pip install "biblicus[deepgram]"` (requires a Deepgram API key in `~/.biblicus/config.yml` or `./.biblicus/config.yml`)
152
+ - Broad document parsing fallback: `python -m pip install "biblicus[unstructured]"`
153
+ - MarkItDown document conversion (requires Python 3.10 or higher): `python -m pip install "biblicus[markitdown]"`
154
+ - Topic modeling analysis with BERTopic: `python -m pip install "biblicus[topic-modeling]"`
140
155
 
141
156
  ## Quick start
142
157
 
@@ -154,16 +169,49 @@ biblicus build --corpus corpora/example --backend scan
154
169
  biblicus query --corpus corpora/example --query "note"
155
170
  ```
156
171
 
157
- If you want to turn a website section into corpus items, crawl a root web address while restricting the crawl to an allowed prefix:
172
+ ## Web Ingestion
173
+
174
+ Biblicus supports ingesting content directly from the web using two approaches.
175
+
176
+ ### Ingest from URLs
158
177
 
178
+ Ingest individual documents or web pages from URLs. The `ingest` command automatically detects content types including PDF, HTML, Markdown, images, and audio:
179
+
180
+ ```bash
181
+ # Ingest a document from a URL
182
+ biblicus ingest https://example.com/document.pdf --tags "research"
183
+
184
+ # Ingest a web page
185
+ biblicus ingest https://example.com/article.html --tags "article"
186
+
187
+ # Ingest with a corpus path specified
188
+ biblicus ingest --corpus corpora/example https://docs.example.com/guide.md --tags "documentation"
159
189
  ```
160
- biblicus crawl --corpus corpora/example \\
161
- --root-url https://example.com/docs/index.html \\
162
- --allowed-prefix https://example.com/docs/ \\
163
- --max-items 50 \\
164
- --tag crawled
190
+
191
+ ### Crawl Websites
192
+
193
+ Crawl entire website sections with automatic link discovery. The crawler follows links within the allowed prefix and stores discovered content:
194
+
195
+ ```bash
196
+ # Crawl a documentation site
197
+ biblicus crawl \
198
+ --corpus corpora/example \
199
+ --root-url https://docs.example.com/ \
200
+ --allowed-prefix https://docs.example.com/ \
201
+ --max-items 100 \
202
+ --tags "documentation"
203
+
204
+ # Crawl a specific blog category
205
+ biblicus crawl \
206
+ --corpus corpora/example \
207
+ --root-url https://blog.example.com/category/tutorials/ \
208
+ --allowed-prefix https://blog.example.com/category/tutorials/ \
209
+ --max-items 50 \
210
+ --tags "tutorials,blog"
165
211
  ```
166
212
 
213
+ The `--allowed-prefix` parameter restricts the crawler to only follow links that start with the specified URL prefix, preventing it from crawling outside the intended scope. The crawler respects `.biblicusignore` rules and stores items under `raw/imports/crawl/` in your corpus.
214
+
167
215
  ## End-to-end example: lower-level control
168
216
 
169
217
  The command-line interface returns JavaScript Object Notation by default. This makes it easy to use Biblicus in scripts and to treat retrieval as a deterministic, testable step.
@@ -191,7 +239,7 @@ for note_title, note_text in notes:
191
239
 
192
240
  backend = get_backend("scan")
193
241
  run = backend.build_run(corpus, recipe_name="Story demo", config={})
194
- budget = QueryBudget(max_total_items=5, max_total_characters=2000, max_items_per_source=None)
242
+ budget = QueryBudget(max_total_items=5, maximum_total_characters=2000, max_items_per_source=None)
195
243
  result = backend.query(
196
244
  corpus,
197
245
  run=run,
@@ -231,7 +279,7 @@ Example output:
231
279
  "query_text": "Primary button style preference",
232
280
  "budget": {
233
281
  "max_total_items": 5,
234
- "max_total_characters": 2000,
282
+ "maximum_total_characters": 2000,
235
283
  "max_items_per_source": null
236
284
  },
237
285
  "run_id": "RUN_ID",
@@ -444,7 +492,7 @@ Three backends are included.
444
492
 
445
493
  - `scan` is a minimal baseline that scans raw items directly.
446
494
  - `sqlite-full-text-search` is a practical baseline that builds a full text search index in SQLite.
447
- - `vector` is a deterministic term-frequency vector baseline with cosine similarity scoring.
495
+ - `tf-vector` is a deterministic term-frequency vector baseline with cosine similarity scoring.
448
496
 
449
497
  For detailed documentation including configuration options, performance characteristics, and usage examples, see the [Backend Reference][backend-reference].
450
498
 
@@ -452,7 +500,8 @@ For detailed documentation including configuration options, performance characte
452
500
 
453
501
  For the retrieval pipeline overview and run artifacts, see `docs/RETRIEVAL.md`. For retrieval quality upgrades
454
502
  (tuned lexical baseline, reranking, hybrid retrieval), see `docs/RETRIEVAL_QUALITY.md`. For evaluation workflows
455
- and dataset formats, see `docs/RETRIEVAL_EVALUATION.md`.
503
+ and dataset formats, see `docs/RETRIEVAL_EVALUATION.md`. For a runnable walkthrough, use the retrieval evaluation lab
504
+ script (`scripts/retrieval_evaluation_lab.py`).
456
505
 
457
506
  ## Extraction backends
458
507
 
@@ -493,6 +542,21 @@ For detailed documentation on all extractors, see the [Extractor Reference][extr
493
542
  For extraction evaluation workflows, dataset formats, and report interpretation, see
494
543
  `docs/EXTRACTION_EVALUATION.md`.
495
544
 
545
+ ## Text extract utility
546
+
547
+ Text extract is a reusable analysis utility that lets a model insert XML tags into a long text without re-emitting the
548
+ entire document. It returns structured spans and the marked-up text, and it is used as a segmentation option in Markov
549
+ analysis.
550
+
551
+ See `docs/TEXT_EXTRACT.md` for the utility API and examples, and `docs/MARKOV_ANALYSIS.md` for the Markov integration.
552
+
553
+ ## Text slice utility
554
+
555
+ Text slice is a reusable analysis utility that lets a model insert `<slice/>` markers into a long text without
556
+ re-emitting the entire document. It returns ordered slices and the marked-up text for auditing and reuse.
557
+
558
+ See `docs/TEXT_SLICE.md` for the utility API and examples.
559
+
496
560
  ## Topic modeling analysis
497
561
 
498
562
  Biblicus can run analysis pipelines on extracted text without changing the raw corpus. Profiling and topic modeling
@@ -547,7 +611,7 @@ AG News integration runs require `biblicus[datasets]` in addition to `biblicus[t
547
611
  For a repeatable, real-world integration run that downloads AG News and executes topic modeling, use:
548
612
 
549
613
  ```
550
- python3 scripts/topic_modeling_integration.py --corpus corpora/ag_news_demo --force
614
+ python scripts/topic_modeling_integration.py --corpus corpora/ag_news_demo --force
551
615
  ```
552
616
 
553
617
  See `docs/TOPIC_MODELING.md` for parameter examples and per-topic output behavior.
@@ -561,13 +625,13 @@ Use `scripts/download_pdf_samples.py` to download a small Portable Document Form
561
625
  ## Tests and coverage
562
626
 
563
627
  ```
564
- python3 scripts/test.py
628
+ python scripts/test.py
565
629
  ```
566
630
 
567
631
  To include integration scenarios that download public test data at runtime, run this command.
568
632
 
569
633
  ```
570
- python3 scripts/test.py --integration
634
+ python scripts/test.py --integration
571
635
  ```
572
636
 
573
637
  ## Releases
@@ -585,13 +649,13 @@ Reference documentation is generated from Sphinx style docstrings.
585
649
  Install development dependencies:
586
650
 
587
651
  ```
588
- python3 -m pip install -e ".[dev]"
652
+ python -m pip install -e ".[dev]"
589
653
  ```
590
654
 
591
655
  Build the documentation:
592
656
 
593
657
  ```
594
- python3 -m sphinx -b html docs docs/_build/html
658
+ python -m sphinx -b html docs docs/_build/html
595
659
  ```
596
660
 
597
661
  ## License
@@ -0,0 +1,25 @@
1
+ {
2
+ "schema_version": 1,
3
+ "name": "retrieval-evaluation-lab",
4
+ "description": "Bundled labels for the retrieval evaluation lab.",
5
+ "queries": [
6
+ {
7
+ "query_id": "q1",
8
+ "query_text": "alpha unique",
9
+ "expected_filename": "alpha.txt",
10
+ "kind": "gold"
11
+ },
12
+ {
13
+ "query_id": "q2",
14
+ "query_text": "beta unique",
15
+ "expected_filename": "beta.txt",
16
+ "kind": "gold"
17
+ },
18
+ {
19
+ "query_id": "q3",
20
+ "query_text": "gamma unique",
21
+ "expected_filename": "gamma.txt",
22
+ "kind": "gold"
23
+ }
24
+ ]
25
+ }