biblicus 0.9.0__tar.gz → 0.10.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (227) hide show
  1. {biblicus-0.9.0/src/biblicus.egg-info → biblicus-0.10.0}/PKG-INFO +7 -6
  2. {biblicus-0.9.0 → biblicus-0.10.0}/README.md +6 -5
  3. {biblicus-0.9.0 → biblicus-0.10.0}/docs/ANALYSIS.md +11 -0
  4. {biblicus-0.9.0 → biblicus-0.10.0}/docs/DEMOS.md +8 -0
  5. biblicus-0.10.0/docs/PROFILING.md +98 -0
  6. {biblicus-0.9.0 → biblicus-0.10.0}/docs/conf.py +5 -8
  7. {biblicus-0.9.0 → biblicus-0.10.0}/docs/index.rst +1 -0
  8. {biblicus-0.9.0 → biblicus-0.10.0}/features/analysis_schema.feature +52 -0
  9. {biblicus-0.9.0 → biblicus-0.10.0}/features/environment.py +3 -5
  10. biblicus-0.10.0/features/profiling.feature +150 -0
  11. {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/analysis_steps.py +149 -9
  12. {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/cli_steps.py +13 -7
  13. {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/crawl_steps.py +6 -2
  14. {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/deepgram_steps.py +3 -11
  15. {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/docling_steps.py +2 -6
  16. {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/evidence_processing_steps.py +0 -1
  17. {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/extraction_run_lifecycle_steps.py +6 -2
  18. {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/extraction_steps.py +25 -6
  19. {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/inference_steps.py +12 -6
  20. {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/markitdown_steps.py +1 -3
  21. {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/openai_steps.py +3 -1
  22. {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/paddleocr_mock_steps.py +0 -1
  23. {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/paddleocr_vl_steps.py +17 -19
  24. {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/paddleocr_vl_unit_steps.py +10 -9
  25. biblicus-0.10.0/features/steps/profiling_steps.py +205 -0
  26. {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/requests_mock_steps.py +32 -13
  27. {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/topic_modeling_steps.py +7 -3
  28. {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/user_config_steps.py +6 -7
  29. {biblicus-0.9.0 → biblicus-0.10.0}/pyproject.toml +2 -1
  30. {biblicus-0.9.0 → biblicus-0.10.0}/scripts/download_ag_news.py +1 -2
  31. {biblicus-0.9.0 → biblicus-0.10.0}/scripts/download_audio_samples.py +9 -5
  32. {biblicus-0.9.0 → biblicus-0.10.0}/scripts/download_image_samples.py +0 -5
  33. {biblicus-0.9.0 → biblicus-0.10.0}/scripts/download_mixed_samples.py +0 -6
  34. {biblicus-0.9.0 → biblicus-0.10.0}/scripts/download_pdf_samples.py +0 -5
  35. {biblicus-0.9.0 → biblicus-0.10.0}/scripts/download_wikipedia.py +1 -5
  36. biblicus-0.10.0/scripts/profiling_demo.py +212 -0
  37. {biblicus-0.9.0 → biblicus-0.10.0}/scripts/readme_end_to_end_demo.py +0 -1
  38. {biblicus-0.9.0 → biblicus-0.10.0}/scripts/test.py +0 -4
  39. {biblicus-0.9.0 → biblicus-0.10.0}/scripts/topic_modeling_integration.py +15 -10
  40. {biblicus-0.9.0 → biblicus-0.10.0}/scripts/wikipedia_rag_demo.py +3 -8
  41. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/__init__.py +1 -1
  42. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/_vendor/dotyaml/__init__.py +0 -1
  43. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/_vendor/dotyaml/interpolation.py +0 -1
  44. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/_vendor/dotyaml/loader.py +0 -1
  45. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/_vendor/dotyaml/transformer.py +0 -1
  46. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/analysis/__init__.py +2 -0
  47. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/analysis/models.py +228 -5
  48. biblicus-0.10.0/src/biblicus/analysis/profiling.py +337 -0
  49. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/analysis/topic_modeling.py +3 -6
  50. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/backends/sqlite_full_text_search.py +2 -4
  51. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/cli.py +83 -4
  52. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/corpus.py +9 -3
  53. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/evidence_processing.py +4 -2
  54. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/extraction.py +3 -1
  55. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/extractors/markitdown_text.py +1 -0
  56. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/extractors/paddleocr_vl_text.py +1 -3
  57. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/user_config.py +2 -6
  58. {biblicus-0.9.0 → biblicus-0.10.0/src/biblicus.egg-info}/PKG-INFO +7 -6
  59. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus.egg-info/SOURCES.txt +5 -0
  60. {biblicus-0.9.0 → biblicus-0.10.0}/LICENSE +0 -0
  61. {biblicus-0.9.0 → biblicus-0.10.0}/MANIFEST.in +0 -0
  62. {biblicus-0.9.0 → biblicus-0.10.0}/THIRD_PARTY_NOTICES.md +0 -0
  63. {biblicus-0.9.0 → biblicus-0.10.0}/datasets/wikipedia_mini.json +0 -0
  64. {biblicus-0.9.0 → biblicus-0.10.0}/docs/ARCHITECTURE.md +0 -0
  65. {biblicus-0.9.0 → biblicus-0.10.0}/docs/BACKENDS.md +0 -0
  66. {biblicus-0.9.0 → biblicus-0.10.0}/docs/CONTEXT_PACK.md +0 -0
  67. {biblicus-0.9.0 → biblicus-0.10.0}/docs/CORPUS.md +0 -0
  68. {biblicus-0.9.0 → biblicus-0.10.0}/docs/CORPUS_DESIGN.md +0 -0
  69. {biblicus-0.9.0 → biblicus-0.10.0}/docs/EXTRACTION.md +0 -0
  70. {biblicus-0.9.0 → biblicus-0.10.0}/docs/FEATURE_INDEX.md +0 -0
  71. {biblicus-0.9.0 → biblicus-0.10.0}/docs/KNOWLEDGE_BASE.md +0 -0
  72. {biblicus-0.9.0 → biblicus-0.10.0}/docs/ROADMAP.md +0 -0
  73. {biblicus-0.9.0 → biblicus-0.10.0}/docs/STT.md +0 -0
  74. {biblicus-0.9.0 → biblicus-0.10.0}/docs/TESTING.md +0 -0
  75. {biblicus-0.9.0 → biblicus-0.10.0}/docs/TOPIC_MODELING.md +0 -0
  76. {biblicus-0.9.0 → biblicus-0.10.0}/docs/USER_CONFIGURATION.md +0 -0
  77. {biblicus-0.9.0 → biblicus-0.10.0}/docs/api.rst +0 -0
  78. {biblicus-0.9.0 → biblicus-0.10.0}/docs/backends/index.md +0 -0
  79. {biblicus-0.9.0 → biblicus-0.10.0}/docs/backends/scan.md +0 -0
  80. {biblicus-0.9.0 → biblicus-0.10.0}/docs/backends/sqlite-full-text-search.md +0 -0
  81. {biblicus-0.9.0 → biblicus-0.10.0}/docs/extractors/index.md +0 -0
  82. {biblicus-0.9.0 → biblicus-0.10.0}/docs/extractors/ocr/index.md +0 -0
  83. {biblicus-0.9.0 → biblicus-0.10.0}/docs/extractors/ocr/paddleocr-vl.md +0 -0
  84. {biblicus-0.9.0 → biblicus-0.10.0}/docs/extractors/ocr/rapidocr.md +0 -0
  85. {biblicus-0.9.0 → biblicus-0.10.0}/docs/extractors/pipeline-utilities/index.md +0 -0
  86. {biblicus-0.9.0 → biblicus-0.10.0}/docs/extractors/pipeline-utilities/pipeline.md +0 -0
  87. {biblicus-0.9.0 → biblicus-0.10.0}/docs/extractors/pipeline-utilities/select-longest.md +0 -0
  88. {biblicus-0.9.0 → biblicus-0.10.0}/docs/extractors/pipeline-utilities/select-override.md +0 -0
  89. {biblicus-0.9.0 → biblicus-0.10.0}/docs/extractors/pipeline-utilities/select-smart-override.md +0 -0
  90. {biblicus-0.9.0 → biblicus-0.10.0}/docs/extractors/pipeline-utilities/select-text.md +0 -0
  91. {biblicus-0.9.0 → biblicus-0.10.0}/docs/extractors/speech-to-text/deepgram.md +0 -0
  92. {biblicus-0.9.0 → biblicus-0.10.0}/docs/extractors/speech-to-text/index.md +0 -0
  93. {biblicus-0.9.0 → biblicus-0.10.0}/docs/extractors/speech-to-text/openai.md +0 -0
  94. {biblicus-0.9.0 → biblicus-0.10.0}/docs/extractors/text-document/index.md +0 -0
  95. {biblicus-0.9.0 → biblicus-0.10.0}/docs/extractors/text-document/markitdown.md +0 -0
  96. {biblicus-0.9.0 → biblicus-0.10.0}/docs/extractors/text-document/metadata.md +0 -0
  97. {biblicus-0.9.0 → biblicus-0.10.0}/docs/extractors/text-document/pass-through.md +0 -0
  98. {biblicus-0.9.0 → biblicus-0.10.0}/docs/extractors/text-document/pdf.md +0 -0
  99. {biblicus-0.9.0 → biblicus-0.10.0}/docs/extractors/text-document/unstructured.md +0 -0
  100. {biblicus-0.9.0 → biblicus-0.10.0}/docs/extractors/vlm-document/docling-granite.md +0 -0
  101. {biblicus-0.9.0 → biblicus-0.10.0}/docs/extractors/vlm-document/docling-smol.md +0 -0
  102. {biblicus-0.9.0 → biblicus-0.10.0}/docs/extractors/vlm-document/index.md +0 -0
  103. {biblicus-0.9.0 → biblicus-0.10.0}/features/backend_validation.feature +0 -0
  104. {biblicus-0.9.0 → biblicus-0.10.0}/features/biblicus_corpus.feature +0 -0
  105. {biblicus-0.9.0 → biblicus-0.10.0}/features/cli_entrypoint.feature +0 -0
  106. {biblicus-0.9.0 → biblicus-0.10.0}/features/cli_parsing.feature +0 -0
  107. {biblicus-0.9.0 → biblicus-0.10.0}/features/cli_step_spec_parsing.feature +0 -0
  108. {biblicus-0.9.0 → biblicus-0.10.0}/features/content_sniffing.feature +0 -0
  109. {biblicus-0.9.0 → biblicus-0.10.0}/features/context_pack.feature +0 -0
  110. {biblicus-0.9.0 → biblicus-0.10.0}/features/context_pack_cli.feature +0 -0
  111. {biblicus-0.9.0 → biblicus-0.10.0}/features/corpus_edge_cases.feature +0 -0
  112. {biblicus-0.9.0 → biblicus-0.10.0}/features/corpus_identity.feature +0 -0
  113. {biblicus-0.9.0 → biblicus-0.10.0}/features/corpus_purge.feature +0 -0
  114. {biblicus-0.9.0 → biblicus-0.10.0}/features/crawl.feature +0 -0
  115. {biblicus-0.9.0 → biblicus-0.10.0}/features/docling_granite_extractor.feature +0 -0
  116. {biblicus-0.9.0 → biblicus-0.10.0}/features/docling_smol_extractor.feature +0 -0
  117. {biblicus-0.9.0 → biblicus-0.10.0}/features/error_cases.feature +0 -0
  118. {biblicus-0.9.0 → biblicus-0.10.0}/features/evaluation.feature +0 -0
  119. {biblicus-0.9.0 → biblicus-0.10.0}/features/evidence_processing.feature +0 -0
  120. {biblicus-0.9.0 → biblicus-0.10.0}/features/extraction_error_handling.feature +0 -0
  121. {biblicus-0.9.0 → biblicus-0.10.0}/features/extraction_run_lifecycle.feature +0 -0
  122. {biblicus-0.9.0 → biblicus-0.10.0}/features/extraction_selection.feature +0 -0
  123. {biblicus-0.9.0 → biblicus-0.10.0}/features/extraction_selection_longest.feature +0 -0
  124. {biblicus-0.9.0 → biblicus-0.10.0}/features/extractor_pipeline.feature +0 -0
  125. {biblicus-0.9.0 → biblicus-0.10.0}/features/extractor_validation.feature +0 -0
  126. {biblicus-0.9.0 → biblicus-0.10.0}/features/frontmatter.feature +0 -0
  127. {biblicus-0.9.0 → biblicus-0.10.0}/features/hook_config_validation.feature +0 -0
  128. {biblicus-0.9.0 → biblicus-0.10.0}/features/hook_error_handling.feature +0 -0
  129. {biblicus-0.9.0 → biblicus-0.10.0}/features/import_tree.feature +0 -0
  130. {biblicus-0.9.0 → biblicus-0.10.0}/features/inference_backend.feature +0 -0
  131. {biblicus-0.9.0 → biblicus-0.10.0}/features/ingest_sources.feature +0 -0
  132. {biblicus-0.9.0 → biblicus-0.10.0}/features/integration_audio_samples.feature +0 -0
  133. {biblicus-0.9.0 → biblicus-0.10.0}/features/integration_image_samples.feature +0 -0
  134. {biblicus-0.9.0 → biblicus-0.10.0}/features/integration_mixed_corpus.feature +0 -0
  135. {biblicus-0.9.0 → biblicus-0.10.0}/features/integration_mixed_extraction.feature +0 -0
  136. {biblicus-0.9.0 → biblicus-0.10.0}/features/integration_ocr_image_extraction.feature +0 -0
  137. {biblicus-0.9.0 → biblicus-0.10.0}/features/integration_pdf_retrieval.feature +0 -0
  138. {biblicus-0.9.0 → biblicus-0.10.0}/features/integration_pdf_samples.feature +0 -0
  139. {biblicus-0.9.0 → biblicus-0.10.0}/features/integration_unstructured_extraction.feature +0 -0
  140. {biblicus-0.9.0 → biblicus-0.10.0}/features/integration_wikipedia.feature +0 -0
  141. {biblicus-0.9.0 → biblicus-0.10.0}/features/knowledge_base.feature +0 -0
  142. {biblicus-0.9.0 → biblicus-0.10.0}/features/lifecycle_hooks.feature +0 -0
  143. {biblicus-0.9.0 → biblicus-0.10.0}/features/markitdown_extractor.feature +0 -0
  144. {biblicus-0.9.0 → biblicus-0.10.0}/features/model_validation.feature +0 -0
  145. {biblicus-0.9.0 → biblicus-0.10.0}/features/ocr_extractor.feature +0 -0
  146. {biblicus-0.9.0 → biblicus-0.10.0}/features/paddleocr_vl_extractor.feature +0 -0
  147. {biblicus-0.9.0 → biblicus-0.10.0}/features/paddleocr_vl_parse_api_response.feature +0 -0
  148. {biblicus-0.9.0 → biblicus-0.10.0}/features/pdf_text_extraction.feature +0 -0
  149. {biblicus-0.9.0 → biblicus-0.10.0}/features/python_api.feature +0 -0
  150. {biblicus-0.9.0 → biblicus-0.10.0}/features/python_hook_logging.feature +0 -0
  151. {biblicus-0.9.0 → biblicus-0.10.0}/features/query_processing.feature +0 -0
  152. {biblicus-0.9.0 → biblicus-0.10.0}/features/recipe_file_extraction.feature +0 -0
  153. {biblicus-0.9.0 → biblicus-0.10.0}/features/retrieval_budget.feature +0 -0
  154. {biblicus-0.9.0 → biblicus-0.10.0}/features/retrieval_scan.feature +0 -0
  155. {biblicus-0.9.0 → biblicus-0.10.0}/features/retrieval_sqlite_full_text_search.feature +0 -0
  156. {biblicus-0.9.0 → biblicus-0.10.0}/features/retrieval_uses_extraction_run.feature +0 -0
  157. {biblicus-0.9.0 → biblicus-0.10.0}/features/retrieval_utilities.feature +0 -0
  158. {biblicus-0.9.0 → biblicus-0.10.0}/features/select_override.feature +0 -0
  159. {biblicus-0.9.0 → biblicus-0.10.0}/features/smart_override_selection.feature +0 -0
  160. {biblicus-0.9.0 → biblicus-0.10.0}/features/source_loading.feature +0 -0
  161. {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/backend_steps.py +0 -0
  162. {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/cli_parsing_steps.py +0 -0
  163. {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/context_pack_steps.py +0 -0
  164. {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/extractor_steps.py +1 -1
  165. {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/frontmatter_steps.py +0 -0
  166. {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/knowledge_base_steps.py +0 -0
  167. {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/model_steps.py +0 -0
  168. {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/pdf_steps.py +0 -0
  169. {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/python_api_steps.py +1 -1
  170. {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/rapidocr_steps.py +0 -0
  171. {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/retrieval_steps.py +0 -0
  172. {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/stt_deepgram_steps.py +0 -0
  173. {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/stt_steps.py +0 -0
  174. {biblicus-0.9.0 → biblicus-0.10.0}/features/steps/unstructured_steps.py +0 -0
  175. {biblicus-0.9.0 → biblicus-0.10.0}/features/streaming_ingest.feature +0 -0
  176. {biblicus-0.9.0 → biblicus-0.10.0}/features/stt_deepgram_extractor.feature +0 -0
  177. {biblicus-0.9.0 → biblicus-0.10.0}/features/stt_extractor.feature +0 -0
  178. {biblicus-0.9.0 → biblicus-0.10.0}/features/text_extraction_runs.feature +0 -0
  179. {biblicus-0.9.0 → biblicus-0.10.0}/features/token_budget.feature +0 -0
  180. {biblicus-0.9.0 → biblicus-0.10.0}/features/topic_modeling.feature +0 -0
  181. {biblicus-0.9.0 → biblicus-0.10.0}/features/unstructured_extractor.feature +0 -0
  182. {biblicus-0.9.0 → biblicus-0.10.0}/features/user_config.feature +0 -0
  183. {biblicus-0.9.0 → biblicus-0.10.0}/setup.cfg +0 -0
  184. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/__main__.py +0 -0
  185. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/analysis/base.py +0 -0
  186. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/analysis/llm.py +0 -0
  187. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/analysis/schema.py +0 -0
  188. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/backends/__init__.py +0 -0
  189. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/backends/base.py +0 -0
  190. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/backends/scan.py +0 -0
  191. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/constants.py +0 -0
  192. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/context.py +0 -0
  193. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/crawl.py +0 -0
  194. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/errors.py +0 -0
  195. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/evaluation.py +0 -0
  196. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/extractors/__init__.py +0 -0
  197. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/extractors/base.py +0 -0
  198. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/extractors/deepgram_stt.py +0 -0
  199. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/extractors/docling_granite_text.py +0 -0
  200. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/extractors/docling_smol_text.py +0 -0
  201. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/extractors/metadata_text.py +0 -0
  202. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/extractors/openai_stt.py +0 -0
  203. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/extractors/pass_through_text.py +0 -0
  204. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/extractors/pdf_text.py +0 -0
  205. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/extractors/pipeline.py +0 -0
  206. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/extractors/rapidocr_text.py +0 -0
  207. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/extractors/select_longest_text.py +0 -0
  208. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/extractors/select_override.py +0 -0
  209. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/extractors/select_smart_override.py +0 -0
  210. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/extractors/select_text.py +0 -0
  211. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/extractors/unstructured_text.py +0 -0
  212. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/frontmatter.py +0 -0
  213. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/hook_logging.py +0 -0
  214. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/hook_manager.py +0 -0
  215. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/hooks.py +0 -0
  216. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/ignore.py +0 -0
  217. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/inference.py +0 -0
  218. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/knowledge_base.py +0 -0
  219. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/models.py +0 -0
  220. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/retrieval.py +0 -0
  221. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/sources.py +0 -0
  222. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/time.py +0 -0
  223. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus/uris.py +0 -0
  224. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus.egg-info/dependency_links.txt +0 -0
  225. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus.egg-info/entry_points.txt +0 -0
  226. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus.egg-info/requires.txt +0 -0
  227. {biblicus-0.9.0 → biblicus-0.10.0}/src/biblicus.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: biblicus
3
- Version: 0.9.0
3
+ Version: 0.10.0
4
4
  Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
5
5
  License: MIT
6
6
  Requires-Python: >=3.9
@@ -531,12 +531,13 @@ For detailed documentation on all extractors, see the [Extractor Reference][extr
531
531
 
532
532
  ## Topic modeling analysis
533
533
 
534
- Biblicus can run analysis pipelines on extracted text without changing the raw corpus. Topic modeling is the first
535
- analysis backend. It reads an extraction run, optionally applies an LLM-driven extraction pass, applies lexical
536
- processing, runs BERTopic, and optionally applies an LLM fine-tuning pass to label topics. The output is structured
537
- JavaScript Object Notation.
534
+ Biblicus can run analysis pipelines on extracted text without changing the raw corpus. Profiling and topic modeling
535
+ are the first analysis backends. Profiling summarizes corpus composition and extraction coverage. Topic modeling reads
536
+ an extraction run, optionally applies an LLM-driven extraction pass, applies lexical processing, runs BERTopic, and
537
+ optionally applies an LLM fine-tuning pass to label topics. The output is structured JavaScript Object Notation.
538
538
 
539
- See `docs/ANALYSIS.md` for the analysis pipeline overview and `docs/TOPIC_MODELING.md` for topic modeling details.
539
+ See `docs/ANALYSIS.md` for the analysis pipeline overview, `docs/PROFILING.md` for profiling, and
540
+ `docs/TOPIC_MODELING.md` for topic modeling details.
540
541
 
541
542
  Run a topic analysis using a recipe file:
542
543
 
@@ -485,12 +485,13 @@ For detailed documentation on all extractors, see the [Extractor Reference][extr
485
485
 
486
486
  ## Topic modeling analysis
487
487
 
488
- Biblicus can run analysis pipelines on extracted text without changing the raw corpus. Topic modeling is the first
489
- analysis backend. It reads an extraction run, optionally applies an LLM-driven extraction pass, applies lexical
490
- processing, runs BERTopic, and optionally applies an LLM fine-tuning pass to label topics. The output is structured
491
- JavaScript Object Notation.
488
+ Biblicus can run analysis pipelines on extracted text without changing the raw corpus. Profiling and topic modeling
489
+ are the first analysis backends. Profiling summarizes corpus composition and extraction coverage. Topic modeling reads
490
+ an extraction run, optionally applies an LLM-driven extraction pass, applies lexical processing, runs BERTopic, and
491
+ optionally applies an LLM fine-tuning pass to label topics. The output is structured JavaScript Object Notation.
492
492
 
493
- See `docs/ANALYSIS.md` for the analysis pipeline overview and `docs/TOPIC_MODELING.md` for topic modeling details.
493
+ See `docs/ANALYSIS.md` for the analysis pipeline overview, `docs/PROFILING.md` for profiling, and
494
+ `docs/TOPIC_MODELING.md` for topic modeling details.
494
495
 
495
496
  Run a topic analysis using a recipe file:
496
497
 
@@ -34,3 +34,14 @@ python3 scripts/topic_modeling_integration.py --corpus corpora/ag_news_demo --fo
34
34
 
35
35
  The command prints the analysis run identifier and the output path. Open the resulting `output.json` to inspect per-topic
36
36
  labels, keywords, and document examples.
37
+
38
+ ## Profiling analysis
39
+
40
+ Profiling is the baseline analysis backend. It summarizes corpus composition and extraction coverage using
41
+ deterministic counts and distribution metrics. See `docs/PROFILING.md` for the full reference and working demo.
42
+
43
+ Run profiling from the CLI:
44
+
45
+ ```
46
+ biblicus analyze profile --corpus corpora/example --extraction-run pipeline:RUN_ID
47
+ ```
@@ -214,6 +214,14 @@ python3 scripts/topic_modeling_integration.py \
214
214
  The command prints the analysis run identifier and the output path. Open the `output.json` file to inspect per-topic labels,
215
215
  keywords, and document examples.
216
216
 
217
+ ### Profiling analysis demo
218
+
219
+ The profiling demo downloads AG News, runs extraction, and produces a profiling report.
220
+
221
+ ```
222
+ python3 scripts/profiling_demo.py --corpus corpora/profiling_demo --force
223
+ ```
224
+
217
225
  ### Select extracted text within a pipeline
218
226
 
219
227
  When you want an explicit choice among multiple extraction outputs, add a selection extractor step at the end of the pipeline.
@@ -0,0 +1,98 @@
1
+ # Corpus profiling analysis
2
+
3
+ Biblicus provides a profiling analysis backend that summarizes corpus contents using deterministic counts and
4
+ coverage metrics. Profiling is intended as a fast, local baseline before heavier analysis such as topic modeling.
5
+
6
+ ## What profiling does
7
+
8
+ The profiling analysis reports:
9
+
10
+ - Total item count and media type distribution
11
+ - Extracted text coverage (present, empty, missing)
12
+ - Size and length distributions with percentiles
13
+ - Tag coverage and top tags
14
+
15
+ The output is structured JSON that can be stored, versioned, and compared across runs.
16
+
17
+ ## Run profiling from the CLI
18
+
19
+ ```
20
+ biblicus analyze profile --corpus corpora/example --extraction-run pipeline:RUN_ID
21
+ ```
22
+
23
+ If you omit `--extraction-run`, Biblicus uses the latest extraction run and emits a reproducibility warning.
24
+
25
+ To customize profiling metrics, pass a recipe file:
26
+
27
+ ```
28
+ biblicus analyze profile --corpus corpora/example --recipe recipes/profiling.yml --extraction-run pipeline:RUN_ID
29
+ ```
30
+
31
+ ### Profiling recipe configuration
32
+
33
+ Profiling recipes use the analysis schema version and accept these fields:
34
+
35
+ - `schema_version`: analysis schema version, currently `1`
36
+ - `sample_size`: optional cap for distribution calculations
37
+ - `min_text_characters`: minimum extracted text length for inclusion
38
+ - `percentiles`: percentiles to compute for size and length distributions
39
+ - `top_tag_count`: maximum number of tags to list in `top_tags`
40
+ - `tag_filters`: optional list of tags to include in tag coverage metrics
41
+
42
+ Example recipe:
43
+
44
+ ```
45
+ schema_version: 1
46
+ sample_size: 500
47
+ min_text_characters: 50
48
+ percentiles: [50, 90, 99]
49
+ top_tag_count: 10
50
+ tag_filters: ["ag_news", "label:World"]
51
+ ```
52
+
53
+ ## Run profiling from Python
54
+
55
+ ```
56
+ from pathlib import Path
57
+
58
+ from biblicus.analysis import get_analysis_backend
59
+ from biblicus.corpus import Corpus
60
+ from biblicus.models import ExtractionRunReference
61
+
62
+ corpus = Corpus.open(Path("corpora/example"))
63
+ backend = get_analysis_backend("profiling")
64
+ output = backend.run_analysis(
65
+ corpus,
66
+ recipe_name="default",
67
+ config={
68
+ "schema_version": 1,
69
+ "sample_size": 500,
70
+ "min_text_characters": 50,
71
+ "percentiles": [50, 90, 99],
72
+ "top_tag_count": 10,
73
+ "tag_filters": ["ag_news"],
74
+ },
75
+ extraction_run=ExtractionRunReference(
76
+ extractor_id="pipeline",
77
+ run_id="RUN_ID",
78
+ ),
79
+ )
80
+ print(output.model_dump())
81
+ ```
82
+
83
+ ## Output location
84
+
85
+ Profiling output is stored under:
86
+
87
+ ```
88
+ .biblicus/runs/analysis/profiling/<run_id>/output.json
89
+ ```
90
+
91
+ ## Working demo
92
+
93
+ A runnable demo is provided in `scripts/profiling_demo.py`. It downloads a corpus, runs extraction, and executes the
94
+ profiling analysis so you can inspect the output:
95
+
96
+ ```
97
+ python3 scripts/profiling_demo.py --corpus corpora/profiling_demo --force
98
+ ```
@@ -4,8 +4,13 @@ Sphinx configuration for Biblicus documentation.
4
4
 
5
5
  from __future__ import annotations
6
6
 
7
+ import os
8
+ import sys
7
9
  from pathlib import Path
8
10
 
11
+ from pygments.lexers.special import TextLexer
12
+ from sphinx.highlighting import lexers
13
+
9
14
  PROJECT_ROOT = Path(__file__).resolve().parent.parent
10
15
  SOURCE_ROOT = PROJECT_ROOT / "src"
11
16
 
@@ -31,8 +36,6 @@ html_theme_options = {
31
36
  }
32
37
 
33
38
  # ReadTheDocs integration - canonical URL for SEO
34
- import os
35
-
36
39
  if os.environ.get("READTHEDOCS"):
37
40
  rtd_version = os.environ.get("READTHEDOCS_VERSION", "latest")
38
41
  rtd_project = os.environ.get("READTHEDOCS_PROJECT", "biblicus")
@@ -44,12 +47,6 @@ source_suffix = {
44
47
  }
45
48
 
46
49
  suppress_warnings = ["misc.highlighting_failure"]
47
-
48
- import sys
49
-
50
50
  sys.path.insert(0, str(SOURCE_ROOT))
51
51
 
52
- from pygments.lexers.special import TextLexer
53
- from sphinx.highlighting import lexers
54
-
55
52
  lexers["mermaid"] = TextLexer()
@@ -17,6 +17,7 @@ Contents
17
17
  backends/index
18
18
  CONTEXT_PACK
19
19
  ANALYSIS
20
+ PROFILING
20
21
  TOPIC_MODELING
21
22
  DEMOS
22
23
  USER_CONFIGURATION
@@ -56,3 +56,55 @@ Feature: Analysis schema validation
56
56
  When I attempt to validate a vectorizer config with stop words "spanish"
57
57
  Then a model validation error is raised
58
58
  And the validation error mentions "vectorizer.stop_words must be"
59
+
60
+ Scenario: Profiling config rejects invalid sample size
61
+ When I attempt to validate a profiling config with sample size 0
62
+ Then a model validation error is raised
63
+ And the validation error mentions "sample_size"
64
+
65
+ Scenario: Profiling config rejects unsupported schema version
66
+ When I attempt to validate a profiling config with schema version 2
67
+ Then a model validation error is raised
68
+ And the validation error mentions "Unsupported analysis schema version"
69
+
70
+ Scenario: Profiling config rejects invalid percentiles
71
+ When I attempt to validate a profiling config with percentiles "0,101"
72
+ Then a model validation error is raised
73
+ And the validation error mentions "percentiles"
74
+
75
+ Scenario: Profiling config rejects empty percentiles
76
+ When I attempt to validate a profiling config with empty percentiles
77
+ Then a model validation error is raised
78
+ And the validation error mentions "percentiles"
79
+
80
+ Scenario: Profiling config rejects unsorted percentiles
81
+ When I attempt to validate a profiling config with percentiles "90,50"
82
+ Then a model validation error is raised
83
+ And the validation error mentions "percentiles"
84
+
85
+ Scenario: Profiling config rejects empty tag filters
86
+ When I attempt to validate a profiling config with tag filters "alpha,,beta"
87
+ Then a model validation error is raised
88
+ And the validation error mentions "tag_filters"
89
+
90
+ Scenario: Profiling config rejects non-list tag filters
91
+ When I attempt to validate a profiling config with tag filters string "alpha"
92
+ Then a model validation error is raised
93
+ And the validation error mentions "tag_filters"
94
+
95
+ Scenario: Profiling config accepts tag filters None
96
+ When I validate a profiling config with tag filters None
97
+ Then the profiling tag filters are absent
98
+
99
+ Scenario: Profiling config normalizes tag filters
100
+ When I validate a profiling config with tag filters list " alpha ,beta "
101
+ Then the profiling tag filters include "alpha"
102
+ And the profiling tag filters include "beta"
103
+
104
+ Scenario: Profiling ordering helper ignores missing items
105
+ When I order catalog items with missing entries
106
+ Then the ordered catalog item identifiers equal "a,c,b"
107
+
108
+ Scenario: Profiling percentile helper handles empty values
109
+ When I compute a profiling percentile on empty values
110
+ Then the profiling percentile value equals 0
@@ -17,7 +17,6 @@ def _repo_root() -> Path:
17
17
  :return: Repository root path.
18
18
  :rtype: Path
19
19
  """
20
-
21
20
  return Path(__file__).resolve().parent.parent
22
21
 
23
22
 
@@ -32,7 +31,6 @@ def before_scenario(context, scenario) -> None:
32
31
  :return: None.
33
32
  :rtype: None
34
33
  """
35
-
36
34
  import biblicus.__main__ as _biblicus_main
37
35
 
38
36
  _ = _biblicus_main
@@ -74,7 +72,6 @@ def after_scenario(context, scenario) -> None:
74
72
  :return: None.
75
73
  :rtype: None
76
74
  """
77
-
78
75
  if getattr(context, "httpd", None) is not None:
79
76
  context.httpd.shutdown()
80
77
  context.httpd.server_close()
@@ -221,7 +218,9 @@ def after_scenario(context, scenario) -> None:
221
218
  context.fake_paddleocr_vl_behaviors.clear()
222
219
  if getattr(context, "_fake_paddleocr_installed", False):
223
220
  # Remove all paddle-related modules
224
- paddle_module_names = [name for name in list(sys.modules.keys()) if "paddle" in name.lower()]
221
+ paddle_module_names = [
222
+ name for name in list(sys.modules.keys()) if "paddle" in name.lower()
223
+ ]
225
224
  for name in paddle_module_names:
226
225
  sys.modules.pop(name, None)
227
226
  # Restore original modules
@@ -345,7 +344,6 @@ def run_biblicus(
345
344
  :return: Captured execution result.
346
345
  :rtype: RunResult
347
346
  """
348
-
349
347
  import contextlib
350
348
  import io
351
349
 
@@ -0,0 +1,150 @@
1
+ Feature: Profiling analysis
2
+ Profiling analysis summarizes raw corpus composition and extracted text coverage.
3
+
4
+ Scenario: Profiling analysis reports raw and extracted counts
5
+ Given I initialized a corpus at "corpus"
6
+ And a binary file "blob.bin" exists
7
+ When I ingest the text "Alpha note" with title "Alpha" and tags "t" into corpus "corpus"
8
+ And I ingest the file "blob.bin" into corpus "corpus"
9
+ And I build a "pipeline" extraction run in corpus "corpus" with steps:
10
+ | extractor_id | config_json |
11
+ | pass-through-text | {} |
12
+ And I run a profiling analysis in corpus "corpus" using the latest extraction run
13
+ Then the profiling output includes raw item total 2
14
+ And the profiling output includes media type count "text/markdown" 1
15
+ And the profiling output includes media type count "application/octet-stream" 1
16
+ And the profiling output includes raw bytes distribution count 2
17
+ And the profiling output includes raw bytes percentiles 50,90,99
18
+ And the profiling output includes tagged items 1
19
+ And the profiling output includes untagged items 1
20
+ And the profiling output includes top tag "t" with count 1
21
+ And the profiling output includes extracted source items 2
22
+ And the profiling output includes extracted nonempty items 1
23
+ And the profiling output includes extracted empty items 0
24
+ And the profiling output includes extracted missing items 1
25
+ And the profiling output includes extracted text distribution count 1
26
+ And the profiling output includes extracted text percentiles 50,90,99
27
+
28
+ Scenario: Profiling analysis uses the latest extraction run when omitted
29
+ Given I initialized a corpus at "corpus"
30
+ And a binary file "blob.bin" exists
31
+ When I ingest the text "Alpha note" with title "Alpha" and tags "t" into corpus "corpus"
32
+ And I ingest the file "blob.bin" into corpus "corpus"
33
+ And I build a "pipeline" extraction run in corpus "corpus" with steps:
34
+ | extractor_id | config_json |
35
+ | pass-through-text | {} |
36
+ And I run a profiling analysis in corpus "corpus"
37
+ Then the command succeeds
38
+ And standard error includes "latest extraction run"
39
+
40
+ Scenario: Profiling analysis accepts a recipe file
41
+ Given I initialized a corpus at "corpus"
42
+ And a binary file "blob.bin" exists
43
+ When I ingest the text "Alpha note" with title "Alpha" and tags "t" into corpus "corpus"
44
+ And I ingest the file "blob.bin" into corpus "corpus"
45
+ And I build a "pipeline" extraction run in corpus "corpus" with steps:
46
+ | extractor_id | config_json |
47
+ | pass-through-text | {} |
48
+ And I create a profiling recipe file "profiling_recipe.yml" with:
49
+ """
50
+ schema_version: 1
51
+ sample_size: 1
52
+ percentiles: [50]
53
+ top_tag_count: 1
54
+ """
55
+ And I run a profiling analysis in corpus "corpus" using recipe "profiling_recipe.yml" and the latest extraction run
56
+ Then the profiling output includes raw bytes distribution count 1
57
+ And the profiling output includes raw bytes percentiles 50
58
+ And the profiling output includes top tag "t" with count 1
59
+
60
+ Scenario: Profiling analysis reports empty corpus distributions
61
+ Given I initialized a corpus at "corpus"
62
+ When I build a "pipeline" extraction run in corpus "corpus" with steps:
63
+ | extractor_id | config_json |
64
+ | pass-through-text | {} |
65
+ And I run a profiling analysis in corpus "corpus" using the latest extraction run
66
+ Then the profiling output includes raw item total 0
67
+ And the profiling output includes raw bytes distribution count 0
68
+ And the profiling output includes extracted source items 0
69
+ And the profiling output includes extracted text distribution count 0
70
+
71
+ Scenario: Profiling analysis counts empty extracted text
72
+ Given I initialized a corpus at "corpus"
73
+ When I ingest the text " " with title "Blank" and tags "t" into corpus "corpus"
74
+ And I build a "pipeline" extraction run in corpus "corpus" with steps:
75
+ | extractor_id | config_json |
76
+ | pass-through-text | {} |
77
+ And I run a profiling analysis in corpus "corpus" using the latest extraction run
78
+ Then the profiling output includes extracted nonempty items 0
79
+ And the profiling output includes extracted empty items 1
80
+
81
+ Scenario: Profiling analysis respects minimum text length
82
+ Given I initialized a corpus at "corpus"
83
+ When I ingest the text "short" with title "Short" and tags "t" into corpus "corpus"
84
+ And I build a "pipeline" extraction run in corpus "corpus" with steps:
85
+ | extractor_id | config_json |
86
+ | pass-through-text | {} |
87
+ And I create a profiling recipe file "profiling_min_text.yml" with:
88
+ """
89
+ schema_version: 1
90
+ min_text_characters: 10
91
+ """
92
+ And I run a profiling analysis in corpus "corpus" using recipe "profiling_min_text.yml" and the latest extraction run
93
+ Then the profiling output includes extracted nonempty items 0
94
+ And the profiling output includes extracted empty items 1
95
+
96
+ Scenario: Profiling analysis applies tag filters
97
+ Given I initialized a corpus at "corpus"
98
+ When I ingest the text "Alpha note" with title "Alpha" and tags "t" into corpus "corpus"
99
+ And I ingest the text "Beta note" with title "Beta" and tags "other" into corpus "corpus"
100
+ And I build a "pipeline" extraction run in corpus "corpus" with steps:
101
+ | extractor_id | config_json |
102
+ | pass-through-text | {} |
103
+ And I create a profiling recipe file "profiling_tags.yml" with:
104
+ """
105
+ schema_version: 1
106
+ tag_filters: ["t"]
107
+ """
108
+ And I run a profiling analysis in corpus "corpus" using recipe "profiling_tags.yml" and the latest extraction run
109
+ Then the profiling output includes top tag "t" with count 1
110
+ And the profiling output includes tagged items 1
111
+ And the profiling output includes untagged items 1
112
+
113
+ Scenario: Profiling analysis rejects missing recipe file
114
+ Given I initialized a corpus at "corpus"
115
+ When I run a profiling analysis in corpus "corpus" using recipe "missing.yml" without extraction run
116
+ Then the command fails with exit code 2
117
+ And standard error includes "Recipe file not found"
118
+
119
+ Scenario: Profiling analysis rejects non-mapping recipe
120
+ Given I initialized a corpus at "corpus"
121
+ When I create a profiling recipe file "profiling_invalid.yml" with:
122
+ """
123
+ - not
124
+ - a
125
+ - mapping
126
+ """
127
+ And I run a profiling analysis in corpus "corpus" using recipe "profiling_invalid.yml" without extraction run
128
+ Then the command fails with exit code 2
129
+ And standard error includes "Profiling recipe must be a mapping/object"
130
+
131
+ Scenario: Profiling analysis rejects invalid recipe values
132
+ Given I initialized a corpus at "corpus"
133
+ When I ingest the text "Alpha note" with title "Alpha" and tags "t" into corpus "corpus"
134
+ And I build a "pipeline" extraction run in corpus "corpus" with steps:
135
+ | extractor_id | config_json |
136
+ | pass-through-text | {} |
137
+ And I create a profiling recipe file "profiling_invalid_values.yml" with:
138
+ """
139
+ schema_version: 1
140
+ percentiles: ["bad"]
141
+ """
142
+ And I run a profiling analysis in corpus "corpus" using recipe "profiling_invalid_values.yml" and the latest extraction run
143
+ Then the command fails with exit code 2
144
+ And standard error includes "Invalid profiling recipe"
145
+
146
+ Scenario: Profiling analysis requires extraction run
147
+ Given I initialized a corpus at "corpus"
148
+ When I run a profiling analysis in corpus "corpus"
149
+ Then the command fails with exit code 2
150
+ And standard error includes "Profiling analysis requires an extraction run"
@@ -9,23 +9,25 @@ from biblicus.analysis import get_analysis_backend
9
9
  from biblicus.analysis.base import CorpusAnalysisBackend
10
10
  from biblicus.analysis.llm import LlmClientConfig, LlmProvider
11
11
  from biblicus.analysis.models import (
12
+ ProfilingRecipeConfig,
13
+ TopicModelingKeyword,
14
+ TopicModelingLabelSource,
12
15
  TopicModelingLlmExtractionConfig,
13
16
  TopicModelingLlmExtractionMethod,
14
17
  TopicModelingLlmFineTuningConfig,
15
- TopicModelingKeyword,
16
- TopicModelingLabelSource,
17
18
  TopicModelingTopic,
18
19
  TopicModelingVectorizerConfig,
19
20
  )
21
+ from biblicus.analysis.profiling import _ordered_catalog_items, _percentile_value
20
22
  from biblicus.analysis.topic_modeling import (
21
- _TopicDocument,
22
23
  _apply_llm_fine_tuning,
23
24
  _parse_itemized_response,
25
+ _TopicDocument,
24
26
  )
25
- from biblicus.models import ExtractionRunReference
27
+ from biblicus.models import CatalogItem, ExtractionRunReference
26
28
  from features.steps.openai_steps import (
27
- _FakeOpenAiChatBehavior,
28
29
  _ensure_fake_openai_chat_behaviors,
30
+ _FakeOpenAiChatBehavior,
29
31
  _install_fake_openai_module,
30
32
  )
31
33
 
@@ -163,9 +165,7 @@ def step_run_llm_fine_tuning_missing_documents(context) -> None:
163
165
  document_ids=["missing"],
164
166
  )
165
167
  ]
166
- documents = [
167
- _TopicDocument(document_id="present", source_item_id="present", text="Text")
168
- ]
168
+ documents = [_TopicDocument(document_id="present", source_item_id="present", text="Text")]
169
169
  report, labeled_topics = _apply_llm_fine_tuning(
170
170
  topics=topics,
171
171
  documents=documents,
@@ -184,7 +184,7 @@ def step_fine_tuning_topics_labeled(context, count: int) -> None:
184
184
 
185
185
  @when("I parse an itemized response JSON string")
186
186
  def step_parse_itemized_response_json_string(context) -> None:
187
- response_text = "\"[\\\"Alpha\\\", \\\"Beta\\\"]\""
187
+ response_text = '"[\\"Alpha\\", \\"Beta\\"]"'
188
188
  context.itemized_response = _parse_itemized_response(response_text)
189
189
 
190
190
 
@@ -247,3 +247,143 @@ def step_vectorizer_stop_words_equals(context, value: str) -> None:
247
247
  def step_vectorizer_stop_words_absent(context) -> None:
248
248
  model = context.last_model
249
249
  assert model.stop_words is None
250
+
251
+
252
@when("I attempt to validate a profiling config with sample size {value:d}")
def step_validate_profiling_sample_size(context, value: int) -> None:
    """
    Construct a profiling recipe config with a sample size and record the outcome.

    :param context: Behave context; ``validation_error`` is assigned on it.
    :param value: Sample size parsed from the step text.
    :type value: int
    :return: None.
    :rtype: None
    """
    captured = None
    try:
        ProfilingRecipeConfig(sample_size=value)
    except ValidationError as exc:
        # Keep the error so a later step can inspect it.
        captured = exc
    context.validation_error = captured
259
+
260
+
261
@when('I attempt to validate a profiling config with percentiles "{values}"')
def step_validate_profiling_percentiles(context, values: str) -> None:
    """
    Construct a profiling recipe config from comma-separated percentiles and record the outcome.

    Blank entries are skipped. A non-integer entry raises ``ValueError`` from ``int`` and is
    not captured as a validation error.

    :param context: Behave context; ``validation_error`` is assigned on it.
    :param values: Comma-separated percentile values from the step text.
    :type values: str
    :return: None.
    :rtype: None
    """
    parsed = []
    for piece in values.split(","):
        trimmed = piece.strip()
        if trimmed:
            parsed.append(int(trimmed))
    try:
        ProfilingRecipeConfig(percentiles=parsed)
    except ValidationError as exc:
        context.validation_error = exc
    else:
        context.validation_error = None
269
+
270
+
271
@when('I attempt to validate a profiling config with tag filters "{values}"')
def step_validate_profiling_tag_filters(context, values: str) -> None:
    """
    Construct a profiling recipe config from comma-separated tag filters and record the outcome.

    Every entry is stripped of surrounding whitespace; entries that become empty are kept.

    :param context: Behave context; ``validation_error`` is assigned on it.
    :param values: Comma-separated tag filters from the step text.
    :type values: str
    :return: None.
    :rtype: None
    """
    stripped_tags = list(map(str.strip, values.split(",")))
    try:
        ProfilingRecipeConfig(tag_filters=stripped_tags)
    except ValidationError as exc:
        context.validation_error = exc
    else:
        context.validation_error = None
279
+
280
+
281
@when("I attempt to validate a profiling config with schema version {value:d}")
def step_validate_profiling_schema_version(context, value: int) -> None:
    """
    Construct a profiling recipe config with a schema version and record the outcome.

    :param context: Behave context; ``validation_error`` is assigned on it.
    :param value: Schema version parsed from the step text.
    :type value: int
    :return: None.
    :rtype: None
    """
    failure = None
    try:
        ProfilingRecipeConfig(schema_version=value)
    except ValidationError as exc:
        failure = exc
    context.validation_error = failure
288
+
289
+
290
@when("I attempt to validate a profiling config with empty percentiles")
def step_validate_profiling_empty_percentiles(context) -> None:
    """
    Construct a profiling recipe config with an empty percentile list and record the outcome.

    :param context: Behave context; ``validation_error`` is assigned on it.
    :return: None.
    :rtype: None
    """
    no_percentiles = []
    try:
        ProfilingRecipeConfig(percentiles=no_percentiles)
    except ValidationError as exc:
        context.validation_error = exc
    else:
        context.validation_error = None
297
+
298
+
299
@when('I attempt to validate a profiling config with tag filters string "{value}"')
def step_validate_profiling_tag_filters_string(context, value: str) -> None:
    """
    Construct a profiling recipe config with a plain string as tag filters and record the outcome.

    The string is passed through unchanged rather than being split into a list.

    :param context: Behave context; ``validation_error`` is assigned on it.
    :param value: Raw string from the step text.
    :type value: str
    :return: None.
    :rtype: None
    """
    outcome = None
    try:
        ProfilingRecipeConfig(tag_filters=value)
    except ValidationError as exc:
        outcome = exc
    context.validation_error = outcome
306
+
307
+
308
@when("I validate a profiling config with tag filters None")
def step_validate_profiling_tag_filters_none(context) -> None:
    """
    Construct a profiling recipe config with ``tag_filters=None`` and keep it on the context.

    Validation errors are not caught here; they propagate to the test runner.

    :param context: Behave context; ``last_model`` is assigned on it.
    :return: None.
    :rtype: None
    """
    config = ProfilingRecipeConfig(tag_filters=None)
    context.last_model = config
311
+
312
+
313
@when('I validate a profiling config with tag filters list "{values}"')
def step_validate_profiling_tag_filters_list(context, values: str) -> None:
    """
    Construct a profiling recipe config from comma-separated tag filters and keep it on the context.

    Validation errors are not caught here; they propagate to the test runner.

    :param context: Behave context; ``last_model`` is assigned on it.
    :param values: Comma-separated tag filters from the step text.
    :type values: str
    :return: None.
    :rtype: None
    """
    # NOTE(review): entries are stripped here before they reach the model, so any
    # whitespace normalization done by the model itself is not exercised by this
    # step -- confirm that is intended.
    cleaned = list(map(str.strip, values.split(",")))
    context.last_model = ProfilingRecipeConfig(tag_filters=cleaned)
317
+
318
+
319
@then("the profiling tag filters are absent")
def step_profiling_tag_filters_absent(context) -> None:
    """
    Assert that the most recently built model has no tag filters.

    :param context: Behave context providing ``last_model``.
    :return: None.
    :rtype: None
    """
    assert context.last_model.tag_filters is None
323
+
324
+
325
@then('the profiling tag filters include "{value}"')
def step_profiling_tag_filters_include(context, value: str) -> None:
    """
    Assert that the most recently built model has tag filters containing the given tag.

    :param context: Behave context providing ``last_model``.
    :param value: Tag expected among the model's tag filters.
    :type value: str
    :return: None.
    :rtype: None
    """
    filters = context.last_model.tag_filters
    # Check presence first so a missing filter list fails as an assertion, not a TypeError.
    assert filters is not None
    assert value in filters
330
+
331
+
332
@when("I order catalog items with missing entries")
def step_order_catalog_items_with_missing_entries(context) -> None:
    """
    Order a three-item catalog using an identifier list that names an unknown item.

    The catalog holds items ``a``, ``b`` and ``c`` (1, 2 and 3 bytes). The requested order is
    ``a``, ``missing``, ``c``. The identifiers of the returned items are stored on the context.

    :param context: Behave context; ``ordered_catalog_ids`` is assigned on it.
    :return: None.
    :rtype: None
    """
    catalog = {}
    # The items differ only by identifier and byte count, so build them in a loop.
    for size, identifier in enumerate(("a", "b", "c"), start=1):
        catalog[identifier] = CatalogItem(
            id=identifier,
            relpath=f"raw/{identifier}.txt",
            sha256=identifier,
            bytes=size,
            media_type="text/plain",
            title=None,
            tags=[],
            metadata={},
            created_at="2020-01-01T00:00:00Z",
            source_uri=None,
        )
    requested_order = ["a", "missing", "c"]
    ordered_items = _ordered_catalog_items(catalog, requested_order)
    context.ordered_catalog_ids = [entry.id for entry in ordered_items]
374
+
375
+
376
@then('the ordered catalog item identifiers equal "{values}"')
def step_ordered_catalog_item_identifiers_equal(context, values: str) -> None:
    """
    Assert that the recorded catalog item identifiers match the expected sequence.

    :param context: Behave context providing ``ordered_catalog_ids``.
    :param values: Comma-separated expected identifiers; blank entries are ignored.
    :type values: str
    :return: None.
    :rtype: None
    """
    trimmed = (piece.strip() for piece in values.split(","))
    wanted = [piece for piece in trimmed if piece]
    assert context.ordered_catalog_ids == wanted
380
+
381
+
382
@when("I compute a profiling percentile on empty values")
def step_compute_profiling_percentile_empty(context) -> None:
    """
    Compute the 50th percentile of an empty list and keep the result on the context.

    :param context: Behave context; ``percentile_value`` is assigned on it.
    :return: None.
    :rtype: None
    """
    no_values = []
    context.percentile_value = _percentile_value(no_values, 50)
385
+
386
+
387
@then("the profiling percentile value equals {value:d}")
def step_profiling_percentile_value_equals(context, value: int) -> None:
    """
    Assert that the recorded percentile value equals the expected integer.

    :param context: Behave context providing ``percentile_value``.
    :param value: Expected percentile value parsed from the step text.
    :type value: int
    :return: None.
    :rtype: None
    """
    observed = context.percentile_value
    assert observed == value