biblicus 0.8.0.tar.gz → 0.10.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (230)
  1. {biblicus-0.8.0/src/biblicus.egg-info → biblicus-0.10.0}/PKG-INFO +17 -10
  2. {biblicus-0.8.0 → biblicus-0.10.0}/README.md +14 -9
  3. biblicus-0.10.0/docs/ANALYSIS.md +47 -0
  4. {biblicus-0.8.0 → biblicus-0.10.0}/docs/DEMOS.md +20 -31
  5. biblicus-0.10.0/docs/PROFILING.md +98 -0
  6. {biblicus-0.8.0 → biblicus-0.10.0}/docs/ROADMAP.md +10 -54
  7. {biblicus-0.8.0 → biblicus-0.10.0}/docs/TESTING.md +1 -1
  8. biblicus-0.10.0/docs/TOPIC_MODELING.md +159 -0
  9. {biblicus-0.8.0 → biblicus-0.10.0}/docs/conf.py +5 -8
  10. {biblicus-0.8.0 → biblicus-0.10.0}/docs/index.rst +2 -0
  11. biblicus-0.10.0/features/analysis_schema.feature +110 -0
  12. {biblicus-0.8.0 → biblicus-0.10.0}/features/environment.py +29 -5
  13. biblicus-0.10.0/features/profiling.feature +150 -0
  14. biblicus-0.10.0/features/steps/analysis_steps.py +389 -0
  15. {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/cli_steps.py +13 -7
  16. {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/crawl_steps.py +6 -2
  17. {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/deepgram_steps.py +3 -11
  18. {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/docling_steps.py +2 -6
  19. {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/evidence_processing_steps.py +0 -1
  20. {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/extraction_run_lifecycle_steps.py +6 -2
  21. {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/extraction_steps.py +25 -6
  22. {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/inference_steps.py +12 -6
  23. {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/markitdown_steps.py +1 -3
  24. {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/openai_steps.py +3 -1
  25. {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/paddleocr_mock_steps.py +0 -1
  26. {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/paddleocr_vl_steps.py +17 -19
  27. {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/paddleocr_vl_unit_steps.py +10 -9
  28. biblicus-0.10.0/features/steps/profiling_steps.py +205 -0
  29. {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/requests_mock_steps.py +32 -13
  30. {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/topic_modeling_steps.py +98 -7
  31. {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/user_config_steps.py +6 -7
  32. {biblicus-0.8.0 → biblicus-0.10.0}/features/topic_modeling.feature +170 -0
  33. {biblicus-0.8.0 → biblicus-0.10.0}/pyproject.toml +5 -1
  34. biblicus-0.10.0/scripts/download_ag_news.py +150 -0
  35. {biblicus-0.8.0 → biblicus-0.10.0}/scripts/download_audio_samples.py +9 -5
  36. {biblicus-0.8.0 → biblicus-0.10.0}/scripts/download_image_samples.py +0 -5
  37. {biblicus-0.8.0 → biblicus-0.10.0}/scripts/download_mixed_samples.py +0 -6
  38. {biblicus-0.8.0 → biblicus-0.10.0}/scripts/download_pdf_samples.py +0 -5
  39. {biblicus-0.8.0 → biblicus-0.10.0}/scripts/download_wikipedia.py +1 -5
  40. biblicus-0.10.0/scripts/profiling_demo.py +212 -0
  41. {biblicus-0.8.0 → biblicus-0.10.0}/scripts/readme_end_to_end_demo.py +0 -1
  42. {biblicus-0.8.0 → biblicus-0.10.0}/scripts/test.py +0 -4
  43. {biblicus-0.8.0 → biblicus-0.10.0}/scripts/topic_modeling_integration.py +76 -14
  44. {biblicus-0.8.0 → biblicus-0.10.0}/scripts/wikipedia_rag_demo.py +3 -8
  45. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/__init__.py +1 -1
  46. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/_vendor/dotyaml/__init__.py +0 -1
  47. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/_vendor/dotyaml/interpolation.py +0 -1
  48. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/_vendor/dotyaml/loader.py +0 -1
  49. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/_vendor/dotyaml/transformer.py +0 -1
  50. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/analysis/__init__.py +2 -0
  51. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/analysis/models.py +268 -3
  52. biblicus-0.10.0/src/biblicus/analysis/profiling.py +337 -0
  53. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/analysis/topic_modeling.py +28 -7
  54. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/backends/sqlite_full_text_search.py +2 -4
  55. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/cli.py +83 -4
  56. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/corpus.py +9 -3
  57. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/evidence_processing.py +4 -2
  58. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/extraction.py +3 -1
  59. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/extractors/markitdown_text.py +1 -0
  60. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/extractors/paddleocr_vl_text.py +1 -3
  61. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/user_config.py +2 -6
  62. {biblicus-0.8.0 → biblicus-0.10.0/src/biblicus.egg-info}/PKG-INFO +17 -10
  63. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus.egg-info/SOURCES.txt +7 -0
  64. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus.egg-info/requires.txt +3 -0
  65. biblicus-0.8.0/docs/TOPIC_MODELING.md +0 -82
  66. biblicus-0.8.0/features/analysis_schema.feature +0 -36
  67. biblicus-0.8.0/features/steps/analysis_steps.py +0 -194
  68. {biblicus-0.8.0 → biblicus-0.10.0}/LICENSE +0 -0
  69. {biblicus-0.8.0 → biblicus-0.10.0}/MANIFEST.in +0 -0
  70. {biblicus-0.8.0 → biblicus-0.10.0}/THIRD_PARTY_NOTICES.md +0 -0
  71. {biblicus-0.8.0 → biblicus-0.10.0}/datasets/wikipedia_mini.json +0 -0
  72. {biblicus-0.8.0 → biblicus-0.10.0}/docs/ARCHITECTURE.md +0 -0
  73. {biblicus-0.8.0 → biblicus-0.10.0}/docs/BACKENDS.md +0 -0
  74. {biblicus-0.8.0 → biblicus-0.10.0}/docs/CONTEXT_PACK.md +0 -0
  75. {biblicus-0.8.0 → biblicus-0.10.0}/docs/CORPUS.md +0 -0
  76. {biblicus-0.8.0 → biblicus-0.10.0}/docs/CORPUS_DESIGN.md +0 -0
  77. {biblicus-0.8.0 → biblicus-0.10.0}/docs/EXTRACTION.md +0 -0
  78. {biblicus-0.8.0 → biblicus-0.10.0}/docs/FEATURE_INDEX.md +0 -0
  79. {biblicus-0.8.0 → biblicus-0.10.0}/docs/KNOWLEDGE_BASE.md +0 -0
  80. {biblicus-0.8.0 → biblicus-0.10.0}/docs/STT.md +0 -0
  81. {biblicus-0.8.0 → biblicus-0.10.0}/docs/USER_CONFIGURATION.md +0 -0
  82. {biblicus-0.8.0 → biblicus-0.10.0}/docs/api.rst +0 -0
  83. {biblicus-0.8.0 → biblicus-0.10.0}/docs/backends/index.md +0 -0
  84. {biblicus-0.8.0 → biblicus-0.10.0}/docs/backends/scan.md +0 -0
  85. {biblicus-0.8.0 → biblicus-0.10.0}/docs/backends/sqlite-full-text-search.md +0 -0
  86. {biblicus-0.8.0 → biblicus-0.10.0}/docs/extractors/index.md +0 -0
  87. {biblicus-0.8.0 → biblicus-0.10.0}/docs/extractors/ocr/index.md +0 -0
  88. {biblicus-0.8.0 → biblicus-0.10.0}/docs/extractors/ocr/paddleocr-vl.md +0 -0
  89. {biblicus-0.8.0 → biblicus-0.10.0}/docs/extractors/ocr/rapidocr.md +0 -0
  90. {biblicus-0.8.0 → biblicus-0.10.0}/docs/extractors/pipeline-utilities/index.md +0 -0
  91. {biblicus-0.8.0 → biblicus-0.10.0}/docs/extractors/pipeline-utilities/pipeline.md +0 -0
  92. {biblicus-0.8.0 → biblicus-0.10.0}/docs/extractors/pipeline-utilities/select-longest.md +0 -0
  93. {biblicus-0.8.0 → biblicus-0.10.0}/docs/extractors/pipeline-utilities/select-override.md +0 -0
  94. {biblicus-0.8.0 → biblicus-0.10.0}/docs/extractors/pipeline-utilities/select-smart-override.md +0 -0
  95. {biblicus-0.8.0 → biblicus-0.10.0}/docs/extractors/pipeline-utilities/select-text.md +0 -0
  96. {biblicus-0.8.0 → biblicus-0.10.0}/docs/extractors/speech-to-text/deepgram.md +0 -0
  97. {biblicus-0.8.0 → biblicus-0.10.0}/docs/extractors/speech-to-text/index.md +0 -0
  98. {biblicus-0.8.0 → biblicus-0.10.0}/docs/extractors/speech-to-text/openai.md +0 -0
  99. {biblicus-0.8.0 → biblicus-0.10.0}/docs/extractors/text-document/index.md +0 -0
  100. {biblicus-0.8.0 → biblicus-0.10.0}/docs/extractors/text-document/markitdown.md +0 -0
  101. {biblicus-0.8.0 → biblicus-0.10.0}/docs/extractors/text-document/metadata.md +0 -0
  102. {biblicus-0.8.0 → biblicus-0.10.0}/docs/extractors/text-document/pass-through.md +0 -0
  103. {biblicus-0.8.0 → biblicus-0.10.0}/docs/extractors/text-document/pdf.md +0 -0
  104. {biblicus-0.8.0 → biblicus-0.10.0}/docs/extractors/text-document/unstructured.md +0 -0
  105. {biblicus-0.8.0 → biblicus-0.10.0}/docs/extractors/vlm-document/docling-granite.md +0 -0
  106. {biblicus-0.8.0 → biblicus-0.10.0}/docs/extractors/vlm-document/docling-smol.md +0 -0
  107. {biblicus-0.8.0 → biblicus-0.10.0}/docs/extractors/vlm-document/index.md +0 -0
  108. {biblicus-0.8.0 → biblicus-0.10.0}/features/backend_validation.feature +0 -0
  109. {biblicus-0.8.0 → biblicus-0.10.0}/features/biblicus_corpus.feature +0 -0
  110. {biblicus-0.8.0 → biblicus-0.10.0}/features/cli_entrypoint.feature +0 -0
  111. {biblicus-0.8.0 → biblicus-0.10.0}/features/cli_parsing.feature +0 -0
  112. {biblicus-0.8.0 → biblicus-0.10.0}/features/cli_step_spec_parsing.feature +0 -0
  113. {biblicus-0.8.0 → biblicus-0.10.0}/features/content_sniffing.feature +0 -0
  114. {biblicus-0.8.0 → biblicus-0.10.0}/features/context_pack.feature +0 -0
  115. {biblicus-0.8.0 → biblicus-0.10.0}/features/context_pack_cli.feature +0 -0
  116. {biblicus-0.8.0 → biblicus-0.10.0}/features/corpus_edge_cases.feature +0 -0
  117. {biblicus-0.8.0 → biblicus-0.10.0}/features/corpus_identity.feature +0 -0
  118. {biblicus-0.8.0 → biblicus-0.10.0}/features/corpus_purge.feature +0 -0
  119. {biblicus-0.8.0 → biblicus-0.10.0}/features/crawl.feature +0 -0
  120. {biblicus-0.8.0 → biblicus-0.10.0}/features/docling_granite_extractor.feature +0 -0
  121. {biblicus-0.8.0 → biblicus-0.10.0}/features/docling_smol_extractor.feature +0 -0
  122. {biblicus-0.8.0 → biblicus-0.10.0}/features/error_cases.feature +0 -0
  123. {biblicus-0.8.0 → biblicus-0.10.0}/features/evaluation.feature +0 -0
  124. {biblicus-0.8.0 → biblicus-0.10.0}/features/evidence_processing.feature +0 -0
  125. {biblicus-0.8.0 → biblicus-0.10.0}/features/extraction_error_handling.feature +0 -0
  126. {biblicus-0.8.0 → biblicus-0.10.0}/features/extraction_run_lifecycle.feature +0 -0
  127. {biblicus-0.8.0 → biblicus-0.10.0}/features/extraction_selection.feature +0 -0
  128. {biblicus-0.8.0 → biblicus-0.10.0}/features/extraction_selection_longest.feature +0 -0
  129. {biblicus-0.8.0 → biblicus-0.10.0}/features/extractor_pipeline.feature +0 -0
  130. {biblicus-0.8.0 → biblicus-0.10.0}/features/extractor_validation.feature +0 -0
  131. {biblicus-0.8.0 → biblicus-0.10.0}/features/frontmatter.feature +0 -0
  132. {biblicus-0.8.0 → biblicus-0.10.0}/features/hook_config_validation.feature +0 -0
  133. {biblicus-0.8.0 → biblicus-0.10.0}/features/hook_error_handling.feature +0 -0
  134. {biblicus-0.8.0 → biblicus-0.10.0}/features/import_tree.feature +0 -0
  135. {biblicus-0.8.0 → biblicus-0.10.0}/features/inference_backend.feature +0 -0
  136. {biblicus-0.8.0 → biblicus-0.10.0}/features/ingest_sources.feature +0 -0
  137. {biblicus-0.8.0 → biblicus-0.10.0}/features/integration_audio_samples.feature +0 -0
  138. {biblicus-0.8.0 → biblicus-0.10.0}/features/integration_image_samples.feature +0 -0
  139. {biblicus-0.8.0 → biblicus-0.10.0}/features/integration_mixed_corpus.feature +0 -0
  140. {biblicus-0.8.0 → biblicus-0.10.0}/features/integration_mixed_extraction.feature +0 -0
  141. {biblicus-0.8.0 → biblicus-0.10.0}/features/integration_ocr_image_extraction.feature +0 -0
  142. {biblicus-0.8.0 → biblicus-0.10.0}/features/integration_pdf_retrieval.feature +0 -0
  143. {biblicus-0.8.0 → biblicus-0.10.0}/features/integration_pdf_samples.feature +0 -0
  144. {biblicus-0.8.0 → biblicus-0.10.0}/features/integration_unstructured_extraction.feature +0 -0
  145. {biblicus-0.8.0 → biblicus-0.10.0}/features/integration_wikipedia.feature +0 -0
  146. {biblicus-0.8.0 → biblicus-0.10.0}/features/knowledge_base.feature +0 -0
  147. {biblicus-0.8.0 → biblicus-0.10.0}/features/lifecycle_hooks.feature +0 -0
  148. {biblicus-0.8.0 → biblicus-0.10.0}/features/markitdown_extractor.feature +0 -0
  149. {biblicus-0.8.0 → biblicus-0.10.0}/features/model_validation.feature +0 -0
  150. {biblicus-0.8.0 → biblicus-0.10.0}/features/ocr_extractor.feature +0 -0
  151. {biblicus-0.8.0 → biblicus-0.10.0}/features/paddleocr_vl_extractor.feature +0 -0
  152. {biblicus-0.8.0 → biblicus-0.10.0}/features/paddleocr_vl_parse_api_response.feature +0 -0
  153. {biblicus-0.8.0 → biblicus-0.10.0}/features/pdf_text_extraction.feature +0 -0
  154. {biblicus-0.8.0 → biblicus-0.10.0}/features/python_api.feature +0 -0
  155. {biblicus-0.8.0 → biblicus-0.10.0}/features/python_hook_logging.feature +0 -0
  156. {biblicus-0.8.0 → biblicus-0.10.0}/features/query_processing.feature +0 -0
  157. {biblicus-0.8.0 → biblicus-0.10.0}/features/recipe_file_extraction.feature +0 -0
  158. {biblicus-0.8.0 → biblicus-0.10.0}/features/retrieval_budget.feature +0 -0
  159. {biblicus-0.8.0 → biblicus-0.10.0}/features/retrieval_scan.feature +0 -0
  160. {biblicus-0.8.0 → biblicus-0.10.0}/features/retrieval_sqlite_full_text_search.feature +0 -0
  161. {biblicus-0.8.0 → biblicus-0.10.0}/features/retrieval_uses_extraction_run.feature +0 -0
  162. {biblicus-0.8.0 → biblicus-0.10.0}/features/retrieval_utilities.feature +0 -0
  163. {biblicus-0.8.0 → biblicus-0.10.0}/features/select_override.feature +0 -0
  164. {biblicus-0.8.0 → biblicus-0.10.0}/features/smart_override_selection.feature +0 -0
  165. {biblicus-0.8.0 → biblicus-0.10.0}/features/source_loading.feature +0 -0
  166. {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/backend_steps.py +0 -0
  167. {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/cli_parsing_steps.py +0 -0
  168. {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/context_pack_steps.py +0 -0
  169. {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/extractor_steps.py +1 -1
  170. {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/frontmatter_steps.py +0 -0
  171. {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/knowledge_base_steps.py +0 -0
  172. {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/model_steps.py +0 -0
  173. {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/pdf_steps.py +0 -0
  174. {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/python_api_steps.py +1 -1
  175. {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/rapidocr_steps.py +0 -0
  176. {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/retrieval_steps.py +0 -0
  177. {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/stt_deepgram_steps.py +0 -0
  178. {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/stt_steps.py +0 -0
  179. {biblicus-0.8.0 → biblicus-0.10.0}/features/steps/unstructured_steps.py +0 -0
  180. {biblicus-0.8.0 → biblicus-0.10.0}/features/streaming_ingest.feature +0 -0
  181. {biblicus-0.8.0 → biblicus-0.10.0}/features/stt_deepgram_extractor.feature +0 -0
  182. {biblicus-0.8.0 → biblicus-0.10.0}/features/stt_extractor.feature +0 -0
  183. {biblicus-0.8.0 → biblicus-0.10.0}/features/text_extraction_runs.feature +0 -0
  184. {biblicus-0.8.0 → biblicus-0.10.0}/features/token_budget.feature +0 -0
  185. {biblicus-0.8.0 → biblicus-0.10.0}/features/unstructured_extractor.feature +0 -0
  186. {biblicus-0.8.0 → biblicus-0.10.0}/features/user_config.feature +0 -0
  187. {biblicus-0.8.0 → biblicus-0.10.0}/setup.cfg +0 -0
  188. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/__main__.py +0 -0
  189. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/analysis/base.py +0 -0
  190. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/analysis/llm.py +0 -0
  191. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/analysis/schema.py +0 -0
  192. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/backends/__init__.py +0 -0
  193. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/backends/base.py +0 -0
  194. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/backends/scan.py +0 -0
  195. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/constants.py +0 -0
  196. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/context.py +0 -0
  197. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/crawl.py +0 -0
  198. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/errors.py +0 -0
  199. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/evaluation.py +0 -0
  200. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/extractors/__init__.py +0 -0
  201. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/extractors/base.py +0 -0
  202. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/extractors/deepgram_stt.py +0 -0
  203. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/extractors/docling_granite_text.py +0 -0
  204. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/extractors/docling_smol_text.py +0 -0
  205. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/extractors/metadata_text.py +0 -0
  206. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/extractors/openai_stt.py +0 -0
  207. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/extractors/pass_through_text.py +0 -0
  208. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/extractors/pdf_text.py +0 -0
  209. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/extractors/pipeline.py +0 -0
  210. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/extractors/rapidocr_text.py +0 -0
  211. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/extractors/select_longest_text.py +0 -0
  212. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/extractors/select_override.py +0 -0
  213. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/extractors/select_smart_override.py +0 -0
  214. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/extractors/select_text.py +0 -0
  215. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/extractors/unstructured_text.py +0 -0
  216. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/frontmatter.py +0 -0
  217. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/hook_logging.py +0 -0
  218. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/hook_manager.py +0 -0
  219. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/hooks.py +0 -0
  220. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/ignore.py +0 -0
  221. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/inference.py +0 -0
  222. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/knowledge_base.py +0 -0
  223. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/models.py +0 -0
  224. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/retrieval.py +0 -0
  225. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/sources.py +0 -0
  226. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/time.py +0 -0
  227. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus/uris.py +0 -0
  228. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus.egg-info/dependency_links.txt +0 -0
  229. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus.egg-info/entry_points.txt +0 -0
  230. {biblicus-0.8.0 → biblicus-0.10.0}/src/biblicus.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: biblicus
- Version: 0.8.0
+ Version: 0.10.0
  Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
  License: MIT
  Requires-Python: >=3.9
@@ -40,6 +40,8 @@ Provides-Extra: docling-mlx
  Requires-Dist: docling[mlx-vlm]>=2.0.0; extra == "docling-mlx"
  Provides-Extra: topic-modeling
  Requires-Dist: bertopic>=0.15.0; extra == "topic-modeling"
+ Provides-Extra: datasets
+ Requires-Dist: datasets>=2.18.0; extra == "datasets"
  Dynamic: license-file

  # Biblicus
@@ -529,10 +531,13 @@ For detailed documentation on all extractors, see the [Extractor Reference][extr

  ## Topic modeling analysis

- Biblicus can run analysis pipelines on extracted text without changing the raw corpus. Topic modeling is the first
- analysis backend. It reads an extraction run, optionally applies an LLM-driven extraction pass, applies lexical
- processing, runs BERTopic, and optionally applies an LLM fine-tuning pass to label topics. The output is structured
- JavaScript Object Notation.
+ Biblicus can run analysis pipelines on extracted text without changing the raw corpus. Profiling and topic modeling
+ are the first analysis backends. Profiling summarizes corpus composition and extraction coverage. Topic modeling reads
+ an extraction run, optionally applies an LLM-driven extraction pass, applies lexical processing, runs BERTopic, and
+ optionally applies an LLM fine-tuning pass to label topics. The output is structured JavaScript Object Notation.
+
+ See `docs/ANALYSIS.md` for the analysis pipeline overview, `docs/PROFILING.md` for profiling, and
+ `docs/TOPIC_MODELING.md` for topic modeling details.

  Run a topic analysis using a recipe file:
 
@@ -564,26 +569,28 @@ bertopic_analysis:
    parameters:
      min_topic_size: 8
      nr_topics: 10
+   vectorizer:
+     ngram_range: [1, 2]
+     stop_words: english
  llm_fine_tuning:
    enabled: false
  ```

  LLM extraction and fine-tuning require `biblicus[openai]` and a configured OpenAI API key.
  Recipe files are validated strictly against the topic modeling schema, so type mismatches or unknown fields are errors.
+ AG News integration runs require `biblicus[datasets]` in addition to `biblicus[topic-modeling]`.

- For a repeatable, real-world integration run that downloads a Wikipedia corpus and executes topic modeling, use:
+ For a repeatable, real-world integration run that downloads AG News and executes topic modeling, use:

  ```
- python3 scripts/topic_modeling_integration.py --corpus corpora/wiki_demo --force
+ python3 scripts/topic_modeling_integration.py --corpus corpora/ag_news_demo --force
  ```

  See `docs/TOPIC_MODELING.md` for parameter examples and per-topic output behavior.

  ## Integration corpus and evaluation dataset

- Use `scripts/download_wikipedia.py` to download a small integration corpus from Wikipedia when running tests or demos. The repository does not include that content.
-
- The dataset file `datasets/wikipedia_mini.json` provides a small evaluation set that matches the integration corpus.
+ Use `scripts/download_ag_news.py` to download the AG News dataset when running topic modeling demos. The repository does not include that content.

  Use `scripts/download_pdf_samples.py` to download a small Portable Document Format integration corpus when running tests or demos. The repository does not include that content.
 
@@ -485,10 +485,13 @@ For detailed documentation on all extractors, see the [Extractor Reference][extr

  ## Topic modeling analysis

- Biblicus can run analysis pipelines on extracted text without changing the raw corpus. Topic modeling is the first
- analysis backend. It reads an extraction run, optionally applies an LLM-driven extraction pass, applies lexical
- processing, runs BERTopic, and optionally applies an LLM fine-tuning pass to label topics. The output is structured
- JavaScript Object Notation.
+ Biblicus can run analysis pipelines on extracted text without changing the raw corpus. Profiling and topic modeling
+ are the first analysis backends. Profiling summarizes corpus composition and extraction coverage. Topic modeling reads
+ an extraction run, optionally applies an LLM-driven extraction pass, applies lexical processing, runs BERTopic, and
+ optionally applies an LLM fine-tuning pass to label topics. The output is structured JavaScript Object Notation.
+
+ See `docs/ANALYSIS.md` for the analysis pipeline overview, `docs/PROFILING.md` for profiling, and
+ `docs/TOPIC_MODELING.md` for topic modeling details.

  Run a topic analysis using a recipe file:
 
@@ -520,26 +523,28 @@ bertopic_analysis:
    parameters:
      min_topic_size: 8
      nr_topics: 10
+   vectorizer:
+     ngram_range: [1, 2]
+     stop_words: english
  llm_fine_tuning:
    enabled: false
  ```

  LLM extraction and fine-tuning require `biblicus[openai]` and a configured OpenAI API key.
  Recipe files are validated strictly against the topic modeling schema, so type mismatches or unknown fields are errors.
+ AG News integration runs require `biblicus[datasets]` in addition to `biblicus[topic-modeling]`.

- For a repeatable, real-world integration run that downloads a Wikipedia corpus and executes topic modeling, use:
+ For a repeatable, real-world integration run that downloads AG News and executes topic modeling, use:

  ```
- python3 scripts/topic_modeling_integration.py --corpus corpora/wiki_demo --force
+ python3 scripts/topic_modeling_integration.py --corpus corpora/ag_news_demo --force
  ```

  See `docs/TOPIC_MODELING.md` for parameter examples and per-topic output behavior.

  ## Integration corpus and evaluation dataset

- Use `scripts/download_wikipedia.py` to download a small integration corpus from Wikipedia when running tests or demos. The repository does not include that content.
-
- The dataset file `datasets/wikipedia_mini.json` provides a small evaluation set that matches the integration corpus.
+ Use `scripts/download_ag_news.py` to download the AG News dataset when running topic modeling demos. The repository does not include that content.

  Use `scripts/download_pdf_samples.py` to download a small Portable Document Format integration corpus when running tests or demos. The repository does not include that content.
 
@@ -0,0 +1,47 @@
+ # Corpus analysis
+
+ Biblicus supports analysis backends that run on extracted text artifacts without changing the raw corpus. Analysis is a
+ pluggable phase that reads an extraction run, produces structured output, and stores artifacts under the corpus runs
+ folder. Each analysis backend declares its own configuration schema and output contract, and all schemas are validated
+ strictly.
+
+ ## How analysis runs work
+
+ - Analysis runs are tied to a corpus state via the extraction run reference.
+ - The analysis output is written under `.biblicus/runs/analysis/<analysis-id>/<run_id>/`.
+ - Analysis is reproducible when you supply the same extraction run and corpus catalog state.
+ - Analysis configuration is stored as a recipe manifest in the run metadata.
+
+ If you omit the extraction run, Biblicus uses the most recent extraction run and emits a reproducibility warning. For
+ repeatable analysis runs, always pass the extraction run reference explicitly.
+
+ ## Pluggable analysis backends
+
+ Analysis backends implement the `CorpusAnalysisBackend` interface and are registered under `biblicus.analysis`.
+ A backend receives the corpus, a recipe name, a configuration mapping, and an extraction run reference. It returns a
+ Pydantic model that is serialized to JavaScript Object Notation for storage.
+
+ ## Topic modeling
+
+ Topic modeling is the first analysis backend. It uses BERTopic to cluster extracted text, produces per-topic evidence,
+ and optionally labels topics using an LLM. See `docs/TOPIC_MODELING.md` for detailed configuration and examples.
+
+ The integration demo script is a working reference you can use as a starting point:
+
+ ```
+ python3 scripts/topic_modeling_integration.py --corpus corpora/ag_news_demo --force
+ ```
+
+ The command prints the analysis run identifier and the output path. Open the resulting `output.json` to inspect per-topic
+ labels, keywords, and document examples.
+
+ ## Profiling analysis
+
+ Profiling is the baseline analysis backend. It summarizes corpus composition and extraction coverage using
+ deterministic counts and distribution metrics. See `docs/PROFILING.md` for the full reference and working demo.
+
+ Run profiling from the CLI:
+
+ ```
+ biblicus analyze profile --corpus corpora/example --extraction-run pipeline:RUN_ID
+ ```
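
To make that calling convention concrete, here is a minimal Python sketch that drives an analysis backend directly. It reuses the `get_analysis_backend`, `Corpus.open`, `run_analysis`, and `ExtractionRunReference` entry points shown in the profiling documentation below; the `"topic-modeling"` backend identifier and the specific configuration values are illustrative assumptions, not confirmed by this diff.

```python
from pathlib import Path

from biblicus.analysis import get_analysis_backend
from biblicus.corpus import Corpus
from biblicus.models import ExtractionRunReference

# Open an existing corpus and look up a registered analysis backend.
corpus = Corpus.open(Path("corpora/ag_news_demo"))
backend = get_analysis_backend("topic-modeling")  # assumed backend identifier

# Run the analysis against an explicit extraction run for reproducibility.
output = backend.run_analysis(
    corpus,
    recipe_name="default",
    config={
        "text_source": {"sample_size": 500, "min_text_characters": 50},
        "lexical_processing": {"enabled": True, "lowercase": True},
        "bertopic_analysis": {"parameters": {"min_topic_size": 8}},
        "llm_fine_tuning": {"enabled": False},
    },
    extraction_run=ExtractionRunReference(extractor_id="pipeline", run_id="RUN_ID"),
)

# The backend returns a Pydantic model that Biblicus serializes to JSON for storage.
print(output.model_dump())
```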
@@ -187,19 +187,26 @@ The output includes a `run_id` you can reuse when building a retrieval backend.

  ### Topic modeling integration run

- Use the integration script to download a Wikipedia corpus, run extraction, and run topic modeling with a single command.
+ Use the integration script to download AG News, run extraction, and run topic modeling with a single command.
+ Install optional dependencies first:

  ```
- python3 scripts/topic_modeling_integration.py --corpus corpora/wiki_demo --force
+ python3 -m pip install "biblicus[datasets,topic-modeling]"
  ```

- Run with a smaller corpus and a higher topic count:
+ ```
+ python3 scripts/topic_modeling_integration.py --corpus corpora/ag_news_demo --force
+ ```
+
+ Run with a larger corpus and a higher topic count:

  ```
  python3 scripts/topic_modeling_integration.py \
-   --corpus corpora/wiki_demo \
+   --corpus corpora/ag_news_demo \
    --force \
-   --limit 20 \
+   --limit 10000 \
+   --vectorizer-ngram-min 1 \
+   --vectorizer-ngram-max 2 \
    --bertopic-param nr_topics=8 \
    --bertopic-param min_topic_size=2
  ```
@@ -207,6 +214,14 @@ python3 scripts/topic_modeling_integration.py \
  The command prints the analysis run identifier and the output path. Open the `output.json` file to inspect per-topic labels,
  keywords, and document examples.

+ ### Profiling analysis demo
+
+ The profiling demo downloads AG News, runs extraction, and produces a profiling report.
+
+ ```
+ python3 scripts/profiling_demo.py --corpus corpora/profiling_demo --force
+ ```
+
  ### Select extracted text within a pipeline

  When you want an explicit choice among multiple extraction outputs, add a selection extractor step at the end of the pipeline.
@@ -243,15 +258,6 @@ python3 -m biblicus build --corpus corpora/pdf_samples --backend sqlite-full-tex
  python3 -m biblicus query --corpus corpora/pdf_samples --query "Dummy PDF file"
  ```

- ### Wikipedia retrieval demo (Python)
-
- This example downloads a few Wikipedia summaries about retrieval and knowledge bases, builds an extraction run, creates a local full text index, and returns evidence plus a context pack.
-
- ```
- rm -rf corpora/wikipedia_rag_demo
- python3 scripts/wikipedia_rag_demo.py --corpus corpora/wikipedia_rag_demo --force
- ```
-
  ### MarkItDown extraction demo (Python 3.10+)

  MarkItDown requires Python 3.10 or higher. This example uses the `py311` conda environment to run the extractor over the mixed sample corpus.
@@ -374,23 +380,6 @@ python3 -m biblicus build --corpus corpora/demo --backend sqlite-full-text-searc
  python3 -m biblicus query --corpus corpora/demo --query "tiny"
  ```

- ### Evaluate a run against a dataset
-
- The repository includes a small dataset that matches the Wikipedia integration corpus.
-
- ```
- python3 -m biblicus eval --corpus corpora/demo --dataset datasets/wikipedia_mini.json
- ```
-
- If you want the matching corpus content, download it first into a separate corpus.
-
- ```
- rm -rf corpora/wikipedia
- python3 scripts/download_wikipedia.py --corpus corpora/wikipedia --limit 5 --force
- python3 -m biblicus build --corpus corpora/wikipedia --backend sqlite-full-text-search
- python3 -m biblicus eval --corpus corpora/wikipedia --dataset datasets/wikipedia_mini.json
- ```
-
  ### Run the test suite and view coverage

  ```
@@ -0,0 +1,98 @@
+ # Corpus profiling analysis
+
+ Biblicus provides a profiling analysis backend that summarizes corpus contents using deterministic counts and
+ coverage metrics. Profiling is intended as a fast, local baseline before heavier analysis such as topic modeling.
+
+ ## What profiling does
+
+ The profiling analysis reports:
+
+ - Total item count and media type distribution
+ - Extracted text coverage (present, empty, missing)
+ - Size and length distributions with percentiles
+ - Tag coverage and top tags
+
+ The output is structured JSON that can be stored, versioned, and compared across runs.
+
+ ## Run profiling from the CLI
+
+ ```
+ biblicus analyze profile --corpus corpora/example --extraction-run pipeline:RUN_ID
+ ```
+
+ If you omit `--extraction-run`, Biblicus uses the latest extraction run and emits a reproducibility warning.
+
+ To customize profiling metrics, pass a recipe file:
+
+ ```
+ biblicus analyze profile --corpus corpora/example --recipe recipes/profiling.yml --extraction-run pipeline:RUN_ID
+ ```
+
+ ### Profiling recipe configuration
+
+ Profiling recipes use the analysis schema version and accept these fields:
+
+ - `schema_version`: analysis schema version, currently `1`
+ - `sample_size`: optional cap for distribution calculations
+ - `min_text_characters`: minimum extracted text length for inclusion
+ - `percentiles`: percentiles to compute for size and length distributions
+ - `top_tag_count`: maximum number of tags to list in `top_tags`
+ - `tag_filters`: optional list of tags to include in tag coverage metrics
+
+ Example recipe:
+
+ ```
+ schema_version: 1
+ sample_size: 500
+ min_text_characters: 50
+ percentiles: [50, 90, 99]
+ top_tag_count: 10
+ tag_filters: ["ag_news", "label:World"]
+ ```
+
+ ## Run profiling from Python
+
+ ```
+ from pathlib import Path
+
+ from biblicus.analysis import get_analysis_backend
+ from biblicus.corpus import Corpus
+ from biblicus.models import ExtractionRunReference
+
+ corpus = Corpus.open(Path("corpora/example"))
+ backend = get_analysis_backend("profiling")
+ output = backend.run_analysis(
+     corpus,
+     recipe_name="default",
+     config={
+         "schema_version": 1,
+         "sample_size": 500,
+         "min_text_characters": 50,
+         "percentiles": [50, 90, 99],
+         "top_tag_count": 10,
+         "tag_filters": ["ag_news"],
+     },
+     extraction_run=ExtractionRunReference(
+         extractor_id="pipeline",
+         run_id="RUN_ID",
+     ),
+ )
+ print(output.model_dump())
+ ```
+
+ ## Output location
+
+ Profiling output is stored under:
+
+ ```
+ .biblicus/runs/analysis/profiling/<run_id>/output.json
+ ```
+
+ ## Working demo
+
+ A runnable demo is provided in `scripts/profiling_demo.py`. It downloads a corpus, runs extraction, and executes the
+ profiling analysis so you can inspect the output:
+
+ ```
+ python3 scripts/profiling_demo.py --corpus corpora/profiling_demo --force
+ ```
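
Because the report is plain JSON at the path above, it can be loaded and compared between runs with nothing beyond the standard library. A minimal sketch, assuming the `.biblicus` folder sits under the corpus root and that run folders sort in creation order; the top-level key names depend on the profiling report schema, so treat the printed keys as exploratory:

```python
import json
from pathlib import Path

# Output location documented above: .biblicus/runs/analysis/profiling/<run_id>/output.json
runs_dir = Path("corpora/example/.biblicus/runs/analysis/profiling")
latest_run = sorted(runs_dir.iterdir())[-1]  # assumes lexicographic order matches creation order

report = json.loads((latest_run / "output.json").read_text())
print(latest_run.name)
print(sorted(report))  # list the report's top-level sections before drilling in
```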
@@ -46,23 +46,20 @@ Acceptance checks:
  - Behavior specifications cover policy selection and budgeting behaviors.
  - Example outputs show how context packs differ across policies.

- ## Next: extraction backends (OCR and document understanding)
+ ## Next: extraction evaluation harness

- Goal: treat optical character recognition and document understanding as pluggable extractors with consistent inputs and outputs.
+ Goal: compare extraction approaches in a way that is measurable, repeatable, and useful for practical engineering decisions.

  Deliverables:

- - A baseline OCR extractor that is fast and local for smoke tests.
- - A higher quality OCR extractor candidate (for example: Paddle OCR or Docling OCR).
- - A general document understanding extractor candidate (for example: Docling or Unstructured).
- - A consistent output contract that captures text plus optional confidence and per-page metadata.
- - A selector policy for choosing between multiple extractor outputs in a pipeline.
- - A shared evaluation harness for extraction backends using the same corpus and dataset.
+ - Dataset authoring workflow for extraction ground truth (for example: expected transcripts and expected OCR text).
+ - Evaluation metrics for accuracy, speed, and cost, including processable fraction for a given extractor recipe.
+ - A report format that can compare multiple extraction recipes against the same corpus and dataset.

  Acceptance checks:

- - Behavior specifications cover extractor selection and output provenance.
- - Evaluation reports compare accuracy, processable fraction, latency, and cost.
+ - Evaluation results are stable and reproducible for the same corpus and dataset inputs.
+ - Reports make it clear when an extractor fails to process an item versus producing empty output.

  ## Next: corpus analysis tools
 
@@ -70,41 +67,15 @@ Goal: provide lightweight analysis utilities that summarize corpus themes and gu

  Deliverables:

- - A topic modeling workflow for corpus analysis (for example: BERTopic).
- - A report that highlights dominant themes and outliers.
- - A way to compare topic distributions across corpora or corpus snapshots.
+ - Basic data profiling reports (counts, media types, size distributions, tag coverage).
+ - Hidden Markov modeling analysis for sequence-driven corpora.
+ - A way to compare analysis outputs across corpora or corpus snapshots.

  Acceptance checks:

  - Analysis is reproducible for the same corpus state.
  - Reports are exportable and readable without custom tooling.

- ### Candidate backend ecosystem (for planning and evaluation)
-
- Document understanding and OCR blur together at the interface level in Biblicus, so the roadmap treats them as extractor candidates with the same input/output contract.
-
- Docling family candidates:
-
- - Docling (document understanding with structured outputs)
- - docling-ocr (OCR component in the Docling ecosystem)
-
- General-purpose extraction candidates:
-
- - Unstructured (element-oriented extraction for many formats)
- - MarkItDown (lightweight conversion to Markdown)
- - Kreuzberg (speed-focused extraction for bulk workflows)
- - ExtractThinker (schema-driven extraction using Pydantic contracts)
-
- Ecosystem adapters:
-
- - LangChain document loaders (uniform loader interface across many sources)
-
- ### Guidance for choosing early targets
-
- - If you need layout and table understanding, prioritize Docling and docling-ocr.
- - If you need speed and simplicity, prioritize MarkItDown or Kreuzberg.
- - If you need schema-first extraction, prioritize ExtractThinker layered on an OCR or document extractor.
-
  ## Later: alternate backends and hosting modes

  Goal: broaden the backend surface while keeping the core predictable.
@@ -138,18 +109,3 @@ Acceptance checks:

  - Behavior specifications cover ingestion, listing, and reindexing in memory.
  - Retrieval and extraction can operate on the in-memory corpus without special casing.
-
- ### Extractor datasets and evaluation harness
-
- Goal: compare extraction approaches in a way that is measurable, repeatable, and useful for practical engineering decisions.
-
- Deliverables:
-
- - Dataset authoring workflow for extraction ground truth (for example: expected transcripts and expected optical character recognition text).
- - Evaluation metrics for accuracy, speed, and cost, including “processable fraction” for a given extractor recipe.
- - A report format that can compare multiple extraction recipes against the same corpus and dataset.
-
- Acceptance checks:
-
- - Evaluation results are stable and reproducible for the same corpus and dataset inputs.
- - Reports make it clear when an extractor fails to process an item versus producing empty output.
@@ -36,7 +36,7 @@ Integration scenarios are tagged `@integration`.

  The repository does not include downloaded content. Integration scripts download content into a corpus path you choose and then ingest it for a test run.

- - Wikipedia summaries: `scripts/download_wikipedia.py`
+ - AG News dataset: `scripts/download_ag_news.py`
  - Portable Document Format samples: `scripts/download_pdf_samples.py`
  - Image samples: `scripts/download_image_samples.py`
  - Mixed modality samples: `scripts/download_mixed_samples.py`
@@ -0,0 +1,159 @@
+ # Topic modeling
+
+ Biblicus provides a topic modeling analysis backend that reads extracted text artifacts, optionally applies an LLM
+ extraction pass, applies lexical processing, runs BERTopic, and optionally applies an LLM fine-tuning pass for
+ labels. The output is structured JavaScript Object Notation with explicit per-topic evidence.
+
+ ## What topic modeling does
+
+ Topic modeling groups documents into clusters based on shared terms or phrases, then surfaces representative
+ keywords for each cluster. It is a fast way to summarize large corpora, identify dominant themes, and spot outliers
+ without manual labeling. The output is not a classifier; it is an exploratory tool that produces evidence that can
+ be inspected or reviewed by humans.
+
+ ## About BERTopic
+
+ BERTopic combines document embeddings with clustering and a class-based term frequency approach to extract topic
+ keywords. Biblicus supports BERTopic as an optional dependency and forwards its configuration parameters directly to
+ the BERTopic constructor. This allows you to tune clustering behavior while keeping the output in a consistent
+ schema.
+
+ ## Pipeline stages
+
+ - Text collection reads extracted text artifacts from an extraction run.
+ - LLM extraction optionally transforms each document into one or more analysis documents.
+ - Lexical processing optionally normalizes text before BERTopic.
+ - BERTopic produces topic assignments and keyword weights.
+ - LLM fine-tuning optionally replaces topic labels based on sampled documents.
+
+ ## Output structure
+
+ Topic modeling writes a single `output.json` file under the analysis run directory. The output contains:
+
+ - `run.run_id` and `run.stats` for reproducible tracking.
+ - `report.topics` with the modeled topics.
+ - `report.text_collection`, `report.llm_extraction`, `report.lexical_processing`, `report.bertopic_analysis`,
+   and `report.llm_fine_tuning` describing each pipeline stage.
+
+ Each topic record includes:
+
+ - `topic_id`: The BERTopic topic identifier. The outlier topic uses `-1`.
+ - `label`: The human-readable label.
+ - `label_source`: `bertopic` or `llm` depending on the stage that set the label.
+ - `keywords`: Keyword list with weights.
+ - `document_count`: Number of documents assigned to the topic.
+ - `document_ids`: Item identifiers for the assigned documents.
+ - `document_examples`: Sampled document text used for inspection.
+
+ Per-topic behavior is determined by the BERTopic assignments and the optional fine-tuning stage. The lexical
+ processing flags can substantially change tokenization and therefore the resulting topic labels. The outlier
+ `topic_id` `-1` indicates documents that BERTopic could not confidently assign to a cluster.
+
+ ## Configuration reference
+
+ Topic modeling recipes use a strict schema. Unknown fields or type mismatches are errors.
+
+ ### Text source
+
+ - `text_source.sample_size`: Limit the number of documents used for analysis.
+ - `text_source.min_text_characters`: Drop documents shorter than this count.
+
+ ### LLM extraction
+
+ - `llm_extraction.enabled`: Enable the LLM extraction stage.
+ - `llm_extraction.method`: `single` or `itemize` to control whether an input maps to one or many documents.
+ - `llm_extraction.client`: LLM client configuration (requires `biblicus[openai]`).
+ - `llm_extraction.prompt_template`: Prompt template for the extraction stage.
+ - `llm_extraction.system_prompt`: Optional system prompt.
+
+ ### Lexical processing
+
+ - `lexical_processing.enabled`: Enable normalization.
+ - `lexical_processing.lowercase`: Lowercase text before tokenization.
+ - `lexical_processing.strip_punctuation`: Remove punctuation before tokenization.
+ - `lexical_processing.collapse_whitespace`: Normalize repeated whitespace.
+
+ ### BERTopic configuration
+
+ - `bertopic_analysis.parameters`: Mapping of BERTopic constructor parameters.
+ - `bertopic_analysis.vectorizer.ngram_range`: Inclusive n-gram range (for example `[1, 2]`).
+ - `bertopic_analysis.vectorizer.stop_words`: `english` or a list of stop words. Set to `null` to disable.
+
+ ### LLM fine-tuning
+
+ - `llm_fine_tuning.enabled`: Enable LLM topic labeling.
+ - `llm_fine_tuning.client`: LLM client configuration.
+ - `llm_fine_tuning.prompt_template`: Prompt template containing `{keywords}` and `{documents}`.
+ - `llm_fine_tuning.system_prompt`: Optional system prompt.
+ - `llm_fine_tuning.max_keywords`: Maximum keywords included per prompt.
+ - `llm_fine_tuning.max_documents`: Maximum documents included per prompt.
+
+ ## Vectorizer configuration
+
+ Biblicus forwards BERTopic configuration through `bertopic_analysis.parameters` and exposes vectorizer settings
+ through `bertopic_analysis.vectorizer`. To include bigrams, set `ngram_range` to `[1, 2]`. To remove stop words,
+ set `stop_words` to `english` or a list.
+
+ ```yaml
+ bertopic_analysis:
+   parameters:
+     min_topic_size: 10
+     nr_topics: 12
+   vectorizer:
+     ngram_range: [1, 2]
+     stop_words: english
+ ```
+
+ ## Repeatable integration script
+
+ The integration script downloads AG News, runs extraction, and then runs topic modeling with the selected
+ parameters. It prints a summary with the analysis run identifier and the output path.
+
+ ```
+ python3 scripts/topic_modeling_integration.py --corpus corpora/ag_news_demo --force
+ ```
+
+ ### Example: raise topic count
+
+ ```
+ python3 scripts/topic_modeling_integration.py \
+   --corpus corpora/ag_news_demo \
+   --force \
+   --limit 10000 \
+   --vectorizer-ngram-min 1 \
+   --vectorizer-ngram-max 2 \
+   --bertopic-param nr_topics=8 \
+   --bertopic-param min_topic_size=2
+ ```
+
+ ### Example: disable lexical processing and restrict inputs
+
+ ```
+ python3 scripts/topic_modeling_integration.py \
+   --corpus corpora/ag_news_demo \
+   --force \
+   --sample-size 200 \
+   --min-text-characters 200 \
+   --no-lexical-enabled
+ ```
+
+ ### Example: keep lexical processing but preserve punctuation
+
+ ```
+ python3 scripts/topic_modeling_integration.py \
+   --corpus corpora/ag_news_demo \
+   --force \
+   --no-lexical-strip-punctuation
+ ```
+
+ BERTopic parameters are passed directly to the constructor. Use repeated `--bertopic-param key=value` pairs for
+ multiple parameters. Values that look like JSON objects or arrays are parsed as JSON.
+
+ The integration script requires at least 16 documents to avoid BERTopic default UMAP errors. Increase `--limit` or
+ use a larger corpus if you receive a small-corpus error.
+
+ AG News downloads require the `datasets` dependency. Install with:
+
+ ```
+ python3 -m pip install "biblicus[datasets,topic-modeling]"
+ ```
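
For readers who already use BERTopic directly, the `bertopic_analysis` block above corresponds roughly to the following plain BERTopic and scikit-learn calls. This is a sketch of the equivalent library usage, not the Biblicus implementation; `documents` is a placeholder for the extracted text list.

```python
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

documents = ["..."]  # placeholder: the extracted text documents

# bertopic_analysis.vectorizer maps onto a CountVectorizer passed as vectorizer_model.
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")

# bertopic_analysis.parameters are forwarded to the BERTopic constructor.
topic_model = BERTopic(
    vectorizer_model=vectorizer_model,
    min_topic_size=10,
    nr_topics=12,
)

topics, probabilities = topic_model.fit_transform(documents)
```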
@@ -4,8 +4,13 @@ Sphinx configuration for Biblicus documentation.

  from __future__ import annotations

+ import os
+ import sys
  from pathlib import Path

+ from pygments.lexers.special import TextLexer
+ from sphinx.highlighting import lexers
+
  PROJECT_ROOT = Path(__file__).resolve().parent.parent
  SOURCE_ROOT = PROJECT_ROOT / "src"
 
@@ -31,8 +36,6 @@ html_theme_options = {
  }

  # ReadTheDocs integration - canonical URL for SEO
- import os
-
  if os.environ.get("READTHEDOCS"):
      rtd_version = os.environ.get("READTHEDOCS_VERSION", "latest")
      rtd_project = os.environ.get("READTHEDOCS_PROJECT", "biblicus")
@@ -44,12 +47,6 @@ source_suffix = {
  }

  suppress_warnings = ["misc.highlighting_failure"]
-
- import sys
-
  sys.path.insert(0, str(SOURCE_ROOT))

- from pygments.lexers.special import TextLexer
- from sphinx.highlighting import lexers
-
  lexers["mermaid"] = TextLexer()
@@ -16,6 +16,8 @@ Contents
  BACKENDS
  backends/index
  CONTEXT_PACK
+ ANALYSIS
+ PROFILING
  TOPIC_MODELING
  DEMOS
  USER_CONFIGURATION