biblicus 0.5.0__tar.gz → 0.6.0__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (153)
  1. {biblicus-0.5.0/src/biblicus.egg-info → biblicus-0.6.0}/PKG-INFO +39 -3
  2. {biblicus-0.5.0 → biblicus-0.6.0}/README.md +38 -2
  3. {biblicus-0.5.0 → biblicus-0.6.0}/docs/FEATURE_INDEX.md +15 -0
  4. biblicus-0.6.0/docs/KNOWLEDGE_BASE.md +68 -0
  5. {biblicus-0.5.0 → biblicus-0.6.0}/docs/ROADMAP.md +59 -0
  6. {biblicus-0.5.0 → biblicus-0.6.0}/docs/api.rst +4 -0
  7. {biblicus-0.5.0 → biblicus-0.6.0}/docs/index.rst +1 -0
  8. biblicus-0.6.0/features/knowledge_base.feature +55 -0
  9. biblicus-0.6.0/features/steps/knowledge_base_steps.py +90 -0
  10. {biblicus-0.5.0 → biblicus-0.6.0}/pyproject.toml +1 -1
  11. {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/__init__.py +3 -1
  12. biblicus-0.6.0/src/biblicus/knowledge_base.py +191 -0
  13. {biblicus-0.5.0 → biblicus-0.6.0/src/biblicus.egg-info}/PKG-INFO +39 -3
  14. {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus.egg-info/SOURCES.txt +4 -0
  15. {biblicus-0.5.0 → biblicus-0.6.0}/LICENSE +0 -0
  16. {biblicus-0.5.0 → biblicus-0.6.0}/MANIFEST.in +0 -0
  17. {biblicus-0.5.0 → biblicus-0.6.0}/THIRD_PARTY_NOTICES.md +0 -0
  18. {biblicus-0.5.0 → biblicus-0.6.0}/datasets/wikipedia_mini.json +0 -0
  19. {biblicus-0.5.0 → biblicus-0.6.0}/docs/ARCHITECTURE.md +0 -0
  20. {biblicus-0.5.0 → biblicus-0.6.0}/docs/BACKENDS.md +0 -0
  21. {biblicus-0.5.0 → biblicus-0.6.0}/docs/CONTEXT_PACK.md +0 -0
  22. {biblicus-0.5.0 → biblicus-0.6.0}/docs/CORPUS.md +0 -0
  23. {biblicus-0.5.0 → biblicus-0.6.0}/docs/CORPUS_DESIGN.md +0 -0
  24. {biblicus-0.5.0 → biblicus-0.6.0}/docs/DEMOS.md +0 -0
  25. {biblicus-0.5.0 → biblicus-0.6.0}/docs/EXTRACTION.md +0 -0
  26. {biblicus-0.5.0 → biblicus-0.6.0}/docs/TESTING.md +0 -0
  27. {biblicus-0.5.0 → biblicus-0.6.0}/docs/USER_CONFIGURATION.md +0 -0
  28. {biblicus-0.5.0 → biblicus-0.6.0}/docs/conf.py +0 -0
  29. {biblicus-0.5.0 → biblicus-0.6.0}/features/backend_validation.feature +0 -0
  30. {biblicus-0.5.0 → biblicus-0.6.0}/features/biblicus_corpus.feature +0 -0
  31. {biblicus-0.5.0 → biblicus-0.6.0}/features/cli_entrypoint.feature +0 -0
  32. {biblicus-0.5.0 → biblicus-0.6.0}/features/cli_parsing.feature +0 -0
  33. {biblicus-0.5.0 → biblicus-0.6.0}/features/content_sniffing.feature +0 -0
  34. {biblicus-0.5.0 → biblicus-0.6.0}/features/context_pack.feature +0 -0
  35. {biblicus-0.5.0 → biblicus-0.6.0}/features/context_pack_cli.feature +0 -0
  36. {biblicus-0.5.0 → biblicus-0.6.0}/features/corpus_edge_cases.feature +0 -0
  37. {biblicus-0.5.0 → biblicus-0.6.0}/features/corpus_identity.feature +0 -0
  38. {biblicus-0.5.0 → biblicus-0.6.0}/features/corpus_purge.feature +0 -0
  39. {biblicus-0.5.0 → biblicus-0.6.0}/features/crawl.feature +0 -0
  40. {biblicus-0.5.0 → biblicus-0.6.0}/features/environment.py +0 -0
  41. {biblicus-0.5.0 → biblicus-0.6.0}/features/error_cases.feature +0 -0
  42. {biblicus-0.5.0 → biblicus-0.6.0}/features/evaluation.feature +0 -0
  43. {biblicus-0.5.0 → biblicus-0.6.0}/features/evidence_processing.feature +0 -0
  44. {biblicus-0.5.0 → biblicus-0.6.0}/features/extraction_error_handling.feature +0 -0
  45. {biblicus-0.5.0 → biblicus-0.6.0}/features/extraction_run_lifecycle.feature +0 -0
  46. {biblicus-0.5.0 → biblicus-0.6.0}/features/extraction_selection.feature +0 -0
  47. {biblicus-0.5.0 → biblicus-0.6.0}/features/extraction_selection_longest.feature +0 -0
  48. {biblicus-0.5.0 → biblicus-0.6.0}/features/extractor_pipeline.feature +0 -0
  49. {biblicus-0.5.0 → biblicus-0.6.0}/features/extractor_validation.feature +0 -0
  50. {biblicus-0.5.0 → biblicus-0.6.0}/features/frontmatter.feature +0 -0
  51. {biblicus-0.5.0 → biblicus-0.6.0}/features/hook_config_validation.feature +0 -0
  52. {biblicus-0.5.0 → biblicus-0.6.0}/features/hook_error_handling.feature +0 -0
  53. {biblicus-0.5.0 → biblicus-0.6.0}/features/import_tree.feature +0 -0
  54. {biblicus-0.5.0 → biblicus-0.6.0}/features/ingest_sources.feature +0 -0
  55. {biblicus-0.5.0 → biblicus-0.6.0}/features/integration_audio_samples.feature +0 -0
  56. {biblicus-0.5.0 → biblicus-0.6.0}/features/integration_image_samples.feature +0 -0
  57. {biblicus-0.5.0 → biblicus-0.6.0}/features/integration_mixed_corpus.feature +0 -0
  58. {biblicus-0.5.0 → biblicus-0.6.0}/features/integration_mixed_extraction.feature +0 -0
  59. {biblicus-0.5.0 → biblicus-0.6.0}/features/integration_ocr_image_extraction.feature +0 -0
  60. {biblicus-0.5.0 → biblicus-0.6.0}/features/integration_pdf_retrieval.feature +0 -0
  61. {biblicus-0.5.0 → biblicus-0.6.0}/features/integration_pdf_samples.feature +0 -0
  62. {biblicus-0.5.0 → biblicus-0.6.0}/features/integration_unstructured_extraction.feature +0 -0
  63. {biblicus-0.5.0 → biblicus-0.6.0}/features/integration_wikipedia.feature +0 -0
  64. {biblicus-0.5.0 → biblicus-0.6.0}/features/lifecycle_hooks.feature +0 -0
  65. {biblicus-0.5.0 → biblicus-0.6.0}/features/model_validation.feature +0 -0
  66. {biblicus-0.5.0 → biblicus-0.6.0}/features/ocr_extractor.feature +0 -0
  67. {biblicus-0.5.0 → biblicus-0.6.0}/features/pdf_text_extraction.feature +0 -0
  68. {biblicus-0.5.0 → biblicus-0.6.0}/features/python_api.feature +0 -0
  69. {biblicus-0.5.0 → biblicus-0.6.0}/features/python_hook_logging.feature +0 -0
  70. {biblicus-0.5.0 → biblicus-0.6.0}/features/query_processing.feature +0 -0
  71. {biblicus-0.5.0 → biblicus-0.6.0}/features/retrieval_budget.feature +0 -0
  72. {biblicus-0.5.0 → biblicus-0.6.0}/features/retrieval_scan.feature +0 -0
  73. {biblicus-0.5.0 → biblicus-0.6.0}/features/retrieval_sqlite_full_text_search.feature +0 -0
  74. {biblicus-0.5.0 → biblicus-0.6.0}/features/retrieval_uses_extraction_run.feature +0 -0
  75. {biblicus-0.5.0 → biblicus-0.6.0}/features/retrieval_utilities.feature +0 -0
  76. {biblicus-0.5.0 → biblicus-0.6.0}/features/source_loading.feature +0 -0
  77. {biblicus-0.5.0 → biblicus-0.6.0}/features/steps/backend_steps.py +0 -0
  78. {biblicus-0.5.0 → biblicus-0.6.0}/features/steps/cli_parsing_steps.py +0 -0
  79. {biblicus-0.5.0 → biblicus-0.6.0}/features/steps/cli_steps.py +0 -0
  80. {biblicus-0.5.0 → biblicus-0.6.0}/features/steps/context_pack_steps.py +0 -0
  81. {biblicus-0.5.0 → biblicus-0.6.0}/features/steps/crawl_steps.py +0 -0
  82. {biblicus-0.5.0 → biblicus-0.6.0}/features/steps/evidence_processing_steps.py +0 -0
  83. {biblicus-0.5.0 → biblicus-0.6.0}/features/steps/extraction_run_lifecycle_steps.py +0 -0
  84. {biblicus-0.5.0 → biblicus-0.6.0}/features/steps/extraction_steps.py +0 -0
  85. {biblicus-0.5.0 → biblicus-0.6.0}/features/steps/extractor_steps.py +0 -0
  86. {biblicus-0.5.0 → biblicus-0.6.0}/features/steps/frontmatter_steps.py +0 -0
  87. {biblicus-0.5.0 → biblicus-0.6.0}/features/steps/model_steps.py +0 -0
  88. {biblicus-0.5.0 → biblicus-0.6.0}/features/steps/openai_steps.py +0 -0
  89. {biblicus-0.5.0 → biblicus-0.6.0}/features/steps/pdf_steps.py +0 -0
  90. {biblicus-0.5.0 → biblicus-0.6.0}/features/steps/python_api_steps.py +0 -0
  91. {biblicus-0.5.0 → biblicus-0.6.0}/features/steps/rapidocr_steps.py +0 -0
  92. {biblicus-0.5.0 → biblicus-0.6.0}/features/steps/retrieval_steps.py +0 -0
  93. {biblicus-0.5.0 → biblicus-0.6.0}/features/steps/stt_steps.py +0 -0
  94. {biblicus-0.5.0 → biblicus-0.6.0}/features/steps/unstructured_steps.py +0 -0
  95. {biblicus-0.5.0 → biblicus-0.6.0}/features/steps/user_config_steps.py +0 -0
  96. {biblicus-0.5.0 → biblicus-0.6.0}/features/streaming_ingest.feature +0 -0
  97. {biblicus-0.5.0 → biblicus-0.6.0}/features/stt_extractor.feature +0 -0
  98. {biblicus-0.5.0 → biblicus-0.6.0}/features/text_extraction_runs.feature +0 -0
  99. {biblicus-0.5.0 → biblicus-0.6.0}/features/token_budget.feature +0 -0
  100. {biblicus-0.5.0 → biblicus-0.6.0}/features/unstructured_extractor.feature +0 -0
  101. {biblicus-0.5.0 → biblicus-0.6.0}/features/user_config.feature +0 -0
  102. {biblicus-0.5.0 → biblicus-0.6.0}/scripts/download_audio_samples.py +0 -0
  103. {biblicus-0.5.0 → biblicus-0.6.0}/scripts/download_image_samples.py +0 -0
  104. {biblicus-0.5.0 → biblicus-0.6.0}/scripts/download_mixed_samples.py +0 -0
  105. {biblicus-0.5.0 → biblicus-0.6.0}/scripts/download_pdf_samples.py +0 -0
  106. {biblicus-0.5.0 → biblicus-0.6.0}/scripts/download_wikipedia.py +0 -0
  107. {biblicus-0.5.0 → biblicus-0.6.0}/scripts/readme_end_to_end_demo.py +0 -0
  108. {biblicus-0.5.0 → biblicus-0.6.0}/scripts/test.py +0 -0
  109. {biblicus-0.5.0 → biblicus-0.6.0}/setup.cfg +0 -0
  110. {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/__main__.py +0 -0
  111. {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/_vendor/dotyaml/__init__.py +0 -0
  112. {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/_vendor/dotyaml/interpolation.py +0 -0
  113. {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/_vendor/dotyaml/loader.py +0 -0
  114. {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/_vendor/dotyaml/transformer.py +0 -0
  115. {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/backends/__init__.py +0 -0
  116. {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/backends/base.py +0 -0
  117. {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/backends/scan.py +0 -0
  118. {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/backends/sqlite_full_text_search.py +0 -0
  119. {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/cli.py +0 -0
  120. {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/constants.py +0 -0
  121. {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/context.py +0 -0
  122. {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/corpus.py +0 -0
  123. {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/crawl.py +0 -0
  124. {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/errors.py +0 -0
  125. {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/evaluation.py +0 -0
  126. {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/evidence_processing.py +0 -0
  127. {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/extraction.py +0 -0
  128. {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/extractors/__init__.py +0 -0
  129. {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/extractors/base.py +0 -0
  130. {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/extractors/metadata_text.py +0 -0
  131. {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/extractors/openai_stt.py +0 -0
  132. {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/extractors/pass_through_text.py +0 -0
  133. {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/extractors/pdf_text.py +0 -0
  134. {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/extractors/pipeline.py +0 -0
  135. {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/extractors/rapidocr_text.py +0 -0
  136. {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/extractors/select_longest_text.py +0 -0
  137. {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/extractors/select_text.py +0 -0
  138. {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/extractors/unstructured_text.py +0 -0
  139. {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/frontmatter.py +0 -0
  140. {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/hook_logging.py +0 -0
  141. {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/hook_manager.py +0 -0
  142. {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/hooks.py +0 -0
  143. {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/ignore.py +0 -0
  144. {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/models.py +0 -0
  145. {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/retrieval.py +0 -0
  146. {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/sources.py +0 -0
  147. {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/time.py +0 -0
  148. {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/uris.py +0 -0
  149. {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus/user_config.py +0 -0
  150. {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus.egg-info/dependency_links.txt +0 -0
  151. {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus.egg-info/entry_points.txt +0 -0
  152. {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus.egg-info/requires.txt +0 -0
  153. {biblicus-0.5.0 → biblicus-0.6.0}/src/biblicus.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: biblicus
3
- Version: 0.5.0
3
+ Version: 0.6.0
4
4
  Summary: Command line interface and Python library for corpus ingestion, retrieval, and evaluation.
5
5
  License: MIT
6
6
  Requires-Python: >=3.9
@@ -45,6 +45,40 @@ It can be used alongside LangGraph, Tactus, Pydantic AI, any agent framework, or
45
45
 
46
46
  See [retrieval augmented generation overview] for a short introduction to the idea.
47
47
 
48
+ ## Start with a knowledge base
49
+
50
+ If you just want to hand a folder to your assistant and move on, use the high-level knowledge base interface. The folder can be nothing more than a handful of plain text files. You are not choosing a retrieval strategy yet. You are just collecting.
51
+
52
+ This example assumes a folder called `notes/` with a few `.txt` files. The knowledge base handles sensible defaults and still gives you a clear context pack for your model call.
53
+
54
+ ```python
55
+ from biblicus.knowledge_base import KnowledgeBase
56
+
57
+
58
+ kb = KnowledgeBase.from_folder("notes")
59
+ result = kb.query("Primary button style preference")
60
+ context_pack = kb.context_pack(result, max_tokens=800)
61
+
62
+ print(context_pack.text)
63
+ ```
64
+
65
+ If you want to run a real, executable version of this story, use `scripts/readme_end_to_end_demo.py` from a fresh clone.
66
+
67
+ This simplified sequence diagram shows the same idea at a high level.
68
+
69
+ ```mermaid
70
+ %%{init: {"theme": "base", "themeVariables": {"primaryColor": "#f3e5f5", "primaryTextColor": "#111111", "primaryBorderColor": "#8e24aa", "lineColor": "#90a4ae", "secondaryColor": "#eceff1", "tertiaryColor": "#ffffff", "noteBkgColor": "#ffffff", "noteTextColor": "#111111", "actorBkg": "#f3e5f5", "actorBorder": "#8e24aa", "actorTextColor": "#111111"}}}%%
71
+ sequenceDiagram
72
+ participant App as Your assistant code
73
+ participant KB as Knowledge base
74
+ participant LLM as Large language model
75
+
76
+ App->>KB: query
77
+ KB-->>App: evidence and context
78
+ App->>LLM: context plus prompt
79
+ LLM-->>App: response draft
80
+ ```
81
+
48
82
  ## A simple mental model
49
83
 
50
84
  Think in three stages.
@@ -153,11 +187,11 @@ biblicus crawl --corpus corpora/example \\
153
187
  --tag crawled
154
188
  ```
155
189
 
156
- ## End-to-end example: evidence to assistant context
190
+ ## End-to-end example: lower-level control
157
191
 
158
192
  The command-line interface returns JavaScript Object Notation by default. This makes it easy to use Biblicus in scripts and to treat retrieval as a deterministic, testable step.
159
193
 
160
- Start with a few short “memories” from a chat system. Each memory is stored as a normal item in the corpus.
194
+ This version shows the lower-level pieces explicitly. You are building the corpus, controlling each memory string, choosing the backend, and shaping the context pack yourself.
161
195
 
162
196
  ```python
163
197
  from biblicus.backends import get_backend
@@ -383,6 +417,7 @@ The documents below follow the pipeline from raw items to model context:
383
417
 
384
418
  - [Corpus][corpus]
385
419
  - [Text extraction][text-extraction]
420
+ - [Knowledge base][knowledge-base]
386
421
  - [Backends][backends]
387
422
  - [Context packs][context-packs]
388
423
  - [Testing and evaluation][testing]
@@ -485,6 +520,7 @@ License terms are in `LICENSE`.
485
520
  [roadmap]: docs/ROADMAP.md
486
521
  [feature-index]: docs/FEATURE_INDEX.md
487
522
  [corpus]: docs/CORPUS.md
523
+ [knowledge-base]: docs/KNOWLEDGE_BASE.md
488
524
  [text-extraction]: docs/EXTRACTION.md
489
525
  [user-configuration]: docs/USER_CONFIGURATION.md
490
526
  [backends]: docs/BACKENDS.md
@@ -16,6 +16,40 @@ It can be used alongside LangGraph, Tactus, Pydantic AI, any agent framework, or
16
16
 
17
17
  See [retrieval augmented generation overview] for a short introduction to the idea.
18
18
 
19
+ ## Start with a knowledge base
20
+
21
+ If you just want to hand a folder to your assistant and move on, use the high-level knowledge base interface. The folder can be nothing more than a handful of plain text files. You are not choosing a retrieval strategy yet. You are just collecting.
22
+
23
+ This example assumes a folder called `notes/` with a few `.txt` files. The knowledge base handles sensible defaults and still gives you a clear context pack for your model call.
24
+
25
+ ```python
26
+ from biblicus.knowledge_base import KnowledgeBase
27
+
28
+
29
+ kb = KnowledgeBase.from_folder("notes")
30
+ result = kb.query("Primary button style preference")
31
+ context_pack = kb.context_pack(result, max_tokens=800)
32
+
33
+ print(context_pack.text)
34
+ ```
35
+
36
+ If you want to run a real, executable version of this story, use `scripts/readme_end_to_end_demo.py` from a fresh clone.
37
+
38
+ This simplified sequence diagram shows the same idea at a high level.
39
+
40
+ ```mermaid
41
+ %%{init: {"theme": "base", "themeVariables": {"primaryColor": "#f3e5f5", "primaryTextColor": "#111111", "primaryBorderColor": "#8e24aa", "lineColor": "#90a4ae", "secondaryColor": "#eceff1", "tertiaryColor": "#ffffff", "noteBkgColor": "#ffffff", "noteTextColor": "#111111", "actorBkg": "#f3e5f5", "actorBorder": "#8e24aa", "actorTextColor": "#111111"}}}%%
42
+ sequenceDiagram
43
+ participant App as Your assistant code
44
+ participant KB as Knowledge base
45
+ participant LLM as Large language model
46
+
47
+ App->>KB: query
48
+ KB-->>App: evidence and context
49
+ App->>LLM: context plus prompt
50
+ LLM-->>App: response draft
51
+ ```
52
+
19
53
  ## A simple mental model
20
54
 
21
55
  Think in three stages.
@@ -124,11 +158,11 @@ biblicus crawl --corpus corpora/example \\
124
158
  --tag crawled
125
159
  ```
126
160
 
127
- ## End-to-end example: evidence to assistant context
161
+ ## End-to-end example: lower-level control
128
162
 
129
163
  The command-line interface returns JavaScript Object Notation by default. This makes it easy to use Biblicus in scripts and to treat retrieval as a deterministic, testable step.
130
164
 
131
- Start with a few short “memories” from a chat system. Each memory is stored as a normal item in the corpus.
165
+ This version shows the lower-level pieces explicitly. You are building the corpus, controlling each memory string, choosing the backend, and shaping the context pack yourself.
132
166
 
133
167
  ```python
134
168
  from biblicus.backends import get_backend
@@ -354,6 +388,7 @@ The documents below follow the pipeline from raw items to model context:
354
388
 
355
389
  - [Corpus][corpus]
356
390
  - [Text extraction][text-extraction]
391
+ - [Knowledge base][knowledge-base]
357
392
  - [Backends][backends]
358
393
  - [Context packs][context-packs]
359
394
  - [Testing and evaluation][testing]
@@ -456,6 +491,7 @@ License terms are in `LICENSE`.
456
491
  [roadmap]: docs/ROADMAP.md
457
492
  [feature-index]: docs/FEATURE_INDEX.md
458
493
  [corpus]: docs/CORPUS.md
494
+ [knowledge-base]: docs/KNOWLEDGE_BASE.md
459
495
  [text-extraction]: docs/EXTRACTION.md
460
496
  [user-configuration]: docs/USER_CONFIGURATION.md
461
497
  [backends]: docs/BACKENDS.md
@@ -208,6 +208,21 @@ Primary implementation:
208
208
 
209
209
  - `src/biblicus/context.py`
210
210
 
211
+ ## Knowledge base
212
+
213
+ What it does:
214
+
215
+ - Provides a turnkey interface that accepts a folder and returns a ready-to-query workflow.
216
+ - Applies sensible defaults for import, retrieval, and context pack shaping.
217
+
218
+ Behavior specifications:
219
+
220
+ - `features/knowledge_base.feature`
221
+
222
+ Primary implementation:
223
+
224
+ - `src/biblicus/knowledge_base.py`
225
+
211
226
  ## Testing, coverage, and documentation build
212
227
 
213
228
  What it does:
@@ -0,0 +1,68 @@
1
+ # Knowledge base
2
+
3
+ The knowledge base is the high‑level, turnkey workflow that makes Biblicus feel effortless. You hand it a folder. It chooses sensible defaults, builds a retrieval run, and gives you evidence you can turn into context.
4
+
5
+ This is the right layer when you want to use Biblicus without spending time on setup. You can still override the defaults later when you want fine‑grained control.
6
+
7
+ ## What it does
8
+
9
+ - Creates or opens a corpus at a chosen location (or a temporary location if you do not provide one).
10
+ - Imports a folder tree into that corpus.
11
+ - Builds a retrieval run using a default backend.
12
+ - Exposes a simple `query` method that returns evidence.
13
+ - Exposes a `context_pack` helper to shape evidence into model context.
14
+
15
+ ## Minimal use
16
+
17
+ ```python
18
+ from biblicus.knowledge_base import KnowledgeBase
19
+
20
+
21
+ kb = KnowledgeBase.from_folder("notes")
22
+ result = kb.query("Primary button style preference")
23
+ context_pack = kb.context_pack(result, max_tokens=800)
24
+
25
+ print(context_pack.text)
26
+ ```
27
+
28
+ ## Default behavior
29
+
30
+ The knowledge base wraps existing primitives. Defaults are explicit and deterministic.
31
+
32
+ - **Corpus**: stored on disk and fully inspectable.
33
+ - **Import**: uses the folder tree import, preserving relative paths.
34
+ - **Backend**: defaults to the `scan` backend.
35
+ - **Query budget**: defaults to a small, conservative evidence budget.
36
+
37
+ ## Overrides
38
+
39
+ You can override the defaults when needed.
40
+
41
+ ```python
42
+ from biblicus.knowledge_base import KnowledgeBase
43
+ from biblicus.models import QueryBudget
44
+
45
+
46
+ kb = KnowledgeBase.from_folder(
47
+ "notes",
48
+ backend_id="scan",
49
+ recipe_name="Knowledge base demo",
50
+ query_budget=QueryBudget(max_total_items=10, max_total_characters=4000, max_items_per_source=None),
51
+ tags=["memory"],
52
+ corpus_root="corpora/knowledge-base",
53
+ )
54
+ ```
55
+
56
+ ## How it relates to lower‑level control
57
+
58
+ The knowledge base is a convenience layer. It uses the same underlying parts that the lower‑level examples use.
59
+
60
+ - `Corpus` for ingestion and storage
61
+ - `import_tree` for folder ingestion
62
+ - A backend run (`scan` by default)
63
+ - `QueryBudget` for evidence limits
64
+ - `ContextPackPolicy` and token fitting for context shaping
65
+
66
+ You can always drop down to those lower‑level primitives when you need more control.
67
+
68
+ If the high‑level workflow is not enough, switch to `Corpus`, `get_backend`, and `ContextPackPolicy` directly.
@@ -46,6 +46,65 @@ Acceptance checks:
46
46
  - Behavior specifications cover policy selection and budgeting behaviors.
47
47
  - Example outputs show how context packs differ across policies.
48
48
 
49
+ ## Next: extraction backends (OCR and document understanding)
50
+
51
+ Goal: treat optical character recognition and document understanding as pluggable extractors with consistent inputs and outputs.
52
+
53
+ Deliverables:
54
+
55
+ - A baseline OCR extractor that is fast and local for smoke tests.
56
+ - A higher quality OCR extractor candidate (for example: Paddle OCR or Docling OCR).
57
+ - A general document understanding extractor candidate (for example: Docling or Unstructured).
58
+ - A consistent output contract that captures text plus optional confidence and per-page metadata.
59
+ - A selector policy for choosing between multiple extractor outputs in a pipeline.
60
+ - A shared evaluation harness for extraction backends using the same corpus and dataset.
61
+
62
+ Acceptance checks:
63
+
64
+ - Behavior specifications cover extractor selection and output provenance.
65
+ - Evaluation reports compare accuracy, processable fraction, latency, and cost.
66
+
67
+ ## Next: corpus analysis tools
68
+
69
+ Goal: provide lightweight analysis utilities that summarize corpus themes and guide curation.
70
+
71
+ Deliverables:
72
+
73
+ - A topic modeling workflow for corpus analysis (for example: BERTopic).
74
+ - A report that highlights dominant themes and outliers.
75
+ - A way to compare topic distributions across corpora or corpus snapshots.
76
+
77
+ Acceptance checks:
78
+
79
+ - Analysis is reproducible for the same corpus state.
80
+ - Reports are exportable and readable without custom tooling.
81
+
82
+ ### Candidate backend ecosystem (for planning and evaluation)
83
+
84
+ Document understanding and OCR blur together at the interface level in Biblicus, so the roadmap treats them as extractor candidates with the same input/output contract.
85
+
86
+ Docling family candidates:
87
+
88
+ - Docling (document understanding with structured outputs)
89
+ - docling-ocr (OCR component in the Docling ecosystem)
90
+
91
+ General-purpose extraction candidates:
92
+
93
+ - Unstructured (element-oriented extraction for many formats)
94
+ - MarkItDown (lightweight conversion to Markdown)
95
+ - Kreuzberg (speed-focused extraction for bulk workflows)
96
+ - ExtractThinker (schema-driven extraction using Pydantic contracts)
97
+
98
+ Ecosystem adapters:
99
+
100
+ - LangChain document loaders (uniform loader interface across many sources)
101
+
102
+ ### Guidance for choosing early targets
103
+
104
+ - If you need layout and table understanding, prioritize Docling and docling-ocr.
105
+ - If you need speed and simplicity, prioritize MarkItDown or Kreuzberg.
106
+ - If you need schema-first extraction, prioritize ExtractThinker layered on an OCR or document extractor.
107
+
49
108
  ## Later: alternate backends and hosting modes
50
109
 
51
110
  Goal: broaden the backend surface while keeping the core predictable.
@@ -8,6 +8,10 @@ Core
8
8
  :members:
9
9
  :undoc-members:
10
10
 
11
+ .. automodule:: biblicus.knowledge_base
12
+ :members:
13
+ :undoc-members:
14
+
11
15
  .. automodule:: biblicus.models
12
16
  :members:
13
17
  :undoc-members:
@@ -11,6 +11,7 @@ Contents
11
11
 
12
12
  CORPUS
13
13
  EXTRACTION
14
+ KNOWLEDGE_BASE
14
15
  BACKENDS
15
16
  CONTEXT_PACK
16
17
  DEMOS
@@ -0,0 +1,55 @@
1
+ Feature: Knowledge base (turnkey workflow)
2
+ A knowledge base is a high-level workflow that hides the plumbing while keeping behavior explicit.
3
+ It should accept a folder, ingest files, build defaults, and allow retrieval with minimal configuration.
4
+
5
+ Scenario: Build a knowledge base from a folder and query it
6
+ Given a folder "notes" exists with text files:
7
+ | filename | contents |
8
+ | note1.txt | The user's name is Tactus Maximus. |
9
+ | note2.txt | Primary button style preference: the user's favorite color is magenta. |
10
+ When I create a knowledge base from folder "notes" only
11
+ And I query the knowledge base for "Primary button style preference"
12
+ Then the knowledge base returns evidence that includes "favorite color is magenta"
13
+
14
+ Scenario: Knowledge base context pack is shaped with a token budget
15
+ Given a folder "notes" exists with text files:
16
+ | filename | contents |
17
+ | note1.txt | one two three |
18
+ | note2.txt | four five six |
19
+ When I create a knowledge base from folder "notes" only
20
+ And I query the knowledge base for "one"
21
+ And I build a context pack from the knowledge base query with token budget 3
22
+ Then the context pack text equals:
23
+ """
24
+ one two three
25
+ """
26
+
27
+ Scenario: Knowledge base context pack defaults to no token budget
28
+ Given a folder "notes" exists with text files:
29
+ | filename | contents |
30
+ | note1.txt | alpha beta |
31
+ When I create a knowledge base from folder "notes" only
32
+ And I query the knowledge base for "alpha"
33
+ And I build a context pack from the knowledge base query without a token budget
34
+ Then the context pack text equals:
35
+ """
36
+ alpha beta
37
+ """
38
+
39
+ Scenario: Knowledge base rejects missing folder
40
+ When I attempt to create a knowledge base from folder "missing"
41
+ Then the knowledge base error includes "does not exist"
42
+
43
+ Scenario: Knowledge base rejects non-folder path
44
+ Given a file "not-a-folder.txt" exists with contents "hello"
45
+ When I attempt to create a knowledge base from folder "not-a-folder.txt"
46
+ Then the knowledge base error includes "not a directory"
47
+
48
+ Scenario: Knowledge base can use an explicit corpus root
49
+ Given a folder "notes" exists with text files:
50
+ | filename | contents |
51
+ | note1.txt | alpha |
52
+ And a folder "kb-root" exists
53
+ When I create a knowledge base from folder "notes" using corpus root "kb-root"
54
+ And I query the knowledge base for "alpha"
55
+ Then the knowledge base returns evidence that includes "alpha"
@@ -0,0 +1,90 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+
5
+ from behave import given, then, when
6
+
7
+ from biblicus.knowledge_base import KnowledgeBase
8
+
9
+
10
+ @given('a folder "{folder}" exists')
11
+ def given_folder_exists(context, folder: str) -> None:
12
+ root = Path(context.workdir) / folder
13
+ root.mkdir(parents=True, exist_ok=True)
14
+ context.knowledge_base_folder = root
15
+
16
+
17
+ @given('a folder "{folder}" exists with text files:')
18
+ def given_folder_exists_with_text_files(context, folder: str) -> None:
19
+ root = Path(context.workdir) / folder
20
+ root.mkdir(parents=True, exist_ok=True)
21
+ for row in context.table:
22
+ filename = row["filename"]
23
+ contents = row["contents"]
24
+ path = root / filename
25
+ path.write_text(contents, encoding="utf-8")
26
+ context.knowledge_base_folder = root
27
+
28
+
29
+ @given('a file "{filename}" exists with contents "{contents}"')
30
+ def given_file_exists_with_contents(context, filename: str, contents: str) -> None:
31
+ path = Path(context.workdir) / filename
32
+ path.write_text(contents, encoding="utf-8")
33
+ context.knowledge_base_file = path
34
+
35
+
36
+ @when('I create a knowledge base from folder "{folder}" only')
37
+ def when_create_knowledge_base_from_folder(context, folder: str) -> None:
38
+ root = Path(context.workdir) / folder
39
+ context.knowledge_base = KnowledgeBase.from_folder(root)
40
+
41
+
42
+ @when('I create a knowledge base from folder "{folder}" using corpus root "{corpus_root}"')
43
+ def when_create_knowledge_base_from_folder_with_corpus_root(
44
+ context, folder: str, corpus_root: str
45
+ ) -> None:
46
+ root = Path(context.workdir) / folder
47
+ corpus_root_path = Path(context.workdir) / corpus_root
48
+ context.knowledge_base = KnowledgeBase.from_folder(root, corpus_root=corpus_root_path)
49
+
50
+
51
+ @when('I attempt to create a knowledge base from folder "{folder}"')
52
+ def when_attempt_create_knowledge_base_from_folder(context, folder: str) -> None:
53
+ root = Path(context.workdir) / folder
54
+ try:
55
+ KnowledgeBase.from_folder(root)
56
+ except (FileNotFoundError, NotADirectoryError) as exc:
57
+ context.knowledge_base_error = exc
58
+
59
+
60
+ @then('the knowledge base error includes "{text}"')
61
+ def then_knowledge_base_error_includes(context, text: str) -> None:
62
+ error = context.knowledge_base_error
63
+ assert text in str(error)
64
+
65
+
66
+ @when('I query the knowledge base for "{query_text}"')
67
+ def when_query_knowledge_base(context, query_text: str) -> None:
68
+ context.knowledge_base_result = context.knowledge_base.query(query_text)
69
+
70
+
71
+ @when("I build a context pack from the knowledge base query with token budget {max_tokens:d}")
72
+ def when_build_context_pack_from_knowledge_base_query(context, max_tokens: int) -> None:
73
+ context.context_pack = context.knowledge_base.context_pack(
74
+ context.knowledge_base_result,
75
+ max_tokens=max_tokens,
76
+ )
77
+
78
+
79
+ @when("I build a context pack from the knowledge base query without a token budget")
80
+ def when_build_context_pack_from_knowledge_base_query_without_budget(context) -> None:
81
+ context.context_pack = context.knowledge_base.context_pack(
82
+ context.knowledge_base_result,
83
+ )
84
+
85
+
86
+ @then('the knowledge base returns evidence that includes "{text}"')
87
+ def then_knowledge_base_returns_evidence_that_includes(context, text: str) -> None:
88
+ evidence_items = context.knowledge_base_result.evidence
89
+ evidence_texts = [item.text or "" for item in evidence_items]
90
+ assert any(text in evidence_text for evidence_text in evidence_texts)
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "biblicus"
7
- version = "0.5.0"
7
+ version = "0.6.0"
8
8
  description = "Command line interface and Python library for corpus ingestion, retrieval, and evaluation."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.9"
@@ -3,6 +3,7 @@ Biblicus public package interface.
3
3
  """
4
4
 
5
5
  from .corpus import Corpus
6
+ from .knowledge_base import KnowledgeBase
6
7
  from .models import (
7
8
  CorpusConfig,
8
9
  Evidence,
@@ -19,10 +20,11 @@ __all__ = [
19
20
  "CorpusConfig",
20
21
  "Evidence",
21
22
  "IngestResult",
23
+ "KnowledgeBase",
22
24
  "QueryBudget",
23
25
  "RecipeManifest",
24
26
  "RetrievalResult",
25
27
  "RetrievalRun",
26
28
  ]
27
29
 
28
- __version__ = "0.5.0"
30
+ __version__ = "0.6.0"