cbrkit 1.2.0__tar.gz → 1.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118)
  1. {cbrkit-1.2.0 → cbrkit-1.4.0}/PKG-INFO +76 -47
  2. {cbrkit-1.2.0 → cbrkit-1.4.0}/README.md +58 -38
  3. {cbrkit-1.2.0 → cbrkit-1.4.0}/pyproject.toml +64 -48
  4. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/api.py +2 -2
  5. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/eval/common.py +12 -9
  6. cbrkit-1.4.0/src/cbrkit/filter.py +81 -0
  7. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/helpers.py +174 -157
  8. cbrkit-1.4.0/src/cbrkit/indexable/__init__.py +54 -0
  9. cbrkit-1.4.0/src/cbrkit/indexable/_common.py +262 -0
  10. cbrkit-1.4.0/src/cbrkit/indexable/chromadb.py +271 -0
  11. cbrkit-1.4.0/src/cbrkit/indexable/lancedb.py +290 -0
  12. cbrkit-1.4.0/src/cbrkit/indexable/pgvector.py +345 -0
  13. cbrkit-1.4.0/src/cbrkit/indexable/sqlalchemy.py +733 -0
  14. cbrkit-1.4.0/src/cbrkit/indexable/sqlite_vec.py +403 -0
  15. cbrkit-1.4.0/src/cbrkit/indexable/zvec.py +353 -0
  16. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/loaders.py +34 -7
  17. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/retain/build.py +2 -2
  18. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/retain/storage.py +10 -5
  19. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/retrieval/__init__.py +16 -0
  20. cbrkit-1.4.0/src/cbrkit/retrieval/apply.py +293 -0
  21. cbrkit-1.4.0/src/cbrkit/retrieval/indexable/__init__.py +34 -0
  22. cbrkit-1.4.0/src/cbrkit/retrieval/indexable/_common.py +472 -0
  23. cbrkit-1.4.0/src/cbrkit/retrieval/indexable/bm25.py +178 -0
  24. cbrkit-1.4.0/src/cbrkit/retrieval/indexable/chromadb.py +161 -0
  25. cbrkit-1.4.0/src/cbrkit/retrieval/indexable/embed.py +247 -0
  26. cbrkit-1.4.0/src/cbrkit/retrieval/indexable/lancedb.py +162 -0
  27. cbrkit-1.4.0/src/cbrkit/retrieval/indexable/pgvector.py +248 -0
  28. cbrkit-1.4.0/src/cbrkit/retrieval/indexable/sqlite_vec.py +294 -0
  29. cbrkit-1.4.0/src/cbrkit/retrieval/indexable/zvec.py +205 -0
  30. cbrkit-1.4.0/src/cbrkit/retrieval/rerank/__init__.py +18 -0
  31. cbrkit-1.4.0/src/cbrkit/retrieval/rerank/_common.py +53 -0
  32. cbrkit-1.4.0/src/cbrkit/retrieval/rerank/cohere.py +41 -0
  33. cbrkit-1.4.0/src/cbrkit/retrieval/rerank/sentence_transformers.py +101 -0
  34. cbrkit-1.4.0/src/cbrkit/retrieval/rerank/voyageai.py +38 -0
  35. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/retrieval/wrappers.py +38 -6
  36. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/reuse/build.py +3 -1
  37. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/revise/build.py +4 -2
  38. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/sim/aggregator.py +4 -3
  39. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/sim/collections.py +3 -3
  40. cbrkit-1.4.0/src/cbrkit/sim/embed/__init__.py +66 -0
  41. cbrkit-1.4.0/src/cbrkit/sim/embed/core.py +327 -0
  42. cbrkit-1.4.0/src/cbrkit/sim/embed/metrics.py +158 -0
  43. cbrkit-1.4.0/src/cbrkit/sim/embed/providers/__init__.py +43 -0
  44. cbrkit-1.4.0/src/cbrkit/sim/embed/providers/bm25.py +181 -0
  45. cbrkit-1.4.0/src/cbrkit/sim/embed/providers/cohere.py +45 -0
  46. cbrkit-1.4.0/src/cbrkit/sim/embed/providers/ollama.py +39 -0
  47. cbrkit-1.4.0/src/cbrkit/sim/embed/providers/openai.py +65 -0
  48. cbrkit-1.4.0/src/cbrkit/sim/embed/providers/pydantic_ai.py +31 -0
  49. cbrkit-1.4.0/src/cbrkit/sim/embed/providers/sentence_transformers.py +80 -0
  50. cbrkit-1.4.0/src/cbrkit/sim/embed/providers/spacy.py +124 -0
  51. cbrkit-1.4.0/src/cbrkit/sim/embed/providers/sparse_encoder.py +93 -0
  52. cbrkit-1.4.0/src/cbrkit/sim/embed/providers/voyageai.py +38 -0
  53. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/sim/graphs/alignment.py +1 -1
  54. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/sim/wrappers.py +5 -5
  55. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/synthesis/apply.py +2 -1
  56. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/synthesis/providers/anthropic.py +1 -1
  57. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/synthesis/providers/cohere.py +3 -1
  58. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/synthesis/providers/openai_completions.py +1 -1
  59. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/synthesis/providers/openai_responses.py +1 -1
  60. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/system.py +1 -1
  61. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/typing.py +116 -6
  62. cbrkit-1.2.0/src/cbrkit/indexable.py +0 -717
  63. cbrkit-1.2.0/src/cbrkit/retrieval/apply.py +0 -164
  64. cbrkit-1.2.0/src/cbrkit/retrieval/indexable.py +0 -1050
  65. cbrkit-1.2.0/src/cbrkit/retrieval/rerank.py +0 -219
  66. cbrkit-1.2.0/src/cbrkit/sim/embed.py +0 -994
  67. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/__init__.py +0 -0
  68. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/__main__.py +0 -0
  69. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/adapt/__init__.py +0 -0
  70. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/adapt/attribute_value.py +0 -0
  71. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/adapt/generic.py +0 -0
  72. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/adapt/numbers.py +0 -0
  73. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/adapt/strings.py +0 -0
  74. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/cli.py +0 -0
  75. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/constants.py +0 -0
  76. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/cycle.py +0 -0
  77. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/dumpers.py +0 -0
  78. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/eval/__init__.py +0 -0
  79. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/eval/retrieval.py +0 -0
  80. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/model/__init__.py +0 -0
  81. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/model/graph.py +0 -0
  82. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/model/result.py +0 -0
  83. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/py.typed +0 -0
  84. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/retain/__init__.py +0 -0
  85. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/retain/apply.py +0 -0
  86. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/retrieval/build.py +0 -0
  87. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/reuse/__init__.py +0 -0
  88. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/reuse/apply.py +0 -0
  89. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/revise/__init__.py +0 -0
  90. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/revise/apply.py +0 -0
  91. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/sim/__init__.py +0 -0
  92. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/sim/attribute_value.py +0 -0
  93. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/sim/generic.py +0 -0
  94. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/sim/graphs/__init__.py +0 -0
  95. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/sim/graphs/astar.py +0 -0
  96. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/sim/graphs/brute_force.py +0 -0
  97. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/sim/graphs/common.py +0 -0
  98. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/sim/graphs/dfs.py +0 -0
  99. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/sim/graphs/greedy.py +0 -0
  100. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/sim/graphs/lap.py +0 -0
  101. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/sim/graphs/qap.py +0 -0
  102. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/sim/graphs/vf2.py +0 -0
  103. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/sim/numbers.py +0 -0
  104. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/sim/pooling.py +0 -0
  105. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/sim/strings.py +0 -0
  106. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/sim/taxonomy.py +0 -0
  107. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/synthesis/__init__.py +0 -0
  108. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/synthesis/build.py +0 -0
  109. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/synthesis/model.py +0 -0
  110. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/synthesis/prompts.py +0 -0
  111. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/synthesis/providers/__init__.py +0 -0
  112. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/synthesis/providers/google.py +0 -0
  113. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/synthesis/providers/instructor.py +0 -0
  114. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/synthesis/providers/model.py +0 -0
  115. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/synthesis/providers/ollama.py +0 -0
  116. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/synthesis/providers/openai_agents.py +0 -0
  117. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/synthesis/providers/pydantic_ai.py +0 -0
  118. {cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/synthesis/providers/wrappers.py +0 -0
@@ -1,16 +1,16 @@
1
- Metadata-Version: 2.3
1
+ Metadata-Version: 2.4
2
2
  Name: cbrkit
3
- Version: 1.2.0
3
+ Version: 1.4.0
4
4
  Summary: Customizable Case-Based Reasoning (CBR) toolkit for Python with a built-in API and CLI
5
5
  Keywords: cbr,case-based reasoning,api,similarity,nlp,retrieval,cli,tool,library
6
6
  Author: Mirko Lenz
7
7
  Author-email: Mirko Lenz <mirko@mirkolenz.com>
8
+ License-Expression: MIT
8
9
  Classifier: Development Status :: 4 - Beta
9
10
  Classifier: Environment :: Console
10
11
  Classifier: Framework :: Pytest
11
12
  Classifier: Intended Audience :: Developers
12
13
  Classifier: Intended Audience :: Science/Research
13
- Classifier: License :: OSI Approved :: MIT License
14
14
  Classifier: Natural Language :: English
15
15
  Classifier: Operating System :: OS Independent
16
16
  Classifier: Programming Language :: Python :: 3.13
@@ -30,7 +30,7 @@ Requires-Dist: pyyaml>=6,<7
30
30
  Requires-Dist: rtoml>=0.12,<1
31
31
  Requires-Dist: scipy>=1,<2
32
32
  Requires-Dist: xmltodict>=1,<2
33
- Requires-Dist: cbrkit[anthropic,api,bm25,chromadb,chunking,cohere,eval,google,graphs,graphviz,instructor,lancedb,levenshtein,nltk,ollama,openai,openai-agents,pandas,pydantic-ai,spacy,sql,timeseries,transformers,voyageai,zvec] ; extra == 'all'
33
+ Requires-Dist: cbrkit[anthropic,api,bm25,chromadb,chunking,cohere,eval,google,graphs,graphviz,instructor,lancedb,levenshtein,nltk,ollama,openai,openai-agents,pandas,pgvector,pydantic-ai,spacy,sql,sqlite-vec,timeseries,transformers,voyageai,zvec] ; extra == 'all'
34
34
  Requires-Dist: anthropic>=0.40,<1 ; extra == 'anthropic'
35
35
  Requires-Dist: cbrkit[cli] ; extra == 'api'
36
36
  Requires-Dist: fastapi>=0.100,<1 ; extra == 'api'
@@ -41,11 +41,11 @@ Requires-Dist: fastmcp>=3,<4 ; extra == 'api'
41
41
  Requires-Dist: bm25s[core,stem,indexing]>=0.3,<1 ; extra == 'bm25'
42
42
  Requires-Dist: chromadb>=1,<2 ; extra == 'chromadb'
43
43
  Requires-Dist: chonkie>=1,<2 ; extra == 'chunking'
44
- Requires-Dist: rich>=13,<15 ; extra == 'cli'
45
- Requires-Dist: typer>=0.9,<1 ; extra == 'cli'
46
- Requires-Dist: cohere>=5,<6 ; extra == 'cohere'
44
+ Requires-Dist: rich>=14,<16 ; extra == 'cli'
45
+ Requires-Dist: typer>=0.20,<1 ; extra == 'cli'
46
+ Requires-Dist: cohere>=7,<8 ; extra == 'cohere'
47
47
  Requires-Dist: ranx>=0.3,<1 ; extra == 'eval'
48
- Requires-Dist: google-genai>=1,<2 ; extra == 'google'
48
+ Requires-Dist: google-genai>=2,<3 ; extra == 'google'
49
49
  Requires-Dist: networkx>=3,<4 ; extra == 'graphs'
50
50
  Requires-Dist: rustworkx>=0.15,<1 ; extra == 'graphs'
51
51
  Requires-Dist: pygraphviz>=1,<2 ; extra == 'graphviz'
@@ -58,14 +58,20 @@ Requires-Dist: openai>=1,<3 ; extra == 'openai'
58
58
  Requires-Dist: tiktoken>=0.8,<1 ; extra == 'openai'
59
59
  Requires-Dist: openai-agents>=0.2,<1 ; extra == 'openai-agents'
60
60
  Requires-Dist: pandas>=2,<4 ; extra == 'pandas'
61
+ Requires-Dist: pgvector>=0.4,<1 ; extra == 'pgvector'
62
+ Requires-Dist: cbrkit[sql] ; extra == 'pgvector'
61
63
  Requires-Dist: pydantic-ai-slim>=1,<2 ; extra == 'pydantic-ai'
62
64
  Requires-Dist: spacy>=3.8,<4 ; extra == 'spacy'
63
- Requires-Dist: sqlalchemy>=2,<3 ; extra == 'sql'
65
+ Requires-Dist: sqlalchemy[asyncio]>=2,<3 ; extra == 'sql'
66
+ Requires-Dist: sqlite-vec>=0.1,<1 ; extra == 'sqlite-vec'
67
+ Requires-Dist: aiosqlite>=0.20,<1 ; extra == 'sqlite-vec'
68
+ Requires-Dist: cbrkit[sql] ; extra == 'sqlite-vec'
64
69
  Requires-Dist: minineedle>=3,<4 ; extra == 'timeseries'
65
70
  Requires-Dist: sentence-transformers>=4,<6 ; extra == 'transformers'
66
71
  Requires-Dist: torch>=2.5,<3 ; extra == 'transformers'
67
72
  Requires-Dist: transformers>=4,<6 ; extra == 'transformers'
68
73
  Requires-Dist: voyageai>=0.3,<1 ; extra == 'voyageai'
74
+ Requires-Dist: zvec>=0.2,<1 ; extra == 'zvec'
69
75
  Requires-Python: >=3.13, <4
70
76
  Project-URL: Repository, https://github.com/wi2trier/cbrkit
71
77
  Project-URL: Documentation, https://wi2trier.github.io/cbrkit/
@@ -91,12 +97,15 @@ Provides-Extra: ollama
91
97
  Provides-Extra: openai
92
98
  Provides-Extra: openai-agents
93
99
  Provides-Extra: pandas
100
+ Provides-Extra: pgvector
94
101
  Provides-Extra: pydantic-ai
95
102
  Provides-Extra: spacy
96
103
  Provides-Extra: sql
104
+ Provides-Extra: sqlite-vec
97
105
  Provides-Extra: timeseries
98
106
  Provides-Extra: transformers
99
107
  Provides-Extra: voyageai
108
+ Provides-Extra: zvec
100
109
  Description-Content-Type: text/markdown
101
110
 
102
111
  <!-- markdownlint-disable MD033 MD041 -->
@@ -229,12 +238,14 @@ df = pl.read_csv("path/to/cases.csv")
229
238
  casebase = cbrkit.loaders.polars(df)
230
239
  ```
231
240
 
232
- For database access, CBRkit provides `sqlite` and `sqlalchemy` loaders (the latter requires the `sql` extra):
241
+ For ad-hoc SQLite loading, CBRkit ships a stdlib-based loader:
233
242
 
234
243
  ```python
235
244
  casebase = cbrkit.loaders.sqlite("path/to/database.db", "SELECT * FROM cases")
236
245
  ```
237
246
 
247
+ For richer relational backends (filters, upserts, vector/FTS search via pgvector on PostgreSQL or sqlite-vec on SQLite), see `cbrkit.indexable.sqlalchemy`, `cbrkit.indexable.pgvector`, and `cbrkit.indexable.sqlite_vec`.
248
+
238
249
  **Tip:** You can validate a loaded casebase against a Pydantic model using `cbrkit.loaders.validate()`:
239
250
 
240
251
  ```python
@@ -680,8 +691,7 @@ The result contains `similarities` with quality assessment scores for each case.
680
691
  ## Retain
681
692
 
682
693
  The retain phase decides whether and how to integrate new cases into the casebase.
683
- The `cbrkit.retain` module provides utility functions for this purpose.
684
- You build a retain pipeline by specifying an assessment function and a storage function:
694
+ Build a retain pipeline from an assessment function and a storage function:
685
695
 
686
696
  ```python
687
697
  retainer = cbrkit.retain.build(
@@ -693,27 +703,9 @@ retainer = cbrkit.retain.build(
693
703
  )
694
704
  ```
695
705
 
696
- CBRkit provides several built-in storage functions:
697
-
698
- - `static`: Generates keys from a fixed reference casebase to avoid collisions.
699
- - `indexable`: Keeps an `IndexableFunc`'s index in sync with the casebase.
700
-
701
- You can filter retained cases based on their assessment scores using the `dropout` wrapper:
702
-
703
- ```python
704
- retainer = cbrkit.retain.dropout(
705
- retainer_func=cbrkit.retain.build(...),
706
- min_similarity=0.5,
707
- )
708
- ```
709
-
710
- The retainer can be applied to a revise result:
711
-
712
- ```python
713
- result = cbrkit.retain.apply_result(revise_result, retainer)
714
- ```
715
-
716
- The result contains `similarities` with fitness scores and `casebase` with the updated cases.
706
+ The built-in storage functions are `static` (generates collision-free keys from a reference casebase) and `indexable` (keeps an `IndexableFunc`'s index in sync with the casebase).
707
+ Wrap a retainer with `dropout` to filter by assessment score (e.g. `min_similarity=0.5`), then apply it to a revise result via `cbrkit.retain.apply_result(revise_result, retainer)`.
708
+ The result exposes `similarities` (fitness scores) and `casebase` (updated cases).
717
709
 
718
710
  ## Full CBR Cycle
719
711
 
@@ -846,37 +838,74 @@ result = cbrkit.retrieval.apply_query(casebase, query, (retriever, reranker))
846
838
 
847
839
  ### Indexed Retrieval
848
840
 
849
- Some retrievers like `bm25`, `embed`, and `lancedb` support **indexed retrieval**, where the casebase is pre-indexed once and then queried without passing the full casebase each time.
850
- This is useful for large casebases or when using external search backends.
841
+ Indexed retrieval pre-indexes the casebase once and then queries it without passing the full casebase each time, which helps for large casebases or external search backends.
842
+ Index maintenance lives on whichever object owns the index.
851
843
 
852
- To use indexed retrieval, first create a retriever and call its `index()` method:
844
+ The self-contained `bm25` and `embed` retrievers own their index, so you call `put_index()` on the retriever:
853
845
 
854
846
  ```python
855
847
  from frozendict import frozendict
856
848
 
857
- bm25_func = cbrkit.sim.embed.bm25(language="en")
858
- retriever = cbrkit.retrieval.bm25(conversion_func=bm25_func)
859
- retriever.create_index(frozendict(casebase))
849
+ retriever = cbrkit.retrieval.bm25(conversion_func=cbrkit.sim.embed.bm25(language="en"))
850
+ retriever.put_index(frozendict(casebase))
860
851
  ```
861
852
 
862
- Then pass an empty casebase (`{}`) to signal that the retriever should use its pre-indexed data:
853
+ The storage-backed `lancedb`, `chromadb`, `zvec`, `pgvector`, and `sqlite_vec` retrievers are pure query paths over a separate `cbrkit.indexable` storage that owns the index, so you index on the storage and wrap it for querying:
863
854
 
864
855
  ```python
865
- result = cbrkit.retrieval.apply_query({}, query, retriever)
856
+ storage = cbrkit.indexable.lancedb(uri="./cases", table_name="cases")
857
+ storage.put_index(frozendict(casebase))
858
+ retriever = cbrkit.retrieval.lancedb(storage=storage, search_type="dense")
866
859
  ```
867
860
 
868
- As a convenience, CBRkit provides `apply_query_indexed` and `apply_queries_indexed` which handle the empty casebase automatically:
861
+ Query a pre-indexed retriever with `apply_query_indexed` / `apply_queries_indexed` (or pass an empty casebase `{}` to `apply_query`); querying an un-indexed retriever raises `ValueError`:
869
862
 
870
863
  ```python
871
864
  result = cbrkit.retrieval.apply_query_indexed(query, retriever)
872
- # or for multiple queries:
873
- result = cbrkit.retrieval.apply_queries_indexed(queries, retriever)
874
865
  ```
875
866
 
876
- If a retriever receives an empty casebase but has not been indexed yet, a `ValueError` is raised with a message to call `index()` first.
867
+ The `System` class also defaults its casebase to `{}`, so a system of pre-indexed retrievers needs no casebase at query time.
868
+
869
+ #### Typed Values and the Retain Caveat
870
+
871
+ Each backend has one text-field knob — `value_column` (`value_field` for `zvec`/`chromadb`) — naming the embeddable text, and the value type `V` follows the schema source:
872
+
873
+ - **Plain text** (`V = str`, the default) — the bare string is stored under the text knob and read back as a string.
874
+ - **Typed model** (`V = YourModel`) — pass a `model`: a dataclass or Pydantic model for `lancedb`/`zvec`/`chromadb` (fields become columns), or a SQLAlchemy mapped class for `sqlalchemy`/`pgvector`/`sqlite_vec` (its `__table__` defines the schema). Reads reconstruct model instances.
875
+ - **Mapping** (`V = Mapping[str, Any]`) — `sqlalchemy`/`pgvector`/`sqlite_vec` only, via a host-supplied `table` or `reflect=True`.
876
+
877
+ ```python
878
+ # plain strings — cbrkit builds the table
879
+ store = cbrkit.indexable.pgvector[str, str](
880
+ url=..., value_column="body", pgvector_dim=384, conversion_func=embed,
881
+ )
882
+
883
+ # typed rows — a SQLAlchemy mapped class defines the schema
884
+ class Car(Base):
885
+     __tablename__ = "cars"
886
+     key: Mapped[str] = mapped_column(primary_key=True)
887
+     desc: Mapped[str] = mapped_column()
888
+     _pgvec: Mapped[Any] = mapped_column(cbrkit.indexable.PGVECTOR(384), nullable=False)
889
+
890
+ store = cbrkit.indexable.pgvector[str, Car](url=..., model=Car, value_column="desc", ...)
891
+ ```
892
+
893
+ Pass `vector_type="halfvec"` for half-precision storage (~2x smaller, negligible recall loss); for a typed model, declare the column with the re-exported `cbrkit.indexable.HALFVEC` instead of `PGVECTOR`.
894
+
895
+ For a self-contained, file-based store, `sqlite_vec` offers the same dense/sparse/hybrid API on SQLite via the [`sqlite-vec`](https://github.com/asg017/sqlite-vec) extension (loaded automatically). Dense KNN uses a `vec0` virtual table (so the backend inherits future `vec0` capabilities such as approximate search, and supports quantized `vector_type="int8"` storage today), sparse search uses built-in FTS5, and `Filter` `WHERE` clauses compose by joining matches back to the main table:
896
+
897
+ ```python
898
+ store = cbrkit.indexable.sqlite_vec[str, str](
899
+ url="sqlite+aiosqlite:///cases.db",
900
+ value_column="body", vector_dim=384, index_type="hybrid", conversion_func=embed,
901
+ )
902
+ store.put_index(frozendict(casebase))
903
+ retriever = cbrkit.retrieval.indexable.sqlite_vec(storage=store, search_type="hybrid")
904
+ ```
877
905
 
878
- The `System` class also supports indexed retrieval by defaulting the casebase to an empty dict.
879
- This allows creating a system where all retrievers are pre-indexed and no casebase needs to be provided at query time.
906
+ **Retain caveat:** the storage-backed retrievers search by the text column and always return `Casebase[K, str]`, projecting richer values down to their text.
907
+ A retrieve-then-retain round-trip via `cbrkit.retrieval.indexable.*` + `cbrkit.retain.indexable` therefore lines up cleanly only when `V = str`.
908
+ For model or mapping stores, either use the backend as a typed store (read `storage.index` as `Casebase[K, V]` and retrieve with a value-based retriever like `cbrkit.retrieval.build(...)`), or re-hydrate full rows by key from `storage.index` after text retrieval.
880
909
 
881
910
  ## Evaluation
882
911
 
@@ -128,12 +128,14 @@ df = pl.read_csv("path/to/cases.csv")
128
128
  casebase = cbrkit.loaders.polars(df)
129
129
  ```
130
130
 
131
- For database access, CBRkit provides `sqlite` and `sqlalchemy` loaders (the latter requires the `sql` extra):
131
+ For ad-hoc SQLite loading, CBRkit ships a stdlib-based loader:
132
132
 
133
133
  ```python
134
134
  casebase = cbrkit.loaders.sqlite("path/to/database.db", "SELECT * FROM cases")
135
135
  ```
136
136
 
137
+ For richer relational backends (filters, upserts, vector/FTS search via pgvector on PostgreSQL or sqlite-vec on SQLite), see `cbrkit.indexable.sqlalchemy`, `cbrkit.indexable.pgvector`, and `cbrkit.indexable.sqlite_vec`.
138
+
137
139
  **Tip:** You can validate a loaded casebase against a Pydantic model using `cbrkit.loaders.validate()`:
138
140
 
139
141
  ```python
@@ -579,8 +581,7 @@ The result contains `similarities` with quality assessment scores for each case.
579
581
  ## Retain
580
582
 
581
583
  The retain phase decides whether and how to integrate new cases into the casebase.
582
- The `cbrkit.retain` module provides utility functions for this purpose.
583
- You build a retain pipeline by specifying an assessment function and a storage function:
584
+ Build a retain pipeline from an assessment function and a storage function:
584
585
 
585
586
  ```python
586
587
  retainer = cbrkit.retain.build(
@@ -592,27 +593,9 @@ retainer = cbrkit.retain.build(
592
593
  )
593
594
  ```
594
595
 
595
- CBRkit provides several built-in storage functions:
596
-
597
- - `static`: Generates keys from a fixed reference casebase to avoid collisions.
598
- - `indexable`: Keeps an `IndexableFunc`'s index in sync with the casebase.
599
-
600
- You can filter retained cases based on their assessment scores using the `dropout` wrapper:
601
-
602
- ```python
603
- retainer = cbrkit.retain.dropout(
604
- retainer_func=cbrkit.retain.build(...),
605
- min_similarity=0.5,
606
- )
607
- ```
608
-
609
- The retainer can be applied to a revise result:
610
-
611
- ```python
612
- result = cbrkit.retain.apply_result(revise_result, retainer)
613
- ```
614
-
615
- The result contains `similarities` with fitness scores and `casebase` with the updated cases.
596
+ The built-in storage functions are `static` (generates collision-free keys from a reference casebase) and `indexable` (keeps an `IndexableFunc`'s index in sync with the casebase).
597
+ Wrap a retainer with `dropout` to filter by assessment score (e.g. `min_similarity=0.5`), then apply it to a revise result via `cbrkit.retain.apply_result(revise_result, retainer)`.
598
+ The result exposes `similarities` (fitness scores) and `casebase` (updated cases).
616
599
 
617
600
  ## Full CBR Cycle
618
601
 
@@ -745,37 +728,74 @@ result = cbrkit.retrieval.apply_query(casebase, query, (retriever, reranker))
745
728
 
746
729
  ### Indexed Retrieval
747
730
 
748
- Some retrievers like `bm25`, `embed`, and `lancedb` support **indexed retrieval**, where the casebase is pre-indexed once and then queried without passing the full casebase each time.
749
- This is useful for large casebases or when using external search backends.
731
+ Indexed retrieval pre-indexes the casebase once and then queries it without passing the full casebase each time, which helps for large casebases or external search backends.
732
+ Index maintenance lives on whichever object owns the index.
750
733
 
751
- To use indexed retrieval, first create a retriever and call its `index()` method:
734
+ The self-contained `bm25` and `embed` retrievers own their index, so you call `put_index()` on the retriever:
752
735
 
753
736
  ```python
754
737
  from frozendict import frozendict
755
738
 
756
- bm25_func = cbrkit.sim.embed.bm25(language="en")
757
- retriever = cbrkit.retrieval.bm25(conversion_func=bm25_func)
758
- retriever.create_index(frozendict(casebase))
739
+ retriever = cbrkit.retrieval.bm25(conversion_func=cbrkit.sim.embed.bm25(language="en"))
740
+ retriever.put_index(frozendict(casebase))
759
741
  ```
760
742
 
761
- Then pass an empty casebase (`{}`) to signal that the retriever should use its pre-indexed data:
743
+ The storage-backed `lancedb`, `chromadb`, `zvec`, `pgvector`, and `sqlite_vec` retrievers are pure query paths over a separate `cbrkit.indexable` storage that owns the index, so you index on the storage and wrap it for querying:
762
744
 
763
745
  ```python
764
- result = cbrkit.retrieval.apply_query({}, query, retriever)
746
+ storage = cbrkit.indexable.lancedb(uri="./cases", table_name="cases")
747
+ storage.put_index(frozendict(casebase))
748
+ retriever = cbrkit.retrieval.lancedb(storage=storage, search_type="dense")
765
749
  ```
766
750
 
767
- As a convenience, CBRkit provides `apply_query_indexed` and `apply_queries_indexed` which handle the empty casebase automatically:
751
+ Query a pre-indexed retriever with `apply_query_indexed` / `apply_queries_indexed` (or pass an empty casebase `{}` to `apply_query`); querying an un-indexed retriever raises `ValueError`:
768
752
 
769
753
  ```python
770
754
  result = cbrkit.retrieval.apply_query_indexed(query, retriever)
771
- # or for multiple queries:
772
- result = cbrkit.retrieval.apply_queries_indexed(queries, retriever)
773
755
  ```
774
756
 
775
- If a retriever receives an empty casebase but has not been indexed yet, a `ValueError` is raised with a message to call `index()` first.
757
+ The `System` class also defaults its casebase to `{}`, so a system of pre-indexed retrievers needs no casebase at query time.
758
+
759
+ #### Typed Values and the Retain Caveat
760
+
761
+ Each backend has one text-field knob — `value_column` (`value_field` for `zvec`/`chromadb`) — naming the embeddable text, and the value type `V` follows the schema source:
762
+
763
+ - **Plain text** (`V = str`, the default) — the bare string is stored under the text knob and read back as a string.
764
+ - **Typed model** (`V = YourModel`) — pass a `model`: a dataclass or Pydantic model for `lancedb`/`zvec`/`chromadb` (fields become columns), or a SQLAlchemy mapped class for `sqlalchemy`/`pgvector`/`sqlite_vec` (its `__table__` defines the schema). Reads reconstruct model instances.
765
+ - **Mapping** (`V = Mapping[str, Any]`) — `sqlalchemy`/`pgvector`/`sqlite_vec` only, via a host-supplied `table` or `reflect=True`.
766
+
767
+ ```python
768
+ # plain strings — cbrkit builds the table
769
+ store = cbrkit.indexable.pgvector[str, str](
770
+ url=..., value_column="body", pgvector_dim=384, conversion_func=embed,
771
+ )
772
+
773
+ # typed rows — a SQLAlchemy mapped class defines the schema
774
+ class Car(Base):
775
+     __tablename__ = "cars"
776
+     key: Mapped[str] = mapped_column(primary_key=True)
777
+     desc: Mapped[str] = mapped_column()
778
+     _pgvec: Mapped[Any] = mapped_column(cbrkit.indexable.PGVECTOR(384), nullable=False)
779
+
780
+ store = cbrkit.indexable.pgvector[str, Car](url=..., model=Car, value_column="desc", ...)
781
+ ```
782
+
783
+ Pass `vector_type="halfvec"` for half-precision storage (~2x smaller, negligible recall loss); for a typed model, declare the column with the re-exported `cbrkit.indexable.HALFVEC` instead of `PGVECTOR`.
784
+
785
+ For a self-contained, file-based store, `sqlite_vec` offers the same dense/sparse/hybrid API on SQLite via the [`sqlite-vec`](https://github.com/asg017/sqlite-vec) extension (loaded automatically). Dense KNN uses a `vec0` virtual table (so the backend inherits future `vec0` capabilities such as approximate search, and supports quantized `vector_type="int8"` storage today), sparse search uses built-in FTS5, and `Filter` `WHERE` clauses compose by joining matches back to the main table:
786
+
787
+ ```python
788
+ store = cbrkit.indexable.sqlite_vec[str, str](
789
+ url="sqlite+aiosqlite:///cases.db",
790
+ value_column="body", vector_dim=384, index_type="hybrid", conversion_func=embed,
791
+ )
792
+ store.put_index(frozendict(casebase))
793
+ retriever = cbrkit.retrieval.indexable.sqlite_vec(storage=store, search_type="hybrid")
794
+ ```
776
795
 
777
- The `System` class also supports indexed retrieval by defaulting the casebase to an empty dict.
778
- This allows creating a system where all retrievers are pre-indexed and no casebase needs to be provided at query time.
796
+ **Retain caveat:** the storage-backed retrievers search by the text column and always return `Casebase[K, str]`, projecting richer values down to their text.
797
+ A retrieve-then-retain round-trip via `cbrkit.retrieval.indexable.*` + `cbrkit.retain.indexable` therefore lines up cleanly only when `V = str`.
798
+ For model or mapping stores, either use the backend as a typed store (read `storage.index` as `Casebase[K, V]` and retrieve with a value-based retriever like `cbrkit.retrieval.build(...)`), or re-hydrate full rows by key from `storage.index` after text retrieval.
779
799
 
780
800
  ## Evaluation
781
801
 
@@ -1,56 +1,56 @@
1
1
  [project]
2
2
  name = "cbrkit"
3
- version = "1.2.0"
3
+ version = "1.4.0"
4
4
  description = "Customizable Case-Based Reasoning (CBR) toolkit for Python with a built-in API and CLI"
5
5
  authors = [{ name = "Mirko Lenz", email = "mirko@mirkolenz.com" }]
6
6
  readme = "README.md"
7
+ license = "MIT"
7
8
  keywords = [
8
- "cbr",
9
- "case-based reasoning",
10
- "api",
11
- "similarity",
12
- "nlp",
13
- "retrieval",
14
- "cli",
15
- "tool",
16
- "library",
9
+ "cbr",
10
+ "case-based reasoning",
11
+ "api",
12
+ "similarity",
13
+ "nlp",
14
+ "retrieval",
15
+ "cli",
16
+ "tool",
17
+ "library",
17
18
  ]
18
19
  classifiers = [
19
- "Development Status :: 4 - Beta",
20
- "Environment :: Console",
21
- "Framework :: Pytest",
22
- "Intended Audience :: Developers",
23
- "Intended Audience :: Science/Research",
24
- "License :: OSI Approved :: MIT License",
25
- "Natural Language :: English",
26
- "Operating System :: OS Independent",
27
- "Programming Language :: Python :: 3.13",
28
- "Programming Language :: Python :: 3.14",
29
- "Programming Language :: Python :: 3",
30
- "Topic :: Scientific/Engineering :: Artificial Intelligence",
31
- "Topic :: Scientific/Engineering :: Information Analysis",
32
- "Topic :: Software Development :: Libraries :: Python Modules",
33
- "Topic :: Utilities",
34
- "Typing :: Typed",
20
+ "Development Status :: 4 - Beta",
21
+ "Environment :: Console",
22
+ "Framework :: Pytest",
23
+ "Intended Audience :: Developers",
24
+ "Intended Audience :: Science/Research",
25
+ "Natural Language :: English",
26
+ "Operating System :: OS Independent",
27
+ "Programming Language :: Python :: 3.13",
28
+ "Programming Language :: Python :: 3.14",
29
+ "Programming Language :: Python :: 3",
30
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
31
+ "Topic :: Scientific/Engineering :: Information Analysis",
32
+ "Topic :: Software Development :: Libraries :: Python Modules",
33
+ "Topic :: Utilities",
34
+ "Typing :: Typed",
35
35
  ]
36
36
  requires-python = ">=3.13,<4"
37
37
  dependencies = [
38
- "frozendict>=2,<3",
39
- "numpy>=2,<3",
40
- "orjson>=3,<4",
41
- "polars>=1,<2",
42
- "pydantic>=2,<3",
43
- "pyyaml>=6,<7",
44
- "rtoml>=0.12,<1",
45
- "scipy>=1,<2",
46
- "xmltodict>=1,<2",
38
+ "frozendict>=2,<3",
39
+ "numpy>=2,<3",
40
+ "orjson>=3,<4",
41
+ "polars>=1,<2",
42
+ "pydantic>=2,<3",
43
+ "pyyaml>=6,<7",
44
+ "rtoml>=0.12,<1",
45
+ "scipy>=1,<2",
46
+ "xmltodict>=1,<2",
47
47
  ]
48
48
 
49
49
  [project.optional-dependencies]
50
50
  # LLM providers
51
51
  anthropic = ["anthropic>=0.40,<1"]
52
- cohere = ["cohere>=5,<6"]
53
- google = ["google-genai>=1,<2"]
52
+ cohere = ["cohere>=7,<8"]
53
+ google = ["google-genai>=2,<3"]
54
54
  instructor = ["instructor>=1,<2"]
55
55
  ollama = ["ollama>=0.3,<1"]
56
56
  openai = ["openai>=1,<3", "tiktoken>=0.8,<1"]
@@ -76,26 +76,30 @@ graphviz = ["pygraphviz>=1,<2"]
76
76
  chromadb = ["chromadb>=1,<2"]
77
77
  lancedb = ["lancedb>=0.20,<1"]
78
78
  pandas = ["pandas>=2,<4"]
79
- sql = ["sqlalchemy>=2,<3"]
80
- # zvec = ["zvec>=0.2,<1"]
79
+ pgvector = ["pgvector>=0.4,<1", "cbrkit[sql]"]
80
+ sql = ["sqlalchemy[asyncio]>=2,<3"]
81
+ sqlite-vec = ["sqlite-vec>=0.1,<1", "aiosqlite>=0.20,<1", "cbrkit[sql]"]
82
+ zvec = ["zvec>=0.2,<1"]
81
83
 
82
84
  # Tools
83
- cli = ["rich>=13,<15", "typer>=0.9,<1"]
85
+ cli = ["rich>=14,<16", "typer>=0.20,<1"]
84
86
  eval = ["ranx>=0.3,<1"]
85
87
  timeseries = ["minineedle>=3,<4"]
86
88
 
87
89
  # Entry points
88
90
  api = [
89
- "cbrkit[cli]",
90
- "fastapi>=0.100,<1",
91
- "pydantic-settings>=2,<3",
92
- "python-multipart>=0.0.15,<1",
93
- "uvicorn[standard]>=0.30,<1",
94
- "fastmcp>=3,<4",
91
+ "cbrkit[cli]",
92
+ "fastapi>=0.100,<1",
93
+ "pydantic-settings>=2,<3",
94
+ "python-multipart>=0.0.15,<1",
95
+ "uvicorn[standard]>=0.30,<1",
96
+ "fastmcp>=3,<4",
95
97
  ]
96
98
 
97
99
  # Bundle
98
- all = ["cbrkit[anthropic,api,bm25,chromadb,chunking,cohere,eval,google,graphs,graphviz,instructor,lancedb,levenshtein,nltk,ollama,openai,openai-agents,pandas,pydantic-ai,spacy,sql,timeseries,transformers,voyageai,zvec]"]
100
+ all = [
101
+ "cbrkit[anthropic,api,bm25,chromadb,chunking,cohere,eval,google,graphs,graphviz,instructor,lancedb,levenshtein,nltk,ollama,openai,openai-agents,pandas,pgvector,pydantic-ai,spacy,sql,sqlite-vec,timeseries,transformers,voyageai,zvec]",
102
+ ]
99
103
 
100
104
  [project.urls]
101
105
  Repository = "https://github.com/wi2trier/cbrkit"
@@ -117,11 +121,23 @@ build-backend = "uv_build"
117
121
 
118
122
  [tool.pytest]
119
123
  testpaths = ["src", "tests"]
120
- addopts = ["--cov=src/cbrkit", "--cov-report=term-missing", "--doctest-modules", "--import-mode=importlib"]
124
+ addopts = [
125
+ "--cov=src/cbrkit",
126
+ "--cov-report=term-missing",
127
+ "--doctest-modules",
128
+ "--import-mode=importlib",
129
+ ]
121
130
  doctest_optionflags = ["NORMALIZE_WHITESPACE", "IGNORE_EXCEPTION_DETAIL", "ELLIPSIS"]
122
131
 
123
132
  [tool.uv]
124
133
  default-groups = ["dev", "test", "docs"]
125
134
 
135
+ [tool.uv.extra-build-dependencies]
136
+ pygraphviz = ["setuptools"]
137
+ cbor = ["setuptools"]
138
+ warc3-wet-clueweb09 = ["setuptools"]
139
+ zlib-state = ["setuptools"]
140
+ pystemmer = ["setuptools", "cython"]
141
+
126
142
  [tool.ruff.lint.pydocstyle]
127
143
  convention = "google"
@@ -189,7 +189,7 @@ def synthesize(
189
189
  )
190
190
 
191
191
 
192
- def openapi_generator():
192
+ def openapi_generator() -> dict[str, Any]:
193
193
  """Generate and cache the OpenAPI schema for the CBRKit API."""
194
194
  if not app.openapi_schema:
195
195
  app.openapi_schema = get_openapi(
@@ -203,4 +203,4 @@ def openapi_generator():
203
203
  return app.openapi_schema
204
204
 
205
205
 
206
- app.openapi = openapi_generator # type: ignore[assignment]
206
+ app.openapi = openapi_generator # type: ignore[assignment] # ty: ignore[invalid-assignment]
@@ -7,7 +7,7 @@ from typing import Literal, cast
7
7
  from ..helpers import (
8
8
  get_logger,
9
9
  normalize_and_scale,
10
- round,
10
+ round_int,
11
11
  sim_map2ranking,
12
12
  unpack_float,
13
13
  unpack_floats,
@@ -487,15 +487,18 @@ def generate_metrics(
487
487
  >>> generate_metrics(["precision", "recall"], ks=5)
488
488
  ['precision@5', 'recall@5']
489
489
  """
490
- if not isinstance(ks, Iterable):
491
- ks = [ks]
492
-
493
- if not isinstance(relevance_levels, Iterable):
494
- relevance_levels = [relevance_levels]
490
+ ks_list: list[int | None] = [ks] if ks is None or isinstance(ks, int) else list(ks)
491
+ relevance_levels_list: list[int | None] = (
492
+ [relevance_levels]
493
+ if relevance_levels is None or isinstance(relevance_levels, int)
494
+ else list(relevance_levels)
495
+ )
495
496
 
496
497
  return [
497
- generate_metric(*args)
498
- for args in itertools.product(metrics, ks, relevance_levels)
498
+ generate_metric(metric, k, relevance_level)
499
+ for metric, k, relevance_level in itertools.product(
500
+ metrics, ks_list, relevance_levels_list
501
+ )
499
502
  ]
500
503
 
501
504
 
@@ -528,7 +531,7 @@ def similarities_to_qrels[Q, C](
528
531
 
529
532
  return {
530
533
  query: {
531
- case: round(
534
+ case: round_int(
532
535
  normalize_and_scale(sim, min_sim, max_sim, min_qrel, max_qrel),
533
536
  round_mode,
534
537
  )