PyPI - cbrkit - Versions diffs - 1.2.0__tar.gz → 1.4.0__tar.gz - Mend

cbrkit 1.2.0__tar.gz → 1.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (118) hide show

{cbrkit-1.2.0 → cbrkit-1.4.0}/PKG-INFO RENAMED Viewed

@@ -1,16 +1,16 @@
-Metadata-Version: 2.3
+Metadata-Version: 2.4
 Name: cbrkit
-Version: 1.2.0
+Version: 1.4.0
 Summary: Customizable Case-Based Reasoning (CBR) toolkit for Python with a built-in API and CLI
 Keywords: cbr,case-based reasoning,api,similarity,nlp,retrieval,cli,tool,library
 Author: Mirko Lenz
 Author-email: Mirko Lenz <mirko@mirkolenz.com>
+License-Expression: MIT
 Classifier: Development Status :: 4 - Beta
 Classifier: Environment :: Console
 Classifier: Framework :: Pytest
 Classifier: Intended Audience :: Developers
 Classifier: Intended Audience :: Science/Research
-Classifier: License :: OSI Approved :: MIT License
 Classifier: Natural Language :: English
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3.13
@@ -30,7 +30,7 @@ Requires-Dist: pyyaml>=6,<7
 Requires-Dist: rtoml>=0.12,<1
 Requires-Dist: scipy>=1,<2
 Requires-Dist: xmltodict>=1,<2
-Requires-Dist: cbrkit[anthropic,api,bm25,chromadb,chunking,cohere,eval,google,graphs,graphviz,instructor,lancedb,levenshtein,nltk,ollama,openai,openai-agents,pandas,pydantic-ai,spacy,sql,timeseries,transformers,voyageai,zvec] ; extra == 'all'
+Requires-Dist: cbrkit[anthropic,api,bm25,chromadb,chunking,cohere,eval,google,graphs,graphviz,instructor,lancedb,levenshtein,nltk,ollama,openai,openai-agents,pandas,pgvector,pydantic-ai,spacy,sql,sqlite-vec,timeseries,transformers,voyageai,zvec] ; extra == 'all'
 Requires-Dist: anthropic>=0.40,<1 ; extra == 'anthropic'
 Requires-Dist: cbrkit[cli] ; extra == 'api'
 Requires-Dist: fastapi>=0.100,<1 ; extra == 'api'
@@ -41,11 +41,11 @@ Requires-Dist: fastmcp>=3,<4 ; extra == 'api'
 Requires-Dist: bm25s[core,stem,indexing]>=0.3,<1 ; extra == 'bm25'
 Requires-Dist: chromadb>=1,<2 ; extra == 'chromadb'
 Requires-Dist: chonkie>=1,<2 ; extra == 'chunking'
-Requires-Dist: rich>=13,<15 ; extra == 'cli'
-Requires-Dist: typer>=0.9,<1 ; extra == 'cli'
-Requires-Dist: cohere>=5,<6 ; extra == 'cohere'
+Requires-Dist: rich>=14,<16 ; extra == 'cli'
+Requires-Dist: typer>=0.20,<1 ; extra == 'cli'
+Requires-Dist: cohere>=7,<8 ; extra == 'cohere'
 Requires-Dist: ranx>=0.3,<1 ; extra == 'eval'
-Requires-Dist: google-genai>=1,<2 ; extra == 'google'
+Requires-Dist: google-genai>=2,<3 ; extra == 'google'
 Requires-Dist: networkx>=3,<4 ; extra == 'graphs'
 Requires-Dist: rustworkx>=0.15,<1 ; extra == 'graphs'
 Requires-Dist: pygraphviz>=1,<2 ; extra == 'graphviz'
@@ -58,14 +58,20 @@ Requires-Dist: openai>=1,<3 ; extra == 'openai'
 Requires-Dist: tiktoken>=0.8,<1 ; extra == 'openai'
 Requires-Dist: openai-agents>=0.2,<1 ; extra == 'openai-agents'
 Requires-Dist: pandas>=2,<4 ; extra == 'pandas'
+Requires-Dist: pgvector>=0.4,<1 ; extra == 'pgvector'
+Requires-Dist: cbrkit[sql] ; extra == 'pgvector'
 Requires-Dist: pydantic-ai-slim>=1,<2 ; extra == 'pydantic-ai'
 Requires-Dist: spacy>=3.8,<4 ; extra == 'spacy'
-Requires-Dist: sqlalchemy>=2,<3 ; extra == 'sql'
+Requires-Dist: sqlalchemy[asyncio]>=2,<3 ; extra == 'sql'
+Requires-Dist: sqlite-vec>=0.1,<1 ; extra == 'sqlite-vec'
+Requires-Dist: aiosqlite>=0.20,<1 ; extra == 'sqlite-vec'
+Requires-Dist: cbrkit[sql] ; extra == 'sqlite-vec'
 Requires-Dist: minineedle>=3,<4 ; extra == 'timeseries'
 Requires-Dist: sentence-transformers>=4,<6 ; extra == 'transformers'
 Requires-Dist: torch>=2.5,<3 ; extra == 'transformers'
 Requires-Dist: transformers>=4,<6 ; extra == 'transformers'
 Requires-Dist: voyageai>=0.3,<1 ; extra == 'voyageai'
+Requires-Dist: zvec>=0.2,<1 ; extra == 'zvec'
 Requires-Python: >=3.13, <4
 Project-URL: Repository, https://github.com/wi2trier/cbrkit
 Project-URL: Documentation, https://wi2trier.github.io/cbrkit/
@@ -91,12 +97,15 @@ Provides-Extra: ollama
 Provides-Extra: openai
 Provides-Extra: openai-agents
 Provides-Extra: pandas
+Provides-Extra: pgvector
 Provides-Extra: pydantic-ai
 Provides-Extra: spacy
 Provides-Extra: sql
+Provides-Extra: sqlite-vec
 Provides-Extra: timeseries
 Provides-Extra: transformers
 Provides-Extra: voyageai
+Provides-Extra: zvec
 Description-Content-Type: text/markdown
 <!-- markdownlint-disable MD033 MD041 -->
@@ -229,12 +238,14 @@ df = pl.read_csv("path/to/cases.csv")
 casebase = cbrkit.loaders.polars(df)
 ```
-For database access, CBRkit provides `sqlite` and `sqlalchemy` loaders (the latter requires the `sql` extra):
+For ad-hoc SQLite loading, CBRkit ships a stdlib-based loader:
 ```python
 casebase = cbrkit.loaders.sqlite("path/to/database.db", "SELECT * FROM cases")
 ```
+For richer relational backends (filters, upserts, vector/FTS search via pgvector on PostgreSQL or sqlite-vec on SQLite), see `cbrkit.indexable.sqlalchemy`, `cbrkit.indexable.pgvector`, and `cbrkit.indexable.sqlite_vec`.
 **Tip:** You can validate a loaded casebase against a Pydantic model using `cbrkit.loaders.validate()`:
 ```python
@@ -680,8 +691,7 @@ The result contains `similarities` with quality assessment scores for each case.
 ## Retain
 The retain phase decides whether and how to integrate new cases into the casebase.
-The `cbrkit.retain` module provides utility functions for this purpose.
-You build a retain pipeline by specifying an assessment function and a storage function:
+Build a retain pipeline from an assessment function and a storage function:
 ```python
 retainer = cbrkit.retain.build(
@@ -693,27 +703,9 @@ retainer = cbrkit.retain.build(
 )
 ```
-CBRkit provides several built-in storage functions:
-- `static`: Generates keys from a fixed reference casebase to avoid collisions.
-- `indexable`: Keeps an `IndexableFunc`'s index in sync with the casebase.
-You can filter retained cases based on their assessment scores using the `dropout` wrapper:
-```python
-retainer = cbrkit.retain.dropout(
-    retainer_func=cbrkit.retain.build(...),
-    min_similarity=0.5,
-)
-```
-The retainer can be applied to a revise result:
-```python
-result = cbrkit.retain.apply_result(revise_result, retainer)
-```
-The result contains `similarities` with fitness scores and `casebase` with the updated cases.
+The built-in storage functions are `static` (generates collision-free keys from a reference casebase) and `indexable` (keeps an `IndexableFunc`'s index in sync with the casebase).
+Wrap a retainer with `dropout` to filter by assessment score (e.g. `min_similarity=0.5`), then apply it to a revise result via `cbrkit.retain.apply_result(revise_result, retainer)`.
+The result exposes `similarities` (fitness scores) and `casebase` (updated cases).
 ## Full CBR Cycle
@@ -846,37 +838,74 @@ result = cbrkit.retrieval.apply_query(casebase, query, (retriever, reranker))
 ### Indexed Retrieval
-Some retrievers like `bm25`, `embed`, and `lancedb` support **indexed retrieval**, where the casebase is pre-indexed once and then queried without passing the full casebase each time.
-This is useful for large casebases or when using external search backends.
+Indexed retrieval pre-indexes the casebase once and then queries it without passing the full casebase each time, which helps for large casebases or external search backends.
+Index maintenance lives on whichever object owns the index.
-To use indexed retrieval, first create a retriever and call its `index()` method:
+The self-contained `bm25` and `embed` retrievers own their index, so you call `put_index()` on the retriever:
 ```python
 from frozendict import frozendict
-bm25_func = cbrkit.sim.embed.bm25(language="en")
-retriever = cbrkit.retrieval.bm25(conversion_func=bm25_func)
-retriever.create_index(frozendict(casebase))
+retriever = cbrkit.retrieval.bm25(conversion_func=cbrkit.sim.embed.bm25(language="en"))
+retriever.put_index(frozendict(casebase))
 ```
-Then pass an empty casebase (`{}`) to signal that the retriever should use its pre-indexed data:
+The storage-backed `lancedb`, `chromadb`, `zvec`, `pgvector`, and `sqlite_vec` retrievers are pure query paths over a separate `cbrkit.indexable` storage that owns the index, so you index on the storage and wrap it for querying:
 ```python
-result = cbrkit.retrieval.apply_query({}, query, retriever)
+storage = cbrkit.indexable.lancedb(uri="./cases", table_name="cases")
+storage.put_index(frozendict(casebase))
+retriever = cbrkit.retrieval.lancedb(storage=storage, search_type="dense")
 ```
-As a convenience, CBRkit provides `apply_query_indexed` and `apply_queries_indexed` which handle the empty casebase automatically:
+Query a pre-indexed retriever with `apply_query_indexed` / `apply_queries_indexed` (or pass an empty casebase `{}` to `apply_query`); querying an un-indexed retriever raises `ValueError`:
 ```python
 result = cbrkit.retrieval.apply_query_indexed(query, retriever)
-# or for multiple queries:
-result = cbrkit.retrieval.apply_queries_indexed(queries, retriever)
 ```
-If a retriever receives an empty casebase but has not been indexed yet, a `ValueError` is raised with a message to call `index()` first.
+The `System` class also defaults its casebase to `{}`, so a system of pre-indexed retrievers needs no casebase at query time.
+#### Typed Values and the Retain Caveat
+Each backend has one text-field knob — `value_column` (`value_field` for `zvec`/`chromadb`) — naming the embeddable text, and the value type `V` follows the schema source:
+- **Plain text** (`V = str`, the default) — the bare string is stored under the text knob and read back as a string.
+- **Typed model** (`V = YourModel`) — pass a `model`: a dataclass or Pydantic model for `lancedb`/`zvec`/`chromadb` (fields become columns), or a SQLAlchemy mapped class for `sqlalchemy`/`pgvector`/`sqlite_vec` (its `__table__` defines the schema). Reads reconstruct model instances.
+- **Mapping** (`V = Mapping[str, Any]`) — `sqlalchemy`/`pgvector`/`sqlite_vec` only, via a host-supplied `table` or `reflect=True`.
+```python
+# plain strings — cbrkit builds the table
+store = cbrkit.indexable.pgvector[str, str](
+    url=..., value_column="body", pgvector_dim=384, conversion_func=embed,
+)
+# typed rows — a SQLAlchemy mapped class defines the schema
+class Car(Base):
+    __tablename__ = "cars"
+    key: Mapped[str] = mapped_column(primary_key=True)
+    desc: Mapped[str] = mapped_column()
+    _pgvec: Mapped[Any] = mapped_column(cbrkit.indexable.PGVECTOR(384), nullable=False)
+store = cbrkit.indexable.pgvector[str, Car](url=..., model=Car, value_column="desc", ...)
+```
+Pass `vector_type="halfvec"` for half-precision storage (~2x smaller, negligible recall loss); for a typed model, declare the column with the re-exported `cbrkit.indexable.HALFVEC` instead of `PGVECTOR`.
+For a self-contained, file-based store, `sqlite_vec` offers the same dense/sparse/hybrid API on SQLite via the [`sqlite-vec`](https://github.com/asg017/sqlite-vec) extension (loaded automatically). Dense KNN uses a `vec0` virtual table (so the backend inherits future `vec0` capabilities such as approximate search, and supports quantized `vector_type="int8"` storage today), sparse search uses built-in FTS5, and `Filter` `WHERE` clauses compose by joining matches back to the main table:
+```python
+store = cbrkit.indexable.sqlite_vec[str, str](
+    url="sqlite+aiosqlite:///cases.db",
+    value_column="body", vector_dim=384, index_type="hybrid", conversion_func=embed,
+)
+store.put_index(frozendict(casebase))
+retriever = cbrkit.retrieval.indexable.sqlite_vec(storage=store, search_type="hybrid")
+```
-The `System` class also supports indexed retrieval by defaulting the casebase to an empty dict.
-This allows creating a system where all retrievers are pre-indexed and no casebase needs to be provided at query time.
+**Retain caveat:** the storage-backed retrievers search by the text column and always return `Casebase[K, str]`, projecting richer values down to their text.
+A retrieve → retain round-trip via `cbrkit.retrieval.indexable.*` + `cbrkit.retain.indexable` therefore lines up cleanly only when `V = str`.
+For model or mapping stores, either use the backend as a typed store (read `storage.index` as `Casebase[K, V]` and retrieve with a value-based retriever like `cbrkit.retrieval.build(...)`), or re-hydrate full rows by key from `storage.index` after text retrieval.
 ## Evaluation

{cbrkit-1.2.0 → cbrkit-1.4.0}/README.md RENAMED Viewed

@@ -128,12 +128,14 @@ df = pl.read_csv("path/to/cases.csv")
 casebase = cbrkit.loaders.polars(df)
 ```
-For database access, CBRkit provides `sqlite` and `sqlalchemy` loaders (the latter requires the `sql` extra):
+For ad-hoc SQLite loading, CBRkit ships a stdlib-based loader:
 ```python
 casebase = cbrkit.loaders.sqlite("path/to/database.db", "SELECT * FROM cases")
 ```
+For richer relational backends (filters, upserts, vector/FTS search via pgvector on PostgreSQL or sqlite-vec on SQLite), see `cbrkit.indexable.sqlalchemy`, `cbrkit.indexable.pgvector`, and `cbrkit.indexable.sqlite_vec`.
 **Tip:** You can validate a loaded casebase against a Pydantic model using `cbrkit.loaders.validate()`:
 ```python
@@ -579,8 +581,7 @@ The result contains `similarities` with quality assessment scores for each case.
 ## Retain
 The retain phase decides whether and how to integrate new cases into the casebase.
-The `cbrkit.retain` module provides utility functions for this purpose.
-You build a retain pipeline by specifying an assessment function and a storage function:
+Build a retain pipeline from an assessment function and a storage function:
 ```python
 retainer = cbrkit.retain.build(
@@ -592,27 +593,9 @@ retainer = cbrkit.retain.build(
 )
 ```
-CBRkit provides several built-in storage functions:
-- `static`: Generates keys from a fixed reference casebase to avoid collisions.
-- `indexable`: Keeps an `IndexableFunc`'s index in sync with the casebase.
-You can filter retained cases based on their assessment scores using the `dropout` wrapper:
-```python
-retainer = cbrkit.retain.dropout(
-    retainer_func=cbrkit.retain.build(...),
-    min_similarity=0.5,
-)
-```
-The retainer can be applied to a revise result:
-```python
-result = cbrkit.retain.apply_result(revise_result, retainer)
-```
-The result contains `similarities` with fitness scores and `casebase` with the updated cases.
+The built-in storage functions are `static` (generates collision-free keys from a reference casebase) and `indexable` (keeps an `IndexableFunc`'s index in sync with the casebase).
+Wrap a retainer with `dropout` to filter by assessment score (e.g. `min_similarity=0.5`), then apply it to a revise result via `cbrkit.retain.apply_result(revise_result, retainer)`.
+The result exposes `similarities` (fitness scores) and `casebase` (updated cases).
 ## Full CBR Cycle
@@ -745,37 +728,74 @@ result = cbrkit.retrieval.apply_query(casebase, query, (retriever, reranker))
 ### Indexed Retrieval
-Some retrievers like `bm25`, `embed`, and `lancedb` support **indexed retrieval**, where the casebase is pre-indexed once and then queried without passing the full casebase each time.
-This is useful for large casebases or when using external search backends.
+Indexed retrieval pre-indexes the casebase once and then queries it without passing the full casebase each time, which helps for large casebases or external search backends.
+Index maintenance lives on whichever object owns the index.
-To use indexed retrieval, first create a retriever and call its `index()` method:
+The self-contained `bm25` and `embed` retrievers own their index, so you call `put_index()` on the retriever:
 ```python
 from frozendict import frozendict
-bm25_func = cbrkit.sim.embed.bm25(language="en")
-retriever = cbrkit.retrieval.bm25(conversion_func=bm25_func)
-retriever.create_index(frozendict(casebase))
+retriever = cbrkit.retrieval.bm25(conversion_func=cbrkit.sim.embed.bm25(language="en"))
+retriever.put_index(frozendict(casebase))
 ```
-Then pass an empty casebase (`{}`) to signal that the retriever should use its pre-indexed data:
+The storage-backed `lancedb`, `chromadb`, `zvec`, `pgvector`, and `sqlite_vec` retrievers are pure query paths over a separate `cbrkit.indexable` storage that owns the index, so you index on the storage and wrap it for querying:
 ```python
-result = cbrkit.retrieval.apply_query({}, query, retriever)
+storage = cbrkit.indexable.lancedb(uri="./cases", table_name="cases")
+storage.put_index(frozendict(casebase))
+retriever = cbrkit.retrieval.lancedb(storage=storage, search_type="dense")
 ```
-As a convenience, CBRkit provides `apply_query_indexed` and `apply_queries_indexed` which handle the empty casebase automatically:
+Query a pre-indexed retriever with `apply_query_indexed` / `apply_queries_indexed` (or pass an empty casebase `{}` to `apply_query`); querying an un-indexed retriever raises `ValueError`:
 ```python
 result = cbrkit.retrieval.apply_query_indexed(query, retriever)
-# or for multiple queries:
-result = cbrkit.retrieval.apply_queries_indexed(queries, retriever)
 ```
-If a retriever receives an empty casebase but has not been indexed yet, a `ValueError` is raised with a message to call `index()` first.
+The `System` class also defaults its casebase to `{}`, so a system of pre-indexed retrievers needs no casebase at query time.
+#### Typed Values and the Retain Caveat
+Each backend has one text-field knob — `value_column` (`value_field` for `zvec`/`chromadb`) — naming the embeddable text, and the value type `V` follows the schema source:
+- **Plain text** (`V = str`, the default) — the bare string is stored under the text knob and read back as a string.
+- **Typed model** (`V = YourModel`) — pass a `model`: a dataclass or Pydantic model for `lancedb`/`zvec`/`chromadb` (fields become columns), or a SQLAlchemy mapped class for `sqlalchemy`/`pgvector`/`sqlite_vec` (its `__table__` defines the schema). Reads reconstruct model instances.
+- **Mapping** (`V = Mapping[str, Any]`) — `sqlalchemy`/`pgvector`/`sqlite_vec` only, via a host-supplied `table` or `reflect=True`.
+```python
+# plain strings — cbrkit builds the table
+store = cbrkit.indexable.pgvector[str, str](
+    url=..., value_column="body", pgvector_dim=384, conversion_func=embed,
+)
+# typed rows — a SQLAlchemy mapped class defines the schema
+class Car(Base):
+    __tablename__ = "cars"
+    key: Mapped[str] = mapped_column(primary_key=True)
+    desc: Mapped[str] = mapped_column()
+    _pgvec: Mapped[Any] = mapped_column(cbrkit.indexable.PGVECTOR(384), nullable=False)
+store = cbrkit.indexable.pgvector[str, Car](url=..., model=Car, value_column="desc", ...)
+```
+Pass `vector_type="halfvec"` for half-precision storage (~2x smaller, negligible recall loss); for a typed model, declare the column with the re-exported `cbrkit.indexable.HALFVEC` instead of `PGVECTOR`.
+For a self-contained, file-based store, `sqlite_vec` offers the same dense/sparse/hybrid API on SQLite via the [`sqlite-vec`](https://github.com/asg017/sqlite-vec) extension (loaded automatically). Dense KNN uses a `vec0` virtual table (so the backend inherits future `vec0` capabilities such as approximate search, and supports quantized `vector_type="int8"` storage today), sparse search uses built-in FTS5, and `Filter` `WHERE` clauses compose by joining matches back to the main table:
+```python
+store = cbrkit.indexable.sqlite_vec[str, str](
+    url="sqlite+aiosqlite:///cases.db",
+    value_column="body", vector_dim=384, index_type="hybrid", conversion_func=embed,
+)
+store.put_index(frozendict(casebase))
+retriever = cbrkit.retrieval.indexable.sqlite_vec(storage=store, search_type="hybrid")
+```
-The `System` class also supports indexed retrieval by defaulting the casebase to an empty dict.
-This allows creating a system where all retrievers are pre-indexed and no casebase needs to be provided at query time.
+**Retain caveat:** the storage-backed retrievers search by the text column and always return `Casebase[K, str]`, projecting richer values down to their text.
+A retrieve → retain round-trip via `cbrkit.retrieval.indexable.*` + `cbrkit.retain.indexable` therefore lines up cleanly only when `V = str`.
+For model or mapping stores, either use the backend as a typed store (read `storage.index` as `Casebase[K, V]` and retrieve with a value-based retriever like `cbrkit.retrieval.build(...)`), or re-hydrate full rows by key from `storage.index` after text retrieval.
 ## Evaluation

{cbrkit-1.2.0 → cbrkit-1.4.0}/pyproject.toml RENAMED Viewed

@@ -1,56 +1,56 @@
 [project]
 name = "cbrkit"
-version = "1.2.0"
+version = "1.4.0"
 description = "Customizable Case-Based Reasoning (CBR) toolkit for Python with a built-in API and CLI"
 authors = [{ name = "Mirko Lenz", email = "mirko@mirkolenz.com" }]
 readme = "README.md"
+license = "MIT"
 keywords = [
-    "cbr",
-    "case-based reasoning",
-    "api",
-    "similarity",
-    "nlp",
-    "retrieval",
-    "cli",
-    "tool",
-    "library",
+  "cbr",
+  "case-based reasoning",
+  "api",
+  "similarity",
+  "nlp",
+  "retrieval",
+  "cli",
+  "tool",
+  "library",
 ]
 classifiers = [
-    "Development Status :: 4 - Beta",
-    "Environment :: Console",
-    "Framework :: Pytest",
-    "Intended Audience :: Developers",
-    "Intended Audience :: Science/Research",
-    "License :: OSI Approved :: MIT License",
-    "Natural Language :: English",
-    "Operating System :: OS Independent",
-    "Programming Language :: Python :: 3.13",
-    "Programming Language :: Python :: 3.14",
-    "Programming Language :: Python :: 3",
-    "Topic :: Scientific/Engineering :: Artificial Intelligence",
-    "Topic :: Scientific/Engineering :: Information Analysis",
-    "Topic :: Software Development :: Libraries :: Python Modules",
-    "Topic :: Utilities",
-    "Typing :: Typed",
+  "Development Status :: 4 - Beta",
+  "Environment :: Console",
+  "Framework :: Pytest",
+  "Intended Audience :: Developers",
+  "Intended Audience :: Science/Research",
+  "Natural Language :: English",
+  "Operating System :: OS Independent",
+  "Programming Language :: Python :: 3.13",
+  "Programming Language :: Python :: 3.14",
+  "Programming Language :: Python :: 3",
+  "Topic :: Scientific/Engineering :: Artificial Intelligence",
+  "Topic :: Scientific/Engineering :: Information Analysis",
+  "Topic :: Software Development :: Libraries :: Python Modules",
+  "Topic :: Utilities",
+  "Typing :: Typed",
 ]
 requires-python = ">=3.13,<4"
 dependencies = [
-    "frozendict>=2,<3",
-    "numpy>=2,<3",
-    "orjson>=3,<4",
-    "polars>=1,<2",
-    "pydantic>=2,<3",
-    "pyyaml>=6,<7",
-    "rtoml>=0.12,<1",
-    "scipy>=1,<2",
-    "xmltodict>=1,<2",
+  "frozendict>=2,<3",
+  "numpy>=2,<3",
+  "orjson>=3,<4",
+  "polars>=1,<2",
+  "pydantic>=2,<3",
+  "pyyaml>=6,<7",
+  "rtoml>=0.12,<1",
+  "scipy>=1,<2",
+  "xmltodict>=1,<2",
 ]
 [project.optional-dependencies]
 # LLM providers
 anthropic = ["anthropic>=0.40,<1"]
-cohere = ["cohere>=5,<6"]
-google = ["google-genai>=1,<2"]
+cohere = ["cohere>=7,<8"]
+google = ["google-genai>=2,<3"]
 instructor = ["instructor>=1,<2"]
 ollama = ["ollama>=0.3,<1"]
 openai = ["openai>=1,<3", "tiktoken>=0.8,<1"]
@@ -76,26 +76,30 @@ graphviz = ["pygraphviz>=1,<2"]
 chromadb = ["chromadb>=1,<2"]
 lancedb = ["lancedb>=0.20,<1"]
 pandas = ["pandas>=2,<4"]
-sql = ["sqlalchemy>=2,<3"]
-# zvec = ["zvec>=0.2,<1"]
+pgvector = ["pgvector>=0.4,<1", "cbrkit[sql]"]
+sql = ["sqlalchemy[asyncio]>=2,<3"]
+sqlite-vec = ["sqlite-vec>=0.1,<1", "aiosqlite>=0.20,<1", "cbrkit[sql]"]
+zvec = ["zvec>=0.2,<1"]
 # Tools
-cli = ["rich>=13,<15", "typer>=0.9,<1"]
+cli = ["rich>=14,<16", "typer>=0.20,<1"]
 eval = ["ranx>=0.3,<1"]
 timeseries = ["minineedle>=3,<4"]
 # Entry points
 api = [
-    "cbrkit[cli]",
-    "fastapi>=0.100,<1",
-    "pydantic-settings>=2,<3",
-    "python-multipart>=0.0.15,<1",
-    "uvicorn[standard]>=0.30,<1",
-    "fastmcp>=3,<4",
+  "cbrkit[cli]",
+  "fastapi>=0.100,<1",
+  "pydantic-settings>=2,<3",
+  "python-multipart>=0.0.15,<1",
+  "uvicorn[standard]>=0.30,<1",
+  "fastmcp>=3,<4",
 ]
 # Bundle
-all = ["cbrkit[anthropic,api,bm25,chromadb,chunking,cohere,eval,google,graphs,graphviz,instructor,lancedb,levenshtein,nltk,ollama,openai,openai-agents,pandas,pydantic-ai,spacy,sql,timeseries,transformers,voyageai,zvec]"]
+all = [
+  "cbrkit[anthropic,api,bm25,chromadb,chunking,cohere,eval,google,graphs,graphviz,instructor,lancedb,levenshtein,nltk,ollama,openai,openai-agents,pandas,pgvector,pydantic-ai,spacy,sql,sqlite-vec,timeseries,transformers,voyageai,zvec]",
+]
 [project.urls]
 Repository = "https://github.com/wi2trier/cbrkit"
@@ -117,11 +121,23 @@ build-backend = "uv_build"
 [tool.pytest]
 testpaths = ["src", "tests"]
-addopts = ["--cov=src/cbrkit", "--cov-report=term-missing", "--doctest-modules", "--import-mode=importlib"]
+addopts = [
+  "--cov=src/cbrkit",
+  "--cov-report=term-missing",
+  "--doctest-modules",
+  "--import-mode=importlib",
+]
 doctest_optionflags = ["NORMALIZE_WHITESPACE", "IGNORE_EXCEPTION_DETAIL", "ELLIPSIS"]
 [tool.uv]
 default-groups = ["dev", "test", "docs"]
+[tool.uv.extra-build-dependencies]
+pygraphviz = ["setuptools"]
+cbor = ["setuptools"]
+warc3-wet-clueweb09 = ["setuptools"]
+zlib-state = ["setuptools"]
+pystemmer = ["setuptools", "cython"]
 [tool.ruff.lint.pydocstyle]
 convention = "google"

{cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/api.py RENAMED Viewed

@@ -189,7 +189,7 @@ def synthesize(
     )
-def openapi_generator():
+def openapi_generator() -> dict[str, Any]:
     """Generate and cache the OpenAPI schema for the CBRKit API."""
     if not app.openapi_schema:
         app.openapi_schema = get_openapi(
@@ -203,4 +203,4 @@ def openapi_generator():
     return app.openapi_schema
-app.openapi = openapi_generator  # type: ignore[assignment]
+app.openapi = openapi_generator  # type: ignore[assignment]  # ty: ignore[invalid-assignment]

{cbrkit-1.2.0 → cbrkit-1.4.0}/src/cbrkit/eval/common.py RENAMED Viewed

@@ -7,7 +7,7 @@ from typing import Literal, cast
 from ..helpers import (
     get_logger,
     normalize_and_scale,
-    round,
+    round_int,
     sim_map2ranking,
     unpack_float,
     unpack_floats,
@@ -487,15 +487,18 @@ def generate_metrics(
         >>> generate_metrics(["precision", "recall"], ks=5)
         ['precision@5', 'recall@5']
     """
-    if not isinstance(ks, Iterable):
-        ks = [ks]
-    if not isinstance(relevance_levels, Iterable):
-        relevance_levels = [relevance_levels]
+    ks_list: list[int | None] = [ks] if ks is None or isinstance(ks, int) else list(ks)
+    relevance_levels_list: list[int | None] = (
+        [relevance_levels]
+        if relevance_levels is None or isinstance(relevance_levels, int)
+        else list(relevance_levels)
+    )
     return [
-        generate_metric(*args)
-        for args in itertools.product(metrics, ks, relevance_levels)
+        generate_metric(metric, k, relevance_level)
+        for metric, k, relevance_level in itertools.product(
+            metrics, ks_list, relevance_levels_list
+        )
     ]
@@ -528,7 +531,7 @@ def similarities_to_qrels[Q, C](
     return {
         query: {
-            case: round(
+            case: round_int(
                 normalize_and_scale(sim, min_sim, max_sim, min_qrel, max_qrel),
                 round_mode,
             )

cbrkit 1.2.0__tar.gz → 1.4.0__tar.gz

cbrkit 1.2.0__tar.gz → 1.4.0__tar.gz