spruceup-ai 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spruceup_ai-0.1.0/.gitignore +13 -0
- spruceup_ai-0.1.0/CLAUDE.md +139 -0
- spruceup_ai-0.1.0/LICENSE +21 -0
- spruceup_ai-0.1.0/PKG-INFO +460 -0
- spruceup_ai-0.1.0/README.md +414 -0
- spruceup_ai-0.1.0/example/data_corpus/DEMO_TEXT.txt +83 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000000.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000001.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000002.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000003.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000004.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000005.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000006.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000007.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000008.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000009.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000010.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000011.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000012.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000013.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000014.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000015.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000016.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000017.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000018.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000019.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000020.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000021.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000022.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000023.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000024.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000025.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000026.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000027.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000028.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000029.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000030.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000031.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000032.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000033.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000034.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000035.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000036.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000037.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000038.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000039.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000040.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000041.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000042.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000043.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000044.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000045.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000046.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000047.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000048.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000049.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000050.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000051.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000052.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000053.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000054.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000055.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000056.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000057.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000058.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000059.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000060.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000061.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000062.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000063.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000064.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000065.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000066.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000067.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000068.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000069.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000070.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000071.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000072.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000073.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000074.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000075.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000076.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000077.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000078.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000079.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000080.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000081.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000082.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000083.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000084.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000085.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000086.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000087.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000088.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000089.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000090.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000091.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000092.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000093.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000094.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000095.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000096.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000097.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000098.txt +19 -0
- spruceup_ai-0.1.0/example/data_corpus/file_0000099.txt +19 -0
- spruceup_ai-0.1.0/example/dummy_pipeline.py +183 -0
- spruceup_ai-0.1.0/example/second_local_source/DEMO_TEXT.txt +83 -0
- spruceup_ai-0.1.0/pyproject.toml +62 -0
- spruceup_ai-0.1.0/spruceup_pipeline.py +166 -0
- spruceup_ai-0.1.0/src/spruceup/__init__.py +29 -0
- spruceup_ai-0.1.0/src/spruceup/__main__.py +3 -0
- spruceup_ai-0.1.0/src/spruceup/app.py +201 -0
- spruceup_ai-0.1.0/src/spruceup/cli.py +50 -0
- spruceup_ai-0.1.0/src/spruceup/config.py +45 -0
- spruceup_ai-0.1.0/src/spruceup/connectors/__init__.py +15 -0
- spruceup_ai-0.1.0/src/spruceup/connectors/base.py +170 -0
- spruceup_ai-0.1.0/src/spruceup/connectors/embedders/__init__.py +6 -0
- spruceup_ai-0.1.0/src/spruceup/connectors/embedders/cohere.py +47 -0
- spruceup_ai-0.1.0/src/spruceup/connectors/embedders/embedding_batcher.py +148 -0
- spruceup_ai-0.1.0/src/spruceup/connectors/embedders/gemini.py +73 -0
- spruceup_ai-0.1.0/src/spruceup/connectors/embedders/openai.py +50 -0
- spruceup_ai-0.1.0/src/spruceup/connectors/embedders/voyageai.py +44 -0
- spruceup_ai-0.1.0/src/spruceup/connectors/sources/__init__.py +4 -0
- spruceup_ai-0.1.0/src/spruceup/connectors/sources/google_drive.py +181 -0
- spruceup_ai-0.1.0/src/spruceup/connectors/sources/local.py +80 -0
- spruceup_ai-0.1.0/src/spruceup/connectors/targets/__init__.py +5 -0
- spruceup_ai-0.1.0/src/spruceup/connectors/targets/pgvector.py +128 -0
- spruceup_ai-0.1.0/src/spruceup/connectors/targets/pinecone.py +81 -0
- spruceup_ai-0.1.0/src/spruceup/connectors/targets/weaviate.py +143 -0
- spruceup_ai-0.1.0/src/spruceup/coordinator.py +156 -0
- spruceup_ai-0.1.0/src/spruceup/debounce_queue.py +24 -0
- spruceup_ai-0.1.0/src/spruceup/manifest.py +516 -0
- spruceup_ai-0.1.0/src/spruceup/memoize/__init__.py +3 -0
- spruceup_ai-0.1.0/src/spruceup/memoize/decorator.py +63 -0
- spruceup_ai-0.1.0/src/spruceup/memoize/decorator_utility.py +26 -0
- spruceup_ai-0.1.0/src/spruceup/models.py +32 -0
- spruceup_ai-0.1.0/src/spruceup/monitoring/__init__.py +0 -0
- spruceup_ai-0.1.0/src/spruceup/monitoring/google_drive_watcher.py +246 -0
- spruceup_ai-0.1.0/src/spruceup/monitoring/local_file_watcher.py +185 -0
- spruceup_ai-0.1.0/src/spruceup/monitoring/monitor.py +114 -0
- spruceup_ai-0.1.0/src/spruceup/sync_engine/__init__.py +8 -0
- spruceup_ai-0.1.0/src/spruceup/sync_engine/sync_engine.py +60 -0
- spruceup_ai-0.1.0/src/spruceup/sync_sweeper.py +53 -0
- spruceup_ai-0.1.0/src/spruceup/transform_context.py +33 -0
- spruceup_ai-0.1.0/src/spruceup/utils/__init__.py +9 -0
- spruceup_ai-0.1.0/src/spruceup/utils/hashing.py +59 -0
- spruceup_ai-0.1.0/src/spruceup/utils/schema.py +23 -0
- spruceup_ai-0.1.0/src/spruceup/utils/validation.py +33 -0
- spruceup_ai-0.1.0/tests/conftest.py +12 -0
- spruceup_ai-0.1.0/tests/fakes.py +123 -0
- spruceup_ai-0.1.0/tests/loadtest/bench_embed.py +183 -0
- spruceup_ai-0.1.0/tests/loadtest/gen_corpus.py +68 -0
- spruceup_ai-0.1.0/tests/loadtest/reset.py +43 -0
- spruceup_ai-0.1.0/tests/loadtest/run_ingest.py +616 -0
- spruceup_ai-0.1.0/tests/loadtest/stubs.py +77 -0
- spruceup_ai-0.1.0/tests/test_coordinator.py +131 -0
- spruceup_ai-0.1.0/tests/test_embedders.py +44 -0
- spruceup_ai-0.1.0/tests/test_hashing.py +21 -0
- spruceup_ai-0.1.0/tests/test_memoize.py +75 -0
- spruceup_ai-0.1.0/tests/test_sync_engine.py +97 -0
- spruceup_ai-0.1.0/tests/test_sync_sweeper.py +69 -0
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
# CLAUDE.md
|
|
2
|
+
|
|
3
|
+
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
|
|
4
|
+
|
|
5
|
+
## Commands
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
# Install dependencies (requires Python 3.12–3.14)
|
|
9
|
+
uv sync
|
|
10
|
+
|
|
11
|
+
# Run the app (must be run from the directory containing spruceup_pipeline.py)
|
|
12
|
+
uv run spruceup start
|
|
13
|
+
|
|
14
|
+
# Run all tests
|
|
15
|
+
uv run pytest
|
|
16
|
+
|
|
17
|
+
# Run a single test file
|
|
18
|
+
uv run pytest tests/test_sync_engine.py
|
|
19
|
+
|
|
20
|
+
# Run a specific test
|
|
21
|
+
uv run pytest tests/test_sync_engine.py::test_reconcile_new_file
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
Required env vars for the pipeline: `PG_CONNSTR` and an embedder API key (e.g. `OPENAI_API_KEY`). Copy credentials into `.env`; `spruceup_pipeline.py` calls `dotenv.load_dotenv()` at import time.
|
|
25
|
+
|
|
26
|
+
## Architecture
|
|
27
|
+
|
|
28
|
+
SpruceUp is a document ingestion daemon. It watches source connectors for file changes, transforms documents into chunks, embeds them, and keeps a target vector store in sync.
|
|
29
|
+
|
|
30
|
+
### Pipeline file (`spruceup_pipeline.py`)
|
|
31
|
+
|
|
32
|
+
The user-authored entry point. The CLI (`spruceup start`) imports it dynamically from the CWD. It must define a `config` variable assigned the value returned by `defineConfig()`:
|
|
33
|
+
|
|
34
|
+
```python
|
|
35
|
+
config = defineConfig(
|
|
36
|
+
sources=[LocalFilesSource(watched_dir="example/data_corpus")],
|
|
37
|
+
target=PgVectorTarget(connstr=..., table="data_chunks", schema=LectureChunk, vector_column="chunk_embedding"),
|
|
38
|
+
embedder=OpenAIEmbedder(api_key=..., model="text-embedding-3-small"),
|
|
39
|
+
transform=build_lecture_chunks, # async fn(*, file_props: FileProps, embed) -> list[schema]
|
|
40
|
+
cache_files=False, # optional; True caches raw file content in the manifest (default False)
|
|
41
|
+
)
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
`defineConfig()` validates types eagerly at import time. `validate_pipeline()` in `cli.py` checks the contract exists before starting the event loop.
|
|
45
|
+
|
|
46
|
+
### Runtime flow (`app.py`)
|
|
47
|
+
|
|
48
|
+
On startup, `app.run(pipeline)` compares persisted fingerprints in the Manifest against the current config. Any mismatch triggers a **full reindex** (all files re-fetched, re-transformed, re-upserted) instead of incremental sync:
|
|
49
|
+
1. Transform function body changed (source hash)
|
|
50
|
+
2. Any `@memoize`-decorated function changed
|
|
51
|
+
3. Embedding model changed
|
|
52
|
+
4. Embedding dimensions changed
|
|
53
|
+
5. Target identity changed — `target.identity()`, a credential-free string (host/db/table or index/collection)
|
|
54
|
+
6. Schema changed — `hash_schema()` over field names+types and the designated `vector_column`
|
|
55
|
+
|
|
56
|
+
Signals 3–4 additionally **flush the embedding cache** (`embeddings_invalidated`). Signals 4–6 are **structural** and additionally **drop + recreate** the target table/index before reingest (`ensure_table_exists(recreate=True)`) — chosen over in-place migration because reingest must re-embed everything anyway.
|
|
57
|
+
|
|
58
|
+
On any mismatch, every file row is marked `needs_reindex` and the new fingerprints are persisted immediately (mark first, persist second — a crash in between just re-marks on the next start). A file stays `needs_reindex` until a sync **succeeds**; failures and restarts don't clear it. `SyncEngine.reconcile` pushes **all** chunks of a `needs_reindex` file instead of the diff (config changes don't alter chunk hashes, so the diff can't see them). `needs_reindex` files are re-enqueued at every startup and retried by the sync sweeper, so an interrupted reindex resumes where it left off.
|
|
59
|
+
|
|
60
|
+
Then it launches three concurrent asyncio tasks:
|
|
61
|
+
|
|
62
|
+
| Task | Role |
|
|
63
|
+
|------|------|
|
|
64
|
+
| `Monitor` | Runs all watchers; each watcher does a catch-up scan then enters a watch loop |
|
|
65
|
+
| `Coordinator` | Dequeues `SyncTask` objects and processes them (up to 32 concurrent) |
|
|
66
|
+
| `SyncSweeper` | Retries `failed` and `needs_reindex` files every 60 seconds |
|
|
67
|
+
|
|
68
|
+
### File change lifecycle
|
|
69
|
+
|
|
70
|
+
```
|
|
71
|
+
Source watcher → DebounceQueue → Coordinator
|
|
72
|
+
↓
|
|
73
|
+
source.fetch() → SpruceFile
|
|
74
|
+
↓
|
|
75
|
+
transform(file_props, embed) → list[UserChunk]
|
|
76
|
+
↓
|
|
77
|
+
SyncEngine.reconcile() → chunk diff → target.sync()
|
|
78
|
+
↓
|
|
79
|
+
Manifest.set_sync_state("synced")
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
`DebounceQueue` (wraps `asyncio.Queue`) evicts any already-queued task for the same `file_id` when a newer task arrives, preventing redundant processing. **Tradeoff:** to evict the superseded task it reaches into `asyncio.Queue` internals (`_queue`, `_unfinished_tasks`), so it's coupled to the CPython queue implementation and could break on a stdlib change.
|
|
83
|
+
|
|
84
|
+
### Manifest (`manifest.py`)
|
|
85
|
+
|
|
86
|
+
A local SQLite database (`spruceup_manifest.db`) that is the source of truth for:
|
|
87
|
+
- Registered data sources and their state (e.g. Google Drive page tokens)
|
|
88
|
+
- File rows: content hash, raw content (only when `cache_files=True`), sync state (`needs_reindex` / `in_flight` / `synced` / `failed`)
|
|
89
|
+
- Chunk rows: `(file_id, user_chunk_object_hash)` pairs for diffing
|
|
90
|
+
- Memoize cache: `(file_id, fn_hash, args_hash) → result`
|
|
91
|
+
- Embedding cache: `(file_id, chunk_text_hash) → embedding bytes`
|
|
92
|
+
- Config state: `embedding_model`, `embedding_dimensions`, `target_identity`, `schema_fingerprint`
|
|
93
|
+
|
|
94
|
+
Opened with `autocommit=True`; use `manifest.transaction()` only when multiple writes must be atomic.
|
|
95
|
+
|
|
96
|
+
### Connector ABCs (`connectors/base.py`)
|
|
97
|
+
|
|
98
|
+
All connectors implement one of three ABCs:
|
|
99
|
+
|
|
100
|
+
- **`SourceConnector`** — `source_type`, `source_identifier`, `create_watcher()`, `fetch()`, `validate()`, `is_supported()`, `decode_content()`
|
|
101
|
+
- **`TargetConnector`** — `vector_column`, `identity()`, `ensure_table_exists(recreate=False)`, `sync(upserts, deletes)`, `aclose()`
|
|
102
|
+
- **`EmbedderConnector`** — `embed_batch(batch)`, `process_chunks(chunks)`, `aclose()`
|
|
103
|
+
|
|
104
|
+
Available implementations:
|
|
105
|
+
|
|
106
|
+
| Type | Implementations |
|
|
107
|
+
|------|----------------|
|
|
108
|
+
| Source | `LocalFilesSource`, `GoogleDriveSource` |
|
|
109
|
+
| Target | `PgVectorTarget`, `PineconeTarget`, `WeaviateTarget` |
|
|
110
|
+
| Embedder | `OpenAIEmbedder`, `CohereEmbedder`, `GeminiEmbedder`, `VoyageAIEmbedder` |
|
|
111
|
+
|
|
112
|
+
`LocalFilesSource` and `LocalFileWatcher` exist for local testing. Production reasoning should be framed in terms of the connector ABCs.
|
|
113
|
+
|
|
114
|
+
An embedder's `api_key` accepts a `str` or a `Callable[[], str]` (e.g. a secrets-manager fetch, resolved when the client is built). On a credential rejection the embedder raises `TokenExpiredError`; the base `embed_batch_retrying` then drops the cached client so the next retry rebuilds it with a re-resolved token. Static-string keys are left untouched (there is no point re-resolving them). Auth-error detection is per-SDK: each `embed_batch` catches its provider's exception and normalizes it to `TokenExpiredError`.
|
|
115
|
+
|
|
116
|
+
### EmbeddingBatcher (`connectors/embedders/embedding_batcher.py`)
|
|
117
|
+
|
|
118
|
+
Wraps any `EmbedderConnector`. Accumulates chunks from concurrent file transforms and flushes them as batched API calls (max 100ms wait or `max_batch_size` chunks, max 5 concurrent API calls). Also consults the Manifest embedding cache before calling the API — cache is scoped per `file_id` and keyed by `blake2b(chunk_text)`.
|
|
119
|
+
|
|
120
|
+
Each accumulated chunk gets its own `asyncio.Future`; a per-call `asyncio.gather` over those futures reassembles a caller's embeddings in order and waits for completion, so the batcher carries no per-file slot bookkeeping (a flushed batch may mix chunks from several callers).
|
|
121
|
+
|
|
122
|
+
### `@memoize` decorator (`memoize/decorator.py`)
|
|
123
|
+
|
|
124
|
+
Caches async subfunctions in the Manifest, scoped per file. The decorated function must be `async` — decorating a sync function raises `TypeError` at import time. Results are invalidated when the function body changes. Valid **only** when called from within the `transform` function — it reads the `contextvars` set by `Coordinator.upsert_file()` via `transform_scope`.
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
@memoize(return_type=str)
|
|
128
|
+
async def summarize(text: str) -> str: ...
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
Supported return types: `str`, `int`, `float`, `bool`, `list`, `dict`.
|
|
132
|
+
|
|
133
|
+
### PgVectorTarget schema mapping
|
|
134
|
+
|
|
135
|
+
`ensure_table_exists()` inspects the user dataclass with `typing.get_type_hints()` and maps Python types to Postgres types. The embedding column is named **explicitly** via the target's `vector_column=` (validated at construction to be a `list[float]` field) and becomes `vector(N)` (requires `pgvector` extension), where `N` is the embedder's `embedding_dimensions`. Any *other* `list[float]` field maps to a plain `DOUBLE PRECISION[]` array, not a vector. The `id` column is always `TEXT PRIMARY KEY`, set to `f"{file_id}:{chunk.user_chunk_object_hash.hex()}"` (keyed per file). Upserts use `ON CONFLICT (id) DO UPDATE` so re-embeds (e.g. after a model change) overwrite existing rows.
|
|
136
|
+
|
|
137
|
+
### Google Drive source
|
|
138
|
+
|
|
139
|
+
`GoogleDriveSource` takes a `watched_dir` (folder ID) and an `on_token_expired: Callable[[], str]` that returns a fresh OAuth access token. The `drive.readonly` scope covers all required API calls (list, download, export, changes). Startup validation rejects nested watched folders.
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 SpruceUp Contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,460 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: spruceup-ai
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A standalone system for making automated, incremental updates to a vector database.
|
|
5
|
+
Project-URL: Repository, https://github.com/SpruceUp-ai/SpruceUp
|
|
6
|
+
Author-email: Ekerin Agboola <ekerin.m.a@gmail.com>, Eric Cho <ekc7590@gmail.com>, AJ Fuhler <aart.fuhler@gmail.com>, Caleb Pickard <caleb.pickard@gmail.com>
|
|
7
|
+
License: MIT License
|
|
8
|
+
|
|
9
|
+
Copyright (c) 2026 SpruceUp Contributors
|
|
10
|
+
|
|
11
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
12
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
13
|
+
in the Software without restriction, including without limitation the rights
|
|
14
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
15
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
16
|
+
furnished to do so, subject to the following conditions:
|
|
17
|
+
|
|
18
|
+
The above copyright notice and this permission notice shall be included in all
|
|
19
|
+
copies or substantial portions of the Software.
|
|
20
|
+
|
|
21
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
22
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
23
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
24
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
25
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
26
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
27
|
+
SOFTWARE.
|
|
28
|
+
License-File: LICENSE
|
|
29
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
30
|
+
Classifier: Operating System :: OS Independent
|
|
31
|
+
Classifier: Programming Language :: Python :: 3
|
|
32
|
+
Requires-Python: <3.15,>=3.12
|
|
33
|
+
Requires-Dist: cohere<8.0.0,>=7.0.0
|
|
34
|
+
Requires-Dist: google-api-python-client<3.0.0,>=2.197.0
|
|
35
|
+
Requires-Dist: google-auth<3.0.0,>=2.53.0
|
|
36
|
+
Requires-Dist: google-genai<3.0.0,>=2.6.0
|
|
37
|
+
Requires-Dist: openai<3.0.0,>=2.37.0
|
|
38
|
+
Requires-Dist: pinecone>=3.0.0
|
|
39
|
+
Requires-Dist: psycopg-pool<4.0.0,>=3.0.0
|
|
40
|
+
Requires-Dist: psycopg<4.0.0,>=3.3.4
|
|
41
|
+
Requires-Dist: tenacity<10.0.0,>=9.1.4
|
|
42
|
+
Requires-Dist: voyageai<0.5.0,>=0.4.0
|
|
43
|
+
Requires-Dist: watchfiles>=1.0.0
|
|
44
|
+
Requires-Dist: weaviate-client<5.0.0,>=4.0.0
|
|
45
|
+
Description-Content-Type: text/markdown
|
|
46
|
+
|
|
47
|
+
## SpruceUp
|
|
48
|
+
|
|
49
|
+
**SpruceUp** is a standalone system for making automated, incremental updates to a vector database.
|
|
50
|
+
|
|
51
|
+
## Installation
|
|
52
|
+
|
|
53
|
+
Add SpruceUp to your project (e.g., with `poetry`, `pip`, or `uv`):
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
poetry add spruceup-ai # OR
|
|
57
|
+
pip install spruceup-ai # OR
|
|
58
|
+
uv add spruceup-ai
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
---
|
|
62
|
+
|
|
63
|
+
## Setup
|
|
64
|
+
|
|
65
|
+
Create a file named `spruceup_pipeline.py` in your project directory. This is the user-authored entry point SpruceUp loads at startup. It must export a single `config` variable built with `defineConfig()`.
|
|
66
|
+
|
|
67
|
+
```python
|
|
68
|
+
# spruceup_pipeline.py
|
|
69
|
+
import re
|
|
70
|
+
import os
|
|
71
|
+
from dataclasses import dataclass
|
|
72
|
+
from spruceup import defineConfig, FileProps, LocalFilesSource, PgVectorTarget, OpenAIEmbedder
|
|
73
|
+
|
|
74
|
+
@dataclass
|
|
75
|
+
class ArticleChunk:
|
|
76
|
+
title: str
|
|
77
|
+
content: str
|
|
78
|
+
embedding: list[float]
|
|
79
|
+
|
|
80
|
+
def split_into_paragraphs(text: str) -> list[str]:
|
|
81
|
+
return [p.strip() for p in re.split(r"\n{2,}", text) if p.strip()]
|
|
82
|
+
|
|
83
|
+
async def transform(*, file_props: FileProps, embed) -> list[ArticleChunk]:
|
|
84
|
+
paragraphs = split_into_paragraphs(file_props.raw_content)
|
|
85
|
+
embeddings = await embed(paragraphs)
|
|
86
|
+
return [
|
|
87
|
+
ArticleChunk(title=file_props.display_name, content=para, embedding=emb)
|
|
88
|
+
for para, emb in zip(paragraphs, embeddings)
|
|
89
|
+
]
|
|
90
|
+
|
|
91
|
+
config = defineConfig(
|
|
92
|
+
sources=[LocalFilesSource(watched_dir="./articles")],
|
|
93
|
+
target=PgVectorTarget(
|
|
94
|
+
connstr=os.environ["PG_CONNSTR"],
|
|
95
|
+
table="article_chunks",
|
|
96
|
+
schema=ArticleChunk,
|
|
97
|
+
vector_column="embedding",
|
|
98
|
+
),
|
|
99
|
+
embedder=OpenAIEmbedder(api_key=os.environ["OPENAI_API_KEY"]),
|
|
100
|
+
transform=transform,
|
|
101
|
+
)
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
---
|
|
105
|
+
|
|
106
|
+
## Running SpruceUp
|
|
107
|
+
|
|
108
|
+
From the directory containing your `spruceup_pipeline.py` file:
|
|
109
|
+
|
|
110
|
+
```bash
|
|
111
|
+
spruceup start
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
SpruceUp will scan your sources, sync any files not yet in the manifest, then enter a watch loop for incremental updates.
|
|
115
|
+
|
|
116
|
+
---
|
|
117
|
+
|
|
118
|
+
## Imports
|
|
119
|
+
|
|
120
|
+
Everything you need is importable from the top-level `spruceup` package:
|
|
121
|
+
|
|
122
|
+
```python
|
|
123
|
+
from spruceup import (
|
|
124
|
+
defineConfig,
|
|
125
|
+
FileProps,
|
|
126
|
+
|
|
127
|
+
# Sources
|
|
128
|
+
LocalFilesSource,
|
|
129
|
+
GoogleDriveSource,
|
|
130
|
+
|
|
131
|
+
# Targets
|
|
132
|
+
PgVectorTarget,
|
|
133
|
+
PineconeTarget,
|
|
134
|
+
WeaviateTarget,
|
|
135
|
+
|
|
136
|
+
# Embedders
|
|
137
|
+
OpenAIEmbedder,
|
|
138
|
+
CohereEmbedder,
|
|
139
|
+
GeminiEmbedder,
|
|
140
|
+
VoyageAIEmbedder,
|
|
141
|
+
|
|
142
|
+
# Utilities
|
|
143
|
+
memoize,
|
|
144
|
+
)
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
---
|
|
148
|
+
|
|
149
|
+
## `defineConfig()`
|
|
150
|
+
|
|
151
|
+
```python
|
|
152
|
+
config = defineConfig(
|
|
153
|
+
sources=[...],
|
|
154
|
+
target=...,
|
|
155
|
+
embedder=...,
|
|
156
|
+
transform=...,
|
|
157
|
+
)
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
All parameters are keyword-only.
|
|
161
|
+
|
|
162
|
+
| Parameter | Type | Required | Default | Description |
|
|
163
|
+
| ------------- | ----------------------- | -------- | ------- | ------------------------------------- |
|
|
164
|
+
| `sources` | `list[SourceConnector]` | Yes | — | At least one source connector |
|
|
165
|
+
| `target` | `TargetConnector` | Yes | — | Where synced chunks are written |
|
|
166
|
+
| `embedder` | `EmbedderConnector` | Yes | — | Generates embeddings for your chunks |
|
|
167
|
+
| `transform` | `async callable` | Yes | — | Converts a file into a list of chunks |
|
|
168
|
+
| `cache_files` | `bool` | No | `False` | Cache raw file bytes in the manifest |
|
|
169
|
+
|
|
170
|
+
---
|
|
171
|
+
|
|
172
|
+
## The Transform Function
|
|
173
|
+
|
|
174
|
+
The transform function is where you split, enrich, and embed your documents. SpruceUp calls it for every file that changes. This function **must** be async.
|
|
175
|
+
|
|
176
|
+
```python
|
|
177
|
+
async def transform(*, file_props: FileProps, embed) -> list[YourSchema]:
|
|
178
|
+
...
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
### `FileProps`
|
|
182
|
+
|
|
183
|
+
| Field | Type | Description |
|
|
184
|
+
| -------------- | -------------- | ------------------------------------------------------------ |
|
|
185
|
+
| `raw_content` | `str \| bytes` | File content. Text formats are decoded as UTF-8; binary formats like PDF are passed through as raw `bytes`. |
|
|
186
|
+
| `display_name` | `str` | The filename |
|
|
187
|
+
| `file_type` | `str` | File extension (e.g. `"txt"`, `"pdf"`) |
|
|
188
|
+
|
|
189
|
+
### `embed`
|
|
190
|
+
|
|
191
|
+
`embed` is an async callable that takes a list of strings and returns a list of embedding vectors:
|
|
192
|
+
|
|
193
|
+
```python
|
|
194
|
+
embeddings: list[list[float]] = await embed(["chunk one", "chunk two"])
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
### Chunk Schema
|
|
198
|
+
|
|
199
|
+
Your transform returns a list of instances of a user-defined dataclass. SpruceUp uses this schema for diffing and for writing to the target store. Define it as a plain dataclass:
|
|
200
|
+
|
|
201
|
+
```python
|
|
202
|
+
@dataclass
|
|
203
|
+
class MyChunk:
|
|
204
|
+
title: str
|
|
205
|
+
text: str
|
|
206
|
+
embedding: list[float]
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
All target connectors support `str`, `int`, `float`, `bool`, and `list[float]` as field types. Use `list[float]` for your embedding vector. You do not need to define an `id` field. SpruceUp generates one from each chunk's content hash.
|
|
210
|
+
|
|
211
|
+
---
|
|
212
|
+
|
|
213
|
+
## Source Connectors
|
|
214
|
+
|
|
215
|
+
### `LocalFilesSource`
|
|
216
|
+
|
|
217
|
+
Watches a local directory for file changes.
|
|
218
|
+
|
|
219
|
+
```python
|
|
220
|
+
LocalFilesSource(watched_dir="./data")
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
| Parameter | Type | Required | Default | Description |
|
|
224
|
+
| ------------- | ----- | -------- | ------- | ------------------------------ |
|
|
225
|
+
| `watched_dir` | `str` | Yes | — | Path to the directory to watch |
|
|
226
|
+
|
|
227
|
+
---
|
|
228
|
+
|
|
229
|
+
### `GoogleDriveSource`
|
|
230
|
+
|
|
231
|
+
Watches a Google Drive folder for file changes. Requires the `drive.readonly` OAuth scope.
|
|
232
|
+
|
|
233
|
+
```python
|
|
234
|
+
GoogleDriveSource(
|
|
235
|
+
watched_dir="<folder-id>",
|
|
236
|
+
on_token_expired=get_access_token,
|
|
237
|
+
recursive=True,
|
|
238
|
+
)
|
|
239
|
+
```
|
|
240
|
+
|
|
241
|
+
| Parameter | Type | Required | Default | Description |
|
|
242
|
+
| ------------------ | ------------------- | -------- | ------- | ------------------------------------------------------------ |
|
|
243
|
+
| `watched_dir` | `str` | Yes | — | Google Drive folder ID |
|
|
244
|
+
| `on_token_expired` | `Callable[[], str]` | Yes | — | Called when the access token expires; must return a fresh token string |
|
|
245
|
+
| `recursive` | `bool` | No | `True` | Whether to watch subfolders |
|
|
246
|
+
|
|
247
|
+
The `on_token_expired` callback is invoked whenever the connector needs a new OAuth token. It should return a valid access token or raise an exception.
|
|
248
|
+
|
|
249
|
+
---
|
|
250
|
+
|
|
251
|
+
## Target Connectors
|
|
252
|
+
|
|
253
|
+
### `PgVectorTarget`
|
|
254
|
+
|
|
255
|
+
Syncs chunks to a PostgreSQL table using the `pgvector` extension.
|
|
256
|
+
|
|
257
|
+
```python
|
|
258
|
+
PgVectorTarget(
|
|
259
|
+
connstr="postgresql://user:pass@localhost/mydb",
|
|
260
|
+
table="my_chunks",
|
|
261
|
+
schema=MyChunk,
|
|
262
|
+
vector_column="embedding",
|
|
263
|
+
)
|
|
264
|
+
```
|
|
265
|
+
|
|
266
|
+
| Parameter | Type | Required | Default | Description |
|
|
267
|
+
| --------------- | ------ | -------- | ------- | ---------------------------------------------- |
|
|
268
|
+
| `connstr` | `str` | Yes | — | PostgreSQL connection string |
|
|
269
|
+
| `table` | `str` | Yes | — | Table name |
|
|
270
|
+
| `schema` | `type` | Yes | — | Your chunk dataclass |
|
|
271
|
+
| `vector_column` | `str` | Yes | — | Field name on your schema that holds the vector |
|
|
272
|
+
|
|
273
|
+
SpruceUp creates the table and its columns automatically based on your schema's type hints. The `pgvector` extension must be installed on your database.
|
|
274
|
+
|
|
275
|
+
---
|
|
276
|
+
|
|
277
|
+
### `PineconeTarget`
|
|
278
|
+
|
|
279
|
+
Syncs chunks to a Pinecone index.
|
|
280
|
+
|
|
281
|
+
```python
|
|
282
|
+
PineconeTarget(
|
|
283
|
+
api_key="pc-...",
|
|
284
|
+
index_name="my-index",
|
|
285
|
+
schema=MyChunk,
|
|
286
|
+
vector_column="embedding",
|
|
287
|
+
namespace="",
|
|
288
|
+
metric="cosine",
|
|
289
|
+
cloud="aws",
|
|
290
|
+
region="us-east-1",
|
|
291
|
+
)
|
|
292
|
+
```
|
|
293
|
+
|
|
294
|
+
| Parameter | Type | Required | Default | Description |
|
|
295
|
+
| --------------- | ------------- | -------- | ------------- | ----------------------------------------------------------- |
|
|
296
|
+
| `api_key` | `str \| None` | Yes | — | Pinecone API key |
|
|
297
|
+
| `index_name` | `str` | Yes | — | Name of the Pinecone index |
|
|
298
|
+
| `schema` | `type` | Yes | — | Your chunk dataclass |
|
|
299
|
+
| `vector_column` | `str` | Yes | — | Field name on your schema that holds the vector |
|
|
300
|
+
| `namespace` | `str` | No | `""` | Namespace within the index |
|
|
301
|
+
| `metric` | `str` | No | `"cosine"` | Distance metric (`"cosine"`, `"euclidean"`, `"dotproduct"`) |
|
|
302
|
+
| `cloud` | `str` | No | `"aws"` | Cloud provider |
|
|
303
|
+
| `region` | `str` | No | `"us-east-1"` | Cloud region |
|
|
304
|
+
|
|
305
|
+
---
|
|
306
|
+
|
|
307
|
+
### `WeaviateTarget`
|
|
308
|
+
|
|
309
|
+
Syncs chunks to a Weaviate collection.
|
|
310
|
+
|
|
311
|
+
```python
|
|
312
|
+
# Local instance
|
|
313
|
+
WeaviateTarget(
|
|
314
|
+
collection_name="MyChunks",
|
|
315
|
+
schema=MyChunk,
|
|
316
|
+
vector_column="embedding",
|
|
317
|
+
url="http://localhost:8080",
|
|
318
|
+
)
|
|
319
|
+
|
|
320
|
+
# Weaviate Cloud
|
|
321
|
+
WeaviateTarget(
|
|
322
|
+
collection_name="MyChunks",
|
|
323
|
+
schema=MyChunk,
|
|
324
|
+
vector_column="embedding",
|
|
325
|
+
cluster_url="https://my-cluster.weaviate.network",
|
|
326
|
+
api_key="wvp-...",
|
|
327
|
+
)
|
|
328
|
+
```
|
|
329
|
+
|
|
330
|
+
| Parameter | Type | Required | Default | Description |
|
|
331
|
+
| ----------------- | ------------- | -------- | ------------------------- | ----------------------------------------- |
|
|
332
|
+
| `collection_name` | `str` | Yes | — | Weaviate collection name |
|
|
333
|
+
| `schema` | `type` | Yes | — | Your chunk dataclass |
|
|
334
|
+
| `vector_column` | `str` | Yes | — | Field name on your schema that holds the vector |
|
|
335
|
+
| `url` | `str` | No | `"http://localhost:8080"` | URL for a local Weaviate instance |
|
|
336
|
+
| `cluster_url` | `str \| None` | No | `None` | URL for a Weaviate Cloud cluster |
|
|
337
|
+
| `api_key` | `str \| None` | No | `None` | API key for Weaviate Cloud authentication |
|
|
338
|
+
|
|
339
|
+
Use either `url` for a local instance or `cluster_url` + `api_key` for a cloud deployment.
|
|
340
|
+
|
|
341
|
+
---
|
|
342
|
+
|
|
343
|
+
## Embedder Connectors
|
|
344
|
+
|
|
345
|
+
SpruceUp runs a health check at startup that embeds a test string and reads the actual output size from the API. The `embedding_dimensions` parameter is optional on all embedders. If it is omitted, the dimension is detected automatically. If it is provided, SpruceUp validates that it matches what the API actually returns and raises an error if it does not.
|
|
346
|
+
|
|
347
|
+
### `OpenAIEmbedder`
|
|
348
|
+
|
|
349
|
+
```python
|
|
350
|
+
OpenAIEmbedder(
|
|
351
|
+
api_key="sk-...",
|
|
352
|
+
model="text-embedding-3-small",
|
|
353
|
+
max_batch_size=150,
|
|
354
|
+
embedding_dimensions=None,
|
|
355
|
+
)
|
|
356
|
+
```
|
|
357
|
+
|
|
358
|
+
| Parameter | Type | Required | Default | Description |
|
|
359
|
+
| ---------------------- | --------------------------- | -------- | -------------------------- | -------------------------- |
|
|
360
|
+
| `api_key` | `str \| Callable[[], str]` | Yes | — | OpenAI API key, or a callable that returns one |
|
|
361
|
+
| `model` | `str` | No | `"text-embedding-3-small"` | Embedding model |
|
|
362
|
+
| `max_batch_size` | `int` | No | `150` | Max texts per API call |
|
|
363
|
+
| `embedding_dimensions` | `int \| None` | No | `None` | Override output dimensions. If omitted, SpruceUp reads the actual dimension from the API at startup. |
|
|
364
|
+
|
|
365
|
+
---
|
|
366
|
+
|
|
367
|
+
### `CohereEmbedder`
|
|
368
|
+
|
|
369
|
+
```python
|
|
370
|
+
CohereEmbedder(
|
|
371
|
+
api_key="...",
|
|
372
|
+
model="embed-v4.0",
|
|
373
|
+
max_batch_size=96,
|
|
374
|
+
embedding_dimensions=None,
|
|
375
|
+
)
|
|
376
|
+
```
|
|
377
|
+
|
|
378
|
+
| Parameter | Type | Required | Default | Description |
|
|
379
|
+
| ---------------------- | --------------------------- | -------- | -------------- | -------------------------- |
|
|
380
|
+
| `api_key` | `str \| Callable[[], str]` | Yes | — | Cohere API key, or a callable that returns one |
|
|
381
|
+
| `model` | `str` | No | `"embed-v4.0"` | Embedding model |
|
|
382
|
+
| `max_batch_size` | `int` | No | `96` | Max texts per API call |
|
|
383
|
+
| `embedding_dimensions` | `int \| None` | No | `None` | Override output dimensions. If omitted, SpruceUp reads the actual dimension from the API at startup. |
|
|
384
|
+
|
|
385
|
+
When using an `embed-v4` model with a custom `embedding_dimensions`, the value must be one of `256`, `512`, `1024`, or `1536`.
|
|
386
|
+
|
|
387
|
+
---
|
|
388
|
+
|
|
389
|
+
### `GeminiEmbedder`
|
|
390
|
+
|
|
391
|
+
```python
|
|
392
|
+
GeminiEmbedder(
|
|
393
|
+
api_key="...",
|
|
394
|
+
model="gemini-embedding-001",
|
|
395
|
+
max_batch_size=100,
|
|
396
|
+
)
|
|
397
|
+
```
|
|
398
|
+
|
|
399
|
+
| Parameter | Type | Required | Default | Description |
|
|
400
|
+
| ---------------------- | --------------------------- | -------- | ------------------------ | ---------------------------------------- |
|
|
401
|
+
| `api_key` | `str \| Callable[[], str]` | Yes | — | Google Generative AI API key, or a callable that returns one |
|
|
402
|
+
| `model` | `str` | No | `"gemini-embedding-001"` | Embedding model |
|
|
403
|
+
| `max_batch_size` | `int` | No | `100` | Max texts per API call (hard limit: 100) |
|
|
404
|
+
| `embedding_dimensions` | `int \| None` | No | `None` | Override output dimensions. If omitted, SpruceUp reads the actual dimension from the API at startup. |
|
|
405
|
+
|
|
406
|
+
---
|
|
407
|
+
|
|
408
|
+
### `VoyageAIEmbedder`
|
|
409
|
+
|
|
410
|
+
```python
|
|
411
|
+
VoyageAIEmbedder(
|
|
412
|
+
api_key="...",
|
|
413
|
+
model="voyage-4-large",
|
|
414
|
+
max_batch_size=150,
|
|
415
|
+
embedding_dimensions=None,
|
|
416
|
+
)
|
|
417
|
+
```
|
|
418
|
+
|
|
419
|
+
| Parameter | Type | Required | Default | Description |
|
|
420
|
+
| ---------------------- | --------------------------- | -------- | ------------------ | -------------------------- |
|
|
421
|
+
| `api_key` | `str \| Callable[[], str]` | Yes | — | Voyage AI API key, or a callable that returns one |
|
|
422
|
+
| `model` | `str` | No | `"voyage-4-large"` | Embedding model |
|
|
423
|
+
| `max_batch_size` | `int` | No | `150` | Max texts per API call |
|
|
424
|
+
| `embedding_dimensions` | `int \| None` | No | `None` | Override output dimensions. If omitted, SpruceUp reads the actual dimension from the API at startup. |
|
|
425
|
+
|
|
426
|
+
When using a `voyage-4` model with a custom `embedding_dimensions`, the value must be one of `256`, `512`, `1024`, or `2048`.
|
|
427
|
+
|
|
428
|
+
---
|
|
429
|
+
|
|
430
|
+
## `@memoize`
|
|
431
|
+
|
|
432
|
+
The `memoize` decorator caches the results of expensive helper functions that you call from inside your transform. Results are stored in the SpruceUp manifest (a local SQLite database), are scoped per file, and are invalidated automatically when the decorated function's body changes.
|
|
433
|
+
|
|
434
|
+
```python
|
|
435
|
+
from spruceup import memoize
|
|
436
|
+
import asyncio
|
|
437
|
+
|
|
438
|
+
@memoize(return_type=str)
|
|
439
|
+
async def summarize(text: str) -> str:
|
|
440
|
+
# expensive LLM call
|
|
441
|
+
...
|
|
442
|
+
|
|
443
|
+
async def transform(*, file_props: FileProps, embed) -> list[MyChunk]:
|
|
444
|
+
chunk_strs = split_into_chunks(file_props.raw_content)
|
|
445
|
+
# summarize each chunk concurrently; results are cached per file
|
|
446
|
+
summaries = await asyncio.gather(*[summarize(c) for c in chunk_strs])
|
|
447
|
+
embeddings = await embed(chunk_strs)
|
|
448
|
+
return [
|
|
449
|
+
MyChunk(content=c, summary=s, embedding=e)
|
|
450
|
+
for c, s, e in zip(chunk_strs, summaries, embeddings)
|
|
451
|
+
]
|
|
452
|
+
```
|
|
453
|
+
|
|
454
|
+
| Parameter | Type | Required | Description |
|
|
455
|
+
| ------------- | ------ | -------- | ------------------------------------------------------------ |
|
|
456
|
+
| `return_type` | `type` | Yes | Return type of the decorated function — used for serialization |
|
|
457
|
+
|
|
458
|
+
Supported return types: `str`, `int`, `float`, `bool`, `list`, `dict`.
|
|
459
|
+
|
|
460
|
+
`memoize` only works on `async` functions; decorating a sync function raises a `TypeError`. A memoized function can only be called from inside a transform function: calling it outside of a transform context raises a `RuntimeError`.
|