spruceup-ai 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. spruceup_ai-0.1.0/.gitignore +13 -0
  2. spruceup_ai-0.1.0/CLAUDE.md +139 -0
  3. spruceup_ai-0.1.0/LICENSE +21 -0
  4. spruceup_ai-0.1.0/PKG-INFO +460 -0
  5. spruceup_ai-0.1.0/README.md +414 -0
  6. spruceup_ai-0.1.0/example/data_corpus/DEMO_TEXT.txt +83 -0
  7. spruceup_ai-0.1.0/example/data_corpus/file_0000000.txt +19 -0
  8. spruceup_ai-0.1.0/example/data_corpus/file_0000001.txt +19 -0
  9. spruceup_ai-0.1.0/example/data_corpus/file_0000002.txt +19 -0
  10. spruceup_ai-0.1.0/example/data_corpus/file_0000003.txt +19 -0
  11. spruceup_ai-0.1.0/example/data_corpus/file_0000004.txt +19 -0
  12. spruceup_ai-0.1.0/example/data_corpus/file_0000005.txt +19 -0
  13. spruceup_ai-0.1.0/example/data_corpus/file_0000006.txt +19 -0
  14. spruceup_ai-0.1.0/example/data_corpus/file_0000007.txt +19 -0
  15. spruceup_ai-0.1.0/example/data_corpus/file_0000008.txt +19 -0
  16. spruceup_ai-0.1.0/example/data_corpus/file_0000009.txt +19 -0
  17. spruceup_ai-0.1.0/example/data_corpus/file_0000010.txt +19 -0
  18. spruceup_ai-0.1.0/example/data_corpus/file_0000011.txt +19 -0
  19. spruceup_ai-0.1.0/example/data_corpus/file_0000012.txt +19 -0
  20. spruceup_ai-0.1.0/example/data_corpus/file_0000013.txt +19 -0
  21. spruceup_ai-0.1.0/example/data_corpus/file_0000014.txt +19 -0
  22. spruceup_ai-0.1.0/example/data_corpus/file_0000015.txt +19 -0
  23. spruceup_ai-0.1.0/example/data_corpus/file_0000016.txt +19 -0
  24. spruceup_ai-0.1.0/example/data_corpus/file_0000017.txt +19 -0
  25. spruceup_ai-0.1.0/example/data_corpus/file_0000018.txt +19 -0
  26. spruceup_ai-0.1.0/example/data_corpus/file_0000019.txt +19 -0
  27. spruceup_ai-0.1.0/example/data_corpus/file_0000020.txt +19 -0
  28. spruceup_ai-0.1.0/example/data_corpus/file_0000021.txt +19 -0
  29. spruceup_ai-0.1.0/example/data_corpus/file_0000022.txt +19 -0
  30. spruceup_ai-0.1.0/example/data_corpus/file_0000023.txt +19 -0
  31. spruceup_ai-0.1.0/example/data_corpus/file_0000024.txt +19 -0
  32. spruceup_ai-0.1.0/example/data_corpus/file_0000025.txt +19 -0
  33. spruceup_ai-0.1.0/example/data_corpus/file_0000026.txt +19 -0
  34. spruceup_ai-0.1.0/example/data_corpus/file_0000027.txt +19 -0
  35. spruceup_ai-0.1.0/example/data_corpus/file_0000028.txt +19 -0
  36. spruceup_ai-0.1.0/example/data_corpus/file_0000029.txt +19 -0
  37. spruceup_ai-0.1.0/example/data_corpus/file_0000030.txt +19 -0
  38. spruceup_ai-0.1.0/example/data_corpus/file_0000031.txt +19 -0
  39. spruceup_ai-0.1.0/example/data_corpus/file_0000032.txt +19 -0
  40. spruceup_ai-0.1.0/example/data_corpus/file_0000033.txt +19 -0
  41. spruceup_ai-0.1.0/example/data_corpus/file_0000034.txt +19 -0
  42. spruceup_ai-0.1.0/example/data_corpus/file_0000035.txt +19 -0
  43. spruceup_ai-0.1.0/example/data_corpus/file_0000036.txt +19 -0
  44. spruceup_ai-0.1.0/example/data_corpus/file_0000037.txt +19 -0
  45. spruceup_ai-0.1.0/example/data_corpus/file_0000038.txt +19 -0
  46. spruceup_ai-0.1.0/example/data_corpus/file_0000039.txt +19 -0
  47. spruceup_ai-0.1.0/example/data_corpus/file_0000040.txt +19 -0
  48. spruceup_ai-0.1.0/example/data_corpus/file_0000041.txt +19 -0
  49. spruceup_ai-0.1.0/example/data_corpus/file_0000042.txt +19 -0
  50. spruceup_ai-0.1.0/example/data_corpus/file_0000043.txt +19 -0
  51. spruceup_ai-0.1.0/example/data_corpus/file_0000044.txt +19 -0
  52. spruceup_ai-0.1.0/example/data_corpus/file_0000045.txt +19 -0
  53. spruceup_ai-0.1.0/example/data_corpus/file_0000046.txt +19 -0
  54. spruceup_ai-0.1.0/example/data_corpus/file_0000047.txt +19 -0
  55. spruceup_ai-0.1.0/example/data_corpus/file_0000048.txt +19 -0
  56. spruceup_ai-0.1.0/example/data_corpus/file_0000049.txt +19 -0
  57. spruceup_ai-0.1.0/example/data_corpus/file_0000050.txt +19 -0
  58. spruceup_ai-0.1.0/example/data_corpus/file_0000051.txt +19 -0
  59. spruceup_ai-0.1.0/example/data_corpus/file_0000052.txt +19 -0
  60. spruceup_ai-0.1.0/example/data_corpus/file_0000053.txt +19 -0
  61. spruceup_ai-0.1.0/example/data_corpus/file_0000054.txt +19 -0
  62. spruceup_ai-0.1.0/example/data_corpus/file_0000055.txt +19 -0
  63. spruceup_ai-0.1.0/example/data_corpus/file_0000056.txt +19 -0
  64. spruceup_ai-0.1.0/example/data_corpus/file_0000057.txt +19 -0
  65. spruceup_ai-0.1.0/example/data_corpus/file_0000058.txt +19 -0
  66. spruceup_ai-0.1.0/example/data_corpus/file_0000059.txt +19 -0
  67. spruceup_ai-0.1.0/example/data_corpus/file_0000060.txt +19 -0
  68. spruceup_ai-0.1.0/example/data_corpus/file_0000061.txt +19 -0
  69. spruceup_ai-0.1.0/example/data_corpus/file_0000062.txt +19 -0
  70. spruceup_ai-0.1.0/example/data_corpus/file_0000063.txt +19 -0
  71. spruceup_ai-0.1.0/example/data_corpus/file_0000064.txt +19 -0
  72. spruceup_ai-0.1.0/example/data_corpus/file_0000065.txt +19 -0
  73. spruceup_ai-0.1.0/example/data_corpus/file_0000066.txt +19 -0
  74. spruceup_ai-0.1.0/example/data_corpus/file_0000067.txt +19 -0
  75. spruceup_ai-0.1.0/example/data_corpus/file_0000068.txt +19 -0
  76. spruceup_ai-0.1.0/example/data_corpus/file_0000069.txt +19 -0
  77. spruceup_ai-0.1.0/example/data_corpus/file_0000070.txt +19 -0
  78. spruceup_ai-0.1.0/example/data_corpus/file_0000071.txt +19 -0
  79. spruceup_ai-0.1.0/example/data_corpus/file_0000072.txt +19 -0
  80. spruceup_ai-0.1.0/example/data_corpus/file_0000073.txt +19 -0
  81. spruceup_ai-0.1.0/example/data_corpus/file_0000074.txt +19 -0
  82. spruceup_ai-0.1.0/example/data_corpus/file_0000075.txt +19 -0
  83. spruceup_ai-0.1.0/example/data_corpus/file_0000076.txt +19 -0
  84. spruceup_ai-0.1.0/example/data_corpus/file_0000077.txt +19 -0
  85. spruceup_ai-0.1.0/example/data_corpus/file_0000078.txt +19 -0
  86. spruceup_ai-0.1.0/example/data_corpus/file_0000079.txt +19 -0
  87. spruceup_ai-0.1.0/example/data_corpus/file_0000080.txt +19 -0
  88. spruceup_ai-0.1.0/example/data_corpus/file_0000081.txt +19 -0
  89. spruceup_ai-0.1.0/example/data_corpus/file_0000082.txt +19 -0
  90. spruceup_ai-0.1.0/example/data_corpus/file_0000083.txt +19 -0
  91. spruceup_ai-0.1.0/example/data_corpus/file_0000084.txt +19 -0
  92. spruceup_ai-0.1.0/example/data_corpus/file_0000085.txt +19 -0
  93. spruceup_ai-0.1.0/example/data_corpus/file_0000086.txt +19 -0
  94. spruceup_ai-0.1.0/example/data_corpus/file_0000087.txt +19 -0
  95. spruceup_ai-0.1.0/example/data_corpus/file_0000088.txt +19 -0
  96. spruceup_ai-0.1.0/example/data_corpus/file_0000089.txt +19 -0
  97. spruceup_ai-0.1.0/example/data_corpus/file_0000090.txt +19 -0
  98. spruceup_ai-0.1.0/example/data_corpus/file_0000091.txt +19 -0
  99. spruceup_ai-0.1.0/example/data_corpus/file_0000092.txt +19 -0
  100. spruceup_ai-0.1.0/example/data_corpus/file_0000093.txt +19 -0
  101. spruceup_ai-0.1.0/example/data_corpus/file_0000094.txt +19 -0
  102. spruceup_ai-0.1.0/example/data_corpus/file_0000095.txt +19 -0
  103. spruceup_ai-0.1.0/example/data_corpus/file_0000096.txt +19 -0
  104. spruceup_ai-0.1.0/example/data_corpus/file_0000097.txt +19 -0
  105. spruceup_ai-0.1.0/example/data_corpus/file_0000098.txt +19 -0
  106. spruceup_ai-0.1.0/example/data_corpus/file_0000099.txt +19 -0
  107. spruceup_ai-0.1.0/example/dummy_pipeline.py +183 -0
  108. spruceup_ai-0.1.0/example/second_local_source/DEMO_TEXT.txt +83 -0
  109. spruceup_ai-0.1.0/pyproject.toml +62 -0
  110. spruceup_ai-0.1.0/spruceup_pipeline.py +166 -0
  111. spruceup_ai-0.1.0/src/spruceup/__init__.py +29 -0
  112. spruceup_ai-0.1.0/src/spruceup/__main__.py +3 -0
  113. spruceup_ai-0.1.0/src/spruceup/app.py +201 -0
  114. spruceup_ai-0.1.0/src/spruceup/cli.py +50 -0
  115. spruceup_ai-0.1.0/src/spruceup/config.py +45 -0
  116. spruceup_ai-0.1.0/src/spruceup/connectors/__init__.py +15 -0
  117. spruceup_ai-0.1.0/src/spruceup/connectors/base.py +170 -0
  118. spruceup_ai-0.1.0/src/spruceup/connectors/embedders/__init__.py +6 -0
  119. spruceup_ai-0.1.0/src/spruceup/connectors/embedders/cohere.py +47 -0
  120. spruceup_ai-0.1.0/src/spruceup/connectors/embedders/embedding_batcher.py +148 -0
  121. spruceup_ai-0.1.0/src/spruceup/connectors/embedders/gemini.py +73 -0
  122. spruceup_ai-0.1.0/src/spruceup/connectors/embedders/openai.py +50 -0
  123. spruceup_ai-0.1.0/src/spruceup/connectors/embedders/voyageai.py +44 -0
  124. spruceup_ai-0.1.0/src/spruceup/connectors/sources/__init__.py +4 -0
  125. spruceup_ai-0.1.0/src/spruceup/connectors/sources/google_drive.py +181 -0
  126. spruceup_ai-0.1.0/src/spruceup/connectors/sources/local.py +80 -0
  127. spruceup_ai-0.1.0/src/spruceup/connectors/targets/__init__.py +5 -0
  128. spruceup_ai-0.1.0/src/spruceup/connectors/targets/pgvector.py +128 -0
  129. spruceup_ai-0.1.0/src/spruceup/connectors/targets/pinecone.py +81 -0
  130. spruceup_ai-0.1.0/src/spruceup/connectors/targets/weaviate.py +143 -0
  131. spruceup_ai-0.1.0/src/spruceup/coordinator.py +156 -0
  132. spruceup_ai-0.1.0/src/spruceup/debounce_queue.py +24 -0
  133. spruceup_ai-0.1.0/src/spruceup/manifest.py +516 -0
  134. spruceup_ai-0.1.0/src/spruceup/memoize/__init__.py +3 -0
  135. spruceup_ai-0.1.0/src/spruceup/memoize/decorator.py +63 -0
  136. spruceup_ai-0.1.0/src/spruceup/memoize/decorator_utility.py +26 -0
  137. spruceup_ai-0.1.0/src/spruceup/models.py +32 -0
  138. spruceup_ai-0.1.0/src/spruceup/monitoring/__init__.py +0 -0
  139. spruceup_ai-0.1.0/src/spruceup/monitoring/google_drive_watcher.py +246 -0
  140. spruceup_ai-0.1.0/src/spruceup/monitoring/local_file_watcher.py +185 -0
  141. spruceup_ai-0.1.0/src/spruceup/monitoring/monitor.py +114 -0
  142. spruceup_ai-0.1.0/src/spruceup/sync_engine/__init__.py +8 -0
  143. spruceup_ai-0.1.0/src/spruceup/sync_engine/sync_engine.py +60 -0
  144. spruceup_ai-0.1.0/src/spruceup/sync_sweeper.py +53 -0
  145. spruceup_ai-0.1.0/src/spruceup/transform_context.py +33 -0
  146. spruceup_ai-0.1.0/src/spruceup/utils/__init__.py +9 -0
  147. spruceup_ai-0.1.0/src/spruceup/utils/hashing.py +59 -0
  148. spruceup_ai-0.1.0/src/spruceup/utils/schema.py +23 -0
  149. spruceup_ai-0.1.0/src/spruceup/utils/validation.py +33 -0
  150. spruceup_ai-0.1.0/tests/conftest.py +12 -0
  151. spruceup_ai-0.1.0/tests/fakes.py +123 -0
  152. spruceup_ai-0.1.0/tests/loadtest/bench_embed.py +183 -0
  153. spruceup_ai-0.1.0/tests/loadtest/gen_corpus.py +68 -0
  154. spruceup_ai-0.1.0/tests/loadtest/reset.py +43 -0
  155. spruceup_ai-0.1.0/tests/loadtest/run_ingest.py +616 -0
  156. spruceup_ai-0.1.0/tests/loadtest/stubs.py +77 -0
  157. spruceup_ai-0.1.0/tests/test_coordinator.py +131 -0
  158. spruceup_ai-0.1.0/tests/test_embedders.py +44 -0
  159. spruceup_ai-0.1.0/tests/test_hashing.py +21 -0
  160. spruceup_ai-0.1.0/tests/test_memoize.py +75 -0
  161. spruceup_ai-0.1.0/tests/test_sync_engine.py +97 -0
  162. spruceup_ai-0.1.0/tests/test_sync_sweeper.py +69 -0
@@ -0,0 +1,13 @@
1
+ dist/
2
+ **/.env
3
+ **/*.pyc
4
+ **/.venv
5
+ .pytest_cache/
6
+ pyrightconfig.json
7
+ __pycache__
8
+ .claude
9
+ poetry.lock
10
+ uv.lock
11
+ Notes.md
12
+ tenacity_test.py
13
+ spruceup_manifest.db
@@ -0,0 +1,139 @@
1
+ # CLAUDE.md
2
+
3
+ This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
4
+
5
+ ## Commands
6
+
7
+ ```bash
8
+ # Install dependencies (requires Python >=3.12,<3.15)
9
+ uv sync
10
+
11
+ # Run the app (must be run from the directory containing spruceup_pipeline.py)
12
+ uv run spruceup start
13
+
14
+ # Run all tests
15
+ uv run pytest
16
+
17
+ # Run a single test file
18
+ uv run pytest tests/test_sync_engine.py
19
+
20
+ # Run a specific test
21
+ uv run pytest tests/test_sync_engine.py::test_reconcile_new_file
22
+ ```
23
+
24
+ Required env vars for the pipeline: `PG_CONNSTR`, and an embedder API key (e.g. `OPENAI_API_KEY`). Copy credentials into `.env`; `spruceup_pipeline.py` calls `dotenv.load_dotenv()` at import time.
25
+
26
+ ## Architecture
27
+
28
+ SpruceUp is a document ingestion daemon. It watches source connectors for file changes, transforms documents into chunks, embeds them, and keeps a target vector store in sync.
29
+
30
+ ### Pipeline file (`spruceup_pipeline.py`)
31
+
32
+ The user-authored entry point. The CLI (`spruceup start`) imports it dynamically from the current working directory. It must define a module-level `config` variable whose value is the object returned by `defineConfig()`:
33
+
34
+ ```python
35
+ config = defineConfig(
36
+ sources=[LocalFilesSource(watched_dir="example/data_corpus")],
37
+ target=PgVectorTarget(connstr=..., table="data_chunks", schema=LectureChunk, vector_column="chunk_embedding"),
38
+ embedder=OpenAIEmbedder(api_key=..., model="text-embedding-3-small"),
39
+ transform=build_lecture_chunks, # async fn(*, file_props: FileProps, embed) -> list[schema]
40
+ cache_files=False, # optional; True caches raw file content in the manifest (default False)
41
+ )
42
+ ```
43
+
44
+ `defineConfig()` validates types eagerly at import time. `validate_pipeline()` in `cli.py` checks that this contract is met before starting the event loop.
45
+
46
+ ### Runtime flow (`app.py`)
47
+
48
+ On startup, `app.run(pipeline)` compares the fingerprints persisted in the Manifest against the current config. A mismatch in any of the following signals triggers a **full reindex** (all files re-fetched, re-transformed, re-upserted) instead of an incremental sync:
49
+ 1. Transform function body changed (source hash)
50
+ 2. Any `@memoize`-decorated function changed
51
+ 3. Embedding model changed
52
+ 4. Embedding dimensions changed
53
+ 5. Target identity changed — `target.identity()`, a credential-free string (host/db/table or index/collection)
54
+ 6. Schema changed — `hash_schema()` over field names+types and the designated `vector_column`
55
+
56
+ Signals 3–4 (embedding model or embedding dimensions changed) additionally **flush the embedding cache** (`embeddings_invalidated`). Signals 4–6 (embedding dimensions, target identity, or schema changed) are **structural** and additionally **drop + recreate** the target table/index before reingest (`ensure_table_exists(recreate=True)`) — an approach chosen over in-place migration because a reingest must re-embed everything anyway.
57
+
58
+ On any mismatch, every file row is marked `needs_reindex` and the new fingerprints are persisted immediately (mark first, persist second — a crash in between just re-marks on the next start). A file stays `needs_reindex` until a sync **succeeds**; failures and restarts don't clear it. `SyncEngine.reconcile` pushes **all** chunks of a `needs_reindex` file instead of the diff (config changes don't alter chunk hashes, so the diff can't see them). `needs_reindex` files are re-enqueued at every startup and retried by the sync sweeper, so an interrupted reindex resumes where it left off.
59
+
60
+ Then it launches three concurrent asyncio tasks:
61
+
62
+ | Task | Role |
63
+ |------|------|
64
+ | `Monitor` | Runs all watchers; each watcher does a catch-up scan then enters a watch loop |
65
+ | `Coordinator` | Dequeues `SyncTask` objects and processes them (up to 32 concurrent) |
66
+ | `SyncSweeper` | Retries `failed` and `needs_reindex` files every 60 seconds |
67
+
68
+ ### File change lifecycle
69
+
70
+ ```
71
+ Source watcher → DebounceQueue → Coordinator
72
+
73
+ source.fetch() → SpruceFile
74
+
75
+ transform(file_props, embed) → list[UserChunk]
76
+
77
+ SyncEngine.reconcile() → chunk diff → target.sync()
78
+
79
+ Manifest.set_sync_state("synced")
80
+ ```
81
+
82
+ `DebounceQueue` (wraps `asyncio.Queue`) evicts any already-queued task for the same `file_id` when a newer task arrives, preventing redundant processing. **Tradeoff:** to evict the superseded task it reaches into `asyncio.Queue` internals (`_queue`, `_unfinished_tasks`), so it's coupled to the CPython queue implementation and could break on a stdlib change.
83
+
84
+ ### Manifest (`manifest.py`)
85
+
86
+ A local SQLite database (`spruceup_manifest.db`) that is the source of truth for:
87
+ - Registered data sources and their state (e.g. Google Drive page tokens)
88
+ - File rows: content hash, raw content (only when `cache_files=True`), sync state (`needs_reindex` / `in_flight` / `synced` / `failed`)
89
+ - Chunk rows: `(file_id, user_chunk_object_hash)` pairs for diffing
90
+ - Memoize cache: `(file_id, fn_hash, args_hash) → result`
91
+ - Embedding cache: `(file_id, chunk_text_hash) → embedding bytes`
92
+ - Config state: `embedding_model`, `embedding_dimensions`, `target_identity`, `schema_fingerprint`
93
+
94
+ Opened with `autocommit=True`; use `manifest.transaction()` only when multiple writes must be atomic.
95
+
96
+ ### Connector ABCs (`connectors/base.py`)
97
+
98
+ All connectors implement one of three ABCs:
99
+
100
+ - **`SourceConnector`** — `source_type`, `source_identifier`, `create_watcher()`, `fetch()`, `validate()`, `is_supported()`, `decode_content()`
101
+ - **`TargetConnector`** — `vector_column`, `identity()`, `ensure_table_exists(recreate=False)`, `sync(upserts, deletes)`, `aclose()`
102
+ - **`EmbedderConnector`** — `embed_batch(batch)`, `process_chunks(chunks)`, `aclose()`
103
+
104
+ Available implementations:
105
+
106
+ | Type | Implementations |
107
+ |------|----------------|
108
+ | Source | `LocalFilesSource`, `GoogleDriveSource` |
109
+ | Target | `PgVectorTarget`, `PineconeTarget`, `WeaviateTarget` |
110
+ | Embedder | `OpenAIEmbedder`, `CohereEmbedder`, `GeminiEmbedder`, `VoyageAIEmbedder` |
111
+
112
+ `LocalFilesSource` and `LocalFileWatcher` exist for local testing. Production reasoning should be framed in terms of the connector ABCs.
113
+
114
+ An embedder's `api_key` accepts a `str` or a `Callable[[], str]` (e.g. a secrets-manager fetch, resolved at client build). On a credential rejection the embedder raises `TokenExpiredError`; the base `embed_batch_retrying` then drops the cached client so the next retry rebuilds it with a re-resolved token. Static-string keys are left untouched (no point re-resolving). Auth-error detection is per-SDK (each `embed_batch` catches its provider's exception and normalizes to `TokenExpiredError`).
115
+
116
+ ### EmbeddingBatcher (`connectors/embedders/embedding_batcher.py`)
117
+
118
+ Wraps any `EmbedderConnector`. Accumulates chunks from concurrent file transforms and flushes them as batched API calls (max 100ms wait or `max_batch_size` chunks, max 5 concurrent API calls). Also consults the Manifest embedding cache before calling the API — cache is scoped per `file_id` and keyed by `blake2b(chunk_text)`.
119
+
120
+ Each accumulated chunk gets its own `asyncio.Future`; a per-call `asyncio.gather` over those futures reassembles a caller's embeddings in order and waits for completion, so the batcher carries no per-file slot bookkeeping (a flushed batch may mix chunks from several callers).
121
+
122
+ ### `@memoize` decorator (`memoize/decorator.py`)
123
+
124
+ Caches async subfunctions in the Manifest, scoped per file. The decorated function must be `async` — decorating a sync function raises `TypeError` at import time. Cached results are invalidated when the function body changes. `@memoize` is valid **only** when called from within the `transform` function — it reads the `contextvars` set by `Coordinator.upsert_file()` via `transform_scope`.
125
+
126
+ ```python
127
+ @memoize(return_type=str)
128
+ async def summarize(text: str) -> str: ...
129
+ ```
130
+
131
+ Supported return types: `str`, `int`, `float`, `bool`, `list`, `dict`.
132
+
133
+ ### PgVectorTarget schema mapping
134
+
135
+ `ensure_table_exists()` inspects the user dataclass with `typing.get_type_hints()` and maps Python types to Postgres types. The embedding column is named **explicitly** via the target's `vector_column=` (validated at construction to be a `list[float]` field) and becomes `vector(N)` (requires `pgvector` extension), where `N` is the embedder's `embedding_dimensions`. Any *other* `list[float]` field maps to a plain `DOUBLE PRECISION[]` array, not a vector. The `id` column is always `TEXT PRIMARY KEY`, set to `f"{file_id}:{chunk.user_chunk_object_hash.hex()}"` (keyed per file). Upserts use `ON CONFLICT (id) DO UPDATE` so re-embeds (e.g. after a model change) overwrite existing rows.
136
+
137
+ ### Google Drive source
138
+
139
+ `GoogleDriveSource` takes a `watched_dir` (folder ID) and an `on_token_expired: Callable[[], str]` that returns a fresh OAuth access token. The `drive.readonly` scope covers all required API calls (list, download, export, changes). Startup validation rejects nested watched folders.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 SpruceUp Contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,460 @@
1
+ Metadata-Version: 2.4
2
+ Name: spruceup-ai
3
+ Version: 0.1.0
4
+ Summary: A standalone system for making automated, incremental updates to a vector database.
5
+ Project-URL: Repository, https://github.com/SpruceUp-ai/SpruceUp
6
+ Author-email: Ekerin Agboola <ekerin.m.a@gmail.com>, Eric Cho <ekc7590@gmail.com>, AJ Fuhler <aart.fuhler@gmail.com>, Caleb Pickard <caleb.pickard@gmail.com>
7
+ License: MIT License
8
+
9
+ Copyright (c) 2026 SpruceUp Contributors
10
+
11
+ Permission is hereby granted, free of charge, to any person obtaining a copy
12
+ of this software and associated documentation files (the "Software"), to deal
13
+ in the Software without restriction, including without limitation the rights
14
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15
+ copies of the Software, and to permit persons to whom the Software is
16
+ furnished to do so, subject to the following conditions:
17
+
18
+ The above copyright notice and this permission notice shall be included in all
19
+ copies or substantial portions of the Software.
20
+
21
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
24
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27
+ SOFTWARE.
28
+ License-File: LICENSE
29
+ Classifier: License :: OSI Approved :: MIT License
30
+ Classifier: Operating System :: OS Independent
31
+ Classifier: Programming Language :: Python :: 3
32
+ Requires-Python: <3.15,>=3.12
33
+ Requires-Dist: cohere<8.0.0,>=7.0.0
34
+ Requires-Dist: google-api-python-client<3.0.0,>=2.197.0
35
+ Requires-Dist: google-auth<3.0.0,>=2.53.0
36
+ Requires-Dist: google-genai<3.0.0,>=2.6.0
37
+ Requires-Dist: openai<3.0.0,>=2.37.0
38
+ Requires-Dist: pinecone>=3.0.0
39
+ Requires-Dist: psycopg-pool<4.0.0,>=3.0.0
40
+ Requires-Dist: psycopg<4.0.0,>=3.3.4
41
+ Requires-Dist: tenacity<10.0.0,>=9.1.4
42
+ Requires-Dist: voyageai<0.5.0,>=0.4.0
43
+ Requires-Dist: watchfiles>=1.0.0
44
+ Requires-Dist: weaviate-client<5.0.0,>=4.0.0
45
+ Description-Content-Type: text/markdown
46
+
47
+ ## SpruceUp
48
+
49
+ **SpruceUp** is a standalone system for making automated, incremental updates to a vector database.
50
+
51
+ ## Installation
52
+
53
+ Add SpruceUp to your project (e.g., with `poetry`, `pip`, or `uv`):
54
+
55
+ ```bash
56
+ poetry add spruceup-ai # OR
57
+ pip install spruceup-ai # OR
58
+ uv add spruceup-ai
59
+ ```
60
+
61
+ ---
62
+
63
+ ## Setup
64
+
65
+ Create a file named `spruceup_pipeline.py` in your project directory. This is the user-authored entry point SpruceUp loads at startup. It must export a single `config` variable built with `defineConfig()`.
66
+
67
+ ```python
68
+ # spruceup_pipeline.py
69
+ import re
70
+ import os
71
+ from dataclasses import dataclass
72
+ from spruceup import defineConfig, FileProps, LocalFilesSource, PgVectorTarget, OpenAIEmbedder
73
+
74
+ @dataclass
75
+ class ArticleChunk:
76
+ title: str
77
+ content: str
78
+ embedding: list[float]
79
+
80
+ def split_into_paragraphs(text: str) -> list[str]:
81
+ return [p.strip() for p in re.split(r"\n{2,}", text) if p.strip()]
82
+
83
+ async def transform(*, file_props: FileProps, embed) -> list[ArticleChunk]:
84
+ paragraphs = split_into_paragraphs(file_props.raw_content)
85
+ embeddings = await embed(paragraphs)
86
+ return [
87
+ ArticleChunk(title=file_props.display_name, content=para, embedding=emb)
88
+ for para, emb in zip(paragraphs, embeddings)
89
+ ]
90
+
91
+ config = defineConfig(
92
+ sources=[LocalFilesSource(watched_dir="./articles")],
93
+ target=PgVectorTarget(
94
+ connstr=os.environ["PG_CONNSTR"],
95
+ table="article_chunks",
96
+ schema=ArticleChunk,
97
+ vector_column="embedding",
98
+ ),
99
+ embedder=OpenAIEmbedder(api_key=os.environ["OPENAI_API_KEY"]),
100
+ transform=transform,
101
+ )
102
+ ```
103
+
104
+ ---
105
+
106
+ ## Running SpruceUp
107
+
108
+ From the directory containing your `spruceup_pipeline.py` file:
109
+
110
+ ```bash
111
+ spruceup start
112
+ ```
113
+
114
+ SpruceUp will scan your sources, sync any files not yet in the manifest, then enter a watch loop for incremental updates.
115
+
116
+ ---
117
+
118
+ ## Imports
119
+
120
+ Everything you need is importable from the top-level `spruceup` package:
121
+
122
+ ```python
123
+ from spruceup import (
124
+ defineConfig,
125
+ FileProps,
126
+
127
+ # Sources
128
+ LocalFilesSource,
129
+ GoogleDriveSource,
130
+
131
+ # Targets
132
+ PgVectorTarget,
133
+ PineconeTarget,
134
+ WeaviateTarget,
135
+
136
+ # Embedders
137
+ OpenAIEmbedder,
138
+ CohereEmbedder,
139
+ GeminiEmbedder,
140
+ VoyageAIEmbedder,
141
+
142
+ # Utilities
143
+ memoize,
144
+ )
145
+ ```
146
+
147
+ ---
148
+
149
+ ## `defineConfig()`
150
+
151
+ ```python
152
+ config = defineConfig(
153
+ sources=[...],
154
+ target=...,
155
+ embedder=...,
156
+ transform=...,
157
+ )
158
+ ```
159
+
160
+ All parameters are keyword-only.
161
+
162
+ | Parameter | Type | Required | Default | Description |
163
+ | ------------- | ----------------------- | -------- | ------- | ------------------------------------- |
164
+ | `sources` | `list[SourceConnector]` | Yes | — | At least one source connector |
165
+ | `target` | `TargetConnector` | Yes | — | Where synced chunks are written |
166
+ | `embedder` | `EmbedderConnector` | Yes | — | Generates embeddings for your chunks |
167
+ | `transform` | `async callable` | Yes | — | Converts a file into a list of chunks |
168
+ | `cache_files` | `bool` | No | `False` | Cache raw file bytes in the manifest |
169
+
170
+ ---
171
+
172
+ ## The Transform Function
173
+
174
+ The transform function is where you split, enrich, and embed your documents. SpruceUp calls it for every file that changes. This function **must** be async.
175
+
176
+ ```python
177
+ async def transform(*, file_props: FileProps, embed) -> list[YourSchema]:
178
+ ...
179
+ ```
180
+
181
+ ### `FileProps`
182
+
183
+ | Field | Type | Description |
184
+ | -------------- | -------------- | ------------------------------------------------------------ |
185
+ | `raw_content` | `str \| bytes` | File content. Text formats are decoded as UTF-8; binary formats like PDF are passed through as raw `bytes`. |
186
+ | `display_name` | `str` | The filename |
187
+ | `file_type` | `str` | File extension (e.g. `"txt"`, `"pdf"`) |
188
+
189
+ ### `embed`
190
+
191
+ `embed` is an async callable that takes a list of strings and returns a list of embedding vectors:
192
+
193
+ ```python
194
+ embeddings: list[list[float]] = await embed(["chunk one", "chunk two"])
195
+ ```
196
+
197
+ ### Chunk Schema
198
+
199
+ Your transform returns a list of instances of a user-defined dataclass. SpruceUp uses this schema for diffing and for writing to the target store. Define it as a plain dataclass:
200
+
201
+ ```python
202
+ @dataclass
203
+ class MyChunk:
204
+ title: str
205
+ text: str
206
+ embedding: list[float]
207
+ ```
208
+
209
+ All target connectors support `str`, `int`, `float`, `bool`, and `list[float]` as field types. Use `list[float]` for your embedding vector. You do not need to define an `id` field. SpruceUp generates one from each chunk's content hash.
210
+
211
+ ---
212
+
213
+ ## Source Connectors
214
+
215
+ ### `LocalFilesSource`
216
+
217
+ Watches a local directory for file changes.
218
+
219
+ ```python
220
+ LocalFilesSource(watched_dir="./data")
221
+ ```
222
+
223
+ | Parameter | Type | Required | Default | Description |
224
+ | ------------- | ----- | -------- | ------- | ------------------------------ |
225
+ | `watched_dir` | `str` | Yes | — | Path to the directory to watch |
226
+
227
+ ---
228
+
229
+ ### `GoogleDriveSource`
230
+
231
+ Watches a Google Drive folder for file changes. Requires the `drive.readonly` OAuth scope.
232
+
233
+ ```python
234
+ GoogleDriveSource(
235
+ watched_dir="<folder-id>",
236
+ on_token_expired=get_access_token,
237
+ recursive=True,
238
+ )
239
+ ```
240
+
241
+ | Parameter | Type | Required | Default | Description |
242
+ | ------------------ | ------------------- | -------- | ------- | ------------------------------------------------------------ |
243
+ | `watched_dir` | `str` | Yes | — | Google Drive folder ID |
244
+ | `on_token_expired` | `Callable[[], str]` | Yes | — | Called when the access token expires; must return a fresh token string |
245
+ | `recursive` | `bool` | No | `True` | Whether to watch subfolders |
246
+
247
+ The `on_token_expired` callback is invoked whenever the connector needs a new OAuth token. It should return a valid access token or raise an exception.
248
+
249
+ ---
250
+
251
+ ## Target Connectors
252
+
253
+ ### `PgVectorTarget`
254
+
255
+ Syncs chunks to a PostgreSQL table using the `pgvector` extension.
256
+
257
+ ```python
258
+ PgVectorTarget(
259
+ connstr="postgresql://user:pass@localhost/mydb",
260
+ table="my_chunks",
261
+ schema=MyChunk,
262
+ vector_column="embedding",
263
+ )
264
+ ```
265
+
266
+ | Parameter | Type | Required | Default | Description |
267
+ | --------------- | ------ | -------- | ------- | ---------------------------------------------- |
268
+ | `connstr` | `str` | Yes | — | PostgreSQL connection string |
269
+ | `table` | `str` | Yes | — | Table name |
270
+ | `schema` | `type` | Yes | — | Your chunk dataclass |
271
+ | `vector_column` | `str` | Yes | — | Field name on your schema that holds the vector |
272
+
273
+ SpruceUp creates the table and its columns automatically based on your schema's type hints. The `pgvector` extension must be installed on your database.
274
+
275
+ ---
276
+
277
+ ### `PineconeTarget`
278
+
279
+ Syncs chunks to a Pinecone index.
280
+
281
+ ```python
282
+ PineconeTarget(
283
+ api_key="pc-...",
284
+ index_name="my-index",
285
+ schema=MyChunk,
286
+ vector_column="embedding",
287
+ namespace="",
288
+ metric="cosine",
289
+ cloud="aws",
290
+ region="us-east-1",
291
+ )
292
+ ```
293
+
294
+ | Parameter | Type | Required | Default | Description |
295
+ | --------------- | ------------- | -------- | ------------- | ----------------------------------------------------------- |
296
+ | `api_key` | `str \| None` | Yes | — | Pinecone API key |
297
+ | `index_name` | `str` | Yes | — | Name of the Pinecone index |
298
+ | `schema` | `type` | Yes | — | Your chunk dataclass |
299
+ | `vector_column` | `str` | Yes | — | Field name on your schema that holds the vector |
300
+ | `namespace` | `str` | No | `""` | Namespace within the index |
301
+ | `metric` | `str` | No | `"cosine"` | Distance metric (`"cosine"`, `"euclidean"`, `"dotproduct"`) |
302
+ | `cloud` | `str` | No | `"aws"` | Cloud provider |
303
+ | `region` | `str` | No | `"us-east-1"` | Cloud region |
304
+
305
+ ---
306
+
307
+ ### `WeaviateTarget`
308
+
309
+ Syncs chunks to a Weaviate collection.
310
+
311
+ ```python
312
+ # Local instance
313
+ WeaviateTarget(
314
+ collection_name="MyChunks",
315
+ schema=MyChunk,
316
+ vector_column="embedding",
317
+ url="http://localhost:8080",
318
+ )
319
+
320
+ # Weaviate Cloud
321
+ WeaviateTarget(
322
+ collection_name="MyChunks",
323
+ schema=MyChunk,
324
+ vector_column="embedding",
325
+ cluster_url="https://my-cluster.weaviate.network",
326
+ api_key="wvp-...",
327
+ )
328
+ ```
329
+
330
+ | Parameter | Type | Required | Default | Description |
331
+ | ----------------- | ------------- | -------- | ------------------------- | ----------------------------------------- |
332
+ | `collection_name` | `str` | Yes | — | Weaviate collection name |
333
+ | `schema` | `type` | Yes | — | Your chunk dataclass |
334
+ | `vector_column` | `str` | Yes | — | Field name on your schema that holds the vector |
335
+ | `url` | `str` | No | `"http://localhost:8080"` | URL for a local Weaviate instance |
336
+ | `cluster_url` | `str \| None` | No | `None` | URL for a Weaviate Cloud cluster |
337
+ | `api_key` | `str \| None` | No | `None` | API key for Weaviate Cloud authentication |
338
+
339
+ Use either `url` for a local instance or `cluster_url` + `api_key` for a cloud deployment.
340
+
341
+ ---
342
+
343
+ ## Embedder Connectors
344
+
345
+ SpruceUp runs a health check at startup that embeds a test string and reads the actual output size from the API. The `embedding_dimensions` parameter is optional on all embedders. If it is omitted, the dimension is detected automatically. If it is provided, SpruceUp validates that it matches what the API actually returns and raises an error if it does not.
346
+
347
+ ### `OpenAIEmbedder`
348
+
349
+ ```python
350
+ OpenAIEmbedder(
351
+ api_key="sk-...",
352
+ model="text-embedding-3-small",
353
+ max_batch_size=150,
354
+ embedding_dimensions=None,
355
+ )
356
+ ```
357
+
358
+ | Parameter | Type | Required | Default | Description |
359
+ | ---------------------- | --------------------------- | -------- | -------------------------- | -------------------------- |
360
+ | `api_key` | `str \| Callable[[], str]` | Yes | — | OpenAI API key, or a callable that returns one |
361
+ | `model` | `str` | No | `"text-embedding-3-small"` | Embedding model |
362
+ | `max_batch_size` | `int` | No | `150` | Max texts per API call |
363
+ | `embedding_dimensions` | `int \| None` | No | `None` | Override output dimensions. If omitted, SpruceUp reads the actual dimension from the API at startup. |
364
+
365
+ ---
366
+
367
+ ### `CohereEmbedder`
368
+
369
+ ```python
370
+ CohereEmbedder(
371
+ api_key="...",
372
+ model="embed-v4.0",
373
+ max_batch_size=96,
374
+ embedding_dimensions=None,
375
+ )
376
+ ```
377
+
378
+ | Parameter | Type | Required | Default | Description |
379
+ | ---------------------- | --------------------------- | -------- | -------------- | -------------------------- |
380
+ | `api_key` | `str \| Callable[[], str]` | Yes | — | Cohere API key, or a callable that returns one |
381
+ | `model` | `str` | No | `"embed-v4.0"` | Embedding model |
382
+ | `max_batch_size` | `int` | No | `96` | Max texts per API call |
383
+ | `embedding_dimensions` | `int \| None` | No | `None` | Override output dimensions. If omitted, SpruceUp reads the actual dimension from the API at startup. |
384
+
385
+ When using an `embed-v4` model with a custom `embedding_dimensions`, the value must be one of `256`, `512`, `1024`, or `1536`.
386
+
387
+ ---
388
+
389
+ ### `GeminiEmbedder`
390
+
391
+ ```python
392
+ GeminiEmbedder(
393
+ api_key="...",
394
+ model="gemini-embedding-001",
395
+ max_batch_size=100,
396
+ )
397
+ ```
398
+
399
+ | Parameter | Type | Required | Default | Description |
400
+ | ---------------------- | --------------------------- | -------- | ------------------------ | ---------------------------------------- |
401
+ | `api_key` | `str \| Callable[[], str]` | Yes | — | Google Generative AI API key, or a callable that returns one |
402
+ | `model` | `str` | No | `"gemini-embedding-001"` | Embedding model |
403
+ | `max_batch_size` | `int` | No | `100` | Max texts per API call (hard limit: 100) |
404
+ | `embedding_dimensions` | `int \| None` | No | `None` | Override output dimensions. If omitted, SpruceUp reads the actual dimension from the API at startup. |
405
+
406
+ ---
407
+
408
+ ### `VoyageAIEmbedder`
409
+
410
+ ```python
411
+ VoyageAIEmbedder(
412
+ api_key="...",
413
+ model="voyage-4-large",
414
+ max_batch_size=150,
415
+ embedding_dimensions=None,
416
+ )
417
+ ```
418
+
419
+ | Parameter | Type | Required | Default | Description |
420
+ | ---------------------- | --------------------------- | -------- | ------------------ | -------------------------- |
421
+ | `api_key` | `str \| Callable[[], str]` | Yes | — | Voyage AI API key, or a callable that returns one |
422
+ | `model` | `str` | No | `"voyage-4-large"` | Embedding model |
423
+ | `max_batch_size` | `int` | No | `150` | Max texts per API call |
424
+ | `embedding_dimensions` | `int \| None` | No | `None` | Override output dimensions. If omitted, SpruceUp reads the actual dimension from the API at startup. |
425
+
426
+ When using a `voyage-4` model with a custom `embedding_dimensions`, the value must be one of `256`, `512`, `1024`, or `2048`.
427
+
428
+ ---
429
+
430
+ ## `@memoize`
431
+
432
+ The `memoize` decorator caches the results of expensive subfunctions inside your transform. Results are stored in the SpruceUp manifest (a local SQLite database), are scoped per file, and are invalidated automatically when the decorated function's body changes.
433
+
434
+ ```python
435
+ from spruceup import memoize
436
+ import asyncio
437
+
438
+ @memoize(return_type=str)
439
+ async def summarize(text: str) -> str:
440
+ # expensive LLM call
441
+ ...
442
+
443
+ async def transform(*, file_props: FileProps, embed) -> list[MyChunk]:
444
+ chunk_strs = split_into_chunks(file_props.raw_content)
445
+ # summarize each chunk concurrently; results are cached per file
446
+ summaries = await asyncio.gather(*[summarize(c) for c in chunk_strs])
447
+ embeddings = await embed(chunk_strs)
448
+ return [
449
+ MyChunk(content=c, summary=s, embedding=e)
450
+ for c, s, e in zip(chunk_strs, summaries, embeddings)
451
+ ]
452
+ ```
453
+
454
+ | Parameter | Type | Required | Description |
455
+ | ------------- | ------ | -------- | ------------------------------------------------------------ |
456
+ | `return_type` | `type` | Yes | Return type of the decorated function — used for serialization |
457
+
458
+ Supported return types: `str`, `int`, `float`, `bool`, `list`, `dict`.
459
+
460
+ `memoize` only works on `async` functions; decorating a sync function raises a `TypeError`. A memoized function can only be called from inside a transform function; calling it outside of a transform context raises a `RuntimeError`.