vectormeta 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vectormeta-0.1.0/LICENSE +21 -0
- vectormeta-0.1.0/PKG-INFO +378 -0
- vectormeta-0.1.0/README.md +343 -0
- vectormeta-0.1.0/pyproject.toml +84 -0
- vectormeta-0.1.0/setup.cfg +4 -0
- vectormeta-0.1.0/tests/test_analyzer.py +25 -0
- vectormeta-0.1.0/tests/test_cli.py +144 -0
- vectormeta-0.1.0/tests/test_fixer.py +156 -0
- vectormeta-0.1.0/tests/test_hydrate.py +105 -0
- vectormeta-0.1.0/tests/test_io.py +39 -0
- vectormeta-0.1.0/tests/test_sizing.py +27 -0
- vectormeta-0.1.0/vectormeta/__init__.py +3 -0
- vectormeta-0.1.0/vectormeta/__main__.py +8 -0
- vectormeta-0.1.0/vectormeta/analyzer.py +51 -0
- vectormeta-0.1.0/vectormeta/cli.py +287 -0
- vectormeta-0.1.0/vectormeta/config.py +43 -0
- vectormeta-0.1.0/vectormeta/errors.py +23 -0
- vectormeta-0.1.0/vectormeta/fixer.py +224 -0
- vectormeta-0.1.0/vectormeta/hydrate.py +122 -0
- vectormeta-0.1.0/vectormeta/io.py +139 -0
- vectormeta-0.1.0/vectormeta/limits.py +94 -0
- vectormeta-0.1.0/vectormeta/models.py +143 -0
- vectormeta-0.1.0/vectormeta/py.typed +1 -0
- vectormeta-0.1.0/vectormeta/reporting.py +134 -0
- vectormeta-0.1.0/vectormeta/sizing.py +34 -0
- vectormeta-0.1.0/vectormeta.egg-info/PKG-INFO +378 -0
- vectormeta-0.1.0/vectormeta.egg-info/SOURCES.txt +29 -0
- vectormeta-0.1.0/vectormeta.egg-info/dependency_links.txt +1 -0
- vectormeta-0.1.0/vectormeta.egg-info/entry_points.txt +2 -0
- vectormeta-0.1.0/vectormeta.egg-info/requires.txt +11 -0
- vectormeta-0.1.0/vectormeta.egg-info/top_level.txt +1 -0
vectormeta-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Achal Jain
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,378 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: vectormeta
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Detect and fix oversized vector database metadata before upsert.
|
|
5
|
+
Author: Achal Jain
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/Achal13jain/vectormeta
|
|
8
|
+
Project-URL: Repository, https://github.com/Achal13jain/vectormeta
|
|
9
|
+
Project-URL: Issues, https://github.com/Achal13jain/vectormeta/issues
|
|
10
|
+
Keywords: vector-database,metadata,pinecone,chroma,qdrant,weaviate,cli
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Environment :: Console
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Database
|
|
19
|
+
Classifier: Topic :: Software Development :: Quality Assurance
|
|
20
|
+
Classifier: Typing :: Typed
|
|
21
|
+
Requires-Python: >=3.10
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
License-File: LICENSE
|
|
24
|
+
Requires-Dist: pydantic>=2.6
|
|
25
|
+
Requires-Dist: PyYAML>=6.0
|
|
26
|
+
Requires-Dist: rich>=13.7
|
|
27
|
+
Requires-Dist: typer>=0.12
|
|
28
|
+
Provides-Extra: dev
|
|
29
|
+
Requires-Dist: build>=1.2; extra == "dev"
|
|
30
|
+
Requires-Dist: mypy>=1.8; extra == "dev"
|
|
31
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
32
|
+
Requires-Dist: ruff>=0.4; extra == "dev"
|
|
33
|
+
Requires-Dist: types-PyYAML>=6.0; extra == "dev"
|
|
34
|
+
Dynamic: license-file
|
|
35
|
+
|
|
36
|
+
# vectormeta
|
|
37
|
+
|
|
38
|
+
[CI workflow](https://github.com/Achal13jain/vectormeta/actions/workflows/ci.yml)
|
|
39
|
+
[Pages workflow](https://github.com/Achal13jain/vectormeta/actions/workflows/pages.yml)
|
|
40
|
+
|
|
41
|
+
Stop vector DB metadata limit errors before upsert.
|
|
42
|
+
|
|
43
|
+
Website: <https://achal13jain.github.io/vectormeta/>
|
|
44
|
+
|
|
45
|
+
`vectormeta` is a Python CLI package for detecting and fixing oversized metadata in
|
|
46
|
+
vector database records. It scans JSON or JSONL vector records, reports the largest
|
|
47
|
+
metadata fields, and can move heavy content fields into local JSON sidecar files while
|
|
48
|
+
leaving clean filterable metadata in the vector database payload.
|
|
49
|
+
|
|
50
|
+
The project is designed for developers preparing records for Pinecone, Chroma, Qdrant,
|
|
51
|
+
Weaviate, or a custom metadata policy. Pinecone is the clearest strict-limit target in
|
|
52
|
+
the MVP. Other targets use conservative advisory limits that should be adjusted for each
|
|
53
|
+
deployment.
|
|
54
|
+
|
|
55
|
+
## Why This Exists
|
|
56
|
+
|
|
57
|
+
Vector database metadata should usually stay small and filterable:
|
|
58
|
+
|
|
59
|
+
- `source`
|
|
60
|
+
- `page`
|
|
61
|
+
- `section`
|
|
62
|
+
- `doc_id`
|
|
63
|
+
- `chunk_id`
|
|
64
|
+
- `tags`
|
|
65
|
+
- `language`
|
|
66
|
+
|
|
67
|
+
Large payloads such as full chunk text, raw HTML, Markdown, OCR text, summaries, tables,
|
|
68
|
+
or full documents can push records over service metadata limits and make upserts fail.
|
|
69
|
+
`vectormeta` catches that problem before upload and can rewrite records into a safer
|
|
70
|
+
shape:
|
|
71
|
+
|
|
72
|
+
```text
|
|
73
|
+
vector record metadata -> small filterable fields + content_ref
|
|
74
|
+
sidecar JSON file -> large text, HTML, tables, summaries, payloads
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
## Features
|
|
78
|
+
|
|
79
|
+
- Scan JSON arrays and newline-delimited JSON records.
|
|
80
|
+
- Measure metadata using compact UTF-8 JSON bytes.
|
|
81
|
+
- Report oversized records, largest fields, byte counts, KB counts, and suggested moves.
|
|
82
|
+
- Exit with code `1` when oversized records are found, which makes scans useful in CI.
|
|
83
|
+
- Move heavy metadata fields into sidecar JSON files.
|
|
84
|
+
- Preserve unknown record fields and original record order.
|
|
85
|
+
- Sanitize sidecar filenames derived from record IDs.
|
|
86
|
+
- Protect output files and sidecars from accidental overwrite.
|
|
87
|
+
- Hydrate records back from sidecar references for debugging and migrations.
|
|
88
|
+
- Keep core logic independent from Typer and Rich so it can be tested and reused.
|
|
89
|
+
|
|
90
|
+
## Tech Stack
|
|
91
|
+
|
|
92
|
+
- Python 3.10+
|
|
93
|
+
- Typer for the CLI
|
|
94
|
+
- Rich for human-readable terminal reports
|
|
95
|
+
- Pydantic for YAML config validation
|
|
96
|
+
- PyYAML for config loading
|
|
97
|
+
- Pytest for tests
|
|
98
|
+
- Ruff for linting and formatting
|
|
99
|
+
- Mypy for strict type checks
|
|
100
|
+
- Setuptools and `python -m build` for packaging
|
|
101
|
+
|
|
102
|
+
## Installation
|
|
103
|
+
|
|
104
|
+
Clone and install locally:
|
|
105
|
+
|
|
106
|
+
```bash
|
|
107
|
+
git clone https://github.com/Achal13jain/vectormeta.git
|
|
108
|
+
cd vectormeta
|
|
109
|
+
pip install -e ".[dev]"
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
Check the CLI:
|
|
113
|
+
|
|
114
|
+
```bash
|
|
115
|
+
vectormeta --help
|
|
116
|
+
vectormeta --version
|
|
117
|
+
python -m vectormeta --help
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
To install the published package from PyPI, run:
|
|
121
|
+
|
|
122
|
+
```bash
|
|
123
|
+
pip install vectormeta
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
## Input Format
|
|
127
|
+
|
|
128
|
+
JSON array:
|
|
129
|
+
|
|
130
|
+
```json
|
|
131
|
+
[
|
|
132
|
+
{
|
|
133
|
+
"id": "doc_1_chunk_1",
|
|
134
|
+
"values": [0.1, 0.2, 0.3],
|
|
135
|
+
"metadata": {
|
|
136
|
+
"source": "paper.pdf",
|
|
137
|
+
"page": 1,
|
|
138
|
+
"chunk_text": "large text..."
|
|
139
|
+
}
|
|
140
|
+
}
|
|
141
|
+
]
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
JSONL:
|
|
145
|
+
|
|
146
|
+
```jsonl
|
|
147
|
+
{"id":"doc_1","values":[0.1],"metadata":{"text":"large text..."}}
|
|
148
|
+
{"id":"doc_2","values":[0.2],"metadata":{"text":"large text..."}}
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
Each record must contain:
|
|
152
|
+
|
|
153
|
+
- `id` or `_id`
|
|
154
|
+
- `metadata` as a JSON object
|
|
155
|
+
|
|
156
|
+
Vector fields such as `values`, `vector`, or `embedding` are preserved but not deeply
|
|
157
|
+
validated by the MVP.
|
|
158
|
+
|
|
159
|
+
## Quickstart
|
|
160
|
+
|
|
161
|
+
Scan the included oversized Pinecone example:
|
|
162
|
+
|
|
163
|
+
```bash
|
|
164
|
+
vectormeta scan examples/oversized_pinecone_records.json --target pinecone --no-fail
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
Fix the records:
|
|
168
|
+
|
|
169
|
+
```bash
|
|
170
|
+
vectormeta fix examples/oversized_pinecone_records.json \
|
|
171
|
+
--target pinecone \
|
|
172
|
+
--sidecar examples/sidecar \
|
|
173
|
+
--out examples/pinecone_ready.json \
|
|
174
|
+
--overwrite
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
Verify the cleaned file now fits the Pinecone-sized policy:
|
|
178
|
+
|
|
179
|
+
```bash
|
|
180
|
+
vectormeta scan examples/pinecone_ready.json --target pinecone --no-fail
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
Hydrate records for local inspection:
|
|
184
|
+
|
|
185
|
+
```bash
|
|
186
|
+
vectormeta hydrate examples/pinecone_ready.json \
|
|
187
|
+
--sidecar examples/sidecar \
|
|
188
|
+
--out examples/hydrated.json \
|
|
189
|
+
--overwrite
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
## Commands
|
|
193
|
+
|
|
194
|
+
### Scan
|
|
195
|
+
|
|
196
|
+
```bash
|
|
197
|
+
vectormeta scan chunks.json --target pinecone
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
Useful options:
|
|
201
|
+
|
|
202
|
+
- `--target pinecone|chroma|qdrant|weaviate|custom`
|
|
203
|
+
- `--limit-kb <number>` for custom or overridden limits
|
|
204
|
+
- `--top <number>` to set how many of the largest oversized records to show
|
|
205
|
+
- `--format table|json`
|
|
206
|
+
- `--no-fail` to exit `0` even when oversized records are found
|
|
207
|
+
|
|
208
|
+
Exit codes:
|
|
209
|
+
|
|
210
|
+
- `0`: all records fit, or `--no-fail` was passed
|
|
211
|
+
- `1`: oversized records were found
|
|
212
|
+
- `2`: expected user-facing input, config, target, or overwrite error
|
|
213
|
+
|
|
214
|
+
### Fix
|
|
215
|
+
|
|
216
|
+
```bash
|
|
217
|
+
vectormeta fix chunks.json --target pinecone --sidecar ./sidecar --out pinecone_ready.json
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
Move explicit fields:
|
|
221
|
+
|
|
222
|
+
```bash
|
|
223
|
+
vectormeta fix chunks.json \
|
|
224
|
+
--target pinecone \
|
|
225
|
+
--move-fields chunk_text,raw_html,summary \
|
|
226
|
+
--keep-fields source,page,section,doc_id,chunk_id \
|
|
227
|
+
--content-ref-field content_ref \
|
|
228
|
+
--sidecar ./sidecar \
|
|
229
|
+
--out pinecone_ready.json
|
|
230
|
+
```
|
|
231
|
+
|
|
232
|
+
Preview without writing:
|
|
233
|
+
|
|
234
|
+
```bash
|
|
235
|
+
vectormeta fix chunks.json --target pinecone --sidecar ./sidecar --out ready.json --dry-run
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
`fix` does not overwrite files unless `--overwrite` is passed.
|
|
239
|
+
|
|
240
|
+
If your input metadata already contains `content_ref`, choose another reference field:
|
|
241
|
+
|
|
242
|
+
```bash
|
|
243
|
+
vectormeta fix chunks.json \
|
|
244
|
+
--target pinecone \
|
|
245
|
+
--content-ref-field vectormeta_content_ref \
|
|
246
|
+
--sidecar ./sidecar \
|
|
247
|
+
--out pinecone_ready.json
|
|
248
|
+
```
|
|
249
|
+
|
|
250
|
+
### Hydrate
|
|
251
|
+
|
|
252
|
+
```bash
|
|
253
|
+
vectormeta hydrate pinecone_ready.json --sidecar ./sidecar --out hydrated.json
|
|
254
|
+
```
|
|
255
|
+
|
|
256
|
+
Hydrate sidecar content into a separate record field:
|
|
257
|
+
|
|
258
|
+
```bash
|
|
259
|
+
vectormeta hydrate pinecone_ready.json \
|
|
260
|
+
--sidecar ./sidecar \
|
|
261
|
+
--mode content_field \
|
|
262
|
+
--content-field payload \
|
|
263
|
+
--out hydrated.json
|
|
264
|
+
```
|
|
265
|
+
|
|
266
|
+
### Limits
|
|
267
|
+
|
|
268
|
+
```bash
|
|
269
|
+
vectormeta limits
|
|
270
|
+
```
|
|
271
|
+
|
|
272
|
+
Current MVP defaults:
|
|
273
|
+
|
|
274
|
+
| Target | Default | Meaning |
|
|
275
|
+
| --- | ---: | --- |
|
|
276
|
+
| `pinecone` | 40 KB | Primary strict-limit target for this MVP |
|
|
277
|
+
| `chroma` | 256 KB | Advisory local/configurable policy |
|
|
278
|
+
| `qdrant` | 64 KB | Conservative advisory policy |
|
|
279
|
+
| `weaviate` | 64 KB | Conservative advisory policy |
|
|
280
|
+
| `custom` | none | Requires `--limit-kb` |
|
|
281
|
+
|
|
282
|
+
Limits and provider behavior can change. Verify official vector database documentation
|
|
283
|
+
before treating any preset as a production guarantee.
|
|
284
|
+
|
|
285
|
+
## How Metadata Reduction Works
|
|
286
|
+
|
|
287
|
+
`vectormeta` measures metadata size as the byte length of its compact UTF-8 JSON encoding:
|
|
288
|
+
|
|
289
|
+
```python
|
|
290
|
+
json.dumps(metadata, ensure_ascii=False, separators=(",", ":")).encode("utf-8")
|
|
291
|
+
```
|
|
292
|
+
|
|
293
|
+
The fixer reduces metadata in this order:
|
|
294
|
+
|
|
295
|
+
1. Move explicit `--move-fields`, if provided.
|
|
296
|
+
2. Otherwise move known heavy fields such as `text`, `chunk_text`, `raw_html`,
|
|
297
|
+
`markdown`, `summary`, `tables`, and `ocr_text`.
|
|
298
|
+
3. If metadata is still above the limit, move the largest non-keep fields one at a
|
|
299
|
+
time until the record fits.
|
|
300
|
+
4. Fields on the keep list, such as `source`, `page`, `doc_id`, and `tags`, are preserved unless the
|
|
301
|
+
record cannot fit without moving them.
|
|
302
|
+
5. When fields are moved, metadata receives a `content_ref`, and moved fields are
|
|
303
|
+
written to a sidecar JSON payload.
|
|
304
|
+
|
|
305
|
+
The logic is covered by tests for Unicode byte sizing, nested metadata sizing,
|
|
306
|
+
JSON/JSONL input, fixer output, sidecar overwrite protection, hydration, and CLI exit
|
|
307
|
+
codes. See [docs/metadata-reduction.md](docs/metadata-reduction.md).
|
|
308
|
+
|
|
309
|
+
## Local Verification
|
|
310
|
+
|
|
311
|
+
Run the same checks used in CI:
|
|
312
|
+
|
|
313
|
+
```bash
|
|
314
|
+
python -m pytest
|
|
315
|
+
ruff check .
|
|
316
|
+
ruff format --check .
|
|
317
|
+
mypy vectormeta
|
|
318
|
+
python -m build
|
|
319
|
+
```
|
|
320
|
+
|
|
321
|
+
Run the acceptance workflow:
|
|
322
|
+
|
|
323
|
+
```bash
|
|
324
|
+
vectormeta scan examples/oversized_pinecone_records.json --target pinecone --no-fail
|
|
325
|
+
vectormeta fix examples/oversized_pinecone_records.json --target pinecone --sidecar examples/sidecar --out examples/pinecone_ready.json --overwrite
|
|
326
|
+
vectormeta scan examples/pinecone_ready.json --target pinecone --no-fail
|
|
327
|
+
vectormeta hydrate examples/pinecone_ready.json --sidecar examples/sidecar --out examples/hydrated.json --overwrite
|
|
328
|
+
```
|
|
329
|
+
|
|
330
|
+
Expected result:
|
|
331
|
+
|
|
332
|
+
- The original example reports one oversized record.
|
|
333
|
+
- The fixed output reports zero oversized records.
|
|
334
|
+
- Sidecar files are created under `examples/sidecar`.
|
|
335
|
+
- Hydration restores moved fields for inspection.
|
|
336
|
+
|
|
337
|
+
## Documentation
|
|
338
|
+
|
|
339
|
+
- [Project website](https://achal13jain.github.io/vectormeta/)
|
|
340
|
+
- [Architecture overview](docs/architecture.md)
|
|
341
|
+
- [Metadata reduction logic](docs/metadata-reduction.md)
|
|
342
|
+
- [Usage guide](docs/usage.md)
|
|
343
|
+
- [Testing checklist](docs/testing.md)
|
|
344
|
+
- [Vector database notes](docs/vector-db-notes.md)
|
|
345
|
+
|
|
346
|
+
## Limitations
|
|
347
|
+
|
|
348
|
+
- Local JSON sidecars only. Keep the cleaned output file and sidecar directory together;
|
|
349
|
+
the MVP does not provide an atomic database-backed sidecar store.
|
|
350
|
+
- Sidecars are one file per changed record. The MVP does not deduplicate repeated fields
|
|
351
|
+
such as shared `raw_html` across chunks from the same document.
|
|
352
|
+
- Supported inputs are JSON arrays and JSONL records, but files are currently read fully into
|
|
353
|
+
memory. Streaming JSONL scan/fix is planned for larger embedding datasets.
|
|
354
|
+
- Vector values are preserved but not deeply validated.
|
|
355
|
+
- Provider-specific metadata schemas and value types are not fully validated. For
|
|
356
|
+
example, Pinecone has metadata format rules beyond byte size.
|
|
357
|
+
- Non-Pinecone target limits are conservative advisory defaults, not vendor claims.
|
|
358
|
+
- The fixer is policy-based; review cleaned outputs before production ingestion.
|
|
359
|
+
|
|
360
|
+
## Roadmap
|
|
361
|
+
|
|
362
|
+
Planned ideas include:
|
|
363
|
+
|
|
364
|
+
- SQLite sidecar backend
|
|
365
|
+
- Content-addressed sidecar deduplication
|
|
366
|
+
- Streaming JSONL scan/fix
|
|
367
|
+
- S3 sidecar backend
|
|
368
|
+
- LangChain `Document` adapter
|
|
369
|
+
- LlamaIndex `Node` adapter
|
|
370
|
+
- Pinecone upsert wrapper
|
|
371
|
+
- GitHub Action for metadata checks
|
|
372
|
+
- HTML report output
|
|
373
|
+
|
|
374
|
+
See [ROADMAP.md](ROADMAP.md).
|
|
375
|
+
|
|
376
|
+
## License
|
|
377
|
+
|
|
378
|
+
MIT. See [LICENSE](LICENSE).
|