poolin 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
poolin-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Wenxi Wang
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
poolin-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,72 @@
1
+ Metadata-Version: 2.4
2
+ Name: poolin
3
+ Version: 0.1.0
4
+ Summary: A local UI package for pooling existing embedding zips into grouped vectors
5
+ Author: Wenxi Wang
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://example.com/poolin
8
+ Project-URL: Repository, https://example.com/poolin
9
+ Keywords: pooling,embedding,rag,vectors,streamlit,ui
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: Operating System :: OS Independent
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.9
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
19
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
20
+ Requires-Python: >=3.9
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Requires-Dist: streamlit>=1.32
24
+ Requires-Dist: numpy>=1.24
25
+ Dynamic: license-file
26
+
27
+ # poolin
28
+
29
+ A local UI package for pooling existing embedding vectors from an embedding zip into grouped higher-level vectors.
30
+
31
+ ## Important note
32
+
33
+ Standard sentence-transformer style pooling usually happens **inside the embedding model** when token embeddings are converted into one sentence embedding. This package does **post-embedding vector pooling** over already-created chunk embeddings.
34
+
35
+ ## What it does
36
+
37
+ - launches with the `poolin` command
38
+ - reads an embedding zip such as `RAG_chunks_recursive_chunks_embeddings.zip`
39
+ - auto-groups related chunk embeddings by filename pattern like `RAG_chunk_001_rcs_001.md -> RAG_chunk_001`
40
+ - pools vectors with one of these methods:
41
+ - `auto`
42
+ - `mean`
43
+ - `max`
44
+ - `weighted_char_mean`
45
+ - `weighted_word_mean`
46
+ - `mean_sqrt_len`
47
+ - exports a zip with:
48
+ - `pooling_summary.json`
49
+ - `pooling_manifest.csv`
50
+ - `*_pooled_embeddings.jsonl` (optional)
51
+ - `*_pooled_embeddings.csv` (optional)
52
+ - `*_pooled_embeddings.npz` (optional)
53
+
54
+ ## Install
55
+
56
+ ```bash
57
+ pip install poolin
58
+ ```
59
+
60
+ ## Run
61
+
62
+ ```bash
63
+ poolin
64
+ ```
65
+
66
+ ## Suggested input
67
+
68
+ Use a zip produced by your embedding step, containing an embeddings `.npz` or `.jsonl` payload plus the summary file.
69
+
70
+ ## Ownership note
71
+
72
+ The package metadata and copyright notice are set to Wenxi Wang. You should still verify PyPI package-name availability, trademark questions, and any legal or patent issues yourself before publishing.
poolin-0.1.0/README.md ADDED
@@ -0,0 +1,46 @@
1
+ # poolin
2
+
3
+ A local UI package for pooling existing embedding vectors from an embedding zip into grouped higher-level vectors.
4
+
5
+ ## Important note
6
+
7
+ Standard sentence-transformer style pooling usually happens **inside the embedding model** when token embeddings are converted into one sentence embedding. This package does **post-embedding vector pooling** over already-created chunk embeddings.
8
+
9
+ ## What it does
10
+
11
+ - launches with the `poolin` command
12
+ - reads an embedding zip such as `RAG_chunks_recursive_chunks_embeddings.zip`
13
+ - auto-groups related chunk embeddings by filename pattern like `RAG_chunk_001_rcs_001.md -> RAG_chunk_001`
14
+ - pools vectors with one of these methods:
15
+ - `auto`
16
+ - `mean`
17
+ - `max`
18
+ - `weighted_char_mean`
19
+ - `weighted_word_mean`
20
+ - `mean_sqrt_len`
21
+ - exports a zip with:
22
+ - `pooling_summary.json`
23
+ - `pooling_manifest.csv`
24
+ - `*_pooled_embeddings.jsonl` (optional)
25
+ - `*_pooled_embeddings.csv` (optional)
26
+ - `*_pooled_embeddings.npz` (optional)
27
+
28
+ ## Install
29
+
30
+ ```bash
31
+ pip install poolin
32
+ ```
33
+
34
+ ## Run
35
+
36
+ ```bash
37
+ poolin
38
+ ```
39
+
40
+ ## Suggested input
41
+
42
+ Use a zip produced by your embedding step, containing an embeddings `.npz` or `.jsonl` payload plus the summary file.
43
+
44
+ ## Ownership note
45
+
46
+ The package metadata and copyright notice are set to Wenxi Wang. You should still verify PyPI package-name availability, trademark questions, and any legal or patent issues yourself before publishing.
@@ -0,0 +1,2 @@
1
+ __all__ = ["__version__"]
2
+ __version__ = "0.1.0"
@@ -0,0 +1,94 @@
1
+ from __future__ import annotations
2
+
3
+ import streamlit as st
4
+
5
+ from poolin.core import (
6
+ build_output_zip_bytes,
7
+ load_embedding_zip_bytes,
8
+ pool_embedding_records,
9
+ supported_pooling_methods,
10
+ )
11
+
12
+ st.set_page_config(
13
+ page_title="poolin",
14
+ page_icon="🧩",
15
+ layout="wide",
16
+ )
17
+
18
+ st.title("poolin")
19
+ st.caption(
20
+ "Pool existing embedding vectors from an embedding zip into grouped higher-level vectors with a local UI."
21
+ )
22
+
23
+ with st.sidebar:
24
+ st.header("Settings")
25
+ method_options = supported_pooling_methods()
26
+ labels = [f"{name} — {desc}" for name, desc in method_options]
27
+ selected_label = st.selectbox("Pooling method", labels, index=0)
28
+ pooling_method = method_options[labels.index(selected_label)][0]
29
+
30
+ normalize_output = st.checkbox("Normalize pooled embeddings", value=True)
31
+ formats = st.multiselect(
32
+ "Output formats inside export zip",
33
+ options=["jsonl", "csv", "npz"],
34
+ default=["jsonl", "npz"],
35
+ help="jsonl = readable records, csv = spreadsheet-friendly, npz = direct NumPy arrays.",
36
+ )
37
+
38
+ uploaded = st.file_uploader("Drag the embeddings zip here", type=["zip"])
39
+
40
+ if uploaded is None:
41
+ st.info(
42
+ "Upload a zip produced by your embedding step. It should contain an embeddings .npz or .jsonl payload."
43
+ )
44
+ st.stop()
45
+
46
+ if not formats:
47
+ st.warning("Select at least one export format.")
48
+ st.stop()
49
+
50
+ with st.expander("What this step actually does", expanded=False):
51
+ st.write(
52
+ "This app performs post-embedding vector pooling across related chunk embeddings. "
53
+ "It does not replace the token-pooling layer that originally produced each sentence embedding inside the embedding model."
54
+ )
55
+
56
+ if st.button("Generate pooled vectors", type="primary"):
57
+ with st.spinner("Reading embedding zip and pooling vectors..."):
58
+ try:
59
+ records, input_summary = load_embedding_zip_bytes(uploaded.read())
60
+ manifest, items, summary = pool_embedding_records(
61
+ records,
62
+ pooling_method=pooling_method,
63
+ normalize_output=normalize_output,
64
+ )
65
+ export_bytes = build_output_zip_bytes(
66
+ manifest,
67
+ items,
68
+ summary,
69
+ input_zip_name=uploaded.name,
70
+ formats=formats,
71
+ )
72
+ except Exception as exc:
73
+ st.error(str(exc))
74
+ st.stop()
75
+
76
+ col1, col2 = st.columns([1, 2])
77
+ with col1:
78
+ st.metric("Input embeddings", summary["input_embeddings"])
79
+ st.metric("Pooled outputs", summary["pooled_outputs"])
80
+ st.metric("Embedding dimension", summary["output_embedding_dim"])
81
+ st.metric("Pooling method", summary["pooling_method"])
82
+
83
+ with col2:
84
+ st.subheader("Input model")
85
+ st.write(input_summary.get("model_name", summary.get("source_model_name", "unknown")))
86
+ st.subheader("Preview")
87
+ st.dataframe(manifest[:10], use_container_width=True)
88
+
89
+ st.download_button(
90
+ "Download poolin zip",
91
+ data=export_bytes,
92
+ file_name=f"{uploaded.name.rsplit('.', 1)[0]}_pooled.zip",
93
+ mime="application/zip",
94
+ )
@@ -0,0 +1,17 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ import sys
5
+
6
+
7
def main() -> None:
    """Console-script entry point: launch the bundled Streamlit app.

    Raises SystemExit with a friendly message when Streamlit is not
    importable, otherwise hands control to the Streamlit CLI and exits
    with its return code.
    """
    try:
        from streamlit.web import cli as stcli
    except Exception as exc:  # pragma: no cover
        raise SystemExit(
            "Streamlit is required to launch the UI. Install package dependencies first."
        ) from exc

    # Run the app.py that sits next to this module, as if invoked via
    # `streamlit run .../app.py`.
    app_script = Path(__file__).with_name("app.py")
    sys.argv = ["streamlit", "run", str(app_script)]
    raise SystemExit(stcli.main())
@@ -0,0 +1,406 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from io import BytesIO, StringIO
5
+ import csv
6
+ import json
7
+ from pathlib import Path
8
+ import re
9
+ from typing import Iterable
10
+ import zipfile
11
+
12
+ import numpy as np
13
+
14
# (method key, human-readable label) pairs offered to the UI, in display order.
POOLING_METHODS: list[tuple[str, str]] = [
    ("auto", "Auto (recommended)"),
    ("mean", "Mean pooling across chunk embeddings"),
    ("max", "Elementwise max pooling"),
    ("weighted_char_mean", "Weighted mean using character counts"),
    ("weighted_word_mean", "Weighted mean using word counts"),
    ("mean_sqrt_len", "Mean scaled by sqrt(number of chunks)"),
]

# Summary filenames recognised anywhere inside an input zip (matched on basename).
SUMMARY_FILES = {"pooling_summary.json", "embedding_summary.json"}
# File suffixes identifying the embedding payload inside the input zip.
JSONL_SUFFIX = ".jsonl"
NPZ_SUFFIX = ".npz"
26
+
27
+
28
@dataclass
class EmbeddedRecord:
    """One already-computed chunk embedding loaded from the input zip."""

    item_id: str            # stable identifier of the chunk
    source_file: str        # originating filename; drives group-key derivation
    text: str               # chunk text
    char_count: int         # character count; used by weighted_char_mean
    word_count: int         # whitespace-split word count; used by weighted_word_mean
    embedding_dim: int      # dimensionality of `embedding`
    model_name: str         # model name reported by the input summary
    embedding: np.ndarray   # float32 vector


@dataclass
class PooledRecord:
    """One pooled output vector aggregated from a group of EmbeddedRecords."""

    pooled_id: str              # sequential id, e.g. "pool_001"
    group_key: str              # shared filename-derived grouping key
    source_files: list[str]     # member source filenames, input order
    member_count: int           # number of pooled members
    total_characters: int       # sum of member char counts
    total_words: int            # sum of member word counts
    embedding_dim: int          # dimensionality of the pooled vector
    source_model_name: str      # model name inherited from the inputs
    pooling_method: str         # concrete (resolved) pooling method used
    embedding: np.ndarray       # float32 pooled vector


class PoolingError(RuntimeError):
    """Raised for any recoverable problem while loading or pooling embeddings."""
56
+
57
+
58
def supported_pooling_methods() -> list[tuple[str, str]]:
    """Return a fresh shallow copy of the (key, label) pooling-method options."""
    return list(POOLING_METHODS)
60
+
61
+
62
def infer_default_pooling(model_name: str, normalized: bool | None = None) -> str:
    """Pick a default post-embedding pooling method.

    This is not token pooling inside the original embedding model. It aggregates
    already-produced embedding vectors into higher-level grouped vectors.

    Args:
        model_name: Name of the source embedding model. Currently unused by the
            heuristic; kept in the signature for future model-specific rules.
        normalized: True when the input vectors look L2-normalized, False when
            they do not, None when unknown.

    Returns:
        "mean" when inputs appear normalized (plain averaging preserves the
        unit-sphere geometry), otherwise "weighted_char_mean" so longer chunks
        contribute proportionally more.
    """
    # NOTE: the previous version computed `model_name.lower()` and discarded
    # the result; the dead statement has been removed.
    if normalized is True:
        return "mean"
    return "weighted_char_mean"
72
+
73
+
74
+ def _safe_stem(filename: str) -> str:
75
+ return re.sub(r"[^A-Za-z0-9._-]+", "_", Path(filename).stem).strip("_") or "pooling_output"
76
+
77
+
78
+ def _group_key_from_filename(source_file: str) -> str:
79
+ stem = Path(source_file).stem
80
+ patterns = [
81
+ r"^(?P<base>.+?)_rcs_\d+$",
82
+ r"^(?P<base>.+?)_chunk_\d+$",
83
+ r"^(?P<base>.+?)_part_\d+$",
84
+ r"^(?P<base>.+?)_split_\d+$",
85
+ ]
86
+ for pat in patterns:
87
+ m = re.match(pat, stem)
88
+ if m:
89
+ return m.group("base")
90
+ return stem
91
+
92
+
93
def _load_summary_from_zip(zf: zipfile.ZipFile) -> dict:
    """Return the first recognised summary JSON in the archive, or {} if absent.

    Matching is on basename only, so the summary may live in any subdirectory.
    """
    for member in zf.namelist():
        if Path(member).name in SUMMARY_FILES:
            return json.loads(zf.read(member).decode("utf-8"))
    return {}
98
+
99
+
100
def _load_records_from_npz(npz_bytes: bytes, summary: dict) -> list[EmbeddedRecord]:
    """Parse an embeddings .npz payload into EmbeddedRecord objects.

    Expects parallel arrays "ids", "source_files", "texts" and a 2-D
    "embeddings" array whose rows align with the other arrays.

    Args:
        npz_bytes: Raw bytes of the .npz archive.
        summary: Input summary dict; only "model_name" is consulted.

    Raises:
        PoolingError: when required arrays are missing, the embeddings array
            is not 2-D, or the parallel arrays have mismatched lengths.
    """
    data = np.load(BytesIO(npz_bytes), allow_pickle=True)
    if not {"ids", "source_files", "texts", "embeddings"}.issubset(set(data.files)):
        raise PoolingError("The NPZ file does not contain the expected embedding arrays.")

    ids = data["ids"]
    source_files = data["source_files"]
    texts = data["texts"]
    embeddings = data["embeddings"]

    # Validate up front so malformed payloads fail with a clear PoolingError
    # instead of an opaque IndexError at embeddings.shape[1] below.
    if embeddings.ndim != 2:
        raise PoolingError("The NPZ 'embeddings' array must be 2-dimensional.")
    if not (len(ids) == len(source_files) == len(texts) == len(embeddings)):
        raise PoolingError("The NPZ arrays have mismatched lengths.")

    model_name = str(summary.get("model_name", "unknown"))
    records: list[EmbeddedRecord] = []
    for i in range(len(ids)):
        text = str(texts[i])
        records.append(
            EmbeddedRecord(
                item_id=str(ids[i]),
                source_file=str(source_files[i]),
                text=text,
                char_count=len(text),
                word_count=len(text.split()),
                embedding_dim=int(embeddings.shape[1]),
                model_name=model_name,
                embedding=np.asarray(embeddings[i], dtype=np.float32),
            )
        )
    return records
127
+
128
+
129
def _load_records_from_jsonl(jsonl_text: str) -> list[EmbeddedRecord]:
    """Parse a .jsonl embeddings payload (one JSON object per line).

    Blank lines are skipped; missing fields fall back to neutral defaults
    ("" / 0 / "unknown" / empty vector).
    """
    records: list[EmbeddedRecord] = []
    for raw_line in jsonl_text.splitlines():
        stripped = raw_line.strip()
        if not stripped:
            continue
        obj = json.loads(stripped)
        records.append(
            EmbeddedRecord(
                item_id=str(obj.get("id", "")),
                source_file=str(obj.get("source_file", "")),
                text=str(obj.get("text", "")),
                char_count=int(obj.get("char_count", 0)),
                word_count=int(obj.get("word_count", 0)),
                embedding_dim=int(obj.get("embedding_dim", 0)),
                model_name=str(obj.get("model_name", "unknown")),
                embedding=np.asarray(obj.get("embedding", []), dtype=np.float32),
            )
        )
    return records
149
+
150
+
151
def load_embedding_zip_bytes(zip_bytes: bytes) -> tuple[list[EmbeddedRecord], dict]:
    """Load embedding records and the input summary dict from a zip payload.

    Prefers an .npz payload; falls back to a .jsonl payload. Raises
    PoolingError when neither is present or neither yields any records.
    """
    with zipfile.ZipFile(BytesIO(zip_bytes), "r") as zf:
        summary = _load_summary_from_zip(zf)
        members = zf.namelist()

        npz_name = next((n for n in members if n.lower().endswith(NPZ_SUFFIX)), None)
        if npz_name is not None:
            records = _load_records_from_npz(zf.read(npz_name), summary)
            if records:
                return records, summary

        jsonl_name = next((n for n in members if n.lower().endswith(JSONL_SUFFIX)), None)
        if jsonl_name is not None:
            records = _load_records_from_jsonl(zf.read(jsonl_name).decode("utf-8"))
            if records:
                return records, summary

    raise PoolingError(
        "Could not find a supported embedding payload inside the zip. Expected an embeddings .npz or .jsonl file."
    )
170
+
171
+
172
+ def _l2_normalize(vec: np.ndarray) -> np.ndarray:
173
+ norm = float(np.linalg.norm(vec))
174
+ if norm <= 0:
175
+ return vec
176
+ return vec / norm
177
+
178
+
179
+ def _pool_vectors(vectors: np.ndarray, weights: np.ndarray | None, method: str) -> np.ndarray:
180
+ if vectors.ndim != 2:
181
+ raise PoolingError("Expected a 2D embedding array for pooling.")
182
+ if len(vectors) == 1:
183
+ return vectors[0].copy()
184
+
185
+ if method == "mean":
186
+ return vectors.mean(axis=0)
187
+ if method == "max":
188
+ return vectors.max(axis=0)
189
+ if method == "weighted_char_mean" or method == "weighted_word_mean":
190
+ if weights is None:
191
+ return vectors.mean(axis=0)
192
+ weights = np.asarray(weights, dtype=np.float32)
193
+ if weights.sum() <= 0:
194
+ return vectors.mean(axis=0)
195
+ weights = weights / weights.sum()
196
+ return np.average(vectors, axis=0, weights=weights)
197
+ if method == "mean_sqrt_len":
198
+ return vectors.mean(axis=0) / np.sqrt(float(len(vectors)))
199
+ raise PoolingError(f"Unsupported pooling method: {method}")
200
+
201
+
202
def pool_embedding_records(
    records: list[EmbeddedRecord],
    pooling_method: str = "auto",
    normalize_output: bool = True,
) -> tuple[list[dict], list[PooledRecord], dict]:
    """Group embedding records by filename-derived key and pool each group.

    Args:
        records: Input chunk embeddings (must be non-empty).
        pooling_method: A concrete method name, or "auto" to resolve one via
            infer_default_pooling.
        normalize_output: L2-normalize each pooled vector when True.

    Returns:
        (manifest rows, pooled records, run summary), with groups emitted in
        sorted group-key order.

    Raises:
        PoolingError: when `records` is empty (and, indirectly, for an
            unsupported concrete pooling method).
    """
    if not records:
        raise PoolingError("No embedding records were found in the input zip.")

    first = records[0]
    source_model_name = first.model_name
    input_dim = first.embedding_dim

    # Heuristic: a first vector with norm close to 1 suggests the inputs are
    # already L2-normalized; None when the first vector is empty.
    inferred_normalized = None
    if first.embedding.size:
        inferred_normalized = bool(abs(float(np.linalg.norm(first.embedding)) - 1.0) < 0.05)

    resolved_method = pooling_method
    if pooling_method == "auto":
        resolved_method = infer_default_pooling(source_model_name, inferred_normalized)

    # Bucket records by their derived group key, preserving input order within
    # each bucket.
    grouped: dict[str, list[EmbeddedRecord]] = {}
    for rec in records:
        grouped.setdefault(_group_key_from_filename(rec.source_file), []).append(rec)

    manifest: list[dict] = []
    pooled_records: list[PooledRecord] = []
    for idx, key in enumerate(sorted(grouped), start=1):
        members = grouped[key]
        vectors = np.stack([m.embedding for m in members]).astype(np.float32)

        # Only the weighted methods need per-member weights; floor at 1 so an
        # empty chunk cannot zero out its contribution.
        weights = None
        if resolved_method == "weighted_char_mean":
            weights = np.asarray([max(m.char_count, 1) for m in members], dtype=np.float32)
        elif resolved_method == "weighted_word_mean":
            weights = np.asarray([max(m.word_count, 1) for m in members], dtype=np.float32)

        pooled = _pool_vectors(vectors, weights, resolved_method)
        if normalize_output:
            pooled = _l2_normalize(pooled)

        pooled_id = f"pool_{idx:03d}"
        total_chars = sum(m.char_count for m in members)
        total_words = sum(m.word_count for m in members)
        source_files = [m.source_file for m in members]

        pooled_records.append(
            PooledRecord(
                pooled_id=pooled_id,
                group_key=key,
                source_files=source_files,
                member_count=len(members),
                total_characters=total_chars,
                total_words=total_words,
                embedding_dim=int(pooled.shape[0]),
                source_model_name=source_model_name,
                pooling_method=resolved_method,
                embedding=pooled.astype(np.float32),
            )
        )
        manifest.append(
            {
                "pooled_id": pooled_id,
                "group_key": key,
                "member_count": len(members),
                "total_characters": total_chars,
                "total_words": total_words,
                "embedding_dim": int(pooled.shape[0]),
                "source_model_name": source_model_name,
                "pooling_method": resolved_method,
            }
        )

    summary = {
        "input_embeddings": len(records),
        "pooled_outputs": len(pooled_records),
        "input_embedding_dim": input_dim,
        "output_embedding_dim": input_dim,
        "source_model_name": source_model_name,
        "pooling_method": resolved_method,
        "normalize_output": normalize_output,
        "grouping_rule": "filename auto-grouping by removing trailing _rcs_N / _chunk_N / _part_N / _split_N",
    }
    return manifest, pooled_records, summary
282
+
283
+
284
+ def _manifest_csv_bytes(manifest: list[dict]) -> bytes:
285
+ sio = StringIO()
286
+ fieldnames = [
287
+ "pooled_id",
288
+ "group_key",
289
+ "member_count",
290
+ "total_characters",
291
+ "total_words",
292
+ "embedding_dim",
293
+ "source_model_name",
294
+ "pooling_method",
295
+ ]
296
+ writer = csv.DictWriter(sio, fieldnames=fieldnames)
297
+ writer.writeheader()
298
+ for row in manifest:
299
+ writer.writerow(row)
300
+ return sio.getvalue().encode("utf-8")
301
+
302
+
303
def _pooled_jsonl_bytes(items: list[PooledRecord]) -> bytes:
    """Serialize pooled records as UTF-8 JSON Lines, one object per record."""

    def record_to_obj(item: PooledRecord) -> dict:
        # Flatten the dataclass; the numpy vector becomes a plain list.
        return {
            "pooled_id": item.pooled_id,
            "group_key": item.group_key,
            "source_files": item.source_files,
            "member_count": item.member_count,
            "total_characters": item.total_characters,
            "total_words": item.total_words,
            "embedding_dim": item.embedding_dim,
            "source_model_name": item.source_model_name,
            "pooling_method": item.pooling_method,
            "embedding": item.embedding.tolist(),
        }

    lines = [json.dumps(record_to_obj(item), ensure_ascii=False) for item in items]
    return ("\n".join(lines) + "\n").encode("utf-8")
324
+
325
+
326
def _pooled_csv_bytes(items: list[PooledRecord]) -> bytes:
    """Serialize pooled records to UTF-8 CSV.

    Source files are joined with " | "; the embedding is stored as a JSON
    array string in the final column.
    """
    header = [
        "pooled_id",
        "group_key",
        "member_count",
        "total_characters",
        "total_words",
        "embedding_dim",
        "source_model_name",
        "pooling_method",
        "source_files",
        "embedding",
    ]
    out = StringIO()
    writer = csv.writer(out)
    writer.writerow(header)
    for item in items:
        row = [
            item.pooled_id,
            item.group_key,
            item.member_count,
            item.total_characters,
            item.total_words,
            item.embedding_dim,
            item.source_model_name,
            item.pooling_method,
            " | ".join(item.source_files),
            json.dumps(item.embedding.tolist()),
        ]
        writer.writerow(row)
    return out.getvalue().encode("utf-8")
359
+
360
+
361
def _pooled_npz_bytes(items: list[PooledRecord]) -> bytes:
    """Serialize pooled records into a compressed .npz of parallel arrays.

    String columns use dtype=object; source files are joined with " | ".
    """
    buffer = BytesIO()
    np.savez_compressed(
        buffer,
        ids=np.asarray([item.pooled_id for item in items], dtype=object),
        group_keys=np.asarray([item.group_key for item in items], dtype=object),
        source_files=np.asarray([" | ".join(item.source_files) for item in items], dtype=object),
        member_counts=np.asarray([item.member_count for item in items], dtype=np.int32),
        embeddings=np.stack([item.embedding for item in items]).astype(np.float32),
    )
    return buffer.getvalue()
378
+
379
+
380
+ def _summary_json_bytes(summary: dict) -> bytes:
381
+ return json.dumps(summary, ensure_ascii=False, indent=2).encode("utf-8")
382
+
383
+
384
def build_output_zip_bytes(
    manifest: list[dict],
    items: list[PooledRecord],
    summary: dict,
    input_zip_name: str,
    formats: list[str] | tuple[str, ...],
) -> bytes:
    """Assemble the export zip in memory and return its bytes.

    The summary JSON and manifest CSV are always written; the pooled-embedding
    payloads (jsonl / csv / npz) are written only when named in `formats`.
    Payload filenames derive from the sanitized input zip name.
    """
    selected = set(formats)
    base_name = _safe_stem(input_zip_name)

    # Fixed emission order keeps archives byte-stable across runs.
    serializers = (
        ("jsonl", _pooled_jsonl_bytes),
        ("csv", _pooled_csv_bytes),
        ("npz", _pooled_npz_bytes),
    )

    buffer = BytesIO()
    with zipfile.ZipFile(buffer, "w", compression=zipfile.ZIP_DEFLATED) as zf:
        zf.writestr("pooling_summary.json", _summary_json_bytes(summary))
        zf.writestr("pooling_manifest.csv", _manifest_csv_bytes(manifest))

        for fmt, serialize in serializers:
            if fmt in selected:
                zf.writestr(f"{base_name}_pooled_embeddings.{fmt}", serialize(items))

    return buffer.getvalue()
@@ -0,0 +1,72 @@
1
+ Metadata-Version: 2.4
2
+ Name: poolin
3
+ Version: 0.1.0
4
+ Summary: A local UI package for pooling existing embedding zips into grouped vectors
5
+ Author: Wenxi Wang
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://example.com/poolin
8
+ Project-URL: Repository, https://example.com/poolin
9
+ Keywords: pooling,embedding,rag,vectors,streamlit,ui
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: Operating System :: OS Independent
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.9
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
19
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
20
+ Requires-Python: >=3.9
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Requires-Dist: streamlit>=1.32
24
+ Requires-Dist: numpy>=1.24
25
+ Dynamic: license-file
26
+
27
+ # poolin
28
+
29
+ A local UI package for pooling existing embedding vectors from an embedding zip into grouped higher-level vectors.
30
+
31
+ ## Important note
32
+
33
+ Standard sentence-transformer style pooling usually happens **inside the embedding model** when token embeddings are converted into one sentence embedding. This package does **post-embedding vector pooling** over already-created chunk embeddings.
34
+
35
+ ## What it does
36
+
37
+ - launches with the `poolin` command
38
+ - reads an embedding zip such as `RAG_chunks_recursive_chunks_embeddings.zip`
39
+ - auto-groups related chunk embeddings by filename pattern like `RAG_chunk_001_rcs_001.md -> RAG_chunk_001`
40
+ - pools vectors with one of these methods:
41
+ - `auto`
42
+ - `mean`
43
+ - `max`
44
+ - `weighted_char_mean`
45
+ - `weighted_word_mean`
46
+ - `mean_sqrt_len`
47
+ - exports a zip with:
48
+ - `pooling_summary.json`
49
+ - `pooling_manifest.csv`
50
+ - `*_pooled_embeddings.jsonl` (optional)
51
+ - `*_pooled_embeddings.csv` (optional)
52
+ - `*_pooled_embeddings.npz` (optional)
53
+
54
+ ## Install
55
+
56
+ ```bash
57
+ pip install poolin
58
+ ```
59
+
60
+ ## Run
61
+
62
+ ```bash
63
+ poolin
64
+ ```
65
+
66
+ ## Suggested input
67
+
68
+ Use a zip produced by your embedding step, containing an embeddings `.npz` or `.jsonl` payload plus the summary file.
69
+
70
+ ## Ownership note
71
+
72
+ The package metadata and copyright notice are set to Wenxi Wang. You should still verify PyPI package-name availability, trademark questions, and any legal or patent issues yourself before publishing.
@@ -0,0 +1,13 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ poolin/__init__.py
5
+ poolin/app.py
6
+ poolin/cli.py
7
+ poolin/core.py
8
+ poolin.egg-info/PKG-INFO
9
+ poolin.egg-info/SOURCES.txt
10
+ poolin.egg-info/dependency_links.txt
11
+ poolin.egg-info/entry_points.txt
12
+ poolin.egg-info/requires.txt
13
+ poolin.egg-info/top_level.txt
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ poolin = poolin.cli:main
@@ -0,0 +1,2 @@
1
+ streamlit>=1.32
2
+ numpy>=1.24
@@ -0,0 +1 @@
1
+ poolin
@@ -0,0 +1,36 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "poolin"
7
+ version = "0.1.0"
8
+ description = "A local UI package for pooling existing embedding zips into grouped vectors"
9
+ authors = [{name="Wenxi Wang"}]
10
+ readme = "README.md"
11
+ requires-python = ">=3.9"
12
+ license = "MIT"
13
+ dependencies = [
14
+ "streamlit>=1.32",
15
+ "numpy>=1.24",
16
+ ]
17
+ keywords = ["pooling", "embedding", "rag", "vectors", "streamlit", "ui"]
18
+ classifiers = [
19
+ "Development Status :: 3 - Alpha",
20
+ "Intended Audience :: Developers",
21
+ "Operating System :: OS Independent",
22
+ "Programming Language :: Python :: 3",
23
+ "Programming Language :: Python :: 3.9",
24
+ "Programming Language :: Python :: 3.10",
25
+ "Programming Language :: Python :: 3.11",
26
+ "Programming Language :: Python :: 3.12",
27
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
28
+ "Topic :: Software Development :: Libraries :: Python Modules",
29
+ ]
30
+
31
+ [project.urls]
32
+ Homepage = "https://example.com/poolin"
33
+ Repository = "https://example.com/poolin"
34
+
35
+ [project.scripts]
36
+ poolin = "poolin.cli:main"
poolin-0.1.0/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+