poolin 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- poolin-0.1.0/LICENSE +21 -0
- poolin-0.1.0/PKG-INFO +72 -0
- poolin-0.1.0/README.md +46 -0
- poolin-0.1.0/poolin/__init__.py +2 -0
- poolin-0.1.0/poolin/app.py +94 -0
- poolin-0.1.0/poolin/cli.py +17 -0
- poolin-0.1.0/poolin/core.py +406 -0
- poolin-0.1.0/poolin.egg-info/PKG-INFO +72 -0
- poolin-0.1.0/poolin.egg-info/SOURCES.txt +13 -0
- poolin-0.1.0/poolin.egg-info/dependency_links.txt +1 -0
- poolin-0.1.0/poolin.egg-info/entry_points.txt +2 -0
- poolin-0.1.0/poolin.egg-info/requires.txt +2 -0
- poolin-0.1.0/poolin.egg-info/top_level.txt +1 -0
- poolin-0.1.0/pyproject.toml +36 -0
- poolin-0.1.0/setup.cfg +4 -0
poolin-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Wenxi Wang
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
poolin-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: poolin
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A local UI package for pooling existing embedding zips into grouped vectors
|
|
5
|
+
Author: Wenxi Wang
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://example.com/poolin
|
|
8
|
+
Project-URL: Repository, https://example.com/poolin
|
|
9
|
+
Keywords: pooling,embedding,rag,vectors,streamlit,ui
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
19
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
20
|
+
Requires-Python: >=3.9
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Requires-Dist: streamlit>=1.32
|
|
24
|
+
Requires-Dist: numpy>=1.24
|
|
25
|
+
Dynamic: license-file
|
|
26
|
+
|
|
27
|
+
# poolin
|
|
28
|
+
|
|
29
|
+
A local UI package for pooling existing embedding vectors from an embedding zip into grouped higher-level vectors.
|
|
30
|
+
|
|
31
|
+
## Important note
|
|
32
|
+
|
|
33
|
+
Standard sentence-transformer style pooling usually happens **inside the embedding model** when token embeddings are converted into one sentence embedding. This package does **post-embedding vector pooling** over already-created chunk embeddings.
|
|
34
|
+
|
|
35
|
+
## What it does
|
|
36
|
+
|
|
37
|
+
- launches with the `poolin` command
|
|
38
|
+
- reads an embedding zip such as `RAG_chunks_recursive_chunks_embeddings.zip`
|
|
39
|
+
- auto-groups related chunk embeddings by filename pattern like `RAG_chunk_001_rcs_001.md -> RAG_chunk_001`
|
|
40
|
+
- pools vectors with one of these methods:
|
|
41
|
+
- `auto`
|
|
42
|
+
- `mean`
|
|
43
|
+
- `max`
|
|
44
|
+
- `weighted_char_mean`
|
|
45
|
+
- `weighted_word_mean`
|
|
46
|
+
- `mean_sqrt_len`
|
|
47
|
+
- exports a zip with:
|
|
48
|
+
- `pooling_summary.json`
|
|
49
|
+
- `pooling_manifest.csv`
|
|
50
|
+
- `*_pooled_embeddings.jsonl` (optional)
|
|
51
|
+
- `*_pooled_embeddings.csv` (optional)
|
|
52
|
+
- `*_pooled_embeddings.npz` (optional)
|
|
53
|
+
|
|
54
|
+
## Install
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
pip install poolin
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## Run
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
poolin
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## Suggested input
|
|
67
|
+
|
|
68
|
+
Use a zip produced by your embedding step, containing an embeddings `.npz` or `.jsonl` payload plus the summary file.
|
|
69
|
+
|
|
70
|
+
## Ownership note
|
|
71
|
+
|
|
72
|
+
The package metadata and copyright notice are set to Wenxi Wang. You should still verify PyPI package-name availability, trademark questions, and any legal or patent issues yourself before publishing.
|
poolin-0.1.0/README.md
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
# poolin
|
|
2
|
+
|
|
3
|
+
A local UI package for pooling existing embedding vectors from an embedding zip into grouped higher-level vectors.
|
|
4
|
+
|
|
5
|
+
## Important note
|
|
6
|
+
|
|
7
|
+
Standard sentence-transformer style pooling usually happens **inside the embedding model** when token embeddings are converted into one sentence embedding. This package does **post-embedding vector pooling** over already-created chunk embeddings.
|
|
8
|
+
|
|
9
|
+
## What it does
|
|
10
|
+
|
|
11
|
+
- launches with the `poolin` command
|
|
12
|
+
- reads an embedding zip such as `RAG_chunks_recursive_chunks_embeddings.zip`
|
|
13
|
+
- auto-groups related chunk embeddings by filename pattern like `RAG_chunk_001_rcs_001.md -> RAG_chunk_001`
|
|
14
|
+
- pools vectors with one of these methods:
|
|
15
|
+
- `auto`
|
|
16
|
+
- `mean`
|
|
17
|
+
- `max`
|
|
18
|
+
- `weighted_char_mean`
|
|
19
|
+
- `weighted_word_mean`
|
|
20
|
+
- `mean_sqrt_len`
|
|
21
|
+
- exports a zip with:
|
|
22
|
+
- `pooling_summary.json`
|
|
23
|
+
- `pooling_manifest.csv`
|
|
24
|
+
- `*_pooled_embeddings.jsonl` (optional)
|
|
25
|
+
- `*_pooled_embeddings.csv` (optional)
|
|
26
|
+
- `*_pooled_embeddings.npz` (optional)
|
|
27
|
+
|
|
28
|
+
## Install
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
pip install poolin
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## Run
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
poolin
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Suggested input
|
|
41
|
+
|
|
42
|
+
Use a zip produced by your embedding step, containing an embeddings `.npz` or `.jsonl` payload plus the summary file.
|
|
43
|
+
|
|
44
|
+
## Ownership note
|
|
45
|
+
|
|
46
|
+
The package metadata and copyright notice are set to Wenxi Wang. You should still verify PyPI package-name availability, trademark questions, and any legal or patent issues yourself before publishing.
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
from __future__ import annotations

import streamlit as st

from poolin.core import (
    build_output_zip_bytes,
    load_embedding_zip_bytes,
    pool_embedding_records,
    supported_pooling_methods,
)

# Single-page Streamlit app: upload an embeddings zip, pool related chunk
# vectors, and download the result as a new zip.
st.set_page_config(
    page_title="poolin",
    page_icon="🧩",
    layout="wide",
)

st.title("poolin")
st.caption(
    "Pool existing embedding vectors from an embedding zip into grouped higher-level vectors with a local UI."
)

with st.sidebar:
    st.header("Settings")
    # method_options is a list of (key, description) pairs; the selectbox shows
    # the combined label and we map back to the key by label position.
    method_options = supported_pooling_methods()
    labels = [f"{name} — {desc}" for name, desc in method_options]
    selected_label = st.selectbox("Pooling method", labels, index=0)
    pooling_method = method_options[labels.index(selected_label)][0]

    normalize_output = st.checkbox("Normalize pooled embeddings", value=True)
    formats = st.multiselect(
        "Output formats inside export zip",
        options=["jsonl", "csv", "npz"],
        default=["jsonl", "npz"],
        help="jsonl = readable records, csv = spreadsheet-friendly, npz = direct NumPy arrays.",
    )

uploaded = st.file_uploader("Drag the embeddings zip here", type=["zip"])

# Guard clauses: nothing below runs until a zip is uploaded and at least one
# export format is selected (st.stop() halts the script for this rerun).
if uploaded is None:
    st.info(
        "Upload a zip produced by your embedding step. It should contain an embeddings .npz or .jsonl payload."
    )
    st.stop()

if not formats:
    st.warning("Select at least one export format.")
    st.stop()

with st.expander("What this step actually does", expanded=False):
    st.write(
        "This app performs post-embedding vector pooling across related chunk embeddings. "
        "It does not replace the token-pooling layer that originally produced each sentence embedding inside the embedding model."
    )

if st.button("Generate pooled vectors", type="primary"):
    with st.spinner("Reading embedding zip and pooling vectors..."):
        try:
            # Full pipeline: load records -> pool into groups -> build export zip.
            records, input_summary = load_embedding_zip_bytes(uploaded.read())
            manifest, items, summary = pool_embedding_records(
                records,
                pooling_method=pooling_method,
                normalize_output=normalize_output,
            )
            export_bytes = build_output_zip_bytes(
                manifest,
                items,
                summary,
                input_zip_name=uploaded.name,
                formats=formats,
            )
        except Exception as exc:
            # Surface any pipeline failure (PoolingError, bad zip, ...) in the UI
            # and stop this rerun instead of crashing the app.
            st.error(str(exc))
            st.stop()

    col1, col2 = st.columns([1, 2])
    with col1:
        st.metric("Input embeddings", summary["input_embeddings"])
        st.metric("Pooled outputs", summary["pooled_outputs"])
        st.metric("Embedding dimension", summary["output_embedding_dim"])
        st.metric("Pooling method", summary["pooling_method"])

    with col2:
        st.subheader("Input model")
        # Prefer the model name from the input summary; fall back to the value
        # recorded during pooling, then "unknown".
        st.write(input_summary.get("model_name", summary.get("source_model_name", "unknown")))
        st.subheader("Preview")
        st.dataframe(manifest[:10], use_container_width=True)

    st.download_button(
        "Download poolin zip",
        data=export_bytes,
        file_name=f"{uploaded.name.rsplit('.', 1)[0]}_pooled.zip",
        mime="application/zip",
    )
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
import sys
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def main() -> None:
|
|
8
|
+
try:
|
|
9
|
+
from streamlit.web import cli as stcli
|
|
10
|
+
except Exception as exc: # pragma: no cover
|
|
11
|
+
raise SystemExit(
|
|
12
|
+
"Streamlit is required to launch the UI. Install package dependencies first."
|
|
13
|
+
) from exc
|
|
14
|
+
|
|
15
|
+
app_path = Path(__file__).with_name("app.py")
|
|
16
|
+
sys.argv = ["streamlit", "run", str(app_path)]
|
|
17
|
+
raise SystemExit(stcli.main())
|
|
@@ -0,0 +1,406 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from io import BytesIO, StringIO
|
|
5
|
+
import csv
|
|
6
|
+
import json
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
import re
|
|
9
|
+
from typing import Iterable
|
|
10
|
+
import zipfile
|
|
11
|
+
|
|
12
|
+
import numpy as np
|
|
13
|
+
|
|
14
|
+
# (method key, human-readable description) pairs, in UI display order.
POOLING_METHODS: list[tuple[str, str]] = [
    ("auto", "Auto (recommended)"),
    ("mean", "Mean pooling across chunk embeddings"),
    ("max", "Elementwise max pooling"),
    ("weighted_char_mean", "Weighted mean using character counts"),
    ("weighted_word_mean", "Weighted mean using word counts"),
    ("mean_sqrt_len", "Mean scaled by sqrt(number of chunks)"),
]

# Basenames recognized as the summary JSON inside an input zip.
SUMMARY_FILES = {"pooling_summary.json", "embedding_summary.json"}
# Suffixes (matched case-insensitively) used to locate embedding payloads.
JSONL_SUFFIX = ".jsonl"
NPZ_SUFFIX = ".npz"
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass
class EmbeddedRecord:
    """One already-embedded chunk loaded from the input zip."""

    item_id: str           # record identifier from the payload
    source_file: str       # originating chunk filename (drives auto-grouping)
    text: str              # raw chunk text
    char_count: int        # character count (computed or payload-provided)
    word_count: int        # whitespace-split word count
    embedding_dim: int     # dimensionality reported for the embedding
    model_name: str        # embedding model name from the input summary/payload
    embedding: np.ndarray  # the chunk's embedding vector (float32)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@dataclass
class PooledRecord:
    """One pooled output vector produced from a group of chunk embeddings."""

    pooled_id: str          # sequential id like "pool_001"
    group_key: str          # filename-derived grouping key
    source_files: list[str] # member chunk filenames that were pooled
    member_count: int       # number of embeddings in the group
    total_characters: int   # sum of member char_counts
    total_words: int        # sum of member word_counts
    embedding_dim: int      # dimensionality of the pooled vector
    source_model_name: str  # model name carried over from the input
    pooling_method: str     # resolved method actually used (never "auto")
    embedding: np.ndarray   # the pooled vector (float32)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class PoolingError(RuntimeError):
    """Raised for user-facing failures while loading or pooling embeddings."""

    pass
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def supported_pooling_methods() -> list[tuple[str, str]]:
    """Return a defensive copy of the (key, description) method list."""
    return list(POOLING_METHODS)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def infer_default_pooling(model_name: str, normalized: bool | None = None) -> str:
    """Choose a default post-embedding pooling method.

    This aggregates already-produced embedding vectors into higher-level
    grouped vectors; it is not the token pooling inside the embedding model.
    Normalized inputs get a plain mean; otherwise a character-weighted mean.
    """
    _ = model_name.lower()  # reserved for future model-specific heuristics
    return "mean" if normalized is True else "weighted_char_mean"
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _safe_stem(filename: str) -> str:
|
|
75
|
+
return re.sub(r"[^A-Za-z0-9._-]+", "_", Path(filename).stem).strip("_") or "pooling_output"
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _group_key_from_filename(source_file: str) -> str:
|
|
79
|
+
stem = Path(source_file).stem
|
|
80
|
+
patterns = [
|
|
81
|
+
r"^(?P<base>.+?)_rcs_\d+$",
|
|
82
|
+
r"^(?P<base>.+?)_chunk_\d+$",
|
|
83
|
+
r"^(?P<base>.+?)_part_\d+$",
|
|
84
|
+
r"^(?P<base>.+?)_split_\d+$",
|
|
85
|
+
]
|
|
86
|
+
for pat in patterns:
|
|
87
|
+
m = re.match(pat, stem)
|
|
88
|
+
if m:
|
|
89
|
+
return m.group("base")
|
|
90
|
+
return stem
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _load_summary_from_zip(zf: zipfile.ZipFile) -> dict:
    """Return the first recognized summary JSON in the archive, or {}.

    Matches by basename against SUMMARY_FILES so summaries nested inside
    folders are still found.
    """
    for entry in zf.namelist():
        if Path(entry).name in SUMMARY_FILES:
            return json.loads(zf.read(entry).decode("utf-8"))
    return {}
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def _load_records_from_npz(npz_bytes: bytes, summary: dict) -> list[EmbeddedRecord]:
    """Decode an embeddings ``.npz`` payload into EmbeddedRecord objects.

    Expects parallel arrays ``ids``, ``source_files``, ``texts`` and a 2D
    ``embeddings`` matrix; raises PoolingError when any is missing. The model
    name comes from *summary* ("unknown" when absent).
    """
    data = np.load(BytesIO(npz_bytes), allow_pickle=True)
    required = {"ids", "source_files", "texts", "embeddings"}
    if not required.issubset(set(data.files)):
        raise PoolingError("The NPZ file does not contain the expected embedding arrays.")

    ids = data["ids"]
    source_files = data["source_files"]
    texts = data["texts"]
    embeddings = data["embeddings"]
    model_name = str(summary.get("model_name", "unknown"))

    records: list[EmbeddedRecord] = []
    for i in range(len(ids)):
        chunk_text = str(texts[i])
        record = EmbeddedRecord(
            item_id=str(ids[i]),
            source_file=str(source_files[i]),
            text=chunk_text,
            char_count=len(chunk_text),
            word_count=len(chunk_text.split()),
            embedding_dim=int(embeddings.shape[1]),
            model_name=model_name,
            embedding=np.asarray(embeddings[i], dtype=np.float32),
        )
        records.append(record)
    return records
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def _load_records_from_jsonl(jsonl_text: str) -> list[EmbeddedRecord]:
    """Decode a JSONL embeddings payload into EmbeddedRecord objects.

    Each non-empty line must be a JSON object. Missing or zero
    ``char_count`` / ``word_count`` / ``embedding_dim`` fields are derived
    from the text and embedding instead of defaulting to 0, so weighted
    pooling methods are not silently disabled by minimal payloads.
    """
    records: list[EmbeddedRecord] = []
    for line in jsonl_text.splitlines():
        line = line.strip()
        if not line:
            continue
        obj = json.loads(line)
        text = str(obj.get("text", ""))
        embedding = np.asarray(obj.get("embedding", []), dtype=np.float32)
        records.append(
            EmbeddedRecord(
                item_id=str(obj.get("id", "")),
                source_file=str(obj.get("source_file", "")),
                text=text,
                # Fall back to computed values when the payload omits them;
                # previously these defaulted to 0, which degraded weighted
                # pooling to a plain mean and reported embedding_dim=0.
                char_count=int(obj.get("char_count") or len(text)),
                word_count=int(obj.get("word_count") or len(text.split())),
                embedding_dim=int(obj.get("embedding_dim") or embedding.size),
                model_name=str(obj.get("model_name", "unknown")),
                embedding=embedding,
            )
        )
    return records
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def load_embedding_zip_bytes(zip_bytes: bytes) -> tuple[list[EmbeddedRecord], dict]:
    """Load embedding records plus the summary dict from a zip's raw bytes.

    Prefers an ``.npz`` payload, then falls back to ``.jsonl``. macOS zip
    artifacts (``__MACOSX/`` directories and ``._`` resource-fork entries)
    are ignored so a junk entry is never mistaken for the real payload.

    Raises PoolingError when no supported payload yields records.
    """
    with zipfile.ZipFile(BytesIO(zip_bytes), "r") as zf:
        summary = _load_summary_from_zip(zf)

        # Filter out Finder/Archive-Utility metadata entries up front.
        names = [
            name
            for name in zf.namelist()
            if not name.startswith("__MACOSX/") and not Path(name).name.startswith("._")
        ]

        npz_name = next((name for name in names if name.lower().endswith(NPZ_SUFFIX)), None)
        if npz_name:
            records = _load_records_from_npz(zf.read(npz_name), summary)
            if records:
                return records, summary

        jsonl_name = next((name for name in names if name.lower().endswith(JSONL_SUFFIX)), None)
        if jsonl_name:
            records = _load_records_from_jsonl(zf.read(jsonl_name).decode("utf-8"))
            if records:
                return records, summary

        raise PoolingError(
            "Could not find a supported embedding payload inside the zip. Expected an embeddings .npz or .jsonl file."
        )
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def _l2_normalize(vec: np.ndarray) -> np.ndarray:
|
|
173
|
+
norm = float(np.linalg.norm(vec))
|
|
174
|
+
if norm <= 0:
|
|
175
|
+
return vec
|
|
176
|
+
return vec / norm
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def _pool_vectors(vectors: np.ndarray, weights: np.ndarray | None, method: str) -> np.ndarray:
|
|
180
|
+
if vectors.ndim != 2:
|
|
181
|
+
raise PoolingError("Expected a 2D embedding array for pooling.")
|
|
182
|
+
if len(vectors) == 1:
|
|
183
|
+
return vectors[0].copy()
|
|
184
|
+
|
|
185
|
+
if method == "mean":
|
|
186
|
+
return vectors.mean(axis=0)
|
|
187
|
+
if method == "max":
|
|
188
|
+
return vectors.max(axis=0)
|
|
189
|
+
if method == "weighted_char_mean" or method == "weighted_word_mean":
|
|
190
|
+
if weights is None:
|
|
191
|
+
return vectors.mean(axis=0)
|
|
192
|
+
weights = np.asarray(weights, dtype=np.float32)
|
|
193
|
+
if weights.sum() <= 0:
|
|
194
|
+
return vectors.mean(axis=0)
|
|
195
|
+
weights = weights / weights.sum()
|
|
196
|
+
return np.average(vectors, axis=0, weights=weights)
|
|
197
|
+
if method == "mean_sqrt_len":
|
|
198
|
+
return vectors.mean(axis=0) / np.sqrt(float(len(vectors)))
|
|
199
|
+
raise PoolingError(f"Unsupported pooling method: {method}")
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def pool_embedding_records(
    records: list[EmbeddedRecord],
    pooling_method: str = "auto",
    normalize_output: bool = True,
) -> tuple[list[dict], list[PooledRecord], dict]:
    """Group chunk embeddings by filename and pool each group into one vector.

    Returns (manifest rows, pooled records, run summary). Raises PoolingError
    when *records* is empty (and, via _pool_vectors, for unknown methods).
    """
    if not records:
        raise PoolingError("No embedding records were found in the input zip.")

    source_model_name = records[0].model_name
    input_dim = records[0].embedding_dim
    # Heuristic: a first vector with (approximately) unit L2 norm suggests the
    # embeddings were normalized upstream; only used to resolve "auto".
    inferred_normalized = None
    if records[0].embedding.size:
        inferred_normalized = bool(abs(float(np.linalg.norm(records[0].embedding)) - 1.0) < 0.05)

    resolved_method = pooling_method
    if pooling_method == "auto":
        resolved_method = infer_default_pooling(source_model_name, inferred_normalized)

    # Bucket records whose filenames share a base once the trailing chunk
    # suffix (_rcs_N / _chunk_N / _part_N / _split_N) is stripped.
    grouped: dict[str, list[EmbeddedRecord]] = {}
    for rec in records:
        key = _group_key_from_filename(rec.source_file)
        grouped.setdefault(key, []).append(rec)

    manifest: list[dict] = []
    pooled_records: list[PooledRecord] = []
    # Deterministic output order: groups sorted by key, ids numbered from 1.
    for idx, key in enumerate(sorted(grouped.keys()), start=1):
        members = grouped[key]
        vectors = np.stack([m.embedding for m in members]).astype(np.float32)
        # Weights (clamped to >= 1 so empty chunks still count) are only
        # needed for the weighted-mean methods.
        weights = None
        if resolved_method == "weighted_char_mean":
            weights = np.asarray([max(m.char_count, 1) for m in members], dtype=np.float32)
        elif resolved_method == "weighted_word_mean":
            weights = np.asarray([max(m.word_count, 1) for m in members], dtype=np.float32)

        pooled = _pool_vectors(vectors, weights, resolved_method)
        if normalize_output:
            pooled = _l2_normalize(pooled)

        pooled_id = f"pool_{idx:03d}"
        total_chars = sum(m.char_count for m in members)
        total_words = sum(m.word_count for m in members)
        source_files = [m.source_file for m in members]

        pooled_rec = PooledRecord(
            pooled_id=pooled_id,
            group_key=key,
            source_files=source_files,
            member_count=len(members),
            total_characters=total_chars,
            total_words=total_words,
            embedding_dim=int(pooled.shape[0]),
            source_model_name=source_model_name,
            pooling_method=resolved_method,
            embedding=pooled.astype(np.float32),
        )
        pooled_records.append(pooled_rec)
        # Manifest mirrors the pooled record minus text/vector payloads so it
        # stays lightweight enough for a CSV preview.
        manifest.append(
            {
                "pooled_id": pooled_id,
                "group_key": key,
                "member_count": len(members),
                "total_characters": total_chars,
                "total_words": total_words,
                "embedding_dim": int(pooled.shape[0]),
                "source_model_name": source_model_name,
                "pooling_method": resolved_method,
            }
        )

    summary = {
        "input_embeddings": len(records),
        "pooled_outputs": len(pooled_records),
        "input_embedding_dim": input_dim,
        "output_embedding_dim": input_dim,
        "source_model_name": source_model_name,
        "pooling_method": resolved_method,
        "normalize_output": normalize_output,
        "grouping_rule": "filename auto-grouping by removing trailing _rcs_N / _chunk_N / _part_N / _split_N",
    }
    return manifest, pooled_records, summary
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
def _manifest_csv_bytes(manifest: list[dict]) -> bytes:
|
|
285
|
+
sio = StringIO()
|
|
286
|
+
fieldnames = [
|
|
287
|
+
"pooled_id",
|
|
288
|
+
"group_key",
|
|
289
|
+
"member_count",
|
|
290
|
+
"total_characters",
|
|
291
|
+
"total_words",
|
|
292
|
+
"embedding_dim",
|
|
293
|
+
"source_model_name",
|
|
294
|
+
"pooling_method",
|
|
295
|
+
]
|
|
296
|
+
writer = csv.DictWriter(sio, fieldnames=fieldnames)
|
|
297
|
+
writer.writeheader()
|
|
298
|
+
for row in manifest:
|
|
299
|
+
writer.writerow(row)
|
|
300
|
+
return sio.getvalue().encode("utf-8")
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
def _pooled_jsonl_bytes(items: list[PooledRecord]) -> bytes:
|
|
304
|
+
lines = []
|
|
305
|
+
for item in items:
|
|
306
|
+
lines.append(
|
|
307
|
+
json.dumps(
|
|
308
|
+
{
|
|
309
|
+
"pooled_id": item.pooled_id,
|
|
310
|
+
"group_key": item.group_key,
|
|
311
|
+
"source_files": item.source_files,
|
|
312
|
+
"member_count": item.member_count,
|
|
313
|
+
"total_characters": item.total_characters,
|
|
314
|
+
"total_words": item.total_words,
|
|
315
|
+
"embedding_dim": item.embedding_dim,
|
|
316
|
+
"source_model_name": item.source_model_name,
|
|
317
|
+
"pooling_method": item.pooling_method,
|
|
318
|
+
"embedding": item.embedding.tolist(),
|
|
319
|
+
},
|
|
320
|
+
ensure_ascii=False,
|
|
321
|
+
)
|
|
322
|
+
)
|
|
323
|
+
return ("\n".join(lines) + "\n").encode("utf-8")
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
def _pooled_csv_bytes(items: list[PooledRecord]) -> bytes:
|
|
327
|
+
sio = StringIO()
|
|
328
|
+
writer = csv.writer(sio)
|
|
329
|
+
writer.writerow(
|
|
330
|
+
[
|
|
331
|
+
"pooled_id",
|
|
332
|
+
"group_key",
|
|
333
|
+
"member_count",
|
|
334
|
+
"total_characters",
|
|
335
|
+
"total_words",
|
|
336
|
+
"embedding_dim",
|
|
337
|
+
"source_model_name",
|
|
338
|
+
"pooling_method",
|
|
339
|
+
"source_files",
|
|
340
|
+
"embedding",
|
|
341
|
+
]
|
|
342
|
+
)
|
|
343
|
+
for item in items:
|
|
344
|
+
writer.writerow(
|
|
345
|
+
[
|
|
346
|
+
item.pooled_id,
|
|
347
|
+
item.group_key,
|
|
348
|
+
item.member_count,
|
|
349
|
+
item.total_characters,
|
|
350
|
+
item.total_words,
|
|
351
|
+
item.embedding_dim,
|
|
352
|
+
item.source_model_name,
|
|
353
|
+
item.pooling_method,
|
|
354
|
+
" | ".join(item.source_files),
|
|
355
|
+
json.dumps(item.embedding.tolist()),
|
|
356
|
+
]
|
|
357
|
+
)
|
|
358
|
+
return sio.getvalue().encode("utf-8")
|
|
359
|
+
|
|
360
|
+
|
|
361
|
+
def _pooled_npz_bytes(items: list[PooledRecord]) -> bytes:
|
|
362
|
+
ids = np.asarray([item.pooled_id for item in items], dtype=object)
|
|
363
|
+
group_keys = np.asarray([item.group_key for item in items], dtype=object)
|
|
364
|
+
source_files = np.asarray([" | ".join(item.source_files) for item in items], dtype=object)
|
|
365
|
+
embeddings = np.stack([item.embedding for item in items]).astype(np.float32)
|
|
366
|
+
member_counts = np.asarray([item.member_count for item in items], dtype=np.int32)
|
|
367
|
+
|
|
368
|
+
buffer = BytesIO()
|
|
369
|
+
np.savez_compressed(
|
|
370
|
+
buffer,
|
|
371
|
+
ids=ids,
|
|
372
|
+
group_keys=group_keys,
|
|
373
|
+
source_files=source_files,
|
|
374
|
+
member_counts=member_counts,
|
|
375
|
+
embeddings=embeddings,
|
|
376
|
+
)
|
|
377
|
+
return buffer.getvalue()
|
|
378
|
+
|
|
379
|
+
|
|
380
|
+
def _summary_json_bytes(summary: dict) -> bytes:
|
|
381
|
+
return json.dumps(summary, ensure_ascii=False, indent=2).encode("utf-8")
|
|
382
|
+
|
|
383
|
+
|
|
384
|
+
def build_output_zip_bytes(
    manifest: list[dict],
    items: list[PooledRecord],
    summary: dict,
    input_zip_name: str,
    formats: list[str] | tuple[str, ...],
) -> bytes:
    """Assemble the export zip as raw bytes.

    The summary JSON and manifest CSV are always included; the pooled-embedding
    payloads are added only for the names present in *formats*. Payload file
    names derive from the sanitized stem of *input_zip_name*.
    """
    wanted = set(formats)
    stem = _safe_stem(input_zip_name)

    out = BytesIO()
    with zipfile.ZipFile(out, "w", compression=zipfile.ZIP_DEFLATED) as zf:
        # Always present, regardless of the selected payload formats.
        zf.writestr("pooling_summary.json", _summary_json_bytes(summary))
        zf.writestr("pooling_manifest.csv", _manifest_csv_bytes(manifest))

        writers = {
            "jsonl": _pooled_jsonl_bytes,
            "csv": _pooled_csv_bytes,
            "npz": _pooled_npz_bytes,
        }
        # Fixed iteration order keeps the archive layout deterministic.
        for fmt in ("jsonl", "csv", "npz"):
            if fmt in wanted:
                zf.writestr(f"{stem}_pooled_embeddings.{fmt}", writers[fmt](items))

    return out.getvalue()
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: poolin
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A local UI package for pooling existing embedding zips into grouped vectors
|
|
5
|
+
Author: Wenxi Wang
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://example.com/poolin
|
|
8
|
+
Project-URL: Repository, https://example.com/poolin
|
|
9
|
+
Keywords: pooling,embedding,rag,vectors,streamlit,ui
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
19
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
20
|
+
Requires-Python: >=3.9
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Requires-Dist: streamlit>=1.32
|
|
24
|
+
Requires-Dist: numpy>=1.24
|
|
25
|
+
Dynamic: license-file
|
|
26
|
+
|
|
27
|
+
# poolin
|
|
28
|
+
|
|
29
|
+
A local UI package for pooling existing embedding vectors from an embedding zip into grouped higher-level vectors.
|
|
30
|
+
|
|
31
|
+
## Important note
|
|
32
|
+
|
|
33
|
+
Standard sentence-transformer style pooling usually happens **inside the embedding model** when token embeddings are converted into one sentence embedding. This package does **post-embedding vector pooling** over already-created chunk embeddings.
|
|
34
|
+
|
|
35
|
+
## What it does
|
|
36
|
+
|
|
37
|
+
- launches with the `poolin` command
|
|
38
|
+
- reads an embedding zip such as `RAG_chunks_recursive_chunks_embeddings.zip`
|
|
39
|
+
- auto-groups related chunk embeddings by filename pattern like `RAG_chunk_001_rcs_001.md -> RAG_chunk_001`
|
|
40
|
+
- pools vectors with one of these methods:
|
|
41
|
+
- `auto`
|
|
42
|
+
- `mean`
|
|
43
|
+
- `max`
|
|
44
|
+
- `weighted_char_mean`
|
|
45
|
+
- `weighted_word_mean`
|
|
46
|
+
- `mean_sqrt_len`
|
|
47
|
+
- exports a zip with:
|
|
48
|
+
- `pooling_summary.json`
|
|
49
|
+
- `pooling_manifest.csv`
|
|
50
|
+
- `*_pooled_embeddings.jsonl` (optional)
|
|
51
|
+
- `*_pooled_embeddings.csv` (optional)
|
|
52
|
+
- `*_pooled_embeddings.npz` (optional)
|
|
53
|
+
|
|
54
|
+
## Install
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
pip install poolin
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## Run
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
poolin
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## Suggested input
|
|
67
|
+
|
|
68
|
+
Use a zip produced by your embedding step, containing an embeddings `.npz` or `.jsonl` payload plus the summary file.
|
|
69
|
+
|
|
70
|
+
## Ownership note
|
|
71
|
+
|
|
72
|
+
The package metadata and copyright notice are set to Wenxi Wang. You should still verify PyPI package-name availability, trademark questions, and any legal or patent issues yourself before publishing.
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
poolin/__init__.py
|
|
5
|
+
poolin/app.py
|
|
6
|
+
poolin/cli.py
|
|
7
|
+
poolin/core.py
|
|
8
|
+
poolin.egg-info/PKG-INFO
|
|
9
|
+
poolin.egg-info/SOURCES.txt
|
|
10
|
+
poolin.egg-info/dependency_links.txt
|
|
11
|
+
poolin.egg-info/entry_points.txt
|
|
12
|
+
poolin.egg-info/requires.txt
|
|
13
|
+
poolin.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
poolin
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "poolin"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "A local UI package for pooling existing embedding zips into grouped vectors"
|
|
9
|
+
authors = [{name="Wenxi Wang"}]
|
|
10
|
+
readme = "README.md"
|
|
11
|
+
requires-python = ">=3.9"
|
|
12
|
+
license = "MIT"
|
|
13
|
+
dependencies = [
|
|
14
|
+
"streamlit>=1.32",
|
|
15
|
+
"numpy>=1.24",
|
|
16
|
+
]
|
|
17
|
+
keywords = ["pooling", "embedding", "rag", "vectors", "streamlit", "ui"]
|
|
18
|
+
classifiers = [
|
|
19
|
+
"Development Status :: 3 - Alpha",
|
|
20
|
+
"Intended Audience :: Developers",
|
|
21
|
+
"Operating System :: OS Independent",
|
|
22
|
+
"Programming Language :: Python :: 3",
|
|
23
|
+
"Programming Language :: Python :: 3.9",
|
|
24
|
+
"Programming Language :: Python :: 3.10",
|
|
25
|
+
"Programming Language :: Python :: 3.11",
|
|
26
|
+
"Programming Language :: Python :: 3.12",
|
|
27
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
28
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
29
|
+
]
|
|
30
|
+
|
|
31
|
+
[project.urls]
|
|
32
|
+
Homepage = "https://example.com/poolin"
|
|
33
|
+
Repository = "https://example.com/poolin"
|
|
34
|
+
|
|
35
|
+
[project.scripts]
|
|
36
|
+
poolin = "poolin.cli:main"
|
poolin-0.1.0/setup.cfg
ADDED